src/coding.c

   1 /* Coding system handler (conversion, detection, etc).
   2    Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
   3      Licensed to the Free Software Foundation.
   4    Copyright (C) 2001, 2002 Free Software Foundation, Inc.
   5    Copyright (C) 2003
   6      National Institute of Advanced Industrial Science and Technology (AIST)
   7      Registration Number H13PRO009
   8
   9 This file is part of GNU Emacs.
  10
  11 GNU Emacs is free software; you can redistribute it and/or modify
  12 it under the terms of the GNU General Public License as published by
  13 the Free Software Foundation; either version 2, or (at your option)
  14 any later version.
  15
  16 GNU Emacs is distributed in the hope that it will be useful,
  17 but WITHOUT ANY WARRANTY; without even the implied warranty of
  18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  19 GNU General Public License for more details.
  20
  21 You should have received a copy of the GNU General Public License
  22 along with GNU Emacs; see the file COPYING.  If not, write to
  23 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  24 Boston, MA 02111-1307, USA.  */
  25
  26 /*** TABLE OF CONTENTS ***
  27
  28   0. General comments
  29   1. Preamble
  30   2. Emacs' internal format (emacs-utf-8) handlers
  31   3. UTF-8 handlers
  32   4. UTF-16 handlers
  33   5. Charset-base coding systems handlers
  34   6. emacs-mule (old Emacs' internal format) handlers
  35   7. ISO2022 handlers
  36   8. Shift-JIS and BIG5 handlers
  37   9. CCL handlers
  38   10. C library functions
  39   11. Emacs Lisp library functions
  40   12. Postamble
  41
  42 */
  43
  44 /*** 0. General comments ***
  45
  46
  47 CODING SYSTEM
  48
  49   A coding system is an object for an encoding mechanism that contains
  50   information about how to convert byte sequences to character
  51   sequences and vice versa.  When we say "decode", it means converting
  52   a byte sequence of a specific coding system into a character
  53   sequence that is represented by Emacs' internal coding system
  54   `emacs-utf-8', and when we say "encode", it means converting a
  55   character sequence of emacs-utf-8 to a byte sequence of a specific
  56   coding system.
  57
  58   In Emacs Lisp, a coding system is represented by a Lisp symbol.  In
  59   C level, a coding system is represented by a vector of attributes
  60   stored in the hash table Vcharset_hash_table.  The conversion from
  61   coding system symbol to attributes vector is done by looking up
  62   Vcharset_hash_table by the symbol.
  63
  64   Coding systems are classified into the following types depending on
  65   the encoding mechanism.  Here's a brief description of the types.
  66
  67   o UTF-8
  68
  69   o UTF-16
  70
  71   o Charset-base coding system
  72
  73   A coding system defined by one or more (coded) character sets.
  74   Decoding and encoding are done by a code converter defined for each
  75   character set.
  76
  77   o Old Emacs internal format (emacs-mule)
  78
  79   The coding system adopted by old versions of Emacs (20 and 21).
  80
  81   o ISO2022-base coding system
  82
  83   The most famous coding system for multiple character sets.  X's
  84   Compound Text, various EUCs (Extended Unix Code), and coding systems
  85   used in the Internet communication such as ISO-2022-JP are all
  86   variants of ISO2022.
  87
  88   o SJIS (or Shift-JIS or MS-Kanji-Code)
  89
  90   A coding system to encode character sets: ASCII, JISX0201, and
  91   JISX0208.  Widely used for PC's in Japan.  Details are described in
  92   section 8.
  93
  94   o BIG5
  95
  96   A coding system to encode character sets: ASCII and Big5.  Widely
  97   used for Chinese (mainly in Taiwan and Hong Kong).  Details are
  98   described in section 8.  In this file, when we write "big5" (all
  99   lowercase), we mean the coding system, and when we write "Big5"
 100   (capitalized), we mean the character set.
 101
 102   o CCL
 103
 104   If a user wants to decode/encode text encoded in a coding system
 105   not listed above, he can supply a decoder and an encoder for it in
 106   CCL (Code Conversion Language) programs.  Emacs executes the CCL
 107   program while decoding/encoding.
 108
 109   o Raw-text
 110
 111   A coding system for text containing raw eight-bit data.  Emacs
 112   treats each byte of source text as a character (except for
 113   end-of-line conversion).
 114
 115   o No-conversion
 116
 117   Like raw text, but don't do end-of-line conversion.
 118
 119
 120 END-OF-LINE FORMAT
 121
 122   How text end-of-line is encoded depends on operating system.  For
 123   instance, Unix's format is just one byte of LF (line-feed) code,
 124   whereas DOS's format is two-byte sequence of `carriage-return' and
 125   `line-feed' codes.  MacOS's format is usually one byte of
 126   `carriage-return'.
 127
 128   Since text character encoding and end-of-line encoding are
 129   independent, any coding system described above can take any format
 130   of end-of-line (except for no-conversion).
 131
 132 STRUCT CODING_SYSTEM
 133
 134   Before using a coding system for code conversion (i.e. decoding and
 135   encoding), we setup a structure of type `struct coding_system'.
 136   This structure keeps various information about a specific code
 137   conversion (e.g. the location of source and destination data).
 138
 139 */
 140
 141 /* COMMON MACROS */
 142
 143
 144 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 145
 146   These functions check if a byte sequence specified as a source in
 147   CODING conforms to the format of XXX, and update the members of
 148   DETECT_INFO.
 149
 150   Return 1 if the byte sequence conforms to XXX, otherwise return 0.
 151
 152   Below is the template of these functions.  */
 153
 154 #if 0
 155 static int
 156 detect_coding_XXX (coding, detect_info)
 157      struct coding_system *coding;
 158      struct coding_detection_info *detect_info;
 159 {
 160   unsigned char *src = coding->source;
 161   unsigned char *src_end = coding->source + coding->src_bytes;
 162   int multibytep = coding->src_multibyte;
 163   int consumed_chars = 0;
 164   int found = 0;
 165   ...;
 166
 167   while (1)
 168     {
 169       /* Get one byte from the source.  If the souce is exausted, jump
 170          to no_more_source:.  */
 171       ONE_MORE_BYTE (c);
 172
 173       if (! __C_conforms_to_XXX___ (c))
 174         break;
 175       if (! __C_strongly_suggests_XXX__ (c))
 176         found = CATEGORY_MASK_XXX;
 177     }
 178   /* The byte sequence is invalid for XXX.  */
 179   detect_info->rejected |= CATEGORY_MASK_XXX;
 180   return 0;
 181
 182  no_more_source:
 183   /* The source exausted successfully.  */
 184   detect_info->found |= found;
 185   return 1;
 186 }
 187 #endif
 188
 189 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 190
 191   These functions decode a byte sequence specified as a source by
 192   CODING.  The resulting multibyte text goes to a place pointed to by
 193   CODING->charbuf, the length of which should not exceed
 194   CODING->charbuf_size;
 195
 196   These functions set the information of original and decoded texts in
 197   CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
 198   They also set CODING->result to one of CODING_RESULT_XXX indicating
 199   how the decoding is finished.
 200
 201   Below is the template of these functions.  */
 202
 203 #if 0
 204 static void
 205 decode_coding_XXXX (coding)
 206      struct coding_system *coding;
 207 {
 208   unsigned char *src = coding->source + coding->consumed;
 209   unsigned char *src_end = coding->source + coding->src_bytes;
 210   /* SRC_BASE remembers the start position in source in each loop.
 211      The loop will be exited when there's not enough source code, or
 212      when there's no room in CHARBUF for a decoded character.  */
 213   unsigned char *src_base;
 214   /* A buffer to produce decoded characters.  */
 215   int *charbuf = coding->charbuf;
 216   int *charbuf_end = charbuf + coding->charbuf_size;
 217   int multibytep = coding->src_multibyte;
 218
 219   while (1)
 220     {
 221       src_base = src;
 222       if (charbuf < charbuf_end)
 223         /* No more room to produce a decoded character.  */
 224         break;
 225       ONE_MORE_BYTE (c);
 226       /* Decode it. */
 227     }
 228
 229  no_more_source:
 230   if (src_base < src_end
 231       && coding->mode & CODING_MODE_LAST_BLOCK)
 232     /* If the source ends by partial bytes to construct a character,
 233        treat them as eight-bit raw data.  */
 234     while (src_base < src_end && charbuf < charbuf_end)
 235       *charbuf++ = *src_base++;
 236   /* Remember how many bytes and characters we consumed.  If the
 237      source is multibyte, the bytes and chars are not identical.  */
 238   coding->consumed = coding->consumed_char = src_base - coding->source;
 239   /* Remember how many characters we produced.  */
 240   coding->charbuf_used = charbuf - coding->charbuf;
 241 }
 242 #endif
 243
 244 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 245
 246   These functions encode SRC_BYTES length text at SOURCE of Emacs'
 247   internal multibyte format by CODING.  The resulting byte sequence
 248   goes to a place pointed to by DESTINATION, the length of which
 249   should not exceed DST_BYTES.
 250
 251   These functions set the information of original and encoded texts in
 252   the members produced, produced_char, consumed, and consumed_char of
 253   the structure *CODING.  They also set the member result to one of
 254   CODING_RESULT_XXX indicating how the encoding finished.
 255
 256   DST_BYTES zero means that source area and destination area are
 257   overlapped, which means that we can produce a encoded text until it
 258   reaches at the head of not-yet-encoded source text.
 259
 260   Below is a template of these functions.  */
 261 #if 0
 262 static void
 263 encode_coding_XXX (coding)
 264      struct coding_system *coding;
 265 {
 266   int multibytep = coding->dst_multibyte;
 267   int *charbuf = coding->charbuf;
 268   int *charbuf_end = charbuf->charbuf + coding->charbuf_used;
 269   unsigned char *dst = coding->destination + coding->produced;
 270   unsigned char *dst_end = coding->destination + coding->dst_bytes;
 271   unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
 272   int produced_chars = 0;
 273
 274   for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
 275     {
 276       int c = *charbuf;
 277       /* Encode C into DST, and increment DST.  */
 278     }
 279  label_no_more_destination:
 280   /* How many chars and bytes we produced.  */
 281   coding->produced_char += produced_chars;
 282   coding->produced = dst - coding->destination;
 283 }
 284 #endif
 285
 286 \f
 287 /*** 1. Preamble ***/
 288
 289 #include <config.h>
 290 #include <stdio.h>
 291
 292 #include "lisp.h"
 293 #include "buffer.h"
 294 #include "character.h"
 295 #include "charset.h"
 296 #include "ccl.h"
 297 #include "composite.h"
 298 #include "coding.h"
 299 #include "window.h"
 300
 301 Lisp_Object Vcoding_system_hash_table;
 302
 303 Lisp_Object Qcoding_system, Qcoding_aliases, Qeol_type;
 304 Lisp_Object Qunix, Qdos;
 305 extern Lisp_Object Qmac;        /* frame.c */
 306 Lisp_Object Qbuffer_file_coding_system;
 307 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
 308 Lisp_Object Qdefault_char;
 309 Lisp_Object Qno_conversion, Qundecided;
 310 Lisp_Object Qcharset, Qiso_2022, Qutf_8, Qutf_16, Qshift_jis, Qbig5;
 311 Lisp_Object Qbig, Qlittle;
 312 Lisp_Object Qcoding_system_history;
 313 Lisp_Object Qvalid_codes;
 314 Lisp_Object QCcategory;
 315
 316 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
 317 Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
 318 Lisp_Object Qstart_process, Qopen_network_stream;
 319 Lisp_Object Qtarget_idx;
 320
 321 int coding_system_require_warning;
 322
 323 Lisp_Object Vselect_safe_coding_system_function;
 324
 325 /* Mnemonic string for each format of end-of-line.  */
 326 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
 327 /* Mnemonic string to indicate format of end-of-line is not yet
 328    decided.  */
 329 Lisp_Object eol_mnemonic_undecided;
 330
 331 #ifdef emacs
 332
 333 Lisp_Object Vcoding_system_list, Vcoding_system_alist;
 334
 335 Lisp_Object Qcoding_system_p, Qcoding_system_error;
 336
 337 /* Coding system emacs-mule and raw-text are for converting only
 338    end-of-line format.  */
 339 Lisp_Object Qemacs_mule, Qraw_text;
 340 Lisp_Object Qutf_8_emacs;
 341
 342 /* Coding-systems are handed between Emacs Lisp programs and C internal
 343    routines by the following three variables.  */
 344 /* Coding-system for reading files and receiving data from process.  */
 345 Lisp_Object Vcoding_system_for_read;
 346 /* Coding-system for writing files and sending data to process.  */
 347 Lisp_Object Vcoding_system_for_write;
 348 /* Coding-system actually used in the latest I/O.  */
 349 Lisp_Object Vlast_coding_system_used;
 350
 351 /* A vector of length 256 which contains information about special
 352    Latin codes (especially for dealing with Microsoft codes).  */
 353 Lisp_Object Vlatin_extra_code_table;
 354
 355 /* Flag to inhibit code conversion of end-of-line format.  */
 356 int inhibit_eol_conversion;
 357
 358 /* Flag to inhibit ISO2022 escape sequence detection.  */
 359 int inhibit_iso_escape_detection;
 360
 361 /* Flag to make buffer-file-coding-system inherit from process-coding.  */
 362 int inherit_process_coding_system;
 363
 364 /* Coding system to be used to encode text for terminal display.  */
 365 struct coding_system terminal_coding;
 366
 367 /* Coding system to be used to encode text for terminal display when
 368    terminal coding system is nil.  */
 369 struct coding_system safe_terminal_coding;
 370
 371 /* Coding system of what is sent from terminal keyboard.  */
 372 struct coding_system keyboard_coding;
 373
 374 Lisp_Object Vfile_coding_system_alist;
 375 Lisp_Object Vprocess_coding_system_alist;
 376 Lisp_Object Vnetwork_coding_system_alist;
 377
 378 Lisp_Object Vlocale_coding_system;
 379
 380 #endif /* emacs */
 381
 382 /* Flag to tell if we look up translation table on character code
 383    conversion.  */
 384 Lisp_Object Venable_character_translation;
 385 /* Standard translation table to look up on decoding (reading).  */
 386 Lisp_Object Vstandard_translation_table_for_decode;
 387 /* Standard translation table to look up on encoding (writing).  */
 388 Lisp_Object Vstandard_translation_table_for_encode;
 389
 390 Lisp_Object Qtranslation_table;
 391 Lisp_Object Qtranslation_table_id;
 392 Lisp_Object Qtranslation_table_for_decode;
 393 Lisp_Object Qtranslation_table_for_encode;
 394
 395 /* Alist of charsets vs revision number.  */
 396 static Lisp_Object Vcharset_revision_table;
 397
 398 /* Default coding systems used for process I/O.  */
 399 Lisp_Object Vdefault_process_coding_system;
 400
 401 /* Char table for translating Quail and self-inserting input.  */
 402 Lisp_Object Vtranslation_table_for_input;
 403
 404 /* Two special coding systems.  */
 405 Lisp_Object Vsjis_coding_system;
 406 Lisp_Object Vbig5_coding_system;
 407
 408
 409 static int detect_coding_utf_8 P_ ((struct coding_system *,
 410                                     struct coding_detection_info *info));
 411 static void decode_coding_utf_8 P_ ((struct coding_system *));
 412 static int encode_coding_utf_8 P_ ((struct coding_system *));
 413
 414 static int detect_coding_utf_16 P_ ((struct coding_system *,
 415                                      struct coding_detection_info *info));
 416 static void decode_coding_utf_16 P_ ((struct coding_system *));
 417 static int encode_coding_utf_16 P_ ((struct coding_system *));
 418
 419 static int detect_coding_iso_2022 P_ ((struct coding_system *,
 420                                        struct coding_detection_info *info));
 421 static void decode_coding_iso_2022 P_ ((struct coding_system *));
 422 static int encode_coding_iso_2022 P_ ((struct coding_system *));
 423
 424 static int detect_coding_emacs_mule P_ ((struct coding_system *,
 425                                          struct coding_detection_info *info));
 426 static void decode_coding_emacs_mule P_ ((struct coding_system *));
 427 static int encode_coding_emacs_mule P_ ((struct coding_system *));
 428
 429 static int detect_coding_sjis P_ ((struct coding_system *,
 430                                    struct coding_detection_info *info));
 431 static void decode_coding_sjis P_ ((struct coding_system *));
 432 static int encode_coding_sjis P_ ((struct coding_system *));
 433
 434 static int detect_coding_big5 P_ ((struct coding_system *,
 435                                    struct coding_detection_info *info));
 436 static void decode_coding_big5 P_ ((struct coding_system *));
 437 static int encode_coding_big5 P_ ((struct coding_system *));
 438
 439 static int detect_coding_ccl P_ ((struct coding_system *,
 440                                   struct coding_detection_info *info));
 441 static void decode_coding_ccl P_ ((struct coding_system *));
 442 static int encode_coding_ccl P_ ((struct coding_system *));
 443
 444 static void decode_coding_raw_text P_ ((struct coding_system *));
 445 static int encode_coding_raw_text P_ ((struct coding_system *));
 446
 447
 448 /* ISO2022 section */
 449
 450 #define CODING_ISO_INITIAL(coding, reg)                 \
 451   (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id),    \
 452                      coding_attr_iso_initial),          \
 453                reg)))
 454
 455
 456 #define CODING_ISO_REQUEST(coding, charset_id)  \
 457   ((charset_id <= (coding)->max_charset_id      \
 458     ? (coding)->safe_charsets[charset_id]       \
 459     : -1))
 460
 461
 462 #define CODING_ISO_FLAGS(coding)        \
 463   ((coding)->spec.iso_2022.flags)
 464 #define CODING_ISO_DESIGNATION(coding, reg)     \
 465   ((coding)->spec.iso_2022.current_designation[reg])
 466 #define CODING_ISO_INVOCATION(coding, plane)    \
 467   ((coding)->spec.iso_2022.current_invocation[plane])
 468 #define CODING_ISO_SINGLE_SHIFTING(coding)      \
 469   ((coding)->spec.iso_2022.single_shifting)
 470 #define CODING_ISO_BOL(coding)  \
 471   ((coding)->spec.iso_2022.bol)
 472 #define CODING_ISO_INVOKED_CHARSET(coding, plane)       \
 473   CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
 474
 475 /* Control characters of ISO2022.  */
 476                         /* code */      /* function */
 477 #define ISO_CODE_LF     0x0A            /* line-feed */
 478 #define ISO_CODE_CR     0x0D            /* carriage-return */
 479 #define ISO_CODE_SO     0x0E            /* shift-out */
 480 #define ISO_CODE_SI     0x0F            /* shift-in */
 481 #define ISO_CODE_SS2_7  0x19            /* single-shift-2 for 7-bit code */
 482 #define ISO_CODE_ESC    0x1B            /* escape */
 483 #define ISO_CODE_SS2    0x8E            /* single-shift-2 */
 484 #define ISO_CODE_SS3    0x8F            /* single-shift-3 */
 485 #define ISO_CODE_CSI    0x9B            /* control-sequence-introducer */
 486
 487 /* All code (1-byte) of ISO2022 is classified into one of the
 488    followings.  */
 489 enum iso_code_class_type
 490   {
 491     ISO_control_0,              /* Control codes in the range
 492                                    0x00..0x1F and 0x7F, except for the
 493                                    following 5 codes.  */
 494     ISO_carriage_return,        /* ISO_CODE_CR (0x0D) */
 495     ISO_shift_out,              /* ISO_CODE_SO (0x0E) */
 496     ISO_shift_in,               /* ISO_CODE_SI (0x0F) */
 497     ISO_single_shift_2_7,       /* ISO_CODE_SS2_7 (0x19) */
 498     ISO_escape,                 /* ISO_CODE_SO (0x1B) */
 499     ISO_control_1,              /* Control codes in the range
 500                                    0x80..0x9F, except for the
 501                                    following 3 codes.  */
 502     ISO_single_shift_2,         /* ISO_CODE_SS2 (0x8E) */
 503     ISO_single_shift_3,         /* ISO_CODE_SS3 (0x8F) */
 504     ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
 505     ISO_0x20_or_0x7F,           /* Codes of the values 0x20 or 0x7F.  */
 506     ISO_graphic_plane_0,        /* Graphic codes in the range 0x21..0x7E.  */
 507     ISO_0xA0_or_0xFF,           /* Codes of the values 0xA0 or 0xFF.  */
 508     ISO_graphic_plane_1         /* Graphic codes in the range 0xA1..0xFE.  */
 509   };
 510
 511 /** The macros CODING_ISO_FLAG_XXX defines a flag bit of the
 512     `iso-flags' attribute of an iso2022 coding system.  */
 513
 514 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
 515    instead of the correct short-form sequence (e.g. ESC $ A).  */
 516 #define CODING_ISO_FLAG_LONG_FORM       0x0001
 517
 518 /* If set, reset graphic planes and registers at end-of-line to the
 519    initial state.  */
 520 #define CODING_ISO_FLAG_RESET_AT_EOL    0x0002
 521
 522 /* If set, reset graphic planes and registers before any control
 523    characters to the initial state.  */
 524 #define CODING_ISO_FLAG_RESET_AT_CNTL   0x0004
 525
 526 /* If set, encode by 7-bit environment.  */
 527 #define CODING_ISO_FLAG_SEVEN_BITS      0x0008
 528
 529 /* If set, use locking-shift function.  */
 530 #define CODING_ISO_FLAG_LOCKING_SHIFT   0x0010
 531
 532 /* If set, use single-shift function.  Overwrite
 533    CODING_ISO_FLAG_LOCKING_SHIFT.  */
 534 #define CODING_ISO_FLAG_SINGLE_SHIFT    0x0020
 535
 536 /* If set, use designation escape sequence.  */
 537 #define CODING_ISO_FLAG_DESIGNATION     0x0040
 538
 539 /* If set, produce revision number sequence.  */
 540 #define CODING_ISO_FLAG_REVISION        0x0080
 541
 542 /* If set, produce ISO6429's direction specifying sequence.  */
 543 #define CODING_ISO_FLAG_DIRECTION       0x0100
 544
 545 /* If set, assume designation states are reset at beginning of line on
 546    output.  */
 547 #define CODING_ISO_FLAG_INIT_AT_BOL     0x0200
 548
 549 /* If set, designation sequence should be placed at beginning of line
 550    on output.  */
 551 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
 552
 553 /* If set, do not encode unsafe charactes on output.  */
 554 #define CODING_ISO_FLAG_SAFE            0x0800
 555
 556 /* If set, extra latin codes (128..159) are accepted as a valid code
 557    on input.  */
 558 #define CODING_ISO_FLAG_LATIN_EXTRA     0x1000
 559
 560 #define CODING_ISO_FLAG_COMPOSITION     0x2000
 561
 562 #define CODING_ISO_FLAG_EUC_TW_SHIFT    0x4000
 563
 564 #define CODING_ISO_FLAG_USE_ROMAN       0x8000
 565
 566 #define CODING_ISO_FLAG_USE_OLDJIS      0x10000
 567
 568 #define CODING_ISO_FLAG_FULL_SUPPORT    0x100000
 569
 570 /* A character to be produced on output if encoding of the original
 571    character is prohibited by CODING_ISO_FLAG_SAFE.  */
 572 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION  '?'
 573
 574
 575 /* UTF-16 section */
 576 #define CODING_UTF_16_BOM(coding)       \
 577   ((coding)->spec.utf_16.bom)
 578
 579 #define CODING_UTF_16_ENDIAN(coding)    \
 580   ((coding)->spec.utf_16.endian)
 581
 582 #define CODING_UTF_16_SURROGATE(coding) \
 583   ((coding)->spec.utf_16.surrogate)
 584
 585
 586 /* CCL section */
 587 #define CODING_CCL_DECODER(coding)      \
 588   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
 589 #define CODING_CCL_ENCODER(coding)      \
 590   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
 591 #define CODING_CCL_VALIDS(coding)                                          \
 592   (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
 593
 594 /* Index for each coding category in `coding_categories' */
 595
 596 enum coding_category
 597   {
 598     coding_category_iso_7,
 599     coding_category_iso_7_tight,
 600     coding_category_iso_8_1,
 601     coding_category_iso_8_2,
 602     coding_category_iso_7_else,
 603     coding_category_iso_8_else,
 604     coding_category_utf_8,
 605     coding_category_utf_16_auto,
 606     coding_category_utf_16_be,
 607     coding_category_utf_16_le,
 608     coding_category_utf_16_be_nosig,
 609     coding_category_utf_16_le_nosig,
 610     coding_category_charset,
 611     coding_category_sjis,
 612     coding_category_big5,
 613     coding_category_ccl,
 614     coding_category_emacs_mule,
 615     /* All above are targets of code detection.  */
 616     coding_category_raw_text,
 617     coding_category_undecided,
 618     coding_category_max
 619   };
 620
 621 /* Definitions of flag bits used in detect_coding_XXXX.  */
 622 #define CATEGORY_MASK_ISO_7             (1 << coding_category_iso_7)
 623 #define CATEGORY_MASK_ISO_7_TIGHT       (1 << coding_category_iso_7_tight)
 624 #define CATEGORY_MASK_ISO_8_1           (1 << coding_category_iso_8_1)
 625 #define CATEGORY_MASK_ISO_8_2           (1 << coding_category_iso_8_2)
 626 #define CATEGORY_MASK_ISO_7_ELSE        (1 << coding_category_iso_7_else)
 627 #define CATEGORY_MASK_ISO_8_ELSE        (1 << coding_category_iso_8_else)
 628 #define CATEGORY_MASK_UTF_8             (1 << coding_category_utf_8)
 629 #define CATEGORY_MASK_UTF_16_AUTO       (1 << coding_category_utf_16_auto)
 630 #define CATEGORY_MASK_UTF_16_BE         (1 << coding_category_utf_16_be)
 631 #define CATEGORY_MASK_UTF_16_LE         (1 << coding_category_utf_16_le)
 632 #define CATEGORY_MASK_UTF_16_BE_NOSIG   (1 << coding_category_utf_16_be_nosig)
 633 #define CATEGORY_MASK_UTF_16_LE_NOSIG   (1 << coding_category_utf_16_le_nosig)
 634 #define CATEGORY_MASK_CHARSET           (1 << coding_category_charset)
 635 #define CATEGORY_MASK_SJIS              (1 << coding_category_sjis)
 636 #define CATEGORY_MASK_BIG5              (1 << coding_category_big5)
 637 #define CATEGORY_MASK_CCL               (1 << coding_category_ccl)
 638 #define CATEGORY_MASK_EMACS_MULE        (1 << coding_category_emacs_mule)
 639 #define CATEGORY_MASK_RAW_TEXT          (1 << coding_category_raw_text)
 640
 641 /* This value is returned if detect_coding_mask () find nothing other
 642    than ASCII characters.  */
 643 #define CATEGORY_MASK_ANY               \
 644   (CATEGORY_MASK_ISO_7                  \
 645    | CATEGORY_MASK_ISO_7_TIGHT          \
 646    | CATEGORY_MASK_ISO_8_1              \
 647    | CATEGORY_MASK_ISO_8_2              \
 648    | CATEGORY_MASK_ISO_7_ELSE           \
 649    | CATEGORY_MASK_ISO_8_ELSE           \
 650    | CATEGORY_MASK_UTF_8                \
 651    | CATEGORY_MASK_UTF_16_BE            \
 652    | CATEGORY_MASK_UTF_16_LE            \
 653    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 654    | CATEGORY_MASK_UTF_16_LE_NOSIG      \
 655    | CATEGORY_MASK_CHARSET              \
 656    | CATEGORY_MASK_SJIS                 \
 657    | CATEGORY_MASK_BIG5                 \
 658    | CATEGORY_MASK_CCL                  \
 659    | CATEGORY_MASK_EMACS_MULE)
 660
 661
 662 #define CATEGORY_MASK_ISO_7BIT \
 663   (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
 664
 665 #define CATEGORY_MASK_ISO_8BIT \
 666   (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
 667
 668 #define CATEGORY_MASK_ISO_ELSE \
 669   (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
 670
 671 #define CATEGORY_MASK_ISO_ESCAPE        \
 672   (CATEGORY_MASK_ISO_7                  \
 673    | CATEGORY_MASK_ISO_7_TIGHT          \
 674    | CATEGORY_MASK_ISO_7_ELSE           \
 675    | CATEGORY_MASK_ISO_8_ELSE)
 676
 677 #define CATEGORY_MASK_ISO       \
 678   (  CATEGORY_MASK_ISO_7BIT     \
 679      | CATEGORY_MASK_ISO_8BIT   \
 680      | CATEGORY_MASK_ISO_ELSE)
 681
 682 #define CATEGORY_MASK_UTF_16            \
 683   (CATEGORY_MASK_UTF_16_BE              \
 684    | CATEGORY_MASK_UTF_16_LE            \
 685    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 686    | CATEGORY_MASK_UTF_16_LE_NOSIG)
 687
 688
 689 /* List of symbols `coding-category-xxx' ordered by priority.  This
 690    variable is exposed to Emacs Lisp.  */
 691 static Lisp_Object Vcoding_category_list;
 692
 693 /* Table of coding categories (Lisp symbols).  This variable is for
 694    internal use oly.  */
 695 static Lisp_Object Vcoding_category_table;
 696
 697 /* Table of coding-categories ordered by priority.  */
 698 static enum coding_category coding_priorities[coding_category_max];
 699
 700 /* Nth element is a coding context for the coding system bound to the
 701    Nth coding category.  */
 702 static struct coding_system coding_categories[coding_category_max];
 703
 704 /*** Commonly used macros and functions ***/
 705
 706 #ifndef min
 707 #define min(a, b) ((a) < (b) ? (a) : (b))
 708 #endif
 709 #ifndef max
 710 #define max(a, b) ((a) > (b) ? (a) : (b))
 711 #endif
 712
 713 #define CODING_GET_INFO(coding, attrs, eol_type, charset_list)  \
 714   do {                                                          \
 715     attrs = CODING_ID_ATTRS (coding->id);                       \
 716     eol_type = CODING_ID_EOL_TYPE (coding->id);                 \
 717     if (VECTORP (eol_type))                                     \
 718       eol_type = Qunix;                                         \
 719     charset_list = CODING_ATTR_CHARSET_LIST (attrs);            \
 720   } while (0)
 721
 722
 723 /* Safely get one byte from the source text pointed by SRC which ends
 724    at SRC_END, and set C to that byte.  If there are not enough bytes
 725    in the source, it jumps to `no_more_source'.  The caller
 726    should declare and set these variables appropriately in advance:
 727         src, src_end, multibytep
 728 */
 729
 730 #define ONE_MORE_BYTE(c)                                        \
 731   do {                                                          \
 732     if (src == src_end)                                         \
 733       {                                                         \
 734         if (src_base < src)                                     \
 735           coding->result = CODING_RESULT_INSUFFICIENT_SRC;      \
 736         goto no_more_source;                                    \
 737       }                                                         \
 738     c = *src++;                                                 \
 739     if (multibytep && (c & 0x80))                               \
 740       {                                                         \
 741         if ((c & 0xFE) != 0xC0)                                 \
 742           error ("Undecodable char found");                     \
 743         c = ((c & 1) << 6) | *src++;                            \
 744       }                                                         \
 745     consumed_chars++;                                           \
 746   } while (0)
 747
 748
 749 #define ONE_MORE_BYTE_NO_CHECK(c)               \
 750   do {                                          \
 751     c = *src++;                                 \
 752     if (multibytep && (c & 0x80))               \
 753       {                                         \
 754         if ((c & 0xFE) != 0xC0)                 \
 755           error ("Undecodable char found");     \
 756         c = ((c & 1) << 6) | *src++;            \
 757       }                                         \
 758     consumed_chars++;                           \
 759   } while (0)
 760
 761
 762 /* Store a byte C in the place pointed by DST and increment DST to the
 763    next free point, and increment PRODUCED_CHARS.  The caller should
 764    assure that C is 0..127, and declare and set the variable `dst'
 765    appropriately in advance.
 766 */
 767
 768
 769 #define EMIT_ONE_ASCII_BYTE(c)  \
 770   do {                          \
 771     produced_chars++;           \
 772     *dst++ = (c);               \
 773   } while (0)
 774
 775
 776 /* Like EMIT_ONE_ASCII_BYTE byt store two bytes; C1 and C2.  */
 777
 778 #define EMIT_TWO_ASCII_BYTES(c1, c2)    \
 779   do {                                  \
 780     produced_chars += 2;                \
 781     *dst++ = (c1), *dst++ = (c2);       \
 782   } while (0)
 783
 784
 785 /* Store a byte C in the place pointed by DST and increment DST to the
 786    next free point, and increment PRODUCED_CHARS.  If MULTIBYTEP is
 787    nonzero, store in an appropriate multibyte from.  The caller should
 788    declare and set the variables `dst' and `multibytep' appropriately
 789    in advance.  */
 790
 791 #define EMIT_ONE_BYTE(c)                \
 792   do {                                  \
 793     produced_chars++;                   \
 794     if (multibytep)                     \
 795       {                                 \
 796         int ch = (c);                   \
 797         if (ch >= 0x80)                 \
 798           ch = BYTE8_TO_CHAR (ch);      \
 799         CHAR_STRING_ADVANCE (ch, dst);  \
 800       }                                 \
 801     else                                \
 802       *dst++ = (c);                     \
 803   } while (0)
 804
 805
 806 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2.  */
 807
 808 #define EMIT_TWO_BYTES(c1, c2)          \
 809   do {                                  \
 810     produced_chars += 2;                \
 811     if (multibytep)                     \
 812       {                                 \
 813         int ch;                         \
 814                                         \
 815         ch = (c1);                      \
 816         if (ch >= 0x80)                 \
 817           ch = BYTE8_TO_CHAR (ch);      \
 818         CHAR_STRING_ADVANCE (ch, dst);  \
 819         ch = (c2);                      \
 820         if (ch >= 0x80)                 \
 821           ch = BYTE8_TO_CHAR (ch);      \
 822         CHAR_STRING_ADVANCE (ch, dst);  \
 823       }                                 \
 824     else                                \
 825       {                                 \
 826         *dst++ = (c1);                  \
 827         *dst++ = (c2);                  \
 828       }                                 \
 829   } while (0)
 830
 831
 832 #define EMIT_THREE_BYTES(c1, c2, c3)    \
 833   do {                                  \
 834     EMIT_ONE_BYTE (c1);                 \
 835     EMIT_TWO_BYTES (c2, c3);            \
 836   } while (0)
 837
 838
 839 #define EMIT_FOUR_BYTES(c1, c2, c3, c4)         \
 840   do {                                          \
 841     EMIT_TWO_BYTES (c1, c2);                    \
 842     EMIT_TWO_BYTES (c3, c4);                    \
 843   } while (0)
 844
 845
 846 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
 847   do {                                                                       \
 848     charset_map_loaded = 0;                                                  \
 849     c = DECODE_CHAR (charset, code);                                         \
 850     if (charset_map_loaded)                                                  \
 851       {                                                                      \
 852         const unsigned char *orig = coding->source;                          \
 853         EMACS_INT offset;                                                    \
 854                                                                              \
 855         coding_set_source (coding);                                          \
 856         offset = coding->source - orig;                                      \
 857         src += offset;                                                       \
 858         src_base += offset;                                                  \
 859         src_end += offset;                                                   \
 860       }                                                                      \
 861   } while (0)
 862
 863
 864 #define ASSURE_DESTINATION(bytes)                               \
 865   do {                                                          \
 866     if (dst + (bytes) >= dst_end)                               \
 867       {                                                         \
 868         int more_bytes = charbuf_end - charbuf + (bytes);       \
 869                                                                 \
 870         dst = alloc_destination (coding, more_bytes, dst);      \
 871         dst_end = coding->destination + coding->dst_bytes;      \
 872       }                                                         \
 873   } while (0)
 874
 875
 876
 877 static void
 878 coding_set_source (coding)
 879      struct coding_system *coding;
 880 {
 881   if (BUFFERP (coding->src_object))
 882     {
 883       struct buffer *buf = XBUFFER (coding->src_object);
 884
 885       if (coding->src_pos < 0)
 886         coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
 887       else
 888         coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
 889     }
 890   else if (STRINGP (coding->src_object))
 891     {
 892       coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
 893     }
 894   else
 895     /* Otherwise, the source is C string and is never relocated
 896        automatically.  Thus we don't have to update anything.  */
 897     ;
 898 }
 899
 900 static void
 901 coding_set_destination (coding)
 902      struct coding_system *coding;
 903 {
 904   if (BUFFERP (coding->dst_object))
 905     {
 906       if (coding->src_pos < 0)
 907         {
 908           coding->destination = BEG_ADDR + coding->dst_pos_byte - 1;
 909           coding->dst_bytes = (GAP_END_ADDR
 910                                - (coding->src_bytes - coding->consumed)
 911                                - coding->destination);
 912         }
 913       else
 914         {
 915           /* We are sure that coding->dst_pos_byte is before the gap
 916              of the buffer. */
 917           coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
 918                                  + coding->dst_pos_byte - 1);
 919           coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
 920                                - coding->destination);
 921         }
 922     }
 923   else
 924     /* Otherwise, the destination is C string and is never relocated
 925        automatically.  Thus we don't have to update anything.  */
 926     ;
 927 }
 928
 929
 930 static void
 931 coding_alloc_by_realloc (coding, bytes)
 932      struct coding_system *coding;
 933      EMACS_INT bytes;
 934 {
 935   coding->destination = (unsigned char *) xrealloc (coding->destination,
 936                                                     coding->dst_bytes + bytes);
 937   coding->dst_bytes += bytes;
 938 }
 939
 940 static void
 941 coding_alloc_by_making_gap (coding, bytes)
 942      struct coding_system *coding;
 943      EMACS_INT bytes;
 944 {
 945   if (BUFFERP (coding->dst_object)
 946       && EQ (coding->src_object, coding->dst_object))
 947     {
 948       EMACS_INT add = coding->src_bytes - coding->consumed;
 949
 950       GAP_SIZE -= add; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
 951       make_gap (bytes);
 952       GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
 953     }
 954   else
 955     {
 956       Lisp_Object this_buffer;
 957
 958       this_buffer = Fcurrent_buffer ();
 959       set_buffer_internal (XBUFFER (coding->dst_object));
 960       make_gap (bytes);
 961       set_buffer_internal (XBUFFER (this_buffer));
 962     }
 963 }
 964
 965
 966 static unsigned char *
 967 alloc_destination (coding, nbytes, dst)
 968      struct coding_system *coding;
 969      EMACS_INT nbytes;
 970      unsigned char *dst;
 971 {
 972   EMACS_INT offset = dst - coding->destination;
 973
 974   if (BUFFERP (coding->dst_object))
 975     coding_alloc_by_making_gap (coding, nbytes);
 976   else
 977     coding_alloc_by_realloc (coding, nbytes);
 978   coding->result = CODING_RESULT_SUCCESS;
 979   coding_set_destination (coding);
 980   dst = coding->destination + offset;
 981   return dst;
 982 }
 983
 984 /** Macros for annotations.  */
 985
 986 /* Maximum length of annotation data (sum of annotations for
 987    composition and charset).  */
 988 #define MAX_ANNOTATION_LENGTH (5 + (MAX_COMPOSITION_COMPONENTS * 2) - 1 + 5)
 989
 990 /* An annotation data is stored in the array coding->charbuf in this
 991    format:
 992      [ -LENGTH ANNOTATION_MASK FROM TO ... ]
 993    LENGTH is the number of elements in the annotation.
 994    ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
 995    FROM and TO specify the range of text annotated.  They are relative
 996    to coding->src_pos (on encoding) or coding->dst_pos (on decoding).
 997
 998    The format of the following elements depend on ANNOTATION_MASK.
 999
1000    In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
1001    follows:
1002      ... METHOD [ COMPOSITION-COMPONENTS ... ]
1003    METHOD is one of enum composition_method.
1004    Optionnal COMPOSITION-COMPONENTS are characters and composition
1005    rules.
1006
1007    In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1008    follows.  */
1009
1010 #define ADD_ANNOTATION_DATA(buf, len, mask, from, to)   \
1011   do {                                                  \
1012     *(buf)++ = -(len);                                  \
1013     *(buf)++ = (mask);                                  \
1014     *(buf)++ = (from);                                  \
1015     *(buf)++ = (to);                                    \
1016     coding->annotated = 1;                              \
1017   } while (0);
1018
1019 #define ADD_COMPOSITION_DATA(buf, from, to, method)                           \
1020   do {                                                                        \
1021     ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, from, to); \
1022     *buf++ = method;                                                          \
1023   } while (0)
1024
1025
1026 #define ADD_CHARSET_DATA(buf, from, to, id)                               \
1027   do {                                                                    \
1028     ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_CHARSET_MASK, from, to); \
1029     *buf++ = id;                                                          \
1030   } while (0)
1031
1032 \f
1033 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1034
1035
1036
1037 \f
1038 /*** 3. UTF-8 ***/
1039
1040 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1041    Check if a text is encoded in UTF-8.  If it is, return 1, else
1042    return 0.  */
1043
1044 #define UTF_8_1_OCTET_P(c)         ((c) < 0x80)
1045 #define UTF_8_EXTRA_OCTET_P(c)     (((c) & 0xC0) == 0x80)
1046 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
1047 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
1048 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
1049 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
1050
1051 static int
1052 detect_coding_utf_8 (coding, detect_info)
1053      struct coding_system *coding;
1054      struct coding_detection_info *detect_info;
1055 {
1056   const unsigned char *src = coding->source, *src_base = src;
1057   const unsigned char *src_end = coding->source + coding->src_bytes;
1058   int multibytep = coding->src_multibyte;
1059   int consumed_chars = 0;
1060   int found = 0;
1061   int incomplete;
1062
1063   detect_info->checked |= CATEGORY_MASK_UTF_8;
1064   /* A coding system of this category is always ASCII compatible.  */
1065   src += coding->head_ascii;
1066
1067   while (1)
1068     {
1069       int c, c1, c2, c3, c4;
1070
1071       incomplete = 0;
1072       ONE_MORE_BYTE (c);
1073       if (UTF_8_1_OCTET_P (c))
1074         continue;
1075       incomplete = 1;
1076       ONE_MORE_BYTE (c1);
1077       if (! UTF_8_EXTRA_OCTET_P (c1))
1078         break;
1079       if (UTF_8_2_OCTET_LEADING_P (c))
1080         {
1081           found = CATEGORY_MASK_UTF_8;
1082           continue;
1083         }
1084       ONE_MORE_BYTE (c2);
1085       if (! UTF_8_EXTRA_OCTET_P (c2))
1086         break;
1087       if (UTF_8_3_OCTET_LEADING_P (c))
1088         {
1089           found = CATEGORY_MASK_UTF_8;
1090           continue;
1091         }
1092       ONE_MORE_BYTE (c3);
1093       if (! UTF_8_EXTRA_OCTET_P (c3))
1094         break;
1095       if (UTF_8_4_OCTET_LEADING_P (c))
1096         {
1097           found = CATEGORY_MASK_UTF_8;
1098           continue;
1099         }
1100       ONE_MORE_BYTE (c4);
1101       if (! UTF_8_EXTRA_OCTET_P (c4))
1102         break;
1103       if (UTF_8_5_OCTET_LEADING_P (c))
1104         {
1105           found = CATEGORY_MASK_UTF_8;
1106           continue;
1107         }
1108       break;
1109     }
1110   detect_info->rejected |= CATEGORY_MASK_UTF_8;
1111   return 0;
1112
1113  no_more_source:
1114   if (incomplete && coding->mode & CODING_MODE_LAST_BLOCK)
1115     {
1116       detect_info->rejected |= CATEGORY_MASK_UTF_8;
1117       return 0;
1118     }
1119   detect_info->found |= found;
1120   return 1;
1121 }
1122
1123
1124 static void
1125 decode_coding_utf_8 (coding)
1126      struct coding_system *coding;
1127 {
1128   const unsigned char *src = coding->source + coding->consumed;
1129   const unsigned char *src_end = coding->source + coding->src_bytes;
1130   const unsigned char *src_base;
1131   int *charbuf = coding->charbuf;
1132   int *charbuf_end = charbuf + coding->charbuf_size;
1133   int consumed_chars = 0, consumed_chars_base;
1134   int multibytep = coding->src_multibyte;
1135   Lisp_Object attr, eol_type, charset_list;
1136
1137   CODING_GET_INFO (coding, attr, eol_type, charset_list);
1138
1139   while (1)
1140     {
1141       int c, c1, c2, c3, c4, c5;
1142
1143       src_base = src;
1144       consumed_chars_base = consumed_chars;
1145
1146       if (charbuf >= charbuf_end)
1147         break;
1148
1149       ONE_MORE_BYTE (c1);
1150       if (UTF_8_1_OCTET_P(c1))
1151         {
1152           c = c1;
1153           if (c == '\r')
1154             {
1155               if (EQ (eol_type, Qdos))
1156                 {
1157                   if (src == src_end)
1158                     {
1159                       coding->result = CODING_RESULT_INSUFFICIENT_SRC;
1160                       goto no_more_source;
1161                     }
1162                   if (*src == '\n')
1163                     ONE_MORE_BYTE (c);
1164                 }
1165               else if (EQ (eol_type, Qmac))
1166                 c = '\n';
1167             }
1168         }
1169       else
1170         {
1171           ONE_MORE_BYTE (c2);
1172           if (! UTF_8_EXTRA_OCTET_P (c2))
1173             goto invalid_code;
1174           if (UTF_8_2_OCTET_LEADING_P (c1))
1175             {
1176               c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1177               /* Reject overlong sequences here and below.  Encoders
1178                  producing them are incorrect, they can be misleading,
1179                  and they mess up read/write invariance.  */
1180               if (c < 128)
1181                 goto invalid_code;
1182             }
1183           else
1184             {
1185               ONE_MORE_BYTE (c3);
1186               if (! UTF_8_EXTRA_OCTET_P (c3))
1187                 goto invalid_code;
1188               if (UTF_8_3_OCTET_LEADING_P (c1))
1189                 {
1190                   c = (((c1 & 0xF) << 12)
1191                        | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
1192                   if (c < 0x800
1193                       || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
1194                     goto invalid_code;
1195                 }
1196               else
1197                 {
1198                   ONE_MORE_BYTE (c4);
1199                   if (! UTF_8_EXTRA_OCTET_P (c4))
1200                     goto invalid_code;
1201                   if (UTF_8_4_OCTET_LEADING_P (c1))
1202                     {
1203                     c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1204                          | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
1205                     if (c < 0x10000)
1206                       goto invalid_code;
1207                     }
1208                   else
1209                     {
1210                       ONE_MORE_BYTE (c5);
1211                       if (! UTF_8_EXTRA_OCTET_P (c5))
1212                         goto invalid_code;
1213                       if (UTF_8_5_OCTET_LEADING_P (c1))
1214                         {
1215                           c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1216                                | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1217                                | (c5 & 0x3F));
1218                           if ((c > MAX_CHAR) || (c < 0x200000))
1219                             goto invalid_code;
1220                         }
1221                       else
1222                         goto invalid_code;
1223                     }
1224                 }
1225             }
1226         }
1227
1228       *charbuf++ = c;
1229       continue;
1230
1231     invalid_code:
1232       src = src_base;
1233       consumed_chars = consumed_chars_base;
1234       ONE_MORE_BYTE (c);
1235       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1236       coding->errors++;
1237     }
1238
1239  no_more_source:
1240   coding->consumed_char += consumed_chars_base;
1241   coding->consumed = src_base - coding->source;
1242   coding->charbuf_used = charbuf - coding->charbuf;
1243 }
1244
1245
1246 static int
1247 encode_coding_utf_8 (coding)
1248      struct coding_system *coding;
1249 {
1250   int multibytep = coding->dst_multibyte;
1251   int *charbuf = coding->charbuf;
1252   int *charbuf_end = charbuf + coding->charbuf_used;
1253   unsigned char *dst = coding->destination + coding->produced;
1254   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1255   int produced_chars = 0;
1256   int c;
1257
1258   if (multibytep)
1259     {
1260       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1261
1262       while (charbuf < charbuf_end)
1263         {
1264           unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1265
1266           ASSURE_DESTINATION (safe_room);
1267           c = *charbuf++;
1268           if (CHAR_BYTE8_P (c))
1269             {
1270               c = CHAR_TO_BYTE8 (c);
1271               EMIT_ONE_BYTE (c);
1272             }
1273           else
1274             {
1275               CHAR_STRING_ADVANCE (c, pend);
1276               for (p = str; p < pend; p++)
1277                 EMIT_ONE_BYTE (*p);
1278             }
1279         }
1280     }
1281   else
1282     {
1283       int safe_room = MAX_MULTIBYTE_LENGTH;
1284
1285       while (charbuf < charbuf_end)
1286         {
1287           ASSURE_DESTINATION (safe_room);
1288           c = *charbuf++;
1289           dst += CHAR_STRING (c, dst);
1290           produced_chars++;
1291         }
1292     }
1293   coding->result = CODING_RESULT_SUCCESS;
1294   coding->produced_char += produced_chars;
1295   coding->produced = dst - coding->destination;
1296   return 0;
1297 }
1298
1299
1300 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1301    Check if a text is encoded in one of UTF-16 based coding systems.
1302    If it is, return 1, else return 0.  */
1303
1304 #define UTF_16_HIGH_SURROGATE_P(val) \
1305   (((val) & 0xFC00) == 0xD800)
1306
1307 #define UTF_16_LOW_SURROGATE_P(val) \
1308   (((val) & 0xFC00) == 0xDC00)
1309
1310 #define UTF_16_INVALID_P(val)   \
1311   (((val) == 0xFFFE)            \
1312    || ((val) == 0xFFFF)         \
1313    || UTF_16_LOW_SURROGATE_P (val))
1314
1315
1316 static int
1317 detect_coding_utf_16 (coding, detect_info)
1318      struct coding_system *coding;
1319      struct coding_detection_info *detect_info;
1320 {
1321   const unsigned char *src = coding->source, *src_base = src;
1322   const unsigned char *src_end = coding->source + coding->src_bytes;
1323   int multibytep = coding->src_multibyte;
1324   int consumed_chars = 0;
1325   int c1, c2;
1326
1327   detect_info->checked |= CATEGORY_MASK_UTF_16;
1328
1329   if (coding->mode & CODING_MODE_LAST_BLOCK
1330       && (coding->src_bytes & 1))
1331     {
1332       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1333       return 0;
1334     }
1335   ONE_MORE_BYTE (c1);
1336   ONE_MORE_BYTE (c2);
1337
1338   if ((c1 == 0xFF) && (c2 == 0xFE))
1339     {
1340       detect_info->found |= (CATEGORY_MASK_UTF_16_LE
1341                              | CATEGORY_MASK_UTF_16_AUTO);
1342       detect_info->rejected |= CATEGORY_MASK_UTF_16_BE;
1343     }
1344   else if ((c1 == 0xFE) && (c2 == 0xFF))
1345     {
1346       detect_info->found |= (CATEGORY_MASK_UTF_16_BE
1347                              | CATEGORY_MASK_UTF_16_AUTO);
1348       detect_info->rejected |= CATEGORY_MASK_UTF_16_LE;
1349     }
1350  no_more_source:
1351   return 1;
1352 }
1353
1354 static void
1355 decode_coding_utf_16 (coding)
1356      struct coding_system *coding;
1357 {
1358   const unsigned char *src = coding->source + coding->consumed;
1359   const unsigned char *src_end = coding->source + coding->src_bytes;
1360   const unsigned char *src_base;
1361   int *charbuf = coding->charbuf;
1362   int *charbuf_end = charbuf + coding->charbuf_size;
1363   int consumed_chars = 0, consumed_chars_base;
1364   int multibytep = coding->src_multibyte;
1365   enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding);
1366   enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1367   int surrogate = CODING_UTF_16_SURROGATE (coding);
1368   Lisp_Object attr, eol_type, charset_list;
1369
1370   CODING_GET_INFO (coding, attr, eol_type, charset_list);
1371
1372   if (bom == utf_16_with_bom)
1373     {
1374       int c, c1, c2;
1375
1376       src_base = src;
1377       ONE_MORE_BYTE (c1);
1378       ONE_MORE_BYTE (c2);
1379       c = (c1 << 8) | c2;
1380
1381       if (endian == utf_16_big_endian
1382           ? c != 0xFEFF : c != 0xFFFE)
1383         {
1384           /* The first two bytes are not BOM.  Treat them as bytes
1385              for a normal character.  */
1386           src = src_base;
1387           coding->errors++;
1388         }
1389       CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1390     }
1391   else if (bom == utf_16_detect_bom)
1392     {
1393       /* We have already tried to detect BOM and failed in
1394          detect_coding.  */
1395       CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1396     }
1397
1398   while (1)
1399     {
1400       int c, c1, c2;
1401
1402       src_base = src;
1403       consumed_chars_base = consumed_chars;
1404
1405       if (charbuf + 2 >= charbuf_end)
1406         break;
1407
1408       ONE_MORE_BYTE (c1);
1409       ONE_MORE_BYTE (c2);
1410       c = (endian == utf_16_big_endian
1411            ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1412       if (surrogate)
1413         {
1414           if (! UTF_16_LOW_SURROGATE_P (c))
1415             {
1416               if (endian == utf_16_big_endian)
1417                 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1418               else
1419                 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1420               *charbuf++ = c1;
1421               *charbuf++ = c2;
1422               coding->errors++;
1423               if (UTF_16_HIGH_SURROGATE_P (c))
1424                 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1425               else
1426                 *charbuf++ = c;
1427             }
1428           else
1429             {
1430               c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1431               CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1432               *charbuf++ = c;
1433             }
1434         }
1435       else
1436         {
1437           if (UTF_16_HIGH_SURROGATE_P (c))
1438             CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1439           else
1440             *charbuf++ = c;
1441         }
1442     }
1443
1444  no_more_source:
1445   coding->consumed_char += consumed_chars_base;
1446   coding->consumed = src_base - coding->source;
1447   coding->charbuf_used = charbuf - coding->charbuf;
1448 }
1449
1450 static int
1451 encode_coding_utf_16 (coding)
1452      struct coding_system *coding;
1453 {
1454   int multibytep = coding->dst_multibyte;
1455   int *charbuf = coding->charbuf;
1456   int *charbuf_end = charbuf + coding->charbuf_used;
1457   unsigned char *dst = coding->destination + coding->produced;
1458   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1459   int safe_room = 8;
1460   enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding);
1461   int big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1462   int produced_chars = 0;
1463   Lisp_Object attrs, eol_type, charset_list;
1464   int c;
1465
1466   CODING_GET_INFO (coding, attrs, eol_type, charset_list);
1467
1468   if (bom != utf_16_without_bom)
1469     {
1470       ASSURE_DESTINATION (safe_room);
1471       if (big_endian)
1472         EMIT_TWO_BYTES (0xFE, 0xFF);
1473       else
1474         EMIT_TWO_BYTES (0xFF, 0xFE);
1475       CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1476     }
1477
1478   while (charbuf < charbuf_end)
1479     {
1480       ASSURE_DESTINATION (safe_room);
1481       c = *charbuf++;
1482       if (c >= MAX_UNICODE_CHAR)
1483         c = coding->default_char;
1484
1485       if (c < 0x10000)
1486         {
1487           if (big_endian)
1488             EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1489           else
1490             EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1491         }
1492       else
1493         {
1494           int c1, c2;
1495
1496           c -= 0x10000;
1497           c1 = (c >> 10) + 0xD800;
1498           c2 = (c & 0x3FF) + 0xDC00;
1499           if (big_endian)
1500             EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1501           else
1502             EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1503         }
1504     }
1505   coding->result = CODING_RESULT_SUCCESS;
1506   coding->produced = dst - coding->destination;
1507   coding->produced_char += produced_chars;
1508   return 0;
1509 }
1510
1511 \f
1512 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1513
1514 /* Emacs' internal format for representation of multiple character
1515    sets is a kind of multi-byte encoding, i.e. characters are
1516    represented by variable-length sequences of one-byte codes.
1517
1518    ASCII characters and control characters (e.g. `tab', `newline') are
1519    represented by one-byte sequences which are their ASCII codes, in
1520    the range 0x00 through 0x7F.
1521
1522    8-bit characters of the range 0x80..0x9F are represented by
1523    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1524    code + 0x20).
1525
1526    8-bit characters of the range 0xA0..0xFF are represented by
1527    one-byte sequences which are their 8-bit code.
1528
1529    The other characters are represented by a sequence of `base
1530    leading-code', optional `extended leading-code', and one or two
1531    `position-code's.  The length of the sequence is determined by the
1532    base leading-code.  Leading-code takes the range 0x81 through 0x9D,
1533    whereas extended leading-code and position-code take the range 0xA0
1534    through 0xFF.  See `charset.h' for more details about leading-code
1535    and position-code.
1536
1537    --- CODE RANGE of Emacs' internal format ---
1538    character set        range
1539    -------------        -----
1540    ascii                0x00..0x7F
1541    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1542    eight-bit-graphic    0xA0..0xBF
1543    ELSE                 0x81..0x9D + [0xA0..0xFF]+
1544    ---------------------------------------------
1545
1546    As this is the internal character representation, the format is
1547    usually not used externally (i.e. in a file or in a data sent to a
1548    process).  But, it is possible to have a text externally in this
1549    format (i.e. by encoding by the coding system `emacs-mule').
1550
1551    In that case, a sequence of one-byte codes has a slightly different
1552    form.
1553
1554    At first, all characters in eight-bit-control are represented by
1555    one-byte sequences which are their 8-bit code.
1556
1557    Next, character composition data are represented by the byte
1558    sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1559    where,
1560         METHOD is 0xF0 plus one of composition method (enum
1561         composition_method),
1562
1563         BYTES is 0xA0 plus a byte length of this composition data,
1564
1565         CHARS is 0x20 plus a number of characters composed by this
1566         data,
1567
1568         COMPONENTs are characters of multibye form or composition
1569         rules encoded by two-byte of ASCII codes.
1570
1571    In addition, for backward compatibility, the following formats are
1572    also recognized as composition data on decoding.
1573
1574    0x80 MSEQ ...
1575    0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1576
1577    Here,
1578         MSEQ is a multibyte form but in these special format:
1579           ASCII: 0xA0 ASCII_CODE+0x80,
1580           other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1581         RULE is a one byte code of the range 0xA0..0xF0 that
1582         represents a composition rule.
1583   */
1584
1585 char emacs_mule_bytes[256];
1586
1587 int
1588 emacs_mule_char (coding, src, nbytes, nchars, id)
1589      struct coding_system *coding;
1590      unsigned char *src;
1591      int *nbytes, *nchars, *id;
1592 {
1593   const unsigned char *src_end = coding->source + coding->src_bytes;
1594   const unsigned char *src_base = src;
1595   int multibytep = coding->src_multibyte;
1596   struct charset *charset;
1597   unsigned code;
1598   int c;
1599   int consumed_chars = 0;
1600
1601   ONE_MORE_BYTE (c);
1602   switch (emacs_mule_bytes[c])
1603     {
1604     case 2:
1605       if (! (charset = emacs_mule_charset[c]))
1606         goto invalid_code;
1607       ONE_MORE_BYTE (c);
1608       code = c & 0x7F;
1609       break;
1610
1611     case 3:
1612       if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
1613           || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
1614         {
1615           ONE_MORE_BYTE (c);
1616           if (! (charset = emacs_mule_charset[c]))
1617             goto invalid_code;
1618           ONE_MORE_BYTE (c);
1619           code = c & 0x7F;
1620         }
1621       else
1622         {
1623           if (! (charset = emacs_mule_charset[c]))
1624             goto invalid_code;
1625           ONE_MORE_BYTE (c);
1626           code = (c & 0x7F) << 8;
1627           ONE_MORE_BYTE (c);
1628           code |= c & 0x7F;
1629         }
1630       break;
1631
1632     case 4:
1633       ONE_MORE_BYTE (c);
1634       if (! (charset = emacs_mule_charset[c]))
1635         goto invalid_code;
1636       ONE_MORE_BYTE (c);
1637       code = (c & 0x7F) << 8;
1638       ONE_MORE_BYTE (c);
1639       code |= c & 0x7F;
1640       break;
1641
1642     case 1:
1643       code = c;
1644       charset = CHARSET_FROM_ID (ASCII_BYTE_P (code)
1645                                  ? charset_ascii : charset_eight_bit);
1646       break;
1647
1648     default:
1649       abort ();
1650     }
1651   c = DECODE_CHAR (charset, code);
1652   if (c < 0)
1653     goto invalid_code;
1654   *nbytes = src - src_base;
1655   *nchars = consumed_chars;
1656   if (id)
1657     *id = charset->id;
1658   return c;
1659
1660  no_more_source:
1661   return -2;
1662
1663  invalid_code:
1664   return -1;
1665 }
1666
1667
1668 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1669    Check if a text is encoded in `emacs-mule'.  If it is, return 1,
1670    else return 0.  */
1671
1672 static int
1673 detect_coding_emacs_mule (coding, detect_info)
1674      struct coding_system *coding;
1675      struct coding_detection_info *detect_info;
1676 {
1677   const unsigned char *src = coding->source, *src_base = src;
1678   const unsigned char *src_end = coding->source + coding->src_bytes;
1679   int multibytep = coding->src_multibyte;
1680   int consumed_chars = 0;
1681   int c;
1682   int found = 0;
1683   int incomplete;
1684
1685   detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1686   /* A coding system of this category is always ASCII compatible.  */
1687   src += coding->head_ascii;
1688
1689   while (1)
1690     {
1691       incomplete = 0;
1692       ONE_MORE_BYTE (c);
1693       incomplete = 1;
1694
1695       if (c == 0x80)
1696         {
1697           /* Perhaps the start of composite character.  We simple skip
1698              it because analyzing it is too heavy for detecting.  But,
1699              at least, we check that the composite character
1700              constitues of more than 4 bytes.  */
1701           const unsigned char *src_base;
1702
1703         repeat:
1704           src_base = src;
1705           do
1706             {
1707               ONE_MORE_BYTE (c);
1708             }
1709           while (c >= 0xA0);
1710
1711           if (src - src_base <= 4)
1712             break;
1713           found = CATEGORY_MASK_EMACS_MULE;
1714           if (c == 0x80)
1715             goto repeat;
1716         }
1717
1718       if (c < 0x80)
1719         {
1720           if (c < 0x20
1721               && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
1722             break;
1723         }
1724       else
1725         {
1726           const unsigned char *src_base = src - 1;
1727
1728           do
1729             {
1730               ONE_MORE_BYTE (c);
1731             }
1732           while (c >= 0xA0);
1733           if (src - src_base != emacs_mule_bytes[*src_base])
1734             break;
1735           found = CATEGORY_MASK_EMACS_MULE;
1736         }
1737     }
1738   detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1739   return 0;
1740
1741  no_more_source:
1742   if (incomplete && coding->mode & CODING_MODE_LAST_BLOCK)
1743     {
1744       detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1745       return 0;
1746     }
1747   detect_info->found |= found;
1748   return 1;
1749 }
1750
1751
1752 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
1753
1754 /* Decode a character represented as a component of composition
1755    sequence of Emacs 20/21 style at SRC.  Set C to that character and
1756    update SRC to the head of next character (or an encoded composition
1757    rule).  If SRC doesn't points a composition component, set C to -1.
1758    If SRC points an invalid byte sequence, global exit by a return
1759    value 0.  */
1760
1761 #define DECODE_EMACS_MULE_COMPOSITION_CHAR(buf)                 \
1762   if (1)                                                        \
1763     {                                                           \
1764       int c;                                                    \
1765       int nbytes, nchars;                                       \
1766                                                                 \
1767       if (src == src_end)                                       \
1768         break;                                                  \
1769       c = emacs_mule_char (coding, src, &nbytes, &nchars, NULL);\
1770       if (c < 0)                                                \
1771         {                                                       \
1772           if (c == -2)                                          \
1773             break;                                              \
1774           goto invalid_code;                                    \
1775         }                                                       \
1776       *buf++ = c;                                               \
1777       src += nbytes;                                            \
1778       consumed_chars += nchars;                                 \
1779     }                                                           \
1780   else
1781
1782
1783 /* Decode a composition rule represented as a component of composition
1784    sequence of Emacs 20 style at SRC.  Store the decoded rule in *BUF,
1785    and increment BUF.  If SRC points an invalid byte sequence, set C
1786    to -1.  */
1787
1788 #define DECODE_EMACS_MULE_COMPOSITION_RULE_20(buf)      \
1789   do {                                                  \
1790     int c, gref, nref;                                  \
1791                                                         \
1792     if (src >= src_end)                                 \
1793       goto invalid_code;                                \
1794     ONE_MORE_BYTE_NO_CHECK (c);                         \
1795     c -= 0x20;                                          \
1796     if (c < 0 || c >= 81)                               \
1797       goto invalid_code;                                \
1798                                                         \
1799     gref = c / 9, nref = c % 9;                         \
1800     *buf++ = COMPOSITION_ENCODE_RULE (gref, nref);      \
1801   } while (0)
1802
1803
1804 /* Decode a composition rule represented as a component of composition
1805    sequence of Emacs 21 style at SRC.  Store the decoded rule in *BUF,
1806    and increment BUF.  If SRC points an invalid byte sequence, set C
1807    to -1.  */
1808
1809 #define DECODE_EMACS_MULE_COMPOSITION_RULE_21(buf)      \
1810   do {                                                  \
1811     int gref, nref;                                     \
1812                                                         \
1813     if (src + 1>= src_end)                              \
1814       goto invalid_code;                                \
1815     ONE_MORE_BYTE_NO_CHECK (gref);                      \
1816     gref -= 0x20;                                       \
1817     ONE_MORE_BYTE_NO_CHECK (nref);                      \
1818     nref -= 0x20;                                       \
1819     if (gref < 0 || gref >= 81                          \
1820         || nref < 0 || nref >= 81)                      \
1821       goto invalid_code;                                \
1822     *buf++ = COMPOSITION_ENCODE_RULE (gref, nref);      \
1823   } while (0)
1824
1825
1826 #define DECODE_EMACS_MULE_21_COMPOSITION(c)                             \
1827   do {                                                                  \
1828     /* Emacs 21 style format.  The first three bytes at SRC are         \
1829        (METHOD - 0xF2), (BYTES - 0xA0), (CHARS - 0xA0), where BYTES is  \
1830        the byte length of this composition information, CHARS is the    \
1831        number of characters composed by this composition.  */           \
1832     enum composition_method method = c - 0xF2;                          \
1833     int *charbuf_base = charbuf;                                        \
1834     int from, to;                                                       \
1835     int consumed_chars_limit;                                           \
1836     int nbytes, nchars;                                                 \
1837                                                                         \
1838     ONE_MORE_BYTE (c);                                                  \
1839     nbytes = c - 0xA0;                                                  \
1840     if (nbytes < 3)                                                     \
1841       goto invalid_code;                                                \
1842     ONE_MORE_BYTE (c);                                                  \
1843     nchars = c - 0xA0;                                                  \
1844     from = coding->produced + char_offset;                              \
1845     to = from + nchars;                                                 \
1846     ADD_COMPOSITION_DATA (charbuf, from, to, method);                   \
1847     consumed_chars_limit = consumed_chars_base + nbytes;                \
1848     if (method != COMPOSITION_RELATIVE)                                 \
1849       {                                                                 \
1850         int i = 0;                                                      \
1851         while (consumed_chars < consumed_chars_limit)                   \
1852           {                                                             \
1853             if (i % 2 && method != COMPOSITION_WITH_ALTCHARS)           \
1854               DECODE_EMACS_MULE_COMPOSITION_RULE_21 (charbuf);          \
1855             else                                                        \
1856               DECODE_EMACS_MULE_COMPOSITION_CHAR (charbuf);             \
1857             i++;                                                        \
1858           }                                                             \
1859         if (consumed_chars < consumed_chars_limit)                      \
1860           goto invalid_code;                                            \
1861         charbuf_base[0] -= i;                                           \
1862       }                                                                 \
1863   } while (0)
1864
1865
1866 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION(c)            \
1867   do {                                                          \
1868     /* Emacs 20 style format for relative composition.  */      \
1869     /* Store multibyte form of characters to be composed.  */   \
1870     enum composition_method method = COMPOSITION_RELATIVE;      \
1871     int components[MAX_COMPOSITION_COMPONENTS * 2 - 1];         \
1872     int *buf = components;                                      \
1873     int i, j;                                                   \
1874     int from, to;                                               \
1875                                                                 \
1876     src = src_base;                                             \
1877     ONE_MORE_BYTE (c);          /* skip 0x80 */                 \
1878     for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++)            \
1879       DECODE_EMACS_MULE_COMPOSITION_CHAR (buf);                 \
1880     if (i < 2)                                                  \
1881       goto invalid_code;                                        \
1882     from = coding->produced_char + char_offset;                 \
1883     to = from + i;                                              \
1884     ADD_COMPOSITION_DATA (charbuf, from, to, method);           \
1885     for (j = 0; j < i; j++)                                     \
1886       *charbuf++ = components[j];                               \
1887   } while (0)
1888
1889
1890 #define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION(c)            \
1891   do {                                                          \
1892     /* Emacs 20 style format for rule-base composition.  */     \
1893     /* Store multibyte form of characters to be composed.  */   \
1894     enum composition_method method = COMPOSITION_WITH_RULE;     \
1895     int components[MAX_COMPOSITION_COMPONENTS * 2 - 1];         \
1896     int *buf = components;                                      \
1897     int i, j;                                                   \
1898     int from, to;                                               \
1899                                                                 \
1900     DECODE_EMACS_MULE_COMPOSITION_CHAR (buf);                   \
1901     for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++)            \
1902       {                                                         \
1903         DECODE_EMACS_MULE_COMPOSITION_RULE_20 (buf);            \
1904         DECODE_EMACS_MULE_COMPOSITION_CHAR (buf);               \
1905       }                                                         \
1906     if (i < 1 || (buf - components) % 2 == 0)                   \
1907       goto invalid_code;                                        \
1908     if (charbuf + i + (i / 2) + 1 < charbuf_end)                \
1909       goto no_more_source;                                      \
1910     from = coding->produced_char + char_offset;                 \
1911     to = from + i;                                              \
1912     ADD_COMPOSITION_DATA (buf, from, to, method);               \
1913     for (j = 0; j < i; j++)                                     \
1914       *charbuf++ = components[j];                               \
1915     for (j = 0; j < i; j += 2)                                  \
1916       *charbuf++ = components[j];                               \
1917   } while (0)
1918
1919
1920 static void
1921 decode_coding_emacs_mule (coding)
1922      struct coding_system *coding;
1923 {
1924   const unsigned char *src = coding->source + coding->consumed;
1925   const unsigned char *src_end = coding->source + coding->src_bytes;
1926   const unsigned char *src_base;
1927   int *charbuf = coding->charbuf;
1928   int *charbuf_end = charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
1929   int consumed_chars = 0, consumed_chars_base;
1930   int multibytep = coding->src_multibyte;
1931   Lisp_Object attrs, eol_type, charset_list;
1932   int char_offset = coding->produced_char;
1933   int last_offset = char_offset;
1934   int last_id = charset_ascii;
1935
1936   CODING_GET_INFO (coding, attrs, eol_type, charset_list);
1937
1938   while (1)
1939     {
1940       int c;
1941
1942       src_base = src;
1943       consumed_chars_base = consumed_chars;
1944
1945       if (charbuf >= charbuf_end)
1946         break;
1947
1948       ONE_MORE_BYTE (c);
1949
1950       if (c < 0x80)
1951         {
1952           if (c == '\r')
1953             {
1954               if (EQ (eol_type, Qdos))
1955                 {
1956                   if (src == src_end)
1957                     {
1958                       coding->result = CODING_RESULT_INSUFFICIENT_SRC;
1959                       goto no_more_source;
1960                     }
1961                   if (*src == '\n')
1962                     ONE_MORE_BYTE (c);
1963                 }
1964               else if (EQ (eol_type, Qmac))
1965                 c = '\n';
1966             }
1967           *charbuf++ = c;
1968           char_offset++;
1969         }
1970       else if (c == 0x80)
1971         {
1972           ONE_MORE_BYTE (c);
1973           if (c - 0xF2 >= COMPOSITION_RELATIVE
1974               && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)
1975             DECODE_EMACS_MULE_21_COMPOSITION (c);
1976           else if (c < 0xC0)
1977             DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION (c);
1978           else if (c == 0xFF)
1979             DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION (c);
1980           else
1981             goto invalid_code;
1982         }
1983       else if (c < 0xA0 && emacs_mule_bytes[c] > 1)
1984         {
1985           int nbytes, nchars;
1986           int id;
1987
1988           src = src_base;
1989           consumed_chars = consumed_chars_base;
1990           c = emacs_mule_char (coding, src, &nbytes, &nchars, &id);
1991           if (c < 0)
1992             {
1993               if (c == -2)
1994                 break;
1995               goto invalid_code;
1996             }
1997           if (last_id != id)
1998             {
1999               if (last_id != charset_ascii)
2000                 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
2001               last_id = id;
2002               last_offset = char_offset;
2003             }
2004           *charbuf++ = c;
2005           src += nbytes;
2006           consumed_chars += nchars;
2007           char_offset++;
2008         }
2009       continue;
2010
2011     invalid_code:
2012       src = src_base;
2013       consumed_chars = consumed_chars_base;
2014       ONE_MORE_BYTE (c);
2015       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
2016       char_offset++;
2017       coding->errors++;
2018     }
2019
2020  no_more_source:
2021   if (last_id != charset_ascii)
2022     ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
2023   coding->consumed_char += consumed_chars_base;
2024   coding->consumed = src_base - coding->source;
2025   coding->charbuf_used = charbuf - coding->charbuf;
2026 }
2027
2028
2029 #define EMACS_MULE_LEADING_CODES(id, codes)     \
2030   do {                                          \
2031     if (id < 0xA0)                              \
2032       codes[0] = id, codes[1] = 0;              \
2033     else if (id < 0xE0)                         \
2034       codes[0] = 0x9A, codes[1] = id;           \
2035     else if (id < 0xF0)                         \
2036       codes[0] = 0x9B, codes[1] = id;           \
2037     else if (id < 0xF5)                         \
2038       codes[0] = 0x9C, codes[1] = id;           \
2039     else                                        \
2040       codes[0] = 0x9D, codes[1] = id;           \
2041   } while (0);
2042
2043
2044 static int
2045 encode_coding_emacs_mule (coding)
2046      struct coding_system *coding;
2047 {
2048   int multibytep = coding->dst_multibyte;
2049   int *charbuf = coding->charbuf;
2050   int *charbuf_end = charbuf + coding->charbuf_used;
2051   unsigned char *dst = coding->destination + coding->produced;
2052   unsigned char *dst_end = coding->destination + coding->dst_bytes;
2053   int safe_room = 8;
2054   int produced_chars = 0;
2055   Lisp_Object attrs, eol_type, charset_list;
2056   int c;
2057   int preferred_charset_id = -1;
2058
2059   CODING_GET_INFO (coding, attrs, eol_type, charset_list);
2060   if (! EQ (charset_list, Vemacs_mule_charset_list))
2061     {
2062       CODING_ATTR_CHARSET_LIST (attrs)
2063         = charset_list = Vemacs_mule_charset_list;
2064     }
2065
2066   while (charbuf < charbuf_end)
2067     {
2068       ASSURE_DESTINATION (safe_room);
2069       c = *charbuf++;
2070
2071       if (c < 0)
2072         {
2073           /* Handle an annotation.  */
2074           switch (*charbuf)
2075             {
2076             case CODING_ANNOTATE_COMPOSITION_MASK:
2077               /* Not yet implemented.  */
2078               break;
2079             case CODING_ANNOTATE_CHARSET_MASK:
2080               preferred_charset_id = charbuf[3];
2081               if (preferred_charset_id >= 0
2082                   && NILP (Fmemq (make_number (preferred_charset_id),
2083                                   charset_list)))
2084                 preferred_charset_id = -1;
2085               break;
2086             default:
2087               abort ();
2088             }
2089           charbuf += -c - 1;
2090           continue;
2091         }
2092
2093       if (ASCII_CHAR_P (c))
2094         EMIT_ONE_ASCII_BYTE (c);
2095       else if (CHAR_BYTE8_P (c))
2096         {
2097           c = CHAR_TO_BYTE8 (c);
2098           EMIT_ONE_BYTE (c);
2099         }
2100       else
2101         {
2102           struct charset *charset;
2103           unsigned code;
2104           int dimension;
2105           int emacs_mule_id;
2106           unsigned char leading_codes[2];
2107
2108           if (preferred_charset_id >= 0)
2109             {
2110               charset = CHARSET_FROM_ID (preferred_charset_id);
2111               if (! CHAR_CHARSET_P (c, charset))
2112                 charset = char_charset (c, charset_list, NULL);
2113             }
2114           else
2115             charset = char_charset (c, charset_list, &code);
2116           if (! charset)
2117             {
2118               c = coding->default_char;
2119               if (ASCII_CHAR_P (c))
2120                 {
2121                   EMIT_ONE_ASCII_BYTE (c);
2122                   continue;
2123                 }
2124               charset = char_charset (c, charset_list, &code);
2125             }
2126           dimension = CHARSET_DIMENSION (charset);
2127           emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2128           EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2129           EMIT_ONE_BYTE (leading_codes[0]);
2130           if (leading_codes[1])
2131             EMIT_ONE_BYTE (leading_codes[1]);
2132           if (dimension == 1)
2133             EMIT_ONE_BYTE (code | 0x80);
2134           else
2135             {
2136               code |= 0x8080;
2137               EMIT_ONE_BYTE (code >> 8);
2138               EMIT_ONE_BYTE (code & 0xFF);
2139             }
2140         }
2141     }
2142   coding->result = CODING_RESULT_SUCCESS;
2143   coding->produced_char += produced_chars;
2144   coding->produced = dst - coding->destination;
2145   return 0;
2146 }
2147
2148 \f
2149 /*** 7. ISO2022 handlers ***/
2150
2151 /* The following note describes the coding system ISO2022 briefly.
2152    Since the intention of this note is to help understand the
2153    functions in this file, some parts are NOT ACCURATE or are OVERLY
2154    SIMPLIFIED.  For thorough understanding, please refer to the
2155    original document of ISO2022.  This is equivalent to the standard
2156    ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2157
2158    ISO2022 provides many mechanisms to encode several character sets
2159    in 7-bit and 8-bit environments.  For 7-bit environments, all text
2160    is encoded using bytes less than 128.  This may make the encoded
2161    text a little bit longer, but the text passes more easily through
2162    several types of gateway, some of which strip off the MSB (Most
2163    Significant Bit).
2164
2165    There are two kinds of character sets: control character sets and
2166    graphic character sets.  The former contain control characters such
2167    as `newline' and `escape' to provide control functions (control
2168    functions are also provided by escape sequences).  The latter
2169    contain graphic characters such as 'A' and '-'.  Emacs recognizes
2170    two control character sets and many graphic character sets.
2171
2172    Graphic character sets are classified into one of the following
2173    four classes, according to the number of bytes (DIMENSION) and
2174    number of characters in one dimension (CHARS) of the set:
2175    - DIMENSION1_CHARS94
2176    - DIMENSION1_CHARS96
2177    - DIMENSION2_CHARS94
2178    - DIMENSION2_CHARS96
2179
2180    In addition, each character set is assigned an identification tag,
2181    unique for each set, called the "final character" (denoted as <F>
2182    hereafter).  The <F> of each character set is decided by ECMA(*)
2183    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
2184    (0x30..0x3F are for private use only).
2185
2186    Note (*): ECMA = European Computer Manufacturers Association
2187
2188    Here are examples of graphic character sets [NAME(<F>)]:
2189         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2190         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2191         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2192         o DIMENSION2_CHARS96 -- none for the moment
2193
2194    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2195         C0 [0x00..0x1F] -- control character plane 0
2196         GL [0x20..0x7F] -- graphic character plane 0
2197         C1 [0x80..0x9F] -- control character plane 1
2198         GR [0xA0..0xFF] -- graphic character plane 1
2199
2200    A control character set is directly designated and invoked to C0 or
2201    C1 by an escape sequence.  The most common case is that:
2202    - ISO646's  control character set is designated/invoked to C0, and
2203    - ISO6429's control character set is designated/invoked to C1,
2204    and usually these designations/invocations are omitted in encoded
2205    text.  In a 7-bit environment, only C0 can be used, and a control
2206    character for C1 is encoded by an appropriate escape sequence to
2207    fit into the environment.  All control characters for C1 are
2208    defined to have corresponding escape sequences.
2209
2210    A graphic character set is at first designated to one of four
2211    graphic registers (G0 through G3), then these graphic registers are
2212    invoked to GL or GR.  These designations and invocations can be
2213    done independently.  The most common case is that G0 is invoked to
2214    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
2215    these invocations and designations are omitted in encoded text.
2216    In a 7-bit environment, only GL can be used.
2217
2218    When a graphic character set of CHARS94 is invoked to GL, codes
2219    0x20 and 0x7F of the GL area work as control characters SPACE and
2220    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2221    be used.
2222
2223    There are two ways of invocation: locking-shift and single-shift.
2224    With locking-shift, the invocation lasts until the next different
2225    invocation, whereas with single-shift, the invocation affects the
2226    following character only and doesn't affect the locking-shift
2227    state.  Invocations are done by the following control characters or
2228    escape sequences:
2229
2230    ----------------------------------------------------------------------
2231    abbrev  function                  cntrl escape seq   description
2232    ----------------------------------------------------------------------
2233    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
2234    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
2235    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
2236    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
2237    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
2238    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
2239    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
2240    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
2241    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
2242    ----------------------------------------------------------------------
2243    (*) These are not used by any known coding system.
2244
2245    Control characters for these functions are defined by macros
2246    ISO_CODE_XXX in `coding.h'.
2247
2248    Designations are done by the following escape sequences:
2249    ----------------------------------------------------------------------
2250    escape sequence      description
2251    ----------------------------------------------------------------------
2252    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
2253    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
2254    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
2255    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
2256    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
2257    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
2258    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
2259    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
2260    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
2261    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
2262    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
2263    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
2264    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
2265    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
2266    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
2267    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
2268    ----------------------------------------------------------------------
2269
2270    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2271    of dimension 1, chars 94, and final character <F>, etc...
2272
2273    Note (*): Although these designations are not allowed in ISO2022,
2274    Emacs accepts them on decoding, and produces them on encoding
2275    CHARS96 character sets in a coding system which is characterized as
2276    7-bit environment, non-locking-shift, and non-single-shift.
2277
2278    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2279    '(' must be omitted.  We refer to this as "short-form" hereafter.
2280
2281    Now you may notice that there are a lot of ways of encoding the
2282    same multilingual text in ISO2022.  Actually, there exist many
2283    coding systems such as Compound Text (used in X11's inter client
2284    communication, ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2285    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2286    localized platforms), and all of these are variants of ISO2022.
2287
2288    In addition to the above, Emacs handles two more kinds of escape
2289    sequences: ISO6429's direction specification and Emacs' private
2290    sequence for specifying character composition.
2291
2292    ISO6429's direction specification takes the following form:
2293         o CSI ']'      -- end of the current direction
2294         o CSI '0' ']'  -- end of the current direction
2295         o CSI '1' ']'  -- start of left-to-right text
2296         o CSI '2' ']'  -- start of right-to-left text
2297    The control character CSI (0x9B: control sequence introducer) is
2298    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2299
2300    Character composition specification takes the following form:
2301         o ESC '0' -- start relative composition
2302         o ESC '1' -- end composition
2303         o ESC '2' -- start rule-base composition (*)
2304         o ESC '3' -- start relative composition with alternate chars  (**)
2305         o ESC '4' -- start rule-base composition with alternate chars  (**)
2306   Since these are not standard escape sequences of any ISO standard,
2307   the use of them with these meanings is restricted to Emacs only.
2308
2309   (*) This form is used only in Emacs 20.7 and older versions,
2310   but newer versions can safely decode it.
2311   (**) This form is used only in Emacs 21.1 and newer versions,
2312   and older versions can't decode it.
2313
2314   Here's a list of example usages of these composition escape
2315   sequences (categorized by `enum composition_method').
2316
2317   COMPOSITION_RELATIVE:
2318         ESC 0 CHAR [ CHAR ] ESC 1
2319   COMPOSITION_WITH_RULE:
2320         ESC 2 CHAR [ RULE CHAR ] ESC 1
2321   COMPOSITION_WITH_ALTCHARS:
2322         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2323   COMPOSITION_WITH_RULE_ALTCHARS:
2324         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2325
2326 enum iso_code_class_type iso_code_class[256];
2327
2328 #define SAFE_CHARSET_P(coding, id)      \
2329   ((id) <= (coding)->max_charset_id     \
2330    && (coding)->safe_charsets[id] >= 0)
2331
2332
2333 #define SHIFT_OUT_OK(category)  \
2334   (CODING_ISO_INITIAL (&coding_categories[category], 1) >= 0)
2335
2336 static void
2337 setup_iso_safe_charsets (attrs)
2338      Lisp_Object attrs;
2339 {
2340   Lisp_Object charset_list, safe_charsets;
2341   Lisp_Object request;
2342   Lisp_Object reg_usage;
2343   Lisp_Object tail;
2344   int reg94, reg96;
2345   int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2346   int max_charset_id;
2347
2348   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2349   if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2350       && ! EQ (charset_list, Viso_2022_charset_list))
2351     {
2352       CODING_ATTR_CHARSET_LIST (attrs)
2353         = charset_list = Viso_2022_charset_list;
2354       ASET (attrs, coding_attr_safe_charsets, Qnil);
2355     }
2356
2357   if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2358     return;
2359
2360   max_charset_id = 0;
2361   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2362     {
2363       int id = XINT (XCAR (tail));
2364       if (max_charset_id < id)
2365         max_charset_id = id;
2366     }
2367
2368   safe_charsets = Fmake_string (make_number (max_charset_id + 1),
2369                                 make_number (255));
2370   request = AREF (attrs, coding_attr_iso_request);
2371   reg_usage = AREF (attrs, coding_attr_iso_usage);
2372   reg94 = XINT (XCAR (reg_usage));
2373   reg96 = XINT (XCDR (reg_usage));
2374
2375   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2376     {
2377       Lisp_Object id;
2378       Lisp_Object reg;
2379       struct charset *charset;
2380
2381       id = XCAR (tail);
2382       charset = CHARSET_FROM_ID (XINT (id));
2383       reg = Fcdr (Fassq (id, request));
2384       if (! NILP (reg))
2385         SSET (safe_charsets, XINT (id), XINT (reg));
2386       else if (charset->iso_chars_96)
2387         {
2388           if (reg96 < 4)
2389             SSET (safe_charsets, XINT (id), reg96);
2390         }
2391       else
2392         {
2393           if (reg94 < 4)
2394             SSET (safe_charsets, XINT (id), reg94);
2395         }
2396     }
2397   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2398 }
2399
2400
2401 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2402    Check if a text is encoded in one of ISO-2022 based codig systems.
2403    If it is, return 1, else return 0.  */
2404
2405 static int
2406 detect_coding_iso_2022 (coding, detect_info)
2407      struct coding_system *coding;
2408      struct coding_detection_info *detect_info;
2409 {
2410   const unsigned char *src = coding->source, *src_base = src;
2411   const unsigned char *src_end = coding->source + coding->src_bytes;
2412   int multibytep = coding->src_multibyte;
2413   int single_shifting = 0;
2414   int id;
2415   int c, c1;
2416   int consumed_chars = 0;
2417   int i;
2418   int rejected = 0;
2419   int found = 0;
2420
2421   detect_info->checked |= CATEGORY_MASK_ISO;
2422
2423   for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
2424     {
2425       struct coding_system *this = &(coding_categories[i]);
2426       Lisp_Object attrs, val;
2427
2428       attrs = CODING_ID_ATTRS (this->id);
2429       if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2430           && ! EQ (CODING_ATTR_SAFE_CHARSETS (attrs), Viso_2022_charset_list))
2431         setup_iso_safe_charsets (attrs);
2432       val = CODING_ATTR_SAFE_CHARSETS (attrs);
2433       this->max_charset_id = SCHARS (val) - 1;
2434       this->safe_charsets = (char *) SDATA (val);
2435     }
2436
2437   /* A coding system of this category is always ASCII compatible.  */
2438   src += coding->head_ascii;
2439
2440   while (rejected != CATEGORY_MASK_ISO)
2441     {
2442       ONE_MORE_BYTE (c);
2443       switch (c)
2444         {
2445         case ISO_CODE_ESC:
2446           if (inhibit_iso_escape_detection)
2447             break;
2448           single_shifting = 0;
2449           ONE_MORE_BYTE (c);
2450           if (c >= '(' && c <= '/')
2451             {
2452               /* Designation sequence for a charset of dimension 1.  */
2453               ONE_MORE_BYTE (c1);
2454               if (c1 < ' ' || c1 >= 0x80
2455                   || (id = iso_charset_table[0][c >= ','][c1]) < 0)
2456                 /* Invalid designation sequence.  Just ignore.  */
2457                 break;
2458             }
2459           else if (c == '$')
2460             {
2461               /* Designation sequence for a charset of dimension 2.  */
2462               ONE_MORE_BYTE (c);
2463               if (c >= '@' && c <= 'B')
2464                 /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
2465                 id = iso_charset_table[1][0][c];
2466               else if (c >= '(' && c <= '/')
2467                 {
2468                   ONE_MORE_BYTE (c1);
2469                   if (c1 < ' ' || c1 >= 0x80
2470                       || (id = iso_charset_table[1][c >= ','][c1]) < 0)
2471                     /* Invalid designation sequence.  Just ignore.  */
2472                     break;
2473                 }
2474               else
2475                 /* Invalid designation sequence.  Just ignore it.  */
2476                 break;
2477             }
2478           else if (c == 'N' || c == 'O')
2479             {
2480               /* ESC <Fe> for SS2 or SS3.  */
2481               single_shifting = 1;
2482               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
2483               break;
2484             }
2485           else if (c >= '0' && c <= '4')
2486             {
2487               /* ESC <Fp> for start/end composition.  */
2488               found |= CATEGORY_MASK_ISO;
2489               break;
2490             }
2491           else
2492             {
2493               /* Invalid escape sequence.  Just ignore it.  */
2494               break;
2495             }
2496
2497           /* We found a valid designation sequence for CHARSET.  */
2498           rejected |= CATEGORY_MASK_ISO_8BIT;
2499           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
2500                               id))
2501             found |= CATEGORY_MASK_ISO_7;
2502           else
2503             rejected |= CATEGORY_MASK_ISO_7;
2504           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
2505                               id))
2506             found |= CATEGORY_MASK_ISO_7_TIGHT;
2507           else
2508             rejected |= CATEGORY_MASK_ISO_7_TIGHT;
2509           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
2510                               id))
2511             found |= CATEGORY_MASK_ISO_7_ELSE;
2512           else
2513             rejected |= CATEGORY_MASK_ISO_7_ELSE;
2514           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
2515                               id))
2516             found |= CATEGORY_MASK_ISO_8_ELSE;
2517           else
2518             rejected |= CATEGORY_MASK_ISO_8_ELSE;
2519           break;
2520
2521         case ISO_CODE_SO:
2522         case ISO_CODE_SI:
2523           /* Locking shift out/in.  */
2524           if (inhibit_iso_escape_detection)
2525             break;
2526           single_shifting = 0;
2527           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
2528           found |= CATEGORY_MASK_ISO_ELSE;
2529           break;
2530
2531         case ISO_CODE_CSI:
2532           /* Control sequence introducer.  */
2533           single_shifting = 0;
2534           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
2535           found |= CATEGORY_MASK_ISO_8_ELSE;
2536           goto check_extra_latin;
2537
2538
2539         case ISO_CODE_SS2:
2540         case ISO_CODE_SS3:
2541           /* Single shift.   */
2542           if (inhibit_iso_escape_detection)
2543             break;
2544           single_shifting = 1;
2545           rejected |= CATEGORY_MASK_ISO_7BIT;
2546           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2547               & CODING_ISO_FLAG_SINGLE_SHIFT)
2548             found |= CATEGORY_MASK_ISO_8_1;
2549           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
2550               & CODING_ISO_FLAG_SINGLE_SHIFT)
2551             found |= CATEGORY_MASK_ISO_8_2;
2552           goto check_extra_latin;
2553
2554         default:
2555           if (c < 0x80)
2556             {
2557               single_shifting = 0;
2558               break;
2559             }
2560           if (c >= 0xA0)
2561             {
2562               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
2563               found |= CATEGORY_MASK_ISO_8_1;
2564               /* Check the length of succeeding codes of the range
2565                  0xA0..0FF.  If the byte length is even, we include
2566                  CATEGORY_MASK_ISO_8_2 in `found'.  We can check this
2567                  only when we are not single shifting.  */
2568               if (! single_shifting
2569                   && ! (rejected & CATEGORY_MASK_ISO_8_2))
2570                 {
2571                   int i = 1;
2572                   while (src < src_end)
2573                     {
2574                       ONE_MORE_BYTE (c);
2575                       if (c < 0xA0)
2576                         break;
2577                       i++;
2578                     }
2579
2580                   if (i & 1 && src < src_end)
2581                     rejected |= CATEGORY_MASK_ISO_8_2;
2582                   else
2583                     found |= CATEGORY_MASK_ISO_8_2;
2584                 }
2585               break;
2586             }
2587         check_extra_latin:
2588           single_shifting = 0;
2589           if (! VECTORP (Vlatin_extra_code_table)
2590               || NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
2591             {
2592               rejected = CATEGORY_MASK_ISO;
2593               break;
2594             }
2595           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2596               & CODING_ISO_FLAG_LATIN_EXTRA)
2597             found |= CATEGORY_MASK_ISO_8_1;
2598           else
2599             rejected |= CATEGORY_MASK_ISO_8_1;
2600           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
2601               & CODING_ISO_FLAG_LATIN_EXTRA)
2602             found |= CATEGORY_MASK_ISO_8_2;
2603           else
2604             rejected |= CATEGORY_MASK_ISO_8_2;
2605         }
2606     }
2607   detect_info->rejected |= CATEGORY_MASK_ISO;
2608   return 0;
2609
2610  no_more_source:
2611   detect_info->rejected |= rejected;
2612   detect_info->found |= (found & ~rejected);
2613   return 1;
2614 }
2615
2616
2617 /* Set designation state into CODING.  */
2618 #define DECODE_DESIGNATION(reg, dim, chars_96, final)                   \
2619   do {                                                                  \
2620     int id, prev;                                                       \
2621                                                                         \
2622     if (final < '0' || final >= 128                                     \
2623         || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0)        \
2624         || !SAFE_CHARSET_P (coding, id))                                \
2625       {                                                                 \
2626         CODING_ISO_DESIGNATION (coding, reg) = -2;                      \
2627         goto invalid_code;                                              \
2628       }                                                                 \
2629     prev = CODING_ISO_DESIGNATION (coding, reg);                        \
2630     if (id == charset_jisx0201_roman)                                   \
2631       {                                                                 \
2632         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)      \
2633           id = charset_ascii;                                           \
2634       }                                                                 \
2635     else if (id == charset_jisx0208_1978)                               \
2636       {                                                                 \
2637         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)     \
2638           id = charset_jisx0208;                                        \
2639       }                                                                 \
2640     CODING_ISO_DESIGNATION (coding, reg) = id;                          \
2641     /* If there was an invalid designation to REG previously, and this  \
2642        designation is ASCII to REG, we should keep this designation     \
2643        sequence.  */                                                    \
2644     if (prev == -2 && id == charset_ascii)                              \
2645       goto invalid_code;                                                \
2646   } while (0)
2647
2648
2649 #define MAYBE_FINISH_COMPOSITION()                              \
2650   do {                                                          \
2651     int i;                                                      \
2652     if (composition_state == COMPOSING_NO)                      \
2653       break;                                                    \
2654     /* It is assured that we have enough room for producing     \
2655        characters stored in the table `components'.  */         \
2656     if (charbuf + component_idx > charbuf_end)                  \
2657       goto no_more_source;                                      \
2658     composition_state = COMPOSING_NO;                           \
2659     if (method == COMPOSITION_RELATIVE                          \
2660         || method == COMPOSITION_WITH_ALTCHARS)                 \
2661       {                                                         \
2662         for (i = 0; i < component_idx; i++)                     \
2663           *charbuf++ = components[i];                           \
2664         char_offset += component_idx;                           \
2665       }                                                         \
2666     else                                                        \
2667       {                                                         \
2668         for (i = 0; i < component_idx; i += 2)                  \
2669           *charbuf++ = components[i];                           \
2670         char_offset += (component_idx / 2) + 1;                 \
2671       }                                                         \
2672   } while (0)
2673
2674
2675 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
2676    ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
2677    ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
2678    ESC 3 : altchar composition :  ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
2679    ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1
2680   */
2681
2682 #define DECODE_COMPOSITION_START(c1)                                    \
2683   do {                                                                  \
2684     if (c1 == '0'                                                       \
2685         && composition_state == COMPOSING_COMPONENT_RULE)               \
2686       {                                                                 \
2687         component_len = component_idx;                                  \
2688         composition_state = COMPOSING_CHAR;                             \
2689       }                                                                 \
2690     else                                                                \
2691       {                                                                 \
2692         const unsigned char *p;                                         \
2693                                                                         \
2694         MAYBE_FINISH_COMPOSITION ();                                    \
2695         if (charbuf + MAX_COMPOSITION_COMPONENTS > charbuf_end)         \
2696           goto no_more_source;                                          \
2697         for (p = src; p < src_end - 1; p++)                             \
2698           if (*p == ISO_CODE_ESC && p[1] == '1')                        \
2699             break;                                                      \
2700         if (p == src_end - 1)                                           \
2701           {                                                             \
2702             if (coding->mode & CODING_MODE_LAST_BLOCK)                  \
2703               goto invalid_code;                                        \
2704             goto no_more_source;                                        \
2705           }                                                             \
2706                                                                         \
2707         /* This is surely the start of a composition.  */               \
2708         method = (c1 == '0' ? COMPOSITION_RELATIVE                      \
2709                   : c1 == '2' ? COMPOSITION_WITH_RULE                   \
2710                   : c1 == '3' ? COMPOSITION_WITH_ALTCHARS               \
2711                   : COMPOSITION_WITH_RULE_ALTCHARS);                    \
2712         composition_state = (c1 <= '2' ? COMPOSING_CHAR                 \
2713                              : COMPOSING_COMPONENT_CHAR);               \
2714         component_idx = component_len = 0;                              \
2715       }                                                                 \
2716   } while (0)
2717
2718
2719 /* Handle compositoin end sequence ESC 1.  */
2720
2721 #define DECODE_COMPOSITION_END()                                        \
2722   do {                                                                  \
2723     int nchars = (component_len > 0 ? component_idx - component_len     \
2724                   : method == COMPOSITION_RELATIVE ? component_idx      \
2725                   : (component_idx + 1) / 2);                           \
2726     int i;                                                              \
2727     int *saved_charbuf = charbuf;                                       \
2728     int from = char_offset;                                             \
2729     int to = from + nchars;                                             \
2730                                                                         \
2731     ADD_COMPOSITION_DATA (charbuf, from, to, method);                   \
2732     if (method != COMPOSITION_RELATIVE)                                 \
2733       {                                                                 \
2734         if (component_len == 0)                                         \
2735           for (i = 0; i < component_idx; i++)                           \
2736             *charbuf++ = components[i];                                 \
2737         else                                                            \
2738           for (i = 0; i < component_len; i++)                           \
2739             *charbuf++ = components[i];                                 \
2740         *saved_charbuf = saved_charbuf - charbuf;                       \
2741       }                                                                 \
2742     if (method == COMPOSITION_WITH_RULE)                                \
2743       for (i = 0; i < component_idx; i += 2, char_offset++)             \
2744         *charbuf++ = components[i];                                     \
2745     else                                                                \
2746       for (i = component_len; i < component_idx; i++, char_offset++)    \
2747         *charbuf++ = components[i];                                     \
2748     coding->annotated = 1;                                              \
2749     composition_state = COMPOSING_NO;                                   \
2750   } while (0)
2751
2752
2753 /* Decode a composition rule from the byte C1 (and maybe one more byte
2754    from SRC) and store one encoded composition rule in
2755    coding->cmp_data.  */
2756
2757 #define DECODE_COMPOSITION_RULE(c1)                                     \
2758   do {                                                                  \
2759     (c1) -= 32;                                                         \
2760     if (c1 < 81)                /* old format (before ver.21) */        \
2761       {                                                                 \
2762         int gref = (c1) / 9;                                            \
2763         int nref = (c1) % 9;                                            \
2764         if (gref == 4) gref = 10;                                       \
2765         if (nref == 4) nref = 10;                                       \
2766         c1 = COMPOSITION_ENCODE_RULE (gref, nref);                      \
2767       }                                                                 \
2768     else if (c1 < 93)           /* new format (after ver.21) */         \
2769       {                                                                 \
2770         ONE_MORE_BYTE (c2);                                             \
2771         c1 = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32);                \
2772       }                                                                 \
2773     else                                                                \
2774       c1 = 0;                                                           \
2775   } while (0)
2776
2777
2778 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
2779
2780 static void
2781 decode_coding_iso_2022 (coding)
2782      struct coding_system *coding;
2783 {
2784   const unsigned char *src = coding->source + coding->consumed;
2785   const unsigned char *src_end = coding->source + coding->src_bytes;
2786   const unsigned char *src_base;
2787   int *charbuf = coding->charbuf;
2788   int *charbuf_end
2789     = charbuf + coding->charbuf_size - 4 - MAX_ANNOTATION_LENGTH;
2790   int consumed_chars = 0, consumed_chars_base;
2791   int multibytep = coding->src_multibyte;
2792   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
2793   int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2794   int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
2795   struct charset *charset;
2796   int c;
2797   /* For handling composition sequence.  */
2798 #define COMPOSING_NO                    0
2799 #define COMPOSING_CHAR                  1
2800 #define COMPOSING_RULE                  2
2801 #define COMPOSING_COMPONENT_CHAR        3
2802 #define COMPOSING_COMPONENT_RULE        4
2803
2804   int composition_state = COMPOSING_NO;
2805   enum composition_method method;
2806   int components[MAX_COMPOSITION_COMPONENTS * 2 + 1];
2807   int component_idx;
2808   int component_len;
2809   Lisp_Object attrs, eol_type, charset_list;
2810   int char_offset = coding->produced_char;
2811   int last_offset = char_offset;
2812   int last_id = charset_ascii;
2813
2814   CODING_GET_INFO (coding, attrs, eol_type, charset_list);
2815   setup_iso_safe_charsets (attrs);
2816
2817   while (1)
2818     {
2819       int c1, c2;
2820
2821       src_base = src;
2822       consumed_chars_base = consumed_chars;
2823
2824       if (charbuf >= charbuf_end)
2825         break;
2826
2827       ONE_MORE_BYTE (c1);
2828
2829       /* We produce at most one character.  */
2830       switch (iso_code_class [c1])
2831         {
2832         case ISO_0x20_or_0x7F:
2833           if (composition_state != COMPOSING_NO)
2834             {
2835               if (composition_state == COMPOSING_RULE
2836                   || composition_state == COMPOSING_COMPONENT_RULE)
2837                 {
2838                   DECODE_COMPOSITION_RULE (c1);
2839                   components[component_idx++] = c1;
2840                   composition_state--;
2841                   continue;
2842                 }
2843             }
2844           if (charset_id_0 < 0
2845               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
2846             /* This is SPACE or DEL.  */
2847             charset = CHARSET_FROM_ID (charset_ascii);
2848           else
2849             charset = CHARSET_FROM_ID (charset_id_0);
2850           break;
2851
2852         case ISO_graphic_plane_0:
2853           if (composition_state != COMPOSING_NO)
2854             {
2855               if (composition_state == COMPOSING_RULE
2856                   || composition_state == COMPOSING_COMPONENT_RULE)
2857                 {
2858                   DECODE_COMPOSITION_RULE (c1);
2859                   components[component_idx++] = c1;
2860                   composition_state--;
2861                   continue;
2862                 }
2863             }
2864           charset = CHARSET_FROM_ID (charset_id_0);
2865           break;
2866
2867         case ISO_0xA0_or_0xFF:
2868           if (charset_id_1 < 0
2869               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
2870               || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
2871             goto invalid_code;
2872           /* This is a graphic character, we fall down ... */
2873
2874         case ISO_graphic_plane_1:
2875           if (charset_id_1 < 0)
2876             goto invalid_code;
2877           charset = CHARSET_FROM_ID (charset_id_1);
2878           break;
2879
2880         case ISO_carriage_return:
2881           if (c1 == '\r')
2882             {
2883               if (EQ (eol_type, Qdos))
2884                 {
2885                   if (src == src_end)
2886                     {
2887                       coding->result = CODING_RESULT_INSUFFICIENT_SRC;
2888                       goto no_more_source;
2889                     }
2890                   if (*src == '\n')
2891                     ONE_MORE_BYTE (c1);
2892                 }
2893               else if (EQ (eol_type, Qmac))
2894                 c1 = '\n';
2895             }
2896           /* fall through */
2897
2898         case ISO_control_0:
2899           MAYBE_FINISH_COMPOSITION ();
2900           charset = CHARSET_FROM_ID (charset_ascii);
2901           break;
2902
2903         case ISO_control_1:
2904           MAYBE_FINISH_COMPOSITION ();
2905           goto invalid_code;
2906
2907         case ISO_shift_out:
2908           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
2909               || CODING_ISO_DESIGNATION (coding, 1) < 0)
2910             goto invalid_code;
2911           CODING_ISO_INVOCATION (coding, 0) = 1;
2912           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2913           continue;
2914
2915         case ISO_shift_in:
2916           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
2917             goto invalid_code;
2918           CODING_ISO_INVOCATION (coding, 0) = 0;
2919           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2920           continue;
2921
2922         case ISO_single_shift_2_7:
2923         case ISO_single_shift_2:
2924           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
2925             goto invalid_code;
2926           /* SS2 is handled as an escape sequence of ESC 'N' */
2927           c1 = 'N';
2928           goto label_escape_sequence;
2929
2930         case ISO_single_shift_3:
2931           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
2932             goto invalid_code;
2933           /* SS2 is handled as an escape sequence of ESC 'O' */
2934           c1 = 'O';
2935           goto label_escape_sequence;
2936
2937         case ISO_control_sequence_introducer:
2938           /* CSI is handled as an escape sequence of ESC '[' ...  */
2939           c1 = '[';
2940           goto label_escape_sequence;
2941
2942         case ISO_escape:
2943           ONE_MORE_BYTE (c1);
2944         label_escape_sequence:
2945           /* Escape sequences handled here are invocation,
2946              designation, direction specification, and character
2947              composition specification.  */
2948           switch (c1)
2949             {
2950             case '&':           /* revision of following character set */
2951               ONE_MORE_BYTE (c1);
2952               if (!(c1 >= '@' && c1 <= '~'))
2953                 goto invalid_code;
2954               ONE_MORE_BYTE (c1);
2955               if (c1 != ISO_CODE_ESC)
2956                 goto invalid_code;
2957               ONE_MORE_BYTE (c1);
2958               goto label_escape_sequence;
2959
2960             case '$':           /* designation of 2-byte character set */
2961               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
2962                 goto invalid_code;
2963               ONE_MORE_BYTE (c1);
2964               if (c1 >= '@' && c1 <= 'B')
2965                 {       /* designation of JISX0208.1978, GB2312.1980,
2966                            or JISX0208.1980 */
2967                   DECODE_DESIGNATION (0, 2, 0, c1);
2968                 }
2969               else if (c1 >= 0x28 && c1 <= 0x2B)
2970                 {       /* designation of DIMENSION2_CHARS94 character set */
2971                   ONE_MORE_BYTE (c2);
2972                   DECODE_DESIGNATION (c1 - 0x28, 2, 0, c2);
2973                 }
2974               else if (c1 >= 0x2C && c1 <= 0x2F)
2975                 {       /* designation of DIMENSION2_CHARS96 character set */
2976                   ONE_MORE_BYTE (c2);
2977                   DECODE_DESIGNATION (c1 - 0x2C, 2, 1, c2);
2978                 }
2979               else
2980                 goto invalid_code;
2981               /* We must update these variables now.  */
2982               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2983               charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
2984               continue;
2985
2986             case 'n':           /* invocation of locking-shift-2 */
2987               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
2988                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
2989                 goto invalid_code;
2990               CODING_ISO_INVOCATION (coding, 0) = 2;
2991               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2992               continue;
2993
2994             case 'o':           /* invocation of locking-shift-3 */
2995               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
2996                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
2997                 goto invalid_code;
2998               CODING_ISO_INVOCATION (coding, 0) = 3;
2999               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3000               continue;
3001
3002             case 'N':           /* invocation of single-shift-2 */
3003               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3004                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3005                 goto invalid_code;
3006               charset = CHARSET_FROM_ID (CODING_ISO_DESIGNATION (coding, 2));
3007               ONE_MORE_BYTE (c1);
3008               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3009                 goto invalid_code;
3010               break;
3011
3012             case 'O':           /* invocation of single-shift-3 */
3013               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3014                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3015                 goto invalid_code;
3016               charset = CHARSET_FROM_ID (CODING_ISO_DESIGNATION (coding, 3));
3017               ONE_MORE_BYTE (c1);
3018               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3019                 goto invalid_code;
3020               break;
3021
3022             case '0': case '2': case '3': case '4': /* start composition */
3023               if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3024                 goto invalid_code;
3025               DECODE_COMPOSITION_START (c1);
3026               continue;
3027
3028             case '1':           /* end composition */
3029               if (composition_state == COMPOSING_NO)
3030                 goto invalid_code;
3031               DECODE_COMPOSITION_END ();
3032               continue;
3033
3034             case '[':           /* specification of direction */
3035               if (! CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION)
3036                 goto invalid_code;
3037               /* For the moment, nested direction is not supported.
3038                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
3039                  left-to-right, and nozero means right-to-left.  */
3040               ONE_MORE_BYTE (c1);
3041               switch (c1)
3042                 {
3043                 case ']':       /* end of the current direction */
3044                   coding->mode &= ~CODING_MODE_DIRECTION;
3045
3046                 case '0':       /* end of the current direction */
3047                 case '1':       /* start of left-to-right direction */
3048                   ONE_MORE_BYTE (c1);
3049                   if (c1 == ']')
3050                     coding->mode &= ~CODING_MODE_DIRECTION;
3051                   else
3052                     goto invalid_code;
3053                   break;
3054
3055                 case '2':       /* start of right-to-left direction */
3056                   ONE_MORE_BYTE (c1);
3057                   if (c1 == ']')
3058                     coding->mode |= CODING_MODE_DIRECTION;
3059                   else
3060                     goto invalid_code;
3061                   break;
3062
3063                 default:
3064                   goto invalid_code;
3065                 }
3066               continue;
3067
3068             case '%':
3069               ONE_MORE_BYTE (c1);
3070               if (c1 == '/')
3071                 {
3072                   /* CTEXT extended segment:
3073                      ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3074                      We keep these bytes as is for the moment.
3075                      They may be decoded by post-read-conversion.  */
3076                   int dim, M, L;
3077                   int size;
3078
3079                   ONE_MORE_BYTE (dim);
3080                   ONE_MORE_BYTE (M);
3081                   ONE_MORE_BYTE (L);
3082                   size = ((M - 128) * 128) + (L - 128);
3083                   if (charbuf + 8 + size > charbuf_end)
3084                     goto break_loop;
3085                   *charbuf++ = ISO_CODE_ESC;
3086                   *charbuf++ = '%';
3087                   *charbuf++ = '/';
3088                   *charbuf++ = dim;
3089                   *charbuf++ = BYTE8_TO_CHAR (M);
3090                   *charbuf++ = BYTE8_TO_CHAR (L);
3091                   while (size-- > 0)
3092                     {
3093                       ONE_MORE_BYTE (c1);
3094                       *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3095                     }
3096                 }
3097               else if (c1 == 'G')
3098                 {
3099                   /* XFree86 extension for embedding UTF-8 in CTEXT:
3100                      ESC % G --UTF-8-BYTES-- ESC % @
3101                      We keep these bytes as is for the moment.
3102                      They may be decoded by post-read-conversion.  */
3103                   int *p = charbuf;
3104
3105                   if (p + 6 > charbuf_end)
3106                     goto break_loop;
3107                   *p++ = ISO_CODE_ESC;
3108                   *p++ = '%';
3109                   *p++ = 'G';
3110                   while (p < charbuf_end)
3111                     {
3112                       ONE_MORE_BYTE (c1);
3113                       if (c1 == ISO_CODE_ESC
3114                           && src + 1 < src_end
3115                           && src[0] == '%'
3116                           && src[1] == '@')
3117                         break;
3118                       *p++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3119                     }
3120                   if (p + 3 > charbuf_end)
3121                     goto break_loop;
3122                   *p++ = ISO_CODE_ESC;
3123                   *p++ = '%';
3124                   *p++ = '@';
3125                   charbuf = p;
3126                 }
3127               else
3128                 goto invalid_code;
3129               continue;
3130               break;
3131
3132             default:
3133               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3134                 goto invalid_code;
3135               if (c1 >= 0x28 && c1 <= 0x2B)
3136                 {       /* designation of DIMENSION1_CHARS94 character set */
3137                   ONE_MORE_BYTE (c2);
3138                   DECODE_DESIGNATION (c1 - 0x28, 1, 0, c2);
3139                 }
3140               else if (c1 >= 0x2C && c1 <= 0x2F)
3141                 {       /* designation of DIMENSION1_CHARS96 character set */
3142                   ONE_MORE_BYTE (c2);
3143                   DECODE_DESIGNATION (c1 - 0x2C, 1, 1, c2);
3144                 }
3145               else
3146                 goto invalid_code;
3147               /* We must update these variables now.  */
3148               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3149               charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3150               continue;
3151             }
3152         }
3153
3154       if (charset->id != charset_ascii
3155           && last_id != charset->id)
3156         {
3157           if (last_id != charset_ascii)
3158             ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
3159           last_id = charset->id;
3160           last_offset = char_offset;
3161         }
3162
3163       /* Now we know CHARSET and 1st position code C1 of a character.
3164          Produce a decoded character while getting 2nd position code
3165          C2 if necessary.  */
3166       c1 &= 0x7F;
3167       if (CHARSET_DIMENSION (charset) > 1)
3168         {
3169           ONE_MORE_BYTE (c2);
3170           if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
3171             /* C2 is not in a valid range.  */
3172             goto invalid_code;
3173           c1 = (c1 << 8) | (c2 & 0x7F);
3174           if (CHARSET_DIMENSION (charset) > 2)
3175             {
3176               ONE_MORE_BYTE (c2);
3177               if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
3178                 /* C2 is not in a valid range.  */
3179                 goto invalid_code;
3180               c1 = (c1 << 8) | (c2 & 0x7F);
3181             }
3182         }
3183
3184       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3185       if (c < 0)
3186         {
3187           MAYBE_FINISH_COMPOSITION ();
3188           for (; src_base < src; src_base++, char_offset++)
3189             {
3190               if (ASCII_BYTE_P (*src_base))
3191                 *charbuf++ = *src_base;
3192               else
3193                 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3194             }
3195         }
3196       else if (composition_state == COMPOSING_NO)
3197         {
3198           *charbuf++ = c;
3199           char_offset++;
3200         }
3201       else
3202         {
3203           components[component_idx++] = c;
3204           if (method == COMPOSITION_WITH_RULE
3205               || (method == COMPOSITION_WITH_RULE_ALTCHARS
3206                   && composition_state == COMPOSING_COMPONENT_CHAR))
3207             composition_state++;
3208         }
3209       continue;
3210
3211     invalid_code:
3212       MAYBE_FINISH_COMPOSITION ();
3213       src = src_base;
3214       consumed_chars = consumed_chars_base;
3215       ONE_MORE_BYTE (c);
3216       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
3217       char_offset++;
3218       coding->errors++;
3219       continue;
3220
3221     break_loop:
3222       break;
3223     }
3224
3225  no_more_source:
3226   if (last_id != charset_ascii)
3227     ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
3228   coding->consumed_char += consumed_chars_base;
3229   coding->consumed = src_base - coding->source;
3230   coding->charbuf_used = charbuf - coding->charbuf;
3231 }
3232
3233
3234 /* ISO2022 encoding stuff.  */
3235
3236 /*
3237    It is not enough to say just "ISO2022" on encoding, we have to
3238    specify more details.  In Emacs, each coding system of ISO2022
3239    variant has the following specifications:
3240         1. Initial designation to G0 thru G3.
3241         2. Allows short-form designation?
3242         3. ASCII should be designated to G0 before control characters?
3243         4. ASCII should be designated to G0 at end of line?
3244         5. 7-bit environment or 8-bit environment?
3245         6. Use locking-shift?
3246         7. Use Single-shift?
3247    And the following two are only for Japanese:
3248         8. Use ASCII in place of JIS0201-1976-Roman?
3249         9. Use JISX0208-1983 in place of JISX0208-1978?
3250    These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
3251    defined by macros CODING_ISO_FLAG_XXX.  See `coding.h' for more
3252    details.
3253 */
3254
3255 /* Produce codes (escape sequence) for designating CHARSET to graphic
3256    register REG at DST, and increment DST.  If <final-char> of CHARSET is
3257    '@', 'A', or 'B' and the coding system CODING allows, produce
3258    designation sequence of short-form.  */
3259
3260 #define ENCODE_DESIGNATION(charset, reg, coding)                        \
3261   do {                                                                  \
3262     unsigned char final_char = CHARSET_ISO_FINAL (charset);             \
3263     char *intermediate_char_94 = "()*+";                                \
3264     char *intermediate_char_96 = ",-./";                                \
3265     int revision = -1;                                                  \
3266     int c;                                                              \
3267                                                                         \
3268     if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION)           \
3269       revision = CHARSET_ISO_REVISION (charset);                        \
3270                                                                         \
3271     if (revision >= 0)                                                  \
3272       {                                                                 \
3273         EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&');                       \
3274         EMIT_ONE_BYTE ('@' + revision);                                 \
3275       }                                                                 \
3276     EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC);                                 \
3277     if (CHARSET_DIMENSION (charset) == 1)                               \
3278       {                                                                 \
3279         if (! CHARSET_ISO_CHARS_96 (charset))                           \
3280           c = intermediate_char_94[reg];                                \
3281         else                                                            \
3282           c = intermediate_char_96[reg];                                \
3283         EMIT_ONE_ASCII_BYTE (c);                                        \
3284       }                                                                 \
3285     else                                                                \
3286       {                                                                 \
3287         EMIT_ONE_ASCII_BYTE ('$');                                      \
3288         if (! CHARSET_ISO_CHARS_96 (charset))                           \
3289           {                                                             \
3290             if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM   \
3291                 || reg != 0                                             \
3292                 || final_char < '@' || final_char > 'B')                \
3293               EMIT_ONE_ASCII_BYTE (intermediate_char_94[reg]);          \
3294           }                                                             \
3295         else                                                            \
3296           EMIT_ONE_ASCII_BYTE (intermediate_char_96[reg]);              \
3297       }                                                                 \
3298     EMIT_ONE_ASCII_BYTE (final_char);                                   \
3299                                                                         \
3300     CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset);        \
3301   } while (0)
3302
3303
3304 /* The following two macros produce codes (control character or escape
3305    sequence) for ISO2022 single-shift functions (single-shift-2 and
3306    single-shift-3).  */
3307
3308 #define ENCODE_SINGLE_SHIFT_2                                           \
3309   do {                                                                  \
3310     if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)         \
3311       EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N');                         \
3312     else                                                                \
3313       EMIT_ONE_BYTE (ISO_CODE_SS2);                                     \
3314     CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
3315   } while (0)
3316
3317
3318 #define ENCODE_SINGLE_SHIFT_3                                           \
3319   do {                                                                  \
3320     if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)         \
3321       EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O');                         \
3322     else                                                                \
3323       EMIT_ONE_BYTE (ISO_CODE_SS3);                                     \
3324     CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
3325   } while (0)
3326
3327
3328 /* The following four macros produce codes (control character or
3329    escape sequence) for ISO2022 locking-shift functions (shift-in,
3330    shift-out, locking-shift-2, and locking-shift-3).  */
3331
3332 #define ENCODE_SHIFT_IN                                 \
3333   do {                                                  \
3334     EMIT_ONE_ASCII_BYTE (ISO_CODE_SI);                  \
3335     CODING_ISO_INVOCATION (coding, 0) = 0;              \
3336   } while (0)
3337
3338
3339 #define ENCODE_SHIFT_OUT                                \
3340   do {                                                  \
3341     EMIT_ONE_ASCII_BYTE (ISO_CODE_SO);                  \
3342     CODING_ISO_INVOCATION (coding, 0) = 1;              \
3343   } while (0)
3344
3345
3346 #define ENCODE_LOCKING_SHIFT_2                          \
3347   do {                                                  \
3348     EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');           \
3349     CODING_ISO_INVOCATION (coding, 0) = 2;              \
3350   } while (0)
3351
3352
3353 #define ENCODE_LOCKING_SHIFT_3                          \
3354   do {                                                  \
3355     EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');           \
3356     CODING_ISO_INVOCATION (coding, 0) = 3;              \
3357   } while (0)
3358
3359
3360 /* Produce codes for a DIMENSION1 character whose character set is
3361    CHARSET and whose position-code is C1.  Designation and invocation
3362    sequences are also produced in advance if necessary.  */
3363
3364 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
3365   do {                                                                  \
3366     int id = CHARSET_ID (charset);                                      \
3367                                                                         \
3368     if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)         \
3369         && id == charset_ascii)                                         \
3370       {                                                                 \
3371         id = charset_jisx0201_roman;                                    \
3372         charset = CHARSET_FROM_ID (id);                                 \
3373       }                                                                 \
3374                                                                         \
3375     if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
3376       {                                                                 \
3377         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
3378           EMIT_ONE_ASCII_BYTE (c1 & 0x7F);                              \
3379         else                                                            \
3380           EMIT_ONE_BYTE (c1 | 0x80);                                    \
3381         CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
3382         break;                                                          \
3383       }                                                                 \
3384     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
3385       {                                                                 \
3386         EMIT_ONE_ASCII_BYTE (c1 & 0x7F);                                \
3387         break;                                                          \
3388       }                                                                 \
3389     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
3390       {                                                                 \
3391         EMIT_ONE_BYTE (c1 | 0x80);                                      \
3392         break;                                                          \
3393       }                                                                 \
3394     else                                                                \
3395       /* Since CHARSET is not yet invoked to any graphic planes, we     \
3396          must invoke it, or, at first, designate it to some graphic     \
3397          register.  Then repeat the loop to actually produce the        \
3398          character.  */                                                 \
3399       dst = encode_invocation_designation (charset, coding, dst,        \
3400                                            &produced_chars);            \
3401   } while (1)
3402
3403
3404 /* Produce codes for a DIMENSION2 character whose character set is
3405    CHARSET and whose position-codes are C1 and C2.  Designation and
3406    invocation codes are also produced in advance if necessary.  */
3407
3408 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
3409   do {                                                                  \
3410     int id = CHARSET_ID (charset);                                      \
3411                                                                         \
3412     if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)        \
3413         && id == charset_jisx0208)                                      \
3414       {                                                                 \
3415         id = charset_jisx0208_1978;                                     \
3416         charset = CHARSET_FROM_ID (id);                                 \
3417       }                                                                 \
3418                                                                         \
3419     if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
3420       {                                                                 \
3421         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
3422           EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);              \
3423         else                                                            \
3424           EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                    \
3425         CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
3426         break;                                                          \
3427       }                                                                 \
3428     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
3429       {                                                                 \
3430         EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);                \
3431         break;                                                          \
3432       }                                                                 \
3433     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
3434       {                                                                 \
3435         EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                      \
3436         break;                                                          \
3437       }                                                                 \
3438     else                                                                \
3439       /* Since CHARSET is not yet invoked to any graphic planes, we     \
3440          must invoke it, or, at first, designate it to some graphic     \
3441          register.  Then repeat the loop to actually produce the        \
3442          character.  */                                                 \
3443       dst = encode_invocation_designation (charset, coding, dst,        \
3444                                            &produced_chars);            \
3445   } while (1)
3446
3447
3448 #define ENCODE_ISO_CHARACTER(charset, c)                                   \
3449   do {                                                                     \
3450     int code = ENCODE_CHAR ((charset),(c));                                \
3451                                                                            \
3452     if (CHARSET_DIMENSION (charset) == 1)                                  \
3453       ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code);                   \
3454     else                                                                   \
3455       ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
3456   } while (0)
3457
3458
3459 /* Produce designation and invocation codes at a place pointed by DST
3460    to use CHARSET.  The element `spec.iso_2022' of *CODING is updated.
3461    Return new DST.  */
3462
3463 unsigned char *
3464 encode_invocation_designation (charset, coding, dst, p_nchars)
3465      struct charset *charset;
3466      struct coding_system *coding;
3467      unsigned char *dst;
3468      int *p_nchars;
3469 {
3470   int multibytep = coding->dst_multibyte;
3471   int produced_chars = *p_nchars;
3472   int reg;                      /* graphic register number */
3473   int id = CHARSET_ID (charset);
3474
3475   /* At first, check designations.  */
3476   for (reg = 0; reg < 4; reg++)
3477     if (id == CODING_ISO_DESIGNATION (coding, reg))
3478       break;
3479
3480   if (reg >= 4)
3481     {
3482       /* CHARSET is not yet designated to any graphic registers.  */
3483       /* At first check the requested designation.  */
3484       reg = CODING_ISO_REQUEST (coding, id);
3485       if (reg < 0)
3486         /* Since CHARSET requests no special designation, designate it
3487            to graphic register 0.  */
3488         reg = 0;
3489
3490       ENCODE_DESIGNATION (charset, reg, coding);
3491     }
3492
3493   if (CODING_ISO_INVOCATION (coding, 0) != reg
3494       && CODING_ISO_INVOCATION (coding, 1) != reg)
3495     {
3496       /* Since the graphic register REG is not invoked to any graphic
3497          planes, invoke it to graphic plane 0.  */
3498       switch (reg)
3499         {
3500         case 0:                 /* graphic register 0 */
3501           ENCODE_SHIFT_IN;
3502           break;
3503
3504         case 1:                 /* graphic register 1 */
3505           ENCODE_SHIFT_OUT;
3506           break;
3507
3508         case 2:                 /* graphic register 2 */
3509           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3510             ENCODE_SINGLE_SHIFT_2;
3511           else
3512             ENCODE_LOCKING_SHIFT_2;
3513           break;
3514
3515         case 3:                 /* graphic register 3 */
3516           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3517             ENCODE_SINGLE_SHIFT_3;
3518           else
3519             ENCODE_LOCKING_SHIFT_3;
3520           break;
3521         }
3522     }
3523
3524   *p_nchars = produced_chars;
3525   return dst;
3526 }
3527
3528 /* The following three macros produce codes for indicating direction
3529    of text.  */
3530 #define ENCODE_CONTROL_SEQUENCE_INTRODUCER                              \
3531   do {                                                                  \
3532     if (CODING_ISO_FLAGS (coding) == CODING_ISO_FLAG_SEVEN_BITS)        \
3533       EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '[');                         \
3534     else                                                                \
3535       EMIT_ONE_BYTE (ISO_CODE_CSI);                                     \
3536   } while (0)
3537
3538
3539 #define ENCODE_DIRECTION_R2L()                  \
3540   do {                                          \
3541     ENCODE_CONTROL_SEQUENCE_INTRODUCER (dst);   \
3542     EMIT_TWO_ASCII_BYTES ('2', ']');            \
3543   } while (0)
3544
3545
3546 #define ENCODE_DIRECTION_L2R()                  \
3547   do {                                          \
3548     ENCODE_CONTROL_SEQUENCE_INTRODUCER (dst);   \
3549     EMIT_TWO_ASCII_BYTES ('0', ']');            \
3550   } while (0)
3551
3552
3553 /* Produce codes for designation and invocation to reset the graphic
3554    planes and registers to initial state.  */
3555 #define ENCODE_RESET_PLANE_AND_REGISTER()                               \
3556   do {                                                                  \
3557     int reg;                                                            \
3558     struct charset *charset;                                            \
3559                                                                         \
3560     if (CODING_ISO_INVOCATION (coding, 0) != 0)                         \
3561       ENCODE_SHIFT_IN;                                                  \
3562     for (reg = 0; reg < 4; reg++)                                       \
3563       if (CODING_ISO_INITIAL (coding, reg) >= 0                         \
3564           && (CODING_ISO_DESIGNATION (coding, reg)                      \
3565               != CODING_ISO_INITIAL (coding, reg)))                     \
3566         {                                                               \
3567           charset = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg)); \
3568           ENCODE_DESIGNATION (charset, reg, coding);                    \
3569         }                                                               \
3570   } while (0)
3571
3572
3573 /* Produce designation sequences of charsets in the line started from
3574    SRC to a place pointed by DST, and return updated DST.
3575
3576    If the current block ends before any end-of-line, we may fail to
3577    find all the necessary designations.  */
3578
3579 static unsigned char *
3580 encode_designation_at_bol (coding, charbuf, charbuf_end, dst)
3581      struct coding_system *coding;
3582      int *charbuf, *charbuf_end;
3583      unsigned char *dst;
3584 {
3585   struct charset *charset;
3586   /* Table of charsets to be designated to each graphic register.  */
3587   int r[4];
3588   int c, found = 0, reg;
3589   int produced_chars = 0;
3590   int multibytep = coding->dst_multibyte;
3591   Lisp_Object attrs;
3592   Lisp_Object charset_list;
3593
3594   attrs = CODING_ID_ATTRS (coding->id);
3595   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
3596   if (EQ (charset_list, Qiso_2022))
3597     charset_list = Viso_2022_charset_list;
3598
3599   for (reg = 0; reg < 4; reg++)
3600     r[reg] = -1;
3601
3602   while (found < 4)
3603     {
3604       int id;
3605
3606       c = *charbuf++;
3607       if (c == '\n')
3608         break;
3609       charset = char_charset (c, charset_list, NULL);
3610       id = CHARSET_ID (charset);
3611       reg = CODING_ISO_REQUEST (coding, id);
3612       if (reg >= 0 && r[reg] < 0)
3613         {
3614           found++;
3615           r[reg] = id;
3616         }
3617     }
3618
3619   if (found)
3620     {
3621       for (reg = 0; reg < 4; reg++)
3622         if (r[reg] >= 0
3623             && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
3624           ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
3625     }
3626
3627   return dst;
3628 }
3629
3630 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".  */
3631
3632 static int
3633 encode_coding_iso_2022 (coding)
3634      struct coding_system *coding;
3635 {
3636   int multibytep = coding->dst_multibyte;
3637   int *charbuf = coding->charbuf;
3638   int *charbuf_end = charbuf + coding->charbuf_used;
3639   unsigned char *dst = coding->destination + coding->produced;
3640   unsigned char *dst_end = coding->destination + coding->dst_bytes;
3641   int safe_room = 16;
3642   int bol_designation
3643     = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
3644        && CODING_ISO_BOL (coding));
3645   int produced_chars = 0;
3646   Lisp_Object attrs, eol_type, charset_list;
3647   int ascii_compatible;
3648   int c;
3649   int preferred_charset_id = -1;
3650
3651   CODING_GET_INFO (coding, attrs, eol_type, charset_list);
3652   setup_iso_safe_charsets (attrs);
3653   /* Charset list may have been changed.  */
3654   charset_list = CODING_ATTR_CHARSET_LIST (attrs);              \
3655   coding->safe_charsets = (char *) SDATA (CODING_ATTR_SAFE_CHARSETS(attrs));
3656
3657   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
3658
3659   while (charbuf < charbuf_end)
3660     {
3661       ASSURE_DESTINATION (safe_room);
3662
3663       if (bol_designation)
3664         {
3665           unsigned char *dst_prev = dst;
3666
3667           /* We have to produce designation sequences if any now.  */
3668           dst = encode_designation_at_bol (coding, charbuf, charbuf_end, dst);
3669           bol_designation = 0;
3670           /* We are sure that designation sequences are all ASCII bytes.  */
3671           produced_chars += dst - dst_prev;
3672         }
3673
3674       c = *charbuf++;
3675
3676       if (c < 0)
3677         {
3678           /* Handle an annotation.  */
3679           switch (*charbuf)
3680             {
3681             case CODING_ANNOTATE_COMPOSITION_MASK:
3682               /* Not yet implemented.  */
3683               break;
3684             case CODING_ANNOTATE_CHARSET_MASK:
3685               preferred_charset_id = charbuf[3];
3686               if (preferred_charset_id >= 0
3687                   && NILP (Fmemq (make_number (preferred_charset_id),
3688                                   charset_list)))
3689                 preferred_charset_id = -1;
3690               break;
3691             default:
3692               abort ();
3693             }
3694           charbuf += -c - 1;
3695           continue;
3696         }
3697
3698       /* Now encode the character C.  */
3699       if (c < 0x20 || c == 0x7F)
3700         {
3701           if (c == '\n'
3702               || (c == '\r' && EQ (eol_type, Qmac)))
3703             {
3704               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
3705                 ENCODE_RESET_PLANE_AND_REGISTER ();
3706               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
3707                 {
3708                   int i;
3709
3710                   for (i = 0; i < 4; i++)
3711                     CODING_ISO_DESIGNATION (coding, i)
3712                       = CODING_ISO_INITIAL (coding, i);
3713                 }
3714               bol_designation
3715                 = CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL;
3716             }
3717           else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
3718             ENCODE_RESET_PLANE_AND_REGISTER ();
3719           EMIT_ONE_ASCII_BYTE (c);
3720         }
3721       else if (ASCII_CHAR_P (c))
3722         {
3723           if (ascii_compatible)
3724             EMIT_ONE_ASCII_BYTE (c);
3725           else
3726             {
3727               struct charset *charset = CHARSET_FROM_ID (charset_ascii);
3728               ENCODE_ISO_CHARACTER (charset, c);
3729             }
3730         }
3731       else if (CHAR_BYTE8_P (c))
3732         {
3733           c = CHAR_TO_BYTE8 (c);
3734           EMIT_ONE_BYTE (c);
3735         }
3736       else
3737         {
3738           struct charset *charset;
3739
3740           if (preferred_charset_id >= 0)
3741             {
3742               charset = CHARSET_FROM_ID (preferred_charset_id);
3743               if (! CHAR_CHARSET_P (c, charset))
3744                 charset = char_charset (c, charset_list, NULL);
3745             }
3746           else
3747             charset = char_charset (c, charset_list, NULL);
3748           if (!charset)
3749             {
3750               if (coding->mode & CODING_MODE_SAFE_ENCODING)
3751                 {
3752                   c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
3753                   charset = CHARSET_FROM_ID (charset_ascii);
3754                 }
3755               else
3756                 {
3757                   c = coding->default_char;
3758                   charset = char_charset (c, charset_list, NULL);
3759                 }
3760             }
3761           ENCODE_ISO_CHARACTER (charset, c);
3762         }
3763     }
3764
3765   if (coding->mode & CODING_MODE_LAST_BLOCK
3766       && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
3767     {
3768       ASSURE_DESTINATION (safe_room);
3769       ENCODE_RESET_PLANE_AND_REGISTER ();
3770     }
3771   coding->result = CODING_RESULT_SUCCESS;
3772   CODING_ISO_BOL (coding) = bol_designation;
3773   coding->produced_char += produced_chars;
3774   coding->produced = dst - coding->destination;
3775   return 0;
3776 }
3777
3778 \f
3779 /*** 8,9. SJIS and BIG5 handlers ***/
3780
3781 /* Although SJIS and BIG5 are not ISO's coding system, they are used
3782    quite widely.  So, for the moment, Emacs supports them in the bare
3783    C code.  But, in the future, they may be supported only by CCL.  */
3784
3785 /* SJIS is a coding system encoding three character sets: ASCII, right
3786    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
3787    as is.  A character of charset katakana-jisx0201 is encoded by
3788    "position-code + 0x80".  A character of charset japanese-jisx0208
3789    is encoded in 2-byte but two position-codes are divided and shifted
3790    so that it fit in the range below.
3791
3792    --- CODE RANGE of SJIS ---
3793    (character set)      (range)
3794    ASCII                0x00 .. 0x7F
3795    KATAKANA-JISX0201    0xA0 .. 0xDF
3796    JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
3797             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
3798    -------------------------------
3799
3800 */
3801
3802 /* BIG5 is a coding system encoding two character sets: ASCII and
3803    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
3804    character set and is encoded in two-byte.
3805
3806    --- CODE RANGE of BIG5 ---
3807    (character set)      (range)
3808    ASCII                0x00 .. 0x7F
3809    Big5 (1st byte)      0xA1 .. 0xFE
3810         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
3811    --------------------------
3812
3813   */
3814
3815 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3816    Check if a text is encoded in SJIS.  If it is, return
3817    CATEGORY_MASK_SJIS, else return 0.  */
3818
3819 static int
3820 detect_coding_sjis (coding, detect_info)
3821      struct coding_system *coding;
3822      struct coding_detection_info *detect_info;
3823 {
3824   const unsigned char *src = coding->source, *src_base = src;
3825   const unsigned char *src_end = coding->source + coding->src_bytes;
3826   int multibytep = coding->src_multibyte;
3827   int consumed_chars = 0;
3828   int found = 0;
3829   int c;
3830   int incomplete;
3831
3832   detect_info->checked |= CATEGORY_MASK_SJIS;
3833   /* A coding system of this category is always ASCII compatible.  */
3834   src += coding->head_ascii;
3835
3836   while (1)
3837     {
3838       incomplete = 0;
3839       ONE_MORE_BYTE (c);
3840       incomplete = 1;
3841       if (c < 0x80)
3842         continue;
3843       if ((c >= 0x81 && c <= 0x9F) || (c >= 0xE0 && c <= 0xEF))
3844         {
3845           ONE_MORE_BYTE (c);
3846           if (c < 0x40 || c == 0x7F || c > 0xFC)
3847             break;
3848           found = CATEGORY_MASK_SJIS;
3849         }
3850       else if (c >= 0xA0 && c < 0xE0)
3851         found = CATEGORY_MASK_SJIS;
3852       else
3853         break;
3854     }
3855   detect_info->rejected |= CATEGORY_MASK_SJIS;
3856   return 0;
3857
3858  no_more_source:
3859   if (incomplete && coding->mode & CODING_MODE_LAST_BLOCK)
3860     {
3861       detect_info->rejected |= CATEGORY_MASK_SJIS;
3862       return 0;
3863     }
3864   detect_info->found |= found;
3865   return 1;
3866 }
3867
3868 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3869    Check if a text is encoded in BIG5.  If it is, return
3870    CATEGORY_MASK_BIG5, else return 0.  */
3871
3872 static int
3873 detect_coding_big5 (coding, detect_info)
3874      struct coding_system *coding;
3875      struct coding_detection_info *detect_info;
3876 {
3877   const unsigned char *src = coding->source, *src_base = src;
3878   const unsigned char *src_end = coding->source + coding->src_bytes;
3879   int multibytep = coding->src_multibyte;
3880   int consumed_chars = 0;
3881   int found = 0;
3882   int c;
3883   int incomplete;
3884
3885   detect_info->checked |= CATEGORY_MASK_BIG5;
3886   /* A coding system of this category is always ASCII compatible.  */
3887   src += coding->head_ascii;
3888
3889   while (1)
3890     {
3891       incomplete = 0;
3892       ONE_MORE_BYTE (c);
3893       incomplete = 1;
3894       if (c < 0x80)
3895         continue;
3896       if (c >= 0xA1)
3897         {
3898           ONE_MORE_BYTE (c);
3899           if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
3900             return 0;
3901           found = CATEGORY_MASK_BIG5;
3902         }
3903       else
3904         break;
3905     }
3906   detect_info->rejected |= CATEGORY_MASK_BIG5;
3907   return 0;
3908
3909  no_more_source:
3910   if (incomplete && coding->mode & CODING_MODE_LAST_BLOCK)
3911     {
3912       detect_info->rejected |= CATEGORY_MASK_BIG5;
3913       return 0;
3914     }
3915   detect_info->found |= found;
3916   return 1;
3917 }
3918
3919 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
3920    If SJIS_P is 1, decode SJIS text, else decode BIG5 test.  */
3921
3922 static void
3923 decode_coding_sjis (coding)
3924      struct coding_system *coding;
3925 {
3926   const unsigned char *src = coding->source + coding->consumed;
3927   const unsigned char *src_end = coding->source + coding->src_bytes;
3928   const unsigned char *src_base;
3929   int *charbuf = coding->charbuf;
3930   int *charbuf_end = charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
3931   int consumed_chars = 0, consumed_chars_base;
3932   int multibytep = coding->src_multibyte;
3933   struct charset *charset_roman, *charset_kanji, *charset_kana;
3934   Lisp_Object attrs, eol_type, charset_list, val;
3935   int char_offset = coding->produced_char;
3936   int last_offset = char_offset;
3937   int last_id = charset_ascii;
3938
3939   CODING_GET_INFO (coding, attrs, eol_type, charset_list);
3940
3941   val = charset_list;
3942   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
3943   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
3944   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
3945
3946   while (1)
3947     {
3948       int c, c1;
3949
3950       src_base = src;
3951       consumed_chars_base = consumed_chars;
3952
3953       if (charbuf >= charbuf_end)
3954         break;
3955
3956       ONE_MORE_BYTE (c);
3957
3958       if (c == '\r')
3959         {
3960           if (EQ (eol_type, Qdos))
3961             {
3962               if (src == src_end)
3963                 {
3964                   coding->result = CODING_RESULT_INSUFFICIENT_SRC;
3965                   goto no_more_source;
3966                 }
3967               if (*src == '\n')
3968                 ONE_MORE_BYTE (c);
3969             }
3970           else if (EQ (eol_type, Qmac))
3971             c = '\n';
3972         }
3973       else
3974         {
3975           struct charset *charset;
3976
3977           if (c < 0x80)
3978             charset = charset_roman;
3979           else
3980             {
3981               if (c >= 0xF0)
3982                 goto invalid_code;
3983               if (c < 0xA0 || c >= 0xE0)
3984                 {
3985                   /* SJIS -> JISX0208 */
3986                   ONE_MORE_BYTE (c1);
3987                   if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
3988                     goto invalid_code;
3989                   c = (c << 8) | c1;
3990                   SJIS_TO_JIS (c);
3991                   charset = charset_kanji;
3992                 }
3993               else if (c > 0xA0)
3994                 {
3995                   /* SJIS -> JISX0201-Kana */
3996                   c &= 0x7F;
3997                   charset = charset_kana;
3998                 }
3999               else
4000                 goto invalid_code;
4001             }
4002           if (charset->id != charset_ascii
4003               && last_id != charset->id)
4004             {
4005               if (last_id != charset_ascii)
4006                 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4007               last_id = charset->id;
4008               last_offset = char_offset;
4009             }
4010           CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4011         }
4012       *charbuf++ = c;
4013       char_offset++;
4014       continue;
4015
4016     invalid_code:
4017       src = src_base;
4018       consumed_chars = consumed_chars_base;
4019       ONE_MORE_BYTE (c);
4020       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
4021       char_offset++;
4022       coding->errors++;
4023     }
4024
4025  no_more_source:
4026   if (last_id != charset_ascii)
4027     ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4028   coding->consumed_char += consumed_chars_base;
4029   coding->consumed = src_base - coding->source;
4030   coding->charbuf_used = charbuf - coding->charbuf;
4031 }
4032
4033 static void
4034 decode_coding_big5 (coding)
4035      struct coding_system *coding;
4036 {
4037   const unsigned char *src = coding->source + coding->consumed;
4038   const unsigned char *src_end = coding->source + coding->src_bytes;
4039   const unsigned char *src_base;
4040   int *charbuf = coding->charbuf;
4041   int *charbuf_end = charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
4042   int consumed_chars = 0, consumed_chars_base;
4043   int multibytep = coding->src_multibyte;
4044   struct charset *charset_roman, *charset_big5;
4045   Lisp_Object attrs, eol_type, charset_list, val;
4046   int char_offset = coding->produced_char;
4047   int last_offset = char_offset;
4048   int last_id = charset_ascii;
4049
4050   CODING_GET_INFO (coding, attrs, eol_type, charset_list);
4051   val = charset_list;
4052   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4053   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4054
4055   while (1)
4056     {
4057       int c, c1;
4058
4059       src_base = src;
4060       consumed_chars_base = consumed_chars;
4061
4062       if (charbuf >= charbuf_end)
4063         break;
4064
4065       ONE_MORE_BYTE (c);
4066
4067       if (c == '\r')
4068         {
4069           if (EQ (eol_type, Qdos))
4070             {
4071               if (src == src_end)
4072                 {
4073                   coding->result = CODING_RESULT_INSUFFICIENT_SRC;
4074                   goto no_more_source;
4075                 }
4076               if (*src == '\n')
4077                 ONE_MORE_BYTE (c);
4078             }
4079           else if (EQ (eol_type, Qmac))
4080             c = '\n';
4081         }
4082       else
4083         {
4084           struct charset *charset;
4085           if (c < 0x80)
4086             charset = charset_roman;
4087           else
4088             {
4089               /* BIG5 -> Big5 */
4090               if (c < 0xA1 || c > 0xFE)
4091                 goto invalid_code;
4092               ONE_MORE_BYTE (c1);
4093               if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4094                 goto invalid_code;
4095               c = c << 8 | c1;
4096               charset = charset_big5;
4097             }
4098           if (charset->id != charset_ascii
4099               && last_id != charset->id)
4100             {
4101               if (last_id != charset_ascii)
4102                 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4103               last_id = charset->id;
4104               last_offset = char_offset;
4105             }
4106           CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4107         }
4108
4109       *charbuf++ = c;
4110       char_offset++;
4111       continue;
4112
4113     invalid_code:
4114       src = src_base;
4115       consumed_chars = consumed_chars_base;
4116       ONE_MORE_BYTE (c);
4117       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
4118       char_offset++;
4119       coding->errors++;
4120     }
4121
4122  no_more_source:
4123   if (last_id != charset_ascii)
4124     ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4125   coding->consumed_char += consumed_chars_base;
4126   coding->consumed = src_base - coding->source;
4127   coding->charbuf_used = charbuf - coding->charbuf;
4128 }
4129
4130 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4131    This function can encode charsets `ascii', `katakana-jisx0201',
4132    `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'.  We
4133    are sure that all these charsets are registered as official charset
4134    (i.e. do not have extended leading-codes).  Characters of other
4135    charsets are produced without any encoding.  If SJIS_P is 1, encode
4136    SJIS text, else encode BIG5 text.  */
4137
4138 static int
4139 encode_coding_sjis (coding)
4140      struct coding_system *coding;
4141 {
4142   int multibytep = coding->dst_multibyte;
4143   int *charbuf = coding->charbuf;
4144   int *charbuf_end = charbuf + coding->charbuf_used;
4145   unsigned char *dst = coding->destination + coding->produced;
4146   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4147   int safe_room = 4;
4148   int produced_chars = 0;
4149   Lisp_Object attrs, eol_type, charset_list, val;
4150   int ascii_compatible;
4151   struct charset *charset_roman, *charset_kanji, *charset_kana;
4152   int c;
4153
4154   CODING_GET_INFO (coding, attrs, eol_type, charset_list);
4155   val = charset_list;
4156   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4157   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4158   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
4159
4160   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4161
4162   while (charbuf < charbuf_end)
4163     {
4164       ASSURE_DESTINATION (safe_room);
4165       c = *charbuf++;
4166       /* Now encode the character C.  */
4167       if (ASCII_CHAR_P (c) && ascii_compatible)
4168         EMIT_ONE_ASCII_BYTE (c);
4169       else if (CHAR_BYTE8_P (c))
4170         {
4171           c = CHAR_TO_BYTE8 (c);
4172           EMIT_ONE_BYTE (c);
4173         }
4174       else
4175         {
4176           unsigned code;
4177           struct charset *charset = char_charset (c, charset_list, &code);
4178
4179           if (!charset)
4180             {
4181               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4182                 {
4183                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4184                   charset = CHARSET_FROM_ID (charset_ascii);
4185                 }
4186               else
4187                 {
4188                   c = coding->default_char;
4189                   charset = char_charset (c, charset_list, &code);
4190                 }
4191             }
4192           if (code == CHARSET_INVALID_CODE (charset))
4193             abort ();
4194           if (charset == charset_kanji)
4195             {
4196               int c1, c2;
4197               JIS_TO_SJIS (code);
4198               c1 = code >> 8, c2 = code & 0xFF;
4199               EMIT_TWO_BYTES (c1, c2);
4200             }
4201           else if (charset == charset_kana)
4202             EMIT_ONE_BYTE (code | 0x80);
4203           else
4204             EMIT_ONE_ASCII_BYTE (code & 0x7F);
4205         }
4206     }
4207   coding->result = CODING_RESULT_SUCCESS;
4208   coding->produced_char += produced_chars;
4209   coding->produced = dst - coding->destination;
4210   return 0;
4211 }
4212
4213 static int
4214 encode_coding_big5 (coding)
4215      struct coding_system *coding;
4216 {
4217   int multibytep = coding->dst_multibyte;
4218   int *charbuf = coding->charbuf;
4219   int *charbuf_end = charbuf + coding->charbuf_used;
4220   unsigned char *dst = coding->destination + coding->produced;
4221   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4222   int safe_room = 4;
4223   int produced_chars = 0;
4224   Lisp_Object attrs, eol_type, charset_list, val;
4225   int ascii_compatible;
4226   struct charset *charset_roman, *charset_big5;
4227   int c;
4228
4229   CODING_GET_INFO (coding, attrs, eol_type, charset_list);
4230   val = charset_list;
4231   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4232   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4233   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4234
4235   while (charbuf < charbuf_end)
4236     {
4237       ASSURE_DESTINATION (safe_room);
4238       c = *charbuf++;
4239       /* Now encode the character C.  */
4240       if (ASCII_CHAR_P (c) && ascii_compatible)
4241         EMIT_ONE_ASCII_BYTE (c);
4242       else if (CHAR_BYTE8_P (c))
4243         {
4244           c = CHAR_TO_BYTE8 (c);
4245           EMIT_ONE_BYTE (c);
4246         }
4247       else
4248         {
4249           unsigned code;
4250           struct charset *charset = char_charset (c, charset_list, &code);
4251
4252           if (! charset)
4253             {
4254               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4255                 {
4256                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4257                   charset = CHARSET_FROM_ID (charset_ascii);
4258                 }
4259               else
4260                 {
4261                   c = coding->default_char;
4262                   charset = char_charset (c, charset_list, &code);
4263                 }
4264             }
4265           if (code == CHARSET_INVALID_CODE (charset))
4266             abort ();
4267           if (charset == charset_big5)
4268             {
4269               int c1, c2;
4270
4271               c1 = code >> 8, c2 = code & 0xFF;
4272               EMIT_TWO_BYTES (c1, c2);
4273             }
4274           else
4275             EMIT_ONE_ASCII_BYTE (code & 0x7F);
4276         }
4277     }
4278   coding->result = CODING_RESULT_SUCCESS;
4279   coding->produced_char += produced_chars;
4280   coding->produced = dst - coding->destination;
4281   return 0;
4282 }
4283
4284 \f
4285 /*** 10. CCL handlers ***/
4286
4287 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4288    Check if a text is encoded in a coding system of which
4289    encoder/decoder are written in CCL program.  If it is, return
4290    CATEGORY_MASK_CCL, else return 0.  */
4291
4292 static int
4293 detect_coding_ccl (coding, detect_info)
4294      struct coding_system *coding;
4295      struct coding_detection_info *detect_info;
4296 {
4297   const unsigned char *src = coding->source, *src_base = src;
4298   const unsigned char *src_end = coding->source + coding->src_bytes;
4299   int multibytep = coding->src_multibyte;
4300   int consumed_chars = 0;
4301   int found = 0;
4302   unsigned char *valids = CODING_CCL_VALIDS (coding);
4303   int head_ascii = coding->head_ascii;
4304   Lisp_Object attrs;
4305
4306   detect_info->checked |= CATEGORY_MASK_CCL;
4307
4308   coding = &coding_categories[coding_category_ccl];
4309   attrs = CODING_ID_ATTRS (coding->id);
4310   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
4311     src += head_ascii;
4312
4313   while (1)
4314     {
4315       int c;
4316       ONE_MORE_BYTE (c);
4317       if (! valids[c])
4318         break;
4319       if ((valids[c] > 1))
4320         found = CATEGORY_MASK_CCL;
4321     }
4322   detect_info->rejected |= CATEGORY_MASK_CCL;
4323   return 0;
4324
4325  no_more_source:
4326   detect_info->found |= found;
4327   return 1;
4328 }
4329
4330 static void
4331 decode_coding_ccl (coding)
4332      struct coding_system *coding;
4333 {
4334   const unsigned char *src = coding->source + coding->consumed;
4335   const unsigned char *src_end = coding->source + coding->src_bytes;
4336   int *charbuf = coding->charbuf;
4337   int *charbuf_end = charbuf + coding->charbuf_size;
4338   int consumed_chars = 0;
4339   int multibytep = coding->src_multibyte;
4340   struct ccl_program ccl;
4341   int source_charbuf[1024];
4342   int source_byteidx[1024];
4343   Lisp_Object attrs, eol_type, charset_list;
4344
4345   CODING_GET_INFO (coding, attrs, eol_type, charset_list);
4346   setup_ccl_program (&ccl, CODING_CCL_DECODER (coding));
4347
4348   while (src < src_end)
4349     {
4350       const unsigned char *p = src;
4351       int *source, *source_end;
4352       int i = 0;
4353
4354       if (multibytep)
4355         while (i < 1024 && p < src_end)
4356           {
4357             source_byteidx[i] = p - src;
4358             source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
4359           }
4360       else
4361         while (i < 1024 && p < src_end)
4362           source_charbuf[i++] = *p++;
4363
4364       if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
4365         ccl.last_block = 1;
4366
4367       source = source_charbuf;
4368       source_end = source + i;
4369       while (source < source_end)
4370         {
4371           ccl_driver (&ccl, source, charbuf,
4372                       source_end - source, charbuf_end - charbuf,
4373                       charset_list);
4374           source += ccl.consumed;
4375           charbuf += ccl.produced;
4376           if (ccl.status != CCL_STAT_SUSPEND_BY_DST)
4377             break;
4378         }
4379       if (source < source_end)
4380         src += source_byteidx[source - source_charbuf];
4381       else
4382         src = p;
4383       consumed_chars += source - source_charbuf;
4384
4385       if (ccl.status != CCL_STAT_SUSPEND_BY_SRC
4386           && ccl.status != CODING_RESULT_INSUFFICIENT_SRC)
4387         break;
4388     }
4389
4390   switch (ccl.status)
4391     {
4392     case CCL_STAT_SUSPEND_BY_SRC:
4393       coding->result = CODING_RESULT_INSUFFICIENT_SRC;
4394       break;
4395     case CCL_STAT_SUSPEND_BY_DST:
4396       break;
4397     case CCL_STAT_QUIT:
4398     case CCL_STAT_INVALID_CMD:
4399       coding->result = CODING_RESULT_INTERRUPT;
4400       break;
4401     default:
4402       coding->result = CODING_RESULT_SUCCESS;
4403       break;
4404     }
4405   coding->consumed_char += consumed_chars;
4406   coding->consumed = src - coding->source;
4407   coding->charbuf_used = charbuf - coding->charbuf;
4408 }
4409
4410 static int
4411 encode_coding_ccl (coding)
4412      struct coding_system *coding;
4413 {
4414   struct ccl_program ccl;
4415   int multibytep = coding->dst_multibyte;
4416   int *charbuf = coding->charbuf;
4417   int *charbuf_end = charbuf + coding->charbuf_used;
4418   unsigned char *dst = coding->destination + coding->produced;
4419   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4420   unsigned char *adjusted_dst_end = dst_end - 1;
4421   int destination_charbuf[1024];
4422   int i, produced_chars = 0;
4423   Lisp_Object attrs, eol_type, charset_list;
4424
4425   CODING_GET_INFO (coding, attrs, eol_type, charset_list);
4426   setup_ccl_program (&ccl, CODING_CCL_ENCODER (coding));
4427
4428   ccl.last_block = coding->mode & CODING_MODE_LAST_BLOCK;
4429   ccl.dst_multibyte = coding->dst_multibyte;
4430
4431   while (charbuf < charbuf_end && dst < adjusted_dst_end)
4432     {
4433       int dst_bytes = dst_end - dst;
4434       if (dst_bytes > 1024)
4435         dst_bytes = 1024;
4436
4437       ccl_driver (&ccl, charbuf, destination_charbuf,
4438                   charbuf_end - charbuf, dst_bytes, charset_list);
4439       charbuf += ccl.consumed;
4440       if (multibytep)
4441         for (i = 0; i < ccl.produced; i++)
4442           EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
4443       else
4444         {
4445           for (i = 0; i < ccl.produced; i++)
4446             *dst++ = destination_charbuf[i] & 0xFF;
4447           produced_chars += ccl.produced;
4448         }
4449     }
4450
4451   switch (ccl.status)
4452     {
4453     case CCL_STAT_SUSPEND_BY_SRC:
4454       coding->result = CODING_RESULT_INSUFFICIENT_SRC;
4455       break;
4456     case CCL_STAT_SUSPEND_BY_DST:
4457       coding->result = CODING_RESULT_INSUFFICIENT_DST;
4458       break;
4459     case CCL_STAT_QUIT:
4460     case CCL_STAT_INVALID_CMD:
4461       coding->result = CODING_RESULT_INTERRUPT;
4462       break;
4463     default:
4464       coding->result = CODING_RESULT_SUCCESS;
4465       break;
4466     }
4467
4468   coding->produced_char += produced_chars;
4469   coding->produced = dst - coding->destination;
4470   return 0;
4471 }
4472
4473
4474 \f
4475 /*** 10, 11. no-conversion handlers ***/
4476
4477 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
4478
4479 static void
4480 decode_coding_raw_text (coding)
4481      struct coding_system *coding;
4482 {
4483   coding->chars_at_source = 1;
4484   coding->consumed_char = 0;
4485   coding->consumed = 0;
4486   coding->result = CODING_RESULT_SUCCESS;
4487 }
4488
4489 static int
4490 encode_coding_raw_text (coding)
4491      struct coding_system *coding;
4492 {
4493   int multibytep = coding->dst_multibyte;
4494   int *charbuf = coding->charbuf;
4495   int *charbuf_end = coding->charbuf + coding->charbuf_used;
4496   unsigned char *dst = coding->destination + coding->produced;
4497   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4498   int produced_chars = 0;
4499   int c;
4500
4501   if (multibytep)
4502     {
4503       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
4504
4505       if (coding->src_multibyte)
4506         while (charbuf < charbuf_end)
4507           {
4508             ASSURE_DESTINATION (safe_room);
4509             c = *charbuf++;
4510             if (ASCII_CHAR_P (c))
4511               EMIT_ONE_ASCII_BYTE (c);
4512             else if (CHAR_BYTE8_P (c))
4513               {
4514                 c = CHAR_TO_BYTE8 (c);
4515                 EMIT_ONE_BYTE (c);
4516               }
4517             else
4518               {
4519                 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
4520
4521                 CHAR_STRING_ADVANCE (c, p1);
4522                 while (p0 < p1)
4523                   {
4524                     EMIT_ONE_BYTE (*p0);
4525                     p0++;
4526                   }
4527               }
4528           }
4529       else
4530         while (charbuf < charbuf_end)
4531           {
4532             ASSURE_DESTINATION (safe_room);
4533             c = *charbuf++;
4534             EMIT_ONE_BYTE (c);
4535           }
4536     }
4537   else
4538     {
4539       if (coding->src_multibyte)
4540         {
4541           int safe_room = MAX_MULTIBYTE_LENGTH;
4542
4543           while (charbuf < charbuf_end)
4544             {
4545               ASSURE_DESTINATION (safe_room);
4546               c = *charbuf++;
4547               if (ASCII_CHAR_P (c))
4548                 *dst++ = c;
4549               else if (CHAR_BYTE8_P (c))
4550                 *dst++ = CHAR_TO_BYTE8 (c);
4551               else
4552                 CHAR_STRING_ADVANCE (c, dst);
4553               produced_chars++;
4554             }
4555         }
4556       else
4557         {
4558           ASSURE_DESTINATION (charbuf_end - charbuf);
4559           while (charbuf < charbuf_end && dst < dst_end)
4560             *dst++ = *charbuf++;
4561           produced_chars = dst - (coding->destination + coding->dst_bytes);
4562         }
4563     }
4564   coding->result = CODING_RESULT_SUCCESS;
4565   coding->produced_char += produced_chars;
4566   coding->produced = dst - coding->destination;
4567   return 0;
4568 }
4569
4570 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4571    Check if a text is encoded in a charset-based coding system.  If it
4572    is, return 1, else return 0.  */
4573
4574 static int
4575 detect_coding_charset (coding, detect_info)
4576      struct coding_system *coding;
4577      struct coding_detection_info *detect_info;
4578 {
4579   const unsigned char *src = coding->source, *src_base = src;
4580   const unsigned char *src_end = coding->source + coding->src_bytes;
4581   int multibytep = coding->src_multibyte;
4582   int consumed_chars = 0;
4583   Lisp_Object attrs, valids;
4584   int found = 0;
4585
4586   detect_info->checked |= CATEGORY_MASK_CHARSET;
4587
4588   coding = &coding_categories[coding_category_charset];
4589   attrs = CODING_ID_ATTRS (coding->id);
4590   valids = AREF (attrs, coding_attr_charset_valids);
4591
4592   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
4593     src += coding->head_ascii;
4594
4595   while (1)
4596     {
4597       int c;
4598
4599       ONE_MORE_BYTE (c);
4600       if (NILP (AREF (valids, c)))
4601         break;
4602       if (c >= 0x80)
4603         found = CATEGORY_MASK_CHARSET;
4604     }
4605   detect_info->rejected |= CATEGORY_MASK_CHARSET;
4606   return 0;
4607
4608  no_more_source:
4609   detect_info->found |= found;
4610   return 1;
4611 }
4612
4613 static void
4614 decode_coding_charset (coding)
4615      struct coding_system *coding;
4616 {
4617   const unsigned char *src = coding->source + coding->consumed;
4618   const unsigned char *src_end = coding->source + coding->src_bytes;
4619   const unsigned char *src_base;
4620   int *charbuf = coding->charbuf;
4621   int *charbuf_end = charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
4622   int consumed_chars = 0, consumed_chars_base;
4623   int multibytep = coding->src_multibyte;
4624   Lisp_Object attrs, eol_type, charset_list, valids;
4625   int char_offset = coding->produced_char;
4626   int last_offset = char_offset;
4627   int last_id = charset_ascii;
4628
4629   CODING_GET_INFO (coding, attrs, eol_type, charset_list);
4630   valids = AREF (attrs, coding_attr_charset_valids);
4631
4632   while (1)
4633     {
4634       int c;
4635
4636       src_base = src;
4637       consumed_chars_base = consumed_chars;
4638
4639       if (charbuf >= charbuf_end)
4640         break;
4641
4642       ONE_MORE_BYTE (c);
4643       if (c == '\r')
4644         {
4645           /* Here we assume that no charset maps '\r' to something
4646              else.  */
4647           if (EQ (eol_type, Qdos))
4648             {
4649               if (src == src_end)
4650                 {
4651                   coding->result = CODING_RESULT_INSUFFICIENT_SRC;
4652                   goto no_more_source;
4653                 }
4654               if (*src == '\n')
4655                 ONE_MORE_BYTE (c);
4656             }
4657           else if (EQ (eol_type, Qmac))
4658             c = '\n';
4659         }
4660       else
4661         {
4662           Lisp_Object val;
4663           struct charset *charset;
4664           int dim;
4665           int len = 1;
4666           unsigned code = c;
4667
4668           val = AREF (valids, c);
4669           if (NILP (val))
4670             goto invalid_code;
4671           if (INTEGERP (val))
4672             {
4673               charset = CHARSET_FROM_ID (XFASTINT (val));
4674               dim = CHARSET_DIMENSION (charset);
4675               while (len < dim)
4676                 {
4677                   ONE_MORE_BYTE (c);
4678                   code = (code << 8) | c;
4679                   len++;
4680                 }
4681               CODING_DECODE_CHAR (coding, src, src_base, src_end,
4682                                   charset, code, c);
4683             }
4684           else
4685             {
4686               /* VAL is a list of charset IDs.  It is assured that the
4687                  list is sorted by charset dimensions (smaller one
4688                  comes first).  */
4689               while (CONSP (val))
4690                 {
4691                   charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
4692                   dim = CHARSET_DIMENSION (charset);
4693                   while (len < dim)
4694                     {
4695                       ONE_MORE_BYTE (c);
4696                       code = (code << 8) | c;
4697                       len++;
4698                     }
4699                   CODING_DECODE_CHAR (coding, src, src_base,
4700                                       src_end, charset, code, c);
4701                   if (c >= 0)
4702                     break;
4703                   val = XCDR (val);
4704                 }
4705             }
4706           if (c < 0)
4707             goto invalid_code;
4708           if (charset->id != charset_ascii
4709               && last_id != charset->id)
4710             {
4711               if (last_id != charset_ascii)
4712                 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4713               last_id = charset->id;
4714               last_offset = char_offset;
4715             }
4716         }
4717       *charbuf++ = c;
4718       char_offset++;
4719       continue;
4720
4721     invalid_code:
4722       src = src_base;
4723       consumed_chars = consumed_chars_base;
4724       ONE_MORE_BYTE (c);
4725       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
4726       char_offset++;
4727       coding->errors++;
4728     }
4729
4730  no_more_source:
4731   if (last_id != charset_ascii)
4732     ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4733   coding->consumed_char += consumed_chars_base;
4734   coding->consumed = src_base - coding->source;
4735   coding->charbuf_used = charbuf - coding->charbuf;
4736 }
4737
4738 static int
4739 encode_coding_charset (coding)
4740      struct coding_system *coding;
4741 {
4742   int multibytep = coding->dst_multibyte;
4743   int *charbuf = coding->charbuf;
4744   int *charbuf_end = charbuf + coding->charbuf_used;
4745   unsigned char *dst = coding->destination + coding->produced;
4746   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4747   int safe_room = MAX_MULTIBYTE_LENGTH;
4748   int produced_chars = 0;
4749   Lisp_Object attrs, eol_type, charset_list;
4750   int ascii_compatible;
4751   int c;
4752
4753   CODING_GET_INFO (coding, attrs, eol_type, charset_list);
4754   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4755
4756   while (charbuf < charbuf_end)
4757     {
4758       struct charset *charset;
4759       unsigned code;
4760
4761       ASSURE_DESTINATION (safe_room);
4762       c = *charbuf++;
4763       if (ascii_compatible && ASCII_CHAR_P (c))
4764         EMIT_ONE_ASCII_BYTE (c);
4765       else if (CHAR_BYTE8_P (c))
4766         {
4767           c = CHAR_TO_BYTE8 (c);
4768           EMIT_ONE_BYTE (c);
4769         }
4770       else
4771         {
4772           charset = char_charset (c, charset_list, &code);
4773           if (charset)
4774             {
4775               if (CHARSET_DIMENSION (charset) == 1)
4776                 EMIT_ONE_BYTE (code);
4777               else if (CHARSET_DIMENSION (charset) == 2)
4778                 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
4779               else if (CHARSET_DIMENSION (charset) == 3)
4780                 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
4781               else
4782                 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
4783                                  (code >> 8) & 0xFF, code & 0xFF);
4784             }
4785           else
4786             {
4787               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4788                 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4789               else
4790                 c = coding->default_char;
4791               EMIT_ONE_BYTE (c);
4792             }
4793         }
4794     }
4795
4796   coding->result = CODING_RESULT_SUCCESS;
4797   coding->produced_char += produced_chars;
4798   coding->produced = dst - coding->destination;
4799   return 0;
4800 }
4801
4802 \f
4803 /*** 7. C library functions ***/
4804
4805 /* Setup coding context CODING from information about CODING_SYSTEM.
4806    If CODING_SYSTEM is nil, `no-conversion' is assumed.  If
4807    CODING_SYSTEM is invalid, signal an error.  */
4808
4809 void
4810 setup_coding_system (coding_system, coding)
4811      Lisp_Object coding_system;
4812      struct coding_system *coding;
4813 {
4814   Lisp_Object attrs;
4815   Lisp_Object eol_type;
4816   Lisp_Object coding_type;
4817   Lisp_Object val;
4818
4819   if (NILP (coding_system))
4820     coding_system = Qno_conversion;
4821
4822   CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
4823
4824   attrs = CODING_ID_ATTRS (coding->id);
4825   eol_type = CODING_ID_EOL_TYPE (coding->id);
4826
4827   coding->mode = 0;
4828   coding->head_ascii = -1;
4829   coding->common_flags
4830     = (VECTORP (eol_type) ? CODING_REQUIRE_DETECTION_MASK : 0);
4831   if (! NILP (CODING_ATTR_POST_READ (attrs)))
4832     coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
4833   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
4834     coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
4835   if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
4836     coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
4837
4838   val = CODING_ATTR_SAFE_CHARSETS (attrs);
4839   coding->max_charset_id = SCHARS (val) - 1;
4840   coding->safe_charsets = (char *) SDATA (val);
4841   coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
4842
4843   coding_type = CODING_ATTR_TYPE (attrs);
4844   if (EQ (coding_type, Qundecided))
4845     {
4846       coding->detector = NULL;
4847       coding->decoder = decode_coding_raw_text;
4848       coding->encoder = encode_coding_raw_text;
4849       coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
4850     }
4851   else if (EQ (coding_type, Qiso_2022))
4852     {
4853       int i;
4854       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
4855
4856       /* Invoke graphic register 0 to plane 0.  */
4857       CODING_ISO_INVOCATION (coding, 0) = 0;
4858       /* Invoke graphic register 1 to plane 1 if we can use 8-bit.  */
4859       CODING_ISO_INVOCATION (coding, 1)
4860         = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
4861       /* Setup the initial status of designation.  */
4862       for (i = 0; i < 4; i++)
4863         CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
4864       /* Not single shifting initially.  */
4865       CODING_ISO_SINGLE_SHIFTING (coding) = 0;
4866       /* Beginning of buffer should also be regarded as bol. */
4867       CODING_ISO_BOL (coding) = 1;
4868       coding->detector = detect_coding_iso_2022;
4869       coding->decoder = decode_coding_iso_2022;
4870       coding->encoder = encode_coding_iso_2022;
4871       if (flags & CODING_ISO_FLAG_SAFE)
4872         coding->mode |= CODING_MODE_SAFE_ENCODING;
4873       coding->common_flags
4874         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
4875             | CODING_REQUIRE_FLUSHING_MASK);
4876       if (flags & CODING_ISO_FLAG_COMPOSITION)
4877         coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
4878       if (flags & CODING_ISO_FLAG_DESIGNATION)
4879         coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
4880       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
4881         {
4882           setup_iso_safe_charsets (attrs);
4883           val = CODING_ATTR_SAFE_CHARSETS (attrs);
4884           coding->max_charset_id = SCHARS (val) - 1;
4885           coding->safe_charsets = (char *) SDATA (val);
4886         }
4887       CODING_ISO_FLAGS (coding) = flags;
4888     }
4889   else if (EQ (coding_type, Qcharset))
4890     {
4891       coding->detector = detect_coding_charset;
4892       coding->decoder = decode_coding_charset;
4893       coding->encoder = encode_coding_charset;
4894       coding->common_flags
4895         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4896     }
4897   else if (EQ (coding_type, Qutf_8))
4898     {
4899       coding->detector = detect_coding_utf_8;
4900       coding->decoder = decode_coding_utf_8;
4901       coding->encoder = encode_coding_utf_8;
4902       coding->common_flags
4903         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4904     }
4905   else if (EQ (coding_type, Qutf_16))
4906     {
4907       val = AREF (attrs, coding_attr_utf_16_bom);
4908       CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_16_detect_bom
4909                                     : EQ (val, Qt) ? utf_16_with_bom
4910                                     : utf_16_without_bom);
4911       val = AREF (attrs, coding_attr_utf_16_endian);
4912       CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
4913                                        : utf_16_little_endian);
4914       CODING_UTF_16_SURROGATE (coding) = 0;
4915       coding->detector = detect_coding_utf_16;
4916       coding->decoder = decode_coding_utf_16;
4917       coding->encoder = encode_coding_utf_16;
4918       coding->common_flags
4919         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4920       if (CODING_UTF_16_BOM (coding) == utf_16_detect_bom)
4921         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
4922     }
4923   else if (EQ (coding_type, Qccl))
4924     {
4925       coding->detector = detect_coding_ccl;
4926       coding->decoder = decode_coding_ccl;
4927       coding->encoder = encode_coding_ccl;
4928       coding->common_flags
4929         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
4930             | CODING_REQUIRE_FLUSHING_MASK);
4931     }
4932   else if (EQ (coding_type, Qemacs_mule))
4933     {
4934       coding->detector = detect_coding_emacs_mule;
4935       coding->decoder = decode_coding_emacs_mule;
4936       coding->encoder = encode_coding_emacs_mule;
4937       coding->common_flags
4938         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4939       if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
4940           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
4941         {
4942           Lisp_Object tail, safe_charsets;
4943           int max_charset_id = 0;
4944
4945           for (tail = Vemacs_mule_charset_list; CONSP (tail);
4946                tail = XCDR (tail))
4947             if (max_charset_id < XFASTINT (XCAR (tail)))
4948               max_charset_id = XFASTINT (XCAR (tail));
4949           safe_charsets = Fmake_string (make_number (max_charset_id + 1),
4950                                         make_number (255));
4951           for (tail = Vemacs_mule_charset_list; CONSP (tail);
4952                tail = XCDR (tail))
4953             SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
4954           coding->max_charset_id = max_charset_id;
4955           coding->safe_charsets = (char *) SDATA (safe_charsets);
4956         }
4957     }
4958   else if (EQ (coding_type, Qshift_jis))
4959     {
4960       coding->detector = detect_coding_sjis;
4961       coding->decoder = decode_coding_sjis;
4962       coding->encoder = encode_coding_sjis;
4963       coding->common_flags
4964         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4965     }
4966   else if (EQ (coding_type, Qbig5))
4967     {
4968       coding->detector = detect_coding_big5;
4969       coding->decoder = decode_coding_big5;
4970       coding->encoder = encode_coding_big5;
4971       coding->common_flags
4972         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4973     }
4974   else                          /* EQ (coding_type, Qraw_text) */
4975     {
4976       coding->detector = NULL;
4977       coding->decoder = decode_coding_raw_text;
4978       coding->encoder = encode_coding_raw_text;
4979     }
4980
4981   return;
4982 }
4983
4984 /* Return raw-text or one of its subsidiaries that has the same
4985    eol_type as CODING-SYSTEM.  */
4986
4987 Lisp_Object
4988 raw_text_coding_system (coding_system)
4989      Lisp_Object coding_system;
4990 {
4991   Lisp_Object spec, attrs;
4992   Lisp_Object eol_type, raw_text_eol_type;
4993
4994   if (NILP (coding_system))
4995     return Qraw_text;
4996   spec = CODING_SYSTEM_SPEC (coding_system);
4997   attrs = AREF (spec, 0);
4998
4999   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
5000     return coding_system;
5001
5002   eol_type = AREF (spec, 2);
5003   if (VECTORP (eol_type))
5004     return Qraw_text;
5005   spec = CODING_SYSTEM_SPEC (Qraw_text);
5006   raw_text_eol_type = AREF (spec, 2);
5007   return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
5008           : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
5009           : AREF (raw_text_eol_type, 2));
5010 }
5011
5012
5013 /* If CODING_SYSTEM doesn't specify end-of-line format but PARENT
5014    does, return one of the subsidiary that has the same eol-spec as
5015    PARENT.  Otherwise, return CODING_SYSTEM.  */
5016
5017 Lisp_Object
5018 coding_inherit_eol_type (coding_system, parent)
5019      Lisp_Object coding_system, parent;
5020 {
5021   Lisp_Object spec, eol_type;
5022
5023   if (NILP (coding_system))
5024     coding_system = Qraw_text;
5025   spec = CODING_SYSTEM_SPEC (coding_system);
5026   eol_type = AREF (spec, 2);
5027   if (VECTORP (eol_type)
5028       && ! NILP (parent))
5029     {
5030       Lisp_Object parent_spec;
5031       Lisp_Object parent_eol_type;
5032
5033       parent_spec
5034         = CODING_SYSTEM_SPEC (buffer_defaults.buffer_file_coding_system);
5035       parent_eol_type = AREF (parent_spec, 2);
5036       if (EQ (parent_eol_type, Qunix))
5037         coding_system = AREF (eol_type, 0);
5038       else if (EQ (parent_eol_type, Qdos))
5039         coding_system = AREF (eol_type, 1);
5040       else if (EQ (parent_eol_type, Qmac))
5041         coding_system = AREF (eol_type, 2);
5042     }
5043   return coding_system;
5044 }
5045
5046 /* Emacs has a mechanism to automatically detect a coding system if it
5047    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
5048    it's impossible to distinguish some coding systems accurately
5049    because they use the same range of codes.  So, at first, coding
5050    systems are categorized into 7, those are:
5051
5052    o coding-category-emacs-mule
5053
5054         The category for a coding system which has the same code range
5055         as Emacs' internal format.  Assigned the coding-system (Lisp
5056         symbol) `emacs-mule' by default.
5057
5058    o coding-category-sjis
5059
5060         The category for a coding system which has the same code range
5061         as SJIS.  Assigned the coding-system (Lisp
5062         symbol) `japanese-shift-jis' by default.
5063
5064    o coding-category-iso-7
5065
5066         The category for a coding system which has the same code range
5067         as ISO2022 of 7-bit environment.  This doesn't use any locking
5068         shift and single shift functions.  This can encode/decode all
5069         charsets.  Assigned the coding-system (Lisp symbol)
5070         `iso-2022-7bit' by default.
5071
5072    o coding-category-iso-7-tight
5073
5074         Same as coding-category-iso-7 except that this can
5075         encode/decode only the specified charsets.
5076
5077    o coding-category-iso-8-1
5078
5079         The category for a coding system which has the same code range
5080         as ISO2022 of 8-bit environment and graphic plane 1 used only
5081         for DIMENSION1 charset.  This doesn't use any locking shift
5082         and single shift functions.  Assigned the coding-system (Lisp
5083         symbol) `iso-latin-1' by default.
5084
5085    o coding-category-iso-8-2
5086
5087         The category for a coding system which has the same code range
5088         as ISO2022 of 8-bit environment and graphic plane 1 used only
5089         for DIMENSION2 charset.  This doesn't use any locking shift
5090         and single shift functions.  Assigned the coding-system (Lisp
5091         symbol) `japanese-iso-8bit' by default.
5092
5093    o coding-category-iso-7-else
5094
5095         The category for a coding system which has the same code range
5096         as ISO2022 of 7-bit environemnt but uses locking shift or
5097         single shift functions.  Assigned the coding-system (Lisp
5098         symbol) `iso-2022-7bit-lock' by default.
5099
5100    o coding-category-iso-8-else
5101
5102         The category for a coding system which has the same code range
5103         as ISO2022 of 8-bit environemnt but uses locking shift or
5104         single shift functions.  Assigned the coding-system (Lisp
5105         symbol) `iso-2022-8bit-ss2' by default.
5106
5107    o coding-category-big5
5108
5109         The category for a coding system which has the same code range
5110         as BIG5.  Assigned the coding-system (Lisp symbol)
5111         `cn-big5' by default.
5112
5113    o coding-category-utf-8
5114
5115         The category for a coding system which has the same code range
5116         as UTF-8 (cf. RFC2279).  Assigned the coding-system (Lisp
5117         symbol) `utf-8' by default.
5118
5119    o coding-category-utf-16-be
5120
5121         The category for a coding system in which a text has an
5122         Unicode signature (cf. Unicode Standard) in the order of BIG
5123         endian at the head.  Assigned the coding-system (Lisp symbol)
5124         `utf-16-be' by default.
5125
5126    o coding-category-utf-16-le
5127
5128         The category for a coding system in which a text has an
5129         Unicode signature (cf. Unicode Standard) in the order of
5130         LITTLE endian at the head.  Assigned the coding-system (Lisp
5131         symbol) `utf-16-le' by default.
5132
5133    o coding-category-ccl
5134
5135         The category for a coding system of which encoder/decoder is
5136         written in CCL programs.  The default value is nil, i.e., no
5137         coding system is assigned.
5138
5139    o coding-category-binary
5140
5141         The category for a coding system not categorized in any of the
5142         above.  Assigned the coding-system (Lisp symbol)
5143         `no-conversion' by default.
5144
5145    Each of them is a Lisp symbol and the value is an actual
5146    `coding-system's (this is also a Lisp symbol) assigned by a user.
5147    What Emacs does actually is to detect a category of coding system.
5148    Then, it uses a `coding-system' assigned to it.  If Emacs can't
5149    decide only one possible category, it selects a category of the
5150    highest priority.  Priorities of categories are also specified by a
5151    user in a Lisp variable `coding-category-list'.
5152
5153 */
5154
5155 #define EOL_SEEN_NONE   0
5156 #define EOL_SEEN_LF     1
5157 #define EOL_SEEN_CR     2
5158 #define EOL_SEEN_CRLF   4
5159
5160 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
5161    SOURCE is encoded.  If CATEGORY is one of
5162    coding_category_utf_16_XXXX, assume that CR and LF are encoded by
5163    two-byte, else they are encoded by one-byte.
5164
5165    Return one of EOL_SEEN_XXX.  */
5166
5167 #define MAX_EOL_CHECK_COUNT 3
5168
5169 static int
5170 detect_eol (source, src_bytes, category)
5171      unsigned char *source;
5172      EMACS_INT src_bytes;
5173      enum coding_category category;
5174 {
5175   unsigned char *src = source, *src_end = src + src_bytes;
5176   unsigned char c;
5177   int total  = 0;
5178   int eol_seen = EOL_SEEN_NONE;
5179
5180   if ((1 << category) & CATEGORY_MASK_UTF_16)
5181     {
5182       int msb, lsb;
5183
5184       msb = category == (coding_category_utf_16_le
5185                          | coding_category_utf_16_le_nosig);
5186       lsb = 1 - msb;
5187
5188       while (src + 1 < src_end)
5189         {
5190           c = src[lsb];
5191           if (src[msb] == 0 && (c == '\n' || c == '\r'))
5192             {
5193               int this_eol;
5194
5195               if (c == '\n')
5196                 this_eol = EOL_SEEN_LF;
5197               else if (src + 3 >= src_end
5198                        || src[msb + 2] != 0
5199                        || src[lsb + 2] != '\n')
5200                 this_eol = EOL_SEEN_CR;
5201               else
5202                 this_eol = EOL_SEEN_CRLF;
5203
5204               if (eol_seen == EOL_SEEN_NONE)
5205                 /* This is the first end-of-line.  */
5206                 eol_seen = this_eol;
5207               else if (eol_seen != this_eol)
5208                 {
5209                   /* The found type is different from what found before.  */
5210                   eol_seen = EOL_SEEN_LF;
5211                   break;
5212                 }
5213               if (++total == MAX_EOL_CHECK_COUNT)
5214                 break;
5215             }
5216           src += 2;
5217         }
5218     }
5219   else
5220     {
5221       while (src < src_end)
5222         {
5223           c = *src++;
5224           if (c == '\n' || c == '\r')
5225             {
5226               int this_eol;
5227
5228               if (c == '\n')
5229                 this_eol = EOL_SEEN_LF;
5230               else if (src >= src_end || *src != '\n')
5231                 this_eol = EOL_SEEN_CR;
5232               else
5233                 this_eol = EOL_SEEN_CRLF, src++;
5234
5235               if (eol_seen == EOL_SEEN_NONE)
5236                 /* This is the first end-of-line.  */
5237                 eol_seen = this_eol;
5238               else if (eol_seen != this_eol)
5239                 {
5240                   /* The found type is different from what found before.  */
5241                   eol_seen = EOL_SEEN_LF;
5242                   break;
5243                 }
5244               if (++total == MAX_EOL_CHECK_COUNT)
5245                 break;
5246             }
5247         }
5248     }
5249   return eol_seen;
5250 }
5251
5252
5253 static void
5254 adjust_coding_eol_type (coding, eol_seen)
5255      struct coding_system *coding;
5256      int eol_seen;
5257 {
5258   Lisp_Object eol_type;
5259
5260   eol_type = CODING_ID_EOL_TYPE (coding->id);
5261   if (eol_seen & EOL_SEEN_LF)
5262     coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
5263   else if (eol_seen & EOL_SEEN_CRLF)
5264     coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
5265   else if (eol_seen & EOL_SEEN_CR)
5266     coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
5267 }
5268
5269 /* Detect how a text specified in CODING is encoded.  If a coding
5270    system is detected, update fields of CODING by the detected coding
5271    system.  */
5272
5273 void
5274 detect_coding (coding)
5275      struct coding_system *coding;
5276 {
5277   const unsigned char *src, *src_end;
5278   Lisp_Object attrs, coding_type;
5279
5280   coding->consumed = coding->consumed_char = 0;
5281   coding->produced = coding->produced_char = 0;
5282   coding_set_source (coding);
5283
5284   src_end = coding->source + coding->src_bytes;
5285
5286   /* If we have not yet decided the text encoding type, detect it
5287      now.  */
5288   if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
5289     {
5290       int c, i;
5291
5292       for (src = coding->source; src < src_end; src++)
5293         {
5294           c = *src;
5295           if (c & 0x80 || (c < 0x20 && (c == ISO_CODE_ESC
5296                                         || c == ISO_CODE_SI
5297                                         || c == ISO_CODE_SO)))
5298             break;
5299         }
5300       coding->head_ascii = src - (coding->source + coding->consumed);
5301
5302       if (coding->head_ascii < coding->src_bytes)
5303         {
5304           struct coding_detection_info detect_info;
5305           enum coding_category category;
5306           struct coding_system *this;
5307
5308           detect_info.checked = detect_info.found = detect_info.rejected = 0;
5309           for (i = 0; i < coding_category_raw_text; i++)
5310             {
5311               category = coding_priorities[i];
5312               this = coding_categories + category;
5313               if (this->id < 0)
5314                 {
5315                   /* No coding system of this category is defined.  */
5316                   detect_info.rejected |= (1 << category);
5317                 }
5318               else if (category >= coding_category_raw_text)
5319                 continue;
5320               else if (detect_info.checked & (1 << category))
5321                 {
5322                   if (detect_info.found & (1 << category))
5323                     break;
5324                 }
5325               else if ((*(this->detector)) (coding, &detect_info)
5326                        && detect_info.found & (1 << category))
5327                 break;
5328             }
5329           if (i < coding_category_raw_text)
5330             setup_coding_system (CODING_ID_NAME (this->id), coding);
5331           else if (detect_info.rejected == CATEGORY_MASK_ANY)
5332             setup_coding_system (Qraw_text, coding);
5333           else if (detect_info.rejected)
5334             for (i = 0; i < coding_category_raw_text; i++)
5335               if (! (detect_info.rejected & (1 << coding_priorities[i])))
5336                 {
5337                   this = coding_categories + coding_priorities[i];
5338                   setup_coding_system (CODING_ID_NAME (this->id), coding);
5339                   break;
5340                 }
5341         }
5342     }
5343   else if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qutf_16))
5344     {
5345       Lisp_Object coding_systems;
5346       struct coding_detection_info detect_info;
5347
5348       coding_systems
5349         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_16_bom);
5350       detect_info.found = detect_info.rejected = 0;
5351       if (CONSP (coding_systems)
5352           && detect_coding_utf_16 (coding, &detect_info)
5353           && (detect_info.found & (CATEGORY_MASK_UTF_16_LE
5354                                    | CATEGORY_MASK_UTF_16_BE)))
5355         {
5356           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
5357             setup_coding_system (XCAR (coding_systems), coding);
5358           else
5359             setup_coding_system (XCDR (coding_systems), coding);
5360         }
5361     }
5362
5363   attrs = CODING_ID_ATTRS (coding->id);
5364   coding_type = CODING_ATTR_TYPE (attrs);
5365
5366   /* If we have not yet decided the EOL type, detect it now.  But, the
5367      detection is impossible for a CCL based coding system, in which
5368      case, we detct the EOL type after decoding.  */
5369   if (VECTORP (CODING_ID_EOL_TYPE (coding->id))
5370       && ! EQ (coding_type, Qccl))
5371     {
5372       int eol_seen = detect_eol (coding->source, coding->src_bytes,
5373                                  (enum coding_category) XINT (CODING_ATTR_CATEGORY (attrs)));
5374
5375       if (eol_seen != EOL_SEEN_NONE)
5376         adjust_coding_eol_type (coding, eol_seen);
5377     }
5378 }
5379
5380
5381 static void
5382 decode_eol (coding)
5383      struct coding_system *coding;
5384 {
5385   if (VECTORP (CODING_ID_EOL_TYPE (coding->id)))
5386     {
5387       unsigned char *p = CHAR_POS_ADDR (coding->dst_pos);
5388       unsigned char *pend = p + coding->produced;
5389       int eol_seen = EOL_SEEN_NONE;
5390
5391       for (; p < pend; p++)
5392         {
5393           if (*p == '\n')
5394             eol_seen |= EOL_SEEN_LF;
5395           else if (*p == '\r')
5396             {
5397               if (p + 1 < pend && *(p + 1) == '\n')
5398                 {
5399                   eol_seen |= EOL_SEEN_CRLF;
5400                   p++;
5401                 }
5402               else
5403                 eol_seen |= EOL_SEEN_CR;
5404             }
5405         }
5406       if (eol_seen != EOL_SEEN_NONE)
5407         adjust_coding_eol_type (coding, eol_seen);
5408     }
5409
5410   if (EQ (CODING_ID_EOL_TYPE (coding->id), Qmac))
5411     {
5412       unsigned char *p = CHAR_POS_ADDR (coding->dst_pos);
5413       unsigned char *pend = p + coding->produced;
5414
5415       for (; p < pend; p++)
5416         if (*p == '\r')
5417           *p = '\n';
5418     }
5419   else if (EQ (CODING_ID_EOL_TYPE (coding->id), Qdos))
5420     {
5421       unsigned char *p, *pbeg, *pend;
5422       Lisp_Object undo_list;
5423
5424       move_gap_both (coding->dst_pos + coding->produced_char,
5425                      coding->dst_pos_byte + coding->produced);
5426       undo_list = current_buffer->undo_list;
5427       current_buffer->undo_list = Qt;
5428       del_range_2 (coding->dst_pos, coding->dst_pos_byte, GPT, GPT_BYTE, 0);
5429       current_buffer->undo_list = undo_list;
5430       pbeg = GPT_ADDR;
5431       pend = pbeg + coding->produced;
5432
5433       for (p = pend - 1; p >= pbeg; p--)
5434         if (*p == '\r')
5435           {
5436             safe_bcopy ((char *) (p + 1), (char *) p, pend - p - 1);
5437             pend--;
5438           }
5439       coding->produced_char -= coding->produced - (pend - pbeg);
5440       coding->produced = pend - pbeg;
5441       insert_from_gap (coding->produced_char, coding->produced);
5442     }
5443 }
5444
5445 static void
5446 translate_chars (coding, table)
5447      struct coding_system *coding;
5448      Lisp_Object table;
5449 {
5450   int *charbuf = coding->charbuf;
5451   int *charbuf_end = charbuf + coding->charbuf_used;
5452   int c;
5453
5454   if (coding->chars_at_source)
5455     return;
5456
5457   while (charbuf < charbuf_end)
5458     {
5459       c = *charbuf;
5460       if (c < 0)
5461         charbuf += c;
5462       else
5463         *charbuf++ = translate_char (table, c);
5464     }
5465 }
5466
5467 static int
5468 produce_chars (coding)
5469      struct coding_system *coding;
5470 {
5471   unsigned char *dst = coding->destination + coding->produced;
5472   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5473   int produced;
5474   int produced_chars = 0;
5475
5476   if (! coding->chars_at_source)
5477     {
5478       /* Characters are in coding->charbuf.  */
5479       int *buf = coding->charbuf;
5480       int *buf_end = buf + coding->charbuf_used;
5481       unsigned char *adjusted_dst_end;
5482
5483       if (BUFFERP (coding->src_object)
5484           && EQ (coding->src_object, coding->dst_object))
5485         dst_end = ((unsigned char *) coding->source) + coding->consumed;
5486       adjusted_dst_end = dst_end - MAX_MULTIBYTE_LENGTH;
5487
5488       while (buf < buf_end)
5489         {
5490           int c = *buf++;
5491
5492           if (dst >= adjusted_dst_end)
5493             {
5494               dst = alloc_destination (coding,
5495                                        buf_end - buf + MAX_MULTIBYTE_LENGTH,
5496                                        dst);
5497               dst_end = coding->destination + coding->dst_bytes;
5498               adjusted_dst_end = dst_end - MAX_MULTIBYTE_LENGTH;
5499             }
5500           if (c >= 0)
5501             {
5502               if (coding->dst_multibyte
5503                   || ! CHAR_BYTE8_P (c))
5504                 CHAR_STRING_ADVANCE (c, dst);
5505               else
5506                 *dst++ = CHAR_TO_BYTE8 (c);
5507               produced_chars++;
5508             }
5509           else
5510             /* This is an annotation datum.  (-C) is the length of
5511                it.  */
5512             buf += -c - 1;
5513         }
5514     }
5515   else
5516     {
5517       const unsigned char *src = coding->source;
5518       const unsigned char *src_end = src + coding->src_bytes;
5519       Lisp_Object eol_type;
5520
5521       eol_type = CODING_ID_EOL_TYPE (coding->id);
5522
5523       if (coding->src_multibyte != coding->dst_multibyte)
5524         {
5525           if (coding->src_multibyte)
5526             {
5527               int multibytep = 1;
5528               int consumed_chars;
5529
5530               while (1)
5531                 {
5532                   const unsigned char *src_base = src;
5533                   int c;
5534
5535                   ONE_MORE_BYTE (c);
5536                   if (c == '\r')
5537                     {
5538                       if (EQ (eol_type, Qdos))
5539                         {
5540                           if (src == src_end)
5541                             {
5542                               coding->result = CODING_RESULT_INSUFFICIENT_SRC;
5543                               goto no_more_source;
5544                             }
5545                           if (*src == '\n')
5546                             c = *src++;
5547                         }
5548                       else if (EQ (eol_type, Qmac))
5549                         c = '\n';
5550                     }
5551                   if (dst == dst_end)
5552                     {
5553                       coding->consumed = src - coding->source;
5554
5555                     if (EQ (coding->src_object, coding->dst_object))
5556                       dst_end = (unsigned char *) src;
5557                     if (dst == dst_end)
5558                       {
5559                         dst = alloc_destination (coding, src_end - src + 1,
5560                                                  dst);
5561                         dst_end = coding->destination + coding->dst_bytes;
5562                         coding_set_source (coding);
5563                         src = coding->source + coding->consumed;
5564                         src_end = coding->source + coding->src_bytes;
5565                       }
5566                     }
5567                   *dst++ = c;
5568                   produced_chars++;
5569                 }
5570             no_more_source:
5571               ;
5572             }
5573           else
5574             while (src < src_end)
5575               {
5576                 int multibytep = 1;
5577                 int c = *src++;
5578
5579                 if (c == '\r')
5580                   {
5581                     if (EQ (eol_type, Qdos))
5582                       {
5583                         if (src < src_end
5584                             && *src == '\n')
5585                           c = *src++;
5586                       }
5587                     else if (EQ (eol_type, Qmac))
5588                       c = '\n';
5589                   }
5590                 if (dst >= dst_end - 1)
5591                   {
5592                     coding->consumed = src - coding->source;
5593
5594                     if (EQ (coding->src_object, coding->dst_object))
5595                       dst_end = (unsigned char *) src;
5596                     if (dst >= dst_end - 1)
5597                       {
5598                         dst = alloc_destination (coding, src_end - src + 2,
5599                                                  dst);
5600                         dst_end = coding->destination + coding->dst_bytes;
5601                         coding_set_source (coding);
5602                         src = coding->source + coding->consumed;
5603                         src_end = coding->source + coding->src_bytes;
5604                       }
5605                   }
5606                 EMIT_ONE_BYTE (c);
5607               }
5608         }
5609       else
5610         {
5611           if (!EQ (coding->src_object, coding->dst_object))
5612             {
5613               int require = coding->src_bytes - coding->dst_bytes;
5614
5615               if (require > 0)
5616                 {
5617                   EMACS_INT offset = src - coding->source;
5618
5619                   dst = alloc_destination (coding, require, dst);
5620                   coding_set_source (coding);
5621                   src = coding->source + offset;
5622                   src_end = coding->source + coding->src_bytes;
5623                 }
5624             }
5625           produced_chars = coding->src_chars;
5626           while (src < src_end)
5627             {
5628               int c = *src++;
5629
5630               if (c == '\r')
5631                 {
5632                   if (EQ (eol_type, Qdos))
5633                     {
5634                       if (src < src_end
5635                           && *src == '\n')
5636                         c = *src++;
5637                       produced_chars--;
5638                     }
5639                   else if (EQ (eol_type, Qmac))
5640                     c = '\n';
5641                 }
5642               *dst++ = c;
5643             }
5644         }
5645       coding->consumed = coding->src_bytes;
5646       coding->consumed_char = coding->src_chars;
5647     }
5648
5649   produced = dst - (coding->destination + coding->produced);
5650   if (BUFFERP (coding->dst_object))
5651     insert_from_gap (produced_chars, produced);
5652   coding->produced += produced;
5653   coding->produced_char += produced_chars;
5654   return produced_chars;
5655 }
5656
5657 /* Compose text in CODING->object according to the annotation data at
5658    CHARBUF.  CHARBUF is an array:
5659      [ -LENGTH ANNOTATION_MASK FROM TO METHOD COMP_LEN [ COMPONENTS... ] ]
5660  */
5661
5662 static INLINE void
5663 produce_composition (coding, charbuf)
5664      struct coding_system *coding;
5665      int *charbuf;
5666 {
5667   int len;
5668   EMACS_INT from, to;
5669   enum composition_method method;
5670   Lisp_Object components;
5671
5672   len = -charbuf[0];
5673   from = coding->dst_pos + charbuf[2];
5674   to = coding->dst_pos + charbuf[3];
5675   method = (enum composition_method) (charbuf[4]);
5676
5677   if (method == COMPOSITION_RELATIVE)
5678     components = Qnil;
5679   else
5680     {
5681       Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
5682       int i;
5683
5684       len -= 5;
5685       charbuf += 5;
5686       for (i = 0; i < len; i++)
5687         args[i] = make_number (charbuf[i]);
5688       components = (method == COMPOSITION_WITH_ALTCHARS
5689                     ? Fstring (len, args) : Fvector (len, args));
5690     }
5691   compose_text (from, to, components, Qnil, coding->dst_object);
5692 }
5693
5694
5695 /* Put `charset' property on text in CODING->object according to
5696    the annotation data at CHARBUF.  CHARBUF is an array:
5697      [ -LENGTH ANNOTATION_MASK FROM TO CHARSET-ID ]
5698  */
5699
5700 static INLINE void
5701 produce_charset (coding, charbuf)
5702      struct coding_system *coding;
5703      int *charbuf;
5704 {
5705   EMACS_INT from = coding->dst_pos + charbuf[2];
5706   EMACS_INT to = coding->dst_pos + charbuf[3];
5707   struct charset *charset = CHARSET_FROM_ID (charbuf[4]);
5708
5709   Fput_text_property (make_number (from), make_number (to),
5710                       Qcharset, CHARSET_NAME (charset),
5711                       coding->dst_object);
5712 }
5713
5714
5715 #define CHARBUF_SIZE 0x4000
5716
5717 #define ALLOC_CONVERSION_WORK_AREA(coding)                              \
5718   do {                                                                  \
5719     int size = CHARBUF_SIZE;;                                           \
5720                                                                         \
5721     coding->charbuf = NULL;                                             \
5722     while (size > 1024)                                                 \
5723       {                                                                 \
5724         coding->charbuf = (int *) alloca (sizeof (int) * size);         \
5725         if (coding->charbuf)                                            \
5726           break;                                                        \
5727         size >>= 1;                                                     \
5728       }                                                                 \
5729     if (! coding->charbuf)                                              \
5730       {                                                                 \
5731         coding->result = CODING_RESULT_INSUFFICIENT_MEM;                \
5732         return coding->result;                                          \
5733       }                                                                 \
5734     coding->charbuf_size = size;                                        \
5735   } while (0)
5736
5737
5738 static void
5739 produce_annotation (coding)
5740      struct coding_system *coding;
5741 {
5742   int *charbuf = coding->charbuf;
5743   int *charbuf_end = charbuf + coding->charbuf_used;
5744
5745   if (NILP (coding->dst_object))
5746     return;
5747
5748   while (charbuf < charbuf_end)
5749     {
5750       if (*charbuf >= 0)
5751         charbuf++;
5752       else
5753         {
5754           int len = -*charbuf;
5755           switch (charbuf[1])
5756             {
5757             case CODING_ANNOTATE_COMPOSITION_MASK:
5758               produce_composition (coding, charbuf);
5759               break;
5760             case CODING_ANNOTATE_CHARSET_MASK:
5761               produce_charset (coding, charbuf);
5762               break;
5763             default:
5764               abort ();
5765             }
5766           charbuf += len;
5767         }
5768     }
5769 }
5770
5771 /* Decode the data at CODING->src_object into CODING->dst_object.
5772    CODING->src_object is a buffer, a string, or nil.
5773    CODING->dst_object is a buffer.
5774
5775    If CODING->src_object is a buffer, it must be the current buffer.
5776    In this case, if CODING->src_pos is positive, it is a position of
5777    the source text in the buffer, otherwise, the source text is in the
5778    gap area of the buffer, and CODING->src_pos specifies the offset of
5779    the text from GPT (which must be the same as PT).  If this is the
5780    same buffer as CODING->dst_object, CODING->src_pos must be
5781    negative.
5782
5783    If CODING->src_object is a string, CODING->src_pos in an index to
5784    that string.
5785
5786    If CODING->src_object is nil, CODING->source must already point to
5787    the non-relocatable memory area.  In this case, CODING->src_pos is
5788    an offset from CODING->source.
5789
5790    The decoded data is inserted at the current point of the buffer
5791    CODING->dst_object.
5792 */
5793
5794 static int
5795 decode_coding (coding)
5796      struct coding_system *coding;
5797 {
5798   Lisp_Object attrs;
5799
5800   if (BUFFERP (coding->src_object)
5801       && coding->src_pos > 0
5802       && coding->src_pos < GPT
5803       && coding->src_pos + coding->src_chars > GPT)
5804     move_gap_both (coding->src_pos, coding->src_pos_byte);
5805
5806   if (BUFFERP (coding->dst_object))
5807     {
5808       if (current_buffer != XBUFFER (coding->dst_object))
5809         set_buffer_internal (XBUFFER (coding->dst_object));
5810       if (GPT != PT)
5811         move_gap_both (PT, PT_BYTE);
5812     }
5813
5814   coding->consumed = coding->consumed_char = 0;
5815   coding->produced = coding->produced_char = 0;
5816   coding->chars_at_source = 0;
5817   coding->result = CODING_RESULT_SUCCESS;
5818   coding->errors = 0;
5819
5820   ALLOC_CONVERSION_WORK_AREA (coding);
5821
5822   attrs = CODING_ID_ATTRS (coding->id);
5823
5824   do
5825     {
5826       coding_set_source (coding);
5827       coding->annotated = 0;
5828       (*(coding->decoder)) (coding);
5829       if (!NILP (CODING_ATTR_DECODE_TBL (attrs)))
5830         translate_chars (coding, CODING_ATTR_DECODE_TBL (attrs));
5831       else if (!NILP (Vstandard_translation_table_for_decode))
5832         translate_chars (coding, Vstandard_translation_table_for_decode);
5833       coding_set_destination (coding);
5834       produce_chars (coding);
5835       if (coding->annotated)
5836         produce_annotation (coding);
5837     }
5838   while (coding->consumed < coding->src_bytes
5839          && ! coding->result);
5840
5841   if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qccl)
5842       && SYMBOLP (CODING_ID_EOL_TYPE (coding->id))
5843       && ! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix))
5844     decode_eol (coding);
5845
5846   coding->carryover_bytes = 0;
5847   if (coding->consumed < coding->src_bytes)
5848     {
5849       int nbytes = coding->src_bytes - coding->consumed;
5850       const unsigned char *src;
5851
5852       coding_set_source (coding);
5853       coding_set_destination (coding);
5854       src = coding->source + coding->consumed;
5855
5856       if (coding->mode & CODING_MODE_LAST_BLOCK)
5857         {
5858           /* Flush out unprocessed data as binary chars.  We are sure
5859              that the number of data is less than the size of
5860              coding->charbuf.  */
5861           while (nbytes-- > 0)
5862             {
5863               int c = *src++;
5864
5865               coding->charbuf[coding->charbuf_used++] = (c & 0x80 ? - c : c);
5866             }
5867           produce_chars (coding);
5868         }
5869       else
5870         {
5871           /* Record unprocessed bytes in coding->carryover.  We are
5872              sure that the number of data is less than the size of
5873              coding->carryover.  */
5874           unsigned char *p = coding->carryover;
5875
5876           coding->carryover_bytes = nbytes;
5877           while (nbytes-- > 0)
5878             *p++ = *src++;
5879         }
5880       coding->consumed = coding->src_bytes;
5881     }
5882
5883   return coding->result;
5884 }
5885
5886
5887 /* Extract an annotation datum from a composition starting at POS and
5888    ending before LIMIT of CODING->src_object (buffer or string), store
5889    the data in BUF, set *STOP to a starting position of the next
5890    composition (if any) or to LIMIT, and return the address of the
5891    next element of BUF.
5892
5893    If such an annotation is not found, set *STOP to a starting
5894    position of a composition after POS (if any) or to LIMIT, and
5895    return BUF.  */
5896
5897 static INLINE int *
5898 handle_composition_annotation (pos, limit, coding, buf, stop)
5899      EMACS_INT pos, limit;
5900      struct coding_system *coding;
5901      int *buf;
5902      EMACS_INT *stop;
5903 {
5904   EMACS_INT start, end;
5905   Lisp_Object prop;
5906
5907   if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
5908       || end > limit)
5909     *stop = limit;
5910   else if (start > pos)
5911     *stop = start;
5912   else
5913     {
5914       if (start == pos)
5915         {
5916           /* We found a composition.  Store the corresponding
5917              annotation data in BUF.  */
5918           int *head = buf;
5919           enum composition_method method = COMPOSITION_METHOD (prop);
5920           int nchars = COMPOSITION_LENGTH (prop);
5921
5922           ADD_COMPOSITION_DATA (buf, 0, nchars, method);
5923           if (method != COMPOSITION_RELATIVE)
5924             {
5925               Lisp_Object components;
5926               int len, i, i_byte;
5927
5928               components = COMPOSITION_COMPONENTS (prop);
5929               if (VECTORP (components))
5930                 {
5931                   len = XVECTOR (components)->size;
5932                   for (i = 0; i < len; i++)
5933                     *buf++ = XINT (AREF (components, i));
5934                 }
5935               else if (STRINGP (components))
5936                 {
5937                   len = SCHARS (components);
5938                   i = i_byte = 0;
5939                   while (i < len)
5940                     {
5941                       FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
5942                       buf++;
5943                     }
5944                 }
5945               else if (INTEGERP (components))
5946                 {
5947                   len = 1;
5948                   *buf++ = XINT (components);
5949                 }
5950               else if (CONSP (components))
5951                 {
5952                   for (len = 0; CONSP (components);
5953                        len++, components = XCDR (components))
5954                     *buf++ = XINT (XCAR (components));
5955                 }
5956               else
5957                 abort ();
5958               *head -= len;
5959             }
5960         }
5961
5962       if (find_composition (end, limit, &start, &end, &prop,
5963                             coding->src_object)
5964           && end <= limit)
5965         *stop = start;
5966       else
5967         *stop = limit;
5968     }
5969   return buf;
5970 }
5971
5972
5973 /* Extract an annotation datum from a text property `charset' at POS of
5974    CODING->src_object (buffer of string), store the data in BUF, set
5975    *STOP to the position where the value of `charset' property changes
5976    (limiting by LIMIT), and return the address of the next element of
5977    BUF.
5978
5979    If the property value is nil, set *STOP to the position where the
5980    property value is non-nil (limiting by LIMIT), and return BUF.  */
5981
5982 static INLINE int *
5983 handle_charset_annotation (pos, limit, coding, buf, stop)
5984      EMACS_INT pos, limit;
5985      struct coding_system *coding;
5986      int *buf;
5987      EMACS_INT *stop;
5988 {
5989   Lisp_Object val, next;
5990   int id;
5991
5992   val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
5993   if (! NILP (val) && CHARSETP (val))
5994     id = XINT (CHARSET_SYMBOL_ID (val));
5995   else
5996     id = -1;
5997   ADD_CHARSET_DATA (buf, 0, 0, id);
5998   next = Fnext_single_property_change (make_number (pos), Qcharset,
5999                                        coding->src_object,
6000                                        make_number (limit));
6001   *stop = XINT (next);
6002   return buf;
6003 }
6004
6005
6006 static void
6007 consume_chars (coding)
6008      struct coding_system *coding;
6009 {
6010   int *buf = coding->charbuf;
6011   int *buf_end = coding->charbuf + coding->charbuf_size;
6012   const unsigned char *src = coding->source + coding->consumed;
6013   const unsigned char *src_end = coding->source + coding->src_bytes;
6014   EMACS_INT pos = coding->src_pos + coding->consumed_char;
6015   EMACS_INT end_pos = coding->src_pos + coding->src_chars;
6016   int multibytep = coding->src_multibyte;
6017   Lisp_Object eol_type;
6018   int c;
6019   EMACS_INT stop, stop_composition, stop_charset;
6020
6021   eol_type = CODING_ID_EOL_TYPE (coding->id);
6022   if (VECTORP (eol_type))
6023     eol_type = Qunix;
6024
6025   /* Note: composition handling is not yet implemented.  */
6026   coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
6027
6028   if (NILP (coding->src_object))
6029     stop = stop_composition = stop_charset = end_pos;
6030   else
6031     {
6032       if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
6033         stop = stop_composition = pos;
6034       else
6035         stop = stop_composition = end_pos;
6036       if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
6037         stop = stop_charset = pos;
6038       else
6039         stop_charset = end_pos;
6040     }
6041
6042   /* Compensate for CRLF and annotation.  */
6043   buf_end -= 1 + MAX_ANNOTATION_LENGTH;
6044   while (buf < buf_end)
6045     {
6046       if (pos == stop)
6047         {
6048           if (pos == end_pos)
6049             break;
6050           if (pos == stop_composition)
6051             buf = handle_composition_annotation (pos, end_pos, coding,
6052                                                  buf, &stop_composition);
6053           if (pos == stop_charset)
6054             buf = handle_charset_annotation (pos, end_pos, coding,
6055                                              buf, &stop_charset);
6056           stop = (stop_composition < stop_charset
6057                   ? stop_composition : stop_charset);
6058         }
6059
6060       if (! multibytep)
6061         {
6062           EMACS_INT bytes;
6063
6064           if (! CODING_FOR_UNIBYTE (coding)
6065               && (bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
6066             c = STRING_CHAR_ADVANCE (src), pos += bytes;
6067           else
6068             c = *src++, pos++;
6069         }
6070       else
6071         c = STRING_CHAR_ADVANCE (src), pos++;
6072       if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
6073         c = '\n';
6074       if (! EQ (eol_type, Qunix))
6075         {
6076           if (c == '\n')
6077             {
6078               if (EQ (eol_type, Qdos))
6079                 *buf++ = '\r';
6080               else
6081                 c = '\r';
6082             }
6083         }
6084       *buf++ = c;
6085     }
6086
6087   coding->consumed = src - coding->source;
6088   coding->consumed_char = pos - coding->src_pos;
6089   coding->charbuf_used = buf - coding->charbuf;
6090   coding->chars_at_source = 0;
6091 }
6092
6093
6094 /* Encode the text at CODING->src_object into CODING->dst_object.
6095    CODING->src_object is a buffer or a string.
6096    CODING->dst_object is a buffer or nil.
6097
6098    If CODING->src_object is a buffer, it must be the current buffer.
6099    In this case, if CODING->src_pos is positive, it is a position of
6100    the source text in the buffer, otherwise. the source text is in the
6101    gap area of the buffer, and coding->src_pos specifies the offset of
6102    the text from GPT (which must be the same as PT).  If this is the
6103    same buffer as CODING->dst_object, CODING->src_pos must be
6104    negative and CODING should not have `pre-write-conversion'.
6105
6106    If CODING->src_object is a string, CODING should not have
6107    `pre-write-conversion'.
6108
6109    If CODING->dst_object is a buffer, the encoded data is inserted at
6110    the current point of that buffer.
6111
6112    If CODING->dst_object is nil, the encoded data is placed at the
6113    memory area specified by CODING->destination.  */
6114
6115 static int
6116 encode_coding (coding)
6117      struct coding_system *coding;
6118 {
6119   Lisp_Object attrs;
6120
6121   attrs = CODING_ID_ATTRS (coding->id);
6122
6123   if (BUFFERP (coding->dst_object))
6124     {
6125       set_buffer_internal (XBUFFER (coding->dst_object));
6126       coding->dst_multibyte
6127         = ! NILP (current_buffer->enable_multibyte_characters);
6128     }
6129
6130   coding->consumed = coding->consumed_char = 0;
6131   coding->produced = coding->produced_char = 0;
6132   coding->result = CODING_RESULT_SUCCESS;
6133   coding->errors = 0;
6134
6135   ALLOC_CONVERSION_WORK_AREA (coding);
6136
6137   do {
6138     coding_set_source (coding);
6139     consume_chars (coding);
6140
6141     if (!NILP (CODING_ATTR_ENCODE_TBL (attrs)))
6142       translate_chars (coding, CODING_ATTR_ENCODE_TBL (attrs));
6143     else if (!NILP (Vstandard_translation_table_for_encode))
6144       translate_chars (coding, Vstandard_translation_table_for_encode);
6145
6146     coding_set_destination (coding);
6147     (*(coding->encoder)) (coding);
6148   } while (coding->consumed_char < coding->src_chars);
6149
6150   if (BUFFERP (coding->dst_object))
6151     insert_from_gap (coding->produced_char, coding->produced);
6152
6153   return (coding->result);
6154 }
6155
6156
6157 /* Stack of working buffers used in code conversion.  An nil element
6158    means that the code conversion of that level is not using a working
6159    buffer.  */
6160 Lisp_Object Vcode_conversion_work_buf_list;
6161
6162 /* A working buffer used by the top level conversion.  */
6163 Lisp_Object Vcode_conversion_reused_work_buf;
6164
6165
6166 /* Return a working buffer that can be freely used by the following
6167    code conversion.  MULTIBYTEP specifies the multibyteness of the
6168    buffer.  */
6169
6170 Lisp_Object
6171 make_conversion_work_buffer (multibytep, depth)
6172      int multibytep, depth;
6173 {
6174   struct buffer *current = current_buffer;
6175   Lisp_Object buf, name;
6176
6177   if (depth == 0)
6178     {
6179       if (NILP (Vcode_conversion_reused_work_buf))
6180         Vcode_conversion_reused_work_buf
6181           = Fget_buffer_create (build_string (" *code-converting-work<0>*"));
6182       buf = Vcode_conversion_reused_work_buf;
6183     }
6184   else
6185     {
6186       if (depth < 0)
6187         {
6188           name = build_string (" *code-converting-work*");
6189           name = Fgenerate_new_buffer_name (name, Qnil);
6190         }
6191       else
6192         {
6193           char str[128];
6194
6195           sprintf (str, " *code-converting-work*<%d>", depth);
6196           name = build_string (str);
6197         }
6198       buf = Fget_buffer_create (name);
6199     }
6200   set_buffer_internal (XBUFFER (buf));
6201   current_buffer->undo_list = Qt;
6202   Ferase_buffer ();
6203   Fset_buffer_multibyte (multibytep ? Qt : Qnil);
6204   set_buffer_internal (current);
6205   return buf;
6206 }
6207
6208 static Lisp_Object
6209 code_conversion_restore (buffer)
6210      Lisp_Object buffer;
6211 {
6212   Lisp_Object workbuf;
6213
6214   workbuf = XCAR (Vcode_conversion_work_buf_list);
6215   if (! NILP (workbuf)
6216       && ! EQ (workbuf, Vcode_conversion_reused_work_buf)
6217       && ! NILP (Fbuffer_live_p (workbuf)))
6218     Fkill_buffer (workbuf);
6219   Vcode_conversion_work_buf_list = XCDR (Vcode_conversion_work_buf_list);
6220   set_buffer_internal (XBUFFER (buffer));
6221   return Qnil;
6222 }
6223
6224 static Lisp_Object
6225 code_conversion_save (buffer, with_work_buf, multibyte)
6226      Lisp_Object buffer;
6227      int with_work_buf, multibyte;
6228 {
6229   Lisp_Object workbuf;
6230
6231   if (with_work_buf)
6232     {
6233       int depth = XINT (Flength (Vcode_conversion_work_buf_list));
6234
6235       workbuf = make_conversion_work_buffer (multibyte, depth);
6236     }
6237   else
6238     workbuf = Qnil;
6239   Vcode_conversion_work_buf_list
6240     = Fcons (workbuf, Vcode_conversion_work_buf_list);
6241   record_unwind_protect (code_conversion_restore, buffer);
6242   return workbuf;
6243 }
6244
6245 int
6246 decode_coding_gap (coding, chars, bytes)
6247      struct coding_system *coding;
6248      EMACS_INT chars, bytes;
6249 {
6250   int count = specpdl_ptr - specpdl;
6251   Lisp_Object attrs;
6252   Lisp_Object buffer;
6253
6254   buffer = Fcurrent_buffer ();
6255   code_conversion_save (buffer, 0, 0);
6256
6257   coding->src_object = buffer;
6258   coding->src_chars = chars;
6259   coding->src_bytes = bytes;
6260   coding->src_pos = -chars;
6261   coding->src_pos_byte = -bytes;
6262   coding->src_multibyte = chars < bytes;
6263   coding->dst_object = buffer;
6264   coding->dst_pos = PT;
6265   coding->dst_pos_byte = PT_BYTE;
6266   coding->dst_multibyte = ! NILP (current_buffer->enable_multibyte_characters);
6267   coding->mode |= CODING_MODE_LAST_BLOCK;
6268
6269   if (CODING_REQUIRE_DETECTION (coding))
6270     detect_coding (coding);
6271
6272   decode_coding (coding);
6273
6274   attrs = CODING_ID_ATTRS (coding->id);
6275   if (! NILP (CODING_ATTR_POST_READ (attrs)))
6276     {
6277       EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
6278       Lisp_Object val;
6279
6280       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
6281       val = call1 (CODING_ATTR_POST_READ (attrs),
6282                    make_number (coding->produced_char));
6283       CHECK_NATNUM (val);
6284       coding->produced_char += Z - prev_Z;
6285       coding->produced += Z_BYTE - prev_Z_BYTE;
6286     }
6287
6288   unbind_to (count, Qnil);
6289   return coding->result;
6290 }
6291
6292 int
6293 encode_coding_gap (coding, chars, bytes)
6294      struct coding_system *coding;
6295      EMACS_INT chars, bytes;
6296 {
6297   int count = specpdl_ptr - specpdl;
6298   Lisp_Object buffer;
6299
6300   buffer = Fcurrent_buffer ();
6301   code_conversion_save (buffer, 0, 0);
6302
6303   coding->src_object = buffer;
6304   coding->src_chars = chars;
6305   coding->src_bytes = bytes;
6306   coding->src_pos = -chars;
6307   coding->src_pos_byte = -bytes;
6308   coding->src_multibyte = chars < bytes;
6309   coding->dst_object = coding->src_object;
6310   coding->dst_pos = PT;
6311   coding->dst_pos_byte = PT_BYTE;
6312
6313   encode_coding (coding);
6314
6315   unbind_to (count, Qnil);
6316   return coding->result;
6317 }
6318
6319
6320 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
6321    SRC_OBJECT into DST_OBJECT by coding context CODING.
6322
6323    SRC_OBJECT is a buffer, a string, or Qnil.
6324
6325    If it is a buffer, the text is at point of the buffer.  FROM and TO
6326    are positions in the buffer.
6327
6328    If it is a string, the text is at the beginning of the string.
6329    FROM and TO are indices to the string.
6330
6331    If it is nil, the text is at coding->source.  FROM and TO are
6332    indices to coding->source.
6333
6334    DST_OBJECT is a buffer, Qt, or Qnil.
6335
6336    If it is a buffer, the decoded text is inserted at point of the
6337    buffer.  If the buffer is the same as SRC_OBJECT, the source text
6338    is deleted.
6339
6340    If it is Qt, a string is made from the decoded text, and
6341    set in CODING->dst_object.
6342
6343    If it is Qnil, the decoded text is stored at CODING->destination.
6344    The caller must allocate CODING->dst_bytes bytes at
6345    CODING->destination by xmalloc.  If the decoded text is longer than
6346    CODING->dst_bytes, CODING->destination is relocated by xrealloc.
6347  */
6348
6349 void
6350 decode_coding_object (coding, src_object, from, from_byte, to, to_byte,
6351                       dst_object)
6352      struct coding_system *coding;
6353      Lisp_Object src_object;
6354      EMACS_INT from, from_byte, to, to_byte;
6355      Lisp_Object dst_object;
6356 {
6357   int count = specpdl_ptr - specpdl;
6358   unsigned char *destination;
6359   EMACS_INT dst_bytes;
6360   EMACS_INT chars = to - from;
6361   EMACS_INT bytes = to_byte - from_byte;
6362   Lisp_Object attrs;
6363   Lisp_Object buffer;
6364   int saved_pt = -1, saved_pt_byte;
6365
6366   buffer = Fcurrent_buffer ();
6367
6368   if (NILP (dst_object))
6369     {
6370       destination = coding->destination;
6371       dst_bytes = coding->dst_bytes;
6372     }
6373
6374   coding->src_object = src_object;
6375   coding->src_chars = chars;
6376   coding->src_bytes = bytes;
6377   coding->src_multibyte = chars < bytes;
6378
6379   if (STRINGP (src_object))
6380     {
6381       coding->src_pos = from;
6382       coding->src_pos_byte = from_byte;
6383     }
6384   else if (BUFFERP (src_object))
6385     {
6386       set_buffer_internal (XBUFFER (src_object));
6387       if (from != GPT)
6388         move_gap_both (from, from_byte);
6389       if (EQ (src_object, dst_object))
6390         {
6391           saved_pt = PT, saved_pt_byte = PT_BYTE;
6392           TEMP_SET_PT_BOTH (from, from_byte);
6393           del_range_both (from, from_byte, to, to_byte, 1);
6394           coding->src_pos = -chars;
6395           coding->src_pos_byte = -bytes;
6396         }
6397       else
6398         {
6399           coding->src_pos = from;
6400           coding->src_pos_byte = from_byte;
6401         }
6402     }
6403
6404   if (CODING_REQUIRE_DETECTION (coding))
6405     detect_coding (coding);
6406   attrs = CODING_ID_ATTRS (coding->id);
6407
6408   if (EQ (dst_object, Qt)
6409       || (! NILP (CODING_ATTR_POST_READ (attrs))
6410           && NILP (dst_object)))
6411     {
6412       coding->dst_object = code_conversion_save (buffer, 1, 1);
6413       coding->dst_pos = BEG;
6414       coding->dst_pos_byte = BEG_BYTE;
6415       coding->dst_multibyte = 1;
6416     }
6417   else if (BUFFERP (dst_object))
6418     {
6419       code_conversion_save (buffer, 0, 0);
6420       coding->dst_object = dst_object;
6421       coding->dst_pos = BUF_PT (XBUFFER (dst_object));
6422       coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
6423       coding->dst_multibyte
6424         = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
6425     }
6426   else
6427     {
6428       code_conversion_save (buffer, 0, 0);
6429       coding->dst_object = Qnil;
6430       coding->dst_multibyte = 1;
6431     }
6432
6433   decode_coding (coding);
6434
6435   if (BUFFERP (coding->dst_object))
6436     set_buffer_internal (XBUFFER (coding->dst_object));
6437
6438   if (! NILP (CODING_ATTR_POST_READ (attrs)))
6439     {
6440       struct gcpro gcpro1, gcpro2;
6441       EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
6442       Lisp_Object val;
6443
6444       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
6445       GCPRO2 (coding->src_object, coding->dst_object);
6446       val = call1 (CODING_ATTR_POST_READ (attrs),
6447                    make_number (coding->produced_char));
6448       UNGCPRO;
6449       CHECK_NATNUM (val);
6450       coding->produced_char += Z - prev_Z;
6451       coding->produced += Z_BYTE - prev_Z_BYTE;
6452     }
6453
6454   if (EQ (dst_object, Qt))
6455     {
6456       coding->dst_object = Fbuffer_string ();
6457     }
6458   else if (NILP (dst_object) && BUFFERP (coding->dst_object))
6459     {
6460       set_buffer_internal (XBUFFER (coding->dst_object));
6461       if (dst_bytes < coding->produced)
6462         {
6463           destination
6464             = (unsigned char *) xrealloc (destination, coding->produced);
6465           if (! destination)
6466             {
6467               coding->result = CODING_RESULT_INSUFFICIENT_DST;
6468               unbind_to (count, Qnil);
6469               return;
6470             }
6471           if (BEGV < GPT && GPT < BEGV + coding->produced_char)
6472             move_gap_both (BEGV, BEGV_BYTE);
6473           bcopy (BEGV_ADDR, destination, coding->produced);
6474           coding->destination = destination;
6475         }
6476     }
6477
6478   if (saved_pt >= 0)
6479     {
6480       /* This is the case of:
6481          (BUFFERP (src_object) && EQ (src_object, dst_object))
6482          As we have moved PT while replacing the original buffer
6483          contents, we must recover it now.  */
6484       set_buffer_internal (XBUFFER (src_object));
6485       if (saved_pt < from)
6486         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
6487       else if (saved_pt < from + chars)
6488         TEMP_SET_PT_BOTH (from, from_byte);
6489       else if (! NILP (current_buffer->enable_multibyte_characters))
6490         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
6491                           saved_pt_byte + (coding->produced - bytes));
6492       else
6493         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
6494                           saved_pt_byte + (coding->produced - bytes));
6495     }
6496
6497   unbind_to (count, Qnil);
6498 }
6499
6500
6501 void
6502 encode_coding_object (coding, src_object, from, from_byte, to, to_byte,
6503                       dst_object)
6504      struct coding_system *coding;
6505      Lisp_Object src_object;
6506      EMACS_INT from, from_byte, to, to_byte;
6507      Lisp_Object dst_object;
6508 {
6509   int count = specpdl_ptr - specpdl;
6510   EMACS_INT chars = to - from;
6511   EMACS_INT bytes = to_byte - from_byte;
6512   Lisp_Object attrs;
6513   Lisp_Object buffer;
6514   int saved_pt = -1, saved_pt_byte;
6515
6516   buffer = Fcurrent_buffer ();
6517
6518   coding->src_object = src_object;
6519   coding->src_chars = chars;
6520   coding->src_bytes = bytes;
6521   coding->src_multibyte = chars < bytes;
6522
6523   attrs = CODING_ID_ATTRS (coding->id);
6524
6525   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
6526     {
6527       coding->src_object = code_conversion_save (buffer, 1,
6528                                                  coding->src_multibyte);
6529       set_buffer_internal (XBUFFER (coding->src_object));
6530       if (STRINGP (src_object))
6531         insert_from_string (src_object, from, from_byte, chars, bytes, 0);
6532       else if (BUFFERP (src_object))
6533         insert_from_buffer (XBUFFER (src_object), from, chars, 0);
6534       else
6535         insert_1_both (coding->source + from, chars, bytes, 0, 0, 0);
6536
6537       if (EQ (src_object, dst_object))
6538         {
6539           set_buffer_internal (XBUFFER (src_object));
6540           saved_pt = PT, saved_pt_byte = PT_BYTE;
6541           del_range_both (from, from_byte, to, to_byte, 1);
6542           set_buffer_internal (XBUFFER (coding->src_object));
6543         }
6544
6545       call2 (CODING_ATTR_PRE_WRITE (attrs),
6546              make_number (BEG), make_number (Z));
6547       coding->src_object = Fcurrent_buffer ();
6548       if (BEG != GPT)
6549         move_gap_both (BEG, BEG_BYTE);
6550       coding->src_chars = Z - BEG;
6551       coding->src_bytes = Z_BYTE - BEG_BYTE;
6552       coding->src_pos = BEG;
6553       coding->src_pos_byte = BEG_BYTE;
6554       coding->src_multibyte = Z < Z_BYTE;
6555     }
6556   else if (STRINGP (src_object))
6557     {
6558       code_conversion_save (buffer, 0, 0);
6559       coding->src_pos = from;
6560       coding->src_pos_byte = from_byte;
6561     }
6562   else if (BUFFERP (src_object))
6563     {
6564       code_conversion_save (buffer, 0, 0);
6565       set_buffer_internal (XBUFFER (src_object));
6566       if (EQ (src_object, dst_object))
6567         {
6568           saved_pt = PT, saved_pt_byte = PT_BYTE;
6569           coding->src_object = del_range_1 (from, to, 1, 1);
6570           coding->src_pos = 0;
6571           coding->src_pos_byte = 0;
6572         }
6573       else
6574         {
6575           if (from < GPT && to >= GPT)
6576             move_gap_both (from, from_byte);
6577           coding->src_pos = from;
6578           coding->src_pos_byte = from_byte;
6579         }
6580     }
6581   else
6582     code_conversion_save (buffer, 0, 0);
6583
6584   if (BUFFERP (dst_object))
6585     {
6586       coding->dst_object = dst_object;
6587       if (EQ (src_object, dst_object))
6588         {
6589           coding->dst_pos = from;
6590           coding->dst_pos_byte = from_byte;
6591         }
6592       else
6593         {
6594           coding->dst_pos = BUF_PT (XBUFFER (dst_object));
6595           coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
6596         }
6597       coding->dst_multibyte
6598         = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
6599     }
6600   else if (EQ (dst_object, Qt))
6601     {
6602       coding->dst_object = Qnil;
6603       coding->dst_bytes = coding->src_chars;
6604       if (coding->dst_bytes == 0)
6605         coding->dst_bytes = 1;
6606       coding->destination = (unsigned char *) xmalloc (coding->dst_bytes);
6607       coding->dst_multibyte = 0;
6608     }
6609   else
6610     {
6611       coding->dst_object = Qnil;
6612       coding->dst_multibyte = 0;
6613     }
6614
6615   encode_coding (coding);
6616
6617   if (EQ (dst_object, Qt))
6618     {
6619       if (BUFFERP (coding->dst_object))
6620         coding->dst_object = Fbuffer_string ();
6621       else
6622         {
6623           coding->dst_object
6624             = make_unibyte_string ((char *) coding->destination,
6625                                    coding->produced);
6626           xfree (coding->destination);
6627         }
6628     }
6629
6630   if (saved_pt >= 0)
6631     {
6632       /* This is the case of:
6633          (BUFFERP (src_object) && EQ (src_object, dst_object))
6634          As we have moved PT while replacing the original buffer
6635          contents, we must recover it now.  */
6636       set_buffer_internal (XBUFFER (src_object));
6637       if (saved_pt < from)
6638         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
6639       else if (saved_pt < from + chars)
6640         TEMP_SET_PT_BOTH (from, from_byte);
6641       else if (! NILP (current_buffer->enable_multibyte_characters))
6642         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
6643                           saved_pt_byte + (coding->produced - bytes));
6644       else
6645         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
6646                           saved_pt_byte + (coding->produced - bytes));
6647     }
6648
6649   unbind_to (count, Qnil);
6650 }
6651
6652
6653 Lisp_Object
6654 preferred_coding_system ()
6655 {
6656   int id = coding_categories[coding_priorities[0]].id;
6657
6658   return CODING_ID_NAME (id);
6659 }
6660
6661 \f
6662 #ifdef emacs
6663 /*** 8. Emacs Lisp library functions ***/
6664
6665 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
6666        doc: /* Return t if OBJECT is nil or a coding-system.
6667 See the documentation of `define-coding-system' for information
6668 about coding-system objects.  */)
6669      (obj)
6670      Lisp_Object obj;
6671 {
6672   return ((NILP (obj) || CODING_SYSTEM_P (obj)) ? Qt : Qnil);
6673 }
6674
6675 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
6676        Sread_non_nil_coding_system, 1, 1, 0,
6677        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.  */)
6678      (prompt)
6679      Lisp_Object prompt;
6680 {
6681   Lisp_Object val;
6682   do
6683     {
6684       val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6685                               Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
6686     }
6687   while (SCHARS (val) == 0);
6688   return (Fintern (val, Qnil));
6689 }
6690
6691 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
6692        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
6693 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.  */)
6694      (prompt, default_coding_system)
6695      Lisp_Object prompt, default_coding_system;
6696 {
6697   Lisp_Object val;
6698   if (SYMBOLP (default_coding_system))
6699     XSETSTRING (default_coding_system, XPNTR (SYMBOL_NAME (default_coding_system)));
6700   val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6701                           Qt, Qnil, Qcoding_system_history,
6702                           default_coding_system, Qnil);
6703   return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
6704 }
6705
6706 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
6707        1, 1, 0,
6708        doc: /* Check validity of CODING-SYSTEM.
6709 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.  */)
6710   (coding_system)
6711      Lisp_Object coding_system;
6712 {
6713   CHECK_SYMBOL (coding_system);
6714   if (!NILP (Fcoding_system_p (coding_system)))
6715     return coding_system;
6716   while (1)
6717     Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
6718 }
6719
6720 \f
6721 /* Detect how the bytes at SRC of length SRC_BYTES are encoded.  If
6722    HIGHEST is nonzero, return the coding system of the highest
6723    priority among the detected coding systems.  Otherwize return a
6724    list of detected coding systems sorted by their priorities.  If
6725    MULTIBYTEP is nonzero, it is assumed that the bytes are in correct
6726    multibyte form but contains only ASCII and eight-bit chars.
6727    Otherwise, the bytes are raw bytes.
6728
6729    CODING-SYSTEM controls the detection as below:
6730
6731    If it is nil, detect both text-format and eol-format.  If the
6732    text-format part of CODING-SYSTEM is already specified
6733    (e.g. `iso-latin-1'), detect only eol-format.  If the eol-format
6734    part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
6735    detect only text-format.  */
6736
6737 Lisp_Object
6738 detect_coding_system (src, src_bytes, highest, multibytep, coding_system)
6739      const unsigned char *src;
6740      int src_bytes, highest;
6741      int multibytep;
6742      Lisp_Object coding_system;
6743 {
6744   const unsigned char *src_end = src + src_bytes;
6745   Lisp_Object attrs, eol_type;
6746   Lisp_Object val;
6747   struct coding_system coding;
6748   int id;
6749   struct coding_detection_info detect_info;
6750
6751   if (NILP (coding_system))
6752     coding_system = Qundecided;
6753   setup_coding_system (coding_system, &coding);
6754   attrs = CODING_ID_ATTRS (coding.id);
6755   eol_type = CODING_ID_EOL_TYPE (coding.id);
6756   coding_system = CODING_ATTR_BASE_NAME (attrs);
6757
6758   coding.source = src;
6759   coding.src_bytes = src_bytes;
6760   coding.src_multibyte = multibytep;
6761   coding.consumed = 0;
6762   coding.mode |= CODING_MODE_LAST_BLOCK;
6763
6764   detect_info.checked = detect_info.found = detect_info.rejected = 0;
6765
6766   /* At first, detect text-format if necessary.  */
6767   if (XINT (CODING_ATTR_CATEGORY (attrs)) == coding_category_undecided)
6768     {
6769       enum coding_category category;
6770       struct coding_system *this;
6771       int c, i;
6772
6773       for (; src < src_end; src++)
6774         {
6775           c = *src;
6776           if (c & 0x80
6777               || (c < 0x20 && (c == ISO_CODE_ESC
6778                                || c == ISO_CODE_SI
6779                                || c == ISO_CODE_SO)))
6780             break;
6781         }
6782       coding.head_ascii = src - coding.source;
6783
6784       if (src < src_end)
6785         for (i = 0; i < coding_category_raw_text; i++)
6786           {
6787             category = coding_priorities[i];
6788             this = coding_categories + category;
6789
6790             if (this->id < 0)
6791               {
6792                 /* No coding system of this category is defined.  */
6793                 detect_info.rejected |= (1 << category);
6794               }
6795             else if (category >= coding_category_raw_text)
6796               continue;
6797             else if (detect_info.checked & (1 << category))
6798               {
6799                 if (highest
6800                     && (detect_info.found & (1 << category)))
6801                   break;
6802               }
6803             else
6804               {
6805                 if ((*(this->detector)) (&coding, &detect_info)
6806                     && highest
6807                     && (detect_info.found & (1 << category)))
6808                   break;
6809               }
6810           }
6811
6812
6813       if (detect_info.rejected == CATEGORY_MASK_ANY)
6814         {
6815           detect_info.found = CATEGORY_MASK_RAW_TEXT;
6816           id = coding_categories[coding_category_raw_text].id;
6817           val = Fcons (make_number (id), Qnil);
6818         }
6819       else if (! detect_info.rejected && ! detect_info.found)
6820         {
6821           detect_info.found = CATEGORY_MASK_ANY;
6822           id = coding_categories[coding_category_undecided].id;
6823           val = Fcons (make_number (id), Qnil);
6824         }
6825       else if (highest)
6826         {
6827           if (detect_info.found)
6828             {
6829               detect_info.found = 1 << category;
6830               val = Fcons (make_number (this->id), Qnil);
6831             }
6832           else
6833             for (i = 0; i < coding_category_raw_text; i++)
6834               if (! (detect_info.rejected & (1 << coding_priorities[i])))
6835                 {
6836                   detect_info.found = 1 << coding_priorities[i];
6837                   id = coding_categories[coding_priorities[i]].id;
6838                   val = Fcons (make_number (id), Qnil);
6839                   break;
6840                 }
6841         }
6842       else
6843         {
6844           int mask = detect_info.rejected | detect_info.found;
6845           int found = 0;
6846           val = Qnil;
6847
6848           for (i = coding_category_raw_text - 1; i >= 0; i--)
6849             {
6850               category = coding_priorities[i];
6851               if (! (mask & (1 << category)))
6852                 {
6853                   found |= 1 << category;
6854                   id = coding_categories[category].id;
6855                   val = Fcons (make_number (id), val);
6856                 }
6857             }
6858           for (i = coding_category_raw_text - 1; i >= 0; i--)
6859             {
6860               category = coding_priorities[i];
6861               if (detect_info.found & (1 << category))
6862                 {
6863                   id = coding_categories[category].id;
6864                   val = Fcons (make_number (id), val);
6865                 }
6866             }
6867           detect_info.found |= found;
6868         }
6869     }
6870   else
6871     {
6872       detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
6873       val = Fcons (make_number (coding.id), Qnil);
6874     }
6875
6876   /* Then, detect eol-format if necessary.  */
6877   {
6878     int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol;
6879     Lisp_Object tail;
6880
6881     if (VECTORP (eol_type))
6882       {
6883         if (detect_info.found & ~CATEGORY_MASK_UTF_16)
6884           normal_eol = detect_eol (coding.source, src_bytes,
6885                                    coding_category_raw_text);
6886         if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
6887                                  | CATEGORY_MASK_UTF_16_BE_NOSIG))
6888           utf_16_be_eol = detect_eol (coding.source, src_bytes,
6889                                       coding_category_utf_16_be);
6890         if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
6891                                  | CATEGORY_MASK_UTF_16_LE_NOSIG))
6892           utf_16_le_eol = detect_eol (coding.source, src_bytes,
6893                                       coding_category_utf_16_le);
6894       }
6895     else
6896       {
6897         if (EQ (eol_type, Qunix))
6898           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
6899         else if (EQ (eol_type, Qdos))
6900           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
6901         else
6902           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
6903       }
6904
6905     for (tail = val; CONSP (tail); tail = XCDR (tail))
6906       {
6907         enum coding_category category;
6908         int this_eol;
6909
6910         id = XINT (XCAR (tail));
6911         attrs = CODING_ID_ATTRS (id);
6912         category = XINT (CODING_ATTR_CATEGORY (attrs));
6913         eol_type = CODING_ID_EOL_TYPE (id);
6914         if (VECTORP (eol_type))
6915           {
6916             if (category == coding_category_utf_16_be
6917                 || category == coding_category_utf_16_be_nosig)
6918               this_eol = utf_16_be_eol;
6919             else if (category == coding_category_utf_16_le
6920                      || category == coding_category_utf_16_le_nosig)
6921               this_eol = utf_16_le_eol;
6922             else
6923               this_eol = normal_eol;
6924
6925             if (this_eol == EOL_SEEN_LF)
6926               XSETCAR (tail, AREF (eol_type, 0));
6927             else if (this_eol == EOL_SEEN_CRLF)
6928               XSETCAR (tail, AREF (eol_type, 1));
6929             else if (this_eol == EOL_SEEN_CR)
6930               XSETCAR (tail, AREF (eol_type, 2));
6931             else
6932               XSETCAR (tail, CODING_ID_NAME (id));
6933           }
6934         else
6935           XSETCAR (tail, CODING_ID_NAME (id));
6936       }
6937   }
6938
6939   return (highest ? XCAR (val) : val);
6940 }
6941
6942
6943 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
6944        2, 3, 0,
6945        doc: /* Detect coding system of the text in the region between START and END.
6946 Return a list of possible coding systems ordered by priority.
6947
6948 If only ASCII characters are found, it returns a list of single element
6949 `undecided' or its subsidiary coding system according to a detected
6950 end-of-line format.
6951
6952 If optional argument HIGHEST is non-nil, return the coding system of
6953 highest priority.  */)
6954      (start, end, highest)
6955      Lisp_Object start, end, highest;
6956 {
6957   int from, to;
6958   int from_byte, to_byte;
6959
6960   CHECK_NUMBER_COERCE_MARKER (start);
6961   CHECK_NUMBER_COERCE_MARKER (end);
6962
6963   validate_region (&start, &end);
6964   from = XINT (start), to = XINT (end);
6965   from_byte = CHAR_TO_BYTE (from);
6966   to_byte = CHAR_TO_BYTE (to);
6967
6968   if (from < GPT && to >= GPT)
6969     move_gap_both (to, to_byte);
6970
6971   return detect_coding_system (BYTE_POS_ADDR (from_byte),
6972                                to_byte - from_byte,
6973                                !NILP (highest),
6974                                !NILP (current_buffer
6975                                       ->enable_multibyte_characters),
6976                                Qnil);
6977 }
6978
6979 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
6980        1, 2, 0,
6981        doc: /* Detect coding system of the text in STRING.
6982 Return a list of possible coding systems ordered by priority.
6983
6984 If only ASCII characters are found, it returns a list of single element
6985 `undecided' or its subsidiary coding system according to a detected
6986 end-of-line format.
6987
6988 If optional argument HIGHEST is non-nil, return the coding system of
6989 highest priority.  */)
6990      (string, highest)
6991      Lisp_Object string, highest;
6992 {
6993   CHECK_STRING (string);
6994
6995   return detect_coding_system (SDATA (string), SBYTES (string),
6996                                !NILP (highest), STRING_MULTIBYTE (string),
6997                                Qnil);
6998 }
6999
7000
7001 static INLINE int
7002 char_encodable_p (c, attrs)
7003      int c;
7004      Lisp_Object attrs;
7005 {
7006   Lisp_Object tail;
7007   struct charset *charset;
7008
7009   for (tail = CODING_ATTR_CHARSET_LIST (attrs);
7010        CONSP (tail); tail = XCDR (tail))
7011     {
7012       charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
7013       if (CHAR_CHARSET_P (c, charset))
7014         break;
7015     }
7016   return (! NILP (tail));
7017 }
7018
7019
7020 /* Return a list of coding systems that safely encode the text between
7021    START and END.  If EXCLUDE is non-nil, it is a list of coding
7022    systems not to check.  The returned list doesn't contain any such
7023    coding systems.  In any case, if the text contains only ASCII or is
7024    unibyte, return t.  */
7025
7026 DEFUN ("find-coding-systems-region-internal",
7027        Ffind_coding_systems_region_internal,
7028        Sfind_coding_systems_region_internal, 2, 3, 0,
7029        doc: /* Internal use only.  */)
7030      (start, end, exclude)
7031      Lisp_Object start, end, exclude;
7032 {
7033   Lisp_Object coding_attrs_list, safe_codings;
7034   EMACS_INT start_byte, end_byte;
7035   const unsigned char *p, *pbeg, *pend;
7036   int c;
7037   Lisp_Object tail, elt;
7038
7039   if (STRINGP (start))
7040     {
7041       if (!STRING_MULTIBYTE (start)
7042           || SCHARS (start) == SBYTES (start))
7043         return Qt;
7044       start_byte = 0;
7045       end_byte = SBYTES (start);
7046     }
7047   else
7048     {
7049       CHECK_NUMBER_COERCE_MARKER (start);
7050       CHECK_NUMBER_COERCE_MARKER (end);
7051       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
7052         args_out_of_range (start, end);
7053       if (NILP (current_buffer->enable_multibyte_characters))
7054         return Qt;
7055       start_byte = CHAR_TO_BYTE (XINT (start));
7056       end_byte = CHAR_TO_BYTE (XINT (end));
7057       if (XINT (end) - XINT (start) == end_byte - start_byte)
7058         return Qt;
7059
7060       if (XINT (start) < GPT && XINT (end) > GPT)
7061         {
7062           if ((GPT - XINT (start)) < (XINT (end) - GPT))
7063             move_gap_both (XINT (start), start_byte);
7064           else
7065             move_gap_both (XINT (end), end_byte);
7066         }
7067     }
7068
7069   coding_attrs_list = Qnil;
7070   for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
7071     if (NILP (exclude)
7072         || NILP (Fmemq (XCAR (tail), exclude)))
7073       {
7074         Lisp_Object attrs;
7075
7076         attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
7077         if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs))
7078             && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
7079           coding_attrs_list = Fcons (attrs, coding_attrs_list);
7080       }
7081
7082   if (STRINGP (start))
7083     p = pbeg = SDATA (start);
7084   else
7085     p = pbeg = BYTE_POS_ADDR (start_byte);
7086   pend = p + (end_byte - start_byte);
7087
7088   while (p < pend && ASCII_BYTE_P (*p)) p++;
7089   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
7090
7091   while (p < pend)
7092     {
7093       if (ASCII_BYTE_P (*p))
7094         p++;
7095       else
7096         {
7097           c = STRING_CHAR_ADVANCE (p);
7098
7099           charset_map_loaded = 0;
7100           for (tail = coding_attrs_list; CONSP (tail);)
7101             {
7102               elt = XCAR (tail);
7103               if (NILP (elt))
7104                 tail = XCDR (tail);
7105               else if (char_encodable_p (c, elt))
7106                 tail = XCDR (tail);
7107               else if (CONSP (XCDR (tail)))
7108                 {
7109                   XSETCAR (tail, XCAR (XCDR (tail)));
7110                   XSETCDR (tail, XCDR (XCDR (tail)));
7111                 }
7112               else
7113                 {
7114                   XSETCAR (tail, Qnil);
7115                   tail = XCDR (tail);
7116                 }
7117             }
7118           if (charset_map_loaded)
7119             {
7120               EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
7121
7122               if (STRINGP (start))
7123                 pbeg = SDATA (start);
7124               else
7125                 pbeg = BYTE_POS_ADDR (start_byte);
7126               p = pbeg + p_offset;
7127               pend = pbeg + pend_offset;
7128             }
7129         }
7130     }
7131
7132   safe_codings = Qnil;
7133   for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
7134     if (! NILP (XCAR (tail)))
7135       safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
7136
7137   return safe_codings;
7138 }
7139
7140
7141 DEFUN ("unencodable-char-position", Funencodable_char_position,
7142        Sunencodable_char_position, 3, 5, 0,
7143        doc: /*
7144 Return position of first un-encodable character in a region.
7145 START and END specfiy the region and CODING-SYSTEM specifies the
7146 encoding to check.  Return nil if CODING-SYSTEM does encode the region.
7147
7148 If optional 4th argument COUNT is non-nil, it specifies at most how
7149 many un-encodable characters to search.  In this case, the value is a
7150 list of positions.
7151
7152 If optional 5th argument STRING is non-nil, it is a string to search
7153 for un-encodable characters.  In that case, START and END are indexes
7154 to the string.  */)
7155      (start, end, coding_system, count, string)
7156      Lisp_Object start, end, coding_system, count, string;
7157 {
7158   int n;
7159   struct coding_system coding;
7160   Lisp_Object attrs, charset_list;
7161   Lisp_Object positions;
7162   int from, to;
7163   const unsigned char *p, *stop, *pend;
7164   int ascii_compatible;
7165
7166   setup_coding_system (Fcheck_coding_system (coding_system), &coding);
7167   attrs = CODING_ID_ATTRS (coding.id);
7168   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
7169     return Qnil;
7170   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
7171   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
7172
7173   if (NILP (string))
7174     {
7175       validate_region (&start, &end);
7176       from = XINT (start);
7177       to = XINT (end);
7178       if (NILP (current_buffer->enable_multibyte_characters)
7179           || (ascii_compatible
7180               && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
7181         return Qnil;
7182       p = CHAR_POS_ADDR (from);
7183       pend = CHAR_POS_ADDR (to);
7184       if (from < GPT && to >= GPT)
7185         stop = GPT_ADDR;
7186       else
7187         stop = pend;
7188     }
7189   else
7190     {
7191       CHECK_STRING (string);
7192       CHECK_NATNUM (start);
7193       CHECK_NATNUM (end);
7194       from = XINT (start);
7195       to = XINT (end);
7196       if (from > to
7197           || to > SCHARS (string))
7198         args_out_of_range_3 (string, start, end);
7199       if (! STRING_MULTIBYTE (string))
7200         return Qnil;
7201       p = SDATA (string) + string_char_to_byte (string, from);
7202       stop = pend = SDATA (string) + string_char_to_byte (string, to);
7203       if (ascii_compatible && (to - from) == (pend - p))
7204         return Qnil;
7205     }
7206
7207   if (NILP (count))
7208     n = 1;
7209   else
7210     {
7211       CHECK_NATNUM (count);
7212       n = XINT (count);
7213     }
7214
7215   positions = Qnil;
7216   while (1)
7217     {
7218       int c;
7219
7220       if (ascii_compatible)
7221         while (p < stop && ASCII_BYTE_P (*p))
7222           p++, from++;
7223       if (p >= stop)
7224         {
7225           if (p >= pend)
7226             break;
7227           stop = pend;
7228           p = GAP_END_ADDR;
7229         }
7230
7231       c = STRING_CHAR_ADVANCE (p);
7232       if (! (ASCII_CHAR_P (c) && ascii_compatible)
7233           && ! char_charset (c, charset_list, NULL))
7234         {
7235           positions = Fcons (make_number (from), positions);
7236           n--;
7237           if (n == 0)
7238             break;
7239         }
7240
7241       from++;
7242     }
7243
7244   return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
7245 }
7246
7247
7248 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
7249        Scheck_coding_systems_region, 3, 3, 0,
7250        doc: /* Check if the region is encodable by coding systems.
7251
7252 START and END are buffer positions specifying the region.
7253 CODING-SYSTEM-LIST is a list of coding systems to check.
7254
7255 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
7256 CODING-SYSTEM is a member of CODING-SYSTEM-LIst and can't encode the
7257 whole region, POS0, POS1, ... are buffer positions where non-encodable
7258 characters are found.
7259
7260 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
7261 value is nil.
7262
7263 START may be a string.  In that case, check if the string is
7264 encodable, and the value contains indices to the string instead of
7265 buffer positions.  END is ignored.  */)
7266      (start, end, coding_system_list)
7267      Lisp_Object start, end, coding_system_list;
7268 {
7269   Lisp_Object list;
7270   EMACS_INT start_byte, end_byte;
7271   int pos;
7272   const unsigned char *p, *pbeg, *pend;
7273   int c;
7274   Lisp_Object tail, elt;
7275
7276   if (STRINGP (start))
7277     {
7278       if (!STRING_MULTIBYTE (start)
7279           && SCHARS (start) != SBYTES (start))
7280         return Qnil;
7281       start_byte = 0;
7282       end_byte = SBYTES (start);
7283       pos = 0;
7284     }
7285   else
7286     {
7287       CHECK_NUMBER_COERCE_MARKER (start);
7288       CHECK_NUMBER_COERCE_MARKER (end);
7289       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
7290         args_out_of_range (start, end);
7291       if (NILP (current_buffer->enable_multibyte_characters))
7292         return Qnil;
7293       start_byte = CHAR_TO_BYTE (XINT (start));
7294       end_byte = CHAR_TO_BYTE (XINT (end));
7295       if (XINT (end) - XINT (start) == end_byte - start_byte)
7296         return Qt;
7297
7298       if (XINT (start) < GPT && XINT (end) > GPT)
7299         {
7300           if ((GPT - XINT (start)) < (XINT (end) - GPT))
7301             move_gap_both (XINT (start), start_byte);
7302           else
7303             move_gap_both (XINT (end), end_byte);
7304         }
7305       pos = XINT (start);
7306     }
7307
7308   list = Qnil;
7309   for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
7310     {
7311       elt = XCAR (tail);
7312       list = Fcons (Fcons (elt, Fcons (AREF (CODING_SYSTEM_SPEC (elt), 0),
7313                                        Qnil)),
7314                     list);
7315     }
7316
7317   if (STRINGP (start))
7318     p = pbeg = SDATA (start);
7319   else
7320     p = pbeg = BYTE_POS_ADDR (start_byte);
7321   pend = p + (end_byte - start_byte);
7322
7323   while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
7324   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
7325
7326   while (p < pend)
7327     {
7328       if (ASCII_BYTE_P (*p))
7329         p++;
7330       else
7331         {
7332           c = STRING_CHAR_ADVANCE (p);
7333
7334           charset_map_loaded = 0;
7335           for (tail = list; CONSP (tail); tail = XCDR (tail))
7336             {
7337               elt = XCDR (XCAR (tail));
7338               if (! char_encodable_p (c, XCAR (elt)))
7339                 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
7340             }
7341           if (charset_map_loaded)
7342             {
7343               EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
7344
7345               if (STRINGP (start))
7346                 pbeg = SDATA (start);
7347               else
7348                 pbeg = BYTE_POS_ADDR (start_byte);
7349               p = pbeg + p_offset;
7350               pend = pbeg + pend_offset;
7351             }
7352         }
7353       pos++;
7354     }
7355
7356   tail = list;
7357   list = Qnil;
7358   for (; CONSP (tail); tail = XCDR (tail))
7359     {
7360       elt = XCAR (tail);
7361       if (CONSP (XCDR (XCDR (elt))))
7362         list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
7363                       list);
7364     }
7365
7366   return list;
7367 }
7368
7369
7370
7371 Lisp_Object
7372 code_convert_region (start, end, coding_system, dst_object, encodep, norecord)
7373      Lisp_Object start, end, coding_system, dst_object;
7374      int encodep, norecord;
7375 {
7376   struct coding_system coding;
7377   EMACS_INT from, from_byte, to, to_byte;
7378   Lisp_Object src_object;
7379
7380   CHECK_NUMBER_COERCE_MARKER (start);
7381   CHECK_NUMBER_COERCE_MARKER (end);
7382   if (NILP (coding_system))
7383     coding_system = Qno_conversion;
7384   else
7385     CHECK_CODING_SYSTEM (coding_system);
7386   src_object = Fcurrent_buffer ();
7387   if (NILP (dst_object))
7388     dst_object = src_object;
7389   else if (! EQ (dst_object, Qt))
7390     CHECK_BUFFER (dst_object);
7391
7392   validate_region (&start, &end);
7393   from = XFASTINT (start);
7394   from_byte = CHAR_TO_BYTE (from);
7395   to = XFASTINT (end);
7396   to_byte = CHAR_TO_BYTE (to);
7397
7398   setup_coding_system (coding_system, &coding);
7399   coding.mode |= CODING_MODE_LAST_BLOCK;
7400
7401   if (encodep)
7402     encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
7403                           dst_object);
7404   else
7405     decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
7406                           dst_object);
7407   if (! norecord)
7408     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
7409
7410   if (coding.result != CODING_RESULT_SUCCESS)
7411     error ("Code conversion error: %d", coding.result);
7412
7413   return (BUFFERP (dst_object)
7414           ? make_number (coding.produced_char)
7415           : coding.dst_object);
7416 }
7417
7418
7419 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
7420        3, 4, "r\nzCoding system: ",
7421        doc: /* Decode the current region from the specified coding system.
7422 When called from a program, takes four arguments:
7423         START, END, CODING-SYSTEM, and DESTINATION.
7424 START and END are buffer positions.
7425
7426 Optional 4th arguments DESTINATION specifies where the decoded text goes.
7427 If nil, the region between START and END is replace by the decoded text.
7428 If buffer, the decoded text is inserted in the buffer.
7429 If t, the decoded text is returned.
7430
7431 This function sets `last-coding-system-used' to the precise coding system
7432 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7433 not fully specified.)
7434 It returns the length of the decoded text.  */)
7435      (start, end, coding_system, destination)
7436      Lisp_Object start, end, coding_system, destination;
7437 {
7438   return code_convert_region (start, end, coding_system, destination, 0, 0);
7439 }
7440
7441 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
7442        3, 4, "r\nzCoding system: ",
7443        doc: /* Encode the current region by specified coding system.
7444 When called from a program, takes three arguments:
7445 START, END, and CODING-SYSTEM.  START and END are buffer positions.
7446
7447 Optional 4th arguments DESTINATION specifies where the encoded text goes.
7448 If nil, the region between START and END is replace by the encoded text.
7449 If buffer, the encoded text is inserted in the buffer.
7450 If t, the encoded text is returned.
7451
7452 This function sets `last-coding-system-used' to the precise coding system
7453 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7454 not fully specified.)
7455 It returns the length of the encoded text.  */)
7456   (start, end, coding_system, destination)
7457      Lisp_Object start, end, coding_system, destination;
7458 {
7459   return code_convert_region (start, end, coding_system, destination, 1, 0);
7460 }
7461
7462 Lisp_Object
7463 code_convert_string (string, coding_system, dst_object,
7464                      encodep, nocopy, norecord)
7465      Lisp_Object string, coding_system, dst_object;
7466      int encodep, nocopy, norecord;
7467 {
7468   struct coding_system coding;
7469   EMACS_INT chars, bytes;
7470
7471   CHECK_STRING (string);
7472   if (NILP (coding_system))
7473     {
7474       if (! norecord)
7475         Vlast_coding_system_used = Qno_conversion;
7476       if (NILP (dst_object))
7477         return (nocopy ? Fcopy_sequence (string) : string);
7478     }
7479
7480   if (NILP (coding_system))
7481     coding_system = Qno_conversion;
7482   else
7483     CHECK_CODING_SYSTEM (coding_system);
7484   if (NILP (dst_object))
7485     dst_object = Qt;
7486   else if (! EQ (dst_object, Qt))
7487     CHECK_BUFFER (dst_object);
7488
7489   setup_coding_system (coding_system, &coding);
7490   coding.mode |= CODING_MODE_LAST_BLOCK;
7491   chars = SCHARS (string);
7492   bytes = SBYTES (string);
7493   if (encodep)
7494     encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
7495   else
7496     decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
7497   if (! norecord)
7498     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
7499
7500   if (coding.result != CODING_RESULT_SUCCESS)
7501     error ("Code conversion error: %d", coding.result);
7502
7503   return (BUFFERP (dst_object)
7504           ? make_number (coding.produced_char)
7505           : coding.dst_object);
7506 }
7507
7508
7509 /* Encode or decode STRING according to CODING_SYSTEM.
7510    Do not set Vlast_coding_system_used.
7511
7512    This function is called only from macros DECODE_FILE and
7513    ENCODE_FILE, thus we ignore character composition.  */
7514
7515 Lisp_Object
7516 code_convert_string_norecord (string, coding_system, encodep)
7517      Lisp_Object string, coding_system;
7518      int encodep;
7519 {
7520   return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
7521 }
7522
7523
7524 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
7525        2, 4, 0,
7526        doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
7527
7528 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
7529 if the decoding operation is trivial.
7530
7531 Optional fourth arg BUFFER non-nil meant that the decoded text is
7532 inserted in BUFFER instead of returned as a string.  In this case,
7533 the return value is BUFFER.
7534
7535 This function sets `last-coding-system-used' to the precise coding system
7536 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7537 not fully specified.  */)
7538   (string, coding_system, nocopy, buffer)
7539      Lisp_Object string, coding_system, nocopy, buffer;
7540 {
7541   return code_convert_string (string, coding_system, buffer,
7542                               0, ! NILP (nocopy), 0);
7543 }
7544
7545 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
7546        2, 4, 0,
7547        doc: /* Encode STRING to CODING-SYSTEM, and return the result.
7548
7549 Optional third arg NOCOPY non-nil means it is OK to return STRING
7550 itself if the encoding operation is trivial.
7551
7552 Optional fourth arg BUFFER non-nil meant that the encoded text is
7553 inserted in BUFFER instead of returned as a string.  In this case,
7554 the return value is BUFFER.
7555
7556 This function sets `last-coding-system-used' to the precise coding system
7557 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7558 not fully specified.)  */)
7559      (string, coding_system, nocopy, buffer)
7560      Lisp_Object string, coding_system, nocopy, buffer;
7561 {
7562   return code_convert_string (string, coding_system, buffer,
7563                               1, ! NILP (nocopy), 1);
7564 }
7565
7566 \f
7567 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
7568        doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
7569 Return the corresponding character.  */)
7570      (code)
7571      Lisp_Object code;
7572 {
7573   Lisp_Object spec, attrs, val;
7574   struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
7575   int c;
7576
7577   CHECK_NATNUM (code);
7578   c = XFASTINT (code);
7579   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
7580   attrs = AREF (spec, 0);
7581
7582   if (ASCII_BYTE_P (c)
7583       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
7584     return code;
7585
7586   val = CODING_ATTR_CHARSET_LIST (attrs);
7587   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
7588   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
7589   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
7590
7591   if (c <= 0x7F)
7592     charset = charset_roman;
7593   else if (c >= 0xA0 && c < 0xDF)
7594     {
7595       charset = charset_kana;
7596       c -= 0x80;
7597     }
7598   else
7599     {
7600       int s1 = c >> 8, s2 = c & 0xFF;
7601
7602       if (s1 < 0x81 || (s1 > 0x9F && s1 < 0xE0) || s1 > 0xEF
7603           || s2 < 0x40 || s2 == 0x7F || s2 > 0xFC)
7604         error ("Invalid code: %d", code);
7605       SJIS_TO_JIS (c);
7606       charset = charset_kanji;
7607     }
7608   c = DECODE_CHAR (charset, c);
7609   if (c < 0)
7610     error ("Invalid code: %d", code);
7611   return make_number (c);
7612 }
7613
7614
7615 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
7616        doc: /* Encode a Japanese character CHAR to shift_jis encoding.
7617 Return the corresponding code in SJIS.  */)
7618      (ch)
7619     Lisp_Object ch;
7620 {
7621   Lisp_Object spec, attrs, charset_list;
7622   int c;
7623   struct charset *charset;
7624   unsigned code;
7625
7626   CHECK_CHARACTER (ch);
7627   c = XFASTINT (ch);
7628   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
7629   attrs = AREF (spec, 0);
7630
7631   if (ASCII_CHAR_P (c)
7632       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
7633     return ch;
7634
7635   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
7636   charset = char_charset (c, charset_list, &code);
7637   if (code == CHARSET_INVALID_CODE (charset))
7638     error ("Can't encode by shift_jis encoding: %d", c);
7639   JIS_TO_SJIS (code);
7640
7641   return make_number (code);
7642 }
7643
7644 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
7645        doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
7646 Return the corresponding character.  */)
7647      (code)
7648      Lisp_Object code;
7649 {
7650   Lisp_Object spec, attrs, val;
7651   struct charset *charset_roman, *charset_big5, *charset;
7652   int c;
7653
7654   CHECK_NATNUM (code);
7655   c = XFASTINT (code);
7656   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
7657   attrs = AREF (spec, 0);
7658
7659   if (ASCII_BYTE_P (c)
7660       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
7661     return code;
7662
7663   val = CODING_ATTR_CHARSET_LIST (attrs);
7664   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
7665   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
7666
7667   if (c <= 0x7F)
7668     charset = charset_roman;
7669   else
7670     {
7671       int b1 = c >> 8, b2 = c & 0x7F;
7672       if (b1 < 0xA1 || b1 > 0xFE
7673           || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
7674         error ("Invalid code: %d", code);
7675       charset = charset_big5;
7676     }
7677   c = DECODE_CHAR (charset, (unsigned )c);
7678   if (c < 0)
7679     error ("Invalid code: %d", code);
7680   return make_number (c);
7681 }
7682
7683 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
7684        doc: /* Encode the Big5 character CHAR to BIG5 coding system.
7685 Return the corresponding character code in Big5.  */)
7686      (ch)
7687      Lisp_Object ch;
7688 {
7689   Lisp_Object spec, attrs, charset_list;
7690   struct charset *charset;
7691   int c;
7692   unsigned code;
7693
7694   CHECK_CHARACTER (ch);
7695   c = XFASTINT (ch);
7696   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
7697   attrs = AREF (spec, 0);
7698   if (ASCII_CHAR_P (c)
7699       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
7700     return ch;
7701
7702   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
7703   charset = char_charset (c, charset_list, &code);
7704   if (code == CHARSET_INVALID_CODE (charset))
7705     error ("Can't encode by Big5 encoding: %d", c);
7706
7707   return make_number (code);
7708 }
7709
7710 \f
7711 DEFUN ("set-terminal-coding-system-internal",
7712        Fset_terminal_coding_system_internal,
7713        Sset_terminal_coding_system_internal, 1, 1, 0,
7714        doc: /* Internal use only.  */)
7715      (coding_system)
7716      Lisp_Object coding_system;
7717 {
7718   CHECK_SYMBOL (coding_system);
7719   setup_coding_system (Fcheck_coding_system (coding_system),
7720                         &terminal_coding);
7721
7722   /* We had better not send unsafe characters to terminal.  */
7723   terminal_coding.mode |= CODING_MODE_SAFE_ENCODING;
7724   /* Characer composition should be disabled.  */
7725   terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7726   terminal_coding.src_multibyte = 1;
7727   terminal_coding.dst_multibyte = 0;
7728   return Qnil;
7729 }
7730
7731 DEFUN ("set-safe-terminal-coding-system-internal",
7732        Fset_safe_terminal_coding_system_internal,
7733        Sset_safe_terminal_coding_system_internal, 1, 1, 0,
7734        doc: /* Internal use only.  */)
7735      (coding_system)
7736      Lisp_Object coding_system;
7737 {
7738   CHECK_SYMBOL (coding_system);
7739   setup_coding_system (Fcheck_coding_system (coding_system),
7740                        &safe_terminal_coding);
7741   /* Characer composition should be disabled.  */
7742   safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7743   safe_terminal_coding.src_multibyte = 1;
7744   safe_terminal_coding.dst_multibyte = 0;
7745   return Qnil;
7746 }
7747
7748 DEFUN ("terminal-coding-system",
7749        Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
7750        doc: /* Return coding system specified for terminal output.  */)
7751      ()
7752 {
7753   return CODING_ID_NAME (terminal_coding.id);
7754 }
7755
7756 DEFUN ("set-keyboard-coding-system-internal",
7757        Fset_keyboard_coding_system_internal,
7758        Sset_keyboard_coding_system_internal, 1, 1, 0,
7759        doc: /* Internal use only.  */)
7760      (coding_system)
7761      Lisp_Object coding_system;
7762 {
7763   CHECK_SYMBOL (coding_system);
7764   setup_coding_system (Fcheck_coding_system (coding_system),
7765                        &keyboard_coding);
7766   /* Characer composition should be disabled.  */
7767   keyboard_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7768   return Qnil;
7769 }
7770
7771 DEFUN ("keyboard-coding-system",
7772        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
7773        doc: /* Return coding system specified for decoding keyboard input.  */)
7774      ()
7775 {
7776   return CODING_ID_NAME (keyboard_coding.id);
7777 }
7778
7779 \f
7780 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
7781        Sfind_operation_coding_system,  1, MANY, 0,
7782        doc: /* Choose a coding system for an operation based on the target name.
7783 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
7784 DECODING-SYSTEM is the coding system to use for decoding
7785 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
7786 for encoding (in case OPERATION does encoding).
7787
7788 The first argument OPERATION specifies an I/O primitive:
7789   For file I/O, `insert-file-contents' or `write-region'.
7790   For process I/O, `call-process', `call-process-region', or `start-process'.
7791   For network I/O, `open-network-stream'.
7792
7793 The remaining arguments should be the same arguments that were passed
7794 to the primitive.  Depending on which primitive, one of those arguments
7795 is selected as the TARGET.  For example, if OPERATION does file I/O,
7796 whichever argument specifies the file name is TARGET.
7797
7798 TARGET has a meaning which depends on OPERATION:
7799   For file I/O, TARGET is a file name.
7800   For process I/O, TARGET is a process name.
7801   For network I/O, TARGET is a service name or a port number
7802
7803 This function looks up what specified for TARGET in,
7804 `file-coding-system-alist', `process-coding-system-alist',
7805 or `network-coding-system-alist' depending on OPERATION.
7806 They may specify a coding system, a cons of coding systems,
7807 or a function symbol to call.
7808 In the last case, we call the function with one argument,
7809 which is a list of all the arguments given to this function.
7810
7811 usage: (find-operation-coding-system OPERATION ARGUMENTS ...)  */)
7812      (nargs, args)
7813      int nargs;
7814      Lisp_Object *args;
7815 {
7816   Lisp_Object operation, target_idx, target, val;
7817   register Lisp_Object chain;
7818
7819   if (nargs < 2)
7820     error ("Too few arguments");
7821   operation = args[0];
7822   if (!SYMBOLP (operation)
7823       || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
7824     error ("Invalid first arguement");
7825   if (nargs < 1 + XINT (target_idx))
7826     error ("Too few arguments for operation: %s",
7827            SDATA (SYMBOL_NAME (operation)));
7828   target = args[XINT (target_idx) + 1];
7829   if (!(STRINGP (target)
7830         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
7831     error ("Invalid %dth argument", XINT (target_idx) + 1);
7832
7833   chain = ((EQ (operation, Qinsert_file_contents)
7834             || EQ (operation, Qwrite_region))
7835            ? Vfile_coding_system_alist
7836            : (EQ (operation, Qopen_network_stream)
7837               ? Vnetwork_coding_system_alist
7838               : Vprocess_coding_system_alist));
7839   if (NILP (chain))
7840     return Qnil;
7841
7842   for (; CONSP (chain); chain = XCDR (chain))
7843     {
7844       Lisp_Object elt;
7845
7846       elt = XCAR (chain);
7847       if (CONSP (elt)
7848           && ((STRINGP (target)
7849                && STRINGP (XCAR (elt))
7850                && fast_string_match (XCAR (elt), target) >= 0)
7851               || (INTEGERP (target) && EQ (target, XCAR (elt)))))
7852         {
7853           val = XCDR (elt);
7854           /* Here, if VAL is both a valid coding system and a valid
7855              function symbol, we return VAL as a coding system.  */
7856           if (CONSP (val))
7857             return val;
7858           if (! SYMBOLP (val))
7859             return Qnil;
7860           if (! NILP (Fcoding_system_p (val)))
7861             return Fcons (val, val);
7862           if (! NILP (Ffboundp (val)))
7863             {
7864               val = call1 (val, Flist (nargs, args));
7865               if (CONSP (val))
7866                 return val;
7867               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
7868                 return Fcons (val, val);
7869             }
7870           return Qnil;
7871         }
7872     }
7873   return Qnil;
7874 }
7875
7876 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
7877        Sset_coding_system_priority, 0, MANY, 0,
7878        doc: /* Assign higher priority to the coding systems given as arguments.
7879 If multiple coding systems belongs to the same category,
7880 all but the first one are ignored.
7881
7882 usage: (set-coding-system-priority ...)  */)
7883      (nargs, args)
7884      int nargs;
7885      Lisp_Object *args;
7886 {
7887   int i, j;
7888   int changed[coding_category_max];
7889   enum coding_category priorities[coding_category_max];
7890
7891   bzero (changed, sizeof changed);
7892
7893   for (i = j = 0; i < nargs; i++)
7894     {
7895       enum coding_category category;
7896       Lisp_Object spec, attrs;
7897
7898       CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
7899       attrs = AREF (spec, 0);
7900       category = XINT (CODING_ATTR_CATEGORY (attrs));
7901       if (changed[category])
7902         /* Ignore this coding system because a coding system of the
7903            same category already had a higher priority.  */
7904         continue;
7905       changed[category] = 1;
7906       priorities[j++] = category;
7907       if (coding_categories[category].id >= 0
7908           && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
7909         setup_coding_system (args[i], &coding_categories[category]);
7910       Fset (AREF (Vcoding_category_table, category), args[i]);
7911     }
7912
7913   /* Now we have decided top J priorities.  Reflect the order of the
7914      original priorities to the remaining priorities.  */
7915
7916   for (i = j, j = 0; i < coding_category_max; i++, j++)
7917     {
7918       while (j < coding_category_max
7919              && changed[coding_priorities[j]])
7920         j++;
7921       if (j == coding_category_max)
7922         abort ();
7923       priorities[i] = coding_priorities[j];
7924     }
7925
7926   bcopy (priorities, coding_priorities, sizeof priorities);
7927
7928   /* Update `coding-category-list'.  */
7929   Vcoding_category_list = Qnil;
7930   for (i = coding_category_max - 1; i >= 0; i--)
7931     Vcoding_category_list
7932       = Fcons (AREF (Vcoding_category_table, priorities[i]),
7933                Vcoding_category_list);
7934
7935   return Qnil;
7936 }
7937
7938 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
7939        Scoding_system_priority_list, 0, 1, 0,
7940        doc: /* Return a list of coding systems ordered by their priorities.
7941 HIGHESTP non-nil means just return the highest priority one.  */)
7942      (highestp)
7943      Lisp_Object highestp;
7944 {
7945   int i;
7946   Lisp_Object val;
7947
7948   for (i = 0, val = Qnil; i < coding_category_max; i++)
7949     {
7950       enum coding_category category = coding_priorities[i];
7951       int id = coding_categories[category].id;
7952       Lisp_Object attrs;
7953
7954       if (id < 0)
7955         continue;
7956       attrs = CODING_ID_ATTRS (id);
7957       if (! NILP (highestp))
7958         return CODING_ATTR_BASE_NAME (attrs);
7959       val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
7960     }
7961   return Fnreverse (val);
7962 }
7963
7964 static char *suffixes[] = { "-unix", "-dos", "-mac" };
7965
7966 static Lisp_Object
7967 make_subsidiaries (base)
7968      Lisp_Object base;
7969 {
7970   Lisp_Object subsidiaries;
7971   int base_name_len = SBYTES (SYMBOL_NAME (base));
7972   char *buf = (char *) alloca (base_name_len + 6);
7973   int i;
7974
7975   bcopy (SDATA (SYMBOL_NAME (base)), buf, base_name_len);
7976   subsidiaries = Fmake_vector (make_number (3), Qnil);
7977   for (i = 0; i < 3; i++)
7978     {
7979       bcopy (suffixes[i], buf + base_name_len, strlen (suffixes[i]) + 1);
7980       ASET (subsidiaries, i, intern (buf));
7981     }
7982   return subsidiaries;
7983 }
7984
7985
7986 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
7987        Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
7988        doc: /* For internal use only.
7989 usage: (define-coding-system-internal ...)  */)
7990      (nargs, args)
7991      int nargs;
7992      Lisp_Object *args;
7993 {
7994   Lisp_Object name;
7995   Lisp_Object spec_vec;         /* [ ATTRS ALIASE EOL_TYPE ] */
7996   Lisp_Object attrs;            /* Vector of attributes.  */
7997   Lisp_Object eol_type;
7998   Lisp_Object aliases;
7999   Lisp_Object coding_type, charset_list, safe_charsets;
8000   enum coding_category category;
8001   Lisp_Object tail, val;
8002   int max_charset_id = 0;
8003   int i;
8004
8005   if (nargs < coding_arg_max)
8006     goto short_args;
8007
8008   attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
8009
8010   name = args[coding_arg_name];
8011   CHECK_SYMBOL (name);
8012   CODING_ATTR_BASE_NAME (attrs) = name;
8013
8014   val = args[coding_arg_mnemonic];
8015   if (! STRINGP (val))
8016     CHECK_CHARACTER (val);
8017   CODING_ATTR_MNEMONIC (attrs) = val;
8018
8019   coding_type = args[coding_arg_coding_type];
8020   CHECK_SYMBOL (coding_type);
8021   CODING_ATTR_TYPE (attrs) = coding_type;
8022
8023   charset_list = args[coding_arg_charset_list];
8024   if (SYMBOLP (charset_list))
8025     {
8026       if (EQ (charset_list, Qiso_2022))
8027         {
8028           if (! EQ (coding_type, Qiso_2022))
8029             error ("Invalid charset-list");
8030           charset_list = Viso_2022_charset_list;
8031         }
8032       else if (EQ (charset_list, Qemacs_mule))
8033         {
8034           if (! EQ (coding_type, Qemacs_mule))
8035             error ("Invalid charset-list");
8036           charset_list = Vemacs_mule_charset_list;
8037         }
8038       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
8039         if (max_charset_id < XFASTINT (XCAR (tail)))
8040           max_charset_id = XFASTINT (XCAR (tail));
8041     }
8042   else
8043     {
8044       charset_list = Fcopy_sequence (charset_list);
8045       for (tail = charset_list; !NILP (tail); tail = Fcdr (tail))
8046         {
8047           struct charset *charset;
8048
8049           val = Fcar (tail);
8050           CHECK_CHARSET_GET_CHARSET (val, charset);
8051           if (EQ (coding_type, Qiso_2022)
8052               ? CHARSET_ISO_FINAL (charset) < 0
8053               : EQ (coding_type, Qemacs_mule)
8054               ? CHARSET_EMACS_MULE_ID (charset) < 0
8055               : 0)
8056             error ("Can't handle charset `%s'",
8057                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8058
8059           XSETCAR (tail, make_number (charset->id));
8060           if (max_charset_id < charset->id)
8061             max_charset_id = charset->id;
8062         }
8063     }
8064   CODING_ATTR_CHARSET_LIST (attrs) = charset_list;
8065
8066   safe_charsets = Fmake_string (make_number (max_charset_id + 1),
8067                                 make_number (255));
8068   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
8069     SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
8070   CODING_ATTR_SAFE_CHARSETS (attrs) = safe_charsets;
8071
8072   CODING_ATTR_ASCII_COMPAT (attrs) = args[coding_arg_ascii_compatible_p];
8073
8074   val = args[coding_arg_decode_translation_table];
8075   if (! NILP (val))
8076     CHECK_CHAR_TABLE (val);
8077   CODING_ATTR_DECODE_TBL (attrs) = val;
8078
8079   val = args[coding_arg_encode_translation_table];
8080   if (! NILP (val))
8081     CHECK_CHAR_TABLE (val);
8082   CODING_ATTR_ENCODE_TBL (attrs) = val;
8083
8084   val = args[coding_arg_post_read_conversion];
8085   CHECK_SYMBOL (val);
8086   CODING_ATTR_POST_READ (attrs) = val;
8087
8088   val = args[coding_arg_pre_write_conversion];
8089   CHECK_SYMBOL (val);
8090   CODING_ATTR_PRE_WRITE (attrs) = val;
8091
8092   val = args[coding_arg_default_char];
8093   if (NILP (val))
8094     CODING_ATTR_DEFAULT_CHAR (attrs) = make_number (' ');
8095   else
8096     {
8097       CHECK_CHARACTER (val);
8098       CODING_ATTR_DEFAULT_CHAR (attrs) = val;
8099     }
8100
8101   val = args[coding_arg_for_unibyte];
8102   CODING_ATTR_FOR_UNIBYTE (attrs) = NILP (val) ? Qnil : Qt;
8103
8104   val = args[coding_arg_plist];
8105   CHECK_LIST (val);
8106   CODING_ATTR_PLIST (attrs) = val;
8107
8108   if (EQ (coding_type, Qcharset))
8109     {
8110       /* Generate a lisp vector of 256 elements.  Each element is nil,
8111          integer, or a list of charset IDs.
8112
8113          If Nth element is nil, the byte code N is invalid in this
8114          coding system.
8115
8116          If Nth element is a number NUM, N is the first byte of a
8117          charset whose ID is NUM.
8118
8119          If Nth element is a list of charset IDs, N is the first byte
8120          of one of them.  The list is sorted by dimensions of the
8121          charsets.  A charset of smaller dimension comes firtst. */
8122       Lisp_Object list;
8123       int maybe_ascii_compatible = 1;
8124
8125       for (list = Qnil, tail = charset_list; CONSP (tail); tail = XCDR (tail))
8126         {
8127           struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
8128
8129           if (charset->method == CHARSET_METHOD_SUPERSET)
8130             {
8131               val = CHARSET_SUPERSET (charset);
8132               for (; CONSP (val); val = XCDR (val))
8133                 list = Fcons (XCAR (XCAR (val)), list);
8134               maybe_ascii_compatible = 0;
8135             }
8136           else
8137             list = Fcons (XCAR (tail), list);
8138         }
8139
8140       val = Fmake_vector (make_number (256), Qnil);
8141
8142       for (tail = Fnreverse (list); CONSP (tail); tail = XCDR (tail))
8143         {
8144           struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
8145           int dim = CHARSET_DIMENSION (charset);
8146           int idx = (dim - 1) * 4;
8147
8148           if (CHARSET_ASCII_COMPATIBLE_P (charset)
8149               && maybe_ascii_compatible)
8150             CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8151
8152           for (i = charset->code_space[idx];
8153                i <= charset->code_space[idx + 1]; i++)
8154             {
8155               Lisp_Object tmp, tmp2;
8156               int dim2;
8157
8158               tmp = AREF (val, i);
8159               if (NILP (tmp))
8160                 tmp = XCAR (tail);
8161               else if (NUMBERP (tmp))
8162                 {
8163                   dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
8164                   if (dim < dim2)
8165                     tmp = Fcons (XCAR (tail), Fcons (tmp, Qnil));
8166                   else
8167                     tmp = Fcons (tmp, Fcons (XCAR (tail), Qnil));
8168                 }
8169               else
8170                 {
8171                   for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
8172                     {
8173                       dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
8174                       if (dim < dim2)
8175                         break;
8176                     }
8177                   if (NILP (tmp2))
8178                     tmp = nconc2 (tmp, Fcons (XCAR (tail), Qnil));
8179                   else
8180                     {
8181                       XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
8182                       XSETCAR (tmp2, XCAR (tail));
8183                     }
8184                 }
8185               ASET (val, i, tmp);
8186             }
8187         }
8188       ASET (attrs, coding_attr_charset_valids, val);
8189       category = coding_category_charset;
8190     }
8191   else if (EQ (coding_type, Qccl))
8192     {
8193       Lisp_Object valids;
8194
8195       if (nargs < coding_arg_ccl_max)
8196         goto short_args;
8197
8198       val = args[coding_arg_ccl_decoder];
8199       CHECK_CCL_PROGRAM (val);
8200       if (VECTORP (val))
8201         val = Fcopy_sequence (val);
8202       ASET (attrs, coding_attr_ccl_decoder, val);
8203
8204       val = args[coding_arg_ccl_encoder];
8205       CHECK_CCL_PROGRAM (val);
8206       if (VECTORP (val))
8207         val = Fcopy_sequence (val);
8208       ASET (attrs, coding_attr_ccl_encoder, val);
8209
8210       val = args[coding_arg_ccl_valids];
8211       valids = Fmake_string (make_number (256), make_number (0));
8212       for (tail = val; !NILP (tail); tail = Fcdr (tail))
8213         {
8214           int from, to;
8215
8216           val = Fcar (tail);
8217           if (INTEGERP (val))
8218             {
8219               from = to = XINT (val);
8220               if (from < 0 || from > 255)
8221                 args_out_of_range_3 (val, make_number (0), make_number (255));
8222             }
8223           else
8224             {
8225               CHECK_CONS (val);
8226               CHECK_NATNUM_CAR (val);
8227               CHECK_NATNUM_CDR (val);
8228               from = XINT (XCAR (val));
8229               if (from > 255)
8230                 args_out_of_range_3 (XCAR (val),
8231                                      make_number (0), make_number (255));
8232               to = XINT (XCDR (val));
8233               if (to < from || to > 255)
8234                 args_out_of_range_3 (XCDR (val),
8235                                      XCAR (val), make_number (255));
8236             }
8237           for (i = from; i <= to; i++)
8238             SSET (valids, i, 1);
8239         }
8240       ASET (attrs, coding_attr_ccl_valids, valids);
8241
8242       category = coding_category_ccl;
8243     }
8244   else if (EQ (coding_type, Qutf_16))
8245     {
8246       Lisp_Object bom, endian;
8247
8248       CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
8249
8250       if (nargs < coding_arg_utf16_max)
8251         goto short_args;
8252
8253       bom = args[coding_arg_utf16_bom];
8254       if (! NILP (bom) && ! EQ (bom, Qt))
8255         {
8256           CHECK_CONS (bom);
8257           val = XCAR (bom);
8258           CHECK_CODING_SYSTEM (val);
8259           val = XCDR (bom);
8260           CHECK_CODING_SYSTEM (val);
8261         }
8262       ASET (attrs, coding_attr_utf_16_bom, bom);
8263
8264       endian = args[coding_arg_utf16_endian];
8265       CHECK_SYMBOL (endian);
8266       if (NILP (endian))
8267         endian = Qbig;
8268       else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
8269         error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
8270       ASET (attrs, coding_attr_utf_16_endian, endian);
8271
8272       category = (CONSP (bom)
8273                   ? coding_category_utf_16_auto
8274                   : NILP (bom)
8275                   ? (EQ (endian, Qbig)
8276                      ? coding_category_utf_16_be_nosig
8277                      : coding_category_utf_16_le_nosig)
8278                   : (EQ (endian, Qbig)
8279                      ? coding_category_utf_16_be
8280                      : coding_category_utf_16_le));
8281     }
8282   else if (EQ (coding_type, Qiso_2022))
8283     {
8284       Lisp_Object initial, reg_usage, request, flags;
8285       int i;
8286
8287       if (nargs < coding_arg_iso2022_max)
8288         goto short_args;
8289
8290       initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
8291       CHECK_VECTOR (initial);
8292       for (i = 0; i < 4; i++)
8293         {
8294           val = Faref (initial, make_number (i));
8295           if (! NILP (val))
8296             {
8297               struct charset *charset;
8298
8299               CHECK_CHARSET_GET_CHARSET (val, charset);
8300               ASET (initial, i, make_number (CHARSET_ID (charset)));
8301               if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
8302                 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8303             }
8304           else
8305             ASET (initial, i, make_number (-1));
8306         }
8307
8308       reg_usage = args[coding_arg_iso2022_reg_usage];
8309       CHECK_CONS (reg_usage);
8310       CHECK_NUMBER_CAR (reg_usage);
8311       CHECK_NUMBER_CDR (reg_usage);
8312
8313       request = Fcopy_sequence (args[coding_arg_iso2022_request]);
8314       for (tail = request; ! NILP (tail); tail = Fcdr (tail))
8315         {
8316           int id;
8317           Lisp_Object tmp;
8318
8319           val = Fcar (tail);
8320           CHECK_CONS (val);
8321           tmp = XCAR (val);
8322           CHECK_CHARSET_GET_ID (tmp, id);
8323           CHECK_NATNUM_CDR (val);
8324           if (XINT (XCDR (val)) >= 4)
8325             error ("Invalid graphic register number: %d", XINT (XCDR (val)));
8326           XSETCAR (val, make_number (id));
8327         }
8328
8329       flags = args[coding_arg_iso2022_flags];
8330       CHECK_NATNUM (flags);
8331       i = XINT (flags);
8332       if (EQ (args[coding_arg_charset_list], Qiso_2022))
8333         flags = make_number (i | CODING_ISO_FLAG_FULL_SUPPORT);
8334
8335       ASET (attrs, coding_attr_iso_initial, initial);
8336       ASET (attrs, coding_attr_iso_usage, reg_usage);
8337       ASET (attrs, coding_attr_iso_request, request);
8338       ASET (attrs, coding_attr_iso_flags, flags);
8339       setup_iso_safe_charsets (attrs);
8340
8341       if (i & CODING_ISO_FLAG_SEVEN_BITS)
8342         category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
8343                           | CODING_ISO_FLAG_SINGLE_SHIFT))
8344                     ? coding_category_iso_7_else
8345                     : EQ (args[coding_arg_charset_list], Qiso_2022)
8346                     ? coding_category_iso_7
8347                     : coding_category_iso_7_tight);
8348       else
8349         {
8350           int id = XINT (AREF (initial, 1));
8351
8352           category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
8353                        || EQ (args[coding_arg_charset_list], Qiso_2022)
8354                        || id < 0)
8355                       ? coding_category_iso_8_else
8356                       : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
8357                       ? coding_category_iso_8_1
8358                       : coding_category_iso_8_2);
8359         }
8360       if (category != coding_category_iso_8_1
8361           && category != coding_category_iso_8_2)
8362         CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
8363     }
8364   else if (EQ (coding_type, Qemacs_mule))
8365     {
8366       if (EQ (args[coding_arg_charset_list], Qemacs_mule))
8367         ASET (attrs, coding_attr_emacs_mule_full, Qt);
8368       CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8369       category = coding_category_emacs_mule;
8370     }
8371   else if (EQ (coding_type, Qshift_jis))
8372     {
8373
8374       struct charset *charset;
8375
8376       if (XINT (Flength (charset_list)) != 3)
8377         error ("There should be just three charsets");
8378
8379       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8380       if (CHARSET_DIMENSION (charset) != 1)
8381         error ("Dimension of charset %s is not one",
8382                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8383       if (CHARSET_ASCII_COMPATIBLE_P (charset))
8384         CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8385
8386       charset_list = XCDR (charset_list);
8387       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8388       if (CHARSET_DIMENSION (charset) != 1)
8389         error ("Dimension of charset %s is not one",
8390                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8391
8392       charset_list = XCDR (charset_list);
8393       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8394       if (CHARSET_DIMENSION (charset) != 2)
8395         error ("Dimension of charset %s is not two",
8396                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8397
8398       category = coding_category_sjis;
8399       Vsjis_coding_system = name;
8400     }
8401   else if (EQ (coding_type, Qbig5))
8402     {
8403       struct charset *charset;
8404
8405       if (XINT (Flength (charset_list)) != 2)
8406         error ("There should be just two charsets");
8407
8408       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8409       if (CHARSET_DIMENSION (charset) != 1)
8410         error ("Dimension of charset %s is not one",
8411                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8412       if (CHARSET_ASCII_COMPATIBLE_P (charset))
8413         CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8414
8415       charset_list = XCDR (charset_list);
8416       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8417       if (CHARSET_DIMENSION (charset) != 2)
8418         error ("Dimension of charset %s is not two",
8419                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8420
8421       category = coding_category_big5;
8422       Vbig5_coding_system = name;
8423     }
8424   else if (EQ (coding_type, Qraw_text))
8425     {
8426       category = coding_category_raw_text;
8427       CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8428     }
8429   else if (EQ (coding_type, Qutf_8))
8430     {
8431       category = coding_category_utf_8;
8432       CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8433     }
8434   else if (EQ (coding_type, Qundecided))
8435     category = coding_category_undecided;
8436   else
8437     error ("Invalid coding system type: %s",
8438            SDATA (SYMBOL_NAME (coding_type)));
8439
8440   CODING_ATTR_CATEGORY (attrs) = make_number (category);
8441   CODING_ATTR_PLIST (attrs)
8442     = Fcons (QCcategory, Fcons (AREF (Vcoding_category_table, category),
8443                                 CODING_ATTR_PLIST (attrs)));
8444
8445   eol_type = args[coding_arg_eol_type];
8446   if (! NILP (eol_type)
8447       && ! EQ (eol_type, Qunix)
8448       && ! EQ (eol_type, Qdos)
8449       && ! EQ (eol_type, Qmac))
8450     error ("Invalid eol-type");
8451
8452   aliases = Fcons (name, Qnil);
8453
8454   if (NILP (eol_type))
8455     {
8456       eol_type = make_subsidiaries (name);
8457       for (i = 0; i < 3; i++)
8458         {
8459           Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
8460
8461           this_name = AREF (eol_type, i);
8462           this_aliases = Fcons (this_name, Qnil);
8463           this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
8464           this_spec = Fmake_vector (make_number (3), attrs);
8465           ASET (this_spec, 1, this_aliases);
8466           ASET (this_spec, 2, this_eol_type);
8467           Fputhash (this_name, this_spec, Vcoding_system_hash_table);
8468           Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
8469           Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
8470                                         Vcoding_system_alist);
8471         }
8472     }
8473
8474   spec_vec = Fmake_vector (make_number (3), attrs);
8475   ASET (spec_vec, 1, aliases);
8476   ASET (spec_vec, 2, eol_type);
8477
8478   Fputhash (name, spec_vec, Vcoding_system_hash_table);
8479   Vcoding_system_list = Fcons (name, Vcoding_system_list);
8480   Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
8481                                 Vcoding_system_alist);
8482
8483   {
8484     int id = coding_categories[category].id;
8485
8486     if (id < 0 || EQ (name, CODING_ID_NAME (id)))
8487       setup_coding_system (name, &coding_categories[category]);
8488   }
8489
8490   return Qnil;
8491
8492  short_args:
8493   return Fsignal (Qwrong_number_of_arguments,
8494                   Fcons (intern ("define-coding-system-internal"),
8495                          make_number (nargs)));
8496 }
8497
8498
8499 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
8500        Sdefine_coding_system_alias, 2, 2, 0,
8501        doc: /* Define ALIAS as an alias for CODING-SYSTEM.  */)
8502      (alias, coding_system)
8503      Lisp_Object alias, coding_system;
8504 {
8505   Lisp_Object spec, aliases, eol_type;
8506
8507   CHECK_SYMBOL (alias);
8508   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
8509   aliases = AREF (spec, 1);
8510   /* ALISES should be a list of length more than zero, and the first
8511      element is a base coding system.  Append ALIAS at the tail of the
8512      list.  */
8513   while (!NILP (XCDR (aliases)))
8514     aliases = XCDR (aliases);
8515   XSETCDR (aliases, Fcons (alias, Qnil));
8516
8517   eol_type = AREF (spec, 2);
8518   if (VECTORP (eol_type))
8519     {
8520       Lisp_Object subsidiaries;
8521       int i;
8522
8523       subsidiaries = make_subsidiaries (alias);
8524       for (i = 0; i < 3; i++)
8525         Fdefine_coding_system_alias (AREF (subsidiaries, i),
8526                                      AREF (eol_type, i));
8527     }
8528
8529   Fputhash (alias, spec, Vcoding_system_hash_table);
8530   Vcoding_system_list = Fcons (alias, Vcoding_system_list);
8531   Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
8532                                 Vcoding_system_alist);
8533
8534   return Qnil;
8535 }
8536
8537 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
8538        1, 1, 0,
8539        doc: /* Return the base of CODING-SYSTEM.
8540 Any alias or subsidiary coding system is not a base coding system.  */)
8541   (coding_system)
8542      Lisp_Object coding_system;
8543 {
8544   Lisp_Object spec, attrs;
8545
8546   if (NILP (coding_system))
8547     return (Qno_conversion);
8548   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
8549   attrs = AREF (spec, 0);
8550   return CODING_ATTR_BASE_NAME (attrs);
8551 }
8552
8553 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
8554        1, 1, 0,
8555        doc: "Return the property list of CODING-SYSTEM.")
8556      (coding_system)
8557      Lisp_Object coding_system;
8558 {
8559   Lisp_Object spec, attrs;
8560
8561   if (NILP (coding_system))
8562     coding_system = Qno_conversion;
8563   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
8564   attrs = AREF (spec, 0);
8565   return CODING_ATTR_PLIST (attrs);
8566 }
8567
8568
8569 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
8570        1, 1, 0,
8571        doc: /* Return the list of aliases of CODING-SYSTEM.  */)
8572      (coding_system)
8573      Lisp_Object coding_system;
8574 {
8575   Lisp_Object spec;
8576
8577   if (NILP (coding_system))
8578     coding_system = Qno_conversion;
8579   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
8580   return AREF (spec, 1);
8581 }
8582
8583 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
8584        Scoding_system_eol_type, 1, 1, 0,
8585        doc: /* Return eol-type of CODING-SYSTEM.
8586 An eol-type is integer 0, 1, 2, or a vector of coding systems.
8587
8588 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
8589 and CR respectively.
8590
8591 A vector value indicates that a format of end-of-line should be
8592 detected automatically.  Nth element of the vector is the subsidiary
8593 coding system whose eol-type is N.  */)
8594      (coding_system)
8595      Lisp_Object coding_system;
8596 {
8597   Lisp_Object spec, eol_type;
8598   int n;
8599
8600   if (NILP (coding_system))
8601     coding_system = Qno_conversion;
8602   if (! CODING_SYSTEM_P (coding_system))
8603     return Qnil;
8604   spec = CODING_SYSTEM_SPEC (coding_system);
8605   eol_type = AREF (spec, 2);
8606   if (VECTORP (eol_type))
8607     return Fcopy_sequence (eol_type);
8608   n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
8609   return make_number (n);
8610 }
8611
8612 #endif /* emacs */
8613
8614 \f
8615 /*** 9. Post-amble ***/
8616
8617 void
8618 init_coding_once ()
8619 {
8620   int i;
8621
8622   for (i = 0; i < coding_category_max; i++)
8623     {
8624       coding_categories[i].id = -1;
8625       coding_priorities[i] = i;
8626     }
8627
8628   /* ISO2022 specific initialize routine.  */
8629   for (i = 0; i < 0x20; i++)
8630     iso_code_class[i] = ISO_control_0;
8631   for (i = 0x21; i < 0x7F; i++)
8632     iso_code_class[i] = ISO_graphic_plane_0;
8633   for (i = 0x80; i < 0xA0; i++)
8634     iso_code_class[i] = ISO_control_1;
8635   for (i = 0xA1; i < 0xFF; i++)
8636     iso_code_class[i] = ISO_graphic_plane_1;
8637   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
8638   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
8639   iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
8640   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
8641   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
8642   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
8643   iso_code_class[ISO_CODE_ESC] = ISO_escape;
8644   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
8645   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
8646   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
8647
8648   for (i = 0; i < 256; i++)
8649     {
8650       emacs_mule_bytes[i] = 1;
8651     }
8652   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
8653   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
8654   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
8655   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
8656 }
8657
8658 #ifdef emacs
8659
8660 void
8661 syms_of_coding ()
8662 {
8663   staticpro (&Vcoding_system_hash_table);
8664   {
8665     Lisp_Object args[2];
8666     args[0] = QCtest;
8667     args[1] = Qeq;
8668     Vcoding_system_hash_table = Fmake_hash_table (2, args);
8669   }
8670
8671   staticpro (&Vsjis_coding_system);
8672   Vsjis_coding_system = Qnil;
8673
8674   staticpro (&Vbig5_coding_system);
8675   Vbig5_coding_system = Qnil;
8676
8677   staticpro (&Vcode_conversion_work_buf_list);
8678   Vcode_conversion_work_buf_list = Qnil;
8679
8680   staticpro (&Vcode_conversion_reused_work_buf);
8681   Vcode_conversion_reused_work_buf = Qnil;
8682
8683   DEFSYM (Qcharset, "charset");
8684   DEFSYM (Qtarget_idx, "target-idx");
8685   DEFSYM (Qcoding_system_history, "coding-system-history");
8686   Fset (Qcoding_system_history, Qnil);
8687
8688   /* Target FILENAME is the first argument.  */
8689   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
8690   /* Target FILENAME is the third argument.  */
8691   Fput (Qwrite_region, Qtarget_idx, make_number (2));
8692
8693   DEFSYM (Qcall_process, "call-process");
8694   /* Target PROGRAM is the first argument.  */
8695   Fput (Qcall_process, Qtarget_idx, make_number (0));
8696
8697   DEFSYM (Qcall_process_region, "call-process-region");
8698   /* Target PROGRAM is the third argument.  */
8699   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
8700
8701   DEFSYM (Qstart_process, "start-process");
8702   /* Target PROGRAM is the third argument.  */
8703   Fput (Qstart_process, Qtarget_idx, make_number (2));
8704
8705   DEFSYM (Qopen_network_stream, "open-network-stream");
8706   /* Target SERVICE is the fourth argument.  */
8707   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
8708
8709   DEFSYM (Qcoding_system, "coding-system");
8710   DEFSYM (Qcoding_aliases, "coding-aliases");
8711
8712   DEFSYM (Qeol_type, "eol-type");
8713   DEFSYM (Qunix, "unix");
8714   DEFSYM (Qdos, "dos");
8715
8716   DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
8717   DEFSYM (Qpost_read_conversion, "post-read-conversion");
8718   DEFSYM (Qpre_write_conversion, "pre-write-conversion");
8719   DEFSYM (Qdefault_char, "default-char");
8720   DEFSYM (Qundecided, "undecided");
8721   DEFSYM (Qno_conversion, "no-conversion");
8722   DEFSYM (Qraw_text, "raw-text");
8723
8724   DEFSYM (Qiso_2022, "iso-2022");
8725
8726   DEFSYM (Qutf_8, "utf-8");
8727   DEFSYM (Qutf_8_emacs, "utf-8-emacs");
8728
8729   DEFSYM (Qutf_16, "utf-16");
8730   DEFSYM (Qbig, "big");
8731   DEFSYM (Qlittle, "little");
8732
8733   DEFSYM (Qshift_jis, "shift-jis");
8734   DEFSYM (Qbig5, "big5");
8735
8736   DEFSYM (Qcoding_system_p, "coding-system-p");
8737
8738   DEFSYM (Qcoding_system_error, "coding-system-error");
8739   Fput (Qcoding_system_error, Qerror_conditions,
8740         Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
8741   Fput (Qcoding_system_error, Qerror_message,
8742         build_string ("Invalid coding system"));
8743
8744   /* Intern this now in case it isn't already done.
8745      Setting this variable twice is harmless.
8746      But don't staticpro it here--that is done in alloc.c.  */
8747   Qchar_table_extra_slots = intern ("char-table-extra-slots");
8748
8749   DEFSYM (Qtranslation_table, "translation-table");
8750   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (1));
8751   DEFSYM (Qtranslation_table_id, "translation-table-id");
8752   DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
8753   DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
8754
8755   DEFSYM (Qvalid_codes, "valid-codes");
8756
8757   DEFSYM (Qemacs_mule, "emacs-mule");
8758
8759   DEFSYM (QCcategory, ":category");
8760
8761   Vcoding_category_table
8762     = Fmake_vector (make_number (coding_category_max), Qnil);
8763   staticpro (&Vcoding_category_table);
8764   /* Followings are target of code detection.  */
8765   ASET (Vcoding_category_table, coding_category_iso_7,
8766         intern ("coding-category-iso-7"));
8767   ASET (Vcoding_category_table, coding_category_iso_7_tight,
8768         intern ("coding-category-iso-7-tight"));
8769   ASET (Vcoding_category_table, coding_category_iso_8_1,
8770         intern ("coding-category-iso-8-1"));
8771   ASET (Vcoding_category_table, coding_category_iso_8_2,
8772         intern ("coding-category-iso-8-2"));
8773   ASET (Vcoding_category_table, coding_category_iso_7_else,
8774         intern ("coding-category-iso-7-else"));
8775   ASET (Vcoding_category_table, coding_category_iso_8_else,
8776         intern ("coding-category-iso-8-else"));
8777   ASET (Vcoding_category_table, coding_category_utf_8,
8778         intern ("coding-category-utf-8"));
8779   ASET (Vcoding_category_table, coding_category_utf_16_be,
8780         intern ("coding-category-utf-16-be"));
8781   ASET (Vcoding_category_table, coding_category_utf_16_auto,
8782         intern ("coding-category-utf-16-auto"));
8783   ASET (Vcoding_category_table, coding_category_utf_16_le,
8784         intern ("coding-category-utf-16-le"));
8785   ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
8786         intern ("coding-category-utf-16-be-nosig"));
8787   ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
8788         intern ("coding-category-utf-16-le-nosig"));
8789   ASET (Vcoding_category_table, coding_category_charset,
8790         intern ("coding-category-charset"));
8791   ASET (Vcoding_category_table, coding_category_sjis,
8792         intern ("coding-category-sjis"));
8793   ASET (Vcoding_category_table, coding_category_big5,
8794         intern ("coding-category-big5"));
8795   ASET (Vcoding_category_table, coding_category_ccl,
8796         intern ("coding-category-ccl"));
8797   ASET (Vcoding_category_table, coding_category_emacs_mule,
8798         intern ("coding-category-emacs-mule"));
8799   /* Followings are NOT target of code detection.  */
8800   ASET (Vcoding_category_table, coding_category_raw_text,
8801         intern ("coding-category-raw-text"));
8802   ASET (Vcoding_category_table, coding_category_undecided,
8803         intern ("coding-category-undecided"));
8804
8805   defsubr (&Scoding_system_p);
8806   defsubr (&Sread_coding_system);
8807   defsubr (&Sread_non_nil_coding_system);
8808   defsubr (&Scheck_coding_system);
8809   defsubr (&Sdetect_coding_region);
8810   defsubr (&Sdetect_coding_string);
8811   defsubr (&Sfind_coding_systems_region_internal);
8812   defsubr (&Sunencodable_char_position);
8813   defsubr (&Scheck_coding_systems_region);
8814   defsubr (&Sdecode_coding_region);
8815   defsubr (&Sencode_coding_region);
8816   defsubr (&Sdecode_coding_string);
8817   defsubr (&Sencode_coding_string);
8818   defsubr (&Sdecode_sjis_char);
8819   defsubr (&Sencode_sjis_char);
8820   defsubr (&Sdecode_big5_char);
8821   defsubr (&Sencode_big5_char);
8822   defsubr (&Sset_terminal_coding_system_internal);
8823   defsubr (&Sset_safe_terminal_coding_system_internal);
8824   defsubr (&Sterminal_coding_system);
8825   defsubr (&Sset_keyboard_coding_system_internal);
8826   defsubr (&Skeyboard_coding_system);
8827   defsubr (&Sfind_operation_coding_system);
8828   defsubr (&Sset_coding_system_priority);
8829   defsubr (&Sdefine_coding_system_internal);
8830   defsubr (&Sdefine_coding_system_alias);
8831   defsubr (&Scoding_system_base);
8832   defsubr (&Scoding_system_plist);
8833   defsubr (&Scoding_system_aliases);
8834   defsubr (&Scoding_system_eol_type);
8835   defsubr (&Scoding_system_priority_list);
8836
8837   DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
8838                doc: /* List of coding systems.
8839
8840 Do not alter the value of this variable manually.  This variable should be
8841 updated by the functions `define-coding-system' and
8842 `define-coding-system-alias'.  */);
8843   Vcoding_system_list = Qnil;
8844
8845   DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
8846                doc: /* Alist of coding system names.
8847 Each element is one element list of coding system name.
8848 This variable is given to `completing-read' as TABLE argument.
8849
8850 Do not alter the value of this variable manually.  This variable should be
8851 updated by the functions `make-coding-system' and
8852 `define-coding-system-alias'.  */);
8853   Vcoding_system_alist = Qnil;
8854
8855   DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
8856                doc: /* List of coding-categories (symbols) ordered by priority.
8857
8858 On detecting a coding system, Emacs tries code detection algorithms
8859 associated with each coding-category one by one in this order.  When
8860 one algorithm agrees with a byte sequence of source text, the coding
8861 system bound to the corresponding coding-category is selected.  */);
8862   {
8863     int i;
8864
8865     Vcoding_category_list = Qnil;
8866     for (i = coding_category_max - 1; i >= 0; i--)
8867       Vcoding_category_list
8868         = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
8869                  Vcoding_category_list);
8870   }
8871
8872   DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
8873                doc: /* Specify the coding system for read operations.
8874 It is useful to bind this variable with `let', but do not set it globally.
8875 If the value is a coding system, it is used for decoding on read operation.
8876 If not, an appropriate element is used from one of the coding system alists:
8877 There are three such tables, `file-coding-system-alist',
8878 `process-coding-system-alist', and `network-coding-system-alist'.  */);
8879   Vcoding_system_for_read = Qnil;
8880
8881   DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
8882                doc: /* Specify the coding system for write operations.
8883 Programs bind this variable with `let', but you should not set it globally.
8884 If the value is a coding system, it is used for encoding of output,
8885 when writing it to a file and when sending it to a file or subprocess.
8886
8887 If this does not specify a coding system, an appropriate element
8888 is used from one of the coding system alists:
8889 There are three such tables, `file-coding-system-alist',
8890 `process-coding-system-alist', and `network-coding-system-alist'.
8891 For output to files, if the above procedure does not specify a coding system,
8892 the value of `buffer-file-coding-system' is used.  */);
8893   Vcoding_system_for_write = Qnil;
8894
8895   DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
8896                doc: /*
8897 Coding system used in the latest file or process I/O.  */);
8898   Vlast_coding_system_used = Qnil;
8899
8900   DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
8901                doc: /*
8902 *Non-nil means always inhibit code conversion of end-of-line format.
8903 See info node `Coding Systems' and info node `Text and Binary' concerning
8904 such conversion.  */);
8905   inhibit_eol_conversion = 0;
8906
8907   DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
8908                doc: /*
8909 Non-nil means process buffer inherits coding system of process output.
8910 Bind it to t if the process output is to be treated as if it were a file
8911 read from some filesystem.  */);
8912   inherit_process_coding_system = 0;
8913
8914   DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
8915                doc: /*
8916 Alist to decide a coding system to use for a file I/O operation.
8917 The format is ((PATTERN . VAL) ...),
8918 where PATTERN is a regular expression matching a file name,
8919 VAL is a coding system, a cons of coding systems, or a function symbol.
8920 If VAL is a coding system, it is used for both decoding and encoding
8921 the file contents.
8922 If VAL is a cons of coding systems, the car part is used for decoding,
8923 and the cdr part is used for encoding.
8924 If VAL is a function symbol, the function must return a coding system
8925 or a cons of coding systems which are used as above.  The function gets
8926 the arguments with which `find-operation-coding-systems' was called.
8927
8928 See also the function `find-operation-coding-system'
8929 and the variable `auto-coding-alist'.  */);
8930   Vfile_coding_system_alist = Qnil;
8931
8932   DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
8933                doc: /*
8934 Alist to decide a coding system to use for a process I/O operation.
8935 The format is ((PATTERN . VAL) ...),
8936 where PATTERN is a regular expression matching a program name,
8937 VAL is a coding system, a cons of coding systems, or a function symbol.
8938 If VAL is a coding system, it is used for both decoding what received
8939 from the program and encoding what sent to the program.
8940 If VAL is a cons of coding systems, the car part is used for decoding,
8941 and the cdr part is used for encoding.
8942 If VAL is a function symbol, the function must return a coding system
8943 or a cons of coding systems which are used as above.
8944
8945 See also the function `find-operation-coding-system'.  */);
8946   Vprocess_coding_system_alist = Qnil;
8947
8948   DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
8949                doc: /*
8950 Alist to decide a coding system to use for a network I/O operation.
8951 The format is ((PATTERN . VAL) ...),
8952 where PATTERN is a regular expression matching a network service name
8953 or is a port number to connect to,
8954 VAL is a coding system, a cons of coding systems, or a function symbol.
8955 If VAL is a coding system, it is used for both decoding what received
8956 from the network stream and encoding what sent to the network stream.
8957 If VAL is a cons of coding systems, the car part is used for decoding,
8958 and the cdr part is used for encoding.
8959 If VAL is a function symbol, the function must return a coding system
8960 or a cons of coding systems which are used as above.
8961
8962 See also the function `find-operation-coding-system'.  */);
8963   Vnetwork_coding_system_alist = Qnil;
8964
8965   DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
8966                doc: /* Coding system to use with system messages.
8967 Also used for decoding keyboard input on X Window system.  */);
8968   Vlocale_coding_system = Qnil;
8969
8970   /* The eol mnemonics are reset in startup.el system-dependently.  */
8971   DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
8972                doc: /*
8973 *String displayed in mode line for UNIX-like (LF) end-of-line format.  */);
8974   eol_mnemonic_unix = build_string (":");
8975
8976   DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
8977                doc: /*
8978 *String displayed in mode line for DOS-like (CRLF) end-of-line format.  */);
8979   eol_mnemonic_dos = build_string ("\\");
8980
8981   DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
8982                doc: /*
8983 *String displayed in mode line for MAC-like (CR) end-of-line format.  */);
8984   eol_mnemonic_mac = build_string ("/");
8985
8986   DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
8987                doc: /*
8988 *String displayed in mode line when end-of-line format is not yet determined.  */);
8989   eol_mnemonic_undecided = build_string (":");
8990
8991   DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
8992                doc: /*
8993 *Non-nil enables character translation while encoding and decoding.  */);
8994   Venable_character_translation = Qt;
8995
8996   DEFVAR_LISP ("standard-translation-table-for-decode",
8997                &Vstandard_translation_table_for_decode,
8998                doc: /* Table for translating characters while decoding.  */);
8999   Vstandard_translation_table_for_decode = Qnil;
9000
9001   DEFVAR_LISP ("standard-translation-table-for-encode",
9002                &Vstandard_translation_table_for_encode,
9003                doc: /* Table for translating characters while encoding.  */);
9004   Vstandard_translation_table_for_encode = Qnil;
9005
9006   DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_table,
9007                doc: /* Alist of charsets vs revision numbers.
9008 While encoding, if a charset (car part of an element) is found,
9009 designate it with the escape sequence identifying revision (cdr part
9010 of the element).  */);
9011   Vcharset_revision_table = Qnil;
9012
9013   DEFVAR_LISP ("default-process-coding-system",
9014                &Vdefault_process_coding_system,
9015                doc: /* Cons of coding systems used for process I/O by default.
9016 The car part is used for decoding a process output,
9017 the cdr part is used for encoding a text to be sent to a process.  */);
9018   Vdefault_process_coding_system = Qnil;
9019
9020   DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
9021                doc: /*
9022 Table of extra Latin codes in the range 128..159 (inclusive).
9023 This is a vector of length 256.
9024 If Nth element is non-nil, the existence of code N in a file
9025 \(or output of subprocess) doesn't prevent it to be detected as
9026 a coding system of ISO 2022 variant which has a flag
9027 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
9028 or reading output of a subprocess.
9029 Only 128th through 159th elements has a meaning.  */);
9030   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
9031
9032   DEFVAR_LISP ("select-safe-coding-system-function",
9033                &Vselect_safe_coding_system_function,
9034                doc: /*
9035 Function to call to select safe coding system for encoding a text.
9036
9037 If set, this function is called to force a user to select a proper
9038 coding system which can encode the text in the case that a default
9039 coding system used in each operation can't encode the text.
9040
9041 The default value is `select-safe-coding-system' (which see).  */);
9042   Vselect_safe_coding_system_function = Qnil;
9043
9044   DEFVAR_BOOL ("coding-system-require-warning",
9045                &coding_system_require_warning,
9046                doc: /* Internal use only.
9047 If non-nil, on writing a file, `select-safe-coding-system-function' is
9048 called even if `coding-system-for-write' is non-nil.  The command
9049 `universal-coding-system-argument' binds this variable to t temporarily.  */);
9050   coding_system_require_warning = 0;
9051
9052
9053   DEFVAR_BOOL ("inhibit-iso-escape-detection",
9054                &inhibit_iso_escape_detection,
9055                doc: /*
9056 If non-nil, Emacs ignores ISO2022's escape sequence on code detection.
9057
9058 By default, on reading a file, Emacs tries to detect how the text is
9059 encoded.  This code detection is sensitive to escape sequences.  If
9060 the sequence is valid as ISO2022, the code is determined as one of
9061 the ISO2022 encodings, and the file is decoded by the corresponding
9062 coding system (e.g. `iso-2022-7bit').
9063
9064 However, there may be a case that you want to read escape sequences in
9065 a file as is.  In such a case, you can set this variable to non-nil.
9066 Then, as the code detection ignores any escape sequences, no file is
9067 detected as encoded in some ISO2022 encoding.  The result is that all
9068 escape sequences become visible in a buffer.
9069
9070 The default value is nil, and it is strongly recommended not to change
9071 it.  That is because many Emacs Lisp source files that contain
9072 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
9073 in Emacs's distribution, and they won't be decoded correctly on
9074 reading if you suppress escape sequence detection.
9075
9076 The other way to read escape sequences in a file without decoding is
9077 to explicitly specify some coding system that doesn't use ISO2022's
9078 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument].  */);
9079   inhibit_iso_escape_detection = 0;
9080
9081   DEFVAR_LISP ("translation-table-for-input", &Vtranslation_table_for_input,
9082                doc: /* Char table for translating self-inserting characters.
9083 This is applied to the result of input methods, not their input.  See also
9084 `keyboard-translate-table'.  */);
9085     Vtranslation_table_for_input = Qnil;
9086
9087   {
9088     Lisp_Object args[coding_arg_max];
9089     Lisp_Object plist[16];
9090     int i;
9091
9092     for (i = 0; i < coding_arg_max; i++)
9093       args[i] = Qnil;
9094
9095     plist[0] = intern (":name");
9096     plist[1] = args[coding_arg_name] = Qno_conversion;
9097     plist[2] = intern (":mnemonic");
9098     plist[3] = args[coding_arg_mnemonic] = make_number ('=');
9099     plist[4] = intern (":coding-type");
9100     plist[5] = args[coding_arg_coding_type] = Qraw_text;
9101     plist[6] = intern (":ascii-compatible-p");
9102     plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
9103     plist[8] = intern (":default-char");
9104     plist[9] = args[coding_arg_default_char] = make_number (0);
9105     plist[10] = intern (":for-unibyte");
9106     plist[11] = args[coding_arg_for_unibyte] = Qt;
9107     plist[12] = intern (":docstring");
9108     plist[13] = build_string ("Do no conversion.\n\
9109 \n\
9110 When you visit a file with this coding, the file is read into a\n\
9111 unibyte buffer as is, thus each byte of a file is treated as a\n\
9112 character.");
9113     plist[14] = intern (":eol-type");
9114     plist[15] = args[coding_arg_eol_type] = Qunix;
9115     args[coding_arg_plist] = Flist (16, plist);
9116     Fdefine_coding_system_internal (coding_arg_max, args);
9117   }
9118
9119   setup_coding_system (Qno_conversion, &keyboard_coding);
9120   setup_coding_system (Qno_conversion, &terminal_coding);
9121   setup_coding_system (Qno_conversion, &safe_terminal_coding);
9122
9123   {
9124     int i;
9125
9126     for (i = 0; i < coding_category_max; i++)
9127       Fset (AREF (Vcoding_category_table, i), Qno_conversion);
9128   }
9129 }
9130
9131 char *
9132 emacs_strerror (error_number)
9133      int error_number;
9134 {
9135   char *str;
9136
9137   synchronize_system_messages_locale ();
9138   str = strerror (error_number);
9139
9140   if (! NILP (Vlocale_coding_system))
9141     {
9142       Lisp_Object dec = code_convert_string_norecord (build_string (str),
9143                                                       Vlocale_coding_system,
9144                                                       0);
9145       str = (char *) SDATA (dec);
9146     }
9147
9148   return str;
9149 }
9150
9151 #endif /* emacs */