src/coding.c

   1 /* Coding system handler (conversion, detection, etc).
   2    Copyright (C) 2001, 2002, 2003, 2004, 2005,
   3                  2006, 2007 Free Software Foundation, Inc.
   4    Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
   5      2005, 2006, 2007
   6      National Institute of Advanced Industrial Science and Technology (AIST)
   7      Registration Number H14PRO021
   8    Copyright (C) 2003
   9      National Institute of Advanced Industrial Science and Technology (AIST)
  10      Registration Number H13PRO009
  11
  12 This file is part of GNU Emacs.
  13
  14 GNU Emacs is free software; you can redistribute it and/or modify
  15 it under the terms of the GNU General Public License as published by
  16 the Free Software Foundation; either version 3, or (at your option)
  17 any later version.
  18
  19 GNU Emacs is distributed in the hope that it will be useful,
  20 but WITHOUT ANY WARRANTY; without even the implied warranty of
  21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  22 GNU General Public License for more details.
  23
  24 You should have received a copy of the GNU General Public License
  25 along with GNU Emacs; see the file COPYING.  If not, write to
  26 the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
  27 Boston, MA 02110-1301, USA.  */
  28
  29 /*** TABLE OF CONTENTS ***
  30
  31   0. General comments
  32   1. Preamble
  33   2. Emacs' internal format (emacs-utf-8) handlers
  34   3. UTF-8 handlers
  35   4. UTF-16 handlers
  36   5. Charset-base coding systems handlers
  37   6. emacs-mule (old Emacs' internal format) handlers
  38   7. ISO2022 handlers
  39   8. Shift-JIS and BIG5 handlers
  40   9. CCL handlers
  41   10. C library functions
  42   11. Emacs Lisp library functions
  43   12. Postamble
  44
  45 */
  46
  47 /*** 0. General comments ***
  48
  49
  50 CODING SYSTEM
  51
  52   A coding system is an object for an encoding mechanism that contains
  53   information about how to convert byte sequences to character
  54   sequences and vice versa.  When we say "decode", it means converting
  55   a byte sequence of a specific coding system into a character
  56   sequence that is represented by Emacs' internal coding system
  57   `emacs-utf-8', and when we say "encode", it means converting a
  58   character sequence of emacs-utf-8 to a byte sequence of a specific
  59   coding system.
  60
  61   In Emacs Lisp, a coding system is represented by a Lisp symbol.  At
  62   the C level, a coding system is represented by a vector of attributes
  63   stored in the hash table Vcoding_system_hash_table.  The conversion
  64   from a coding system symbol to its attributes vector is done by
  65   looking up Vcoding_system_hash_table by the symbol.
  66
  67   Coding systems are classified into the following types depending on
  68   the encoding mechanism.  Here's a brief description of the types.
  69
  70   o UTF-8
  71
  72   o UTF-16
  73
  74   o Charset-base coding system
  75
  76   A coding system defined by one or more (coded) character sets.
  77   Decoding and encoding are done by a code converter defined for each
  78   character set.
  79
  80   o Old Emacs internal format (emacs-mule)
  81
  82   The coding system adopted by old versions of Emacs (20 and 21).
  83
  84   o ISO2022-base coding system
  85
  86   The most famous coding system for multiple character sets.  X's
  87   Compound Text, various EUCs (Extended Unix Code), and coding systems
  88   used in the Internet communication such as ISO-2022-JP are all
  89   variants of ISO2022.
  90
  91   o SJIS (or Shift-JIS or MS-Kanji-Code)
  92
  93   A coding system to encode character sets: ASCII, JISX0201, and
  94   JISX0208.  Widely used for PC's in Japan.  Details are described in
  95   section 8.
  96
  97   o BIG5
  98
  99   A coding system to encode character sets: ASCII and Big5.  Widely
 100   used for Chinese (mainly in Taiwan and Hong Kong).  Details are
 101   described in section 8.  In this file, when we write "big5" (all
 102   lowercase), we mean the coding system, and when we write "Big5"
 103   (capitalized), we mean the character set.
 104
 105   o CCL
 106
 107   If a user wants to decode/encode text encoded in a coding system
 108   not listed above, he can supply a decoder and an encoder for it in
 109   CCL (Code Conversion Language) programs.  Emacs executes the CCL
 110   program while decoding/encoding.
 111
 112   o Raw-text
 113
 114   A coding system for text containing raw eight-bit data.  Emacs
 115   treats each byte of source text as a character (except for
 116   end-of-line conversion).
 117
 118   o No-conversion
 119
 120   Like raw-text, but does not do end-of-line conversion.
 121
 122
 123 END-OF-LINE FORMAT
 124
 125   How text end-of-line is encoded depends on operating system.  For
 126   instance, Unix's format is just one byte of LF (line-feed) code,
 127   whereas DOS's format is two-byte sequence of `carriage-return' and
 128   `line-feed' codes.  MacOS's format is usually one byte of
 129   `carriage-return'.
 130
 131   Since text character encoding and end-of-line encoding are
 132   independent, any coding system described above can take any format
 133   of end-of-line (except for no-conversion).
 134
 135 STRUCT CODING_SYSTEM
 136
 137   Before using a coding system for code conversion (i.e. decoding and
 138   encoding), we setup a structure of type `struct coding_system'.
 139   This structure keeps various information about a specific code
 140   conversion (e.g. the location of source and destination data).
 141
 142 */
 143
 144 /* COMMON MACROS */
 145
 146
 147 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 148
 149   These functions check if a byte sequence specified as a source in
 150   CODING conforms to the format of XXX, and update the members of
 151   DETECT_INFO.
 152
 153   Return 1 if the byte sequence conforms to XXX, otherwise return 0.
 154
 155   Below is the template of these functions.  */
 156
 157 #if 0
 158 static int
 159 detect_coding_XXX (coding, detect_info)
 160      struct coding_system *coding;
 161      struct coding_detection_info *detect_info;
 162 {
 163   const unsigned char *src = coding->source, *src_base; /* ONE_MORE_BYTE reads SRC_BASE.  */
 164   const unsigned char *src_end = coding->source + coding->src_bytes;
 165   int multibytep = coding->src_multibyte;
 166   int consumed_chars = 0;       /* Incremented by ONE_MORE_BYTE.  */
 167   int found = 0;                /* Category bits to report on success.  */
 168   ...;
 169
 170   while (1)
 171     {
 172       src_base = src;
 173       /* Get one byte; jump to no_more_source: if the source is exhausted.  */
 174       ONE_MORE_BYTE (c);
 175
 176       if (! __C_conforms_to_XXX___ (c))
 177         break;
 178       if (__C_strongly_suggests_XXX__ (c))
 179         found = CATEGORY_MASK_XXX;
 180     }
 181   /* The byte sequence is invalid for XXX.  */
 182   detect_info->rejected |= CATEGORY_MASK_XXX;
 183   return 0;
 184
 185  no_more_source:
 186   /* The source was exhausted without meeting a non-conforming byte.  */
 187   detect_info->found |= found;
 188   return 1;
 189 }
 190 #endif
 191
 192 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 193
 194   These functions decode a byte sequence specified as a source by
 195   CODING.  The resulting multibyte text goes to a place pointed to by
 196   CODING->charbuf, the length of which should not exceed
 197   CODING->charbuf_size.
 198
 199   These functions set the information of original and decoded texts in
 200   CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
 201   They also set CODING->result to one of CODING_RESULT_XXX indicating
 202   how the decoding is finished.
 203
 204   Below is the template of these functions.  */
 205
 206 #if 0
 207 static void
 208 decode_coding_XXXX (coding)
 209      struct coding_system *coding;
 210 {
 211   const unsigned char *src = coding->source + coding->consumed;
 212   const unsigned char *src_end = coding->source + coding->src_bytes;
 213   /* SRC_BASE remembers the start position in source in each loop.
 214      The loop will be exited when there's not enough source code, or
 215      when there's no room in CHARBUF for a decoded character.  */
 216   const unsigned char *src_base;
 217   /* A buffer to produce decoded characters.  */
 218   int *charbuf = coding->charbuf + coding->charbuf_used;
 219   int *charbuf_end = coding->charbuf + coding->charbuf_size;
 220   int multibytep = coding->src_multibyte, consumed_chars = 0;
 221
 222   while (1)
 223     {
 224       src_base = src;
 225       if (charbuf >= charbuf_end)
 226         /* No more room to produce a decoded character.  */
 227         break;
 228       ONE_MORE_BYTE (c);
 229       /* Decode it. */
 230     }
 231
 232  no_more_source:
 233   if (src_base < src_end
 234       && coding->mode & CODING_MODE_LAST_BLOCK)
 235     /* If the source ends by partial bytes to construct a character,
 236        treat them as eight-bit raw data.  */
 237     while (src_base < src_end && charbuf < charbuf_end)
 238       *charbuf++ = *src_base++;
 239   /* Remember how many bytes and characters we consumed.  NOTE(review):
 240      for a multibyte source these differ, yet both get the byte count.  */
 241   coding->consumed = coding->consumed_char = src_base - coding->source;
 242   /* Remember how many characters we produced.  */
 243   coding->charbuf_used = charbuf - coding->charbuf;
 244 }
 245 #endif
 246
 247 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 248
 249   These functions encode SRC_BYTES length text at SOURCE of Emacs'
 250   internal multibyte format by CODING.  The resulting byte sequence
 251   goes to a place pointed to by DESTINATION, the length of which
 252   should not exceed DST_BYTES.
 253
 254   These functions set the information of original and encoded texts in
 255   the members produced, produced_char, consumed, and consumed_char of
 256   the structure *CODING.  They also set the member result to one of
 257   CODING_RESULT_XXX indicating how the encoding finished.
 258
 259   DST_BYTES zero means that the source area and destination area
 260   overlap, which means that we can produce encoded text only until it
 261   reaches the head of the not-yet-encoded source text.
 262
 263   Below is a template of these functions.  */
 264 #if 0
 265 static void
 266 encode_coding_XXX (coding)
 267      struct coding_system *coding;
 268 {
 269   int multibytep = coding->dst_multibyte;
 270   int *charbuf = coding->charbuf;
 271   int *charbuf_end = charbuf + coding->charbuf_used;
 272   unsigned char *dst = coding->destination + coding->produced;
 273   unsigned char *dst_end = coding->destination + coding->dst_bytes;
 274   unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
 275   int produced_chars = 0;       /* Incremented by the EMIT_XXX macros.  */
 276
 277   for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
 278     {
 279       int c = *charbuf;
 280       /* Encode C into DST, incrementing DST and PRODUCED_CHARS.  */
 281     }
 282  label_no_more_destination:
 283   /* How many chars and bytes we produced.  */
 284   coding->produced_char += produced_chars;
 285   coding->produced = dst - coding->destination;
 286 }
 287 #endif
 288
 289 \f
 290 /*** 1. Preamble ***/
 291
 292 #include <config.h>
 293 #include <stdio.h>
 294
 295 #include "lisp.h"
 296 #include "buffer.h"
 297 #include "character.h"
 298 #include "charset.h"
 299 #include "ccl.h"
 300 #include "composite.h"
 301 #include "coding.h"
 302 #include "window.h"
 303 #include "frame.h"
 304 #include "termhooks.h"
 305
 306 Lisp_Object Vcoding_system_hash_table;
 307
 308 Lisp_Object Qcoding_system, Qcoding_aliases, Qeol_type;
 309 Lisp_Object Qunix, Qdos;
 310 extern Lisp_Object Qmac;        /* frame.c */
 311 Lisp_Object Qbuffer_file_coding_system;
 312 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
 313 Lisp_Object Qdefault_char;
 314 Lisp_Object Qno_conversion, Qundecided;
 315 Lisp_Object Qcharset, Qiso_2022, Qutf_8, Qutf_16, Qshift_jis, Qbig5;
 316 Lisp_Object Qbig, Qlittle;
 317 Lisp_Object Qcoding_system_history;
 318 Lisp_Object Qvalid_codes;
 319 Lisp_Object QCcategory, QCmnemonic, QCdefalut_char; /* NOTE(review): "defalut" [sic] looks like a typo of "default"; confirm all uses before renaming.  */
 320 Lisp_Object QCdecode_translation_table, QCencode_translation_table;
 321 Lisp_Object QCpost_read_conversion, QCpre_write_conversion;
 322 Lisp_Object QCascii_compatible_p;
 323
 324 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
 325 Lisp_Object Qcall_process, Qcall_process_region;
 326 Lisp_Object Qstart_process, Qopen_network_stream;
 327 Lisp_Object Qtarget_idx;
 328
 329 Lisp_Object Qinsufficient_source, Qinconsistent_eol, Qinvalid_source;
 330 Lisp_Object Qinterrupted, Qinsufficient_memory;
 331
 332 /* If a symbol has this property, evaluate the value to define the
 333    symbol as a coding system.  */
 334 static Lisp_Object Qcoding_system_define_form;
 335
 336 int coding_system_require_warning;
 337
 338 Lisp_Object Vselect_safe_coding_system_function;
 339
 340 /* Mnemonic string for each format of end-of-line.  */
 341 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
 342 /* Mnemonic string to indicate format of end-of-line is not yet
 343    decided.  */
 344 Lisp_Object eol_mnemonic_undecided;
 345
 346 /* Format of end-of-line decided by system.  This is Qunix on
 347    Unix and Mac, Qdos on DOS/Windows.
 348    This has an effect only for external encoding (i.e. for output to
 349    file and process), not for in-buffer or Lisp string encoding.  */
 350 static Lisp_Object system_eol_type;
 351
 352 #ifdef emacs
 353
 354 Lisp_Object Vcoding_system_list, Vcoding_system_alist;
 355
 356 Lisp_Object Qcoding_system_p, Qcoding_system_error;
 357
 358 /* Coding system emacs-mule and raw-text are for converting only
 359    end-of-line format.  */
 360 Lisp_Object Qemacs_mule, Qraw_text;
 361 Lisp_Object Qutf_8_emacs;
 362
 363 /* Coding-systems are handed between Emacs Lisp programs and C internal
 364    routines by the following three variables.  */
 365 /* Coding-system for reading files and receiving data from process.  */
 366 Lisp_Object Vcoding_system_for_read;
 367 /* Coding-system for writing files and sending data to process.  */
 368 Lisp_Object Vcoding_system_for_write;
 369 /* Coding-system actually used in the latest I/O.  */
 370 Lisp_Object Vlast_coding_system_used;
 371 /* Set to non-nil when an error is detected while code conversion.  */
 372 Lisp_Object Vlast_code_conversion_error;
 373 /* A vector of length 256 which contains information about special
 374    Latin codes (especially for dealing with Microsoft codes).  */
 375 Lisp_Object Vlatin_extra_code_table;
 376
 377 /* Flag to inhibit code conversion of end-of-line format.  */
 378 int inhibit_eol_conversion;
 379
 380 /* Flag to inhibit ISO2022 escape sequence detection.  */
 381 int inhibit_iso_escape_detection;
 382
 383 /* Flag to make buffer-file-coding-system inherit from process-coding.  */
 384 int inherit_process_coding_system;
 385
 386 /* Coding system to be used to encode text for terminal display when
 387    terminal coding system is nil.  */
 388 struct coding_system safe_terminal_coding;
 389
 390 /* Default coding system to be used to write a file.  */
 391 struct coding_system default_buffer_file_coding;
 392
 393 Lisp_Object Vfile_coding_system_alist;
 394 Lisp_Object Vprocess_coding_system_alist;
 395 Lisp_Object Vnetwork_coding_system_alist;
 396
 397 Lisp_Object Vlocale_coding_system;
 398
 399 #endif /* emacs */
 400
 401 /* Flag to tell if we look up translation table on character code
 402    conversion.  */
 403 Lisp_Object Venable_character_translation;
 404 /* Standard translation table to look up on decoding (reading).  */
 405 Lisp_Object Vstandard_translation_table_for_decode;
 406 /* Standard translation table to look up on encoding (writing).  */
 407 Lisp_Object Vstandard_translation_table_for_encode;
 408
 409 Lisp_Object Qtranslation_table;
 410 Lisp_Object Qtranslation_table_id;
 411 Lisp_Object Qtranslation_table_for_decode;
 412 Lisp_Object Qtranslation_table_for_encode;
 413
 414 /* Alist of charsets vs revision number.  */
 415 static Lisp_Object Vcharset_revision_table;
 416
 417 /* Default coding systems used for process I/O.  */
 418 Lisp_Object Vdefault_process_coding_system;
 419
 420 /* Char table for translating Quail and self-inserting input.  */
 421 Lisp_Object Vtranslation_table_for_input;
 422
 423 /* Two special coding systems.  */
 424 Lisp_Object Vsjis_coding_system;
 425 Lisp_Object Vbig5_coding_system;
 426
 427 /* ISO2022 section */
 428 /* Integer at index REG of CODING's coding_attr_iso_initial attribute.  */
 429 #define CODING_ISO_INITIAL(coding, reg)                 \
 430   (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id),    \
 431                      coding_attr_iso_initial),          \
 432                reg)))
 433
 434 /* (coding)->safe_charsets[CHARSET_ID], or -1 if CHARSET_ID exceeds max_charset_id.  CHARSET_ID is evaluated twice.  */
 435 #define CODING_ISO_REQUEST(coding, charset_id)  \
 436   (((charset_id) <= (coding)->max_charset_id    \
 437     ? (coding)->safe_charsets[charset_id]       \
 438     : -1))
 439
 440 /* Accessors for the members of (coding)->spec.iso_2022.  */
 441 #define CODING_ISO_FLAGS(coding)        \
 442   ((coding)->spec.iso_2022.flags)
 443 #define CODING_ISO_DESIGNATION(coding, reg)     \
 444   ((coding)->spec.iso_2022.current_designation[reg])
 445 #define CODING_ISO_INVOCATION(coding, plane)    \
 446   ((coding)->spec.iso_2022.current_invocation[plane])
 447 #define CODING_ISO_SINGLE_SHIFTING(coding)      \
 448   ((coding)->spec.iso_2022.single_shifting)
 449 #define CODING_ISO_BOL(coding)  \
 450   ((coding)->spec.iso_2022.bol)
 451 #define CODING_ISO_INVOKED_CHARSET(coding, plane)       \
 452   CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
 453
 454 /* Control characters of ISO2022.  */
 455                         /* code */      /* function */
 456 #define ISO_CODE_LF     0x0A            /* line-feed */
 457 #define ISO_CODE_CR     0x0D            /* carriage-return */
 458 #define ISO_CODE_SO     0x0E            /* shift-out */
 459 #define ISO_CODE_SI     0x0F            /* shift-in */
 460 #define ISO_CODE_SS2_7  0x19            /* single-shift-2 for 7-bit code */
 461 #define ISO_CODE_ESC    0x1B            /* escape */
 462 #define ISO_CODE_SS2    0x8E            /* single-shift-2 */
 463 #define ISO_CODE_SS3    0x8F            /* single-shift-3 */
 464 #define ISO_CODE_CSI    0x9B            /* control-sequence-introducer */
 465
 466 /* Every 1-byte code of ISO2022 is classified into one of the
 467    following classes.  */
 468 enum iso_code_class_type
 469   {
 470     ISO_control_0,              /* Control codes in the range
 471                                    0x00..0x1F and 0x7F, except for the
 472                                    following 4 codes.  */
 473     ISO_shift_out,              /* ISO_CODE_SO (0x0E) */
 474     ISO_shift_in,               /* ISO_CODE_SI (0x0F) */
 475     ISO_single_shift_2_7,       /* ISO_CODE_SS2_7 (0x19) */
 476     ISO_escape,                 /* ISO_CODE_ESC (0x1B) */
 477     ISO_control_1,              /* Control codes in the range
 478                                    0x80..0x9F, except for the
 479                                    following 3 codes.  */
 480     ISO_single_shift_2,         /* ISO_CODE_SS2 (0x8E) */
 481     ISO_single_shift_3,         /* ISO_CODE_SS3 (0x8F) */
 482     ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
 483     ISO_0x20_or_0x7F,           /* Codes of the values 0x20 or 0x7F.  */
 484     ISO_graphic_plane_0,        /* Graphic codes in the range 0x21..0x7E.  */
 485     ISO_0xA0_or_0xFF,           /* Codes of the values 0xA0 or 0xFF.  */
 486     ISO_graphic_plane_1         /* Graphic codes in the range 0xA1..0xFE.  */
 487   };
 488
 489 /** The macros CODING_ISO_FLAG_XXX defines a flag bit of the
 490     `iso-flags' attribute of an iso2022 coding system.  */
 491
 492 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
 493    instead of the correct short-form sequence (e.g. ESC $ A).  */
 494 #define CODING_ISO_FLAG_LONG_FORM       0x0001
 495
 496 /* If set, reset graphic planes and registers at end-of-line to the
 497    initial state.  */
 498 #define CODING_ISO_FLAG_RESET_AT_EOL    0x0002
 499
 500 /* If set, reset graphic planes and registers before any control
 501    characters to the initial state.  */
 502 #define CODING_ISO_FLAG_RESET_AT_CNTL   0x0004
 503
 504 /* If set, encode by 7-bit environment.  */
 505 #define CODING_ISO_FLAG_SEVEN_BITS      0x0008
 506
 507 /* If set, use locking-shift function.  */
 508 #define CODING_ISO_FLAG_LOCKING_SHIFT   0x0010
 509
 510 /* If set, use single-shift function.  Overwrite
 511    CODING_ISO_FLAG_LOCKING_SHIFT.  */
 512 #define CODING_ISO_FLAG_SINGLE_SHIFT    0x0020
 513
 514 /* If set, use designation escape sequence.  */
 515 #define CODING_ISO_FLAG_DESIGNATION     0x0040
 516
 517 /* If set, produce revision number sequence.  */
 518 #define CODING_ISO_FLAG_REVISION        0x0080
 519
 520 /* If set, produce ISO6429's direction specifying sequence.  */
 521 #define CODING_ISO_FLAG_DIRECTION       0x0100
 522
 523 /* If set, assume designation states are reset at beginning of line on
 524    output.  */
 525 #define CODING_ISO_FLAG_INIT_AT_BOL     0x0200
 526
 527 /* If set, designation sequence should be placed at beginning of line
 528    on output.  */
 529 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
 530
 531 /* If set, do not encode unsafe characters on output.  */
 532 #define CODING_ISO_FLAG_SAFE            0x0800
 533
 534 /* If set, extra latin codes (128..159) are accepted as a valid code
 535    on input.  */
 536 #define CODING_ISO_FLAG_LATIN_EXTRA     0x1000
 537
 538 #define CODING_ISO_FLAG_COMPOSITION     0x2000
 539
 540 #define CODING_ISO_FLAG_EUC_TW_SHIFT    0x4000
 541
 542 #define CODING_ISO_FLAG_USE_ROMAN       0x8000
 543
 544 #define CODING_ISO_FLAG_USE_OLDJIS      0x10000
 545
 546 #define CODING_ISO_FLAG_FULL_SUPPORT    0x100000
 547
 548 /* A character to be produced on output if encoding of the original
 549    character is prohibited by CODING_ISO_FLAG_SAFE.  */
 550 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION  '?'
 551
 552
 553 /* UTF-16 section */
 554 #define CODING_UTF_16_BOM(coding)       \
 555   ((coding)->spec.utf_16.bom)
 556
 557 #define CODING_UTF_16_ENDIAN(coding)    \
 558   ((coding)->spec.utf_16.endian)
 559
 560 #define CODING_UTF_16_SURROGATE(coding) \
 561   ((coding)->spec.utf_16.surrogate)
 562
 563
 564 /* CCL section */
 565 #define CODING_CCL_DECODER(coding)      \
 566   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
 567 #define CODING_CCL_ENCODER(coding)      \
 568   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
 569 #define CODING_CCL_VALIDS(coding)                                          \
 570   (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
 571
 572 /* Index for each coding category in `coding_categories' */
 573
 574 enum coding_category
 575   {
 576     coding_category_iso_7,
 577     coding_category_iso_7_tight,
 578     coding_category_iso_8_1,
 579     coding_category_iso_8_2,
 580     coding_category_iso_7_else,
 581     coding_category_iso_8_else,
 582     coding_category_utf_8,
 583     coding_category_utf_16_auto,
 584     coding_category_utf_16_be,
 585     coding_category_utf_16_le,
 586     coding_category_utf_16_be_nosig,
 587     coding_category_utf_16_le_nosig,
 588     coding_category_charset,
 589     coding_category_sjis,
 590     coding_category_big5,
 591     coding_category_ccl,
 592     coding_category_emacs_mule,
 593     /* All above are targets of code detection.  */
 594     coding_category_raw_text,
 595     coding_category_undecided,
 596     coding_category_max
 597   };
 598
 599 /* Definitions of flag bits used in detect_coding_XXXX.  */
 600 #define CATEGORY_MASK_ISO_7             (1 << coding_category_iso_7)
 601 #define CATEGORY_MASK_ISO_7_TIGHT       (1 << coding_category_iso_7_tight)
 602 #define CATEGORY_MASK_ISO_8_1           (1 << coding_category_iso_8_1)
 603 #define CATEGORY_MASK_ISO_8_2           (1 << coding_category_iso_8_2)
 604 #define CATEGORY_MASK_ISO_7_ELSE        (1 << coding_category_iso_7_else)
 605 #define CATEGORY_MASK_ISO_8_ELSE        (1 << coding_category_iso_8_else)
 606 #define CATEGORY_MASK_UTF_8             (1 << coding_category_utf_8)
 607 #define CATEGORY_MASK_UTF_16_AUTO       (1 << coding_category_utf_16_auto)
 608 #define CATEGORY_MASK_UTF_16_BE         (1 << coding_category_utf_16_be)
 609 #define CATEGORY_MASK_UTF_16_LE         (1 << coding_category_utf_16_le)
 610 #define CATEGORY_MASK_UTF_16_BE_NOSIG   (1 << coding_category_utf_16_be_nosig)
 611 #define CATEGORY_MASK_UTF_16_LE_NOSIG   (1 << coding_category_utf_16_le_nosig)
 612 #define CATEGORY_MASK_CHARSET           (1 << coding_category_charset)
 613 #define CATEGORY_MASK_SJIS              (1 << coding_category_sjis)
 614 #define CATEGORY_MASK_BIG5              (1 << coding_category_big5)
 615 #define CATEGORY_MASK_CCL               (1 << coding_category_ccl)
 616 #define CATEGORY_MASK_EMACS_MULE        (1 << coding_category_emacs_mule)
 617 #define CATEGORY_MASK_RAW_TEXT          (1 << coding_category_raw_text)
 618
 619 /* This value is returned if detect_coding_mask () finds nothing other
 620    than ASCII characters.  NOTE(review): UTF_16_AUTO is absent -- confirm.  */
 621 #define CATEGORY_MASK_ANY               \
 622   (CATEGORY_MASK_ISO_7                  \
 623    | CATEGORY_MASK_ISO_7_TIGHT          \
 624    | CATEGORY_MASK_ISO_8_1              \
 625    | CATEGORY_MASK_ISO_8_2              \
 626    | CATEGORY_MASK_ISO_7_ELSE           \
 627    | CATEGORY_MASK_ISO_8_ELSE           \
 628    | CATEGORY_MASK_UTF_8                \
 629    | CATEGORY_MASK_UTF_16_BE            \
 630    | CATEGORY_MASK_UTF_16_LE            \
 631    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 632    | CATEGORY_MASK_UTF_16_LE_NOSIG      \
 633    | CATEGORY_MASK_CHARSET              \
 634    | CATEGORY_MASK_SJIS                 \
 635    | CATEGORY_MASK_BIG5                 \
 636    | CATEGORY_MASK_CCL                  \
 637    | CATEGORY_MASK_EMACS_MULE)
 638
 639
 640 #define CATEGORY_MASK_ISO_7BIT \
 641   (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
 642
 643 #define CATEGORY_MASK_ISO_8BIT \
 644   (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
 645
 646 #define CATEGORY_MASK_ISO_ELSE \
 647   (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
 648
 649 #define CATEGORY_MASK_ISO_ESCAPE        \
 650   (CATEGORY_MASK_ISO_7                  \
 651    | CATEGORY_MASK_ISO_7_TIGHT          \
 652    | CATEGORY_MASK_ISO_7_ELSE           \
 653    | CATEGORY_MASK_ISO_8_ELSE)
 654
 655 #define CATEGORY_MASK_ISO       \
 656   (  CATEGORY_MASK_ISO_7BIT     \
 657      | CATEGORY_MASK_ISO_8BIT   \
 658      | CATEGORY_MASK_ISO_ELSE)
 659
 660 #define CATEGORY_MASK_UTF_16            \
 661   (CATEGORY_MASK_UTF_16_BE              \
 662    | CATEGORY_MASK_UTF_16_LE            \
 663    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 664    | CATEGORY_MASK_UTF_16_LE_NOSIG)
 665
 666
 667 /* List of symbols `coding-category-xxx' ordered by priority.  This
 668    variable is exposed to Emacs Lisp.  */
 669 static Lisp_Object Vcoding_category_list;
 670
 671 /* Table of coding categories (Lisp symbols).  This variable is for
 672    internal use only.  */
 673 static Lisp_Object Vcoding_category_table;
 674
 675 /* Table of coding-categories ordered by priority.  */
 676 static enum coding_category coding_priorities[coding_category_max];
 677
 678 /* Nth element is a coding context for the coding system bound to the
 679    Nth coding category.  */
 680 static struct coding_system coding_categories[coding_category_max];
 681
 682 /*** Commonly used macros and functions ***/
 683
 684 #ifndef min
 685 #define min(a, b) ((a) < (b) ? (a) : (b))
 686 #endif
 687 #ifndef max
 688 #define max(a, b) ((a) > (b) ? (a) : (b))
 689 #endif
 690 /* Set ATTRS to CODING_ID_ATTRS of CODING's id, and CHARSET_LIST to CODING_ATTR_CHARSET_LIST (ATTRS).  */
 691 #define CODING_GET_INFO(coding, attrs, charset_list)    \
 692   do {                                                  \
 693     (attrs) = CODING_ID_ATTRS ((coding)->id);           \
 694     (charset_list) = CODING_ATTR_CHARSET_LIST (attrs);  \
 695   } while (0)
 696
 697
 698 /* Get one byte from SRC (which ends at SRC_END) into C and increment
 699    CONSUMED_CHARS.  If SRC == SRC_END, jump to `no_more_source' instead,
 700    first recording CODING_RESULT_INSUFFICIENT_SRC if SRC_BASE < SRC.
 701    If MULTIBYTEP is nonzero and the byte has the 0x80 bit set: a 0xC0
 702    or 0xC1 byte is combined with the next byte as ((c & 1) << 6) | next;
 703    else C is the negated result of string_char, and INVALID_SRC is recorded.
 704    The caller provides: src, src_end, src_base, multibytep, consumed_chars, coding.  */
 705
 706 #define ONE_MORE_BYTE(c)                                \
 707   do {                                                  \
 708     if (src == src_end)                                 \
 709       {                                                 \
 710         if (src_base < src)                             \
 711           record_conversion_result                      \
 712             (coding, CODING_RESULT_INSUFFICIENT_SRC);   \
 713         goto no_more_source;                            \
 714       }                                                 \
 715     c = *src++;                                         \
 716     if (multibytep && (c & 0x80))                       \
 717       {                                                 \
 718         if ((c & 0xFE) == 0xC0)                         \
 719           c = ((c & 1) << 6) | *src++;                  \
 720         else                                            \
 721           {                                             \
 722             src--;                                      \
 723             c = - string_char (src, &src, NULL);        \
 724             record_conversion_result                    \
 725               (coding, CODING_RESULT_INVALID_SRC);      \
 726           }                                             \
 727       }                                                 \
 728     consumed_chars++;                                   \
 729   } while (0)
 730
 731 /* Like ONE_MORE_BYTE, but without the SRC == SRC_END check and jump.  */
 732 #define ONE_MORE_BYTE_NO_CHECK(c)                       \
 733   do {                                                  \
 734     c = *src++;                                         \
 735     if (multibytep && (c & 0x80))                       \
 736       {                                                 \
 737         if ((c & 0xFE) == 0xC0)                         \
 738           c = ((c & 1) << 6) | *src++;                  \
 739         else                                            \
 740           {                                             \
 741             src--;                                      \
 742             c = - string_char (src, &src, NULL);        \
 743             record_conversion_result                    \
 744               (coding, CODING_RESULT_INVALID_SRC);      \
 745           }                                             \
 746       }                                                 \
 747     consumed_chars++;                                   \
 748   } while (0)
 749
 750
 751 /* Store a byte C in the place pointed by DST and increment DST to the
 752    next free point, and increment PRODUCED_CHARS.  The caller should
 753    assure that C is 0..127, and declare and set the variables `dst'
 754    and `produced_chars' appropriately in advance.
 755 */
 756
 757
 758 #define EMIT_ONE_ASCII_BYTE(c)  \
 759   do {                          \
 760     produced_chars++;           \
 761     *dst++ = (c);               \
 762   } while (0)
 763
 764
 765 /* Like EMIT_ONE_ASCII_BYTE, but store two bytes: C1, then C2.  */
 766
 767 #define EMIT_TWO_ASCII_BYTES(c1, c2)    \
 768   do {                                  \
 769     produced_chars += 2;                \
 770     *dst++ = (c1), *dst++ = (c2);       \
 771   } while (0)
 772
 773
 774 /* Store a byte C in the place pointed by DST and increment DST to the
 775    next free point, and increment PRODUCED_CHARS.  If MULTIBYTEP is
 776    nonzero, store it in multibyte form (via BYTE8_TO_CHAR if C >= 0x80).
 777    The caller should declare and set the variables `dst', `multibytep'
 778    and `produced_chars' appropriately in advance.  */
 779
 780 #define EMIT_ONE_BYTE(c)                \
 781   do {                                  \
 782     produced_chars++;                   \
 783     if (multibytep)                     \
 784       {                                 \
 785         int ch = (c);                   \
 786         if (ch >= 0x80)                 \
 787           ch = BYTE8_TO_CHAR (ch);      \
 788         CHAR_STRING_ADVANCE (ch, dst);  \
 789       }                                 \
 790     else                                \
 791       *dst++ = (c);                     \
 792   } while (0)
 793
 794
 795 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2.  */
 796
 797 #define EMIT_TWO_BYTES(c1, c2)          \
 798   do {                                  \
 799     produced_chars += 2;                \
 800     if (multibytep)                     \
 801       {                                 \
 802         int ch;                         \
 803                                         \
 804         ch = (c1);                      \
 805         if (ch >= 0x80)                 \
 806           ch = BYTE8_TO_CHAR (ch);      \
 807         CHAR_STRING_ADVANCE (ch, dst);  \
 808         ch = (c2);                      \
 809         if (ch >= 0x80)                 \
 810           ch = BYTE8_TO_CHAR (ch);      \
 811         CHAR_STRING_ADVANCE (ch, dst);  \
 812       }                                 \
 813     else                                \
 814       {                                 \
 815         *dst++ = (c1);                  \
 816         *dst++ = (c2);                  \
 817       }                                 \
 818   } while (0)
 819
 820
 821 #define EMIT_THREE_BYTES(c1, c2, c3)    \
 822   do {                                  \
 823     EMIT_ONE_BYTE (c1);                 \
 824     EMIT_TWO_BYTES (c2, c3);            \
 825   } while (0)
 826
 827
 828 #define EMIT_FOUR_BYTES(c1, c2, c3, c4)         \
 829   do {                                          \
 830     EMIT_TWO_BYTES (c1, c2);                    \
 831     EMIT_TWO_BYTES (c3, c4);                    \
 832   } while (0)
 833
 834
 835 /* Prototypes for static functions.  */
 836 static void record_conversion_result P_ ((struct coding_system *coding,
 837                                           enum coding_result_code result));
 838 static int detect_coding_utf_8 P_ ((struct coding_system *,
 839                                     struct coding_detection_info *info));
 840 static void decode_coding_utf_8 P_ ((struct coding_system *));
 841 static int encode_coding_utf_8 P_ ((struct coding_system *));
 842
 843 static int detect_coding_utf_16 P_ ((struct coding_system *,
 844                                      struct coding_detection_info *info));
 845 static void decode_coding_utf_16 P_ ((struct coding_system *));
 846 static int encode_coding_utf_16 P_ ((struct coding_system *));
 847
 848 static int detect_coding_iso_2022 P_ ((struct coding_system *,
 849                                        struct coding_detection_info *info));
 850 static void decode_coding_iso_2022 P_ ((struct coding_system *));
 851 static int encode_coding_iso_2022 P_ ((struct coding_system *));
 852
 853 static int detect_coding_emacs_mule P_ ((struct coding_system *,
 854                                          struct coding_detection_info *info));
 855 static void decode_coding_emacs_mule P_ ((struct coding_system *));
 856 static int encode_coding_emacs_mule P_ ((struct coding_system *));
 857
 858 static int detect_coding_sjis P_ ((struct coding_system *,
 859                                    struct coding_detection_info *info));
 860 static void decode_coding_sjis P_ ((struct coding_system *));
 861 static int encode_coding_sjis P_ ((struct coding_system *));
 862
 863 static int detect_coding_big5 P_ ((struct coding_system *,
 864                                    struct coding_detection_info *info));
 865 static void decode_coding_big5 P_ ((struct coding_system *));
 866 static int encode_coding_big5 P_ ((struct coding_system *));
 867
 868 static int detect_coding_ccl P_ ((struct coding_system *,
 869                                   struct coding_detection_info *info));
 870 static void decode_coding_ccl P_ ((struct coding_system *));
 871 static int encode_coding_ccl P_ ((struct coding_system *));
 872
 873 static void decode_coding_raw_text P_ ((struct coding_system *));
 874 static int encode_coding_raw_text P_ ((struct coding_system *));
 875
 876 static void coding_set_source P_ ((struct coding_system *));
 877 static void coding_set_destination P_ ((struct coding_system *));
 878 static void coding_alloc_by_realloc P_ ((struct coding_system *, EMACS_INT));
 879 static void coding_alloc_by_making_gap P_ ((struct coding_system *,
 880                                             EMACS_INT, EMACS_INT));
 881 static unsigned char *alloc_destination P_ ((struct coding_system *,
 882                                              EMACS_INT, unsigned char *));
 883 static void setup_iso_safe_charsets P_ ((Lisp_Object));
 884 static unsigned char *encode_designation_at_bol P_ ((struct coding_system *,
 885                                                      int *, int *,
 886                                                      unsigned char *));
 887 static int detect_eol P_ ((const unsigned char *,
 888                            EMACS_INT, enum coding_category));
 889 static Lisp_Object adjust_coding_eol_type P_ ((struct coding_system *, int));
 890 static void decode_eol P_ ((struct coding_system *));
 891 static Lisp_Object get_translation_table P_ ((Lisp_Object, int, int *));
 892 static Lisp_Object get_translation P_ ((Lisp_Object, int *, int *,
 893                                         int, int *, int *));
 894 static int produce_chars P_ ((struct coding_system *, Lisp_Object, int));
 895 static INLINE void produce_composition P_ ((struct coding_system *, int *,
 896                                             EMACS_INT));
 897 static INLINE void produce_charset P_ ((struct coding_system *, int *,
 898                                         EMACS_INT));
 899 static void produce_annotation P_ ((struct coding_system *, EMACS_INT));
 900 static int decode_coding P_ ((struct coding_system *));
 901 static INLINE int *handle_composition_annotation P_ ((EMACS_INT, EMACS_INT,
 902                                                       struct coding_system *,
 903                                                       int *, EMACS_INT *));
 904 static INLINE int *handle_charset_annotation P_ ((EMACS_INT, EMACS_INT,
 905                                                   struct coding_system *,
 906                                                   int *, EMACS_INT *));
 907 static void consume_chars P_ ((struct coding_system *, Lisp_Object, int));
 908 static int encode_coding P_ ((struct coding_system *));
 909 static Lisp_Object make_conversion_work_buffer P_ ((int));
 910 static Lisp_Object code_conversion_restore P_ ((Lisp_Object));
 911 static INLINE int char_encodable_p P_ ((int, Lisp_Object));
 912 static Lisp_Object make_subsidiaries P_ ((Lisp_Object));
 913
 914 static void
 915 record_conversion_result (struct coding_system *coding,
 916                           enum coding_result_code result)
 917 {
 918   coding->result = result;
 919   switch (result)
 920     {
 921     case CODING_RESULT_INSUFFICIENT_SRC:
 922       Vlast_code_conversion_error = Qinsufficient_source;
 923       break;
 924     case CODING_RESULT_INCONSISTENT_EOL:
 925       Vlast_code_conversion_error = Qinconsistent_eol;
 926       break;
 927     case CODING_RESULT_INVALID_SRC:
 928       Vlast_code_conversion_error = Qinvalid_source;
 929       break;
 930     case CODING_RESULT_INTERRUPT:
 931       Vlast_code_conversion_error = Qinterrupted;
 932       break;
 933     case CODING_RESULT_INSUFFICIENT_MEM:
 934       Vlast_code_conversion_error = Qinsufficient_memory;
 935       break;
 936     default:
 937       Vlast_code_conversion_error = intern ("Unknown error");
 938     }
 939 }
 940
 /* Decode CODE in CHARSET with DECODE_CHAR and store the result in C.

      `charset_map_loaded' is cleared before the call.  If it is nonzero
      afterwards, the source address of CODING is recomputed with
      coding_set_source, and SRC, SRC_BASE and SRC_END are shifted by
      the distance CODING->source moved.  (Presumably loading a charset
      map can relocate the text being decoded -- confirm against
      DECODE_CHAR.)

      SRC, SRC_BASE and SRC_END must be modifiable pointers into the
      source of CODING.  */
 941 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
 942   do {                                                                       \
 943     charset_map_loaded = 0;                                                  \
 944     c = DECODE_CHAR (charset, code);                                         \
 945     if (charset_map_loaded)                                                  \
 946       {                                                                      \
 947         const unsigned char *orig = coding->source;                          \
 948         EMACS_INT offset;                                                    \
 949                                                                              \
 950         coding_set_source (coding);                                          \
 951         offset = coding->source - orig;                                      \
 952         src += offset;                                                       \
 953         src_base += offset;                                                  \
 954         src_end += offset;                                                   \
 955       }                                                                      \
 956   } while (0)
 957
 958
 /* Make sure that BYTES more bytes can be stored at `dst'.

      If `dst' + BYTES reaches or passes `dst_end', the destination is
      enlarged with alloc_destination by BYTES plus one byte for each
      element still pending between `charbuf' and `charbuf_end'; `dst'
      and `dst_end' are then refreshed, since alloc_destination returns
      a new write position.

      The caller must have `coding', `dst', `dst_end', `charbuf' and
      `charbuf_end' in scope.  */
 959 #define ASSURE_DESTINATION(bytes)                               \
 960   do {                                                          \
 961     if (dst + (bytes) >= dst_end)                               \
 962       {                                                         \
 963         int more_bytes = charbuf_end - charbuf + (bytes);       \
 964                                                                 \
 965         dst = alloc_destination (coding, more_bytes, dst);      \
 966         dst_end = coding->destination + coding->dst_bytes;      \
 967       }                                                         \
 968   } while (0)
 969
 970
 971
 972 static void
 973 coding_set_source (coding)
 974      struct coding_system *coding;
 975 {
 976   if (BUFFERP (coding->src_object))
 977     {
 978       struct buffer *buf = XBUFFER (coding->src_object);
 979
 980       if (coding->src_pos < 0)
 981         coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
 982       else
 983         coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
 984     }
 985   else if (STRINGP (coding->src_object))
 986     {
 987       coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
 988     }
 989   else
 990     /* Otherwise, the source is C string and is never relocated
 991        automatically.  Thus we don't have to update anything.  */
 992     ;
 993 }
 994
 995 static void
 996 coding_set_destination (coding)
 997      struct coding_system *coding;
 998 {
 999   if (BUFFERP (coding->dst_object))
1000     {
1001       if (coding->src_pos < 0)
1002         {
1003           coding->destination = BEG_ADDR + coding->dst_pos_byte - 1;
1004           coding->dst_bytes = (GAP_END_ADDR
1005                                - (coding->src_bytes - coding->consumed)
1006                                - coding->destination);
1007         }
1008       else
1009         {
1010           /* We are sure that coding->dst_pos_byte is before the gap
1011              of the buffer. */
1012           coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
1013                                  + coding->dst_pos_byte - 1);
1014           coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
1015                                - coding->destination);
1016         }
1017     }
1018   else
1019     /* Otherwise, the destination is C string and is never relocated
1020        automatically.  Thus we don't have to update anything.  */
1021     ;
1022 }
1023
1024
1025 static void
1026 coding_alloc_by_realloc (coding, bytes)
1027      struct coding_system *coding;
1028      EMACS_INT bytes;
1029 {
1030   coding->destination = (unsigned char *) xrealloc (coding->destination,
1031                                                     coding->dst_bytes + bytes);
1032   coding->dst_bytes += bytes;
1033 }
1034
/* Enlarge the gap of the destination buffer of CODING by BYTES bytes.
        OFFSET is the number of bytes already written at the destination.

        When the source and the destination are the same buffer, the
        buffer state is adjusted around the call to make_gap and restored
        afterwards: the gap start is advanced by OFFSET, and the gap is
        shrunk (and the buffer end extended) by OFFSET plus the number of
        source bytes not yet consumed.  Presumably this makes make_gap
        treat the bytes already produced and the unconsumed source as
        buffer text, so that they are preserved -- confirm against
        make_gap.

        Otherwise the destination buffer is made current just for the call
        to make_gap.  */
1035 static void
1036 coding_alloc_by_making_gap (coding, offset, bytes)
1037      struct coding_system *coding;
1038      EMACS_INT offset, bytes;
1039 {
1040   if (BUFFERP (coding->dst_object)
1041       && EQ (coding->src_object, coding->dst_object))
1042     {
            /* Bytes to hide from the gap while it is being enlarged.  */
1043       EMACS_INT add = offset + (coding->src_bytes - coding->consumed);
1044
1045       GPT += offset, GPT_BYTE += offset;
1046       GAP_SIZE -= add; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
1047       make_gap (bytes);
            /* Undo the temporary adjustments in the reverse order.  */
1048       GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
1049       GPT -= offset, GPT_BYTE -= offset;
1050     }
1051   else
1052     {
1053       Lisp_Object this_buffer;
1054
            /* make_gap is called with the destination buffer current;
               the previously current buffer is restored afterwards.  */
1055       this_buffer = Fcurrent_buffer ();
1056       set_buffer_internal (XBUFFER (coding->dst_object));
1057       make_gap (bytes);
1058       set_buffer_internal (XBUFFER (this_buffer));
1059     }
1060 }
1061
1062
1063 static unsigned char *
1064 alloc_destination (coding, nbytes, dst)
1065      struct coding_system *coding;
1066      EMACS_INT nbytes;
1067      unsigned char *dst;
1068 {
1069   EMACS_INT offset = dst - coding->destination;
1070
1071   if (BUFFERP (coding->dst_object))
1072     coding_alloc_by_making_gap (coding, offset, nbytes);
1073   else
1074     coding_alloc_by_realloc (coding, nbytes);
1075   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1076   coding_set_destination (coding);
1077   dst = coding->destination + offset;
1078   return dst;
1079 }
1080
1081 /** Macros for annotations.  */
1082
1083 /* Maximum length of annotation data (sum of annotations for
1084    composition and charset), counted in charbuf elements.
        The first 4 matches the four elements that ADD_COMPOSITION_DATA
        stores, and the last 4 the four elements that ADD_CHARSET_DATA
        stores.  The middle term, (MAX_COMPOSITION_COMPONENTS * 2) - 1,
        is presumably the room for the composition components and the
        rules between them -- confirm against the composition code.  */
1085 #define MAX_ANNOTATION_LENGTH (4 + (MAX_COMPOSITION_COMPONENTS * 2) - 1 + 4)
1086
1087 /* An annotation data is stored in the array coding->charbuf in this
1088    format:
1089      [ -LENGTH ANNOTATION_MASK NCHARS ... ]
1090    LENGTH is the number of elements in the annotation.
1091    ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
1092    NCHARS is the number of characters in the text annotated.
1093
1094    The format of the following elements depends on ANNOTATION_MASK.
1095
1096    In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
1097    follow:
1098      ... METHOD [ COMPOSITION-COMPONENTS ... ]
1099    METHOD is one of enum composition_method.
1100    Optional COMPOSITION-COMPONENTS are characters and composition
1101    rules.
1102
1103    In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1104    follows.  */
1105
/* Store the three-element header of an annotation at BUF and advance
   BUF past it.  The elements are -LEN, MASK and NCHARS, in that
   order.  Also set coding->annotated to 1, so `coding' must be in
   scope.  BUF must be a modifiable pointer; it is evaluated three
   times.

   The expansion ends without a semicolon so that the macro behaves
   like a single statement, including directly before `else'.  */

#define ADD_ANNOTATION_DATA(buf, len, mask, nchars)     \
  do {                                                  \
    *(buf)++ = -(len);                                  \
    *(buf)++ = (mask);                                  \
    *(buf)++ = (nchars);                                \
    coding->annotated = 1;                              \
  } while (0)
1113
/* Store a composition annotation at BUF and advance BUF past it:
   the header written by ADD_ANNOTATION_DATA (with length 4 and
   CODING_ANNOTATE_COMPOSITION_MASK), followed by METHOD.  NCHARS is
   the number of characters annotated.

   BUF and METHOD are parenthesized in the expansion so that an
   argument such as `*pp' advances the intended pointer, the same way
   ADD_ANNOTATION_DATA does.  */

#define ADD_COMPOSITION_DATA(buf, nchars, method)                           \
  do {                                                                      \
    ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
    *(buf)++ = (method);                                                    \
  } while (0)
1119
1120
/* Store a charset annotation at BUF and advance BUF past it: the
   header written by ADD_ANNOTATION_DATA (with length 4 and
   CODING_ANNOTATE_CHARSET_MASK), followed by the charset ID.  NCHARS
   is the number of characters annotated.

   BUF and ID are parenthesized in the expansion so that an argument
   such as `*pp' advances the intended pointer, the same way
   ADD_ANNOTATION_DATA does.  */

#define ADD_CHARSET_DATA(buf, nchars, id)                               \
  do {                                                                  \
    ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars); \
    *(buf)++ = (id);                                                    \
  } while (0)
1126
1127 \f
1128 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1129
1130
1131
1132 \f
1133 /*** 3. UTF-8 ***/
1134
1135 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1136    Check if a text is encoded in UTF-8.  If it is, return 1, else
1137    return 0.  */
1138
/* Predicates that classify one octet C of a UTF-8 sequence by its
        high bits:
          UTF_8_1_OCTET_P          C is below 0x80 (this also holds for any
                                   negative C)
          UTF_8_EXTRA_OCTET_P      10xxxxxx, a trailing octet
          UTF_8_2_OCTET_LEADING_P  110xxxxx
          UTF_8_3_OCTET_LEADING_P  1110xxxx
          UTF_8_4_OCTET_LEADING_P  11110xxx
          UTF_8_5_OCTET_LEADING_P  111110xx  */
1139 #define UTF_8_1_OCTET_P(c)         ((c) < 0x80)
1140 #define UTF_8_EXTRA_OCTET_P(c)     (((c) & 0xC0) == 0x80)
1141 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
1142 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
1143 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
1144 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
1145
1146 static int
1147 detect_coding_utf_8 (coding, detect_info)
1148      struct coding_system *coding;
1149      struct coding_detection_info *detect_info;
1150 {
1151   const unsigned char *src = coding->source, *src_base;
1152   const unsigned char *src_end = coding->source + coding->src_bytes;
1153   int multibytep = coding->src_multibyte;
1154   int consumed_chars = 0;
        /* Set to CATEGORY_MASK_UTF_8 once a complete multi-octet sequence
           has been seen; reported in DETECT_INFO->found at the end.  */
1155   int found = 0;
1156
1157   detect_info->checked |= CATEGORY_MASK_UTF_8;
1158   /* A coding system of this category is always ASCII compatible.  */
1159   src += coding->head_ascii;
1160
1161   while (1)
1162     {
1163       int c, c1, c2, c3, c4;
1164
            /* SRC_BASE marks the start of the sequence being examined.  */
1165       src_base = src;
1166       ONE_MORE_BYTE (c);
            /* Values below 0x80, negative ones included, need no further
               checking.  (A negative value presumably stands for a
               character that ONE_MORE_BYTE read from multibyte source.)  */
1167       if (c < 0 || UTF_8_1_OCTET_P (c))
1168         continue;
            /* From here on every octet must be a 10xxxxxx trailing octet;
               the leading octet C tells after how many of them the
               sequence is complete.  */
1169       ONE_MORE_BYTE (c1);
1170       if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
1171         break;
1172       if (UTF_8_2_OCTET_LEADING_P (c))
1173         {
1174           found = CATEGORY_MASK_UTF_8;
1175           continue;
1176         }
1177       ONE_MORE_BYTE (c2);
1178       if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1179         break;
1180       if (UTF_8_3_OCTET_LEADING_P (c))
1181         {
1182           found = CATEGORY_MASK_UTF_8;
1183           continue;
1184         }
1185       ONE_MORE_BYTE (c3);
1186       if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1187         break;
1188       if (UTF_8_4_OCTET_LEADING_P (c))
1189         {
1190           found = CATEGORY_MASK_UTF_8;
1191           continue;
1192         }
1193       ONE_MORE_BYTE (c4);
1194       if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1195         break;
1196       if (UTF_8_5_OCTET_LEADING_P (c))
1197         {
1198           found = CATEGORY_MASK_UTF_8;
1199           continue;
1200         }
            /* C is not a valid leading octet for a sequence of any
               length.  */
1201       break;
1202     }
        /* Reached only through `break': an invalid sequence was seen.  */
1203   detect_info->rejected |= CATEGORY_MASK_UTF_8;
1204   return 0;
1205
        /* No statement in this function jumps here explicitly; the label
           is reached through ONE_MORE_BYTE, presumably when SRC reaches
           SRC_END (the macro is defined earlier in the file).  */
1206  no_more_source:
        /* The source ended inside a sequence.  In the last block that
           sequence can never be completed, so reject UTF-8.  */
1207   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1208     {
1209       detect_info->rejected |= CATEGORY_MASK_UTF_8;
1210       return 0;
1211     }
1212   detect_info->found |= found;
1213   return 1;
1214 }
1215
1216
/* Decode the UTF-8 source of CODING into CODING->charbuf.

        Decoding starts at CODING->source + CODING->consumed and appends
        character codes at CODING->charbuf + CODING->charbuf_used.  It
        stops when the charbuf is full or when the source runs out.  On
        return, CODING->consumed, CODING->consumed_char and
        CODING->charbuf_used describe the work done up to the start of
        the last sequence that was not completely decoded.

        An invalid sequence (bad trailing octet, overlong form, surrogate
        code point, or a value above MAX_CHAR) is not dropped: its first
        byte alone is stored, through BYTE8_TO_CHAR unless it is ASCII,
        CODING->errors is incremented, and decoding resumes at the next
        byte.  */
1217 static void
1218 decode_coding_utf_8 (coding)
1219      struct coding_system *coding;
1220 {
1221   const unsigned char *src = coding->source + coding->consumed;
1222   const unsigned char *src_end = coding->source + coding->src_bytes;
1223   const unsigned char *src_base;
1224   int *charbuf = coding->charbuf + coding->charbuf_used;
1225   int *charbuf_end = coding->charbuf + coding->charbuf_size;
1226   int consumed_chars = 0, consumed_chars_base;
1227   int multibytep = coding->src_multibyte;
1228   Lisp_Object attr, charset_list;
1229
1230   CODING_GET_INFO (coding, attr, charset_list);
1231
1232   while (1)
1233     {
1234       int c, c1, c2, c3, c4, c5;
1235
            /* Remember where this sequence starts, both to rewind on an
               invalid sequence and to report how much was consumed.  */
1236       src_base = src;
1237       consumed_chars_base = consumed_chars;
1238
1239       if (charbuf >= charbuf_end)
1240         break;
1241
1242       ONE_MORE_BYTE (c1);
1243       if (c1 < 0)
1244         {
                /* A negative value is stored as its positive counterpart.
                   (Presumably ONE_MORE_BYTE returns the negated code of a
                   character it read from multibyte source.)  */
1245           c = - c1;
1246         }
1247       else if (UTF_8_1_OCTET_P(c1))
1248         {
1249           c = c1;
1250         }
1251       else
1252         {
1253           ONE_MORE_BYTE (c2);
1254           if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1255             goto invalid_code;
1256           if (UTF_8_2_OCTET_LEADING_P (c1))
1257             {
1258               c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1259               /* Reject overlong sequences here and below.  Encoders
1260                  producing them are incorrect, they can be misleading,
1261                  and they mess up read/write invariance.  */
1262               if (c < 128)
1263                 goto invalid_code;
1264             }
1265           else
1266             {
1267               ONE_MORE_BYTE (c3);
1268               if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1269                 goto invalid_code;
1270               if (UTF_8_3_OCTET_LEADING_P (c1))
1271                 {
1272                   c = (((c1 & 0xF) << 12)
1273                        | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
1274                   if (c < 0x800
1275                       || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
1276                     goto invalid_code;
1277                 }
1278               else
1279                 {
1280                   ONE_MORE_BYTE (c4);
1281                   if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1282                     goto invalid_code;
1283                   if (UTF_8_4_OCTET_LEADING_P (c1))
1284                     {
1285                     c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1286                          | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
                          /* Below 0x10000 a four-octet form is overlong.  */
1287                     if (c < 0x10000)
1288                       goto invalid_code;
1289                     }
1290                   else
1291                     {
1292                       ONE_MORE_BYTE (c5);
1293                       if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
1294                         goto invalid_code;
1295                       if (UTF_8_5_OCTET_LEADING_P (c1))
1296                         {
1297                           c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1298                                | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1299                                | (c5 & 0x3F));
                                /* Reject values above MAX_CHAR as well as
                                   overlong five-octet forms.  */
1300                           if ((c > MAX_CHAR) || (c < 0x200000))
1301                             goto invalid_code;
1302                         }
1303                       else
1304                         goto invalid_code;
1305                     }
1306                 }
1307             }
1308         }
1309
1310       *charbuf++ = c;
1311       continue;
1312
1313     invalid_code:
            /* Rewind to the start of the sequence and consume just its
               first byte as a raw byte.  */
1314       src = src_base;
1315       consumed_chars = consumed_chars_base;
1316       ONE_MORE_BYTE (c);
1317       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1318       coding->errors++;
1319     }
1320
        /* Reached by falling out of the loop when the charbuf is full, and
           through ONE_MORE_BYTE (presumably when SRC reaches SRC_END).  In
           both cases only the input before SRC_BASE counts as consumed.  */
1321  no_more_source:
1322   coding->consumed_char += consumed_chars_base;
1323   coding->consumed = src_base - coding->source;
1324   coding->charbuf_used = charbuf - coding->charbuf;
1325 }
1326
1327
1328 static int
1329 encode_coding_utf_8 (coding)
1330      struct coding_system *coding;
1331 {
1332   int multibytep = coding->dst_multibyte;
1333   int *charbuf = coding->charbuf;
1334   int *charbuf_end = charbuf + coding->charbuf_used;
1335   unsigned char *dst = coding->destination + coding->produced;
1336   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1337   int produced_chars = 0;
1338   int c;
1339
1340   if (multibytep)
1341     {
1342       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1343
1344       while (charbuf < charbuf_end)
1345         {
1346           unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1347
1348           ASSURE_DESTINATION (safe_room);
1349           c = *charbuf++;
1350           if (CHAR_BYTE8_P (c))
1351             {
1352               c = CHAR_TO_BYTE8 (c);
1353               EMIT_ONE_BYTE (c);
1354             }
1355           else
1356             {
1357               CHAR_STRING_ADVANCE (c, pend);
1358               for (p = str; p < pend; p++)
1359                 EMIT_ONE_BYTE (*p);
1360             }
1361         }
1362     }
1363   else
1364     {
1365       int safe_room = MAX_MULTIBYTE_LENGTH;
1366
1367       while (charbuf < charbuf_end)
1368         {
1369           ASSURE_DESTINATION (safe_room);
1370           c = *charbuf++;
1371           if (CHAR_BYTE8_P (c))
1372             *dst++ = CHAR_TO_BYTE8 (c);
1373           else
1374             dst += CHAR_STRING (c, dst);
1375           produced_chars++;
1376         }
1377     }
1378   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1379   coding->produced_char += produced_chars;
1380   coding->produced = dst - coding->destination;
1381   return 0;
1382 }
1383
1384
1385 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1386    Check if a text is encoded in one of UTF-16 based coding systems.
1387    If it is, return 1, else return 0.  */
1388
/* Predicates on a 16-bit UTF-16 code unit VAL.
        UTF_16_HIGH_SURROGATE_P: VAL is in 0xD800..0xDBFF.
        UTF_16_LOW_SURROGATE_P:  VAL is in 0xDC00..0xDFFF.
        UTF_16_INVALID_P:        VAL is 0xFFFE, 0xFFFF, or a low
                                 surrogate.  */
1389 #define UTF_16_HIGH_SURROGATE_P(val) \
1390   (((val) & 0xFC00) == 0xD800)
1391
1392 #define UTF_16_LOW_SURROGATE_P(val) \
1393   (((val) & 0xFC00) == 0xDC00)
1394
1395 #define UTF_16_INVALID_P(val)   \
1396   (((val) == 0xFFFE)            \
1397    || ((val) == 0xFFFF)         \
1398    || UTF_16_LOW_SURROGATE_P (val))
1399
1400
1401 static int
1402 detect_coding_utf_16 (coding, detect_info)
1403      struct coding_system *coding;
1404      struct coding_detection_info *detect_info;
1405 {
1406   const unsigned char *src = coding->source, *src_base = src;
1407   const unsigned char *src_end = coding->source + coding->src_bytes;
1408   int multibytep = coding->src_multibyte;
1409   int consumed_chars = 0;
1410   int c1, c2;
1411
1412   detect_info->checked |= CATEGORY_MASK_UTF_16;
        /* In the last block, an odd CODING->src_chars rules out every
           UTF-16 category.  */
1413   if (coding->mode & CODING_MODE_LAST_BLOCK
1414       && (coding->src_chars & 1))
1415     {
1416       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1417       return 0;
1418     }
1419
        /* Only the first two bytes are examined.  */
1420   ONE_MORE_BYTE (c1);
1421   ONE_MORE_BYTE (c2);
1422   if ((c1 == 0xFF) && (c2 == 0xFE))
1423     {
            /* FF FE: report the LE and AUTO categories as found, and
               reject BE together with both NOSIG categories.  */
1424       detect_info->found |= (CATEGORY_MASK_UTF_16_LE
1425                              | CATEGORY_MASK_UTF_16_AUTO);
1426       detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
1427                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1428                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1429     }
1430   else if ((c1 == 0xFE) && (c2 == 0xFF))
1431     {
            /* FE FF: report the BE and AUTO categories as found, and
               reject LE together with both NOSIG categories.  */
1432       detect_info->found |= (CATEGORY_MASK_UTF_16_BE
1433                              | CATEGORY_MASK_UTF_16_AUTO);
1434       detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
1435                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1436                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1437     }
1438   else if (c1 >= 0 && c2 >= 0)
1439     {
            /* Neither byte order mark is present: reject the BE and LE
               categories (presumably the ones that require a mark, since
               the NOSIG categories are left alone).  */
1440       detect_info->rejected
1441         |= (CATEGORY_MASK_UTF_16_BE | CATEGORY_MASK_UTF_16_LE);
1442     }
        /* Also reached through ONE_MORE_BYTE, presumably when fewer than
           two bytes are available; nothing is rejected in that case.  */
1443  no_more_source:
1444   return 1;
1445 }
1446
1447 static void
1448 decode_coding_utf_16 (coding)
1449      struct coding_system *coding;
1450 {
1451   const unsigned char *src = coding->source + coding->consumed;
1452   const unsigned char *src_end = coding->source + coding->src_bytes;
1453   const unsigned char *src_base;
1454   int *charbuf = coding->charbuf + coding->charbuf_used;
1455   int *charbuf_end = coding->charbuf + coding->charbuf_size;
1456   int consumed_chars = 0, consumed_chars_base;
1457   int multibytep = coding->src_multibyte;
1458   enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding);
1459   enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1460   int surrogate = CODING_UTF_16_SURROGATE (coding);
1461   Lisp_Object attr, charset_list;
1462
1463   CODING_GET_INFO (coding, attr, charset_list);
1464
1465   if (bom == utf_16_with_bom)
1466     {
1467       int c, c1, c2;
1468
1469       src_base = src;
1470       ONE_MORE_BYTE (c1);
1471       ONE_MORE_BYTE (c2);
1472       c = (c1 << 8) | c2;
1473
1474       if (endian == utf_16_big_endian
1475           ? c != 0xFEFF : c != 0xFFFE)
1476         {
1477           /* The first two bytes are not BOM.  Treat them as bytes
1478              for a normal character.  */
1479           src = src_base;
1480           coding->errors++;
1481         }
1482       CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1483     }
1484   else if (bom == utf_16_detect_bom)
1485     {
1486       /* We have already tried to detect BOM and failed in
1487          detect_coding.  */
1488       CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1489     }
1490
1491   while (1)
1492     {
1493       int c, c1, c2;
1494
1495       src_base = src;
1496       consumed_chars_base = consumed_chars;
1497
1498       if (charbuf + 2 >= charbuf_end)
1499         break;
1500
1501       ONE_MORE_BYTE (c1);
1502       if (c1 < 0)
1503         {
1504           *charbuf++ = -c1;
1505           continue;
1506         }
1507       ONE_MORE_BYTE (c2);
1508       if (c2 < 0)
1509         {
1510           *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
1511           *charbuf++ = -c2;
1512           continue;
1513         }
1514       c = (endian == utf_16_big_endian
1515            ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1516       if (surrogate)
1517         {
1518           if (! UTF_16_LOW_SURROGATE_P (c))
1519             {
1520               if (endian == utf_16_big_endian)
1521                 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1522               else
1523                 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1524               *charbuf++ = c1;
1525               *charbuf++ = c2;
1526               coding->errors++;
1527               if (UTF_16_HIGH_SURROGATE_P (c))
1528                 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1529               else
1530                 *charbuf++ = c;
1531             }
1532           else
1533             {
1534               c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1535               CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1536               *charbuf++ = 0x10000 + c;
1537             }
1538         }
1539       else
1540         {
1541           if (UTF_16_HIGH_SURROGATE_P (c))
1542             CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1543           else
1544             *charbuf++ = c;
1545         }
1546     }
1547
1548  no_more_source:
1549   coding->consumed_char += consumed_chars_base;
1550   coding->consumed = src_base - coding->source;
1551   coding->charbuf_used = charbuf - coding->charbuf;
1552 }
1553 /* Encode the characters in CODING->charbuf as UTF-16 and store the
        resulting bytes at CODING->destination + CODING->produced.
        A byte order mark is written first unless CODING_UTF_16_BOM
        (CODING) is utf_16_without_bom; the BOM state is then reset to
        utf_16_without_bom so that it is written only once.  Characters
        below 0x10000 are written as one 16-bit unit, the others as a
        surrogate pair, in the byte order selected by CODING_UTF_16_ENDIAN
        (CODING).  Characters greater than MAX_UNICODE_CHAR are replaced
        by CODING->default_char.
        Finally record CODING_RESULT_SUCCESS, update CODING->produced and
        CODING->produced_char, and return 0.
        NOTE(review): `multibytep' and `dst_end' are not referenced
        directly; presumably the ASSURE_DESTINATION and EMIT_* macros
        (defined elsewhere) use them -- confirm before removing.  */
1554 static int
1555 encode_coding_utf_16 (coding)
1556      struct coding_system *coding;
1557 {
1558   int multibytep = coding->dst_multibyte;
1559   int *charbuf = coding->charbuf;
1560   int *charbuf_end = charbuf + coding->charbuf_used;
1561   unsigned char *dst = coding->destination + coding->produced;
1562   unsigned char *dst_end = coding->destination + coding->dst_bytes;
       /* Room requested before each emission; one character needs at
          most four bytes below.  */
1563   int safe_room = 8;
1564   enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding);
1565   int big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1566   int produced_chars = 0;
1567   Lisp_Object attrs, charset_list;
1568   int c;
1569
1570   CODING_GET_INFO (coding, attrs, charset_list);
1571
1572   if (bom != utf_16_without_bom)
1573     {
1574       ASSURE_DESTINATION (safe_room);
1575       if (big_endian)
1576         EMIT_TWO_BYTES (0xFE, 0xFF);
1577       else
1578         EMIT_TWO_BYTES (0xFF, 0xFE);
           /* Do not emit the BOM again on a later call.  */
1579       CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1580     }
1581
1582   while (charbuf < charbuf_end)
1583     {
1584       ASSURE_DESTINATION (safe_room);
1585       c = *charbuf++;
           /* Only characters beyond the Unicode range are replaced.
              MAX_UNICODE_CHAR is taken to be the largest valid code
              point (presumably 0x10FFFF -- confirm in character.h); it
              is encodable as a surrogate pair, so the test is `>' and
              not `>='.  */
1586       if (c > MAX_UNICODE_CHAR)
1587         c = coding->default_char;
1588
1589       if (c < 0x10000)
1590         {
1591           if (big_endian)
1592             EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1593           else
1594             EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1595         }
1596       else
1597         {
               /* C1 is the high surrogate, C2 the low surrogate.  */
1598           int c1, c2;
1599
1600           c -= 0x10000;
1601           c1 = (c >> 10) + 0xD800;
1602           c2 = (c & 0x3FF) + 0xDC00;
1603           if (big_endian)
1604             EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1605           else
1606             EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1607         }
1608     }
1609   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1610   coding->produced = dst - coding->destination;
1611   coding->produced_char += produced_chars;
1612   return 0;
1613 }
1614
1615 \f
1616 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1617
1618 /* Emacs' internal format for representation of multiple character
1619    sets is a kind of multi-byte encoding, i.e. characters are
1620    represented by variable-length sequences of one-byte codes.
1621
1622    ASCII characters and control characters (e.g. `tab', `newline') are
1623    represented by one-byte sequences which are their ASCII codes, in
1624    the range 0x00 through 0x7F.
1625
1626    8-bit characters of the range 0x80..0x9F are represented by
1627    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1628    code + 0x20).
1629
1630    8-bit characters of the range 0xA0..0xFF are represented by
1631    one-byte sequences which are their 8-bit code.
1632
1633    The other characters are represented by a sequence of `base
1634    leading-code', optional `extended leading-code', and one or two
1635    `position-code's.  The length of the sequence is determined by the
1636    base leading-code.  Leading-code takes the range 0x81 through 0x9D,
1637    whereas extended leading-code and position-code take the range 0xA0
1638    through 0xFF.  See `charset.h' for more details about leading-code
1639    and position-code.
1640
1641    --- CODE RANGE of Emacs' internal format ---
1642    character set        range
1643    -------------        -----
1644    ascii                0x00..0x7F
1645    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1646    eight-bit-graphic    0xA0..0xFF
1647    ELSE                 0x81..0x9D + [0xA0..0xFF]+
1648    ---------------------------------------------
1649
1650    As this is the internal character representation, the format is
1651    usually not used externally (i.e. in a file or in data sent to a
1652    process).  But it is possible to have text in this format
1653    externally (i.e. by encoding it with the coding system `emacs-mule').
1654
1655    In that case, a sequence of one-byte codes has a slightly different
1656    form.
1657
1658    At first, all characters in eight-bit-control are represented by
1659    one-byte sequences which are their 8-bit code.
1660
1661    Next, character composition data are represented by a byte
1662    sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1663    where,
1664         METHOD is 0xF2 plus one of the composition methods (enum
1665         composition_method),
1666
1667         BYTES is 0xA0 plus the byte length of this composition data,
1668
1669         CHARS is 0xA0 plus the number of characters composed by this
1670         data,
1671
1672         COMPONENTs are characters in multibyte form or composition
1673         rules encoded as two bytes of ASCII codes.
1674
1675    In addition, for backward compatibility, the following formats are
1676    also recognized as composition data on decoding.
1677
1678    0x80 MSEQ ...
1679    0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1680
1681    Here,
1682         MSEQ is a multibyte form, but in one of these special formats:
1683           ASCII: 0xA0 ASCII_CODE+0x80,
1684           other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1685         RULE is a one-byte code in the range 0xA0..0xF0 that
1686         represents a composition rule.
1687   */
1688 /* Table indexed by a byte value.  As used below, the entry for the
        first byte of an emacs-mule sequence is the total number of bytes
        (1 to 4) in that sequence.  The table is not initialized in this
        part of the file.  */
1689 char emacs_mule_bytes[256];
1690 /* Decode one character in emacs-mule format that starts at SRC, which
        must point into the source of CODING.
        On success, return the character, store the number of source bytes
        used in *NBYTES, store the value of `consumed_chars' (maintained by
        ONE_MORE_BYTE) in *NCHARS, and, if ID is not NULL, store the id of
        the character's charset in *ID.
        Return -1 if the byte sequence is invalid.  Return -2 via the label
        `no_more_source', which ONE_MORE_BYTE presumably jumps to when the
        source is exhausted before the sequence is complete.
        The first byte may also be an old style composition component:
        0xA0 followed by ASCII_CODE+0x80, or a leading code plus 0x20.  */
1691 int
1692 emacs_mule_char (coding, src, nbytes, nchars, id)
1693      struct coding_system *coding;
1694      const unsigned char *src;
1695      int *nbytes, *nchars, *id;
1696 {
1697   const unsigned char *src_end = coding->source + coding->src_bytes;
1698   const unsigned char *src_base = src;
1699   int multibytep = coding->src_multibyte;
1700   struct charset *charset;
1701   unsigned code;
1702   int c;
1703   int consumed_chars = 0;
1704
1705   ONE_MORE_BYTE (c);
1706   if (c < 0)
1707     {
           /* A negative value stands for the character -C.  */
1708       c = -c;
1709       charset = emacs_mule_charset[0];
1710     }
1711   else
1712     {
1713       if (c >= 0xA0)
1714         {
1715           /* Old style component character of a composition.  */
1716           if (c == 0xA0)
1717             {
1718               ONE_MORE_BYTE (c);
                   /* The byte after 0xA0 is ASCII_CODE+0x80.  Anything
                      smaller (including a negative value) would make C
                      negative below and index emacs_mule_bytes[] out of
                      bounds.  */
                   if (c < 0x80)
                     goto invalid_code;
1719               c -= 0x80;
1720             }
1721           else
1722             c -= 0x20;
1723         }
1724
           /* Dispatch on the total length of the sequence led by C.  */
1725       switch (emacs_mule_bytes[c])
1726         {
1727         case 2:
               /* Leading code + one position code.  */
1728           if (! (charset = emacs_mule_charset[c]))
1729             goto invalid_code;
1730           ONE_MORE_BYTE (c);
1731           if (c < 0xA0)
1732             goto invalid_code;
1733           code = c & 0x7F;
1734           break;
1735
1736         case 3:
1737           if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
1738               || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
1739             {
                   /* Private leading code + byte selecting the charset
                      + one position code.  */
1740               ONE_MORE_BYTE (c);
1741               if (c < 0xA0 || ! (charset = emacs_mule_charset[c]))
1742                 goto invalid_code;
1743               ONE_MORE_BYTE (c);
1744               if (c < 0xA0)
1745                 goto invalid_code;
1746               code = c & 0x7F;
1747             }
1748           else
1749             {
                   /* Leading code + two position codes.  */
1750               if (! (charset = emacs_mule_charset[c]))
1751                 goto invalid_code;
1752               ONE_MORE_BYTE (c);
1753               if (c < 0xA0)
1754                 goto invalid_code;
1755               code = (c & 0x7F) << 8;
1756               ONE_MORE_BYTE (c);
1757               if (c < 0xA0)
1758                 goto invalid_code;
1759               code |= c & 0x7F;
1760             }
1761           break;
1762
1763         case 4:
               /* Leading code + byte selecting the charset + two
                  position codes.  */
1764           ONE_MORE_BYTE (c);
1765           if (c < 0 || ! (charset = emacs_mule_charset[c]))
1766             goto invalid_code;
1767           ONE_MORE_BYTE (c);
1768           if (c < 0xA0)
1769             goto invalid_code;
1770           code = (c & 0x7F) << 8;
1771           ONE_MORE_BYTE (c);
1772           if (c < 0xA0)
1773             goto invalid_code;
1774           code |= c & 0x7F;
1775           break;
1776
1777         case 1:
               /* A single byte: ASCII or an eight-bit byte.  */
1778           code = c;
1779           charset = CHARSET_FROM_ID (ASCII_BYTE_P (code)
1780                                      ? charset_ascii : charset_eight_bit);
1781           break;
1782
1783         default:
1784           abort ();
1785         }
1786       c = DECODE_CHAR (charset, code);
1787       if (c < 0)
1788         goto invalid_code;
1789     }
1790   *nbytes = src - src_base;
1791   *nchars = consumed_chars;
1792   if (id)
1793     *id = charset->id;
1794   return c;
1795
1796  no_more_source:
1797   return -2;
1798
1799  invalid_code:
1800   return -1;
1801 }
1802
1803
1804 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1805    Check if a text is encoded in `emacs-mule'.  If it is, return 1,
1806    else return 0.
        CATEGORY_MASK_EMACS_MULE is always added to DETECT_INFO->checked.
        When 0 is returned it is also added to DETECT_INFO->rejected.
        When 1 is returned it is added to DETECT_INFO->found only if at
        least one multi-byte sequence or composition sequence was seen.  */
1807
1808 static int
1809 detect_coding_emacs_mule (coding, detect_info)
1810      struct coding_system *coding;
1811      struct coding_detection_info *detect_info;
1812 {
1813   const unsigned char *src = coding->source, *src_base;
1814   const unsigned char *src_end = coding->source + coding->src_bytes;
1815   int multibytep = coding->src_multibyte;
1816   int consumed_chars = 0;
1817   int c;
       /* Becomes CATEGORY_MASK_EMACS_MULE once positive evidence is seen.  */
1818   int found = 0;
1819
1820   detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1821   /* A coding system of this category is always ASCII compatible.  */
1822   src += coding->head_ascii;
1823
       /* Leaving this loop with `break' means rejection.  The loop is
          otherwise left through `no_more_source', presumably reached from
          ONE_MORE_BYTE when the source is exhausted.  */
1824   while (1)
1825     {
1826       src_base = src;
1827       ONE_MORE_BYTE (c);
1828       if (c < 0)
1829         continue;
1830       if (c == 0x80)
1831         {
1832           /* Perhaps the start of a composite character.  We simply skip
1833              it because analyzing it is too heavy for detecting.  But,
1834              at least, we check that the composite character
1835              consists of more than 4 bytes.  */
               /* NOTE(review): this declaration shadows the outer
                  `src_base'.  After this block the outer variable still
                  points at the 0x80 byte read above, and that is the one
                  used by `emacs_mule_bytes[*src_base]' further down --
                  confirm this is intended.  */
1836           const unsigned char *src_base;
1837
1838         repeat:
1839           src_base = src;
1840           do
1841             {
1842               ONE_MORE_BYTE (c);
1843             }
1844           while (c >= 0xA0);
1845
1846           if (src - src_base <= 4)
1847             break;
1848           found = CATEGORY_MASK_EMACS_MULE;
1849           if (c == 0x80)
1850             goto repeat;
1851         }
1852
1853       if (c < 0x80)
1854         {
               /* ESC, SI and SO reject this category.  */
1855           if (c < 0x20
1856               && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
1857             break;
1858         }
1859       else
1860         {
               /* Number of bytes expected after the leading byte.  */
1861           int more_bytes = emacs_mule_bytes[*src_base] - 1;
1862
1863           while (more_bytes > 0)
1864             {
1865               ONE_MORE_BYTE (c);
1866               if (c < 0xA0)
1867                 {
1868                   src--;        /* Unread the last byte.  */
1869                   break;
1870                 }
1871               more_bytes--;
1872             }
               /* A sequence that was cut short is invalid.  */
1873           if (more_bytes != 0)
1874             break;
1875           found = CATEGORY_MASK_EMACS_MULE;
1876         }
1877     }
1878   detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1879   return 0;
1880
1881  no_more_source:
       /* The source ended inside a sequence.  That is a rejection only if
          no more data will follow.  */
1882   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1883     {
1884       detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1885       return 0;
1886     }
1887   detect_info->found |= found;
1888   return 1;
1889 }
1890
1891
1892 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
1893
1894 /* Decode a character represented as a component of a composition
1895    sequence of Emacs 20/21 style at SRC.  Store that character in
1896    *BUF, increment BUF, advance SRC to the head of the next character
1897    (or an encoded composition rule), and add the count reported by
1898    emacs_mule_char to CONSUMED_CHARS.  If SRC is already at SRC_END, or
1899    emacs_mule_char returns -2, `break' out of the innermost enclosing
        loop at the point of use.  For any other negative return value,
        jump to the label `invalid_code'.
        The expansion has the form `if (1) { ... } else' so that the
        semicolon written after a use becomes the empty `else' branch.  */
1900
1901 #define DECODE_EMACS_MULE_COMPOSITION_CHAR(buf)                 \
1902   if (1)                                                        \
1903     {                                                           \
1904       int c;                                                    \
1905       int nbytes, nchars;                                       \
1906                                                                 \
1907       if (src == src_end)                                       \
1908         break;                                                  \
1909       c = emacs_mule_char (coding, src, &nbytes, &nchars, NULL);\
1910       if (c < 0)                                                \
1911         {                                                       \
1912           if (c == -2)                                          \
1913             break;                                              \
1914           goto invalid_code;                                    \
1915         }                                                       \
1916       *buf++ = c;                                               \
1917       src += nbytes;                                            \
1918       consumed_chars += nchars;                                 \
1919     }                                                           \
1920   else
1921
1922
1923 /* Decode a composition rule represented as a component of a
1924    composition sequence of Emacs 20 style at SRC.  The rule is one
1925    byte whose value minus 0xA0 must be in the range 0..80.  Store the
1926    decoded rule in *BUF and increment BUF.  If SRC is at or beyond
        SRC_END, or the byte is out of range, jump to the label
        `invalid_code'.  */
1927
1928 #define DECODE_EMACS_MULE_COMPOSITION_RULE_20(buf)      \
1929   do {                                                  \
1930     int c, gref, nref;                                  \
1931                                                         \
1932     if (src >= src_end)                                 \
1933       goto invalid_code;                                \
1934     ONE_MORE_BYTE_NO_CHECK (c);                         \
1935     c -= 0xA0;                                          \
1936     if (c < 0 || c >= 81)                               \
1937       goto invalid_code;                                \
1938                                                         \
1939     gref = c / 9, nref = c % 9;                         \
1940     *buf++ = COMPOSITION_ENCODE_RULE (gref, nref);      \
1941   } while (0)
1942
1943
1944 /* Decode a composition rule represented as a component of a
1945    composition sequence of Emacs 21 style at SRC.  The rule is two
1946    bytes, GREF+0x20 followed by NREF+0x20, where GREF and NREF must
1947    each be in the range 0..80.  Store the decoded rule in *BUF and
        increment BUF.  If fewer than two bytes remain before SRC_END, or
        either value is out of range, jump to the label `invalid_code'.  */
1948
1949 #define DECODE_EMACS_MULE_COMPOSITION_RULE_21(buf)      \
1950   do {                                                  \
1951     int gref, nref;                                     \
1952                                                         \
1953     if (src + 1>= src_end)                              \
1954       goto invalid_code;                                \
1955     ONE_MORE_BYTE_NO_CHECK (gref);                      \
1956     gref -= 0x20;                                       \
1957     ONE_MORE_BYTE_NO_CHECK (nref);                      \
1958     nref -= 0x20;                                       \
1959     if (gref < 0 || gref >= 81                          \
1960         || nref < 0 || nref >= 81)                      \
1961       goto invalid_code;                                \
1962     *buf++ = COMPOSITION_ENCODE_RULE (gref, nref);      \
1963   } while (0)
1964
1965 /* Decode composition data of Emacs 21 style.  C holds the METHOD byte,
        which the caller has already read; the composition method is
        C - 0xF2.  Read the BYTES and CHARS bytes (each is 0xA0 plus the
        value), add the annotation with ADD_COMPOSITION_DATA, and, unless
        the method is COMPOSITION_RELATIVE, decode the components into
        CHARBUF.  Components at odd positions are decoded as rules unless
        the method is COMPOSITION_WITH_ALTCHARS.  Afterwards the first
        annotation element has the number of decoded components
        subtracted from it.  Jump to `invalid_code' if a header byte is
        negative, BYTES is less than 3, or the components end early.
        NOTE(review): the loop limit is CONSUMED_CHARS_BASE plus the byte
        length NBYTES but is compared with the character count
        CONSUMED_CHARS -- confirm the two are meant to be comparable.  */
1966 #define DECODE_EMACS_MULE_21_COMPOSITION(c)                             \
1967   do {                                                                  \
1968     /* Emacs 21 style format.  The first three bytes at SRC are         \
1969        (METHOD - 0xF2), (BYTES - 0xA0), (CHARS - 0xA0), where BYTES is  \
1970        the byte length of this composition information, CHARS is the    \
1971        number of characters composed by this composition.  */           \
1972     enum composition_method method = c - 0xF2;                          \
1973     int *charbuf_base = charbuf;                                        \
1974     int consumed_chars_limit;                                           \
1975     int nbytes, nchars;                                                 \
1976                                                                         \
1977     ONE_MORE_BYTE (c);                                                  \
1978     if (c < 0)                                                          \
1979       goto invalid_code;                                                \
1980     nbytes = c - 0xA0;                                                  \
1981     if (nbytes < 3)                                                     \
1982       goto invalid_code;                                                \
1983     ONE_MORE_BYTE (c);                                                  \
1984     if (c < 0)                                                          \
1985       goto invalid_code;                                                \
1986     nchars = c - 0xA0;                                                  \
1987     ADD_COMPOSITION_DATA (charbuf, nchars, method);                     \
1988     consumed_chars_limit = consumed_chars_base + nbytes;                \
1989     if (method != COMPOSITION_RELATIVE)                                 \
1990       {                                                                 \
1991         int i = 0;                                                      \
1992         while (consumed_chars < consumed_chars_limit)                   \
1993           {                                                             \
1994             if (i % 2 && method != COMPOSITION_WITH_ALTCHARS)           \
1995               DECODE_EMACS_MULE_COMPOSITION_RULE_21 (charbuf);          \
1996             else                                                        \
1997               DECODE_EMACS_MULE_COMPOSITION_CHAR (charbuf);             \
1998             i++;                                                        \
1999           }                                                             \
2000         if (consumed_chars < consumed_chars_limit)                      \
2001           goto invalid_code;                                            \
2002         charbuf_base[0] -= i;                                           \
2003       }                                                                 \
2004   } while (0)
2005
2006 /* Decode a relative composition of Emacs 20 style.  Rewind SRC to
        SRC_BASE, skip the 0x80 byte again, and decode up to
        MAX_COMPOSITION_COMPONENTS characters while the next byte is at
        least 0xA0.  If fewer than two characters were decoded, jump to
        `invalid_code'.  Otherwise add the annotation with
        ADD_COMPOSITION_DATA and copy the characters to CHARBUF.
        NOTE(review): the loop condition reads *SRC before
        DECODE_EMACS_MULE_COMPOSITION_CHAR compares SRC with SRC_END --
        confirm the byte at SRC_END is always readable.  */
2007 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION(c)                    \
2008   do {                                                                  \
2009     /* Emacs 20 style format for relative composition.  */              \
2010     /* Store multibyte form of characters to be composed.  */           \
2011     enum composition_method method = COMPOSITION_RELATIVE;              \
2012     int components[MAX_COMPOSITION_COMPONENTS * 2 - 1];                 \
2013     int *buf = components;                                              \
2014     int i, j;                                                           \
2015                                                                         \
2016     src = src_base;                                                     \
2017     ONE_MORE_BYTE (c);          /* skip 0x80 */                         \
2018     for (i = 0; *src >= 0xA0 && i < MAX_COMPOSITION_COMPONENTS; i++)    \
2019       DECODE_EMACS_MULE_COMPOSITION_CHAR (buf);                         \
2020     if (i < 2)                                                          \
2021       goto invalid_code;                                                \
2022     ADD_COMPOSITION_DATA (charbuf, i, method);                          \
2023     for (j = 0; j < i; j++)                                             \
2024       *charbuf++ = components[j];                                       \
2025   } while (0)
2026
2027 /* Decode a rule-base composition of Emacs 20 style.  Decode one
        character, then (rule, character) pairs while the next byte is at
        least 0xA0, collecting them alternately in a local array.  Jump to
        `invalid_code' if no pair was decoded or the number of collected
        components is even.  Jump to `no_more_source' if CHARBUF lacks
        room.  Otherwise add the annotation with ADD_COMPOSITION_DATA,
        copy all components (characters and rules) to CHARBUF, subtract
        their number from the first annotation element, and then copy the
        characters alone (every other component).
        NOTE(review): *SRC is read without a check against SRC_END --
        confirm the byte at SRC_END is always readable.  */
2028 #define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION(c)            \
2029   do {                                                          \
2030     /* Emacs 20 style format for rule-base composition.  */     \
2031     /* Store multibyte form of characters to be composed.  */   \
2032     enum composition_method method = COMPOSITION_WITH_RULE;     \
2033     int *charbuf_base = charbuf;                                \
2034     int components[MAX_COMPOSITION_COMPONENTS * 2 - 1];         \
2035     int *buf = components;                                      \
2036     int i, j;                                                   \
2037                                                                 \
2038     DECODE_EMACS_MULE_COMPOSITION_CHAR (buf);                   \
2039     for (i = 1; i < MAX_COMPOSITION_COMPONENTS; i++)            \
2040       {                                                         \
2041         if (*src < 0xA0)                                        \
2042           break;                                                \
2043         DECODE_EMACS_MULE_COMPOSITION_RULE_20 (buf);            \
2044         DECODE_EMACS_MULE_COMPOSITION_CHAR (buf);               \
2045       }                                                         \
2046     if (i <= 1 || (buf - components) % 2 == 0)                  \
2047       goto invalid_code;                                        \
2048     if (charbuf + i + (i / 2) + 1 >= charbuf_end)               \
2049       goto no_more_source;                                      \
2050     ADD_COMPOSITION_DATA (charbuf, i, method);                  \
2051     i = i * 2 - 1;                                              \
2052     for (j = 0; j < i; j++)                                     \
2053       *charbuf++ = components[j];                               \
2054     charbuf_base[0] -= i;                                       \
2055     for (j = 0; j < i; j += 2)                                  \
2056       *charbuf++ = components[j];                               \
2057   } while (0)
2058
2059 /* Decode the emacs-mule bytes of CODING->source, starting at offset
        CODING->consumed, and append the resulting characters and
        annotations to CODING->charbuf.  Stop when the character buffer
        (less MAX_ANNOTATION_LENGTH elements kept in reserve) is full or
        the source runs out.  A byte sequence that cannot be decoded is
        consumed one byte at a time and counted in CODING->errors.
        On return CODING->consumed, CODING->consumed_char and
        CODING->charbuf_used describe the state at the start of the last
        unit that was not completely decoded.  */
2060 static void
2061 decode_coding_emacs_mule (coding)
2062      struct coding_system *coding;
2063 {
2064   const unsigned char *src = coding->source + coding->consumed;
2065   const unsigned char *src_end = coding->source + coding->src_bytes;
2066   const unsigned char *src_base;
2067   int *charbuf = coding->charbuf + coding->charbuf_used;
2068   int *charbuf_end
2069     = coding->charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
2070   int consumed_chars = 0, consumed_chars_base;
2071   int multibytep = coding->src_multibyte;
2072   Lisp_Object attrs, charset_list;
2073   int char_offset = coding->produced_char;
       /* Start offset and charset id of the current run of characters
          that share one charset.  */
2074   int last_offset = char_offset;
2075   int last_id = charset_ascii;
2076
2077   CODING_GET_INFO (coding, attrs, charset_list);
2078
2079   while (1)
2080     {
2081       int c;
2082
           /* Remember where this unit starts so that it can be re-read
              or left unconsumed.  */
2083       src_base = src;
2084       consumed_chars_base = consumed_chars;
2085
2086       if (charbuf >= charbuf_end)
2087         break;
2088
2089       ONE_MORE_BYTE (c);
2090       if (c < 0)
2091         {
               /* A negative value stands for the character -C.  */
2092           *charbuf++ = -c;
2093           char_offset++;
2094         }
2095       else if (c < 0x80)
2096         {
2097           *charbuf++ = c;
2098           char_offset++;
2099         }
2100       else if (c == 0x80)
2101         {
               /* Composition data; the next byte selects the format.  */
2102           ONE_MORE_BYTE (c);
2103           if (c < 0)
2104             goto invalid_code;
2105           if (c - 0xF2 >= COMPOSITION_RELATIVE
2106               && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)
2107             DECODE_EMACS_MULE_21_COMPOSITION (c);
2108           else if (c < 0xC0)
2109             DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION (c);
2110           else if (c == 0xFF)
2111             DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION (c);
2112           else
2113             goto invalid_code;
2114         }
2115       else if (c < 0xA0 && emacs_mule_bytes[c] > 1)
2116         {
2117           int nbytes, nchars;
2118           int id;
2119
               /* Rewind and let emacs_mule_char decode the whole
                  multi-byte sequence.  */
2120           src = src_base;
2121           consumed_chars = consumed_chars_base;
2122           c = emacs_mule_char (coding, src, &nbytes, &nchars, &id);
2123           if (c < 0)
2124             {
                   /* -2 means the source ended inside the sequence.  */
2125               if (c == -2)
2126                 break;
2127               goto invalid_code;
2128             }
2129           if (last_id != id)
2130             {
                   /* The charset changed: annotate the run that just
                      ended, unless it was ASCII.  */
2131               if (last_id != charset_ascii)
2132                 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
2133               last_id = id;
2134               last_offset = char_offset;
2135             }
2136           *charbuf++ = c;
2137           src += nbytes;
2138           consumed_chars += nchars;
2139           char_offset++;
2140         }
2141       else
2142         goto invalid_code;
2143       continue;
2144
2145     invalid_code:
           /* Re-read only the first byte of the bad sequence and pass it
              through as ASCII or as an eight-bit byte character.  */
2146       src = src_base;
2147       consumed_chars = consumed_chars_base;
2148       ONE_MORE_BYTE (c);
2149       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
2150       char_offset++;
2151       coding->errors++;
2152     }
2153
2154  no_more_source:
       /* Annotate the last run if it was not ASCII.  */
2155   if (last_id != charset_ascii)
2156     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
2157   coding->consumed_char += consumed_chars_base;
2158   coding->consumed = src_base - coding->source;
2159   coding->charbuf_used = charbuf - coding->charbuf;
2160 }
2161
2162 /* Store in CODES[0] and CODES[1] the leading codes for the emacs-mule
        charset id ID.  An ID below 0xA0 is itself the only leading code
        and CODES[1] is set to 0.  Otherwise CODES[0] is one of 0x9A..0x9D,
        chosen by the range ID falls in, and CODES[1] is ID.
        ID and CODES are evaluated more than once, so they must not have
        side effects.  The expansion has no trailing semicolon; write one
        after each use.  */
2163 #define EMACS_MULE_LEADING_CODES(id, codes)     \
2164   do {                                          \
2165     if ((id) < 0xA0)                            \
2166       (codes)[0] = (id), (codes)[1] = 0;        \
2167     else if ((id) < 0xE0)                       \
2168       (codes)[0] = 0x9A, (codes)[1] = (id);     \
2169     else if ((id) < 0xF0)                       \
2170       (codes)[0] = 0x9B, (codes)[1] = (id);     \
2171     else if ((id) < 0xF5)                       \
2172       (codes)[0] = 0x9C, (codes)[1] = (id);     \
2173     else                                        \
2174       (codes)[0] = 0x9D, (codes)[1] = (id);     \
2175   } while (0)
2176
2177 /* Encode the characters in CODING->charbuf in emacs-mule format and
        store the bytes at CODING->destination + CODING->produced.
        The charset list of CODING is forced to Vemacs_mule_charset_list.
        A negative element of the character buffer starts an annotation
        of -C elements.  A charset annotation selects a preferred charset
        for the following characters; composition annotations are skipped
        (not yet implemented).
        ASCII characters and eight-bit byte characters are written as one
        byte.  Any other character is written as the leading code(s) of
        its charset followed by one or two position codes with the high
        bit set.  A character found in no charset of the list is replaced
        by CODING->default_char.
        Finally record CODING_RESULT_SUCCESS, update CODING->produced and
        CODING->produced_char, and return 0.  */
2178 static int
2179 encode_coding_emacs_mule (coding)
2180      struct coding_system *coding;
2181 {
2182   int multibytep = coding->dst_multibyte;
2183   int *charbuf = coding->charbuf;
2184   int *charbuf_end = charbuf + coding->charbuf_used;
2185   unsigned char *dst = coding->destination + coding->produced;
2186   unsigned char *dst_end = coding->destination + coding->dst_bytes;
2187   int safe_room = 8;
2188   int produced_chars = 0;
2189   Lisp_Object attrs, charset_list;
2190   int c;
       /* Charset id taken from the last charset annotation, or -1.  */
2191   int preferred_charset_id = -1;
2192
2193   CODING_GET_INFO (coding, attrs, charset_list);
2194   if (! EQ (charset_list, Vemacs_mule_charset_list))
2195     {
2196       CODING_ATTR_CHARSET_LIST (attrs)
2197         = charset_list = Vemacs_mule_charset_list;
2198     }
2199
2200   while (charbuf < charbuf_end)
2201     {
2202       ASSURE_DESTINATION (safe_room);
2203       c = *charbuf++;
2204
2205       if (c < 0)
2206         {
2207           /* Handle an annotation.  */
2208           switch (*charbuf)
2209             {
2210             case CODING_ANNOTATE_COMPOSITION_MASK:
2211               /* Not yet implemented.  */
2212               break;
2213             case CODING_ANNOTATE_CHARSET_MASK:
2214               preferred_charset_id = charbuf[3];
                   /* Ignore a preferred charset that is not in the list.  */
2215               if (preferred_charset_id >= 0
2216                   && NILP (Fmemq (make_number (preferred_charset_id),
2217                                   charset_list)))
2218                 preferred_charset_id = -1;
2219               break;
2220             default:
2221               abort ();
2222             }
               /* Skip the rest of the annotation.  */
2223           charbuf += -c - 1;
2224           continue;
2225         }
2226
2227       if (ASCII_CHAR_P (c))
2228         EMIT_ONE_ASCII_BYTE (c);
2229       else if (CHAR_BYTE8_P (c))
2230         {
2231           c = CHAR_TO_BYTE8 (c);
2232           EMIT_ONE_BYTE (c);
2233         }
2234       else
2235         {
2236           struct charset *charset;
2237           unsigned code;
2238           int dimension;
2239           int emacs_mule_id;
2240           unsigned char leading_codes[2];
2241
2242           if (preferred_charset_id >= 0)
2243             {
2244               charset = CHARSET_FROM_ID (preferred_charset_id);
                   /* CODE is emitted below, so it must be set on both
                      paths.  Previously neither path set it: the
                      fallback passed NULL to char_charset and the
                      preferred charset was used without computing the
                      code point, leaving CODE uninitialized.  */
2245               if (CHAR_CHARSET_P (c, charset))
                     code = ENCODE_CHAR (charset, c);
                   else
2246                 charset = char_charset (c, charset_list, &code);
2247             }
2248           else
2249             charset = char_charset (c, charset_list, &code);
2250           if (! charset)
2251             {
                   /* C is in no charset of the list; use the default
                      character instead.  */
2252               c = coding->default_char;
2253               if (ASCII_CHAR_P (c))
2254                 {
2255                   EMIT_ONE_ASCII_BYTE (c);
2256                   continue;
2257                 }
2258               charset = char_charset (c, charset_list, &code);
2259             }
2260           dimension = CHARSET_DIMENSION (charset);
2261           emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2262           EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2263           EMIT_ONE_BYTE (leading_codes[0]);
2264           if (leading_codes[1])
2265             EMIT_ONE_BYTE (leading_codes[1]);
               /* Position codes carry the high bit.  */
2266           if (dimension == 1)
2267             EMIT_ONE_BYTE (code | 0x80);
2268           else
2269             {
2270               code |= 0x8080;
2271               EMIT_ONE_BYTE (code >> 8);
2272               EMIT_ONE_BYTE (code & 0xFF);
2273             }
2274         }
2275     }
2276   record_conversion_result (coding, CODING_RESULT_SUCCESS);
2277   coding->produced_char += produced_chars;
2278   coding->produced = dst - coding->destination;
2279   return 0;
2280 }
2281
2282 \f
2283 /*** 7. ISO2022 handlers ***/
2284
2285 /* The following note describes the coding system ISO2022 briefly.
2286    Since the intention of this note is to help understand the
2287    functions in this file, some parts are NOT ACCURATE or are OVERLY
2288    SIMPLIFIED.  For thorough understanding, please refer to the
2289    original document of ISO2022.  This is equivalent to the standard
2290    ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2291
2292    ISO2022 provides many mechanisms to encode several character sets
2293    in 7-bit and 8-bit environments.  For 7-bit environments, all text
2294    is encoded using bytes less than 128.  This may make the encoded
2295    text a little bit longer, but the text passes more easily through
2296    several types of gateway, some of which strip off the MSB (Most
2297    Significant Bit).
2298
2299    There are two kinds of character sets: control character sets and
2300    graphic character sets.  The former contain control characters such
2301    as `newline' and `escape' to provide control functions (control
2302    functions are also provided by escape sequences).  The latter
2303    contain graphic characters such as 'A' and '-'.  Emacs recognizes
2304    two control character sets and many graphic character sets.
2305
2306    Graphic character sets are classified into one of the following
2307    four classes, according to the number of bytes (DIMENSION) and
2308    number of characters in one dimension (CHARS) of the set:
2309    - DIMENSION1_CHARS94
2310    - DIMENSION1_CHARS96
2311    - DIMENSION2_CHARS94
2312    - DIMENSION2_CHARS96
2313
2314    In addition, each character set is assigned an identification tag,
2315    unique for each set, called the "final character" (denoted as <F>
2316    hereafter).  The <F> of each character set is decided by ECMA(*)
2317    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
2318    (0x30..0x3F are for private use only).
2319
2320    Note (*): ECMA = European Computer Manufacturers Association
2321
2322    Here are examples of graphic character sets [NAME(<F>)]:
2323         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2324         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2325         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2326         o DIMENSION2_CHARS96 -- none for the moment
2327
2328    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2329         C0 [0x00..0x1F] -- control character plane 0
2330         GL [0x20..0x7F] -- graphic character plane 0
2331         C1 [0x80..0x9F] -- control character plane 1
2332         GR [0xA0..0xFF] -- graphic character plane 1
2333
2334    A control character set is directly designated and invoked to C0 or
2335    C1 by an escape sequence.  The most common case is that:
2336    - ISO646's  control character set is designated/invoked to C0, and
2337    - ISO6429's control character set is designated/invoked to C1,
2338    and usually these designations/invocations are omitted in encoded
2339    text.  In a 7-bit environment, only C0 can be used, and a control
2340    character for C1 is encoded by an appropriate escape sequence to
2341    fit into the environment.  All control characters for C1 are
2342    defined to have corresponding escape sequences.
2343
2344    A graphic character set is at first designated to one of four
2345    graphic registers (G0 through G3), then these graphic registers are
2346    invoked to GL or GR.  These designations and invocations can be
2347    done independently.  The most common case is that G0 is invoked to
2348    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
2349    these invocations and designations are omitted in encoded text.
2350    In a 7-bit environment, only GL can be used.
2351
2352    When a graphic character set of CHARS94 is invoked to GL, codes
2353    0x20 and 0x7F of the GL area work as control characters SPACE and
2354    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2355    be used.
2356
2357    There are two ways of invocation: locking-shift and single-shift.
2358    With locking-shift, the invocation lasts until the next different
2359    invocation, whereas with single-shift, the invocation affects the
2360    following character only and doesn't affect the locking-shift
2361    state.  Invocations are done by the following control characters or
2362    escape sequences:
2363
2364    ----------------------------------------------------------------------
2365    abbrev  function                  cntrl escape seq   description
2366    ----------------------------------------------------------------------
2367    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
2368    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
2369    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
2370    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
2371    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
2372    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
2373    LS3R    (locking-shift-3 right)   none  ESC '|'      invoke G3 into GR (*)
2374    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
2375    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
2376    ----------------------------------------------------------------------
2377    (*) These are not used by any known coding system.
2378
2379    Control characters for these functions are defined by macros
2380    ISO_CODE_XXX in `coding.h'.
2381
2382    Designations are done by the following escape sequences:
2383    ----------------------------------------------------------------------
2384    escape sequence      description
2385    ----------------------------------------------------------------------
2386    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
2387    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
2388    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
2389    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
2390    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
2391    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
2392    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
2393    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
2394    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
2395    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
2396    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
2397    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
2398    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
2399    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
2400    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
2401    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
2402    ----------------------------------------------------------------------
2403
2404    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2405    of dimension 1, chars 94, and final character <F>, etc...
2406
2407    Note (*): Although these designations are not allowed in ISO2022,
2408    Emacs accepts them on decoding, and produces them on encoding
2409    CHARS96 character sets in a coding system which is characterized as
2410    7-bit environment, non-locking-shift, and non-single-shift.
2411
2412    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2413    '(' must be omitted.  We refer to this as "short-form" hereafter.
2414
2415    Now you may notice that there are a lot of ways of encoding the
2416    same multilingual text in ISO2022.  Actually, there exist many
2417    coding systems such as Compound Text (used in X11's inter client
2418    communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2419    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2420    localized platforms), and all of these are variants of ISO2022.
2421
2422    In addition to the above, Emacs handles two more kinds of escape
2423    sequences: ISO6429's direction specification and Emacs' private
2424    sequence for specifying character composition.
2425
2426    ISO6429's direction specification takes the following form:
2427         o CSI ']'      -- end of the current direction
2428         o CSI '0' ']'  -- end of the current direction
2429         o CSI '1' ']'  -- start of left-to-right text
2430         o CSI '2' ']'  -- start of right-to-left text
2431    The control character CSI (0x9B: control sequence introducer) is
2432    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2433
2434    Character composition specification takes the following form:
2435         o ESC '0' -- start relative composition
2436         o ESC '1' -- end composition
2437         o ESC '2' -- start rule-base composition (*)
2438         o ESC '3' -- start relative composition with alternate chars  (**)
2439         o ESC '4' -- start rule-base composition with alternate chars  (**)
2440   Since these are not standard escape sequences of any ISO standard,
2441   the use of them with these meanings is restricted to Emacs only.
2442
2443   (*) This form is used only in Emacs 20.7 and older versions,
2444   but newer versions can safely decode it.
2445   (**) This form is used only in Emacs 21.1 and newer versions,
2446   and older versions can't decode it.
2447
2448   Here's a list of example usages of these composition escape
2449   sequences (categorized by `enum composition_method').
2450
2451   COMPOSITION_RELATIVE:
2452         ESC 0 CHAR [ CHAR ] ESC 1
2453   COMPOSITION_WITH_RULE:
2454         ESC 2 CHAR [ RULE CHAR ] ESC 1
2455   COMPOSITION_WITH_ALTCHARS:
2456         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2457   COMPOSITION_WITH_RULE_ALTCHARS:
2458         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2459
/* Table of 256 ISO 2022 code classes, indexed by byte value.
   decode_coding_iso_2022 switches on iso_code_class[c1] to decide how
   to handle each source byte.  */
enum iso_code_class_type iso_code_class[256];
2461
/* Return nonzero if the charset whose ID is ID is safe for CODING.

   CODING->safe_charsets is a byte table indexed by charset ID, and
   CODING->max_charset_id is its last valid index.  An entry with the
   high bit set marks an unsafe charset (setup_iso_safe_charsets
   pre-fills the table with 255).

   A negative ID is never safe and is never used as an index.  The
   entry is read as `signed char' so that the test also rejects 255
   on platforms where plain `char' is unsigned.  ID is evaluated more
   than once.  */
#define SAFE_CHARSET_P(coding, id)                      \
  ((id) >= 0                                            \
   && (id) <= (coding)->max_charset_id                  \
   && (signed char) (coding)->safe_charsets[id] >= 0)
2465
2466
/* Nonzero if CODING_ISO_INITIAL for register 1 of the coding system
   stored in coding_categories[CATEGORY] is non-negative.  Presumably
   this means some charset is initially designated to G1, so that a
   shift-out is meaningful -- confirm against CODING_ISO_INITIAL.  */
#define SHIFT_OUT_OK(category)  \
  (0 <= CODING_ISO_INITIAL (&coding_categories[category], 1))
2469
2470 static void
2471 setup_iso_safe_charsets (attrs)
2472      Lisp_Object attrs;
2473 {
2474   Lisp_Object charset_list, safe_charsets;
2475   Lisp_Object request;
2476   Lisp_Object reg_usage;
2477   Lisp_Object tail;
2478   int reg94, reg96;
2479   int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2480   int max_charset_id;
2481
2482   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2483   if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2484       && ! EQ (charset_list, Viso_2022_charset_list))
2485     {
2486       CODING_ATTR_CHARSET_LIST (attrs)
2487         = charset_list = Viso_2022_charset_list;
2488       ASET (attrs, coding_attr_safe_charsets, Qnil);
2489     }
2490
2491   if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2492     return;
2493
2494   max_charset_id = 0;
2495   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2496     {
2497       int id = XINT (XCAR (tail));
2498       if (max_charset_id < id)
2499         max_charset_id = id;
2500     }
2501
2502   safe_charsets = Fmake_string (make_number (max_charset_id + 1),
2503                                 make_number (255));
2504   request = AREF (attrs, coding_attr_iso_request);
2505   reg_usage = AREF (attrs, coding_attr_iso_usage);
2506   reg94 = XINT (XCAR (reg_usage));
2507   reg96 = XINT (XCDR (reg_usage));
2508
2509   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2510     {
2511       Lisp_Object id;
2512       Lisp_Object reg;
2513       struct charset *charset;
2514
2515       id = XCAR (tail);
2516       charset = CHARSET_FROM_ID (XINT (id));
2517       reg = Fcdr (Fassq (id, request));
2518       if (! NILP (reg))
2519         SSET (safe_charsets, XINT (id), XINT (reg));
2520       else if (charset->iso_chars_96)
2521         {
2522           if (reg96 < 4)
2523             SSET (safe_charsets, XINT (id), reg96);
2524         }
2525       else
2526         {
2527           if (reg94 < 4)
2528             SSET (safe_charsets, XINT (id), reg94);
2529         }
2530     }
2531   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2532 }
2533
2534
2535 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2536    Check if a text is encoded in one of ISO-2022 based codig systems.
2537    If it is, return 1, else return 0.  */
2538
2539 static int
2540 detect_coding_iso_2022 (coding, detect_info)
2541      struct coding_system *coding;
2542      struct coding_detection_info *detect_info;
2543 {
2544   const unsigned char *src = coding->source, *src_base = src;
2545   const unsigned char *src_end = coding->source + coding->src_bytes;
2546   int multibytep = coding->src_multibyte;
2547   int single_shifting = 0;
2548   int id;
2549   int c, c1;
2550   int consumed_chars = 0;
2551   int i;
2552   int rejected = 0;
2553   int found = 0;
2554
2555   detect_info->checked |= CATEGORY_MASK_ISO;
2556
2557   for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
2558     {
2559       struct coding_system *this = &(coding_categories[i]);
2560       Lisp_Object attrs, val;
2561
2562       attrs = CODING_ID_ATTRS (this->id);
2563       if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2564           && ! EQ (CODING_ATTR_SAFE_CHARSETS (attrs), Viso_2022_charset_list))
2565         setup_iso_safe_charsets (attrs);
2566       val = CODING_ATTR_SAFE_CHARSETS (attrs);
2567       this->max_charset_id = SCHARS (val) - 1;
2568       this->safe_charsets = (char *) SDATA (val);
2569     }
2570
2571   /* A coding system of this category is always ASCII compatible.  */
2572   src += coding->head_ascii;
2573
2574   while (rejected != CATEGORY_MASK_ISO)
2575     {
2576       src_base = src;
2577       ONE_MORE_BYTE (c);
2578       switch (c)
2579         {
2580         case ISO_CODE_ESC:
2581           if (inhibit_iso_escape_detection)
2582             break;
2583           single_shifting = 0;
2584           ONE_MORE_BYTE (c);
2585           if (c >= '(' && c <= '/')
2586             {
2587               /* Designation sequence for a charset of dimension 1.  */
2588               ONE_MORE_BYTE (c1);
2589               if (c1 < ' ' || c1 >= 0x80
2590                   || (id = iso_charset_table[0][c >= ','][c1]) < 0)
2591                 /* Invalid designation sequence.  Just ignore.  */
2592                 break;
2593             }
2594           else if (c == '$')
2595             {
2596               /* Designation sequence for a charset of dimension 2.  */
2597               ONE_MORE_BYTE (c);
2598               if (c >= '@' && c <= 'B')
2599                 /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
2600                 id = iso_charset_table[1][0][c];
2601               else if (c >= '(' && c <= '/')
2602                 {
2603                   ONE_MORE_BYTE (c1);
2604                   if (c1 < ' ' || c1 >= 0x80
2605                       || (id = iso_charset_table[1][c >= ','][c1]) < 0)
2606                     /* Invalid designation sequence.  Just ignore.  */
2607                     break;
2608                 }
2609               else
2610                 /* Invalid designation sequence.  Just ignore it.  */
2611                 break;
2612             }
2613           else if (c == 'N' || c == 'O')
2614             {
2615               /* ESC <Fe> for SS2 or SS3.  */
2616               single_shifting = 1;
2617               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
2618               break;
2619             }
2620           else if (c >= '0' && c <= '4')
2621             {
2622               /* ESC <Fp> for start/end composition.  */
2623               found |= CATEGORY_MASK_ISO;
2624               break;
2625             }
2626           else
2627             {
2628               /* Invalid escape sequence.  Just ignore it.  */
2629               break;
2630             }
2631
2632           /* We found a valid designation sequence for CHARSET.  */
2633           rejected |= CATEGORY_MASK_ISO_8BIT;
2634           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
2635                               id))
2636             found |= CATEGORY_MASK_ISO_7;
2637           else
2638             rejected |= CATEGORY_MASK_ISO_7;
2639           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
2640                               id))
2641             found |= CATEGORY_MASK_ISO_7_TIGHT;
2642           else
2643             rejected |= CATEGORY_MASK_ISO_7_TIGHT;
2644           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
2645                               id))
2646             found |= CATEGORY_MASK_ISO_7_ELSE;
2647           else
2648             rejected |= CATEGORY_MASK_ISO_7_ELSE;
2649           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
2650                               id))
2651             found |= CATEGORY_MASK_ISO_8_ELSE;
2652           else
2653             rejected |= CATEGORY_MASK_ISO_8_ELSE;
2654           break;
2655
2656         case ISO_CODE_SO:
2657         case ISO_CODE_SI:
2658           /* Locking shift out/in.  */
2659           if (inhibit_iso_escape_detection)
2660             break;
2661           single_shifting = 0;
2662           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
2663           break;
2664
2665         case ISO_CODE_CSI:
2666           /* Control sequence introducer.  */
2667           single_shifting = 0;
2668           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
2669           found |= CATEGORY_MASK_ISO_8_ELSE;
2670           goto check_extra_latin;
2671
2672         case ISO_CODE_SS2:
2673         case ISO_CODE_SS3:
2674           /* Single shift.   */
2675           if (inhibit_iso_escape_detection)
2676             break;
2677           single_shifting = 0;
2678           rejected |= CATEGORY_MASK_ISO_7BIT;
2679           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2680               & CODING_ISO_FLAG_SINGLE_SHIFT)
2681             found |= CATEGORY_MASK_ISO_8_1, single_shifting = 1;
2682           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
2683               & CODING_ISO_FLAG_SINGLE_SHIFT)
2684             found |= CATEGORY_MASK_ISO_8_2, single_shifting = 1;
2685           if (single_shifting)
2686             break;
2687           goto check_extra_latin;
2688
2689         default:
2690           if (c < 0)
2691             continue;
2692           if (c < 0x80)
2693             {
2694               single_shifting = 0;
2695               break;
2696             }
2697           if (c >= 0xA0)
2698             {
2699               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
2700               found |= CATEGORY_MASK_ISO_8_1;
2701               /* Check the length of succeeding codes of the range
2702                  0xA0..0FF.  If the byte length is even, we include
2703                  CATEGORY_MASK_ISO_8_2 in `found'.  We can check this
2704                  only when we are not single shifting.  */
2705               if (! single_shifting
2706                   && ! (rejected & CATEGORY_MASK_ISO_8_2))
2707                 {
2708                   int i = 1;
2709                   while (src < src_end)
2710                     {
2711                       ONE_MORE_BYTE (c);
2712                       if (c < 0xA0)
2713                         break;
2714                       i++;
2715                     }
2716
2717                   if (i & 1 && src < src_end)
2718                     rejected |= CATEGORY_MASK_ISO_8_2;
2719                   else
2720                     found |= CATEGORY_MASK_ISO_8_2;
2721                 }
2722               break;
2723             }
2724         check_extra_latin:
2725           single_shifting = 0;
2726           if (! VECTORP (Vlatin_extra_code_table)
2727               || NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
2728             {
2729               rejected = CATEGORY_MASK_ISO;
2730               break;
2731             }
2732           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2733               & CODING_ISO_FLAG_LATIN_EXTRA)
2734             found |= CATEGORY_MASK_ISO_8_1;
2735           else
2736             rejected |= CATEGORY_MASK_ISO_8_1;
2737           rejected |= CATEGORY_MASK_ISO_8_2;
2738         }
2739     }
2740   detect_info->rejected |= CATEGORY_MASK_ISO;
2741   return 0;
2742
2743  no_more_source:
2744   detect_info->rejected |= rejected;
2745   detect_info->found |= (found & ~rejected);
2746   return 1;
2747 }
2748
2749
/* Designate to graphic register REG of `coding' the charset of
   dimension DIM, chars-96 flag CHARS_96, and final byte FINAL.

   The designation is invalid if FINAL is outside '0'..127, if
   ISO_CHARSET_TABLE has no charset for it, or if the charset is not
   safe for `coding'.  REG is then marked with -2.  Otherwise the
   charset ID is recorded, after replacing JISX0201-Roman by ASCII
   under CODING_ISO_FLAG_USE_ROMAN and JISX0208.1978 by JISX0208
   under CODING_ISO_FLAG_USE_OLDJIS.

   CHARS_96 must be an lvalue.  It is set to -1 when the escape
   sequence should be kept: for an invalid designation, and for an
   ASCII designation to a register whose previous designation was
   invalid.  */
#define DECODE_DESIGNATION(reg, dim, chars_96, final)                   \
  do {                                                                  \
    int id;                                                             \
                                                                        \
    if (final >= '0' && final < 128                                     \
        && (id = ISO_CHARSET_TABLE (dim, chars_96, final)) >= 0         \
        && SAFE_CHARSET_P (coding, id))                                 \
      {                                                                 \
        int prev = CODING_ISO_DESIGNATION (coding, reg);                \
                                                                        \
        if (id == charset_jisx0201_roman)                               \
          {                                                             \
            if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)  \
              id = charset_ascii;                                       \
          }                                                             \
        else if (id == charset_jisx0208_1978)                           \
          {                                                             \
            if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
              id = charset_jisx0208;                                    \
          }                                                             \
        CODING_ISO_DESIGNATION (coding, reg) = id;                      \
        /* ASCII coming back to a register that held an invalid         \
           designation: keep this sequence too.  */                     \
        if (prev == -2 && id == charset_ascii)                          \
          chars_96 = -1;                                                \
      }                                                                 \
    else                                                                \
      {                                                                 \
        /* Unknown or unsafe charset.  */                               \
        CODING_ISO_DESIGNATION (coding, reg) = -2;                      \
        chars_96 = -1;                                                  \
      }                                                                 \
  } while (0)
2782
2783
/* Abandon a composition that is still in progress, if any, and emit
   the characters collected in `components' into `charbuf' as plain
   characters.  If `charbuf' lacks room for component_idx entries,
   jump to `no_more_source' instead.

   For COMPOSITION_RELATIVE and COMPOSITION_WITH_ALTCHARS every
   collected entry is emitted and char_offset advances by
   component_idx.  For the other methods only every second entry is
   emitted and char_offset advances by component_idx / 2 + 1.  */
#define MAYBE_FINISH_COMPOSITION()                              \
  do {                                                          \
    if (composition_state != COMPOSING_NO)                      \
      {                                                         \
        int stride, k;                                          \
                                                                \
        if (charbuf + component_idx > charbuf_end)              \
          goto no_more_source;                                  \
        composition_state = COMPOSING_NO;                       \
        stride = ((method == COMPOSITION_RELATIVE               \
                   || method == COMPOSITION_WITH_ALTCHARS)      \
                  ? 1 : 2);                                     \
        for (k = 0; k < component_idx; k += stride)             \
          *charbuf++ = components[k];                           \
        char_offset += (stride == 1                             \
                        ? component_idx                         \
                        : (component_idx / 2) + 1);             \
      }                                                         \
  } while (0)
2808
2809
/* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
   ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
   ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
   ESC 3 : altchar composition :  ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
   ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1

   C1 is the byte that followed ESC.  An ESC 0 seen while components
   of an ESC 3 / ESC 4 composition are being read only switches to
   reading the composed characters.  Otherwise any composition in
   progress is finished first, and a new one is started only if
   `charbuf' has room for MAX_COMPOSITION_COMPONENTS entries and the
   terminating ESC 1 is found in the rest of the source.  Failing
   either condition jumps to `no_more_source'.  */

#define DECODE_COMPOSITION_START(c1)                                    \
  do {                                                                  \
    if (c1 == '0'                                                       \
        && composition_state == COMPOSING_COMPONENT_RULE)               \
      {                                                                 \
        component_len = component_idx;                                  \
        composition_state = COMPOSING_CHAR;                             \
      }                                                                 \
    else                                                                \
      {                                                                 \
        const unsigned char *p;                                         \
                                                                        \
        MAYBE_FINISH_COMPOSITION ();                                    \
        if (charbuf + MAX_COMPOSITION_COMPONENTS > charbuf_end)         \
          goto no_more_source;                                          \
        for (p = src; p < src_end - 1; p++)                             \
          if (*p == ISO_CODE_ESC && p[1] == '1')                        \
            break;                                                      \
        /* `>=' and not `==': when the start sequence is the last       \
           thing in the source, SRC equals SRC_END and P is already     \
           beyond SRC_END - 1 without any ESC 1 having been seen.  */   \
        if (p >= src_end - 1)                                           \
          {                                                             \
            /* The current composition doesn't end in the current       \
               source.  */                                              \
            record_conversion_result                                    \
              (coding, CODING_RESULT_INSUFFICIENT_SRC);                 \
            goto no_more_source;                                        \
          }                                                             \
                                                                        \
        /* This is surely the start of a composition.  */               \
        method = (c1 == '0' ? COMPOSITION_RELATIVE                      \
                  : c1 == '2' ? COMPOSITION_WITH_RULE                   \
                  : c1 == '3' ? COMPOSITION_WITH_ALTCHARS               \
                  : COMPOSITION_WITH_RULE_ALTCHARS);                    \
        composition_state = (c1 <= '2' ? COMPOSING_CHAR                 \
                             : COMPOSING_COMPONENT_CHAR);               \
        component_idx = component_len = 0;                              \
      }                                                                 \
  } while (0)
2854
2855
/* Handle composition end sequence ESC 1.

   Write the composition annotation into `charbuf' with
   ADD_COMPOSITION_DATA.  Unless the method is COMPOSITION_RELATIVE,
   append the collected components (all of them when component_len
   is 0, else the first component_len) and store the negated length
   of annotation plus components in the annotation's first slot.
   Then emit the composed characters themselves, advancing
   char_offset by one per character, and leave composing state.  */

#define DECODE_COMPOSITION_END()                                        \
  do {                                                                  \
    int *header = charbuf;                                              \
    int nchars, from, step, k;                                          \
                                                                        \
    if (component_len > 0)                                              \
      nchars = component_idx - component_len;                           \
    else if (method == COMPOSITION_RELATIVE)                            \
      nchars = component_idx;                                           \
    else                                                                \
      nchars = (component_idx + 1) / 2;                                 \
                                                                        \
    ADD_COMPOSITION_DATA (charbuf, nchars, method);                     \
    if (method != COMPOSITION_RELATIVE)                                 \
      {                                                                 \
        int ncomponents = (component_len == 0                           \
                           ? component_idx : component_len);            \
                                                                        \
        for (k = 0; k < ncomponents; k++)                               \
          *charbuf++ = components[k];                                   \
        *header = header - charbuf;                                     \
      }                                                                 \
    if (method == COMPOSITION_WITH_RULE)                                \
      {                                                                 \
        /* Characters and rules alternate; skip the rules.  */          \
        from = 0;                                                       \
        step = 2;                                                       \
      }                                                                 \
    else                                                                \
      {                                                                 \
        from = component_len;                                           \
        step = 1;                                                       \
      }                                                                 \
    for (k = from; k < component_idx; k += step, char_offset++)         \
      *charbuf++ = components[k];                                       \
    coding->annotated = 1;                                              \
    composition_state = COMPOSING_NO;                                   \
  } while (0)
2886
2887
/* Decode a composition rule from the byte C1 (and maybe one more byte
   read from SRC into `c2') and store the encoded rule back into C1.

   After subtracting 32, a value below 81 is the old one-byte format
   (before ver.21): it holds two references as VALUE / 9 and
   VALUE % 9, where a reference of 4 is mapped to 10.  A value from
   81 to 92 is the new format (after ver.21), whose second reference
   comes from the next byte.  Any larger value yields 0.  */

#define DECODE_COMPOSITION_RULE(c1)                                     \
  do {                                                                  \
    (c1) -= 32;                                                         \
    if (c1 >= 93)                                                       \
      c1 = 0;                                                           \
    else if (c1 >= 81)          /* new format (after ver.21) */         \
      {                                                                 \
        ONE_MORE_BYTE (c2);                                             \
        c1 = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32);                \
      }                                                                 \
    else                        /* old format (before ver.21) */        \
      {                                                                 \
        int gref = (c1) / 9;                                            \
        int nref = (c1) % 9;                                            \
                                                                        \
        if (gref == 4)                                                  \
          gref = 10;                                                    \
        if (nref == 4)                                                  \
          nref = 10;                                                    \
        c1 = COMPOSITION_ENCODE_RULE (gref, nref);                      \
      }                                                                 \
  } while (0)
2911
2912
2913 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
2914
2915 static void
2916 decode_coding_iso_2022 (coding)
2917      struct coding_system *coding;
2918 {
2919   const unsigned char *src = coding->source + coding->consumed;
2920   const unsigned char *src_end = coding->source + coding->src_bytes;
2921   const unsigned char *src_base;
2922   int *charbuf = coding->charbuf + coding->charbuf_used;
2923   int *charbuf_end
2924     = coding->charbuf + coding->charbuf_size - 4 - MAX_ANNOTATION_LENGTH;
2925   int consumed_chars = 0, consumed_chars_base;
2926   int multibytep = coding->src_multibyte;
2927   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
2928   int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2929   int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
2930   int charset_id_2, charset_id_3;
2931   struct charset *charset;
2932   int c;
2933   /* For handling composition sequence.  */
2934 #define COMPOSING_NO                    0
2935 #define COMPOSING_CHAR                  1
2936 #define COMPOSING_RULE                  2
2937 #define COMPOSING_COMPONENT_CHAR        3
2938 #define COMPOSING_COMPONENT_RULE        4
2939
2940   int composition_state = COMPOSING_NO;
2941   enum composition_method method;
2942   int components[MAX_COMPOSITION_COMPONENTS * 2 + 1];
2943   int component_idx;
2944   int component_len;
2945   Lisp_Object attrs, charset_list;
2946   int char_offset = coding->produced_char;
2947   int last_offset = char_offset;
2948   int last_id = charset_ascii;
2949
2950   CODING_GET_INFO (coding, attrs, charset_list);
2951   setup_iso_safe_charsets (attrs);
2952   /* Charset list may have been changed.  */
2953   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2954   coding->safe_charsets = (char *) SDATA (CODING_ATTR_SAFE_CHARSETS(attrs));
2955
2956   while (1)
2957     {
2958       int c1, c2;
2959
2960       src_base = src;
2961       consumed_chars_base = consumed_chars;
2962
2963       if (charbuf >= charbuf_end)
2964         break;
2965
2966       ONE_MORE_BYTE (c1);
2967       if (c1 < 0)
2968         goto invalid_code;
2969
2970       /* We produce at most one character.  */
2971       switch (iso_code_class [c1])
2972         {
2973         case ISO_0x20_or_0x7F:
2974           if (composition_state != COMPOSING_NO)
2975             {
2976               if (composition_state == COMPOSING_RULE
2977                   || composition_state == COMPOSING_COMPONENT_RULE)
2978                 {
2979                   DECODE_COMPOSITION_RULE (c1);
2980                   components[component_idx++] = c1;
2981                   composition_state--;
2982                   continue;
2983                 }
2984             }
2985           if (charset_id_0 < 0
2986               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
2987             /* This is SPACE or DEL.  */
2988             charset = CHARSET_FROM_ID (charset_ascii);
2989           else
2990             charset = CHARSET_FROM_ID (charset_id_0);
2991           break;
2992
2993         case ISO_graphic_plane_0:
2994           if (composition_state != COMPOSING_NO)
2995             {
2996               if (composition_state == COMPOSING_RULE
2997                   || composition_state == COMPOSING_COMPONENT_RULE)
2998                 {
2999                   DECODE_COMPOSITION_RULE (c1);
3000                   components[component_idx++] = c1;
3001                   composition_state--;
3002                   continue;
3003                 }
3004             }
3005           if (charset_id_0 < 0)
3006             charset = CHARSET_FROM_ID (charset_ascii);
3007           else
3008             charset = CHARSET_FROM_ID (charset_id_0);
3009           break;
3010
3011         case ISO_0xA0_or_0xFF:
3012           if (charset_id_1 < 0
3013               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
3014               || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3015             goto invalid_code;
3016           /* This is a graphic character, we fall down ... */
3017
3018         case ISO_graphic_plane_1:
3019           if (charset_id_1 < 0)
3020             goto invalid_code;
3021           charset = CHARSET_FROM_ID (charset_id_1);
3022           break;
3023
3024         case ISO_control_0:
3025           MAYBE_FINISH_COMPOSITION ();
3026           charset = CHARSET_FROM_ID (charset_ascii);
3027           break;
3028
3029         case ISO_control_1:
3030           MAYBE_FINISH_COMPOSITION ();
3031           goto invalid_code;
3032
3033         case ISO_shift_out:
3034           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3035               || CODING_ISO_DESIGNATION (coding, 1) < 0)
3036             goto invalid_code;
3037           CODING_ISO_INVOCATION (coding, 0) = 1;
3038           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3039           continue;
3040
3041         case ISO_shift_in:
3042           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
3043             goto invalid_code;
3044           CODING_ISO_INVOCATION (coding, 0) = 0;
3045           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3046           continue;
3047
3048         case ISO_single_shift_2_7:
3049         case ISO_single_shift_2:
3050           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3051             goto invalid_code;
3052           /* SS2 is handled as an escape sequence of ESC 'N' */
3053           c1 = 'N';
3054           goto label_escape_sequence;
3055
3056         case ISO_single_shift_3:
3057           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3058             goto invalid_code;
3059           /* SS3 is handled as an escape sequence of ESC 'O' */
3060           c1 = 'O';
3061           goto label_escape_sequence;
3062
3063         case ISO_control_sequence_introducer:
3064           /* CSI is handled as an escape sequence of ESC '[' ...  */
3065           c1 = '[';
3066           goto label_escape_sequence;
3067
3068         case ISO_escape:
3069           ONE_MORE_BYTE (c1);
3070         label_escape_sequence:
3071           /* Escape sequences handled here are invocation,
3072              designation, direction specification, and character
3073              composition specification.  */
3074           switch (c1)
3075             {
3076             case '&':           /* revision of following character set */
3077               ONE_MORE_BYTE (c1);
3078               if (!(c1 >= '@' && c1 <= '~'))
3079                 goto invalid_code;
3080               ONE_MORE_BYTE (c1);
3081               if (c1 != ISO_CODE_ESC)
3082                 goto invalid_code;
3083               ONE_MORE_BYTE (c1);
3084               goto label_escape_sequence;
3085
3086             case '$':           /* designation of 2-byte character set */
3087               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3088                 goto invalid_code;
3089               {
3090                 int reg, chars96;
3091
3092                 ONE_MORE_BYTE (c1);
3093                 if (c1 >= '@' && c1 <= 'B')
3094                   {     /* designation of JISX0208.1978, GB2312.1980,
3095                            or JISX0208.1980 */
3096                     reg = 0, chars96 = 0;
3097                   }
3098                 else if (c1 >= 0x28 && c1 <= 0x2B)
3099                   { /* designation of DIMENSION2_CHARS94 character set */
3100                     reg = c1 - 0x28, chars96 = 0;
3101                     ONE_MORE_BYTE (c1);
3102                   }
3103                 else if (c1 >= 0x2C && c1 <= 0x2F)
3104                   { /* designation of DIMENSION2_CHARS96 character set */
3105                     reg = c1 - 0x2C, chars96 = 1;
3106                     ONE_MORE_BYTE (c1);
3107                   }
3108                 else
3109                   goto invalid_code;
3110                 DECODE_DESIGNATION (reg, 2, chars96, c1);
3111                 /* We must update these variables now.  */
3112                 if (reg == 0)
3113                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3114                 else if (reg == 1)
3115                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3116                 if (chars96 < 0)
3117                   goto invalid_code;
3118               }
3119               continue;
3120
3121             case 'n':           /* invocation of locking-shift-2 */
3122               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3123                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3124                 goto invalid_code;
3125               CODING_ISO_INVOCATION (coding, 0) = 2;
3126               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3127               continue;
3128
3129             case 'o':           /* invocation of locking-shift-3 */
3130               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3131                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3132                 goto invalid_code;
3133               CODING_ISO_INVOCATION (coding, 0) = 3;
3134               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3135               continue;
3136
3137             case 'N':           /* invocation of single-shift-2 */
3138               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3139                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3140                 goto invalid_code;
3141               charset_id_2 = CODING_ISO_DESIGNATION (coding, 2);
3142               if (charset_id_2 < 0)
3143                 charset = CHARSET_FROM_ID (charset_ascii);
3144               else
3145                 charset = CHARSET_FROM_ID (charset_id_2);
3146               ONE_MORE_BYTE (c1);
3147               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3148                 goto invalid_code;
3149               break;
3150
3151             case 'O':           /* invocation of single-shift-3 */
3152               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3153                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3154                 goto invalid_code;
3155               charset_id_3 = CODING_ISO_DESIGNATION (coding, 3);
3156               if (charset_id_3 < 0)
3157                 charset = CHARSET_FROM_ID (charset_ascii);
3158               else
3159                 charset = CHARSET_FROM_ID (charset_id_3);
3160               ONE_MORE_BYTE (c1);
3161               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3162                 goto invalid_code;
3163               break;
3164
3165             case '0': case '2': case '3': case '4': /* start composition */
3166               if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3167                 goto invalid_code;
3168               DECODE_COMPOSITION_START (c1);
3169               continue;
3170
3171             case '1':           /* end composition */
3172               if (composition_state == COMPOSING_NO)
3173                 goto invalid_code;
3174               DECODE_COMPOSITION_END ();
3175               continue;
3176
3177             case '[':           /* specification of direction */
3178               if (! CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION)
3179                 goto invalid_code;
3180               /* For the moment, nested direction is not supported.
3181                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
3182                  left-to-right, and nonzero means right-to-left.  */
3183               ONE_MORE_BYTE (c1);
3184               switch (c1)
3185                 {
3186                 case ']':       /* end of the current direction */
3187                   coding->mode &= ~CODING_MODE_DIRECTION;
3188
3189                 case '0':       /* end of the current direction */
3190                 case '1':       /* start of left-to-right direction */
3191                   ONE_MORE_BYTE (c1);
3192                   if (c1 == ']')
3193                     coding->mode &= ~CODING_MODE_DIRECTION;
3194                   else
3195                     goto invalid_code;
3196                   break;
3197
3198                 case '2':       /* start of right-to-left direction */
3199                   ONE_MORE_BYTE (c1);
3200                   if (c1 == ']')
3201                     coding->mode |= CODING_MODE_DIRECTION;
3202                   else
3203                     goto invalid_code;
3204                   break;
3205
3206                 default:
3207                   goto invalid_code;
3208                 }
3209               continue;
3210
3211             case '%':
3212               ONE_MORE_BYTE (c1);
3213               if (c1 == '/')
3214                 {
3215                   /* CTEXT extended segment:
3216                      ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3217                      We keep these bytes as is for the moment.
3218                      They may be decoded by post-read-conversion.  */
3219                   int dim, M, L;
3220                   int size;
3221
3222                   ONE_MORE_BYTE (dim);
3223                   ONE_MORE_BYTE (M);
3224                   ONE_MORE_BYTE (L);
3225                   size = ((M - 128) * 128) + (L - 128);
3226                   if (charbuf + 8 + size > charbuf_end)
3227                     goto break_loop;
3228                   *charbuf++ = ISO_CODE_ESC;
3229                   *charbuf++ = '%';
3230                   *charbuf++ = '/';
3231                   *charbuf++ = dim;
3232                   *charbuf++ = BYTE8_TO_CHAR (M);
3233                   *charbuf++ = BYTE8_TO_CHAR (L);
3234                   while (size-- > 0)
3235                     {
3236                       ONE_MORE_BYTE (c1);
3237                       *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3238                     }
3239                 }
3240               else if (c1 == 'G')
3241                 {
3242                   /* XFree86 extension for embedding UTF-8 in CTEXT:
3243                      ESC % G --UTF-8-BYTES-- ESC % @
3244                      We keep these bytes as is for the moment.
3245                      They may be decoded by post-read-conversion.  */
3246                   int *p = charbuf;
3247
3248                   if (p + 6 > charbuf_end)
3249                     goto break_loop;
3250                   *p++ = ISO_CODE_ESC;
3251                   *p++ = '%';
3252                   *p++ = 'G';
3253                   while (p < charbuf_end)
3254                     {
3255                       ONE_MORE_BYTE (c1);
3256                       if (c1 == ISO_CODE_ESC
3257                           && src + 1 < src_end
3258                           && src[0] == '%'
3259                           && src[1] == '@')
3260                         {
3261                           src += 2;
3262                           break;
3263                         }
3264                       *p++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3265                     }
3266                   if (p + 3 > charbuf_end)
3267                     goto break_loop;
3268                   *p++ = ISO_CODE_ESC;
3269                   *p++ = '%';
3270                   *p++ = '@';
3271                   charbuf = p;
3272                 }
3273               else
3274                 goto invalid_code;
3275               continue;
3276               break;
3277
3278             default:
3279               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3280                 goto invalid_code;
3281               {
3282                 int reg, chars96;
3283
3284                 if (c1 >= 0x28 && c1 <= 0x2B)
3285                   { /* designation of DIMENSION1_CHARS94 character set */
3286                     reg = c1 - 0x28, chars96 = 0;
3287                     ONE_MORE_BYTE (c1);
3288                   }
3289                 else if (c1 >= 0x2C && c1 <= 0x2F)
3290                   { /* designation of DIMENSION1_CHARS96 character set */
3291                     reg = c1 - 0x2C, chars96 = 1;
3292                     ONE_MORE_BYTE (c1);
3293                   }
3294                 else
3295                   goto invalid_code;
3296                 DECODE_DESIGNATION (reg, 1, chars96, c1);
3297                 /* We must update these variables now.  */
3298                 if (reg == 0)
3299                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3300                 else if (reg == 1)
3301                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3302                 if (chars96 < 0)
3303                   goto invalid_code;
3304               }
3305               continue;
3306             }
3307         }
3308
3309       if (charset->id != charset_ascii
3310           && last_id != charset->id)
3311         {
3312           if (last_id != charset_ascii)
3313             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3314           last_id = charset->id;
3315           last_offset = char_offset;
3316         }
3317
3318       /* Now we know CHARSET and 1st position code C1 of a character.
3319          Produce a decoded character while getting 2nd position code
3320          C2 if necessary.  */
3321       c1 &= 0x7F;
3322       if (CHARSET_DIMENSION (charset) > 1)
3323         {
3324           ONE_MORE_BYTE (c2);
3325           if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
3326             /* C2 is not in a valid range.  */
3327             goto invalid_code;
3328           c1 = (c1 << 8) | (c2 & 0x7F);
3329           if (CHARSET_DIMENSION (charset) > 2)
3330             {
3331               ONE_MORE_BYTE (c2);
3332               if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
3333                 /* C2 is not in a valid range.  */
3334                 goto invalid_code;
3335               c1 = (c1 << 8) | (c2 & 0x7F);
3336             }
3337         }
3338
3339       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3340       if (c < 0)
3341         {
3342           MAYBE_FINISH_COMPOSITION ();
3343           for (; src_base < src; src_base++, char_offset++)
3344             {
3345               if (ASCII_BYTE_P (*src_base))
3346                 *charbuf++ = *src_base;
3347               else
3348                 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3349             }
3350         }
3351       else if (composition_state == COMPOSING_NO)
3352         {
3353           *charbuf++ = c;
3354           char_offset++;
3355         }
3356       else
3357         {
3358           components[component_idx++] = c;
3359           if (method == COMPOSITION_WITH_RULE
3360               || (method == COMPOSITION_WITH_RULE_ALTCHARS
3361                   && composition_state == COMPOSING_COMPONENT_CHAR))
3362             composition_state++;
3363         }
3364       continue;
3365
3366     invalid_code:
3367       MAYBE_FINISH_COMPOSITION ();
3368       src = src_base;
3369       consumed_chars = consumed_chars_base;
3370       ONE_MORE_BYTE (c);
3371       *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
3372       char_offset++;
3373       coding->errors++;
3374       continue;
3375
3376     break_loop:
3377       break;
3378     }
3379
3380  no_more_source:
3381   if (last_id != charset_ascii)
3382     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3383   coding->consumed_char += consumed_chars_base;
3384   coding->consumed = src_base - coding->source;
3385   coding->charbuf_used = charbuf - coding->charbuf;
3386 }
3387
3388
3389 /* ISO2022 encoding stuff.  */
3390
3391 /*
3392    It is not enough to say just "ISO2022" on encoding, we have to
3393    specify more details.  In Emacs, each coding system of ISO2022
3394    variant has the following specifications:
3395         1. Initial designation to G0 thru G3.
3396         2. Allows short-form designation?
3397         3. ASCII should be designated to G0 before control characters?
3398         4. ASCII should be designated to G0 at end of line?
3399         5. 7-bit environment or 8-bit environment?
3400         6. Use locking-shift?
3401         7. Use Single-shift?
3402    And the following two are only for Japanese:
3403         8. Use ASCII in place of JIS0201-1976-Roman?
3404         9. Use JISX0208-1983 in place of JISX0208-1978?
3405    These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
3406    defined by macros CODING_ISO_FLAG_XXX.  See `coding.h' for more
3407    details.
3408 */
3409
/* Emit at DST the escape sequence that designates CHARSET to graphic
   register REG, advancing DST, and record the new designation in
   CODING.  When CODING uses revision numbers and CHARSET has a
   non-negative revision, the sequence is preceded by
   `ESC & <'@' + revision>'.  For a multi-dimensional 94-character
   set designated to register 0 whose <final-char> is '@', 'A', or
   'B', the short form (no intermediate character) is produced unless
   CODING requires the long form.  */

#define ENCODE_DESIGNATION(charset, reg, coding)                        \
  do {                                                                  \
    unsigned char final_char = CHARSET_ISO_FINAL (charset);             \
    /* Intermediate characters selecting G0..G3, indexed by REG.  */    \
    char *intermediate_char_94 = "()*+";                                \
    char *intermediate_char_96 = ",-./";                                \
    int revision = -1;                                                  \
    int c;                                                              \
                                                                        \
    if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION)           \
      revision = CHARSET_ISO_REVISION (charset);                        \
                                                                        \
    if (revision >= 0)                                                  \
      {                                                                 \
        EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&');                       \
        EMIT_ONE_BYTE ('@' + revision);                                 \
      }                                                                 \
                                                                        \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC);                                 \
    if (CHARSET_DIMENSION (charset) != 1)                               \
      {                                                                 \
        /* Multi-byte character set: ESC '$' [intermediate] final.  */  \
        EMIT_ONE_ASCII_BYTE ('$');                                      \
        if (CHARSET_ISO_CHARS_96 (charset))                             \
          {                                                             \
            EMIT_ONE_ASCII_BYTE (intermediate_char_96[reg]);            \
          }                                                             \
        else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM  \
                 || reg != 0                                            \
                 || final_char < '@' || final_char > 'B')               \
          {                                                             \
            EMIT_ONE_ASCII_BYTE (intermediate_char_94[reg]);            \
          }                                                             \
      }                                                                 \
    else                                                                \
      {                                                                 \
        /* Single-byte character set: ESC intermediate final.  */       \
        if (CHARSET_ISO_CHARS_96 (charset))                             \
          c = intermediate_char_96[reg];                                \
        else                                                            \
          c = intermediate_char_94[reg];                                \
        EMIT_ONE_ASCII_BYTE (c);                                        \
      }                                                                 \
    EMIT_ONE_ASCII_BYTE (final_char);                                   \
                                                                        \
    CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset);        \
  } while (0)
3457
3458
3459 /* The following two macros produce codes (control character or escape
3460    sequence) for ISO2022 single-shift functions (single-shift-2 and
3461    single-shift-3).  */
3462
#define ENCODE_SINGLE_SHIFT_2                                           \
  do {                                                                  \
    /* Without the seven-bit restriction the single byte ISO_CODE_SS2   \
       is emitted; otherwise the two-byte sequence ESC 'N' is used.  */ \
    if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))     \
      EMIT_ONE_BYTE (ISO_CODE_SS2);                                     \
    else                                                                \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N');                         \
    /* Note that a single shift is pending for the next character.  */  \
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
  } while (0)
3471
3472
#define ENCODE_SINGLE_SHIFT_3                                           \
  do {                                                                  \
    /* Without the seven-bit restriction the single byte ISO_CODE_SS3   \
       is emitted; otherwise the two-byte sequence ESC 'O' is used.  */ \
    if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))     \
      EMIT_ONE_BYTE (ISO_CODE_SS3);                                     \
    else                                                                \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O');                         \
    /* Note that a single shift is pending for the next character.  */  \
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
  } while (0)
3481
3482
3483 /* The following four macros produce codes (control character or
3484    escape sequence) for ISO2022 locking-shift functions (shift-in,
3485    shift-out, locking-shift-2, and locking-shift-3).  */
3486
#define ENCODE_SHIFT_IN                                         \
  do {                                                          \
    /* Emit ISO_CODE_SI, then record that graphic register 0    \
       is the one invoked to graphic plane 0.  */               \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_SI);                          \
    CODING_ISO_INVOCATION (coding, 0) = 0;                      \
  } while (0)
3492
3493
#define ENCODE_SHIFT_OUT                                        \
  do {                                                          \
    /* Emit ISO_CODE_SO, then record that graphic register 1    \
       is the one invoked to graphic plane 0.  */               \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_SO);                          \
    CODING_ISO_INVOCATION (coding, 0) = 1;                      \
  } while (0)
3499
3500
#define ENCODE_LOCKING_SHIFT_2                                  \
  do {                                                          \
    /* Emit ESC 'n', then record that graphic register 2 is     \
       the one invoked to graphic plane 0.  */                  \
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');                   \
    CODING_ISO_INVOCATION (coding, 0) = 2;                      \
  } while (0)
3506
3507
/* Locking-shift-3: emit ESC 'o' and record that graphic register 3 is
   now invoked to graphic plane 0.  The final byte must be 'o'; ESC 'n'
   is locking-shift-2, and the ISO-2022 decoder in this file maps 'n'
   to register 2 and 'o' to register 3.  */
#define ENCODE_LOCKING_SHIFT_3                          \
  do {                                                  \
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'o');           \
    CODING_ISO_INVOCATION (coding, 0) = 3;              \
  } while (0)
3513
3514
/* Produce codes for a DIMENSION1 character whose character set is
   CHARSET and whose position-code is C1.  Designation and invocation
   sequences are also produced in advance if necessary.

   CHARSET must be a modifiable lvalue: when CODING_ISO_FLAG_USE_ROMAN
   is set and CHARSET is ASCII, it is replaced by the charset whose id
   is charset_jisx0201_roman.  C1 is parenthesized at each use, so an
   expression may safely be passed for it.  The macro refers to
   `coding', `dst', and `produced_chars' from the scope where it is
   expanded.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
  do {                                                                  \
    int id = CHARSET_ID (charset);                                      \
                                                                        \
    if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)         \
        && id == charset_ascii)                                         \
      {                                                                 \
        id = charset_jisx0201_roman;                                    \
        charset = CHARSET_FROM_ID (id);                                 \
      }                                                                 \
                                                                        \
    if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
      {                                                                 \
        /* A single shift is pending: this character consumes it.  */   \
        if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
          EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                            \
        else                                                            \
          EMIT_ONE_BYTE ((c1) | 0x80);                                  \
        CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
        break;                                                          \
      }                                                                 \
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
      {                                                                 \
        /* Invoked to graphic plane 0: emit with the high bit clear.  */\
        EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                              \
        break;                                                          \
      }                                                                 \
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
      {                                                                 \
        /* Invoked to graphic plane 1: emit with the high bit set.  */  \
        EMIT_ONE_BYTE ((c1) | 0x80);                                    \
        break;                                                          \
      }                                                                 \
    else                                                                \
      /* Since CHARSET is not yet invoked to any graphic planes, we     \
         must invoke it, or, at first, designate it to some graphic     \
         register.  Then repeat the loop to actually produce the        \
         character.  */                                                 \
      dst = encode_invocation_designation (charset, coding, dst,        \
                                           &produced_chars);            \
  } while (1)
3557
3558
/* Emit the two position codes C1 and C2 of a DIMENSION2 character
   belonging to CHARSET, first emitting whatever designation and
   invocation codes are needed to make CHARSET usable.  CHARSET must
   be a modifiable lvalue: with CODING_ISO_FLAG_USE_OLDJIS set, the
   charset whose id is charset_jisx0208 is replaced by the one whose
   id is charset_jisx0208_1978.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
  do {                                                                  \
    int id = CHARSET_ID (charset);                                      \
                                                                        \
    if (id == charset_jisx0208                                          \
        && (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS))    \
      {                                                                 \
        id = charset_jisx0208_1978;                                     \
        charset = CHARSET_FROM_ID (id);                                 \
      }                                                                 \
                                                                        \
    if (! CODING_ISO_SINGLE_SHIFTING (coding)                           \
        && id != CODING_ISO_INVOKED_CHARSET (coding, 0)                 \
        && id != CODING_ISO_INVOKED_CHARSET (coding, 1))                \
      {                                                                 \
        /* CHARSET is not invoked to either graphic plane yet: emit     \
           the needed designation and/or invocation, then go round      \
           the loop again to produce the character itself.  */          \
        dst = encode_invocation_designation (charset, coding, dst,      \
                                             &produced_chars);          \
        continue;                                                       \
      }                                                                 \
                                                                        \
    if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
      {                                                                 \
        /* A single shift is pending: this character consumes it.  */   \
        if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
          EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);              \
        else                                                            \
          EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                    \
        CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
      }                                                                 \
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
      EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);                  \
    else                                                                \
      EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                        \
    break;                                                              \
  } while (1)
3601
3602
/* Encode character C of CHARSET: obtain its code with ENCODE_CHAR and
   hand it to the DIMENSION1 macro when CHARSET has dimension 1, or
   otherwise to the DIMENSION2 macro as the pair (code >> 8,
   code & 0xFF).  */
#define ENCODE_ISO_CHARACTER(charset, c)                                   \
  do {                                                                     \
    int code = ENCODE_CHAR ((charset),(c));                                \
                                                                           \
    if (CHARSET_DIMENSION (charset) != 1)                                  \
      ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
    else                                                                   \
      ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code);                   \
  } while (0)
3612
3613
3614 /* Produce designation and invocation codes at a place pointed by DST
3615    to use CHARSET.  The element `spec.iso_2022' of *CODING is updated.
3616    Return new DST.  */
3617
3618 unsigned char *
3619 encode_invocation_designation (charset, coding, dst, p_nchars)
3620      struct charset *charset;
3621      struct coding_system *coding;
3622      unsigned char *dst;
3623      int *p_nchars;
3624 {
3625   int multibytep = coding->dst_multibyte;
3626   int produced_chars = *p_nchars;
3627   int reg;                      /* graphic register number */
3628   int id = CHARSET_ID (charset);
3629
3630   /* At first, check designations.  */
3631   for (reg = 0; reg < 4; reg++)
3632     if (id == CODING_ISO_DESIGNATION (coding, reg))
3633       break;
3634
3635   if (reg >= 4)
3636     {
3637       /* CHARSET is not yet designated to any graphic registers.  */
3638       /* At first check the requested designation.  */
3639       reg = CODING_ISO_REQUEST (coding, id);
3640       if (reg < 0)
3641         /* Since CHARSET requests no special designation, designate it
3642            to graphic register 0.  */
3643         reg = 0;
3644
3645       ENCODE_DESIGNATION (charset, reg, coding);
3646     }
3647
3648   if (CODING_ISO_INVOCATION (coding, 0) != reg
3649       && CODING_ISO_INVOCATION (coding, 1) != reg)
3650     {
3651       /* Since the graphic register REG is not invoked to any graphic
3652          planes, invoke it to graphic plane 0.  */
3653       switch (reg)
3654         {
3655         case 0:                 /* graphic register 0 */
3656           ENCODE_SHIFT_IN;
3657           break;
3658
3659         case 1:                 /* graphic register 1 */
3660           ENCODE_SHIFT_OUT;
3661           break;
3662
3663         case 2:                 /* graphic register 2 */
3664           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3665             ENCODE_SINGLE_SHIFT_2;
3666           else
3667             ENCODE_LOCKING_SHIFT_2;
3668           break;
3669
3670         case 3:                 /* graphic register 3 */
3671           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3672             ENCODE_SINGLE_SHIFT_3;
3673           else
3674             ENCODE_LOCKING_SHIFT_3;
3675           break;
3676         }
3677     }
3678
3679   *p_nchars = produced_chars;
3680   return dst;
3681 }
3682
/* The following three macros produce codes for indicating direction
   of text.  */

/* Emit a control sequence introducer: the two ASCII bytes ESC [ when
   the coding system has CODING_ISO_FLAG_SEVEN_BITS set, the single
   byte ISO_CODE_CSI otherwise.  The flag word is a bit mask, so the
   flag is tested with `&'; comparing the whole word with `==' would
   select the 8-bit form whenever any other flag is also set.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER                              \
  do {                                                                  \
    if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)         \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '[');                         \
    else                                                                \
      EMIT_ONE_BYTE (ISO_CODE_CSI);                                     \
  } while (0)
3692
3693
/* Emit the right-to-left direction code: a control sequence
   introducer followed by the ASCII bytes `2' and `]'.
   ENCODE_CONTROL_SEQUENCE_INTRODUCER is an object-like macro, so it
   must be used without an argument list.  */
#define ENCODE_DIRECTION_R2L()                  \
  do {                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
    EMIT_TWO_ASCII_BYTES ('2', ']');            \
  } while (0)
3699
3700
/* Emit the left-to-right direction code: a control sequence
   introducer followed by the ASCII bytes `0' and `]'.
   ENCODE_CONTROL_SEQUENCE_INTRODUCER is an object-like macro, so it
   must be used without an argument list.  */
#define ENCODE_DIRECTION_L2R()                  \
  do {                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
    EMIT_TWO_ASCII_BYTES ('0', ']');            \
  } while (0)
3706
3707
/* Bring the graphic planes and registers of CODING back to their
   initial state: emit a shift-in if graphic plane 0 does not currently
   invoke graphic register 0, then re-designate every graphic register
   whose initial charset is defined (non-negative) and differs from the
   charset currently designated to it.  */
#define ENCODE_RESET_PLANE_AND_REGISTER()                               \
  do {                                                                  \
    int reg = 0;                                                        \
                                                                        \
    if (CODING_ISO_INVOCATION (coding, 0) != 0)                         \
      ENCODE_SHIFT_IN;                                                  \
    while (reg < 4)                                                     \
      {                                                                 \
        int initial_id = CODING_ISO_INITIAL (coding, reg);              \
                                                                        \
        if (initial_id >= 0                                             \
            && initial_id != CODING_ISO_DESIGNATION (coding, reg))      \
          {                                                             \
            struct charset *charset = CHARSET_FROM_ID (initial_id);     \
                                                                        \
            ENCODE_DESIGNATION (charset, reg, coding);                  \
          }                                                             \
        reg++;                                                          \
      }                                                                 \
  } while (0)
3726
3727
3728 /* Produce designation sequences of charsets in the line started from
3729    SRC to a place pointed by DST, and return updated DST.
3730
3731    If the current block ends before any end-of-line, we may fail to
3732    find all the necessary designations.  */
3733
3734 static unsigned char *
3735 encode_designation_at_bol (coding, charbuf, charbuf_end, dst)
3736      struct coding_system *coding;
3737      int *charbuf, *charbuf_end;
3738      unsigned char *dst;
3739 {
3740   struct charset *charset;
3741   /* Table of charsets to be designated to each graphic register.  */
3742   int r[4];
3743   int c, found = 0, reg;
3744   int produced_chars = 0;
3745   int multibytep = coding->dst_multibyte;
3746   Lisp_Object attrs;
3747   Lisp_Object charset_list;
3748
3749   attrs = CODING_ID_ATTRS (coding->id);
3750   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
3751   if (EQ (charset_list, Qiso_2022))
3752     charset_list = Viso_2022_charset_list;
3753
3754   for (reg = 0; reg < 4; reg++)
3755     r[reg] = -1;
3756
3757   while (found < 4)
3758     {
3759       int id;
3760
3761       c = *charbuf++;
3762       if (c == '\n')
3763         break;
3764       charset = char_charset (c, charset_list, NULL);
3765       id = CHARSET_ID (charset);
3766       reg = CODING_ISO_REQUEST (coding, id);
3767       if (reg >= 0 && r[reg] < 0)
3768         {
3769           found++;
3770           r[reg] = id;
3771         }
3772     }
3773
3774   if (found)
3775     {
3776       for (reg = 0; reg < 4; reg++)
3777         if (r[reg] >= 0
3778             && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
3779           ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
3780     }
3781
3782   return dst;
3783 }
3784
3785 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".  */
3786
3787 static int
3788 encode_coding_iso_2022 (coding)
3789      struct coding_system *coding;
3790 {
3791   int multibytep = coding->dst_multibyte;
3792   int *charbuf = coding->charbuf;
3793   int *charbuf_end = charbuf + coding->charbuf_used;
3794   unsigned char *dst = coding->destination + coding->produced;
3795   unsigned char *dst_end = coding->destination + coding->dst_bytes;
3796   int safe_room = 16;
3797   int bol_designation
3798     = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
3799        && CODING_ISO_BOL (coding));
3800   int produced_chars = 0;
3801   Lisp_Object attrs, eol_type, charset_list;
3802   int ascii_compatible;
3803   int c;
3804   int preferred_charset_id = -1;
3805
3806   CODING_GET_INFO (coding, attrs, charset_list);
3807   eol_type = CODING_ID_EOL_TYPE (coding->id);
3808   if (VECTORP (eol_type))
3809     eol_type = Qunix;
3810
3811   setup_iso_safe_charsets (attrs);
3812   /* Charset list may have been changed.  */
3813   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
3814   coding->safe_charsets = (char *) SDATA (CODING_ATTR_SAFE_CHARSETS(attrs));
3815
3816   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
3817
3818   while (charbuf < charbuf_end)
3819     {
3820       ASSURE_DESTINATION (safe_room);
3821
3822       if (bol_designation)
3823         {
3824           unsigned char *dst_prev = dst;
3825
3826           /* We have to produce designation sequences if any now.  */
3827           dst = encode_designation_at_bol (coding, charbuf, charbuf_end, dst);
3828           bol_designation = 0;
3829           /* We are sure that designation sequences are all ASCII bytes.  */
3830           produced_chars += dst - dst_prev;
3831         }
3832
3833       c = *charbuf++;
3834
3835       if (c < 0)
3836         {
3837           /* Handle an annotation.  */
3838           switch (*charbuf)
3839             {
3840             case CODING_ANNOTATE_COMPOSITION_MASK:
3841               /* Not yet implemented.  */
3842               break;
3843             case CODING_ANNOTATE_CHARSET_MASK:
3844               preferred_charset_id = charbuf[2];
3845               if (preferred_charset_id >= 0
3846                   && NILP (Fmemq (make_number (preferred_charset_id),
3847                                   charset_list)))
3848                 preferred_charset_id = -1;
3849               break;
3850             default:
3851               abort ();
3852             }
3853           charbuf += -c - 1;
3854           continue;
3855         }
3856
3857       /* Now encode the character C.  */
3858       if (c < 0x20 || c == 0x7F)
3859         {
3860           if (c == '\n'
3861               || (c == '\r' && EQ (eol_type, Qmac)))
3862             {
3863               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
3864                 ENCODE_RESET_PLANE_AND_REGISTER ();
3865               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
3866                 {
3867                   int i;
3868
3869                   for (i = 0; i < 4; i++)
3870                     CODING_ISO_DESIGNATION (coding, i)
3871                       = CODING_ISO_INITIAL (coding, i);
3872                 }
3873               bol_designation
3874                 = CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL;
3875             }
3876           else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
3877             ENCODE_RESET_PLANE_AND_REGISTER ();
3878           EMIT_ONE_ASCII_BYTE (c);
3879         }
3880       else if (ASCII_CHAR_P (c))
3881         {
3882           if (ascii_compatible)
3883             EMIT_ONE_ASCII_BYTE (c);
3884           else
3885             {
3886               struct charset *charset = CHARSET_FROM_ID (charset_ascii);
3887               ENCODE_ISO_CHARACTER (charset, c);
3888             }
3889         }
3890       else if (CHAR_BYTE8_P (c))
3891         {
3892           c = CHAR_TO_BYTE8 (c);
3893           EMIT_ONE_BYTE (c);
3894         }
3895       else
3896         {
3897           struct charset *charset;
3898
3899           if (preferred_charset_id >= 0)
3900             {
3901               charset = CHARSET_FROM_ID (preferred_charset_id);
3902               if (! CHAR_CHARSET_P (c, charset))
3903                 charset = char_charset (c, charset_list, NULL);
3904             }
3905           else
3906             charset = char_charset (c, charset_list, NULL);
3907           if (!charset)
3908             {
3909               if (coding->mode & CODING_MODE_SAFE_ENCODING)
3910                 {
3911                   c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
3912                   charset = CHARSET_FROM_ID (charset_ascii);
3913                 }
3914               else
3915                 {
3916                   c = coding->default_char;
3917                   charset = char_charset (c, charset_list, NULL);
3918                 }
3919             }
3920           ENCODE_ISO_CHARACTER (charset, c);
3921         }
3922     }
3923
3924   if (coding->mode & CODING_MODE_LAST_BLOCK
3925       && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
3926     {
3927       ASSURE_DESTINATION (safe_room);
3928       ENCODE_RESET_PLANE_AND_REGISTER ();
3929     }
3930   record_conversion_result (coding, CODING_RESULT_SUCCESS);
3931   CODING_ISO_BOL (coding) = bol_designation;
3932   coding->produced_char += produced_chars;
3933   coding->produced = dst - coding->destination;
3934   return 0;
3935 }
3936
3937 \f
3938 /*** 8,9. SJIS and BIG5 handlers ***/
3939
3940 /* Although SJIS and BIG5 are not ISO's coding system, they are used
3941    quite widely.  So, for the moment, Emacs supports them in the bare
3942    C code.  But, in the future, they may be supported only by CCL.  */
3943
3944 /* SJIS is a coding system encoding three character sets: ASCII, right
3945    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
3946    as is.  A character of charset katakana-jisx0201 is encoded by
3947    "position-code + 0x80".  A character of charset japanese-jisx0208
3948    is encoded in two bytes, but its two position-codes are divided and
3949    shifted so that they fit in the ranges below.
3950
3951    --- CODE RANGE of SJIS ---
3952    (character set)      (range)
3953    ASCII                0x00 .. 0x7F
3954    KATAKANA-JISX0201    0xA0 .. 0xDF
3955    JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
3956             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
3957    -------------------------------
3958
3959 */
3960
3961 /* BIG5 is a coding system encoding two character sets: ASCII and
3962    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
3963    character set and each of its characters is encoded in two bytes.
3964
3965    --- CODE RANGE of BIG5 ---
3966    (character set)      (range)
3967    ASCII                0x00 .. 0x7F
3968    Big5 (1st byte)      0xA1 .. 0xFE
3969         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
3970    --------------------------
3971
3972   */
3973
3974 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3975    Check if a text is encoded in SJIS.  If it is, return
3976    CATEGORY_MASK_SJIS, else return 0.  */
3977
3978 static int
3979 detect_coding_sjis (coding, detect_info)
3980      struct coding_system *coding;
3981      struct coding_detection_info *detect_info;
3982 {
3983   const unsigned char *src = coding->source, *src_base;
3984   const unsigned char *src_end = coding->source + coding->src_bytes;
3985   int multibytep = coding->src_multibyte;
3986   int consumed_chars = 0;
3987   int found = 0;
3988   int c;
3989
3990   detect_info->checked |= CATEGORY_MASK_SJIS;
3991   /* A coding system of this category is always ASCII compatible.  */
3992   src += coding->head_ascii;
3993
3994   while (1)
3995     {
3996       src_base = src;
3997       ONE_MORE_BYTE (c);
3998       if (c < 0x80)
3999         continue;
4000       if ((c >= 0x81 && c <= 0x9F) || (c >= 0xE0 && c <= 0xEF))
4001         {
4002           ONE_MORE_BYTE (c);
4003           if (c < 0x40 || c == 0x7F || c > 0xFC)
4004             break;
4005           found = CATEGORY_MASK_SJIS;
4006         }
4007       else if (c >= 0xA0 && c < 0xE0)
4008         found = CATEGORY_MASK_SJIS;
4009       else
4010         break;
4011     }
4012   detect_info->rejected |= CATEGORY_MASK_SJIS;
4013   return 0;
4014
4015  no_more_source:
4016   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4017     {
4018       detect_info->rejected |= CATEGORY_MASK_SJIS;
4019       return 0;
4020     }
4021   detect_info->found |= found;
4022   return 1;
4023 }
4024
4025 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4026    Check if a text is encoded in BIG5.  If it is, return
4027    CATEGORY_MASK_BIG5, else return 0.  */
4028
4029 static int
4030 detect_coding_big5 (coding, detect_info)
4031      struct coding_system *coding;
4032      struct coding_detection_info *detect_info;
4033 {
4034   const unsigned char *src = coding->source, *src_base;
4035   const unsigned char *src_end = coding->source + coding->src_bytes;
4036   int multibytep = coding->src_multibyte;
4037   int consumed_chars = 0;
4038   int found = 0;
4039   int c;
4040
4041   detect_info->checked |= CATEGORY_MASK_BIG5;
4042   /* A coding system of this category is always ASCII compatible.  */
4043   src += coding->head_ascii;
4044
4045   while (1)
4046     {
4047       src_base = src;
4048       ONE_MORE_BYTE (c);
4049       if (c < 0x80)
4050         continue;
4051       if (c >= 0xA1)
4052         {
4053           ONE_MORE_BYTE (c);
4054           if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
4055             return 0;
4056           found = CATEGORY_MASK_BIG5;
4057         }
4058       else
4059         break;
4060     }
4061   detect_info->rejected |= CATEGORY_MASK_BIG5;
4062   return 0;
4063
4064  no_more_source:
4065   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4066     {
4067       detect_info->rejected |= CATEGORY_MASK_BIG5;
4068       return 0;
4069     }
4070   detect_info->found |= found;
4071   return 1;
4072 }
4073
4074 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
4075    If SJIS_P is 1, decode SJIS text, else decode BIG5 test.  */
4076
4077 static void
4078 decode_coding_sjis (coding)
4079      struct coding_system *coding;
4080 {
4081   const unsigned char *src = coding->source + coding->consumed;
4082   const unsigned char *src_end = coding->source + coding->src_bytes;
4083   const unsigned char *src_base;
4084   int *charbuf = coding->charbuf + coding->charbuf_used;
4085   int *charbuf_end
4086     = coding->charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
4087   int consumed_chars = 0, consumed_chars_base;
4088   int multibytep = coding->src_multibyte;
4089   struct charset *charset_roman, *charset_kanji, *charset_kana;
4090   struct charset *charset_kanji2;
4091   Lisp_Object attrs, charset_list, val;
4092   int char_offset = coding->produced_char;
4093   int last_offset = char_offset;
4094   int last_id = charset_ascii;
4095
4096   CODING_GET_INFO (coding, attrs, charset_list);
4097
4098   val = charset_list;
4099   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4100   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4101   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4102   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4103
4104   while (1)
4105     {
4106       int c, c1;
4107       struct charset *charset;
4108
4109       src_base = src;
4110       consumed_chars_base = consumed_chars;
4111
4112       if (charbuf >= charbuf_end)
4113         break;
4114
4115       ONE_MORE_BYTE (c);
4116       if (c < 0)
4117         goto invalid_code;
4118       if (c < 0x80)
4119         charset = charset_roman;
4120       else if (c == 0x80 || c == 0xA0)
4121         goto invalid_code;
4122       else if (c >= 0xA1 && c <= 0xDF)
4123         {
4124           /* SJIS -> JISX0201-Kana */
4125           c &= 0x7F;
4126           charset = charset_kana;
4127         }
4128       else if (c <= 0xEF)
4129         {
4130           /* SJIS -> JISX0208 */
4131           ONE_MORE_BYTE (c1);
4132           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4133             goto invalid_code;
4134           c = (c << 8) | c1;
4135           SJIS_TO_JIS (c);
4136           charset = charset_kanji;
4137         }
4138       else if (c <= 0xFC && charset_kanji2)
4139         {
4140           /* SJIS -> JISX0213-2 */
4141           ONE_MORE_BYTE (c1);
4142           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4143             goto invalid_code;
4144           c = (c << 8) | c1;
4145           SJIS_TO_JIS2 (c);
4146           charset = charset_kanji2;
4147         }
4148       else
4149         goto invalid_code;
4150       if (charset->id != charset_ascii
4151           && last_id != charset->id)
4152         {
4153           if (last_id != charset_ascii)
4154             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4155           last_id = charset->id;
4156           last_offset = char_offset;
4157         }
4158       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4159       *charbuf++ = c;
4160       char_offset++;
4161       continue;
4162
4163     invalid_code:
4164       src = src_base;
4165       consumed_chars = consumed_chars_base;
4166       ONE_MORE_BYTE (c);
4167       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4168       char_offset++;
4169       coding->errors++;
4170     }
4171
4172  no_more_source:
4173   if (last_id != charset_ascii)
4174     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4175   coding->consumed_char += consumed_chars_base;
4176   coding->consumed = src_base - coding->source;
4177   coding->charbuf_used = charbuf - coding->charbuf;
4178 }
4179
4180 static void
4181 decode_coding_big5 (coding)
4182      struct coding_system *coding;
4183 {
4184   const unsigned char *src = coding->source + coding->consumed;
4185   const unsigned char *src_end = coding->source + coding->src_bytes;
4186   const unsigned char *src_base;
4187   int *charbuf = coding->charbuf + coding->charbuf_used;
4188   int *charbuf_end
4189     = coding->charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
4190   int consumed_chars = 0, consumed_chars_base;
4191   int multibytep = coding->src_multibyte;
4192   struct charset *charset_roman, *charset_big5;
4193   Lisp_Object attrs, charset_list, val;
4194   int char_offset = coding->produced_char;
4195   int last_offset = char_offset;
4196   int last_id = charset_ascii;
4197
4198   CODING_GET_INFO (coding, attrs, charset_list);
4199   val = charset_list;
4200   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4201   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4202
4203   while (1)
4204     {
4205       int c, c1;
4206       struct charset *charset;
4207
4208       src_base = src;
4209       consumed_chars_base = consumed_chars;
4210
4211       if (charbuf >= charbuf_end)
4212         break;
4213
4214       ONE_MORE_BYTE (c);
4215
4216       if (c < 0)
4217         goto invalid_code;
4218       if (c < 0x80)
4219         charset = charset_roman;
4220       else
4221         {
4222           /* BIG5 -> Big5 */
4223           if (c < 0xA1 || c > 0xFE)
4224             goto invalid_code;
4225           ONE_MORE_BYTE (c1);
4226           if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4227             goto invalid_code;
4228           c = c << 8 | c1;
4229           charset = charset_big5;
4230         }
4231       if (charset->id != charset_ascii
4232           && last_id != charset->id)
4233         {
4234           if (last_id != charset_ascii)
4235             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4236           last_id = charset->id;
4237           last_offset = char_offset;
4238         }
4239       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4240       *charbuf++ = c;
4241       char_offset++;
4242       continue;
4243
4244     invalid_code:
4245       src = src_base;
4246       consumed_chars = consumed_chars_base;
4247       ONE_MORE_BYTE (c);
4248       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4249       char_offset++;
4250       coding->errors++;
4251     }
4252
4253  no_more_source:
4254   if (last_id != charset_ascii)
4255     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4256   coding->consumed_char += consumed_chars_base;
4257   coding->consumed = src_base - coding->source;
4258   coding->charbuf_used = charbuf - coding->charbuf;
4259 }
4260
4261 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4262    This function can encode charsets `ascii', `katakana-jisx0201',
4263    `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'.  We
4264    are sure that all these charsets are registered as official charset
4265    (i.e. do not have extended leading-codes).  Characters of other
4266    charsets are produced without any encoding.  If SJIS_P is 1, encode
4267    SJIS text, else encode BIG5 text.  */
4268
4269 static int
4270 encode_coding_sjis (coding)
4271      struct coding_system *coding;
4272 {
4273   int multibytep = coding->dst_multibyte;
4274   int *charbuf = coding->charbuf;
4275   int *charbuf_end = charbuf + coding->charbuf_used;
4276   unsigned char *dst = coding->destination + coding->produced;
4277   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4278   int safe_room = 4;
4279   int produced_chars = 0;
4280   Lisp_Object attrs, charset_list, val;
4281   int ascii_compatible;
4282   struct charset *charset_roman, *charset_kanji, *charset_kana;
4283   struct charset *charset_kanji2;
4284   int c;
4285
4286   CODING_GET_INFO (coding, attrs, charset_list);
4287   val = charset_list;
4288   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4289   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4290   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4291   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4292
4293   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4294
4295   while (charbuf < charbuf_end)
4296     {
4297       ASSURE_DESTINATION (safe_room);
4298       c = *charbuf++;
4299       /* Now encode the character C.  */
4300       if (ASCII_CHAR_P (c) && ascii_compatible)
4301         EMIT_ONE_ASCII_BYTE (c);
4302       else if (CHAR_BYTE8_P (c))
4303         {
4304           c = CHAR_TO_BYTE8 (c);
4305           EMIT_ONE_BYTE (c);
4306         }
4307       else
4308         {
4309           unsigned code;
4310           struct charset *charset = char_charset (c, charset_list, &code);
4311
4312           if (!charset)
4313             {
4314               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4315                 {
4316                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4317                   charset = CHARSET_FROM_ID (charset_ascii);
4318                 }
4319               else
4320                 {
4321                   c = coding->default_char;
4322                   charset = char_charset (c, charset_list, &code);
4323                 }
4324             }
4325           if (code == CHARSET_INVALID_CODE (charset))
4326             abort ();
4327           if (charset == charset_kanji)
4328             {
4329               int c1, c2;
4330               JIS_TO_SJIS (code);
4331               c1 = code >> 8, c2 = code & 0xFF;
4332               EMIT_TWO_BYTES (c1, c2);
4333             }
4334           else if (charset == charset_kana)
4335             EMIT_ONE_BYTE (code | 0x80);
4336           else if (charset_kanji2 && charset == charset_kanji2)
4337             {
4338               int c1, c2;
4339
4340               c1 = code >> 8;
4341               if (c1 == 0x21 || (c1 >= 0x23 && c1 < 0x25)
4342                   || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
4343                 {
4344                   JIS_TO_SJIS2 (code);
4345                   c1 = code >> 8, c2 = code & 0xFF;
4346                   EMIT_TWO_BYTES (c1, c2);
4347                 }
4348               else
4349                 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4350             }
4351           else
4352             EMIT_ONE_ASCII_BYTE (code & 0x7F);
4353         }
4354     }
4355   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4356   coding->produced_char += produced_chars;
4357   coding->produced = dst - coding->destination;
4358   return 0;
4359 }
4360
4361 static int
4362 encode_coding_big5 (coding)
4363      struct coding_system *coding;
4364 {
4365   int multibytep = coding->dst_multibyte;
4366   int *charbuf = coding->charbuf;
4367   int *charbuf_end = charbuf + coding->charbuf_used;
4368   unsigned char *dst = coding->destination + coding->produced;
4369   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4370   int safe_room = 4;
4371   int produced_chars = 0;
4372   Lisp_Object attrs, charset_list, val;
4373   int ascii_compatible;
4374   struct charset *charset_roman, *charset_big5;
4375   int c;
4376
4377   CODING_GET_INFO (coding, attrs, charset_list);
4378   val = charset_list;
4379   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4380   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4381   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4382
4383   while (charbuf < charbuf_end)
4384     {
4385       ASSURE_DESTINATION (safe_room);
4386       c = *charbuf++;
4387       /* Now encode the character C.  */
4388       if (ASCII_CHAR_P (c) && ascii_compatible)
4389         EMIT_ONE_ASCII_BYTE (c);
4390       else if (CHAR_BYTE8_P (c))
4391         {
4392           c = CHAR_TO_BYTE8 (c);
4393           EMIT_ONE_BYTE (c);
4394         }
4395       else
4396         {
4397           unsigned code;
4398           struct charset *charset = char_charset (c, charset_list, &code);
4399
4400           if (! charset)
4401             {
4402               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4403                 {
4404                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4405                   charset = CHARSET_FROM_ID (charset_ascii);
4406                 }
4407               else
4408                 {
4409                   c = coding->default_char;
4410                   charset = char_charset (c, charset_list, &code);
4411                 }
4412             }
4413           if (code == CHARSET_INVALID_CODE (charset))
4414             abort ();
4415           if (charset == charset_big5)
4416             {
4417               int c1, c2;
4418
4419               c1 = code >> 8, c2 = code & 0xFF;
4420               EMIT_TWO_BYTES (c1, c2);
4421             }
4422           else
4423             EMIT_ONE_ASCII_BYTE (code & 0x7F);
4424         }
4425     }
4426   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4427   coding->produced_char += produced_chars;
4428   coding->produced = dst - coding->destination;
4429   return 0;
4430 }
4431
4432 \f
4433 /*** 10. CCL handlers ***/
4434
4435 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4436    Check if a text is encoded in a coding system of which
4437    encoder/decoder are written in CCL program.  If it is, return
4438    CATEGORY_MASK_CCL, else return 0.  */
4439
4440 static int
4441 detect_coding_ccl (coding, detect_info)
4442      struct coding_system *coding;
4443      struct coding_detection_info *detect_info;
4444 {
4445   const unsigned char *src = coding->source, *src_base;
4446   const unsigned char *src_end = coding->source + coding->src_bytes;
4447   int multibytep = coding->src_multibyte;
4448   int consumed_chars = 0;
4449   int found = 0;
4450   unsigned char *valids;
4451   int head_ascii = coding->head_ascii;
4452   Lisp_Object attrs;
4453
4454   detect_info->checked |= CATEGORY_MASK_CCL;
4455
4456   coding = &coding_categories[coding_category_ccl];
4457   valids = CODING_CCL_VALIDS (coding);
4458   attrs = CODING_ID_ATTRS (coding->id);
4459   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
4460     src += head_ascii;
4461
4462   while (1)
4463     {
4464       int c;
4465
4466       src_base = src;
4467       ONE_MORE_BYTE (c);
4468       if (c < 0 || ! valids[c])
4469         break;
4470       if ((valids[c] > 1))
4471         found = CATEGORY_MASK_CCL;
4472     }
4473   detect_info->rejected |= CATEGORY_MASK_CCL;
4474   return 0;
4475
4476  no_more_source:
4477   detect_info->found |= found;
4478   return 1;
4479 }
4480
4481 static void
4482 decode_coding_ccl (coding)
4483      struct coding_system *coding;
4484 {
4485   const unsigned char *src = coding->source + coding->consumed;
4486   const unsigned char *src_end = coding->source + coding->src_bytes;
4487   int *charbuf = coding->charbuf + coding->charbuf_used;
4488   int *charbuf_end = coding->charbuf + coding->charbuf_size;
4489   int consumed_chars = 0;
4490   int multibytep = coding->src_multibyte;
4491   struct ccl_program ccl;
4492   int source_charbuf[1024];
4493   int source_byteidx[1024];
4494   Lisp_Object attrs, charset_list;
4495
4496   CODING_GET_INFO (coding, attrs, charset_list);
4497   setup_ccl_program (&ccl, CODING_CCL_DECODER (coding));
4498
4499   while (src < src_end)
4500     {
4501       const unsigned char *p = src;
4502       int *source, *source_end;
4503       int i = 0;
4504
4505       if (multibytep)
4506         while (i < 1024 && p < src_end)
4507           {
4508             source_byteidx[i] = p - src;
4509             source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
4510           }
4511       else
4512         while (i < 1024 && p < src_end)
4513           source_charbuf[i++] = *p++;
4514
4515       if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
4516         ccl.last_block = 1;
4517
4518       source = source_charbuf;
4519       source_end = source + i;
4520       while (source < source_end)
4521         {
4522           ccl_driver (&ccl, source, charbuf,
4523                       source_end - source, charbuf_end - charbuf,
4524                       charset_list);
4525           source += ccl.consumed;
4526           charbuf += ccl.produced;
4527           if (ccl.status != CCL_STAT_SUSPEND_BY_DST)
4528             break;
4529         }
4530       if (source < source_end)
4531         src += source_byteidx[source - source_charbuf];
4532       else
4533         src = p;
4534       consumed_chars += source - source_charbuf;
4535
4536       if (ccl.status != CCL_STAT_SUSPEND_BY_SRC
4537           && ccl.status != CODING_RESULT_INSUFFICIENT_SRC)
4538         break;
4539     }
4540
4541   switch (ccl.status)
4542     {
4543     case CCL_STAT_SUSPEND_BY_SRC:
4544       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
4545       break;
4546     case CCL_STAT_SUSPEND_BY_DST:
4547       break;
4548     case CCL_STAT_QUIT:
4549     case CCL_STAT_INVALID_CMD:
4550       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
4551       break;
4552     default:
4553       record_conversion_result (coding, CODING_RESULT_SUCCESS);
4554       break;
4555     }
4556   coding->consumed_char += consumed_chars;
4557   coding->consumed = src - coding->source;
4558   coding->charbuf_used = charbuf - coding->charbuf;
4559 }
4560
4561 static int
4562 encode_coding_ccl (coding)
4563      struct coding_system *coding;
4564 {
4565   struct ccl_program ccl;
4566   int multibytep = coding->dst_multibyte;
4567   int *charbuf = coding->charbuf;
4568   int *charbuf_end = charbuf + coding->charbuf_used;
4569   unsigned char *dst = coding->destination + coding->produced;
4570   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4571   int destination_charbuf[1024];
4572   int i, produced_chars = 0;
4573   Lisp_Object attrs, charset_list;
4574
4575   CODING_GET_INFO (coding, attrs, charset_list);
4576   setup_ccl_program (&ccl, CODING_CCL_ENCODER (coding));
4577
4578   ccl.last_block = coding->mode & CODING_MODE_LAST_BLOCK;
4579   ccl.dst_multibyte = coding->dst_multibyte;
4580
4581   while (charbuf < charbuf_end)
4582     {
4583       ccl_driver (&ccl, charbuf, destination_charbuf,
4584                   charbuf_end - charbuf, 1024, charset_list);
4585       if (multibytep)
4586         {
4587           ASSURE_DESTINATION (ccl.produced * 2);
4588           for (i = 0; i < ccl.produced; i++)
4589             EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
4590         }
4591       else
4592         {
4593           ASSURE_DESTINATION (ccl.produced);
4594           for (i = 0; i < ccl.produced; i++)
4595             *dst++ = destination_charbuf[i] & 0xFF;
4596           produced_chars += ccl.produced;
4597         }
4598       charbuf += ccl.consumed;
4599       if (ccl.status == CCL_STAT_QUIT
4600           || ccl.status == CCL_STAT_INVALID_CMD)
4601         break;
4602     }
4603
4604   switch (ccl.status)
4605     {
4606     case CCL_STAT_SUSPEND_BY_SRC:
4607       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
4608       break;
4609     case CCL_STAT_SUSPEND_BY_DST:
4610       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
4611       break;
4612     case CCL_STAT_QUIT:
4613     case CCL_STAT_INVALID_CMD:
4614       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
4615       break;
4616     default:
4617       record_conversion_result (coding, CODING_RESULT_SUCCESS);
4618       break;
4619     }
4620
4621   coding->produced_char += produced_chars;
4622   coding->produced = dst - coding->destination;
4623   return 0;
4624 }
4625
4626
4627 \f
4628 /*** 10, 11. no-conversion handlers ***/
4629
4630 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
4631
4632 static void
4633 decode_coding_raw_text (coding)
4634      struct coding_system *coding;
4635 {
4636   coding->chars_at_source = 1;
4637   coding->consumed_char = 0;
4638   coding->consumed = 0;
4639   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4640 }
4641
4642 static int
4643 encode_coding_raw_text (coding)
4644      struct coding_system *coding;
4645 {
4646   int multibytep = coding->dst_multibyte;
4647   int *charbuf = coding->charbuf;
4648   int *charbuf_end = coding->charbuf + coding->charbuf_used;
4649   unsigned char *dst = coding->destination + coding->produced;
4650   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4651   int produced_chars = 0;
4652   int c;
4653
4654   if (multibytep)
4655     {
4656       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
4657
4658       if (coding->src_multibyte)
4659         while (charbuf < charbuf_end)
4660           {
4661             ASSURE_DESTINATION (safe_room);
4662             c = *charbuf++;
4663             if (ASCII_CHAR_P (c))
4664               EMIT_ONE_ASCII_BYTE (c);
4665             else if (CHAR_BYTE8_P (c))
4666               {
4667                 c = CHAR_TO_BYTE8 (c);
4668                 EMIT_ONE_BYTE (c);
4669               }
4670             else
4671               {
4672                 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
4673
4674                 CHAR_STRING_ADVANCE (c, p1);
4675                 while (p0 < p1)
4676                   {
4677                     EMIT_ONE_BYTE (*p0);
4678                     p0++;
4679                   }
4680               }
4681           }
4682       else
4683         while (charbuf < charbuf_end)
4684           {
4685             ASSURE_DESTINATION (safe_room);
4686             c = *charbuf++;
4687             EMIT_ONE_BYTE (c);
4688           }
4689     }
4690   else
4691     {
4692       if (coding->src_multibyte)
4693         {
4694           int safe_room = MAX_MULTIBYTE_LENGTH;
4695
4696           while (charbuf < charbuf_end)
4697             {
4698               ASSURE_DESTINATION (safe_room);
4699               c = *charbuf++;
4700               if (ASCII_CHAR_P (c))
4701                 *dst++ = c;
4702               else if (CHAR_BYTE8_P (c))
4703                 *dst++ = CHAR_TO_BYTE8 (c);
4704               else
4705                 CHAR_STRING_ADVANCE (c, dst);
4706               produced_chars++;
4707             }
4708         }
4709       else
4710         {
4711           ASSURE_DESTINATION (charbuf_end - charbuf);
4712           while (charbuf < charbuf_end && dst < dst_end)
4713             *dst++ = *charbuf++;
4714           produced_chars = dst - (coding->destination + coding->dst_bytes);
4715         }
4716     }
4717   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4718   coding->produced_char += produced_chars;
4719   coding->produced = dst - coding->destination;
4720   return 0;
4721 }
4722
4723 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4724    Check if a text is encoded in a charset-based coding system.  If it
4725    is, return 1, else return 0.  */
4726
4727 static int
4728 detect_coding_charset (coding, detect_info)
4729      struct coding_system *coding;
4730      struct coding_detection_info *detect_info;
4731 {
4732   const unsigned char *src = coding->source, *src_base;
4733   const unsigned char *src_end = coding->source + coding->src_bytes;
4734   int multibytep = coding->src_multibyte;
4735   int consumed_chars = 0;
4736   Lisp_Object attrs, valids;
4737   int found = 0;
4738   int head_ascii = coding->head_ascii;
4739
4740   detect_info->checked |= CATEGORY_MASK_CHARSET;
4741
4742   coding = &coding_categories[coding_category_charset];
4743   attrs = CODING_ID_ATTRS (coding->id);
4744   valids = AREF (attrs, coding_attr_charset_valids);
4745
4746   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
4747     src += head_ascii;
4748
4749   while (1)
4750     {
4751       int c;
4752       Lisp_Object val;
4753       struct charset *charset;
4754       int dim, idx;
4755
4756       src_base = src;
4757       ONE_MORE_BYTE (c);
4758       if (c < 0)
4759         continue;
4760       val = AREF (valids, c);
4761       if (NILP (val))
4762         break;
4763       if (c >= 0x80)
4764         found = CATEGORY_MASK_CHARSET;
4765       if (INTEGERP (val))
4766         {
4767           charset = CHARSET_FROM_ID (XFASTINT (val));
4768           dim = CHARSET_DIMENSION (charset);
4769           for (idx = 1; idx < dim; idx++)
4770             {
4771               if (src == src_end)
4772                 goto too_short;
4773               ONE_MORE_BYTE (c);
4774               if (c < charset->code_space[(dim - 1 - idx) * 2]
4775                   || c > charset->code_space[(dim - 1 - idx) * 2 + 1])
4776                 break;
4777             }
4778           if (idx < dim)
4779             break;
4780         }
4781       else
4782         {
4783           idx = 1;
4784           for (; CONSP (val); val = XCDR (val))
4785             {
4786               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
4787               dim = CHARSET_DIMENSION (charset);
4788               while (idx < dim)
4789                 {
4790                   if (src == src_end)
4791                     goto too_short;
4792                   ONE_MORE_BYTE (c);
4793                   if (c < charset->code_space[(dim - 1 - idx) * 4]
4794                       || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
4795                     break;
4796                   idx++;
4797                 }
4798               if (idx == dim)
4799                 {
4800                   val = Qnil;
4801                   break;
4802                 }
4803             }
4804           if (CONSP (val))
4805             break;
4806         }
4807     }
4808  too_short:
4809   detect_info->rejected |= CATEGORY_MASK_CHARSET;
4810   return 0;
4811
4812  no_more_source:
4813   detect_info->found |= found;
4814   return 1;
4815 }
4816
4817 static void
4818 decode_coding_charset (coding)
4819      struct coding_system *coding;
4820 {
4821   const unsigned char *src = coding->source + coding->consumed;
4822   const unsigned char *src_end = coding->source + coding->src_bytes;
4823   const unsigned char *src_base;
4824   int *charbuf = coding->charbuf + coding->charbuf_used;
4825   int *charbuf_end
4826     = coding->charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
4827   int consumed_chars = 0, consumed_chars_base;
4828   int multibytep = coding->src_multibyte;
4829   Lisp_Object attrs, charset_list, valids;
4830   int char_offset = coding->produced_char;
4831   int last_offset = char_offset;
4832   int last_id = charset_ascii;
4833
4834   CODING_GET_INFO (coding, attrs, charset_list);
4835   valids = AREF (attrs, coding_attr_charset_valids);
4836
4837   while (1)
4838     {
4839       int c;
4840       Lisp_Object val;
4841       struct charset *charset;
4842       int dim;
4843       int len = 1;
4844       unsigned code;
4845
4846       src_base = src;
4847       consumed_chars_base = consumed_chars;
4848
4849       if (charbuf >= charbuf_end)
4850         break;
4851
4852       ONE_MORE_BYTE (c);
4853       if (c < 0)
4854         goto invalid_code;
4855       code = c;
4856
4857       val = AREF (valids, c);
4858       if (NILP (val))
4859         goto invalid_code;
4860       if (INTEGERP (val))
4861         {
4862           charset = CHARSET_FROM_ID (XFASTINT (val));
4863           dim = CHARSET_DIMENSION (charset);
4864           while (len < dim)
4865             {
4866               ONE_MORE_BYTE (c);
4867               code = (code << 8) | c;
4868               len++;
4869             }
4870           CODING_DECODE_CHAR (coding, src, src_base, src_end,
4871                               charset, code, c);
4872         }
4873       else
4874         {
4875           /* VAL is a list of charset IDs.  It is assured that the
4876              list is sorted by charset dimensions (smaller one
4877              comes first).  */
4878           while (CONSP (val))
4879             {
4880               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
4881               dim = CHARSET_DIMENSION (charset);
4882               while (len < dim)
4883                 {
4884                   ONE_MORE_BYTE (c);
4885                   code = (code << 8) | c;
4886                   len++;
4887                 }
4888               CODING_DECODE_CHAR (coding, src, src_base,
4889                                   src_end, charset, code, c);
4890               if (c >= 0)
4891                 break;
4892               val = XCDR (val);
4893             }
4894         }
4895       if (c < 0)
4896         goto invalid_code;
4897       if (charset->id != charset_ascii
4898           && last_id != charset->id)
4899         {
4900           if (last_id != charset_ascii)
4901             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4902           last_id = charset->id;
4903           last_offset = char_offset;
4904         }
4905
4906       *charbuf++ = c;
4907       char_offset++;
4908       continue;
4909
4910     invalid_code:
4911       src = src_base;
4912       consumed_chars = consumed_chars_base;
4913       ONE_MORE_BYTE (c);
4914       *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
4915       char_offset++;
4916       coding->errors++;
4917     }
4918
4919  no_more_source:
4920   if (last_id != charset_ascii)
4921     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4922   coding->consumed_char += consumed_chars_base;
4923   coding->consumed = src_base - coding->source;
4924   coding->charbuf_used = charbuf - coding->charbuf;
4925 }
4926
4927 static int
4928 encode_coding_charset (coding)
4929      struct coding_system *coding;
4930 {
4931   int multibytep = coding->dst_multibyte;
4932   int *charbuf = coding->charbuf;
4933   int *charbuf_end = charbuf + coding->charbuf_used;
4934   unsigned char *dst = coding->destination + coding->produced;
4935   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4936   int safe_room = MAX_MULTIBYTE_LENGTH;
4937   int produced_chars = 0;
4938   Lisp_Object attrs, charset_list;
4939   int ascii_compatible;
4940   int c;
4941
4942   CODING_GET_INFO (coding, attrs, charset_list);
4943   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4944
4945   while (charbuf < charbuf_end)
4946     {
4947       struct charset *charset;
4948       unsigned code;
4949
4950       ASSURE_DESTINATION (safe_room);
4951       c = *charbuf++;
4952       if (ascii_compatible && ASCII_CHAR_P (c))
4953         EMIT_ONE_ASCII_BYTE (c);
4954       else if (CHAR_BYTE8_P (c))
4955         {
4956           c = CHAR_TO_BYTE8 (c);
4957           EMIT_ONE_BYTE (c);
4958         }
4959       else
4960         {
4961           charset = char_charset (c, charset_list, &code);
4962           if (charset)
4963             {
4964               if (CHARSET_DIMENSION (charset) == 1)
4965                 EMIT_ONE_BYTE (code);
4966               else if (CHARSET_DIMENSION (charset) == 2)
4967                 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
4968               else if (CHARSET_DIMENSION (charset) == 3)
4969                 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
4970               else
4971                 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
4972                                  (code >> 8) & 0xFF, code & 0xFF);
4973             }
4974           else
4975             {
4976               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4977                 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4978               else
4979                 c = coding->default_char;
4980               EMIT_ONE_BYTE (c);
4981             }
4982         }
4983     }
4984
4985   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4986   coding->produced_char += produced_chars;
4987   coding->produced = dst - coding->destination;
4988   return 0;
4989 }
4990
4991 \f
4992 /*** 10. C library functions ***/
4993
4994 /* Setup coding context CODING from information about CODING_SYSTEM.
4995    If CODING_SYSTEM is nil, `no-conversion' is assumed.  If
4996    CODING_SYSTEM is invalid, signal an error.  */
4997
4998 void
4999 setup_coding_system (coding_system, coding)
5000      Lisp_Object coding_system;
5001      struct coding_system *coding;
5002 {
5003   Lisp_Object attrs;
5004   Lisp_Object eol_type;
5005   Lisp_Object coding_type;
5006   Lisp_Object val;
5007
5008   if (NILP (coding_system))
5009     coding_system = Qundecided;
5010
5011   CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
5012
5013   attrs = CODING_ID_ATTRS (coding->id);
5014   eol_type = CODING_ID_EOL_TYPE (coding->id);
5015
5016   coding->mode = 0;
5017   coding->head_ascii = -1;
5018   if (VECTORP (eol_type))
5019     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5020                             | CODING_REQUIRE_DETECTION_MASK);
5021   else if (! EQ (eol_type, Qunix))
5022     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5023                             | CODING_REQUIRE_ENCODING_MASK);
5024   else
5025     coding->common_flags = 0;
5026   if (! NILP (CODING_ATTR_POST_READ (attrs)))
5027     coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5028   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
5029     coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5030   if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
5031     coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
5032
5033   val = CODING_ATTR_SAFE_CHARSETS (attrs);
5034   coding->max_charset_id = SCHARS (val) - 1;
5035   coding->safe_charsets = (char *) SDATA (val);
5036   coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
5037
5038   coding_type = CODING_ATTR_TYPE (attrs);
5039   if (EQ (coding_type, Qundecided))
5040     {
5041       coding->detector = NULL;
5042       coding->decoder = decode_coding_raw_text;
5043       coding->encoder = encode_coding_raw_text;
5044       coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5045     }
5046   else if (EQ (coding_type, Qiso_2022))
5047     {
5048       int i;
5049       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5050
5051       /* Invoke graphic register 0 to plane 0.  */
5052       CODING_ISO_INVOCATION (coding, 0) = 0;
5053       /* Invoke graphic register 1 to plane 1 if we can use 8-bit.  */
5054       CODING_ISO_INVOCATION (coding, 1)
5055         = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
5056       /* Setup the initial status of designation.  */
5057       for (i = 0; i < 4; i++)
5058         CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
5059       /* Not single shifting initially.  */
5060       CODING_ISO_SINGLE_SHIFTING (coding) = 0;
5061       /* Beginning of buffer should also be regarded as bol. */
5062       CODING_ISO_BOL (coding) = 1;
5063       coding->detector = detect_coding_iso_2022;
5064       coding->decoder = decode_coding_iso_2022;
5065       coding->encoder = encode_coding_iso_2022;
5066       if (flags & CODING_ISO_FLAG_SAFE)
5067         coding->mode |= CODING_MODE_SAFE_ENCODING;
5068       coding->common_flags
5069         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5070             | CODING_REQUIRE_FLUSHING_MASK);
5071       if (flags & CODING_ISO_FLAG_COMPOSITION)
5072         coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
5073       if (flags & CODING_ISO_FLAG_DESIGNATION)
5074         coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
5075       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5076         {
5077           setup_iso_safe_charsets (attrs);
5078           val = CODING_ATTR_SAFE_CHARSETS (attrs);
5079           coding->max_charset_id = SCHARS (val) - 1;
5080           coding->safe_charsets = (char *) SDATA (val);
5081         }
5082       CODING_ISO_FLAGS (coding) = flags;
5083     }
5084   else if (EQ (coding_type, Qcharset))
5085     {
5086       coding->detector = detect_coding_charset;
5087       coding->decoder = decode_coding_charset;
5088       coding->encoder = encode_coding_charset;
5089       coding->common_flags
5090         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5091     }
5092   else if (EQ (coding_type, Qutf_8))
5093     {
5094       coding->detector = detect_coding_utf_8;
5095       coding->decoder = decode_coding_utf_8;
5096       coding->encoder = encode_coding_utf_8;
5097       coding->common_flags
5098         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5099     }
5100   else if (EQ (coding_type, Qutf_16))
5101     {
5102       val = AREF (attrs, coding_attr_utf_16_bom);
5103       CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_16_detect_bom
5104                                     : EQ (val, Qt) ? utf_16_with_bom
5105                                     : utf_16_without_bom);
5106       val = AREF (attrs, coding_attr_utf_16_endian);
5107       CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
5108                                        : utf_16_little_endian);
5109       CODING_UTF_16_SURROGATE (coding) = 0;
5110       coding->detector = detect_coding_utf_16;
5111       coding->decoder = decode_coding_utf_16;
5112       coding->encoder = encode_coding_utf_16;
5113       coding->common_flags
5114         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5115       if (CODING_UTF_16_BOM (coding) == utf_16_detect_bom)
5116         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5117     }
5118   else if (EQ (coding_type, Qccl))
5119     {
5120       coding->detector = detect_coding_ccl;
5121       coding->decoder = decode_coding_ccl;
5122       coding->encoder = encode_coding_ccl;
5123       coding->common_flags
5124         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5125             | CODING_REQUIRE_FLUSHING_MASK);
5126     }
5127   else if (EQ (coding_type, Qemacs_mule))
5128     {
5129       coding->detector = detect_coding_emacs_mule;
5130       coding->decoder = decode_coding_emacs_mule;
5131       coding->encoder = encode_coding_emacs_mule;
5132       coding->common_flags
5133         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5134       if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
5135           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
5136         {
5137           Lisp_Object tail, safe_charsets;
5138           int max_charset_id = 0;
5139
5140           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5141                tail = XCDR (tail))
5142             if (max_charset_id < XFASTINT (XCAR (tail)))
5143               max_charset_id = XFASTINT (XCAR (tail));
5144           safe_charsets = Fmake_string (make_number (max_charset_id + 1),
5145                                         make_number (255));
5146           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5147                tail = XCDR (tail))
5148             SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
5149           coding->max_charset_id = max_charset_id;
5150           coding->safe_charsets = (char *) SDATA (safe_charsets);
5151         }
5152     }
5153   else if (EQ (coding_type, Qshift_jis))
5154     {
5155       coding->detector = detect_coding_sjis;
5156       coding->decoder = decode_coding_sjis;
5157       coding->encoder = encode_coding_sjis;
5158       coding->common_flags
5159         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5160     }
5161   else if (EQ (coding_type, Qbig5))
5162     {
5163       coding->detector = detect_coding_big5;
5164       coding->decoder = decode_coding_big5;
5165       coding->encoder = encode_coding_big5;
5166       coding->common_flags
5167         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5168     }
5169   else                          /* EQ (coding_type, Qraw_text) */
5170     {
5171       coding->detector = NULL;
5172       coding->decoder = decode_coding_raw_text;
5173       coding->encoder = encode_coding_raw_text;
5174       if (! EQ (eol_type, Qunix))
5175         {
5176           coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5177           if (! VECTORP (eol_type))
5178             coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5179         }
5180
5181     }
5182
5183   return;
5184 }
5185
5186 /* Return a list of charsets supported by CODING.  */
5187
5188 Lisp_Object
5189 coding_charset_list (coding)
5190      struct coding_system *coding;
5191 {
5192   Lisp_Object attrs, charset_list;
5193
5194   CODING_GET_INFO (coding, attrs, charset_list);
5195   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5196     {
5197       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5198
5199       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5200         charset_list = Viso_2022_charset_list;
5201     }
5202   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5203     {
5204       charset_list = Vemacs_mule_charset_list;
5205     }
5206   return charset_list;
5207 }
5208
5209
5210 /* Return raw-text or one of its subsidiaries that has the same
5211    eol_type as CODING-SYSTEM.  */
5212
5213 Lisp_Object
5214 raw_text_coding_system (coding_system)
5215      Lisp_Object coding_system;
5216 {
5217   Lisp_Object spec, attrs;
5218   Lisp_Object eol_type, raw_text_eol_type;
5219
5220   if (NILP (coding_system))
5221     return Qraw_text;
5222   spec = CODING_SYSTEM_SPEC (coding_system);
5223   attrs = AREF (spec, 0);
5224
5225   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
5226     return coding_system;
5227
5228   eol_type = AREF (spec, 2);
5229   if (VECTORP (eol_type))
5230     return Qraw_text;
5231   spec = CODING_SYSTEM_SPEC (Qraw_text);
5232   raw_text_eol_type = AREF (spec, 2);
5233   return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
5234           : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
5235           : AREF (raw_text_eol_type, 2));
5236 }
5237
5238
5239 /* If CODING_SYSTEM doesn't specify end-of-line format but PARENT
5240    does, return one of the subsidiary that has the same eol-spec as
5241    PARENT.  Otherwise, return CODING_SYSTEM.  If PARENT is nil,
5242    inherit end-of-line format from the system's setting
5243    (system_eol_type).  */
5244
5245 Lisp_Object
5246 coding_inherit_eol_type (coding_system, parent)
5247      Lisp_Object coding_system, parent;
5248 {
5249   Lisp_Object spec, eol_type;
5250
5251   if (NILP (coding_system))
5252     coding_system = Qraw_text;
5253   spec = CODING_SYSTEM_SPEC (coding_system);
5254   eol_type = AREF (spec, 2);
5255   if (VECTORP (eol_type))
5256     {
5257       Lisp_Object parent_eol_type;
5258
5259       if (! NILP (parent))
5260         {
5261           Lisp_Object parent_spec;
5262
5263           parent_spec = CODING_SYSTEM_SPEC (parent);
5264           parent_eol_type = AREF (parent_spec, 2);
5265         }
5266       else
5267         parent_eol_type = system_eol_type;
5268       if (EQ (parent_eol_type, Qunix))
5269         coding_system = AREF (eol_type, 0);
5270       else if (EQ (parent_eol_type, Qdos))
5271         coding_system = AREF (eol_type, 1);
5272       else if (EQ (parent_eol_type, Qmac))
5273         coding_system = AREF (eol_type, 2);
5274     }
5275   return coding_system;
5276 }
5277
5278 /* Emacs has a mechanism to automatically detect a coding system if it
5279    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
5280    it's impossible to distinguish some coding systems accurately
5281    because they use the same range of codes.  So, at first, coding
5282    systems are categorized into the following categories:
5283
5284    o coding-category-emacs-mule
5285
5286         The category for a coding system which has the same code range
5287         as Emacs' internal format.  Assigned the coding-system (Lisp
5288         symbol) `emacs-mule' by default.
5289
5290    o coding-category-sjis
5291
5292         The category for a coding system which has the same code range
5293         as SJIS.  Assigned the coding-system (Lisp
5294         symbol) `japanese-shift-jis' by default.
5295
5296    o coding-category-iso-7
5297
5298         The category for a coding system which has the same code range
5299         as ISO2022 of 7-bit environment.  This doesn't use any locking
5300         shift and single shift functions.  This can encode/decode all
5301         charsets.  Assigned the coding-system (Lisp symbol)
5302         `iso-2022-7bit' by default.
5303
5304    o coding-category-iso-7-tight
5305
5306         Same as coding-category-iso-7 except that this can
5307         encode/decode only the specified charsets.
5308
5309    o coding-category-iso-8-1
5310
5311         The category for a coding system which has the same code range
5312         as ISO2022 of 8-bit environment and graphic plane 1 used only
5313         for DIMENSION1 charset.  This doesn't use any locking shift
5314         and single shift functions.  Assigned the coding-system (Lisp
5315         symbol) `iso-latin-1' by default.
5316
5317    o coding-category-iso-8-2
5318
5319         The category for a coding system which has the same code range
5320         as ISO2022 of 8-bit environment and graphic plane 1 used only
5321         for DIMENSION2 charset.  This doesn't use any locking shift
5322         and single shift functions.  Assigned the coding-system (Lisp
5323         symbol) `japanese-iso-8bit' by default.
5324
5325    o coding-category-iso-7-else
5326
5327         The category for a coding system which has the same code range
5328         as ISO2022 of 7-bit environment but uses locking shift or
5329         single shift functions.  Assigned the coding-system (Lisp
5330         symbol) `iso-2022-7bit-lock' by default.
5331
5332    o coding-category-iso-8-else
5333
5334         The category for a coding system which has the same code range
5335         as ISO2022 of 8-bit environment but uses locking shift or
5336         single shift functions.  Assigned the coding-system (Lisp
5337         symbol) `iso-2022-8bit-ss2' by default.
5338
5339    o coding-category-big5
5340
5341         The category for a coding system which has the same code range
5342         as BIG5.  Assigned the coding-system (Lisp symbol)
5343         `cn-big5' by default.
5344
5345    o coding-category-utf-8
5346
5347         The category for a coding system which has the same code range
5348         as UTF-8 (cf. RFC3629).  Assigned the coding-system (Lisp
5349         symbol) `utf-8' by default.
5350
5351    o coding-category-utf-16-be
5352
5353         The category for a coding system in which a text has a
5354         Unicode signature (cf. Unicode Standard) in the order of BIG
5355         endian at the head.  Assigned the coding-system (Lisp symbol)
5356         `utf-16-be' by default.
5357
5358    o coding-category-utf-16-le
5359
5360         The category for a coding system in which a text has a
5361         Unicode signature (cf. Unicode Standard) in the order of
5362         LITTLE endian at the head.  Assigned the coding-system (Lisp
5363         symbol) `utf-16-le' by default.
5364
5365    o coding-category-ccl
5366
5367         The category for a coding system of which encoder/decoder is
5368         written in CCL programs.  The default value is nil, i.e., no
5369         coding system is assigned.
5370
5371    o coding-category-binary
5372
5373         The category for a coding system not categorized in any of the
5374         above.  Assigned the coding-system (Lisp symbol)
5375         `no-conversion' by default.
5376
5377    Each of them is a Lisp symbol and the value is an actual
5378    `coding-system's (this is also a Lisp symbol) assigned by a user.
5379    What Emacs does actually is to detect a category of coding system.
5380    Then, it uses a `coding-system' assigned to it.  If Emacs can't
5381    decide only one possible category, it selects a category of the
5382    highest priority.  Priorities of categories are also specified by a
5383    user in a Lisp variable `coding-category-list'.
5384
5385 */
5386
5387 #define EOL_SEEN_NONE   0
5388 #define EOL_SEEN_LF     1
5389 #define EOL_SEEN_CR     2
5390 #define EOL_SEEN_CRLF   4
5391
5392 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
5393    SOURCE is encoded.  If CATEGORY is one of
5394    coding_category_utf_16_XXXX, assume that CR and LF are encoded by
5395    two-byte, else they are encoded by one-byte.
5396
5397    Return one of EOL_SEEN_XXX.  */
5398
5399 #define MAX_EOL_CHECK_COUNT 3
5400
5401 static int
5402 detect_eol (source, src_bytes, category)
5403      const unsigned char *source;
5404      EMACS_INT src_bytes;
5405      enum coding_category category;
5406 {
5407   const unsigned char *src = source, *src_end = src + src_bytes;
5408   unsigned char c;
5409   int total  = 0;
5410   int eol_seen = EOL_SEEN_NONE;
5411
5412   if ((1 << category) & CATEGORY_MASK_UTF_16)
5413     {
5414       int msb, lsb;
5415
5416       msb = category == (coding_category_utf_16_le
5417                          | coding_category_utf_16_le_nosig);
5418       lsb = 1 - msb;
5419
5420       while (src + 1 < src_end)
5421         {
5422           c = src[lsb];
5423           if (src[msb] == 0 && (c == '\n' || c == '\r'))
5424             {
5425               int this_eol;
5426
5427               if (c == '\n')
5428                 this_eol = EOL_SEEN_LF;
5429               else if (src + 3 >= src_end
5430                        || src[msb + 2] != 0
5431                        || src[lsb + 2] != '\n')
5432                 this_eol = EOL_SEEN_CR;
5433               else
5434                 this_eol = EOL_SEEN_CRLF;
5435
5436               if (eol_seen == EOL_SEEN_NONE)
5437                 /* This is the first end-of-line.  */
5438                 eol_seen = this_eol;
5439               else if (eol_seen != this_eol)
5440                 {
5441                   /* The found type is different from what found before.  */
5442                   eol_seen = EOL_SEEN_LF;
5443                   break;
5444                 }
5445               if (++total == MAX_EOL_CHECK_COUNT)
5446                 break;
5447             }
5448           src += 2;
5449         }
5450     }
5451   else
5452     {
5453       while (src < src_end)
5454         {
5455           c = *src++;
5456           if (c == '\n' || c == '\r')
5457             {
5458               int this_eol;
5459
5460               if (c == '\n')
5461                 this_eol = EOL_SEEN_LF;
5462               else if (src >= src_end || *src != '\n')
5463                 this_eol = EOL_SEEN_CR;
5464               else
5465                 this_eol = EOL_SEEN_CRLF, src++;
5466
5467               if (eol_seen == EOL_SEEN_NONE)
5468                 /* This is the first end-of-line.  */
5469                 eol_seen = this_eol;
5470               else if (eol_seen != this_eol)
5471                 {
5472                   /* The found type is different from what found before.  */
5473                   eol_seen = EOL_SEEN_LF;
5474                   break;
5475                 }
5476               if (++total == MAX_EOL_CHECK_COUNT)
5477                 break;
5478             }
5479         }
5480     }
5481   return eol_seen;
5482 }
5483
5484
5485 static Lisp_Object
5486 adjust_coding_eol_type (coding, eol_seen)
5487      struct coding_system *coding;
5488      int eol_seen;
5489 {
5490   Lisp_Object eol_type;
5491
5492   eol_type = CODING_ID_EOL_TYPE (coding->id);
5493   if (eol_seen & EOL_SEEN_LF)
5494     {
5495       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
5496       eol_type = Qunix;
5497     }
5498   else if (eol_seen & EOL_SEEN_CRLF)
5499     {
5500       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
5501       eol_type = Qdos;
5502     }
5503   else if (eol_seen & EOL_SEEN_CR)
5504     {
5505       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
5506       eol_type = Qmac;
5507     }
5508   return eol_type;
5509 }
5510
5511 /* Detect how a text specified in CODING is encoded.  If a coding
5512    system is detected, update fields of CODING by the detected coding
5513    system.  */
5514
5515 void
5516 detect_coding (coding)
5517      struct coding_system *coding;
5518 {
5519   const unsigned char *src, *src_end;
5520
5521   coding->consumed = coding->consumed_char = 0;
5522   coding->produced = coding->produced_char = 0;
5523   coding_set_source (coding);
5524
5525   src_end = coding->source + coding->src_bytes;
5526
5527   /* If we have not yet decided the text encoding type, detect it
5528      now.  */
5529   if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
5530     {
5531       int c, i;
5532       struct coding_detection_info detect_info;
5533
5534       detect_info.checked = detect_info.found = detect_info.rejected = 0;
5535       for (i = 0, src = coding->source; src < src_end; i++, src++)
5536         {
5537           c = *src;
5538           if (c & 0x80)
5539             break;
5540           if (c < 0x20
5541               && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
5542               && ! inhibit_iso_escape_detection
5543               && ! detect_info.checked)
5544             {
5545               coding->head_ascii = src - (coding->source + coding->consumed);
5546               if (detect_coding_iso_2022 (coding, &detect_info))
5547                 {
5548                   /* We have scanned the whole data.  */
5549                   if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
5550                     /* We didn't find an 8-bit code.  */
5551                     src = src_end;
5552                   break;
5553                 }
5554             }
5555         }
5556       coding->head_ascii = src - (coding->source + coding->consumed);
5557
5558       if (coding->head_ascii < coding->src_bytes
5559           || detect_info.found)
5560         {
5561           enum coding_category category;
5562           struct coding_system *this;
5563
5564           if (coding->head_ascii == coding->src_bytes)
5565             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
5566             for (i = 0; i < coding_category_raw_text; i++)
5567               {
5568                 category = coding_priorities[i];
5569                 this = coding_categories + category;
5570                 if (detect_info.found & (1 << category))
5571                   break;
5572               }
5573           else
5574             for (i = 0; i < coding_category_raw_text; i++)
5575               {
5576                 category = coding_priorities[i];
5577                 this = coding_categories + category;
5578                 if (this->id < 0)
5579                   {
5580                     /* No coding system of this category is defined.  */
5581                     detect_info.rejected |= (1 << category);
5582                   }
5583                 else if (category >= coding_category_raw_text)
5584                   continue;
5585                 else if (detect_info.checked & (1 << category))
5586                   {
5587                     if (detect_info.found & (1 << category))
5588                       break;
5589                   }
5590                 else if ((*(this->detector)) (coding, &detect_info)
5591                          && detect_info.found & (1 << category))
5592                   {
5593                     if (category == coding_category_utf_16_auto)
5594                       {
5595                         if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
5596                           category = coding_category_utf_16_le;
5597                         else
5598                           category = coding_category_utf_16_be;
5599                       }
5600                     break;
5601                   }
5602               }
5603
5604           if (i < coding_category_raw_text)
5605             setup_coding_system (CODING_ID_NAME (this->id), coding);
5606           else if (detect_info.rejected == CATEGORY_MASK_ANY)
5607             setup_coding_system (Qraw_text, coding);
5608           else if (detect_info.rejected)
5609             for (i = 0; i < coding_category_raw_text; i++)
5610               if (! (detect_info.rejected & (1 << coding_priorities[i])))
5611                 {
5612                   this = coding_categories + coding_priorities[i];
5613                   setup_coding_system (CODING_ID_NAME (this->id), coding);
5614                   break;
5615                 }
5616         }
5617     }
5618   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
5619            == coding_category_utf_16_auto)
5620     {
5621       Lisp_Object coding_systems;
5622       struct coding_detection_info detect_info;
5623
5624       coding_systems
5625         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_16_bom);
5626       detect_info.found = detect_info.rejected = 0;
5627       if (CONSP (coding_systems)
5628           && detect_coding_utf_16 (coding, &detect_info))
5629         {
5630           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
5631             setup_coding_system (XCAR (coding_systems), coding);
5632           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
5633             setup_coding_system (XCDR (coding_systems), coding);
5634         }
5635     }
5636 }
5637
5638
5639 static void
5640 decode_eol (coding)
5641      struct coding_system *coding;
5642 {
5643   Lisp_Object eol_type;
5644   unsigned char *p, *pbeg, *pend;
5645
5646   eol_type = CODING_ID_EOL_TYPE (coding->id);
5647   if (EQ (eol_type, Qunix))
5648     return;
5649
5650   if (NILP (coding->dst_object))
5651     pbeg = coding->destination;
5652   else
5653     pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
5654   pend = pbeg + coding->produced;
5655
5656   if (VECTORP (eol_type))
5657     {
5658       int eol_seen = EOL_SEEN_NONE;
5659
5660       for (p = pbeg; p < pend; p++)
5661         {
5662           if (*p == '\n')
5663             eol_seen |= EOL_SEEN_LF;
5664           else if (*p == '\r')
5665             {
5666               if (p + 1 < pend && *(p + 1) == '\n')
5667                 {
5668                   eol_seen |= EOL_SEEN_CRLF;
5669                   p++;
5670                 }
5671               else
5672                 eol_seen |= EOL_SEEN_CR;
5673             }
5674         }
5675       if (eol_seen != EOL_SEEN_NONE
5676           && eol_seen != EOL_SEEN_LF
5677           && eol_seen != EOL_SEEN_CRLF
5678           && eol_seen != EOL_SEEN_CR)
5679         eol_seen = EOL_SEEN_LF;
5680       if (eol_seen != EOL_SEEN_NONE)
5681         eol_type = adjust_coding_eol_type (coding, eol_seen);
5682     }
5683
5684   if (EQ (eol_type, Qmac))
5685     {
5686       for (p = pbeg; p < pend; p++)
5687         if (*p == '\r')
5688           *p = '\n';
5689     }
5690   else if (EQ (eol_type, Qdos))
5691     {
5692       int n = 0;
5693
5694       if (NILP (coding->dst_object))
5695         {
5696           /* Start deleting '\r' from the tail to minimize the memory
5697              movement.  */
5698           for (p = pend - 2; p >= pbeg; p--)
5699             if (*p == '\r')
5700               {
5701                 safe_bcopy ((char *) (p + 1), (char *) p, pend-- - p - 1);
5702                 n++;
5703               }
5704         }
5705       else
5706         {
5707           int pos_byte = coding->dst_pos_byte;
5708           int pos = coding->dst_pos;
5709           int pos_end = pos + coding->produced_char - 1;
5710
5711           while (pos < pos_end)
5712             {
5713               p = BYTE_POS_ADDR (pos_byte);
5714               if (*p == '\r' && p[1] == '\n')
5715                 {
5716                   del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
5717                   n++;
5718                   pos_end--;
5719                 }
5720               pos++;
5721               pos_byte += BYTES_BY_CHAR_HEAD (*p);
5722             }
5723         }
5724       coding->produced -= n;
5725       coding->produced_char -= n;
5726     }
5727 }
5728
5729
5730 /* Return a translation table (or list of them) from coding system
5731    attribute vector ATTRS for encoding (ENCODEP is nonzero) or
5732    decoding (ENCODEP is zero). */
5733
5734 static Lisp_Object
5735 get_translation_table (attrs, encodep, max_lookup)
5736      Lisp_Object attrs;
5737      int encodep, *max_lookup;
5738 {
5739   Lisp_Object standard, translation_table;
5740   Lisp_Object val;
5741
5742   if (encodep)
5743     translation_table = CODING_ATTR_ENCODE_TBL (attrs),
5744       standard = Vstandard_translation_table_for_encode;
5745   else
5746     translation_table = CODING_ATTR_DECODE_TBL (attrs),
5747       standard = Vstandard_translation_table_for_decode;
5748   if (NILP (translation_table))
5749     translation_table = standard;
5750   else
5751     {
5752       if (SYMBOLP (translation_table))
5753         translation_table = Fget (translation_table, Qtranslation_table);
5754       else if (CONSP (translation_table))
5755         {
5756           translation_table = Fcopy_sequence (translation_table);
5757           for (val = translation_table; CONSP (val); val = XCDR (val))
5758             if (SYMBOLP (XCAR (val)))
5759               XSETCAR (val, Fget (XCAR (val), Qtranslation_table));
5760         }
5761       if (CHAR_TABLE_P (standard))
5762         {
5763           if (CONSP (translation_table))
5764             translation_table = nconc2 (translation_table,
5765                                         Fcons (standard, Qnil));
5766           else
5767             translation_table = Fcons (translation_table,
5768                                        Fcons (standard, Qnil));
5769         }
5770     }
5771
5772   if (max_lookup)
5773     {
5774       *max_lookup = 1;
5775       if (CHAR_TABLE_P (translation_table)
5776           && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (translation_table)) > 1)
5777         {
5778           val = XCHAR_TABLE (translation_table)->extras[1];
5779           if (NATNUMP (val) && *max_lookup < XFASTINT (val))
5780             *max_lookup = XFASTINT (val);
5781         }
5782       else if (CONSP (translation_table))
5783         {
5784           Lisp_Object tail, val;
5785
5786           for (tail = translation_table; CONSP (tail); tail = XCDR (tail))
5787             if (CHAR_TABLE_P (XCAR (tail))
5788                 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (XCAR (tail))) > 1)
5789               {
5790                 val = XCHAR_TABLE (XCAR (tail))->extras[1];
5791                 if (NATNUMP (val) && *max_lookup < XFASTINT (val))
5792                   *max_lookup = XFASTINT (val);
5793               }
5794         }
5795     }
5796   return translation_table;
5797 }
5798
/* Look up character C in TABLE, which is a char-table or a list of
   char-tables (anything else leaves C alone and sets TRANS to Qnil).

   When a table maps C to a character, C is replaced by that character
   and, for a list, the lookup goes on with the next table.  When a
   table gives some other non-nil value, the lookup stops and TRANS is
   set to that value; otherwise TRANS is Qnil at the end.

   C and TRANS must be lvalues, and the arguments are evaluated more
   than once.  */

#define LOOKUP_TRANSLATION_TABLE(table, c, trans)               \
  do {                                                          \
    trans = Qnil;                                               \
    if (CHAR_TABLE_P (table))                                   \
      {                                                         \
        trans = CHAR_TABLE_REF (table, c);                      \
        if (CHARACTERP (trans))                                 \
          {                                                     \
            c = XFASTINT (trans);                               \
            trans = Qnil;                                       \
          }                                                     \
      }                                                         \
    else if (CONSP (table))                                     \
      {                                                         \
        Lisp_Object tail;                                       \
                                                                \
        for (tail = table; CONSP (tail); tail = XCDR (tail))    \
          {                                                     \
            if (! CHAR_TABLE_P (XCAR (tail)))                   \
              continue;                                         \
            trans = CHAR_TABLE_REF (XCAR (tail), c);            \
            if (CHARACTERP (trans))                             \
              {                                                 \
                c = XFASTINT (trans);                           \
                trans = Qnil;                                   \
              }                                                 \
            else if (! NILP (trans))                            \
              break;                                            \
          }                                                     \
      }                                                         \
  } while (0)
5823
5824
5825 static Lisp_Object
5826 get_translation (val, buf, buf_end, last_block, from_nchars, to_nchars)
5827      Lisp_Object val;
5828      int *buf, *buf_end;
5829      int last_block;
5830      int *from_nchars, *to_nchars;
5831 {
5832   /* VAL is TO or (([FROM-CHAR ...] .  TO) ...) where TO is TO-CHAR or
5833      [TO-CHAR ...].  */
5834   if (CONSP (val))
5835     {
5836       Lisp_Object from, tail;
5837       int i, len;
5838
5839       for (tail = val; CONSP (tail); tail = XCDR (tail))
5840         {
5841           val = XCAR (tail);
5842           from = XCAR (val);
5843           len = ASIZE (from);
5844           for (i = 0; i < len; i++)
5845             {
5846               if (buf + i == buf_end)
5847                 {
5848                   if (! last_block)
5849                     return Qt;
5850                   break;
5851                 }
5852               if (XINT (AREF (from, i)) != buf[i])
5853                 break;
5854             }
5855           if (i == len)
5856             {
5857               val = XCDR (val);
5858               *from_nchars = len;
5859               break;
5860             }
5861         }
5862       if (! CONSP (tail))
5863         return Qnil;
5864     }
5865   if (VECTORP (val))
5866     *buf = XINT (AREF (val, 0)), *to_nchars = ASIZE (val);
5867   else
5868     *buf = XINT (val);
5869   return val;
5870 }
5871
5872
5873 static int
5874 produce_chars (coding, translation_table, last_block)
5875      struct coding_system *coding;
5876      Lisp_Object translation_table;
5877      int last_block;
5878 {
5879   unsigned char *dst = coding->destination + coding->produced;
5880   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5881   int produced;
5882   int produced_chars = 0;
5883   int carryover = 0;
5884
5885   if (! coding->chars_at_source)
5886     {
5887       /* Characters are in coding->charbuf.  */
5888       int *buf = coding->charbuf;
5889       int *buf_end = buf + coding->charbuf_used;
5890
5891       if (BUFFERP (coding->src_object)
5892           && EQ (coding->src_object, coding->dst_object))
5893         dst_end = ((unsigned char *) coding->source) + coding->consumed;
5894
5895       while (buf < buf_end)
5896         {
5897           int c = *buf, i;
5898
5899           if (c >= 0)
5900             {
5901               int from_nchars = 1, to_nchars = 1;
5902               Lisp_Object trans = Qnil;
5903
5904               LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
5905               if (! NILP (trans))
5906                 {
5907                   trans = get_translation (trans, buf, buf_end, last_block,
5908                                            &from_nchars, &to_nchars);
5909                   if (EQ (trans, Qt))
5910                     break;
5911                   c = *buf;
5912                 }
5913
5914               if (dst + MAX_MULTIBYTE_LENGTH * to_nchars > dst_end)
5915                 {
5916                   dst = alloc_destination (coding,
5917                                            buf_end - buf
5918                                            + MAX_MULTIBYTE_LENGTH * to_nchars,
5919                                            dst);
5920                   dst_end = coding->destination + coding->dst_bytes;
5921                 }
5922
5923               for (i = 0; i < to_nchars; i++)
5924                 {
5925                   if (i > 0)
5926                     c = XINT (AREF (trans, i));
5927                   if (coding->dst_multibyte
5928                       || ! CHAR_BYTE8_P (c))
5929                     CHAR_STRING_ADVANCE (c, dst);
5930                   else
5931                     *dst++ = CHAR_TO_BYTE8 (c);
5932                 }
5933               produced_chars += to_nchars;
5934               *buf++ = to_nchars;
5935               while (--from_nchars > 0)
5936                 *buf++ = 0;
5937             }
5938           else
5939             /* This is an annotation datum.  (-C) is the length.  */
5940             buf += -c;
5941         }
5942       carryover = buf_end - buf;
5943     }
5944   else
5945     {
5946       const unsigned char *src = coding->source;
5947       const unsigned char *src_end = src + coding->src_bytes;
5948       Lisp_Object eol_type;
5949
5950       eol_type = CODING_ID_EOL_TYPE (coding->id);
5951
5952       if (coding->src_multibyte != coding->dst_multibyte)
5953         {
5954           if (coding->src_multibyte)
5955             {
5956               int multibytep = 1;
5957               int consumed_chars;
5958
5959               while (1)
5960                 {
5961                   const unsigned char *src_base = src;
5962                   int c;
5963
5964                   ONE_MORE_BYTE (c);
5965                   if (c == '\r')
5966                     {
5967                       if (EQ (eol_type, Qdos))
5968                         {
5969                           if (src == src_end)
5970                             {
5971                               record_conversion_result
5972                                 (coding, CODING_RESULT_INSUFFICIENT_SRC);
5973                               goto no_more_source;
5974                             }
5975                           if (*src == '\n')
5976                             c = *src++;
5977                         }
5978                       else if (EQ (eol_type, Qmac))
5979                         c = '\n';
5980                     }
5981                   if (dst == dst_end)
5982                     {
5983                       coding->consumed = src - coding->source;
5984
5985                     if (EQ (coding->src_object, coding->dst_object))
5986                       dst_end = (unsigned char *) src;
5987                     if (dst == dst_end)
5988                       {
5989                         dst = alloc_destination (coding, src_end - src + 1,
5990                                                  dst);
5991                         dst_end = coding->destination + coding->dst_bytes;
5992                         coding_set_source (coding);
5993                         src = coding->source + coding->consumed;
5994                         src_end = coding->source + coding->src_bytes;
5995                       }
5996                     }
5997                   *dst++ = c;
5998                   produced_chars++;
5999                 }
6000             no_more_source:
6001               ;
6002             }
6003           else
6004             while (src < src_end)
6005               {
6006                 int multibytep = 1;
6007                 int c = *src++;
6008
6009                 if (c == '\r')
6010                   {
6011                     if (EQ (eol_type, Qdos))
6012                       {
6013                         if (src < src_end
6014                             && *src == '\n')
6015                           c = *src++;
6016                       }
6017                     else if (EQ (eol_type, Qmac))
6018                       c = '\n';
6019                   }
6020                 if (dst >= dst_end - 1)
6021                   {
6022                     coding->consumed = src - coding->source;
6023
6024                     if (EQ (coding->src_object, coding->dst_object))
6025                       dst_end = (unsigned char *) src;
6026                     if (dst >= dst_end - 1)
6027                       {
6028                         dst = alloc_destination (coding, src_end - src + 2,
6029                                                  dst);
6030                         dst_end = coding->destination + coding->dst_bytes;
6031                         coding_set_source (coding);
6032                         src = coding->source + coding->consumed;
6033                         src_end = coding->source + coding->src_bytes;
6034                       }
6035                   }
6036                 EMIT_ONE_BYTE (c);
6037               }
6038         }
6039       else
6040         {
6041           if (!EQ (coding->src_object, coding->dst_object))
6042             {
6043               int require = coding->src_bytes - coding->dst_bytes;
6044
6045               if (require > 0)
6046                 {
6047                   EMACS_INT offset = src - coding->source;
6048
6049                   dst = alloc_destination (coding, require, dst);
6050                   coding_set_source (coding);
6051                   src = coding->source + offset;
6052                   src_end = coding->source + coding->src_bytes;
6053                 }
6054             }
6055           produced_chars = coding->src_chars;
6056           while (src < src_end)
6057             {
6058               int c = *src++;
6059
6060               if (c == '\r')
6061                 {
6062                   if (EQ (eol_type, Qdos))
6063                     {
6064                       if (src < src_end
6065                           && *src == '\n')
6066                         c = *src++;
6067                       produced_chars--;
6068                     }
6069                   else if (EQ (eol_type, Qmac))
6070                     c = '\n';
6071                 }
6072               *dst++ = c;
6073             }
6074         }
6075       coding->consumed = coding->src_bytes;
6076       coding->consumed_char = coding->src_chars;
6077     }
6078
6079   produced = dst - (coding->destination + coding->produced);
6080   if (BUFFERP (coding->dst_object) && produced_chars > 0)
6081     insert_from_gap (produced_chars, produced);
6082   coding->produced += produced;
6083   coding->produced_char += produced_chars;
6084   return carryover;
6085 }
6086
6087 /* Compose text in CODING->object according to the annotation data at
6088    CHARBUF.  CHARBUF is an array:
6089      [ -LENGTH ANNOTATION_MASK FROM TO METHOD COMP_LEN [ COMPONENTS... ] ]
6090  */
6091
6092 static INLINE void
6093 produce_composition (coding, charbuf, pos)
6094      struct coding_system *coding;
6095      int *charbuf;
6096      EMACS_INT pos;
6097 {
6098   int len;
6099   EMACS_INT to;
6100   enum composition_method method;
6101   Lisp_Object components;
6102
6103   len = -charbuf[0];
6104   to = pos + charbuf[2];
6105   if (to <= pos)
6106     return;
6107   method = (enum composition_method) (charbuf[3]);
6108
6109   if (method == COMPOSITION_RELATIVE)
6110     components = Qnil;
6111   else if (method >= COMPOSITION_WITH_RULE
6112            && method <= COMPOSITION_WITH_RULE_ALTCHARS)
6113     {
6114       Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
6115       int i;
6116
6117       len -= 4;
6118       charbuf += 4;
6119       for (i = 0; i < len; i++)
6120         {
6121           args[i] = make_number (charbuf[i]);
6122           if (charbuf[i] < 0)
6123             return;
6124         }
6125       components = (method == COMPOSITION_WITH_ALTCHARS
6126                     ? Fstring (len, args) : Fvector (len, args));
6127     }
6128   else
6129     return;
6130   compose_text (pos, to, components, Qnil, coding->dst_object);
6131 }
6132
6133
6134 /* Put `charset' property on text in CODING->object according to
6135    the annotation data at CHARBUF.  CHARBUF is an array:
6136      [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ]
6137  */
6138
6139 static INLINE void
6140 produce_charset (coding, charbuf, pos)
6141      struct coding_system *coding;
6142      int *charbuf;
6143      EMACS_INT pos;
6144 {
6145   EMACS_INT from = pos - charbuf[2];
6146   struct charset *charset = CHARSET_FROM_ID (charbuf[3]);
6147
6148   Fput_text_property (make_number (from), make_number (pos),
6149                       Qcharset, CHARSET_NAME (charset),
6150                       coding->dst_object);
6151 }
6152
6153
/* The number of elements first tried for CODING->charbuf.  */
#define CHARBUF_SIZE 0x4000

/* Allocate the work area CODING->charbuf with alloca and record its
   number of elements in CODING->charbuf_size.  CHARBUF_SIZE elements
   are tried first; the size is halved on each failure as long as it
   is greater than 1024.

   As the memory comes from alloca, it is valid only until the
   function that uses this macro returns.

   Note that on failure this macro records
   CODING_RESULT_INSUFFICIENT_MEM and executes `return CODING->result'
   in the calling function, so it can be used only in a function
   returning a compatible type.  */

#define ALLOC_CONVERSION_WORK_AREA(coding)                              \
  do {                                                                  \
    int size = CHARBUF_SIZE;                                            \
                                                                        \
    (coding)->charbuf = NULL;                                           \
    while (size > 1024)                                                 \
      {                                                                 \
        (coding)->charbuf = (int *) alloca (sizeof (int) * size);       \
        if ((coding)->charbuf)                                          \
          break;                                                        \
        size >>= 1;                                                     \
      }                                                                 \
    if (! (coding)->charbuf)                                            \
      {                                                                 \
        record_conversion_result ((coding),                             \
                                  CODING_RESULT_INSUFFICIENT_MEM);      \
        return (coding)->result;                                        \
      }                                                                 \
    (coding)->charbuf_size = size;                                      \
  } while (0)
6175
6176
6177 static void
6178 produce_annotation (coding, pos)
6179      struct coding_system *coding;
6180      EMACS_INT pos;
6181 {
6182   int *charbuf = coding->charbuf;
6183   int *charbuf_end = charbuf + coding->charbuf_used;
6184
6185   if (NILP (coding->dst_object))
6186     return;
6187
6188   while (charbuf < charbuf_end)
6189     {
6190       if (*charbuf >= 0)
6191         pos += *charbuf++;
6192       else
6193         {
6194           int len = -*charbuf;
6195           switch (charbuf[1])
6196             {
6197             case CODING_ANNOTATE_COMPOSITION_MASK:
6198               produce_composition (coding, charbuf, pos);
6199               break;
6200             case CODING_ANNOTATE_CHARSET_MASK:
6201               produce_charset (coding, charbuf, pos);
6202               break;
6203             default:
6204               abort ();
6205             }
6206           charbuf += len;
6207         }
6208     }
6209 }
6210
6211 /* Decode the data at CODING->src_object into CODING->dst_object.
6212    CODING->src_object is a buffer, a string, or nil.
6213    CODING->dst_object is a buffer.
6214
6215    If CODING->src_object is a buffer, it must be the current buffer.
6216    In this case, if CODING->src_pos is positive, it is a position of
6217    the source text in the buffer, otherwise, the source text is in the
6218    gap area of the buffer, and CODING->src_pos specifies the offset of
6219    the text from GPT (which must be the same as PT).  If this is the
6220    same buffer as CODING->dst_object, CODING->src_pos must be
6221    negative.
6222
6223    If CODING->src_object is a string, CODING->src_pos is an index to
6224    that string.
6225
6226    If CODING->src_object is nil, CODING->source must already point to
6227    the non-relocatable memory area.  In this case, CODING->src_pos is
6228    an offset from CODING->source.
6229
6230    The decoded data is inserted at the current point of the buffer
6231    CODING->dst_object.
6232 */
6233
6234 static int
6235 decode_coding (coding)
6236      struct coding_system *coding;
6237 {
6238   Lisp_Object attrs;
6239   Lisp_Object undo_list;
6240   Lisp_Object translation_table;
6241   int carryover;
6242   int i;
6243
6244   if (BUFFERP (coding->src_object)
6245       && coding->src_pos > 0
6246       && coding->src_pos < GPT
6247       && coding->src_pos + coding->src_chars > GPT)
6248     move_gap_both (coding->src_pos, coding->src_pos_byte);
6249
6250   undo_list = Qt;
6251   if (BUFFERP (coding->dst_object))
6252     {
6253       if (current_buffer != XBUFFER (coding->dst_object))
6254         set_buffer_internal (XBUFFER (coding->dst_object));
6255       if (GPT != PT)
6256         move_gap_both (PT, PT_BYTE);
6257       undo_list = current_buffer->undo_list;
6258       current_buffer->undo_list = Qt;
6259     }
6260
6261   coding->consumed = coding->consumed_char = 0;
6262   coding->produced = coding->produced_char = 0;
6263   coding->chars_at_source = 0;
6264   record_conversion_result (coding, CODING_RESULT_SUCCESS);
6265   coding->errors = 0;
6266
6267   ALLOC_CONVERSION_WORK_AREA (coding);
6268
6269   attrs = CODING_ID_ATTRS (coding->id);
6270   translation_table = get_translation_table (attrs, 0, NULL);
6271
6272   carryover = 0;
6273   do
6274     {
6275       EMACS_INT pos = coding->dst_pos + coding->produced_char;
6276
6277       coding_set_source (coding);
6278       coding->annotated = 0;
6279       coding->charbuf_used = carryover;
6280       (*(coding->decoder)) (coding);
6281       coding_set_destination (coding);
6282       carryover = produce_chars (coding, translation_table, 0);
6283       if (coding->annotated)
6284         produce_annotation (coding, pos);
6285       for (i = 0; i < carryover; i++)
6286         coding->charbuf[i]
6287           = coding->charbuf[coding->charbuf_used - carryover + i];
6288     }
6289   while (coding->consumed < coding->src_bytes
6290          && (coding->result == CODING_RESULT_SUCCESS
6291              || coding->result == CODING_RESULT_INVALID_SRC));
6292
6293   if (carryover > 0)
6294     {
6295       coding_set_destination (coding);
6296       coding->charbuf_used = carryover;
6297       produce_chars (coding, translation_table, 1);
6298     }
6299
6300   coding->carryover_bytes = 0;
6301   if (coding->consumed < coding->src_bytes)
6302     {
6303       int nbytes = coding->src_bytes - coding->consumed;
6304       const unsigned char *src;
6305
6306       coding_set_source (coding);
6307       coding_set_destination (coding);
6308       src = coding->source + coding->consumed;
6309
6310       if (coding->mode & CODING_MODE_LAST_BLOCK)
6311         {
6312           /* Flush out unprocessed data as binary chars.  We are sure
6313              that the number of data is less than the size of
6314              coding->charbuf.  */
6315           coding->charbuf_used = 0;
6316           while (nbytes-- > 0)
6317             {
6318               int c = *src++;
6319
6320               if (c & 0x80)
6321                 c = BYTE8_TO_CHAR (c);
6322               coding->charbuf[coding->charbuf_used++] = c;
6323             }
6324           produce_chars (coding, Qnil, 1);
6325         }
6326       else
6327         {
6328           /* Record unprocessed bytes in coding->carryover.  We are
6329              sure that the number of data is less than the size of
6330              coding->carryover.  */
6331           unsigned char *p = coding->carryover;
6332
6333           coding->carryover_bytes = nbytes;
6334           while (nbytes-- > 0)
6335             *p++ = *src++;
6336         }
6337       coding->consumed = coding->src_bytes;
6338     }
6339
6340   if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix))
6341     decode_eol (coding);
6342   if (BUFFERP (coding->dst_object))
6343     {
6344       current_buffer->undo_list = undo_list;
6345       record_insert (coding->dst_pos, coding->produced_char);
6346     }
6347   return coding->result;
6348 }
6349
6350
6351 /* Extract an annotation datum from a composition starting at POS and
6352    ending before LIMIT of CODING->src_object (buffer or string), store
6353    the data in BUF, set *STOP to a starting position of the next
6354    composition (if any) or to LIMIT, and return the address of the
6355    next element of BUF.
6356
6357    If such an annotation is not found, set *STOP to a starting
6358    position of a composition after POS (if any) or to LIMIT, and
6359    return BUF.  */
6360
6361 static INLINE int *
6362 handle_composition_annotation (pos, limit, coding, buf, stop)
6363      EMACS_INT pos, limit;
6364      struct coding_system *coding;
6365      int *buf;
6366      EMACS_INT *stop;
6367 {
6368   EMACS_INT start, end;
6369   Lisp_Object prop;
6370
6371   if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
6372       || end > limit)
6373     *stop = limit;
6374   else if (start > pos)
6375     *stop = start;
6376   else
6377     {
6378       if (start == pos)
6379         {
6380           /* We found a composition.  Store the corresponding
6381              annotation data in BUF.  */
6382           int *head = buf;
6383           enum composition_method method = COMPOSITION_METHOD (prop);
6384           int nchars = COMPOSITION_LENGTH (prop);
6385
6386           ADD_COMPOSITION_DATA (buf, nchars, method);
6387           if (method != COMPOSITION_RELATIVE)
6388             {
6389               Lisp_Object components;
6390               int len, i, i_byte;
6391
6392               components = COMPOSITION_COMPONENTS (prop);
6393               if (VECTORP (components))
6394                 {
6395                   len = XVECTOR (components)->size;
6396                   for (i = 0; i < len; i++)
6397                     *buf++ = XINT (AREF (components, i));
6398                 }
6399               else if (STRINGP (components))
6400                 {
6401                   len = SCHARS (components);
6402                   i = i_byte = 0;
6403                   while (i < len)
6404                     {
6405                       FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
6406                       buf++;
6407                     }
6408                 }
6409               else if (INTEGERP (components))
6410                 {
6411                   len = 1;
6412                   *buf++ = XINT (components);
6413                 }
6414               else if (CONSP (components))
6415                 {
6416                   for (len = 0; CONSP (components);
6417                        len++, components = XCDR (components))
6418                     *buf++ = XINT (XCAR (components));
6419                 }
6420               else
6421                 abort ();
6422               *head -= len;
6423             }
6424         }
6425
6426       if (find_composition (end, limit, &start, &end, &prop,
6427                             coding->src_object)
6428           && end <= limit)
6429         *stop = start;
6430       else
6431         *stop = limit;
6432     }
6433   return buf;
6434 }
6435
6436
6437 /* Extract an annotation datum from a text property `charset' at POS of
6438    CODING->src_object (buffer of string), store the data in BUF, set
6439    *STOP to the position where the value of `charset' property changes
6440    (limiting by LIMIT), and return the address of the next element of
6441    BUF.
6442
6443    If the property value is nil, set *STOP to the position where the
6444    property value is non-nil (limiting by LIMIT), and return BUF.  */
6445
6446 static INLINE int *
6447 handle_charset_annotation (pos, limit, coding, buf, stop)
6448      EMACS_INT pos, limit;
6449      struct coding_system *coding;
6450      int *buf;
6451      EMACS_INT *stop;
6452 {
6453   Lisp_Object val, next;
6454   int id;
6455
6456   val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
6457   if (! NILP (val) && CHARSETP (val))
6458     id = XINT (CHARSET_SYMBOL_ID (val));
6459   else
6460     id = -1;
6461   ADD_CHARSET_DATA (buf, 0, id);
6462   next = Fnext_single_property_change (make_number (pos), Qcharset,
6463                                        coding->src_object,
6464                                        make_number (limit));
6465   *stop = XINT (next);
6466   return buf;
6467 }
6468
6469
6470 static void
6471 consume_chars (coding, translation_table, max_lookup)
6472      struct coding_system *coding;
6473      Lisp_Object translation_table;
6474      int max_lookup;
6475 {
6476   int *buf = coding->charbuf;
6477   int *buf_end = coding->charbuf + coding->charbuf_size;
6478   const unsigned char *src = coding->source + coding->consumed;
6479   const unsigned char *src_end = coding->source + coding->src_bytes;
6480   EMACS_INT pos = coding->src_pos + coding->consumed_char;
6481   EMACS_INT end_pos = coding->src_pos + coding->src_chars;
6482   int multibytep = coding->src_multibyte;
6483   Lisp_Object eol_type;
6484   int c;
6485   EMACS_INT stop, stop_composition, stop_charset;
6486   int *lookup_buf = NULL;
6487
6488   if (! NILP (translation_table))
6489     lookup_buf = alloca (sizeof (int) * max_lookup);
6490
6491   eol_type = CODING_ID_EOL_TYPE (coding->id);
6492   if (VECTORP (eol_type))
6493     eol_type = Qunix;
6494
6495   /* Note: composition handling is not yet implemented.  */
6496   coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
6497
6498   if (NILP (coding->src_object))
6499     stop = stop_composition = stop_charset = end_pos;
6500   else
6501     {
6502       if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
6503         stop = stop_composition = pos;
6504       else
6505         stop = stop_composition = end_pos;
6506       if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
6507         stop = stop_charset = pos;
6508       else
6509         stop_charset = end_pos;
6510     }
6511
6512   /* Compensate for CRLF and conversion.  */
6513   buf_end -= 1 + MAX_ANNOTATION_LENGTH;
6514   while (buf < buf_end)
6515     {
6516       Lisp_Object trans;
6517
6518       if (pos == stop)
6519         {
6520           if (pos == end_pos)
6521             break;
6522           if (pos == stop_composition)
6523             buf = handle_composition_annotation (pos, end_pos, coding,
6524                                                  buf, &stop_composition);
6525           if (pos == stop_charset)
6526             buf = handle_charset_annotation (pos, end_pos, coding,
6527                                              buf, &stop_charset);
6528           stop = (stop_composition < stop_charset
6529                   ? stop_composition : stop_charset);
6530         }
6531
6532       if (! multibytep)
6533         {
6534           EMACS_INT bytes;
6535
6536           if (coding->encoder == encode_coding_raw_text)
6537             c = *src++, pos++;
6538           else if ((bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
6539             c = STRING_CHAR_ADVANCE (src), pos += bytes;
6540           else
6541             c = BYTE8_TO_CHAR (*src), src++, pos++;
6542         }
6543       else
6544         c = STRING_CHAR_ADVANCE (src), pos++;
6545       if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
6546         c = '\n';
6547       if (! EQ (eol_type, Qunix))
6548         {
6549           if (c == '\n')
6550             {
6551               if (EQ (eol_type, Qdos))
6552                 *buf++ = '\r';
6553               else
6554                 c = '\r';
6555             }
6556         }
6557
6558       trans = Qnil;
6559       LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
6560       if (NILP (trans))
6561         *buf++ = c;
6562       else
6563         {
6564           int from_nchars = 1, to_nchars = 1;
6565           int *lookup_buf_end;
6566           const unsigned char *p = src;
6567           int i;
6568
6569           lookup_buf[0] = c;
6570           for (i = 1; i < max_lookup && p < src_end; i++)
6571             lookup_buf[i] = STRING_CHAR_ADVANCE (p);
6572           lookup_buf_end = lookup_buf + i;
6573           trans = get_translation (trans, lookup_buf, lookup_buf_end, 1,
6574                                    &from_nchars, &to_nchars);
6575           if (EQ (trans, Qt)
6576               || buf + to_nchars > buf_end)
6577             break;
6578           *buf++ = *lookup_buf;
6579           for (i = 1; i < to_nchars; i++)
6580             *buf++ = XINT (AREF (trans, i));
6581           for (i = 1; i < from_nchars; i++, pos++)
6582             src += MULTIBYTE_LENGTH_NO_CHECK (src);
6583         }
6584     }
6585
6586   coding->consumed = src - coding->source;
6587   coding->consumed_char = pos - coding->src_pos;
6588   coding->charbuf_used = buf - coding->charbuf;
6589   coding->chars_at_source = 0;
6590 }
6591
6592
6593 /* Encode the text at CODING->src_object into CODING->dst_object.
6594    CODING->src_object is a buffer or a string.
6595    CODING->dst_object is a buffer or nil.
6596
6597    If CODING->src_object is a buffer, it must be the current buffer.
6598    In this case, if CODING->src_pos is positive, it is a position of
6599    the source text in the buffer, otherwise. the source text is in the
6600    gap area of the buffer, and coding->src_pos specifies the offset of
6601    the text from GPT (which must be the same as PT).  If this is the
6602    same buffer as CODING->dst_object, CODING->src_pos must be
6603    negative and CODING should not have `pre-write-conversion'.
6604
6605    If CODING->src_object is a string, CODING should not have
6606    `pre-write-conversion'.
6607
6608    If CODING->dst_object is a buffer, the encoded data is inserted at
6609    the current point of that buffer.
6610
6611    If CODING->dst_object is nil, the encoded data is placed at the
6612    memory area specified by CODING->destination.  */
6613
6614 static int
6615 encode_coding (coding)
6616      struct coding_system *coding;
6617 {
6618   Lisp_Object attrs;
6619   Lisp_Object translation_table;
6620   int max_lookup;
6621
6622   attrs = CODING_ID_ATTRS (coding->id);
6623   if (coding->encoder == encode_coding_raw_text)
6624     translation_table = Qnil, max_lookup = 0;
6625   else
6626     translation_table = get_translation_table (attrs, 1, &max_lookup);
6627
6628   if (BUFFERP (coding->dst_object))
6629     {
6630       set_buffer_internal (XBUFFER (coding->dst_object));
6631       coding->dst_multibyte
6632         = ! NILP (current_buffer->enable_multibyte_characters);
6633     }
6634
6635   coding->consumed = coding->consumed_char = 0;
6636   coding->produced = coding->produced_char = 0;
6637   record_conversion_result (coding, CODING_RESULT_SUCCESS);
6638   coding->errors = 0;
6639
6640   ALLOC_CONVERSION_WORK_AREA (coding);
6641
6642   do {
6643     coding_set_source (coding);
6644     consume_chars (coding, translation_table, max_lookup);
6645     coding_set_destination (coding);
6646     (*(coding->encoder)) (coding);
6647   } while (coding->consumed_char < coding->src_chars);
6648
6649   if (BUFFERP (coding->dst_object) && coding->produced_char > 0)
6650     insert_from_gap (coding->produced_char, coding->produced);
6651
6652   return (coding->result);
6653 }
6654
6655
/* Name (or base name) of work buffer for code conversion.  */
static Lisp_Object Vcode_conversion_workbuf_name;

/* A working buffer used by the top level conversion.  Once it is
   created, it is never destroyed.  It has the name
   Vcode_conversion_workbuf_name.  The other working buffers (those
   needed while this one is in use) are destroyed after the use is
   finished, and their names are modified versions of
   Vcode_conversion_workbuf_name.  */
static Lisp_Object Vcode_conversion_reused_workbuf;

/* Nonzero iff Vcode_conversion_reused_workbuf is already in use.
   make_conversion_work_buffer marks it as in use, and
   code_conversion_restore resets it to zero when the reused buffer
   is released.  */
static int reused_workbuf_in_use;
6668
6669
6670 /* Return a working buffer of code convesion.  MULTIBYTE specifies the
6671    multibyteness of returning buffer.  */
6672
6673 static Lisp_Object
6674 make_conversion_work_buffer (multibyte)
6675      int multibyte;
6676 {
6677   Lisp_Object name, workbuf;
6678   struct buffer *current;
6679
6680   if (reused_workbuf_in_use++)
6681     {
6682       name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
6683       workbuf = Fget_buffer_create (name);
6684     }
6685   else
6686     {
6687       name = Vcode_conversion_workbuf_name;
6688       workbuf = Fget_buffer_create (name);
6689       if (NILP (Vcode_conversion_reused_workbuf))
6690         Vcode_conversion_reused_workbuf = workbuf;
6691     }
6692   current = current_buffer;
6693   set_buffer_internal (XBUFFER (workbuf));
6694   Ferase_buffer ();
6695   current_buffer->undo_list = Qt;
6696   current_buffer->enable_multibyte_characters = multibyte ? Qt : Qnil;
6697   set_buffer_internal (current);
6698   return workbuf;
6699 }
6700
6701
6702 static Lisp_Object
6703 code_conversion_restore (arg)
6704      Lisp_Object arg;
6705 {
6706   Lisp_Object current, workbuf;
6707   struct gcpro gcpro1;
6708
6709   GCPRO1 (arg);
6710   current = XCAR (arg);
6711   workbuf = XCDR (arg);
6712   if (! NILP (workbuf))
6713     {
6714       if (EQ (workbuf, Vcode_conversion_reused_workbuf))
6715         reused_workbuf_in_use = 0;
6716       else if (! NILP (Fbuffer_live_p (workbuf)))
6717         Fkill_buffer (workbuf);
6718     }
6719   set_buffer_internal (XBUFFER (current));
6720   UNGCPRO;
6721   return Qnil;
6722 }
6723
6724 Lisp_Object
6725 code_conversion_save (with_work_buf, multibyte)
6726      int with_work_buf, multibyte;
6727 {
6728   Lisp_Object workbuf = Qnil;
6729
6730   if (with_work_buf)
6731     workbuf = make_conversion_work_buffer (multibyte);
6732   record_unwind_protect (code_conversion_restore,
6733                          Fcons (Fcurrent_buffer (), workbuf));
6734   return workbuf;
6735 }
6736
6737 int
6738 decode_coding_gap (coding, chars, bytes)
6739      struct coding_system *coding;
6740      EMACS_INT chars, bytes;
6741 {
6742   int count = specpdl_ptr - specpdl;
6743   Lisp_Object attrs;
6744
6745   code_conversion_save (0, 0);
6746
6747   coding->src_object = Fcurrent_buffer ();
6748   coding->src_chars = chars;
6749   coding->src_bytes = bytes;
6750   coding->src_pos = -chars;
6751   coding->src_pos_byte = -bytes;
6752   coding->src_multibyte = chars < bytes;
6753   coding->dst_object = coding->src_object;
6754   coding->dst_pos = PT;
6755   coding->dst_pos_byte = PT_BYTE;
6756   coding->dst_multibyte = ! NILP (current_buffer->enable_multibyte_characters);
6757
6758   if (CODING_REQUIRE_DETECTION (coding))
6759     detect_coding (coding);
6760
6761   coding->mode |= CODING_MODE_LAST_BLOCK;
6762   current_buffer->text->inhibit_shrinking = 1;
6763   decode_coding (coding);
6764   current_buffer->text->inhibit_shrinking = 0;
6765
6766   attrs = CODING_ID_ATTRS (coding->id);
6767   if (! NILP (CODING_ATTR_POST_READ (attrs)))
6768     {
6769       EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
6770       Lisp_Object val;
6771
6772       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
6773       val = call1 (CODING_ATTR_POST_READ (attrs),
6774                    make_number (coding->produced_char));
6775       CHECK_NATNUM (val);
6776       coding->produced_char += Z - prev_Z;
6777       coding->produced += Z_BYTE - prev_Z_BYTE;
6778     }
6779
6780   unbind_to (count, Qnil);
6781   return coding->result;
6782 }
6783
6784 int
6785 encode_coding_gap (coding, chars, bytes)
6786      struct coding_system *coding;
6787      EMACS_INT chars, bytes;
6788 {
6789   int count = specpdl_ptr - specpdl;
6790
6791   code_conversion_save (0, 0);
6792
6793   coding->src_object = Fcurrent_buffer ();
6794   coding->src_chars = chars;
6795   coding->src_bytes = bytes;
6796   coding->src_pos = -chars;
6797   coding->src_pos_byte = -bytes;
6798   coding->src_multibyte = chars < bytes;
6799   coding->dst_object = coding->src_object;
6800   coding->dst_pos = PT;
6801   coding->dst_pos_byte = PT_BYTE;
6802
6803   encode_coding (coding);
6804
6805   unbind_to (count, Qnil);
6806   return coding->result;
6807 }
6808
6809
6810 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
6811    SRC_OBJECT into DST_OBJECT by coding context CODING.
6812
6813    SRC_OBJECT is a buffer, a string, or Qnil.
6814
6815    If it is a buffer, the text is at point of the buffer.  FROM and TO
6816    are positions in the buffer.
6817
6818    If it is a string, the text is at the beginning of the string.
6819    FROM and TO are indices to the string.
6820
6821    If it is nil, the text is at coding->source.  FROM and TO are
6822    indices to coding->source.
6823
6824    DST_OBJECT is a buffer, Qt, or Qnil.
6825
6826    If it is a buffer, the decoded text is inserted at point of the
6827    buffer.  If the buffer is the same as SRC_OBJECT, the source text
6828    is deleted.
6829
6830    If it is Qt, a string is made from the decoded text, and
6831    set in CODING->dst_object.
6832
6833    If it is Qnil, the decoded text is stored at CODING->destination.
6834    The caller must allocate CODING->dst_bytes bytes at
6835    CODING->destination by xmalloc.  If the decoded text is longer than
6836    CODING->dst_bytes, CODING->destination is relocated by xrealloc.
6837  */
6838
6839 void
6840 decode_coding_object (coding, src_object, from, from_byte, to, to_byte,
6841                       dst_object)
6842      struct coding_system *coding;
6843      Lisp_Object src_object;
6844      EMACS_INT from, from_byte, to, to_byte;
6845      Lisp_Object dst_object;
6846 {
6847   int count = specpdl_ptr - specpdl;
6848   unsigned char *destination;
6849   EMACS_INT dst_bytes;
6850   EMACS_INT chars = to - from;
6851   EMACS_INT bytes = to_byte - from_byte;
6852   Lisp_Object attrs;
6853   Lisp_Object buffer;
6854   int saved_pt = -1, saved_pt_byte;
6855
6856   buffer = Fcurrent_buffer ();
6857
6858   if (NILP (dst_object))
6859     {
6860       destination = coding->destination;
6861       dst_bytes = coding->dst_bytes;
6862     }
6863
6864   coding->src_object = src_object;
6865   coding->src_chars = chars;
6866   coding->src_bytes = bytes;
6867   coding->src_multibyte = chars < bytes;
6868
6869   if (STRINGP (src_object))
6870     {
6871       coding->src_pos = from;
6872       coding->src_pos_byte = from_byte;
6873     }
6874   else if (BUFFERP (src_object))
6875     {
6876       set_buffer_internal (XBUFFER (src_object));
6877       if (from != GPT)
6878         move_gap_both (from, from_byte);
6879       if (EQ (src_object, dst_object))
6880         {
6881           saved_pt = PT, saved_pt_byte = PT_BYTE;
6882           TEMP_SET_PT_BOTH (from, from_byte);
6883           del_range_both (from, from_byte, to, to_byte, 1);
6884           coding->src_pos = -chars;
6885           coding->src_pos_byte = -bytes;
6886         }
6887       else
6888         {
6889           coding->src_pos = from;
6890           coding->src_pos_byte = from_byte;
6891         }
6892     }
6893
6894   if (CODING_REQUIRE_DETECTION (coding))
6895     detect_coding (coding);
6896   attrs = CODING_ID_ATTRS (coding->id);
6897
6898   if (EQ (dst_object, Qt)
6899       || (! NILP (CODING_ATTR_POST_READ (attrs))
6900           && NILP (dst_object)))
6901     {
6902       coding->dst_object = code_conversion_save (1, 1);
6903       coding->dst_pos = BEG;
6904       coding->dst_pos_byte = BEG_BYTE;
6905       coding->dst_multibyte = 1;
6906     }
6907   else if (BUFFERP (dst_object))
6908     {
6909       code_conversion_save (0, 0);
6910       coding->dst_object = dst_object;
6911       coding->dst_pos = BUF_PT (XBUFFER (dst_object));
6912       coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
6913       coding->dst_multibyte
6914         = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
6915     }
6916   else
6917     {
6918       code_conversion_save (0, 0);
6919       coding->dst_object = Qnil;
6920       coding->dst_multibyte = 1;
6921     }
6922
6923   decode_coding (coding);
6924
6925   if (BUFFERP (coding->dst_object))
6926     set_buffer_internal (XBUFFER (coding->dst_object));
6927
6928   if (! NILP (CODING_ATTR_POST_READ (attrs)))
6929     {
6930       struct gcpro gcpro1, gcpro2;
6931       EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
6932       Lisp_Object val;
6933
6934       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
6935       GCPRO2 (coding->src_object, coding->dst_object);
6936       val = safe_call1 (CODING_ATTR_POST_READ (attrs),
6937                         make_number (coding->produced_char));
6938       UNGCPRO;
6939       CHECK_NATNUM (val);
6940       coding->produced_char += Z - prev_Z;
6941       coding->produced += Z_BYTE - prev_Z_BYTE;
6942     }
6943
6944   if (EQ (dst_object, Qt))
6945     {
6946       coding->dst_object = Fbuffer_string ();
6947     }
6948   else if (NILP (dst_object) && BUFFERP (coding->dst_object))
6949     {
6950       set_buffer_internal (XBUFFER (coding->dst_object));
6951       if (dst_bytes < coding->produced)
6952         {
6953           destination
6954             = (unsigned char *) xrealloc (destination, coding->produced);
6955           if (! destination)
6956             {
6957               record_conversion_result (coding,
6958                                         CODING_RESULT_INSUFFICIENT_DST);
6959               unbind_to (count, Qnil);
6960               return;
6961             }
6962           if (BEGV < GPT && GPT < BEGV + coding->produced_char)
6963             move_gap_both (BEGV, BEGV_BYTE);
6964           bcopy (BEGV_ADDR, destination, coding->produced);
6965           coding->destination = destination;
6966         }
6967     }
6968
6969   if (saved_pt >= 0)
6970     {
6971       /* This is the case of:
6972          (BUFFERP (src_object) && EQ (src_object, dst_object))
6973          As we have moved PT while replacing the original buffer
6974          contents, we must recover it now.  */
6975       set_buffer_internal (XBUFFER (src_object));
6976       if (saved_pt < from)
6977         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
6978       else if (saved_pt < from + chars)
6979         TEMP_SET_PT_BOTH (from, from_byte);
6980       else if (! NILP (current_buffer->enable_multibyte_characters))
6981         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
6982                           saved_pt_byte + (coding->produced - bytes));
6983       else
6984         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
6985                           saved_pt_byte + (coding->produced - bytes));
6986     }
6987
6988   unbind_to (count, coding->dst_object);
6989 }
6990
6991
6992 void
6993 encode_coding_object (coding, src_object, from, from_byte, to, to_byte,
6994                       dst_object)
6995      struct coding_system *coding;
6996      Lisp_Object src_object;
6997      EMACS_INT from, from_byte, to, to_byte;
6998      Lisp_Object dst_object;
6999 {
7000   int count = specpdl_ptr - specpdl;
7001   EMACS_INT chars = to - from;
7002   EMACS_INT bytes = to_byte - from_byte;
7003   Lisp_Object attrs;
7004   Lisp_Object buffer;
7005   int saved_pt = -1, saved_pt_byte;
7006   int kill_src_buffer = 0;
7007
7008   buffer = Fcurrent_buffer ();
7009
7010   coding->src_object = src_object;
7011   coding->src_chars = chars;
7012   coding->src_bytes = bytes;
7013   coding->src_multibyte = chars < bytes;
7014
7015   attrs = CODING_ID_ATTRS (coding->id);
7016
7017   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
7018     {
7019       coding->src_object = code_conversion_save (1, coding->src_multibyte);
7020       set_buffer_internal (XBUFFER (coding->src_object));
7021       if (STRINGP (src_object))
7022         insert_from_string (src_object, from, from_byte, chars, bytes, 0);
7023       else if (BUFFERP (src_object))
7024         insert_from_buffer (XBUFFER (src_object), from, chars, 0);
7025       else
7026         insert_1_both (coding->source + from, chars, bytes, 0, 0, 0);
7027
7028       if (EQ (src_object, dst_object))
7029         {
7030           set_buffer_internal (XBUFFER (src_object));
7031           saved_pt = PT, saved_pt_byte = PT_BYTE;
7032           del_range_both (from, from_byte, to, to_byte, 1);
7033           set_buffer_internal (XBUFFER (coding->src_object));
7034         }
7035
7036       {
7037         Lisp_Object args[3];
7038
7039         args[0] = CODING_ATTR_PRE_WRITE (attrs);
7040         args[1] = make_number (BEG);
7041         args[2] = make_number (Z);
7042         safe_call (3, args);
7043       }
7044       if (XBUFFER (coding->src_object) != current_buffer)
7045         kill_src_buffer = 1;
7046       coding->src_object = Fcurrent_buffer ();
7047       if (BEG != GPT)
7048         move_gap_both (BEG, BEG_BYTE);
7049       coding->src_chars = Z - BEG;
7050       coding->src_bytes = Z_BYTE - BEG_BYTE;
7051       coding->src_pos = BEG;
7052       coding->src_pos_byte = BEG_BYTE;
7053       coding->src_multibyte = Z < Z_BYTE;
7054     }
7055   else if (STRINGP (src_object))
7056     {
7057       code_conversion_save (0, 0);
7058       coding->src_pos = from;
7059       coding->src_pos_byte = from_byte;
7060     }
7061   else if (BUFFERP (src_object))
7062     {
7063       code_conversion_save (0, 0);
7064       set_buffer_internal (XBUFFER (src_object));
7065       if (EQ (src_object, dst_object))
7066         {
7067           saved_pt = PT, saved_pt_byte = PT_BYTE;
7068           coding->src_object = del_range_1 (from, to, 1, 1);
7069           coding->src_pos = 0;
7070           coding->src_pos_byte = 0;
7071         }
7072       else
7073         {
7074           if (from < GPT && to >= GPT)
7075             move_gap_both (from, from_byte);
7076           coding->src_pos = from;
7077           coding->src_pos_byte = from_byte;
7078         }
7079     }
7080   else
7081     code_conversion_save (0, 0);
7082
7083   if (BUFFERP (dst_object))
7084     {
7085       coding->dst_object = dst_object;
7086       if (EQ (src_object, dst_object))
7087         {
7088           coding->dst_pos = from;
7089           coding->dst_pos_byte = from_byte;
7090         }
7091       else
7092         {
7093           coding->dst_pos = BUF_PT (XBUFFER (dst_object));
7094           coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
7095         }
7096       coding->dst_multibyte
7097         = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
7098     }
7099   else if (EQ (dst_object, Qt))
7100     {
7101       coding->dst_object = Qnil;
7102       coding->dst_bytes = coding->src_chars;
7103       if (coding->dst_bytes == 0)
7104         coding->dst_bytes = 1;
7105       coding->destination = (unsigned char *) xmalloc (coding->dst_bytes);
7106       coding->dst_multibyte = 0;
7107     }
7108   else
7109     {
7110       coding->dst_object = Qnil;
7111       coding->dst_multibyte = 0;
7112     }
7113
7114   encode_coding (coding);
7115
7116   if (EQ (dst_object, Qt))
7117     {
7118       if (BUFFERP (coding->dst_object))
7119         coding->dst_object = Fbuffer_string ();
7120       else
7121         {
7122           coding->dst_object
7123             = make_unibyte_string ((char *) coding->destination,
7124                                    coding->produced);
7125           xfree (coding->destination);
7126         }
7127     }
7128
7129   if (saved_pt >= 0)
7130     {
7131       /* This is the case of:
7132          (BUFFERP (src_object) && EQ (src_object, dst_object))
7133          As we have moved PT while replacing the original buffer
7134          contents, we must recover it now.  */
7135       set_buffer_internal (XBUFFER (src_object));
7136       if (saved_pt < from)
7137         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
7138       else if (saved_pt < from + chars)
7139         TEMP_SET_PT_BOTH (from, from_byte);
7140       else if (! NILP (current_buffer->enable_multibyte_characters))
7141         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
7142                           saved_pt_byte + (coding->produced - bytes));
7143       else
7144         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
7145                           saved_pt_byte + (coding->produced - bytes));
7146     }
7147
7148   if (kill_src_buffer)
7149     Fkill_buffer (coding->src_object);
7150   unbind_to (count, Qnil);
7151 }
7152
7153
7154 Lisp_Object
7155 preferred_coding_system ()
7156 {
7157   int id = coding_categories[coding_priorities[0]].id;
7158
7159   return CODING_ID_NAME (id);
7160 }
7161
7162 \f
7163 #ifdef emacs
7164 /*** 11. Emacs Lisp library functions ***/
7165
7166 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
7167        doc: /* Return t if OBJECT is nil or a coding-system.
7168 See the documentation of `define-coding-system' for information
7169 about coding-system objects.  */)
7170      (obj)
7171      Lisp_Object obj;
7172 {
7173   if (NILP (obj)
7174       || CODING_SYSTEM_ID (obj) >= 0)
7175     return Qt;
7176   if (! SYMBOLP (obj)
7177       || NILP (Fget (obj, Qcoding_system_define_form)))
7178     return Qnil;
7179   return Qt;
7180 }
7181
7182 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
7183        Sread_non_nil_coding_system, 1, 1, 0,
7184        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.  */)
7185      (prompt)
7186      Lisp_Object prompt;
7187 {
7188   Lisp_Object val;
7189   do
7190     {
7191       val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
7192                               Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
7193     }
7194   while (SCHARS (val) == 0);
7195   return (Fintern (val, Qnil));
7196 }
7197
7198 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
7199        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
7200 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.  */)
7201      (prompt, default_coding_system)
7202      Lisp_Object prompt, default_coding_system;
7203 {
7204   Lisp_Object val;
7205   if (SYMBOLP (default_coding_system))
7206     XSETSTRING (default_coding_system, XPNTR (SYMBOL_NAME (default_coding_system)));
7207   val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
7208                           Qt, Qnil, Qcoding_system_history,
7209                           default_coding_system, Qnil);
7210   return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
7211 }
7212
7213 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
7214        1, 1, 0,
7215        doc: /* Check validity of CODING-SYSTEM.
7216 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
7217 It is valid if it is nil or a symbol defined as a coding system by the
7218 function `define-coding-system'.  */)
7219   (coding_system)
7220      Lisp_Object coding_system;
7221 {
7222   Lisp_Object define_form;
7223
7224   define_form = Fget (coding_system, Qcoding_system_define_form);
7225   if (! NILP (define_form))
7226     {
7227       Fput (coding_system, Qcoding_system_define_form, Qnil);
7228       safe_eval (define_form);
7229     }
7230   if (!NILP (Fcoding_system_p (coding_system)))
7231     return coding_system;
7232   xsignal1 (Qcoding_system_error, coding_system);
7233 }
7234
7235 \f
7236 /* Detect how the bytes at SRC of length SRC_BYTES are encoded.  If
7237    HIGHEST is nonzero, return the coding system of the highest
7238    priority among the detected coding systems.  Otherwize return a
7239    list of detected coding systems sorted by their priorities.  If
7240    MULTIBYTEP is nonzero, it is assumed that the bytes are in correct
7241    multibyte form but contains only ASCII and eight-bit chars.
7242    Otherwise, the bytes are raw bytes.
7243
7244    CODING-SYSTEM controls the detection as below:
7245
7246    If it is nil, detect both text-format and eol-format.  If the
7247    text-format part of CODING-SYSTEM is already specified
7248    (e.g. `iso-latin-1'), detect only eol-format.  If the eol-format
7249    part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
7250    detect only text-format.  */
7251
7252 Lisp_Object
7253 detect_coding_system (src, src_chars, src_bytes, highest, multibytep,
7254                       coding_system)
7255      const unsigned char *src;
7256      int src_chars, src_bytes, highest;
7257      int multibytep;
7258      Lisp_Object coding_system;
7259 {
7260   const unsigned char *src_end = src + src_bytes;
7261   Lisp_Object attrs, eol_type;
7262   Lisp_Object val;
7263   struct coding_system coding;
7264   int id;
7265   struct coding_detection_info detect_info;
7266   enum coding_category base_category;
7267
7268   if (NILP (coding_system))
7269     coding_system = Qundecided;
7270   setup_coding_system (coding_system, &coding);
7271   attrs = CODING_ID_ATTRS (coding.id);
7272   eol_type = CODING_ID_EOL_TYPE (coding.id);
7273   coding_system = CODING_ATTR_BASE_NAME (attrs);
7274
7275   coding.source = src;
7276   coding.src_chars = src_chars;
7277   coding.src_bytes = src_bytes;
7278   coding.src_multibyte = multibytep;
7279   coding.consumed = 0;
7280   coding.mode |= CODING_MODE_LAST_BLOCK;
7281
7282   detect_info.checked = detect_info.found = detect_info.rejected = 0;
7283
7284   /* At first, detect text-format if necessary.  */
7285   base_category = XINT (CODING_ATTR_CATEGORY (attrs));
7286   if (base_category == coding_category_undecided)
7287     {
7288       enum coding_category category;
7289       struct coding_system *this;
7290       int c, i;
7291
7292       /* Skip all ASCII bytes except for a few ISO2022 controls.  */
7293       for (i = 0; src < src_end; i++, src++)
7294         {
7295           c = *src;
7296           if (c & 0x80)
7297             break;
7298           if (c < 0x20
7299               && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
7300               && ! inhibit_iso_escape_detection)
7301             {
7302               coding.head_ascii = src - coding.source;
7303               if (detect_coding_iso_2022 (&coding, &detect_info))
7304                 {
7305                   /* We have scanned the whole data.  */
7306                   if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
7307                     /* We didn't find an 8-bit code.  */
7308                     src = src_end;
7309                   break;
7310                 }
7311             }
7312         }
7313       coding.head_ascii = src - coding.source;
7314
7315       if (src < src_end
7316           || detect_info.found)
7317         {
7318           if (src == src_end)
7319             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
7320             for (i = 0; i < coding_category_raw_text; i++)
7321               {
7322                 category = coding_priorities[i];
7323                 this = coding_categories + category;
7324                 if (detect_info.found & (1 << category))
7325                   break;
7326               }
7327           else
7328             for (i = 0; i < coding_category_raw_text; i++)
7329               {
7330                 category = coding_priorities[i];
7331                 this = coding_categories + category;
7332
7333                 if (this->id < 0)
7334                   {
7335                     /* No coding system of this category is defined.  */
7336                     detect_info.rejected |= (1 << category);
7337                   }
7338                 else if (category >= coding_category_raw_text)
7339                   continue;
7340                 else if (detect_info.checked & (1 << category))
7341                   {
7342                     if (highest
7343                         && (detect_info.found & (1 << category)))
7344                       break;
7345                   }
7346                 else
7347                   {
7348                     if ((*(this->detector)) (&coding, &detect_info)
7349                         && highest
7350                         && (detect_info.found & (1 << category)))
7351                       {
7352                         if (category == coding_category_utf_16_auto)
7353                           {
7354                             if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
7355                               category = coding_category_utf_16_le;
7356                             else
7357                               category = coding_category_utf_16_be;
7358                           }
7359                         break;
7360                       }
7361                   }
7362               }
7363         }
7364
7365       if (detect_info.rejected == CATEGORY_MASK_ANY)
7366         {
7367           detect_info.found = CATEGORY_MASK_RAW_TEXT;
7368           id = coding_categories[coding_category_raw_text].id;
7369           val = Fcons (make_number (id), Qnil);
7370         }
7371       else if (! detect_info.rejected && ! detect_info.found)
7372         {
7373           detect_info.found = CATEGORY_MASK_ANY;
7374           id = coding_categories[coding_category_undecided].id;
7375           val = Fcons (make_number (id), Qnil);
7376         }
7377       else if (highest)
7378         {
7379           if (detect_info.found)
7380             {
7381               detect_info.found = 1 << category;
7382               val = Fcons (make_number (this->id), Qnil);
7383             }
7384           else
7385             for (i = 0; i < coding_category_raw_text; i++)
7386               if (! (detect_info.rejected & (1 << coding_priorities[i])))
7387                 {
7388                   detect_info.found = 1 << coding_priorities[i];
7389                   id = coding_categories[coding_priorities[i]].id;
7390                   val = Fcons (make_number (id), Qnil);
7391                   break;
7392                 }
7393         }
7394       else
7395         {
7396           int mask = detect_info.rejected | detect_info.found;
7397           int found = 0;
7398           val = Qnil;
7399
7400           for (i = coding_category_raw_text - 1; i >= 0; i--)
7401             {
7402               category = coding_priorities[i];
7403               if (! (mask & (1 << category)))
7404                 {
7405                   found |= 1 << category;
7406                   id = coding_categories[category].id;
7407                   if (id >= 0)
7408                     val = Fcons (make_number (id), val);
7409                 }
7410             }
7411           for (i = coding_category_raw_text - 1; i >= 0; i--)
7412             {
7413               category = coding_priorities[i];
7414               if (detect_info.found & (1 << category))
7415                 {
7416                   id = coding_categories[category].id;
7417                   val = Fcons (make_number (id), val);
7418                 }
7419             }
7420           detect_info.found |= found;
7421         }
7422     }
7423   else if (base_category == coding_category_utf_16_auto)
7424     {
7425       if (detect_coding_utf_16 (&coding, &detect_info))
7426         {
7427           struct coding_system *this;
7428
7429           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
7430             this = coding_categories + coding_category_utf_16_le;
7431           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
7432             this = coding_categories + coding_category_utf_16_be;
7433           else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
7434             this = coding_categories + coding_category_utf_16_be_nosig;
7435           else
7436             this = coding_categories + coding_category_utf_16_le_nosig;
7437           val = Fcons (make_number (this->id), Qnil);
7438         }
7439     }
7440   else
7441     {
7442       detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
7443       val = Fcons (make_number (coding.id), Qnil);
7444     }
7445
7446   /* Then, detect eol-format if necessary.  */
7447   {
7448     int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol;
7449     Lisp_Object tail;
7450
7451     if (VECTORP (eol_type))
7452       {
7453         if (detect_info.found & ~CATEGORY_MASK_UTF_16)
7454           normal_eol = detect_eol (coding.source, src_bytes,
7455                                    coding_category_raw_text);
7456         if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
7457                                  | CATEGORY_MASK_UTF_16_BE_NOSIG))
7458           utf_16_be_eol = detect_eol (coding.source, src_bytes,
7459                                       coding_category_utf_16_be);
7460         if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
7461                                  | CATEGORY_MASK_UTF_16_LE_NOSIG))
7462           utf_16_le_eol = detect_eol (coding.source, src_bytes,
7463                                       coding_category_utf_16_le);
7464       }
7465     else
7466       {
7467         if (EQ (eol_type, Qunix))
7468           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
7469         else if (EQ (eol_type, Qdos))
7470           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
7471         else
7472           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
7473       }
7474
7475     for (tail = val; CONSP (tail); tail = XCDR (tail))
7476       {
7477         enum coding_category category;
7478         int this_eol;
7479
7480         id = XINT (XCAR (tail));
7481         attrs = CODING_ID_ATTRS (id);
7482         category = XINT (CODING_ATTR_CATEGORY (attrs));
7483         eol_type = CODING_ID_EOL_TYPE (id);
7484         if (VECTORP (eol_type))
7485           {
7486             if (category == coding_category_utf_16_be
7487                 || category == coding_category_utf_16_be_nosig)
7488               this_eol = utf_16_be_eol;
7489             else if (category == coding_category_utf_16_le
7490                      || category == coding_category_utf_16_le_nosig)
7491               this_eol = utf_16_le_eol;
7492             else
7493               this_eol = normal_eol;
7494
7495             if (this_eol == EOL_SEEN_LF)
7496               XSETCAR (tail, AREF (eol_type, 0));
7497             else if (this_eol == EOL_SEEN_CRLF)
7498               XSETCAR (tail, AREF (eol_type, 1));
7499             else if (this_eol == EOL_SEEN_CR)
7500               XSETCAR (tail, AREF (eol_type, 2));
7501             else
7502               XSETCAR (tail, CODING_ID_NAME (id));
7503           }
7504         else
7505           XSETCAR (tail, CODING_ID_NAME (id));
7506       }
7507   }
7508
7509   return (highest ? XCAR (val) : val);
7510 }
7511
7512
7513 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
7514        2, 3, 0,
7515        doc: /* Detect coding system of the text in the region between START and END.
7516 Return a list of possible coding systems ordered by priority.
7517
7518 If only ASCII characters are found (except for such ISO-2022 control
7519 characters ISO-2022 as ESC), it returns a list of single element
7520 `undecided' or its subsidiary coding system according to a detected
7521 end-of-line format.
7522
7523 If optional argument HIGHEST is non-nil, return the coding system of
7524 highest priority.  */)
7525      (start, end, highest)
7526      Lisp_Object start, end, highest;
7527 {
7528   int from, to;
7529   int from_byte, to_byte;
7530
7531   CHECK_NUMBER_COERCE_MARKER (start);
7532   CHECK_NUMBER_COERCE_MARKER (end);
7533
7534   validate_region (&start, &end);
7535   from = XINT (start), to = XINT (end);
7536   from_byte = CHAR_TO_BYTE (from);
7537   to_byte = CHAR_TO_BYTE (to);
7538
7539   if (from < GPT && to >= GPT)
7540     move_gap_both (to, to_byte);
7541
7542   return detect_coding_system (BYTE_POS_ADDR (from_byte),
7543                                to - from, to_byte - from_byte,
7544                                !NILP (highest),
7545                                !NILP (current_buffer
7546                                       ->enable_multibyte_characters),
7547                                Qnil);
7548 }
7549
7550 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
7551        1, 2, 0,
7552        doc: /* Detect coding system of the text in STRING.
7553 Return a list of possible coding systems ordered by priority.
7554
7555 If only ASCII characters are found (except for such ISO-2022 control
7556 characters ISO-2022 as ESC), it returns a list of single element
7557 `undecided' or its subsidiary coding system according to a detected
7558 end-of-line format.
7559
7560 If optional argument HIGHEST is non-nil, return the coding system of
7561 highest priority.  */)
7562      (string, highest)
7563      Lisp_Object string, highest;
7564 {
7565   CHECK_STRING (string);
7566
7567   return detect_coding_system (SDATA (string),
7568                                SCHARS (string), SBYTES (string),
7569                                !NILP (highest), STRING_MULTIBYTE (string),
7570                                Qnil);
7571 }
7572
7573
7574 static INLINE int
7575 char_encodable_p (c, attrs)
7576      int c;
7577      Lisp_Object attrs;
7578 {
7579   Lisp_Object tail;
7580   struct charset *charset;
7581   Lisp_Object translation_table;
7582
7583   translation_table = CODING_ATTR_TRANS_TBL (attrs);
7584   if (! NILP (translation_table))
7585     c = translate_char (translation_table, c);
7586   for (tail = CODING_ATTR_CHARSET_LIST (attrs);
7587        CONSP (tail); tail = XCDR (tail))
7588     {
7589       charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
7590       if (CHAR_CHARSET_P (c, charset))
7591         break;
7592     }
7593   return (! NILP (tail));
7594 }
7595
7596
7597 /* Return a list of coding systems that safely encode the text between
7598    START and END.  If EXCLUDE is non-nil, it is a list of coding
7599    systems not to check.  The returned list doesn't contain any such
7600    coding systems.  In any case, if the text contains only ASCII or is
7601    unibyte, return t.  */
7602
7603 DEFUN ("find-coding-systems-region-internal",
7604        Ffind_coding_systems_region_internal,
7605        Sfind_coding_systems_region_internal, 2, 3, 0,
7606        doc: /* Internal use only.  */)
7607      (start, end, exclude)
7608      Lisp_Object start, end, exclude;
7609 {
7610   Lisp_Object coding_attrs_list, safe_codings;
7611   EMACS_INT start_byte, end_byte;
7612   const unsigned char *p, *pbeg, *pend;
7613   int c;
7614   Lisp_Object tail, elt;
7615
7616   if (STRINGP (start))
7617     {
7618       if (!STRING_MULTIBYTE (start)
7619           || SCHARS (start) == SBYTES (start))
7620         return Qt;
7621       start_byte = 0;
7622       end_byte = SBYTES (start);
7623     }
7624   else
7625     {
7626       CHECK_NUMBER_COERCE_MARKER (start);
7627       CHECK_NUMBER_COERCE_MARKER (end);
7628       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
7629         args_out_of_range (start, end);
7630       if (NILP (current_buffer->enable_multibyte_characters))
7631         return Qt;
7632       start_byte = CHAR_TO_BYTE (XINT (start));
7633       end_byte = CHAR_TO_BYTE (XINT (end));
7634       if (XINT (end) - XINT (start) == end_byte - start_byte)
7635         return Qt;
7636
7637       if (XINT (start) < GPT && XINT (end) > GPT)
7638         {
7639           if ((GPT - XINT (start)) < (XINT (end) - GPT))
7640             move_gap_both (XINT (start), start_byte);
7641           else
7642             move_gap_both (XINT (end), end_byte);
7643         }
7644     }
7645
7646   coding_attrs_list = Qnil;
7647   for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
7648     if (NILP (exclude)
7649         || NILP (Fmemq (XCAR (tail), exclude)))
7650       {
7651         Lisp_Object attrs;
7652
7653         attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
7654         if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs))
7655             && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
7656           {
7657             ASET (attrs, coding_attr_trans_tbl,
7658                   get_translation_table (attrs, 1, NULL));
7659             coding_attrs_list = Fcons (attrs, coding_attrs_list);
7660           }
7661       }
7662
7663   if (STRINGP (start))
7664     p = pbeg = SDATA (start);
7665   else
7666     p = pbeg = BYTE_POS_ADDR (start_byte);
7667   pend = p + (end_byte - start_byte);
7668
7669   while (p < pend && ASCII_BYTE_P (*p)) p++;
7670   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
7671
7672   while (p < pend)
7673     {
7674       if (ASCII_BYTE_P (*p))
7675         p++;
7676       else
7677         {
7678           c = STRING_CHAR_ADVANCE (p);
7679
7680           charset_map_loaded = 0;
7681           for (tail = coding_attrs_list; CONSP (tail);)
7682             {
7683               elt = XCAR (tail);
7684               if (NILP (elt))
7685                 tail = XCDR (tail);
7686               else if (char_encodable_p (c, elt))
7687                 tail = XCDR (tail);
7688               else if (CONSP (XCDR (tail)))
7689                 {
7690                   XSETCAR (tail, XCAR (XCDR (tail)));
7691                   XSETCDR (tail, XCDR (XCDR (tail)));
7692                 }
7693               else
7694                 {
7695                   XSETCAR (tail, Qnil);
7696                   tail = XCDR (tail);
7697                 }
7698             }
7699           if (charset_map_loaded)
7700             {
7701               EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
7702
7703               if (STRINGP (start))
7704                 pbeg = SDATA (start);
7705               else
7706                 pbeg = BYTE_POS_ADDR (start_byte);
7707               p = pbeg + p_offset;
7708               pend = pbeg + pend_offset;
7709             }
7710         }
7711     }
7712
7713   safe_codings = list2 (Qraw_text, Qno_conversion);
7714   for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
7715     if (! NILP (XCAR (tail)))
7716       safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
7717
7718   return safe_codings;
7719 }
7720
7721
7722 DEFUN ("unencodable-char-position", Funencodable_char_position,
7723        Sunencodable_char_position, 3, 5, 0,
7724        doc: /*
7725 Return position of first un-encodable character in a region.
7726 START and END specfiy the region and CODING-SYSTEM specifies the
7727 encoding to check.  Return nil if CODING-SYSTEM does encode the region.
7728
7729 If optional 4th argument COUNT is non-nil, it specifies at most how
7730 many un-encodable characters to search.  In this case, the value is a
7731 list of positions.
7732
7733 If optional 5th argument STRING is non-nil, it is a string to search
7734 for un-encodable characters.  In that case, START and END are indexes
7735 to the string.  */)
7736      (start, end, coding_system, count, string)
7737      Lisp_Object start, end, coding_system, count, string;
7738 {
7739   int n;
7740   struct coding_system coding;
7741   Lisp_Object attrs, charset_list, translation_table;
7742   Lisp_Object positions;
7743   int from, to;
7744   const unsigned char *p, *stop, *pend;
7745   int ascii_compatible;
7746
7747   setup_coding_system (Fcheck_coding_system (coding_system), &coding);
7748   attrs = CODING_ID_ATTRS (coding.id);
7749   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
7750     return Qnil;
7751   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
7752   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
7753   translation_table = get_translation_table (attrs, 1, NULL);
7754
7755   if (NILP (string))
7756     {
7757       validate_region (&start, &end);
7758       from = XINT (start);
7759       to = XINT (end);
7760       if (NILP (current_buffer->enable_multibyte_characters)
7761           || (ascii_compatible
7762               && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
7763         return Qnil;
7764       p = CHAR_POS_ADDR (from);
7765       pend = CHAR_POS_ADDR (to);
7766       if (from < GPT && to >= GPT)
7767         stop = GPT_ADDR;
7768       else
7769         stop = pend;
7770     }
7771   else
7772     {
7773       CHECK_STRING (string);
7774       CHECK_NATNUM (start);
7775       CHECK_NATNUM (end);
7776       from = XINT (start);
7777       to = XINT (end);
7778       if (from > to
7779           || to > SCHARS (string))
7780         args_out_of_range_3 (string, start, end);
7781       if (! STRING_MULTIBYTE (string))
7782         return Qnil;
7783       p = SDATA (string) + string_char_to_byte (string, from);
7784       stop = pend = SDATA (string) + string_char_to_byte (string, to);
7785       if (ascii_compatible && (to - from) == (pend - p))
7786         return Qnil;
7787     }
7788
7789   if (NILP (count))
7790     n = 1;
7791   else
7792     {
7793       CHECK_NATNUM (count);
7794       n = XINT (count);
7795     }
7796
7797   positions = Qnil;
7798   while (1)
7799     {
7800       int c;
7801
7802       if (ascii_compatible)
7803         while (p < stop && ASCII_BYTE_P (*p))
7804           p++, from++;
7805       if (p >= stop)
7806         {
7807           if (p >= pend)
7808             break;
7809           stop = pend;
7810           p = GAP_END_ADDR;
7811         }
7812
7813       c = STRING_CHAR_ADVANCE (p);
7814       if (! (ASCII_CHAR_P (c) && ascii_compatible)
7815           && ! char_charset (translate_char (translation_table, c),
7816                              charset_list, NULL))
7817         {
7818           positions = Fcons (make_number (from), positions);
7819           n--;
7820           if (n == 0)
7821             break;
7822         }
7823
7824       from++;
7825     }
7826
7827   return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
7828 }
7829
7830
7831 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
7832        Scheck_coding_systems_region, 3, 3, 0,
7833        doc: /* Check if the region is encodable by coding systems.
7834
7835 START and END are buffer positions specifying the region.
7836 CODING-SYSTEM-LIST is a list of coding systems to check.
7837
7838 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
7839 CODING-SYSTEM is a member of CODING-SYSTEM-LIst and can't encode the
7840 whole region, POS0, POS1, ... are buffer positions where non-encodable
7841 characters are found.
7842
7843 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
7844 value is nil.
7845
7846 START may be a string.  In that case, check if the string is
7847 encodable, and the value contains indices to the string instead of
7848 buffer positions.  END is ignored.  */)
7849      (start, end, coding_system_list)
7850      Lisp_Object start, end, coding_system_list;
7851 {
7852   Lisp_Object list;
7853   EMACS_INT start_byte, end_byte;
7854   int pos;
7855   const unsigned char *p, *pbeg, *pend;
7856   int c;
7857   Lisp_Object tail, elt, attrs;
7858
7859   if (STRINGP (start))
7860     {
7861       if (!STRING_MULTIBYTE (start)
7862           && SCHARS (start) != SBYTES (start))
7863         return Qnil;
7864       start_byte = 0;
7865       end_byte = SBYTES (start);
7866       pos = 0;
7867     }
7868   else
7869     {
7870       CHECK_NUMBER_COERCE_MARKER (start);
7871       CHECK_NUMBER_COERCE_MARKER (end);
7872       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
7873         args_out_of_range (start, end);
7874       if (NILP (current_buffer->enable_multibyte_characters))
7875         return Qnil;
7876       start_byte = CHAR_TO_BYTE (XINT (start));
7877       end_byte = CHAR_TO_BYTE (XINT (end));
7878       if (XINT (end) - XINT (start) == end_byte - start_byte)
7879         return Qt;
7880
7881       if (XINT (start) < GPT && XINT (end) > GPT)
7882         {
7883           if ((GPT - XINT (start)) < (XINT (end) - GPT))
7884             move_gap_both (XINT (start), start_byte);
7885           else
7886             move_gap_both (XINT (end), end_byte);
7887         }
7888       pos = XINT (start);
7889     }
7890
7891   list = Qnil;
7892   for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
7893     {
7894       elt = XCAR (tail);
7895       attrs = AREF (CODING_SYSTEM_SPEC (elt), 0);
7896       ASET (attrs, coding_attr_trans_tbl,
7897             get_translation_table (attrs, 1, NULL));
7898       list = Fcons (Fcons (elt, Fcons (attrs, Qnil)), list);
7899     }
7900
7901   if (STRINGP (start))
7902     p = pbeg = SDATA (start);
7903   else
7904     p = pbeg = BYTE_POS_ADDR (start_byte);
7905   pend = p + (end_byte - start_byte);
7906
7907   while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
7908   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
7909
7910   while (p < pend)
7911     {
7912       if (ASCII_BYTE_P (*p))
7913         p++;
7914       else
7915         {
7916           c = STRING_CHAR_ADVANCE (p);
7917
7918           charset_map_loaded = 0;
7919           for (tail = list; CONSP (tail); tail = XCDR (tail))
7920             {
7921               elt = XCDR (XCAR (tail));
7922               if (! char_encodable_p (c, XCAR (elt)))
7923                 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
7924             }
7925           if (charset_map_loaded)
7926             {
7927               EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
7928
7929               if (STRINGP (start))
7930                 pbeg = SDATA (start);
7931               else
7932                 pbeg = BYTE_POS_ADDR (start_byte);
7933               p = pbeg + p_offset;
7934               pend = pbeg + pend_offset;
7935             }
7936         }
7937       pos++;
7938     }
7939
7940   tail = list;
7941   list = Qnil;
7942   for (; CONSP (tail); tail = XCDR (tail))
7943     {
7944       elt = XCAR (tail);
7945       if (CONSP (XCDR (XCDR (elt))))
7946         list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
7947                       list);
7948     }
7949
7950   return list;
7951 }
7952
7953
7954 Lisp_Object
7955 code_convert_region (start, end, coding_system, dst_object, encodep, norecord)
7956      Lisp_Object start, end, coding_system, dst_object;
7957      int encodep, norecord;
7958 {
7959   struct coding_system coding;
7960   EMACS_INT from, from_byte, to, to_byte;
7961   Lisp_Object src_object;
7962
7963   CHECK_NUMBER_COERCE_MARKER (start);
7964   CHECK_NUMBER_COERCE_MARKER (end);
7965   if (NILP (coding_system))
7966     coding_system = Qno_conversion;
7967   else
7968     CHECK_CODING_SYSTEM (coding_system);
7969   src_object = Fcurrent_buffer ();
7970   if (NILP (dst_object))
7971     dst_object = src_object;
7972   else if (! EQ (dst_object, Qt))
7973     CHECK_BUFFER (dst_object);
7974
7975   validate_region (&start, &end);
7976   from = XFASTINT (start);
7977   from_byte = CHAR_TO_BYTE (from);
7978   to = XFASTINT (end);
7979   to_byte = CHAR_TO_BYTE (to);
7980
7981   setup_coding_system (coding_system, &coding);
7982   coding.mode |= CODING_MODE_LAST_BLOCK;
7983
7984   if (encodep)
7985     encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
7986                           dst_object);
7987   else
7988     decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
7989                           dst_object);
7990   if (! norecord)
7991     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
7992
7993   return (BUFFERP (dst_object)
7994           ? make_number (coding.produced_char)
7995           : coding.dst_object);
7996 }
7997
7998
7999 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
8000        3, 4, "r\nzCoding system: ",
8001        doc: /* Decode the current region from the specified coding system.
8002 When called from a program, takes four arguments:
8003         START, END, CODING-SYSTEM, and DESTINATION.
8004 START and END are buffer positions.
8005
8006 Optional 4th arguments DESTINATION specifies where the decoded text goes.
8007 If nil, the region between START and END is replace by the decoded text.
8008 If buffer, the decoded text is inserted in the buffer.
8009 If t, the decoded text is returned.
8010
8011 This function sets `last-coding-system-used' to the precise coding system
8012 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
8013 not fully specified.)
8014 It returns the length of the decoded text.  */)
8015      (start, end, coding_system, destination)
8016      Lisp_Object start, end, coding_system, destination;
8017 {
8018   return code_convert_region (start, end, coding_system, destination, 0, 0);
8019 }
8020
8021 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
8022        3, 4, "r\nzCoding system: ",
8023        doc: /* Encode the current region by specified coding system.
8024 When called from a program, takes three arguments:
8025 START, END, and CODING-SYSTEM.  START and END are buffer positions.
8026
8027 Optional 4th arguments DESTINATION specifies where the encoded text goes.
8028 If nil, the region between START and END is replace by the encoded text.
8029 If buffer, the encoded text is inserted in the buffer.
8030 If t, the encoded text is returned.
8031
8032 This function sets `last-coding-system-used' to the precise coding system
8033 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
8034 not fully specified.)
8035 It returns the length of the encoded text.  */)
8036   (start, end, coding_system, destination)
8037      Lisp_Object start, end, coding_system, destination;
8038 {
8039   return code_convert_region (start, end, coding_system, destination, 1, 0);
8040 }
8041
8042 Lisp_Object
8043 code_convert_string (string, coding_system, dst_object,
8044                      encodep, nocopy, norecord)
8045      Lisp_Object string, coding_system, dst_object;
8046      int encodep, nocopy, norecord;
8047 {
8048   struct coding_system coding;
8049   EMACS_INT chars, bytes;
8050
8051   CHECK_STRING (string);
8052   if (NILP (coding_system))
8053     {
8054       if (! norecord)
8055         Vlast_coding_system_used = Qno_conversion;
8056       if (NILP (dst_object))
8057         return (nocopy ? Fcopy_sequence (string) : string);
8058     }
8059
8060   if (NILP (coding_system))
8061     coding_system = Qno_conversion;
8062   else
8063     CHECK_CODING_SYSTEM (coding_system);
8064   if (NILP (dst_object))
8065     dst_object = Qt;
8066   else if (! EQ (dst_object, Qt))
8067     CHECK_BUFFER (dst_object);
8068
8069   setup_coding_system (coding_system, &coding);
8070   coding.mode |= CODING_MODE_LAST_BLOCK;
8071   chars = SCHARS (string);
8072   bytes = SBYTES (string);
8073   if (encodep)
8074     encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
8075   else
8076     decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
8077   if (! norecord)
8078     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
8079
8080   return (BUFFERP (dst_object)
8081           ? make_number (coding.produced_char)
8082           : coding.dst_object);
8083 }
8084
8085
8086 /* Encode or decode STRING according to CODING_SYSTEM.
8087    Do not set Vlast_coding_system_used.
8088
8089    This function is called only from macros DECODE_FILE and
8090    ENCODE_FILE, thus we ignore character composition.  */
8091
8092 Lisp_Object
8093 code_convert_string_norecord (string, coding_system, encodep)
8094      Lisp_Object string, coding_system;
8095      int encodep;
8096 {
8097   return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
8098 }
8099
8100
8101 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
8102        2, 4, 0,
8103        doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
8104
8105 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
8106 if the decoding operation is trivial.
8107
8108 Optional fourth arg BUFFER non-nil meant that the decoded text is
8109 inserted in BUFFER instead of returned as a string.  In this case,
8110 the return value is BUFFER.
8111
8112 This function sets `last-coding-system-used' to the precise coding system
8113 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
8114 not fully specified.  */)
8115   (string, coding_system, nocopy, buffer)
8116      Lisp_Object string, coding_system, nocopy, buffer;
8117 {
8118   return code_convert_string (string, coding_system, buffer,
8119                               0, ! NILP (nocopy), 0);
8120 }
8121
8122 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
8123        2, 4, 0,
8124        doc: /* Encode STRING to CODING-SYSTEM, and return the result.
8125
8126 Optional third arg NOCOPY non-nil means it is OK to return STRING
8127 itself if the encoding operation is trivial.
8128
8129 Optional fourth arg BUFFER non-nil meant that the encoded text is
8130 inserted in BUFFER instead of returned as a string.  In this case,
8131 the return value is BUFFER.
8132
8133 This function sets `last-coding-system-used' to the precise coding system
8134 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
8135 not fully specified.)  */)
8136      (string, coding_system, nocopy, buffer)
8137      Lisp_Object string, coding_system, nocopy, buffer;
8138 {
8139   return code_convert_string (string, coding_system, buffer,
8140                               1, ! NILP (nocopy), 1);
8141 }
8142
8143 \f
8144 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
8145        doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
8146 Return the corresponding character.  */)
8147      (code)
8148      Lisp_Object code;
8149 {
8150   Lisp_Object spec, attrs, val;
8151   struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
8152   int c;
8153
8154   CHECK_NATNUM (code);
8155   c = XFASTINT (code);
8156   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
8157   attrs = AREF (spec, 0);
8158
8159   if (ASCII_BYTE_P (c)
8160       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
8161     return code;
8162
8163   val = CODING_ATTR_CHARSET_LIST (attrs);
8164   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
8165   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
8166   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
8167
8168   if (c <= 0x7F)
8169     charset = charset_roman;
8170   else if (c >= 0xA0 && c < 0xDF)
8171     {
8172       charset = charset_kana;
8173       c -= 0x80;
8174     }
8175   else
8176     {
8177       int s1 = c >> 8, s2 = c & 0xFF;
8178
8179       if (s1 < 0x81 || (s1 > 0x9F && s1 < 0xE0) || s1 > 0xEF
8180           || s2 < 0x40 || s2 == 0x7F || s2 > 0xFC)
8181         error ("Invalid code: %d", code);
8182       SJIS_TO_JIS (c);
8183       charset = charset_kanji;
8184     }
8185   c = DECODE_CHAR (charset, c);
8186   if (c < 0)
8187     error ("Invalid code: %d", code);
8188   return make_number (c);
8189 }
8190
8191
8192 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
8193        doc: /* Encode a Japanese character CH to shift_jis encoding.
8194 Return the corresponding code in SJIS.  */)
8195      (ch)
8196     Lisp_Object ch;
8197 {
8198   Lisp_Object spec, attrs, charset_list;
8199   int c;
8200   struct charset *charset;
8201   unsigned code;
8202
8203   CHECK_CHARACTER (ch);
8204   c = XFASTINT (ch);
8205   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
8206   attrs = AREF (spec, 0);
8207
8208   if (ASCII_CHAR_P (c)
8209       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
8210     return ch;
8211
8212   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
8213   charset = char_charset (c, charset_list, &code);
8214   if (code == CHARSET_INVALID_CODE (charset))
8215     error ("Can't encode by shift_jis encoding: %d", c);
8216   JIS_TO_SJIS (code);
8217
8218   return make_number (code);
8219 }
8220
8221 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
8222        doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
8223 Return the corresponding character.  */)
8224      (code)
8225      Lisp_Object code;
8226 {
8227   Lisp_Object spec, attrs, val;
8228   struct charset *charset_roman, *charset_big5, *charset;
8229   int c;
8230
8231   CHECK_NATNUM (code);
8232   c = XFASTINT (code);
8233   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
8234   attrs = AREF (spec, 0);
8235
8236   if (ASCII_BYTE_P (c)
8237       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
8238     return code;
8239
8240   val = CODING_ATTR_CHARSET_LIST (attrs);
8241   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
8242   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
8243
8244   if (c <= 0x7F)
8245     charset = charset_roman;
8246   else
8247     {
8248       int b1 = c >> 8, b2 = c & 0x7F;
8249       if (b1 < 0xA1 || b1 > 0xFE
8250           || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
8251         error ("Invalid code: %d", code);
8252       charset = charset_big5;
8253     }
8254   c = DECODE_CHAR (charset, (unsigned )c);
8255   if (c < 0)
8256     error ("Invalid code: %d", code);
8257   return make_number (c);
8258 }
8259
8260 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
8261        doc: /* Encode the Big5 character CH to BIG5 coding system.
8262 Return the corresponding character code in Big5.  */)
8263      (ch)
8264      Lisp_Object ch;
8265 {
8266   Lisp_Object spec, attrs, charset_list;
8267   struct charset *charset;
8268   int c;
8269   unsigned code;
8270
8271   CHECK_CHARACTER (ch);
8272   c = XFASTINT (ch);
8273   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
8274   attrs = AREF (spec, 0);
8275   if (ASCII_CHAR_P (c)
8276       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
8277     return ch;
8278
8279   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
8280   charset = char_charset (c, charset_list, &code);
8281   if (code == CHARSET_INVALID_CODE (charset))
8282     error ("Can't encode by Big5 encoding: %d", c);
8283
8284   return make_number (code);
8285 }
8286
8287 \f
8288 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
8289        Sset_terminal_coding_system_internal, 1, 2, 0,
8290        doc: /* Internal use only.  */)
8291      (coding_system, terminal)
8292      Lisp_Object coding_system;
8293      Lisp_Object terminal;
8294 {
8295   struct coding_system *terminal_coding = TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1));
8296   CHECK_SYMBOL (coding_system);
8297   setup_coding_system (Fcheck_coding_system (coding_system), terminal_coding);
8298   /* We had better not send unsafe characters to terminal.  */
8299   terminal_coding->mode |= CODING_MODE_SAFE_ENCODING;
8300   /* Characer composition should be disabled.  */
8301   terminal_coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
8302   terminal_coding->src_multibyte = 1;
8303   terminal_coding->dst_multibyte = 0;
8304   return Qnil;
8305 }
8306
8307 DEFUN ("set-safe-terminal-coding-system-internal",
8308        Fset_safe_terminal_coding_system_internal,
8309        Sset_safe_terminal_coding_system_internal, 1, 1, 0,
8310        doc: /* Internal use only.  */)
8311      (coding_system)
8312      Lisp_Object coding_system;
8313 {
8314   CHECK_SYMBOL (coding_system);
8315   setup_coding_system (Fcheck_coding_system (coding_system),
8316                        &safe_terminal_coding);
8317   /* Characer composition should be disabled.  */
8318   safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
8319   safe_terminal_coding.src_multibyte = 1;
8320   safe_terminal_coding.dst_multibyte = 0;
8321   return Qnil;
8322 }
8323
8324 DEFUN ("terminal-coding-system", Fterminal_coding_system,
8325        Sterminal_coding_system, 0, 1, 0,
8326        doc: /* Return coding system specified for terminal output on the given terminal.
8327 TERMINAL may be a terminal id, a frame, or nil for the selected
8328 frame's terminal device.  */)
8329      (terminal)
8330      Lisp_Object terminal;
8331 {
8332   Lisp_Object coding_system;
8333
8334   coding_system = TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1))->symbol;
8335   /* For backward compatibility, return nil if it is `undecided'. */
8336   return (! EQ (coding_system, Qundecided) ? coding_system : Qnil);
8337 }
8338
8339 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
8340        Sset_keyboard_coding_system_internal, 1, 2, 0,
8341        doc: /* Internal use only.  */)
8342      (coding_system, terminal)
8343      Lisp_Object coding_system;
8344      Lisp_Object terminal;
8345 {
8346   struct terminal *t = get_terminal (terminal, 1);
8347   CHECK_SYMBOL (coding_system);
8348   setup_coding_system (Fcheck_coding_system (coding_system),
8349                        TERMINAL_KEYBOARD_CODING (t));
8350   /* Characer composition should be disabled.  */
8351   TERMINAL_KEYBOARD_CODING (t)->common_flags
8352     &= ~CODING_ANNOTATE_COMPOSITION_MASK;
8353   return Qnil;
8354 }
8355
8356 DEFUN ("keyboard-coding-system",
8357        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
8358        doc: /* Return coding system specified for decoding keyboard input.  */)
8359      ()
8360 {
8361   return CODING_ID_NAME (keyboard_coding.id);
8362 }
8363
8364 \f
8365 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
8366        Sfind_operation_coding_system,  1, MANY, 0,
8367        doc: /* Choose a coding system for an operation based on the target name.
8368 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
8369 DECODING-SYSTEM is the coding system to use for decoding
8370 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
8371 for encoding (in case OPERATION does encoding).
8372
8373 The first argument OPERATION specifies an I/O primitive:
8374   For file I/O, `insert-file-contents' or `write-region'.
8375   For process I/O, `call-process', `call-process-region', or `start-process'.
8376   For network I/O, `open-network-stream'.
8377
8378 The remaining arguments should be the same arguments that were passed
8379 to the primitive.  Depending on which primitive, one of those arguments
8380 is selected as the TARGET.  For example, if OPERATION does file I/O,
8381 whichever argument specifies the file name is TARGET.
8382
8383 TARGET has a meaning which depends on OPERATION:
8384   For file I/O, TARGET is a file name (except for the special case below).
8385   For process I/O, TARGET is a process name.
8386   For network I/O, TARGET is a service name or a port number
8387
8388 This function looks up what specified for TARGET in,
8389 `file-coding-system-alist', `process-coding-system-alist',
8390 or `network-coding-system-alist' depending on OPERATION.
8391 They may specify a coding system, a cons of coding systems,
8392 or a function symbol to call.
8393 In the last case, we call the function with one argument,
8394 which is a list of all the arguments given to this function.
8395 If the function can't decide a coding system, it can return
8396 `undecided' so that the normal code-detection is performed.
8397
8398 If OPERATION is `insert-file-contents', the argument corresponding to
8399 TARGET may be a cons (FILENAME . BUFFER).  In that case, FILENAME is a
8400 file name to look up, and BUFFER is a buffer that contains the file's
8401 contents (not yet decoded).  If `file-coding-system-alist' specifies a
8402 function to call for FILENAME, that function should examine the
8403 contents of BUFFER instead of reading the file.
8404
8405 usage: (find-operation-coding-system OPERATION ARGUMENTS...)  */)
8406      (nargs, args)
8407      int nargs;
8408      Lisp_Object *args;
8409 {
8410   Lisp_Object operation, target_idx, target, val;
8411   register Lisp_Object chain;
8412
8413   if (nargs < 2)
8414     error ("Too few arguments");
8415   operation = args[0];
8416   if (!SYMBOLP (operation)
8417       || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
8418     error ("Invalid first arguement");
8419   if (nargs < 1 + XINT (target_idx))
8420     error ("Too few arguments for operation: %s",
8421            SDATA (SYMBOL_NAME (operation)));
8422   target = args[XINT (target_idx) + 1];
8423   if (!(STRINGP (target)
8424         || (EQ (operation, Qinsert_file_contents) && CONSP (target)
8425             && STRINGP (XCAR (target)) && BUFFERP (XCDR (target)))
8426         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
8427     error ("Invalid %dth argument", XINT (target_idx) + 1);
8428   if (CONSP (target))
8429     target = XCAR (target);
8430
8431   chain = ((EQ (operation, Qinsert_file_contents)
8432             || EQ (operation, Qwrite_region))
8433            ? Vfile_coding_system_alist
8434            : (EQ (operation, Qopen_network_stream)
8435               ? Vnetwork_coding_system_alist
8436               : Vprocess_coding_system_alist));
8437   if (NILP (chain))
8438     return Qnil;
8439
8440   for (; CONSP (chain); chain = XCDR (chain))
8441     {
8442       Lisp_Object elt;
8443
8444       elt = XCAR (chain);
8445       if (CONSP (elt)
8446           && ((STRINGP (target)
8447                && STRINGP (XCAR (elt))
8448                && fast_string_match (XCAR (elt), target) >= 0)
8449               || (INTEGERP (target) && EQ (target, XCAR (elt)))))
8450         {
8451           val = XCDR (elt);
8452           /* Here, if VAL is both a valid coding system and a valid
8453              function symbol, we return VAL as a coding system.  */
8454           if (CONSP (val))
8455             return val;
8456           if (! SYMBOLP (val))
8457             return Qnil;
8458           if (! NILP (Fcoding_system_p (val)))
8459             return Fcons (val, val);
8460           if (! NILP (Ffboundp (val)))
8461             {
8462               /* We use call1 rather than safe_call1
8463                  so as to get bug reports about functions called here
8464                  which don't handle the current interface.  */
8465               val = call1 (val, Flist (nargs, args));
8466               if (CONSP (val))
8467                 return val;
8468               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
8469                 return Fcons (val, val);
8470             }
8471           return Qnil;
8472         }
8473     }
8474   return Qnil;
8475 }
8476
8477 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
8478        Sset_coding_system_priority, 0, MANY, 0,
8479        doc: /* Assign higher priority to the coding systems given as arguments.
8480 If multiple coding systems belongs to the same category,
8481 all but the first one are ignored.
8482
8483 usage: (set-coding-system-priority ...)  */)
8484      (nargs, args)
8485      int nargs;
8486      Lisp_Object *args;
8487 {
8488   int i, j;
8489   int changed[coding_category_max];
8490   enum coding_category priorities[coding_category_max];
8491
8492   bzero (changed, sizeof changed);
8493
8494   for (i = j = 0; i < nargs; i++)
8495     {
8496       enum coding_category category;
8497       Lisp_Object spec, attrs;
8498
8499       CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
8500       attrs = AREF (spec, 0);
8501       category = XINT (CODING_ATTR_CATEGORY (attrs));
8502       if (changed[category])
8503         /* Ignore this coding system because a coding system of the
8504            same category already had a higher priority.  */
8505         continue;
8506       changed[category] = 1;
8507       priorities[j++] = category;
8508       if (coding_categories[category].id >= 0
8509           && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
8510         setup_coding_system (args[i], &coding_categories[category]);
8511       Fset (AREF (Vcoding_category_table, category), args[i]);
8512     }
8513
8514   /* Now we have decided top J priorities.  Reflect the order of the
8515      original priorities to the remaining priorities.  */
8516
8517   for (i = j, j = 0; i < coding_category_max; i++, j++)
8518     {
8519       while (j < coding_category_max
8520              && changed[coding_priorities[j]])
8521         j++;
8522       if (j == coding_category_max)
8523         abort ();
8524       priorities[i] = coding_priorities[j];
8525     }
8526
8527   bcopy (priorities, coding_priorities, sizeof priorities);
8528
8529   /* Update `coding-category-list'.  */
8530   Vcoding_category_list = Qnil;
8531   for (i = coding_category_max - 1; i >= 0; i--)
8532     Vcoding_category_list
8533       = Fcons (AREF (Vcoding_category_table, priorities[i]),
8534                Vcoding_category_list);
8535
8536   return Qnil;
8537 }
8538
8539 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
8540        Scoding_system_priority_list, 0, 1, 0,
8541        doc: /* Return a list of coding systems ordered by their priorities.
8542 HIGHESTP non-nil means just return the highest priority one.  */)
8543      (highestp)
8544      Lisp_Object highestp;
8545 {
8546   int i;
8547   Lisp_Object val;
8548
8549   for (i = 0, val = Qnil; i < coding_category_max; i++)
8550     {
8551       enum coding_category category = coding_priorities[i];
8552       int id = coding_categories[category].id;
8553       Lisp_Object attrs;
8554
8555       if (id < 0)
8556         continue;
8557       attrs = CODING_ID_ATTRS (id);
8558       if (! NILP (highestp))
8559         return CODING_ATTR_BASE_NAME (attrs);
8560       val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
8561     }
8562   return Fnreverse (val);
8563 }
8564
8565 static char *suffixes[] = { "-unix", "-dos", "-mac" };
8566
8567 static Lisp_Object
8568 make_subsidiaries (base)
8569      Lisp_Object base;
8570 {
8571   Lisp_Object subsidiaries;
8572   int base_name_len = SBYTES (SYMBOL_NAME (base));
8573   char *buf = (char *) alloca (base_name_len + 6);
8574   int i;
8575
8576   bcopy (SDATA (SYMBOL_NAME (base)), buf, base_name_len);
8577   subsidiaries = Fmake_vector (make_number (3), Qnil);
8578   for (i = 0; i < 3; i++)
8579     {
8580       bcopy (suffixes[i], buf + base_name_len, strlen (suffixes[i]) + 1);
8581       ASET (subsidiaries, i, intern (buf));
8582     }
8583   return subsidiaries;
8584 }
8585
8586
8587 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
8588        Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
8589        doc: /* For internal use only.
8590 usage: (define-coding-system-internal ...)  */)
8591      (nargs, args)
8592      int nargs;
8593      Lisp_Object *args;
8594 {
8595   Lisp_Object name;
8596   Lisp_Object spec_vec;         /* [ ATTRS ALIASE EOL_TYPE ] */
8597   Lisp_Object attrs;            /* Vector of attributes.  */
8598   Lisp_Object eol_type;
8599   Lisp_Object aliases;
8600   Lisp_Object coding_type, charset_list, safe_charsets;
8601   enum coding_category category;
8602   Lisp_Object tail, val;
8603   int max_charset_id = 0;
8604   int i;
8605
8606   if (nargs < coding_arg_max)
8607     goto short_args;
8608
8609   attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
8610
8611   name = args[coding_arg_name];
8612   CHECK_SYMBOL (name);
8613   CODING_ATTR_BASE_NAME (attrs) = name;
8614
8615   val = args[coding_arg_mnemonic];
8616   if (! STRINGP (val))
8617     CHECK_CHARACTER (val);
8618   CODING_ATTR_MNEMONIC (attrs) = val;
8619
8620   coding_type = args[coding_arg_coding_type];
8621   CHECK_SYMBOL (coding_type);
8622   CODING_ATTR_TYPE (attrs) = coding_type;
8623
8624   charset_list = args[coding_arg_charset_list];
8625   if (SYMBOLP (charset_list))
8626     {
8627       if (EQ (charset_list, Qiso_2022))
8628         {
8629           if (! EQ (coding_type, Qiso_2022))
8630             error ("Invalid charset-list");
8631           charset_list = Viso_2022_charset_list;
8632         }
8633       else if (EQ (charset_list, Qemacs_mule))
8634         {
8635           if (! EQ (coding_type, Qemacs_mule))
8636             error ("Invalid charset-list");
8637           charset_list = Vemacs_mule_charset_list;
8638         }
8639       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
8640         if (max_charset_id < XFASTINT (XCAR (tail)))
8641           max_charset_id = XFASTINT (XCAR (tail));
8642     }
8643   else
8644     {
8645       charset_list = Fcopy_sequence (charset_list);
8646       for (tail = charset_list; !NILP (tail); tail = Fcdr (tail))
8647         {
8648           struct charset *charset;
8649
8650           val = Fcar (tail);
8651           CHECK_CHARSET_GET_CHARSET (val, charset);
8652           if (EQ (coding_type, Qiso_2022)
8653               ? CHARSET_ISO_FINAL (charset) < 0
8654               : EQ (coding_type, Qemacs_mule)
8655               ? CHARSET_EMACS_MULE_ID (charset) < 0
8656               : 0)
8657             error ("Can't handle charset `%s'",
8658                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8659
8660           XSETCAR (tail, make_number (charset->id));
8661           if (max_charset_id < charset->id)
8662             max_charset_id = charset->id;
8663         }
8664     }
8665   CODING_ATTR_CHARSET_LIST (attrs) = charset_list;
8666
8667   safe_charsets = Fmake_string (make_number (max_charset_id + 1),
8668                                 make_number (255));
8669   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
8670     SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
8671   CODING_ATTR_SAFE_CHARSETS (attrs) = safe_charsets;
8672
8673   CODING_ATTR_ASCII_COMPAT (attrs) = args[coding_arg_ascii_compatible_p];
8674
8675   val = args[coding_arg_decode_translation_table];
8676   if (! CHAR_TABLE_P (val) && ! CONSP (val))
8677     CHECK_SYMBOL (val);
8678   CODING_ATTR_DECODE_TBL (attrs) = val;
8679
8680   val = args[coding_arg_encode_translation_table];
8681   if (! CHAR_TABLE_P (val) && ! CONSP (val))
8682     CHECK_SYMBOL (val);
8683   CODING_ATTR_ENCODE_TBL (attrs) = val;
8684
8685   val = args[coding_arg_post_read_conversion];
8686   CHECK_SYMBOL (val);
8687   CODING_ATTR_POST_READ (attrs) = val;
8688
8689   val = args[coding_arg_pre_write_conversion];
8690   CHECK_SYMBOL (val);
8691   CODING_ATTR_PRE_WRITE (attrs) = val;
8692
8693   val = args[coding_arg_default_char];
8694   if (NILP (val))
8695     CODING_ATTR_DEFAULT_CHAR (attrs) = make_number (' ');
8696   else
8697     {
8698       CHECK_CHARACTER (val);
8699       CODING_ATTR_DEFAULT_CHAR (attrs) = val;
8700     }
8701
8702   val = args[coding_arg_for_unibyte];
8703   CODING_ATTR_FOR_UNIBYTE (attrs) = NILP (val) ? Qnil : Qt;
8704
8705   val = args[coding_arg_plist];
8706   CHECK_LIST (val);
8707   CODING_ATTR_PLIST (attrs) = val;
8708
8709   if (EQ (coding_type, Qcharset))
8710     {
8711       /* Generate a lisp vector of 256 elements.  Each element is nil,
8712          integer, or a list of charset IDs.
8713
8714          If Nth element is nil, the byte code N is invalid in this
8715          coding system.
8716
8717          If Nth element is a number NUM, N is the first byte of a
8718          charset whose ID is NUM.
8719
8720          If Nth element is a list of charset IDs, N is the first byte
8721          of one of them.  The list is sorted by dimensions of the
8722          charsets.  A charset of smaller dimension comes firtst. */
8723       val = Fmake_vector (make_number (256), Qnil);
8724
8725       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
8726         {
8727           struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
8728           int dim = CHARSET_DIMENSION (charset);
8729           int idx = (dim - 1) * 4;
8730
8731           if (CHARSET_ASCII_COMPATIBLE_P (charset))
8732             CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8733
8734           for (i = charset->code_space[idx];
8735                i <= charset->code_space[idx + 1]; i++)
8736             {
8737               Lisp_Object tmp, tmp2;
8738               int dim2;
8739
8740               tmp = AREF (val, i);
8741               if (NILP (tmp))
8742                 tmp = XCAR (tail);
8743               else if (NUMBERP (tmp))
8744                 {
8745                   dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
8746                   if (dim < dim2)
8747                     tmp = Fcons (XCAR (tail), Fcons (tmp, Qnil));
8748                   else
8749                     tmp = Fcons (tmp, Fcons (XCAR (tail), Qnil));
8750                 }
8751               else
8752                 {
8753                   for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
8754                     {
8755                       dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
8756                       if (dim < dim2)
8757                         break;
8758                     }
8759                   if (NILP (tmp2))
8760                     tmp = nconc2 (tmp, Fcons (XCAR (tail), Qnil));
8761                   else
8762                     {
8763                       XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
8764                       XSETCAR (tmp2, XCAR (tail));
8765                     }
8766                 }
8767               ASET (val, i, tmp);
8768             }
8769         }
8770       ASET (attrs, coding_attr_charset_valids, val);
8771       category = coding_category_charset;
8772     }
8773   else if (EQ (coding_type, Qccl))
8774     {
8775       Lisp_Object valids;
8776
8777       if (nargs < coding_arg_ccl_max)
8778         goto short_args;
8779
8780       val = args[coding_arg_ccl_decoder];
8781       CHECK_CCL_PROGRAM (val);
8782       if (VECTORP (val))
8783         val = Fcopy_sequence (val);
8784       ASET (attrs, coding_attr_ccl_decoder, val);
8785
8786       val = args[coding_arg_ccl_encoder];
8787       CHECK_CCL_PROGRAM (val);
8788       if (VECTORP (val))
8789         val = Fcopy_sequence (val);
8790       ASET (attrs, coding_attr_ccl_encoder, val);
8791
8792       val = args[coding_arg_ccl_valids];
8793       valids = Fmake_string (make_number (256), make_number (0));
8794       for (tail = val; !NILP (tail); tail = Fcdr (tail))
8795         {
8796           int from, to;
8797
8798           val = Fcar (tail);
8799           if (INTEGERP (val))
8800             {
8801               from = to = XINT (val);
8802               if (from < 0 || from > 255)
8803                 args_out_of_range_3 (val, make_number (0), make_number (255));
8804             }
8805           else
8806             {
8807               CHECK_CONS (val);
8808               CHECK_NATNUM_CAR (val);
8809               CHECK_NATNUM_CDR (val);
8810               from = XINT (XCAR (val));
8811               if (from > 255)
8812                 args_out_of_range_3 (XCAR (val),
8813                                      make_number (0), make_number (255));
8814               to = XINT (XCDR (val));
8815               if (to < from || to > 255)
8816                 args_out_of_range_3 (XCDR (val),
8817                                      XCAR (val), make_number (255));
8818             }
8819           for (i = from; i <= to; i++)
8820             SSET (valids, i, 1);
8821         }
8822       ASET (attrs, coding_attr_ccl_valids, valids);
8823
8824       category = coding_category_ccl;
8825     }
8826   else if (EQ (coding_type, Qutf_16))
8827     {
8828       Lisp_Object bom, endian;
8829
8830       CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
8831
8832       if (nargs < coding_arg_utf16_max)
8833         goto short_args;
8834
8835       bom = args[coding_arg_utf16_bom];
8836       if (! NILP (bom) && ! EQ (bom, Qt))
8837         {
8838           CHECK_CONS (bom);
8839           val = XCAR (bom);
8840           CHECK_CODING_SYSTEM (val);
8841           val = XCDR (bom);
8842           CHECK_CODING_SYSTEM (val);
8843         }
8844       ASET (attrs, coding_attr_utf_16_bom, bom);
8845
8846       endian = args[coding_arg_utf16_endian];
8847       CHECK_SYMBOL (endian);
8848       if (NILP (endian))
8849         endian = Qbig;
8850       else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
8851         error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
8852       ASET (attrs, coding_attr_utf_16_endian, endian);
8853
8854       category = (CONSP (bom)
8855                   ? coding_category_utf_16_auto
8856                   : NILP (bom)
8857                   ? (EQ (endian, Qbig)
8858                      ? coding_category_utf_16_be_nosig
8859                      : coding_category_utf_16_le_nosig)
8860                   : (EQ (endian, Qbig)
8861                      ? coding_category_utf_16_be
8862                      : coding_category_utf_16_le));
8863     }
8864   else if (EQ (coding_type, Qiso_2022))
8865     {
8866       Lisp_Object initial, reg_usage, request, flags;
8867       int i;
8868
8869       if (nargs < coding_arg_iso2022_max)
8870         goto short_args;
8871
8872       initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
8873       CHECK_VECTOR (initial);
8874       for (i = 0; i < 4; i++)
8875         {
8876           val = Faref (initial, make_number (i));
8877           if (! NILP (val))
8878             {
8879               struct charset *charset;
8880
8881               CHECK_CHARSET_GET_CHARSET (val, charset);
8882               ASET (initial, i, make_number (CHARSET_ID (charset)));
8883               if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
8884                 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8885             }
8886           else
8887             ASET (initial, i, make_number (-1));
8888         }
8889
8890       reg_usage = args[coding_arg_iso2022_reg_usage];
8891       CHECK_CONS (reg_usage);
8892       CHECK_NUMBER_CAR (reg_usage);
8893       CHECK_NUMBER_CDR (reg_usage);
8894
8895       request = Fcopy_sequence (args[coding_arg_iso2022_request]);
8896       for (tail = request; ! NILP (tail); tail = Fcdr (tail))
8897         {
8898           int id;
8899           Lisp_Object tmp;
8900
8901           val = Fcar (tail);
8902           CHECK_CONS (val);
8903           tmp = XCAR (val);
8904           CHECK_CHARSET_GET_ID (tmp, id);
8905           CHECK_NATNUM_CDR (val);
8906           if (XINT (XCDR (val)) >= 4)
8907             error ("Invalid graphic register number: %d", XINT (XCDR (val)));
8908           XSETCAR (val, make_number (id));
8909         }
8910
8911       flags = args[coding_arg_iso2022_flags];
8912       CHECK_NATNUM (flags);
8913       i = XINT (flags);
8914       if (EQ (args[coding_arg_charset_list], Qiso_2022))
8915         flags = make_number (i | CODING_ISO_FLAG_FULL_SUPPORT);
8916
8917       ASET (attrs, coding_attr_iso_initial, initial);
8918       ASET (attrs, coding_attr_iso_usage, reg_usage);
8919       ASET (attrs, coding_attr_iso_request, request);
8920       ASET (attrs, coding_attr_iso_flags, flags);
8921       setup_iso_safe_charsets (attrs);
8922
8923       if (i & CODING_ISO_FLAG_SEVEN_BITS)
8924         category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
8925                           | CODING_ISO_FLAG_SINGLE_SHIFT))
8926                     ? coding_category_iso_7_else
8927                     : EQ (args[coding_arg_charset_list], Qiso_2022)
8928                     ? coding_category_iso_7
8929                     : coding_category_iso_7_tight);
8930       else
8931         {
8932           int id = XINT (AREF (initial, 1));
8933
8934           category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
8935                        || EQ (args[coding_arg_charset_list], Qiso_2022)
8936                        || id < 0)
8937                       ? coding_category_iso_8_else
8938                       : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
8939                       ? coding_category_iso_8_1
8940                       : coding_category_iso_8_2);
8941         }
8942       if (category != coding_category_iso_8_1
8943           && category != coding_category_iso_8_2)
8944         CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
8945     }
8946   else if (EQ (coding_type, Qemacs_mule))
8947     {
8948       if (EQ (args[coding_arg_charset_list], Qemacs_mule))
8949         ASET (attrs, coding_attr_emacs_mule_full, Qt);
8950       CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8951       category = coding_category_emacs_mule;
8952     }
8953   else if (EQ (coding_type, Qshift_jis))
8954     {
8955
8956       struct charset *charset;
8957
8958       if (XINT (Flength (charset_list)) != 3
8959           && XINT (Flength (charset_list)) != 4)
8960         error ("There should be three or four charsets");
8961
8962       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8963       if (CHARSET_DIMENSION (charset) != 1)
8964         error ("Dimension of charset %s is not one",
8965                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8966       if (CHARSET_ASCII_COMPATIBLE_P (charset))
8967         CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8968
8969       charset_list = XCDR (charset_list);
8970       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8971       if (CHARSET_DIMENSION (charset) != 1)
8972         error ("Dimension of charset %s is not one",
8973                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8974
8975       charset_list = XCDR (charset_list);
8976       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8977       if (CHARSET_DIMENSION (charset) != 2)
8978         error ("Dimension of charset %s is not two",
8979                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8980
8981       charset_list = XCDR (charset_list);
8982       if (! NILP (charset_list))
8983         {
8984           charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8985           if (CHARSET_DIMENSION (charset) != 2)
8986             error ("Dimension of charset %s is not two",
8987                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8988         }
8989
8990       category = coding_category_sjis;
8991       Vsjis_coding_system = name;
8992     }
8993   else if (EQ (coding_type, Qbig5))
8994     {
8995       struct charset *charset;
8996
8997       if (XINT (Flength (charset_list)) != 2)
8998         error ("There should be just two charsets");
8999
9000       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9001       if (CHARSET_DIMENSION (charset) != 1)
9002         error ("Dimension of charset %s is not one",
9003                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9004       if (CHARSET_ASCII_COMPATIBLE_P (charset))
9005         CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9006
9007       charset_list = XCDR (charset_list);
9008       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9009       if (CHARSET_DIMENSION (charset) != 2)
9010         error ("Dimension of charset %s is not two",
9011                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9012
9013       category = coding_category_big5;
9014       Vbig5_coding_system = name;
9015     }
9016   else if (EQ (coding_type, Qraw_text))
9017     {
9018       category = coding_category_raw_text;
9019       CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9020     }
9021   else if (EQ (coding_type, Qutf_8))
9022     {
9023       category = coding_category_utf_8;
9024       CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9025     }
9026   else if (EQ (coding_type, Qundecided))
9027     category = coding_category_undecided;
9028   else
9029     error ("Invalid coding system type: %s",
9030            SDATA (SYMBOL_NAME (coding_type)));
9031
9032   CODING_ATTR_CATEGORY (attrs) = make_number (category);
9033   CODING_ATTR_PLIST (attrs)
9034     = Fcons (QCcategory, Fcons (AREF (Vcoding_category_table, category),
9035                                 CODING_ATTR_PLIST (attrs)));
9036   CODING_ATTR_PLIST (attrs)
9037     = Fcons (QCascii_compatible_p,
9038              Fcons (CODING_ATTR_ASCII_COMPAT (attrs),
9039                     CODING_ATTR_PLIST (attrs)));
9040
9041   eol_type = args[coding_arg_eol_type];
9042   if (! NILP (eol_type)
9043       && ! EQ (eol_type, Qunix)
9044       && ! EQ (eol_type, Qdos)
9045       && ! EQ (eol_type, Qmac))
9046     error ("Invalid eol-type");
9047
9048   aliases = Fcons (name, Qnil);
9049
9050   if (NILP (eol_type))
9051     {
9052       eol_type = make_subsidiaries (name);
9053       for (i = 0; i < 3; i++)
9054         {
9055           Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
9056
9057           this_name = AREF (eol_type, i);
9058           this_aliases = Fcons (this_name, Qnil);
9059           this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
9060           this_spec = Fmake_vector (make_number (3), attrs);
9061           ASET (this_spec, 1, this_aliases);
9062           ASET (this_spec, 2, this_eol_type);
9063           Fputhash (this_name, this_spec, Vcoding_system_hash_table);
9064           Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
9065           val = Fassoc (Fsymbol_name (this_name), Vcoding_system_alist);
9066           if (NILP (val))
9067             Vcoding_system_alist
9068               = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
9069                        Vcoding_system_alist);
9070         }
9071     }
9072
9073   spec_vec = Fmake_vector (make_number (3), attrs);
9074   ASET (spec_vec, 1, aliases);
9075   ASET (spec_vec, 2, eol_type);
9076
9077   Fputhash (name, spec_vec, Vcoding_system_hash_table);
9078   Vcoding_system_list = Fcons (name, Vcoding_system_list);
9079   val = Fassoc (Fsymbol_name (name), Vcoding_system_alist);
9080   if (NILP (val))
9081     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
9082                                   Vcoding_system_alist);
9083
9084   {
9085     int id = coding_categories[category].id;
9086
9087     if (id < 0 || EQ (name, CODING_ID_NAME (id)))
9088       setup_coding_system (name, &coding_categories[category]);
9089   }
9090
9091   return Qnil;
9092
9093  short_args:
9094   return Fsignal (Qwrong_number_of_arguments,
9095                   Fcons (intern ("define-coding-system-internal"),
9096                          make_number (nargs)));
9097 }
9098
9099
9100 DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
9101        3, 3, 0,
9102        doc: /* Change value in CODING-SYSTEM's property list PROP to VAL.  */)
9103   (coding_system, prop, val)
9104      Lisp_Object coding_system, prop, val;
9105 {
9106   Lisp_Object spec, attrs;
9107
9108   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
9109   attrs = AREF (spec, 0);
9110   if (EQ (prop, QCmnemonic))
9111     {
9112       if (! STRINGP (val))
9113         CHECK_CHARACTER (val);
9114       CODING_ATTR_MNEMONIC (attrs) = val;
9115     }
9116   else if (EQ (prop, QCdefalut_char))
9117     {
9118       if (NILP (val))
9119         val = make_number (' ');
9120       else
9121         CHECK_CHARACTER (val);
9122       CODING_ATTR_DEFAULT_CHAR (attrs) = val;
9123     }
9124   else if (EQ (prop, QCdecode_translation_table))
9125     {
9126       if (! CHAR_TABLE_P (val) && ! CONSP (val))
9127         CHECK_SYMBOL (val);
9128       CODING_ATTR_DECODE_TBL (attrs) = val;
9129     }
9130   else if (EQ (prop, QCencode_translation_table))
9131     {
9132       if (! CHAR_TABLE_P (val) && ! CONSP (val))
9133         CHECK_SYMBOL (val);
9134       CODING_ATTR_ENCODE_TBL (attrs) = val;
9135     }
9136   else if (EQ (prop, QCpost_read_conversion))
9137     {
9138       CHECK_SYMBOL (val);
9139       CODING_ATTR_POST_READ (attrs) = val;
9140     }
9141   else if (EQ (prop, QCpre_write_conversion))
9142     {
9143       CHECK_SYMBOL (val);
9144       CODING_ATTR_PRE_WRITE (attrs) = val;
9145     }
9146   else if (EQ (prop, QCascii_compatible_p))
9147     {
9148       CODING_ATTR_ASCII_COMPAT (attrs) = val;
9149     }
9150
9151   CODING_ATTR_PLIST (attrs)
9152     = Fplist_put (CODING_ATTR_PLIST (attrs), prop, val);
9153   return val;
9154 }
9155
9156
9157 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
9158        Sdefine_coding_system_alias, 2, 2, 0,
9159        doc: /* Define ALIAS as an alias for CODING-SYSTEM.  */)
9160      (alias, coding_system)
9161      Lisp_Object alias, coding_system;
9162 {
9163   Lisp_Object spec, aliases, eol_type, val;
9164
9165   CHECK_SYMBOL (alias);
9166   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
9167   aliases = AREF (spec, 1);
9168   /* ALISES should be a list of length more than zero, and the first
9169      element is a base coding system.  Append ALIAS at the tail of the
9170      list.  */
9171   while (!NILP (XCDR (aliases)))
9172     aliases = XCDR (aliases);
9173   XSETCDR (aliases, Fcons (alias, Qnil));
9174
9175   eol_type = AREF (spec, 2);
9176   if (VECTORP (eol_type))
9177     {
9178       Lisp_Object subsidiaries;
9179       int i;
9180
9181       subsidiaries = make_subsidiaries (alias);
9182       for (i = 0; i < 3; i++)
9183         Fdefine_coding_system_alias (AREF (subsidiaries, i),
9184                                      AREF (eol_type, i));
9185     }
9186
9187   Fputhash (alias, spec, Vcoding_system_hash_table);
9188   Vcoding_system_list = Fcons (alias, Vcoding_system_list);
9189   val = Fassoc (Fsymbol_name (alias), Vcoding_system_alist);
9190   if (NILP (val))
9191     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
9192                                   Vcoding_system_alist);
9193
9194   return Qnil;
9195 }
9196
9197 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
9198        1, 1, 0,
9199        doc: /* Return the base of CODING-SYSTEM.
9200 Any alias or subsidiary coding system is not a base coding system.  */)
9201   (coding_system)
9202      Lisp_Object coding_system;
9203 {
9204   Lisp_Object spec, attrs;
9205
9206   if (NILP (coding_system))
9207     return (Qno_conversion);
9208   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
9209   attrs = AREF (spec, 0);
9210   return CODING_ATTR_BASE_NAME (attrs);
9211 }
9212
9213 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
9214        1, 1, 0,
9215        doc: "Return the property list of CODING-SYSTEM.")
9216      (coding_system)
9217      Lisp_Object coding_system;
9218 {
9219   Lisp_Object spec, attrs;
9220
9221   if (NILP (coding_system))
9222     coding_system = Qno_conversion;
9223   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
9224   attrs = AREF (spec, 0);
9225   return CODING_ATTR_PLIST (attrs);
9226 }
9227
9228
9229 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
9230        1, 1, 0,
9231        doc: /* Return the list of aliases of CODING-SYSTEM.  */)
9232      (coding_system)
9233      Lisp_Object coding_system;
9234 {
9235   Lisp_Object spec;
9236
9237   if (NILP (coding_system))
9238     coding_system = Qno_conversion;
9239   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
9240   return AREF (spec, 1);
9241 }
9242
9243 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
9244        Scoding_system_eol_type, 1, 1, 0,
9245        doc: /* Return eol-type of CODING-SYSTEM.
9246 An eol-type is integer 0, 1, 2, or a vector of coding systems.
9247
9248 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
9249 and CR respectively.
9250
9251 A vector value indicates that a format of end-of-line should be
9252 detected automatically.  Nth element of the vector is the subsidiary
9253 coding system whose eol-type is N.  */)
9254      (coding_system)
9255      Lisp_Object coding_system;
9256 {
9257   Lisp_Object spec, eol_type;
9258   int n;
9259
9260   if (NILP (coding_system))
9261     coding_system = Qno_conversion;
9262   if (! CODING_SYSTEM_P (coding_system))
9263     return Qnil;
9264   spec = CODING_SYSTEM_SPEC (coding_system);
9265   eol_type = AREF (spec, 2);
9266   if (VECTORP (eol_type))
9267     return Fcopy_sequence (eol_type);
9268   n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
9269   return make_number (n);
9270 }
9271
9272 #endif /* emacs */
9273
9274 \f
9275 /*** 12. Post-amble ***/
9276
9277 void
9278 init_coding_once ()
9279 {
9280   int i;
9281
9282   for (i = 0; i < coding_category_max; i++)
9283     {
9284       coding_categories[i].id = -1;
9285       coding_priorities[i] = i;
9286     }
9287
9288   /* ISO2022 specific initialize routine.  */
9289   for (i = 0; i < 0x20; i++)
9290     iso_code_class[i] = ISO_control_0;
9291   for (i = 0x21; i < 0x7F; i++)
9292     iso_code_class[i] = ISO_graphic_plane_0;
9293   for (i = 0x80; i < 0xA0; i++)
9294     iso_code_class[i] = ISO_control_1;
9295   for (i = 0xA1; i < 0xFF; i++)
9296     iso_code_class[i] = ISO_graphic_plane_1;
9297   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
9298   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
9299   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
9300   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
9301   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
9302   iso_code_class[ISO_CODE_ESC] = ISO_escape;
9303   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
9304   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
9305   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
9306
9307   for (i = 0; i < 256; i++)
9308     {
9309       emacs_mule_bytes[i] = 1;
9310     }
9311   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
9312   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
9313   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
9314   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
9315 }
9316
9317 #ifdef emacs
9318
9319 void
9320 syms_of_coding ()
9321 {
9322   staticpro (&Vcoding_system_hash_table);
9323   {
9324     Lisp_Object args[2];
9325     args[0] = QCtest;
9326     args[1] = Qeq;
9327     Vcoding_system_hash_table = Fmake_hash_table (2, args);
9328   }
9329
9330   staticpro (&Vsjis_coding_system);
9331   Vsjis_coding_system = Qnil;
9332
9333   staticpro (&Vbig5_coding_system);
9334   Vbig5_coding_system = Qnil;
9335
9336   staticpro (&Vcode_conversion_reused_workbuf);
9337   Vcode_conversion_reused_workbuf = Qnil;
9338
9339   staticpro (&Vcode_conversion_workbuf_name);
9340   Vcode_conversion_workbuf_name = build_string (" *code-conversion-work*");
9341
9342   reused_workbuf_in_use = 0;
9343
9344   DEFSYM (Qcharset, "charset");
9345   DEFSYM (Qtarget_idx, "target-idx");
9346   DEFSYM (Qcoding_system_history, "coding-system-history");
9347   Fset (Qcoding_system_history, Qnil);
9348
9349   /* Target FILENAME is the first argument.  */
9350   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
9351   /* Target FILENAME is the third argument.  */
9352   Fput (Qwrite_region, Qtarget_idx, make_number (2));
9353
9354   DEFSYM (Qcall_process, "call-process");
9355   /* Target PROGRAM is the first argument.  */
9356   Fput (Qcall_process, Qtarget_idx, make_number (0));
9357
9358   DEFSYM (Qcall_process_region, "call-process-region");
9359   /* Target PROGRAM is the third argument.  */
9360   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
9361
9362   DEFSYM (Qstart_process, "start-process");
9363   /* Target PROGRAM is the third argument.  */
9364   Fput (Qstart_process, Qtarget_idx, make_number (2));
9365
9366   DEFSYM (Qopen_network_stream, "open-network-stream");
9367   /* Target SERVICE is the fourth argument.  */
9368   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
9369
9370   DEFSYM (Qcoding_system, "coding-system");
9371   DEFSYM (Qcoding_aliases, "coding-aliases");
9372
9373   DEFSYM (Qeol_type, "eol-type");
9374   DEFSYM (Qunix, "unix");
9375   DEFSYM (Qdos, "dos");
9376
9377   DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
9378   DEFSYM (Qpost_read_conversion, "post-read-conversion");
9379   DEFSYM (Qpre_write_conversion, "pre-write-conversion");
9380   DEFSYM (Qdefault_char, "default-char");
9381   DEFSYM (Qundecided, "undecided");
9382   DEFSYM (Qno_conversion, "no-conversion");
9383   DEFSYM (Qraw_text, "raw-text");
9384
9385   DEFSYM (Qiso_2022, "iso-2022");
9386
9387   DEFSYM (Qutf_8, "utf-8");
9388   DEFSYM (Qutf_8_emacs, "utf-8-emacs");
9389
9390   DEFSYM (Qutf_16, "utf-16");
9391   DEFSYM (Qbig, "big");
9392   DEFSYM (Qlittle, "little");
9393
9394   DEFSYM (Qshift_jis, "shift-jis");
9395   DEFSYM (Qbig5, "big5");
9396
9397   DEFSYM (Qcoding_system_p, "coding-system-p");
9398
9399   DEFSYM (Qcoding_system_error, "coding-system-error");
9400   Fput (Qcoding_system_error, Qerror_conditions,
9401         Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
9402   Fput (Qcoding_system_error, Qerror_message,
9403         build_string ("Invalid coding system"));
9404
9405   /* Intern this now in case it isn't already done.
9406      Setting this variable twice is harmless.
9407      But don't staticpro it here--that is done in alloc.c.  */
9408   Qchar_table_extra_slots = intern ("char-table-extra-slots");
9409
9410   DEFSYM (Qtranslation_table, "translation-table");
9411   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
9412   DEFSYM (Qtranslation_table_id, "translation-table-id");
9413   DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
9414   DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
9415
9416   DEFSYM (Qvalid_codes, "valid-codes");
9417
9418   DEFSYM (Qemacs_mule, "emacs-mule");
9419
9420   DEFSYM (QCcategory, ":category");
9421   DEFSYM (QCmnemonic, ":mnemonic");
9422   DEFSYM (QCdefalut_char, ":default-char");
9423   DEFSYM (QCdecode_translation_table, ":decode-translation-table");
9424   DEFSYM (QCencode_translation_table, ":encode-translation-table");
9425   DEFSYM (QCpost_read_conversion, ":post-read-conversion");
9426   DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
9427   DEFSYM (QCascii_compatible_p, ":ascii-compatible-p");
9428
9429   Vcoding_category_table
9430     = Fmake_vector (make_number (coding_category_max), Qnil);
9431   staticpro (&Vcoding_category_table);
9432   /* Followings are target of code detection.  */
9433   ASET (Vcoding_category_table, coding_category_iso_7,
9434         intern ("coding-category-iso-7"));
9435   ASET (Vcoding_category_table, coding_category_iso_7_tight,
9436         intern ("coding-category-iso-7-tight"));
9437   ASET (Vcoding_category_table, coding_category_iso_8_1,
9438         intern ("coding-category-iso-8-1"));
9439   ASET (Vcoding_category_table, coding_category_iso_8_2,
9440         intern ("coding-category-iso-8-2"));
9441   ASET (Vcoding_category_table, coding_category_iso_7_else,
9442         intern ("coding-category-iso-7-else"));
9443   ASET (Vcoding_category_table, coding_category_iso_8_else,
9444         intern ("coding-category-iso-8-else"));
9445   ASET (Vcoding_category_table, coding_category_utf_8,
9446         intern ("coding-category-utf-8"));
9447   ASET (Vcoding_category_table, coding_category_utf_16_be,
9448         intern ("coding-category-utf-16-be"));
9449   ASET (Vcoding_category_table, coding_category_utf_16_auto,
9450         intern ("coding-category-utf-16-auto"));
9451   ASET (Vcoding_category_table, coding_category_utf_16_le,
9452         intern ("coding-category-utf-16-le"));
9453   ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
9454         intern ("coding-category-utf-16-be-nosig"));
9455   ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
9456         intern ("coding-category-utf-16-le-nosig"));
9457   ASET (Vcoding_category_table, coding_category_charset,
9458         intern ("coding-category-charset"));
9459   ASET (Vcoding_category_table, coding_category_sjis,
9460         intern ("coding-category-sjis"));
9461   ASET (Vcoding_category_table, coding_category_big5,
9462         intern ("coding-category-big5"));
9463   ASET (Vcoding_category_table, coding_category_ccl,
9464         intern ("coding-category-ccl"));
9465   ASET (Vcoding_category_table, coding_category_emacs_mule,
9466         intern ("coding-category-emacs-mule"));
9467   /* Followings are NOT target of code detection.  */
9468   ASET (Vcoding_category_table, coding_category_raw_text,
9469         intern ("coding-category-raw-text"));
9470   ASET (Vcoding_category_table, coding_category_undecided,
9471         intern ("coding-category-undecided"));
9472
9473   DEFSYM (Qinsufficient_source, "insufficient-source");
9474   DEFSYM (Qinconsistent_eol, "inconsistent-eol");
9475   DEFSYM (Qinvalid_source, "invalid-source");
9476   DEFSYM (Qinterrupted, "interrupted");
9477   DEFSYM (Qinsufficient_memory, "insufficient-memory");
9478   DEFSYM (Qcoding_system_define_form, "coding-system-define-form");
9479
9480   defsubr (&Scoding_system_p);
9481   defsubr (&Sread_coding_system);
9482   defsubr (&Sread_non_nil_coding_system);
9483   defsubr (&Scheck_coding_system);
9484   defsubr (&Sdetect_coding_region);
9485   defsubr (&Sdetect_coding_string);
9486   defsubr (&Sfind_coding_systems_region_internal);
9487   defsubr (&Sunencodable_char_position);
9488   defsubr (&Scheck_coding_systems_region);
9489   defsubr (&Sdecode_coding_region);
9490   defsubr (&Sencode_coding_region);
9491   defsubr (&Sdecode_coding_string);
9492   defsubr (&Sencode_coding_string);
9493   defsubr (&Sdecode_sjis_char);
9494   defsubr (&Sencode_sjis_char);
9495   defsubr (&Sdecode_big5_char);
9496   defsubr (&Sencode_big5_char);
9497   defsubr (&Sset_terminal_coding_system_internal);
9498   defsubr (&Sset_safe_terminal_coding_system_internal);
9499   defsubr (&Sterminal_coding_system);
9500   defsubr (&Sset_keyboard_coding_system_internal);
9501   defsubr (&Skeyboard_coding_system);
9502   defsubr (&Sfind_operation_coding_system);
9503   defsubr (&Sset_coding_system_priority);
9504   defsubr (&Sdefine_coding_system_internal);
9505   defsubr (&Sdefine_coding_system_alias);
9506   defsubr (&Scoding_system_put);
9507   defsubr (&Scoding_system_base);
9508   defsubr (&Scoding_system_plist);
9509   defsubr (&Scoding_system_aliases);
9510   defsubr (&Scoding_system_eol_type);
9511   defsubr (&Scoding_system_priority_list);
9512
9513   DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
9514                doc: /* List of coding systems.
9515
9516 Do not alter the value of this variable manually.  This variable should be
9517 updated by the functions `define-coding-system' and
9518 `define-coding-system-alias'.  */);
9519   Vcoding_system_list = Qnil;
9520
9521   DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
9522                doc: /* Alist of coding system names.
9523 Each element is one element list of coding system name.
9524 This variable is given to `completing-read' as TABLE argument.
9525
9526 Do not alter the value of this variable manually.  This variable should be
9527 updated by the functions `make-coding-system' and
9528 `define-coding-system-alias'.  */);
9529   Vcoding_system_alist = Qnil;
9530
9531   DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
9532                doc: /* List of coding-categories (symbols) ordered by priority.
9533
9534 On detecting a coding system, Emacs tries code detection algorithms
9535 associated with each coding-category one by one in this order.  When
9536 one algorithm agrees with a byte sequence of source text, the coding
9537 system bound to the corresponding coding-category is selected.
9538
9539 Don't modify this variable directly, but use `set-coding-priority'.  */);
9540   {
9541     int i;
9542
9543     Vcoding_category_list = Qnil;
9544     for (i = coding_category_max - 1; i >= 0; i--)
9545       Vcoding_category_list
9546         = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
9547                  Vcoding_category_list);
9548   }
9549
9550   DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
9551                doc: /* Specify the coding system for read operations.
9552 It is useful to bind this variable with `let', but do not set it globally.
9553 If the value is a coding system, it is used for decoding on read operation.
9554 If not, an appropriate element is used from one of the coding system alists:
9555 There are three such tables, `file-coding-system-alist',
9556 `process-coding-system-alist', and `network-coding-system-alist'.  */);
9557   Vcoding_system_for_read = Qnil;
9558
9559   DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
9560                doc: /* Specify the coding system for write operations.
9561 Programs bind this variable with `let', but you should not set it globally.
9562 If the value is a coding system, it is used for encoding of output,
9563 when writing it to a file and when sending it to a file or subprocess.
9564
9565 If this does not specify a coding system, an appropriate element
9566 is used from one of the coding system alists:
9567 There are three such tables, `file-coding-system-alist',
9568 `process-coding-system-alist', and `network-coding-system-alist'.
9569 For output to files, if the above procedure does not specify a coding system,
9570 the value of `buffer-file-coding-system' is used.  */);
9571   Vcoding_system_for_write = Qnil;
9572
9573   DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
9574                doc: /*
9575 Coding system used in the latest file or process I/O.  */);
9576   Vlast_coding_system_used = Qnil;
9577
9578   DEFVAR_LISP ("last-code-conversion-error", &Vlast_code_conversion_error,
9579                doc: /*
9580 Error status of the last code conversion.
9581
9582 When an error was detected in the last code conversion, this variable
9583 is set to one of the following symbols.
9584   `insufficient-source'
9585   `inconsistent-eol'
9586   `invalid-source'
9587   `interrupted'
9588   `insufficient-memory'
9589 When no error was detected, the value doesn't change.  So, to check
9590 the error status of a code conversion by this variable, you must
9591 explicitly set this variable to nil before performing code
9592 conversion.  */);
9593   Vlast_code_conversion_error = Qnil;
9594
9595   DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
9596                doc: /*
9597 *Non-nil means always inhibit code conversion of end-of-line format.
9598 See info node `Coding Systems' and info node `Text and Binary' concerning
9599 such conversion.  */);
9600   inhibit_eol_conversion = 0;
9601
9602   DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
9603                doc: /*
9604 Non-nil means process buffer inherits coding system of process output.
9605 Bind it to t if the process output is to be treated as if it were a file
9606 read from some filesystem.  */);
9607   inherit_process_coding_system = 0;
9608
9609   DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
9610                doc: /*
9611 Alist to decide a coding system to use for a file I/O operation.
9612 The format is ((PATTERN . VAL) ...),
9613 where PATTERN is a regular expression matching a file name,
9614 VAL is a coding system, a cons of coding systems, or a function symbol.
9615 If VAL is a coding system, it is used for both decoding and encoding
9616 the file contents.
9617 If VAL is a cons of coding systems, the car part is used for decoding,
9618 and the cdr part is used for encoding.
9619 If VAL is a function symbol, the function must return a coding system
9620 or a cons of coding systems which are used as above.  The function is
9621 called with an argument that is a list of the arguments with which
9622 `find-operation-coding-system' was called.  If the function can't decide
9623 a coding system, it can return `undecided' so that the normal
9624 code-detection is performed.
9625
9626 See also the function `find-operation-coding-system'
9627 and the variable `auto-coding-alist'.  */);
9628   Vfile_coding_system_alist = Qnil;
9629
9630   DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
9631                doc: /*
9632 Alist to decide a coding system to use for a process I/O operation.
9633 The format is ((PATTERN . VAL) ...),
9634 where PATTERN is a regular expression matching a program name,
9635 VAL is a coding system, a cons of coding systems, or a function symbol.
9636 If VAL is a coding system, it is used for both decoding what received
9637 from the program and encoding what sent to the program.
9638 If VAL is a cons of coding systems, the car part is used for decoding,
9639 and the cdr part is used for encoding.
9640 If VAL is a function symbol, the function must return a coding system
9641 or a cons of coding systems which are used as above.
9642
9643 See also the function `find-operation-coding-system'.  */);
9644   Vprocess_coding_system_alist = Qnil;
9645
9646   DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
9647                doc: /*
9648 Alist to decide a coding system to use for a network I/O operation.
9649 The format is ((PATTERN . VAL) ...),
9650 where PATTERN is a regular expression matching a network service name
9651 or is a port number to connect to,
9652 VAL is a coding system, a cons of coding systems, or a function symbol.
9653 If VAL is a coding system, it is used for both decoding what received
9654 from the network stream and encoding what sent to the network stream.
9655 If VAL is a cons of coding systems, the car part is used for decoding,
9656 and the cdr part is used for encoding.
9657 If VAL is a function symbol, the function must return a coding system
9658 or a cons of coding systems which are used as above.
9659
9660 See also the function `find-operation-coding-system'.  */);
9661   Vnetwork_coding_system_alist = Qnil;
9662
9663   DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
9664                doc: /* Coding system to use with system messages.
9665 Also used for decoding keyboard input on X Window system.  */);
9666   Vlocale_coding_system = Qnil;
9667
9668   /* The eol mnemonics are reset in startup.el system-dependently.  */
9669   DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
9670                doc: /*
9671 *String displayed in mode line for UNIX-like (LF) end-of-line format.  */);
9672   eol_mnemonic_unix = build_string (":");
9673
9674   DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
9675                doc: /*
9676 *String displayed in mode line for DOS-like (CRLF) end-of-line format.  */);
9677   eol_mnemonic_dos = build_string ("\\");
9678
9679   DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
9680                doc: /*
9681 *String displayed in mode line for MAC-like (CR) end-of-line format.  */);
9682   eol_mnemonic_mac = build_string ("/");
9683
9684   DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
9685                doc: /*
9686 *String displayed in mode line when end-of-line format is not yet determined.  */);
9687   eol_mnemonic_undecided = build_string (":");
9688
9689   DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
9690                doc: /*
9691 *Non-nil enables character translation while encoding and decoding.  */);
9692   Venable_character_translation = Qt;
9693
9694   DEFVAR_LISP ("standard-translation-table-for-decode",
9695                &Vstandard_translation_table_for_decode,
9696                doc: /* Table for translating characters while decoding.  */);
9697   Vstandard_translation_table_for_decode = Qnil;
9698
9699   DEFVAR_LISP ("standard-translation-table-for-encode",
9700                &Vstandard_translation_table_for_encode,
9701                doc: /* Table for translating characters while encoding.  */);
9702   Vstandard_translation_table_for_encode = Qnil;
9703
9704   DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_table,
9705                doc: /* Alist of charsets vs revision numbers.
9706 While encoding, if a charset (car part of an element) is found,
9707 designate it with the escape sequence identifying revision (cdr part
9708 of the element).  */);
9709   Vcharset_revision_table = Qnil;
9710
9711   DEFVAR_LISP ("default-process-coding-system",
9712                &Vdefault_process_coding_system,
9713                doc: /* Cons of coding systems used for process I/O by default.
9714 The car part is used for decoding a process output,
9715 the cdr part is used for encoding a text to be sent to a process.  */);
9716   Vdefault_process_coding_system = Qnil;
9717
9718   DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
9719                doc: /*
9720 Table of extra Latin codes in the range 128..159 (inclusive).
9721 This is a vector of length 256.
9722 If Nth element is non-nil, the existence of code N in a file
9723 \(or output of subprocess) doesn't prevent it to be detected as
9724 a coding system of ISO 2022 variant which has a flag
9725 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
9726 or reading output of a subprocess.
9727 Only 128th through 159th elements has a meaning.  */);
9728   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
9729
9730   DEFVAR_LISP ("select-safe-coding-system-function",
9731                &Vselect_safe_coding_system_function,
9732                doc: /*
9733 Function to call to select safe coding system for encoding a text.
9734
9735 If set, this function is called to force a user to select a proper
9736 coding system which can encode the text in the case that a default
9737 coding system used in each operation can't encode the text.
9738
9739 The default value is `select-safe-coding-system' (which see).  */);
9740   Vselect_safe_coding_system_function = Qnil;
9741
9742   DEFVAR_BOOL ("coding-system-require-warning",
9743                &coding_system_require_warning,
9744                doc: /* Internal use only.
9745 If non-nil, on writing a file, `select-safe-coding-system-function' is
9746 called even if `coding-system-for-write' is non-nil.  The command
9747 `universal-coding-system-argument' binds this variable to t temporarily.  */);
9748   coding_system_require_warning = 0;
9749
9750
9751   DEFVAR_BOOL ("inhibit-iso-escape-detection",
9752                &inhibit_iso_escape_detection,
9753                doc: /*
9754 If non-nil, Emacs ignores ISO2022's escape sequence on code detection.
9755
9756 By default, on reading a file, Emacs tries to detect how the text is
9757 encoded.  This code detection is sensitive to escape sequences.  If
9758 the sequence is valid as ISO2022, the code is determined as one of
9759 the ISO2022 encodings, and the file is decoded by the corresponding
9760 coding system (e.g. `iso-2022-7bit').
9761
9762 However, there may be a case that you want to read escape sequences in
9763 a file as is.  In such a case, you can set this variable to non-nil.
9764 Then, as the code detection ignores any escape sequences, no file is
9765 detected as encoded in some ISO2022 encoding.  The result is that all
9766 escape sequences become visible in a buffer.
9767
9768 The default value is nil, and it is strongly recommended not to change
9769 it.  That is because many Emacs Lisp source files that contain
9770 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
9771 in Emacs's distribution, and they won't be decoded correctly on
9772 reading if you suppress escape sequence detection.
9773
9774 The other way to read escape sequences in a file without decoding is
9775 to explicitly specify some coding system that doesn't use ISO2022's
9776 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument].  */);
9777   inhibit_iso_escape_detection = 0;
9778
9779   DEFVAR_LISP ("translation-table-for-input", &Vtranslation_table_for_input,
9780                doc: /* Char table for translating self-inserting characters.
9781 This is applied to the result of input methods, not their input.  See also
9782 `keyboard-translate-table'.  */);
9783     Vtranslation_table_for_input = Qnil;
9784
9785   {
9786     Lisp_Object args[coding_arg_max];
9787     Lisp_Object plist[16];
9788     int i;
9789
9790     for (i = 0; i < coding_arg_max; i++)
9791       args[i] = Qnil;
9792
9793     plist[0] = intern (":name");
9794     plist[1] = args[coding_arg_name] = Qno_conversion;
9795     plist[2] = intern (":mnemonic");
9796     plist[3] = args[coding_arg_mnemonic] = make_number ('=');
9797     plist[4] = intern (":coding-type");
9798     plist[5] = args[coding_arg_coding_type] = Qraw_text;
9799     plist[6] = intern (":ascii-compatible-p");
9800     plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
9801     plist[8] = intern (":default-char");
9802     plist[9] = args[coding_arg_default_char] = make_number (0);
9803     plist[10] = intern (":for-unibyte");
9804     plist[11] = args[coding_arg_for_unibyte] = Qt;
9805     plist[12] = intern (":docstring");
9806     plist[13] = build_string ("Do no conversion.\n\
9807 \n\
9808 When you visit a file with this coding, the file is read into a\n\
9809 unibyte buffer as is, thus each byte of a file is treated as a\n\
9810 character.");
9811     plist[14] = intern (":eol-type");
9812     plist[15] = args[coding_arg_eol_type] = Qunix;
9813     args[coding_arg_plist] = Flist (16, plist);
9814     Fdefine_coding_system_internal (coding_arg_max, args);
9815
9816     plist[1] = args[coding_arg_name] = Qundecided;
9817     plist[3] = args[coding_arg_mnemonic] = make_number ('-');
9818     plist[5] = args[coding_arg_coding_type] = Qundecided;
9819     /* This is already set.
9820        plist[7] = args[coding_arg_ascii_compatible_p] = Qt; */
9821     plist[8] = intern (":charset-list");
9822     plist[9] = args[coding_arg_charset_list] = Fcons (Qascii, Qnil);
9823     plist[11] = args[coding_arg_for_unibyte] = Qnil;
9824     plist[13] = build_string ("No conversion on encoding, automatic conversion on decoding.");
9825     plist[15] = args[coding_arg_eol_type] = Qnil;
9826     args[coding_arg_plist] = Flist (16, plist);
9827     Fdefine_coding_system_internal (coding_arg_max, args);
9828   }
9829
9830   setup_coding_system (Qno_conversion, &keyboard_coding);
9831   setup_coding_system (Qundecided, &terminal_coding);
9832   setup_coding_system (Qno_conversion, &safe_terminal_coding);
9833
9834   {
9835     int i;
9836
9837     for (i = 0; i < coding_category_max; i++)
9838       Fset (AREF (Vcoding_category_table, i), Qno_conversion);
9839   }
9840 #if defined (MSDOS) || defined (WINDOWSNT)
9841   system_eol_type = Qdos;
9842 #else
9843   system_eol_type = Qunix;
9844 #endif
9845   staticpro (&system_eol_type);
9846 }
9847
9848 char *
9849 emacs_strerror (error_number)
9850      int error_number;
9851 {
9852   char *str;
9853
9854   synchronize_system_messages_locale ();
9855   str = strerror (error_number);
9856
9857   if (! NILP (Vlocale_coding_system))
9858     {
9859       Lisp_Object dec = code_convert_string_norecord (build_string (str),
9860                                                       Vlocale_coding_system,
9861                                                       0);
9862       str = (char *) SDATA (dec);
9863     }
9864
9865   return str;
9866 }
9867
9868 #endif /* emacs */
9869
9870 /* arch-tag: 3a3a2b01-5ff6-4071-9afe-f5b808d9229d
9871    (do not change this comment) */