src/coding.c

   1 /* Coding system handler (conversion, detection, etc).
   2    Copyright (C) 2001, 2002, 2003, 2004, 2005,
   3                  2006 Free Software Foundation, Inc.
   4    Copyright (C) 1995, 1997, 1998, 2002, 2003, 2004, 2005
   5      National Institute of Advanced Industrial Science and Technology (AIST)
   6      Registration Number H14PRO021
   7    Copyright (C) 2003
   8      National Institute of Advanced Industrial Science and Technology (AIST)
   9      Registration Number H13PRO009
  10
  11 This file is part of GNU Emacs.
  12
  13 GNU Emacs is free software; you can redistribute it and/or modify
  14 it under the terms of the GNU General Public License as published by
  15 the Free Software Foundation; either version 2, or (at your option)
  16 any later version.
  17
  18 GNU Emacs is distributed in the hope that it will be useful,
  19 but WITHOUT ANY WARRANTY; without even the implied warranty of
  20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  21 GNU General Public License for more details.
  22
  23 You should have received a copy of the GNU General Public License
  24 along with GNU Emacs; see the file COPYING.  If not, write to
  25 the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
  26 Boston, MA 02110-1301, USA.  */
  27
  28 /*** TABLE OF CONTENTS ***
  29
  30   0. General comments
  31   1. Preamble
  32   2. Emacs' internal format (emacs-utf-8) handlers
  33   3. UTF-8 handlers
  34   4. UTF-16 handlers
  35   5. Charset-base coding systems handlers
  36   6. emacs-mule (old Emacs' internal format) handlers
  37   7. ISO2022 handlers
  38   8. Shift-JIS and BIG5 handlers
  39   9. CCL handlers
  40   10. C library functions
  41   11. Emacs Lisp library functions
  42   12. Postamble
  43
  44 */
  45
  46 /*** 0. General comments ***
  47
  48
  49 CODING SYSTEM
  50
  51   A coding system is an object for an encoding mechanism that contains
  52   information about how to convert byte sequences to character
  53   sequences and vice versa.  When we say "decode", it means converting
  54   a byte sequence of a specific coding system into a character
  55   sequence that is represented by Emacs' internal coding system
  56   `emacs-utf-8', and when we say "encode", it means converting a
  57   character sequence of emacs-utf-8 to a byte sequence of a specific
  58   coding system.
  59
  60   In Emacs Lisp, a coding system is represented by a Lisp symbol.  At
  61   the C level, a coding system is represented by a vector of attributes
  62   stored in the hash table Vcoding_system_hash_table.  The conversion
  63   from a coding system symbol to its attributes vector is done by
  64   looking up Vcoding_system_hash_table by the symbol.
  65
  66   Coding systems are classified into the following types depending on
  67   the encoding mechanism.  Here's a brief description of the types.
  68
  69   o UTF-8
  70
  71   o UTF-16
  72
  73   o Charset-base coding system
  74
  75   A coding system defined by one or more (coded) character sets.
  76   Decoding and encoding are done by a code converter defined for each
  77   character set.
  78
  79   o Old Emacs internal format (emacs-mule)
  80
  81   The coding system adopted by old versions of Emacs (20 and 21).
  82
  83   o ISO2022-base coding system
  84
  85   The most famous coding system for multiple character sets.  X's
  86   Compound Text, various EUCs (Extended Unix Code), and coding systems
  87   used in the Internet communication such as ISO-2022-JP are all
  88   variants of ISO2022.
  89
  90   o SJIS (or Shift-JIS or MS-Kanji-Code)
  91
  92   A coding system to encode character sets: ASCII, JISX0201, and
  93   JISX0208.  Widely used for PC's in Japan.  Details are described in
  94   section 8.
  95
  96   o BIG5
  97
  98   A coding system to encode character sets: ASCII and Big5.  Widely
  99   used for Chinese (mainly in Taiwan and Hong Kong).  Details are
 100   described in section 8.  In this file, when we write "big5" (all
 101   lowercase), we mean the coding system, and when we write "Big5"
 102   (capitalized), we mean the character set.
 103
 104   o CCL
 105
 106   If a user wants to decode/encode text encoded in a coding system
 107   not listed above, he can supply a decoder and an encoder for it in
 108   CCL (Code Conversion Language) programs.  Emacs executes the CCL
 109   program while decoding/encoding.
 110
 111   o Raw-text
 112
 113   A coding system for text containing raw eight-bit data.  Emacs
 114   treats each byte of source text as a character (except for
 115   end-of-line conversion).
 116
 117   o No-conversion
 118
 119   Like raw text, but don't do end-of-line conversion.
 120
 121
 122 END-OF-LINE FORMAT
 123
 124   How text end-of-line is encoded depends on the operating system.  For
 125   instance, Unix's format is just one byte of LF (line-feed) code,
 126   whereas DOS's format is a two-byte sequence of `carriage-return' and
 127   `line-feed' codes.  MacOS's format is usually one byte of
 128   `carriage-return'.
 129
 130   Since text character encoding and end-of-line encoding are
 131   independent, any coding system described above can take any format
 132   of end-of-line (except for no-conversion).
 133
 134 STRUCT CODING_SYSTEM
 135
 136   Before using a coding system for code conversion (i.e. decoding and
 137   encoding), we setup a structure of type `struct coding_system'.
 138   This structure keeps various information about a specific code
 139   conversion (e.g. the location of source and destination data).
 140
 141 */
 142
 143 /* COMMON MACROS */
 144
 145
 146 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 147
 148   These functions check if a byte sequence specified as a source in
 149   CODING conforms to the format of XXX, and update the members of
 150   DETECT_INFO.
 151
 152   Return 1 if the byte sequence conforms to XXX, otherwise return 0.
 153
 154   Below is the template of these functions.  */
 155
 156 #if 0
 157 static int
 158 detect_coding_XXX (coding, detect_info)
 159      struct coding_system *coding;
 160      struct coding_detection_info *detect_info;
 161 {
 162   const unsigned char *src = coding->source;
 163   const unsigned char *src_end = coding->source + coding->src_bytes;
 164   int multibytep = coding->src_multibyte;
 165   int consumed_chars = 0;
 166   int found = 0;
 167   ...;
 168
 169   while (1)
 170     {
 171       /* Get one byte from the source.  If the source is exhausted,
 172          ONE_MORE_BYTE jumps to no_more_source:.  */
 173       ONE_MORE_BYTE (c);
 174
 175       if (! __C_conforms_to_XXX___ (c))
 176         break;
 177       if (__C_strongly_suggests_XXX__ (c))
 178         found = CATEGORY_MASK_XXX;
 179     }
 180   /* The byte sequence is invalid for XXX.  */
 181   detect_info->rejected |= CATEGORY_MASK_XXX;
 182   return 0;
 183
 184  no_more_source:
 185   /* The source was exhausted without meeting an invalid byte.  */
 186   detect_info->found |= found;
 187   return 1;
 188 }
 189 #endif
 190
 191 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 192
 193   These functions decode a byte sequence specified as a source by
 194   CODING.  The resulting multibyte text goes to a place pointed to by
 195   CODING->charbuf, the length of which should not exceed
 196   CODING->charbuf_size.
 197
 198   These functions set the information of original and decoded texts in
 199   CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
 200   They also set CODING->result to one of CODING_RESULT_XXX indicating
 201   how the decoding is finished.
 202
 203   Below is the template of these functions.  */
 204
 205 #if 0
 206 static void
 207 decode_coding_XXXX (coding)
 208      struct coding_system *coding;
 209 {
 210   const unsigned char *src = coding->source + coding->consumed;
 211   const unsigned char *src_end = coding->source + coding->src_bytes;
 212   /* SRC_BASE remembers the start position in source in each loop.
 213      The loop will be exited when there's not enough source code, or
 214      when there's no room in CHARBUF for a decoded character.  */
 215   const unsigned char *src_base;
 216   /* A buffer to produce decoded characters.  */
 217   int *charbuf = coding->charbuf + coding->charbuf_used;
 218   int *charbuf_end = coding->charbuf + coding->charbuf_size;
 219   int multibytep = coding->src_multibyte;
 220
 221   while (1)
 222     {
 223       src_base = src;
 224       if (charbuf >= charbuf_end)
 225         /* No more room to produce a decoded character.  */
 226         break;
 227       ONE_MORE_BYTE (c);
 228       /* Decode it. */
 229     }
 230
 231  no_more_source:
 232   if (src_base < src_end
 233       && coding->mode & CODING_MODE_LAST_BLOCK)
 234     /* If the source ends with a partial byte sequence of a character,
 235        treat the remaining bytes as eight-bit raw data.  */
 236     while (src_base < src_end && charbuf < charbuf_end)
 237       *charbuf++ = *src_base++;
 238   /* Remember how many bytes and characters we consumed.  If the
 239      source is multibyte, the bytes and chars are not identical.  */
 240   coding->consumed = coding->consumed_char = src_base - coding->source;
 241   /* Remember how many characters we produced.  */
 242   coding->charbuf_used = charbuf - coding->charbuf;
 243 }
 244 #endif
 245
 246 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 247
 248   These functions encode SRC_BYTES length text at SOURCE of Emacs'
 249   internal multibyte format by CODING.  The resulting byte sequence
 250   goes to a place pointed to by DESTINATION, the length of which
 251   should not exceed DST_BYTES.
 252
 253   These functions set the information of original and encoded texts in
 254   the members produced, produced_char, consumed, and consumed_char of
 255   the structure *CODING.  They also set the member result to one of
 256   CODING_RESULT_XXX indicating how the encoding finished.
 257
 258   DST_BYTES zero means that the source area and destination area
 259   overlap, which means that we can produce encoded text only until it
 260   reaches the head of the not-yet-encoded source text.
 261
 262   Below is a template of these functions.  */
 263 #if 0
 264 static void
 265 encode_coding_XXX (coding)
 266      struct coding_system *coding;
 267 {
 268   int multibytep = coding->dst_multibyte;
 269   int *charbuf = coding->charbuf;
 270   int *charbuf_end = coding->charbuf + coding->charbuf_used;
 271   unsigned char *dst = coding->destination + coding->produced;
 272   unsigned char *dst_end = coding->destination + coding->dst_bytes;
 273   unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
 274   int produced_chars = 0;
 275
 276   for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
 277     {
 278       int c = *charbuf;
 279       /* Encode C into DST, and increment DST and PRODUCED_CHARS.  */
 280     }
 281  label_no_more_destination:
 282   /* How many chars and bytes we produced.  */
 283   coding->produced_char += produced_chars;
 284   coding->produced = dst - coding->destination;
 285 }
 286 #endif
 287
 288 \f
 289 /*** 1. Preamble ***/
 290
 291 #include <config.h>
 292 #include <stdio.h>
 293
 294 #include "lisp.h"
 295 #include "buffer.h"
 296 #include "character.h"
 297 #include "charset.h"
 298 #include "ccl.h"
 299 #include "composite.h"
 300 #include "coding.h"
 301 #include "window.h"
 302 /* Presumably the hash table mapping a coding system symbol to its attribute vector -- TODO confirm.  */
 303 Lisp_Object Vcoding_system_hash_table;
 304
 305 Lisp_Object Qcoding_system, Qcoding_aliases, Qeol_type;
 306 Lisp_Object Qunix, Qdos;
 307 extern Lisp_Object Qmac;        /* frame.c */
 308 Lisp_Object Qbuffer_file_coding_system;
 309 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
 310 Lisp_Object Qdefault_char;
 311 Lisp_Object Qno_conversion, Qundecided;
 312 Lisp_Object Qcharset, Qiso_2022, Qutf_8, Qutf_16, Qshift_jis, Qbig5;
 313 Lisp_Object Qbig, Qlittle;
 314 Lisp_Object Qcoding_system_history;
 315 Lisp_Object Qvalid_codes;
 316 Lisp_Object QCcategory, QCmnemonic, QCdefalut_char;  /* sic: "defalut" */
 317 Lisp_Object QCdecode_translation_table, QCencode_translation_table;
 318 Lisp_Object QCpost_read_conversion, QCpre_write_conversion;
 319 Lisp_Object QCascii_compatible_p;
 320
 321 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
 322 Lisp_Object Qcall_process, Qcall_process_region;
 323 Lisp_Object Qstart_process, Qopen_network_stream;
 324 Lisp_Object Qtarget_idx;
 325
 326 Lisp_Object Qinsufficient_source, Qinconsistent_eol, Qinvalid_source;
 327 Lisp_Object Qinterrupted, Qinsufficient_memory;
 328
 329 /* If a symbol has this property, evaluate the value to define the
 330    symbol as a coding system.  */
 331 static Lisp_Object Qcoding_system_define_form;
 332
 333 int coding_system_require_warning;
 334
 335 Lisp_Object Vselect_safe_coding_system_function;
 336
 337 /* Mnemonic string for each format of end-of-line.  */
 338 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
 339 /* Mnemonic string to indicate that the format of end-of-line is not
 340    yet decided.  */
 341 Lisp_Object eol_mnemonic_undecided;
 342
 343 #ifdef emacs
 344
 345 Lisp_Object Vcoding_system_list, Vcoding_system_alist;
 346
 347 Lisp_Object Qcoding_system_p, Qcoding_system_error;
 348
 349 /* Coding systems emacs-mule and raw-text are for converting only the
 350    end-of-line format.  */
 351 Lisp_Object Qemacs_mule, Qraw_text;
 352 Lisp_Object Qutf_8_emacs;
 353
 354 /* Coding systems are passed between Emacs Lisp programs and C internal
 355    routines by the following three variables.  */
 356 /* Coding system for reading files and receiving data from a process.  */
 357 Lisp_Object Vcoding_system_for_read;
 358 /* Coding system for writing files and sending data to a process.  */
 359 Lisp_Object Vcoding_system_for_write;
 360 /* Coding system actually used in the latest I/O.  */
 361 Lisp_Object Vlast_coding_system_used;
 362 /* Set to non-nil when an error is detected during code conversion.  */
 363 Lisp_Object Vlast_code_conversion_error;
 364 /* A vector of length 256 which contains information about special
 365    Latin codes (especially for dealing with Microsoft codes).  */
 366 Lisp_Object Vlatin_extra_code_table;
 367
 368 /* Flag to inhibit code conversion of end-of-line format.  */
 369 int inhibit_eol_conversion;
 370
 371 /* Flag to inhibit ISO2022 escape sequence detection.  */
 372 int inhibit_iso_escape_detection;
 373
 374 /* Flag to make buffer-file-coding-system inherit from process-coding.  */
 375 int inherit_process_coding_system;
 376
 377 /* Coding system to be used to encode text for terminal display.  */
 378 struct coding_system terminal_coding;
 379
 380 /* Coding system to be used to encode text for terminal display when
 381    terminal coding system is nil.  */
 382 struct coding_system safe_terminal_coding;
 383
 384 /* Coding system of what is sent from terminal keyboard.  */
 385 struct coding_system keyboard_coding;
 386
 387 Lisp_Object Vfile_coding_system_alist;
 388 Lisp_Object Vprocess_coding_system_alist;
 389 Lisp_Object Vnetwork_coding_system_alist;
 390
 391 Lisp_Object Vlocale_coding_system;
 392
 393 #endif /* emacs */
 394
 395 /* Flag to tell if we look up a translation table on character code
 396    conversion.  */
 397 Lisp_Object Venable_character_translation;
 398 /* Standard translation table to look up on decoding (reading).  */
 399 Lisp_Object Vstandard_translation_table_for_decode;
 400 /* Standard translation table to look up on encoding (writing).  */
 401 Lisp_Object Vstandard_translation_table_for_encode;
 402
 403 Lisp_Object Qtranslation_table;
 404 Lisp_Object Qtranslation_table_id;
 405 Lisp_Object Qtranslation_table_for_decode;
 406 Lisp_Object Qtranslation_table_for_encode;
 407
 408 /* Alist of charsets vs. revision numbers.  */
 409 static Lisp_Object Vcharset_revision_table;
 410
 411 /* Default coding systems used for process I/O.  */
 412 Lisp_Object Vdefault_process_coding_system;
 413
 414 /* Char table for translating Quail and self-inserting input.  */
 415 Lisp_Object Vtranslation_table_for_input;
 416
 417 /* Two special coding systems.  */
 418 Lisp_Object Vsjis_coding_system;
 419 Lisp_Object Vbig5_coding_system;
 420
 421 /* ISO2022 section */
 422 /* Element REG of CODING's `coding_attr_iso_initial' attribute, passed through XINT.  */
 423 #define CODING_ISO_INITIAL(coding, reg)                 \
 424   (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id),    \
 425                      coding_attr_iso_initial),          \
 426                reg)))
 427
 428 /* (CODING)->safe_charsets[CHARSET_ID], or -1 if CHARSET_ID > (CODING)->max_charset_id.  */
 429 #define CODING_ISO_REQUEST(coding, charset_id)  \
 430   (((charset_id) <= (coding)->max_charset_id    \
 431     ? (coding)->safe_charsets[charset_id]       \
 432     : -1))
 433
 434 /* Accessors for the ISO2022 conversion state kept in (CODING)->spec.iso_2022.  */
 435 #define CODING_ISO_FLAGS(coding)        \
 436   ((coding)->spec.iso_2022.flags)
 437 #define CODING_ISO_DESIGNATION(coding, reg)     \
 438   ((coding)->spec.iso_2022.current_designation[reg])
 439 #define CODING_ISO_INVOCATION(coding, plane)    \
 440   ((coding)->spec.iso_2022.current_invocation[plane])
 441 #define CODING_ISO_SINGLE_SHIFTING(coding)      \
 442   ((coding)->spec.iso_2022.single_shifting)
 443 #define CODING_ISO_BOL(coding)  \
 444   ((coding)->spec.iso_2022.bol)
 445 #define CODING_ISO_INVOKED_CHARSET(coding, plane)       \
 446   CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
 447
 448 /* Control characters of ISO2022.  */
 449                         /* code */      /* function */
 450 #define ISO_CODE_LF     0x0A            /* line-feed */
 451 #define ISO_CODE_CR     0x0D            /* carriage-return */
 452 #define ISO_CODE_SO     0x0E            /* shift-out */
 453 #define ISO_CODE_SI     0x0F            /* shift-in */
 454 #define ISO_CODE_SS2_7  0x19            /* single-shift-2 for 7-bit code */
 455 #define ISO_CODE_ESC    0x1B            /* escape */
 456 #define ISO_CODE_SS2    0x8E            /* single-shift-2 */
 457 #define ISO_CODE_SS3    0x8F            /* single-shift-3 */
 458 #define ISO_CODE_CSI    0x9B            /* control-sequence-introducer */
 459
 460 /* Every 1-byte code of ISO2022 is classified into one of the
 461    following classes.  */
 462 enum iso_code_class_type
 463   {
 464     ISO_control_0,              /* Control codes in the range
 465                                    0x00..0x1F and 0x7F, except for the
 466                                    following 4 codes.  */
 467     ISO_shift_out,              /* ISO_CODE_SO (0x0E) */
 468     ISO_shift_in,               /* ISO_CODE_SI (0x0F) */
 469     ISO_single_shift_2_7,       /* ISO_CODE_SS2_7 (0x19) */
 470     ISO_escape,                 /* ISO_CODE_ESC (0x1B) */
 471     ISO_control_1,              /* Control codes in the range
 472                                    0x80..0x9F, except for the
 473                                    following 3 codes.  */
 474     ISO_single_shift_2,         /* ISO_CODE_SS2 (0x8E) */
 475     ISO_single_shift_3,         /* ISO_CODE_SS3 (0x8F) */
 476     ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
 477     ISO_0x20_or_0x7F,           /* Codes of the values 0x20 or 0x7F.  */
 478     ISO_graphic_plane_0,        /* Graphic codes in the range 0x21..0x7E.  */
 479     ISO_0xA0_or_0xFF,           /* Codes of the values 0xA0 or 0xFF.  */
 480     ISO_graphic_plane_1         /* Graphic codes in the range 0xA1..0xFE.  */
 481   };
 482
 483 /** Each macro CODING_ISO_FLAG_XXX defines a flag bit of the
 484     `iso-flags' attribute of an iso2022 coding system.  */
 485
 486 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
 487    instead of the correct short-form sequence (e.g. ESC $ A).  */
 488 #define CODING_ISO_FLAG_LONG_FORM       0x0001
 489
 490 /* If set, reset graphic planes and registers at end-of-line to the
 491    initial state.  */
 492 #define CODING_ISO_FLAG_RESET_AT_EOL    0x0002
 493
 494 /* If set, reset graphic planes and registers before any control
 495    characters to the initial state.  */
 496 #define CODING_ISO_FLAG_RESET_AT_CNTL   0x0004
 497
 498 /* If set, encode by 7-bit environment.  */
 499 #define CODING_ISO_FLAG_SEVEN_BITS      0x0008
 500
 501 /* If set, use locking-shift function.  */
 502 #define CODING_ISO_FLAG_LOCKING_SHIFT   0x0010
 503
 504 /* If set, use single-shift function.  Overrides
 505    CODING_ISO_FLAG_LOCKING_SHIFT.  */
 506 #define CODING_ISO_FLAG_SINGLE_SHIFT    0x0020
 507
 508 /* If set, use designation escape sequence.  */
 509 #define CODING_ISO_FLAG_DESIGNATION     0x0040
 510
 511 /* If set, produce revision number sequence.  */
 512 #define CODING_ISO_FLAG_REVISION        0x0080
 513
 514 /* If set, produce ISO6429's direction specifying sequence.  */
 515 #define CODING_ISO_FLAG_DIRECTION       0x0100
 516
 517 /* If set, assume designation states are reset at beginning of line on
 518    output.  */
 519 #define CODING_ISO_FLAG_INIT_AT_BOL     0x0200
 520
 521 /* If set, designation sequence should be placed at beginning of line
 522    on output.  */
 523 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
 524
 525 /* If set, do not encode unsafe characters on output.  */
 526 #define CODING_ISO_FLAG_SAFE            0x0800
 527
 528 /* If set, extra latin codes (128..159) are accepted as a valid code
 529    on input.  */
 530 #define CODING_ISO_FLAG_LATIN_EXTRA     0x1000
 531 /* NOTE(review): the flags below carry no description here; confirm their meaning where they are tested.  */
 532 #define CODING_ISO_FLAG_COMPOSITION     0x2000
 533
 534 #define CODING_ISO_FLAG_EUC_TW_SHIFT    0x4000
 535
 536 #define CODING_ISO_FLAG_USE_ROMAN       0x8000
 537
 538 #define CODING_ISO_FLAG_USE_OLDJIS      0x10000
 539 /* NOTE(review): 0x100000 skips the bits 0x20000..0x80000 -- confirm this is intended.  */
 540 #define CODING_ISO_FLAG_FULL_SUPPORT    0x100000
 541
 542 /* A character to be produced on output if encoding of the original
 543    character is prohibited by CODING_ISO_FLAG_SAFE.  */
 544 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION  '?'
 545
 546
 547 /* UTF-16 section: accessors for the members of (CODING)->spec.utf_16.  */
 548 #define CODING_UTF_16_BOM(coding)       \
 549   ((coding)->spec.utf_16.bom)
 550
 551 #define CODING_UTF_16_ENDIAN(coding)    \
 552   ((coding)->spec.utf_16.endian)
 553
 554 #define CODING_UTF_16_SURROGATE(coding) \
 555   ((coding)->spec.utf_16.surrogate)
 556
 557
 558 /* CCL section: fetch the CCL decoder, encoder and valids attributes of CODING (valids via SDATA).  */
 559 #define CODING_CCL_DECODER(coding)      \
 560   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
 561 #define CODING_CCL_ENCODER(coding)      \
 562   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
 563 #define CODING_CCL_VALIDS(coding)                                          \
 564   (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
 565
 566 /* Index for each coding category in `coding_categories'.  */
 567 /* Each value is also used as a bit position by the CATEGORY_MASK_XXX macros (1 << value).  */
 568 enum coding_category
 569   {
 570     coding_category_iso_7,
 571     coding_category_iso_7_tight,
 572     coding_category_iso_8_1,
 573     coding_category_iso_8_2,
 574     coding_category_iso_7_else,
 575     coding_category_iso_8_else,
 576     coding_category_utf_8,
 577     coding_category_utf_16_auto,
 578     coding_category_utf_16_be,
 579     coding_category_utf_16_le,
 580     coding_category_utf_16_be_nosig,
 581     coding_category_utf_16_le_nosig,
 582     coding_category_charset,
 583     coding_category_sjis,
 584     coding_category_big5,
 585     coding_category_ccl,
 586     coding_category_emacs_mule,
 587     /* All above are targets of code detection.  */
 588     coding_category_raw_text,
 589     coding_category_undecided,
 590     coding_category_max
 591   };
 592
 593 /* Definitions of flag bits used in detect_coding_XXXX.  */
 594 #define CATEGORY_MASK_ISO_7             (1 << coding_category_iso_7)
 595 #define CATEGORY_MASK_ISO_7_TIGHT       (1 << coding_category_iso_7_tight)
 596 #define CATEGORY_MASK_ISO_8_1           (1 << coding_category_iso_8_1)
 597 #define CATEGORY_MASK_ISO_8_2           (1 << coding_category_iso_8_2)
 598 #define CATEGORY_MASK_ISO_7_ELSE        (1 << coding_category_iso_7_else)
 599 #define CATEGORY_MASK_ISO_8_ELSE        (1 << coding_category_iso_8_else)
 600 #define CATEGORY_MASK_UTF_8             (1 << coding_category_utf_8)
 601 #define CATEGORY_MASK_UTF_16_AUTO       (1 << coding_category_utf_16_auto)
 602 #define CATEGORY_MASK_UTF_16_BE         (1 << coding_category_utf_16_be)
 603 #define CATEGORY_MASK_UTF_16_LE         (1 << coding_category_utf_16_le)
 604 #define CATEGORY_MASK_UTF_16_BE_NOSIG   (1 << coding_category_utf_16_be_nosig)
 605 #define CATEGORY_MASK_UTF_16_LE_NOSIG   (1 << coding_category_utf_16_le_nosig)
 606 #define CATEGORY_MASK_CHARSET           (1 << coding_category_charset)
 607 #define CATEGORY_MASK_SJIS              (1 << coding_category_sjis)
 608 #define CATEGORY_MASK_BIG5              (1 << coding_category_big5)
 609 #define CATEGORY_MASK_CCL               (1 << coding_category_ccl)
 610 #define CATEGORY_MASK_EMACS_MULE        (1 << coding_category_emacs_mule)
 611 #define CATEGORY_MASK_RAW_TEXT          (1 << coding_category_raw_text)
 612
 613 /* This value is returned if detect_coding_mask () finds nothing other
 614    than ASCII characters.  */
 615 #define CATEGORY_MASK_ANY               \
 616   (CATEGORY_MASK_ISO_7                  \
 617    | CATEGORY_MASK_ISO_7_TIGHT          \
 618    | CATEGORY_MASK_ISO_8_1              \
 619    | CATEGORY_MASK_ISO_8_2              \
 620    | CATEGORY_MASK_ISO_7_ELSE           \
 621    | CATEGORY_MASK_ISO_8_ELSE           \
 622    | CATEGORY_MASK_UTF_8                \
 623    | CATEGORY_MASK_UTF_16_BE            \
 624    | CATEGORY_MASK_UTF_16_LE            \
 625    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 626    | CATEGORY_MASK_UTF_16_LE_NOSIG      \
 627    | CATEGORY_MASK_CHARSET              \
 628    | CATEGORY_MASK_SJIS                 \
 629    | CATEGORY_MASK_BIG5                 \
 630    | CATEGORY_MASK_CCL                  \
 631    | CATEGORY_MASK_EMACS_MULE)
 632
 633 /* NOTE(review): CATEGORY_MASK_UTF_16_AUTO is in neither CATEGORY_MASK_ANY nor CATEGORY_MASK_UTF_16 -- confirm.  */
 634 #define CATEGORY_MASK_ISO_7BIT \
 635   (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
 636
 637 #define CATEGORY_MASK_ISO_8BIT \
 638   (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
 639
 640 #define CATEGORY_MASK_ISO_ELSE \
 641   (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
 642
 643 #define CATEGORY_MASK_ISO_ESCAPE        \
 644   (CATEGORY_MASK_ISO_7                  \
 645    | CATEGORY_MASK_ISO_7_TIGHT          \
 646    | CATEGORY_MASK_ISO_7_ELSE           \
 647    | CATEGORY_MASK_ISO_8_ELSE)
 648
 649 #define CATEGORY_MASK_ISO       \
 650   (  CATEGORY_MASK_ISO_7BIT     \
 651      | CATEGORY_MASK_ISO_8BIT   \
 652      | CATEGORY_MASK_ISO_ELSE)
 653
 654 #define CATEGORY_MASK_UTF_16            \
 655   (CATEGORY_MASK_UTF_16_BE              \
 656    | CATEGORY_MASK_UTF_16_LE            \
 657    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 658    | CATEGORY_MASK_UTF_16_LE_NOSIG)
 659
 660
 661 /* List of symbols `coding-category-xxx' ordered by priority.  This
 662    variable is exposed to Emacs Lisp.  */
 663 static Lisp_Object Vcoding_category_list;
 664
 665 /* Table of coding categories (Lisp symbols).  This variable is for
 666    internal use only.  */
 667 static Lisp_Object Vcoding_category_table;
 668
 669 /* Table of coding-categories ordered by priority.  */
 670 static enum coding_category coding_priorities[coding_category_max];
 671
 672 /* Nth element is a coding context for the coding system bound to the
 673    Nth coding category.  */
 674 static struct coding_system coding_categories[coding_category_max];
 675
 676 /*** Commonly used macros and functions ***/
 677 /* Fallback min/max; the selected argument is evaluated twice, so avoid side effects.  */
 678 #ifndef min
 679 #define min(a, b) ((a) < (b) ? (a) : (b))
 680 #endif
 681 #ifndef max
 682 #define max(a, b) ((a) > (b) ? (a) : (b))
 683 #endif
 684 /* Set ATTRS to CODING_ID_ATTRS of CODING's id and CHARSET_LIST to CODING_ATTR_CHARSET_LIST (ATTRS).  */
 685 #define CODING_GET_INFO(coding, attrs, charset_list)    \
 686   do {                                                  \
 687     (attrs) = CODING_ID_ATTRS ((coding)->id);           \
 688     (charset_list) = CODING_ATTR_CHARSET_LIST (attrs);  \
 689   } while (0)
 690
 691
 692 /* Safely get one byte from the source text pointed to by SRC, which
 693    ends at SRC_END, and set C to it.  At SRC_END, jump to
 694    `no_more_source', first recording CODING_RESULT_INSUFFICIENT_SRC if
 695    SRC_BASE < SRC.  With MULTIBYTEP, a 0xC0/0xC1 lead byte is merged with
 696    the next byte; any other byte >= 0x80 sets C to the negated result of
 697    string_char and records CODING_RESULT_INVALID_SRC.  Caller provides:
 698         src, src_end, src_base, multibytep, consumed_chars, coding */
 699
 700 #define ONE_MORE_BYTE(c)                                \
 701   do {                                                  \
 702     if (src == src_end)                                 \
 703       {                                                 \
 704         if (src_base < src)                             \
 705           record_conversion_result                      \
 706             (coding, CODING_RESULT_INSUFFICIENT_SRC);   \
 707         goto no_more_source;                            \
 708       }                                                 \
 709     c = *src++;                                         \
 710     if (multibytep && (c & 0x80))                       \
 711       {                                                 \
 712         if ((c & 0xFE) == 0xC0)                         \
 713           c = ((c & 1) << 6) | *src++;                  \
 714         else                                            \
 715           {                                             \
 716             src--;                                      \
 717             c = - string_char (src, &src, NULL);        \
 718             record_conversion_result                    \
 719               (coding, CODING_RESULT_INVALID_SRC);      \
 720           }                                             \
 721       }                                                 \
 722     consumed_chars++;                                   \
 723   } while (0)
 724
 725 /* Like ONE_MORE_BYTE, but without the SRC == SRC_END test and the jump to `no_more_source'.  */
 726 #define ONE_MORE_BYTE_NO_CHECK(c)                       \
 727   do {                                                  \
 728     c = *src++;                                         \
 729     if (multibytep && (c & 0x80))                       \
 730       {                                                 \
 731         if ((c & 0xFE) == 0xC0)                         \
 732           c = ((c & 1) << 6) | *src++;                  \
 733         else                                            \
 734           {                                             \
 735             src--;                                      \
 736             c = - string_char (src, &src, NULL);        \
 737             record_conversion_result                    \
 738               (coding, CODING_RESULT_INVALID_SRC);      \
 739           }                                             \
 740       }                                                 \
 741     consumed_chars++;                                   \
 742   } while (0)
 743
 744
 745 /* Store a byte C in the place pointed to by DST, advance DST to the
 746    next free position, and increment PRODUCED_CHARS.  The caller should
 747    ensure that C is 0..127, and declare and set the variables `dst'
 748    and `produced_chars' appropriately in advance.
 749 */
 750
 751
 752 #define EMIT_ONE_ASCII_BYTE(c)  \
 753   do {                          \
 754     produced_chars++;           \
 755     *dst++ = (c);               \
 756   } while (0)
 757
 758
 759 /* Like EMIT_ONE_ASCII_BYTE, but store two bytes, C1 and C2.  */
 760
 761 #define EMIT_TWO_ASCII_BYTES(c1, c2)    \
 762   do {                                  \
 763     produced_chars += 2;                \
 764     *dst++ = (c1), *dst++ = (c2);       \
 765   } while (0)
 766
 767
 768 /* Store a byte C in the place pointed to by DST, advance DST to the
 769    next free position, and increment PRODUCED_CHARS.  If MULTIBYTEP is
 770    nonzero, store it in multibyte form (C >= 0x80 goes through
 771    BYTE8_TO_CHAR first).  The caller should declare and set the
 772    variables `dst', `multibytep' and `produced_chars' in advance.  */
 773
 774 #define EMIT_ONE_BYTE(c)                \
 775   do {                                  \
 776     produced_chars++;                   \
 777     if (multibytep)                     \
 778       {                                 \
 779         int ch = (c);                   \
 780         if (ch >= 0x80)                 \
 781           ch = BYTE8_TO_CHAR (ch);      \
 782         CHAR_STRING_ADVANCE (ch, dst);  \
 783       }                                 \
 784     else                                \
 785       *dst++ = (c);                     \
 786   } while (0)
 787
 788
 789 /* Like EMIT_ONE_BYTE, but emit two bytes, C1 followed by C2, and
        increment PRODUCED_CHARS by 2.  C1 is stored before C2 is
        evaluated.  */
 790
 791 #define EMIT_TWO_BYTES(c1, c2)          \
 792   do {                                  \
 793     produced_chars += 2;                \
 794     if (multibytep)                     \
 795       {                                 \
 796         int ch;                         \
 797                                         \
 798         ch = (c1);                      \
 799         if (ch >= 0x80)                 \
 800           ch = BYTE8_TO_CHAR (ch);      \
 801         CHAR_STRING_ADVANCE (ch, dst);  \
 802         ch = (c2);                      \
 803         if (ch >= 0x80)                 \
 804           ch = BYTE8_TO_CHAR (ch);      \
 805         CHAR_STRING_ADVANCE (ch, dst);  \
 806       }                                 \
 807     else                                \
 808       {                                 \
 809         *dst++ = (c1);                  \
 810         *dst++ = (c2);                  \
 811       }                                 \
 812   } while (0)
 813
 814 /* Emit the three bytes C1, C2 and C3, in that order: C1 by
        EMIT_ONE_BYTE, then C2 and C3 by EMIT_TWO_BYTES.  The caller must
        provide the variables those macros use.  */
 815 #define EMIT_THREE_BYTES(c1, c2, c3)    \
 816   do {                                  \
 817     EMIT_ONE_BYTE (c1);                 \
 818     EMIT_TWO_BYTES (c2, c3);            \
 819   } while (0)
 820
 821 /* Emit the four bytes C1, C2, C3 and C4, in that order, as two
        successive EMIT_TWO_BYTES.  The caller must provide the variables
        that macro uses.  */
 822 #define EMIT_FOUR_BYTES(c1, c2, c3, c4)         \
 823   do {                                          \
 824     EMIT_TWO_BYTES (c1, c2);                    \
 825     EMIT_TWO_BYTES (c3, c4);                    \
 826   } while (0)
 827
 828
 829 /* Prototypes for static functions.  The first groups declare the
        detector, decoder and encoder for each kind of coding system (no
        detector is declared here for raw-text); the remaining
        declarations are helper routines.  */
 830 static void record_conversion_result P_ ((struct coding_system *coding,
 831                                           enum coding_result_code result));
 832 static int detect_coding_utf_8 P_ ((struct coding_system *,
 833                                     struct coding_detection_info *info));
 834 static void decode_coding_utf_8 P_ ((struct coding_system *));
 835 static int encode_coding_utf_8 P_ ((struct coding_system *));
 836
 837 static int detect_coding_utf_16 P_ ((struct coding_system *,
 838                                      struct coding_detection_info *info));
 839 static void decode_coding_utf_16 P_ ((struct coding_system *));
 840 static int encode_coding_utf_16 P_ ((struct coding_system *));
 841
 842 static int detect_coding_iso_2022 P_ ((struct coding_system *,
 843                                        struct coding_detection_info *info));
 844 static void decode_coding_iso_2022 P_ ((struct coding_system *));
 845 static int encode_coding_iso_2022 P_ ((struct coding_system *));
 846
 847 static int detect_coding_emacs_mule P_ ((struct coding_system *,
 848                                          struct coding_detection_info *info));
 849 static void decode_coding_emacs_mule P_ ((struct coding_system *));
 850 static int encode_coding_emacs_mule P_ ((struct coding_system *));
 851
 852 static int detect_coding_sjis P_ ((struct coding_system *,
 853                                    struct coding_detection_info *info));
 854 static void decode_coding_sjis P_ ((struct coding_system *));
 855 static int encode_coding_sjis P_ ((struct coding_system *));
 856
 857 static int detect_coding_big5 P_ ((struct coding_system *,
 858                                    struct coding_detection_info *info));
 859 static void decode_coding_big5 P_ ((struct coding_system *));
 860 static int encode_coding_big5 P_ ((struct coding_system *));
 861
 862 static int detect_coding_ccl P_ ((struct coding_system *,
 863                                   struct coding_detection_info *info));
 864 static void decode_coding_ccl P_ ((struct coding_system *));
 865 static int encode_coding_ccl P_ ((struct coding_system *));
 866
 867 static void decode_coding_raw_text P_ ((struct coding_system *));
 868 static int encode_coding_raw_text P_ ((struct coding_system *));
 869
 870 static void coding_set_source P_ ((struct coding_system *));
 871 static void coding_set_destination P_ ((struct coding_system *));
 872 static void coding_alloc_by_realloc P_ ((struct coding_system *, EMACS_INT));
 873 static void coding_alloc_by_making_gap P_ ((struct coding_system *,
 874                                             EMACS_INT));
 875 static unsigned char *alloc_destination P_ ((struct coding_system *,
 876                                              EMACS_INT, unsigned char *));
 877 static void setup_iso_safe_charsets P_ ((Lisp_Object));
 878 static unsigned char *encode_designation_at_bol P_ ((struct coding_system *,
 879                                                      int *, int *,
 880                                                      unsigned char *));
 881 static int detect_eol P_ ((const unsigned char *,
 882                            EMACS_INT, enum coding_category));
 883 static Lisp_Object adjust_coding_eol_type P_ ((struct coding_system *, int));
 884 static void decode_eol P_ ((struct coding_system *));
 885 static Lisp_Object get_translation_table P_ ((Lisp_Object, int, int *));
 886 static Lisp_Object get_translation P_ ((Lisp_Object, int *, int *,
 887                                         int, int *, int *));
 888 static int produce_chars P_ ((struct coding_system *, Lisp_Object, int));
 889 static INLINE void produce_composition P_ ((struct coding_system *, int *,
 890                                             EMACS_INT));
 891 static INLINE void produce_charset P_ ((struct coding_system *, int *,
 892                                         EMACS_INT));
 893 static void produce_annotation P_ ((struct coding_system *, EMACS_INT));
 894 static int decode_coding P_ ((struct coding_system *));
 895 static INLINE int *handle_composition_annotation P_ ((EMACS_INT, EMACS_INT,
 896                                                       struct coding_system *,
 897                                                       int *, EMACS_INT *));
 898 static INLINE int *handle_charset_annotation P_ ((EMACS_INT, EMACS_INT,
 899                                                   struct coding_system *,
 900                                                   int *, EMACS_INT *));
 901 static void consume_chars P_ ((struct coding_system *, Lisp_Object, int));
 902 static int encode_coding P_ ((struct coding_system *));
 903 static Lisp_Object make_conversion_work_buffer P_ ((int));
 904 static Lisp_Object code_conversion_restore P_ ((Lisp_Object));
 905 static INLINE int char_encodable_p P_ ((int, Lisp_Object));
 906 static Lisp_Object make_subsidiaries P_ ((Lisp_Object));
 907
     /* Record RESULT as the outcome of the current conversion step in
        CODING->result.  For each error code handled below, also store
        the corresponding symbol in Vlast_code_conversion_error; a code
        that is not recognized is recorded as the symbol named
        "Unknown error".  CODING_RESULT_SUCCESS is not an error, so it
        leaves Vlast_code_conversion_error untouched instead of being
        reported as an unknown error.  */
 908 static void
 909 record_conversion_result (struct coding_system *coding,
 910                           enum coding_result_code result)
 911 {
 912   coding->result = result;
 913   switch (result)
 914     {
 915     case CODING_RESULT_INSUFFICIENT_SRC:
 916       Vlast_code_conversion_error = Qinsufficient_source;
 917       break;
 918     case CODING_RESULT_INCONSISTENT_EOL:
 919       Vlast_code_conversion_error = Qinconsistent_eol;
 920       break;
 921     case CODING_RESULT_INVALID_SRC:
 922       Vlast_code_conversion_error = Qinvalid_source;
 923       break;
 924     case CODING_RESULT_INTERRUPT:
 925       Vlast_code_conversion_error = Qinterrupted;
 926       break;
 927     case CODING_RESULT_INSUFFICIENT_MEM:
 928       Vlast_code_conversion_error = Qinsufficient_memory;
 929       break;
         case CODING_RESULT_SUCCESS:
           /* Success is reported routinely (e.g. after every destination
              reallocation); it must not overwrite the last real error.  */
           break;
 930     default:
 931       Vlast_code_conversion_error = intern ("Unknown error");
 932     }
 933 }
 934
     /* Decode CODE of CHARSET into the character C, keeping the source
        pointers of the caller valid.  `charset_map_loaded' is cleared
        before calling DECODE_CHAR; if it is nonzero afterwards,
        CODING->source is recomputed with coding_set_source and SRC,
        SRC_BASE and SRC_END are shifted by the amount CODING->source
        moved.  (Presumably loading a charset map can relocate the source
        text -- confirm at the definition of `charset_map_loaded'.)  */
 935 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
 936   do {                                                                       \
 937     charset_map_loaded = 0;                                                  \
 938     c = DECODE_CHAR (charset, code);                                         \
 939     if (charset_map_loaded)                                                  \
 940       {                                                                      \
 941         const unsigned char *orig = coding->source;                          \
 942         EMACS_INT offset;                                                    \
 943                                                                              \
 944         coding_set_source (coding);                                          \
 945         offset = coding->source - orig;                                      \
 946         src += offset;                                                       \
 947         src_base += offset;                                                  \
 948         src_end += offset;                                                   \
 949       }                                                                      \
 950   } while (0)
 951
 952 /* Make sure there is room for BYTES more bytes at DST.  If DST + BYTES
        reaches DST_END, enlarge the destination with alloc_destination by
        BYTES plus the number of elements left between CHARBUF and
        CHARBUF_END, then refresh DST and DST_END from CODING.  The caller
        must have `coding', `dst', `dst_end', `charbuf' and `charbuf_end'
        in scope.  BYTES is evaluated more than once.  */
 953 #define ASSURE_DESTINATION(bytes)                               \
 954   do {                                                          \
 955     if (dst + (bytes) >= dst_end)                               \
 956       {                                                         \
 957         int more_bytes = charbuf_end - charbuf + (bytes);       \
 958                                                                 \
 959         dst = alloc_destination (coding, more_bytes, dst);      \
 960         dst_end = coding->destination + coding->dst_bytes;      \
 961       }                                                         \
 962   } while (0)
 963
 964
 965 /* Recompute CODING->source from CODING->src_object and
        CODING->src_pos_byte.  For a string source it is the string data
        plus the byte position.  For a buffer source it is the address of
        that byte position in the buffer or, when CODING->src_pos is
        negative, an offset from the end of the buffer's gap.  Any other
        kind of source is left alone.  */
 966 static void
 967 coding_set_source (coding)
 968      struct coding_system *coding;
 969 {
 970   Lisp_Object object = coding->src_object;
 971
 972   if (STRINGP (object))
 973     {
 974       coding->source = SDATA (object) + coding->src_pos_byte;
 975       return;
 976     }
 977   /* A source that is neither a string nor a buffer is a C string, which
 978      is never relocated automatically; there is nothing to update.  */
 979   if (! BUFFERP (object))
 980     return;
 981   if (coding->src_pos >= 0)
 982     coding->source = BUF_BYTE_ADDRESS (XBUFFER (object),
 983                                        coding->src_pos_byte);
 984   else
 985     coding->source = (BUF_GAP_END_ADDR (XBUFFER (object))
 986                       + coding->src_pos_byte);
 987 }
 988
     /* Recompute CODING->destination and CODING->dst_bytes when the
        destination is a buffer.  When CODING->src_pos is negative the
        addresses are taken from the current buffer and the bytes of
        source not yet consumed are excluded from the available room;
        otherwise they are taken from the destination buffer itself.  */
 989 static void
 990 coding_set_destination (coding)
 991      struct coding_system *coding;
 992 {
 993   struct buffer *dst_buf;
 994
 995   /* A destination that is not a buffer is a C string, which is never
 996      relocated automatically; there is nothing to update.  */
 997   if (! BUFFERP (coding->dst_object))
 998     return;
 999
1000   if (coding->src_pos >= 0)
1001     {
1002       /* We are sure that coding->dst_pos_byte is before the gap of
1003          the buffer.  */
1004       dst_buf = XBUFFER (coding->dst_object);
1005
1006       coding->destination = BUF_BEG_ADDR (dst_buf) + coding->dst_pos_byte - 1;
1007       coding->dst_bytes = BUF_GAP_END_ADDR (dst_buf) - coding->destination;
1008     }
1009   else
1010     {
1011       EMACS_INT unread = coding->src_bytes - coding->consumed;
1012
1013       coding->destination = BEG_ADDR + coding->dst_pos_byte - 1;
1014       coding->dst_bytes = GAP_END_ADDR - unread - coding->destination;
1015     }
1016 }
1017
1018 /* Enlarge the destination area of CODING by BYTES bytes with
        xrealloc, and add BYTES to CODING->dst_bytes.  CODING->destination
        may change as a result.  */
1019 static void
1020 coding_alloc_by_realloc (coding, bytes)
1021      struct coding_system *coding;
1022      EMACS_INT bytes;
1023 {
1024   EMACS_INT enlarged = coding->dst_bytes + bytes;
1025   coding->destination = (unsigned char *) xrealloc (coding->destination, enlarged);
1026   coding->dst_bytes = enlarged;
1027 }
1028
     /* Make room for BYTES more bytes in the destination buffer of CODING
        by calling make_gap.  If the destination is not the same buffer as
        the source, make_gap is simply run with the destination buffer
        temporarily made current.  If source and destination are the same
        buffer, the source bytes not yet consumed are counted as buffer
        text (and not as gap) around the call, and the accounting is put
        back afterwards.  */
1029 static void
1030 coding_alloc_by_making_gap (coding, bytes)
1031      struct coding_system *coding;
1032      EMACS_INT bytes;
1033 {
1034   if (! BUFFERP (coding->dst_object)
1035       || ! EQ (coding->src_object, coding->dst_object))
1036     {
1037       Lisp_Object saved_buffer = Fcurrent_buffer ();
1038       set_buffer_internal (XBUFFER (coding->dst_object));
1039       make_gap (bytes);
1040       set_buffer_internal (XBUFFER (saved_buffer));
1041     }
1042   else
1043     {
1044       EMACS_INT unread = coding->src_bytes - coding->consumed;
1045
1046       GAP_SIZE -= unread; ZV += unread; Z += unread;
1047       ZV_BYTE += unread; Z_BYTE += unread;
1048       make_gap (bytes);
1049       GAP_SIZE += unread; ZV -= unread; Z -= unread;
1050       ZV_BYTE -= unread; Z_BYTE -= unread;
1051     }
1052 }
1053
1054 /* Enlarge the destination of CODING by NBYTES bytes and return the
        position corresponding to DST in the (possibly moved) destination
        area.  A buffer destination is enlarged by making a gap, any other
        destination by reallocation.  CODING_RESULT_SUCCESS is recorded
        and the destination fields of CODING are refreshed.  */
1055 static unsigned char *
1056 alloc_destination (coding, nbytes, dst)
1057      struct coding_system *coding;
1058      EMACS_INT nbytes;
1059      unsigned char *dst;
1060 {
1061   /* Remember DST as an offset, because the area may move.  */
1062   EMACS_INT used = dst - coding->destination;
1063
1064   if (! BUFFERP (coding->dst_object))
1065     coding_alloc_by_realloc (coding, nbytes);
1066   else
1067     coding_alloc_by_making_gap (coding, nbytes);
1068   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1069   coding_set_destination (coding);
1070   return coding->destination + used;
1071 }
1072
1073 /** Macros for annotations.  */
1074
1075 /* Maximum length of annotation data (sum of annotations for
1076    composition and charset), counted in elements of coding->charbuf.
        Presumably the terms are: 4 elements heading a composition
        annotation, up to MAX_COMPOSITION_COMPONENTS * 2 - 1 elements for
        its components and rules, and 4 elements for a charset
        annotation -- TODO confirm this breakdown.  */
1077 #define MAX_ANNOTATION_LENGTH (4 + (MAX_COMPOSITION_COMPONENTS * 2) - 1 + 4)
1078
1079 /* Annotation data is stored in the array coding->charbuf in this
1080    format:
1081      [ -LENGTH ANNOTATION_MASK NCHARS ... ]
1082    LENGTH is the number of elements in the annotation.
1083    ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
1084    NCHARS is the number of characters in the text annotated.
1085
1086    The format of the following elements depends on ANNOTATION_MASK.
1087
1088    In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
1089    follow:
1090      ... METHOD [ COMPOSITION-COMPONENTS ... ]
1091    METHOD is one of enum composition_method.
1092    Optional COMPOSITION-COMPONENTS are characters and composition
1093    rules.
1094
1095    In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1096    follows.  */
1097
     /* Store the three leading elements of an annotation (-LEN, MASK and
        NCHARS) at BUF, advancing BUF past them, and set
        coding->annotated.  The caller must have `coding' in scope.  The
        definition deliberately has no trailing semicolon, so that the
        macro followed by `;' is exactly one statement and can be used in
        an unbraced if/else.  */
1098 #define ADD_ANNOTATION_DATA(buf, len, mask, nchars)     \
1099   do {                                                  \
1100     *(buf)++ = -(len);                                  \
1101     *(buf)++ = (mask);                                  \
1102     *(buf)++ = (nchars);                                \
1103     coding->annotated = 1;                              \
1104   } while (0)
1105
     /* Store at BUF the head of a composition annotation covering NCHARS
        characters: the three leading elements written by
        ADD_ANNOTATION_DATA (with length 4 and
        CODING_ANNOTATE_COMPOSITION_MASK) followed by METHOD.  BUF is
        advanced past the four elements.  */
1106 #define ADD_COMPOSITION_DATA(buf, nchars, method)                           \
1107   do {                                                                      \
1108     ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
1109     *buf++ = method;                                                        \
1110   } while (0)
1111
1112 /* Store at BUF a charset annotation covering NCHARS characters: the
        three leading elements written by ADD_ANNOTATION_DATA (with length
        4 and CODING_ANNOTATE_CHARSET_MASK) followed by the charset ID.
        BUF is advanced past the four elements.  */
1113 #define ADD_CHARSET_DATA(buf, nchars, id)                               \
1114   do {                                                                  \
1115     ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars); \
1116     *buf++ = id;                                                        \
1117   } while (0)
1118
1119 \f
1120 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1121
1122
1123
1124 \f
1125 /*** 3. UTF-8 ***/
1126
1127 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1128    Check if a text is encoded in UTF-8.  If it is, return 1, else
1129    return 0.  */
1130
     /* Predicates classifying the octet C by its leading bits: a
        single-octet (ASCII) character, an extra (trailing) octet of the
        form 10xxxxxx, or the leading octet of a 2-, 3-, 4- or 5-octet
        sequence.  Note that UTF_8_1_OCTET_P is also true for a negative
        C.  */
1131 #define UTF_8_1_OCTET_P(c)         ((c) < 0x80)
1132 #define UTF_8_EXTRA_OCTET_P(c)     (((c) & 0xC0) == 0x80)
1133 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
1134 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
1135 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
1136 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
1137
     /* Examine the source bytes of CODING and decide whether they can be
        UTF-8.  CATEGORY_MASK_UTF_8 is always added to
        DETECT_INFO->checked.  Return 0, with the category added to
        DETECT_INFO->rejected, as soon as a malformed multi-octet sequence
        is seen, or when the last block ends in the middle of a sequence.
        Otherwise return 1; in that case CATEGORY_MASK_UTF_8 is added to
        DETECT_INFO->found only if at least one well-formed multi-octet
        sequence was seen, so text without any such sequence is accepted
        but not reported as found.

        The locals `src', `src_end', `multibytep' and `consumed_chars' are
        presumably there for ONE_MORE_BYTE, which is also expected to jump
        to `no_more_source' when the source is exhausted -- see its
        definition earlier in this file.  */
1138 static int
1139 detect_coding_utf_8 (coding, detect_info)
1140      struct coding_system *coding;
1141      struct coding_detection_info *detect_info;
1142 {
1143   const unsigned char *src = coding->source, *src_base;
1144   const unsigned char *src_end = coding->source + coding->src_bytes;
1145   int multibytep = coding->src_multibyte;
1146   int consumed_chars = 0;
1147   int found = 0;
1148
1149   detect_info->checked |= CATEGORY_MASK_UTF_8;
1150   /* A coding system of this category is always ASCII compatible.  */
1151   src += coding->head_ascii;
1152
1153   while (1)
1154     {
1155       int c, c1, c2, c3, c4;
1156
           /* SRC_BASE marks the start of the sequence being examined; it
              is compared with SRC at `no_more_source' to detect a
              truncated sequence.  */
1157       src_base = src;
1158       ONE_MORE_BYTE (c);
1159       if (c < 0 || UTF_8_1_OCTET_P (c))
1160         continue;
1161       ONE_MORE_BYTE (c1);
1162       if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
1163         break;
1164       if (UTF_8_2_OCTET_LEADING_P (c))
1165         {
1166           found = CATEGORY_MASK_UTF_8;
1167           continue;
1168         }
1169       ONE_MORE_BYTE (c2);
1170       if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1171         break;
1172       if (UTF_8_3_OCTET_LEADING_P (c))
1173         {
1174           found = CATEGORY_MASK_UTF_8;
1175           continue;
1176         }
1177       ONE_MORE_BYTE (c3);
1178       if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1179         break;
1180       if (UTF_8_4_OCTET_LEADING_P (c))
1181         {
1182           found = CATEGORY_MASK_UTF_8;
1183           continue;
1184         }
1185       ONE_MORE_BYTE (c4);
1186       if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1187         break;
1188       if (UTF_8_5_OCTET_LEADING_P (c))
1189         {
1190           found = CATEGORY_MASK_UTF_8;
1191           continue;
1192         }
1193       break;
1194     }
       /* Reached only by `break', i.e. after a malformed sequence.  */
1195   detect_info->rejected |= CATEGORY_MASK_UTF_8;
1196   return 0;
1197
1198  no_more_source:
       /* SRC_BASE < SRC means the source ended inside a sequence, which is
          an error only if no further block will follow.  */
1199   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1200     {
1201       detect_info->rejected |= CATEGORY_MASK_UTF_8;
1202       return 0;
1203     }
1204   detect_info->found |= found;
1205   return 1;
1206 }
1207
1208 /* Decode the UTF-8 bytes of the source of CODING, starting at offset
        CODING->consumed, into character codes appended to CODING->charbuf,
        until the source is exhausted or the charbuf is full.

        Sequences of 1 to 5 octets are accepted.  Overlong forms, the
        surrogate range 0xD800..0xDFFF, 5-octet values above MAX_CHAR and
        otherwise malformed sequences are not decoded as such: only the
        first byte of the sequence is consumed and stored by itself (as is
        if ASCII, else via BYTE8_TO_CHAR), and CODING->errors is
        incremented.  A negative value from ONE_MORE_BYTE is stored
        negated.

        On return CODING->consumed, CODING->consumed_char and
        CODING->charbuf_used are updated up to the last completely
        processed character.  ONE_MORE_BYTE is expected to jump to
        `no_more_source' when the source is exhausted -- see its
        definition earlier in this file.  */
1209 static void
1210 decode_coding_utf_8 (coding)
1211      struct coding_system *coding;
1212 {
1213   const unsigned char *src = coding->source + coding->consumed;
1214   const unsigned char *src_end = coding->source + coding->src_bytes;
1215   const unsigned char *src_base;
1216   int *charbuf = coding->charbuf + coding->charbuf_used;
1217   int *charbuf_end = coding->charbuf + coding->charbuf_size;
1218   int consumed_chars = 0, consumed_chars_base;
1219   int multibytep = coding->src_multibyte;
1220   Lisp_Object attr, charset_list;
1221
1222   CODING_GET_INFO (coding, attr, charset_list);
1223
1224   while (1)
1225     {
1226       int c, c1, c2, c3, c4, c5;
1227
           /* Remember where this character starts, so that an incomplete
              or invalid sequence can be rewound to this point.  */
1228       src_base = src;
1229       consumed_chars_base = consumed_chars;
1230
1231       if (charbuf >= charbuf_end)
1232         break;
1233
1234       ONE_MORE_BYTE (c1);
1235       if (c1 < 0)
1236         {
1237           c = - c1;
1238         }
1239       else if (UTF_8_1_OCTET_P(c1))
1240         {
1241           c = c1;
1242         }
1243       else
1244         {
1245           ONE_MORE_BYTE (c2);
1246           if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1247             goto invalid_code;
1248           if (UTF_8_2_OCTET_LEADING_P (c1))
1249             {
1250               c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1251               /* Reject overlong sequences here and below.  Encoders
1252                  producing them are incorrect, they can be misleading,
1253                  and they mess up read/write invariance.  */
1254               if (c < 128)
1255                 goto invalid_code;
1256             }
1257           else
1258             {
1259               ONE_MORE_BYTE (c3);
1260               if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1261                 goto invalid_code;
1262               if (UTF_8_3_OCTET_LEADING_P (c1))
1263                 {
1264                   c = (((c1 & 0xF) << 12)
1265                        | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
1266                   if (c < 0x800
1267                       || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
1268                     goto invalid_code;
1269                 }
1270               else
1271                 {
1272                   ONE_MORE_BYTE (c4);
1273                   if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1274                     goto invalid_code;
1275                   if (UTF_8_4_OCTET_LEADING_P (c1))
1276                     {
1277                     c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1278                          | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
1279                     if (c < 0x10000)
1280                       goto invalid_code;
1281                     }
1282                   else
1283                     {
1284                       ONE_MORE_BYTE (c5);
1285                       if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
1286                         goto invalid_code;
1287                       if (UTF_8_5_OCTET_LEADING_P (c1))
1288                         {
1289                           c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1290                                | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1291                                | (c5 & 0x3F));
1292                           if ((c > MAX_CHAR) || (c < 0x200000))
1293                             goto invalid_code;
1294                         }
1295                       else
1296                         goto invalid_code;
1297                     }
1298                 }
1299             }
1300         }
1301
1302       *charbuf++ = c;
1303       continue;
1304
1305     invalid_code:
           /* Rewind to the start of the sequence and emit just its first
              byte; the following bytes are examined again on the next
              iterations.  */
1306       src = src_base;
1307       consumed_chars = consumed_chars_base;
1308       ONE_MORE_BYTE (c);
1309       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1310       coding->errors++;
1311     }
1312
1313  no_more_source:
       /* Report progress only up to the start of the last (possibly
          incomplete) character.  */
1314   coding->consumed_char += consumed_chars_base;
1315   coding->consumed = src_base - coding->source;
1316   coding->charbuf_used = charbuf - coding->charbuf;
1317 }
1318
1319 /* Encode the characters in CODING->charbuf and write the result at
        CODING->destination + CODING->produced.  A character satisfying
        CHAR_BYTE8_P is written as the single byte CHAR_TO_BYTE8 gives; any
        other character is written as the byte sequence produced by
        CHAR_STRING / CHAR_STRING_ADVANCE.  When the destination is
        multibyte, every output byte goes through EMIT_ONE_BYTE, so
        PRODUCED_CHARS counts bytes; otherwise it counts characters.
        Room is secured with ASSURE_DESTINATION before each character.
        CODING_RESULT_SUCCESS is recorded, CODING->produced_char and
        CODING->produced are updated, and 0 is always returned.  */
1320 static int
1321 encode_coding_utf_8 (coding)
1322      struct coding_system *coding;
1323 {
1324   int multibytep = coding->dst_multibyte;
1325   int *charbuf = coding->charbuf;
1326   int *charbuf_end = charbuf + coding->charbuf_used;
1327   unsigned char *dst = coding->destination + coding->produced;
1328   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1329   int produced_chars = 0;
1330   int c;
1331
1332   if (multibytep)
1333     {
1334       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1335
1336       while (charbuf < charbuf_end)
1337         {
               /* STR receives the byte sequence of the character; PEND
                  ends up just past its last byte.  */
1338           unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1339
1340           ASSURE_DESTINATION (safe_room);
1341           c = *charbuf++;
1342           if (CHAR_BYTE8_P (c))
1343             {
1344               c = CHAR_TO_BYTE8 (c);
1345               EMIT_ONE_BYTE (c);
1346             }
1347           else
1348             {
1349               CHAR_STRING_ADVANCE (c, pend);
1350               for (p = str; p < pend; p++)
1351                 EMIT_ONE_BYTE (*p);
1352             }
1353         }
1354     }
1355   else
1356     {
1357       int safe_room = MAX_MULTIBYTE_LENGTH;
1358
1359       while (charbuf < charbuf_end)
1360         {
1361           ASSURE_DESTINATION (safe_room);
1362           c = *charbuf++;
1363           if (CHAR_BYTE8_P (c))
1364             *dst++ = CHAR_TO_BYTE8 (c);
1365           else
1366             dst += CHAR_STRING (c, dst);
1367           produced_chars++;
1368         }
1369     }
1370   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1371   coding->produced_char += produced_chars;
1372   coding->produced = dst - coding->destination;
1373   return 0;
1374 }
1375
1376
1377 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1378    Check if a text is encoded in one of UTF-16 based coding systems.
1379    If it is, return 1, else return 0.  */
1380
     /* Nonzero if the 16-bit value VAL is a high surrogate, i.e. in the
        range 0xD800..0xDBFF.  */
1381 #define UTF_16_HIGH_SURROGATE_P(val) \
1382   (((val) & 0xFC00) == 0xD800)
1383
     /* Nonzero if the 16-bit value VAL is a low surrogate, i.e. in the
        range 0xDC00..0xDFFF.  */
1384 #define UTF_16_LOW_SURROGATE_P(val) \
1385   (((val) & 0xFC00) == 0xDC00)
1386
     /* Nonzero if VAL is 0xFFFE, 0xFFFF or a low surrogate.  VAL may be
        evaluated more than once.  */
1387 #define UTF_16_INVALID_P(val)   \
1388   (((val) == 0xFFFE)            \
1389    || ((val) == 0xFFFF)         \
1390    || UTF_16_LOW_SURROGATE_P (val))
1391
1392 /* Look at the first two source bytes of CODING for a UTF-16 byte
        order mark and update DETECT_INFO accordingly.
        CATEGORY_MASK_UTF_16 is always added to DETECT_INFO->checked.
        Return 0, rejecting the whole UTF-16 category, if this is the last
        block and CODING->src_chars is odd; return 1 in every other case,
        including when fewer than two bytes are available.  ONE_MORE_BYTE
        is expected to jump to `no_more_source' when the source is
        exhausted -- see its definition earlier in this file.  */
1393 static int
1394 detect_coding_utf_16 (coding, detect_info)
1395      struct coding_system *coding;
1396      struct coding_detection_info *detect_info;
1397 {
1398   const unsigned char *src = coding->source, *src_base = src;
1399   const unsigned char *src_end = coding->source + coding->src_bytes;
1400   int multibytep = coding->src_multibyte;
1401   int consumed_chars = 0;
1402   int c1, c2;
1403
1404   detect_info->checked |= CATEGORY_MASK_UTF_16;
1405   if (coding->mode & CODING_MODE_LAST_BLOCK
1406       && (coding->src_chars & 1))
1407     {
1408       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1409       return 0;
1410     }
1411
1412   ONE_MORE_BYTE (c1);
1413   ONE_MORE_BYTE (c2);
       /* FF FE is the byte order mark in little endian order.  */
1414   if ((c1 == 0xFF) && (c2 == 0xFE))
1415     {
1416       detect_info->found |= (CATEGORY_MASK_UTF_16_LE
1417                              | CATEGORY_MASK_UTF_16_AUTO);
1418       detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
1419                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1420                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1421     }
       /* FE FF is the byte order mark in big endian order.  */
1422   else if ((c1 == 0xFE) && (c2 == 0xFF))
1423     {
1424       detect_info->found |= (CATEGORY_MASK_UTF_16_BE
1425                              | CATEGORY_MASK_UTF_16_AUTO);
1426       detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
1427                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1428                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1429     }
       /* Two ordinary bytes that are not a byte order mark: only the two
          categories rejected below are ruled out.  */
1430   else if (c1 >= 0 && c2 >= 0)
1431     {
1432       detect_info->rejected
1433         |= (CATEGORY_MASK_UTF_16_BE | CATEGORY_MASK_UTF_16_LE);
1434     }
1435  no_more_source:
1436   return 1;
1437 }
1438
     /* Decode the UTF-16 bytes of the source of CODING, starting at
        offset CODING->consumed, into character codes appended to
        CODING->charbuf, until the source is exhausted or fewer than
        three free elements remain in the charbuf.

        If the coding system expects a byte order mark, the first two
        bytes are checked against it; when they are not the expected
        mark, CODING->errors is incremented and the bytes are decoded as
        ordinary text.  In either case the coding system is then marked
        as being without a mark, so the check is made only once.

        A high surrogate is kept in CODING_UTF_16_SURROGATE (CODING)
        until the next 16-bit unit arrives.  If that unit is a low
        surrogate the pair is combined into one character; otherwise the
        two bytes of the pending surrogate are stored as separate
        elements, CODING->errors is incremented, and the new unit is
        handled on its own.

        On return CODING->consumed, CODING->consumed_char and
        CODING->charbuf_used are updated up to the last completely
        processed unit.  ONE_MORE_BYTE is expected to jump to
        `no_more_source' when the source is exhausted -- see its
        definition earlier in this file.  */
1439 static void
1440 decode_coding_utf_16 (coding)
1441      struct coding_system *coding;
1442 {
1443   const unsigned char *src = coding->source + coding->consumed;
1444   const unsigned char *src_end = coding->source + coding->src_bytes;
1445   const unsigned char *src_base;
1446   int *charbuf = coding->charbuf + coding->charbuf_used;
1447   int *charbuf_end = coding->charbuf + coding->charbuf_size;
       /* CONSUMED_CHARS_BASE must start at zero: `no_more_source' reads it,
          and can be reached while the byte order mark is being read,
          before the main loop has assigned it.  */
1448   int consumed_chars = 0, consumed_chars_base = 0;
1449   int multibytep = coding->src_multibyte;
1450   enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding);
1451   enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1452   int surrogate = CODING_UTF_16_SURROGATE (coding);
1453   Lisp_Object attr, charset_list;
1454
1455   CODING_GET_INFO (coding, attr, charset_list);
1456
1457   if (bom == utf_16_with_bom)
1458     {
1459       int c, c1, c2;
1460
1461       src_base = src;
1462       ONE_MORE_BYTE (c1);
1463       ONE_MORE_BYTE (c2);
1464       c = (c1 << 8) | c2;
1465
1466       if (endian == utf_16_big_endian
1467           ? c != 0xFEFF : c != 0xFFFE)
1468         {
1469           /* The first two bytes are not BOM.  Treat them as bytes
1470              for a normal character.  */
1471           src = src_base;
               /* Rewind the character count together with SRC, so that the
                  two bytes are not counted twice when they are read again
                  below.  */
               consumed_chars = consumed_chars_base;
1472           coding->errors++;
1473         }
1474       CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1475     }
1476   else if (bom == utf_16_detect_bom)
1477     {
1478       /* We have already tried to detect BOM and failed in
1479          detect_coding.  */
1480       CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1481     }
1482
1483   while (1)
1484     {
1485       int c, c1, c2;
1486
           /* Remember where this unit starts, so that progress is reported
              only up to here if the source runs out in the middle.  */
1487       src_base = src;
1488       consumed_chars_base = consumed_chars;
1489
           /* One iteration may store up to three elements.  */
1490       if (charbuf + 2 >= charbuf_end)
1491         break;
1492
1493       ONE_MORE_BYTE (c1);
1494       if (c1 < 0)
1495         {
1496           *charbuf++ = -c1;
1497           continue;
1498         }
1499       ONE_MORE_BYTE (c2);
1500       if (c2 < 0)
1501         {
1502           *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
1503           *charbuf++ = -c2;
1504           continue;
1505         }
1506       c = (endian == utf_16_big_endian
1507            ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1508       if (surrogate)
1509         {
1510           if (! UTF_16_LOW_SURROGATE_P (c))
1511             {
                   /* The pending high surrogate is not followed by a low
                      one: emit its two bytes in source order.  */
1512               if (endian == utf_16_big_endian)
1513                 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1514               else
1515                 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1516               *charbuf++ = c1;
1517               *charbuf++ = c2;
1518               coding->errors++;
1519               if (UTF_16_HIGH_SURROGATE_P (c))
1520                 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1521               else
1522                 *charbuf++ = c;
1523             }
1524           else
1525             {
1526               c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1527               CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1528               *charbuf++ = 0x10000 + c;
1529             }
1530         }
1531       else
1532         {
1533           if (UTF_16_HIGH_SURROGATE_P (c))
1534             CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1535           else
1536             *charbuf++ = c;
1537         }
1538     }
1539
1540  no_more_source:
1541   coding->consumed_char += consumed_chars_base;
1542   coding->consumed = src_base - coding->source;
1543   coding->charbuf_used = charbuf - coding->charbuf;
1544 }
1545
     /* Encode the characters in CODING->charbuf as UTF-16 and write the
        result at CODING->destination + CODING->produced, in the byte
        order given by CODING_UTF_16_ENDIAN (CODING).  Unless the coding
        system is marked as being without a byte order mark, the mark is
        written first and the coding system is then marked as without
        one, so it is written only once.

        A character above MAX_UNICODE_CHAR cannot be represented and is
        replaced by CODING->default_char; MAX_UNICODE_CHAR itself is
        still encodable (as a surrogate pair) and is kept.  A character
        below 0x10000 is written as one 16-bit unit, any other as a
        high/low surrogate pair.

        CODING_RESULT_SUCCESS is recorded, CODING->produced and
        CODING->produced_char are updated, and 0 is always returned.  */
1546 static int
1547 encode_coding_utf_16 (coding)
1548      struct coding_system *coding;
1549 {
1550   int multibytep = coding->dst_multibyte;
1551   int *charbuf = coding->charbuf;
1552   int *charbuf_end = charbuf + coding->charbuf_used;
1553   unsigned char *dst = coding->destination + coding->produced;
1554   unsigned char *dst_end = coding->destination + coding->dst_bytes;
       /* At most four bytes are emitted per character; presumably each may
          take two bytes in a multibyte destination -- confirm against
          EMIT_ONE_BYTE.  */
1555   int safe_room = 8;
1556   enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding);
1557   int big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1558   int produced_chars = 0;
1559   Lisp_Object attrs, charset_list;
1560   int c;
1561
1562   CODING_GET_INFO (coding, attrs, charset_list);
1563
1564   if (bom != utf_16_without_bom)
1565     {
1566       ASSURE_DESTINATION (safe_room);
1567       if (big_endian)
1568         EMIT_TWO_BYTES (0xFE, 0xFF);
1569       else
1570         EMIT_TWO_BYTES (0xFF, 0xFE);
1571       CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1572     }
1573
1574   while (charbuf < charbuf_end)
1575     {
1576       ASSURE_DESTINATION (safe_room);
1577       c = *charbuf++;
           /* Strictly greater: the maximum Unicode character itself has a
              valid surrogate pair encoding.  */
1578       if (c > MAX_UNICODE_CHAR)
1579         c = coding->default_char;
1580
1581       if (c < 0x10000)
1582         {
1583           if (big_endian)
1584             EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1585           else
1586             EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1587         }
1588       else
1589         {
1590           int c1, c2;
1591
               /* Split the 20 bits above 0x10000 into a high surrogate C1
                  and a low surrogate C2.  */
1592           c -= 0x10000;
1593           c1 = (c >> 10) + 0xD800;
1594           c2 = (c & 0x3FF) + 0xDC00;
1595           if (big_endian)
1596             EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1597           else
1598             EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1599         }
1600     }
1601   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1602   coding->produced = dst - coding->destination;
1603   coding->produced_char += produced_chars;
1604   return 0;
1605 }
1606
1607 \f
1608 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1609
1610 /* Emacs' internal format for representation of multiple character
1611    sets is a kind of multi-byte encoding, i.e. characters are
1612    represented by variable-length sequences of one-byte codes.
1613
1614    ASCII characters and control characters (e.g. `tab', `newline') are
1615    represented by one-byte sequences which are their ASCII codes, in
1616    the range 0x00 through 0x7F.
1617
1618    8-bit characters of the range 0x80..0x9F are represented by
1619    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1620    code + 0x20).
1621
1622    8-bit characters of the range 0xA0..0xFF are represented by
1623    one-byte sequences which are their 8-bit code.
1624
1625    The other characters are represented by a sequence of `base
1626    leading-code', optional `extended leading-code', and one or two
1627    `position-code's.  The length of the sequence is determined by the
1628    base leading-code.  Leading-code takes the range 0x81 through 0x9D,
1629    whereas extended leading-code and position-code take the range 0xA0
1630    through 0xFF.  See `charset.h' for more details about leading-code
1631    and position-code.
1632
1633    --- CODE RANGE of Emacs' internal format ---
1634    character set        range
1635    -------------        -----
1636    ascii                0x00..0x7F
1637    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1638    eight-bit-graphic    0xA0..0xFF
1639    ELSE                 0x81..0x9D + [0xA0..0xFF]+
1640    ---------------------------------------------
1641
1642    As this is the internal character representation, the format is
1643    usually not used externally (i.e. in a file or in a data sent to a
1644    process).  But, it is possible to have a text externally in this
1645    format (i.e. by encoding by the coding system `emacs-mule').
1646
1647    In that case, a sequence of one-byte codes has a slightly different
1648    form.
1649
1650    At first, all characters in eight-bit-control are represented by
1651    one-byte sequences which are their 8-bit code.
1652
1653    Next, character composition data are represented by the byte
1654    sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1655    where,
1656         METHOD is 0xF2 plus the composition method (enum
1657         composition_method),
1658
1659         BYTES is 0xA0 plus a byte length of this composition data,
1660
1661         CHARS is 0xA0 plus the number of characters composed by this
1662         data,
1663
1664         COMPONENTs are characters in multibyte form or composition
1665         rules encoded as two bytes of ASCII codes.
1666
1667    In addition, for backward compatibility, the following formats are
1668    also recognized as composition data on decoding.
1669
1670    0x80 MSEQ ...
1671    0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1672
1673    Here,
1674         MSEQ is a multibyte form but in this special format:
1675           ASCII: 0xA0 ASCII_CODE+0x80,
1676           other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1677         RULE is a one byte code of the range 0xA0..0xF0 that
1678         represents a composition rule.
1679   */
1680
     /* Table indexed by the first byte of an emacs-mule sequence.  The
        entry is the total number of bytes in the sequence that starts
        with that byte: emacs_mule_char handles the values 1 through 4
        and aborts on anything else.  The table is filled in elsewhere
        (not in this part of the file).  */
1681 char emacs_mule_bytes[256];
1682
     /* Decode the emacs-mule character whose first byte is at SRC.

        On success, return the character code, store in *NBYTES the
        number of bytes read from SRC and in *NCHARS the final value of
        the local CONSUMED_CHARS counter (presumably advanced by
        ONE_MORE_BYTE, which is defined elsewhere); if ID is non-NULL,
        also store the id of the character's charset in *ID.

        Return -1 if the bytes do not form a valid character, and -2
        when control reaches the `no_more_source' label (presumably
        jumped to by ONE_MORE_BYTE when SRC reaches SRC_END).  */
1683 int
1684 emacs_mule_char (coding, src, nbytes, nchars, id)
1685      struct coding_system *coding;
1686      const unsigned char *src;
1687      int *nbytes, *nchars, *id;
1688 {
1689   const unsigned char *src_end = coding->source + coding->src_bytes;
1690   const unsigned char *src_base = src;
1691   int multibytep = coding->src_multibyte;
1692   struct charset *charset;
1693   unsigned code;
1694   int c;
1695   int consumed_chars = 0;
1696
1697   ONE_MORE_BYTE (c);
1698   if (c < 0)
1699     {
           /* NOTE(review): a negative value from ONE_MORE_BYTE is taken
              to be a negated character code and is attributed to
              emacs_mule_charset[0] -- confirm against the macro.  */
1700       c = -c;
1701       charset = emacs_mule_charset[0];
1702     }
1703   else
1704     {
1705       switch (emacs_mule_bytes[c])
1706         {
1707         case 2:
               /* Leading code, then one position code.  */
1708           if (! (charset = emacs_mule_charset[c]))
1709             goto invalid_code;
1710           ONE_MORE_BYTE (c);
1711           if (c < 0xA0)
1712             goto invalid_code;
1713           code = c & 0x7F;
1714           break;
1715
1716         case 3:
1717           if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
1718               || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
1719             {
                   /* Private leading code, then an extended leading code
                      that selects the charset, then one position code.  */
1720               ONE_MORE_BYTE (c);
1721               if (c < 0xA0 || ! (charset = emacs_mule_charset[c]))
1722                 goto invalid_code;
1723               ONE_MORE_BYTE (c);
1724               if (c < 0xA0)
1725                 goto invalid_code;
1726               code = c & 0x7F;
1727             }
1728           else
1729             {
                   /* Leading code, then two position codes.  */
1730               if (! (charset = emacs_mule_charset[c]))
1731                 goto invalid_code;
1732               ONE_MORE_BYTE (c);
1733               if (c < 0xA0)
1734                 goto invalid_code;
1735               code = (c & 0x7F) << 8;
1736               ONE_MORE_BYTE (c);
1737               if (c < 0xA0)
1738                 goto invalid_code;
1739               code |= c & 0x7F;
1740             }
1741           break;
1742
1743         case 4:
               /* The first byte is not used to pick the charset; the
                  second byte does that, and two position codes follow.  */
1744           ONE_MORE_BYTE (c);
1745           if (c < 0 || ! (charset = emacs_mule_charset[c]))
1746             goto invalid_code;
1747           ONE_MORE_BYTE (c);
1748           if (c < 0xA0)
1749             goto invalid_code;
1750           code = (c & 0x7F) << 8;
1751           ONE_MORE_BYTE (c);
1752           if (c < 0xA0)
1753             goto invalid_code;
1754           code |= c & 0x7F;
1755           break;
1756
1757         case 1:
               /* Single byte: either ASCII or a raw 8-bit byte.  */
1758           code = c;
1759           charset = CHARSET_FROM_ID (ASCII_BYTE_P (code)
1760                                      ? charset_ascii : charset_eight_bit);
1761           break;
1762
1763         default:
1764           abort ();
1765         }
1766       c = DECODE_CHAR (charset, code);
1767       if (c < 0)
1768         goto invalid_code;
1769     }
1770   *nbytes = src - src_base;
1771   *nchars = consumed_chars;
1772   if (id)
1773     *id = charset->id;
1774   return c;
1775
1776  no_more_source:
1777   return -2;
1778
1779  invalid_code:
1780   return -1;
1781 }
1782
1783
1784 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1785    Check if a text is encoded in `emacs-mule'.  If it is, return 1,
1786    else return 0.
        DETECT_INFO->checked always gets CATEGORY_MASK_EMACS_MULE.  When 0
        is returned the mask is also added to DETECT_INFO->rejected.  When
        1 is returned the mask is added to DETECT_INFO->found only if at
        least one composition or complete multibyte sequence was seen.  */
1787
1788 static int
1789 detect_coding_emacs_mule (coding, detect_info)
1790      struct coding_system *coding;
1791      struct coding_detection_info *detect_info;
1792 {
1793   const unsigned char *src = coding->source, *src_base;
1794   const unsigned char *src_end = coding->source + coding->src_bytes;
1795   int multibytep = coding->src_multibyte;
1796   int consumed_chars = 0;
1797   int c;
1798   int found = 0;
1799
1800   detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1801   /* A coding system of this category is always ASCII compatible.  */
1802   src += coding->head_ascii;
1803
1804   while (1)
1805     {
1806       src_base = src;
1807       ONE_MORE_BYTE (c);
1808       if (c < 0)
1809         continue;
1810       if (c == 0x80)
1811         {
1812           /* Perhaps the start of composite character.  We simply skip
1813              it because analyzing it is too heavy for detecting.  But,
1814              at least, we check that the composite character
1815              consists of more than 4 bytes.  */
               /* This SRC_BASE deliberately shadows the outer one: the
                  outer variable keeps pointing at the 0x80 byte for the
                  check at `no_more_source'.  */
1816           const unsigned char *src_base;
1817
1818         repeat:
1819           src_base = src;
1820           do
1821             {
1822               ONE_MORE_BYTE (c);
1823             }
1824           while (c >= 0xA0);
1825
1826           if (src - src_base <= 4)
1827             break;
1828           found = CATEGORY_MASK_EMACS_MULE;
1829           if (c == 0x80)
1830             goto repeat;
1831         }
1832
1833       if (c < 0x80)
1834         {
1835           if (c < 0x20
1836               && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
1837             break;
1838         }
1839       else
1840         {
               /* Look the length up by C, the byte just read.  After a
                  composition has been skipped above, the outer SRC_BASE
                  still points at the 0x80 byte that started it, not at
                  this leading byte.  */
1841           int more_bytes = emacs_mule_bytes[c] - 1;
1842
1843           while (more_bytes > 0)
1844             {
1845               ONE_MORE_BYTE (c);
1846               if (c < 0xA0)
1847                 {
1848                   src--;        /* Unread the last byte.  */
1849                   break;
1850                 }
1851               more_bytes--;
1852             }
1853           if (more_bytes != 0)
1854             break;
1855           found = CATEGORY_MASK_EMACS_MULE;
1856         }
1857     }
1858   detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1859   return 0;
1860
1861  no_more_source:
       /* The source ended.  If that happened in the middle of a sequence
          and this is the last block, the text cannot be emacs-mule.  */
1862   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1863     {
1864       detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1865       return 0;
1866     }
1867   detect_info->found |= found;
1868   return 1;
1869 }
1870
1871
1872 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
1873
1874 /* Decode a character represented as a component of composition
1875    sequence of Emacs 20/21 style at SRC.  Store that character in
1876    *BUF, increment BUF, and advance SRC (and CONSUMED_CHARS) to the
1877    head of the next character or encoded composition rule.  If SRC
1878    is at SRC_END, or emacs_mule_char returns -2, `break' out of the
1879    enclosing statement.  For any other negative result, jump to the
        label `invalid_code'.

        The body is wrapped in `if (1) ... else' instead of `do ... while
        (0)' so that the `break' statements act on the loop (or other
        breakable statement) surrounding the macro call, and so that the
        caller's semicolon completes the dangling `else'.  */
1880
1881 #define DECODE_EMACS_MULE_COMPOSITION_CHAR(buf)                 \
1882   if (1)                                                        \
1883     {                                                           \
1884       int c;                                                    \
1885       int nbytes, nchars;                                       \
1886                                                                 \
1887       if (src == src_end)                                       \
1888         break;                                                  \
1889       c = emacs_mule_char (coding, src, &nbytes, &nchars, NULL);\
1890       if (c < 0)                                                \
1891         {                                                       \
1892           if (c == -2)                                          \
1893             break;                                              \
1894           goto invalid_code;                                    \
1895         }                                                       \
1896       *buf++ = c;                                               \
1897       src += nbytes;                                            \
1898       consumed_chars += nchars;                                 \
1899     }                                                           \
1900   else
1901
1902
1903 /* Decode a composition rule represented as a component of composition
1904    sequence of Emacs 20 style at SRC.  Store the decoded rule in *BUF,
1905    and increment BUF.  The rule is a single byte in the range
1906    0xA0..0xF0 whose value is 0xA0 + GREF * 9 + NREF.  If SRC is at
        SRC_END or the byte is outside that range, jump to the label
        `invalid_code'.  */
1907
1908 #define DECODE_EMACS_MULE_COMPOSITION_RULE_20(buf)      \
1909   do {                                                  \
1910     int c, gref, nref;                                  \
1911                                                         \
1912     if (src >= src_end)                                 \
1913       goto invalid_code;                                \
1914     ONE_MORE_BYTE_NO_CHECK (c);                         \
         /* Rule bytes start at 0xA0, not 0x20.  */          \
1915     c -= 0xA0;                                          \
1916     if (c < 0 || c >= 81)                               \
1917       goto invalid_code;                                \
1918                                                         \
1919     gref = c / 9, nref = c % 9;                         \
1920     *buf++ = COMPOSITION_ENCODE_RULE (gref, nref);      \
1921   } while (0)
1922
1923
1924 /* Decode a composition rule represented as a component of composition
1925    sequence of Emacs 21 style at SRC.  Store the decoded rule in *BUF,
1926    and increment BUF.  The rule occupies two bytes, GREF + 0x20
1927    followed by NREF + 0x20.  If fewer than two bytes remain before
        SRC_END, or either decoded value is outside 0..80, jump to the
        label `invalid_code'.  */
1928
1929 #define DECODE_EMACS_MULE_COMPOSITION_RULE_21(buf)      \
1930   do {                                                  \
1931     int gref, nref;                                     \
1932                                                         \
1933     if (src + 1>= src_end)                              \
1934       goto invalid_code;                                \
1935     ONE_MORE_BYTE_NO_CHECK (gref);                      \
1936     gref -= 0x20;                                       \
1937     ONE_MORE_BYTE_NO_CHECK (nref);                      \
1938     nref -= 0x20;                                       \
1939     if (gref < 0 || gref >= 81                          \
1940         || nref < 0 || nref >= 81)                      \
1941       goto invalid_code;                                \
1942     *buf++ = COMPOSITION_ENCODE_RULE (gref, nref);      \
1943   } while (0)
1944
1945
     /* Decode the header, and for the non-relative methods the
        components, of an Emacs 21 style composition.  C must hold the
        byte that followed 0x80.  A composition annotation is appended to
        CHARBUF with ADD_COMPOSITION_DATA.  Unless the method is
        COMPOSITION_RELATIVE, components are then decoded into CHARBUF
        until CONSUMED_CHARS reaches CONSUMED_CHARS_BASE + BYTES, and the
        first element of the annotation is decreased by the number of
        components decoded.  Odd-numbered components are rules unless the
        method is COMPOSITION_WITH_ALTCHARS.  Malformed input jumps to the
        label `invalid_code'.  */
1946 #define DECODE_EMACS_MULE_21_COMPOSITION(c)                             \
1947   do {                                                                  \
1948     /* Emacs 21 style format.  C holds the METHOD byte (composition     \
1949        method + 0xF2); the next two bytes are BYTES (0xA0 + byte        \
1950        length of this composition data) and CHARS (0xA0 + number of     \
1951        characters composed by this composition).  */                    \
1952     enum composition_method method = c - 0xF2;                          \
1953     int *charbuf_base = charbuf;                                        \
1954     int consumed_chars_limit;                                           \
1955     int nbytes, nchars;                                                 \
1956                                                                         \
1957     ONE_MORE_BYTE (c);                                                  \
1958     if (c < 0)                                                          \
1959       goto invalid_code;                                                \
1960     nbytes = c - 0xA0;                                                  \
1961     if (nbytes < 3)                                                     \
1962       goto invalid_code;                                                \
1963     ONE_MORE_BYTE (c);                                                  \
1964     if (c < 0)                                                          \
1965       goto invalid_code;                                                \
1966     nchars = c - 0xA0;                                                  \
1967     ADD_COMPOSITION_DATA (charbuf, nchars, method);                     \
1968     consumed_chars_limit = consumed_chars_base + nbytes;                \
1969     if (method != COMPOSITION_RELATIVE)                                 \
1970       {                                                                 \
1971         int i = 0;                                                      \
1972         while (consumed_chars < consumed_chars_limit)                   \
1973           {                                                             \
1974             if (i % 2 && method != COMPOSITION_WITH_ALTCHARS)           \
1975               DECODE_EMACS_MULE_COMPOSITION_RULE_21 (charbuf);          \
1976             else                                                        \
1977               DECODE_EMACS_MULE_COMPOSITION_CHAR (charbuf);             \
1978             i++;                                                        \
1979           }                                                             \
             /* Still short of the limit: the character macro left the   \
                loop early with `break'.  */                             \
1980         if (consumed_chars < consumed_chars_limit)                      \
1981           goto invalid_code;                                            \
1982         charbuf_base[0] -= i;                                           \
1983       }                                                                 \
1984   } while (0)
1985
1986
     /* Decode an Emacs 20 style relative composition.  Reading restarts
        at SRC_BASE, the 0x80 byte is skipped, and up to
        MAX_COMPOSITION_COMPONENTS characters are decoded into a local
        array; the loop stops early when the character macro executes
        `break'.  Fewer than two characters is an error (`invalid_code').
        Otherwise a composition annotation is appended to CHARBUF,
        followed by the decoded characters.
        NOTE(review): SRC is rewound to SRC_BASE here but CONSUMED_CHARS
        is not reset to CONSUMED_CHARS_BASE -- confirm that is intended.  */
1987 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION(c)            \
1988   do {                                                          \
1989     /* Emacs 20 style format for relative composition.  */      \
1990     /* Store multibyte form of characters to be composed.  */   \
1991     enum composition_method method = COMPOSITION_RELATIVE;      \
1992     int components[MAX_COMPOSITION_COMPONENTS * 2 - 1];         \
1993     int *buf = components;                                      \
1994     int i, j;                                                   \
1995                                                                 \
1996     src = src_base;                                             \
1997     ONE_MORE_BYTE (c);          /* skip 0x80 */                 \
1998     for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++)            \
1999       DECODE_EMACS_MULE_COMPOSITION_CHAR (buf);                 \
2000     if (i < 2)                                                  \
2001       goto invalid_code;                                        \
2002     ADD_COMPOSITION_DATA (charbuf, i, method);                  \
2003     for (j = 0; j < i; j++)                                     \
2004       *charbuf++ = components[j];                               \
2005   } while (0)
2006
2007
     /* Decode an Emacs 20 style rule-base composition, i.e. the sequence
        MSEQ RULE MSEQ RULE ... MSEQ that follows 0x80 0xFF.  Characters
        and rules are collected alternately in a local array (characters
        at even indices, rules at odd indices); a byte below 0xA0 ends
        the sequence.  At least two characters are required and the
        sequence must end with a character, otherwise control jumps to
        `invalid_code'.  If CHARBUF lacks room, control jumps to
        `no_more_source'.  On success a composition annotation is
        appended to CHARBUF, followed by all components (the first
        element of the annotation is decreased by their number, as
        DECODE_EMACS_MULE_21_COMPOSITION does) and then by the composed
        characters themselves.
        NOTE(review): the room check assumes the annotation header fits
        in the MAX_ANNOTATION_LENGTH slack that the caller keeps below
        the real end of CHARBUF -- confirm.  The test `*src < 0xA0' looks
        at the raw source byte; confirm it is right for multibyte
        sources.  */
2008 #define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION(c)            \
2009   do {                                                          \
2010     /* Emacs 20 style format for rule-base composition.  */     \
2011     /* Store multibyte form of characters to be composed.  */   \
2012     enum composition_method method = COMPOSITION_WITH_RULE;     \
         int *charbuf_base = charbuf;                                \
2013     int components[MAX_COMPOSITION_COMPONENTS * 2 - 1];         \
2014     int *buf = components;                                      \
2015     int i, j;                                                   \
2016                                                                 \
         /* I counts the characters decoded so far; starting at 1    \
            keeps BUF within COMPONENTS.  */                         \
2017     DECODE_EMACS_MULE_COMPOSITION_CHAR (buf);                   \
2018     for (i = 1; i < MAX_COMPOSITION_COMPONENTS; i++)            \
2019       {                                                         \
             /* A byte below 0xA0 ends the composition.  */          \
             if (src < src_end && *src < 0xA0)                       \
               break;                                                \
2020         DECODE_EMACS_MULE_COMPOSITION_RULE_20 (buf);            \
2021         DECODE_EMACS_MULE_COMPOSITION_CHAR (buf);               \
2022       }                                                         \
2023     if (i < 2 || (buf - components) % 2 == 0)                   \
2024       goto invalid_code;                                        \
         /* Need room for 2*I-1 components plus I characters.  */    \
2025     if (charbuf + (2 * i - 1) + i >= charbuf_end)               \
2026       goto no_more_source;                                      \
2027     ADD_COMPOSITION_DATA (charbuf, i, method);                  \
2028     for (j = 0; j < 2 * i - 1; j++)                             \
2029       *charbuf++ = components[j];                               \
         charbuf_base[0] -= 2 * i - 1;                               \
2030     for (j = 0; j < 2 * i - 1; j += 2)                          \
2031       *charbuf++ = components[j];                               \
2032   } while (0)
2033
2034
     /* Decode emacs-mule bytes from CODING->source, starting at offset
        CODING->consumed, into character codes appended to
        CODING->charbuf.  Decoding stops when CHARBUF comes within
        MAX_ANNOTATION_LENGTH entries of its end or when the source is
        exhausted.  A byte sequence that cannot be decoded is handled at
        `invalid_code': its first byte is stored as a character by itself
        (through BYTE8_TO_CHAR when it is not ASCII), CODING->errors is
        incremented, and decoding resumes at the next byte.  When the
        charset id of decoded multibyte characters changes, a charset
        annotation for the previous non-ASCII run is added with
        ADD_CHARSET_DATA.  On exit CODING->consumed_char,
        CODING->consumed and CODING->charbuf_used are updated from the
        start of the last unit that was not completely processed.  */
2035 static void
2036 decode_coding_emacs_mule (coding)
2037      struct coding_system *coding;
2038 {
2039   const unsigned char *src = coding->source + coding->consumed;
2040   const unsigned char *src_end = coding->source + coding->src_bytes;
2041   const unsigned char *src_base;
2042   int *charbuf = coding->charbuf + coding->charbuf_used;
2043   int *charbuf_end
2044     = coding->charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
2045   int consumed_chars = 0, consumed_chars_base;
2046   int multibytep = coding->src_multibyte;
2047   Lisp_Object attrs, charset_list;
2048   int char_offset = coding->produced_char;
2049   int last_offset = char_offset;
2050   int last_id = charset_ascii;
2051
2052   CODING_GET_INFO (coding, attrs, charset_list);
2053
2054   while (1)
2055     {
2056       int c;
2057
           /* Remember where this unit starts so that it can be re-read
              at `invalid_code' or left unconsumed at `no_more_source'.  */
2058       src_base = src;
2059       consumed_chars_base = consumed_chars;
2060
2061       if (charbuf >= charbuf_end)
2062         break;
2063
2064       ONE_MORE_BYTE (c);
2065       if (c < 0)
2066         {
2067           *charbuf++ = -c;
2068           char_offset++;
2069         }
2070       else if (c < 0x80)
2071         {
2072           *charbuf++ = c;
2073           char_offset++;
2074         }
2075       else if (c == 0x80)
2076         {
               /* Composition data; the next byte selects the format.  */
2077           ONE_MORE_BYTE (c);
2078           if (c < 0)
2079             goto invalid_code;
2080           if (c - 0xF2 >= COMPOSITION_RELATIVE
2081               && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)
2082             DECODE_EMACS_MULE_21_COMPOSITION (c);
2083           else if (c < 0xC0)
2084             DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION (c);
2085           else if (c == 0xFF)
2086             DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION (c);
2087           else
2088             goto invalid_code;
2089         }
2090       else if (c < 0xA0 && emacs_mule_bytes[c] > 1)
2091         {
2092           int nbytes, nchars;
2093           int id;
2094
               /* Rewind and let emacs_mule_char read the whole
                  sequence, leading code included.  */
2095           src = src_base;
2096           consumed_chars = consumed_chars_base;
2097           c = emacs_mule_char (coding, src, &nbytes, &nchars, &id);
2098           if (c < 0)
2099             {
2100               if (c == -2)
2101                 break;
2102               goto invalid_code;
2103             }
2104           if (last_id != id)
2105             {
2106               if (last_id != charset_ascii)
2107                 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
2108               last_id = id;
2109               last_offset = char_offset;
2110             }
2111           *charbuf++ = c;
2112           src += nbytes;
2113           consumed_chars += nchars;
2114           char_offset++;
2115         }
           else
             /* A byte of 0xA0 or above, or a leading code that starts no
                multibyte sequence.  Keep it as a raw byte instead of
                silently dropping it.  */
             goto invalid_code;
2116       continue;
2117
2118     invalid_code:
2119       src = src_base;
2120       consumed_chars = consumed_chars_base;
2121       ONE_MORE_BYTE (c);
2122       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
2123       char_offset++;
2124       coding->errors++;
2125     }
2126
2127  no_more_source:
2128   if (last_id != charset_ascii)
2129     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
2130   coding->consumed_char += consumed_chars_base;
2131   coding->consumed = src_base - coding->source;
2132   coding->charbuf_used = charbuf - coding->charbuf;
2133 }
2134
2135
     /* Store in CODES[0] and CODES[1] the leading codes for the
        emacs-mule charset id ID.  For ID below 0xA0, CODES[0] is ID
        itself and CODES[1] is 0.  Otherwise CODES[0] is 0x9A, 0x9B,
        0x9C or 0x9D (for ID below 0xE0, below 0xF0, below 0xF5, and
        the rest, respectively) and CODES[1] is ID.  ID may be
        evaluated more than once.  The macro expands to a single
        statement without a trailing semicolon, so it is safe in an
        unbraced if/else.  */
2136 #define EMACS_MULE_LEADING_CODES(id, codes)     \
2137   do {                                          \
2138     if ((id) < 0xA0)                            \
2139       (codes)[0] = (id), (codes)[1] = 0;        \
2140     else if ((id) < 0xE0)                       \
2141       (codes)[0] = 0x9A, (codes)[1] = (id);     \
2142     else if ((id) < 0xF0)                       \
2143       (codes)[0] = 0x9B, (codes)[1] = (id);     \
2144     else if ((id) < 0xF5)                       \
2145       (codes)[0] = 0x9C, (codes)[1] = (id);     \
2146     else                                        \
2147       (codes)[0] = 0x9D, (codes)[1] = (id);     \
2148   } while (0)
2149
2150
     /* Encode the character codes in CODING->charbuf (charbuf_used
        entries) in emacs-mule format and append the bytes to
        CODING->destination at offset CODING->produced.  The charset
        list of the coding system is forced to Vemacs_mule_charset_list.
        A negative entry in CHARBUF starts an annotation: a charset
        annotation sets the preferred charset for the following
        characters (ignored if that charset is not in the charset list);
        a composition annotation is skipped.  ASCII characters and raw
        8-bit bytes are written as one byte.  Any other character is
        written as one or two leading codes followed by one or two
        position codes with the high bit set.  A character that belongs
        to no charset of the list is replaced by CODING->default_char.
        Records CODING_RESULT_SUCCESS, updates CODING->produced and
        CODING->produced_char, and always returns 0.  */
2151 static int
2152 encode_coding_emacs_mule (coding)
2153      struct coding_system *coding;
2154 {
2155   int multibytep = coding->dst_multibyte;
2156   int *charbuf = coding->charbuf;
2157   int *charbuf_end = charbuf + coding->charbuf_used;
2158   unsigned char *dst = coding->destination + coding->produced;
2159   unsigned char *dst_end = coding->destination + coding->dst_bytes;
2160   int safe_room = 8;
2161   int produced_chars = 0;
2162   Lisp_Object attrs, charset_list;
2163   int c;
2164   int preferred_charset_id = -1;
2165
2166   CODING_GET_INFO (coding, attrs, charset_list);
2167   if (! EQ (charset_list, Vemacs_mule_charset_list))
2168     {
2169       CODING_ATTR_CHARSET_LIST (attrs)
2170         = charset_list = Vemacs_mule_charset_list;
2171     }
2172
2173   while (charbuf < charbuf_end)
2174     {
2175       ASSURE_DESTINATION (safe_room);
2176       c = *charbuf++;
2177
2178       if (c < 0)
2179         {
2180           /* Handle an annotation.  */
               /* -C is the length of the annotation including the length
                  slot just read; CHARBUF now points at the second slot.
                  NOTE(review): with a four-slot charset annotation,
                  charbuf[3] lies just past the annotation -- confirm the
                  index against ADD_CHARSET_DATA.  */
2181           switch (*charbuf)
2182             {
2183             case CODING_ANNOTATE_COMPOSITION_MASK:
2184               /* Not yet implemented.  */
2185               break;
2186             case CODING_ANNOTATE_CHARSET_MASK:
2187               preferred_charset_id = charbuf[3];
2188               if (preferred_charset_id >= 0
2189                   && NILP (Fmemq (make_number (preferred_charset_id),
2190                                   charset_list)))
2191                 preferred_charset_id = -1;
2192               break;
2193             default:
2194               abort ();
2195             }
2196           charbuf += -c - 1;
2197           continue;
2198         }
2199
2200       if (ASCII_CHAR_P (c))
2201         EMIT_ONE_ASCII_BYTE (c);
2202       else if (CHAR_BYTE8_P (c))
2203         {
2204           c = CHAR_TO_BYTE8 (c);
2205           EMIT_ONE_BYTE (c);
2206         }
2207       else
2208         {
2209           struct charset *charset;
2210           unsigned code;
2211           int dimension;
2212           int emacs_mule_id;
2213           unsigned char leading_codes[2];
2214
               /* Every path below that yields a charset must also set
                  CODE, because CODE is what gets emitted.
                  NOTE(review): ENCODE_CHAR is assumed to be provided by
                  the same header as DECODE_CHAR and CHAR_CHARSET_P --
                  confirm it is in scope.  */
2215           if (preferred_charset_id >= 0)
2216             {
2217               charset = CHARSET_FROM_ID (preferred_charset_id);
2218               if (CHAR_CHARSET_P (c, charset))
                     code = ENCODE_CHAR (charset, c);
                   else
2219                 charset = char_charset (c, charset_list, &code);
2220             }
2221           else
2222             charset = char_charset (c, charset_list, &code);
2223           if (! charset)
2224             {
2225               c = coding->default_char;
2226               if (ASCII_CHAR_P (c))
2227                 {
2228                   EMIT_ONE_ASCII_BYTE (c);
2229                   continue;
2230                 }
2231               charset = char_charset (c, charset_list, &code);
2232             }
2233           dimension = CHARSET_DIMENSION (charset);
2234           emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2235           EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2236           EMIT_ONE_BYTE (leading_codes[0]);
2237           if (leading_codes[1])
2238             EMIT_ONE_BYTE (leading_codes[1]);
2239           if (dimension == 1)
2240             EMIT_ONE_BYTE (code | 0x80);
2241           else
2242             {
2243               code |= 0x8080;
2244               EMIT_ONE_BYTE (code >> 8);
2245               EMIT_ONE_BYTE (code & 0xFF);
2246             }
2247         }
2248     }
2249   record_conversion_result (coding, CODING_RESULT_SUCCESS);
2250   coding->produced_char += produced_chars;
2251   coding->produced = dst - coding->destination;
2252   return 0;
2253 }
2254
2255 \f
2256 /*** 7. ISO2022 handlers ***/
2257
2258 /* The following note describes the coding system ISO2022 briefly.
2259    Since the intention of this note is to help understand the
2260    functions in this file, some parts are NOT ACCURATE or are OVERLY
2261    SIMPLIFIED.  For thorough understanding, please refer to the
2262    original document of ISO2022.  This is equivalent to the standard
2263    ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2264
2265    ISO2022 provides many mechanisms to encode several character sets
2266    in 7-bit and 8-bit environments.  For 7-bit environments, all text
2267    is encoded using bytes less than 128.  This may make the encoded
2268    text a little bit longer, but the text passes more easily through
2269    several types of gateway, some of which strip off the MSB (Most
2270    Significant Bit).
2271
2272    There are two kinds of character sets: control character sets and
2273    graphic character sets.  The former contain control characters such
2274    as `newline' and `escape' to provide control functions (control
2275    functions are also provided by escape sequences).  The latter
2276    contain graphic characters such as 'A' and '-'.  Emacs recognizes
2277    two control character sets and many graphic character sets.
2278
2279    Graphic character sets are classified into one of the following
2280    four classes, according to the number of bytes (DIMENSION) and
2281    number of characters in one dimension (CHARS) of the set:
2282    - DIMENSION1_CHARS94
2283    - DIMENSION1_CHARS96
2284    - DIMENSION2_CHARS94
2285    - DIMENSION2_CHARS96
2286
2287    In addition, each character set is assigned an identification tag,
2288    unique for each set, called the "final character" (denoted as <F>
2289    hereafter).  The <F> of each character set is decided by ECMA(*)
2290    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
2291    (0x30..0x3F are for private use only).
2292
2293    Note (*): ECMA = European Computer Manufacturers Association
2294
2295    Here are examples of graphic character sets [NAME(<F>)]:
2296         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2297         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2298         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2299         o DIMENSION2_CHARS96 -- none for the moment
2300
2301    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2302         C0 [0x00..0x1F] -- control character plane 0
2303         GL [0x20..0x7F] -- graphic character plane 0
2304         C1 [0x80..0x9F] -- control character plane 1
2305         GR [0xA0..0xFF] -- graphic character plane 1
2306
2307    A control character set is directly designated and invoked to C0 or
2308    C1 by an escape sequence.  The most common case is that:
2309    - ISO646's  control character set is designated/invoked to C0, and
2310    - ISO6429's control character set is designated/invoked to C1,
2311    and usually these designations/invocations are omitted in encoded
2312    text.  In a 7-bit environment, only C0 can be used, and a control
2313    character for C1 is encoded by an appropriate escape sequence to
2314    fit into the environment.  All control characters for C1 are
2315    defined to have corresponding escape sequences.
2316
2317    A graphic character set is at first designated to one of four
2318    graphic registers (G0 through G3), then these graphic registers are
2319    invoked to GL or GR.  These designations and invocations can be
2320    done independently.  The most common case is that G0 is invoked to
2321    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
2322    these invocations and designations are omitted in encoded text.
2323    In a 7-bit environment, only GL can be used.
2324
2325    When a graphic character set of CHARS94 is invoked to GL, codes
2326    0x20 and 0x7F of the GL area work as control characters SPACE and
2327    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2328    be used.
2329
2330    There are two ways of invocation: locking-shift and single-shift.
2331    With locking-shift, the invocation lasts until the next different
2332    invocation, whereas with single-shift, the invocation affects the
2333    following character only and doesn't affect the locking-shift
2334    state.  Invocations are done by the following control characters or
2335    escape sequences:
2336
2337    ----------------------------------------------------------------------
2338    abbrev  function                  cntrl escape seq   description
2339    ----------------------------------------------------------------------
2340    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
2341    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
2342    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
2343    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
2344    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
2345    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
2346    LS3R    (locking-shift-3 right)   none  ESC '|'      invoke G3 into GR (*)
2347    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
2348    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
2349    ----------------------------------------------------------------------
2350    (*) These are not used by any known coding system.
2351
2352    Control characters for these functions are defined by macros
2353    ISO_CODE_XXX in `coding.h'.
2354
2355    Designations are done by the following escape sequences:
2356    ----------------------------------------------------------------------
2357    escape sequence      description
2358    ----------------------------------------------------------------------
2359    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
2360    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
2361    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
2362    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
2363    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
2364    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
2365    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
2366    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
2367    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
2368    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
2369    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
2370    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
2371    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
2372    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
2373    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
2374    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
2375    ----------------------------------------------------------------------
2376
2377    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2378    of dimension 1, chars 94, and final character <F>, etc...
2379
2380    Note (*): Although these designations are not allowed in ISO2022,
2381    Emacs accepts them on decoding, and produces them on encoding
2382    CHARS96 character sets in a coding system which is characterized as
2383    7-bit environment, non-locking-shift, and non-single-shift.
2384
2385    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2386    '(' must be omitted.  We refer to this as "short-form" hereafter.
2387
2388    Now you may notice that there are a lot of ways of encoding the
2389    same multilingual text in ISO2022.  Actually, there exist many
2390    coding systems such as Compound Text (used in X11's inter client
2391    communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2392    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2393    localized platforms), and all of these are variants of ISO2022.
2394
2395    In addition to the above, Emacs handles two more kinds of escape
2396    sequences: ISO6429's direction specification and Emacs' private
2397    sequence for specifying character composition.
2398
2399    ISO6429's direction specification takes the following form:
2400         o CSI ']'      -- end of the current direction
2401         o CSI '0' ']'  -- end of the current direction
2402         o CSI '1' ']'  -- start of left-to-right text
2403         o CSI '2' ']'  -- start of right-to-left text
2404    The control character CSI (0x9B: control sequence introducer) is
2405    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2406
2407    Character composition specification takes the following form:
2408         o ESC '0' -- start relative composition
2409         o ESC '1' -- end composition
2410         o ESC '2' -- start rule-base composition (*)
2411         o ESC '3' -- start relative composition with alternate chars  (**)
2412         o ESC '4' -- start rule-base composition with alternate chars  (**)
2413   Since these are not standard escape sequences of any ISO standard,
2414   the use of them with these meanings is restricted to Emacs only.
2415
2416   (*) This form is used only in Emacs 20.7 and older versions,
2417   but newer versions can safely decode it.
2418   (**) This form is used only in Emacs 21.1 and newer versions,
2419   and older versions can't decode it.
2420
2421   Here's a list of example usages of these composition escape
2422   sequences (categorized by `enum composition_method').
2423
2424   COMPOSITION_RELATIVE:
2425         ESC 0 CHAR [ CHAR ] ESC 1
2426   COMPOSITION_WITH_RULE:
2427         ESC 2 CHAR [ RULE CHAR ] ESC 1
2428   COMPOSITION_WITH_ALTCHARS:
2429         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2430   COMPOSITION_WITH_RULE_ALTCHARS:
2431         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2432
/* ISO-2022 code class of each possible byte value; indexed by the byte
   (0..255).  decode_coding_iso_2022 dispatches on these classes.  */
enum iso_code_class_type iso_code_class[256];
2434
/* Return non-zero if the charset ID is safe for CODING, i.e. ID indexes
   an entry of CODING->safe_charsets that holds a graphic register
   number rather than the 255 filler that setup_iso_safe_charsets
   stores for charsets the coding system does not support.

   ID is range-checked at both ends: the charset lookup tables yield a
   negative value for an unknown charset, and that must never be used
   as an index.  The entry is read as `unsigned char' so the result
   does not depend on whether plain `char' is signed on the host.

   Note that ID is evaluated more than once.  */
#define SAFE_CHARSET_P(coding, id)                              \
  ((id) >= 0                                                    \
   && (id) <= (coding)->max_charset_id                          \
   && ((unsigned char) (coding)->safe_charsets[id]) < 0x80)
2438
2439
/* Non-zero if CODING_ISO_INITIAL for graphic register 1 of the coding
   system registered for CATEGORY is non-negative (presumably meaning
   that some charset is initially designated to G1, so a shift-out has
   something to invoke -- confirm against CODING_ISO_INITIAL).  */
#define SHIFT_OUT_OK(category)                                  \
  (0 <= CODING_ISO_INITIAL (&coding_categories[category], 1))
2442
2443 static void
2444 setup_iso_safe_charsets (attrs)
2445      Lisp_Object attrs;
2446 {
2447   Lisp_Object charset_list, safe_charsets;
2448   Lisp_Object request;
2449   Lisp_Object reg_usage;
2450   Lisp_Object tail;
2451   int reg94, reg96;
2452   int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2453   int max_charset_id;
2454
2455   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2456   if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2457       && ! EQ (charset_list, Viso_2022_charset_list))
2458     {
2459       CODING_ATTR_CHARSET_LIST (attrs)
2460         = charset_list = Viso_2022_charset_list;
2461       ASET (attrs, coding_attr_safe_charsets, Qnil);
2462     }
2463
2464   if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2465     return;
2466
2467   max_charset_id = 0;
2468   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2469     {
2470       int id = XINT (XCAR (tail));
2471       if (max_charset_id < id)
2472         max_charset_id = id;
2473     }
2474
2475   safe_charsets = Fmake_string (make_number (max_charset_id + 1),
2476                                 make_number (255));
2477   request = AREF (attrs, coding_attr_iso_request);
2478   reg_usage = AREF (attrs, coding_attr_iso_usage);
2479   reg94 = XINT (XCAR (reg_usage));
2480   reg96 = XINT (XCDR (reg_usage));
2481
2482   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2483     {
2484       Lisp_Object id;
2485       Lisp_Object reg;
2486       struct charset *charset;
2487
2488       id = XCAR (tail);
2489       charset = CHARSET_FROM_ID (XINT (id));
2490       reg = Fcdr (Fassq (id, request));
2491       if (! NILP (reg))
2492         SSET (safe_charsets, XINT (id), XINT (reg));
2493       else if (charset->iso_chars_96)
2494         {
2495           if (reg96 < 4)
2496             SSET (safe_charsets, XINT (id), reg96);
2497         }
2498       else
2499         {
2500           if (reg94 < 4)
2501             SSET (safe_charsets, XINT (id), reg94);
2502         }
2503     }
2504   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2505 }
2506
2507
2508 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2509    Check if a text is encoded in one of ISO-2022 based codig systems.
2510    If it is, return 1, else return 0.  */
2511
2512 static int
2513 detect_coding_iso_2022 (coding, detect_info)
2514      struct coding_system *coding;
2515      struct coding_detection_info *detect_info;
2516 {
2517   const unsigned char *src = coding->source, *src_base = src;
2518   const unsigned char *src_end = coding->source + coding->src_bytes;
2519   int multibytep = coding->src_multibyte;
2520   int single_shifting = 0;
2521   int id;
2522   int c, c1;
2523   int consumed_chars = 0;
2524   int i;
2525   int rejected = 0;
2526   int found = 0;
2527
2528   detect_info->checked |= CATEGORY_MASK_ISO;
2529
2530   for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
2531     {
2532       struct coding_system *this = &(coding_categories[i]);
2533       Lisp_Object attrs, val;
2534
2535       attrs = CODING_ID_ATTRS (this->id);
2536       if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2537           && ! EQ (CODING_ATTR_SAFE_CHARSETS (attrs), Viso_2022_charset_list))
2538         setup_iso_safe_charsets (attrs);
2539       val = CODING_ATTR_SAFE_CHARSETS (attrs);
2540       this->max_charset_id = SCHARS (val) - 1;
2541       this->safe_charsets = (char *) SDATA (val);
2542     }
2543
2544   /* A coding system of this category is always ASCII compatible.  */
2545   src += coding->head_ascii;
2546
2547   while (rejected != CATEGORY_MASK_ISO)
2548     {
2549       src_base = src;
2550       ONE_MORE_BYTE (c);
2551       switch (c)
2552         {
2553         case ISO_CODE_ESC:
2554           if (inhibit_iso_escape_detection)
2555             break;
2556           single_shifting = 0;
2557           ONE_MORE_BYTE (c);
2558           if (c >= '(' && c <= '/')
2559             {
2560               /* Designation sequence for a charset of dimension 1.  */
2561               ONE_MORE_BYTE (c1);
2562               if (c1 < ' ' || c1 >= 0x80
2563                   || (id = iso_charset_table[0][c >= ','][c1]) < 0)
2564                 /* Invalid designation sequence.  Just ignore.  */
2565                 break;
2566             }
2567           else if (c == '$')
2568             {
2569               /* Designation sequence for a charset of dimension 2.  */
2570               ONE_MORE_BYTE (c);
2571               if (c >= '@' && c <= 'B')
2572                 /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
2573                 id = iso_charset_table[1][0][c];
2574               else if (c >= '(' && c <= '/')
2575                 {
2576                   ONE_MORE_BYTE (c1);
2577                   if (c1 < ' ' || c1 >= 0x80
2578                       || (id = iso_charset_table[1][c >= ','][c1]) < 0)
2579                     /* Invalid designation sequence.  Just ignore.  */
2580                     break;
2581                 }
2582               else
2583                 /* Invalid designation sequence.  Just ignore it.  */
2584                 break;
2585             }
2586           else if (c == 'N' || c == 'O')
2587             {
2588               /* ESC <Fe> for SS2 or SS3.  */
2589               single_shifting = 1;
2590               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
2591               break;
2592             }
2593           else if (c >= '0' && c <= '4')
2594             {
2595               /* ESC <Fp> for start/end composition.  */
2596               found |= CATEGORY_MASK_ISO;
2597               break;
2598             }
2599           else
2600             {
2601               /* Invalid escape sequence.  Just ignore it.  */
2602               break;
2603             }
2604
2605           /* We found a valid designation sequence for CHARSET.  */
2606           rejected |= CATEGORY_MASK_ISO_8BIT;
2607           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
2608                               id))
2609             found |= CATEGORY_MASK_ISO_7;
2610           else
2611             rejected |= CATEGORY_MASK_ISO_7;
2612           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
2613                               id))
2614             found |= CATEGORY_MASK_ISO_7_TIGHT;
2615           else
2616             rejected |= CATEGORY_MASK_ISO_7_TIGHT;
2617           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
2618                               id))
2619             found |= CATEGORY_MASK_ISO_7_ELSE;
2620           else
2621             rejected |= CATEGORY_MASK_ISO_7_ELSE;
2622           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
2623                               id))
2624             found |= CATEGORY_MASK_ISO_8_ELSE;
2625           else
2626             rejected |= CATEGORY_MASK_ISO_8_ELSE;
2627           break;
2628
2629         case ISO_CODE_SO:
2630         case ISO_CODE_SI:
2631           /* Locking shift out/in.  */
2632           if (inhibit_iso_escape_detection)
2633             break;
2634           single_shifting = 0;
2635           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
2636           found |= CATEGORY_MASK_ISO_ELSE;
2637           break;
2638
2639         case ISO_CODE_CSI:
2640           /* Control sequence introducer.  */
2641           single_shifting = 0;
2642           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
2643           found |= CATEGORY_MASK_ISO_8_ELSE;
2644           goto check_extra_latin;
2645
2646         case ISO_CODE_SS2:
2647         case ISO_CODE_SS3:
2648           /* Single shift.   */
2649           if (inhibit_iso_escape_detection)
2650             break;
2651           single_shifting = 0;
2652           rejected |= CATEGORY_MASK_ISO_7BIT;
2653           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2654               & CODING_ISO_FLAG_SINGLE_SHIFT)
2655             found |= CATEGORY_MASK_ISO_8_1, single_shifting = 1;
2656           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
2657               & CODING_ISO_FLAG_SINGLE_SHIFT)
2658             found |= CATEGORY_MASK_ISO_8_2, single_shifting = 1;
2659           if (single_shifting)
2660             break;
2661           goto check_extra_latin;
2662
2663         default:
2664           if (c < 0)
2665             continue;
2666           if (c < 0x80)
2667             {
2668               single_shifting = 0;
2669               break;
2670             }
2671           if (c >= 0xA0)
2672             {
2673               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
2674               found |= CATEGORY_MASK_ISO_8_1;
2675               /* Check the length of succeeding codes of the range
2676                  0xA0..0FF.  If the byte length is even, we include
2677                  CATEGORY_MASK_ISO_8_2 in `found'.  We can check this
2678                  only when we are not single shifting.  */
2679               if (! single_shifting
2680                   && ! (rejected & CATEGORY_MASK_ISO_8_2))
2681                 {
2682                   int i = 1;
2683                   while (src < src_end)
2684                     {
2685                       ONE_MORE_BYTE (c);
2686                       if (c < 0xA0)
2687                         break;
2688                       i++;
2689                     }
2690
2691                   if (i & 1 && src < src_end)
2692                     rejected |= CATEGORY_MASK_ISO_8_2;
2693                   else
2694                     found |= CATEGORY_MASK_ISO_8_2;
2695                 }
2696               break;
2697             }
2698         check_extra_latin:
2699           single_shifting = 0;
2700           if (! VECTORP (Vlatin_extra_code_table)
2701               || NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
2702             {
2703               rejected = CATEGORY_MASK_ISO;
2704               break;
2705             }
2706           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2707               & CODING_ISO_FLAG_LATIN_EXTRA)
2708             found |= CATEGORY_MASK_ISO_8_1;
2709           else
2710             rejected |= CATEGORY_MASK_ISO_8_1;
2711           rejected |= CATEGORY_MASK_ISO_8_2;
2712         }
2713     }
2714   detect_info->rejected |= CATEGORY_MASK_ISO;
2715   return 0;
2716
2717  no_more_source:
2718   detect_info->rejected |= rejected;
2719   detect_info->found |= (found & ~rejected);
2720   return 1;
2721 }
2722
2723
/* Record in CODING that the charset selected by DIM, CHARS_96 and
   FINAL is designated to graphic register REG.

   If FINAL is outside '0'..127, or names no charset, or names one
   that is not safe for CODING, REG is marked with -2 and CHARS_96 is
   set to -1, which tells the caller to keep the escape sequence.
   CHARS_96 is also set to -1 when REG was marked -2 before and ASCII
   is designated to it now.  JISX0201-Roman and JISX0208.1978 are
   replaced by ASCII and JISX0208 when CODING has the flags
   CODING_ISO_FLAG_USE_ROMAN and CODING_ISO_FLAG_USE_OLDJIS
   respectively.  CHARS_96 must be an lvalue.  */
#define DECODE_DESIGNATION(reg, dim, chars_96, final)                   \
  do {                                                                  \
    int designated_id = -1;                                             \
                                                                        \
    if (final >= '0' && final < 128)                                    \
      designated_id = ISO_CHARSET_TABLE (dim, chars_96, final);         \
    if (designated_id < 0 || ! SAFE_CHARSET_P (coding, designated_id))  \
      {                                                                 \
        /* Nothing usable was designated: poison the register and      \
           ask for the sequence to be kept.  */                         \
        CODING_ISO_DESIGNATION (coding, reg) = -2;                      \
        chars_96 = -1;                                                  \
      }                                                                 \
    else                                                                \
      {                                                                 \
        int prev_id = CODING_ISO_DESIGNATION (coding, reg);             \
                                                                        \
        if (designated_id == charset_jisx0201_roman)                    \
          designated_id                                                 \
            = ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)  \
               ? charset_ascii : designated_id);                        \
        else if (designated_id == charset_jisx0208_1978)                \
          designated_id                                                 \
            = ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
               ? charset_jisx0208 : designated_id);                     \
        CODING_ISO_DESIGNATION (coding, reg) = designated_id;           \
        /* An ASCII designation that repairs a register poisoned by    \
           an earlier invalid designation must be kept as well.  */     \
        if (prev_id == -2 && designated_id == charset_ascii)            \
          chars_96 = -1;                                                \
      }                                                                 \
  } while (0)
2756
2757
/* If a composition is being collected, abandon it: reset
   `composition_state' to COMPOSING_NO and emit the characters gathered
   so far in `components' to `charbuf' as ordinary characters.  For
   COMPOSITION_RELATIVE and COMPOSITION_WITH_ALTCHARS every component
   is emitted; for the other methods only the even-indexed entries are
   (the odd-indexed ones presumably being rules).  `char_offset' is
   advanced accordingly.  Jumps to `no_more_source' without changing
   anything if `charbuf' has no room for `component_idx' entries.  Does
   nothing when no composition is in progress.  */
#define MAYBE_FINISH_COMPOSITION()                              \
  do {                                                          \
    if (composition_state != COMPOSING_NO)                      \
      {                                                         \
        int stride, k;                                          \
                                                                \
        if (charbuf + component_idx > charbuf_end)              \
          goto no_more_source;                                  \
        composition_state = COMPOSING_NO;                       \
        if (method == COMPOSITION_RELATIVE                      \
            || method == COMPOSITION_WITH_ALTCHARS)             \
          {                                                     \
            stride = 1;                                         \
            char_offset += component_idx;                       \
          }                                                     \
        else                                                    \
          {                                                     \
            stride = 2;                                         \
            char_offset += (component_idx / 2) + 1;             \
          }                                                     \
        for (k = 0; k < component_idx; k += stride)             \
          *charbuf++ = components[k];                           \
      }                                                         \
  } while (0)
2782
2783
/* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
   ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
   ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
   ESC 3 : altchar composition :  ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
   ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1

   C1 is the byte that followed ESC.  An ESC 0 seen while in state
   COMPOSING_COMPONENT_RULE only closes the component list of an
   already started composition.  Otherwise any pending composition is
   flushed with MAYBE_FINISH_COMPOSITION and a new one is started, but
   only if `charbuf' has room for MAX_COMPOSITION_COMPONENTS entries
   and the closing ESC 1 is present in the remaining source.  If
   either condition fails, control jumps to `no_more_source'; in the
   latter case CODING_RESULT_INSUFFICIENT_SRC is recorded first.  */

#define DECODE_COMPOSITION_START(c1)                                    \
  do {                                                                  \
    if ((c1) == '0'                                                     \
        && composition_state == COMPOSING_COMPONENT_RULE)               \
      {                                                                 \
        component_len = component_idx;                                  \
        composition_state = COMPOSING_CHAR;                             \
      }                                                                 \
    else                                                                \
      {                                                                 \
        const unsigned char *p;                                         \
                                                                        \
        MAYBE_FINISH_COMPOSITION ();                                    \
        if (charbuf + MAX_COMPOSITION_COMPONENTS > charbuf_end)         \
          goto no_more_source;                                          \
        for (p = src; p < src_end - 1; p++)                             \
          if (*p == ISO_CODE_ESC && p[1] == '1')                        \
            break;                                                      \
        /* `>=' rather than `==': when SRC is already at SRC_END the   \
           scan starts beyond SRC_END - 1, and that too means that     \
           no ESC 1 was found.  */                                      \
        if (p >= src_end - 1)                                           \
          {                                                             \
            /* The current composition doesn't end in the current       \
               source.  */                                              \
            record_conversion_result                                    \
              (coding, CODING_RESULT_INSUFFICIENT_SRC);                 \
            goto no_more_source;                                        \
          }                                                             \
                                                                        \
        /* This is surely the start of a composition.  */               \
        method = ((c1) == '0' ? COMPOSITION_RELATIVE                    \
                  : (c1) == '2' ? COMPOSITION_WITH_RULE                 \
                  : (c1) == '3' ? COMPOSITION_WITH_ALTCHARS             \
                  : COMPOSITION_WITH_RULE_ALTCHARS);                    \
        composition_state = ((c1) <= '2' ? COMPOSING_CHAR               \
                             : COMPOSING_COMPONENT_CHAR);               \
        component_idx = component_len = 0;                              \
      }                                                                 \
  } while (0)
2828
2829
/* Handle composition end sequence ESC 1.

   Emit to `charbuf' the annotation produced by ADD_COMPOSITION_DATA
   for the finished composition, then (for every method except
   COMPOSITION_RELATIVE) the component list, and finally the composed
   characters themselves, advancing `char_offset' once per character.
   When components were appended, the first annotation slot is
   rewritten to the negated number of slots written so far.  For
   COMPOSITION_WITH_RULE the characters are the even-indexed entries
   of `components'; otherwise they are the entries from
   `component_len' on.  Afterwards `coding->annotated' is set and
   `composition_state' returns to COMPOSING_NO.  */

#define DECODE_COMPOSITION_END()                                        \
  do {                                                                  \
    int *annotation_head = charbuf;                                     \
    int nchars, k, first_char, stride;                                  \
                                                                        \
    if (component_len > 0)                                              \
      nchars = component_idx - component_len;                           \
    else if (method == COMPOSITION_RELATIVE)                            \
      nchars = component_idx;                                           \
    else                                                                \
      nchars = (component_idx + 1) / 2;                                 \
                                                                        \
    ADD_COMPOSITION_DATA (charbuf, nchars, method);                     \
    if (method != COMPOSITION_RELATIVE)                                 \
      {                                                                 \
        int ncomponents = (component_len == 0                           \
                           ? component_idx : component_len);            \
                                                                        \
        for (k = 0; k < ncomponents; k++)                               \
          *charbuf++ = components[k];                                   \
        *annotation_head = annotation_head - charbuf;                   \
      }                                                                 \
                                                                        \
    if (method == COMPOSITION_WITH_RULE)                                \
      first_char = 0, stride = 2;                                       \
    else                                                                \
      first_char = component_len, stride = 1;                           \
    for (k = first_char; k < component_idx; k += stride)                \
      {                                                                 \
        *charbuf++ = components[k];                                     \
        char_offset++;                                                  \
      }                                                                 \
    coding->annotated = 1;                                              \
    composition_state = COMPOSING_NO;                                   \
  } while (0)
2860
2861
/* Decode a composition rule from the byte C1 (and, for the newer
   two-byte format, one more byte read from SRC into `c2') and replace
   C1 with the encoded rule.

   After subtracting 32, a value below 81 is the old one-byte format
   (before ver.21): it packs two reference points as GREF * 9 + NREF,
   and a reference point of 4 is mapped to 10.  A value in 81..92 is
   the new format (after ver.21): the global reference point is the
   value minus 81 and the new reference point is the next byte minus
   32.  Any other value yields 0.  C1 must be an lvalue.  */

#define DECODE_COMPOSITION_RULE(c1)                                     \
  do {                                                                  \
    (c1) -= 32;                                                         \
    if (c1 >= 93)                                                       \
      c1 = 0;                                                           \
    else if (c1 >= 81)          /* new format (after ver.21) */         \
      {                                                                 \
        ONE_MORE_BYTE (c2);                                             \
        c1 = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32);                \
      }                                                                 \
    else                        /* old format (before ver.21) */        \
      {                                                                 \
        int global_ref = (c1) / 9;                                      \
        int new_ref = (c1) % 9;                                         \
                                                                        \
        global_ref = (global_ref == 4 ? 10 : global_ref);               \
        new_ref = (new_ref == 4 ? 10 : new_ref);                        \
        c1 = COMPOSITION_ENCODE_RULE (global_ref, new_ref);             \
      }                                                                 \
  } while (0)
2885
2886
2887 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
2888
2889 static void
2890 decode_coding_iso_2022 (coding)
2891      struct coding_system *coding;
2892 {
2893   const unsigned char *src = coding->source + coding->consumed;
2894   const unsigned char *src_end = coding->source + coding->src_bytes;
2895   const unsigned char *src_base;
2896   int *charbuf = coding->charbuf + coding->charbuf_used;
2897   int *charbuf_end
2898     = coding->charbuf + coding->charbuf_size - 4 - MAX_ANNOTATION_LENGTH;
2899   int consumed_chars = 0, consumed_chars_base;
2900   int multibytep = coding->src_multibyte;
2901   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
2902   int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2903   int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
2904   int charset_id_2, charset_id_3;
2905   struct charset *charset;
2906   int c;
2907   /* For handling composition sequence.  */
2908 #define COMPOSING_NO                    0
2909 #define COMPOSING_CHAR                  1
2910 #define COMPOSING_RULE                  2
2911 #define COMPOSING_COMPONENT_CHAR        3
2912 #define COMPOSING_COMPONENT_RULE        4
2913
2914   int composition_state = COMPOSING_NO;
2915   enum composition_method method;
2916   int components[MAX_COMPOSITION_COMPONENTS * 2 + 1];
2917   int component_idx;
2918   int component_len;
2919   Lisp_Object attrs, charset_list;
2920   int char_offset = coding->produced_char;
2921   int last_offset = char_offset;
2922   int last_id = charset_ascii;
2923
2924   CODING_GET_INFO (coding, attrs, charset_list);
2925   setup_iso_safe_charsets (attrs);
2926
2927   while (1)
2928     {
2929       int c1, c2;
2930
2931       src_base = src;
2932       consumed_chars_base = consumed_chars;
2933
2934       if (charbuf >= charbuf_end)
2935         break;
2936
2937       ONE_MORE_BYTE (c1);
2938       if (c1 < 0)
2939         goto invalid_code;
2940
2941       /* We produce at most one character.  */
2942       switch (iso_code_class [c1])
2943         {
2944         case ISO_0x20_or_0x7F:
2945           if (composition_state != COMPOSING_NO)
2946             {
2947               if (composition_state == COMPOSING_RULE
2948                   || composition_state == COMPOSING_COMPONENT_RULE)
2949                 {
2950                   DECODE_COMPOSITION_RULE (c1);
2951                   components[component_idx++] = c1;
2952                   composition_state--;
2953                   continue;
2954                 }
2955             }
2956           if (charset_id_0 < 0
2957               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
2958             /* This is SPACE or DEL.  */
2959             charset = CHARSET_FROM_ID (charset_ascii);
2960           else
2961             charset = CHARSET_FROM_ID (charset_id_0);
2962           break;
2963
2964         case ISO_graphic_plane_0:
2965           if (composition_state != COMPOSING_NO)
2966             {
2967               if (composition_state == COMPOSING_RULE
2968                   || composition_state == COMPOSING_COMPONENT_RULE)
2969                 {
2970                   DECODE_COMPOSITION_RULE (c1);
2971                   components[component_idx++] = c1;
2972                   composition_state--;
2973                   continue;
2974                 }
2975             }
2976           if (charset_id_0 < 0)
2977             charset = CHARSET_FROM_ID (charset_ascii);
2978           else
2979             charset = CHARSET_FROM_ID (charset_id_0);
2980           break;
2981
2982         case ISO_0xA0_or_0xFF:
2983           if (charset_id_1 < 0
2984               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
2985               || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
2986             goto invalid_code;
2987           /* This is a graphic character, we fall down ... */
2988
2989         case ISO_graphic_plane_1:
2990           if (charset_id_1 < 0)
2991             goto invalid_code;
2992           charset = CHARSET_FROM_ID (charset_id_1);
2993           break;
2994
2995         case ISO_control_0:
2996           MAYBE_FINISH_COMPOSITION ();
2997           charset = CHARSET_FROM_ID (charset_ascii);
2998           break;
2999
3000         case ISO_control_1:
3001           MAYBE_FINISH_COMPOSITION ();
3002           goto invalid_code;
3003
3004         case ISO_shift_out:
3005           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3006               || CODING_ISO_DESIGNATION (coding, 1) < 0)
3007             goto invalid_code;
3008           CODING_ISO_INVOCATION (coding, 0) = 1;
3009           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3010           continue;
3011
3012         case ISO_shift_in:
3013           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
3014             goto invalid_code;
3015           CODING_ISO_INVOCATION (coding, 0) = 0;
3016           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3017           continue;
3018
3019         case ISO_single_shift_2_7:
3020         case ISO_single_shift_2:
3021           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3022             goto invalid_code;
3023           /* SS2 is handled as an escape sequence of ESC 'N' */
3024           c1 = 'N';
3025           goto label_escape_sequence;
3026
3027         case ISO_single_shift_3:
3028           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3029             goto invalid_code;
3030           /* SS3 is handled as an escape sequence of ESC 'O' */
3031           c1 = 'O';
3032           goto label_escape_sequence;
3033
3034         case ISO_control_sequence_introducer:
3035           /* CSI is handled as an escape sequence of ESC '[' ...  */
3036           c1 = '[';
3037           goto label_escape_sequence;
3038
3039         case ISO_escape:
3040           ONE_MORE_BYTE (c1);
3041         label_escape_sequence:
3042           /* Escape sequences handled here are invocation,
3043              designation, direction specification, and character
3044              composition specification.  */
3045           switch (c1)
3046             {
3047             case '&':           /* revision of following character set */
3048               ONE_MORE_BYTE (c1);
3049               if (!(c1 >= '@' && c1 <= '~'))
3050                 goto invalid_code;
3051               ONE_MORE_BYTE (c1);
3052               if (c1 != ISO_CODE_ESC)
3053                 goto invalid_code;
3054               ONE_MORE_BYTE (c1);
3055               goto label_escape_sequence;
3056
3057             case '$':           /* designation of 2-byte character set */
3058               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3059                 goto invalid_code;
3060               {
3061                 int reg, chars96;
3062
3063                 ONE_MORE_BYTE (c1);
3064                 if (c1 >= '@' && c1 <= 'B')
3065                   {     /* designation of JISX0208.1978, GB2312.1980,
3066                            or JISX0208.1980 */
3067                     reg = 0, chars96 = 0;
3068                   }
3069                 else if (c1 >= 0x28 && c1 <= 0x2B)
3070                   { /* designation of DIMENSION2_CHARS94 character set */
3071                     reg = c1 - 0x28, chars96 = 0;
3072                     ONE_MORE_BYTE (c1);
3073                   }
3074                 else if (c1 >= 0x2C && c1 <= 0x2F)
3075                   { /* designation of DIMENSION2_CHARS96 character set */
3076                     reg = c1 - 0x2C, chars96 = 1;
3077                     ONE_MORE_BYTE (c1);
3078                   }
3079                 else
3080                   goto invalid_code;
3081                 DECODE_DESIGNATION (reg, 2, chars96, c1);
3082                 /* We must update these variables now.  */
3083                 if (reg == 0)
3084                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3085                 else if (reg == 1)
3086                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3087                 if (chars96 < 0)
3088                   goto invalid_code;
3089               }
3090               continue;
3091
3092             case 'n':           /* invocation of locking-shift-2 */
3093               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3094                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3095                 goto invalid_code;
3096               CODING_ISO_INVOCATION (coding, 0) = 2;
3097               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3098               continue;
3099
3100             case 'o':           /* invocation of locking-shift-3 */
3101               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3102                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3103                 goto invalid_code;
3104               CODING_ISO_INVOCATION (coding, 0) = 3;
3105               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3106               continue;
3107
3108             case 'N':           /* invocation of single-shift-2 */
3109               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3110                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3111                 goto invalid_code;
3112               charset_id_2 = CODING_ISO_DESIGNATION (coding, 2);
3113               if (charset_id_2 < 0)
3114                 charset = CHARSET_FROM_ID (charset_ascii);
3115               else
3116                 charset = CHARSET_FROM_ID (charset_id_2);
3117               ONE_MORE_BYTE (c1);
3118               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3119                 goto invalid_code;
3120               break;
3121
3122             case 'O':           /* invocation of single-shift-3 */
3123               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3124                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3125                 goto invalid_code;
3126               charset_id_3 = CODING_ISO_DESIGNATION (coding, 3);
3127               if (charset_id_3 < 0)
3128                 charset = CHARSET_FROM_ID (charset_ascii);
3129               else
3130                 charset = CHARSET_FROM_ID (charset_id_3);
3131               ONE_MORE_BYTE (c1);
3132               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3133                 goto invalid_code;
3134               break;
3135
3136             case '0': case '2': case '3': case '4': /* start composition */
3137               if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3138                 goto invalid_code;
3139               DECODE_COMPOSITION_START (c1);
3140               continue;
3141
3142             case '1':           /* end composition */
3143               if (composition_state == COMPOSING_NO)
3144                 goto invalid_code;
3145               DECODE_COMPOSITION_END ();
3146               continue;
3147
3148             case '[':           /* specification of direction */
3149               if (! CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION)
3150                 goto invalid_code;
3151               /* For the moment, nested direction is not supported.
3152                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
3153                  left-to-right, and nozero means right-to-left.  */
3154               ONE_MORE_BYTE (c1);
3155               switch (c1)
3156                 {
3157                 case ']':       /* end of the current direction */
3158                   coding->mode &= ~CODING_MODE_DIRECTION;
3159
3160                 case '0':       /* end of the current direction */
3161                 case '1':       /* start of left-to-right direction */
3162                   ONE_MORE_BYTE (c1);
3163                   if (c1 == ']')
3164                     coding->mode &= ~CODING_MODE_DIRECTION;
3165                   else
3166                     goto invalid_code;
3167                   break;
3168
3169                 case '2':       /* start of right-to-left direction */
3170                   ONE_MORE_BYTE (c1);
3171                   if (c1 == ']')
3172                     coding->mode |= CODING_MODE_DIRECTION;
3173                   else
3174                     goto invalid_code;
3175                   break;
3176
3177                 default:
3178                   goto invalid_code;
3179                 }
3180               continue;
3181
3182             case '%':
3183               ONE_MORE_BYTE (c1);
3184               if (c1 == '/')
3185                 {
3186                   /* CTEXT extended segment:
3187                      ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3188                      We keep these bytes as is for the moment.
3189                      They may be decoded by post-read-conversion.  */
3190                   int dim, M, L;
3191                   int size;
3192
3193                   ONE_MORE_BYTE (dim);
3194                   ONE_MORE_BYTE (M);
3195                   ONE_MORE_BYTE (L);
3196                   size = ((M - 128) * 128) + (L - 128);
3197                   if (charbuf + 8 + size > charbuf_end)
3198                     goto break_loop;
3199                   *charbuf++ = ISO_CODE_ESC;
3200                   *charbuf++ = '%';
3201                   *charbuf++ = '/';
3202                   *charbuf++ = dim;
3203                   *charbuf++ = BYTE8_TO_CHAR (M);
3204                   *charbuf++ = BYTE8_TO_CHAR (L);
3205                   while (size-- > 0)
3206                     {
3207                       ONE_MORE_BYTE (c1);
3208                       *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3209                     }
3210                 }
3211               else if (c1 == 'G')
3212                 {
3213                   /* XFree86 extension for embedding UTF-8 in CTEXT:
3214                      ESC % G --UTF-8-BYTES-- ESC % @
3215                      We keep these bytes as is for the moment.
3216                      They may be decoded by post-read-conversion.  */
3217                   int *p = charbuf;
3218
3219                   if (p + 6 > charbuf_end)
3220                     goto break_loop;
3221                   *p++ = ISO_CODE_ESC;
3222                   *p++ = '%';
3223                   *p++ = 'G';
3224                   while (p < charbuf_end)
3225                     {
3226                       ONE_MORE_BYTE (c1);
3227                       if (c1 == ISO_CODE_ESC
3228                           && src + 1 < src_end
3229                           && src[0] == '%'
3230                           && src[1] == '@')
3231                         {
3232                           src += 2;
3233                           break;
3234                         }
3235                       *p++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3236                     }
3237                   if (p + 3 > charbuf_end)
3238                     goto break_loop;
3239                   *p++ = ISO_CODE_ESC;
3240                   *p++ = '%';
3241                   *p++ = '@';
3242                   charbuf = p;
3243                 }
3244               else
3245                 goto invalid_code;
3246               continue;
3247               break;
3248
3249             default:
3250               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3251                 goto invalid_code;
3252               {
3253                 int reg, chars96;
3254
3255                 if (c1 >= 0x28 && c1 <= 0x2B)
3256                   { /* designation of DIMENSION1_CHARS94 character set */
3257                     reg = c1 - 0x28, chars96 = 0;
3258                     ONE_MORE_BYTE (c1);
3259                   }
3260                 else if (c1 >= 0x2C && c1 <= 0x2F)
3261                   { /* designation of DIMENSION1_CHARS96 character set */
3262                     reg = c1 - 0x2C, chars96 = 1;
3263                     ONE_MORE_BYTE (c1);
3264                   }
3265                 else
3266                   goto invalid_code;
3267                 DECODE_DESIGNATION (reg, 1, chars96, c1);
3268                 /* We must update these variables now.  */
3269                 if (reg == 0)
3270                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3271                 else if (reg == 1)
3272                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3273                 if (chars96 < 0)
3274                   goto invalid_code;
3275               }
3276               continue;
3277             }
3278         }
3279
3280       if (charset->id != charset_ascii
3281           && last_id != charset->id)
3282         {
3283           if (last_id != charset_ascii)
3284             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3285           last_id = charset->id;
3286           last_offset = char_offset;
3287         }
3288
3289       /* Now we know CHARSET and 1st position code C1 of a character.
3290          Produce a decoded character while getting 2nd position code
3291          C2 if necessary.  */
3292       c1 &= 0x7F;
3293       if (CHARSET_DIMENSION (charset) > 1)
3294         {
3295           ONE_MORE_BYTE (c2);
3296           if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
3297             /* C2 is not in a valid range.  */
3298             goto invalid_code;
3299           c1 = (c1 << 8) | (c2 & 0x7F);
3300           if (CHARSET_DIMENSION (charset) > 2)
3301             {
3302               ONE_MORE_BYTE (c2);
3303               if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
3304                 /* C2 is not in a valid range.  */
3305                 goto invalid_code;
3306               c1 = (c1 << 8) | (c2 & 0x7F);
3307             }
3308         }
3309
3310       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3311       if (c < 0)
3312         {
3313           MAYBE_FINISH_COMPOSITION ();
3314           for (; src_base < src; src_base++, char_offset++)
3315             {
3316               if (ASCII_BYTE_P (*src_base))
3317                 *charbuf++ = *src_base;
3318               else
3319                 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3320             }
3321         }
3322       else if (composition_state == COMPOSING_NO)
3323         {
3324           *charbuf++ = c;
3325           char_offset++;
3326         }
3327       else
3328         {
3329           components[component_idx++] = c;
3330           if (method == COMPOSITION_WITH_RULE
3331               || (method == COMPOSITION_WITH_RULE_ALTCHARS
3332                   && composition_state == COMPOSING_COMPONENT_CHAR))
3333             composition_state++;
3334         }
3335       continue;
3336
3337     invalid_code:
3338       MAYBE_FINISH_COMPOSITION ();
3339       src = src_base;
3340       consumed_chars = consumed_chars_base;
3341       ONE_MORE_BYTE (c);
3342       *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
3343       char_offset++;
3344       coding->errors++;
3345       continue;
3346
3347     break_loop:
3348       break;
3349     }
3350
3351  no_more_source:
3352   if (last_id != charset_ascii)
3353     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3354   coding->consumed_char += consumed_chars_base;
3355   coding->consumed = src_base - coding->source;
3356   coding->charbuf_used = charbuf - coding->charbuf;
3357 }
3358
3359
3360 /* ISO2022 encoding stuff.  */
3361
3362 /*
3363    It is not enough to say just "ISO2022" on encoding, we have to
3364    specify more details.  In Emacs, each coding system of ISO2022
3365    variant has the following specifications:
3366         1. Initial designation to G0 thru G3.
3367         2. Allows short-form designation?
3368         3. ASCII should be designated to G0 before control characters?
3369         4. ASCII should be designated to G0 at end of line?
3370         5. 7-bit environment or 8-bit environment?
3371         6. Use locking-shift?
3372         7. Use Single-shift?
3373    And the following two are only for Japanese:
3374         8. Use ASCII in place of JIS0201-1976-Roman?
3375         9. Use JISX0208-1983 in place of JISX0208-1978?
3376    These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
3377    defined by macros CODING_ISO_FLAG_XXX.  See `coding.h' for more
3378    details.
3379 */
3380
/* Produce, at DST, the escape sequence that designates CHARSET to
   graphic register REG, and record that designation in CODING.

   The sequence is built from these parts, in this order:
     ESC '&' <'@' + revision>   only if CODING has
                                CODING_ISO_FLAG_REVISION set and the
                                revision number of CHARSET is >= 0
     ESC
     '$'                        only if the dimension of CHARSET is
                                not 1
     <intermediate char>        "()*+"[REG] for a 94-char set,
                                ",-./"[REG] for a 96-char set
     <final-char of CHARSET>

   Short form: for a 94-char set of dimension other than 1 designated
   to register 0 whose <final-char> is '@', 'A', or 'B', the
   intermediate char is left out unless CODING has
   CODING_ISO_FLAG_LONG_FORM set.

   This macro expands in a context that provides whatever the EMIT_*
   macros (defined elsewhere) operate on.  */

#define ENCODE_DESIGNATION(charset, reg, coding)                        \
  do {                                                                  \
    unsigned char final_char = CHARSET_ISO_FINAL (charset);             \
    char *intermediate_char_94 = "()*+";                                \
    char *intermediate_char_96 = ",-./";                                \
    int revision                                                        \
      = ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION)         \
         ? CHARSET_ISO_REVISION (charset) : -1);                        \
                                                                        \
    if (revision >= 0)                                                  \
      {                                                                 \
        /* Optional prefix announcing the revision.  */                 \
        EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&');                       \
        EMIT_ONE_BYTE ('@' + revision);                                 \
      }                                                                 \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC);                                 \
    if (CHARSET_DIMENSION (charset) != 1)                               \
      {                                                                 \
        EMIT_ONE_ASCII_BYTE ('$');                                      \
        if (CHARSET_ISO_CHARS_96 (charset))                             \
          EMIT_ONE_ASCII_BYTE (intermediate_char_96[reg]);              \
        else if ((CODING_ISO_FLAGS (coding)                             \
                  & CODING_ISO_FLAG_LONG_FORM)                          \
                 || reg != 0                                            \
                 || final_char < '@' || final_char > 'B')               \
          /* Not eligible for the short form.  */                       \
          EMIT_ONE_ASCII_BYTE (intermediate_char_94[reg]);              \
      }                                                                 \
    else if (CHARSET_ISO_CHARS_96 (charset))                            \
      EMIT_ONE_ASCII_BYTE (intermediate_char_96[reg]);                  \
    else                                                                \
      EMIT_ONE_ASCII_BYTE (intermediate_char_94[reg]);                  \
    EMIT_ONE_ASCII_BYTE (final_char);                                   \
                                                                        \
    CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset);        \
  } while (0)
3428
3429
/* Single-shift emitters (single-shift-2 and single-shift-3).

   Each macro outputs the single-shift code -- the one byte
   ISO_CODE_SS2 (resp. ISO_CODE_SS3) unless CODING has
   CODING_ISO_FLAG_SEVEN_BITS set, in which case the escape sequence
   ESC 'N' (resp. ESC 'O') is used instead -- and then sets the
   single-shifting state of CODING to 1.  The character-producing
   macros test that state and clear it after emitting one character.

   Both macros expand in a context that provides `coding' and
   whatever the EMIT_* macros (defined elsewhere) operate on.  */

#define ENCODE_SINGLE_SHIFT_2                                           \
  do {                                                                  \
    if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))     \
      EMIT_ONE_BYTE (ISO_CODE_SS2);                                     \
    else                                                                \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N');                         \
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
  } while (0)


#define ENCODE_SINGLE_SHIFT_3                                           \
  do {                                                                  \
    if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))     \
      EMIT_ONE_BYTE (ISO_CODE_SS3);                                     \
    else                                                                \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O');                         \
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
  } while (0)
3452
3453
/* The following four macros produce codes (control character or
   escape sequence) for ISO2022 locking-shift functions (shift-in,
   shift-out, locking-shift-2, and locking-shift-3).  Each one also
   records in CODING the number of the graphic register that is now
   invoked to graphic plane 0.

   They expand in a context that provides `coding' and whatever the
   EMIT_* macros (defined elsewhere) operate on.  */

/* Shift-in: emit ISO_CODE_SI; register 0 becomes invoked.  */
#define ENCODE_SHIFT_IN                                 \
  do {                                                  \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_SI);                  \
    CODING_ISO_INVOCATION (coding, 0) = 0;              \
  } while (0)


/* Shift-out: emit ISO_CODE_SO; register 1 becomes invoked.  */
#define ENCODE_SHIFT_OUT                                \
  do {                                                  \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_SO);                  \
    CODING_ISO_INVOCATION (coding, 0) = 1;              \
  } while (0)


/* Locking-shift-2: emit ESC 'n'; register 2 becomes invoked.  */
#define ENCODE_LOCKING_SHIFT_2                          \
  do {                                                  \
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');           \
    CODING_ISO_INVOCATION (coding, 0) = 2;              \
  } while (0)


/* Locking-shift-3: emit ESC 'o'; register 3 becomes invoked.  The
   final byte has to be 'o': ESC 'n' is locking-shift-2, and the
   ISO2022 decoder in this file accepts ESC 'n' as locking-shift-2
   and ESC 'o' as locking-shift-3.  */
#define ENCODE_LOCKING_SHIFT_3                          \
  do {                                                  \
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'o');           \
    CODING_ISO_INVOCATION (coding, 0) = 3;              \
  } while (0)
3484
3485
/* Produce codes for a DIMENSION1 character whose character set is
   CHARSET and whose position-code is C1.  Designation and invocation
   sequences are also produced in advance if necessary.

   CHARSET must be a modifiable lvalue: if CODING has
   CODING_ISO_FLAG_USE_ROMAN set and CHARSET is ASCII, CHARSET is
   replaced by the charset whose id is charset_jisx0201_roman.

   The body is a loop that is left as soon as C1 has been emitted:
     - a single shift is pending: emit C1 with the high bit clear if
       CODING has CODING_ISO_FLAG_SEVEN_BITS set, else with the high
       bit set, and clear the pending state;
     - CHARSET is invoked to graphic plane 0: emit C1 with the high
       bit clear;
     - CHARSET is invoked to graphic plane 1: emit C1 with the high
       bit set;
     - otherwise call encode_invocation_designation and try again.

   C1 is parenthesized wherever it is used so that an expression
   argument is masked as a whole.  The macro expands in a context that
   provides `coding', `dst' and `produced_chars'.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
  do {                                                                  \
    int id = CHARSET_ID (charset);                                      \
                                                                        \
    if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)         \
        && id == charset_ascii)                                         \
      {                                                                 \
        id = charset_jisx0201_roman;                                    \
        charset = CHARSET_FROM_ID (id);                                 \
      }                                                                 \
                                                                        \
    if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
      {                                                                 \
        if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
          EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                            \
        else                                                            \
          EMIT_ONE_BYTE ((c1) | 0x80);                                  \
        CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
        break;                                                          \
      }                                                                 \
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
      {                                                                 \
        EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                              \
        break;                                                          \
      }                                                                 \
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
      {                                                                 \
        EMIT_ONE_BYTE ((c1) | 0x80);                                    \
        break;                                                          \
      }                                                                 \
    else                                                                \
      /* Since CHARSET is not yet invoked to any graphic planes, we     \
         must invoke it, or, at first, designate it to some graphic     \
         register.  Then repeat the loop to actually produce the        \
         character.  */                                                 \
      dst = encode_invocation_designation (charset, coding, dst,        \
                                           &produced_chars);            \
  } while (1)
3528
3529
/* Two-byte counterpart of ENCODE_ISO_CHARACTER_DIMENSION1: emit the
   position-codes C1 and C2 of a character of CHARSET, preceded by any
   designation and invocation codes that are still needed.

   CHARSET must be a modifiable lvalue: if CODING has
   CODING_ISO_FLAG_USE_OLDJIS set and the id of CHARSET is
   charset_jisx0208, CHARSET is replaced by the charset whose id is
   charset_jisx0208_1978.

   The body loops until one of the three emitting cases applies; each
   of them leaves the loop with `break':
     - a single shift is pending (bytes go out with the high bit clear
       if CODING has CODING_ISO_FLAG_SEVEN_BITS set, else with the
       high bit set; the pending state is then cleared);
     - CHARSET is invoked to graphic plane 0 (high bit clear);
     - CHARSET is invoked to graphic plane 1 (high bit set).
   If none applies, encode_invocation_designation is called and the
   loop repeats.

   The macro expands in a context that provides `coding', `dst' and
   `produced_chars'.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
  do {                                                                  \
    int id = CHARSET_ID (charset);                                      \
                                                                        \
    if (id == charset_jisx0208                                          \
        && (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS))    \
      {                                                                 \
        id = charset_jisx0208_1978;                                     \
        charset = CHARSET_FROM_ID (id);                                 \
      }                                                                 \
                                                                        \
    if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
      {                                                                 \
        if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)) \
          EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                    \
        else                                                            \
          EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);              \
        CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
        break;                                                          \
      }                                                                 \
                                                                        \
    if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))                   \
      {                                                                 \
        EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);                \
        break;                                                          \
      }                                                                 \
                                                                        \
    if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))                   \
      {                                                                 \
        EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                      \
        break;                                                          \
      }                                                                 \
                                                                        \
    /* CHARSET is not invoked to either graphic plane yet.  Have it     \
       designated and/or invoked, then go round again to emit the       \
       character itself.  */                                            \
    dst = encode_invocation_designation (charset, coding, dst,          \
                                         &produced_chars);              \
  } while (1)
3572
3573
/* Encode character C of CHARSET.  C is first converted to its code in
   CHARSET with ENCODE_CHAR.  If the dimension of CHARSET is 1 the code
   is passed as is to ENCODE_ISO_CHARACTER_DIMENSION1; otherwise it is
   split into (code >> 8) and (code & 0xFF) and passed to
   ENCODE_ISO_CHARACTER_DIMENSION2.  CHARSET must be a modifiable
   lvalue because those macros may assign to it.  */
#define ENCODE_ISO_CHARACTER(charset, c)                                   \
  do {                                                                     \
    int code = ENCODE_CHAR ((charset),(c));                                \
                                                                           \
    if (CHARSET_DIMENSION (charset) != 1)                                  \
      ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
    else                                                                   \
      ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code);                   \
  } while (0)
3583
3584
3585 /* Produce designation and invocation codes at a place pointed by DST
3586    to use CHARSET.  The element `spec.iso_2022' of *CODING is updated.
3587    Return new DST.  */
3588
3589 unsigned char *
3590 encode_invocation_designation (charset, coding, dst, p_nchars)
3591      struct charset *charset;
3592      struct coding_system *coding;
3593      unsigned char *dst;
3594      int *p_nchars;
3595 {
3596   int multibytep = coding->dst_multibyte;
3597   int produced_chars = *p_nchars;
3598   int reg;                      /* graphic register number */
3599   int id = CHARSET_ID (charset);
3600
3601   /* At first, check designations.  */
3602   for (reg = 0; reg < 4; reg++)
3603     if (id == CODING_ISO_DESIGNATION (coding, reg))
3604       break;
3605
3606   if (reg >= 4)
3607     {
3608       /* CHARSET is not yet designated to any graphic registers.  */
3609       /* At first check the requested designation.  */
3610       reg = CODING_ISO_REQUEST (coding, id);
3611       if (reg < 0)
3612         /* Since CHARSET requests no special designation, designate it
3613            to graphic register 0.  */
3614         reg = 0;
3615
3616       ENCODE_DESIGNATION (charset, reg, coding);
3617     }
3618
3619   if (CODING_ISO_INVOCATION (coding, 0) != reg
3620       && CODING_ISO_INVOCATION (coding, 1) != reg)
3621     {
3622       /* Since the graphic register REG is not invoked to any graphic
3623          planes, invoke it to graphic plane 0.  */
3624       switch (reg)
3625         {
3626         case 0:                 /* graphic register 0 */
3627           ENCODE_SHIFT_IN;
3628           break;
3629
3630         case 1:                 /* graphic register 1 */
3631           ENCODE_SHIFT_OUT;
3632           break;
3633
3634         case 2:                 /* graphic register 2 */
3635           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3636             ENCODE_SINGLE_SHIFT_2;
3637           else
3638             ENCODE_LOCKING_SHIFT_2;
3639           break;
3640
3641         case 3:                 /* graphic register 3 */
3642           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3643             ENCODE_SINGLE_SHIFT_3;
3644           else
3645             ENCODE_LOCKING_SHIFT_3;
3646           break;
3647         }
3648     }
3649
3650   *p_nchars = produced_chars;
3651   return dst;
3652 }
3653
/* The following three macros produce codes for indicating direction
   of text.  They expand in a context that provides `coding' and
   whatever the EMIT_* macros (defined elsewhere) operate on.  */

/* Emit a control sequence introducer: the escape sequence ESC '[' if
   CODING has CODING_ISO_FLAG_SEVEN_BITS set, the single byte
   ISO_CODE_CSI otherwise.  The flag is tested as one bit of
   CODING_ISO_FLAGS, as the neighboring macros do, so that other flag
   bits being set does not change the outcome.  This is an object-like
   macro; it takes no arguments.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER                              \
  do {                                                                  \
    if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)         \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '[');                         \
    else                                                                \
      EMIT_ONE_BYTE (ISO_CODE_CSI);                                     \
  } while (0)


/* Emit CSI '2' ']', which the ISO2022 decoder in this file reads as
   the start of right-to-left direction.  */
#define ENCODE_DIRECTION_R2L()                  \
  do {                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
    EMIT_TWO_ASCII_BYTES ('2', ']');            \
  } while (0)


/* Emit CSI '0' ']', which the ISO2022 decoder in this file reads as
   the end of the current direction.  */
#define ENCODE_DIRECTION_L2R()                  \
  do {                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
    EMIT_TWO_ASCII_BYTES ('0', ']');            \
  } while (0)
3677
3678
3679 /* Produce codes for designation and invocation to reset the graphic
3680    planes and registers to initial state.  Refers to the local `coding'.  */
3681 #define ENCODE_RESET_PLANE_AND_REGISTER()                               \
3682   do {                                                                  \
3683     int reg;                                                            \
3684     struct charset *charset;                                            \
3685     /* Shift in unless graphic plane 0 already invokes register 0.  */  \
3686     if (CODING_ISO_INVOCATION (coding, 0) != 0)                         \
3687       ENCODE_SHIFT_IN;                                                  \
3688     for (reg = 0; reg < 4; reg++) /* re-designate changed registers */  \
3689       if (CODING_ISO_INITIAL (coding, reg) >= 0                         \
3690           && (CODING_ISO_DESIGNATION (coding, reg)                      \
3691               != CODING_ISO_INITIAL (coding, reg)))                     \
3692         {                                                               \
3693           charset = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg)); \
3694           ENCODE_DESIGNATION (charset, reg, coding);                    \
3695         }                                                               \
3696   } while (0)
3697
3698
3699 /* Produce designation sequences of charsets in the line started from
3700    CHARBUF to a place pointed by DST, and return updated DST.  The scan
3701    stops at a newline, at CHARBUF_END, or once four registers are found.
3702    If the current block ends before any end-of-line, we may fail to
3703    find all the necessary designations.  */
3704
3705 static unsigned char *
3706 encode_designation_at_bol (coding, charbuf, charbuf_end, dst)
3707      struct coding_system *coding;
3708      int *charbuf, *charbuf_end;
3709      unsigned char *dst;
3710 {
3711   struct charset *charset;
3712   /* Table of charsets to be designated to each graphic register.  */
3713   int r[4];
3714   int c, found = 0, reg;
3715   int produced_chars = 0;
3716   int multibytep = coding->dst_multibyte;
3717   Lisp_Object attrs;
3718   Lisp_Object charset_list;
3719
3720   attrs = CODING_ID_ATTRS (coding->id);
3721   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
3722   if (EQ (charset_list, Qiso_2022))
3723     charset_list = Viso_2022_charset_list;
3724
3725   for (reg = 0; reg < 4; reg++)
3726     r[reg] = -1;               /* -1: nothing requested for REG yet */
3727   /* Never read past CHARBUF_END when the block holds no newline.  */
3728   while (charbuf < charbuf_end && found < 4)
3729     {
3730       int id;
3731
3732       c = *charbuf++;
3733       if (c == '\n')
3734         break;
3735       charset = char_charset (c, charset_list, NULL);
3736       id = CHARSET_ID (charset);
3737       reg = CODING_ISO_REQUEST (coding, id);
3738       if (reg >= 0 && r[reg] < 0)      /* first charset seen for REG wins */
3739         {
3740           found++;
3741           r[reg] = id;
3742         }
3743     }
3744
3745   if (found)
3746     {
3747       for (reg = 0; reg < 4; reg++)
3748         if (r[reg] >= 0
3749             && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
3750           ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
3751     }
3752
3753   return dst;
3754 }
3755
3756 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
3757    Encode the characters in CODING->charbuf as ISO-2022 bytes.  */
3758 static int
3759 encode_coding_iso_2022 (coding)
3760      struct coding_system *coding;
3761 {
3762   int multibytep = coding->dst_multibyte;
3763   int *charbuf = coding->charbuf;
3764   int *charbuf_end = charbuf + coding->charbuf_used;
3765   unsigned char *dst = coding->destination + coding->produced;
3766   unsigned char *dst_end = coding->destination + coding->dst_bytes;
3767   int safe_room = 16;
3768   int bol_designation
3769     = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
3770        && CODING_ISO_BOL (coding));
3771   int produced_chars = 0;
3772   Lisp_Object attrs, eol_type, charset_list;
3773   int ascii_compatible;
3774   int c;
3775   int preferred_charset_id = -1;
3776
3777   CODING_GET_INFO (coding, attrs, charset_list);
3778   eol_type = CODING_ID_EOL_TYPE (coding->id);
3779   if (VECTORP (eol_type))
3780     eol_type = Qunix;
3781
3782   setup_iso_safe_charsets (attrs);
3783   /* Charset list may have been changed.  */
3784   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
3785   coding->safe_charsets = (char *) SDATA (CODING_ATTR_SAFE_CHARSETS(attrs));
3786
3787   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
3788
3789   while (charbuf < charbuf_end)
3790     {
3791       ASSURE_DESTINATION (safe_room);
3792
3793       if (bol_designation)
3794         {
3795           unsigned char *dst_prev = dst;
3796
3797           /* We have to produce designation sequences if any now.  */
3798           dst = encode_designation_at_bol (coding, charbuf, charbuf_end, dst);
3799           bol_designation = 0;
3800           /* We are sure that designation sequences are all ASCII bytes.  */
3801           produced_chars += dst - dst_prev;
3802         }
3803
3804       c = *charbuf++;
3805
3806       if (c < 0)
3807         {
3808           /* Handle an annotation.  */
3809           switch (*charbuf)
3810             {
3811             case CODING_ANNOTATE_COMPOSITION_MASK:
3812               /* Not yet implemented.  */
3813               break;
3814             case CODING_ANNOTATE_CHARSET_MASK:
3815               preferred_charset_id = charbuf[2];
3816               if (preferred_charset_id >= 0
3817                   && NILP (Fmemq (make_number (preferred_charset_id),
3818                                   charset_list)))
3819                 preferred_charset_id = -1;     /* not in our charset list */
3820               break;
3821             default:
3822               abort ();
3823             }
3824           charbuf += -c - 1;   /* step over the remaining -C - 1 elements */
3825           continue;
3826         }
3827
3828       /* Now encode the character C.  */
3829       if (c < 0x20 || c == 0x7F)
3830         {
3831           if (c == '\n'
3832               || (c == '\r' && EQ (eol_type, Qmac)))
3833             {
3834               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
3835                 ENCODE_RESET_PLANE_AND_REGISTER ();
3836               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
3837                 {
3838                   int i;
3839
3840                   for (i = 0; i < 4; i++)
3841                     CODING_ISO_DESIGNATION (coding, i)
3842                       = CODING_ISO_INITIAL (coding, i);
3843                 }
3844               bol_designation
3845                 = CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL;
3846             }
3847           else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
3848             ENCODE_RESET_PLANE_AND_REGISTER ();
3849           EMIT_ONE_ASCII_BYTE (c);
3850         }
3851       else if (ASCII_CHAR_P (c))
3852         {
3853           if (ascii_compatible)
3854             EMIT_ONE_ASCII_BYTE (c);
3855           else
3856             {
3857               struct charset *charset = CHARSET_FROM_ID (charset_ascii);
3858               ENCODE_ISO_CHARACTER (charset, c);
3859             }
3860         }
3861       else if (CHAR_BYTE8_P (c))
3862         {
3863           c = CHAR_TO_BYTE8 (c);
3864           EMIT_ONE_BYTE (c);
3865         }
3866       else
3867         {
3868           struct charset *charset;
3869
3870           if (preferred_charset_id >= 0)
3871             {
3872               charset = CHARSET_FROM_ID (preferred_charset_id);
3873               if (! CHAR_CHARSET_P (c, charset))
3874                 charset = char_charset (c, charset_list, NULL);
3875             }
3876           else
3877             charset = char_charset (c, charset_list, NULL);
3878           if (!charset)        /* C is in none of the listed charsets */
3879             {
3880               if (coding->mode & CODING_MODE_SAFE_ENCODING)
3881                 {
3882                   c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
3883                   charset = CHARSET_FROM_ID (charset_ascii);
3884                 }
3885               else
3886                 {
3887                   c = coding->default_char;
3888                   charset = char_charset (c, charset_list, NULL);
3889                 }
3890             }
3891           ENCODE_ISO_CHARACTER (charset, c);
3892         }
3893     }
3894
3895   if (coding->mode & CODING_MODE_LAST_BLOCK
3896       && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
3897     {
3898       ASSURE_DESTINATION (safe_room);
3899       ENCODE_RESET_PLANE_AND_REGISTER ();
3900     }
3901   record_conversion_result (coding, CODING_RESULT_SUCCESS);
3902   CODING_ISO_BOL (coding) = bol_designation;   /* carried to the next call */
3903   coding->produced_char += produced_chars;
3904   coding->produced = dst - coding->destination;
3905   return 0;
3906 }
3907
3908 \f
3909 /*** 8. SJIS and BIG5 handlers ***/
3910
3911 /* Although SJIS and BIG5 are not ISO coding systems, they are used
3912    quite widely.  So, for the moment, Emacs supports them in the bare
3913    C code.  But, in the future, they may be supported only by CCL.  */
3914
3915 /* SJIS is a coding system encoding three character sets: ASCII, right
3916    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
3917    as is.  A character of charset katakana-jisx0201 is encoded by
3918    "position-code + 0x80".  A character of charset japanese-jisx0208
3919    is encoded in 2 bytes, but the two position-codes are divided and
3920    shifted so that they fit in the ranges below.
3921
3922    --- CODE RANGE of SJIS ---
3923    (character set)      (range)
3924    ASCII                0x00 .. 0x7F
3925    KATAKANA-JISX0201    0xA1 .. 0xDF
3926    JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
3927             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
3928    -------------------------------
3929
3930 */
3931
3932 /* BIG5 is a coding system encoding two character sets: ASCII and
3933    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
3934    character set and is encoded in two bytes.
3935
3936    --- CODE RANGE of BIG5 ---
3937    (character set)      (range)
3938    ASCII                0x00 .. 0x7F
3939    Big5 (1st byte)      0xA1 .. 0xFE
3940         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
3941    --------------------------
3942
3943   */
3944
3945 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3946    Check if a text is encoded in SJIS.  Return 1 if the source ends without
3947    an invalid sequence (DETECT_INFO->found records any match), else 0.  */
3948
3949 static int
3950 detect_coding_sjis (coding, detect_info)
3951      struct coding_system *coding;
3952      struct coding_detection_info *detect_info;
3953 {
3954   const unsigned char *src = coding->source, *src_base;
3955   const unsigned char *src_end = coding->source + coding->src_bytes;
3956   int multibytep = coding->src_multibyte;
3957   int consumed_chars = 0;
3958   int found = 0;
3959   int c;
3960
3961   detect_info->checked |= CATEGORY_MASK_SJIS;
3962   /* A coding system of this category is always ASCII compatible.  */
3963   src += coding->head_ascii;
3964
3965   while (1)
3966     {
3967       src_base = src;          /* start of the sequence being examined */
3968       ONE_MORE_BYTE (c);
3969       if (c < 0x80)
3970         continue;
3971       if ((c >= 0x81 && c <= 0x9F) || (c >= 0xE0 && c <= 0xEF))
3972         {
3973           ONE_MORE_BYTE (c);   /* second byte of a two-byte sequence */
3974           if (c < 0x40 || c == 0x7F || c > 0xFC)
3975             break;
3976           found = CATEGORY_MASK_SJIS;
3977         }
3978       else if (c >= 0xA0 && c < 0xE0)
3979         found = CATEGORY_MASK_SJIS;
3980       else
3981         break;
3982     }
3983   detect_info->rejected |= CATEGORY_MASK_SJIS;        /* invalid byte seen */
3984   return 0;
3985
3986  no_more_source:
3987   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
3988     {  /* The last block ended in the middle of a sequence.  */
3989       detect_info->rejected |= CATEGORY_MASK_SJIS;
3990       return 0;
3991     }
3992   detect_info->found |= found;
3993   return 1;
3994 }
3995
3996 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3997    Check if a text is encoded in BIG5.  Return 1 if the source ends without
3998    an invalid sequence (DETECT_INFO->found records any match), else 0.  */
3999
4000 static int
4001 detect_coding_big5 (coding, detect_info)
4002      struct coding_system *coding;
4003      struct coding_detection_info *detect_info;
4004 {
4005   const unsigned char *src = coding->source, *src_base;
4006   const unsigned char *src_end = coding->source + coding->src_bytes;
4007   int multibytep = coding->src_multibyte;
4008   int consumed_chars = 0;
4009   int found = 0;
4010   int c;
4011
4012   detect_info->checked |= CATEGORY_MASK_BIG5;
4013   /* A coding system of this category is always ASCII compatible.  */
4014   src += coding->head_ascii;
4015
4016   while (1)
4017     {
4018       src_base = src;          /* start of the sequence being examined */
4019       ONE_MORE_BYTE (c);
4020       if (c < 0x80)
4021         continue;
4022       if (c >= 0xA1)
4023         {
4024           ONE_MORE_BYTE (c);   /* second byte of a two-byte sequence */
4025           if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
4026             return 0; /* NOTE(review): `rejected' is not set here -- confirm.  */
4027           found = CATEGORY_MASK_BIG5;
4028         }
4029       else
4030         break;
4031     }
4032   detect_info->rejected |= CATEGORY_MASK_BIG5;        /* invalid lead byte */
4033   return 0;
4034
4035  no_more_source:
4036   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4037     {  /* The last block ended in the middle of a sequence.  */
4038       detect_info->rejected |= CATEGORY_MASK_BIG5;
4039       return 0;
4040     }
4041   detect_info->found |= found;
4042   return 1;
4043 }
4044
4045 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
4046    Decode SJIS text from CODING->source into CODING->charbuf.  */
4047
4048 static void
4049 decode_coding_sjis (coding)
4050      struct coding_system *coding;
4051 {
4052   const unsigned char *src = coding->source + coding->consumed;
4053   const unsigned char *src_end = coding->source + coding->src_bytes;
4054   const unsigned char *src_base;
4055   int *charbuf = coding->charbuf + coding->charbuf_used;
4056   int *charbuf_end     /* keeps MAX_ANNOTATION_LENGTH slots in reserve */
4057     = coding->charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
4058   int consumed_chars = 0, consumed_chars_base;
4059   int multibytep = coding->src_multibyte;
4060   struct charset *charset_roman, *charset_kanji, *charset_kana;
4061   struct charset *charset_kanji2;
4062   Lisp_Object attrs, charset_list, val;
4063   int char_offset = coding->produced_char;
4064   int last_offset = char_offset;
4065   int last_id = charset_ascii;
4066
4067   CODING_GET_INFO (coding, attrs, charset_list);
4068   /* The charset list holds roman, kana, kanji and optionally kanji2.  */
4069   val = charset_list;
4070   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4071   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4072   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4073   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4074
4075   while (1)
4076     {
4077       int c, c1;
4078       struct charset *charset;
4079
4080       src_base = src;  /* restart point if this sequence is incomplete */
4081       consumed_chars_base = consumed_chars;
4082
4083       if (charbuf >= charbuf_end)
4084         break;
4085
4086       ONE_MORE_BYTE (c);
4087       if (c < 0)
4088         goto invalid_code;
4089       if (c < 0x80)
4090         charset = charset_roman;
4091       else if (c == 0x80 || c == 0xA0)
4092         goto invalid_code;
4093       else if (c >= 0xA1 && c <= 0xDF)
4094         {
4095           /* SJIS -> JISX0201-Kana */
4096           c &= 0x7F;
4097           charset = charset_kana;
4098         }
4099       else if (c <= 0xEF)      /* here C is 0x81..0x9F or 0xE0..0xEF */
4100         {
4101           /* SJIS -> JISX0208 */
4102           ONE_MORE_BYTE (c1);
4103           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4104             goto invalid_code;
4105           c = (c << 8) | c1;
4106           SJIS_TO_JIS (c);
4107           charset = charset_kanji;
4108         }
4109       else if (c <= 0xFC && charset_kanji2)
4110         {
4111           /* SJIS -> JISX0213-2 */
4112           ONE_MORE_BYTE (c1);
4113           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4114             goto invalid_code;
4115           c = (c << 8) | c1;
4116           SJIS_TO_JIS2 (c);
4117           charset = charset_kanji2;
4118         }
4119       else
4120         goto invalid_code;
4121       if (charset->id != charset_ascii
4122           && last_id != charset->id)
4123         {      /* A run of a new non-ASCII charset starts here.  */
4124           if (last_id != charset_ascii)
4125             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4126           last_id = charset->id;
4127           last_offset = char_offset;
4128         }
4129       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4130       *charbuf++ = c;
4131       char_offset++;
4132       continue;
4133
4134     invalid_code:      /* re-read one byte from SRC_BASE and store it raw */
4135       src = src_base;
4136       consumed_chars = consumed_chars_base;
4137       ONE_MORE_BYTE (c);
4138       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4139       char_offset++;
4140       coding->errors++;
4141     }
4142
4143  no_more_source:
4144   if (last_id != charset_ascii)
4145     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4146   coding->consumed_char += consumed_chars_base;
4147   coding->consumed = src_base - coding->source;
4148   coding->charbuf_used = charbuf - coding->charbuf;
4149 }
4150 /* Decode BIG5 text from CODING->source into CODING->charbuf.  */
4151 static void
4152 decode_coding_big5 (coding)
4153      struct coding_system *coding;
4154 {
4155   const unsigned char *src = coding->source + coding->consumed;
4156   const unsigned char *src_end = coding->source + coding->src_bytes;
4157   const unsigned char *src_base;
4158   int *charbuf = coding->charbuf + coding->charbuf_used;
4159   int *charbuf_end     /* keeps MAX_ANNOTATION_LENGTH slots in reserve */
4160     = coding->charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
4161   int consumed_chars = 0, consumed_chars_base;
4162   int multibytep = coding->src_multibyte;
4163   struct charset *charset_roman, *charset_big5;
4164   Lisp_Object attrs, charset_list, val;
4165   int char_offset = coding->produced_char;
4166   int last_offset = char_offset;
4167   int last_id = charset_ascii;
4168
4169   CODING_GET_INFO (coding, attrs, charset_list);
4170   val = charset_list;  /* first element: roman, second element: Big5 */
4171   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4172   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4173
4174   while (1)
4175     {
4176       int c, c1;
4177       struct charset *charset;
4178
4179       src_base = src;  /* restart point if this sequence is incomplete */
4180       consumed_chars_base = consumed_chars;
4181
4182       if (charbuf >= charbuf_end)
4183         break;
4184
4185       ONE_MORE_BYTE (c);
4186
4187       if (c < 0)
4188         goto invalid_code;
4189       if (c < 0x80)
4190         charset = charset_roman;
4191       else
4192         {
4193           /* BIG5 -> Big5 */
4194           if (c < 0xA1 || c > 0xFE)
4195             goto invalid_code;
4196           ONE_MORE_BYTE (c1);
4197           if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4198             goto invalid_code;
4199           c = c << 8 | c1;
4200           charset = charset_big5;
4201         }
4202       if (charset->id != charset_ascii
4203           && last_id != charset->id)
4204         {      /* A run of a new non-ASCII charset starts here.  */
4205           if (last_id != charset_ascii)
4206             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4207           last_id = charset->id;
4208           last_offset = char_offset;
4209         }
4210       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4211       *charbuf++ = c;
4212       char_offset++;
4213       continue;
4214
4215     invalid_code:      /* re-read one byte from SRC_BASE and store it raw */
4216       src = src_base;
4217       consumed_chars = consumed_chars_base;
4218       ONE_MORE_BYTE (c);
4219       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4220       char_offset++;
4221       coding->errors++;
4222     }
4223
4224  no_more_source:
4225   if (last_id != charset_ascii)
4226     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4227   coding->consumed_char += consumed_chars_base;
4228   coding->consumed = src_base - coding->source;
4229   coding->charbuf_used = charbuf - coding->charbuf;
4230 }
4231
4232 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4233    Encode the characters in CODING->charbuf as SJIS.  The charset list
4234    of CODING supplies, in order, the roman, kana and kanji charsets and
4235    optionally a second kanji charset (JISX0213 plane 2 in the decoder).
4236    Eight-bit raw bytes are emitted as is.  A character found in none of
4237    the listed charsets is replaced by CODING->default_char, or by code
4238    CODING_INHIBIT_CHARACTER_SUBSTITUTION in safe-encoding mode.  */
4239
4240 static int
4241 encode_coding_sjis (coding)
4242      struct coding_system *coding;
4243 {
4244   int multibytep = coding->dst_multibyte;
4245   int *charbuf = coding->charbuf;
4246   int *charbuf_end = charbuf + coding->charbuf_used;
4247   unsigned char *dst = coding->destination + coding->produced;
4248   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4249   int safe_room = 4;
4250   int produced_chars = 0;
4251   Lisp_Object attrs, charset_list, val;
4252   int ascii_compatible;
4253   struct charset *charset_roman, *charset_kanji, *charset_kana;
4254   struct charset *charset_kanji2;
4255   int c;
4256
4257   CODING_GET_INFO (coding, attrs, charset_list);
4258   val = charset_list;
4259   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4260   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4261   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4262   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4263
4264   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4265
4266   while (charbuf < charbuf_end)
4267     {
4268       ASSURE_DESTINATION (safe_room);
4269       c = *charbuf++;
4270       /* Now encode the character C.  */
4271       if (ASCII_CHAR_P (c) && ascii_compatible)
4272         EMIT_ONE_ASCII_BYTE (c);
4273       else if (CHAR_BYTE8_P (c))
4274         {
4275           c = CHAR_TO_BYTE8 (c);
4276           EMIT_ONE_BYTE (c);
4277         }
4278       else
4279         {
4280           unsigned code;
4281           struct charset *charset = char_charset (c, charset_list, &code);
4282
4283           if (!charset)        /* C is in none of the listed charsets */
4284             {
4285               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4286                 {
4287                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4288                   charset = CHARSET_FROM_ID (charset_ascii);
4289                 }
4290               else
4291                 {
4292                   c = coding->default_char;
4293                   charset = char_charset (c, charset_list, &code);
4294                 }
4295             }
4296           if (code == CHARSET_INVALID_CODE (charset))
4297             abort ();
4298           if (charset == charset_kanji)
4299             {
4300               int c1, c2;
4301               JIS_TO_SJIS (code);
4302               c1 = code >> 8, c2 = code & 0xFF;
4303               EMIT_TWO_BYTES (c1, c2);
4304             }
4305           else if (charset == charset_kana)
4306             EMIT_ONE_BYTE (code | 0x80);
4307           else if (charset_kanji2 && charset == charset_kanji2)
4308             {
4309               int c1, c2;
4310               /* Rows 1, 3-5, 8, 12-15 and 78-94 have SJIS code points.  */
4311               c1 = code >> 8;
4312               if (c1 == 0x21 || (c1 >= 0x23 && c1 <= 0x25) || c1 == 0x28
4313                   || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
4314                 {
4315                   JIS_TO_SJIS2 (code);
4316                   c1 = code >> 8, c2 = code & 0xFF;
4317                   EMIT_TWO_BYTES (c1, c2);
4318                 }
4319               else
4320                 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4321             }
4322           else
4323             EMIT_ONE_ASCII_BYTE (code & 0x7F);
4324         }
4325     }
4326   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4327   coding->produced_char += produced_chars;
4328   coding->produced = dst - coding->destination;
4329   return 0;
4330 }
4331 /* Encode the characters in CODING->charbuf as BIG5 text.  */
4332 static int
4333 encode_coding_big5 (coding)
4334      struct coding_system *coding;
4335 {
4336   int multibytep = coding->dst_multibyte;
4337   int *charbuf = coding->charbuf;
4338   int *charbuf_end = charbuf + coding->charbuf_used;
4339   unsigned char *dst = coding->destination + coding->produced;
4340   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4341   int safe_room = 4;
4342   int produced_chars = 0;
4343   Lisp_Object attrs, charset_list, val;
4344   int ascii_compatible;
4345   struct charset *charset_roman, *charset_big5;
4346   int c;
4347
4348   CODING_GET_INFO (coding, attrs, charset_list);
4349   val = charset_list;  /* first element: roman, second element: Big5 */
4350   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4351   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4352   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4353
4354   while (charbuf < charbuf_end)
4355     {
4356       ASSURE_DESTINATION (safe_room);
4357       c = *charbuf++;
4358       /* Now encode the character C.  */
4359       if (ASCII_CHAR_P (c) && ascii_compatible)
4360         EMIT_ONE_ASCII_BYTE (c);
4361       else if (CHAR_BYTE8_P (c))
4362         {
4363           c = CHAR_TO_BYTE8 (c);
4364           EMIT_ONE_BYTE (c);
4365         }
4366       else
4367         {
4368           unsigned code;
4369           struct charset *charset = char_charset (c, charset_list, &code);
4370
4371           if (! charset)       /* C is in none of the listed charsets */
4372             {
4373               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4374                 {
4375                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4376                   charset = CHARSET_FROM_ID (charset_ascii);
4377                 }
4378               else
4379                 {
4380                   c = coding->default_char;
4381                   charset = char_charset (c, charset_list, &code);
4382                 }
4383             }
4384           if (code == CHARSET_INVALID_CODE (charset))
4385             abort ();
4386           if (charset == charset_big5)
4387             {
4388               int c1, c2;
4389               /* Emit the high byte of CODE, then its low byte.  */
4390               c1 = code >> 8, c2 = code & 0xFF;
4391               EMIT_TWO_BYTES (c1, c2);
4392             }
4393           else
4394             EMIT_ONE_ASCII_BYTE (code & 0x7F);
4395         }
4396     }
4397   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4398   coding->produced_char += produced_chars;
4399   coding->produced = dst - coding->destination;
4400   return 0;
4401 }
4402
4403 \f
4404 /*** 9. CCL handlers ***/
4405
4406 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4407    Check if a text is encoded in a coding system of which
4408    encoder/decoder are written in CCL program.  Return 1 if every byte is
4409    valid (DETECT_INFO->found records any match), else reject and return 0.  */
4410
4411 static int
4412 detect_coding_ccl (coding, detect_info)
4413      struct coding_system *coding;
4414      struct coding_detection_info *detect_info;
4415 {
4416   const unsigned char *src = coding->source, *src_base;
4417   const unsigned char *src_end = coding->source + coding->src_bytes;
4418   int multibytep = coding->src_multibyte;
4419   int consumed_chars = 0;
4420   int found = 0;
4421   unsigned char *valids;
4422   int head_ascii = coding->head_ascii; /* read before CODING is rebound */
4423   Lisp_Object attrs;
4424
4425   detect_info->checked |= CATEGORY_MASK_CCL;
4426   /* From here on CODING is the coding system of the CCL category.  */
4427   coding = &coding_categories[coding_category_ccl];
4428   valids = CODING_CCL_VALIDS (coding);
4429   attrs = CODING_ID_ATTRS (coding->id);
4430   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
4431     src += head_ascii;
4432
4433   while (1)
4434     {
4435       int c;
4436
4437       src_base = src;
4438       ONE_MORE_BYTE (c);
4439       if (c < 0 || ! valids[c])        /* 0 in VALIDS: byte not allowed */
4440         break;
4441       if ((valids[c] > 1))             /* above 1: counts as a match */
4442         found = CATEGORY_MASK_CCL;
4443     }
4444   detect_info->rejected |= CATEGORY_MASK_CCL;
4445   return 0;
4446
4447  no_more_source:
4448   detect_info->found |= found;
4449   return 1;
4450 }
4451 /* Decode the source by running the CCL decoder program of CODING.  */
4452 static void
4453 decode_coding_ccl (coding)
4454      struct coding_system *coding;
4455 {
4456   const unsigned char *src = coding->source + coding->consumed;
4457   const unsigned char *src_end = coding->source + coding->src_bytes;
4458   int *charbuf = coding->charbuf + coding->charbuf_used;
4459   int *charbuf_end = coding->charbuf + coding->charbuf_size;
4460   int consumed_chars = 0;
4461   int multibytep = coding->src_multibyte;
4462   struct ccl_program ccl;
4463   int source_charbuf[1024];
4464   int source_byteidx[1024];
4465   Lisp_Object attrs, charset_list;
4466
4467   CODING_GET_INFO (coding, attrs, charset_list);
4468   setup_ccl_program (&ccl, CODING_CCL_DECODER (coding));
4469
4470   while (src < src_end)
4471     {
4472       const unsigned char *p = src;
4473       int *source, *source_end;
4474       int i = 0;
4475       /* Unpack up to 1024 elements, recording each one's offset from SRC.  */
4476       if (multibytep)
4477         while (i < 1024 && p < src_end)
4478           {
4479             source_byteidx[i] = p - src;
4480             source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
4481           }
4482       else     /* offsets are needed here too when the driver stops early */
4483         while (i < 1024 && p < src_end)
4484           source_byteidx[i] = p - src, source_charbuf[i++] = *p++;
4485
4486       if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
4487         ccl.last_block = 1;
4488
4489       source = source_charbuf;
4490       source_end = source + i;
4491       while (source < source_end)
4492         {
4493           ccl_driver (&ccl, source, charbuf,
4494                       source_end - source, charbuf_end - charbuf,
4495                       charset_list);
4496           source += ccl.consumed;
4497           charbuf += ccl.produced;
4498           if (ccl.status != CCL_STAT_SUSPEND_BY_DST)
4499             break;
4500         }
4501       if (source < source_end) /* resume at the first unconsumed element */
4502         src += source_byteidx[source - source_charbuf];
4503       else
4504         src = p;
4505       consumed_chars += source - source_charbuf;
4506       /* NOTE(review): the second test compares against a CODING_RESULT_*.  */
4507       if (ccl.status != CCL_STAT_SUSPEND_BY_SRC
4508           && ccl.status != CODING_RESULT_INSUFFICIENT_SRC)
4509         break;
4510     }
4511
4512   switch (ccl.status)
4513     {
4514     case CCL_STAT_SUSPEND_BY_SRC:
4515       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
4516       break;
4517     case CCL_STAT_SUSPEND_BY_DST:
4518       break;
4519     case CCL_STAT_QUIT:
4520     case CCL_STAT_INVALID_CMD:
4521       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
4522       break;
4523     default:
4524       record_conversion_result (coding, CODING_RESULT_SUCCESS);
4525       break;
4526     }
4527   coding->consumed_char += consumed_chars;
4528   coding->consumed = src - coding->source;
4529   coding->charbuf_used = charbuf - coding->charbuf;
4530 }
4531 /* Encode CODING->charbuf by running the CCL encoder program of CODING.  */
4532 static int
4533 encode_coding_ccl (coding)
4534      struct coding_system *coding;
4535 {
4536   struct ccl_program ccl;
4537   int multibytep = coding->dst_multibyte;
4538   int *charbuf = coding->charbuf;
4539   int *charbuf_end = charbuf + coding->charbuf_used;
4540   unsigned char *dst = coding->destination + coding->produced;
4541   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4542   int destination_charbuf[1024];
4543   int i, produced_chars = 0;
4544   Lisp_Object attrs, charset_list;
4545
4546   CODING_GET_INFO (coding, attrs, charset_list);
4547   setup_ccl_program (&ccl, CODING_CCL_ENCODER (coding));
4548
4549   ccl.last_block = coding->mode & CODING_MODE_LAST_BLOCK;
4550   ccl.dst_multibyte = coding->dst_multibyte;
4551
4552   while (charbuf < charbuf_end)
4553     {  /* Each pass writes at most 1024 elements to DESTINATION_CHARBUF.  */
4554       ccl_driver (&ccl, charbuf, destination_charbuf,
4555                   charbuf_end - charbuf, 1024, charset_list);
4556       if (multibytep)
4557         {
4558           ASSURE_DESTINATION (ccl.produced * 2);
4559           for (i = 0; i < ccl.produced; i++)
4560             EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
4561         }
4562       else
4563         {
4564           ASSURE_DESTINATION (ccl.produced);
4565           for (i = 0; i < ccl.produced; i++)
4566             *dst++ = destination_charbuf[i] & 0xFF;    /* low 8 bits only */
4567           produced_chars += ccl.produced;
4568         }
4569       charbuf += ccl.consumed;
4570       if (ccl.status == CCL_STAT_QUIT
4571           || ccl.status == CCL_STAT_INVALID_CMD)
4572         break;
4573     }
4574   /* Translate the final CCL status into a conversion result.  */
4575   switch (ccl.status)
4576     {
4577     case CCL_STAT_SUSPEND_BY_SRC:
4578       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
4579       break;
4580     case CCL_STAT_SUSPEND_BY_DST:
4581       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
4582       break;
4583     case CCL_STAT_QUIT:
4584     case CCL_STAT_INVALID_CMD:
4585       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
4586       break;
4587     default:
4588       record_conversion_result (coding, CODING_RESULT_SUCCESS);
4589       break;
4590     }
4591
4592   coding->produced_char += produced_chars;
4593   coding->produced = dst - coding->destination;
4594   return 0;
4595 }
4596
4597
4598 \f
4599 /*** 10, 11. no-conversion handlers ***/
4600
4601 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
4602
4603 static void
4604 decode_coding_raw_text (coding)
4605      struct coding_system *coding;
4606 {
4607   coding->chars_at_source = 1;
4608   coding->consumed_char = 0;
4609   coding->consumed = 0;
4610   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4611 }
4612
4613 static int
4614 encode_coding_raw_text (coding)
4615      struct coding_system *coding;
4616 {
4617   int multibytep = coding->dst_multibyte;
4618   int *charbuf = coding->charbuf;
4619   int *charbuf_end = coding->charbuf + coding->charbuf_used;
4620   unsigned char *dst = coding->destination + coding->produced;
4621   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4622   int produced_chars = 0;
4623   int c;
4624
4625   if (multibytep)
4626     {
4627       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
4628
4629       if (coding->src_multibyte)
4630         while (charbuf < charbuf_end)
4631           {
4632             ASSURE_DESTINATION (safe_room);
4633             c = *charbuf++;
4634             if (ASCII_CHAR_P (c))
4635               EMIT_ONE_ASCII_BYTE (c);
4636             else if (CHAR_BYTE8_P (c))
4637               {
4638                 c = CHAR_TO_BYTE8 (c);
4639                 EMIT_ONE_BYTE (c);
4640               }
4641             else
4642               {
4643                 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
4644
4645                 CHAR_STRING_ADVANCE (c, p1);
4646                 while (p0 < p1)
4647                   {
4648                     EMIT_ONE_BYTE (*p0);
4649                     p0++;
4650                   }
4651               }
4652           }
4653       else
4654         while (charbuf < charbuf_end)
4655           {
4656             ASSURE_DESTINATION (safe_room);
4657             c = *charbuf++;
4658             EMIT_ONE_BYTE (c);
4659           }
4660     }
4661   else
4662     {
4663       if (coding->src_multibyte)
4664         {
4665           int safe_room = MAX_MULTIBYTE_LENGTH;
4666
4667           while (charbuf < charbuf_end)
4668             {
4669               ASSURE_DESTINATION (safe_room);
4670               c = *charbuf++;
4671               if (ASCII_CHAR_P (c))
4672                 *dst++ = c;
4673               else if (CHAR_BYTE8_P (c))
4674                 *dst++ = CHAR_TO_BYTE8 (c);
4675               else
4676                 CHAR_STRING_ADVANCE (c, dst);
4677               produced_chars++;
4678             }
4679         }
4680       else
4681         {
4682           ASSURE_DESTINATION (charbuf_end - charbuf);
4683           while (charbuf < charbuf_end && dst < dst_end)
4684             *dst++ = *charbuf++;
4685           produced_chars = dst - (coding->destination + coding->dst_bytes);
4686         }
4687     }
4688   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4689   coding->produced_char += produced_chars;
4690   coding->produced = dst - coding->destination;
4691   return 0;
4692 }
4693
/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
   Check if a text is encoded in a charset-based coding system.  If it
   is, return 1, else return 0.

   The check uses the coding system currently assigned to the charset
   category (coding_categories[coding_category_charset]): every byte
   read must have a non-nil entry in the `valids' table of that coding
   system.  CATEGORY_MASK_CHARSET is always added to
   DETECT_INFO->checked; it is added to DETECT_INFO->rejected when an
   invalid byte is met, and to DETECT_INFO->found when the whole text
   is valid and contains at least one byte >= 0x80.  */

static int
detect_coding_charset (coding, detect_info)
     struct coding_system *coding;
     struct coding_detection_info *detect_info;
{
  const unsigned char *src = coding->source, *src_base;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  int multibytep = coding->src_multibyte;
  int consumed_chars = 0;
  Lisp_Object attrs, valids;
  int found = 0;

  detect_info->checked |= CATEGORY_MASK_CHARSET;

  /* From here on, CODING is the coding system of the charset
     category, not the one given by the caller.  */
  coding = &coding_categories[coding_category_charset];
  attrs = CODING_ID_ATTRS (coding->id);
  valids = AREF (attrs, coding_attr_charset_valids);

  /* NOTE(review): head_ascii is read from the category's coding
     system, while SRC was taken from the caller's CODING above --
     confirm that this is intended.  */
  if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
    src += coding->head_ascii;

  while (1)
    {
      int c;

      src_base = src;
      /* ONE_MORE_BYTE is defined elsewhere; presumably it jumps to
         no_more_source when the source is exhausted, as that label is
         not reached in any other way.  */
      ONE_MORE_BYTE (c);
      if (c < 0)
        continue;
      if (NILP (AREF (valids, c)))
        break;
      if (c >= 0x80)
        found = CATEGORY_MASK_CHARSET;
    }
  /* An invalid byte was found.  */
  detect_info->rejected |= CATEGORY_MASK_CHARSET;
  return 0;

 no_more_source:
  detect_info->found |= found;
  return 1;
}
4739
/* Decode the bytes of CODING->source, starting at CODING->consumed,
   according to a charset-based coding system, and store the decoded
   characters in CODING->charbuf after the CODING->charbuf_used
   elements already there.  The first byte of each sequence selects,
   through the `valids' table of the coding system, the charset (or
   the list of candidate charsets) to decode with.  A sequence that
   can't be decoded is stored one byte at a time and counted in
   CODING->errors.  On exit, CODING->consumed, CODING->consumed_char
   and CODING->charbuf_used are updated.  */

static void
decode_coding_charset (coding)
     struct coding_system *coding;
{
  const unsigned char *src = coding->source + coding->consumed;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  const unsigned char *src_base;
  int *charbuf = coding->charbuf + coding->charbuf_used;
  /* Stop MAX_ANNOTATION_LENGTH elements before the real end of the
     buffer, so that there is room left for the charset annotation
     added after the loop.  */
  int *charbuf_end
    = coding->charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
  int consumed_chars = 0, consumed_chars_base;
  int multibytep = coding->src_multibyte;
  Lisp_Object attrs, charset_list, valids;
  int char_offset = coding->produced_char;
  /* LAST_ID is the charset of the current run of characters, and
     LAST_OFFSET is the character offset at which that run began.  */
  int last_offset = char_offset;
  int last_id = charset_ascii;

  CODING_GET_INFO (coding, attrs, charset_list);
  valids = AREF (attrs, coding_attr_charset_valids);

  while (1)
    {
      int c;
      Lisp_Object val;
      struct charset *charset;
      int dim;
      int len = 1;
      unsigned code;

      /* Remember the start of this sequence, so that it can be
         re-read on error and is not counted as consumed if the
         source runs out in the middle of it.  */
      src_base = src;
      consumed_chars_base = consumed_chars;

      if (charbuf >= charbuf_end)
        break;

      ONE_MORE_BYTE (c);
      if (c < 0)
        goto invalid_code;
      code = c;

      /* VAL is nil for an invalid leading byte, a charset ID, or a
         list of charset IDs.  */
      val = AREF (valids, c);
      if (NILP (val))
        goto invalid_code;
      if (INTEGERP (val))
        {
          charset = CHARSET_FROM_ID (XFASTINT (val));
          dim = CHARSET_DIMENSION (charset);
          /* Read the remaining bytes of the code point.
             NOTE(review): unlike the leading byte, C is not checked
             for a negative value here -- confirm whether that can
             occur.  */
          while (len < dim)
            {
              ONE_MORE_BYTE (c);
              code = (code << 8) | c;
              len++;
            }
          CODING_DECODE_CHAR (coding, src, src_base, src_end,
                              charset, code, c);
        }
      else
        {
          /* VAL is a list of charset IDs.  It is assured that the
             list is sorted by charset dimensions (smaller one
             comes first).  */
          while (CONSP (val))
            {
              charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
              dim = CHARSET_DIMENSION (charset);
              /* As LEN is kept across candidates, more bytes are read
                 only when this charset has a larger dimension than
                 the ones already tried.  */
              while (len < dim)
                {
                  ONE_MORE_BYTE (c);
                  code = (code << 8) | c;
                  len++;
                }
              CODING_DECODE_CHAR (coding, src, src_base,
                                  src_end, charset, code, c);
              /* Take the first charset that can decode CODE.  */
              if (c >= 0)
                break;
              val = XCDR (val);
            }
        }
      if (c < 0)
        goto invalid_code;
      /* When a run of a different non-ASCII charset starts, annotate
         the run that just ended (unless it was ASCII).  */
      if (charset->id != charset_ascii
          && last_id != charset->id)
        {
          if (last_id != charset_ascii)
            ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
          last_id = charset->id;
          last_offset = char_offset;
        }

      *charbuf++ = c;
      char_offset++;
      continue;

    invalid_code:
      /* Go back to the start of the sequence and consume just one
         byte of it.  A negative value is stored negated, an ASCII
         byte as is, and any other byte as the character given by
         BYTE8_TO_CHAR.  */
      src = src_base;
      consumed_chars = consumed_chars_base;
      ONE_MORE_BYTE (c);
      *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
      char_offset++;
      coding->errors++;
    }

  /* Reached when CHARBUF is full and, presumably, from ONE_MORE_BYTE
     (defined elsewhere) when the source is exhausted.  The _base
     values are used below so that a sequence read only partially is
     not counted as consumed.  */
 no_more_source:
  if (last_id != charset_ascii)
    ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
  coding->consumed_char += consumed_chars_base;
  coding->consumed = src_base - coding->source;
  coding->charbuf_used = charbuf - coding->charbuf;
}
4849
/* Encode the characters in CODING->charbuf into a charset-based
   coding system, and append the resulting bytes to
   CODING->destination.  Each character is looked up in the charset
   list of CODING by char_charset; if a charset is found, the code
   point is written as 1 to 4 bytes according to the dimension of
   that charset, otherwise a substitution byte is written.  Always
   record CODING_RESULT_SUCCESS, update CODING->produced and
   CODING->produced_char, and return 0.  */

static int
encode_coding_charset (coding)
     struct coding_system *coding;
{
  int multibytep = coding->dst_multibyte;
  int *charbuf = coding->charbuf;
  int *charbuf_end = charbuf + coding->charbuf_used;
  unsigned char *dst = coding->destination + coding->produced;
  unsigned char *dst_end = coding->destination + coding->dst_bytes;
  int safe_room = MAX_MULTIBYTE_LENGTH;
  /* Not modified explicitly below; presumably the EMIT_XXX macros
     (defined elsewhere) update it -- TODO confirm.  */
  int produced_chars = 0;
  Lisp_Object attrs, charset_list;
  int ascii_compatible;
  int c;

  CODING_GET_INFO (coding, attrs, charset_list);
  ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));

  while (charbuf < charbuf_end)
    {
      struct charset *charset;
      unsigned code;

      ASSURE_DESTINATION (safe_room);
      c = *charbuf++;
      /* ASCII characters bypass the charset lookup only when the
         coding system is ASCII compatible.  */
      if (ascii_compatible && ASCII_CHAR_P (c))
        EMIT_ONE_ASCII_BYTE (c);
      else if (CHAR_BYTE8_P (c))
        {
          /* An eight-bit character is written back as its byte.  */
          c = CHAR_TO_BYTE8 (c);
          EMIT_ONE_BYTE (c);
        }
      else
        {
          charset = char_charset (c, charset_list, &code);
          if (charset)
            {
              /* Write CODE, most significant byte first, using as
                 many bytes as the dimension of CHARSET.  */
              if (CHARSET_DIMENSION (charset) == 1)
                EMIT_ONE_BYTE (code);
              else if (CHARSET_DIMENSION (charset) == 2)
                EMIT_TWO_BYTES (code >> 8, code & 0xFF);
              else if (CHARSET_DIMENSION (charset) == 3)
                EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
              else
                EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
                                 (code >> 8) & 0xFF, code & 0xFF);
            }
          else
            {
              /* C is in none of the charsets: write a substitute,
                 which depends on whether safe encoding is on.  */
              if (coding->mode & CODING_MODE_SAFE_ENCODING)
                c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
              else
                c = coding->default_char;
              EMIT_ONE_BYTE (c);
            }
        }
    }

  record_conversion_result (coding, CODING_RESULT_SUCCESS);
  coding->produced_char += produced_chars;
  coding->produced = dst - coding->destination;
  return 0;
}
4913
4914 \f
4915 /*** 7. C library functions ***/
4916
/* Setup coding context CODING from information about CODING_SYSTEM.
   If CODING_SYSTEM is nil, `undecided' is assumed.  If
   CODING_SYSTEM is invalid, signal an error.

   This resets CODING->mode and CODING->head_ascii, computes
   CODING->common_flags, sets the safe-charsets data and the default
   character, and installs the detector, decoder and encoder
   functions that correspond to the type of the coding system.  */

void
setup_coding_system (coding_system, coding)
     Lisp_Object coding_system;
     struct coding_system *coding;
{
  Lisp_Object attrs;
  Lisp_Object eol_type;
  Lisp_Object coding_type;
  Lisp_Object val;

  if (NILP (coding_system))
    coding_system = Qundecided;

  CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);

  attrs = CODING_ID_ATTRS (coding->id);
  eol_type = CODING_ID_EOL_TYPE (coding->id);

  coding->mode = 0;
  coding->head_ascii = -1;
  /* A vector as EOL_TYPE means that the end-of-line format is not
     yet decided, thus detection is required.  */
  coding->common_flags
    = (VECTORP (eol_type) ? CODING_REQUIRE_DETECTION_MASK : 0);
  if (! NILP (CODING_ATTR_POST_READ (attrs)))
    coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
  if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
    coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
  if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
    coding->common_flags |= CODING_FOR_UNIBYTE_MASK;

  /* The safe-charsets attribute is a string; CODING keeps a pointer
     to its data and the largest valid index into it.  */
  val = CODING_ATTR_SAFE_CHARSETS (attrs);
  coding->max_charset_id = SCHARS (val) - 1;
  coding->safe_charsets = (char *) SDATA (val);
  coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));

  /* Install the handlers according to the type of the coding
     system.  */
  coding_type = CODING_ATTR_TYPE (attrs);
  if (EQ (coding_type, Qundecided))
    {
      /* The raw-text handlers are used until a coding system is
         detected.  */
      coding->detector = NULL;
      coding->decoder = decode_coding_raw_text;
      coding->encoder = encode_coding_raw_text;
      coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
    }
  else if (EQ (coding_type, Qiso_2022))
    {
      int i;
      int flags = XINT (AREF (attrs, coding_attr_iso_flags));

      /* Invoke graphic register 0 to plane 0.  */
      CODING_ISO_INVOCATION (coding, 0) = 0;
      /* Invoke graphic register 1 to plane 1 if we can use 8-bit.  */
      CODING_ISO_INVOCATION (coding, 1)
        = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
      /* Setup the initial status of designation.  */
      for (i = 0; i < 4; i++)
        CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
      /* Not single shifting initially.  */
      CODING_ISO_SINGLE_SHIFTING (coding) = 0;
      /* Beginning of buffer should also be regarded as bol. */
      CODING_ISO_BOL (coding) = 1;
      coding->detector = detect_coding_iso_2022;
      coding->decoder = decode_coding_iso_2022;
      coding->encoder = encode_coding_iso_2022;
      if (flags & CODING_ISO_FLAG_SAFE)
        coding->mode |= CODING_MODE_SAFE_ENCODING;
      coding->common_flags
        |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
            | CODING_REQUIRE_FLUSHING_MASK);
      if (flags & CODING_ISO_FLAG_COMPOSITION)
        coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
      if (flags & CODING_ISO_FLAG_DESIGNATION)
        coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
      if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
        {
          /* Re-read the safe-charsets attribute after
             setup_iso_safe_charsets, which presumably updates it in
             ATTRS (defined elsewhere -- TODO confirm).  */
          setup_iso_safe_charsets (attrs);
          val = CODING_ATTR_SAFE_CHARSETS (attrs);
          coding->max_charset_id = SCHARS (val) - 1;
          coding->safe_charsets = (char *) SDATA (val);
        }
      CODING_ISO_FLAGS (coding) = flags;
    }
  else if (EQ (coding_type, Qcharset))
    {
      coding->detector = detect_coding_charset;
      coding->decoder = decode_coding_charset;
      coding->encoder = encode_coding_charset;
      coding->common_flags
        |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
    }
  else if (EQ (coding_type, Qutf_8))
    {
      coding->detector = detect_coding_utf_8;
      coding->decoder = decode_coding_utf_8;
      coding->encoder = encode_coding_utf_8;
      coding->common_flags
        |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
    }
  else if (EQ (coding_type, Qutf_16))
    {
      /* BOM attribute: a cons means to detect the BOM, t means that
         a BOM is used, anything else means no BOM.  */
      val = AREF (attrs, coding_attr_utf_16_bom);
      CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_16_detect_bom
                                    : EQ (val, Qt) ? utf_16_with_bom
                                    : utf_16_without_bom);
      /* Any endian attribute other than `big' selects little
         endian.  */
      val = AREF (attrs, coding_attr_utf_16_endian);
      CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
                                       : utf_16_little_endian);
      CODING_UTF_16_SURROGATE (coding) = 0;
      coding->detector = detect_coding_utf_16;
      coding->decoder = decode_coding_utf_16;
      coding->encoder = encode_coding_utf_16;
      coding->common_flags
        |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
      if (CODING_UTF_16_BOM (coding) == utf_16_detect_bom)
        coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
    }
  else if (EQ (coding_type, Qccl))
    {
      coding->detector = detect_coding_ccl;
      coding->decoder = decode_coding_ccl;
      coding->encoder = encode_coding_ccl;
      coding->common_flags
        |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
            | CODING_REQUIRE_FLUSHING_MASK);
    }
  else if (EQ (coding_type, Qemacs_mule))
    {
      coding->detector = detect_coding_emacs_mule;
      coding->decoder = decode_coding_emacs_mule;
      coding->encoder = encode_coding_emacs_mule;
      coding->common_flags
        |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
      if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
          && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
        {
          /* Build a fresh safe-charsets string from
             Vemacs_mule_charset_list: every element is 255 except
             those indexed by a charset ID of that list, which are
             0.  */
          Lisp_Object tail, safe_charsets;
          int max_charset_id = 0;

          for (tail = Vemacs_mule_charset_list; CONSP (tail);
               tail = XCDR (tail))
            if (max_charset_id < XFASTINT (XCAR (tail)))
              max_charset_id = XFASTINT (XCAR (tail));
          safe_charsets = Fmake_string (make_number (max_charset_id + 1),
                                        make_number (255));
          for (tail = Vemacs_mule_charset_list; CONSP (tail);
               tail = XCDR (tail))
            SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
          coding->max_charset_id = max_charset_id;
          /* NOTE(review): only a raw pointer to the data of the new
             string is kept here -- confirm that the string can't be
             garbage-collected while CODING is in use.  */
          coding->safe_charsets = (char *) SDATA (safe_charsets);
        }
    }
  else if (EQ (coding_type, Qshift_jis))
    {
      coding->detector = detect_coding_sjis;
      coding->decoder = decode_coding_sjis;
      coding->encoder = encode_coding_sjis;
      coding->common_flags
        |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
    }
  else if (EQ (coding_type, Qbig5))
    {
      coding->detector = detect_coding_big5;
      coding->decoder = decode_coding_big5;
      coding->encoder = encode_coding_big5;
      coding->common_flags
        |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
    }
  else                          /* EQ (coding_type, Qraw_text) */
    {
      coding->detector = NULL;
      coding->decoder = decode_coding_raw_text;
      coding->encoder = encode_coding_raw_text;
      /* Decoding is required unless the end-of-line format is
         `unix'; encoding is required only when the format is decided
         and is not `unix'.  */
      if (! EQ (eol_type, Qunix))
        {
          coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
          if (! VECTORP (eol_type))
            coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
        }

    }

  return;
}
5102
5103 /* Return a list of charsets supported by CODING.  */
5104
5105 Lisp_Object
5106 coding_charset_list (coding)
5107      struct coding_system *coding;
5108 {
5109   Lisp_Object attrs, charset_list;
5110
5111   CODING_GET_INFO (coding, attrs, charset_list);
5112   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5113     {
5114       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5115
5116       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5117         charset_list = Viso_2022_charset_list;
5118     }
5119   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5120     {
5121       charset_list = Vemacs_mule_charset_list;
5122     }
5123   return charset_list;
5124 }
5125
5126
5127 /* Return raw-text or one of its subsidiaries that has the same
5128    eol_type as CODING-SYSTEM.  */
5129
5130 Lisp_Object
5131 raw_text_coding_system (coding_system)
5132      Lisp_Object coding_system;
5133 {
5134   Lisp_Object spec, attrs;
5135   Lisp_Object eol_type, raw_text_eol_type;
5136
5137   if (NILP (coding_system))
5138     return Qraw_text;
5139   spec = CODING_SYSTEM_SPEC (coding_system);
5140   attrs = AREF (spec, 0);
5141
5142   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
5143     return coding_system;
5144
5145   eol_type = AREF (spec, 2);
5146   if (VECTORP (eol_type))
5147     return Qraw_text;
5148   spec = CODING_SYSTEM_SPEC (Qraw_text);
5149   raw_text_eol_type = AREF (spec, 2);
5150   return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
5151           : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
5152           : AREF (raw_text_eol_type, 2));
5153 }
5154
5155
5156 /* If CODING_SYSTEM doesn't specify end-of-line format but PARENT
5157    does, return one of the subsidiary that has the same eol-spec as
5158    PARENT.  Otherwise, return CODING_SYSTEM.  */
5159
5160 Lisp_Object
5161 coding_inherit_eol_type (coding_system, parent)
5162      Lisp_Object coding_system, parent;
5163 {
5164   Lisp_Object spec, eol_type;
5165
5166   if (NILP (coding_system))
5167     coding_system = Qraw_text;
5168   spec = CODING_SYSTEM_SPEC (coding_system);
5169   eol_type = AREF (spec, 2);
5170   if (VECTORP (eol_type)
5171       && ! NILP (parent))
5172     {
5173       Lisp_Object parent_spec;
5174       Lisp_Object parent_eol_type;
5175
5176       parent_spec
5177         = CODING_SYSTEM_SPEC (buffer_defaults.buffer_file_coding_system);
5178       parent_eol_type = AREF (parent_spec, 2);
5179       if (EQ (parent_eol_type, Qunix))
5180         coding_system = AREF (eol_type, 0);
5181       else if (EQ (parent_eol_type, Qdos))
5182         coding_system = AREF (eol_type, 1);
5183       else if (EQ (parent_eol_type, Qmac))
5184         coding_system = AREF (eol_type, 2);
5185     }
5186   return coding_system;
5187 }
5188
5189 /* Emacs has a mechanism to automatically detect a coding system if it
5190    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
5191    it's impossible to distinguish some coding systems accurately
5192    because they use the same range of codes.  So, at first, coding
   systems are grouped into the categories listed below:
5194
5195    o coding-category-emacs-mule
5196
5197         The category for a coding system which has the same code range
5198         as Emacs' internal format.  Assigned the coding-system (Lisp
5199         symbol) `emacs-mule' by default.
5200
5201    o coding-category-sjis
5202
5203         The category for a coding system which has the same code range
5204         as SJIS.  Assigned the coding-system (Lisp
5205         symbol) `japanese-shift-jis' by default.
5206
5207    o coding-category-iso-7
5208
5209         The category for a coding system which has the same code range
5210         as ISO2022 of 7-bit environment.  This doesn't use any locking
5211         shift and single shift functions.  This can encode/decode all
5212         charsets.  Assigned the coding-system (Lisp symbol)
5213         `iso-2022-7bit' by default.
5214
5215    o coding-category-iso-7-tight
5216
5217         Same as coding-category-iso-7 except that this can
5218         encode/decode only the specified charsets.
5219
5220    o coding-category-iso-8-1
5221
5222         The category for a coding system which has the same code range
5223         as ISO2022 of 8-bit environment and graphic plane 1 used only
5224         for DIMENSION1 charset.  This doesn't use any locking shift
5225         and single shift functions.  Assigned the coding-system (Lisp
5226         symbol) `iso-latin-1' by default.
5227
5228    o coding-category-iso-8-2
5229
5230         The category for a coding system which has the same code range
5231         as ISO2022 of 8-bit environment and graphic plane 1 used only
5232         for DIMENSION2 charset.  This doesn't use any locking shift
5233         and single shift functions.  Assigned the coding-system (Lisp
5234         symbol) `japanese-iso-8bit' by default.
5235
5236    o coding-category-iso-7-else
5237
5238         The category for a coding system which has the same code range
        as ISO2022 of 7-bit environment but uses locking shift or
5240         single shift functions.  Assigned the coding-system (Lisp
5241         symbol) `iso-2022-7bit-lock' by default.
5242
5243    o coding-category-iso-8-else
5244
5245         The category for a coding system which has the same code range
        as ISO2022 of 8-bit environment but uses locking shift or
5247         single shift functions.  Assigned the coding-system (Lisp
5248         symbol) `iso-2022-8bit-ss2' by default.
5249
5250    o coding-category-big5
5251
5252         The category for a coding system which has the same code range
5253         as BIG5.  Assigned the coding-system (Lisp symbol)
5254         `cn-big5' by default.
5255
5256    o coding-category-utf-8
5257
5258         The category for a coding system which has the same code range
5259         as UTF-8 (cf. RFC3629).  Assigned the coding-system (Lisp
5260         symbol) `utf-8' by default.
5261
5262    o coding-category-utf-16-be
5263
5264         The category for a coding system in which a text has an
5265         Unicode signature (cf. Unicode Standard) in the order of BIG
5266         endian at the head.  Assigned the coding-system (Lisp symbol)
5267         `utf-16-be' by default.
5268
5269    o coding-category-utf-16-le
5270
5271         The category for a coding system in which a text has an
5272         Unicode signature (cf. Unicode Standard) in the order of
5273         LITTLE endian at the head.  Assigned the coding-system (Lisp
5274         symbol) `utf-16-le' by default.
5275
5276    o coding-category-ccl
5277
5278         The category for a coding system of which encoder/decoder is
5279         written in CCL programs.  The default value is nil, i.e., no
5280         coding system is assigned.
5281
5282    o coding-category-binary
5283
5284         The category for a coding system not categorized in any of the
5285         above.  Assigned the coding-system (Lisp symbol)
5286         `no-conversion' by default.
5287
5288    Each of them is a Lisp symbol and the value is an actual
5289    `coding-system's (this is also a Lisp symbol) assigned by a user.
5290    What Emacs does actually is to detect a category of coding system.
5291    Then, it uses a `coding-system' assigned to it.  If Emacs can't
5292    decide only one possible category, it selects a category of the
5293    highest priority.  Priorities of categories are also specified by a
5294    user in a Lisp variable `coding-category-list'.
5295
5296 */
5297
5298 #define EOL_SEEN_NONE   0
5299 #define EOL_SEEN_LF     1
5300 #define EOL_SEEN_CR     2
5301 #define EOL_SEEN_CRLF   4
5302
5303 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
5304    SOURCE is encoded.  If CATEGORY is one of
5305    coding_category_utf_16_XXXX, assume that CR and LF are encoded by
5306    two-byte, else they are encoded by one-byte.
5307
5308    Return one of EOL_SEEN_XXX.  */
5309
5310 #define MAX_EOL_CHECK_COUNT 3
5311
5312 static int
5313 detect_eol (source, src_bytes, category)
5314      const unsigned char *source;
5315      EMACS_INT src_bytes;
5316      enum coding_category category;
5317 {
5318   const unsigned char *src = source, *src_end = src + src_bytes;
5319   unsigned char c;
5320   int total  = 0;
5321   int eol_seen = EOL_SEEN_NONE;
5322
5323   if ((1 << category) & CATEGORY_MASK_UTF_16)
5324     {
5325       int msb, lsb;
5326
5327       msb = category == (coding_category_utf_16_le
5328                          | coding_category_utf_16_le_nosig);
5329       lsb = 1 - msb;
5330
5331       while (src + 1 < src_end)
5332         {
5333           c = src[lsb];
5334           if (src[msb] == 0 && (c == '\n' || c == '\r'))
5335             {
5336               int this_eol;
5337
5338               if (c == '\n')
5339                 this_eol = EOL_SEEN_LF;
5340               else if (src + 3 >= src_end
5341                        || src[msb + 2] != 0
5342                        || src[lsb + 2] != '\n')
5343                 this_eol = EOL_SEEN_CR;
5344               else
5345                 this_eol = EOL_SEEN_CRLF;
5346
5347               if (eol_seen == EOL_SEEN_NONE)
5348                 /* This is the first end-of-line.  */
5349                 eol_seen = this_eol;
5350               else if (eol_seen != this_eol)
5351                 {
5352                   /* The found type is different from what found before.  */
5353                   eol_seen = EOL_SEEN_LF;
5354                   break;
5355                 }
5356               if (++total == MAX_EOL_CHECK_COUNT)
5357                 break;
5358             }
5359           src += 2;
5360         }
5361     }
5362   else
5363     {
5364       while (src < src_end)
5365         {
5366           c = *src++;
5367           if (c == '\n' || c == '\r')
5368             {
5369               int this_eol;
5370
5371               if (c == '\n')
5372                 this_eol = EOL_SEEN_LF;
5373               else if (src >= src_end || *src != '\n')
5374                 this_eol = EOL_SEEN_CR;
5375               else
5376                 this_eol = EOL_SEEN_CRLF, src++;
5377
5378               if (eol_seen == EOL_SEEN_NONE)
5379                 /* This is the first end-of-line.  */
5380                 eol_seen = this_eol;
5381               else if (eol_seen != this_eol)
5382                 {
5383                   /* The found type is different from what found before.  */
5384                   eol_seen = EOL_SEEN_LF;
5385                   break;
5386                 }
5387               if (++total == MAX_EOL_CHECK_COUNT)
5388                 break;
5389             }
5390         }
5391     }
5392   return eol_seen;
5393 }
5394
5395
5396 static Lisp_Object
5397 adjust_coding_eol_type (coding, eol_seen)
5398      struct coding_system *coding;
5399      int eol_seen;
5400 {
5401   Lisp_Object eol_type;
5402
5403   eol_type = CODING_ID_EOL_TYPE (coding->id);
5404   if (eol_seen & EOL_SEEN_LF)
5405     {
5406       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
5407       eol_type = Qunix;
5408     }
5409   else if (eol_seen & EOL_SEEN_CRLF)
5410     {
5411       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
5412       eol_type = Qdos;
5413     }
5414   else if (eol_seen & EOL_SEEN_CR)
5415     {
5416       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
5417       eol_type = Qmac;
5418     }
5419   return eol_type;
5420 }
5421
5422 /* Detect how a text specified in CODING is encoded.  If a coding
5423    system is detected, update fields of CODING by the detected coding
5424    system.  */
5425
5426 void
5427 detect_coding (coding)
5428      struct coding_system *coding;
5429 {
5430   const unsigned char *src, *src_end;
5431
5432   coding->consumed = coding->consumed_char = 0;
5433   coding->produced = coding->produced_char = 0;
5434   coding_set_source (coding);
5435
5436   src_end = coding->source + coding->src_bytes;
5437
5438   /* If we have not yet decided the text encoding type, detect it
5439      now.  */
5440   if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
5441     {
5442       int c, i;
5443       struct coding_detection_info detect_info;
5444
5445       detect_info.checked = detect_info.found = detect_info.rejected = 0;
5446       for (i = 0, src = coding->source; src < src_end; i++, src++)
5447         {
5448           c = *src;
5449           if (c & 0x80)
5450             break;
5451           if (c < 0x20
5452               && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
5453               && ! inhibit_iso_escape_detection
5454               && ! detect_info.checked)
5455             {
5456               coding->head_ascii = src - (coding->source + coding->consumed);
5457               if (detect_coding_iso_2022 (coding, &detect_info))
5458                 {
5459                   /* We have scanned the whole data.  */
5460                   if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
5461                     /* We didn't find an 8-bit code.  */
5462                     src = src_end;
5463                   break;
5464                 }
5465             }
5466         }
5467       coding->head_ascii = src - (coding->source + coding->consumed);
5468
5469       if (coding->head_ascii < coding->src_bytes
5470           || detect_info.found)
5471         {
5472           enum coding_category category;
5473           struct coding_system *this;
5474
5475           if (coding->head_ascii == coding->src_bytes)
5476             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
5477             for (i = 0; i < coding_category_raw_text; i++)
5478               {
5479                 category = coding_priorities[i];
5480                 this = coding_categories + category;
5481                 if (detect_info.found & (1 << category))
5482                   break;
5483               }
5484           else
5485             for (i = 0; i < coding_category_raw_text; i++)
5486               {
5487                 category = coding_priorities[i];
5488                 this = coding_categories + category;
5489                 if (this->id < 0)
5490                   {
5491                     /* No coding system of this category is defined.  */
5492                     detect_info.rejected |= (1 << category);
5493                   }
5494                 else if (category >= coding_category_raw_text)
5495                   continue;
5496                 else if (detect_info.checked & (1 << category))
5497                   {
5498                     if (detect_info.found & (1 << category))
5499                       break;
5500                   }
5501                 else if ((*(this->detector)) (coding, &detect_info)
5502                          && detect_info.found & (1 << category))
5503                   {
5504                     if (category == coding_category_utf_16_auto)
5505                       {
5506                         if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
5507                           category = coding_category_utf_16_le;
5508                         else
5509                           category = coding_category_utf_16_be;
5510                       }
5511                     break;
5512                   }
5513               }
5514
5515           if (i < coding_category_raw_text)
5516             setup_coding_system (CODING_ID_NAME (this->id), coding);
5517           else if (detect_info.rejected == CATEGORY_MASK_ANY)
5518             setup_coding_system (Qraw_text, coding);
5519           else if (detect_info.rejected)
5520             for (i = 0; i < coding_category_raw_text; i++)
5521               if (! (detect_info.rejected & (1 << coding_priorities[i])))
5522                 {
5523                   this = coding_categories + coding_priorities[i];
5524                   setup_coding_system (CODING_ID_NAME (this->id), coding);
5525                   break;
5526                 }
5527         }
5528     }
5529   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
5530            == coding_category_utf_16_auto)
5531     {
5532       Lisp_Object coding_systems;
5533       struct coding_detection_info detect_info;
5534
5535       coding_systems
5536         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_16_bom);
5537       detect_info.found = detect_info.rejected = 0;
5538       if (CONSP (coding_systems)
5539           && detect_coding_utf_16 (coding, &detect_info))
5540         {
5541           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
5542             setup_coding_system (XCAR (coding_systems), coding);
5543           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
5544             setup_coding_system (XCDR (coding_systems), coding);
5545         }
5546     }
5547 }
5548
5549
5550 static void
5551 decode_eol (coding)
5552      struct coding_system *coding;
5553 {
5554   Lisp_Object eol_type;
5555   unsigned char *p, *pbeg, *pend;
5556
5557   eol_type = CODING_ID_EOL_TYPE (coding->id);
5558   if (EQ (eol_type, Qunix))
5559     return;
5560
5561   if (NILP (coding->dst_object))
5562     pbeg = coding->destination;
5563   else
5564     pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
5565   pend = pbeg + coding->produced;
5566
5567   if (VECTORP (eol_type))
5568     {
5569       int eol_seen = EOL_SEEN_NONE;
5570
5571       for (p = pbeg; p < pend; p++)
5572         {
5573           if (*p == '\n')
5574             eol_seen |= EOL_SEEN_LF;
5575           else if (*p == '\r')
5576             {
5577               if (p + 1 < pend && *(p + 1) == '\n')
5578                 {
5579                   eol_seen |= EOL_SEEN_CRLF;
5580                   p++;
5581                 }
5582               else
5583                 eol_seen |= EOL_SEEN_CR;
5584             }
5585         }
5586       if (eol_seen != EOL_SEEN_NONE
5587           && eol_seen != EOL_SEEN_LF
5588           && eol_seen != EOL_SEEN_CRLF
5589           && eol_seen != EOL_SEEN_CR)
5590         eol_seen = EOL_SEEN_LF;
5591       if (eol_seen != EOL_SEEN_NONE)
5592         eol_type = adjust_coding_eol_type (coding, eol_seen);
5593     }
5594
5595   if (EQ (eol_type, Qmac))
5596     {
5597       for (p = pbeg; p < pend; p++)
5598         if (*p == '\r')
5599           *p = '\n';
5600     }
5601   else if (EQ (eol_type, Qdos))
5602     {
5603       int n = 0;
5604
5605       if (NILP (coding->dst_object))
5606         {
5607           for (p = pend - 2; p >= pbeg; p--)
5608             if (*p == '\r')
5609               {
5610                 safe_bcopy ((char *) (p + 1), (char *) p, pend-- - p - 1);
5611                 n++;
5612               }
5613         }
5614       else
5615         {
5616           for (p = pend - 2; p >= pbeg; p--)
5617             if (*p == '\r')
5618               {
5619                 int pos_byte = coding->dst_pos_byte + (p - pbeg);
5620                 int pos = BYTE_TO_CHAR (pos_byte);
5621
5622                 del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
5623                 n++;
5624               }
5625         }
5626       coding->produced -= n;
5627       coding->produced_char -= n;
5628     }
5629 }
5630
5631
5632 /* Return a translation table (or list of them) from coding system
5633    attribute vector ATTRS for encoding (ENCODEP is nonzero) or
5634    decoding (ENCODEP is zero). */
5635
5636 static Lisp_Object
5637 get_translation_table (attrs, encodep, max_lookup)
5638      Lisp_Object attrs;
5639      int encodep, *max_lookup;
5640 {
5641   Lisp_Object standard, translation_table;
5642   Lisp_Object val;
5643
5644   if (encodep)
5645     translation_table = CODING_ATTR_ENCODE_TBL (attrs),
5646       standard = Vstandard_translation_table_for_encode;
5647   else
5648     translation_table = CODING_ATTR_DECODE_TBL (attrs),
5649       standard = Vstandard_translation_table_for_decode;
5650   if (NILP (translation_table))
5651     translation_table = standard;
5652   else
5653     {
5654       if (SYMBOLP (translation_table))
5655         translation_table = Fget (translation_table, Qtranslation_table);
5656       else if (CONSP (translation_table))
5657         {
5658           translation_table = Fcopy_sequence (translation_table);
5659           for (val = translation_table; CONSP (val); val = XCDR (val))
5660             if (SYMBOLP (XCAR (val)))
5661               XSETCAR (val, Fget (XCAR (val), Qtranslation_table));
5662         }
5663       if (CHAR_TABLE_P (standard))
5664         {
5665           if (CONSP (translation_table))
5666             translation_table = nconc2 (translation_table,
5667                                         Fcons (standard, Qnil));
5668           else
5669             translation_table = Fcons (translation_table,
5670                                        Fcons (standard, Qnil));
5671         }
5672     }
5673
5674   if (max_lookup)
5675     {
5676       *max_lookup = 1;
5677       if (CHAR_TABLE_P (translation_table)
5678           && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (translation_table)) > 1)
5679         {
5680           val = XCHAR_TABLE (translation_table)->extras[1];
5681           if (NATNUMP (val) && *max_lookup < XFASTINT (val))
5682             *max_lookup = XFASTINT (val);
5683         }
5684       else if (CONSP (translation_table))
5685         {
5686           Lisp_Object tail, val;
5687
5688           for (tail = translation_table; CONSP (tail); tail = XCDR (tail))
5689             if (CHAR_TABLE_P (XCAR (tail))
5690                 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (XCAR (tail))) > 1)
5691               {
5692                 val = XCHAR_TABLE (XCAR (tail))->extras[1];
5693                 if (NATNUMP (val) && *max_lookup < XFASTINT (val))
5694                   *max_lookup = XFASTINT (val);
5695               }
5696         }
5697     }
5698   return translation_table;
5699 }
5700
/* Look up the character C in TABLE, which is a char-table or a list
   of char-tables, and leave the outcome in C and TRANS.

   If the value found is a character, C is replaced by it and TRANS
   is set to Qnil; with a list, the lookup then goes on with the
   following tables using the new C.  If the value found is any other
   non-nil object, it is left in TRANS (and, with a list, the lookup
   stops there).  If TABLE is neither a char-table nor a cons, TRANS
   is simply set to Qnil.

   C and TRANS must be lvalues.  The arguments are evaluated more
   than once, so they must not have side effects.  */

#define LOOKUP_TRANSLATION_TABLE(table, c, trans)                       \
  do {                                                                  \
    (trans) = Qnil;                                                     \
    if (CHAR_TABLE_P (table))                                           \
      {                                                                 \
        (trans) = CHAR_TABLE_REF ((table), (c));                        \
        if (CHARACTERP (trans))                                         \
          (c) = XFASTINT (trans), (trans) = Qnil;                       \
      }                                                                 \
    else if (CONSP (table))                                             \
      {                                                                 \
        Lisp_Object lookup_tail_;                                       \
                                                                        \
        for (lookup_tail_ = (table); CONSP (lookup_tail_);              \
             lookup_tail_ = XCDR (lookup_tail_))                        \
          if (CHAR_TABLE_P (XCAR (lookup_tail_)))                       \
            {                                                           \
              (trans) = CHAR_TABLE_REF (XCAR (lookup_tail_), (c));      \
              if (CHARACTERP (trans))                                   \
                (c) = XFASTINT (trans), (trans) = Qnil;                 \
              else if (! NILP (trans))                                  \
                break;                                                  \
            }                                                           \
      }                                                                 \
  } while (0)
5725
5726
5727 static Lisp_Object
5728 get_translation (val, buf, buf_end, last_block, from_nchars, to_nchars)
5729      Lisp_Object val;
5730      int *buf, *buf_end;
5731      int last_block;
5732      int *from_nchars, *to_nchars;
5733 {
5734   /* VAL is TO or (([FROM-CHAR ...] .  TO) ...) where TO is TO-CHAR or
5735      [TO-CHAR ...].  */
5736   if (CONSP (val))
5737     {
5738       Lisp_Object from, tail;
5739       int i, len;
5740
5741       for (tail = val; CONSP (tail); tail = XCDR (tail))
5742         {
5743           val = XCAR (tail);
5744           from = XCAR (val);
5745           len = ASIZE (from);
5746           for (i = 0; i < len; i++)
5747             {
5748               if (buf + i == buf_end)
5749                 {
5750                   if (! last_block)
5751                     return Qt;
5752                   break;
5753                 }
5754               if (XINT (AREF (from, i)) != buf[i])
5755                 break;
5756             }
5757           if (i == len)
5758             {
5759               val = XCDR (val);
5760               *from_nchars = len;
5761               break;
5762             }
5763         }
5764       if (! CONSP (tail))
5765         return Qnil;
5766     }
5767   if (VECTORP (val))
5768     *buf = XINT (AREF (val, 0)), *to_nchars = ASIZE (val);
5769   else
5770     *buf = XINT (val);
5771   return val;
5772 }
5773
5774
5775 static int
5776 produce_chars (coding, translation_table, last_block)
5777      struct coding_system *coding;
5778      Lisp_Object translation_table;
5779      int last_block;
5780 {
5781   unsigned char *dst = coding->destination + coding->produced;
5782   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5783   int produced;
5784   int produced_chars = 0;
5785   int carryover = 0;
5786
5787   if (! coding->chars_at_source)
5788     {
5789       /* Characters are in coding->charbuf.  */
5790       int *buf = coding->charbuf;
5791       int *buf_end = buf + coding->charbuf_used;
5792
5793       if (BUFFERP (coding->src_object)
5794           && EQ (coding->src_object, coding->dst_object))
5795         dst_end = ((unsigned char *) coding->source) + coding->consumed;
5796
5797       while (buf < buf_end)
5798         {
5799           int c = *buf, i;
5800
5801           if (c >= 0)
5802             {
5803               int from_nchars = 1, to_nchars = 1;
5804               Lisp_Object trans = Qnil;
5805
5806               LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
5807               if (! NILP (trans))
5808                 {
5809                   trans = get_translation (trans, buf, buf_end, last_block,
5810                                            &from_nchars, &to_nchars);
5811                   if (EQ (trans, Qt))
5812                     break;
5813                   c = *buf;
5814                 }
5815
5816               if (dst + MAX_MULTIBYTE_LENGTH * to_nchars > dst_end)
5817                 {
5818                   dst = alloc_destination (coding,
5819                                            buf_end - buf
5820                                            + MAX_MULTIBYTE_LENGTH * to_nchars,
5821                                            dst);
5822                   dst_end = coding->destination + coding->dst_bytes;
5823                 }
5824
5825               for (i = 0; i < to_nchars; i++)
5826                 {
5827                   if (i > 0)
5828                     c = XINT (AREF (trans, i));
5829                   if (coding->dst_multibyte
5830                       || ! CHAR_BYTE8_P (c))
5831                     CHAR_STRING_ADVANCE (c, dst);
5832                   else
5833                     *dst++ = CHAR_TO_BYTE8 (c);
5834                 }
5835               produced_chars += to_nchars;
5836               *buf++ = to_nchars;
5837               while (--from_nchars > 0)
5838                 *buf++ = 0;
5839             }
5840           else
5841             /* This is an annotation datum.  (-C) is the length.  */
5842             buf += -c;
5843         }
5844       carryover = buf_end - buf;
5845     }
5846   else
5847     {
5848       const unsigned char *src = coding->source;
5849       const unsigned char *src_end = src + coding->src_bytes;
5850       Lisp_Object eol_type;
5851
5852       eol_type = CODING_ID_EOL_TYPE (coding->id);
5853
5854       if (coding->src_multibyte != coding->dst_multibyte)
5855         {
5856           if (coding->src_multibyte)
5857             {
5858               int multibytep = 1;
5859               int consumed_chars;
5860
5861               while (1)
5862                 {
5863                   const unsigned char *src_base = src;
5864                   int c;
5865
5866                   ONE_MORE_BYTE (c);
5867                   if (c == '\r')
5868                     {
5869                       if (EQ (eol_type, Qdos))
5870                         {
5871                           if (src == src_end)
5872                             {
5873                               record_conversion_result
5874                                 (coding, CODING_RESULT_INSUFFICIENT_SRC);
5875                               goto no_more_source;
5876                             }
5877                           if (*src == '\n')
5878                             c = *src++;
5879                         }
5880                       else if (EQ (eol_type, Qmac))
5881                         c = '\n';
5882                     }
5883                   if (dst == dst_end)
5884                     {
5885                       coding->consumed = src - coding->source;
5886
5887                     if (EQ (coding->src_object, coding->dst_object))
5888                       dst_end = (unsigned char *) src;
5889                     if (dst == dst_end)
5890                       {
5891                         dst = alloc_destination (coding, src_end - src + 1,
5892                                                  dst);
5893                         dst_end = coding->destination + coding->dst_bytes;
5894                         coding_set_source (coding);
5895                         src = coding->source + coding->consumed;
5896                         src_end = coding->source + coding->src_bytes;
5897                       }
5898                     }
5899                   *dst++ = c;
5900                   produced_chars++;
5901                 }
5902             no_more_source:
5903               ;
5904             }
5905           else
5906             while (src < src_end)
5907               {
5908                 int multibytep = 1;
5909                 int c = *src++;
5910
5911                 if (c == '\r')
5912                   {
5913                     if (EQ (eol_type, Qdos))
5914                       {
5915                         if (src < src_end
5916                             && *src == '\n')
5917                           c = *src++;
5918                       }
5919                     else if (EQ (eol_type, Qmac))
5920                       c = '\n';
5921                   }
5922                 if (dst >= dst_end - 1)
5923                   {
5924                     coding->consumed = src - coding->source;
5925
5926                     if (EQ (coding->src_object, coding->dst_object))
5927                       dst_end = (unsigned char *) src;
5928                     if (dst >= dst_end - 1)
5929                       {
5930                         dst = alloc_destination (coding, src_end - src + 2,
5931                                                  dst);
5932                         dst_end = coding->destination + coding->dst_bytes;
5933                         coding_set_source (coding);
5934                         src = coding->source + coding->consumed;
5935                         src_end = coding->source + coding->src_bytes;
5936                       }
5937                   }
5938                 EMIT_ONE_BYTE (c);
5939               }
5940         }
5941       else
5942         {
5943           if (!EQ (coding->src_object, coding->dst_object))
5944             {
5945               int require = coding->src_bytes - coding->dst_bytes;
5946
5947               if (require > 0)
5948                 {
5949                   EMACS_INT offset = src - coding->source;
5950
5951                   dst = alloc_destination (coding, require, dst);
5952                   coding_set_source (coding);
5953                   src = coding->source + offset;
5954                   src_end = coding->source + coding->src_bytes;
5955                 }
5956             }
5957           produced_chars = coding->src_chars;
5958           while (src < src_end)
5959             {
5960               int c = *src++;
5961
5962               if (c == '\r')
5963                 {
5964                   if (EQ (eol_type, Qdos))
5965                     {
5966                       if (src < src_end
5967                           && *src == '\n')
5968                         c = *src++;
5969                       produced_chars--;
5970                     }
5971                   else if (EQ (eol_type, Qmac))
5972                     c = '\n';
5973                 }
5974               *dst++ = c;
5975             }
5976         }
5977       coding->consumed = coding->src_bytes;
5978       coding->consumed_char = coding->src_chars;
5979     }
5980
5981   produced = dst - (coding->destination + coding->produced);
5982   if (BUFFERP (coding->dst_object))
5983     insert_from_gap (produced_chars, produced);
5984   coding->produced += produced;
5985   coding->produced_char += produced_chars;
5986   return carryover;
5987 }
5988
5989 /* Compose text in CODING->object according to the annotation data at
5990    CHARBUF.  CHARBUF is an array:
5991      [ -LENGTH ANNOTATION_MASK FROM TO METHOD COMP_LEN [ COMPONENTS... ] ]
5992  */
5993
5994 static INLINE void
5995 produce_composition (coding, charbuf, pos)
5996      struct coding_system *coding;
5997      int *charbuf;
5998      EMACS_INT pos;
5999 {
6000   int len;
6001   EMACS_INT to;
6002   enum composition_method method;
6003   Lisp_Object components;
6004
6005   len = -charbuf[0];
6006   to = pos + charbuf[2];
6007   if (to <= pos)
6008     return;
6009   method = (enum composition_method) (charbuf[3]);
6010
6011   if (method == COMPOSITION_RELATIVE)
6012     components = Qnil;
6013   else if (method >= COMPOSITION_WITH_RULE
6014            && method <= COMPOSITION_WITH_RULE_ALTCHARS)
6015     {
6016       Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
6017       int i;
6018
6019       len -= 4;
6020       charbuf += 4;
6021       for (i = 0; i < len; i++)
6022         {
6023           args[i] = make_number (charbuf[i]);
6024           if (args[i] < 0)
6025             return;
6026         }
6027       components = (method == COMPOSITION_WITH_ALTCHARS
6028                     ? Fstring (len, args) : Fvector (len, args));
6029     }
6030   else
6031     return;
6032   compose_text (pos, to, components, Qnil, coding->dst_object);
6033 }
6034
6035
6036 /* Put `charset' property on text in CODING->object according to
6037    the annotation data at CHARBUF.  CHARBUF is an array:
6038      [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ]
6039  */
6040
6041 static INLINE void
6042 produce_charset (coding, charbuf, pos)
6043      struct coding_system *coding;
6044      int *charbuf;
6045      EMACS_INT pos;
6046 {
6047   EMACS_INT from = pos - charbuf[2];
6048   struct charset *charset = CHARSET_FROM_ID (charbuf[3]);
6049
6050   Fput_text_property (make_number (from), make_number (pos),
6051                       Qcharset, CHARSET_NAME (charset),
6052                       coding->dst_object);
6053 }
6054
6055
/* Number of `int' elements first requested for the work area of a
   conversion.  */
#define CHARBUF_SIZE 0x4000

/* Allocate with alloca the work area CODING->charbuf and record its
   size, counted in `int' elements, in CODING->charbuf_size.  The
   request starts at CHARBUF_SIZE elements and is halved each time
   alloca returns NULL, as long as it stays above 1024.

   The area lives in the stack frame of the function that uses this
   macro, so it must not be referenced after that function returns.

   Beware of the hidden control flow: if no area can be obtained,
   CODING_RESULT_INSUFFICIENT_MEM is recorded and the ENCLOSING
   FUNCTION returns CODING->result.  */

#define ALLOC_CONVERSION_WORK_AREA(coding)                              \
  do {                                                                  \
    int size = CHARBUF_SIZE;                                            \
                                                                        \
    (coding)->charbuf = NULL;                                           \
    while (size > 1024)                                                 \
      {                                                                 \
        (coding)->charbuf = (int *) alloca (sizeof (int) * size);       \
        if ((coding)->charbuf)                                          \
          break;                                                        \
        size >>= 1;                                                     \
      }                                                                 \
    if (! (coding)->charbuf)                                            \
      {                                                                 \
        record_conversion_result ((coding),                             \
                                  CODING_RESULT_INSUFFICIENT_MEM);      \
        return (coding)->result;                                        \
      }                                                                 \
    (coding)->charbuf_size = size;                                      \
  } while (0)
6077
6078
6079 static void
6080 produce_annotation (coding, pos)
6081      struct coding_system *coding;
6082      EMACS_INT pos;
6083 {
6084   int *charbuf = coding->charbuf;
6085   int *charbuf_end = charbuf + coding->charbuf_used;
6086
6087   if (NILP (coding->dst_object))
6088     return;
6089
6090   while (charbuf < charbuf_end)
6091     {
6092       if (*charbuf >= 0)
6093         pos += *charbuf++;
6094       else
6095         {
6096           int len = -*charbuf;
6097           switch (charbuf[1])
6098             {
6099             case CODING_ANNOTATE_COMPOSITION_MASK:
6100               produce_composition (coding, charbuf, pos);
6101               break;
6102             case CODING_ANNOTATE_CHARSET_MASK:
6103               produce_charset (coding, charbuf, pos);
6104               break;
6105             default:
6106               abort ();
6107             }
6108           charbuf += len;
6109         }
6110     }
6111 }
6112
6113 /* Decode the data at CODING->src_object into CODING->dst_object.
6114    CODING->src_object is a buffer, a string, or nil.
6115    CODING->dst_object is a buffer.
6116
6117    If CODING->src_object is a buffer, it must be the current buffer.
6118    In this case, if CODING->src_pos is positive, it is a position of
6119    the source text in the buffer, otherwise, the source text is in the
6120    gap area of the buffer, and CODING->src_pos specifies the offset of
6121    the text from GPT (which must be the same as PT).  If this is the
6122    same buffer as CODING->dst_object, CODING->src_pos must be
6123    negative.
6124
6125    If CODING->src_object is a string, CODING->src_pos in an index to
6126    that string.
6127
6128    If CODING->src_object is nil, CODING->source must already point to
6129    the non-relocatable memory area.  In this case, CODING->src_pos is
6130    an offset from CODING->source.
6131
6132    The decoded data is inserted at the current point of the buffer
6133    CODING->dst_object.
6134 */
6135
6136 static int
6137 decode_coding (coding)
6138      struct coding_system *coding;
6139 {
6140   Lisp_Object attrs;
6141   Lisp_Object undo_list;
6142   Lisp_Object translation_table;
6143   int carryover;
6144   int i;
6145
6146   if (BUFFERP (coding->src_object)
6147       && coding->src_pos > 0
6148       && coding->src_pos < GPT
6149       && coding->src_pos + coding->src_chars > GPT)
6150     move_gap_both (coding->src_pos, coding->src_pos_byte);
6151
6152   undo_list = Qt;
6153   if (BUFFERP (coding->dst_object))
6154     {
6155       if (current_buffer != XBUFFER (coding->dst_object))
6156         set_buffer_internal (XBUFFER (coding->dst_object));
6157       if (GPT != PT)
6158         move_gap_both (PT, PT_BYTE);
6159       undo_list = current_buffer->undo_list;
6160       current_buffer->undo_list = Qt;
6161     }
6162
6163   coding->consumed = coding->consumed_char = 0;
6164   coding->produced = coding->produced_char = 0;
6165   coding->chars_at_source = 0;
6166   record_conversion_result (coding, CODING_RESULT_SUCCESS);
6167   coding->errors = 0;
6168
6169   ALLOC_CONVERSION_WORK_AREA (coding);
6170
6171   attrs = CODING_ID_ATTRS (coding->id);
6172   translation_table = get_translation_table (attrs, 0, NULL);
6173
6174   carryover = 0;
6175   do
6176     {
6177       EMACS_INT pos = coding->dst_pos + coding->produced_char;
6178
6179       coding_set_source (coding);
6180       coding->annotated = 0;
6181       coding->charbuf_used = carryover;
6182       (*(coding->decoder)) (coding);
6183       coding_set_destination (coding);
6184       carryover = produce_chars (coding, translation_table, 0);
6185       if (coding->annotated)
6186         produce_annotation (coding, pos);
6187       for (i = 0; i < carryover; i++)
6188         coding->charbuf[i]
6189           = coding->charbuf[coding->charbuf_used - carryover + i];
6190     }
6191   while (coding->consumed < coding->src_bytes
6192          && (coding->result == CODING_RESULT_SUCCESS
6193              || coding->result == CODING_RESULT_INVALID_SRC));
6194
6195   if (carryover > 0)
6196     {
6197       coding_set_destination (coding);
6198       coding->charbuf_used = carryover;
6199       produce_chars (coding, translation_table, 1);
6200     }
6201
6202   coding->carryover_bytes = 0;
6203   if (coding->consumed < coding->src_bytes)
6204     {
6205       int nbytes = coding->src_bytes - coding->consumed;
6206       const unsigned char *src;
6207
6208       coding_set_source (coding);
6209       coding_set_destination (coding);
6210       src = coding->source + coding->consumed;
6211
6212       if (coding->mode & CODING_MODE_LAST_BLOCK)
6213         {
6214           /* Flush out unprocessed data as binary chars.  We are sure
6215              that the number of data is less than the size of
6216              coding->charbuf.  */
6217           coding->charbuf_used = 0;
6218           while (nbytes-- > 0)
6219             {
6220               int c = *src++;
6221
6222               if (c & 0x80)
6223                 c = BYTE8_TO_CHAR (c);
6224               coding->charbuf[coding->charbuf_used++] = c;
6225             }
6226           produce_chars (coding, Qnil, 1);
6227         }
6228       else
6229         {
6230           /* Record unprocessed bytes in coding->carryover.  We are
6231              sure that the number of data is less than the size of
6232              coding->carryover.  */
6233           unsigned char *p = coding->carryover;
6234
6235           coding->carryover_bytes = nbytes;
6236           while (nbytes-- > 0)
6237             *p++ = *src++;
6238         }
6239       coding->consumed = coding->src_bytes;
6240     }
6241
6242   if (BUFFERP (coding->dst_object))
6243     {
6244       current_buffer->undo_list = undo_list;
6245       record_insert (coding->dst_pos, coding->produced_char);
6246     }
6247   if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix))
6248     decode_eol (coding);
6249   return coding->result;
6250 }
6251
6252
6253 /* Extract an annotation datum from a composition starting at POS and
6254    ending before LIMIT of CODING->src_object (buffer or string), store
6255    the data in BUF, set *STOP to a starting position of the next
6256    composition (if any) or to LIMIT, and return the address of the
6257    next element of BUF.
6258
6259    If such an annotation is not found, set *STOP to a starting
6260    position of a composition after POS (if any) or to LIMIT, and
6261    return BUF.  */
6262
6263 static INLINE int *
6264 handle_composition_annotation (pos, limit, coding, buf, stop)
6265      EMACS_INT pos, limit;
6266      struct coding_system *coding;
6267      int *buf;
6268      EMACS_INT *stop;
6269 {
6270   EMACS_INT start, end;
6271   Lisp_Object prop;
6272
6273   if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
6274       || end > limit)
6275     *stop = limit;
6276   else if (start > pos)
6277     *stop = start;
6278   else
6279     {
6280       if (start == pos)
6281         {
6282           /* We found a composition.  Store the corresponding
6283              annotation data in BUF.  */
6284           int *head = buf;
6285           enum composition_method method = COMPOSITION_METHOD (prop);
6286           int nchars = COMPOSITION_LENGTH (prop);
6287
6288           ADD_COMPOSITION_DATA (buf, nchars, method);
6289           if (method != COMPOSITION_RELATIVE)
6290             {
6291               Lisp_Object components;
6292               int len, i, i_byte;
6293
6294               components = COMPOSITION_COMPONENTS (prop);
6295               if (VECTORP (components))
6296                 {
6297                   len = XVECTOR (components)->size;
6298                   for (i = 0; i < len; i++)
6299                     *buf++ = XINT (AREF (components, i));
6300                 }
6301               else if (STRINGP (components))
6302                 {
6303                   len = SCHARS (components);
6304                   i = i_byte = 0;
6305                   while (i < len)
6306                     {
6307                       FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
6308                       buf++;
6309                     }
6310                 }
6311               else if (INTEGERP (components))
6312                 {
6313                   len = 1;
6314                   *buf++ = XINT (components);
6315                 }
6316               else if (CONSP (components))
6317                 {
6318                   for (len = 0; CONSP (components);
6319                        len++, components = XCDR (components))
6320                     *buf++ = XINT (XCAR (components));
6321                 }
6322               else
6323                 abort ();
6324               *head -= len;
6325             }
6326         }
6327
6328       if (find_composition (end, limit, &start, &end, &prop,
6329                             coding->src_object)
6330           && end <= limit)
6331         *stop = start;
6332       else
6333         *stop = limit;
6334     }
6335   return buf;
6336 }
6337
6338
6339 /* Extract an annotation datum from a text property `charset' at POS of
6340    CODING->src_object (buffer of string), store the data in BUF, set
6341    *STOP to the position where the value of `charset' property changes
6342    (limiting by LIMIT), and return the address of the next element of
6343    BUF.
6344
6345    If the property value is nil, set *STOP to the position where the
6346    property value is non-nil (limiting by LIMIT), and return BUF.  */
6347
6348 static INLINE int *
6349 handle_charset_annotation (pos, limit, coding, buf, stop)
6350      EMACS_INT pos, limit;
6351      struct coding_system *coding;
6352      int *buf;
6353      EMACS_INT *stop;
6354 {
6355   Lisp_Object val, next;
6356   int id;
6357
6358   val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
6359   if (! NILP (val) && CHARSETP (val))
6360     id = XINT (CHARSET_SYMBOL_ID (val));
6361   else
6362     id = -1;
6363   ADD_CHARSET_DATA (buf, 0, id);
6364   next = Fnext_single_property_change (make_number (pos), Qcharset,
6365                                        coding->src_object,
6366                                        make_number (limit));
6367   *stop = XINT (next);
6368   return buf;
6369 }
6370
6371
6372 static void
6373 consume_chars (coding, translation_table, max_lookup)
6374      struct coding_system *coding;
6375      Lisp_Object translation_table;
6376      int max_lookup;
6377 {
6378   int *buf = coding->charbuf;
6379   int *buf_end = coding->charbuf + coding->charbuf_size;
6380   const unsigned char *src = coding->source + coding->consumed;
6381   const unsigned char *src_end = coding->source + coding->src_bytes;
6382   EMACS_INT pos = coding->src_pos + coding->consumed_char;
6383   EMACS_INT end_pos = coding->src_pos + coding->src_chars;
6384   int multibytep = coding->src_multibyte;
6385   Lisp_Object eol_type;
6386   int c;
6387   EMACS_INT stop, stop_composition, stop_charset;
6388   int *lookup_buf = NULL;
6389
6390   if (! NILP (translation_table))
6391     lookup_buf = alloca (sizeof (int) * max_lookup);
6392
6393   eol_type = CODING_ID_EOL_TYPE (coding->id);
6394   if (VECTORP (eol_type))
6395     eol_type = Qunix;
6396
6397   /* Note: composition handling is not yet implemented.  */
6398   coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
6399
6400   if (NILP (coding->src_object))
6401     stop = stop_composition = stop_charset = end_pos;
6402   else
6403     {
6404       if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
6405         stop = stop_composition = pos;
6406       else
6407         stop = stop_composition = end_pos;
6408       if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
6409         stop = stop_charset = pos;
6410       else
6411         stop_charset = end_pos;
6412     }
6413
6414   /* Compensate for CRLF and conversion.  */
6415   buf_end -= 1 + MAX_ANNOTATION_LENGTH;
6416   while (buf < buf_end)
6417     {
6418       Lisp_Object trans;
6419
6420       if (pos == stop)
6421         {
6422           if (pos == end_pos)
6423             break;
6424           if (pos == stop_composition)
6425             buf = handle_composition_annotation (pos, end_pos, coding,
6426                                                  buf, &stop_composition);
6427           if (pos == stop_charset)
6428             buf = handle_charset_annotation (pos, end_pos, coding,
6429                                              buf, &stop_charset);
6430           stop = (stop_composition < stop_charset
6431                   ? stop_composition : stop_charset);
6432         }
6433
6434       if (! multibytep)
6435         {
6436           EMACS_INT bytes;
6437
6438           if (coding->encoder == encode_coding_raw_text)
6439             c = *src++, pos++;
6440           else if ((bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
6441             c = STRING_CHAR_ADVANCE (src), pos += bytes;
6442           else
6443             c = BYTE8_TO_CHAR (*src), src++, pos++;
6444         }
6445       else
6446         c = STRING_CHAR_ADVANCE (src), pos++;
6447       if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
6448         c = '\n';
6449       if (! EQ (eol_type, Qunix))
6450         {
6451           if (c == '\n')
6452             {
6453               if (EQ (eol_type, Qdos))
6454                 *buf++ = '\r';
6455               else
6456                 c = '\r';
6457             }
6458         }
6459
6460       trans = Qnil;
6461       LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
6462       if (NILP (trans))
6463         *buf++ = c;
6464       else
6465         {
6466           int from_nchars = 1, to_nchars = 1;
6467           int *lookup_buf_end;
6468           const unsigned char *p = src;
6469           int i;
6470
6471           lookup_buf[0] = c;
6472           for (i = 1; i < max_lookup && p < src_end; i++)
6473             lookup_buf[i] = STRING_CHAR_ADVANCE (p);
6474           lookup_buf_end = lookup_buf + i;
6475           trans = get_translation (trans, lookup_buf, lookup_buf_end, 1,
6476                                    &from_nchars, &to_nchars);
6477           if (EQ (trans, Qt)
6478               || buf + to_nchars > buf_end)
6479             break;
6480           *buf++ = *lookup_buf;
6481           for (i = 1; i < to_nchars; i++)
6482             *buf++ = XINT (AREF (trans, i));
6483           for (i = 1; i < from_nchars; i++, pos++)
6484             src += MULTIBYTE_LENGTH_NO_CHECK (src);
6485         }
6486     }
6487
6488   coding->consumed = src - coding->source;
6489   coding->consumed_char = pos - coding->src_pos;
6490   coding->charbuf_used = buf - coding->charbuf;
6491   coding->chars_at_source = 0;
6492 }
6493
6494
6495 /* Encode the text at CODING->src_object into CODING->dst_object.
6496    CODING->src_object is a buffer or a string.
6497    CODING->dst_object is a buffer or nil.
6498
6499    If CODING->src_object is a buffer, it must be the current buffer.
6500    In this case, if CODING->src_pos is positive, it is a position of
6501    the source text in the buffer, otherwise. the source text is in the
6502    gap area of the buffer, and coding->src_pos specifies the offset of
6503    the text from GPT (which must be the same as PT).  If this is the
6504    same buffer as CODING->dst_object, CODING->src_pos must be
6505    negative and CODING should not have `pre-write-conversion'.
6506
6507    If CODING->src_object is a string, CODING should not have
6508    `pre-write-conversion'.
6509
6510    If CODING->dst_object is a buffer, the encoded data is inserted at
6511    the current point of that buffer.
6512
6513    If CODING->dst_object is nil, the encoded data is placed at the
6514    memory area specified by CODING->destination.  */
6515
6516 static int
6517 encode_coding (coding)
6518      struct coding_system *coding;
6519 {
6520   Lisp_Object attrs;
6521   Lisp_Object translation_table;
6522   int max_lookup;
6523
6524   attrs = CODING_ID_ATTRS (coding->id);
6525   if (coding->encoder == encode_coding_raw_text)
6526     translation_table = Qnil, max_lookup = 0;
6527   else
6528     translation_table = get_translation_table (attrs, 1, &max_lookup);
6529
6530   if (BUFFERP (coding->dst_object))
6531     {
6532       set_buffer_internal (XBUFFER (coding->dst_object));
6533       coding->dst_multibyte
6534         = ! NILP (current_buffer->enable_multibyte_characters);
6535     }
6536
6537   coding->consumed = coding->consumed_char = 0;
6538   coding->produced = coding->produced_char = 0;
6539   record_conversion_result (coding, CODING_RESULT_SUCCESS);
6540   coding->errors = 0;
6541
6542   ALLOC_CONVERSION_WORK_AREA (coding);
6543
6544   do {
6545     coding_set_source (coding);
6546     consume_chars (coding, translation_table, max_lookup);
6547     coding_set_destination (coding);
6548     (*(coding->encoder)) (coding);
6549   } while (coding->consumed_char < coding->src_chars);
6550
6551   if (BUFFERP (coding->dst_object))
6552     insert_from_gap (coding->produced_char, coding->produced);
6553
6554   return (coding->result);
6555 }
6556
6557
/* Name (or base name) of work buffer for code conversion.  The names
   of additional work buffers are generated from it by
   make_conversion_work_buffer.  */
static Lisp_Object Vcode_conversion_workbuf_name;

/* A working buffer used by the top level conversion.  Once it is
   created, it is never destroyed.  It has the name
   Vcode_conversion_workbuf_name.  The other working buffers are
   destroyed after the use is finished, and their names are modified
   versions of Vcode_conversion_workbuf_name.  */
static Lisp_Object Vcode_conversion_reused_workbuf;

/* Nonzero while Vcode_conversion_reused_workbuf is already in use.
   It is reset to 0 by code_conversion_restore when that buffer is
   released.  */
static int reused_workbuf_in_use;
6570
6571
6572 /* Return a working buffer of code convesion.  MULTIBYTE specifies the
6573    multibyteness of returning buffer.  */
6574
6575 static Lisp_Object
6576 make_conversion_work_buffer (multibyte)
6577      int multibyte;
6578 {
6579   Lisp_Object name, workbuf;
6580   struct buffer *current;
6581
6582   if (reused_workbuf_in_use++)
6583     {
6584       name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
6585       workbuf = Fget_buffer_create (name);
6586     }
6587   else
6588     {
6589       name = Vcode_conversion_workbuf_name;
6590       workbuf = Fget_buffer_create (name);
6591       if (NILP (Vcode_conversion_reused_workbuf))
6592         Vcode_conversion_reused_workbuf = workbuf;
6593     }
6594   current = current_buffer;
6595   set_buffer_internal (XBUFFER (workbuf));
6596   Ferase_buffer ();
6597   current_buffer->undo_list = Qt;
6598   current_buffer->enable_multibyte_characters = multibyte ? Qt : Qnil;
6599   set_buffer_internal (current);
6600   return workbuf;
6601 }
6602
6603
6604 static Lisp_Object
6605 code_conversion_restore (arg)
6606      Lisp_Object arg;
6607 {
6608   Lisp_Object current, workbuf;
6609   struct gcpro gcpro1;
6610
6611   GCPRO1 (arg);
6612   current = XCAR (arg);
6613   workbuf = XCDR (arg);
6614   if (! NILP (workbuf))
6615     {
6616       if (EQ (workbuf, Vcode_conversion_reused_workbuf))
6617         reused_workbuf_in_use = 0;
6618       else if (! NILP (Fbuffer_live_p (workbuf)))
6619         Fkill_buffer (workbuf);
6620     }
6621   set_buffer_internal (XBUFFER (current));
6622   UNGCPRO;
6623   return Qnil;
6624 }
6625
6626 Lisp_Object
6627 code_conversion_save (with_work_buf, multibyte)
6628      int with_work_buf, multibyte;
6629 {
6630   Lisp_Object workbuf = Qnil;
6631
6632   if (with_work_buf)
6633     workbuf = make_conversion_work_buffer (multibyte);
6634   record_unwind_protect (code_conversion_restore,
6635                          Fcons (Fcurrent_buffer (), workbuf));
6636   return workbuf;
6637 }
6638
6639 int
6640 decode_coding_gap (coding, chars, bytes)
6641      struct coding_system *coding;
6642      EMACS_INT chars, bytes;
6643 {
6644   int count = specpdl_ptr - specpdl;
6645   Lisp_Object attrs;
6646
6647   code_conversion_save (0, 0);
6648
6649   coding->src_object = Fcurrent_buffer ();
6650   coding->src_chars = chars;
6651   coding->src_bytes = bytes;
6652   coding->src_pos = -chars;
6653   coding->src_pos_byte = -bytes;
6654   coding->src_multibyte = chars < bytes;
6655   coding->dst_object = coding->src_object;
6656   coding->dst_pos = PT;
6657   coding->dst_pos_byte = PT_BYTE;
6658   coding->dst_multibyte = ! NILP (current_buffer->enable_multibyte_characters);
6659
6660   if (CODING_REQUIRE_DETECTION (coding))
6661     detect_coding (coding);
6662
6663   coding->mode |= CODING_MODE_LAST_BLOCK;
6664   decode_coding (coding);
6665
6666   attrs = CODING_ID_ATTRS (coding->id);
6667   if (! NILP (CODING_ATTR_POST_READ (attrs)))
6668     {
6669       EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
6670       Lisp_Object val;
6671
6672       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
6673       val = call1 (CODING_ATTR_POST_READ (attrs),
6674                    make_number (coding->produced_char));
6675       CHECK_NATNUM (val);
6676       coding->produced_char += Z - prev_Z;
6677       coding->produced += Z_BYTE - prev_Z_BYTE;
6678     }
6679
6680   unbind_to (count, Qnil);
6681   return coding->result;
6682 }
6683
6684 int
6685 encode_coding_gap (coding, chars, bytes)
6686      struct coding_system *coding;
6687      EMACS_INT chars, bytes;
6688 {
6689   int count = specpdl_ptr - specpdl;
6690
6691   code_conversion_save (0, 0);
6692
6693   coding->src_object = Fcurrent_buffer ();
6694   coding->src_chars = chars;
6695   coding->src_bytes = bytes;
6696   coding->src_pos = -chars;
6697   coding->src_pos_byte = -bytes;
6698   coding->src_multibyte = chars < bytes;
6699   coding->dst_object = coding->src_object;
6700   coding->dst_pos = PT;
6701   coding->dst_pos_byte = PT_BYTE;
6702
6703   encode_coding (coding);
6704
6705   unbind_to (count, Qnil);
6706   return coding->result;
6707 }
6708
6709
6710 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
6711    SRC_OBJECT into DST_OBJECT by coding context CODING.
6712
6713    SRC_OBJECT is a buffer, a string, or Qnil.
6714
6715    If it is a buffer, the text is at point of the buffer.  FROM and TO
6716    are positions in the buffer.
6717
6718    If it is a string, the text is at the beginning of the string.
6719    FROM and TO are indices to the string.
6720
6721    If it is nil, the text is at coding->source.  FROM and TO are
6722    indices to coding->source.
6723
6724    DST_OBJECT is a buffer, Qt, or Qnil.
6725
6726    If it is a buffer, the decoded text is inserted at point of the
6727    buffer.  If the buffer is the same as SRC_OBJECT, the source text
6728    is deleted.
6729
6730    If it is Qt, a string is made from the decoded text, and
6731    set in CODING->dst_object.
6732
6733    If it is Qnil, the decoded text is stored at CODING->destination.
6734    The caller must allocate CODING->dst_bytes bytes at
6735    CODING->destination by xmalloc.  If the decoded text is longer than
6736    CODING->dst_bytes, CODING->destination is relocated by xrealloc.
6737  */
6738
6739 void
6740 decode_coding_object (coding, src_object, from, from_byte, to, to_byte,
6741                       dst_object)
6742      struct coding_system *coding;
6743      Lisp_Object src_object;
6744      EMACS_INT from, from_byte, to, to_byte;
6745      Lisp_Object dst_object;
6746 {
6747   int count = specpdl_ptr - specpdl;
6748   unsigned char *destination;
6749   EMACS_INT dst_bytes;
6750   EMACS_INT chars = to - from;
6751   EMACS_INT bytes = to_byte - from_byte;
6752   Lisp_Object attrs;
6753   Lisp_Object buffer;
6754   int saved_pt = -1, saved_pt_byte;
6755
6756   buffer = Fcurrent_buffer ();
6757
6758   if (NILP (dst_object))
6759     {
6760       destination = coding->destination;
6761       dst_bytes = coding->dst_bytes;
6762     }
6763
6764   coding->src_object = src_object;
6765   coding->src_chars = chars;
6766   coding->src_bytes = bytes;
6767   coding->src_multibyte = chars < bytes;
6768
6769   if (STRINGP (src_object))
6770     {
6771       coding->src_pos = from;
6772       coding->src_pos_byte = from_byte;
6773     }
6774   else if (BUFFERP (src_object))
6775     {
6776       set_buffer_internal (XBUFFER (src_object));
6777       if (from != GPT)
6778         move_gap_both (from, from_byte);
6779       if (EQ (src_object, dst_object))
6780         {
6781           saved_pt = PT, saved_pt_byte = PT_BYTE;
6782           TEMP_SET_PT_BOTH (from, from_byte);
6783           del_range_both (from, from_byte, to, to_byte, 1);
6784           coding->src_pos = -chars;
6785           coding->src_pos_byte = -bytes;
6786         }
6787       else
6788         {
6789           coding->src_pos = from;
6790           coding->src_pos_byte = from_byte;
6791         }
6792     }
6793
6794   if (CODING_REQUIRE_DETECTION (coding))
6795     detect_coding (coding);
6796   attrs = CODING_ID_ATTRS (coding->id);
6797
6798   if (EQ (dst_object, Qt)
6799       || (! NILP (CODING_ATTR_POST_READ (attrs))
6800           && NILP (dst_object)))
6801     {
6802       coding->dst_object = code_conversion_save (1, 1);
6803       coding->dst_pos = BEG;
6804       coding->dst_pos_byte = BEG_BYTE;
6805       coding->dst_multibyte = 1;
6806     }
6807   else if (BUFFERP (dst_object))
6808     {
6809       code_conversion_save (0, 0);
6810       coding->dst_object = dst_object;
6811       coding->dst_pos = BUF_PT (XBUFFER (dst_object));
6812       coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
6813       coding->dst_multibyte
6814         = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
6815     }
6816   else
6817     {
6818       code_conversion_save (0, 0);
6819       coding->dst_object = Qnil;
6820       coding->dst_multibyte = 1;
6821     }
6822
6823   decode_coding (coding);
6824
6825   if (BUFFERP (coding->dst_object))
6826     set_buffer_internal (XBUFFER (coding->dst_object));
6827
6828   if (! NILP (CODING_ATTR_POST_READ (attrs)))
6829     {
6830       struct gcpro gcpro1, gcpro2;
6831       EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
6832       Lisp_Object val;
6833
6834       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
6835       GCPRO2 (coding->src_object, coding->dst_object);
6836       val = safe_call1 (CODING_ATTR_POST_READ (attrs),
6837                         make_number (coding->produced_char));
6838       UNGCPRO;
6839       CHECK_NATNUM (val);
6840       coding->produced_char += Z - prev_Z;
6841       coding->produced += Z_BYTE - prev_Z_BYTE;
6842     }
6843
6844   if (EQ (dst_object, Qt))
6845     {
6846       coding->dst_object = Fbuffer_string ();
6847     }
6848   else if (NILP (dst_object) && BUFFERP (coding->dst_object))
6849     {
6850       set_buffer_internal (XBUFFER (coding->dst_object));
6851       if (dst_bytes < coding->produced)
6852         {
6853           destination
6854             = (unsigned char *) xrealloc (destination, coding->produced);
6855           if (! destination)
6856             {
6857               record_conversion_result (coding,
6858                                         CODING_RESULT_INSUFFICIENT_DST);
6859               unbind_to (count, Qnil);
6860               return;
6861             }
6862           if (BEGV < GPT && GPT < BEGV + coding->produced_char)
6863             move_gap_both (BEGV, BEGV_BYTE);
6864           bcopy (BEGV_ADDR, destination, coding->produced);
6865           coding->destination = destination;
6866         }
6867     }
6868
6869   if (saved_pt >= 0)
6870     {
6871       /* This is the case of:
6872          (BUFFERP (src_object) && EQ (src_object, dst_object))
6873          As we have moved PT while replacing the original buffer
6874          contents, we must recover it now.  */
6875       set_buffer_internal (XBUFFER (src_object));
6876       if (saved_pt < from)
6877         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
6878       else if (saved_pt < from + chars)
6879         TEMP_SET_PT_BOTH (from, from_byte);
6880       else if (! NILP (current_buffer->enable_multibyte_characters))
6881         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
6882                           saved_pt_byte + (coding->produced - bytes));
6883       else
6884         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
6885                           saved_pt_byte + (coding->produced - bytes));
6886     }
6887
6888   unbind_to (count, coding->dst_object);
6889 }
6890
6891
6892 void
6893 encode_coding_object (coding, src_object, from, from_byte, to, to_byte,
6894                       dst_object)
6895      struct coding_system *coding;
6896      Lisp_Object src_object;
6897      EMACS_INT from, from_byte, to, to_byte;
6898      Lisp_Object dst_object;
6899 {
6900   int count = specpdl_ptr - specpdl;
6901   EMACS_INT chars = to - from;
6902   EMACS_INT bytes = to_byte - from_byte;
6903   Lisp_Object attrs;
6904   Lisp_Object buffer;
6905   int saved_pt = -1, saved_pt_byte;
6906   int kill_src_buffer = 0;
6907
6908   buffer = Fcurrent_buffer ();
6909
6910   coding->src_object = src_object;
6911   coding->src_chars = chars;
6912   coding->src_bytes = bytes;
6913   coding->src_multibyte = chars < bytes;
6914
6915   attrs = CODING_ID_ATTRS (coding->id);
6916
6917   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
6918     {
6919       coding->src_object = code_conversion_save (1, coding->src_multibyte);
6920       set_buffer_internal (XBUFFER (coding->src_object));
6921       if (STRINGP (src_object))
6922         insert_from_string (src_object, from, from_byte, chars, bytes, 0);
6923       else if (BUFFERP (src_object))
6924         insert_from_buffer (XBUFFER (src_object), from, chars, 0);
6925       else
6926         insert_1_both (coding->source + from, chars, bytes, 0, 0, 0);
6927
6928       if (EQ (src_object, dst_object))
6929         {
6930           set_buffer_internal (XBUFFER (src_object));
6931           saved_pt = PT, saved_pt_byte = PT_BYTE;
6932           del_range_both (from, from_byte, to, to_byte, 1);
6933           set_buffer_internal (XBUFFER (coding->src_object));
6934         }
6935
6936       {
6937         Lisp_Object args[3];
6938
6939         args[0] = CODING_ATTR_PRE_WRITE (attrs);
6940         args[1] = make_number (BEG);
6941         args[2] = make_number (Z);
6942         safe_call (3, args);
6943       }
6944       if (XBUFFER (coding->src_object) != current_buffer)
6945         kill_src_buffer = 1;
6946       coding->src_object = Fcurrent_buffer ();
6947       if (BEG != GPT)
6948         move_gap_both (BEG, BEG_BYTE);
6949       coding->src_chars = Z - BEG;
6950       coding->src_bytes = Z_BYTE - BEG_BYTE;
6951       coding->src_pos = BEG;
6952       coding->src_pos_byte = BEG_BYTE;
6953       coding->src_multibyte = Z < Z_BYTE;
6954     }
6955   else if (STRINGP (src_object))
6956     {
6957       code_conversion_save (0, 0);
6958       coding->src_pos = from;
6959       coding->src_pos_byte = from_byte;
6960     }
6961   else if (BUFFERP (src_object))
6962     {
6963       code_conversion_save (0, 0);
6964       set_buffer_internal (XBUFFER (src_object));
6965       if (EQ (src_object, dst_object))
6966         {
6967           saved_pt = PT, saved_pt_byte = PT_BYTE;
6968           coding->src_object = del_range_1 (from, to, 1, 1);
6969           coding->src_pos = 0;
6970           coding->src_pos_byte = 0;
6971         }
6972       else
6973         {
6974           if (from < GPT && to >= GPT)
6975             move_gap_both (from, from_byte);
6976           coding->src_pos = from;
6977           coding->src_pos_byte = from_byte;
6978         }
6979     }
6980   else
6981     code_conversion_save (0, 0);
6982
6983   if (BUFFERP (dst_object))
6984     {
6985       coding->dst_object = dst_object;
6986       if (EQ (src_object, dst_object))
6987         {
6988           coding->dst_pos = from;
6989           coding->dst_pos_byte = from_byte;
6990         }
6991       else
6992         {
6993           coding->dst_pos = BUF_PT (XBUFFER (dst_object));
6994           coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
6995         }
6996       coding->dst_multibyte
6997         = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
6998     }
6999   else if (EQ (dst_object, Qt))
7000     {
7001       coding->dst_object = Qnil;
7002       coding->dst_bytes = coding->src_chars;
7003       if (coding->dst_bytes == 0)
7004         coding->dst_bytes = 1;
7005       coding->destination = (unsigned char *) xmalloc (coding->dst_bytes);
7006       coding->dst_multibyte = 0;
7007     }
7008   else
7009     {
7010       coding->dst_object = Qnil;
7011       coding->dst_multibyte = 0;
7012     }
7013
7014   encode_coding (coding);
7015
7016   if (EQ (dst_object, Qt))
7017     {
7018       if (BUFFERP (coding->dst_object))
7019         coding->dst_object = Fbuffer_string ();
7020       else
7021         {
7022           coding->dst_object
7023             = make_unibyte_string ((char *) coding->destination,
7024                                    coding->produced);
7025           xfree (coding->destination);
7026         }
7027     }
7028
7029   if (saved_pt >= 0)
7030     {
7031       /* This is the case of:
7032          (BUFFERP (src_object) && EQ (src_object, dst_object))
7033          As we have moved PT while replacing the original buffer
7034          contents, we must recover it now.  */
7035       set_buffer_internal (XBUFFER (src_object));
7036       if (saved_pt < from)
7037         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
7038       else if (saved_pt < from + chars)
7039         TEMP_SET_PT_BOTH (from, from_byte);
7040       else if (! NILP (current_buffer->enable_multibyte_characters))
7041         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
7042                           saved_pt_byte + (coding->produced - bytes));
7043       else
7044         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
7045                           saved_pt_byte + (coding->produced - bytes));
7046     }
7047
7048   if (kill_src_buffer)
7049     Fkill_buffer (coding->src_object);
7050   unbind_to (count, Qnil);
7051 }
7052
7053
7054 Lisp_Object
7055 preferred_coding_system ()
7056 {
7057   int id = coding_categories[coding_priorities[0]].id;
7058
7059   return CODING_ID_NAME (id);
7060 }
7061
7062 \f
7063 #ifdef emacs
/*** 11. Emacs Lisp library functions ***/
7065
7066 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
7067        doc: /* Return t if OBJECT is nil or a coding-system.
7068 See the documentation of `define-coding-system' for information
7069 about coding-system objects.  */)
7070      (obj)
7071      Lisp_Object obj;
7072 {
7073   if (NILP (obj)
7074       || CODING_SYSTEM_ID (obj) >= 0)
7075     return Qt;
7076   if (! SYMBOLP (obj)
7077       || NILP (Fget (obj, Qcoding_system_define_form)))
7078     return Qnil;
7079   return Qt;
7080 }
7081
7082 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
7083        Sread_non_nil_coding_system, 1, 1, 0,
7084        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.  */)
7085      (prompt)
7086      Lisp_Object prompt;
7087 {
7088   Lisp_Object val;
7089   do
7090     {
7091       val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
7092                               Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
7093     }
7094   while (SCHARS (val) == 0);
7095   return (Fintern (val, Qnil));
7096 }
7097
7098 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
7099        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
7100 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.  */)
7101      (prompt, default_coding_system)
7102      Lisp_Object prompt, default_coding_system;
7103 {
7104   Lisp_Object val;
7105   if (SYMBOLP (default_coding_system))
7106     XSETSTRING (default_coding_system, XPNTR (SYMBOL_NAME (default_coding_system)));
7107   val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
7108                           Qt, Qnil, Qcoding_system_history,
7109                           default_coding_system, Qnil);
7110   return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
7111 }
7112
7113 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
7114        1, 1, 0,
7115        doc: /* Check validity of CODING-SYSTEM.
7116 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
7117 It is valid if it is nil or a symbol defined as a coding system by the
7118 function `define-coding-system'.  */)
7119   (coding_system)
7120      Lisp_Object coding_system;
7121 {
7122   Lisp_Object define_form;
7123
7124   define_form = Fget (coding_system, Qcoding_system_define_form);
7125   if (! NILP (define_form))
7126     {
7127       Fput (coding_system, Qcoding_system_define_form, Qnil);
7128       safe_eval (define_form);
7129     }
7130   if (!NILP (Fcoding_system_p (coding_system)))
7131     return coding_system;
7132   while (1)
7133     Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
7134 }
7135
7136 \f
7137 /* Detect how the bytes at SRC of length SRC_BYTES are encoded.  If
7138    HIGHEST is nonzero, return the coding system of the highest
   priority among the detected coding systems.  Otherwise return a
7140    list of detected coding systems sorted by their priorities.  If
7141    MULTIBYTEP is nonzero, it is assumed that the bytes are in correct
7142    multibyte form but contains only ASCII and eight-bit chars.
7143    Otherwise, the bytes are raw bytes.
7144
7145    CODING-SYSTEM controls the detection as below:
7146
7147    If it is nil, detect both text-format and eol-format.  If the
7148    text-format part of CODING-SYSTEM is already specified
7149    (e.g. `iso-latin-1'), detect only eol-format.  If the eol-format
7150    part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
7151    detect only text-format.  */
7152
7153 Lisp_Object
7154 detect_coding_system (src, src_chars, src_bytes, highest, multibytep,
7155                       coding_system)
7156      const unsigned char *src;
7157      int src_chars, src_bytes, highest;
7158      int multibytep;
7159      Lisp_Object coding_system;
7160 {
7161   const unsigned char *src_end = src + src_bytes;
7162   Lisp_Object attrs, eol_type;
7163   Lisp_Object val;
7164   struct coding_system coding;
7165   int id;
7166   struct coding_detection_info detect_info;
7167   enum coding_category base_category;
7168
7169   if (NILP (coding_system))
7170     coding_system = Qundecided;
7171   setup_coding_system (coding_system, &coding);
7172   attrs = CODING_ID_ATTRS (coding.id);
7173   eol_type = CODING_ID_EOL_TYPE (coding.id);
7174   coding_system = CODING_ATTR_BASE_NAME (attrs);
7175
7176   coding.source = src;
7177   coding.src_chars = src_chars;
7178   coding.src_bytes = src_bytes;
7179   coding.src_multibyte = multibytep;
7180   coding.consumed = 0;
7181   coding.mode |= CODING_MODE_LAST_BLOCK;
7182
7183   detect_info.checked = detect_info.found = detect_info.rejected = 0;
7184
7185   /* At first, detect text-format if necessary.  */
7186   base_category = XINT (CODING_ATTR_CATEGORY (attrs));
7187   if (base_category == coding_category_undecided)
7188     {
7189       enum coding_category category;
7190       struct coding_system *this;
7191       int c, i;
7192
7193       /* Skip all ASCII bytes except for a few ISO2022 controls.  */
7194       for (i = 0; src < src_end; i++, src++)
7195         {
7196           c = *src;
7197           if (c & 0x80)
7198             break;
7199           if (c < 0x20
7200               && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
7201               && inhibit_iso_escape_detection)
7202             {
7203               coding.head_ascii = src - coding.source;
7204               if (detect_coding_iso_2022 (&coding, &detect_info))
7205                 {
7206                   /* We have scanned the whole data.  */
7207                   if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
7208                     /* We didn't find an 8-bit code.  */
7209                     src = src_end;
7210                   break;
7211                 }
7212             }
7213         }
7214       coding.head_ascii = src - coding.source;
7215
7216       if (src < src_end
7217           || detect_info.found)
7218         {
7219           if (src == src_end)
7220             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
7221             for (i = 0; i < coding_category_raw_text; i++)
7222               {
7223                 category = coding_priorities[i];
7224                 if (detect_info.found & (1 << category))
7225                   break;
7226               }
7227           else
7228             for (i = 0; i < coding_category_raw_text; i++)
7229               {
7230                 category = coding_priorities[i];
7231                 this = coding_categories + category;
7232
7233                 if (this->id < 0)
7234                   {
7235                     /* No coding system of this category is defined.  */
7236                     detect_info.rejected |= (1 << category);
7237                   }
7238                 else if (category >= coding_category_raw_text)
7239                   continue;
7240                 else if (detect_info.checked & (1 << category))
7241                   {
7242                     if (highest
7243                         && (detect_info.found & (1 << category)))
7244                       break;
7245                   }
7246                 else
7247                   {
7248                     if ((*(this->detector)) (&coding, &detect_info)
7249                         && highest
7250                         && (detect_info.found & (1 << category)))
7251                       {
7252                         if (category == coding_category_utf_16_auto)
7253                           {
7254                             if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
7255                               category = coding_category_utf_16_le;
7256                             else
7257                               category = coding_category_utf_16_be;
7258                           }
7259                         break;
7260                       }
7261                   }
7262               }
7263         }
7264
7265       if (detect_info.rejected == CATEGORY_MASK_ANY)
7266         {
7267           detect_info.found = CATEGORY_MASK_RAW_TEXT;
7268           id = coding_categories[coding_category_raw_text].id;
7269           val = Fcons (make_number (id), Qnil);
7270         }
7271       else if (! detect_info.rejected && ! detect_info.found)
7272         {
7273           detect_info.found = CATEGORY_MASK_ANY;
7274           id = coding_categories[coding_category_undecided].id;
7275           val = Fcons (make_number (id), Qnil);
7276         }
7277       else if (highest)
7278         {
7279           if (detect_info.found)
7280             {
7281               detect_info.found = 1 << category;
7282               val = Fcons (make_number (this->id), Qnil);
7283             }
7284           else
7285             for (i = 0; i < coding_category_raw_text; i++)
7286               if (! (detect_info.rejected & (1 << coding_priorities[i])))
7287                 {
7288                   detect_info.found = 1 << coding_priorities[i];
7289                   id = coding_categories[coding_priorities[i]].id;
7290                   val = Fcons (make_number (id), Qnil);
7291                   break;
7292                 }
7293         }
7294       else
7295         {
7296           int mask = detect_info.rejected | detect_info.found;
7297           int found = 0;
7298           val = Qnil;
7299
7300           for (i = coding_category_raw_text - 1; i >= 0; i--)
7301             {
7302               category = coding_priorities[i];
7303               if (! (mask & (1 << category)))
7304                 {
7305                   found |= 1 << category;
7306                   id = coding_categories[category].id;
7307                   val = Fcons (make_number (id), val);
7308                 }
7309             }
7310           for (i = coding_category_raw_text - 1; i >= 0; i--)
7311             {
7312               category = coding_priorities[i];
7313               if (detect_info.found & (1 << category))
7314                 {
7315                   id = coding_categories[category].id;
7316                   val = Fcons (make_number (id), val);
7317                 }
7318             }
7319           detect_info.found |= found;
7320         }
7321     }
7322   else if (base_category == coding_category_utf_16_auto)
7323     {
7324       if (detect_coding_utf_16 (&coding, &detect_info))
7325         {
7326           struct coding_system *this;
7327
7328           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
7329             this = coding_categories + coding_category_utf_16_le;
7330           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
7331             this = coding_categories + coding_category_utf_16_be;
7332           else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
7333             this = coding_categories + coding_category_utf_16_be_nosig;
7334           else
7335             this = coding_categories + coding_category_utf_16_le_nosig;
7336           val = Fcons (make_number (this->id), Qnil);
7337         }
7338     }
7339   else
7340     {
7341       detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
7342       val = Fcons (make_number (coding.id), Qnil);
7343     }
7344
7345   /* Then, detect eol-format if necessary.  */
7346   {
7347     int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol;
7348     Lisp_Object tail;
7349
7350     if (VECTORP (eol_type))
7351       {
7352         if (detect_info.found & ~CATEGORY_MASK_UTF_16)
7353           normal_eol = detect_eol (coding.source, src_bytes,
7354                                    coding_category_raw_text);
7355         if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
7356                                  | CATEGORY_MASK_UTF_16_BE_NOSIG))
7357           utf_16_be_eol = detect_eol (coding.source, src_bytes,
7358                                       coding_category_utf_16_be);
7359         if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
7360                                  | CATEGORY_MASK_UTF_16_LE_NOSIG))
7361           utf_16_le_eol = detect_eol (coding.source, src_bytes,
7362                                       coding_category_utf_16_le);
7363       }
7364     else
7365       {
7366         if (EQ (eol_type, Qunix))
7367           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
7368         else if (EQ (eol_type, Qdos))
7369           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
7370         else
7371           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
7372       }
7373
7374     for (tail = val; CONSP (tail); tail = XCDR (tail))
7375       {
7376         enum coding_category category;
7377         int this_eol;
7378
7379         id = XINT (XCAR (tail));
7380         attrs = CODING_ID_ATTRS (id);
7381         category = XINT (CODING_ATTR_CATEGORY (attrs));
7382         eol_type = CODING_ID_EOL_TYPE (id);
7383         if (VECTORP (eol_type))
7384           {
7385             if (category == coding_category_utf_16_be
7386                 || category == coding_category_utf_16_be_nosig)
7387               this_eol = utf_16_be_eol;
7388             else if (category == coding_category_utf_16_le
7389                      || category == coding_category_utf_16_le_nosig)
7390               this_eol = utf_16_le_eol;
7391             else
7392               this_eol = normal_eol;
7393
7394             if (this_eol == EOL_SEEN_LF)
7395               XSETCAR (tail, AREF (eol_type, 0));
7396             else if (this_eol == EOL_SEEN_CRLF)
7397               XSETCAR (tail, AREF (eol_type, 1));
7398             else if (this_eol == EOL_SEEN_CR)
7399               XSETCAR (tail, AREF (eol_type, 2));
7400             else
7401               XSETCAR (tail, CODING_ID_NAME (id));
7402           }
7403         else
7404           XSETCAR (tail, CODING_ID_NAME (id));
7405       }
7406   }
7407
7408   return (highest ? XCAR (val) : val);
7409 }
7410
7411
7412 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
7413        2, 3, 0,
7414        doc: /* Detect coding system of the text in the region between START and END.
7415 Return a list of possible coding systems ordered by priority.
7416
7417 If only ASCII characters are found, it returns a list of single element
7418 `undecided' or its subsidiary coding system according to a detected
7419 end-of-line format.
7420
7421 If optional argument HIGHEST is non-nil, return the coding system of
7422 highest priority.  */)
7423      (start, end, highest)
7424      Lisp_Object start, end, highest;
7425 {
7426   int from, to;
7427   int from_byte, to_byte;
7428
7429   CHECK_NUMBER_COERCE_MARKER (start);
7430   CHECK_NUMBER_COERCE_MARKER (end);
7431
7432   validate_region (&start, &end);
7433   from = XINT (start), to = XINT (end);
7434   from_byte = CHAR_TO_BYTE (from);
7435   to_byte = CHAR_TO_BYTE (to);
7436
7437   if (from < GPT && to >= GPT)
7438     move_gap_both (to, to_byte);
7439
7440   return detect_coding_system (BYTE_POS_ADDR (from_byte),
7441                                to - from, to_byte - from_byte,
7442                                !NILP (highest),
7443                                !NILP (current_buffer
7444                                       ->enable_multibyte_characters),
7445                                Qnil);
7446 }
7447
7448 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
7449        1, 2, 0,
7450        doc: /* Detect coding system of the text in STRING.
7451 Return a list of possible coding systems ordered by priority.
7452
7453 If only ASCII characters are found, it returns a list of single element
7454 `undecided' or its subsidiary coding system according to a detected
7455 end-of-line format.
7456
7457 If optional argument HIGHEST is non-nil, return the coding system of
7458 highest priority.  */)
7459      (string, highest)
7460      Lisp_Object string, highest;
7461 {
7462   CHECK_STRING (string);
7463
7464   return detect_coding_system (SDATA (string),
7465                                SCHARS (string), SBYTES (string),
7466                                !NILP (highest), STRING_MULTIBYTE (string),
7467                                Qnil);
7468 }
7469
7470
7471 static INLINE int
7472 char_encodable_p (c, attrs)
7473      int c;
7474      Lisp_Object attrs;
7475 {
7476   Lisp_Object tail;
7477   struct charset *charset;
7478   Lisp_Object translation_table;
7479
7480   translation_table = CODING_ATTR_TRANS_TBL (attrs);
7481   if (! NILP (translation_table))
7482     c = translate_char (translation_table, c);
7483   for (tail = CODING_ATTR_CHARSET_LIST (attrs);
7484        CONSP (tail); tail = XCDR (tail))
7485     {
7486       charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
7487       if (CHAR_CHARSET_P (c, charset))
7488         break;
7489     }
7490   return (! NILP (tail));
7491 }
7492
7493
7494 /* Return a list of coding systems that safely encode the text between
7495    START and END.  If EXCLUDE is non-nil, it is a list of coding
7496    systems not to check.  The returned list doesn't contain any such
7497    coding systems.  In any case, if the text contains only ASCII or is
7498    unibyte, return t.  */
7499
7500 DEFUN ("find-coding-systems-region-internal",
7501        Ffind_coding_systems_region_internal,
7502        Sfind_coding_systems_region_internal, 2, 3, 0,
7503        doc: /* Internal use only.  */)
7504      (start, end, exclude)
7505      Lisp_Object start, end, exclude;
7506 {
7507   Lisp_Object coding_attrs_list, safe_codings;
7508   EMACS_INT start_byte, end_byte;
7509   const unsigned char *p, *pbeg, *pend;
7510   int c;
7511   Lisp_Object tail, elt;
7512
7513   if (STRINGP (start))
7514     {
7515       if (!STRING_MULTIBYTE (start)
7516           || SCHARS (start) == SBYTES (start))
7517         return Qt;
7518       start_byte = 0;
7519       end_byte = SBYTES (start);
7520     }
7521   else
7522     {
7523       CHECK_NUMBER_COERCE_MARKER (start);
7524       CHECK_NUMBER_COERCE_MARKER (end);
7525       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
7526         args_out_of_range (start, end);
7527       if (NILP (current_buffer->enable_multibyte_characters))
7528         return Qt;
7529       start_byte = CHAR_TO_BYTE (XINT (start));
7530       end_byte = CHAR_TO_BYTE (XINT (end));
7531       if (XINT (end) - XINT (start) == end_byte - start_byte)
7532         return Qt;
7533
7534       if (XINT (start) < GPT && XINT (end) > GPT)
7535         {
7536           if ((GPT - XINT (start)) < (XINT (end) - GPT))
7537             move_gap_both (XINT (start), start_byte);
7538           else
7539             move_gap_both (XINT (end), end_byte);
7540         }
7541     }
7542
7543   coding_attrs_list = Qnil;
7544   for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
7545     if (NILP (exclude)
7546         || NILP (Fmemq (XCAR (tail), exclude)))
7547       {
7548         Lisp_Object attrs;
7549
7550         attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
7551         if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs))
7552             && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
7553           {
7554             ASET (attrs, coding_attr_trans_tbl,
7555                   get_translation_table (attrs, 1, NULL));
7556             coding_attrs_list = Fcons (attrs, coding_attrs_list);
7557           }
7558       }
7559
7560   if (STRINGP (start))
7561     p = pbeg = SDATA (start);
7562   else
7563     p = pbeg = BYTE_POS_ADDR (start_byte);
7564   pend = p + (end_byte - start_byte);
7565
7566   while (p < pend && ASCII_BYTE_P (*p)) p++;
7567   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
7568
7569   while (p < pend)
7570     {
7571       if (ASCII_BYTE_P (*p))
7572         p++;
7573       else
7574         {
7575           c = STRING_CHAR_ADVANCE (p);
7576
7577           charset_map_loaded = 0;
7578           for (tail = coding_attrs_list; CONSP (tail);)
7579             {
7580               elt = XCAR (tail);
7581               if (NILP (elt))
7582                 tail = XCDR (tail);
7583               else if (char_encodable_p (c, elt))
7584                 tail = XCDR (tail);
7585               else if (CONSP (XCDR (tail)))
7586                 {
7587                   XSETCAR (tail, XCAR (XCDR (tail)));
7588                   XSETCDR (tail, XCDR (XCDR (tail)));
7589                 }
7590               else
7591                 {
7592                   XSETCAR (tail, Qnil);
7593                   tail = XCDR (tail);
7594                 }
7595             }
7596           if (charset_map_loaded)
7597             {
7598               EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
7599
7600               if (STRINGP (start))
7601                 pbeg = SDATA (start);
7602               else
7603                 pbeg = BYTE_POS_ADDR (start_byte);
7604               p = pbeg + p_offset;
7605               pend = pbeg + pend_offset;
7606             }
7607         }
7608     }
7609
7610   safe_codings = list2 (Qraw_text, Qno_conversion);
7611   for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
7612     if (! NILP (XCAR (tail)))
7613       safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
7614
7615   return safe_codings;
7616 }
7617
7618
7619 DEFUN ("unencodable-char-position", Funencodable_char_position,
7620        Sunencodable_char_position, 3, 5, 0,
7621        doc: /*
7622 Return position of first un-encodable character in a region.
7623 START and END specfiy the region and CODING-SYSTEM specifies the
7624 encoding to check.  Return nil if CODING-SYSTEM does encode the region.
7625
7626 If optional 4th argument COUNT is non-nil, it specifies at most how
7627 many un-encodable characters to search.  In this case, the value is a
7628 list of positions.
7629
7630 If optional 5th argument STRING is non-nil, it is a string to search
7631 for un-encodable characters.  In that case, START and END are indexes
7632 to the string.  */)
7633      (start, end, coding_system, count, string)
7634      Lisp_Object start, end, coding_system, count, string;
7635 {
7636   int n;
7637   struct coding_system coding;
7638   Lisp_Object attrs, charset_list, translation_table;
7639   Lisp_Object positions;
7640   int from, to;
7641   const unsigned char *p, *stop, *pend;
7642   int ascii_compatible;
7643
7644   setup_coding_system (Fcheck_coding_system (coding_system), &coding);
7645   attrs = CODING_ID_ATTRS (coding.id);
7646   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
7647     return Qnil;
7648   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
7649   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
7650   translation_table = get_translation_table (attrs, 1, NULL);
7651
7652   if (NILP (string))
7653     {
7654       validate_region (&start, &end);
7655       from = XINT (start);
7656       to = XINT (end);
7657       if (NILP (current_buffer->enable_multibyte_characters)
7658           || (ascii_compatible
7659               && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
7660         return Qnil;
7661       p = CHAR_POS_ADDR (from);
7662       pend = CHAR_POS_ADDR (to);
7663       if (from < GPT && to >= GPT)
7664         stop = GPT_ADDR;
7665       else
7666         stop = pend;
7667     }
7668   else
7669     {
7670       CHECK_STRING (string);
7671       CHECK_NATNUM (start);
7672       CHECK_NATNUM (end);
7673       from = XINT (start);
7674       to = XINT (end);
7675       if (from > to
7676           || to > SCHARS (string))
7677         args_out_of_range_3 (string, start, end);
7678       if (! STRING_MULTIBYTE (string))
7679         return Qnil;
7680       p = SDATA (string) + string_char_to_byte (string, from);
7681       stop = pend = SDATA (string) + string_char_to_byte (string, to);
7682       if (ascii_compatible && (to - from) == (pend - p))
7683         return Qnil;
7684     }
7685
7686   if (NILP (count))
7687     n = 1;
7688   else
7689     {
7690       CHECK_NATNUM (count);
7691       n = XINT (count);
7692     }
7693
7694   positions = Qnil;
7695   while (1)
7696     {
7697       int c;
7698
7699       if (ascii_compatible)
7700         while (p < stop && ASCII_BYTE_P (*p))
7701           p++, from++;
7702       if (p >= stop)
7703         {
7704           if (p >= pend)
7705             break;
7706           stop = pend;
7707           p = GAP_END_ADDR;
7708         }
7709
7710       c = STRING_CHAR_ADVANCE (p);
7711       if (! (ASCII_CHAR_P (c) && ascii_compatible)
7712           && ! char_charset (translate_char (translation_table, c),
7713                              charset_list, NULL))
7714         {
7715           positions = Fcons (make_number (from), positions);
7716           n--;
7717           if (n == 0)
7718             break;
7719         }
7720
7721       from++;
7722     }
7723
7724   return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
7725 }
7726
7727
7728 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
7729        Scheck_coding_systems_region, 3, 3, 0,
7730        doc: /* Check if the region is encodable by coding systems.
7731
7732 START and END are buffer positions specifying the region.
7733 CODING-SYSTEM-LIST is a list of coding systems to check.
7734
7735 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
7736 CODING-SYSTEM is a member of CODING-SYSTEM-LIst and can't encode the
7737 whole region, POS0, POS1, ... are buffer positions where non-encodable
7738 characters are found.
7739
7740 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
7741 value is nil.
7742
7743 START may be a string.  In that case, check if the string is
7744 encodable, and the value contains indices to the string instead of
7745 buffer positions.  END is ignored.  */)
7746      (start, end, coding_system_list)
7747      Lisp_Object start, end, coding_system_list;
7748 {
7749   Lisp_Object list;
7750   EMACS_INT start_byte, end_byte;
7751   int pos;
7752   const unsigned char *p, *pbeg, *pend;
7753   int c;
7754   Lisp_Object tail, elt, attrs;
7755
7756   if (STRINGP (start))
7757     {
7758       if (!STRING_MULTIBYTE (start)
7759           && SCHARS (start) != SBYTES (start))
7760         return Qnil;
7761       start_byte = 0;
7762       end_byte = SBYTES (start);
7763       pos = 0;
7764     }
7765   else
7766     {
7767       CHECK_NUMBER_COERCE_MARKER (start);
7768       CHECK_NUMBER_COERCE_MARKER (end);
7769       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
7770         args_out_of_range (start, end);
7771       if (NILP (current_buffer->enable_multibyte_characters))
7772         return Qnil;
7773       start_byte = CHAR_TO_BYTE (XINT (start));
7774       end_byte = CHAR_TO_BYTE (XINT (end));
7775       if (XINT (end) - XINT (start) == end_byte - start_byte)
7776         return Qt;
7777
7778       if (XINT (start) < GPT && XINT (end) > GPT)
7779         {
7780           if ((GPT - XINT (start)) < (XINT (end) - GPT))
7781             move_gap_both (XINT (start), start_byte);
7782           else
7783             move_gap_both (XINT (end), end_byte);
7784         }
7785       pos = XINT (start);
7786     }
7787
7788   list = Qnil;
7789   for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
7790     {
7791       elt = XCAR (tail);
7792       attrs = AREF (CODING_SYSTEM_SPEC (elt), 0);
7793       ASET (attrs, coding_attr_trans_tbl,
7794             get_translation_table (attrs, 1, NULL));
7795       list = Fcons (Fcons (elt, Fcons (attrs, Qnil)), list);
7796     }
7797
7798   if (STRINGP (start))
7799     p = pbeg = SDATA (start);
7800   else
7801     p = pbeg = BYTE_POS_ADDR (start_byte);
7802   pend = p + (end_byte - start_byte);
7803
7804   while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
7805   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
7806
7807   while (p < pend)
7808     {
7809       if (ASCII_BYTE_P (*p))
7810         p++;
7811       else
7812         {
7813           c = STRING_CHAR_ADVANCE (p);
7814
7815           charset_map_loaded = 0;
7816           for (tail = list; CONSP (tail); tail = XCDR (tail))
7817             {
7818               elt = XCDR (XCAR (tail));
7819               if (! char_encodable_p (c, XCAR (elt)))
7820                 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
7821             }
7822           if (charset_map_loaded)
7823             {
7824               EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
7825
7826               if (STRINGP (start))
7827                 pbeg = SDATA (start);
7828               else
7829                 pbeg = BYTE_POS_ADDR (start_byte);
7830               p = pbeg + p_offset;
7831               pend = pbeg + pend_offset;
7832             }
7833         }
7834       pos++;
7835     }
7836
7837   tail = list;
7838   list = Qnil;
7839   for (; CONSP (tail); tail = XCDR (tail))
7840     {
7841       elt = XCAR (tail);
7842       if (CONSP (XCDR (XCDR (elt))))
7843         list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
7844                       list);
7845     }
7846
7847   return list;
7848 }
7849
7850
7851 Lisp_Object
7852 code_convert_region (start, end, coding_system, dst_object, encodep, norecord)
7853      Lisp_Object start, end, coding_system, dst_object;
7854      int encodep, norecord;
7855 {
7856   struct coding_system coding;
7857   EMACS_INT from, from_byte, to, to_byte;
7858   Lisp_Object src_object;
7859
7860   CHECK_NUMBER_COERCE_MARKER (start);
7861   CHECK_NUMBER_COERCE_MARKER (end);
7862   if (NILP (coding_system))
7863     coding_system = Qno_conversion;
7864   else
7865     CHECK_CODING_SYSTEM (coding_system);
7866   src_object = Fcurrent_buffer ();
7867   if (NILP (dst_object))
7868     dst_object = src_object;
7869   else if (! EQ (dst_object, Qt))
7870     CHECK_BUFFER (dst_object);
7871
7872   validate_region (&start, &end);
7873   from = XFASTINT (start);
7874   from_byte = CHAR_TO_BYTE (from);
7875   to = XFASTINT (end);
7876   to_byte = CHAR_TO_BYTE (to);
7877
7878   setup_coding_system (coding_system, &coding);
7879   coding.mode |= CODING_MODE_LAST_BLOCK;
7880
7881   if (encodep)
7882     encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
7883                           dst_object);
7884   else
7885     decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
7886                           dst_object);
7887   if (! norecord)
7888     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
7889
7890   return (BUFFERP (dst_object)
7891           ? make_number (coding.produced_char)
7892           : coding.dst_object);
7893 }
7894
7895
7896 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
7897        3, 4, "r\nzCoding system: ",
7898        doc: /* Decode the current region from the specified coding system.
7899 When called from a program, takes four arguments:
7900         START, END, CODING-SYSTEM, and DESTINATION.
7901 START and END are buffer positions.
7902
7903 Optional 4th arguments DESTINATION specifies where the decoded text goes.
7904 If nil, the region between START and END is replace by the decoded text.
7905 If buffer, the decoded text is inserted in the buffer.
7906 If t, the decoded text is returned.
7907
7908 This function sets `last-coding-system-used' to the precise coding system
7909 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7910 not fully specified.)
7911 It returns the length of the decoded text.  */)
7912      (start, end, coding_system, destination)
7913      Lisp_Object start, end, coding_system, destination;
7914 {
7915   return code_convert_region (start, end, coding_system, destination, 0, 0);
7916 }
7917
7918 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
7919        3, 4, "r\nzCoding system: ",
7920        doc: /* Encode the current region by specified coding system.
7921 When called from a program, takes three arguments:
7922 START, END, and CODING-SYSTEM.  START and END are buffer positions.
7923
7924 Optional 4th arguments DESTINATION specifies where the encoded text goes.
7925 If nil, the region between START and END is replace by the encoded text.
7926 If buffer, the encoded text is inserted in the buffer.
7927 If t, the encoded text is returned.
7928
7929 This function sets `last-coding-system-used' to the precise coding system
7930 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7931 not fully specified.)
7932 It returns the length of the encoded text.  */)
7933   (start, end, coding_system, destination)
7934      Lisp_Object start, end, coding_system, destination;
7935 {
7936   return code_convert_region (start, end, coding_system, destination, 1, 0);
7937 }
7938
7939 Lisp_Object
7940 code_convert_string (string, coding_system, dst_object,
7941                      encodep, nocopy, norecord)
7942      Lisp_Object string, coding_system, dst_object;
7943      int encodep, nocopy, norecord;
7944 {
7945   struct coding_system coding;
7946   EMACS_INT chars, bytes;
7947
7948   CHECK_STRING (string);
7949   if (NILP (coding_system))
7950     {
7951       if (! norecord)
7952         Vlast_coding_system_used = Qno_conversion;
7953       if (NILP (dst_object))
7954         return (nocopy ? Fcopy_sequence (string) : string);
7955     }
7956
7957   if (NILP (coding_system))
7958     coding_system = Qno_conversion;
7959   else
7960     CHECK_CODING_SYSTEM (coding_system);
7961   if (NILP (dst_object))
7962     dst_object = Qt;
7963   else if (! EQ (dst_object, Qt))
7964     CHECK_BUFFER (dst_object);
7965
7966   setup_coding_system (coding_system, &coding);
7967   coding.mode |= CODING_MODE_LAST_BLOCK;
7968   chars = SCHARS (string);
7969   bytes = SBYTES (string);
7970   if (encodep)
7971     encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
7972   else
7973     decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
7974   if (! norecord)
7975     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
7976
7977   return (BUFFERP (dst_object)
7978           ? make_number (coding.produced_char)
7979           : coding.dst_object);
7980 }
7981
7982
7983 /* Encode or decode STRING according to CODING_SYSTEM.
7984    Do not set Vlast_coding_system_used.
7985
7986    This function is called only from macros DECODE_FILE and
7987    ENCODE_FILE, thus we ignore character composition.  */
7988
7989 Lisp_Object
7990 code_convert_string_norecord (string, coding_system, encodep)
7991      Lisp_Object string, coding_system;
7992      int encodep;
7993 {
7994   return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
7995 }
7996
7997
7998 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
7999        2, 4, 0,
8000        doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
8001
8002 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
8003 if the decoding operation is trivial.
8004
8005 Optional fourth arg BUFFER non-nil meant that the decoded text is
8006 inserted in BUFFER instead of returned as a string.  In this case,
8007 the return value is BUFFER.
8008
8009 This function sets `last-coding-system-used' to the precise coding system
8010 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
8011 not fully specified.  */)
8012   (string, coding_system, nocopy, buffer)
8013      Lisp_Object string, coding_system, nocopy, buffer;
8014 {
8015   return code_convert_string (string, coding_system, buffer,
8016                               0, ! NILP (nocopy), 0);
8017 }
8018
8019 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
8020        2, 4, 0,
8021        doc: /* Encode STRING to CODING-SYSTEM, and return the result.
8022
8023 Optional third arg NOCOPY non-nil means it is OK to return STRING
8024 itself if the encoding operation is trivial.
8025
8026 Optional fourth arg BUFFER non-nil meant that the encoded text is
8027 inserted in BUFFER instead of returned as a string.  In this case,
8028 the return value is BUFFER.
8029
8030 This function sets `last-coding-system-used' to the precise coding system
8031 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
8032 not fully specified.)  */)
8033      (string, coding_system, nocopy, buffer)
8034      Lisp_Object string, coding_system, nocopy, buffer;
8035 {
8036   return code_convert_string (string, coding_system, buffer,
8037                               1, ! NILP (nocopy), 1);
8038 }
8039
8040 \f
8041 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
8042        doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
8043 Return the corresponding character.  */)
8044      (code)
8045      Lisp_Object code;
8046 {
8047   Lisp_Object spec, attrs, val;
8048   struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
8049   int c;
8050
8051   CHECK_NATNUM (code);
8052   c = XFASTINT (code);
8053   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
8054   attrs = AREF (spec, 0);
8055
8056   if (ASCII_BYTE_P (c)
8057       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
8058     return code;
8059
8060   val = CODING_ATTR_CHARSET_LIST (attrs);
8061   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
8062   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
8063   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
8064
8065   if (c <= 0x7F)
8066     charset = charset_roman;
8067   else if (c >= 0xA0 && c < 0xDF)
8068     {
8069       charset = charset_kana;
8070       c -= 0x80;
8071     }
8072   else
8073     {
8074       int s1 = c >> 8, s2 = c & 0xFF;
8075
8076       if (s1 < 0x81 || (s1 > 0x9F && s1 < 0xE0) || s1 > 0xEF
8077           || s2 < 0x40 || s2 == 0x7F || s2 > 0xFC)
8078         error ("Invalid code: %d", code);
8079       SJIS_TO_JIS (c);
8080       charset = charset_kanji;
8081     }
8082   c = DECODE_CHAR (charset, c);
8083   if (c < 0)
8084     error ("Invalid code: %d", code);
8085   return make_number (c);
8086 }
8087
8088
8089 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
8090        doc: /* Encode a Japanese character CHAR to shift_jis encoding.
8091 Return the corresponding code in SJIS.  */)
8092      (ch)
8093     Lisp_Object ch;
8094 {
8095   Lisp_Object spec, attrs, charset_list;
8096   int c;
8097   struct charset *charset;
8098   unsigned code;
8099
8100   CHECK_CHARACTER (ch);
8101   c = XFASTINT (ch);
8102   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
8103   attrs = AREF (spec, 0);
8104
8105   if (ASCII_CHAR_P (c)
8106       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
8107     return ch;
8108
8109   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
8110   charset = char_charset (c, charset_list, &code);
8111   if (code == CHARSET_INVALID_CODE (charset))
8112     error ("Can't encode by shift_jis encoding: %d", c);
8113   JIS_TO_SJIS (code);
8114
8115   return make_number (code);
8116 }
8117
8118 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
8119        doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
8120 Return the corresponding character.  */)
8121      (code)
8122      Lisp_Object code;
8123 {
8124   Lisp_Object spec, attrs, val;
8125   struct charset *charset_roman, *charset_big5, *charset;
8126   int c;
8127
8128   CHECK_NATNUM (code);
8129   c = XFASTINT (code);
8130   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
8131   attrs = AREF (spec, 0);
8132
8133   if (ASCII_BYTE_P (c)
8134       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
8135     return code;
8136
8137   val = CODING_ATTR_CHARSET_LIST (attrs);
8138   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
8139   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
8140
8141   if (c <= 0x7F)
8142     charset = charset_roman;
8143   else
8144     {
8145       int b1 = c >> 8, b2 = c & 0x7F;
8146       if (b1 < 0xA1 || b1 > 0xFE
8147           || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
8148         error ("Invalid code: %d", code);
8149       charset = charset_big5;
8150     }
8151   c = DECODE_CHAR (charset, (unsigned )c);
8152   if (c < 0)
8153     error ("Invalid code: %d", code);
8154   return make_number (c);
8155 }
8156
8157 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
8158        doc: /* Encode the Big5 character CHAR to BIG5 coding system.
8159 Return the corresponding character code in Big5.  */)
8160      (ch)
8161      Lisp_Object ch;
8162 {
8163   Lisp_Object spec, attrs, charset_list;
8164   struct charset *charset;
8165   int c;
8166   unsigned code;
8167
8168   CHECK_CHARACTER (ch);
8169   c = XFASTINT (ch);
8170   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
8171   attrs = AREF (spec, 0);
8172   if (ASCII_CHAR_P (c)
8173       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
8174     return ch;
8175
8176   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
8177   charset = char_charset (c, charset_list, &code);
8178   if (code == CHARSET_INVALID_CODE (charset))
8179     error ("Can't encode by Big5 encoding: %d", c);
8180
8181   return make_number (code);
8182 }
8183
8184 \f
8185 DEFUN ("set-terminal-coding-system-internal",
8186        Fset_terminal_coding_system_internal,
8187        Sset_terminal_coding_system_internal, 1, 1, 0,
8188        doc: /* Internal use only.  */)
8189      (coding_system)
8190      Lisp_Object coding_system;
8191 {
8192   CHECK_SYMBOL (coding_system);
8193   setup_coding_system (Fcheck_coding_system (coding_system),
8194                         &terminal_coding);
8195
8196   /* We had better not send unsafe characters to terminal.  */
8197   terminal_coding.mode |= CODING_MODE_SAFE_ENCODING;
8198   /* Characer composition should be disabled.  */
8199   terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
8200   terminal_coding.src_multibyte = 1;
8201   terminal_coding.dst_multibyte = 0;
8202   return Qnil;
8203 }
8204
8205 DEFUN ("set-safe-terminal-coding-system-internal",
8206        Fset_safe_terminal_coding_system_internal,
8207        Sset_safe_terminal_coding_system_internal, 1, 1, 0,
8208        doc: /* Internal use only.  */)
8209      (coding_system)
8210      Lisp_Object coding_system;
8211 {
8212   CHECK_SYMBOL (coding_system);
8213   setup_coding_system (Fcheck_coding_system (coding_system),
8214                        &safe_terminal_coding);
8215   /* Characer composition should be disabled.  */
8216   safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
8217   safe_terminal_coding.src_multibyte = 1;
8218   safe_terminal_coding.dst_multibyte = 0;
8219   return Qnil;
8220 }
8221
8222 DEFUN ("terminal-coding-system",
8223        Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
8224        doc: /* Return coding system specified for terminal output.  */)
8225      ()
8226 {
8227   Lisp_Object coding_system;
8228
8229   coding_system = CODING_ID_NAME (terminal_coding.id);
8230   /* For backward compatibility, return nil if it is `undecided'. */
8231   return (coding_system != Qundecided ? coding_system : Qnil);
8232 }
8233
8234 DEFUN ("set-keyboard-coding-system-internal",
8235        Fset_keyboard_coding_system_internal,
8236        Sset_keyboard_coding_system_internal, 1, 1, 0,
8237        doc: /* Internal use only.  */)
8238      (coding_system)
8239      Lisp_Object coding_system;
8240 {
8241   CHECK_SYMBOL (coding_system);
8242   setup_coding_system (Fcheck_coding_system (coding_system),
8243                        &keyboard_coding);
8244   /* Characer composition should be disabled.  */
8245   keyboard_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
8246   return Qnil;
8247 }
8248
8249 DEFUN ("keyboard-coding-system",
8250        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
8251        doc: /* Return coding system specified for decoding keyboard input.  */)
8252      ()
8253 {
8254   return CODING_ID_NAME (keyboard_coding.id);
8255 }
8256
8257 \f
8258 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
8259        Sfind_operation_coding_system,  1, MANY, 0,
8260        doc: /* Choose a coding system for an operation based on the target name.
8261 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
8262 DECODING-SYSTEM is the coding system to use for decoding
8263 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
8264 for encoding (in case OPERATION does encoding).
8265
8266 The first argument OPERATION specifies an I/O primitive:
8267   For file I/O, `insert-file-contents' or `write-region'.
8268   For process I/O, `call-process', `call-process-region', or `start-process'.
8269   For network I/O, `open-network-stream'.
8270
8271 The remaining arguments should be the same arguments that were passed
8272 to the primitive.  Depending on which primitive, one of those arguments
8273 is selected as the TARGET.  For example, if OPERATION does file I/O,
8274 whichever argument specifies the file name is TARGET.
8275
8276 TARGET has a meaning which depends on OPERATION:
8277   For file I/O, TARGET is a file name.
8278   For process I/O, TARGET is a process name.
8279   For network I/O, TARGET is a service name or a port number
8280
8281 This function looks up what specified for TARGET in,
8282 `file-coding-system-alist', `process-coding-system-alist',
8283 or `network-coding-system-alist' depending on OPERATION.
8284 They may specify a coding system, a cons of coding systems,
8285 or a function symbol to call.
8286 In the last case, we call the function with one argument,
8287 which is a list of all the arguments given to this function.
8288
8289 usage: (find-operation-coding-system OPERATION ARGUMENTS ...)  */)
8290      (nargs, args)
8291      int nargs;
8292      Lisp_Object *args;
8293 {
8294   Lisp_Object operation, target_idx, target, val;
8295   register Lisp_Object chain;
8296
8297   if (nargs < 2)
8298     error ("Too few arguments");
8299   operation = args[0];
8300   if (!SYMBOLP (operation)
8301       || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
8302     error ("Invalid first arguement");
8303   if (nargs < 1 + XINT (target_idx))
8304     error ("Too few arguments for operation: %s",
8305            SDATA (SYMBOL_NAME (operation)));
8306   target = args[XINT (target_idx) + 1];
8307   if (!(STRINGP (target)
8308         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
8309     error ("Invalid %dth argument", XINT (target_idx) + 1);
8310
8311   chain = ((EQ (operation, Qinsert_file_contents)
8312             || EQ (operation, Qwrite_region))
8313            ? Vfile_coding_system_alist
8314            : (EQ (operation, Qopen_network_stream)
8315               ? Vnetwork_coding_system_alist
8316               : Vprocess_coding_system_alist));
8317   if (NILP (chain))
8318     return Qnil;
8319
8320   for (; CONSP (chain); chain = XCDR (chain))
8321     {
8322       Lisp_Object elt;
8323
8324       elt = XCAR (chain);
8325       if (CONSP (elt)
8326           && ((STRINGP (target)
8327                && STRINGP (XCAR (elt))
8328                && fast_string_match (XCAR (elt), target) >= 0)
8329               || (INTEGERP (target) && EQ (target, XCAR (elt)))))
8330         {
8331           val = XCDR (elt);
8332           /* Here, if VAL is both a valid coding system and a valid
8333              function symbol, we return VAL as a coding system.  */
8334           if (CONSP (val))
8335             return val;
8336           if (! SYMBOLP (val))
8337             return Qnil;
8338           if (! NILP (Fcoding_system_p (val)))
8339             return Fcons (val, val);
8340           if (! NILP (Ffboundp (val)))
8341             {
8342               val = call1 (val, Flist (nargs, args));
8343               if (CONSP (val))
8344                 return val;
8345               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
8346                 return Fcons (val, val);
8347             }
8348           return Qnil;
8349         }
8350     }
8351   return Qnil;
8352 }
8353
8354 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
8355        Sset_coding_system_priority, 0, MANY, 0,
8356        doc: /* Assign higher priority to the coding systems given as arguments.
8357 If multiple coding systems belongs to the same category,
8358 all but the first one are ignored.
8359
8360 usage: (set-coding-system-priority ...)  */)
8361      (nargs, args)
8362      int nargs;
8363      Lisp_Object *args;
8364 {
8365   int i, j;
8366   int changed[coding_category_max];
8367   enum coding_category priorities[coding_category_max];
8368
8369   bzero (changed, sizeof changed);
8370
8371   for (i = j = 0; i < nargs; i++)
8372     {
8373       enum coding_category category;
8374       Lisp_Object spec, attrs;
8375
8376       CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
8377       attrs = AREF (spec, 0);
8378       category = XINT (CODING_ATTR_CATEGORY (attrs));
8379       if (changed[category])
8380         /* Ignore this coding system because a coding system of the
8381            same category already had a higher priority.  */
8382         continue;
8383       changed[category] = 1;
8384       priorities[j++] = category;
8385       if (coding_categories[category].id >= 0
8386           && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
8387         setup_coding_system (args[i], &coding_categories[category]);
8388       Fset (AREF (Vcoding_category_table, category), args[i]);
8389     }
8390
8391   /* Now we have decided top J priorities.  Reflect the order of the
8392      original priorities to the remaining priorities.  */
8393
8394   for (i = j, j = 0; i < coding_category_max; i++, j++)
8395     {
8396       while (j < coding_category_max
8397              && changed[coding_priorities[j]])
8398         j++;
8399       if (j == coding_category_max)
8400         abort ();
8401       priorities[i] = coding_priorities[j];
8402     }
8403
8404   bcopy (priorities, coding_priorities, sizeof priorities);
8405
8406   /* Update `coding-category-list'.  */
8407   Vcoding_category_list = Qnil;
8408   for (i = coding_category_max - 1; i >= 0; i--)
8409     Vcoding_category_list
8410       = Fcons (AREF (Vcoding_category_table, priorities[i]),
8411                Vcoding_category_list);
8412
8413   return Qnil;
8414 }
8415
8416 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
8417        Scoding_system_priority_list, 0, 1, 0,
8418        doc: /* Return a list of coding systems ordered by their priorities.
8419 HIGHESTP non-nil means just return the highest priority one.  */)
8420      (highestp)
8421      Lisp_Object highestp;
8422 {
8423   int i;
8424   Lisp_Object val;
8425
8426   for (i = 0, val = Qnil; i < coding_category_max; i++)
8427     {
8428       enum coding_category category = coding_priorities[i];
8429       int id = coding_categories[category].id;
8430       Lisp_Object attrs;
8431
8432       if (id < 0)
8433         continue;
8434       attrs = CODING_ID_ATTRS (id);
8435       if (! NILP (highestp))
8436         return CODING_ATTR_BASE_NAME (attrs);
8437       val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
8438     }
8439   return Fnreverse (val);
8440 }
8441
8442 static char *suffixes[] = { "-unix", "-dos", "-mac" };
8443
8444 static Lisp_Object
8445 make_subsidiaries (base)
8446      Lisp_Object base;
8447 {
8448   Lisp_Object subsidiaries;
8449   int base_name_len = SBYTES (SYMBOL_NAME (base));
8450   char *buf = (char *) alloca (base_name_len + 6);
8451   int i;
8452
8453   bcopy (SDATA (SYMBOL_NAME (base)), buf, base_name_len);
8454   subsidiaries = Fmake_vector (make_number (3), Qnil);
8455   for (i = 0; i < 3; i++)
8456     {
8457       bcopy (suffixes[i], buf + base_name_len, strlen (suffixes[i]) + 1);
8458       ASET (subsidiaries, i, intern (buf));
8459     }
8460   return subsidiaries;
8461 }
8462
8463
8464 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
8465        Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
8466        doc: /* For internal use only.
8467 usage: (define-coding-system-internal ...)  */)
8468      (nargs, args)
8469      int nargs;
8470      Lisp_Object *args;
8471 {
8472   Lisp_Object name;
8473   Lisp_Object spec_vec;         /* [ ATTRS ALIASE EOL_TYPE ] */
8474   Lisp_Object attrs;            /* Vector of attributes.  */
8475   Lisp_Object eol_type;
8476   Lisp_Object aliases;
8477   Lisp_Object coding_type, charset_list, safe_charsets;
8478   enum coding_category category;
8479   Lisp_Object tail, val;
8480   int max_charset_id = 0;
8481   int i;
8482
8483   if (nargs < coding_arg_max)
8484     goto short_args;
8485
8486   attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
8487
8488   name = args[coding_arg_name];
8489   CHECK_SYMBOL (name);
8490   CODING_ATTR_BASE_NAME (attrs) = name;
8491
8492   val = args[coding_arg_mnemonic];
8493   if (! STRINGP (val))
8494     CHECK_CHARACTER (val);
8495   CODING_ATTR_MNEMONIC (attrs) = val;
8496
8497   coding_type = args[coding_arg_coding_type];
8498   CHECK_SYMBOL (coding_type);
8499   CODING_ATTR_TYPE (attrs) = coding_type;
8500
8501   charset_list = args[coding_arg_charset_list];
8502   if (SYMBOLP (charset_list))
8503     {
8504       if (EQ (charset_list, Qiso_2022))
8505         {
8506           if (! EQ (coding_type, Qiso_2022))
8507             error ("Invalid charset-list");
8508           charset_list = Viso_2022_charset_list;
8509         }
8510       else if (EQ (charset_list, Qemacs_mule))
8511         {
8512           if (! EQ (coding_type, Qemacs_mule))
8513             error ("Invalid charset-list");
8514           charset_list = Vemacs_mule_charset_list;
8515         }
8516       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
8517         if (max_charset_id < XFASTINT (XCAR (tail)))
8518           max_charset_id = XFASTINT (XCAR (tail));
8519     }
8520   else
8521     {
8522       charset_list = Fcopy_sequence (charset_list);
8523       for (tail = charset_list; !NILP (tail); tail = Fcdr (tail))
8524         {
8525           struct charset *charset;
8526
8527           val = Fcar (tail);
8528           CHECK_CHARSET_GET_CHARSET (val, charset);
8529           if (EQ (coding_type, Qiso_2022)
8530               ? CHARSET_ISO_FINAL (charset) < 0
8531               : EQ (coding_type, Qemacs_mule)
8532               ? CHARSET_EMACS_MULE_ID (charset) < 0
8533               : 0)
8534             error ("Can't handle charset `%s'",
8535                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8536
8537           XSETCAR (tail, make_number (charset->id));
8538           if (max_charset_id < charset->id)
8539             max_charset_id = charset->id;
8540         }
8541     }
8542   CODING_ATTR_CHARSET_LIST (attrs) = charset_list;
8543
8544   safe_charsets = Fmake_string (make_number (max_charset_id + 1),
8545                                 make_number (255));
8546   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
8547     SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
8548   CODING_ATTR_SAFE_CHARSETS (attrs) = safe_charsets;
8549
8550   CODING_ATTR_ASCII_COMPAT (attrs) = args[coding_arg_ascii_compatible_p];
8551
8552   val = args[coding_arg_decode_translation_table];
8553   if (! CHAR_TABLE_P (val) && ! CONSP (val))
8554     CHECK_SYMBOL (val);
8555   CODING_ATTR_DECODE_TBL (attrs) = val;
8556
8557   val = args[coding_arg_encode_translation_table];
8558   if (! CHAR_TABLE_P (val) && ! CONSP (val))
8559     CHECK_SYMBOL (val);
8560   CODING_ATTR_ENCODE_TBL (attrs) = val;
8561
8562   val = args[coding_arg_post_read_conversion];
8563   CHECK_SYMBOL (val);
8564   CODING_ATTR_POST_READ (attrs) = val;
8565
8566   val = args[coding_arg_pre_write_conversion];
8567   CHECK_SYMBOL (val);
8568   CODING_ATTR_PRE_WRITE (attrs) = val;
8569
8570   val = args[coding_arg_default_char];
8571   if (NILP (val))
8572     CODING_ATTR_DEFAULT_CHAR (attrs) = make_number (' ');
8573   else
8574     {
8575       CHECK_CHARACTER (val);
8576       CODING_ATTR_DEFAULT_CHAR (attrs) = val;
8577     }
8578
8579   val = args[coding_arg_for_unibyte];
8580   CODING_ATTR_FOR_UNIBYTE (attrs) = NILP (val) ? Qnil : Qt;
8581
8582   val = args[coding_arg_plist];
8583   CHECK_LIST (val);
8584   CODING_ATTR_PLIST (attrs) = val;
8585
8586   if (EQ (coding_type, Qcharset))
8587     {
8588       /* Generate a lisp vector of 256 elements.  Each element is nil,
8589          integer, or a list of charset IDs.
8590
8591          If Nth element is nil, the byte code N is invalid in this
8592          coding system.
8593
8594          If Nth element is a number NUM, N is the first byte of a
8595          charset whose ID is NUM.
8596
8597          If Nth element is a list of charset IDs, N is the first byte
8598          of one of them.  The list is sorted by dimensions of the
8599          charsets.  A charset of smaller dimension comes firtst. */
8600       val = Fmake_vector (make_number (256), Qnil);
8601
8602       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
8603         {
8604           struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
8605           int dim = CHARSET_DIMENSION (charset);
8606           int idx = (dim - 1) * 4;
8607
8608           if (CHARSET_ASCII_COMPATIBLE_P (charset))
8609             CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8610
8611           for (i = charset->code_space[idx];
8612                i <= charset->code_space[idx + 1]; i++)
8613             {
8614               Lisp_Object tmp, tmp2;
8615               int dim2;
8616
8617               tmp = AREF (val, i);
8618               if (NILP (tmp))
8619                 tmp = XCAR (tail);
8620               else if (NUMBERP (tmp))
8621                 {
8622                   dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
8623                   if (dim < dim2)
8624                     tmp = Fcons (XCAR (tail), Fcons (tmp, Qnil));
8625                   else
8626                     tmp = Fcons (tmp, Fcons (XCAR (tail), Qnil));
8627                 }
8628               else
8629                 {
8630                   for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
8631                     {
8632                       dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
8633                       if (dim < dim2)
8634                         break;
8635                     }
8636                   if (NILP (tmp2))
8637                     tmp = nconc2 (tmp, Fcons (XCAR (tail), Qnil));
8638                   else
8639                     {
8640                       XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
8641                       XSETCAR (tmp2, XCAR (tail));
8642                     }
8643                 }
8644               ASET (val, i, tmp);
8645             }
8646         }
8647       ASET (attrs, coding_attr_charset_valids, val);
8648       category = coding_category_charset;
8649     }
8650   else if (EQ (coding_type, Qccl))
8651     {
8652       Lisp_Object valids;
8653
8654       if (nargs < coding_arg_ccl_max)
8655         goto short_args;
8656
8657       val = args[coding_arg_ccl_decoder];
8658       CHECK_CCL_PROGRAM (val);
8659       if (VECTORP (val))
8660         val = Fcopy_sequence (val);
8661       ASET (attrs, coding_attr_ccl_decoder, val);
8662
8663       val = args[coding_arg_ccl_encoder];
8664       CHECK_CCL_PROGRAM (val);
8665       if (VECTORP (val))
8666         val = Fcopy_sequence (val);
8667       ASET (attrs, coding_attr_ccl_encoder, val);
8668
8669       val = args[coding_arg_ccl_valids];
8670       valids = Fmake_string (make_number (256), make_number (0));
8671       for (tail = val; !NILP (tail); tail = Fcdr (tail))
8672         {
8673           int from, to;
8674
8675           val = Fcar (tail);
8676           if (INTEGERP (val))
8677             {
8678               from = to = XINT (val);
8679               if (from < 0 || from > 255)
8680                 args_out_of_range_3 (val, make_number (0), make_number (255));
8681             }
8682           else
8683             {
8684               CHECK_CONS (val);
8685               CHECK_NATNUM_CAR (val);
8686               CHECK_NATNUM_CDR (val);
8687               from = XINT (XCAR (val));
8688               if (from > 255)
8689                 args_out_of_range_3 (XCAR (val),
8690                                      make_number (0), make_number (255));
8691               to = XINT (XCDR (val));
8692               if (to < from || to > 255)
8693                 args_out_of_range_3 (XCDR (val),
8694                                      XCAR (val), make_number (255));
8695             }
8696           for (i = from; i <= to; i++)
8697             SSET (valids, i, 1);
8698         }
8699       ASET (attrs, coding_attr_ccl_valids, valids);
8700
8701       category = coding_category_ccl;
8702     }
8703   else if (EQ (coding_type, Qutf_16))
8704     {
8705       Lisp_Object bom, endian;
8706
8707       CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
8708
8709       if (nargs < coding_arg_utf16_max)
8710         goto short_args;
8711
8712       bom = args[coding_arg_utf16_bom];
8713       if (! NILP (bom) && ! EQ (bom, Qt))
8714         {
8715           CHECK_CONS (bom);
8716           val = XCAR (bom);
8717           CHECK_CODING_SYSTEM (val);
8718           val = XCDR (bom);
8719           CHECK_CODING_SYSTEM (val);
8720         }
8721       ASET (attrs, coding_attr_utf_16_bom, bom);
8722
8723       endian = args[coding_arg_utf16_endian];
8724       CHECK_SYMBOL (endian);
8725       if (NILP (endian))
8726         endian = Qbig;
8727       else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
8728         error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
8729       ASET (attrs, coding_attr_utf_16_endian, endian);
8730
8731       category = (CONSP (bom)
8732                   ? coding_category_utf_16_auto
8733                   : NILP (bom)
8734                   ? (EQ (endian, Qbig)
8735                      ? coding_category_utf_16_be_nosig
8736                      : coding_category_utf_16_le_nosig)
8737                   : (EQ (endian, Qbig)
8738                      ? coding_category_utf_16_be
8739                      : coding_category_utf_16_le));
8740     }
8741   else if (EQ (coding_type, Qiso_2022))
8742     {
8743       Lisp_Object initial, reg_usage, request, flags;
8744       int i;
8745
8746       if (nargs < coding_arg_iso2022_max)
8747         goto short_args;
8748
8749       initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
8750       CHECK_VECTOR (initial);
8751       for (i = 0; i < 4; i++)
8752         {
8753           val = Faref (initial, make_number (i));
8754           if (! NILP (val))
8755             {
8756               struct charset *charset;
8757
8758               CHECK_CHARSET_GET_CHARSET (val, charset);
8759               ASET (initial, i, make_number (CHARSET_ID (charset)));
8760               if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
8761                 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8762             }
8763           else
8764             ASET (initial, i, make_number (-1));
8765         }
8766
8767       reg_usage = args[coding_arg_iso2022_reg_usage];
8768       CHECK_CONS (reg_usage);
8769       CHECK_NUMBER_CAR (reg_usage);
8770       CHECK_NUMBER_CDR (reg_usage);
8771
8772       request = Fcopy_sequence (args[coding_arg_iso2022_request]);
8773       for (tail = request; ! NILP (tail); tail = Fcdr (tail))
8774         {
8775           int id;
8776           Lisp_Object tmp;
8777
8778           val = Fcar (tail);
8779           CHECK_CONS (val);
8780           tmp = XCAR (val);
8781           CHECK_CHARSET_GET_ID (tmp, id);
8782           CHECK_NATNUM_CDR (val);
8783           if (XINT (XCDR (val)) >= 4)
8784             error ("Invalid graphic register number: %d", XINT (XCDR (val)));
8785           XSETCAR (val, make_number (id));
8786         }
8787
8788       flags = args[coding_arg_iso2022_flags];
8789       CHECK_NATNUM (flags);
8790       i = XINT (flags);
8791       if (EQ (args[coding_arg_charset_list], Qiso_2022))
8792         flags = make_number (i | CODING_ISO_FLAG_FULL_SUPPORT);
8793
8794       ASET (attrs, coding_attr_iso_initial, initial);
8795       ASET (attrs, coding_attr_iso_usage, reg_usage);
8796       ASET (attrs, coding_attr_iso_request, request);
8797       ASET (attrs, coding_attr_iso_flags, flags);
8798       setup_iso_safe_charsets (attrs);
8799
8800       if (i & CODING_ISO_FLAG_SEVEN_BITS)
8801         category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
8802                           | CODING_ISO_FLAG_SINGLE_SHIFT))
8803                     ? coding_category_iso_7_else
8804                     : EQ (args[coding_arg_charset_list], Qiso_2022)
8805                     ? coding_category_iso_7
8806                     : coding_category_iso_7_tight);
8807       else
8808         {
8809           int id = XINT (AREF (initial, 1));
8810
8811           category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
8812                        || EQ (args[coding_arg_charset_list], Qiso_2022)
8813                        || id < 0)
8814                       ? coding_category_iso_8_else
8815                       : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
8816                       ? coding_category_iso_8_1
8817                       : coding_category_iso_8_2);
8818         }
8819       if (category != coding_category_iso_8_1
8820           && category != coding_category_iso_8_2)
8821         CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
8822     }
8823   else if (EQ (coding_type, Qemacs_mule))
8824     {
8825       if (EQ (args[coding_arg_charset_list], Qemacs_mule))
8826         ASET (attrs, coding_attr_emacs_mule_full, Qt);
8827       CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8828       category = coding_category_emacs_mule;
8829     }
8830   else if (EQ (coding_type, Qshift_jis))
8831     {
8832
8833       struct charset *charset;
8834
8835       if (XINT (Flength (charset_list)) != 3
8836           && XINT (Flength (charset_list)) != 4)
8837         error ("There should be three or four charsets");
8838
8839       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8840       if (CHARSET_DIMENSION (charset) != 1)
8841         error ("Dimension of charset %s is not one",
8842                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8843       if (CHARSET_ASCII_COMPATIBLE_P (charset))
8844         CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8845
8846       charset_list = XCDR (charset_list);
8847       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8848       if (CHARSET_DIMENSION (charset) != 1)
8849         error ("Dimension of charset %s is not one",
8850                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8851
8852       charset_list = XCDR (charset_list);
8853       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8854       if (CHARSET_DIMENSION (charset) != 2)
8855         error ("Dimension of charset %s is not two",
8856                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8857
8858       charset_list = XCDR (charset_list);
8859       if (! NILP (charset_list))
8860         {
8861           charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8862           if (CHARSET_DIMENSION (charset) != 2)
8863             error ("Dimension of charset %s is not two",
8864                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8865         }
8866
8867       category = coding_category_sjis;
8868       Vsjis_coding_system = name;
8869     }
8870   else if (EQ (coding_type, Qbig5))
8871     {
8872       struct charset *charset;
8873
8874       if (XINT (Flength (charset_list)) != 2)
8875         error ("There should be just two charsets");
8876
8877       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8878       if (CHARSET_DIMENSION (charset) != 1)
8879         error ("Dimension of charset %s is not one",
8880                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8881       if (CHARSET_ASCII_COMPATIBLE_P (charset))
8882         CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8883
8884       charset_list = XCDR (charset_list);
8885       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8886       if (CHARSET_DIMENSION (charset) != 2)
8887         error ("Dimension of charset %s is not two",
8888                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8889
8890       category = coding_category_big5;
8891       Vbig5_coding_system = name;
8892     }
8893   else if (EQ (coding_type, Qraw_text))
8894     {
8895       category = coding_category_raw_text;
8896       CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8897     }
8898   else if (EQ (coding_type, Qutf_8))
8899     {
8900       category = coding_category_utf_8;
8901       CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8902     }
8903   else if (EQ (coding_type, Qundecided))
8904     category = coding_category_undecided;
8905   else
8906     error ("Invalid coding system type: %s",
8907            SDATA (SYMBOL_NAME (coding_type)));
8908
8909   CODING_ATTR_CATEGORY (attrs) = make_number (category);
8910   CODING_ATTR_PLIST (attrs)
8911     = Fcons (QCcategory, Fcons (AREF (Vcoding_category_table, category),
8912                                 CODING_ATTR_PLIST (attrs)));
8913   CODING_ATTR_PLIST (attrs)
8914     = Fcons (QCascii_compatible_p,
8915              Fcons (CODING_ATTR_ASCII_COMPAT (attrs),
8916                     CODING_ATTR_PLIST (attrs)));
8917
8918   eol_type = args[coding_arg_eol_type];
8919   if (! NILP (eol_type)
8920       && ! EQ (eol_type, Qunix)
8921       && ! EQ (eol_type, Qdos)
8922       && ! EQ (eol_type, Qmac))
8923     error ("Invalid eol-type");
8924
8925   aliases = Fcons (name, Qnil);
8926
8927   if (NILP (eol_type))
8928     {
8929       eol_type = make_subsidiaries (name);
8930       for (i = 0; i < 3; i++)
8931         {
8932           Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
8933
8934           this_name = AREF (eol_type, i);
8935           this_aliases = Fcons (this_name, Qnil);
8936           this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
8937           this_spec = Fmake_vector (make_number (3), attrs);
8938           ASET (this_spec, 1, this_aliases);
8939           ASET (this_spec, 2, this_eol_type);
8940           Fputhash (this_name, this_spec, Vcoding_system_hash_table);
8941           Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
8942           val = Fassoc (Fsymbol_name (this_name), Vcoding_system_alist);
8943           if (NILP (val))
8944             Vcoding_system_alist
8945               = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
8946                        Vcoding_system_alist);
8947         }
8948     }
8949
8950   spec_vec = Fmake_vector (make_number (3), attrs);
8951   ASET (spec_vec, 1, aliases);
8952   ASET (spec_vec, 2, eol_type);
8953
8954   Fputhash (name, spec_vec, Vcoding_system_hash_table);
8955   Vcoding_system_list = Fcons (name, Vcoding_system_list);
8956   val = Fassoc (Fsymbol_name (name), Vcoding_system_alist);
8957   if (NILP (val))
8958     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
8959                                   Vcoding_system_alist);
8960
8961   {
8962     int id = coding_categories[category].id;
8963
8964     if (id < 0 || EQ (name, CODING_ID_NAME (id)))
8965       setup_coding_system (name, &coding_categories[category]);
8966   }
8967
8968   return Qnil;
8969
8970  short_args:
8971   return Fsignal (Qwrong_number_of_arguments,
8972                   Fcons (intern ("define-coding-system-internal"),
8973                          make_number (nargs)));
8974 }
8975
8976
8977 DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
8978        3, 3, 0,
8979        doc: /* Change value in CODING-SYSTEM's property list PROP to VAL.  */)
8980   (coding_system, prop, val)
8981      Lisp_Object coding_system, prop, val;
8982 {
8983   Lisp_Object spec, attrs;
8984
8985   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
8986   attrs = AREF (spec, 0);
8987   if (EQ (prop, QCmnemonic))
8988     {
8989       if (! STRINGP (val))
8990         CHECK_CHARACTER (val);
8991       CODING_ATTR_MNEMONIC (attrs) = val;
8992     }
8993   else if (EQ (prop, QCdefalut_char))
8994     {
8995       if (NILP (val))
8996         val = make_number (' ');
8997       else
8998         CHECK_CHARACTER (val);
8999       CODING_ATTR_DEFAULT_CHAR (attrs) = val;
9000     }
9001   else if (EQ (prop, QCdecode_translation_table))
9002     {
9003       if (! CHAR_TABLE_P (val) && ! CONSP (val))
9004         CHECK_SYMBOL (val);
9005       CODING_ATTR_DECODE_TBL (attrs) = val;
9006     }
9007   else if (EQ (prop, QCencode_translation_table))
9008     {
9009       if (! CHAR_TABLE_P (val) && ! CONSP (val))
9010         CHECK_SYMBOL (val);
9011       CODING_ATTR_ENCODE_TBL (attrs) = val;
9012     }
9013   else if (EQ (prop, QCpost_read_conversion))
9014     {
9015       CHECK_SYMBOL (val);
9016       CODING_ATTR_POST_READ (attrs) = val;
9017     }
9018   else if (EQ (prop, QCpre_write_conversion))
9019     {
9020       CHECK_SYMBOL (val);
9021       CODING_ATTR_PRE_WRITE (attrs) = val;
9022     }
9023   else if (EQ (prop, QCascii_compatible_p))
9024     {
9025       CODING_ATTR_ASCII_COMPAT (attrs) = val;
9026     }
9027
9028   CODING_ATTR_PLIST (attrs)
9029     = Fplist_put (CODING_ATTR_PLIST (attrs), prop, val);
9030   return val;
9031 }
9032
9033
9034 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
9035        Sdefine_coding_system_alias, 2, 2, 0,
9036        doc: /* Define ALIAS as an alias for CODING-SYSTEM.  */)
9037      (alias, coding_system)
9038      Lisp_Object alias, coding_system;
9039 {
9040   Lisp_Object spec, aliases, eol_type, val;
9041
9042   CHECK_SYMBOL (alias);
9043   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
9044   aliases = AREF (spec, 1);
9045   /* ALISES should be a list of length more than zero, and the first
9046      element is a base coding system.  Append ALIAS at the tail of the
9047      list.  */
9048   while (!NILP (XCDR (aliases)))
9049     aliases = XCDR (aliases);
9050   XSETCDR (aliases, Fcons (alias, Qnil));
9051
9052   eol_type = AREF (spec, 2);
9053   if (VECTORP (eol_type))
9054     {
9055       Lisp_Object subsidiaries;
9056       int i;
9057
9058       subsidiaries = make_subsidiaries (alias);
9059       for (i = 0; i < 3; i++)
9060         Fdefine_coding_system_alias (AREF (subsidiaries, i),
9061                                      AREF (eol_type, i));
9062     }
9063
9064   Fputhash (alias, spec, Vcoding_system_hash_table);
9065   Vcoding_system_list = Fcons (alias, Vcoding_system_list);
9066   val = Fassoc (Fsymbol_name (alias), Vcoding_system_alist);
9067   if (NILP (val))
9068     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
9069                                   Vcoding_system_alist);
9070
9071   return Qnil;
9072 }
9073
9074 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
9075        1, 1, 0,
9076        doc: /* Return the base of CODING-SYSTEM.
9077 Any alias or subsidiary coding system is not a base coding system.  */)
9078   (coding_system)
9079      Lisp_Object coding_system;
9080 {
9081   Lisp_Object spec, attrs;
9082
9083   if (NILP (coding_system))
9084     return (Qno_conversion);
9085   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
9086   attrs = AREF (spec, 0);
9087   return CODING_ATTR_BASE_NAME (attrs);
9088 }
9089
9090 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
9091        1, 1, 0,
9092        doc: "Return the property list of CODING-SYSTEM.")
9093      (coding_system)
9094      Lisp_Object coding_system;
9095 {
9096   Lisp_Object spec, attrs;
9097
9098   if (NILP (coding_system))
9099     coding_system = Qno_conversion;
9100   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
9101   attrs = AREF (spec, 0);
9102   return CODING_ATTR_PLIST (attrs);
9103 }
9104
9105
9106 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
9107        1, 1, 0,
9108        doc: /* Return the list of aliases of CODING-SYSTEM.  */)
9109      (coding_system)
9110      Lisp_Object coding_system;
9111 {
9112   Lisp_Object spec;
9113
9114   if (NILP (coding_system))
9115     coding_system = Qno_conversion;
9116   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
9117   return AREF (spec, 1);
9118 }
9119
9120 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
9121        Scoding_system_eol_type, 1, 1, 0,
9122        doc: /* Return eol-type of CODING-SYSTEM.
9123 An eol-type is integer 0, 1, 2, or a vector of coding systems.
9124
9125 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
9126 and CR respectively.
9127
9128 A vector value indicates that a format of end-of-line should be
9129 detected automatically.  Nth element of the vector is the subsidiary
9130 coding system whose eol-type is N.  */)
9131      (coding_system)
9132      Lisp_Object coding_system;
9133 {
9134   Lisp_Object spec, eol_type;
9135   int n;
9136
9137   if (NILP (coding_system))
9138     coding_system = Qno_conversion;
9139   if (! CODING_SYSTEM_P (coding_system))
9140     return Qnil;
9141   spec = CODING_SYSTEM_SPEC (coding_system);
9142   eol_type = AREF (spec, 2);
9143   if (VECTORP (eol_type))
9144     return Fcopy_sequence (eol_type);
9145   n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
9146   return make_number (n);
9147 }
9148
9149 #endif /* emacs */
9150
9151 \f
/*** 12. Postamble ***/
9153
9154 void
9155 init_coding_once ()
9156 {
9157   int i;
9158
9159   for (i = 0; i < coding_category_max; i++)
9160     {
9161       coding_categories[i].id = -1;
9162       coding_priorities[i] = i;
9163     }
9164
9165   /* ISO2022 specific initialize routine.  */
9166   for (i = 0; i < 0x20; i++)
9167     iso_code_class[i] = ISO_control_0;
9168   for (i = 0x21; i < 0x7F; i++)
9169     iso_code_class[i] = ISO_graphic_plane_0;
9170   for (i = 0x80; i < 0xA0; i++)
9171     iso_code_class[i] = ISO_control_1;
9172   for (i = 0xA1; i < 0xFF; i++)
9173     iso_code_class[i] = ISO_graphic_plane_1;
9174   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
9175   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
9176   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
9177   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
9178   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
9179   iso_code_class[ISO_CODE_ESC] = ISO_escape;
9180   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
9181   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
9182   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
9183
9184   for (i = 0; i < 256; i++)
9185     {
9186       emacs_mule_bytes[i] = 1;
9187     }
9188   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
9189   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
9190   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
9191   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
9192 }
9193
9194 #ifdef emacs
9195
9196 void
9197 syms_of_coding ()
9198 {
9199   staticpro (&Vcoding_system_hash_table);
9200   {
9201     Lisp_Object args[2];
9202     args[0] = QCtest;
9203     args[1] = Qeq;
9204     Vcoding_system_hash_table = Fmake_hash_table (2, args);
9205   }
9206
9207   staticpro (&Vsjis_coding_system);
9208   Vsjis_coding_system = Qnil;
9209
9210   staticpro (&Vbig5_coding_system);
9211   Vbig5_coding_system = Qnil;
9212
9213   staticpro (&Vcode_conversion_reused_workbuf);
9214   Vcode_conversion_reused_workbuf = Qnil;
9215
9216   staticpro (&Vcode_conversion_workbuf_name);
9217   Vcode_conversion_workbuf_name = build_string (" *code-conversion-work*");
9218
9219   reused_workbuf_in_use = 0;
9220
9221   DEFSYM (Qcharset, "charset");
9222   DEFSYM (Qtarget_idx, "target-idx");
9223   DEFSYM (Qcoding_system_history, "coding-system-history");
9224   Fset (Qcoding_system_history, Qnil);
9225
9226   /* Target FILENAME is the first argument.  */
9227   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
9228   /* Target FILENAME is the third argument.  */
9229   Fput (Qwrite_region, Qtarget_idx, make_number (2));
9230
9231   DEFSYM (Qcall_process, "call-process");
9232   /* Target PROGRAM is the first argument.  */
9233   Fput (Qcall_process, Qtarget_idx, make_number (0));
9234
9235   DEFSYM (Qcall_process_region, "call-process-region");
9236   /* Target PROGRAM is the third argument.  */
9237   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
9238
9239   DEFSYM (Qstart_process, "start-process");
9240   /* Target PROGRAM is the third argument.  */
9241   Fput (Qstart_process, Qtarget_idx, make_number (2));
9242
9243   DEFSYM (Qopen_network_stream, "open-network-stream");
9244   /* Target SERVICE is the fourth argument.  */
9245   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
9246
9247   DEFSYM (Qcoding_system, "coding-system");
9248   DEFSYM (Qcoding_aliases, "coding-aliases");
9249
9250   DEFSYM (Qeol_type, "eol-type");
9251   DEFSYM (Qunix, "unix");
9252   DEFSYM (Qdos, "dos");
9253
9254   DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
9255   DEFSYM (Qpost_read_conversion, "post-read-conversion");
9256   DEFSYM (Qpre_write_conversion, "pre-write-conversion");
9257   DEFSYM (Qdefault_char, "default-char");
9258   DEFSYM (Qundecided, "undecided");
9259   DEFSYM (Qno_conversion, "no-conversion");
9260   DEFSYM (Qraw_text, "raw-text");
9261
9262   DEFSYM (Qiso_2022, "iso-2022");
9263
9264   DEFSYM (Qutf_8, "utf-8");
9265   DEFSYM (Qutf_8_emacs, "utf-8-emacs");
9266
9267   DEFSYM (Qutf_16, "utf-16");
9268   DEFSYM (Qbig, "big");
9269   DEFSYM (Qlittle, "little");
9270
9271   DEFSYM (Qshift_jis, "shift-jis");
9272   DEFSYM (Qbig5, "big5");
9273
9274   DEFSYM (Qcoding_system_p, "coding-system-p");
9275
9276   DEFSYM (Qcoding_system_error, "coding-system-error");
9277   Fput (Qcoding_system_error, Qerror_conditions,
9278         Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
9279   Fput (Qcoding_system_error, Qerror_message,
9280         build_string ("Invalid coding system"));
9281
9282   /* Intern this now in case it isn't already done.
9283      Setting this variable twice is harmless.
9284      But don't staticpro it here--that is done in alloc.c.  */
9285   Qchar_table_extra_slots = intern ("char-table-extra-slots");
9286
9287   DEFSYM (Qtranslation_table, "translation-table");
9288   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
9289   DEFSYM (Qtranslation_table_id, "translation-table-id");
9290   DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
9291   DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
9292
9293   DEFSYM (Qvalid_codes, "valid-codes");
9294
9295   DEFSYM (Qemacs_mule, "emacs-mule");
9296
9297   DEFSYM (QCcategory, ":category");
9298   DEFSYM (QCmnemonic, ":mnemonic");
9299   DEFSYM (QCdefalut_char, ":default-char");
9300   DEFSYM (QCdecode_translation_table, ":decode-translation-table");
9301   DEFSYM (QCencode_translation_table, ":encode-translation-table");
9302   DEFSYM (QCpost_read_conversion, ":post-read-conversion");
9303   DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
9304   DEFSYM (QCascii_compatible_p, ":ascii-compatible-p");
9305
9306   Vcoding_category_table
9307     = Fmake_vector (make_number (coding_category_max), Qnil);
9308   staticpro (&Vcoding_category_table);
9309   /* Followings are target of code detection.  */
9310   ASET (Vcoding_category_table, coding_category_iso_7,
9311         intern ("coding-category-iso-7"));
9312   ASET (Vcoding_category_table, coding_category_iso_7_tight,
9313         intern ("coding-category-iso-7-tight"));
9314   ASET (Vcoding_category_table, coding_category_iso_8_1,
9315         intern ("coding-category-iso-8-1"));
9316   ASET (Vcoding_category_table, coding_category_iso_8_2,
9317         intern ("coding-category-iso-8-2"));
9318   ASET (Vcoding_category_table, coding_category_iso_7_else,
9319         intern ("coding-category-iso-7-else"));
9320   ASET (Vcoding_category_table, coding_category_iso_8_else,
9321         intern ("coding-category-iso-8-else"));
9322   ASET (Vcoding_category_table, coding_category_utf_8,
9323         intern ("coding-category-utf-8"));
9324   ASET (Vcoding_category_table, coding_category_utf_16_be,
9325         intern ("coding-category-utf-16-be"));
9326   ASET (Vcoding_category_table, coding_category_utf_16_auto,
9327         intern ("coding-category-utf-16-auto"));
9328   ASET (Vcoding_category_table, coding_category_utf_16_le,
9329         intern ("coding-category-utf-16-le"));
9330   ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
9331         intern ("coding-category-utf-16-be-nosig"));
9332   ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
9333         intern ("coding-category-utf-16-le-nosig"));
9334   ASET (Vcoding_category_table, coding_category_charset,
9335         intern ("coding-category-charset"));
9336   ASET (Vcoding_category_table, coding_category_sjis,
9337         intern ("coding-category-sjis"));
9338   ASET (Vcoding_category_table, coding_category_big5,
9339         intern ("coding-category-big5"));
9340   ASET (Vcoding_category_table, coding_category_ccl,
9341         intern ("coding-category-ccl"));
9342   ASET (Vcoding_category_table, coding_category_emacs_mule,
9343         intern ("coding-category-emacs-mule"));
9344   /* Followings are NOT target of code detection.  */
9345   ASET (Vcoding_category_table, coding_category_raw_text,
9346         intern ("coding-category-raw-text"));
9347   ASET (Vcoding_category_table, coding_category_undecided,
9348         intern ("coding-category-undecided"));
9349
9350   DEFSYM (Qinsufficient_source, "insufficient-source");
9351   DEFSYM (Qinconsistent_eol, "inconsistent-eol");
9352   DEFSYM (Qinvalid_source, "invalid-source");
9353   DEFSYM (Qinterrupted, "interrupted");
9354   DEFSYM (Qinsufficient_memory, "insufficient-memory");
9355   DEFSYM (Qcoding_system_define_form, "coding-system-define-form");
9356
9357   defsubr (&Scoding_system_p);
9358   defsubr (&Sread_coding_system);
9359   defsubr (&Sread_non_nil_coding_system);
9360   defsubr (&Scheck_coding_system);
9361   defsubr (&Sdetect_coding_region);
9362   defsubr (&Sdetect_coding_string);
9363   defsubr (&Sfind_coding_systems_region_internal);
9364   defsubr (&Sunencodable_char_position);
9365   defsubr (&Scheck_coding_systems_region);
9366   defsubr (&Sdecode_coding_region);
9367   defsubr (&Sencode_coding_region);
9368   defsubr (&Sdecode_coding_string);
9369   defsubr (&Sencode_coding_string);
9370   defsubr (&Sdecode_sjis_char);
9371   defsubr (&Sencode_sjis_char);
9372   defsubr (&Sdecode_big5_char);
9373   defsubr (&Sencode_big5_char);
9374   defsubr (&Sset_terminal_coding_system_internal);
9375   defsubr (&Sset_safe_terminal_coding_system_internal);
9376   defsubr (&Sterminal_coding_system);
9377   defsubr (&Sset_keyboard_coding_system_internal);
9378   defsubr (&Skeyboard_coding_system);
9379   defsubr (&Sfind_operation_coding_system);
9380   defsubr (&Sset_coding_system_priority);
9381   defsubr (&Sdefine_coding_system_internal);
9382   defsubr (&Sdefine_coding_system_alias);
9383   defsubr (&Scoding_system_put);
9384   defsubr (&Scoding_system_base);
9385   defsubr (&Scoding_system_plist);
9386   defsubr (&Scoding_system_aliases);
9387   defsubr (&Scoding_system_eol_type);
9388   defsubr (&Scoding_system_priority_list);
9389
9390   DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
9391                doc: /* List of coding systems.
9392
9393 Do not alter the value of this variable manually.  This variable should be
9394 updated by the functions `define-coding-system' and
9395 `define-coding-system-alias'.  */);
9396   Vcoding_system_list = Qnil;
9397
9398   DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
9399                doc: /* Alist of coding system names.
9400 Each element is one element list of coding system name.
9401 This variable is given to `completing-read' as TABLE argument.
9402
9403 Do not alter the value of this variable manually.  This variable should be
9404 updated by the functions `make-coding-system' and
9405 `define-coding-system-alias'.  */);
9406   Vcoding_system_alist = Qnil;
9407
9408   DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
9409                doc: /* List of coding-categories (symbols) ordered by priority.
9410
9411 On detecting a coding system, Emacs tries code detection algorithms
9412 associated with each coding-category one by one in this order.  When
9413 one algorithm agrees with a byte sequence of source text, the coding
9414 system bound to the corresponding coding-category is selected.
9415
9416 Don't modify this variable directly, but use `set-coding-priority'.  */);
9417   {
9418     int i;
9419
9420     Vcoding_category_list = Qnil;
9421     for (i = coding_category_max - 1; i >= 0; i--)
9422       Vcoding_category_list
9423         = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
9424                  Vcoding_category_list);
9425   }
9426
9427   DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
9428                doc: /* Specify the coding system for read operations.
9429 It is useful to bind this variable with `let', but do not set it globally.
9430 If the value is a coding system, it is used for decoding on read operation.
9431 If not, an appropriate element is used from one of the coding system alists:
9432 There are three such tables, `file-coding-system-alist',
9433 `process-coding-system-alist', and `network-coding-system-alist'.  */);
9434   Vcoding_system_for_read = Qnil;
9435
9436   DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
9437                doc: /* Specify the coding system for write operations.
9438 Programs bind this variable with `let', but you should not set it globally.
9439 If the value is a coding system, it is used for encoding of output,
9440 when writing it to a file and when sending it to a file or subprocess.
9441
9442 If this does not specify a coding system, an appropriate element
9443 is used from one of the coding system alists:
9444 There are three such tables, `file-coding-system-alist',
9445 `process-coding-system-alist', and `network-coding-system-alist'.
9446 For output to files, if the above procedure does not specify a coding system,
9447 the value of `buffer-file-coding-system' is used.  */);
9448   Vcoding_system_for_write = Qnil;
9449
9450   DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
9451                doc: /*
9452 Coding system used in the latest file or process I/O.  */);
9453   Vlast_coding_system_used = Qnil;
9454
9455   DEFVAR_LISP ("last-code-conversion-error", &Vlast_code_conversion_error,
9456                doc: /*
9457 Error status of the last code conversion.
9458
9459 When an error was detected in the last code conversion, this variable
9460 is set to one of the following symbols.
9461   `insufficient-source'
9462   `inconsistent-eol'
9463   `invalid-source'
9464   `interrupted'
9465   `insufficient-memory'
9466 When no error was detected, the value doesn't change.  So, to check
9467 the error status of a code conversion by this variable, you must
9468 explicitly set this variable to nil before performing code
9469 conversion.  */);
9470   Vlast_code_conversion_error = Qnil;
9471
9472   DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
9473                doc: /*
9474 *Non-nil means always inhibit code conversion of end-of-line format.
9475 See info node `Coding Systems' and info node `Text and Binary' concerning
9476 such conversion.  */);
9477   inhibit_eol_conversion = 0;
9478
9479   DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
9480                doc: /*
9481 Non-nil means process buffer inherits coding system of process output.
9482 Bind it to t if the process output is to be treated as if it were a file
9483 read from some filesystem.  */);
9484   inherit_process_coding_system = 0;
9485
9486   DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
9487                doc: /*
9488 Alist to decide a coding system to use for a file I/O operation.
9489 The format is ((PATTERN . VAL) ...),
9490 where PATTERN is a regular expression matching a file name,
9491 VAL is a coding system, a cons of coding systems, or a function symbol.
9492 If VAL is a coding system, it is used for both decoding and encoding
9493 the file contents.
9494 If VAL is a cons of coding systems, the car part is used for decoding,
9495 and the cdr part is used for encoding.
9496 If VAL is a function symbol, the function must return a coding system
9497 or a cons of coding systems which are used as above.  The function gets
9498 the arguments with which `find-operation-coding-systems' was called.
9499
9500 See also the function `find-operation-coding-system'
9501 and the variable `auto-coding-alist'.  */);
9502   Vfile_coding_system_alist = Qnil;
9503
9504   DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
9505                doc: /*
9506 Alist to decide a coding system to use for a process I/O operation.
9507 The format is ((PATTERN . VAL) ...),
9508 where PATTERN is a regular expression matching a program name,
9509 VAL is a coding system, a cons of coding systems, or a function symbol.
9510 If VAL is a coding system, it is used for both decoding what received
9511 from the program and encoding what sent to the program.
9512 If VAL is a cons of coding systems, the car part is used for decoding,
9513 and the cdr part is used for encoding.
9514 If VAL is a function symbol, the function must return a coding system
9515 or a cons of coding systems which are used as above.
9516
9517 See also the function `find-operation-coding-system'.  */);
9518   Vprocess_coding_system_alist = Qnil;
9519
9520   DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
9521                doc: /*
9522 Alist to decide a coding system to use for a network I/O operation.
9523 The format is ((PATTERN . VAL) ...),
9524 where PATTERN is a regular expression matching a network service name
9525 or is a port number to connect to,
9526 VAL is a coding system, a cons of coding systems, or a function symbol.
9527 If VAL is a coding system, it is used for both decoding what received
9528 from the network stream and encoding what sent to the network stream.
9529 If VAL is a cons of coding systems, the car part is used for decoding,
9530 and the cdr part is used for encoding.
9531 If VAL is a function symbol, the function must return a coding system
9532 or a cons of coding systems which are used as above.
9533
9534 See also the function `find-operation-coding-system'.  */);
9535   Vnetwork_coding_system_alist = Qnil;
9536
9537   DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
9538                doc: /* Coding system to use with system messages.
9539 Also used for decoding keyboard input on X Window system.  */);
9540   Vlocale_coding_system = Qnil;
9541
9542   /* The eol mnemonics are reset in startup.el system-dependently.  */
9543   DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
9544                doc: /*
9545 *String displayed in mode line for UNIX-like (LF) end-of-line format.  */);
9546   eol_mnemonic_unix = build_string (":");
9547
9548   DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
9549                doc: /*
9550 *String displayed in mode line for DOS-like (CRLF) end-of-line format.  */);
9551   eol_mnemonic_dos = build_string ("\\");
9552
9553   DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
9554                doc: /*
9555 *String displayed in mode line for MAC-like (CR) end-of-line format.  */);
9556   eol_mnemonic_mac = build_string ("/");
9557
9558   DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
9559                doc: /*
9560 *String displayed in mode line when end-of-line format is not yet determined.  */);
9561   eol_mnemonic_undecided = build_string (":");
9562
9563   DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
9564                doc: /*
9565 *Non-nil enables character translation while encoding and decoding.  */);
9566   Venable_character_translation = Qt;
9567
9568   DEFVAR_LISP ("standard-translation-table-for-decode",
9569                &Vstandard_translation_table_for_decode,
9570                doc: /* Table for translating characters while decoding.  */);
9571   Vstandard_translation_table_for_decode = Qnil;
9572
9573   DEFVAR_LISP ("standard-translation-table-for-encode",
9574                &Vstandard_translation_table_for_encode,
9575                doc: /* Table for translating characters while encoding.  */);
9576   Vstandard_translation_table_for_encode = Qnil;
9577
9578   DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_table,
9579                doc: /* Alist of charsets vs revision numbers.
9580 While encoding, if a charset (car part of an element) is found,
9581 designate it with the escape sequence identifying revision (cdr part
9582 of the element).  */);
9583   Vcharset_revision_table = Qnil;
9584
9585   DEFVAR_LISP ("default-process-coding-system",
9586                &Vdefault_process_coding_system,
9587                doc: /* Cons of coding systems used for process I/O by default.
9588 The car part is used for decoding a process output,
9589 the cdr part is used for encoding a text to be sent to a process.  */);
9590   Vdefault_process_coding_system = Qnil;
9591
9592   DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
9593                doc: /*
9594 Table of extra Latin codes in the range 128..159 (inclusive).
9595 This is a vector of length 256.
9596 If Nth element is non-nil, the existence of code N in a file
9597 \(or output of subprocess) doesn't prevent it to be detected as
9598 a coding system of ISO 2022 variant which has a flag
9599 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
9600 or reading output of a subprocess.
9601 Only 128th through 159th elements has a meaning.  */);
9602   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
9603
9604   DEFVAR_LISP ("select-safe-coding-system-function",
9605                &Vselect_safe_coding_system_function,
9606                doc: /*
9607 Function to call to select safe coding system for encoding a text.
9608
9609 If set, this function is called to force a user to select a proper
9610 coding system which can encode the text in the case that a default
9611 coding system used in each operation can't encode the text.
9612
9613 The default value is `select-safe-coding-system' (which see).  */);
9614   Vselect_safe_coding_system_function = Qnil;
9615
9616   DEFVAR_BOOL ("coding-system-require-warning",
9617                &coding_system_require_warning,
9618                doc: /* Internal use only.
9619 If non-nil, on writing a file, `select-safe-coding-system-function' is
9620 called even if `coding-system-for-write' is non-nil.  The command
9621 `universal-coding-system-argument' binds this variable to t temporarily.  */);
9622   coding_system_require_warning = 0;
9623
9624
9625   DEFVAR_BOOL ("inhibit-iso-escape-detection",
9626                &inhibit_iso_escape_detection,
9627                doc: /*
9628 If non-nil, Emacs ignores ISO2022's escape sequence on code detection.
9629
9630 By default, on reading a file, Emacs tries to detect how the text is
9631 encoded.  This code detection is sensitive to escape sequences.  If
9632 the sequence is valid as ISO2022, the code is determined as one of
9633 the ISO2022 encodings, and the file is decoded by the corresponding
9634 coding system (e.g. `iso-2022-7bit').
9635
9636 However, there may be a case that you want to read escape sequences in
9637 a file as is.  In such a case, you can set this variable to non-nil.
9638 Then, as the code detection ignores any escape sequences, no file is
9639 detected as encoded in some ISO2022 encoding.  The result is that all
9640 escape sequences become visible in a buffer.
9641
9642 The default value is nil, and it is strongly recommended not to change
9643 it.  That is because many Emacs Lisp source files that contain
9644 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
9645 in Emacs's distribution, and they won't be decoded correctly on
9646 reading if you suppress escape sequence detection.
9647
9648 The other way to read escape sequences in a file without decoding is
9649 to explicitly specify some coding system that doesn't use ISO2022's
9650 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument].  */);
9651   inhibit_iso_escape_detection = 0;
9652
9653   DEFVAR_LISP ("translation-table-for-input", &Vtranslation_table_for_input,
9654                doc: /* Char table for translating self-inserting characters.
9655 This is applied to the result of input methods, not their input.  See also
9656 `keyboard-translate-table'.  */);
9657     Vtranslation_table_for_input = Qnil;
9658
9659   {
9660     Lisp_Object args[coding_arg_max];
9661     Lisp_Object plist[16];
9662     int i;
9663
9664     for (i = 0; i < coding_arg_max; i++)
9665       args[i] = Qnil;
9666
9667     plist[0] = intern (":name");
9668     plist[1] = args[coding_arg_name] = Qno_conversion;
9669     plist[2] = intern (":mnemonic");
9670     plist[3] = args[coding_arg_mnemonic] = make_number ('=');
9671     plist[4] = intern (":coding-type");
9672     plist[5] = args[coding_arg_coding_type] = Qraw_text;
9673     plist[6] = intern (":ascii-compatible-p");
9674     plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
9675     plist[8] = intern (":default-char");
9676     plist[9] = args[coding_arg_default_char] = make_number (0);
9677     plist[10] = intern (":for-unibyte");
9678     plist[11] = args[coding_arg_for_unibyte] = Qt;
9679     plist[12] = intern (":docstring");
9680     plist[13] = build_string ("Do no conversion.\n\
9681 \n\
9682 When you visit a file with this coding, the file is read into a\n\
9683 unibyte buffer as is, thus each byte of a file is treated as a\n\
9684 character.");
9685     plist[14] = intern (":eol-type");
9686     plist[15] = args[coding_arg_eol_type] = Qunix;
9687     args[coding_arg_plist] = Flist (16, plist);
9688     Fdefine_coding_system_internal (coding_arg_max, args);
9689
9690     plist[1] = args[coding_arg_name] = Qundecided;
9691     plist[3] = args[coding_arg_mnemonic] = make_number ('-');
9692     plist[5] = args[coding_arg_coding_type] = Qundecided;
9693     /* This is already set.
9694        plist[7] = args[coding_arg_ascii_compatible_p] = Qt; */
9695     plist[8] = intern (":charset-list");
9696     plist[9] = args[coding_arg_charset_list] = Fcons (Qascii, Qnil);
9697     plist[11] = args[coding_arg_for_unibyte] = Qnil;
9698     plist[13] = build_string ("No conversion on encoding, automatic conversion on decoding.");
9699     plist[15] = args[coding_arg_eol_type] = Qnil;
9700     args[coding_arg_plist] = Flist (16, plist);
9701     Fdefine_coding_system_internal (coding_arg_max, args);
9702   }
9703
9704   setup_coding_system (Qno_conversion, &keyboard_coding);
9705   setup_coding_system (Qundecided, &terminal_coding);
9706   setup_coding_system (Qno_conversion, &safe_terminal_coding);
9707
9708   {
9709     int i;
9710
9711     for (i = 0; i < coding_category_max; i++)
9712       Fset (AREF (Vcoding_category_table, i), Qno_conversion);
9713   }
9714 }
9715
9716 char *
9717 emacs_strerror (error_number)
9718      int error_number;
9719 {
9720   char *str;
9721
9722   synchronize_system_messages_locale ();
9723   str = strerror (error_number);
9724
9725   if (! NILP (Vlocale_coding_system))
9726     {
9727       Lisp_Object dec = code_convert_string_norecord (build_string (str),
9728                                                       Vlocale_coding_system,
9729                                                       0);
9730       str = (char *) SDATA (dec);
9731     }
9732
9733   return str;
9734 }
9735
9736 #endif /* emacs */
9737
9738 /* arch-tag: 3a3a2b01-5ff6-4071-9afe-f5b808d9229d
9739    (do not change this comment) */