src/coding.c

   1 /* Coding system handler (conversion, detection, etc).
   2    Copyright (C) 2001, 2002, 2003, 2004, 2005,
   3                  2006 Free Software Foundation, Inc.
   4    Copyright (C) 1995, 1997, 1998, 2002, 2003, 2004, 2005
   5      National Institute of Advanced Industrial Science and Technology (AIST)
   6      Registration Number H14PRO021
   7    Copyright (C) 2003
   8      National Institute of Advanced Industrial Science and Technology (AIST)
   9      Registration Number H13PRO009
  10
  11 This file is part of GNU Emacs.
  12
  13 GNU Emacs is free software; you can redistribute it and/or modify
  14 it under the terms of the GNU General Public License as published by
  15 the Free Software Foundation; either version 2, or (at your option)
  16 any later version.
  17
  18 GNU Emacs is distributed in the hope that it will be useful,
  19 but WITHOUT ANY WARRANTY; without even the implied warranty of
  20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  21 GNU General Public License for more details.
  22
  23 You should have received a copy of the GNU General Public License
  24 along with GNU Emacs; see the file COPYING.  If not, write to
  25 the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
  26 Boston, MA 02110-1301, USA.  */
  27
  28 /*** TABLE OF CONTENTS ***
  29
  30   0. General comments
  31   1. Preamble
  32   2. Emacs' internal format (emacs-utf-8) handlers
  33   3. UTF-8 handlers
  34   4. UTF-16 handlers
  35   5. Charset-base coding systems handlers
  36   6. emacs-mule (old Emacs' internal format) handlers
  37   7. ISO2022 handlers
  38   8. Shift-JIS and BIG5 handlers
  39   9. CCL handlers
  40   10. C library functions
  41   11. Emacs Lisp library functions
  42   12. Postamble
  43
  44 */
  45
  46 /*** 0. General comments ***
  47
  48
  49 CODING SYSTEM
  50
  51   A coding system is an object for an encoding mechanism that contains
  52   information about how to convert byte sequences to character
  53   sequences and vice versa.  When we say "decode", it means converting
  54   a byte sequence of a specific coding system into a character
  55   sequence that is represented by Emacs' internal coding system
  56   `emacs-utf-8', and when we say "encode", it means converting a
  57   character sequence of emacs-utf-8 to a byte sequence of a specific
  58   coding system.
  59
  60   In Emacs Lisp, a coding system is represented by a Lisp symbol.  At
  61   the C level, a coding system is represented by a vector of attributes
  62   stored in the hash table Vcoding_system_hash_table.  The conversion
  63   from a coding system symbol to its attribute vector is done by
  64   looking up Vcoding_system_hash_table with the symbol.
  65
  66   Coding systems are classified into the following types depending on
  67   the encoding mechanism.  Here's a brief description of the types.
  68
  69   o UTF-8
  70
  71   o UTF-16
  72
  73   o Charset-base coding system
  74
  75   A coding system defined by one or more (coded) character sets.
  76   Decoding and encoding are done by a code converter defined for each
  77   character set.
  78
  79   o Old Emacs internal format (emacs-mule)
  80
  81   The coding system adopted by old versions of Emacs (20 and 21).
  82
  83   o ISO2022-base coding system
  84
  85   The most famous coding system for multiple character sets.  X's
  86   Compound Text, various EUCs (Extended Unix Code), and coding systems
  87   used in the Internet communication such as ISO-2022-JP are all
  88   variants of ISO2022.
  89
  90   o SJIS (or Shift-JIS or MS-Kanji-Code)
  91
  92   A coding system to encode character sets: ASCII, JISX0201, and
  93   JISX0208.  Widely used for PC's in Japan.  Details are described in
  94   section 8.
  95
  96   o BIG5
  97
  98   A coding system to encode character sets: ASCII and Big5.  Widely
  99   used for Chinese (mainly in Taiwan and Hong Kong).  Details are
 100   described in section 8.  In this file, when we write "big5" (all
 101   lowercase), we mean the coding system, and when we write "Big5"
 102   (capitalized), we mean the character set.
 103
 104   o CCL
 105
 106   If a user wants to decode/encode text encoded in a coding system
 107   not listed above, he can supply a decoder and an encoder for it in
 108   CCL (Code Conversion Language) programs.  Emacs executes the CCL
 109   program while decoding/encoding.
 110
 111   o Raw-text
 112
 113   A coding system for text containing raw eight-bit data.  Emacs
 114   treats each byte of source text as a character (except for
 115   end-of-line conversion).
 116
 117   o No-conversion
 118
 119   Like raw text, but don't do end-of-line conversion.
 120
 121
 122 END-OF-LINE FORMAT
 123
 124   How text end-of-line is encoded depends on operating system.  For
 125   instance, Unix's format is just one byte of LF (line-feed) code,
 126   whereas DOS's format is two-byte sequence of `carriage-return' and
 127   `line-feed' codes.  MacOS's format is usually one byte of
 128   `carriage-return'.
 129
 130   Since text character encoding and end-of-line encoding are
 131   independent, any coding system described above can take any format
 132   of end-of-line (except for no-conversion).
 133
 134 STRUCT CODING_SYSTEM
 135
 136   Before using a coding system for code conversion (i.e. decoding and
 137   encoding), we setup a structure of type `struct coding_system'.
 138   This structure keeps various information about a specific code
 139   conversion (e.g. the location of source and destination data).
 140
 141 */
 142
 143 /* COMMON MACROS */
 144
 145
 146 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 147
 148   These functions check if a byte sequence specified as a source in
 149   CODING conforms to the format of XXX, and update the members of
 150   DETECT_INFO.
 151
 152   Return 1 if the byte sequence conforms to XXX, otherwise return 0.
 153
 154   Below is the template of these functions.  */
 155
 156 #if 0
 157 static int
 158 detect_coding_XXX (coding, detect_info)
 159      struct coding_system *coding;
 160      struct coding_detection_info *detect_info;
 161 {
 162   const unsigned char *src = coding->source;
 163   const unsigned char *src_end = coding->source + coding->src_bytes;
 164   int multibytep = coding->src_multibyte;
 165   int consumed_chars = 0;
 166   int found = 0;
 167   ...;
 168
 169   while (1)
 170     {
 171       /* Get one byte from the source.  If the source is exhausted,
 172          jump to no_more_source:.  */
 173       ONE_MORE_BYTE (c);
 174       /* Reject on a non-conforming byte; note strong evidence of XXX.  */
 175       if (! __C_conforms_to_XXX___ (c))
 176         break;
 177       if (__C_strongly_suggests_XXX__ (c))
 178         found = CATEGORY_MASK_XXX;
 179     }
 180   /* The byte sequence is invalid for XXX.  */
 181   detect_info->rejected |= CATEGORY_MASK_XXX;
 182   return 0;
 183
 184  no_more_source:
 185   /* The source was exhausted without finding an invalid byte.  */
 186   detect_info->found |= found;
 187   return 1;
 188 }
 189 #endif
 190
 191 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 192
 193   These functions decode a byte sequence specified as a source by
 194   CODING.  The resulting multibyte text goes to a place pointed to by
 195   CODING->charbuf, the length of which should not exceed
 196   CODING->charbuf_size.
 197
 198   These functions set the information of original and decoded texts in
 199   CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
 200   They also set CODING->result to one of CODING_RESULT_XXX indicating
 201   how the decoding is finished.
 202
 203   Below is the template of these functions.  */
 204
 205 #if 0
 206 static void
 207 decode_coding_XXXX (coding)
 208      struct coding_system *coding;
 209 {
 210   const unsigned char *src = coding->source + coding->consumed;
 211   const unsigned char *src_end = coding->source + coding->src_bytes;
 212   /* SRC_BASE remembers the start position in source in each loop.
 213      The loop will be exited when there are not enough source bytes, or
 214      when there's no room in CHARBUF for a decoded character.  */
 215   const unsigned char *src_base;
 216   /* A buffer to produce decoded characters.  */
 217   int *charbuf = coding->charbuf + coding->charbuf_used;
 218   int *charbuf_end = coding->charbuf + coding->charbuf_size;
 219   int multibytep = coding->src_multibyte;
 220
 221   while (1)
 222     {
 223       src_base = src;
 224       if (charbuf >= charbuf_end)
 225         /* No more room to produce a decoded character.  */
 226         break;
 227       ONE_MORE_BYTE (c);
 228       /* Decode it and store the result in CHARBUF.  */
 229     }
 230
 231  no_more_source:
 232   if (src_base < src_end
 233       && coding->mode & CODING_MODE_LAST_BLOCK)
 234     /* If the source ends with a partial byte sequence of a character,
 235        treat the remaining bytes as eight-bit raw data.  */
 236     while (src_base < src_end && charbuf < charbuf_end)
 237       *charbuf++ = *src_base++;
 238   /* Remember how many bytes and characters we consumed.  If the
 239      source is multibyte, the bytes and chars are not identical.  */
 240   coding->consumed = coding->consumed_char = src_base - coding->source;
 241   /* Remember how many characters we produced.  */
 242   coding->charbuf_used = charbuf - coding->charbuf;
 243 }
 244 #endif
 245
 246 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 247
 248   These functions encode SRC_BYTES length text at SOURCE of Emacs'
 249   internal multibyte format by CODING.  The resulting byte sequence
 250   goes to a place pointed to by DESTINATION, the length of which
 251   should not exceed DST_BYTES.
 252
 253   These functions set the information of original and encoded texts in
 254   the members produced, produced_char, consumed, and consumed_char of
 255   the structure *CODING.  They also set the member result to one of
 256   CODING_RESULT_XXX indicating how the encoding finished.
 257
 258   DST_BYTES zero means that the source and destination areas
 259   overlap, which means that we can produce encoded text only until it
 260   reaches the head of the not-yet-encoded source text.
 261
 262   Below is a template of these functions.  */
 263 #if 0
 264 static void
 265 encode_coding_XXX (coding)
 266      struct coding_system *coding;
 267 {
 268   int multibytep = coding->dst_multibyte;
 269   int *charbuf = coding->charbuf;
 270   int *charbuf_end = coding->charbuf + coding->charbuf_used;
 271   unsigned char *dst = coding->destination + coding->produced;
 272   unsigned char *dst_end = coding->destination + coding->dst_bytes;
 273   unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
 274   int produced_chars = 0;
 275
 276   for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
 277     {
 278       int c = *charbuf;
 279       /* Encode C into DST, and increment DST and PRODUCED_CHARS.  */
 280     }
 281  label_no_more_destination:
 282   /* Record how many chars and bytes we produced.  */
 283   coding->produced_char += produced_chars;
 284   coding->produced = dst - coding->destination;
 285 }
 286 #endif
 287
 288 \f
 289 /*** 1. Preamble ***/
 290
 291 #include <config.h>
 292 #include <stdio.h>
 293
 294 #include "lisp.h"
 295 #include "buffer.h"
 296 #include "character.h"
 297 #include "charset.h"
 298 #include "ccl.h"
 299 #include "composite.h"
 300 #include "coding.h"
 301 #include "window.h"
 302 /* Presumably the coding-system hash table of section 0; TODO confirm.  */
 303 Lisp_Object Vcoding_system_hash_table;
 304 /* Q.../QC... names: presumably Lisp symbols set up outside this excerpt.  */
 305 Lisp_Object Qcoding_system, Qcoding_aliases, Qeol_type;
 306 Lisp_Object Qunix, Qdos;
 307 extern Lisp_Object Qmac;        /* frame.c */
 308 Lisp_Object Qbuffer_file_coding_system;
 309 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
 310 Lisp_Object Qdefault_char;
 311 Lisp_Object Qno_conversion, Qundecided;
 312 Lisp_Object Qcharset, Qiso_2022, Qutf_8, Qutf_16, Qshift_jis, Qbig5;
 313 Lisp_Object Qbig, Qlittle;
 314 Lisp_Object Qcoding_system_history;
 315 Lisp_Object Qvalid_codes;
 316 Lisp_Object QCcategory, QCmnemonic, QCdefalut_char; /* sic: "defalut" */
 317 Lisp_Object QCdecode_translation_table, QCencode_translation_table;
 318 Lisp_Object QCpost_read_conversion, QCpre_write_conversion;
 319 Lisp_Object QCascii_compatible_p;
 320
 321 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
 322 Lisp_Object Qcall_process, Qcall_process_region;
 323 Lisp_Object Qstart_process, Qopen_network_stream;
 324 Lisp_Object Qtarget_idx;
 325
 326 Lisp_Object Qinsufficient_source, Qinconsistent_eol, Qinvalid_source;
 327 Lisp_Object Qinterrupted, Qinsufficient_memory;
 328
 329 /* If a symbol has this property, evaluate the value to define the
 330    symbol as a coding system.  */
 331 static Lisp_Object Qcoding_system_define_form;
 332
 333 int coding_system_require_warning;
 334
 335 Lisp_Object Vselect_safe_coding_system_function;
 336
 337 /* Mnemonic string for each end-of-line format.  */
 338 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
 339 /* Mnemonic string to indicate that the end-of-line format is not yet
 340    decided.  */
 341 Lisp_Object eol_mnemonic_undecided;
 342
 343 #ifdef emacs
 344
 345 Lisp_Object Vcoding_system_list, Vcoding_system_alist;
 346
 347 Lisp_Object Qcoding_system_p, Qcoding_system_error;
 348
 349 /* Coding systems emacs-mule and raw-text are for converting only the
 350    end-of-line format.  */
 351 Lisp_Object Qemacs_mule, Qraw_text;
 352 Lisp_Object Qutf_8_emacs;
 353
 354 /* Coding systems are passed between Emacs Lisp programs and C internal
 355    routines by the following three variables.  */
 356 /* Coding system for reading files and receiving data from a process.  */
 357 Lisp_Object Vcoding_system_for_read;
 358 /* Coding system for writing files and sending data to a process.  */
 359 Lisp_Object Vcoding_system_for_write;
 360 /* Coding system actually used in the latest I/O.  */
 361 Lisp_Object Vlast_coding_system_used;
 362 /* Set to non-nil when an error is detected during code conversion.  */
 363 Lisp_Object Vlast_code_conversion_error;
 364 /* A vector of length 256 which contains information about special
 365    Latin codes (especially for dealing with Microsoft codes).  */
 366 Lisp_Object Vlatin_extra_code_table;
 367
 368 /* Flag to inhibit code conversion of end-of-line format.  */
 369 int inhibit_eol_conversion;
 370
 371 /* Flag to inhibit ISO2022 escape sequence detection.  */
 372 int inhibit_iso_escape_detection;
 373
 374 /* Flag to make buffer-file-coding-system inherit from process-coding.  */
 375 int inherit_process_coding_system;
 376
 377 /* Coding system to be used to encode text for terminal display.  */
 378 struct coding_system terminal_coding;
 379
 380 /* Coding system to be used to encode text for terminal display when
 381    the terminal coding system is nil.  */
 382 struct coding_system safe_terminal_coding;
 383
 384 /* Coding system of what is sent from the terminal keyboard.  */
 385 struct coding_system keyboard_coding;
 386
 387 Lisp_Object Vfile_coding_system_alist;
 388 Lisp_Object Vprocess_coding_system_alist;
 389 Lisp_Object Vnetwork_coding_system_alist;
 390
 391 Lisp_Object Vlocale_coding_system;
 392
 393 #endif /* emacs */
 394
 395 /* Flag to tell whether we look up a translation table on character
 396    code conversion.  */
 397 Lisp_Object Venable_character_translation;
 398 /* Standard translation table to look up on decoding (reading).  */
 399 Lisp_Object Vstandard_translation_table_for_decode;
 400 /* Standard translation table to look up on encoding (writing).  */
 401 Lisp_Object Vstandard_translation_table_for_encode;
 402
 403 Lisp_Object Qtranslation_table;
 404 Lisp_Object Qtranslation_table_id;
 405 Lisp_Object Qtranslation_table_for_decode;
 406 Lisp_Object Qtranslation_table_for_encode;
 407
 408 /* Alist of charsets vs. revision numbers.  */
 409 static Lisp_Object Vcharset_revision_table;
 410
 411 /* Default coding systems used for process I/O.  */
 412 Lisp_Object Vdefault_process_coding_system;
 413
 414 /* Char table for translating Quail and self-inserting input.  */
 415 Lisp_Object Vtranslation_table_for_input;
 416
 417 /* Two special coding systems (presumably for SJIS and BIG5; confirm).  */
 418 Lisp_Object Vsjis_coding_system;
 419 Lisp_Object Vbig5_coding_system;
 420
 421 /* ISO2022 section */
 422 /* C integer at index REG of CODING's `coding_attr_iso_initial' slot.  */
 423 #define CODING_ISO_INITIAL(coding, reg)                 \
 424   (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id),    \
 425                      coding_attr_iso_initial),          \
 426                reg)))
 427 /* Value of CODING->safe_charsets[CHARSET_ID], or -1 if CHARSET_ID
 428    exceeds CODING->max_charset_id.  CHARSET_ID may be evaluated twice.  */
 429 #define CODING_ISO_REQUEST(coding, charset_id)  \
 430   (((charset_id) <= (coding)->max_charset_id    \
 431     ? (coding)->safe_charsets[charset_id]       \
 432     : -1))
 433 /* Accessors for the ISO2022 state in CODING->spec.iso_2022.  The last
 434    one yields current_designation[current_invocation[PLANE]].  */
 435 #define CODING_ISO_FLAGS(coding)        \
 436   ((coding)->spec.iso_2022.flags)
 437 #define CODING_ISO_DESIGNATION(coding, reg)     \
 438   ((coding)->spec.iso_2022.current_designation[reg])
 439 #define CODING_ISO_INVOCATION(coding, plane)    \
 440   ((coding)->spec.iso_2022.current_invocation[plane])
 441 #define CODING_ISO_SINGLE_SHIFTING(coding)      \
 442   ((coding)->spec.iso_2022.single_shifting)
 443 #define CODING_ISO_BOL(coding)  \
 444   ((coding)->spec.iso_2022.bol)
 445 #define CODING_ISO_INVOKED_CHARSET(coding, plane)       \
 446   CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
 447
 448 /* Control characters of ISO2022 (one-byte codes).  */
 449                         /* code */      /* function */
 450 #define ISO_CODE_LF     0x0A            /* line-feed */
 451 #define ISO_CODE_CR     0x0D            /* carriage-return */
 452 #define ISO_CODE_SO     0x0E            /* shift-out */
 453 #define ISO_CODE_SI     0x0F            /* shift-in */
 454 #define ISO_CODE_SS2_7  0x19            /* single-shift-2 for 7-bit code */
 455 #define ISO_CODE_ESC    0x1B            /* escape */
 456 #define ISO_CODE_SS2    0x8E            /* single-shift-2 */
 457 #define ISO_CODE_SS3    0x8F            /* single-shift-3 */
 458 #define ISO_CODE_CSI    0x9B            /* control-sequence-introducer */
 459
 460 /* Every 1-byte code of ISO2022 is classified into one of the
 461    following classes.  */
 462 enum iso_code_class_type
 463   {
 464     ISO_control_0,              /* Control codes in the range
 465                                    0x00..0x1F and 0x7F, except for the
 466                                    following 4 codes.  */
 467     ISO_shift_out,              /* ISO_CODE_SO (0x0E) */
 468     ISO_shift_in,               /* ISO_CODE_SI (0x0F) */
 469     ISO_single_shift_2_7,       /* ISO_CODE_SS2_7 (0x19) */
 470     ISO_escape,                 /* ISO_CODE_ESC (0x1B) */
 471     ISO_control_1,              /* Control codes in the range
 472                                    0x80..0x9F, except for the
 473                                    following 3 codes.  */
 474     ISO_single_shift_2,         /* ISO_CODE_SS2 (0x8E) */
 475     ISO_single_shift_3,         /* ISO_CODE_SS3 (0x8F) */
 476     ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
 477     ISO_0x20_or_0x7F,           /* Codes of the values 0x20 or 0x7F.  */
 478     ISO_graphic_plane_0,        /* Graphic codes in the range 0x21..0x7E.  */
 479     ISO_0xA0_or_0xFF,           /* Codes of the values 0xA0 or 0xFF.  */
 480     ISO_graphic_plane_1         /* Graphic codes in the range 0xA1..0xFE.  */
 481   };
 482
 483 /** Each macro CODING_ISO_FLAG_XXX defines one flag bit of the
 484     `iso-flags' attribute of an iso2022 coding system.  */
 485
 486 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
 487    instead of the correct short-form sequence (e.g. ESC $ A).  */
 488 #define CODING_ISO_FLAG_LONG_FORM       0x0001
 489
 490 /* If set, reset graphic planes and registers at end-of-line to the
 491    initial state.  */
 492 #define CODING_ISO_FLAG_RESET_AT_EOL    0x0002
 493
 494 /* If set, reset graphic planes and registers before any control
 495    characters to the initial state.  */
 496 #define CODING_ISO_FLAG_RESET_AT_CNTL   0x0004
 497
 498 /* If set, encode by 7-bit environment.  */
 499 #define CODING_ISO_FLAG_SEVEN_BITS      0x0008
 500
 501 /* If set, use locking-shift function.  */
 502 #define CODING_ISO_FLAG_LOCKING_SHIFT   0x0010
 503
 504 /* If set, use single-shift function.  Overrides
 505    CODING_ISO_FLAG_LOCKING_SHIFT.  */
 506 #define CODING_ISO_FLAG_SINGLE_SHIFT    0x0020
 507
 508 /* If set, use designation escape sequence.  */
 509 #define CODING_ISO_FLAG_DESIGNATION     0x0040
 510
 511 /* If set, produce revision number sequence.  */
 512 #define CODING_ISO_FLAG_REVISION        0x0080
 513
 514 /* If set, produce ISO6429's direction specifying sequence.  */
 515 #define CODING_ISO_FLAG_DIRECTION       0x0100
 516
 517 /* If set, assume designation states are reset at beginning of line on
 518    output.  */
 519 #define CODING_ISO_FLAG_INIT_AT_BOL     0x0200
 520
 521 /* If set, designation sequence should be placed at beginning of line
 522    on output.  */
 523 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
 524
 525 /* If set, do not encode unsafe characters on output.  */
 526 #define CODING_ISO_FLAG_SAFE            0x0800
 527
 528 /* If set, extra latin codes (128..159) are accepted as valid codes
 529    on input.  */
 530 #define CODING_ISO_FLAG_LATIN_EXTRA     0x1000
 531 /* The remaining flags carry no description here; see their uses.  */
 532 #define CODING_ISO_FLAG_COMPOSITION     0x2000
 533
 534 #define CODING_ISO_FLAG_EUC_TW_SHIFT    0x4000
 535
 536 #define CODING_ISO_FLAG_USE_ROMAN       0x8000
 537
 538 #define CODING_ISO_FLAG_USE_OLDJIS      0x10000
 539 /* Note: bits 0x20000..0x80000 are not assigned in this list.  */
 540 #define CODING_ISO_FLAG_FULL_SUPPORT    0x100000
 541
 542 /* A character to be produced on output if encoding of the original
 543    character is prohibited by CODING_ISO_FLAG_SAFE.  */
 544 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION  '?'
 545
 546
 547 /* UTF-16 section: accessors for the members of CODING->spec.utf_16.  */
 548 #define CODING_UTF_16_BOM(coding)       \
 549   ((coding)->spec.utf_16.bom)
 550
 551 #define CODING_UTF_16_ENDIAN(coding)    \
 552   ((coding)->spec.utf_16.endian)
 553
 554 #define CODING_UTF_16_SURROGATE(coding) \
 555   ((coding)->spec.utf_16.surrogate)
 556
 557
 558 /* CCL section: accessors for the CCL slots of CODING's attributes.  */
 559 #define CODING_CCL_DECODER(coding)      \
 560   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
 561 #define CODING_CCL_ENCODER(coding)      \
 562   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
 563 #define CODING_CCL_VALIDS(coding)                                          \
 564   (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
 565
 566 /* Index for each coding category in `coding_categories'.  Each value
 567    below raw_text is also the bit position of its CATEGORY_MASK_XXX.  */
 568 enum coding_category
 569   {
 570     coding_category_iso_7,
 571     coding_category_iso_7_tight,
 572     coding_category_iso_8_1,
 573     coding_category_iso_8_2,
 574     coding_category_iso_7_else,
 575     coding_category_iso_8_else,
 576     coding_category_utf_8,
 577     coding_category_utf_16_auto,
 578     coding_category_utf_16_be,
 579     coding_category_utf_16_le,
 580     coding_category_utf_16_be_nosig,
 581     coding_category_utf_16_le_nosig,
 582     coding_category_charset,
 583     coding_category_sjis,
 584     coding_category_big5,
 585     coding_category_ccl,
 586     coding_category_emacs_mule,
 587     /* All of the above are targets of code detection.  */
 588     coding_category_raw_text,
 589     coding_category_undecided,
 590     coding_category_max
 591   };
 592
 593 /* Flag bits (one per coding category) used in detect_coding_XXXX.  */
 594 #define CATEGORY_MASK_ISO_7             (1 << coding_category_iso_7)
 595 #define CATEGORY_MASK_ISO_7_TIGHT       (1 << coding_category_iso_7_tight)
 596 #define CATEGORY_MASK_ISO_8_1           (1 << coding_category_iso_8_1)
 597 #define CATEGORY_MASK_ISO_8_2           (1 << coding_category_iso_8_2)
 598 #define CATEGORY_MASK_ISO_7_ELSE        (1 << coding_category_iso_7_else)
 599 #define CATEGORY_MASK_ISO_8_ELSE        (1 << coding_category_iso_8_else)
 600 #define CATEGORY_MASK_UTF_8             (1 << coding_category_utf_8)
 601 #define CATEGORY_MASK_UTF_16_AUTO       (1 << coding_category_utf_16_auto)
 602 #define CATEGORY_MASK_UTF_16_BE         (1 << coding_category_utf_16_be)
 603 #define CATEGORY_MASK_UTF_16_LE         (1 << coding_category_utf_16_le)
 604 #define CATEGORY_MASK_UTF_16_BE_NOSIG   (1 << coding_category_utf_16_be_nosig)
 605 #define CATEGORY_MASK_UTF_16_LE_NOSIG   (1 << coding_category_utf_16_le_nosig)
 606 #define CATEGORY_MASK_CHARSET           (1 << coding_category_charset)
 607 #define CATEGORY_MASK_SJIS              (1 << coding_category_sjis)
 608 #define CATEGORY_MASK_BIG5              (1 << coding_category_big5)
 609 #define CATEGORY_MASK_CCL               (1 << coding_category_ccl)
 610 #define CATEGORY_MASK_EMACS_MULE        (1 << coding_category_emacs_mule)
 611 #define CATEGORY_MASK_RAW_TEXT          (1 << coding_category_raw_text)
 612 /* This value is returned if detect_coding_mask () finds nothing other
 613    than ASCII characters.  It omits CATEGORY_MASK_UTF_16_AUTO and
 614    CATEGORY_MASK_RAW_TEXT.  */
 615 #define CATEGORY_MASK_ANY               \
 616   (CATEGORY_MASK_ISO_7                  \
 617    | CATEGORY_MASK_ISO_7_TIGHT          \
 618    | CATEGORY_MASK_ISO_8_1              \
 619    | CATEGORY_MASK_ISO_8_2              \
 620    | CATEGORY_MASK_ISO_7_ELSE           \
 621    | CATEGORY_MASK_ISO_8_ELSE           \
 622    | CATEGORY_MASK_UTF_8                \
 623    | CATEGORY_MASK_UTF_16_BE            \
 624    | CATEGORY_MASK_UTF_16_LE            \
 625    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 626    | CATEGORY_MASK_UTF_16_LE_NOSIG      \
 627    | CATEGORY_MASK_CHARSET              \
 628    | CATEGORY_MASK_SJIS                 \
 629    | CATEGORY_MASK_BIG5                 \
 630    | CATEGORY_MASK_CCL                  \
 631    | CATEGORY_MASK_EMACS_MULE)
 632
 633 /* Groupings of the ISO2022 category masks.  */
 634 #define CATEGORY_MASK_ISO_7BIT \
 635   (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
 636
 637 #define CATEGORY_MASK_ISO_8BIT \
 638   (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
 639
 640 #define CATEGORY_MASK_ISO_ELSE \
 641   (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
 642
 643 #define CATEGORY_MASK_ISO_ESCAPE        \
 644   (CATEGORY_MASK_ISO_7                  \
 645    | CATEGORY_MASK_ISO_7_TIGHT          \
 646    | CATEGORY_MASK_ISO_7_ELSE           \
 647    | CATEGORY_MASK_ISO_8_ELSE)
 648
 649 #define CATEGORY_MASK_ISO       \
 650   (  CATEGORY_MASK_ISO_7BIT     \
 651      | CATEGORY_MASK_ISO_8BIT   \
 652      | CATEGORY_MASK_ISO_ELSE)
 653 /* Note: CATEGORY_MASK_UTF_16_AUTO is not part of this mask.  */
 654 #define CATEGORY_MASK_UTF_16            \
 655   (CATEGORY_MASK_UTF_16_BE              \
 656    | CATEGORY_MASK_UTF_16_LE            \
 657    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 658    | CATEGORY_MASK_UTF_16_LE_NOSIG)
 659
 660
 661 /* List of symbols `coding-category-xxx' ordered by priority.  This
 662    variable is exposed to Emacs Lisp.  */
 663 static Lisp_Object Vcoding_category_list;
 664
 665 /* Table of coding categories (Lisp symbols).  This variable is for
 666    internal use only.  */
 667 static Lisp_Object Vcoding_category_table;
 668
 669 /* Table of coding categories ordered by priority.  */
 670 static enum coding_category coding_priorities[coding_category_max];
 671
 672 /* Nth element is a coding context for the coding system bound to the
 673    Nth coding category.  */
 674 static struct coding_system coding_categories[coding_category_max];
 675
 676 /*** Commonly used macros and functions ***/
 677 /* Smaller/larger of A and B; the selected argument is evaluated twice.  */
 678 #ifndef min
 679 #define min(a, b) ((a) < (b) ? (a) : (b))
 680 #endif
 681 #ifndef max
 682 #define max(a, b) ((a) > (b) ? (a) : (b))
 683 #endif
 684 /* Set ATTRS and CHARSET_LIST from CODING's attribute vector.  */
 685 #define CODING_GET_INFO(coding, attrs, charset_list)    \
 686   do {                                                  \
 687     (attrs) = CODING_ID_ATTRS ((coding)->id);           \
 688     (charset_list) = CODING_ATTR_CHARSET_LIST (attrs);  \
 689   } while (0)
 690 /* Safely get one byte from the source text pointed by SRC which ends
 691    at SRC_END, and set C to that byte.  If there are no more bytes in
 692    the source, jump to `no_more_source' (first recording
 693    CODING_RESULT_INSUFFICIENT_SRC if SRC_BASE < SRC).  If MULTIBYTEP is
 694    nonzero and C has the high bit set: a two-byte sequence led by 0xC0
 695    or 0xC1 is folded into one value (the second byte is read without
 696    checking SRC_END); otherwise C is set to the negative character code
 697    and CODING_RESULT_INVALID_SRC is recorded.  The caller must declare
 698         src, src_end, src_base, multibytep, consumed_chars, coding.  */
 699
 700 #define ONE_MORE_BYTE(c)                                \
 701   do {                                                  \
 702     if (src == src_end)                                 \
 703       {                                                 \
 704         if (src_base < src)                             \
 705           record_conversion_result                      \
 706             (coding, CODING_RESULT_INSUFFICIENT_SRC);   \
 707         goto no_more_source;                            \
 708       }                                                 \
 709     c = *src++;                                         \
 710     if (multibytep && (c & 0x80))                       \
 711       {                                                 \
 712         if ((c & 0xFE) == 0xC0)                         \
 713           c = ((c & 1) << 6) | *src++;                  \
 714         else                                            \
 715           {                                             \
 716             src--;                                      \
 717             c = - string_char (src, &src, NULL);        \
 718             record_conversion_result                    \
 719               (coding, CODING_RESULT_INVALID_SRC);      \
 720           }                                             \
 721       }                                                 \
 722     consumed_chars++;                                   \
 723   } while (0)
 724 /* Like ONE_MORE_BYTE, but SRC is not tested against SRC_END first, so
 725    the caller must already know that a byte is available at SRC.  */
 726 #define ONE_MORE_BYTE_NO_CHECK(c)                       \
 727   do {                                                  \
 728     c = *src++;                                         \
 729     if (multibytep && (c & 0x80))                       \
 730       {                                                 \
 731         if ((c & 0xFE) == 0xC0)                         \
 732           c = ((c & 1) << 6) | *src++;                  \
 733         else                                            \
 734           {                                             \
 735             src--;                                      \
 736             c = - string_char (src, &src, NULL);        \
 737             record_conversion_result                    \
 738               (coding, CODING_RESULT_INVALID_SRC);      \
 739           }                                             \
 740       }                                                 \
 741     consumed_chars++;                                   \
 742   } while (0)
 743
 744
 745 /* Store a byte C in the place pointed by DST, advance DST to the
 746    next free position, and increment PRODUCED_CHARS.  The caller should
 747    ensure that C is 0..127, and declare and set the variables `dst'
 748    and `produced_chars' appropriately in advance.
 749 */
 750
 751
 752 #define EMIT_ONE_ASCII_BYTE(c)  \
 753   do {                          \
 754     produced_chars++;           \
 755     *dst++ = (c);               \
 756   } while (0)
 757
 758
 759 /* Like EMIT_ONE_ASCII_BYTE, but store two bytes: C1, then C2.  */
 760
 761 #define EMIT_TWO_ASCII_BYTES(c1, c2)    \
 762   do {                                  \
 763     produced_chars += 2;                \
 764     *dst++ = (c1), *dst++ = (c2);       \
 765   } while (0)
 766
 767
 768 /* Store a byte C in the place pointed by DST, advance DST to the
 769    next free position, and increment PRODUCED_CHARS.  If MULTIBYTEP is
 770    nonzero, store C in multibyte form, first mapping a value of 0x80
 771    or more with BYTE8_TO_CHAR.  The caller should declare and set the
 772    variables `dst', `multibytep' and `produced_chars' appropriately
 773    in advance.  */
 774 #define EMIT_ONE_BYTE(c)                \
 775   do {                                  \
 776     produced_chars++;                   \
 777     if (multibytep)                     \
 778       {                                 \
 779         int ch = (c);                   \
 780         if (ch >= 0x80)                 \
 781           ch = BYTE8_TO_CHAR (ch);      \
 782         CHAR_STRING_ADVANCE (ch, dst);  \
 783       }                                 \
 784     else                                \
 785       *dst++ = (c);                     \
 786   } while (0)
 787
 788
/* Like EMIT_ONE_BYTE, but emit two bytes; C1 and then C2.
   PRODUCED_CHARS is incremented by two.  When MULTIBYTEP is nonzero
   each of the two values is handled separately: a value of 0x80 or
   above is converted with BYTE8_TO_CHAR before being written with
   CHAR_STRING_ADVANCE.  Otherwise both are stored as raw bytes.

   The multibyte branch declares a local named `ch', so neither
   argument may refer to a variable called `ch'.  */

#define EMIT_TWO_BYTES(c1, c2)          \
  do {                                  \
    produced_chars += 2;                \
    if (multibytep)                     \
      {                                 \
        int ch;                         \
                                        \
        ch = (c1);                      \
        if (ch >= 0x80)                 \
          ch = BYTE8_TO_CHAR (ch);      \
        CHAR_STRING_ADVANCE (ch, dst);  \
        ch = (c2);                      \
        if (ch >= 0x80)                 \
          ch = BYTE8_TO_CHAR (ch);      \
        CHAR_STRING_ADVANCE (ch, dst);  \
      }                                 \
    else                                \
      {                                 \
        *dst++ = (c1);                  \
        *dst++ = (c2);                  \
      }                                 \
  } while (0)
 813
 814
/* Emit the three bytes C1, C2 and C3 in that order, by way of
   EMIT_ONE_BYTE followed by EMIT_TWO_BYTES.  The requirements on the
   caller's variables are those of the two macros used.  */

#define EMIT_THREE_BYTES(c1, c2, c3)    \
  do {                                  \
    EMIT_ONE_BYTE (c1);                 \
    EMIT_TWO_BYTES (c2, c3);            \
  } while (0)
 820
 821
/* Emit the four bytes C1, C2, C3 and C4 in that order, by way of two
   uses of EMIT_TWO_BYTES.  The requirements on the caller's variables
   are those of EMIT_TWO_BYTES.  */

#define EMIT_FOUR_BYTES(c1, c2, c3, c4)         \
  do {                                          \
    EMIT_TWO_BYTES (c1, c2);                    \
    EMIT_TWO_BYTES (c3, c4);                    \
  } while (0)
 827
 828
 829 /* Prototypes for static functions.  */
 830 static void record_conversion_result P_ ((struct coding_system *coding,
 831                                           enum coding_result_code result));
 832 static int detect_coding_utf_8 P_ ((struct coding_system *,
 833                                     struct coding_detection_info *info));
 834 static void decode_coding_utf_8 P_ ((struct coding_system *));
 835 static int encode_coding_utf_8 P_ ((struct coding_system *));
 836
 837 static int detect_coding_utf_16 P_ ((struct coding_system *,
 838                                      struct coding_detection_info *info));
 839 static void decode_coding_utf_16 P_ ((struct coding_system *));
 840 static int encode_coding_utf_16 P_ ((struct coding_system *));
 841
 842 static int detect_coding_iso_2022 P_ ((struct coding_system *,
 843                                        struct coding_detection_info *info));
 844 static void decode_coding_iso_2022 P_ ((struct coding_system *));
 845 static int encode_coding_iso_2022 P_ ((struct coding_system *));
 846
 847 static int detect_coding_emacs_mule P_ ((struct coding_system *,
 848                                          struct coding_detection_info *info));
 849 static void decode_coding_emacs_mule P_ ((struct coding_system *));
 850 static int encode_coding_emacs_mule P_ ((struct coding_system *));
 851
 852 static int detect_coding_sjis P_ ((struct coding_system *,
 853                                    struct coding_detection_info *info));
 854 static void decode_coding_sjis P_ ((struct coding_system *));
 855 static int encode_coding_sjis P_ ((struct coding_system *));
 856
 857 static int detect_coding_big5 P_ ((struct coding_system *,
 858                                    struct coding_detection_info *info));
 859 static void decode_coding_big5 P_ ((struct coding_system *));
 860 static int encode_coding_big5 P_ ((struct coding_system *));
 861
 862 static int detect_coding_ccl P_ ((struct coding_system *,
 863                                   struct coding_detection_info *info));
 864 static void decode_coding_ccl P_ ((struct coding_system *));
 865 static int encode_coding_ccl P_ ((struct coding_system *));
 866
 867 static void decode_coding_raw_text P_ ((struct coding_system *));
 868 static int encode_coding_raw_text P_ ((struct coding_system *));
 869
 870 static void coding_set_source P_ ((struct coding_system *));
 871 static void coding_set_destination P_ ((struct coding_system *));
 872 static void coding_alloc_by_realloc P_ ((struct coding_system *, EMACS_INT));
 873 static void coding_alloc_by_making_gap P_ ((struct coding_system *,
 874                                             EMACS_INT));
 875 static unsigned char *alloc_destination P_ ((struct coding_system *,
 876                                              EMACS_INT, unsigned char *));
 877 static void setup_iso_safe_charsets P_ ((Lisp_Object));
 878 static unsigned char *encode_designation_at_bol P_ ((struct coding_system *,
 879                                                      int *, int *,
 880                                                      unsigned char *));
 881 static int detect_eol P_ ((const unsigned char *,
 882                            EMACS_INT, enum coding_category));
 883 static Lisp_Object adjust_coding_eol_type P_ ((struct coding_system *, int));
 884 static void decode_eol P_ ((struct coding_system *));
 885 static Lisp_Object get_translation_table P_ ((Lisp_Object, int, int *));
 886 static Lisp_Object get_translation P_ ((Lisp_Object, int *, int *,
 887                                         int, int *, int *));
 888 static int produce_chars P_ ((struct coding_system *, Lisp_Object, int));
 889 static INLINE void produce_composition P_ ((struct coding_system *, int *,
 890                                             EMACS_INT));
 891 static INLINE void produce_charset P_ ((struct coding_system *, int *,
 892                                         EMACS_INT));
 893 static void produce_annotation P_ ((struct coding_system *, EMACS_INT));
 894 static int decode_coding P_ ((struct coding_system *));
 895 static INLINE int *handle_composition_annotation P_ ((EMACS_INT, EMACS_INT,
 896                                                       struct coding_system *,
 897                                                       int *, EMACS_INT *));
 898 static INLINE int *handle_charset_annotation P_ ((EMACS_INT, EMACS_INT,
 899                                                   struct coding_system *,
 900                                                   int *, EMACS_INT *));
 901 static void consume_chars P_ ((struct coding_system *, Lisp_Object, int));
 902 static int encode_coding P_ ((struct coding_system *));
 903 static Lisp_Object make_conversion_work_buffer P_ ((int));
 904 static Lisp_Object code_conversion_restore P_ ((Lisp_Object));
 905 static INLINE int char_encodable_p P_ ((int, Lisp_Object));
 906 static Lisp_Object make_subsidiaries P_ ((Lisp_Object));
 907
 908 static void
 909 record_conversion_result (struct coding_system *coding,
 910                           enum coding_result_code result)
 911 {
 912   coding->result = result;
 913   switch (result)
 914     {
 915     case CODING_RESULT_INSUFFICIENT_SRC:
 916       Vlast_code_conversion_error = Qinsufficient_source;
 917       break;
 918     case CODING_RESULT_INCONSISTENT_EOL:
 919       Vlast_code_conversion_error = Qinconsistent_eol;
 920       break;
 921     case CODING_RESULT_INVALID_SRC:
 922       Vlast_code_conversion_error = Qinvalid_source;
 923       break;
 924     case CODING_RESULT_INTERRUPT:
 925       Vlast_code_conversion_error = Qinterrupted;
 926       break;
 927     case CODING_RESULT_INSUFFICIENT_MEM:
 928       Vlast_code_conversion_error = Qinsufficient_memory;
 929       break;
 930     default:
 931       Vlast_code_conversion_error = intern ("Unknown error");
 932     }
 933 }
 934
/* Decode CODE of CHARSET with DECODE_CHAR and store the result in C.

   charset_map_loaded is cleared before the call and tested after it.
   If it became nonzero, coding_set_source is called to recompute
   CODING->source, and SRC, SRC_BASE and SRC_END are all shifted by
   the distance CODING->source moved, so that they keep pointing at
   the same bytes of the source text.  (Presumably loading a charset
   map can relocate the source text -- confirm.)  SRC, SRC_BASE and
   SRC_END must therefore be modifiable lvalues.  */
#define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
  do {                                                                       \
    charset_map_loaded = 0;                                                  \
    c = DECODE_CHAR (charset, code);                                         \
    if (charset_map_loaded)                                                  \
      {                                                                      \
        const unsigned char *orig = coding->source;                          \
        EMACS_INT offset;                                                    \
                                                                             \
        coding_set_source (coding);                                          \
        offset = coding->source - orig;                                      \
        src += offset;                                                       \
        src_base += offset;                                                  \
        src_end += offset;                                                   \
      }                                                                      \
  } while (0)
 951
 952
/* Make sure that BYTES more bytes can be stored at DST.  If DST +
   BYTES reaches or passes DST_END, call alloc_destination to enlarge
   the destination by BYTES plus the number of charbuf elements not
   yet processed (CHARBUF_END - CHARBUF), then reload DST and DST_END
   from the values alloc_destination and CODING report.  The caller
   must have `coding', `dst', `dst_end', `charbuf' and `charbuf_end'
   in scope.  */
#define ASSURE_DESTINATION(bytes)                               \
  do {                                                          \
    if (dst + (bytes) >= dst_end)                               \
      {                                                         \
        int more_bytes = charbuf_end - charbuf + (bytes);       \
                                                                \
        dst = alloc_destination (coding, more_bytes, dst);      \
        dst_end = coding->destination + coding->dst_bytes;      \
      }                                                         \
  } while (0)
 963
 964
 965
 966 static void
 967 coding_set_source (coding)
 968      struct coding_system *coding;
 969 {
 970   if (BUFFERP (coding->src_object))
 971     {
 972       struct buffer *buf = XBUFFER (coding->src_object);
 973
 974       if (coding->src_pos < 0)
 975         coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
 976       else
 977         coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
 978     }
 979   else if (STRINGP (coding->src_object))
 980     {
 981       coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
 982     }
 983   else
 984     /* Otherwise, the source is C string and is never relocated
 985        automatically.  Thus we don't have to update anything.  */
 986     ;
 987 }
 988
 989 static void
 990 coding_set_destination (coding)
 991      struct coding_system *coding;
 992 {
 993   if (BUFFERP (coding->dst_object))
 994     {
 995       if (coding->src_pos < 0)
 996         {
 997           coding->destination = BEG_ADDR + coding->dst_pos_byte - 1;
 998           coding->dst_bytes = (GAP_END_ADDR
 999                                - (coding->src_bytes - coding->consumed)
1000                                - coding->destination);
1001         }
1002       else
1003         {
1004           /* We are sure that coding->dst_pos_byte is before the gap
1005              of the buffer. */
1006           coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
1007                                  + coding->dst_pos_byte - 1);
1008           coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
1009                                - coding->destination);
1010         }
1011     }
1012   else
1013     /* Otherwise, the destination is C string and is never relocated
1014        automatically.  Thus we don't have to update anything.  */
1015     ;
1016 }
1017
1018
1019 static void
1020 coding_alloc_by_realloc (coding, bytes)
1021      struct coding_system *coding;
1022      EMACS_INT bytes;
1023 {
1024   coding->destination = (unsigned char *) xrealloc (coding->destination,
1025                                                     coding->dst_bytes + bytes);
1026   coding->dst_bytes += bytes;
1027 }
1028
/* Make room for BYTES more bytes in the destination buffer of CODING
   by calling make_gap.

   When source and destination are the same buffer, the buffer
   bookkeeping (GAP_SIZE, ZV, Z, ZV_BYTE, Z_BYTE) is adjusted by the
   number of source bytes not yet consumed before calling make_gap,
   and restored afterwards.  (Presumably this makes make_gap treat the
   unconsumed source, which sits at the end of the gap, as text to be
   preserved -- confirm against make_gap.)

   Otherwise the destination buffer is made current just for the
   duration of the make_gap call.  */
static void
coding_alloc_by_making_gap (coding, bytes)
     struct coding_system *coding;
     EMACS_INT bytes;
{
  if (BUFFERP (coding->dst_object)
      && EQ (coding->src_object, coding->dst_object))
    {
      /* Number of source bytes still waiting to be converted.  */
      EMACS_INT add = coding->src_bytes - coding->consumed;

      GAP_SIZE -= add; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
      make_gap (bytes);
      GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
    }
  else
    {
      Lisp_Object this_buffer;

      /* Remember the current buffer so that it can be restored after
         the gap of the destination buffer has been enlarged.  */
      this_buffer = Fcurrent_buffer ();
      set_buffer_internal (XBUFFER (coding->dst_object));
      make_gap (bytes);
      set_buffer_internal (XBUFFER (this_buffer));
    }
}
1053
1054
1055 static unsigned char *
1056 alloc_destination (coding, nbytes, dst)
1057      struct coding_system *coding;
1058      EMACS_INT nbytes;
1059      unsigned char *dst;
1060 {
1061   EMACS_INT offset = dst - coding->destination;
1062
1063   if (BUFFERP (coding->dst_object))
1064     coding_alloc_by_making_gap (coding, nbytes);
1065   else
1066     coding_alloc_by_realloc (coding, nbytes);
1067   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1068   coding_set_destination (coding);
1069   dst = coding->destination + offset;
1070   return dst;
1071 }
1072
1073 /** Macros for annotations.  */
1074
/* Maximum length of annotation data (sum of annotations for
   composition and charset), counted in charbuf elements.
   NOTE(review): the terms look like 4 header elements for a
   composition annotation, up to MAX_COMPOSITION_COMPONENTS * 2 - 1
   component elements, and 4 elements for a charset annotation --
   confirm against the producers of these annotations.  */
#define MAX_ANNOTATION_LENGTH (4 + (MAX_COMPOSITION_COMPONENTS * 2) - 1 + 4)
1078
/* Annotation data is stored in the array coding->charbuf in this
   format:
     [ -LENGTH ANNOTATION_MASK NCHARS ... ]
   LENGTH is the number of elements in the annotation; it is stored
   negated.
   ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
   NCHARS is the number of characters in the annotated text.

   The format of the following elements depends on ANNOTATION_MASK.

   In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
   follow:
     ... METHOD [ COMPOSITION-COMPONENTS ... ]
   METHOD is one of enum composition_method.
   The optional COMPOSITION-COMPONENTS are characters and composition
   rules.

   In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
   follows.  */
1097
/* Store the three-element header of an annotation at BUF and advance
   BUF past it: the negated length LEN, the annotation type MASK, and
   NCHARS, the number of annotated characters.  Also mark the coding
   system as carrying annotations.  The caller must have a variable
   `coding' (pointer to the coding system) in scope.

   The expansion deliberately has no trailing semicolon, so that
   `ADD_ANNOTATION_DATA (...);' is exactly one statement and can be
   used as the body of an `if' that has an `else'.  */
#define ADD_ANNOTATION_DATA(buf, len, mask, nchars)     \
  do {                                                  \
    *(buf)++ = -(len);                                  \
    *(buf)++ = (mask);                                  \
    *(buf)++ = (nchars);                                \
    coding->annotated = 1;                              \
  } while (0)
1105
/* Store a composition annotation at BUF and advance BUF past it: the
   annotation header (with length 4 and type
   CODING_ANNOTATE_COMPOSITION_MASK) for NCHARS characters, followed
   by the composition METHOD.  BUF and METHOD are parenthesized in the
   expansion so that arbitrary expressions can be passed safely.  */
#define ADD_COMPOSITION_DATA(buf, nchars, method)                           \
  do {                                                                      \
    ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
    *(buf)++ = (method);                                                    \
  } while (0)
1111
1112
/* Store a charset annotation at BUF and advance BUF past it: the
   annotation header (with length 4 and type
   CODING_ANNOTATE_CHARSET_MASK) for NCHARS characters, followed by
   the charset ID.  BUF and ID are parenthesized in the expansion so
   that arbitrary expressions can be passed safely.  */
#define ADD_CHARSET_DATA(buf, nchars, id)                               \
  do {                                                                  \
    ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars); \
    *(buf)++ = (id);                                                    \
  } while (0)
1118
1119 \f
1120 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1121
1122
1123
1124 \f
1125 /*** 3. UTF-8 ***/
1126
1127 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1128    Check if a text is encoded in UTF-8.  If it is, return 1, else
1129    return 0.  */
1130
/* Predicates classifying one byte C of a UTF-8 sequence by its high
   bits.  The callers in this file test for negative values before
   applying them.  */

/* C is below 0x80, i.e. a complete one-octet sequence.  */
#define UTF_8_1_OCTET_P(c)         ((c) < 0x80)
/* C has the form 10xxxxxx: a trailing octet of a longer sequence.  */
#define UTF_8_EXTRA_OCTET_P(c)     (((c) & 0xC0) == 0x80)
/* C has the form 110xxxxx: leading octet of a 2-octet sequence.  */
#define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
/* C has the form 1110xxxx: leading octet of a 3-octet sequence.  */
#define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
/* C has the form 11110xxx: leading octet of a 4-octet sequence.  */
#define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
/* C has the form 111110xx: leading octet of a 5-octet sequence.  */
#define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
1137
/* Check whether the source text of CODING is structurally valid
   UTF-8 and record the verdict in DETECT_INFO.

   CATEGORY_MASK_UTF_8 is always added to DETECT_INFO->checked.  The
   first CODING->head_ascii bytes are skipped.  Each remaining
   sequence must consist of a leading octet followed by the matching
   number of trailing octets (up to five octets in all); only the bit
   patterns are examined, not the decoded value.

   Return 0 and add CATEGORY_MASK_UTF_8 to DETECT_INFO->rejected if a
   malformed sequence is seen, or if the source ends in the middle of
   a sequence while CODING_MODE_LAST_BLOCK is set.  Otherwise return
   1; DETECT_INFO->found gets CATEGORY_MASK_UTF_8 only if at least one
   multi-octet sequence was seen.

   The `no_more_source' label has no explicit goto here; ONE_MORE_BYTE
   presumably jumps to it when SRC reaches SRC_END (its full
   definition is outside this part of the file).  `multibytep' and
   `consumed_chars' are likewise there for that macro.  */
static int
detect_coding_utf_8 (coding, detect_info)
     struct coding_system *coding;
     struct coding_detection_info *detect_info;
{
  const unsigned char *src = coding->source, *src_base;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  int multibytep = coding->src_multibyte;
  int consumed_chars = 0;
  int found = 0;

  detect_info->checked |= CATEGORY_MASK_UTF_8;
  /* A coding system of this category is always ASCII compatible.  */
  src += coding->head_ascii;

  while (1)
    {
      int c, c1, c2, c3, c4;

      /* Remember where this sequence starts, so that a truncated
         sequence can be recognized at `no_more_source'.  */
      src_base = src;
      ONE_MORE_BYTE (c);
      /* A negative value or a one-octet code is complete by itself.  */
      if (c < 0 || UTF_8_1_OCTET_P (c))
        continue;
      ONE_MORE_BYTE (c1);
      if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
        break;
      if (UTF_8_2_OCTET_LEADING_P (c))
        {
          found = CATEGORY_MASK_UTF_8;
          continue;
        }
      ONE_MORE_BYTE (c2);
      if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
        break;
      if (UTF_8_3_OCTET_LEADING_P (c))
        {
          found = CATEGORY_MASK_UTF_8;
          continue;
        }
      ONE_MORE_BYTE (c3);
      if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
        break;
      if (UTF_8_4_OCTET_LEADING_P (c))
        {
          found = CATEGORY_MASK_UTF_8;
          continue;
        }
      ONE_MORE_BYTE (c4);
      if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
        break;
      if (UTF_8_5_OCTET_LEADING_P (c))
        {
          found = CATEGORY_MASK_UTF_8;
          continue;
        }
      /* C is not a valid leading octet of any supported length.  */
      break;
    }
  /* The loop is left by `break' only for a malformed sequence.  */
  detect_info->rejected |= CATEGORY_MASK_UTF_8;
  return 0;

 no_more_source:
  /* SRC_BASE < SRC means the source ended inside a sequence; that is
     an error only if no more source will follow.  */
  if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
    {
      detect_info->rejected |= CATEGORY_MASK_UTF_8;
      return 0;
    }
  detect_info->found |= found;
  return 1;
}
1207
1208
/* Decode the UTF-8 source text of CODING, starting at offset
   CODING->consumed, into characters appended to CODING->charbuf.

   Decoding stops when the charbuf is full or the source is used up.
   Sequences of one to five octets are accepted.  Overlong forms,
   surrogate code points and values above MAX_CHAR are rejected; for a
   rejected sequence only its first byte is consumed, it is stored as
   itself (ASCII) or as BYTE8_TO_CHAR of it, and CODING->errors is
   incremented.

   On return CODING->consumed, CODING->consumed_char and
   CODING->charbuf_used are updated.  Only whole sequences count as
   consumed: the values recorded are those saved at the start of the
   last iteration.

   The `no_more_source' label has no explicit goto here; ONE_MORE_BYTE
   presumably jumps to it when SRC reaches SRC_END (its full
   definition is outside this part of the file).  */
static void
decode_coding_utf_8 (coding)
     struct coding_system *coding;
{
  const unsigned char *src = coding->source + coding->consumed;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  const unsigned char *src_base;
  int *charbuf = coding->charbuf + coding->charbuf_used;
  int *charbuf_end = coding->charbuf + coding->charbuf_size;
  int consumed_chars = 0, consumed_chars_base;
  int multibytep = coding->src_multibyte;
  Lisp_Object attr, charset_list;

  CODING_GET_INFO (coding, attr, charset_list);

  while (1)
    {
      int c, c1, c2, c3, c4, c5;

      /* Checkpoint: everything before this point has been decoded
         completely.  */
      src_base = src;
      consumed_chars_base = consumed_chars;

      if (charbuf >= charbuf_end)
        break;

      ONE_MORE_BYTE (c1);
      if (c1 < 0)
        {
          /* ONE_MORE_BYTE hands back a negated character code here;
             store the character itself.  */
          c = - c1;
        }
      else if (UTF_8_1_OCTET_P(c1))
        {
          c = c1;
        }
      else
        {
          ONE_MORE_BYTE (c2);
          if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
            goto invalid_code;
          if (UTF_8_2_OCTET_LEADING_P (c1))
            {
              c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
              /* Reject overlong sequences here and below.  Encoders
                 producing them are incorrect, they can be misleading,
                 and they mess up read/write invariance.  */
              if (c < 128)
                goto invalid_code;
            }
          else
            {
              ONE_MORE_BYTE (c3);
              if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
                goto invalid_code;
              if (UTF_8_3_OCTET_LEADING_P (c1))
                {
                  c = (((c1 & 0xF) << 12)
                       | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
                  if (c < 0x800
                      || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
                    goto invalid_code;
                }
              else
                {
                  ONE_MORE_BYTE (c4);
                  if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
                    goto invalid_code;
                  if (UTF_8_4_OCTET_LEADING_P (c1))
                    {
                    c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
                         | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
                    if (c < 0x10000)
                      goto invalid_code;
                    }
                  else
                    {
                      ONE_MORE_BYTE (c5);
                      if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
                        goto invalid_code;
                      if (UTF_8_5_OCTET_LEADING_P (c1))
                        {
                          c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
                               | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
                               | (c5 & 0x3F));
                          /* Reject values Emacs cannot represent as
                             well as overlong forms.  */
                          if ((c > MAX_CHAR) || (c < 0x200000))
                            goto invalid_code;
                        }
                      else
                        goto invalid_code;
                    }
                }
            }
        }

      *charbuf++ = c;
      continue;

    invalid_code:
      /* Go back to the start of the bad sequence and consume just its
         first byte, so that decoding resynchronizes on the next
         one.  */
      src = src_base;
      consumed_chars = consumed_chars_base;
      ONE_MORE_BYTE (c);
      *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
      coding->errors++;
    }

 no_more_source:
  /* Report only what was decoded up to the last checkpoint.  */
  coding->consumed_char += consumed_chars_base;
  coding->consumed = src_base - coding->source;
  coding->charbuf_used = charbuf - coding->charbuf;
}
1318
1319
/* Encode the characters in CODING->charbuf (CODING->charbuf_used of
   them) and append the result to the destination of CODING at offset
   CODING->produced.

   Each character is written with the byte sequence that CHAR_STRING /
   CHAR_STRING_ADVANCE produce for it, except that a character for
   which CHAR_BYTE8_P holds is written as the single byte
   CHAR_TO_BYTE8 gives.  For a multibyte destination every such byte
   goes through EMIT_ONE_BYTE.

   The conversion result is set to CODING_RESULT_SUCCESS, and
   CODING->produced and CODING->produced_char are updated.  Always
   return 0.  */
static int
encode_coding_utf_8 (coding)
     struct coding_system *coding;
{
  int multibytep = coding->dst_multibyte;
  int *charbuf = coding->charbuf;
  int *charbuf_end = charbuf + coding->charbuf_used;
  unsigned char *dst = coding->destination + coding->produced;
  unsigned char *dst_end = coding->destination + coding->dst_bytes;
  int produced_chars = 0;
  int c;

  if (multibytep)
    {
      /* Twice the maximum length, presumably because each byte of the
         encoded form may itself need two bytes in the multibyte
         destination -- confirm against BYTE8_TO_CHAR.  */
      int safe_room = MAX_MULTIBYTE_LENGTH * 2;

      while (charbuf < charbuf_end)
        {
          unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;

          ASSURE_DESTINATION (safe_room);
          c = *charbuf++;
          if (CHAR_BYTE8_P (c))
            {
              c = CHAR_TO_BYTE8 (c);
              EMIT_ONE_BYTE (c);
            }
          else
            {
              /* Build the byte sequence in STR first, then emit it
                 byte by byte; EMIT_ONE_BYTE counts each byte as one
                 produced character.  */
              CHAR_STRING_ADVANCE (c, pend);
              for (p = str; p < pend; p++)
                EMIT_ONE_BYTE (*p);
            }
        }
    }
  else
    {
      int safe_room = MAX_MULTIBYTE_LENGTH;

      while (charbuf < charbuf_end)
        {
          ASSURE_DESTINATION (safe_room);
          c = *charbuf++;
          if (CHAR_BYTE8_P (c))
            *dst++ = CHAR_TO_BYTE8 (c);
          else
            dst += CHAR_STRING (c, dst);
          /* NOTE(review): this counts one per source character even
             when CHAR_STRING stored several bytes, whereas the
             multibyte branch counts one per byte -- confirm which
             count callers expect for a unibyte destination.  */
          produced_chars++;
        }
    }
  record_conversion_result (coding, CODING_RESULT_SUCCESS);
  coding->produced_char += produced_chars;
  coding->produced = dst - coding->destination;
  return 0;
}
1375
1376
1377 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1378    Check if a text is encoded in one of UTF-16 based coding systems.
1379    If it is, return 1, else return 0.  */
1380
/* VAL is a high (leading) surrogate, i.e. in the range
   0xD800..0xDBFF.  */
#define UTF_16_HIGH_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xD800)

/* VAL is a low (trailing) surrogate, i.e. in the range
   0xDC00..0xDFFF.  */
#define UTF_16_LOW_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xDC00)

/* VAL is 0xFFFE, 0xFFFF, or a low surrogate.  VAL is evaluated more
   than once.  */
#define UTF_16_INVALID_P(val)   \
  (((val) == 0xFFFE)            \
   || ((val) == 0xFFFF)         \
   || UTF_16_LOW_SURROGATE_P (val))
1391
1392
/* Check whether the source text of CODING may be UTF-16, judging by
   its first two bytes, and record the verdict in DETECT_INFO.

   CATEGORY_MASK_UTF_16 is always added to DETECT_INFO->checked.  If
   this is the last block and CODING->src_chars is odd, all UTF-16
   categories are rejected and 0 is returned.  Otherwise 1 is
   returned, after updating DETECT_INFO as follows:
   - bytes FF FE: little endian with signature is found;
   - bytes FE FF: big endian with signature is found;
   - any other two plain bytes: the two categories with signature are
     rejected.
   In the first two cases the opposite endianness and both categories
   without signature are rejected.

   The `no_more_source' label has no explicit goto here; ONE_MORE_BYTE
   presumably jumps to it when fewer than two bytes are available (its
   full definition is outside this part of the file).  `src_base',
   `multibytep' and `consumed_chars' are there for that macro.  */
static int
detect_coding_utf_16 (coding, detect_info)
     struct coding_system *coding;
     struct coding_detection_info *detect_info;
{
  const unsigned char *src = coding->source, *src_base = src;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  int multibytep = coding->src_multibyte;
  int consumed_chars = 0;
  int c1, c2;

  detect_info->checked |= CATEGORY_MASK_UTF_16;
  /* NOTE(review): the parity test uses src_chars rather than
     src_bytes -- confirm that this is intended for multibyte
     sources.  */
  if (coding->mode & CODING_MODE_LAST_BLOCK
      && (coding->src_chars & 1))
    {
      detect_info->rejected |= CATEGORY_MASK_UTF_16;
      return 0;
    }

  ONE_MORE_BYTE (c1);
  ONE_MORE_BYTE (c2);
  if ((c1 == 0xFF) && (c2 == 0xFE))
    {
      detect_info->found |= (CATEGORY_MASK_UTF_16_LE
                             | CATEGORY_MASK_UTF_16_AUTO);
      detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
                                | CATEGORY_MASK_UTF_16_BE_NOSIG
                                | CATEGORY_MASK_UTF_16_LE_NOSIG);
    }
  else if ((c1 == 0xFE) && (c2 == 0xFF))
    {
      detect_info->found |= (CATEGORY_MASK_UTF_16_BE
                             | CATEGORY_MASK_UTF_16_AUTO);
      detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
                                | CATEGORY_MASK_UTF_16_BE_NOSIG
                                | CATEGORY_MASK_UTF_16_LE_NOSIG);
    }
  else if (c1 >= 0 && c2 >= 0)
    {
      /* No signature: only the categories that require one can be
         ruled out.  */
      detect_info->rejected
        |= (CATEGORY_MASK_UTF_16_BE | CATEGORY_MASK_UTF_16_LE);
    }
 no_more_source:
  return 1;
}
1438
1439 static void
1440 decode_coding_utf_16 (coding)
1441      struct coding_system *coding;
1442 {
1443   const unsigned char *src = coding->source + coding->consumed;
1444   const unsigned char *src_end = coding->source + coding->src_bytes;
1445   const unsigned char *src_base;
1446   int *charbuf = coding->charbuf + coding->charbuf_used;
1447   int *charbuf_end = coding->charbuf + coding->charbuf_size;
1448   int consumed_chars = 0, consumed_chars_base;
1449   int multibytep = coding->src_multibyte;
1450   enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding);
1451   enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1452   int surrogate = CODING_UTF_16_SURROGATE (coding);
1453   Lisp_Object attr, charset_list;
1454
1455   CODING_GET_INFO (coding, attr, charset_list);
1456
1457   if (bom == utf_16_with_bom)
1458     {
1459       int c, c1, c2;
1460
1461       src_base = src;
1462       ONE_MORE_BYTE (c1);
1463       ONE_MORE_BYTE (c2);
1464       c = (c1 << 8) | c2;
1465
1466       if (endian == utf_16_big_endian
1467           ? c != 0xFEFF : c != 0xFFFE)
1468         {
1469           /* The first two bytes are not BOM.  Treat them as bytes
1470              for a normal character.  */
1471           src = src_base;
1472           coding->errors++;
1473         }
1474       CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1475     }
1476   else if (bom == utf_16_detect_bom)
1477     {
1478       /* We have already tried to detect BOM and failed in
1479          detect_coding.  */
1480       CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1481     }
1482
1483   while (1)
1484     {
1485       int c, c1, c2;
1486
1487       src_base = src;
1488       consumed_chars_base = consumed_chars;
1489
1490       if (charbuf + 2 >= charbuf_end)
1491         break;
1492
1493       ONE_MORE_BYTE (c1);
1494       if (c1 < 0)
1495         {
1496           *charbuf++ = -c1;
1497           continue;
1498         }
1499       ONE_MORE_BYTE (c2);
1500       if (c2 < 0)
1501         {
1502           *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
1503           *charbuf++ = -c2;
1504           continue;
1505         }
1506       c = (endian == utf_16_big_endian
1507            ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1508       if (surrogate)
1509         {
1510           if (! UTF_16_LOW_SURROGATE_P (c))
1511             {
1512               if (endian == utf_16_big_endian)
1513                 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1514               else
1515                 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1516               *charbuf++ = c1;
1517               *charbuf++ = c2;
1518               coding->errors++;
1519               if (UTF_16_HIGH_SURROGATE_P (c))
1520                 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1521               else
1522                 *charbuf++ = c;
1523             }
1524           else
1525             {
1526               c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1527               CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1528               *charbuf++ = 0x10000 + c;
1529             }
1530         }
1531       else
1532         {
1533           if (UTF_16_HIGH_SURROGATE_P (c))
1534             CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1535           else
1536             *charbuf++ = c;
1537         }
1538     }
1539
1540  no_more_source:
1541   coding->consumed_char += consumed_chars_base;
1542   coding->consumed = src_base - coding->source;
1543   coding->charbuf_used = charbuf - coding->charbuf;
1544 }
1545
/* Encode the characters in CODING->charbuf (CODING->charbuf_used of
   them) as UTF-16 and append the result to the destination of CODING
   at offset CODING->produced.

   Unless the coding system is set to "without BOM", a byte order mark
   for the configured endianness is emitted first and the coding
   system is switched to "without BOM", so that the mark is written
   only once.  Characters below 0x10000 become one 16-bit unit, others
   a surrogate pair.  Characters not below MAX_UNICODE_CHAR are
   replaced by CODING->default_char.

   The conversion result is set to CODING_RESULT_SUCCESS, and
   CODING->produced and CODING->produced_char are updated.  Always
   return 0.  */
static int
encode_coding_utf_16 (coding)
     struct coding_system *coding;
{
  int multibytep = coding->dst_multibyte;
  int *charbuf = coding->charbuf;
  int *charbuf_end = charbuf + coding->charbuf_used;
  unsigned char *dst = coding->destination + coding->produced;
  unsigned char *dst_end = coding->destination + coding->dst_bytes;
  /* Room requested before each emission; the largest single emission
     below is four bytes through EMIT_FOUR_BYTES.  */
  int safe_room = 8;
  enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding);
  int big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
  int produced_chars = 0;
  Lisp_Object attrs, charset_list;
  int c;

  CODING_GET_INFO (coding, attrs, charset_list);

  if (bom != utf_16_without_bom)
    {
      ASSURE_DESTINATION (safe_room);
      if (big_endian)
        EMIT_TWO_BYTES (0xFE, 0xFF);
      else
        EMIT_TWO_BYTES (0xFF, 0xFE);
      CODING_UTF_16_BOM (coding) = utf_16_without_bom;
    }

  while (charbuf < charbuf_end)
    {
      ASSURE_DESTINATION (safe_room);
      c = *charbuf++;
      /* NOTE(review): `>=' also replaces a character equal to
         MAX_UNICODE_CHAR itself -- confirm whether `>' was meant.  */
      if (c >= MAX_UNICODE_CHAR)
        c = coding->default_char;

      if (c < 0x10000)
        {
          if (big_endian)
            EMIT_TWO_BYTES (c >> 8, c & 0xFF);
          else
            EMIT_TWO_BYTES (c & 0xFF, c >> 8);
        }
      else
        {
          int c1, c2;

          /* Split into a surrogate pair: C1 carries the upper bits,
             C2 the lower ten bits.  */
          c -= 0x10000;
          c1 = (c >> 10) + 0xD800;
          c2 = (c & 0x3FF) + 0xDC00;
          if (big_endian)
            EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
          else
            EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
        }
    }
  record_conversion_result (coding, CODING_RESULT_SUCCESS);
  coding->produced = dst - coding->destination;
  coding->produced_char += produced_chars;
  return 0;
}
1606
1607 \f
1608 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1609
1610 /* Emacs' internal format for representation of multiple character
1611    sets is a kind of multi-byte encoding, i.e. characters are
1612    represented by variable-length sequences of one-byte codes.
1613
1614    ASCII characters and control characters (e.g. `tab', `newline') are
1615    represented by one-byte sequences which are their ASCII codes, in
1616    the range 0x00 through 0x7F.
1617
1618    8-bit characters of the range 0x80..0x9F are represented by
1619    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1620    code + 0x20).
1621
1622    8-bit characters of the range 0xA0..0xFF are represented by
1623    one-byte sequences which are their 8-bit code.
1624
1625    The other characters are represented by a sequence of `base
1626    leading-code', optional `extended leading-code', and one or two
1627    `position-code's.  The length of the sequence is determined by the
1628    base leading-code.  Leading-code takes the range 0x81 through 0x9D,
1629    whereas extended leading-code and position-code take the range 0xA0
1630    through 0xFF.  See `charset.h' for more details about leading-code
1631    and position-code.
1632
1633    --- CODE RANGE of Emacs' internal format ---
1634    character set        range
1635    -------------        -----
1636    ascii                0x00..0x7F
1637    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1638    eight-bit-graphic    0xA0..0xFF
1639    ELSE                 0x81..0x9D + [0xA0..0xFF]+
1640    ---------------------------------------------
1641
1642    As this is the internal character representation, the format is
1643    usually not used externally (i.e. in a file or in a data sent to a
1644    process).  But, it is possible to have a text externally in this
1645    format (i.e. by encoding by the coding system `emacs-mule').
1646
1647    In that case, a sequence of one-byte codes has a slightly different
1648    form.
1649
1650    At first, all characters in eight-bit-control are represented by
1651    one-byte sequences which are their 8-bit code.
1652
1653    Next, character composition data are represented by the byte
1654    sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1655    where,
1656         METHOD is 0xF2 plus one of the composition methods (enum
1657         composition_method),
1658
1659         BYTES is 0xA0 plus a byte length of this composition data,
1660
1661         CHARS is 0xA0 plus the number of characters composed by this
1662         data,
1663
1664         COMPONENTs are characters in multibyte form or composition
1665         rules encoded as two bytes of ASCII codes.
1666
1667    In addition, for backward compatibility, the following formats are
1668    also recognized as composition data on decoding.
1669
1670    0x80 MSEQ ...
1671    0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1672
1673    Here,
1674         MSEQ is a multibyte form but in these special format:
1675           ASCII: 0xA0 ASCII_CODE+0x80,
1676           other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1677         RULE is a one byte code of the range 0xA0..0xF0 that
1678         represents a composition rule.
1679   */
1680
1681 char emacs_mule_bytes[256];     /* Indexed by the first byte of a sequence;
                                        emacs_mule_char switches on the entry
                                        (1 through 4) to decide how many bytes
                                        the sequence occupies.  */
1682
     /* Decode the emacs-mule byte sequence at SRC and return the decoded
        character.  On success store in *NBYTES the distance SRC has
        advanced, in *NCHARS the final value of consumed_chars, and, if ID
        is non-null, the id of the character's charset in *ID.

        Return -1 if the sequence is invalid, and -2 from the
        no_more_source label; the output arguments are left untouched in
        both cases.  No visible statement jumps to no_more_source, so
        presumably ONE_MORE_BYTE does when SRC reaches src_end, and
        presumably it is also what advances SRC and consumed_chars and
        reads multibytep -- confirm against its definition.  */
1683 int
1684 emacs_mule_char (coding, src, nbytes, nchars, id)
1685      struct coding_system *coding;
1686      const unsigned char *src;
1687      int *nbytes, *nchars, *id;
1688 {
1689   const unsigned char *src_end = coding->source + coding->src_bytes;
1690   const unsigned char *src_base = src;
1691   int multibytep = coding->src_multibyte;
1692   struct charset *charset;
1693   unsigned code;
1694   int c;
1695   int consumed_chars = 0;
1696
1697   ONE_MORE_BYTE (c);
1698   if (c < 0)
1699     {
           /* A negative value is returned negated and attributed to
              emacs_mule_charset[0], without any further decoding.  */
1700       c = -c;
1701       charset = emacs_mule_charset[0];
1702     }
1703   else
1704     {
           /* The table entry for the first byte gives the total length
              of the sequence.  Every following byte must be >= 0xA0;
              position bytes contribute their low 7 bits to CODE.  */
1705       switch (emacs_mule_bytes[c])
1706         {
1707         case 2:
               /* Leading code (selects the charset), one position byte.  */
1708           if (! (charset = emacs_mule_charset[c]))
1709             goto invalid_code;
1710           ONE_MORE_BYTE (c);
1711           if (c < 0xA0)
1712             goto invalid_code;
1713           code = c & 0x7F;
1714           break;
1715
1716         case 3:
1717           if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
1718               || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
1719             {
                   /* Private leading code, a second byte that selects
                      the charset, one position byte.  */
1720               ONE_MORE_BYTE (c);
1721               if (c < 0xA0 || ! (charset = emacs_mule_charset[c]))
1722                 goto invalid_code;
1723               ONE_MORE_BYTE (c);
1724               if (c < 0xA0)
1725                 goto invalid_code;
1726               code = c & 0x7F;
1727             }
1728           else
1729             {
                   /* Leading code (selects the charset), two position
                      bytes.  */
1730               if (! (charset = emacs_mule_charset[c]))
1731                 goto invalid_code;
1732               ONE_MORE_BYTE (c);
1733               if (c < 0xA0)
1734                 goto invalid_code;
1735               code = (c & 0x7F) << 8;
1736               ONE_MORE_BYTE (c);
1737               if (c < 0xA0)
1738                 goto invalid_code;
1739               code |= c & 0x7F;
1740             }
1741           break;
1742
1743         case 4:
               /* Leading code, a second byte that selects the charset,
                  two position bytes.  */
1744           ONE_MORE_BYTE (c);
1745           if (c < 0 || ! (charset = emacs_mule_charset[c]))
1746             goto invalid_code;
1747           ONE_MORE_BYTE (c);
1748           if (c < 0xA0)
1749             goto invalid_code;
1750           code = (c & 0x7F) << 8;
1751           ONE_MORE_BYTE (c);
1752           if (c < 0xA0)
1753             goto invalid_code;
1754           code |= c & 0x7F;
1755           break;
1756
1757         case 1:
               /* A single byte that stands for itself.  */
1758           code = c;
1759           charset = CHARSET_FROM_ID (ASCII_BYTE_P (code)
1760                                      ? charset_ascii : charset_eight_bit);
1761           break;
1762
1763         default:
1764           abort ();
1765         }
1766       c = DECODE_CHAR (charset, code);
1767       if (c < 0)
1768         goto invalid_code;
1769     }
1770   *nbytes = src - src_base;
1771   *nchars = consumed_chars;
1772   if (id)
1773     *id = charset->id;
1774   return c;
1775
1776  no_more_source:
1777   return -2;
1778
1779  invalid_code:
1780   return -1;
1781 }
1782
1783
1784 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1785    Check if a text is encoded in `emacs-mule'.  If it may be, return 1,
1786    else return 0.
        CATEGORY_MASK_EMACS_MULE is always added to DETECT_INFO->checked.
        When 0 is returned the mask is also added to DETECT_INFO->rejected.
        When 1 is returned the mask is added to DETECT_INFO->found only if
        at least one composition or multibyte sequence was seen.  */
1787
1788 static int
1789 detect_coding_emacs_mule (coding, detect_info)
1790      struct coding_system *coding;
1791      struct coding_detection_info *detect_info;
1792 {
1793   const unsigned char *src = coding->source, *src_base;
1794   const unsigned char *src_end = coding->source + coding->src_bytes;
1795   int multibytep = coding->src_multibyte;
1796   int consumed_chars = 0;
1797   int c;
1798   int found = 0;
1799
1800   detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1801   /* A coding system of this category is always ASCII compatible.  */
1802   src += coding->head_ascii;
1803
       /* The loop is left by `break' when the text is rejected, or
          through the no_more_source label, which no visible statement
          targets; presumably ONE_MORE_BYTE jumps there at src_end.  */
1804   while (1)
1805     {
1806       src_base = src;
1807       ONE_MORE_BYTE (c);
1808       if (c < 0)
1809         continue;
1810       if (c == 0x80)
1811         {
1812           /* Perhaps the start of composite character.  We simply skip
1813              it because analyzing it is too heavy for detecting.  But,
1814              at least, we check that the composite character
1815              consists of more than 4 bytes.  */
               /* NOTE(review): this declaration shadows the outer
                  src_base.  ONE_MORE_BYTE may reference src_base by
                  name, so confirm before renaming it.  */
1816           const unsigned char *src_base;
1817
1818         repeat:
1819           src_base = src;
1820           do
1821             {
1822               ONE_MORE_BYTE (c);
1823             }
1824           while (c >= 0xA0);
1825
1826           if (src - src_base <= 4)
1827             break;
1828           found = CATEGORY_MASK_EMACS_MULE;
1829           if (c == 0x80)
1830             goto repeat;
1831         }
1832
1833       if (c < 0x80)
1834         {
               /* ESC, SI and SO are taken as evidence against
                  emacs-mule.  */
1835           if (c < 0x20
1836               && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
1837             break;
1838         }
1839       else
1840         {
               /* Look up the sequence length by the leading byte C
                  itself.  The outer src_base cannot be used for this:
                  after a composition has been skipped above it still
                  points at the 0x80 byte, not at the byte now in C.  */
1841           int more_bytes = emacs_mule_bytes[c] - 1;
1842
1843           while (more_bytes > 0)
1844             {
1845               ONE_MORE_BYTE (c);
1846               if (c < 0xA0)
1847                 {
1848                   src--;        /* Unread the last byte.  */
1849                   break;
1850                 }
1851               more_bytes--;
1852             }
               /* A sequence cut short by a byte below 0xA0 rejects the
                  text.  */
1853           if (more_bytes != 0)
1854             break;
1855           found = CATEGORY_MASK_EMACS_MULE;
1856         }
1857     }
1858   detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1859   return 0;
1860
1861  no_more_source:
       /* The source ended.  If it ended in the middle of a sequence and
          no more input will follow, the text is rejected.  */
1862   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1863     {
1864       detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1865       return 0;
1866     }
1867   detect_info->found |= found;
1868   return 1;
1869 }
1870
1871
1872 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
1873
1874 /* Decode a character represented as a component of composition
1875    sequence of Emacs 20/21 style at SRC.  Store the character in *BUF,
1876    increment BUF, advance SRC past the character, and add the number
1877    of characters consumed to consumed_chars.  If SRC is at src_end, or
1878    emacs_mule_char returns -2 (source exhausted), `break' out of the
1879    enclosing loop.  If the sequence is invalid, jump to invalid_code.
        The body is written as `if (1) { ... } else' so that the `break'
        applies to the loop around the macro call; when the macro is used
        directly inside another macro's `do ... while (0)', that is the
        loop being left.  */
1880
1881 #define DECODE_EMACS_MULE_COMPOSITION_CHAR(buf)                 \
1882   if (1)                                                        \
1883     {                                                           \
1884       int c;                                                    \
1885       int nbytes, nchars;                                       \
1886                                                                 \
1887       if (src == src_end)                                       \
1888         break;                                                  \
1889       c = emacs_mule_char (coding, src, &nbytes, &nchars, NULL);\
1890       if (c < 0)                                                \
1891         {                                                       \
1892           if (c == -2)                                          \
1893             break;                                              \
1894           goto invalid_code;                                    \
1895         }                                                       \
1896       *buf++ = c;                                               \
1897       src += nbytes;                                            \
1898       consumed_chars += nchars;                                 \
1899     }                                                           \
1900   else
1901
1902
1903 /* Decode a composition rule represented as a component of composition
1904    sequence of Emacs 20 style at SRC.  Store the decoded rule in *BUF,
1905    and increment BUF.  The rule is one byte in the range 0xA0..0xF0
1906    whose offset from 0xA0 is GREF * 9 + NREF.  If no byte is left at
        SRC, or the byte is outside that range, jump to invalid_code.  */
1907
1908 #define DECODE_EMACS_MULE_COMPOSITION_RULE_20(buf)      \
1909   do {                                                  \
1910     int c, gref, nref;                                  \
1911                                                         \
1912     if (src >= src_end)                                 \
1913       goto invalid_code;                                \
1914     ONE_MORE_BYTE_NO_CHECK (c);                         \
         /* Rule bytes start at 0xA0, not at 0x20.  */       \
1915     c -= 0xA0;                                          \
1916     if (c < 0 || c >= 81)                               \
1917       goto invalid_code;                                \
1918                                                         \
1919     gref = c / 9, nref = c % 9;                         \
1920     *buf++ = COMPOSITION_ENCODE_RULE (gref, nref);      \
1921   } while (0)
1922
1923
1924 /* Decode a composition rule represented as a component of composition
1925    sequence of Emacs 21 style at SRC.  Store the decoded rule in *BUF,
1926    and increment BUF.  The rule occupies two bytes, GREF + 0x20 and
1927    NREF + 0x20.  If fewer than two bytes are left at SRC, or either
        value is outside 0..80, jump to invalid_code.  */
1928
1929 #define DECODE_EMACS_MULE_COMPOSITION_RULE_21(buf)      \
1930   do {                                                  \
1931     int gref, nref;                                     \
1932                                                         \
1933     if (src + 1>= src_end)                              \
1934       goto invalid_code;                                \
1935     ONE_MORE_BYTE_NO_CHECK (gref);                      \
1936     gref -= 0x20;                                       \
1937     ONE_MORE_BYTE_NO_CHECK (nref);                      \
1938     nref -= 0x20;                                       \
1939     if (gref < 0 || gref >= 81                          \
1940         || nref < 0 || nref >= 81)                      \
1941       goto invalid_code;                                \
1942     *buf++ = COMPOSITION_ENCODE_RULE (gref, nref);      \
1943   } while (0)
1944
1945
1946 #define DECODE_EMACS_MULE_21_COMPOSITION(c)                             \
1947   do {                                                                  \
1948     /* Emacs 21 style format.  C is the byte METHOD + 0xF2, which has   \
1949        already been read; the next two bytes at SRC are BYTES + 0xA0    \
1950        and CHARS + 0xA0, where BYTES is the byte length of this         \
1951        composition information, CHARS is the number of characters       \
            composed by this composition.  Uses src, charbuf,                \
            consumed_chars and consumed_chars_base of the expanding          \
            function, and its invalid_code label.  */                        \
1952     enum composition_method method = c - 0xF2;                          \
1953     int *charbuf_base = charbuf;                                        \
1954     int consumed_chars_limit;                                           \
1955     int nbytes, nchars;                                                 \
1956                                                                         \
1957     ONE_MORE_BYTE (c);                                                  \
1958     if (c < 0)                                                          \
1959       goto invalid_code;                                                \
1960     nbytes = c - 0xA0;                                                  \
1961     if (nbytes < 3)                                                     \
1962       goto invalid_code;                                                \
1963     ONE_MORE_BYTE (c);                                                  \
1964     if (c < 0)                                                          \
1965       goto invalid_code;                                                \
1966     nchars = c - 0xA0;                                                  \
1967     ADD_COMPOSITION_DATA (charbuf, nchars, method);                     \
1968     consumed_chars_limit = consumed_chars_base + nbytes;                \
         /* Every method except COMPOSITION_RELATIVE is followed by          \
            explicit components, read until BYTES have been consumed.  */    \
1969     if (method != COMPOSITION_RELATIVE)                                 \
1970       {                                                                 \
1971         int i = 0;                                                      \
1972         while (consumed_chars < consumed_chars_limit)                   \
1973           {                                                             \
                 /* Odd positions hold rules, unless the method is           \
                    COMPOSITION_WITH_ALTCHARS, where every component is      \
                    a character.  */                                         \
1974             if (i % 2 && method != COMPOSITION_WITH_ALTCHARS)           \
1975               DECODE_EMACS_MULE_COMPOSITION_RULE_21 (charbuf);          \
1976             else                                                        \
1977               DECODE_EMACS_MULE_COMPOSITION_CHAR (charbuf);             \
1978             i++;                                                        \
1979           }                                                             \
             /* Still below the limit: the loop was left by a `break' in     \
                DECODE_EMACS_MULE_COMPOSITION_CHAR before all components     \
                were read.  */                                               \
1980         if (consumed_chars < consumed_chars_limit)                      \
1981           goto invalid_code;                                            \
             /* Subtract the component count from the first element          \
                written by ADD_COMPOSITION_DATA (presumably the              \
                annotation's length field -- confirm).  */                   \
1982         charbuf_base[0] -= i;                                           \
1983       }                                                                 \
1984   } while (0)
1985
1986
1987 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION(c)            \
1988   do {                                                          \
1989     /* Emacs 20 style format for relative composition.  */      \
1990     /* Store multibyte form of characters to be composed.  */   \
         /* Uses src, src_base and charbuf of the expanding          \
            function, and its invalid_code label.  */                \
1991     enum composition_method method = COMPOSITION_RELATIVE;      \
1992     int components[MAX_COMPOSITION_COMPONENTS * 2 - 1];         \
1993     int *buf = components;                                      \
1994     int i, j;                                                   \
1995                                                                 \
         /* Rewind to the start of the sequence and read the 0x80    \
            byte again, so that SRC points at the first              \
            component.  */                                           \
1996     src = src_base;                                             \
1997     ONE_MORE_BYTE (c);          /* skip 0x80 */                 \
         /* Collect components until the macro below breaks out of   \
            the loop or the maximum count is reached.  */            \
1998     for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++)            \
1999       DECODE_EMACS_MULE_COMPOSITION_CHAR (buf);                 \
         /* A composition needs at least two characters.  */         \
2000     if (i < 2)                                                  \
2001       goto invalid_code;                                        \
2002     ADD_COMPOSITION_DATA (charbuf, i, method);                  \
2003     for (j = 0; j < i; j++)                                     \
2004       *charbuf++ = components[j];                               \
2005   } while (0)
2006
2007
2008 #define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION(c)            \
2009   do {                                                          \
2010     /* Emacs 20 style format for rule-base composition.  */     \
2011     /* Store multibyte form of characters to be composed.  */   \
         /* Uses src, charbuf and charbuf_end of the expanding       \
            function, and its invalid_code and no_more_source        \
            labels.  */                                              \
2012     enum composition_method method = COMPOSITION_WITH_RULE;     \
2013     int components[MAX_COMPOSITION_COMPONENTS * 2 - 1];         \
2014     int *buf = components;                                      \
2015     int i, j;                                                   \
2016                                                                 \
         /* NOTE(review): a `break' taken inside this first call     \
            leaves the whole `do ... while (0)' of this macro.  */   \
2017     DECODE_EMACS_MULE_COMPOSITION_CHAR (buf);                   \
         /* One character is already stored and each pass adds a     \
            rule and a character, so at most                         \
            MAX_COMPOSITION_COMPONENTS - 1 passes fit in             \
            COMPONENTS.  */                                          \
2018     for (i = 0; i < MAX_COMPOSITION_COMPONENTS - 1; i++)        \
2019       {                                                         \
2020         DECODE_EMACS_MULE_COMPOSITION_RULE_20 (buf);            \
2021         DECODE_EMACS_MULE_COMPOSITION_CHAR (buf);               \
2022       }                                                         \
         /* Need at least one rule, and an odd number of stored      \
            elements (char, rule, char, ...).  */                    \
2023     if (i < 1 || (buf - components) % 2 == 0)                   \
2024       goto invalid_code;                                        \
         /* Stop when the data would not fit into the character      \
            buffer.  */                                              \
2025     if (charbuf + i + (i / 2) + 1 >= charbuf_end)               \
2026       goto no_more_source;                                      \
         /* The annotation goes to the character buffer, as in       \
            DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION.  */           \
2027     ADD_COMPOSITION_DATA (charbuf, i, method);                  \
         /* NOTE(review): COMPONENTS holds 2 * I + 1 elements here,  \
            but only the first I are copied below; confirm the       \
            intended layout.  */                                     \
2028     for (j = 0; j < i; j++)                                     \
2029       *charbuf++ = components[j];                               \
2030     for (j = 0; j < i; j += 2)                                  \
2031       *charbuf++ = components[j];                               \
2032   } while (0)
2033
2034
     /* Decode the emacs-mule bytes of CODING->source, starting at offset
        CODING->consumed, and append the resulting character codes (and
        composition and charset annotations) to CODING->charbuf.  Stop
        when the character buffer is full or the source is exhausted.  On
        return CODING->consumed, CODING->consumed_char and
        CODING->charbuf_used describe the state up to the last completely
        decoded sequence.

        An invalid sequence is handled by storing its first byte as a
        raw byte and counting it in CODING->errors.

        multibytep is not referenced by name below; presumably
        ONE_MORE_BYTE uses it, and presumably that macro is also what
        jumps to no_more_source -- confirm against its definition.  */
2035 static void
2036 decode_coding_emacs_mule (coding)
2037      struct coding_system *coding;
2038 {
2039   const unsigned char *src = coding->source + coding->consumed;
2040   const unsigned char *src_end = coding->source + coding->src_bytes;
2041   const unsigned char *src_base;
2042   int *charbuf = coding->charbuf + coding->charbuf_used;
2043   int *charbuf_end
2044     = coding->charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
2045   int consumed_chars = 0, consumed_chars_base;
2046   int multibytep = coding->src_multibyte;
2047   Lisp_Object attrs, charset_list;
2048   int char_offset = coding->produced_char;
2049   int last_offset = char_offset;
2050   int last_id = charset_ascii;
2051
2052   CODING_GET_INFO (coding, attrs, charset_list);
2053
2054   while (1)
2055     {
2056       int c;
2057
           /* Remember where this sequence starts, so that an invalid
              or incomplete sequence can be re-read or left unconsumed.  */
2058       src_base = src;
2059       consumed_chars_base = consumed_chars;
2060
2061       if (charbuf >= charbuf_end)
2062         break;
2063
2064       ONE_MORE_BYTE (c);
2065       if (c < 0)
2066         {
2067           *charbuf++ = -c;
2068           char_offset++;
2069         }
2070       else if (c < 0x80)
2071         {
2072           *charbuf++ = c;
2073           char_offset++;
2074         }
2075       else if (c == 0x80)
2076         {
               /* 0x80 introduces composition data; the next byte
                  selects the format.  */
2077           ONE_MORE_BYTE (c);
2078           if (c < 0)
2079             goto invalid_code;
2080           if (c - 0xF2 >= COMPOSITION_RELATIVE
2081               && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)
2082             DECODE_EMACS_MULE_21_COMPOSITION (c);
2083           else if (c < 0xC0)
2084             DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION (c);
2085           else if (c == 0xFF)
2086             DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION (c);
2087           else
2088             goto invalid_code;
2089         }
2090       else if (c < 0xA0 && emacs_mule_bytes[c] > 1)
2091         {
2092           int nbytes, nchars;
2093           int id;
2094
               /* Rewind and let emacs_mule_char decode the whole
                  multibyte sequence.  */
2095           src = src_base;
2096           consumed_chars = consumed_chars_base;
2097           c = emacs_mule_char (coding, src, &nbytes, &nchars, &id);
2098           if (c < 0)
2099             {
2100               if (c == -2)
2101                 break;
2102               goto invalid_code;
2103             }
               /* When the charset changes, close the run of the
                  previous non-ASCII charset with an annotation.  */
2104           if (last_id != id)
2105             {
2106               if (last_id != charset_ascii)
2107                 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
2108               last_id = id;
2109               last_offset = char_offset;
2110             }
2111           *charbuf++ = c;
2112           src += nbytes;
2113           consumed_chars += nchars;
2114           char_offset++;
2115         }
           else
             /* A byte that cannot start any sequence handled above.  It
                used to be consumed without producing a character; treat
                it as invalid so that it is kept as a raw byte.  */
             goto invalid_code;
2116       continue;
2117
2118     invalid_code:
2119       src = src_base;
2120       consumed_chars = consumed_chars_base;
2121       ONE_MORE_BYTE (c);
2122       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
2123       char_offset++;
2124       coding->errors++;
2125     }
2126
2127  no_more_source:
2128   if (last_id != charset_ascii)
2129     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
2130   coding->consumed_char += consumed_chars_base;
2131   coding->consumed = src_base - coding->source;
2132   coding->charbuf_used = charbuf - coding->charbuf;
2133 }
2134
2135
     /* Set CODES[0] and CODES[1] to the leading codes for the emacs-mule
        charset id ID.  An ID below 0xA0 is its own leading code and
        CODES[1] is set to 0 (no second code).  Otherwise CODES[1] is ID
        and CODES[0] is 0x9A for 0xA0..0xDF, 0x9B for 0xE0..0xEF, 0x9C for
        0xF0..0xF4 and 0x9D for anything above.  ID and CODES are
        evaluated more than once.  The expansion has no trailing
        semicolon, so the call can be used as a single statement.  */
2136 #define EMACS_MULE_LEADING_CODES(id, codes)     \
2137   do {                                          \
2138     if ((id) < 0xA0)                            \
2139       (codes)[0] = (id), (codes)[1] = 0;        \
2140     else if ((id) < 0xE0)                       \
2141       (codes)[0] = 0x9A, (codes)[1] = (id);     \
2142     else if ((id) < 0xF0)                       \
2143       (codes)[0] = 0x9B, (codes)[1] = (id);     \
2144     else if ((id) < 0xF5)                       \
2145       (codes)[0] = 0x9C, (codes)[1] = (id);     \
2146     else                                        \
2147       (codes)[0] = 0x9D, (codes)[1] = (id);     \
2148   } while (0)
2149
2150
     /* Encode the characters CODING->charbuf[0 .. charbuf_used - 1] in
        emacs-mule format and append the bytes to CODING->destination,
        starting at offset CODING->produced.  Negative elements of the
        character buffer introduce annotations; a charset annotation
        selects the charset preferred for the following characters.
        Record CODING_RESULT_SUCCESS, update CODING->produced and
        CODING->produced_char, and return 0.

        multibytep, dst_end and produced_chars are not referenced by
        name below; presumably ASSURE_DESTINATION and the EMIT_* macros
        use them -- confirm against their definitions.  */
2151 static int
2152 encode_coding_emacs_mule (coding)
2153      struct coding_system *coding;
2154 {
2155   int multibytep = coding->dst_multibyte;
2156   int *charbuf = coding->charbuf;
2157   int *charbuf_end = charbuf + coding->charbuf_used;
2158   unsigned char *dst = coding->destination + coding->produced;
2159   unsigned char *dst_end = coding->destination + coding->dst_bytes;
2160   int safe_room = 8;
2161   int produced_chars = 0;
2162   Lisp_Object attrs, charset_list;
2163   int c;
2164   int preferred_charset_id = -1;
2165
2166   CODING_GET_INFO (coding, attrs, charset_list);
       /* Make the coding system use Vemacs_mule_charset_list as its
          charset list.  */
2167   if (! EQ (charset_list, Vemacs_mule_charset_list))
2168     {
2169       CODING_ATTR_CHARSET_LIST (attrs)
2170         = charset_list = Vemacs_mule_charset_list;
2171     }
2172
2173   while (charbuf < charbuf_end)
2174     {
2175       ASSURE_DESTINATION (safe_room);
2176       c = *charbuf++;
2177
2178       if (c < 0)
2179         {
2180           /* Handle an annotation.  */
               /* -C is the length of the annotation including the
                  element just read; *CHARBUF is its type.  */
2181           switch (*charbuf)
2182             {
2183             case CODING_ANNOTATE_COMPOSITION_MASK:
2184               /* Not yet implemented.  */
2185               break;
2186             case CODING_ANNOTATE_CHARSET_MASK:
                   /* Honor the preferred charset only if it is in the
                      charset list.  */
2187               preferred_charset_id = charbuf[3];
2188               if (preferred_charset_id >= 0
2189                   && NILP (Fmemq (make_number (preferred_charset_id),
2190                                   charset_list)))
2191                 preferred_charset_id = -1;
2192               break;
2193             default:
2194               abort ();
2195             }
2196           charbuf += -c - 1;
2197           continue;
2198         }
2199
2200       if (ASCII_CHAR_P (c))
2201         EMIT_ONE_ASCII_BYTE (c);
2202       else if (CHAR_BYTE8_P (c))
2203         {
2204           c = CHAR_TO_BYTE8 (c);
2205           EMIT_ONE_BYTE (c);
2206         }
2207       else
2208         {
2209           struct charset *charset;
2210           unsigned code;
2211           int dimension;
2212           int emacs_mule_id;
2213           unsigned char leading_codes[2];
2214
               /* Find the charset for C and C's code point in it.  CODE
                  must be set on every path, because it is emitted
                  below.  */
2215           if (preferred_charset_id >= 0)
2216             {
2217               charset = CHARSET_FROM_ID (preferred_charset_id);
2218               if (CHAR_CHARSET_P (c, charset))
2219                 code = ENCODE_CHAR (charset, c);
                   else
                     charset = char_charset (c, charset_list, &code);
2220             }
2221           else
2222             charset = char_charset (c, charset_list, &code);
2223           if (! charset)
2224             {
                   /* No charset in the list contains C: encode the
                      default character instead.  */
2225               c = coding->default_char;
2226               if (ASCII_CHAR_P (c))
2227                 {
2228                   EMIT_ONE_ASCII_BYTE (c);
2229                   continue;
2230                 }
2231               charset = char_charset (c, charset_list, &code);
2232             }
2233           dimension = CHARSET_DIMENSION (charset);
2234           emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2235           EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2236           EMIT_ONE_BYTE (leading_codes[0]);
2237           if (leading_codes[1])
2238             EMIT_ONE_BYTE (leading_codes[1]);
               /* Position bytes are the code point bytes with the high
                  bit set.  */
2239           if (dimension == 1)
2240             EMIT_ONE_BYTE (code | 0x80);
2241           else
2242             {
2243               code |= 0x8080;
2244               EMIT_ONE_BYTE (code >> 8);
2245               EMIT_ONE_BYTE (code & 0xFF);
2246             }
2247         }
2248     }
2249   record_conversion_result (coding, CODING_RESULT_SUCCESS);
2250   coding->produced_char += produced_chars;
2251   coding->produced = dst - coding->destination;
2252   return 0;
2253 }
2254
2255 \f
2256 /*** 7. ISO2022 handlers ***/
2257
2258 /* The following note describes the coding system ISO2022 briefly.
2259    Since the intention of this note is to help understand the
2260    functions in this file, some parts are NOT ACCURATE or are OVERLY
2261    SIMPLIFIED.  For thorough understanding, please refer to the
2262    original document of ISO2022.  This is equivalent to the standard
2263    ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2264
2265    ISO2022 provides many mechanisms to encode several character sets
2266    in 7-bit and 8-bit environments.  For 7-bit environments, all text
2267    is encoded using bytes less than 128.  This may make the encoded
2268    text a little bit longer, but the text passes more easily through
2269    several types of gateway, some of which strip off the MSB (Most
2270    Significant Bit).
2271
2272    There are two kinds of character sets: control character sets and
2273    graphic character sets.  The former contain control characters such
2274    as `newline' and `escape' to provide control functions (control
2275    functions are also provided by escape sequences).  The latter
2276    contain graphic characters such as 'A' and '-'.  Emacs recognizes
2277    two control character sets and many graphic character sets.
2278
2279    Graphic character sets are classified into one of the following
2280    four classes, according to the number of bytes (DIMENSION) and
2281    number of characters in one dimension (CHARS) of the set:
2282    - DIMENSION1_CHARS94
2283    - DIMENSION1_CHARS96
2284    - DIMENSION2_CHARS94
2285    - DIMENSION2_CHARS96
2286
2287    In addition, each character set is assigned an identification tag,
2288    unique for each set, called the "final character" (denoted as <F>
2289    hereafter).  The <F> of each character set is decided by ECMA(*)
2290    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
2291    (0x30..0x3F are for private use only).
2292
2293    Note (*): ECMA = European Computer Manufacturers Association
2294
2295    Here are examples of graphic character sets [NAME(<F>)]:
2296         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2297         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2298         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2299         o DIMENSION2_CHARS96 -- none for the moment
2300
2301    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2302         C0 [0x00..0x1F] -- control character plane 0
2303         GL [0x20..0x7F] -- graphic character plane 0
2304         C1 [0x80..0x9F] -- control character plane 1
2305         GR [0xA0..0xFF] -- graphic character plane 1
2306
2307    A control character set is directly designated and invoked to C0 or
2308    C1 by an escape sequence.  The most common case is that:
2309    - ISO646's  control character set is designated/invoked to C0, and
2310    - ISO6429's control character set is designated/invoked to C1,
2311    and usually these designations/invocations are omitted in encoded
2312    text.  In a 7-bit environment, only C0 can be used, and a control
2313    character for C1 is encoded by an appropriate escape sequence to
2314    fit into the environment.  All control characters for C1 are
2315    defined to have corresponding escape sequences.
2316
2317    A graphic character set is at first designated to one of four
2318    graphic registers (G0 through G3), then these graphic registers are
2319    invoked to GL or GR.  These designations and invocations can be
2320    done independently.  The most common case is that G0 is invoked to
2321    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
2322    these invocations and designations are omitted in encoded text.
2323    In a 7-bit environment, only GL can be used.
2324
2325    When a graphic character set of CHARS94 is invoked to GL, codes
2326    0x20 and 0x7F of the GL area work as control characters SPACE and
2327    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2328    be used.
2329
2330    There are two ways of invocation: locking-shift and single-shift.
2331    With locking-shift, the invocation lasts until the next different
2332    invocation, whereas with single-shift, the invocation affects the
2333    following character only and doesn't affect the locking-shift
2334    state.  Invocations are done by the following control characters or
2335    escape sequences:
2336
2337    ----------------------------------------------------------------------
2338    abbrev  function                  cntrl escape seq   description
2339    ----------------------------------------------------------------------
2340    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
2341    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
2342    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
2343    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
2344    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
2345    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
2346    LS3R    (locking-shift-3 right)   none  ESC '|'      invoke G3 into GR (*)
2347    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
2348    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
2349    ----------------------------------------------------------------------
2350    (*) These are not used by any known coding system.
2351
2352    Control characters for these functions are defined by macros
2353    ISO_CODE_XXX in `coding.h'.
2354
2355    Designations are done by the following escape sequences:
2356    ----------------------------------------------------------------------
2357    escape sequence      description
2358    ----------------------------------------------------------------------
2359    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
2360    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
2361    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
2362    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
2363    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
2364    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
2365    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
2366    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
2367    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
2368    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
2369    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
2370    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
2371    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
2372    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
2373    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
2374    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
2375    ----------------------------------------------------------------------
2376
2377    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2378    of dimension 1, chars 94, and final character <F>, etc...
2379
2380    Note (*): Although these designations are not allowed in ISO2022,
2381    Emacs accepts them on decoding, and produces them on encoding
2382    CHARS96 character sets in a coding system which is characterized as
2383    7-bit environment, non-locking-shift, and non-single-shift.
2384
2385    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2386    '(' must be omitted.  We refer to this as "short-form" hereafter.
2387
2388    Now you may notice that there are a lot of ways of encoding the
2389    same multilingual text in ISO2022.  Actually, there exist many
2390    coding systems such as Compound Text (used in X11's inter-client
2391    communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2392    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2393    localized platforms), and all of these are variants of ISO2022.
2394
2395    In addition to the above, Emacs handles two more kinds of escape
2396    sequences: ISO6429's direction specification and Emacs' private
2397    sequence for specifying character composition.
2398
2399    ISO6429's direction specification takes the following form:
2400         o CSI ']'      -- end of the current direction
2401         o CSI '0' ']'  -- end of the current direction
2402         o CSI '1' ']'  -- start of left-to-right text
2403         o CSI '2' ']'  -- start of right-to-left text
2404    The control character CSI (0x9B: control sequence introducer) is
2405    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2406
2407    Character composition specification takes the following form:
2408         o ESC '0' -- start relative composition
2409         o ESC '1' -- end composition
2410         o ESC '2' -- start rule-base composition (*)
2411         o ESC '3' -- start relative composition with alternate chars  (**)
2412         o ESC '4' -- start rule-base composition with alternate chars  (**)
2413   Since these are not standard escape sequences of any ISO standard,
2414   the use of them with these meanings is restricted to Emacs only.
2415
2416   (*) This form is used only in Emacs 20.7 and older versions,
2417   but newer versions can safely decode it.
2418   (**) This form is used only in Emacs 21.1 and newer versions,
2419   and older versions can't decode it.
2420
2421   Here's a list of example usages of these composition escape
2422   sequences (categorized by `enum composition_method').
2423
2424   COMPOSITION_RELATIVE:
2425         ESC 0 CHAR [ CHAR ] ESC 1
2426   COMPOSITION_WITH_RULE:
2427         ESC 2 CHAR [ RULE CHAR ] ESC 1
2428   COMPOSITION_WITH_ALTCHARS:
2429         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2430   COMPOSITION_WITH_RULE_ALTCHARS:
2431         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2432
/* Table with one `enum iso_code_class_type' entry for each of the 256
   possible byte values.  decode_coding_iso_2022 indexes it with a
   source byte and switches on the resulting class.  */
2433 enum iso_code_class_type iso_code_class[256];
2434
/* Return nonzero if the charset whose ID is ID is usable ("safe") in
   CODING.

   CODING must point to an object whose `safe_charsets' member is a
   byte table indexed by charset ID and whose `max_charset_id' member
   is the largest valid index of that table.  An element of 255 marks
   a charset that has no graphic register; any other value is the
   register assigned to the charset.

   The element is compared as an unsigned byte.  The table is reached
   through a plain `char *', and the signedness of plain `char' is
   implementation-defined, so a test of the form `>= 0' would accept
   every charset where `char' is unsigned.  A negative ID is never
   safe, so the table is not indexed out of bounds.

   ID is evaluated more than once.  */
#define SAFE_CHARSET_P(coding, id)                              \
  ((id) >= 0                                                    \
   && (id) <= (coding)->max_charset_id                          \
   && (unsigned char) (coding)->safe_charsets[id] != 255)
2438
2439
/* Nonzero if CODING_ISO_INITIAL, applied to the coding system of
   CATEGORY and to graphic register 1, yields a non-negative value.
   NOTE(review): presumably this means that some charset is initially
   designated to G1, so that a shift-out is meaningful -- confirm
   against the definition of CODING_ISO_INITIAL.  */
2440 #define SHIFT_OUT_OK(category)  \
2441   (CODING_ISO_INITIAL (&coding_categories[category], 1) >= 0)
2442
/* Build the safe-charsets table of the ISO-2022 coding system
   described by ATTRS and store it in the coding_attr_safe_charsets
   slot of ATTRS.

   The table is a string indexed by charset ID.  Every element starts
   as 255; the element of each charset in the charset list of ATTRS
   is then set to the number of the graphic register chosen for that
   charset, unless no register below 4 is available for it.

   Nothing is rebuilt if the slot already holds a string.  The one
   exception: when CODING_ISO_FLAG_FULL_SUPPORT is set and the charset
   list of ATTRS is not Viso_2022_charset_list, the charset list is
   replaced by Viso_2022_charset_list and the table is built anew.  */
2443 static void
2444 setup_iso_safe_charsets (attrs)
2445      Lisp_Object attrs;
2446 {
2447   Lisp_Object charset_list, safe_charsets;
2448   Lisp_Object request;
2449   Lisp_Object reg_usage;
2450   Lisp_Object tail;
2451   int reg94, reg96;
2452   int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2453   int max_charset_id;
2454
2455   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
       /* With full support the charset list must be the global list.
          Clearing the old table makes the STRINGP test below fail, so
          the table is rebuilt for the new list.  */
2456   if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2457       && ! EQ (charset_list, Viso_2022_charset_list))
2458     {
2459       CODING_ATTR_CHARSET_LIST (attrs)
2460         = charset_list = Viso_2022_charset_list;
2461       ASET (attrs, coding_attr_safe_charsets, Qnil);
2462     }
2463
       /* A string in the slot means the table is already built.  */
2464   if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2465     return;
2466
       /* The table needs one element for each ID up to the largest one
          found in the charset list.  */
2467   max_charset_id = 0;
2468   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2469     {
2470       int id = XINT (XCAR (tail));
2471       if (max_charset_id < id)
2472         max_charset_id = id;
2473     }
2474
       /* Every element is initially 255.  */
2475   safe_charsets = Fmake_string (make_number (max_charset_id + 1),
2476                                 make_number (255));
2477   request = AREF (attrs, coding_attr_iso_request);
       /* The usage slot is a cons; its car is used below for charsets
          whose iso_chars_96 is zero and its cdr for the others.  */
2478   reg_usage = AREF (attrs, coding_attr_iso_usage);
2479   reg94 = XINT (XCAR (reg_usage));
2480   reg96 = XINT (XCDR (reg_usage));
2481
2482   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2483     {
2484       Lisp_Object id;
2485       Lisp_Object reg;
2486       struct charset *charset;
2487
2488       id = XCAR (tail);
2489       charset = CHARSET_FROM_ID (XINT (id));
           /* A register associated with ID in REQUEST takes precedence
              over the registers taken from the usage slot.  */
2490       reg = Fcdr (Fassq (id, request));
2491       if (! NILP (reg))
2492         SSET (safe_charsets, XINT (id), XINT (reg));
2493       else if (charset->iso_chars_96)
2494         {
2495           if (reg96 < 4)
2496             SSET (safe_charsets, XINT (id), reg96);
2497         }
2498       else
2499         {
2500           if (reg94 < 4)
2501             SSET (safe_charsets, XINT (id), reg94);
2502         }
2503     }
2504   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2505 }
2506
2507
2508 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2509    Check if a text is encoded in one of ISO-2022 based coding systems.
2510    If it is, return 1, else return 0.

        The source bytes of CODING are scanned, starting after the first
        CODING->head_ascii bytes.  Categories for which evidence is seen
        are collected in `found', categories that are ruled out in
        `rejected'.  When every ISO category has been rejected,
        CATEGORY_MASK_ISO is added to DETECT_INFO->rejected and 0 is
        returned.  Otherwise control reaches the label `no_more_source',
        where `rejected' and the not-rejected part of `found' are merged
        into DETECT_INFO and 1 is returned.  No statement visible in
        this function jumps to that label; presumably ONE_MORE_BYTE does
        so at the end of the source -- confirm against its definition.

        DETECT_INFO->checked always gets CATEGORY_MASK_ISO.  */
2511
2512 static int
2513 detect_coding_iso_2022 (coding, detect_info)
2514      struct coding_system *coding;
2515      struct coding_detection_info *detect_info;
2516 {
2517   const unsigned char *src = coding->source, *src_base = src;
2518   const unsigned char *src_end = coding->source + coding->src_bytes;
2519   int multibytep = coding->src_multibyte;
2520   int single_shifting = 0;
2521   int id;
2522   int c, c1;
2523   int consumed_chars = 0;
2524   int i;
2525   int rejected = 0;
2526   int found = 0;
2527
2528   detect_info->checked |= CATEGORY_MASK_ISO;
2529
       /* Make the safe-charsets table of each ISO category available
          through the category's coding_system, as SAFE_CHARSET_P needs.  */
2530   for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
2531     {
2532       struct coding_system *this = &(coding_categories[i]);
2533       Lisp_Object attrs, val;
2534
2535       attrs = CODING_ID_ATTRS (this->id);
           /* NOTE(review): this compares the safe-charsets value with a
              charset list; CODING_ATTR_CHARSET_LIST may have been
              intended.  It is left as is because
              setup_iso_safe_charsets returns early by itself when the
              table is already built.  */
2536       if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2537           && ! EQ (CODING_ATTR_SAFE_CHARSETS (attrs), Viso_2022_charset_list))
2538         setup_iso_safe_charsets (attrs);
2539       val = CODING_ATTR_SAFE_CHARSETS (attrs);
2540       this->max_charset_id = SCHARS (val) - 1;
2541       this->safe_charsets = (char *) SDATA (val);
2542     }
2543
2544   /* A coding system of this category is always ASCII compatible.  */
2545   src += coding->head_ascii;
2546
2547   while (rejected != CATEGORY_MASK_ISO)
2548     {
2549       src_base = src;
2550       ONE_MORE_BYTE (c);
2551       switch (c)
2552         {
2553         case ISO_CODE_ESC:
2554           if (inhibit_iso_escape_detection)
2555             break;
2556           single_shifting = 0;
2557           ONE_MORE_BYTE (c);
2558           if (c >= '(' && c <= '/')
2559             {
2560               /* Designation sequence for a charset of dimension 1.  */
2561               ONE_MORE_BYTE (c1);
2562               if (c1 < ' ' || c1 >= 0x80
2563                   || (id = iso_charset_table[0][c >= ','][c1]) < 0)
2564                 /* Invalid designation sequence.  Just ignore.  */
2565                 break;
2566             }
2567           else if (c == '$')
2568             {
2569               /* Designation sequence for a charset of dimension 2.  */
2570               ONE_MORE_BYTE (c);
2571               if (c >= '@' && c <= 'B')
2572                 /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
2573                 id = iso_charset_table[1][0][c];
2574               else if (c >= '(' && c <= '/')
2575                 {
2576                   ONE_MORE_BYTE (c1);
2577                   if (c1 < ' ' || c1 >= 0x80
2578                       || (id = iso_charset_table[1][c >= ','][c1]) < 0)
2579                     /* Invalid designation sequence.  Just ignore.  */
2580                     break;
2581                 }
2582               else
2583                 /* Invalid designation sequence.  Just ignore it.  */
2584                 break;
2585             }
2586           else if (c == 'N' || c == 'O')
2587             {
2588               /* ESC <Fe> for SS2 or SS3.  */
2589               single_shifting = 1;
2590               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
2591               break;
2592             }
2593           else if (c >= '0' && c <= '4')
2594             {
2595               /* ESC <Fp> for start/end composition.  */
2596               found |= CATEGORY_MASK_ISO;
2597               break;
2598             }
2599           else
2600             {
2601               /* Invalid escape sequence.  Just ignore it.  */
2602               break;
2603             }
2604
2605           /* We found a valid designation sequence for CHARSET.  */
2606           rejected |= CATEGORY_MASK_ISO_8BIT;
2607           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
2608                               id))
2609             found |= CATEGORY_MASK_ISO_7;
2610           else
2611             rejected |= CATEGORY_MASK_ISO_7;
2612           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
2613                               id))
2614             found |= CATEGORY_MASK_ISO_7_TIGHT;
2615           else
2616             rejected |= CATEGORY_MASK_ISO_7_TIGHT;
2617           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
2618                               id))
2619             found |= CATEGORY_MASK_ISO_7_ELSE;
2620           else
2621             rejected |= CATEGORY_MASK_ISO_7_ELSE;
2622           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
2623                               id))
2624             found |= CATEGORY_MASK_ISO_8_ELSE;
2625           else
2626             rejected |= CATEGORY_MASK_ISO_8_ELSE;
2627           break;
2628
2629         case ISO_CODE_SO:
2630         case ISO_CODE_SI:
2631           /* Locking shift out/in.  */
2632           if (inhibit_iso_escape_detection)
2633             break;
2634           single_shifting = 0;
2635           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
2636           found |= CATEGORY_MASK_ISO_ELSE;
2637           break;
2638
2639         case ISO_CODE_CSI:
2640           /* Control sequence introducer.  */
2641           single_shifting = 0;
2642           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
2643           found |= CATEGORY_MASK_ISO_8_ELSE;
2644           goto check_extra_latin;
2645
2646         case ISO_CODE_SS2:
2647         case ISO_CODE_SS3:
2648           /* Single shift.   */
2649           if (inhibit_iso_escape_detection)
2650             break;
2651           single_shifting = 0;
2652           rejected |= CATEGORY_MASK_ISO_7BIT;
2653           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2654               & CODING_ISO_FLAG_SINGLE_SHIFT)
2655             found |= CATEGORY_MASK_ISO_8_1, single_shifting = 1;
2656           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
2657               & CODING_ISO_FLAG_SINGLE_SHIFT)
2658             found |= CATEGORY_MASK_ISO_8_2, single_shifting = 1;
2659           if (single_shifting)
2660             break;
               /* Neither 8-bit category uses single shift, so treat the
                  byte like any other code of the range 0x80..0x9F.  */
2661           goto check_extra_latin;
2662
2663         default:
2664           if (c < 0)
2665             continue;
2666           if (c < 0x80)
2667             {
2668               single_shifting = 0;
2669               break;
2670             }
2671           if (c >= 0xA0)
2672             {
2673               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
2674               found |= CATEGORY_MASK_ISO_8_1;
2675               /* Check the length of succeeding codes of the range
2676                  0xA0..0xFF.  If the byte length is even, we include
2677                  CATEGORY_MASK_ISO_8_2 in `found'.  We can check this
2678                  only when we are not single shifting.  */
2679               if (! single_shifting
2680                   && ! (rejected & CATEGORY_MASK_ISO_8_2))
2681                 {
2682                   int i = 1;
2683                   while (src < src_end)
2684                     {
                           /* Remember where this byte starts so that
                              it can be pushed back.  */
                           src_base = src;
2685                       ONE_MORE_BYTE (c);
2686                       if (c < 0xA0)
                             {
                               /* The run ends here.  Un-read the
                                  terminating byte so that the main
                                  loop classifies it; it may be ESC, a
                                  shift code or a code of the range
                                  0x80..0x9F, none of which may be
                                  dropped silently.  */
                               src = src_base;
2687                           break;
                             }
2688                       i++;
2689                     }
2690
                       /* An odd run is evidence against ISO_8_2 only
                          if the run was really terminated, i.e. not
                          merely cut off by the end of the source.  */
2691                   if (i & 1 && src < src_end)
2692                     rejected |= CATEGORY_MASK_ISO_8_2;
2693                   else
2694                     found |= CATEGORY_MASK_ISO_8_2;
2695                 }
2696               break;
2697             }
               /* Codes of the range 0x80..0x9F fall through to here;
                  the CSI and single-shift cases jump here as well.  */
2698         check_extra_latin:
2699           single_shifting = 0;
2700           if (! VECTORP (Vlatin_extra_code_table)
2701               || NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
2702             {
2703               rejected = CATEGORY_MASK_ISO;
2704               break;
2705             }
2706           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2707               & CODING_ISO_FLAG_LATIN_EXTRA)
2708             found |= CATEGORY_MASK_ISO_8_1;
2709           else
2710             rejected |= CATEGORY_MASK_ISO_8_1;
2711           rejected |= CATEGORY_MASK_ISO_8_2;
2712         }
2713     }
       /* Every ISO category has been rejected.  */
2714   detect_info->rejected |= CATEGORY_MASK_ISO;
2715   return 0;
2716
2717  no_more_source:
2718   detect_info->rejected |= rejected;
2719   detect_info->found |= (found & ~rejected);
2720   return 1;
2721 }
2722
2723
2724 /* Set designation state into CODING.  Set CHARS_96 to -1 if the
2725    escape sequence should be kept.

        The charset is looked up with ISO_CHARSET_TABLE (DIM, CHARS_96,
        FINAL).  If FINAL is below '0' or not below 128, if the lookup
        yields a negative ID, or if the charset is not SAFE_CHARSET_P
        for CODING, the designation of register REG is set to -2,
        CHARS_96 is set to -1, and nothing else is done.

        Otherwise the charset ID is stored as the designation of REG,
        after two substitutions: charset_jisx0201_roman becomes
        charset_ascii if CODING_ISO_FLAG_USE_ROMAN is set, and
        charset_jisx0208_1978 becomes charset_jisx0208 if
        CODING_ISO_FLAG_USE_OLDJIS is set.  If the previous designation
        of REG was -2 and the new one is charset_ascii, CHARS_96 is set
        to -1 as well.

        CHARS_96 must be a modifiable lvalue, and a variable named
        `coding' must be in scope where the macro is used.  */
2726 #define DECODE_DESIGNATION(reg, dim, chars_96, final)                   \
2727   do {                                                                  \
2728     int id, prev;                                                       \
2729                                                                         \
2730     if (final < '0' || final >= 128                                     \
2731         || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0)        \
2732         || !SAFE_CHARSET_P (coding, id))                                \
2733       {                                                                 \
2734         CODING_ISO_DESIGNATION (coding, reg) = -2;                      \
2735         chars_96 = -1;                                                  \
2736         break;                                                          \
2737       }                                                                 \
2738     prev = CODING_ISO_DESIGNATION (coding, reg);                        \
2739     if (id == charset_jisx0201_roman)                                   \
2740       {                                                                 \
2741         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)      \
2742           id = charset_ascii;                                           \
2743       }                                                                 \
2744     else if (id == charset_jisx0208_1978)                               \
2745       {                                                                 \
2746         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)     \
2747           id = charset_jisx0208;                                        \
2748       }                                                                 \
2749     CODING_ISO_DESIGNATION (coding, reg) = id;                          \
2750     /* If there was an invalid designation to REG previously, and this  \
2751        designation is ASCII to REG, we should keep this designation     \
2752        sequence.  */                                                    \
2753     if (prev == -2 && id == charset_ascii)                              \
2754       chars_96 = -1;                                                    \
2755   } while (0)
2756
2757
/* Abandon a composition that is still in progress, emitting the
   characters collected so far as ordinary text.

   Does nothing when `composition_state' is COMPOSING_NO.  Otherwise
   elements of `components' are appended to `charbuf', `char_offset'
   is advanced by the number of elements appended, and
   `composition_state' is reset to COMPOSING_NO.  For
   COMPOSITION_RELATIVE and COMPOSITION_WITH_ALTCHARS every element is
   appended; for the other methods only the elements at even indices
   are appended (presumably the odd ones hold composition rules --
   confirm against the code that fills `components').

   Jumps to `no_more_source', changing nothing, if `charbuf' has no
   room for `component_idx' more elements.

   The following names must be in scope where the macro is used:
   composition_state, method, components, component_idx, charbuf,
   charbuf_end, char_offset, and the label no_more_source.  */
#define MAYBE_FINISH_COMPOSITION()                              \
  do {                                                          \
    int i;                                                      \
    if (composition_state == COMPOSING_NO)                      \
      break;                                                    \
    /* Make sure there is room for every element stored in      \
       the table `components'.  */                              \
    if (charbuf + component_idx > charbuf_end)                  \
      goto no_more_source;                                      \
    composition_state = COMPOSING_NO;                           \
    if (method == COMPOSITION_RELATIVE                          \
        || method == COMPOSITION_WITH_ALTCHARS)                 \
      {                                                         \
        for (i = 0; i < component_idx; i++)                     \
          *charbuf++ = components[i];                           \
        char_offset += component_idx;                           \
      }                                                         \
    else                                                        \
      {                                                         \
        for (i = 0; i < component_idx; i += 2)                  \
          *charbuf++ = components[i];                           \
        /* The loop above appends (component_idx + 1) / 2       \
           elements; count exactly that many, also when         \
           component_idx is even or zero.  */                   \
        char_offset += (component_idx + 1) / 2;                 \
      }                                                         \
  } while (0)
2782
2783
2784 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
2785    ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
2786    ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
2787    ESC 3 : altchar composition :  ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
2788    ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1

        C1 is the byte that followed ESC.  If C1 is '0' while
        `composition_state' is COMPOSING_COMPONENT_RULE, the elements
        collected so far are recorded as the component part
        (component_len = component_idx) and collection continues in
        state COMPOSING_CHAR.

        Otherwise any composition in progress is abandoned with
        MAYBE_FINISH_COMPOSITION and the rest of the source is searched
        for the end sequence ESC 1.  If it is not found, including the
        case that no source byte is left at all, control goes to
        `invalid_code' when CODING_MODE_LAST_BLOCK is set and to
        `no_more_source' otherwise.  If it is found, `method' and
        `composition_state' are set according to C1, and
        `component_idx' and `component_len' are reset to 0.  Control
        also goes to `no_more_source' if `charbuf' has no room for
        MAX_COMPOSITION_COMPONENTS elements.

        The following names must be in scope where the macro is used:
        coding, src, src_end, charbuf, charbuf_end, method,
        composition_state, component_idx, component_len, whatever
        MAYBE_FINISH_COMPOSITION needs, and the labels invalid_code and
        no_more_source.
2789   */
2790
2791 #define DECODE_COMPOSITION_START(c1)                                    \
2792   do {                                                                  \
2793     if (c1 == '0'                                                       \
2794         && composition_state == COMPOSING_COMPONENT_RULE)               \
2795       {                                                                 \
2796         component_len = component_idx;                                  \
2797         composition_state = COMPOSING_CHAR;                             \
2798       }                                                                 \
2799     else                                                                \
2800       {                                                                 \
2801         const unsigned char *p;                                         \
2802                                                                         \
2803         MAYBE_FINISH_COMPOSITION ();                                    \
2804         if (charbuf + MAX_COMPOSITION_COMPONENTS > charbuf_end)         \
2805           goto no_more_source;                                          \
2806         for (p = src; p < src_end - 1; p++)                             \
2807           if (*p == ISO_CODE_ESC && p[1] == '1')                        \
2808             break;                                                      \
2809         if (p >= src_end - 1)                                           \
2810           {                                                             \
2811             if (coding->mode & CODING_MODE_LAST_BLOCK)                  \
2812               goto invalid_code;                                        \
2813             goto no_more_source;                                        \
2814           }                                                             \
2815                                                                         \
2816         /* This is surely the start of a composition.  */               \
2817         method = (c1 == '0' ? COMPOSITION_RELATIVE                      \
2818                   : c1 == '2' ? COMPOSITION_WITH_RULE                   \
2819                   : c1 == '3' ? COMPOSITION_WITH_ALTCHARS               \
2820                   : COMPOSITION_WITH_RULE_ALTCHARS);                    \
2821         composition_state = (c1 <= '2' ? COMPOSING_CHAR                 \
2822                              : COMPOSING_COMPONENT_CHAR);               \
2823         component_idx = component_len = 0;                              \
2824       }                                                                 \
2825   } while (0)
2826
2827
2828 /* Handle composition end sequence ESC 1.

        The number of composed characters is component_idx -
        component_len if component_len is positive, else component_idx
        for COMPOSITION_RELATIVE, else (component_idx + 1) / 2.  That
        number and `method' are handed to ADD_COMPOSITION_DATA together
        with `charbuf' (NOTE(review): presumably it appends the
        annotation header to `charbuf' and advances it -- confirm
        against its definition).

        For every method but COMPOSITION_RELATIVE, the first
        component_len elements of `components' (all component_idx
        elements if component_len is 0) are appended after that, and
        the element at the position where `charbuf' pointed on entry is
        overwritten with the negative number of elements added so far.

        Then the characters are appended: the elements of `components'
        at even indices for COMPOSITION_WITH_RULE, the elements from
        index component_len on otherwise.  `char_offset' is incremented
        once per appended character.  Finally `coding->annotated' is
        set to 1 and `composition_state' to COMPOSING_NO.

        The following names must be in scope where the macro is used:
        coding, charbuf, method, components, component_idx,
        component_len, char_offset and composition_state.  */
2829
2830 #define DECODE_COMPOSITION_END()                                        \
2831   do {                                                                  \
2832     int nchars = (component_len > 0 ? component_idx - component_len     \
2833                   : method == COMPOSITION_RELATIVE ? component_idx      \
2834                   : (component_idx + 1) / 2);                           \
2835     int i;                                                              \
2836     int *saved_charbuf = charbuf;                                       \
2837                                                                         \
2838     ADD_COMPOSITION_DATA (charbuf, nchars, method);                     \
2839     if (method != COMPOSITION_RELATIVE)                                 \
2840       {                                                                 \
2841         if (component_len == 0)                                         \
2842           for (i = 0; i < component_idx; i++)                           \
2843             *charbuf++ = components[i];                                 \
2844         else                                                            \
2845           for (i = 0; i < component_len; i++)                           \
2846             *charbuf++ = components[i];                                 \
2847         *saved_charbuf = saved_charbuf - charbuf;                       \
2848       }                                                                 \
2849     if (method == COMPOSITION_WITH_RULE)                                \
2850       for (i = 0; i < component_idx; i += 2, char_offset++)             \
2851         *charbuf++ = components[i];                                     \
2852     else                                                                \
2853       for (i = component_len; i < component_idx; i++, char_offset++)    \
2854         *charbuf++ = components[i];                                     \
2855     coding->annotated = 1;                                              \
2856     composition_state = COMPOSING_NO;                                   \
2857   } while (0)
2858
2859
2860 /* Decode a composition rule from the byte C1 (and maybe one more byte
2861    from SRC) and replace C1 with the encoded composition rule.  The
2862    macro itself stores the rule nowhere else.

        32 is subtracted from C1 first.  A result below 81 is the old
        one-byte format: the quotient and the remainder of a division
        by 9 are the two reference points, where 4 is replaced by 10,
        and they are combined with COMPOSITION_ENCODE_RULE.  A result
        from 81 up to 92 is the new two-byte format: one more byte is
        read into `c2' with ONE_MORE_BYTE, and the rule is
        COMPOSITION_ENCODE_RULE (C1 - 81, c2 - 32).  Any other result
        makes C1 zero.

        C1 must be a modifiable lvalue, and a variable named `c2' must
        be in scope where the macro is used.
        NOTE(review): `c2' is not range-checked before it is passed to
        COMPOSITION_ENCODE_RULE -- confirm that this is acceptable.  */
2863
2864 #define DECODE_COMPOSITION_RULE(c1)                                     \
2865   do {                                                                  \
2866     (c1) -= 32;                                                         \
2867     if (c1 < 81)                /* old format (before ver.21) */        \
2868       {                                                                 \
2869         int gref = (c1) / 9;                                            \
2870         int nref = (c1) % 9;                                            \
2871         if (gref == 4) gref = 10;                                       \
2872         if (nref == 4) nref = 10;                                       \
2873         c1 = COMPOSITION_ENCODE_RULE (gref, nref);                      \
2874       }                                                                 \
2875     else if (c1 < 93)           /* new format (after ver.21) */         \
2876       {                                                                 \
2877         ONE_MORE_BYTE (c2);                                             \
2878         c1 = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32);                \
2879       }                                                                 \
2880     else                                                                \
2881       c1 = 0;                                                           \
2882   } while (0)
2883
2884
2885 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
2886
2887 static void
2888 decode_coding_iso_2022 (coding)
2889      struct coding_system *coding;
2890 {
2891   const unsigned char *src = coding->source + coding->consumed;
2892   const unsigned char *src_end = coding->source + coding->src_bytes;
2893   const unsigned char *src_base;
2894   int *charbuf = coding->charbuf + coding->charbuf_used;
2895   int *charbuf_end
2896     = coding->charbuf + coding->charbuf_size - 4 - MAX_ANNOTATION_LENGTH;
2897   int consumed_chars = 0, consumed_chars_base;
2898   int multibytep = coding->src_multibyte;
2899   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
2900   int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2901   int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
2902   int charset_id_2, charset_id_3;
2903   struct charset *charset;
2904   int c;
2905   /* For handling composition sequence.  */
2906 #define COMPOSING_NO                    0
2907 #define COMPOSING_CHAR                  1
2908 #define COMPOSING_RULE                  2
2909 #define COMPOSING_COMPONENT_CHAR        3
2910 #define COMPOSING_COMPONENT_RULE        4
2911
2912   int composition_state = COMPOSING_NO;
2913   enum composition_method method;
2914   int components[MAX_COMPOSITION_COMPONENTS * 2 + 1];
2915   int component_idx;
2916   int component_len;
2917   Lisp_Object attrs, charset_list;
2918   int char_offset = coding->produced_char;
2919   int last_offset = char_offset;
2920   int last_id = charset_ascii;
2921
2922   CODING_GET_INFO (coding, attrs, charset_list);
2923   setup_iso_safe_charsets (attrs);
2924
2925   while (1)
2926     {
2927       int c1, c2;
2928
2929       src_base = src;
2930       consumed_chars_base = consumed_chars;
2931
2932       if (charbuf >= charbuf_end)
2933         break;
2934
2935       ONE_MORE_BYTE (c1);
2936       if (c1 < 0)
2937         goto invalid_code;
2938
2939       /* We produce at most one character.  */
2940       switch (iso_code_class [c1])
2941         {
2942         case ISO_0x20_or_0x7F:
2943           if (composition_state != COMPOSING_NO)
2944             {
2945               if (composition_state == COMPOSING_RULE
2946                   || composition_state == COMPOSING_COMPONENT_RULE)
2947                 {
2948                   DECODE_COMPOSITION_RULE (c1);
2949                   components[component_idx++] = c1;
2950                   composition_state--;
2951                   continue;
2952                 }
2953             }
2954           if (charset_id_0 < 0
2955               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
2956             /* This is SPACE or DEL.  */
2957             charset = CHARSET_FROM_ID (charset_ascii);
2958           else
2959             charset = CHARSET_FROM_ID (charset_id_0);
2960           break;
2961
2962         case ISO_graphic_plane_0:
2963           if (composition_state != COMPOSING_NO)
2964             {
2965               if (composition_state == COMPOSING_RULE
2966                   || composition_state == COMPOSING_COMPONENT_RULE)
2967                 {
2968                   DECODE_COMPOSITION_RULE (c1);
2969                   components[component_idx++] = c1;
2970                   composition_state--;
2971                   continue;
2972                 }
2973             }
2974           if (charset_id_0 < 0)
2975             charset = CHARSET_FROM_ID (charset_ascii);
2976           else
2977             charset = CHARSET_FROM_ID (charset_id_0);
2978           break;
2979
2980         case ISO_0xA0_or_0xFF:
2981           if (charset_id_1 < 0
2982               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
2983               || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
2984             goto invalid_code;
2985           /* This is a graphic character, we fall down ... */
2986
2987         case ISO_graphic_plane_1:
2988           if (charset_id_1 < 0)
2989             goto invalid_code;
2990           charset = CHARSET_FROM_ID (charset_id_1);
2991           break;
2992
2993         case ISO_control_0:
2994           MAYBE_FINISH_COMPOSITION ();
2995           charset = CHARSET_FROM_ID (charset_ascii);
2996           break;
2997
2998         case ISO_control_1:
2999           MAYBE_FINISH_COMPOSITION ();
3000           goto invalid_code;
3001
3002         case ISO_shift_out:
3003           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3004               || CODING_ISO_DESIGNATION (coding, 1) < 0)
3005             goto invalid_code;
3006           CODING_ISO_INVOCATION (coding, 0) = 1;
3007           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3008           continue;
3009
3010         case ISO_shift_in:
3011           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
3012             goto invalid_code;
3013           CODING_ISO_INVOCATION (coding, 0) = 0;
3014           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3015           continue;
3016
3017         case ISO_single_shift_2_7:
3018         case ISO_single_shift_2:
3019           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3020             goto invalid_code;
3021           /* SS2 is handled as an escape sequence of ESC 'N' */
3022           c1 = 'N';
3023           goto label_escape_sequence;
3024
3025         case ISO_single_shift_3:
3026           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3027             goto invalid_code;
3028           /* SS3 is handled as an escape sequence of ESC 'O' */
3029           c1 = 'O';
3030           goto label_escape_sequence;
3031
3032         case ISO_control_sequence_introducer:
3033           /* CSI is handled as an escape sequence of ESC '[' ...  */
3034           c1 = '[';
3035           goto label_escape_sequence;
3036
3037         case ISO_escape:
3038           ONE_MORE_BYTE (c1);
3039         label_escape_sequence:
3040           /* Escape sequences handled here are invocation,
3041              designation, direction specification, and character
3042              composition specification.  */
3043           switch (c1)
3044             {
3045             case '&':           /* revision of following character set */
3046               ONE_MORE_BYTE (c1);
3047               if (!(c1 >= '@' && c1 <= '~'))
3048                 goto invalid_code;
3049               ONE_MORE_BYTE (c1);
3050               if (c1 != ISO_CODE_ESC)
3051                 goto invalid_code;
3052               ONE_MORE_BYTE (c1);
3053               goto label_escape_sequence;
3054
3055             case '$':           /* designation of 2-byte character set */
3056               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3057                 goto invalid_code;
3058               {
3059                 int reg, chars96;
3060
3061                 ONE_MORE_BYTE (c1);
3062                 if (c1 >= '@' && c1 <= 'B')
3063                   {     /* designation of JISX0208.1978, GB2312.1980,
3064                            or JISX0208.1980 */
3065                     reg = 0, chars96 = 0;
3066                   }
3067                 else if (c1 >= 0x28 && c1 <= 0x2B)
3068                   { /* designation of DIMENSION2_CHARS94 character set */
3069                     reg = c1 - 0x28, chars96 = 0;
3070                     ONE_MORE_BYTE (c1);
3071                   }
3072                 else if (c1 >= 0x2C && c1 <= 0x2F)
3073                   { /* designation of DIMENSION2_CHARS96 character set */
3074                     reg = c1 - 0x2C, chars96 = 1;
3075                     ONE_MORE_BYTE (c1);
3076                   }
3077                 else
3078                   goto invalid_code;
3079                 DECODE_DESIGNATION (reg, 2, chars96, c1);
3080                 /* We must update these variables now.  */
3081                 if (reg == 0)
3082                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3083                 else if (reg == 1)
3084                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3085                 if (chars96 < 0)
3086                   goto invalid_code;
3087               }
3088               continue;
3089
3090             case 'n':           /* invocation of locking-shift-2 */
3091               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3092                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3093                 goto invalid_code;
3094               CODING_ISO_INVOCATION (coding, 0) = 2;
3095               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3096               continue;
3097
3098             case 'o':           /* invocation of locking-shift-3 */
3099               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3100                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3101                 goto invalid_code;
3102               CODING_ISO_INVOCATION (coding, 0) = 3;
3103               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3104               continue;
3105
3106             case 'N':           /* invocation of single-shift-2 */
3107               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3108                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3109                 goto invalid_code;
3110               charset_id_2 = CODING_ISO_DESIGNATION (coding, 2);
3111               if (charset_id_2 < 0)
3112                 charset = CHARSET_FROM_ID (charset_ascii);
3113               else
3114                 charset = CHARSET_FROM_ID (charset_id_2);
3115               ONE_MORE_BYTE (c1);
3116               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3117                 goto invalid_code;
3118               break;
3119
3120             case 'O':           /* invocation of single-shift-3 */
3121               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3122                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3123                 goto invalid_code;
3124               charset_id_3 = CODING_ISO_DESIGNATION (coding, 3);
3125               if (charset_id_3 < 0)
3126                 charset = CHARSET_FROM_ID (charset_ascii);
3127               else
3128                 charset = CHARSET_FROM_ID (charset_id_3);
3129               ONE_MORE_BYTE (c1);
3130               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3131                 goto invalid_code;
3132               break;
3133
3134             case '0': case '2': case '3': case '4': /* start composition */
3135               if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3136                 goto invalid_code;
3137               DECODE_COMPOSITION_START (c1);
3138               continue;
3139
3140             case '1':           /* end composition */
3141               if (composition_state == COMPOSING_NO)
3142                 goto invalid_code;
3143               DECODE_COMPOSITION_END ();
3144               continue;
3145
3146             case '[':           /* specification of direction */
3147               if (! CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION)
3148                 goto invalid_code;
3149               /* For the moment, nested direction is not supported.
3150                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
3151                  left-to-right, and nonzero means right-to-left.  */
3152               ONE_MORE_BYTE (c1);
3153               switch (c1)
3154                 {
3155                 case ']':       /* end of the current direction */
3156                   coding->mode &= ~CODING_MODE_DIRECTION;
3157
3158                 case '0':       /* end of the current direction */
3159                 case '1':       /* start of left-to-right direction */
3160                   ONE_MORE_BYTE (c1);
3161                   if (c1 == ']')
3162                     coding->mode &= ~CODING_MODE_DIRECTION;
3163                   else
3164                     goto invalid_code;
3165                   break;
3166
3167                 case '2':       /* start of right-to-left direction */
3168                   ONE_MORE_BYTE (c1);
3169                   if (c1 == ']')
3170                     coding->mode |= CODING_MODE_DIRECTION;
3171                   else
3172                     goto invalid_code;
3173                   break;
3174
3175                 default:
3176                   goto invalid_code;
3177                 }
3178               continue;
3179
3180             case '%':
3181               ONE_MORE_BYTE (c1);
3182               if (c1 == '/')
3183                 {
3184                   /* CTEXT extended segment:
3185                      ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3186                      We keep these bytes as is for the moment.
3187                      They may be decoded by post-read-conversion.  */
3188                   int dim, M, L;
3189                   int size;
3190
3191                   ONE_MORE_BYTE (dim);
3192                   ONE_MORE_BYTE (M);
3193                   ONE_MORE_BYTE (L);
3194                   size = ((M - 128) * 128) + (L - 128);
3195                   if (charbuf + 8 + size > charbuf_end)
3196                     goto break_loop;
3197                   *charbuf++ = ISO_CODE_ESC;
3198                   *charbuf++ = '%';
3199                   *charbuf++ = '/';
3200                   *charbuf++ = dim;
3201                   *charbuf++ = BYTE8_TO_CHAR (M);
3202                   *charbuf++ = BYTE8_TO_CHAR (L);
3203                   while (size-- > 0)
3204                     {
3205                       ONE_MORE_BYTE (c1);
3206                       *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3207                     }
3208                 }
3209               else if (c1 == 'G')
3210                 {
3211                   /* XFree86 extension for embedding UTF-8 in CTEXT:
3212                      ESC % G --UTF-8-BYTES-- ESC % @
3213                      We keep these bytes as is for the moment.
3214                      They may be decoded by post-read-conversion.  */
3215                   int *p = charbuf;
3216
3217                   if (p + 6 > charbuf_end)
3218                     goto break_loop;
3219                   *p++ = ISO_CODE_ESC;
3220                   *p++ = '%';
3221                   *p++ = 'G';
3222                   while (p < charbuf_end)
3223                     {
3224                       ONE_MORE_BYTE (c1);
3225                       if (c1 == ISO_CODE_ESC
3226                           && src + 1 < src_end
3227                           && src[0] == '%'
3228                           && src[1] == '@')
3229                         {
3230                           src += 2;
3231                           break;
3232                         }
3233                       *p++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3234                     }
3235                   if (p + 3 > charbuf_end)
3236                     goto break_loop;
3237                   *p++ = ISO_CODE_ESC;
3238                   *p++ = '%';
3239                   *p++ = '@';
3240                   charbuf = p;
3241                 }
3242               else
3243                 goto invalid_code;
3244               continue;
3245               break;
3246
3247             default:
3248               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3249                 goto invalid_code;
3250               {
3251                 int reg, chars96;
3252
3253                 if (c1 >= 0x28 && c1 <= 0x2B)
3254                   { /* designation of DIMENSION1_CHARS94 character set */
3255                     reg = c1 - 0x28, chars96 = 0;
3256                     ONE_MORE_BYTE (c1);
3257                   }
3258                 else if (c1 >= 0x2C && c1 <= 0x2F)
3259                   { /* designation of DIMENSION1_CHARS96 character set */
3260                     reg = c1 - 0x2C, chars96 = 1;
3261                     ONE_MORE_BYTE (c1);
3262                   }
3263                 else
3264                   goto invalid_code;
3265                 DECODE_DESIGNATION (reg, 1, chars96, c1);
3266                 /* We must update these variables now.  */
3267                 if (reg == 0)
3268                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3269                 else if (reg == 1)
3270                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3271                 if (chars96 < 0)
3272                   goto invalid_code;
3273               }
3274               continue;
3275             }
3276         }
3277
3278       if (charset->id != charset_ascii
3279           && last_id != charset->id)
3280         {
3281           if (last_id != charset_ascii)
3282             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3283           last_id = charset->id;
3284           last_offset = char_offset;
3285         }
3286
3287       /* Now we know CHARSET and 1st position code C1 of a character.
3288          Produce a decoded character while getting 2nd position code
3289          C2 if necessary.  */
3290       c1 &= 0x7F;
3291       if (CHARSET_DIMENSION (charset) > 1)
3292         {
3293           ONE_MORE_BYTE (c2);
3294           if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
3295             /* C2 is not in a valid range.  */
3296             goto invalid_code;
3297           c1 = (c1 << 8) | (c2 & 0x7F);
3298           if (CHARSET_DIMENSION (charset) > 2)
3299             {
3300               ONE_MORE_BYTE (c2);
3301               if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
3302                 /* C2 is not in a valid range.  */
3303                 goto invalid_code;
3304               c1 = (c1 << 8) | (c2 & 0x7F);
3305             }
3306         }
3307
3308       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3309       if (c < 0)
3310         {
3311           MAYBE_FINISH_COMPOSITION ();
3312           for (; src_base < src; src_base++, char_offset++)
3313             {
3314               if (ASCII_BYTE_P (*src_base))
3315                 *charbuf++ = *src_base;
3316               else
3317                 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3318             }
3319         }
3320       else if (composition_state == COMPOSING_NO)
3321         {
3322           *charbuf++ = c;
3323           char_offset++;
3324         }
3325       else
3326         {
3327           components[component_idx++] = c;
3328           if (method == COMPOSITION_WITH_RULE
3329               || (method == COMPOSITION_WITH_RULE_ALTCHARS
3330                   && composition_state == COMPOSING_COMPONENT_CHAR))
3331             composition_state++;
3332         }
3333       continue;
3334
3335     invalid_code:
3336       MAYBE_FINISH_COMPOSITION ();
3337       src = src_base;
3338       consumed_chars = consumed_chars_base;
3339       ONE_MORE_BYTE (c);
3340       *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
3341       char_offset++;
3342       coding->errors++;
3343       continue;
3344
3345     break_loop:
3346       break;
3347     }
3348
3349  no_more_source:
3350   if (last_id != charset_ascii)
3351     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3352   coding->consumed_char += consumed_chars_base;
3353   coding->consumed = src_base - coding->source;
3354   coding->charbuf_used = charbuf - coding->charbuf;
3355 }
3356
3357
3358 /* ISO2022 encoding stuff.  */
3359
3360 /*
3361    It is not enough to say just "ISO2022" on encoding, we have to
3362    specify more details.  In Emacs, each coding system of ISO2022
3363    variant has the following specifications:
3364         1. Initial designation to G0 thru G3.
3365         2. Allows short-form designation?
3366         3. ASCII should be designated to G0 before control characters?
3367         4. ASCII should be designated to G0 at end of line?
3368         5. 7-bit environment or 8-bit environment?
3369         6. Use locking-shift?
3370         7. Use Single-shift?
3371    And the following two are only for Japanese:
3372         8. Use ASCII in place of JIS0201-1976-Roman?
3373         9. Use JISX0208-1983 in place of JISX0208-1978?
3374    These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
3375    defined by macros CODING_ISO_FLAG_XXX.  See `coding.h' for more
3376    details.
3377 */
3378
/* Produce the escape sequence that designates CHARSET to graphic
   register REG at DST, and increment DST.  The sequence is

     [ESC '&' <'@' + revision>] ESC <intermediate bytes> <final-char>

   The revision prefix is produced only when CODING has
   CODING_ISO_FLAG_REVISION set and the revision of CHARSET is not
   negative.  For a multi-byte 94-character set whose <final-char> is
   '@', 'A', or 'B', designated to register 0 while
   CODING_ISO_FLAG_LONG_FORM is off, the short form ESC '$'
   <final-char> is produced.  Finally CHARSET is recorded in CODING as
   the charset designated to REG.

   REG indexes four-entry tables, so it must be in the range 0..3.
   Bytes are written with the EMIT_* macros; whatever variables those
   macros refer to must be in scope where this macro is expanded.  */

#define ENCODE_DESIGNATION(charset, reg, coding)                        \
  do {                                                                  \
    unsigned char final_char = CHARSET_ISO_FINAL (charset);             \
    /* Intermediate bytes, indexed by graphic register number.  They    \
       point at string literals, so they are declared const.  */        \
    const char *intermediate_char_94 = "()*+";                          \
    const char *intermediate_char_96 = ",-./";                          \
    int revision = -1;                                                  \
    int c;                                                              \
                                                                        \
    if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION)           \
      revision = CHARSET_ISO_REVISION (charset);                        \
                                                                        \
    /* Optional revision prefix.  */                                    \
    if (revision >= 0)                                                  \
      {                                                                 \
        EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&');                       \
        EMIT_ONE_BYTE ('@' + revision);                                 \
      }                                                                 \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC);                                 \
    if (CHARSET_DIMENSION (charset) == 1)                               \
      {                                                                 \
        if (! CHARSET_ISO_CHARS_96 (charset))                           \
          c = intermediate_char_94[(reg)];                              \
        else                                                            \
          c = intermediate_char_96[(reg)];                              \
        EMIT_ONE_ASCII_BYTE (c);                                        \
      }                                                                 \
    else                                                                \
      {                                                                 \
        EMIT_ONE_ASCII_BYTE ('$');                                      \
        if (! CHARSET_ISO_CHARS_96 (charset))                           \
          {                                                             \
            /* The register-selecting byte is left out (short form)     \
               only when none of these conditions holds.  */            \
            if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM   \
                || (reg) != 0                                           \
                || final_char < '@' || final_char > 'B')                \
              EMIT_ONE_ASCII_BYTE (intermediate_char_94[(reg)]);        \
          }                                                             \
        else                                                            \
          EMIT_ONE_ASCII_BYTE (intermediate_char_96[(reg)]);            \
      }                                                                 \
    EMIT_ONE_ASCII_BYTE (final_char);                                   \
                                                                        \
    CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset);        \
  } while (0)
3426
3427
3428 /* The following two macros produce codes (control character or escape
3429    sequence) for ISO2022 single-shift functions (single-shift-2 and
3430    single-shift-3).  */
3431
/* Produce single-shift-2: the control byte ISO_CODE_SS2, or the
   escape sequence ESC 'N' when CODING_ISO_FLAG_SEVEN_BITS is set.
   The single-shifting state of CODING is then set to 1.  */
#define ENCODE_SINGLE_SHIFT_2                                           \
  do {                                                                  \
    if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))     \
      EMIT_ONE_BYTE (ISO_CODE_SS2);                                     \
    else                                                                \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N');                         \
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
  } while (0)
3440
3441
/* Produce single-shift-3: the control byte ISO_CODE_SS3, or the
   escape sequence ESC 'O' when CODING_ISO_FLAG_SEVEN_BITS is set.
   The single-shifting state of CODING is then set to 1.  */
#define ENCODE_SINGLE_SHIFT_3                                           \
  do {                                                                  \
    if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))     \
      EMIT_ONE_BYTE (ISO_CODE_SS3);                                     \
    else                                                                \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O');                         \
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
  } while (0)
3450
3451
3452 /* The following four macros produce codes (control character or
3453    escape sequence) for ISO2022 locking-shift functions (shift-in,
3454    shift-out, locking-shift-2, and locking-shift-3).  */
3455
/* Shift-in: produce ISO_CODE_SI and record in CODING that graphic
   register 0 is now invoked to graphic plane 0.  */
#define ENCODE_SHIFT_IN                         \
  do                                            \
    {                                           \
      EMIT_ONE_ASCII_BYTE (ISO_CODE_SI);        \
      CODING_ISO_INVOCATION (coding, 0) = 0;    \
    }                                           \
  while (0)
3461
3462
/* Shift-out: produce ISO_CODE_SO and record in CODING that graphic
   register 1 is now invoked to graphic plane 0.  */
#define ENCODE_SHIFT_OUT                        \
  do                                            \
    {                                           \
      EMIT_ONE_ASCII_BYTE (ISO_CODE_SO);        \
      CODING_ISO_INVOCATION (coding, 0) = 1;    \
    }                                           \
  while (0)
3468
3469
/* Locking-shift-2: produce ESC 'n' and record in CODING that graphic
   register 2 is now invoked to graphic plane 0.  */
#define ENCODE_LOCKING_SHIFT_2                          \
  do                                                    \
    {                                                   \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');         \
      CODING_ISO_INVOCATION (coding, 0) = 2;            \
    }                                                   \
  while (0)
3475
3476
/* Locking-shift-3: produce ESC 'o' and record in CODING that graphic
   register 3 is now invoked to graphic plane 0.  The final byte must
   be 'o': ESC 'n' is locking-shift-2, and the ISO2022 decoder in this
   file accepts ESC 'o' as the invocation of locking-shift-3.  */
#define ENCODE_LOCKING_SHIFT_3                          \
  do {                                                  \
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'o');           \
    CODING_ISO_INVOCATION (coding, 0) = 3;              \
  } while (0)
3482
3483
/* Produce codes for a DIMENSION1 character whose character set is
   CHARSET and whose position-code is C1.  Designation and invocation
   sequences are also produced in advance if necessary.

   CHARSET must be a modifiable lvalue: when CODING_ISO_FLAG_USE_ROMAN
   is set and CHARSET is ASCII, it is replaced by JISX0201-Roman.  C1
   may be any integer expression; it is parenthesized before being
   masked.  The macro refers to the variables `coding', `dst', and
   `produced_chars' of the scope in which it is expanded.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
  do {                                                                  \
    int id = CHARSET_ID (charset);                                      \
                                                                        \
    if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)         \
        && id == charset_ascii)                                         \
      {                                                                 \
        id = charset_jisx0201_roman;                                    \
        charset = CHARSET_FROM_ID (id);                                 \
      }                                                                 \
                                                                        \
    if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
      {                                                                 \
        /* A single shift was just produced; it covers this one         \
           character only, so the state is cleared afterwards.  */      \
        if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
          EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                            \
        else                                                            \
          EMIT_ONE_BYTE ((c1) | 0x80);                                  \
        CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
        break;                                                          \
      }                                                                 \
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
      {                                                                 \
        /* Invoked to graphic plane 0: 7-bit code.  */                  \
        EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                              \
        break;                                                          \
      }                                                                 \
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
      {                                                                 \
        /* Invoked to graphic plane 1: code with the 8th bit set.  */   \
        EMIT_ONE_BYTE ((c1) | 0x80);                                    \
        break;                                                          \
      }                                                                 \
    else                                                                \
      /* Since CHARSET is not yet invoked to any graphic planes, we     \
         must invoke it, or, at first, designate it to some graphic     \
         register.  Then repeat the loop to actually produce the        \
         character.  */                                                 \
      dst = encode_invocation_designation (charset, coding, dst,        \
                                           &produced_chars);            \
  } while (1)
3526
3527
/* Produce codes for a DIMENSION2 character of CHARSET whose two
   position-codes are C1 and C2, producing first any designation and
   invocation codes that are needed.

   CHARSET must be a modifiable lvalue: with
   CODING_ISO_FLAG_USE_OLDJIS set, JISX0208 is replaced by
   JISX0208-1978.  The loop repeats until one of the branches below
   has produced the character.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
  do {                                                                  \
    int id = CHARSET_ID (charset);                                      \
                                                                        \
    if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)        \
        && id == charset_jisx0208)                                      \
      {                                                                 \
        id = charset_jisx0208_1978;                                     \
        charset = CHARSET_FROM_ID (id);                                 \
      }                                                                 \
                                                                        \
    /* Case 1: a single shift is pending; it is used up here.  */       \
    if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
      {                                                                 \
        if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)) \
          EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                    \
        else                                                            \
          EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);              \
        CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
        break;                                                          \
      }                                                                 \
                                                                        \
    /* Case 2: CHARSET is invoked to graphic plane 0.  */               \
    if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))                   \
      {                                                                 \
        EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);                \
        break;                                                          \
      }                                                                 \
                                                                        \
    /* Case 3: CHARSET is invoked to graphic plane 1.  */               \
    if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))                   \
      {                                                                 \
        EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                      \
        break;                                                          \
      }                                                                 \
                                                                        \
    /* Otherwise CHARSET is not invoked to either graphic plane yet.    \
       Invoke it (designating it to a graphic register first if that    \
       has not been done), then go around again to produce the          \
       character itself.  */                                            \
    dst = encode_invocation_designation (charset, coding, dst,          \
                                         &produced_chars);              \
  } while (1)
3570
3571
/* Produce codes for character C of CHARSET.  C is first converted to
   its code in CHARSET by ENCODE_CHAR.  A charset of dimension 1 gets
   that code as its single position-code; any other charset is handled
   as DIMENSION2, with the code shifted right by 8 bits as the first
   position-code and its low 8 bits as the second.  */
#define ENCODE_ISO_CHARACTER(charset, c)                                   \
  do {                                                                     \
    int code = ENCODE_CHAR ((charset), (c));                               \
                                                                           \
    if (CHARSET_DIMENSION (charset) != 1)                                  \
      ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
    else                                                                   \
      ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code);                   \
  } while (0)
3581
3582
3583 /* Produce designation and invocation codes at a place pointed by DST
3584    to use CHARSET.  The element `spec.iso_2022' of *CODING is updated.
3585    Return new DST.  */
3586
3587 unsigned char *
3588 encode_invocation_designation (charset, coding, dst, p_nchars)
3589      struct charset *charset;
3590      struct coding_system *coding;
3591      unsigned char *dst;
3592      int *p_nchars;
3593 {
3594   int multibytep = coding->dst_multibyte;
3595   int produced_chars = *p_nchars;
3596   int reg;                      /* graphic register number */
3597   int id = CHARSET_ID (charset);
3598
3599   /* At first, check designations.  */
3600   for (reg = 0; reg < 4; reg++)
3601     if (id == CODING_ISO_DESIGNATION (coding, reg))
3602       break;
3603
3604   if (reg >= 4)
3605     {
3606       /* CHARSET is not yet designated to any graphic registers.  */
3607       /* At first check the requested designation.  */
3608       reg = CODING_ISO_REQUEST (coding, id);
3609       if (reg < 0)
3610         /* Since CHARSET requests no special designation, designate it
3611            to graphic register 0.  */
3612         reg = 0;
3613
3614       ENCODE_DESIGNATION (charset, reg, coding);
3615     }
3616
3617   if (CODING_ISO_INVOCATION (coding, 0) != reg
3618       && CODING_ISO_INVOCATION (coding, 1) != reg)
3619     {
3620       /* Since the graphic register REG is not invoked to any graphic
3621          planes, invoke it to graphic plane 0.  */
3622       switch (reg)
3623         {
3624         case 0:                 /* graphic register 0 */
3625           ENCODE_SHIFT_IN;
3626           break;
3627
3628         case 1:                 /* graphic register 1 */
3629           ENCODE_SHIFT_OUT;
3630           break;
3631
3632         case 2:                 /* graphic register 2 */
3633           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3634             ENCODE_SINGLE_SHIFT_2;
3635           else
3636             ENCODE_LOCKING_SHIFT_2;
3637           break;
3638
3639         case 3:                 /* graphic register 3 */
3640           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3641             ENCODE_SINGLE_SHIFT_3;
3642           else
3643             ENCODE_LOCKING_SHIFT_3;
3644           break;
3645         }
3646     }
3647
3648   *p_nchars = produced_chars;
3649   return dst;
3650 }
3651
3652 /* The following three macros produce codes for indicating direction
3653    of text.  */
/* Produce a control sequence introducer: the escape sequence ESC '['
   when CODING_ISO_FLAG_SEVEN_BITS is set in the flags of CODING, the
   single byte ISO_CODE_CSI otherwise.  The flag is tested as a bit
   with `&'; comparing the whole flag word with `==' would select the
   7-bit form only when no other flag is set.  This is an object-like
   macro and takes no argument.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER                              \
  do {                                                                  \
    if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)         \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '[');                         \
    else                                                                \
      EMIT_ONE_BYTE (ISO_CODE_CSI);                                     \
  } while (0)
3661
3662
/* Produce the control sequence that starts right-to-left text: a
   control sequence introducer followed by '2' ']'.
   ENCODE_CONTROL_SEQUENCE_INTRODUCER is an object-like macro, so it
   is used without an argument list; writing `(dst)' after it would
   leave a stray parenthesized expression after its `while (0)'.  */
#define ENCODE_DIRECTION_R2L()                  \
  do {                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
    EMIT_TWO_ASCII_BYTES ('2', ']');            \
  } while (0)
3668
3669
/* Produce the control sequence that ends the current direction and
   returns to left-to-right text: a control sequence introducer
   followed by '0' ']'.  ENCODE_CONTROL_SEQUENCE_INTRODUCER is an
   object-like macro, so it is used without an argument list; writing
   `(dst)' after it would leave a stray parenthesized expression after
   its `while (0)'.  */
#define ENCODE_DIRECTION_L2R()                  \
  do {                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
    EMIT_TWO_ASCII_BYTES ('0', ']');            \
  } while (0)
3675
3676
/* Bring the graphic planes and registers back to their initial state.
   First shift in if something other than graphic register 0 is
   invoked to graphic plane 0.  Then, for each of the four registers
   that has an initial charset (a non-negative id) different from its
   current designation, produce the designation of that initial
   charset.  */
#define ENCODE_RESET_PLANE_AND_REGISTER()                               \
  do {                                                                  \
    struct charset *charset;                                            \
    int reg = 0;                                                        \
                                                                        \
    if (CODING_ISO_INVOCATION (coding, 0) != 0)                         \
      ENCODE_SHIFT_IN;                                                  \
    while (reg < 4)                                                     \
      {                                                                 \
        if (CODING_ISO_INITIAL (coding, reg) >= 0                       \
            && (CODING_ISO_INITIAL (coding, reg)                        \
                != CODING_ISO_DESIGNATION (coding, reg)))               \
          {                                                             \
            charset                                                     \
              = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg));     \
            ENCODE_DESIGNATION (charset, reg, coding);                  \
          }                                                             \
        reg++;                                                          \
      }                                                                 \
  } while (0)
3695
3696
3697 /* Produce designation sequences of charsets in the line started from
3698    SRC to a place pointed by DST, and return updated DST.
3699
3700    If the current block ends before any end-of-line, we may fail to
3701    find all the necessary designations.  */
3702
3703 static unsigned char *
3704 encode_designation_at_bol (coding, charbuf, charbuf_end, dst)
3705      struct coding_system *coding;
3706      int *charbuf, *charbuf_end;
3707      unsigned char *dst;
3708 {
3709   struct charset *charset;
3710   /* Table of charsets to be designated to each graphic register.  */
3711   int r[4];
3712   int c, found = 0, reg;
3713   int produced_chars = 0;
3714   int multibytep = coding->dst_multibyte;
3715   Lisp_Object attrs;
3716   Lisp_Object charset_list;
3717
3718   attrs = CODING_ID_ATTRS (coding->id);
3719   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
3720   if (EQ (charset_list, Qiso_2022))
3721     charset_list = Viso_2022_charset_list;
3722
3723   for (reg = 0; reg < 4; reg++)
3724     r[reg] = -1;
3725
3726   while (found < 4)
3727     {
3728       int id;
3729
3730       c = *charbuf++;
3731       if (c == '\n')
3732         break;
3733       charset = char_charset (c, charset_list, NULL);
3734       id = CHARSET_ID (charset);
3735       reg = CODING_ISO_REQUEST (coding, id);
3736       if (reg >= 0 && r[reg] < 0)
3737         {
3738           found++;
3739           r[reg] = id;
3740         }
3741     }
3742
3743   if (found)
3744     {
3745       for (reg = 0; reg < 4; reg++)
3746         if (r[reg] >= 0
3747             && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
3748           ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
3749     }
3750
3751   return dst;
3752 }
3753
3754 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".  */
3755
3756 static int
3757 encode_coding_iso_2022 (coding)
3758      struct coding_system *coding;
3759 {
3760   int multibytep = coding->dst_multibyte;
3761   int *charbuf = coding->charbuf;
3762   int *charbuf_end = charbuf + coding->charbuf_used;
3763   unsigned char *dst = coding->destination + coding->produced;
3764   unsigned char *dst_end = coding->destination + coding->dst_bytes;
3765   int safe_room = 16;
3766   int bol_designation
3767     = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
3768        && CODING_ISO_BOL (coding));
3769   int produced_chars = 0;
3770   Lisp_Object attrs, eol_type, charset_list;
3771   int ascii_compatible;
3772   int c;
3773   int preferred_charset_id = -1;
3774
3775   CODING_GET_INFO (coding, attrs, charset_list);
3776   eol_type = CODING_ID_EOL_TYPE (coding->id);
3777   if (VECTORP (eol_type))
3778     eol_type = Qunix;
3779
3780   setup_iso_safe_charsets (attrs);
3781   /* Charset list may have been changed.  */
3782   charset_list = CODING_ATTR_CHARSET_LIST (attrs);              \
3783   coding->safe_charsets = (char *) SDATA (CODING_ATTR_SAFE_CHARSETS(attrs));
3784
3785   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
3786
3787   while (charbuf < charbuf_end)
3788     {
3789       ASSURE_DESTINATION (safe_room);
3790
3791       if (bol_designation)
3792         {
3793           unsigned char *dst_prev = dst;
3794
3795           /* We have to produce designation sequences if any now.  */
3796           dst = encode_designation_at_bol (coding, charbuf, charbuf_end, dst);
3797           bol_designation = 0;
3798           /* We are sure that designation sequences are all ASCII bytes.  */
3799           produced_chars += dst - dst_prev;
3800         }
3801
3802       c = *charbuf++;
3803
3804       if (c < 0)
3805         {
3806           /* Handle an annotation.  */
3807           switch (*charbuf)
3808             {
3809             case CODING_ANNOTATE_COMPOSITION_MASK:
3810               /* Not yet implemented.  */
3811               break;
3812             case CODING_ANNOTATE_CHARSET_MASK:
3813               preferred_charset_id = charbuf[2];
3814               if (preferred_charset_id >= 0
3815                   && NILP (Fmemq (make_number (preferred_charset_id),
3816                                   charset_list)))
3817                 preferred_charset_id = -1;
3818               break;
3819             default:
3820               abort ();
3821             }
3822           charbuf += -c - 1;
3823           continue;
3824         }
3825
3826       /* Now encode the character C.  */
3827       if (c < 0x20 || c == 0x7F)
3828         {
3829           if (c == '\n'
3830               || (c == '\r' && EQ (eol_type, Qmac)))
3831             {
3832               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
3833                 ENCODE_RESET_PLANE_AND_REGISTER ();
3834               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
3835                 {
3836                   int i;
3837
3838                   for (i = 0; i < 4; i++)
3839                     CODING_ISO_DESIGNATION (coding, i)
3840                       = CODING_ISO_INITIAL (coding, i);
3841                 }
3842               bol_designation
3843                 = CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL;
3844             }
3845           else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
3846             ENCODE_RESET_PLANE_AND_REGISTER ();
3847           EMIT_ONE_ASCII_BYTE (c);
3848         }
3849       else if (ASCII_CHAR_P (c))
3850         {
3851           if (ascii_compatible)
3852             EMIT_ONE_ASCII_BYTE (c);
3853           else
3854             {
3855               struct charset *charset = CHARSET_FROM_ID (charset_ascii);
3856               ENCODE_ISO_CHARACTER (charset, c);
3857             }
3858         }
3859       else if (CHAR_BYTE8_P (c))
3860         {
3861           c = CHAR_TO_BYTE8 (c);
3862           EMIT_ONE_BYTE (c);
3863         }
3864       else
3865         {
3866           struct charset *charset;
3867
3868           if (preferred_charset_id >= 0)
3869             {
3870               charset = CHARSET_FROM_ID (preferred_charset_id);
3871               if (! CHAR_CHARSET_P (c, charset))
3872                 charset = char_charset (c, charset_list, NULL);
3873             }
3874           else
3875             charset = char_charset (c, charset_list, NULL);
3876           if (!charset)
3877             {
3878               if (coding->mode & CODING_MODE_SAFE_ENCODING)
3879                 {
3880                   c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
3881                   charset = CHARSET_FROM_ID (charset_ascii);
3882                 }
3883               else
3884                 {
3885                   c = coding->default_char;
3886                   charset = char_charset (c, charset_list, NULL);
3887                 }
3888             }
3889           ENCODE_ISO_CHARACTER (charset, c);
3890         }
3891     }
3892
3893   if (coding->mode & CODING_MODE_LAST_BLOCK
3894       && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
3895     {
3896       ASSURE_DESTINATION (safe_room);
3897       ENCODE_RESET_PLANE_AND_REGISTER ();
3898     }
3899   record_conversion_result (coding, CODING_RESULT_SUCCESS);
3900   CODING_ISO_BOL (coding) = bol_designation;
3901   coding->produced_char += produced_chars;
3902   coding->produced = dst - coding->destination;
3903   return 0;
3904 }
3905
3906 \f
/*** 8. SJIS and BIG5 handlers ***/
3908
/* Although SJIS and BIG5 are not ISO coding systems, they are used
   quite widely.  So, for the moment, Emacs supports them directly in
   C code.  But, in the future, they may be supported only by CCL.  */
3912
/* SJIS is a coding system encoding three character sets: ASCII, right
   half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
   as is.  A character of charset katakana-jisx0201 is encoded by
   "position-code + 0x80".  A character of charset japanese-jisx0208
   is encoded in two bytes, but its two position-codes are divided and
   shifted so that they fit in the ranges below.
3919
3920    --- CODE RANGE of SJIS ---
3921    (character set)      (range)
3922    ASCII                0x00 .. 0x7F
3923    KATAKANA-JISX0201    0xA0 .. 0xDF
3924    JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
3925             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
3926    -------------------------------
3927
3928 */
3929
/* BIG5 is a coding system encoding two character sets: ASCII and
   Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
   character set and is encoded in two bytes.
3933
3934    --- CODE RANGE of BIG5 ---
3935    (character set)      (range)
3936    ASCII                0x00 .. 0x7F
3937    Big5 (1st byte)      0xA1 .. 0xFE
3938         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
3939    --------------------------
3940
3941   */
3942
3943 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3944    Check if a text is encoded in SJIS.  If it is, return
3945    CATEGORY_MASK_SJIS, else return 0.  */
3946
3947 static int
3948 detect_coding_sjis (coding, detect_info)
3949      struct coding_system *coding;
3950      struct coding_detection_info *detect_info;
3951 {
3952   const unsigned char *src = coding->source, *src_base;
3953   const unsigned char *src_end = coding->source + coding->src_bytes;
3954   int multibytep = coding->src_multibyte;
3955   int consumed_chars = 0;
3956   int found = 0;
3957   int c;
3958
3959   detect_info->checked |= CATEGORY_MASK_SJIS;
3960   /* A coding system of this category is always ASCII compatible.  */
3961   src += coding->head_ascii;
3962
3963   while (1)
3964     {
3965       src_base = src;
3966       ONE_MORE_BYTE (c);
3967       if (c < 0x80)
3968         continue;
3969       if ((c >= 0x81 && c <= 0x9F) || (c >= 0xE0 && c <= 0xEF))
3970         {
3971           ONE_MORE_BYTE (c);
3972           if (c < 0x40 || c == 0x7F || c > 0xFC)
3973             break;
3974           found = CATEGORY_MASK_SJIS;
3975         }
3976       else if (c >= 0xA0 && c < 0xE0)
3977         found = CATEGORY_MASK_SJIS;
3978       else
3979         break;
3980     }
3981   detect_info->rejected |= CATEGORY_MASK_SJIS;
3982   return 0;
3983
3984  no_more_source:
3985   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
3986     {
3987       detect_info->rejected |= CATEGORY_MASK_SJIS;
3988       return 0;
3989     }
3990   detect_info->found |= found;
3991   return 1;
3992 }
3993
3994 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3995    Check if a text is encoded in BIG5.  If it is, return
3996    CATEGORY_MASK_BIG5, else return 0.  */
3997
3998 static int
3999 detect_coding_big5 (coding, detect_info)
4000      struct coding_system *coding;
4001      struct coding_detection_info *detect_info;
4002 {
4003   const unsigned char *src = coding->source, *src_base;
4004   const unsigned char *src_end = coding->source + coding->src_bytes;
4005   int multibytep = coding->src_multibyte;
4006   int consumed_chars = 0;
4007   int found = 0;
4008   int c;
4009
4010   detect_info->checked |= CATEGORY_MASK_BIG5;
4011   /* A coding system of this category is always ASCII compatible.  */
4012   src += coding->head_ascii;
4013
4014   while (1)
4015     {
4016       src_base = src;
4017       ONE_MORE_BYTE (c);
4018       if (c < 0x80)
4019         continue;
4020       if (c >= 0xA1)
4021         {
4022           ONE_MORE_BYTE (c);
4023           if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
4024             return 0;
4025           found = CATEGORY_MASK_BIG5;
4026         }
4027       else
4028         break;
4029     }
4030   detect_info->rejected |= CATEGORY_MASK_BIG5;
4031   return 0;
4032
4033  no_more_source:
4034   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4035     {
4036       detect_info->rejected |= CATEGORY_MASK_BIG5;
4037       return 0;
4038     }
4039   detect_info->found |= found;
4040   return 1;
4041 }
4042
4043 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
4044    If SJIS_P is 1, decode SJIS text, else decode BIG5 test.  */
4045
4046 static void
4047 decode_coding_sjis (coding)
4048      struct coding_system *coding;
4049 {
4050   const unsigned char *src = coding->source + coding->consumed;
4051   const unsigned char *src_end = coding->source + coding->src_bytes;
4052   const unsigned char *src_base;
4053   int *charbuf = coding->charbuf + coding->charbuf_used;
4054   int *charbuf_end
4055     = coding->charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
4056   int consumed_chars = 0, consumed_chars_base;
4057   int multibytep = coding->src_multibyte;
4058   struct charset *charset_roman, *charset_kanji, *charset_kana;
4059   struct charset *charset_kanji2;
4060   Lisp_Object attrs, charset_list, val;
4061   int char_offset = coding->produced_char;
4062   int last_offset = char_offset;
4063   int last_id = charset_ascii;
4064
4065   CODING_GET_INFO (coding, attrs, charset_list);
4066
4067   val = charset_list;
4068   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4069   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4070   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4071   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4072
4073   while (1)
4074     {
4075       int c, c1;
4076       struct charset *charset;
4077
4078       src_base = src;
4079       consumed_chars_base = consumed_chars;
4080
4081       if (charbuf >= charbuf_end)
4082         break;
4083
4084       ONE_MORE_BYTE (c);
4085       if (c < 0)
4086         goto invalid_code;
4087       if (c < 0x80)
4088         charset = charset_roman;
4089       else if (c == 0x80 || c == 0xA0)
4090         goto invalid_code;
4091       else if (c >= 0xA1 && c <= 0xDF)
4092         {
4093           /* SJIS -> JISX0201-Kana */
4094           c &= 0x7F;
4095           charset = charset_kana;
4096         }
4097       else if (c <= 0xEF)
4098         {
4099           /* SJIS -> JISX0208 */
4100           ONE_MORE_BYTE (c1);
4101           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4102             goto invalid_code;
4103           c = (c << 8) | c1;
4104           SJIS_TO_JIS (c);
4105           charset = charset_kanji;
4106         }
4107       else if (c <= 0xFC && charset_kanji2)
4108         {
4109           /* SJIS -> JISX0213-2 */
4110           ONE_MORE_BYTE (c1);
4111           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4112             goto invalid_code;
4113           c = (c << 8) | c1;
4114           SJIS_TO_JIS2 (c);
4115           charset = charset_kanji2;
4116         }
4117       else
4118         goto invalid_code;
4119       if (charset->id != charset_ascii
4120           && last_id != charset->id)
4121         {
4122           if (last_id != charset_ascii)
4123             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4124           last_id = charset->id;
4125           last_offset = char_offset;
4126         }
4127       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4128       *charbuf++ = c;
4129       char_offset++;
4130       continue;
4131
4132     invalid_code:
4133       src = src_base;
4134       consumed_chars = consumed_chars_base;
4135       ONE_MORE_BYTE (c);
4136       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4137       char_offset++;
4138       coding->errors++;
4139     }
4140
4141  no_more_source:
4142   if (last_id != charset_ascii)
4143     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4144   coding->consumed_char += consumed_chars_base;
4145   coding->consumed = src_base - coding->source;
4146   coding->charbuf_used = charbuf - coding->charbuf;
4147 }
4148
4149 static void
4150 decode_coding_big5 (coding)
4151      struct coding_system *coding;
4152 {
4153   const unsigned char *src = coding->source + coding->consumed;
4154   const unsigned char *src_end = coding->source + coding->src_bytes;
4155   const unsigned char *src_base;
4156   int *charbuf = coding->charbuf + coding->charbuf_used;
4157   int *charbuf_end
4158     = coding->charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
4159   int consumed_chars = 0, consumed_chars_base;
4160   int multibytep = coding->src_multibyte;
4161   struct charset *charset_roman, *charset_big5;
4162   Lisp_Object attrs, charset_list, val;
4163   int char_offset = coding->produced_char;
4164   int last_offset = char_offset;
4165   int last_id = charset_ascii;
4166
4167   CODING_GET_INFO (coding, attrs, charset_list);
4168   val = charset_list;
4169   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4170   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4171
4172   while (1)
4173     {
4174       int c, c1;
4175       struct charset *charset;
4176
4177       src_base = src;
4178       consumed_chars_base = consumed_chars;
4179
4180       if (charbuf >= charbuf_end)
4181         break;
4182
4183       ONE_MORE_BYTE (c);
4184
4185       if (c < 0)
4186         goto invalid_code;
4187       if (c < 0x80)
4188         charset = charset_roman;
4189       else
4190         {
4191           /* BIG5 -> Big5 */
4192           if (c < 0xA1 || c > 0xFE)
4193             goto invalid_code;
4194           ONE_MORE_BYTE (c1);
4195           if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4196             goto invalid_code;
4197           c = c << 8 | c1;
4198           charset = charset_big5;
4199         }
4200       if (charset->id != charset_ascii
4201           && last_id != charset->id)
4202         {
4203           if (last_id != charset_ascii)
4204             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4205           last_id = charset->id;
4206           last_offset = char_offset;
4207         }
4208       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4209       *charbuf++ = c;
4210       char_offset++;
4211       continue;
4212
4213     invalid_code:
4214       src = src_base;
4215       consumed_chars = consumed_chars_base;
4216       ONE_MORE_BYTE (c);
4217       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4218       char_offset++;
4219       coding->errors++;
4220     }
4221
4222  no_more_source:
4223   if (last_id != charset_ascii)
4224     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4225   coding->consumed_char += consumed_chars_base;
4226   coding->consumed = src_base - coding->source;
4227   coding->charbuf_used = charbuf - coding->charbuf;
4228 }
4229
4230 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4231    This function can encode charsets `ascii', `katakana-jisx0201',
4232    `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'.  We
4233    are sure that all these charsets are registered as official charset
4234    (i.e. do not have extended leading-codes).  Characters of other
4235    charsets are produced without any encoding.  If SJIS_P is 1, encode
4236    SJIS text, else encode BIG5 text.  */
4237
4238 static int
4239 encode_coding_sjis (coding)
4240      struct coding_system *coding;
4241 {
4242   int multibytep = coding->dst_multibyte;
4243   int *charbuf = coding->charbuf;
4244   int *charbuf_end = charbuf + coding->charbuf_used;
4245   unsigned char *dst = coding->destination + coding->produced;
4246   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4247   int safe_room = 4;
4248   int produced_chars = 0;
4249   Lisp_Object attrs, charset_list, val;
4250   int ascii_compatible;
4251   struct charset *charset_roman, *charset_kanji, *charset_kana;
4252   struct charset *charset_kanji2;
4253   int c;
4254
4255   CODING_GET_INFO (coding, attrs, charset_list);
4256   val = charset_list;
4257   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4258   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4259   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4260   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4261
4262   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4263
4264   while (charbuf < charbuf_end)
4265     {
4266       ASSURE_DESTINATION (safe_room);
4267       c = *charbuf++;
4268       /* Now encode the character C.  */
4269       if (ASCII_CHAR_P (c) && ascii_compatible)
4270         EMIT_ONE_ASCII_BYTE (c);
4271       else if (CHAR_BYTE8_P (c))
4272         {
4273           c = CHAR_TO_BYTE8 (c);
4274           EMIT_ONE_BYTE (c);
4275         }
4276       else
4277         {
4278           unsigned code;
4279           struct charset *charset = char_charset (c, charset_list, &code);
4280
4281           if (!charset)
4282             {
4283               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4284                 {
4285                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4286                   charset = CHARSET_FROM_ID (charset_ascii);
4287                 }
4288               else
4289                 {
4290                   c = coding->default_char;
4291                   charset = char_charset (c, charset_list, &code);
4292                 }
4293             }
4294           if (code == CHARSET_INVALID_CODE (charset))
4295             abort ();
4296           if (charset == charset_kanji)
4297             {
4298               int c1, c2;
4299               JIS_TO_SJIS (code);
4300               c1 = code >> 8, c2 = code & 0xFF;
4301               EMIT_TWO_BYTES (c1, c2);
4302             }
4303           else if (charset == charset_kana)
4304             EMIT_ONE_BYTE (code | 0x80);
4305           else if (charset_kanji2 && charset == charset_kanji2)
4306             {
4307               int c1, c2;
4308
4309               c1 = code >> 8;
4310               if (c1 == 0x21 || (c1 >= 0x23 && c1 < 0x25)
4311                   || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
4312                 {
4313                   JIS_TO_SJIS2 (code);
4314                   c1 = code >> 8, c2 = code & 0xFF;
4315                   EMIT_TWO_BYTES (c1, c2);
4316                 }
4317               else
4318                 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4319             }
4320           else
4321             EMIT_ONE_ASCII_BYTE (code & 0x7F);
4322         }
4323     }
4324   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4325   coding->produced_char += produced_chars;
4326   coding->produced = dst - coding->destination;
4327   return 0;
4328 }
4329
4330 static int
4331 encode_coding_big5 (coding)
4332      struct coding_system *coding;
4333 {
4334   int multibytep = coding->dst_multibyte;
4335   int *charbuf = coding->charbuf;
4336   int *charbuf_end = charbuf + coding->charbuf_used;
4337   unsigned char *dst = coding->destination + coding->produced;
4338   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4339   int safe_room = 4;
4340   int produced_chars = 0;
4341   Lisp_Object attrs, charset_list, val;
4342   int ascii_compatible;
4343   struct charset *charset_roman, *charset_big5;
4344   int c;
4345
4346   CODING_GET_INFO (coding, attrs, charset_list);
4347   val = charset_list;
4348   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4349   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4350   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4351
4352   while (charbuf < charbuf_end)
4353     {
4354       ASSURE_DESTINATION (safe_room);
4355       c = *charbuf++;
4356       /* Now encode the character C.  */
4357       if (ASCII_CHAR_P (c) && ascii_compatible)
4358         EMIT_ONE_ASCII_BYTE (c);
4359       else if (CHAR_BYTE8_P (c))
4360         {
4361           c = CHAR_TO_BYTE8 (c);
4362           EMIT_ONE_BYTE (c);
4363         }
4364       else
4365         {
4366           unsigned code;
4367           struct charset *charset = char_charset (c, charset_list, &code);
4368
4369           if (! charset)
4370             {
4371               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4372                 {
4373                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4374                   charset = CHARSET_FROM_ID (charset_ascii);
4375                 }
4376               else
4377                 {
4378                   c = coding->default_char;
4379                   charset = char_charset (c, charset_list, &code);
4380                 }
4381             }
4382           if (code == CHARSET_INVALID_CODE (charset))
4383             abort ();
4384           if (charset == charset_big5)
4385             {
4386               int c1, c2;
4387
4388               c1 = code >> 8, c2 = code & 0xFF;
4389               EMIT_TWO_BYTES (c1, c2);
4390             }
4391           else
4392             EMIT_ONE_ASCII_BYTE (code & 0x7F);
4393         }
4394     }
4395   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4396   coding->produced_char += produced_chars;
4397   coding->produced = dst - coding->destination;
4398   return 0;
4399 }
4400
4401 \f
/*** 9. CCL handlers ***/
4403
4404 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4405    Check if a text is encoded in a coding system of which
4406    encoder/decoder are written in CCL program.  If it is, return
4407    CATEGORY_MASK_CCL, else return 0.  */
4408
4409 static int
4410 detect_coding_ccl (coding, detect_info)
4411      struct coding_system *coding;
4412      struct coding_detection_info *detect_info;
4413 {
4414   const unsigned char *src = coding->source, *src_base;
4415   const unsigned char *src_end = coding->source + coding->src_bytes;
4416   int multibytep = coding->src_multibyte;
4417   int consumed_chars = 0;
4418   int found = 0;
4419   unsigned char *valids;
4420   int head_ascii = coding->head_ascii;
4421   Lisp_Object attrs;
4422
4423   detect_info->checked |= CATEGORY_MASK_CCL;
4424
4425   coding = &coding_categories[coding_category_ccl];
4426   valids = CODING_CCL_VALIDS (coding);
4427   attrs = CODING_ID_ATTRS (coding->id);
4428   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
4429     src += head_ascii;
4430
4431   while (1)
4432     {
4433       int c;
4434
4435       src_base = src;
4436       ONE_MORE_BYTE (c);
4437       if (c < 0 || ! valids[c])
4438         break;
4439       if ((valids[c] > 1))
4440         found = CATEGORY_MASK_CCL;
4441     }
4442   detect_info->rejected |= CATEGORY_MASK_CCL;
4443   return 0;
4444
4445  no_more_source:
4446   detect_info->found |= found;
4447   return 1;
4448 }
4449
4450 static void
4451 decode_coding_ccl (coding)
4452      struct coding_system *coding;
4453 {
4454   const unsigned char *src = coding->source + coding->consumed;
4455   const unsigned char *src_end = coding->source + coding->src_bytes;
4456   int *charbuf = coding->charbuf + coding->charbuf_used;
4457   int *charbuf_end = coding->charbuf + coding->charbuf_size;
4458   int consumed_chars = 0;
4459   int multibytep = coding->src_multibyte;
4460   struct ccl_program ccl;
4461   int source_charbuf[1024];
4462   int source_byteidx[1024];
4463   Lisp_Object attrs, charset_list;
4464
4465   CODING_GET_INFO (coding, attrs, charset_list);
4466   setup_ccl_program (&ccl, CODING_CCL_DECODER (coding));
4467
4468   while (src < src_end)
4469     {
4470       const unsigned char *p = src;
4471       int *source, *source_end;
4472       int i = 0;
4473
4474       if (multibytep)
4475         while (i < 1024 && p < src_end)
4476           {
4477             source_byteidx[i] = p - src;
4478             source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
4479           }
4480       else
4481         while (i < 1024 && p < src_end)
4482           source_charbuf[i++] = *p++;
4483
4484       if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
4485         ccl.last_block = 1;
4486
4487       source = source_charbuf;
4488       source_end = source + i;
4489       while (source < source_end)
4490         {
4491           ccl_driver (&ccl, source, charbuf,
4492                       source_end - source, charbuf_end - charbuf,
4493                       charset_list);
4494           source += ccl.consumed;
4495           charbuf += ccl.produced;
4496           if (ccl.status != CCL_STAT_SUSPEND_BY_DST)
4497             break;
4498         }
4499       if (source < source_end)
4500         src += source_byteidx[source - source_charbuf];
4501       else
4502         src = p;
4503       consumed_chars += source - source_charbuf;
4504
4505       if (ccl.status != CCL_STAT_SUSPEND_BY_SRC
4506           && ccl.status != CODING_RESULT_INSUFFICIENT_SRC)
4507         break;
4508     }
4509
4510   switch (ccl.status)
4511     {
4512     case CCL_STAT_SUSPEND_BY_SRC:
4513       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
4514       break;
4515     case CCL_STAT_SUSPEND_BY_DST:
4516       break;
4517     case CCL_STAT_QUIT:
4518     case CCL_STAT_INVALID_CMD:
4519       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
4520       break;
4521     default:
4522       record_conversion_result (coding, CODING_RESULT_SUCCESS);
4523       break;
4524     }
4525   coding->consumed_char += consumed_chars;
4526   coding->consumed = src - coding->source;
4527   coding->charbuf_used = charbuf - coding->charbuf;
4528 }
4529
4530 static int
4531 encode_coding_ccl (coding)
4532      struct coding_system *coding;
4533 {
4534   struct ccl_program ccl;
4535   int multibytep = coding->dst_multibyte;
4536   int *charbuf = coding->charbuf;
4537   int *charbuf_end = charbuf + coding->charbuf_used;
4538   unsigned char *dst = coding->destination + coding->produced;
4539   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4540   unsigned char *adjusted_dst_end = dst_end - 1;
4541   int destination_charbuf[1024];
4542   int i, produced_chars = 0;
4543   Lisp_Object attrs, charset_list;
4544
4545   CODING_GET_INFO (coding, attrs, charset_list);
4546   setup_ccl_program (&ccl, CODING_CCL_ENCODER (coding));
4547
4548   ccl.last_block = coding->mode & CODING_MODE_LAST_BLOCK;
4549   ccl.dst_multibyte = coding->dst_multibyte;
4550
4551   while (charbuf < charbuf_end && dst < adjusted_dst_end)
4552     {
4553       int dst_bytes = dst_end - dst;
4554       if (dst_bytes > 1024)
4555         dst_bytes = 1024;
4556
4557       ccl_driver (&ccl, charbuf, destination_charbuf,
4558                   charbuf_end - charbuf, dst_bytes, charset_list);
4559       charbuf += ccl.consumed;
4560       if (multibytep)
4561         for (i = 0; i < ccl.produced; i++)
4562           EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
4563       else
4564         {
4565           for (i = 0; i < ccl.produced; i++)
4566             *dst++ = destination_charbuf[i] & 0xFF;
4567           produced_chars += ccl.produced;
4568         }
4569     }
4570
4571   switch (ccl.status)
4572     {
4573     case CCL_STAT_SUSPEND_BY_SRC:
4574       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
4575       break;
4576     case CCL_STAT_SUSPEND_BY_DST:
4577       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
4578       break;
4579     case CCL_STAT_QUIT:
4580     case CCL_STAT_INVALID_CMD:
4581       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
4582       break;
4583     default:
4584       record_conversion_result (coding, CODING_RESULT_SUCCESS);
4585       break;
4586     }
4587
4588   coding->produced_char += produced_chars;
4589   coding->produced = dst - coding->destination;
4590   return 0;
4591 }
4592
4593
4594 \f
4595 /*** 10, 11. no-conversion handlers ***/
4596
4597 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
4598
4599 static void
4600 decode_coding_raw_text (coding)
4601      struct coding_system *coding;
4602 {
4603   coding->chars_at_source = 1;
4604   coding->consumed_char = 0;
4605   coding->consumed = 0;
4606   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4607 }
4608
4609 static int
4610 encode_coding_raw_text (coding)
4611      struct coding_system *coding;
4612 {
4613   int multibytep = coding->dst_multibyte;
4614   int *charbuf = coding->charbuf;
4615   int *charbuf_end = coding->charbuf + coding->charbuf_used;
4616   unsigned char *dst = coding->destination + coding->produced;
4617   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4618   int produced_chars = 0;
4619   int c;
4620
4621   if (multibytep)
4622     {
4623       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
4624
4625       if (coding->src_multibyte)
4626         while (charbuf < charbuf_end)
4627           {
4628             ASSURE_DESTINATION (safe_room);
4629             c = *charbuf++;
4630             if (ASCII_CHAR_P (c))
4631               EMIT_ONE_ASCII_BYTE (c);
4632             else if (CHAR_BYTE8_P (c))
4633               {
4634                 c = CHAR_TO_BYTE8 (c);
4635                 EMIT_ONE_BYTE (c);
4636               }
4637             else
4638               {
4639                 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
4640
4641                 CHAR_STRING_ADVANCE (c, p1);
4642                 while (p0 < p1)
4643                   {
4644                     EMIT_ONE_BYTE (*p0);
4645                     p0++;
4646                   }
4647               }
4648           }
4649       else
4650         while (charbuf < charbuf_end)
4651           {
4652             ASSURE_DESTINATION (safe_room);
4653             c = *charbuf++;
4654             EMIT_ONE_BYTE (c);
4655           }
4656     }
4657   else
4658     {
4659       if (coding->src_multibyte)
4660         {
4661           int safe_room = MAX_MULTIBYTE_LENGTH;
4662
4663           while (charbuf < charbuf_end)
4664             {
4665               ASSURE_DESTINATION (safe_room);
4666               c = *charbuf++;
4667               if (ASCII_CHAR_P (c))
4668                 *dst++ = c;
4669               else if (CHAR_BYTE8_P (c))
4670                 *dst++ = CHAR_TO_BYTE8 (c);
4671               else
4672                 CHAR_STRING_ADVANCE (c, dst);
4673               produced_chars++;
4674             }
4675         }
4676       else
4677         {
4678           ASSURE_DESTINATION (charbuf_end - charbuf);
4679           while (charbuf < charbuf_end && dst < dst_end)
4680             *dst++ = *charbuf++;
4681           produced_chars = dst - (coding->destination + coding->dst_bytes);
4682         }
4683     }
4684   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4685   coding->produced_char += produced_chars;
4686   coding->produced = dst - coding->destination;
4687   return 0;
4688 }
4689
4690 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4691    Check if a text is encoded in a charset-based coding system.  If it
4692    is, return 1, else return 0.  */
4693
4694 static int
4695 detect_coding_charset (coding, detect_info)
4696      struct coding_system *coding;
4697      struct coding_detection_info *detect_info;
4698 {
4699   const unsigned char *src = coding->source, *src_base;
4700   const unsigned char *src_end = coding->source + coding->src_bytes;
4701   int multibytep = coding->src_multibyte;
4702   int consumed_chars = 0;
4703   Lisp_Object attrs, valids;
4704   int found = 0;
4705
4706   detect_info->checked |= CATEGORY_MASK_CHARSET;
4707
4708   coding = &coding_categories[coding_category_charset];
4709   attrs = CODING_ID_ATTRS (coding->id);
4710   valids = AREF (attrs, coding_attr_charset_valids);
4711
4712   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
4713     src += coding->head_ascii;
4714
4715   while (1)
4716     {
4717       int c;
4718
4719       src_base = src;
4720       ONE_MORE_BYTE (c);
4721       if (c < 0)
4722         continue;
4723       if (NILP (AREF (valids, c)))
4724         break;
4725       if (c >= 0x80)
4726         found = CATEGORY_MASK_CHARSET;
4727     }
4728   detect_info->rejected |= CATEGORY_MASK_CHARSET;
4729   return 0;
4730
4731  no_more_source:
4732   detect_info->found |= found;
4733   return 1;
4734 }
4735
4736 static void
4737 decode_coding_charset (coding)
4738      struct coding_system *coding;
4739 {
4740   const unsigned char *src = coding->source + coding->consumed;
4741   const unsigned char *src_end = coding->source + coding->src_bytes;
4742   const unsigned char *src_base;
4743   int *charbuf = coding->charbuf + coding->charbuf_used;
4744   int *charbuf_end
4745     = coding->charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
4746   int consumed_chars = 0, consumed_chars_base;
4747   int multibytep = coding->src_multibyte;
4748   Lisp_Object attrs, charset_list, valids;
4749   int char_offset = coding->produced_char;
4750   int last_offset = char_offset;
4751   int last_id = charset_ascii;
4752
4753   CODING_GET_INFO (coding, attrs, charset_list);
4754   valids = AREF (attrs, coding_attr_charset_valids);
4755
4756   while (1)
4757     {
4758       int c;
4759       Lisp_Object val;
4760       struct charset *charset;
4761       int dim;
4762       int len = 1;
4763       unsigned code;
4764
4765       src_base = src;
4766       consumed_chars_base = consumed_chars;
4767
4768       if (charbuf >= charbuf_end)
4769         break;
4770
4771       ONE_MORE_BYTE (c);
4772       if (c < 0)
4773         goto invalid_code;
4774       code = c;
4775
4776       val = AREF (valids, c);
4777       if (NILP (val))
4778         goto invalid_code;
4779       if (INTEGERP (val))
4780         {
4781           charset = CHARSET_FROM_ID (XFASTINT (val));
4782           dim = CHARSET_DIMENSION (charset);
4783           while (len < dim)
4784             {
4785               ONE_MORE_BYTE (c);
4786               code = (code << 8) | c;
4787               len++;
4788             }
4789           CODING_DECODE_CHAR (coding, src, src_base, src_end,
4790                               charset, code, c);
4791         }
4792       else
4793         {
4794           /* VAL is a list of charset IDs.  It is assured that the
4795              list is sorted by charset dimensions (smaller one
4796              comes first).  */
4797           while (CONSP (val))
4798             {
4799               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
4800               dim = CHARSET_DIMENSION (charset);
4801               while (len < dim)
4802                 {
4803                   ONE_MORE_BYTE (c);
4804                   code = (code << 8) | c;
4805                   len++;
4806                 }
4807               CODING_DECODE_CHAR (coding, src, src_base,
4808                                   src_end, charset, code, c);
4809               if (c >= 0)
4810                 break;
4811               val = XCDR (val);
4812             }
4813         }
4814       if (c < 0)
4815         goto invalid_code;
4816       if (charset->id != charset_ascii
4817           && last_id != charset->id)
4818         {
4819           if (last_id != charset_ascii)
4820             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4821           last_id = charset->id;
4822           last_offset = char_offset;
4823         }
4824
4825       *charbuf++ = c;
4826       char_offset++;
4827       continue;
4828
4829     invalid_code:
4830       src = src_base;
4831       consumed_chars = consumed_chars_base;
4832       ONE_MORE_BYTE (c);
4833       *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
4834       char_offset++;
4835       coding->errors++;
4836     }
4837
4838  no_more_source:
4839   if (last_id != charset_ascii)
4840     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4841   coding->consumed_char += consumed_chars_base;
4842   coding->consumed = src_base - coding->source;
4843   coding->charbuf_used = charbuf - coding->charbuf;
4844 }
4845
4846 static int
4847 encode_coding_charset (coding)
4848      struct coding_system *coding;
4849 {
4850   int multibytep = coding->dst_multibyte;
4851   int *charbuf = coding->charbuf;
4852   int *charbuf_end = charbuf + coding->charbuf_used;
4853   unsigned char *dst = coding->destination + coding->produced;
4854   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4855   int safe_room = MAX_MULTIBYTE_LENGTH;
4856   int produced_chars = 0;
4857   Lisp_Object attrs, charset_list;
4858   int ascii_compatible;
4859   int c;
4860
4861   CODING_GET_INFO (coding, attrs, charset_list);
4862   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4863
4864   while (charbuf < charbuf_end)
4865     {
4866       struct charset *charset;
4867       unsigned code;
4868
4869       ASSURE_DESTINATION (safe_room);
4870       c = *charbuf++;
4871       if (ascii_compatible && ASCII_CHAR_P (c))
4872         EMIT_ONE_ASCII_BYTE (c);
4873       else if (CHAR_BYTE8_P (c))
4874         {
4875           c = CHAR_TO_BYTE8 (c);
4876           EMIT_ONE_BYTE (c);
4877         }
4878       else
4879         {
4880           charset = char_charset (c, charset_list, &code);
4881           if (charset)
4882             {
4883               if (CHARSET_DIMENSION (charset) == 1)
4884                 EMIT_ONE_BYTE (code);
4885               else if (CHARSET_DIMENSION (charset) == 2)
4886                 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
4887               else if (CHARSET_DIMENSION (charset) == 3)
4888                 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
4889               else
4890                 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
4891                                  (code >> 8) & 0xFF, code & 0xFF);
4892             }
4893           else
4894             {
4895               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4896                 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4897               else
4898                 c = coding->default_char;
4899               EMIT_ONE_BYTE (c);
4900             }
4901         }
4902     }
4903
4904   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4905   coding->produced_char += produced_chars;
4906   coding->produced = dst - coding->destination;
4907   return 0;
4908 }
4909
4910 \f
4911 /*** 10. C library functions ***/
4912
4913 /* Setup coding context CODING from information about CODING_SYSTEM.
4914    If CODING_SYSTEM is nil, `no-conversion' is assumed.  If
4915    CODING_SYSTEM is invalid, signal an error.  */
4916
4917 void
4918 setup_coding_system (coding_system, coding)
4919      Lisp_Object coding_system;
4920      struct coding_system *coding;
4921 {
4922   Lisp_Object attrs;
4923   Lisp_Object eol_type;
4924   Lisp_Object coding_type;
4925   Lisp_Object val;
4926
4927   if (NILP (coding_system))
4928     coding_system = Qundecided;
4929
4930   CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
4931
4932   attrs = CODING_ID_ATTRS (coding->id);
4933   eol_type = CODING_ID_EOL_TYPE (coding->id);
4934
4935   coding->mode = 0;
4936   coding->head_ascii = -1;
4937   coding->common_flags
4938     = (VECTORP (eol_type) ? CODING_REQUIRE_DETECTION_MASK : 0);
4939   if (! NILP (CODING_ATTR_POST_READ (attrs)))
4940     coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
4941   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
4942     coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
4943   if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
4944     coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
4945
4946   val = CODING_ATTR_SAFE_CHARSETS (attrs);
4947   coding->max_charset_id = SCHARS (val) - 1;
4948   coding->safe_charsets = (char *) SDATA (val);
4949   coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
4950
4951   coding_type = CODING_ATTR_TYPE (attrs);
4952   if (EQ (coding_type, Qundecided))
4953     {
4954       coding->detector = NULL;
4955       coding->decoder = decode_coding_raw_text;
4956       coding->encoder = encode_coding_raw_text;
4957       coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
4958     }
4959   else if (EQ (coding_type, Qiso_2022))
4960     {
4961       int i;
4962       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
4963
4964       /* Invoke graphic register 0 to plane 0.  */
4965       CODING_ISO_INVOCATION (coding, 0) = 0;
4966       /* Invoke graphic register 1 to plane 1 if we can use 8-bit.  */
4967       CODING_ISO_INVOCATION (coding, 1)
4968         = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
4969       /* Setup the initial status of designation.  */
4970       for (i = 0; i < 4; i++)
4971         CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
4972       /* Not single shifting initially.  */
4973       CODING_ISO_SINGLE_SHIFTING (coding) = 0;
4974       /* Beginning of buffer should also be regarded as bol. */
4975       CODING_ISO_BOL (coding) = 1;
4976       coding->detector = detect_coding_iso_2022;
4977       coding->decoder = decode_coding_iso_2022;
4978       coding->encoder = encode_coding_iso_2022;
4979       if (flags & CODING_ISO_FLAG_SAFE)
4980         coding->mode |= CODING_MODE_SAFE_ENCODING;
4981       coding->common_flags
4982         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
4983             | CODING_REQUIRE_FLUSHING_MASK);
4984       if (flags & CODING_ISO_FLAG_COMPOSITION)
4985         coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
4986       if (flags & CODING_ISO_FLAG_DESIGNATION)
4987         coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
4988       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
4989         {
4990           setup_iso_safe_charsets (attrs);
4991           val = CODING_ATTR_SAFE_CHARSETS (attrs);
4992           coding->max_charset_id = SCHARS (val) - 1;
4993           coding->safe_charsets = (char *) SDATA (val);
4994         }
4995       CODING_ISO_FLAGS (coding) = flags;
4996     }
4997   else if (EQ (coding_type, Qcharset))
4998     {
4999       coding->detector = detect_coding_charset;
5000       coding->decoder = decode_coding_charset;
5001       coding->encoder = encode_coding_charset;
5002       coding->common_flags
5003         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5004     }
5005   else if (EQ (coding_type, Qutf_8))
5006     {
5007       coding->detector = detect_coding_utf_8;
5008       coding->decoder = decode_coding_utf_8;
5009       coding->encoder = encode_coding_utf_8;
5010       coding->common_flags
5011         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5012     }
5013   else if (EQ (coding_type, Qutf_16))
5014     {
5015       val = AREF (attrs, coding_attr_utf_16_bom);
5016       CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_16_detect_bom
5017                                     : EQ (val, Qt) ? utf_16_with_bom
5018                                     : utf_16_without_bom);
5019       val = AREF (attrs, coding_attr_utf_16_endian);
5020       CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
5021                                        : utf_16_little_endian);
5022       CODING_UTF_16_SURROGATE (coding) = 0;
5023       coding->detector = detect_coding_utf_16;
5024       coding->decoder = decode_coding_utf_16;
5025       coding->encoder = encode_coding_utf_16;
5026       coding->common_flags
5027         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5028       if (CODING_UTF_16_BOM (coding) == utf_16_detect_bom)
5029         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5030     }
5031   else if (EQ (coding_type, Qccl))
5032     {
5033       coding->detector = detect_coding_ccl;
5034       coding->decoder = decode_coding_ccl;
5035       coding->encoder = encode_coding_ccl;
5036       coding->common_flags
5037         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5038             | CODING_REQUIRE_FLUSHING_MASK);
5039     }
5040   else if (EQ (coding_type, Qemacs_mule))
5041     {
5042       coding->detector = detect_coding_emacs_mule;
5043       coding->decoder = decode_coding_emacs_mule;
5044       coding->encoder = encode_coding_emacs_mule;
5045       coding->common_flags
5046         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5047       if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
5048           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
5049         {
5050           Lisp_Object tail, safe_charsets;
5051           int max_charset_id = 0;
5052
5053           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5054                tail = XCDR (tail))
5055             if (max_charset_id < XFASTINT (XCAR (tail)))
5056               max_charset_id = XFASTINT (XCAR (tail));
5057           safe_charsets = Fmake_string (make_number (max_charset_id + 1),
5058                                         make_number (255));
5059           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5060                tail = XCDR (tail))
5061             SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
5062           coding->max_charset_id = max_charset_id;
5063           coding->safe_charsets = (char *) SDATA (safe_charsets);
5064         }
5065     }
5066   else if (EQ (coding_type, Qshift_jis))
5067     {
5068       coding->detector = detect_coding_sjis;
5069       coding->decoder = decode_coding_sjis;
5070       coding->encoder = encode_coding_sjis;
5071       coding->common_flags
5072         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5073     }
5074   else if (EQ (coding_type, Qbig5))
5075     {
5076       coding->detector = detect_coding_big5;
5077       coding->decoder = decode_coding_big5;
5078       coding->encoder = encode_coding_big5;
5079       coding->common_flags
5080         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5081     }
5082   else                          /* EQ (coding_type, Qraw_text) */
5083     {
5084       coding->detector = NULL;
5085       coding->decoder = decode_coding_raw_text;
5086       coding->encoder = encode_coding_raw_text;
5087       if (! EQ (eol_type, Qunix))
5088         {
5089           coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5090           if (! VECTORP (eol_type))
5091             coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5092         }
5093
5094     }
5095
5096   return;
5097 }
5098
5099 /* Return a list of charsets supported by CODING.  */
5100
5101 Lisp_Object
5102 coding_charset_list (coding)
5103      struct coding_system *coding;
5104 {
5105   Lisp_Object attrs, charset_list;
5106
5107   CODING_GET_INFO (coding, attrs, charset_list);
5108   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5109     {
5110       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5111
5112       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5113         charset_list = Viso_2022_charset_list;
5114     }
5115   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5116     {
5117       charset_list = Vemacs_mule_charset_list;
5118     }
5119   return charset_list;
5120 }
5121
5122
5123 /* Return raw-text or one of its subsidiaries that has the same
5124    eol_type as CODING-SYSTEM.  */
5125
5126 Lisp_Object
5127 raw_text_coding_system (coding_system)
5128      Lisp_Object coding_system;
5129 {
5130   Lisp_Object spec, attrs;
5131   Lisp_Object eol_type, raw_text_eol_type;
5132
5133   if (NILP (coding_system))
5134     return Qraw_text;
5135   spec = CODING_SYSTEM_SPEC (coding_system);
5136   attrs = AREF (spec, 0);
5137
5138   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
5139     return coding_system;
5140
5141   eol_type = AREF (spec, 2);
5142   if (VECTORP (eol_type))
5143     return Qraw_text;
5144   spec = CODING_SYSTEM_SPEC (Qraw_text);
5145   raw_text_eol_type = AREF (spec, 2);
5146   return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
5147           : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
5148           : AREF (raw_text_eol_type, 2));
5149 }
5150
5151
5152 /* If CODING_SYSTEM doesn't specify end-of-line format but PARENT
5153    does, return one of the subsidiary that has the same eol-spec as
5154    PARENT.  Otherwise, return CODING_SYSTEM.  */
5155
5156 Lisp_Object
5157 coding_inherit_eol_type (coding_system, parent)
5158      Lisp_Object coding_system, parent;
5159 {
5160   Lisp_Object spec, eol_type;
5161
5162   if (NILP (coding_system))
5163     coding_system = Qraw_text;
5164   spec = CODING_SYSTEM_SPEC (coding_system);
5165   eol_type = AREF (spec, 2);
5166   if (VECTORP (eol_type)
5167       && ! NILP (parent))
5168     {
5169       Lisp_Object parent_spec;
5170       Lisp_Object parent_eol_type;
5171
5172       parent_spec
5173         = CODING_SYSTEM_SPEC (buffer_defaults.buffer_file_coding_system);
5174       parent_eol_type = AREF (parent_spec, 2);
5175       if (EQ (parent_eol_type, Qunix))
5176         coding_system = AREF (eol_type, 0);
5177       else if (EQ (parent_eol_type, Qdos))
5178         coding_system = AREF (eol_type, 1);
5179       else if (EQ (parent_eol_type, Qmac))
5180         coding_system = AREF (eol_type, 2);
5181     }
5182   return coding_system;
5183 }
5184
5185 /* Emacs has a mechanism to automatically detect a coding system if it
5186    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
5187    it's impossible to distinguish some coding systems accurately
5188    because they use the same range of codes.  So, at first, coding
5189    systems are grouped into the following categories:
5190
5191    o coding-category-emacs-mule
5192
5193         The category for a coding system which has the same code range
5194         as Emacs' internal format.  Assigned the coding-system (Lisp
5195         symbol) `emacs-mule' by default.
5196
5197    o coding-category-sjis
5198
5199         The category for a coding system which has the same code range
5200         as SJIS.  Assigned the coding-system (Lisp
5201         symbol) `japanese-shift-jis' by default.
5202
5203    o coding-category-iso-7
5204
5205         The category for a coding system which has the same code range
5206         as ISO2022 of 7-bit environment.  This doesn't use any locking
5207         shift and single shift functions.  This can encode/decode all
5208         charsets.  Assigned the coding-system (Lisp symbol)
5209         `iso-2022-7bit' by default.
5210
5211    o coding-category-iso-7-tight
5212
5213         Same as coding-category-iso-7 except that this can
5214         encode/decode only the specified charsets.
5215
5216    o coding-category-iso-8-1
5217
5218         The category for a coding system which has the same code range
5219         as ISO2022 of 8-bit environment and graphic plane 1 used only
5220         for DIMENSION1 charset.  This doesn't use any locking shift
5221         and single shift functions.  Assigned the coding-system (Lisp
5222         symbol) `iso-latin-1' by default.
5223
5224    o coding-category-iso-8-2
5225
5226         The category for a coding system which has the same code range
5227         as ISO2022 of 8-bit environment and graphic plane 1 used only
5228         for DIMENSION2 charset.  This doesn't use any locking shift
5229         and single shift functions.  Assigned the coding-system (Lisp
5230         symbol) `japanese-iso-8bit' by default.
5231
5232    o coding-category-iso-7-else
5233
5234         The category for a coding system which has the same code range
5235         as ISO2022 of 7-bit environment but uses locking shift or
5236         single shift functions.  Assigned the coding-system (Lisp
5237         symbol) `iso-2022-7bit-lock' by default.
5238
5239    o coding-category-iso-8-else
5240
5241         The category for a coding system which has the same code range
5242         as ISO2022 of 8-bit environment but uses locking shift or
5243         single shift functions.  Assigned the coding-system (Lisp
5244         symbol) `iso-2022-8bit-ss2' by default.
5245
5246    o coding-category-big5
5247
5248         The category for a coding system which has the same code range
5249         as BIG5.  Assigned the coding-system (Lisp symbol)
5250         `cn-big5' by default.
5251
5252    o coding-category-utf-8
5253
5254         The category for a coding system which has the same code range
5255         as UTF-8 (cf. RFC3629).  Assigned the coding-system (Lisp
5256         symbol) `utf-8' by default.
5257
5258    o coding-category-utf-16-be
5259
5260         The category for a coding system in which a text has an
5261         Unicode signature (cf. Unicode Standard) in the order of BIG
5262         endian at the head.  Assigned the coding-system (Lisp symbol)
5263         `utf-16-be' by default.
5264
5265    o coding-category-utf-16-le
5266
5267         The category for a coding system in which a text has an
5268         Unicode signature (cf. Unicode Standard) in the order of
5269         LITTLE endian at the head.  Assigned the coding-system (Lisp
5270         symbol) `utf-16-le' by default.
5271
5272    o coding-category-ccl
5273
5274         The category for a coding system of which encoder/decoder is
5275         written in CCL programs.  The default value is nil, i.e., no
5276         coding system is assigned.
5277
5278    o coding-category-binary
5279
5280         The category for a coding system not categorized in any of the
5281         above.  Assigned the coding-system (Lisp symbol)
5282         `no-conversion' by default.
5283
5284    Each of them is a Lisp symbol and the value is an actual
5285    `coding-system's (this is also a Lisp symbol) assigned by a user.
5286    What Emacs does actually is to detect a category of coding system.
5287    Then, it uses a `coding-system' assigned to it.  If Emacs can't
5288    decide only one possible category, it selects a category of the
5289    highest priority.  Priorities of categories are also specified by a
5290    user in a Lisp variable `coding-category-list'.
5291
5292 */
5293
5294 #define EOL_SEEN_NONE   0
5295 #define EOL_SEEN_LF     1
5296 #define EOL_SEEN_CR     2
5297 #define EOL_SEEN_CRLF   4
5298
5299 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
5300    SOURCE is encoded.  If CATEGORY is one of
5301    coding_category_utf_16_XXXX, assume that CR and LF are encoded by
5302    two-byte, else they are encoded by one-byte.
5303
5304    Return one of EOL_SEEN_XXX.  */
5305
5306 #define MAX_EOL_CHECK_COUNT 3
5307
5308 static int
5309 detect_eol (source, src_bytes, category)
5310      const unsigned char *source;
5311      EMACS_INT src_bytes;
5312      enum coding_category category;
5313 {
5314   const unsigned char *src = source, *src_end = src + src_bytes;
5315   unsigned char c;
5316   int total  = 0;
5317   int eol_seen = EOL_SEEN_NONE;
5318
5319   if ((1 << category) & CATEGORY_MASK_UTF_16)
5320     {
5321       int msb, lsb;
5322
5323       msb = category == (coding_category_utf_16_le
5324                          | coding_category_utf_16_le_nosig);
5325       lsb = 1 - msb;
5326
5327       while (src + 1 < src_end)
5328         {
5329           c = src[lsb];
5330           if (src[msb] == 0 && (c == '\n' || c == '\r'))
5331             {
5332               int this_eol;
5333
5334               if (c == '\n')
5335                 this_eol = EOL_SEEN_LF;
5336               else if (src + 3 >= src_end
5337                        || src[msb + 2] != 0
5338                        || src[lsb + 2] != '\n')
5339                 this_eol = EOL_SEEN_CR;
5340               else
5341                 this_eol = EOL_SEEN_CRLF;
5342
5343               if (eol_seen == EOL_SEEN_NONE)
5344                 /* This is the first end-of-line.  */
5345                 eol_seen = this_eol;
5346               else if (eol_seen != this_eol)
5347                 {
5348                   /* The found type is different from what found before.  */
5349                   eol_seen = EOL_SEEN_LF;
5350                   break;
5351                 }
5352               if (++total == MAX_EOL_CHECK_COUNT)
5353                 break;
5354             }
5355           src += 2;
5356         }
5357     }
5358   else
5359     {
5360       while (src < src_end)
5361         {
5362           c = *src++;
5363           if (c == '\n' || c == '\r')
5364             {
5365               int this_eol;
5366
5367               if (c == '\n')
5368                 this_eol = EOL_SEEN_LF;
5369               else if (src >= src_end || *src != '\n')
5370                 this_eol = EOL_SEEN_CR;
5371               else
5372                 this_eol = EOL_SEEN_CRLF, src++;
5373
5374               if (eol_seen == EOL_SEEN_NONE)
5375                 /* This is the first end-of-line.  */
5376                 eol_seen = this_eol;
5377               else if (eol_seen != this_eol)
5378                 {
5379                   /* The found type is different from what found before.  */
5380                   eol_seen = EOL_SEEN_LF;
5381                   break;
5382                 }
5383               if (++total == MAX_EOL_CHECK_COUNT)
5384                 break;
5385             }
5386         }
5387     }
5388   return eol_seen;
5389 }
5390
5391
5392 static Lisp_Object
5393 adjust_coding_eol_type (coding, eol_seen)
5394      struct coding_system *coding;
5395      int eol_seen;
5396 {
5397   Lisp_Object eol_type;
5398
5399   eol_type = CODING_ID_EOL_TYPE (coding->id);
5400   if (eol_seen & EOL_SEEN_LF)
5401     {
5402       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
5403       eol_type = Qunix;
5404     }
5405   else if (eol_seen & EOL_SEEN_CRLF)
5406     {
5407       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
5408       eol_type = Qdos;
5409     }
5410   else if (eol_seen & EOL_SEEN_CR)
5411     {
5412       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
5413       eol_type = Qmac;
5414     }
5415   return eol_type;
5416 }
5417
5418 /* Detect how a text specified in CODING is encoded.  If a coding
5419    system is detected, update fields of CODING by the detected coding
5420    system.  */
5421
5422 void
5423 detect_coding (coding)
5424      struct coding_system *coding;
5425 {
5426   const unsigned char *src, *src_end;
5427
5428   coding->consumed = coding->consumed_char = 0;
5429   coding->produced = coding->produced_char = 0;
5430   coding_set_source (coding);
5431
5432   src_end = coding->source + coding->src_bytes;
5433
5434   /* If we have not yet decided the text encoding type, detect it
5435      now.  */
5436   if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
5437     {
5438       int c, i;
5439       struct coding_detection_info detect_info;
5440
5441       detect_info.checked = detect_info.found = detect_info.rejected = 0;
5442       for (i = 0, src = coding->source; src < src_end; i++, src++)
5443         {
5444           c = *src;
5445           if (c & 0x80)
5446             break;
5447           if (c < 0x20
5448               && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
5449               && ! inhibit_iso_escape_detection
5450               && ! detect_info.checked)
5451             {
5452               coding->head_ascii = src - (coding->source + coding->consumed);
5453               if (detect_coding_iso_2022 (coding, &detect_info))
5454                 {
5455                   /* We have scanned the whole data.  */
5456                   if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
5457                     /* We didn't find an 8-bit code.  */
5458                     src = src_end;
5459                   break;
5460                 }
5461             }
5462         }
5463       coding->head_ascii = src - (coding->source + coding->consumed);
5464
5465       if (coding->head_ascii < coding->src_bytes
5466           || detect_info.found)
5467         {
5468           enum coding_category category;
5469           struct coding_system *this;
5470
5471           if (coding->head_ascii == coding->src_bytes)
5472             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
5473             for (i = 0; i < coding_category_raw_text; i++)
5474               {
5475                 category = coding_priorities[i];
5476                 this = coding_categories + category;
5477                 if (detect_info.found & (1 << category))
5478                   break;
5479               }
5480           else
5481             for (i = 0; i < coding_category_raw_text; i++)
5482               {
5483                 category = coding_priorities[i];
5484                 this = coding_categories + category;
5485                 if (this->id < 0)
5486                   {
5487                     /* No coding system of this category is defined.  */
5488                     detect_info.rejected |= (1 << category);
5489                   }
5490                 else if (category >= coding_category_raw_text)
5491                   continue;
5492                 else if (detect_info.checked & (1 << category))
5493                   {
5494                     if (detect_info.found & (1 << category))
5495                       break;
5496                   }
5497                 else if ((*(this->detector)) (coding, &detect_info)
5498                          && detect_info.found & (1 << category))
5499                   {
5500                     if (category == coding_category_utf_16_auto)
5501                       {
5502                         if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
5503                           category = coding_category_utf_16_le;
5504                         else
5505                           category = coding_category_utf_16_be;
5506                       }
5507                     break;
5508                   }
5509               }
5510
5511           if (i < coding_category_raw_text)
5512             setup_coding_system (CODING_ID_NAME (this->id), coding);
5513           else if (detect_info.rejected == CATEGORY_MASK_ANY)
5514             setup_coding_system (Qraw_text, coding);
5515           else if (detect_info.rejected)
5516             for (i = 0; i < coding_category_raw_text; i++)
5517               if (! (detect_info.rejected & (1 << coding_priorities[i])))
5518                 {
5519                   this = coding_categories + coding_priorities[i];
5520                   setup_coding_system (CODING_ID_NAME (this->id), coding);
5521                   break;
5522                 }
5523         }
5524     }
5525   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
5526            == coding_category_utf_16_auto)
5527     {
5528       Lisp_Object coding_systems;
5529       struct coding_detection_info detect_info;
5530
5531       coding_systems
5532         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_16_bom);
5533       detect_info.found = detect_info.rejected = 0;
5534       if (CONSP (coding_systems)
5535           && detect_coding_utf_16 (coding, &detect_info))
5536         {
5537           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
5538             setup_coding_system (XCAR (coding_systems), coding);
5539           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
5540             setup_coding_system (XCDR (coding_systems), coding);
5541         }
5542     }
5543 }
5544
5545
5546 static void
5547 decode_eol (coding)
5548      struct coding_system *coding;
5549 {
5550   Lisp_Object eol_type;
5551   unsigned char *p, *pbeg, *pend;
5552
5553   eol_type = CODING_ID_EOL_TYPE (coding->id);
5554   if (EQ (eol_type, Qunix))
5555     return;
5556
5557   if (NILP (coding->dst_object))
5558     pbeg = coding->destination;
5559   else
5560     pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
5561   pend = pbeg + coding->produced;
5562
5563   if (VECTORP (eol_type))
5564     {
5565       int eol_seen = EOL_SEEN_NONE;
5566
5567       for (p = pbeg; p < pend; p++)
5568         {
5569           if (*p == '\n')
5570             eol_seen |= EOL_SEEN_LF;
5571           else if (*p == '\r')
5572             {
5573               if (p + 1 < pend && *(p + 1) == '\n')
5574                 {
5575                   eol_seen |= EOL_SEEN_CRLF;
5576                   p++;
5577                 }
5578               else
5579                 eol_seen |= EOL_SEEN_CR;
5580             }
5581         }
5582       if (eol_seen != EOL_SEEN_NONE
5583           && eol_seen != EOL_SEEN_LF
5584           && eol_seen != EOL_SEEN_CRLF
5585           && eol_seen != EOL_SEEN_CR)
5586         eol_seen = EOL_SEEN_LF;
5587       if (eol_seen != EOL_SEEN_NONE)
5588         eol_type = adjust_coding_eol_type (coding, eol_seen);
5589     }
5590
5591   if (EQ (eol_type, Qmac))
5592     {
5593       for (p = pbeg; p < pend; p++)
5594         if (*p == '\r')
5595           *p = '\n';
5596     }
5597   else if (EQ (eol_type, Qdos))
5598     {
5599       int n = 0;
5600
5601       if (NILP (coding->dst_object))
5602         {
5603           for (p = pend - 2; p >= pbeg; p--)
5604             if (*p == '\r')
5605               {
5606                 safe_bcopy ((char *) (p + 1), (char *) p, pend-- - p - 1);
5607                 n++;
5608               }
5609         }
5610       else
5611         {
5612           for (p = pend - 2; p >= pbeg; p--)
5613             if (*p == '\r')
5614               {
5615                 int pos_byte = coding->dst_pos_byte + (p - pbeg);
5616                 int pos = BYTE_TO_CHAR (pos_byte);
5617
5618                 del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
5619                 n++;
5620               }
5621         }
5622       coding->produced -= n;
5623       coding->produced_char -= n;
5624     }
5625 }
5626
5627
5628 /* Return a translation table (or list of them) from coding system
5629    attribute vector ATTRS for encoding (ENCODEP is nonzero) or
5630    decoding (ENCODEP is zero). */
5631
5632 static Lisp_Object
5633 get_translation_table (attrs, encodep, max_lookup)
5634      Lisp_Object attrs;
5635      int encodep, *max_lookup;
5636 {
5637   Lisp_Object standard, translation_table;
5638   Lisp_Object val;
5639
5640   if (encodep)
5641     translation_table = CODING_ATTR_ENCODE_TBL (attrs),
5642       standard = Vstandard_translation_table_for_encode;
5643   else
5644     translation_table = CODING_ATTR_DECODE_TBL (attrs),
5645       standard = Vstandard_translation_table_for_decode;
5646   if (NILP (translation_table))
5647     translation_table = standard;
5648   else
5649     {
5650       if (SYMBOLP (translation_table))
5651         translation_table = Fget (translation_table, Qtranslation_table);
5652       else if (CONSP (translation_table))
5653         {
5654           translation_table = Fcopy_sequence (translation_table);
5655           for (val = translation_table; CONSP (val); val = XCDR (val))
5656             if (SYMBOLP (XCAR (val)))
5657               XSETCAR (val, Fget (XCAR (val), Qtranslation_table));
5658         }
5659       if (CHAR_TABLE_P (standard))
5660         {
5661           if (CONSP (translation_table))
5662             translation_table = nconc2 (translation_table,
5663                                         Fcons (standard, Qnil));
5664           else
5665             translation_table = Fcons (translation_table,
5666                                        Fcons (standard, Qnil));
5667         }
5668     }
5669
5670   if (max_lookup)
5671     {
5672       *max_lookup = 1;
5673       if (CHAR_TABLE_P (translation_table)
5674           && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (translation_table)) > 1)
5675         {
5676           val = XCHAR_TABLE (translation_table)->extras[1];
5677           if (NATNUMP (val) && *max_lookup < XFASTINT (val))
5678             *max_lookup = XFASTINT (val);
5679         }
5680       else if (CONSP (translation_table))
5681         {
5682           Lisp_Object tail, val;
5683
5684           for (tail = translation_table; CONSP (tail); tail = XCDR (tail))
5685             if (CHAR_TABLE_P (XCAR (tail))
5686                 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (XCAR (tail))) > 1)
5687               {
5688                 val = XCHAR_TABLE (XCAR (tail))->extras[1];
5689                 if (NATNUMP (val) && *max_lookup < XFASTINT (val))
5690                   *max_lookup = XFASTINT (val);
5691               }
5692         }
5693     }
5694   return translation_table;
5695 }
5696
/* Look up character C in TABLE, which is either a char-table or a
   list whose char-table elements are consulted in order (elements
   that are not char-tables are skipped; any other TABLE leaves C
   untouched and TRANS nil).

   When the entry found is itself a character, C is replaced by it
   and TRANS is reset to nil; in a list, the lookup then goes on in
   the following tables with the new C.  When the entry is any other
   non-nil value, it is left in TRANS and, in a list, the search
   stops there.  So on exit TRANS is either nil or a non-character
   entry.

   This is a macro: TABLE and C are evaluated several times, C and
   TRANS must be assignable, and the expansion declares a local
   variable named `tail'.  */

#define LOOKUP_TRANSLATION_TABLE(table, c, trans)               \
  do {                                                          \
    trans = Qnil;                                               \
    if (CHAR_TABLE_P (table))                                   \
      {                                                         \
        trans = CHAR_TABLE_REF (table, c);                      \
        if (CHARACTERP (trans))                                 \
          c = XFASTINT (trans), trans = Qnil;                   \
      }                                                         \
    else if (CONSP (table))                                     \
      {                                                         \
        Lisp_Object tail;                                       \
                                                                \
        for (tail = table; CONSP (tail); tail = XCDR (tail))    \
          if (CHAR_TABLE_P (XCAR (tail)))                       \
            {                                                   \
              trans = CHAR_TABLE_REF (XCAR (tail), c);          \
              if (CHARACTERP (trans))                           \
                c = XFASTINT (trans), trans = Qnil;             \
              else if (! NILP (trans))                          \
                break;                                          \
            }                                                   \
      }                                                         \
  } while (0)
5721
5722
5723 static Lisp_Object
5724 get_translation (val, buf, buf_end, last_block, from_nchars, to_nchars)
5725      Lisp_Object val;
5726      int *buf, *buf_end;
5727      int last_block;
5728      int *from_nchars, *to_nchars;
5729 {
5730   /* VAL is TO or (([FROM-CHAR ...] .  TO) ...) where TO is TO-CHAR or
5731      [TO-CHAR ...].  */
5732   if (CONSP (val))
5733     {
5734       Lisp_Object from, tail;
5735       int i, len;
5736
5737       for (tail = val; CONSP (tail); tail = XCDR (tail))
5738         {
5739           val = XCAR (tail);
5740           from = XCAR (val);
5741           len = ASIZE (from);
5742           for (i = 0; i < len; i++)
5743             {
5744               if (buf + i == buf_end)
5745                 {
5746                   if (! last_block)
5747                     return Qt;
5748                   break;
5749                 }
5750               if (XINT (AREF (from, i)) != buf[i])
5751                 break;
5752             }
5753           if (i == len)
5754             {
5755               val = XCDR (val);
5756               *from_nchars = len;
5757               break;
5758             }
5759         }
5760       if (! CONSP (tail))
5761         return Qnil;
5762     }
5763   if (VECTORP (val))
5764     *buf = XINT (AREF (val, 0)), *to_nchars = ASIZE (val);
5765   else
5766     *buf = XINT (val);
5767   return val;
5768 }
5769
5770
5771 static int
5772 produce_chars (coding, translation_table, last_block)
5773      struct coding_system *coding;
5774      Lisp_Object translation_table;
5775      int last_block;
5776 {
5777   unsigned char *dst = coding->destination + coding->produced;
5778   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5779   int produced;
5780   int produced_chars = 0;
5781   int carryover = 0;
5782
5783   if (! coding->chars_at_source)
5784     {
5785       /* Characters are in coding->charbuf.  */
5786       int *buf = coding->charbuf;
5787       int *buf_end = buf + coding->charbuf_used;
5788
5789       if (BUFFERP (coding->src_object)
5790           && EQ (coding->src_object, coding->dst_object))
5791         dst_end = ((unsigned char *) coding->source) + coding->consumed;
5792
5793       while (buf < buf_end)
5794         {
5795           int c = *buf, i;
5796
5797           if (c >= 0)
5798             {
5799               int from_nchars = 1, to_nchars = 1;
5800               Lisp_Object trans = Qnil;
5801
5802               LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
5803               if (! NILP (trans))
5804                 {
5805                   trans = get_translation (trans, buf, buf_end, last_block,
5806                                            &from_nchars, &to_nchars);
5807                   if (EQ (trans, Qt))
5808                     break;
5809                   c = *buf;
5810                 }
5811
5812               if (dst + MAX_MULTIBYTE_LENGTH * to_nchars > dst_end)
5813                 {
5814                   dst = alloc_destination (coding,
5815                                            buf_end - buf
5816                                            + MAX_MULTIBYTE_LENGTH * to_nchars,
5817                                            dst);
5818                   dst_end = coding->destination + coding->dst_bytes;
5819                 }
5820
5821               for (i = 0; i < to_nchars; i++)
5822                 {
5823                   if (i > 0)
5824                     c = XINT (AREF (trans, i));
5825                   if (coding->dst_multibyte
5826                       || ! CHAR_BYTE8_P (c))
5827                     CHAR_STRING_ADVANCE (c, dst);
5828                   else
5829                     *dst++ = CHAR_TO_BYTE8 (c);
5830                 }
5831               produced_chars += to_nchars;
5832               *buf++ = to_nchars;
5833               while (--from_nchars > 0)
5834                 *buf++ = 0;
5835             }
5836           else
5837             /* This is an annotation datum.  (-C) is the length.  */
5838             buf += -c;
5839         }
5840       carryover = buf_end - buf;
5841     }
5842   else
5843     {
5844       const unsigned char *src = coding->source;
5845       const unsigned char *src_end = src + coding->src_bytes;
5846       Lisp_Object eol_type;
5847
5848       eol_type = CODING_ID_EOL_TYPE (coding->id);
5849
5850       if (coding->src_multibyte != coding->dst_multibyte)
5851         {
5852           if (coding->src_multibyte)
5853             {
5854               int multibytep = 1;
5855               int consumed_chars;
5856
5857               while (1)
5858                 {
5859                   const unsigned char *src_base = src;
5860                   int c;
5861
5862                   ONE_MORE_BYTE (c);
5863                   if (c == '\r')
5864                     {
5865                       if (EQ (eol_type, Qdos))
5866                         {
5867                           if (src == src_end)
5868                             {
5869                               record_conversion_result
5870                                 (coding, CODING_RESULT_INSUFFICIENT_SRC);
5871                               goto no_more_source;
5872                             }
5873                           if (*src == '\n')
5874                             c = *src++;
5875                         }
5876                       else if (EQ (eol_type, Qmac))
5877                         c = '\n';
5878                     }
5879                   if (dst == dst_end)
5880                     {
5881                       coding->consumed = src - coding->source;
5882
5883                     if (EQ (coding->src_object, coding->dst_object))
5884                       dst_end = (unsigned char *) src;
5885                     if (dst == dst_end)
5886                       {
5887                         dst = alloc_destination (coding, src_end - src + 1,
5888                                                  dst);
5889                         dst_end = coding->destination + coding->dst_bytes;
5890                         coding_set_source (coding);
5891                         src = coding->source + coding->consumed;
5892                         src_end = coding->source + coding->src_bytes;
5893                       }
5894                     }
5895                   *dst++ = c;
5896                   produced_chars++;
5897                 }
5898             no_more_source:
5899               ;
5900             }
5901           else
5902             while (src < src_end)
5903               {
5904                 int multibytep = 1;
5905                 int c = *src++;
5906
5907                 if (c == '\r')
5908                   {
5909                     if (EQ (eol_type, Qdos))
5910                       {
5911                         if (src < src_end
5912                             && *src == '\n')
5913                           c = *src++;
5914                       }
5915                     else if (EQ (eol_type, Qmac))
5916                       c = '\n';
5917                   }
5918                 if (dst >= dst_end - 1)
5919                   {
5920                     coding->consumed = src - coding->source;
5921
5922                     if (EQ (coding->src_object, coding->dst_object))
5923                       dst_end = (unsigned char *) src;
5924                     if (dst >= dst_end - 1)
5925                       {
5926                         dst = alloc_destination (coding, src_end - src + 2,
5927                                                  dst);
5928                         dst_end = coding->destination + coding->dst_bytes;
5929                         coding_set_source (coding);
5930                         src = coding->source + coding->consumed;
5931                         src_end = coding->source + coding->src_bytes;
5932                       }
5933                   }
5934                 EMIT_ONE_BYTE (c);
5935               }
5936         }
5937       else
5938         {
5939           if (!EQ (coding->src_object, coding->dst_object))
5940             {
5941               int require = coding->src_bytes - coding->dst_bytes;
5942
5943               if (require > 0)
5944                 {
5945                   EMACS_INT offset = src - coding->source;
5946
5947                   dst = alloc_destination (coding, require, dst);
5948                   coding_set_source (coding);
5949                   src = coding->source + offset;
5950                   src_end = coding->source + coding->src_bytes;
5951                 }
5952             }
5953           produced_chars = coding->src_chars;
5954           while (src < src_end)
5955             {
5956               int c = *src++;
5957
5958               if (c == '\r')
5959                 {
5960                   if (EQ (eol_type, Qdos))
5961                     {
5962                       if (src < src_end
5963                           && *src == '\n')
5964                         c = *src++;
5965                       produced_chars--;
5966                     }
5967                   else if (EQ (eol_type, Qmac))
5968                     c = '\n';
5969                 }
5970               *dst++ = c;
5971             }
5972         }
5973       coding->consumed = coding->src_bytes;
5974       coding->consumed_char = coding->src_chars;
5975     }
5976
5977   produced = dst - (coding->destination + coding->produced);
5978   if (BUFFERP (coding->dst_object))
5979     insert_from_gap (produced_chars, produced);
5980   coding->produced += produced;
5981   coding->produced_char += produced_chars;
5982   return carryover;
5983 }
5984
5985 /* Compose text in CODING->object according to the annotation data at
5986    CHARBUF.  CHARBUF is an array:
5987      [ -LENGTH ANNOTATION_MASK FROM TO METHOD COMP_LEN [ COMPONENTS... ] ]
5988  */
5989
5990 static INLINE void
5991 produce_composition (coding, charbuf, pos)
5992      struct coding_system *coding;
5993      int *charbuf;
5994      EMACS_INT pos;
5995 {
5996   int len;
5997   EMACS_INT to;
5998   enum composition_method method;
5999   Lisp_Object components;
6000
6001   len = -charbuf[0];
6002   to = pos + charbuf[2];
6003   if (to <= pos)
6004     return;
6005   method = (enum composition_method) (charbuf[3]);
6006
6007   if (method == COMPOSITION_RELATIVE)
6008     components = Qnil;
6009   else if (method >= COMPOSITION_WITH_RULE
6010            && method <= COMPOSITION_WITH_RULE_ALTCHARS)
6011     {
6012       Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
6013       int i;
6014
6015       len -= 4;
6016       charbuf += 4;
6017       for (i = 0; i < len; i++)
6018         {
6019           args[i] = make_number (charbuf[i]);
6020           if (args[i] < 0)
6021             return;
6022         }
6023       components = (method == COMPOSITION_WITH_ALTCHARS
6024                     ? Fstring (len, args) : Fvector (len, args));
6025     }
6026   else
6027     return;
6028   compose_text (pos, to, components, Qnil, coding->dst_object);
6029 }
6030
6031
6032 /* Put `charset' property on text in CODING->object according to
6033    the annotation data at CHARBUF.  CHARBUF is an array:
6034      [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ]
6035  */
6036
6037 static INLINE void
6038 produce_charset (coding, charbuf, pos)
6039      struct coding_system *coding;
6040      int *charbuf;
6041      EMACS_INT pos;
6042 {
6043   EMACS_INT from = pos - charbuf[2];
6044   struct charset *charset = CHARSET_FROM_ID (charbuf[3]);
6045
6046   Fput_text_property (make_number (from), make_number (pos),
6047                       Qcharset, CHARSET_NAME (charset),
6048                       coding->dst_object);
6049 }
6050
6051
/* Number of elements first requested for the conversion work area
   by ALLOC_CONVERSION_WORK_AREA.  */
#define CHARBUF_SIZE 0x4000

/* Allocate the work area CODING->charbuf with alloca and record its
   size, in elements, in CODING->charbuf_size.  CHARBUF_SIZE elements
   are requested first; while alloca yields NULL the request is
   halved, as long as it stays above 1024 elements.

   If no area is obtained, CODING_RESULT_INSUFFICIENT_MEM is recorded
   and the macro makes the *calling function* return CODING->result,
   so it may only be used in a function returning that type.  The
   storage comes from alloca, so it belongs to the stack frame of the
   function in which the macro is expanded.  */
#define ALLOC_CONVERSION_WORK_AREA(coding)                              \
  do {                                                                  \
    int size = CHARBUF_SIZE;                                            \
                                                                        \
    (coding)->charbuf = NULL;                                           \
    while (size > 1024)                                                 \
      {                                                                 \
        (coding)->charbuf = (int *) alloca (sizeof (int) * size);       \
        if ((coding)->charbuf)                                          \
          break;                                                        \
        size >>= 1;                                                     \
      }                                                                 \
    if (! (coding)->charbuf)                                            \
      {                                                                 \
        record_conversion_result ((coding),                             \
                                  CODING_RESULT_INSUFFICIENT_MEM);      \
        return (coding)->result;                                        \
      }                                                                 \
    (coding)->charbuf_size = size;                                      \
  } while (0)
6073
6074
6075 static void
6076 produce_annotation (coding, pos)
6077      struct coding_system *coding;
6078      EMACS_INT pos;
6079 {
6080   int *charbuf = coding->charbuf;
6081   int *charbuf_end = charbuf + coding->charbuf_used;
6082
6083   if (NILP (coding->dst_object))
6084     return;
6085
6086   while (charbuf < charbuf_end)
6087     {
6088       if (*charbuf >= 0)
6089         pos += *charbuf++;
6090       else
6091         {
6092           int len = -*charbuf;
6093           switch (charbuf[1])
6094             {
6095             case CODING_ANNOTATE_COMPOSITION_MASK:
6096               produce_composition (coding, charbuf, pos);
6097               break;
6098             case CODING_ANNOTATE_CHARSET_MASK:
6099               produce_charset (coding, charbuf, pos);
6100               break;
6101             default:
6102               abort ();
6103             }
6104           charbuf += len;
6105         }
6106     }
6107 }
6108
6109 /* Decode the data at CODING->src_object into CODING->dst_object.
6110    CODING->src_object is a buffer, a string, or nil.
6111    CODING->dst_object is a buffer.
6112
6113    If CODING->src_object is a buffer, it must be the current buffer.
6114    In this case, if CODING->src_pos is positive, it is a position of
6115    the source text in the buffer, otherwise, the source text is in the
6116    gap area of the buffer, and CODING->src_pos specifies the offset of
6117    the text from GPT (which must be the same as PT).  If this is the
6118    same buffer as CODING->dst_object, CODING->src_pos must be
6119    negative.
6120
6121    If CODING->src_object is a string, CODING->src_pos in an index to
6122    that string.
6123
6124    If CODING->src_object is nil, CODING->source must already point to
6125    the non-relocatable memory area.  In this case, CODING->src_pos is
6126    an offset from CODING->source.
6127
6128    The decoded data is inserted at the current point of the buffer
6129    CODING->dst_object.
6130 */
6131
6132 static int
6133 decode_coding (coding)
6134      struct coding_system *coding;
6135 {
6136   Lisp_Object attrs;
6137   Lisp_Object undo_list;
6138   Lisp_Object translation_table;
6139   int carryover;
6140   int i;
6141
6142   if (BUFFERP (coding->src_object)
6143       && coding->src_pos > 0
6144       && coding->src_pos < GPT
6145       && coding->src_pos + coding->src_chars > GPT)
6146     move_gap_both (coding->src_pos, coding->src_pos_byte);
6147
6148   undo_list = Qt;
6149   if (BUFFERP (coding->dst_object))
6150     {
6151       if (current_buffer != XBUFFER (coding->dst_object))
6152         set_buffer_internal (XBUFFER (coding->dst_object));
6153       if (GPT != PT)
6154         move_gap_both (PT, PT_BYTE);
6155       undo_list = current_buffer->undo_list;
6156       current_buffer->undo_list = Qt;
6157     }
6158
6159   coding->consumed = coding->consumed_char = 0;
6160   coding->produced = coding->produced_char = 0;
6161   coding->chars_at_source = 0;
6162   record_conversion_result (coding, CODING_RESULT_SUCCESS);
6163   coding->errors = 0;
6164
6165   ALLOC_CONVERSION_WORK_AREA (coding);
6166
6167   attrs = CODING_ID_ATTRS (coding->id);
6168   translation_table = get_translation_table (attrs, 0, NULL);
6169
6170   carryover = 0;
6171   do
6172     {
6173       EMACS_INT pos = coding->dst_pos + coding->produced_char;
6174
6175       coding_set_source (coding);
6176       coding->annotated = 0;
6177       coding->charbuf_used = carryover;
6178       (*(coding->decoder)) (coding);
6179       coding_set_destination (coding);
6180       carryover = produce_chars (coding, translation_table, 0);
6181       if (coding->annotated)
6182         produce_annotation (coding, pos);
6183       for (i = 0; i < carryover; i++)
6184         coding->charbuf[i]
6185           = coding->charbuf[coding->charbuf_used - carryover + i];
6186     }
6187   while (coding->consumed < coding->src_bytes
6188          && ! coding->result);
6189
6190   if (carryover > 0)
6191     {
6192       coding_set_destination (coding);
6193       coding->charbuf_used = carryover;
6194       produce_chars (coding, translation_table, 1);
6195     }
6196
6197   coding->carryover_bytes = 0;
6198   if (coding->consumed < coding->src_bytes)
6199     {
6200       int nbytes = coding->src_bytes - coding->consumed;
6201       const unsigned char *src;
6202
6203       coding_set_source (coding);
6204       coding_set_destination (coding);
6205       src = coding->source + coding->consumed;
6206
6207       if (coding->mode & CODING_MODE_LAST_BLOCK)
6208         {
6209           /* Flush out unprocessed data as binary chars.  We are sure
6210              that the number of data is less than the size of
6211              coding->charbuf.  */
6212           coding->charbuf_used = 0;
6213           while (nbytes-- > 0)
6214             {
6215               int c = *src++;
6216
6217               if (c & 0x80)
6218                 c = BYTE8_TO_CHAR (c);
6219               coding->charbuf[coding->charbuf_used++] = c;
6220             }
6221           produce_chars (coding, Qnil, 1);
6222         }
6223       else
6224         {
6225           /* Record unprocessed bytes in coding->carryover.  We are
6226              sure that the number of data is less than the size of
6227              coding->carryover.  */
6228           unsigned char *p = coding->carryover;
6229
6230           coding->carryover_bytes = nbytes;
6231           while (nbytes-- > 0)
6232             *p++ = *src++;
6233         }
6234       coding->consumed = coding->src_bytes;
6235     }
6236
6237   if (BUFFERP (coding->dst_object))
6238     {
6239       current_buffer->undo_list = undo_list;
6240       record_insert (coding->dst_pos, coding->produced_char);
6241     }
6242   if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix))
6243     decode_eol (coding);
6244   return coding->result;
6245 }
6246
6247
6248 /* Extract an annotation datum from a composition starting at POS and
6249    ending before LIMIT of CODING->src_object (buffer or string), store
6250    the data in BUF, set *STOP to a starting position of the next
6251    composition (if any) or to LIMIT, and return the address of the
6252    next element of BUF.
6253
6254    If such an annotation is not found, set *STOP to a starting
6255    position of a composition after POS (if any) or to LIMIT, and
6256    return BUF.  */
6257
6258 static INLINE int *
6259 handle_composition_annotation (pos, limit, coding, buf, stop)
6260      EMACS_INT pos, limit;
6261      struct coding_system *coding;
6262      int *buf;
6263      EMACS_INT *stop;
6264 {
6265   EMACS_INT start, end;
6266   Lisp_Object prop;
6267
6268   if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
6269       || end > limit)
6270     *stop = limit;
6271   else if (start > pos)
6272     *stop = start;
6273   else
6274     {
6275       if (start == pos)
6276         {
6277           /* We found a composition.  Store the corresponding
6278              annotation data in BUF.  */
6279           int *head = buf;
6280           enum composition_method method = COMPOSITION_METHOD (prop);
6281           int nchars = COMPOSITION_LENGTH (prop);
6282
6283           ADD_COMPOSITION_DATA (buf, nchars, method);
6284           if (method != COMPOSITION_RELATIVE)
6285             {
6286               Lisp_Object components;
6287               int len, i, i_byte;
6288
6289               components = COMPOSITION_COMPONENTS (prop);
6290               if (VECTORP (components))
6291                 {
6292                   len = XVECTOR (components)->size;
6293                   for (i = 0; i < len; i++)
6294                     *buf++ = XINT (AREF (components, i));
6295                 }
6296               else if (STRINGP (components))
6297                 {
6298                   len = SCHARS (components);
6299                   i = i_byte = 0;
6300                   while (i < len)
6301                     {
6302                       FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
6303                       buf++;
6304                     }
6305                 }
6306               else if (INTEGERP (components))
6307                 {
6308                   len = 1;
6309                   *buf++ = XINT (components);
6310                 }
6311               else if (CONSP (components))
6312                 {
6313                   for (len = 0; CONSP (components);
6314                        len++, components = XCDR (components))
6315                     *buf++ = XINT (XCAR (components));
6316                 }
6317               else
6318                 abort ();
6319               *head -= len;
6320             }
6321         }
6322
6323       if (find_composition (end, limit, &start, &end, &prop,
6324                             coding->src_object)
6325           && end <= limit)
6326         *stop = start;
6327       else
6328         *stop = limit;
6329     }
6330   return buf;
6331 }
6332
6333
6334 /* Extract an annotation datum from a text property `charset' at POS of
6335    CODING->src_object (buffer or string), store the data in BUF, set
6336    *STOP to the position where the value of `charset' property changes
6337    (limiting by LIMIT), and return the address of the next element of
6338    BUF.
6339
6340    If the property value is nil, set *STOP to the position where the
6341    property value is non-nil (limiting by LIMIT), and return BUF.  */
6342
6343 static INLINE int *
6344 handle_charset_annotation (pos, limit, coding, buf, stop)
6345      EMACS_INT pos, limit;
6346      struct coding_system *coding;
6347      int *buf;
6348      EMACS_INT *stop;
6349 {
6350   Lisp_Object val, next;
6351   int id;
6352
6353   val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
6354   if (! NILP (val) && CHARSETP (val))
6355     id = XINT (CHARSET_SYMBOL_ID (val));
6356   else
6357     id = -1;
6358   ADD_CHARSET_DATA (buf, 0, id);
6359   next = Fnext_single_property_change (make_number (pos), Qcharset,
6360                                        coding->src_object,
6361                                        make_number (limit));
6362   *stop = XINT (next);
6363   return buf;
6364 }
6365
6366
/* Read characters from CODING->source, starting after the
   CODING->consumed bytes already processed, and store them as ints in
   CODING->charbuf, together with any charset annotation data.

   End-of-line conversion for encoding is applied here, and, when
   TRANSLATION_TABLE is non-nil, each character is looked up in it.
   MAX_LOOKUP is the number of characters (the current one plus
   lookahead) that may be handed to get_translation.

   On return CODING->consumed, CODING->consumed_char and
   CODING->charbuf_used reflect what was read and stored, and
   CODING->chars_at_source is reset to 0.  */

static void
consume_chars (coding, translation_table, max_lookup)
     struct coding_system *coding;
     Lisp_Object translation_table;
     int max_lookup;
{
  int *buf = coding->charbuf;
  int *buf_end = coding->charbuf + coding->charbuf_size;
  const unsigned char *src = coding->source + coding->consumed;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  EMACS_INT pos = coding->src_pos + coding->consumed_char;
  EMACS_INT end_pos = coding->src_pos + coding->src_chars;
  int multibytep = coding->src_multibyte;
  Lisp_Object eol_type;
  int c;
  /* STOP is the nearest position at which annotations must be
     checked; it is the smaller of the two per-kind stop positions.  */
  EMACS_INT stop, stop_composition, stop_charset;
  int *lookup_buf = NULL;

  /* The lookahead buffer is needed only when translating.  */
  if (! NILP (translation_table))
    lookup_buf = alloca (sizeof (int) * max_lookup);

  /* An eol-type that is still a vector is handled like Qunix,
     i.e. newlines are left as they are.  */
  eol_type = CODING_ID_EOL_TYPE (coding->id);
  if (VECTORP (eol_type))
    eol_type = Qunix;

  /* Note: composition handling is not yet implemented.  */
  coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;

  if (NILP (coding->src_object))
    /* No source object, hence no text properties to annotate.  */
    stop = stop_composition = stop_charset = end_pos;
  else
    {
      /* Because the composition flag was cleared above, the first
         branch is currently never taken.  */
      if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
        stop = stop_composition = pos;
      else
        stop = stop_composition = end_pos;
      if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
        stop = stop_charset = pos;
      else
        stop_charset = end_pos;
    }

  /* Compensate for CRLF and conversion.  */
  buf_end -= 1 + MAX_ANNOTATION_LENGTH;
  while (buf < buf_end)
    {
      Lisp_Object trans;

      if (pos == stop)
        {
          /* All source characters have been read.  */
          if (pos == end_pos)
            break;
          if (pos == stop_composition)
            buf = handle_composition_annotation (pos, end_pos, coding,
                                                 buf, &stop_composition);
          if (pos == stop_charset)
            buf = handle_charset_annotation (pos, end_pos, coding,
                                             buf, &stop_charset);
          stop = (stop_composition < stop_charset
                  ? stop_composition : stop_charset);
        }

      if (! multibytep)
        {
          EMACS_INT bytes;

          /* In this branch POS is advanced by the number of bytes
             read, not by one per character.  */
          if (coding->encoder == encode_coding_raw_text)
            c = *src++, pos++;
          else if ((bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
            c = STRING_CHAR_ADVANCE (src), pos += bytes;
          else
            c = BYTE8_TO_CHAR (*src), src++, pos++;
        }
      else
        c = STRING_CHAR_ADVANCE (src), pos++;
      /* With selective display, a carriage return is treated as a
         newline before eol conversion.  */
      if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
        c = '\n';
      if (! EQ (eol_type, Qunix))
        {
          if (c == '\n')
            {
              /* For Qdos emit CR here and let the LF be stored below;
                 for any other non-unix eol-type turn LF into CR.  */
              if (EQ (eol_type, Qdos))
                *buf++ = '\r';
              else
                c = '\r';
            }
        }

      trans = Qnil;
      LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
      if (NILP (trans))
        *buf++ = c;
      else
        {
          int from_nchars = 1, to_nchars = 1;
          int *lookup_buf_end;
          const unsigned char *p = src;
          int i;

          /* Collect C plus up to MAX_LOOKUP - 1 following characters
             without consuming them (P is a scratch copy of SRC).  */
          lookup_buf[0] = c;
          for (i = 1; i < max_lookup && p < src_end; i++)
            lookup_buf[i] = STRING_CHAR_ADVANCE (p);
          lookup_buf_end = lookup_buf + i;
          trans = get_translation (trans, lookup_buf, lookup_buf_end, 1,
                                   &from_nchars, &to_nchars);
          /* Stop filling the buffer if get_translation returned Qt or
             the translated characters would not fit.  */
          if (EQ (trans, Qt)
              || buf + to_nchars > buf_end)
            break;
          /* The first output character is taken from LOOKUP_BUF[0];
             the remaining TO_NCHARS - 1 come from TRANS.  */
          *buf++ = *lookup_buf;
          for (i = 1; i < to_nchars; i++)
            *buf++ = XINT (AREF (trans, i));
          /* Skip the additional FROM_NCHARS - 1 source characters
             covered by the translation.  */
          for (i = 1; i < from_nchars; i++, pos++)
            src += MULTIBYTE_LENGTH_NO_CHECK (src);
        }
    }

  coding->consumed = src - coding->source;
  coding->consumed_char = pos - coding->src_pos;
  coding->charbuf_used = buf - coding->charbuf;
  coding->chars_at_source = 0;
}
6488
6489
6490 /* Encode the text at CODING->src_object into CODING->dst_object.
6491    CODING->src_object is a buffer or a string.
6492    CODING->dst_object is a buffer or nil.
6493
6494    If CODING->src_object is a buffer, it must be the current buffer.
6495    In this case, if CODING->src_pos is positive, it is a position of
6496    the source text in the buffer, otherwise, the source text is in the
6497    gap area of the buffer, and coding->src_pos specifies the offset of
6498    the text from GPT (which must be the same as PT).  If this is the
6499    same buffer as CODING->dst_object, CODING->src_pos must be
6500    negative and CODING should not have `pre-write-conversion'.
6501
6502    If CODING->src_object is a string, CODING should not have
6503    `pre-write-conversion'.
6504
6505    If CODING->dst_object is a buffer, the encoded data is inserted at
6506    the current point of that buffer.
6507
6508    If CODING->dst_object is nil, the encoded data is placed at the
6509    memory area specified by CODING->destination.  */
6510
6511 static int
6512 encode_coding (coding)
6513      struct coding_system *coding;
6514 {
6515   Lisp_Object attrs;
6516   Lisp_Object translation_table;
6517   int max_lookup;
6518
6519   attrs = CODING_ID_ATTRS (coding->id);
6520   if (coding->encoder == encode_coding_raw_text)
6521     translation_table = Qnil, max_lookup = 0;
6522   else
6523     translation_table = get_translation_table (attrs, 1, &max_lookup);
6524
6525   if (BUFFERP (coding->dst_object))
6526     {
6527       set_buffer_internal (XBUFFER (coding->dst_object));
6528       coding->dst_multibyte
6529         = ! NILP (current_buffer->enable_multibyte_characters);
6530     }
6531
6532   coding->consumed = coding->consumed_char = 0;
6533   coding->produced = coding->produced_char = 0;
6534   record_conversion_result (coding, CODING_RESULT_SUCCESS);
6535   coding->errors = 0;
6536
6537   ALLOC_CONVERSION_WORK_AREA (coding);
6538
6539   do {
6540     coding_set_source (coding);
6541     consume_chars (coding, translation_table, max_lookup);
6542     coding_set_destination (coding);
6543     (*(coding->encoder)) (coding);
6544   } while (coding->consumed_char < coding->src_chars);
6545
6546   if (BUFFERP (coding->dst_object))
6547     insert_from_gap (coding->produced_char, coding->produced);
6548
6549   return (coding->result);
6550 }
6551
6552
/* Name (or base name) of work buffer for code conversion.  The shared
   work buffer uses this name as is; the names of additional work
   buffers are generated from it with Fgenerate_new_buffer_name.  */
static Lisp_Object Vcode_conversion_workbuf_name;

/* A working buffer used by the top level conversion.  Once it is
   created, it is never destroyed.  It has the name
   Vcode_conversion_workbuf_name.  The other working buffers are
   destroyed after the use is finished, and their names are modified
   versions of Vcode_conversion_workbuf_name.  */
static Lisp_Object Vcode_conversion_reused_workbuf;

/* Nonzero iff Vcode_conversion_reused_workbuf is already in use.
   It is incremented by make_conversion_work_buffer each time a work
   buffer is requested, and reset to 0 by code_conversion_restore when
   the shared work buffer is released.  */
static int reused_workbuf_in_use;
6565
6566
6567 /* Return a working buffer for code conversion.  MULTIBYTE specifies the
6568    multibyteness of the returned buffer.  */
6569
6570 static Lisp_Object
6571 make_conversion_work_buffer (multibyte)
6572      int multibyte;
6573 {
6574   Lisp_Object name, workbuf;
6575   struct buffer *current;
6576
6577   if (reused_workbuf_in_use++)
6578     {
6579       name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
6580       workbuf = Fget_buffer_create (name);
6581     }
6582   else
6583     {
6584       name = Vcode_conversion_workbuf_name;
6585       workbuf = Fget_buffer_create (name);
6586       if (NILP (Vcode_conversion_reused_workbuf))
6587         Vcode_conversion_reused_workbuf = workbuf;
6588     }
6589   current = current_buffer;
6590   set_buffer_internal (XBUFFER (workbuf));
6591   Ferase_buffer ();
6592   current_buffer->undo_list = Qt;
6593   current_buffer->enable_multibyte_characters = multibyte ? Qt : Qnil;
6594   set_buffer_internal (current);
6595   return workbuf;
6596 }
6597
6598
6599 static Lisp_Object
6600 code_conversion_restore (arg)
6601      Lisp_Object arg;
6602 {
6603   Lisp_Object current, workbuf;
6604   struct gcpro gcpro1;
6605
6606   GCPRO1 (arg);
6607   current = XCAR (arg);
6608   workbuf = XCDR (arg);
6609   if (! NILP (workbuf))
6610     {
6611       if (EQ (workbuf, Vcode_conversion_reused_workbuf))
6612         reused_workbuf_in_use = 0;
6613       else if (! NILP (Fbuffer_live_p (workbuf)))
6614         Fkill_buffer (workbuf);
6615     }
6616   set_buffer_internal (XBUFFER (current));
6617   UNGCPRO;
6618   return Qnil;
6619 }
6620
6621 Lisp_Object
6622 code_conversion_save (with_work_buf, multibyte)
6623      int with_work_buf, multibyte;
6624 {
6625   Lisp_Object workbuf = Qnil;
6626
6627   if (with_work_buf)
6628     workbuf = make_conversion_work_buffer (multibyte);
6629   record_unwind_protect (code_conversion_restore,
6630                          Fcons (Fcurrent_buffer (), workbuf));
6631   return workbuf;
6632 }
6633
6634 int
6635 decode_coding_gap (coding, chars, bytes)
6636      struct coding_system *coding;
6637      EMACS_INT chars, bytes;
6638 {
6639   int count = specpdl_ptr - specpdl;
6640   Lisp_Object attrs;
6641
6642   code_conversion_save (0, 0);
6643
6644   coding->src_object = Fcurrent_buffer ();
6645   coding->src_chars = chars;
6646   coding->src_bytes = bytes;
6647   coding->src_pos = -chars;
6648   coding->src_pos_byte = -bytes;
6649   coding->src_multibyte = chars < bytes;
6650   coding->dst_object = coding->src_object;
6651   coding->dst_pos = PT;
6652   coding->dst_pos_byte = PT_BYTE;
6653   coding->dst_multibyte = ! NILP (current_buffer->enable_multibyte_characters);
6654   coding->mode |= CODING_MODE_LAST_BLOCK;
6655
6656   if (CODING_REQUIRE_DETECTION (coding))
6657     detect_coding (coding);
6658
6659   decode_coding (coding);
6660
6661   attrs = CODING_ID_ATTRS (coding->id);
6662   if (! NILP (CODING_ATTR_POST_READ (attrs)))
6663     {
6664       EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
6665       Lisp_Object val;
6666
6667       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
6668       val = call1 (CODING_ATTR_POST_READ (attrs),
6669                    make_number (coding->produced_char));
6670       CHECK_NATNUM (val);
6671       coding->produced_char += Z - prev_Z;
6672       coding->produced += Z_BYTE - prev_Z_BYTE;
6673     }
6674
6675   unbind_to (count, Qnil);
6676   return coding->result;
6677 }
6678
6679 int
6680 encode_coding_gap (coding, chars, bytes)
6681      struct coding_system *coding;
6682      EMACS_INT chars, bytes;
6683 {
6684   int count = specpdl_ptr - specpdl;
6685
6686   code_conversion_save (0, 0);
6687
6688   coding->src_object = Fcurrent_buffer ();
6689   coding->src_chars = chars;
6690   coding->src_bytes = bytes;
6691   coding->src_pos = -chars;
6692   coding->src_pos_byte = -bytes;
6693   coding->src_multibyte = chars < bytes;
6694   coding->dst_object = coding->src_object;
6695   coding->dst_pos = PT;
6696   coding->dst_pos_byte = PT_BYTE;
6697
6698   encode_coding (coding);
6699
6700   unbind_to (count, Qnil);
6701   return coding->result;
6702 }
6703
6704
6705 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
6706    SRC_OBJECT into DST_OBJECT by coding context CODING.
6707
6708    SRC_OBJECT is a buffer, a string, or Qnil.
6709
6710    If it is a buffer, the text is at point of the buffer.  FROM and TO
6711    are positions in the buffer.
6712
6713    If it is a string, the text is at the beginning of the string.
6714    FROM and TO are indices to the string.
6715
6716    If it is nil, the text is at coding->source.  FROM and TO are
6717    indices to coding->source.
6718
6719    DST_OBJECT is a buffer, Qt, or Qnil.
6720
6721    If it is a buffer, the decoded text is inserted at point of the
6722    buffer.  If the buffer is the same as SRC_OBJECT, the source text
6723    is deleted.
6724
6725    If it is Qt, a string is made from the decoded text, and
6726    set in CODING->dst_object.
6727
6728    If it is Qnil, the decoded text is stored at CODING->destination.
6729    The caller must allocate CODING->dst_bytes bytes at
6730    CODING->destination by xmalloc.  If the decoded text is longer than
6731    CODING->dst_bytes, CODING->destination is relocated by xrealloc.
6732  */
6733
6734 void
6735 decode_coding_object (coding, src_object, from, from_byte, to, to_byte,
6736                       dst_object)
6737      struct coding_system *coding;
6738      Lisp_Object src_object;
6739      EMACS_INT from, from_byte, to, to_byte;
6740      Lisp_Object dst_object;
6741 {
6742   int count = specpdl_ptr - specpdl;
6743   unsigned char *destination;
6744   EMACS_INT dst_bytes;
6745   EMACS_INT chars = to - from;
6746   EMACS_INT bytes = to_byte - from_byte;
6747   Lisp_Object attrs;
6748   Lisp_Object buffer;
6749   int saved_pt = -1, saved_pt_byte;
6750
6751   buffer = Fcurrent_buffer ();
6752
6753   if (NILP (dst_object))
6754     {
6755       destination = coding->destination;
6756       dst_bytes = coding->dst_bytes;
6757     }
6758
6759   coding->src_object = src_object;
6760   coding->src_chars = chars;
6761   coding->src_bytes = bytes;
6762   coding->src_multibyte = chars < bytes;
6763
6764   if (STRINGP (src_object))
6765     {
6766       coding->src_pos = from;
6767       coding->src_pos_byte = from_byte;
6768     }
6769   else if (BUFFERP (src_object))
6770     {
6771       set_buffer_internal (XBUFFER (src_object));
6772       if (from != GPT)
6773         move_gap_both (from, from_byte);
6774       if (EQ (src_object, dst_object))
6775         {
6776           saved_pt = PT, saved_pt_byte = PT_BYTE;
6777           TEMP_SET_PT_BOTH (from, from_byte);
6778           del_range_both (from, from_byte, to, to_byte, 1);
6779           coding->src_pos = -chars;
6780           coding->src_pos_byte = -bytes;
6781         }
6782       else
6783         {
6784           coding->src_pos = from;
6785           coding->src_pos_byte = from_byte;
6786         }
6787     }
6788
6789   if (CODING_REQUIRE_DETECTION (coding))
6790     detect_coding (coding);
6791   attrs = CODING_ID_ATTRS (coding->id);
6792
6793   if (EQ (dst_object, Qt)
6794       || (! NILP (CODING_ATTR_POST_READ (attrs))
6795           && NILP (dst_object)))
6796     {
6797       coding->dst_object = code_conversion_save (1, 1);
6798       coding->dst_pos = BEG;
6799       coding->dst_pos_byte = BEG_BYTE;
6800       coding->dst_multibyte = 1;
6801     }
6802   else if (BUFFERP (dst_object))
6803     {
6804       code_conversion_save (0, 0);
6805       coding->dst_object = dst_object;
6806       coding->dst_pos = BUF_PT (XBUFFER (dst_object));
6807       coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
6808       coding->dst_multibyte
6809         = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
6810     }
6811   else
6812     {
6813       code_conversion_save (0, 0);
6814       coding->dst_object = Qnil;
6815       coding->dst_multibyte = 1;
6816     }
6817
6818   decode_coding (coding);
6819
6820   if (BUFFERP (coding->dst_object))
6821     set_buffer_internal (XBUFFER (coding->dst_object));
6822
6823   if (! NILP (CODING_ATTR_POST_READ (attrs)))
6824     {
6825       struct gcpro gcpro1, gcpro2;
6826       EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
6827       Lisp_Object val;
6828
6829       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
6830       GCPRO2 (coding->src_object, coding->dst_object);
6831       val = safe_call1 (CODING_ATTR_POST_READ (attrs),
6832                         make_number (coding->produced_char));
6833       UNGCPRO;
6834       CHECK_NATNUM (val);
6835       coding->produced_char += Z - prev_Z;
6836       coding->produced += Z_BYTE - prev_Z_BYTE;
6837     }
6838
6839   if (EQ (dst_object, Qt))
6840     {
6841       coding->dst_object = Fbuffer_string ();
6842     }
6843   else if (NILP (dst_object) && BUFFERP (coding->dst_object))
6844     {
6845       set_buffer_internal (XBUFFER (coding->dst_object));
6846       if (dst_bytes < coding->produced)
6847         {
6848           destination
6849             = (unsigned char *) xrealloc (destination, coding->produced);
6850           if (! destination)
6851             {
6852               record_conversion_result (coding,
6853                                         CODING_RESULT_INSUFFICIENT_DST);
6854               unbind_to (count, Qnil);
6855               return;
6856             }
6857           if (BEGV < GPT && GPT < BEGV + coding->produced_char)
6858             move_gap_both (BEGV, BEGV_BYTE);
6859           bcopy (BEGV_ADDR, destination, coding->produced);
6860           coding->destination = destination;
6861         }
6862     }
6863
6864   if (saved_pt >= 0)
6865     {
6866       /* This is the case of:
6867          (BUFFERP (src_object) && EQ (src_object, dst_object))
6868          As we have moved PT while replacing the original buffer
6869          contents, we must recover it now.  */
6870       set_buffer_internal (XBUFFER (src_object));
6871       if (saved_pt < from)
6872         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
6873       else if (saved_pt < from + chars)
6874         TEMP_SET_PT_BOTH (from, from_byte);
6875       else if (! NILP (current_buffer->enable_multibyte_characters))
6876         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
6877                           saved_pt_byte + (coding->produced - bytes));
6878       else
6879         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
6880                           saved_pt_byte + (coding->produced - bytes));
6881     }
6882
6883   unbind_to (count, coding->dst_object);
6884 }
6885
6886
6887 void
6888 encode_coding_object (coding, src_object, from, from_byte, to, to_byte,
6889                       dst_object)
6890      struct coding_system *coding;
6891      Lisp_Object src_object;
6892      EMACS_INT from, from_byte, to, to_byte;
6893      Lisp_Object dst_object;
6894 {
6895   int count = specpdl_ptr - specpdl;
6896   EMACS_INT chars = to - from;
6897   EMACS_INT bytes = to_byte - from_byte;
6898   Lisp_Object attrs;
6899   Lisp_Object buffer;
6900   int saved_pt = -1, saved_pt_byte;
6901   int kill_src_buffer = 0;
6902
6903   buffer = Fcurrent_buffer ();
6904
6905   coding->src_object = src_object;
6906   coding->src_chars = chars;
6907   coding->src_bytes = bytes;
6908   coding->src_multibyte = chars < bytes;
6909
6910   attrs = CODING_ID_ATTRS (coding->id);
6911
6912   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
6913     {
6914       coding->src_object = code_conversion_save (1, coding->src_multibyte);
6915       set_buffer_internal (XBUFFER (coding->src_object));
6916       if (STRINGP (src_object))
6917         insert_from_string (src_object, from, from_byte, chars, bytes, 0);
6918       else if (BUFFERP (src_object))
6919         insert_from_buffer (XBUFFER (src_object), from, chars, 0);
6920       else
6921         insert_1_both (coding->source + from, chars, bytes, 0, 0, 0);
6922
6923       if (EQ (src_object, dst_object))
6924         {
6925           set_buffer_internal (XBUFFER (src_object));
6926           saved_pt = PT, saved_pt_byte = PT_BYTE;
6927           del_range_both (from, from_byte, to, to_byte, 1);
6928           set_buffer_internal (XBUFFER (coding->src_object));
6929         }
6930
6931       {
6932         Lisp_Object args[3];
6933
6934         args[0] = CODING_ATTR_PRE_WRITE (attrs);
6935         args[1] = make_number (BEG);
6936         args[2] = make_number (Z);
6937         safe_call (3, args);
6938       }
6939       if (XBUFFER (coding->src_object) != current_buffer)
6940         kill_src_buffer = 1;
6941       coding->src_object = Fcurrent_buffer ();
6942       if (BEG != GPT)
6943         move_gap_both (BEG, BEG_BYTE);
6944       coding->src_chars = Z - BEG;
6945       coding->src_bytes = Z_BYTE - BEG_BYTE;
6946       coding->src_pos = BEG;
6947       coding->src_pos_byte = BEG_BYTE;
6948       coding->src_multibyte = Z < Z_BYTE;
6949     }
6950   else if (STRINGP (src_object))
6951     {
6952       code_conversion_save (0, 0);
6953       coding->src_pos = from;
6954       coding->src_pos_byte = from_byte;
6955     }
6956   else if (BUFFERP (src_object))
6957     {
6958       code_conversion_save (0, 0);
6959       set_buffer_internal (XBUFFER (src_object));
6960       if (EQ (src_object, dst_object))
6961         {
6962           saved_pt = PT, saved_pt_byte = PT_BYTE;
6963           coding->src_object = del_range_1 (from, to, 1, 1);
6964           coding->src_pos = 0;
6965           coding->src_pos_byte = 0;
6966         }
6967       else
6968         {
6969           if (from < GPT && to >= GPT)
6970             move_gap_both (from, from_byte);
6971           coding->src_pos = from;
6972           coding->src_pos_byte = from_byte;
6973         }
6974     }
6975   else
6976     code_conversion_save (0, 0);
6977
6978   if (BUFFERP (dst_object))
6979     {
6980       coding->dst_object = dst_object;
6981       if (EQ (src_object, dst_object))
6982         {
6983           coding->dst_pos = from;
6984           coding->dst_pos_byte = from_byte;
6985         }
6986       else
6987         {
6988           coding->dst_pos = BUF_PT (XBUFFER (dst_object));
6989           coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
6990         }
6991       coding->dst_multibyte
6992         = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
6993     }
6994   else if (EQ (dst_object, Qt))
6995     {
6996       coding->dst_object = Qnil;
6997       coding->dst_bytes = coding->src_chars;
6998       if (coding->dst_bytes == 0)
6999         coding->dst_bytes = 1;
7000       coding->destination = (unsigned char *) xmalloc (coding->dst_bytes);
7001       coding->dst_multibyte = 0;
7002     }
7003   else
7004     {
7005       coding->dst_object = Qnil;
7006       coding->dst_multibyte = 0;
7007     }
7008
7009   encode_coding (coding);
7010
7011   if (EQ (dst_object, Qt))
7012     {
7013       if (BUFFERP (coding->dst_object))
7014         coding->dst_object = Fbuffer_string ();
7015       else
7016         {
7017           coding->dst_object
7018             = make_unibyte_string ((char *) coding->destination,
7019                                    coding->produced);
7020           xfree (coding->destination);
7021         }
7022     }
7023
7024   if (saved_pt >= 0)
7025     {
7026       /* This is the case of:
7027          (BUFFERP (src_object) && EQ (src_object, dst_object))
7028          As we have moved PT while replacing the original buffer
7029          contents, we must recover it now.  */
7030       set_buffer_internal (XBUFFER (src_object));
7031       if (saved_pt < from)
7032         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
7033       else if (saved_pt < from + chars)
7034         TEMP_SET_PT_BOTH (from, from_byte);
7035       else if (! NILP (current_buffer->enable_multibyte_characters))
7036         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
7037                           saved_pt_byte + (coding->produced - bytes));
7038       else
7039         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
7040                           saved_pt_byte + (coding->produced - bytes));
7041     }
7042
7043   if (kill_src_buffer)
7044     Fkill_buffer (coding->src_object);
7045   unbind_to (count, Qnil);
7046 }
7047
7048
7049 Lisp_Object
7050 preferred_coding_system ()
7051 {
7052   int id = coding_categories[coding_priorities[0]].id;
7053
7054   return CODING_ID_NAME (id);
7055 }
7056
7057 \f
7058 #ifdef emacs
7059 /*** 11. Emacs Lisp library functions ***/
7060
7061 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
7062        doc: /* Return t if OBJECT is nil or a coding-system.
7063 See the documentation of `define-coding-system' for information
7064 about coding-system objects.  */)
7065      (obj)
7066      Lisp_Object obj;
7067 {
7068   if (NILP (obj)
7069       || CODING_SYSTEM_ID (obj) >= 0)
7070     return Qt;
7071   if (! SYMBOLP (obj)
7072       || NILP (Fget (obj, Qcoding_system_define_form)))
7073     return Qnil;
7074   return Qt;
7075 }
7076
7077 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
7078        Sread_non_nil_coding_system, 1, 1, 0,
7079        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.  */)
7080      (prompt)
7081      Lisp_Object prompt;
7082 {
7083   Lisp_Object val;
7084   do
7085     {
7086       val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
7087                               Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
7088     }
7089   while (SCHARS (val) == 0);
7090   return (Fintern (val, Qnil));
7091 }
7092
7093 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
7094        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
7095 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.  */)
7096      (prompt, default_coding_system)
7097      Lisp_Object prompt, default_coding_system;
7098 {
7099   Lisp_Object val;
7100   if (SYMBOLP (default_coding_system))
7101     XSETSTRING (default_coding_system, XPNTR (SYMBOL_NAME (default_coding_system)));
7102   val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
7103                           Qt, Qnil, Qcoding_system_history,
7104                           default_coding_system, Qnil);
7105   return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
7106 }
7107
7108 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
7109        1, 1, 0,
7110        doc: /* Check validity of CODING-SYSTEM.
7111 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
7112 It is valid if it is nil or a symbol defined as a coding system by the
7113 function `define-coding-system'.  */)
7114   (coding_system)
7115      Lisp_Object coding_system;
7116 {
7117   Lisp_Object define_form;
7118
7119   define_form = Fget (coding_system, Qcoding_system_define_form);
7120   if (! NILP (define_form))
7121     {
7122       Fput (coding_system, Qcoding_system_define_form, Qnil);
7123       safe_eval (define_form);
7124     }
7125   if (!NILP (Fcoding_system_p (coding_system)))
7126     return coding_system;
7127   while (1)
7128     Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
7129 }
7130
7131 \f
7132 /* Detect how the bytes at SRC of length SRC_BYTES are encoded.  If
7133    HIGHEST is nonzero, return the coding system of the highest
7134    priority among the detected coding systems.  Otherwise return a
7135    list of detected coding systems sorted by their priorities.  If
7136    MULTIBYTEP is nonzero, it is assumed that the bytes are in correct
7137    multibyte form but contain only ASCII and eight-bit chars.
7138    Otherwise, the bytes are raw bytes.
7139
7140    CODING-SYSTEM controls the detection as below:
7141
7142    If it is nil, detect both text-format and eol-format.  If the
7143    text-format part of CODING-SYSTEM is already specified
7144    (e.g. `iso-latin-1'), detect only eol-format.  If the eol-format
7145    part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
7146    detect only text-format.  */
7147
7148 Lisp_Object
7149 detect_coding_system (src, src_chars, src_bytes, highest, multibytep,
7150                       coding_system)
7151      const unsigned char *src;
7152      int src_chars, src_bytes, highest;
7153      int multibytep;
7154      Lisp_Object coding_system;
7155 {
7156   const unsigned char *src_end = src + src_bytes;
7157   Lisp_Object attrs, eol_type;
7158   Lisp_Object val;
7159   struct coding_system coding;
7160   int id;
7161   struct coding_detection_info detect_info;
7162   enum coding_category base_category;
7163
7164   if (NILP (coding_system))
7165     coding_system = Qundecided;
7166   setup_coding_system (coding_system, &coding);
7167   attrs = CODING_ID_ATTRS (coding.id);
7168   eol_type = CODING_ID_EOL_TYPE (coding.id);
7169   coding_system = CODING_ATTR_BASE_NAME (attrs);
7170
7171   coding.source = src;
7172   coding.src_chars = src_chars;
7173   coding.src_bytes = src_bytes;
7174   coding.src_multibyte = multibytep;
7175   coding.consumed = 0;
7176   coding.mode |= CODING_MODE_LAST_BLOCK;
7177
7178   detect_info.checked = detect_info.found = detect_info.rejected = 0;
7179
7180   /* At first, detect text-format if necessary.  */
7181   base_category = XINT (CODING_ATTR_CATEGORY (attrs));
7182   if (base_category == coding_category_undecided)
7183     {
7184       enum coding_category category;
7185       struct coding_system *this;
7186       int c, i;
7187
7188       /* Skip all ASCII bytes except for a few ISO2022 controls.  */
7189       for (i = 0; src < src_end; i++, src++)
7190         {
7191           c = *src;
7192           if (c & 0x80)
7193             break;
7194           if (c < 0x20
7195               && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
7196               && inhibit_iso_escape_detection)
7197             {
7198               coding.head_ascii = src - coding.source;
7199               if (detect_coding_iso_2022 (&coding, &detect_info))
7200                 {
7201                   /* We have scanned the whole data.  */
7202                   if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
7203                     /* We didn't find an 8-bit code.  */
7204                     src = src_end;
7205                   break;
7206                 }
7207             }
7208         }
7209       coding.head_ascii = src - coding.source;
7210
7211       if (src < src_end
7212           || detect_info.found)
7213         {
7214           if (src == src_end)
7215             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
7216             for (i = 0; i < coding_category_raw_text; i++)
7217               {
7218                 category = coding_priorities[i];
7219                 if (detect_info.found & (1 << category))
7220                   break;
7221               }
7222           else
7223             for (i = 0; i < coding_category_raw_text; i++)
7224               {
7225                 category = coding_priorities[i];
7226                 this = coding_categories + category;
7227
7228                 if (this->id < 0)
7229                   {
7230                     /* No coding system of this category is defined.  */
7231                     detect_info.rejected |= (1 << category);
7232                   }
7233                 else if (category >= coding_category_raw_text)
7234                   continue;
7235                 else if (detect_info.checked & (1 << category))
7236                   {
7237                     if (highest
7238                         && (detect_info.found & (1 << category)))
7239                       break;
7240                   }
7241                 else
7242                   {
7243                     if ((*(this->detector)) (&coding, &detect_info)
7244                         && highest
7245                         && (detect_info.found & (1 << category)))
7246                       {
7247                         if (category == coding_category_utf_16_auto)
7248                           {
7249                             if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
7250                               category = coding_category_utf_16_le;
7251                             else
7252                               category = coding_category_utf_16_be;
7253                           }
7254                         break;
7255                       }
7256                   }
7257               }
7258         }
7259
7260       if (detect_info.rejected == CATEGORY_MASK_ANY)
7261         {
7262           detect_info.found = CATEGORY_MASK_RAW_TEXT;
7263           id = coding_categories[coding_category_raw_text].id;
7264           val = Fcons (make_number (id), Qnil);
7265         }
7266       else if (! detect_info.rejected && ! detect_info.found)
7267         {
7268           detect_info.found = CATEGORY_MASK_ANY;
7269           id = coding_categories[coding_category_undecided].id;
7270           val = Fcons (make_number (id), Qnil);
7271         }
7272       else if (highest)
7273         {
7274           if (detect_info.found)
7275             {
7276               detect_info.found = 1 << category;
7277               val = Fcons (make_number (this->id), Qnil);
7278             }
7279           else
7280             for (i = 0; i < coding_category_raw_text; i++)
7281               if (! (detect_info.rejected & (1 << coding_priorities[i])))
7282                 {
7283                   detect_info.found = 1 << coding_priorities[i];
7284                   id = coding_categories[coding_priorities[i]].id;
7285                   val = Fcons (make_number (id), Qnil);
7286                   break;
7287                 }
7288         }
7289       else
7290         {
7291           int mask = detect_info.rejected | detect_info.found;
7292           int found = 0;
7293           val = Qnil;
7294
7295           for (i = coding_category_raw_text - 1; i >= 0; i--)
7296             {
7297               category = coding_priorities[i];
7298               if (! (mask & (1 << category)))
7299                 {
7300                   found |= 1 << category;
7301                   id = coding_categories[category].id;
7302                   val = Fcons (make_number (id), val);
7303                 }
7304             }
7305           for (i = coding_category_raw_text - 1; i >= 0; i--)
7306             {
7307               category = coding_priorities[i];
7308               if (detect_info.found & (1 << category))
7309                 {
7310                   id = coding_categories[category].id;
7311                   val = Fcons (make_number (id), val);
7312                 }
7313             }
7314           detect_info.found |= found;
7315         }
7316     }
7317   else if (base_category == coding_category_utf_16_auto)
7318     {
7319       if (detect_coding_utf_16 (&coding, &detect_info))
7320         {
7321           struct coding_system *this;
7322
7323           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
7324             this = coding_categories + coding_category_utf_16_le;
7325           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
7326             this = coding_categories + coding_category_utf_16_be;
7327           else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
7328             this = coding_categories + coding_category_utf_16_be_nosig;
7329           else
7330             this = coding_categories + coding_category_utf_16_le_nosig;
7331           val = Fcons (make_number (this->id), Qnil);
7332         }
7333     }
7334   else
7335     {
7336       detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
7337       val = Fcons (make_number (coding.id), Qnil);
7338     }
7339
7340   /* Then, detect eol-format if necessary.  */
7341   {
7342     int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol;
7343     Lisp_Object tail;
7344
7345     if (VECTORP (eol_type))
7346       {
7347         if (detect_info.found & ~CATEGORY_MASK_UTF_16)
7348           normal_eol = detect_eol (coding.source, src_bytes,
7349                                    coding_category_raw_text);
7350         if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
7351                                  | CATEGORY_MASK_UTF_16_BE_NOSIG))
7352           utf_16_be_eol = detect_eol (coding.source, src_bytes,
7353                                       coding_category_utf_16_be);
7354         if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
7355                                  | CATEGORY_MASK_UTF_16_LE_NOSIG))
7356           utf_16_le_eol = detect_eol (coding.source, src_bytes,
7357                                       coding_category_utf_16_le);
7358       }
7359     else
7360       {
7361         if (EQ (eol_type, Qunix))
7362           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
7363         else if (EQ (eol_type, Qdos))
7364           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
7365         else
7366           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
7367       }
7368
7369     for (tail = val; CONSP (tail); tail = XCDR (tail))
7370       {
7371         enum coding_category category;
7372         int this_eol;
7373
7374         id = XINT (XCAR (tail));
7375         attrs = CODING_ID_ATTRS (id);
7376         category = XINT (CODING_ATTR_CATEGORY (attrs));
7377         eol_type = CODING_ID_EOL_TYPE (id);
7378         if (VECTORP (eol_type))
7379           {
7380             if (category == coding_category_utf_16_be
7381                 || category == coding_category_utf_16_be_nosig)
7382               this_eol = utf_16_be_eol;
7383             else if (category == coding_category_utf_16_le
7384                      || category == coding_category_utf_16_le_nosig)
7385               this_eol = utf_16_le_eol;
7386             else
7387               this_eol = normal_eol;
7388
7389             if (this_eol == EOL_SEEN_LF)
7390               XSETCAR (tail, AREF (eol_type, 0));
7391             else if (this_eol == EOL_SEEN_CRLF)
7392               XSETCAR (tail, AREF (eol_type, 1));
7393             else if (this_eol == EOL_SEEN_CR)
7394               XSETCAR (tail, AREF (eol_type, 2));
7395             else
7396               XSETCAR (tail, CODING_ID_NAME (id));
7397           }
7398         else
7399           XSETCAR (tail, CODING_ID_NAME (id));
7400       }
7401   }
7402
7403   return (highest ? XCAR (val) : val);
7404 }
7405
7406
7407 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
7408        2, 3, 0,
7409        doc: /* Detect coding system of the text in the region between START and END.
7410 Return a list of possible coding systems ordered by priority.
7411
7412 If only ASCII characters are found, it returns a list of single element
7413 `undecided' or its subsidiary coding system according to a detected
7414 end-of-line format.
7415
7416 If optional argument HIGHEST is non-nil, return the coding system of
7417 highest priority.  */)
7418      (start, end, highest)
7419      Lisp_Object start, end, highest;
7420 {
7421   int from, to;
7422   int from_byte, to_byte;
7423
7424   CHECK_NUMBER_COERCE_MARKER (start);
7425   CHECK_NUMBER_COERCE_MARKER (end);
7426
7427   validate_region (&start, &end);
7428   from = XINT (start), to = XINT (end);
7429   from_byte = CHAR_TO_BYTE (from);
7430   to_byte = CHAR_TO_BYTE (to);
7431
7432   if (from < GPT && to >= GPT)
7433     move_gap_both (to, to_byte);
7434
7435   return detect_coding_system (BYTE_POS_ADDR (from_byte),
7436                                to - from, to_byte - from_byte,
7437                                !NILP (highest),
7438                                !NILP (current_buffer
7439                                       ->enable_multibyte_characters),
7440                                Qnil);
7441 }
7442
7443 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
7444        1, 2, 0,
7445        doc: /* Detect coding system of the text in STRING.
7446 Return a list of possible coding systems ordered by priority.
7447
7448 If only ASCII characters are found, it returns a list of single element
7449 `undecided' or its subsidiary coding system according to a detected
7450 end-of-line format.
7451
7452 If optional argument HIGHEST is non-nil, return the coding system of
7453 highest priority.  */)
7454      (string, highest)
7455      Lisp_Object string, highest;
7456 {
7457   CHECK_STRING (string);
7458
7459   return detect_coding_system (SDATA (string),
7460                                SCHARS (string), SBYTES (string),
7461                                !NILP (highest), STRING_MULTIBYTE (string),
7462                                Qnil);
7463 }
7464
7465
7466 static INLINE int
7467 char_encodable_p (c, attrs)
7468      int c;
7469      Lisp_Object attrs;
7470 {
7471   Lisp_Object tail;
7472   struct charset *charset;
7473   Lisp_Object translation_table;
7474
7475   translation_table = CODING_ATTR_TRANS_TBL (attrs);
7476   if (! NILP (translation_table))
7477     c = translate_char (translation_table, c);
7478   for (tail = CODING_ATTR_CHARSET_LIST (attrs);
7479        CONSP (tail); tail = XCDR (tail))
7480     {
7481       charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
7482       if (CHAR_CHARSET_P (c, charset))
7483         break;
7484     }
7485   return (! NILP (tail));
7486 }
7487
7488
7489 /* Return a list of coding systems that safely encode the text between
7490    START and END.  If EXCLUDE is non-nil, it is a list of coding
7491    systems not to check.  The returned list doesn't contain any such
7492    coding systems.  In any case, if the text contains only ASCII or is
7493    unibyte, return t.  */
7494
7495 DEFUN ("find-coding-systems-region-internal",
7496        Ffind_coding_systems_region_internal,
7497        Sfind_coding_systems_region_internal, 2, 3, 0,
7498        doc: /* Internal use only.  */)
7499      (start, end, exclude)
7500      Lisp_Object start, end, exclude;
7501 {
7502   Lisp_Object coding_attrs_list, safe_codings;
7503   EMACS_INT start_byte, end_byte;
7504   const unsigned char *p, *pbeg, *pend;
7505   int c;
7506   Lisp_Object tail, elt;
7507
7508   if (STRINGP (start))
7509     {
7510       if (!STRING_MULTIBYTE (start)
7511           || SCHARS (start) == SBYTES (start))
7512         return Qt;
7513       start_byte = 0;
7514       end_byte = SBYTES (start);
7515     }
7516   else
7517     {
7518       CHECK_NUMBER_COERCE_MARKER (start);
7519       CHECK_NUMBER_COERCE_MARKER (end);
7520       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
7521         args_out_of_range (start, end);
7522       if (NILP (current_buffer->enable_multibyte_characters))
7523         return Qt;
7524       start_byte = CHAR_TO_BYTE (XINT (start));
7525       end_byte = CHAR_TO_BYTE (XINT (end));
7526       if (XINT (end) - XINT (start) == end_byte - start_byte)
7527         return Qt;
7528
7529       if (XINT (start) < GPT && XINT (end) > GPT)
7530         {
7531           if ((GPT - XINT (start)) < (XINT (end) - GPT))
7532             move_gap_both (XINT (start), start_byte);
7533           else
7534             move_gap_both (XINT (end), end_byte);
7535         }
7536     }
7537
7538   coding_attrs_list = Qnil;
7539   for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
7540     if (NILP (exclude)
7541         || NILP (Fmemq (XCAR (tail), exclude)))
7542       {
7543         Lisp_Object attrs;
7544
7545         attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
7546         if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs))
7547             && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
7548           {
7549             ASET (attrs, coding_attr_trans_tbl,
7550                   get_translation_table (attrs, 1, NULL));
7551             coding_attrs_list = Fcons (attrs, coding_attrs_list);
7552           }
7553       }
7554
7555   if (STRINGP (start))
7556     p = pbeg = SDATA (start);
7557   else
7558     p = pbeg = BYTE_POS_ADDR (start_byte);
7559   pend = p + (end_byte - start_byte);
7560
7561   while (p < pend && ASCII_BYTE_P (*p)) p++;
7562   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
7563
7564   while (p < pend)
7565     {
7566       if (ASCII_BYTE_P (*p))
7567         p++;
7568       else
7569         {
7570           c = STRING_CHAR_ADVANCE (p);
7571
7572           charset_map_loaded = 0;
7573           for (tail = coding_attrs_list; CONSP (tail);)
7574             {
7575               elt = XCAR (tail);
7576               if (NILP (elt))
7577                 tail = XCDR (tail);
7578               else if (char_encodable_p (c, elt))
7579                 tail = XCDR (tail);
7580               else if (CONSP (XCDR (tail)))
7581                 {
7582                   XSETCAR (tail, XCAR (XCDR (tail)));
7583                   XSETCDR (tail, XCDR (XCDR (tail)));
7584                 }
7585               else
7586                 {
7587                   XSETCAR (tail, Qnil);
7588                   tail = XCDR (tail);
7589                 }
7590             }
7591           if (charset_map_loaded)
7592             {
7593               EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
7594
7595               if (STRINGP (start))
7596                 pbeg = SDATA (start);
7597               else
7598                 pbeg = BYTE_POS_ADDR (start_byte);
7599               p = pbeg + p_offset;
7600               pend = pbeg + pend_offset;
7601             }
7602         }
7603     }
7604
7605   safe_codings = list2 (Qraw_text, Qno_conversion);
7606   for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
7607     if (! NILP (XCAR (tail)))
7608       safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
7609
7610   return safe_codings;
7611 }
7612
7613
7614 DEFUN ("unencodable-char-position", Funencodable_char_position,
7615        Sunencodable_char_position, 3, 5, 0,
7616        doc: /*
7617 Return position of first un-encodable character in a region.
7618 START and END specfiy the region and CODING-SYSTEM specifies the
7619 encoding to check.  Return nil if CODING-SYSTEM does encode the region.
7620
7621 If optional 4th argument COUNT is non-nil, it specifies at most how
7622 many un-encodable characters to search.  In this case, the value is a
7623 list of positions.
7624
7625 If optional 5th argument STRING is non-nil, it is a string to search
7626 for un-encodable characters.  In that case, START and END are indexes
7627 to the string.  */)
7628      (start, end, coding_system, count, string)
7629      Lisp_Object start, end, coding_system, count, string;
7630 {
7631   int n;
7632   struct coding_system coding;
7633   Lisp_Object attrs, charset_list, translation_table;
7634   Lisp_Object positions;
7635   int from, to;
7636   const unsigned char *p, *stop, *pend;
7637   int ascii_compatible;
7638
7639   setup_coding_system (Fcheck_coding_system (coding_system), &coding);
7640   attrs = CODING_ID_ATTRS (coding.id);
7641   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
7642     return Qnil;
7643   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
7644   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
7645   translation_table = get_translation_table (attrs, 1, NULL);
7646
7647   if (NILP (string))
7648     {
7649       validate_region (&start, &end);
7650       from = XINT (start);
7651       to = XINT (end);
7652       if (NILP (current_buffer->enable_multibyte_characters)
7653           || (ascii_compatible
7654               && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
7655         return Qnil;
7656       p = CHAR_POS_ADDR (from);
7657       pend = CHAR_POS_ADDR (to);
7658       if (from < GPT && to >= GPT)
7659         stop = GPT_ADDR;
7660       else
7661         stop = pend;
7662     }
7663   else
7664     {
7665       CHECK_STRING (string);
7666       CHECK_NATNUM (start);
7667       CHECK_NATNUM (end);
7668       from = XINT (start);
7669       to = XINT (end);
7670       if (from > to
7671           || to > SCHARS (string))
7672         args_out_of_range_3 (string, start, end);
7673       if (! STRING_MULTIBYTE (string))
7674         return Qnil;
7675       p = SDATA (string) + string_char_to_byte (string, from);
7676       stop = pend = SDATA (string) + string_char_to_byte (string, to);
7677       if (ascii_compatible && (to - from) == (pend - p))
7678         return Qnil;
7679     }
7680
7681   if (NILP (count))
7682     n = 1;
7683   else
7684     {
7685       CHECK_NATNUM (count);
7686       n = XINT (count);
7687     }
7688
7689   positions = Qnil;
7690   while (1)
7691     {
7692       int c;
7693
7694       if (ascii_compatible)
7695         while (p < stop && ASCII_BYTE_P (*p))
7696           p++, from++;
7697       if (p >= stop)
7698         {
7699           if (p >= pend)
7700             break;
7701           stop = pend;
7702           p = GAP_END_ADDR;
7703         }
7704
7705       c = STRING_CHAR_ADVANCE (p);
7706       if (! (ASCII_CHAR_P (c) && ascii_compatible)
7707           && ! char_charset (translate_char (translation_table, c),
7708                              charset_list, NULL))
7709         {
7710           positions = Fcons (make_number (from), positions);
7711           n--;
7712           if (n == 0)
7713             break;
7714         }
7715
7716       from++;
7717     }
7718
7719   return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
7720 }
7721
7722
7723 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
7724        Scheck_coding_systems_region, 3, 3, 0,
7725        doc: /* Check if the region is encodable by coding systems.
7726
7727 START and END are buffer positions specifying the region.
7728 CODING-SYSTEM-LIST is a list of coding systems to check.
7729
7730 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
7731 CODING-SYSTEM is a member of CODING-SYSTEM-LIst and can't encode the
7732 whole region, POS0, POS1, ... are buffer positions where non-encodable
7733 characters are found.
7734
7735 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
7736 value is nil.
7737
7738 START may be a string.  In that case, check if the string is
7739 encodable, and the value contains indices to the string instead of
7740 buffer positions.  END is ignored.  */)
7741      (start, end, coding_system_list)
7742      Lisp_Object start, end, coding_system_list;
7743 {
7744   Lisp_Object list;
7745   EMACS_INT start_byte, end_byte;
7746   int pos;
7747   const unsigned char *p, *pbeg, *pend;
7748   int c;
7749   Lisp_Object tail, elt, attrs;
7750
7751   if (STRINGP (start))
7752     {
7753       if (!STRING_MULTIBYTE (start)
7754           && SCHARS (start) != SBYTES (start))
7755         return Qnil;
7756       start_byte = 0;
7757       end_byte = SBYTES (start);
7758       pos = 0;
7759     }
7760   else
7761     {
7762       CHECK_NUMBER_COERCE_MARKER (start);
7763       CHECK_NUMBER_COERCE_MARKER (end);
7764       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
7765         args_out_of_range (start, end);
7766       if (NILP (current_buffer->enable_multibyte_characters))
7767         return Qnil;
7768       start_byte = CHAR_TO_BYTE (XINT (start));
7769       end_byte = CHAR_TO_BYTE (XINT (end));
7770       if (XINT (end) - XINT (start) == end_byte - start_byte)
7771         return Qt;
7772
7773       if (XINT (start) < GPT && XINT (end) > GPT)
7774         {
7775           if ((GPT - XINT (start)) < (XINT (end) - GPT))
7776             move_gap_both (XINT (start), start_byte);
7777           else
7778             move_gap_both (XINT (end), end_byte);
7779         }
7780       pos = XINT (start);
7781     }
7782
7783   list = Qnil;
7784   for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
7785     {
7786       elt = XCAR (tail);
7787       attrs = AREF (CODING_SYSTEM_SPEC (elt), 0);
7788       ASET (attrs, coding_attr_trans_tbl,
7789             get_translation_table (attrs, 1, NULL));
7790       list = Fcons (Fcons (elt, Fcons (attrs, Qnil)), list);
7791     }
7792
7793   if (STRINGP (start))
7794     p = pbeg = SDATA (start);
7795   else
7796     p = pbeg = BYTE_POS_ADDR (start_byte);
7797   pend = p + (end_byte - start_byte);
7798
7799   while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
7800   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
7801
7802   while (p < pend)
7803     {
7804       if (ASCII_BYTE_P (*p))
7805         p++;
7806       else
7807         {
7808           c = STRING_CHAR_ADVANCE (p);
7809
7810           charset_map_loaded = 0;
7811           for (tail = list; CONSP (tail); tail = XCDR (tail))
7812             {
7813               elt = XCDR (XCAR (tail));
7814               if (! char_encodable_p (c, XCAR (elt)))
7815                 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
7816             }
7817           if (charset_map_loaded)
7818             {
7819               EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
7820
7821               if (STRINGP (start))
7822                 pbeg = SDATA (start);
7823               else
7824                 pbeg = BYTE_POS_ADDR (start_byte);
7825               p = pbeg + p_offset;
7826               pend = pbeg + pend_offset;
7827             }
7828         }
7829       pos++;
7830     }
7831
7832   tail = list;
7833   list = Qnil;
7834   for (; CONSP (tail); tail = XCDR (tail))
7835     {
7836       elt = XCAR (tail);
7837       if (CONSP (XCDR (XCDR (elt))))
7838         list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
7839                       list);
7840     }
7841
7842   return list;
7843 }
7844
7845
7846 Lisp_Object
7847 code_convert_region (start, end, coding_system, dst_object, encodep, norecord)
7848      Lisp_Object start, end, coding_system, dst_object;
7849      int encodep, norecord;
7850 {
7851   struct coding_system coding;
7852   EMACS_INT from, from_byte, to, to_byte;
7853   Lisp_Object src_object;
7854
7855   CHECK_NUMBER_COERCE_MARKER (start);
7856   CHECK_NUMBER_COERCE_MARKER (end);
7857   if (NILP (coding_system))
7858     coding_system = Qno_conversion;
7859   else
7860     CHECK_CODING_SYSTEM (coding_system);
7861   src_object = Fcurrent_buffer ();
7862   if (NILP (dst_object))
7863     dst_object = src_object;
7864   else if (! EQ (dst_object, Qt))
7865     CHECK_BUFFER (dst_object);
7866
7867   validate_region (&start, &end);
7868   from = XFASTINT (start);
7869   from_byte = CHAR_TO_BYTE (from);
7870   to = XFASTINT (end);
7871   to_byte = CHAR_TO_BYTE (to);
7872
7873   setup_coding_system (coding_system, &coding);
7874   coding.mode |= CODING_MODE_LAST_BLOCK;
7875
7876   if (encodep)
7877     encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
7878                           dst_object);
7879   else
7880     decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
7881                           dst_object);
7882   if (! norecord)
7883     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
7884
7885   return (BUFFERP (dst_object)
7886           ? make_number (coding.produced_char)
7887           : coding.dst_object);
7888 }
7889
7890
7891 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
7892        3, 4, "r\nzCoding system: ",
7893        doc: /* Decode the current region from the specified coding system.
7894 When called from a program, takes four arguments:
7895         START, END, CODING-SYSTEM, and DESTINATION.
7896 START and END are buffer positions.
7897
7898 Optional 4th arguments DESTINATION specifies where the decoded text goes.
7899 If nil, the region between START and END is replace by the decoded text.
7900 If buffer, the decoded text is inserted in the buffer.
7901 If t, the decoded text is returned.
7902
7903 This function sets `last-coding-system-used' to the precise coding system
7904 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7905 not fully specified.)
7906 It returns the length of the decoded text.  */)
7907      (start, end, coding_system, destination)
7908      Lisp_Object start, end, coding_system, destination;
7909 {
7910   return code_convert_region (start, end, coding_system, destination, 0, 0);
7911 }
7912
7913 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
7914        3, 4, "r\nzCoding system: ",
7915        doc: /* Encode the current region by specified coding system.
7916 When called from a program, takes three arguments:
7917 START, END, and CODING-SYSTEM.  START and END are buffer positions.
7918
7919 Optional 4th arguments DESTINATION specifies where the encoded text goes.
7920 If nil, the region between START and END is replace by the encoded text.
7921 If buffer, the encoded text is inserted in the buffer.
7922 If t, the encoded text is returned.
7923
7924 This function sets `last-coding-system-used' to the precise coding system
7925 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7926 not fully specified.)
7927 It returns the length of the encoded text.  */)
7928   (start, end, coding_system, destination)
7929      Lisp_Object start, end, coding_system, destination;
7930 {
7931   return code_convert_region (start, end, coding_system, destination, 1, 0);
7932 }
7933
7934 Lisp_Object
7935 code_convert_string (string, coding_system, dst_object,
7936                      encodep, nocopy, norecord)
7937      Lisp_Object string, coding_system, dst_object;
7938      int encodep, nocopy, norecord;
7939 {
7940   struct coding_system coding;
7941   EMACS_INT chars, bytes;
7942
7943   CHECK_STRING (string);
7944   if (NILP (coding_system))
7945     {
7946       if (! norecord)
7947         Vlast_coding_system_used = Qno_conversion;
7948       if (NILP (dst_object))
7949         return (nocopy ? Fcopy_sequence (string) : string);
7950     }
7951
7952   if (NILP (coding_system))
7953     coding_system = Qno_conversion;
7954   else
7955     CHECK_CODING_SYSTEM (coding_system);
7956   if (NILP (dst_object))
7957     dst_object = Qt;
7958   else if (! EQ (dst_object, Qt))
7959     CHECK_BUFFER (dst_object);
7960
7961   setup_coding_system (coding_system, &coding);
7962   coding.mode |= CODING_MODE_LAST_BLOCK;
7963   chars = SCHARS (string);
7964   bytes = SBYTES (string);
7965   if (encodep)
7966     encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
7967   else
7968     decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
7969   if (! norecord)
7970     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
7971
7972   return (BUFFERP (dst_object)
7973           ? make_number (coding.produced_char)
7974           : coding.dst_object);
7975 }
7976
7977
7978 /* Encode or decode STRING according to CODING_SYSTEM.
7979    Do not set Vlast_coding_system_used.
7980
7981    This function is called only from macros DECODE_FILE and
7982    ENCODE_FILE, thus we ignore character composition.  */
7983
7984 Lisp_Object
7985 code_convert_string_norecord (string, coding_system, encodep)
7986      Lisp_Object string, coding_system;
7987      int encodep;
7988 {
7989   return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
7990 }
7991
7992
7993 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
7994        2, 4, 0,
7995        doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
7996
7997 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
7998 if the decoding operation is trivial.
7999
8000 Optional fourth arg BUFFER non-nil meant that the decoded text is
8001 inserted in BUFFER instead of returned as a string.  In this case,
8002 the return value is BUFFER.
8003
8004 This function sets `last-coding-system-used' to the precise coding system
8005 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
8006 not fully specified.  */)
8007   (string, coding_system, nocopy, buffer)
8008      Lisp_Object string, coding_system, nocopy, buffer;
8009 {
8010   return code_convert_string (string, coding_system, buffer,
8011                               0, ! NILP (nocopy), 0);
8012 }
8013
8014 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
8015        2, 4, 0,
8016        doc: /* Encode STRING to CODING-SYSTEM, and return the result.
8017
8018 Optional third arg NOCOPY non-nil means it is OK to return STRING
8019 itself if the encoding operation is trivial.
8020
8021 Optional fourth arg BUFFER non-nil meant that the encoded text is
8022 inserted in BUFFER instead of returned as a string.  In this case,
8023 the return value is BUFFER.
8024
8025 This function sets `last-coding-system-used' to the precise coding system
8026 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
8027 not fully specified.)  */)
8028      (string, coding_system, nocopy, buffer)
8029      Lisp_Object string, coding_system, nocopy, buffer;
8030 {
8031   return code_convert_string (string, coding_system, buffer,
8032                               1, ! NILP (nocopy), 1);
8033 }
8034
8035 \f
8036 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
8037        doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
8038 Return the corresponding character.  */)
8039      (code)
8040      Lisp_Object code;
8041 {
8042   Lisp_Object spec, attrs, val;
8043   struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
8044   int c;
8045
8046   CHECK_NATNUM (code);
8047   c = XFASTINT (code);
8048   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
8049   attrs = AREF (spec, 0);
8050
8051   if (ASCII_BYTE_P (c)
8052       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
8053     return code;
8054
8055   val = CODING_ATTR_CHARSET_LIST (attrs);
8056   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
8057   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
8058   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
8059
8060   if (c <= 0x7F)
8061     charset = charset_roman;
8062   else if (c >= 0xA0 && c < 0xDF)
8063     {
8064       charset = charset_kana;
8065       c -= 0x80;
8066     }
8067   else
8068     {
8069       int s1 = c >> 8, s2 = c & 0xFF;
8070
8071       if (s1 < 0x81 || (s1 > 0x9F && s1 < 0xE0) || s1 > 0xEF
8072           || s2 < 0x40 || s2 == 0x7F || s2 > 0xFC)
8073         error ("Invalid code: %d", code);
8074       SJIS_TO_JIS (c);
8075       charset = charset_kanji;
8076     }
8077   c = DECODE_CHAR (charset, c);
8078   if (c < 0)
8079     error ("Invalid code: %d", code);
8080   return make_number (c);
8081 }
8082
8083
8084 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
8085        doc: /* Encode a Japanese character CHAR to shift_jis encoding.
8086 Return the corresponding code in SJIS.  */)
8087      (ch)
8088     Lisp_Object ch;
8089 {
8090   Lisp_Object spec, attrs, charset_list;
8091   int c;
8092   struct charset *charset;
8093   unsigned code;
8094
8095   CHECK_CHARACTER (ch);
8096   c = XFASTINT (ch);
8097   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
8098   attrs = AREF (spec, 0);
8099
8100   if (ASCII_CHAR_P (c)
8101       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
8102     return ch;
8103
8104   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
8105   charset = char_charset (c, charset_list, &code);
8106   if (code == CHARSET_INVALID_CODE (charset))
8107     error ("Can't encode by shift_jis encoding: %d", c);
8108   JIS_TO_SJIS (code);
8109
8110   return make_number (code);
8111 }
8112
8113 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
8114        doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
8115 Return the corresponding character.  */)
8116      (code)
8117      Lisp_Object code;
8118 {
8119   Lisp_Object spec, attrs, val;
8120   struct charset *charset_roman, *charset_big5, *charset;
8121   int c;
8122
8123   CHECK_NATNUM (code);
8124   c = XFASTINT (code);
8125   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
8126   attrs = AREF (spec, 0);
8127
8128   if (ASCII_BYTE_P (c)
8129       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
8130     return code;
8131
8132   val = CODING_ATTR_CHARSET_LIST (attrs);
8133   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
8134   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
8135
8136   if (c <= 0x7F)
8137     charset = charset_roman;
8138   else
8139     {
8140       int b1 = c >> 8, b2 = c & 0x7F;
8141       if (b1 < 0xA1 || b1 > 0xFE
8142           || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
8143         error ("Invalid code: %d", code);
8144       charset = charset_big5;
8145     }
8146   c = DECODE_CHAR (charset, (unsigned )c);
8147   if (c < 0)
8148     error ("Invalid code: %d", code);
8149   return make_number (c);
8150 }
8151
8152 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
8153        doc: /* Encode the Big5 character CHAR to BIG5 coding system.
8154 Return the corresponding character code in Big5.  */)
8155      (ch)
8156      Lisp_Object ch;
8157 {
8158   Lisp_Object spec, attrs, charset_list;
8159   struct charset *charset;
8160   int c;
8161   unsigned code;
8162
8163   CHECK_CHARACTER (ch);
8164   c = XFASTINT (ch);
8165   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
8166   attrs = AREF (spec, 0);
8167   if (ASCII_CHAR_P (c)
8168       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
8169     return ch;
8170
8171   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
8172   charset = char_charset (c, charset_list, &code);
8173   if (code == CHARSET_INVALID_CODE (charset))
8174     error ("Can't encode by Big5 encoding: %d", c);
8175
8176   return make_number (code);
8177 }
8178
8179 \f
8180 DEFUN ("set-terminal-coding-system-internal",
8181        Fset_terminal_coding_system_internal,
8182        Sset_terminal_coding_system_internal, 1, 1, 0,
8183        doc: /* Internal use only.  */)
8184      (coding_system)
8185      Lisp_Object coding_system;
8186 {
8187   CHECK_SYMBOL (coding_system);
8188   setup_coding_system (Fcheck_coding_system (coding_system),
8189                         &terminal_coding);
8190
8191   /* We had better not send unsafe characters to terminal.  */
8192   terminal_coding.mode |= CODING_MODE_SAFE_ENCODING;
8193   /* Characer composition should be disabled.  */
8194   terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
8195   terminal_coding.src_multibyte = 1;
8196   terminal_coding.dst_multibyte = 0;
8197   return Qnil;
8198 }
8199
8200 DEFUN ("set-safe-terminal-coding-system-internal",
8201        Fset_safe_terminal_coding_system_internal,
8202        Sset_safe_terminal_coding_system_internal, 1, 1, 0,
8203        doc: /* Internal use only.  */)
8204      (coding_system)
8205      Lisp_Object coding_system;
8206 {
8207   CHECK_SYMBOL (coding_system);
8208   setup_coding_system (Fcheck_coding_system (coding_system),
8209                        &safe_terminal_coding);
8210   /* Characer composition should be disabled.  */
8211   safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
8212   safe_terminal_coding.src_multibyte = 1;
8213   safe_terminal_coding.dst_multibyte = 0;
8214   return Qnil;
8215 }
8216
8217 DEFUN ("terminal-coding-system",
8218        Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
8219        doc: /* Return coding system specified for terminal output.  */)
8220      ()
8221 {
8222   Lisp_Object coding_system;
8223
8224   coding_system = CODING_ID_NAME (terminal_coding.id);
8225   /* For backward compatibility, return nil if it is `undecided'. */
8226   return (coding_system != Qundecided ? coding_system : Qnil);
8227 }
8228
8229 DEFUN ("set-keyboard-coding-system-internal",
8230        Fset_keyboard_coding_system_internal,
8231        Sset_keyboard_coding_system_internal, 1, 1, 0,
8232        doc: /* Internal use only.  */)
8233      (coding_system)
8234      Lisp_Object coding_system;
8235 {
8236   CHECK_SYMBOL (coding_system);
8237   setup_coding_system (Fcheck_coding_system (coding_system),
8238                        &keyboard_coding);
8239   /* Characer composition should be disabled.  */
8240   keyboard_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
8241   return Qnil;
8242 }
8243
8244 DEFUN ("keyboard-coding-system",
8245        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
8246        doc: /* Return coding system specified for decoding keyboard input.  */)
8247      ()
8248 {
8249   return CODING_ID_NAME (keyboard_coding.id);
8250 }
8251
8252 \f
8253 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
8254        Sfind_operation_coding_system,  1, MANY, 0,
8255        doc: /* Choose a coding system for an operation based on the target name.
8256 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
8257 DECODING-SYSTEM is the coding system to use for decoding
8258 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
8259 for encoding (in case OPERATION does encoding).
8260
8261 The first argument OPERATION specifies an I/O primitive:
8262   For file I/O, `insert-file-contents' or `write-region'.
8263   For process I/O, `call-process', `call-process-region', or `start-process'.
8264   For network I/O, `open-network-stream'.
8265
8266 The remaining arguments should be the same arguments that were passed
8267 to the primitive.  Depending on which primitive, one of those arguments
8268 is selected as the TARGET.  For example, if OPERATION does file I/O,
8269 whichever argument specifies the file name is TARGET.
8270
8271 TARGET has a meaning which depends on OPERATION:
8272   For file I/O, TARGET is a file name.
8273   For process I/O, TARGET is a process name.
8274   For network I/O, TARGET is a service name or a port number
8275
8276 This function looks up what specified for TARGET in,
8277 `file-coding-system-alist', `process-coding-system-alist',
8278 or `network-coding-system-alist' depending on OPERATION.
8279 They may specify a coding system, a cons of coding systems,
8280 or a function symbol to call.
8281 In the last case, we call the function with one argument,
8282 which is a list of all the arguments given to this function.
8283
8284 usage: (find-operation-coding-system OPERATION ARGUMENTS ...)  */)
8285      (nargs, args)
8286      int nargs;
8287      Lisp_Object *args;
8288 {
8289   Lisp_Object operation, target_idx, target, val;
8290   register Lisp_Object chain;
8291
8292   if (nargs < 2)
8293     error ("Too few arguments");
8294   operation = args[0];
8295   if (!SYMBOLP (operation)
8296       || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
8297     error ("Invalid first arguement");
8298   if (nargs < 1 + XINT (target_idx))
8299     error ("Too few arguments for operation: %s",
8300            SDATA (SYMBOL_NAME (operation)));
8301   target = args[XINT (target_idx) + 1];
8302   if (!(STRINGP (target)
8303         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
8304     error ("Invalid %dth argument", XINT (target_idx) + 1);
8305
8306   chain = ((EQ (operation, Qinsert_file_contents)
8307             || EQ (operation, Qwrite_region))
8308            ? Vfile_coding_system_alist
8309            : (EQ (operation, Qopen_network_stream)
8310               ? Vnetwork_coding_system_alist
8311               : Vprocess_coding_system_alist));
8312   if (NILP (chain))
8313     return Qnil;
8314
8315   for (; CONSP (chain); chain = XCDR (chain))
8316     {
8317       Lisp_Object elt;
8318
8319       elt = XCAR (chain);
8320       if (CONSP (elt)
8321           && ((STRINGP (target)
8322                && STRINGP (XCAR (elt))
8323                && fast_string_match (XCAR (elt), target) >= 0)
8324               || (INTEGERP (target) && EQ (target, XCAR (elt)))))
8325         {
8326           val = XCDR (elt);
8327           /* Here, if VAL is both a valid coding system and a valid
8328              function symbol, we return VAL as a coding system.  */
8329           if (CONSP (val))
8330             return val;
8331           if (! SYMBOLP (val))
8332             return Qnil;
8333           if (! NILP (Fcoding_system_p (val)))
8334             return Fcons (val, val);
8335           if (! NILP (Ffboundp (val)))
8336             {
8337               val = call1 (val, Flist (nargs, args));
8338               if (CONSP (val))
8339                 return val;
8340               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
8341                 return Fcons (val, val);
8342             }
8343           return Qnil;
8344         }
8345     }
8346   return Qnil;
8347 }
8348
8349 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
8350        Sset_coding_system_priority, 0, MANY, 0,
8351        doc: /* Assign higher priority to the coding systems given as arguments.
8352 If multiple coding systems belongs to the same category,
8353 all but the first one are ignored.
8354
8355 usage: (set-coding-system-priority ...)  */)
8356      (nargs, args)
8357      int nargs;
8358      Lisp_Object *args;
8359 {
8360   int i, j;
8361   int changed[coding_category_max];
8362   enum coding_category priorities[coding_category_max];
8363
8364   bzero (changed, sizeof changed);
8365
8366   for (i = j = 0; i < nargs; i++)
8367     {
8368       enum coding_category category;
8369       Lisp_Object spec, attrs;
8370
8371       CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
8372       attrs = AREF (spec, 0);
8373       category = XINT (CODING_ATTR_CATEGORY (attrs));
8374       if (changed[category])
8375         /* Ignore this coding system because a coding system of the
8376            same category already had a higher priority.  */
8377         continue;
8378       changed[category] = 1;
8379       priorities[j++] = category;
8380       if (coding_categories[category].id >= 0
8381           && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
8382         setup_coding_system (args[i], &coding_categories[category]);
8383       Fset (AREF (Vcoding_category_table, category), args[i]);
8384     }
8385
8386   /* Now we have decided top J priorities.  Reflect the order of the
8387      original priorities to the remaining priorities.  */
8388
8389   for (i = j, j = 0; i < coding_category_max; i++, j++)
8390     {
8391       while (j < coding_category_max
8392              && changed[coding_priorities[j]])
8393         j++;
8394       if (j == coding_category_max)
8395         abort ();
8396       priorities[i] = coding_priorities[j];
8397     }
8398
8399   bcopy (priorities, coding_priorities, sizeof priorities);
8400
8401   /* Update `coding-category-list'.  */
8402   Vcoding_category_list = Qnil;
8403   for (i = coding_category_max - 1; i >= 0; i--)
8404     Vcoding_category_list
8405       = Fcons (AREF (Vcoding_category_table, priorities[i]),
8406                Vcoding_category_list);
8407
8408   return Qnil;
8409 }
8410
8411 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
8412        Scoding_system_priority_list, 0, 1, 0,
8413        doc: /* Return a list of coding systems ordered by their priorities.
8414 HIGHESTP non-nil means just return the highest priority one.  */)
8415      (highestp)
8416      Lisp_Object highestp;
8417 {
8418   int i;
8419   Lisp_Object val;
8420
8421   for (i = 0, val = Qnil; i < coding_category_max; i++)
8422     {
8423       enum coding_category category = coding_priorities[i];
8424       int id = coding_categories[category].id;
8425       Lisp_Object attrs;
8426
8427       if (id < 0)
8428         continue;
8429       attrs = CODING_ID_ATTRS (id);
8430       if (! NILP (highestp))
8431         return CODING_ATTR_BASE_NAME (attrs);
8432       val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
8433     }
8434   return Fnreverse (val);
8435 }
8436
8437 static char *suffixes[] = { "-unix", "-dos", "-mac" };
8438
8439 static Lisp_Object
8440 make_subsidiaries (base)
8441      Lisp_Object base;
8442 {
8443   Lisp_Object subsidiaries;
8444   int base_name_len = SBYTES (SYMBOL_NAME (base));
8445   char *buf = (char *) alloca (base_name_len + 6);
8446   int i;
8447
8448   bcopy (SDATA (SYMBOL_NAME (base)), buf, base_name_len);
8449   subsidiaries = Fmake_vector (make_number (3), Qnil);
8450   for (i = 0; i < 3; i++)
8451     {
8452       bcopy (suffixes[i], buf + base_name_len, strlen (suffixes[i]) + 1);
8453       ASET (subsidiaries, i, intern (buf));
8454     }
8455   return subsidiaries;
8456 }
8457
8458
8459 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
8460        Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
8461        doc: /* For internal use only.
8462 usage: (define-coding-system-internal ...)  */)
8463      (nargs, args)
8464      int nargs;
8465      Lisp_Object *args;
8466 {
8467   Lisp_Object name;
8468   Lisp_Object spec_vec;         /* [ ATTRS ALIASE EOL_TYPE ] */
8469   Lisp_Object attrs;            /* Vector of attributes.  */
8470   Lisp_Object eol_type;
8471   Lisp_Object aliases;
8472   Lisp_Object coding_type, charset_list, safe_charsets;
8473   enum coding_category category;
8474   Lisp_Object tail, val;
8475   int max_charset_id = 0;
8476   int i;
8477
8478   if (nargs < coding_arg_max)
8479     goto short_args;
8480
8481   attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
8482
8483   name = args[coding_arg_name];
8484   CHECK_SYMBOL (name);
8485   CODING_ATTR_BASE_NAME (attrs) = name;
8486
8487   val = args[coding_arg_mnemonic];
8488   if (! STRINGP (val))
8489     CHECK_CHARACTER (val);
8490   CODING_ATTR_MNEMONIC (attrs) = val;
8491
8492   coding_type = args[coding_arg_coding_type];
8493   CHECK_SYMBOL (coding_type);
8494   CODING_ATTR_TYPE (attrs) = coding_type;
8495
8496   charset_list = args[coding_arg_charset_list];
8497   if (SYMBOLP (charset_list))
8498     {
8499       if (EQ (charset_list, Qiso_2022))
8500         {
8501           if (! EQ (coding_type, Qiso_2022))
8502             error ("Invalid charset-list");
8503           charset_list = Viso_2022_charset_list;
8504         }
8505       else if (EQ (charset_list, Qemacs_mule))
8506         {
8507           if (! EQ (coding_type, Qemacs_mule))
8508             error ("Invalid charset-list");
8509           charset_list = Vemacs_mule_charset_list;
8510         }
8511       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
8512         if (max_charset_id < XFASTINT (XCAR (tail)))
8513           max_charset_id = XFASTINT (XCAR (tail));
8514     }
8515   else
8516     {
8517       charset_list = Fcopy_sequence (charset_list);
8518       for (tail = charset_list; !NILP (tail); tail = Fcdr (tail))
8519         {
8520           struct charset *charset;
8521
8522           val = Fcar (tail);
8523           CHECK_CHARSET_GET_CHARSET (val, charset);
8524           if (EQ (coding_type, Qiso_2022)
8525               ? CHARSET_ISO_FINAL (charset) < 0
8526               : EQ (coding_type, Qemacs_mule)
8527               ? CHARSET_EMACS_MULE_ID (charset) < 0
8528               : 0)
8529             error ("Can't handle charset `%s'",
8530                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8531
8532           XSETCAR (tail, make_number (charset->id));
8533           if (max_charset_id < charset->id)
8534             max_charset_id = charset->id;
8535         }
8536     }
8537   CODING_ATTR_CHARSET_LIST (attrs) = charset_list;
8538
8539   safe_charsets = Fmake_string (make_number (max_charset_id + 1),
8540                                 make_number (255));
8541   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
8542     SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
8543   CODING_ATTR_SAFE_CHARSETS (attrs) = safe_charsets;
8544
8545   CODING_ATTR_ASCII_COMPAT (attrs) = args[coding_arg_ascii_compatible_p];
8546
8547   val = args[coding_arg_decode_translation_table];
8548   if (! CHAR_TABLE_P (val) && ! CONSP (val))
8549     CHECK_SYMBOL (val);
8550   CODING_ATTR_DECODE_TBL (attrs) = val;
8551
8552   val = args[coding_arg_encode_translation_table];
8553   if (! CHAR_TABLE_P (val) && ! CONSP (val))
8554     CHECK_SYMBOL (val);
8555   CODING_ATTR_ENCODE_TBL (attrs) = val;
8556
8557   val = args[coding_arg_post_read_conversion];
8558   CHECK_SYMBOL (val);
8559   CODING_ATTR_POST_READ (attrs) = val;
8560
8561   val = args[coding_arg_pre_write_conversion];
8562   CHECK_SYMBOL (val);
8563   CODING_ATTR_PRE_WRITE (attrs) = val;
8564
8565   val = args[coding_arg_default_char];
8566   if (NILP (val))
8567     CODING_ATTR_DEFAULT_CHAR (attrs) = make_number (' ');
8568   else
8569     {
8570       CHECK_CHARACTER (val);
8571       CODING_ATTR_DEFAULT_CHAR (attrs) = val;
8572     }
8573
8574   val = args[coding_arg_for_unibyte];
8575   CODING_ATTR_FOR_UNIBYTE (attrs) = NILP (val) ? Qnil : Qt;
8576
8577   val = args[coding_arg_plist];
8578   CHECK_LIST (val);
8579   CODING_ATTR_PLIST (attrs) = val;
8580
8581   if (EQ (coding_type, Qcharset))
8582     {
8583       /* Generate a lisp vector of 256 elements.  Each element is nil,
8584          integer, or a list of charset IDs.
8585
8586          If Nth element is nil, the byte code N is invalid in this
8587          coding system.
8588
8589          If Nth element is a number NUM, N is the first byte of a
8590          charset whose ID is NUM.
8591
8592          If Nth element is a list of charset IDs, N is the first byte
8593          of one of them.  The list is sorted by dimensions of the
8594          charsets.  A charset of smaller dimension comes firtst. */
8595       val = Fmake_vector (make_number (256), Qnil);
8596
8597       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
8598         {
8599           struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
8600           int dim = CHARSET_DIMENSION (charset);
8601           int idx = (dim - 1) * 4;
8602
8603           if (CHARSET_ASCII_COMPATIBLE_P (charset))
8604             CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8605
8606           for (i = charset->code_space[idx];
8607                i <= charset->code_space[idx + 1]; i++)
8608             {
8609               Lisp_Object tmp, tmp2;
8610               int dim2;
8611
8612               tmp = AREF (val, i);
8613               if (NILP (tmp))
8614                 tmp = XCAR (tail);
8615               else if (NUMBERP (tmp))
8616                 {
8617                   dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
8618                   if (dim < dim2)
8619                     tmp = Fcons (XCAR (tail), Fcons (tmp, Qnil));
8620                   else
8621                     tmp = Fcons (tmp, Fcons (XCAR (tail), Qnil));
8622                 }
8623               else
8624                 {
8625                   for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
8626                     {
8627                       dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
8628                       if (dim < dim2)
8629                         break;
8630                     }
8631                   if (NILP (tmp2))
8632                     tmp = nconc2 (tmp, Fcons (XCAR (tail), Qnil));
8633                   else
8634                     {
8635                       XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
8636                       XSETCAR (tmp2, XCAR (tail));
8637                     }
8638                 }
8639               ASET (val, i, tmp);
8640             }
8641         }
8642       ASET (attrs, coding_attr_charset_valids, val);
8643       category = coding_category_charset;
8644     }
8645   else if (EQ (coding_type, Qccl))
8646     {
8647       Lisp_Object valids;
8648
8649       if (nargs < coding_arg_ccl_max)
8650         goto short_args;
8651
8652       val = args[coding_arg_ccl_decoder];
8653       CHECK_CCL_PROGRAM (val);
8654       if (VECTORP (val))
8655         val = Fcopy_sequence (val);
8656       ASET (attrs, coding_attr_ccl_decoder, val);
8657
8658       val = args[coding_arg_ccl_encoder];
8659       CHECK_CCL_PROGRAM (val);
8660       if (VECTORP (val))
8661         val = Fcopy_sequence (val);
8662       ASET (attrs, coding_attr_ccl_encoder, val);
8663
8664       val = args[coding_arg_ccl_valids];
8665       valids = Fmake_string (make_number (256), make_number (0));
8666       for (tail = val; !NILP (tail); tail = Fcdr (tail))
8667         {
8668           int from, to;
8669
8670           val = Fcar (tail);
8671           if (INTEGERP (val))
8672             {
8673               from = to = XINT (val);
8674               if (from < 0 || from > 255)
8675                 args_out_of_range_3 (val, make_number (0), make_number (255));
8676             }
8677           else
8678             {
8679               CHECK_CONS (val);
8680               CHECK_NATNUM_CAR (val);
8681               CHECK_NATNUM_CDR (val);
8682               from = XINT (XCAR (val));
8683               if (from > 255)
8684                 args_out_of_range_3 (XCAR (val),
8685                                      make_number (0), make_number (255));
8686               to = XINT (XCDR (val));
8687               if (to < from || to > 255)
8688                 args_out_of_range_3 (XCDR (val),
8689                                      XCAR (val), make_number (255));
8690             }
8691           for (i = from; i <= to; i++)
8692             SSET (valids, i, 1);
8693         }
8694       ASET (attrs, coding_attr_ccl_valids, valids);
8695
8696       category = coding_category_ccl;
8697     }
8698   else if (EQ (coding_type, Qutf_16))
8699     {
8700       Lisp_Object bom, endian;
8701
8702       CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
8703
8704       if (nargs < coding_arg_utf16_max)
8705         goto short_args;
8706
8707       bom = args[coding_arg_utf16_bom];
8708       if (! NILP (bom) && ! EQ (bom, Qt))
8709         {
8710           CHECK_CONS (bom);
8711           val = XCAR (bom);
8712           CHECK_CODING_SYSTEM (val);
8713           val = XCDR (bom);
8714           CHECK_CODING_SYSTEM (val);
8715         }
8716       ASET (attrs, coding_attr_utf_16_bom, bom);
8717
8718       endian = args[coding_arg_utf16_endian];
8719       CHECK_SYMBOL (endian);
8720       if (NILP (endian))
8721         endian = Qbig;
8722       else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
8723         error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
8724       ASET (attrs, coding_attr_utf_16_endian, endian);
8725
8726       category = (CONSP (bom)
8727                   ? coding_category_utf_16_auto
8728                   : NILP (bom)
8729                   ? (EQ (endian, Qbig)
8730                      ? coding_category_utf_16_be_nosig
8731                      : coding_category_utf_16_le_nosig)
8732                   : (EQ (endian, Qbig)
8733                      ? coding_category_utf_16_be
8734                      : coding_category_utf_16_le));
8735     }
8736   else if (EQ (coding_type, Qiso_2022))
8737     {
8738       Lisp_Object initial, reg_usage, request, flags;
8739       int i;
8740
8741       if (nargs < coding_arg_iso2022_max)
8742         goto short_args;
8743
8744       initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
8745       CHECK_VECTOR (initial);
8746       for (i = 0; i < 4; i++)
8747         {
8748           val = Faref (initial, make_number (i));
8749           if (! NILP (val))
8750             {
8751               struct charset *charset;
8752
8753               CHECK_CHARSET_GET_CHARSET (val, charset);
8754               ASET (initial, i, make_number (CHARSET_ID (charset)));
8755               if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
8756                 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8757             }
8758           else
8759             ASET (initial, i, make_number (-1));
8760         }
8761
8762       reg_usage = args[coding_arg_iso2022_reg_usage];
8763       CHECK_CONS (reg_usage);
8764       CHECK_NUMBER_CAR (reg_usage);
8765       CHECK_NUMBER_CDR (reg_usage);
8766
8767       request = Fcopy_sequence (args[coding_arg_iso2022_request]);
8768       for (tail = request; ! NILP (tail); tail = Fcdr (tail))
8769         {
8770           int id;
8771           Lisp_Object tmp;
8772
8773           val = Fcar (tail);
8774           CHECK_CONS (val);
8775           tmp = XCAR (val);
8776           CHECK_CHARSET_GET_ID (tmp, id);
8777           CHECK_NATNUM_CDR (val);
8778           if (XINT (XCDR (val)) >= 4)
8779             error ("Invalid graphic register number: %d", XINT (XCDR (val)));
8780           XSETCAR (val, make_number (id));
8781         }
8782
8783       flags = args[coding_arg_iso2022_flags];
8784       CHECK_NATNUM (flags);
8785       i = XINT (flags);
8786       if (EQ (args[coding_arg_charset_list], Qiso_2022))
8787         flags = make_number (i | CODING_ISO_FLAG_FULL_SUPPORT);
8788
8789       ASET (attrs, coding_attr_iso_initial, initial);
8790       ASET (attrs, coding_attr_iso_usage, reg_usage);
8791       ASET (attrs, coding_attr_iso_request, request);
8792       ASET (attrs, coding_attr_iso_flags, flags);
8793       setup_iso_safe_charsets (attrs);
8794
8795       if (i & CODING_ISO_FLAG_SEVEN_BITS)
8796         category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
8797                           | CODING_ISO_FLAG_SINGLE_SHIFT))
8798                     ? coding_category_iso_7_else
8799                     : EQ (args[coding_arg_charset_list], Qiso_2022)
8800                     ? coding_category_iso_7
8801                     : coding_category_iso_7_tight);
8802       else
8803         {
8804           int id = XINT (AREF (initial, 1));
8805
8806           category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
8807                        || EQ (args[coding_arg_charset_list], Qiso_2022)
8808                        || id < 0)
8809                       ? coding_category_iso_8_else
8810                       : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
8811                       ? coding_category_iso_8_1
8812                       : coding_category_iso_8_2);
8813         }
8814       if (category != coding_category_iso_8_1
8815           && category != coding_category_iso_8_2)
8816         CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
8817     }
8818   else if (EQ (coding_type, Qemacs_mule))
8819     {
8820       if (EQ (args[coding_arg_charset_list], Qemacs_mule))
8821         ASET (attrs, coding_attr_emacs_mule_full, Qt);
8822       CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8823       category = coding_category_emacs_mule;
8824     }
8825   else if (EQ (coding_type, Qshift_jis))
8826     {
8827
8828       struct charset *charset;
8829
8830       if (XINT (Flength (charset_list)) != 3
8831           && XINT (Flength (charset_list)) != 4)
8832         error ("There should be three or four charsets");
8833
8834       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8835       if (CHARSET_DIMENSION (charset) != 1)
8836         error ("Dimension of charset %s is not one",
8837                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8838       if (CHARSET_ASCII_COMPATIBLE_P (charset))
8839         CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8840
8841       charset_list = XCDR (charset_list);
8842       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8843       if (CHARSET_DIMENSION (charset) != 1)
8844         error ("Dimension of charset %s is not one",
8845                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8846
8847       charset_list = XCDR (charset_list);
8848       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8849       if (CHARSET_DIMENSION (charset) != 2)
8850         error ("Dimension of charset %s is not two",
8851                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8852
8853       charset_list = XCDR (charset_list);
8854       if (! NILP (charset_list))
8855         {
8856           charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8857           if (CHARSET_DIMENSION (charset) != 2)
8858             error ("Dimension of charset %s is not two",
8859                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8860         }
8861
8862       category = coding_category_sjis;
8863       Vsjis_coding_system = name;
8864     }
8865   else if (EQ (coding_type, Qbig5))
8866     {
8867       struct charset *charset;
8868
8869       if (XINT (Flength (charset_list)) != 2)
8870         error ("There should be just two charsets");
8871
8872       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8873       if (CHARSET_DIMENSION (charset) != 1)
8874         error ("Dimension of charset %s is not one",
8875                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8876       if (CHARSET_ASCII_COMPATIBLE_P (charset))
8877         CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8878
8879       charset_list = XCDR (charset_list);
8880       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8881       if (CHARSET_DIMENSION (charset) != 2)
8882         error ("Dimension of charset %s is not two",
8883                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8884
8885       category = coding_category_big5;
8886       Vbig5_coding_system = name;
8887     }
8888   else if (EQ (coding_type, Qraw_text))
8889     {
8890       category = coding_category_raw_text;
8891       CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8892     }
8893   else if (EQ (coding_type, Qutf_8))
8894     {
8895       category = coding_category_utf_8;
8896       CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8897     }
8898   else if (EQ (coding_type, Qundecided))
8899     category = coding_category_undecided;
8900   else
8901     error ("Invalid coding system type: %s",
8902            SDATA (SYMBOL_NAME (coding_type)));
8903
8904   CODING_ATTR_CATEGORY (attrs) = make_number (category);
8905   CODING_ATTR_PLIST (attrs)
8906     = Fcons (QCcategory, Fcons (AREF (Vcoding_category_table, category),
8907                                 CODING_ATTR_PLIST (attrs)));
8908   CODING_ATTR_PLIST (attrs)
8909     = Fcons (QCascii_compatible_p,
8910              Fcons (CODING_ATTR_ASCII_COMPAT (attrs),
8911                     CODING_ATTR_PLIST (attrs)));
8912
8913   eol_type = args[coding_arg_eol_type];
8914   if (! NILP (eol_type)
8915       && ! EQ (eol_type, Qunix)
8916       && ! EQ (eol_type, Qdos)
8917       && ! EQ (eol_type, Qmac))
8918     error ("Invalid eol-type");
8919
8920   aliases = Fcons (name, Qnil);
8921
8922   if (NILP (eol_type))
8923     {
8924       eol_type = make_subsidiaries (name);
8925       for (i = 0; i < 3; i++)
8926         {
8927           Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
8928
8929           this_name = AREF (eol_type, i);
8930           this_aliases = Fcons (this_name, Qnil);
8931           this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
8932           this_spec = Fmake_vector (make_number (3), attrs);
8933           ASET (this_spec, 1, this_aliases);
8934           ASET (this_spec, 2, this_eol_type);
8935           Fputhash (this_name, this_spec, Vcoding_system_hash_table);
8936           Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
8937           val = Fassoc (Fsymbol_name (this_name), Vcoding_system_alist);
8938           if (NILP (val))
8939             Vcoding_system_alist
8940               = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
8941                        Vcoding_system_alist);
8942         }
8943     }
8944
8945   spec_vec = Fmake_vector (make_number (3), attrs);
8946   ASET (spec_vec, 1, aliases);
8947   ASET (spec_vec, 2, eol_type);
8948
8949   Fputhash (name, spec_vec, Vcoding_system_hash_table);
8950   Vcoding_system_list = Fcons (name, Vcoding_system_list);
8951   val = Fassoc (Fsymbol_name (name), Vcoding_system_alist);
8952   if (NILP (val))
8953     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
8954                                   Vcoding_system_alist);
8955
8956   {
8957     int id = coding_categories[category].id;
8958
8959     if (id < 0 || EQ (name, CODING_ID_NAME (id)))
8960       setup_coding_system (name, &coding_categories[category]);
8961   }
8962
8963   return Qnil;
8964
8965  short_args:
8966   return Fsignal (Qwrong_number_of_arguments,
8967                   Fcons (intern ("define-coding-system-internal"),
8968                          make_number (nargs)));
8969 }
8970
8971
8972 DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
8973        3, 3, 0,
8974        doc: /* Change value in CODING-SYSTEM's property list PROP to VAL.  */)
8975   (coding_system, prop, val)
8976      Lisp_Object coding_system, prop, val;
8977 {
8978   Lisp_Object spec, attrs;
8979
8980   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
8981   attrs = AREF (spec, 0);
8982   if (EQ (prop, QCmnemonic))
8983     {
8984       if (! STRINGP (val))
8985         CHECK_CHARACTER (val);
8986       CODING_ATTR_MNEMONIC (attrs) = val;
8987     }
8988   else if (EQ (prop, QCdefalut_char))
8989     {
8990       if (NILP (val))
8991         val = make_number (' ');
8992       else
8993         CHECK_CHARACTER (val);
8994       CODING_ATTR_DEFAULT_CHAR (attrs) = val;
8995     }
8996   else if (EQ (prop, QCdecode_translation_table))
8997     {
8998       if (! CHAR_TABLE_P (val) && ! CONSP (val))
8999         CHECK_SYMBOL (val);
9000       CODING_ATTR_DECODE_TBL (attrs) = val;
9001     }
9002   else if (EQ (prop, QCencode_translation_table))
9003     {
9004       if (! CHAR_TABLE_P (val) && ! CONSP (val))
9005         CHECK_SYMBOL (val);
9006       CODING_ATTR_ENCODE_TBL (attrs) = val;
9007     }
9008   else if (EQ (prop, QCpost_read_conversion))
9009     {
9010       CHECK_SYMBOL (val);
9011       CODING_ATTR_POST_READ (attrs) = val;
9012     }
9013   else if (EQ (prop, QCpre_write_conversion))
9014     {
9015       CHECK_SYMBOL (val);
9016       CODING_ATTR_PRE_WRITE (attrs) = val;
9017     }
9018   else if (EQ (prop, QCascii_compatible_p))
9019     {
9020       CODING_ATTR_ASCII_COMPAT (attrs) = val;
9021     }
9022
9023   CODING_ATTR_PLIST (attrs)
9024     = Fplist_put (CODING_ATTR_PLIST (attrs), prop, val);
9025   return val;
9026 }
9027
9028
9029 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
9030        Sdefine_coding_system_alias, 2, 2, 0,
9031        doc: /* Define ALIAS as an alias for CODING-SYSTEM.  */)
9032      (alias, coding_system)
9033      Lisp_Object alias, coding_system;
9034 {
9035   Lisp_Object spec, aliases, eol_type, val;
9036
9037   CHECK_SYMBOL (alias);
9038   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
9039   aliases = AREF (spec, 1);
9040   /* ALISES should be a list of length more than zero, and the first
9041      element is a base coding system.  Append ALIAS at the tail of the
9042      list.  */
9043   while (!NILP (XCDR (aliases)))
9044     aliases = XCDR (aliases);
9045   XSETCDR (aliases, Fcons (alias, Qnil));
9046
9047   eol_type = AREF (spec, 2);
9048   if (VECTORP (eol_type))
9049     {
9050       Lisp_Object subsidiaries;
9051       int i;
9052
9053       subsidiaries = make_subsidiaries (alias);
9054       for (i = 0; i < 3; i++)
9055         Fdefine_coding_system_alias (AREF (subsidiaries, i),
9056                                      AREF (eol_type, i));
9057     }
9058
9059   Fputhash (alias, spec, Vcoding_system_hash_table);
9060   Vcoding_system_list = Fcons (alias, Vcoding_system_list);
9061   val = Fassoc (Fsymbol_name (alias), Vcoding_system_alist);
9062   if (NILP (val))
9063     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
9064                                   Vcoding_system_alist);
9065
9066   return Qnil;
9067 }
9068
9069 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
9070        1, 1, 0,
9071        doc: /* Return the base of CODING-SYSTEM.
9072 Any alias or subsidiary coding system is not a base coding system.  */)
9073   (coding_system)
9074      Lisp_Object coding_system;
9075 {
9076   Lisp_Object spec, attrs;
9077
9078   if (NILP (coding_system))
9079     return (Qno_conversion);
9080   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
9081   attrs = AREF (spec, 0);
9082   return CODING_ATTR_BASE_NAME (attrs);
9083 }
9084
9085 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
9086        1, 1, 0,
9087        doc: "Return the property list of CODING-SYSTEM.")
9088      (coding_system)
9089      Lisp_Object coding_system;
9090 {
9091   Lisp_Object spec, attrs;
9092
9093   if (NILP (coding_system))
9094     coding_system = Qno_conversion;
9095   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
9096   attrs = AREF (spec, 0);
9097   return CODING_ATTR_PLIST (attrs);
9098 }
9099
9100
9101 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
9102        1, 1, 0,
9103        doc: /* Return the list of aliases of CODING-SYSTEM.  */)
9104      (coding_system)
9105      Lisp_Object coding_system;
9106 {
9107   Lisp_Object spec;
9108
9109   if (NILP (coding_system))
9110     coding_system = Qno_conversion;
9111   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
9112   return AREF (spec, 1);
9113 }
9114
9115 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
9116        Scoding_system_eol_type, 1, 1, 0,
9117        doc: /* Return eol-type of CODING-SYSTEM.
9118 An eol-type is integer 0, 1, 2, or a vector of coding systems.
9119
9120 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
9121 and CR respectively.
9122
9123 A vector value indicates that a format of end-of-line should be
9124 detected automatically.  Nth element of the vector is the subsidiary
9125 coding system whose eol-type is N.  */)
9126      (coding_system)
9127      Lisp_Object coding_system;
9128 {
9129   Lisp_Object spec, eol_type;
9130   int n;
9131
9132   if (NILP (coding_system))
9133     coding_system = Qno_conversion;
9134   if (! CODING_SYSTEM_P (coding_system))
9135     return Qnil;
9136   spec = CODING_SYSTEM_SPEC (coding_system);
9137   eol_type = AREF (spec, 2);
9138   if (VECTORP (eol_type))
9139     return Fcopy_sequence (eol_type);
9140   n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
9141   return make_number (n);
9142 }
9143
9144 #endif /* emacs */
9145
9146 \f
9147 /*** 12. Postamble ***/
9148
9149 void
9150 init_coding_once ()
9151 {
9152   int i;
9153
9154   for (i = 0; i < coding_category_max; i++)
9155     {
9156       coding_categories[i].id = -1;
9157       coding_priorities[i] = i;
9158     }
9159
9160   /* ISO2022 specific initialize routine.  */
9161   for (i = 0; i < 0x20; i++)
9162     iso_code_class[i] = ISO_control_0;
9163   for (i = 0x21; i < 0x7F; i++)
9164     iso_code_class[i] = ISO_graphic_plane_0;
9165   for (i = 0x80; i < 0xA0; i++)
9166     iso_code_class[i] = ISO_control_1;
9167   for (i = 0xA1; i < 0xFF; i++)
9168     iso_code_class[i] = ISO_graphic_plane_1;
9169   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
9170   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
9171   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
9172   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
9173   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
9174   iso_code_class[ISO_CODE_ESC] = ISO_escape;
9175   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
9176   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
9177   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
9178
9179   for (i = 0; i < 256; i++)
9180     {
9181       emacs_mule_bytes[i] = 1;
9182     }
9183   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
9184   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
9185   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
9186   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
9187 }
9188
9189 #ifdef emacs
9190
9191 void
9192 syms_of_coding ()
9193 {
9194   staticpro (&Vcoding_system_hash_table);
9195   {
9196     Lisp_Object args[2];
9197     args[0] = QCtest;
9198     args[1] = Qeq;
9199     Vcoding_system_hash_table = Fmake_hash_table (2, args);
9200   }
9201
9202   staticpro (&Vsjis_coding_system);
9203   Vsjis_coding_system = Qnil;
9204
9205   staticpro (&Vbig5_coding_system);
9206   Vbig5_coding_system = Qnil;
9207
9208   staticpro (&Vcode_conversion_reused_workbuf);
9209   Vcode_conversion_reused_workbuf = Qnil;
9210
9211   staticpro (&Vcode_conversion_workbuf_name);
9212   Vcode_conversion_workbuf_name = build_string (" *code-conversion-work*");
9213
9214   reused_workbuf_in_use = 0;
9215
9216   DEFSYM (Qcharset, "charset");
9217   DEFSYM (Qtarget_idx, "target-idx");
9218   DEFSYM (Qcoding_system_history, "coding-system-history");
9219   Fset (Qcoding_system_history, Qnil);
9220
9221   /* Target FILENAME is the first argument.  */
9222   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
9223   /* Target FILENAME is the third argument.  */
9224   Fput (Qwrite_region, Qtarget_idx, make_number (2));
9225
9226   DEFSYM (Qcall_process, "call-process");
9227   /* Target PROGRAM is the first argument.  */
9228   Fput (Qcall_process, Qtarget_idx, make_number (0));
9229
9230   DEFSYM (Qcall_process_region, "call-process-region");
9231   /* Target PROGRAM is the third argument.  */
9232   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
9233
9234   DEFSYM (Qstart_process, "start-process");
9235   /* Target PROGRAM is the third argument.  */
9236   Fput (Qstart_process, Qtarget_idx, make_number (2));
9237
9238   DEFSYM (Qopen_network_stream, "open-network-stream");
9239   /* Target SERVICE is the fourth argument.  */
9240   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
9241
9242   DEFSYM (Qcoding_system, "coding-system");
9243   DEFSYM (Qcoding_aliases, "coding-aliases");
9244
9245   DEFSYM (Qeol_type, "eol-type");
9246   DEFSYM (Qunix, "unix");
9247   DEFSYM (Qdos, "dos");
9248
9249   DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
9250   DEFSYM (Qpost_read_conversion, "post-read-conversion");
9251   DEFSYM (Qpre_write_conversion, "pre-write-conversion");
9252   DEFSYM (Qdefault_char, "default-char");
9253   DEFSYM (Qundecided, "undecided");
9254   DEFSYM (Qno_conversion, "no-conversion");
9255   DEFSYM (Qraw_text, "raw-text");
9256
9257   DEFSYM (Qiso_2022, "iso-2022");
9258
9259   DEFSYM (Qutf_8, "utf-8");
9260   DEFSYM (Qutf_8_emacs, "utf-8-emacs");
9261
9262   DEFSYM (Qutf_16, "utf-16");
9263   DEFSYM (Qbig, "big");
9264   DEFSYM (Qlittle, "little");
9265
9266   DEFSYM (Qshift_jis, "shift-jis");
9267   DEFSYM (Qbig5, "big5");
9268
9269   DEFSYM (Qcoding_system_p, "coding-system-p");
9270
9271   DEFSYM (Qcoding_system_error, "coding-system-error");
9272   Fput (Qcoding_system_error, Qerror_conditions,
9273         Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
9274   Fput (Qcoding_system_error, Qerror_message,
9275         build_string ("Invalid coding system"));
9276
9277   /* Intern this now in case it isn't already done.
9278      Setting this variable twice is harmless.
9279      But don't staticpro it here--that is done in alloc.c.  */
9280   Qchar_table_extra_slots = intern ("char-table-extra-slots");
9281
9282   DEFSYM (Qtranslation_table, "translation-table");
9283   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
9284   DEFSYM (Qtranslation_table_id, "translation-table-id");
9285   DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
9286   DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
9287
9288   DEFSYM (Qvalid_codes, "valid-codes");
9289
9290   DEFSYM (Qemacs_mule, "emacs-mule");
9291
9292   DEFSYM (QCcategory, ":category");
9293   DEFSYM (QCmnemonic, ":mnemonic");
9294   DEFSYM (QCdefalut_char, ":default-char");
9295   DEFSYM (QCdecode_translation_table, ":decode-translation-table");
9296   DEFSYM (QCencode_translation_table, ":encode-translation-table");
9297   DEFSYM (QCpost_read_conversion, ":post-read-conversion");
9298   DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
9299   DEFSYM (QCascii_compatible_p, ":ascii-compatible-p");
9300
9301   Vcoding_category_table
9302     = Fmake_vector (make_number (coding_category_max), Qnil);
9303   staticpro (&Vcoding_category_table);
9304   /* Followings are target of code detection.  */
9305   ASET (Vcoding_category_table, coding_category_iso_7,
9306         intern ("coding-category-iso-7"));
9307   ASET (Vcoding_category_table, coding_category_iso_7_tight,
9308         intern ("coding-category-iso-7-tight"));
9309   ASET (Vcoding_category_table, coding_category_iso_8_1,
9310         intern ("coding-category-iso-8-1"));
9311   ASET (Vcoding_category_table, coding_category_iso_8_2,
9312         intern ("coding-category-iso-8-2"));
9313   ASET (Vcoding_category_table, coding_category_iso_7_else,
9314         intern ("coding-category-iso-7-else"));
9315   ASET (Vcoding_category_table, coding_category_iso_8_else,
9316         intern ("coding-category-iso-8-else"));
9317   ASET (Vcoding_category_table, coding_category_utf_8,
9318         intern ("coding-category-utf-8"));
9319   ASET (Vcoding_category_table, coding_category_utf_16_be,
9320         intern ("coding-category-utf-16-be"));
9321   ASET (Vcoding_category_table, coding_category_utf_16_auto,
9322         intern ("coding-category-utf-16-auto"));
9323   ASET (Vcoding_category_table, coding_category_utf_16_le,
9324         intern ("coding-category-utf-16-le"));
9325   ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
9326         intern ("coding-category-utf-16-be-nosig"));
9327   ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
9328         intern ("coding-category-utf-16-le-nosig"));
9329   ASET (Vcoding_category_table, coding_category_charset,
9330         intern ("coding-category-charset"));
9331   ASET (Vcoding_category_table, coding_category_sjis,
9332         intern ("coding-category-sjis"));
9333   ASET (Vcoding_category_table, coding_category_big5,
9334         intern ("coding-category-big5"));
9335   ASET (Vcoding_category_table, coding_category_ccl,
9336         intern ("coding-category-ccl"));
9337   ASET (Vcoding_category_table, coding_category_emacs_mule,
9338         intern ("coding-category-emacs-mule"));
9339   /* Followings are NOT target of code detection.  */
9340   ASET (Vcoding_category_table, coding_category_raw_text,
9341         intern ("coding-category-raw-text"));
9342   ASET (Vcoding_category_table, coding_category_undecided,
9343         intern ("coding-category-undecided"));
9344
9345   DEFSYM (Qinsufficient_source, "insufficient-source");
9346   DEFSYM (Qinconsistent_eol, "inconsistent-eol");
9347   DEFSYM (Qinvalid_source, "invalid-source");
9348   DEFSYM (Qinterrupted, "interrupted");
9349   DEFSYM (Qinsufficient_memory, "insufficient-memory");
9350   DEFSYM (Qcoding_system_define_form, "coding-system-define-form");
9351
9352   defsubr (&Scoding_system_p);
9353   defsubr (&Sread_coding_system);
9354   defsubr (&Sread_non_nil_coding_system);
9355   defsubr (&Scheck_coding_system);
9356   defsubr (&Sdetect_coding_region);
9357   defsubr (&Sdetect_coding_string);
9358   defsubr (&Sfind_coding_systems_region_internal);
9359   defsubr (&Sunencodable_char_position);
9360   defsubr (&Scheck_coding_systems_region);
9361   defsubr (&Sdecode_coding_region);
9362   defsubr (&Sencode_coding_region);
9363   defsubr (&Sdecode_coding_string);
9364   defsubr (&Sencode_coding_string);
9365   defsubr (&Sdecode_sjis_char);
9366   defsubr (&Sencode_sjis_char);
9367   defsubr (&Sdecode_big5_char);
9368   defsubr (&Sencode_big5_char);
9369   defsubr (&Sset_terminal_coding_system_internal);
9370   defsubr (&Sset_safe_terminal_coding_system_internal);
9371   defsubr (&Sterminal_coding_system);
9372   defsubr (&Sset_keyboard_coding_system_internal);
9373   defsubr (&Skeyboard_coding_system);
9374   defsubr (&Sfind_operation_coding_system);
9375   defsubr (&Sset_coding_system_priority);
9376   defsubr (&Sdefine_coding_system_internal);
9377   defsubr (&Sdefine_coding_system_alias);
9378   defsubr (&Scoding_system_put);
9379   defsubr (&Scoding_system_base);
9380   defsubr (&Scoding_system_plist);
9381   defsubr (&Scoding_system_aliases);
9382   defsubr (&Scoding_system_eol_type);
9383   defsubr (&Scoding_system_priority_list);
9384
9385   DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
9386                doc: /* List of coding systems.
9387
9388 Do not alter the value of this variable manually.  This variable should be
9389 updated by the functions `define-coding-system' and
9390 `define-coding-system-alias'.  */);
9391   Vcoding_system_list = Qnil;
9392
9393   DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
9394                doc: /* Alist of coding system names.
9395 Each element is one element list of coding system name.
9396 This variable is given to `completing-read' as TABLE argument.
9397
9398 Do not alter the value of this variable manually.  This variable should be
9399 updated by the functions `make-coding-system' and
9400 `define-coding-system-alias'.  */);
9401   Vcoding_system_alist = Qnil;
9402
9403   DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
9404                doc: /* List of coding-categories (symbols) ordered by priority.
9405
9406 On detecting a coding system, Emacs tries code detection algorithms
9407 associated with each coding-category one by one in this order.  When
9408 one algorithm agrees with a byte sequence of source text, the coding
9409 system bound to the corresponding coding-category is selected.
9410
9411 Don't modify this variable directly, but use `set-coding-priority'.  */);
9412   {
9413     int i;
9414
9415     Vcoding_category_list = Qnil;
9416     for (i = coding_category_max - 1; i >= 0; i--)
9417       Vcoding_category_list
9418         = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
9419                  Vcoding_category_list);
9420   }
9421
9422   DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
9423                doc: /* Specify the coding system for read operations.
9424 It is useful to bind this variable with `let', but do not set it globally.
9425 If the value is a coding system, it is used for decoding on read operation.
9426 If not, an appropriate element is used from one of the coding system alists:
9427 There are three such tables, `file-coding-system-alist',
9428 `process-coding-system-alist', and `network-coding-system-alist'.  */);
9429   Vcoding_system_for_read = Qnil;
9430
9431   DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
9432                doc: /* Specify the coding system for write operations.
9433 Programs bind this variable with `let', but you should not set it globally.
9434 If the value is a coding system, it is used for encoding of output,
9435 when writing it to a file and when sending it to a file or subprocess.
9436
9437 If this does not specify a coding system, an appropriate element
9438 is used from one of the coding system alists:
9439 There are three such tables, `file-coding-system-alist',
9440 `process-coding-system-alist', and `network-coding-system-alist'.
9441 For output to files, if the above procedure does not specify a coding system,
9442 the value of `buffer-file-coding-system' is used.  */);
9443   Vcoding_system_for_write = Qnil;
9444
9445   DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
9446                doc: /*
9447 Coding system used in the latest file or process I/O.  */);
9448   Vlast_coding_system_used = Qnil;
9449
9450   DEFVAR_LISP ("last-code-conversion-error", &Vlast_code_conversion_error,
9451                doc: /*
9452 Error status of the last code conversion.
9453
9454 When an error was detected in the last code conversion, this variable
9455 is set to one of the following symbols.
9456   `insufficient-source'
9457   `inconsistent-eol'
9458   `invalid-source'
9459   `interrupted'
9460   `insufficient-memory'
9461 When no error was detected, the value doesn't change.  So, to check
9462 the error status of a code conversion by this variable, you must
9463 explicitly set this variable to nil before performing code
9464 conversion.  */);
9465   Vlast_code_conversion_error = Qnil;
9466
9467   DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
9468                doc: /*
9469 *Non-nil means always inhibit code conversion of end-of-line format.
9470 See info node `Coding Systems' and info node `Text and Binary' concerning
9471 such conversion.  */);
9472   inhibit_eol_conversion = 0;
9473
9474   DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
9475                doc: /*
9476 Non-nil means process buffer inherits coding system of process output.
9477 Bind it to t if the process output is to be treated as if it were a file
9478 read from some filesystem.  */);
9479   inherit_process_coding_system = 0;
9480
9481   DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
9482                doc: /*
9483 Alist to decide a coding system to use for a file I/O operation.
9484 The format is ((PATTERN . VAL) ...),
9485 where PATTERN is a regular expression matching a file name,
9486 VAL is a coding system, a cons of coding systems, or a function symbol.
9487 If VAL is a coding system, it is used for both decoding and encoding
9488 the file contents.
9489 If VAL is a cons of coding systems, the car part is used for decoding,
9490 and the cdr part is used for encoding.
9491 If VAL is a function symbol, the function must return a coding system
9492 or a cons of coding systems which are used as above.  The function gets
9493 the arguments with which `find-operation-coding-system' was called.
9494
9495 See also the function `find-operation-coding-system'
9496 and the variable `auto-coding-alist'.  */);
9497   Vfile_coding_system_alist = Qnil;
9498
9499   DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
9500                doc: /*
9501 Alist to decide a coding system to use for a process I/O operation.
9502 The format is ((PATTERN . VAL) ...),
9503 where PATTERN is a regular expression matching a program name,
9504 VAL is a coding system, a cons of coding systems, or a function symbol.
9505 If VAL is a coding system, it is used for both decoding what received
9506 from the program and encoding what sent to the program.
9507 If VAL is a cons of coding systems, the car part is used for decoding,
9508 and the cdr part is used for encoding.
9509 If VAL is a function symbol, the function must return a coding system
9510 or a cons of coding systems which are used as above.
9511
9512 See also the function `find-operation-coding-system'.  */);
9513   Vprocess_coding_system_alist = Qnil;
9514
9515   DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
9516                doc: /*
9517 Alist to decide a coding system to use for a network I/O operation.
9518 The format is ((PATTERN . VAL) ...),
9519 where PATTERN is a regular expression matching a network service name
9520 or is a port number to connect to,
9521 VAL is a coding system, a cons of coding systems, or a function symbol.
9522 If VAL is a coding system, it is used for both decoding what received
9523 from the network stream and encoding what sent to the network stream.
9524 If VAL is a cons of coding systems, the car part is used for decoding,
9525 and the cdr part is used for encoding.
9526 If VAL is a function symbol, the function must return a coding system
9527 or a cons of coding systems which are used as above.
9528
9529 See also the function `find-operation-coding-system'.  */);
9530   Vnetwork_coding_system_alist = Qnil;
9531
9532   DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
9533                doc: /* Coding system to use with system messages.
9534 Also used for decoding keyboard input on X Window system.  */);
9535   Vlocale_coding_system = Qnil;
9536
9537   /* The eol mnemonics are reset in startup.el system-dependently.  */
9538   DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
9539                doc: /*
9540 *String displayed in mode line for UNIX-like (LF) end-of-line format.  */);
9541   eol_mnemonic_unix = build_string (":");
9542
9543   DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
9544                doc: /*
9545 *String displayed in mode line for DOS-like (CRLF) end-of-line format.  */);
9546   eol_mnemonic_dos = build_string ("\\");
9547
9548   DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
9549                doc: /*
9550 *String displayed in mode line for MAC-like (CR) end-of-line format.  */);
9551   eol_mnemonic_mac = build_string ("/");
9552
9553   DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
9554                doc: /*
9555 *String displayed in mode line when end-of-line format is not yet determined.  */);
9556   eol_mnemonic_undecided = build_string (":");
9557
9558   DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
9559                doc: /*
9560 *Non-nil enables character translation while encoding and decoding.  */);
9561   Venable_character_translation = Qt;
9562
9563   DEFVAR_LISP ("standard-translation-table-for-decode",
9564                &Vstandard_translation_table_for_decode,
9565                doc: /* Table for translating characters while decoding.  */);
9566   Vstandard_translation_table_for_decode = Qnil;
9567
9568   DEFVAR_LISP ("standard-translation-table-for-encode",
9569                &Vstandard_translation_table_for_encode,
9570                doc: /* Table for translating characters while encoding.  */);
9571   Vstandard_translation_table_for_encode = Qnil;
9572
9573   DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_table,
9574                doc: /* Alist of charsets vs revision numbers.
9575 While encoding, if a charset (car part of an element) is found,
9576 designate it with the escape sequence identifying revision (cdr part
9577 of the element).  */);
9578   Vcharset_revision_table = Qnil;
9579
9580   DEFVAR_LISP ("default-process-coding-system",
9581                &Vdefault_process_coding_system,
9582                doc: /* Cons of coding systems used for process I/O by default.
9583 The car part is used for decoding a process output,
9584 the cdr part is used for encoding a text to be sent to a process.  */);
9585   Vdefault_process_coding_system = Qnil;
9586
9587   DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
9588                doc: /*
9589 Table of extra Latin codes in the range 128..159 (inclusive).
9590 This is a vector of length 256.
9591 If Nth element is non-nil, the existence of code N in a file
9592 \(or output of subprocess) doesn't prevent it to be detected as
9593 a coding system of ISO 2022 variant which has a flag
9594 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
9595 or reading output of a subprocess.
9596 Only 128th through 159th elements has a meaning.  */);
9597   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
9598
9599   DEFVAR_LISP ("select-safe-coding-system-function",
9600                &Vselect_safe_coding_system_function,
9601                doc: /*
9602 Function to call to select safe coding system for encoding a text.
9603
9604 If set, this function is called to force a user to select a proper
9605 coding system which can encode the text in the case that a default
9606 coding system used in each operation can't encode the text.
9607
9608 The default value is `select-safe-coding-system' (which see).  */);
9609   Vselect_safe_coding_system_function = Qnil;
9610
9611   DEFVAR_BOOL ("coding-system-require-warning",
9612                &coding_system_require_warning,
9613                doc: /* Internal use only.
9614 If non-nil, on writing a file, `select-safe-coding-system-function' is
9615 called even if `coding-system-for-write' is non-nil.  The command
9616 `universal-coding-system-argument' binds this variable to t temporarily.  */);
9617   coding_system_require_warning = 0;
9618
9619
9620   DEFVAR_BOOL ("inhibit-iso-escape-detection",
9621                &inhibit_iso_escape_detection,
9622                doc: /*
9623 If non-nil, Emacs ignores ISO2022's escape sequence on code detection.
9624
9625 By default, on reading a file, Emacs tries to detect how the text is
9626 encoded.  This code detection is sensitive to escape sequences.  If
9627 the sequence is valid as ISO2022, the code is determined as one of
9628 the ISO2022 encodings, and the file is decoded by the corresponding
9629 coding system (e.g. `iso-2022-7bit').
9630
9631 However, there may be a case that you want to read escape sequences in
9632 a file as is.  In such a case, you can set this variable to non-nil.
9633 Then, as the code detection ignores any escape sequences, no file is
9634 detected as encoded in some ISO2022 encoding.  The result is that all
9635 escape sequences become visible in a buffer.
9636
9637 The default value is nil, and it is strongly recommended not to change
9638 it.  That is because many Emacs Lisp source files that contain
9639 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
9640 in Emacs's distribution, and they won't be decoded correctly on
9641 reading if you suppress escape sequence detection.
9642
9643 The other way to read escape sequences in a file without decoding is
9644 to explicitly specify some coding system that doesn't use ISO2022's
9645 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument].  */);
9646   inhibit_iso_escape_detection = 0;
9647
9648   DEFVAR_LISP ("translation-table-for-input", &Vtranslation_table_for_input,
9649                doc: /* Char table for translating self-inserting characters.
9650 This is applied to the result of input methods, not their input.  See also
9651 `keyboard-translate-table'.  */);
9652     Vtranslation_table_for_input = Qnil;
9653
9654   {
9655     Lisp_Object args[coding_arg_max];
9656     Lisp_Object plist[16];
9657     int i;
9658
9659     for (i = 0; i < coding_arg_max; i++)
9660       args[i] = Qnil;
9661
9662     plist[0] = intern (":name");
9663     plist[1] = args[coding_arg_name] = Qno_conversion;
9664     plist[2] = intern (":mnemonic");
9665     plist[3] = args[coding_arg_mnemonic] = make_number ('=');
9666     plist[4] = intern (":coding-type");
9667     plist[5] = args[coding_arg_coding_type] = Qraw_text;
9668     plist[6] = intern (":ascii-compatible-p");
9669     plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
9670     plist[8] = intern (":default-char");
9671     plist[9] = args[coding_arg_default_char] = make_number (0);
9672     plist[10] = intern (":for-unibyte");
9673     plist[11] = args[coding_arg_for_unibyte] = Qt;
9674     plist[12] = intern (":docstring");
9675     plist[13] = build_string ("Do no conversion.\n\
9676 \n\
9677 When you visit a file with this coding, the file is read into a\n\
9678 unibyte buffer as is, thus each byte of a file is treated as a\n\
9679 character.");
9680     plist[14] = intern (":eol-type");
9681     plist[15] = args[coding_arg_eol_type] = Qunix;
9682     args[coding_arg_plist] = Flist (16, plist);
9683     Fdefine_coding_system_internal (coding_arg_max, args);
9684
9685     plist[1] = args[coding_arg_name] = Qundecided;
9686     plist[3] = args[coding_arg_mnemonic] = make_number ('-');
9687     plist[5] = args[coding_arg_coding_type] = Qundecided;
9688     /* This is already set.
9689        plist[7] = args[coding_arg_ascii_compatible_p] = Qt; */
9690     plist[8] = intern (":charset-list");
9691     plist[9] = args[coding_arg_charset_list] = Fcons (Qascii, Qnil);
9692     plist[11] = args[coding_arg_for_unibyte] = Qnil;
9693     plist[13] = build_string ("No conversion on encoding, automatic conversion on decoding.");
9694     plist[15] = args[coding_arg_eol_type] = Qnil;
9695     args[coding_arg_plist] = Flist (16, plist);
9696     Fdefine_coding_system_internal (coding_arg_max, args);
9697   }
9698
9699   setup_coding_system (Qno_conversion, &keyboard_coding);
9700   setup_coding_system (Qundecided, &terminal_coding);
9701   setup_coding_system (Qno_conversion, &safe_terminal_coding);
9702
9703   {
9704     int i;
9705
9706     for (i = 0; i < coding_category_max; i++)
9707       Fset (AREF (Vcoding_category_table, i), Qno_conversion);
9708   }
9709 }
9710
9711 char *
9712 emacs_strerror (error_number)
9713      int error_number;
9714 {
9715   char *str;
9716
9717   synchronize_system_messages_locale ();
9718   str = strerror (error_number);
9719
9720   if (! NILP (Vlocale_coding_system))
9721     {
9722       Lisp_Object dec = code_convert_string_norecord (build_string (str),
9723                                                       Vlocale_coding_system,
9724                                                       0);
9725       str = (char *) SDATA (dec);
9726     }
9727
9728   return str;
9729 }
9730
9731 #endif /* emacs */
9732
9733 /* arch-tag: 3a3a2b01-5ff6-4071-9afe-f5b808d9229d
9734    (do not change this comment) */