src/coding.c

   1 /* Coding system handler (conversion, detection, etc).
   2    Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
   3      Licensed to the Free Software Foundation.
   4    Copyright (C) 2001, 2002 Free Software Foundation, Inc.
   5    Copyright (C) 2003
   6      National Institute of Advanced Industrial Science and Technology (AIST)
   7      Registration Number H13PRO009
   8
   9 This file is part of GNU Emacs.
  10
  11 GNU Emacs is free software; you can redistribute it and/or modify
  12 it under the terms of the GNU General Public License as published by
  13 the Free Software Foundation; either version 2, or (at your option)
  14 any later version.
  15
  16 GNU Emacs is distributed in the hope that it will be useful,
  17 but WITHOUT ANY WARRANTY; without even the implied warranty of
  18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  19 GNU General Public License for more details.
  20
  21 You should have received a copy of the GNU General Public License
  22 along with GNU Emacs; see the file COPYING.  If not, write to
  23 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  24 Boston, MA 02111-1307, USA.  */
  25
  26 /*** TABLE OF CONTENTS ***
  27
  28   0. General comments
  29   1. Preamble
  30   2. Emacs' internal format (emacs-utf-8) handlers
  31   3. UTF-8 handlers
  32   4. UTF-16 handlers
  33   5. Charset-base coding systems handlers
  34   6. emacs-mule (old Emacs' internal format) handlers
  35   7. ISO2022 handlers
  36   8. Shift-JIS and BIG5 handlers
  37   9. CCL handlers
  38   10. C library functions
  39   11. Emacs Lisp library functions
  40   12. Postamble
  41
  42 */
  43
  44 /*** 0. General comments ***
  45
  46
  47 CODING SYSTEM
  48
  49   A coding system is an object for an encoding mechanism that contains
  50   information about how to convert byte sequences to character
  51   sequences and vice versa.  When we say "decode", it means converting
  52   a byte sequence of a specific coding system into a character
  53   sequence that is represented by Emacs' internal coding system
  54   `emacs-utf-8', and when we say "encode", it means converting a
  55   character sequence of emacs-utf-8 to a byte sequence of a specific
  56   coding system.
  57
  58   In Emacs Lisp, a coding system is represented by a Lisp symbol.  In
  59   C level, a coding system is represented by a vector of attributes
  60   stored in the hash table Vcharset_hash_table.  The conversion from
  61   coding system symbol to attributes vector is done by looking up
  62   Vcharset_hash_table by the symbol.
  63
  64   Coding systems are classified into the following types depending on
  65   the encoding mechanism.  Here's a brief description of the types.
  66
  67   o UTF-8
  68
  69   o UTF-16
  70
  71   o Charset-base coding system
  72
  73   A coding system defined by one or more (coded) character sets.
  74   Decoding and encoding are done by a code converter defined for each
  75   character set.
  76
  77   o Old Emacs internal format (emacs-mule)
  78
  79   The coding system adopted by old versions of Emacs (20 and 21).
  80
  81   o ISO2022-base coding system
  82
  83   The most famous coding system for multiple character sets.  X's
  84   Compound Text, various EUCs (Extended Unix Code), and coding systems
  85   used in the Internet communication such as ISO-2022-JP are all
  86   variants of ISO2022.
  87
  88   o SJIS (or Shift-JIS or MS-Kanji-Code)
  89
  90   A coding system to encode character sets: ASCII, JISX0201, and
  91   JISX0208.  Widely used for PC's in Japan.  Details are described in
  92   section 8.
  93
  94   o BIG5
  95
  96   A coding system to encode character sets: ASCII and Big5.  Widely
  97   used for Chinese (mainly in Taiwan and Hong Kong).  Details are
  98   described in section 8.  In this file, when we write "big5" (all
  99   lowercase), we mean the coding system, and when we write "Big5"
 100   (capitalized), we mean the character set.
 101
 102   o CCL
 103
 104   If a user wants to decode/encode text encoded in a coding system
 105   not listed above, he can supply a decoder and an encoder for it in
 106   CCL (Code Conversion Language) programs.  Emacs executes the CCL
 107   program while decoding/encoding.
 108
 109   o Raw-text
 110
 111   A coding system for text containing raw eight-bit data.  Emacs
 112   treats each byte of source text as a character (except for
 113   end-of-line conversion).
 114
 115   o No-conversion
 116
 117   Like raw text, but don't do end-of-line conversion.
 118
 119
 120 END-OF-LINE FORMAT
 121
 122   How text end-of-line is encoded depends on operating system.  For
 123   instance, Unix's format is just one byte of LF (line-feed) code,
 124   whereas DOS's format is two-byte sequence of `carriage-return' and
 125   `line-feed' codes.  MacOS's format is usually one byte of
 126   `carriage-return'.
 127
 128   Since text character encoding and end-of-line encoding are
 129   independent, any coding system described above can take any format
 130   of end-of-line (except for no-conversion).
 131
 132 STRUCT CODING_SYSTEM
 133
 134   Before using a coding system for code conversion (i.e. decoding and
 135   encoding), we setup a structure of type `struct coding_system'.
 136   This structure keeps various information about a specific code
 137   conversion (e.g. the location of source and destination data).
 138
 139 */
 140
 141 /* COMMON MACROS */
 142
 143
 144 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 145
 146   These functions check if a byte sequence specified as a source in
 147   CODING conforms to the format of XXX, and update the members of
 148   DETECT_INFO.
 149
 150   Return 1 if the byte sequence conforms to XXX, otherwise return 0.
 151
 152   Below is the template of these functions.  */
 153
 154 #if 0
 155 static int
 156 detect_coding_XXX (coding, detect_info)
 157      struct coding_system *coding;
 158      struct coding_detection_info *detect_info;
 159 {
 160   unsigned char *src = coding->source;
 161   unsigned char *src_end = coding->source + coding->src_bytes;
 162   int multibytep = coding->src_multibyte;
 163   int consumed_chars = 0;
 164   int found = 0;
 165   ...;
 166
 167   while (1)
 168     {
 169       /* Get one byte from the source.  If the souce is exausted, jump
 170          to no_more_source:.  */
 171       ONE_MORE_BYTE (c);
 172
 173       if (! __C_conforms_to_XXX___ (c))
 174         break;
 175       if (! __C_strongly_suggests_XXX__ (c))
 176         found = CATEGORY_MASK_XXX;
 177     }
 178   /* The byte sequence is invalid for XXX.  */
 179   detect_info->rejected |= CATEGORY_MASK_XXX;
 180   return 0;
 181
 182  no_more_source:
 183   /* The source exausted successfully.  */
 184   detect_info->found |= found;
 185   return 1;
 186 }
 187 #endif
 188
 189 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 190
 191   These functions decode a byte sequence specified as a source by
 192   CODING.  The resulting multibyte text goes to a place pointed to by
 193   CODING->charbuf, the length of which should not exceed
 194   CODING->charbuf_size;
 195
 196   These functions set the information of original and decoded texts in
 197   CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
 198   They also set CODING->result to one of CODING_RESULT_XXX indicating
 199   how the decoding is finished.
 200
 201   Below is the template of these functions.  */
 202
 203 #if 0
 204 static void
 205 decode_coding_XXXX (coding)
 206      struct coding_system *coding;
 207 {
 208   unsigned char *src = coding->source + coding->consumed;
 209   unsigned char *src_end = coding->source + coding->src_bytes;
 210   /* SRC_BASE remembers the start position in source in each loop.
 211      The loop will be exited when there's not enough source code, or
 212      when there's no room in CHARBUF for a decoded character.  */
 213   unsigned char *src_base;
 214   /* A buffer to produce decoded characters.  */
 215   int *charbuf = coding->charbuf;
 216   int *charbuf_end = charbuf + coding->charbuf_size;
 217   int multibytep = coding->src_multibyte;
 218
 219   while (1)
 220     {
 221       src_base = src;
 222       if (charbuf < charbuf_end)
 223         /* No more room to produce a decoded character.  */
 224         break;
 225       ONE_MORE_BYTE (c);
 226       /* Decode it. */
 227     }
 228
 229  no_more_source:
 230   if (src_base < src_end
 231       && coding->mode & CODING_MODE_LAST_BLOCK)
 232     /* If the source ends by partial bytes to construct a character,
 233        treat them as eight-bit raw data.  */
 234     while (src_base < src_end && charbuf < charbuf_end)
 235       *charbuf++ = *src_base++;
 236   /* Remember how many bytes and characters we consumed.  If the
 237      source is multibyte, the bytes and chars are not identical.  */
 238   coding->consumed = coding->consumed_char = src_base - coding->source;
 239   /* Remember how many characters we produced.  */
 240   coding->charbuf_used = charbuf - coding->charbuf;
 241 }
 242 #endif
 243
 244 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 245
 246   These functions encode SRC_BYTES length text at SOURCE of Emacs'
 247   internal multibyte format by CODING.  The resulting byte sequence
 248   goes to a place pointed to by DESTINATION, the length of which
 249   should not exceed DST_BYTES.
 250
 251   These functions set the information of original and encoded texts in
 252   the members produced, produced_char, consumed, and consumed_char of
 253   the structure *CODING.  They also set the member result to one of
 254   CODING_RESULT_XXX indicating how the encoding finished.
 255
 256   DST_BYTES zero means that source area and destination area are
 257   overlapped, which means that we can produce a encoded text until it
 258   reaches at the head of not-yet-encoded source text.
 259
 260   Below is a template of these functions.  */
 261 #if 0
 262 static void
 263 encode_coding_XXX (coding)
 264      struct coding_system *coding;
 265 {
 266   int multibytep = coding->dst_multibyte;
 267   int *charbuf = coding->charbuf;
 268   int *charbuf_end = charbuf->charbuf + coding->charbuf_used;
 269   unsigned char *dst = coding->destination + coding->produced;
 270   unsigned char *dst_end = coding->destination + coding->dst_bytes;
 271   unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
 272   int produced_chars = 0;
 273
 274   for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
 275     {
 276       int c = *charbuf;
 277       /* Encode C into DST, and increment DST.  */
 278     }
 279  label_no_more_destination:
 280   /* How many chars and bytes we produced.  */
 281   coding->produced_char += produced_chars;
 282   coding->produced = dst - coding->destination;
 283 }
 284 #endif
 285
 286 \f
 287 /*** 1. Preamble ***/
 288
 289 #include <config.h>
 290 #include <stdio.h>
 291
 292 #include "lisp.h"
 293 #include "buffer.h"
 294 #include "character.h"
 295 #include "charset.h"
 296 #include "ccl.h"
 297 #include "composite.h"
 298 #include "coding.h"
 299 #include "window.h"
 300
 301 Lisp_Object Vcoding_system_hash_table;
 302
 303 Lisp_Object Qcoding_system, Qcoding_aliases, Qeol_type;
 304 Lisp_Object Qunix, Qdos;
 305 extern Lisp_Object Qmac;        /* frame.c */
 306 Lisp_Object Qbuffer_file_coding_system;
 307 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
 308 Lisp_Object Qdefault_char;
 309 Lisp_Object Qno_conversion, Qundecided;
 310 Lisp_Object Qcharset, Qiso_2022, Qutf_8, Qutf_16, Qshift_jis, Qbig5;
 311 Lisp_Object Qbig, Qlittle;
 312 Lisp_Object Qcoding_system_history;
 313 Lisp_Object Qvalid_codes;
 314 Lisp_Object QCcategory;
 315
 316 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
 317 Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
 318 Lisp_Object Qstart_process, Qopen_network_stream;
 319 Lisp_Object Qtarget_idx;
 320
 321 int coding_system_require_warning;
 322
 323 Lisp_Object Vselect_safe_coding_system_function;
 324
 325 /* Mnemonic string for each format of end-of-line.  */
 326 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
 327 /* Mnemonic string to indicate format of end-of-line is not yet
 328    decided.  */
 329 Lisp_Object eol_mnemonic_undecided;
 330
 331 #ifdef emacs
 332
 333 Lisp_Object Vcoding_system_list, Vcoding_system_alist;
 334
 335 Lisp_Object Qcoding_system_p, Qcoding_system_error;
 336
 337 /* Coding system emacs-mule and raw-text are for converting only
 338    end-of-line format.  */
 339 Lisp_Object Qemacs_mule, Qraw_text;
 340 Lisp_Object Qutf_8_emacs;
 341
 342 /* Coding-systems are handed between Emacs Lisp programs and C internal
 343    routines by the following three variables.  */
 344 /* Coding-system for reading files and receiving data from process.  */
 345 Lisp_Object Vcoding_system_for_read;
 346 /* Coding-system for writing files and sending data to process.  */
 347 Lisp_Object Vcoding_system_for_write;
 348 /* Coding-system actually used in the latest I/O.  */
 349 Lisp_Object Vlast_coding_system_used;
 350
 351 /* A vector of length 256 which contains information about special
 352    Latin codes (especially for dealing with Microsoft codes).  */
 353 Lisp_Object Vlatin_extra_code_table;
 354
 355 /* Flag to inhibit code conversion of end-of-line format.  */
 356 int inhibit_eol_conversion;
 357
 358 /* Flag to inhibit ISO2022 escape sequence detection.  */
 359 int inhibit_iso_escape_detection;
 360
 361 /* Flag to make buffer-file-coding-system inherit from process-coding.  */
 362 int inherit_process_coding_system;
 363
 364 /* Coding system to be used to encode text for terminal display.  */
 365 struct coding_system terminal_coding;
 366
 367 /* Coding system to be used to encode text for terminal display when
 368    terminal coding system is nil.  */
 369 struct coding_system safe_terminal_coding;
 370
 371 /* Coding system of what is sent from terminal keyboard.  */
 372 struct coding_system keyboard_coding;
 373
 374 Lisp_Object Vfile_coding_system_alist;
 375 Lisp_Object Vprocess_coding_system_alist;
 376 Lisp_Object Vnetwork_coding_system_alist;
 377
 378 Lisp_Object Vlocale_coding_system;
 379
 380 #endif /* emacs */
 381
 382 /* Flag to tell if we look up translation table on character code
 383    conversion.  */
 384 Lisp_Object Venable_character_translation;
 385 /* Standard translation table to look up on decoding (reading).  */
 386 Lisp_Object Vstandard_translation_table_for_decode;
 387 /* Standard translation table to look up on encoding (writing).  */
 388 Lisp_Object Vstandard_translation_table_for_encode;
 389
 390 Lisp_Object Qtranslation_table;
 391 Lisp_Object Qtranslation_table_id;
 392 Lisp_Object Qtranslation_table_for_decode;
 393 Lisp_Object Qtranslation_table_for_encode;
 394
 395 /* Alist of charsets vs revision number.  */
 396 static Lisp_Object Vcharset_revision_table;
 397
 398 /* Default coding systems used for process I/O.  */
 399 Lisp_Object Vdefault_process_coding_system;
 400
 401 /* Char table for translating Quail and self-inserting input.  */
 402 Lisp_Object Vtranslation_table_for_input;
 403
 404 /* Two special coding systems.  */
 405 Lisp_Object Vsjis_coding_system;
 406 Lisp_Object Vbig5_coding_system;
 407
 408
 409 static int detect_coding_utf_8 P_ ((struct coding_system *,
 410                                     struct coding_detection_info *info));
 411 static void decode_coding_utf_8 P_ ((struct coding_system *));
 412 static int encode_coding_utf_8 P_ ((struct coding_system *));
 413
 414 static int detect_coding_utf_16 P_ ((struct coding_system *,
 415                                      struct coding_detection_info *info));
 416 static void decode_coding_utf_16 P_ ((struct coding_system *));
 417 static int encode_coding_utf_16 P_ ((struct coding_system *));
 418
 419 static int detect_coding_iso_2022 P_ ((struct coding_system *,
 420                                        struct coding_detection_info *info));
 421 static void decode_coding_iso_2022 P_ ((struct coding_system *));
 422 static int encode_coding_iso_2022 P_ ((struct coding_system *));
 423
 424 static int detect_coding_emacs_mule P_ ((struct coding_system *,
 425                                          struct coding_detection_info *info));
 426 static void decode_coding_emacs_mule P_ ((struct coding_system *));
 427 static int encode_coding_emacs_mule P_ ((struct coding_system *));
 428
 429 static int detect_coding_sjis P_ ((struct coding_system *,
 430                                    struct coding_detection_info *info));
 431 static void decode_coding_sjis P_ ((struct coding_system *));
 432 static int encode_coding_sjis P_ ((struct coding_system *));
 433
 434 static int detect_coding_big5 P_ ((struct coding_system *,
 435                                    struct coding_detection_info *info));
 436 static void decode_coding_big5 P_ ((struct coding_system *));
 437 static int encode_coding_big5 P_ ((struct coding_system *));
 438
 439 static int detect_coding_ccl P_ ((struct coding_system *,
 440                                   struct coding_detection_info *info));
 441 static void decode_coding_ccl P_ ((struct coding_system *));
 442 static int encode_coding_ccl P_ ((struct coding_system *));
 443
 444 static void decode_coding_raw_text P_ ((struct coding_system *));
 445 static int encode_coding_raw_text P_ ((struct coding_system *));
 446
 447
 448 /* ISO2022 section */
 449
 450 #define CODING_ISO_INITIAL(coding, reg)                 \
 451   (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id),    \
 452                      coding_attr_iso_initial),          \
 453                reg)))
 454
 455
 456 #define CODING_ISO_REQUEST(coding, charset_id)  \
 457   ((charset_id <= (coding)->max_charset_id      \
 458     ? (coding)->safe_charsets[charset_id]       \
 459     : -1))
 460
 461
 462 #define CODING_ISO_FLAGS(coding)        \
 463   ((coding)->spec.iso_2022.flags)
 464 #define CODING_ISO_DESIGNATION(coding, reg)     \
 465   ((coding)->spec.iso_2022.current_designation[reg])
 466 #define CODING_ISO_INVOCATION(coding, plane)    \
 467   ((coding)->spec.iso_2022.current_invocation[plane])
 468 #define CODING_ISO_SINGLE_SHIFTING(coding)      \
 469   ((coding)->spec.iso_2022.single_shifting)
 470 #define CODING_ISO_BOL(coding)  \
 471   ((coding)->spec.iso_2022.bol)
 472 #define CODING_ISO_INVOKED_CHARSET(coding, plane)       \
 473   CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
 474
 475 /* Control characters of ISO2022.  */
 476                         /* code */      /* function */
 477 #define ISO_CODE_LF     0x0A            /* line-feed */
 478 #define ISO_CODE_CR     0x0D            /* carriage-return */
 479 #define ISO_CODE_SO     0x0E            /* shift-out */
 480 #define ISO_CODE_SI     0x0F            /* shift-in */
 481 #define ISO_CODE_SS2_7  0x19            /* single-shift-2 for 7-bit code */
 482 #define ISO_CODE_ESC    0x1B            /* escape */
 483 #define ISO_CODE_SS2    0x8E            /* single-shift-2 */
 484 #define ISO_CODE_SS3    0x8F            /* single-shift-3 */
 485 #define ISO_CODE_CSI    0x9B            /* control-sequence-introducer */
 486
 487 /* All code (1-byte) of ISO2022 is classified into one of the
 488    followings.  */
 489 enum iso_code_class_type
 490   {
 491     ISO_control_0,              /* Control codes in the range
 492                                    0x00..0x1F and 0x7F, except for the
 493                                    following 5 codes.  */
 494     ISO_shift_out,              /* ISO_CODE_SO (0x0E) */
 495     ISO_shift_in,               /* ISO_CODE_SI (0x0F) */
 496     ISO_single_shift_2_7,       /* ISO_CODE_SS2_7 (0x19) */
 497     ISO_escape,                 /* ISO_CODE_SO (0x1B) */
 498     ISO_control_1,              /* Control codes in the range
 499                                    0x80..0x9F, except for the
 500                                    following 3 codes.  */
 501     ISO_single_shift_2,         /* ISO_CODE_SS2 (0x8E) */
 502     ISO_single_shift_3,         /* ISO_CODE_SS3 (0x8F) */
 503     ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
 504     ISO_0x20_or_0x7F,           /* Codes of the values 0x20 or 0x7F.  */
 505     ISO_graphic_plane_0,        /* Graphic codes in the range 0x21..0x7E.  */
 506     ISO_0xA0_or_0xFF,           /* Codes of the values 0xA0 or 0xFF.  */
 507     ISO_graphic_plane_1         /* Graphic codes in the range 0xA1..0xFE.  */
 508   };
 509
 510 /** The macros CODING_ISO_FLAG_XXX defines a flag bit of the
 511     `iso-flags' attribute of an iso2022 coding system.  */
 512
 513 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
 514    instead of the correct short-form sequence (e.g. ESC $ A).  */
 515 #define CODING_ISO_FLAG_LONG_FORM       0x0001
 516
 517 /* If set, reset graphic planes and registers at end-of-line to the
 518    initial state.  */
 519 #define CODING_ISO_FLAG_RESET_AT_EOL    0x0002
 520
 521 /* If set, reset graphic planes and registers before any control
 522    characters to the initial state.  */
 523 #define CODING_ISO_FLAG_RESET_AT_CNTL   0x0004
 524
 525 /* If set, encode by 7-bit environment.  */
 526 #define CODING_ISO_FLAG_SEVEN_BITS      0x0008
 527
 528 /* If set, use locking-shift function.  */
 529 #define CODING_ISO_FLAG_LOCKING_SHIFT   0x0010
 530
 531 /* If set, use single-shift function.  Overwrite
 532    CODING_ISO_FLAG_LOCKING_SHIFT.  */
 533 #define CODING_ISO_FLAG_SINGLE_SHIFT    0x0020
 534
 535 /* If set, use designation escape sequence.  */
 536 #define CODING_ISO_FLAG_DESIGNATION     0x0040
 537
 538 /* If set, produce revision number sequence.  */
 539 #define CODING_ISO_FLAG_REVISION        0x0080
 540
 541 /* If set, produce ISO6429's direction specifying sequence.  */
 542 #define CODING_ISO_FLAG_DIRECTION       0x0100
 543
 544 /* If set, assume designation states are reset at beginning of line on
 545    output.  */
 546 #define CODING_ISO_FLAG_INIT_AT_BOL     0x0200
 547
 548 /* If set, designation sequence should be placed at beginning of line
 549    on output.  */
 550 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
 551
 552 /* If set, do not encode unsafe charactes on output.  */
 553 #define CODING_ISO_FLAG_SAFE            0x0800
 554
 555 /* If set, extra latin codes (128..159) are accepted as a valid code
 556    on input.  */
 557 #define CODING_ISO_FLAG_LATIN_EXTRA     0x1000
 558
 559 #define CODING_ISO_FLAG_COMPOSITION     0x2000
 560
 561 #define CODING_ISO_FLAG_EUC_TW_SHIFT    0x4000
 562
 563 #define CODING_ISO_FLAG_USE_ROMAN       0x8000
 564
 565 #define CODING_ISO_FLAG_USE_OLDJIS      0x10000
 566
 567 #define CODING_ISO_FLAG_FULL_SUPPORT    0x100000
 568
 569 /* A character to be produced on output if encoding of the original
 570    character is prohibited by CODING_ISO_FLAG_SAFE.  */
 571 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION  '?'
 572
 573
 574 /* UTF-16 section */
 575 #define CODING_UTF_16_BOM(coding)       \
 576   ((coding)->spec.utf_16.bom)
 577
 578 #define CODING_UTF_16_ENDIAN(coding)    \
 579   ((coding)->spec.utf_16.endian)
 580
 581 #define CODING_UTF_16_SURROGATE(coding) \
 582   ((coding)->spec.utf_16.surrogate)
 583
 584
 585 /* CCL section */
 586 #define CODING_CCL_DECODER(coding)      \
 587   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
 588 #define CODING_CCL_ENCODER(coding)      \
 589   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
 590 #define CODING_CCL_VALIDS(coding)                                          \
 591   (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
 592
 593 /* Index for each coding category in `coding_categories' */
 594
 595 enum coding_category
 596   {
 597     coding_category_iso_7,
 598     coding_category_iso_7_tight,
 599     coding_category_iso_8_1,
 600     coding_category_iso_8_2,
 601     coding_category_iso_7_else,
 602     coding_category_iso_8_else,
 603     coding_category_utf_8,
 604     coding_category_utf_16_auto,
 605     coding_category_utf_16_be,
 606     coding_category_utf_16_le,
 607     coding_category_utf_16_be_nosig,
 608     coding_category_utf_16_le_nosig,
 609     coding_category_charset,
 610     coding_category_sjis,
 611     coding_category_big5,
 612     coding_category_ccl,
 613     coding_category_emacs_mule,
 614     /* All above are targets of code detection.  */
 615     coding_category_raw_text,
 616     coding_category_undecided,
 617     coding_category_max
 618   };
 619
 620 /* Definitions of flag bits used in detect_coding_XXXX.  */
 621 #define CATEGORY_MASK_ISO_7             (1 << coding_category_iso_7)
 622 #define CATEGORY_MASK_ISO_7_TIGHT       (1 << coding_category_iso_7_tight)
 623 #define CATEGORY_MASK_ISO_8_1           (1 << coding_category_iso_8_1)
 624 #define CATEGORY_MASK_ISO_8_2           (1 << coding_category_iso_8_2)
 625 #define CATEGORY_MASK_ISO_7_ELSE        (1 << coding_category_iso_7_else)
 626 #define CATEGORY_MASK_ISO_8_ELSE        (1 << coding_category_iso_8_else)
 627 #define CATEGORY_MASK_UTF_8             (1 << coding_category_utf_8)
 628 #define CATEGORY_MASK_UTF_16_AUTO       (1 << coding_category_utf_16_auto)
 629 #define CATEGORY_MASK_UTF_16_BE         (1 << coding_category_utf_16_be)
 630 #define CATEGORY_MASK_UTF_16_LE         (1 << coding_category_utf_16_le)
 631 #define CATEGORY_MASK_UTF_16_BE_NOSIG   (1 << coding_category_utf_16_be_nosig)
 632 #define CATEGORY_MASK_UTF_16_LE_NOSIG   (1 << coding_category_utf_16_le_nosig)
 633 #define CATEGORY_MASK_CHARSET           (1 << coding_category_charset)
 634 #define CATEGORY_MASK_SJIS              (1 << coding_category_sjis)
 635 #define CATEGORY_MASK_BIG5              (1 << coding_category_big5)
 636 #define CATEGORY_MASK_CCL               (1 << coding_category_ccl)
 637 #define CATEGORY_MASK_EMACS_MULE        (1 << coding_category_emacs_mule)
 638 #define CATEGORY_MASK_RAW_TEXT          (1 << coding_category_raw_text)
 639
 640 /* This value is returned if detect_coding_mask () find nothing other
 641    than ASCII characters.  */
 642 #define CATEGORY_MASK_ANY               \
 643   (CATEGORY_MASK_ISO_7                  \
 644    | CATEGORY_MASK_ISO_7_TIGHT          \
 645    | CATEGORY_MASK_ISO_8_1              \
 646    | CATEGORY_MASK_ISO_8_2              \
 647    | CATEGORY_MASK_ISO_7_ELSE           \
 648    | CATEGORY_MASK_ISO_8_ELSE           \
 649    | CATEGORY_MASK_UTF_8                \
 650    | CATEGORY_MASK_UTF_16_BE            \
 651    | CATEGORY_MASK_UTF_16_LE            \
 652    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 653    | CATEGORY_MASK_UTF_16_LE_NOSIG      \
 654    | CATEGORY_MASK_CHARSET              \
 655    | CATEGORY_MASK_SJIS                 \
 656    | CATEGORY_MASK_BIG5                 \
 657    | CATEGORY_MASK_CCL                  \
 658    | CATEGORY_MASK_EMACS_MULE)
 659
 660
 661 #define CATEGORY_MASK_ISO_7BIT \
 662   (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
 663
 664 #define CATEGORY_MASK_ISO_8BIT \
 665   (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
 666
 667 #define CATEGORY_MASK_ISO_ELSE \
 668   (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
 669
 670 #define CATEGORY_MASK_ISO_ESCAPE        \
 671   (CATEGORY_MASK_ISO_7                  \
 672    | CATEGORY_MASK_ISO_7_TIGHT          \
 673    | CATEGORY_MASK_ISO_7_ELSE           \
 674    | CATEGORY_MASK_ISO_8_ELSE)
 675
 676 #define CATEGORY_MASK_ISO       \
 677   (  CATEGORY_MASK_ISO_7BIT     \
 678      | CATEGORY_MASK_ISO_8BIT   \
 679      | CATEGORY_MASK_ISO_ELSE)
 680
 681 #define CATEGORY_MASK_UTF_16            \
 682   (CATEGORY_MASK_UTF_16_BE              \
 683    | CATEGORY_MASK_UTF_16_LE            \
 684    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 685    | CATEGORY_MASK_UTF_16_LE_NOSIG)
 686
 687
 688 /* List of symbols `coding-category-xxx' ordered by priority.  This
 689    variable is exposed to Emacs Lisp.  */
 690 static Lisp_Object Vcoding_category_list;
 691
 692 /* Table of coding categories (Lisp symbols).  This variable is for
 693    internal use oly.  */
 694 static Lisp_Object Vcoding_category_table;
 695
 696 /* Table of coding-categories ordered by priority.  */
 697 static enum coding_category coding_priorities[coding_category_max];
 698
 699 /* Nth element is a coding context for the coding system bound to the
 700    Nth coding category.  */
 701 static struct coding_system coding_categories[coding_category_max];
 702
 703 /*** Commonly used macros and functions ***/
 704
 705 #ifndef min
 706 #define min(a, b) ((a) < (b) ? (a) : (b))
 707 #endif
 708 #ifndef max
 709 #define max(a, b) ((a) > (b) ? (a) : (b))
 710 #endif
 711
 712 #define CODING_GET_INFO(coding, attrs, charset_list)    \
 713   do {                                                  \
 714     (attrs) = CODING_ID_ATTRS ((coding)->id);           \
 715     (charset_list) = CODING_ATTR_CHARSET_LIST (attrs);  \
 716   } while (0)
 717
 718
 719 /* Safely get one byte from the source text pointed by SRC which ends
 720    at SRC_END, and set C to that byte.  If there are not enough bytes
 721    in the source, it jumps to `no_more_source'.  The caller
 722    should declare and set these variables appropriately in advance:
 723         src, src_end, multibytep
 724 */
 725
 726 #define ONE_MORE_BYTE(c)                                        \
 727   do {                                                          \
 728     if (src == src_end)                                         \
 729       {                                                         \
 730         if (src_base < src)                                     \
 731           coding->result = CODING_RESULT_INSUFFICIENT_SRC;      \
 732         goto no_more_source;                                    \
 733       }                                                         \
 734     c = *src++;                                                 \
 735     if (multibytep && (c & 0x80))                               \
 736       {                                                         \
 737         if ((c & 0xFE) != 0xC0)                                 \
 738           error ("Undecodable char found");                     \
 739         c = ((c & 1) << 6) | *src++;                            \
 740       }                                                         \
 741     consumed_chars++;                                           \
 742   } while (0)
 743
 744
 745 #define ONE_MORE_BYTE_NO_CHECK(c)               \
 746   do {                                          \
 747     c = *src++;                                 \
 748     if (multibytep && (c & 0x80))               \
 749       {                                         \
 750         if ((c & 0xFE) != 0xC0)                 \
 751           error ("Undecodable char found");     \
 752         c = ((c & 1) << 6) | *src++;            \
 753       }                                         \
 754     consumed_chars++;                           \
 755   } while (0)
 756
 757
 758 /* Store a byte C in the place pointed by DST and increment DST to the
 759    next free point, and increment PRODUCED_CHARS.  The caller should
 760    assure that C is 0..127, and declare and set the variable `dst'
 761    appropriately in advance.
 762 */
 763
 764
 765 #define EMIT_ONE_ASCII_BYTE(c)  \
 766   do {                          \
 767     produced_chars++;           \
 768     *dst++ = (c);               \
 769   } while (0)
 770
 771
 772 /* Like EMIT_ONE_ASCII_BYTE byt store two bytes; C1 and C2.  */
 773
 774 #define EMIT_TWO_ASCII_BYTES(c1, c2)    \
 775   do {                                  \
 776     produced_chars += 2;                \
 777     *dst++ = (c1), *dst++ = (c2);       \
 778   } while (0)
 779
 780
 781 /* Store a byte C in the place pointed by DST and increment DST to the
 782    next free point, and increment PRODUCED_CHARS.  If MULTIBYTEP is
 783    nonzero, store in an appropriate multibyte from.  The caller should
 784    declare and set the variables `dst' and `multibytep' appropriately
 785    in advance.  */
 786
 787 #define EMIT_ONE_BYTE(c)                \
 788   do {                                  \
 789     produced_chars++;                   \
 790     if (multibytep)                     \
 791       {                                 \
 792         int ch = (c);                   \
 793         if (ch >= 0x80)                 \
 794           ch = BYTE8_TO_CHAR (ch);      \
 795         CHAR_STRING_ADVANCE (ch, dst);  \
 796       }                                 \
 797     else                                \
 798       *dst++ = (c);                     \
 799   } while (0)
 800
 801
 802 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2.  */
 803
 804 #define EMIT_TWO_BYTES(c1, c2)          \
 805   do {                                  \
 806     produced_chars += 2;                \
 807     if (multibytep)                     \
 808       {                                 \
 809         int ch;                         \
 810                                         \
 811         ch = (c1);                      \
 812         if (ch >= 0x80)                 \
 813           ch = BYTE8_TO_CHAR (ch);      \
 814         CHAR_STRING_ADVANCE (ch, dst);  \
 815         ch = (c2);                      \
 816         if (ch >= 0x80)                 \
 817           ch = BYTE8_TO_CHAR (ch);      \
 818         CHAR_STRING_ADVANCE (ch, dst);  \
 819       }                                 \
 820     else                                \
 821       {                                 \
 822         *dst++ = (c1);                  \
 823         *dst++ = (c2);                  \
 824       }                                 \
 825   } while (0)
 826
 827
 828 #define EMIT_THREE_BYTES(c1, c2, c3)    \
 829   do {                                  \
 830     EMIT_ONE_BYTE (c1);                 \
 831     EMIT_TWO_BYTES (c2, c3);            \
 832   } while (0)
 833
 834
 835 #define EMIT_FOUR_BYTES(c1, c2, c3, c4)         \
 836   do {                                          \
 837     EMIT_TWO_BYTES (c1, c2);                    \
 838     EMIT_TWO_BYTES (c3, c4);                    \
 839   } while (0)
 840
 841
 842 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
 843   do {                                                                       \
 844     charset_map_loaded = 0;                                                  \
 845     c = DECODE_CHAR (charset, code);                                         \
 846     if (charset_map_loaded)                                                  \
 847       {                                                                      \
 848         const unsigned char *orig = coding->source;                          \
 849         EMACS_INT offset;                                                    \
 850                                                                              \
 851         coding_set_source (coding);                                          \
 852         offset = coding->source - orig;                                      \
 853         src += offset;                                                       \
 854         src_base += offset;                                                  \
 855         src_end += offset;                                                   \
 856       }                                                                      \
 857   } while (0)
 858
 859
 860 #define ASSURE_DESTINATION(bytes)                               \
 861   do {                                                          \
 862     if (dst + (bytes) >= dst_end)                               \
 863       {                                                         \
 864         int more_bytes = charbuf_end - charbuf + (bytes);       \
 865                                                                 \
 866         dst = alloc_destination (coding, more_bytes, dst);      \
 867         dst_end = coding->destination + coding->dst_bytes;      \
 868       }                                                         \
 869   } while (0)
 870
 871
 872
 873 static void
 874 coding_set_source (coding)
 875      struct coding_system *coding;
 876 {
 877   if (BUFFERP (coding->src_object))
 878     {
 879       struct buffer *buf = XBUFFER (coding->src_object);
 880
 881       if (coding->src_pos < 0)
 882         coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
 883       else
 884         coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
 885     }
 886   else if (STRINGP (coding->src_object))
 887     {
 888       coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
 889     }
 890   else
 891     /* Otherwise, the source is C string and is never relocated
 892        automatically.  Thus we don't have to update anything.  */
 893     ;
 894 }
 895
 896 static void
 897 coding_set_destination (coding)
 898      struct coding_system *coding;
 899 {
 900   if (BUFFERP (coding->dst_object))
 901     {
 902       if (coding->src_pos < 0)
 903         {
 904           coding->destination = BEG_ADDR + coding->dst_pos_byte - 1;
 905           coding->dst_bytes = (GAP_END_ADDR
 906                                - (coding->src_bytes - coding->consumed)
 907                                - coding->destination);
 908         }
 909       else
 910         {
 911           /* We are sure that coding->dst_pos_byte is before the gap
 912              of the buffer. */
 913           coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
 914                                  + coding->dst_pos_byte - 1);
 915           coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
 916                                - coding->destination);
 917         }
 918     }
 919   else
 920     /* Otherwise, the destination is C string and is never relocated
 921        automatically.  Thus we don't have to update anything.  */
 922     ;
 923 }
 924
 925
 926 static void
 927 coding_alloc_by_realloc (coding, bytes)
 928      struct coding_system *coding;
 929      EMACS_INT bytes;
 930 {
 931   coding->destination = (unsigned char *) xrealloc (coding->destination,
 932                                                     coding->dst_bytes + bytes);
 933   coding->dst_bytes += bytes;
 934 }
 935
 936 static void
 937 coding_alloc_by_making_gap (coding, bytes)
 938      struct coding_system *coding;
 939      EMACS_INT bytes;
 940 {
 941   if (BUFFERP (coding->dst_object)
 942       && EQ (coding->src_object, coding->dst_object))
 943     {
 944       EMACS_INT add = coding->src_bytes - coding->consumed;
 945
 946       GAP_SIZE -= add; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
 947       make_gap (bytes);
 948       GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
 949     }
 950   else
 951     {
 952       Lisp_Object this_buffer;
 953
 954       this_buffer = Fcurrent_buffer ();
 955       set_buffer_internal (XBUFFER (coding->dst_object));
 956       make_gap (bytes);
 957       set_buffer_internal (XBUFFER (this_buffer));
 958     }
 959 }
 960
 961
 962 static unsigned char *
 963 alloc_destination (coding, nbytes, dst)
 964      struct coding_system *coding;
 965      EMACS_INT nbytes;
 966      unsigned char *dst;
 967 {
 968   EMACS_INT offset = dst - coding->destination;
 969
 970   if (BUFFERP (coding->dst_object))
 971     coding_alloc_by_making_gap (coding, nbytes);
 972   else
 973     coding_alloc_by_realloc (coding, nbytes);
 974   coding->result = CODING_RESULT_SUCCESS;
 975   coding_set_destination (coding);
 976   dst = coding->destination + offset;
 977   return dst;
 978 }
 979
 980 /** Macros for annotations.  */
 981
 982 /* Maximum length of annotation data (sum of annotations for
 983    composition and charset).  */
 984 #define MAX_ANNOTATION_LENGTH (5 + (MAX_COMPOSITION_COMPONENTS * 2) - 1 + 5)
 985
 986 /* An annotation data is stored in the array coding->charbuf in this
 987    format:
 988      [ -LENGTH ANNOTATION_MASK FROM TO ... ]
 989    LENGTH is the number of elements in the annotation.
 990    ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
 991    FROM and TO specify the range of text annotated.  They are relative
 992    to coding->src_pos (on encoding) or coding->dst_pos (on decoding).
 993
 994    The format of the following elements depend on ANNOTATION_MASK.
 995
 996    In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
 997    follows:
 998      ... METHOD [ COMPOSITION-COMPONENTS ... ]
 999    METHOD is one of enum composition_method.
1000    Optionnal COMPOSITION-COMPONENTS are characters and composition
1001    rules.
1002
1003    In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1004    follows.  */
1005
1006 #define ADD_ANNOTATION_DATA(buf, len, mask, from, to)   \
1007   do {                                                  \
1008     *(buf)++ = -(len);                                  \
1009     *(buf)++ = (mask);                                  \
1010     *(buf)++ = (from);                                  \
1011     *(buf)++ = (to);                                    \
1012     coding->annotated = 1;                              \
1013   } while (0);
1014
1015 #define ADD_COMPOSITION_DATA(buf, from, to, method)                           \
1016   do {                                                                        \
1017     ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, from, to); \
1018     *buf++ = method;                                                          \
1019   } while (0)
1020
1021
1022 #define ADD_CHARSET_DATA(buf, from, to, id)                               \
1023   do {                                                                    \
1024     ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_CHARSET_MASK, from, to); \
1025     *buf++ = id;                                                          \
1026   } while (0)
1027
1028 \f
1029 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1030
1031
1032
1033 \f
1034 /*** 3. UTF-8 ***/
1035
1036 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1037    Check if a text is encoded in UTF-8.  If it is, return 1, else
1038    return 0.  */
1039
1040 #define UTF_8_1_OCTET_P(c)         ((c) < 0x80)
1041 #define UTF_8_EXTRA_OCTET_P(c)     (((c) & 0xC0) == 0x80)
1042 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
1043 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
1044 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
1045 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
1046
1047 static int
1048 detect_coding_utf_8 (coding, detect_info)
1049      struct coding_system *coding;
1050      struct coding_detection_info *detect_info;
1051 {
1052   const unsigned char *src = coding->source, *src_base = src;
1053   const unsigned char *src_end = coding->source + coding->src_bytes;
1054   int multibytep = coding->src_multibyte;
1055   int consumed_chars = 0;
1056   int found = 0;
1057   int incomplete;
1058
1059   detect_info->checked |= CATEGORY_MASK_UTF_8;
1060   /* A coding system of this category is always ASCII compatible.  */
1061   src += coding->head_ascii;
1062
1063   while (1)
1064     {
1065       int c, c1, c2, c3, c4;
1066
1067       incomplete = 0;
1068       ONE_MORE_BYTE (c);
1069       if (UTF_8_1_OCTET_P (c))
1070         continue;
1071       incomplete = 1;
1072       ONE_MORE_BYTE (c1);
1073       if (! UTF_8_EXTRA_OCTET_P (c1))
1074         break;
1075       if (UTF_8_2_OCTET_LEADING_P (c))
1076         {
1077           found = CATEGORY_MASK_UTF_8;
1078           continue;
1079         }
1080       ONE_MORE_BYTE (c2);
1081       if (! UTF_8_EXTRA_OCTET_P (c2))
1082         break;
1083       if (UTF_8_3_OCTET_LEADING_P (c))
1084         {
1085           found = CATEGORY_MASK_UTF_8;
1086           continue;
1087         }
1088       ONE_MORE_BYTE (c3);
1089       if (! UTF_8_EXTRA_OCTET_P (c3))
1090         break;
1091       if (UTF_8_4_OCTET_LEADING_P (c))
1092         {
1093           found = CATEGORY_MASK_UTF_8;
1094           continue;
1095         }
1096       ONE_MORE_BYTE (c4);
1097       if (! UTF_8_EXTRA_OCTET_P (c4))
1098         break;
1099       if (UTF_8_5_OCTET_LEADING_P (c))
1100         {
1101           found = CATEGORY_MASK_UTF_8;
1102           continue;
1103         }
1104       break;
1105     }
1106   detect_info->rejected |= CATEGORY_MASK_UTF_8;
1107   return 0;
1108
1109  no_more_source:
1110   if (incomplete && coding->mode & CODING_MODE_LAST_BLOCK)
1111     {
1112       detect_info->rejected |= CATEGORY_MASK_UTF_8;
1113       return 0;
1114     }
1115   detect_info->found |= found;
1116   return 1;
1117 }
1118
1119
1120 static void
1121 decode_coding_utf_8 (coding)
1122      struct coding_system *coding;
1123 {
1124   const unsigned char *src = coding->source + coding->consumed;
1125   const unsigned char *src_end = coding->source + coding->src_bytes;
1126   const unsigned char *src_base;
1127   int *charbuf = coding->charbuf;
1128   int *charbuf_end = charbuf + coding->charbuf_size;
1129   int consumed_chars = 0, consumed_chars_base;
1130   int multibytep = coding->src_multibyte;
1131   Lisp_Object attr, charset_list;
1132
1133   CODING_GET_INFO (coding, attr, charset_list);
1134
1135   while (1)
1136     {
1137       int c, c1, c2, c3, c4, c5;
1138
1139       src_base = src;
1140       consumed_chars_base = consumed_chars;
1141
1142       if (charbuf >= charbuf_end)
1143         break;
1144
1145       ONE_MORE_BYTE (c1);
1146       if (UTF_8_1_OCTET_P(c1))
1147         {
1148           c = c1;
1149         }
1150       else
1151         {
1152           ONE_MORE_BYTE (c2);
1153           if (! UTF_8_EXTRA_OCTET_P (c2))
1154             goto invalid_code;
1155           if (UTF_8_2_OCTET_LEADING_P (c1))
1156             {
1157               c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1158               /* Reject overlong sequences here and below.  Encoders
1159                  producing them are incorrect, they can be misleading,
1160                  and they mess up read/write invariance.  */
1161               if (c < 128)
1162                 goto invalid_code;
1163             }
1164           else
1165             {
1166               ONE_MORE_BYTE (c3);
1167               if (! UTF_8_EXTRA_OCTET_P (c3))
1168                 goto invalid_code;
1169               if (UTF_8_3_OCTET_LEADING_P (c1))
1170                 {
1171                   c = (((c1 & 0xF) << 12)
1172                        | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
1173                   if (c < 0x800
1174                       || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
1175                     goto invalid_code;
1176                 }
1177               else
1178                 {
1179                   ONE_MORE_BYTE (c4);
1180                   if (! UTF_8_EXTRA_OCTET_P (c4))
1181                     goto invalid_code;
1182                   if (UTF_8_4_OCTET_LEADING_P (c1))
1183                     {
1184                     c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1185                          | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
1186                     if (c < 0x10000)
1187                       goto invalid_code;
1188                     }
1189                   else
1190                     {
1191                       ONE_MORE_BYTE (c5);
1192                       if (! UTF_8_EXTRA_OCTET_P (c5))
1193                         goto invalid_code;
1194                       if (UTF_8_5_OCTET_LEADING_P (c1))
1195                         {
1196                           c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1197                                | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1198                                | (c5 & 0x3F));
1199                           if ((c > MAX_CHAR) || (c < 0x200000))
1200                             goto invalid_code;
1201                         }
1202                       else
1203                         goto invalid_code;
1204                     }
1205                 }
1206             }
1207         }
1208
1209       *charbuf++ = c;
1210       continue;
1211
1212     invalid_code:
1213       src = src_base;
1214       consumed_chars = consumed_chars_base;
1215       ONE_MORE_BYTE (c);
1216       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1217       coding->errors++;
1218     }
1219
1220  no_more_source:
1221   coding->consumed_char += consumed_chars_base;
1222   coding->consumed = src_base - coding->source;
1223   coding->charbuf_used = charbuf - coding->charbuf;
1224 }
1225
1226
1227 static int
1228 encode_coding_utf_8 (coding)
1229      struct coding_system *coding;
1230 {
1231   int multibytep = coding->dst_multibyte;
1232   int *charbuf = coding->charbuf;
1233   int *charbuf_end = charbuf + coding->charbuf_used;
1234   unsigned char *dst = coding->destination + coding->produced;
1235   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1236   int produced_chars = 0;
1237   int c;
1238
1239   if (multibytep)
1240     {
1241       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1242
1243       while (charbuf < charbuf_end)
1244         {
1245           unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1246
1247           ASSURE_DESTINATION (safe_room);
1248           c = *charbuf++;
1249           if (CHAR_BYTE8_P (c))
1250             {
1251               c = CHAR_TO_BYTE8 (c);
1252               EMIT_ONE_BYTE (c);
1253             }
1254           else
1255             {
1256               CHAR_STRING_ADVANCE (c, pend);
1257               for (p = str; p < pend; p++)
1258                 EMIT_ONE_BYTE (*p);
1259             }
1260         }
1261     }
1262   else
1263     {
1264       int safe_room = MAX_MULTIBYTE_LENGTH;
1265
1266       while (charbuf < charbuf_end)
1267         {
1268           ASSURE_DESTINATION (safe_room);
1269           c = *charbuf++;
1270           dst += CHAR_STRING (c, dst);
1271           produced_chars++;
1272         }
1273     }
1274   coding->result = CODING_RESULT_SUCCESS;
1275   coding->produced_char += produced_chars;
1276   coding->produced = dst - coding->destination;
1277   return 0;
1278 }
1279
1280
1281 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1282    Check if a text is encoded in one of UTF-16 based coding systems.
1283    If it is, return 1, else return 0.  */
1284
1285 #define UTF_16_HIGH_SURROGATE_P(val) \
1286   (((val) & 0xFC00) == 0xD800)
1287
1288 #define UTF_16_LOW_SURROGATE_P(val) \
1289   (((val) & 0xFC00) == 0xDC00)
1290
1291 #define UTF_16_INVALID_P(val)   \
1292   (((val) == 0xFFFE)            \
1293    || ((val) == 0xFFFF)         \
1294    || UTF_16_LOW_SURROGATE_P (val))
1295
1296
1297 static int
1298 detect_coding_utf_16 (coding, detect_info)
1299      struct coding_system *coding;
1300      struct coding_detection_info *detect_info;
1301 {
1302   const unsigned char *src = coding->source, *src_base = src;
1303   const unsigned char *src_end = coding->source + coding->src_bytes;
1304   int multibytep = coding->src_multibyte;
1305   int consumed_chars = 0;
1306   int c1, c2;
1307
1308   detect_info->checked |= CATEGORY_MASK_UTF_16;
1309   if (coding->mode & CODING_MODE_LAST_BLOCK
1310       && (coding->src_chars & 1))
1311     {
1312       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1313       return 0;
1314     }
1315
1316   ONE_MORE_BYTE (c1);
1317   ONE_MORE_BYTE (c2);
1318   if ((c1 == 0xFF) && (c2 == 0xFE))
1319     {
1320       detect_info->found |= (CATEGORY_MASK_UTF_16_LE
1321                              | CATEGORY_MASK_UTF_16_AUTO);
1322       detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
1323                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1324                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1325     }
1326   else if ((c1 == 0xFE) && (c2 == 0xFF))
1327     {
1328       detect_info->found |= (CATEGORY_MASK_UTF_16_BE
1329                              | CATEGORY_MASK_UTF_16_AUTO);
1330       detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
1331                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1332                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1333     }
1334   else
1335     {
1336       unsigned char b1[256], b2[256];
1337       int b1_variants = 1, b2_variants = 1;
1338       int n;
1339
1340       bzero (b1, 256), bzero (b2, 256);
1341       b1[c1]++, b2[c2]++;
1342       for (n = 0; n < 256 && src < src_end; n++)
1343         {
1344           ONE_MORE_BYTE (c1);
1345           ONE_MORE_BYTE (c2);
1346           if (! b1[c1++]) b1_variants++;
1347           if (! b2[c2++]) b2_variants++;
1348         }
1349       if (b1_variants < b2_variants)
1350         detect_info->found |= CATEGORY_MASK_UTF_16_BE_NOSIG;
1351       else
1352         detect_info->found |= CATEGORY_MASK_UTF_16_LE_NOSIG;
1353       detect_info->rejected
1354         |= (CATEGORY_MASK_UTF_16_BE | CATEGORY_MASK_UTF_16_LE);
1355     }
1356  no_more_source:
1357   return 1;
1358 }
1359
1360 static void
1361 decode_coding_utf_16 (coding)
1362      struct coding_system *coding;
1363 {
1364   const unsigned char *src = coding->source + coding->consumed;
1365   const unsigned char *src_end = coding->source + coding->src_bytes;
1366   const unsigned char *src_base;
1367   int *charbuf = coding->charbuf;
1368   int *charbuf_end = charbuf + coding->charbuf_size;
1369   int consumed_chars = 0, consumed_chars_base;
1370   int multibytep = coding->src_multibyte;
1371   enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding);
1372   enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1373   int surrogate = CODING_UTF_16_SURROGATE (coding);
1374   Lisp_Object attr, charset_list;
1375
1376   CODING_GET_INFO (coding, attr, charset_list);
1377
1378   if (bom == utf_16_with_bom)
1379     {
1380       int c, c1, c2;
1381
1382       src_base = src;
1383       ONE_MORE_BYTE (c1);
1384       ONE_MORE_BYTE (c2);
1385       c = (c1 << 8) | c2;
1386
1387       if (endian == utf_16_big_endian
1388           ? c != 0xFEFF : c != 0xFFFE)
1389         {
1390           /* The first two bytes are not BOM.  Treat them as bytes
1391              for a normal character.  */
1392           src = src_base;
1393           coding->errors++;
1394         }
1395       CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1396     }
1397   else if (bom == utf_16_detect_bom)
1398     {
1399       /* We have already tried to detect BOM and failed in
1400          detect_coding.  */
1401       CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1402     }
1403
1404   while (1)
1405     {
1406       int c, c1, c2;
1407
1408       src_base = src;
1409       consumed_chars_base = consumed_chars;
1410
1411       if (charbuf + 2 >= charbuf_end)
1412         break;
1413
1414       ONE_MORE_BYTE (c1);
1415       ONE_MORE_BYTE (c2);
1416       c = (endian == utf_16_big_endian
1417            ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1418       if (surrogate)
1419         {
1420           if (! UTF_16_LOW_SURROGATE_P (c))
1421             {
1422               if (endian == utf_16_big_endian)
1423                 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1424               else
1425                 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1426               *charbuf++ = c1;
1427               *charbuf++ = c2;
1428               coding->errors++;
1429               if (UTF_16_HIGH_SURROGATE_P (c))
1430                 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1431               else
1432                 *charbuf++ = c;
1433             }
1434           else
1435             {
1436               c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1437               CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1438               *charbuf++ = c;
1439             }
1440         }
1441       else
1442         {
1443           if (UTF_16_HIGH_SURROGATE_P (c))
1444             CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1445           else
1446             *charbuf++ = c;
1447         }
1448     }
1449
1450  no_more_source:
1451   coding->consumed_char += consumed_chars_base;
1452   coding->consumed = src_base - coding->source;
1453   coding->charbuf_used = charbuf - coding->charbuf;
1454 }
1455
1456 static int
1457 encode_coding_utf_16 (coding)
1458      struct coding_system *coding;
1459 {
1460   int multibytep = coding->dst_multibyte;
1461   int *charbuf = coding->charbuf;
1462   int *charbuf_end = charbuf + coding->charbuf_used;
1463   unsigned char *dst = coding->destination + coding->produced;
1464   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1465   int safe_room = 8;
1466   enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding);
1467   int big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1468   int produced_chars = 0;
1469   Lisp_Object attrs, charset_list;
1470   int c;
1471
1472   CODING_GET_INFO (coding, attrs, charset_list);
1473
1474   if (bom != utf_16_without_bom)
1475     {
1476       ASSURE_DESTINATION (safe_room);
1477       if (big_endian)
1478         EMIT_TWO_BYTES (0xFE, 0xFF);
1479       else
1480         EMIT_TWO_BYTES (0xFF, 0xFE);
1481       CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1482     }
1483
1484   while (charbuf < charbuf_end)
1485     {
1486       ASSURE_DESTINATION (safe_room);
1487       c = *charbuf++;
1488       if (c >= MAX_UNICODE_CHAR)
1489         c = coding->default_char;
1490
1491       if (c < 0x10000)
1492         {
1493           if (big_endian)
1494             EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1495           else
1496             EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1497         }
1498       else
1499         {
1500           int c1, c2;
1501
1502           c -= 0x10000;
1503           c1 = (c >> 10) + 0xD800;
1504           c2 = (c & 0x3FF) + 0xDC00;
1505           if (big_endian)
1506             EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1507           else
1508             EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1509         }
1510     }
1511   coding->result = CODING_RESULT_SUCCESS;
1512   coding->produced = dst - coding->destination;
1513   coding->produced_char += produced_chars;
1514   return 0;
1515 }
1516
1517 \f
1518 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1519
1520 /* Emacs' internal format for representation of multiple character
1521    sets is a kind of multi-byte encoding, i.e. characters are
1522    represented by variable-length sequences of one-byte codes.
1523
1524    ASCII characters and control characters (e.g. `tab', `newline') are
1525    represented by one-byte sequences which are their ASCII codes, in
1526    the range 0x00 through 0x7F.
1527
1528    8-bit characters of the range 0x80..0x9F are represented by
1529    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1530    code + 0x20).
1531
1532    8-bit characters of the range 0xA0..0xFF are represented by
1533    one-byte sequences which are their 8-bit code.
1534
1535    The other characters are represented by a sequence of `base
1536    leading-code', optional `extended leading-code', and one or two
1537    `position-code's.  The length of the sequence is determined by the
1538    base leading-code.  Leading-code takes the range 0x81 through 0x9D,
1539    whereas extended leading-code and position-code take the range 0xA0
1540    through 0xFF.  See `charset.h' for more details about leading-code
1541    and position-code.
1542
1543    --- CODE RANGE of Emacs' internal format ---
1544    character set        range
1545    -------------        -----
1546    ascii                0x00..0x7F
1547    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1548    eight-bit-graphic    0xA0..0xBF
1549    ELSE                 0x81..0x9D + [0xA0..0xFF]+
1550    ---------------------------------------------
1551
1552    As this is the internal character representation, the format is
1553    usually not used externally (i.e. in a file or in a data sent to a
1554    process).  But, it is possible to have a text externally in this
1555    format (i.e. by encoding by the coding system `emacs-mule').
1556
1557    In that case, a sequence of one-byte codes has a slightly different
1558    form.
1559
1560    At first, all characters in eight-bit-control are represented by
1561    one-byte sequences which are their 8-bit code.
1562
1563    Next, character composition data are represented by the byte
1564    sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1565    where,
1566         METHOD is 0xF0 plus one of composition method (enum
1567         composition_method),
1568
1569         BYTES is 0xA0 plus a byte length of this composition data,
1570
1571         CHARS is 0x20 plus a number of characters composed by this
1572         data,
1573
1574         COMPONENTs are characters of multibye form or composition
1575         rules encoded by two-byte of ASCII codes.
1576
1577    In addition, for backward compatibility, the following formats are
1578    also recognized as composition data on decoding.
1579
1580    0x80 MSEQ ...
1581    0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1582
1583    Here,
1584         MSEQ is a multibyte form but in these special format:
1585           ASCII: 0xA0 ASCII_CODE+0x80,
1586           other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1587         RULE is a one byte code of the range 0xA0..0xF0 that
1588         represents a composition rule.
1589   */
1590
1591 char emacs_mule_bytes[256];
1592
1593 int
1594 emacs_mule_char (coding, src, nbytes, nchars, id)
1595      struct coding_system *coding;
1596      unsigned char *src;
1597      int *nbytes, *nchars, *id;
1598 {
1599   const unsigned char *src_end = coding->source + coding->src_bytes;
1600   const unsigned char *src_base = src;
1601   int multibytep = coding->src_multibyte;
1602   struct charset *charset;
1603   unsigned code;
1604   int c;
1605   int consumed_chars = 0;
1606
1607   ONE_MORE_BYTE (c);
1608   switch (emacs_mule_bytes[c])
1609     {
1610     case 2:
1611       if (! (charset = emacs_mule_charset[c]))
1612         goto invalid_code;
1613       ONE_MORE_BYTE (c);
1614       code = c & 0x7F;
1615       break;
1616
1617     case 3:
1618       if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
1619           || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
1620         {
1621           ONE_MORE_BYTE (c);
1622           if (! (charset = emacs_mule_charset[c]))
1623             goto invalid_code;
1624           ONE_MORE_BYTE (c);
1625           code = c & 0x7F;
1626         }
1627       else
1628         {
1629           if (! (charset = emacs_mule_charset[c]))
1630             goto invalid_code;
1631           ONE_MORE_BYTE (c);
1632           code = (c & 0x7F) << 8;
1633           ONE_MORE_BYTE (c);
1634           code |= c & 0x7F;
1635         }
1636       break;
1637
1638     case 4:
1639       ONE_MORE_BYTE (c);
1640       if (! (charset = emacs_mule_charset[c]))
1641         goto invalid_code;
1642       ONE_MORE_BYTE (c);
1643       code = (c & 0x7F) << 8;
1644       ONE_MORE_BYTE (c);
1645       code |= c & 0x7F;
1646       break;
1647
1648     case 1:
1649       code = c;
1650       charset = CHARSET_FROM_ID (ASCII_BYTE_P (code)
1651                                  ? charset_ascii : charset_eight_bit);
1652       break;
1653
1654     default:
1655       abort ();
1656     }
1657   c = DECODE_CHAR (charset, code);
1658   if (c < 0)
1659     goto invalid_code;
1660   *nbytes = src - src_base;
1661   *nchars = consumed_chars;
1662   if (id)
1663     *id = charset->id;
1664   return c;
1665
1666  no_more_source:
1667   return -2;
1668
1669  invalid_code:
1670   return -1;
1671 }
1672
1673
1674 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1675    Check if a text is encoded in `emacs-mule'.  If it is, return 1,
1676    else return 0.  */
1677
1678 static int
1679 detect_coding_emacs_mule (coding, detect_info)
1680      struct coding_system *coding;
1681      struct coding_detection_info *detect_info;
1682 {
1683   const unsigned char *src = coding->source, *src_base = src;
1684   const unsigned char *src_end = coding->source + coding->src_bytes;
1685   int multibytep = coding->src_multibyte;
1686   int consumed_chars = 0;
1687   int c;
1688   int found = 0;
1689   int incomplete;
1690
1691   detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1692   /* A coding system of this category is always ASCII compatible.  */
1693   src += coding->head_ascii;
1694
1695   while (1)
1696     {
1697       incomplete = 0;
1698       ONE_MORE_BYTE (c);
1699       incomplete = 1;
1700
1701       if (c == 0x80)
1702         {
1703           /* Perhaps the start of composite character.  We simple skip
1704              it because analyzing it is too heavy for detecting.  But,
1705              at least, we check that the composite character
1706              constitues of more than 4 bytes.  */
1707           const unsigned char *src_base;
1708
1709         repeat:
1710           src_base = src;
1711           do
1712             {
1713               ONE_MORE_BYTE (c);
1714             }
1715           while (c >= 0xA0);
1716
1717           if (src - src_base <= 4)
1718             break;
1719           found = CATEGORY_MASK_EMACS_MULE;
1720           if (c == 0x80)
1721             goto repeat;
1722         }
1723
1724       if (c < 0x80)
1725         {
1726           if (c < 0x20
1727               && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
1728             break;
1729         }
1730       else
1731         {
1732           const unsigned char *src_base = src - 1;
1733
1734           do
1735             {
1736               ONE_MORE_BYTE (c);
1737             }
1738           while (c >= 0xA0);
1739           if (src - src_base != emacs_mule_bytes[*src_base])
1740             break;
1741           found = CATEGORY_MASK_EMACS_MULE;
1742         }
1743     }
1744   detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1745   return 0;
1746
1747  no_more_source:
1748   if (incomplete && coding->mode & CODING_MODE_LAST_BLOCK)
1749     {
1750       detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1751       return 0;
1752     }
1753   detect_info->found |= found;
1754   return 1;
1755 }
1756
1757
1758 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
1759
1760 /* Decode a character represented as a component of composition
1761    sequence of Emacs 20/21 style at SRC.  Set C to that character and
1762    update SRC to the head of next character (or an encoded composition
1763    rule).  If SRC doesn't points a composition component, set C to -1.
1764    If SRC points an invalid byte sequence, global exit by a return
1765    value 0.  */
1766
1767 #define DECODE_EMACS_MULE_COMPOSITION_CHAR(buf)                 \
1768   if (1)                                                        \
1769     {                                                           \
1770       int c;                                                    \
1771       int nbytes, nchars;                                       \
1772                                                                 \
1773       if (src == src_end)                                       \
1774         break;                                                  \
1775       c = emacs_mule_char (coding, src, &nbytes, &nchars, NULL);\
1776       if (c < 0)                                                \
1777         {                                                       \
1778           if (c == -2)                                          \
1779             break;                                              \
1780           goto invalid_code;                                    \
1781         }                                                       \
1782       *buf++ = c;                                               \
1783       src += nbytes;                                            \
1784       consumed_chars += nchars;                                 \
1785     }                                                           \
1786   else
1787
1788
1789 /* Decode a composition rule represented as a component of composition
1790    sequence of Emacs 20 style at SRC.  Store the decoded rule in *BUF,
1791    and increment BUF.  If SRC points an invalid byte sequence, set C
1792    to -1.  */
1793
1794 #define DECODE_EMACS_MULE_COMPOSITION_RULE_20(buf)      \
1795   do {                                                  \
1796     int c, gref, nref;                                  \
1797                                                         \
1798     if (src >= src_end)                                 \
1799       goto invalid_code;                                \
1800     ONE_MORE_BYTE_NO_CHECK (c);                         \
1801     c -= 0x20;                                          \
1802     if (c < 0 || c >= 81)                               \
1803       goto invalid_code;                                \
1804                                                         \
1805     gref = c / 9, nref = c % 9;                         \
1806     *buf++ = COMPOSITION_ENCODE_RULE (gref, nref);      \
1807   } while (0)
1808
1809
1810 /* Decode a composition rule represented as a component of composition
1811    sequence of Emacs 21 style at SRC.  Store the decoded rule in *BUF,
1812    and increment BUF.  If SRC points an invalid byte sequence, set C
1813    to -1.  */
1814
1815 #define DECODE_EMACS_MULE_COMPOSITION_RULE_21(buf)      \
1816   do {                                                  \
1817     int gref, nref;                                     \
1818                                                         \
1819     if (src + 1>= src_end)                              \
1820       goto invalid_code;                                \
1821     ONE_MORE_BYTE_NO_CHECK (gref);                      \
1822     gref -= 0x20;                                       \
1823     ONE_MORE_BYTE_NO_CHECK (nref);                      \
1824     nref -= 0x20;                                       \
1825     if (gref < 0 || gref >= 81                          \
1826         || nref < 0 || nref >= 81)                      \
1827       goto invalid_code;                                \
1828     *buf++ = COMPOSITION_ENCODE_RULE (gref, nref);      \
1829   } while (0)
1830
1831
1832 #define DECODE_EMACS_MULE_21_COMPOSITION(c)                             \
1833   do {                                                                  \
1834     /* Emacs 21 style format.  The first three bytes at SRC are         \
1835        (METHOD - 0xF2), (BYTES - 0xA0), (CHARS - 0xA0), where BYTES is  \
1836        the byte length of this composition information, CHARS is the    \
1837        number of characters composed by this composition.  */           \
1838     enum composition_method method = c - 0xF2;                          \
1839     int *charbuf_base = charbuf;                                        \
1840     int from, to;                                                       \
1841     int consumed_chars_limit;                                           \
1842     int nbytes, nchars;                                                 \
1843                                                                         \
1844     ONE_MORE_BYTE (c);                                                  \
1845     nbytes = c - 0xA0;                                                  \
1846     if (nbytes < 3)                                                     \
1847       goto invalid_code;                                                \
1848     ONE_MORE_BYTE (c);                                                  \
1849     nchars = c - 0xA0;                                                  \
1850     from = coding->produced + char_offset;                              \
1851     to = from + nchars;                                                 \
1852     ADD_COMPOSITION_DATA (charbuf, from, to, method);                   \
1853     consumed_chars_limit = consumed_chars_base + nbytes;                \
1854     if (method != COMPOSITION_RELATIVE)                                 \
1855       {                                                                 \
1856         int i = 0;                                                      \
1857         while (consumed_chars < consumed_chars_limit)                   \
1858           {                                                             \
1859             if (i % 2 && method != COMPOSITION_WITH_ALTCHARS)           \
1860               DECODE_EMACS_MULE_COMPOSITION_RULE_21 (charbuf);          \
1861             else                                                        \
1862               DECODE_EMACS_MULE_COMPOSITION_CHAR (charbuf);             \
1863             i++;                                                        \
1864           }                                                             \
1865         if (consumed_chars < consumed_chars_limit)                      \
1866           goto invalid_code;                                            \
1867         charbuf_base[0] -= i;                                           \
1868       }                                                                 \
1869   } while (0)
1870
1871
1872 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION(c)            \
1873   do {                                                          \
1874     /* Emacs 20 style format for relative composition.  */      \
1875     /* Store multibyte form of characters to be composed.  */   \
1876     enum composition_method method = COMPOSITION_RELATIVE;      \
1877     int components[MAX_COMPOSITION_COMPONENTS * 2 - 1];         \
1878     int *buf = components;                                      \
1879     int i, j;                                                   \
1880     int from, to;                                               \
1881                                                                 \
1882     src = src_base;                                             \
1883     ONE_MORE_BYTE (c);          /* skip 0x80 */                 \
1884     for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++)            \
1885       DECODE_EMACS_MULE_COMPOSITION_CHAR (buf);                 \
1886     if (i < 2)                                                  \
1887       goto invalid_code;                                        \
1888     from = coding->produced_char + char_offset;                 \
1889     to = from + i;                                              \
1890     ADD_COMPOSITION_DATA (charbuf, from, to, method);           \
1891     for (j = 0; j < i; j++)                                     \
1892       *charbuf++ = components[j];                               \
1893   } while (0)
1894
1895
1896 #define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION(c)            \
1897   do {                                                          \
1898     /* Emacs 20 style format for rule-base composition.  */     \
1899     /* Store multibyte form of characters to be composed.  */   \
1900     enum composition_method method = COMPOSITION_WITH_RULE;     \
1901     int components[MAX_COMPOSITION_COMPONENTS * 2 - 1];         \
1902     int *buf = components;                                      \
1903     int i, j;                                                   \
1904     int from, to;                                               \
1905                                                                 \
1906     DECODE_EMACS_MULE_COMPOSITION_CHAR (buf);                   \
1907     for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++)            \
1908       {                                                         \
1909         DECODE_EMACS_MULE_COMPOSITION_RULE_20 (buf);            \
1910         DECODE_EMACS_MULE_COMPOSITION_CHAR (buf);               \
1911       }                                                         \
1912     if (i < 1 || (buf - components) % 2 == 0)                   \
1913       goto invalid_code;                                        \
1914     if (charbuf + i + (i / 2) + 1 < charbuf_end)                \
1915       goto no_more_source;                                      \
1916     from = coding->produced_char + char_offset;                 \
1917     to = from + i;                                              \
1918     ADD_COMPOSITION_DATA (buf, from, to, method);               \
1919     for (j = 0; j < i; j++)                                     \
1920       *charbuf++ = components[j];                               \
1921     for (j = 0; j < i; j += 2)                                  \
1922       *charbuf++ = components[j];                               \
1923   } while (0)
1924
1925
1926 static void
1927 decode_coding_emacs_mule (coding)
1928      struct coding_system *coding;
1929 {
1930   const unsigned char *src = coding->source + coding->consumed;
1931   const unsigned char *src_end = coding->source + coding->src_bytes;
1932   const unsigned char *src_base;
1933   int *charbuf = coding->charbuf;
1934   int *charbuf_end = charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
1935   int consumed_chars = 0, consumed_chars_base;
1936   int multibytep = coding->src_multibyte;
1937   Lisp_Object attrs, charset_list;
1938   int char_offset = coding->produced_char;
1939   int last_offset = char_offset;
1940   int last_id = charset_ascii;
1941
1942   CODING_GET_INFO (coding, attrs, charset_list);
1943
1944   while (1)
1945     {
1946       int c;
1947
1948       src_base = src;
1949       consumed_chars_base = consumed_chars;
1950
1951       if (charbuf >= charbuf_end)
1952         break;
1953
1954       ONE_MORE_BYTE (c);
1955
1956       if (c < 0x80)
1957         {
1958           *charbuf++ = c;
1959           char_offset++;
1960         }
1961       else if (c == 0x80)
1962         {
1963           ONE_MORE_BYTE (c);
1964           if (c - 0xF2 >= COMPOSITION_RELATIVE
1965               && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)
1966             DECODE_EMACS_MULE_21_COMPOSITION (c);
1967           else if (c < 0xC0)
1968             DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION (c);
1969           else if (c == 0xFF)
1970             DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION (c);
1971           else
1972             goto invalid_code;
1973         }
1974       else if (c < 0xA0 && emacs_mule_bytes[c] > 1)
1975         {
1976           int nbytes, nchars;
1977           int id;
1978
1979           src = src_base;
1980           consumed_chars = consumed_chars_base;
1981           c = emacs_mule_char (coding, src, &nbytes, &nchars, &id);
1982           if (c < 0)
1983             {
1984               if (c == -2)
1985                 break;
1986               goto invalid_code;
1987             }
1988           if (last_id != id)
1989             {
1990               if (last_id != charset_ascii)
1991                 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
1992               last_id = id;
1993               last_offset = char_offset;
1994             }
1995           *charbuf++ = c;
1996           src += nbytes;
1997           consumed_chars += nchars;
1998           char_offset++;
1999         }
2000       continue;
2001
2002     invalid_code:
2003       src = src_base;
2004       consumed_chars = consumed_chars_base;
2005       ONE_MORE_BYTE (c);
2006       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
2007       char_offset++;
2008       coding->errors++;
2009     }
2010
2011  no_more_source:
2012   if (last_id != charset_ascii)
2013     ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
2014   coding->consumed_char += consumed_chars_base;
2015   coding->consumed = src_base - coding->source;
2016   coding->charbuf_used = charbuf - coding->charbuf;
2017 }
2018
2019
2020 #define EMACS_MULE_LEADING_CODES(id, codes)     \
2021   do {                                          \
2022     if (id < 0xA0)                              \
2023       codes[0] = id, codes[1] = 0;              \
2024     else if (id < 0xE0)                         \
2025       codes[0] = 0x9A, codes[1] = id;           \
2026     else if (id < 0xF0)                         \
2027       codes[0] = 0x9B, codes[1] = id;           \
2028     else if (id < 0xF5)                         \
2029       codes[0] = 0x9C, codes[1] = id;           \
2030     else                                        \
2031       codes[0] = 0x9D, codes[1] = id;           \
2032   } while (0);
2033
2034
2035 static int
2036 encode_coding_emacs_mule (coding)
2037      struct coding_system *coding;
2038 {
2039   int multibytep = coding->dst_multibyte;
2040   int *charbuf = coding->charbuf;
2041   int *charbuf_end = charbuf + coding->charbuf_used;
2042   unsigned char *dst = coding->destination + coding->produced;
2043   unsigned char *dst_end = coding->destination + coding->dst_bytes;
2044   int safe_room = 8;
2045   int produced_chars = 0;
2046   Lisp_Object attrs, charset_list;
2047   int c;
2048   int preferred_charset_id = -1;
2049
2050   CODING_GET_INFO (coding, attrs, charset_list);
2051   if (! EQ (charset_list, Vemacs_mule_charset_list))
2052     {
2053       CODING_ATTR_CHARSET_LIST (attrs)
2054         = charset_list = Vemacs_mule_charset_list;
2055     }
2056
2057   while (charbuf < charbuf_end)
2058     {
2059       ASSURE_DESTINATION (safe_room);
2060       c = *charbuf++;
2061
2062       if (c < 0)
2063         {
2064           /* Handle an annotation.  */
2065           switch (*charbuf)
2066             {
2067             case CODING_ANNOTATE_COMPOSITION_MASK:
2068               /* Not yet implemented.  */
2069               break;
2070             case CODING_ANNOTATE_CHARSET_MASK:
2071               preferred_charset_id = charbuf[3];
2072               if (preferred_charset_id >= 0
2073                   && NILP (Fmemq (make_number (preferred_charset_id),
2074                                   charset_list)))
2075                 preferred_charset_id = -1;
2076               break;
2077             default:
2078               abort ();
2079             }
2080           charbuf += -c - 1;
2081           continue;
2082         }
2083
2084       if (ASCII_CHAR_P (c))
2085         EMIT_ONE_ASCII_BYTE (c);
2086       else if (CHAR_BYTE8_P (c))
2087         {
2088           c = CHAR_TO_BYTE8 (c);
2089           EMIT_ONE_BYTE (c);
2090         }
2091       else
2092         {
2093           struct charset *charset;
2094           unsigned code;
2095           int dimension;
2096           int emacs_mule_id;
2097           unsigned char leading_codes[2];
2098
2099           if (preferred_charset_id >= 0)
2100             {
2101               charset = CHARSET_FROM_ID (preferred_charset_id);
2102               if (! CHAR_CHARSET_P (c, charset))
2103                 charset = char_charset (c, charset_list, NULL);
2104             }
2105           else
2106             charset = char_charset (c, charset_list, &code);
2107           if (! charset)
2108             {
2109               c = coding->default_char;
2110               if (ASCII_CHAR_P (c))
2111                 {
2112                   EMIT_ONE_ASCII_BYTE (c);
2113                   continue;
2114                 }
2115               charset = char_charset (c, charset_list, &code);
2116             }
2117           dimension = CHARSET_DIMENSION (charset);
2118           emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2119           EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2120           EMIT_ONE_BYTE (leading_codes[0]);
2121           if (leading_codes[1])
2122             EMIT_ONE_BYTE (leading_codes[1]);
2123           if (dimension == 1)
2124             EMIT_ONE_BYTE (code | 0x80);
2125           else
2126             {
2127               code |= 0x8080;
2128               EMIT_ONE_BYTE (code >> 8);
2129               EMIT_ONE_BYTE (code & 0xFF);
2130             }
2131         }
2132     }
2133   coding->result = CODING_RESULT_SUCCESS;
2134   coding->produced_char += produced_chars;
2135   coding->produced = dst - coding->destination;
2136   return 0;
2137 }
2138
2139 \f
2140 /*** 7. ISO2022 handlers ***/
2141
2142 /* The following note describes the coding system ISO2022 briefly.
2143    Since the intention of this note is to help understand the
2144    functions in this file, some parts are NOT ACCURATE or are OVERLY
2145    SIMPLIFIED.  For thorough understanding, please refer to the
2146    original document of ISO2022.  This is equivalent to the standard
2147    ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2148
2149    ISO2022 provides many mechanisms to encode several character sets
2150    in 7-bit and 8-bit environments.  For 7-bit environments, all text
2151    is encoded using bytes less than 128.  This may make the encoded
2152    text a little bit longer, but the text passes more easily through
2153    several types of gateway, some of which strip off the MSB (Most
2154    Significant Bit).
2155
2156    There are two kinds of character sets: control character sets and
2157    graphic character sets.  The former contain control characters such
2158    as `newline' and `escape' to provide control functions (control
2159    functions are also provided by escape sequences).  The latter
2160    contain graphic characters such as 'A' and '-'.  Emacs recognizes
2161    two control character sets and many graphic character sets.
2162
2163    Graphic character sets are classified into one of the following
2164    four classes, according to the number of bytes (DIMENSION) and
2165    number of characters in one dimension (CHARS) of the set:
2166    - DIMENSION1_CHARS94
2167    - DIMENSION1_CHARS96
2168    - DIMENSION2_CHARS94
2169    - DIMENSION2_CHARS96
2170
2171    In addition, each character set is assigned an identification tag,
2172    unique for each set, called the "final character" (denoted as <F>
2173    hereafter).  The <F> of each character set is decided by ECMA(*)
2174    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
2175    (0x30..0x3F are for private use only).
2176
2177    Note (*): ECMA = European Computer Manufacturers Association
2178
2179    Here are examples of graphic character sets [NAME(<F>)]:
2180         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2181         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2182         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2183         o DIMENSION2_CHARS96 -- none for the moment
2184
2185    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2186         C0 [0x00..0x1F] -- control character plane 0
2187         GL [0x20..0x7F] -- graphic character plane 0
2188         C1 [0x80..0x9F] -- control character plane 1
2189         GR [0xA0..0xFF] -- graphic character plane 1
2190
2191    A control character set is directly designated and invoked to C0 or
2192    C1 by an escape sequence.  The most common case is that:
2193    - ISO646's  control character set is designated/invoked to C0, and
2194    - ISO6429's control character set is designated/invoked to C1,
2195    and usually these designations/invocations are omitted in encoded
2196    text.  In a 7-bit environment, only C0 can be used, and a control
2197    character for C1 is encoded by an appropriate escape sequence to
2198    fit into the environment.  All control characters for C1 are
2199    defined to have corresponding escape sequences.
2200
2201    A graphic character set is at first designated to one of four
2202    graphic registers (G0 through G3), then these graphic registers are
2203    invoked to GL or GR.  These designations and invocations can be
2204    done independently.  The most common case is that G0 is invoked to
2205    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
2206    these invocations and designations are omitted in encoded text.
2207    In a 7-bit environment, only GL can be used.
2208
2209    When a graphic character set of CHARS94 is invoked to GL, codes
2210    0x20 and 0x7F of the GL area work as control characters SPACE and
2211    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2212    be used.
2213
2214    There are two ways of invocation: locking-shift and single-shift.
2215    With locking-shift, the invocation lasts until the next different
2216    invocation, whereas with single-shift, the invocation affects the
2217    following character only and doesn't affect the locking-shift
2218    state.  Invocations are done by the following control characters or
2219    escape sequences:
2220
2221    ----------------------------------------------------------------------
2222    abbrev  function                  cntrl escape seq   description
2223    ----------------------------------------------------------------------
2224    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
2225    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
2226    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
2227    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
2228    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
2229    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
2230    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
2231    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
2232    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
2233    ----------------------------------------------------------------------
2234    (*) These are not used by any known coding system.
2235
2236    Control characters for these functions are defined by macros
2237    ISO_CODE_XXX in `coding.h'.
2238
2239    Designations are done by the following escape sequences:
2240    ----------------------------------------------------------------------
2241    escape sequence      description
2242    ----------------------------------------------------------------------
2243    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
2244    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
2245    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
2246    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
2247    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
2248    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
2249    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
2250    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
2251    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
2252    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
2253    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
2254    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
2255    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
2256    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
2257    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
2258    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
2259    ----------------------------------------------------------------------
2260
2261    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2262    of dimension 1, chars 94, and final character <F>, etc...
2263
2264    Note (*): Although these designations are not allowed in ISO2022,
2265    Emacs accepts them on decoding, and produces them on encoding
2266    CHARS96 character sets in a coding system which is characterized as
2267    7-bit environment, non-locking-shift, and non-single-shift.
2268
2269    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2270    '(' must be omitted.  We refer to this as "short-form" hereafter.
2271
2272    Now you may notice that there are a lot of ways of encoding the
2273    same multilingual text in ISO2022.  Actually, there exist many
2274    coding systems such as Compound Text (used in X11's inter client
2275    communication, ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2276    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2277    localized platforms), and all of these are variants of ISO2022.
2278
2279    In addition to the above, Emacs handles two more kinds of escape
2280    sequences: ISO6429's direction specification and Emacs' private
2281    sequence for specifying character composition.
2282
2283    ISO6429's direction specification takes the following form:
2284         o CSI ']'      -- end of the current direction
2285         o CSI '0' ']'  -- end of the current direction
2286         o CSI '1' ']'  -- start of left-to-right text
2287         o CSI '2' ']'  -- start of right-to-left text
2288    The control character CSI (0x9B: control sequence introducer) is
2289    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2290
2291    Character composition specification takes the following form:
2292         o ESC '0' -- start relative composition
2293         o ESC '1' -- end composition
2294         o ESC '2' -- start rule-base composition (*)
2295         o ESC '3' -- start relative composition with alternate chars  (**)
2296         o ESC '4' -- start rule-base composition with alternate chars  (**)
2297   Since these are not standard escape sequences of any ISO standard,
2298   the use of them with these meanings is restricted to Emacs only.
2299
2300   (*) This form is used only in Emacs 20.7 and older versions,
2301   but newer versions can safely decode it.
2302   (**) This form is used only in Emacs 21.1 and newer versions,
2303   and older versions can't decode it.
2304
2305   Here's a list of example usages of these composition escape
2306   sequences (categorized by `enum composition_method').
2307
2308   COMPOSITION_RELATIVE:
2309         ESC 0 CHAR [ CHAR ] ESC 1
2310   COMPOSITION_WITH_RULE:
2311         ESC 2 CHAR [ RULE CHAR ] ESC 1
2312   COMPOSITION_WITH_ALTCHARS:
2313         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2314   COMPOSITION_WITH_RULE_ALTCHARS:
2315         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2316
2317 enum iso_code_class_type iso_code_class[256];
2318
2319 #define SAFE_CHARSET_P(coding, id)      \
2320   ((id) <= (coding)->max_charset_id     \
2321    && (coding)->safe_charsets[id] >= 0)
2322
2323
2324 #define SHIFT_OUT_OK(category)  \
2325   (CODING_ISO_INITIAL (&coding_categories[category], 1) >= 0)
2326
2327 static void
2328 setup_iso_safe_charsets (attrs)
2329      Lisp_Object attrs;
2330 {
2331   Lisp_Object charset_list, safe_charsets;
2332   Lisp_Object request;
2333   Lisp_Object reg_usage;
2334   Lisp_Object tail;
2335   int reg94, reg96;
2336   int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2337   int max_charset_id;
2338
2339   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2340   if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2341       && ! EQ (charset_list, Viso_2022_charset_list))
2342     {
2343       CODING_ATTR_CHARSET_LIST (attrs)
2344         = charset_list = Viso_2022_charset_list;
2345       ASET (attrs, coding_attr_safe_charsets, Qnil);
2346     }
2347
2348   if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2349     return;
2350
2351   max_charset_id = 0;
2352   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2353     {
2354       int id = XINT (XCAR (tail));
2355       if (max_charset_id < id)
2356         max_charset_id = id;
2357     }
2358
2359   safe_charsets = Fmake_string (make_number (max_charset_id + 1),
2360                                 make_number (255));
2361   request = AREF (attrs, coding_attr_iso_request);
2362   reg_usage = AREF (attrs, coding_attr_iso_usage);
2363   reg94 = XINT (XCAR (reg_usage));
2364   reg96 = XINT (XCDR (reg_usage));
2365
2366   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2367     {
2368       Lisp_Object id;
2369       Lisp_Object reg;
2370       struct charset *charset;
2371
2372       id = XCAR (tail);
2373       charset = CHARSET_FROM_ID (XINT (id));
2374       reg = Fcdr (Fassq (id, request));
2375       if (! NILP (reg))
2376         SSET (safe_charsets, XINT (id), XINT (reg));
2377       else if (charset->iso_chars_96)
2378         {
2379           if (reg96 < 4)
2380             SSET (safe_charsets, XINT (id), reg96);
2381         }
2382       else
2383         {
2384           if (reg94 < 4)
2385             SSET (safe_charsets, XINT (id), reg94);
2386         }
2387     }
2388   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2389 }
2390
2391
2392 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2393    Check if a text is encoded in one of ISO-2022 based codig systems.
2394    If it is, return 1, else return 0.  */
2395
2396 static int
2397 detect_coding_iso_2022 (coding, detect_info)
2398      struct coding_system *coding;
2399      struct coding_detection_info *detect_info;
2400 {
2401   const unsigned char *src = coding->source, *src_base = src;
2402   const unsigned char *src_end = coding->source + coding->src_bytes;
2403   int multibytep = coding->src_multibyte;
2404   int single_shifting = 0;
2405   int id;
2406   int c, c1;
2407   int consumed_chars = 0;
2408   int i;
2409   int rejected = 0;
2410   int found = 0;
2411
2412   detect_info->checked |= CATEGORY_MASK_ISO;
2413
2414   for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
2415     {
2416       struct coding_system *this = &(coding_categories[i]);
2417       Lisp_Object attrs, val;
2418
2419       attrs = CODING_ID_ATTRS (this->id);
2420       if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2421           && ! EQ (CODING_ATTR_SAFE_CHARSETS (attrs), Viso_2022_charset_list))
2422         setup_iso_safe_charsets (attrs);
2423       val = CODING_ATTR_SAFE_CHARSETS (attrs);
2424       this->max_charset_id = SCHARS (val) - 1;
2425       this->safe_charsets = (char *) SDATA (val);
2426     }
2427
2428   /* A coding system of this category is always ASCII compatible.  */
2429   src += coding->head_ascii;
2430
2431   while (rejected != CATEGORY_MASK_ISO)
2432     {
2433       ONE_MORE_BYTE (c);
2434       switch (c)
2435         {
2436         case ISO_CODE_ESC:
2437           if (inhibit_iso_escape_detection)
2438             break;
2439           single_shifting = 0;
2440           ONE_MORE_BYTE (c);
2441           if (c >= '(' && c <= '/')
2442             {
2443               /* Designation sequence for a charset of dimension 1.  */
2444               ONE_MORE_BYTE (c1);
2445               if (c1 < ' ' || c1 >= 0x80
2446                   || (id = iso_charset_table[0][c >= ','][c1]) < 0)
2447                 /* Invalid designation sequence.  Just ignore.  */
2448                 break;
2449             }
2450           else if (c == '$')
2451             {
2452               /* Designation sequence for a charset of dimension 2.  */
2453               ONE_MORE_BYTE (c);
2454               if (c >= '@' && c <= 'B')
2455                 /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
2456                 id = iso_charset_table[1][0][c];
2457               else if (c >= '(' && c <= '/')
2458                 {
2459                   ONE_MORE_BYTE (c1);
2460                   if (c1 < ' ' || c1 >= 0x80
2461                       || (id = iso_charset_table[1][c >= ','][c1]) < 0)
2462                     /* Invalid designation sequence.  Just ignore.  */
2463                     break;
2464                 }
2465               else
2466                 /* Invalid designation sequence.  Just ignore it.  */
2467                 break;
2468             }
2469           else if (c == 'N' || c == 'O')
2470             {
2471               /* ESC <Fe> for SS2 or SS3.  */
2472               single_shifting = 1;
2473               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
2474               break;
2475             }
2476           else if (c >= '0' && c <= '4')
2477             {
2478               /* ESC <Fp> for start/end composition.  */
2479               found |= CATEGORY_MASK_ISO;
2480               break;
2481             }
2482           else
2483             {
2484               /* Invalid escape sequence.  Just ignore it.  */
2485               break;
2486             }
2487
2488           /* We found a valid designation sequence for CHARSET.  */
2489           rejected |= CATEGORY_MASK_ISO_8BIT;
2490           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
2491                               id))
2492             found |= CATEGORY_MASK_ISO_7;
2493           else
2494             rejected |= CATEGORY_MASK_ISO_7;
2495           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
2496                               id))
2497             found |= CATEGORY_MASK_ISO_7_TIGHT;
2498           else
2499             rejected |= CATEGORY_MASK_ISO_7_TIGHT;
2500           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
2501                               id))
2502             found |= CATEGORY_MASK_ISO_7_ELSE;
2503           else
2504             rejected |= CATEGORY_MASK_ISO_7_ELSE;
2505           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
2506                               id))
2507             found |= CATEGORY_MASK_ISO_8_ELSE;
2508           else
2509             rejected |= CATEGORY_MASK_ISO_8_ELSE;
2510           break;
2511
2512         case ISO_CODE_SO:
2513         case ISO_CODE_SI:
2514           /* Locking shift out/in.  */
2515           if (inhibit_iso_escape_detection)
2516             break;
2517           single_shifting = 0;
2518           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
2519           found |= CATEGORY_MASK_ISO_ELSE;
2520           break;
2521
2522         case ISO_CODE_CSI:
2523           /* Control sequence introducer.  */
2524           single_shifting = 0;
2525           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
2526           found |= CATEGORY_MASK_ISO_8_ELSE;
2527           goto check_extra_latin;
2528
2529
2530         case ISO_CODE_SS2:
2531         case ISO_CODE_SS3:
2532           /* Single shift.   */
2533           if (inhibit_iso_escape_detection)
2534             break;
2535           single_shifting = 1;
2536           rejected |= CATEGORY_MASK_ISO_7BIT;
2537           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2538               & CODING_ISO_FLAG_SINGLE_SHIFT)
2539             found |= CATEGORY_MASK_ISO_8_1;
2540           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
2541               & CODING_ISO_FLAG_SINGLE_SHIFT)
2542             found |= CATEGORY_MASK_ISO_8_2;
2543           goto check_extra_latin;
2544
2545         default:
2546           if (c < 0x80)
2547             {
2548               single_shifting = 0;
2549               break;
2550             }
2551           if (c >= 0xA0)
2552             {
2553               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
2554               found |= CATEGORY_MASK_ISO_8_1;
2555               /* Check the length of succeeding codes of the range
2556                  0xA0..0FF.  If the byte length is even, we include
2557                  CATEGORY_MASK_ISO_8_2 in `found'.  We can check this
2558                  only when we are not single shifting.  */
2559               if (! single_shifting
2560                   && ! (rejected & CATEGORY_MASK_ISO_8_2))
2561                 {
2562                   int i = 1;
2563                   while (src < src_end)
2564                     {
2565                       ONE_MORE_BYTE (c);
2566                       if (c < 0xA0)
2567                         break;
2568                       i++;
2569                     }
2570
2571                   if (i & 1 && src < src_end)
2572                     rejected |= CATEGORY_MASK_ISO_8_2;
2573                   else
2574                     found |= CATEGORY_MASK_ISO_8_2;
2575                 }
2576               break;
2577             }
2578         check_extra_latin:
2579           single_shifting = 0;
2580           if (! VECTORP (Vlatin_extra_code_table)
2581               || NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
2582             {
2583               rejected = CATEGORY_MASK_ISO;
2584               break;
2585             }
2586           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2587               & CODING_ISO_FLAG_LATIN_EXTRA)
2588             found |= CATEGORY_MASK_ISO_8_1;
2589           else
2590             rejected |= CATEGORY_MASK_ISO_8_1;
2591           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
2592               & CODING_ISO_FLAG_LATIN_EXTRA)
2593             found |= CATEGORY_MASK_ISO_8_2;
2594           else
2595             rejected |= CATEGORY_MASK_ISO_8_2;
2596         }
2597     }
2598   detect_info->rejected |= CATEGORY_MASK_ISO;
2599   return 0;
2600
2601  no_more_source:
2602   detect_info->rejected |= rejected;
2603   detect_info->found |= (found & ~rejected);
2604   return 1;
2605 }
2606
2607
2608 /* Set designation state into CODING.  */
2609 #define DECODE_DESIGNATION(reg, dim, chars_96, final)                   \
2610   do {                                                                  \
2611     int id, prev;                                                       \
2612                                                                         \
2613     if (final < '0' || final >= 128                                     \
2614         || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0)        \
2615         || !SAFE_CHARSET_P (coding, id))                                \
2616       {                                                                 \
2617         CODING_ISO_DESIGNATION (coding, reg) = -2;                      \
2618         goto invalid_code;                                              \
2619       }                                                                 \
2620     prev = CODING_ISO_DESIGNATION (coding, reg);                        \
2621     if (id == charset_jisx0201_roman)                                   \
2622       {                                                                 \
2623         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)      \
2624           id = charset_ascii;                                           \
2625       }                                                                 \
2626     else if (id == charset_jisx0208_1978)                               \
2627       {                                                                 \
2628         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)     \
2629           id = charset_jisx0208;                                        \
2630       }                                                                 \
2631     CODING_ISO_DESIGNATION (coding, reg) = id;                          \
2632     /* If there was an invalid designation to REG previously, and this  \
2633        designation is ASCII to REG, we should keep this designation     \
2634        sequence.  */                                                    \
2635     if (prev == -2 && id == charset_ascii)                              \
2636       goto invalid_code;                                                \
2637   } while (0)
2638
2639
2640 #define MAYBE_FINISH_COMPOSITION()                              \
2641   do {                                                          \
2642     int i;                                                      \
2643     if (composition_state == COMPOSING_NO)                      \
2644       break;                                                    \
2645     /* It is assured that we have enough room for producing     \
2646        characters stored in the table `components'.  */         \
2647     if (charbuf + component_idx > charbuf_end)                  \
2648       goto no_more_source;                                      \
2649     composition_state = COMPOSING_NO;                           \
2650     if (method == COMPOSITION_RELATIVE                          \
2651         || method == COMPOSITION_WITH_ALTCHARS)                 \
2652       {                                                         \
2653         for (i = 0; i < component_idx; i++)                     \
2654           *charbuf++ = components[i];                           \
2655         char_offset += component_idx;                           \
2656       }                                                         \
2657     else                                                        \
2658       {                                                         \
2659         for (i = 0; i < component_idx; i += 2)                  \
2660           *charbuf++ = components[i];                           \
2661         char_offset += (component_idx / 2) + 1;                 \
2662       }                                                         \
2663   } while (0)
2664
2665
2666 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
2667    ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
2668    ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
2669    ESC 3 : altchar composition :  ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
2670    ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1
2671   */
2672
2673 #define DECODE_COMPOSITION_START(c1)                                    \
2674   do {                                                                  \
2675     if (c1 == '0'                                                       \
2676         && composition_state == COMPOSING_COMPONENT_RULE)               \
2677       {                                                                 \
2678         component_len = component_idx;                                  \
2679         composition_state = COMPOSING_CHAR;                             \
2680       }                                                                 \
2681     else                                                                \
2682       {                                                                 \
2683         const unsigned char *p;                                         \
2684                                                                         \
2685         MAYBE_FINISH_COMPOSITION ();                                    \
2686         if (charbuf + MAX_COMPOSITION_COMPONENTS > charbuf_end)         \
2687           goto no_more_source;                                          \
2688         for (p = src; p < src_end - 1; p++)                             \
2689           if (*p == ISO_CODE_ESC && p[1] == '1')                        \
2690             break;                                                      \
2691         if (p == src_end - 1)                                           \
2692           {                                                             \
2693             if (coding->mode & CODING_MODE_LAST_BLOCK)                  \
2694               goto invalid_code;                                        \
2695             goto no_more_source;                                        \
2696           }                                                             \
2697                                                                         \
2698         /* This is surely the start of a composition.  */               \
2699         method = (c1 == '0' ? COMPOSITION_RELATIVE                      \
2700                   : c1 == '2' ? COMPOSITION_WITH_RULE                   \
2701                   : c1 == '3' ? COMPOSITION_WITH_ALTCHARS               \
2702                   : COMPOSITION_WITH_RULE_ALTCHARS);                    \
2703         composition_state = (c1 <= '2' ? COMPOSING_CHAR                 \
2704                              : COMPOSING_COMPONENT_CHAR);               \
2705         component_idx = component_len = 0;                              \
2706       }                                                                 \
2707   } while (0)
2708
2709
2710 /* Handle compositoin end sequence ESC 1.  */
2711
2712 #define DECODE_COMPOSITION_END()                                        \
2713   do {                                                                  \
2714     int nchars = (component_len > 0 ? component_idx - component_len     \
2715                   : method == COMPOSITION_RELATIVE ? component_idx      \
2716                   : (component_idx + 1) / 2);                           \
2717     int i;                                                              \
2718     int *saved_charbuf = charbuf;                                       \
2719     int from = char_offset;                                             \
2720     int to = from + nchars;                                             \
2721                                                                         \
2722     ADD_COMPOSITION_DATA (charbuf, from, to, method);                   \
2723     if (method != COMPOSITION_RELATIVE)                                 \
2724       {                                                                 \
2725         if (component_len == 0)                                         \
2726           for (i = 0; i < component_idx; i++)                           \
2727             *charbuf++ = components[i];                                 \
2728         else                                                            \
2729           for (i = 0; i < component_len; i++)                           \
2730             *charbuf++ = components[i];                                 \
2731         *saved_charbuf = saved_charbuf - charbuf;                       \
2732       }                                                                 \
2733     if (method == COMPOSITION_WITH_RULE)                                \
2734       for (i = 0; i < component_idx; i += 2, char_offset++)             \
2735         *charbuf++ = components[i];                                     \
2736     else                                                                \
2737       for (i = component_len; i < component_idx; i++, char_offset++)    \
2738         *charbuf++ = components[i];                                     \
2739     coding->annotated = 1;                                              \
2740     composition_state = COMPOSING_NO;                                   \
2741   } while (0)
2742
2743
2744 /* Decode a composition rule from the byte C1 (and maybe one more byte
2745    from SRC) and store one encoded composition rule in
2746    coding->cmp_data.  */
2747
2748 #define DECODE_COMPOSITION_RULE(c1)                                     \
2749   do {                                                                  \
2750     (c1) -= 32;                                                         \
2751     if (c1 < 81)                /* old format (before ver.21) */        \
2752       {                                                                 \
2753         int gref = (c1) / 9;                                            \
2754         int nref = (c1) % 9;                                            \
2755         if (gref == 4) gref = 10;                                       \
2756         if (nref == 4) nref = 10;                                       \
2757         c1 = COMPOSITION_ENCODE_RULE (gref, nref);                      \
2758       }                                                                 \
2759     else if (c1 < 93)           /* new format (after ver.21) */         \
2760       {                                                                 \
2761         ONE_MORE_BYTE (c2);                                             \
2762         c1 = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32);                \
2763       }                                                                 \
2764     else                                                                \
2765       c1 = 0;                                                           \
2766   } while (0)
2767
2768
2769 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
2770
2771 static void
2772 decode_coding_iso_2022 (coding)
2773      struct coding_system *coding;
2774 {
2775   const unsigned char *src = coding->source + coding->consumed;
2776   const unsigned char *src_end = coding->source + coding->src_bytes;
2777   const unsigned char *src_base;
2778   int *charbuf = coding->charbuf;
2779   int *charbuf_end
2780     = charbuf + coding->charbuf_size - 4 - MAX_ANNOTATION_LENGTH;
2781   int consumed_chars = 0, consumed_chars_base;
2782   int multibytep = coding->src_multibyte;
2783   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
2784   int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2785   int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
2786   struct charset *charset;
2787   int c;
2788   /* For handling composition sequence.  */
2789 #define COMPOSING_NO                    0
2790 #define COMPOSING_CHAR                  1
2791 #define COMPOSING_RULE                  2
2792 #define COMPOSING_COMPONENT_CHAR        3
2793 #define COMPOSING_COMPONENT_RULE        4
2794
2795   int composition_state = COMPOSING_NO;
2796   enum composition_method method;
2797   int components[MAX_COMPOSITION_COMPONENTS * 2 + 1];
2798   int component_idx;
2799   int component_len;
2800   Lisp_Object attrs, charset_list;
2801   int char_offset = coding->produced_char;
2802   int last_offset = char_offset;
2803   int last_id = charset_ascii;
2804
2805   CODING_GET_INFO (coding, attrs, charset_list);
2806   setup_iso_safe_charsets (attrs);
2807
2808   while (1)
2809     {
2810       int c1, c2;
2811
2812       src_base = src;
2813       consumed_chars_base = consumed_chars;
2814
2815       if (charbuf >= charbuf_end)
2816         break;
2817
2818       ONE_MORE_BYTE (c1);
2819
2820       /* We produce at most one character.  */
2821       switch (iso_code_class [c1])
2822         {
2823         case ISO_0x20_or_0x7F:
2824           if (composition_state != COMPOSING_NO)
2825             {
2826               if (composition_state == COMPOSING_RULE
2827                   || composition_state == COMPOSING_COMPONENT_RULE)
2828                 {
2829                   DECODE_COMPOSITION_RULE (c1);
2830                   components[component_idx++] = c1;
2831                   composition_state--;
2832                   continue;
2833                 }
2834             }
2835           if (charset_id_0 < 0
2836               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
2837             /* This is SPACE or DEL.  */
2838             charset = CHARSET_FROM_ID (charset_ascii);
2839           else
2840             charset = CHARSET_FROM_ID (charset_id_0);
2841           break;
2842
2843         case ISO_graphic_plane_0:
2844           if (composition_state != COMPOSING_NO)
2845             {
2846               if (composition_state == COMPOSING_RULE
2847                   || composition_state == COMPOSING_COMPONENT_RULE)
2848                 {
2849                   DECODE_COMPOSITION_RULE (c1);
2850                   components[component_idx++] = c1;
2851                   composition_state--;
2852                   continue;
2853                 }
2854             }
2855           charset = CHARSET_FROM_ID (charset_id_0);
2856           break;
2857
2858         case ISO_0xA0_or_0xFF:
2859           if (charset_id_1 < 0
2860               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
2861               || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
2862             goto invalid_code;
2863           /* This is a graphic character, we fall down ... */
2864
2865         case ISO_graphic_plane_1:
2866           if (charset_id_1 < 0)
2867             goto invalid_code;
2868           charset = CHARSET_FROM_ID (charset_id_1);
2869           break;
2870
2871         case ISO_control_0:
2872           MAYBE_FINISH_COMPOSITION ();
2873           charset = CHARSET_FROM_ID (charset_ascii);
2874           break;
2875
2876         case ISO_control_1:
2877           MAYBE_FINISH_COMPOSITION ();
2878           goto invalid_code;
2879
2880         case ISO_shift_out:
2881           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
2882               || CODING_ISO_DESIGNATION (coding, 1) < 0)
2883             goto invalid_code;
2884           CODING_ISO_INVOCATION (coding, 0) = 1;
2885           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2886           continue;
2887
2888         case ISO_shift_in:
2889           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
2890             goto invalid_code;
2891           CODING_ISO_INVOCATION (coding, 0) = 0;
2892           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2893           continue;
2894
2895         case ISO_single_shift_2_7:
2896         case ISO_single_shift_2:
2897           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
2898             goto invalid_code;
2899           /* SS2 is handled as an escape sequence of ESC 'N' */
2900           c1 = 'N';
2901           goto label_escape_sequence;
2902
2903         case ISO_single_shift_3:
2904           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
2905             goto invalid_code;
2906           /* SS2 is handled as an escape sequence of ESC 'O' */
2907           c1 = 'O';
2908           goto label_escape_sequence;
2909
2910         case ISO_control_sequence_introducer:
2911           /* CSI is handled as an escape sequence of ESC '[' ...  */
2912           c1 = '[';
2913           goto label_escape_sequence;
2914
2915         case ISO_escape:
2916           ONE_MORE_BYTE (c1);
2917         label_escape_sequence:
2918           /* Escape sequences handled here are invocation,
2919              designation, direction specification, and character
2920              composition specification.  */
2921           switch (c1)
2922             {
2923             case '&':           /* revision of following character set */
2924               ONE_MORE_BYTE (c1);
2925               if (!(c1 >= '@' && c1 <= '~'))
2926                 goto invalid_code;
2927               ONE_MORE_BYTE (c1);
2928               if (c1 != ISO_CODE_ESC)
2929                 goto invalid_code;
2930               ONE_MORE_BYTE (c1);
2931               goto label_escape_sequence;
2932
2933             case '$':           /* designation of 2-byte character set */
2934               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
2935                 goto invalid_code;
2936               ONE_MORE_BYTE (c1);
2937               if (c1 >= '@' && c1 <= 'B')
2938                 {       /* designation of JISX0208.1978, GB2312.1980,
2939                            or JISX0208.1980 */
2940                   DECODE_DESIGNATION (0, 2, 0, c1);
2941                 }
2942               else if (c1 >= 0x28 && c1 <= 0x2B)
2943                 {       /* designation of DIMENSION2_CHARS94 character set */
2944                   ONE_MORE_BYTE (c2);
2945                   DECODE_DESIGNATION (c1 - 0x28, 2, 0, c2);
2946                 }
2947               else if (c1 >= 0x2C && c1 <= 0x2F)
2948                 {       /* designation of DIMENSION2_CHARS96 character set */
2949                   ONE_MORE_BYTE (c2);
2950                   DECODE_DESIGNATION (c1 - 0x2C, 2, 1, c2);
2951                 }
2952               else
2953                 goto invalid_code;
2954               /* We must update these variables now.  */
2955               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2956               charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
2957               continue;
2958
2959             case 'n':           /* invocation of locking-shift-2 */
2960               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
2961                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
2962                 goto invalid_code;
2963               CODING_ISO_INVOCATION (coding, 0) = 2;
2964               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2965               continue;
2966
2967             case 'o':           /* invocation of locking-shift-3 */
2968               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
2969                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
2970                 goto invalid_code;
2971               CODING_ISO_INVOCATION (coding, 0) = 3;
2972               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2973               continue;
2974
2975             case 'N':           /* invocation of single-shift-2 */
2976               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
2977                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
2978                 goto invalid_code;
2979               charset = CHARSET_FROM_ID (CODING_ISO_DESIGNATION (coding, 2));
2980               ONE_MORE_BYTE (c1);
2981               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
2982                 goto invalid_code;
2983               break;
2984
2985             case 'O':           /* invocation of single-shift-3 */
2986               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
2987                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
2988                 goto invalid_code;
2989               charset = CHARSET_FROM_ID (CODING_ISO_DESIGNATION (coding, 3));
2990               ONE_MORE_BYTE (c1);
2991               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
2992                 goto invalid_code;
2993               break;
2994
2995             case '0': case '2': case '3': case '4': /* start composition */
2996               if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
2997                 goto invalid_code;
2998               DECODE_COMPOSITION_START (c1);
2999               continue;
3000
3001             case '1':           /* end composition */
3002               if (composition_state == COMPOSING_NO)
3003                 goto invalid_code;
3004               DECODE_COMPOSITION_END ();
3005               continue;
3006
3007             case '[':           /* specification of direction */
3008               if (! CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION)
3009                 goto invalid_code;
3010               /* For the moment, nested direction is not supported.
3011                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
3012                  left-to-right, and nozero means right-to-left.  */
3013               ONE_MORE_BYTE (c1);
3014               switch (c1)
3015                 {
3016                 case ']':       /* end of the current direction */
3017                   coding->mode &= ~CODING_MODE_DIRECTION;
3018
3019                 case '0':       /* end of the current direction */
3020                 case '1':       /* start of left-to-right direction */
3021                   ONE_MORE_BYTE (c1);
3022                   if (c1 == ']')
3023                     coding->mode &= ~CODING_MODE_DIRECTION;
3024                   else
3025                     goto invalid_code;
3026                   break;
3027
3028                 case '2':       /* start of right-to-left direction */
3029                   ONE_MORE_BYTE (c1);
3030                   if (c1 == ']')
3031                     coding->mode |= CODING_MODE_DIRECTION;
3032                   else
3033                     goto invalid_code;
3034                   break;
3035
3036                 default:
3037                   goto invalid_code;
3038                 }
3039               continue;
3040
3041             case '%':
3042               ONE_MORE_BYTE (c1);
3043               if (c1 == '/')
3044                 {
3045                   /* CTEXT extended segment:
3046                      ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3047                      We keep these bytes as is for the moment.
3048                      They may be decoded by post-read-conversion.  */
3049                   int dim, M, L;
3050                   int size;
3051
3052                   ONE_MORE_BYTE (dim);
3053                   ONE_MORE_BYTE (M);
3054                   ONE_MORE_BYTE (L);
3055                   size = ((M - 128) * 128) + (L - 128);
3056                   if (charbuf + 8 + size > charbuf_end)
3057                     goto break_loop;
3058                   *charbuf++ = ISO_CODE_ESC;
3059                   *charbuf++ = '%';
3060                   *charbuf++ = '/';
3061                   *charbuf++ = dim;
3062                   *charbuf++ = BYTE8_TO_CHAR (M);
3063                   *charbuf++ = BYTE8_TO_CHAR (L);
3064                   while (size-- > 0)
3065                     {
3066                       ONE_MORE_BYTE (c1);
3067                       *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3068                     }
3069                 }
3070               else if (c1 == 'G')
3071                 {
3072                   /* XFree86 extension for embedding UTF-8 in CTEXT:
3073                      ESC % G --UTF-8-BYTES-- ESC % @
3074                      We keep these bytes as is for the moment.
3075                      They may be decoded by post-read-conversion.  */
3076                   int *p = charbuf;
3077
3078                   if (p + 6 > charbuf_end)
3079                     goto break_loop;
3080                   *p++ = ISO_CODE_ESC;
3081                   *p++ = '%';
3082                   *p++ = 'G';
3083                   while (p < charbuf_end)
3084                     {
3085                       ONE_MORE_BYTE (c1);
3086                       if (c1 == ISO_CODE_ESC
3087                           && src + 1 < src_end
3088                           && src[0] == '%'
3089                           && src[1] == '@')
3090                         break;
3091                       *p++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3092                     }
3093                   if (p + 3 > charbuf_end)
3094                     goto break_loop;
3095                   *p++ = ISO_CODE_ESC;
3096                   *p++ = '%';
3097                   *p++ = '@';
3098                   charbuf = p;
3099                 }
3100               else
3101                 goto invalid_code;
3102               continue;
3103               break;
3104
3105             default:
3106               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3107                 goto invalid_code;
3108               if (c1 >= 0x28 && c1 <= 0x2B)
3109                 {       /* designation of DIMENSION1_CHARS94 character set */
3110                   ONE_MORE_BYTE (c2);
3111                   DECODE_DESIGNATION (c1 - 0x28, 1, 0, c2);
3112                 }
3113               else if (c1 >= 0x2C && c1 <= 0x2F)
3114                 {       /* designation of DIMENSION1_CHARS96 character set */
3115                   ONE_MORE_BYTE (c2);
3116                   DECODE_DESIGNATION (c1 - 0x2C, 1, 1, c2);
3117                 }
3118               else
3119                 goto invalid_code;
3120               /* We must update these variables now.  */
3121               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3122               charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3123               continue;
3124             }
3125         }
3126
3127       if (charset->id != charset_ascii
3128           && last_id != charset->id)
3129         {
3130           if (last_id != charset_ascii)
3131             ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
3132           last_id = charset->id;
3133           last_offset = char_offset;
3134         }
3135
3136       /* Now we know CHARSET and 1st position code C1 of a character.
3137          Produce a decoded character while getting 2nd position code
3138          C2 if necessary.  */
3139       c1 &= 0x7F;
3140       if (CHARSET_DIMENSION (charset) > 1)
3141         {
3142           ONE_MORE_BYTE (c2);
3143           if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
3144             /* C2 is not in a valid range.  */
3145             goto invalid_code;
3146           c1 = (c1 << 8) | (c2 & 0x7F);
3147           if (CHARSET_DIMENSION (charset) > 2)
3148             {
3149               ONE_MORE_BYTE (c2);
3150               if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
3151                 /* C2 is not in a valid range.  */
3152                 goto invalid_code;
3153               c1 = (c1 << 8) | (c2 & 0x7F);
3154             }
3155         }
3156
3157       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3158       if (c < 0)
3159         {
3160           MAYBE_FINISH_COMPOSITION ();
3161           for (; src_base < src; src_base++, char_offset++)
3162             {
3163               if (ASCII_BYTE_P (*src_base))
3164                 *charbuf++ = *src_base;
3165               else
3166                 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3167             }
3168         }
3169       else if (composition_state == COMPOSING_NO)
3170         {
3171           *charbuf++ = c;
3172           char_offset++;
3173         }
3174       else
3175         {
3176           components[component_idx++] = c;
3177           if (method == COMPOSITION_WITH_RULE
3178               || (method == COMPOSITION_WITH_RULE_ALTCHARS
3179                   && composition_state == COMPOSING_COMPONENT_CHAR))
3180             composition_state++;
3181         }
3182       continue;
3183
3184     invalid_code:
3185       MAYBE_FINISH_COMPOSITION ();
3186       src = src_base;
3187       consumed_chars = consumed_chars_base;
3188       ONE_MORE_BYTE (c);
3189       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
3190       char_offset++;
3191       coding->errors++;
3192       continue;
3193
3194     break_loop:
3195       break;
3196     }
3197
3198  no_more_source:
3199   if (last_id != charset_ascii)
3200     ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
3201   coding->consumed_char += consumed_chars_base;
3202   coding->consumed = src_base - coding->source;
3203   coding->charbuf_used = charbuf - coding->charbuf;
3204 }
3205
3206
3207 /* ISO2022 encoding stuff.  */
3208
3209 /*
3210    It is not enough to say just "ISO2022" on encoding, we have to
3211    specify more details.  In Emacs, each coding system of ISO2022
3212    variant has the following specifications:
3213         1. Initial designation to G0 thru G3.
3214         2. Allows short-form designation?
3215         3. ASCII should be designated to G0 before control characters?
3216         4. ASCII should be designated to G0 at end of line?
3217         5. 7-bit environment or 8-bit environment?
3218         6. Use locking-shift?
3219         7. Use Single-shift?
3220    And the following two are only for Japanese:
3221         8. Use ASCII in place of JIS0201-1976-Roman?
3222         9. Use JISX0208-1983 in place of JISX0208-1978?
3223    These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
3224    defined by macros CODING_ISO_FLAG_XXX.  See `coding.h' for more
3225    details.
3226 */
3227
3228 /* Produce codes (escape sequence) for designating CHARSET to graphic
3229    register REG at DST, and increment DST.  If <final-char> of CHARSET is
3230    '@', 'A', or 'B' and the coding system CODING allows, produce
3231    designation sequence of short-form.  */
3232
3233 #define ENCODE_DESIGNATION(charset, reg, coding)                        \
3234   do {                                                                  \
3235     unsigned char final_char = CHARSET_ISO_FINAL (charset);             \
3236     char *intermediate_char_94 = "()*+";                                \
3237     char *intermediate_char_96 = ",-./";                                \
3238     int revision = -1;                                                  \
3239     int c;                                                              \
3240                                                                         \
3241     if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION)           \
3242       revision = CHARSET_ISO_REVISION (charset);                        \
3243                                                                         \
3244     if (revision >= 0)                                                  \
3245       {                                                                 \
3246         EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&');                       \
3247         EMIT_ONE_BYTE ('@' + revision);                                 \
3248       }                                                                 \
3249     EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC);                                 \
3250     if (CHARSET_DIMENSION (charset) == 1)                               \
3251       {                                                                 \
3252         if (! CHARSET_ISO_CHARS_96 (charset))                           \
3253           c = intermediate_char_94[reg];                                \
3254         else                                                            \
3255           c = intermediate_char_96[reg];                                \
3256         EMIT_ONE_ASCII_BYTE (c);                                        \
3257       }                                                                 \
3258     else                                                                \
3259       {                                                                 \
3260         EMIT_ONE_ASCII_BYTE ('$');                                      \
3261         if (! CHARSET_ISO_CHARS_96 (charset))                           \
3262           {                                                             \
3263             if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM   \
3264                 || reg != 0                                             \
3265                 || final_char < '@' || final_char > 'B')                \
3266               EMIT_ONE_ASCII_BYTE (intermediate_char_94[reg]);          \
3267           }                                                             \
3268         else                                                            \
3269           EMIT_ONE_ASCII_BYTE (intermediate_char_96[reg]);              \
3270       }                                                                 \
3271     EMIT_ONE_ASCII_BYTE (final_char);                                   \
3272                                                                         \
3273     CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset);        \
3274   } while (0)
3275
3276
3277 /* The following two macros produce codes (control character or escape
3278    sequence) for ISO2022 single-shift functions (single-shift-2 and
3279    single-shift-3).  */
3280
3281 #define ENCODE_SINGLE_SHIFT_2                                           \
3282   do {                                                                  \
3283     if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)         \
3284       EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N');                         \
3285     else                                                                \
3286       EMIT_ONE_BYTE (ISO_CODE_SS2);                                     \
3287     CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
3288   } while (0)
3289
3290
3291 #define ENCODE_SINGLE_SHIFT_3                                           \
3292   do {                                                                  \
3293     if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)         \
3294       EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O');                         \
3295     else                                                                \
3296       EMIT_ONE_BYTE (ISO_CODE_SS3);                                     \
3297     CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
3298   } while (0)
3299
3300
3301 /* The following four macros produce codes (control character or
3302    escape sequence) for ISO2022 locking-shift functions (shift-in,
3303    shift-out, locking-shift-2, and locking-shift-3).  */
3304
3305 #define ENCODE_SHIFT_IN                                 \
3306   do {                                                  \
3307     EMIT_ONE_ASCII_BYTE (ISO_CODE_SI);                  \
3308     CODING_ISO_INVOCATION (coding, 0) = 0;              \
3309   } while (0)
3310
3311
3312 #define ENCODE_SHIFT_OUT                                \
3313   do {                                                  \
3314     EMIT_ONE_ASCII_BYTE (ISO_CODE_SO);                  \
3315     CODING_ISO_INVOCATION (coding, 0) = 1;              \
3316   } while (0)
3317
3318
3319 #define ENCODE_LOCKING_SHIFT_2                          \
3320   do {                                                  \
3321     EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');           \
3322     CODING_ISO_INVOCATION (coding, 0) = 2;              \
3323   } while (0)
3324
3325
3326 #define ENCODE_LOCKING_SHIFT_3                          \
3327   do {                                                  \
3328     EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');           \
3329     CODING_ISO_INVOCATION (coding, 0) = 3;              \
3330   } while (0)
3331
3332
3333 /* Produce codes for a DIMENSION1 character whose character set is
3334    CHARSET and whose position-code is C1.  Designation and invocation
3335    sequences are also produced in advance if necessary.  */
3336
3337 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
3338   do {                                                                  \
3339     int id = CHARSET_ID (charset);                                      \
3340                                                                         \
3341     if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)         \
3342         && id == charset_ascii)                                         \
3343       {                                                                 \
3344         id = charset_jisx0201_roman;                                    \
3345         charset = CHARSET_FROM_ID (id);                                 \
3346       }                                                                 \
3347                                                                         \
3348     if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
3349       {                                                                 \
3350         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
3351           EMIT_ONE_ASCII_BYTE (c1 & 0x7F);                              \
3352         else                                                            \
3353           EMIT_ONE_BYTE (c1 | 0x80);                                    \
3354         CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
3355         break;                                                          \
3356       }                                                                 \
3357     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
3358       {                                                                 \
3359         EMIT_ONE_ASCII_BYTE (c1 & 0x7F);                                \
3360         break;                                                          \
3361       }                                                                 \
3362     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
3363       {                                                                 \
3364         EMIT_ONE_BYTE (c1 | 0x80);                                      \
3365         break;                                                          \
3366       }                                                                 \
3367     else                                                                \
3368       /* Since CHARSET is not yet invoked to any graphic planes, we     \
3369          must invoke it, or, at first, designate it to some graphic     \
3370          register.  Then repeat the loop to actually produce the        \
3371          character.  */                                                 \
3372       dst = encode_invocation_designation (charset, coding, dst,        \
3373                                            &produced_chars);            \
3374   } while (1)
3375
3376
3377 /* Produce codes for a DIMENSION2 character whose character set is
3378    CHARSET and whose position-codes are C1 and C2.  Designation and
3379    invocation codes are also produced in advance if necessary.  */
3380
3381 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
3382   do {                                                                  \
3383     int id = CHARSET_ID (charset);                                      \
3384                                                                         \
3385     if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)        \
3386         && id == charset_jisx0208)                                      \
3387       {                                                                 \
3388         id = charset_jisx0208_1978;                                     \
3389         charset = CHARSET_FROM_ID (id);                                 \
3390       }                                                                 \
3391                                                                         \
3392     if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
3393       {                                                                 \
3394         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
3395           EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);              \
3396         else                                                            \
3397           EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                    \
3398         CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
3399         break;                                                          \
3400       }                                                                 \
3401     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
3402       {                                                                 \
3403         EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);                \
3404         break;                                                          \
3405       }                                                                 \
3406     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
3407       {                                                                 \
3408         EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                      \
3409         break;                                                          \
3410       }                                                                 \
3411     else                                                                \
3412       /* Since CHARSET is not yet invoked to any graphic planes, we     \
3413          must invoke it, or, at first, designate it to some graphic     \
3414          register.  Then repeat the loop to actually produce the        \
3415          character.  */                                                 \
3416       dst = encode_invocation_designation (charset, coding, dst,        \
3417                                            &produced_chars);            \
3418   } while (1)
3419
3420
3421 #define ENCODE_ISO_CHARACTER(charset, c)                                   \
3422   do {                                                                     \
3423     int code = ENCODE_CHAR ((charset),(c));                                \
3424                                                                            \
3425     if (CHARSET_DIMENSION (charset) == 1)                                  \
3426       ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code);                   \
3427     else                                                                   \
3428       ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
3429   } while (0)
3430
3431
3432 /* Produce designation and invocation codes at a place pointed by DST
3433    to use CHARSET.  The element `spec.iso_2022' of *CODING is updated.
3434    Return new DST.  */
3435
3436 unsigned char *
3437 encode_invocation_designation (charset, coding, dst, p_nchars)
3438      struct charset *charset;
3439      struct coding_system *coding;
3440      unsigned char *dst;
3441      int *p_nchars;
3442 {
3443   int multibytep = coding->dst_multibyte;
3444   int produced_chars = *p_nchars;
3445   int reg;                      /* graphic register number */
3446   int id = CHARSET_ID (charset);
3447
3448   /* At first, check designations.  */
3449   for (reg = 0; reg < 4; reg++)
3450     if (id == CODING_ISO_DESIGNATION (coding, reg))
3451       break;
3452
3453   if (reg >= 4)
3454     {
3455       /* CHARSET is not yet designated to any graphic registers.  */
3456       /* At first check the requested designation.  */
3457       reg = CODING_ISO_REQUEST (coding, id);
3458       if (reg < 0)
3459         /* Since CHARSET requests no special designation, designate it
3460            to graphic register 0.  */
3461         reg = 0;
3462
3463       ENCODE_DESIGNATION (charset, reg, coding);
3464     }
3465
3466   if (CODING_ISO_INVOCATION (coding, 0) != reg
3467       && CODING_ISO_INVOCATION (coding, 1) != reg)
3468     {
3469       /* Since the graphic register REG is not invoked to any graphic
3470          planes, invoke it to graphic plane 0.  */
3471       switch (reg)
3472         {
3473         case 0:                 /* graphic register 0 */
3474           ENCODE_SHIFT_IN;
3475           break;
3476
3477         case 1:                 /* graphic register 1 */
3478           ENCODE_SHIFT_OUT;
3479           break;
3480
3481         case 2:                 /* graphic register 2 */
3482           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3483             ENCODE_SINGLE_SHIFT_2;
3484           else
3485             ENCODE_LOCKING_SHIFT_2;
3486           break;
3487
3488         case 3:                 /* graphic register 3 */
3489           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3490             ENCODE_SINGLE_SHIFT_3;
3491           else
3492             ENCODE_LOCKING_SHIFT_3;
3493           break;
3494         }
3495     }
3496
3497   *p_nchars = produced_chars;
3498   return dst;
3499 }
3500
3501 /* The following three macros produce codes for indicating direction
3502    of text.  */
3503 #define ENCODE_CONTROL_SEQUENCE_INTRODUCER                              \
3504   do {                                                                  \
3505     if (CODING_ISO_FLAGS (coding) == CODING_ISO_FLAG_SEVEN_BITS)        \
3506       EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '[');                         \
3507     else                                                                \
3508       EMIT_ONE_BYTE (ISO_CODE_CSI);                                     \
3509   } while (0)
3510
3511
3512 #define ENCODE_DIRECTION_R2L()                  \
3513   do {                                          \
3514     ENCODE_CONTROL_SEQUENCE_INTRODUCER (dst);   \
3515     EMIT_TWO_ASCII_BYTES ('2', ']');            \
3516   } while (0)
3517
3518
3519 #define ENCODE_DIRECTION_L2R()                  \
3520   do {                                          \
3521     ENCODE_CONTROL_SEQUENCE_INTRODUCER (dst);   \
3522     EMIT_TWO_ASCII_BYTES ('0', ']');            \
3523   } while (0)
3524
3525
3526 /* Produce codes for designation and invocation to reset the graphic
3527    planes and registers to initial state.  */
3528 #define ENCODE_RESET_PLANE_AND_REGISTER()                               \
3529   do {                                                                  \
3530     int reg;                                                            \
3531     struct charset *charset;                                            \
3532                                                                         \
3533     if (CODING_ISO_INVOCATION (coding, 0) != 0)                         \
3534       ENCODE_SHIFT_IN;                                                  \
3535     for (reg = 0; reg < 4; reg++)                                       \
3536       if (CODING_ISO_INITIAL (coding, reg) >= 0                         \
3537           && (CODING_ISO_DESIGNATION (coding, reg)                      \
3538               != CODING_ISO_INITIAL (coding, reg)))                     \
3539         {                                                               \
3540           charset = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg)); \
3541           ENCODE_DESIGNATION (charset, reg, coding);                    \
3542         }                                                               \
3543   } while (0)
3544
3545
3546 /* Produce designation sequences of charsets in the line started from
3547    SRC to a place pointed by DST, and return updated DST.
3548
3549    If the current block ends before any end-of-line, we may fail to
3550    find all the necessary designations.  */
3551
3552 static unsigned char *
3553 encode_designation_at_bol (coding, charbuf, charbuf_end, dst)
3554      struct coding_system *coding;
3555      int *charbuf, *charbuf_end;
3556      unsigned char *dst;
3557 {
3558   struct charset *charset;
3559   /* Table of charsets to be designated to each graphic register.  */
3560   int r[4];
3561   int c, found = 0, reg;
3562   int produced_chars = 0;
3563   int multibytep = coding->dst_multibyte;
3564   Lisp_Object attrs;
3565   Lisp_Object charset_list;
3566
3567   attrs = CODING_ID_ATTRS (coding->id);
3568   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
3569   if (EQ (charset_list, Qiso_2022))
3570     charset_list = Viso_2022_charset_list;
3571
3572   for (reg = 0; reg < 4; reg++)
3573     r[reg] = -1;
3574
3575   while (found < 4)
3576     {
3577       int id;
3578
3579       c = *charbuf++;
3580       if (c == '\n')
3581         break;
3582       charset = char_charset (c, charset_list, NULL);
3583       id = CHARSET_ID (charset);
3584       reg = CODING_ISO_REQUEST (coding, id);
3585       if (reg >= 0 && r[reg] < 0)
3586         {
3587           found++;
3588           r[reg] = id;
3589         }
3590     }
3591
3592   if (found)
3593     {
3594       for (reg = 0; reg < 4; reg++)
3595         if (r[reg] >= 0
3596             && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
3597           ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
3598     }
3599
3600   return dst;
3601 }
3602
3603 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".  */
3604
3605 static int
3606 encode_coding_iso_2022 (coding)
3607      struct coding_system *coding;
3608 {
3609   int multibytep = coding->dst_multibyte;
3610   int *charbuf = coding->charbuf;
3611   int *charbuf_end = charbuf + coding->charbuf_used;
3612   unsigned char *dst = coding->destination + coding->produced;
3613   unsigned char *dst_end = coding->destination + coding->dst_bytes;
3614   int safe_room = 16;
3615   int bol_designation
3616     = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
3617        && CODING_ISO_BOL (coding));
3618   int produced_chars = 0;
3619   Lisp_Object attrs, eol_type, charset_list;
3620   int ascii_compatible;
3621   int c;
3622   int preferred_charset_id = -1;
3623
3624   CODING_GET_INFO (coding, attrs, charset_list);
3625   eol_type = CODING_ID_EOL_TYPE (coding->id);
3626   if (VECTORP (eol_type))
3627     eol_type = Qunix;
3628
3629   setup_iso_safe_charsets (attrs);
3630   /* Charset list may have been changed.  */
3631   charset_list = CODING_ATTR_CHARSET_LIST (attrs);              \
3632   coding->safe_charsets = (char *) SDATA (CODING_ATTR_SAFE_CHARSETS(attrs));
3633
3634   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
3635
3636   while (charbuf < charbuf_end)
3637     {
3638       ASSURE_DESTINATION (safe_room);
3639
3640       if (bol_designation)
3641         {
3642           unsigned char *dst_prev = dst;
3643
3644           /* We have to produce designation sequences if any now.  */
3645           dst = encode_designation_at_bol (coding, charbuf, charbuf_end, dst);
3646           bol_designation = 0;
3647           /* We are sure that designation sequences are all ASCII bytes.  */
3648           produced_chars += dst - dst_prev;
3649         }
3650
3651       c = *charbuf++;
3652
3653       if (c < 0)
3654         {
3655           /* Handle an annotation.  */
3656           switch (*charbuf)
3657             {
3658             case CODING_ANNOTATE_COMPOSITION_MASK:
3659               /* Not yet implemented.  */
3660               break;
3661             case CODING_ANNOTATE_CHARSET_MASK:
3662               preferred_charset_id = charbuf[3];
3663               if (preferred_charset_id >= 0
3664                   && NILP (Fmemq (make_number (preferred_charset_id),
3665                                   charset_list)))
3666                 preferred_charset_id = -1;
3667               break;
3668             default:
3669               abort ();
3670             }
3671           charbuf += -c - 1;
3672           continue;
3673         }
3674
3675       /* Now encode the character C.  */
3676       if (c < 0x20 || c == 0x7F)
3677         {
3678           if (c == '\n'
3679               || (c == '\r' && EQ (eol_type, Qmac)))
3680             {
3681               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
3682                 ENCODE_RESET_PLANE_AND_REGISTER ();
3683               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
3684                 {
3685                   int i;
3686
3687                   for (i = 0; i < 4; i++)
3688                     CODING_ISO_DESIGNATION (coding, i)
3689                       = CODING_ISO_INITIAL (coding, i);
3690                 }
3691               bol_designation
3692                 = CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL;
3693             }
3694           else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
3695             ENCODE_RESET_PLANE_AND_REGISTER ();
3696           EMIT_ONE_ASCII_BYTE (c);
3697         }
3698       else if (ASCII_CHAR_P (c))
3699         {
3700           if (ascii_compatible)
3701             EMIT_ONE_ASCII_BYTE (c);
3702           else
3703             {
3704               struct charset *charset = CHARSET_FROM_ID (charset_ascii);
3705               ENCODE_ISO_CHARACTER (charset, c);
3706             }
3707         }
3708       else if (CHAR_BYTE8_P (c))
3709         {
3710           c = CHAR_TO_BYTE8 (c);
3711           EMIT_ONE_BYTE (c);
3712         }
3713       else
3714         {
3715           struct charset *charset;
3716
3717           if (preferred_charset_id >= 0)
3718             {
3719               charset = CHARSET_FROM_ID (preferred_charset_id);
3720               if (! CHAR_CHARSET_P (c, charset))
3721                 charset = char_charset (c, charset_list, NULL);
3722             }
3723           else
3724             charset = char_charset (c, charset_list, NULL);
3725           if (!charset)
3726             {
3727               if (coding->mode & CODING_MODE_SAFE_ENCODING)
3728                 {
3729                   c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
3730                   charset = CHARSET_FROM_ID (charset_ascii);
3731                 }
3732               else
3733                 {
3734                   c = coding->default_char;
3735                   charset = char_charset (c, charset_list, NULL);
3736                 }
3737             }
3738           ENCODE_ISO_CHARACTER (charset, c);
3739         }
3740     }
3741
3742   if (coding->mode & CODING_MODE_LAST_BLOCK
3743       && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
3744     {
3745       ASSURE_DESTINATION (safe_room);
3746       ENCODE_RESET_PLANE_AND_REGISTER ();
3747     }
3748   coding->result = CODING_RESULT_SUCCESS;
3749   CODING_ISO_BOL (coding) = bol_designation;
3750   coding->produced_char += produced_chars;
3751   coding->produced = dst - coding->destination;
3752   return 0;
3753 }
3754
3755 \f
3756 /*** 8,9. SJIS and BIG5 handlers ***/
3757
3758 /* Although SJIS and BIG5 are not ISO's coding system, they are used
3759    quite widely.  So, for the moment, Emacs supports them in the bare
3760    C code.  But, in the future, they may be supported only by CCL.  */
3761
3762 /* SJIS is a coding system encoding three character sets: ASCII, right
3763    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
3764    as is.  A character of charset katakana-jisx0201 is encoded by
3765    "position-code + 0x80".  A character of charset japanese-jisx0208
3766    is encoded in 2-byte but two position-codes are divided and shifted
3767    so that it fit in the range below.
3768
3769    --- CODE RANGE of SJIS ---
3770    (character set)      (range)
3771    ASCII                0x00 .. 0x7F
3772    KATAKANA-JISX0201    0xA0 .. 0xDF
3773    JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
3774             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
3775    -------------------------------
3776
3777 */
3778
3779 /* BIG5 is a coding system encoding two character sets: ASCII and
3780    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
3781    character set and is encoded in two-byte.
3782
3783    --- CODE RANGE of BIG5 ---
3784    (character set)      (range)
3785    ASCII                0x00 .. 0x7F
3786    Big5 (1st byte)      0xA1 .. 0xFE
3787         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
3788    --------------------------
3789
3790   */
3791
3792 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3793    Check if a text is encoded in SJIS.  If it is, return
3794    CATEGORY_MASK_SJIS, else return 0.  */
3795
3796 static int
3797 detect_coding_sjis (coding, detect_info)
3798      struct coding_system *coding;
3799      struct coding_detection_info *detect_info;
3800 {
3801   const unsigned char *src = coding->source, *src_base = src;
3802   const unsigned char *src_end = coding->source + coding->src_bytes;
3803   int multibytep = coding->src_multibyte;
3804   int consumed_chars = 0;
3805   int found = 0;
3806   int c;
3807   int incomplete;
3808
3809   detect_info->checked |= CATEGORY_MASK_SJIS;
3810   /* A coding system of this category is always ASCII compatible.  */
3811   src += coding->head_ascii;
3812
3813   while (1)
3814     {
3815       incomplete = 0;
3816       ONE_MORE_BYTE (c);
3817       incomplete = 1;
3818       if (c < 0x80)
3819         continue;
3820       if ((c >= 0x81 && c <= 0x9F) || (c >= 0xE0 && c <= 0xEF))
3821         {
3822           ONE_MORE_BYTE (c);
3823           if (c < 0x40 || c == 0x7F || c > 0xFC)
3824             break;
3825           found = CATEGORY_MASK_SJIS;
3826         }
3827       else if (c >= 0xA0 && c < 0xE0)
3828         found = CATEGORY_MASK_SJIS;
3829       else
3830         break;
3831     }
3832   detect_info->rejected |= CATEGORY_MASK_SJIS;
3833   return 0;
3834
3835  no_more_source:
3836   if (incomplete && coding->mode & CODING_MODE_LAST_BLOCK)
3837     {
3838       detect_info->rejected |= CATEGORY_MASK_SJIS;
3839       return 0;
3840     }
3841   detect_info->found |= found;
3842   return 1;
3843 }
3844
3845 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3846    Check if a text is encoded in BIG5.  If it is, return
3847    CATEGORY_MASK_BIG5, else return 0.  */
3848
3849 static int
3850 detect_coding_big5 (coding, detect_info)
3851      struct coding_system *coding;
3852      struct coding_detection_info *detect_info;
3853 {
3854   const unsigned char *src = coding->source, *src_base = src;
3855   const unsigned char *src_end = coding->source + coding->src_bytes;
3856   int multibytep = coding->src_multibyte;
3857   int consumed_chars = 0;
3858   int found = 0;
3859   int c;
3860   int incomplete;
3861
3862   detect_info->checked |= CATEGORY_MASK_BIG5;
3863   /* A coding system of this category is always ASCII compatible.  */
3864   src += coding->head_ascii;
3865
3866   while (1)
3867     {
3868       incomplete = 0;
3869       ONE_MORE_BYTE (c);
3870       incomplete = 1;
3871       if (c < 0x80)
3872         continue;
3873       if (c >= 0xA1)
3874         {
3875           ONE_MORE_BYTE (c);
3876           if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
3877             return 0;
3878           found = CATEGORY_MASK_BIG5;
3879         }
3880       else
3881         break;
3882     }
3883   detect_info->rejected |= CATEGORY_MASK_BIG5;
3884   return 0;
3885
3886  no_more_source:
3887   if (incomplete && coding->mode & CODING_MODE_LAST_BLOCK)
3888     {
3889       detect_info->rejected |= CATEGORY_MASK_BIG5;
3890       return 0;
3891     }
3892   detect_info->found |= found;
3893   return 1;
3894 }
3895
3896 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
3897    If SJIS_P is 1, decode SJIS text, else decode BIG5 test.  */
3898
3899 static void
3900 decode_coding_sjis (coding)
3901      struct coding_system *coding;
3902 {
3903   const unsigned char *src = coding->source + coding->consumed;
3904   const unsigned char *src_end = coding->source + coding->src_bytes;
3905   const unsigned char *src_base;
3906   int *charbuf = coding->charbuf;
3907   int *charbuf_end = charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
3908   int consumed_chars = 0, consumed_chars_base;
3909   int multibytep = coding->src_multibyte;
3910   struct charset *charset_roman, *charset_kanji, *charset_kana;
3911   Lisp_Object attrs, charset_list, val;
3912   int char_offset = coding->produced_char;
3913   int last_offset = char_offset;
3914   int last_id = charset_ascii;
3915
3916   CODING_GET_INFO (coding, attrs, charset_list);
3917
3918   val = charset_list;
3919   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
3920   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
3921   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
3922
3923   while (1)
3924     {
3925       int c, c1;
3926       struct charset *charset;
3927
3928       src_base = src;
3929       consumed_chars_base = consumed_chars;
3930
3931       if (charbuf >= charbuf_end)
3932         break;
3933
3934       ONE_MORE_BYTE (c);
3935
3936       if (c < 0x80)
3937         charset = charset_roman;
3938       else
3939         {
3940           if (c >= 0xF0)
3941             goto invalid_code;
3942           if (c < 0xA0 || c >= 0xE0)
3943             {
3944               /* SJIS -> JISX0208 */
3945               ONE_MORE_BYTE (c1);
3946               if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
3947                 goto invalid_code;
3948               c = (c << 8) | c1;
3949               SJIS_TO_JIS (c);
3950               charset = charset_kanji;
3951             }
3952           else if (c > 0xA0)
3953             {
3954               /* SJIS -> JISX0201-Kana */
3955               c &= 0x7F;
3956               charset = charset_kana;
3957             }
3958           else
3959             goto invalid_code;
3960         }
3961       if (charset->id != charset_ascii
3962           && last_id != charset->id)
3963         {
3964           if (last_id != charset_ascii)
3965             ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
3966           last_id = charset->id;
3967           last_offset = char_offset;
3968         }
3969       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
3970       *charbuf++ = c;
3971       char_offset++;
3972       continue;
3973
3974     invalid_code:
3975       src = src_base;
3976       consumed_chars = consumed_chars_base;
3977       ONE_MORE_BYTE (c);
3978       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
3979       char_offset++;
3980       coding->errors++;
3981     }
3982
3983  no_more_source:
3984   if (last_id != charset_ascii)
3985     ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
3986   coding->consumed_char += consumed_chars_base;
3987   coding->consumed = src_base - coding->source;
3988   coding->charbuf_used = charbuf - coding->charbuf;
3989 }
3990
3991 static void
3992 decode_coding_big5 (coding)
3993      struct coding_system *coding;
3994 {
3995   const unsigned char *src = coding->source + coding->consumed;
3996   const unsigned char *src_end = coding->source + coding->src_bytes;
3997   const unsigned char *src_base;
3998   int *charbuf = coding->charbuf;
3999   int *charbuf_end = charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
4000   int consumed_chars = 0, consumed_chars_base;
4001   int multibytep = coding->src_multibyte;
4002   struct charset *charset_roman, *charset_big5;
4003   Lisp_Object attrs, charset_list, val;
4004   int char_offset = coding->produced_char;
4005   int last_offset = char_offset;
4006   int last_id = charset_ascii;
4007
4008   CODING_GET_INFO (coding, attrs, charset_list);
4009   val = charset_list;
4010   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4011   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4012
4013   while (1)
4014     {
4015       int c, c1;
4016       struct charset *charset;
4017
4018       src_base = src;
4019       consumed_chars_base = consumed_chars;
4020
4021       if (charbuf >= charbuf_end)
4022         break;
4023
4024       ONE_MORE_BYTE (c);
4025
4026       if (c < 0x80)
4027         charset = charset_roman;
4028       else
4029         {
4030           /* BIG5 -> Big5 */
4031           if (c < 0xA1 || c > 0xFE)
4032             goto invalid_code;
4033           ONE_MORE_BYTE (c1);
4034           if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4035             goto invalid_code;
4036           c = c << 8 | c1;
4037           charset = charset_big5;
4038         }
4039       if (charset->id != charset_ascii
4040           && last_id != charset->id)
4041         {
4042           if (last_id != charset_ascii)
4043             ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4044           last_id = charset->id;
4045           last_offset = char_offset;
4046         }
4047       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4048       *charbuf++ = c;
4049       char_offset++;
4050       continue;
4051
4052     invalid_code:
4053       src = src_base;
4054       consumed_chars = consumed_chars_base;
4055       ONE_MORE_BYTE (c);
4056       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
4057       char_offset++;
4058       coding->errors++;
4059     }
4060
4061  no_more_source:
4062   if (last_id != charset_ascii)
4063     ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4064   coding->consumed_char += consumed_chars_base;
4065   coding->consumed = src_base - coding->source;
4066   coding->charbuf_used = charbuf - coding->charbuf;
4067 }
4068
4069 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4070    This function can encode charsets `ascii', `katakana-jisx0201',
4071    `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'.  We
4072    are sure that all these charsets are registered as official charset
4073    (i.e. do not have extended leading-codes).  Characters of other
4074    charsets are produced without any encoding.  If SJIS_P is 1, encode
4075    SJIS text, else encode BIG5 text.  */
4076
4077 static int
4078 encode_coding_sjis (coding)
4079      struct coding_system *coding;
4080 {
4081   int multibytep = coding->dst_multibyte;
4082   int *charbuf = coding->charbuf;
4083   int *charbuf_end = charbuf + coding->charbuf_used;
4084   unsigned char *dst = coding->destination + coding->produced;
4085   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4086   int safe_room = 4;
4087   int produced_chars = 0;
4088   Lisp_Object attrs, charset_list, val;
4089   int ascii_compatible;
4090   struct charset *charset_roman, *charset_kanji, *charset_kana;
4091   int c;
4092
4093   CODING_GET_INFO (coding, attrs, charset_list);
4094   val = charset_list;
4095   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4096   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4097   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
4098
4099   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4100
4101   while (charbuf < charbuf_end)
4102     {
4103       ASSURE_DESTINATION (safe_room);
4104       c = *charbuf++;
4105       /* Now encode the character C.  */
4106       if (ASCII_CHAR_P (c) && ascii_compatible)
4107         EMIT_ONE_ASCII_BYTE (c);
4108       else if (CHAR_BYTE8_P (c))
4109         {
4110           c = CHAR_TO_BYTE8 (c);
4111           EMIT_ONE_BYTE (c);
4112         }
4113       else
4114         {
4115           unsigned code;
4116           struct charset *charset = char_charset (c, charset_list, &code);
4117
4118           if (!charset)
4119             {
4120               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4121                 {
4122                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4123                   charset = CHARSET_FROM_ID (charset_ascii);
4124                 }
4125               else
4126                 {
4127                   c = coding->default_char;
4128                   charset = char_charset (c, charset_list, &code);
4129                 }
4130             }
4131           if (code == CHARSET_INVALID_CODE (charset))
4132             abort ();
4133           if (charset == charset_kanji)
4134             {
4135               int c1, c2;
4136               JIS_TO_SJIS (code);
4137               c1 = code >> 8, c2 = code & 0xFF;
4138               EMIT_TWO_BYTES (c1, c2);
4139             }
4140           else if (charset == charset_kana)
4141             EMIT_ONE_BYTE (code | 0x80);
4142           else
4143             EMIT_ONE_ASCII_BYTE (code & 0x7F);
4144         }
4145     }
4146   coding->result = CODING_RESULT_SUCCESS;
4147   coding->produced_char += produced_chars;
4148   coding->produced = dst - coding->destination;
4149   return 0;
4150 }
4151
4152 static int
4153 encode_coding_big5 (coding)
4154      struct coding_system *coding;
4155 {
4156   int multibytep = coding->dst_multibyte;
4157   int *charbuf = coding->charbuf;
4158   int *charbuf_end = charbuf + coding->charbuf_used;
4159   unsigned char *dst = coding->destination + coding->produced;
4160   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4161   int safe_room = 4;
4162   int produced_chars = 0;
4163   Lisp_Object attrs, charset_list, val;
4164   int ascii_compatible;
4165   struct charset *charset_roman, *charset_big5;
4166   int c;
4167
4168   CODING_GET_INFO (coding, attrs, charset_list);
4169   val = charset_list;
4170   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4171   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4172   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4173
4174   while (charbuf < charbuf_end)
4175     {
4176       ASSURE_DESTINATION (safe_room);
4177       c = *charbuf++;
4178       /* Now encode the character C.  */
4179       if (ASCII_CHAR_P (c) && ascii_compatible)
4180         EMIT_ONE_ASCII_BYTE (c);
4181       else if (CHAR_BYTE8_P (c))
4182         {
4183           c = CHAR_TO_BYTE8 (c);
4184           EMIT_ONE_BYTE (c);
4185         }
4186       else
4187         {
4188           unsigned code;
4189           struct charset *charset = char_charset (c, charset_list, &code);
4190
4191           if (! charset)
4192             {
4193               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4194                 {
4195                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4196                   charset = CHARSET_FROM_ID (charset_ascii);
4197                 }
4198               else
4199                 {
4200                   c = coding->default_char;
4201                   charset = char_charset (c, charset_list, &code);
4202                 }
4203             }
4204           if (code == CHARSET_INVALID_CODE (charset))
4205             abort ();
4206           if (charset == charset_big5)
4207             {
4208               int c1, c2;
4209
4210               c1 = code >> 8, c2 = code & 0xFF;
4211               EMIT_TWO_BYTES (c1, c2);
4212             }
4213           else
4214             EMIT_ONE_ASCII_BYTE (code & 0x7F);
4215         }
4216     }
4217   coding->result = CODING_RESULT_SUCCESS;
4218   coding->produced_char += produced_chars;
4219   coding->produced = dst - coding->destination;
4220   return 0;
4221 }
4222
4223 \f
4224 /*** 10. CCL handlers ***/
4225
4226 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4227    Check if a text is encoded in a coding system of which
4228    encoder/decoder are written in CCL program.  If it is, return
4229    CATEGORY_MASK_CCL, else return 0.  */
4230
4231 static int
4232 detect_coding_ccl (coding, detect_info)
4233      struct coding_system *coding;
4234      struct coding_detection_info *detect_info;
4235 {
4236   const unsigned char *src = coding->source, *src_base = src;
4237   const unsigned char *src_end = coding->source + coding->src_bytes;
4238   int multibytep = coding->src_multibyte;
4239   int consumed_chars = 0;
4240   int found = 0;
4241   unsigned char *valids = CODING_CCL_VALIDS (coding);
4242   int head_ascii = coding->head_ascii;
4243   Lisp_Object attrs;
4244
4245   detect_info->checked |= CATEGORY_MASK_CCL;
4246
4247   coding = &coding_categories[coding_category_ccl];
4248   attrs = CODING_ID_ATTRS (coding->id);
4249   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
4250     src += head_ascii;
4251
4252   while (1)
4253     {
4254       int c;
4255       ONE_MORE_BYTE (c);
4256       if (! valids[c])
4257         break;
4258       if ((valids[c] > 1))
4259         found = CATEGORY_MASK_CCL;
4260     }
4261   detect_info->rejected |= CATEGORY_MASK_CCL;
4262   return 0;
4263
4264  no_more_source:
4265   detect_info->found |= found;
4266   return 1;
4267 }
4268
4269 static void
4270 decode_coding_ccl (coding)
4271      struct coding_system *coding;
4272 {
4273   const unsigned char *src = coding->source + coding->consumed;
4274   const unsigned char *src_end = coding->source + coding->src_bytes;
4275   int *charbuf = coding->charbuf;
4276   int *charbuf_end = charbuf + coding->charbuf_size;
4277   int consumed_chars = 0;
4278   int multibytep = coding->src_multibyte;
4279   struct ccl_program ccl;
4280   int source_charbuf[1024];
4281   int source_byteidx[1024];
4282   Lisp_Object attrs, charset_list;
4283
4284   CODING_GET_INFO (coding, attrs, charset_list);
4285   setup_ccl_program (&ccl, CODING_CCL_DECODER (coding));
4286
4287   while (src < src_end)
4288     {
4289       const unsigned char *p = src;
4290       int *source, *source_end;
4291       int i = 0;
4292
4293       if (multibytep)
4294         while (i < 1024 && p < src_end)
4295           {
4296             source_byteidx[i] = p - src;
4297             source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
4298           }
4299       else
4300         while (i < 1024 && p < src_end)
4301           source_charbuf[i++] = *p++;
4302
4303       if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
4304         ccl.last_block = 1;
4305
4306       source = source_charbuf;
4307       source_end = source + i;
4308       while (source < source_end)
4309         {
4310           ccl_driver (&ccl, source, charbuf,
4311                       source_end - source, charbuf_end - charbuf,
4312                       charset_list);
4313           source += ccl.consumed;
4314           charbuf += ccl.produced;
4315           if (ccl.status != CCL_STAT_SUSPEND_BY_DST)
4316             break;
4317         }
4318       if (source < source_end)
4319         src += source_byteidx[source - source_charbuf];
4320       else
4321         src = p;
4322       consumed_chars += source - source_charbuf;
4323
4324       if (ccl.status != CCL_STAT_SUSPEND_BY_SRC
4325           && ccl.status != CODING_RESULT_INSUFFICIENT_SRC)
4326         break;
4327     }
4328
4329   switch (ccl.status)
4330     {
4331     case CCL_STAT_SUSPEND_BY_SRC:
4332       coding->result = CODING_RESULT_INSUFFICIENT_SRC;
4333       break;
4334     case CCL_STAT_SUSPEND_BY_DST:
4335       break;
4336     case CCL_STAT_QUIT:
4337     case CCL_STAT_INVALID_CMD:
4338       coding->result = CODING_RESULT_INTERRUPT;
4339       break;
4340     default:
4341       coding->result = CODING_RESULT_SUCCESS;
4342       break;
4343     }
4344   coding->consumed_char += consumed_chars;
4345   coding->consumed = src - coding->source;
4346   coding->charbuf_used = charbuf - coding->charbuf;
4347 }
4348
4349 static int
4350 encode_coding_ccl (coding)
4351      struct coding_system *coding;
4352 {
4353   struct ccl_program ccl;
4354   int multibytep = coding->dst_multibyte;
4355   int *charbuf = coding->charbuf;
4356   int *charbuf_end = charbuf + coding->charbuf_used;
4357   unsigned char *dst = coding->destination + coding->produced;
4358   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4359   unsigned char *adjusted_dst_end = dst_end - 1;
4360   int destination_charbuf[1024];
4361   int i, produced_chars = 0;
4362   Lisp_Object attrs, charset_list;
4363
4364   CODING_GET_INFO (coding, attrs, charset_list);
4365   setup_ccl_program (&ccl, CODING_CCL_ENCODER (coding));
4366
4367   ccl.last_block = coding->mode & CODING_MODE_LAST_BLOCK;
4368   ccl.dst_multibyte = coding->dst_multibyte;
4369
4370   while (charbuf < charbuf_end && dst < adjusted_dst_end)
4371     {
4372       int dst_bytes = dst_end - dst;
4373       if (dst_bytes > 1024)
4374         dst_bytes = 1024;
4375
4376       ccl_driver (&ccl, charbuf, destination_charbuf,
4377                   charbuf_end - charbuf, dst_bytes, charset_list);
4378       charbuf += ccl.consumed;
4379       if (multibytep)
4380         for (i = 0; i < ccl.produced; i++)
4381           EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
4382       else
4383         {
4384           for (i = 0; i < ccl.produced; i++)
4385             *dst++ = destination_charbuf[i] & 0xFF;
4386           produced_chars += ccl.produced;
4387         }
4388     }
4389
4390   switch (ccl.status)
4391     {
4392     case CCL_STAT_SUSPEND_BY_SRC:
4393       coding->result = CODING_RESULT_INSUFFICIENT_SRC;
4394       break;
4395     case CCL_STAT_SUSPEND_BY_DST:
4396       coding->result = CODING_RESULT_INSUFFICIENT_DST;
4397       break;
4398     case CCL_STAT_QUIT:
4399     case CCL_STAT_INVALID_CMD:
4400       coding->result = CODING_RESULT_INTERRUPT;
4401       break;
4402     default:
4403       coding->result = CODING_RESULT_SUCCESS;
4404       break;
4405     }
4406
4407   coding->produced_char += produced_chars;
4408   coding->produced = dst - coding->destination;
4409   return 0;
4410 }
4411
4412
4413 \f
4414 /*** 10, 11. no-conversion handlers ***/
4415
4416 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
4417
4418 static void
4419 decode_coding_raw_text (coding)
4420      struct coding_system *coding;
4421 {
4422   coding->chars_at_source = 1;
4423   coding->consumed_char = 0;
4424   coding->consumed = 0;
4425   coding->result = CODING_RESULT_SUCCESS;
4426 }
4427
4428 static int
4429 encode_coding_raw_text (coding)
4430      struct coding_system *coding;
4431 {
4432   int multibytep = coding->dst_multibyte;
4433   int *charbuf = coding->charbuf;
4434   int *charbuf_end = coding->charbuf + coding->charbuf_used;
4435   unsigned char *dst = coding->destination + coding->produced;
4436   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4437   int produced_chars = 0;
4438   int c;
4439
4440   if (multibytep)
4441     {
4442       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
4443
4444       if (coding->src_multibyte)
4445         while (charbuf < charbuf_end)
4446           {
4447             ASSURE_DESTINATION (safe_room);
4448             c = *charbuf++;
4449             if (ASCII_CHAR_P (c))
4450               EMIT_ONE_ASCII_BYTE (c);
4451             else if (CHAR_BYTE8_P (c))
4452               {
4453                 c = CHAR_TO_BYTE8 (c);
4454                 EMIT_ONE_BYTE (c);
4455               }
4456             else
4457               {
4458                 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
4459
4460                 CHAR_STRING_ADVANCE (c, p1);
4461                 while (p0 < p1)
4462                   {
4463                     EMIT_ONE_BYTE (*p0);
4464                     p0++;
4465                   }
4466               }
4467           }
4468       else
4469         while (charbuf < charbuf_end)
4470           {
4471             ASSURE_DESTINATION (safe_room);
4472             c = *charbuf++;
4473             EMIT_ONE_BYTE (c);
4474           }
4475     }
4476   else
4477     {
4478       if (coding->src_multibyte)
4479         {
4480           int safe_room = MAX_MULTIBYTE_LENGTH;
4481
4482           while (charbuf < charbuf_end)
4483             {
4484               ASSURE_DESTINATION (safe_room);
4485               c = *charbuf++;
4486               if (ASCII_CHAR_P (c))
4487                 *dst++ = c;
4488               else if (CHAR_BYTE8_P (c))
4489                 *dst++ = CHAR_TO_BYTE8 (c);
4490               else
4491                 CHAR_STRING_ADVANCE (c, dst);
4492               produced_chars++;
4493             }
4494         }
4495       else
4496         {
4497           ASSURE_DESTINATION (charbuf_end - charbuf);
4498           while (charbuf < charbuf_end && dst < dst_end)
4499             *dst++ = *charbuf++;
4500           produced_chars = dst - (coding->destination + coding->dst_bytes);
4501         }
4502     }
4503   coding->result = CODING_RESULT_SUCCESS;
4504   coding->produced_char += produced_chars;
4505   coding->produced = dst - coding->destination;
4506   return 0;
4507 }
4508
4509 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4510    Check if a text is encoded in a charset-based coding system.  If it
4511    is, return 1, else return 0.  */
4512
4513 static int
4514 detect_coding_charset (coding, detect_info)
4515      struct coding_system *coding;
4516      struct coding_detection_info *detect_info;
4517 {
4518   const unsigned char *src = coding->source, *src_base = src;
4519   const unsigned char *src_end = coding->source + coding->src_bytes;
4520   int multibytep = coding->src_multibyte;
4521   int consumed_chars = 0;
4522   Lisp_Object attrs, valids;
4523   int found = 0;
4524
4525   detect_info->checked |= CATEGORY_MASK_CHARSET;
4526
4527   coding = &coding_categories[coding_category_charset];
4528   attrs = CODING_ID_ATTRS (coding->id);
4529   valids = AREF (attrs, coding_attr_charset_valids);
4530
4531   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
4532     src += coding->head_ascii;
4533
4534   while (1)
4535     {
4536       int c;
4537
4538       ONE_MORE_BYTE (c);
4539       if (NILP (AREF (valids, c)))
4540         break;
4541       if (c >= 0x80)
4542         found = CATEGORY_MASK_CHARSET;
4543     }
4544   detect_info->rejected |= CATEGORY_MASK_CHARSET;
4545   return 0;
4546
4547  no_more_source:
4548   detect_info->found |= found;
4549   return 1;
4550 }
4551
4552 static void
4553 decode_coding_charset (coding)
4554      struct coding_system *coding;
4555 {
4556   const unsigned char *src = coding->source + coding->consumed;
4557   const unsigned char *src_end = coding->source + coding->src_bytes;
4558   const unsigned char *src_base;
4559   int *charbuf = coding->charbuf;
4560   int *charbuf_end = charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
4561   int consumed_chars = 0, consumed_chars_base;
4562   int multibytep = coding->src_multibyte;
4563   Lisp_Object attrs, charset_list, valids;
4564   int char_offset = coding->produced_char;
4565   int last_offset = char_offset;
4566   int last_id = charset_ascii;
4567
4568   CODING_GET_INFO (coding, attrs, charset_list);
4569   valids = AREF (attrs, coding_attr_charset_valids);
4570
4571   while (1)
4572     {
4573       int c;
4574       Lisp_Object val;
4575       struct charset *charset;
4576       int dim;
4577       int len = 1;
4578       unsigned code;
4579
4580       src_base = src;
4581       consumed_chars_base = consumed_chars;
4582
4583       if (charbuf >= charbuf_end)
4584         break;
4585
4586       ONE_MORE_BYTE (c);
4587       code = c;
4588
4589       val = AREF (valids, c);
4590       if (NILP (val))
4591         goto invalid_code;
4592       if (INTEGERP (val))
4593         {
4594           charset = CHARSET_FROM_ID (XFASTINT (val));
4595           dim = CHARSET_DIMENSION (charset);
4596           while (len < dim)
4597             {
4598               ONE_MORE_BYTE (c);
4599               code = (code << 8) | c;
4600               len++;
4601             }
4602           CODING_DECODE_CHAR (coding, src, src_base, src_end,
4603                               charset, code, c);
4604         }
4605       else
4606         {
4607           /* VAL is a list of charset IDs.  It is assured that the
4608              list is sorted by charset dimensions (smaller one
4609              comes first).  */
4610           while (CONSP (val))
4611             {
4612               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
4613               dim = CHARSET_DIMENSION (charset);
4614               while (len < dim)
4615                 {
4616                   ONE_MORE_BYTE (c);
4617                   code = (code << 8) | c;
4618                   len++;
4619                 }
4620               CODING_DECODE_CHAR (coding, src, src_base,
4621                                   src_end, charset, code, c);
4622               if (c >= 0)
4623                 break;
4624               val = XCDR (val);
4625             }
4626         }
4627       if (c < 0)
4628         goto invalid_code;
4629       if (charset->id != charset_ascii
4630           && last_id != charset->id)
4631         {
4632           if (last_id != charset_ascii)
4633             ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4634           last_id = charset->id;
4635           last_offset = char_offset;
4636         }
4637
4638       *charbuf++ = c;
4639       char_offset++;
4640       continue;
4641
4642     invalid_code:
4643       src = src_base;
4644       consumed_chars = consumed_chars_base;
4645       ONE_MORE_BYTE (c);
4646       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
4647       char_offset++;
4648       coding->errors++;
4649     }
4650
4651  no_more_source:
4652   if (last_id != charset_ascii)
4653     ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4654   coding->consumed_char += consumed_chars_base;
4655   coding->consumed = src_base - coding->source;
4656   coding->charbuf_used = charbuf - coding->charbuf;
4657 }
4658
4659 static int
4660 encode_coding_charset (coding)
4661      struct coding_system *coding;
4662 {
4663   int multibytep = coding->dst_multibyte;
4664   int *charbuf = coding->charbuf;
4665   int *charbuf_end = charbuf + coding->charbuf_used;
4666   unsigned char *dst = coding->destination + coding->produced;
4667   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4668   int safe_room = MAX_MULTIBYTE_LENGTH;
4669   int produced_chars = 0;
4670   Lisp_Object attrs, charset_list;
4671   int ascii_compatible;
4672   int c;
4673
4674   CODING_GET_INFO (coding, attrs, charset_list);
4675   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4676
4677   while (charbuf < charbuf_end)
4678     {
4679       struct charset *charset;
4680       unsigned code;
4681
4682       ASSURE_DESTINATION (safe_room);
4683       c = *charbuf++;
4684       if (ascii_compatible && ASCII_CHAR_P (c))
4685         EMIT_ONE_ASCII_BYTE (c);
4686       else if (CHAR_BYTE8_P (c))
4687         {
4688           c = CHAR_TO_BYTE8 (c);
4689           EMIT_ONE_BYTE (c);
4690         }
4691       else
4692         {
4693           charset = char_charset (c, charset_list, &code);
4694           if (charset)
4695             {
4696               if (CHARSET_DIMENSION (charset) == 1)
4697                 EMIT_ONE_BYTE (code);
4698               else if (CHARSET_DIMENSION (charset) == 2)
4699                 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
4700               else if (CHARSET_DIMENSION (charset) == 3)
4701                 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
4702               else
4703                 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
4704                                  (code >> 8) & 0xFF, code & 0xFF);
4705             }
4706           else
4707             {
4708               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4709                 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4710               else
4711                 c = coding->default_char;
4712               EMIT_ONE_BYTE (c);
4713             }
4714         }
4715     }
4716
4717   coding->result = CODING_RESULT_SUCCESS;
4718   coding->produced_char += produced_chars;
4719   coding->produced = dst - coding->destination;
4720   return 0;
4721 }
4722
4723 \f
4724 /*** 7. C library functions ***/
4725
4726 /* Setup coding context CODING from information about CODING_SYSTEM.
4727    If CODING_SYSTEM is nil, `no-conversion' is assumed.  If
4728    CODING_SYSTEM is invalid, signal an error.  */
4729
4730 void
4731 setup_coding_system (coding_system, coding)
4732      Lisp_Object coding_system;
4733      struct coding_system *coding;
4734 {
4735   Lisp_Object attrs;
4736   Lisp_Object eol_type;
4737   Lisp_Object coding_type;
4738   Lisp_Object val;
4739
4740   if (NILP (coding_system))
4741     coding_system = Qno_conversion;
4742
4743   CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
4744
4745   attrs = CODING_ID_ATTRS (coding->id);
4746   eol_type = CODING_ID_EOL_TYPE (coding->id);
4747
4748   coding->mode = 0;
4749   coding->head_ascii = -1;
4750   coding->common_flags
4751     = (VECTORP (eol_type) ? CODING_REQUIRE_DETECTION_MASK : 0);
4752   if (! NILP (CODING_ATTR_POST_READ (attrs)))
4753     coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
4754   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
4755     coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
4756   if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
4757     coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
4758
4759   val = CODING_ATTR_SAFE_CHARSETS (attrs);
4760   coding->max_charset_id = SCHARS (val) - 1;
4761   coding->safe_charsets = (char *) SDATA (val);
4762   coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
4763
4764   coding_type = CODING_ATTR_TYPE (attrs);
4765   if (EQ (coding_type, Qundecided))
4766     {
4767       coding->detector = NULL;
4768       coding->decoder = decode_coding_raw_text;
4769       coding->encoder = encode_coding_raw_text;
4770       coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
4771     }
4772   else if (EQ (coding_type, Qiso_2022))
4773     {
4774       int i;
4775       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
4776
4777       /* Invoke graphic register 0 to plane 0.  */
4778       CODING_ISO_INVOCATION (coding, 0) = 0;
4779       /* Invoke graphic register 1 to plane 1 if we can use 8-bit.  */
4780       CODING_ISO_INVOCATION (coding, 1)
4781         = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
4782       /* Setup the initial status of designation.  */
4783       for (i = 0; i < 4; i++)
4784         CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
4785       /* Not single shifting initially.  */
4786       CODING_ISO_SINGLE_SHIFTING (coding) = 0;
4787       /* Beginning of buffer should also be regarded as bol. */
4788       CODING_ISO_BOL (coding) = 1;
4789       coding->detector = detect_coding_iso_2022;
4790       coding->decoder = decode_coding_iso_2022;
4791       coding->encoder = encode_coding_iso_2022;
4792       if (flags & CODING_ISO_FLAG_SAFE)
4793         coding->mode |= CODING_MODE_SAFE_ENCODING;
4794       coding->common_flags
4795         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
4796             | CODING_REQUIRE_FLUSHING_MASK);
4797       if (flags & CODING_ISO_FLAG_COMPOSITION)
4798         coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
4799       if (flags & CODING_ISO_FLAG_DESIGNATION)
4800         coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
4801       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
4802         {
4803           setup_iso_safe_charsets (attrs);
4804           val = CODING_ATTR_SAFE_CHARSETS (attrs);
4805           coding->max_charset_id = SCHARS (val) - 1;
4806           coding->safe_charsets = (char *) SDATA (val);
4807         }
4808       CODING_ISO_FLAGS (coding) = flags;
4809     }
4810   else if (EQ (coding_type, Qcharset))
4811     {
4812       coding->detector = detect_coding_charset;
4813       coding->decoder = decode_coding_charset;
4814       coding->encoder = encode_coding_charset;
4815       coding->common_flags
4816         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4817     }
4818   else if (EQ (coding_type, Qutf_8))
4819     {
4820       coding->detector = detect_coding_utf_8;
4821       coding->decoder = decode_coding_utf_8;
4822       coding->encoder = encode_coding_utf_8;
4823       coding->common_flags
4824         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4825     }
4826   else if (EQ (coding_type, Qutf_16))
4827     {
4828       val = AREF (attrs, coding_attr_utf_16_bom);
4829       CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_16_detect_bom
4830                                     : EQ (val, Qt) ? utf_16_with_bom
4831                                     : utf_16_without_bom);
4832       val = AREF (attrs, coding_attr_utf_16_endian);
4833       CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
4834                                        : utf_16_little_endian);
4835       CODING_UTF_16_SURROGATE (coding) = 0;
4836       coding->detector = detect_coding_utf_16;
4837       coding->decoder = decode_coding_utf_16;
4838       coding->encoder = encode_coding_utf_16;
4839       coding->common_flags
4840         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4841       if (CODING_UTF_16_BOM (coding) == utf_16_detect_bom)
4842         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
4843     }
4844   else if (EQ (coding_type, Qccl))
4845     {
4846       coding->detector = detect_coding_ccl;
4847       coding->decoder = decode_coding_ccl;
4848       coding->encoder = encode_coding_ccl;
4849       coding->common_flags
4850         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
4851             | CODING_REQUIRE_FLUSHING_MASK);
4852     }
4853   else if (EQ (coding_type, Qemacs_mule))
4854     {
4855       coding->detector = detect_coding_emacs_mule;
4856       coding->decoder = decode_coding_emacs_mule;
4857       coding->encoder = encode_coding_emacs_mule;
4858       coding->common_flags
4859         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4860       if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
4861           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
4862         {
4863           Lisp_Object tail, safe_charsets;
4864           int max_charset_id = 0;
4865
4866           for (tail = Vemacs_mule_charset_list; CONSP (tail);
4867                tail = XCDR (tail))
4868             if (max_charset_id < XFASTINT (XCAR (tail)))
4869               max_charset_id = XFASTINT (XCAR (tail));
4870           safe_charsets = Fmake_string (make_number (max_charset_id + 1),
4871                                         make_number (255));
4872           for (tail = Vemacs_mule_charset_list; CONSP (tail);
4873                tail = XCDR (tail))
4874             SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
4875           coding->max_charset_id = max_charset_id;
4876           coding->safe_charsets = (char *) SDATA (safe_charsets);
4877         }
4878     }
4879   else if (EQ (coding_type, Qshift_jis))
4880     {
4881       coding->detector = detect_coding_sjis;
4882       coding->decoder = decode_coding_sjis;
4883       coding->encoder = encode_coding_sjis;
4884       coding->common_flags
4885         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4886     }
4887   else if (EQ (coding_type, Qbig5))
4888     {
4889       coding->detector = detect_coding_big5;
4890       coding->decoder = decode_coding_big5;
4891       coding->encoder = encode_coding_big5;
4892       coding->common_flags
4893         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4894     }
4895   else                          /* EQ (coding_type, Qraw_text) */
4896     {
4897       coding->detector = NULL;
4898       coding->decoder = decode_coding_raw_text;
4899       coding->encoder = encode_coding_raw_text;
4900     }
4901
4902   return;
4903 }
4904
4905 /* Return raw-text or one of its subsidiaries that has the same
4906    eol_type as CODING-SYSTEM.  */
4907
4908 Lisp_Object
4909 raw_text_coding_system (coding_system)
4910      Lisp_Object coding_system;
4911 {
4912   Lisp_Object spec, attrs;
4913   Lisp_Object eol_type, raw_text_eol_type;
4914
4915   if (NILP (coding_system))
4916     return Qraw_text;
4917   spec = CODING_SYSTEM_SPEC (coding_system);
4918   attrs = AREF (spec, 0);
4919
4920   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
4921     return coding_system;
4922
4923   eol_type = AREF (spec, 2);
4924   if (VECTORP (eol_type))
4925     return Qraw_text;
4926   spec = CODING_SYSTEM_SPEC (Qraw_text);
4927   raw_text_eol_type = AREF (spec, 2);
4928   return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
4929           : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
4930           : AREF (raw_text_eol_type, 2));
4931 }
4932
4933
4934 /* If CODING_SYSTEM doesn't specify end-of-line format but PARENT
4935    does, return one of the subsidiary that has the same eol-spec as
4936    PARENT.  Otherwise, return CODING_SYSTEM.  */
4937
4938 Lisp_Object
4939 coding_inherit_eol_type (coding_system, parent)
4940      Lisp_Object coding_system, parent;
4941 {
4942   Lisp_Object spec, eol_type;
4943
4944   if (NILP (coding_system))
4945     coding_system = Qraw_text;
4946   spec = CODING_SYSTEM_SPEC (coding_system);
4947   eol_type = AREF (spec, 2);
4948   if (VECTORP (eol_type)
4949       && ! NILP (parent))
4950     {
4951       Lisp_Object parent_spec;
4952       Lisp_Object parent_eol_type;
4953
4954       parent_spec
4955         = CODING_SYSTEM_SPEC (buffer_defaults.buffer_file_coding_system);
4956       parent_eol_type = AREF (parent_spec, 2);
4957       if (EQ (parent_eol_type, Qunix))
4958         coding_system = AREF (eol_type, 0);
4959       else if (EQ (parent_eol_type, Qdos))
4960         coding_system = AREF (eol_type, 1);
4961       else if (EQ (parent_eol_type, Qmac))
4962         coding_system = AREF (eol_type, 2);
4963     }
4964   return coding_system;
4965 }
4966
4967 /* Emacs has a mechanism to automatically detect a coding system if it
4968    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
4969    it's impossible to distinguish some coding systems accurately
4970    because they use the same range of codes.  So, at first, coding
4971    systems are categorized into 7, those are:
4972
4973    o coding-category-emacs-mule
4974
4975         The category for a coding system which has the same code range
4976         as Emacs' internal format.  Assigned the coding-system (Lisp
4977         symbol) `emacs-mule' by default.
4978
4979    o coding-category-sjis
4980
4981         The category for a coding system which has the same code range
4982         as SJIS.  Assigned the coding-system (Lisp
4983         symbol) `japanese-shift-jis' by default.
4984
4985    o coding-category-iso-7
4986
4987         The category for a coding system which has the same code range
4988         as ISO2022 of 7-bit environment.  This doesn't use any locking
4989         shift and single shift functions.  This can encode/decode all
4990         charsets.  Assigned the coding-system (Lisp symbol)
4991         `iso-2022-7bit' by default.
4992
4993    o coding-category-iso-7-tight
4994
4995         Same as coding-category-iso-7 except that this can
4996         encode/decode only the specified charsets.
4997
4998    o coding-category-iso-8-1
4999
5000         The category for a coding system which has the same code range
5001         as ISO2022 of 8-bit environment and graphic plane 1 used only
5002         for DIMENSION1 charset.  This doesn't use any locking shift
5003         and single shift functions.  Assigned the coding-system (Lisp
5004         symbol) `iso-latin-1' by default.
5005
5006    o coding-category-iso-8-2
5007
5008         The category for a coding system which has the same code range
5009         as ISO2022 of 8-bit environment and graphic plane 1 used only
5010         for DIMENSION2 charset.  This doesn't use any locking shift
5011         and single shift functions.  Assigned the coding-system (Lisp
5012         symbol) `japanese-iso-8bit' by default.
5013
5014    o coding-category-iso-7-else
5015
5016         The category for a coding system which has the same code range
5017         as ISO2022 of 7-bit environemnt but uses locking shift or
5018         single shift functions.  Assigned the coding-system (Lisp
5019         symbol) `iso-2022-7bit-lock' by default.
5020
5021    o coding-category-iso-8-else
5022
5023         The category for a coding system which has the same code range
5024         as ISO2022 of 8-bit environemnt but uses locking shift or
5025         single shift functions.  Assigned the coding-system (Lisp
5026         symbol) `iso-2022-8bit-ss2' by default.
5027
5028    o coding-category-big5
5029
5030         The category for a coding system which has the same code range
5031         as BIG5.  Assigned the coding-system (Lisp symbol)
5032         `cn-big5' by default.
5033
5034    o coding-category-utf-8
5035
5036         The category for a coding system which has the same code range
5037         as UTF-8 (cf. RFC2279).  Assigned the coding-system (Lisp
5038         symbol) `utf-8' by default.
5039
5040    o coding-category-utf-16-be
5041
5042         The category for a coding system in which a text has an
5043         Unicode signature (cf. Unicode Standard) in the order of BIG
5044         endian at the head.  Assigned the coding-system (Lisp symbol)
5045         `utf-16-be' by default.
5046
5047    o coding-category-utf-16-le
5048
5049         The category for a coding system in which a text has an
5050         Unicode signature (cf. Unicode Standard) in the order of
5051         LITTLE endian at the head.  Assigned the coding-system (Lisp
5052         symbol) `utf-16-le' by default.
5053
5054    o coding-category-ccl
5055
5056         The category for a coding system of which encoder/decoder is
5057         written in CCL programs.  The default value is nil, i.e., no
5058         coding system is assigned.
5059
5060    o coding-category-binary
5061
5062         The category for a coding system not categorized in any of the
5063         above.  Assigned the coding-system (Lisp symbol)
5064         `no-conversion' by default.
5065
5066    Each of them is a Lisp symbol and the value is an actual
5067    `coding-system's (this is also a Lisp symbol) assigned by a user.
5068    What Emacs does actually is to detect a category of coding system.
5069    Then, it uses a `coding-system' assigned to it.  If Emacs can't
5070    decide only one possible category, it selects a category of the
5071    highest priority.  Priorities of categories are also specified by a
5072    user in a Lisp variable `coding-category-list'.
5073
5074 */
5075
5076 #define EOL_SEEN_NONE   0
5077 #define EOL_SEEN_LF     1
5078 #define EOL_SEEN_CR     2
5079 #define EOL_SEEN_CRLF   4
5080
5081 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
5082    SOURCE is encoded.  If CATEGORY is one of
5083    coding_category_utf_16_XXXX, assume that CR and LF are encoded by
5084    two-byte, else they are encoded by one-byte.
5085
5086    Return one of EOL_SEEN_XXX.  */
5087
5088 #define MAX_EOL_CHECK_COUNT 3
5089
5090 static int
5091 detect_eol (source, src_bytes, category)
5092      unsigned char *source;
5093      EMACS_INT src_bytes;
5094      enum coding_category category;
5095 {
5096   unsigned char *src = source, *src_end = src + src_bytes;
5097   unsigned char c;
5098   int total  = 0;
5099   int eol_seen = EOL_SEEN_NONE;
5100
5101   if ((1 << category) & CATEGORY_MASK_UTF_16)
5102     {
5103       int msb, lsb;
5104
5105       msb = category == (coding_category_utf_16_le
5106                          | coding_category_utf_16_le_nosig);
5107       lsb = 1 - msb;
5108
5109       while (src + 1 < src_end)
5110         {
5111           c = src[lsb];
5112           if (src[msb] == 0 && (c == '\n' || c == '\r'))
5113             {
5114               int this_eol;
5115
5116               if (c == '\n')
5117                 this_eol = EOL_SEEN_LF;
5118               else if (src + 3 >= src_end
5119                        || src[msb + 2] != 0
5120                        || src[lsb + 2] != '\n')
5121                 this_eol = EOL_SEEN_CR;
5122               else
5123                 this_eol = EOL_SEEN_CRLF;
5124
5125               if (eol_seen == EOL_SEEN_NONE)
5126                 /* This is the first end-of-line.  */
5127                 eol_seen = this_eol;
5128               else if (eol_seen != this_eol)
5129                 {
5130                   /* The found type is different from what found before.  */
5131                   eol_seen = EOL_SEEN_LF;
5132                   break;
5133                 }
5134               if (++total == MAX_EOL_CHECK_COUNT)
5135                 break;
5136             }
5137           src += 2;
5138         }
5139     }
5140   else
5141     {
5142       while (src < src_end)
5143         {
5144           c = *src++;
5145           if (c == '\n' || c == '\r')
5146             {
5147               int this_eol;
5148
5149               if (c == '\n')
5150                 this_eol = EOL_SEEN_LF;
5151               else if (src >= src_end || *src != '\n')
5152                 this_eol = EOL_SEEN_CR;
5153               else
5154                 this_eol = EOL_SEEN_CRLF, src++;
5155
5156               if (eol_seen == EOL_SEEN_NONE)
5157                 /* This is the first end-of-line.  */
5158                 eol_seen = this_eol;
5159               else if (eol_seen != this_eol)
5160                 {
5161                   /* The found type is different from what found before.  */
5162                   eol_seen = EOL_SEEN_LF;
5163                   break;
5164                 }
5165               if (++total == MAX_EOL_CHECK_COUNT)
5166                 break;
5167             }
5168         }
5169     }
5170   return eol_seen;
5171 }
5172
5173
5174 static Lisp_Object
5175 adjust_coding_eol_type (coding, eol_seen)
5176      struct coding_system *coding;
5177      int eol_seen;
5178 {
5179   Lisp_Object eol_type;
5180
5181   eol_type = CODING_ID_EOL_TYPE (coding->id);
5182   if (eol_seen & EOL_SEEN_LF)
5183     {
5184       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
5185       eol_type = Qunix;
5186     }
5187   else if (eol_seen & EOL_SEEN_CRLF)
5188     {
5189       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
5190       eol_type = Qdos;
5191     }
5192   else if (eol_seen & EOL_SEEN_CR)
5193     {
5194       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
5195       eol_type = Qmac;
5196     }
5197   return eol_type;
5198 }
5199
5200 /* Detect how a text specified in CODING is encoded.  If a coding
5201    system is detected, update fields of CODING by the detected coding
5202    system.  */
5203
5204 void
5205 detect_coding (coding)
5206      struct coding_system *coding;
5207 {
5208   const unsigned char *src, *src_end;
5209   Lisp_Object attrs, coding_type;
5210
5211   coding->consumed = coding->consumed_char = 0;
5212   coding->produced = coding->produced_char = 0;
5213   coding_set_source (coding);
5214
5215   src_end = coding->source + coding->src_bytes;
5216
5217   /* If we have not yet decided the text encoding type, detect it
5218      now.  */
5219   if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
5220     {
5221       int c, i;
5222
5223       for (i = 0, src = coding->source; src < src_end; i++, src++)
5224         {
5225           c = *src;
5226           if (c & 0x80 || (c < 0x20 && (c == 0
5227                                         || c == ISO_CODE_ESC
5228                                         || c == ISO_CODE_SI
5229                                         || c == ISO_CODE_SO)))
5230             break;
5231         }
5232       /* Skipped bytes must be even for utf-16 detector.  */
5233       if (i % 2)
5234         src--;
5235       coding->head_ascii = src - (coding->source + coding->consumed);
5236
5237       if (coding->head_ascii < coding->src_bytes)
5238         {
5239           struct coding_detection_info detect_info;
5240           enum coding_category category;
5241           struct coding_system *this;
5242
5243           detect_info.checked = detect_info.found = detect_info.rejected = 0;
5244           for (i = 0; i < coding_category_raw_text; i++)
5245             {
5246               category = coding_priorities[i];
5247               this = coding_categories + category;
5248               if (this->id < 0)
5249                 {
5250                   /* No coding system of this category is defined.  */
5251                   detect_info.rejected |= (1 << category);
5252                 }
5253               else if (category >= coding_category_raw_text)
5254                 continue;
5255               else if (detect_info.checked & (1 << category))
5256                 {
5257                   if (detect_info.found & (1 << category))
5258                     break;
5259                 }
5260               else if ((*(this->detector)) (coding, &detect_info)
5261                        && detect_info.found & (1 << category))
5262                 {
5263                   if (category == coding_category_utf_16_auto)
5264                     {
5265                       if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
5266                         category = coding_category_utf_16_le;
5267                       else
5268                         category = coding_category_utf_16_be;
5269                     }
5270                   break;
5271                 }
5272             }
5273           if (i < coding_category_raw_text)
5274             setup_coding_system (CODING_ID_NAME (this->id), coding);
5275           else if (detect_info.rejected == CATEGORY_MASK_ANY)
5276             setup_coding_system (Qraw_text, coding);
5277           else if (detect_info.rejected)
5278             for (i = 0; i < coding_category_raw_text; i++)
5279               if (! (detect_info.rejected & (1 << coding_priorities[i])))
5280                 {
5281                   this = coding_categories + coding_priorities[i];
5282                   setup_coding_system (CODING_ID_NAME (this->id), coding);
5283                   break;
5284                 }
5285         }
5286     }
5287   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
5288            == coding_category_utf_16_auto)
5289     {
5290       Lisp_Object coding_systems;
5291       struct coding_detection_info detect_info;
5292
5293       coding_systems
5294         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_16_bom);
5295       detect_info.found = detect_info.rejected = 0;
5296       if (CONSP (coding_systems)
5297           && detect_coding_utf_16 (coding, &detect_info))
5298         {
5299           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
5300             setup_coding_system (XCAR (coding_systems), coding);
5301           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
5302             setup_coding_system (XCDR (coding_systems), coding);
5303         }
5304     }
5305 }
5306
5307
5308 static void
5309 decode_eol (coding)
5310      struct coding_system *coding;
5311 {
5312   Lisp_Object eol_type;
5313   unsigned char *p, *pbeg, *pend;
5314
5315   eol_type = CODING_ID_EOL_TYPE (coding->id);
5316   if (EQ (eol_type, Qunix))
5317     return;
5318
5319   if (NILP (coding->dst_object))
5320     pbeg = coding->destination;
5321   else
5322     pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
5323   pend = pbeg + coding->produced;
5324
5325   if (VECTORP (eol_type))
5326     {
5327       int eol_seen = EOL_SEEN_NONE;
5328
5329       for (p = pbeg; p < pend; p++)
5330         {
5331           if (*p == '\n')
5332             eol_seen |= EOL_SEEN_LF;
5333           else if (*p == '\r')
5334             {
5335               if (p + 1 < pend && *(p + 1) == '\n')
5336                 {
5337                   eol_seen |= EOL_SEEN_CRLF;
5338                   p++;
5339                 }
5340               else
5341                 eol_seen |= EOL_SEEN_CR;
5342             }
5343         }
5344       if (eol_seen != EOL_SEEN_NONE
5345           && eol_seen != EOL_SEEN_LF
5346           && eol_seen != EOL_SEEN_CRLF
5347           && eol_seen != EOL_SEEN_CR)
5348         eol_seen = EOL_SEEN_LF;
5349       if (eol_seen != EOL_SEEN_NONE)
5350         eol_type = adjust_coding_eol_type (coding, eol_seen);
5351     }
5352
5353   if (EQ (eol_type, Qmac))
5354     {
5355       for (p = pbeg; p < pend; p++)
5356         if (*p == '\r')
5357           *p = '\n';
5358     }
5359   else if (EQ (eol_type, Qdos))
5360     {
5361       int n = 0;
5362
5363       if (NILP (coding->dst_object))
5364         {
5365           for (p = pend - 2; p >= pbeg; p--)
5366             if (*p == '\r')
5367               {
5368                 safe_bcopy ((char *) (p + 1), (char *) p, pend-- - p - 1);
5369                 n++;
5370               }
5371         }
5372       else
5373         {
5374           for (p = pend - 2; p >= pbeg; p--)
5375             if (*p == '\r')
5376               {
5377                 int pos_byte = coding->dst_pos_byte + (p - pbeg);
5378                 int pos = BYTE_TO_CHAR (pos_byte);
5379
5380                 del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
5381                 n++;
5382               }
5383         }
5384       coding->produced -= n;
5385       coding->produced_char -= n;
5386     }
5387 }
5388
5389 static void
5390 translate_chars (coding, table)
5391      struct coding_system *coding;
5392      Lisp_Object table;
5393 {
5394   int *charbuf = coding->charbuf;
5395   int *charbuf_end = charbuf + coding->charbuf_used;
5396   int c;
5397
5398   if (coding->chars_at_source)
5399     return;
5400
5401   while (charbuf < charbuf_end)
5402     {
5403       c = *charbuf;
5404       if (c < 0)
5405         charbuf += c;
5406       else
5407         *charbuf++ = translate_char (table, c);
5408     }
5409 }
5410
5411 static int
5412 produce_chars (coding)
5413      struct coding_system *coding;
5414 {
5415   unsigned char *dst = coding->destination + coding->produced;
5416   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5417   int produced;
5418   int produced_chars = 0;
5419
5420   if (! coding->chars_at_source)
5421     {
5422       /* Characters are in coding->charbuf.  */
5423       int *buf = coding->charbuf;
5424       int *buf_end = buf + coding->charbuf_used;
5425       unsigned char *adjusted_dst_end;
5426
5427       if (BUFFERP (coding->src_object)
5428           && EQ (coding->src_object, coding->dst_object))
5429         dst_end = ((unsigned char *) coding->source) + coding->consumed;
5430       adjusted_dst_end = dst_end - MAX_MULTIBYTE_LENGTH;
5431
5432       while (buf < buf_end)
5433         {
5434           int c = *buf++;
5435
5436           if (dst >= adjusted_dst_end)
5437             {
5438               dst = alloc_destination (coding,
5439                                        buf_end - buf + MAX_MULTIBYTE_LENGTH,
5440                                        dst);
5441               dst_end = coding->destination + coding->dst_bytes;
5442               adjusted_dst_end = dst_end - MAX_MULTIBYTE_LENGTH;
5443             }
5444           if (c >= 0)
5445             {
5446               if (coding->dst_multibyte
5447                   || ! CHAR_BYTE8_P (c))
5448                 CHAR_STRING_ADVANCE (c, dst);
5449               else
5450                 *dst++ = CHAR_TO_BYTE8 (c);
5451               produced_chars++;
5452             }
5453           else
5454             /* This is an annotation datum.  (-C) is the length of
5455                it.  */
5456             buf += -c - 1;
5457         }
5458     }
5459   else
5460     {
5461       const unsigned char *src = coding->source;
5462       const unsigned char *src_end = src + coding->src_bytes;
5463       Lisp_Object eol_type;
5464
5465       eol_type = CODING_ID_EOL_TYPE (coding->id);
5466
5467       if (coding->src_multibyte != coding->dst_multibyte)
5468         {
5469           if (coding->src_multibyte)
5470             {
5471               int multibytep = 1;
5472               int consumed_chars;
5473
5474               while (1)
5475                 {
5476                   const unsigned char *src_base = src;
5477                   int c;
5478
5479                   ONE_MORE_BYTE (c);
5480                   if (c == '\r')
5481                     {
5482                       if (EQ (eol_type, Qdos))
5483                         {
5484                           if (src == src_end)
5485                             {
5486                               coding->result = CODING_RESULT_INSUFFICIENT_SRC;
5487                               goto no_more_source;
5488                             }
5489                           if (*src == '\n')
5490                             c = *src++;
5491                         }
5492                       else if (EQ (eol_type, Qmac))
5493                         c = '\n';
5494                     }
5495                   if (dst == dst_end)
5496                     {
5497                       coding->consumed = src - coding->source;
5498
5499                     if (EQ (coding->src_object, coding->dst_object))
5500                       dst_end = (unsigned char *) src;
5501                     if (dst == dst_end)
5502                       {
5503                         dst = alloc_destination (coding, src_end - src + 1,
5504                                                  dst);
5505                         dst_end = coding->destination + coding->dst_bytes;
5506                         coding_set_source (coding);
5507                         src = coding->source + coding->consumed;
5508                         src_end = coding->source + coding->src_bytes;
5509                       }
5510                     }
5511                   *dst++ = c;
5512                   produced_chars++;
5513                 }
5514             no_more_source:
5515               ;
5516             }
5517           else
5518             while (src < src_end)
5519               {
5520                 int multibytep = 1;
5521                 int c = *src++;
5522
5523                 if (c == '\r')
5524                   {
5525                     if (EQ (eol_type, Qdos))
5526                       {
5527                         if (src < src_end
5528                             && *src == '\n')
5529                           c = *src++;
5530                       }
5531                     else if (EQ (eol_type, Qmac))
5532                       c = '\n';
5533                   }
5534                 if (dst >= dst_end - 1)
5535                   {
5536                     coding->consumed = src - coding->source;
5537
5538                     if (EQ (coding->src_object, coding->dst_object))
5539                       dst_end = (unsigned char *) src;
5540                     if (dst >= dst_end - 1)
5541                       {
5542                         dst = alloc_destination (coding, src_end - src + 2,
5543                                                  dst);
5544                         dst_end = coding->destination + coding->dst_bytes;
5545                         coding_set_source (coding);
5546                         src = coding->source + coding->consumed;
5547                         src_end = coding->source + coding->src_bytes;
5548                       }
5549                   }
5550                 EMIT_ONE_BYTE (c);
5551               }
5552         }
5553       else
5554         {
5555           if (!EQ (coding->src_object, coding->dst_object))
5556             {
5557               int require = coding->src_bytes - coding->dst_bytes;
5558
5559               if (require > 0)
5560                 {
5561                   EMACS_INT offset = src - coding->source;
5562
5563                   dst = alloc_destination (coding, require, dst);
5564                   coding_set_source (coding);
5565                   src = coding->source + offset;
5566                   src_end = coding->source + coding->src_bytes;
5567                 }
5568             }
5569           produced_chars = coding->src_chars;
5570           while (src < src_end)
5571             {
5572               int c = *src++;
5573
5574               if (c == '\r')
5575                 {
5576                   if (EQ (eol_type, Qdos))
5577                     {
5578                       if (src < src_end
5579                           && *src == '\n')
5580                         c = *src++;
5581                       produced_chars--;
5582                     }
5583                   else if (EQ (eol_type, Qmac))
5584                     c = '\n';
5585                 }
5586               *dst++ = c;
5587             }
5588         }
5589       coding->consumed = coding->src_bytes;
5590       coding->consumed_char = coding->src_chars;
5591     }
5592
5593   produced = dst - (coding->destination + coding->produced);
5594   if (BUFFERP (coding->dst_object))
5595     insert_from_gap (produced_chars, produced);
5596   coding->produced += produced;
5597   coding->produced_char += produced_chars;
5598   return produced_chars;
5599 }
5600
5601 /* Compose text in CODING->object according to the annotation data at
5602    CHARBUF.  CHARBUF is an array:
5603      [ -LENGTH ANNOTATION_MASK FROM TO METHOD COMP_LEN [ COMPONENTS... ] ]
5604  */
5605
5606 static INLINE void
5607 produce_composition (coding, charbuf)
5608      struct coding_system *coding;
5609      int *charbuf;
5610 {
5611   int len;
5612   EMACS_INT from, to;
5613   enum composition_method method;
5614   Lisp_Object components;
5615
5616   len = -charbuf[0];
5617   from = coding->dst_pos + charbuf[2];
5618   to = coding->dst_pos + charbuf[3];
5619   method = (enum composition_method) (charbuf[4]);
5620
5621   if (method == COMPOSITION_RELATIVE)
5622     components = Qnil;
5623   else
5624     {
5625       Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
5626       int i;
5627
5628       len -= 5;
5629       charbuf += 5;
5630       for (i = 0; i < len; i++)
5631         args[i] = make_number (charbuf[i]);
5632       components = (method == COMPOSITION_WITH_ALTCHARS
5633                     ? Fstring (len, args) : Fvector (len, args));
5634     }
5635   compose_text (from, to, components, Qnil, coding->dst_object);
5636 }
5637
5638
5639 /* Put `charset' property on text in CODING->object according to
5640    the annotation data at CHARBUF.  CHARBUF is an array:
5641      [ -LENGTH ANNOTATION_MASK FROM TO CHARSET-ID ]
5642  */
5643
5644 static INLINE void
5645 produce_charset (coding, charbuf)
5646      struct coding_system *coding;
5647      int *charbuf;
5648 {
5649   EMACS_INT from = coding->dst_pos + charbuf[2];
5650   EMACS_INT to = coding->dst_pos + charbuf[3];
5651   struct charset *charset = CHARSET_FROM_ID (charbuf[4]);
5652
5653   Fput_text_property (make_number (from), make_number (to),
5654                       Qcharset, CHARSET_NAME (charset),
5655                       coding->dst_object);
5656 }
5657
5658
5659 #define CHARBUF_SIZE 0x4000
5660
5661 #define ALLOC_CONVERSION_WORK_AREA(coding)                              \
5662   do {                                                                  \
5663     int size = CHARBUF_SIZE;;                                           \
5664                                                                         \
5665     coding->charbuf = NULL;                                             \
5666     while (size > 1024)                                                 \
5667       {                                                                 \
5668         coding->charbuf = (int *) alloca (sizeof (int) * size);         \
5669         if (coding->charbuf)                                            \
5670           break;                                                        \
5671         size >>= 1;                                                     \
5672       }                                                                 \
5673     if (! coding->charbuf)                                              \
5674       {                                                                 \
5675         coding->result = CODING_RESULT_INSUFFICIENT_MEM;                \
5676         return coding->result;                                          \
5677       }                                                                 \
5678     coding->charbuf_size = size;                                        \
5679   } while (0)
5680
5681
5682 static void
5683 produce_annotation (coding)
5684      struct coding_system *coding;
5685 {
5686   int *charbuf = coding->charbuf;
5687   int *charbuf_end = charbuf + coding->charbuf_used;
5688
5689   if (NILP (coding->dst_object))
5690     return;
5691
5692   while (charbuf < charbuf_end)
5693     {
5694       if (*charbuf >= 0)
5695         charbuf++;
5696       else
5697         {
5698           int len = -*charbuf;
5699           switch (charbuf[1])
5700             {
5701             case CODING_ANNOTATE_COMPOSITION_MASK:
5702               produce_composition (coding, charbuf);
5703               break;
5704             case CODING_ANNOTATE_CHARSET_MASK:
5705               produce_charset (coding, charbuf);
5706               break;
5707             default:
5708               abort ();
5709             }
5710           charbuf += len;
5711         }
5712     }
5713 }
5714
5715 /* Decode the data at CODING->src_object into CODING->dst_object.
5716    CODING->src_object is a buffer, a string, or nil.
5717    CODING->dst_object is a buffer.
5718
5719    If CODING->src_object is a buffer, it must be the current buffer.
5720    In this case, if CODING->src_pos is positive, it is a position of
5721    the source text in the buffer, otherwise, the source text is in the
5722    gap area of the buffer, and CODING->src_pos specifies the offset of
5723    the text from GPT (which must be the same as PT).  If this is the
5724    same buffer as CODING->dst_object, CODING->src_pos must be
5725    negative.
5726
5727    If CODING->src_object is a string, CODING->src_pos in an index to
5728    that string.
5729
5730    If CODING->src_object is nil, CODING->source must already point to
5731    the non-relocatable memory area.  In this case, CODING->src_pos is
5732    an offset from CODING->source.
5733
5734    The decoded data is inserted at the current point of the buffer
5735    CODING->dst_object.
5736 */
5737
5738 static int
5739 decode_coding (coding)
5740      struct coding_system *coding;
5741 {
5742   Lisp_Object attrs;
5743   Lisp_Object undo_list;
5744
5745   if (BUFFERP (coding->src_object)
5746       && coding->src_pos > 0
5747       && coding->src_pos < GPT
5748       && coding->src_pos + coding->src_chars > GPT)
5749     move_gap_both (coding->src_pos, coding->src_pos_byte);
5750
5751   undo_list = Qt;
5752   if (BUFFERP (coding->dst_object))
5753     {
5754       if (current_buffer != XBUFFER (coding->dst_object))
5755         set_buffer_internal (XBUFFER (coding->dst_object));
5756       if (GPT != PT)
5757         move_gap_both (PT, PT_BYTE);
5758       undo_list = current_buffer->undo_list;
5759       current_buffer->undo_list = Qt;
5760     }
5761
5762   coding->consumed = coding->consumed_char = 0;
5763   coding->produced = coding->produced_char = 0;
5764   coding->chars_at_source = 0;
5765   coding->result = CODING_RESULT_SUCCESS;
5766   coding->errors = 0;
5767
5768   ALLOC_CONVERSION_WORK_AREA (coding);
5769
5770   attrs = CODING_ID_ATTRS (coding->id);
5771
5772   do
5773     {
5774       coding_set_source (coding);
5775       coding->annotated = 0;
5776       (*(coding->decoder)) (coding);
5777       if (!NILP (CODING_ATTR_DECODE_TBL (attrs)))
5778         translate_chars (coding, CODING_ATTR_DECODE_TBL (attrs));
5779       else if (!NILP (Vstandard_translation_table_for_decode))
5780         translate_chars (coding, Vstandard_translation_table_for_decode);
5781       coding_set_destination (coding);
5782       produce_chars (coding);
5783       if (coding->annotated)
5784         produce_annotation (coding);
5785     }
5786   while (coding->consumed < coding->src_bytes
5787          && ! coding->result);
5788
5789   coding->carryover_bytes = 0;
5790   if (coding->consumed < coding->src_bytes)
5791     {
5792       int nbytes = coding->src_bytes - coding->consumed;
5793       const unsigned char *src;
5794
5795       coding_set_source (coding);
5796       coding_set_destination (coding);
5797       src = coding->source + coding->consumed;
5798
5799       if (coding->mode & CODING_MODE_LAST_BLOCK)
5800         {
5801           /* Flush out unprocessed data as binary chars.  We are sure
5802              that the number of data is less than the size of
5803              coding->charbuf.  */
5804           while (nbytes-- > 0)
5805             {
5806               int c = *src++;
5807
5808               coding->charbuf[coding->charbuf_used++] = (c & 0x80 ? - c : c);
5809             }
5810           produce_chars (coding);
5811         }
5812       else
5813         {
5814           /* Record unprocessed bytes in coding->carryover.  We are
5815              sure that the number of data is less than the size of
5816              coding->carryover.  */
5817           unsigned char *p = coding->carryover;
5818
5819           coding->carryover_bytes = nbytes;
5820           while (nbytes-- > 0)
5821             *p++ = *src++;
5822         }
5823       coding->consumed = coding->src_bytes;
5824     }
5825
5826   if (BUFFERP (coding->dst_object))
5827     {
5828       current_buffer->undo_list = undo_list;
5829       record_insert (coding->dst_pos, coding->produced_char);
5830     }
5831   if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix))
5832     decode_eol (coding);
5833   return coding->result;
5834 }
5835
5836
5837 /* Extract an annotation datum from a composition starting at POS and
5838    ending before LIMIT of CODING->src_object (buffer or string), store
5839    the data in BUF, set *STOP to a starting position of the next
5840    composition (if any) or to LIMIT, and return the address of the
5841    next element of BUF.
5842
5843    If such an annotation is not found, set *STOP to a starting
5844    position of a composition after POS (if any) or to LIMIT, and
5845    return BUF.  */
5846
5847 static INLINE int *
5848 handle_composition_annotation (pos, limit, coding, buf, stop)
5849      EMACS_INT pos, limit;
5850      struct coding_system *coding;
5851      int *buf;
5852      EMACS_INT *stop;
5853 {
5854   EMACS_INT start, end;
5855   Lisp_Object prop;
5856
5857   if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
5858       || end > limit)
5859     *stop = limit;
5860   else if (start > pos)
5861     *stop = start;
5862   else
5863     {
5864       if (start == pos)
5865         {
5866           /* We found a composition.  Store the corresponding
5867              annotation data in BUF.  */
5868           int *head = buf;
5869           enum composition_method method = COMPOSITION_METHOD (prop);
5870           int nchars = COMPOSITION_LENGTH (prop);
5871
5872           ADD_COMPOSITION_DATA (buf, 0, nchars, method);
5873           if (method != COMPOSITION_RELATIVE)
5874             {
5875               Lisp_Object components;
5876               int len, i, i_byte;
5877
5878               components = COMPOSITION_COMPONENTS (prop);
5879               if (VECTORP (components))
5880                 {
5881                   len = XVECTOR (components)->size;
5882                   for (i = 0; i < len; i++)
5883                     *buf++ = XINT (AREF (components, i));
5884                 }
5885               else if (STRINGP (components))
5886                 {
5887                   len = SCHARS (components);
5888                   i = i_byte = 0;
5889                   while (i < len)
5890                     {
5891                       FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
5892                       buf++;
5893                     }
5894                 }
5895               else if (INTEGERP (components))
5896                 {
5897                   len = 1;
5898                   *buf++ = XINT (components);
5899                 }
5900               else if (CONSP (components))
5901                 {
5902                   for (len = 0; CONSP (components);
5903                        len++, components = XCDR (components))
5904                     *buf++ = XINT (XCAR (components));
5905                 }
5906               else
5907                 abort ();
5908               *head -= len;
5909             }
5910         }
5911
5912       if (find_composition (end, limit, &start, &end, &prop,
5913                             coding->src_object)
5914           && end <= limit)
5915         *stop = start;
5916       else
5917         *stop = limit;
5918     }
5919   return buf;
5920 }
5921
5922
5923 /* Extract an annotation datum from a text property `charset' at POS of
5924    CODING->src_object (buffer of string), store the data in BUF, set
5925    *STOP to the position where the value of `charset' property changes
5926    (limiting by LIMIT), and return the address of the next element of
5927    BUF.
5928
5929    If the property value is nil, set *STOP to the position where the
5930    property value is non-nil (limiting by LIMIT), and return BUF.  */
5931
5932 static INLINE int *
5933 handle_charset_annotation (pos, limit, coding, buf, stop)
5934      EMACS_INT pos, limit;
5935      struct coding_system *coding;
5936      int *buf;
5937      EMACS_INT *stop;
5938 {
5939   Lisp_Object val, next;
5940   int id;
5941
5942   val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
5943   if (! NILP (val) && CHARSETP (val))
5944     id = XINT (CHARSET_SYMBOL_ID (val));
5945   else
5946     id = -1;
5947   ADD_CHARSET_DATA (buf, 0, 0, id);
5948   next = Fnext_single_property_change (make_number (pos), Qcharset,
5949                                        coding->src_object,
5950                                        make_number (limit));
5951   *stop = XINT (next);
5952   return buf;
5953 }
5954
5955
5956 static void
5957 consume_chars (coding)
5958      struct coding_system *coding;
5959 {
5960   int *buf = coding->charbuf;
5961   int *buf_end = coding->charbuf + coding->charbuf_size;
5962   const unsigned char *src = coding->source + coding->consumed;
5963   const unsigned char *src_end = coding->source + coding->src_bytes;
5964   EMACS_INT pos = coding->src_pos + coding->consumed_char;
5965   EMACS_INT end_pos = coding->src_pos + coding->src_chars;
5966   int multibytep = coding->src_multibyte;
5967   Lisp_Object eol_type;
5968   int c;
5969   EMACS_INT stop, stop_composition, stop_charset;
5970
5971   eol_type = CODING_ID_EOL_TYPE (coding->id);
5972   if (VECTORP (eol_type))
5973     eol_type = Qunix;
5974
5975   /* Note: composition handling is not yet implemented.  */
5976   coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
5977
5978   if (NILP (coding->src_object))
5979     stop = stop_composition = stop_charset = end_pos;
5980   else
5981     {
5982       if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
5983         stop = stop_composition = pos;
5984       else
5985         stop = stop_composition = end_pos;
5986       if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
5987         stop = stop_charset = pos;
5988       else
5989         stop_charset = end_pos;
5990     }
5991
5992   /* Compensate for CRLF and conversion.  */
5993   buf_end -= 1 + MAX_ANNOTATION_LENGTH;
5994   while (buf < buf_end)
5995     {
5996       if (pos == stop)
5997         {
5998           if (pos == end_pos)
5999             break;
6000           if (pos == stop_composition)
6001             buf = handle_composition_annotation (pos, end_pos, coding,
6002                                                  buf, &stop_composition);
6003           if (pos == stop_charset)
6004             buf = handle_charset_annotation (pos, end_pos, coding,
6005                                              buf, &stop_charset);
6006           stop = (stop_composition < stop_charset
6007                   ? stop_composition : stop_charset);
6008         }
6009
6010       if (! multibytep)
6011         {
6012           EMACS_INT bytes;
6013
6014           if (! CODING_FOR_UNIBYTE (coding)
6015               && (bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
6016             c = STRING_CHAR_ADVANCE (src), pos += bytes;
6017           else
6018             c = *src++, pos++;
6019         }
6020       else
6021         c = STRING_CHAR_ADVANCE (src), pos++;
6022       if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
6023         c = '\n';
6024       if (! EQ (eol_type, Qunix))
6025         {
6026           if (c == '\n')
6027             {
6028               if (EQ (eol_type, Qdos))
6029                 *buf++ = '\r';
6030               else
6031                 c = '\r';
6032             }
6033         }
6034       *buf++ = c;
6035     }
6036
6037   coding->consumed = src - coding->source;
6038   coding->consumed_char = pos - coding->src_pos;
6039   coding->charbuf_used = buf - coding->charbuf;
6040   coding->chars_at_source = 0;
6041 }
6042
6043
6044 /* Encode the text at CODING->src_object into CODING->dst_object.
6045    CODING->src_object is a buffer or a string.
6046    CODING->dst_object is a buffer or nil.
6047
6048    If CODING->src_object is a buffer, it must be the current buffer.
6049    In this case, if CODING->src_pos is positive, it is a position of
6050    the source text in the buffer, otherwise. the source text is in the
6051    gap area of the buffer, and coding->src_pos specifies the offset of
6052    the text from GPT (which must be the same as PT).  If this is the
6053    same buffer as CODING->dst_object, CODING->src_pos must be
6054    negative and CODING should not have `pre-write-conversion'.
6055
6056    If CODING->src_object is a string, CODING should not have
6057    `pre-write-conversion'.
6058
6059    If CODING->dst_object is a buffer, the encoded data is inserted at
6060    the current point of that buffer.
6061
6062    If CODING->dst_object is nil, the encoded data is placed at the
6063    memory area specified by CODING->destination.  */
6064
6065 static int
6066 encode_coding (coding)
6067      struct coding_system *coding;
6068 {
6069   Lisp_Object attrs;
6070
6071   attrs = CODING_ID_ATTRS (coding->id);
6072
6073   if (BUFFERP (coding->dst_object))
6074     {
6075       set_buffer_internal (XBUFFER (coding->dst_object));
6076       coding->dst_multibyte
6077         = ! NILP (current_buffer->enable_multibyte_characters);
6078     }
6079
6080   coding->consumed = coding->consumed_char = 0;
6081   coding->produced = coding->produced_char = 0;
6082   coding->result = CODING_RESULT_SUCCESS;
6083   coding->errors = 0;
6084
6085   ALLOC_CONVERSION_WORK_AREA (coding);
6086
6087   do {
6088     coding_set_source (coding);
6089     consume_chars (coding);
6090
6091     if (!NILP (CODING_ATTR_ENCODE_TBL (attrs)))
6092       translate_chars (coding, CODING_ATTR_ENCODE_TBL (attrs));
6093     else if (!NILP (Vstandard_translation_table_for_encode))
6094       translate_chars (coding, Vstandard_translation_table_for_encode);
6095
6096     coding_set_destination (coding);
6097     (*(coding->encoder)) (coding);
6098   } while (coding->consumed_char < coding->src_chars);
6099
6100   if (BUFFERP (coding->dst_object))
6101     insert_from_gap (coding->produced_char, coding->produced);
6102
6103   return (coding->result);
6104 }
6105
6106
6107 /* Name (or base name) of work buffer for code conversion.  */
6108 static Lisp_Object Vcode_conversion_workbuf_name;
6109
6110 /* A working buffer used by the top level conversion.  Once it is
6111    created, it is never destroyed.  It has the name
6112    Vcode_conversion_workbuf_name.  The other working buffers are
6113    destroyed after the use is finished, and their names are modified
6114    versions of Vcode_conversion_workbuf_name.  */
6115 static Lisp_Object Vcode_conversion_reused_workbuf;
6116
6117 /* 1 iff Vcode_conversion_reused_workbuf is already in use.  */
6118 static int reused_workbuf_in_use;
6119
6120
6121 /* Return a working buffer of code convesion.  MULTIBYTE specifies the
6122    multibyteness of returning buffer.  */
6123
6124 Lisp_Object
6125 make_conversion_work_buffer (multibyte)
6126 {
6127   Lisp_Object name, workbuf;
6128   struct buffer *current;
6129
6130   if (reused_workbuf_in_use++)
6131     name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
6132   else
6133     name = Vcode_conversion_workbuf_name;
6134   workbuf = Fget_buffer_create (name);
6135   current = current_buffer;
6136   set_buffer_internal (XBUFFER (workbuf));
6137   Ferase_buffer ();
6138   current_buffer->undo_list = Qt;
6139   current_buffer->enable_multibyte_characters = multibyte ? Qt : Qnil;
6140   set_buffer_internal (current);
6141   return workbuf;
6142 }
6143
6144
6145 static Lisp_Object
6146 code_conversion_restore (arg)
6147      Lisp_Object arg;
6148 {
6149   Lisp_Object current, workbuf;
6150
6151   current = XCAR (arg);
6152   workbuf = XCDR (arg);
6153   if (! NILP (workbuf))
6154     {
6155       if (EQ (workbuf, Vcode_conversion_reused_workbuf))
6156         reused_workbuf_in_use = 0;
6157       else if (! NILP (Fbuffer_live_p (workbuf)))
6158         Fkill_buffer (workbuf);
6159     }
6160   set_buffer_internal (XBUFFER (current));
6161   return Qnil;
6162 }
6163
6164 Lisp_Object
6165 code_conversion_save (with_work_buf, multibyte)
6166      int with_work_buf, multibyte;
6167 {
6168   Lisp_Object workbuf = Qnil;
6169
6170   if (with_work_buf)
6171     workbuf = make_conversion_work_buffer (multibyte);
6172   record_unwind_protect (code_conversion_restore,
6173                          Fcons (Fcurrent_buffer (), workbuf));
6174   return workbuf;
6175 }
6176
6177 int
6178 decode_coding_gap (coding, chars, bytes)
6179      struct coding_system *coding;
6180      EMACS_INT chars, bytes;
6181 {
6182   int count = specpdl_ptr - specpdl;
6183   Lisp_Object attrs;
6184
6185   code_conversion_save (0, 0);
6186
6187   coding->src_object = Fcurrent_buffer ();
6188   coding->src_chars = chars;
6189   coding->src_bytes = bytes;
6190   coding->src_pos = -chars;
6191   coding->src_pos_byte = -bytes;
6192   coding->src_multibyte = chars < bytes;
6193   coding->dst_object = coding->src_object;
6194   coding->dst_pos = PT;
6195   coding->dst_pos_byte = PT_BYTE;
6196   coding->dst_multibyte = ! NILP (current_buffer->enable_multibyte_characters);
6197   coding->mode |= CODING_MODE_LAST_BLOCK;
6198
6199   if (CODING_REQUIRE_DETECTION (coding))
6200     detect_coding (coding);
6201
6202   decode_coding (coding);
6203
6204   attrs = CODING_ID_ATTRS (coding->id);
6205   if (! NILP (CODING_ATTR_POST_READ (attrs)))
6206     {
6207       EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
6208       Lisp_Object val;
6209
6210       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
6211       val = call1 (CODING_ATTR_POST_READ (attrs),
6212                    make_number (coding->produced_char));
6213       CHECK_NATNUM (val);
6214       coding->produced_char += Z - prev_Z;
6215       coding->produced += Z_BYTE - prev_Z_BYTE;
6216     }
6217
6218   unbind_to (count, Qnil);
6219   return coding->result;
6220 }
6221
6222 int
6223 encode_coding_gap (coding, chars, bytes)
6224      struct coding_system *coding;
6225      EMACS_INT chars, bytes;
6226 {
6227   int count = specpdl_ptr - specpdl;
6228
6229   code_conversion_save (0, 0);
6230
6231   coding->src_object = Fcurrent_buffer ();
6232   coding->src_chars = chars;
6233   coding->src_bytes = bytes;
6234   coding->src_pos = -chars;
6235   coding->src_pos_byte = -bytes;
6236   coding->src_multibyte = chars < bytes;
6237   coding->dst_object = coding->src_object;
6238   coding->dst_pos = PT;
6239   coding->dst_pos_byte = PT_BYTE;
6240
6241   encode_coding (coding);
6242
6243   unbind_to (count, Qnil);
6244   return coding->result;
6245 }
6246
6247
6248 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
6249    SRC_OBJECT into DST_OBJECT by coding context CODING.
6250
6251    SRC_OBJECT is a buffer, a string, or Qnil.
6252
6253    If it is a buffer, the text is at point of the buffer.  FROM and TO
6254    are positions in the buffer.
6255
6256    If it is a string, the text is at the beginning of the string.
6257    FROM and TO are indices to the string.
6258
6259    If it is nil, the text is at coding->source.  FROM and TO are
6260    indices to coding->source.
6261
6262    DST_OBJECT is a buffer, Qt, or Qnil.
6263
6264    If it is a buffer, the decoded text is inserted at point of the
6265    buffer.  If the buffer is the same as SRC_OBJECT, the source text
6266    is deleted.
6267
6268    If it is Qt, a string is made from the decoded text, and
6269    set in CODING->dst_object.
6270
6271    If it is Qnil, the decoded text is stored at CODING->destination.
6272    The caller must allocate CODING->dst_bytes bytes at
6273    CODING->destination by xmalloc.  If the decoded text is longer than
6274    CODING->dst_bytes, CODING->destination is relocated by xrealloc.
6275  */
6276
6277 void
6278 decode_coding_object (coding, src_object, from, from_byte, to, to_byte,
6279                       dst_object)
6280      struct coding_system *coding;
6281      Lisp_Object src_object;
6282      EMACS_INT from, from_byte, to, to_byte;
6283      Lisp_Object dst_object;
6284 {
6285   int count = specpdl_ptr - specpdl;
6286   unsigned char *destination;
6287   EMACS_INT dst_bytes;
6288   EMACS_INT chars = to - from;
6289   EMACS_INT bytes = to_byte - from_byte;
6290   Lisp_Object attrs;
6291   Lisp_Object buffer;
6292   int saved_pt = -1, saved_pt_byte;
6293
6294   buffer = Fcurrent_buffer ();
6295
6296   if (NILP (dst_object))
6297     {
6298       destination = coding->destination;
6299       dst_bytes = coding->dst_bytes;
6300     }
6301
6302   coding->src_object = src_object;
6303   coding->src_chars = chars;
6304   coding->src_bytes = bytes;
6305   coding->src_multibyte = chars < bytes;
6306
6307   if (STRINGP (src_object))
6308     {
6309       coding->src_pos = from;
6310       coding->src_pos_byte = from_byte;
6311     }
6312   else if (BUFFERP (src_object))
6313     {
6314       set_buffer_internal (XBUFFER (src_object));
6315       if (from != GPT)
6316         move_gap_both (from, from_byte);
6317       if (EQ (src_object, dst_object))
6318         {
6319           saved_pt = PT, saved_pt_byte = PT_BYTE;
6320           TEMP_SET_PT_BOTH (from, from_byte);
6321           del_range_both (from, from_byte, to, to_byte, 1);
6322           coding->src_pos = -chars;
6323           coding->src_pos_byte = -bytes;
6324         }
6325       else
6326         {
6327           coding->src_pos = from;
6328           coding->src_pos_byte = from_byte;
6329         }
6330     }
6331
6332   if (CODING_REQUIRE_DETECTION (coding))
6333     detect_coding (coding);
6334   attrs = CODING_ID_ATTRS (coding->id);
6335
6336   if (EQ (dst_object, Qt)
6337       || (! NILP (CODING_ATTR_POST_READ (attrs))
6338           && NILP (dst_object)))
6339     {
6340       coding->dst_object = code_conversion_save (1, 1);
6341       coding->dst_pos = BEG;
6342       coding->dst_pos_byte = BEG_BYTE;
6343       coding->dst_multibyte = 1;
6344     }
6345   else if (BUFFERP (dst_object))
6346     {
6347       code_conversion_save (0, 0);
6348       coding->dst_object = dst_object;
6349       coding->dst_pos = BUF_PT (XBUFFER (dst_object));
6350       coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
6351       coding->dst_multibyte
6352         = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
6353     }
6354   else
6355     {
6356       code_conversion_save (0, 0);
6357       coding->dst_object = Qnil;
6358       coding->dst_multibyte = 1;
6359     }
6360
6361   decode_coding (coding);
6362
6363   if (BUFFERP (coding->dst_object))
6364     set_buffer_internal (XBUFFER (coding->dst_object));
6365
6366   if (! NILP (CODING_ATTR_POST_READ (attrs)))
6367     {
6368       struct gcpro gcpro1, gcpro2;
6369       EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
6370       Lisp_Object val;
6371
6372       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
6373       GCPRO2 (coding->src_object, coding->dst_object);
6374       val = call1 (CODING_ATTR_POST_READ (attrs),
6375                    make_number (coding->produced_char));
6376       UNGCPRO;
6377       CHECK_NATNUM (val);
6378       coding->produced_char += Z - prev_Z;
6379       coding->produced += Z_BYTE - prev_Z_BYTE;
6380     }
6381
6382   if (EQ (dst_object, Qt))
6383     {
6384       coding->dst_object = Fbuffer_string ();
6385     }
6386   else if (NILP (dst_object) && BUFFERP (coding->dst_object))
6387     {
6388       set_buffer_internal (XBUFFER (coding->dst_object));
6389       if (dst_bytes < coding->produced)
6390         {
6391           destination
6392             = (unsigned char *) xrealloc (destination, coding->produced);
6393           if (! destination)
6394             {
6395               coding->result = CODING_RESULT_INSUFFICIENT_DST;
6396               unbind_to (count, Qnil);
6397               return;
6398             }
6399           if (BEGV < GPT && GPT < BEGV + coding->produced_char)
6400             move_gap_both (BEGV, BEGV_BYTE);
6401           bcopy (BEGV_ADDR, destination, coding->produced);
6402           coding->destination = destination;
6403         }
6404     }
6405
6406   if (saved_pt >= 0)
6407     {
6408       /* This is the case of:
6409          (BUFFERP (src_object) && EQ (src_object, dst_object))
6410          As we have moved PT while replacing the original buffer
6411          contents, we must recover it now.  */
6412       set_buffer_internal (XBUFFER (src_object));
6413       if (saved_pt < from)
6414         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
6415       else if (saved_pt < from + chars)
6416         TEMP_SET_PT_BOTH (from, from_byte);
6417       else if (! NILP (current_buffer->enable_multibyte_characters))
6418         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
6419                           saved_pt_byte + (coding->produced - bytes));
6420       else
6421         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
6422                           saved_pt_byte + (coding->produced - bytes));
6423     }
6424
6425   unbind_to (count, Qnil);
6426 }
6427
6428
6429 void
6430 encode_coding_object (coding, src_object, from, from_byte, to, to_byte,
6431                       dst_object)
6432      struct coding_system *coding;
6433      Lisp_Object src_object;
6434      EMACS_INT from, from_byte, to, to_byte;
6435      Lisp_Object dst_object;
6436 {
6437   int count = specpdl_ptr - specpdl;
6438   EMACS_INT chars = to - from;
6439   EMACS_INT bytes = to_byte - from_byte;
6440   Lisp_Object attrs;
6441   Lisp_Object buffer;
6442   int saved_pt = -1, saved_pt_byte;
6443
6444   buffer = Fcurrent_buffer ();
6445
6446   coding->src_object = src_object;
6447   coding->src_chars = chars;
6448   coding->src_bytes = bytes;
6449   coding->src_multibyte = chars < bytes;
6450
6451   attrs = CODING_ID_ATTRS (coding->id);
6452
6453   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
6454     {
6455       coding->src_object = code_conversion_save (1, coding->src_multibyte);
6456       set_buffer_internal (XBUFFER (coding->src_object));
6457       if (STRINGP (src_object))
6458         insert_from_string (src_object, from, from_byte, chars, bytes, 0);
6459       else if (BUFFERP (src_object))
6460         insert_from_buffer (XBUFFER (src_object), from, chars, 0);
6461       else
6462         insert_1_both (coding->source + from, chars, bytes, 0, 0, 0);
6463
6464       if (EQ (src_object, dst_object))
6465         {
6466           set_buffer_internal (XBUFFER (src_object));
6467           saved_pt = PT, saved_pt_byte = PT_BYTE;
6468           del_range_both (from, from_byte, to, to_byte, 1);
6469           set_buffer_internal (XBUFFER (coding->src_object));
6470         }
6471
6472       call2 (CODING_ATTR_PRE_WRITE (attrs),
6473              make_number (BEG), make_number (Z));
6474       coding->src_object = Fcurrent_buffer ();
6475       if (BEG != GPT)
6476         move_gap_both (BEG, BEG_BYTE);
6477       coding->src_chars = Z - BEG;
6478       coding->src_bytes = Z_BYTE - BEG_BYTE;
6479       coding->src_pos = BEG;
6480       coding->src_pos_byte = BEG_BYTE;
6481       coding->src_multibyte = Z < Z_BYTE;
6482     }
6483   else if (STRINGP (src_object))
6484     {
6485       code_conversion_save (0, 0);
6486       coding->src_pos = from;
6487       coding->src_pos_byte = from_byte;
6488     }
6489   else if (BUFFERP (src_object))
6490     {
6491       code_conversion_save (0, 0);
6492       set_buffer_internal (XBUFFER (src_object));
6493       if (EQ (src_object, dst_object))
6494         {
6495           saved_pt = PT, saved_pt_byte = PT_BYTE;
6496           coding->src_object = del_range_1 (from, to, 1, 1);
6497           coding->src_pos = 0;
6498           coding->src_pos_byte = 0;
6499         }
6500       else
6501         {
6502           if (from < GPT && to >= GPT)
6503             move_gap_both (from, from_byte);
6504           coding->src_pos = from;
6505           coding->src_pos_byte = from_byte;
6506         }
6507     }
6508   else
6509     code_conversion_save (0, 0);
6510
6511   if (BUFFERP (dst_object))
6512     {
6513       coding->dst_object = dst_object;
6514       if (EQ (src_object, dst_object))
6515         {
6516           coding->dst_pos = from;
6517           coding->dst_pos_byte = from_byte;
6518         }
6519       else
6520         {
6521           coding->dst_pos = BUF_PT (XBUFFER (dst_object));
6522           coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
6523         }
6524       coding->dst_multibyte
6525         = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
6526     }
6527   else if (EQ (dst_object, Qt))
6528     {
6529       coding->dst_object = Qnil;
6530       coding->dst_bytes = coding->src_chars;
6531       if (coding->dst_bytes == 0)
6532         coding->dst_bytes = 1;
6533       coding->destination = (unsigned char *) xmalloc (coding->dst_bytes);
6534       coding->dst_multibyte = 0;
6535     }
6536   else
6537     {
6538       coding->dst_object = Qnil;
6539       coding->dst_multibyte = 0;
6540     }
6541
6542   encode_coding (coding);
6543
6544   if (EQ (dst_object, Qt))
6545     {
6546       if (BUFFERP (coding->dst_object))
6547         coding->dst_object = Fbuffer_string ();
6548       else
6549         {
6550           coding->dst_object
6551             = make_unibyte_string ((char *) coding->destination,
6552                                    coding->produced);
6553           xfree (coding->destination);
6554         }
6555     }
6556
6557   if (saved_pt >= 0)
6558     {
6559       /* This is the case of:
6560          (BUFFERP (src_object) && EQ (src_object, dst_object))
6561          As we have moved PT while replacing the original buffer
6562          contents, we must recover it now.  */
6563       set_buffer_internal (XBUFFER (src_object));
6564       if (saved_pt < from)
6565         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
6566       else if (saved_pt < from + chars)
6567         TEMP_SET_PT_BOTH (from, from_byte);
6568       else if (! NILP (current_buffer->enable_multibyte_characters))
6569         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
6570                           saved_pt_byte + (coding->produced - bytes));
6571       else
6572         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
6573                           saved_pt_byte + (coding->produced - bytes));
6574     }
6575
6576   unbind_to (count, Qnil);
6577 }
6578
6579
6580 Lisp_Object
6581 preferred_coding_system ()
6582 {
6583   int id = coding_categories[coding_priorities[0]].id;
6584
6585   return CODING_ID_NAME (id);
6586 }
6587
6588 \f
6589 #ifdef emacs
6590 /*** 8. Emacs Lisp library functions ***/
6591
6592 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
6593        doc: /* Return t if OBJECT is nil or a coding-system.
6594 See the documentation of `define-coding-system' for information
6595 about coding-system objects.  */)
6596      (obj)
6597      Lisp_Object obj;
6598 {
6599   return ((NILP (obj) || CODING_SYSTEM_P (obj)) ? Qt : Qnil);
6600 }
6601
6602 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
6603        Sread_non_nil_coding_system, 1, 1, 0,
6604        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.  */)
6605      (prompt)
6606      Lisp_Object prompt;
6607 {
6608   Lisp_Object val;
6609   do
6610     {
6611       val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6612                               Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
6613     }
6614   while (SCHARS (val) == 0);
6615   return (Fintern (val, Qnil));
6616 }
6617
6618 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
6619        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
6620 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.  */)
6621      (prompt, default_coding_system)
6622      Lisp_Object prompt, default_coding_system;
6623 {
6624   Lisp_Object val;
6625   if (SYMBOLP (default_coding_system))
6626     XSETSTRING (default_coding_system, XPNTR (SYMBOL_NAME (default_coding_system)));
6627   val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6628                           Qt, Qnil, Qcoding_system_history,
6629                           default_coding_system, Qnil);
6630   return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
6631 }
6632
6633 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
6634        1, 1, 0,
6635        doc: /* Check validity of CODING-SYSTEM.
6636 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.  */)
6637   (coding_system)
6638      Lisp_Object coding_system;
6639 {
6640   CHECK_SYMBOL (coding_system);
6641   if (!NILP (Fcoding_system_p (coding_system)))
6642     return coding_system;
6643   while (1)
6644     Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
6645 }
6646
6647 \f
6648 /* Detect how the bytes at SRC of length SRC_BYTES are encoded.  If
6649    HIGHEST is nonzero, return the coding system of the highest
6650    priority among the detected coding systems.  Otherwize return a
6651    list of detected coding systems sorted by their priorities.  If
6652    MULTIBYTEP is nonzero, it is assumed that the bytes are in correct
6653    multibyte form but contains only ASCII and eight-bit chars.
6654    Otherwise, the bytes are raw bytes.
6655
6656    CODING-SYSTEM controls the detection as below:
6657
6658    If it is nil, detect both text-format and eol-format.  If the
6659    text-format part of CODING-SYSTEM is already specified
6660    (e.g. `iso-latin-1'), detect only eol-format.  If the eol-format
6661    part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
6662    detect only text-format.  */
6663
6664 Lisp_Object
6665 detect_coding_system (src, src_chars, src_bytes, highest, multibytep,
6666                       coding_system)
6667      const unsigned char *src;
6668      int src_chars, src_bytes, highest;
6669      int multibytep;
6670      Lisp_Object coding_system;
6671 {
6672   const unsigned char *src_end = src + src_bytes;
6673   Lisp_Object attrs, eol_type;
6674   Lisp_Object val;
6675   struct coding_system coding;
6676   int id;
6677   struct coding_detection_info detect_info;
6678   enum coding_category base_category;
6679
6680   if (NILP (coding_system))
6681     coding_system = Qundecided;
6682   setup_coding_system (coding_system, &coding);
6683   attrs = CODING_ID_ATTRS (coding.id);
6684   eol_type = CODING_ID_EOL_TYPE (coding.id);
6685   coding_system = CODING_ATTR_BASE_NAME (attrs);
6686
6687   coding.source = src;
6688   coding.src_chars = src_chars;
6689   coding.src_bytes = src_bytes;
6690   coding.src_multibyte = multibytep;
6691   coding.consumed = 0;
6692   coding.mode |= CODING_MODE_LAST_BLOCK;
6693
6694   detect_info.checked = detect_info.found = detect_info.rejected = 0;
6695
6696   /* At first, detect text-format if necessary.  */
6697   base_category = XINT (CODING_ATTR_CATEGORY (attrs));
6698   if (base_category == coding_category_undecided)
6699     {
6700       enum coding_category category;
6701       struct coding_system *this;
6702       int c, i;
6703
6704       /* Skip all ASCII bytes except for a few ISO2022 controls.  */
6705       for (i = 0; src < src_end; i++, src++)
6706         {
6707           c = *src;
6708           if (c & 0x80 || (c < 0x20 && (c == 0
6709                                         || c == ISO_CODE_ESC
6710                                         || c == ISO_CODE_SI
6711                                         || c == ISO_CODE_SO)))
6712             break;
6713         }
6714       /* Skipped bytes must be even for utf-16 detecor.  */
6715       if (i % 2)
6716         src--;
6717       coding.head_ascii = src - coding.source;
6718
6719       if (src < src_end)
6720         for (i = 0; i < coding_category_raw_text; i++)
6721           {
6722             category = coding_priorities[i];
6723             this = coding_categories + category;
6724
6725             if (this->id < 0)
6726               {
6727                 /* No coding system of this category is defined.  */
6728                 detect_info.rejected |= (1 << category);
6729               }
6730             else if (category >= coding_category_raw_text)
6731               continue;
6732             else if (detect_info.checked & (1 << category))
6733               {
6734                 if (highest
6735                     && (detect_info.found & (1 << category)))
6736                   break;
6737               }
6738             else
6739               {
6740                 if ((*(this->detector)) (&coding, &detect_info)
6741                     && highest
6742                     && (detect_info.found & (1 << category)))
6743                   {
6744                     if (category == coding_category_utf_16_auto)
6745                       {
6746                         if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6747                           category = coding_category_utf_16_le;
6748                         else
6749                           category = coding_category_utf_16_be;
6750                       }
6751                     break;
6752                   }
6753               }
6754           }
6755
6756       if (detect_info.rejected == CATEGORY_MASK_ANY)
6757         {
6758           detect_info.found = CATEGORY_MASK_RAW_TEXT;
6759           id = coding_categories[coding_category_raw_text].id;
6760           val = Fcons (make_number (id), Qnil);
6761         }
6762       else if (! detect_info.rejected && ! detect_info.found)
6763         {
6764           detect_info.found = CATEGORY_MASK_ANY;
6765           id = coding_categories[coding_category_undecided].id;
6766           val = Fcons (make_number (id), Qnil);
6767         }
6768       else if (highest)
6769         {
6770           if (detect_info.found)
6771             {
6772               detect_info.found = 1 << category;
6773               val = Fcons (make_number (this->id), Qnil);
6774             }
6775           else
6776             for (i = 0; i < coding_category_raw_text; i++)
6777               if (! (detect_info.rejected & (1 << coding_priorities[i])))
6778                 {
6779                   detect_info.found = 1 << coding_priorities[i];
6780                   id = coding_categories[coding_priorities[i]].id;
6781                   val = Fcons (make_number (id), Qnil);
6782                   break;
6783                 }
6784         }
6785       else
6786         {
6787           int mask = detect_info.rejected | detect_info.found;
6788           int found = 0;
6789           val = Qnil;
6790
6791           for (i = coding_category_raw_text - 1; i >= 0; i--)
6792             {
6793               category = coding_priorities[i];
6794               if (! (mask & (1 << category)))
6795                 {
6796                   found |= 1 << category;
6797                   id = coding_categories[category].id;
6798                   val = Fcons (make_number (id), val);
6799                 }
6800             }
6801           for (i = coding_category_raw_text - 1; i >= 0; i--)
6802             {
6803               category = coding_priorities[i];
6804               if (detect_info.found & (1 << category))
6805                 {
6806                   id = coding_categories[category].id;
6807                   val = Fcons (make_number (id), val);
6808                 }
6809             }
6810           detect_info.found |= found;
6811         }
6812     }
6813   else if (base_category == coding_category_utf_16_auto)
6814     {
6815       if (detect_coding_utf_16 (&coding, &detect_info))
6816         {
6817           enum coding_category category;
6818           struct coding_system *this;
6819
6820           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6821             this = coding_categories + coding_category_utf_16_le;
6822           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6823             this = coding_categories + coding_category_utf_16_be;
6824           else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
6825             this = coding_categories + coding_category_utf_16_be_nosig;
6826           else
6827             this = coding_categories + coding_category_utf_16_le_nosig;
6828           val = Fcons (make_number (this->id), Qnil);
6829         }
6830     }
6831   else
6832     {
6833       detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
6834       val = Fcons (make_number (coding.id), Qnil);
6835     }
6836
6837   /* Then, detect eol-format if necessary.  */
6838   {
6839     int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol;
6840     Lisp_Object tail;
6841
6842     if (VECTORP (eol_type))
6843       {
6844         if (detect_info.found & ~CATEGORY_MASK_UTF_16)
6845           normal_eol = detect_eol (coding.source, src_bytes,
6846                                    coding_category_raw_text);
6847         if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
6848                                  | CATEGORY_MASK_UTF_16_BE_NOSIG))
6849           utf_16_be_eol = detect_eol (coding.source, src_bytes,
6850                                       coding_category_utf_16_be);
6851         if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
6852                                  | CATEGORY_MASK_UTF_16_LE_NOSIG))
6853           utf_16_le_eol = detect_eol (coding.source, src_bytes,
6854                                       coding_category_utf_16_le);
6855       }
6856     else
6857       {
6858         if (EQ (eol_type, Qunix))
6859           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
6860         else if (EQ (eol_type, Qdos))
6861           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
6862         else
6863           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
6864       }
6865
6866     for (tail = val; CONSP (tail); tail = XCDR (tail))
6867       {
6868         enum coding_category category;
6869         int this_eol;
6870
6871         id = XINT (XCAR (tail));
6872         attrs = CODING_ID_ATTRS (id);
6873         category = XINT (CODING_ATTR_CATEGORY (attrs));
6874         eol_type = CODING_ID_EOL_TYPE (id);
6875         if (VECTORP (eol_type))
6876           {
6877             if (category == coding_category_utf_16_be
6878                 || category == coding_category_utf_16_be_nosig)
6879               this_eol = utf_16_be_eol;
6880             else if (category == coding_category_utf_16_le
6881                      || category == coding_category_utf_16_le_nosig)
6882               this_eol = utf_16_le_eol;
6883             else
6884               this_eol = normal_eol;
6885
6886             if (this_eol == EOL_SEEN_LF)
6887               XSETCAR (tail, AREF (eol_type, 0));
6888             else if (this_eol == EOL_SEEN_CRLF)
6889               XSETCAR (tail, AREF (eol_type, 1));
6890             else if (this_eol == EOL_SEEN_CR)
6891               XSETCAR (tail, AREF (eol_type, 2));
6892             else
6893               XSETCAR (tail, CODING_ID_NAME (id));
6894           }
6895         else
6896           XSETCAR (tail, CODING_ID_NAME (id));
6897       }
6898   }
6899
6900   return (highest ? XCAR (val) : val);
6901 }
6902
6903
6904 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
6905        2, 3, 0,
6906        doc: /* Detect coding system of the text in the region between START and END.
6907 Return a list of possible coding systems ordered by priority.
6908
6909 If only ASCII characters are found, it returns a list of single element
6910 `undecided' or its subsidiary coding system according to a detected
6911 end-of-line format.
6912
6913 If optional argument HIGHEST is non-nil, return the coding system of
6914 highest priority.  */)
6915      (start, end, highest)
6916      Lisp_Object start, end, highest;
6917 {
6918   int from, to;
6919   int from_byte, to_byte;
6920
6921   CHECK_NUMBER_COERCE_MARKER (start);
6922   CHECK_NUMBER_COERCE_MARKER (end);
6923
6924   validate_region (&start, &end);
6925   from = XINT (start), to = XINT (end);
6926   from_byte = CHAR_TO_BYTE (from);
6927   to_byte = CHAR_TO_BYTE (to);
6928
6929   if (from < GPT && to >= GPT)
6930     move_gap_both (to, to_byte);
6931
6932   return detect_coding_system (BYTE_POS_ADDR (from_byte),
6933                                to - from, to_byte - from_byte,
6934                                !NILP (highest),
6935                                !NILP (current_buffer
6936                                       ->enable_multibyte_characters),
6937                                Qnil);
6938 }
6939
6940 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
6941        1, 2, 0,
6942        doc: /* Detect coding system of the text in STRING.
6943 Return a list of possible coding systems ordered by priority.
6944
6945 If only ASCII characters are found, it returns a list of single element
6946 `undecided' or its subsidiary coding system according to a detected
6947 end-of-line format.
6948
6949 If optional argument HIGHEST is non-nil, return the coding system of
6950 highest priority.  */)
6951      (string, highest)
6952      Lisp_Object string, highest;
6953 {
6954   CHECK_STRING (string);
6955
6956   return detect_coding_system (SDATA (string),
6957                                SCHARS (string), SBYTES (string),
6958                                !NILP (highest), STRING_MULTIBYTE (string),
6959                                Qnil);
6960 }
6961
6962
6963 static INLINE int
6964 char_encodable_p (c, attrs)
6965      int c;
6966      Lisp_Object attrs;
6967 {
6968   Lisp_Object tail;
6969   struct charset *charset;
6970
6971   for (tail = CODING_ATTR_CHARSET_LIST (attrs);
6972        CONSP (tail); tail = XCDR (tail))
6973     {
6974       charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
6975       if (CHAR_CHARSET_P (c, charset))
6976         break;
6977     }
6978   return (! NILP (tail));
6979 }
6980
6981
6982 /* Return a list of coding systems that safely encode the text between
6983    START and END.  If EXCLUDE is non-nil, it is a list of coding
6984    systems not to check.  The returned list doesn't contain any such
6985    coding systems.  In any case, if the text contains only ASCII or is
6986    unibyte, return t.  */
6987
6988 DEFUN ("find-coding-systems-region-internal",
6989        Ffind_coding_systems_region_internal,
6990        Sfind_coding_systems_region_internal, 2, 3, 0,
6991        doc: /* Internal use only.  */)
6992      (start, end, exclude)
6993      Lisp_Object start, end, exclude;
6994 {
6995   Lisp_Object coding_attrs_list, safe_codings;
6996   EMACS_INT start_byte, end_byte;
6997   const unsigned char *p, *pbeg, *pend;
6998   int c;
6999   Lisp_Object tail, elt;
7000
7001   if (STRINGP (start))
7002     {
7003       if (!STRING_MULTIBYTE (start)
7004           || SCHARS (start) == SBYTES (start))
7005         return Qt;
7006       start_byte = 0;
7007       end_byte = SBYTES (start);
7008     }
7009   else
7010     {
7011       CHECK_NUMBER_COERCE_MARKER (start);
7012       CHECK_NUMBER_COERCE_MARKER (end);
7013       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
7014         args_out_of_range (start, end);
7015       if (NILP (current_buffer->enable_multibyte_characters))
7016         return Qt;
7017       start_byte = CHAR_TO_BYTE (XINT (start));
7018       end_byte = CHAR_TO_BYTE (XINT (end));
7019       if (XINT (end) - XINT (start) == end_byte - start_byte)
7020         return Qt;
7021
7022       if (XINT (start) < GPT && XINT (end) > GPT)
7023         {
7024           if ((GPT - XINT (start)) < (XINT (end) - GPT))
7025             move_gap_both (XINT (start), start_byte);
7026           else
7027             move_gap_both (XINT (end), end_byte);
7028         }
7029     }
7030
7031   coding_attrs_list = Qnil;
7032   for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
7033     if (NILP (exclude)
7034         || NILP (Fmemq (XCAR (tail), exclude)))
7035       {
7036         Lisp_Object attrs;
7037
7038         attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
7039         if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs))
7040             && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
7041           coding_attrs_list = Fcons (attrs, coding_attrs_list);
7042       }
7043
7044   if (STRINGP (start))
7045     p = pbeg = SDATA (start);
7046   else
7047     p = pbeg = BYTE_POS_ADDR (start_byte);
7048   pend = p + (end_byte - start_byte);
7049
7050   while (p < pend && ASCII_BYTE_P (*p)) p++;
7051   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
7052
7053   while (p < pend)
7054     {
7055       if (ASCII_BYTE_P (*p))
7056         p++;
7057       else
7058         {
7059           c = STRING_CHAR_ADVANCE (p);
7060
7061           charset_map_loaded = 0;
7062           for (tail = coding_attrs_list; CONSP (tail);)
7063             {
7064               elt = XCAR (tail);
7065               if (NILP (elt))
7066                 tail = XCDR (tail);
7067               else if (char_encodable_p (c, elt))
7068                 tail = XCDR (tail);
7069               else if (CONSP (XCDR (tail)))
7070                 {
7071                   XSETCAR (tail, XCAR (XCDR (tail)));
7072                   XSETCDR (tail, XCDR (XCDR (tail)));
7073                 }
7074               else
7075                 {
7076                   XSETCAR (tail, Qnil);
7077                   tail = XCDR (tail);
7078                 }
7079             }
7080           if (charset_map_loaded)
7081             {
7082               EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
7083
7084               if (STRINGP (start))
7085                 pbeg = SDATA (start);
7086               else
7087                 pbeg = BYTE_POS_ADDR (start_byte);
7088               p = pbeg + p_offset;
7089               pend = pbeg + pend_offset;
7090             }
7091         }
7092     }
7093
7094   safe_codings = Qnil;
7095   for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
7096     if (! NILP (XCAR (tail)))
7097       safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
7098
7099   return safe_codings;
7100 }
7101
7102
7103 DEFUN ("unencodable-char-position", Funencodable_char_position,
7104        Sunencodable_char_position, 3, 5, 0,
7105        doc: /*
7106 Return position of first un-encodable character in a region.
7107 START and END specfiy the region and CODING-SYSTEM specifies the
7108 encoding to check.  Return nil if CODING-SYSTEM does encode the region.
7109
7110 If optional 4th argument COUNT is non-nil, it specifies at most how
7111 many un-encodable characters to search.  In this case, the value is a
7112 list of positions.
7113
7114 If optional 5th argument STRING is non-nil, it is a string to search
7115 for un-encodable characters.  In that case, START and END are indexes
7116 to the string.  */)
7117      (start, end, coding_system, count, string)
7118      Lisp_Object start, end, coding_system, count, string;
7119 {
7120   int n;
7121   struct coding_system coding;
7122   Lisp_Object attrs, charset_list;
7123   Lisp_Object positions;
7124   int from, to;
7125   const unsigned char *p, *stop, *pend;
7126   int ascii_compatible;
7127
7128   setup_coding_system (Fcheck_coding_system (coding_system), &coding);
7129   attrs = CODING_ID_ATTRS (coding.id);
7130   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
7131     return Qnil;
7132   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
7133   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
7134
7135   if (NILP (string))
7136     {
7137       validate_region (&start, &end);
7138       from = XINT (start);
7139       to = XINT (end);
7140       if (NILP (current_buffer->enable_multibyte_characters)
7141           || (ascii_compatible
7142               && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
7143         return Qnil;
7144       p = CHAR_POS_ADDR (from);
7145       pend = CHAR_POS_ADDR (to);
7146       if (from < GPT && to >= GPT)
7147         stop = GPT_ADDR;
7148       else
7149         stop = pend;
7150     }
7151   else
7152     {
7153       CHECK_STRING (string);
7154       CHECK_NATNUM (start);
7155       CHECK_NATNUM (end);
7156       from = XINT (start);
7157       to = XINT (end);
7158       if (from > to
7159           || to > SCHARS (string))
7160         args_out_of_range_3 (string, start, end);
7161       if (! STRING_MULTIBYTE (string))
7162         return Qnil;
7163       p = SDATA (string) + string_char_to_byte (string, from);
7164       stop = pend = SDATA (string) + string_char_to_byte (string, to);
7165       if (ascii_compatible && (to - from) == (pend - p))
7166         return Qnil;
7167     }
7168
7169   if (NILP (count))
7170     n = 1;
7171   else
7172     {
7173       CHECK_NATNUM (count);
7174       n = XINT (count);
7175     }
7176
7177   positions = Qnil;
7178   while (1)
7179     {
7180       int c;
7181
7182       if (ascii_compatible)
7183         while (p < stop && ASCII_BYTE_P (*p))
7184           p++, from++;
7185       if (p >= stop)
7186         {
7187           if (p >= pend)
7188             break;
7189           stop = pend;
7190           p = GAP_END_ADDR;
7191         }
7192
7193       c = STRING_CHAR_ADVANCE (p);
7194       if (! (ASCII_CHAR_P (c) && ascii_compatible)
7195           && ! char_charset (c, charset_list, NULL))
7196         {
7197           positions = Fcons (make_number (from), positions);
7198           n--;
7199           if (n == 0)
7200             break;
7201         }
7202
7203       from++;
7204     }
7205
7206   return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
7207 }
7208
7209
7210 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
7211        Scheck_coding_systems_region, 3, 3, 0,
7212        doc: /* Check if the region is encodable by coding systems.
7213
7214 START and END are buffer positions specifying the region.
7215 CODING-SYSTEM-LIST is a list of coding systems to check.
7216
7217 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
7218 CODING-SYSTEM is a member of CODING-SYSTEM-LIst and can't encode the
7219 whole region, POS0, POS1, ... are buffer positions where non-encodable
7220 characters are found.
7221
7222 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
7223 value is nil.
7224
7225 START may be a string.  In that case, check if the string is
7226 encodable, and the value contains indices to the string instead of
7227 buffer positions.  END is ignored.  */)
7228      (start, end, coding_system_list)
7229      Lisp_Object start, end, coding_system_list;
7230 {
7231   Lisp_Object list;
7232   EMACS_INT start_byte, end_byte;
7233   int pos;
7234   const unsigned char *p, *pbeg, *pend;
7235   int c;
7236   Lisp_Object tail, elt;
7237
7238   if (STRINGP (start))
7239     {
7240       if (!STRING_MULTIBYTE (start)
7241           && SCHARS (start) != SBYTES (start))
7242         return Qnil;
7243       start_byte = 0;
7244       end_byte = SBYTES (start);
7245       pos = 0;
7246     }
7247   else
7248     {
7249       CHECK_NUMBER_COERCE_MARKER (start);
7250       CHECK_NUMBER_COERCE_MARKER (end);
7251       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
7252         args_out_of_range (start, end);
7253       if (NILP (current_buffer->enable_multibyte_characters))
7254         return Qnil;
7255       start_byte = CHAR_TO_BYTE (XINT (start));
7256       end_byte = CHAR_TO_BYTE (XINT (end));
7257       if (XINT (end) - XINT (start) == end_byte - start_byte)
7258         return Qt;
7259
7260       if (XINT (start) < GPT && XINT (end) > GPT)
7261         {
7262           if ((GPT - XINT (start)) < (XINT (end) - GPT))
7263             move_gap_both (XINT (start), start_byte);
7264           else
7265             move_gap_both (XINT (end), end_byte);
7266         }
7267       pos = XINT (start);
7268     }
7269
7270   list = Qnil;
7271   for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
7272     {
7273       elt = XCAR (tail);
7274       list = Fcons (Fcons (elt, Fcons (AREF (CODING_SYSTEM_SPEC (elt), 0),
7275                                        Qnil)),
7276                     list);
7277     }
7278
7279   if (STRINGP (start))
7280     p = pbeg = SDATA (start);
7281   else
7282     p = pbeg = BYTE_POS_ADDR (start_byte);
7283   pend = p + (end_byte - start_byte);
7284
7285   while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
7286   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
7287
7288   while (p < pend)
7289     {
7290       if (ASCII_BYTE_P (*p))
7291         p++;
7292       else
7293         {
7294           c = STRING_CHAR_ADVANCE (p);
7295
7296           charset_map_loaded = 0;
7297           for (tail = list; CONSP (tail); tail = XCDR (tail))
7298             {
7299               elt = XCDR (XCAR (tail));
7300               if (! char_encodable_p (c, XCAR (elt)))
7301                 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
7302             }
7303           if (charset_map_loaded)
7304             {
7305               EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
7306
7307               if (STRINGP (start))
7308                 pbeg = SDATA (start);
7309               else
7310                 pbeg = BYTE_POS_ADDR (start_byte);
7311               p = pbeg + p_offset;
7312               pend = pbeg + pend_offset;
7313             }
7314         }
7315       pos++;
7316     }
7317
7318   tail = list;
7319   list = Qnil;
7320   for (; CONSP (tail); tail = XCDR (tail))
7321     {
7322       elt = XCAR (tail);
7323       if (CONSP (XCDR (XCDR (elt))))
7324         list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
7325                       list);
7326     }
7327
7328   return list;
7329 }
7330
7331
7332
7333 Lisp_Object
7334 code_convert_region (start, end, coding_system, dst_object, encodep, norecord)
7335      Lisp_Object start, end, coding_system, dst_object;
7336      int encodep, norecord;
7337 {
7338   struct coding_system coding;
7339   EMACS_INT from, from_byte, to, to_byte;
7340   Lisp_Object src_object;
7341
7342   CHECK_NUMBER_COERCE_MARKER (start);
7343   CHECK_NUMBER_COERCE_MARKER (end);
7344   if (NILP (coding_system))
7345     coding_system = Qno_conversion;
7346   else
7347     CHECK_CODING_SYSTEM (coding_system);
7348   src_object = Fcurrent_buffer ();
7349   if (NILP (dst_object))
7350     dst_object = src_object;
7351   else if (! EQ (dst_object, Qt))
7352     CHECK_BUFFER (dst_object);
7353
7354   validate_region (&start, &end);
7355   from = XFASTINT (start);
7356   from_byte = CHAR_TO_BYTE (from);
7357   to = XFASTINT (end);
7358   to_byte = CHAR_TO_BYTE (to);
7359
7360   setup_coding_system (coding_system, &coding);
7361   coding.mode |= CODING_MODE_LAST_BLOCK;
7362
7363   if (encodep)
7364     encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
7365                           dst_object);
7366   else
7367     decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
7368                           dst_object);
7369   if (! norecord)
7370     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
7371
7372   if (coding.result != CODING_RESULT_SUCCESS)
7373     error ("Code conversion error: %d", coding.result);
7374
7375   return (BUFFERP (dst_object)
7376           ? make_number (coding.produced_char)
7377           : coding.dst_object);
7378 }
7379
7380
7381 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
7382        3, 4, "r\nzCoding system: ",
7383        doc: /* Decode the current region from the specified coding system.
7384 When called from a program, takes four arguments:
7385         START, END, CODING-SYSTEM, and DESTINATION.
7386 START and END are buffer positions.
7387
7388 Optional 4th arguments DESTINATION specifies where the decoded text goes.
7389 If nil, the region between START and END is replace by the decoded text.
7390 If buffer, the decoded text is inserted in the buffer.
7391 If t, the decoded text is returned.
7392
7393 This function sets `last-coding-system-used' to the precise coding system
7394 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7395 not fully specified.)
7396 It returns the length of the decoded text.  */)
7397      (start, end, coding_system, destination)
7398      Lisp_Object start, end, coding_system, destination;
7399 {
7400   return code_convert_region (start, end, coding_system, destination, 0, 0);
7401 }
7402
7403 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
7404        3, 4, "r\nzCoding system: ",
7405        doc: /* Encode the current region by specified coding system.
7406 When called from a program, takes three arguments:
7407 START, END, and CODING-SYSTEM.  START and END are buffer positions.
7408
7409 Optional 4th arguments DESTINATION specifies where the encoded text goes.
7410 If nil, the region between START and END is replace by the encoded text.
7411 If buffer, the encoded text is inserted in the buffer.
7412 If t, the encoded text is returned.
7413
7414 This function sets `last-coding-system-used' to the precise coding system
7415 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7416 not fully specified.)
7417 It returns the length of the encoded text.  */)
7418   (start, end, coding_system, destination)
7419      Lisp_Object start, end, coding_system, destination;
7420 {
7421   return code_convert_region (start, end, coding_system, destination, 1, 0);
7422 }
7423
7424 Lisp_Object
7425 code_convert_string (string, coding_system, dst_object,
7426                      encodep, nocopy, norecord)
7427      Lisp_Object string, coding_system, dst_object;
7428      int encodep, nocopy, norecord;
7429 {
7430   struct coding_system coding;
7431   EMACS_INT chars, bytes;
7432
7433   CHECK_STRING (string);
7434   if (NILP (coding_system))
7435     {
7436       if (! norecord)
7437         Vlast_coding_system_used = Qno_conversion;
7438       if (NILP (dst_object))
7439         return (nocopy ? Fcopy_sequence (string) : string);
7440     }
7441
7442   if (NILP (coding_system))
7443     coding_system = Qno_conversion;
7444   else
7445     CHECK_CODING_SYSTEM (coding_system);
7446   if (NILP (dst_object))
7447     dst_object = Qt;
7448   else if (! EQ (dst_object, Qt))
7449     CHECK_BUFFER (dst_object);
7450
7451   setup_coding_system (coding_system, &coding);
7452   coding.mode |= CODING_MODE_LAST_BLOCK;
7453   chars = SCHARS (string);
7454   bytes = SBYTES (string);
7455   if (encodep)
7456     encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
7457   else
7458     decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
7459   if (! norecord)
7460     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
7461
7462   if (coding.result != CODING_RESULT_SUCCESS)
7463     error ("Code conversion error: %d", coding.result);
7464
7465   return (BUFFERP (dst_object)
7466           ? make_number (coding.produced_char)
7467           : coding.dst_object);
7468 }
7469
7470
7471 /* Encode or decode STRING according to CODING_SYSTEM.
7472    Do not set Vlast_coding_system_used.
7473
7474    This function is called only from macros DECODE_FILE and
7475    ENCODE_FILE, thus we ignore character composition.  */
7476
7477 Lisp_Object
7478 code_convert_string_norecord (string, coding_system, encodep)
7479      Lisp_Object string, coding_system;
7480      int encodep;
7481 {
7482   return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
7483 }
7484
7485
7486 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
7487        2, 4, 0,
7488        doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
7489
7490 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
7491 if the decoding operation is trivial.
7492
7493 Optional fourth arg BUFFER non-nil meant that the decoded text is
7494 inserted in BUFFER instead of returned as a string.  In this case,
7495 the return value is BUFFER.
7496
7497 This function sets `last-coding-system-used' to the precise coding system
7498 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7499 not fully specified.  */)
7500   (string, coding_system, nocopy, buffer)
7501      Lisp_Object string, coding_system, nocopy, buffer;
7502 {
7503   return code_convert_string (string, coding_system, buffer,
7504                               0, ! NILP (nocopy), 0);
7505 }
7506
7507 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
7508        2, 4, 0,
7509        doc: /* Encode STRING to CODING-SYSTEM, and return the result.
7510
7511 Optional third arg NOCOPY non-nil means it is OK to return STRING
7512 itself if the encoding operation is trivial.
7513
7514 Optional fourth arg BUFFER non-nil meant that the encoded text is
7515 inserted in BUFFER instead of returned as a string.  In this case,
7516 the return value is BUFFER.
7517
7518 This function sets `last-coding-system-used' to the precise coding system
7519 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7520 not fully specified.)  */)
7521      (string, coding_system, nocopy, buffer)
7522      Lisp_Object string, coding_system, nocopy, buffer;
7523 {
7524   return code_convert_string (string, coding_system, buffer,
7525                               1, ! NILP (nocopy), 1);
7526 }
7527
7528 \f
7529 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
7530        doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
7531 Return the corresponding character.  */)
7532      (code)
7533      Lisp_Object code;
7534 {
7535   Lisp_Object spec, attrs, val;
7536   struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
7537   int c;
7538
7539   CHECK_NATNUM (code);
7540   c = XFASTINT (code);
7541   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
7542   attrs = AREF (spec, 0);
7543
7544   if (ASCII_BYTE_P (c)
7545       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
7546     return code;
7547
7548   val = CODING_ATTR_CHARSET_LIST (attrs);
7549   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
7550   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
7551   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
7552
7553   if (c <= 0x7F)
7554     charset = charset_roman;
7555   else if (c >= 0xA0 && c < 0xDF)
7556     {
7557       charset = charset_kana;
7558       c -= 0x80;
7559     }
7560   else
7561     {
7562       int s1 = c >> 8, s2 = c & 0xFF;
7563
7564       if (s1 < 0x81 || (s1 > 0x9F && s1 < 0xE0) || s1 > 0xEF
7565           || s2 < 0x40 || s2 == 0x7F || s2 > 0xFC)
7566         error ("Invalid code: %d", code);
7567       SJIS_TO_JIS (c);
7568       charset = charset_kanji;
7569     }
7570   c = DECODE_CHAR (charset, c);
7571   if (c < 0)
7572     error ("Invalid code: %d", code);
7573   return make_number (c);
7574 }
7575
7576
7577 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
7578        doc: /* Encode a Japanese character CHAR to shift_jis encoding.
7579 Return the corresponding code in SJIS.  */)
7580      (ch)
7581     Lisp_Object ch;
7582 {
7583   Lisp_Object spec, attrs, charset_list;
7584   int c;
7585   struct charset *charset;
7586   unsigned code;
7587
7588   CHECK_CHARACTER (ch);
7589   c = XFASTINT (ch);
7590   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
7591   attrs = AREF (spec, 0);
7592
7593   if (ASCII_CHAR_P (c)
7594       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
7595     return ch;
7596
7597   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
7598   charset = char_charset (c, charset_list, &code);
7599   if (code == CHARSET_INVALID_CODE (charset))
7600     error ("Can't encode by shift_jis encoding: %d", c);
7601   JIS_TO_SJIS (code);
7602
7603   return make_number (code);
7604 }
7605
7606 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
7607        doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
7608 Return the corresponding character.  */)
7609      (code)
7610      Lisp_Object code;
7611 {
7612   Lisp_Object spec, attrs, val;
7613   struct charset *charset_roman, *charset_big5, *charset;
7614   int c;
7615
7616   CHECK_NATNUM (code);
7617   c = XFASTINT (code);
7618   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
7619   attrs = AREF (spec, 0);
7620
7621   if (ASCII_BYTE_P (c)
7622       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
7623     return code;
7624
7625   val = CODING_ATTR_CHARSET_LIST (attrs);
7626   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
7627   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
7628
7629   if (c <= 0x7F)
7630     charset = charset_roman;
7631   else
7632     {
7633       int b1 = c >> 8, b2 = c & 0x7F;
7634       if (b1 < 0xA1 || b1 > 0xFE
7635           || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
7636         error ("Invalid code: %d", code);
7637       charset = charset_big5;
7638     }
7639   c = DECODE_CHAR (charset, (unsigned )c);
7640   if (c < 0)
7641     error ("Invalid code: %d", code);
7642   return make_number (c);
7643 }
7644
7645 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
7646        doc: /* Encode the Big5 character CHAR to BIG5 coding system.
7647 Return the corresponding character code in Big5.  */)
7648      (ch)
7649      Lisp_Object ch;
7650 {
7651   Lisp_Object spec, attrs, charset_list;
7652   struct charset *charset;
7653   int c;
7654   unsigned code;
7655
7656   CHECK_CHARACTER (ch);
7657   c = XFASTINT (ch);
7658   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
7659   attrs = AREF (spec, 0);
7660   if (ASCII_CHAR_P (c)
7661       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
7662     return ch;
7663
7664   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
7665   charset = char_charset (c, charset_list, &code);
7666   if (code == CHARSET_INVALID_CODE (charset))
7667     error ("Can't encode by Big5 encoding: %d", c);
7668
7669   return make_number (code);
7670 }
7671
7672 \f
7673 DEFUN ("set-terminal-coding-system-internal",
7674        Fset_terminal_coding_system_internal,
7675        Sset_terminal_coding_system_internal, 1, 1, 0,
7676        doc: /* Internal use only.  */)
7677      (coding_system)
7678      Lisp_Object coding_system;
7679 {
7680   CHECK_SYMBOL (coding_system);
7681   setup_coding_system (Fcheck_coding_system (coding_system),
7682                         &terminal_coding);
7683
7684   /* We had better not send unsafe characters to terminal.  */
7685   terminal_coding.mode |= CODING_MODE_SAFE_ENCODING;
7686   /* Characer composition should be disabled.  */
7687   terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7688   terminal_coding.src_multibyte = 1;
7689   terminal_coding.dst_multibyte = 0;
7690   return Qnil;
7691 }
7692
7693 DEFUN ("set-safe-terminal-coding-system-internal",
7694        Fset_safe_terminal_coding_system_internal,
7695        Sset_safe_terminal_coding_system_internal, 1, 1, 0,
7696        doc: /* Internal use only.  */)
7697      (coding_system)
7698      Lisp_Object coding_system;
7699 {
7700   CHECK_SYMBOL (coding_system);
7701   setup_coding_system (Fcheck_coding_system (coding_system),
7702                        &safe_terminal_coding);
7703   /* Characer composition should be disabled.  */
7704   safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7705   safe_terminal_coding.src_multibyte = 1;
7706   safe_terminal_coding.dst_multibyte = 0;
7707   return Qnil;
7708 }
7709
7710 DEFUN ("terminal-coding-system",
7711        Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
7712        doc: /* Return coding system specified for terminal output.  */)
7713      ()
7714 {
7715   return CODING_ID_NAME (terminal_coding.id);
7716 }
7717
7718 DEFUN ("set-keyboard-coding-system-internal",
7719        Fset_keyboard_coding_system_internal,
7720        Sset_keyboard_coding_system_internal, 1, 1, 0,
7721        doc: /* Internal use only.  */)
7722      (coding_system)
7723      Lisp_Object coding_system;
7724 {
7725   CHECK_SYMBOL (coding_system);
7726   setup_coding_system (Fcheck_coding_system (coding_system),
7727                        &keyboard_coding);
7728   /* Characer composition should be disabled.  */
7729   keyboard_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7730   return Qnil;
7731 }
7732
7733 DEFUN ("keyboard-coding-system",
7734        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
7735        doc: /* Return coding system specified for decoding keyboard input.  */)
7736      ()
7737 {
7738   return CODING_ID_NAME (keyboard_coding.id);
7739 }
7740
7741 \f
7742 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
7743        Sfind_operation_coding_system,  1, MANY, 0,
7744        doc: /* Choose a coding system for an operation based on the target name.
7745 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
7746 DECODING-SYSTEM is the coding system to use for decoding
7747 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
7748 for encoding (in case OPERATION does encoding).
7749
7750 The first argument OPERATION specifies an I/O primitive:
7751   For file I/O, `insert-file-contents' or `write-region'.
7752   For process I/O, `call-process', `call-process-region', or `start-process'.
7753   For network I/O, `open-network-stream'.
7754
7755 The remaining arguments should be the same arguments that were passed
7756 to the primitive.  Depending on which primitive, one of those arguments
7757 is selected as the TARGET.  For example, if OPERATION does file I/O,
7758 whichever argument specifies the file name is TARGET.
7759
7760 TARGET has a meaning which depends on OPERATION:
7761   For file I/O, TARGET is a file name.
7762   For process I/O, TARGET is a process name.
7763   For network I/O, TARGET is a service name or a port number
7764
7765 This function looks up what specified for TARGET in,
7766 `file-coding-system-alist', `process-coding-system-alist',
7767 or `network-coding-system-alist' depending on OPERATION.
7768 They may specify a coding system, a cons of coding systems,
7769 or a function symbol to call.
7770 In the last case, we call the function with one argument,
7771 which is a list of all the arguments given to this function.
7772
7773 usage: (find-operation-coding-system OPERATION ARGUMENTS ...)  */)
7774      (nargs, args)
7775      int nargs;
7776      Lisp_Object *args;
7777 {
7778   Lisp_Object operation, target_idx, target, val;
7779   register Lisp_Object chain;
7780
7781   if (nargs < 2)
7782     error ("Too few arguments");
7783   operation = args[0];
7784   if (!SYMBOLP (operation)
7785       || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
7786     error ("Invalid first arguement");
7787   if (nargs < 1 + XINT (target_idx))
7788     error ("Too few arguments for operation: %s",
7789            SDATA (SYMBOL_NAME (operation)));
7790   target = args[XINT (target_idx) + 1];
7791   if (!(STRINGP (target)
7792         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
7793     error ("Invalid %dth argument", XINT (target_idx) + 1);
7794
7795   chain = ((EQ (operation, Qinsert_file_contents)
7796             || EQ (operation, Qwrite_region))
7797            ? Vfile_coding_system_alist
7798            : (EQ (operation, Qopen_network_stream)
7799               ? Vnetwork_coding_system_alist
7800               : Vprocess_coding_system_alist));
7801   if (NILP (chain))
7802     return Qnil;
7803
7804   for (; CONSP (chain); chain = XCDR (chain))
7805     {
7806       Lisp_Object elt;
7807
7808       elt = XCAR (chain);
7809       if (CONSP (elt)
7810           && ((STRINGP (target)
7811                && STRINGP (XCAR (elt))
7812                && fast_string_match (XCAR (elt), target) >= 0)
7813               || (INTEGERP (target) && EQ (target, XCAR (elt)))))
7814         {
7815           val = XCDR (elt);
7816           /* Here, if VAL is both a valid coding system and a valid
7817              function symbol, we return VAL as a coding system.  */
7818           if (CONSP (val))
7819             return val;
7820           if (! SYMBOLP (val))
7821             return Qnil;
7822           if (! NILP (Fcoding_system_p (val)))
7823             return Fcons (val, val);
7824           if (! NILP (Ffboundp (val)))
7825             {
7826               val = call1 (val, Flist (nargs, args));
7827               if (CONSP (val))
7828                 return val;
7829               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
7830                 return Fcons (val, val);
7831             }
7832           return Qnil;
7833         }
7834     }
7835   return Qnil;
7836 }
7837
7838 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
7839        Sset_coding_system_priority, 0, MANY, 0,
7840        doc: /* Assign higher priority to the coding systems given as arguments.
7841 If multiple coding systems belongs to the same category,
7842 all but the first one are ignored.
7843
7844 usage: (set-coding-system-priority ...)  */)
7845      (nargs, args)
7846      int nargs;
7847      Lisp_Object *args;
7848 {
7849   int i, j;
7850   int changed[coding_category_max];
7851   enum coding_category priorities[coding_category_max];
7852
7853   bzero (changed, sizeof changed);
7854
7855   for (i = j = 0; i < nargs; i++)
7856     {
7857       enum coding_category category;
7858       Lisp_Object spec, attrs;
7859
7860       CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
7861       attrs = AREF (spec, 0);
7862       category = XINT (CODING_ATTR_CATEGORY (attrs));
7863       if (changed[category])
7864         /* Ignore this coding system because a coding system of the
7865            same category already had a higher priority.  */
7866         continue;
7867       changed[category] = 1;
7868       priorities[j++] = category;
7869       if (coding_categories[category].id >= 0
7870           && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
7871         setup_coding_system (args[i], &coding_categories[category]);
7872       Fset (AREF (Vcoding_category_table, category), args[i]);
7873     }
7874
7875   /* Now we have decided top J priorities.  Reflect the order of the
7876      original priorities to the remaining priorities.  */
7877
7878   for (i = j, j = 0; i < coding_category_max; i++, j++)
7879     {
7880       while (j < coding_category_max
7881              && changed[coding_priorities[j]])
7882         j++;
7883       if (j == coding_category_max)
7884         abort ();
7885       priorities[i] = coding_priorities[j];
7886     }
7887
7888   bcopy (priorities, coding_priorities, sizeof priorities);
7889
7890   /* Update `coding-category-list'.  */
7891   Vcoding_category_list = Qnil;
7892   for (i = coding_category_max - 1; i >= 0; i--)
7893     Vcoding_category_list
7894       = Fcons (AREF (Vcoding_category_table, priorities[i]),
7895                Vcoding_category_list);
7896
7897   return Qnil;
7898 }
7899
7900 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
7901        Scoding_system_priority_list, 0, 1, 0,
7902        doc: /* Return a list of coding systems ordered by their priorities.
7903 HIGHESTP non-nil means just return the highest priority one.  */)
7904      (highestp)
7905      Lisp_Object highestp;
7906 {
7907   int i;
7908   Lisp_Object val;
7909
7910   for (i = 0, val = Qnil; i < coding_category_max; i++)
7911     {
7912       enum coding_category category = coding_priorities[i];
7913       int id = coding_categories[category].id;
7914       Lisp_Object attrs;
7915
7916       if (id < 0)
7917         continue;
7918       attrs = CODING_ID_ATTRS (id);
7919       if (! NILP (highestp))
7920         return CODING_ATTR_BASE_NAME (attrs);
7921       val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
7922     }
7923   return Fnreverse (val);
7924 }
7925
7926 static char *suffixes[] = { "-unix", "-dos", "-mac" };
7927
7928 static Lisp_Object
7929 make_subsidiaries (base)
7930      Lisp_Object base;
7931 {
7932   Lisp_Object subsidiaries;
7933   int base_name_len = SBYTES (SYMBOL_NAME (base));
7934   char *buf = (char *) alloca (base_name_len + 6);
7935   int i;
7936
7937   bcopy (SDATA (SYMBOL_NAME (base)), buf, base_name_len);
7938   subsidiaries = Fmake_vector (make_number (3), Qnil);
7939   for (i = 0; i < 3; i++)
7940     {
7941       bcopy (suffixes[i], buf + base_name_len, strlen (suffixes[i]) + 1);
7942       ASET (subsidiaries, i, intern (buf));
7943     }
7944   return subsidiaries;
7945 }
7946
7947
7948 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
7949        Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
7950        doc: /* For internal use only.
7951 usage: (define-coding-system-internal ...)  */)
7952      (nargs, args)
7953      int nargs;
7954      Lisp_Object *args;
7955 {
7956   Lisp_Object name;
7957   Lisp_Object spec_vec;         /* [ ATTRS ALIASE EOL_TYPE ] */
7958   Lisp_Object attrs;            /* Vector of attributes.  */
7959   Lisp_Object eol_type;
7960   Lisp_Object aliases;
7961   Lisp_Object coding_type, charset_list, safe_charsets;
7962   enum coding_category category;
7963   Lisp_Object tail, val;
7964   int max_charset_id = 0;
7965   int i;
7966
7967   if (nargs < coding_arg_max)
7968     goto short_args;
7969
7970   attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
7971
7972   name = args[coding_arg_name];
7973   CHECK_SYMBOL (name);
7974   CODING_ATTR_BASE_NAME (attrs) = name;
7975
7976   val = args[coding_arg_mnemonic];
7977   if (! STRINGP (val))
7978     CHECK_CHARACTER (val);
7979   CODING_ATTR_MNEMONIC (attrs) = val;
7980
7981   coding_type = args[coding_arg_coding_type];
7982   CHECK_SYMBOL (coding_type);
7983   CODING_ATTR_TYPE (attrs) = coding_type;
7984
7985   charset_list = args[coding_arg_charset_list];
7986   if (SYMBOLP (charset_list))
7987     {
7988       if (EQ (charset_list, Qiso_2022))
7989         {
7990           if (! EQ (coding_type, Qiso_2022))
7991             error ("Invalid charset-list");
7992           charset_list = Viso_2022_charset_list;
7993         }
7994       else if (EQ (charset_list, Qemacs_mule))
7995         {
7996           if (! EQ (coding_type, Qemacs_mule))
7997             error ("Invalid charset-list");
7998           charset_list = Vemacs_mule_charset_list;
7999         }
8000       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
8001         if (max_charset_id < XFASTINT (XCAR (tail)))
8002           max_charset_id = XFASTINT (XCAR (tail));
8003     }
8004   else
8005     {
8006       charset_list = Fcopy_sequence (charset_list);
8007       for (tail = charset_list; !NILP (tail); tail = Fcdr (tail))
8008         {
8009           struct charset *charset;
8010
8011           val = Fcar (tail);
8012           CHECK_CHARSET_GET_CHARSET (val, charset);
8013           if (EQ (coding_type, Qiso_2022)
8014               ? CHARSET_ISO_FINAL (charset) < 0
8015               : EQ (coding_type, Qemacs_mule)
8016               ? CHARSET_EMACS_MULE_ID (charset) < 0
8017               : 0)
8018             error ("Can't handle charset `%s'",
8019                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8020
8021           XSETCAR (tail, make_number (charset->id));
8022           if (max_charset_id < charset->id)
8023             max_charset_id = charset->id;
8024         }
8025     }
8026   CODING_ATTR_CHARSET_LIST (attrs) = charset_list;
8027
8028   safe_charsets = Fmake_string (make_number (max_charset_id + 1),
8029                                 make_number (255));
8030   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
8031     SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
8032   CODING_ATTR_SAFE_CHARSETS (attrs) = safe_charsets;
8033
8034   CODING_ATTR_ASCII_COMPAT (attrs) = args[coding_arg_ascii_compatible_p];
8035
8036   val = args[coding_arg_decode_translation_table];
8037   if (! NILP (val))
8038     CHECK_CHAR_TABLE (val);
8039   CODING_ATTR_DECODE_TBL (attrs) = val;
8040
8041   val = args[coding_arg_encode_translation_table];
8042   if (! NILP (val))
8043     CHECK_CHAR_TABLE (val);
8044   CODING_ATTR_ENCODE_TBL (attrs) = val;
8045
8046   val = args[coding_arg_post_read_conversion];
8047   CHECK_SYMBOL (val);
8048   CODING_ATTR_POST_READ (attrs) = val;
8049
8050   val = args[coding_arg_pre_write_conversion];
8051   CHECK_SYMBOL (val);
8052   CODING_ATTR_PRE_WRITE (attrs) = val;
8053
8054   val = args[coding_arg_default_char];
8055   if (NILP (val))
8056     CODING_ATTR_DEFAULT_CHAR (attrs) = make_number (' ');
8057   else
8058     {
8059       CHECK_CHARACTER (val);
8060       CODING_ATTR_DEFAULT_CHAR (attrs) = val;
8061     }
8062
8063   val = args[coding_arg_for_unibyte];
8064   CODING_ATTR_FOR_UNIBYTE (attrs) = NILP (val) ? Qnil : Qt;
8065
8066   val = args[coding_arg_plist];
8067   CHECK_LIST (val);
8068   CODING_ATTR_PLIST (attrs) = val;
8069
8070   if (EQ (coding_type, Qcharset))
8071     {
8072       /* Generate a lisp vector of 256 elements.  Each element is nil,
8073          integer, or a list of charset IDs.
8074
8075          If Nth element is nil, the byte code N is invalid in this
8076          coding system.
8077
8078          If Nth element is a number NUM, N is the first byte of a
8079          charset whose ID is NUM.
8080
8081          If Nth element is a list of charset IDs, N is the first byte
8082          of one of them.  The list is sorted by dimensions of the
8083          charsets.  A charset of smaller dimension comes firtst. */
8084       val = Fmake_vector (make_number (256), Qnil);
8085
8086       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
8087         {
8088           struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
8089           int dim = CHARSET_DIMENSION (charset);
8090           int idx = (dim - 1) * 4;
8091
8092           if (CHARSET_ASCII_COMPATIBLE_P (charset))
8093             CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8094
8095           for (i = charset->code_space[idx];
8096                i <= charset->code_space[idx + 1]; i++)
8097             {
8098               Lisp_Object tmp, tmp2;
8099               int dim2;
8100
8101               tmp = AREF (val, i);
8102               if (NILP (tmp))
8103                 tmp = XCAR (tail);
8104               else if (NUMBERP (tmp))
8105                 {
8106                   dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
8107                   if (dim < dim2)
8108                     tmp = Fcons (XCAR (tail), Fcons (tmp, Qnil));
8109                   else
8110                     tmp = Fcons (tmp, Fcons (XCAR (tail), Qnil));
8111                 }
8112               else
8113                 {
8114                   for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
8115                     {
8116                       dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
8117                       if (dim < dim2)
8118                         break;
8119                     }
8120                   if (NILP (tmp2))
8121                     tmp = nconc2 (tmp, Fcons (XCAR (tail), Qnil));
8122                   else
8123                     {
8124                       XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
8125                       XSETCAR (tmp2, XCAR (tail));
8126                     }
8127                 }
8128               ASET (val, i, tmp);
8129             }
8130         }
8131       ASET (attrs, coding_attr_charset_valids, val);
8132       category = coding_category_charset;
8133     }
8134   else if (EQ (coding_type, Qccl))
8135     {
8136       Lisp_Object valids;
8137
8138       if (nargs < coding_arg_ccl_max)
8139         goto short_args;
8140
8141       val = args[coding_arg_ccl_decoder];
8142       CHECK_CCL_PROGRAM (val);
8143       if (VECTORP (val))
8144         val = Fcopy_sequence (val);
8145       ASET (attrs, coding_attr_ccl_decoder, val);
8146
8147       val = args[coding_arg_ccl_encoder];
8148       CHECK_CCL_PROGRAM (val);
8149       if (VECTORP (val))
8150         val = Fcopy_sequence (val);
8151       ASET (attrs, coding_attr_ccl_encoder, val);
8152
8153       val = args[coding_arg_ccl_valids];
8154       valids = Fmake_string (make_number (256), make_number (0));
8155       for (tail = val; !NILP (tail); tail = Fcdr (tail))
8156         {
8157           int from, to;
8158
8159           val = Fcar (tail);
8160           if (INTEGERP (val))
8161             {
8162               from = to = XINT (val);
8163               if (from < 0 || from > 255)
8164                 args_out_of_range_3 (val, make_number (0), make_number (255));
8165             }
8166           else
8167             {
8168               CHECK_CONS (val);
8169               CHECK_NATNUM_CAR (val);
8170               CHECK_NATNUM_CDR (val);
8171               from = XINT (XCAR (val));
8172               if (from > 255)
8173                 args_out_of_range_3 (XCAR (val),
8174                                      make_number (0), make_number (255));
8175               to = XINT (XCDR (val));
8176               if (to < from || to > 255)
8177                 args_out_of_range_3 (XCDR (val),
8178                                      XCAR (val), make_number (255));
8179             }
8180           for (i = from; i <= to; i++)
8181             SSET (valids, i, 1);
8182         }
8183       ASET (attrs, coding_attr_ccl_valids, valids);
8184
8185       category = coding_category_ccl;
8186     }
8187   else if (EQ (coding_type, Qutf_16))
8188     {
8189       Lisp_Object bom, endian;
8190
8191       CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
8192
8193       if (nargs < coding_arg_utf16_max)
8194         goto short_args;
8195
8196       bom = args[coding_arg_utf16_bom];
8197       if (! NILP (bom) && ! EQ (bom, Qt))
8198         {
8199           CHECK_CONS (bom);
8200           val = XCAR (bom);
8201           CHECK_CODING_SYSTEM (val);
8202           val = XCDR (bom);
8203           CHECK_CODING_SYSTEM (val);
8204         }
8205       ASET (attrs, coding_attr_utf_16_bom, bom);
8206
8207       endian = args[coding_arg_utf16_endian];
8208       CHECK_SYMBOL (endian);
8209       if (NILP (endian))
8210         endian = Qbig;
8211       else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
8212         error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
8213       ASET (attrs, coding_attr_utf_16_endian, endian);
8214
8215       category = (CONSP (bom)
8216                   ? coding_category_utf_16_auto
8217                   : NILP (bom)
8218                   ? (EQ (endian, Qbig)
8219                      ? coding_category_utf_16_be_nosig
8220                      : coding_category_utf_16_le_nosig)
8221                   : (EQ (endian, Qbig)
8222                      ? coding_category_utf_16_be
8223                      : coding_category_utf_16_le));
8224     }
8225   else if (EQ (coding_type, Qiso_2022))
8226     {
8227       Lisp_Object initial, reg_usage, request, flags;
8228       int i;
8229
8230       if (nargs < coding_arg_iso2022_max)
8231         goto short_args;
8232
8233       initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
8234       CHECK_VECTOR (initial);
8235       for (i = 0; i < 4; i++)
8236         {
8237           val = Faref (initial, make_number (i));
8238           if (! NILP (val))
8239             {
8240               struct charset *charset;
8241
8242               CHECK_CHARSET_GET_CHARSET (val, charset);
8243               ASET (initial, i, make_number (CHARSET_ID (charset)));
8244               if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
8245                 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8246             }
8247           else
8248             ASET (initial, i, make_number (-1));
8249         }
8250
8251       reg_usage = args[coding_arg_iso2022_reg_usage];
8252       CHECK_CONS (reg_usage);
8253       CHECK_NUMBER_CAR (reg_usage);
8254       CHECK_NUMBER_CDR (reg_usage);
8255
8256       request = Fcopy_sequence (args[coding_arg_iso2022_request]);
8257       for (tail = request; ! NILP (tail); tail = Fcdr (tail))
8258         {
8259           int id;
8260           Lisp_Object tmp;
8261
8262           val = Fcar (tail);
8263           CHECK_CONS (val);
8264           tmp = XCAR (val);
8265           CHECK_CHARSET_GET_ID (tmp, id);
8266           CHECK_NATNUM_CDR (val);
8267           if (XINT (XCDR (val)) >= 4)
8268             error ("Invalid graphic register number: %d", XINT (XCDR (val)));
8269           XSETCAR (val, make_number (id));
8270         }
8271
8272       flags = args[coding_arg_iso2022_flags];
8273       CHECK_NATNUM (flags);
8274       i = XINT (flags);
8275       if (EQ (args[coding_arg_charset_list], Qiso_2022))
8276         flags = make_number (i | CODING_ISO_FLAG_FULL_SUPPORT);
8277
8278       ASET (attrs, coding_attr_iso_initial, initial);
8279       ASET (attrs, coding_attr_iso_usage, reg_usage);
8280       ASET (attrs, coding_attr_iso_request, request);
8281       ASET (attrs, coding_attr_iso_flags, flags);
8282       setup_iso_safe_charsets (attrs);
8283
8284       if (i & CODING_ISO_FLAG_SEVEN_BITS)
8285         category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
8286                           | CODING_ISO_FLAG_SINGLE_SHIFT))
8287                     ? coding_category_iso_7_else
8288                     : EQ (args[coding_arg_charset_list], Qiso_2022)
8289                     ? coding_category_iso_7
8290                     : coding_category_iso_7_tight);
8291       else
8292         {
8293           int id = XINT (AREF (initial, 1));
8294
8295           category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
8296                        || EQ (args[coding_arg_charset_list], Qiso_2022)
8297                        || id < 0)
8298                       ? coding_category_iso_8_else
8299                       : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
8300                       ? coding_category_iso_8_1
8301                       : coding_category_iso_8_2);
8302         }
8303       if (category != coding_category_iso_8_1
8304           && category != coding_category_iso_8_2)
8305         CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
8306     }
8307   else if (EQ (coding_type, Qemacs_mule))
8308     {
8309       if (EQ (args[coding_arg_charset_list], Qemacs_mule))
8310         ASET (attrs, coding_attr_emacs_mule_full, Qt);
8311       CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8312       category = coding_category_emacs_mule;
8313     }
8314   else if (EQ (coding_type, Qshift_jis))
8315     {
8316
8317       struct charset *charset;
8318
8319       if (XINT (Flength (charset_list)) != 3)
8320         error ("There should be just three charsets");
8321
8322       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8323       if (CHARSET_DIMENSION (charset) != 1)
8324         error ("Dimension of charset %s is not one",
8325                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8326       if (CHARSET_ASCII_COMPATIBLE_P (charset))
8327         CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8328
8329       charset_list = XCDR (charset_list);
8330       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8331       if (CHARSET_DIMENSION (charset) != 1)
8332         error ("Dimension of charset %s is not one",
8333                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8334
8335       charset_list = XCDR (charset_list);
8336       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8337       if (CHARSET_DIMENSION (charset) != 2)
8338         error ("Dimension of charset %s is not two",
8339                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8340
8341       category = coding_category_sjis;
8342       Vsjis_coding_system = name;
8343     }
8344   else if (EQ (coding_type, Qbig5))
8345     {
8346       struct charset *charset;
8347
8348       if (XINT (Flength (charset_list)) != 2)
8349         error ("There should be just two charsets");
8350
8351       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8352       if (CHARSET_DIMENSION (charset) != 1)
8353         error ("Dimension of charset %s is not one",
8354                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8355       if (CHARSET_ASCII_COMPATIBLE_P (charset))
8356         CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8357
8358       charset_list = XCDR (charset_list);
8359       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8360       if (CHARSET_DIMENSION (charset) != 2)
8361         error ("Dimension of charset %s is not two",
8362                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8363
8364       category = coding_category_big5;
8365       Vbig5_coding_system = name;
8366     }
8367   else if (EQ (coding_type, Qraw_text))
8368     {
8369       category = coding_category_raw_text;
8370       CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8371     }
8372   else if (EQ (coding_type, Qutf_8))
8373     {
8374       category = coding_category_utf_8;
8375       CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8376     }
8377   else if (EQ (coding_type, Qundecided))
8378     category = coding_category_undecided;
8379   else
8380     error ("Invalid coding system type: %s",
8381            SDATA (SYMBOL_NAME (coding_type)));
8382
8383   CODING_ATTR_CATEGORY (attrs) = make_number (category);
8384   CODING_ATTR_PLIST (attrs)
8385     = Fcons (QCcategory, Fcons (AREF (Vcoding_category_table, category),
8386                                 CODING_ATTR_PLIST (attrs)));
8387
8388   eol_type = args[coding_arg_eol_type];
8389   if (! NILP (eol_type)
8390       && ! EQ (eol_type, Qunix)
8391       && ! EQ (eol_type, Qdos)
8392       && ! EQ (eol_type, Qmac))
8393     error ("Invalid eol-type");
8394
8395   aliases = Fcons (name, Qnil);
8396
8397   if (NILP (eol_type))
8398     {
8399       eol_type = make_subsidiaries (name);
8400       for (i = 0; i < 3; i++)
8401         {
8402           Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
8403
8404           this_name = AREF (eol_type, i);
8405           this_aliases = Fcons (this_name, Qnil);
8406           this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
8407           this_spec = Fmake_vector (make_number (3), attrs);
8408           ASET (this_spec, 1, this_aliases);
8409           ASET (this_spec, 2, this_eol_type);
8410           Fputhash (this_name, this_spec, Vcoding_system_hash_table);
8411           Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
8412           Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
8413                                         Vcoding_system_alist);
8414         }
8415     }
8416
8417   spec_vec = Fmake_vector (make_number (3), attrs);
8418   ASET (spec_vec, 1, aliases);
8419   ASET (spec_vec, 2, eol_type);
8420
8421   Fputhash (name, spec_vec, Vcoding_system_hash_table);
8422   Vcoding_system_list = Fcons (name, Vcoding_system_list);
8423   Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
8424                                 Vcoding_system_alist);
8425
8426   {
8427     int id = coding_categories[category].id;
8428
8429     if (id < 0 || EQ (name, CODING_ID_NAME (id)))
8430       setup_coding_system (name, &coding_categories[category]);
8431   }
8432
8433   return Qnil;
8434
8435  short_args:
8436   return Fsignal (Qwrong_number_of_arguments,
8437                   Fcons (intern ("define-coding-system-internal"),
8438                          make_number (nargs)));
8439 }
8440
8441
8442 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
8443        Sdefine_coding_system_alias, 2, 2, 0,
8444        doc: /* Define ALIAS as an alias for CODING-SYSTEM.  */)
8445      (alias, coding_system)
8446      Lisp_Object alias, coding_system;
8447 {
8448   Lisp_Object spec, aliases, eol_type;
8449
8450   CHECK_SYMBOL (alias);
8451   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
8452   aliases = AREF (spec, 1);
8453   /* ALISES should be a list of length more than zero, and the first
8454      element is a base coding system.  Append ALIAS at the tail of the
8455      list.  */
8456   while (!NILP (XCDR (aliases)))
8457     aliases = XCDR (aliases);
8458   XSETCDR (aliases, Fcons (alias, Qnil));
8459
8460   eol_type = AREF (spec, 2);
8461   if (VECTORP (eol_type))
8462     {
8463       Lisp_Object subsidiaries;
8464       int i;
8465
8466       subsidiaries = make_subsidiaries (alias);
8467       for (i = 0; i < 3; i++)
8468         Fdefine_coding_system_alias (AREF (subsidiaries, i),
8469                                      AREF (eol_type, i));
8470     }
8471
8472   Fputhash (alias, spec, Vcoding_system_hash_table);
8473   Vcoding_system_list = Fcons (alias, Vcoding_system_list);
8474   Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
8475                                 Vcoding_system_alist);
8476
8477   return Qnil;
8478 }
8479
8480 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
8481        1, 1, 0,
8482        doc: /* Return the base of CODING-SYSTEM.
8483 Any alias or subsidiary coding system is not a base coding system.  */)
8484   (coding_system)
8485      Lisp_Object coding_system;
8486 {
8487   Lisp_Object spec, attrs;
8488
8489   if (NILP (coding_system))
8490     return (Qno_conversion);
8491   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
8492   attrs = AREF (spec, 0);
8493   return CODING_ATTR_BASE_NAME (attrs);
8494 }
8495
8496 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
8497        1, 1, 0,
8498        doc: "Return the property list of CODING-SYSTEM.")
8499      (coding_system)
8500      Lisp_Object coding_system;
8501 {
8502   Lisp_Object spec, attrs;
8503
8504   if (NILP (coding_system))
8505     coding_system = Qno_conversion;
8506   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
8507   attrs = AREF (spec, 0);
8508   return CODING_ATTR_PLIST (attrs);
8509 }
8510
8511
8512 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
8513        1, 1, 0,
8514        doc: /* Return the list of aliases of CODING-SYSTEM.  */)
8515      (coding_system)
8516      Lisp_Object coding_system;
8517 {
8518   Lisp_Object spec;
8519
8520   if (NILP (coding_system))
8521     coding_system = Qno_conversion;
8522   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
8523   return AREF (spec, 1);
8524 }
8525
8526 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
8527        Scoding_system_eol_type, 1, 1, 0,
8528        doc: /* Return eol-type of CODING-SYSTEM.
8529 An eol-type is integer 0, 1, 2, or a vector of coding systems.
8530
8531 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
8532 and CR respectively.
8533
8534 A vector value indicates that a format of end-of-line should be
8535 detected automatically.  Nth element of the vector is the subsidiary
8536 coding system whose eol-type is N.  */)
8537      (coding_system)
8538      Lisp_Object coding_system;
8539 {
8540   Lisp_Object spec, eol_type;
8541   int n;
8542
8543   if (NILP (coding_system))
8544     coding_system = Qno_conversion;
8545   if (! CODING_SYSTEM_P (coding_system))
8546     return Qnil;
8547   spec = CODING_SYSTEM_SPEC (coding_system);
8548   eol_type = AREF (spec, 2);
8549   if (VECTORP (eol_type))
8550     return Fcopy_sequence (eol_type);
8551   n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
8552   return make_number (n);
8553 }
8554
8555 #endif /* emacs */
8556
8557 \f
8558 /*** 9. Post-amble ***/
8559
8560 void
8561 init_coding_once ()
8562 {
8563   int i;
8564
8565   for (i = 0; i < coding_category_max; i++)
8566     {
8567       coding_categories[i].id = -1;
8568       coding_priorities[i] = i;
8569     }
8570
8571   /* ISO2022 specific initialize routine.  */
8572   for (i = 0; i < 0x20; i++)
8573     iso_code_class[i] = ISO_control_0;
8574   for (i = 0x21; i < 0x7F; i++)
8575     iso_code_class[i] = ISO_graphic_plane_0;
8576   for (i = 0x80; i < 0xA0; i++)
8577     iso_code_class[i] = ISO_control_1;
8578   for (i = 0xA1; i < 0xFF; i++)
8579     iso_code_class[i] = ISO_graphic_plane_1;
8580   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
8581   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
8582   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
8583   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
8584   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
8585   iso_code_class[ISO_CODE_ESC] = ISO_escape;
8586   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
8587   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
8588   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
8589
8590   for (i = 0; i < 256; i++)
8591     {
8592       emacs_mule_bytes[i] = 1;
8593     }
8594   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
8595   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
8596   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
8597   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
8598 }
8599
8600 #ifdef emacs
8601
8602 void
8603 syms_of_coding ()
8604 {
8605   staticpro (&Vcoding_system_hash_table);
8606   {
8607     Lisp_Object args[2];
8608     args[0] = QCtest;
8609     args[1] = Qeq;
8610     Vcoding_system_hash_table = Fmake_hash_table (2, args);
8611   }
8612
8613   staticpro (&Vsjis_coding_system);
8614   Vsjis_coding_system = Qnil;
8615
8616   staticpro (&Vbig5_coding_system);
8617   Vbig5_coding_system = Qnil;
8618
8619   staticpro (&Vcode_conversion_reused_workbuf);
8620   Vcode_conversion_reused_workbuf = Qnil;
8621
8622   staticpro (&Vcode_conversion_workbuf_name);
8623   Vcode_conversion_workbuf_name = build_string (" *code-conversion-work*");
8624
8625   reused_workbuf_in_use = 0;
8626
8627   DEFSYM (Qcharset, "charset");
8628   DEFSYM (Qtarget_idx, "target-idx");
8629   DEFSYM (Qcoding_system_history, "coding-system-history");
8630   Fset (Qcoding_system_history, Qnil);
8631
8632   /* Target FILENAME is the first argument.  */
8633   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
8634   /* Target FILENAME is the third argument.  */
8635   Fput (Qwrite_region, Qtarget_idx, make_number (2));
8636
8637   DEFSYM (Qcall_process, "call-process");
8638   /* Target PROGRAM is the first argument.  */
8639   Fput (Qcall_process, Qtarget_idx, make_number (0));
8640
8641   DEFSYM (Qcall_process_region, "call-process-region");
8642   /* Target PROGRAM is the third argument.  */
8643   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
8644
8645   DEFSYM (Qstart_process, "start-process");
8646   /* Target PROGRAM is the third argument.  */
8647   Fput (Qstart_process, Qtarget_idx, make_number (2));
8648
8649   DEFSYM (Qopen_network_stream, "open-network-stream");
8650   /* Target SERVICE is the fourth argument.  */
8651   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
8652
8653   DEFSYM (Qcoding_system, "coding-system");
8654   DEFSYM (Qcoding_aliases, "coding-aliases");
8655
8656   DEFSYM (Qeol_type, "eol-type");
8657   DEFSYM (Qunix, "unix");
8658   DEFSYM (Qdos, "dos");
8659
8660   DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
8661   DEFSYM (Qpost_read_conversion, "post-read-conversion");
8662   DEFSYM (Qpre_write_conversion, "pre-write-conversion");
8663   DEFSYM (Qdefault_char, "default-char");
8664   DEFSYM (Qundecided, "undecided");
8665   DEFSYM (Qno_conversion, "no-conversion");
8666   DEFSYM (Qraw_text, "raw-text");
8667
8668   DEFSYM (Qiso_2022, "iso-2022");
8669
8670   DEFSYM (Qutf_8, "utf-8");
8671   DEFSYM (Qutf_8_emacs, "utf-8-emacs");
8672
8673   DEFSYM (Qutf_16, "utf-16");
8674   DEFSYM (Qbig, "big");
8675   DEFSYM (Qlittle, "little");
8676
8677   DEFSYM (Qshift_jis, "shift-jis");
8678   DEFSYM (Qbig5, "big5");
8679
8680   DEFSYM (Qcoding_system_p, "coding-system-p");
8681
8682   DEFSYM (Qcoding_system_error, "coding-system-error");
8683   Fput (Qcoding_system_error, Qerror_conditions,
8684         Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
8685   Fput (Qcoding_system_error, Qerror_message,
8686         build_string ("Invalid coding system"));
8687
8688   /* Intern this now in case it isn't already done.
8689      Setting this variable twice is harmless.
8690      But don't staticpro it here--that is done in alloc.c.  */
8691   Qchar_table_extra_slots = intern ("char-table-extra-slots");
8692
8693   DEFSYM (Qtranslation_table, "translation-table");
8694   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (1));
8695   DEFSYM (Qtranslation_table_id, "translation-table-id");
8696   DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
8697   DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
8698
8699   DEFSYM (Qvalid_codes, "valid-codes");
8700
8701   DEFSYM (Qemacs_mule, "emacs-mule");
8702
8703   DEFSYM (QCcategory, ":category");
8704
8705   Vcoding_category_table
8706     = Fmake_vector (make_number (coding_category_max), Qnil);
8707   staticpro (&Vcoding_category_table);
8708   /* Followings are target of code detection.  */
8709   ASET (Vcoding_category_table, coding_category_iso_7,
8710         intern ("coding-category-iso-7"));
8711   ASET (Vcoding_category_table, coding_category_iso_7_tight,
8712         intern ("coding-category-iso-7-tight"));
8713   ASET (Vcoding_category_table, coding_category_iso_8_1,
8714         intern ("coding-category-iso-8-1"));
8715   ASET (Vcoding_category_table, coding_category_iso_8_2,
8716         intern ("coding-category-iso-8-2"));
8717   ASET (Vcoding_category_table, coding_category_iso_7_else,
8718         intern ("coding-category-iso-7-else"));
8719   ASET (Vcoding_category_table, coding_category_iso_8_else,
8720         intern ("coding-category-iso-8-else"));
8721   ASET (Vcoding_category_table, coding_category_utf_8,
8722         intern ("coding-category-utf-8"));
8723   ASET (Vcoding_category_table, coding_category_utf_16_be,
8724         intern ("coding-category-utf-16-be"));
8725   ASET (Vcoding_category_table, coding_category_utf_16_auto,
8726         intern ("coding-category-utf-16-auto"));
8727   ASET (Vcoding_category_table, coding_category_utf_16_le,
8728         intern ("coding-category-utf-16-le"));
8729   ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
8730         intern ("coding-category-utf-16-be-nosig"));
8731   ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
8732         intern ("coding-category-utf-16-le-nosig"));
8733   ASET (Vcoding_category_table, coding_category_charset,
8734         intern ("coding-category-charset"));
8735   ASET (Vcoding_category_table, coding_category_sjis,
8736         intern ("coding-category-sjis"));
8737   ASET (Vcoding_category_table, coding_category_big5,
8738         intern ("coding-category-big5"));
8739   ASET (Vcoding_category_table, coding_category_ccl,
8740         intern ("coding-category-ccl"));
8741   ASET (Vcoding_category_table, coding_category_emacs_mule,
8742         intern ("coding-category-emacs-mule"));
8743   /* Followings are NOT target of code detection.  */
8744   ASET (Vcoding_category_table, coding_category_raw_text,
8745         intern ("coding-category-raw-text"));
8746   ASET (Vcoding_category_table, coding_category_undecided,
8747         intern ("coding-category-undecided"));
8748
8749   defsubr (&Scoding_system_p);
8750   defsubr (&Sread_coding_system);
8751   defsubr (&Sread_non_nil_coding_system);
8752   defsubr (&Scheck_coding_system);
8753   defsubr (&Sdetect_coding_region);
8754   defsubr (&Sdetect_coding_string);
8755   defsubr (&Sfind_coding_systems_region_internal);
8756   defsubr (&Sunencodable_char_position);
8757   defsubr (&Scheck_coding_systems_region);
8758   defsubr (&Sdecode_coding_region);
8759   defsubr (&Sencode_coding_region);
8760   defsubr (&Sdecode_coding_string);
8761   defsubr (&Sencode_coding_string);
8762   defsubr (&Sdecode_sjis_char);
8763   defsubr (&Sencode_sjis_char);
8764   defsubr (&Sdecode_big5_char);
8765   defsubr (&Sencode_big5_char);
8766   defsubr (&Sset_terminal_coding_system_internal);
8767   defsubr (&Sset_safe_terminal_coding_system_internal);
8768   defsubr (&Sterminal_coding_system);
8769   defsubr (&Sset_keyboard_coding_system_internal);
8770   defsubr (&Skeyboard_coding_system);
8771   defsubr (&Sfind_operation_coding_system);
8772   defsubr (&Sset_coding_system_priority);
8773   defsubr (&Sdefine_coding_system_internal);
8774   defsubr (&Sdefine_coding_system_alias);
8775   defsubr (&Scoding_system_base);
8776   defsubr (&Scoding_system_plist);
8777   defsubr (&Scoding_system_aliases);
8778   defsubr (&Scoding_system_eol_type);
8779   defsubr (&Scoding_system_priority_list);
8780
8781   DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
8782                doc: /* List of coding systems.
8783
8784 Do not alter the value of this variable manually.  This variable should be
8785 updated by the functions `define-coding-system' and
8786 `define-coding-system-alias'.  */);
8787   Vcoding_system_list = Qnil;
8788
8789   DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
8790                doc: /* Alist of coding system names.
8791 Each element is one element list of coding system name.
8792 This variable is given to `completing-read' as TABLE argument.
8793
8794 Do not alter the value of this variable manually.  This variable should be
8795 updated by the functions `make-coding-system' and
8796 `define-coding-system-alias'.  */);
8797   Vcoding_system_alist = Qnil;
8798
8799   DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
8800                doc: /* List of coding-categories (symbols) ordered by priority.
8801
8802 On detecting a coding system, Emacs tries code detection algorithms
8803 associated with each coding-category one by one in this order.  When
8804 one algorithm agrees with a byte sequence of source text, the coding
8805 system bound to the corresponding coding-category is selected.  */);
8806   {
8807     int i;
8808
8809     Vcoding_category_list = Qnil;
8810     for (i = coding_category_max - 1; i >= 0; i--)
8811       Vcoding_category_list
8812         = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
8813                  Vcoding_category_list);
8814   }
8815
8816   DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
8817                doc: /* Specify the coding system for read operations.
8818 It is useful to bind this variable with `let', but do not set it globally.
8819 If the value is a coding system, it is used for decoding on read operation.
8820 If not, an appropriate element is used from one of the coding system alists:
8821 There are three such tables, `file-coding-system-alist',
8822 `process-coding-system-alist', and `network-coding-system-alist'.  */);
8823   Vcoding_system_for_read = Qnil;
8824
8825   DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
8826                doc: /* Specify the coding system for write operations.
8827 Programs bind this variable with `let', but you should not set it globally.
8828 If the value is a coding system, it is used for encoding of output,
8829 when writing it to a file and when sending it to a file or subprocess.
8830
8831 If this does not specify a coding system, an appropriate element
8832 is used from one of the coding system alists:
8833 There are three such tables, `file-coding-system-alist',
8834 `process-coding-system-alist', and `network-coding-system-alist'.
8835 For output to files, if the above procedure does not specify a coding system,
8836 the value of `buffer-file-coding-system' is used.  */);
8837   Vcoding_system_for_write = Qnil;
8838
8839   DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
8840                doc: /*
8841 Coding system used in the latest file or process I/O.  */);
8842   Vlast_coding_system_used = Qnil;
8843
8844   DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
8845                doc: /*
8846 *Non-nil means always inhibit code conversion of end-of-line format.
8847 See info node `Coding Systems' and info node `Text and Binary' concerning
8848 such conversion.  */);
8849   inhibit_eol_conversion = 0;
8850
8851   DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
8852                doc: /*
8853 Non-nil means process buffer inherits coding system of process output.
8854 Bind it to t if the process output is to be treated as if it were a file
8855 read from some filesystem.  */);
8856   inherit_process_coding_system = 0;
8857
8858   DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
8859                doc: /*
8860 Alist to decide a coding system to use for a file I/O operation.
8861 The format is ((PATTERN . VAL) ...),
8862 where PATTERN is a regular expression matching a file name,
8863 VAL is a coding system, a cons of coding systems, or a function symbol.
8864 If VAL is a coding system, it is used for both decoding and encoding
8865 the file contents.
8866 If VAL is a cons of coding systems, the car part is used for decoding,
8867 and the cdr part is used for encoding.
8868 If VAL is a function symbol, the function must return a coding system
8869 or a cons of coding systems which are used as above.  The function gets
8870 the arguments with which `find-operation-coding-systems' was called.
8871
8872 See also the function `find-operation-coding-system'
8873 and the variable `auto-coding-alist'.  */);
8874   Vfile_coding_system_alist = Qnil;
8875
8876   DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
8877                doc: /*
8878 Alist to decide a coding system to use for a process I/O operation.
8879 The format is ((PATTERN . VAL) ...),
8880 where PATTERN is a regular expression matching a program name,
8881 VAL is a coding system, a cons of coding systems, or a function symbol.
8882 If VAL is a coding system, it is used for both decoding what received
8883 from the program and encoding what sent to the program.
8884 If VAL is a cons of coding systems, the car part is used for decoding,
8885 and the cdr part is used for encoding.
8886 If VAL is a function symbol, the function must return a coding system
8887 or a cons of coding systems which are used as above.
8888
8889 See also the function `find-operation-coding-system'.  */);
8890   Vprocess_coding_system_alist = Qnil;
8891
8892   DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
8893                doc: /*
8894 Alist to decide a coding system to use for a network I/O operation.
8895 The format is ((PATTERN . VAL) ...),
8896 where PATTERN is a regular expression matching a network service name
8897 or is a port number to connect to,
8898 VAL is a coding system, a cons of coding systems, or a function symbol.
8899 If VAL is a coding system, it is used for both decoding what received
8900 from the network stream and encoding what sent to the network stream.
8901 If VAL is a cons of coding systems, the car part is used for decoding,
8902 and the cdr part is used for encoding.
8903 If VAL is a function symbol, the function must return a coding system
8904 or a cons of coding systems which are used as above.
8905
8906 See also the function `find-operation-coding-system'.  */);
8907   Vnetwork_coding_system_alist = Qnil;
8908
8909   DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
8910                doc: /* Coding system to use with system messages.
8911 Also used for decoding keyboard input on X Window system.  */);
8912   Vlocale_coding_system = Qnil;
8913
8914   /* The eol mnemonics are reset in startup.el system-dependently.  */
8915   DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
8916                doc: /*
8917 *String displayed in mode line for UNIX-like (LF) end-of-line format.  */);
8918   eol_mnemonic_unix = build_string (":");
8919
8920   DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
8921                doc: /*
8922 *String displayed in mode line for DOS-like (CRLF) end-of-line format.  */);
8923   eol_mnemonic_dos = build_string ("\\");
8924
8925   DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
8926                doc: /*
8927 *String displayed in mode line for MAC-like (CR) end-of-line format.  */);
8928   eol_mnemonic_mac = build_string ("/");
8929
8930   DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
8931                doc: /*
8932 *String displayed in mode line when end-of-line format is not yet determined.  */);
8933   eol_mnemonic_undecided = build_string (":");
8934
8935   DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
8936                doc: /*
8937 *Non-nil enables character translation while encoding and decoding.  */);
8938   Venable_character_translation = Qt;
8939
8940   DEFVAR_LISP ("standard-translation-table-for-decode",
8941                &Vstandard_translation_table_for_decode,
8942                doc: /* Table for translating characters while decoding.  */);
8943   Vstandard_translation_table_for_decode = Qnil;
8944
8945   DEFVAR_LISP ("standard-translation-table-for-encode",
8946                &Vstandard_translation_table_for_encode,
8947                doc: /* Table for translating characters while encoding.  */);
8948   Vstandard_translation_table_for_encode = Qnil;
8949
8950   DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_table,
8951                doc: /* Alist of charsets vs revision numbers.
8952 While encoding, if a charset (car part of an element) is found,
8953 designate it with the escape sequence identifying revision (cdr part
8954 of the element).  */);
8955   Vcharset_revision_table = Qnil;
8956
8957   DEFVAR_LISP ("default-process-coding-system",
8958                &Vdefault_process_coding_system,
8959                doc: /* Cons of coding systems used for process I/O by default.
8960 The car part is used for decoding a process output,
8961 the cdr part is used for encoding a text to be sent to a process.  */);
8962   Vdefault_process_coding_system = Qnil;
8963
8964   DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
8965                doc: /*
8966 Table of extra Latin codes in the range 128..159 (inclusive).
8967 This is a vector of length 256.
8968 If Nth element is non-nil, the existence of code N in a file
8969 \(or output of subprocess) doesn't prevent it to be detected as
8970 a coding system of ISO 2022 variant which has a flag
8971 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
8972 or reading output of a subprocess.
8973 Only 128th through 159th elements has a meaning.  */);
8974   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
8975
8976   DEFVAR_LISP ("select-safe-coding-system-function",
8977                &Vselect_safe_coding_system_function,
8978                doc: /*
8979 Function to call to select safe coding system for encoding a text.
8980
8981 If set, this function is called to force a user to select a proper
8982 coding system which can encode the text in the case that a default
8983 coding system used in each operation can't encode the text.
8984
8985 The default value is `select-safe-coding-system' (which see).  */);
8986   Vselect_safe_coding_system_function = Qnil;
8987
8988   DEFVAR_BOOL ("coding-system-require-warning",
8989                &coding_system_require_warning,
8990                doc: /* Internal use only.
8991 If non-nil, on writing a file, `select-safe-coding-system-function' is
8992 called even if `coding-system-for-write' is non-nil.  The command
8993 `universal-coding-system-argument' binds this variable to t temporarily.  */);
8994   coding_system_require_warning = 0;
8995
8996
8997   DEFVAR_BOOL ("inhibit-iso-escape-detection",
8998                &inhibit_iso_escape_detection,
8999                doc: /*
9000 If non-nil, Emacs ignores ISO2022's escape sequence on code detection.
9001
9002 By default, on reading a file, Emacs tries to detect how the text is
9003 encoded.  This code detection is sensitive to escape sequences.  If
9004 the sequence is valid as ISO2022, the code is determined as one of
9005 the ISO2022 encodings, and the file is decoded by the corresponding
9006 coding system (e.g. `iso-2022-7bit').
9007
9008 However, there may be a case that you want to read escape sequences in
9009 a file as is.  In such a case, you can set this variable to non-nil.
9010 Then, as the code detection ignores any escape sequences, no file is
9011 detected as encoded in some ISO2022 encoding.  The result is that all
9012 escape sequences become visible in a buffer.
9013
9014 The default value is nil, and it is strongly recommended not to change
9015 it.  That is because many Emacs Lisp source files that contain
9016 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
9017 in Emacs's distribution, and they won't be decoded correctly on
9018 reading if you suppress escape sequence detection.
9019
9020 The other way to read escape sequences in a file without decoding is
9021 to explicitly specify some coding system that doesn't use ISO2022's
9022 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument].  */);
9023   inhibit_iso_escape_detection = 0;
9024
9025   DEFVAR_LISP ("translation-table-for-input", &Vtranslation_table_for_input,
9026                doc: /* Char table for translating self-inserting characters.
9027 This is applied to the result of input methods, not their input.  See also
9028 `keyboard-translate-table'.  */);
9029     Vtranslation_table_for_input = Qnil;
9030
9031   {
9032     Lisp_Object args[coding_arg_max];
9033     Lisp_Object plist[16];
9034     int i;
9035
9036     for (i = 0; i < coding_arg_max; i++)
9037       args[i] = Qnil;
9038
9039     plist[0] = intern (":name");
9040     plist[1] = args[coding_arg_name] = Qno_conversion;
9041     plist[2] = intern (":mnemonic");
9042     plist[3] = args[coding_arg_mnemonic] = make_number ('=');
9043     plist[4] = intern (":coding-type");
9044     plist[5] = args[coding_arg_coding_type] = Qraw_text;
9045     plist[6] = intern (":ascii-compatible-p");
9046     plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
9047     plist[8] = intern (":default-char");
9048     plist[9] = args[coding_arg_default_char] = make_number (0);
9049     plist[10] = intern (":for-unibyte");
9050     plist[11] = args[coding_arg_for_unibyte] = Qt;
9051     plist[12] = intern (":docstring");
9052     plist[13] = build_string ("Do no conversion.\n\
9053 \n\
9054 When you visit a file with this coding, the file is read into a\n\
9055 unibyte buffer as is, thus each byte of a file is treated as a\n\
9056 character.");
9057     plist[14] = intern (":eol-type");
9058     plist[15] = args[coding_arg_eol_type] = Qunix;
9059     args[coding_arg_plist] = Flist (16, plist);
9060     Fdefine_coding_system_internal (coding_arg_max, args);
9061   }
9062
9063   setup_coding_system (Qno_conversion, &keyboard_coding);
9064   setup_coding_system (Qno_conversion, &terminal_coding);
9065   setup_coding_system (Qno_conversion, &safe_terminal_coding);
9066
9067   {
9068     int i;
9069
9070     for (i = 0; i < coding_category_max; i++)
9071       Fset (AREF (Vcoding_category_table, i), Qno_conversion);
9072   }
9073 }
9074
9075 char *
9076 emacs_strerror (error_number)
9077      int error_number;
9078 {
9079   char *str;
9080
9081   synchronize_system_messages_locale ();
9082   str = strerror (error_number);
9083
9084   if (! NILP (Vlocale_coding_system))
9085     {
9086       Lisp_Object dec = code_convert_string_norecord (build_string (str),
9087                                                       Vlocale_coding_system,
9088                                                       0);
9089       str = (char *) SDATA (dec);
9090     }
9091
9092   return str;
9093 }
9094
9095 #endif /* emacs */