src/coding.c

   1 /* Coding system handler (conversion, detection, etc).
   2    Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
   3      Licensed to the Free Software Foundation.
   4    Copyright (C) 2001, 2002 Free Software Foundation, Inc.
   5    Copyright (C) 2003
   6      National Institute of Advanced Industrial Science and Technology (AIST)
   7      Registration Number H13PRO009
   8
   9 This file is part of GNU Emacs.
  10
  11 GNU Emacs is free software; you can redistribute it and/or modify
  12 it under the terms of the GNU General Public License as published by
  13 the Free Software Foundation; either version 2, or (at your option)
  14 any later version.
  15
  16 GNU Emacs is distributed in the hope that it will be useful,
  17 but WITHOUT ANY WARRANTY; without even the implied warranty of
  18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  19 GNU General Public License for more details.
  20
  21 You should have received a copy of the GNU General Public License
  22 along with GNU Emacs; see the file COPYING.  If not, write to
  23 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  24 Boston, MA 02111-1307, USA.  */
  25
  26 /*** TABLE OF CONTENTS ***
  27
  28   0. General comments
  29   1. Preamble
  30   2. Emacs' internal format (emacs-utf-8) handlers
  31   3. UTF-8 handlers
  32   4. UTF-16 handlers
  33   5. Charset-base coding systems handlers
  34   6. emacs-mule (old Emacs' internal format) handlers
  35   7. ISO2022 handlers
  36   8. Shift-JIS and BIG5 handlers
  37   9. CCL handlers
  38   10. C library functions
  39   11. Emacs Lisp library functions
  40   12. Postamble
  41
  42 */
  43
  44 /*** 0. General comments ***
  45
  46
  47 CODING SYSTEM
  48
  49   A coding system is an object for an encoding mechanism that contains
  50   information about how to convert byte sequences to character
  51   sequences and vice versa.  When we say "decode", it means converting
  52   a byte sequence of a specific coding system into a character
  53   sequence that is represented by Emacs' internal coding system
  54   `emacs-utf-8', and when we say "encode", it means converting a
  55   character sequence of emacs-utf-8 to a byte sequence of a specific
  56   coding system.
  57
  58   In Emacs Lisp, a coding system is represented by a Lisp symbol.  In
  59   C level, a coding system is represented by a vector of attributes
  60   stored in the hash table Vcharset_hash_table.  The conversion from
  61   coding system symbol to attributes vector is done by looking up
  62   Vcharset_hash_table by the symbol.
  63
  64   Coding systems are classified into the following types depending on
  65   the encoding mechanism.  Here's a brief description of the types.
  66
  67   o UTF-8
  68
  69   o UTF-16
  70
  71   o Charset-base coding system
  72
  73   A coding system defined by one or more (coded) character sets.
  74   Decoding and encoding are done by a code converter defined for each
  75   character set.
  76
  77   o Old Emacs internal format (emacs-mule)
  78
  79   The coding system adopted by old versions of Emacs (20 and 21).
  80
  81   o ISO2022-base coding system
  82
  83   The most famous coding system for multiple character sets.  X's
  84   Compound Text, various EUCs (Extended Unix Code), and coding systems
  85   used in the Internet communication such as ISO-2022-JP are all
  86   variants of ISO2022.
  87
  88   o SJIS (or Shift-JIS or MS-Kanji-Code)
  89
  90   A coding system to encode character sets: ASCII, JISX0201, and
  91   JISX0208.  Widely used for PC's in Japan.  Details are described in
  92   section 8.
  93
  94   o BIG5
  95
  96   A coding system to encode character sets: ASCII and Big5.  Widely
  97   used for Chinese (mainly in Taiwan and Hong Kong).  Details are
  98   described in section 8.  In this file, when we write "big5" (all
  99   lowercase), we mean the coding system, and when we write "Big5"
 100   (capitalized), we mean the character set.
 101
 102   o CCL
 103
 104   If a user wants to decode/encode text encoded in a coding system
 105   not listed above, he can supply a decoder and an encoder for it in
 106   CCL (Code Conversion Language) programs.  Emacs executes the CCL
 107   program while decoding/encoding.
 108
 109   o Raw-text
 110
 111   A coding system for text containing raw eight-bit data.  Emacs
 112   treats each byte of source text as a character (except for
 113   end-of-line conversion).
 114
 115   o No-conversion
 116
 117   Like raw text, but don't do end-of-line conversion.
 118
 119
 120 END-OF-LINE FORMAT
 121
 122   How text end-of-line is encoded depends on operating system.  For
 123   instance, Unix's format is just one byte of LF (line-feed) code,
 124   whereas DOS's format is two-byte sequence of `carriage-return' and
 125   `line-feed' codes.  MacOS's format is usually one byte of
 126   `carriage-return'.
 127
 128   Since text character encoding and end-of-line encoding are
 129   independent, any coding system described above can take any format
 130   of end-of-line (except for no-conversion).
 131
 132 STRUCT CODING_SYSTEM
 133
 134   Before using a coding system for code conversion (i.e. decoding and
 135   encoding), we setup a structure of type `struct coding_system'.
 136   This structure keeps various information about a specific code
 137   conversion (e.g. the location of source and destination data).
 138
 139 */
 140
 141 /* COMMON MACROS */
 142
 143
 144 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 145
 146   These functions check if a byte sequence specified as a source in
 147   CODING conforms to the format of XXX, and update the members of
 148   DETECT_INFO.
 149
 150   Return 1 if the byte sequence conforms to XXX, otherwise return 0.
 151
 152   Below is the template of these functions.  */
 153
 154 #if 0
 155 static int
 156 detect_coding_XXX (coding, detect_info)
 157      struct coding_system *coding;
 158      struct coding_detection_info *detect_info;
 159 {
 160   unsigned char *src = coding->source;
 161   unsigned char *src_end = coding->source + coding->src_bytes;
 162   int multibytep = coding->src_multibyte;
 163   int consumed_chars = 0;
 164   int found = 0;
 165   ...;
 166
 167   while (1)
 168     {
 169       /* Get one byte from the source.  If the souce is exausted, jump
 170          to no_more_source:.  */
 171       ONE_MORE_BYTE (c);
 172
 173       if (! __C_conforms_to_XXX___ (c))
 174         break;
 175       if (! __C_strongly_suggests_XXX__ (c))
 176         found = CATEGORY_MASK_XXX;
 177     }
 178   /* The byte sequence is invalid for XXX.  */
 179   detect_info->rejected |= CATEGORY_MASK_XXX;
 180   return 0;
 181
 182  no_more_source:
 183   /* The source exausted successfully.  */
 184   detect_info->found |= found;
 185   return 1;
 186 }
 187 #endif
 188
 189 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 190
 191   These functions decode a byte sequence specified as a source by
 192   CODING.  The resulting multibyte text goes to a place pointed to by
 193   CODING->charbuf, the length of which should not exceed
 194   CODING->charbuf_size;
 195
 196   These functions set the information of original and decoded texts in
 197   CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
 198   They also set CODING->result to one of CODING_RESULT_XXX indicating
 199   how the decoding is finished.
 200
 201   Below is the template of these functions.  */
 202
 203 #if 0
 204 static void
 205 decode_coding_XXXX (coding)
 206      struct coding_system *coding;
 207 {
 208   unsigned char *src = coding->source + coding->consumed;
 209   unsigned char *src_end = coding->source + coding->src_bytes;
 210   /* SRC_BASE remembers the start position in source in each loop.
 211      The loop will be exited when there's not enough source code, or
 212      when there's no room in CHARBUF for a decoded character.  */
 213   unsigned char *src_base;
 214   /* A buffer to produce decoded characters.  */
 215   int *charbuf = coding->charbuf;
 216   int *charbuf_end = charbuf + coding->charbuf_size;
 217   int multibytep = coding->src_multibyte;
 218
 219   while (1)
 220     {
 221       src_base = src;
 222       if (charbuf < charbuf_end)
 223         /* No more room to produce a decoded character.  */
 224         break;
 225       ONE_MORE_BYTE (c);
 226       /* Decode it. */
 227     }
 228
 229  no_more_source:
 230   if (src_base < src_end
 231       && coding->mode & CODING_MODE_LAST_BLOCK)
 232     /* If the source ends by partial bytes to construct a character,
 233        treat them as eight-bit raw data.  */
 234     while (src_base < src_end && charbuf < charbuf_end)
 235       *charbuf++ = *src_base++;
 236   /* Remember how many bytes and characters we consumed.  If the
 237      source is multibyte, the bytes and chars are not identical.  */
 238   coding->consumed = coding->consumed_char = src_base - coding->source;
 239   /* Remember how many characters we produced.  */
 240   coding->charbuf_used = charbuf - coding->charbuf;
 241 }
 242 #endif
 243
 244 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 245
 246   These functions encode SRC_BYTES length text at SOURCE of Emacs'
 247   internal multibyte format by CODING.  The resulting byte sequence
 248   goes to a place pointed to by DESTINATION, the length of which
 249   should not exceed DST_BYTES.
 250
 251   These functions set the information of original and encoded texts in
 252   the members produced, produced_char, consumed, and consumed_char of
 253   the structure *CODING.  They also set the member result to one of
 254   CODING_RESULT_XXX indicating how the encoding finished.
 255
 256   DST_BYTES zero means that source area and destination area are
 257   overlapped, which means that we can produce a encoded text until it
 258   reaches at the head of not-yet-encoded source text.
 259
 260   Below is a template of these functions.  */
 261 #if 0
 262 static void
 263 encode_coding_XXX (coding)
 264      struct coding_system *coding;
 265 {
 266   int multibytep = coding->dst_multibyte;
 267   int *charbuf = coding->charbuf;
 268   int *charbuf_end = charbuf->charbuf + coding->charbuf_used;
 269   unsigned char *dst = coding->destination + coding->produced;
 270   unsigned char *dst_end = coding->destination + coding->dst_bytes;
 271   unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
 272   int produced_chars = 0;
 273
 274   for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
 275     {
 276       int c = *charbuf;
 277       /* Encode C into DST, and increment DST.  */
 278     }
 279  label_no_more_destination:
 280   /* How many chars and bytes we produced.  */
 281   coding->produced_char += produced_chars;
 282   coding->produced = dst - coding->destination;
 283 }
 284 #endif
 285
 286 \f
 287 /*** 1. Preamble ***/
 288
 289 #include <config.h>
 290 #include <stdio.h>
 291
 292 #include "lisp.h"
 293 #include "buffer.h"
 294 #include "character.h"
 295 #include "charset.h"
 296 #include "ccl.h"
 297 #include "composite.h"
 298 #include "coding.h"
 299 #include "window.h"
 300
 301 Lisp_Object Vcoding_system_hash_table;
 302
 303 Lisp_Object Qcoding_system, Qcoding_aliases, Qeol_type;
 304 Lisp_Object Qunix, Qdos;
 305 extern Lisp_Object Qmac;        /* frame.c */
 306 Lisp_Object Qbuffer_file_coding_system;
 307 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
 308 Lisp_Object Qdefault_char;
 309 Lisp_Object Qno_conversion, Qundecided;
 310 Lisp_Object Qcharset, Qiso_2022, Qutf_8, Qutf_16, Qshift_jis, Qbig5;
 311 Lisp_Object Qbig, Qlittle;
 312 Lisp_Object Qcoding_system_history;
 313 Lisp_Object Qvalid_codes;
 314 Lisp_Object QCcategory, QCmnemonic, QCdefalut_char;
 315 Lisp_Object QCdecode_translation_table, QCencode_translation_table;
 316 Lisp_Object QCpost_read_conversion, QCpre_write_conversion;
 317
 318 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
 319 Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
 320 Lisp_Object Qstart_process, Qopen_network_stream;
 321 Lisp_Object Qtarget_idx;
 322
 323 Lisp_Object Qinsufficient_source, Qinconsistent_eol, Qinvalid_source;
 324 Lisp_Object Qinterrupted, Qinsufficient_memory;
 325
 326 int coding_system_require_warning;
 327
 328 Lisp_Object Vselect_safe_coding_system_function;
 329
 330 /* Mnemonic string for each format of end-of-line.  */
 331 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
 332 /* Mnemonic string to indicate format of end-of-line is not yet
 333    decided.  */
 334 Lisp_Object eol_mnemonic_undecided;
 335
 336 #ifdef emacs
 337
 338 Lisp_Object Vcoding_system_list, Vcoding_system_alist;
 339
 340 Lisp_Object Qcoding_system_p, Qcoding_system_error;
 341
 342 /* Coding system emacs-mule and raw-text are for converting only
 343    end-of-line format.  */
 344 Lisp_Object Qemacs_mule, Qraw_text;
 345 Lisp_Object Qutf_8_emacs;
 346
 347 /* Coding-systems are handed between Emacs Lisp programs and C internal
 348    routines by the following three variables.  */
 349 /* Coding-system for reading files and receiving data from process.  */
 350 Lisp_Object Vcoding_system_for_read;
 351 /* Coding-system for writing files and sending data to process.  */
 352 Lisp_Object Vcoding_system_for_write;
 353 /* Coding-system actually used in the latest I/O.  */
 354 Lisp_Object Vlast_coding_system_used;
 355 /* Set to non-nil when an error is detected while code conversion.  */
 356 Lisp_Object Vlast_code_conversion_error;
 357 /* A vector of length 256 which contains information about special
 358    Latin codes (especially for dealing with Microsoft codes).  */
 359 Lisp_Object Vlatin_extra_code_table;
 360
 361 /* Flag to inhibit code conversion of end-of-line format.  */
 362 int inhibit_eol_conversion;
 363
 364 /* Flag to inhibit ISO2022 escape sequence detection.  */
 365 int inhibit_iso_escape_detection;
 366
 367 /* Flag to make buffer-file-coding-system inherit from process-coding.  */
 368 int inherit_process_coding_system;
 369
 370 /* Coding system to be used to encode text for terminal display.  */
 371 struct coding_system terminal_coding;
 372
 373 /* Coding system to be used to encode text for terminal display when
 374    terminal coding system is nil.  */
 375 struct coding_system safe_terminal_coding;
 376
 377 /* Coding system of what is sent from terminal keyboard.  */
 378 struct coding_system keyboard_coding;
 379
 380 Lisp_Object Vfile_coding_system_alist;
 381 Lisp_Object Vprocess_coding_system_alist;
 382 Lisp_Object Vnetwork_coding_system_alist;
 383
 384 Lisp_Object Vlocale_coding_system;
 385
 386 #endif /* emacs */
 387
 388 /* Flag to tell if we look up translation table on character code
 389    conversion.  */
 390 Lisp_Object Venable_character_translation;
 391 /* Standard translation table to look up on decoding (reading).  */
 392 Lisp_Object Vstandard_translation_table_for_decode;
 393 /* Standard translation table to look up on encoding (writing).  */
 394 Lisp_Object Vstandard_translation_table_for_encode;
 395
 396 Lisp_Object Qtranslation_table;
 397 Lisp_Object Qtranslation_table_id;
 398 Lisp_Object Qtranslation_table_for_decode;
 399 Lisp_Object Qtranslation_table_for_encode;
 400
 401 /* Alist of charsets vs revision number.  */
 402 static Lisp_Object Vcharset_revision_table;
 403
 404 /* Default coding systems used for process I/O.  */
 405 Lisp_Object Vdefault_process_coding_system;
 406
 407 /* Char table for translating Quail and self-inserting input.  */
 408 Lisp_Object Vtranslation_table_for_input;
 409
 410 /* Two special coding systems.  */
 411 Lisp_Object Vsjis_coding_system;
 412 Lisp_Object Vbig5_coding_system;
 413
 414 static void record_conversion_result (struct coding_system *coding,
 415                                       enum coding_result_code result);
 416 static int detect_coding_utf_8 P_ ((struct coding_system *,
 417                                     struct coding_detection_info *info));
 418 static void decode_coding_utf_8 P_ ((struct coding_system *));
 419 static int encode_coding_utf_8 P_ ((struct coding_system *));
 420
 421 static int detect_coding_utf_16 P_ ((struct coding_system *,
 422                                      struct coding_detection_info *info));
 423 static void decode_coding_utf_16 P_ ((struct coding_system *));
 424 static int encode_coding_utf_16 P_ ((struct coding_system *));
 425
 426 static int detect_coding_iso_2022 P_ ((struct coding_system *,
 427                                        struct coding_detection_info *info));
 428 static void decode_coding_iso_2022 P_ ((struct coding_system *));
 429 static int encode_coding_iso_2022 P_ ((struct coding_system *));
 430
 431 static int detect_coding_emacs_mule P_ ((struct coding_system *,
 432                                          struct coding_detection_info *info));
 433 static void decode_coding_emacs_mule P_ ((struct coding_system *));
 434 static int encode_coding_emacs_mule P_ ((struct coding_system *));
 435
 436 static int detect_coding_sjis P_ ((struct coding_system *,
 437                                    struct coding_detection_info *info));
 438 static void decode_coding_sjis P_ ((struct coding_system *));
 439 static int encode_coding_sjis P_ ((struct coding_system *));
 440
 441 static int detect_coding_big5 P_ ((struct coding_system *,
 442                                    struct coding_detection_info *info));
 443 static void decode_coding_big5 P_ ((struct coding_system *));
 444 static int encode_coding_big5 P_ ((struct coding_system *));
 445
 446 static int detect_coding_ccl P_ ((struct coding_system *,
 447                                   struct coding_detection_info *info));
 448 static void decode_coding_ccl P_ ((struct coding_system *));
 449 static int encode_coding_ccl P_ ((struct coding_system *));
 450
 451 static void decode_coding_raw_text P_ ((struct coding_system *));
 452 static int encode_coding_raw_text P_ ((struct coding_system *));
 453
 454
 455 /* ISO2022 section */
 456
 457 #define CODING_ISO_INITIAL(coding, reg)                 \
 458   (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id),    \
 459                      coding_attr_iso_initial),          \
 460                reg)))
 461
 462
 463 #define CODING_ISO_REQUEST(coding, charset_id)  \
 464   ((charset_id <= (coding)->max_charset_id      \
 465     ? (coding)->safe_charsets[charset_id]       \
 466     : -1))
 467
 468
 469 #define CODING_ISO_FLAGS(coding)        \
 470   ((coding)->spec.iso_2022.flags)
 471 #define CODING_ISO_DESIGNATION(coding, reg)     \
 472   ((coding)->spec.iso_2022.current_designation[reg])
 473 #define CODING_ISO_INVOCATION(coding, plane)    \
 474   ((coding)->spec.iso_2022.current_invocation[plane])
 475 #define CODING_ISO_SINGLE_SHIFTING(coding)      \
 476   ((coding)->spec.iso_2022.single_shifting)
 477 #define CODING_ISO_BOL(coding)  \
 478   ((coding)->spec.iso_2022.bol)
 479 #define CODING_ISO_INVOKED_CHARSET(coding, plane)       \
 480   CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
 481
 482 /* Control characters of ISO2022.  */
 483                         /* code */      /* function */
 484 #define ISO_CODE_LF     0x0A            /* line-feed */
 485 #define ISO_CODE_CR     0x0D            /* carriage-return */
 486 #define ISO_CODE_SO     0x0E            /* shift-out */
 487 #define ISO_CODE_SI     0x0F            /* shift-in */
 488 #define ISO_CODE_SS2_7  0x19            /* single-shift-2 for 7-bit code */
 489 #define ISO_CODE_ESC    0x1B            /* escape */
 490 #define ISO_CODE_SS2    0x8E            /* single-shift-2 */
 491 #define ISO_CODE_SS3    0x8F            /* single-shift-3 */
 492 #define ISO_CODE_CSI    0x9B            /* control-sequence-introducer */
 493
 494 /* All code (1-byte) of ISO2022 is classified into one of the
 495    followings.  */
 496 enum iso_code_class_type
 497   {
 498     ISO_control_0,              /* Control codes in the range
 499                                    0x00..0x1F and 0x7F, except for the
 500                                    following 5 codes.  */
 501     ISO_shift_out,              /* ISO_CODE_SO (0x0E) */
 502     ISO_shift_in,               /* ISO_CODE_SI (0x0F) */
 503     ISO_single_shift_2_7,       /* ISO_CODE_SS2_7 (0x19) */
 504     ISO_escape,                 /* ISO_CODE_SO (0x1B) */
 505     ISO_control_1,              /* Control codes in the range
 506                                    0x80..0x9F, except for the
 507                                    following 3 codes.  */
 508     ISO_single_shift_2,         /* ISO_CODE_SS2 (0x8E) */
 509     ISO_single_shift_3,         /* ISO_CODE_SS3 (0x8F) */
 510     ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
 511     ISO_0x20_or_0x7F,           /* Codes of the values 0x20 or 0x7F.  */
 512     ISO_graphic_plane_0,        /* Graphic codes in the range 0x21..0x7E.  */
 513     ISO_0xA0_or_0xFF,           /* Codes of the values 0xA0 or 0xFF.  */
 514     ISO_graphic_plane_1         /* Graphic codes in the range 0xA1..0xFE.  */
 515   };
 516
 517 /** The macros CODING_ISO_FLAG_XXX defines a flag bit of the
 518     `iso-flags' attribute of an iso2022 coding system.  */
 519
 520 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
 521    instead of the correct short-form sequence (e.g. ESC $ A).  */
 522 #define CODING_ISO_FLAG_LONG_FORM       0x0001
 523
 524 /* If set, reset graphic planes and registers at end-of-line to the
 525    initial state.  */
 526 #define CODING_ISO_FLAG_RESET_AT_EOL    0x0002
 527
 528 /* If set, reset graphic planes and registers before any control
 529    characters to the initial state.  */
 530 #define CODING_ISO_FLAG_RESET_AT_CNTL   0x0004
 531
 532 /* If set, encode by 7-bit environment.  */
 533 #define CODING_ISO_FLAG_SEVEN_BITS      0x0008
 534
 535 /* If set, use locking-shift function.  */
 536 #define CODING_ISO_FLAG_LOCKING_SHIFT   0x0010
 537
 538 /* If set, use single-shift function.  Overwrite
 539    CODING_ISO_FLAG_LOCKING_SHIFT.  */
 540 #define CODING_ISO_FLAG_SINGLE_SHIFT    0x0020
 541
 542 /* If set, use designation escape sequence.  */
 543 #define CODING_ISO_FLAG_DESIGNATION     0x0040
 544
 545 /* If set, produce revision number sequence.  */
 546 #define CODING_ISO_FLAG_REVISION        0x0080
 547
 548 /* If set, produce ISO6429's direction specifying sequence.  */
 549 #define CODING_ISO_FLAG_DIRECTION       0x0100
 550
 551 /* If set, assume designation states are reset at beginning of line on
 552    output.  */
 553 #define CODING_ISO_FLAG_INIT_AT_BOL     0x0200
 554
 555 /* If set, designation sequence should be placed at beginning of line
 556    on output.  */
 557 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
 558
 559 /* If set, do not encode unsafe charactes on output.  */
 560 #define CODING_ISO_FLAG_SAFE            0x0800
 561
 562 /* If set, extra latin codes (128..159) are accepted as a valid code
 563    on input.  */
 564 #define CODING_ISO_FLAG_LATIN_EXTRA     0x1000
 565
 566 #define CODING_ISO_FLAG_COMPOSITION     0x2000
 567
 568 #define CODING_ISO_FLAG_EUC_TW_SHIFT    0x4000
 569
 570 #define CODING_ISO_FLAG_USE_ROMAN       0x8000
 571
 572 #define CODING_ISO_FLAG_USE_OLDJIS      0x10000
 573
 574 #define CODING_ISO_FLAG_FULL_SUPPORT    0x100000
 575
 576 /* A character to be produced on output if encoding of the original
 577    character is prohibited by CODING_ISO_FLAG_SAFE.  */
 578 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION  '?'
 579
 580
 581 /* UTF-16 section */
 582 #define CODING_UTF_16_BOM(coding)       \
 583   ((coding)->spec.utf_16.bom)
 584
 585 #define CODING_UTF_16_ENDIAN(coding)    \
 586   ((coding)->spec.utf_16.endian)
 587
 588 #define CODING_UTF_16_SURROGATE(coding) \
 589   ((coding)->spec.utf_16.surrogate)
 590
 591
 592 /* CCL section */
 593 #define CODING_CCL_DECODER(coding)      \
 594   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
 595 #define CODING_CCL_ENCODER(coding)      \
 596   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
 597 #define CODING_CCL_VALIDS(coding)                                          \
 598   (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
 599
 600 /* Index for each coding category in `coding_categories' */
 601
 602 enum coding_category
 603   {
 604     coding_category_iso_7,
 605     coding_category_iso_7_tight,
 606     coding_category_iso_8_1,
 607     coding_category_iso_8_2,
 608     coding_category_iso_7_else,
 609     coding_category_iso_8_else,
 610     coding_category_utf_8,
 611     coding_category_utf_16_auto,
 612     coding_category_utf_16_be,
 613     coding_category_utf_16_le,
 614     coding_category_utf_16_be_nosig,
 615     coding_category_utf_16_le_nosig,
 616     coding_category_charset,
 617     coding_category_sjis,
 618     coding_category_big5,
 619     coding_category_ccl,
 620     coding_category_emacs_mule,
 621     /* All above are targets of code detection.  */
 622     coding_category_raw_text,
 623     coding_category_undecided,
 624     coding_category_max
 625   };
 626
 627 /* Definitions of flag bits used in detect_coding_XXXX.  */
 628 #define CATEGORY_MASK_ISO_7             (1 << coding_category_iso_7)
 629 #define CATEGORY_MASK_ISO_7_TIGHT       (1 << coding_category_iso_7_tight)
 630 #define CATEGORY_MASK_ISO_8_1           (1 << coding_category_iso_8_1)
 631 #define CATEGORY_MASK_ISO_8_2           (1 << coding_category_iso_8_2)
 632 #define CATEGORY_MASK_ISO_7_ELSE        (1 << coding_category_iso_7_else)
 633 #define CATEGORY_MASK_ISO_8_ELSE        (1 << coding_category_iso_8_else)
 634 #define CATEGORY_MASK_UTF_8             (1 << coding_category_utf_8)
 635 #define CATEGORY_MASK_UTF_16_AUTO       (1 << coding_category_utf_16_auto)
 636 #define CATEGORY_MASK_UTF_16_BE         (1 << coding_category_utf_16_be)
 637 #define CATEGORY_MASK_UTF_16_LE         (1 << coding_category_utf_16_le)
 638 #define CATEGORY_MASK_UTF_16_BE_NOSIG   (1 << coding_category_utf_16_be_nosig)
 639 #define CATEGORY_MASK_UTF_16_LE_NOSIG   (1 << coding_category_utf_16_le_nosig)
 640 #define CATEGORY_MASK_CHARSET           (1 << coding_category_charset)
 641 #define CATEGORY_MASK_SJIS              (1 << coding_category_sjis)
 642 #define CATEGORY_MASK_BIG5              (1 << coding_category_big5)
 643 #define CATEGORY_MASK_CCL               (1 << coding_category_ccl)
 644 #define CATEGORY_MASK_EMACS_MULE        (1 << coding_category_emacs_mule)
 645 #define CATEGORY_MASK_RAW_TEXT          (1 << coding_category_raw_text)
 646
 647 /* This value is returned if detect_coding_mask () find nothing other
 648    than ASCII characters.  */
 649 #define CATEGORY_MASK_ANY               \
 650   (CATEGORY_MASK_ISO_7                  \
 651    | CATEGORY_MASK_ISO_7_TIGHT          \
 652    | CATEGORY_MASK_ISO_8_1              \
 653    | CATEGORY_MASK_ISO_8_2              \
 654    | CATEGORY_MASK_ISO_7_ELSE           \
 655    | CATEGORY_MASK_ISO_8_ELSE           \
 656    | CATEGORY_MASK_UTF_8                \
 657    | CATEGORY_MASK_UTF_16_BE            \
 658    | CATEGORY_MASK_UTF_16_LE            \
 659    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 660    | CATEGORY_MASK_UTF_16_LE_NOSIG      \
 661    | CATEGORY_MASK_CHARSET              \
 662    | CATEGORY_MASK_SJIS                 \
 663    | CATEGORY_MASK_BIG5                 \
 664    | CATEGORY_MASK_CCL                  \
 665    | CATEGORY_MASK_EMACS_MULE)
 666
 667
 668 #define CATEGORY_MASK_ISO_7BIT \
 669   (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
 670
 671 #define CATEGORY_MASK_ISO_8BIT \
 672   (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
 673
 674 #define CATEGORY_MASK_ISO_ELSE \
 675   (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
 676
 677 #define CATEGORY_MASK_ISO_ESCAPE        \
 678   (CATEGORY_MASK_ISO_7                  \
 679    | CATEGORY_MASK_ISO_7_TIGHT          \
 680    | CATEGORY_MASK_ISO_7_ELSE           \
 681    | CATEGORY_MASK_ISO_8_ELSE)
 682
 683 #define CATEGORY_MASK_ISO       \
 684   (  CATEGORY_MASK_ISO_7BIT     \
 685      | CATEGORY_MASK_ISO_8BIT   \
 686      | CATEGORY_MASK_ISO_ELSE)
 687
 688 #define CATEGORY_MASK_UTF_16            \
 689   (CATEGORY_MASK_UTF_16_BE              \
 690    | CATEGORY_MASK_UTF_16_LE            \
 691    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 692    | CATEGORY_MASK_UTF_16_LE_NOSIG)
 693
 694
 695 /* List of symbols `coding-category-xxx' ordered by priority.  This
 696    variable is exposed to Emacs Lisp.  */
 697 static Lisp_Object Vcoding_category_list;
 698
 699 /* Table of coding categories (Lisp symbols).  This variable is for
 700    internal use oly.  */
 701 static Lisp_Object Vcoding_category_table;
 702
 703 /* Table of coding-categories ordered by priority.  */
 704 static enum coding_category coding_priorities[coding_category_max];
 705
 706 /* Nth element is a coding context for the coding system bound to the
 707    Nth coding category.  */
 708 static struct coding_system coding_categories[coding_category_max];
 709
 710 /*** Commonly used macros and functions ***/
 711
 712 #ifndef min
 713 #define min(a, b) ((a) < (b) ? (a) : (b))
 714 #endif
 715 #ifndef max
 716 #define max(a, b) ((a) > (b) ? (a) : (b))
 717 #endif
 718
 719 #define CODING_GET_INFO(coding, attrs, charset_list)    \
 720   do {                                                  \
 721     (attrs) = CODING_ID_ATTRS ((coding)->id);           \
 722     (charset_list) = CODING_ATTR_CHARSET_LIST (attrs);  \
 723   } while (0)
 724
 725
 726 /* Safely get one byte from the source text pointed by SRC which ends
 727    at SRC_END, and set C to that byte.  If there are not enough bytes
 728    in the source, it jumps to `no_more_source'.  If multibytep is
 729    nonzero, and a multibyte character is found at SRC, set C to the
 730    negative value of the character code.  The caller should declare
 731    and set these variables appropriately in advance:
 732         src, src_end, multibytep */
 733
 734 #define ONE_MORE_BYTE(c)                                \
 735   do {                                                  \
 736     if (src == src_end)                                 \
 737       {                                                 \
 738         if (src_base < src)                             \
 739           record_conversion_result                      \
 740             (coding, CODING_RESULT_INSUFFICIENT_SRC);   \
 741         goto no_more_source;                            \
 742       }                                                 \
 743     c = *src++;                                         \
 744     if (multibytep && (c & 0x80))                       \
 745       {                                                 \
 746         if ((c & 0xFE) == 0xC0)                         \
 747           c = ((c & 1) << 6) | *src++;                  \
 748         else                                            \
 749           {                                             \
 750             c = - string_char (--src, &src, NULL);      \
 751             record_conversion_result                    \
 752               (coding, CODING_RESULT_INVALID_SRC);      \
 753           }                                             \
 754       }                                                 \
 755     consumed_chars++;                                   \
 756   } while (0)
 757
 758
 759 #define ONE_MORE_BYTE_NO_CHECK(c)                       \
 760   do {                                                  \
 761     c = *src++;                                         \
 762     if (multibytep && (c & 0x80))                       \
 763       {                                                 \
 764         if ((c & 0xFE) == 0xC0)                         \
 765           c = ((c & 1) << 6) | *src++;                  \
 766         else                                            \
 767           {                                             \
 768             c = - string_char (--src, &src, NULL);      \
 769             record_conversion_result                    \
 770               (coding, CODING_RESULT_INVALID_SRC);      \
 771           }                                             \
 772       }                                                 \
 773     consumed_chars++;                                   \
 774   } while (0)
 775
 776
 777 /* Store a byte C in the place pointed by DST and increment DST to the
 778    next free point, and increment PRODUCED_CHARS.  The caller should
 779    assure that C is 0..127, and declare and set the variable `dst'
 780    appropriately in advance.
 781 */
 782
 783
 784 #define EMIT_ONE_ASCII_BYTE(c)  \
 785   do {                          \
 786     produced_chars++;           \
 787     *dst++ = (c);               \
 788   } while (0)
 789
 790
 791 /* Like EMIT_ONE_ASCII_BYTE byt store two bytes; C1 and C2.  */
 792
 793 #define EMIT_TWO_ASCII_BYTES(c1, c2)    \
 794   do {                                  \
 795     produced_chars += 2;                \
 796     *dst++ = (c1), *dst++ = (c2);       \
 797   } while (0)
 798
 799
 800 /* Store a byte C in the place pointed by DST and increment DST to the
 801    next free point, and increment PRODUCED_CHARS.  If MULTIBYTEP is
 802    nonzero, store in an appropriate multibyte from.  The caller should
 803    declare and set the variables `dst' and `multibytep' appropriately
 804    in advance.  */
 805
 806 #define EMIT_ONE_BYTE(c)                \
 807   do {                                  \
 808     produced_chars++;                   \
 809     if (multibytep)                     \
 810       {                                 \
 811         int ch = (c);                   \
 812         if (ch >= 0x80)                 \
 813           ch = BYTE8_TO_CHAR (ch);      \
 814         CHAR_STRING_ADVANCE (ch, dst);  \
 815       }                                 \
 816     else                                \
 817       *dst++ = (c);                     \
 818   } while (0)
 819
 820
 821 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2.  */
 822
 823 #define EMIT_TWO_BYTES(c1, c2)          \
 824   do {                                  \
 825     produced_chars += 2;                \
 826     if (multibytep)                     \
 827       {                                 \
 828         int ch;                         \
 829                                         \
 830         ch = (c1);                      \
 831         if (ch >= 0x80)                 \
 832           ch = BYTE8_TO_CHAR (ch);      \
 833         CHAR_STRING_ADVANCE (ch, dst);  \
 834         ch = (c2);                      \
 835         if (ch >= 0x80)                 \
 836           ch = BYTE8_TO_CHAR (ch);      \
 837         CHAR_STRING_ADVANCE (ch, dst);  \
 838       }                                 \
 839     else                                \
 840       {                                 \
 841         *dst++ = (c1);                  \
 842         *dst++ = (c2);                  \
 843       }                                 \
 844   } while (0)
 845
 846
 847 #define EMIT_THREE_BYTES(c1, c2, c3)    \
 848   do {                                  \
 849     EMIT_ONE_BYTE (c1);                 \
 850     EMIT_TWO_BYTES (c2, c3);            \
 851   } while (0)
 852
 853
 854 #define EMIT_FOUR_BYTES(c1, c2, c3, c4)         \
 855   do {                                          \
 856     EMIT_TWO_BYTES (c1, c2);                    \
 857     EMIT_TWO_BYTES (c3, c4);                    \
 858   } while (0)
 859
 860
 861 static void
 862 record_conversion_result (struct coding_system *coding,
 863                           enum coding_result_code result)
 864 {
 865   coding->result = result;
 866   switch (result)
 867     {
 868     case CODING_RESULT_INSUFFICIENT_SRC:
 869       Vlast_code_conversion_error = Qinsufficient_source;
 870       break;
 871     case CODING_RESULT_INCONSISTENT_EOL:
 872       Vlast_code_conversion_error = Qinconsistent_eol;
 873       break;
 874     case CODING_RESULT_INVALID_SRC:
 875       Vlast_code_conversion_error = Qinvalid_source;
 876       break;
 877     case CODING_RESULT_INTERRUPT:
 878       Vlast_code_conversion_error = Qinterrupted;
 879       break;
 880     case CODING_RESULT_INSUFFICIENT_MEM:
 881       Vlast_code_conversion_error = Qinsufficient_memory;
 882       break;
 883     }
 884 }
 885
 886 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
 887   do {                                                                       \
 888     charset_map_loaded = 0;                                                  \
 889     c = DECODE_CHAR (charset, code);                                         \
 890     if (charset_map_loaded)                                                  \
 891       {                                                                      \
 892         const unsigned char *orig = coding->source;                          \
 893         EMACS_INT offset;                                                    \
 894                                                                              \
 895         coding_set_source (coding);                                          \
 896         offset = coding->source - orig;                                      \
 897         src += offset;                                                       \
 898         src_base += offset;                                                  \
 899         src_end += offset;                                                   \
 900       }                                                                      \
 901   } while (0)
 902
 903
 904 #define ASSURE_DESTINATION(bytes)                               \
 905   do {                                                          \
 906     if (dst + (bytes) >= dst_end)                               \
 907       {                                                         \
 908         int more_bytes = charbuf_end - charbuf + (bytes);       \
 909                                                                 \
 910         dst = alloc_destination (coding, more_bytes, dst);      \
 911         dst_end = coding->destination + coding->dst_bytes;      \
 912       }                                                         \
 913   } while (0)
 914
 915
 916
 917 static void
 918 coding_set_source (coding)
 919      struct coding_system *coding;
 920 {
 921   if (BUFFERP (coding->src_object))
 922     {
 923       struct buffer *buf = XBUFFER (coding->src_object);
 924
 925       if (coding->src_pos < 0)
 926         coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
 927       else
 928         coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
 929     }
 930   else if (STRINGP (coding->src_object))
 931     {
 932       coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
 933     }
 934   else
 935     /* Otherwise, the source is C string and is never relocated
 936        automatically.  Thus we don't have to update anything.  */
 937     ;
 938 }
 939
 940 static void
 941 coding_set_destination (coding)
 942      struct coding_system *coding;
 943 {
 944   if (BUFFERP (coding->dst_object))
 945     {
 946       if (coding->src_pos < 0)
 947         {
 948           coding->destination = BEG_ADDR + coding->dst_pos_byte - 1;
 949           coding->dst_bytes = (GAP_END_ADDR
 950                                - (coding->src_bytes - coding->consumed)
 951                                - coding->destination);
 952         }
 953       else
 954         {
 955           /* We are sure that coding->dst_pos_byte is before the gap
 956              of the buffer. */
 957           coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
 958                                  + coding->dst_pos_byte - 1);
 959           coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
 960                                - coding->destination);
 961         }
 962     }
 963   else
 964     /* Otherwise, the destination is C string and is never relocated
 965        automatically.  Thus we don't have to update anything.  */
 966     ;
 967 }
 968
 969
 970 static void
 971 coding_alloc_by_realloc (coding, bytes)
 972      struct coding_system *coding;
 973      EMACS_INT bytes;
 974 {
 975   coding->destination = (unsigned char *) xrealloc (coding->destination,
 976                                                     coding->dst_bytes + bytes);
 977   coding->dst_bytes += bytes;
 978 }
 979
 980 static void
 981 coding_alloc_by_making_gap (coding, bytes)
 982      struct coding_system *coding;
 983      EMACS_INT bytes;
 984 {
 985   if (BUFFERP (coding->dst_object)
 986       && EQ (coding->src_object, coding->dst_object))
 987     {
 988       EMACS_INT add = coding->src_bytes - coding->consumed;
 989
 990       GAP_SIZE -= add; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
 991       make_gap (bytes);
 992       GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
 993     }
 994   else
 995     {
 996       Lisp_Object this_buffer;
 997
 998       this_buffer = Fcurrent_buffer ();
 999       set_buffer_internal (XBUFFER (coding->dst_object));
1000       make_gap (bytes);
1001       set_buffer_internal (XBUFFER (this_buffer));
1002     }
1003 }
1004
1005
1006 static unsigned char *
1007 alloc_destination (coding, nbytes, dst)
1008      struct coding_system *coding;
1009      EMACS_INT nbytes;
1010      unsigned char *dst;
1011 {
1012   EMACS_INT offset = dst - coding->destination;
1013
1014   if (BUFFERP (coding->dst_object))
1015     coding_alloc_by_making_gap (coding, nbytes);
1016   else
1017     coding_alloc_by_realloc (coding, nbytes);
1018   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1019   coding_set_destination (coding);
1020   dst = coding->destination + offset;
1021   return dst;
1022 }
1023
1024 /** Macros for annotations.  */
1025
1026 /* Maximum length of annotation data (sum of annotations for
1027    composition and charset).  */
1028 #define MAX_ANNOTATION_LENGTH (5 + (MAX_COMPOSITION_COMPONENTS * 2) - 1 + 5)
1029
1030 /* An annotation data is stored in the array coding->charbuf in this
1031    format:
1032      [ -LENGTH ANNOTATION_MASK FROM TO ... ]
1033    LENGTH is the number of elements in the annotation.
1034    ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
1035    FROM and TO specify the range of text annotated.  They are relative
1036    to coding->src_pos (on encoding) or coding->dst_pos (on decoding).
1037
1038    The format of the following elements depend on ANNOTATION_MASK.
1039
1040    In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
1041    follows:
1042      ... METHOD [ COMPOSITION-COMPONENTS ... ]
1043    METHOD is one of enum composition_method.
1044    Optionnal COMPOSITION-COMPONENTS are characters and composition
1045    rules.
1046
1047    In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1048    follows.  */
1049
1050 #define ADD_ANNOTATION_DATA(buf, len, mask, from, to)   \
1051   do {                                                  \
1052     *(buf)++ = -(len);                                  \
1053     *(buf)++ = (mask);                                  \
1054     *(buf)++ = (from);                                  \
1055     *(buf)++ = (to);                                    \
1056     coding->annotated = 1;                              \
1057   } while (0);
1058
1059 #define ADD_COMPOSITION_DATA(buf, from, to, method)                           \
1060   do {                                                                        \
1061     ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, from, to); \
1062     *buf++ = method;                                                          \
1063   } while (0)
1064
1065
1066 #define ADD_CHARSET_DATA(buf, from, to, id)                               \
1067   do {                                                                    \
1068     ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_CHARSET_MASK, from, to); \
1069     *buf++ = id;                                                          \
1070   } while (0)
1071
1072 \f
1073 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1074
1075
1076
1077 \f
1078 /*** 3. UTF-8 ***/
1079
1080 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1081    Check if a text is encoded in UTF-8.  If it is, return 1, else
1082    return 0.  */
1083
1084 #define UTF_8_1_OCTET_P(c)         ((c) < 0x80)
1085 #define UTF_8_EXTRA_OCTET_P(c)     (((c) & 0xC0) == 0x80)
1086 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
1087 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
1088 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
1089 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
1090
1091 static int
1092 detect_coding_utf_8 (coding, detect_info)
1093      struct coding_system *coding;
1094      struct coding_detection_info *detect_info;
1095 {
1096   const unsigned char *src = coding->source, *src_base;
1097   const unsigned char *src_end = coding->source + coding->src_bytes;
1098   int multibytep = coding->src_multibyte;
1099   int consumed_chars = 0;
1100   int found = 0;
1101
1102   detect_info->checked |= CATEGORY_MASK_UTF_8;
1103   /* A coding system of this category is always ASCII compatible.  */
1104   src += coding->head_ascii;
1105
1106   while (1)
1107     {
1108       int c, c1, c2, c3, c4;
1109
1110       src_base = src;
1111       ONE_MORE_BYTE (c);
1112       if (c < 0 || UTF_8_1_OCTET_P (c))
1113         continue;
1114       ONE_MORE_BYTE (c1);
1115       if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
1116         break;
1117       if (UTF_8_2_OCTET_LEADING_P (c))
1118         {
1119           found = CATEGORY_MASK_UTF_8;
1120           continue;
1121         }
1122       ONE_MORE_BYTE (c2);
1123       if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1124         break;
1125       if (UTF_8_3_OCTET_LEADING_P (c))
1126         {
1127           found = CATEGORY_MASK_UTF_8;
1128           continue;
1129         }
1130       ONE_MORE_BYTE (c3);
1131       if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1132         break;
1133       if (UTF_8_4_OCTET_LEADING_P (c))
1134         {
1135           found = CATEGORY_MASK_UTF_8;
1136           continue;
1137         }
1138       ONE_MORE_BYTE (c4);
1139       if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1140         break;
1141       if (UTF_8_5_OCTET_LEADING_P (c))
1142         {
1143           found = CATEGORY_MASK_UTF_8;
1144           continue;
1145         }
1146       break;
1147     }
1148   detect_info->rejected |= CATEGORY_MASK_UTF_8;
1149   return 0;
1150
1151  no_more_source:
1152   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1153     {
1154       detect_info->rejected |= CATEGORY_MASK_UTF_8;
1155       return 0;
1156     }
1157   detect_info->found |= found;
1158   return 1;
1159 }
1160
1161
1162 static void
1163 decode_coding_utf_8 (coding)
1164      struct coding_system *coding;
1165 {
1166   const unsigned char *src = coding->source + coding->consumed;
1167   const unsigned char *src_end = coding->source + coding->src_bytes;
1168   const unsigned char *src_base;
1169   int *charbuf = coding->charbuf;
1170   int *charbuf_end = charbuf + coding->charbuf_size;
1171   int consumed_chars = 0, consumed_chars_base;
1172   int multibytep = coding->src_multibyte;
1173   Lisp_Object attr, charset_list;
1174
1175   CODING_GET_INFO (coding, attr, charset_list);
1176
1177   while (1)
1178     {
1179       int c, c1, c2, c3, c4, c5;
1180
1181       src_base = src;
1182       consumed_chars_base = consumed_chars;
1183
1184       if (charbuf >= charbuf_end)
1185         break;
1186
1187       ONE_MORE_BYTE (c1);
1188       if (c1 < 0)
1189         {
1190           c = - c1;
1191         }
1192       else if (UTF_8_1_OCTET_P(c1))
1193         {
1194           c = c1;
1195         }
1196       else
1197         {
1198           ONE_MORE_BYTE (c2);
1199           if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1200             goto invalid_code;
1201           if (UTF_8_2_OCTET_LEADING_P (c1))
1202             {
1203               c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1204               /* Reject overlong sequences here and below.  Encoders
1205                  producing them are incorrect, they can be misleading,
1206                  and they mess up read/write invariance.  */
1207               if (c < 128)
1208                 goto invalid_code;
1209             }
1210           else
1211             {
1212               ONE_MORE_BYTE (c3);
1213               if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1214                 goto invalid_code;
1215               if (UTF_8_3_OCTET_LEADING_P (c1))
1216                 {
1217                   c = (((c1 & 0xF) << 12)
1218                        | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
1219                   if (c < 0x800
1220                       || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
1221                     goto invalid_code;
1222                 }
1223               else
1224                 {
1225                   ONE_MORE_BYTE (c4);
1226                   if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1227                     goto invalid_code;
1228                   if (UTF_8_4_OCTET_LEADING_P (c1))
1229                     {
1230                     c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1231                          | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
1232                     if (c < 0x10000)
1233                       goto invalid_code;
1234                     }
1235                   else
1236                     {
1237                       ONE_MORE_BYTE (c5);
1238                       if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
1239                         goto invalid_code;
1240                       if (UTF_8_5_OCTET_LEADING_P (c1))
1241                         {
1242                           c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1243                                | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1244                                | (c5 & 0x3F));
1245                           if ((c > MAX_CHAR) || (c < 0x200000))
1246                             goto invalid_code;
1247                         }
1248                       else
1249                         goto invalid_code;
1250                     }
1251                 }
1252             }
1253         }
1254
1255       *charbuf++ = c;
1256       continue;
1257
1258     invalid_code:
1259       src = src_base;
1260       consumed_chars = consumed_chars_base;
1261       ONE_MORE_BYTE (c);
1262       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1263       coding->errors++;
1264     }
1265
1266  no_more_source:
1267   coding->consumed_char += consumed_chars_base;
1268   coding->consumed = src_base - coding->source;
1269   coding->charbuf_used = charbuf - coding->charbuf;
1270 }
1271
1272
1273 static int
1274 encode_coding_utf_8 (coding)
1275      struct coding_system *coding;
1276 {
1277   int multibytep = coding->dst_multibyte;
1278   int *charbuf = coding->charbuf;
1279   int *charbuf_end = charbuf + coding->charbuf_used;
1280   unsigned char *dst = coding->destination + coding->produced;
1281   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1282   int produced_chars = 0;
1283   int c;
1284
1285   if (multibytep)
1286     {
1287       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1288
1289       while (charbuf < charbuf_end)
1290         {
1291           unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1292
1293           ASSURE_DESTINATION (safe_room);
1294           c = *charbuf++;
1295           if (CHAR_BYTE8_P (c))
1296             {
1297               c = CHAR_TO_BYTE8 (c);
1298               EMIT_ONE_BYTE (c);
1299             }
1300           else
1301             {
1302               CHAR_STRING_ADVANCE (c, pend);
1303               for (p = str; p < pend; p++)
1304                 EMIT_ONE_BYTE (*p);
1305             }
1306         }
1307     }
1308   else
1309     {
1310       int safe_room = MAX_MULTIBYTE_LENGTH;
1311
1312       while (charbuf < charbuf_end)
1313         {
1314           ASSURE_DESTINATION (safe_room);
1315           c = *charbuf++;
1316           dst += CHAR_STRING (c, dst);
1317           produced_chars++;
1318         }
1319     }
1320   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1321   coding->produced_char += produced_chars;
1322   coding->produced = dst - coding->destination;
1323   return 0;
1324 }
1325
1326
1327 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1328    Check if a text is encoded in one of UTF-16 based coding systems.
1329    If it is, return 1, else return 0.  */
1330
1331 #define UTF_16_HIGH_SURROGATE_P(val) \
1332   (((val) & 0xFC00) == 0xD800)
1333
1334 #define UTF_16_LOW_SURROGATE_P(val) \
1335   (((val) & 0xFC00) == 0xDC00)
1336
1337 #define UTF_16_INVALID_P(val)   \
1338   (((val) == 0xFFFE)            \
1339    || ((val) == 0xFFFF)         \
1340    || UTF_16_LOW_SURROGATE_P (val))
1341
1342
1343 static int
1344 detect_coding_utf_16 (coding, detect_info)
1345      struct coding_system *coding;
1346      struct coding_detection_info *detect_info;
1347 {
1348   const unsigned char *src = coding->source, *src_base = src;
1349   const unsigned char *src_end = coding->source + coding->src_bytes;
1350   int multibytep = coding->src_multibyte;
1351   int consumed_chars = 0;
1352   int c1, c2;
1353
1354   detect_info->checked |= CATEGORY_MASK_UTF_16;
1355   if (coding->mode & CODING_MODE_LAST_BLOCK
1356       && (coding->src_chars & 1))
1357     {
1358       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1359       return 0;
1360     }
1361
1362   ONE_MORE_BYTE (c1);
1363   ONE_MORE_BYTE (c2);
1364   if ((c1 == 0xFF) && (c2 == 0xFE))
1365     {
1366       detect_info->found |= (CATEGORY_MASK_UTF_16_LE
1367                              | CATEGORY_MASK_UTF_16_AUTO);
1368       detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
1369                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1370                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1371     }
1372   else if ((c1 == 0xFE) && (c2 == 0xFF))
1373     {
1374       detect_info->found |= (CATEGORY_MASK_UTF_16_BE
1375                              | CATEGORY_MASK_UTF_16_AUTO);
1376       detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
1377                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1378                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1379     }
1380   else if (c1 >= 0 && c2 >= 0)
1381     {
1382       unsigned char b1[256], b2[256];
1383       int b1_variants = 1, b2_variants = 1;
1384       int n;
1385
1386       bzero (b1, 256), bzero (b2, 256);
1387       b1[c1]++, b2[c2]++;
1388       for (n = 0; n < 256 && src < src_end; n++)
1389         {
1390           src_base = src;
1391           ONE_MORE_BYTE (c1);
1392           ONE_MORE_BYTE (c2);
1393           if (c1 < 0 || c2 < 0)
1394             break;
1395           if (! b1[c1++]) b1_variants++;
1396           if (! b2[c2++]) b2_variants++;
1397         }
1398       if (b1_variants < b2_variants)
1399         detect_info->found |= CATEGORY_MASK_UTF_16_BE_NOSIG;
1400       else
1401         detect_info->found |= CATEGORY_MASK_UTF_16_LE_NOSIG;
1402       detect_info->rejected
1403         |= (CATEGORY_MASK_UTF_16_BE | CATEGORY_MASK_UTF_16_LE);
1404     }
1405  no_more_source:
1406   return 1;
1407 }
1408
1409 static void
1410 decode_coding_utf_16 (coding)
1411      struct coding_system *coding;
1412 {
1413   const unsigned char *src = coding->source + coding->consumed;
1414   const unsigned char *src_end = coding->source + coding->src_bytes;
1415   const unsigned char *src_base;
1416   int *charbuf = coding->charbuf;
1417   int *charbuf_end = charbuf + coding->charbuf_size;
1418   int consumed_chars = 0, consumed_chars_base;
1419   int multibytep = coding->src_multibyte;
1420   enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding);
1421   enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1422   int surrogate = CODING_UTF_16_SURROGATE (coding);
1423   Lisp_Object attr, charset_list;
1424
1425   CODING_GET_INFO (coding, attr, charset_list);
1426
1427   if (bom == utf_16_with_bom)
1428     {
1429       int c, c1, c2;
1430
1431       src_base = src;
1432       ONE_MORE_BYTE (c1);
1433       ONE_MORE_BYTE (c2);
1434       c = (c1 << 8) | c2;
1435
1436       if (endian == utf_16_big_endian
1437           ? c != 0xFEFF : c != 0xFFFE)
1438         {
1439           /* The first two bytes are not BOM.  Treat them as bytes
1440              for a normal character.  */
1441           src = src_base;
1442           coding->errors++;
1443         }
1444       CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1445     }
1446   else if (bom == utf_16_detect_bom)
1447     {
1448       /* We have already tried to detect BOM and failed in
1449          detect_coding.  */
1450       CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1451     }
1452
1453   while (1)
1454     {
1455       int c, c1, c2;
1456
1457       src_base = src;
1458       consumed_chars_base = consumed_chars;
1459
1460       if (charbuf + 2 >= charbuf_end)
1461         break;
1462
1463       ONE_MORE_BYTE (c1);
1464       if (c1 < 0)
1465         {
1466           *charbuf++ = -c1;
1467           continue;
1468         }
1469       ONE_MORE_BYTE (c2);
1470       if (c2 < 0)
1471         {
1472           *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
1473           *charbuf++ = -c2;
1474           continue;
1475         }
1476       c = (endian == utf_16_big_endian
1477            ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1478       if (surrogate)
1479         {
1480           if (! UTF_16_LOW_SURROGATE_P (c))
1481             {
1482               if (endian == utf_16_big_endian)
1483                 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1484               else
1485                 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1486               *charbuf++ = c1;
1487               *charbuf++ = c2;
1488               coding->errors++;
1489               if (UTF_16_HIGH_SURROGATE_P (c))
1490                 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1491               else
1492                 *charbuf++ = c;
1493             }
1494           else
1495             {
1496               c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1497               CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1498               *charbuf++ = c;
1499             }
1500         }
1501       else
1502         {
1503           if (UTF_16_HIGH_SURROGATE_P (c))
1504             CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1505           else
1506             *charbuf++ = c;
1507         }
1508     }
1509
1510  no_more_source:
1511   coding->consumed_char += consumed_chars_base;
1512   coding->consumed = src_base - coding->source;
1513   coding->charbuf_used = charbuf - coding->charbuf;
1514 }
1515
1516 static int
1517 encode_coding_utf_16 (coding)
1518      struct coding_system *coding;
1519 {
1520   int multibytep = coding->dst_multibyte;
1521   int *charbuf = coding->charbuf;
1522   int *charbuf_end = charbuf + coding->charbuf_used;
1523   unsigned char *dst = coding->destination + coding->produced;
1524   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1525   int safe_room = 8;
1526   enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding);
1527   int big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1528   int produced_chars = 0;
1529   Lisp_Object attrs, charset_list;
1530   int c;
1531
1532   CODING_GET_INFO (coding, attrs, charset_list);
1533
1534   if (bom != utf_16_without_bom)
1535     {
1536       ASSURE_DESTINATION (safe_room);
1537       if (big_endian)
1538         EMIT_TWO_BYTES (0xFE, 0xFF);
1539       else
1540         EMIT_TWO_BYTES (0xFF, 0xFE);
1541       CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1542     }
1543
1544   while (charbuf < charbuf_end)
1545     {
1546       ASSURE_DESTINATION (safe_room);
1547       c = *charbuf++;
1548       if (c >= MAX_UNICODE_CHAR)
1549         c = coding->default_char;
1550
1551       if (c < 0x10000)
1552         {
1553           if (big_endian)
1554             EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1555           else
1556             EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1557         }
1558       else
1559         {
1560           int c1, c2;
1561
1562           c -= 0x10000;
1563           c1 = (c >> 10) + 0xD800;
1564           c2 = (c & 0x3FF) + 0xDC00;
1565           if (big_endian)
1566             EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1567           else
1568             EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1569         }
1570     }
1571   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1572   coding->produced = dst - coding->destination;
1573   coding->produced_char += produced_chars;
1574   return 0;
1575 }
1576
1577 \f
1578 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1579
1580 /* Emacs' internal format for representation of multiple character
1581    sets is a kind of multi-byte encoding, i.e. characters are
1582    represented by variable-length sequences of one-byte codes.
1583
1584    ASCII characters and control characters (e.g. `tab', `newline') are
1585    represented by one-byte sequences which are their ASCII codes, in
1586    the range 0x00 through 0x7F.
1587
1588    8-bit characters of the range 0x80..0x9F are represented by
1589    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1590    code + 0x20).
1591
1592    8-bit characters of the range 0xA0..0xFF are represented by
1593    one-byte sequences which are their 8-bit code.
1594
1595    The other characters are represented by a sequence of `base
1596    leading-code', optional `extended leading-code', and one or two
1597    `position-code's.  The length of the sequence is determined by the
1598    base leading-code.  Leading-code takes the range 0x81 through 0x9D,
1599    whereas extended leading-code and position-code take the range 0xA0
1600    through 0xFF.  See `charset.h' for more details about leading-code
1601    and position-code.
1602
1603    --- CODE RANGE of Emacs' internal format ---
1604    character set        range
1605    -------------        -----
1606    ascii                0x00..0x7F
1607    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1608    eight-bit-graphic    0xA0..0xBF
1609    ELSE                 0x81..0x9D + [0xA0..0xFF]+
1610    ---------------------------------------------
1611
1612    As this is the internal character representation, the format is
1613    usually not used externally (i.e. in a file or in a data sent to a
1614    process).  But, it is possible to have a text externally in this
1615    format (i.e. by encoding by the coding system `emacs-mule').
1616
1617    In that case, a sequence of one-byte codes has a slightly different
1618    form.
1619
1620    At first, all characters in eight-bit-control are represented by
1621    one-byte sequences which are their 8-bit code.
1622
1623    Next, character composition data are represented by the byte
1624    sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1625    where,
1626         METHOD is 0xF0 plus one of composition method (enum
1627         composition_method),
1628
1629         BYTES is 0xA0 plus a byte length of this composition data,
1630
1631         CHARS is 0x20 plus a number of characters composed by this
1632         data,
1633
1634         COMPONENTs are characters of multibye form or composition
1635         rules encoded by two-byte of ASCII codes.
1636
1637    In addition, for backward compatibility, the following formats are
1638    also recognized as composition data on decoding.
1639
1640    0x80 MSEQ ...
1641    0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1642
1643    Here,
1644         MSEQ is a multibyte form but in these special format:
1645           ASCII: 0xA0 ASCII_CODE+0x80,
1646           other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1647         RULE is a one byte code of the range 0xA0..0xF0 that
1648         represents a composition rule.
1649   */
1650
1651 char emacs_mule_bytes[256];
1652
1653 int
1654 emacs_mule_char (coding, src, nbytes, nchars, id)
1655      struct coding_system *coding;
1656      const unsigned char *src;
1657      int *nbytes, *nchars, *id;
1658 {
1659   const unsigned char *src_end = coding->source + coding->src_bytes;
1660   const unsigned char *src_base = src;
1661   int multibytep = coding->src_multibyte;
1662   struct charset *charset;
1663   unsigned code;
1664   int c;
1665   int consumed_chars = 0;
1666
1667   ONE_MORE_BYTE (c);
1668   if (c < 0)
1669     {
1670       c = -c;
1671       charset = emacs_mule_charset[0];
1672     }
1673   else
1674     {
1675       switch (emacs_mule_bytes[c])
1676         {
1677         case 2:
1678           if (! (charset = emacs_mule_charset[c]))
1679             goto invalid_code;
1680           ONE_MORE_BYTE (c);
1681           if (c < 0)
1682             goto invalid_code;
1683           code = c & 0x7F;
1684           break;
1685
1686         case 3:
1687           if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
1688               || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
1689             {
1690               ONE_MORE_BYTE (c);
1691               if (c < 0 || ! (charset = emacs_mule_charset[c]))
1692                 goto invalid_code;
1693               ONE_MORE_BYTE (c);
1694               if (c < 0)
1695                 goto invalid_code;
1696               code = c & 0x7F;
1697             }
1698           else
1699             {
1700               if (! (charset = emacs_mule_charset[c]))
1701                 goto invalid_code;
1702               ONE_MORE_BYTE (c);
1703               if (c < 0)
1704                 goto invalid_code;
1705               code = (c & 0x7F) << 8;
1706               ONE_MORE_BYTE (c);
1707               if (c < 0)
1708                 goto invalid_code;
1709               code |= c & 0x7F;
1710             }
1711           break;
1712
1713         case 4:
1714           ONE_MORE_BYTE (c);
1715           if (c < 0 || ! (charset = emacs_mule_charset[c]))
1716             goto invalid_code;
1717           ONE_MORE_BYTE (c);
1718           if (c < 0)
1719             goto invalid_code;
1720           code = (c & 0x7F) << 8;
1721           ONE_MORE_BYTE (c);
1722           if (c < 0)
1723             goto invalid_code;
1724           code |= c & 0x7F;
1725           break;
1726
1727         case 1:
1728           code = c;
1729           charset = CHARSET_FROM_ID (ASCII_BYTE_P (code)
1730                                      ? charset_ascii : charset_eight_bit);
1731           break;
1732
1733         default:
1734           abort ();
1735         }
1736       c = DECODE_CHAR (charset, code);
1737       if (c < 0)
1738         goto invalid_code;
1739     }
1740   *nbytes = src - src_base;
1741   *nchars = consumed_chars;
1742   if (id)
1743     *id = charset->id;
1744   return c;
1745
1746  no_more_source:
1747   return -2;
1748
1749  invalid_code:
1750   return -1;
1751 }
1752
1753
1754 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1755    Check if a text is encoded in `emacs-mule'.  If it is, return 1,
1756    else return 0.  */
1757
1758 static int
1759 detect_coding_emacs_mule (coding, detect_info)
1760      struct coding_system *coding;
1761      struct coding_detection_info *detect_info;
1762 {
1763   const unsigned char *src = coding->source, *src_base;
1764   const unsigned char *src_end = coding->source + coding->src_bytes;
1765   int multibytep = coding->src_multibyte;
1766   int consumed_chars = 0;
1767   int c;
1768   int found = 0;
1769
1770   detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1771   /* A coding system of this category is always ASCII compatible.  */
1772   src += coding->head_ascii;
1773
1774   while (1)
1775     {
1776       src_base = src;
1777       ONE_MORE_BYTE (c);
1778       if (c < 0)
1779         continue;
1780       if (c == 0x80)
1781         {
1782           /* Perhaps the start of composite character.  We simple skip
1783              it because analyzing it is too heavy for detecting.  But,
1784              at least, we check that the composite character
1785              constitues of more than 4 bytes.  */
1786           const unsigned char *src_base;
1787
1788         repeat:
1789           src_base = src;
1790           do
1791             {
1792               ONE_MORE_BYTE (c);
1793             }
1794           while (c >= 0xA0);
1795
1796           if (src - src_base <= 4)
1797             break;
1798           found = CATEGORY_MASK_EMACS_MULE;
1799           if (c == 0x80)
1800             goto repeat;
1801         }
1802
1803       if (c < 0x80)
1804         {
1805           if (c < 0x20
1806               && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
1807             break;
1808         }
1809       else
1810         {
1811           const unsigned char *src_base = src - 1;
1812
1813           do
1814             {
1815               ONE_MORE_BYTE (c);
1816             }
1817           while (c >= 0xA0);
1818           if (src - src_base != emacs_mule_bytes[*src_base])
1819             break;
1820           found = CATEGORY_MASK_EMACS_MULE;
1821         }
1822     }
1823   detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1824   return 0;
1825
1826  no_more_source:
1827   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1828     {
1829       detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1830       return 0;
1831     }
1832   detect_info->found |= found;
1833   return 1;
1834 }
1835
1836
1837 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
1838
1839 /* Decode a character represented as a component of composition
1840    sequence of Emacs 20/21 style at SRC.  Set C to that character and
1841    update SRC to the head of next character (or an encoded composition
1842    rule).  If SRC doesn't points a composition component, set C to -1.
1843    If SRC points an invalid byte sequence, global exit by a return
1844    value 0.  */
1845
1846 #define DECODE_EMACS_MULE_COMPOSITION_CHAR(buf)                 \
1847   if (1)                                                        \
1848     {                                                           \
1849       int c;                                                    \
1850       int nbytes, nchars;                                       \
1851                                                                 \
1852       if (src == src_end)                                       \
1853         break;                                                  \
1854       c = emacs_mule_char (coding, src, &nbytes, &nchars, NULL);\
1855       if (c < 0)                                                \
1856         {                                                       \
1857           if (c == -2)                                          \
1858             break;                                              \
1859           goto invalid_code;                                    \
1860         }                                                       \
1861       *buf++ = c;                                               \
1862       src += nbytes;                                            \
1863       consumed_chars += nchars;                                 \
1864     }                                                           \
1865   else
1866
1867
1868 /* Decode a composition rule represented as a component of composition
1869    sequence of Emacs 20 style at SRC.  Store the decoded rule in *BUF,
1870    and increment BUF.  If SRC points an invalid byte sequence, set C
1871    to -1.  */
1872
1873 #define DECODE_EMACS_MULE_COMPOSITION_RULE_20(buf)      \
1874   do {                                                  \
1875     int c, gref, nref;                                  \
1876                                                         \
1877     if (src >= src_end)                                 \
1878       goto invalid_code;                                \
1879     ONE_MORE_BYTE_NO_CHECK (c);                         \
1880     c -= 0x20;                                          \
1881     if (c < 0 || c >= 81)                               \
1882       goto invalid_code;                                \
1883                                                         \
1884     gref = c / 9, nref = c % 9;                         \
1885     *buf++ = COMPOSITION_ENCODE_RULE (gref, nref);      \
1886   } while (0)
1887
1888
1889 /* Decode a composition rule represented as a component of composition
1890    sequence of Emacs 21 style at SRC.  Store the decoded rule in *BUF,
1891    and increment BUF.  If SRC points an invalid byte sequence, set C
1892    to -1.  */
1893
1894 #define DECODE_EMACS_MULE_COMPOSITION_RULE_21(buf)      \
1895   do {                                                  \
1896     int gref, nref;                                     \
1897                                                         \
1898     if (src + 1>= src_end)                              \
1899       goto invalid_code;                                \
1900     ONE_MORE_BYTE_NO_CHECK (gref);                      \
1901     gref -= 0x20;                                       \
1902     ONE_MORE_BYTE_NO_CHECK (nref);                      \
1903     nref -= 0x20;                                       \
1904     if (gref < 0 || gref >= 81                          \
1905         || nref < 0 || nref >= 81)                      \
1906       goto invalid_code;                                \
1907     *buf++ = COMPOSITION_ENCODE_RULE (gref, nref);      \
1908   } while (0)
1909
1910
1911 #define DECODE_EMACS_MULE_21_COMPOSITION(c)                             \
1912   do {                                                                  \
1913     /* Emacs 21 style format.  The first three bytes at SRC are         \
1914        (METHOD - 0xF2), (BYTES - 0xA0), (CHARS - 0xA0), where BYTES is  \
1915        the byte length of this composition information, CHARS is the    \
1916        number of characters composed by this composition.  */           \
1917     enum composition_method method = c - 0xF2;                          \
1918     int *charbuf_base = charbuf;                                        \
1919     int from, to;                                                       \
1920     int consumed_chars_limit;                                           \
1921     int nbytes, nchars;                                                 \
1922                                                                         \
1923     ONE_MORE_BYTE (c);                                                  \
1924     if (c < 0)                                                          \
1925       goto invalid_code;                                                \
1926     nbytes = c - 0xA0;                                                  \
1927     if (nbytes < 3)                                                     \
1928       goto invalid_code;                                                \
1929     ONE_MORE_BYTE (c);                                                  \
1930     if (c < 0)                                                          \
1931       goto invalid_code;                                                \
1932     nchars = c - 0xA0;                                                  \
1933     from = coding->produced + char_offset;                              \
1934     to = from + nchars;                                                 \
1935     ADD_COMPOSITION_DATA (charbuf, from, to, method);                   \
1936     consumed_chars_limit = consumed_chars_base + nbytes;                \
1937     if (method != COMPOSITION_RELATIVE)                                 \
1938       {                                                                 \
1939         int i = 0;                                                      \
1940         while (consumed_chars < consumed_chars_limit)                   \
1941           {                                                             \
1942             if (i % 2 && method != COMPOSITION_WITH_ALTCHARS)           \
1943               DECODE_EMACS_MULE_COMPOSITION_RULE_21 (charbuf);          \
1944             else                                                        \
1945               DECODE_EMACS_MULE_COMPOSITION_CHAR (charbuf);             \
1946             i++;                                                        \
1947           }                                                             \
1948         if (consumed_chars < consumed_chars_limit)                      \
1949           goto invalid_code;                                            \
1950         charbuf_base[0] -= i;                                           \
1951       }                                                                 \
1952   } while (0)
1953
1954
1955 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION(c)            \
1956   do {                                                          \
1957     /* Emacs 20 style format for relative composition.  */      \
1958     /* Store multibyte form of characters to be composed.  */   \
1959     enum composition_method method = COMPOSITION_RELATIVE;      \
1960     int components[MAX_COMPOSITION_COMPONENTS * 2 - 1];         \
1961     int *buf = components;                                      \
1962     int i, j;                                                   \
1963     int from, to;                                               \
1964                                                                 \
1965     src = src_base;                                             \
1966     ONE_MORE_BYTE (c);          /* skip 0x80 */                 \
1967     for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++)            \
1968       DECODE_EMACS_MULE_COMPOSITION_CHAR (buf);                 \
1969     if (i < 2)                                                  \
1970       goto invalid_code;                                        \
1971     from = coding->produced_char + char_offset;                 \
1972     to = from + i;                                              \
1973     ADD_COMPOSITION_DATA (charbuf, from, to, method);           \
1974     for (j = 0; j < i; j++)                                     \
1975       *charbuf++ = components[j];                               \
1976   } while (0)
1977
1978
1979 #define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION(c)            \
1980   do {                                                          \
1981     /* Emacs 20 style format for rule-base composition.  */     \
1982     /* Store multibyte form of characters to be composed.  */   \
1983     enum composition_method method = COMPOSITION_WITH_RULE;     \
1984     int components[MAX_COMPOSITION_COMPONENTS * 2 - 1];         \
1985     int *buf = components;                                      \
1986     int i, j;                                                   \
1987     int from, to;                                               \
1988                                                                 \
1989     DECODE_EMACS_MULE_COMPOSITION_CHAR (buf);                   \
1990     for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++)            \
1991       {                                                         \
1992         DECODE_EMACS_MULE_COMPOSITION_RULE_20 (buf);            \
1993         DECODE_EMACS_MULE_COMPOSITION_CHAR (buf);               \
1994       }                                                         \
1995     if (i < 1 || (buf - components) % 2 == 0)                   \
1996       goto invalid_code;                                        \
1997     if (charbuf + i + (i / 2) + 1 < charbuf_end)                \
1998       goto no_more_source;                                      \
1999     from = coding->produced_char + char_offset;                 \
2000     to = from + i;                                              \
2001     ADD_COMPOSITION_DATA (buf, from, to, method);               \
2002     for (j = 0; j < i; j++)                                     \
2003       *charbuf++ = components[j];                               \
2004     for (j = 0; j < i; j += 2)                                  \
2005       *charbuf++ = components[j];                               \
2006   } while (0)
2007
2008
2009 static void
2010 decode_coding_emacs_mule (coding)
2011      struct coding_system *coding;
2012 {
2013   const unsigned char *src = coding->source + coding->consumed;
2014   const unsigned char *src_end = coding->source + coding->src_bytes;
2015   const unsigned char *src_base;
2016   int *charbuf = coding->charbuf;
2017   int *charbuf_end = charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
2018   int consumed_chars = 0, consumed_chars_base;
2019   int multibytep = coding->src_multibyte;
2020   Lisp_Object attrs, charset_list;
2021   int char_offset = coding->produced_char;
2022   int last_offset = char_offset;
2023   int last_id = charset_ascii;
2024
2025   CODING_GET_INFO (coding, attrs, charset_list);
2026
2027   while (1)
2028     {
2029       int c;
2030
2031       src_base = src;
2032       consumed_chars_base = consumed_chars;
2033
2034       if (charbuf >= charbuf_end)
2035         break;
2036
2037       ONE_MORE_BYTE (c);
2038       if (c < 0)
2039         {
2040           *charbuf++ = -c;
2041           char_offset++;
2042         }
2043       else if (c < 0x80)
2044         {
2045           *charbuf++ = c;
2046           char_offset++;
2047         }
2048       else if (c == 0x80)
2049         {
2050           ONE_MORE_BYTE (c);
2051           if (c < 0)
2052             goto invalid_code;
2053           if (c - 0xF2 >= COMPOSITION_RELATIVE
2054               && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)
2055             DECODE_EMACS_MULE_21_COMPOSITION (c);
2056           else if (c < 0xC0)
2057             DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION (c);
2058           else if (c == 0xFF)
2059             DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION (c);
2060           else
2061             goto invalid_code;
2062         }
2063       else if (c < 0xA0 && emacs_mule_bytes[c] > 1)
2064         {
2065           int nbytes, nchars;
2066           int id;
2067
2068           src = src_base;
2069           consumed_chars = consumed_chars_base;
2070           c = emacs_mule_char (coding, src, &nbytes, &nchars, &id);
2071           if (c < 0)
2072             {
2073               if (c == -2)
2074                 break;
2075               goto invalid_code;
2076             }
2077           if (last_id != id)
2078             {
2079               if (last_id != charset_ascii)
2080                 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
2081               last_id = id;
2082               last_offset = char_offset;
2083             }
2084           *charbuf++ = c;
2085           src += nbytes;
2086           consumed_chars += nchars;
2087           char_offset++;
2088         }
2089       continue;
2090
2091     invalid_code:
2092       src = src_base;
2093       consumed_chars = consumed_chars_base;
2094       ONE_MORE_BYTE (c);
2095       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
2096       char_offset++;
2097       coding->errors++;
2098     }
2099
2100  no_more_source:
2101   if (last_id != charset_ascii)
2102     ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
2103   coding->consumed_char += consumed_chars_base;
2104   coding->consumed = src_base - coding->source;
2105   coding->charbuf_used = charbuf - coding->charbuf;
2106 }
2107
2108
2109 #define EMACS_MULE_LEADING_CODES(id, codes)     \
2110   do {                                          \
2111     if (id < 0xA0)                              \
2112       codes[0] = id, codes[1] = 0;              \
2113     else if (id < 0xE0)                         \
2114       codes[0] = 0x9A, codes[1] = id;           \
2115     else if (id < 0xF0)                         \
2116       codes[0] = 0x9B, codes[1] = id;           \
2117     else if (id < 0xF5)                         \
2118       codes[0] = 0x9C, codes[1] = id;           \
2119     else                                        \
2120       codes[0] = 0x9D, codes[1] = id;           \
2121   } while (0);
2122
2123
2124 static int
2125 encode_coding_emacs_mule (coding)
2126      struct coding_system *coding;
2127 {
2128   int multibytep = coding->dst_multibyte;
2129   int *charbuf = coding->charbuf;
2130   int *charbuf_end = charbuf + coding->charbuf_used;
2131   unsigned char *dst = coding->destination + coding->produced;
2132   unsigned char *dst_end = coding->destination + coding->dst_bytes;
2133   int safe_room = 8;
2134   int produced_chars = 0;
2135   Lisp_Object attrs, charset_list;
2136   int c;
2137   int preferred_charset_id = -1;
2138
2139   CODING_GET_INFO (coding, attrs, charset_list);
2140   if (! EQ (charset_list, Vemacs_mule_charset_list))
2141     {
2142       CODING_ATTR_CHARSET_LIST (attrs)
2143         = charset_list = Vemacs_mule_charset_list;
2144     }
2145
2146   while (charbuf < charbuf_end)
2147     {
2148       ASSURE_DESTINATION (safe_room);
2149       c = *charbuf++;
2150
2151       if (c < 0)
2152         {
2153           /* Handle an annotation.  */
2154           switch (*charbuf)
2155             {
2156             case CODING_ANNOTATE_COMPOSITION_MASK:
2157               /* Not yet implemented.  */
2158               break;
2159             case CODING_ANNOTATE_CHARSET_MASK:
2160               preferred_charset_id = charbuf[3];
2161               if (preferred_charset_id >= 0
2162                   && NILP (Fmemq (make_number (preferred_charset_id),
2163                                   charset_list)))
2164                 preferred_charset_id = -1;
2165               break;
2166             default:
2167               abort ();
2168             }
2169           charbuf += -c - 1;
2170           continue;
2171         }
2172
2173       if (ASCII_CHAR_P (c))
2174         EMIT_ONE_ASCII_BYTE (c);
2175       else if (CHAR_BYTE8_P (c))
2176         {
2177           c = CHAR_TO_BYTE8 (c);
2178           EMIT_ONE_BYTE (c);
2179         }
2180       else
2181         {
2182           struct charset *charset;
2183           unsigned code;
2184           int dimension;
2185           int emacs_mule_id;
2186           unsigned char leading_codes[2];
2187
2188           if (preferred_charset_id >= 0)
2189             {
2190               charset = CHARSET_FROM_ID (preferred_charset_id);
2191               if (! CHAR_CHARSET_P (c, charset))
2192                 charset = char_charset (c, charset_list, NULL);
2193             }
2194           else
2195             charset = char_charset (c, charset_list, &code);
2196           if (! charset)
2197             {
2198               c = coding->default_char;
2199               if (ASCII_CHAR_P (c))
2200                 {
2201                   EMIT_ONE_ASCII_BYTE (c);
2202                   continue;
2203                 }
2204               charset = char_charset (c, charset_list, &code);
2205             }
2206           dimension = CHARSET_DIMENSION (charset);
2207           emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2208           EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2209           EMIT_ONE_BYTE (leading_codes[0]);
2210           if (leading_codes[1])
2211             EMIT_ONE_BYTE (leading_codes[1]);
2212           if (dimension == 1)
2213             EMIT_ONE_BYTE (code | 0x80);
2214           else
2215             {
2216               code |= 0x8080;
2217               EMIT_ONE_BYTE (code >> 8);
2218               EMIT_ONE_BYTE (code & 0xFF);
2219             }
2220         }
2221     }
2222   record_conversion_result (coding, CODING_RESULT_SUCCESS);
2223   coding->produced_char += produced_chars;
2224   coding->produced = dst - coding->destination;
2225   return 0;
2226 }
2227
2228 \f
2229 /*** 7. ISO2022 handlers ***/
2230
2231 /* The following note describes the coding system ISO2022 briefly.
2232    Since the intention of this note is to help understand the
2233    functions in this file, some parts are NOT ACCURATE or are OVERLY
2234    SIMPLIFIED.  For thorough understanding, please refer to the
2235    original document of ISO2022.  This is equivalent to the standard
2236    ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2237
2238    ISO2022 provides many mechanisms to encode several character sets
2239    in 7-bit and 8-bit environments.  For 7-bit environments, all text
2240    is encoded using bytes less than 128.  This may make the encoded
2241    text a little bit longer, but the text passes more easily through
2242    several types of gateway, some of which strip off the MSB (Most
2243    Significant Bit).
2244
2245    There are two kinds of character sets: control character sets and
2246    graphic character sets.  The former contain control characters such
2247    as `newline' and `escape' to provide control functions (control
2248    functions are also provided by escape sequences).  The latter
2249    contain graphic characters such as 'A' and '-'.  Emacs recognizes
2250    two control character sets and many graphic character sets.
2251
2252    Graphic character sets are classified into one of the following
2253    four classes, according to the number of bytes (DIMENSION) and
2254    number of characters in one dimension (CHARS) of the set:
2255    - DIMENSION1_CHARS94
2256    - DIMENSION1_CHARS96
2257    - DIMENSION2_CHARS94
2258    - DIMENSION2_CHARS96
2259
2260    In addition, each character set is assigned an identification tag,
2261    unique for each set, called the "final character" (denoted as <F>
2262    hereafter).  The <F> of each character set is decided by ECMA(*)
2263    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
2264    (0x30..0x3F are for private use only).
2265
2266    Note (*): ECMA = European Computer Manufacturers Association
2267
2268    Here are examples of graphic character sets [NAME(<F>)]:
2269         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2270         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2271         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2272         o DIMENSION2_CHARS96 -- none for the moment
2273
2274    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2275         C0 [0x00..0x1F] -- control character plane 0
2276         GL [0x20..0x7F] -- graphic character plane 0
2277         C1 [0x80..0x9F] -- control character plane 1
2278         GR [0xA0..0xFF] -- graphic character plane 1
2279
2280    A control character set is directly designated and invoked to C0 or
2281    C1 by an escape sequence.  The most common case is that:
2282    - ISO646's  control character set is designated/invoked to C0, and
2283    - ISO6429's control character set is designated/invoked to C1,
2284    and usually these designations/invocations are omitted in encoded
2285    text.  In a 7-bit environment, only C0 can be used, and a control
2286    character for C1 is encoded by an appropriate escape sequence to
2287    fit into the environment.  All control characters for C1 are
2288    defined to have corresponding escape sequences.
2289
2290    A graphic character set is at first designated to one of four
2291    graphic registers (G0 through G3), then these graphic registers are
2292    invoked to GL or GR.  These designations and invocations can be
2293    done independently.  The most common case is that G0 is invoked to
2294    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
2295    these invocations and designations are omitted in encoded text.
2296    In a 7-bit environment, only GL can be used.
2297
2298    When a graphic character set of CHARS94 is invoked to GL, codes
2299    0x20 and 0x7F of the GL area work as control characters SPACE and
2300    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2301    be used.
2302
2303    There are two ways of invocation: locking-shift and single-shift.
2304    With locking-shift, the invocation lasts until the next different
2305    invocation, whereas with single-shift, the invocation affects the
2306    following character only and doesn't affect the locking-shift
2307    state.  Invocations are done by the following control characters or
2308    escape sequences:
2309
2310    ----------------------------------------------------------------------
2311    abbrev  function                  cntrl escape seq   description
2312    ----------------------------------------------------------------------
2313    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
2314    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
2315    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
2316    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
2317    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
2318    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
2319    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
2320    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
2321    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
2322    ----------------------------------------------------------------------
2323    (*) These are not used by any known coding system.
2324
2325    Control characters for these functions are defined by macros
2326    ISO_CODE_XXX in `coding.h'.
2327
2328    Designations are done by the following escape sequences:
2329    ----------------------------------------------------------------------
2330    escape sequence      description
2331    ----------------------------------------------------------------------
2332    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
2333    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
2334    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
2335    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
2336    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
2337    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
2338    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
2339    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
2340    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
2341    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
2342    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
2343    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
2344    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
2345    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
2346    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
2347    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
2348    ----------------------------------------------------------------------
2349
2350    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2351    of dimension 1, chars 94, and final character <F>, etc...
2352
2353    Note (*): Although these designations are not allowed in ISO2022,
2354    Emacs accepts them on decoding, and produces them on encoding
2355    CHARS96 character sets in a coding system which is characterized as
2356    7-bit environment, non-locking-shift, and non-single-shift.
2357
2358    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2359    '(' must be omitted.  We refer to this as "short-form" hereafter.
2360
2361    Now you may notice that there are a lot of ways of encoding the
2362    same multilingual text in ISO2022.  Actually, there exist many
2363    coding systems such as Compound Text (used in X11's inter client
2364    communication, ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2365    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2366    localized platforms), and all of these are variants of ISO2022.
2367
2368    In addition to the above, Emacs handles two more kinds of escape
2369    sequences: ISO6429's direction specification and Emacs' private
2370    sequence for specifying character composition.
2371
2372    ISO6429's direction specification takes the following form:
2373         o CSI ']'      -- end of the current direction
2374         o CSI '0' ']'  -- end of the current direction
2375         o CSI '1' ']'  -- start of left-to-right text
2376         o CSI '2' ']'  -- start of right-to-left text
2377    The control character CSI (0x9B: control sequence introducer) is
2378    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2379
2380    Character composition specification takes the following form:
2381         o ESC '0' -- start relative composition
2382         o ESC '1' -- end composition
2383         o ESC '2' -- start rule-base composition (*)
2384         o ESC '3' -- start relative composition with alternate chars  (**)
2385         o ESC '4' -- start rule-base composition with alternate chars  (**)
2386   Since these are not standard escape sequences of any ISO standard,
2387   the use of them with these meanings is restricted to Emacs only.
2388
2389   (*) This form is used only in Emacs 20.7 and older versions,
2390   but newer versions can safely decode it.
2391   (**) This form is used only in Emacs 21.1 and newer versions,
2392   and older versions can't decode it.
2393
2394   Here's a list of example usages of these composition escape
2395   sequences (categorized by `enum composition_method').
2396
2397   COMPOSITION_RELATIVE:
2398         ESC 0 CHAR [ CHAR ] ESC 1
2399   COMPOSITION_WITH_RULE:
2400         ESC 2 CHAR [ RULE CHAR ] ESC 1
2401   COMPOSITION_WITH_ALTCHARS:
2402         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2403   COMPOSITION_WITH_RULE_ALTCHARS:
2404         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2405
2406 enum iso_code_class_type iso_code_class[256];
2407
2408 #define SAFE_CHARSET_P(coding, id)      \
2409   ((id) <= (coding)->max_charset_id     \
2410    && (coding)->safe_charsets[id] >= 0)
2411
2412
2413 #define SHIFT_OUT_OK(category)  \
2414   (CODING_ISO_INITIAL (&coding_categories[category], 1) >= 0)
2415
2416 static void
2417 setup_iso_safe_charsets (attrs)
2418      Lisp_Object attrs;
2419 {
2420   Lisp_Object charset_list, safe_charsets;
2421   Lisp_Object request;
2422   Lisp_Object reg_usage;
2423   Lisp_Object tail;
2424   int reg94, reg96;
2425   int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2426   int max_charset_id;
2427
2428   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2429   if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2430       && ! EQ (charset_list, Viso_2022_charset_list))
2431     {
2432       CODING_ATTR_CHARSET_LIST (attrs)
2433         = charset_list = Viso_2022_charset_list;
2434       ASET (attrs, coding_attr_safe_charsets, Qnil);
2435     }
2436
2437   if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2438     return;
2439
2440   max_charset_id = 0;
2441   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2442     {
2443       int id = XINT (XCAR (tail));
2444       if (max_charset_id < id)
2445         max_charset_id = id;
2446     }
2447
2448   safe_charsets = Fmake_string (make_number (max_charset_id + 1),
2449                                 make_number (255));
2450   request = AREF (attrs, coding_attr_iso_request);
2451   reg_usage = AREF (attrs, coding_attr_iso_usage);
2452   reg94 = XINT (XCAR (reg_usage));
2453   reg96 = XINT (XCDR (reg_usage));
2454
2455   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2456     {
2457       Lisp_Object id;
2458       Lisp_Object reg;
2459       struct charset *charset;
2460
2461       id = XCAR (tail);
2462       charset = CHARSET_FROM_ID (XINT (id));
2463       reg = Fcdr (Fassq (id, request));
2464       if (! NILP (reg))
2465         SSET (safe_charsets, XINT (id), XINT (reg));
2466       else if (charset->iso_chars_96)
2467         {
2468           if (reg96 < 4)
2469             SSET (safe_charsets, XINT (id), reg96);
2470         }
2471       else
2472         {
2473           if (reg94 < 4)
2474             SSET (safe_charsets, XINT (id), reg94);
2475         }
2476     }
2477   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2478 }
2479
2480
2481 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2482    Check if a text is encoded in one of ISO-2022 based codig systems.
2483    If it is, return 1, else return 0.  */
2484
2485 static int
2486 detect_coding_iso_2022 (coding, detect_info)
2487      struct coding_system *coding;
2488      struct coding_detection_info *detect_info;
2489 {
2490   const unsigned char *src = coding->source, *src_base = src;
2491   const unsigned char *src_end = coding->source + coding->src_bytes;
2492   int multibytep = coding->src_multibyte;
2493   int single_shifting = 0;
2494   int id;
2495   int c, c1;
2496   int consumed_chars = 0;
2497   int i;
2498   int rejected = 0;
2499   int found = 0;
2500
2501   detect_info->checked |= CATEGORY_MASK_ISO;
2502
2503   for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
2504     {
2505       struct coding_system *this = &(coding_categories[i]);
2506       Lisp_Object attrs, val;
2507
2508       attrs = CODING_ID_ATTRS (this->id);
2509       if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2510           && ! EQ (CODING_ATTR_SAFE_CHARSETS (attrs), Viso_2022_charset_list))
2511         setup_iso_safe_charsets (attrs);
2512       val = CODING_ATTR_SAFE_CHARSETS (attrs);
2513       this->max_charset_id = SCHARS (val) - 1;
2514       this->safe_charsets = (char *) SDATA (val);
2515     }
2516
2517   /* A coding system of this category is always ASCII compatible.  */
2518   src += coding->head_ascii;
2519
2520   while (rejected != CATEGORY_MASK_ISO)
2521     {
2522       src_base = src;
2523       ONE_MORE_BYTE (c);
2524       switch (c)
2525         {
2526         case ISO_CODE_ESC:
2527           if (inhibit_iso_escape_detection)
2528             break;
2529           single_shifting = 0;
2530           ONE_MORE_BYTE (c);
2531           if (c >= '(' && c <= '/')
2532             {
2533               /* Designation sequence for a charset of dimension 1.  */
2534               ONE_MORE_BYTE (c1);
2535               if (c1 < ' ' || c1 >= 0x80
2536                   || (id = iso_charset_table[0][c >= ','][c1]) < 0)
2537                 /* Invalid designation sequence.  Just ignore.  */
2538                 break;
2539             }
2540           else if (c == '$')
2541             {
2542               /* Designation sequence for a charset of dimension 2.  */
2543               ONE_MORE_BYTE (c);
2544               if (c >= '@' && c <= 'B')
2545                 /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
2546                 id = iso_charset_table[1][0][c];
2547               else if (c >= '(' && c <= '/')
2548                 {
2549                   ONE_MORE_BYTE (c1);
2550                   if (c1 < ' ' || c1 >= 0x80
2551                       || (id = iso_charset_table[1][c >= ','][c1]) < 0)
2552                     /* Invalid designation sequence.  Just ignore.  */
2553                     break;
2554                 }
2555               else
2556                 /* Invalid designation sequence.  Just ignore it.  */
2557                 break;
2558             }
2559           else if (c == 'N' || c == 'O')
2560             {
2561               /* ESC <Fe> for SS2 or SS3.  */
2562               single_shifting = 1;
2563               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
2564               break;
2565             }
2566           else if (c >= '0' && c <= '4')
2567             {
2568               /* ESC <Fp> for start/end composition.  */
2569               found |= CATEGORY_MASK_ISO;
2570               break;
2571             }
2572           else
2573             {
2574               /* Invalid escape sequence.  Just ignore it.  */
2575               break;
2576             }
2577
2578           /* We found a valid designation sequence for CHARSET.  */
2579           rejected |= CATEGORY_MASK_ISO_8BIT;
2580           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
2581                               id))
2582             found |= CATEGORY_MASK_ISO_7;
2583           else
2584             rejected |= CATEGORY_MASK_ISO_7;
2585           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
2586                               id))
2587             found |= CATEGORY_MASK_ISO_7_TIGHT;
2588           else
2589             rejected |= CATEGORY_MASK_ISO_7_TIGHT;
2590           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
2591                               id))
2592             found |= CATEGORY_MASK_ISO_7_ELSE;
2593           else
2594             rejected |= CATEGORY_MASK_ISO_7_ELSE;
2595           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
2596                               id))
2597             found |= CATEGORY_MASK_ISO_8_ELSE;
2598           else
2599             rejected |= CATEGORY_MASK_ISO_8_ELSE;
2600           break;
2601
2602         case ISO_CODE_SO:
2603         case ISO_CODE_SI:
2604           /* Locking shift out/in.  */
2605           if (inhibit_iso_escape_detection)
2606             break;
2607           single_shifting = 0;
2608           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
2609           found |= CATEGORY_MASK_ISO_ELSE;
2610           break;
2611
2612         case ISO_CODE_CSI:
2613           /* Control sequence introducer.  */
2614           single_shifting = 0;
2615           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
2616           found |= CATEGORY_MASK_ISO_8_ELSE;
2617           goto check_extra_latin;
2618
2619         case ISO_CODE_SS2:
2620         case ISO_CODE_SS3:
2621           /* Single shift.   */
2622           if (inhibit_iso_escape_detection)
2623             break;
2624           single_shifting = 0;
2625           rejected |= CATEGORY_MASK_ISO_7BIT;
2626           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2627               & CODING_ISO_FLAG_SINGLE_SHIFT)
2628             found |= CATEGORY_MASK_ISO_8_1, single_shifting = 1;
2629           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
2630               & CODING_ISO_FLAG_SINGLE_SHIFT)
2631             found |= CATEGORY_MASK_ISO_8_2, single_shifting = 1;
2632           if (single_shifting)
2633             break;
2634           goto check_extra_latin;
2635
2636         default:
2637           if (c < 0)
2638             continue;
2639           if (c < 0x80)
2640             {
2641               single_shifting = 0;
2642               break;
2643             }
2644           if (c >= 0xA0)
2645             {
2646               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
2647               found |= CATEGORY_MASK_ISO_8_1;
2648               /* Check the length of succeeding codes of the range
2649                  0xA0..0FF.  If the byte length is even, we include
2650                  CATEGORY_MASK_ISO_8_2 in `found'.  We can check this
2651                  only when we are not single shifting.  */
2652               if (! single_shifting
2653                   && ! (rejected & CATEGORY_MASK_ISO_8_2))
2654                 {
2655                   int i = 1;
2656                   while (src < src_end)
2657                     {
2658                       ONE_MORE_BYTE (c);
2659                       if (c < 0xA0)
2660                         break;
2661                       i++;
2662                     }
2663
2664                   if (i & 1 && src < src_end)
2665                     rejected |= CATEGORY_MASK_ISO_8_2;
2666                   else
2667                     found |= CATEGORY_MASK_ISO_8_2;
2668                 }
2669               break;
2670             }
2671         check_extra_latin:
2672           single_shifting = 0;
2673           if (! VECTORP (Vlatin_extra_code_table)
2674               || NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
2675             {
2676               rejected = CATEGORY_MASK_ISO;
2677               break;
2678             }
2679           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2680               & CODING_ISO_FLAG_LATIN_EXTRA)
2681             found |= CATEGORY_MASK_ISO_8_1;
2682           else
2683             rejected |= CATEGORY_MASK_ISO_8_1;
2684           rejected |= CATEGORY_MASK_ISO_8_2;
2685         }
2686     }
2687   detect_info->rejected |= CATEGORY_MASK_ISO;
2688   return 0;
2689
2690  no_more_source:
2691   detect_info->rejected |= rejected;
2692   detect_info->found |= (found & ~rejected);
2693   return 1;
2694 }
2695
2696
2697 /* Set designation state into CODING.  */
2698 #define DECODE_DESIGNATION(reg, dim, chars_96, final)                   \
2699   do {                                                                  \
2700     int id, prev;                                                       \
2701                                                                         \
2702     if (final < '0' || final >= 128                                     \
2703         || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0)        \
2704         || !SAFE_CHARSET_P (coding, id))                                \
2705       {                                                                 \
2706         CODING_ISO_DESIGNATION (coding, reg) = -2;                      \
2707         goto invalid_code;                                              \
2708       }                                                                 \
2709     prev = CODING_ISO_DESIGNATION (coding, reg);                        \
2710     if (id == charset_jisx0201_roman)                                   \
2711       {                                                                 \
2712         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)      \
2713           id = charset_ascii;                                           \
2714       }                                                                 \
2715     else if (id == charset_jisx0208_1978)                               \
2716       {                                                                 \
2717         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)     \
2718           id = charset_jisx0208;                                        \
2719       }                                                                 \
2720     CODING_ISO_DESIGNATION (coding, reg) = id;                          \
2721     /* If there was an invalid designation to REG previously, and this  \
2722        designation is ASCII to REG, we should keep this designation     \
2723        sequence.  */                                                    \
2724     if (prev == -2 && id == charset_ascii)                              \
2725       goto invalid_code;                                                \
2726   } while (0)
2727
2728
2729 #define MAYBE_FINISH_COMPOSITION()                              \
2730   do {                                                          \
2731     int i;                                                      \
2732     if (composition_state == COMPOSING_NO)                      \
2733       break;                                                    \
2734     /* It is assured that we have enough room for producing     \
2735        characters stored in the table `components'.  */         \
2736     if (charbuf + component_idx > charbuf_end)                  \
2737       goto no_more_source;                                      \
2738     composition_state = COMPOSING_NO;                           \
2739     if (method == COMPOSITION_RELATIVE                          \
2740         || method == COMPOSITION_WITH_ALTCHARS)                 \
2741       {                                                         \
2742         for (i = 0; i < component_idx; i++)                     \
2743           *charbuf++ = components[i];                           \
2744         char_offset += component_idx;                           \
2745       }                                                         \
2746     else                                                        \
2747       {                                                         \
2748         for (i = 0; i < component_idx; i += 2)                  \
2749           *charbuf++ = components[i];                           \
2750         char_offset += (component_idx / 2) + 1;                 \
2751       }                                                         \
2752   } while (0)
2753
2754
2755 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
2756    ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
2757    ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
2758    ESC 3 : altchar composition :  ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
2759    ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1
2760   */
2761
2762 #define DECODE_COMPOSITION_START(c1)                                    \
2763   do {                                                                  \
2764     if (c1 == '0'                                                       \
2765         && composition_state == COMPOSING_COMPONENT_RULE)               \
2766       {                                                                 \
2767         component_len = component_idx;                                  \
2768         composition_state = COMPOSING_CHAR;                             \
2769       }                                                                 \
2770     else                                                                \
2771       {                                                                 \
2772         const unsigned char *p;                                         \
2773                                                                         \
2774         MAYBE_FINISH_COMPOSITION ();                                    \
2775         if (charbuf + MAX_COMPOSITION_COMPONENTS > charbuf_end)         \
2776           goto no_more_source;                                          \
2777         for (p = src; p < src_end - 1; p++)                             \
2778           if (*p == ISO_CODE_ESC && p[1] == '1')                        \
2779             break;                                                      \
2780         if (p == src_end - 1)                                           \
2781           {                                                             \
2782             if (coding->mode & CODING_MODE_LAST_BLOCK)                  \
2783               goto invalid_code;                                        \
2784             goto no_more_source;                                        \
2785           }                                                             \
2786                                                                         \
2787         /* This is surely the start of a composition.  */               \
2788         method = (c1 == '0' ? COMPOSITION_RELATIVE                      \
2789                   : c1 == '2' ? COMPOSITION_WITH_RULE                   \
2790                   : c1 == '3' ? COMPOSITION_WITH_ALTCHARS               \
2791                   : COMPOSITION_WITH_RULE_ALTCHARS);                    \
2792         composition_state = (c1 <= '2' ? COMPOSING_CHAR                 \
2793                              : COMPOSING_COMPONENT_CHAR);               \
2794         component_idx = component_len = 0;                              \
2795       }                                                                 \
2796   } while (0)
2797
2798
2799 /* Handle compositoin end sequence ESC 1.  */
2800
2801 #define DECODE_COMPOSITION_END()                                        \
2802   do {                                                                  \
2803     int nchars = (component_len > 0 ? component_idx - component_len     \
2804                   : method == COMPOSITION_RELATIVE ? component_idx      \
2805                   : (component_idx + 1) / 2);                           \
2806     int i;                                                              \
2807     int *saved_charbuf = charbuf;                                       \
2808     int from = char_offset;                                             \
2809     int to = from + nchars;                                             \
2810                                                                         \
2811     ADD_COMPOSITION_DATA (charbuf, from, to, method);                   \
2812     if (method != COMPOSITION_RELATIVE)                                 \
2813       {                                                                 \
2814         if (component_len == 0)                                         \
2815           for (i = 0; i < component_idx; i++)                           \
2816             *charbuf++ = components[i];                                 \
2817         else                                                            \
2818           for (i = 0; i < component_len; i++)                           \
2819             *charbuf++ = components[i];                                 \
2820         *saved_charbuf = saved_charbuf - charbuf;                       \
2821       }                                                                 \
2822     if (method == COMPOSITION_WITH_RULE)                                \
2823       for (i = 0; i < component_idx; i += 2, char_offset++)             \
2824         *charbuf++ = components[i];                                     \
2825     else                                                                \
2826       for (i = component_len; i < component_idx; i++, char_offset++)    \
2827         *charbuf++ = components[i];                                     \
2828     coding->annotated = 1;                                              \
2829     composition_state = COMPOSING_NO;                                   \
2830   } while (0)
2831
2832
2833 /* Decode a composition rule from the byte C1 (and maybe one more byte
2834    from SRC) and store one encoded composition rule in
2835    coding->cmp_data.  */
2836
2837 #define DECODE_COMPOSITION_RULE(c1)                                     \
2838   do {                                                                  \
2839     (c1) -= 32;                                                         \
2840     if (c1 < 81)                /* old format (before ver.21) */        \
2841       {                                                                 \
2842         int gref = (c1) / 9;                                            \
2843         int nref = (c1) % 9;                                            \
2844         if (gref == 4) gref = 10;                                       \
2845         if (nref == 4) nref = 10;                                       \
2846         c1 = COMPOSITION_ENCODE_RULE (gref, nref);                      \
2847       }                                                                 \
2848     else if (c1 < 93)           /* new format (after ver.21) */         \
2849       {                                                                 \
2850         ONE_MORE_BYTE (c2);                                             \
2851         c1 = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32);                \
2852       }                                                                 \
2853     else                                                                \
2854       c1 = 0;                                                           \
2855   } while (0)
2856
2857
2858 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
2859
2860 static void
2861 decode_coding_iso_2022 (coding)
2862      struct coding_system *coding;
2863 {
2864   const unsigned char *src = coding->source + coding->consumed;
2865   const unsigned char *src_end = coding->source + coding->src_bytes;
2866   const unsigned char *src_base;
2867   int *charbuf = coding->charbuf;
2868   int *charbuf_end
2869     = charbuf + coding->charbuf_size - 4 - MAX_ANNOTATION_LENGTH;
2870   int consumed_chars = 0, consumed_chars_base;
2871   int multibytep = coding->src_multibyte;
2872   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
2873   int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2874   int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
2875   struct charset *charset;
2876   int c;
2877   /* For handling composition sequence.  */
2878 #define COMPOSING_NO                    0
2879 #define COMPOSING_CHAR                  1
2880 #define COMPOSING_RULE                  2
2881 #define COMPOSING_COMPONENT_CHAR        3
2882 #define COMPOSING_COMPONENT_RULE        4
2883
2884   int composition_state = COMPOSING_NO;
2885   enum composition_method method;
2886   int components[MAX_COMPOSITION_COMPONENTS * 2 + 1];
2887   int component_idx;
2888   int component_len;
2889   Lisp_Object attrs, charset_list;
2890   int char_offset = coding->produced_char;
2891   int last_offset = char_offset;
2892   int last_id = charset_ascii;
2893
2894   CODING_GET_INFO (coding, attrs, charset_list);
2895   setup_iso_safe_charsets (attrs);
2896
2897   while (1)
2898     {
2899       int c1, c2;
2900
2901       src_base = src;
2902       consumed_chars_base = consumed_chars;
2903
2904       if (charbuf >= charbuf_end)
2905         break;
2906
2907       ONE_MORE_BYTE (c1);
2908       if (c1 < 0)
2909         goto invalid_code;
2910
2911       /* We produce at most one character.  */
2912       switch (iso_code_class [c1])
2913         {
2914         case ISO_0x20_or_0x7F:
2915           if (composition_state != COMPOSING_NO)
2916             {
2917               if (composition_state == COMPOSING_RULE
2918                   || composition_state == COMPOSING_COMPONENT_RULE)
2919                 {
2920                   DECODE_COMPOSITION_RULE (c1);
2921                   components[component_idx++] = c1;
2922                   composition_state--;
2923                   continue;
2924                 }
2925             }
2926           if (charset_id_0 < 0
2927               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
2928             /* This is SPACE or DEL.  */
2929             charset = CHARSET_FROM_ID (charset_ascii);
2930           else
2931             charset = CHARSET_FROM_ID (charset_id_0);
2932           break;
2933
2934         case ISO_graphic_plane_0:
2935           if (composition_state != COMPOSING_NO)
2936             {
2937               if (composition_state == COMPOSING_RULE
2938                   || composition_state == COMPOSING_COMPONENT_RULE)
2939                 {
2940                   DECODE_COMPOSITION_RULE (c1);
2941                   components[component_idx++] = c1;
2942                   composition_state--;
2943                   continue;
2944                 }
2945             }
2946           charset = CHARSET_FROM_ID (charset_id_0);
2947           break;
2948
2949         case ISO_0xA0_or_0xFF:
2950           if (charset_id_1 < 0
2951               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
2952               || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
2953             goto invalid_code;
2954           /* This is a graphic character, we fall down ... */
2955
2956         case ISO_graphic_plane_1:
2957           if (charset_id_1 < 0)
2958             goto invalid_code;
2959           charset = CHARSET_FROM_ID (charset_id_1);
2960           break;
2961
2962         case ISO_control_0:
2963           MAYBE_FINISH_COMPOSITION ();
2964           charset = CHARSET_FROM_ID (charset_ascii);
2965           break;
2966
2967         case ISO_control_1:
2968           MAYBE_FINISH_COMPOSITION ();
2969           goto invalid_code;
2970
2971         case ISO_shift_out:
2972           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
2973               || CODING_ISO_DESIGNATION (coding, 1) < 0)
2974             goto invalid_code;
2975           CODING_ISO_INVOCATION (coding, 0) = 1;
2976           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2977           continue;
2978
2979         case ISO_shift_in:
2980           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
2981             goto invalid_code;
2982           CODING_ISO_INVOCATION (coding, 0) = 0;
2983           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2984           continue;
2985
2986         case ISO_single_shift_2_7:
2987         case ISO_single_shift_2:
2988           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
2989             goto invalid_code;
2990           /* SS2 is handled as an escape sequence of ESC 'N' */
2991           c1 = 'N';
2992           goto label_escape_sequence;
2993
2994         case ISO_single_shift_3:
2995           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
2996             goto invalid_code;
2997           /* SS2 is handled as an escape sequence of ESC 'O' */
2998           c1 = 'O';
2999           goto label_escape_sequence;
3000
3001         case ISO_control_sequence_introducer:
3002           /* CSI is handled as an escape sequence of ESC '[' ...  */
3003           c1 = '[';
3004           goto label_escape_sequence;
3005
3006         case ISO_escape:
3007           ONE_MORE_BYTE (c1);
3008         label_escape_sequence:
3009           /* Escape sequences handled here are invocation,
3010              designation, direction specification, and character
3011              composition specification.  */
3012           switch (c1)
3013             {
3014             case '&':           /* revision of following character set */
3015               ONE_MORE_BYTE (c1);
3016               if (!(c1 >= '@' && c1 <= '~'))
3017                 goto invalid_code;
3018               ONE_MORE_BYTE (c1);
3019               if (c1 != ISO_CODE_ESC)
3020                 goto invalid_code;
3021               ONE_MORE_BYTE (c1);
3022               goto label_escape_sequence;
3023
3024             case '$':           /* designation of 2-byte character set */
3025               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3026                 goto invalid_code;
3027               ONE_MORE_BYTE (c1);
3028               if (c1 >= '@' && c1 <= 'B')
3029                 {       /* designation of JISX0208.1978, GB2312.1980,
3030                            or JISX0208.1980 */
3031                   DECODE_DESIGNATION (0, 2, 0, c1);
3032                 }
3033               else if (c1 >= 0x28 && c1 <= 0x2B)
3034                 {       /* designation of DIMENSION2_CHARS94 character set */
3035                   ONE_MORE_BYTE (c2);
3036                   DECODE_DESIGNATION (c1 - 0x28, 2, 0, c2);
3037                 }
3038               else if (c1 >= 0x2C && c1 <= 0x2F)
3039                 {       /* designation of DIMENSION2_CHARS96 character set */
3040                   ONE_MORE_BYTE (c2);
3041                   DECODE_DESIGNATION (c1 - 0x2C, 2, 1, c2);
3042                 }
3043               else
3044                 goto invalid_code;
3045               /* We must update these variables now.  */
3046               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3047               charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3048               continue;
3049
3050             case 'n':           /* invocation of locking-shift-2 */
3051               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3052                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3053                 goto invalid_code;
3054               CODING_ISO_INVOCATION (coding, 0) = 2;
3055               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3056               continue;
3057
3058             case 'o':           /* invocation of locking-shift-3 */
3059               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3060                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3061                 goto invalid_code;
3062               CODING_ISO_INVOCATION (coding, 0) = 3;
3063               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3064               continue;
3065
3066             case 'N':           /* invocation of single-shift-2 */
3067               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3068                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3069                 goto invalid_code;
3070               charset = CHARSET_FROM_ID (CODING_ISO_DESIGNATION (coding, 2));
3071               ONE_MORE_BYTE (c1);
3072               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3073                 goto invalid_code;
3074               break;
3075
3076             case 'O':           /* invocation of single-shift-3 */
3077               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3078                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3079                 goto invalid_code;
3080               charset = CHARSET_FROM_ID (CODING_ISO_DESIGNATION (coding, 3));
3081               ONE_MORE_BYTE (c1);
3082               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3083                 goto invalid_code;
3084               break;
3085
3086             case '0': case '2': case '3': case '4': /* start composition */
3087               if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3088                 goto invalid_code;
3089               DECODE_COMPOSITION_START (c1);
3090               continue;
3091
3092             case '1':           /* end composition */
3093               if (composition_state == COMPOSING_NO)
3094                 goto invalid_code;
3095               DECODE_COMPOSITION_END ();
3096               continue;
3097
3098             case '[':           /* specification of direction */
3099               if (! CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION)
3100                 goto invalid_code;
3101               /* For the moment, nested direction is not supported.
3102                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
3103                  left-to-right, and nozero means right-to-left.  */
3104               ONE_MORE_BYTE (c1);
3105               switch (c1)
3106                 {
3107                 case ']':       /* end of the current direction */
3108                   coding->mode &= ~CODING_MODE_DIRECTION;
3109
3110                 case '0':       /* end of the current direction */
3111                 case '1':       /* start of left-to-right direction */
3112                   ONE_MORE_BYTE (c1);
3113                   if (c1 == ']')
3114                     coding->mode &= ~CODING_MODE_DIRECTION;
3115                   else
3116                     goto invalid_code;
3117                   break;
3118
3119                 case '2':       /* start of right-to-left direction */
3120                   ONE_MORE_BYTE (c1);
3121                   if (c1 == ']')
3122                     coding->mode |= CODING_MODE_DIRECTION;
3123                   else
3124                     goto invalid_code;
3125                   break;
3126
3127                 default:
3128                   goto invalid_code;
3129                 }
3130               continue;
3131
3132             case '%':
3133               ONE_MORE_BYTE (c1);
3134               if (c1 == '/')
3135                 {
3136                   /* CTEXT extended segment:
3137                      ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3138                      We keep these bytes as is for the moment.
3139                      They may be decoded by post-read-conversion.  */
3140                   int dim, M, L;
3141                   int size;
3142
3143                   ONE_MORE_BYTE (dim);
3144                   ONE_MORE_BYTE (M);
3145                   ONE_MORE_BYTE (L);
3146                   size = ((M - 128) * 128) + (L - 128);
3147                   if (charbuf + 8 + size > charbuf_end)
3148                     goto break_loop;
3149                   *charbuf++ = ISO_CODE_ESC;
3150                   *charbuf++ = '%';
3151                   *charbuf++ = '/';
3152                   *charbuf++ = dim;
3153                   *charbuf++ = BYTE8_TO_CHAR (M);
3154                   *charbuf++ = BYTE8_TO_CHAR (L);
3155                   while (size-- > 0)
3156                     {
3157                       ONE_MORE_BYTE (c1);
3158                       *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3159                     }
3160                 }
3161               else if (c1 == 'G')
3162                 {
3163                   /* XFree86 extension for embedding UTF-8 in CTEXT:
3164                      ESC % G --UTF-8-BYTES-- ESC % @
3165                      We keep these bytes as is for the moment.
3166                      They may be decoded by post-read-conversion.  */
3167                   int *p = charbuf;
3168
3169                   if (p + 6 > charbuf_end)
3170                     goto break_loop;
3171                   *p++ = ISO_CODE_ESC;
3172                   *p++ = '%';
3173                   *p++ = 'G';
3174                   while (p < charbuf_end)
3175                     {
3176                       ONE_MORE_BYTE (c1);
3177                       if (c1 == ISO_CODE_ESC
3178                           && src + 1 < src_end
3179                           && src[0] == '%'
3180                           && src[1] == '@')
3181                         break;
3182                       *p++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3183                     }
3184                   if (p + 3 > charbuf_end)
3185                     goto break_loop;
3186                   *p++ = ISO_CODE_ESC;
3187                   *p++ = '%';
3188                   *p++ = '@';
3189                   charbuf = p;
3190                 }
3191               else
3192                 goto invalid_code;
3193               continue;
3194               break;
3195
3196             default:
3197               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3198                 goto invalid_code;
3199               if (c1 >= 0x28 && c1 <= 0x2B)
3200                 {       /* designation of DIMENSION1_CHARS94 character set */
3201                   ONE_MORE_BYTE (c2);
3202                   DECODE_DESIGNATION (c1 - 0x28, 1, 0, c2);
3203                 }
3204               else if (c1 >= 0x2C && c1 <= 0x2F)
3205                 {       /* designation of DIMENSION1_CHARS96 character set */
3206                   ONE_MORE_BYTE (c2);
3207                   DECODE_DESIGNATION (c1 - 0x2C, 1, 1, c2);
3208                 }
3209               else
3210                 goto invalid_code;
3211               /* We must update these variables now.  */
3212               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3213               charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3214               continue;
3215             }
3216         }
3217
3218       if (charset->id != charset_ascii
3219           && last_id != charset->id)
3220         {
3221           if (last_id != charset_ascii)
3222             ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
3223           last_id = charset->id;
3224           last_offset = char_offset;
3225         }
3226
3227       /* Now we know CHARSET and 1st position code C1 of a character.
3228          Produce a decoded character while getting 2nd position code
3229          C2 if necessary.  */
3230       c1 &= 0x7F;
3231       if (CHARSET_DIMENSION (charset) > 1)
3232         {
3233           ONE_MORE_BYTE (c2);
3234           if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
3235             /* C2 is not in a valid range.  */
3236             goto invalid_code;
3237           c1 = (c1 << 8) | (c2 & 0x7F);
3238           if (CHARSET_DIMENSION (charset) > 2)
3239             {
3240               ONE_MORE_BYTE (c2);
3241               if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
3242                 /* C2 is not in a valid range.  */
3243                 goto invalid_code;
3244               c1 = (c1 << 8) | (c2 & 0x7F);
3245             }
3246         }
3247
3248       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3249       if (c < 0)
3250         {
3251           MAYBE_FINISH_COMPOSITION ();
3252           for (; src_base < src; src_base++, char_offset++)
3253             {
3254               if (ASCII_BYTE_P (*src_base))
3255                 *charbuf++ = *src_base;
3256               else
3257                 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3258             }
3259         }
3260       else if (composition_state == COMPOSING_NO)
3261         {
3262           *charbuf++ = c;
3263           char_offset++;
3264         }
3265       else
3266         {
3267           components[component_idx++] = c;
3268           if (method == COMPOSITION_WITH_RULE
3269               || (method == COMPOSITION_WITH_RULE_ALTCHARS
3270                   && composition_state == COMPOSING_COMPONENT_CHAR))
3271             composition_state++;
3272         }
3273       continue;
3274
3275     invalid_code:
3276       MAYBE_FINISH_COMPOSITION ();
3277       src = src_base;
3278       consumed_chars = consumed_chars_base;
3279       ONE_MORE_BYTE (c);
3280       *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
3281       char_offset++;
3282       coding->errors++;
3283       continue;
3284
3285     break_loop:
3286       break;
3287     }
3288
3289  no_more_source:
3290   if (last_id != charset_ascii)
3291     ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
3292   coding->consumed_char += consumed_chars_base;
3293   coding->consumed = src_base - coding->source;
3294   coding->charbuf_used = charbuf - coding->charbuf;
3295 }
3296
3297
3298 /* ISO2022 encoding stuff.  */
3299
3300 /*
3301    It is not enough to say just "ISO2022" on encoding, we have to
3302    specify more details.  In Emacs, each coding system of ISO2022
3303    variant has the following specifications:
3304         1. Initial designation to G0 thru G3.
3305         2. Allows short-form designation?
3306         3. ASCII should be designated to G0 before control characters?
3307         4. ASCII should be designated to G0 at end of line?
3308         5. 7-bit environment or 8-bit environment?
3309         6. Use locking-shift?
3310         7. Use Single-shift?
3311    And the following two are only for Japanese:
3312         8. Use ASCII in place of JIS0201-1976-Roman?
3313         9. Use JISX0208-1983 in place of JISX0208-1978?
3314    These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
3315    defined by macros CODING_ISO_FLAG_XXX.  See `coding.h' for more
3316    details.
3317 */
3318
3319 /* Produce codes (escape sequence) for designating CHARSET to graphic
3320    register REG at DST, and increment DST.  If <final-char> of CHARSET is
3321    '@', 'A', or 'B' and the coding system CODING allows, produce
3322    designation sequence of short-form.  */
3323
3324 #define ENCODE_DESIGNATION(charset, reg, coding)                        \
3325   do {                                                                  \
3326     unsigned char final_char = CHARSET_ISO_FINAL (charset);             \
3327     char *intermediate_char_94 = "()*+";                                \
3328     char *intermediate_char_96 = ",-./";                                \
3329     int revision = -1;                                                  \
3330     int c;                                                              \
3331                                                                         \
3332     if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION)           \
3333       revision = CHARSET_ISO_REVISION (charset);                        \
3334                                                                         \
3335     if (revision >= 0)                                                  \
3336       {                                                                 \
3337         EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&');                       \
3338         EMIT_ONE_BYTE ('@' + revision);                                 \
3339       }                                                                 \
3340     EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC);                                 \
3341     if (CHARSET_DIMENSION (charset) == 1)                               \
3342       {                                                                 \
3343         if (! CHARSET_ISO_CHARS_96 (charset))                           \
3344           c = intermediate_char_94[reg];                                \
3345         else                                                            \
3346           c = intermediate_char_96[reg];                                \
3347         EMIT_ONE_ASCII_BYTE (c);                                        \
3348       }                                                                 \
3349     else                                                                \
3350       {                                                                 \
3351         EMIT_ONE_ASCII_BYTE ('$');                                      \
3352         if (! CHARSET_ISO_CHARS_96 (charset))                           \
3353           {                                                             \
3354             if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM   \
3355                 || reg != 0                                             \
3356                 || final_char < '@' || final_char > 'B')                \
3357               EMIT_ONE_ASCII_BYTE (intermediate_char_94[reg]);          \
3358           }                                                             \
3359         else                                                            \
3360           EMIT_ONE_ASCII_BYTE (intermediate_char_96[reg]);              \
3361       }                                                                 \
3362     EMIT_ONE_ASCII_BYTE (final_char);                                   \
3363                                                                         \
3364     CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset);        \
3365   } while (0)
3366
3367
3368 /* The following two macros produce codes (control character or escape
3369    sequence) for ISO2022 single-shift functions (single-shift-2 and
3370    single-shift-3).  */
3371
3372 #define ENCODE_SINGLE_SHIFT_2                                           \
3373   do {                                                                  \
3374     if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)         \
3375       EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N');                         \
3376     else                                                                \
3377       EMIT_ONE_BYTE (ISO_CODE_SS2);                                     \
3378     CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
3379   } while (0)
3380
3381
3382 #define ENCODE_SINGLE_SHIFT_3                                           \
3383   do {                                                                  \
3384     if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)         \
3385       EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O');                         \
3386     else                                                                \
3387       EMIT_ONE_BYTE (ISO_CODE_SS3);                                     \
3388     CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
3389   } while (0)
3390
3391
3392 /* The following four macros produce codes (control character or
3393    escape sequence) for ISO2022 locking-shift functions (shift-in,
3394    shift-out, locking-shift-2, and locking-shift-3).  */
3395
3396 #define ENCODE_SHIFT_IN                                 \
3397   do {                                                  \
3398     EMIT_ONE_ASCII_BYTE (ISO_CODE_SI);                  \
3399     CODING_ISO_INVOCATION (coding, 0) = 0;              \
3400   } while (0)
3401
3402
3403 #define ENCODE_SHIFT_OUT                                \
3404   do {                                                  \
3405     EMIT_ONE_ASCII_BYTE (ISO_CODE_SO);                  \
3406     CODING_ISO_INVOCATION (coding, 0) = 1;              \
3407   } while (0)
3408
3409
3410 #define ENCODE_LOCKING_SHIFT_2                          \
3411   do {                                                  \
3412     EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');           \
3413     CODING_ISO_INVOCATION (coding, 0) = 2;              \
3414   } while (0)
3415
3416
3417 #define ENCODE_LOCKING_SHIFT_3                          \
3418   do {                                                  \
3419     EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');           \
3420     CODING_ISO_INVOCATION (coding, 0) = 3;              \
3421   } while (0)
3422
3423
3424 /* Produce codes for a DIMENSION1 character whose character set is
3425    CHARSET and whose position-code is C1.  Designation and invocation
3426    sequences are also produced in advance if necessary.  */
3427
3428 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
3429   do {                                                                  \
3430     int id = CHARSET_ID (charset);                                      \
3431                                                                         \
3432     if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)         \
3433         && id == charset_ascii)                                         \
3434       {                                                                 \
3435         id = charset_jisx0201_roman;                                    \
3436         charset = CHARSET_FROM_ID (id);                                 \
3437       }                                                                 \
3438                                                                         \
3439     if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
3440       {                                                                 \
3441         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
3442           EMIT_ONE_ASCII_BYTE (c1 & 0x7F);                              \
3443         else                                                            \
3444           EMIT_ONE_BYTE (c1 | 0x80);                                    \
3445         CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
3446         break;                                                          \
3447       }                                                                 \
3448     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
3449       {                                                                 \
3450         EMIT_ONE_ASCII_BYTE (c1 & 0x7F);                                \
3451         break;                                                          \
3452       }                                                                 \
3453     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
3454       {                                                                 \
3455         EMIT_ONE_BYTE (c1 | 0x80);                                      \
3456         break;                                                          \
3457       }                                                                 \
3458     else                                                                \
3459       /* Since CHARSET is not yet invoked to any graphic planes, we     \
3460          must invoke it, or, at first, designate it to some graphic     \
3461          register.  Then repeat the loop to actually produce the        \
3462          character.  */                                                 \
3463       dst = encode_invocation_designation (charset, coding, dst,        \
3464                                            &produced_chars);            \
3465   } while (1)
3466
3467
3468 /* Produce codes for a DIMENSION2 character whose character set is
3469    CHARSET and whose position-codes are C1 and C2.  Designation and
3470    invocation codes are also produced in advance if necessary.  */
3471
3472 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
3473   do {                                                                  \
3474     int id = CHARSET_ID (charset);                                      \
3475                                                                         \
3476     if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)        \
3477         && id == charset_jisx0208)                                      \
3478       {                                                                 \
3479         id = charset_jisx0208_1978;                                     \
3480         charset = CHARSET_FROM_ID (id);                                 \
3481       }                                                                 \
3482                                                                         \
3483     if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
3484       {                                                                 \
3485         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
3486           EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);              \
3487         else                                                            \
3488           EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                    \
3489         CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
3490         break;                                                          \
3491       }                                                                 \
3492     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
3493       {                                                                 \
3494         EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);                \
3495         break;                                                          \
3496       }                                                                 \
3497     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
3498       {                                                                 \
3499         EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                      \
3500         break;                                                          \
3501       }                                                                 \
3502     else                                                                \
3503       /* Since CHARSET is not yet invoked to any graphic planes, we     \
3504          must invoke it, or, at first, designate it to some graphic     \
3505          register.  Then repeat the loop to actually produce the        \
3506          character.  */                                                 \
3507       dst = encode_invocation_designation (charset, coding, dst,        \
3508                                            &produced_chars);            \
3509   } while (1)
3510
3511
3512 #define ENCODE_ISO_CHARACTER(charset, c)                                   \
3513   do {                                                                     \
3514     int code = ENCODE_CHAR ((charset),(c));                                \
3515                                                                            \
3516     if (CHARSET_DIMENSION (charset) == 1)                                  \
3517       ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code);                   \
3518     else                                                                   \
3519       ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
3520   } while (0)
3521
3522
3523 /* Produce designation and invocation codes at a place pointed by DST
3524    to use CHARSET.  The element `spec.iso_2022' of *CODING is updated.
3525    Return new DST.  */
3526
3527 unsigned char *
3528 encode_invocation_designation (charset, coding, dst, p_nchars)
3529      struct charset *charset;
3530      struct coding_system *coding;
3531      unsigned char *dst;
3532      int *p_nchars;
3533 {
3534   int multibytep = coding->dst_multibyte;
3535   int produced_chars = *p_nchars;
3536   int reg;                      /* graphic register number */
3537   int id = CHARSET_ID (charset);
3538
3539   /* At first, check designations.  */
3540   for (reg = 0; reg < 4; reg++)
3541     if (id == CODING_ISO_DESIGNATION (coding, reg))
3542       break;
3543
3544   if (reg >= 4)
3545     {
3546       /* CHARSET is not yet designated to any graphic registers.  */
3547       /* At first check the requested designation.  */
3548       reg = CODING_ISO_REQUEST (coding, id);
3549       if (reg < 0)
3550         /* Since CHARSET requests no special designation, designate it
3551            to graphic register 0.  */
3552         reg = 0;
3553
3554       ENCODE_DESIGNATION (charset, reg, coding);
3555     }
3556
3557   if (CODING_ISO_INVOCATION (coding, 0) != reg
3558       && CODING_ISO_INVOCATION (coding, 1) != reg)
3559     {
3560       /* Since the graphic register REG is not invoked to any graphic
3561          planes, invoke it to graphic plane 0.  */
3562       switch (reg)
3563         {
3564         case 0:                 /* graphic register 0 */
3565           ENCODE_SHIFT_IN;
3566           break;
3567
3568         case 1:                 /* graphic register 1 */
3569           ENCODE_SHIFT_OUT;
3570           break;
3571
3572         case 2:                 /* graphic register 2 */
3573           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3574             ENCODE_SINGLE_SHIFT_2;
3575           else
3576             ENCODE_LOCKING_SHIFT_2;
3577           break;
3578
3579         case 3:                 /* graphic register 3 */
3580           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3581             ENCODE_SINGLE_SHIFT_3;
3582           else
3583             ENCODE_LOCKING_SHIFT_3;
3584           break;
3585         }
3586     }
3587
3588   *p_nchars = produced_chars;
3589   return dst;
3590 }
3591
3592 /* The following three macros produce codes for indicating direction
3593    of text.  */
3594 #define ENCODE_CONTROL_SEQUENCE_INTRODUCER                              \
3595   do {                                                                  \
3596     if (CODING_ISO_FLAGS (coding) == CODING_ISO_FLAG_SEVEN_BITS)        \
3597       EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '[');                         \
3598     else                                                                \
3599       EMIT_ONE_BYTE (ISO_CODE_CSI);                                     \
3600   } while (0)
3601
3602
3603 #define ENCODE_DIRECTION_R2L()                  \
3604   do {                                          \
3605     ENCODE_CONTROL_SEQUENCE_INTRODUCER (dst);   \
3606     EMIT_TWO_ASCII_BYTES ('2', ']');            \
3607   } while (0)
3608
3609
3610 #define ENCODE_DIRECTION_L2R()                  \
3611   do {                                          \
3612     ENCODE_CONTROL_SEQUENCE_INTRODUCER (dst);   \
3613     EMIT_TWO_ASCII_BYTES ('0', ']');            \
3614   } while (0)
3615
3616
3617 /* Produce codes for designation and invocation to reset the graphic
3618    planes and registers to initial state.  */
3619 #define ENCODE_RESET_PLANE_AND_REGISTER()                               \
3620   do {                                                                  \
3621     int reg;                                                            \
3622     struct charset *charset;                                            \
3623                                                                         \
3624     if (CODING_ISO_INVOCATION (coding, 0) != 0)                         \
3625       ENCODE_SHIFT_IN;                                                  \
3626     for (reg = 0; reg < 4; reg++)                                       \
3627       if (CODING_ISO_INITIAL (coding, reg) >= 0                         \
3628           && (CODING_ISO_DESIGNATION (coding, reg)                      \
3629               != CODING_ISO_INITIAL (coding, reg)))                     \
3630         {                                                               \
3631           charset = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg)); \
3632           ENCODE_DESIGNATION (charset, reg, coding);                    \
3633         }                                                               \
3634   } while (0)
3635
3636
3637 /* Produce designation sequences of charsets in the line started from
3638    SRC to a place pointed by DST, and return updated DST.
3639
3640    If the current block ends before any end-of-line, we may fail to
3641    find all the necessary designations.  */
3642
3643 static unsigned char *
3644 encode_designation_at_bol (coding, charbuf, charbuf_end, dst)
3645      struct coding_system *coding;
3646      int *charbuf, *charbuf_end;
3647      unsigned char *dst;
3648 {
3649   struct charset *charset;
3650   /* Table of charsets to be designated to each graphic register.  */
3651   int r[4];
3652   int c, found = 0, reg;
3653   int produced_chars = 0;
3654   int multibytep = coding->dst_multibyte;
3655   Lisp_Object attrs;
3656   Lisp_Object charset_list;
3657
3658   attrs = CODING_ID_ATTRS (coding->id);
3659   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
3660   if (EQ (charset_list, Qiso_2022))
3661     charset_list = Viso_2022_charset_list;
3662
3663   for (reg = 0; reg < 4; reg++)
3664     r[reg] = -1;
3665
3666   while (found < 4)
3667     {
3668       int id;
3669
3670       c = *charbuf++;
3671       if (c == '\n')
3672         break;
3673       charset = char_charset (c, charset_list, NULL);
3674       id = CHARSET_ID (charset);
3675       reg = CODING_ISO_REQUEST (coding, id);
3676       if (reg >= 0 && r[reg] < 0)
3677         {
3678           found++;
3679           r[reg] = id;
3680         }
3681     }
3682
3683   if (found)
3684     {
3685       for (reg = 0; reg < 4; reg++)
3686         if (r[reg] >= 0
3687             && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
3688           ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
3689     }
3690
3691   return dst;
3692 }
3693
3694 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".  */
3695
3696 static int
3697 encode_coding_iso_2022 (coding)
3698      struct coding_system *coding;
3699 {
3700   int multibytep = coding->dst_multibyte;
3701   int *charbuf = coding->charbuf;
3702   int *charbuf_end = charbuf + coding->charbuf_used;
3703   unsigned char *dst = coding->destination + coding->produced;
3704   unsigned char *dst_end = coding->destination + coding->dst_bytes;
3705   int safe_room = 16;
3706   int bol_designation
3707     = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
3708        && CODING_ISO_BOL (coding));
3709   int produced_chars = 0;
3710   Lisp_Object attrs, eol_type, charset_list;
3711   int ascii_compatible;
3712   int c;
3713   int preferred_charset_id = -1;
3714
3715   CODING_GET_INFO (coding, attrs, charset_list);
3716   eol_type = CODING_ID_EOL_TYPE (coding->id);
3717   if (VECTORP (eol_type))
3718     eol_type = Qunix;
3719
3720   setup_iso_safe_charsets (attrs);
3721   /* Charset list may have been changed.  */
3722   charset_list = CODING_ATTR_CHARSET_LIST (attrs);              \
3723   coding->safe_charsets = (char *) SDATA (CODING_ATTR_SAFE_CHARSETS(attrs));
3724
3725   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
3726
3727   while (charbuf < charbuf_end)
3728     {
3729       ASSURE_DESTINATION (safe_room);
3730
3731       if (bol_designation)
3732         {
3733           unsigned char *dst_prev = dst;
3734
3735           /* We have to produce designation sequences if any now.  */
3736           dst = encode_designation_at_bol (coding, charbuf, charbuf_end, dst);
3737           bol_designation = 0;
3738           /* We are sure that designation sequences are all ASCII bytes.  */
3739           produced_chars += dst - dst_prev;
3740         }
3741
3742       c = *charbuf++;
3743
3744       if (c < 0)
3745         {
3746           /* Handle an annotation.  */
3747           switch (*charbuf)
3748             {
3749             case CODING_ANNOTATE_COMPOSITION_MASK:
3750               /* Not yet implemented.  */
3751               break;
3752             case CODING_ANNOTATE_CHARSET_MASK:
3753               preferred_charset_id = charbuf[3];
3754               if (preferred_charset_id >= 0
3755                   && NILP (Fmemq (make_number (preferred_charset_id),
3756                                   charset_list)))
3757                 preferred_charset_id = -1;
3758               break;
3759             default:
3760               abort ();
3761             }
3762           charbuf += -c - 1;
3763           continue;
3764         }
3765
3766       /* Now encode the character C.  */
3767       if (c < 0x20 || c == 0x7F)
3768         {
3769           if (c == '\n'
3770               || (c == '\r' && EQ (eol_type, Qmac)))
3771             {
3772               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
3773                 ENCODE_RESET_PLANE_AND_REGISTER ();
3774               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
3775                 {
3776                   int i;
3777
3778                   for (i = 0; i < 4; i++)
3779                     CODING_ISO_DESIGNATION (coding, i)
3780                       = CODING_ISO_INITIAL (coding, i);
3781                 }
3782               bol_designation
3783                 = CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL;
3784             }
3785           else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
3786             ENCODE_RESET_PLANE_AND_REGISTER ();
3787           EMIT_ONE_ASCII_BYTE (c);
3788         }
3789       else if (ASCII_CHAR_P (c))
3790         {
3791           if (ascii_compatible)
3792             EMIT_ONE_ASCII_BYTE (c);
3793           else
3794             {
3795               struct charset *charset = CHARSET_FROM_ID (charset_ascii);
3796               ENCODE_ISO_CHARACTER (charset, c);
3797             }
3798         }
3799       else if (CHAR_BYTE8_P (c))
3800         {
3801           c = CHAR_TO_BYTE8 (c);
3802           EMIT_ONE_BYTE (c);
3803         }
3804       else
3805         {
3806           struct charset *charset;
3807
3808           if (preferred_charset_id >= 0)
3809             {
3810               charset = CHARSET_FROM_ID (preferred_charset_id);
3811               if (! CHAR_CHARSET_P (c, charset))
3812                 charset = char_charset (c, charset_list, NULL);
3813             }
3814           else
3815             charset = char_charset (c, charset_list, NULL);
3816           if (!charset)
3817             {
3818               if (coding->mode & CODING_MODE_SAFE_ENCODING)
3819                 {
3820                   c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
3821                   charset = CHARSET_FROM_ID (charset_ascii);
3822                 }
3823               else
3824                 {
3825                   c = coding->default_char;
3826                   charset = char_charset (c, charset_list, NULL);
3827                 }
3828             }
3829           ENCODE_ISO_CHARACTER (charset, c);
3830         }
3831     }
3832
3833   if (coding->mode & CODING_MODE_LAST_BLOCK
3834       && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
3835     {
3836       ASSURE_DESTINATION (safe_room);
3837       ENCODE_RESET_PLANE_AND_REGISTER ();
3838     }
3839   record_conversion_result (coding, CODING_RESULT_SUCCESS);
3840   CODING_ISO_BOL (coding) = bol_designation;
3841   coding->produced_char += produced_chars;
3842   coding->produced = dst - coding->destination;
3843   return 0;
3844 }
3845
3846 \f
3847 /*** 8,9. SJIS and BIG5 handlers ***/
3848
3849 /* Although SJIS and BIG5 are not ISO's coding system, they are used
3850    quite widely.  So, for the moment, Emacs supports them in the bare
3851    C code.  But, in the future, they may be supported only by CCL.  */
3852
3853 /* SJIS is a coding system encoding three character sets: ASCII, right
3854    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
3855    as is.  A character of charset katakana-jisx0201 is encoded by
3856    "position-code + 0x80".  A character of charset japanese-jisx0208
3857    is encoded in 2-byte but two position-codes are divided and shifted
3858    so that it fit in the range below.
3859
3860    --- CODE RANGE of SJIS ---
3861    (character set)      (range)
3862    ASCII                0x00 .. 0x7F
3863    KATAKANA-JISX0201    0xA0 .. 0xDF
3864    JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
3865             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
3866    -------------------------------
3867
3868 */
3869
3870 /* BIG5 is a coding system encoding two character sets: ASCII and
3871    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
3872    character set and is encoded in two-byte.
3873
3874    --- CODE RANGE of BIG5 ---
3875    (character set)      (range)
3876    ASCII                0x00 .. 0x7F
3877    Big5 (1st byte)      0xA1 .. 0xFE
3878         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
3879    --------------------------
3880
3881   */
3882
3883 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3884    Check if a text is encoded in SJIS.  If it is, return
3885    CATEGORY_MASK_SJIS, else return 0.  */
3886
3887 static int
3888 detect_coding_sjis (coding, detect_info)
3889      struct coding_system *coding;
3890      struct coding_detection_info *detect_info;
3891 {
3892   const unsigned char *src = coding->source, *src_base;
3893   const unsigned char *src_end = coding->source + coding->src_bytes;
3894   int multibytep = coding->src_multibyte;
3895   int consumed_chars = 0;
3896   int found = 0;
3897   int c;
3898
3899   detect_info->checked |= CATEGORY_MASK_SJIS;
3900   /* A coding system of this category is always ASCII compatible.  */
3901   src += coding->head_ascii;
3902
3903   while (1)
3904     {
3905       src_base = src;
3906       ONE_MORE_BYTE (c);
3907       if (c < 0x80)
3908         continue;
3909       if ((c >= 0x81 && c <= 0x9F) || (c >= 0xE0 && c <= 0xEF))
3910         {
3911           ONE_MORE_BYTE (c);
3912           if (c < 0x40 || c == 0x7F || c > 0xFC)
3913             break;
3914           found = CATEGORY_MASK_SJIS;
3915         }
3916       else if (c >= 0xA0 && c < 0xE0)
3917         found = CATEGORY_MASK_SJIS;
3918       else
3919         break;
3920     }
3921   detect_info->rejected |= CATEGORY_MASK_SJIS;
3922   return 0;
3923
3924  no_more_source:
3925   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
3926     {
3927       detect_info->rejected |= CATEGORY_MASK_SJIS;
3928       return 0;
3929     }
3930   detect_info->found |= found;
3931   return 1;
3932 }
3933
3934 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3935    Check if a text is encoded in BIG5.  If it is, return
3936    CATEGORY_MASK_BIG5, else return 0.  */
3937
3938 static int
3939 detect_coding_big5 (coding, detect_info)
3940      struct coding_system *coding;
3941      struct coding_detection_info *detect_info;
3942 {
3943   const unsigned char *src = coding->source, *src_base;
3944   const unsigned char *src_end = coding->source + coding->src_bytes;
3945   int multibytep = coding->src_multibyte;
3946   int consumed_chars = 0;
3947   int found = 0;
3948   int c;
3949
3950   detect_info->checked |= CATEGORY_MASK_BIG5;
3951   /* A coding system of this category is always ASCII compatible.  */
3952   src += coding->head_ascii;
3953
3954   while (1)
3955     {
3956       src_base = src;
3957       ONE_MORE_BYTE (c);
3958       if (c < 0x80)
3959         continue;
3960       if (c >= 0xA1)
3961         {
3962           ONE_MORE_BYTE (c);
3963           if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
3964             return 0;
3965           found = CATEGORY_MASK_BIG5;
3966         }
3967       else
3968         break;
3969     }
3970   detect_info->rejected |= CATEGORY_MASK_BIG5;
3971   return 0;
3972
3973  no_more_source:
3974   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
3975     {
3976       detect_info->rejected |= CATEGORY_MASK_BIG5;
3977       return 0;
3978     }
3979   detect_info->found |= found;
3980   return 1;
3981 }
3982
3983 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
3984    If SJIS_P is 1, decode SJIS text, else decode BIG5 test.  */
3985
3986 static void
3987 decode_coding_sjis (coding)
3988      struct coding_system *coding;
3989 {
3990   const unsigned char *src = coding->source + coding->consumed;
3991   const unsigned char *src_end = coding->source + coding->src_bytes;
3992   const unsigned char *src_base;
3993   int *charbuf = coding->charbuf;
3994   int *charbuf_end = charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
3995   int consumed_chars = 0, consumed_chars_base;
3996   int multibytep = coding->src_multibyte;
3997   struct charset *charset_roman, *charset_kanji, *charset_kana;
3998   struct charset *charset_kanji2;
3999   Lisp_Object attrs, charset_list, val;
4000   int char_offset = coding->produced_char;
4001   int last_offset = char_offset;
4002   int last_id = charset_ascii;
4003
4004   CODING_GET_INFO (coding, attrs, charset_list);
4005
4006   val = charset_list;
4007   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4008   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4009   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4010   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4011
4012   while (1)
4013     {
4014       int c, c1;
4015       struct charset *charset;
4016
4017       src_base = src;
4018       consumed_chars_base = consumed_chars;
4019
4020       if (charbuf >= charbuf_end)
4021         break;
4022
4023       ONE_MORE_BYTE (c);
4024       if (c < 0)
4025         goto invalid_code;
4026       if (c < 0x80)
4027         charset = charset_roman;
4028       else if (c == 0x80 || c == 0xA0)
4029         goto invalid_code;
4030       else if (c >= 0xA1 && c <= 0xDF)
4031         {
4032           /* SJIS -> JISX0201-Kana */
4033           c &= 0x7F;
4034           charset = charset_kana;
4035         }
4036       else if (c <= 0xEF)
4037         {
4038           /* SJIS -> JISX0208 */
4039           ONE_MORE_BYTE (c1);
4040           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4041             goto invalid_code;
4042           c = (c << 8) | c1;
4043           SJIS_TO_JIS (c);
4044           charset = charset_kanji;
4045         }
4046       else if (c <= 0xFC && charset_kanji2)
4047         {
4048           /* SJIS -> JISX0213-2 */
4049           ONE_MORE_BYTE (c1);
4050           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4051             goto invalid_code;
4052           c = (c << 8) | c1;
4053           SJIS_TO_JIS2 (c);
4054           charset = charset_kanji2;
4055         }
4056       else
4057         goto invalid_code;
4058       if (charset->id != charset_ascii
4059           && last_id != charset->id)
4060         {
4061           if (last_id != charset_ascii)
4062             ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4063           last_id = charset->id;
4064           last_offset = char_offset;
4065         }
4066       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4067       *charbuf++ = c;
4068       char_offset++;
4069       continue;
4070
4071     invalid_code:
4072       src = src_base;
4073       consumed_chars = consumed_chars_base;
4074       ONE_MORE_BYTE (c);
4075       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4076       char_offset++;
4077       coding->errors++;
4078     }
4079
4080  no_more_source:
4081   if (last_id != charset_ascii)
4082     ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4083   coding->consumed_char += consumed_chars_base;
4084   coding->consumed = src_base - coding->source;
4085   coding->charbuf_used = charbuf - coding->charbuf;
4086 }
4087
4088 static void
4089 decode_coding_big5 (coding)
4090      struct coding_system *coding;
4091 {
4092   const unsigned char *src = coding->source + coding->consumed;
4093   const unsigned char *src_end = coding->source + coding->src_bytes;
4094   const unsigned char *src_base;
4095   int *charbuf = coding->charbuf;
4096   int *charbuf_end = charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
4097   int consumed_chars = 0, consumed_chars_base;
4098   int multibytep = coding->src_multibyte;
4099   struct charset *charset_roman, *charset_big5;
4100   Lisp_Object attrs, charset_list, val;
4101   int char_offset = coding->produced_char;
4102   int last_offset = char_offset;
4103   int last_id = charset_ascii;
4104
4105   CODING_GET_INFO (coding, attrs, charset_list);
4106   val = charset_list;
4107   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4108   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4109
4110   while (1)
4111     {
4112       int c, c1;
4113       struct charset *charset;
4114
4115       src_base = src;
4116       consumed_chars_base = consumed_chars;
4117
4118       if (charbuf >= charbuf_end)
4119         break;
4120
4121       ONE_MORE_BYTE (c);
4122
4123       if (c < 0)
4124         goto invalid_code;
4125       if (c < 0x80)
4126         charset = charset_roman;
4127       else
4128         {
4129           /* BIG5 -> Big5 */
4130           if (c < 0xA1 || c > 0xFE)
4131             goto invalid_code;
4132           ONE_MORE_BYTE (c1);
4133           if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4134             goto invalid_code;
4135           c = c << 8 | c1;
4136           charset = charset_big5;
4137         }
4138       if (charset->id != charset_ascii
4139           && last_id != charset->id)
4140         {
4141           if (last_id != charset_ascii)
4142             ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4143           last_id = charset->id;
4144           last_offset = char_offset;
4145         }
4146       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4147       *charbuf++ = c;
4148       char_offset++;
4149       continue;
4150
4151     invalid_code:
4152       src = src_base;
4153       consumed_chars = consumed_chars_base;
4154       ONE_MORE_BYTE (c);
4155       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4156       char_offset++;
4157       coding->errors++;
4158     }
4159
4160  no_more_source:
4161   if (last_id != charset_ascii)
4162     ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4163   coding->consumed_char += consumed_chars_base;
4164   coding->consumed = src_base - coding->source;
4165   coding->charbuf_used = charbuf - coding->charbuf;
4166 }
4167
4168 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4169    This function can encode charsets `ascii', `katakana-jisx0201',
4170    `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'.  We
4171    are sure that all these charsets are registered as official charset
4172    (i.e. do not have extended leading-codes).  Characters of other
4173    charsets are produced without any encoding.  If SJIS_P is 1, encode
4174    SJIS text, else encode BIG5 text.  */
4175
4176 static int
4177 encode_coding_sjis (coding)
4178      struct coding_system *coding;
4179 {
4180   int multibytep = coding->dst_multibyte;
4181   int *charbuf = coding->charbuf;
4182   int *charbuf_end = charbuf + coding->charbuf_used;
4183   unsigned char *dst = coding->destination + coding->produced;
4184   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4185   int safe_room = 4;
4186   int produced_chars = 0;
4187   Lisp_Object attrs, charset_list, val;
4188   int ascii_compatible;
4189   struct charset *charset_roman, *charset_kanji, *charset_kana;
4190   struct charset *charset_kanji2;
4191   int c;
4192
4193   CODING_GET_INFO (coding, attrs, charset_list);
4194   val = charset_list;
4195   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4196   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4197   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4198   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4199
4200   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4201
4202   while (charbuf < charbuf_end)
4203     {
4204       ASSURE_DESTINATION (safe_room);
4205       c = *charbuf++;
4206       /* Now encode the character C.  */
4207       if (ASCII_CHAR_P (c) && ascii_compatible)
4208         EMIT_ONE_ASCII_BYTE (c);
4209       else if (CHAR_BYTE8_P (c))
4210         {
4211           c = CHAR_TO_BYTE8 (c);
4212           EMIT_ONE_BYTE (c);
4213         }
4214       else
4215         {
4216           unsigned code;
4217           struct charset *charset = char_charset (c, charset_list, &code);
4218
4219           if (!charset)
4220             {
4221               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4222                 {
4223                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4224                   charset = CHARSET_FROM_ID (charset_ascii);
4225                 }
4226               else
4227                 {
4228                   c = coding->default_char;
4229                   charset = char_charset (c, charset_list, &code);
4230                 }
4231             }
4232           if (code == CHARSET_INVALID_CODE (charset))
4233             abort ();
4234           if (charset == charset_kanji)
4235             {
4236               int c1, c2;
4237               JIS_TO_SJIS (code);
4238               c1 = code >> 8, c2 = code & 0xFF;
4239               EMIT_TWO_BYTES (c1, c2);
4240             }
4241           else if (charset == charset_kana)
4242             EMIT_ONE_BYTE (code | 0x80);
4243           else if (charset_kanji2 && charset == charset_kanji2)
4244             {
4245               int c1, c2;
4246
4247               c1 = code >> 8;
4248               if (c1 == 0x21 || (c1 >= 0x23 && c1 < 0x25)
4249                   || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
4250                 {
4251                   JIS_TO_SJIS2 (code);
4252                   c1 = code >> 8, c2 = code & 0xFF;
4253                   EMIT_TWO_BYTES (c1, c2);
4254                 }
4255               else
4256                 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4257             }
4258           else
4259             EMIT_ONE_ASCII_BYTE (code & 0x7F);
4260         }
4261     }
4262   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4263   coding->produced_char += produced_chars;
4264   coding->produced = dst - coding->destination;
4265   return 0;
4266 }
4267
4268 static int
4269 encode_coding_big5 (coding)
4270      struct coding_system *coding;
4271 {
4272   int multibytep = coding->dst_multibyte;
4273   int *charbuf = coding->charbuf;
4274   int *charbuf_end = charbuf + coding->charbuf_used;
4275   unsigned char *dst = coding->destination + coding->produced;
4276   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4277   int safe_room = 4;
4278   int produced_chars = 0;
4279   Lisp_Object attrs, charset_list, val;
4280   int ascii_compatible;
4281   struct charset *charset_roman, *charset_big5;
4282   int c;
4283
4284   CODING_GET_INFO (coding, attrs, charset_list);
4285   val = charset_list;
4286   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4287   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4288   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4289
4290   while (charbuf < charbuf_end)
4291     {
4292       ASSURE_DESTINATION (safe_room);
4293       c = *charbuf++;
4294       /* Now encode the character C.  */
4295       if (ASCII_CHAR_P (c) && ascii_compatible)
4296         EMIT_ONE_ASCII_BYTE (c);
4297       else if (CHAR_BYTE8_P (c))
4298         {
4299           c = CHAR_TO_BYTE8 (c);
4300           EMIT_ONE_BYTE (c);
4301         }
4302       else
4303         {
4304           unsigned code;
4305           struct charset *charset = char_charset (c, charset_list, &code);
4306
4307           if (! charset)
4308             {
4309               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4310                 {
4311                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4312                   charset = CHARSET_FROM_ID (charset_ascii);
4313                 }
4314               else
4315                 {
4316                   c = coding->default_char;
4317                   charset = char_charset (c, charset_list, &code);
4318                 }
4319             }
4320           if (code == CHARSET_INVALID_CODE (charset))
4321             abort ();
4322           if (charset == charset_big5)
4323             {
4324               int c1, c2;
4325
4326               c1 = code >> 8, c2 = code & 0xFF;
4327               EMIT_TWO_BYTES (c1, c2);
4328             }
4329           else
4330             EMIT_ONE_ASCII_BYTE (code & 0x7F);
4331         }
4332     }
4333   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4334   coding->produced_char += produced_chars;
4335   coding->produced = dst - coding->destination;
4336   return 0;
4337 }
4338
4339 \f
4340 /*** 10. CCL handlers ***/
4341
4342 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4343    Check if a text is encoded in a coding system of which
4344    encoder/decoder are written in CCL program.  If it is, return
4345    CATEGORY_MASK_CCL, else return 0.  */
4346
4347 static int
4348 detect_coding_ccl (coding, detect_info)
4349      struct coding_system *coding;
4350      struct coding_detection_info *detect_info;
4351 {
4352   const unsigned char *src = coding->source, *src_base;
4353   const unsigned char *src_end = coding->source + coding->src_bytes;
4354   int multibytep = coding->src_multibyte;
4355   int consumed_chars = 0;
4356   int found = 0;
4357   unsigned char *valids = CODING_CCL_VALIDS (coding);
4358   int head_ascii = coding->head_ascii;
4359   Lisp_Object attrs;
4360
4361   detect_info->checked |= CATEGORY_MASK_CCL;
4362
4363   coding = &coding_categories[coding_category_ccl];
4364   attrs = CODING_ID_ATTRS (coding->id);
4365   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
4366     src += head_ascii;
4367
4368   while (1)
4369     {
4370       int c;
4371
4372       src_base = src;
4373       ONE_MORE_BYTE (c);
4374       if (c < 0 || ! valids[c])
4375         break;
4376       if ((valids[c] > 1))
4377         found = CATEGORY_MASK_CCL;
4378     }
4379   detect_info->rejected |= CATEGORY_MASK_CCL;
4380   return 0;
4381
4382  no_more_source:
4383   detect_info->found |= found;
4384   return 1;
4385 }
4386
4387 static void
4388 decode_coding_ccl (coding)
4389      struct coding_system *coding;
4390 {
4391   const unsigned char *src = coding->source + coding->consumed;
4392   const unsigned char *src_end = coding->source + coding->src_bytes;
4393   int *charbuf = coding->charbuf;
4394   int *charbuf_end = charbuf + coding->charbuf_size;
4395   int consumed_chars = 0;
4396   int multibytep = coding->src_multibyte;
4397   struct ccl_program ccl;
4398   int source_charbuf[1024];
4399   int source_byteidx[1024];
4400   Lisp_Object attrs, charset_list;
4401
4402   CODING_GET_INFO (coding, attrs, charset_list);
4403   setup_ccl_program (&ccl, CODING_CCL_DECODER (coding));
4404
4405   while (src < src_end)
4406     {
4407       const unsigned char *p = src;
4408       int *source, *source_end;
4409       int i = 0;
4410
4411       if (multibytep)
4412         while (i < 1024 && p < src_end)
4413           {
4414             source_byteidx[i] = p - src;
4415             source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
4416           }
4417       else
4418         while (i < 1024 && p < src_end)
4419           source_charbuf[i++] = *p++;
4420
4421       if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
4422         ccl.last_block = 1;
4423
4424       source = source_charbuf;
4425       source_end = source + i;
4426       while (source < source_end)
4427         {
4428           ccl_driver (&ccl, source, charbuf,
4429                       source_end - source, charbuf_end - charbuf,
4430                       charset_list);
4431           source += ccl.consumed;
4432           charbuf += ccl.produced;
4433           if (ccl.status != CCL_STAT_SUSPEND_BY_DST)
4434             break;
4435         }
4436       if (source < source_end)
4437         src += source_byteidx[source - source_charbuf];
4438       else
4439         src = p;
4440       consumed_chars += source - source_charbuf;
4441
4442       if (ccl.status != CCL_STAT_SUSPEND_BY_SRC
4443           && ccl.status != CODING_RESULT_INSUFFICIENT_SRC)
4444         break;
4445     }
4446
4447   switch (ccl.status)
4448     {
4449     case CCL_STAT_SUSPEND_BY_SRC:
4450       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
4451       break;
4452     case CCL_STAT_SUSPEND_BY_DST:
4453       break;
4454     case CCL_STAT_QUIT:
4455     case CCL_STAT_INVALID_CMD:
4456       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
4457       break;
4458     default:
4459       record_conversion_result (coding, CODING_RESULT_SUCCESS);
4460       break;
4461     }
4462   coding->consumed_char += consumed_chars;
4463   coding->consumed = src - coding->source;
4464   coding->charbuf_used = charbuf - coding->charbuf;
4465 }
4466
4467 static int
4468 encode_coding_ccl (coding)
4469      struct coding_system *coding;
4470 {
4471   struct ccl_program ccl;
4472   int multibytep = coding->dst_multibyte;
4473   int *charbuf = coding->charbuf;
4474   int *charbuf_end = charbuf + coding->charbuf_used;
4475   unsigned char *dst = coding->destination + coding->produced;
4476   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4477   unsigned char *adjusted_dst_end = dst_end - 1;
4478   int destination_charbuf[1024];
4479   int i, produced_chars = 0;
4480   Lisp_Object attrs, charset_list;
4481
4482   CODING_GET_INFO (coding, attrs, charset_list);
4483   setup_ccl_program (&ccl, CODING_CCL_ENCODER (coding));
4484
4485   ccl.last_block = coding->mode & CODING_MODE_LAST_BLOCK;
4486   ccl.dst_multibyte = coding->dst_multibyte;
4487
4488   while (charbuf < charbuf_end && dst < adjusted_dst_end)
4489     {
4490       int dst_bytes = dst_end - dst;
4491       if (dst_bytes > 1024)
4492         dst_bytes = 1024;
4493
4494       ccl_driver (&ccl, charbuf, destination_charbuf,
4495                   charbuf_end - charbuf, dst_bytes, charset_list);
4496       charbuf += ccl.consumed;
4497       if (multibytep)
4498         for (i = 0; i < ccl.produced; i++)
4499           EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
4500       else
4501         {
4502           for (i = 0; i < ccl.produced; i++)
4503             *dst++ = destination_charbuf[i] & 0xFF;
4504           produced_chars += ccl.produced;
4505         }
4506     }
4507
4508   switch (ccl.status)
4509     {
4510     case CCL_STAT_SUSPEND_BY_SRC:
4511       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
4512       break;
4513     case CCL_STAT_SUSPEND_BY_DST:
4514       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
4515       break;
4516     case CCL_STAT_QUIT:
4517     case CCL_STAT_INVALID_CMD:
4518       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
4519       break;
4520     default:
4521       record_conversion_result (coding, CODING_RESULT_SUCCESS);
4522       break;
4523     }
4524
4525   coding->produced_char += produced_chars;
4526   coding->produced = dst - coding->destination;
4527   return 0;
4528 }
4529
4530
4531 \f
4532 /*** 10, 11. no-conversion handlers ***/
4533
4534 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
4535
4536 static void
4537 decode_coding_raw_text (coding)
4538      struct coding_system *coding;
4539 {
4540   coding->chars_at_source = 1;
4541   coding->consumed_char = 0;
4542   coding->consumed = 0;
4543   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4544 }
4545
4546 static int
4547 encode_coding_raw_text (coding)
4548      struct coding_system *coding;
4549 {
4550   int multibytep = coding->dst_multibyte;
4551   int *charbuf = coding->charbuf;
4552   int *charbuf_end = coding->charbuf + coding->charbuf_used;
4553   unsigned char *dst = coding->destination + coding->produced;
4554   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4555   int produced_chars = 0;
4556   int c;
4557
4558   if (multibytep)
4559     {
4560       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
4561
4562       if (coding->src_multibyte)
4563         while (charbuf < charbuf_end)
4564           {
4565             ASSURE_DESTINATION (safe_room);
4566             c = *charbuf++;
4567             if (ASCII_CHAR_P (c))
4568               EMIT_ONE_ASCII_BYTE (c);
4569             else if (CHAR_BYTE8_P (c))
4570               {
4571                 c = CHAR_TO_BYTE8 (c);
4572                 EMIT_ONE_BYTE (c);
4573               }
4574             else
4575               {
4576                 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
4577
4578                 CHAR_STRING_ADVANCE (c, p1);
4579                 while (p0 < p1)
4580                   {
4581                     EMIT_ONE_BYTE (*p0);
4582                     p0++;
4583                   }
4584               }
4585           }
4586       else
4587         while (charbuf < charbuf_end)
4588           {
4589             ASSURE_DESTINATION (safe_room);
4590             c = *charbuf++;
4591             EMIT_ONE_BYTE (c);
4592           }
4593     }
4594   else
4595     {
4596       if (coding->src_multibyte)
4597         {
4598           int safe_room = MAX_MULTIBYTE_LENGTH;
4599
4600           while (charbuf < charbuf_end)
4601             {
4602               ASSURE_DESTINATION (safe_room);
4603               c = *charbuf++;
4604               if (ASCII_CHAR_P (c))
4605                 *dst++ = c;
4606               else if (CHAR_BYTE8_P (c))
4607                 *dst++ = CHAR_TO_BYTE8 (c);
4608               else
4609                 CHAR_STRING_ADVANCE (c, dst);
4610               produced_chars++;
4611             }
4612         }
4613       else
4614         {
4615           ASSURE_DESTINATION (charbuf_end - charbuf);
4616           while (charbuf < charbuf_end && dst < dst_end)
4617             *dst++ = *charbuf++;
4618           produced_chars = dst - (coding->destination + coding->dst_bytes);
4619         }
4620     }
4621   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4622   coding->produced_char += produced_chars;
4623   coding->produced = dst - coding->destination;
4624   return 0;
4625 }
4626
4627 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4628    Check if a text is encoded in a charset-based coding system.  If it
4629    is, return 1, else return 0.  */
4630
4631 static int
4632 detect_coding_charset (coding, detect_info)
4633      struct coding_system *coding;
4634      struct coding_detection_info *detect_info;
4635 {
4636   const unsigned char *src = coding->source, *src_base;
4637   const unsigned char *src_end = coding->source + coding->src_bytes;
4638   int multibytep = coding->src_multibyte;
4639   int consumed_chars = 0;
4640   Lisp_Object attrs, valids;
4641   int found = 0;
4642
4643   detect_info->checked |= CATEGORY_MASK_CHARSET;
4644
4645   coding = &coding_categories[coding_category_charset];
4646   attrs = CODING_ID_ATTRS (coding->id);
4647   valids = AREF (attrs, coding_attr_charset_valids);
4648
4649   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
4650     src += coding->head_ascii;
4651
4652   while (1)
4653     {
4654       int c;
4655
4656       src_base = src;
4657       ONE_MORE_BYTE (c);
4658       if (c < 0)
4659         continue;
4660       if (NILP (AREF (valids, c)))
4661         break;
4662       if (c >= 0x80)
4663         found = CATEGORY_MASK_CHARSET;
4664     }
4665   detect_info->rejected |= CATEGORY_MASK_CHARSET;
4666   return 0;
4667
4668  no_more_source:
4669   detect_info->found |= found;
4670   return 1;
4671 }
4672
4673 static void
4674 decode_coding_charset (coding)
4675      struct coding_system *coding;
4676 {
4677   const unsigned char *src = coding->source + coding->consumed;
4678   const unsigned char *src_end = coding->source + coding->src_bytes;
4679   const unsigned char *src_base;
4680   int *charbuf = coding->charbuf;
4681   int *charbuf_end = charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
4682   int consumed_chars = 0, consumed_chars_base;
4683   int multibytep = coding->src_multibyte;
4684   Lisp_Object attrs, charset_list, valids;
4685   int char_offset = coding->produced_char;
4686   int last_offset = char_offset;
4687   int last_id = charset_ascii;
4688
4689   CODING_GET_INFO (coding, attrs, charset_list);
4690   valids = AREF (attrs, coding_attr_charset_valids);
4691
4692   while (1)
4693     {
4694       int c;
4695       Lisp_Object val;
4696       struct charset *charset;
4697       int dim;
4698       int len = 1;
4699       unsigned code;
4700
4701       src_base = src;
4702       consumed_chars_base = consumed_chars;
4703
4704       if (charbuf >= charbuf_end)
4705         break;
4706
4707       ONE_MORE_BYTE (c);
4708       if (c < 0)
4709         goto invalid_code;
4710       code = c;
4711
4712       val = AREF (valids, c);
4713       if (NILP (val))
4714         goto invalid_code;
4715       if (INTEGERP (val))
4716         {
4717           charset = CHARSET_FROM_ID (XFASTINT (val));
4718           dim = CHARSET_DIMENSION (charset);
4719           while (len < dim)
4720             {
4721               ONE_MORE_BYTE (c);
4722               code = (code << 8) | c;
4723               len++;
4724             }
4725           CODING_DECODE_CHAR (coding, src, src_base, src_end,
4726                               charset, code, c);
4727         }
4728       else
4729         {
4730           /* VAL is a list of charset IDs.  It is assured that the
4731              list is sorted by charset dimensions (smaller one
4732              comes first).  */
4733           while (CONSP (val))
4734             {
4735               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
4736               dim = CHARSET_DIMENSION (charset);
4737               while (len < dim)
4738                 {
4739                   ONE_MORE_BYTE (c);
4740                   code = (code << 8) | c;
4741                   len++;
4742                 }
4743               CODING_DECODE_CHAR (coding, src, src_base,
4744                                   src_end, charset, code, c);
4745               if (c >= 0)
4746                 break;
4747               val = XCDR (val);
4748             }
4749         }
4750       if (c < 0)
4751         goto invalid_code;
4752       if (charset->id != charset_ascii
4753           && last_id != charset->id)
4754         {
4755           if (last_id != charset_ascii)
4756             ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4757           last_id = charset->id;
4758           last_offset = char_offset;
4759         }
4760
4761       *charbuf++ = c;
4762       char_offset++;
4763       continue;
4764
4765     invalid_code:
4766       src = src_base;
4767       consumed_chars = consumed_chars_base;
4768       ONE_MORE_BYTE (c);
4769       *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
4770       char_offset++;
4771       coding->errors++;
4772     }
4773
4774  no_more_source:
4775   if (last_id != charset_ascii)
4776     ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4777   coding->consumed_char += consumed_chars_base;
4778   coding->consumed = src_base - coding->source;
4779   coding->charbuf_used = charbuf - coding->charbuf;
4780 }
4781
4782 static int
4783 encode_coding_charset (coding)
4784      struct coding_system *coding;
4785 {
4786   int multibytep = coding->dst_multibyte;
4787   int *charbuf = coding->charbuf;
4788   int *charbuf_end = charbuf + coding->charbuf_used;
4789   unsigned char *dst = coding->destination + coding->produced;
4790   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4791   int safe_room = MAX_MULTIBYTE_LENGTH;
4792   int produced_chars = 0;
4793   Lisp_Object attrs, charset_list;
4794   int ascii_compatible;
4795   int c;
4796
4797   CODING_GET_INFO (coding, attrs, charset_list);
4798   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4799
4800   while (charbuf < charbuf_end)
4801     {
4802       struct charset *charset;
4803       unsigned code;
4804
4805       ASSURE_DESTINATION (safe_room);
4806       c = *charbuf++;
4807       if (ascii_compatible && ASCII_CHAR_P (c))
4808         EMIT_ONE_ASCII_BYTE (c);
4809       else if (CHAR_BYTE8_P (c))
4810         {
4811           c = CHAR_TO_BYTE8 (c);
4812           EMIT_ONE_BYTE (c);
4813         }
4814       else
4815         {
4816           charset = char_charset (c, charset_list, &code);
4817           if (charset)
4818             {
4819               if (CHARSET_DIMENSION (charset) == 1)
4820                 EMIT_ONE_BYTE (code);
4821               else if (CHARSET_DIMENSION (charset) == 2)
4822                 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
4823               else if (CHARSET_DIMENSION (charset) == 3)
4824                 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
4825               else
4826                 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
4827                                  (code >> 8) & 0xFF, code & 0xFF);
4828             }
4829           else
4830             {
4831               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4832                 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4833               else
4834                 c = coding->default_char;
4835               EMIT_ONE_BYTE (c);
4836             }
4837         }
4838     }
4839
4840   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4841   coding->produced_char += produced_chars;
4842   coding->produced = dst - coding->destination;
4843   return 0;
4844 }
4845
4846 \f
4847 /*** 7. C library functions ***/
4848
4849 /* Setup coding context CODING from information about CODING_SYSTEM.
4850    If CODING_SYSTEM is nil, `no-conversion' is assumed.  If
4851    CODING_SYSTEM is invalid, signal an error.  */
4852
4853 void
4854 setup_coding_system (coding_system, coding)
4855      Lisp_Object coding_system;
4856      struct coding_system *coding;
4857 {
4858   Lisp_Object attrs;
4859   Lisp_Object eol_type;
4860   Lisp_Object coding_type;
4861   Lisp_Object val;
4862
4863   if (NILP (coding_system))
4864     coding_system = Qno_conversion;
4865
4866   CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
4867
4868   attrs = CODING_ID_ATTRS (coding->id);
4869   eol_type = CODING_ID_EOL_TYPE (coding->id);
4870
4871   coding->mode = 0;
4872   coding->head_ascii = -1;
4873   coding->common_flags
4874     = (VECTORP (eol_type) ? CODING_REQUIRE_DETECTION_MASK : 0);
4875   if (! NILP (CODING_ATTR_POST_READ (attrs)))
4876     coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
4877   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
4878     coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
4879   if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
4880     coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
4881
4882   val = CODING_ATTR_SAFE_CHARSETS (attrs);
4883   coding->max_charset_id = SCHARS (val) - 1;
4884   coding->safe_charsets = (char *) SDATA (val);
4885   coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
4886
4887   coding_type = CODING_ATTR_TYPE (attrs);
4888   if (EQ (coding_type, Qundecided))
4889     {
4890       coding->detector = NULL;
4891       coding->decoder = decode_coding_raw_text;
4892       coding->encoder = encode_coding_raw_text;
4893       coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
4894     }
4895   else if (EQ (coding_type, Qiso_2022))
4896     {
4897       int i;
4898       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
4899
4900       /* Invoke graphic register 0 to plane 0.  */
4901       CODING_ISO_INVOCATION (coding, 0) = 0;
4902       /* Invoke graphic register 1 to plane 1 if we can use 8-bit.  */
4903       CODING_ISO_INVOCATION (coding, 1)
4904         = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
4905       /* Setup the initial status of designation.  */
4906       for (i = 0; i < 4; i++)
4907         CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
4908       /* Not single shifting initially.  */
4909       CODING_ISO_SINGLE_SHIFTING (coding) = 0;
4910       /* Beginning of buffer should also be regarded as bol. */
4911       CODING_ISO_BOL (coding) = 1;
4912       coding->detector = detect_coding_iso_2022;
4913       coding->decoder = decode_coding_iso_2022;
4914       coding->encoder = encode_coding_iso_2022;
4915       if (flags & CODING_ISO_FLAG_SAFE)
4916         coding->mode |= CODING_MODE_SAFE_ENCODING;
4917       coding->common_flags
4918         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
4919             | CODING_REQUIRE_FLUSHING_MASK);
4920       if (flags & CODING_ISO_FLAG_COMPOSITION)
4921         coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
4922       if (flags & CODING_ISO_FLAG_DESIGNATION)
4923         coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
4924       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
4925         {
4926           setup_iso_safe_charsets (attrs);
4927           val = CODING_ATTR_SAFE_CHARSETS (attrs);
4928           coding->max_charset_id = SCHARS (val) - 1;
4929           coding->safe_charsets = (char *) SDATA (val);
4930         }
4931       CODING_ISO_FLAGS (coding) = flags;
4932     }
4933   else if (EQ (coding_type, Qcharset))
4934     {
4935       coding->detector = detect_coding_charset;
4936       coding->decoder = decode_coding_charset;
4937       coding->encoder = encode_coding_charset;
4938       coding->common_flags
4939         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4940     }
4941   else if (EQ (coding_type, Qutf_8))
4942     {
4943       coding->detector = detect_coding_utf_8;
4944       coding->decoder = decode_coding_utf_8;
4945       coding->encoder = encode_coding_utf_8;
4946       coding->common_flags
4947         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4948     }
4949   else if (EQ (coding_type, Qutf_16))
4950     {
4951       val = AREF (attrs, coding_attr_utf_16_bom);
4952       CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_16_detect_bom
4953                                     : EQ (val, Qt) ? utf_16_with_bom
4954                                     : utf_16_without_bom);
4955       val = AREF (attrs, coding_attr_utf_16_endian);
4956       CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
4957                                        : utf_16_little_endian);
4958       CODING_UTF_16_SURROGATE (coding) = 0;
4959       coding->detector = detect_coding_utf_16;
4960       coding->decoder = decode_coding_utf_16;
4961       coding->encoder = encode_coding_utf_16;
4962       coding->common_flags
4963         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4964       if (CODING_UTF_16_BOM (coding) == utf_16_detect_bom)
4965         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
4966     }
4967   else if (EQ (coding_type, Qccl))
4968     {
4969       coding->detector = detect_coding_ccl;
4970       coding->decoder = decode_coding_ccl;
4971       coding->encoder = encode_coding_ccl;
4972       coding->common_flags
4973         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
4974             | CODING_REQUIRE_FLUSHING_MASK);
4975     }
4976   else if (EQ (coding_type, Qemacs_mule))
4977     {
4978       coding->detector = detect_coding_emacs_mule;
4979       coding->decoder = decode_coding_emacs_mule;
4980       coding->encoder = encode_coding_emacs_mule;
4981       coding->common_flags
4982         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4983       if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
4984           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
4985         {
4986           Lisp_Object tail, safe_charsets;
4987           int max_charset_id = 0;
4988
4989           for (tail = Vemacs_mule_charset_list; CONSP (tail);
4990                tail = XCDR (tail))
4991             if (max_charset_id < XFASTINT (XCAR (tail)))
4992               max_charset_id = XFASTINT (XCAR (tail));
4993           safe_charsets = Fmake_string (make_number (max_charset_id + 1),
4994                                         make_number (255));
4995           for (tail = Vemacs_mule_charset_list; CONSP (tail);
4996                tail = XCDR (tail))
4997             SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
4998           coding->max_charset_id = max_charset_id;
4999           coding->safe_charsets = (char *) SDATA (safe_charsets);
5000         }
5001     }
5002   else if (EQ (coding_type, Qshift_jis))
5003     {
5004       coding->detector = detect_coding_sjis;
5005       coding->decoder = decode_coding_sjis;
5006       coding->encoder = encode_coding_sjis;
5007       coding->common_flags
5008         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5009     }
5010   else if (EQ (coding_type, Qbig5))
5011     {
5012       coding->detector = detect_coding_big5;
5013       coding->decoder = decode_coding_big5;
5014       coding->encoder = encode_coding_big5;
5015       coding->common_flags
5016         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5017     }
5018   else                          /* EQ (coding_type, Qraw_text) */
5019     {
5020       coding->detector = NULL;
5021       coding->decoder = decode_coding_raw_text;
5022       coding->encoder = encode_coding_raw_text;
5023     }
5024
5025   return;
5026 }
5027
5028 /* Return raw-text or one of its subsidiaries that has the same
5029    eol_type as CODING-SYSTEM.  */
5030
5031 Lisp_Object
5032 raw_text_coding_system (coding_system)
5033      Lisp_Object coding_system;
5034 {
5035   Lisp_Object spec, attrs;
5036   Lisp_Object eol_type, raw_text_eol_type;
5037
5038   if (NILP (coding_system))
5039     return Qraw_text;
5040   spec = CODING_SYSTEM_SPEC (coding_system);
5041   attrs = AREF (spec, 0);
5042
5043   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
5044     return coding_system;
5045
5046   eol_type = AREF (spec, 2);
5047   if (VECTORP (eol_type))
5048     return Qraw_text;
5049   spec = CODING_SYSTEM_SPEC (Qraw_text);
5050   raw_text_eol_type = AREF (spec, 2);
5051   return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
5052           : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
5053           : AREF (raw_text_eol_type, 2));
5054 }
5055
5056
5057 /* If CODING_SYSTEM doesn't specify end-of-line format but PARENT
5058    does, return one of the subsidiary that has the same eol-spec as
5059    PARENT.  Otherwise, return CODING_SYSTEM.  */
5060
5061 Lisp_Object
5062 coding_inherit_eol_type (coding_system, parent)
5063      Lisp_Object coding_system, parent;
5064 {
5065   Lisp_Object spec, eol_type;
5066
5067   if (NILP (coding_system))
5068     coding_system = Qraw_text;
5069   spec = CODING_SYSTEM_SPEC (coding_system);
5070   eol_type = AREF (spec, 2);
5071   if (VECTORP (eol_type)
5072       && ! NILP (parent))
5073     {
5074       Lisp_Object parent_spec;
5075       Lisp_Object parent_eol_type;
5076
5077       parent_spec
5078         = CODING_SYSTEM_SPEC (buffer_defaults.buffer_file_coding_system);
5079       parent_eol_type = AREF (parent_spec, 2);
5080       if (EQ (parent_eol_type, Qunix))
5081         coding_system = AREF (eol_type, 0);
5082       else if (EQ (parent_eol_type, Qdos))
5083         coding_system = AREF (eol_type, 1);
5084       else if (EQ (parent_eol_type, Qmac))
5085         coding_system = AREF (eol_type, 2);
5086     }
5087   return coding_system;
5088 }
5089
5090 /* Emacs has a mechanism to automatically detect a coding system if it
5091    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
5092    it's impossible to distinguish some coding systems accurately
5093    because they use the same range of codes.  So, at first, coding
5094    systems are categorized into 7, those are:
5095
5096    o coding-category-emacs-mule
5097
5098         The category for a coding system which has the same code range
5099         as Emacs' internal format.  Assigned the coding-system (Lisp
5100         symbol) `emacs-mule' by default.
5101
5102    o coding-category-sjis
5103
5104         The category for a coding system which has the same code range
5105         as SJIS.  Assigned the coding-system (Lisp
5106         symbol) `japanese-shift-jis' by default.
5107
5108    o coding-category-iso-7
5109
5110         The category for a coding system which has the same code range
5111         as ISO2022 of 7-bit environment.  This doesn't use any locking
5112         shift and single shift functions.  This can encode/decode all
5113         charsets.  Assigned the coding-system (Lisp symbol)
5114         `iso-2022-7bit' by default.
5115
5116    o coding-category-iso-7-tight
5117
5118         Same as coding-category-iso-7 except that this can
5119         encode/decode only the specified charsets.
5120
5121    o coding-category-iso-8-1
5122
5123         The category for a coding system which has the same code range
5124         as ISO2022 of 8-bit environment and graphic plane 1 used only
5125         for DIMENSION1 charset.  This doesn't use any locking shift
5126         and single shift functions.  Assigned the coding-system (Lisp
5127         symbol) `iso-latin-1' by default.
5128
5129    o coding-category-iso-8-2
5130
5131         The category for a coding system which has the same code range
5132         as ISO2022 of 8-bit environment and graphic plane 1 used only
5133         for DIMENSION2 charset.  This doesn't use any locking shift
5134         and single shift functions.  Assigned the coding-system (Lisp
5135         symbol) `japanese-iso-8bit' by default.
5136
5137    o coding-category-iso-7-else
5138
5139         The category for a coding system which has the same code range
5140         as ISO2022 of 7-bit environemnt but uses locking shift or
5141         single shift functions.  Assigned the coding-system (Lisp
5142         symbol) `iso-2022-7bit-lock' by default.
5143
5144    o coding-category-iso-8-else
5145
5146         The category for a coding system which has the same code range
5147         as ISO2022 of 8-bit environemnt but uses locking shift or
5148         single shift functions.  Assigned the coding-system (Lisp
5149         symbol) `iso-2022-8bit-ss2' by default.
5150
5151    o coding-category-big5
5152
5153         The category for a coding system which has the same code range
5154         as BIG5.  Assigned the coding-system (Lisp symbol)
5155         `cn-big5' by default.
5156
5157    o coding-category-utf-8
5158
5159         The category for a coding system which has the same code range
5160         as UTF-8 (cf. RFC2279).  Assigned the coding-system (Lisp
5161         symbol) `utf-8' by default.
5162
5163    o coding-category-utf-16-be
5164
5165         The category for a coding system in which a text has an
5166         Unicode signature (cf. Unicode Standard) in the order of BIG
5167         endian at the head.  Assigned the coding-system (Lisp symbol)
5168         `utf-16-be' by default.
5169
5170    o coding-category-utf-16-le
5171
5172         The category for a coding system in which a text has an
5173         Unicode signature (cf. Unicode Standard) in the order of
5174         LITTLE endian at the head.  Assigned the coding-system (Lisp
5175         symbol) `utf-16-le' by default.
5176
5177    o coding-category-ccl
5178
5179         The category for a coding system of which encoder/decoder is
5180         written in CCL programs.  The default value is nil, i.e., no
5181         coding system is assigned.
5182
5183    o coding-category-binary
5184
5185         The category for a coding system not categorized in any of the
5186         above.  Assigned the coding-system (Lisp symbol)
5187         `no-conversion' by default.
5188
5189    Each of them is a Lisp symbol and the value is an actual
5190    `coding-system's (this is also a Lisp symbol) assigned by a user.
5191    What Emacs does actually is to detect a category of coding system.
5192    Then, it uses a `coding-system' assigned to it.  If Emacs can't
5193    decide only one possible category, it selects a category of the
5194    highest priority.  Priorities of categories are also specified by a
5195    user in a Lisp variable `coding-category-list'.
5196
5197 */
5198
5199 #define EOL_SEEN_NONE   0
5200 #define EOL_SEEN_LF     1
5201 #define EOL_SEEN_CR     2
5202 #define EOL_SEEN_CRLF   4
5203
5204 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
5205    SOURCE is encoded.  If CATEGORY is one of
5206    coding_category_utf_16_XXXX, assume that CR and LF are encoded by
5207    two-byte, else they are encoded by one-byte.
5208
5209    Return one of EOL_SEEN_XXX.  */
5210
5211 #define MAX_EOL_CHECK_COUNT 3
5212
5213 static int
5214 detect_eol (source, src_bytes, category)
5215      unsigned char *source;
5216      EMACS_INT src_bytes;
5217      enum coding_category category;
5218 {
5219   unsigned char *src = source, *src_end = src + src_bytes;
5220   unsigned char c;
5221   int total  = 0;
5222   int eol_seen = EOL_SEEN_NONE;
5223
5224   if ((1 << category) & CATEGORY_MASK_UTF_16)
5225     {
5226       int msb, lsb;
5227
5228       msb = category == (coding_category_utf_16_le
5229                          | coding_category_utf_16_le_nosig);
5230       lsb = 1 - msb;
5231
5232       while (src + 1 < src_end)
5233         {
5234           c = src[lsb];
5235           if (src[msb] == 0 && (c == '\n' || c == '\r'))
5236             {
5237               int this_eol;
5238
5239               if (c == '\n')
5240                 this_eol = EOL_SEEN_LF;
5241               else if (src + 3 >= src_end
5242                        || src[msb + 2] != 0
5243                        || src[lsb + 2] != '\n')
5244                 this_eol = EOL_SEEN_CR;
5245               else
5246                 this_eol = EOL_SEEN_CRLF;
5247
5248               if (eol_seen == EOL_SEEN_NONE)
5249                 /* This is the first end-of-line.  */
5250                 eol_seen = this_eol;
5251               else if (eol_seen != this_eol)
5252                 {
5253                   /* The found type is different from what found before.  */
5254                   eol_seen = EOL_SEEN_LF;
5255                   break;
5256                 }
5257               if (++total == MAX_EOL_CHECK_COUNT)
5258                 break;
5259             }
5260           src += 2;
5261         }
5262     }
5263   else
5264     {
5265       while (src < src_end)
5266         {
5267           c = *src++;
5268           if (c == '\n' || c == '\r')
5269             {
5270               int this_eol;
5271
5272               if (c == '\n')
5273                 this_eol = EOL_SEEN_LF;
5274               else if (src >= src_end || *src != '\n')
5275                 this_eol = EOL_SEEN_CR;
5276               else
5277                 this_eol = EOL_SEEN_CRLF, src++;
5278
5279               if (eol_seen == EOL_SEEN_NONE)
5280                 /* This is the first end-of-line.  */
5281                 eol_seen = this_eol;
5282               else if (eol_seen != this_eol)
5283                 {
5284                   /* The found type is different from what found before.  */
5285                   eol_seen = EOL_SEEN_LF;
5286                   break;
5287                 }
5288               if (++total == MAX_EOL_CHECK_COUNT)
5289                 break;
5290             }
5291         }
5292     }
5293   return eol_seen;
5294 }
5295
5296
5297 static Lisp_Object
5298 adjust_coding_eol_type (coding, eol_seen)
5299      struct coding_system *coding;
5300      int eol_seen;
5301 {
5302   Lisp_Object eol_type;
5303
5304   eol_type = CODING_ID_EOL_TYPE (coding->id);
5305   if (eol_seen & EOL_SEEN_LF)
5306     {
5307       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
5308       eol_type = Qunix;
5309     }
5310   else if (eol_seen & EOL_SEEN_CRLF)
5311     {
5312       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
5313       eol_type = Qdos;
5314     }
5315   else if (eol_seen & EOL_SEEN_CR)
5316     {
5317       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
5318       eol_type = Qmac;
5319     }
5320   return eol_type;
5321 }
5322
5323 /* Detect how a text specified in CODING is encoded.  If a coding
5324    system is detected, update fields of CODING by the detected coding
5325    system.  */
5326
5327 void
5328 detect_coding (coding)
5329      struct coding_system *coding;
5330 {
5331   const unsigned char *src, *src_end;
5332   Lisp_Object attrs, coding_type;
5333
5334   coding->consumed = coding->consumed_char = 0;
5335   coding->produced = coding->produced_char = 0;
5336   coding_set_source (coding);
5337
5338   src_end = coding->source + coding->src_bytes;
5339
5340   /* If we have not yet decided the text encoding type, detect it
5341      now.  */
5342   if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
5343     {
5344       int c, i;
5345
5346       for (i = 0, src = coding->source; src < src_end; i++, src++)
5347         {
5348           c = *src;
5349           if (c & 0x80 || (c < 0x20 && (c == ISO_CODE_ESC
5350                                         || c == ISO_CODE_SI
5351                                         || c == ISO_CODE_SO)))
5352             break;
5353         }
5354       coding->head_ascii = src - (coding->source + coding->consumed);
5355
5356       if (coding->head_ascii < coding->src_bytes)
5357         {
5358           struct coding_detection_info detect_info;
5359           enum coding_category category;
5360           struct coding_system *this;
5361
5362           detect_info.checked = detect_info.found = detect_info.rejected = 0;
5363           for (i = 0; i < coding_category_raw_text; i++)
5364             {
5365               category = coding_priorities[i];
5366               this = coding_categories + category;
5367               if (this->id < 0)
5368                 {
5369                   /* No coding system of this category is defined.  */
5370                   detect_info.rejected |= (1 << category);
5371                 }
5372               else if (category >= coding_category_raw_text)
5373                 continue;
5374               else if (detect_info.checked & (1 << category))
5375                 {
5376                   if (detect_info.found & (1 << category))
5377                     break;
5378                 }
5379               else if ((*(this->detector)) (coding, &detect_info)
5380                        && detect_info.found & (1 << category))
5381                 {
5382                   if (category == coding_category_utf_16_auto)
5383                     {
5384                       if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
5385                         category = coding_category_utf_16_le;
5386                       else
5387                         category = coding_category_utf_16_be;
5388                     }
5389                   break;
5390                 }
5391             }
5392           if (i < coding_category_raw_text)
5393             setup_coding_system (CODING_ID_NAME (this->id), coding);
5394           else if (detect_info.rejected == CATEGORY_MASK_ANY)
5395             setup_coding_system (Qraw_text, coding);
5396           else if (detect_info.rejected)
5397             for (i = 0; i < coding_category_raw_text; i++)
5398               if (! (detect_info.rejected & (1 << coding_priorities[i])))
5399                 {
5400                   this = coding_categories + coding_priorities[i];
5401                   setup_coding_system (CODING_ID_NAME (this->id), coding);
5402                   break;
5403                 }
5404         }
5405     }
5406   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
5407            == coding_category_utf_16_auto)
5408     {
5409       Lisp_Object coding_systems;
5410       struct coding_detection_info detect_info;
5411
5412       coding_systems
5413         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_16_bom);
5414       detect_info.found = detect_info.rejected = 0;
5415       if (CONSP (coding_systems)
5416           && detect_coding_utf_16 (coding, &detect_info))
5417         {
5418           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
5419             setup_coding_system (XCAR (coding_systems), coding);
5420           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
5421             setup_coding_system (XCDR (coding_systems), coding);
5422         }
5423     }
5424 }
5425
5426
5427 static void
5428 decode_eol (coding)
5429      struct coding_system *coding;
5430 {
5431   Lisp_Object eol_type;
5432   unsigned char *p, *pbeg, *pend;
5433
5434   eol_type = CODING_ID_EOL_TYPE (coding->id);
5435   if (EQ (eol_type, Qunix))
5436     return;
5437
5438   if (NILP (coding->dst_object))
5439     pbeg = coding->destination;
5440   else
5441     pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
5442   pend = pbeg + coding->produced;
5443
5444   if (VECTORP (eol_type))
5445     {
5446       int eol_seen = EOL_SEEN_NONE;
5447
5448       for (p = pbeg; p < pend; p++)
5449         {
5450           if (*p == '\n')
5451             eol_seen |= EOL_SEEN_LF;
5452           else if (*p == '\r')
5453             {
5454               if (p + 1 < pend && *(p + 1) == '\n')
5455                 {
5456                   eol_seen |= EOL_SEEN_CRLF;
5457                   p++;
5458                 }
5459               else
5460                 eol_seen |= EOL_SEEN_CR;
5461             }
5462         }
5463       if (eol_seen != EOL_SEEN_NONE
5464           && eol_seen != EOL_SEEN_LF
5465           && eol_seen != EOL_SEEN_CRLF
5466           && eol_seen != EOL_SEEN_CR)
5467         eol_seen = EOL_SEEN_LF;
5468       if (eol_seen != EOL_SEEN_NONE)
5469         eol_type = adjust_coding_eol_type (coding, eol_seen);
5470     }
5471
5472   if (EQ (eol_type, Qmac))
5473     {
5474       for (p = pbeg; p < pend; p++)
5475         if (*p == '\r')
5476           *p = '\n';
5477     }
5478   else if (EQ (eol_type, Qdos))
5479     {
5480       int n = 0;
5481
5482       if (NILP (coding->dst_object))
5483         {
5484           for (p = pend - 2; p >= pbeg; p--)
5485             if (*p == '\r')
5486               {
5487                 safe_bcopy ((char *) (p + 1), (char *) p, pend-- - p - 1);
5488                 n++;
5489               }
5490         }
5491       else
5492         {
5493           for (p = pend - 2; p >= pbeg; p--)
5494             if (*p == '\r')
5495               {
5496                 int pos_byte = coding->dst_pos_byte + (p - pbeg);
5497                 int pos = BYTE_TO_CHAR (pos_byte);
5498
5499                 del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
5500                 n++;
5501               }
5502         }
5503       coding->produced -= n;
5504       coding->produced_char -= n;
5505     }
5506 }
5507
5508
5509 /* Return a translation table (or list of them) from coding system
5510    attribute vector ATTRS for encoding (ENCODEP is nonzero) or
5511    decoding (ENCODEP is zero). */
5512
5513 static INLINE
5514 get_translation_table (attrs, encodep)
5515 {
5516   Lisp_Object standard, translation_table;
5517
5518   if (encodep)
5519     translation_table = CODING_ATTR_ENCODE_TBL (attrs),
5520       standard = Vstandard_translation_table_for_encode;
5521   else
5522     translation_table = CODING_ATTR_DECODE_TBL (attrs),
5523       standard = Vstandard_translation_table_for_decode;
5524   if (NILP (translation_table))
5525     return standard;
5526   if (SYMBOLP (translation_table))
5527     translation_table = Fget (translation_table, Qtranslation_table);
5528   else if (CONSP (translation_table))
5529     {
5530       Lisp_Object val;
5531
5532       translation_table = Fcopy_sequence (translation_table);
5533       for (val = translation_table; CONSP (val); val = XCDR (val))
5534         if (SYMBOLP (XCAR (val)))
5535           XSETCAR (val, Fget (XCAR (val), Qtranslation_table));
5536     }
5537   if (! NILP (standard))
5538     {
5539       if (CONSP (translation_table))
5540         translation_table = nconc2 (translation_table, Fcons (standard, Qnil));
5541       else
5542         translation_table = Fcons (translation_table, Fcons (standard, Qnil));
5543     }
5544   return translation_table;
5545 }
5546
5547
5548 static void
5549 translate_chars (coding, table)
5550      struct coding_system *coding;
5551      Lisp_Object table;
5552 {
5553   int *charbuf = coding->charbuf;
5554   int *charbuf_end = charbuf + coding->charbuf_used;
5555   int c;
5556
5557   if (coding->chars_at_source)
5558     return;
5559
5560   while (charbuf < charbuf_end)
5561     {
5562       c = *charbuf;
5563       if (c < 0)
5564         charbuf += -c;
5565       else
5566         *charbuf++ = translate_char (table, c);
5567     }
5568 }
5569
5570 static int
5571 produce_chars (coding)
5572      struct coding_system *coding;
5573 {
5574   unsigned char *dst = coding->destination + coding->produced;
5575   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5576   int produced;
5577   int produced_chars = 0;
5578
5579   if (! coding->chars_at_source)
5580     {
5581       /* Characters are in coding->charbuf.  */
5582       int *buf = coding->charbuf;
5583       int *buf_end = buf + coding->charbuf_used;
5584       unsigned char *adjusted_dst_end;
5585
5586       if (BUFFERP (coding->src_object)
5587           && EQ (coding->src_object, coding->dst_object))
5588         dst_end = ((unsigned char *) coding->source) + coding->consumed;
5589       adjusted_dst_end = dst_end - MAX_MULTIBYTE_LENGTH;
5590
5591       while (buf < buf_end)
5592         {
5593           int c = *buf++;
5594
5595           if (dst >= adjusted_dst_end)
5596             {
5597               dst = alloc_destination (coding,
5598                                        buf_end - buf + MAX_MULTIBYTE_LENGTH,
5599                                        dst);
5600               dst_end = coding->destination + coding->dst_bytes;
5601               adjusted_dst_end = dst_end - MAX_MULTIBYTE_LENGTH;
5602             }
5603           if (c >= 0)
5604             {
5605               if (coding->dst_multibyte
5606                   || ! CHAR_BYTE8_P (c))
5607                 CHAR_STRING_ADVANCE (c, dst);
5608               else
5609                 *dst++ = CHAR_TO_BYTE8 (c);
5610               produced_chars++;
5611             }
5612           else
5613             /* This is an annotation datum.  (-C) is the length of
5614                it.  */
5615             buf += -c - 1;
5616         }
5617     }
5618   else
5619     {
5620       const unsigned char *src = coding->source;
5621       const unsigned char *src_end = src + coding->src_bytes;
5622       Lisp_Object eol_type;
5623
5624       eol_type = CODING_ID_EOL_TYPE (coding->id);
5625
5626       if (coding->src_multibyte != coding->dst_multibyte)
5627         {
5628           if (coding->src_multibyte)
5629             {
5630               int multibytep = 1;
5631               int consumed_chars;
5632
5633               while (1)
5634                 {
5635                   const unsigned char *src_base = src;
5636                   int c;
5637
5638                   ONE_MORE_BYTE (c);
5639                   if (c == '\r')
5640                     {
5641                       if (EQ (eol_type, Qdos))
5642                         {
5643                           if (src == src_end)
5644                             {
5645                               record_conversion_result
5646                                 (coding, CODING_RESULT_INSUFFICIENT_SRC);
5647                               goto no_more_source;
5648                             }
5649                           if (*src == '\n')
5650                             c = *src++;
5651                         }
5652                       else if (EQ (eol_type, Qmac))
5653                         c = '\n';
5654                     }
5655                   if (dst == dst_end)
5656                     {
5657                       coding->consumed = src - coding->source;
5658
5659                     if (EQ (coding->src_object, coding->dst_object))
5660                       dst_end = (unsigned char *) src;
5661                     if (dst == dst_end)
5662                       {
5663                         dst = alloc_destination (coding, src_end - src + 1,
5664                                                  dst);
5665                         dst_end = coding->destination + coding->dst_bytes;
5666                         coding_set_source (coding);
5667                         src = coding->source + coding->consumed;
5668                         src_end = coding->source + coding->src_bytes;
5669                       }
5670                     }
5671                   *dst++ = c;
5672                   produced_chars++;
5673                 }
5674             no_more_source:
5675               ;
5676             }
5677           else
5678             while (src < src_end)
5679               {
5680                 int multibytep = 1;
5681                 int c = *src++;
5682
5683                 if (c == '\r')
5684                   {
5685                     if (EQ (eol_type, Qdos))
5686                       {
5687                         if (src < src_end
5688                             && *src == '\n')
5689                           c = *src++;
5690                       }
5691                     else if (EQ (eol_type, Qmac))
5692                       c = '\n';
5693                   }
5694                 if (dst >= dst_end - 1)
5695                   {
5696                     coding->consumed = src - coding->source;
5697
5698                     if (EQ (coding->src_object, coding->dst_object))
5699                       dst_end = (unsigned char *) src;
5700                     if (dst >= dst_end - 1)
5701                       {
5702                         dst = alloc_destination (coding, src_end - src + 2,
5703                                                  dst);
5704                         dst_end = coding->destination + coding->dst_bytes;
5705                         coding_set_source (coding);
5706                         src = coding->source + coding->consumed;
5707                         src_end = coding->source + coding->src_bytes;
5708                       }
5709                   }
5710                 EMIT_ONE_BYTE (c);
5711               }
5712         }
5713       else
5714         {
5715           if (!EQ (coding->src_object, coding->dst_object))
5716             {
5717               int require = coding->src_bytes - coding->dst_bytes;
5718
5719               if (require > 0)
5720                 {
5721                   EMACS_INT offset = src - coding->source;
5722
5723                   dst = alloc_destination (coding, require, dst);
5724                   coding_set_source (coding);
5725                   src = coding->source + offset;
5726                   src_end = coding->source + coding->src_bytes;
5727                 }
5728             }
5729           produced_chars = coding->src_chars;
5730           while (src < src_end)
5731             {
5732               int c = *src++;
5733
5734               if (c == '\r')
5735                 {
5736                   if (EQ (eol_type, Qdos))
5737                     {
5738                       if (src < src_end
5739                           && *src == '\n')
5740                         c = *src++;
5741                       produced_chars--;
5742                     }
5743                   else if (EQ (eol_type, Qmac))
5744                     c = '\n';
5745                 }
5746               *dst++ = c;
5747             }
5748         }
5749       coding->consumed = coding->src_bytes;
5750       coding->consumed_char = coding->src_chars;
5751     }
5752
5753   produced = dst - (coding->destination + coding->produced);
5754   if (BUFFERP (coding->dst_object))
5755     insert_from_gap (produced_chars, produced);
5756   coding->produced += produced;
5757   coding->produced_char += produced_chars;
5758   return produced_chars;
5759 }
5760
5761 /* Compose text in CODING->object according to the annotation data at
5762    CHARBUF.  CHARBUF is an array:
5763      [ -LENGTH ANNOTATION_MASK FROM TO METHOD COMP_LEN [ COMPONENTS... ] ]
5764  */
5765
5766 static INLINE void
5767 produce_composition (coding, charbuf)
5768      struct coding_system *coding;
5769      int *charbuf;
5770 {
5771   int len;
5772   EMACS_INT from, to;
5773   enum composition_method method;
5774   Lisp_Object components;
5775
5776   len = -charbuf[0];
5777   from = coding->dst_pos + charbuf[2];
5778   to = coding->dst_pos + charbuf[3];
5779   method = (enum composition_method) (charbuf[4]);
5780
5781   if (method == COMPOSITION_RELATIVE)
5782     components = Qnil;
5783   else
5784     {
5785       Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
5786       int i;
5787
5788       len -= 5;
5789       charbuf += 5;
5790       for (i = 0; i < len; i++)
5791         args[i] = make_number (charbuf[i]);
5792       components = (method == COMPOSITION_WITH_ALTCHARS
5793                     ? Fstring (len, args) : Fvector (len, args));
5794     }
5795   compose_text (from, to, components, Qnil, coding->dst_object);
5796 }
5797
5798
5799 /* Put `charset' property on text in CODING->object according to
5800    the annotation data at CHARBUF.  CHARBUF is an array:
5801      [ -LENGTH ANNOTATION_MASK FROM TO CHARSET-ID ]
5802  */
5803
5804 static INLINE void
5805 produce_charset (coding, charbuf)
5806      struct coding_system *coding;
5807      int *charbuf;
5808 {
5809   EMACS_INT from = coding->dst_pos + charbuf[2];
5810   EMACS_INT to = coding->dst_pos + charbuf[3];
5811   struct charset *charset = CHARSET_FROM_ID (charbuf[4]);
5812
5813   Fput_text_property (make_number (from), make_number (to),
5814                       Qcharset, CHARSET_NAME (charset),
5815                       coding->dst_object);
5816 }
5817
5818
5819 #define CHARBUF_SIZE 0x4000
5820
5821 #define ALLOC_CONVERSION_WORK_AREA(coding)                              \
5822   do {                                                                  \
5823     int size = CHARBUF_SIZE;;                                           \
5824                                                                         \
5825     coding->charbuf = NULL;                                             \
5826     while (size > 1024)                                                 \
5827       {                                                                 \
5828         coding->charbuf = (int *) alloca (sizeof (int) * size);         \
5829         if (coding->charbuf)                                            \
5830           break;                                                        \
5831         size >>= 1;                                                     \
5832       }                                                                 \
5833     if (! coding->charbuf)                                              \
5834       {                                                                 \
5835         record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_MEM); \
5836         return coding->result;                                          \
5837       }                                                                 \
5838     coding->charbuf_size = size;                                        \
5839   } while (0)
5840
5841
5842 static void
5843 produce_annotation (coding)
5844      struct coding_system *coding;
5845 {
5846   int *charbuf = coding->charbuf;
5847   int *charbuf_end = charbuf + coding->charbuf_used;
5848
5849   if (NILP (coding->dst_object))
5850     return;
5851
5852   while (charbuf < charbuf_end)
5853     {
5854       if (*charbuf >= 0)
5855         charbuf++;
5856       else
5857         {
5858           int len = -*charbuf;
5859           switch (charbuf[1])
5860             {
5861             case CODING_ANNOTATE_COMPOSITION_MASK:
5862               produce_composition (coding, charbuf);
5863               break;
5864             case CODING_ANNOTATE_CHARSET_MASK:
5865               produce_charset (coding, charbuf);
5866               break;
5867             default:
5868               abort ();
5869             }
5870           charbuf += len;
5871         }
5872     }
5873 }
5874
5875 /* Decode the data at CODING->src_object into CODING->dst_object.
5876    CODING->src_object is a buffer, a string, or nil.
5877    CODING->dst_object is a buffer.
5878
5879    If CODING->src_object is a buffer, it must be the current buffer.
5880    In this case, if CODING->src_pos is positive, it is a position of
5881    the source text in the buffer, otherwise, the source text is in the
5882    gap area of the buffer, and CODING->src_pos specifies the offset of
5883    the text from GPT (which must be the same as PT).  If this is the
5884    same buffer as CODING->dst_object, CODING->src_pos must be
5885    negative.
5886
5887    If CODING->src_object is a string, CODING->src_pos in an index to
5888    that string.
5889
5890    If CODING->src_object is nil, CODING->source must already point to
5891    the non-relocatable memory area.  In this case, CODING->src_pos is
5892    an offset from CODING->source.
5893
5894    The decoded data is inserted at the current point of the buffer
5895    CODING->dst_object.
5896 */
5897
5898 static int
5899 decode_coding (coding)
5900      struct coding_system *coding;
5901 {
5902   Lisp_Object attrs;
5903   Lisp_Object undo_list;
5904   Lisp_Object translation_table;
5905
5906   if (BUFFERP (coding->src_object)
5907       && coding->src_pos > 0
5908       && coding->src_pos < GPT
5909       && coding->src_pos + coding->src_chars > GPT)
5910     move_gap_both (coding->src_pos, coding->src_pos_byte);
5911
5912   undo_list = Qt;
5913   if (BUFFERP (coding->dst_object))
5914     {
5915       if (current_buffer != XBUFFER (coding->dst_object))
5916         set_buffer_internal (XBUFFER (coding->dst_object));
5917       if (GPT != PT)
5918         move_gap_both (PT, PT_BYTE);
5919       undo_list = current_buffer->undo_list;
5920       current_buffer->undo_list = Qt;
5921     }
5922
5923   coding->consumed = coding->consumed_char = 0;
5924   coding->produced = coding->produced_char = 0;
5925   coding->chars_at_source = 0;
5926   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5927   coding->errors = 0;
5928
5929   ALLOC_CONVERSION_WORK_AREA (coding);
5930
5931   attrs = CODING_ID_ATTRS (coding->id);
5932   translation_table = get_translation_table (attrs, 0);
5933
5934   do
5935     {
5936       coding_set_source (coding);
5937       coding->annotated = 0;
5938       (*(coding->decoder)) (coding);
5939       if (!NILP (translation_table))
5940         translate_chars (coding, translation_table);
5941       coding_set_destination (coding);
5942       produce_chars (coding);
5943       if (coding->annotated)
5944         produce_annotation (coding);
5945     }
5946   while (coding->consumed < coding->src_bytes
5947          && ! coding->result);
5948
5949   coding->carryover_bytes = 0;
5950   if (coding->consumed < coding->src_bytes)
5951     {
5952       int nbytes = coding->src_bytes - coding->consumed;
5953       const unsigned char *src;
5954
5955       coding_set_source (coding);
5956       coding_set_destination (coding);
5957       src = coding->source + coding->consumed;
5958
5959       if (coding->mode & CODING_MODE_LAST_BLOCK)
5960         {
5961           /* Flush out unprocessed data as binary chars.  We are sure
5962              that the number of data is less than the size of
5963              coding->charbuf.  */
5964           coding->charbuf_used = 0;
5965           while (nbytes-- > 0)
5966             {
5967               int c = *src++;
5968
5969               coding->charbuf[coding->charbuf_used++] = (c & 0x80 ? - c : c);
5970             }
5971           produce_chars (coding);
5972         }
5973       else
5974         {
5975           /* Record unprocessed bytes in coding->carryover.  We are
5976              sure that the number of data is less than the size of
5977              coding->carryover.  */
5978           unsigned char *p = coding->carryover;
5979
5980           coding->carryover_bytes = nbytes;
5981           while (nbytes-- > 0)
5982             *p++ = *src++;
5983         }
5984       coding->consumed = coding->src_bytes;
5985     }
5986
5987   if (BUFFERP (coding->dst_object))
5988     {
5989       current_buffer->undo_list = undo_list;
5990       record_insert (coding->dst_pos, coding->produced_char);
5991     }
5992   if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix))
5993     decode_eol (coding);
5994   return coding->result;
5995 }
5996
5997
5998 /* Extract an annotation datum from a composition starting at POS and
5999    ending before LIMIT of CODING->src_object (buffer or string), store
6000    the data in BUF, set *STOP to a starting position of the next
6001    composition (if any) or to LIMIT, and return the address of the
6002    next element of BUF.
6003
6004    If such an annotation is not found, set *STOP to a starting
6005    position of a composition after POS (if any) or to LIMIT, and
6006    return BUF.  */
6007
6008 static INLINE int *
6009 handle_composition_annotation (pos, limit, coding, buf, stop)
6010      EMACS_INT pos, limit;
6011      struct coding_system *coding;
6012      int *buf;
6013      EMACS_INT *stop;
6014 {
6015   EMACS_INT start, end;
6016   Lisp_Object prop;
6017
6018   if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
6019       || end > limit)
6020     *stop = limit;
6021   else if (start > pos)
6022     *stop = start;
6023   else
6024     {
6025       if (start == pos)
6026         {
6027           /* We found a composition.  Store the corresponding
6028              annotation data in BUF.  */
6029           int *head = buf;
6030           enum composition_method method = COMPOSITION_METHOD (prop);
6031           int nchars = COMPOSITION_LENGTH (prop);
6032
6033           ADD_COMPOSITION_DATA (buf, 0, nchars, method);
6034           if (method != COMPOSITION_RELATIVE)
6035             {
6036               Lisp_Object components;
6037               int len, i, i_byte;
6038
6039               components = COMPOSITION_COMPONENTS (prop);
6040               if (VECTORP (components))
6041                 {
6042                   len = XVECTOR (components)->size;
6043                   for (i = 0; i < len; i++)
6044                     *buf++ = XINT (AREF (components, i));
6045                 }
6046               else if (STRINGP (components))
6047                 {
6048                   len = SCHARS (components);
6049                   i = i_byte = 0;
6050                   while (i < len)
6051                     {
6052                       FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
6053                       buf++;
6054                     }
6055                 }
6056               else if (INTEGERP (components))
6057                 {
6058                   len = 1;
6059                   *buf++ = XINT (components);
6060                 }
6061               else if (CONSP (components))
6062                 {
6063                   for (len = 0; CONSP (components);
6064                        len++, components = XCDR (components))
6065                     *buf++ = XINT (XCAR (components));
6066                 }
6067               else
6068                 abort ();
6069               *head -= len;
6070             }
6071         }
6072
6073       if (find_composition (end, limit, &start, &end, &prop,
6074                             coding->src_object)
6075           && end <= limit)
6076         *stop = start;
6077       else
6078         *stop = limit;
6079     }
6080   return buf;
6081 }
6082
6083
6084 /* Extract an annotation datum from a text property `charset' at POS of
6085    CODING->src_object (buffer of string), store the data in BUF, set
6086    *STOP to the position where the value of `charset' property changes
6087    (limiting by LIMIT), and return the address of the next element of
6088    BUF.
6089
6090    If the property value is nil, set *STOP to the position where the
6091    property value is non-nil (limiting by LIMIT), and return BUF.  */
6092
6093 static INLINE int *
6094 handle_charset_annotation (pos, limit, coding, buf, stop)
6095      EMACS_INT pos, limit;
6096      struct coding_system *coding;
6097      int *buf;
6098      EMACS_INT *stop;
6099 {
6100   Lisp_Object val, next;
6101   int id;
6102
6103   val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
6104   if (! NILP (val) && CHARSETP (val))
6105     id = XINT (CHARSET_SYMBOL_ID (val));
6106   else
6107     id = -1;
6108   ADD_CHARSET_DATA (buf, 0, 0, id);
6109   next = Fnext_single_property_change (make_number (pos), Qcharset,
6110                                        coding->src_object,
6111                                        make_number (limit));
6112   *stop = XINT (next);
6113   return buf;
6114 }
6115
6116
6117 static void
6118 consume_chars (coding)
6119      struct coding_system *coding;
6120 {
6121   int *buf = coding->charbuf;
6122   int *buf_end = coding->charbuf + coding->charbuf_size;
6123   const unsigned char *src = coding->source + coding->consumed;
6124   const unsigned char *src_end = coding->source + coding->src_bytes;
6125   EMACS_INT pos = coding->src_pos + coding->consumed_char;
6126   EMACS_INT end_pos = coding->src_pos + coding->src_chars;
6127   int multibytep = coding->src_multibyte;
6128   Lisp_Object eol_type;
6129   int c;
6130   EMACS_INT stop, stop_composition, stop_charset;
6131
6132   eol_type = CODING_ID_EOL_TYPE (coding->id);
6133   if (VECTORP (eol_type))
6134     eol_type = Qunix;
6135
6136   /* Note: composition handling is not yet implemented.  */
6137   coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
6138
6139   if (NILP (coding->src_object))
6140     stop = stop_composition = stop_charset = end_pos;
6141   else
6142     {
6143       if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
6144         stop = stop_composition = pos;
6145       else
6146         stop = stop_composition = end_pos;
6147       if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
6148         stop = stop_charset = pos;
6149       else
6150         stop_charset = end_pos;
6151     }
6152
6153   /* Compensate for CRLF and conversion.  */
6154   buf_end -= 1 + MAX_ANNOTATION_LENGTH;
6155   while (buf < buf_end)
6156     {
6157       if (pos == stop)
6158         {
6159           if (pos == end_pos)
6160             break;
6161           if (pos == stop_composition)
6162             buf = handle_composition_annotation (pos, end_pos, coding,
6163                                                  buf, &stop_composition);
6164           if (pos == stop_charset)
6165             buf = handle_charset_annotation (pos, end_pos, coding,
6166                                              buf, &stop_charset);
6167           stop = (stop_composition < stop_charset
6168                   ? stop_composition : stop_charset);
6169         }
6170
6171       if (! multibytep)
6172         {
6173           EMACS_INT bytes;
6174
6175           if (! CODING_FOR_UNIBYTE (coding)
6176               && (bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
6177             c = STRING_CHAR_ADVANCE (src), pos += bytes;
6178           else
6179             c = *src++, pos++;
6180         }
6181       else
6182         c = STRING_CHAR_ADVANCE (src), pos++;
6183       if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
6184         c = '\n';
6185       if (! EQ (eol_type, Qunix))
6186         {
6187           if (c == '\n')
6188             {
6189               if (EQ (eol_type, Qdos))
6190                 *buf++ = '\r';
6191               else
6192                 c = '\r';
6193             }
6194         }
6195       *buf++ = c;
6196     }
6197
6198   coding->consumed = src - coding->source;
6199   coding->consumed_char = pos - coding->src_pos;
6200   coding->charbuf_used = buf - coding->charbuf;
6201   coding->chars_at_source = 0;
6202 }
6203
6204
6205 /* Encode the text at CODING->src_object into CODING->dst_object.
6206    CODING->src_object is a buffer or a string.
6207    CODING->dst_object is a buffer or nil.
6208
6209    If CODING->src_object is a buffer, it must be the current buffer.
6210    In this case, if CODING->src_pos is positive, it is a position of
6211    the source text in the buffer, otherwise. the source text is in the
6212    gap area of the buffer, and coding->src_pos specifies the offset of
6213    the text from GPT (which must be the same as PT).  If this is the
6214    same buffer as CODING->dst_object, CODING->src_pos must be
6215    negative and CODING should not have `pre-write-conversion'.
6216
6217    If CODING->src_object is a string, CODING should not have
6218    `pre-write-conversion'.
6219
6220    If CODING->dst_object is a buffer, the encoded data is inserted at
6221    the current point of that buffer.
6222
6223    If CODING->dst_object is nil, the encoded data is placed at the
6224    memory area specified by CODING->destination.  */
6225
6226 static int
6227 encode_coding (coding)
6228      struct coding_system *coding;
6229 {
6230   Lisp_Object attrs;
6231   Lisp_Object translation_table;
6232
6233   attrs = CODING_ID_ATTRS (coding->id);
6234   translation_table = get_translation_table (attrs, 1);
6235
6236   if (BUFFERP (coding->dst_object))
6237     {
6238       set_buffer_internal (XBUFFER (coding->dst_object));
6239       coding->dst_multibyte
6240         = ! NILP (current_buffer->enable_multibyte_characters);
6241     }
6242
6243   coding->consumed = coding->consumed_char = 0;
6244   coding->produced = coding->produced_char = 0;
6245   record_conversion_result (coding, CODING_RESULT_SUCCESS);
6246   coding->errors = 0;
6247
6248   ALLOC_CONVERSION_WORK_AREA (coding);
6249
6250   do {
6251     coding_set_source (coding);
6252     consume_chars (coding);
6253
6254     if (!NILP (translation_table))
6255       translate_chars (coding, translation_table);
6256
6257     coding_set_destination (coding);
6258     (*(coding->encoder)) (coding);
6259   } while (coding->consumed_char < coding->src_chars);
6260
6261   if (BUFFERP (coding->dst_object))
6262     insert_from_gap (coding->produced_char, coding->produced);
6263
6264   return (coding->result);
6265 }
6266
6267
6268 /* Name (or base name) of work buffer for code conversion.  */
6269 static Lisp_Object Vcode_conversion_workbuf_name;
6270
6271 /* A working buffer used by the top level conversion.  Once it is
6272    created, it is never destroyed.  It has the name
6273    Vcode_conversion_workbuf_name.  The other working buffers are
6274    destroyed after the use is finished, and their names are modified
6275    versions of Vcode_conversion_workbuf_name.  */
6276 static Lisp_Object Vcode_conversion_reused_workbuf;
6277
6278 /* 1 iff Vcode_conversion_reused_workbuf is already in use.  */
6279 static int reused_workbuf_in_use;
6280
6281
6282 /* Return a working buffer of code convesion.  MULTIBYTE specifies the
6283    multibyteness of returning buffer.  */
6284
6285 Lisp_Object
6286 make_conversion_work_buffer (multibyte)
6287 {
6288   Lisp_Object name, workbuf;
6289   struct buffer *current;
6290
6291   if (reused_workbuf_in_use++)
6292     {
6293       name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
6294       workbuf = Fget_buffer_create (name);
6295     }
6296   else
6297     {
6298       name = Vcode_conversion_workbuf_name;
6299       workbuf = Fget_buffer_create (name);
6300       if (NILP (Vcode_conversion_reused_workbuf))
6301         Vcode_conversion_reused_workbuf = workbuf;
6302     }
6303   current = current_buffer;
6304   set_buffer_internal (XBUFFER (workbuf));
6305   Ferase_buffer ();
6306   current_buffer->undo_list = Qt;
6307   current_buffer->enable_multibyte_characters = multibyte ? Qt : Qnil;
6308   set_buffer_internal (current);
6309   return workbuf;
6310 }
6311
6312
6313 static Lisp_Object
6314 code_conversion_restore (arg)
6315      Lisp_Object arg;
6316 {
6317   Lisp_Object current, workbuf;
6318
6319   current = XCAR (arg);
6320   workbuf = XCDR (arg);
6321   if (! NILP (workbuf))
6322     {
6323       if (EQ (workbuf, Vcode_conversion_reused_workbuf))
6324         reused_workbuf_in_use = 0;
6325       else if (! NILP (Fbuffer_live_p (workbuf)))
6326         Fkill_buffer (workbuf);
6327     }
6328   set_buffer_internal (XBUFFER (current));
6329   return Qnil;
6330 }
6331
6332 Lisp_Object
6333 code_conversion_save (with_work_buf, multibyte)
6334      int with_work_buf, multibyte;
6335 {
6336   Lisp_Object workbuf = Qnil;
6337
6338   if (with_work_buf)
6339     workbuf = make_conversion_work_buffer (multibyte);
6340   record_unwind_protect (code_conversion_restore,
6341                          Fcons (Fcurrent_buffer (), workbuf));
6342   return workbuf;
6343 }
6344
6345 int
6346 decode_coding_gap (coding, chars, bytes)
6347      struct coding_system *coding;
6348      EMACS_INT chars, bytes;
6349 {
6350   int count = specpdl_ptr - specpdl;
6351   Lisp_Object attrs;
6352
6353   code_conversion_save (0, 0);
6354
6355   coding->src_object = Fcurrent_buffer ();
6356   coding->src_chars = chars;
6357   coding->src_bytes = bytes;
6358   coding->src_pos = -chars;
6359   coding->src_pos_byte = -bytes;
6360   coding->src_multibyte = chars < bytes;
6361   coding->dst_object = coding->src_object;
6362   coding->dst_pos = PT;
6363   coding->dst_pos_byte = PT_BYTE;
6364   coding->dst_multibyte = ! NILP (current_buffer->enable_multibyte_characters);
6365   coding->mode |= CODING_MODE_LAST_BLOCK;
6366
6367   if (CODING_REQUIRE_DETECTION (coding))
6368     detect_coding (coding);
6369
6370   decode_coding (coding);
6371
6372   attrs = CODING_ID_ATTRS (coding->id);
6373   if (! NILP (CODING_ATTR_POST_READ (attrs)))
6374     {
6375       EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
6376       Lisp_Object val;
6377
6378       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
6379       val = call1 (CODING_ATTR_POST_READ (attrs),
6380                    make_number (coding->produced_char));
6381       CHECK_NATNUM (val);
6382       coding->produced_char += Z - prev_Z;
6383       coding->produced += Z_BYTE - prev_Z_BYTE;
6384     }
6385
6386   unbind_to (count, Qnil);
6387   return coding->result;
6388 }
6389
6390 int
6391 encode_coding_gap (coding, chars, bytes)
6392      struct coding_system *coding;
6393      EMACS_INT chars, bytes;
6394 {
6395   int count = specpdl_ptr - specpdl;
6396
6397   code_conversion_save (0, 0);
6398
6399   coding->src_object = Fcurrent_buffer ();
6400   coding->src_chars = chars;
6401   coding->src_bytes = bytes;
6402   coding->src_pos = -chars;
6403   coding->src_pos_byte = -bytes;
6404   coding->src_multibyte = chars < bytes;
6405   coding->dst_object = coding->src_object;
6406   coding->dst_pos = PT;
6407   coding->dst_pos_byte = PT_BYTE;
6408
6409   encode_coding (coding);
6410
6411   unbind_to (count, Qnil);
6412   return coding->result;
6413 }
6414
6415
6416 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
6417    SRC_OBJECT into DST_OBJECT by coding context CODING.
6418
6419    SRC_OBJECT is a buffer, a string, or Qnil.
6420
6421    If it is a buffer, the text is at point of the buffer.  FROM and TO
6422    are positions in the buffer.
6423
6424    If it is a string, the text is at the beginning of the string.
6425    FROM and TO are indices to the string.
6426
6427    If it is nil, the text is at coding->source.  FROM and TO are
6428    indices to coding->source.
6429
6430    DST_OBJECT is a buffer, Qt, or Qnil.
6431
6432    If it is a buffer, the decoded text is inserted at point of the
6433    buffer.  If the buffer is the same as SRC_OBJECT, the source text
6434    is deleted.
6435
6436    If it is Qt, a string is made from the decoded text, and
6437    set in CODING->dst_object.
6438
6439    If it is Qnil, the decoded text is stored at CODING->destination.
6440    The caller must allocate CODING->dst_bytes bytes at
6441    CODING->destination by xmalloc.  If the decoded text is longer than
6442    CODING->dst_bytes, CODING->destination is relocated by xrealloc.
6443  */
6444
6445 void
6446 decode_coding_object (coding, src_object, from, from_byte, to, to_byte,
6447                       dst_object)
6448      struct coding_system *coding;
6449      Lisp_Object src_object;
6450      EMACS_INT from, from_byte, to, to_byte;
6451      Lisp_Object dst_object;
6452 {
6453   int count = specpdl_ptr - specpdl;
6454   unsigned char *destination;
6455   EMACS_INT dst_bytes;
6456   EMACS_INT chars = to - from;
6457   EMACS_INT bytes = to_byte - from_byte;
6458   Lisp_Object attrs;
6459   Lisp_Object buffer;
6460   int saved_pt = -1, saved_pt_byte;
6461
6462   buffer = Fcurrent_buffer ();
6463
6464   if (NILP (dst_object))
6465     {
6466       destination = coding->destination;
6467       dst_bytes = coding->dst_bytes;
6468     }
6469
6470   coding->src_object = src_object;
6471   coding->src_chars = chars;
6472   coding->src_bytes = bytes;
6473   coding->src_multibyte = chars < bytes;
6474
6475   if (STRINGP (src_object))
6476     {
6477       coding->src_pos = from;
6478       coding->src_pos_byte = from_byte;
6479     }
6480   else if (BUFFERP (src_object))
6481     {
6482       set_buffer_internal (XBUFFER (src_object));
6483       if (from != GPT)
6484         move_gap_both (from, from_byte);
6485       if (EQ (src_object, dst_object))
6486         {
6487           saved_pt = PT, saved_pt_byte = PT_BYTE;
6488           TEMP_SET_PT_BOTH (from, from_byte);
6489           del_range_both (from, from_byte, to, to_byte, 1);
6490           coding->src_pos = -chars;
6491           coding->src_pos_byte = -bytes;
6492         }
6493       else
6494         {
6495           coding->src_pos = from;
6496           coding->src_pos_byte = from_byte;
6497         }
6498     }
6499
6500   if (CODING_REQUIRE_DETECTION (coding))
6501     detect_coding (coding);
6502   attrs = CODING_ID_ATTRS (coding->id);
6503
6504   if (EQ (dst_object, Qt)
6505       || (! NILP (CODING_ATTR_POST_READ (attrs))
6506           && NILP (dst_object)))
6507     {
6508       coding->dst_object = code_conversion_save (1, 1);
6509       coding->dst_pos = BEG;
6510       coding->dst_pos_byte = BEG_BYTE;
6511       coding->dst_multibyte = 1;
6512     }
6513   else if (BUFFERP (dst_object))
6514     {
6515       code_conversion_save (0, 0);
6516       coding->dst_object = dst_object;
6517       coding->dst_pos = BUF_PT (XBUFFER (dst_object));
6518       coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
6519       coding->dst_multibyte
6520         = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
6521     }
6522   else
6523     {
6524       code_conversion_save (0, 0);
6525       coding->dst_object = Qnil;
6526       coding->dst_multibyte = 1;
6527     }
6528
6529   decode_coding (coding);
6530
6531   if (BUFFERP (coding->dst_object))
6532     set_buffer_internal (XBUFFER (coding->dst_object));
6533
6534   if (! NILP (CODING_ATTR_POST_READ (attrs)))
6535     {
6536       struct gcpro gcpro1, gcpro2;
6537       EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
6538       Lisp_Object val;
6539
6540       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
6541       GCPRO2 (coding->src_object, coding->dst_object);
6542       val = call1 (CODING_ATTR_POST_READ (attrs),
6543                    make_number (coding->produced_char));
6544       UNGCPRO;
6545       CHECK_NATNUM (val);
6546       coding->produced_char += Z - prev_Z;
6547       coding->produced += Z_BYTE - prev_Z_BYTE;
6548     }
6549
6550   if (EQ (dst_object, Qt))
6551     {
6552       coding->dst_object = Fbuffer_string ();
6553     }
6554   else if (NILP (dst_object) && BUFFERP (coding->dst_object))
6555     {
6556       set_buffer_internal (XBUFFER (coding->dst_object));
6557       if (dst_bytes < coding->produced)
6558         {
6559           destination
6560             = (unsigned char *) xrealloc (destination, coding->produced);
6561           if (! destination)
6562             {
6563               record_conversion_result (coding,
6564                                         CODING_RESULT_INSUFFICIENT_DST);
6565               unbind_to (count, Qnil);
6566               return;
6567             }
6568           if (BEGV < GPT && GPT < BEGV + coding->produced_char)
6569             move_gap_both (BEGV, BEGV_BYTE);
6570           bcopy (BEGV_ADDR, destination, coding->produced);
6571           coding->destination = destination;
6572         }
6573     }
6574
6575   if (saved_pt >= 0)
6576     {
6577       /* This is the case of:
6578          (BUFFERP (src_object) && EQ (src_object, dst_object))
6579          As we have moved PT while replacing the original buffer
6580          contents, we must recover it now.  */
6581       set_buffer_internal (XBUFFER (src_object));
6582       if (saved_pt < from)
6583         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
6584       else if (saved_pt < from + chars)
6585         TEMP_SET_PT_BOTH (from, from_byte);
6586       else if (! NILP (current_buffer->enable_multibyte_characters))
6587         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
6588                           saved_pt_byte + (coding->produced - bytes));
6589       else
6590         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
6591                           saved_pt_byte + (coding->produced - bytes));
6592     }
6593
6594   unbind_to (count, coding->dst_object);
6595 }
6596
6597
6598 void
6599 encode_coding_object (coding, src_object, from, from_byte, to, to_byte,
6600                       dst_object)
6601      struct coding_system *coding;
6602      Lisp_Object src_object;
6603      EMACS_INT from, from_byte, to, to_byte;
6604      Lisp_Object dst_object;
6605 {
6606   int count = specpdl_ptr - specpdl;
6607   EMACS_INT chars = to - from;
6608   EMACS_INT bytes = to_byte - from_byte;
6609   Lisp_Object attrs;
6610   Lisp_Object buffer;
6611   int saved_pt = -1, saved_pt_byte;
6612
6613   buffer = Fcurrent_buffer ();
6614
6615   coding->src_object = src_object;
6616   coding->src_chars = chars;
6617   coding->src_bytes = bytes;
6618   coding->src_multibyte = chars < bytes;
6619
6620   attrs = CODING_ID_ATTRS (coding->id);
6621
6622   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
6623     {
6624       coding->src_object = code_conversion_save (1, coding->src_multibyte);
6625       set_buffer_internal (XBUFFER (coding->src_object));
6626       if (STRINGP (src_object))
6627         insert_from_string (src_object, from, from_byte, chars, bytes, 0);
6628       else if (BUFFERP (src_object))
6629         insert_from_buffer (XBUFFER (src_object), from, chars, 0);
6630       else
6631         insert_1_both (coding->source + from, chars, bytes, 0, 0, 0);
6632
6633       if (EQ (src_object, dst_object))
6634         {
6635           set_buffer_internal (XBUFFER (src_object));
6636           saved_pt = PT, saved_pt_byte = PT_BYTE;
6637           del_range_both (from, from_byte, to, to_byte, 1);
6638           set_buffer_internal (XBUFFER (coding->src_object));
6639         }
6640
6641       call2 (CODING_ATTR_PRE_WRITE (attrs),
6642              make_number (BEG), make_number (Z));
6643       coding->src_object = Fcurrent_buffer ();
6644       if (BEG != GPT)
6645         move_gap_both (BEG, BEG_BYTE);
6646       coding->src_chars = Z - BEG;
6647       coding->src_bytes = Z_BYTE - BEG_BYTE;
6648       coding->src_pos = BEG;
6649       coding->src_pos_byte = BEG_BYTE;
6650       coding->src_multibyte = Z < Z_BYTE;
6651     }
6652   else if (STRINGP (src_object))
6653     {
6654       code_conversion_save (0, 0);
6655       coding->src_pos = from;
6656       coding->src_pos_byte = from_byte;
6657     }
6658   else if (BUFFERP (src_object))
6659     {
6660       code_conversion_save (0, 0);
6661       set_buffer_internal (XBUFFER (src_object));
6662       if (EQ (src_object, dst_object))
6663         {
6664           saved_pt = PT, saved_pt_byte = PT_BYTE;
6665           coding->src_object = del_range_1 (from, to, 1, 1);
6666           coding->src_pos = 0;
6667           coding->src_pos_byte = 0;
6668         }
6669       else
6670         {
6671           if (from < GPT && to >= GPT)
6672             move_gap_both (from, from_byte);
6673           coding->src_pos = from;
6674           coding->src_pos_byte = from_byte;
6675         }
6676     }
6677   else
6678     code_conversion_save (0, 0);
6679
6680   if (BUFFERP (dst_object))
6681     {
6682       coding->dst_object = dst_object;
6683       if (EQ (src_object, dst_object))
6684         {
6685           coding->dst_pos = from;
6686           coding->dst_pos_byte = from_byte;
6687         }
6688       else
6689         {
6690           coding->dst_pos = BUF_PT (XBUFFER (dst_object));
6691           coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
6692         }
6693       coding->dst_multibyte
6694         = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
6695     }
6696   else if (EQ (dst_object, Qt))
6697     {
6698       coding->dst_object = Qnil;
6699       coding->dst_bytes = coding->src_chars;
6700       if (coding->dst_bytes == 0)
6701         coding->dst_bytes = 1;
6702       coding->destination = (unsigned char *) xmalloc (coding->dst_bytes);
6703       coding->dst_multibyte = 0;
6704     }
6705   else
6706     {
6707       coding->dst_object = Qnil;
6708       coding->dst_multibyte = 0;
6709     }
6710
6711   encode_coding (coding);
6712
6713   if (EQ (dst_object, Qt))
6714     {
6715       if (BUFFERP (coding->dst_object))
6716         coding->dst_object = Fbuffer_string ();
6717       else
6718         {
6719           coding->dst_object
6720             = make_unibyte_string ((char *) coding->destination,
6721                                    coding->produced);
6722           xfree (coding->destination);
6723         }
6724     }
6725
6726   if (saved_pt >= 0)
6727     {
6728       /* This is the case of:
6729          (BUFFERP (src_object) && EQ (src_object, dst_object))
6730          As we have moved PT while replacing the original buffer
6731          contents, we must recover it now.  */
6732       set_buffer_internal (XBUFFER (src_object));
6733       if (saved_pt < from)
6734         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
6735       else if (saved_pt < from + chars)
6736         TEMP_SET_PT_BOTH (from, from_byte);
6737       else if (! NILP (current_buffer->enable_multibyte_characters))
6738         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
6739                           saved_pt_byte + (coding->produced - bytes));
6740       else
6741         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
6742                           saved_pt_byte + (coding->produced - bytes));
6743     }
6744
6745   unbind_to (count, Qnil);
6746 }
6747
6748
6749 Lisp_Object
6750 preferred_coding_system ()
6751 {
6752   int id = coding_categories[coding_priorities[0]].id;
6753
6754   return CODING_ID_NAME (id);
6755 }
6756
6757 \f
6758 #ifdef emacs
6759 /*** 8. Emacs Lisp library functions ***/
6760
6761 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
6762        doc: /* Return t if OBJECT is nil or a coding-system.
6763 See the documentation of `define-coding-system' for information
6764 about coding-system objects.  */)
6765      (obj)
6766      Lisp_Object obj;
6767 {
6768   return ((NILP (obj) || CODING_SYSTEM_P (obj)) ? Qt : Qnil);
6769 }
6770
6771 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
6772        Sread_non_nil_coding_system, 1, 1, 0,
6773        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.  */)
6774      (prompt)
6775      Lisp_Object prompt;
6776 {
6777   Lisp_Object val;
6778   do
6779     {
6780       val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6781                               Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
6782     }
6783   while (SCHARS (val) == 0);
6784   return (Fintern (val, Qnil));
6785 }
6786
6787 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
6788        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
6789 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.  */)
6790      (prompt, default_coding_system)
6791      Lisp_Object prompt, default_coding_system;
6792 {
6793   Lisp_Object val;
6794   if (SYMBOLP (default_coding_system))
6795     XSETSTRING (default_coding_system, XPNTR (SYMBOL_NAME (default_coding_system)));
6796   val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6797                           Qt, Qnil, Qcoding_system_history,
6798                           default_coding_system, Qnil);
6799   return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
6800 }
6801
6802 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
6803        1, 1, 0,
6804        doc: /* Check validity of CODING-SYSTEM.
6805 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.  */)
6806   (coding_system)
6807      Lisp_Object coding_system;
6808 {
6809   CHECK_SYMBOL (coding_system);
6810   if (!NILP (Fcoding_system_p (coding_system)))
6811     return coding_system;
6812   while (1)
6813     Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
6814 }
6815
6816 \f
6817 /* Detect how the bytes at SRC of length SRC_BYTES are encoded.  If
6818    HIGHEST is nonzero, return the coding system of the highest
6819    priority among the detected coding systems.  Otherwize return a
6820    list of detected coding systems sorted by their priorities.  If
6821    MULTIBYTEP is nonzero, it is assumed that the bytes are in correct
6822    multibyte form but contains only ASCII and eight-bit chars.
6823    Otherwise, the bytes are raw bytes.
6824
6825    CODING-SYSTEM controls the detection as below:
6826
6827    If it is nil, detect both text-format and eol-format.  If the
6828    text-format part of CODING-SYSTEM is already specified
6829    (e.g. `iso-latin-1'), detect only eol-format.  If the eol-format
6830    part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
6831    detect only text-format.  */
6832
6833 Lisp_Object
6834 detect_coding_system (src, src_chars, src_bytes, highest, multibytep,
6835                       coding_system)
6836      const unsigned char *src;
6837      int src_chars, src_bytes, highest;
6838      int multibytep;
6839      Lisp_Object coding_system;
6840 {
6841   const unsigned char *src_end = src + src_bytes;
6842   Lisp_Object attrs, eol_type;
6843   Lisp_Object val;
6844   struct coding_system coding;
6845   int id;
6846   struct coding_detection_info detect_info;
6847   enum coding_category base_category;
6848
6849   if (NILP (coding_system))
6850     coding_system = Qundecided;
6851   setup_coding_system (coding_system, &coding);
6852   attrs = CODING_ID_ATTRS (coding.id);
6853   eol_type = CODING_ID_EOL_TYPE (coding.id);
6854   coding_system = CODING_ATTR_BASE_NAME (attrs);
6855
6856   coding.source = src;
6857   coding.src_chars = src_chars;
6858   coding.src_bytes = src_bytes;
6859   coding.src_multibyte = multibytep;
6860   coding.consumed = 0;
6861   coding.mode |= CODING_MODE_LAST_BLOCK;
6862
6863   detect_info.checked = detect_info.found = detect_info.rejected = 0;
6864
6865   /* At first, detect text-format if necessary.  */
6866   base_category = XINT (CODING_ATTR_CATEGORY (attrs));
6867   if (base_category == coding_category_undecided)
6868     {
6869       enum coding_category category;
6870       struct coding_system *this;
6871       int c, i;
6872
6873       /* Skip all ASCII bytes except for a few ISO2022 controls.  */
6874       for (i = 0; src < src_end; i++, src++)
6875         {
6876           c = *src;
6877           if (c & 0x80 || (c < 0x20 && (c == ISO_CODE_ESC
6878                                         || c == ISO_CODE_SI
6879                                         || c == ISO_CODE_SO)))
6880             break;
6881         }
6882       coding.head_ascii = src - coding.source;
6883
6884       if (src < src_end)
6885         for (i = 0; i < coding_category_raw_text; i++)
6886           {
6887             category = coding_priorities[i];
6888             this = coding_categories + category;
6889
6890             if (this->id < 0)
6891               {
6892                 /* No coding system of this category is defined.  */
6893                 detect_info.rejected |= (1 << category);
6894               }
6895             else if (category >= coding_category_raw_text)
6896               continue;
6897             else if (detect_info.checked & (1 << category))
6898               {
6899                 if (highest
6900                     && (detect_info.found & (1 << category)))
6901                   break;
6902               }
6903             else
6904               {
6905                 if ((*(this->detector)) (&coding, &detect_info)
6906                     && highest
6907                     && (detect_info.found & (1 << category)))
6908                   {
6909                     if (category == coding_category_utf_16_auto)
6910                       {
6911                         if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6912                           category = coding_category_utf_16_le;
6913                         else
6914                           category = coding_category_utf_16_be;
6915                       }
6916                     break;
6917                   }
6918               }
6919           }
6920
6921       if (detect_info.rejected == CATEGORY_MASK_ANY)
6922         {
6923           detect_info.found = CATEGORY_MASK_RAW_TEXT;
6924           id = coding_categories[coding_category_raw_text].id;
6925           val = Fcons (make_number (id), Qnil);
6926         }
6927       else if (! detect_info.rejected && ! detect_info.found)
6928         {
6929           detect_info.found = CATEGORY_MASK_ANY;
6930           id = coding_categories[coding_category_undecided].id;
6931           val = Fcons (make_number (id), Qnil);
6932         }
6933       else if (highest)
6934         {
6935           if (detect_info.found)
6936             {
6937               detect_info.found = 1 << category;
6938               val = Fcons (make_number (this->id), Qnil);
6939             }
6940           else
6941             for (i = 0; i < coding_category_raw_text; i++)
6942               if (! (detect_info.rejected & (1 << coding_priorities[i])))
6943                 {
6944                   detect_info.found = 1 << coding_priorities[i];
6945                   id = coding_categories[coding_priorities[i]].id;
6946                   val = Fcons (make_number (id), Qnil);
6947                   break;
6948                 }
6949         }
6950       else
6951         {
6952           int mask = detect_info.rejected | detect_info.found;
6953           int found = 0;
6954           val = Qnil;
6955
6956           for (i = coding_category_raw_text - 1; i >= 0; i--)
6957             {
6958               category = coding_priorities[i];
6959               if (! (mask & (1 << category)))
6960                 {
6961                   found |= 1 << category;
6962                   id = coding_categories[category].id;
6963                   val = Fcons (make_number (id), val);
6964                 }
6965             }
6966           for (i = coding_category_raw_text - 1; i >= 0; i--)
6967             {
6968               category = coding_priorities[i];
6969               if (detect_info.found & (1 << category))
6970                 {
6971                   id = coding_categories[category].id;
6972                   val = Fcons (make_number (id), val);
6973                 }
6974             }
6975           detect_info.found |= found;
6976         }
6977     }
6978   else if (base_category == coding_category_utf_16_auto)
6979     {
6980       if (detect_coding_utf_16 (&coding, &detect_info))
6981         {
6982           enum coding_category category;
6983           struct coding_system *this;
6984
6985           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6986             this = coding_categories + coding_category_utf_16_le;
6987           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6988             this = coding_categories + coding_category_utf_16_be;
6989           else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
6990             this = coding_categories + coding_category_utf_16_be_nosig;
6991           else
6992             this = coding_categories + coding_category_utf_16_le_nosig;
6993           val = Fcons (make_number (this->id), Qnil);
6994         }
6995     }
6996   else
6997     {
6998       detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
6999       val = Fcons (make_number (coding.id), Qnil);
7000     }
7001
7002   /* Then, detect eol-format if necessary.  */
7003   {
7004     int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol;
7005     Lisp_Object tail;
7006
7007     if (VECTORP (eol_type))
7008       {
7009         if (detect_info.found & ~CATEGORY_MASK_UTF_16)
7010           normal_eol = detect_eol (coding.source, src_bytes,
7011                                    coding_category_raw_text);
7012         if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
7013                                  | CATEGORY_MASK_UTF_16_BE_NOSIG))
7014           utf_16_be_eol = detect_eol (coding.source, src_bytes,
7015                                       coding_category_utf_16_be);
7016         if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
7017                                  | CATEGORY_MASK_UTF_16_LE_NOSIG))
7018           utf_16_le_eol = detect_eol (coding.source, src_bytes,
7019                                       coding_category_utf_16_le);
7020       }
7021     else
7022       {
7023         if (EQ (eol_type, Qunix))
7024           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
7025         else if (EQ (eol_type, Qdos))
7026           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
7027         else
7028           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
7029       }
7030
7031     for (tail = val; CONSP (tail); tail = XCDR (tail))
7032       {
7033         enum coding_category category;
7034         int this_eol;
7035
7036         id = XINT (XCAR (tail));
7037         attrs = CODING_ID_ATTRS (id);
7038         category = XINT (CODING_ATTR_CATEGORY (attrs));
7039         eol_type = CODING_ID_EOL_TYPE (id);
7040         if (VECTORP (eol_type))
7041           {
7042             if (category == coding_category_utf_16_be
7043                 || category == coding_category_utf_16_be_nosig)
7044               this_eol = utf_16_be_eol;
7045             else if (category == coding_category_utf_16_le
7046                      || category == coding_category_utf_16_le_nosig)
7047               this_eol = utf_16_le_eol;
7048             else
7049               this_eol = normal_eol;
7050
7051             if (this_eol == EOL_SEEN_LF)
7052               XSETCAR (tail, AREF (eol_type, 0));
7053             else if (this_eol == EOL_SEEN_CRLF)
7054               XSETCAR (tail, AREF (eol_type, 1));
7055             else if (this_eol == EOL_SEEN_CR)
7056               XSETCAR (tail, AREF (eol_type, 2));
7057             else
7058               XSETCAR (tail, CODING_ID_NAME (id));
7059           }
7060         else
7061           XSETCAR (tail, CODING_ID_NAME (id));
7062       }
7063   }
7064
7065   return (highest ? XCAR (val) : val);
7066 }
7067
7068
7069 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
7070        2, 3, 0,
7071        doc: /* Detect coding system of the text in the region between START and END.
7072 Return a list of possible coding systems ordered by priority.
7073
7074 If only ASCII characters are found, it returns a list of single element
7075 `undecided' or its subsidiary coding system according to a detected
7076 end-of-line format.
7077
7078 If optional argument HIGHEST is non-nil, return the coding system of
7079 highest priority.  */)
7080      (start, end, highest)
7081      Lisp_Object start, end, highest;
7082 {
7083   int from, to;
7084   int from_byte, to_byte;
7085
7086   CHECK_NUMBER_COERCE_MARKER (start);
7087   CHECK_NUMBER_COERCE_MARKER (end);
7088
7089   validate_region (&start, &end);
7090   from = XINT (start), to = XINT (end);
7091   from_byte = CHAR_TO_BYTE (from);
7092   to_byte = CHAR_TO_BYTE (to);
7093
7094   if (from < GPT && to >= GPT)
7095     move_gap_both (to, to_byte);
7096
7097   return detect_coding_system (BYTE_POS_ADDR (from_byte),
7098                                to - from, to_byte - from_byte,
7099                                !NILP (highest),
7100                                !NILP (current_buffer
7101                                       ->enable_multibyte_characters),
7102                                Qnil);
7103 }
7104
7105 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
7106        1, 2, 0,
7107        doc: /* Detect coding system of the text in STRING.
7108 Return a list of possible coding systems ordered by priority.
7109
7110 If only ASCII characters are found, it returns a list of single element
7111 `undecided' or its subsidiary coding system according to a detected
7112 end-of-line format.
7113
7114 If optional argument HIGHEST is non-nil, return the coding system of
7115 highest priority.  */)
7116      (string, highest)
7117      Lisp_Object string, highest;
7118 {
7119   CHECK_STRING (string);
7120
7121   return detect_coding_system (SDATA (string),
7122                                SCHARS (string), SBYTES (string),
7123                                !NILP (highest), STRING_MULTIBYTE (string),
7124                                Qnil);
7125 }
7126
7127
7128 static INLINE int
7129 char_encodable_p (c, attrs)
7130      int c;
7131      Lisp_Object attrs;
7132 {
7133   Lisp_Object tail;
7134   struct charset *charset;
7135   Lisp_Object translation_table;
7136
7137   translation_table = CODING_ATTR_TRANS_TBL (attrs);
7138   if (! NILP (translation_table))
7139     c = translate_char (translation_table, c);
7140   for (tail = CODING_ATTR_CHARSET_LIST (attrs);
7141        CONSP (tail); tail = XCDR (tail))
7142     {
7143       charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
7144       if (CHAR_CHARSET_P (c, charset))
7145         break;
7146     }
7147   return (! NILP (tail));
7148 }
7149
7150
7151 /* Return a list of coding systems that safely encode the text between
7152    START and END.  If EXCLUDE is non-nil, it is a list of coding
7153    systems not to check.  The returned list doesn't contain any such
7154    coding systems.  In any case, if the text contains only ASCII or is
7155    unibyte, return t.  */
7156
7157 DEFUN ("find-coding-systems-region-internal",
7158        Ffind_coding_systems_region_internal,
7159        Sfind_coding_systems_region_internal, 2, 3, 0,
7160        doc: /* Internal use only.  */)
7161      (start, end, exclude)
7162      Lisp_Object start, end, exclude;
7163 {
7164   Lisp_Object coding_attrs_list, safe_codings;
7165   EMACS_INT start_byte, end_byte;
7166   const unsigned char *p, *pbeg, *pend;
7167   int c;
7168   Lisp_Object tail, elt;
7169
7170   if (STRINGP (start))
7171     {
7172       if (!STRING_MULTIBYTE (start)
7173           || SCHARS (start) == SBYTES (start))
7174         return Qt;
7175       start_byte = 0;
7176       end_byte = SBYTES (start);
7177     }
7178   else
7179     {
7180       CHECK_NUMBER_COERCE_MARKER (start);
7181       CHECK_NUMBER_COERCE_MARKER (end);
7182       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
7183         args_out_of_range (start, end);
7184       if (NILP (current_buffer->enable_multibyte_characters))
7185         return Qt;
7186       start_byte = CHAR_TO_BYTE (XINT (start));
7187       end_byte = CHAR_TO_BYTE (XINT (end));
7188       if (XINT (end) - XINT (start) == end_byte - start_byte)
7189         return Qt;
7190
7191       if (XINT (start) < GPT && XINT (end) > GPT)
7192         {
7193           if ((GPT - XINT (start)) < (XINT (end) - GPT))
7194             move_gap_both (XINT (start), start_byte);
7195           else
7196             move_gap_both (XINT (end), end_byte);
7197         }
7198     }
7199
7200   coding_attrs_list = Qnil;
7201   for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
7202     if (NILP (exclude)
7203         || NILP (Fmemq (XCAR (tail), exclude)))
7204       {
7205         Lisp_Object attrs;
7206
7207         attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
7208         if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs))
7209             && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
7210           {
7211             ASET (attrs, coding_attr_trans_tbl,
7212                   get_translation_table (attrs, 1));
7213             coding_attrs_list = Fcons (attrs, coding_attrs_list);
7214           }
7215       }
7216
7217   if (STRINGP (start))
7218     p = pbeg = SDATA (start);
7219   else
7220     p = pbeg = BYTE_POS_ADDR (start_byte);
7221   pend = p + (end_byte - start_byte);
7222
7223   while (p < pend && ASCII_BYTE_P (*p)) p++;
7224   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
7225
7226   while (p < pend)
7227     {
7228       if (ASCII_BYTE_P (*p))
7229         p++;
7230       else
7231         {
7232           c = STRING_CHAR_ADVANCE (p);
7233
7234           charset_map_loaded = 0;
7235           for (tail = coding_attrs_list; CONSP (tail);)
7236             {
7237               elt = XCAR (tail);
7238               if (NILP (elt))
7239                 tail = XCDR (tail);
7240               else if (char_encodable_p (c, elt))
7241                 tail = XCDR (tail);
7242               else if (CONSP (XCDR (tail)))
7243                 {
7244                   XSETCAR (tail, XCAR (XCDR (tail)));
7245                   XSETCDR (tail, XCDR (XCDR (tail)));
7246                 }
7247               else
7248                 {
7249                   XSETCAR (tail, Qnil);
7250                   tail = XCDR (tail);
7251                 }
7252             }
7253           if (charset_map_loaded)
7254             {
7255               EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
7256
7257               if (STRINGP (start))
7258                 pbeg = SDATA (start);
7259               else
7260                 pbeg = BYTE_POS_ADDR (start_byte);
7261               p = pbeg + p_offset;
7262               pend = pbeg + pend_offset;
7263             }
7264         }
7265     }
7266
7267   safe_codings = Qnil;
7268   for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
7269     if (! NILP (XCAR (tail)))
7270       safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
7271
7272   return safe_codings;
7273 }
7274
7275
7276 DEFUN ("unencodable-char-position", Funencodable_char_position,
7277        Sunencodable_char_position, 3, 5, 0,
7278        doc: /*
7279 Return position of first un-encodable character in a region.
7280 START and END specfiy the region and CODING-SYSTEM specifies the
7281 encoding to check.  Return nil if CODING-SYSTEM does encode the region.
7282
7283 If optional 4th argument COUNT is non-nil, it specifies at most how
7284 many un-encodable characters to search.  In this case, the value is a
7285 list of positions.
7286
7287 If optional 5th argument STRING is non-nil, it is a string to search
7288 for un-encodable characters.  In that case, START and END are indexes
7289 to the string.  */)
7290      (start, end, coding_system, count, string)
7291      Lisp_Object start, end, coding_system, count, string;
7292 {
7293   int n;
7294   struct coding_system coding;
7295   Lisp_Object attrs, charset_list, translation_table;
7296   Lisp_Object positions;
7297   int from, to;
7298   const unsigned char *p, *stop, *pend;
7299   int ascii_compatible;
7300
7301   setup_coding_system (Fcheck_coding_system (coding_system), &coding);
7302   attrs = CODING_ID_ATTRS (coding.id);
7303   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
7304     return Qnil;
7305   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
7306   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
7307   translation_table = get_translation_table (attrs, 1);
7308
7309   if (NILP (string))
7310     {
7311       validate_region (&start, &end);
7312       from = XINT (start);
7313       to = XINT (end);
7314       if (NILP (current_buffer->enable_multibyte_characters)
7315           || (ascii_compatible
7316               && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
7317         return Qnil;
7318       p = CHAR_POS_ADDR (from);
7319       pend = CHAR_POS_ADDR (to);
7320       if (from < GPT && to >= GPT)
7321         stop = GPT_ADDR;
7322       else
7323         stop = pend;
7324     }
7325   else
7326     {
7327       CHECK_STRING (string);
7328       CHECK_NATNUM (start);
7329       CHECK_NATNUM (end);
7330       from = XINT (start);
7331       to = XINT (end);
7332       if (from > to
7333           || to > SCHARS (string))
7334         args_out_of_range_3 (string, start, end);
7335       if (! STRING_MULTIBYTE (string))
7336         return Qnil;
7337       p = SDATA (string) + string_char_to_byte (string, from);
7338       stop = pend = SDATA (string) + string_char_to_byte (string, to);
7339       if (ascii_compatible && (to - from) == (pend - p))
7340         return Qnil;
7341     }
7342
7343   if (NILP (count))
7344     n = 1;
7345   else
7346     {
7347       CHECK_NATNUM (count);
7348       n = XINT (count);
7349     }
7350
7351   positions = Qnil;
7352   while (1)
7353     {
7354       int c;
7355
7356       if (ascii_compatible)
7357         while (p < stop && ASCII_BYTE_P (*p))
7358           p++, from++;
7359       if (p >= stop)
7360         {
7361           if (p >= pend)
7362             break;
7363           stop = pend;
7364           p = GAP_END_ADDR;
7365         }
7366
7367       c = STRING_CHAR_ADVANCE (p);
7368       if (! (ASCII_CHAR_P (c) && ascii_compatible)
7369           && ! char_charset (translate_char (translation_table, c),
7370                              charset_list, NULL))
7371         {
7372           positions = Fcons (make_number (from), positions);
7373           n--;
7374           if (n == 0)
7375             break;
7376         }
7377
7378       from++;
7379     }
7380
7381   return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
7382 }
7383
7384
7385 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
7386        Scheck_coding_systems_region, 3, 3, 0,
7387        doc: /* Check if the region is encodable by coding systems.
7388
7389 START and END are buffer positions specifying the region.
7390 CODING-SYSTEM-LIST is a list of coding systems to check.
7391
7392 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
7393 CODING-SYSTEM is a member of CODING-SYSTEM-LIst and can't encode the
7394 whole region, POS0, POS1, ... are buffer positions where non-encodable
7395 characters are found.
7396
7397 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
7398 value is nil.
7399
7400 START may be a string.  In that case, check if the string is
7401 encodable, and the value contains indices to the string instead of
7402 buffer positions.  END is ignored.  */)
7403      (start, end, coding_system_list)
7404      Lisp_Object start, end, coding_system_list;
7405 {
7406   Lisp_Object list;
7407   EMACS_INT start_byte, end_byte;
7408   int pos;
7409   const unsigned char *p, *pbeg, *pend;
7410   int c;
7411   Lisp_Object tail, elt, attrs;
7412
7413   if (STRINGP (start))
7414     {
7415       if (!STRING_MULTIBYTE (start)
7416           && SCHARS (start) != SBYTES (start))
7417         return Qnil;
7418       start_byte = 0;
7419       end_byte = SBYTES (start);
7420       pos = 0;
7421     }
7422   else
7423     {
7424       CHECK_NUMBER_COERCE_MARKER (start);
7425       CHECK_NUMBER_COERCE_MARKER (end);
7426       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
7427         args_out_of_range (start, end);
7428       if (NILP (current_buffer->enable_multibyte_characters))
7429         return Qnil;
7430       start_byte = CHAR_TO_BYTE (XINT (start));
7431       end_byte = CHAR_TO_BYTE (XINT (end));
7432       if (XINT (end) - XINT (start) == end_byte - start_byte)
7433         return Qt;
7434
7435       if (XINT (start) < GPT && XINT (end) > GPT)
7436         {
7437           if ((GPT - XINT (start)) < (XINT (end) - GPT))
7438             move_gap_both (XINT (start), start_byte);
7439           else
7440             move_gap_both (XINT (end), end_byte);
7441         }
7442       pos = XINT (start);
7443     }
7444
7445   list = Qnil;
7446   for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
7447     {
7448       elt = XCAR (tail);
7449       attrs = AREF (CODING_SYSTEM_SPEC (elt), 0);
7450       ASET (attrs, coding_attr_trans_tbl, get_translation_table (attrs, 1));
7451       list = Fcons (Fcons (elt, Fcons (attrs, Qnil)), list);
7452     }
7453
7454   if (STRINGP (start))
7455     p = pbeg = SDATA (start);
7456   else
7457     p = pbeg = BYTE_POS_ADDR (start_byte);
7458   pend = p + (end_byte - start_byte);
7459
7460   while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
7461   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
7462
7463   while (p < pend)
7464     {
7465       if (ASCII_BYTE_P (*p))
7466         p++;
7467       else
7468         {
7469           c = STRING_CHAR_ADVANCE (p);
7470
7471           charset_map_loaded = 0;
7472           for (tail = list; CONSP (tail); tail = XCDR (tail))
7473             {
7474               elt = XCDR (XCAR (tail));
7475               if (! char_encodable_p (c, XCAR (elt)))
7476                 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
7477             }
7478           if (charset_map_loaded)
7479             {
7480               EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
7481
7482               if (STRINGP (start))
7483                 pbeg = SDATA (start);
7484               else
7485                 pbeg = BYTE_POS_ADDR (start_byte);
7486               p = pbeg + p_offset;
7487               pend = pbeg + pend_offset;
7488             }
7489         }
7490       pos++;
7491     }
7492
7493   tail = list;
7494   list = Qnil;
7495   for (; CONSP (tail); tail = XCDR (tail))
7496     {
7497       elt = XCAR (tail);
7498       if (CONSP (XCDR (XCDR (elt))))
7499         list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
7500                       list);
7501     }
7502
7503   return list;
7504 }
7505
7506
7507 Lisp_Object
7508 code_convert_region (start, end, coding_system, dst_object, encodep, norecord)
7509      Lisp_Object start, end, coding_system, dst_object;
7510      int encodep, norecord;
7511 {
7512   struct coding_system coding;
7513   EMACS_INT from, from_byte, to, to_byte;
7514   Lisp_Object src_object;
7515
7516   CHECK_NUMBER_COERCE_MARKER (start);
7517   CHECK_NUMBER_COERCE_MARKER (end);
7518   if (NILP (coding_system))
7519     coding_system = Qno_conversion;
7520   else
7521     CHECK_CODING_SYSTEM (coding_system);
7522   src_object = Fcurrent_buffer ();
7523   if (NILP (dst_object))
7524     dst_object = src_object;
7525   else if (! EQ (dst_object, Qt))
7526     CHECK_BUFFER (dst_object);
7527
7528   validate_region (&start, &end);
7529   from = XFASTINT (start);
7530   from_byte = CHAR_TO_BYTE (from);
7531   to = XFASTINT (end);
7532   to_byte = CHAR_TO_BYTE (to);
7533
7534   setup_coding_system (coding_system, &coding);
7535   coding.mode |= CODING_MODE_LAST_BLOCK;
7536
7537   if (encodep)
7538     encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
7539                           dst_object);
7540   else
7541     decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
7542                           dst_object);
7543   if (! norecord)
7544     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
7545
7546   return (BUFFERP (dst_object)
7547           ? make_number (coding.produced_char)
7548           : coding.dst_object);
7549 }
7550
7551
7552 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
7553        3, 4, "r\nzCoding system: ",
7554        doc: /* Decode the current region from the specified coding system.
7555 When called from a program, takes four arguments:
7556         START, END, CODING-SYSTEM, and DESTINATION.
7557 START and END are buffer positions.
7558
7559 Optional 4th arguments DESTINATION specifies where the decoded text goes.
7560 If nil, the region between START and END is replace by the decoded text.
7561 If buffer, the decoded text is inserted in the buffer.
7562 If t, the decoded text is returned.
7563
7564 This function sets `last-coding-system-used' to the precise coding system
7565 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7566 not fully specified.)
7567 It returns the length of the decoded text.  */)
7568      (start, end, coding_system, destination)
7569      Lisp_Object start, end, coding_system, destination;
7570 {
7571   return code_convert_region (start, end, coding_system, destination, 0, 0);
7572 }
7573
7574 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
7575        3, 4, "r\nzCoding system: ",
7576        doc: /* Encode the current region by specified coding system.
7577 When called from a program, takes three arguments:
7578 START, END, and CODING-SYSTEM.  START and END are buffer positions.
7579
7580 Optional 4th arguments DESTINATION specifies where the encoded text goes.
7581 If nil, the region between START and END is replace by the encoded text.
7582 If buffer, the encoded text is inserted in the buffer.
7583 If t, the encoded text is returned.
7584
7585 This function sets `last-coding-system-used' to the precise coding system
7586 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7587 not fully specified.)
7588 It returns the length of the encoded text.  */)
7589   (start, end, coding_system, destination)
7590      Lisp_Object start, end, coding_system, destination;
7591 {
7592   return code_convert_region (start, end, coding_system, destination, 1, 0);
7593 }
7594
7595 Lisp_Object
7596 code_convert_string (string, coding_system, dst_object,
7597                      encodep, nocopy, norecord)
7598      Lisp_Object string, coding_system, dst_object;
7599      int encodep, nocopy, norecord;
7600 {
7601   struct coding_system coding;
7602   EMACS_INT chars, bytes;
7603
7604   CHECK_STRING (string);
7605   if (NILP (coding_system))
7606     {
7607       if (! norecord)
7608         Vlast_coding_system_used = Qno_conversion;
7609       if (NILP (dst_object))
7610         return (nocopy ? Fcopy_sequence (string) : string);
7611     }
7612
7613   if (NILP (coding_system))
7614     coding_system = Qno_conversion;
7615   else
7616     CHECK_CODING_SYSTEM (coding_system);
7617   if (NILP (dst_object))
7618     dst_object = Qt;
7619   else if (! EQ (dst_object, Qt))
7620     CHECK_BUFFER (dst_object);
7621
7622   setup_coding_system (coding_system, &coding);
7623   coding.mode |= CODING_MODE_LAST_BLOCK;
7624   chars = SCHARS (string);
7625   bytes = SBYTES (string);
7626   if (encodep)
7627     encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
7628   else
7629     decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
7630   if (! norecord)
7631     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
7632
7633   return (BUFFERP (dst_object)
7634           ? make_number (coding.produced_char)
7635           : coding.dst_object);
7636 }
7637
7638
7639 /* Encode or decode STRING according to CODING_SYSTEM.
7640    Do not set Vlast_coding_system_used.
7641
7642    This function is called only from macros DECODE_FILE and
7643    ENCODE_FILE, thus we ignore character composition.  */
7644
7645 Lisp_Object
7646 code_convert_string_norecord (string, coding_system, encodep)
7647      Lisp_Object string, coding_system;
7648      int encodep;
7649 {
7650   return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
7651 }
7652
7653
7654 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
7655        2, 4, 0,
7656        doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
7657
7658 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
7659 if the decoding operation is trivial.
7660
7661 Optional fourth arg BUFFER non-nil meant that the decoded text is
7662 inserted in BUFFER instead of returned as a string.  In this case,
7663 the return value is BUFFER.
7664
7665 This function sets `last-coding-system-used' to the precise coding system
7666 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7667 not fully specified.  */)
7668   (string, coding_system, nocopy, buffer)
7669      Lisp_Object string, coding_system, nocopy, buffer;
7670 {
7671   return code_convert_string (string, coding_system, buffer,
7672                               0, ! NILP (nocopy), 0);
7673 }
7674
7675 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
7676        2, 4, 0,
7677        doc: /* Encode STRING to CODING-SYSTEM, and return the result.
7678
7679 Optional third arg NOCOPY non-nil means it is OK to return STRING
7680 itself if the encoding operation is trivial.
7681
7682 Optional fourth arg BUFFER non-nil meant that the encoded text is
7683 inserted in BUFFER instead of returned as a string.  In this case,
7684 the return value is BUFFER.
7685
7686 This function sets `last-coding-system-used' to the precise coding system
7687 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7688 not fully specified.)  */)
7689      (string, coding_system, nocopy, buffer)
7690      Lisp_Object string, coding_system, nocopy, buffer;
7691 {
7692   return code_convert_string (string, coding_system, buffer,
7693                               1, ! NILP (nocopy), 1);
7694 }
7695
7696 \f
7697 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
7698        doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
7699 Return the corresponding character.  */)
7700      (code)
7701      Lisp_Object code;
7702 {
7703   Lisp_Object spec, attrs, val;
7704   struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
7705   int c;
7706
7707   CHECK_NATNUM (code);
7708   c = XFASTINT (code);
7709   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
7710   attrs = AREF (spec, 0);
7711
7712   if (ASCII_BYTE_P (c)
7713       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
7714     return code;
7715
7716   val = CODING_ATTR_CHARSET_LIST (attrs);
7717   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
7718   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
7719   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
7720
7721   if (c <= 0x7F)
7722     charset = charset_roman;
7723   else if (c >= 0xA0 && c < 0xDF)
7724     {
7725       charset = charset_kana;
7726       c -= 0x80;
7727     }
7728   else
7729     {
7730       int s1 = c >> 8, s2 = c & 0xFF;
7731
7732       if (s1 < 0x81 || (s1 > 0x9F && s1 < 0xE0) || s1 > 0xEF
7733           || s2 < 0x40 || s2 == 0x7F || s2 > 0xFC)
7734         error ("Invalid code: %d", code);
7735       SJIS_TO_JIS (c);
7736       charset = charset_kanji;
7737     }
7738   c = DECODE_CHAR (charset, c);
7739   if (c < 0)
7740     error ("Invalid code: %d", code);
7741   return make_number (c);
7742 }
7743
7744
7745 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
7746        doc: /* Encode a Japanese character CHAR to shift_jis encoding.
7747 Return the corresponding code in SJIS.  */)
7748      (ch)
7749     Lisp_Object ch;
7750 {
7751   Lisp_Object spec, attrs, charset_list;
7752   int c;
7753   struct charset *charset;
7754   unsigned code;
7755
7756   CHECK_CHARACTER (ch);
7757   c = XFASTINT (ch);
7758   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
7759   attrs = AREF (spec, 0);
7760
7761   if (ASCII_CHAR_P (c)
7762       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
7763     return ch;
7764
7765   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
7766   charset = char_charset (c, charset_list, &code);
7767   if (code == CHARSET_INVALID_CODE (charset))
7768     error ("Can't encode by shift_jis encoding: %d", c);
7769   JIS_TO_SJIS (code);
7770
7771   return make_number (code);
7772 }
7773
7774 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
7775        doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
7776 Return the corresponding character.  */)
7777      (code)
7778      Lisp_Object code;
7779 {
7780   Lisp_Object spec, attrs, val;
7781   struct charset *charset_roman, *charset_big5, *charset;
7782   int c;
7783
7784   CHECK_NATNUM (code);
7785   c = XFASTINT (code);
7786   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
7787   attrs = AREF (spec, 0);
7788
7789   if (ASCII_BYTE_P (c)
7790       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
7791     return code;
7792
7793   val = CODING_ATTR_CHARSET_LIST (attrs);
7794   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
7795   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
7796
7797   if (c <= 0x7F)
7798     charset = charset_roman;
7799   else
7800     {
7801       int b1 = c >> 8, b2 = c & 0x7F;
7802       if (b1 < 0xA1 || b1 > 0xFE
7803           || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
7804         error ("Invalid code: %d", code);
7805       charset = charset_big5;
7806     }
7807   c = DECODE_CHAR (charset, (unsigned )c);
7808   if (c < 0)
7809     error ("Invalid code: %d", code);
7810   return make_number (c);
7811 }
7812
7813 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
7814        doc: /* Encode the Big5 character CHAR to BIG5 coding system.
7815 Return the corresponding character code in Big5.  */)
7816      (ch)
7817      Lisp_Object ch;
7818 {
7819   Lisp_Object spec, attrs, charset_list;
7820   struct charset *charset;
7821   int c;
7822   unsigned code;
7823
7824   CHECK_CHARACTER (ch);
7825   c = XFASTINT (ch);
7826   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
7827   attrs = AREF (spec, 0);
7828   if (ASCII_CHAR_P (c)
7829       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
7830     return ch;
7831
7832   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
7833   charset = char_charset (c, charset_list, &code);
7834   if (code == CHARSET_INVALID_CODE (charset))
7835     error ("Can't encode by Big5 encoding: %d", c);
7836
7837   return make_number (code);
7838 }
7839
7840 \f
7841 DEFUN ("set-terminal-coding-system-internal",
7842        Fset_terminal_coding_system_internal,
7843        Sset_terminal_coding_system_internal, 1, 1, 0,
7844        doc: /* Internal use only.  */)
7845      (coding_system)
7846      Lisp_Object coding_system;
7847 {
7848   CHECK_SYMBOL (coding_system);
7849   setup_coding_system (Fcheck_coding_system (coding_system),
7850                         &terminal_coding);
7851
7852   /* We had better not send unsafe characters to terminal.  */
7853   terminal_coding.mode |= CODING_MODE_SAFE_ENCODING;
7854   /* Characer composition should be disabled.  */
7855   terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7856   terminal_coding.src_multibyte = 1;
7857   terminal_coding.dst_multibyte = 0;
7858   return Qnil;
7859 }
7860
7861 DEFUN ("set-safe-terminal-coding-system-internal",
7862        Fset_safe_terminal_coding_system_internal,
7863        Sset_safe_terminal_coding_system_internal, 1, 1, 0,
7864        doc: /* Internal use only.  */)
7865      (coding_system)
7866      Lisp_Object coding_system;
7867 {
7868   CHECK_SYMBOL (coding_system);
7869   setup_coding_system (Fcheck_coding_system (coding_system),
7870                        &safe_terminal_coding);
7871   /* Characer composition should be disabled.  */
7872   safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7873   safe_terminal_coding.src_multibyte = 1;
7874   safe_terminal_coding.dst_multibyte = 0;
7875   return Qnil;
7876 }
7877
7878 DEFUN ("terminal-coding-system",
7879        Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
7880        doc: /* Return coding system specified for terminal output.  */)
7881      ()
7882 {
7883   return CODING_ID_NAME (terminal_coding.id);
7884 }
7885
7886 DEFUN ("set-keyboard-coding-system-internal",
7887        Fset_keyboard_coding_system_internal,
7888        Sset_keyboard_coding_system_internal, 1, 1, 0,
7889        doc: /* Internal use only.  */)
7890      (coding_system)
7891      Lisp_Object coding_system;
7892 {
7893   CHECK_SYMBOL (coding_system);
7894   setup_coding_system (Fcheck_coding_system (coding_system),
7895                        &keyboard_coding);
7896   /* Characer composition should be disabled.  */
7897   keyboard_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7898   return Qnil;
7899 }
7900
7901 DEFUN ("keyboard-coding-system",
7902        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
7903        doc: /* Return coding system specified for decoding keyboard input.  */)
7904      ()
7905 {
7906   return CODING_ID_NAME (keyboard_coding.id);
7907 }
7908
7909 \f
7910 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
7911        Sfind_operation_coding_system,  1, MANY, 0,
7912        doc: /* Choose a coding system for an operation based on the target name.
7913 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
7914 DECODING-SYSTEM is the coding system to use for decoding
7915 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
7916 for encoding (in case OPERATION does encoding).
7917
7918 The first argument OPERATION specifies an I/O primitive:
7919   For file I/O, `insert-file-contents' or `write-region'.
7920   For process I/O, `call-process', `call-process-region', or `start-process'.
7921   For network I/O, `open-network-stream'.
7922
7923 The remaining arguments should be the same arguments that were passed
7924 to the primitive.  Depending on which primitive, one of those arguments
7925 is selected as the TARGET.  For example, if OPERATION does file I/O,
7926 whichever argument specifies the file name is TARGET.
7927
7928 TARGET has a meaning which depends on OPERATION:
7929   For file I/O, TARGET is a file name.
7930   For process I/O, TARGET is a process name.
7931   For network I/O, TARGET is a service name or a port number
7932
7933 This function looks up what specified for TARGET in,
7934 `file-coding-system-alist', `process-coding-system-alist',
7935 or `network-coding-system-alist' depending on OPERATION.
7936 They may specify a coding system, a cons of coding systems,
7937 or a function symbol to call.
7938 In the last case, we call the function with one argument,
7939 which is a list of all the arguments given to this function.
7940
7941 usage: (find-operation-coding-system OPERATION ARGUMENTS ...)  */)
7942      (nargs, args)
7943      int nargs;
7944      Lisp_Object *args;
7945 {
7946   Lisp_Object operation, target_idx, target, val;
7947   register Lisp_Object chain;
7948
7949   if (nargs < 2)
7950     error ("Too few arguments");
7951   operation = args[0];
7952   if (!SYMBOLP (operation)
7953       || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
7954     error ("Invalid first arguement");
7955   if (nargs < 1 + XINT (target_idx))
7956     error ("Too few arguments for operation: %s",
7957            SDATA (SYMBOL_NAME (operation)));
7958   target = args[XINT (target_idx) + 1];
7959   if (!(STRINGP (target)
7960         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
7961     error ("Invalid %dth argument", XINT (target_idx) + 1);
7962
7963   chain = ((EQ (operation, Qinsert_file_contents)
7964             || EQ (operation, Qwrite_region))
7965            ? Vfile_coding_system_alist
7966            : (EQ (operation, Qopen_network_stream)
7967               ? Vnetwork_coding_system_alist
7968               : Vprocess_coding_system_alist));
7969   if (NILP (chain))
7970     return Qnil;
7971
7972   for (; CONSP (chain); chain = XCDR (chain))
7973     {
7974       Lisp_Object elt;
7975
7976       elt = XCAR (chain);
7977       if (CONSP (elt)
7978           && ((STRINGP (target)
7979                && STRINGP (XCAR (elt))
7980                && fast_string_match (XCAR (elt), target) >= 0)
7981               || (INTEGERP (target) && EQ (target, XCAR (elt)))))
7982         {
7983           val = XCDR (elt);
7984           /* Here, if VAL is both a valid coding system and a valid
7985              function symbol, we return VAL as a coding system.  */
7986           if (CONSP (val))
7987             return val;
7988           if (! SYMBOLP (val))
7989             return Qnil;
7990           if (! NILP (Fcoding_system_p (val)))
7991             return Fcons (val, val);
7992           if (! NILP (Ffboundp (val)))
7993             {
7994               val = call1 (val, Flist (nargs, args));
7995               if (CONSP (val))
7996                 return val;
7997               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
7998                 return Fcons (val, val);
7999             }
8000           return Qnil;
8001         }
8002     }
8003   return Qnil;
8004 }
8005
8006 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
8007        Sset_coding_system_priority, 0, MANY, 0,
8008        doc: /* Assign higher priority to the coding systems given as arguments.
8009 If multiple coding systems belongs to the same category,
8010 all but the first one are ignored.
8011
8012 usage: (set-coding-system-priority ...)  */)
8013      (nargs, args)
8014      int nargs;
8015      Lisp_Object *args;
8016 {
8017   int i, j;
8018   int changed[coding_category_max];
8019   enum coding_category priorities[coding_category_max];
8020
8021   bzero (changed, sizeof changed);
8022
8023   for (i = j = 0; i < nargs; i++)
8024     {
8025       enum coding_category category;
8026       Lisp_Object spec, attrs;
8027
8028       CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
8029       attrs = AREF (spec, 0);
8030       category = XINT (CODING_ATTR_CATEGORY (attrs));
8031       if (changed[category])
8032         /* Ignore this coding system because a coding system of the
8033            same category already had a higher priority.  */
8034         continue;
8035       changed[category] = 1;
8036       priorities[j++] = category;
8037       if (coding_categories[category].id >= 0
8038           && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
8039         setup_coding_system (args[i], &coding_categories[category]);
8040       Fset (AREF (Vcoding_category_table, category), args[i]);
8041     }
8042
8043   /* Now we have decided top J priorities.  Reflect the order of the
8044      original priorities to the remaining priorities.  */
8045
8046   for (i = j, j = 0; i < coding_category_max; i++, j++)
8047     {
8048       while (j < coding_category_max
8049              && changed[coding_priorities[j]])
8050         j++;
8051       if (j == coding_category_max)
8052         abort ();
8053       priorities[i] = coding_priorities[j];
8054     }
8055
8056   bcopy (priorities, coding_priorities, sizeof priorities);
8057
8058   /* Update `coding-category-list'.  */
8059   Vcoding_category_list = Qnil;
8060   for (i = coding_category_max - 1; i >= 0; i--)
8061     Vcoding_category_list
8062       = Fcons (AREF (Vcoding_category_table, priorities[i]),
8063                Vcoding_category_list);
8064
8065   return Qnil;
8066 }
8067
8068 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
8069        Scoding_system_priority_list, 0, 1, 0,
8070        doc: /* Return a list of coding systems ordered by their priorities.
8071 HIGHESTP non-nil means just return the highest priority one.  */)
8072      (highestp)
8073      Lisp_Object highestp;
8074 {
8075   int i;
8076   Lisp_Object val;
8077
8078   for (i = 0, val = Qnil; i < coding_category_max; i++)
8079     {
8080       enum coding_category category = coding_priorities[i];
8081       int id = coding_categories[category].id;
8082       Lisp_Object attrs;
8083
8084       if (id < 0)
8085         continue;
8086       attrs = CODING_ID_ATTRS (id);
8087       if (! NILP (highestp))
8088         return CODING_ATTR_BASE_NAME (attrs);
8089       val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
8090     }
8091   return Fnreverse (val);
8092 }
8093
8094 static char *suffixes[] = { "-unix", "-dos", "-mac" };
8095
8096 static Lisp_Object
8097 make_subsidiaries (base)
8098      Lisp_Object base;
8099 {
8100   Lisp_Object subsidiaries;
8101   int base_name_len = SBYTES (SYMBOL_NAME (base));
8102   char *buf = (char *) alloca (base_name_len + 6);
8103   int i;
8104
8105   bcopy (SDATA (SYMBOL_NAME (base)), buf, base_name_len);
8106   subsidiaries = Fmake_vector (make_number (3), Qnil);
8107   for (i = 0; i < 3; i++)
8108     {
8109       bcopy (suffixes[i], buf + base_name_len, strlen (suffixes[i]) + 1);
8110       ASET (subsidiaries, i, intern (buf));
8111     }
8112   return subsidiaries;
8113 }
8114
8115
8116 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
8117        Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
8118        doc: /* For internal use only.
8119 usage: (define-coding-system-internal ...)  */)
8120      (nargs, args)
8121      int nargs;
8122      Lisp_Object *args;
8123 {
8124   Lisp_Object name;
8125   Lisp_Object spec_vec;         /* [ ATTRS ALIASE EOL_TYPE ] */
8126   Lisp_Object attrs;            /* Vector of attributes.  */
8127   Lisp_Object eol_type;
8128   Lisp_Object aliases;
8129   Lisp_Object coding_type, charset_list, safe_charsets;
8130   enum coding_category category;
8131   Lisp_Object tail, val;
8132   int max_charset_id = 0;
8133   int i;
8134
8135   if (nargs < coding_arg_max)
8136     goto short_args;
8137
8138   attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
8139
8140   name = args[coding_arg_name];
8141   CHECK_SYMBOL (name);
8142   CODING_ATTR_BASE_NAME (attrs) = name;
8143
8144   val = args[coding_arg_mnemonic];
8145   if (! STRINGP (val))
8146     CHECK_CHARACTER (val);
8147   CODING_ATTR_MNEMONIC (attrs) = val;
8148
8149   coding_type = args[coding_arg_coding_type];
8150   CHECK_SYMBOL (coding_type);
8151   CODING_ATTR_TYPE (attrs) = coding_type;
8152
8153   charset_list = args[coding_arg_charset_list];
8154   if (SYMBOLP (charset_list))
8155     {
8156       if (EQ (charset_list, Qiso_2022))
8157         {
8158           if (! EQ (coding_type, Qiso_2022))
8159             error ("Invalid charset-list");
8160           charset_list = Viso_2022_charset_list;
8161         }
8162       else if (EQ (charset_list, Qemacs_mule))
8163         {
8164           if (! EQ (coding_type, Qemacs_mule))
8165             error ("Invalid charset-list");
8166           charset_list = Vemacs_mule_charset_list;
8167         }
8168       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
8169         if (max_charset_id < XFASTINT (XCAR (tail)))
8170           max_charset_id = XFASTINT (XCAR (tail));
8171     }
8172   else
8173     {
8174       charset_list = Fcopy_sequence (charset_list);
8175       for (tail = charset_list; !NILP (tail); tail = Fcdr (tail))
8176         {
8177           struct charset *charset;
8178
8179           val = Fcar (tail);
8180           CHECK_CHARSET_GET_CHARSET (val, charset);
8181           if (EQ (coding_type, Qiso_2022)
8182               ? CHARSET_ISO_FINAL (charset) < 0
8183               : EQ (coding_type, Qemacs_mule)
8184               ? CHARSET_EMACS_MULE_ID (charset) < 0
8185               : 0)
8186             error ("Can't handle charset `%s'",
8187                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8188
8189           XSETCAR (tail, make_number (charset->id));
8190           if (max_charset_id < charset->id)
8191             max_charset_id = charset->id;
8192         }
8193     }
8194   CODING_ATTR_CHARSET_LIST (attrs) = charset_list;
8195
8196   safe_charsets = Fmake_string (make_number (max_charset_id + 1),
8197                                 make_number (255));
8198   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
8199     SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
8200   CODING_ATTR_SAFE_CHARSETS (attrs) = safe_charsets;
8201
8202   CODING_ATTR_ASCII_COMPAT (attrs) = args[coding_arg_ascii_compatible_p];
8203
8204   val = args[coding_arg_decode_translation_table];
8205   if (! CHAR_TABLE_P (val) && ! CONSP (val))
8206     CHECK_SYMBOL (val);
8207   CODING_ATTR_DECODE_TBL (attrs) = val;
8208
8209   val = args[coding_arg_encode_translation_table];
8210   if (! CHAR_TABLE_P (val) && ! CONSP (val))
8211     CHECK_SYMBOL (val);
8212   CODING_ATTR_ENCODE_TBL (attrs) = val;
8213
8214   val = args[coding_arg_post_read_conversion];
8215   CHECK_SYMBOL (val);
8216   CODING_ATTR_POST_READ (attrs) = val;
8217
8218   val = args[coding_arg_pre_write_conversion];
8219   CHECK_SYMBOL (val);
8220   CODING_ATTR_PRE_WRITE (attrs) = val;
8221
8222   val = args[coding_arg_default_char];
8223   if (NILP (val))
8224     CODING_ATTR_DEFAULT_CHAR (attrs) = make_number (' ');
8225   else
8226     {
8227       CHECK_CHARACTER (val);
8228       CODING_ATTR_DEFAULT_CHAR (attrs) = val;
8229     }
8230
8231   val = args[coding_arg_for_unibyte];
8232   CODING_ATTR_FOR_UNIBYTE (attrs) = NILP (val) ? Qnil : Qt;
8233
8234   val = args[coding_arg_plist];
8235   CHECK_LIST (val);
8236   CODING_ATTR_PLIST (attrs) = val;
8237
8238   if (EQ (coding_type, Qcharset))
8239     {
8240       /* Generate a lisp vector of 256 elements.  Each element is nil,
8241          integer, or a list of charset IDs.
8242
8243          If Nth element is nil, the byte code N is invalid in this
8244          coding system.
8245
8246          If Nth element is a number NUM, N is the first byte of a
8247          charset whose ID is NUM.
8248
8249          If Nth element is a list of charset IDs, N is the first byte
8250          of one of them.  The list is sorted by dimensions of the
8251          charsets.  A charset of smaller dimension comes firtst. */
8252       val = Fmake_vector (make_number (256), Qnil);
8253
8254       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
8255         {
8256           struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
8257           int dim = CHARSET_DIMENSION (charset);
8258           int idx = (dim - 1) * 4;
8259
8260           if (CHARSET_ASCII_COMPATIBLE_P (charset))
8261             CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8262
8263           for (i = charset->code_space[idx];
8264                i <= charset->code_space[idx + 1]; i++)
8265             {
8266               Lisp_Object tmp, tmp2;
8267               int dim2;
8268
8269               tmp = AREF (val, i);
8270               if (NILP (tmp))
8271                 tmp = XCAR (tail);
8272               else if (NUMBERP (tmp))
8273                 {
8274                   dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
8275                   if (dim < dim2)
8276                     tmp = Fcons (XCAR (tail), Fcons (tmp, Qnil));
8277                   else
8278                     tmp = Fcons (tmp, Fcons (XCAR (tail), Qnil));
8279                 }
8280               else
8281                 {
8282                   for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
8283                     {
8284                       dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
8285                       if (dim < dim2)
8286                         break;
8287                     }
8288                   if (NILP (tmp2))
8289                     tmp = nconc2 (tmp, Fcons (XCAR (tail), Qnil));
8290                   else
8291                     {
8292                       XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
8293                       XSETCAR (tmp2, XCAR (tail));
8294                     }
8295                 }
8296               ASET (val, i, tmp);
8297             }
8298         }
8299       ASET (attrs, coding_attr_charset_valids, val);
8300       category = coding_category_charset;
8301     }
8302   else if (EQ (coding_type, Qccl))
8303     {
8304       Lisp_Object valids;
8305
8306       if (nargs < coding_arg_ccl_max)
8307         goto short_args;
8308
8309       val = args[coding_arg_ccl_decoder];
8310       CHECK_CCL_PROGRAM (val);
8311       if (VECTORP (val))
8312         val = Fcopy_sequence (val);
8313       ASET (attrs, coding_attr_ccl_decoder, val);
8314
8315       val = args[coding_arg_ccl_encoder];
8316       CHECK_CCL_PROGRAM (val);
8317       if (VECTORP (val))
8318         val = Fcopy_sequence (val);
8319       ASET (attrs, coding_attr_ccl_encoder, val);
8320
8321       val = args[coding_arg_ccl_valids];
8322       valids = Fmake_string (make_number (256), make_number (0));
8323       for (tail = val; !NILP (tail); tail = Fcdr (tail))
8324         {
8325           int from, to;
8326
8327           val = Fcar (tail);
8328           if (INTEGERP (val))
8329             {
8330               from = to = XINT (val);
8331               if (from < 0 || from > 255)
8332                 args_out_of_range_3 (val, make_number (0), make_number (255));
8333             }
8334           else
8335             {
8336               CHECK_CONS (val);
8337               CHECK_NATNUM_CAR (val);
8338               CHECK_NATNUM_CDR (val);
8339               from = XINT (XCAR (val));
8340               if (from > 255)
8341                 args_out_of_range_3 (XCAR (val),
8342                                      make_number (0), make_number (255));
8343               to = XINT (XCDR (val));
8344               if (to < from || to > 255)
8345                 args_out_of_range_3 (XCDR (val),
8346                                      XCAR (val), make_number (255));
8347             }
8348           for (i = from; i <= to; i++)
8349             SSET (valids, i, 1);
8350         }
8351       ASET (attrs, coding_attr_ccl_valids, valids);
8352
8353       category = coding_category_ccl;
8354     }
8355   else if (EQ (coding_type, Qutf_16))
8356     {
8357       Lisp_Object bom, endian;
8358
8359       CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
8360
8361       if (nargs < coding_arg_utf16_max)
8362         goto short_args;
8363
8364       bom = args[coding_arg_utf16_bom];
8365       if (! NILP (bom) && ! EQ (bom, Qt))
8366         {
8367           CHECK_CONS (bom);
8368           val = XCAR (bom);
8369           CHECK_CODING_SYSTEM (val);
8370           val = XCDR (bom);
8371           CHECK_CODING_SYSTEM (val);
8372         }
8373       ASET (attrs, coding_attr_utf_16_bom, bom);
8374
8375       endian = args[coding_arg_utf16_endian];
8376       CHECK_SYMBOL (endian);
8377       if (NILP (endian))
8378         endian = Qbig;
8379       else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
8380         error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
8381       ASET (attrs, coding_attr_utf_16_endian, endian);
8382
8383       category = (CONSP (bom)
8384                   ? coding_category_utf_16_auto
8385                   : NILP (bom)
8386                   ? (EQ (endian, Qbig)
8387                      ? coding_category_utf_16_be_nosig
8388                      : coding_category_utf_16_le_nosig)
8389                   : (EQ (endian, Qbig)
8390                      ? coding_category_utf_16_be
8391                      : coding_category_utf_16_le));
8392     }
8393   else if (EQ (coding_type, Qiso_2022))
8394     {
8395       Lisp_Object initial, reg_usage, request, flags;
8396       int i;
8397
8398       if (nargs < coding_arg_iso2022_max)
8399         goto short_args;
8400
8401       initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
8402       CHECK_VECTOR (initial);
8403       for (i = 0; i < 4; i++)
8404         {
8405           val = Faref (initial, make_number (i));
8406           if (! NILP (val))
8407             {
8408               struct charset *charset;
8409
8410               CHECK_CHARSET_GET_CHARSET (val, charset);
8411               ASET (initial, i, make_number (CHARSET_ID (charset)));
8412               if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
8413                 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8414             }
8415           else
8416             ASET (initial, i, make_number (-1));
8417         }
8418
8419       reg_usage = args[coding_arg_iso2022_reg_usage];
8420       CHECK_CONS (reg_usage);
8421       CHECK_NUMBER_CAR (reg_usage);
8422       CHECK_NUMBER_CDR (reg_usage);
8423
8424       request = Fcopy_sequence (args[coding_arg_iso2022_request]);
8425       for (tail = request; ! NILP (tail); tail = Fcdr (tail))
8426         {
8427           int id;
8428           Lisp_Object tmp;
8429
8430           val = Fcar (tail);
8431           CHECK_CONS (val);
8432           tmp = XCAR (val);
8433           CHECK_CHARSET_GET_ID (tmp, id);
8434           CHECK_NATNUM_CDR (val);
8435           if (XINT (XCDR (val)) >= 4)
8436             error ("Invalid graphic register number: %d", XINT (XCDR (val)));
8437           XSETCAR (val, make_number (id));
8438         }
8439
8440       flags = args[coding_arg_iso2022_flags];
8441       CHECK_NATNUM (flags);
8442       i = XINT (flags);
8443       if (EQ (args[coding_arg_charset_list], Qiso_2022))
8444         flags = make_number (i | CODING_ISO_FLAG_FULL_SUPPORT);
8445
8446       ASET (attrs, coding_attr_iso_initial, initial);
8447       ASET (attrs, coding_attr_iso_usage, reg_usage);
8448       ASET (attrs, coding_attr_iso_request, request);
8449       ASET (attrs, coding_attr_iso_flags, flags);
8450       setup_iso_safe_charsets (attrs);
8451
8452       if (i & CODING_ISO_FLAG_SEVEN_BITS)
8453         category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
8454                           | CODING_ISO_FLAG_SINGLE_SHIFT))
8455                     ? coding_category_iso_7_else
8456                     : EQ (args[coding_arg_charset_list], Qiso_2022)
8457                     ? coding_category_iso_7
8458                     : coding_category_iso_7_tight);
8459       else
8460         {
8461           int id = XINT (AREF (initial, 1));
8462
8463           category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
8464                        || EQ (args[coding_arg_charset_list], Qiso_2022)
8465                        || id < 0)
8466                       ? coding_category_iso_8_else
8467                       : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
8468                       ? coding_category_iso_8_1
8469                       : coding_category_iso_8_2);
8470         }
8471       if (category != coding_category_iso_8_1
8472           && category != coding_category_iso_8_2)
8473         CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
8474     }
8475   else if (EQ (coding_type, Qemacs_mule))
8476     {
8477       if (EQ (args[coding_arg_charset_list], Qemacs_mule))
8478         ASET (attrs, coding_attr_emacs_mule_full, Qt);
8479       CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8480       category = coding_category_emacs_mule;
8481     }
8482   else if (EQ (coding_type, Qshift_jis))
8483     {
8484
8485       struct charset *charset;
8486
8487       if (XINT (Flength (charset_list)) != 3
8488           && XINT (Flength (charset_list)) != 4)
8489         error ("There should be three or four charsets");
8490
8491       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8492       if (CHARSET_DIMENSION (charset) != 1)
8493         error ("Dimension of charset %s is not one",
8494                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8495       if (CHARSET_ASCII_COMPATIBLE_P (charset))
8496         CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8497
8498       charset_list = XCDR (charset_list);
8499       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8500       if (CHARSET_DIMENSION (charset) != 1)
8501         error ("Dimension of charset %s is not one",
8502                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8503
8504       charset_list = XCDR (charset_list);
8505       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8506       if (CHARSET_DIMENSION (charset) != 2)
8507         error ("Dimension of charset %s is not two",
8508                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8509
8510       charset_list = XCDR (charset_list);
8511       if (! NILP (charset_list))
8512         {
8513           charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8514           if (CHARSET_DIMENSION (charset) != 2)
8515             error ("Dimension of charset %s is not two",
8516                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8517         }
8518
8519       category = coding_category_sjis;
8520       Vsjis_coding_system = name;
8521     }
8522   else if (EQ (coding_type, Qbig5))
8523     {
8524       struct charset *charset;
8525
8526       if (XINT (Flength (charset_list)) != 2)
8527         error ("There should be just two charsets");
8528
8529       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8530       if (CHARSET_DIMENSION (charset) != 1)
8531         error ("Dimension of charset %s is not one",
8532                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8533       if (CHARSET_ASCII_COMPATIBLE_P (charset))
8534         CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8535
8536       charset_list = XCDR (charset_list);
8537       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8538       if (CHARSET_DIMENSION (charset) != 2)
8539         error ("Dimension of charset %s is not two",
8540                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8541
8542       category = coding_category_big5;
8543       Vbig5_coding_system = name;
8544     }
8545   else if (EQ (coding_type, Qraw_text))
8546     {
8547       category = coding_category_raw_text;
8548       CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8549     }
8550   else if (EQ (coding_type, Qutf_8))
8551     {
8552       category = coding_category_utf_8;
8553       CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8554     }
8555   else if (EQ (coding_type, Qundecided))
8556     category = coding_category_undecided;
8557   else
8558     error ("Invalid coding system type: %s",
8559            SDATA (SYMBOL_NAME (coding_type)));
8560
8561   CODING_ATTR_CATEGORY (attrs) = make_number (category);
8562   CODING_ATTR_PLIST (attrs)
8563     = Fcons (QCcategory, Fcons (AREF (Vcoding_category_table, category),
8564                                 CODING_ATTR_PLIST (attrs)));
8565
8566   eol_type = args[coding_arg_eol_type];
8567   if (! NILP (eol_type)
8568       && ! EQ (eol_type, Qunix)
8569       && ! EQ (eol_type, Qdos)
8570       && ! EQ (eol_type, Qmac))
8571     error ("Invalid eol-type");
8572
8573   aliases = Fcons (name, Qnil);
8574
8575   if (NILP (eol_type))
8576     {
8577       eol_type = make_subsidiaries (name);
8578       for (i = 0; i < 3; i++)
8579         {
8580           Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
8581
8582           this_name = AREF (eol_type, i);
8583           this_aliases = Fcons (this_name, Qnil);
8584           this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
8585           this_spec = Fmake_vector (make_number (3), attrs);
8586           ASET (this_spec, 1, this_aliases);
8587           ASET (this_spec, 2, this_eol_type);
8588           Fputhash (this_name, this_spec, Vcoding_system_hash_table);
8589           Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
8590           Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
8591                                         Vcoding_system_alist);
8592         }
8593     }
8594
8595   spec_vec = Fmake_vector (make_number (3), attrs);
8596   ASET (spec_vec, 1, aliases);
8597   ASET (spec_vec, 2, eol_type);
8598
8599   Fputhash (name, spec_vec, Vcoding_system_hash_table);
8600   Vcoding_system_list = Fcons (name, Vcoding_system_list);
8601   Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
8602                                 Vcoding_system_alist);
8603
8604   {
8605     int id = coding_categories[category].id;
8606
8607     if (id < 0 || EQ (name, CODING_ID_NAME (id)))
8608       setup_coding_system (name, &coding_categories[category]);
8609   }
8610
8611   return Qnil;
8612
8613  short_args:
8614   return Fsignal (Qwrong_number_of_arguments,
8615                   Fcons (intern ("define-coding-system-internal"),
8616                          make_number (nargs)));
8617 }
8618
8619
8620 DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
8621        3, 3, 0,
8622        doc: /* Change value in CODING-SYSTEM's property list PROP to VAL.  */)
8623   (coding_system, prop, val)
8624      Lisp_Object coding_system, prop, val;
8625 {
8626   Lisp_Object spec, attrs, plist;
8627
8628   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
8629   attrs = AREF (spec, 0);
8630   if (EQ (prop, QCmnemonic))
8631     {
8632       if (! STRINGP (val))
8633         CHECK_CHARACTER (val);
8634       CODING_ATTR_MNEMONIC (attrs) = val;
8635     }
8636   else if (EQ (prop, QCdefalut_char))
8637     {
8638       if (NILP (val))
8639         val = make_number (' ');
8640       else
8641         CHECK_CHARACTER (val);
8642       CODING_ATTR_DEFAULT_CHAR (attrs) = val;
8643     }
8644   else if (EQ (prop, QCdecode_translation_table))
8645     {
8646       if (! CHAR_TABLE_P (val) && ! CONSP (val))
8647         CHECK_SYMBOL (val);
8648       CODING_ATTR_DECODE_TBL (attrs) = val;
8649     }
8650   else if (EQ (prop, QCencode_translation_table))
8651     {
8652       if (! CHAR_TABLE_P (val) && ! CONSP (val))
8653         CHECK_SYMBOL (val);
8654       CODING_ATTR_ENCODE_TBL (attrs) = val;
8655     }
8656   else if (EQ (prop, QCpost_read_conversion))
8657     {
8658       CHECK_SYMBOL (val);
8659       CODING_ATTR_POST_READ (attrs) = val;
8660     }
8661   else if (EQ (prop, QCpre_write_conversion))
8662     {
8663       CHECK_SYMBOL (val);
8664       CODING_ATTR_PRE_WRITE (attrs) = val;
8665     }
8666
8667   CODING_ATTR_PLIST (attrs)
8668     = Fplist_put (CODING_ATTR_PLIST (attrs), prop, val);
8669   return val;
8670 }
8671
8672
8673 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
8674        Sdefine_coding_system_alias, 2, 2, 0,
8675        doc: /* Define ALIAS as an alias for CODING-SYSTEM.  */)
8676      (alias, coding_system)
8677      Lisp_Object alias, coding_system;
8678 {
8679   Lisp_Object spec, aliases, eol_type;
8680
8681   CHECK_SYMBOL (alias);
8682   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
8683   aliases = AREF (spec, 1);
8684   /* ALISES should be a list of length more than zero, and the first
8685      element is a base coding system.  Append ALIAS at the tail of the
8686      list.  */
8687   while (!NILP (XCDR (aliases)))
8688     aliases = XCDR (aliases);
8689   XSETCDR (aliases, Fcons (alias, Qnil));
8690
8691   eol_type = AREF (spec, 2);
8692   if (VECTORP (eol_type))
8693     {
8694       Lisp_Object subsidiaries;
8695       int i;
8696
8697       subsidiaries = make_subsidiaries (alias);
8698       for (i = 0; i < 3; i++)
8699         Fdefine_coding_system_alias (AREF (subsidiaries, i),
8700                                      AREF (eol_type, i));
8701     }
8702
8703   Fputhash (alias, spec, Vcoding_system_hash_table);
8704   Vcoding_system_list = Fcons (alias, Vcoding_system_list);
8705   Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
8706                                 Vcoding_system_alist);
8707
8708   return Qnil;
8709 }
8710
8711 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
8712        1, 1, 0,
8713        doc: /* Return the base of CODING-SYSTEM.
8714 Any alias or subsidiary coding system is not a base coding system.  */)
8715   (coding_system)
8716      Lisp_Object coding_system;
8717 {
8718   Lisp_Object spec, attrs;
8719
8720   if (NILP (coding_system))
8721     return (Qno_conversion);
8722   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
8723   attrs = AREF (spec, 0);
8724   return CODING_ATTR_BASE_NAME (attrs);
8725 }
8726
8727 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
8728        1, 1, 0,
8729        doc: "Return the property list of CODING-SYSTEM.")
8730      (coding_system)
8731      Lisp_Object coding_system;
8732 {
8733   Lisp_Object spec, attrs;
8734
8735   if (NILP (coding_system))
8736     coding_system = Qno_conversion;
8737   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
8738   attrs = AREF (spec, 0);
8739   return CODING_ATTR_PLIST (attrs);
8740 }
8741
8742
8743 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
8744        1, 1, 0,
8745        doc: /* Return the list of aliases of CODING-SYSTEM.  */)
8746      (coding_system)
8747      Lisp_Object coding_system;
8748 {
8749   Lisp_Object spec;
8750
8751   if (NILP (coding_system))
8752     coding_system = Qno_conversion;
8753   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
8754   return AREF (spec, 1);
8755 }
8756
8757 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
8758        Scoding_system_eol_type, 1, 1, 0,
8759        doc: /* Return eol-type of CODING-SYSTEM.
8760 An eol-type is integer 0, 1, 2, or a vector of coding systems.
8761
8762 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
8763 and CR respectively.
8764
8765 A vector value indicates that a format of end-of-line should be
8766 detected automatically.  Nth element of the vector is the subsidiary
8767 coding system whose eol-type is N.  */)
8768      (coding_system)
8769      Lisp_Object coding_system;
8770 {
8771   Lisp_Object spec, eol_type;
8772   int n;
8773
8774   if (NILP (coding_system))
8775     coding_system = Qno_conversion;
8776   if (! CODING_SYSTEM_P (coding_system))
8777     return Qnil;
8778   spec = CODING_SYSTEM_SPEC (coding_system);
8779   eol_type = AREF (spec, 2);
8780   if (VECTORP (eol_type))
8781     return Fcopy_sequence (eol_type);
8782   n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
8783   return make_number (n);
8784 }
8785
8786 #endif /* emacs */
8787
8788 \f
8789 /*** 9. Post-amble ***/
8790
8791 void
8792 init_coding_once ()
8793 {
8794   int i;
8795
8796   for (i = 0; i < coding_category_max; i++)
8797     {
8798       coding_categories[i].id = -1;
8799       coding_priorities[i] = i;
8800     }
8801
8802   /* ISO2022 specific initialize routine.  */
8803   for (i = 0; i < 0x20; i++)
8804     iso_code_class[i] = ISO_control_0;
8805   for (i = 0x21; i < 0x7F; i++)
8806     iso_code_class[i] = ISO_graphic_plane_0;
8807   for (i = 0x80; i < 0xA0; i++)
8808     iso_code_class[i] = ISO_control_1;
8809   for (i = 0xA1; i < 0xFF; i++)
8810     iso_code_class[i] = ISO_graphic_plane_1;
8811   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
8812   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
8813   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
8814   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
8815   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
8816   iso_code_class[ISO_CODE_ESC] = ISO_escape;
8817   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
8818   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
8819   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
8820
8821   for (i = 0; i < 256; i++)
8822     {
8823       emacs_mule_bytes[i] = 1;
8824     }
8825   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
8826   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
8827   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
8828   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
8829 }
8830
8831 #ifdef emacs
8832
8833 void
8834 syms_of_coding ()
8835 {
8836   staticpro (&Vcoding_system_hash_table);
8837   {
8838     Lisp_Object args[2];
8839     args[0] = QCtest;
8840     args[1] = Qeq;
8841     Vcoding_system_hash_table = Fmake_hash_table (2, args);
8842   }
8843
8844   staticpro (&Vsjis_coding_system);
8845   Vsjis_coding_system = Qnil;
8846
8847   staticpro (&Vbig5_coding_system);
8848   Vbig5_coding_system = Qnil;
8849
8850   staticpro (&Vcode_conversion_reused_workbuf);
8851   Vcode_conversion_reused_workbuf = Qnil;
8852
8853   staticpro (&Vcode_conversion_workbuf_name);
8854   Vcode_conversion_workbuf_name = build_string (" *code-conversion-work*");
8855
8856   reused_workbuf_in_use = 0;
8857
8858   DEFSYM (Qcharset, "charset");
8859   DEFSYM (Qtarget_idx, "target-idx");
8860   DEFSYM (Qcoding_system_history, "coding-system-history");
8861   Fset (Qcoding_system_history, Qnil);
8862
8863   /* Target FILENAME is the first argument.  */
8864   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
8865   /* Target FILENAME is the third argument.  */
8866   Fput (Qwrite_region, Qtarget_idx, make_number (2));
8867
8868   DEFSYM (Qcall_process, "call-process");
8869   /* Target PROGRAM is the first argument.  */
8870   Fput (Qcall_process, Qtarget_idx, make_number (0));
8871
8872   DEFSYM (Qcall_process_region, "call-process-region");
8873   /* Target PROGRAM is the third argument.  */
8874   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
8875
8876   DEFSYM (Qstart_process, "start-process");
8877   /* Target PROGRAM is the third argument.  */
8878   Fput (Qstart_process, Qtarget_idx, make_number (2));
8879
8880   DEFSYM (Qopen_network_stream, "open-network-stream");
8881   /* Target SERVICE is the fourth argument.  */
8882   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
8883
8884   DEFSYM (Qcoding_system, "coding-system");
8885   DEFSYM (Qcoding_aliases, "coding-aliases");
8886
8887   DEFSYM (Qeol_type, "eol-type");
8888   DEFSYM (Qunix, "unix");
8889   DEFSYM (Qdos, "dos");
8890
8891   DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
8892   DEFSYM (Qpost_read_conversion, "post-read-conversion");
8893   DEFSYM (Qpre_write_conversion, "pre-write-conversion");
8894   DEFSYM (Qdefault_char, "default-char");
8895   DEFSYM (Qundecided, "undecided");
8896   DEFSYM (Qno_conversion, "no-conversion");
8897   DEFSYM (Qraw_text, "raw-text");
8898
8899   DEFSYM (Qiso_2022, "iso-2022");
8900
8901   DEFSYM (Qutf_8, "utf-8");
8902   DEFSYM (Qutf_8_emacs, "utf-8-emacs");
8903
8904   DEFSYM (Qutf_16, "utf-16");
8905   DEFSYM (Qbig, "big");
8906   DEFSYM (Qlittle, "little");
8907
8908   DEFSYM (Qshift_jis, "shift-jis");
8909   DEFSYM (Qbig5, "big5");
8910
8911   DEFSYM (Qcoding_system_p, "coding-system-p");
8912
8913   DEFSYM (Qcoding_system_error, "coding-system-error");
8914   Fput (Qcoding_system_error, Qerror_conditions,
8915         Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
8916   Fput (Qcoding_system_error, Qerror_message,
8917         build_string ("Invalid coding system"));
8918
8919   /* Intern this now in case it isn't already done.
8920      Setting this variable twice is harmless.
8921      But don't staticpro it here--that is done in alloc.c.  */
8922   Qchar_table_extra_slots = intern ("char-table-extra-slots");
8923
8924   DEFSYM (Qtranslation_table, "translation-table");
8925   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (1));
8926   DEFSYM (Qtranslation_table_id, "translation-table-id");
8927   DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
8928   DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
8929
8930   DEFSYM (Qvalid_codes, "valid-codes");
8931
8932   DEFSYM (Qemacs_mule, "emacs-mule");
8933
8934   DEFSYM (QCcategory, ":category");
8935   DEFSYM (QCmnemonic, ":mnemonic");
8936   DEFSYM (QCdefalut_char, ":default-char");
8937   DEFSYM (QCdecode_translation_table, ":decode-translation-table");
8938   DEFSYM (QCencode_translation_table, ":encode-translation-table");
8939   DEFSYM (QCpost_read_conversion, ":post-read-conversion");
8940   DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
8941
8942   Vcoding_category_table
8943     = Fmake_vector (make_number (coding_category_max), Qnil);
8944   staticpro (&Vcoding_category_table);
8945   /* Followings are target of code detection.  */
8946   ASET (Vcoding_category_table, coding_category_iso_7,
8947         intern ("coding-category-iso-7"));
8948   ASET (Vcoding_category_table, coding_category_iso_7_tight,
8949         intern ("coding-category-iso-7-tight"));
8950   ASET (Vcoding_category_table, coding_category_iso_8_1,
8951         intern ("coding-category-iso-8-1"));
8952   ASET (Vcoding_category_table, coding_category_iso_8_2,
8953         intern ("coding-category-iso-8-2"));
8954   ASET (Vcoding_category_table, coding_category_iso_7_else,
8955         intern ("coding-category-iso-7-else"));
8956   ASET (Vcoding_category_table, coding_category_iso_8_else,
8957         intern ("coding-category-iso-8-else"));
8958   ASET (Vcoding_category_table, coding_category_utf_8,
8959         intern ("coding-category-utf-8"));
8960   ASET (Vcoding_category_table, coding_category_utf_16_be,
8961         intern ("coding-category-utf-16-be"));
8962   ASET (Vcoding_category_table, coding_category_utf_16_auto,
8963         intern ("coding-category-utf-16-auto"));
8964   ASET (Vcoding_category_table, coding_category_utf_16_le,
8965         intern ("coding-category-utf-16-le"));
8966   ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
8967         intern ("coding-category-utf-16-be-nosig"));
8968   ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
8969         intern ("coding-category-utf-16-le-nosig"));
8970   ASET (Vcoding_category_table, coding_category_charset,
8971         intern ("coding-category-charset"));
8972   ASET (Vcoding_category_table, coding_category_sjis,
8973         intern ("coding-category-sjis"));
8974   ASET (Vcoding_category_table, coding_category_big5,
8975         intern ("coding-category-big5"));
8976   ASET (Vcoding_category_table, coding_category_ccl,
8977         intern ("coding-category-ccl"));
8978   ASET (Vcoding_category_table, coding_category_emacs_mule,
8979         intern ("coding-category-emacs-mule"));
8980   /* Followings are NOT target of code detection.  */
8981   ASET (Vcoding_category_table, coding_category_raw_text,
8982         intern ("coding-category-raw-text"));
8983   ASET (Vcoding_category_table, coding_category_undecided,
8984         intern ("coding-category-undecided"));
8985
8986   DEFSYM (Qinsufficient_source, "insufficient-source");
8987   DEFSYM (Qinconsistent_eol, "inconsistent-eol");
8988   DEFSYM (Qinvalid_source, "invalid-source");
8989   DEFSYM (Qinterrupted, "interrupted");
8990   DEFSYM (Qinsufficient_memory, "insufficient-memory");
8991
8992   defsubr (&Scoding_system_p);
8993   defsubr (&Sread_coding_system);
8994   defsubr (&Sread_non_nil_coding_system);
8995   defsubr (&Scheck_coding_system);
8996   defsubr (&Sdetect_coding_region);
8997   defsubr (&Sdetect_coding_string);
8998   defsubr (&Sfind_coding_systems_region_internal);
8999   defsubr (&Sunencodable_char_position);
9000   defsubr (&Scheck_coding_systems_region);
9001   defsubr (&Sdecode_coding_region);
9002   defsubr (&Sencode_coding_region);
9003   defsubr (&Sdecode_coding_string);
9004   defsubr (&Sencode_coding_string);
9005   defsubr (&Sdecode_sjis_char);
9006   defsubr (&Sencode_sjis_char);
9007   defsubr (&Sdecode_big5_char);
9008   defsubr (&Sencode_big5_char);
9009   defsubr (&Sset_terminal_coding_system_internal);
9010   defsubr (&Sset_safe_terminal_coding_system_internal);
9011   defsubr (&Sterminal_coding_system);
9012   defsubr (&Sset_keyboard_coding_system_internal);
9013   defsubr (&Skeyboard_coding_system);
9014   defsubr (&Sfind_operation_coding_system);
9015   defsubr (&Sset_coding_system_priority);
9016   defsubr (&Sdefine_coding_system_internal);
9017   defsubr (&Sdefine_coding_system_alias);
9018   defsubr (&Scoding_system_put);
9019   defsubr (&Scoding_system_base);
9020   defsubr (&Scoding_system_plist);
9021   defsubr (&Scoding_system_aliases);
9022   defsubr (&Scoding_system_eol_type);
9023   defsubr (&Scoding_system_priority_list);
9024
9025   DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
9026                doc: /* List of coding systems.
9027
9028 Do not alter the value of this variable manually.  This variable should be
9029 updated by the functions `define-coding-system' and
9030 `define-coding-system-alias'.  */);
9031   Vcoding_system_list = Qnil;
9032
9033   DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
9034                doc: /* Alist of coding system names.
9035 Each element is one element list of coding system name.
9036 This variable is given to `completing-read' as TABLE argument.
9037
9038 Do not alter the value of this variable manually.  This variable should be
9039 updated by the functions `make-coding-system' and
9040 `define-coding-system-alias'.  */);
9041   Vcoding_system_alist = Qnil;
9042
9043   DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
9044                doc: /* List of coding-categories (symbols) ordered by priority.
9045
9046 On detecting a coding system, Emacs tries code detection algorithms
9047 associated with each coding-category one by one in this order.  When
9048 one algorithm agrees with a byte sequence of source text, the coding
9049 system bound to the corresponding coding-category is selected.  */);
9050   {
9051     int i;
9052
9053     Vcoding_category_list = Qnil;
9054     for (i = coding_category_max - 1; i >= 0; i--)
9055       Vcoding_category_list
9056         = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
9057                  Vcoding_category_list);
9058   }
9059
9060   DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
9061                doc: /* Specify the coding system for read operations.
9062 It is useful to bind this variable with `let', but do not set it globally.
9063 If the value is a coding system, it is used for decoding on read operation.
9064 If not, an appropriate element is used from one of the coding system alists:
9065 There are three such tables, `file-coding-system-alist',
9066 `process-coding-system-alist', and `network-coding-system-alist'.  */);
9067   Vcoding_system_for_read = Qnil;
9068
9069   DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
9070                doc: /* Specify the coding system for write operations.
9071 Programs bind this variable with `let', but you should not set it globally.
9072 If the value is a coding system, it is used for encoding of output,
9073 when writing it to a file and when sending it to a file or subprocess.
9074
9075 If this does not specify a coding system, an appropriate element
9076 is used from one of the coding system alists:
9077 There are three such tables, `file-coding-system-alist',
9078 `process-coding-system-alist', and `network-coding-system-alist'.
9079 For output to files, if the above procedure does not specify a coding system,
9080 the value of `buffer-file-coding-system' is used.  */);
9081   Vcoding_system_for_write = Qnil;
9082
9083   DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
9084                doc: /*
9085 Coding system used in the latest file or process I/O.  */);
9086   Vlast_coding_system_used = Qnil;
9087
9088   DEFVAR_LISP ("last-code-conversion-error", &Vlast_code_conversion_error,
9089                doc: /*
9090 Error status of the last code conversion.
9091
9092 When an error was detected in the last code conversion, this variable
9093 is set to one of the following symbols.
9094   `insufficient-source'
9095   `inconsistent-eol'
9096   `invalid-source'
9097   `interrupted'
9098   `insufficient-memory'
9099 When no error was detected, the value doesn't change.  So, to check
9100 the error status of a code conversion by this variable, you must
9101 explicitly set this variable to nil before performing code
9102 conversion.  */);
9103   Vlast_code_conversion_error = Qnil;
9104
9105   DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
9106                doc: /*
9107 *Non-nil means always inhibit code conversion of end-of-line format.
9108 See info node `Coding Systems' and info node `Text and Binary' concerning
9109 such conversion.  */);
9110   inhibit_eol_conversion = 0;
9111
9112   DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
9113                doc: /*
9114 Non-nil means process buffer inherits coding system of process output.
9115 Bind it to t if the process output is to be treated as if it were a file
9116 read from some filesystem.  */);
9117   inherit_process_coding_system = 0;
9118
9119   DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
9120                doc: /*
9121 Alist to decide a coding system to use for a file I/O operation.
9122 The format is ((PATTERN . VAL) ...),
9123 where PATTERN is a regular expression matching a file name,
9124 VAL is a coding system, a cons of coding systems, or a function symbol.
9125 If VAL is a coding system, it is used for both decoding and encoding
9126 the file contents.
9127 If VAL is a cons of coding systems, the car part is used for decoding,
9128 and the cdr part is used for encoding.
9129 If VAL is a function symbol, the function must return a coding system
9130 or a cons of coding systems which are used as above.  The function gets
9131 the arguments with which `find-operation-coding-systems' was called.
9132
9133 See also the function `find-operation-coding-system'
9134 and the variable `auto-coding-alist'.  */);
9135   Vfile_coding_system_alist = Qnil;
9136
9137   DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
9138                doc: /*
9139 Alist to decide a coding system to use for a process I/O operation.
9140 The format is ((PATTERN . VAL) ...),
9141 where PATTERN is a regular expression matching a program name,
9142 VAL is a coding system, a cons of coding systems, or a function symbol.
9143 If VAL is a coding system, it is used for both decoding what received
9144 from the program and encoding what sent to the program.
9145 If VAL is a cons of coding systems, the car part is used for decoding,
9146 and the cdr part is used for encoding.
9147 If VAL is a function symbol, the function must return a coding system
9148 or a cons of coding systems which are used as above.
9149
9150 See also the function `find-operation-coding-system'.  */);
9151   Vprocess_coding_system_alist = Qnil;
9152
9153   DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
9154                doc: /*
9155 Alist to decide a coding system to use for a network I/O operation.
9156 The format is ((PATTERN . VAL) ...),
9157 where PATTERN is a regular expression matching a network service name
9158 or is a port number to connect to,
9159 VAL is a coding system, a cons of coding systems, or a function symbol.
9160 If VAL is a coding system, it is used for both decoding what received
9161 from the network stream and encoding what sent to the network stream.
9162 If VAL is a cons of coding systems, the car part is used for decoding,
9163 and the cdr part is used for encoding.
9164 If VAL is a function symbol, the function must return a coding system
9165 or a cons of coding systems which are used as above.
9166
9167 See also the function `find-operation-coding-system'.  */);
9168   Vnetwork_coding_system_alist = Qnil;
9169
9170   DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
9171                doc: /* Coding system to use with system messages.
9172 Also used for decoding keyboard input on X Window system.  */);
9173   Vlocale_coding_system = Qnil;
9174
9175   /* The eol mnemonics are reset in startup.el system-dependently.  */
9176   DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
9177                doc: /*
9178 *String displayed in mode line for UNIX-like (LF) end-of-line format.  */);
9179   eol_mnemonic_unix = build_string (":");
9180
9181   DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
9182                doc: /*
9183 *String displayed in mode line for DOS-like (CRLF) end-of-line format.  */);
9184   eol_mnemonic_dos = build_string ("\\");
9185
9186   DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
9187                doc: /*
9188 *String displayed in mode line for MAC-like (CR) end-of-line format.  */);
9189   eol_mnemonic_mac = build_string ("/");
9190
9191   DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
9192                doc: /*
9193 *String displayed in mode line when end-of-line format is not yet determined.  */);
9194   eol_mnemonic_undecided = build_string (":");
9195
9196   DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
9197                doc: /*
9198 *Non-nil enables character translation while encoding and decoding.  */);
9199   Venable_character_translation = Qt;
9200
9201   DEFVAR_LISP ("standard-translation-table-for-decode",
9202                &Vstandard_translation_table_for_decode,
9203                doc: /* Table for translating characters while decoding.  */);
9204   Vstandard_translation_table_for_decode = Qnil;
9205
9206   DEFVAR_LISP ("standard-translation-table-for-encode",
9207                &Vstandard_translation_table_for_encode,
9208                doc: /* Table for translating characters while encoding.  */);
9209   Vstandard_translation_table_for_encode = Qnil;
9210
9211   DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_table,
9212                doc: /* Alist of charsets vs revision numbers.
9213 While encoding, if a charset (car part of an element) is found,
9214 designate it with the escape sequence identifying revision (cdr part
9215 of the element).  */);
9216   Vcharset_revision_table = Qnil;
9217
9218   DEFVAR_LISP ("default-process-coding-system",
9219                &Vdefault_process_coding_system,
9220                doc: /* Cons of coding systems used for process I/O by default.
9221 The car part is used for decoding a process output,
9222 the cdr part is used for encoding a text to be sent to a process.  */);
9223   Vdefault_process_coding_system = Qnil;
9224
9225   DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
9226                doc: /*
9227 Table of extra Latin codes in the range 128..159 (inclusive).
9228 This is a vector of length 256.
9229 If Nth element is non-nil, the existence of code N in a file
9230 \(or output of subprocess) doesn't prevent it to be detected as
9231 a coding system of ISO 2022 variant which has a flag
9232 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
9233 or reading output of a subprocess.
9234 Only 128th through 159th elements has a meaning.  */);
9235   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
9236
9237   DEFVAR_LISP ("select-safe-coding-system-function",
9238                &Vselect_safe_coding_system_function,
9239                doc: /*
9240 Function to call to select safe coding system for encoding a text.
9241
9242 If set, this function is called to force a user to select a proper
9243 coding system which can encode the text in the case that a default
9244 coding system used in each operation can't encode the text.
9245
9246 The default value is `select-safe-coding-system' (which see).  */);
9247   Vselect_safe_coding_system_function = Qnil;
9248
9249   DEFVAR_BOOL ("coding-system-require-warning",
9250                &coding_system_require_warning,
9251                doc: /* Internal use only.
9252 If non-nil, on writing a file, `select-safe-coding-system-function' is
9253 called even if `coding-system-for-write' is non-nil.  The command
9254 `universal-coding-system-argument' binds this variable to t temporarily.  */);
9255   coding_system_require_warning = 0;
9256
9257
9258   DEFVAR_BOOL ("inhibit-iso-escape-detection",
9259                &inhibit_iso_escape_detection,
9260                doc: /*
9261 If non-nil, Emacs ignores ISO2022's escape sequence on code detection.
9262
9263 By default, on reading a file, Emacs tries to detect how the text is
9264 encoded.  This code detection is sensitive to escape sequences.  If
9265 the sequence is valid as ISO2022, the code is determined as one of
9266 the ISO2022 encodings, and the file is decoded by the corresponding
9267 coding system (e.g. `iso-2022-7bit').
9268
9269 However, there may be a case that you want to read escape sequences in
9270 a file as is.  In such a case, you can set this variable to non-nil.
9271 Then, as the code detection ignores any escape sequences, no file is
9272 detected as encoded in some ISO2022 encoding.  The result is that all
9273 escape sequences become visible in a buffer.
9274
9275 The default value is nil, and it is strongly recommended not to change
9276 it.  That is because many Emacs Lisp source files that contain
9277 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
9278 in Emacs's distribution, and they won't be decoded correctly on
9279 reading if you suppress escape sequence detection.
9280
9281 The other way to read escape sequences in a file without decoding is
9282 to explicitly specify some coding system that doesn't use ISO2022's
9283 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument].  */);
9284   inhibit_iso_escape_detection = 0;
9285
9286   DEFVAR_LISP ("translation-table-for-input", &Vtranslation_table_for_input,
9287                doc: /* Char table for translating self-inserting characters.
9288 This is applied to the result of input methods, not their input.  See also
9289 `keyboard-translate-table'.  */);
9290     Vtranslation_table_for_input = Qnil;
9291
9292   {
9293     Lisp_Object args[coding_arg_max];
9294     Lisp_Object plist[16];
9295     int i;
9296
9297     for (i = 0; i < coding_arg_max; i++)
9298       args[i] = Qnil;
9299
9300     plist[0] = intern (":name");
9301     plist[1] = args[coding_arg_name] = Qno_conversion;
9302     plist[2] = intern (":mnemonic");
9303     plist[3] = args[coding_arg_mnemonic] = make_number ('=');
9304     plist[4] = intern (":coding-type");
9305     plist[5] = args[coding_arg_coding_type] = Qraw_text;
9306     plist[6] = intern (":ascii-compatible-p");
9307     plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
9308     plist[8] = intern (":default-char");
9309     plist[9] = args[coding_arg_default_char] = make_number (0);
9310     plist[10] = intern (":for-unibyte");
9311     plist[11] = args[coding_arg_for_unibyte] = Qt;
9312     plist[12] = intern (":docstring");
9313     plist[13] = build_string ("Do no conversion.\n\
9314 \n\
9315 When you visit a file with this coding, the file is read into a\n\
9316 unibyte buffer as is, thus each byte of a file is treated as a\n\
9317 character.");
9318     plist[14] = intern (":eol-type");
9319     plist[15] = args[coding_arg_eol_type] = Qunix;
9320     args[coding_arg_plist] = Flist (16, plist);
9321     Fdefine_coding_system_internal (coding_arg_max, args);
9322   }
9323
9324   setup_coding_system (Qno_conversion, &keyboard_coding);
9325   setup_coding_system (Qno_conversion, &terminal_coding);
9326   setup_coding_system (Qno_conversion, &safe_terminal_coding);
9327
9328   {
9329     int i;
9330
9331     for (i = 0; i < coding_category_max; i++)
9332       Fset (AREF (Vcoding_category_table, i), Qno_conversion);
9333   }
9334 }
9335
9336 char *
9337 emacs_strerror (error_number)
9338      int error_number;
9339 {
9340   char *str;
9341
9342   synchronize_system_messages_locale ();
9343   str = strerror (error_number);
9344
9345   if (! NILP (Vlocale_coding_system))
9346     {
9347       Lisp_Object dec = code_convert_string_norecord (build_string (str),
9348                                                       Vlocale_coding_system,
9349                                                       0);
9350       str = (char *) SDATA (dec);
9351     }
9352
9353   return str;
9354 }
9355
9356 #endif /* emacs */