src/coding.c

   1 /* Coding system handler (conversion, detection, etc).
   2    Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
   3    Licensed to the Free Software Foundation.
   4    Copyright (C) 2001, 2002 Free Software Foundation, Inc.
   5    Copyright (C) 2001, 2002
   6      National Institute of Advanced Industrial Science and Technology (AIST)
   7      Registration Number H13PRO009
   8
   9 This file is part of GNU Emacs.
  10
  11 GNU Emacs is free software; you can redistribute it and/or modify
  12 it under the terms of the GNU General Public License as published by
  13 the Free Software Foundation; either version 2, or (at your option)
  14 any later version.
  15
  16 GNU Emacs is distributed in the hope that it will be useful,
  17 but WITHOUT ANY WARRANTY; without even the implied warranty of
  18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  19 GNU General Public License for more details.
  20
  21 You should have received a copy of the GNU General Public License
  22 along with GNU Emacs; see the file COPYING.  If not, write to
  23 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  24 Boston, MA 02111-1307, USA.  */
  25
  26 /*** TABLE OF CONTENTS ***
  27
  28   0. General comments
  29   1. Preamble
  30   2. Emacs' internal format (emacs-utf-8) handlers
  31   3. UTF-8 handlers
  32   4. UTF-16 handlers
  33   5. Charset-base coding systems handlers
  34   6. emacs-mule (old Emacs' internal format) handlers
  35   7. ISO2022 handlers
  36   8. Shift-JIS and BIG5 handlers
  37   9. CCL handlers
  38   10. C library functions
  39   11. Emacs Lisp library functions
  40   12. Postamble
  41
  42 */
  43
  44 /*** 0. General comments ***
  45
  46
  47 CODING SYSTEM
  48
  49   A coding system is an object for an encoding mechanism that contains
  50   information about how to convert byte sequences to character
  51   sequences and vice versa.  When we say "decode", it means converting
  52   a byte sequence of a specific coding system into a character
  53   sequence that is represented by Emacs' internal coding system
  54   `emacs-utf-8', and when we say "encode", it means converting a
  55   character sequence of emacs-utf-8 to a byte sequence of a specific
  56   coding system.
  57
  58   In Emacs Lisp, a coding system is represented by a Lisp symbol.  In
  59   C level, a coding system is represented by a vector of attributes
  60   stored in the hash table Vcharset_hash_table.  The conversion from
  61   coding system symbol to attributes vector is done by looking up
  62   Vcharset_hash_table by the symbol.
  63
  64   Coding systems are classified into the following types depending on
  65   the encoding mechanism.  Here's a brief description of the types.
  66
  67   o UTF-8
  68
  69   o UTF-16
  70
  71   o Charset-base coding system
  72
  73   A coding system defined by one or more (coded) character sets.
  74   Decoding and encoding are done by a code converter defined for each
  75   character set.
  76
  77   o Old Emacs internal format (emacs-mule)
  78
  79   The coding system adopted by old versions of Emacs (20 and 21).
  80
  81   o ISO2022-base coding system
  82
  83   The most famous coding system for multiple character sets.  X's
  84   Compound Text, various EUCs (Extended Unix Code), and coding systems
  85   used in the Internet communication such as ISO-2022-JP are all
  86   variants of ISO2022.
  87
  88   o SJIS (or Shift-JIS or MS-Kanji-Code)
  89
  90   A coding system to encode character sets: ASCII, JISX0201, and
  91   JISX0208.  Widely used for PC's in Japan.  Details are described in
  92   section 8.
  93
  94   o BIG5
  95
  96   A coding system to encode character sets: ASCII and Big5.  Widely
  97   used for Chinese (mainly in Taiwan and Hong Kong).  Details are
  98   described in section 8.  In this file, when we write "big5" (all
  99   lowercase), we mean the coding system, and when we write "Big5"
 100   (capitalized), we mean the character set.
 101
 102   o CCL
 103
 104   If a user wants to decode/encode text encoded in a coding system
 105   not listed above, he can supply a decoder and an encoder for it in
 106   CCL (Code Conversion Language) programs.  Emacs executes the CCL
 107   program while decoding/encoding.
 108
 109   o Raw-text
 110
 111   A coding system for text containing raw eight-bit data.  Emacs
 112   treats each byte of source text as a character (except for
 113   end-of-line conversion).
 114
 115   o No-conversion
 116
 117   Like raw text, but don't do end-of-line conversion.
 118
 119
 120 END-OF-LINE FORMAT
 121
 122   How text end-of-line is encoded depends on operating system.  For
 123   instance, Unix's format is just one byte of LF (line-feed) code,
 124   whereas DOS's format is two-byte sequence of `carriage-return' and
 125   `line-feed' codes.  MacOS's format is usually one byte of
 126   `carriage-return'.
 127
 128   Since text character encoding and end-of-line encoding are
 129   independent, any coding system described above can take any format
 130   of end-of-line (except for no-conversion).
 131
 132 STRUCT CODING_SYSTEM
 133
 134   Before using a coding system for code conversion (i.e. decoding and
 135   encoding), we setup a structure of type `struct coding_system'.
 136   This structure keeps various information about a specific code
 137   conversion (e.g. the location of source and destination data).
 138
 139 */
 140
 141 /* COMMON MACROS */
 142
 143
 144 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 145
 146   These functions check if a byte sequence specified as a source in
 147   CODING conforms to the format of XXX, and update the members of
 148   DETECT_INFO.
 149
 150   Return 1 if the byte sequence conforms to XXX, otherwise return 0.
 151
 152   Below is the template of these functions.  */
 153
 154 #if 0
 155 static int
 156 detect_coding_XXX (coding, detect_info)
 157      struct coding_system *coding;
 158      struct coding_detection_info *detect_info;
 159 {
 160   unsigned char *src = coding->source;
 161   unsigned char *src_end = coding->source + coding->src_bytes;
 162   int multibytep = coding->src_multibyte;
 163   int consumed_chars = 0;
 164   int found = 0;
 165   ...;
 166
 167   while (1)
 168     {
 169       /* Get one byte from the source.  If the souce is exausted, jump
 170          to no_more_source:.  */
 171       ONE_MORE_BYTE (c);
 172
 173       if (! __C_conforms_to_XXX___ (c))
 174         break;
 175       if (! __C_strongly_suggests_XXX__ (c))
 176         found = CATEGORY_MASK_XXX;
 177     }
 178   /* The byte sequence is invalid for XXX.  */
 179   detect_info->rejected |= CATEGORY_MASK_XXX;
 180   return 0;
 181
 182  no_more_source:
 183   /* The source exausted successfully.  */
 184   detect_info->found |= found;
 185   return 1;
 186 }
 187 #endif
 188
 189 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 190
 191   These functions decode a byte sequence specified as a source by
 192   CODING.  The resulting multibyte text goes to a place pointed to by
 193   CODING->charbuf, the length of which should not exceed
 194   CODING->charbuf_size;
 195
 196   These functions set the information of original and decoded texts in
 197   CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
 198   They also set CODING->result to one of CODING_RESULT_XXX indicating
 199   how the decoding is finished.
 200
 201   Below is the template of these functions.  */
 202
 203 #if 0
 204 static void
 205 decode_coding_XXXX (coding)
 206      struct coding_system *coding;
 207 {
 208   unsigned char *src = coding->source + coding->consumed;
 209   unsigned char *src_end = coding->source + coding->src_bytes;
 210   /* SRC_BASE remembers the start position in source in each loop.
 211      The loop will be exited when there's not enough source code, or
 212      when there's no room in CHARBUF for a decoded character.  */
 213   unsigned char *src_base;
 214   /* A buffer to produce decoded characters.  */
 215   int *charbuf = coding->charbuf;
 216   int *charbuf_end = charbuf + coding->charbuf_size;
 217   int multibytep = coding->src_multibyte;
 218
 219   while (1)
 220     {
 221       src_base = src;
 222       if (charbuf < charbuf_end)
 223         /* No more room to produce a decoded character.  */
 224         break;
 225       ONE_MORE_BYTE (c);
 226       /* Decode it. */
 227     }
 228
 229  no_more_source:
 230   if (src_base < src_end
 231       && coding->mode & CODING_MODE_LAST_BLOCK)
 232     /* If the source ends by partial bytes to construct a character,
 233        treat them as eight-bit raw data.  */
 234     while (src_base < src_end && charbuf < charbuf_end)
 235       *charbuf++ = *src_base++;
 236   /* Remember how many bytes and characters we consumed.  If the
 237      source is multibyte, the bytes and chars are not identical.  */
 238   coding->consumed = coding->consumed_char = src_base - coding->source;
 239   /* Remember how many characters we produced.  */
 240   coding->charbuf_used = charbuf - coding->charbuf;
 241 }
 242 #endif
 243
 244 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 245
 246   These functions encode SRC_BYTES length text at SOURCE of Emacs'
 247   internal multibyte format by CODING.  The resulting byte sequence
 248   goes to a place pointed to by DESTINATION, the length of which
 249   should not exceed DST_BYTES.
 250
 251   These functions set the information of original and encoded texts in
 252   the members produced, produced_char, consumed, and consumed_char of
 253   the structure *CODING.  They also set the member result to one of
 254   CODING_RESULT_XXX indicating how the encoding finished.
 255
 256   DST_BYTES zero means that source area and destination area are
 257   overlapped, which means that we can produce a encoded text until it
 258   reaches at the head of not-yet-encoded source text.
 259
 260   Below is a template of these functions.  */
 261 #if 0
 262 static void
 263 encode_coding_XXX (coding)
 264      struct coding_system *coding;
 265 {
 266   int multibytep = coding->dst_multibyte;
 267   int *charbuf = coding->charbuf;
 268   int *charbuf_end = charbuf->charbuf + coding->charbuf_used;
 269   unsigned char *dst = coding->destination + coding->produced;
 270   unsigned char *dst_end = coding->destination + coding->dst_bytes;
 271   unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
 272   int produced_chars = 0;
 273
 274   for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
 275     {
 276       int c = *charbuf;
 277       /* Encode C into DST, and increment DST.  */
 278     }
 279  label_no_more_destination:
 280   /* How many chars and bytes we produced.  */
 281   coding->produced_char += produced_chars;
 282   coding->produced = dst - coding->destination;
 283 }
 284 #endif
 285
 286 \f
 287 /*** 1. Preamble ***/
 288
 289 #include <config.h>
 290 #include <stdio.h>
 291
 292 #include "lisp.h"
 293 #include "buffer.h"
 294 #include "character.h"
 295 #include "charset.h"
 296 #include "ccl.h"
 297 #include "composite.h"
 298 #include "coding.h"
 299 #include "window.h"
 300
 301 Lisp_Object Vcoding_system_hash_table;
 302
 303 Lisp_Object Qcoding_system, Qcoding_aliases, Qeol_type;
 304 Lisp_Object Qunix, Qdos;
 305 extern Lisp_Object Qmac;        /* frame.c */
 306 Lisp_Object Qbuffer_file_coding_system;
 307 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
 308 Lisp_Object Qdefault_char;
 309 Lisp_Object Qno_conversion, Qundecided;
 310 Lisp_Object Qcharset, Qiso_2022, Qutf_8, Qutf_16, Qshift_jis, Qbig5;
 311 Lisp_Object Qbig, Qlittle;
 312 Lisp_Object Qcoding_system_history;
 313 Lisp_Object Qvalid_codes;
 314
 315 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
 316 Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
 317 Lisp_Object Qstart_process, Qopen_network_stream;
 318 Lisp_Object Qtarget_idx;
 319
 320 Lisp_Object Vselect_safe_coding_system_function;
 321
 322 /* Mnemonic string for each format of end-of-line.  */
 323 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
 324 /* Mnemonic string to indicate format of end-of-line is not yet
 325    decided.  */
 326 Lisp_Object eol_mnemonic_undecided;
 327
 328 #ifdef emacs
 329
 330 Lisp_Object Vcoding_system_list, Vcoding_system_alist;
 331
 332 Lisp_Object Qcoding_system_p, Qcoding_system_error;
 333
 334 /* Coding system emacs-mule and raw-text are for converting only
 335    end-of-line format.  */
 336 Lisp_Object Qemacs_mule, Qraw_text;
 337
 338 /* Coding-systems are handed between Emacs Lisp programs and C internal
 339    routines by the following three variables.  */
 340 /* Coding-system for reading files and receiving data from process.  */
 341 Lisp_Object Vcoding_system_for_read;
 342 /* Coding-system for writing files and sending data to process.  */
 343 Lisp_Object Vcoding_system_for_write;
 344 /* Coding-system actually used in the latest I/O.  */
 345 Lisp_Object Vlast_coding_system_used;
 346
 347 /* A vector of length 256 which contains information about special
 348    Latin codes (especially for dealing with Microsoft codes).  */
 349 Lisp_Object Vlatin_extra_code_table;
 350
 351 /* Flag to inhibit code conversion of end-of-line format.  */
 352 int inhibit_eol_conversion;
 353
 354 /* Flag to inhibit ISO2022 escape sequence detection.  */
 355 int inhibit_iso_escape_detection;
 356
 357 /* Flag to make buffer-file-coding-system inherit from process-coding.  */
 358 int inherit_process_coding_system;
 359
 360 /* Coding system to be used to encode text for terminal display.  */
 361 struct coding_system terminal_coding;
 362
 363 /* Coding system to be used to encode text for terminal display when
 364    terminal coding system is nil.  */
 365 struct coding_system safe_terminal_coding;
 366
 367 /* Coding system of what is sent from terminal keyboard.  */
 368 struct coding_system keyboard_coding;
 369
 370 Lisp_Object Vfile_coding_system_alist;
 371 Lisp_Object Vprocess_coding_system_alist;
 372 Lisp_Object Vnetwork_coding_system_alist;
 373
 374 Lisp_Object Vlocale_coding_system;
 375
 376 #endif /* emacs */
 377
 378 /* Flag to tell if we look up translation table on character code
 379    conversion.  */
 380 Lisp_Object Venable_character_translation;
 381 /* Standard translation table to look up on decoding (reading).  */
 382 Lisp_Object Vstandard_translation_table_for_decode;
 383 /* Standard translation table to look up on encoding (writing).  */
 384 Lisp_Object Vstandard_translation_table_for_encode;
 385
 386 Lisp_Object Qtranslation_table;
 387 Lisp_Object Qtranslation_table_id;
 388 Lisp_Object Qtranslation_table_for_decode;
 389 Lisp_Object Qtranslation_table_for_encode;
 390
 391 /* Alist of charsets vs revision number.  */
 392 static Lisp_Object Vcharset_revision_table;
 393
 394 /* Default coding systems used for process I/O.  */
 395 Lisp_Object Vdefault_process_coding_system;
 396
 397 /* Global flag to tell that we can't call post-read-conversion and
 398    pre-write-conversion functions.  Usually the value is zero, but it
 399    is set to 1 temporarily while such functions are running.  This is
 400    to avoid infinite recursive call.  */
 401 static int inhibit_pre_post_conversion;
 402
 403 /* Two special coding systems.  */
 404 Lisp_Object Vsjis_coding_system;
 405 Lisp_Object Vbig5_coding_system;
 406
 407
 408 static int detect_coding_utf_8 P_ ((struct coding_system *,
 409                                     struct coding_detection_info *info));
 410 static void decode_coding_utf_8 P_ ((struct coding_system *));
 411 static int encode_coding_utf_8 P_ ((struct coding_system *));
 412
 413 static int detect_coding_utf_16 P_ ((struct coding_system *,
 414                                      struct coding_detection_info *info));
 415 static void decode_coding_utf_16 P_ ((struct coding_system *));
 416 static int encode_coding_utf_16 P_ ((struct coding_system *));
 417
 418 static int detect_coding_iso_2022 P_ ((struct coding_system *,
 419                                        struct coding_detection_info *info));
 420 static void decode_coding_iso_2022 P_ ((struct coding_system *));
 421 static int encode_coding_iso_2022 P_ ((struct coding_system *));
 422
 423 static int detect_coding_emacs_mule P_ ((struct coding_system *,
 424                                          struct coding_detection_info *info));
 425 static void decode_coding_emacs_mule P_ ((struct coding_system *));
 426 static int encode_coding_emacs_mule P_ ((struct coding_system *));
 427
 428 static int detect_coding_sjis P_ ((struct coding_system *,
 429                                    struct coding_detection_info *info));
 430 static void decode_coding_sjis P_ ((struct coding_system *));
 431 static int encode_coding_sjis P_ ((struct coding_system *));
 432
 433 static int detect_coding_big5 P_ ((struct coding_system *,
 434                                    struct coding_detection_info *info));
 435 static void decode_coding_big5 P_ ((struct coding_system *));
 436 static int encode_coding_big5 P_ ((struct coding_system *));
 437
 438 static int detect_coding_ccl P_ ((struct coding_system *,
 439                                   struct coding_detection_info *info));
 440 static void decode_coding_ccl P_ ((struct coding_system *));
 441 static int encode_coding_ccl P_ ((struct coding_system *));
 442
 443 static void decode_coding_raw_text P_ ((struct coding_system *));
 444 static int encode_coding_raw_text P_ ((struct coding_system *));
 445
 446
 447 /* ISO2022 section */
 448
 449 #define CODING_ISO_INITIAL(coding, reg)                 \
 450   (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id),    \
 451                      coding_attr_iso_initial),          \
 452                reg)))
 453
 454
 455 #define CODING_ISO_REQUEST(coding, charset_id)  \
 456   ((charset_id <= (coding)->max_charset_id      \
 457     ? (coding)->safe_charsets[charset_id]       \
 458     : -1))
 459
 460
 461 #define CODING_ISO_FLAGS(coding)        \
 462   ((coding)->spec.iso_2022.flags)
 463 #define CODING_ISO_DESIGNATION(coding, reg)     \
 464   ((coding)->spec.iso_2022.current_designation[reg])
 465 #define CODING_ISO_INVOCATION(coding, plane)    \
 466   ((coding)->spec.iso_2022.current_invocation[plane])
 467 #define CODING_ISO_SINGLE_SHIFTING(coding)      \
 468   ((coding)->spec.iso_2022.single_shifting)
 469 #define CODING_ISO_BOL(coding)  \
 470   ((coding)->spec.iso_2022.bol)
 471 #define CODING_ISO_INVOKED_CHARSET(coding, plane)       \
 472   CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
 473
 474 /* Control characters of ISO2022.  */
 475                         /* code */      /* function */
 476 #define ISO_CODE_LF     0x0A            /* line-feed */
 477 #define ISO_CODE_CR     0x0D            /* carriage-return */
 478 #define ISO_CODE_SO     0x0E            /* shift-out */
 479 #define ISO_CODE_SI     0x0F            /* shift-in */
 480 #define ISO_CODE_SS2_7  0x19            /* single-shift-2 for 7-bit code */
 481 #define ISO_CODE_ESC    0x1B            /* escape */
 482 #define ISO_CODE_SS2    0x8E            /* single-shift-2 */
 483 #define ISO_CODE_SS3    0x8F            /* single-shift-3 */
 484 #define ISO_CODE_CSI    0x9B            /* control-sequence-introducer */
 485
 486 /* All code (1-byte) of ISO2022 is classified into one of the
 487    followings.  */
 488 enum iso_code_class_type
 489   {
 490     ISO_control_0,              /* Control codes in the range
 491                                    0x00..0x1F and 0x7F, except for the
 492                                    following 5 codes.  */
 493     ISO_carriage_return,        /* ISO_CODE_CR (0x0D) */
 494     ISO_shift_out,              /* ISO_CODE_SO (0x0E) */
 495     ISO_shift_in,               /* ISO_CODE_SI (0x0F) */
 496     ISO_single_shift_2_7,       /* ISO_CODE_SS2_7 (0x19) */
 497     ISO_escape,                 /* ISO_CODE_SO (0x1B) */
 498     ISO_control_1,              /* Control codes in the range
 499                                    0x80..0x9F, except for the
 500                                    following 3 codes.  */
 501     ISO_single_shift_2,         /* ISO_CODE_SS2 (0x8E) */
 502     ISO_single_shift_3,         /* ISO_CODE_SS3 (0x8F) */
 503     ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
 504     ISO_0x20_or_0x7F,           /* Codes of the values 0x20 or 0x7F.  */
 505     ISO_graphic_plane_0,        /* Graphic codes in the range 0x21..0x7E.  */
 506     ISO_0xA0_or_0xFF,           /* Codes of the values 0xA0 or 0xFF.  */
 507     ISO_graphic_plane_1         /* Graphic codes in the range 0xA1..0xFE.  */
 508   };
 509
 510 /** The macros CODING_ISO_FLAG_XXX defines a flag bit of the
 511     `iso-flags' attribute of an iso2022 coding system.  */
 512
 513 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
 514    instead of the correct short-form sequence (e.g. ESC $ A).  */
 515 #define CODING_ISO_FLAG_LONG_FORM       0x0001
 516
 517 /* If set, reset graphic planes and registers at end-of-line to the
 518    initial state.  */
 519 #define CODING_ISO_FLAG_RESET_AT_EOL    0x0002
 520
 521 /* If set, reset graphic planes and registers before any control
 522    characters to the initial state.  */
 523 #define CODING_ISO_FLAG_RESET_AT_CNTL   0x0004
 524
 525 /* If set, encode by 7-bit environment.  */
 526 #define CODING_ISO_FLAG_SEVEN_BITS      0x0008
 527
 528 /* If set, use locking-shift function.  */
 529 #define CODING_ISO_FLAG_LOCKING_SHIFT   0x0010
 530
 531 /* If set, use single-shift function.  Overwrite
 532    CODING_ISO_FLAG_LOCKING_SHIFT.  */
 533 #define CODING_ISO_FLAG_SINGLE_SHIFT    0x0020
 534
 535 /* If set, use designation escape sequence.  */
 536 #define CODING_ISO_FLAG_DESIGNATION     0x0040
 537
 538 /* If set, produce revision number sequence.  */
 539 #define CODING_ISO_FLAG_REVISION        0x0080
 540
 541 /* If set, produce ISO6429's direction specifying sequence.  */
 542 #define CODING_ISO_FLAG_DIRECTION       0x0100
 543
 544 /* If set, assume designation states are reset at beginning of line on
 545    output.  */
 546 #define CODING_ISO_FLAG_INIT_AT_BOL     0x0200
 547
 548 /* If set, designation sequence should be placed at beginning of line
 549    on output.  */
 550 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
 551
 552 /* If set, do not encode unsafe charactes on output.  */
 553 #define CODING_ISO_FLAG_SAFE            0x0800
 554
 555 /* If set, extra latin codes (128..159) are accepted as a valid code
 556    on input.  */
 557 #define CODING_ISO_FLAG_LATIN_EXTRA     0x1000
 558
 559 #define CODING_ISO_FLAG_COMPOSITION     0x2000
 560
 561 #define CODING_ISO_FLAG_EUC_TW_SHIFT    0x4000
 562
 563 #define CODING_ISO_FLAG_USE_ROMAN       0x8000
 564
 565 #define CODING_ISO_FLAG_USE_OLDJIS      0x10000
 566
 567 #define CODING_ISO_FLAG_FULL_SUPPORT    0x100000
 568
 569 /* A character to be produced on output if encoding of the original
 570    character is prohibited by CODING_ISO_FLAG_SAFE.  */
 571 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION  '?'
 572
 573
 574 /* UTF-16 section */
 575 #define CODING_UTF_16_BOM(coding)       \
 576   ((coding)->spec.utf_16.bom)
 577
 578 #define CODING_UTF_16_ENDIAN(coding)    \
 579   ((coding)->spec.utf_16.endian)
 580
 581 #define CODING_UTF_16_SURROGATE(coding) \
 582   ((coding)->spec.utf_16.surrogate)
 583
 584
 585 /* CCL section */
 586 #define CODING_CCL_DECODER(coding)      \
 587   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
 588 #define CODING_CCL_ENCODER(coding)      \
 589   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
 590 #define CODING_CCL_VALIDS(coding)                                          \
 591   (XSTRING (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)) \
 592    ->data)
 593
 594 /* Index for each coding category in `coding_categories' */
 595
 596 enum coding_category
 597   {
 598     coding_category_iso_7,
 599     coding_category_iso_7_tight,
 600     coding_category_iso_8_1,
 601     coding_category_iso_8_2,
 602     coding_category_iso_7_else,
 603     coding_category_iso_8_else,
 604     coding_category_utf_8,
 605     coding_category_utf_16_auto,
 606     coding_category_utf_16_be,
 607     coding_category_utf_16_le,
 608     coding_category_utf_16_be_nosig,
 609     coding_category_utf_16_le_nosig,
 610     coding_category_charset,
 611     coding_category_sjis,
 612     coding_category_big5,
 613     coding_category_ccl,
 614     coding_category_emacs_mule,
 615     /* All above are targets of code detection.  */
 616     coding_category_raw_text,
 617     coding_category_undecided,
 618     coding_category_max
 619   };
 620
 621 /* Definitions of flag bits used in detect_coding_XXXX.  */
 622 #define CATEGORY_MASK_ISO_7             (1 << coding_category_iso_7)
 623 #define CATEGORY_MASK_ISO_7_TIGHT       (1 << coding_category_iso_7_tight)
 624 #define CATEGORY_MASK_ISO_8_1           (1 << coding_category_iso_8_1)
 625 #define CATEGORY_MASK_ISO_8_2           (1 << coding_category_iso_8_2)
 626 #define CATEGORY_MASK_ISO_7_ELSE        (1 << coding_category_iso_7_else)
 627 #define CATEGORY_MASK_ISO_8_ELSE        (1 << coding_category_iso_8_else)
 628 #define CATEGORY_MASK_UTF_8             (1 << coding_category_utf_8)
 629 #define CATEGORY_MASK_UTF_16_AUTO       (1 << coding_category_utf_16_auto)
 630 #define CATEGORY_MASK_UTF_16_BE         (1 << coding_category_utf_16_be)
 631 #define CATEGORY_MASK_UTF_16_LE         (1 << coding_category_utf_16_le)
 632 #define CATEGORY_MASK_UTF_16_BE_NOSIG   (1 << coding_category_utf_16_be_nosig)
 633 #define CATEGORY_MASK_UTF_16_LE_NOSIG   (1 << coding_category_utf_16_le_nosig)
 634 #define CATEGORY_MASK_CHARSET           (1 << coding_category_charset)
 635 #define CATEGORY_MASK_SJIS              (1 << coding_category_sjis)
 636 #define CATEGORY_MASK_BIG5              (1 << coding_category_big5)
 637 #define CATEGORY_MASK_CCL               (1 << coding_category_ccl)
 638 #define CATEGORY_MASK_EMACS_MULE        (1 << coding_category_emacs_mule)
 639 #define CATEGORY_MASK_RAW_TEXT          (1 << coding_category_raw_text)
 640
 641 /* This value is returned if detect_coding_mask () find nothing other
 642    than ASCII characters.  */
 643 #define CATEGORY_MASK_ANY               \
 644   (CATEGORY_MASK_ISO_7                  \
 645    | CATEGORY_MASK_ISO_7_TIGHT          \
 646    | CATEGORY_MASK_ISO_8_1              \
 647    | CATEGORY_MASK_ISO_8_2              \
 648    | CATEGORY_MASK_ISO_7_ELSE           \
 649    | CATEGORY_MASK_ISO_8_ELSE           \
 650    | CATEGORY_MASK_UTF_8                \
 651    | CATEGORY_MASK_UTF_16_BE            \
 652    | CATEGORY_MASK_UTF_16_LE            \
 653    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 654    | CATEGORY_MASK_UTF_16_LE_NOSIG      \
 655    | CATEGORY_MASK_CHARSET              \
 656    | CATEGORY_MASK_SJIS                 \
 657    | CATEGORY_MASK_BIG5                 \
 658    | CATEGORY_MASK_CCL                  \
 659    | CATEGORY_MASK_EMACS_MULE)
 660
 661
 662 #define CATEGORY_MASK_ISO_7BIT \
 663   (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
 664
 665 #define CATEGORY_MASK_ISO_8BIT \
 666   (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
 667
 668 #define CATEGORY_MASK_ISO_ELSE \
 669   (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
 670
 671 #define CATEGORY_MASK_ISO_ESCAPE        \
 672   (CATEGORY_MASK_ISO_7                  \
 673    | CATEGORY_MASK_ISO_7_TIGHT          \
 674    | CATEGORY_MASK_ISO_7_ELSE           \
 675    | CATEGORY_MASK_ISO_8_ELSE)
 676
 677 #define CATEGORY_MASK_ISO       \
 678   (  CATEGORY_MASK_ISO_7BIT     \
 679      | CATEGORY_MASK_ISO_8BIT   \
 680      | CATEGORY_MASK_ISO_ELSE)
 681
 682 #define CATEGORY_MASK_UTF_16            \
 683   (CATEGORY_MASK_UTF_16_BE              \
 684    | CATEGORY_MASK_UTF_16_LE            \
 685    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 686    | CATEGORY_MASK_UTF_16_LE_NOSIG)
 687
 688
 689 /* List of symbols `coding-category-xxx' ordered by priority.  This
 690    variable is exposed to Emacs Lisp.  */
 691 static Lisp_Object Vcoding_category_list;
 692
 693 /* Table of coding categories (Lisp symbols).  This variable is for
 694    internal use oly.  */
 695 static Lisp_Object Vcoding_category_table;
 696
 697 /* Table of coding-categories ordered by priority.  */
 698 static enum coding_category coding_priorities[coding_category_max];
 699
 700 /* Nth element is a coding context for the coding system bound to the
 701    Nth coding category.  */
 702 static struct coding_system coding_categories[coding_category_max];
 703
 704 /*** Commonly used macros and functions ***/
 705
 706 #ifndef min
 707 #define min(a, b) ((a) < (b) ? (a) : (b))
 708 #endif
 709 #ifndef max
 710 #define max(a, b) ((a) > (b) ? (a) : (b))
 711 #endif
 712
 713 #define CODING_GET_INFO(coding, attrs, eol_type, charset_list)  \
 714   do {                                                          \
 715     attrs = CODING_ID_ATTRS (coding->id);                       \
 716     eol_type = CODING_ID_EOL_TYPE (coding->id);                 \
 717     if (VECTORP (eol_type))                                     \
 718       eol_type = Qunix;                                         \
 719     charset_list = CODING_ATTR_CHARSET_LIST (attrs);            \
 720   } while (0)
 721
 722
 723 /* Safely get one byte from the source text pointed by SRC which ends
 724    at SRC_END, and set C to that byte.  If there are not enough bytes
 725    in the source, it jumps to `no_more_source'.  The caller
 726    should declare and set these variables appropriately in advance:
 727         src, src_end, multibytep
 728 */
 729
 730 #define ONE_MORE_BYTE(c)                                        \
 731   do {                                                          \
 732     if (src == src_end)                                         \
 733       {                                                         \
 734         if (src_base < src)                                     \
 735           coding->result = CODING_RESULT_INSUFFICIENT_SRC;      \
 736         goto no_more_source;                                    \
 737       }                                                         \
 738     c = *src++;                                                 \
 739     if (multibytep && (c & 0x80))                               \
 740       {                                                         \
 741         if ((c & 0xFE) != 0xC0)                                 \
 742           error ("Undecodable char found");                     \
 743         c = ((c & 1) << 6) | *src++;                            \
 744       }                                                         \
 745     consumed_chars++;                                           \
 746   } while (0)
 747
 748
 749 #define ONE_MORE_BYTE_NO_CHECK(c)               \
 750   do {                                          \
 751     c = *src++;                                 \
 752     if (multibytep && (c & 0x80))               \
 753       {                                         \
 754         if ((c & 0xFE) != 0xC0)                 \
 755           error ("Undecodable char found");     \
 756         c = ((c & 1) << 6) | *src++;            \
 757       }                                         \
 758     consumed_chars++;                           \
 759   } while (0)
 760
 761
 762 /* Store a byte C in the place pointed by DST and increment DST to the
 763    next free point, and increment PRODUCED_CHARS.  The caller should
 764    assure that C is 0..127, and declare and set the variable `dst'
 765    appropriately in advance.
 766 */
 767
 768
 769 #define EMIT_ONE_ASCII_BYTE(c)  \
 770   do {                          \
 771     produced_chars++;           \
 772     *dst++ = (c);               \
 773   } while (0)
 774
 775
 776 /* Like EMIT_ONE_ASCII_BYTE byt store two bytes; C1 and C2.  */
 777
 778 #define EMIT_TWO_ASCII_BYTES(c1, c2)    \
 779   do {                                  \
 780     produced_chars += 2;                \
 781     *dst++ = (c1), *dst++ = (c2);       \
 782   } while (0)
 783
 784
 785 /* Store a byte C in the place pointed by DST and increment DST to the
 786    next free point, and increment PRODUCED_CHARS.  If MULTIBYTEP is
 787    nonzero, store in an appropriate multibyte from.  The caller should
 788    declare and set the variables `dst' and `multibytep' appropriately
 789    in advance.  */
 790
 791 #define EMIT_ONE_BYTE(c)                \
 792   do {                                  \
 793     produced_chars++;                   \
 794     if (multibytep)                     \
 795       {                                 \
 796         int ch = (c);                   \
 797         if (ch >= 0x80)                 \
 798           ch = BYTE8_TO_CHAR (ch);      \
 799         CHAR_STRING_ADVANCE (ch, dst);  \
 800       }                                 \
 801     else                                \
 802       *dst++ = (c);                     \
 803   } while (0)
 804
 805
 806 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2.  */
 807
 808 #define EMIT_TWO_BYTES(c1, c2)          \
 809   do {                                  \
 810     produced_chars += 2;                \
 811     if (multibytep)                     \
 812       {                                 \
 813         int ch;                         \
 814                                         \
 815         ch = (c1);                      \
 816         if (ch >= 0x80)                 \
 817           ch = BYTE8_TO_CHAR (ch);      \
 818         CHAR_STRING_ADVANCE (ch, dst);  \
 819         ch = (c2);                      \
 820         if (ch >= 0x80)                 \
 821           ch = BYTE8_TO_CHAR (ch);      \
 822         CHAR_STRING_ADVANCE (ch, dst);  \
 823       }                                 \
 824     else                                \
 825       {                                 \
 826         *dst++ = (c1);                  \
 827         *dst++ = (c2);                  \
 828       }                                 \
 829   } while (0)
 830
 831
 832 #define EMIT_THREE_BYTES(c1, c2, c3)    \
 833   do {                                  \
 834     EMIT_ONE_BYTE (c1);                 \
 835     EMIT_TWO_BYTES (c2, c3);            \
 836   } while (0)
 837
 838
 839 #define EMIT_FOUR_BYTES(c1, c2, c3, c4)         \
 840   do {                                          \
 841     EMIT_TWO_BYTES (c1, c2);                    \
 842     EMIT_TWO_BYTES (c3, c4);                    \
 843   } while (0)
 844
 845
 846 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
 847   do {                                                                       \
 848     charset_map_loaded = 0;                                                  \
 849     c = DECODE_CHAR (charset, code);                                         \
 850     if (charset_map_loaded)                                                  \
 851       {                                                                      \
 852         unsigned char *orig = coding->source;                                \
 853         EMACS_INT offset;                                                    \
 854                                                                              \
 855         coding_set_source (coding);                                          \
 856         offset = coding->source - orig;                                      \
 857         src += offset;                                                       \
 858         src_base += offset;                                                  \
 859         src_end += offset;                                                   \
 860       }                                                                      \
 861   } while (0)
 862
 863
 864 #define ASSURE_DESTINATION(bytes)                               \
 865   do {                                                          \
 866     if (dst + (bytes) >= dst_end)                               \
 867       {                                                         \
 868         int more_bytes = charbuf_end - charbuf + (bytes);       \
 869                                                                 \
 870         dst = alloc_destination (coding, more_bytes, dst);      \
 871         dst_end = coding->destination + coding->dst_bytes;      \
 872       }                                                         \
 873   } while (0)
 874
 875
 876
 877 static void
 878 coding_set_source (coding)
 879      struct coding_system *coding;
 880 {
 881   if (BUFFERP (coding->src_object))
 882     {
 883       struct buffer *buf = XBUFFER (coding->src_object);
 884
 885       if (coding->src_pos < 0)
 886         coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
 887       else
 888         coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
 889     }
 890   else if (STRINGP (coding->src_object))
 891     {
 892       coding->source = (XSTRING (coding->src_object)->data
 893                         + coding->src_pos_byte);
 894     }
 895   else
 896     /* Otherwise, the source is C string and is never relocated
 897        automatically.  Thus we don't have to update anything.  */
 898     ;
 899 }
 900
 901 static void
 902 coding_set_destination (coding)
 903      struct coding_system *coding;
 904 {
 905   if (BUFFERP (coding->dst_object))
 906     {
 907       if (coding->src_pos < 0)
 908         {
 909           coding->destination = BEG_ADDR + coding->dst_pos_byte - 1;
 910           coding->dst_bytes = (GAP_END_ADDR
 911                                - (coding->src_bytes - coding->consumed)
 912                                - coding->destination);
 913         }
 914       else
 915         {
 916           /* We are sure that coding->dst_pos_byte is before the gap
 917              of the buffer. */
 918           coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
 919                                  + coding->dst_pos_byte - 1);
 920           coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
 921                                - coding->destination);
 922         }
 923     }
 924   else
 925     /* Otherwise, the destination is C string and is never relocated
 926        automatically.  Thus we don't have to update anything.  */
 927     ;
 928 }
 929
 930
 931 static void
 932 coding_alloc_by_realloc (coding, bytes)
 933      struct coding_system *coding;
 934      EMACS_INT bytes;
 935 {
 936   coding->destination = (unsigned char *) xrealloc (coding->destination,
 937                                                     coding->dst_bytes + bytes);
 938   coding->dst_bytes += bytes;
 939 }
 940
 941 static void
 942 coding_alloc_by_making_gap (coding, bytes)
 943      struct coding_system *coding;
 944      EMACS_INT bytes;
 945 {
 946   if (BUFFERP (coding->dst_object)
 947       && EQ (coding->src_object, coding->dst_object))
 948     {
 949       EMACS_INT add = coding->src_bytes - coding->consumed;
 950
 951       GAP_SIZE -= add; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
 952       make_gap (bytes);
 953       GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
 954     }
 955   else
 956     {
 957       Lisp_Object this_buffer;
 958
 959       this_buffer = Fcurrent_buffer ();
 960       set_buffer_internal (XBUFFER (coding->dst_object));
 961       make_gap (bytes);
 962       set_buffer_internal (XBUFFER (this_buffer));
 963     }
 964 }
 965
 966
 967 static unsigned char *
 968 alloc_destination (coding, nbytes, dst)
 969      struct coding_system *coding;
 970      int nbytes;
 971      unsigned char *dst;
 972 {
 973   EMACS_INT offset = dst - coding->destination;
 974
 975   if (BUFFERP (coding->dst_object))
 976     coding_alloc_by_making_gap (coding, nbytes);
 977   else
 978     coding_alloc_by_realloc (coding, nbytes);
 979   coding->result = CODING_RESULT_SUCCESS;
 980   coding_set_destination (coding);
 981   dst = coding->destination + offset;
 982   return dst;
 983 }
 984
 985 /** Macros for annotations.  */
 986
 987 /* Maximum length of annotation data (sum of annotations for
 988    composition and charset).  */
 989 #define MAX_ANNOTATION_LENGTH (5 + (MAX_COMPOSITION_COMPONENTS * 2) - 1 + 5)
 990
 991 /* An annotation data is stored in the array coding->charbuf in this
 992    format:
 993      [ -LENGTH ANNOTATION_MASK FROM TO ... ]
 994    LENGTH is the number of elements in the annotation.
 995    ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
 996    FROM and TO specify the range of text annotated.  They are relative
 997    to coding->src_pos (on encoding) or coding->dst_pos (on decoding).
 998
 999    The format of the following elements depend on ANNOTATION_MASK.
1000
1001    In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
1002    follows:
1003      ... METHOD [ COMPOSITION-COMPONENTS ... ]
1004    METHOD is one of enum composition_method.
1005    Optionnal COMPOSITION-COMPONENTS are characters and composition
1006    rules.
1007
1008    In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1009    follows.  */
1010
1011 #define ADD_ANNOTATION_DATA(buf, len, mask, from, to)   \
1012   do {                                                  \
1013     *(buf)++ = -(len);                                  \
1014     *(buf)++ = (mask);                                  \
1015     *(buf)++ = (from);                                  \
1016     *(buf)++ = (to);                                    \
1017     coding->annotated = 1;                              \
1018   } while (0);
1019
1020 #define ADD_COMPOSITION_DATA(buf, from, to, method)                           \
1021   do {                                                                        \
1022     ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, from, to); \
1023     *buf++ = method;                                                          \
1024   } while (0)
1025
1026
1027 #define ADD_CHARSET_DATA(buf, from, to, id)                               \
1028   do {                                                                    \
1029     ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_CHARSET_MASK, from, to); \
1030     *buf++ = id;                                                          \
1031   } while (0)
1032
1033 \f
1034 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1035
1036
1037
1038 \f
1039 /*** 3. UTF-8 ***/
1040
1041 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1042    Check if a text is encoded in UTF-8.  If it is, return 1, else
1043    return 0.  */
1044
1045 #define UTF_8_1_OCTET_P(c)         ((c) < 0x80)
1046 #define UTF_8_EXTRA_OCTET_P(c)     (((c) & 0xC0) == 0x80)
1047 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
1048 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
1049 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
1050 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
1051
1052 static int
1053 detect_coding_utf_8 (coding, detect_info)
1054      struct coding_system *coding;
1055      struct coding_detection_info *detect_info;
1056 {
1057   unsigned char *src = coding->source, *src_base = src;
1058   unsigned char *src_end = coding->source + coding->src_bytes;
1059   int multibytep = coding->src_multibyte;
1060   int consumed_chars = 0;
1061   int found = 0;
1062   int incomplete;
1063
1064   detect_info->checked |= CATEGORY_MASK_UTF_8;
1065   /* A coding system of this category is always ASCII compatible.  */
1066   src += coding->head_ascii;
1067
1068   while (1)
1069     {
1070       int c, c1, c2, c3, c4;
1071
1072       incomplete = 0;
1073       ONE_MORE_BYTE (c);
1074       if (UTF_8_1_OCTET_P (c))
1075         continue;
1076       incomplete = 1;
1077       ONE_MORE_BYTE (c1);
1078       if (! UTF_8_EXTRA_OCTET_P (c1))
1079         break;
1080       if (UTF_8_2_OCTET_LEADING_P (c))
1081         {
1082           found = CATEGORY_MASK_UTF_8;
1083           continue;
1084         }
1085       ONE_MORE_BYTE (c2);
1086       if (! UTF_8_EXTRA_OCTET_P (c2))
1087         break;
1088       if (UTF_8_3_OCTET_LEADING_P (c))
1089         {
1090           found = CATEGORY_MASK_UTF_8;
1091           continue;
1092         }
1093       ONE_MORE_BYTE (c3);
1094       if (! UTF_8_EXTRA_OCTET_P (c3))
1095         break;
1096       if (UTF_8_4_OCTET_LEADING_P (c))
1097         {
1098           found = CATEGORY_MASK_UTF_8;
1099           continue;
1100         }
1101       ONE_MORE_BYTE (c4);
1102       if (! UTF_8_EXTRA_OCTET_P (c4))
1103         break;
1104       if (UTF_8_5_OCTET_LEADING_P (c))
1105         {
1106           found = CATEGORY_MASK_UTF_8;
1107           continue;
1108         }
1109       break;
1110     }
1111   detect_info->rejected |= CATEGORY_MASK_UTF_8;
1112   return 0;
1113
1114  no_more_source:
1115   if (incomplete && coding->mode & CODING_MODE_LAST_BLOCK)
1116     {
1117       detect_info->rejected |= CATEGORY_MASK_UTF_8;
1118       return 0;
1119     }
1120   detect_info->found |= found;
1121   return 1;
1122 }
1123
1124
1125 static void
1126 decode_coding_utf_8 (coding)
1127      struct coding_system *coding;
1128 {
1129   unsigned char *src = coding->source + coding->consumed;
1130   unsigned char *src_end = coding->source + coding->src_bytes;
1131   unsigned char *src_base;
1132   int *charbuf = coding->charbuf;
1133   int *charbuf_end = charbuf + coding->charbuf_size;
1134   int consumed_chars = 0, consumed_chars_base;
1135   int multibytep = coding->src_multibyte;
1136   Lisp_Object attr, eol_type, charset_list;
1137
1138   CODING_GET_INFO (coding, attr, eol_type, charset_list);
1139
1140   while (1)
1141     {
1142       int c, c1, c2, c3, c4, c5;
1143
1144       src_base = src;
1145       consumed_chars_base = consumed_chars;
1146
1147       if (charbuf >= charbuf_end)
1148         break;
1149
1150       ONE_MORE_BYTE (c1);
1151       if (UTF_8_1_OCTET_P(c1))
1152         {
1153           c = c1;
1154           if (c == '\r')
1155             {
1156               if (EQ (eol_type, Qdos))
1157                 {
1158                   if (src == src_end)
1159                     {
1160                       coding->result = CODING_RESULT_INSUFFICIENT_SRC;
1161                       goto no_more_source;
1162                     }
1163                   if (*src == '\n')
1164                     ONE_MORE_BYTE (c);
1165                 }
1166               else if (EQ (eol_type, Qmac))
1167                 c = '\n';
1168             }
1169         }
1170       else
1171         {
1172           ONE_MORE_BYTE (c2);
1173           if (! UTF_8_EXTRA_OCTET_P (c2))
1174             goto invalid_code;
1175           if (UTF_8_2_OCTET_LEADING_P (c1))
1176             {
1177               c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1178               /* Reject overlong sequences here and below.  Encoders
1179                  producing them are incorrect, they can be misleading,
1180                  and they mess up read/write invariance.  */
1181               if (c < 128)
1182                 goto invalid_code;
1183             }
1184           else
1185             {
1186               ONE_MORE_BYTE (c3);
1187               if (! UTF_8_EXTRA_OCTET_P (c3))
1188                 goto invalid_code;
1189               if (UTF_8_3_OCTET_LEADING_P (c1))
1190                 {
1191                   c = (((c1 & 0xF) << 12)
1192                        | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
1193                   if (c < 0x800
1194                       || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
1195                     goto invalid_code;
1196                 }
1197               else
1198                 {
1199                   ONE_MORE_BYTE (c4);
1200                   if (! UTF_8_EXTRA_OCTET_P (c4))
1201                     goto invalid_code;
1202                   if (UTF_8_4_OCTET_LEADING_P (c1))
1203                     {
1204                     c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1205                          | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
1206                     if (c < 0x10000)
1207                       goto invalid_code;
1208                     }
1209                   else
1210                     {
1211                       ONE_MORE_BYTE (c5);
1212                       if (! UTF_8_EXTRA_OCTET_P (c5))
1213                         goto invalid_code;
1214                       if (UTF_8_5_OCTET_LEADING_P (c1))
1215                         {
1216                           c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1217                                | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1218                                | (c5 & 0x3F));
1219                           if ((c > MAX_CHAR) || (c < 0x200000))
1220                             goto invalid_code;
1221                         }
1222                       else
1223                         goto invalid_code;
1224                     }
1225                 }
1226             }
1227         }
1228
1229       *charbuf++ = c;
1230       continue;
1231
1232     invalid_code:
1233       src = src_base;
1234       consumed_chars = consumed_chars_base;
1235       ONE_MORE_BYTE (c);
1236       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1237       coding->errors++;
1238     }
1239
1240  no_more_source:
1241   coding->consumed_char += consumed_chars_base;
1242   coding->consumed = src_base - coding->source;
1243   coding->charbuf_used = charbuf - coding->charbuf;
1244 }
1245
1246
1247 static int
1248 encode_coding_utf_8 (coding)
1249      struct coding_system *coding;
1250 {
1251   int multibytep = coding->dst_multibyte;
1252   int *charbuf = coding->charbuf;
1253   int *charbuf_end = charbuf + coding->charbuf_used;
1254   unsigned char *dst = coding->destination + coding->produced;
1255   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1256   int produced_chars = 0;
1257   int c;
1258
1259   if (multibytep)
1260     {
1261       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1262
1263       while (charbuf < charbuf_end)
1264         {
1265           unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1266
1267           ASSURE_DESTINATION (safe_room);
1268           c = *charbuf++;
1269           if (CHAR_BYTE8_P (c))
1270             {
1271               c = CHAR_TO_BYTE8 (c);
1272               EMIT_ONE_BYTE (c);
1273             }
1274           else
1275             {
1276               CHAR_STRING_ADVANCE (c, pend);
1277               for (p = str; p < pend; p++)
1278                 EMIT_ONE_BYTE (*p);
1279             }
1280         }
1281     }
1282   else
1283     {
1284       int safe_room = MAX_MULTIBYTE_LENGTH;
1285
1286       while (charbuf < charbuf_end)
1287         {
1288           ASSURE_DESTINATION (safe_room);
1289           c = *charbuf++;
1290           dst += CHAR_STRING (c, dst);
1291           produced_chars++;
1292         }
1293     }
1294   coding->result = CODING_RESULT_SUCCESS;
1295   coding->produced_char += produced_chars;
1296   coding->produced = dst - coding->destination;
1297   return 0;
1298 }
1299
1300
1301 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1302    Check if a text is encoded in one of UTF-16 based coding systems.
1303    If it is, return 1, else return 0.  */
1304
1305 #define UTF_16_HIGH_SURROGATE_P(val) \
1306   (((val) & 0xFC00) == 0xD800)
1307
1308 #define UTF_16_LOW_SURROGATE_P(val) \
1309   (((val) & 0xFC00) == 0xDC00)
1310
1311 #define UTF_16_INVALID_P(val)   \
1312   (((val) == 0xFFFE)            \
1313    || ((val) == 0xFFFF)         \
1314    || UTF_16_LOW_SURROGATE_P (val))
1315
1316
1317 static int
1318 detect_coding_utf_16 (coding, detect_info)
1319      struct coding_system *coding;
1320      struct coding_detection_info *detect_info;
1321 {
1322   unsigned char *src = coding->source, *src_base = src;
1323   unsigned char *src_end = coding->source + coding->src_bytes;
1324   int multibytep = coding->src_multibyte;
1325   int consumed_chars = 0;
1326   int c1, c2;
1327
1328   detect_info->checked |= CATEGORY_MASK_UTF_16;
1329
1330   if (coding->mode & CODING_MODE_LAST_BLOCK
1331       && (coding->src_bytes & 1))
1332     {
1333       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1334       return 0;
1335     }
1336   ONE_MORE_BYTE (c1);
1337   ONE_MORE_BYTE (c2);
1338
1339   if ((c1 == 0xFF) && (c2 == 0xFE))
1340     {
1341       detect_info->found |= (CATEGORY_MASK_UTF_16_LE
1342                              | CATEGORY_MASK_UTF_16_AUTO);
1343       detect_info->rejected |= CATEGORY_MASK_UTF_16_BE;
1344     }
1345   else if ((c1 == 0xFE) && (c2 == 0xFF))
1346     {
1347       detect_info->found |= (CATEGORY_MASK_UTF_16_BE
1348                              | CATEGORY_MASK_UTF_16_AUTO);
1349       detect_info->rejected |= CATEGORY_MASK_UTF_16_LE;
1350     }
1351  no_more_source:
1352   return 1;
1353 }
1354
1355 static void
1356 decode_coding_utf_16 (coding)
1357      struct coding_system *coding;
1358 {
1359   unsigned char *src = coding->source + coding->consumed;
1360   unsigned char *src_end = coding->source + coding->src_bytes;
1361   unsigned char *src_base;
1362   int *charbuf = coding->charbuf;
1363   int *charbuf_end = charbuf + coding->charbuf_size;
1364   int consumed_chars = 0, consumed_chars_base;
1365   int multibytep = coding->src_multibyte;
1366   enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding);
1367   enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1368   int surrogate = CODING_UTF_16_SURROGATE (coding);
1369   Lisp_Object attr, eol_type, charset_list;
1370
1371   CODING_GET_INFO (coding, attr, eol_type, charset_list);
1372
1373   if (bom == utf_16_with_bom)
1374     {
1375       int c, c1, c2;
1376
1377       src_base = src;
1378       ONE_MORE_BYTE (c1);
1379       ONE_MORE_BYTE (c2);
1380       c = (c1 << 8) | c2;
1381
1382       if (endian == utf_16_big_endian
1383           ? c != 0xFEFF : c != 0xFFFE)
1384         {
1385           /* The first two bytes are not BOM.  Treat them as bytes
1386              for a normal character.  */
1387           src = src_base;
1388           coding->errors++;
1389         }
1390       CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1391     }
1392   else if (bom == utf_16_detect_bom)
1393     {
1394       /* We have already tried to detect BOM and failed in
1395          detect_coding.  */
1396       CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1397     }
1398
1399   while (1)
1400     {
1401       int c, c1, c2;
1402
1403       src_base = src;
1404       consumed_chars_base = consumed_chars;
1405
1406       if (charbuf + 2 >= charbuf_end)
1407         break;
1408
1409       ONE_MORE_BYTE (c1);
1410       ONE_MORE_BYTE (c2);
1411       c = (endian == utf_16_big_endian
1412            ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1413       if (surrogate)
1414         {
1415           if (! UTF_16_LOW_SURROGATE_P (c))
1416             {
1417               if (endian == utf_16_big_endian)
1418                 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1419               else
1420                 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1421               *charbuf++ = c1;
1422               *charbuf++ = c2;
1423               coding->errors++;
1424               if (UTF_16_HIGH_SURROGATE_P (c))
1425                 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1426               else
1427                 *charbuf++ = c;
1428             }
1429           else
1430             {
1431               c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1432               CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1433               *charbuf++ = c;
1434             }
1435         }
1436       else
1437         {
1438           if (UTF_16_HIGH_SURROGATE_P (c))
1439             CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1440           else
1441             *charbuf++ = c;
1442         }
1443     }
1444
1445  no_more_source:
1446   coding->consumed_char += consumed_chars_base;
1447   coding->consumed = src_base - coding->source;
1448   coding->charbuf_used = charbuf - coding->charbuf;
1449 }
1450
1451 static int
1452 encode_coding_utf_16 (coding)
1453      struct coding_system *coding;
1454 {
1455   int multibytep = coding->dst_multibyte;
1456   int *charbuf = coding->charbuf;
1457   int *charbuf_end = charbuf + coding->charbuf_used;
1458   unsigned char *dst = coding->destination + coding->produced;
1459   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1460   int safe_room = 8;
1461   enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding);
1462   int big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1463   int produced_chars = 0;
1464   Lisp_Object attrs, eol_type, charset_list;
1465   int c;
1466
1467   CODING_GET_INFO (coding, attrs, eol_type, charset_list);
1468
1469   if (bom != utf_16_without_bom)
1470     {
1471       ASSURE_DESTINATION (safe_room);
1472       if (big_endian)
1473         EMIT_TWO_BYTES (0xFE, 0xFF);
1474       else
1475         EMIT_TWO_BYTES (0xFF, 0xFE);
1476       CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1477     }
1478
1479   while (charbuf < charbuf_end)
1480     {
1481       ASSURE_DESTINATION (safe_room);
1482       c = *charbuf++;
1483       if (c >= MAX_UNICODE_CHAR)
1484         c = coding->default_char;
1485
1486       if (c < 0x10000)
1487         {
1488           if (big_endian)
1489             EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1490           else
1491             EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1492         }
1493       else
1494         {
1495           int c1, c2;
1496
1497           c -= 0x10000;
1498           c1 = (c >> 10) + 0xD800;
1499           c2 = (c & 0x3FF) + 0xDC00;
1500           if (big_endian)
1501             EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1502           else
1503             EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1504         }
1505     }
1506   coding->result = CODING_RESULT_SUCCESS;
1507   coding->produced = dst - coding->destination;
1508   coding->produced_char += produced_chars;
1509   return 0;
1510 }
1511
1512 \f
1513 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1514
1515 /* Emacs' internal format for representation of multiple character
1516    sets is a kind of multi-byte encoding, i.e. characters are
1517    represented by variable-length sequences of one-byte codes.
1518
1519    ASCII characters and control characters (e.g. `tab', `newline') are
1520    represented by one-byte sequences which are their ASCII codes, in
1521    the range 0x00 through 0x7F.
1522
1523    8-bit characters of the range 0x80..0x9F are represented by
1524    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1525    code + 0x20).
1526
1527    8-bit characters of the range 0xA0..0xFF are represented by
1528    one-byte sequences which are their 8-bit code.
1529
1530    The other characters are represented by a sequence of `base
1531    leading-code', optional `extended leading-code', and one or two
1532    `position-code's.  The length of the sequence is determined by the
1533    base leading-code.  Leading-code takes the range 0x81 through 0x9D,
1534    whereas extended leading-code and position-code take the range 0xA0
1535    through 0xFF.  See `charset.h' for more details about leading-code
1536    and position-code.
1537
1538    --- CODE RANGE of Emacs' internal format ---
1539    character set        range
1540    -------------        -----
1541    ascii                0x00..0x7F
1542    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1543    eight-bit-graphic    0xA0..0xBF
1544    ELSE                 0x81..0x9D + [0xA0..0xFF]+
1545    ---------------------------------------------
1546
1547    As this is the internal character representation, the format is
1548    usually not used externally (i.e. in a file or in a data sent to a
1549    process).  But, it is possible to have a text externally in this
1550    format (i.e. by encoding by the coding system `emacs-mule').
1551
1552    In that case, a sequence of one-byte codes has a slightly different
1553    form.
1554
1555    At first, all characters in eight-bit-control are represented by
1556    one-byte sequences which are their 8-bit code.
1557
1558    Next, character composition data are represented by the byte
1559    sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1560    where,
1561         METHOD is 0xF0 plus one of composition method (enum
1562         composition_method),
1563
1564         BYTES is 0xA0 plus a byte length of this composition data,
1565
1566         CHARS is 0x20 plus a number of characters composed by this
1567         data,
1568
1569         COMPONENTs are characters of multibye form or composition
1570         rules encoded by two-byte of ASCII codes.
1571
1572    In addition, for backward compatibility, the following formats are
1573    also recognized as composition data on decoding.
1574
1575    0x80 MSEQ ...
1576    0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1577
1578    Here,
1579         MSEQ is a multibyte form but in these special format:
1580           ASCII: 0xA0 ASCII_CODE+0x80,
1581           other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1582         RULE is a one byte code of the range 0xA0..0xF0 that
1583         represents a composition rule.
1584   */
1585
1586 char emacs_mule_bytes[256];
1587
1588 int
1589 emacs_mule_char (coding, src, nbytes, nchars, id)
1590      struct coding_system *coding;
1591      unsigned char *src;
1592      int *nbytes, *nchars, *id;
1593 {
1594   unsigned char *src_end = coding->source + coding->src_bytes;
1595   int multibytep = coding->src_multibyte;
1596   unsigned char *src_base = src;
1597   struct charset *charset;
1598   unsigned code;
1599   int c;
1600   int consumed_chars = 0;
1601
1602   ONE_MORE_BYTE (c);
1603   switch (emacs_mule_bytes[c])
1604     {
1605     case 2:
1606       if (! (charset = emacs_mule_charset[c]))
1607         goto invalid_code;
1608       ONE_MORE_BYTE (c);
1609       code = c & 0x7F;
1610       break;
1611
1612     case 3:
1613       if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
1614           || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
1615         {
1616           ONE_MORE_BYTE (c);
1617           if (! (charset = emacs_mule_charset[c]))
1618             goto invalid_code;
1619           ONE_MORE_BYTE (c);
1620           code = c & 0x7F;
1621         }
1622       else
1623         {
1624           if (! (charset = emacs_mule_charset[c]))
1625             goto invalid_code;
1626           ONE_MORE_BYTE (c);
1627           code = (c & 0x7F) << 8;
1628           ONE_MORE_BYTE (c);
1629           code |= c & 0x7F;
1630         }
1631       break;
1632
1633     case 4:
1634       ONE_MORE_BYTE (c);
1635       if (! (charset = emacs_mule_charset[c]))
1636         goto invalid_code;
1637       ONE_MORE_BYTE (c);
1638       code = (c & 0x7F) << 8;
1639       ONE_MORE_BYTE (c);
1640       code |= c & 0x7F;
1641       break;
1642
1643     case 1:
1644       code = c;
1645       charset = CHARSET_FROM_ID (ASCII_BYTE_P (code)
1646                                  ? charset_ascii : charset_eight_bit);
1647       break;
1648
1649     default:
1650       abort ();
1651     }
1652   c = DECODE_CHAR (charset, code);
1653   if (c < 0)
1654     goto invalid_code;
1655   *nbytes = src - src_base;
1656   *nchars = consumed_chars;
1657   if (id)
1658     *id = charset->id;
1659   return c;
1660
1661  no_more_source:
1662   return -2;
1663
1664  invalid_code:
1665   return -1;
1666 }
1667
1668
1669 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1670    Check if a text is encoded in `emacs-mule'.  If it is, return 1,
1671    else return 0.  */
1672
1673 static int
1674 detect_coding_emacs_mule (coding, detect_info)
1675      struct coding_system *coding;
1676      struct coding_detection_info *detect_info;
1677 {
1678   unsigned char *src = coding->source, *src_base = src;
1679   unsigned char *src_end = coding->source + coding->src_bytes;
1680   int multibytep = coding->src_multibyte;
1681   int consumed_chars = 0;
1682   int c;
1683   int found = 0;
1684   int incomplete;
1685
1686   detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1687   /* A coding system of this category is always ASCII compatible.  */
1688   src += coding->head_ascii;
1689
1690   while (1)
1691     {
1692       incomplete = 0;
1693       ONE_MORE_BYTE (c);
1694       incomplete = 1;
1695
1696       if (c == 0x80)
1697         {
1698           /* Perhaps the start of composite character.  We simple skip
1699              it because analyzing it is too heavy for detecting.  But,
1700              at least, we check that the composite character
1701              constitues of more than 4 bytes.  */
1702           unsigned char *src_base;
1703
1704         repeat:
1705           src_base = src;
1706           do
1707             {
1708               ONE_MORE_BYTE (c);
1709             }
1710           while (c >= 0xA0);
1711
1712           if (src - src_base <= 4)
1713             break;
1714           found = CATEGORY_MASK_EMACS_MULE;
1715           if (c == 0x80)
1716             goto repeat;
1717         }
1718
1719       if (c < 0x80)
1720         {
1721           if (c < 0x20
1722               && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
1723             break;
1724         }
1725       else
1726         {
1727           unsigned char *src_base = src - 1;
1728
1729           do
1730             {
1731               ONE_MORE_BYTE (c);
1732             }
1733           while (c >= 0xA0);
1734           if (src - src_base != emacs_mule_bytes[*src_base])
1735             break;
1736           found = CATEGORY_MASK_EMACS_MULE;
1737         }
1738     }
1739   detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1740   return 0;
1741
1742  no_more_source:
1743   if (incomplete && coding->mode & CODING_MODE_LAST_BLOCK)
1744     {
1745       detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1746       return 0;
1747     }
1748   detect_info->found |= found;
1749   return 1;
1750 }
1751
1752
1753 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
1754
1755 /* Decode a character represented as a component of composition
1756    sequence of Emacs 20/21 style at SRC.  Set C to that character and
1757    update SRC to the head of next character (or an encoded composition
1758    rule).  If SRC doesn't points a composition component, set C to -1.
1759    If SRC points an invalid byte sequence, global exit by a return
1760    value 0.  */
1761
1762 #define DECODE_EMACS_MULE_COMPOSITION_CHAR(buf)                 \
1763   if (1)                                                        \
1764     {                                                           \
1765       int c;                                                    \
1766       int nbytes, nchars;                                       \
1767                                                                 \
1768       if (src == src_end)                                       \
1769         break;                                                  \
1770       c = emacs_mule_char (coding, src, &nbytes, &nchars, NULL);\
1771       if (c < 0)                                                \
1772         {                                                       \
1773           if (c == -2)                                          \
1774             break;                                              \
1775           goto invalid_code;                                    \
1776         }                                                       \
1777       *buf++ = c;                                               \
1778       src += nbytes;                                            \
1779       consumed_chars += nchars;                                 \
1780     }                                                           \
1781   else
1782
1783
1784 /* Decode a composition rule represented as a component of composition
1785    sequence of Emacs 20 style at SRC.  Store the decoded rule in *BUF,
1786    and increment BUF.  If SRC points an invalid byte sequence, set C
1787    to -1.  */
1788
1789 #define DECODE_EMACS_MULE_COMPOSITION_RULE_20(buf)      \
1790   do {                                                  \
1791     int c, gref, nref;                                  \
1792                                                         \
1793     if (src >= src_end)                                 \
1794       goto invalid_code;                                \
1795     ONE_MORE_BYTE_NO_CHECK (c);                         \
1796     c -= 0x20;                                          \
1797     if (c < 0 || c >= 81)                               \
1798       goto invalid_code;                                \
1799                                                         \
1800     gref = c / 9, nref = c % 9;                         \
1801     *buf++ = COMPOSITION_ENCODE_RULE (gref, nref);      \
1802   } while (0)
1803
1804
1805 /* Decode a composition rule represented as a component of composition
1806    sequence of Emacs 21 style at SRC.  Store the decoded rule in *BUF,
1807    and increment BUF.  If SRC points an invalid byte sequence, set C
1808    to -1.  */
1809
1810 #define DECODE_EMACS_MULE_COMPOSITION_RULE_21(buf)      \
1811   do {                                                  \
1812     int gref, nref;                                     \
1813                                                         \
1814     if (src + 1>= src_end)                              \
1815       goto invalid_code;                                \
1816     ONE_MORE_BYTE_NO_CHECK (gref);                      \
1817     gref -= 0x20;                                       \
1818     ONE_MORE_BYTE_NO_CHECK (nref);                      \
1819     nref -= 0x20;                                       \
1820     if (gref < 0 || gref >= 81                          \
1821         || nref < 0 || nref >= 81)                      \
1822       goto invalid_code;                                \
1823     *buf++ = COMPOSITION_ENCODE_RULE (gref, nref);      \
1824   } while (0)
1825
1826
1827 #define DECODE_EMACS_MULE_21_COMPOSITION(c)                             \
1828   do {                                                                  \
1829     /* Emacs 21 style format.  The first three bytes at SRC are         \
1830        (METHOD - 0xF2), (BYTES - 0xA0), (CHARS - 0xA0), where BYTES is  \
1831        the byte length of this composition information, CHARS is the    \
1832        number of characters composed by this composition.  */           \
1833     enum composition_method method = c - 0xF2;                          \
1834     int *charbuf_base = charbuf;                                        \
1835     int from, to;                                                       \
1836     int consumed_chars_limit;                                           \
1837     int nbytes, nchars;                                                 \
1838                                                                         \
1839     ONE_MORE_BYTE (c);                                                  \
1840     nbytes = c - 0xA0;                                                  \
1841     if (nbytes < 3)                                                     \
1842       goto invalid_code;                                                \
1843     ONE_MORE_BYTE (c);                                                  \
1844     nchars = c - 0xA0;                                                  \
1845     from = coding->produced + char_offset;                              \
1846     to = from + nchars;                                                 \
1847     ADD_COMPOSITION_DATA (charbuf, from, to, method);                   \
1848     consumed_chars_limit = consumed_chars_base + nbytes;                \
1849     if (method != COMPOSITION_RELATIVE)                                 \
1850       {                                                                 \
1851         int i = 0;                                                      \
1852         while (consumed_chars < consumed_chars_limit)                   \
1853           {                                                             \
1854             if (i % 2 && method != COMPOSITION_WITH_ALTCHARS)           \
1855               DECODE_EMACS_MULE_COMPOSITION_RULE_21 (charbuf);          \
1856             else                                                        \
1857               DECODE_EMACS_MULE_COMPOSITION_CHAR (charbuf);             \
1858             i++;                                                        \
1859           }                                                             \
1860         if (consumed_chars < consumed_chars_limit)                      \
1861           goto invalid_code;                                            \
1862         charbuf_base[0] -= i;                                           \
1863       }                                                                 \
1864   } while (0)
1865
1866
1867 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION(c)            \
1868   do {                                                          \
1869     /* Emacs 20 style format for relative composition.  */      \
1870     /* Store multibyte form of characters to be composed.  */   \
1871     enum composition_method method = COMPOSITION_RELATIVE;      \
1872     int components[MAX_COMPOSITION_COMPONENTS * 2 - 1];         \
1873     int *buf = components;                                      \
1874     int i, j;                                                   \
1875     int from, to;                                               \
1876                                                                 \
1877     src = src_base;                                             \
1878     ONE_MORE_BYTE (c);          /* skip 0x80 */                 \
1879     for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++)            \
1880       DECODE_EMACS_MULE_COMPOSITION_CHAR (buf);                 \
1881     if (i < 2)                                                  \
1882       goto invalid_code;                                        \
1883     from = coding->produced_char + char_offset;                 \
1884     to = from + i;                                              \
1885     ADD_COMPOSITION_DATA (charbuf, from, to, method);           \
1886     for (j = 0; j < i; j++)                                     \
1887       *charbuf++ = components[j];                               \
1888   } while (0)
1889
1890
1891 #define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION(c)            \
1892   do {                                                          \
1893     /* Emacs 20 style format for rule-base composition.  */     \
1894     /* Store multibyte form of characters to be composed.  */   \
1895     enum composition_method method = COMPOSITION_WITH_RULE;     \
1896     int components[MAX_COMPOSITION_COMPONENTS * 2 - 1];         \
1897     int *buf = components;                                      \
1898     int i, j;                                                   \
1899     int from, to;                                               \
1900                                                                 \
1901     DECODE_EMACS_MULE_COMPOSITION_CHAR (buf);                   \
1902     for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++)            \
1903       {                                                         \
1904         DECODE_EMACS_MULE_COMPOSITION_RULE_20 (buf);            \
1905         DECODE_EMACS_MULE_COMPOSITION_CHAR (buf);               \
1906       }                                                         \
1907     if (i < 1 || (buf - components) % 2 == 0)                   \
1908       goto invalid_code;                                        \
1909     if (charbuf + i + (i / 2) + 1 < charbuf_end)                \
1910       goto no_more_source;                                      \
1911     from = coding->produced_char + char_offset;                 \
1912     to = from + i;                                              \
1913     ADD_COMPOSITION_DATA (buf, from, to, method);               \
1914     for (j = 0; j < i; j++)                                     \
1915       *charbuf++ = components[j];                               \
1916     for (j = 0; j < i; j += 2)                                  \
1917       *charbuf++ = components[j];                               \
1918   } while (0)
1919
1920
1921 static void
1922 decode_coding_emacs_mule (coding)
1923      struct coding_system *coding;
1924 {
1925   unsigned char *src = coding->source + coding->consumed;
1926   unsigned char *src_end = coding->source + coding->src_bytes;
1927   unsigned char *src_base;
1928   int *charbuf = coding->charbuf;
1929   int *charbuf_end = charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
1930   int consumed_chars = 0, consumed_chars_base;
1931   int multibytep = coding->src_multibyte;
1932   Lisp_Object attrs, eol_type, charset_list;
1933   int char_offset = coding->produced_char;
1934   int last_offset = char_offset;
1935   int last_id = charset_ascii;
1936
1937   CODING_GET_INFO (coding, attrs, eol_type, charset_list);
1938
1939   while (1)
1940     {
1941       int c;
1942
1943       src_base = src;
1944       consumed_chars_base = consumed_chars;
1945
1946       if (charbuf >= charbuf_end)
1947         break;
1948
1949       ONE_MORE_BYTE (c);
1950
1951       if (c < 0x80)
1952         {
1953           if (c == '\r')
1954             {
1955               if (EQ (eol_type, Qdos))
1956                 {
1957                   if (src == src_end)
1958                     {
1959                       coding->result = CODING_RESULT_INSUFFICIENT_SRC;
1960                       goto no_more_source;
1961                     }
1962                   if (*src == '\n')
1963                     ONE_MORE_BYTE (c);
1964                 }
1965               else if (EQ (eol_type, Qmac))
1966                 c = '\n';
1967             }
1968           *charbuf++ = c;
1969           char_offset++;
1970         }
1971       else if (c == 0x80)
1972         {
1973           ONE_MORE_BYTE (c);
1974           if (c - 0xF2 >= COMPOSITION_RELATIVE
1975               && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)
1976             DECODE_EMACS_MULE_21_COMPOSITION (c);
1977           else if (c < 0xC0)
1978             DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION (c);
1979           else if (c == 0xFF)
1980             DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION (c);
1981           else
1982             goto invalid_code;
1983         }
1984       else if (c < 0xA0 && emacs_mule_bytes[c] > 1)
1985         {
1986           int nbytes, nchars;
1987           int id;
1988
1989           src = src_base;
1990           consumed_chars = consumed_chars_base;
1991           c = emacs_mule_char (coding, src, &nbytes, &nchars, &id);
1992           if (c < 0)
1993             {
1994               if (c == -2)
1995                 break;
1996               goto invalid_code;
1997             }
1998           if (last_id != id)
1999             {
2000               if (last_id != charset_ascii)
2001                 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
2002               last_id = id;
2003               last_offset = char_offset;
2004             }
2005           *charbuf++ = c;
2006           src += nbytes;
2007           consumed_chars += nchars;
2008           char_offset++;
2009         }
2010       continue;
2011
2012     invalid_code:
2013       src = src_base;
2014       consumed_chars = consumed_chars_base;
2015       ONE_MORE_BYTE (c);
2016       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
2017       char_offset++;
2018       coding->errors++;
2019     }
2020
2021  no_more_source:
2022   if (last_id != charset_ascii)
2023     ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
2024   coding->consumed_char += consumed_chars_base;
2025   coding->consumed = src_base - coding->source;
2026   coding->charbuf_used = charbuf - coding->charbuf;
2027 }
2028
2029
2030 #define EMACS_MULE_LEADING_CODES(id, codes)     \
2031   do {                                          \
2032     if (id < 0xA0)                              \
2033       codes[0] = id, codes[1] = 0;              \
2034     else if (id < 0xE0)                         \
2035       codes[0] = 0x9A, codes[1] = id;           \
2036     else if (id < 0xF0)                         \
2037       codes[0] = 0x9B, codes[1] = id;           \
2038     else if (id < 0xF5)                         \
2039       codes[0] = 0x9C, codes[1] = id;           \
2040     else                                        \
2041       codes[0] = 0x9D, codes[1] = id;           \
2042   } while (0);
2043
2044
2045 static int
2046 encode_coding_emacs_mule (coding)
2047      struct coding_system *coding;
2048 {
2049   int multibytep = coding->dst_multibyte;
2050   int *charbuf = coding->charbuf;
2051   int *charbuf_end = charbuf + coding->charbuf_used;
2052   unsigned char *dst = coding->destination + coding->produced;
2053   unsigned char *dst_end = coding->destination + coding->dst_bytes;
2054   int safe_room = 8;
2055   int produced_chars = 0;
2056   Lisp_Object attrs, eol_type, charset_list;
2057   int c;
2058   int preferred_charset_id = -1;
2059
2060   CODING_GET_INFO (coding, attrs, eol_type, charset_list);
2061
2062   while (charbuf < charbuf_end)
2063     {
2064       ASSURE_DESTINATION (safe_room);
2065       c = *charbuf++;
2066
2067       if (c < 0)
2068         {
2069           /* Handle an annotation.  */
2070           switch (*charbuf)
2071             {
2072             case CODING_ANNOTATE_COMPOSITION_MASK:
2073               /* Not yet implemented.  */
2074               break;
2075             case CODING_ANNOTATE_CHARSET_MASK:
2076               preferred_charset_id = charbuf[3];
2077               if (preferred_charset_id >= 0
2078                   && NILP (Fmemq (make_number (preferred_charset_id),
2079                                   charset_list)))
2080                 preferred_charset_id = -1;
2081               break;
2082             default:
2083               abort ();
2084             }
2085           charbuf += -c - 1;
2086           continue;
2087         }
2088
2089       if (ASCII_CHAR_P (c))
2090         EMIT_ONE_ASCII_BYTE (c);
2091       else if (CHAR_BYTE8_P (c))
2092         {
2093           c = CHAR_TO_BYTE8 (c);
2094           EMIT_ONE_BYTE (c);
2095         }
2096       else
2097         {
2098           struct charset *charset;
2099           unsigned code;
2100           int dimension;
2101           int emacs_mule_id;
2102           unsigned char leading_codes[2];
2103
2104           if (preferred_charset_id >= 0)
2105             {
2106               charset = CHARSET_FROM_ID (preferred_charset_id);
2107               if (! CHAR_CHARSET_P (c, charset))
2108                 charset = char_charset (c, charset_list, NULL);
2109             }
2110           else
2111             charset = char_charset (c, charset_list, &code);
2112           if (! charset)
2113             {
2114               c = coding->default_char;
2115               if (ASCII_CHAR_P (c))
2116                 {
2117                   EMIT_ONE_ASCII_BYTE (c);
2118                   continue;
2119                 }
2120               charset = char_charset (c, charset_list, &code);
2121             }
2122           dimension = CHARSET_DIMENSION (charset);
2123           emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2124           EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2125           EMIT_ONE_BYTE (leading_codes[0]);
2126           if (leading_codes[1])
2127             EMIT_ONE_BYTE (leading_codes[1]);
2128           if (dimension == 1)
2129             EMIT_ONE_BYTE (code);
2130           else
2131             {
2132               EMIT_ONE_BYTE (code >> 8);
2133               EMIT_ONE_BYTE (code & 0xFF);
2134             }
2135         }
2136     }
2137   coding->result = CODING_RESULT_SUCCESS;
2138   coding->produced_char += produced_chars;
2139   coding->produced = dst - coding->destination;
2140   return 0;
2141 }
2142
2143 \f
2144 /*** 7. ISO2022 handlers ***/
2145
2146 /* The following note describes the coding system ISO2022 briefly.
2147    Since the intention of this note is to help understand the
2148    functions in this file, some parts are NOT ACCURATE or are OVERLY
2149    SIMPLIFIED.  For thorough understanding, please refer to the
2150    original document of ISO2022.  This is equivalent to the standard
2151    ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2152
2153    ISO2022 provides many mechanisms to encode several character sets
2154    in 7-bit and 8-bit environments.  For 7-bit environments, all text
2155    is encoded using bytes less than 128.  This may make the encoded
2156    text a little bit longer, but the text passes more easily through
2157    several types of gateway, some of which strip off the MSB (Most
2158    Significant Bit).
2159
2160    There are two kinds of character sets: control character sets and
2161    graphic character sets.  The former contain control characters such
2162    as `newline' and `escape' to provide control functions (control
2163    functions are also provided by escape sequences).  The latter
2164    contain graphic characters such as 'A' and '-'.  Emacs recognizes
2165    two control character sets and many graphic character sets.
2166
2167    Graphic character sets are classified into one of the following
2168    four classes, according to the number of bytes (DIMENSION) and
2169    number of characters in one dimension (CHARS) of the set:
2170    - DIMENSION1_CHARS94
2171    - DIMENSION1_CHARS96
2172    - DIMENSION2_CHARS94
2173    - DIMENSION2_CHARS96
2174
2175    In addition, each character set is assigned an identification tag,
2176    unique for each set, called the "final character" (denoted as <F>
2177    hereafter).  The <F> of each character set is decided by ECMA(*)
2178    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
2179    (0x30..0x3F are for private use only).
2180
2181    Note (*): ECMA = European Computer Manufacturers Association
2182
2183    Here are examples of graphic character sets [NAME(<F>)]:
2184         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2185         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2186         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2187         o DIMENSION2_CHARS96 -- none for the moment
2188
2189    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2190         C0 [0x00..0x1F] -- control character plane 0
2191         GL [0x20..0x7F] -- graphic character plane 0
2192         C1 [0x80..0x9F] -- control character plane 1
2193         GR [0xA0..0xFF] -- graphic character plane 1
2194
2195    A control character set is directly designated and invoked to C0 or
2196    C1 by an escape sequence.  The most common case is that:
2197    - ISO646's  control character set is designated/invoked to C0, and
2198    - ISO6429's control character set is designated/invoked to C1,
2199    and usually these designations/invocations are omitted in encoded
2200    text.  In a 7-bit environment, only C0 can be used, and a control
2201    character for C1 is encoded by an appropriate escape sequence to
2202    fit into the environment.  All control characters for C1 are
2203    defined to have corresponding escape sequences.
2204
2205    A graphic character set is at first designated to one of four
2206    graphic registers (G0 through G3), then these graphic registers are
2207    invoked to GL or GR.  These designations and invocations can be
2208    done independently.  The most common case is that G0 is invoked to
2209    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
2210    these invocations and designations are omitted in encoded text.
2211    In a 7-bit environment, only GL can be used.
2212
2213    When a graphic character set of CHARS94 is invoked to GL, codes
2214    0x20 and 0x7F of the GL area work as control characters SPACE and
2215    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2216    be used.
2217
2218    There are two ways of invocation: locking-shift and single-shift.
2219    With locking-shift, the invocation lasts until the next different
2220    invocation, whereas with single-shift, the invocation affects the
2221    following character only and doesn't affect the locking-shift
2222    state.  Invocations are done by the following control characters or
2223    escape sequences:
2224
2225    ----------------------------------------------------------------------
2226    abbrev  function                  cntrl escape seq   description
2227    ----------------------------------------------------------------------
2228    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
2229    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
2230    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
2231    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
2232    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
2233    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
2234    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
2235    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
2236    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
2237    ----------------------------------------------------------------------
2238    (*) These are not used by any known coding system.
2239
2240    Control characters for these functions are defined by macros
2241    ISO_CODE_XXX in `coding.h'.
2242
2243    Designations are done by the following escape sequences:
2244    ----------------------------------------------------------------------
2245    escape sequence      description
2246    ----------------------------------------------------------------------
2247    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
2248    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
2249    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
2250    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
2251    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
2252    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
2253    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
2254    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
2255    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
2256    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
2257    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
2258    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
2259    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
2260    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
2261    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
2262    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
2263    ----------------------------------------------------------------------
2264
2265    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2266    of dimension 1, chars 94, and final character <F>, etc...
2267
2268    Note (*): Although these designations are not allowed in ISO2022,
2269    Emacs accepts them on decoding, and produces them on encoding
2270    CHARS96 character sets in a coding system which is characterized as
2271    7-bit environment, non-locking-shift, and non-single-shift.
2272
2273    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2274    '(' must be omitted.  We refer to this as "short-form" hereafter.
2275
2276    Now you may notice that there are a lot of ways of encoding the
2277    same multilingual text in ISO2022.  Actually, there exist many
2278    coding systems such as Compound Text (used in X11's inter client
2279    communication, ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2280    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2281    localized platforms), and all of these are variants of ISO2022.
2282
2283    In addition to the above, Emacs handles two more kinds of escape
2284    sequences: ISO6429's direction specification and Emacs' private
2285    sequence for specifying character composition.
2286
2287    ISO6429's direction specification takes the following form:
2288         o CSI ']'      -- end of the current direction
2289         o CSI '0' ']'  -- end of the current direction
2290         o CSI '1' ']'  -- start of left-to-right text
2291         o CSI '2' ']'  -- start of right-to-left text
2292    The control character CSI (0x9B: control sequence introducer) is
2293    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2294
2295    Character composition specification takes the following form:
2296         o ESC '0' -- start relative composition
2297         o ESC '1' -- end composition
2298         o ESC '2' -- start rule-base composition (*)
2299         o ESC '3' -- start relative composition with alternate chars  (**)
2300         o ESC '4' -- start rule-base composition with alternate chars  (**)
2301   Since these are not standard escape sequences of any ISO standard,
2302   the use of them with these meanings is restricted to Emacs only.
2303
2304   (*) This form is used only in Emacs 20.7 and older versions,
2305   but newer versions can safely decode it.
2306   (**) This form is used only in Emacs 21.1 and newer versions,
2307   and older versions can't decode it.
2308
2309   Here's a list of example usages of these composition escape
2310   sequences (categorized by `enum composition_method').
2311
2312   COMPOSITION_RELATIVE:
2313         ESC 0 CHAR [ CHAR ] ESC 1
2314   COMPOSITION_WITH_RULE:
2315         ESC 2 CHAR [ RULE CHAR ] ESC 1
2316   COMPOSITION_WITH_ALTCHARS:
2317         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2318   COMPOSITION_WITH_RULE_ALTCHARS:
2319         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2320
2321 enum iso_code_class_type iso_code_class[256];
2322
2323 #define SAFE_CHARSET_P(coding, id)      \
2324   ((id) <= (coding)->max_charset_id     \
2325    && (coding)->safe_charsets[id] >= 0)
2326
2327
2328 #define SHIFT_OUT_OK(category)  \
2329   (CODING_ISO_INITIAL (&coding_categories[category], 1) >= 0)
2330
2331 static void
2332 setup_iso_safe_charsets (attrs)
2333      Lisp_Object attrs;
2334 {
2335   Lisp_Object charset_list, safe_charsets;
2336   Lisp_Object request;
2337   Lisp_Object reg_usage;
2338   Lisp_Object tail;
2339   int reg94, reg96;
2340   int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2341   int max_charset_id;
2342
2343   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2344   if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2345       && ! EQ (charset_list, Viso_2022_charset_list))
2346     {
2347       CODING_ATTR_CHARSET_LIST (attrs)
2348         = charset_list = Viso_2022_charset_list;
2349       ASET (attrs, coding_attr_safe_charsets, Qnil);
2350     }
2351
2352   if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2353     return;
2354
2355   max_charset_id = 0;
2356   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2357     {
2358       int id = XINT (XCAR (tail));
2359       if (max_charset_id < id)
2360         max_charset_id = id;
2361     }
2362
2363   safe_charsets = Fmake_string (make_number (max_charset_id + 1),
2364                                 make_number (255));
2365   request = AREF (attrs, coding_attr_iso_request);
2366   reg_usage = AREF (attrs, coding_attr_iso_usage);
2367   reg94 = XINT (XCAR (reg_usage));
2368   reg96 = XINT (XCDR (reg_usage));
2369
2370   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2371     {
2372       Lisp_Object id;
2373       Lisp_Object reg;
2374       struct charset *charset;
2375
2376       id = XCAR (tail);
2377       charset = CHARSET_FROM_ID (XINT (id));
2378       reg = Fcdr (Fassq (id, request));
2379       if (! NILP (reg))
2380         XSTRING (safe_charsets)->data[XINT (id)] = XINT (reg);
2381       else if (charset->iso_chars_96)
2382         {
2383           if (reg96 < 4)
2384             XSTRING (safe_charsets)->data[XINT (id)] = reg96;
2385         }
2386       else
2387         {
2388           if (reg94 < 4)
2389             XSTRING (safe_charsets)->data[XINT (id)] = reg94;
2390         }
2391     }
2392   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2393 }
2394
2395
2396 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2397    Check if a text is encoded in one of ISO-2022 based codig systems.
2398    If it is, return 1, else return 0.  */
2399
2400 static int
2401 detect_coding_iso_2022 (coding, detect_info)
2402      struct coding_system *coding;
2403      struct coding_detection_info *detect_info;
2404 {
2405   unsigned char *src = coding->source, *src_base = src;
2406   unsigned char *src_end = coding->source + coding->src_bytes;
2407   int multibytep = coding->src_multibyte;
2408   int single_shifting = 0;
2409   int id;
2410   int c, c1;
2411   int consumed_chars = 0;
2412   int i;
2413   int rejected = 0;
2414   int found = 0;
2415
2416   detect_info->checked |= CATEGORY_MASK_ISO;
2417
2418   for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
2419     {
2420       struct coding_system *this = &(coding_categories[i]);
2421       Lisp_Object attrs, val;
2422
2423       attrs = CODING_ID_ATTRS (this->id);
2424       if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2425           && ! EQ (CODING_ATTR_SAFE_CHARSETS (attrs), Viso_2022_charset_list))
2426         setup_iso_safe_charsets (attrs);
2427       val = CODING_ATTR_SAFE_CHARSETS (attrs);
2428       this->max_charset_id = XSTRING (val)->size - 1;
2429       this->safe_charsets = (char *) XSTRING (val)->data;
2430     }
2431
2432   /* A coding system of this category is always ASCII compatible.  */
2433   src += coding->head_ascii;
2434
2435   while (rejected != CATEGORY_MASK_ISO)
2436     {
2437       ONE_MORE_BYTE (c);
2438       switch (c)
2439         {
2440         case ISO_CODE_ESC:
2441           if (inhibit_iso_escape_detection)
2442             break;
2443           single_shifting = 0;
2444           ONE_MORE_BYTE (c);
2445           if (c >= '(' && c <= '/')
2446             {
2447               /* Designation sequence for a charset of dimension 1.  */
2448               ONE_MORE_BYTE (c1);
2449               if (c1 < ' ' || c1 >= 0x80
2450                   || (id = iso_charset_table[0][c >= ','][c1]) < 0)
2451                 /* Invalid designation sequence.  Just ignore.  */
2452                 break;
2453             }
2454           else if (c == '$')
2455             {
2456               /* Designation sequence for a charset of dimension 2.  */
2457               ONE_MORE_BYTE (c);
2458               if (c >= '@' && c <= 'B')
2459                 /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
2460                 id = iso_charset_table[1][0][c];
2461               else if (c >= '(' && c <= '/')
2462                 {
2463                   ONE_MORE_BYTE (c1);
2464                   if (c1 < ' ' || c1 >= 0x80
2465                       || (id = iso_charset_table[1][c >= ','][c1]) < 0)
2466                     /* Invalid designation sequence.  Just ignore.  */
2467                     break;
2468                 }
2469               else
2470                 /* Invalid designation sequence.  Just ignore it.  */
2471                 break;
2472             }
2473           else if (c == 'N' || c == 'O')
2474             {
2475               /* ESC <Fe> for SS2 or SS3.  */
2476               single_shifting = 1;
2477               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
2478               break;
2479             }
2480           else if (c >= '0' && c <= '4')
2481             {
2482               /* ESC <Fp> for start/end composition.  */
2483               found |= CATEGORY_MASK_ISO;
2484               break;
2485             }
2486           else
2487             {
2488               /* Invalid escape sequence.  Just ignore it.  */
2489               break;
2490             }
2491
2492           /* We found a valid designation sequence for CHARSET.  */
2493           rejected |= CATEGORY_MASK_ISO_8BIT;
2494           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
2495                               id))
2496             found |= CATEGORY_MASK_ISO_7;
2497           else
2498             rejected |= CATEGORY_MASK_ISO_7;
2499           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
2500                               id))
2501             found |= CATEGORY_MASK_ISO_7_TIGHT;
2502           else
2503             rejected |= CATEGORY_MASK_ISO_7_TIGHT;
2504           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
2505                               id))
2506             found |= CATEGORY_MASK_ISO_7_ELSE;
2507           else
2508             rejected |= CATEGORY_MASK_ISO_7_ELSE;
2509           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
2510                               id))
2511             found |= CATEGORY_MASK_ISO_8_ELSE;
2512           else
2513             rejected |= CATEGORY_MASK_ISO_8_ELSE;
2514           break;
2515
2516         case ISO_CODE_SO:
2517         case ISO_CODE_SI:
2518           /* Locking shift out/in.  */
2519           if (inhibit_iso_escape_detection)
2520             break;
2521           single_shifting = 0;
2522           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
2523           found |= CATEGORY_MASK_ISO_ELSE;
2524           break;
2525
2526         case ISO_CODE_CSI:
2527           /* Control sequence introducer.  */
2528           single_shifting = 0;
2529           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
2530           found |= CATEGORY_MASK_ISO_8_ELSE;
2531           goto check_extra_latin;
2532
2533
2534         case ISO_CODE_SS2:
2535         case ISO_CODE_SS3:
2536           /* Single shift.   */
2537           if (inhibit_iso_escape_detection)
2538             break;
2539           single_shifting = 1;
2540           rejected |= CATEGORY_MASK_ISO_7BIT;
2541           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2542               & CODING_ISO_FLAG_SINGLE_SHIFT)
2543             found |= CATEGORY_MASK_ISO_8_1;
2544           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
2545               & CODING_ISO_FLAG_SINGLE_SHIFT)
2546             found |= CATEGORY_MASK_ISO_8_2;
2547           goto check_extra_latin;
2548
2549         default:
2550           if (c < 0x80)
2551             {
2552               single_shifting = 0;
2553               break;
2554             }
2555           if (c >= 0xA0)
2556             {
2557               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
2558               found |= CATEGORY_MASK_ISO_8_1;
2559               /* Check the length of succeeding codes of the range
2560                  0xA0..0FF.  If the byte length is even, we include
2561                  CATEGORY_MASK_ISO_8_2 in `found'.  We can check this
2562                  only when we are not single shifting.  */
2563               if (! single_shifting
2564                   && ! (rejected & CATEGORY_MASK_ISO_8_2))
2565                 {
2566                   int i = 1;
2567                   while (src < src_end)
2568                     {
2569                       ONE_MORE_BYTE (c);
2570                       if (c < 0xA0)
2571                         break;
2572                       i++;
2573                     }
2574
2575                   if (i & 1 && src < src_end)
2576                     rejected |= CATEGORY_MASK_ISO_8_2;
2577                   else
2578                     found |= CATEGORY_MASK_ISO_8_2;
2579                 }
2580               break;
2581             }
2582         check_extra_latin:
2583           single_shifting = 0;
2584           if (! VECTORP (Vlatin_extra_code_table)
2585               || NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
2586             {
2587               rejected = CATEGORY_MASK_ISO;
2588               break;
2589             }
2590           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2591               & CODING_ISO_FLAG_LATIN_EXTRA)
2592             found |= CATEGORY_MASK_ISO_8_1;
2593           else
2594             rejected |= CATEGORY_MASK_ISO_8_1;
2595           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
2596               & CODING_ISO_FLAG_LATIN_EXTRA)
2597             found |= CATEGORY_MASK_ISO_8_2;
2598           else
2599             rejected |= CATEGORY_MASK_ISO_8_2;
2600         }
2601     }
2602   detect_info->rejected |= CATEGORY_MASK_ISO;
2603   return 0;
2604
2605  no_more_source:
2606   detect_info->rejected |= rejected;
2607   detect_info->found |= (found & ~rejected);
2608   return 1;
2609 }
2610
2611
2612 /* Set designation state into CODING.  */
2613 #define DECODE_DESIGNATION(reg, dim, chars_96, final)                   \
2614   do {                                                                  \
2615     int id, prev;                                                       \
2616                                                                         \
2617     if (final < '0' || final >= 128                                     \
2618         || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0)        \
2619         || !SAFE_CHARSET_P (coding, id))                                \
2620       {                                                                 \
2621         CODING_ISO_DESIGNATION (coding, reg) = -2;                      \
2622         goto invalid_code;                                              \
2623       }                                                                 \
2624     prev = CODING_ISO_DESIGNATION (coding, reg);                        \
2625     if (id == charset_jisx0201_roman)                                   \
2626       {                                                                 \
2627         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)      \
2628           id = charset_ascii;                                           \
2629       }                                                                 \
2630     else if (id == charset_jisx0208_1978)                               \
2631       {                                                                 \
2632         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)     \
2633           id = charset_jisx0208;                                        \
2634       }                                                                 \
2635     CODING_ISO_DESIGNATION (coding, reg) = id;                          \
2636     /* If there was an invalid designation to REG previously, and this  \
2637        designation is ASCII to REG, we should keep this designation     \
2638        sequence.  */                                                    \
2639     if (prev == -2 && id == charset_ascii)                              \
2640       goto invalid_code;                                                \
2641   } while (0)
2642
2643
2644 #define MAYBE_FINISH_COMPOSITION()                              \
2645   do {                                                          \
2646     int i;                                                      \
2647     if (composition_state == COMPOSING_NO)                      \
2648       break;                                                    \
2649     /* It is assured that we have enough room for producing     \
2650        characters stored in the table `components'.  */         \
2651     if (charbuf + component_idx > charbuf_end)                  \
2652       goto no_more_source;                                      \
2653     composition_state = COMPOSING_NO;                           \
2654     if (method == COMPOSITION_RELATIVE                          \
2655         || method == COMPOSITION_WITH_ALTCHARS)                 \
2656       {                                                         \
2657         for (i = 0; i < component_idx; i++)                     \
2658           *charbuf++ = components[i];                           \
2659         char_offset += component_idx;                           \
2660       }                                                         \
2661     else                                                        \
2662       {                                                         \
2663         for (i = 0; i < component_idx; i += 2)                  \
2664           *charbuf++ = components[i];                           \
2665         char_offset += (component_idx / 2) + 1;                 \
2666       }                                                         \
2667   } while (0)
2668
2669
2670 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
2671    ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
2672    ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
2673    ESC 3 : altchar composition :  ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
2674    ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1
2675   */
2676
2677 #define DECODE_COMPOSITION_START(c1)                                    \
2678   do {                                                                  \
2679     if (c1 == '0'                                                       \
2680         && composition_state == COMPOSING_COMPONENT_RULE)               \
2681       {                                                                 \
2682         component_len = component_idx;                                  \
2683         composition_state = COMPOSING_CHAR;                             \
2684       }                                                                 \
2685     else                                                                \
2686       {                                                                 \
2687         unsigned char *p;                                               \
2688                                                                         \
2689         MAYBE_FINISH_COMPOSITION ();                                    \
2690         if (charbuf + MAX_COMPOSITION_COMPONENTS > charbuf_end)         \
2691           goto no_more_source;                                          \
2692         for (p = src; p < src_end - 1; p++)                             \
2693           if (*p == ISO_CODE_ESC && p[1] == '1')                        \
2694             break;                                                      \
2695         if (p == src_end - 1)                                           \
2696           {                                                             \
2697             if (coding->mode & CODING_MODE_LAST_BLOCK)                  \
2698               goto invalid_code;                                        \
2699             goto no_more_source;                                        \
2700           }                                                             \
2701                                                                         \
2702         /* This is surely the start of a composition.  */               \
2703         method = (c1 == '0' ? COMPOSITION_RELATIVE                      \
2704                   : c1 == '2' ? COMPOSITION_WITH_RULE                   \
2705                   : c1 == '3' ? COMPOSITION_WITH_ALTCHARS               \
2706                   : COMPOSITION_WITH_RULE_ALTCHARS);                    \
2707         composition_state = (c1 <= '2' ? COMPOSING_CHAR                 \
2708                              : COMPOSING_COMPONENT_CHAR);               \
2709         component_idx = component_len = 0;                              \
2710       }                                                                 \
2711   } while (0)
2712
2713
2714 /* Handle compositoin end sequence ESC 1.  */
2715
2716 #define DECODE_COMPOSITION_END()                                        \
2717   do {                                                                  \
2718     int nchars = (component_len > 0 ? component_idx - component_len     \
2719                   : method == COMPOSITION_RELATIVE ? component_idx      \
2720                   : (component_idx + 1) / 2);                           \
2721     int i;                                                              \
2722     int *saved_charbuf = charbuf;                                       \
2723     int from = coding->produced_char + char_offset;                     \
2724     int to = from + nchars;                                             \
2725                                                                         \
2726     ADD_COMPOSITION_DATA (charbuf, from, to, method);                   \
2727     if (method != COMPOSITION_RELATIVE)                                 \
2728       {                                                                 \
2729         if (component_len == 0)                                         \
2730           for (i = 0; i < component_idx; i++)                           \
2731             *charbuf++ = components[i];                                 \
2732         else                                                            \
2733           for (i = 0; i < component_len; i++)                           \
2734             *charbuf++ = components[i];                                 \
2735         *saved_charbuf = saved_charbuf - charbuf;                       \
2736       }                                                                 \
2737     if (method == COMPOSITION_WITH_RULE)                                \
2738       for (i = 0; i < component_idx; i += 2, char_offset++)             \
2739         *charbuf++ = components[i];                                     \
2740     else                                                                \
2741       for (i = component_len; i < component_idx; i++, char_offset++)    \
2742         *charbuf++ = components[i];                                     \
2743     coding->annotated = 1;                                              \
2744     composition_state = COMPOSING_NO;                                   \
2745   } while (0)
2746
2747
2748 /* Decode a composition rule from the byte C1 (and maybe one more byte
2749    from SRC) and store one encoded composition rule in
2750    coding->cmp_data.  */
2751
2752 #define DECODE_COMPOSITION_RULE(c1)                                     \
2753   do {                                                                  \
2754     (c1) -= 32;                                                         \
2755     if (c1 < 81)                /* old format (before ver.21) */        \
2756       {                                                                 \
2757         int gref = (c1) / 9;                                            \
2758         int nref = (c1) % 9;                                            \
2759         if (gref == 4) gref = 10;                                       \
2760         if (nref == 4) nref = 10;                                       \
2761         c1 = COMPOSITION_ENCODE_RULE (gref, nref);                      \
2762       }                                                                 \
2763     else if (c1 < 93)           /* new format (after ver.21) */         \
2764       {                                                                 \
2765         ONE_MORE_BYTE (c2);                                             \
2766         c1 = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32);                \
2767       }                                                                 \
2768     else                                                                \
2769       c1 = 0;                                                           \
2770   } while (0)
2771
2772
2773 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
2774
2775 static void
2776 decode_coding_iso_2022 (coding)
2777      struct coding_system *coding;
2778 {
2779   unsigned char *src = coding->source + coding->consumed;
2780   unsigned char *src_end = coding->source + coding->src_bytes;
2781   unsigned char *src_base;
2782   int *charbuf = coding->charbuf;
2783   int *charbuf_end
2784     = charbuf + coding->charbuf_size - 4 - MAX_ANNOTATION_LENGTH;
2785   int consumed_chars = 0, consumed_chars_base;
2786   int multibytep = coding->src_multibyte;
2787   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
2788   int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2789   int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
2790   struct charset *charset;
2791   int c;
2792   /* For handling composition sequence.  */
2793 #define COMPOSING_NO                    0
2794 #define COMPOSING_CHAR                  1
2795 #define COMPOSING_RULE                  2
2796 #define COMPOSING_COMPONENT_CHAR        3
2797 #define COMPOSING_COMPONENT_RULE        4
2798
2799   int composition_state = COMPOSING_NO;
2800   enum composition_method method;
2801   int components[MAX_COMPOSITION_COMPONENTS * 2 + 1];
2802   int component_idx;
2803   int component_len;
2804   Lisp_Object attrs, eol_type, charset_list;
2805   int char_offset = coding->produced_char;
2806   int last_offset = char_offset;
2807   int last_id = charset_ascii;
2808
2809   CODING_GET_INFO (coding, attrs, eol_type, charset_list);
2810   setup_iso_safe_charsets (attrs);
2811
2812   while (1)
2813     {
2814       int c1, c2;
2815
2816       src_base = src;
2817       consumed_chars_base = consumed_chars;
2818
2819       if (charbuf >= charbuf_end)
2820         break;
2821
2822       ONE_MORE_BYTE (c1);
2823
2824       /* We produce at most one character.  */
2825       switch (iso_code_class [c1])
2826         {
2827         case ISO_0x20_or_0x7F:
2828           if (composition_state != COMPOSING_NO)
2829             {
2830               if (composition_state == COMPOSING_RULE
2831                   || composition_state == COMPOSING_COMPONENT_RULE)
2832                 {
2833                   DECODE_COMPOSITION_RULE (c1);
2834                   components[component_idx++] = c1;
2835                   composition_state--;
2836                   continue;
2837                 }
2838             }
2839           if (charset_id_0 < 0
2840               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
2841             /* This is SPACE or DEL.  */
2842             charset = CHARSET_FROM_ID (charset_ascii);
2843           else
2844             charset = CHARSET_FROM_ID (charset_id_0);
2845           break;
2846
2847         case ISO_graphic_plane_0:
2848           if (composition_state != COMPOSING_NO)
2849             {
2850               if (composition_state == COMPOSING_RULE
2851                   || composition_state == COMPOSING_COMPONENT_RULE)
2852                 {
2853                   DECODE_COMPOSITION_RULE (c1);
2854                   components[component_idx++] = c1;
2855                   composition_state--;
2856                   continue;
2857                 }
2858             }
2859           charset = CHARSET_FROM_ID (charset_id_0);
2860           break;
2861
2862         case ISO_0xA0_or_0xFF:
2863           if (charset_id_1 < 0
2864               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
2865               || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
2866             goto invalid_code;
2867           /* This is a graphic character, we fall down ... */
2868
2869         case ISO_graphic_plane_1:
2870           if (charset_id_1 < 0)
2871             goto invalid_code;
2872           charset = CHARSET_FROM_ID (charset_id_1);
2873           break;
2874
2875         case ISO_carriage_return:
2876           if (c1 == '\r')
2877             {
2878               if (EQ (eol_type, Qdos))
2879                 {
2880                   if (src == src_end)
2881                     {
2882                       coding->result = CODING_RESULT_INSUFFICIENT_SRC;
2883                       goto no_more_source;
2884                     }
2885                   if (*src == '\n')
2886                     ONE_MORE_BYTE (c1);
2887                 }
2888               else if (EQ (eol_type, Qmac))
2889                 c1 = '\n';
2890             }
2891           /* fall through */
2892
2893         case ISO_control_0:
2894           MAYBE_FINISH_COMPOSITION ();
2895           charset = CHARSET_FROM_ID (charset_ascii);
2896           break;
2897
2898         case ISO_control_1:
2899           MAYBE_FINISH_COMPOSITION ();
2900           goto invalid_code;
2901
2902         case ISO_shift_out:
2903           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
2904               || CODING_ISO_DESIGNATION (coding, 1) < 0)
2905             goto invalid_code;
2906           CODING_ISO_INVOCATION (coding, 0) = 1;
2907           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2908           continue;
2909
2910         case ISO_shift_in:
2911           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
2912             goto invalid_code;
2913           CODING_ISO_INVOCATION (coding, 0) = 0;
2914           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2915           continue;
2916
2917         case ISO_single_shift_2_7:
2918         case ISO_single_shift_2:
2919           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
2920             goto invalid_code;
2921           /* SS2 is handled as an escape sequence of ESC 'N' */
2922           c1 = 'N';
2923           goto label_escape_sequence;
2924
2925         case ISO_single_shift_3:
2926           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
2927             goto invalid_code;
2928           /* SS2 is handled as an escape sequence of ESC 'O' */
2929           c1 = 'O';
2930           goto label_escape_sequence;
2931
2932         case ISO_control_sequence_introducer:
2933           /* CSI is handled as an escape sequence of ESC '[' ...  */
2934           c1 = '[';
2935           goto label_escape_sequence;
2936
2937         case ISO_escape:
2938           ONE_MORE_BYTE (c1);
2939         label_escape_sequence:
2940           /* Escape sequences handled here are invocation,
2941              designation, direction specification, and character
2942              composition specification.  */
2943           switch (c1)
2944             {
2945             case '&':           /* revision of following character set */
2946               ONE_MORE_BYTE (c1);
2947               if (!(c1 >= '@' && c1 <= '~'))
2948                 goto invalid_code;
2949               ONE_MORE_BYTE (c1);
2950               if (c1 != ISO_CODE_ESC)
2951                 goto invalid_code;
2952               ONE_MORE_BYTE (c1);
2953               goto label_escape_sequence;
2954
2955             case '$':           /* designation of 2-byte character set */
2956               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
2957                 goto invalid_code;
2958               ONE_MORE_BYTE (c1);
2959               if (c1 >= '@' && c1 <= 'B')
2960                 {       /* designation of JISX0208.1978, GB2312.1980,
2961                            or JISX0208.1980 */
2962                   DECODE_DESIGNATION (0, 2, 0, c1);
2963                 }
2964               else if (c1 >= 0x28 && c1 <= 0x2B)
2965                 {       /* designation of DIMENSION2_CHARS94 character set */
2966                   ONE_MORE_BYTE (c2);
2967                   DECODE_DESIGNATION (c1 - 0x28, 2, 0, c2);
2968                 }
2969               else if (c1 >= 0x2C && c1 <= 0x2F)
2970                 {       /* designation of DIMENSION2_CHARS96 character set */
2971                   ONE_MORE_BYTE (c2);
2972                   DECODE_DESIGNATION (c1 - 0x2C, 2, 1, c2);
2973                 }
2974               else
2975                 goto invalid_code;
2976               /* We must update these variables now.  */
2977               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2978               charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
2979               continue;
2980
2981             case 'n':           /* invocation of locking-shift-2 */
2982               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
2983                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
2984                 goto invalid_code;
2985               CODING_ISO_INVOCATION (coding, 0) = 2;
2986               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2987               continue;
2988
2989             case 'o':           /* invocation of locking-shift-3 */
2990               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
2991                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
2992                 goto invalid_code;
2993               CODING_ISO_INVOCATION (coding, 0) = 3;
2994               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2995               continue;
2996
2997             case 'N':           /* invocation of single-shift-2 */
2998               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
2999                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3000                 goto invalid_code;
3001               charset = CHARSET_FROM_ID (CODING_ISO_DESIGNATION (coding, 2));
3002               ONE_MORE_BYTE (c1);
3003               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3004                 goto invalid_code;
3005               break;
3006
3007             case 'O':           /* invocation of single-shift-3 */
3008               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3009                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3010                 goto invalid_code;
3011               charset = CHARSET_FROM_ID (CODING_ISO_DESIGNATION (coding, 3));
3012               ONE_MORE_BYTE (c1);
3013               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3014                 goto invalid_code;
3015               break;
3016
3017             case '0': case '2': case '3': case '4': /* start composition */
3018               if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3019                 goto invalid_code;
3020               DECODE_COMPOSITION_START (c1);
3021               continue;
3022
3023             case '1':           /* end composition */
3024               if (composition_state == COMPOSING_NO)
3025                 goto invalid_code;
3026               DECODE_COMPOSITION_END ();
3027               continue;
3028
3029             case '[':           /* specification of direction */
3030               if (! CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION)
3031                 goto invalid_code;
3032               /* For the moment, nested direction is not supported.
3033                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
3034                  left-to-right, and nozero means right-to-left.  */
3035               ONE_MORE_BYTE (c1);
3036               switch (c1)
3037                 {
3038                 case ']':       /* end of the current direction */
3039                   coding->mode &= ~CODING_MODE_DIRECTION;
3040
3041                 case '0':       /* end of the current direction */
3042                 case '1':       /* start of left-to-right direction */
3043                   ONE_MORE_BYTE (c1);
3044                   if (c1 == ']')
3045                     coding->mode &= ~CODING_MODE_DIRECTION;
3046                   else
3047                     goto invalid_code;
3048                   break;
3049
3050                 case '2':       /* start of right-to-left direction */
3051                   ONE_MORE_BYTE (c1);
3052                   if (c1 == ']')
3053                     coding->mode |= CODING_MODE_DIRECTION;
3054                   else
3055                     goto invalid_code;
3056                   break;
3057
3058                 default:
3059                   goto invalid_code;
3060                 }
3061               continue;
3062
3063             case '%':
3064               ONE_MORE_BYTE (c1);
3065               if (c1 == '/')
3066                 {
3067                   /* CTEXT extended segment:
3068                      ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3069                      We keep these bytes as is for the moment.
3070                      They may be decoded by post-read-conversion.  */
3071                   int dim, M, L;
3072                   int size;
3073
3074                   ONE_MORE_BYTE (dim);
3075                   ONE_MORE_BYTE (M);
3076                   ONE_MORE_BYTE (L);
3077                   size = ((M - 128) * 128) + (L - 128);
3078                   if (charbuf + 8 + size > charbuf_end)
3079                     goto break_loop;
3080                   *charbuf++ = ISO_CODE_ESC;
3081                   *charbuf++ = '%';
3082                   *charbuf++ = '/';
3083                   *charbuf++ = dim;
3084                   *charbuf++ = BYTE8_TO_CHAR (M);
3085                   *charbuf++ = BYTE8_TO_CHAR (L);
3086                   while (size-- > 0)
3087                     {
3088                       ONE_MORE_BYTE (c1);
3089                       *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3090                     }
3091                 }
3092               else if (c1 == 'G')
3093                 {
3094                   /* XFree86 extension for embedding UTF-8 in CTEXT:
3095                      ESC % G --UTF-8-BYTES-- ESC % @
3096                      We keep these bytes as is for the moment.
3097                      They may be decoded by post-read-conversion.  */
3098                   int *p = charbuf;
3099
3100                   if (p + 6 > charbuf_end)
3101                     goto break_loop;
3102                   *p++ = ISO_CODE_ESC;
3103                   *p++ = '%';
3104                   *p++ = 'G';
3105                   while (p < charbuf_end)
3106                     {
3107                       ONE_MORE_BYTE (c1);
3108                       if (c1 == ISO_CODE_ESC
3109                           && src + 1 < src_end
3110                           && src[0] == '%'
3111                           && src[1] == '@')
3112                         break;
3113                       *p++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3114                     }
3115                   if (p + 3 > charbuf_end)
3116                     goto break_loop;
3117                   *p++ = ISO_CODE_ESC;
3118                   *p++ = '%';
3119                   *p++ = '@';
3120                   charbuf = p;
3121                 }
3122               else
3123                 goto invalid_code;
3124               continue;
3125               break;
3126
3127             default:
3128               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3129                 goto invalid_code;
3130               if (c1 >= 0x28 && c1 <= 0x2B)
3131                 {       /* designation of DIMENSION1_CHARS94 character set */
3132                   ONE_MORE_BYTE (c2);
3133                   DECODE_DESIGNATION (c1 - 0x28, 1, 0, c2);
3134                 }
3135               else if (c1 >= 0x2C && c1 <= 0x2F)
3136                 {       /* designation of DIMENSION1_CHARS96 character set */
3137                   ONE_MORE_BYTE (c2);
3138                   DECODE_DESIGNATION (c1 - 0x2C, 1, 1, c2);
3139                 }
3140               else
3141                 goto invalid_code;
3142               /* We must update these variables now.  */
3143               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3144               charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3145               continue;
3146             }
3147         }
3148
3149       if (charset->id != charset_ascii
3150           && last_id != charset->id)
3151         {
3152           if (last_id != charset_ascii)
3153             ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
3154           last_id = charset->id;
3155           last_offset = char_offset;
3156         }
3157
3158       /* Now we know CHARSET and 1st position code C1 of a character.
3159          Produce a decoded character while getting 2nd position code
3160          C2 if necessary.  */
3161       c1 &= 0x7F;
3162       if (CHARSET_DIMENSION (charset) > 1)
3163         {
3164           ONE_MORE_BYTE (c2);
3165           if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
3166             /* C2 is not in a valid range.  */
3167             goto invalid_code;
3168           c1 = (c1 << 8) | (c2 & 0x7F);
3169           if (CHARSET_DIMENSION (charset) > 2)
3170             {
3171               ONE_MORE_BYTE (c2);
3172               if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
3173                 /* C2 is not in a valid range.  */
3174                 goto invalid_code;
3175               c1 = (c1 << 8) | (c2 & 0x7F);
3176             }
3177         }
3178
3179       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3180       if (c < 0)
3181         {
3182           MAYBE_FINISH_COMPOSITION ();
3183           for (; src_base < src; src_base++, char_offset++)
3184             {
3185               if (ASCII_BYTE_P (*src_base))
3186                 *charbuf++ = *src_base;
3187               else
3188                 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3189               char_offset++;
3190             }
3191         }
3192       else if (composition_state == COMPOSING_NO)
3193         {
3194           *charbuf++ = c;
3195           char_offset++;
3196         }
3197       else
3198         {
3199           components[component_idx++] = c;
3200           if (method == COMPOSITION_WITH_RULE
3201               || (method == COMPOSITION_WITH_RULE_ALTCHARS
3202                   && composition_state == COMPOSING_COMPONENT_CHAR))
3203             composition_state++;
3204         }
3205       continue;
3206
3207     invalid_code:
3208       MAYBE_FINISH_COMPOSITION ();
3209       src = src_base;
3210       consumed_chars = consumed_chars_base;
3211       ONE_MORE_BYTE (c);
3212       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
3213       char_offset++;
3214       coding->errors++;
3215       continue;
3216
3217     break_loop:
3218       break;
3219     }
3220
3221  no_more_source:
3222   if (last_id != charset_ascii)
3223     ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
3224   coding->consumed_char += consumed_chars_base;
3225   coding->consumed = src_base - coding->source;
3226   coding->charbuf_used = charbuf - coding->charbuf;
3227 }
3228
3229
3230 /* ISO2022 encoding stuff.  */
3231
3232 /*
3233    It is not enough to say just "ISO2022" on encoding, we have to
3234    specify more details.  In Emacs, each coding system of ISO2022
3235    variant has the following specifications:
3236         1. Initial designation to G0 thru G3.
3237         2. Allows short-form designation?
3238         3. ASCII should be designated to G0 before control characters?
3239         4. ASCII should be designated to G0 at end of line?
3240         5. 7-bit environment or 8-bit environment?
3241         6. Use locking-shift?
3242         7. Use Single-shift?
3243    And the following two are only for Japanese:
3244         8. Use ASCII in place of JIS0201-1976-Roman?
3245         9. Use JISX0208-1983 in place of JISX0208-1978?
3246    These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
3247    defined by macros CODING_ISO_FLAG_XXX.  See `coding.h' for more
3248    details.
3249 */
3250
3251 /* Produce codes (escape sequence) for designating CHARSET to graphic
3252    register REG at DST, and increment DST.  If <final-char> of CHARSET is
3253    '@', 'A', or 'B' and the coding system CODING allows, produce
3254    designation sequence of short-form.  */
3255
3256 #define ENCODE_DESIGNATION(charset, reg, coding)                        \
3257   do {                                                                  \
3258     unsigned char final_char = CHARSET_ISO_FINAL (charset);             \
3259     char *intermediate_char_94 = "()*+";                                \
3260     char *intermediate_char_96 = ",-./";                                \
3261     int revision = -1;                                                  \
3262     int c;                                                              \
3263                                                                         \
3264     if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION)           \
3265       revision = CHARSET_ISO_REVISION (charset);                        \
3266                                                                         \
3267     if (revision >= 0)                                                  \
3268       {                                                                 \
3269         EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&');                       \
3270         EMIT_ONE_BYTE ('@' + revision);                                 \
3271       }                                                                 \
3272     EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC);                                 \
3273     if (CHARSET_DIMENSION (charset) == 1)                               \
3274       {                                                                 \
3275         if (! CHARSET_ISO_CHARS_96 (charset))                           \
3276           c = intermediate_char_94[reg];                                \
3277         else                                                            \
3278           c = intermediate_char_96[reg];                                \
3279         EMIT_ONE_ASCII_BYTE (c);                                        \
3280       }                                                                 \
3281     else                                                                \
3282       {                                                                 \
3283         EMIT_ONE_ASCII_BYTE ('$');                                      \
3284         if (! CHARSET_ISO_CHARS_96 (charset))                           \
3285           {                                                             \
3286             if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM   \
3287                 || reg != 0                                             \
3288                 || final_char < '@' || final_char > 'B')                \
3289               EMIT_ONE_ASCII_BYTE (intermediate_char_94[reg]);          \
3290           }                                                             \
3291         else                                                            \
3292           EMIT_ONE_ASCII_BYTE (intermediate_char_96[reg]);              \
3293       }                                                                 \
3294     EMIT_ONE_ASCII_BYTE (final_char);                                   \
3295                                                                         \
3296     CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset);        \
3297   } while (0)
3298
3299
3300 /* The following two macros produce codes (control character or escape
3301    sequence) for ISO2022 single-shift functions (single-shift-2 and
3302    single-shift-3).  */
3303
3304 #define ENCODE_SINGLE_SHIFT_2                                           \
3305   do {                                                                  \
3306     if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)         \
3307       EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N');                         \
3308     else                                                                \
3309       EMIT_ONE_BYTE (ISO_CODE_SS2);                                     \
3310     CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
3311   } while (0)
3312
3313
3314 #define ENCODE_SINGLE_SHIFT_3                                           \
3315   do {                                                                  \
3316     if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)         \
3317       EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O');                         \
3318     else                                                                \
3319       EMIT_ONE_BYTE (ISO_CODE_SS3);                                     \
3320     CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
3321   } while (0)
3322
3323
3324 /* The following four macros produce codes (control character or
3325    escape sequence) for ISO2022 locking-shift functions (shift-in,
3326    shift-out, locking-shift-2, and locking-shift-3).  */
3327
3328 #define ENCODE_SHIFT_IN                                 \
3329   do {                                                  \
3330     EMIT_ONE_ASCII_BYTE (ISO_CODE_SI);                  \
3331     CODING_ISO_INVOCATION (coding, 0) = 0;              \
3332   } while (0)
3333
3334
3335 #define ENCODE_SHIFT_OUT                                \
3336   do {                                                  \
3337     EMIT_ONE_ASCII_BYTE (ISO_CODE_SO);                  \
3338     CODING_ISO_INVOCATION (coding, 0) = 1;              \
3339   } while (0)
3340
3341
3342 #define ENCODE_LOCKING_SHIFT_2                          \
3343   do {                                                  \
3344     EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');           \
3345     CODING_ISO_INVOCATION (coding, 0) = 2;              \
3346   } while (0)
3347
3348
3349 #define ENCODE_LOCKING_SHIFT_3                          \
3350   do {                                                  \
3351     EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');           \
3352     CODING_ISO_INVOCATION (coding, 0) = 3;              \
3353   } while (0)
3354
3355
3356 /* Produce codes for a DIMENSION1 character whose character set is
3357    CHARSET and whose position-code is C1.  Designation and invocation
3358    sequences are also produced in advance if necessary.  */
3359
3360 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
3361   do {                                                                  \
3362     int id = CHARSET_ID (charset);                                      \
3363                                                                         \
3364     if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)         \
3365         && id == charset_ascii)                                         \
3366       {                                                                 \
3367         id = charset_jisx0201_roman;                                    \
3368         charset = CHARSET_FROM_ID (id);                                 \
3369       }                                                                 \
3370                                                                         \
3371     if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
3372       {                                                                 \
3373         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
3374           EMIT_ONE_ASCII_BYTE (c1 & 0x7F);                              \
3375         else                                                            \
3376           EMIT_ONE_BYTE (c1 | 0x80);                                    \
3377         CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
3378         break;                                                          \
3379       }                                                                 \
3380     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
3381       {                                                                 \
3382         EMIT_ONE_ASCII_BYTE (c1 & 0x7F);                                \
3383         break;                                                          \
3384       }                                                                 \
3385     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
3386       {                                                                 \
3387         EMIT_ONE_BYTE (c1 | 0x80);                                      \
3388         break;                                                          \
3389       }                                                                 \
3390     else                                                                \
3391       /* Since CHARSET is not yet invoked to any graphic planes, we     \
3392          must invoke it, or, at first, designate it to some graphic     \
3393          register.  Then repeat the loop to actually produce the        \
3394          character.  */                                                 \
3395       dst = encode_invocation_designation (charset, coding, dst,        \
3396                                            &produced_chars);            \
3397   } while (1)
3398
3399
3400 /* Produce codes for a DIMENSION2 character whose character set is
3401    CHARSET and whose position-codes are C1 and C2.  Designation and
3402    invocation codes are also produced in advance if necessary.  */
3403
3404 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
3405   do {                                                                  \
3406     int id = CHARSET_ID (charset);                                      \
3407                                                                         \
3408     if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)        \
3409         && id == charset_jisx0208)                                      \
3410       {                                                                 \
3411         id = charset_jisx0208_1978;                                     \
3412         charset = CHARSET_FROM_ID (id);                                 \
3413       }                                                                 \
3414                                                                         \
3415     if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
3416       {                                                                 \
3417         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
3418           EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);              \
3419         else                                                            \
3420           EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                    \
3421         CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
3422         break;                                                          \
3423       }                                                                 \
3424     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
3425       {                                                                 \
3426         EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);                \
3427         break;                                                          \
3428       }                                                                 \
3429     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
3430       {                                                                 \
3431         EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                      \
3432         break;                                                          \
3433       }                                                                 \
3434     else                                                                \
3435       /* Since CHARSET is not yet invoked to any graphic planes, we     \
3436          must invoke it, or, at first, designate it to some graphic     \
3437          register.  Then repeat the loop to actually produce the        \
3438          character.  */                                                 \
3439       dst = encode_invocation_designation (charset, coding, dst,        \
3440                                            &produced_chars);            \
3441   } while (1)
3442
3443
3444 #define ENCODE_ISO_CHARACTER(charset, c)                                   \
3445   do {                                                                     \
3446     int code = ENCODE_CHAR ((charset),(c));                                \
3447                                                                            \
3448     if (CHARSET_DIMENSION (charset) == 1)                                  \
3449       ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code);                   \
3450     else                                                                   \
3451       ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
3452   } while (0)
3453
3454
3455 /* Produce designation and invocation codes at a place pointed by DST
3456    to use CHARSET.  The element `spec.iso_2022' of *CODING is updated.
3457    Return new DST.  */
3458
3459 unsigned char *
3460 encode_invocation_designation (charset, coding, dst, p_nchars)
3461      struct charset *charset;
3462      struct coding_system *coding;
3463      unsigned char *dst;
3464      int *p_nchars;
3465 {
3466   int multibytep = coding->dst_multibyte;
3467   int produced_chars = *p_nchars;
3468   int reg;                      /* graphic register number */
3469   int id = CHARSET_ID (charset);
3470
3471   /* At first, check designations.  */
3472   for (reg = 0; reg < 4; reg++)
3473     if (id == CODING_ISO_DESIGNATION (coding, reg))
3474       break;
3475
3476   if (reg >= 4)
3477     {
3478       /* CHARSET is not yet designated to any graphic registers.  */
3479       /* At first check the requested designation.  */
3480       reg = CODING_ISO_REQUEST (coding, id);
3481       if (reg < 0)
3482         /* Since CHARSET requests no special designation, designate it
3483            to graphic register 0.  */
3484         reg = 0;
3485
3486       ENCODE_DESIGNATION (charset, reg, coding);
3487     }
3488
3489   if (CODING_ISO_INVOCATION (coding, 0) != reg
3490       && CODING_ISO_INVOCATION (coding, 1) != reg)
3491     {
3492       /* Since the graphic register REG is not invoked to any graphic
3493          planes, invoke it to graphic plane 0.  */
3494       switch (reg)
3495         {
3496         case 0:                 /* graphic register 0 */
3497           ENCODE_SHIFT_IN;
3498           break;
3499
3500         case 1:                 /* graphic register 1 */
3501           ENCODE_SHIFT_OUT;
3502           break;
3503
3504         case 2:                 /* graphic register 2 */
3505           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3506             ENCODE_SINGLE_SHIFT_2;
3507           else
3508             ENCODE_LOCKING_SHIFT_2;
3509           break;
3510
3511         case 3:                 /* graphic register 3 */
3512           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3513             ENCODE_SINGLE_SHIFT_3;
3514           else
3515             ENCODE_LOCKING_SHIFT_3;
3516           break;
3517         }
3518     }
3519
3520   *p_nchars = produced_chars;
3521   return dst;
3522 }
3523
3524 /* The following three macros produce codes for indicating direction
3525    of text.  */
3526 #define ENCODE_CONTROL_SEQUENCE_INTRODUCER                              \
3527   do {                                                                  \
3528     if (CODING_ISO_FLAGS (coding) == CODING_ISO_FLAG_SEVEN_BITS)        \
3529       EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '[');                         \
3530     else                                                                \
3531       EMIT_ONE_BYTE (ISO_CODE_CSI);                                     \
3532   } while (0)
3533
3534
3535 #define ENCODE_DIRECTION_R2L()                  \
3536   do {                                          \
3537     ENCODE_CONTROL_SEQUENCE_INTRODUCER (dst);   \
3538     EMIT_TWO_ASCII_BYTES ('2', ']');            \
3539   } while (0)
3540
3541
3542 #define ENCODE_DIRECTION_L2R()                  \
3543   do {                                          \
3544     ENCODE_CONTROL_SEQUENCE_INTRODUCER (dst);   \
3545     EMIT_TWO_ASCII_BYTES ('0', ']');            \
3546   } while (0)
3547
3548
3549 /* Produce codes for designation and invocation to reset the graphic
3550    planes and registers to initial state.  */
3551 #define ENCODE_RESET_PLANE_AND_REGISTER()                               \
3552   do {                                                                  \
3553     int reg;                                                            \
3554     struct charset *charset;                                            \
3555                                                                         \
3556     if (CODING_ISO_INVOCATION (coding, 0) != 0)                         \
3557       ENCODE_SHIFT_IN;                                                  \
3558     for (reg = 0; reg < 4; reg++)                                       \
3559       if (CODING_ISO_INITIAL (coding, reg) >= 0                         \
3560           && (CODING_ISO_DESIGNATION (coding, reg)                      \
3561               != CODING_ISO_INITIAL (coding, reg)))                     \
3562         {                                                               \
3563           charset = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg)); \
3564           ENCODE_DESIGNATION (charset, reg, coding);                    \
3565         }                                                               \
3566   } while (0)
3567
3568
3569 /* Produce designation sequences of charsets in the line started from
3570    SRC to a place pointed by DST, and return updated DST.
3571
3572    If the current block ends before any end-of-line, we may fail to
3573    find all the necessary designations.  */
3574
3575 static unsigned char *
3576 encode_designation_at_bol (coding, charbuf, charbuf_end, dst)
3577      struct coding_system *coding;
3578      int *charbuf, *charbuf_end;
3579      unsigned char *dst;
3580 {
3581   struct charset *charset;
3582   /* Table of charsets to be designated to each graphic register.  */
3583   int r[4];
3584   int c, found = 0, reg;
3585   int produced_chars = 0;
3586   int multibytep = coding->dst_multibyte;
3587   Lisp_Object attrs;
3588   Lisp_Object charset_list;
3589
3590   attrs = CODING_ID_ATTRS (coding->id);
3591   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
3592   if (EQ (charset_list, Qiso_2022))
3593     charset_list = Viso_2022_charset_list;
3594
3595   for (reg = 0; reg < 4; reg++)
3596     r[reg] = -1;
3597
3598   while (found < 4)
3599     {
3600       int id;
3601
3602       c = *charbuf++;
3603       if (c == '\n')
3604         break;
3605       charset = char_charset (c, charset_list, NULL);
3606       id = CHARSET_ID (charset);
3607       reg = CODING_ISO_REQUEST (coding, id);
3608       if (reg >= 0 && r[reg] < 0)
3609         {
3610           found++;
3611           r[reg] = id;
3612         }
3613     }
3614
3615   if (found)
3616     {
3617       for (reg = 0; reg < 4; reg++)
3618         if (r[reg] >= 0
3619             && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
3620           ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
3621     }
3622
3623   return dst;
3624 }
3625
3626 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".  */
3627
3628 static int
3629 encode_coding_iso_2022 (coding)
3630      struct coding_system *coding;
3631 {
3632   int multibytep = coding->dst_multibyte;
3633   int *charbuf = coding->charbuf;
3634   int *charbuf_end = charbuf + coding->charbuf_used;
3635   unsigned char *dst = coding->destination + coding->produced;
3636   unsigned char *dst_end = coding->destination + coding->dst_bytes;
3637   int safe_room = 16;
3638   int bol_designation
3639     = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
3640        && CODING_ISO_BOL (coding));
3641   int produced_chars = 0;
3642   Lisp_Object attrs, eol_type, charset_list;
3643   int ascii_compatible;
3644   int c;
3645   int preferred_charset_id = -1;
3646
3647   CODING_GET_INFO (coding, attrs, eol_type, charset_list);
3648   setup_iso_safe_charsets (attrs);
3649   /* Charset list may have been changed.  */
3650   charset_list = CODING_ATTR_CHARSET_LIST (attrs);              \
3651   coding->safe_charsets
3652     = (char *) XSTRING (CODING_ATTR_SAFE_CHARSETS(attrs))->data;
3653
3654   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
3655
3656   while (charbuf < charbuf_end)
3657     {
3658       ASSURE_DESTINATION (safe_room);
3659
3660       if (bol_designation)
3661         {
3662           unsigned char *dst_prev = dst;
3663
3664           /* We have to produce designation sequences if any now.  */
3665           dst = encode_designation_at_bol (coding, charbuf, charbuf_end, dst);
3666           bol_designation = 0;
3667           /* We are sure that designation sequences are all ASCII bytes.  */
3668           produced_chars += dst - dst_prev;
3669         }
3670
3671       c = *charbuf++;
3672
3673       if (c < 0)
3674         {
3675           /* Handle an annotation.  */
3676           switch (*charbuf)
3677             {
3678             case CODING_ANNOTATE_COMPOSITION_MASK:
3679               /* Not yet implemented.  */
3680               break;
3681             case CODING_ANNOTATE_CHARSET_MASK:
3682               preferred_charset_id = charbuf[3];
3683               if (preferred_charset_id >= 0
3684                   && NILP (Fmemq (make_number (preferred_charset_id),
3685                                   charset_list)))
3686                 preferred_charset_id = -1;
3687               break;
3688             default:
3689               abort ();
3690             }
3691           charbuf += -c - 1;
3692           continue;
3693         }
3694
3695       /* Now encode the character C.  */
3696       if (c < 0x20 || c == 0x7F)
3697         {
3698           if (c == '\n'
3699               || (c == '\r' && EQ (eol_type, Qmac)))
3700             {
3701               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
3702                 ENCODE_RESET_PLANE_AND_REGISTER ();
3703               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
3704                 {
3705                   int i;
3706
3707                   for (i = 0; i < 4; i++)
3708                     CODING_ISO_DESIGNATION (coding, i)
3709                       = CODING_ISO_INITIAL (coding, i);
3710                 }
3711               bol_designation
3712                 = CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL;
3713             }
3714           else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
3715             ENCODE_RESET_PLANE_AND_REGISTER ();
3716           EMIT_ONE_ASCII_BYTE (c);
3717         }
3718       else if (ASCII_CHAR_P (c))
3719         {
3720           if (ascii_compatible)
3721             EMIT_ONE_ASCII_BYTE (c);
3722           else
3723             {
3724               struct charset *charset = CHARSET_FROM_ID (charset_ascii);
3725               ENCODE_ISO_CHARACTER (charset, c);
3726             }
3727         }
3728       else if (CHAR_BYTE8_P (c))
3729         {
3730           c = CHAR_TO_BYTE8 (c);
3731           EMIT_ONE_BYTE (c);
3732         }
3733       else
3734         {
3735           struct charset *charset;
3736
3737           if (preferred_charset_id >= 0)
3738             {
3739               charset = CHARSET_FROM_ID (preferred_charset_id);
3740               if (! CHAR_CHARSET_P (c, charset))
3741                 charset = char_charset (c, charset_list, NULL);
3742             }
3743           else
3744             charset = char_charset (c, charset_list, NULL);
3745           if (!charset)
3746             {
3747               if (coding->mode & CODING_MODE_SAFE_ENCODING)
3748                 {
3749                   c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
3750                   charset = CHARSET_FROM_ID (charset_ascii);
3751                 }
3752               else
3753                 {
3754                   c = coding->default_char;
3755                   charset = char_charset (c, charset_list, NULL);
3756                 }
3757             }
3758           ENCODE_ISO_CHARACTER (charset, c);
3759         }
3760     }
3761
3762   if (coding->mode & CODING_MODE_LAST_BLOCK
3763       && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
3764     {
3765       ASSURE_DESTINATION (safe_room);
3766       ENCODE_RESET_PLANE_AND_REGISTER ();
3767     }
3768   coding->result = CODING_RESULT_SUCCESS;
3769   CODING_ISO_BOL (coding) = bol_designation;
3770   coding->produced_char += produced_chars;
3771   coding->produced = dst - coding->destination;
3772   return 0;
3773 }
3774
3775 \f
3776 /*** 8,9. SJIS and BIG5 handlers ***/
3777
3778 /* Although SJIS and BIG5 are not ISO's coding system, they are used
3779    quite widely.  So, for the moment, Emacs supports them in the bare
3780    C code.  But, in the future, they may be supported only by CCL.  */
3781
3782 /* SJIS is a coding system encoding three character sets: ASCII, right
3783    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
3784    as is.  A character of charset katakana-jisx0201 is encoded by
3785    "position-code + 0x80".  A character of charset japanese-jisx0208
3786    is encoded in 2-byte but two position-codes are divided and shifted
3787    so that it fit in the range below.
3788
3789    --- CODE RANGE of SJIS ---
3790    (character set)      (range)
3791    ASCII                0x00 .. 0x7F
3792    KATAKANA-JISX0201    0xA0 .. 0xDF
3793    JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
3794             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
3795    -------------------------------
3796
3797 */
3798
3799 /* BIG5 is a coding system encoding two character sets: ASCII and
3800    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
3801    character set and is encoded in two-byte.
3802
3803    --- CODE RANGE of BIG5 ---
3804    (character set)      (range)
3805    ASCII                0x00 .. 0x7F
3806    Big5 (1st byte)      0xA1 .. 0xFE
3807         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
3808    --------------------------
3809
3810   */
3811
3812 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3813    Check if a text is encoded in SJIS.  If it is, return
3814    CATEGORY_MASK_SJIS, else return 0.  */
3815
3816 static int
3817 detect_coding_sjis (coding, detect_info)
3818      struct coding_system *coding;
3819      struct coding_detection_info *detect_info;
3820 {
3821   unsigned char *src = coding->source, *src_base = src;
3822   unsigned char *src_end = coding->source + coding->src_bytes;
3823   int multibytep = coding->src_multibyte;
3824   int consumed_chars = 0;
3825   int found = 0;
3826   int c;
3827   int incomplete;
3828
3829   detect_info->checked |= CATEGORY_MASK_SJIS;
3830   /* A coding system of this category is always ASCII compatible.  */
3831   src += coding->head_ascii;
3832
3833   while (1)
3834     {
3835       incomplete = 0;
3836       ONE_MORE_BYTE (c);
3837       incomplete = 1;
3838       if (c < 0x80)
3839         continue;
3840       if ((c >= 0x81 && c <= 0x9F) || (c >= 0xE0 && c <= 0xEF))
3841         {
3842           ONE_MORE_BYTE (c);
3843           if (c < 0x40 || c == 0x7F || c > 0xFC)
3844             break;
3845           found = CATEGORY_MASK_SJIS;
3846         }
3847       else if (c >= 0xA0 && c < 0xE0)
3848         found = CATEGORY_MASK_SJIS;
3849       else
3850         break;
3851     }
3852   detect_info->rejected |= CATEGORY_MASK_SJIS;
3853   return 0;
3854
3855  no_more_source:
3856   if (incomplete && coding->mode & CODING_MODE_LAST_BLOCK)
3857     {
3858       detect_info->rejected |= CATEGORY_MASK_SJIS;
3859       return 0;
3860     }
3861   detect_info->found |= found;
3862   return 1;
3863 }
3864
3865 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3866    Check if a text is encoded in BIG5.  If it is, return
3867    CATEGORY_MASK_BIG5, else return 0.  */
3868
3869 static int
3870 detect_coding_big5 (coding, detect_info)
3871      struct coding_system *coding;
3872      struct coding_detection_info *detect_info;
3873 {
3874   unsigned char *src = coding->source, *src_base = src;
3875   unsigned char *src_end = coding->source + coding->src_bytes;
3876   int multibytep = coding->src_multibyte;
3877   int consumed_chars = 0;
3878   int found = 0;
3879   int c;
3880   int incomplete;
3881
3882   detect_info->checked |= CATEGORY_MASK_BIG5;
3883   /* A coding system of this category is always ASCII compatible.  */
3884   src += coding->head_ascii;
3885
3886   while (1)
3887     {
3888       incomplete = 0;
3889       ONE_MORE_BYTE (c);
3890       incomplete = 1;
3891       if (c < 0x80)
3892         continue;
3893       if (c >= 0xA1)
3894         {
3895           ONE_MORE_BYTE (c);
3896           if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
3897             return 0;
3898           found = CATEGORY_MASK_BIG5;
3899         }
3900       else
3901         break;
3902     }
3903   detect_info->rejected |= CATEGORY_MASK_BIG5;
3904   return 0;
3905
3906  no_more_source:
3907   if (incomplete && coding->mode & CODING_MODE_LAST_BLOCK)
3908     {
3909       detect_info->rejected |= CATEGORY_MASK_BIG5;
3910       return 0;
3911     }
3912   detect_info->found |= found;
3913   return 1;
3914 }
3915
3916 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
3917    If SJIS_P is 1, decode SJIS text, else decode BIG5 test.  */
3918
3919 static void
3920 decode_coding_sjis (coding)
3921      struct coding_system *coding;
3922 {
3923   unsigned char *src = coding->source + coding->consumed;
3924   unsigned char *src_end = coding->source + coding->src_bytes;
3925   unsigned char *src_base;
3926   int *charbuf = coding->charbuf;
3927   int *charbuf_end = charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
3928   int consumed_chars = 0, consumed_chars_base;
3929   int multibytep = coding->src_multibyte;
3930   struct charset *charset_roman, *charset_kanji, *charset_kana;
3931   Lisp_Object attrs, eol_type, charset_list, val;
3932   int char_offset = coding->produced_char;
3933   int last_offset = char_offset;
3934   int last_id = charset_ascii;
3935
3936   CODING_GET_INFO (coding, attrs, eol_type, charset_list);
3937
3938   val = charset_list;
3939   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
3940   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
3941   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
3942
3943   while (1)
3944     {
3945       int c, c1;
3946
3947       src_base = src;
3948       consumed_chars_base = consumed_chars;
3949
3950       if (charbuf >= charbuf_end)
3951         break;
3952
3953       ONE_MORE_BYTE (c);
3954
3955       if (c == '\r')
3956         {
3957           if (EQ (eol_type, Qdos))
3958             {
3959               if (src == src_end)
3960                 {
3961                   coding->result = CODING_RESULT_INSUFFICIENT_SRC;
3962                   goto no_more_source;
3963                 }
3964               if (*src == '\n')
3965                 ONE_MORE_BYTE (c);
3966             }
3967           else if (EQ (eol_type, Qmac))
3968             c = '\n';
3969         }
3970       else
3971         {
3972           struct charset *charset;
3973
3974           if (c < 0x80)
3975             charset = charset_roman;
3976           else
3977             {
3978               if (c >= 0xF0)
3979                 goto invalid_code;
3980               if (c < 0xA0 || c >= 0xE0)
3981                 {
3982                   /* SJIS -> JISX0208 */
3983                   ONE_MORE_BYTE (c1);
3984                   if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
3985                     goto invalid_code;
3986                   c = (c << 8) | c1;
3987                   SJIS_TO_JIS (c);
3988                   charset = charset_kanji;
3989                 }
3990               else
3991                 {
3992                   /* SJIS -> JISX0201-Kana */
3993                   c &= 0x7F;
3994                   charset = charset_kana;
3995                 }
3996             }
3997           if (charset->id != charset_ascii
3998               && last_id != charset->id)
3999             {
4000               if (last_id != charset_ascii)
4001                 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4002               last_id = charset->id;
4003               last_offset = char_offset;
4004             }
4005           CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4006         }
4007       *charbuf++ = c;
4008       char_offset++;
4009       continue;
4010
4011     invalid_code:
4012       src = src_base;
4013       consumed_chars = consumed_chars_base;
4014       ONE_MORE_BYTE (c);
4015       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
4016       char_offset++;
4017       coding->errors++;
4018     }
4019
4020  no_more_source:
4021   if (last_id != charset_ascii)
4022     ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4023   coding->consumed_char += consumed_chars_base;
4024   coding->consumed = src_base - coding->source;
4025   coding->charbuf_used = charbuf - coding->charbuf;
4026 }
4027
4028 static void
4029 decode_coding_big5 (coding)
4030      struct coding_system *coding;
4031 {
4032   unsigned char *src = coding->source + coding->consumed;
4033   unsigned char *src_end = coding->source + coding->src_bytes;
4034   unsigned char *src_base;
4035   int *charbuf = coding->charbuf;
4036   int *charbuf_end = charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
4037   int consumed_chars = 0, consumed_chars_base;
4038   int multibytep = coding->src_multibyte;
4039   struct charset *charset_roman, *charset_big5;
4040   Lisp_Object attrs, eol_type, charset_list, val;
4041   int char_offset = coding->produced_char;
4042   int last_offset = char_offset;
4043   int last_id = charset_ascii;
4044
4045   CODING_GET_INFO (coding, attrs, eol_type, charset_list);
4046   val = charset_list;
4047   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4048   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4049
4050   while (1)
4051     {
4052       int c, c1;
4053
4054       src_base = src;
4055       consumed_chars_base = consumed_chars;
4056
4057       if (charbuf >= charbuf_end)
4058         break;
4059
4060       ONE_MORE_BYTE (c);
4061
4062       if (c == '\r')
4063         {
4064           if (EQ (eol_type, Qdos))
4065             {
4066               if (src == src_end)
4067                 {
4068                   coding->result = CODING_RESULT_INSUFFICIENT_SRC;
4069                   goto no_more_source;
4070                 }
4071               if (*src == '\n')
4072                 ONE_MORE_BYTE (c);
4073             }
4074           else if (EQ (eol_type, Qmac))
4075             c = '\n';
4076         }
4077       else
4078         {
4079           struct charset *charset;
4080           if (c < 0x80)
4081             charset = charset_roman;
4082           else
4083             {
4084               /* BIG5 -> Big5 */
4085               if (c < 0xA1 || c > 0xFE)
4086                 goto invalid_code;
4087               ONE_MORE_BYTE (c1);
4088               if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4089                 goto invalid_code;
4090               c = c << 8 | c1;
4091               charset = charset_big5;
4092             }
4093           if (charset->id != charset_ascii
4094               && last_id != charset->id)
4095             {
4096               if (last_id != charset_ascii)
4097                 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4098               last_id = charset->id;
4099               last_offset = char_offset;
4100             }
4101           CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4102         }
4103
4104       *charbuf++ = c;
4105       char_offset++;
4106       continue;
4107
4108     invalid_code:
4109       src = src_base;
4110       consumed_chars = consumed_chars_base;
4111       ONE_MORE_BYTE (c);
4112       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
4113       char_offset++;
4114       coding->errors++;
4115     }
4116
4117  no_more_source:
4118   if (last_id != charset_ascii)
4119     ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4120   coding->consumed_char += consumed_chars_base;
4121   coding->consumed = src_base - coding->source;
4122   coding->charbuf_used = charbuf - coding->charbuf;
4123 }
4124
4125 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4126    This function can encode charsets `ascii', `katakana-jisx0201',
4127    `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'.  We
4128    are sure that all these charsets are registered as official charset
4129    (i.e. do not have extended leading-codes).  Characters of other
4130    charsets are produced without any encoding.  If SJIS_P is 1, encode
4131    SJIS text, else encode BIG5 text.  */
4132
4133 static int
4134 encode_coding_sjis (coding)
4135      struct coding_system *coding;
4136 {
4137   int multibytep = coding->dst_multibyte;
4138   int *charbuf = coding->charbuf;
4139   int *charbuf_end = charbuf + coding->charbuf_used;
4140   unsigned char *dst = coding->destination + coding->produced;
4141   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4142   int safe_room = 4;
4143   int produced_chars = 0;
4144   Lisp_Object attrs, eol_type, charset_list, val;
4145   int ascii_compatible;
4146   struct charset *charset_roman, *charset_kanji, *charset_kana;
4147   int c;
4148
4149   CODING_GET_INFO (coding, attrs, eol_type, charset_list);
4150   val = charset_list;
4151   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4152   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4153   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
4154
4155   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4156
4157   while (charbuf < charbuf_end)
4158     {
4159       ASSURE_DESTINATION (safe_room);
4160       c = *charbuf++;
4161       /* Now encode the character C.  */
4162       if (ASCII_CHAR_P (c) && ascii_compatible)
4163         EMIT_ONE_ASCII_BYTE (c);
4164       else if (CHAR_BYTE8_P (c))
4165         {
4166           c = CHAR_TO_BYTE8 (c);
4167           EMIT_ONE_BYTE (c);
4168         }
4169       else
4170         {
4171           unsigned code;
4172           struct charset *charset = char_charset (c, charset_list, &code);
4173
4174           if (!charset)
4175             {
4176               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4177                 {
4178                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4179                   charset = CHARSET_FROM_ID (charset_ascii);
4180                 }
4181               else
4182                 {
4183                   c = coding->default_char;
4184                   charset = char_charset (c, charset_list, &code);
4185                 }
4186             }
4187           if (code == CHARSET_INVALID_CODE (charset))
4188             abort ();
4189           if (charset == charset_kanji)
4190             {
4191               int c1, c2;
4192               JIS_TO_SJIS (code);
4193               c1 = code >> 8, c2 = code & 0xFF;
4194               EMIT_TWO_BYTES (c1, c2);
4195             }
4196           else if (charset == charset_kana)
4197             EMIT_ONE_BYTE (code | 0x80);
4198           else
4199             EMIT_ONE_ASCII_BYTE (code & 0x7F);
4200         }
4201     }
4202   coding->result = CODING_RESULT_SUCCESS;
4203   coding->produced_char += produced_chars;
4204   coding->produced = dst - coding->destination;
4205   return 0;
4206 }
4207
4208 static int
4209 encode_coding_big5 (coding)
4210      struct coding_system *coding;
4211 {
4212   int multibytep = coding->dst_multibyte;
4213   int *charbuf = coding->charbuf;
4214   int *charbuf_end = charbuf + coding->charbuf_used;
4215   unsigned char *dst = coding->destination + coding->produced;
4216   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4217   int safe_room = 4;
4218   int produced_chars = 0;
4219   Lisp_Object attrs, eol_type, charset_list, val;
4220   int ascii_compatible;
4221   struct charset *charset_roman, *charset_big5;
4222   int c;
4223
4224   CODING_GET_INFO (coding, attrs, eol_type, charset_list);
4225   val = charset_list;
4226   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4227   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4228   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4229
4230   while (charbuf < charbuf_end)
4231     {
4232       ASSURE_DESTINATION (safe_room);
4233       c = *charbuf++;
4234       /* Now encode the character C.  */
4235       if (ASCII_CHAR_P (c) && ascii_compatible)
4236         EMIT_ONE_ASCII_BYTE (c);
4237       else if (CHAR_BYTE8_P (c))
4238         {
4239           c = CHAR_TO_BYTE8 (c);
4240           EMIT_ONE_BYTE (c);
4241         }
4242       else
4243         {
4244           unsigned code;
4245           struct charset *charset = char_charset (c, charset_list, &code);
4246
4247           if (! charset)
4248             {
4249               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4250                 {
4251                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4252                   charset = CHARSET_FROM_ID (charset_ascii);
4253                 }
4254               else
4255                 {
4256                   c = coding->default_char;
4257                   charset = char_charset (c, charset_list, &code);
4258                 }
4259             }
4260           if (code == CHARSET_INVALID_CODE (charset))
4261             abort ();
4262           if (charset == charset_big5)
4263             {
4264               int c1, c2;
4265
4266               c1 = code >> 8, c2 = code & 0xFF;
4267               EMIT_TWO_BYTES (c1, c2);
4268             }
4269           else
4270             EMIT_ONE_ASCII_BYTE (code & 0x7F);
4271         }
4272     }
4273   coding->result = CODING_RESULT_SUCCESS;
4274   coding->produced_char += produced_chars;
4275   coding->produced = dst - coding->destination;
4276   return 0;
4277 }
4278
4279 \f
4280 /*** 10. CCL handlers ***/
4281
4282 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4283    Check if a text is encoded in a coding system of which
4284    encoder/decoder are written in CCL program.  If it is, return
4285    CATEGORY_MASK_CCL, else return 0.  */
4286
4287 static int
4288 detect_coding_ccl (coding, detect_info)
4289      struct coding_system *coding;
4290      struct coding_detection_info *detect_info;
4291 {
4292   unsigned char *src = coding->source, *src_base = src;
4293   unsigned char *src_end = coding->source + coding->src_bytes;
4294   int multibytep = coding->src_multibyte;
4295   int consumed_chars = 0;
4296   int found = 0;
4297   unsigned char *valids = CODING_CCL_VALIDS (coding);
4298   int head_ascii = coding->head_ascii;
4299   Lisp_Object attrs;
4300
4301   detect_info->checked |= CATEGORY_MASK_CCL;
4302
4303   coding = &coding_categories[coding_category_ccl];
4304   attrs = CODING_ID_ATTRS (coding->id);
4305   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
4306     src += head_ascii;
4307
4308   while (1)
4309     {
4310       int c;
4311       ONE_MORE_BYTE (c);
4312       if (! valids[c])
4313         break;
4314       if ((valids[c] > 1))
4315         found = CATEGORY_MASK_CCL;
4316     }
4317   detect_info->rejected |= CATEGORY_MASK_CCL;
4318   return 0;
4319
4320  no_more_source:
4321   detect_info->found |= found;
4322   return 1;
4323 }
4324
4325 static void
4326 decode_coding_ccl (coding)
4327      struct coding_system *coding;
4328 {
4329   const unsigned char *src = coding->source + coding->consumed;
4330   unsigned char *src_end = coding->source + coding->src_bytes;
4331   int *charbuf = coding->charbuf;
4332   int *charbuf_end = charbuf + coding->charbuf_size;
4333   int consumed_chars = 0;
4334   int multibytep = coding->src_multibyte;
4335   struct ccl_program ccl;
4336   int source_charbuf[1024];
4337   int source_byteidx[1024];
4338   Lisp_Object attrs, eol_type, charset_list;
4339
4340   CODING_GET_INFO (coding, attrs, eol_type, charset_list);
4341   setup_ccl_program (&ccl, CODING_CCL_DECODER (coding));
4342
4343   while (src < src_end)
4344     {
4345       const unsigned char *p = src;
4346       int *source, *source_end;
4347       int i = 0;
4348
4349       if (multibytep)
4350         while (i < 1024 && p < src_end)
4351           {
4352             source_byteidx[i] = p - src;
4353             source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
4354           }
4355       else
4356         while (i < 1024 && p < src_end)
4357           source_charbuf[i++] = *p++;
4358
4359       if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
4360         ccl.last_block = 1;
4361
4362       source = source_charbuf;
4363       source_end = source + i;
4364       while (source < source_end)
4365         {
4366           ccl_driver (&ccl, source, charbuf,
4367                       source_end - source, charbuf_end - charbuf,
4368                       charset_list);
4369           source += ccl.consumed;
4370           charbuf += ccl.produced;
4371           if (ccl.status != CCL_STAT_SUSPEND_BY_DST)
4372             break;
4373         }
4374       if (source < source_end)
4375         src += source_byteidx[source - source_charbuf];
4376       else
4377         src = p;
4378       consumed_chars += source - source_charbuf;
4379
4380       if (ccl.status != CCL_STAT_SUSPEND_BY_SRC
4381           && ccl.status != CODING_RESULT_INSUFFICIENT_SRC)
4382         break;
4383     }
4384
4385   switch (ccl.status)
4386     {
4387     case CCL_STAT_SUSPEND_BY_SRC:
4388       coding->result = CODING_RESULT_INSUFFICIENT_SRC;
4389       break;
4390     case CCL_STAT_SUSPEND_BY_DST:
4391       break;
4392     case CCL_STAT_QUIT:
4393     case CCL_STAT_INVALID_CMD:
4394       coding->result = CODING_RESULT_INTERRUPT;
4395       break;
4396     default:
4397       coding->result = CODING_RESULT_SUCCESS;
4398       break;
4399     }
4400   coding->consumed_char += consumed_chars;
4401   coding->consumed = src - coding->source;
4402   coding->charbuf_used = charbuf - coding->charbuf;
4403 }
4404
4405 static int
4406 encode_coding_ccl (coding)
4407      struct coding_system *coding;
4408 {
4409   struct ccl_program ccl;
4410   int multibytep = coding->dst_multibyte;
4411   int *charbuf = coding->charbuf;
4412   int *charbuf_end = charbuf + coding->charbuf_used;
4413   unsigned char *dst = coding->destination + coding->produced;
4414   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4415   unsigned char *adjusted_dst_end = dst_end - 1;
4416   int destination_charbuf[1024];
4417   int i, produced_chars = 0;
4418   Lisp_Object attrs, eol_type, charset_list;
4419
4420   CODING_GET_INFO (coding, attrs, eol_type, charset_list);
4421   setup_ccl_program (&ccl, CODING_CCL_ENCODER (coding));
4422
4423   ccl.last_block = coding->mode & CODING_MODE_LAST_BLOCK;
4424   ccl.dst_multibyte = coding->dst_multibyte;
4425
4426   while (charbuf < charbuf_end && dst < adjusted_dst_end)
4427     {
4428       int dst_bytes = dst_end - dst;
4429       if (dst_bytes > 1024)
4430         dst_bytes = 1024;
4431
4432       ccl_driver (&ccl, charbuf, destination_charbuf,
4433                   charbuf_end - charbuf, dst_bytes, charset_list);
4434       charbuf += ccl.consumed;
4435       if (multibytep)
4436         for (i = 0; i < ccl.produced; i++)
4437           EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
4438       else
4439         {
4440           for (i = 0; i < ccl.produced; i++)
4441             *dst++ = destination_charbuf[i] & 0xFF;
4442           produced_chars += ccl.produced;
4443         }
4444     }
4445
4446   switch (ccl.status)
4447     {
4448     case CCL_STAT_SUSPEND_BY_SRC:
4449       coding->result = CODING_RESULT_INSUFFICIENT_SRC;
4450       break;
4451     case CCL_STAT_SUSPEND_BY_DST:
4452       coding->result = CODING_RESULT_INSUFFICIENT_DST;
4453       break;
4454     case CCL_STAT_QUIT:
4455     case CCL_STAT_INVALID_CMD:
4456       coding->result = CODING_RESULT_INTERRUPT;
4457       break;
4458     default:
4459       coding->result = CODING_RESULT_SUCCESS;
4460       break;
4461     }
4462
4463   coding->produced_char += produced_chars;
4464   coding->produced = dst - coding->destination;
4465   return 0;
4466 }
4467
4468
4469 \f
4470 /*** 10, 11. no-conversion handlers ***/
4471
4472 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
4473
4474 static void
4475 decode_coding_raw_text (coding)
4476      struct coding_system *coding;
4477 {
4478   coding->chars_at_source = 1;
4479   coding->consumed_char = 0;
4480   coding->consumed = 0;
4481   coding->result = CODING_RESULT_SUCCESS;
4482 }
4483
4484 static int
4485 encode_coding_raw_text (coding)
4486      struct coding_system *coding;
4487 {
4488   int multibytep = coding->dst_multibyte;
4489   int *charbuf = coding->charbuf;
4490   int *charbuf_end = coding->charbuf + coding->charbuf_used;
4491   unsigned char *dst = coding->destination + coding->produced;
4492   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4493   int produced_chars = 0;
4494   int c;
4495
4496   if (multibytep)
4497     {
4498       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
4499
4500       if (coding->src_multibyte)
4501         while (charbuf < charbuf_end)
4502           {
4503             ASSURE_DESTINATION (safe_room);
4504             c = *charbuf++;
4505             if (ASCII_CHAR_P (c))
4506               EMIT_ONE_ASCII_BYTE (c);
4507             else if (CHAR_BYTE8_P (c))
4508               {
4509                 c = CHAR_TO_BYTE8 (c);
4510                 EMIT_ONE_BYTE (c);
4511               }
4512             else
4513               {
4514                 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
4515
4516                 CHAR_STRING_ADVANCE (c, p1);
4517                 while (p0 < p1)
4518                   {
4519                     EMIT_ONE_BYTE (*p0);
4520                     p0++;
4521                   }
4522               }
4523           }
4524       else
4525         while (charbuf < charbuf_end)
4526           {
4527             ASSURE_DESTINATION (safe_room);
4528             c = *charbuf++;
4529             EMIT_ONE_BYTE (c);
4530           }
4531     }
4532   else
4533     {
4534       if (coding->src_multibyte)
4535         {
4536           int safe_room = MAX_MULTIBYTE_LENGTH;
4537
4538           while (charbuf < charbuf_end)
4539             {
4540               ASSURE_DESTINATION (safe_room);
4541               c = *charbuf++;
4542               if (ASCII_CHAR_P (c))
4543                 *dst++ = c;
4544               else if (CHAR_BYTE8_P (c))
4545                 *dst++ = CHAR_TO_BYTE8 (c);
4546               else
4547                 CHAR_STRING_ADVANCE (c, dst);
4548               produced_chars++;
4549             }
4550         }
4551       else
4552         {
4553           ASSURE_DESTINATION (charbuf_end - charbuf);
4554           while (charbuf < charbuf_end && dst < dst_end)
4555             *dst++ = *charbuf++;
4556           produced_chars = dst - (coding->destination + coding->dst_bytes);
4557         }
4558     }
4559   coding->result = CODING_RESULT_SUCCESS;
4560   coding->produced_char += produced_chars;
4561   coding->produced = dst - coding->destination;
4562   return 0;
4563 }
4564
4565 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4566    Check if a text is encoded in a charset-based coding system.  If it
4567    is, return 1, else return 0.  */
4568
4569 static int
4570 detect_coding_charset (coding, detect_info)
4571      struct coding_system *coding;
4572      struct coding_detection_info *detect_info;
4573 {
4574   unsigned char *src = coding->source, *src_base = src;
4575   unsigned char *src_end = coding->source + coding->src_bytes;
4576   int multibytep = coding->src_multibyte;
4577   int consumed_chars = 0;
4578   Lisp_Object attrs, valids;
4579   int found = 0;
4580
4581   detect_info->checked |= CATEGORY_MASK_CHARSET;
4582
4583   coding = &coding_categories[coding_category_charset];
4584   attrs = CODING_ID_ATTRS (coding->id);
4585   valids = AREF (attrs, coding_attr_charset_valids);
4586
4587   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
4588     src += coding->head_ascii;
4589
4590   while (1)
4591     {
4592       int c;
4593
4594       ONE_MORE_BYTE (c);
4595       if (NILP (AREF (valids, c)))
4596         break;
4597       if (c >= 0x80)
4598         found = CATEGORY_MASK_CHARSET;
4599     }
4600   detect_info->rejected |= CATEGORY_MASK_CHARSET;
4601   return 0;
4602
4603  no_more_source:
4604   detect_info->found |= found;
4605   return 1;
4606 }
4607
4608 static void
4609 decode_coding_charset (coding)
4610      struct coding_system *coding;
4611 {
4612   unsigned char *src = coding->source + coding->consumed;
4613   unsigned char *src_end = coding->source + coding->src_bytes;
4614   unsigned char *src_base;
4615   int *charbuf = coding->charbuf;
4616   int *charbuf_end = charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
4617   int consumed_chars = 0, consumed_chars_base;
4618   int multibytep = coding->src_multibyte;
4619   Lisp_Object attrs, eol_type, charset_list, valids;
4620   int char_offset = coding->produced_char;
4621   int last_offset = char_offset;
4622   int last_id = charset_ascii;
4623
4624   CODING_GET_INFO (coding, attrs, eol_type, charset_list);
4625   valids = AREF (attrs, coding_attr_charset_valids);
4626
4627   while (1)
4628     {
4629       int c;
4630
4631       src_base = src;
4632       consumed_chars_base = consumed_chars;
4633
4634       if (charbuf >= charbuf_end)
4635         break;
4636
4637       ONE_MORE_BYTE (c);
4638       if (c == '\r')
4639         {
4640           /* Here we assume that no charset maps '\r' to something
4641              else.  */
4642           if (EQ (eol_type, Qdos))
4643             {
4644               if (src == src_end)
4645                 {
4646                   coding->result = CODING_RESULT_INSUFFICIENT_SRC;
4647                   goto no_more_source;
4648                 }
4649               if (*src == '\n')
4650                 ONE_MORE_BYTE (c);
4651             }
4652           else if (EQ (eol_type, Qmac))
4653             c = '\n';
4654         }
4655       else
4656         {
4657           Lisp_Object val;
4658           struct charset *charset;
4659           int dim;
4660           int len = 1;
4661           unsigned code = c;
4662
4663           val = AREF (valids, c);
4664           if (NILP (val))
4665             goto invalid_code;
4666           if (INTEGERP (val))
4667             {
4668               charset = CHARSET_FROM_ID (XFASTINT (val));
4669               dim = CHARSET_DIMENSION (charset);
4670               while (len < dim)
4671                 {
4672                   ONE_MORE_BYTE (c);
4673                   code = (code << 8) | c;
4674                   len++;
4675                 }
4676               CODING_DECODE_CHAR (coding, src, src_base, src_end,
4677                                   charset, code, c);
4678             }
4679           else
4680             {
4681               /* VAL is a list of charset IDs.  It is assured that the
4682                  list is sorted by charset dimensions (smaller one
4683                  comes first).  */
4684               while (CONSP (val))
4685                 {
4686                   charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
4687                   dim = CHARSET_DIMENSION (charset);
4688                   while (len < dim)
4689                     {
4690                       ONE_MORE_BYTE (c);
4691                       code = (code << 8) | c;
4692                       len++;
4693                     }
4694                   CODING_DECODE_CHAR (coding, src, src_base,
4695                                       src_end, charset, code, c);
4696                   if (c >= 0)
4697                     break;
4698                   val = XCDR (val);
4699                 }
4700             }
4701           if (c < 0)
4702             goto invalid_code;
4703           if (charset->id != charset_ascii
4704               && last_id != charset->id)
4705             {
4706               if (last_id != charset_ascii)
4707                 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4708               last_id = charset->id;
4709               last_offset = char_offset;
4710             }
4711         }
4712       *charbuf++ = c;
4713       char_offset++;
4714       continue;
4715
4716     invalid_code:
4717       src = src_base;
4718       consumed_chars = consumed_chars_base;
4719       ONE_MORE_BYTE (c);
4720       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
4721       char_offset++;
4722       coding->errors++;
4723     }
4724
4725  no_more_source:
4726   if (last_id != charset_ascii)
4727     ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4728   coding->consumed_char += consumed_chars_base;
4729   coding->consumed = src_base - coding->source;
4730   coding->charbuf_used = charbuf - coding->charbuf;
4731 }
4732
4733 static int
4734 encode_coding_charset (coding)
4735      struct coding_system *coding;
4736 {
4737   int multibytep = coding->dst_multibyte;
4738   int *charbuf = coding->charbuf;
4739   int *charbuf_end = charbuf + coding->charbuf_used;
4740   unsigned char *dst = coding->destination + coding->produced;
4741   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4742   int safe_room = MAX_MULTIBYTE_LENGTH;
4743   int produced_chars = 0;
4744   Lisp_Object attrs, eol_type, charset_list;
4745   int ascii_compatible;
4746   int c;
4747
4748   CODING_GET_INFO (coding, attrs, eol_type, charset_list);
4749   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4750
4751   while (charbuf < charbuf_end)
4752     {
4753       struct charset *charset;
4754       unsigned code;
4755
4756       ASSURE_DESTINATION (safe_room);
4757       c = *charbuf++;
4758       if (ascii_compatible && ASCII_CHAR_P (c))
4759         EMIT_ONE_ASCII_BYTE (c);
4760       else if (CHAR_BYTE8_P (c))
4761         {
4762           c = CHAR_TO_BYTE8 (c);
4763           EMIT_ONE_BYTE (c);
4764         }
4765       else
4766         {
4767           charset = char_charset (c, charset_list, &code);
4768           if (charset)
4769             {
4770               if (CHARSET_DIMENSION (charset) == 1)
4771                 EMIT_ONE_BYTE (code);
4772               else if (CHARSET_DIMENSION (charset) == 2)
4773                 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
4774               else if (CHARSET_DIMENSION (charset) == 3)
4775                 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
4776               else
4777                 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
4778                                  (code >> 8) & 0xFF, code & 0xFF);
4779             }
4780           else
4781             {
4782               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4783                 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4784               else
4785                 c = coding->default_char;
4786               EMIT_ONE_BYTE (c);
4787             }
4788         }
4789     }
4790
4791   coding->result = CODING_RESULT_SUCCESS;
4792   coding->produced_char += produced_chars;
4793   coding->produced = dst - coding->destination;
4794   return 0;
4795 }
4796
4797 \f
4798 /*** 7. C library functions ***/
4799
4800 /* Setup coding context CODING from information about CODING_SYSTEM.
4801    If CODING_SYSTEM is nil, `no-conversion' is assumed.  If
4802    CODING_SYSTEM is invalid, signal an error.  */
4803
4804 void
4805 setup_coding_system (coding_system, coding)
4806      Lisp_Object coding_system;
4807      struct coding_system *coding;
4808 {
4809   Lisp_Object attrs;
4810   Lisp_Object eol_type;
4811   Lisp_Object coding_type;
4812   Lisp_Object val;
4813
4814   if (NILP (coding_system))
4815     coding_system = Qno_conversion;
4816
4817   CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
4818
4819   attrs = CODING_ID_ATTRS (coding->id);
4820   eol_type = CODING_ID_EOL_TYPE (coding->id);
4821
4822   coding->mode = 0;
4823   coding->head_ascii = -1;
4824   coding->common_flags
4825     = (VECTORP (eol_type) ? CODING_REQUIRE_DETECTION_MASK : 0);
4826
4827   val = CODING_ATTR_SAFE_CHARSETS (attrs);
4828   coding->max_charset_id = XSTRING (val)->size - 1;
4829   coding->safe_charsets = (char *) XSTRING (val)->data;
4830   coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
4831
4832   coding_type = CODING_ATTR_TYPE (attrs);
4833   if (EQ (coding_type, Qundecided))
4834     {
4835       coding->detector = NULL;
4836       coding->decoder = decode_coding_raw_text;
4837       coding->encoder = encode_coding_raw_text;
4838       coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
4839     }
4840   else if (EQ (coding_type, Qiso_2022))
4841     {
4842       int i;
4843       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
4844
4845       /* Invoke graphic register 0 to plane 0.  */
4846       CODING_ISO_INVOCATION (coding, 0) = 0;
4847       /* Invoke graphic register 1 to plane 1 if we can use 8-bit.  */
4848       CODING_ISO_INVOCATION (coding, 1)
4849         = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
4850       /* Setup the initial status of designation.  */
4851       for (i = 0; i < 4; i++)
4852         CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
4853       /* Not single shifting initially.  */
4854       CODING_ISO_SINGLE_SHIFTING (coding) = 0;
4855       /* Beginning of buffer should also be regarded as bol. */
4856       CODING_ISO_BOL (coding) = 1;
4857       coding->detector = detect_coding_iso_2022;
4858       coding->decoder = decode_coding_iso_2022;
4859       coding->encoder = encode_coding_iso_2022;
4860       if (flags & CODING_ISO_FLAG_SAFE)
4861         coding->mode |= CODING_MODE_SAFE_ENCODING;
4862       coding->common_flags
4863         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
4864             | CODING_REQUIRE_FLUSHING_MASK);
4865       if (flags & CODING_ISO_FLAG_COMPOSITION)
4866         coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
4867       if (flags & CODING_ISO_FLAG_DESIGNATION)
4868         coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
4869       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
4870         {
4871           setup_iso_safe_charsets (attrs);
4872           val = CODING_ATTR_SAFE_CHARSETS (attrs);
4873           coding->max_charset_id = XSTRING (val)->size - 1;
4874           coding->safe_charsets = (char *) XSTRING (val)->data;
4875         }
4876       CODING_ISO_FLAGS (coding) = flags;
4877     }
4878   else if (EQ (coding_type, Qcharset))
4879     {
4880       coding->detector = detect_coding_charset;
4881       coding->decoder = decode_coding_charset;
4882       coding->encoder = encode_coding_charset;
4883       coding->common_flags
4884         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4885     }
4886   else if (EQ (coding_type, Qutf_8))
4887     {
4888       coding->detector = detect_coding_utf_8;
4889       coding->decoder = decode_coding_utf_8;
4890       coding->encoder = encode_coding_utf_8;
4891       coding->common_flags
4892         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4893     }
4894   else if (EQ (coding_type, Qutf_16))
4895     {
4896       val = AREF (attrs, coding_attr_utf_16_bom);
4897       CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_16_detect_bom
4898                                     : EQ (val, Qt) ? utf_16_with_bom
4899                                     : utf_16_without_bom);
4900       val = AREF (attrs, coding_attr_utf_16_endian);
4901       CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
4902                                        : utf_16_little_endian);
4903       CODING_UTF_16_SURROGATE (coding) = 0;
4904       coding->detector = detect_coding_utf_16;
4905       coding->decoder = decode_coding_utf_16;
4906       coding->encoder = encode_coding_utf_16;
4907       coding->common_flags
4908         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4909       if (CODING_UTF_16_BOM (coding) == utf_16_detect_bom)
4910         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
4911     }
4912   else if (EQ (coding_type, Qccl))
4913     {
4914       coding->detector = detect_coding_ccl;
4915       coding->decoder = decode_coding_ccl;
4916       coding->encoder = encode_coding_ccl;
4917       coding->common_flags
4918         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
4919             | CODING_REQUIRE_FLUSHING_MASK);
4920     }
4921   else if (EQ (coding_type, Qemacs_mule))
4922     {
4923       coding->detector = detect_coding_emacs_mule;
4924       coding->decoder = decode_coding_emacs_mule;
4925       coding->encoder = encode_coding_emacs_mule;
4926       coding->common_flags
4927         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4928       if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
4929           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
4930         {
4931           Lisp_Object tail, safe_charsets;
4932           int max_charset_id = 0;
4933
4934           for (tail = Vemacs_mule_charset_list; CONSP (tail);
4935                tail = XCDR (tail))
4936             if (max_charset_id < XFASTINT (XCAR (tail)))
4937               max_charset_id = XFASTINT (XCAR (tail));
4938           safe_charsets = Fmake_string (make_number (max_charset_id + 1),
4939                                         make_number (255));
4940           for (tail = Vemacs_mule_charset_list; CONSP (tail);
4941                tail = XCDR (tail))
4942             XSTRING (safe_charsets)->data[XFASTINT (XCAR (tail))] = 0;
4943           coding->max_charset_id = max_charset_id;
4944           coding->safe_charsets = (char *) XSTRING (safe_charsets)->data;
4945         }
4946     }
4947   else if (EQ (coding_type, Qshift_jis))
4948     {
4949       coding->detector = detect_coding_sjis;
4950       coding->decoder = decode_coding_sjis;
4951       coding->encoder = encode_coding_sjis;
4952       coding->common_flags
4953         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4954     }
4955   else if (EQ (coding_type, Qbig5))
4956     {
4957       coding->detector = detect_coding_big5;
4958       coding->decoder = decode_coding_big5;
4959       coding->encoder = encode_coding_big5;
4960       coding->common_flags
4961         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4962     }
4963   else                          /* EQ (coding_type, Qraw_text) */
4964     {
4965       coding->detector = NULL;
4966       coding->decoder = decode_coding_raw_text;
4967       coding->encoder = encode_coding_raw_text;
4968       coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
4969     }
4970
4971   return;
4972 }
4973
4974 /* Return raw-text or one of its subsidiaries that has the same
4975    eol_type as CODING-SYSTEM.  */
4976
4977 Lisp_Object
4978 raw_text_coding_system (coding_system)
4979      Lisp_Object coding_system;
4980 {
4981   Lisp_Object spec, attrs;
4982   Lisp_Object eol_type, raw_text_eol_type;
4983
4984   spec = CODING_SYSTEM_SPEC (coding_system);
4985   attrs = AREF (spec, 0);
4986
4987   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
4988     return coding_system;
4989
4990   eol_type = AREF (spec, 2);
4991   if (VECTORP (eol_type))
4992     return Qraw_text;
4993   spec = CODING_SYSTEM_SPEC (Qraw_text);
4994   raw_text_eol_type = AREF (spec, 2);
4995   return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
4996           : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
4997           : AREF (raw_text_eol_type, 2));
4998 }
4999
5000
5001 /* If CODING_SYSTEM doesn't specify end-of-line format but PARENT
5002    does, return one of the subsidiary that has the same eol-spec as
5003    PARENT.  Otherwise, return CODING_SYSTEM.  */
5004
5005 Lisp_Object
5006 coding_inherit_eol_type (coding_system, parent)
5007      Lisp_Object coding_system, parent;
5008 {
5009   Lisp_Object spec, attrs, eol_type;
5010
5011   spec = CODING_SYSTEM_SPEC (coding_system);
5012   attrs = AREF (spec, 0);
5013   eol_type = AREF (spec, 2);
5014   if (VECTORP (eol_type))
5015     {
5016       Lisp_Object parent_spec;
5017       Lisp_Object parent_eol_type;
5018
5019       parent_spec
5020         = CODING_SYSTEM_SPEC (buffer_defaults.buffer_file_coding_system);
5021       parent_eol_type = AREF (parent_spec, 2);
5022       if (EQ (parent_eol_type, Qunix))
5023         coding_system = AREF (eol_type, 0);
5024       else if (EQ (parent_eol_type, Qdos))
5025         coding_system = AREF (eol_type, 1);
5026       else if (EQ (parent_eol_type, Qmac))
5027         coding_system = AREF (eol_type, 2);
5028     }
5029   return coding_system;
5030 }
5031
5032 /* Emacs has a mechanism to automatically detect a coding system if it
5033    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
5034    it's impossible to distinguish some coding systems accurately
5035    because they use the same range of codes.  So, at first, coding
5036    systems are categorized into 7, those are:
5037
5038    o coding-category-emacs-mule
5039
5040         The category for a coding system which has the same code range
5041         as Emacs' internal format.  Assigned the coding-system (Lisp
5042         symbol) `emacs-mule' by default.
5043
5044    o coding-category-sjis
5045
5046         The category for a coding system which has the same code range
5047         as SJIS.  Assigned the coding-system (Lisp
5048         symbol) `japanese-shift-jis' by default.
5049
5050    o coding-category-iso-7
5051
5052         The category for a coding system which has the same code range
5053         as ISO2022 of 7-bit environment.  This doesn't use any locking
5054         shift and single shift functions.  This can encode/decode all
5055         charsets.  Assigned the coding-system (Lisp symbol)
5056         `iso-2022-7bit' by default.
5057
5058    o coding-category-iso-7-tight
5059
5060         Same as coding-category-iso-7 except that this can
5061         encode/decode only the specified charsets.
5062
5063    o coding-category-iso-8-1
5064
5065         The category for a coding system which has the same code range
5066         as ISO2022 of 8-bit environment and graphic plane 1 used only
5067         for DIMENSION1 charset.  This doesn't use any locking shift
5068         and single shift functions.  Assigned the coding-system (Lisp
5069         symbol) `iso-latin-1' by default.
5070
5071    o coding-category-iso-8-2
5072
5073         The category for a coding system which has the same code range
5074         as ISO2022 of 8-bit environment and graphic plane 1 used only
5075         for DIMENSION2 charset.  This doesn't use any locking shift
5076         and single shift functions.  Assigned the coding-system (Lisp
5077         symbol) `japanese-iso-8bit' by default.
5078
5079    o coding-category-iso-7-else
5080
5081         The category for a coding system which has the same code range
5082         as ISO2022 of 7-bit environemnt but uses locking shift or
5083         single shift functions.  Assigned the coding-system (Lisp
5084         symbol) `iso-2022-7bit-lock' by default.
5085
5086    o coding-category-iso-8-else
5087
5088         The category for a coding system which has the same code range
5089         as ISO2022 of 8-bit environemnt but uses locking shift or
5090         single shift functions.  Assigned the coding-system (Lisp
5091         symbol) `iso-2022-8bit-ss2' by default.
5092
5093    o coding-category-big5
5094
5095         The category for a coding system which has the same code range
5096         as BIG5.  Assigned the coding-system (Lisp symbol)
5097         `cn-big5' by default.
5098
5099    o coding-category-utf-8
5100
5101         The category for a coding system which has the same code range
5102         as UTF-8 (cf. RFC2279).  Assigned the coding-system (Lisp
5103         symbol) `utf-8' by default.
5104
5105    o coding-category-utf-16-be
5106
5107         The category for a coding system in which a text has an
5108         Unicode signature (cf. Unicode Standard) in the order of BIG
5109         endian at the head.  Assigned the coding-system (Lisp symbol)
5110         `utf-16-be' by default.
5111
5112    o coding-category-utf-16-le
5113
5114         The category for a coding system in which a text has an
5115         Unicode signature (cf. Unicode Standard) in the order of
5116         LITTLE endian at the head.  Assigned the coding-system (Lisp
5117         symbol) `utf-16-le' by default.
5118
5119    o coding-category-ccl
5120
5121         The category for a coding system of which encoder/decoder is
5122         written in CCL programs.  The default value is nil, i.e., no
5123         coding system is assigned.
5124
5125    o coding-category-binary
5126
5127         The category for a coding system not categorized in any of the
5128         above.  Assigned the coding-system (Lisp symbol)
5129         `no-conversion' by default.
5130
5131    Each of them is a Lisp symbol and the value is an actual
5132    `coding-system's (this is also a Lisp symbol) assigned by a user.
5133    What Emacs does actually is to detect a category of coding system.
5134    Then, it uses a `coding-system' assigned to it.  If Emacs can't
5135    decide only one possible category, it selects a category of the
5136    highest priority.  Priorities of categories are also specified by a
5137    user in a Lisp variable `coding-category-list'.
5138
5139 */
5140
5141 #define EOL_SEEN_NONE   0
5142 #define EOL_SEEN_LF     1
5143 #define EOL_SEEN_CR     2
5144 #define EOL_SEEN_CRLF   4
5145
5146 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
5147    SOURCE is encoded.  If CATEGORY is one of
5148    coding_category_utf_16_XXXX, assume that CR and LF are encoded by
5149    two-byte, else they are encoded by one-byte.
5150
5151    Return one of EOL_SEEN_XXX.  */
5152
5153 #define MAX_EOL_CHECK_COUNT 3
5154
5155 static int
5156 detect_eol (source, src_bytes, category)
5157      unsigned char *source;
5158      EMACS_INT src_bytes;
5159      enum coding_category category;
5160 {
5161   unsigned char *src = source, *src_end = src + src_bytes;
5162   unsigned char c;
5163   int total  = 0;
5164   int eol_seen = EOL_SEEN_NONE;
5165
5166   if ((1 << category) & CATEGORY_MASK_UTF_16)
5167     {
5168       int msb, lsb;
5169
5170       msb = category == (coding_category_utf_16_le
5171                          | coding_category_utf_16_le_nosig);
5172       lsb = 1 - msb;
5173
5174       while (src + 1 < src_end)
5175         {
5176           c = src[lsb];
5177           if (src[msb] == 0 && (c == '\n' || c == '\r'))
5178             {
5179               int this_eol;
5180
5181               if (c == '\n')
5182                 this_eol = EOL_SEEN_LF;
5183               else if (src + 3 >= src_end
5184                        || src[msb + 2] != 0
5185                        || src[lsb + 2] != '\n')
5186                 this_eol = EOL_SEEN_CR;
5187               else
5188                 this_eol = EOL_SEEN_CRLF;
5189
5190               if (eol_seen == EOL_SEEN_NONE)
5191                 /* This is the first end-of-line.  */
5192                 eol_seen = this_eol;
5193               else if (eol_seen != this_eol)
5194                 {
5195                   /* The found type is different from what found before.  */
5196                   eol_seen = EOL_SEEN_LF;
5197                   break;
5198                 }
5199               if (++total == MAX_EOL_CHECK_COUNT)
5200                 break;
5201             }
5202           src += 2;
5203         }
5204     }
5205   else
5206     {
5207       while (src < src_end)
5208         {
5209           c = *src++;
5210           if (c == '\n' || c == '\r')
5211             {
5212               int this_eol;
5213
5214               if (c == '\n')
5215                 this_eol = EOL_SEEN_LF;
5216               else if (src >= src_end || *src != '\n')
5217                 this_eol = EOL_SEEN_CR;
5218               else
5219                 this_eol = EOL_SEEN_CRLF, src++;
5220
5221               if (eol_seen == EOL_SEEN_NONE)
5222                 /* This is the first end-of-line.  */
5223                 eol_seen = this_eol;
5224               else if (eol_seen != this_eol)
5225                 {
5226                   /* The found type is different from what found before.  */
5227                   eol_seen = EOL_SEEN_LF;
5228                   break;
5229                 }
5230               if (++total == MAX_EOL_CHECK_COUNT)
5231                 break;
5232             }
5233         }
5234     }
5235   return eol_seen;
5236 }
5237
5238
5239 static void
5240 adjust_coding_eol_type (coding, eol_seen)
5241      struct coding_system *coding;
5242      int eol_seen;
5243 {
5244   Lisp_Object eol_type;
5245
5246   eol_type = CODING_ID_EOL_TYPE (coding->id);
5247   if (eol_seen & EOL_SEEN_LF)
5248     coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
5249   else if (eol_seen & EOL_SEEN_CRLF)
5250     coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
5251   else if (eol_seen & EOL_SEEN_CR)
5252     coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
5253 }
5254
5255 /* Detect how a text specified in CODING is encoded.  If a coding
5256    system is detected, update fields of CODING by the detected coding
5257    system.  */
5258
5259 void
5260 detect_coding (coding)
5261      struct coding_system *coding;
5262 {
5263   unsigned char *src, *src_end;
5264   Lisp_Object attrs, coding_type;
5265
5266   coding->consumed = coding->consumed_char = 0;
5267   coding->produced = coding->produced_char = 0;
5268   coding_set_source (coding);
5269
5270   src_end = coding->source + coding->src_bytes;
5271
5272   /* If we have not yet decided the text encoding type, detect it
5273      now.  */
5274   if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
5275     {
5276       int c, i;
5277
5278       for (src = coding->source; src < src_end; src++)
5279         {
5280           c = *src;
5281           if (c & 0x80 || (c < 0x20 && (c == ISO_CODE_ESC
5282                                         || c == ISO_CODE_SI
5283                                         || c == ISO_CODE_SO)))
5284             break;
5285         }
5286       coding->head_ascii = src - (coding->source + coding->consumed);
5287
5288       if (coding->head_ascii < coding->src_bytes)
5289         {
5290           struct coding_detection_info detect_info;
5291           enum coding_category category;
5292           struct coding_system *this;
5293
5294           detect_info.checked = detect_info.found = detect_info.rejected = 0;
5295           for (i = 0; i < coding_category_raw_text; i++)
5296             {
5297               category = coding_priorities[i];
5298               this = coding_categories + category;
5299               if (this->id < 0)
5300                 {
5301                   /* No coding system of this category is defined.  */
5302                   detect_info.rejected |= (1 << category);
5303                 }
5304               else if (category >= coding_category_raw_text)
5305                 continue;
5306               else if (detect_info.checked & (1 << category))
5307                 {
5308                   if (detect_info.found & (1 << category))
5309                     break;
5310                 }
5311               else if ((*(this->detector)) (coding, &detect_info)
5312                        && detect_info.found & (1 << category))
5313                 break;
5314             }
5315           if (i < coding_category_raw_text)
5316             setup_coding_system (CODING_ID_NAME (this->id), coding);
5317           else if (detect_info.rejected == CATEGORY_MASK_ANY)
5318             setup_coding_system (Qraw_text, coding);
5319           else if (detect_info.rejected)
5320             for (i = 0; i < coding_category_raw_text; i++)
5321               if (! (detect_info.rejected & (1 << coding_priorities[i])))
5322                 {
5323                   this = coding_categories + coding_priorities[i];
5324                   setup_coding_system (CODING_ID_NAME (this->id), coding);
5325                   break;
5326                 }
5327         }
5328     }
5329   else if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qutf_16))
5330     {
5331       Lisp_Object coding_systems;
5332       struct coding_detection_info detect_info;
5333
5334       coding_systems
5335         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_16_bom);
5336       detect_info.found = detect_info.rejected = 0;
5337       if (CONSP (coding_systems)
5338           && detect_coding_utf_16 (coding, &detect_info)
5339           && (detect_info.found & (CATEGORY_MASK_UTF_16_LE
5340                                    | CATEGORY_MASK_UTF_16_BE)))
5341         {
5342           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
5343             setup_coding_system (XCAR (coding_systems), coding);
5344           else
5345             setup_coding_system (XCDR (coding_systems), coding);
5346         }
5347     }
5348
5349   attrs = CODING_ID_ATTRS (coding->id);
5350   coding_type = CODING_ATTR_TYPE (attrs);
5351
5352   /* If we have not yet decided the EOL type, detect it now.  But, the
5353      detection is impossible for a CCL based coding system, in which
5354      case, we detct the EOL type after decoding.  */
5355   if (VECTORP (CODING_ID_EOL_TYPE (coding->id))
5356       && ! EQ (coding_type, Qccl))
5357     {
5358       int eol_seen = detect_eol (coding->source, coding->src_bytes,
5359                                  XINT (CODING_ATTR_CATEGORY (attrs)));
5360
5361       if (eol_seen != EOL_SEEN_NONE)
5362         adjust_coding_eol_type (coding, eol_seen);
5363     }
5364 }
5365
5366
5367 static void
5368 decode_eol (coding)
5369      struct coding_system *coding;
5370 {
5371   if (VECTORP (CODING_ID_EOL_TYPE (coding->id)))
5372     {
5373       unsigned char *p = CHAR_POS_ADDR (coding->dst_pos);
5374       unsigned char *pend = p + coding->produced;
5375       int eol_seen = EOL_SEEN_NONE;
5376
5377       for (; p < pend; p++)
5378         {
5379           if (*p == '\n')
5380             eol_seen |= EOL_SEEN_LF;
5381           else if (*p == '\r')
5382             {
5383               if (p + 1 < pend && *(p + 1) == '\n')
5384                 {
5385                   eol_seen |= EOL_SEEN_CRLF;
5386                   p++;
5387                 }
5388               else
5389                 eol_seen |= EOL_SEEN_CR;
5390             }
5391         }
5392       if (eol_seen != EOL_SEEN_NONE)
5393         adjust_coding_eol_type (coding, eol_seen);
5394     }
5395
5396   if (EQ (CODING_ID_EOL_TYPE (coding->id), Qmac))
5397     {
5398       unsigned char *p = CHAR_POS_ADDR (coding->dst_pos);
5399       unsigned char *pend = p + coding->produced;
5400
5401       for (; p < pend; p++)
5402         if (*p == '\r')
5403           *p = '\n';
5404     }
5405   else if (EQ (CODING_ID_EOL_TYPE (coding->id), Qdos))
5406     {
5407       unsigned char *p, *pbeg, *pend;
5408       Lisp_Object undo_list;
5409
5410       move_gap_both (coding->dst_pos + coding->produced_char,
5411                      coding->dst_pos_byte + coding->produced);
5412       undo_list = current_buffer->undo_list;
5413       current_buffer->undo_list = Qt;
5414       del_range_2 (coding->dst_pos, coding->dst_pos_byte, GPT, GPT_BYTE, 0);
5415       current_buffer->undo_list = undo_list;
5416       pbeg = GPT_ADDR;
5417       pend = pbeg + coding->produced;
5418
5419       for (p = pend - 1; p >= pbeg; p--)
5420         if (*p == '\r')
5421           {
5422             safe_bcopy ((char *) (p + 1), (char *) p, pend - p - 1);
5423             pend--;
5424           }
5425       coding->produced_char -= coding->produced - (pend - pbeg);
5426       coding->produced = pend - pbeg;
5427       insert_from_gap (coding->produced_char, coding->produced);
5428     }
5429 }
5430
5431 static void
5432 translate_chars (coding, table)
5433      struct coding_system *coding;
5434      Lisp_Object table;
5435 {
5436   int *charbuf = coding->charbuf;
5437   int *charbuf_end = charbuf + coding->charbuf_used;
5438   int c;
5439
5440   if (coding->chars_at_source)
5441     return;
5442
5443   while (charbuf < charbuf_end)
5444     {
5445       c = *charbuf;
5446       if (c < 0)
5447         charbuf += c;
5448       else
5449         *charbuf++ = translate_char (table, c);
5450     }
5451 }
5452
5453 static int
5454 produce_chars (coding)
5455      struct coding_system *coding;
5456 {
5457   unsigned char *dst = coding->destination + coding->produced;
5458   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5459   int produced;
5460   int produced_chars = 0;
5461
5462   if (! coding->chars_at_source)
5463     {
5464       /* Characters are in coding->charbuf.  */
5465       int *buf = coding->charbuf;
5466       int *buf_end = buf + coding->charbuf_used;
5467       unsigned char *adjusted_dst_end;
5468
5469       if (BUFFERP (coding->src_object)
5470           && EQ (coding->src_object, coding->dst_object))
5471         dst_end = coding->source + coding->consumed;
5472       adjusted_dst_end = dst_end - MAX_MULTIBYTE_LENGTH;
5473
5474       while (buf < buf_end)
5475         {
5476           int c = *buf++;
5477
5478           if (dst >= adjusted_dst_end)
5479             {
5480               dst = alloc_destination (coding,
5481                                        buf_end - buf + MAX_MULTIBYTE_LENGTH,
5482                                        dst);
5483               dst_end = coding->destination + coding->dst_bytes;
5484               adjusted_dst_end = dst_end - MAX_MULTIBYTE_LENGTH;
5485             }
5486           if (c >= 0)
5487             {
5488               if (coding->dst_multibyte
5489                   || ! CHAR_BYTE8_P (c))
5490                 CHAR_STRING_ADVANCE (c, dst);
5491               else
5492                 *dst++ = CHAR_TO_BYTE8 (c);
5493               produced_chars++;
5494             }
5495           else
5496             /* This is an annotation datum.  */
5497             buf -= c + 1;
5498         }
5499     }
5500   else
5501     {
5502       unsigned char *src = coding->source;
5503       unsigned char *src_end = src + coding->src_bytes;
5504       Lisp_Object eol_type;
5505
5506       eol_type = CODING_ID_EOL_TYPE (coding->id);
5507
5508       if (coding->src_multibyte != coding->dst_multibyte)
5509         {
5510           if (coding->src_multibyte)
5511             {
5512               int multibytep = 1;
5513               int consumed_chars;
5514
5515               while (1)
5516                 {
5517                   unsigned char *src_base = src;
5518                   int c;
5519
5520                   ONE_MORE_BYTE (c);
5521                   if (c == '\r')
5522                     {
5523                       if (EQ (eol_type, Qdos))
5524                         {
5525                           if (src == src_end)
5526                             {
5527                               coding->result = CODING_RESULT_INSUFFICIENT_SRC;
5528                               goto no_more_source;
5529                             }
5530                           if (*src == '\n')
5531                             c = *src++;
5532                         }
5533                       else if (EQ (eol_type, Qmac))
5534                         c = '\n';
5535                     }
5536                   if (dst == dst_end)
5537                     {
5538                       coding->consumed = src - coding->source;
5539
5540                     if (EQ (coding->src_object, coding->dst_object))
5541                       dst_end = src;
5542                     if (dst == dst_end)
5543                       {
5544                         dst = alloc_destination (coding, src_end - src + 1,
5545                                                  dst);
5546                         dst_end = coding->destination + coding->dst_bytes;
5547                         coding_set_source (coding);
5548                         src = coding->source + coding->consumed;
5549                         src_end = coding->source + coding->src_bytes;
5550                       }
5551                     }
5552                   *dst++ = c;
5553                   produced_chars++;
5554                 }
5555             no_more_source:
5556               ;
5557             }
5558           else
5559             while (src < src_end)
5560               {
5561                 int multibytep = 1;
5562                 int c = *src++;
5563
5564                 if (c == '\r')
5565                   {
5566                     if (EQ (eol_type, Qdos))
5567                       {
5568                         if (src < src_end
5569                             && *src == '\n')
5570                           c = *src++;
5571                       }
5572                     else if (EQ (eol_type, Qmac))
5573                       c = '\n';
5574                   }
5575                 if (dst >= dst_end - 1)
5576                   {
5577                     coding->consumed = src - coding->source;
5578
5579                     if (EQ (coding->src_object, coding->dst_object))
5580                       dst_end = src;
5581                     if (dst >= dst_end - 1)
5582                       {
5583                         dst = alloc_destination (coding, src_end - src + 2,
5584                                                  dst);
5585                         dst_end = coding->destination + coding->dst_bytes;
5586                         coding_set_source (coding);
5587                         src = coding->source + coding->consumed;
5588                         src_end = coding->source + coding->src_bytes;
5589                       }
5590                   }
5591                 EMIT_ONE_BYTE (c);
5592               }
5593         }
5594       else
5595         {
5596           if (!EQ (coding->src_object, coding->dst_object))
5597             {
5598               int require = coding->src_bytes - coding->dst_bytes;
5599
5600               if (require > 0)
5601                 {
5602                   EMACS_INT offset = src - coding->source;
5603
5604                   dst = alloc_destination (coding, require, dst);
5605                   coding_set_source (coding);
5606                   src = coding->source + offset;
5607                   src_end = coding->source + coding->src_bytes;
5608                 }
5609             }
5610           produced_chars = coding->src_chars;
5611           while (src < src_end)
5612             {
5613               int c = *src++;
5614
5615               if (c == '\r')
5616                 {
5617                   if (EQ (eol_type, Qdos))
5618                     {
5619                       if (src < src_end
5620                           && *src == '\n')
5621                         c = *src++;
5622                       produced_chars--;
5623                     }
5624                   else if (EQ (eol_type, Qmac))
5625                     c = '\n';
5626                 }
5627               *dst++ = c;
5628             }
5629         }
5630       coding->consumed = coding->src_bytes;
5631       coding->consumed_char = coding->src_chars;
5632     }
5633
5634   produced = dst - (coding->destination + coding->produced);
5635   if (BUFFERP (coding->dst_object))
5636     insert_from_gap (produced_chars, produced);
5637   coding->produced += produced;
5638   coding->produced_char += produced_chars;
5639   return produced_chars;
5640 }
5641
5642 /* Compose text in CODING->object according to the annotation data at
5643    CHARBUF.  CHARBUF is an array:
5644      [ -LENGTH ANNOTATION_MASK FROM TO METHOD COMP_LEN [ COMPONENTS... ] ]
5645  */
5646
5647 static INLINE void
5648 produce_composition (coding, charbuf)
5649      struct coding_system *coding;
5650      int *charbuf;
5651 {
5652   int len;
5653   EMACS_INT from, to;
5654   enum composition_method method;
5655   Lisp_Object components;
5656
5657   len = -charbuf[0];
5658   from = coding->dst_pos + charbuf[2];
5659   to = coding->dst_pos + charbuf[3];
5660   method = (enum composition_method) (charbuf[4]);
5661
5662   if (method == COMPOSITION_RELATIVE)
5663     components = Qnil;
5664   else
5665     {
5666       Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
5667       int i;
5668
5669       len -= 5;
5670       charbuf += 5;
5671       for (i = 0; i < len; i++)
5672         args[i] = make_number (charbuf[i]);
5673       components = (method == COMPOSITION_WITH_ALTCHARS
5674                     ? Fstring (len, args) : Fvector (len, args));
5675     }
5676   compose_text (from, to, components, Qnil, coding->dst_object);
5677 }
5678
5679
5680 /* Put `charset' property on text in CODING->object according to
5681    the annotation data at CHARBUF.  CHARBUF is an array:
5682      [ -LENGTH ANNOTATION_MASK FROM TO CHARSET-ID ]
5683  */
5684
5685 static INLINE void
5686 produce_charset (coding, charbuf)
5687      struct coding_system *coding;
5688      int *charbuf;
5689 {
5690   EMACS_INT from = coding->dst_pos + charbuf[2];
5691   EMACS_INT to = coding->dst_pos + charbuf[3];
5692   struct charset *charset = CHARSET_FROM_ID (charbuf[4]);
5693
5694   Fput_text_property (make_number (from), make_number (to),
5695                       Qcharset, CHARSET_NAME (charset),
5696                       coding->dst_object);
5697 }
5698
5699
5700 #define CHARBUF_SIZE 0x4000
5701
5702 #define ALLOC_CONVERSION_WORK_AREA(coding)                              \
5703   do {                                                                  \
5704     int size = CHARBUF_SIZE;;                                           \
5705                                                                         \
5706     coding->charbuf = NULL;                                             \
5707     while (size > 1024)                                                 \
5708       {                                                                 \
5709         coding->charbuf = (int *) alloca (sizeof (int) * size);         \
5710         if (coding->charbuf)                                            \
5711           break;                                                        \
5712         size >>= 1;                                                     \
5713       }                                                                 \
5714     if (! coding->charbuf)                                              \
5715       {                                                                 \
5716         coding->result = CODING_RESULT_INSUFFICIENT_MEM;                \
5717         return coding->result;                                          \
5718       }                                                                 \
5719     coding->charbuf_size = size;                                        \
5720   } while (0)
5721
5722
5723 static void
5724 produce_annotation (coding)
5725      struct coding_system *coding;
5726 {
5727   int *charbuf = coding->charbuf;
5728   int *charbuf_end = charbuf + coding->charbuf_used;
5729
5730   if (NILP (coding->dst_object))
5731     return;
5732
5733   while (charbuf < charbuf_end)
5734     {
5735       if (*charbuf >= 0)
5736         charbuf++;
5737       else
5738         {
5739           int len = -*charbuf;
5740           switch (charbuf[1])
5741             {
5742             case CODING_ANNOTATE_COMPOSITION_MASK:
5743               produce_composition (coding, charbuf);
5744               break;
5745             case CODING_ANNOTATE_CHARSET_MASK:
5746               produce_charset (coding, charbuf);
5747               break;
5748             default:
5749               abort ();
5750             }
5751           charbuf += len;
5752         }
5753     }
5754 }
5755
5756 /* Decode the data at CODING->src_object into CODING->dst_object.
5757    CODING->src_object is a buffer, a string, or nil.
5758    CODING->dst_object is a buffer.
5759
5760    If CODING->src_object is a buffer, it must be the current buffer.
5761    In this case, if CODING->src_pos is positive, it is a position of
5762    the source text in the buffer, otherwise, the source text is in the
5763    gap area of the buffer, and CODING->src_pos specifies the offset of
5764    the text from GPT (which must be the same as PT).  If this is the
5765    same buffer as CODING->dst_object, CODING->src_pos must be
5766    negative.
5767
5768    If CODING->src_object is a string, CODING->src_pos in an index to
5769    that string.
5770
5771    If CODING->src_object is nil, CODING->source must already point to
5772    the non-relocatable memory area.  In this case, CODING->src_pos is
5773    an offset from CODING->source.
5774
5775    The decoded data is inserted at the current point of the buffer
5776    CODING->dst_object.
5777 */
5778
5779 static int
5780 decode_coding (coding)
5781      struct coding_system *coding;
5782 {
5783   Lisp_Object attrs;
5784
5785   if (BUFFERP (coding->src_object)
5786       && coding->src_pos > 0
5787       && coding->src_pos < GPT
5788       && coding->src_pos + coding->src_chars > GPT)
5789     move_gap_both (coding->src_pos, coding->src_pos_byte);
5790
5791   if (BUFFERP (coding->dst_object))
5792     {
5793       if (current_buffer != XBUFFER (coding->dst_object))
5794         set_buffer_internal (XBUFFER (coding->dst_object));
5795       if (GPT != PT)
5796         move_gap_both (PT, PT_BYTE);
5797     }
5798
5799   coding->consumed = coding->consumed_char = 0;
5800   coding->produced = coding->produced_char = 0;
5801   coding->chars_at_source = 0;
5802   coding->result = CODING_RESULT_SUCCESS;
5803   coding->errors = 0;
5804
5805   ALLOC_CONVERSION_WORK_AREA (coding);
5806
5807   attrs = CODING_ID_ATTRS (coding->id);
5808
5809   do
5810     {
5811       coding_set_source (coding);
5812       coding->annotated = 0;
5813       (*(coding->decoder)) (coding);
5814       if (!NILP (CODING_ATTR_DECODE_TBL (attrs)))
5815         translate_chars (coding, CODING_ATTR_DECODE_TBL (attrs));
5816       else if (!NILP (Vstandard_translation_table_for_decode))
5817         translate_chars (coding, Vstandard_translation_table_for_decode);
5818       coding_set_destination (coding);
5819       produce_chars (coding);
5820       if (coding->annotated)
5821         produce_annotation (coding);
5822     }
5823   while (coding->consumed < coding->src_bytes
5824          && ! coding->result);
5825
5826   if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qccl)
5827       && SYMBOLP (CODING_ID_EOL_TYPE (coding->id))
5828       && ! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix))
5829     decode_eol (coding);
5830
5831   coding->carryover_bytes = 0;
5832   if (coding->consumed < coding->src_bytes)
5833     {
5834       int nbytes = coding->src_bytes - coding->consumed;
5835       unsigned char *src;
5836
5837       coding_set_source (coding);
5838       coding_set_destination (coding);
5839       src = coding->source + coding->consumed;
5840
5841       if (coding->mode & CODING_MODE_LAST_BLOCK)
5842         {
5843           /* Flush out unprocessed data as binary chars.  We are sure
5844              that the number of data is less than the size of
5845              coding->charbuf.  */
5846           while (nbytes-- > 0)
5847             {
5848               int c = *src++;
5849
5850               coding->charbuf[coding->charbuf_used++] = (c & 0x80 ? - c : c);
5851             }
5852           produce_chars (coding);
5853         }
5854       else
5855         {
5856           /* Record unprocessed bytes in coding->carryover.  We are
5857              sure that the number of data is less than the size of
5858              coding->carryover.  */
5859           unsigned char *p = coding->carryover;
5860
5861           coding->carryover_bytes = nbytes;
5862           while (nbytes-- > 0)
5863             *p++ = *src++;
5864         }
5865       coding->consumed = coding->src_bytes;
5866     }
5867
5868   return coding->result;
5869 }
5870
5871
5872 /* Extract an annotation datum from a composition starting at POS and
5873    ending before LIMIT of CODING->src_object (buffer or string), store
5874    the data in BUF, set *STOP to a starting position of the next
5875    composition (if any) or to LIMIT, and return the address of the
5876    next element of BUF.
5877
5878    If such an annotation is not found, set *STOP to a starting
5879    position of a composition after POS (if any) or to LIMIT, and
5880    return BUF.  */
5881
5882 static INLINE int *
5883 handle_composition_annotation (pos, limit, coding, buf, stop)
5884      EMACS_INT pos, limit;
5885      struct coding_system *coding;
5886      int *buf;
5887      EMACS_INT *stop;
5888 {
5889   EMACS_INT start, end;
5890   Lisp_Object prop;
5891
5892   if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
5893       || end > limit)
5894     *stop = limit;
5895   else if (start > pos)
5896     *stop = start;
5897   else
5898     {
5899       if (start == pos)
5900         {
5901           /* We found a composition.  Store the corresponding
5902              annotation data in BUF.  */
5903           int *head = buf;
5904           enum composition_method method = COMPOSITION_METHOD (prop);
5905           int nchars = COMPOSITION_LENGTH (prop);
5906
5907           ADD_COMPOSITION_DATA (buf, 0, nchars, method);
5908           if (method != COMPOSITION_RELATIVE)
5909             {
5910               Lisp_Object components;
5911               int len, i, i_byte;
5912
5913               components = COMPOSITION_COMPONENTS (prop);
5914               if (VECTORP (components))
5915                 {
5916                   len = XVECTOR (components)->size;
5917                   for (i = 0; i < len; i++)
5918                     *buf++ = XINT (AREF (components, i));
5919                 }
5920               else if (STRINGP (components))
5921                 {
5922                   len = XSTRING (components)->size;
5923                   i = i_byte = 0;
5924                   while (i < len)
5925                     {
5926                       FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
5927                       buf++;
5928                     }
5929                 }
5930               else if (INTEGERP (components))
5931                 {
5932                   len = 1;
5933                   *buf++ = XINT (components);
5934                 }
5935               else if (CONSP (components))
5936                 {
5937                   for (len = 0; CONSP (components);
5938                        len++, components = XCDR (components))
5939                     *buf++ = XINT (XCAR (components));
5940                 }
5941               else
5942                 abort ();
5943               *head -= len;
5944             }
5945         }
5946
5947       if (find_composition (end, limit, &start, &end, &prop,
5948                             coding->src_object)
5949           && end <= limit)
5950         *stop = start;
5951       else
5952         *stop = limit;
5953     }
5954   return buf;
5955 }
5956
5957
5958 /* Extract an annotation datum from a text property `charset' at POS of
5959    CODING->src_object (buffer of string), store the data in BUF, set
5960    *STOP to the position where the value of `charset' property changes
5961    (limiting by LIMIT), and return the address of the next element of
5962    BUF.
5963
5964    If the property value is nil, set *STOP to the position where the
5965    property value is non-nil (limiting by LIMIT), and return BUF.  */
5966
5967 static INLINE int *
5968 handle_charset_annotation (pos, limit, coding, buf, stop)
5969      EMACS_INT pos, limit;
5970      struct coding_system *coding;
5971      int *buf;
5972      EMACS_INT *stop;
5973 {
5974   Lisp_Object val, next;
5975   int id;
5976
5977   val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
5978   if (! NILP (val) && CHARSETP (val))
5979     id = XINT (CHARSET_SYMBOL_ID (val));
5980   else
5981     id = -1;
5982   ADD_CHARSET_DATA (buf, 0, 0, id);
5983   next = Fnext_single_property_change (make_number (pos), Qcharset,
5984                                        coding->src_object,
5985                                        make_number (limit));
5986   *stop = XINT (next);
5987   return buf;
5988 }
5989
5990
5991 static void
5992 consume_chars (coding)
5993      struct coding_system *coding;
5994 {
5995   int *buf = coding->charbuf;
5996   int *buf_end = coding->charbuf + coding->charbuf_size;
5997   const unsigned char *src = coding->source + coding->consumed;
5998   const unsigned char *src_end = coding->source + coding->src_bytes;
5999   EMACS_INT pos = coding->src_pos + coding->consumed_char;
6000   EMACS_INT end_pos = coding->src_pos + coding->src_chars;
6001   int multibytep = coding->src_multibyte;
6002   Lisp_Object eol_type;
6003   int c;
6004   EMACS_INT stop, stop_composition, stop_charset;
6005
6006   eol_type = CODING_ID_EOL_TYPE (coding->id);
6007   if (VECTORP (eol_type))
6008     eol_type = Qunix;
6009
6010   /* Note: composition handling is not yet implemented.  */
6011   coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
6012
6013   if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
6014     stop = stop_composition = pos;
6015   else
6016     stop = stop_composition = end_pos;
6017   if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
6018     stop = stop_charset = pos;
6019   else
6020     stop_charset = end_pos;
6021
6022   /* Compensate for CRLF and annotation.  */
6023   buf_end -= 1 + MAX_ANNOTATION_LENGTH;
6024   while (buf < buf_end)
6025     {
6026       if (pos == stop)
6027         {
6028           if (pos == end_pos)
6029             break;
6030           if (pos == stop_composition)
6031             buf = handle_composition_annotation (pos, end_pos, coding,
6032                                                  buf, &stop_composition);
6033           if (pos == stop_charset)
6034             buf = handle_charset_annotation (pos, end_pos, coding,
6035                                              buf, &stop_charset);
6036           stop = (stop_composition < stop_charset
6037                   ? stop_composition : stop_charset);
6038         }
6039
6040       if (! multibytep)
6041         {
6042           EMACS_INT bytes = MULTIBYTE_LENGTH (src, src_end);
6043
6044           if (bytes > 0)
6045             c = STRING_CHAR_ADVANCE (src), pos += bytes;
6046           else
6047             c = *src++, pos++;
6048         }
6049       else
6050         c = STRING_CHAR_ADVANCE (src), pos++;
6051       if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
6052         c = '\n';
6053       if (! EQ (eol_type, Qunix))
6054         {
6055           if (c == '\n')
6056             {
6057               if (EQ (eol_type, Qdos))
6058                 *buf++ = '\r';
6059               else
6060                 c = '\r';
6061             }
6062         }
6063       *buf++ = c;
6064     }
6065
6066   coding->consumed = src - coding->source;
6067   coding->consumed_char = pos - coding->src_pos;
6068   coding->charbuf_used = buf - coding->charbuf;
6069   coding->chars_at_source = 0;
6070 }
6071
6072
6073 /* Encode the text at CODING->src_object into CODING->dst_object.
6074    CODING->src_object is a buffer or a string.
6075    CODING->dst_object is a buffer or nil.
6076
6077    If CODING->src_object is a buffer, it must be the current buffer.
6078    In this case, if CODING->src_pos is positive, it is a position of
6079    the source text in the buffer, otherwise. the source text is in the
6080    gap area of the buffer, and coding->src_pos specifies the offset of
6081    the text from GPT (which must be the same as PT).  If this is the
6082    same buffer as CODING->dst_object, CODING->src_pos must be
6083    negative and CODING should not have `pre-write-conversion'.
6084
6085    If CODING->src_object is a string, CODING should not have
6086    `pre-write-conversion'.
6087
6088    If CODING->dst_object is a buffer, the encoded data is inserted at
6089    the current point of that buffer.
6090
6091    If CODING->dst_object is nil, the encoded data is placed at the
6092    memory area specified by CODING->destination.  */
6093
6094 static int
6095 encode_coding (coding)
6096      struct coding_system *coding;
6097 {
6098   Lisp_Object attrs;
6099
6100   attrs = CODING_ID_ATTRS (coding->id);
6101
6102   if (BUFFERP (coding->dst_object))
6103     {
6104       set_buffer_internal (XBUFFER (coding->dst_object));
6105       coding->dst_multibyte
6106         = ! NILP (current_buffer->enable_multibyte_characters);
6107     }
6108
6109   coding->consumed = coding->consumed_char = 0;
6110   coding->produced = coding->produced_char = 0;
6111   coding->result = CODING_RESULT_SUCCESS;
6112   coding->errors = 0;
6113
6114   ALLOC_CONVERSION_WORK_AREA (coding);
6115
6116   do {
6117     coding_set_source (coding);
6118     consume_chars (coding);
6119
6120     if (!NILP (CODING_ATTR_ENCODE_TBL (attrs)))
6121       translate_chars (coding, CODING_ATTR_ENCODE_TBL (attrs));
6122     else if (!NILP (Vstandard_translation_table_for_encode))
6123       translate_chars (coding, Vstandard_translation_table_for_encode);
6124
6125     coding_set_destination (coding);
6126     (*(coding->encoder)) (coding);
6127   } while (coding->consumed_char < coding->src_chars);
6128
6129   if (BUFFERP (coding->dst_object))
6130     insert_from_gap (coding->produced_char, coding->produced);
6131
6132   return (coding->result);
6133 }
6134
6135
6136 /* Stack of working buffers used in code conversion.  An nil element
6137    means that the code conversion of that level is not using a working
6138    buffer.  */
6139 Lisp_Object Vcode_conversion_work_buf_list;
6140
6141 /* A working buffer used by the top level conversion.  */
6142 Lisp_Object Vcode_conversion_reused_work_buf;
6143
6144
6145 /* Return a working buffer that can be freely used by the following
6146    code conversion.  MULTIBYTEP specifies the multibyteness of the
6147    buffer.  */
6148
6149 Lisp_Object
6150 make_conversion_work_buffer (multibytep, depth)
6151      int multibytep, depth;
6152 {
6153   struct buffer *current = current_buffer;
6154   Lisp_Object buf, name;
6155
6156   if (depth == 0)
6157     {
6158       if (NILP (Vcode_conversion_reused_work_buf))
6159         Vcode_conversion_reused_work_buf
6160           = Fget_buffer_create (build_string (" *code-conversion-work<0>*"));
6161       buf = Vcode_conversion_reused_work_buf;
6162     }
6163   else
6164     {
6165       if (depth < 0)
6166         {
6167           name = build_string (" *code-conversion-work*");
6168           name = Fgenerate_new_buffer_name (name, Qnil);
6169         }
6170       else
6171         {
6172           char str[128];
6173
6174           sprintf (str, " *code-conversion-work*<%d>", depth);
6175           name = build_string (str);
6176         }
6177       buf = Fget_buffer_create (name);
6178     }
6179   set_buffer_internal (XBUFFER (buf));
6180   current_buffer->undo_list = Qt;
6181   Ferase_buffer ();
6182   Fset_buffer_multibyte (multibytep ? Qt : Qnil, Qnil);
6183   set_buffer_internal (current);
6184   return buf;
6185 }
6186
6187 static Lisp_Object
6188 code_conversion_restore (buffer)
6189      Lisp_Object buffer;
6190 {
6191   Lisp_Object workbuf;
6192
6193   workbuf = XCAR (Vcode_conversion_work_buf_list);
6194   if (! NILP (workbuf)
6195       && ! EQ (workbuf, Vcode_conversion_reused_work_buf)
6196       && ! NILP (Fbuffer_live_p (workbuf)))
6197     Fkill_buffer (workbuf);
6198   Vcode_conversion_work_buf_list = XCDR (Vcode_conversion_work_buf_list);
6199   set_buffer_internal (XBUFFER (buffer));
6200   return Qnil;
6201 }
6202
6203 static Lisp_Object
6204 code_conversion_save (buffer, with_work_buf, multibyte)
6205      Lisp_Object buffer;
6206      int with_work_buf, multibyte;
6207 {
6208   Lisp_Object workbuf;
6209
6210   if (with_work_buf)
6211     {
6212       int depth = XINT (Flength (Vcode_conversion_work_buf_list));
6213
6214       workbuf = make_conversion_work_buffer (multibyte, depth);
6215     }
6216   else
6217     workbuf = Qnil;
6218   Vcode_conversion_work_buf_list
6219     = Fcons (workbuf, Vcode_conversion_work_buf_list);
6220   record_unwind_protect (code_conversion_restore, buffer);
6221   return workbuf;
6222 }
6223
6224 int
6225 decode_coding_gap (coding, chars, bytes)
6226      struct coding_system *coding;
6227      EMACS_INT chars, bytes;
6228 {
6229   int count = specpdl_ptr - specpdl;
6230   Lisp_Object buffer;
6231
6232   buffer = Fcurrent_buffer ();
6233   code_conversion_save (buffer, 0, 0);
6234
6235   coding->src_object = buffer;
6236   coding->src_chars = chars;
6237   coding->src_bytes = bytes;
6238   coding->src_pos = -chars;
6239   coding->src_pos_byte = -bytes;
6240   coding->src_multibyte = chars < bytes;
6241   coding->dst_object = coding->src_object;
6242   coding->dst_pos = PT;
6243   coding->dst_pos_byte = PT_BYTE;
6244   coding->dst_multibyte = ! NILP (current_buffer->enable_multibyte_characters);
6245   coding->mode |= CODING_MODE_LAST_BLOCK;
6246
6247   if (CODING_REQUIRE_DETECTION (coding))
6248     detect_coding (coding);
6249
6250   decode_coding (coding);
6251
6252   unbind_to (count, Qnil);
6253   return coding->result;
6254 }
6255
6256 int
6257 encode_coding_gap (coding, chars, bytes)
6258      struct coding_system *coding;
6259      EMACS_INT chars, bytes;
6260 {
6261   int count = specpdl_ptr - specpdl;
6262   Lisp_Object buffer;
6263
6264   buffer = Fcurrent_buffer ();
6265   code_conversion_save (buffer, 0, 0);
6266
6267   coding->src_object = buffer;
6268   coding->src_chars = chars;
6269   coding->src_bytes = bytes;
6270   coding->src_pos = -chars;
6271   coding->src_pos_byte = -bytes;
6272   coding->src_multibyte = chars < bytes;
6273   coding->dst_object = coding->src_object;
6274   coding->dst_pos = PT;
6275   coding->dst_pos_byte = PT_BYTE;
6276
6277   encode_coding (coding);
6278
6279   unbind_to (count, Qnil);
6280   return coding->result;
6281 }
6282
6283
6284 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
6285    SRC_OBJECT into DST_OBJECT by coding context CODING.
6286
6287    SRC_OBJECT is a buffer, a string, or Qnil.
6288
6289    If it is a buffer, the text is at point of the buffer.  FROM and TO
6290    are positions in the buffer.
6291
6292    If it is a string, the text is at the beginning of the string.
6293    FROM and TO are indices to the string.
6294
6295    If it is nil, the text is at coding->source.  FROM and TO are
6296    indices to coding->source.
6297
6298    DST_OBJECT is a buffer, Qt, or Qnil.
6299
6300    If it is a buffer, the decoded text is inserted at point of the
6301    buffer.  If the buffer is the same as SRC_OBJECT, the source text
6302    is deleted.
6303
6304    If it is Qt, a string is made from the decoded text, and
6305    set in CODING->dst_object.
6306
6307    If it is Qnil, the decoded text is stored at CODING->destination.
6308    The caller must allocate CODING->dst_bytes bytes at
6309    CODING->destination by xmalloc.  If the decoded text is longer than
6310    CODING->dst_bytes, CODING->destination is relocated by xrealloc.
6311  */
6312
6313 void
6314 decode_coding_object (coding, src_object, from, from_byte, to, to_byte,
6315                       dst_object)
6316      struct coding_system *coding;
6317      Lisp_Object src_object;
6318      EMACS_INT from, from_byte, to, to_byte;
6319      Lisp_Object dst_object;
6320 {
6321   int count = specpdl_ptr - specpdl;
6322   unsigned char *destination;
6323   EMACS_INT dst_bytes;
6324   EMACS_INT chars = to - from;
6325   EMACS_INT bytes = to_byte - from_byte;
6326   Lisp_Object attrs;
6327   Lisp_Object buffer;
6328   int saved_pt = -1, saved_pt_byte;
6329
6330   buffer = Fcurrent_buffer ();
6331
6332   if (NILP (dst_object))
6333     {
6334       destination = coding->destination;
6335       dst_bytes = coding->dst_bytes;
6336     }
6337
6338   coding->src_object = src_object;
6339   coding->src_chars = chars;
6340   coding->src_bytes = bytes;
6341   coding->src_multibyte = chars < bytes;
6342
6343   if (STRINGP (src_object))
6344     {
6345       coding->src_pos = from;
6346       coding->src_pos_byte = from_byte;
6347     }
6348   else if (BUFFERP (src_object))
6349     {
6350       set_buffer_internal (XBUFFER (src_object));
6351       if (from != GPT)
6352         move_gap_both (from, from_byte);
6353       if (EQ (src_object, dst_object))
6354         {
6355           saved_pt = PT, saved_pt_byte = PT_BYTE;
6356           TEMP_SET_PT_BOTH (from, from_byte);
6357           del_range_both (from, from_byte, to, to_byte, 1);
6358           coding->src_pos = -chars;
6359           coding->src_pos_byte = -bytes;
6360         }
6361       else
6362         {
6363           coding->src_pos = from;
6364           coding->src_pos_byte = from_byte;
6365         }
6366     }
6367
6368   if (CODING_REQUIRE_DETECTION (coding))
6369     detect_coding (coding);
6370   attrs = CODING_ID_ATTRS (coding->id);
6371
6372   if (EQ (dst_object, Qt)
6373       || (! NILP (CODING_ATTR_POST_READ (attrs))
6374           && NILP (dst_object)))
6375     {
6376       coding->dst_object = code_conversion_save (buffer, 1, 1);
6377       coding->dst_pos = BEG;
6378       coding->dst_pos_byte = BEG_BYTE;
6379       coding->dst_multibyte = 1;
6380     }
6381   else if (BUFFERP (dst_object))
6382     {
6383       code_conversion_save (buffer, 0, 0);
6384       coding->dst_object = dst_object;
6385       coding->dst_pos = BUF_PT (XBUFFER (dst_object));
6386       coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
6387       coding->dst_multibyte
6388         = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
6389     }
6390   else
6391     {
6392       code_conversion_save (buffer, 0, 0);
6393       coding->dst_object = Qnil;
6394       coding->dst_multibyte = 1;
6395     }
6396
6397   decode_coding (coding);
6398
6399   if (BUFFERP (coding->dst_object))
6400     set_buffer_internal (XBUFFER (coding->dst_object));
6401
6402   if (! NILP (CODING_ATTR_POST_READ (attrs)))
6403     {
6404       struct gcpro gcpro1, gcpro2;
6405       EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
6406       Lisp_Object val;
6407
6408       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
6409       GCPRO2 (coding->src_object, coding->dst_object);
6410       val = call1 (CODING_ATTR_POST_READ (attrs),
6411                    make_number (coding->produced_char));
6412       UNGCPRO;
6413       CHECK_NATNUM (val);
6414       coding->produced_char += Z - prev_Z;
6415       coding->produced += Z_BYTE - prev_Z_BYTE;
6416     }
6417
6418   if (EQ (dst_object, Qt))
6419     {
6420       coding->dst_object = Fbuffer_string ();
6421     }
6422   else if (NILP (dst_object) && BUFFERP (coding->dst_object))
6423     {
6424       set_buffer_internal (XBUFFER (coding->dst_object));
6425       if (dst_bytes < coding->produced)
6426         {
6427           destination
6428             = (unsigned char *) xrealloc (destination, coding->produced);
6429           if (! destination)
6430             {
6431               coding->result = CODING_RESULT_INSUFFICIENT_DST;
6432               unbind_to (count, Qnil);
6433               return;
6434             }
6435           if (BEGV < GPT && GPT < BEGV + coding->produced_char)
6436             move_gap_both (BEGV, BEGV_BYTE);
6437           bcopy (BEGV_ADDR, destination, coding->produced);
6438           coding->destination = destination;
6439         }
6440     }
6441
6442   if (saved_pt >= 0)
6443     {
6444       /* This is the case of:
6445          (BUFFERP (src_object) && EQ (src_object, dst_object))
6446          As we have moved PT while replacing the original buffer
6447          contents, we must recover it now.  */
6448       set_buffer_internal (XBUFFER (src_object));
6449       if (saved_pt < from)
6450         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
6451       else if (saved_pt < from + chars)
6452         TEMP_SET_PT_BOTH (from, from_byte);
6453       else if (! NILP (current_buffer->enable_multibyte_characters))
6454         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
6455                           saved_pt_byte + (coding->produced - bytes));
6456       else
6457         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
6458                           saved_pt_byte + (coding->produced - bytes));
6459     }
6460
6461   unbind_to (count, Qnil);
6462 }
6463
6464
6465 void
6466 encode_coding_object (coding, src_object, from, from_byte, to, to_byte,
6467                       dst_object)
6468      struct coding_system *coding;
6469      Lisp_Object src_object;
6470      EMACS_INT from, from_byte, to, to_byte;
6471      Lisp_Object dst_object;
6472 {
6473   int count = specpdl_ptr - specpdl;
6474   EMACS_INT chars = to - from;
6475   EMACS_INT bytes = to_byte - from_byte;
6476   Lisp_Object attrs;
6477   Lisp_Object buffer;
6478   int saved_pt = -1, saved_pt_byte;
6479
6480   buffer = Fcurrent_buffer ();
6481
6482   coding->src_object = src_object;
6483   coding->src_chars = chars;
6484   coding->src_bytes = bytes;
6485   coding->src_multibyte = chars < bytes;
6486
6487   attrs = CODING_ID_ATTRS (coding->id);
6488
6489   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
6490     {
6491       coding->src_object = code_conversion_save (buffer, 1,
6492                                                  coding->src_multibyte);
6493       set_buffer_internal (XBUFFER (coding->src_object));
6494       if (STRINGP (src_object))
6495         insert_from_string (src_object, from, from_byte, chars, bytes, 0);
6496       else if (BUFFERP (src_object))
6497         insert_from_buffer (XBUFFER (src_object), from, chars, 0);
6498       else
6499         insert_1_both (coding->source + from, chars, bytes, 0, 0, 0);
6500
6501       if (EQ (src_object, dst_object))
6502         {
6503           set_buffer_internal (XBUFFER (src_object));
6504           saved_pt = PT, saved_pt_byte = PT_BYTE;
6505           del_range_both (from, from_byte, to, to_byte, 1);
6506           set_buffer_internal (XBUFFER (coding->src_object));
6507         }
6508
6509       call2 (CODING_ATTR_PRE_WRITE (attrs),
6510              make_number (BEG), make_number (Z));
6511       coding->src_object = Fcurrent_buffer ();
6512       if (BEG != GPT)
6513         move_gap_both (BEG, BEG_BYTE);
6514       coding->src_chars = Z - BEG;
6515       coding->src_bytes = Z_BYTE - BEG_BYTE;
6516       coding->src_pos = BEG;
6517       coding->src_pos_byte = BEG_BYTE;
6518       coding->src_multibyte = Z < Z_BYTE;
6519     }
6520   else if (STRINGP (src_object))
6521     {
6522       code_conversion_save (buffer, 0, 0);
6523       coding->src_pos = from;
6524       coding->src_pos_byte = from_byte;
6525     }
6526   else if (BUFFERP (src_object))
6527     {
6528       code_conversion_save (buffer, 0, 0);
6529       set_buffer_internal (XBUFFER (src_object));
6530       if (EQ (src_object, dst_object))
6531         {
6532           saved_pt = PT, saved_pt_byte = PT_BYTE;
6533           coding->src_object = del_range_1 (from, to, 1, 1);
6534           coding->src_pos = 0;
6535           coding->src_pos_byte = 0;
6536         }
6537       else
6538         {
6539           if (from < GPT && to >= GPT)
6540             move_gap_both (from, from_byte);
6541           coding->src_pos = from;
6542           coding->src_pos_byte = from_byte;
6543         }
6544     }
6545   else
6546     code_conversion_save (buffer, 0, 0);
6547
6548   if (BUFFERP (dst_object))
6549     {
6550       coding->dst_object = dst_object;
6551       if (EQ (src_object, dst_object))
6552         {
6553           coding->dst_pos = from;
6554           coding->dst_pos_byte = from_byte;
6555         }
6556       else
6557         {
6558           coding->dst_pos = BUF_PT (XBUFFER (dst_object));
6559           coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
6560         }
6561       coding->dst_multibyte
6562         = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
6563     }
6564   else if (EQ (dst_object, Qt))
6565     {
6566       coding->dst_object = Qnil;
6567       coding->dst_bytes = coding->src_chars;
6568       if (coding->dst_bytes == 0)
6569         coding->dst_bytes = 1;
6570       coding->destination = (unsigned char *) xmalloc (coding->dst_bytes);
6571       coding->dst_multibyte = 0;
6572     }
6573   else
6574     {
6575       coding->dst_object = Qnil;
6576       coding->dst_multibyte = 0;
6577     }
6578
6579   encode_coding (coding);
6580
6581   if (EQ (dst_object, Qt))
6582     {
6583       if (BUFFERP (coding->dst_object))
6584         coding->dst_object = Fbuffer_string ();
6585       else
6586         {
6587           coding->dst_object
6588             = make_unibyte_string ((char *) coding->destination,
6589                                    coding->produced);
6590           xfree (coding->destination);
6591         }
6592     }
6593
6594   if (saved_pt >= 0)
6595     {
6596       /* This is the case of:
6597          (BUFFERP (src_object) && EQ (src_object, dst_object))
6598          As we have moved PT while replacing the original buffer
6599          contents, we must recover it now.  */
6600       set_buffer_internal (XBUFFER (src_object));
6601       if (saved_pt < from)
6602         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
6603       else if (saved_pt < from + chars)
6604         TEMP_SET_PT_BOTH (from, from_byte);
6605       else if (! NILP (current_buffer->enable_multibyte_characters))
6606         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
6607                           saved_pt_byte + (coding->produced - bytes));
6608       else
6609         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
6610                           saved_pt_byte + (coding->produced - bytes));
6611     }
6612
6613   unbind_to (count, Qnil);
6614 }
6615
6616
6617 Lisp_Object
6618 preferred_coding_system ()
6619 {
6620   int id = coding_categories[coding_priorities[0]].id;
6621
6622   return CODING_ID_NAME (id);
6623 }
6624
6625 \f
6626 #ifdef emacs
6627 /*** 8. Emacs Lisp library functions ***/
6628
6629 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
6630        doc: /* Return t if OBJECT is nil or a coding-system.
6631 See the documentation of `define-coding-system' for information
6632 about coding-system objects.  */)
6633      (obj)
6634      Lisp_Object obj;
6635 {
6636   return ((NILP (obj) || CODING_SYSTEM_P (obj)) ? Qt : Qnil);
6637 }
6638
6639 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
6640        Sread_non_nil_coding_system, 1, 1, 0,
6641        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.  */)
6642      (prompt)
6643      Lisp_Object prompt;
6644 {
6645   Lisp_Object val;
6646   do
6647     {
6648       val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6649                               Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
6650     }
6651   while (XSTRING (val)->size == 0);
6652   return (Fintern (val, Qnil));
6653 }
6654
6655 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
6656        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
6657 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.  */)
6658      (prompt, default_coding_system)
6659      Lisp_Object prompt, default_coding_system;
6660 {
6661   Lisp_Object val;
6662   if (SYMBOLP (default_coding_system))
6663     XSETSTRING (default_coding_system, XSYMBOL (default_coding_system)->name);
6664   val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6665                           Qt, Qnil, Qcoding_system_history,
6666                           default_coding_system, Qnil);
6667   return (XSTRING (val)->size == 0 ? Qnil : Fintern (val, Qnil));
6668 }
6669
6670 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
6671        1, 1, 0,
6672        doc: /* Check validity of CODING-SYSTEM.
6673 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.  */)
6674   (coding_system)
6675      Lisp_Object coding_system;
6676 {
6677   CHECK_SYMBOL (coding_system);
6678   if (!NILP (Fcoding_system_p (coding_system)))
6679     return coding_system;
6680   while (1)
6681     Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
6682 }
6683
6684 \f
6685 /* Detect how the bytes at SRC of length SRC_BYTES are encoded.  If
6686    HIGHEST is nonzero, return the coding system of the highest
6687    priority among the detected coding systems.  Otherwize return a
6688    list of detected coding systems sorted by their priorities.  If
6689    MULTIBYTEP is nonzero, it is assumed that the bytes are in correct
6690    multibyte form but contains only ASCII and eight-bit chars.
6691    Otherwise, the bytes are raw bytes.
6692
6693    CODING-SYSTEM controls the detection as below:
6694
6695    If it is nil, detect both text-format and eol-format.  If the
6696    text-format part of CODING-SYSTEM is already specified
6697    (e.g. `iso-latin-1'), detect only eol-format.  If the eol-format
6698    part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
6699    detect only text-format.  */
6700
6701 Lisp_Object
6702 detect_coding_system (src, src_bytes, highest, multibytep, coding_system)
6703      unsigned char *src;
6704      int src_bytes, highest;
6705      int multibytep;
6706      Lisp_Object coding_system;
6707 {
6708   unsigned char *src_end = src + src_bytes;
6709   Lisp_Object attrs, eol_type;
6710   Lisp_Object val;
6711   struct coding_system coding;
6712   int id;
6713   struct coding_detection_info detect_info;
6714
6715   if (NILP (coding_system))
6716     coding_system = Qundecided;
6717   setup_coding_system (coding_system, &coding);
6718   attrs = CODING_ID_ATTRS (coding.id);
6719   eol_type = CODING_ID_EOL_TYPE (coding.id);
6720   coding_system = CODING_ATTR_BASE_NAME (attrs);
6721
6722   coding.source = src;
6723   coding.src_bytes = src_bytes;
6724   coding.src_multibyte = multibytep;
6725   coding.consumed = 0;
6726   coding.mode |= CODING_MODE_LAST_BLOCK;
6727
6728   detect_info.checked = detect_info.found = detect_info.rejected = 0;
6729
6730   /* At first, detect text-format if necessary.  */
6731   if (XINT (CODING_ATTR_CATEGORY (attrs)) == coding_category_undecided)
6732     {
6733       enum coding_category category;
6734       struct coding_system *this;
6735       int c, i;
6736
6737       for (; src < src_end; src++)
6738         {
6739           c = *src;
6740           if (c & 0x80
6741               || (c < 0x20 && (c == ISO_CODE_ESC
6742                                || c == ISO_CODE_SI
6743                                || c == ISO_CODE_SO)))
6744             break;
6745         }
6746       coding.head_ascii = src - coding.source;
6747
6748       if (src < src_end)
6749         for (i = 0; i < coding_category_raw_text; i++)
6750           {
6751             category = coding_priorities[i];
6752             this = coding_categories + category;
6753
6754             if (this->id < 0)
6755               {
6756                 /* No coding system of this category is defined.  */
6757                 detect_info.rejected |= (1 << category);
6758               }
6759             else if (category >= coding_category_raw_text)
6760               continue;
6761             else if (detect_info.checked & (1 << category))
6762               {
6763                 if (highest
6764                     && (detect_info.found & (1 << category)))
6765                   break;
6766               }
6767             else
6768               {
6769                 if ((*(this->detector)) (&coding, &detect_info)
6770                     && highest
6771                     && (detect_info.found & (1 << category)))
6772                   break;
6773               }
6774           }
6775
6776
6777       if (detect_info.rejected == CATEGORY_MASK_ANY)
6778         {
6779           detect_info.found = CATEGORY_MASK_RAW_TEXT;
6780           id = coding_categories[coding_category_raw_text].id;
6781           val = Fcons (make_number (id), Qnil);
6782         }
6783       else if (! detect_info.rejected && ! detect_info.found)
6784         {
6785           detect_info.found = CATEGORY_MASK_ANY;
6786           id = coding_categories[coding_category_undecided].id;
6787           val = Fcons (make_number (id), Qnil);
6788         }
6789       else if (highest)
6790         {
6791           if (detect_info.found)
6792             {
6793               detect_info.found = 1 << category;
6794               val = Fcons (make_number (this->id), Qnil);
6795             }
6796           else
6797             for (i = 0; i < coding_category_raw_text; i++)
6798               if (! (detect_info.rejected & (1 << coding_priorities[i])))
6799                 {
6800                   detect_info.found = 1 << coding_priorities[i];
6801                   id = coding_categories[coding_priorities[i]].id;
6802                   val = Fcons (make_number (id), Qnil);
6803                   break;
6804                 }
6805         }
6806       else
6807         {
6808           int mask = detect_info.rejected | detect_info.found;
6809           int found = 0;
6810           val = Qnil;
6811
6812           for (i = coding_category_raw_text - 1; i >= 0; i--)
6813             {
6814               category = coding_priorities[i];
6815               if (! (mask & (1 << category)))
6816                 {
6817                   found |= 1 << category;
6818                   id = coding_categories[category].id;
6819                   val = Fcons (make_number (id), val);
6820                 }
6821             }
6822           for (i = coding_category_raw_text - 1; i >= 0; i--)
6823             {
6824               category = coding_priorities[i];
6825               if (detect_info.found & (1 << category))
6826                 {
6827                   id = coding_categories[category].id;
6828                   val = Fcons (make_number (id), val);
6829                 }
6830             }
6831           detect_info.found |= found;
6832         }
6833     }
6834   else
6835     {
6836       detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
6837       val = Fcons (make_number (coding.id), Qnil);
6838     }
6839
6840   /* Then, detect eol-format if necessary.  */
6841   {
6842     int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol;
6843     Lisp_Object tail;
6844
6845     if (VECTORP (eol_type))
6846       {
6847         if (detect_info.found & ~CATEGORY_MASK_UTF_16)
6848           normal_eol = detect_eol (coding.source, src_bytes,
6849                                    coding_category_raw_text);
6850         if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
6851                                  | CATEGORY_MASK_UTF_16_BE_NOSIG))
6852           utf_16_be_eol = detect_eol (coding.source, src_bytes,
6853                                       coding_category_utf_16_be);
6854         if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
6855                                  | CATEGORY_MASK_UTF_16_LE_NOSIG))
6856           utf_16_le_eol = detect_eol (coding.source, src_bytes,
6857                                       coding_category_utf_16_le);
6858       }
6859     else
6860       {
6861         if (EQ (eol_type, Qunix))
6862           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
6863         else if (EQ (eol_type, Qdos))
6864           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
6865         else
6866           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
6867       }
6868
6869     for (tail = val; CONSP (tail); tail = XCDR (tail))
6870       {
6871         enum coding_category category;
6872         int this_eol;
6873
6874         id = XINT (XCAR (tail));
6875         attrs = CODING_ID_ATTRS (id);
6876         category = XINT (CODING_ATTR_CATEGORY (attrs));
6877         eol_type = CODING_ID_EOL_TYPE (id);
6878         if (VECTORP (eol_type))
6879           {
6880             if (category == coding_category_utf_16_be
6881                 || category == coding_category_utf_16_be_nosig)
6882               this_eol = utf_16_be_eol;
6883             else if (category == coding_category_utf_16_le
6884                      || category == coding_category_utf_16_le_nosig)
6885               this_eol = utf_16_le_eol;
6886             else
6887               this_eol = normal_eol;
6888
6889             if (this_eol == EOL_SEEN_LF)
6890               XSETCAR (tail, AREF (eol_type, 0));
6891             else if (this_eol == EOL_SEEN_CRLF)
6892               XSETCAR (tail, AREF (eol_type, 1));
6893             else if (this_eol == EOL_SEEN_CR)
6894               XSETCAR (tail, AREF (eol_type, 2));
6895             else
6896               XSETCAR (tail, CODING_ID_NAME (id));
6897           }
6898         else
6899           XSETCAR (tail, CODING_ID_NAME (id));
6900       }
6901   }
6902
6903   return (highest ? XCAR (val) : val);
6904 }
6905
6906
6907 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
6908        2, 3, 0,
6909        doc: /* Detect coding system of the text in the region between START and END.
6910 Return a list of possible coding systems ordered by priority.
6911
6912 If only ASCII characters are found, it returns a list of single element
6913 `undecided' or its subsidiary coding system according to a detected
6914 end-of-line format.
6915
6916 If optional argument HIGHEST is non-nil, return the coding system of
6917 highest priority.  */)
6918      (start, end, highest)
6919      Lisp_Object start, end, highest;
6920 {
6921   int from, to;
6922   int from_byte, to_byte;
6923
6924   CHECK_NUMBER_COERCE_MARKER (start);
6925   CHECK_NUMBER_COERCE_MARKER (end);
6926
6927   validate_region (&start, &end);
6928   from = XINT (start), to = XINT (end);
6929   from_byte = CHAR_TO_BYTE (from);
6930   to_byte = CHAR_TO_BYTE (to);
6931
6932   if (from < GPT && to >= GPT)
6933     move_gap_both (to, to_byte);
6934
6935   return detect_coding_system (BYTE_POS_ADDR (from_byte),
6936                                to_byte - from_byte,
6937                                !NILP (highest),
6938                                !NILP (current_buffer
6939                                       ->enable_multibyte_characters),
6940                                Qnil);
6941 }
6942
6943 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
6944        1, 2, 0,
6945        doc: /* Detect coding system of the text in STRING.
6946 Return a list of possible coding systems ordered by priority.
6947
6948 If only ASCII characters are found, it returns a list of single element
6949 `undecided' or its subsidiary coding system according to a detected
6950 end-of-line format.
6951
6952 If optional argument HIGHEST is non-nil, return the coding system of
6953 highest priority.  */)
6954      (string, highest)
6955      Lisp_Object string, highest;
6956 {
6957   CHECK_STRING (string);
6958
6959   return detect_coding_system (XSTRING (string)->data,
6960                                STRING_BYTES (XSTRING (string)),
6961                                !NILP (highest),
6962                                STRING_MULTIBYTE (string),
6963                                Qnil);
6964 }
6965
6966
6967 static INLINE int
6968 char_encodable_p (c, attrs)
6969      int c;
6970      Lisp_Object attrs;
6971 {
6972   Lisp_Object tail;
6973   struct charset *charset;
6974
6975   for (tail = CODING_ATTR_CHARSET_LIST (attrs);
6976        CONSP (tail); tail = XCDR (tail))
6977     {
6978       charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
6979       if (CHAR_CHARSET_P (c, charset))
6980         break;
6981     }
6982   return (! NILP (tail));
6983 }
6984
6985
6986 /* Return a list of coding systems that safely encode the text between
6987    START and END.  If EXCLUDE is non-nil, it is a list of coding
6988    systems not to check.  The returned list doesn't contain any such
6989    coding systems.  In any case, if the text contains only ASCII or is
6990    unibyte, return t.  */
6991
6992 DEFUN ("find-coding-systems-region-internal",
6993        Ffind_coding_systems_region_internal,
6994        Sfind_coding_systems_region_internal, 2, 3, 0,
6995        doc: /* Internal use only.  */)
6996      (start, end, exclude)
6997      Lisp_Object start, end, exclude;
6998 {
6999   Lisp_Object coding_attrs_list, safe_codings;
7000   EMACS_INT start_byte, end_byte;
7001   const unsigned char *p, *pbeg, *pend;
7002   int c;
7003   Lisp_Object tail, elt;
7004
7005   if (STRINGP (start))
7006     {
7007       if (!STRING_MULTIBYTE (start)
7008           || XSTRING (start)->size == STRING_BYTES (XSTRING (start)))
7009         return Qt;
7010       start_byte = 0;
7011       end_byte = STRING_BYTES (XSTRING (start));
7012     }
7013   else
7014     {
7015       CHECK_NUMBER_COERCE_MARKER (start);
7016       CHECK_NUMBER_COERCE_MARKER (end);
7017       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
7018         args_out_of_range (start, end);
7019       if (NILP (current_buffer->enable_multibyte_characters))
7020         return Qt;
7021       start_byte = CHAR_TO_BYTE (XINT (start));
7022       end_byte = CHAR_TO_BYTE (XINT (end));
7023       if (XINT (end) - XINT (start) == end_byte - start_byte)
7024         return Qt;
7025
7026       if (XINT (start) < GPT && XINT (end) > GPT)
7027         {
7028           if ((GPT - XINT (start)) < (XINT (end) - GPT))
7029             move_gap_both (XINT (start), start_byte);
7030           else
7031             move_gap_both (XINT (end), end_byte);
7032         }
7033     }
7034
7035   coding_attrs_list = Qnil;
7036   for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
7037     if (NILP (exclude)
7038         || NILP (Fmemq (XCAR (tail), exclude)))
7039       {
7040         Lisp_Object attrs;
7041
7042         attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
7043         if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs))
7044             && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
7045           coding_attrs_list = Fcons (attrs, coding_attrs_list);
7046       }
7047
7048   if (STRINGP (start))
7049     p = pbeg = XSTRING (start)->data;
7050   else
7051     p = pbeg = BYTE_POS_ADDR (start_byte);
7052   pend = p + (end_byte - start_byte);
7053
7054   while (p < pend && ASCII_BYTE_P (*p)) p++;
7055   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
7056
7057   while (p < pend)
7058     {
7059       if (ASCII_BYTE_P (*p))
7060         p++;
7061       else
7062         {
7063           c = STRING_CHAR_ADVANCE (p);
7064
7065           charset_map_loaded = 0;
7066           for (tail = coding_attrs_list; CONSP (tail);)
7067             {
7068               elt = XCAR (tail);
7069               if (NILP (elt))
7070                 tail = XCDR (tail);
7071               else if (char_encodable_p (c, elt))
7072                 tail = XCDR (tail);
7073               else if (CONSP (XCDR (tail)))
7074                 {
7075                   XSETCAR (tail, XCAR (XCDR (tail)));
7076                   XSETCDR (tail, XCDR (XCDR (tail)));
7077                 }
7078               else
7079                 {
7080                   XSETCAR (tail, Qnil);
7081                   tail = XCDR (tail);
7082                 }
7083             }
7084           if (charset_map_loaded)
7085             {
7086               EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
7087
7088               if (STRINGP (start))
7089                 pbeg = XSTRING (start)->data;
7090               else
7091                 pbeg = BYTE_POS_ADDR (start_byte);
7092               p = pbeg + p_offset;
7093               pend = pbeg + pend_offset;
7094             }
7095         }
7096     }
7097
7098   safe_codings = Qnil;
7099   for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
7100     if (! NILP (XCAR (tail)))
7101       safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
7102
7103   return safe_codings;
7104 }
7105
7106
7107 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
7108        Scheck_coding_systems_region, 3, 3, 0,
7109        doc: /* Check if the region is encodable by coding systems.
7110
7111 START and END are buffer positions specifying the region.
7112 CODING-SYSTEM-LIST is a list of coding systems to check.
7113
7114 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
7115 CODING-SYSTEM is a member of CODING-SYSTEM-LIst and can't encode the
7116 whole region, POS0, POS1, ... are buffer positions where non-encodable
7117 characters are found.
7118
7119 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
7120 value is nil.
7121
7122 START may be a string.  In that case, check if the string is
7123 encodable, and the value contains indices to the string instead of
7124 buffer positions.  END is ignored.  */)
7125      (start, end, coding_system_list)
7126      Lisp_Object start, end, coding_system_list;
7127 {
7128   Lisp_Object list;
7129   EMACS_INT start_byte, end_byte;
7130   int pos;
7131   const unsigned char *p, *pbeg, *pend;
7132   int c;
7133   Lisp_Object tail, elt;
7134
7135   if (STRINGP (start))
7136     {
7137       if (!STRING_MULTIBYTE (start)
7138           && XSTRING (start)->size != STRING_BYTES (XSTRING (start)))
7139         return Qnil;
7140       start_byte = 0;
7141       end_byte = STRING_BYTES (XSTRING (start));
7142       pos = 0;
7143     }
7144   else
7145     {
7146       CHECK_NUMBER_COERCE_MARKER (start);
7147       CHECK_NUMBER_COERCE_MARKER (end);
7148       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
7149         args_out_of_range (start, end);
7150       if (NILP (current_buffer->enable_multibyte_characters))
7151         return Qnil;
7152       start_byte = CHAR_TO_BYTE (XINT (start));
7153       end_byte = CHAR_TO_BYTE (XINT (end));
7154       if (XINT (end) - XINT (start) == end_byte - start_byte)
7155         return Qt;
7156
7157       if (XINT (start) < GPT && XINT (end) > GPT)
7158         {
7159           if ((GPT - XINT (start)) < (XINT (end) - GPT))
7160             move_gap_both (XINT (start), start_byte);
7161           else
7162             move_gap_both (XINT (end), end_byte);
7163         }
7164       pos = XINT (start);
7165     }
7166
7167   list = Qnil;
7168   for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
7169     {
7170       elt = XCAR (tail);
7171       list = Fcons (Fcons (elt, Fcons (AREF (CODING_SYSTEM_SPEC (elt), 0),
7172                                        Qnil)),
7173                     list);
7174     }
7175
7176   if (STRINGP (start))
7177     p = pbeg = XSTRING (start)->data;
7178   else
7179     p = pbeg = BYTE_POS_ADDR (start_byte);
7180   pend = p + (end_byte - start_byte);
7181
7182   while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
7183   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
7184
7185   while (p < pend)
7186     {
7187       if (ASCII_BYTE_P (*p))
7188         p++;
7189       else
7190         {
7191           c = STRING_CHAR_ADVANCE (p);
7192
7193           charset_map_loaded = 0;
7194           for (tail = list; CONSP (tail); tail = XCDR (tail))
7195             {
7196               elt = XCDR (XCAR (tail));
7197               if (! char_encodable_p (c, XCAR (elt)))
7198                 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
7199             }
7200           if (charset_map_loaded)
7201             {
7202               EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
7203
7204               if (STRINGP (start))
7205                 pbeg = XSTRING (start)->data;
7206               else
7207                 pbeg = BYTE_POS_ADDR (start_byte);
7208               p = pbeg + p_offset;
7209               pend = pbeg + pend_offset;
7210             }
7211         }
7212       pos++;
7213     }
7214
7215   tail = list;
7216   list = Qnil;
7217   for (; CONSP (tail); tail = XCDR (tail))
7218     {
7219       elt = XCAR (tail);
7220       if (CONSP (XCDR (XCDR (elt))))
7221         list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
7222                       list);
7223     }
7224
7225   return list;
7226 }
7227
7228
7229
7230 Lisp_Object
7231 code_convert_region (start, end, coding_system, dst_object, encodep, norecord)
7232      Lisp_Object start, end, coding_system, dst_object;
7233      int encodep, norecord;
7234 {
7235   struct coding_system coding;
7236   EMACS_INT from, from_byte, to, to_byte;
7237   Lisp_Object src_object;
7238
7239   CHECK_NUMBER_COERCE_MARKER (start);
7240   CHECK_NUMBER_COERCE_MARKER (end);
7241   if (NILP (coding_system))
7242     coding_system = Qno_conversion;
7243   else
7244     CHECK_CODING_SYSTEM (coding_system);
7245   src_object = Fcurrent_buffer ();
7246   if (NILP (dst_object))
7247     dst_object = src_object;
7248   else if (! EQ (dst_object, Qt))
7249     CHECK_BUFFER (dst_object);
7250
7251   validate_region (&start, &end);
7252   from = XFASTINT (start);
7253   from_byte = CHAR_TO_BYTE (from);
7254   to = XFASTINT (end);
7255   to_byte = CHAR_TO_BYTE (to);
7256
7257   setup_coding_system (coding_system, &coding);
7258   coding.mode |= CODING_MODE_LAST_BLOCK;
7259
7260   if (encodep)
7261     encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
7262                           dst_object);
7263   else
7264     decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
7265                           dst_object);
7266   if (! norecord)
7267     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
7268
7269   if (coding.result != CODING_RESULT_SUCCESS)
7270     error ("Code conversion error: %d", coding.result);
7271
7272   return (BUFFERP (dst_object)
7273           ? make_number (coding.produced_char)
7274           : coding.dst_object);
7275 }
7276
7277
7278 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
7279        3, 4, "r\nzCoding system: ",
7280        doc: /* Decode the current region from the specified coding system.
7281 When called from a program, takes four arguments:
7282         START, END, CODING-SYSTEM, and DESTINATION.
7283 START and END are buffer positions.
7284
7285 Optional 4th arguments DESTINATION specifies where the decoded text goes.
7286 If nil, the region between START and END is replace by the decoded text.
7287 If buffer, the decoded text is inserted in the buffer.
7288 If t, the decoded text is returned.
7289
7290 This function sets `last-coding-system-used' to the precise coding system
7291 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7292 not fully specified.)
7293 It returns the length of the decoded text.  */)
7294      (start, end, coding_system, destination)
7295      Lisp_Object start, end, coding_system, destination;
7296 {
7297   return code_convert_region (start, end, coding_system, destination, 0, 0);
7298 }
7299
7300 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
7301        3, 4, "r\nzCoding system: ",
7302        doc: /* Encode the current region by specified coding system.
7303 When called from a program, takes three arguments:
7304 START, END, and CODING-SYSTEM.  START and END are buffer positions.
7305
7306 Optional 4th arguments DESTINATION specifies where the encoded text goes.
7307 If nil, the region between START and END is replace by the encoded text.
7308 If buffer, the encoded text is inserted in the buffer.
7309 If t, the encoded text is returned.
7310
7311 This function sets `last-coding-system-used' to the precise coding system
7312 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7313 not fully specified.)
7314 It returns the length of the encoded text.  */)
7315   (start, end, coding_system, destination)
7316      Lisp_Object start, end, coding_system, destination;
7317 {
7318   return code_convert_region (start, end, coding_system, destination, 1, 0);
7319 }
7320
7321 Lisp_Object
7322 code_convert_string (string, coding_system, dst_object,
7323                      encodep, nocopy, norecord)
7324      Lisp_Object string, coding_system, dst_object;
7325      int encodep, nocopy, norecord;
7326 {
7327   struct coding_system coding;
7328   EMACS_INT chars, bytes;
7329
7330   CHECK_STRING (string);
7331   if (NILP (coding_system))
7332     {
7333       if (! norecord)
7334         Vlast_coding_system_used = Qno_conversion;
7335       if (NILP (dst_object))
7336         return (nocopy ? Fcopy_sequence (string) : string);
7337     }
7338
7339   if (NILP (coding_system))
7340     coding_system = Qno_conversion;
7341   else
7342     CHECK_CODING_SYSTEM (coding_system);
7343   if (NILP (dst_object))
7344     dst_object = Qt;
7345   else if (! EQ (dst_object, Qt))
7346     CHECK_BUFFER (dst_object);
7347
7348   setup_coding_system (coding_system, &coding);
7349   coding.mode |= CODING_MODE_LAST_BLOCK;
7350   chars = XSTRING (string)->size;
7351   bytes = STRING_BYTES (XSTRING (string));
7352   if (encodep)
7353     encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
7354   else
7355     decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
7356   if (! norecord)
7357     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
7358
7359   if (coding.result != CODING_RESULT_SUCCESS)
7360     error ("Code conversion error: %d", coding.result);
7361
7362   return (BUFFERP (dst_object)
7363           ? make_number (coding.produced_char)
7364           : coding.dst_object);
7365 }
7366
7367
7368 /* Encode or decode STRING according to CODING_SYSTEM.
7369    Do not set Vlast_coding_system_used.
7370
7371    This function is called only from macros DECODE_FILE and
7372    ENCODE_FILE, thus we ignore character composition.  */
7373
7374 Lisp_Object
7375 code_convert_string_norecord (string, coding_system, encodep)
7376      Lisp_Object string, coding_system;
7377      int encodep;
7378 {
7379   return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
7380 }
7381
7382
7383 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
7384        2, 4, 0,
7385        doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
7386
7387 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
7388 if the decoding operation is trivial.
7389
7390 Optional fourth arg BUFFER non-nil meant that the decoded text is
7391 inserted in BUFFER instead of returned as a string.  In this case,
7392 the return value is BUFFER.
7393
7394 This function sets `last-coding-system-used' to the precise coding system
7395 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7396 not fully specified.  */)
7397   (string, coding_system, nocopy, buffer)
7398      Lisp_Object string, coding_system, nocopy, buffer;
7399 {
7400   return code_convert_string (string, coding_system, buffer,
7401                               0, ! NILP (nocopy), 0);
7402 }
7403
7404 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
7405        2, 4, 0,
7406        doc: /* Encode STRING to CODING-SYSTEM, and return the result.
7407
7408 Optional third arg NOCOPY non-nil means it is OK to return STRING
7409 itself if the encoding operation is trivial.
7410
7411 Optional fourth arg BUFFER non-nil meant that the encoded text is
7412 inserted in BUFFER instead of returned as a string.  In this case,
7413 the return value is BUFFER.
7414
7415 This function sets `last-coding-system-used' to the precise coding system
7416 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7417 not fully specified.)  */)
7418      (string, coding_system, nocopy, buffer)
7419      Lisp_Object string, coding_system, nocopy, buffer;
7420 {
7421   return code_convert_string (string, coding_system, buffer,
7422                               1, ! NILP (nocopy), 1);
7423 }
7424
7425 \f
7426 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
7427        doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
7428 Return the corresponding character.  */)
7429      (code)
7430      Lisp_Object code;
7431 {
7432   Lisp_Object spec, attrs, val;
7433   struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
7434   int c;
7435
7436   CHECK_NATNUM (code);
7437   c = XFASTINT (code);
7438   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
7439   attrs = AREF (spec, 0);
7440
7441   if (ASCII_BYTE_P (c)
7442       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
7443     return code;
7444
7445   val = CODING_ATTR_CHARSET_LIST (attrs);
7446   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
7447   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
7448   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
7449
7450   if (c <= 0x7F)
7451     charset = charset_roman;
7452   else if (c >= 0xA0 && c < 0xDF)
7453     {
7454       charset = charset_kana;
7455       c -= 0x80;
7456     }
7457   else
7458     {
7459       int s1 = c >> 8, s2 = c & 0xFF;
7460
7461       if (s1 < 0x81 || (s1 > 0x9F && s1 < 0xE0) || s1 > 0xEF
7462           || s2 < 0x40 || s2 == 0x7F || s2 > 0xFC)
7463         error ("Invalid code: %d", code);
7464       SJIS_TO_JIS (c);
7465       charset = charset_kanji;
7466     }
7467   c = DECODE_CHAR (charset, c);
7468   if (c < 0)
7469     error ("Invalid code: %d", code);
7470   return make_number (c);
7471 }
7472
7473
7474 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
7475        doc: /* Encode a Japanese character CHAR to shift_jis encoding.
7476 Return the corresponding code in SJIS.  */)
7477      (ch)
7478     Lisp_Object ch;
7479 {
7480   Lisp_Object spec, attrs, charset_list;
7481   int c;
7482   struct charset *charset;
7483   unsigned code;
7484
7485   CHECK_CHARACTER (ch);
7486   c = XFASTINT (ch);
7487   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
7488   attrs = AREF (spec, 0);
7489
7490   if (ASCII_CHAR_P (c)
7491       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
7492     return ch;
7493
7494   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
7495   charset = char_charset (c, charset_list, &code);
7496   if (code == CHARSET_INVALID_CODE (charset))
7497     error ("Can't encode by shift_jis encoding: %d", c);
7498   JIS_TO_SJIS (code);
7499
7500   return make_number (code);
7501 }
7502
7503 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
7504        doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
7505 Return the corresponding character.  */)
7506      (code)
7507      Lisp_Object code;
7508 {
7509   Lisp_Object spec, attrs, val;
7510   struct charset *charset_roman, *charset_big5, *charset;
7511   int c;
7512
7513   CHECK_NATNUM (code);
7514   c = XFASTINT (code);
7515   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
7516   attrs = AREF (spec, 0);
7517
7518   if (ASCII_BYTE_P (c)
7519       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
7520     return code;
7521
7522   val = CODING_ATTR_CHARSET_LIST (attrs);
7523   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
7524   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
7525
7526   if (c <= 0x7F)
7527     charset = charset_roman;
7528   else
7529     {
7530       int b1 = c >> 8, b2 = c & 0x7F;
7531       if (b1 < 0xA1 || b1 > 0xFE
7532           || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
7533         error ("Invalid code: %d", code);
7534       charset = charset_big5;
7535     }
7536   c = DECODE_CHAR (charset, (unsigned )c);
7537   if (c < 0)
7538     error ("Invalid code: %d", code);
7539   return make_number (c);
7540 }
7541
7542 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
7543        doc: /* Encode the Big5 character CHAR to BIG5 coding system.
7544 Return the corresponding character code in Big5.  */)
7545      (ch)
7546      Lisp_Object ch;
7547 {
7548   Lisp_Object spec, attrs, charset_list;
7549   struct charset *charset;
7550   int c;
7551   unsigned code;
7552
7553   CHECK_CHARACTER (ch);
7554   c = XFASTINT (ch);
7555   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
7556   attrs = AREF (spec, 0);
7557   if (ASCII_CHAR_P (c)
7558       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
7559     return ch;
7560
7561   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
7562   charset = char_charset (c, charset_list, &code);
7563   if (code == CHARSET_INVALID_CODE (charset))
7564     error ("Can't encode by Big5 encoding: %d", c);
7565
7566   return make_number (code);
7567 }
7568
7569 \f
7570 DEFUN ("set-terminal-coding-system-internal",
7571        Fset_terminal_coding_system_internal,
7572        Sset_terminal_coding_system_internal, 1, 1, 0,
7573        doc: /* Internal use only.  */)
7574      (coding_system)
7575      Lisp_Object coding_system;
7576 {
7577   CHECK_SYMBOL (coding_system);
7578   setup_coding_system (Fcheck_coding_system (coding_system),
7579                         &terminal_coding);
7580
7581   /* We had better not send unsafe characters to terminal.  */
7582   terminal_coding.mode |= CODING_MODE_SAFE_ENCODING;
7583   /* Characer composition should be disabled.  */
7584   terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7585   terminal_coding.src_multibyte = 1;
7586   terminal_coding.dst_multibyte = 0;
7587   return Qnil;
7588 }
7589
7590 DEFUN ("set-safe-terminal-coding-system-internal",
7591        Fset_safe_terminal_coding_system_internal,
7592        Sset_safe_terminal_coding_system_internal, 1, 1, 0,
7593        doc: /* Internal use only.  */)
7594      (coding_system)
7595      Lisp_Object coding_system;
7596 {
7597   CHECK_SYMBOL (coding_system);
7598   setup_coding_system (Fcheck_coding_system (coding_system),
7599                        &safe_terminal_coding);
7600   /* Characer composition should be disabled.  */
7601   safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7602   safe_terminal_coding.src_multibyte = 1;
7603   safe_terminal_coding.dst_multibyte = 0;
7604   return Qnil;
7605 }
7606
7607 DEFUN ("terminal-coding-system",
7608        Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
7609        doc: /* Return coding system specified for terminal output.  */)
7610      ()
7611 {
7612   return CODING_ID_NAME (terminal_coding.id);
7613 }
7614
7615 DEFUN ("set-keyboard-coding-system-internal",
7616        Fset_keyboard_coding_system_internal,
7617        Sset_keyboard_coding_system_internal, 1, 1, 0,
7618        doc: /* Internal use only.  */)
7619      (coding_system)
7620      Lisp_Object coding_system;
7621 {
7622   CHECK_SYMBOL (coding_system);
7623   setup_coding_system (Fcheck_coding_system (coding_system),
7624                        &keyboard_coding);
7625   /* Characer composition should be disabled.  */
7626   keyboard_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7627   return Qnil;
7628 }
7629
7630 DEFUN ("keyboard-coding-system",
7631        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
7632        doc: /* Return coding system specified for decoding keyboard input.  */)
7633      ()
7634 {
7635   return CODING_ID_NAME (keyboard_coding.id);
7636 }
7637
7638 \f
7639 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
7640        Sfind_operation_coding_system,  1, MANY, 0,
7641        doc: /* Choose a coding system for an operation based on the target name.
7642 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
7643 DECODING-SYSTEM is the coding system to use for decoding
7644 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
7645 for encoding (in case OPERATION does encoding).
7646
7647 The first argument OPERATION specifies an I/O primitive:
7648   For file I/O, `insert-file-contents' or `write-region'.
7649   For process I/O, `call-process', `call-process-region', or `start-process'.
7650   For network I/O, `open-network-stream'.
7651
7652 The remaining arguments should be the same arguments that were passed
7653 to the primitive.  Depending on which primitive, one of those arguments
7654 is selected as the TARGET.  For example, if OPERATION does file I/O,
7655 whichever argument specifies the file name is TARGET.
7656
7657 TARGET has a meaning which depends on OPERATION:
7658   For file I/O, TARGET is a file name.
7659   For process I/O, TARGET is a process name.
7660   For network I/O, TARGET is a service name or a port number
7661
7662 This function looks up what specified for TARGET in,
7663 `file-coding-system-alist', `process-coding-system-alist',
7664 or `network-coding-system-alist' depending on OPERATION.
7665 They may specify a coding system, a cons of coding systems,
7666 or a function symbol to call.
7667 In the last case, we call the function with one argument,
7668 which is a list of all the arguments given to this function.
7669
7670 usage: (find-operation-coding-system OPERATION ARGUMENTS ...)  */)
7671      (nargs, args)
7672      int nargs;
7673      Lisp_Object *args;
7674 {
7675   Lisp_Object operation, target_idx, target, val;
7676   register Lisp_Object chain;
7677
7678   if (nargs < 2)
7679     error ("Too few arguments");
7680   operation = args[0];
7681   if (!SYMBOLP (operation)
7682       || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
7683     error ("Invalid first arguement");
7684   if (nargs < 1 + XINT (target_idx))
7685     error ("Too few arguments for operation: %s",
7686            XSYMBOL (operation)->name->data);
7687   target = args[XINT (target_idx) + 1];
7688   if (!(STRINGP (target)
7689         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
7690     error ("Invalid %dth argument", XINT (target_idx) + 1);
7691
7692   chain = ((EQ (operation, Qinsert_file_contents)
7693             || EQ (operation, Qwrite_region))
7694            ? Vfile_coding_system_alist
7695            : (EQ (operation, Qopen_network_stream)
7696               ? Vnetwork_coding_system_alist
7697               : Vprocess_coding_system_alist));
7698   if (NILP (chain))
7699     return Qnil;
7700
7701   for (; CONSP (chain); chain = XCDR (chain))
7702     {
7703       Lisp_Object elt;
7704
7705       elt = XCAR (chain);
7706       if (CONSP (elt)
7707           && ((STRINGP (target)
7708                && STRINGP (XCAR (elt))
7709                && fast_string_match (XCAR (elt), target) >= 0)
7710               || (INTEGERP (target) && EQ (target, XCAR (elt)))))
7711         {
7712           val = XCDR (elt);
7713           /* Here, if VAL is both a valid coding system and a valid
7714              function symbol, we return VAL as a coding system.  */
7715           if (CONSP (val))
7716             return val;
7717           if (! SYMBOLP (val))
7718             return Qnil;
7719           if (! NILP (Fcoding_system_p (val)))
7720             return Fcons (val, val);
7721           if (! NILP (Ffboundp (val)))
7722             {
7723               val = call1 (val, Flist (nargs, args));
7724               if (CONSP (val))
7725                 return val;
7726               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
7727                 return Fcons (val, val);
7728             }
7729           return Qnil;
7730         }
7731     }
7732   return Qnil;
7733 }
7734
7735 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
7736        Sset_coding_system_priority, 0, MANY, 0,
7737        doc: /* Assign higher priority to the coding systems given as arguments.
7738 usage: (set-coding-system-priority CODING-SYSTEM ...)  */)
7739      (nargs, args)
7740      int nargs;
7741      Lisp_Object *args;
7742 {
7743   int i, j;
7744   int changed[coding_category_max];
7745   enum coding_category priorities[coding_category_max];
7746
7747   bzero (changed, sizeof changed);
7748
7749   for (i = j = 0; i < nargs; i++)
7750     {
7751       enum coding_category category;
7752       Lisp_Object spec, attrs;
7753
7754       CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
7755       attrs = AREF (spec, 0);
7756       category = XINT (CODING_ATTR_CATEGORY (attrs));
7757       if (changed[category])
7758         /* Ignore this coding system because a coding system of the
7759            same category already had a higher priority.  */
7760         continue;
7761       changed[category] = 1;
7762       priorities[j++] = category;
7763       if (coding_categories[category].id >= 0
7764           && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
7765         setup_coding_system (args[i], &coding_categories[category]);
7766     }
7767
7768   /* Now we have decided top J priorities.  Reflect the order of the
7769      original priorities to the remaining priorities.  */
7770
7771   for (i = j, j = 0; i < coding_category_max; i++, j++)
7772     {
7773       while (j < coding_category_max
7774              && changed[coding_priorities[j]])
7775         j++;
7776       if (j == coding_category_max)
7777         abort ();
7778       priorities[i] = coding_priorities[j];
7779     }
7780
7781   bcopy (priorities, coding_priorities, sizeof priorities);
7782   return Qnil;
7783 }
7784
7785 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
7786        Scoding_system_priority_list, 0, 1, 0,
7787        doc: /* Return a list of coding systems ordered by their priorities.
7788 HIGHESTP non-nil means just return the highest priority one.  */)
7789      (highestp)
7790      Lisp_Object highestp;
7791 {
7792   int i;
7793   Lisp_Object val;
7794
7795   for (i = 0, val = Qnil; i < coding_category_max; i++)
7796     {
7797       enum coding_category category = coding_priorities[i];
7798       int id = coding_categories[category].id;
7799       Lisp_Object attrs;
7800
7801       if (id < 0)
7802         continue;
7803       attrs = CODING_ID_ATTRS (id);
7804       if (! NILP (highestp))
7805         return CODING_ATTR_BASE_NAME (attrs);
7806       val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
7807     }
7808   return Fnreverse (val);
7809 }
7810
7811 static char *suffixes[] = { "-unix", "-dos", "-mac" };
7812
7813 static Lisp_Object
7814 make_subsidiaries (base)
7815      Lisp_Object base;
7816 {
7817   Lisp_Object subsidiaries;
7818   int base_name_len = STRING_BYTES (XSYMBOL (base)->name);
7819   char *buf = (char *) alloca (base_name_len + 6);
7820   int i;
7821
7822   bcopy (XSYMBOL (base)->name->data, buf, base_name_len);
7823   subsidiaries = Fmake_vector (make_number (3), Qnil);
7824   for (i = 0; i < 3; i++)
7825     {
7826       bcopy (suffixes[i], buf + base_name_len, strlen (suffixes[i]) + 1);
7827       ASET (subsidiaries, i, intern (buf));
7828     }
7829   return subsidiaries;
7830 }
7831
7832
7833 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
7834        Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
7835        doc: /* For internal use only.
7836 usage: (define-coding-system-internal ...)  */)
7837      (nargs, args)
7838      int nargs;
7839      Lisp_Object *args;
7840 {
7841   Lisp_Object name;
7842   Lisp_Object spec_vec;         /* [ ATTRS ALIASE EOL_TYPE ] */
7843   Lisp_Object attrs;            /* Vector of attributes.  */
7844   Lisp_Object eol_type;
7845   Lisp_Object aliases;
7846   Lisp_Object coding_type, charset_list, safe_charsets;
7847   enum coding_category category;
7848   Lisp_Object tail, val;
7849   int max_charset_id = 0;
7850   int i;
7851
7852   if (nargs < coding_arg_max)
7853     goto short_args;
7854
7855   attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
7856
7857   name = args[coding_arg_name];
7858   CHECK_SYMBOL (name);
7859   CODING_ATTR_BASE_NAME (attrs) = name;
7860
7861   val = args[coding_arg_mnemonic];
7862   if (! STRINGP (val))
7863     CHECK_CHARACTER (val);
7864   CODING_ATTR_MNEMONIC (attrs) = val;
7865
7866   coding_type = args[coding_arg_coding_type];
7867   CHECK_SYMBOL (coding_type);
7868   CODING_ATTR_TYPE (attrs) = coding_type;
7869
7870   charset_list = args[coding_arg_charset_list];
7871   if (SYMBOLP (charset_list))
7872     {
7873       if (EQ (charset_list, Qiso_2022))
7874         {
7875           if (! EQ (coding_type, Qiso_2022))
7876             error ("Invalid charset-list");
7877           charset_list = Viso_2022_charset_list;
7878         }
7879       else if (EQ (charset_list, Qemacs_mule))
7880         {
7881           if (! EQ (coding_type, Qemacs_mule))
7882             error ("Invalid charset-list");
7883           charset_list = Vemacs_mule_charset_list;
7884         }
7885       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
7886         if (max_charset_id < XFASTINT (XCAR (tail)))
7887           max_charset_id = XFASTINT (XCAR (tail));
7888     }
7889   else
7890     {
7891       charset_list = Fcopy_sequence (charset_list);
7892       for (tail = charset_list; !NILP (tail); tail = Fcdr (tail))
7893         {
7894           struct charset *charset;
7895
7896           val = Fcar (tail);
7897           CHECK_CHARSET_GET_CHARSET (val, charset);
7898           if (EQ (coding_type, Qiso_2022)
7899               ? CHARSET_ISO_FINAL (charset) < 0
7900               : EQ (coding_type, Qemacs_mule)
7901               ? CHARSET_EMACS_MULE_ID (charset) < 0
7902               : 0)
7903             error ("Can't handle charset `%s'",
7904                    XSYMBOL (CHARSET_NAME (charset))->name->data);
7905
7906           XCAR (tail) = make_number (charset->id);
7907           if (max_charset_id < charset->id)
7908             max_charset_id = charset->id;
7909         }
7910     }
7911   CODING_ATTR_CHARSET_LIST (attrs) = charset_list;
7912
7913   safe_charsets = Fmake_string (make_number (max_charset_id + 1),
7914                                 make_number (255));
7915   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
7916     XSTRING (safe_charsets)->data[XFASTINT (XCAR (tail))] = 0;
7917   CODING_ATTR_SAFE_CHARSETS (attrs) = safe_charsets;
7918
7919   CODING_ATTR_ASCII_COMPAT (attrs) = args[coding_arg_ascii_compatible_p];
7920
7921   val = args[coding_arg_decode_translation_table];
7922   if (! NILP (val))
7923     CHECK_CHAR_TABLE (val);
7924   CODING_ATTR_DECODE_TBL (attrs) = val;
7925
7926   val = args[coding_arg_encode_translation_table];
7927   if (! NILP (val))
7928     CHECK_CHAR_TABLE (val);
7929   CODING_ATTR_ENCODE_TBL (attrs) = val;
7930
7931   val = args[coding_arg_post_read_conversion];
7932   CHECK_SYMBOL (val);
7933   CODING_ATTR_POST_READ (attrs) = val;
7934
7935   val = args[coding_arg_pre_write_conversion];
7936   CHECK_SYMBOL (val);
7937   CODING_ATTR_PRE_WRITE (attrs) = val;
7938
7939   val = args[coding_arg_default_char];
7940   if (NILP (val))
7941     CODING_ATTR_DEFAULT_CHAR (attrs) = make_number (' ');
7942   else
7943     {
7944       CHECK_CHARACTER (val);
7945       CODING_ATTR_DEFAULT_CHAR (attrs) = val;
7946     }
7947
7948   val = args[coding_arg_plist];
7949   CHECK_LIST (val);
7950   CODING_ATTR_PLIST (attrs) = val;
7951
7952   if (EQ (coding_type, Qcharset))
7953     {
7954       Lisp_Object list;
7955       /* Generate a lisp vector of 256 elements.  Each element is nil,
7956          integer, or a list of charset IDs.
7957
7958          If Nth element is nil, the byte code N is invalid in this
7959          coding system.
7960
7961          If Nth element is a number NUM, N is the first byte of a
7962          charset whose ID is NUM.
7963
7964          If Nth element is a list of charset IDs, N is the first byte
7965          of one of them.  The list is sorted by dimensions of the
7966          charsets.  A charset of smaller dimension comes firtst.
7967       */
7968       for (list = Qnil, tail = charset_list; CONSP (tail); tail = XCDR (tail))
7969         {
7970           struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
7971
7972           if (charset->method == CHARSET_METHOD_SUPERSET)
7973             {
7974               val = CHARSET_SUPERSET (charset);
7975               for (; CONSP (val); val = XCDR (val))
7976                 list = Fcons (XCAR (XCAR (val)), list);
7977             }
7978           else
7979             list = Fcons (XCAR (tail), list);
7980         }
7981
7982       val = Fmake_vector (make_number (256), Qnil);
7983
7984       for (tail = Fnreverse (list); CONSP (tail); tail = XCDR (tail))
7985         {
7986           struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
7987           int dim = CHARSET_DIMENSION (charset);
7988           int idx = (dim - 1) * 4;
7989
7990           if (CHARSET_ASCII_COMPATIBLE_P (charset))
7991             CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
7992
7993           for (i = charset->code_space[idx];
7994                i <= charset->code_space[idx + 1]; i++)
7995             {
7996               Lisp_Object tmp, tmp2;
7997               int dim2;
7998
7999               tmp = AREF (val, i);
8000               if (NILP (tmp))
8001                 tmp = XCAR (tail);
8002               else if (NUMBERP (tmp))
8003                 {
8004                   dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
8005                   if (dim < dim2)
8006                     tmp = Fcons (XCAR (tail), Fcons (tmp, Qnil));
8007                   else
8008                     tmp = Fcons (tmp, Fcons (XCAR (tail), Qnil));
8009                 }
8010               else
8011                 {
8012                   for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
8013                     {
8014                       dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
8015                       if (dim < dim2)
8016                         break;
8017                     }
8018                   if (NILP (tmp2))
8019                     tmp = nconc2 (tmp, Fcons (XCAR (tail), Qnil));
8020                   else
8021                     {
8022                       XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
8023                       XSETCAR (tmp2, XCAR (tail));
8024                     }
8025                 }
8026               ASET (val, i, tmp);
8027             }
8028         }
8029       ASET (attrs, coding_attr_charset_valids, val);
8030       category = coding_category_charset;
8031     }
8032   else if (EQ (coding_type, Qccl))
8033     {
8034       Lisp_Object valids;
8035
8036       if (nargs < coding_arg_ccl_max)
8037         goto short_args;
8038
8039       val = args[coding_arg_ccl_decoder];
8040       CHECK_CCL_PROGRAM (val);
8041       if (VECTORP (val))
8042         val = Fcopy_sequence (val);
8043       ASET (attrs, coding_attr_ccl_decoder, val);
8044
8045       val = args[coding_arg_ccl_encoder];
8046       CHECK_CCL_PROGRAM (val);
8047       if (VECTORP (val))
8048         val = Fcopy_sequence (val);
8049       ASET (attrs, coding_attr_ccl_encoder, val);
8050
8051       val = args[coding_arg_ccl_valids];
8052       valids = Fmake_string (make_number (256), make_number (0));
8053       for (tail = val; !NILP (tail); tail = Fcdr (tail))
8054         {
8055           int from, to;
8056
8057           val = Fcar (tail);
8058           if (INTEGERP (val))
8059             {
8060               from = to = XINT (val);
8061               if (from < 0 || from > 255)
8062                 args_out_of_range_3 (val, make_number (0), make_number (255));
8063             }
8064           else
8065             {
8066               CHECK_CONS (val);
8067               CHECK_NUMBER (XCAR (val));
8068               CHECK_NUMBER (XCDR (val));
8069               from = XINT (XCAR (val));
8070               if (from < 0 || from > 255)
8071                 args_out_of_range_3 (XCAR (val),
8072                                      make_number (0), make_number (255));
8073               to = XINT (XCDR (val));
8074               if (to < from || to > 255)
8075                 args_out_of_range_3 (XCDR (val),
8076                                      XCAR (val), make_number (255));
8077             }
8078           for (i = from; i <= to; i++)
8079             XSTRING (valids)->data[i] = 1;
8080         }
8081       ASET (attrs, coding_attr_ccl_valids, valids);
8082
8083       category = coding_category_ccl;
8084     }
8085   else if (EQ (coding_type, Qutf_16))
8086     {
8087       Lisp_Object bom, endian;
8088
8089       CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
8090
8091       if (nargs < coding_arg_utf16_max)
8092         goto short_args;
8093
8094       bom = args[coding_arg_utf16_bom];
8095       if (! NILP (bom) && ! EQ (bom, Qt))
8096         {
8097           CHECK_CONS (bom);
8098           CHECK_CODING_SYSTEM (XCAR (bom));
8099           CHECK_CODING_SYSTEM (XCDR (bom));
8100         }
8101       ASET (attrs, coding_attr_utf_16_bom, bom);
8102
8103       endian = args[coding_arg_utf16_endian];
8104       CHECK_SYMBOL (endian);
8105       if (NILP (endian))
8106         endian = Qbig;
8107       else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
8108         error ("Invalid endian: %s", XSYMBOL (endian)->name->data);
8109       ASET (attrs, coding_attr_utf_16_endian, endian);
8110
8111       category = (CONSP (bom)
8112                   ? coding_category_utf_16_auto
8113                   : NILP (bom)
8114                   ? (EQ (endian, Qbig)
8115                      ? coding_category_utf_16_be_nosig
8116                      : coding_category_utf_16_le_nosig)
8117                   : (EQ (endian, Qbig)
8118                      ? coding_category_utf_16_be
8119                      : coding_category_utf_16_le));
8120     }
8121   else if (EQ (coding_type, Qiso_2022))
8122     {
8123       Lisp_Object initial, reg_usage, request, flags;
8124       int i;
8125
8126       if (nargs < coding_arg_iso2022_max)
8127         goto short_args;
8128
8129       initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
8130       CHECK_VECTOR (initial);
8131       for (i = 0; i < 4; i++)
8132         {
8133           val = Faref (initial, make_number (i));
8134           if (! NILP (val))
8135             {
8136               struct charset *charset;
8137
8138               CHECK_CHARSET_GET_CHARSET (val, charset);
8139               ASET (initial, i, make_number (CHARSET_ID (charset)));
8140               if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
8141                 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8142             }
8143           else
8144             ASET (initial, i, make_number (-1));
8145         }
8146
8147       reg_usage = args[coding_arg_iso2022_reg_usage];
8148       CHECK_CONS (reg_usage);
8149       CHECK_NATNUM (XCAR (reg_usage));
8150       CHECK_NATNUM (XCDR (reg_usage));
8151
8152       request = Fcopy_sequence (args[coding_arg_iso2022_request]);
8153       for (tail = request; ! NILP (tail); tail = Fcdr (tail))
8154         {
8155           int id;
8156
8157           val = Fcar (tail);
8158           CHECK_CONS (val);
8159           CHECK_CHARSET_GET_ID (XCAR (val), id);
8160           CHECK_NATNUM (XCDR (val));
8161           if (XINT (XCDR (val)) >= 4)
8162             error ("Invalid graphic register number: %d", XINT (XCDR (val)));
8163           XCAR (val) = make_number (id);
8164         }
8165
8166       flags = args[coding_arg_iso2022_flags];
8167       CHECK_NATNUM (flags);
8168       i = XINT (flags);
8169       if (EQ (args[coding_arg_charset_list], Qiso_2022))
8170         flags = make_number (i | CODING_ISO_FLAG_FULL_SUPPORT);
8171
8172       ASET (attrs, coding_attr_iso_initial, initial);
8173       ASET (attrs, coding_attr_iso_usage, reg_usage);
8174       ASET (attrs, coding_attr_iso_request, request);
8175       ASET (attrs, coding_attr_iso_flags, flags);
8176       setup_iso_safe_charsets (attrs);
8177
8178       if (i & CODING_ISO_FLAG_SEVEN_BITS)
8179         category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
8180                           | CODING_ISO_FLAG_SINGLE_SHIFT))
8181                     ? coding_category_iso_7_else
8182                     : EQ (args[coding_arg_charset_list], Qiso_2022)
8183                     ? coding_category_iso_7
8184                     : coding_category_iso_7_tight);
8185       else
8186         {
8187           int id = XINT (AREF (initial, 1));
8188
8189           category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
8190                        || EQ (args[coding_arg_charset_list], Qiso_2022)
8191                        || id < 0)
8192                       ? coding_category_iso_8_else
8193                       : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
8194                       ? coding_category_iso_8_1
8195                       : coding_category_iso_8_2);
8196         }
8197       if (category != coding_category_iso_8_1
8198           && category != coding_category_iso_8_2)
8199         CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
8200     }
8201   else if (EQ (coding_type, Qemacs_mule))
8202     {
8203       if (EQ (args[coding_arg_charset_list], Qemacs_mule))
8204         ASET (attrs, coding_attr_emacs_mule_full, Qt);
8205       CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8206       category = coding_category_emacs_mule;
8207     }
8208   else if (EQ (coding_type, Qshift_jis))
8209     {
8210
8211       struct charset *charset;
8212
8213       if (XINT (Flength (charset_list)) != 3)
8214         error ("There should be just three charsets");
8215
8216       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8217       if (CHARSET_DIMENSION (charset) != 1)
8218         error ("Dimension of charset %s is not one",
8219                XSYMBOL (CHARSET_NAME (charset))->name->data);
8220       if (CHARSET_ASCII_COMPATIBLE_P (charset))
8221         CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8222
8223       charset_list = XCDR (charset_list);
8224       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8225       if (CHARSET_DIMENSION (charset) != 1)
8226         error ("Dimension of charset %s is not one",
8227                XSYMBOL (CHARSET_NAME (charset))->name->data);
8228
8229       charset_list = XCDR (charset_list);
8230       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8231       if (CHARSET_DIMENSION (charset) != 2)
8232         error ("Dimension of charset %s is not two",
8233                XSYMBOL (CHARSET_NAME (charset))->name->data);
8234
8235       category = coding_category_sjis;
8236       Vsjis_coding_system = name;
8237     }
8238   else if (EQ (coding_type, Qbig5))
8239     {
8240       struct charset *charset;
8241
8242       if (XINT (Flength (charset_list)) != 2)
8243         error ("There should be just two charsets");
8244
8245       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8246       if (CHARSET_DIMENSION (charset) != 1)
8247         error ("Dimension of charset %s is not one",
8248                XSYMBOL (CHARSET_NAME (charset))->name->data);
8249       if (CHARSET_ASCII_COMPATIBLE_P (charset))
8250         CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8251
8252       charset_list = XCDR (charset_list);
8253       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8254       if (CHARSET_DIMENSION (charset) != 2)
8255         error ("Dimension of charset %s is not two",
8256                XSYMBOL (CHARSET_NAME (charset))->name->data);
8257
8258       category = coding_category_big5;
8259       Vbig5_coding_system = name;
8260     }
8261   else if (EQ (coding_type, Qraw_text))
8262     {
8263       category = coding_category_raw_text;
8264       CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8265     }
8266   else if (EQ (coding_type, Qutf_8))
8267     {
8268       category = coding_category_utf_8;
8269       CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8270     }
8271   else if (EQ (coding_type, Qundecided))
8272     category = coding_category_undecided;
8273   else
8274     error ("Invalid coding system type: %s",
8275            XSYMBOL (coding_type)->name->data);
8276
8277   CODING_ATTR_CATEGORY (attrs) = make_number (category);
8278
8279   eol_type = args[coding_arg_eol_type];
8280   if (! NILP (eol_type)
8281       && ! EQ (eol_type, Qunix)
8282       && ! EQ (eol_type, Qdos)
8283       && ! EQ (eol_type, Qmac))
8284     error ("Invalid eol-type");
8285
8286   aliases = Fcons (name, Qnil);
8287
8288   if (NILP (eol_type))
8289     {
8290       eol_type = make_subsidiaries (name);
8291       for (i = 0; i < 3; i++)
8292         {
8293           Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
8294
8295           this_name = AREF (eol_type, i);
8296           this_aliases = Fcons (this_name, Qnil);
8297           this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
8298           this_spec = Fmake_vector (make_number (3), attrs);
8299           ASET (this_spec, 1, this_aliases);
8300           ASET (this_spec, 2, this_eol_type);
8301           Fputhash (this_name, this_spec, Vcoding_system_hash_table);
8302           Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
8303           Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
8304                                         Vcoding_system_alist);
8305         }
8306     }
8307
8308   spec_vec = Fmake_vector (make_number (3), attrs);
8309   ASET (spec_vec, 1, aliases);
8310   ASET (spec_vec, 2, eol_type);
8311
8312   Fputhash (name, spec_vec, Vcoding_system_hash_table);
8313   Vcoding_system_list = Fcons (name, Vcoding_system_list);
8314   Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
8315                                 Vcoding_system_alist);
8316
8317   {
8318     int id = coding_categories[category].id;
8319
8320     if (id < 0 || EQ (name, CODING_ID_NAME (id)))
8321       setup_coding_system (name, &coding_categories[category]);
8322   }
8323
8324   return Qnil;
8325
8326  short_args:
8327   return Fsignal (Qwrong_number_of_arguments,
8328                   Fcons (intern ("define-coding-system-internal"),
8329                          make_number (nargs)));
8330 }
8331
8332 /* Fixme: should this record the alias relationships for
8333    diagnostics?  Should it update coding-system-list?  */
8334 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
8335        Sdefine_coding_system_alias, 2, 2, 0,
8336        doc: /* Define ALIAS as an alias for CODING-SYSTEM.  */)
8337      (alias, coding_system)
8338      Lisp_Object alias, coding_system;
8339 {
8340   Lisp_Object spec, aliases, eol_type;
8341
8342   CHECK_SYMBOL (alias);
8343   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
8344   aliases = AREF (spec, 1);
8345   while (!NILP (XCDR (aliases)))
8346     aliases = XCDR (aliases);
8347   XCDR (aliases) = Fcons (alias, Qnil);
8348
8349   eol_type = AREF (spec, 2);
8350   if (VECTORP (eol_type))
8351     {
8352       Lisp_Object subsidiaries;
8353       int i;
8354
8355       subsidiaries = make_subsidiaries (alias);
8356       for (i = 0; i < 3; i++)
8357         Fdefine_coding_system_alias (AREF (subsidiaries, i),
8358                                      AREF (eol_type, i));
8359
8360       ASET (spec, 2, subsidiaries);
8361     }
8362
8363   Fputhash (alias, spec, Vcoding_system_hash_table);
8364   Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
8365                                 Vcoding_system_alist);
8366
8367   return Qnil;
8368 }
8369
8370 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
8371        1, 1, 0,
8372        doc: /* Return the base of CODING-SYSTEM.
8373 Any alias or subsidiary coding system is not a base coding system.  */)
8374   (coding_system)
8375      Lisp_Object coding_system;
8376 {
8377   Lisp_Object spec, attrs;
8378
8379   if (NILP (coding_system))
8380     return (Qno_conversion);
8381   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
8382   attrs = AREF (spec, 0);
8383   return CODING_ATTR_BASE_NAME (attrs);
8384 }
8385
8386 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
8387        1, 1, 0,
8388        doc: "Return the property list of CODING-SYSTEM.")
8389      (coding_system)
8390      Lisp_Object coding_system;
8391 {
8392   Lisp_Object spec, attrs;
8393
8394   if (NILP (coding_system))
8395     coding_system = Qno_conversion;
8396   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
8397   attrs = AREF (spec, 0);
8398   return CODING_ATTR_PLIST (attrs);
8399 }
8400
8401
8402 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
8403        1, 1, 0,
8404        doc: /* Return the list of aliases of CODING-SYSTEM.  */)
8405      (coding_system)
8406      Lisp_Object coding_system;
8407 {
8408   Lisp_Object spec;
8409
8410   if (NILP (coding_system))
8411     coding_system = Qno_conversion;
8412   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
8413   return AREF (spec, 1);
8414 }
8415
8416 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
8417        Scoding_system_eol_type, 1, 1, 0,
8418        doc: /* Return eol-type of CODING-SYSTEM.
8419 An eol-type is integer 0, 1, 2, or a vector of coding systems.
8420
8421 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
8422 and CR respectively.
8423
8424 A vector value indicates that a format of end-of-line should be
8425 detected automatically.  Nth element of the vector is the subsidiary
8426 coding system whose eol-type is N.  */)
8427      (coding_system)
8428      Lisp_Object coding_system;
8429 {
8430   Lisp_Object spec, eol_type;
8431   int n;
8432
8433   if (NILP (coding_system))
8434     coding_system = Qno_conversion;
8435   if (! CODING_SYSTEM_P (coding_system))
8436     return Qnil;
8437   spec = CODING_SYSTEM_SPEC (coding_system);
8438   eol_type = AREF (spec, 2);
8439   if (VECTORP (eol_type))
8440     return Fcopy_sequence (eol_type);
8441   n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
8442   return make_number (n);
8443 }
8444
8445 #endif /* emacs */
8446
8447 \f
8448 /*** 9. Post-amble ***/
8449
8450 void
8451 init_coding_once ()
8452 {
8453   int i;
8454
8455   for (i = 0; i < coding_category_max; i++)
8456     {
8457       coding_categories[i].id = -1;
8458       coding_priorities[i] = i;
8459     }
8460
8461   /* ISO2022 specific initialize routine.  */
8462   for (i = 0; i < 0x20; i++)
8463     iso_code_class[i] = ISO_control_0;
8464   for (i = 0x21; i < 0x7F; i++)
8465     iso_code_class[i] = ISO_graphic_plane_0;
8466   for (i = 0x80; i < 0xA0; i++)
8467     iso_code_class[i] = ISO_control_1;
8468   for (i = 0xA1; i < 0xFF; i++)
8469     iso_code_class[i] = ISO_graphic_plane_1;
8470   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
8471   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
8472   iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
8473   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
8474   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
8475   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
8476   iso_code_class[ISO_CODE_ESC] = ISO_escape;
8477   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
8478   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
8479   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
8480
8481   inhibit_pre_post_conversion = 0;
8482
8483   for (i = 0; i < 256; i++)
8484     {
8485       emacs_mule_bytes[i] = 1;
8486     }
8487   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
8488   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
8489   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
8490   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
8491 }
8492
8493 #ifdef emacs
8494
8495 void
8496 syms_of_coding ()
8497 {
8498   staticpro (&Vcoding_system_hash_table);
8499   Vcoding_system_hash_table = Fmakehash (Qeq);
8500
8501   staticpro (&Vsjis_coding_system);
8502   Vsjis_coding_system = Qnil;
8503
8504   staticpro (&Vbig5_coding_system);
8505   Vbig5_coding_system = Qnil;
8506
8507   staticpro (&Vcode_conversion_work_buf_list);
8508   Vcode_conversion_work_buf_list = Qnil;
8509
8510   staticpro (&Vcode_conversion_reused_work_buf);
8511   Vcode_conversion_reused_work_buf = Qnil;
8512
8513   DEFSYM (Qcharset, "charset");
8514   DEFSYM (Qtarget_idx, "target-idx");
8515   DEFSYM (Qcoding_system_history, "coding-system-history");
8516   Fset (Qcoding_system_history, Qnil);
8517
8518   /* Target FILENAME is the first argument.  */
8519   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
8520   /* Target FILENAME is the third argument.  */
8521   Fput (Qwrite_region, Qtarget_idx, make_number (2));
8522
8523   DEFSYM (Qcall_process, "call-process");
8524   /* Target PROGRAM is the first argument.  */
8525   Fput (Qcall_process, Qtarget_idx, make_number (0));
8526
8527   DEFSYM (Qcall_process_region, "call-process-region");
8528   /* Target PROGRAM is the third argument.  */
8529   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
8530
8531   DEFSYM (Qstart_process, "start-process");
8532   /* Target PROGRAM is the third argument.  */
8533   Fput (Qstart_process, Qtarget_idx, make_number (2));
8534
8535   DEFSYM (Qopen_network_stream, "open-network-stream");
8536   /* Target SERVICE is the fourth argument.  */
8537   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
8538
8539   DEFSYM (Qcoding_system, "coding-system");
8540   DEFSYM (Qcoding_aliases, "coding-aliases");
8541
8542   DEFSYM (Qeol_type, "eol-type");
8543   DEFSYM (Qunix, "unix");
8544   DEFSYM (Qdos, "dos");
8545
8546   DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
8547   DEFSYM (Qpost_read_conversion, "post-read-conversion");
8548   DEFSYM (Qpre_write_conversion, "pre-write-conversion");
8549   DEFSYM (Qdefault_char, "default-char");
8550   DEFSYM (Qundecided, "undecided");
8551   DEFSYM (Qno_conversion, "no-conversion");
8552   DEFSYM (Qraw_text, "raw-text");
8553
8554   DEFSYM (Qiso_2022, "iso-2022");
8555
8556   DEFSYM (Qutf_8, "utf-8");
8557
8558   DEFSYM (Qutf_16, "utf-16");
8559   DEFSYM (Qbig, "big");
8560   DEFSYM (Qlittle, "little");
8561
8562   DEFSYM (Qshift_jis, "shift-jis");
8563   DEFSYM (Qbig5, "big5");
8564
8565   DEFSYM (Qcoding_system_p, "coding-system-p");
8566
8567   DEFSYM (Qcoding_system_error, "coding-system-error");
8568   Fput (Qcoding_system_error, Qerror_conditions,
8569         Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
8570   Fput (Qcoding_system_error, Qerror_message,
8571         build_string ("Invalid coding system"));
8572
8573   /* Intern this now in case it isn't already done.
8574      Setting this variable twice is harmless.
8575      But don't staticpro it here--that is done in alloc.c.  */
8576   Qchar_table_extra_slots = intern ("char-table-extra-slots");
8577
8578   DEFSYM (Qtranslation_table, "translation-table");
8579   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (1));
8580   DEFSYM (Qtranslation_table_id, "translation-table-id");
8581   DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
8582   DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
8583
8584   DEFSYM (Qvalid_codes, "valid-codes");
8585
8586   DEFSYM (Qemacs_mule, "emacs-mule");
8587
8588   Vcoding_category_table
8589     = Fmake_vector (make_number (coding_category_max), Qnil);
8590   staticpro (&Vcoding_category_table);
8591   /* Followings are target of code detection.  */
8592   ASET (Vcoding_category_table, coding_category_iso_7,
8593         intern ("coding-category-iso-7"));
8594   ASET (Vcoding_category_table, coding_category_iso_7_tight,
8595         intern ("coding-category-iso-7-tight"));
8596   ASET (Vcoding_category_table, coding_category_iso_8_1,
8597         intern ("coding-category-iso-8-1"));
8598   ASET (Vcoding_category_table, coding_category_iso_8_2,
8599         intern ("coding-category-iso-8-2"));
8600   ASET (Vcoding_category_table, coding_category_iso_7_else,
8601         intern ("coding-category-iso-7-else"));
8602   ASET (Vcoding_category_table, coding_category_iso_8_else,
8603         intern ("coding-category-iso-8-else"));
8604   ASET (Vcoding_category_table, coding_category_utf_8,
8605         intern ("coding-category-utf-8"));
8606   ASET (Vcoding_category_table, coding_category_utf_16_be,
8607         intern ("coding-category-utf-16-be"));
8608   ASET (Vcoding_category_table, coding_category_utf_16_le,
8609         intern ("coding-category-utf-16-le"));
8610   ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
8611         intern ("coding-category-utf-16-be-nosig"));
8612   ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
8613         intern ("coding-category-utf-16-le-nosig"));
8614   ASET (Vcoding_category_table, coding_category_charset,
8615         intern ("coding-category-charset"));
8616   ASET (Vcoding_category_table, coding_category_sjis,
8617         intern ("coding-category-sjis"));
8618   ASET (Vcoding_category_table, coding_category_big5,
8619         intern ("coding-category-big5"));
8620   ASET (Vcoding_category_table, coding_category_ccl,
8621         intern ("coding-category-ccl"));
8622   ASET (Vcoding_category_table, coding_category_emacs_mule,
8623         intern ("coding-category-emacs-mule"));
8624   /* Followings are NOT target of code detection.  */
8625   ASET (Vcoding_category_table, coding_category_raw_text,
8626         intern ("coding-category-raw-text"));
8627   ASET (Vcoding_category_table, coding_category_undecided,
8628         intern ("coding-category-undecided"));
8629
8630   defsubr (&Scoding_system_p);
8631   defsubr (&Sread_coding_system);
8632   defsubr (&Sread_non_nil_coding_system);
8633   defsubr (&Scheck_coding_system);
8634   defsubr (&Sdetect_coding_region);
8635   defsubr (&Sdetect_coding_string);
8636   defsubr (&Sfind_coding_systems_region_internal);
8637   defsubr (&Scheck_coding_systems_region);
8638   defsubr (&Sdecode_coding_region);
8639   defsubr (&Sencode_coding_region);
8640   defsubr (&Sdecode_coding_string);
8641   defsubr (&Sencode_coding_string);
8642   defsubr (&Sdecode_sjis_char);
8643   defsubr (&Sencode_sjis_char);
8644   defsubr (&Sdecode_big5_char);
8645   defsubr (&Sencode_big5_char);
8646   defsubr (&Sset_terminal_coding_system_internal);
8647   defsubr (&Sset_safe_terminal_coding_system_internal);
8648   defsubr (&Sterminal_coding_system);
8649   defsubr (&Sset_keyboard_coding_system_internal);
8650   defsubr (&Skeyboard_coding_system);
8651   defsubr (&Sfind_operation_coding_system);
8652   defsubr (&Sset_coding_system_priority);
8653   defsubr (&Sdefine_coding_system_internal);
8654   defsubr (&Sdefine_coding_system_alias);
8655   defsubr (&Scoding_system_base);
8656   defsubr (&Scoding_system_plist);
8657   defsubr (&Scoding_system_aliases);
8658   defsubr (&Scoding_system_eol_type);
8659   defsubr (&Scoding_system_priority_list);
8660
8661   DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
8662                doc: /* List of coding systems.
8663
8664 Do not alter the value of this variable manually.  This variable should be
8665 updated by the functions `define-coding-system' and
8666 `define-coding-system-alias'.  */);
8667   Vcoding_system_list = Qnil;
8668
8669   DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
8670                doc: /* Alist of coding system names.
8671 Each element is one element list of coding system name.
8672 This variable is given to `completing-read' as TABLE argument.
8673
8674 Do not alter the value of this variable manually.  This variable should be
8675 updated by the functions `make-coding-system' and
8676 `define-coding-system-alias'.  */);
8677   Vcoding_system_alist = Qnil;
8678
8679   DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
8680                doc: /* List of coding-categories (symbols) ordered by priority.
8681
8682 On detecting a coding system, Emacs tries code detection algorithms
8683 associated with each coding-category one by one in this order.  When
8684 one algorithm agrees with a byte sequence of source text, the coding
8685 system bound to the corresponding coding-category is selected.  */);
8686   {
8687     int i;
8688
8689     Vcoding_category_list = Qnil;
8690     for (i = coding_category_max - 1; i >= 0; i--)
8691       Vcoding_category_list
8692         = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
8693                  Vcoding_category_list);
8694   }
8695
8696   DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
8697                doc: /* Specify the coding system for read operations.
8698 It is useful to bind this variable with `let', but do not set it globally.
8699 If the value is a coding system, it is used for decoding on read operation.
8700 If not, an appropriate element is used from one of the coding system alists:
8701 There are three such tables, `file-coding-system-alist',
8702 `process-coding-system-alist', and `network-coding-system-alist'.  */);
8703   Vcoding_system_for_read = Qnil;
8704
8705   DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
8706                doc: /* Specify the coding system for write operations.
8707 Programs bind this variable with `let', but you should not set it globally.
8708 If the value is a coding system, it is used for encoding of output,
8709 when writing it to a file and when sending it to a file or subprocess.
8710
8711 If this does not specify a coding system, an appropriate element
8712 is used from one of the coding system alists:
8713 There are three such tables, `file-coding-system-alist',
8714 `process-coding-system-alist', and `network-coding-system-alist'.
8715 For output to files, if the above procedure does not specify a coding system,
8716 the value of `buffer-file-coding-system' is used.  */);
8717   Vcoding_system_for_write = Qnil;
8718
8719   DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
8720                doc: /*
8721 Coding system used in the latest file or process I/O.  */);
8722   Vlast_coding_system_used = Qnil;
8723
8724   DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
8725                doc: /*
8726 *Non-nil means always inhibit code conversion of end-of-line format.
8727 See info node `Coding Systems' and info node `Text and Binary' concerning
8728 such conversion.  */);
8729   inhibit_eol_conversion = 0;
8730
8731   DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
8732                doc: /*
8733 Non-nil means process buffer inherits coding system of process output.
8734 Bind it to t if the process output is to be treated as if it were a file
8735 read from some filesystem.  */);
8736   inherit_process_coding_system = 0;
8737
8738   DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
8739                doc: /*
8740 Alist to decide a coding system to use for a file I/O operation.
8741 The format is ((PATTERN . VAL) ...),
8742 where PATTERN is a regular expression matching a file name,
8743 VAL is a coding system, a cons of coding systems, or a function symbol.
8744 If VAL is a coding system, it is used for both decoding and encoding
8745 the file contents.
8746 If VAL is a cons of coding systems, the car part is used for decoding,
8747 and the cdr part is used for encoding.
8748 If VAL is a function symbol, the function must return a coding system
8749 or a cons of coding systems which are used as above.  The function gets
8750 the arguments with which `find-operation-coding-systems' was called.
8751
8752 See also the function `find-operation-coding-system'
8753 and the variable `auto-coding-alist'.  */);
8754   Vfile_coding_system_alist = Qnil;
8755
8756   DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
8757                doc: /*
8758 Alist to decide a coding system to use for a process I/O operation.
8759 The format is ((PATTERN . VAL) ...),
8760 where PATTERN is a regular expression matching a program name,
8761 VAL is a coding system, a cons of coding systems, or a function symbol.
8762 If VAL is a coding system, it is used for both decoding what received
8763 from the program and encoding what sent to the program.
8764 If VAL is a cons of coding systems, the car part is used for decoding,
8765 and the cdr part is used for encoding.
8766 If VAL is a function symbol, the function must return a coding system
8767 or a cons of coding systems which are used as above.
8768
8769 See also the function `find-operation-coding-system'.  */);
8770   Vprocess_coding_system_alist = Qnil;
8771
8772   DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
8773                doc: /*
8774 Alist to decide a coding system to use for a network I/O operation.
8775 The format is ((PATTERN . VAL) ...),
8776 where PATTERN is a regular expression matching a network service name
8777 or is a port number to connect to,
8778 VAL is a coding system, a cons of coding systems, or a function symbol.
8779 If VAL is a coding system, it is used for both decoding what received
8780 from the network stream and encoding what sent to the network stream.
8781 If VAL is a cons of coding systems, the car part is used for decoding,
8782 and the cdr part is used for encoding.
8783 If VAL is a function symbol, the function must return a coding system
8784 or a cons of coding systems which are used as above.
8785
8786 See also the function `find-operation-coding-system'.  */);
8787   Vnetwork_coding_system_alist = Qnil;
8788
8789   DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
8790                doc: /* Coding system to use with system messages.
8791 Also used for decoding keyboard input on X Window system.  */);
8792   Vlocale_coding_system = Qnil;
8793
8794   /* The eol mnemonics are reset in startup.el system-dependently.  */
8795   DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
8796                doc: /*
8797 *String displayed in mode line for UNIX-like (LF) end-of-line format.  */);
8798   eol_mnemonic_unix = build_string (":");
8799
8800   DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
8801                doc: /*
8802 *String displayed in mode line for DOS-like (CRLF) end-of-line format.  */);
8803   eol_mnemonic_dos = build_string ("\\");
8804
8805   DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
8806                doc: /*
8807 *String displayed in mode line for MAC-like (CR) end-of-line format.  */);
8808   eol_mnemonic_mac = build_string ("/");
8809
8810   DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
8811                doc: /*
8812 *String displayed in mode line when end-of-line format is not yet determined.  */);
8813   eol_mnemonic_undecided = build_string (":");
8814
8815   DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
8816                doc: /*
8817 *Non-nil enables character translation while encoding and decoding.  */);
8818   Venable_character_translation = Qt;
8819
8820   DEFVAR_LISP ("standard-translation-table-for-decode",
8821                &Vstandard_translation_table_for_decode,
8822                doc: /* Table for translating characters while decoding.  */);
8823   Vstandard_translation_table_for_decode = Qnil;
8824
8825   DEFVAR_LISP ("standard-translation-table-for-encode",
8826                &Vstandard_translation_table_for_encode,
8827                doc: /* Table for translating characters while encoding.  */);
8828   Vstandard_translation_table_for_encode = Qnil;
8829
8830   DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_table,
8831                doc: /* Alist of charsets vs revision numbers.
8832 While encoding, if a charset (car part of an element) is found,
8833 designate it with the escape sequence identifying revision (cdr part
8834 of the element).  */);
8835   Vcharset_revision_table = Qnil;
8836
8837   DEFVAR_LISP ("default-process-coding-system",
8838                &Vdefault_process_coding_system,
8839                doc: /* Cons of coding systems used for process I/O by default.
8840 The car part is used for decoding a process output,
8841 the cdr part is used for encoding a text to be sent to a process.  */);
8842   Vdefault_process_coding_system = Qnil;
8843
8844   DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
8845                doc: /*
8846 Table of extra Latin codes in the range 128..159 (inclusive).
8847 This is a vector of length 256.
8848 If Nth element is non-nil, the existence of code N in a file
8849 \(or output of subprocess) doesn't prevent it to be detected as
8850 a coding system of ISO 2022 variant which has a flag
8851 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
8852 or reading output of a subprocess.
8853 Only 128th through 159th elements has a meaning.  */);
8854   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
8855
8856   DEFVAR_LISP ("select-safe-coding-system-function",
8857                &Vselect_safe_coding_system_function,
8858                doc: /*
8859 Function to call to select safe coding system for encoding a text.
8860
8861 If set, this function is called to force a user to select a proper
8862 coding system which can encode the text in the case that a default
8863 coding system used in each operation can't encode the text.
8864
8865 The default value is `select-safe-coding-system' (which see).  */);
8866   Vselect_safe_coding_system_function = Qnil;
8867
8868   DEFVAR_BOOL ("inhibit-iso-escape-detection",
8869                &inhibit_iso_escape_detection,
8870                doc: /*
8871 If non-nil, Emacs ignores ISO2022's escape sequence on code detection.
8872
8873 By default, on reading a file, Emacs tries to detect how the text is
8874 encoded.  This code detection is sensitive to escape sequences.  If
8875 the sequence is valid as ISO2022, the code is determined as one of
8876 the ISO2022 encodings, and the file is decoded by the corresponding
8877 coding system (e.g. `iso-2022-7bit').
8878
8879 However, there may be a case that you want to read escape sequences in
8880 a file as is.  In such a case, you can set this variable to non-nil.
8881 Then, as the code detection ignores any escape sequences, no file is
8882 detected as encoded in some ISO2022 encoding.  The result is that all
8883 escape sequences become visible in a buffer.
8884
8885 The default value is nil, and it is strongly recommended not to change
8886 it.  That is because many Emacs Lisp source files that contain
8887 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
8888 in Emacs's distribution, and they won't be decoded correctly on
8889 reading if you suppress escape sequence detection.
8890
8891 The other way to read escape sequences in a file without decoding is
8892 to explicitly specify some coding system that doesn't use ISO2022's
8893 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument].  */);
8894   inhibit_iso_escape_detection = 0;
8895
8896   {
8897     Lisp_Object args[coding_arg_max];
8898     Lisp_Object plist[14];
8899     int i;
8900
8901     for (i = 0; i < coding_arg_max; i++)
8902       args[i] = Qnil;
8903
8904     plist[0] = intern (":name");
8905     plist[1] = args[coding_arg_name] = Qno_conversion;
8906     plist[2] = intern (":mnemonic");
8907     plist[3] = args[coding_arg_mnemonic] = make_number ('=');
8908     plist[4] = intern (":coding-type");
8909     plist[5] = args[coding_arg_coding_type] = Qraw_text;
8910     plist[6] = intern (":ascii-compatible-p");
8911     plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
8912     plist[8] = intern (":default-char");
8913     plist[9] = args[coding_arg_default_char] = make_number (0);
8914     plist[10] = intern (":docstring");
8915     plist[11] = build_string ("Do no conversion.\n\
8916 \n\
8917 When you visit a file with this coding, the file is read into a\n\
8918 unibyte buffer as is, thus each byte of a file is treated as a\n\
8919 character.");
8920     plist[12] = intern (":eol-type");
8921     plist[13] = args[coding_arg_eol_type] = Qunix;
8922     args[coding_arg_plist] = Flist (14, plist);
8923     Fdefine_coding_system_internal (coding_arg_max, args);
8924   }
8925
8926   setup_coding_system (Qno_conversion, &keyboard_coding);
8927   setup_coding_system (Qno_conversion, &terminal_coding);
8928   setup_coding_system (Qno_conversion, &safe_terminal_coding);
8929 }
8930
8931 char *
8932 emacs_strerror (error_number)
8933      int error_number;
8934 {
8935   char *str;
8936
8937   synchronize_system_messages_locale ();
8938   str = strerror (error_number);
8939
8940   if (! NILP (Vlocale_coding_system))
8941     {
8942       Lisp_Object dec = code_convert_string_norecord (build_string (str),
8943                                                       Vlocale_coding_system,
8944                                                       0);
8945       str = (char *) XSTRING (dec)->data;
8946     }
8947
8948   return str;
8949 }
8950
8951 #endif /* emacs */