src/coding.c

   1 /* Coding system handler (conversion, detection, etc).
   2    Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
   3      Licensed to the Free Software Foundation.
   4    Copyright (C) 2001, 2002 Free Software Foundation, Inc.
   5    Copyright (C) 2003
   6      National Institute of Advanced Industrial Science and Technology (AIST)
   7      Registration Number H13PRO009
   8
   9 This file is part of GNU Emacs.
  10
  11 GNU Emacs is free software; you can redistribute it and/or modify
  12 it under the terms of the GNU General Public License as published by
  13 the Free Software Foundation; either version 2, or (at your option)
  14 any later version.
  15
  16 GNU Emacs is distributed in the hope that it will be useful,
  17 but WITHOUT ANY WARRANTY; without even the implied warranty of
  18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  19 GNU General Public License for more details.
  20
  21 You should have received a copy of the GNU General Public License
  22 along with GNU Emacs; see the file COPYING.  If not, write to
  23 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  24 Boston, MA 02111-1307, USA.  */
  25
  26 /*** TABLE OF CONTENTS ***
  27
  28   0. General comments
  29   1. Preamble
  30   2. Emacs' internal format (emacs-utf-8) handlers
  31   3. UTF-8 handlers
  32   4. UTF-16 handlers
  33   5. Charset-base coding systems handlers
  34   6. emacs-mule (old Emacs' internal format) handlers
  35   7. ISO2022 handlers
  36   8. Shift-JIS and BIG5 handlers
  37   9. CCL handlers
  38   10. C library functions
  39   11. Emacs Lisp library functions
  40   12. Postamble
  41
  42 */
  43
  44 /*** 0. General comments ***
  45
  46
  47 CODING SYSTEM
  48
  49   A coding system is an object for an encoding mechanism that contains
  50   information about how to convert byte sequences to character
  51   sequences and vice versa.  When we say "decode", it means converting
  52   a byte sequence of a specific coding system into a character
  53   sequence that is represented by Emacs' internal coding system
  54   `emacs-utf-8', and when we say "encode", it means converting a
  55   character sequence of emacs-utf-8 to a byte sequence of a specific
  56   coding system.
  57
  58   In Emacs Lisp, a coding system is represented by a Lisp symbol.  In
  59   C level, a coding system is represented by a vector of attributes
  60   stored in the hash table Vcharset_hash_table.  The conversion from
  61   coding system symbol to attributes vector is done by looking up
  62   Vcharset_hash_table by the symbol.
  63
  64   Coding systems are classified into the following types depending on
  65   the encoding mechanism.  Here's a brief description of the types.
  66
  67   o UTF-8
  68
  69   o UTF-16
  70
  71   o Charset-base coding system
  72
  73   A coding system defined by one or more (coded) character sets.
  74   Decoding and encoding are done by a code converter defined for each
  75   character set.
  76
  77   o Old Emacs internal format (emacs-mule)
  78
  79   The coding system adopted by old versions of Emacs (20 and 21).
  80
  81   o ISO2022-base coding system
  82
  83   The most famous coding system for multiple character sets.  X's
  84   Compound Text, various EUCs (Extended Unix Code), and coding systems
  85   used in the Internet communication such as ISO-2022-JP are all
  86   variants of ISO2022.
  87
  88   o SJIS (or Shift-JIS or MS-Kanji-Code)
  89
  90   A coding system to encode character sets: ASCII, JISX0201, and
  91   JISX0208.  Widely used for PC's in Japan.  Details are described in
  92   section 8.
  93
  94   o BIG5
  95
  96   A coding system to encode character sets: ASCII and Big5.  Widely
  97   used for Chinese (mainly in Taiwan and Hong Kong).  Details are
  98   described in section 8.  In this file, when we write "big5" (all
  99   lowercase), we mean the coding system, and when we write "Big5"
 100   (capitalized), we mean the character set.
 101
 102   o CCL
 103
 104   If a user wants to decode/encode text encoded in a coding system
 105   not listed above, he can supply a decoder and an encoder for it in
 106   CCL (Code Conversion Language) programs.  Emacs executes the CCL
 107   program while decoding/encoding.
 108
 109   o Raw-text
 110
 111   A coding system for text containing raw eight-bit data.  Emacs
 112   treats each byte of source text as a character (except for
 113   end-of-line conversion).
 114
 115   o No-conversion
 116
 117   Like raw text, but don't do end-of-line conversion.
 118
 119
 120 END-OF-LINE FORMAT
 121
 122   How text end-of-line is encoded depends on operating system.  For
 123   instance, Unix's format is just one byte of LF (line-feed) code,
 124   whereas DOS's format is two-byte sequence of `carriage-return' and
 125   `line-feed' codes.  MacOS's format is usually one byte of
 126   `carriage-return'.
 127
 128   Since text character encoding and end-of-line encoding are
 129   independent, any coding system described above can take any format
 130   of end-of-line (except for no-conversion).
 131
 132 STRUCT CODING_SYSTEM
 133
 134   Before using a coding system for code conversion (i.e. decoding and
 135   encoding), we setup a structure of type `struct coding_system'.
 136   This structure keeps various information about a specific code
 137   conversion (e.g. the location of source and destination data).
 138
 139 */
 140
 141 /* COMMON MACROS */
 142
 143
 144 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 145
 146   These functions check if a byte sequence specified as a source in
 147   CODING conforms to the format of XXX, and update the members of
 148   DETECT_INFO.
 149
 150   Return 1 if the byte sequence conforms to XXX, otherwise return 0.
 151
 152   Below is the template of these functions.  */
 153
 154 #if 0
 155 static int
 156 detect_coding_XXX (coding, detect_info)
 157      struct coding_system *coding;
 158      struct coding_detection_info *detect_info;
 159 {
 160   unsigned char *src = coding->source;
 161   unsigned char *src_end = coding->source + coding->src_bytes;
 162   int multibytep = coding->src_multibyte;
 163   int consumed_chars = 0;
 164   int found = 0;
 165   ...;
 166
 167   while (1)
 168     {
 169       /* Get one byte from the source.  If the souce is exausted, jump
 170          to no_more_source:.  */
 171       ONE_MORE_BYTE (c);
 172
 173       if (! __C_conforms_to_XXX___ (c))
 174         break;
 175       if (! __C_strongly_suggests_XXX__ (c))
 176         found = CATEGORY_MASK_XXX;
 177     }
 178   /* The byte sequence is invalid for XXX.  */
 179   detect_info->rejected |= CATEGORY_MASK_XXX;
 180   return 0;
 181
 182  no_more_source:
 183   /* The source exausted successfully.  */
 184   detect_info->found |= found;
 185   return 1;
 186 }
 187 #endif
 188
 189 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 190
 191   These functions decode a byte sequence specified as a source by
 192   CODING.  The resulting multibyte text goes to a place pointed to by
 193   CODING->charbuf, the length of which should not exceed
 194   CODING->charbuf_size;
 195
 196   These functions set the information of original and decoded texts in
 197   CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
 198   They also set CODING->result to one of CODING_RESULT_XXX indicating
 199   how the decoding is finished.
 200
 201   Below is the template of these functions.  */
 202
 203 #if 0
 204 static void
 205 decode_coding_XXXX (coding)
 206      struct coding_system *coding;
 207 {
 208   unsigned char *src = coding->source + coding->consumed;
 209   unsigned char *src_end = coding->source + coding->src_bytes;
 210   /* SRC_BASE remembers the start position in source in each loop.
 211      The loop will be exited when there's not enough source code, or
 212      when there's no room in CHARBUF for a decoded character.  */
 213   unsigned char *src_base;
 214   /* A buffer to produce decoded characters.  */
 215   int *charbuf = coding->charbuf;
 216   int *charbuf_end = charbuf + coding->charbuf_size;
 217   int multibytep = coding->src_multibyte;
 218
 219   while (1)
 220     {
 221       src_base = src;
 222       if (charbuf < charbuf_end)
 223         /* No more room to produce a decoded character.  */
 224         break;
 225       ONE_MORE_BYTE (c);
 226       /* Decode it. */
 227     }
 228
 229  no_more_source:
 230   if (src_base < src_end
 231       && coding->mode & CODING_MODE_LAST_BLOCK)
 232     /* If the source ends by partial bytes to construct a character,
 233        treat them as eight-bit raw data.  */
 234     while (src_base < src_end && charbuf < charbuf_end)
 235       *charbuf++ = *src_base++;
 236   /* Remember how many bytes and characters we consumed.  If the
 237      source is multibyte, the bytes and chars are not identical.  */
 238   coding->consumed = coding->consumed_char = src_base - coding->source;
 239   /* Remember how many characters we produced.  */
 240   coding->charbuf_used = charbuf - coding->charbuf;
 241 }
 242 #endif
 243
 244 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 245
 246   These functions encode SRC_BYTES length text at SOURCE of Emacs'
 247   internal multibyte format by CODING.  The resulting byte sequence
 248   goes to a place pointed to by DESTINATION, the length of which
 249   should not exceed DST_BYTES.
 250
 251   These functions set the information of original and encoded texts in
 252   the members produced, produced_char, consumed, and consumed_char of
 253   the structure *CODING.  They also set the member result to one of
 254   CODING_RESULT_XXX indicating how the encoding finished.
 255
 256   DST_BYTES zero means that source area and destination area are
 257   overlapped, which means that we can produce a encoded text until it
 258   reaches at the head of not-yet-encoded source text.
 259
 260   Below is a template of these functions.  */
 261 #if 0
 262 static void
 263 encode_coding_XXX (coding)
 264      struct coding_system *coding;
 265 {
 266   int multibytep = coding->dst_multibyte;
 267   int *charbuf = coding->charbuf;
 268   int *charbuf_end = charbuf->charbuf + coding->charbuf_used;
 269   unsigned char *dst = coding->destination + coding->produced;
 270   unsigned char *dst_end = coding->destination + coding->dst_bytes;
 271   unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
 272   int produced_chars = 0;
 273
 274   for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
 275     {
 276       int c = *charbuf;
 277       /* Encode C into DST, and increment DST.  */
 278     }
 279  label_no_more_destination:
 280   /* How many chars and bytes we produced.  */
 281   coding->produced_char += produced_chars;
 282   coding->produced = dst - coding->destination;
 283 }
 284 #endif
 285
 286 \f
 287 /*** 1. Preamble ***/
 288
 289 #include <config.h>
 290 #include <stdio.h>
 291
 292 #include "lisp.h"
 293 #include "buffer.h"
 294 #include "character.h"
 295 #include "charset.h"
 296 #include "ccl.h"
 297 #include "composite.h"
 298 #include "coding.h"
 299 #include "window.h"
 300
 301 Lisp_Object Vcoding_system_hash_table;
 302
 303 Lisp_Object Qcoding_system, Qcoding_aliases, Qeol_type;
 304 Lisp_Object Qunix, Qdos;
 305 extern Lisp_Object Qmac;        /* frame.c */
 306 Lisp_Object Qbuffer_file_coding_system;
 307 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
 308 Lisp_Object Qdefault_char;
 309 Lisp_Object Qno_conversion, Qundecided;
 310 Lisp_Object Qcharset, Qiso_2022, Qutf_8, Qutf_16, Qshift_jis, Qbig5;
 311 Lisp_Object Qbig, Qlittle;
 312 Lisp_Object Qcoding_system_history;
 313 Lisp_Object Qvalid_codes;
 314 Lisp_Object QCcategory, QCmnemonic, QCdefalut_char;
 315 Lisp_Object QCdecode_translation_table, QCencode_translation_table;
 316 Lisp_Object QCpost_read_conversion, QCpre_write_conversion;
 317
 318 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
 319 Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
 320 Lisp_Object Qstart_process, Qopen_network_stream;
 321 Lisp_Object Qtarget_idx;
 322
 323 Lisp_Object Qinsufficient_source, Qinconsistent_eol, Qinvalid_source;
 324 Lisp_Object Qinterrupted, Qinsufficient_memory;
 325
 326 int coding_system_require_warning;
 327
 328 Lisp_Object Vselect_safe_coding_system_function;
 329
 330 /* Mnemonic string for each format of end-of-line.  */
 331 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
 332 /* Mnemonic string to indicate format of end-of-line is not yet
 333    decided.  */
 334 Lisp_Object eol_mnemonic_undecided;
 335
 336 #ifdef emacs
 337
 338 Lisp_Object Vcoding_system_list, Vcoding_system_alist;
 339
 340 Lisp_Object Qcoding_system_p, Qcoding_system_error;
 341
 342 /* Coding system emacs-mule and raw-text are for converting only
 343    end-of-line format.  */
 344 Lisp_Object Qemacs_mule, Qraw_text;
 345 Lisp_Object Qutf_8_emacs;
 346
 347 /* Coding-systems are handed between Emacs Lisp programs and C internal
 348    routines by the following three variables.  */
 349 /* Coding-system for reading files and receiving data from process.  */
 350 Lisp_Object Vcoding_system_for_read;
 351 /* Coding-system for writing files and sending data to process.  */
 352 Lisp_Object Vcoding_system_for_write;
 353 /* Coding-system actually used in the latest I/O.  */
 354 Lisp_Object Vlast_coding_system_used;
 355 /* Set to non-nil when an error is detected while code conversion.  */
 356 Lisp_Object Vlast_code_conversion_error;
 357 /* A vector of length 256 which contains information about special
 358    Latin codes (especially for dealing with Microsoft codes).  */
 359 Lisp_Object Vlatin_extra_code_table;
 360
 361 /* Flag to inhibit code conversion of end-of-line format.  */
 362 int inhibit_eol_conversion;
 363
 364 /* Flag to inhibit ISO2022 escape sequence detection.  */
 365 int inhibit_iso_escape_detection;
 366
 367 /* Flag to make buffer-file-coding-system inherit from process-coding.  */
 368 int inherit_process_coding_system;
 369
 370 /* Coding system to be used to encode text for terminal display.  */
 371 struct coding_system terminal_coding;
 372
 373 /* Coding system to be used to encode text for terminal display when
 374    terminal coding system is nil.  */
 375 struct coding_system safe_terminal_coding;
 376
 377 /* Coding system of what is sent from terminal keyboard.  */
 378 struct coding_system keyboard_coding;
 379
 380 Lisp_Object Vfile_coding_system_alist;
 381 Lisp_Object Vprocess_coding_system_alist;
 382 Lisp_Object Vnetwork_coding_system_alist;
 383
 384 Lisp_Object Vlocale_coding_system;
 385
 386 #endif /* emacs */
 387
 388 /* Flag to tell if we look up translation table on character code
 389    conversion.  */
 390 Lisp_Object Venable_character_translation;
 391 /* Standard translation table to look up on decoding (reading).  */
 392 Lisp_Object Vstandard_translation_table_for_decode;
 393 /* Standard translation table to look up on encoding (writing).  */
 394 Lisp_Object Vstandard_translation_table_for_encode;
 395
 396 Lisp_Object Qtranslation_table;
 397 Lisp_Object Qtranslation_table_id;
 398 Lisp_Object Qtranslation_table_for_decode;
 399 Lisp_Object Qtranslation_table_for_encode;
 400
 401 /* Alist of charsets vs revision number.  */
 402 static Lisp_Object Vcharset_revision_table;
 403
 404 /* Default coding systems used for process I/O.  */
 405 Lisp_Object Vdefault_process_coding_system;
 406
 407 /* Char table for translating Quail and self-inserting input.  */
 408 Lisp_Object Vtranslation_table_for_input;
 409
 410 /* Two special coding systems.  */
 411 Lisp_Object Vsjis_coding_system;
 412 Lisp_Object Vbig5_coding_system;
 413
 414 static void record_conversion_result (struct coding_system *coding,
 415                                       enum coding_result_code result);
 416 static int detect_coding_utf_8 P_ ((struct coding_system *,
 417                                     struct coding_detection_info *info));
 418 static void decode_coding_utf_8 P_ ((struct coding_system *));
 419 static int encode_coding_utf_8 P_ ((struct coding_system *));
 420
 421 static int detect_coding_utf_16 P_ ((struct coding_system *,
 422                                      struct coding_detection_info *info));
 423 static void decode_coding_utf_16 P_ ((struct coding_system *));
 424 static int encode_coding_utf_16 P_ ((struct coding_system *));
 425
 426 static int detect_coding_iso_2022 P_ ((struct coding_system *,
 427                                        struct coding_detection_info *info));
 428 static void decode_coding_iso_2022 P_ ((struct coding_system *));
 429 static int encode_coding_iso_2022 P_ ((struct coding_system *));
 430
 431 static int detect_coding_emacs_mule P_ ((struct coding_system *,
 432                                          struct coding_detection_info *info));
 433 static void decode_coding_emacs_mule P_ ((struct coding_system *));
 434 static int encode_coding_emacs_mule P_ ((struct coding_system *));
 435
 436 static int detect_coding_sjis P_ ((struct coding_system *,
 437                                    struct coding_detection_info *info));
 438 static void decode_coding_sjis P_ ((struct coding_system *));
 439 static int encode_coding_sjis P_ ((struct coding_system *));
 440
 441 static int detect_coding_big5 P_ ((struct coding_system *,
 442                                    struct coding_detection_info *info));
 443 static void decode_coding_big5 P_ ((struct coding_system *));
 444 static int encode_coding_big5 P_ ((struct coding_system *));
 445
 446 static int detect_coding_ccl P_ ((struct coding_system *,
 447                                   struct coding_detection_info *info));
 448 static void decode_coding_ccl P_ ((struct coding_system *));
 449 static int encode_coding_ccl P_ ((struct coding_system *));
 450
 451 static void decode_coding_raw_text P_ ((struct coding_system *));
 452 static int encode_coding_raw_text P_ ((struct coding_system *));
 453
 454
 455 /* ISO2022 section */
 456
 457 #define CODING_ISO_INITIAL(coding, reg)                 \
 458   (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id),    \
 459                      coding_attr_iso_initial),          \
 460                reg)))
 461
 462
 463 #define CODING_ISO_REQUEST(coding, charset_id)  \
 464   ((charset_id <= (coding)->max_charset_id      \
 465     ? (coding)->safe_charsets[charset_id]       \
 466     : -1))
 467
 468
 469 #define CODING_ISO_FLAGS(coding)        \
 470   ((coding)->spec.iso_2022.flags)
 471 #define CODING_ISO_DESIGNATION(coding, reg)     \
 472   ((coding)->spec.iso_2022.current_designation[reg])
 473 #define CODING_ISO_INVOCATION(coding, plane)    \
 474   ((coding)->spec.iso_2022.current_invocation[plane])
 475 #define CODING_ISO_SINGLE_SHIFTING(coding)      \
 476   ((coding)->spec.iso_2022.single_shifting)
 477 #define CODING_ISO_BOL(coding)  \
 478   ((coding)->spec.iso_2022.bol)
 479 #define CODING_ISO_INVOKED_CHARSET(coding, plane)       \
 480   CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
 481
 482 /* Control characters of ISO2022.  */
 483                         /* code */      /* function */
 484 #define ISO_CODE_LF     0x0A            /* line-feed */
 485 #define ISO_CODE_CR     0x0D            /* carriage-return */
 486 #define ISO_CODE_SO     0x0E            /* shift-out */
 487 #define ISO_CODE_SI     0x0F            /* shift-in */
 488 #define ISO_CODE_SS2_7  0x19            /* single-shift-2 for 7-bit code */
 489 #define ISO_CODE_ESC    0x1B            /* escape */
 490 #define ISO_CODE_SS2    0x8E            /* single-shift-2 */
 491 #define ISO_CODE_SS3    0x8F            /* single-shift-3 */
 492 #define ISO_CODE_CSI    0x9B            /* control-sequence-introducer */
 493
 494 /* All code (1-byte) of ISO2022 is classified into one of the
 495    followings.  */
 496 enum iso_code_class_type
 497   {
 498     ISO_control_0,              /* Control codes in the range
 499                                    0x00..0x1F and 0x7F, except for the
 500                                    following 5 codes.  */
 501     ISO_shift_out,              /* ISO_CODE_SO (0x0E) */
 502     ISO_shift_in,               /* ISO_CODE_SI (0x0F) */
 503     ISO_single_shift_2_7,       /* ISO_CODE_SS2_7 (0x19) */
 504     ISO_escape,                 /* ISO_CODE_SO (0x1B) */
 505     ISO_control_1,              /* Control codes in the range
 506                                    0x80..0x9F, except for the
 507                                    following 3 codes.  */
 508     ISO_single_shift_2,         /* ISO_CODE_SS2 (0x8E) */
 509     ISO_single_shift_3,         /* ISO_CODE_SS3 (0x8F) */
 510     ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
 511     ISO_0x20_or_0x7F,           /* Codes of the values 0x20 or 0x7F.  */
 512     ISO_graphic_plane_0,        /* Graphic codes in the range 0x21..0x7E.  */
 513     ISO_0xA0_or_0xFF,           /* Codes of the values 0xA0 or 0xFF.  */
 514     ISO_graphic_plane_1         /* Graphic codes in the range 0xA1..0xFE.  */
 515   };
 516
 517 /** The macros CODING_ISO_FLAG_XXX defines a flag bit of the
 518     `iso-flags' attribute of an iso2022 coding system.  */
 519
 520 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
 521    instead of the correct short-form sequence (e.g. ESC $ A).  */
 522 #define CODING_ISO_FLAG_LONG_FORM       0x0001
 523
 524 /* If set, reset graphic planes and registers at end-of-line to the
 525    initial state.  */
 526 #define CODING_ISO_FLAG_RESET_AT_EOL    0x0002
 527
 528 /* If set, reset graphic planes and registers before any control
 529    characters to the initial state.  */
 530 #define CODING_ISO_FLAG_RESET_AT_CNTL   0x0004
 531
 532 /* If set, encode by 7-bit environment.  */
 533 #define CODING_ISO_FLAG_SEVEN_BITS      0x0008
 534
 535 /* If set, use locking-shift function.  */
 536 #define CODING_ISO_FLAG_LOCKING_SHIFT   0x0010
 537
 538 /* If set, use single-shift function.  Overwrite
 539    CODING_ISO_FLAG_LOCKING_SHIFT.  */
 540 #define CODING_ISO_FLAG_SINGLE_SHIFT    0x0020
 541
 542 /* If set, use designation escape sequence.  */
 543 #define CODING_ISO_FLAG_DESIGNATION     0x0040
 544
 545 /* If set, produce revision number sequence.  */
 546 #define CODING_ISO_FLAG_REVISION        0x0080
 547
 548 /* If set, produce ISO6429's direction specifying sequence.  */
 549 #define CODING_ISO_FLAG_DIRECTION       0x0100
 550
 551 /* If set, assume designation states are reset at beginning of line on
 552    output.  */
 553 #define CODING_ISO_FLAG_INIT_AT_BOL     0x0200
 554
 555 /* If set, designation sequence should be placed at beginning of line
 556    on output.  */
 557 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
 558
 559 /* If set, do not encode unsafe charactes on output.  */
 560 #define CODING_ISO_FLAG_SAFE            0x0800
 561
 562 /* If set, extra latin codes (128..159) are accepted as a valid code
 563    on input.  */
 564 #define CODING_ISO_FLAG_LATIN_EXTRA     0x1000
 565
 566 #define CODING_ISO_FLAG_COMPOSITION     0x2000
 567
 568 #define CODING_ISO_FLAG_EUC_TW_SHIFT    0x4000
 569
 570 #define CODING_ISO_FLAG_USE_ROMAN       0x8000
 571
 572 #define CODING_ISO_FLAG_USE_OLDJIS      0x10000
 573
 574 #define CODING_ISO_FLAG_FULL_SUPPORT    0x100000
 575
 576 /* A character to be produced on output if encoding of the original
 577    character is prohibited by CODING_ISO_FLAG_SAFE.  */
 578 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION  '?'
 579
 580
 581 /* UTF-16 section */
 582 #define CODING_UTF_16_BOM(coding)       \
 583   ((coding)->spec.utf_16.bom)
 584
 585 #define CODING_UTF_16_ENDIAN(coding)    \
 586   ((coding)->spec.utf_16.endian)
 587
 588 #define CODING_UTF_16_SURROGATE(coding) \
 589   ((coding)->spec.utf_16.surrogate)
 590
 591
 592 /* CCL section */
 593 #define CODING_CCL_DECODER(coding)      \
 594   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
 595 #define CODING_CCL_ENCODER(coding)      \
 596   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
 597 #define CODING_CCL_VALIDS(coding)                                          \
 598   (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
 599
 600 /* Index for each coding category in `coding_categories' */
 601
 602 enum coding_category
 603   {
 604     coding_category_iso_7,
 605     coding_category_iso_7_tight,
 606     coding_category_iso_8_1,
 607     coding_category_iso_8_2,
 608     coding_category_iso_7_else,
 609     coding_category_iso_8_else,
 610     coding_category_utf_8,
 611     coding_category_utf_16_auto,
 612     coding_category_utf_16_be,
 613     coding_category_utf_16_le,
 614     coding_category_utf_16_be_nosig,
 615     coding_category_utf_16_le_nosig,
 616     coding_category_charset,
 617     coding_category_sjis,
 618     coding_category_big5,
 619     coding_category_ccl,
 620     coding_category_emacs_mule,
 621     /* All above are targets of code detection.  */
 622     coding_category_raw_text,
 623     coding_category_undecided,
 624     coding_category_max
 625   };
 626
 627 /* Definitions of flag bits used in detect_coding_XXXX.  */
 628 #define CATEGORY_MASK_ISO_7             (1 << coding_category_iso_7)
 629 #define CATEGORY_MASK_ISO_7_TIGHT       (1 << coding_category_iso_7_tight)
 630 #define CATEGORY_MASK_ISO_8_1           (1 << coding_category_iso_8_1)
 631 #define CATEGORY_MASK_ISO_8_2           (1 << coding_category_iso_8_2)
 632 #define CATEGORY_MASK_ISO_7_ELSE        (1 << coding_category_iso_7_else)
 633 #define CATEGORY_MASK_ISO_8_ELSE        (1 << coding_category_iso_8_else)
 634 #define CATEGORY_MASK_UTF_8             (1 << coding_category_utf_8)
 635 #define CATEGORY_MASK_UTF_16_AUTO       (1 << coding_category_utf_16_auto)
 636 #define CATEGORY_MASK_UTF_16_BE         (1 << coding_category_utf_16_be)
 637 #define CATEGORY_MASK_UTF_16_LE         (1 << coding_category_utf_16_le)
 638 #define CATEGORY_MASK_UTF_16_BE_NOSIG   (1 << coding_category_utf_16_be_nosig)
 639 #define CATEGORY_MASK_UTF_16_LE_NOSIG   (1 << coding_category_utf_16_le_nosig)
 640 #define CATEGORY_MASK_CHARSET           (1 << coding_category_charset)
 641 #define CATEGORY_MASK_SJIS              (1 << coding_category_sjis)
 642 #define CATEGORY_MASK_BIG5              (1 << coding_category_big5)
 643 #define CATEGORY_MASK_CCL               (1 << coding_category_ccl)
 644 #define CATEGORY_MASK_EMACS_MULE        (1 << coding_category_emacs_mule)
 645 #define CATEGORY_MASK_RAW_TEXT          (1 << coding_category_raw_text)
 646
 647 /* This value is returned if detect_coding_mask () find nothing other
 648    than ASCII characters.  */
 649 #define CATEGORY_MASK_ANY               \
 650   (CATEGORY_MASK_ISO_7                  \
 651    | CATEGORY_MASK_ISO_7_TIGHT          \
 652    | CATEGORY_MASK_ISO_8_1              \
 653    | CATEGORY_MASK_ISO_8_2              \
 654    | CATEGORY_MASK_ISO_7_ELSE           \
 655    | CATEGORY_MASK_ISO_8_ELSE           \
 656    | CATEGORY_MASK_UTF_8                \
 657    | CATEGORY_MASK_UTF_16_BE            \
 658    | CATEGORY_MASK_UTF_16_LE            \
 659    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 660    | CATEGORY_MASK_UTF_16_LE_NOSIG      \
 661    | CATEGORY_MASK_CHARSET              \
 662    | CATEGORY_MASK_SJIS                 \
 663    | CATEGORY_MASK_BIG5                 \
 664    | CATEGORY_MASK_CCL                  \
 665    | CATEGORY_MASK_EMACS_MULE)
 666
 667
 668 #define CATEGORY_MASK_ISO_7BIT \
 669   (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
 670
 671 #define CATEGORY_MASK_ISO_8BIT \
 672   (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
 673
 674 #define CATEGORY_MASK_ISO_ELSE \
 675   (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
 676
 677 #define CATEGORY_MASK_ISO_ESCAPE        \
 678   (CATEGORY_MASK_ISO_7                  \
 679    | CATEGORY_MASK_ISO_7_TIGHT          \
 680    | CATEGORY_MASK_ISO_7_ELSE           \
 681    | CATEGORY_MASK_ISO_8_ELSE)
 682
 683 #define CATEGORY_MASK_ISO       \
 684   (  CATEGORY_MASK_ISO_7BIT     \
 685      | CATEGORY_MASK_ISO_8BIT   \
 686      | CATEGORY_MASK_ISO_ELSE)
 687
 688 #define CATEGORY_MASK_UTF_16            \
 689   (CATEGORY_MASK_UTF_16_BE              \
 690    | CATEGORY_MASK_UTF_16_LE            \
 691    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 692    | CATEGORY_MASK_UTF_16_LE_NOSIG)
 693
 694
 695 /* List of symbols `coding-category-xxx' ordered by priority.  This
 696    variable is exposed to Emacs Lisp.  */
 697 static Lisp_Object Vcoding_category_list;
 698
 699 /* Table of coding categories (Lisp symbols).  This variable is for
 700    internal use oly.  */
 701 static Lisp_Object Vcoding_category_table;
 702
 703 /* Table of coding-categories ordered by priority.  */
 704 static enum coding_category coding_priorities[coding_category_max];
 705
 706 /* Nth element is a coding context for the coding system bound to the
 707    Nth coding category.  */
 708 static struct coding_system coding_categories[coding_category_max];
 709
 710 /*** Commonly used macros and functions ***/
 711
 712 #ifndef min
 713 #define min(a, b) ((a) < (b) ? (a) : (b))
 714 #endif
 715 #ifndef max
 716 #define max(a, b) ((a) > (b) ? (a) : (b))
 717 #endif
 718
 719 #define CODING_GET_INFO(coding, attrs, charset_list)    \
 720   do {                                                  \
 721     (attrs) = CODING_ID_ATTRS ((coding)->id);           \
 722     (charset_list) = CODING_ATTR_CHARSET_LIST (attrs);  \
 723   } while (0)
 724
 725
 726 /* Safely get one byte from the source text pointed by SRC which ends
 727    at SRC_END, and set C to that byte.  If there are not enough bytes
 728    in the source, it jumps to `no_more_source'.  If multibytep is
 729    nonzero, and a multibyte character is found at SRC, set C to the
 730    negative value of the character code.  The caller should declare
 731    and set these variables appropriately in advance:
 732         src, src_end, multibytep */
 733
 734 #define ONE_MORE_BYTE(c)                                \
 735   do {                                                  \
 736     if (src == src_end)                                 \
 737       {                                                 \
 738         if (src_base < src)                             \
 739           record_conversion_result                      \
 740             (coding, CODING_RESULT_INSUFFICIENT_SRC);   \
 741         goto no_more_source;                            \
 742       }                                                 \
 743     c = *src++;                                         \
 744     if (multibytep && (c & 0x80))                       \
 745       {                                                 \
 746         if ((c & 0xFE) == 0xC0)                         \
 747           c = ((c & 1) << 6) | *src++;                  \
 748         else                                            \
 749           {                                             \
 750             c = - string_char (--src, &src, NULL);      \
 751             record_conversion_result                    \
 752               (coding, CODING_RESULT_INVALID_SRC);      \
 753           }                                             \
 754       }                                                 \
 755     consumed_chars++;                                   \
 756   } while (0)
 757
 758
 759 #define ONE_MORE_BYTE_NO_CHECK(c)                       \
 760   do {                                                  \
 761     c = *src++;                                         \
 762     if (multibytep && (c & 0x80))                       \
 763       {                                                 \
 764         if ((c & 0xFE) == 0xC0)                         \
 765           c = ((c & 1) << 6) | *src++;                  \
 766         else                                            \
 767           {                                             \
 768             c = - string_char (--src, &src, NULL);      \
 769             record_conversion_result                    \
 770               (coding, CODING_RESULT_INVALID_SRC);      \
 771           }                                             \
 772       }                                                 \
 773     consumed_chars++;                                   \
 774   } while (0)
 775
 776
 777 /* Store a byte C in the place pointed by DST and increment DST to the
 778    next free point, and increment PRODUCED_CHARS.  The caller should
 779    assure that C is 0..127, and declare and set the variable `dst'
 780    appropriately in advance.
 781 */
 782
 783
 784 #define EMIT_ONE_ASCII_BYTE(c)  \
 785   do {                          \
 786     produced_chars++;           \
 787     *dst++ = (c);               \
 788   } while (0)
 789
 790
 791 /* Like EMIT_ONE_ASCII_BYTE byt store two bytes; C1 and C2.  */
 792
 793 #define EMIT_TWO_ASCII_BYTES(c1, c2)    \
 794   do {                                  \
 795     produced_chars += 2;                \
 796     *dst++ = (c1), *dst++ = (c2);       \
 797   } while (0)
 798
 799
 800 /* Store a byte C in the place pointed by DST and increment DST to the
 801    next free point, and increment PRODUCED_CHARS.  If MULTIBYTEP is
 802    nonzero, store in an appropriate multibyte from.  The caller should
 803    declare and set the variables `dst' and `multibytep' appropriately
 804    in advance.  */
 805
 806 #define EMIT_ONE_BYTE(c)                \
 807   do {                                  \
 808     produced_chars++;                   \
 809     if (multibytep)                     \
 810       {                                 \
 811         int ch = (c);                   \
 812         if (ch >= 0x80)                 \
 813           ch = BYTE8_TO_CHAR (ch);      \
 814         CHAR_STRING_ADVANCE (ch, dst);  \
 815       }                                 \
 816     else                                \
 817       *dst++ = (c);                     \
 818   } while (0)
 819
 820
 821 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2.  */
 822
 823 #define EMIT_TWO_BYTES(c1, c2)          \
 824   do {                                  \
 825     produced_chars += 2;                \
 826     if (multibytep)                     \
 827       {                                 \
 828         int ch;                         \
 829                                         \
 830         ch = (c1);                      \
 831         if (ch >= 0x80)                 \
 832           ch = BYTE8_TO_CHAR (ch);      \
 833         CHAR_STRING_ADVANCE (ch, dst);  \
 834         ch = (c2);                      \
 835         if (ch >= 0x80)                 \
 836           ch = BYTE8_TO_CHAR (ch);      \
 837         CHAR_STRING_ADVANCE (ch, dst);  \
 838       }                                 \
 839     else                                \
 840       {                                 \
 841         *dst++ = (c1);                  \
 842         *dst++ = (c2);                  \
 843       }                                 \
 844   } while (0)
 845
 846
 847 #define EMIT_THREE_BYTES(c1, c2, c3)    \
 848   do {                                  \
 849     EMIT_ONE_BYTE (c1);                 \
 850     EMIT_TWO_BYTES (c2, c3);            \
 851   } while (0)
 852
 853
 854 #define EMIT_FOUR_BYTES(c1, c2, c3, c4)         \
 855   do {                                          \
 856     EMIT_TWO_BYTES (c1, c2);                    \
 857     EMIT_TWO_BYTES (c3, c4);                    \
 858   } while (0)
 859
 860
 861 static void
 862 record_conversion_result (struct coding_system *coding,
 863                           enum coding_result_code result)
 864 {
 865   coding->result = result;
 866   switch (result)
 867     {
 868     case CODING_RESULT_INSUFFICIENT_SRC:
 869       Vlast_code_conversion_error = Qinsufficient_source;
 870       break;
 871     case CODING_RESULT_INCONSISTENT_EOL:
 872       Vlast_code_conversion_error = Qinconsistent_eol;
 873       break;
 874     case CODING_RESULT_INVALID_SRC:
 875       Vlast_code_conversion_error = Qinvalid_source;
 876       break;
 877     case CODING_RESULT_INTERRUPT:
 878       Vlast_code_conversion_error = Qinterrupted;
 879       break;
 880     case CODING_RESULT_INSUFFICIENT_MEM:
 881       Vlast_code_conversion_error = Qinsufficient_memory;
 882       break;
 883     }
 884 }
 885
 886 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
 887   do {                                                                       \
 888     charset_map_loaded = 0;                                                  \
 889     c = DECODE_CHAR (charset, code);                                         \
 890     if (charset_map_loaded)                                                  \
 891       {                                                                      \
 892         const unsigned char *orig = coding->source;                          \
 893         EMACS_INT offset;                                                    \
 894                                                                              \
 895         coding_set_source (coding);                                          \
 896         offset = coding->source - orig;                                      \
 897         src += offset;                                                       \
 898         src_base += offset;                                                  \
 899         src_end += offset;                                                   \
 900       }                                                                      \
 901   } while (0)
 902
 903
 904 #define ASSURE_DESTINATION(bytes)                               \
 905   do {                                                          \
 906     if (dst + (bytes) >= dst_end)                               \
 907       {                                                         \
 908         int more_bytes = charbuf_end - charbuf + (bytes);       \
 909                                                                 \
 910         dst = alloc_destination (coding, more_bytes, dst);      \
 911         dst_end = coding->destination + coding->dst_bytes;      \
 912       }                                                         \
 913   } while (0)
 914
 915
 916
 917 static void
 918 coding_set_source (coding)
 919      struct coding_system *coding;
 920 {
 921   if (BUFFERP (coding->src_object))
 922     {
 923       struct buffer *buf = XBUFFER (coding->src_object);
 924
 925       if (coding->src_pos < 0)
 926         coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
 927       else
 928         coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
 929     }
 930   else if (STRINGP (coding->src_object))
 931     {
 932       coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
 933     }
 934   else
 935     /* Otherwise, the source is C string and is never relocated
 936        automatically.  Thus we don't have to update anything.  */
 937     ;
 938 }
 939
 940 static void
 941 coding_set_destination (coding)
 942      struct coding_system *coding;
 943 {
 944   if (BUFFERP (coding->dst_object))
 945     {
 946       if (coding->src_pos < 0)
 947         {
 948           coding->destination = BEG_ADDR + coding->dst_pos_byte - 1;
 949           coding->dst_bytes = (GAP_END_ADDR
 950                                - (coding->src_bytes - coding->consumed)
 951                                - coding->destination);
 952         }
 953       else
 954         {
 955           /* We are sure that coding->dst_pos_byte is before the gap
 956              of the buffer. */
 957           coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
 958                                  + coding->dst_pos_byte - 1);
 959           coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
 960                                - coding->destination);
 961         }
 962     }
 963   else
 964     /* Otherwise, the destination is C string and is never relocated
 965        automatically.  Thus we don't have to update anything.  */
 966     ;
 967 }
 968
 969
 970 static void
 971 coding_alloc_by_realloc (coding, bytes)
 972      struct coding_system *coding;
 973      EMACS_INT bytes;
 974 {
 975   coding->destination = (unsigned char *) xrealloc (coding->destination,
 976                                                     coding->dst_bytes + bytes);
 977   coding->dst_bytes += bytes;
 978 }
 979
 980 static void
 981 coding_alloc_by_making_gap (coding, bytes)
 982      struct coding_system *coding;
 983      EMACS_INT bytes;
 984 {
 985   if (BUFFERP (coding->dst_object)
 986       && EQ (coding->src_object, coding->dst_object))
 987     {
 988       EMACS_INT add = coding->src_bytes - coding->consumed;
 989
 990       GAP_SIZE -= add; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
 991       make_gap (bytes);
 992       GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
 993     }
 994   else
 995     {
 996       Lisp_Object this_buffer;
 997
 998       this_buffer = Fcurrent_buffer ();
 999       set_buffer_internal (XBUFFER (coding->dst_object));
1000       make_gap (bytes);
1001       set_buffer_internal (XBUFFER (this_buffer));
1002     }
1003 }
1004
1005
1006 static unsigned char *
1007 alloc_destination (coding, nbytes, dst)
1008      struct coding_system *coding;
1009      EMACS_INT nbytes;
1010      unsigned char *dst;
1011 {
1012   EMACS_INT offset = dst - coding->destination;
1013
1014   if (BUFFERP (coding->dst_object))
1015     coding_alloc_by_making_gap (coding, nbytes);
1016   else
1017     coding_alloc_by_realloc (coding, nbytes);
1018   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1019   coding_set_destination (coding);
1020   dst = coding->destination + offset;
1021   return dst;
1022 }
1023
1024 /** Macros for annotations.  */
1025
1026 /* Maximum length of annotation data (sum of annotations for
1027    composition and charset).  */
1028 #define MAX_ANNOTATION_LENGTH (5 + (MAX_COMPOSITION_COMPONENTS * 2) - 1 + 5)
1029
1030 /* An annotation data is stored in the array coding->charbuf in this
1031    format:
1032      [ -LENGTH ANNOTATION_MASK FROM TO ... ]
1033    LENGTH is the number of elements in the annotation.
1034    ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
1035    FROM and TO specify the range of text annotated.  They are relative
1036    to coding->src_pos (on encoding) or coding->dst_pos (on decoding).
1037
1038    The format of the following elements depend on ANNOTATION_MASK.
1039
1040    In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
1041    follows:
1042      ... METHOD [ COMPOSITION-COMPONENTS ... ]
1043    METHOD is one of enum composition_method.
1044    Optionnal COMPOSITION-COMPONENTS are characters and composition
1045    rules.
1046
1047    In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1048    follows.  */
1049
1050 #define ADD_ANNOTATION_DATA(buf, len, mask, from, to)   \
1051   do {                                                  \
1052     *(buf)++ = -(len);                                  \
1053     *(buf)++ = (mask);                                  \
1054     *(buf)++ = (from);                                  \
1055     *(buf)++ = (to);                                    \
1056     coding->annotated = 1;                              \
1057   } while (0);
1058
1059 #define ADD_COMPOSITION_DATA(buf, from, to, method)                           \
1060   do {                                                                        \
1061     ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, from, to); \
1062     *buf++ = method;                                                          \
1063   } while (0)
1064
1065
1066 #define ADD_CHARSET_DATA(buf, from, to, id)                               \
1067   do {                                                                    \
1068     ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_CHARSET_MASK, from, to); \
1069     *buf++ = id;                                                          \
1070   } while (0)
1071
1072 \f
1073 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1074
1075
1076
1077 \f
1078 /*** 3. UTF-8 ***/
1079
1080 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1081    Check if a text is encoded in UTF-8.  If it is, return 1, else
1082    return 0.  */
1083
1084 #define UTF_8_1_OCTET_P(c)         ((c) < 0x80)
1085 #define UTF_8_EXTRA_OCTET_P(c)     (((c) & 0xC0) == 0x80)
1086 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
1087 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
1088 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
1089 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
1090
1091 static int
1092 detect_coding_utf_8 (coding, detect_info)
1093      struct coding_system *coding;
1094      struct coding_detection_info *detect_info;
1095 {
1096   const unsigned char *src = coding->source, *src_base;
1097   const unsigned char *src_end = coding->source + coding->src_bytes;
1098   int multibytep = coding->src_multibyte;
1099   int consumed_chars = 0;
1100   int found = 0;
1101
1102   detect_info->checked |= CATEGORY_MASK_UTF_8;
1103   /* A coding system of this category is always ASCII compatible.  */
1104   src += coding->head_ascii;
1105
1106   while (1)
1107     {
1108       int c, c1, c2, c3, c4;
1109
1110       src_base = src;
1111       ONE_MORE_BYTE (c);
1112       if (c < 0 || UTF_8_1_OCTET_P (c))
1113         continue;
1114       ONE_MORE_BYTE (c1);
1115       if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
1116         break;
1117       if (UTF_8_2_OCTET_LEADING_P (c))
1118         {
1119           found = CATEGORY_MASK_UTF_8;
1120           continue;
1121         }
1122       ONE_MORE_BYTE (c2);
1123       if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1124         break;
1125       if (UTF_8_3_OCTET_LEADING_P (c))
1126         {
1127           found = CATEGORY_MASK_UTF_8;
1128           continue;
1129         }
1130       ONE_MORE_BYTE (c3);
1131       if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1132         break;
1133       if (UTF_8_4_OCTET_LEADING_P (c))
1134         {
1135           found = CATEGORY_MASK_UTF_8;
1136           continue;
1137         }
1138       ONE_MORE_BYTE (c4);
1139       if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1140         break;
1141       if (UTF_8_5_OCTET_LEADING_P (c))
1142         {
1143           found = CATEGORY_MASK_UTF_8;
1144           continue;
1145         }
1146       break;
1147     }
1148   detect_info->rejected |= CATEGORY_MASK_UTF_8;
1149   return 0;
1150
1151  no_more_source:
1152   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1153     {
1154       detect_info->rejected |= CATEGORY_MASK_UTF_8;
1155       return 0;
1156     }
1157   detect_info->found |= found;
1158   return 1;
1159 }
1160
1161
1162 static void
1163 decode_coding_utf_8 (coding)
1164      struct coding_system *coding;
1165 {
1166   const unsigned char *src = coding->source + coding->consumed;
1167   const unsigned char *src_end = coding->source + coding->src_bytes;
1168   const unsigned char *src_base;
1169   int *charbuf = coding->charbuf;
1170   int *charbuf_end = charbuf + coding->charbuf_size;
1171   int consumed_chars = 0, consumed_chars_base;
1172   int multibytep = coding->src_multibyte;
1173   Lisp_Object attr, charset_list;
1174
1175   CODING_GET_INFO (coding, attr, charset_list);
1176
1177   while (1)
1178     {
1179       int c, c1, c2, c3, c4, c5;
1180
1181       src_base = src;
1182       consumed_chars_base = consumed_chars;
1183
1184       if (charbuf >= charbuf_end)
1185         break;
1186
1187       ONE_MORE_BYTE (c1);
1188       if (c1 < 0)
1189         {
1190           c = - c1;
1191         }
1192       else if (UTF_8_1_OCTET_P(c1))
1193         {
1194           c = c1;
1195         }
1196       else
1197         {
1198           ONE_MORE_BYTE (c2);
1199           if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1200             goto invalid_code;
1201           if (UTF_8_2_OCTET_LEADING_P (c1))
1202             {
1203               c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1204               /* Reject overlong sequences here and below.  Encoders
1205                  producing them are incorrect, they can be misleading,
1206                  and they mess up read/write invariance.  */
1207               if (c < 128)
1208                 goto invalid_code;
1209             }
1210           else
1211             {
1212               ONE_MORE_BYTE (c3);
1213               if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1214                 goto invalid_code;
1215               if (UTF_8_3_OCTET_LEADING_P (c1))
1216                 {
1217                   c = (((c1 & 0xF) << 12)
1218                        | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
1219                   if (c < 0x800
1220                       || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
1221                     goto invalid_code;
1222                 }
1223               else
1224                 {
1225                   ONE_MORE_BYTE (c4);
1226                   if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1227                     goto invalid_code;
1228                   if (UTF_8_4_OCTET_LEADING_P (c1))
1229                     {
1230                     c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1231                          | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
1232                     if (c < 0x10000)
1233                       goto invalid_code;
1234                     }
1235                   else
1236                     {
1237                       ONE_MORE_BYTE (c5);
1238                       if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
1239                         goto invalid_code;
1240                       if (UTF_8_5_OCTET_LEADING_P (c1))
1241                         {
1242                           c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1243                                | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1244                                | (c5 & 0x3F));
1245                           if ((c > MAX_CHAR) || (c < 0x200000))
1246                             goto invalid_code;
1247                         }
1248                       else
1249                         goto invalid_code;
1250                     }
1251                 }
1252             }
1253         }
1254
1255       *charbuf++ = c;
1256       continue;
1257
1258     invalid_code:
1259       src = src_base;
1260       consumed_chars = consumed_chars_base;
1261       ONE_MORE_BYTE (c);
1262       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1263       coding->errors++;
1264     }
1265
1266  no_more_source:
1267   coding->consumed_char += consumed_chars_base;
1268   coding->consumed = src_base - coding->source;
1269   coding->charbuf_used = charbuf - coding->charbuf;
1270 }
1271
1272
1273 static int
1274 encode_coding_utf_8 (coding)
1275      struct coding_system *coding;
1276 {
1277   int multibytep = coding->dst_multibyte;
1278   int *charbuf = coding->charbuf;
1279   int *charbuf_end = charbuf + coding->charbuf_used;
1280   unsigned char *dst = coding->destination + coding->produced;
1281   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1282   int produced_chars = 0;
1283   int c;
1284
1285   if (multibytep)
1286     {
1287       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1288
1289       while (charbuf < charbuf_end)
1290         {
1291           unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1292
1293           ASSURE_DESTINATION (safe_room);
1294           c = *charbuf++;
1295           if (CHAR_BYTE8_P (c))
1296             {
1297               c = CHAR_TO_BYTE8 (c);
1298               EMIT_ONE_BYTE (c);
1299             }
1300           else
1301             {
1302               CHAR_STRING_ADVANCE (c, pend);
1303               for (p = str; p < pend; p++)
1304                 EMIT_ONE_BYTE (*p);
1305             }
1306         }
1307     }
1308   else
1309     {
1310       int safe_room = MAX_MULTIBYTE_LENGTH;
1311
1312       while (charbuf < charbuf_end)
1313         {
1314           ASSURE_DESTINATION (safe_room);
1315           c = *charbuf++;
1316           dst += CHAR_STRING (c, dst);
1317           produced_chars++;
1318         }
1319     }
1320   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1321   coding->produced_char += produced_chars;
1322   coding->produced = dst - coding->destination;
1323   return 0;
1324 }
1325
1326
1327 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1328    Check if a text is encoded in one of UTF-16 based coding systems.
1329    If it is, return 1, else return 0.  */
1330
1331 #define UTF_16_HIGH_SURROGATE_P(val) \
1332   (((val) & 0xFC00) == 0xD800)
1333
1334 #define UTF_16_LOW_SURROGATE_P(val) \
1335   (((val) & 0xFC00) == 0xDC00)
1336
1337 #define UTF_16_INVALID_P(val)   \
1338   (((val) == 0xFFFE)            \
1339    || ((val) == 0xFFFF)         \
1340    || UTF_16_LOW_SURROGATE_P (val))
1341
1342
1343 static int
1344 detect_coding_utf_16 (coding, detect_info)
1345      struct coding_system *coding;
1346      struct coding_detection_info *detect_info;
1347 {
1348   const unsigned char *src = coding->source, *src_base = src;
1349   const unsigned char *src_end = coding->source + coding->src_bytes;
1350   int multibytep = coding->src_multibyte;
1351   int consumed_chars = 0;
1352   int c1, c2;
1353
1354   detect_info->checked |= CATEGORY_MASK_UTF_16;
1355   if (coding->mode & CODING_MODE_LAST_BLOCK
1356       && (coding->src_chars & 1))
1357     {
1358       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1359       return 0;
1360     }
1361
1362   ONE_MORE_BYTE (c1);
1363   ONE_MORE_BYTE (c2);
1364   if ((c1 == 0xFF) && (c2 == 0xFE))
1365     {
1366       detect_info->found |= (CATEGORY_MASK_UTF_16_LE
1367                              | CATEGORY_MASK_UTF_16_AUTO);
1368       detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
1369                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1370                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1371     }
1372   else if ((c1 == 0xFE) && (c2 == 0xFF))
1373     {
1374       detect_info->found |= (CATEGORY_MASK_UTF_16_BE
1375                              | CATEGORY_MASK_UTF_16_AUTO);
1376       detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
1377                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1378                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1379     }
1380   else if (c1 >= 0 && c2 >= 0)
1381     {
1382       unsigned char b1[256], b2[256];
1383       int b1_variants = 1, b2_variants = 1;
1384       int n;
1385
1386       bzero (b1, 256), bzero (b2, 256);
1387       b1[c1]++, b2[c2]++;
1388       for (n = 0; n < 256 && src < src_end; n++)
1389         {
1390           src_base = src;
1391           ONE_MORE_BYTE (c1);
1392           ONE_MORE_BYTE (c2);
1393           if (c1 < 0 || c2 < 0)
1394             break;
1395           if (! b1[c1++]) b1_variants++;
1396           if (! b2[c2++]) b2_variants++;
1397         }
1398       if (b1_variants < b2_variants)
1399         detect_info->found |= CATEGORY_MASK_UTF_16_BE_NOSIG;
1400       else
1401         detect_info->found |= CATEGORY_MASK_UTF_16_LE_NOSIG;
1402       detect_info->rejected
1403         |= (CATEGORY_MASK_UTF_16_BE | CATEGORY_MASK_UTF_16_LE);
1404     }
1405  no_more_source:
1406   return 1;
1407 }
1408
1409 static void
1410 decode_coding_utf_16 (coding)
1411      struct coding_system *coding;
1412 {
1413   const unsigned char *src = coding->source + coding->consumed;
1414   const unsigned char *src_end = coding->source + coding->src_bytes;
1415   const unsigned char *src_base;
1416   int *charbuf = coding->charbuf;
1417   int *charbuf_end = charbuf + coding->charbuf_size;
1418   int consumed_chars = 0, consumed_chars_base;
1419   int multibytep = coding->src_multibyte;
1420   enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding);
1421   enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1422   int surrogate = CODING_UTF_16_SURROGATE (coding);
1423   Lisp_Object attr, charset_list;
1424
1425   CODING_GET_INFO (coding, attr, charset_list);
1426
1427   if (bom == utf_16_with_bom)
1428     {
1429       int c, c1, c2;
1430
1431       src_base = src;
1432       ONE_MORE_BYTE (c1);
1433       ONE_MORE_BYTE (c2);
1434       c = (c1 << 8) | c2;
1435
1436       if (endian == utf_16_big_endian
1437           ? c != 0xFEFF : c != 0xFFFE)
1438         {
1439           /* The first two bytes are not BOM.  Treat them as bytes
1440              for a normal character.  */
1441           src = src_base;
1442           coding->errors++;
1443         }
1444       CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1445     }
1446   else if (bom == utf_16_detect_bom)
1447     {
1448       /* We have already tried to detect BOM and failed in
1449          detect_coding.  */
1450       CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1451     }
1452
1453   while (1)
1454     {
1455       int c, c1, c2;
1456
1457       src_base = src;
1458       consumed_chars_base = consumed_chars;
1459
1460       if (charbuf + 2 >= charbuf_end)
1461         break;
1462
1463       ONE_MORE_BYTE (c1);
1464       if (c1 < 0)
1465         {
1466           *charbuf++ = -c1;
1467           continue;
1468         }
1469       ONE_MORE_BYTE (c2);
1470       if (c2 < 0)
1471         {
1472           *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
1473           *charbuf++ = -c2;
1474           continue;
1475         }
1476       c = (endian == utf_16_big_endian
1477            ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1478       if (surrogate)
1479         {
1480           if (! UTF_16_LOW_SURROGATE_P (c))
1481             {
1482               if (endian == utf_16_big_endian)
1483                 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1484               else
1485                 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1486               *charbuf++ = c1;
1487               *charbuf++ = c2;
1488               coding->errors++;
1489               if (UTF_16_HIGH_SURROGATE_P (c))
1490                 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1491               else
1492                 *charbuf++ = c;
1493             }
1494           else
1495             {
1496               c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1497               CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1498               *charbuf++ = 0x10000 + c;
1499             }
1500         }
1501       else
1502         {
1503           if (UTF_16_HIGH_SURROGATE_P (c))
1504             CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1505           else
1506             *charbuf++ = c;
1507         }
1508     }
1509
1510  no_more_source:
1511   coding->consumed_char += consumed_chars_base;
1512   coding->consumed = src_base - coding->source;
1513   coding->charbuf_used = charbuf - coding->charbuf;
1514 }
1515
1516 static int
1517 encode_coding_utf_16 (coding)
1518      struct coding_system *coding;
1519 {
1520   int multibytep = coding->dst_multibyte;
1521   int *charbuf = coding->charbuf;
1522   int *charbuf_end = charbuf + coding->charbuf_used;
1523   unsigned char *dst = coding->destination + coding->produced;
1524   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1525   int safe_room = 8;
1526   enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding);
1527   int big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1528   int produced_chars = 0;
1529   Lisp_Object attrs, charset_list;
1530   int c;
1531
1532   CODING_GET_INFO (coding, attrs, charset_list);
1533
1534   if (bom != utf_16_without_bom)
1535     {
1536       ASSURE_DESTINATION (safe_room);
1537       if (big_endian)
1538         EMIT_TWO_BYTES (0xFE, 0xFF);
1539       else
1540         EMIT_TWO_BYTES (0xFF, 0xFE);
1541       CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1542     }
1543
1544   while (charbuf < charbuf_end)
1545     {
1546       ASSURE_DESTINATION (safe_room);
1547       c = *charbuf++;
1548       if (c >= MAX_UNICODE_CHAR)
1549         c = coding->default_char;
1550
1551       if (c < 0x10000)
1552         {
1553           if (big_endian)
1554             EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1555           else
1556             EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1557         }
1558       else
1559         {
1560           int c1, c2;
1561
1562           c -= 0x10000;
1563           c1 = (c >> 10) + 0xD800;
1564           c2 = (c & 0x3FF) + 0xDC00;
1565           if (big_endian)
1566             EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1567           else
1568             EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1569         }
1570     }
1571   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1572   coding->produced = dst - coding->destination;
1573   coding->produced_char += produced_chars;
1574   return 0;
1575 }
1576
1577 \f
1578 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1579
1580 /* Emacs' internal format for representation of multiple character
1581    sets is a kind of multi-byte encoding, i.e. characters are
1582    represented by variable-length sequences of one-byte codes.
1583
1584    ASCII characters and control characters (e.g. `tab', `newline') are
1585    represented by one-byte sequences which are their ASCII codes, in
1586    the range 0x00 through 0x7F.
1587
1588    8-bit characters of the range 0x80..0x9F are represented by
1589    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1590    code + 0x20).
1591
1592    8-bit characters of the range 0xA0..0xFF are represented by
1593    one-byte sequences which are their 8-bit code.
1594
1595    The other characters are represented by a sequence of `base
1596    leading-code', optional `extended leading-code', and one or two
1597    `position-code's.  The length of the sequence is determined by the
1598    base leading-code.  Leading-code takes the range 0x81 through 0x9D,
1599    whereas extended leading-code and position-code take the range 0xA0
1600    through 0xFF.  See `charset.h' for more details about leading-code
1601    and position-code.
1602
1603    --- CODE RANGE of Emacs' internal format ---
1604    character set        range
1605    -------------        -----
1606    ascii                0x00..0x7F
1607    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1608    eight-bit-graphic    0xA0..0xBF
1609    ELSE                 0x81..0x9D + [0xA0..0xFF]+
1610    ---------------------------------------------
1611
1612    As this is the internal character representation, the format is
1613    usually not used externally (i.e. in a file or in a data sent to a
1614    process).  But, it is possible to have a text externally in this
1615    format (i.e. by encoding by the coding system `emacs-mule').
1616
1617    In that case, a sequence of one-byte codes has a slightly different
1618    form.
1619
1620    At first, all characters in eight-bit-control are represented by
1621    one-byte sequences which are their 8-bit code.
1622
1623    Next, character composition data are represented by the byte
1624    sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1625    where,
1626         METHOD is 0xF0 plus one of composition method (enum
1627         composition_method),
1628
1629         BYTES is 0xA0 plus a byte length of this composition data,
1630
1631         CHARS is 0x20 plus a number of characters composed by this
1632         data,
1633
1634         COMPONENTs are characters of multibye form or composition
1635         rules encoded by two-byte of ASCII codes.
1636
1637    In addition, for backward compatibility, the following formats are
1638    also recognized as composition data on decoding.
1639
1640    0x80 MSEQ ...
1641    0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1642
1643    Here,
1644         MSEQ is a multibyte form but in these special format:
1645           ASCII: 0xA0 ASCII_CODE+0x80,
1646           other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1647         RULE is a one byte code of the range 0xA0..0xF0 that
1648         represents a composition rule.
1649   */
1650
1651 char emacs_mule_bytes[256];
1652
1653 int
1654 emacs_mule_char (coding, src, nbytes, nchars, id)
1655      struct coding_system *coding;
1656      const unsigned char *src;
1657      int *nbytes, *nchars, *id;
1658 {
1659   const unsigned char *src_end = coding->source + coding->src_bytes;
1660   const unsigned char *src_base = src;
1661   int multibytep = coding->src_multibyte;
1662   struct charset *charset;
1663   unsigned code;
1664   int c;
1665   int consumed_chars = 0;
1666
1667   ONE_MORE_BYTE (c);
1668   if (c < 0)
1669     {
1670       c = -c;
1671       charset = emacs_mule_charset[0];
1672     }
1673   else
1674     {
1675       switch (emacs_mule_bytes[c])
1676         {
1677         case 2:
1678           if (! (charset = emacs_mule_charset[c]))
1679             goto invalid_code;
1680           ONE_MORE_BYTE (c);
1681           if (c < 0)
1682             goto invalid_code;
1683           code = c & 0x7F;
1684           break;
1685
1686         case 3:
1687           if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
1688               || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
1689             {
1690               ONE_MORE_BYTE (c);
1691               if (c < 0 || ! (charset = emacs_mule_charset[c]))
1692                 goto invalid_code;
1693               ONE_MORE_BYTE (c);
1694               if (c < 0)
1695                 goto invalid_code;
1696               code = c & 0x7F;
1697             }
1698           else
1699             {
1700               if (! (charset = emacs_mule_charset[c]))
1701                 goto invalid_code;
1702               ONE_MORE_BYTE (c);
1703               if (c < 0)
1704                 goto invalid_code;
1705               code = (c & 0x7F) << 8;
1706               ONE_MORE_BYTE (c);
1707               if (c < 0)
1708                 goto invalid_code;
1709               code |= c & 0x7F;
1710             }
1711           break;
1712
1713         case 4:
1714           ONE_MORE_BYTE (c);
1715           if (c < 0 || ! (charset = emacs_mule_charset[c]))
1716             goto invalid_code;
1717           ONE_MORE_BYTE (c);
1718           if (c < 0)
1719             goto invalid_code;
1720           code = (c & 0x7F) << 8;
1721           ONE_MORE_BYTE (c);
1722           if (c < 0)
1723             goto invalid_code;
1724           code |= c & 0x7F;
1725           break;
1726
1727         case 1:
1728           code = c;
1729           charset = CHARSET_FROM_ID (ASCII_BYTE_P (code)
1730                                      ? charset_ascii : charset_eight_bit);
1731           break;
1732
1733         default:
1734           abort ();
1735         }
1736       c = DECODE_CHAR (charset, code);
1737       if (c < 0)
1738         goto invalid_code;
1739     }
1740   *nbytes = src - src_base;
1741   *nchars = consumed_chars;
1742   if (id)
1743     *id = charset->id;
1744   return c;
1745
1746  no_more_source:
1747   return -2;
1748
1749  invalid_code:
1750   return -1;
1751 }
1752
1753
1754 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1755    Check if a text is encoded in `emacs-mule'.  If it is, return 1,
1756    else return 0.  */
1757
1758 static int
1759 detect_coding_emacs_mule (coding, detect_info)
1760      struct coding_system *coding;
1761      struct coding_detection_info *detect_info;
1762 {
1763   const unsigned char *src = coding->source, *src_base;
1764   const unsigned char *src_end = coding->source + coding->src_bytes;
1765   int multibytep = coding->src_multibyte;
1766   int consumed_chars = 0;
1767   int c;
1768   int found = 0;
1769
1770   detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1771   /* A coding system of this category is always ASCII compatible.  */
1772   src += coding->head_ascii;
1773
1774   while (1)
1775     {
1776       src_base = src;
1777       ONE_MORE_BYTE (c);
1778       if (c < 0)
1779         continue;
1780       if (c == 0x80)
1781         {
1782           /* Perhaps the start of composite character.  We simple skip
1783              it because analyzing it is too heavy for detecting.  But,
1784              at least, we check that the composite character
1785              constitues of more than 4 bytes.  */
1786           const unsigned char *src_base;
1787
1788         repeat:
1789           src_base = src;
1790           do
1791             {
1792               ONE_MORE_BYTE (c);
1793             }
1794           while (c >= 0xA0);
1795
1796           if (src - src_base <= 4)
1797             break;
1798           found = CATEGORY_MASK_EMACS_MULE;
1799           if (c == 0x80)
1800             goto repeat;
1801         }
1802
1803       if (c < 0x80)
1804         {
1805           if (c < 0x20
1806               && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
1807             break;
1808         }
1809       else
1810         {
1811           int more_bytes = emacs_mule_bytes[*src_base] - 1;
1812
1813           while (more_bytes > 0)
1814             {
1815               ONE_MORE_BYTE (c);
1816               if (c < 0xA0)
1817                 {
1818                   src--;        /* Unread the last byte.  */
1819                   break;
1820                 }
1821               more_bytes--;
1822             }
1823           if (more_bytes != 0)
1824             break;
1825           found = CATEGORY_MASK_EMACS_MULE;
1826         }
1827     }
1828   detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1829   return 0;
1830
1831  no_more_source:
1832   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1833     {
1834       detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1835       return 0;
1836     }
1837   detect_info->found |= found;
1838   return 1;
1839 }
1840
1841
1842 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
1843
1844 /* Decode a character represented as a component of composition
1845    sequence of Emacs 20/21 style at SRC.  Set C to that character and
1846    update SRC to the head of next character (or an encoded composition
1847    rule).  If SRC doesn't points a composition component, set C to -1.
1848    If SRC points an invalid byte sequence, global exit by a return
1849    value 0.  */
1850
1851 #define DECODE_EMACS_MULE_COMPOSITION_CHAR(buf)                 \
1852   if (1)                                                        \
1853     {                                                           \
1854       int c;                                                    \
1855       int nbytes, nchars;                                       \
1856                                                                 \
1857       if (src == src_end)                                       \
1858         break;                                                  \
1859       c = emacs_mule_char (coding, src, &nbytes, &nchars, NULL);\
1860       if (c < 0)                                                \
1861         {                                                       \
1862           if (c == -2)                                          \
1863             break;                                              \
1864           goto invalid_code;                                    \
1865         }                                                       \
1866       *buf++ = c;                                               \
1867       src += nbytes;                                            \
1868       consumed_chars += nchars;                                 \
1869     }                                                           \
1870   else
1871
1872
1873 /* Decode a composition rule represented as a component of composition
1874    sequence of Emacs 20 style at SRC.  Store the decoded rule in *BUF,
1875    and increment BUF.  If SRC points an invalid byte sequence, set C
1876    to -1.  */
1877
1878 #define DECODE_EMACS_MULE_COMPOSITION_RULE_20(buf)      \
1879   do {                                                  \
1880     int c, gref, nref;                                  \
1881                                                         \
1882     if (src >= src_end)                                 \
1883       goto invalid_code;                                \
1884     ONE_MORE_BYTE_NO_CHECK (c);                         \
1885     c -= 0x20;                                          \
1886     if (c < 0 || c >= 81)                               \
1887       goto invalid_code;                                \
1888                                                         \
1889     gref = c / 9, nref = c % 9;                         \
1890     *buf++ = COMPOSITION_ENCODE_RULE (gref, nref);      \
1891   } while (0)
1892
1893
1894 /* Decode a composition rule represented as a component of composition
1895    sequence of Emacs 21 style at SRC.  Store the decoded rule in *BUF,
1896    and increment BUF.  If SRC points an invalid byte sequence, set C
1897    to -1.  */
1898
1899 #define DECODE_EMACS_MULE_COMPOSITION_RULE_21(buf)      \
1900   do {                                                  \
1901     int gref, nref;                                     \
1902                                                         \
1903     if (src + 1>= src_end)                              \
1904       goto invalid_code;                                \
1905     ONE_MORE_BYTE_NO_CHECK (gref);                      \
1906     gref -= 0x20;                                       \
1907     ONE_MORE_BYTE_NO_CHECK (nref);                      \
1908     nref -= 0x20;                                       \
1909     if (gref < 0 || gref >= 81                          \
1910         || nref < 0 || nref >= 81)                      \
1911       goto invalid_code;                                \
1912     *buf++ = COMPOSITION_ENCODE_RULE (gref, nref);      \
1913   } while (0)
1914
1915
1916 #define DECODE_EMACS_MULE_21_COMPOSITION(c)                             \
1917   do {                                                                  \
1918     /* Emacs 21 style format.  The first three bytes at SRC are         \
1919        (METHOD - 0xF2), (BYTES - 0xA0), (CHARS - 0xA0), where BYTES is  \
1920        the byte length of this composition information, CHARS is the    \
1921        number of characters composed by this composition.  */           \
1922     enum composition_method method = c - 0xF2;                          \
1923     int *charbuf_base = charbuf;                                        \
1924     int from, to;                                                       \
1925     int consumed_chars_limit;                                           \
1926     int nbytes, nchars;                                                 \
1927                                                                         \
1928     ONE_MORE_BYTE (c);                                                  \
1929     if (c < 0)                                                          \
1930       goto invalid_code;                                                \
1931     nbytes = c - 0xA0;                                                  \
1932     if (nbytes < 3)                                                     \
1933       goto invalid_code;                                                \
1934     ONE_MORE_BYTE (c);                                                  \
1935     if (c < 0)                                                          \
1936       goto invalid_code;                                                \
1937     nchars = c - 0xA0;                                                  \
1938     from = coding->produced + char_offset;                              \
1939     to = from + nchars;                                                 \
1940     ADD_COMPOSITION_DATA (charbuf, from, to, method);                   \
1941     consumed_chars_limit = consumed_chars_base + nbytes;                \
1942     if (method != COMPOSITION_RELATIVE)                                 \
1943       {                                                                 \
1944         int i = 0;                                                      \
1945         while (consumed_chars < consumed_chars_limit)                   \
1946           {                                                             \
1947             if (i % 2 && method != COMPOSITION_WITH_ALTCHARS)           \
1948               DECODE_EMACS_MULE_COMPOSITION_RULE_21 (charbuf);          \
1949             else                                                        \
1950               DECODE_EMACS_MULE_COMPOSITION_CHAR (charbuf);             \
1951             i++;                                                        \
1952           }                                                             \
1953         if (consumed_chars < consumed_chars_limit)                      \
1954           goto invalid_code;                                            \
1955         charbuf_base[0] -= i;                                           \
1956       }                                                                 \
1957   } while (0)
1958
1959
1960 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION(c)            \
1961   do {                                                          \
1962     /* Emacs 20 style format for relative composition.  */      \
1963     /* Store multibyte form of characters to be composed.  */   \
1964     enum composition_method method = COMPOSITION_RELATIVE;      \
1965     int components[MAX_COMPOSITION_COMPONENTS * 2 - 1];         \
1966     int *buf = components;                                      \
1967     int i, j;                                                   \
1968     int from, to;                                               \
1969                                                                 \
1970     src = src_base;                                             \
1971     ONE_MORE_BYTE (c);          /* skip 0x80 */                 \
1972     for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++)            \
1973       DECODE_EMACS_MULE_COMPOSITION_CHAR (buf);                 \
1974     if (i < 2)                                                  \
1975       goto invalid_code;                                        \
1976     from = coding->produced_char + char_offset;                 \
1977     to = from + i;                                              \
1978     ADD_COMPOSITION_DATA (charbuf, from, to, method);           \
1979     for (j = 0; j < i; j++)                                     \
1980       *charbuf++ = components[j];                               \
1981   } while (0)
1982
1983
1984 #define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION(c)            \
1985   do {                                                          \
1986     /* Emacs 20 style format for rule-base composition.  */     \
1987     /* Store multibyte form of characters to be composed.  */   \
1988     enum composition_method method = COMPOSITION_WITH_RULE;     \
1989     int components[MAX_COMPOSITION_COMPONENTS * 2 - 1];         \
1990     int *buf = components;                                      \
1991     int i, j;                                                   \
1992     int from, to;                                               \
1993                                                                 \
1994     DECODE_EMACS_MULE_COMPOSITION_CHAR (buf);                   \
1995     for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++)            \
1996       {                                                         \
1997         DECODE_EMACS_MULE_COMPOSITION_RULE_20 (buf);            \
1998         DECODE_EMACS_MULE_COMPOSITION_CHAR (buf);               \
1999       }                                                         \
2000     if (i < 1 || (buf - components) % 2 == 0)                   \
2001       goto invalid_code;                                        \
2002     if (charbuf + i + (i / 2) + 1 < charbuf_end)                \
2003       goto no_more_source;                                      \
2004     from = coding->produced_char + char_offset;                 \
2005     to = from + i;                                              \
2006     ADD_COMPOSITION_DATA (buf, from, to, method);               \
2007     for (j = 0; j < i; j++)                                     \
2008       *charbuf++ = components[j];                               \
2009     for (j = 0; j < i; j += 2)                                  \
2010       *charbuf++ = components[j];                               \
2011   } while (0)
2012
2013
2014 static void
2015 decode_coding_emacs_mule (coding)
2016      struct coding_system *coding;
2017 {
2018   const unsigned char *src = coding->source + coding->consumed;
2019   const unsigned char *src_end = coding->source + coding->src_bytes;
2020   const unsigned char *src_base;
2021   int *charbuf = coding->charbuf;
2022   int *charbuf_end = charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
2023   int consumed_chars = 0, consumed_chars_base;
2024   int multibytep = coding->src_multibyte;
2025   Lisp_Object attrs, charset_list;
2026   int char_offset = coding->produced_char;
2027   int last_offset = char_offset;
2028   int last_id = charset_ascii;
2029
2030   CODING_GET_INFO (coding, attrs, charset_list);
2031
2032   while (1)
2033     {
2034       int c;
2035
2036       src_base = src;
2037       consumed_chars_base = consumed_chars;
2038
2039       if (charbuf >= charbuf_end)
2040         break;
2041
2042       ONE_MORE_BYTE (c);
2043       if (c < 0)
2044         {
2045           *charbuf++ = -c;
2046           char_offset++;
2047         }
2048       else if (c < 0x80)
2049         {
2050           *charbuf++ = c;
2051           char_offset++;
2052         }
2053       else if (c == 0x80)
2054         {
2055           ONE_MORE_BYTE (c);
2056           if (c < 0)
2057             goto invalid_code;
2058           if (c - 0xF2 >= COMPOSITION_RELATIVE
2059               && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)
2060             DECODE_EMACS_MULE_21_COMPOSITION (c);
2061           else if (c < 0xC0)
2062             DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION (c);
2063           else if (c == 0xFF)
2064             DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION (c);
2065           else
2066             goto invalid_code;
2067         }
2068       else if (c < 0xA0 && emacs_mule_bytes[c] > 1)
2069         {
2070           int nbytes, nchars;
2071           int id;
2072
2073           src = src_base;
2074           consumed_chars = consumed_chars_base;
2075           c = emacs_mule_char (coding, src, &nbytes, &nchars, &id);
2076           if (c < 0)
2077             {
2078               if (c == -2)
2079                 break;
2080               goto invalid_code;
2081             }
2082           if (last_id != id)
2083             {
2084               if (last_id != charset_ascii)
2085                 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
2086               last_id = id;
2087               last_offset = char_offset;
2088             }
2089           *charbuf++ = c;
2090           src += nbytes;
2091           consumed_chars += nchars;
2092           char_offset++;
2093         }
2094       continue;
2095
2096     invalid_code:
2097       src = src_base;
2098       consumed_chars = consumed_chars_base;
2099       ONE_MORE_BYTE (c);
2100       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
2101       char_offset++;
2102       coding->errors++;
2103     }
2104
2105  no_more_source:
2106   if (last_id != charset_ascii)
2107     ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
2108   coding->consumed_char += consumed_chars_base;
2109   coding->consumed = src_base - coding->source;
2110   coding->charbuf_used = charbuf - coding->charbuf;
2111 }
2112
2113
2114 #define EMACS_MULE_LEADING_CODES(id, codes)     \
2115   do {                                          \
2116     if (id < 0xA0)                              \
2117       codes[0] = id, codes[1] = 0;              \
2118     else if (id < 0xE0)                         \
2119       codes[0] = 0x9A, codes[1] = id;           \
2120     else if (id < 0xF0)                         \
2121       codes[0] = 0x9B, codes[1] = id;           \
2122     else if (id < 0xF5)                         \
2123       codes[0] = 0x9C, codes[1] = id;           \
2124     else                                        \
2125       codes[0] = 0x9D, codes[1] = id;           \
2126   } while (0);
2127
2128
2129 static int
2130 encode_coding_emacs_mule (coding)
2131      struct coding_system *coding;
2132 {
2133   int multibytep = coding->dst_multibyte;
2134   int *charbuf = coding->charbuf;
2135   int *charbuf_end = charbuf + coding->charbuf_used;
2136   unsigned char *dst = coding->destination + coding->produced;
2137   unsigned char *dst_end = coding->destination + coding->dst_bytes;
2138   int safe_room = 8;
2139   int produced_chars = 0;
2140   Lisp_Object attrs, charset_list;
2141   int c;
2142   int preferred_charset_id = -1;
2143
2144   CODING_GET_INFO (coding, attrs, charset_list);
2145   if (! EQ (charset_list, Vemacs_mule_charset_list))
2146     {
2147       CODING_ATTR_CHARSET_LIST (attrs)
2148         = charset_list = Vemacs_mule_charset_list;
2149     }
2150
2151   while (charbuf < charbuf_end)
2152     {
2153       ASSURE_DESTINATION (safe_room);
2154       c = *charbuf++;
2155
2156       if (c < 0)
2157         {
2158           /* Handle an annotation.  */
2159           switch (*charbuf)
2160             {
2161             case CODING_ANNOTATE_COMPOSITION_MASK:
2162               /* Not yet implemented.  */
2163               break;
2164             case CODING_ANNOTATE_CHARSET_MASK:
2165               preferred_charset_id = charbuf[3];
2166               if (preferred_charset_id >= 0
2167                   && NILP (Fmemq (make_number (preferred_charset_id),
2168                                   charset_list)))
2169                 preferred_charset_id = -1;
2170               break;
2171             default:
2172               abort ();
2173             }
2174           charbuf += -c - 1;
2175           continue;
2176         }
2177
2178       if (ASCII_CHAR_P (c))
2179         EMIT_ONE_ASCII_BYTE (c);
2180       else if (CHAR_BYTE8_P (c))
2181         {
2182           c = CHAR_TO_BYTE8 (c);
2183           EMIT_ONE_BYTE (c);
2184         }
2185       else
2186         {
2187           struct charset *charset;
2188           unsigned code;
2189           int dimension;
2190           int emacs_mule_id;
2191           unsigned char leading_codes[2];
2192
2193           if (preferred_charset_id >= 0)
2194             {
2195               charset = CHARSET_FROM_ID (preferred_charset_id);
2196               if (! CHAR_CHARSET_P (c, charset))
2197                 charset = char_charset (c, charset_list, NULL);
2198             }
2199           else
2200             charset = char_charset (c, charset_list, &code);
2201           if (! charset)
2202             {
2203               c = coding->default_char;
2204               if (ASCII_CHAR_P (c))
2205                 {
2206                   EMIT_ONE_ASCII_BYTE (c);
2207                   continue;
2208                 }
2209               charset = char_charset (c, charset_list, &code);
2210             }
2211           dimension = CHARSET_DIMENSION (charset);
2212           emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2213           EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2214           EMIT_ONE_BYTE (leading_codes[0]);
2215           if (leading_codes[1])
2216             EMIT_ONE_BYTE (leading_codes[1]);
2217           if (dimension == 1)
2218             EMIT_ONE_BYTE (code | 0x80);
2219           else
2220             {
2221               code |= 0x8080;
2222               EMIT_ONE_BYTE (code >> 8);
2223               EMIT_ONE_BYTE (code & 0xFF);
2224             }
2225         }
2226     }
2227   record_conversion_result (coding, CODING_RESULT_SUCCESS);
2228   coding->produced_char += produced_chars;
2229   coding->produced = dst - coding->destination;
2230   return 0;
2231 }
2232
2233 \f
2234 /*** 7. ISO2022 handlers ***/
2235
2236 /* The following note describes the coding system ISO2022 briefly.
2237    Since the intention of this note is to help understand the
2238    functions in this file, some parts are NOT ACCURATE or are OVERLY
2239    SIMPLIFIED.  For thorough understanding, please refer to the
2240    original document of ISO2022.  This is equivalent to the standard
2241    ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2242
2243    ISO2022 provides many mechanisms to encode several character sets
2244    in 7-bit and 8-bit environments.  For 7-bit environments, all text
2245    is encoded using bytes less than 128.  This may make the encoded
2246    text a little bit longer, but the text passes more easily through
2247    several types of gateway, some of which strip off the MSB (Most
2248    Significant Bit).
2249
2250    There are two kinds of character sets: control character sets and
2251    graphic character sets.  The former contain control characters such
2252    as `newline' and `escape' to provide control functions (control
2253    functions are also provided by escape sequences).  The latter
2254    contain graphic characters such as 'A' and '-'.  Emacs recognizes
2255    two control character sets and many graphic character sets.
2256
2257    Graphic character sets are classified into one of the following
2258    four classes, according to the number of bytes (DIMENSION) and
2259    number of characters in one dimension (CHARS) of the set:
2260    - DIMENSION1_CHARS94
2261    - DIMENSION1_CHARS96
2262    - DIMENSION2_CHARS94
2263    - DIMENSION2_CHARS96
2264
2265    In addition, each character set is assigned an identification tag,
2266    unique for each set, called the "final character" (denoted as <F>
2267    hereafter).  The <F> of each character set is decided by ECMA(*)
2268    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
2269    (0x30..0x3F are for private use only).
2270
2271    Note (*): ECMA = European Computer Manufacturers Association
2272
2273    Here are examples of graphic character sets [NAME(<F>)]:
2274         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2275         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2276         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2277         o DIMENSION2_CHARS96 -- none for the moment
2278
2279    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2280         C0 [0x00..0x1F] -- control character plane 0
2281         GL [0x20..0x7F] -- graphic character plane 0
2282         C1 [0x80..0x9F] -- control character plane 1
2283         GR [0xA0..0xFF] -- graphic character plane 1
2284
2285    A control character set is directly designated and invoked to C0 or
2286    C1 by an escape sequence.  The most common case is that:
2287    - ISO646's  control character set is designated/invoked to C0, and
2288    - ISO6429's control character set is designated/invoked to C1,
2289    and usually these designations/invocations are omitted in encoded
2290    text.  In a 7-bit environment, only C0 can be used, and a control
2291    character for C1 is encoded by an appropriate escape sequence to
2292    fit into the environment.  All control characters for C1 are
2293    defined to have corresponding escape sequences.
2294
2295    A graphic character set is at first designated to one of four
2296    graphic registers (G0 through G3), then these graphic registers are
2297    invoked to GL or GR.  These designations and invocations can be
2298    done independently.  The most common case is that G0 is invoked to
2299    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
2300    these invocations and designations are omitted in encoded text.
2301    In a 7-bit environment, only GL can be used.
2302
2303    When a graphic character set of CHARS94 is invoked to GL, codes
2304    0x20 and 0x7F of the GL area work as control characters SPACE and
2305    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2306    be used.
2307
2308    There are two ways of invocation: locking-shift and single-shift.
2309    With locking-shift, the invocation lasts until the next different
2310    invocation, whereas with single-shift, the invocation affects the
2311    following character only and doesn't affect the locking-shift
2312    state.  Invocations are done by the following control characters or
2313    escape sequences:
2314
2315    ----------------------------------------------------------------------
2316    abbrev  function                  cntrl escape seq   description
2317    ----------------------------------------------------------------------
2318    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
2319    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
2320    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
2321    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
2322    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
2323    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
2324    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
2325    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
2326    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
2327    ----------------------------------------------------------------------
2328    (*) These are not used by any known coding system.
2329
2330    Control characters for these functions are defined by macros
2331    ISO_CODE_XXX in `coding.h'.
2332
2333    Designations are done by the following escape sequences:
2334    ----------------------------------------------------------------------
2335    escape sequence      description
2336    ----------------------------------------------------------------------
2337    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
2338    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
2339    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
2340    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
2341    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
2342    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
2343    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
2344    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
2345    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
2346    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
2347    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
2348    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
2349    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
2350    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
2351    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
2352    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
2353    ----------------------------------------------------------------------
2354
2355    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2356    of dimension 1, chars 94, and final character <F>, etc...
2357
2358    Note (*): Although these designations are not allowed in ISO2022,
2359    Emacs accepts them on decoding, and produces them on encoding
2360    CHARS96 character sets in a coding system which is characterized as
2361    7-bit environment, non-locking-shift, and non-single-shift.
2362
2363    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2364    '(' must be omitted.  We refer to this as "short-form" hereafter.
2365
2366    Now you may notice that there are a lot of ways of encoding the
2367    same multilingual text in ISO2022.  Actually, there exist many
2368    coding systems such as Compound Text (used in X11's inter client
2369    communication, ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2370    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2371    localized platforms), and all of these are variants of ISO2022.
2372
2373    In addition to the above, Emacs handles two more kinds of escape
2374    sequences: ISO6429's direction specification and Emacs' private
2375    sequence for specifying character composition.
2376
2377    ISO6429's direction specification takes the following form:
2378         o CSI ']'      -- end of the current direction
2379         o CSI '0' ']'  -- end of the current direction
2380         o CSI '1' ']'  -- start of left-to-right text
2381         o CSI '2' ']'  -- start of right-to-left text
2382    The control character CSI (0x9B: control sequence introducer) is
2383    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2384
2385    Character composition specification takes the following form:
2386         o ESC '0' -- start relative composition
2387         o ESC '1' -- end composition
2388         o ESC '2' -- start rule-base composition (*)
2389         o ESC '3' -- start relative composition with alternate chars  (**)
2390         o ESC '4' -- start rule-base composition with alternate chars  (**)
2391   Since these are not standard escape sequences of any ISO standard,
2392   the use of them with these meanings is restricted to Emacs only.
2393
2394   (*) This form is used only in Emacs 20.7 and older versions,
2395   but newer versions can safely decode it.
2396   (**) This form is used only in Emacs 21.1 and newer versions,
2397   and older versions can't decode it.
2398
2399   Here's a list of example usages of these composition escape
2400   sequences (categorized by `enum composition_method').
2401
2402   COMPOSITION_RELATIVE:
2403         ESC 0 CHAR [ CHAR ] ESC 1
2404   COMPOSITION_WITH_RULE:
2405         ESC 2 CHAR [ RULE CHAR ] ESC 1
2406   COMPOSITION_WITH_ALTCHARS:
2407         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2408   COMPOSITION_WITH_RULE_ALTCHARS:
2409         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2410
2411 enum iso_code_class_type iso_code_class[256];
2412
2413 #define SAFE_CHARSET_P(coding, id)      \
2414   ((id) <= (coding)->max_charset_id     \
2415    && (coding)->safe_charsets[id] >= 0)
2416
2417
2418 #define SHIFT_OUT_OK(category)  \
2419   (CODING_ISO_INITIAL (&coding_categories[category], 1) >= 0)
2420
2421 static void
2422 setup_iso_safe_charsets (attrs)
2423      Lisp_Object attrs;
2424 {
2425   Lisp_Object charset_list, safe_charsets;
2426   Lisp_Object request;
2427   Lisp_Object reg_usage;
2428   Lisp_Object tail;
2429   int reg94, reg96;
2430   int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2431   int max_charset_id;
2432
2433   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2434   if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2435       && ! EQ (charset_list, Viso_2022_charset_list))
2436     {
2437       CODING_ATTR_CHARSET_LIST (attrs)
2438         = charset_list = Viso_2022_charset_list;
2439       ASET (attrs, coding_attr_safe_charsets, Qnil);
2440     }
2441
2442   if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2443     return;
2444
2445   max_charset_id = 0;
2446   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2447     {
2448       int id = XINT (XCAR (tail));
2449       if (max_charset_id < id)
2450         max_charset_id = id;
2451     }
2452
2453   safe_charsets = Fmake_string (make_number (max_charset_id + 1),
2454                                 make_number (255));
2455   request = AREF (attrs, coding_attr_iso_request);
2456   reg_usage = AREF (attrs, coding_attr_iso_usage);
2457   reg94 = XINT (XCAR (reg_usage));
2458   reg96 = XINT (XCDR (reg_usage));
2459
2460   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2461     {
2462       Lisp_Object id;
2463       Lisp_Object reg;
2464       struct charset *charset;
2465
2466       id = XCAR (tail);
2467       charset = CHARSET_FROM_ID (XINT (id));
2468       reg = Fcdr (Fassq (id, request));
2469       if (! NILP (reg))
2470         SSET (safe_charsets, XINT (id), XINT (reg));
2471       else if (charset->iso_chars_96)
2472         {
2473           if (reg96 < 4)
2474             SSET (safe_charsets, XINT (id), reg96);
2475         }
2476       else
2477         {
2478           if (reg94 < 4)
2479             SSET (safe_charsets, XINT (id), reg94);
2480         }
2481     }
2482   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2483 }
2484
2485
2486 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2487    Check if a text is encoded in one of ISO-2022 based codig systems.
2488    If it is, return 1, else return 0.  */
2489
2490 static int
2491 detect_coding_iso_2022 (coding, detect_info)
2492      struct coding_system *coding;
2493      struct coding_detection_info *detect_info;
2494 {
2495   const unsigned char *src = coding->source, *src_base = src;
2496   const unsigned char *src_end = coding->source + coding->src_bytes;
2497   int multibytep = coding->src_multibyte;
2498   int single_shifting = 0;
2499   int id;
2500   int c, c1;
2501   int consumed_chars = 0;
2502   int i;
2503   int rejected = 0;
2504   int found = 0;
2505
2506   detect_info->checked |= CATEGORY_MASK_ISO;
2507
2508   for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
2509     {
2510       struct coding_system *this = &(coding_categories[i]);
2511       Lisp_Object attrs, val;
2512
2513       attrs = CODING_ID_ATTRS (this->id);
2514       if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2515           && ! EQ (CODING_ATTR_SAFE_CHARSETS (attrs), Viso_2022_charset_list))
2516         setup_iso_safe_charsets (attrs);
2517       val = CODING_ATTR_SAFE_CHARSETS (attrs);
2518       this->max_charset_id = SCHARS (val) - 1;
2519       this->safe_charsets = (char *) SDATA (val);
2520     }
2521
2522   /* A coding system of this category is always ASCII compatible.  */
2523   src += coding->head_ascii;
2524
2525   while (rejected != CATEGORY_MASK_ISO)
2526     {
2527       src_base = src;
2528       ONE_MORE_BYTE (c);
2529       switch (c)
2530         {
2531         case ISO_CODE_ESC:
2532           if (inhibit_iso_escape_detection)
2533             break;
2534           single_shifting = 0;
2535           ONE_MORE_BYTE (c);
2536           if (c >= '(' && c <= '/')
2537             {
2538               /* Designation sequence for a charset of dimension 1.  */
2539               ONE_MORE_BYTE (c1);
2540               if (c1 < ' ' || c1 >= 0x80
2541                   || (id = iso_charset_table[0][c >= ','][c1]) < 0)
2542                 /* Invalid designation sequence.  Just ignore.  */
2543                 break;
2544             }
2545           else if (c == '$')
2546             {
2547               /* Designation sequence for a charset of dimension 2.  */
2548               ONE_MORE_BYTE (c);
2549               if (c >= '@' && c <= 'B')
2550                 /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
2551                 id = iso_charset_table[1][0][c];
2552               else if (c >= '(' && c <= '/')
2553                 {
2554                   ONE_MORE_BYTE (c1);
2555                   if (c1 < ' ' || c1 >= 0x80
2556                       || (id = iso_charset_table[1][c >= ','][c1]) < 0)
2557                     /* Invalid designation sequence.  Just ignore.  */
2558                     break;
2559                 }
2560               else
2561                 /* Invalid designation sequence.  Just ignore it.  */
2562                 break;
2563             }
2564           else if (c == 'N' || c == 'O')
2565             {
2566               /* ESC <Fe> for SS2 or SS3.  */
2567               single_shifting = 1;
2568               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
2569               break;
2570             }
2571           else if (c >= '0' && c <= '4')
2572             {
2573               /* ESC <Fp> for start/end composition.  */
2574               found |= CATEGORY_MASK_ISO;
2575               break;
2576             }
2577           else
2578             {
2579               /* Invalid escape sequence.  Just ignore it.  */
2580               break;
2581             }
2582
2583           /* We found a valid designation sequence for CHARSET.  */
2584           rejected |= CATEGORY_MASK_ISO_8BIT;
2585           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
2586                               id))
2587             found |= CATEGORY_MASK_ISO_7;
2588           else
2589             rejected |= CATEGORY_MASK_ISO_7;
2590           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
2591                               id))
2592             found |= CATEGORY_MASK_ISO_7_TIGHT;
2593           else
2594             rejected |= CATEGORY_MASK_ISO_7_TIGHT;
2595           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
2596                               id))
2597             found |= CATEGORY_MASK_ISO_7_ELSE;
2598           else
2599             rejected |= CATEGORY_MASK_ISO_7_ELSE;
2600           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
2601                               id))
2602             found |= CATEGORY_MASK_ISO_8_ELSE;
2603           else
2604             rejected |= CATEGORY_MASK_ISO_8_ELSE;
2605           break;
2606
2607         case ISO_CODE_SO:
2608         case ISO_CODE_SI:
2609           /* Locking shift out/in.  */
2610           if (inhibit_iso_escape_detection)
2611             break;
2612           single_shifting = 0;
2613           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
2614           found |= CATEGORY_MASK_ISO_ELSE;
2615           break;
2616
2617         case ISO_CODE_CSI:
2618           /* Control sequence introducer.  */
2619           single_shifting = 0;
2620           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
2621           found |= CATEGORY_MASK_ISO_8_ELSE;
2622           goto check_extra_latin;
2623
2624         case ISO_CODE_SS2:
2625         case ISO_CODE_SS3:
2626           /* Single shift.   */
2627           if (inhibit_iso_escape_detection)
2628             break;
2629           single_shifting = 0;
2630           rejected |= CATEGORY_MASK_ISO_7BIT;
2631           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2632               & CODING_ISO_FLAG_SINGLE_SHIFT)
2633             found |= CATEGORY_MASK_ISO_8_1, single_shifting = 1;
2634           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
2635               & CODING_ISO_FLAG_SINGLE_SHIFT)
2636             found |= CATEGORY_MASK_ISO_8_2, single_shifting = 1;
2637           if (single_shifting)
2638             break;
2639           goto check_extra_latin;
2640
2641         default:
2642           if (c < 0)
2643             continue;
2644           if (c < 0x80)
2645             {
2646               single_shifting = 0;
2647               break;
2648             }
2649           if (c >= 0xA0)
2650             {
2651               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
2652               found |= CATEGORY_MASK_ISO_8_1;
2653               /* Check the length of succeeding codes of the range
2654                  0xA0..0FF.  If the byte length is even, we include
2655                  CATEGORY_MASK_ISO_8_2 in `found'.  We can check this
2656                  only when we are not single shifting.  */
2657               if (! single_shifting
2658                   && ! (rejected & CATEGORY_MASK_ISO_8_2))
2659                 {
2660                   int i = 1;
2661                   while (src < src_end)
2662                     {
2663                       ONE_MORE_BYTE (c);
2664                       if (c < 0xA0)
2665                         break;
2666                       i++;
2667                     }
2668
2669                   if (i & 1 && src < src_end)
2670                     rejected |= CATEGORY_MASK_ISO_8_2;
2671                   else
2672                     found |= CATEGORY_MASK_ISO_8_2;
2673                 }
2674               break;
2675             }
2676         check_extra_latin:
2677           single_shifting = 0;
2678           if (! VECTORP (Vlatin_extra_code_table)
2679               || NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
2680             {
2681               rejected = CATEGORY_MASK_ISO;
2682               break;
2683             }
2684           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2685               & CODING_ISO_FLAG_LATIN_EXTRA)
2686             found |= CATEGORY_MASK_ISO_8_1;
2687           else
2688             rejected |= CATEGORY_MASK_ISO_8_1;
2689           rejected |= CATEGORY_MASK_ISO_8_2;
2690         }
2691     }
2692   detect_info->rejected |= CATEGORY_MASK_ISO;
2693   return 0;
2694
2695  no_more_source:
2696   detect_info->rejected |= rejected;
2697   detect_info->found |= (found & ~rejected);
2698   return 1;
2699 }
2700
2701
2702 /* Set designation state into CODING.  */
2703 #define DECODE_DESIGNATION(reg, dim, chars_96, final)                   \
2704   do {                                                                  \
2705     int id, prev;                                                       \
2706                                                                         \
2707     if (final < '0' || final >= 128                                     \
2708         || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0)        \
2709         || !SAFE_CHARSET_P (coding, id))                                \
2710       {                                                                 \
2711         CODING_ISO_DESIGNATION (coding, reg) = -2;                      \
2712         goto invalid_code;                                              \
2713       }                                                                 \
2714     prev = CODING_ISO_DESIGNATION (coding, reg);                        \
2715     if (id == charset_jisx0201_roman)                                   \
2716       {                                                                 \
2717         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)      \
2718           id = charset_ascii;                                           \
2719       }                                                                 \
2720     else if (id == charset_jisx0208_1978)                               \
2721       {                                                                 \
2722         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)     \
2723           id = charset_jisx0208;                                        \
2724       }                                                                 \
2725     CODING_ISO_DESIGNATION (coding, reg) = id;                          \
2726     /* If there was an invalid designation to REG previously, and this  \
2727        designation is ASCII to REG, we should keep this designation     \
2728        sequence.  */                                                    \
2729     if (prev == -2 && id == charset_ascii)                              \
2730       goto invalid_code;                                                \
2731   } while (0)
2732
2733
2734 #define MAYBE_FINISH_COMPOSITION()                              \
2735   do {                                                          \
2736     int i;                                                      \
2737     if (composition_state == COMPOSING_NO)                      \
2738       break;                                                    \
2739     /* It is assured that we have enough room for producing     \
2740        characters stored in the table `components'.  */         \
2741     if (charbuf + component_idx > charbuf_end)                  \
2742       goto no_more_source;                                      \
2743     composition_state = COMPOSING_NO;                           \
2744     if (method == COMPOSITION_RELATIVE                          \
2745         || method == COMPOSITION_WITH_ALTCHARS)                 \
2746       {                                                         \
2747         for (i = 0; i < component_idx; i++)                     \
2748           *charbuf++ = components[i];                           \
2749         char_offset += component_idx;                           \
2750       }                                                         \
2751     else                                                        \
2752       {                                                         \
2753         for (i = 0; i < component_idx; i += 2)                  \
2754           *charbuf++ = components[i];                           \
2755         char_offset += (component_idx / 2) + 1;                 \
2756       }                                                         \
2757   } while (0)
2758
2759
2760 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
2761    ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
2762    ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
2763    ESC 3 : altchar composition :  ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
2764    ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1
2765   */
2766
2767 #define DECODE_COMPOSITION_START(c1)                                    \
2768   do {                                                                  \
2769     if (c1 == '0'                                                       \
2770         && composition_state == COMPOSING_COMPONENT_RULE)               \
2771       {                                                                 \
2772         component_len = component_idx;                                  \
2773         composition_state = COMPOSING_CHAR;                             \
2774       }                                                                 \
2775     else                                                                \
2776       {                                                                 \
2777         const unsigned char *p;                                         \
2778                                                                         \
2779         MAYBE_FINISH_COMPOSITION ();                                    \
2780         if (charbuf + MAX_COMPOSITION_COMPONENTS > charbuf_end)         \
2781           goto no_more_source;                                          \
2782         for (p = src; p < src_end - 1; p++)                             \
2783           if (*p == ISO_CODE_ESC && p[1] == '1')                        \
2784             break;                                                      \
2785         if (p == src_end - 1)                                           \
2786           {                                                             \
2787             if (coding->mode & CODING_MODE_LAST_BLOCK)                  \
2788               goto invalid_code;                                        \
2789             goto no_more_source;                                        \
2790           }                                                             \
2791                                                                         \
2792         /* This is surely the start of a composition.  */               \
2793         method = (c1 == '0' ? COMPOSITION_RELATIVE                      \
2794                   : c1 == '2' ? COMPOSITION_WITH_RULE                   \
2795                   : c1 == '3' ? COMPOSITION_WITH_ALTCHARS               \
2796                   : COMPOSITION_WITH_RULE_ALTCHARS);                    \
2797         composition_state = (c1 <= '2' ? COMPOSING_CHAR                 \
2798                              : COMPOSING_COMPONENT_CHAR);               \
2799         component_idx = component_len = 0;                              \
2800       }                                                                 \
2801   } while (0)
2802
2803
2804 /* Handle compositoin end sequence ESC 1.  */
2805
2806 #define DECODE_COMPOSITION_END()                                        \
2807   do {                                                                  \
2808     int nchars = (component_len > 0 ? component_idx - component_len     \
2809                   : method == COMPOSITION_RELATIVE ? component_idx      \
2810                   : (component_idx + 1) / 2);                           \
2811     int i;                                                              \
2812     int *saved_charbuf = charbuf;                                       \
2813     int from = char_offset;                                             \
2814     int to = from + nchars;                                             \
2815                                                                         \
2816     ADD_COMPOSITION_DATA (charbuf, from, to, method);                   \
2817     if (method != COMPOSITION_RELATIVE)                                 \
2818       {                                                                 \
2819         if (component_len == 0)                                         \
2820           for (i = 0; i < component_idx; i++)                           \
2821             *charbuf++ = components[i];                                 \
2822         else                                                            \
2823           for (i = 0; i < component_len; i++)                           \
2824             *charbuf++ = components[i];                                 \
2825         *saved_charbuf = saved_charbuf - charbuf;                       \
2826       }                                                                 \
2827     if (method == COMPOSITION_WITH_RULE)                                \
2828       for (i = 0; i < component_idx; i += 2, char_offset++)             \
2829         *charbuf++ = components[i];                                     \
2830     else                                                                \
2831       for (i = component_len; i < component_idx; i++, char_offset++)    \
2832         *charbuf++ = components[i];                                     \
2833     coding->annotated = 1;                                              \
2834     composition_state = COMPOSING_NO;                                   \
2835   } while (0)
2836
2837
2838 /* Decode a composition rule from the byte C1 (and maybe one more byte
2839    from SRC) and store one encoded composition rule in
2840    coding->cmp_data.  */
2841
2842 #define DECODE_COMPOSITION_RULE(c1)                                     \
2843   do {                                                                  \
2844     (c1) -= 32;                                                         \
2845     if (c1 < 81)                /* old format (before ver.21) */        \
2846       {                                                                 \
2847         int gref = (c1) / 9;                                            \
2848         int nref = (c1) % 9;                                            \
2849         if (gref == 4) gref = 10;                                       \
2850         if (nref == 4) nref = 10;                                       \
2851         c1 = COMPOSITION_ENCODE_RULE (gref, nref);                      \
2852       }                                                                 \
2853     else if (c1 < 93)           /* new format (after ver.21) */         \
2854       {                                                                 \
2855         ONE_MORE_BYTE (c2);                                             \
2856         c1 = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32);                \
2857       }                                                                 \
2858     else                                                                \
2859       c1 = 0;                                                           \
2860   } while (0)
2861
2862
2863 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
2864
2865 static void
2866 decode_coding_iso_2022 (coding)
2867      struct coding_system *coding;
2868 {
2869   const unsigned char *src = coding->source + coding->consumed;
2870   const unsigned char *src_end = coding->source + coding->src_bytes;
2871   const unsigned char *src_base;
2872   int *charbuf = coding->charbuf;
2873   int *charbuf_end
2874     = charbuf + coding->charbuf_size - 4 - MAX_ANNOTATION_LENGTH;
2875   int consumed_chars = 0, consumed_chars_base;
2876   int multibytep = coding->src_multibyte;
2877   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
2878   int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2879   int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
2880   struct charset *charset;
2881   int c;
2882   /* For handling composition sequence.  */
2883 #define COMPOSING_NO                    0
2884 #define COMPOSING_CHAR                  1
2885 #define COMPOSING_RULE                  2
2886 #define COMPOSING_COMPONENT_CHAR        3
2887 #define COMPOSING_COMPONENT_RULE        4
2888
2889   int composition_state = COMPOSING_NO;
2890   enum composition_method method;
2891   int components[MAX_COMPOSITION_COMPONENTS * 2 + 1];
2892   int component_idx;
2893   int component_len;
2894   Lisp_Object attrs, charset_list;
2895   int char_offset = coding->produced_char;
2896   int last_offset = char_offset;
2897   int last_id = charset_ascii;
2898
2899   CODING_GET_INFO (coding, attrs, charset_list);
2900   setup_iso_safe_charsets (attrs);
2901
2902   while (1)
2903     {
2904       int c1, c2;
2905
2906       src_base = src;
2907       consumed_chars_base = consumed_chars;
2908
2909       if (charbuf >= charbuf_end)
2910         break;
2911
2912       ONE_MORE_BYTE (c1);
2913       if (c1 < 0)
2914         goto invalid_code;
2915
2916       /* We produce at most one character.  */
2917       switch (iso_code_class [c1])
2918         {
2919         case ISO_0x20_or_0x7F:
2920           if (composition_state != COMPOSING_NO)
2921             {
2922               if (composition_state == COMPOSING_RULE
2923                   || composition_state == COMPOSING_COMPONENT_RULE)
2924                 {
2925                   DECODE_COMPOSITION_RULE (c1);
2926                   components[component_idx++] = c1;
2927                   composition_state--;
2928                   continue;
2929                 }
2930             }
2931           if (charset_id_0 < 0
2932               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
2933             /* This is SPACE or DEL.  */
2934             charset = CHARSET_FROM_ID (charset_ascii);
2935           else
2936             charset = CHARSET_FROM_ID (charset_id_0);
2937           break;
2938
2939         case ISO_graphic_plane_0:
2940           if (composition_state != COMPOSING_NO)
2941             {
2942               if (composition_state == COMPOSING_RULE
2943                   || composition_state == COMPOSING_COMPONENT_RULE)
2944                 {
2945                   DECODE_COMPOSITION_RULE (c1);
2946                   components[component_idx++] = c1;
2947                   composition_state--;
2948                   continue;
2949                 }
2950             }
2951           charset = CHARSET_FROM_ID (charset_id_0);
2952           break;
2953
2954         case ISO_0xA0_or_0xFF:
2955           if (charset_id_1 < 0
2956               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
2957               || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
2958             goto invalid_code;
2959           /* This is a graphic character, we fall down ... */
2960
2961         case ISO_graphic_plane_1:
2962           if (charset_id_1 < 0)
2963             goto invalid_code;
2964           charset = CHARSET_FROM_ID (charset_id_1);
2965           break;
2966
2967         case ISO_control_0:
2968           MAYBE_FINISH_COMPOSITION ();
2969           charset = CHARSET_FROM_ID (charset_ascii);
2970           break;
2971
2972         case ISO_control_1:
2973           MAYBE_FINISH_COMPOSITION ();
2974           goto invalid_code;
2975
2976         case ISO_shift_out:
2977           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
2978               || CODING_ISO_DESIGNATION (coding, 1) < 0)
2979             goto invalid_code;
2980           CODING_ISO_INVOCATION (coding, 0) = 1;
2981           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2982           continue;
2983
2984         case ISO_shift_in:
2985           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
2986             goto invalid_code;
2987           CODING_ISO_INVOCATION (coding, 0) = 0;
2988           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2989           continue;
2990
2991         case ISO_single_shift_2_7:
2992         case ISO_single_shift_2:
2993           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
2994             goto invalid_code;
2995           /* SS2 is handled as an escape sequence of ESC 'N' */
2996           c1 = 'N';
2997           goto label_escape_sequence;
2998
2999         case ISO_single_shift_3:
3000           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3001             goto invalid_code;
3002           /* SS2 is handled as an escape sequence of ESC 'O' */
3003           c1 = 'O';
3004           goto label_escape_sequence;
3005
3006         case ISO_control_sequence_introducer:
3007           /* CSI is handled as an escape sequence of ESC '[' ...  */
3008           c1 = '[';
3009           goto label_escape_sequence;
3010
3011         case ISO_escape:
3012           ONE_MORE_BYTE (c1);
3013         label_escape_sequence:
3014           /* Escape sequences handled here are invocation,
3015              designation, direction specification, and character
3016              composition specification.  */
3017           switch (c1)
3018             {
3019             case '&':           /* revision of following character set */
3020               ONE_MORE_BYTE (c1);
3021               if (!(c1 >= '@' && c1 <= '~'))
3022                 goto invalid_code;
3023               ONE_MORE_BYTE (c1);
3024               if (c1 != ISO_CODE_ESC)
3025                 goto invalid_code;
3026               ONE_MORE_BYTE (c1);
3027               goto label_escape_sequence;
3028
3029             case '$':           /* designation of 2-byte character set */
3030               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3031                 goto invalid_code;
3032               ONE_MORE_BYTE (c1);
3033               if (c1 >= '@' && c1 <= 'B')
3034                 {       /* designation of JISX0208.1978, GB2312.1980,
3035                            or JISX0208.1980 */
3036                   DECODE_DESIGNATION (0, 2, 0, c1);
3037                 }
3038               else if (c1 >= 0x28 && c1 <= 0x2B)
3039                 {       /* designation of DIMENSION2_CHARS94 character set */
3040                   ONE_MORE_BYTE (c2);
3041                   DECODE_DESIGNATION (c1 - 0x28, 2, 0, c2);
3042                 }
3043               else if (c1 >= 0x2C && c1 <= 0x2F)
3044                 {       /* designation of DIMENSION2_CHARS96 character set */
3045                   ONE_MORE_BYTE (c2);
3046                   DECODE_DESIGNATION (c1 - 0x2C, 2, 1, c2);
3047                 }
3048               else
3049                 goto invalid_code;
3050               /* We must update these variables now.  */
3051               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3052               charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3053               continue;
3054
3055             case 'n':           /* invocation of locking-shift-2 */
3056               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3057                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3058                 goto invalid_code;
3059               CODING_ISO_INVOCATION (coding, 0) = 2;
3060               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3061               continue;
3062
3063             case 'o':           /* invocation of locking-shift-3 */
3064               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3065                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3066                 goto invalid_code;
3067               CODING_ISO_INVOCATION (coding, 0) = 3;
3068               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3069               continue;
3070
3071             case 'N':           /* invocation of single-shift-2 */
3072               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3073                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3074                 goto invalid_code;
3075               charset = CHARSET_FROM_ID (CODING_ISO_DESIGNATION (coding, 2));
3076               ONE_MORE_BYTE (c1);
3077               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3078                 goto invalid_code;
3079               break;
3080
3081             case 'O':           /* invocation of single-shift-3 */
3082               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3083                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3084                 goto invalid_code;
3085               charset = CHARSET_FROM_ID (CODING_ISO_DESIGNATION (coding, 3));
3086               ONE_MORE_BYTE (c1);
3087               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3088                 goto invalid_code;
3089               break;
3090
3091             case '0': case '2': case '3': case '4': /* start composition */
3092               if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3093                 goto invalid_code;
3094               DECODE_COMPOSITION_START (c1);
3095               continue;
3096
3097             case '1':           /* end composition */
3098               if (composition_state == COMPOSING_NO)
3099                 goto invalid_code;
3100               DECODE_COMPOSITION_END ();
3101               continue;
3102
3103             case '[':           /* specification of direction */
3104               if (! CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION)
3105                 goto invalid_code;
3106               /* For the moment, nested direction is not supported.
3107                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
3108                  left-to-right, and nozero means right-to-left.  */
3109               ONE_MORE_BYTE (c1);
3110               switch (c1)
3111                 {
3112                 case ']':       /* end of the current direction */
3113                   coding->mode &= ~CODING_MODE_DIRECTION;
3114
3115                 case '0':       /* end of the current direction */
3116                 case '1':       /* start of left-to-right direction */
3117                   ONE_MORE_BYTE (c1);
3118                   if (c1 == ']')
3119                     coding->mode &= ~CODING_MODE_DIRECTION;
3120                   else
3121                     goto invalid_code;
3122                   break;
3123
3124                 case '2':       /* start of right-to-left direction */
3125                   ONE_MORE_BYTE (c1);
3126                   if (c1 == ']')
3127                     coding->mode |= CODING_MODE_DIRECTION;
3128                   else
3129                     goto invalid_code;
3130                   break;
3131
3132                 default:
3133                   goto invalid_code;
3134                 }
3135               continue;
3136
3137             case '%':
3138               ONE_MORE_BYTE (c1);
3139               if (c1 == '/')
3140                 {
3141                   /* CTEXT extended segment:
3142                      ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3143                      We keep these bytes as is for the moment.
3144                      They may be decoded by post-read-conversion.  */
3145                   int dim, M, L;
3146                   int size;
3147
3148                   ONE_MORE_BYTE (dim);
3149                   ONE_MORE_BYTE (M);
3150                   ONE_MORE_BYTE (L);
3151                   size = ((M - 128) * 128) + (L - 128);
3152                   if (charbuf + 8 + size > charbuf_end)
3153                     goto break_loop;
3154                   *charbuf++ = ISO_CODE_ESC;
3155                   *charbuf++ = '%';
3156                   *charbuf++ = '/';
3157                   *charbuf++ = dim;
3158                   *charbuf++ = BYTE8_TO_CHAR (M);
3159                   *charbuf++ = BYTE8_TO_CHAR (L);
3160                   while (size-- > 0)
3161                     {
3162                       ONE_MORE_BYTE (c1);
3163                       *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3164                     }
3165                 }
3166               else if (c1 == 'G')
3167                 {
3168                   /* XFree86 extension for embedding UTF-8 in CTEXT:
3169                      ESC % G --UTF-8-BYTES-- ESC % @
3170                      We keep these bytes as is for the moment.
3171                      They may be decoded by post-read-conversion.  */
3172                   int *p = charbuf;
3173
3174                   if (p + 6 > charbuf_end)
3175                     goto break_loop;
3176                   *p++ = ISO_CODE_ESC;
3177                   *p++ = '%';
3178                   *p++ = 'G';
3179                   while (p < charbuf_end)
3180                     {
3181                       ONE_MORE_BYTE (c1);
3182                       if (c1 == ISO_CODE_ESC
3183                           && src + 1 < src_end
3184                           && src[0] == '%'
3185                           && src[1] == '@')
3186                         break;
3187                       *p++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3188                     }
3189                   if (p + 3 > charbuf_end)
3190                     goto break_loop;
3191                   *p++ = ISO_CODE_ESC;
3192                   *p++ = '%';
3193                   *p++ = '@';
3194                   charbuf = p;
3195                 }
3196               else
3197                 goto invalid_code;
3198               continue;
3199               break;
3200
3201             default:
3202               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3203                 goto invalid_code;
3204               if (c1 >= 0x28 && c1 <= 0x2B)
3205                 {       /* designation of DIMENSION1_CHARS94 character set */
3206                   ONE_MORE_BYTE (c2);
3207                   DECODE_DESIGNATION (c1 - 0x28, 1, 0, c2);
3208                 }
3209               else if (c1 >= 0x2C && c1 <= 0x2F)
3210                 {       /* designation of DIMENSION1_CHARS96 character set */
3211                   ONE_MORE_BYTE (c2);
3212                   DECODE_DESIGNATION (c1 - 0x2C, 1, 1, c2);
3213                 }
3214               else
3215                 goto invalid_code;
3216               /* We must update these variables now.  */
3217               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3218               charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3219               continue;
3220             }
3221         }
3222
3223       if (charset->id != charset_ascii
3224           && last_id != charset->id)
3225         {
3226           if (last_id != charset_ascii)
3227             ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
3228           last_id = charset->id;
3229           last_offset = char_offset;
3230         }
3231
3232       /* Now we know CHARSET and 1st position code C1 of a character.
3233          Produce a decoded character while getting 2nd position code
3234          C2 if necessary.  */
3235       c1 &= 0x7F;
3236       if (CHARSET_DIMENSION (charset) > 1)
3237         {
3238           ONE_MORE_BYTE (c2);
3239           if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
3240             /* C2 is not in a valid range.  */
3241             goto invalid_code;
3242           c1 = (c1 << 8) | (c2 & 0x7F);
3243           if (CHARSET_DIMENSION (charset) > 2)
3244             {
3245               ONE_MORE_BYTE (c2);
3246               if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
3247                 /* C2 is not in a valid range.  */
3248                 goto invalid_code;
3249               c1 = (c1 << 8) | (c2 & 0x7F);
3250             }
3251         }
3252
3253       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3254       if (c < 0)
3255         {
3256           MAYBE_FINISH_COMPOSITION ();
3257           for (; src_base < src; src_base++, char_offset++)
3258             {
3259               if (ASCII_BYTE_P (*src_base))
3260                 *charbuf++ = *src_base;
3261               else
3262                 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3263             }
3264         }
3265       else if (composition_state == COMPOSING_NO)
3266         {
3267           *charbuf++ = c;
3268           char_offset++;
3269         }
3270       else
3271         {
3272           components[component_idx++] = c;
3273           if (method == COMPOSITION_WITH_RULE
3274               || (method == COMPOSITION_WITH_RULE_ALTCHARS
3275                   && composition_state == COMPOSING_COMPONENT_CHAR))
3276             composition_state++;
3277         }
3278       continue;
3279
3280     invalid_code:
3281       MAYBE_FINISH_COMPOSITION ();
3282       src = src_base;
3283       consumed_chars = consumed_chars_base;
3284       ONE_MORE_BYTE (c);
3285       *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
3286       char_offset++;
3287       coding->errors++;
3288       continue;
3289
3290     break_loop:
3291       break;
3292     }
3293
3294  no_more_source:
3295   if (last_id != charset_ascii)
3296     ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
3297   coding->consumed_char += consumed_chars_base;
3298   coding->consumed = src_base - coding->source;
3299   coding->charbuf_used = charbuf - coding->charbuf;
3300 }
3301
3302
3303 /* ISO2022 encoding stuff.  */
3304
3305 /*
3306    It is not enough to say just "ISO2022" on encoding, we have to
3307    specify more details.  In Emacs, each coding system of ISO2022
3308    variant has the following specifications:
3309         1. Initial designation to G0 thru G3.
3310         2. Allows short-form designation?
3311         3. ASCII should be designated to G0 before control characters?
3312         4. ASCII should be designated to G0 at end of line?
3313         5. 7-bit environment or 8-bit environment?
3314         6. Use locking-shift?
3315         7. Use Single-shift?
3316    And the following two are only for Japanese:
3317         8. Use ASCII in place of JIS0201-1976-Roman?
3318         9. Use JISX0208-1983 in place of JISX0208-1978?
3319    These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
3320    defined by macros CODING_ISO_FLAG_XXX.  See `coding.h' for more
3321    details.
3322 */
3323
3324 /* Produce codes (escape sequence) for designating CHARSET to graphic
3325    register REG at DST, and increment DST.  If <final-char> of CHARSET is
3326    '@', 'A', or 'B' and the coding system CODING allows, produce
3327    designation sequence of short-form.  */
3328
3329 #define ENCODE_DESIGNATION(charset, reg, coding)                        \
3330   do {                                                                  \
3331     unsigned char final_char = CHARSET_ISO_FINAL (charset);             \
3332     char *intermediate_char_94 = "()*+";                                \
3333     char *intermediate_char_96 = ",-./";                                \
3334     int revision = -1;                                                  \
3335     int c;                                                              \
3336                                                                         \
3337     if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION)           \
3338       revision = CHARSET_ISO_REVISION (charset);                        \
3339                                                                         \
3340     if (revision >= 0)                                                  \
3341       {                                                                 \
3342         EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&');                       \
3343         EMIT_ONE_BYTE ('@' + revision);                                 \
3344       }                                                                 \
3345     EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC);                                 \
3346     if (CHARSET_DIMENSION (charset) == 1)                               \
3347       {                                                                 \
3348         if (! CHARSET_ISO_CHARS_96 (charset))                           \
3349           c = intermediate_char_94[reg];                                \
3350         else                                                            \
3351           c = intermediate_char_96[reg];                                \
3352         EMIT_ONE_ASCII_BYTE (c);                                        \
3353       }                                                                 \
3354     else                                                                \
3355       {                                                                 \
3356         EMIT_ONE_ASCII_BYTE ('$');                                      \
3357         if (! CHARSET_ISO_CHARS_96 (charset))                           \
3358           {                                                             \
3359             if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM   \
3360                 || reg != 0                                             \
3361                 || final_char < '@' || final_char > 'B')                \
3362               EMIT_ONE_ASCII_BYTE (intermediate_char_94[reg]);          \
3363           }                                                             \
3364         else                                                            \
3365           EMIT_ONE_ASCII_BYTE (intermediate_char_96[reg]);              \
3366       }                                                                 \
3367     EMIT_ONE_ASCII_BYTE (final_char);                                   \
3368                                                                         \
3369     CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset);        \
3370   } while (0)
3371
3372
3373 /* The following two macros produce codes (control character or escape
3374    sequence) for ISO2022 single-shift functions (single-shift-2 and
3375    single-shift-3).  */
3376
3377 #define ENCODE_SINGLE_SHIFT_2                                           \
3378   do {                                                                  \
3379     if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)         \
3380       EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N');                         \
3381     else                                                                \
3382       EMIT_ONE_BYTE (ISO_CODE_SS2);                                     \
3383     CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
3384   } while (0)
3385
3386
3387 #define ENCODE_SINGLE_SHIFT_3                                           \
3388   do {                                                                  \
3389     if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)         \
3390       EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O');                         \
3391     else                                                                \
3392       EMIT_ONE_BYTE (ISO_CODE_SS3);                                     \
3393     CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
3394   } while (0)
3395
3396
3397 /* The following four macros produce codes (control character or
3398    escape sequence) for ISO2022 locking-shift functions (shift-in,
3399    shift-out, locking-shift-2, and locking-shift-3).  */
3400
3401 #define ENCODE_SHIFT_IN                                 \
3402   do {                                                  \
3403     EMIT_ONE_ASCII_BYTE (ISO_CODE_SI);                  \
3404     CODING_ISO_INVOCATION (coding, 0) = 0;              \
3405   } while (0)
3406
3407
3408 #define ENCODE_SHIFT_OUT                                \
3409   do {                                                  \
3410     EMIT_ONE_ASCII_BYTE (ISO_CODE_SO);                  \
3411     CODING_ISO_INVOCATION (coding, 0) = 1;              \
3412   } while (0)
3413
3414
3415 #define ENCODE_LOCKING_SHIFT_2                          \
3416   do {                                                  \
3417     EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');           \
3418     CODING_ISO_INVOCATION (coding, 0) = 2;              \
3419   } while (0)
3420
3421
3422 #define ENCODE_LOCKING_SHIFT_3                          \
3423   do {                                                  \
3424     EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');           \
3425     CODING_ISO_INVOCATION (coding, 0) = 3;              \
3426   } while (0)
3427
3428
3429 /* Produce codes for a DIMENSION1 character whose character set is
3430    CHARSET and whose position-code is C1.  Designation and invocation
3431    sequences are also produced in advance if necessary.  */
3432
3433 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
3434   do {                                                                  \
3435     int id = CHARSET_ID (charset);                                      \
3436                                                                         \
3437     if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)         \
3438         && id == charset_ascii)                                         \
3439       {                                                                 \
3440         id = charset_jisx0201_roman;                                    \
3441         charset = CHARSET_FROM_ID (id);                                 \
3442       }                                                                 \
3443                                                                         \
3444     if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
3445       {                                                                 \
3446         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
3447           EMIT_ONE_ASCII_BYTE (c1 & 0x7F);                              \
3448         else                                                            \
3449           EMIT_ONE_BYTE (c1 | 0x80);                                    \
3450         CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
3451         break;                                                          \
3452       }                                                                 \
3453     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
3454       {                                                                 \
3455         EMIT_ONE_ASCII_BYTE (c1 & 0x7F);                                \
3456         break;                                                          \
3457       }                                                                 \
3458     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
3459       {                                                                 \
3460         EMIT_ONE_BYTE (c1 | 0x80);                                      \
3461         break;                                                          \
3462       }                                                                 \
3463     else                                                                \
3464       /* Since CHARSET is not yet invoked to any graphic planes, we     \
3465          must invoke it, or, at first, designate it to some graphic     \
3466          register.  Then repeat the loop to actually produce the        \
3467          character.  */                                                 \
3468       dst = encode_invocation_designation (charset, coding, dst,        \
3469                                            &produced_chars);            \
3470   } while (1)
3471
3472
3473 /* Produce codes for a DIMENSION2 character whose character set is
3474    CHARSET and whose position-codes are C1 and C2.  Designation and
3475    invocation codes are also produced in advance if necessary.  */
3476
3477 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
3478   do {                                                                  \
3479     int id = CHARSET_ID (charset);                                      \
3480                                                                         \
3481     if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)        \
3482         && id == charset_jisx0208)                                      \
3483       {                                                                 \
3484         id = charset_jisx0208_1978;                                     \
3485         charset = CHARSET_FROM_ID (id);                                 \
3486       }                                                                 \
3487                                                                         \
3488     if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
3489       {                                                                 \
3490         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
3491           EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);              \
3492         else                                                            \
3493           EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                    \
3494         CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
3495         break;                                                          \
3496       }                                                                 \
3497     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
3498       {                                                                 \
3499         EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);                \
3500         break;                                                          \
3501       }                                                                 \
3502     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
3503       {                                                                 \
3504         EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                      \
3505         break;                                                          \
3506       }                                                                 \
3507     else                                                                \
3508       /* Since CHARSET is not yet invoked to any graphic planes, we     \
3509          must invoke it, or, at first, designate it to some graphic     \
3510          register.  Then repeat the loop to actually produce the        \
3511          character.  */                                                 \
3512       dst = encode_invocation_designation (charset, coding, dst,        \
3513                                            &produced_chars);            \
3514   } while (1)
3515
3516
3517 #define ENCODE_ISO_CHARACTER(charset, c)                                   \
3518   do {                                                                     \
3519     int code = ENCODE_CHAR ((charset),(c));                                \
3520                                                                            \
3521     if (CHARSET_DIMENSION (charset) == 1)                                  \
3522       ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code);                   \
3523     else                                                                   \
3524       ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
3525   } while (0)
3526
3527
3528 /* Produce designation and invocation codes at a place pointed by DST
3529    to use CHARSET.  The element `spec.iso_2022' of *CODING is updated.
3530    Return new DST.  */
3531
3532 unsigned char *
3533 encode_invocation_designation (charset, coding, dst, p_nchars)
3534      struct charset *charset;
3535      struct coding_system *coding;
3536      unsigned char *dst;
3537      int *p_nchars;
3538 {
3539   int multibytep = coding->dst_multibyte;
3540   int produced_chars = *p_nchars;
3541   int reg;                      /* graphic register number */
3542   int id = CHARSET_ID (charset);
3543
3544   /* At first, check designations.  */
3545   for (reg = 0; reg < 4; reg++)
3546     if (id == CODING_ISO_DESIGNATION (coding, reg))
3547       break;
3548
3549   if (reg >= 4)
3550     {
3551       /* CHARSET is not yet designated to any graphic registers.  */
3552       /* At first check the requested designation.  */
3553       reg = CODING_ISO_REQUEST (coding, id);
3554       if (reg < 0)
3555         /* Since CHARSET requests no special designation, designate it
3556            to graphic register 0.  */
3557         reg = 0;
3558
3559       ENCODE_DESIGNATION (charset, reg, coding);
3560     }
3561
3562   if (CODING_ISO_INVOCATION (coding, 0) != reg
3563       && CODING_ISO_INVOCATION (coding, 1) != reg)
3564     {
3565       /* Since the graphic register REG is not invoked to any graphic
3566          planes, invoke it to graphic plane 0.  */
3567       switch (reg)
3568         {
3569         case 0:                 /* graphic register 0 */
3570           ENCODE_SHIFT_IN;
3571           break;
3572
3573         case 1:                 /* graphic register 1 */
3574           ENCODE_SHIFT_OUT;
3575           break;
3576
3577         case 2:                 /* graphic register 2 */
3578           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3579             ENCODE_SINGLE_SHIFT_2;
3580           else
3581             ENCODE_LOCKING_SHIFT_2;
3582           break;
3583
3584         case 3:                 /* graphic register 3 */
3585           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3586             ENCODE_SINGLE_SHIFT_3;
3587           else
3588             ENCODE_LOCKING_SHIFT_3;
3589           break;
3590         }
3591     }
3592
3593   *p_nchars = produced_chars;
3594   return dst;
3595 }
3596
3597 /* The following three macros produce codes for indicating direction
3598    of text.  */
3599 #define ENCODE_CONTROL_SEQUENCE_INTRODUCER                              \
3600   do {                                                                  \
3601     if (CODING_ISO_FLAGS (coding) == CODING_ISO_FLAG_SEVEN_BITS)        \
3602       EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '[');                         \
3603     else                                                                \
3604       EMIT_ONE_BYTE (ISO_CODE_CSI);                                     \
3605   } while (0)
3606
3607
3608 #define ENCODE_DIRECTION_R2L()                  \
3609   do {                                          \
3610     ENCODE_CONTROL_SEQUENCE_INTRODUCER (dst);   \
3611     EMIT_TWO_ASCII_BYTES ('2', ']');            \
3612   } while (0)
3613
3614
3615 #define ENCODE_DIRECTION_L2R()                  \
3616   do {                                          \
3617     ENCODE_CONTROL_SEQUENCE_INTRODUCER (dst);   \
3618     EMIT_TWO_ASCII_BYTES ('0', ']');            \
3619   } while (0)
3620
3621
3622 /* Produce codes for designation and invocation to reset the graphic
3623    planes and registers to initial state.  */
3624 #define ENCODE_RESET_PLANE_AND_REGISTER()                               \
3625   do {                                                                  \
3626     int reg;                                                            \
3627     struct charset *charset;                                            \
3628                                                                         \
3629     if (CODING_ISO_INVOCATION (coding, 0) != 0)                         \
3630       ENCODE_SHIFT_IN;                                                  \
3631     for (reg = 0; reg < 4; reg++)                                       \
3632       if (CODING_ISO_INITIAL (coding, reg) >= 0                         \
3633           && (CODING_ISO_DESIGNATION (coding, reg)                      \
3634               != CODING_ISO_INITIAL (coding, reg)))                     \
3635         {                                                               \
3636           charset = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg)); \
3637           ENCODE_DESIGNATION (charset, reg, coding);                    \
3638         }                                                               \
3639   } while (0)
3640
3641
3642 /* Produce designation sequences of charsets in the line started from
3643    SRC to a place pointed by DST, and return updated DST.
3644
3645    If the current block ends before any end-of-line, we may fail to
3646    find all the necessary designations.  */
3647
3648 static unsigned char *
3649 encode_designation_at_bol (coding, charbuf, charbuf_end, dst)
3650      struct coding_system *coding;
3651      int *charbuf, *charbuf_end;
3652      unsigned char *dst;
3653 {
3654   struct charset *charset;
3655   /* Table of charsets to be designated to each graphic register.  */
3656   int r[4];
3657   int c, found = 0, reg;
3658   int produced_chars = 0;
3659   int multibytep = coding->dst_multibyte;
3660   Lisp_Object attrs;
3661   Lisp_Object charset_list;
3662
3663   attrs = CODING_ID_ATTRS (coding->id);
3664   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
3665   if (EQ (charset_list, Qiso_2022))
3666     charset_list = Viso_2022_charset_list;
3667
3668   for (reg = 0; reg < 4; reg++)
3669     r[reg] = -1;
3670
3671   while (found < 4)
3672     {
3673       int id;
3674
3675       c = *charbuf++;
3676       if (c == '\n')
3677         break;
3678       charset = char_charset (c, charset_list, NULL);
3679       id = CHARSET_ID (charset);
3680       reg = CODING_ISO_REQUEST (coding, id);
3681       if (reg >= 0 && r[reg] < 0)
3682         {
3683           found++;
3684           r[reg] = id;
3685         }
3686     }
3687
3688   if (found)
3689     {
3690       for (reg = 0; reg < 4; reg++)
3691         if (r[reg] >= 0
3692             && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
3693           ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
3694     }
3695
3696   return dst;
3697 }
3698
3699 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".  */
3700
3701 static int
3702 encode_coding_iso_2022 (coding)
3703      struct coding_system *coding;
3704 {
3705   int multibytep = coding->dst_multibyte;
3706   int *charbuf = coding->charbuf;
3707   int *charbuf_end = charbuf + coding->charbuf_used;
3708   unsigned char *dst = coding->destination + coding->produced;
3709   unsigned char *dst_end = coding->destination + coding->dst_bytes;
3710   int safe_room = 16;
3711   int bol_designation
3712     = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
3713        && CODING_ISO_BOL (coding));
3714   int produced_chars = 0;
3715   Lisp_Object attrs, eol_type, charset_list;
3716   int ascii_compatible;
3717   int c;
3718   int preferred_charset_id = -1;
3719
3720   CODING_GET_INFO (coding, attrs, charset_list);
3721   eol_type = CODING_ID_EOL_TYPE (coding->id);
3722   if (VECTORP (eol_type))
3723     eol_type = Qunix;
3724
3725   setup_iso_safe_charsets (attrs);
3726   /* Charset list may have been changed.  */
3727   charset_list = CODING_ATTR_CHARSET_LIST (attrs);              \
3728   coding->safe_charsets = (char *) SDATA (CODING_ATTR_SAFE_CHARSETS(attrs));
3729
3730   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
3731
3732   while (charbuf < charbuf_end)
3733     {
3734       ASSURE_DESTINATION (safe_room);
3735
3736       if (bol_designation)
3737         {
3738           unsigned char *dst_prev = dst;
3739
3740           /* We have to produce designation sequences if any now.  */
3741           dst = encode_designation_at_bol (coding, charbuf, charbuf_end, dst);
3742           bol_designation = 0;
3743           /* We are sure that designation sequences are all ASCII bytes.  */
3744           produced_chars += dst - dst_prev;
3745         }
3746
3747       c = *charbuf++;
3748
3749       if (c < 0)
3750         {
3751           /* Handle an annotation.  */
3752           switch (*charbuf)
3753             {
3754             case CODING_ANNOTATE_COMPOSITION_MASK:
3755               /* Not yet implemented.  */
3756               break;
3757             case CODING_ANNOTATE_CHARSET_MASK:
3758               preferred_charset_id = charbuf[3];
3759               if (preferred_charset_id >= 0
3760                   && NILP (Fmemq (make_number (preferred_charset_id),
3761                                   charset_list)))
3762                 preferred_charset_id = -1;
3763               break;
3764             default:
3765               abort ();
3766             }
3767           charbuf += -c - 1;
3768           continue;
3769         }
3770
3771       /* Now encode the character C.  */
3772       if (c < 0x20 || c == 0x7F)
3773         {
3774           if (c == '\n'
3775               || (c == '\r' && EQ (eol_type, Qmac)))
3776             {
3777               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
3778                 ENCODE_RESET_PLANE_AND_REGISTER ();
3779               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
3780                 {
3781                   int i;
3782
3783                   for (i = 0; i < 4; i++)
3784                     CODING_ISO_DESIGNATION (coding, i)
3785                       = CODING_ISO_INITIAL (coding, i);
3786                 }
3787               bol_designation
3788                 = CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL;
3789             }
3790           else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
3791             ENCODE_RESET_PLANE_AND_REGISTER ();
3792           EMIT_ONE_ASCII_BYTE (c);
3793         }
3794       else if (ASCII_CHAR_P (c))
3795         {
3796           if (ascii_compatible)
3797             EMIT_ONE_ASCII_BYTE (c);
3798           else
3799             {
3800               struct charset *charset = CHARSET_FROM_ID (charset_ascii);
3801               ENCODE_ISO_CHARACTER (charset, c);
3802             }
3803         }
3804       else if (CHAR_BYTE8_P (c))
3805         {
3806           c = CHAR_TO_BYTE8 (c);
3807           EMIT_ONE_BYTE (c);
3808         }
3809       else
3810         {
3811           struct charset *charset;
3812
3813           if (preferred_charset_id >= 0)
3814             {
3815               charset = CHARSET_FROM_ID (preferred_charset_id);
3816               if (! CHAR_CHARSET_P (c, charset))
3817                 charset = char_charset (c, charset_list, NULL);
3818             }
3819           else
3820             charset = char_charset (c, charset_list, NULL);
3821           if (!charset)
3822             {
3823               if (coding->mode & CODING_MODE_SAFE_ENCODING)
3824                 {
3825                   c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
3826                   charset = CHARSET_FROM_ID (charset_ascii);
3827                 }
3828               else
3829                 {
3830                   c = coding->default_char;
3831                   charset = char_charset (c, charset_list, NULL);
3832                 }
3833             }
3834           ENCODE_ISO_CHARACTER (charset, c);
3835         }
3836     }
3837
3838   if (coding->mode & CODING_MODE_LAST_BLOCK
3839       && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
3840     {
3841       ASSURE_DESTINATION (safe_room);
3842       ENCODE_RESET_PLANE_AND_REGISTER ();
3843     }
3844   record_conversion_result (coding, CODING_RESULT_SUCCESS);
3845   CODING_ISO_BOL (coding) = bol_designation;
3846   coding->produced_char += produced_chars;
3847   coding->produced = dst - coding->destination;
3848   return 0;
3849 }
3850
3851 \f
3852 /*** 8,9. SJIS and BIG5 handlers ***/
3853
3854 /* Although SJIS and BIG5 are not ISO's coding system, they are used
3855    quite widely.  So, for the moment, Emacs supports them in the bare
3856    C code.  But, in the future, they may be supported only by CCL.  */
3857
3858 /* SJIS is a coding system encoding three character sets: ASCII, right
3859    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
3860    as is.  A character of charset katakana-jisx0201 is encoded by
3861    "position-code + 0x80".  A character of charset japanese-jisx0208
3862    is encoded in 2-byte but two position-codes are divided and shifted
3863    so that it fit in the range below.
3864
3865    --- CODE RANGE of SJIS ---
3866    (character set)      (range)
3867    ASCII                0x00 .. 0x7F
3868    KATAKANA-JISX0201    0xA0 .. 0xDF
3869    JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
3870             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
3871    -------------------------------
3872
3873 */
3874
3875 /* BIG5 is a coding system encoding two character sets: ASCII and
3876    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
3877    character set and is encoded in two-byte.
3878
3879    --- CODE RANGE of BIG5 ---
3880    (character set)      (range)
3881    ASCII                0x00 .. 0x7F
3882    Big5 (1st byte)      0xA1 .. 0xFE
3883         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
3884    --------------------------
3885
3886   */
3887
3888 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3889    Check if a text is encoded in SJIS.  If it is, return
3890    CATEGORY_MASK_SJIS, else return 0.  */
3891
3892 static int
3893 detect_coding_sjis (coding, detect_info)
3894      struct coding_system *coding;
3895      struct coding_detection_info *detect_info;
3896 {
3897   const unsigned char *src = coding->source, *src_base;
3898   const unsigned char *src_end = coding->source + coding->src_bytes;
3899   int multibytep = coding->src_multibyte;
3900   int consumed_chars = 0;
3901   int found = 0;
3902   int c;
3903
3904   detect_info->checked |= CATEGORY_MASK_SJIS;
3905   /* A coding system of this category is always ASCII compatible.  */
3906   src += coding->head_ascii;
3907
3908   while (1)
3909     {
3910       src_base = src;
3911       ONE_MORE_BYTE (c);
3912       if (c < 0x80)
3913         continue;
3914       if ((c >= 0x81 && c <= 0x9F) || (c >= 0xE0 && c <= 0xEF))
3915         {
3916           ONE_MORE_BYTE (c);
3917           if (c < 0x40 || c == 0x7F || c > 0xFC)
3918             break;
3919           found = CATEGORY_MASK_SJIS;
3920         }
3921       else if (c >= 0xA0 && c < 0xE0)
3922         found = CATEGORY_MASK_SJIS;
3923       else
3924         break;
3925     }
3926   detect_info->rejected |= CATEGORY_MASK_SJIS;
3927   return 0;
3928
3929  no_more_source:
3930   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
3931     {
3932       detect_info->rejected |= CATEGORY_MASK_SJIS;
3933       return 0;
3934     }
3935   detect_info->found |= found;
3936   return 1;
3937 }
3938
3939 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3940    Check if a text is encoded in BIG5.  If it is, return
3941    CATEGORY_MASK_BIG5, else return 0.  */
3942
3943 static int
3944 detect_coding_big5 (coding, detect_info)
3945      struct coding_system *coding;
3946      struct coding_detection_info *detect_info;
3947 {
3948   const unsigned char *src = coding->source, *src_base;
3949   const unsigned char *src_end = coding->source + coding->src_bytes;
3950   int multibytep = coding->src_multibyte;
3951   int consumed_chars = 0;
3952   int found = 0;
3953   int c;
3954
3955   detect_info->checked |= CATEGORY_MASK_BIG5;
3956   /* A coding system of this category is always ASCII compatible.  */
3957   src += coding->head_ascii;
3958
3959   while (1)
3960     {
3961       src_base = src;
3962       ONE_MORE_BYTE (c);
3963       if (c < 0x80)
3964         continue;
3965       if (c >= 0xA1)
3966         {
3967           ONE_MORE_BYTE (c);
3968           if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
3969             return 0;
3970           found = CATEGORY_MASK_BIG5;
3971         }
3972       else
3973         break;
3974     }
3975   detect_info->rejected |= CATEGORY_MASK_BIG5;
3976   return 0;
3977
3978  no_more_source:
3979   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
3980     {
3981       detect_info->rejected |= CATEGORY_MASK_BIG5;
3982       return 0;
3983     }
3984   detect_info->found |= found;
3985   return 1;
3986 }
3987
3988 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
3989    If SJIS_P is 1, decode SJIS text, else decode BIG5 test.  */
3990
3991 static void
3992 decode_coding_sjis (coding)
3993      struct coding_system *coding;
3994 {
3995   const unsigned char *src = coding->source + coding->consumed;
3996   const unsigned char *src_end = coding->source + coding->src_bytes;
3997   const unsigned char *src_base;
3998   int *charbuf = coding->charbuf;
3999   int *charbuf_end = charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
4000   int consumed_chars = 0, consumed_chars_base;
4001   int multibytep = coding->src_multibyte;
4002   struct charset *charset_roman, *charset_kanji, *charset_kana;
4003   struct charset *charset_kanji2;
4004   Lisp_Object attrs, charset_list, val;
4005   int char_offset = coding->produced_char;
4006   int last_offset = char_offset;
4007   int last_id = charset_ascii;
4008
4009   CODING_GET_INFO (coding, attrs, charset_list);
4010
4011   val = charset_list;
4012   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4013   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4014   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4015   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4016
4017   while (1)
4018     {
4019       int c, c1;
4020       struct charset *charset;
4021
4022       src_base = src;
4023       consumed_chars_base = consumed_chars;
4024
4025       if (charbuf >= charbuf_end)
4026         break;
4027
4028       ONE_MORE_BYTE (c);
4029       if (c < 0)
4030         goto invalid_code;
4031       if (c < 0x80)
4032         charset = charset_roman;
4033       else if (c == 0x80 || c == 0xA0)
4034         goto invalid_code;
4035       else if (c >= 0xA1 && c <= 0xDF)
4036         {
4037           /* SJIS -> JISX0201-Kana */
4038           c &= 0x7F;
4039           charset = charset_kana;
4040         }
4041       else if (c <= 0xEF)
4042         {
4043           /* SJIS -> JISX0208 */
4044           ONE_MORE_BYTE (c1);
4045           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4046             goto invalid_code;
4047           c = (c << 8) | c1;
4048           SJIS_TO_JIS (c);
4049           charset = charset_kanji;
4050         }
4051       else if (c <= 0xFC && charset_kanji2)
4052         {
4053           /* SJIS -> JISX0213-2 */
4054           ONE_MORE_BYTE (c1);
4055           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4056             goto invalid_code;
4057           c = (c << 8) | c1;
4058           SJIS_TO_JIS2 (c);
4059           charset = charset_kanji2;
4060         }
4061       else
4062         goto invalid_code;
4063       if (charset->id != charset_ascii
4064           && last_id != charset->id)
4065         {
4066           if (last_id != charset_ascii)
4067             ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4068           last_id = charset->id;
4069           last_offset = char_offset;
4070         }
4071       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4072       *charbuf++ = c;
4073       char_offset++;
4074       continue;
4075
4076     invalid_code:
4077       src = src_base;
4078       consumed_chars = consumed_chars_base;
4079       ONE_MORE_BYTE (c);
4080       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4081       char_offset++;
4082       coding->errors++;
4083     }
4084
4085  no_more_source:
4086   if (last_id != charset_ascii)
4087     ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4088   coding->consumed_char += consumed_chars_base;
4089   coding->consumed = src_base - coding->source;
4090   coding->charbuf_used = charbuf - coding->charbuf;
4091 }
4092
4093 static void
4094 decode_coding_big5 (coding)
4095      struct coding_system *coding;
4096 {
4097   const unsigned char *src = coding->source + coding->consumed;
4098   const unsigned char *src_end = coding->source + coding->src_bytes;
4099   const unsigned char *src_base;
4100   int *charbuf = coding->charbuf;
4101   int *charbuf_end = charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
4102   int consumed_chars = 0, consumed_chars_base;
4103   int multibytep = coding->src_multibyte;
4104   struct charset *charset_roman, *charset_big5;
4105   Lisp_Object attrs, charset_list, val;
4106   int char_offset = coding->produced_char;
4107   int last_offset = char_offset;
4108   int last_id = charset_ascii;
4109
4110   CODING_GET_INFO (coding, attrs, charset_list);
4111   val = charset_list;
4112   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4113   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4114
4115   while (1)
4116     {
4117       int c, c1;
4118       struct charset *charset;
4119
4120       src_base = src;
4121       consumed_chars_base = consumed_chars;
4122
4123       if (charbuf >= charbuf_end)
4124         break;
4125
4126       ONE_MORE_BYTE (c);
4127
4128       if (c < 0)
4129         goto invalid_code;
4130       if (c < 0x80)
4131         charset = charset_roman;
4132       else
4133         {
4134           /* BIG5 -> Big5 */
4135           if (c < 0xA1 || c > 0xFE)
4136             goto invalid_code;
4137           ONE_MORE_BYTE (c1);
4138           if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4139             goto invalid_code;
4140           c = c << 8 | c1;
4141           charset = charset_big5;
4142         }
4143       if (charset->id != charset_ascii
4144           && last_id != charset->id)
4145         {
4146           if (last_id != charset_ascii)
4147             ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4148           last_id = charset->id;
4149           last_offset = char_offset;
4150         }
4151       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4152       *charbuf++ = c;
4153       char_offset++;
4154       continue;
4155
4156     invalid_code:
4157       src = src_base;
4158       consumed_chars = consumed_chars_base;
4159       ONE_MORE_BYTE (c);
4160       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4161       char_offset++;
4162       coding->errors++;
4163     }
4164
4165  no_more_source:
4166   if (last_id != charset_ascii)
4167     ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4168   coding->consumed_char += consumed_chars_base;
4169   coding->consumed = src_base - coding->source;
4170   coding->charbuf_used = charbuf - coding->charbuf;
4171 }
4172
4173 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4174    This function can encode charsets `ascii', `katakana-jisx0201',
4175    `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'.  We
4176    are sure that all these charsets are registered as official charset
4177    (i.e. do not have extended leading-codes).  Characters of other
4178    charsets are produced without any encoding.  If SJIS_P is 1, encode
4179    SJIS text, else encode BIG5 text.  */
4180
4181 static int
4182 encode_coding_sjis (coding)
4183      struct coding_system *coding;
4184 {
4185   int multibytep = coding->dst_multibyte;
4186   int *charbuf = coding->charbuf;
4187   int *charbuf_end = charbuf + coding->charbuf_used;
4188   unsigned char *dst = coding->destination + coding->produced;
4189   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4190   int safe_room = 4;
4191   int produced_chars = 0;
4192   Lisp_Object attrs, charset_list, val;
4193   int ascii_compatible;
4194   struct charset *charset_roman, *charset_kanji, *charset_kana;
4195   struct charset *charset_kanji2;
4196   int c;
4197
4198   CODING_GET_INFO (coding, attrs, charset_list);
4199   val = charset_list;
4200   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4201   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4202   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4203   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4204
4205   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4206
4207   while (charbuf < charbuf_end)
4208     {
4209       ASSURE_DESTINATION (safe_room);
4210       c = *charbuf++;
4211       /* Now encode the character C.  */
4212       if (ASCII_CHAR_P (c) && ascii_compatible)
4213         EMIT_ONE_ASCII_BYTE (c);
4214       else if (CHAR_BYTE8_P (c))
4215         {
4216           c = CHAR_TO_BYTE8 (c);
4217           EMIT_ONE_BYTE (c);
4218         }
4219       else
4220         {
4221           unsigned code;
4222           struct charset *charset = char_charset (c, charset_list, &code);
4223
4224           if (!charset)
4225             {
4226               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4227                 {
4228                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4229                   charset = CHARSET_FROM_ID (charset_ascii);
4230                 }
4231               else
4232                 {
4233                   c = coding->default_char;
4234                   charset = char_charset (c, charset_list, &code);
4235                 }
4236             }
4237           if (code == CHARSET_INVALID_CODE (charset))
4238             abort ();
4239           if (charset == charset_kanji)
4240             {
4241               int c1, c2;
4242               JIS_TO_SJIS (code);
4243               c1 = code >> 8, c2 = code & 0xFF;
4244               EMIT_TWO_BYTES (c1, c2);
4245             }
4246           else if (charset == charset_kana)
4247             EMIT_ONE_BYTE (code | 0x80);
4248           else if (charset_kanji2 && charset == charset_kanji2)
4249             {
4250               int c1, c2;
4251
4252               c1 = code >> 8;
4253               if (c1 == 0x21 || (c1 >= 0x23 && c1 < 0x25)
4254                   || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
4255                 {
4256                   JIS_TO_SJIS2 (code);
4257                   c1 = code >> 8, c2 = code & 0xFF;
4258                   EMIT_TWO_BYTES (c1, c2);
4259                 }
4260               else
4261                 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4262             }
4263           else
4264             EMIT_ONE_ASCII_BYTE (code & 0x7F);
4265         }
4266     }
4267   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4268   coding->produced_char += produced_chars;
4269   coding->produced = dst - coding->destination;
4270   return 0;
4271 }
4272
4273 static int
4274 encode_coding_big5 (coding)
4275      struct coding_system *coding;
4276 {
4277   int multibytep = coding->dst_multibyte;
4278   int *charbuf = coding->charbuf;
4279   int *charbuf_end = charbuf + coding->charbuf_used;
4280   unsigned char *dst = coding->destination + coding->produced;
4281   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4282   int safe_room = 4;
4283   int produced_chars = 0;
4284   Lisp_Object attrs, charset_list, val;
4285   int ascii_compatible;
4286   struct charset *charset_roman, *charset_big5;
4287   int c;
4288
4289   CODING_GET_INFO (coding, attrs, charset_list);
4290   val = charset_list;
4291   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4292   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4293   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4294
4295   while (charbuf < charbuf_end)
4296     {
4297       ASSURE_DESTINATION (safe_room);
4298       c = *charbuf++;
4299       /* Now encode the character C.  */
4300       if (ASCII_CHAR_P (c) && ascii_compatible)
4301         EMIT_ONE_ASCII_BYTE (c);
4302       else if (CHAR_BYTE8_P (c))
4303         {
4304           c = CHAR_TO_BYTE8 (c);
4305           EMIT_ONE_BYTE (c);
4306         }
4307       else
4308         {
4309           unsigned code;
4310           struct charset *charset = char_charset (c, charset_list, &code);
4311
4312           if (! charset)
4313             {
4314               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4315                 {
4316                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4317                   charset = CHARSET_FROM_ID (charset_ascii);
4318                 }
4319               else
4320                 {
4321                   c = coding->default_char;
4322                   charset = char_charset (c, charset_list, &code);
4323                 }
4324             }
4325           if (code == CHARSET_INVALID_CODE (charset))
4326             abort ();
4327           if (charset == charset_big5)
4328             {
4329               int c1, c2;
4330
4331               c1 = code >> 8, c2 = code & 0xFF;
4332               EMIT_TWO_BYTES (c1, c2);
4333             }
4334           else
4335             EMIT_ONE_ASCII_BYTE (code & 0x7F);
4336         }
4337     }
4338   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4339   coding->produced_char += produced_chars;
4340   coding->produced = dst - coding->destination;
4341   return 0;
4342 }
4343
4344 \f
4345 /*** 10. CCL handlers ***/
4346
4347 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4348    Check if a text is encoded in a coding system of which
4349    encoder/decoder are written in CCL program.  If it is, return
4350    CATEGORY_MASK_CCL, else return 0.  */
4351
4352 static int
4353 detect_coding_ccl (coding, detect_info)
4354      struct coding_system *coding;
4355      struct coding_detection_info *detect_info;
4356 {
4357   const unsigned char *src = coding->source, *src_base;
4358   const unsigned char *src_end = coding->source + coding->src_bytes;
4359   int multibytep = coding->src_multibyte;
4360   int consumed_chars = 0;
4361   int found = 0;
4362   unsigned char *valids;
4363   int head_ascii = coding->head_ascii;
4364   Lisp_Object attrs;
4365
4366   detect_info->checked |= CATEGORY_MASK_CCL;
4367
4368   coding = &coding_categories[coding_category_ccl];
4369   valids = CODING_CCL_VALIDS (coding);
4370   attrs = CODING_ID_ATTRS (coding->id);
4371   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
4372     src += head_ascii;
4373
4374   while (1)
4375     {
4376       int c;
4377
4378       src_base = src;
4379       ONE_MORE_BYTE (c);
4380       if (c < 0 || ! valids[c])
4381         break;
4382       if ((valids[c] > 1))
4383         found = CATEGORY_MASK_CCL;
4384     }
4385   detect_info->rejected |= CATEGORY_MASK_CCL;
4386   return 0;
4387
4388  no_more_source:
4389   detect_info->found |= found;
4390   return 1;
4391 }
4392
4393 static void
4394 decode_coding_ccl (coding)
4395      struct coding_system *coding;
4396 {
4397   const unsigned char *src = coding->source + coding->consumed;
4398   const unsigned char *src_end = coding->source + coding->src_bytes;
4399   int *charbuf = coding->charbuf;
4400   int *charbuf_end = charbuf + coding->charbuf_size;
4401   int consumed_chars = 0;
4402   int multibytep = coding->src_multibyte;
4403   struct ccl_program ccl;
4404   int source_charbuf[1024];
4405   int source_byteidx[1024];
4406   Lisp_Object attrs, charset_list;
4407
4408   CODING_GET_INFO (coding, attrs, charset_list);
4409   setup_ccl_program (&ccl, CODING_CCL_DECODER (coding));
4410
4411   while (src < src_end)
4412     {
4413       const unsigned char *p = src;
4414       int *source, *source_end;
4415       int i = 0;
4416
4417       if (multibytep)
4418         while (i < 1024 && p < src_end)
4419           {
4420             source_byteidx[i] = p - src;
4421             source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
4422           }
4423       else
4424         while (i < 1024 && p < src_end)
4425           source_charbuf[i++] = *p++;
4426
4427       if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
4428         ccl.last_block = 1;
4429
4430       source = source_charbuf;
4431       source_end = source + i;
4432       while (source < source_end)
4433         {
4434           ccl_driver (&ccl, source, charbuf,
4435                       source_end - source, charbuf_end - charbuf,
4436                       charset_list);
4437           source += ccl.consumed;
4438           charbuf += ccl.produced;
4439           if (ccl.status != CCL_STAT_SUSPEND_BY_DST)
4440             break;
4441         }
4442       if (source < source_end)
4443         src += source_byteidx[source - source_charbuf];
4444       else
4445         src = p;
4446       consumed_chars += source - source_charbuf;
4447
4448       if (ccl.status != CCL_STAT_SUSPEND_BY_SRC
4449           && ccl.status != CODING_RESULT_INSUFFICIENT_SRC)
4450         break;
4451     }
4452
4453   switch (ccl.status)
4454     {
4455     case CCL_STAT_SUSPEND_BY_SRC:
4456       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
4457       break;
4458     case CCL_STAT_SUSPEND_BY_DST:
4459       break;
4460     case CCL_STAT_QUIT:
4461     case CCL_STAT_INVALID_CMD:
4462       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
4463       break;
4464     default:
4465       record_conversion_result (coding, CODING_RESULT_SUCCESS);
4466       break;
4467     }
4468   coding->consumed_char += consumed_chars;
4469   coding->consumed = src - coding->source;
4470   coding->charbuf_used = charbuf - coding->charbuf;
4471 }
4472
4473 static int
4474 encode_coding_ccl (coding)
4475      struct coding_system *coding;
4476 {
4477   struct ccl_program ccl;
4478   int multibytep = coding->dst_multibyte;
4479   int *charbuf = coding->charbuf;
4480   int *charbuf_end = charbuf + coding->charbuf_used;
4481   unsigned char *dst = coding->destination + coding->produced;
4482   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4483   unsigned char *adjusted_dst_end = dst_end - 1;
4484   int destination_charbuf[1024];
4485   int i, produced_chars = 0;
4486   Lisp_Object attrs, charset_list;
4487
4488   CODING_GET_INFO (coding, attrs, charset_list);
4489   setup_ccl_program (&ccl, CODING_CCL_ENCODER (coding));
4490
4491   ccl.last_block = coding->mode & CODING_MODE_LAST_BLOCK;
4492   ccl.dst_multibyte = coding->dst_multibyte;
4493
4494   while (charbuf < charbuf_end && dst < adjusted_dst_end)
4495     {
4496       int dst_bytes = dst_end - dst;
4497       if (dst_bytes > 1024)
4498         dst_bytes = 1024;
4499
4500       ccl_driver (&ccl, charbuf, destination_charbuf,
4501                   charbuf_end - charbuf, dst_bytes, charset_list);
4502       charbuf += ccl.consumed;
4503       if (multibytep)
4504         for (i = 0; i < ccl.produced; i++)
4505           EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
4506       else
4507         {
4508           for (i = 0; i < ccl.produced; i++)
4509             *dst++ = destination_charbuf[i] & 0xFF;
4510           produced_chars += ccl.produced;
4511         }
4512     }
4513
4514   switch (ccl.status)
4515     {
4516     case CCL_STAT_SUSPEND_BY_SRC:
4517       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
4518       break;
4519     case CCL_STAT_SUSPEND_BY_DST:
4520       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
4521       break;
4522     case CCL_STAT_QUIT:
4523     case CCL_STAT_INVALID_CMD:
4524       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
4525       break;
4526     default:
4527       record_conversion_result (coding, CODING_RESULT_SUCCESS);
4528       break;
4529     }
4530
4531   coding->produced_char += produced_chars;
4532   coding->produced = dst - coding->destination;
4533   return 0;
4534 }
4535
4536
4537 \f
4538 /*** 10, 11. no-conversion handlers ***/
4539
4540 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
4541
4542 static void
4543 decode_coding_raw_text (coding)
4544      struct coding_system *coding;
4545 {
4546   coding->chars_at_source = 1;
4547   coding->consumed_char = 0;
4548   coding->consumed = 0;
4549   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4550 }
4551
4552 static int
4553 encode_coding_raw_text (coding)
4554      struct coding_system *coding;
4555 {
4556   int multibytep = coding->dst_multibyte;
4557   int *charbuf = coding->charbuf;
4558   int *charbuf_end = coding->charbuf + coding->charbuf_used;
4559   unsigned char *dst = coding->destination + coding->produced;
4560   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4561   int produced_chars = 0;
4562   int c;
4563
4564   if (multibytep)
4565     {
4566       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
4567
4568       if (coding->src_multibyte)
4569         while (charbuf < charbuf_end)
4570           {
4571             ASSURE_DESTINATION (safe_room);
4572             c = *charbuf++;
4573             if (ASCII_CHAR_P (c))
4574               EMIT_ONE_ASCII_BYTE (c);
4575             else if (CHAR_BYTE8_P (c))
4576               {
4577                 c = CHAR_TO_BYTE8 (c);
4578                 EMIT_ONE_BYTE (c);
4579               }
4580             else
4581               {
4582                 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
4583
4584                 CHAR_STRING_ADVANCE (c, p1);
4585                 while (p0 < p1)
4586                   {
4587                     EMIT_ONE_BYTE (*p0);
4588                     p0++;
4589                   }
4590               }
4591           }
4592       else
4593         while (charbuf < charbuf_end)
4594           {
4595             ASSURE_DESTINATION (safe_room);
4596             c = *charbuf++;
4597             EMIT_ONE_BYTE (c);
4598           }
4599     }
4600   else
4601     {
4602       if (coding->src_multibyte)
4603         {
4604           int safe_room = MAX_MULTIBYTE_LENGTH;
4605
4606           while (charbuf < charbuf_end)
4607             {
4608               ASSURE_DESTINATION (safe_room);
4609               c = *charbuf++;
4610               if (ASCII_CHAR_P (c))
4611                 *dst++ = c;
4612               else if (CHAR_BYTE8_P (c))
4613                 *dst++ = CHAR_TO_BYTE8 (c);
4614               else
4615                 CHAR_STRING_ADVANCE (c, dst);
4616               produced_chars++;
4617             }
4618         }
4619       else
4620         {
4621           ASSURE_DESTINATION (charbuf_end - charbuf);
4622           while (charbuf < charbuf_end && dst < dst_end)
4623             *dst++ = *charbuf++;
4624           produced_chars = dst - (coding->destination + coding->dst_bytes);
4625         }
4626     }
4627   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4628   coding->produced_char += produced_chars;
4629   coding->produced = dst - coding->destination;
4630   return 0;
4631 }
4632
4633 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4634    Check if a text is encoded in a charset-based coding system.  If it
4635    is, return 1, else return 0.  */
4636
4637 static int
4638 detect_coding_charset (coding, detect_info)
4639      struct coding_system *coding;
4640      struct coding_detection_info *detect_info;
4641 {
4642   const unsigned char *src = coding->source, *src_base;
4643   const unsigned char *src_end = coding->source + coding->src_bytes;
4644   int multibytep = coding->src_multibyte;
4645   int consumed_chars = 0;
4646   Lisp_Object attrs, valids;
4647   int found = 0;
4648
4649   detect_info->checked |= CATEGORY_MASK_CHARSET;
4650
4651   coding = &coding_categories[coding_category_charset];
4652   attrs = CODING_ID_ATTRS (coding->id);
4653   valids = AREF (attrs, coding_attr_charset_valids);
4654
4655   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
4656     src += coding->head_ascii;
4657
4658   while (1)
4659     {
4660       int c;
4661
4662       src_base = src;
4663       ONE_MORE_BYTE (c);
4664       if (c < 0)
4665         continue;
4666       if (NILP (AREF (valids, c)))
4667         break;
4668       if (c >= 0x80)
4669         found = CATEGORY_MASK_CHARSET;
4670     }
4671   detect_info->rejected |= CATEGORY_MASK_CHARSET;
4672   return 0;
4673
4674  no_more_source:
4675   detect_info->found |= found;
4676   return 1;
4677 }
4678
4679 static void
4680 decode_coding_charset (coding)
4681      struct coding_system *coding;
4682 {
4683   const unsigned char *src = coding->source + coding->consumed;
4684   const unsigned char *src_end = coding->source + coding->src_bytes;
4685   const unsigned char *src_base;
4686   int *charbuf = coding->charbuf;
4687   int *charbuf_end = charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
4688   int consumed_chars = 0, consumed_chars_base;
4689   int multibytep = coding->src_multibyte;
4690   Lisp_Object attrs, charset_list, valids;
4691   int char_offset = coding->produced_char;
4692   int last_offset = char_offset;
4693   int last_id = charset_ascii;
4694
4695   CODING_GET_INFO (coding, attrs, charset_list);
4696   valids = AREF (attrs, coding_attr_charset_valids);
4697
4698   while (1)
4699     {
4700       int c;
4701       Lisp_Object val;
4702       struct charset *charset;
4703       int dim;
4704       int len = 1;
4705       unsigned code;
4706
4707       src_base = src;
4708       consumed_chars_base = consumed_chars;
4709
4710       if (charbuf >= charbuf_end)
4711         break;
4712
4713       ONE_MORE_BYTE (c);
4714       if (c < 0)
4715         goto invalid_code;
4716       code = c;
4717
4718       val = AREF (valids, c);
4719       if (NILP (val))
4720         goto invalid_code;
4721       if (INTEGERP (val))
4722         {
4723           charset = CHARSET_FROM_ID (XFASTINT (val));
4724           dim = CHARSET_DIMENSION (charset);
4725           while (len < dim)
4726             {
4727               ONE_MORE_BYTE (c);
4728               code = (code << 8) | c;
4729               len++;
4730             }
4731           CODING_DECODE_CHAR (coding, src, src_base, src_end,
4732                               charset, code, c);
4733         }
4734       else
4735         {
4736           /* VAL is a list of charset IDs.  It is assured that the
4737              list is sorted by charset dimensions (smaller one
4738              comes first).  */
4739           while (CONSP (val))
4740             {
4741               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
4742               dim = CHARSET_DIMENSION (charset);
4743               while (len < dim)
4744                 {
4745                   ONE_MORE_BYTE (c);
4746                   code = (code << 8) | c;
4747                   len++;
4748                 }
4749               CODING_DECODE_CHAR (coding, src, src_base,
4750                                   src_end, charset, code, c);
4751               if (c >= 0)
4752                 break;
4753               val = XCDR (val);
4754             }
4755         }
4756       if (c < 0)
4757         goto invalid_code;
4758       if (charset->id != charset_ascii
4759           && last_id != charset->id)
4760         {
4761           if (last_id != charset_ascii)
4762             ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4763           last_id = charset->id;
4764           last_offset = char_offset;
4765         }
4766
4767       *charbuf++ = c;
4768       char_offset++;
4769       continue;
4770
4771     invalid_code:
4772       src = src_base;
4773       consumed_chars = consumed_chars_base;
4774       ONE_MORE_BYTE (c);
4775       *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
4776       char_offset++;
4777       coding->errors++;
4778     }
4779
4780  no_more_source:
4781   if (last_id != charset_ascii)
4782     ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4783   coding->consumed_char += consumed_chars_base;
4784   coding->consumed = src_base - coding->source;
4785   coding->charbuf_used = charbuf - coding->charbuf;
4786 }
4787
4788 static int
4789 encode_coding_charset (coding)
4790      struct coding_system *coding;
4791 {
4792   int multibytep = coding->dst_multibyte;
4793   int *charbuf = coding->charbuf;
4794   int *charbuf_end = charbuf + coding->charbuf_used;
4795   unsigned char *dst = coding->destination + coding->produced;
4796   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4797   int safe_room = MAX_MULTIBYTE_LENGTH;
4798   int produced_chars = 0;
4799   Lisp_Object attrs, charset_list;
4800   int ascii_compatible;
4801   int c;
4802
4803   CODING_GET_INFO (coding, attrs, charset_list);
4804   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4805
4806   while (charbuf < charbuf_end)
4807     {
4808       struct charset *charset;
4809       unsigned code;
4810
4811       ASSURE_DESTINATION (safe_room);
4812       c = *charbuf++;
4813       if (ascii_compatible && ASCII_CHAR_P (c))
4814         EMIT_ONE_ASCII_BYTE (c);
4815       else if (CHAR_BYTE8_P (c))
4816         {
4817           c = CHAR_TO_BYTE8 (c);
4818           EMIT_ONE_BYTE (c);
4819         }
4820       else
4821         {
4822           charset = char_charset (c, charset_list, &code);
4823           if (charset)
4824             {
4825               if (CHARSET_DIMENSION (charset) == 1)
4826                 EMIT_ONE_BYTE (code);
4827               else if (CHARSET_DIMENSION (charset) == 2)
4828                 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
4829               else if (CHARSET_DIMENSION (charset) == 3)
4830                 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
4831               else
4832                 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
4833                                  (code >> 8) & 0xFF, code & 0xFF);
4834             }
4835           else
4836             {
4837               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4838                 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4839               else
4840                 c = coding->default_char;
4841               EMIT_ONE_BYTE (c);
4842             }
4843         }
4844     }
4845
4846   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4847   coding->produced_char += produced_chars;
4848   coding->produced = dst - coding->destination;
4849   return 0;
4850 }
4851
4852 \f
4853 /*** 7. C library functions ***/
4854
4855 /* Setup coding context CODING from information about CODING_SYSTEM.
4856    If CODING_SYSTEM is nil, `no-conversion' is assumed.  If
4857    CODING_SYSTEM is invalid, signal an error.  */
4858
4859 void
4860 setup_coding_system (coding_system, coding)
4861      Lisp_Object coding_system;
4862      struct coding_system *coding;
4863 {
4864   Lisp_Object attrs;
4865   Lisp_Object eol_type;
4866   Lisp_Object coding_type;
4867   Lisp_Object val;
4868
4869   if (NILP (coding_system))
4870     coding_system = Qno_conversion;
4871
4872   CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
4873
4874   attrs = CODING_ID_ATTRS (coding->id);
4875   eol_type = CODING_ID_EOL_TYPE (coding->id);
4876
4877   coding->mode = 0;
4878   coding->head_ascii = -1;
4879   coding->common_flags
4880     = (VECTORP (eol_type) ? CODING_REQUIRE_DETECTION_MASK : 0);
4881   if (! NILP (CODING_ATTR_POST_READ (attrs)))
4882     coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
4883   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
4884     coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
4885   if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
4886     coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
4887
4888   val = CODING_ATTR_SAFE_CHARSETS (attrs);
4889   coding->max_charset_id = SCHARS (val) - 1;
4890   coding->safe_charsets = (char *) SDATA (val);
4891   coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
4892
4893   coding_type = CODING_ATTR_TYPE (attrs);
4894   if (EQ (coding_type, Qundecided))
4895     {
4896       coding->detector = NULL;
4897       coding->decoder = decode_coding_raw_text;
4898       coding->encoder = encode_coding_raw_text;
4899       coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
4900     }
4901   else if (EQ (coding_type, Qiso_2022))
4902     {
4903       int i;
4904       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
4905
4906       /* Invoke graphic register 0 to plane 0.  */
4907       CODING_ISO_INVOCATION (coding, 0) = 0;
4908       /* Invoke graphic register 1 to plane 1 if we can use 8-bit.  */
4909       CODING_ISO_INVOCATION (coding, 1)
4910         = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
4911       /* Setup the initial status of designation.  */
4912       for (i = 0; i < 4; i++)
4913         CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
4914       /* Not single shifting initially.  */
4915       CODING_ISO_SINGLE_SHIFTING (coding) = 0;
4916       /* Beginning of buffer should also be regarded as bol. */
4917       CODING_ISO_BOL (coding) = 1;
4918       coding->detector = detect_coding_iso_2022;
4919       coding->decoder = decode_coding_iso_2022;
4920       coding->encoder = encode_coding_iso_2022;
4921       if (flags & CODING_ISO_FLAG_SAFE)
4922         coding->mode |= CODING_MODE_SAFE_ENCODING;
4923       coding->common_flags
4924         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
4925             | CODING_REQUIRE_FLUSHING_MASK);
4926       if (flags & CODING_ISO_FLAG_COMPOSITION)
4927         coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
4928       if (flags & CODING_ISO_FLAG_DESIGNATION)
4929         coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
4930       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
4931         {
4932           setup_iso_safe_charsets (attrs);
4933           val = CODING_ATTR_SAFE_CHARSETS (attrs);
4934           coding->max_charset_id = SCHARS (val) - 1;
4935           coding->safe_charsets = (char *) SDATA (val);
4936         }
4937       CODING_ISO_FLAGS (coding) = flags;
4938     }
4939   else if (EQ (coding_type, Qcharset))
4940     {
4941       coding->detector = detect_coding_charset;
4942       coding->decoder = decode_coding_charset;
4943       coding->encoder = encode_coding_charset;
4944       coding->common_flags
4945         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4946     }
4947   else if (EQ (coding_type, Qutf_8))
4948     {
4949       coding->detector = detect_coding_utf_8;
4950       coding->decoder = decode_coding_utf_8;
4951       coding->encoder = encode_coding_utf_8;
4952       coding->common_flags
4953         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4954     }
4955   else if (EQ (coding_type, Qutf_16))
4956     {
4957       val = AREF (attrs, coding_attr_utf_16_bom);
4958       CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_16_detect_bom
4959                                     : EQ (val, Qt) ? utf_16_with_bom
4960                                     : utf_16_without_bom);
4961       val = AREF (attrs, coding_attr_utf_16_endian);
4962       CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
4963                                        : utf_16_little_endian);
4964       CODING_UTF_16_SURROGATE (coding) = 0;
4965       coding->detector = detect_coding_utf_16;
4966       coding->decoder = decode_coding_utf_16;
4967       coding->encoder = encode_coding_utf_16;
4968       coding->common_flags
4969         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4970       if (CODING_UTF_16_BOM (coding) == utf_16_detect_bom)
4971         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
4972     }
4973   else if (EQ (coding_type, Qccl))
4974     {
4975       coding->detector = detect_coding_ccl;
4976       coding->decoder = decode_coding_ccl;
4977       coding->encoder = encode_coding_ccl;
4978       coding->common_flags
4979         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
4980             | CODING_REQUIRE_FLUSHING_MASK);
4981     }
4982   else if (EQ (coding_type, Qemacs_mule))
4983     {
4984       coding->detector = detect_coding_emacs_mule;
4985       coding->decoder = decode_coding_emacs_mule;
4986       coding->encoder = encode_coding_emacs_mule;
4987       coding->common_flags
4988         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4989       if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
4990           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
4991         {
4992           Lisp_Object tail, safe_charsets;
4993           int max_charset_id = 0;
4994
4995           for (tail = Vemacs_mule_charset_list; CONSP (tail);
4996                tail = XCDR (tail))
4997             if (max_charset_id < XFASTINT (XCAR (tail)))
4998               max_charset_id = XFASTINT (XCAR (tail));
4999           safe_charsets = Fmake_string (make_number (max_charset_id + 1),
5000                                         make_number (255));
5001           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5002                tail = XCDR (tail))
5003             SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
5004           coding->max_charset_id = max_charset_id;
5005           coding->safe_charsets = (char *) SDATA (safe_charsets);
5006         }
5007     }
5008   else if (EQ (coding_type, Qshift_jis))
5009     {
5010       coding->detector = detect_coding_sjis;
5011       coding->decoder = decode_coding_sjis;
5012       coding->encoder = encode_coding_sjis;
5013       coding->common_flags
5014         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5015     }
5016   else if (EQ (coding_type, Qbig5))
5017     {
5018       coding->detector = detect_coding_big5;
5019       coding->decoder = decode_coding_big5;
5020       coding->encoder = encode_coding_big5;
5021       coding->common_flags
5022         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5023     }
5024   else                          /* EQ (coding_type, Qraw_text) */
5025     {
5026       coding->detector = NULL;
5027       coding->decoder = decode_coding_raw_text;
5028       coding->encoder = encode_coding_raw_text;
5029     }
5030
5031   return;
5032 }
5033
5034 /* Return raw-text or one of its subsidiaries that has the same
5035    eol_type as CODING-SYSTEM.  */
5036
5037 Lisp_Object
5038 raw_text_coding_system (coding_system)
5039      Lisp_Object coding_system;
5040 {
5041   Lisp_Object spec, attrs;
5042   Lisp_Object eol_type, raw_text_eol_type;
5043
5044   if (NILP (coding_system))
5045     return Qraw_text;
5046   spec = CODING_SYSTEM_SPEC (coding_system);
5047   attrs = AREF (spec, 0);
5048
5049   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
5050     return coding_system;
5051
5052   eol_type = AREF (spec, 2);
5053   if (VECTORP (eol_type))
5054     return Qraw_text;
5055   spec = CODING_SYSTEM_SPEC (Qraw_text);
5056   raw_text_eol_type = AREF (spec, 2);
5057   return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
5058           : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
5059           : AREF (raw_text_eol_type, 2));
5060 }
5061
5062
5063 /* If CODING_SYSTEM doesn't specify end-of-line format but PARENT
5064    does, return one of the subsidiary that has the same eol-spec as
5065    PARENT.  Otherwise, return CODING_SYSTEM.  */
5066
5067 Lisp_Object
5068 coding_inherit_eol_type (coding_system, parent)
5069      Lisp_Object coding_system, parent;
5070 {
5071   Lisp_Object spec, eol_type;
5072
5073   if (NILP (coding_system))
5074     coding_system = Qraw_text;
5075   spec = CODING_SYSTEM_SPEC (coding_system);
5076   eol_type = AREF (spec, 2);
5077   if (VECTORP (eol_type)
5078       && ! NILP (parent))
5079     {
5080       Lisp_Object parent_spec;
5081       Lisp_Object parent_eol_type;
5082
5083       parent_spec
5084         = CODING_SYSTEM_SPEC (buffer_defaults.buffer_file_coding_system);
5085       parent_eol_type = AREF (parent_spec, 2);
5086       if (EQ (parent_eol_type, Qunix))
5087         coding_system = AREF (eol_type, 0);
5088       else if (EQ (parent_eol_type, Qdos))
5089         coding_system = AREF (eol_type, 1);
5090       else if (EQ (parent_eol_type, Qmac))
5091         coding_system = AREF (eol_type, 2);
5092     }
5093   return coding_system;
5094 }
5095
5096 /* Emacs has a mechanism to automatically detect a coding system if it
5097    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
5098    it's impossible to distinguish some coding systems accurately
5099    because they use the same range of codes.  So, at first, coding
5100    systems are categorized into 7, those are:
5101
5102    o coding-category-emacs-mule
5103
5104         The category for a coding system which has the same code range
5105         as Emacs' internal format.  Assigned the coding-system (Lisp
5106         symbol) `emacs-mule' by default.
5107
5108    o coding-category-sjis
5109
5110         The category for a coding system which has the same code range
5111         as SJIS.  Assigned the coding-system (Lisp
5112         symbol) `japanese-shift-jis' by default.
5113
5114    o coding-category-iso-7
5115
5116         The category for a coding system which has the same code range
5117         as ISO2022 of 7-bit environment.  This doesn't use any locking
5118         shift and single shift functions.  This can encode/decode all
5119         charsets.  Assigned the coding-system (Lisp symbol)
5120         `iso-2022-7bit' by default.
5121
5122    o coding-category-iso-7-tight
5123
5124         Same as coding-category-iso-7 except that this can
5125         encode/decode only the specified charsets.
5126
5127    o coding-category-iso-8-1
5128
5129         The category for a coding system which has the same code range
5130         as ISO2022 of 8-bit environment and graphic plane 1 used only
5131         for DIMENSION1 charset.  This doesn't use any locking shift
5132         and single shift functions.  Assigned the coding-system (Lisp
5133         symbol) `iso-latin-1' by default.
5134
5135    o coding-category-iso-8-2
5136
5137         The category for a coding system which has the same code range
5138         as ISO2022 of 8-bit environment and graphic plane 1 used only
5139         for DIMENSION2 charset.  This doesn't use any locking shift
5140         and single shift functions.  Assigned the coding-system (Lisp
5141         symbol) `japanese-iso-8bit' by default.
5142
5143    o coding-category-iso-7-else
5144
5145         The category for a coding system which has the same code range
5146         as ISO2022 of 7-bit environemnt but uses locking shift or
5147         single shift functions.  Assigned the coding-system (Lisp
5148         symbol) `iso-2022-7bit-lock' by default.
5149
5150    o coding-category-iso-8-else
5151
5152         The category for a coding system which has the same code range
5153         as ISO2022 of 8-bit environemnt but uses locking shift or
5154         single shift functions.  Assigned the coding-system (Lisp
5155         symbol) `iso-2022-8bit-ss2' by default.
5156
5157    o coding-category-big5
5158
5159         The category for a coding system which has the same code range
5160         as BIG5.  Assigned the coding-system (Lisp symbol)
5161         `cn-big5' by default.
5162
5163    o coding-category-utf-8
5164
5165         The category for a coding system which has the same code range
5166         as UTF-8 (cf. RFC2279).  Assigned the coding-system (Lisp
5167         symbol) `utf-8' by default.
5168
5169    o coding-category-utf-16-be
5170
5171         The category for a coding system in which a text has an
5172         Unicode signature (cf. Unicode Standard) in the order of BIG
5173         endian at the head.  Assigned the coding-system (Lisp symbol)
5174         `utf-16-be' by default.
5175
5176    o coding-category-utf-16-le
5177
5178         The category for a coding system in which a text has an
5179         Unicode signature (cf. Unicode Standard) in the order of
5180         LITTLE endian at the head.  Assigned the coding-system (Lisp
5181         symbol) `utf-16-le' by default.
5182
5183    o coding-category-ccl
5184
5185         The category for a coding system of which encoder/decoder is
5186         written in CCL programs.  The default value is nil, i.e., no
5187         coding system is assigned.
5188
5189    o coding-category-binary
5190
5191         The category for a coding system not categorized in any of the
5192         above.  Assigned the coding-system (Lisp symbol)
5193         `no-conversion' by default.
5194
5195    Each of them is a Lisp symbol and the value is an actual
5196    `coding-system's (this is also a Lisp symbol) assigned by a user.
5197    What Emacs does actually is to detect a category of coding system.
5198    Then, it uses a `coding-system' assigned to it.  If Emacs can't
5199    decide only one possible category, it selects a category of the
5200    highest priority.  Priorities of categories are also specified by a
5201    user in a Lisp variable `coding-category-list'.
5202
5203 */
5204
5205 #define EOL_SEEN_NONE   0
5206 #define EOL_SEEN_LF     1
5207 #define EOL_SEEN_CR     2
5208 #define EOL_SEEN_CRLF   4
5209
5210 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
5211    SOURCE is encoded.  If CATEGORY is one of
5212    coding_category_utf_16_XXXX, assume that CR and LF are encoded by
5213    two-byte, else they are encoded by one-byte.
5214
5215    Return one of EOL_SEEN_XXX.  */
5216
5217 #define MAX_EOL_CHECK_COUNT 3
5218
5219 static int
5220 detect_eol (source, src_bytes, category)
5221      unsigned char *source;
5222      EMACS_INT src_bytes;
5223      enum coding_category category;
5224 {
5225   unsigned char *src = source, *src_end = src + src_bytes;
5226   unsigned char c;
5227   int total  = 0;
5228   int eol_seen = EOL_SEEN_NONE;
5229
5230   if ((1 << category) & CATEGORY_MASK_UTF_16)
5231     {
5232       int msb, lsb;
5233
5234       msb = category == (coding_category_utf_16_le
5235                          | coding_category_utf_16_le_nosig);
5236       lsb = 1 - msb;
5237
5238       while (src + 1 < src_end)
5239         {
5240           c = src[lsb];
5241           if (src[msb] == 0 && (c == '\n' || c == '\r'))
5242             {
5243               int this_eol;
5244
5245               if (c == '\n')
5246                 this_eol = EOL_SEEN_LF;
5247               else if (src + 3 >= src_end
5248                        || src[msb + 2] != 0
5249                        || src[lsb + 2] != '\n')
5250                 this_eol = EOL_SEEN_CR;
5251               else
5252                 this_eol = EOL_SEEN_CRLF;
5253
5254               if (eol_seen == EOL_SEEN_NONE)
5255                 /* This is the first end-of-line.  */
5256                 eol_seen = this_eol;
5257               else if (eol_seen != this_eol)
5258                 {
5259                   /* The found type is different from what found before.  */
5260                   eol_seen = EOL_SEEN_LF;
5261                   break;
5262                 }
5263               if (++total == MAX_EOL_CHECK_COUNT)
5264                 break;
5265             }
5266           src += 2;
5267         }
5268     }
5269   else
5270     {
5271       while (src < src_end)
5272         {
5273           c = *src++;
5274           if (c == '\n' || c == '\r')
5275             {
5276               int this_eol;
5277
5278               if (c == '\n')
5279                 this_eol = EOL_SEEN_LF;
5280               else if (src >= src_end || *src != '\n')
5281                 this_eol = EOL_SEEN_CR;
5282               else
5283                 this_eol = EOL_SEEN_CRLF, src++;
5284
5285               if (eol_seen == EOL_SEEN_NONE)
5286                 /* This is the first end-of-line.  */
5287                 eol_seen = this_eol;
5288               else if (eol_seen != this_eol)
5289                 {
5290                   /* The found type is different from what found before.  */
5291                   eol_seen = EOL_SEEN_LF;
5292                   break;
5293                 }
5294               if (++total == MAX_EOL_CHECK_COUNT)
5295                 break;
5296             }
5297         }
5298     }
5299   return eol_seen;
5300 }
5301
5302
5303 static Lisp_Object
5304 adjust_coding_eol_type (coding, eol_seen)
5305      struct coding_system *coding;
5306      int eol_seen;
5307 {
5308   Lisp_Object eol_type;
5309
5310   eol_type = CODING_ID_EOL_TYPE (coding->id);
5311   if (eol_seen & EOL_SEEN_LF)
5312     {
5313       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
5314       eol_type = Qunix;
5315     }
5316   else if (eol_seen & EOL_SEEN_CRLF)
5317     {
5318       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
5319       eol_type = Qdos;
5320     }
5321   else if (eol_seen & EOL_SEEN_CR)
5322     {
5323       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
5324       eol_type = Qmac;
5325     }
5326   return eol_type;
5327 }
5328
5329 /* Detect how a text specified in CODING is encoded.  If a coding
5330    system is detected, update fields of CODING by the detected coding
5331    system.  */
5332
5333 void
5334 detect_coding (coding)
5335      struct coding_system *coding;
5336 {
5337   const unsigned char *src, *src_end;
5338   Lisp_Object attrs, coding_type;
5339
5340   coding->consumed = coding->consumed_char = 0;
5341   coding->produced = coding->produced_char = 0;
5342   coding_set_source (coding);
5343
5344   src_end = coding->source + coding->src_bytes;
5345
5346   /* If we have not yet decided the text encoding type, detect it
5347      now.  */
5348   if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
5349     {
5350       int c, i;
5351
5352       for (i = 0, src = coding->source; src < src_end; i++, src++)
5353         {
5354           c = *src;
5355           if (c & 0x80 || (c < 0x20 && (c == ISO_CODE_ESC
5356                                         || c == ISO_CODE_SI
5357                                         || c == ISO_CODE_SO)))
5358             break;
5359         }
5360       coding->head_ascii = src - (coding->source + coding->consumed);
5361
5362       if (coding->head_ascii < coding->src_bytes)
5363         {
5364           struct coding_detection_info detect_info;
5365           enum coding_category category;
5366           struct coding_system *this;
5367
5368           detect_info.checked = detect_info.found = detect_info.rejected = 0;
5369           for (i = 0; i < coding_category_raw_text; i++)
5370             {
5371               category = coding_priorities[i];
5372               this = coding_categories + category;
5373               if (this->id < 0)
5374                 {
5375                   /* No coding system of this category is defined.  */
5376                   detect_info.rejected |= (1 << category);
5377                 }
5378               else if (category >= coding_category_raw_text)
5379                 continue;
5380               else if (detect_info.checked & (1 << category))
5381                 {
5382                   if (detect_info.found & (1 << category))
5383                     break;
5384                 }
5385               else if ((*(this->detector)) (coding, &detect_info)
5386                        && detect_info.found & (1 << category))
5387                 {
5388                   if (category == coding_category_utf_16_auto)
5389                     {
5390                       if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
5391                         category = coding_category_utf_16_le;
5392                       else
5393                         category = coding_category_utf_16_be;
5394                     }
5395                   break;
5396                 }
5397             }
5398           if (i < coding_category_raw_text)
5399             setup_coding_system (CODING_ID_NAME (this->id), coding);
5400           else if (detect_info.rejected == CATEGORY_MASK_ANY)
5401             setup_coding_system (Qraw_text, coding);
5402           else if (detect_info.rejected)
5403             for (i = 0; i < coding_category_raw_text; i++)
5404               if (! (detect_info.rejected & (1 << coding_priorities[i])))
5405                 {
5406                   this = coding_categories + coding_priorities[i];
5407                   setup_coding_system (CODING_ID_NAME (this->id), coding);
5408                   break;
5409                 }
5410         }
5411     }
5412   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
5413            == coding_category_utf_16_auto)
5414     {
5415       Lisp_Object coding_systems;
5416       struct coding_detection_info detect_info;
5417
5418       coding_systems
5419         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_16_bom);
5420       detect_info.found = detect_info.rejected = 0;
5421       if (CONSP (coding_systems)
5422           && detect_coding_utf_16 (coding, &detect_info))
5423         {
5424           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
5425             setup_coding_system (XCAR (coding_systems), coding);
5426           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
5427             setup_coding_system (XCDR (coding_systems), coding);
5428         }
5429     }
5430 }
5431
5432
5433 static void
5434 decode_eol (coding)
5435      struct coding_system *coding;
5436 {
5437   Lisp_Object eol_type;
5438   unsigned char *p, *pbeg, *pend;
5439
5440   eol_type = CODING_ID_EOL_TYPE (coding->id);
5441   if (EQ (eol_type, Qunix))
5442     return;
5443
5444   if (NILP (coding->dst_object))
5445     pbeg = coding->destination;
5446   else
5447     pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
5448   pend = pbeg + coding->produced;
5449
5450   if (VECTORP (eol_type))
5451     {
5452       int eol_seen = EOL_SEEN_NONE;
5453
5454       for (p = pbeg; p < pend; p++)
5455         {
5456           if (*p == '\n')
5457             eol_seen |= EOL_SEEN_LF;
5458           else if (*p == '\r')
5459             {
5460               if (p + 1 < pend && *(p + 1) == '\n')
5461                 {
5462                   eol_seen |= EOL_SEEN_CRLF;
5463                   p++;
5464                 }
5465               else
5466                 eol_seen |= EOL_SEEN_CR;
5467             }
5468         }
5469       if (eol_seen != EOL_SEEN_NONE
5470           && eol_seen != EOL_SEEN_LF
5471           && eol_seen != EOL_SEEN_CRLF
5472           && eol_seen != EOL_SEEN_CR)
5473         eol_seen = EOL_SEEN_LF;
5474       if (eol_seen != EOL_SEEN_NONE)
5475         eol_type = adjust_coding_eol_type (coding, eol_seen);
5476     }
5477
5478   if (EQ (eol_type, Qmac))
5479     {
5480       for (p = pbeg; p < pend; p++)
5481         if (*p == '\r')
5482           *p = '\n';
5483     }
5484   else if (EQ (eol_type, Qdos))
5485     {
5486       int n = 0;
5487
5488       if (NILP (coding->dst_object))
5489         {
5490           for (p = pend - 2; p >= pbeg; p--)
5491             if (*p == '\r')
5492               {
5493                 safe_bcopy ((char *) (p + 1), (char *) p, pend-- - p - 1);
5494                 n++;
5495               }
5496         }
5497       else
5498         {
5499           for (p = pend - 2; p >= pbeg; p--)
5500             if (*p == '\r')
5501               {
5502                 int pos_byte = coding->dst_pos_byte + (p - pbeg);
5503                 int pos = BYTE_TO_CHAR (pos_byte);
5504
5505                 del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
5506                 n++;
5507               }
5508         }
5509       coding->produced -= n;
5510       coding->produced_char -= n;
5511     }
5512 }
5513
5514
5515 /* Return a translation table (or list of them) from coding system
5516    attribute vector ATTRS for encoding (ENCODEP is nonzero) or
5517    decoding (ENCODEP is zero). */
5518
5519 static INLINE
5520 get_translation_table (attrs, encodep)
5521 {
5522   Lisp_Object standard, translation_table;
5523
5524   if (encodep)
5525     translation_table = CODING_ATTR_ENCODE_TBL (attrs),
5526       standard = Vstandard_translation_table_for_encode;
5527   else
5528     translation_table = CODING_ATTR_DECODE_TBL (attrs),
5529       standard = Vstandard_translation_table_for_decode;
5530   if (NILP (translation_table))
5531     return standard;
5532   if (SYMBOLP (translation_table))
5533     translation_table = Fget (translation_table, Qtranslation_table);
5534   else if (CONSP (translation_table))
5535     {
5536       Lisp_Object val;
5537
5538       translation_table = Fcopy_sequence (translation_table);
5539       for (val = translation_table; CONSP (val); val = XCDR (val))
5540         if (SYMBOLP (XCAR (val)))
5541           XSETCAR (val, Fget (XCAR (val), Qtranslation_table));
5542     }
5543   if (! NILP (standard))
5544     {
5545       if (CONSP (translation_table))
5546         translation_table = nconc2 (translation_table, Fcons (standard, Qnil));
5547       else
5548         translation_table = Fcons (translation_table, Fcons (standard, Qnil));
5549     }
5550   return translation_table;
5551 }
5552
5553
5554 static void
5555 translate_chars (coding, table)
5556      struct coding_system *coding;
5557      Lisp_Object table;
5558 {
5559   int *charbuf = coding->charbuf;
5560   int *charbuf_end = charbuf + coding->charbuf_used;
5561   int c;
5562
5563   if (coding->chars_at_source)
5564     return;
5565
5566   while (charbuf < charbuf_end)
5567     {
5568       c = *charbuf;
5569       if (c < 0)
5570         charbuf += -c;
5571       else
5572         *charbuf++ = translate_char (table, c);
5573     }
5574 }
5575
5576 static int
5577 produce_chars (coding)
5578      struct coding_system *coding;
5579 {
5580   unsigned char *dst = coding->destination + coding->produced;
5581   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5582   int produced;
5583   int produced_chars = 0;
5584
5585   if (! coding->chars_at_source)
5586     {
5587       /* Characters are in coding->charbuf.  */
5588       int *buf = coding->charbuf;
5589       int *buf_end = buf + coding->charbuf_used;
5590       unsigned char *adjusted_dst_end;
5591
5592       if (BUFFERP (coding->src_object)
5593           && EQ (coding->src_object, coding->dst_object))
5594         dst_end = ((unsigned char *) coding->source) + coding->consumed;
5595       adjusted_dst_end = dst_end - MAX_MULTIBYTE_LENGTH;
5596
5597       while (buf < buf_end)
5598         {
5599           int c = *buf++;
5600
5601           if (dst >= adjusted_dst_end)
5602             {
5603               dst = alloc_destination (coding,
5604                                        buf_end - buf + MAX_MULTIBYTE_LENGTH,
5605                                        dst);
5606               dst_end = coding->destination + coding->dst_bytes;
5607               adjusted_dst_end = dst_end - MAX_MULTIBYTE_LENGTH;
5608             }
5609           if (c >= 0)
5610             {
5611               if (coding->dst_multibyte
5612                   || ! CHAR_BYTE8_P (c))
5613                 CHAR_STRING_ADVANCE (c, dst);
5614               else
5615                 *dst++ = CHAR_TO_BYTE8 (c);
5616               produced_chars++;
5617             }
5618           else
5619             /* This is an annotation datum.  (-C) is the length of
5620                it.  */
5621             buf += -c - 1;
5622         }
5623     }
5624   else
5625     {
5626       const unsigned char *src = coding->source;
5627       const unsigned char *src_end = src + coding->src_bytes;
5628       Lisp_Object eol_type;
5629
5630       eol_type = CODING_ID_EOL_TYPE (coding->id);
5631
5632       if (coding->src_multibyte != coding->dst_multibyte)
5633         {
5634           if (coding->src_multibyte)
5635             {
5636               int multibytep = 1;
5637               int consumed_chars;
5638
5639               while (1)
5640                 {
5641                   const unsigned char *src_base = src;
5642                   int c;
5643
5644                   ONE_MORE_BYTE (c);
5645                   if (c == '\r')
5646                     {
5647                       if (EQ (eol_type, Qdos))
5648                         {
5649                           if (src == src_end)
5650                             {
5651                               record_conversion_result
5652                                 (coding, CODING_RESULT_INSUFFICIENT_SRC);
5653                               goto no_more_source;
5654                             }
5655                           if (*src == '\n')
5656                             c = *src++;
5657                         }
5658                       else if (EQ (eol_type, Qmac))
5659                         c = '\n';
5660                     }
5661                   if (dst == dst_end)
5662                     {
5663                       coding->consumed = src - coding->source;
5664
5665                     if (EQ (coding->src_object, coding->dst_object))
5666                       dst_end = (unsigned char *) src;
5667                     if (dst == dst_end)
5668                       {
5669                         dst = alloc_destination (coding, src_end - src + 1,
5670                                                  dst);
5671                         dst_end = coding->destination + coding->dst_bytes;
5672                         coding_set_source (coding);
5673                         src = coding->source + coding->consumed;
5674                         src_end = coding->source + coding->src_bytes;
5675                       }
5676                     }
5677                   *dst++ = c;
5678                   produced_chars++;
5679                 }
5680             no_more_source:
5681               ;
5682             }
5683           else
5684             while (src < src_end)
5685               {
5686                 int multibytep = 1;
5687                 int c = *src++;
5688
5689                 if (c == '\r')
5690                   {
5691                     if (EQ (eol_type, Qdos))
5692                       {
5693                         if (src < src_end
5694                             && *src == '\n')
5695                           c = *src++;
5696                       }
5697                     else if (EQ (eol_type, Qmac))
5698                       c = '\n';
5699                   }
5700                 if (dst >= dst_end - 1)
5701                   {
5702                     coding->consumed = src - coding->source;
5703
5704                     if (EQ (coding->src_object, coding->dst_object))
5705                       dst_end = (unsigned char *) src;
5706                     if (dst >= dst_end - 1)
5707                       {
5708                         dst = alloc_destination (coding, src_end - src + 2,
5709                                                  dst);
5710                         dst_end = coding->destination + coding->dst_bytes;
5711                         coding_set_source (coding);
5712                         src = coding->source + coding->consumed;
5713                         src_end = coding->source + coding->src_bytes;
5714                       }
5715                   }
5716                 EMIT_ONE_BYTE (c);
5717               }
5718         }
5719       else
5720         {
5721           if (!EQ (coding->src_object, coding->dst_object))
5722             {
5723               int require = coding->src_bytes - coding->dst_bytes;
5724
5725               if (require > 0)
5726                 {
5727                   EMACS_INT offset = src - coding->source;
5728
5729                   dst = alloc_destination (coding, require, dst);
5730                   coding_set_source (coding);
5731                   src = coding->source + offset;
5732                   src_end = coding->source + coding->src_bytes;
5733                 }
5734             }
5735           produced_chars = coding->src_chars;
5736           while (src < src_end)
5737             {
5738               int c = *src++;
5739
5740               if (c == '\r')
5741                 {
5742                   if (EQ (eol_type, Qdos))
5743                     {
5744                       if (src < src_end
5745                           && *src == '\n')
5746                         c = *src++;
5747                       produced_chars--;
5748                     }
5749                   else if (EQ (eol_type, Qmac))
5750                     c = '\n';
5751                 }
5752               *dst++ = c;
5753             }
5754         }
5755       coding->consumed = coding->src_bytes;
5756       coding->consumed_char = coding->src_chars;
5757     }
5758
5759   produced = dst - (coding->destination + coding->produced);
5760   if (BUFFERP (coding->dst_object))
5761     insert_from_gap (produced_chars, produced);
5762   coding->produced += produced;
5763   coding->produced_char += produced_chars;
5764   return produced_chars;
5765 }
5766
5767 /* Compose text in CODING->object according to the annotation data at
5768    CHARBUF.  CHARBUF is an array:
5769      [ -LENGTH ANNOTATION_MASK FROM TO METHOD COMP_LEN [ COMPONENTS... ] ]
5770  */
5771
5772 static INLINE void
5773 produce_composition (coding, charbuf)
5774      struct coding_system *coding;
5775      int *charbuf;
5776 {
5777   int len;
5778   EMACS_INT from, to;
5779   enum composition_method method;
5780   Lisp_Object components;
5781
5782   len = -charbuf[0];
5783   from = coding->dst_pos + charbuf[2];
5784   to = coding->dst_pos + charbuf[3];
5785   method = (enum composition_method) (charbuf[4]);
5786
5787   if (method == COMPOSITION_RELATIVE)
5788     components = Qnil;
5789   else
5790     {
5791       Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
5792       int i;
5793
5794       len -= 5;
5795       charbuf += 5;
5796       for (i = 0; i < len; i++)
5797         args[i] = make_number (charbuf[i]);
5798       components = (method == COMPOSITION_WITH_ALTCHARS
5799                     ? Fstring (len, args) : Fvector (len, args));
5800     }
5801   compose_text (from, to, components, Qnil, coding->dst_object);
5802 }
5803
5804
5805 /* Put `charset' property on text in CODING->object according to
5806    the annotation data at CHARBUF.  CHARBUF is an array:
5807      [ -LENGTH ANNOTATION_MASK FROM TO CHARSET-ID ]
5808  */
5809
5810 static INLINE void
5811 produce_charset (coding, charbuf)
5812      struct coding_system *coding;
5813      int *charbuf;
5814 {
5815   EMACS_INT from = coding->dst_pos + charbuf[2];
5816   EMACS_INT to = coding->dst_pos + charbuf[3];
5817   struct charset *charset = CHARSET_FROM_ID (charbuf[4]);
5818
5819   Fput_text_property (make_number (from), make_number (to),
5820                       Qcharset, CHARSET_NAME (charset),
5821                       coding->dst_object);
5822 }
5823
5824
5825 #define CHARBUF_SIZE 0x4000
5826
5827 #define ALLOC_CONVERSION_WORK_AREA(coding)                              \
5828   do {                                                                  \
5829     int size = CHARBUF_SIZE;;                                           \
5830                                                                         \
5831     coding->charbuf = NULL;                                             \
5832     while (size > 1024)                                                 \
5833       {                                                                 \
5834         coding->charbuf = (int *) alloca (sizeof (int) * size);         \
5835         if (coding->charbuf)                                            \
5836           break;                                                        \
5837         size >>= 1;                                                     \
5838       }                                                                 \
5839     if (! coding->charbuf)                                              \
5840       {                                                                 \
5841         record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_MEM); \
5842         return coding->result;                                          \
5843       }                                                                 \
5844     coding->charbuf_size = size;                                        \
5845   } while (0)
5846
5847
5848 static void
5849 produce_annotation (coding)
5850      struct coding_system *coding;
5851 {
5852   int *charbuf = coding->charbuf;
5853   int *charbuf_end = charbuf + coding->charbuf_used;
5854
5855   if (NILP (coding->dst_object))
5856     return;
5857
5858   while (charbuf < charbuf_end)
5859     {
5860       if (*charbuf >= 0)
5861         charbuf++;
5862       else
5863         {
5864           int len = -*charbuf;
5865           switch (charbuf[1])
5866             {
5867             case CODING_ANNOTATE_COMPOSITION_MASK:
5868               produce_composition (coding, charbuf);
5869               break;
5870             case CODING_ANNOTATE_CHARSET_MASK:
5871               produce_charset (coding, charbuf);
5872               break;
5873             default:
5874               abort ();
5875             }
5876           charbuf += len;
5877         }
5878     }
5879 }
5880
5881 /* Decode the data at CODING->src_object into CODING->dst_object.
5882    CODING->src_object is a buffer, a string, or nil.
5883    CODING->dst_object is a buffer.
5884
5885    If CODING->src_object is a buffer, it must be the current buffer.
5886    In this case, if CODING->src_pos is positive, it is a position of
5887    the source text in the buffer, otherwise, the source text is in the
5888    gap area of the buffer, and CODING->src_pos specifies the offset of
5889    the text from GPT (which must be the same as PT).  If this is the
5890    same buffer as CODING->dst_object, CODING->src_pos must be
5891    negative.
5892
5893    If CODING->src_object is a string, CODING->src_pos in an index to
5894    that string.
5895
5896    If CODING->src_object is nil, CODING->source must already point to
5897    the non-relocatable memory area.  In this case, CODING->src_pos is
5898    an offset from CODING->source.
5899
5900    The decoded data is inserted at the current point of the buffer
5901    CODING->dst_object.
5902 */
5903
5904 static int
5905 decode_coding (coding)
5906      struct coding_system *coding;
5907 {
5908   Lisp_Object attrs;
5909   Lisp_Object undo_list;
5910   Lisp_Object translation_table;
5911
5912   if (BUFFERP (coding->src_object)
5913       && coding->src_pos > 0
5914       && coding->src_pos < GPT
5915       && coding->src_pos + coding->src_chars > GPT)
5916     move_gap_both (coding->src_pos, coding->src_pos_byte);
5917
5918   undo_list = Qt;
5919   if (BUFFERP (coding->dst_object))
5920     {
5921       if (current_buffer != XBUFFER (coding->dst_object))
5922         set_buffer_internal (XBUFFER (coding->dst_object));
5923       if (GPT != PT)
5924         move_gap_both (PT, PT_BYTE);
5925       undo_list = current_buffer->undo_list;
5926       current_buffer->undo_list = Qt;
5927     }
5928
5929   coding->consumed = coding->consumed_char = 0;
5930   coding->produced = coding->produced_char = 0;
5931   coding->chars_at_source = 0;
5932   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5933   coding->errors = 0;
5934
5935   ALLOC_CONVERSION_WORK_AREA (coding);
5936
5937   attrs = CODING_ID_ATTRS (coding->id);
5938   translation_table = get_translation_table (attrs, 0);
5939
5940   do
5941     {
5942       coding_set_source (coding);
5943       coding->annotated = 0;
5944       (*(coding->decoder)) (coding);
5945       if (!NILP (translation_table))
5946         translate_chars (coding, translation_table);
5947       coding_set_destination (coding);
5948       produce_chars (coding);
5949       if (coding->annotated)
5950         produce_annotation (coding);
5951     }
5952   while (coding->consumed < coding->src_bytes
5953          && ! coding->result);
5954
5955   coding->carryover_bytes = 0;
5956   if (coding->consumed < coding->src_bytes)
5957     {
5958       int nbytes = coding->src_bytes - coding->consumed;
5959       const unsigned char *src;
5960
5961       coding_set_source (coding);
5962       coding_set_destination (coding);
5963       src = coding->source + coding->consumed;
5964
5965       if (coding->mode & CODING_MODE_LAST_BLOCK)
5966         {
5967           /* Flush out unprocessed data as binary chars.  We are sure
5968              that the number of data is less than the size of
5969              coding->charbuf.  */
5970           coding->charbuf_used = 0;
5971           while (nbytes-- > 0)
5972             {
5973               int c = *src++;
5974
5975               coding->charbuf[coding->charbuf_used++] = (c & 0x80 ? - c : c);
5976             }
5977           produce_chars (coding);
5978         }
5979       else
5980         {
5981           /* Record unprocessed bytes in coding->carryover.  We are
5982              sure that the number of data is less than the size of
5983              coding->carryover.  */
5984           unsigned char *p = coding->carryover;
5985
5986           coding->carryover_bytes = nbytes;
5987           while (nbytes-- > 0)
5988             *p++ = *src++;
5989         }
5990       coding->consumed = coding->src_bytes;
5991     }
5992
5993   if (BUFFERP (coding->dst_object))
5994     {
5995       current_buffer->undo_list = undo_list;
5996       record_insert (coding->dst_pos, coding->produced_char);
5997     }
5998   if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix))
5999     decode_eol (coding);
6000   return coding->result;
6001 }
6002
6003
6004 /* Extract an annotation datum from a composition starting at POS and
6005    ending before LIMIT of CODING->src_object (buffer or string), store
6006    the data in BUF, set *STOP to a starting position of the next
6007    composition (if any) or to LIMIT, and return the address of the
6008    next element of BUF.
6009
6010    If such an annotation is not found, set *STOP to a starting
6011    position of a composition after POS (if any) or to LIMIT, and
6012    return BUF.  */
6013
6014 static INLINE int *
6015 handle_composition_annotation (pos, limit, coding, buf, stop)
6016      EMACS_INT pos, limit;
6017      struct coding_system *coding;
6018      int *buf;
6019      EMACS_INT *stop;
6020 {
6021   EMACS_INT start, end;
6022   Lisp_Object prop;
6023
6024   if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
6025       || end > limit)
6026     *stop = limit;
6027   else if (start > pos)
6028     *stop = start;
6029   else
6030     {
6031       if (start == pos)
6032         {
6033           /* We found a composition.  Store the corresponding
6034              annotation data in BUF.  */
6035           int *head = buf;
6036           enum composition_method method = COMPOSITION_METHOD (prop);
6037           int nchars = COMPOSITION_LENGTH (prop);
6038
6039           ADD_COMPOSITION_DATA (buf, 0, nchars, method);
6040           if (method != COMPOSITION_RELATIVE)
6041             {
6042               Lisp_Object components;
6043               int len, i, i_byte;
6044
6045               components = COMPOSITION_COMPONENTS (prop);
6046               if (VECTORP (components))
6047                 {
6048                   len = XVECTOR (components)->size;
6049                   for (i = 0; i < len; i++)
6050                     *buf++ = XINT (AREF (components, i));
6051                 }
6052               else if (STRINGP (components))
6053                 {
6054                   len = SCHARS (components);
6055                   i = i_byte = 0;
6056                   while (i < len)
6057                     {
6058                       FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
6059                       buf++;
6060                     }
6061                 }
6062               else if (INTEGERP (components))
6063                 {
6064                   len = 1;
6065                   *buf++ = XINT (components);
6066                 }
6067               else if (CONSP (components))
6068                 {
6069                   for (len = 0; CONSP (components);
6070                        len++, components = XCDR (components))
6071                     *buf++ = XINT (XCAR (components));
6072                 }
6073               else
6074                 abort ();
6075               *head -= len;
6076             }
6077         }
6078
6079       if (find_composition (end, limit, &start, &end, &prop,
6080                             coding->src_object)
6081           && end <= limit)
6082         *stop = start;
6083       else
6084         *stop = limit;
6085     }
6086   return buf;
6087 }
6088
6089
6090 /* Extract an annotation datum from a text property `charset' at POS of
6091    CODING->src_object (buffer of string), store the data in BUF, set
6092    *STOP to the position where the value of `charset' property changes
6093    (limiting by LIMIT), and return the address of the next element of
6094    BUF.
6095
6096    If the property value is nil, set *STOP to the position where the
6097    property value is non-nil (limiting by LIMIT), and return BUF.  */
6098
6099 static INLINE int *
6100 handle_charset_annotation (pos, limit, coding, buf, stop)
6101      EMACS_INT pos, limit;
6102      struct coding_system *coding;
6103      int *buf;
6104      EMACS_INT *stop;
6105 {
6106   Lisp_Object val, next;
6107   int id;
6108
6109   val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
6110   if (! NILP (val) && CHARSETP (val))
6111     id = XINT (CHARSET_SYMBOL_ID (val));
6112   else
6113     id = -1;
6114   ADD_CHARSET_DATA (buf, 0, 0, id);
6115   next = Fnext_single_property_change (make_number (pos), Qcharset,
6116                                        coding->src_object,
6117                                        make_number (limit));
6118   *stop = XINT (next);
6119   return buf;
6120 }
6121
6122
6123 static void
6124 consume_chars (coding)
6125      struct coding_system *coding;
6126 {
6127   int *buf = coding->charbuf;
6128   int *buf_end = coding->charbuf + coding->charbuf_size;
6129   const unsigned char *src = coding->source + coding->consumed;
6130   const unsigned char *src_end = coding->source + coding->src_bytes;
6131   EMACS_INT pos = coding->src_pos + coding->consumed_char;
6132   EMACS_INT end_pos = coding->src_pos + coding->src_chars;
6133   int multibytep = coding->src_multibyte;
6134   Lisp_Object eol_type;
6135   int c;
6136   EMACS_INT stop, stop_composition, stop_charset;
6137
6138   eol_type = CODING_ID_EOL_TYPE (coding->id);
6139   if (VECTORP (eol_type))
6140     eol_type = Qunix;
6141
6142   /* Note: composition handling is not yet implemented.  */
6143   coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
6144
6145   if (NILP (coding->src_object))
6146     stop = stop_composition = stop_charset = end_pos;
6147   else
6148     {
6149       if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
6150         stop = stop_composition = pos;
6151       else
6152         stop = stop_composition = end_pos;
6153       if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
6154         stop = stop_charset = pos;
6155       else
6156         stop_charset = end_pos;
6157     }
6158
6159   /* Compensate for CRLF and conversion.  */
6160   buf_end -= 1 + MAX_ANNOTATION_LENGTH;
6161   while (buf < buf_end)
6162     {
6163       if (pos == stop)
6164         {
6165           if (pos == end_pos)
6166             break;
6167           if (pos == stop_composition)
6168             buf = handle_composition_annotation (pos, end_pos, coding,
6169                                                  buf, &stop_composition);
6170           if (pos == stop_charset)
6171             buf = handle_charset_annotation (pos, end_pos, coding,
6172                                              buf, &stop_charset);
6173           stop = (stop_composition < stop_charset
6174                   ? stop_composition : stop_charset);
6175         }
6176
6177       if (! multibytep)
6178         {
6179           EMACS_INT bytes;
6180
6181           if (! CODING_FOR_UNIBYTE (coding)
6182               && (bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
6183             c = STRING_CHAR_ADVANCE (src), pos += bytes;
6184           else
6185             c = *src++, pos++;
6186         }
6187       else
6188         c = STRING_CHAR_ADVANCE (src), pos++;
6189       if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
6190         c = '\n';
6191       if (! EQ (eol_type, Qunix))
6192         {
6193           if (c == '\n')
6194             {
6195               if (EQ (eol_type, Qdos))
6196                 *buf++ = '\r';
6197               else
6198                 c = '\r';
6199             }
6200         }
6201       *buf++ = c;
6202     }
6203
6204   coding->consumed = src - coding->source;
6205   coding->consumed_char = pos - coding->src_pos;
6206   coding->charbuf_used = buf - coding->charbuf;
6207   coding->chars_at_source = 0;
6208 }
6209
6210
6211 /* Encode the text at CODING->src_object into CODING->dst_object.
6212    CODING->src_object is a buffer or a string.
6213    CODING->dst_object is a buffer or nil.
6214
6215    If CODING->src_object is a buffer, it must be the current buffer.
6216    In this case, if CODING->src_pos is positive, it is a position of
6217    the source text in the buffer, otherwise. the source text is in the
6218    gap area of the buffer, and coding->src_pos specifies the offset of
6219    the text from GPT (which must be the same as PT).  If this is the
6220    same buffer as CODING->dst_object, CODING->src_pos must be
6221    negative and CODING should not have `pre-write-conversion'.
6222
6223    If CODING->src_object is a string, CODING should not have
6224    `pre-write-conversion'.
6225
6226    If CODING->dst_object is a buffer, the encoded data is inserted at
6227    the current point of that buffer.
6228
6229    If CODING->dst_object is nil, the encoded data is placed at the
6230    memory area specified by CODING->destination.  */
6231
6232 static int
6233 encode_coding (coding)
6234      struct coding_system *coding;
6235 {
6236   Lisp_Object attrs;
6237   Lisp_Object translation_table;
6238
6239   attrs = CODING_ID_ATTRS (coding->id);
6240   translation_table = get_translation_table (attrs, 1);
6241
6242   if (BUFFERP (coding->dst_object))
6243     {
6244       set_buffer_internal (XBUFFER (coding->dst_object));
6245       coding->dst_multibyte
6246         = ! NILP (current_buffer->enable_multibyte_characters);
6247     }
6248
6249   coding->consumed = coding->consumed_char = 0;
6250   coding->produced = coding->produced_char = 0;
6251   record_conversion_result (coding, CODING_RESULT_SUCCESS);
6252   coding->errors = 0;
6253
6254   ALLOC_CONVERSION_WORK_AREA (coding);
6255
6256   do {
6257     coding_set_source (coding);
6258     consume_chars (coding);
6259
6260     if (!NILP (translation_table))
6261       translate_chars (coding, translation_table);
6262
6263     coding_set_destination (coding);
6264     (*(coding->encoder)) (coding);
6265   } while (coding->consumed_char < coding->src_chars);
6266
6267   if (BUFFERP (coding->dst_object))
6268     insert_from_gap (coding->produced_char, coding->produced);
6269
6270   return (coding->result);
6271 }
6272
6273
6274 /* Name (or base name) of work buffer for code conversion.  */
6275 static Lisp_Object Vcode_conversion_workbuf_name;
6276
6277 /* A working buffer used by the top level conversion.  Once it is
6278    created, it is never destroyed.  It has the name
6279    Vcode_conversion_workbuf_name.  The other working buffers are
6280    destroyed after the use is finished, and their names are modified
6281    versions of Vcode_conversion_workbuf_name.  */
6282 static Lisp_Object Vcode_conversion_reused_workbuf;
6283
6284 /* 1 iff Vcode_conversion_reused_workbuf is already in use.  */
6285 static int reused_workbuf_in_use;
6286
6287
6288 /* Return a working buffer of code convesion.  MULTIBYTE specifies the
6289    multibyteness of returning buffer.  */
6290
6291 Lisp_Object
6292 make_conversion_work_buffer (multibyte)
6293 {
6294   Lisp_Object name, workbuf;
6295   struct buffer *current;
6296
6297   if (reused_workbuf_in_use++)
6298     {
6299       name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
6300       workbuf = Fget_buffer_create (name);
6301     }
6302   else
6303     {
6304       name = Vcode_conversion_workbuf_name;
6305       workbuf = Fget_buffer_create (name);
6306       if (NILP (Vcode_conversion_reused_workbuf))
6307         Vcode_conversion_reused_workbuf = workbuf;
6308     }
6309   current = current_buffer;
6310   set_buffer_internal (XBUFFER (workbuf));
6311   Ferase_buffer ();
6312   current_buffer->undo_list = Qt;
6313   current_buffer->enable_multibyte_characters = multibyte ? Qt : Qnil;
6314   set_buffer_internal (current);
6315   return workbuf;
6316 }
6317
6318
6319 static Lisp_Object
6320 code_conversion_restore (arg)
6321      Lisp_Object arg;
6322 {
6323   Lisp_Object current, workbuf;
6324
6325   current = XCAR (arg);
6326   workbuf = XCDR (arg);
6327   if (! NILP (workbuf))
6328     {
6329       if (EQ (workbuf, Vcode_conversion_reused_workbuf))
6330         reused_workbuf_in_use = 0;
6331       else if (! NILP (Fbuffer_live_p (workbuf)))
6332         Fkill_buffer (workbuf);
6333     }
6334   set_buffer_internal (XBUFFER (current));
6335   return Qnil;
6336 }
6337
6338 Lisp_Object
6339 code_conversion_save (with_work_buf, multibyte)
6340      int with_work_buf, multibyte;
6341 {
6342   Lisp_Object workbuf = Qnil;
6343
6344   if (with_work_buf)
6345     workbuf = make_conversion_work_buffer (multibyte);
6346   record_unwind_protect (code_conversion_restore,
6347                          Fcons (Fcurrent_buffer (), workbuf));
6348   return workbuf;
6349 }
6350
6351 int
6352 decode_coding_gap (coding, chars, bytes)
6353      struct coding_system *coding;
6354      EMACS_INT chars, bytes;
6355 {
6356   int count = specpdl_ptr - specpdl;
6357   Lisp_Object attrs;
6358
6359   code_conversion_save (0, 0);
6360
6361   coding->src_object = Fcurrent_buffer ();
6362   coding->src_chars = chars;
6363   coding->src_bytes = bytes;
6364   coding->src_pos = -chars;
6365   coding->src_pos_byte = -bytes;
6366   coding->src_multibyte = chars < bytes;
6367   coding->dst_object = coding->src_object;
6368   coding->dst_pos = PT;
6369   coding->dst_pos_byte = PT_BYTE;
6370   coding->dst_multibyte = ! NILP (current_buffer->enable_multibyte_characters);
6371   coding->mode |= CODING_MODE_LAST_BLOCK;
6372
6373   if (CODING_REQUIRE_DETECTION (coding))
6374     detect_coding (coding);
6375
6376   decode_coding (coding);
6377
6378   attrs = CODING_ID_ATTRS (coding->id);
6379   if (! NILP (CODING_ATTR_POST_READ (attrs)))
6380     {
6381       EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
6382       Lisp_Object val;
6383
6384       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
6385       val = call1 (CODING_ATTR_POST_READ (attrs),
6386                    make_number (coding->produced_char));
6387       CHECK_NATNUM (val);
6388       coding->produced_char += Z - prev_Z;
6389       coding->produced += Z_BYTE - prev_Z_BYTE;
6390     }
6391
6392   unbind_to (count, Qnil);
6393   return coding->result;
6394 }
6395
6396 int
6397 encode_coding_gap (coding, chars, bytes)
6398      struct coding_system *coding;
6399      EMACS_INT chars, bytes;
6400 {
6401   int count = specpdl_ptr - specpdl;
6402
6403   code_conversion_save (0, 0);
6404
6405   coding->src_object = Fcurrent_buffer ();
6406   coding->src_chars = chars;
6407   coding->src_bytes = bytes;
6408   coding->src_pos = -chars;
6409   coding->src_pos_byte = -bytes;
6410   coding->src_multibyte = chars < bytes;
6411   coding->dst_object = coding->src_object;
6412   coding->dst_pos = PT;
6413   coding->dst_pos_byte = PT_BYTE;
6414
6415   encode_coding (coding);
6416
6417   unbind_to (count, Qnil);
6418   return coding->result;
6419 }
6420
6421
6422 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
6423    SRC_OBJECT into DST_OBJECT by coding context CODING.
6424
6425    SRC_OBJECT is a buffer, a string, or Qnil.
6426
6427    If it is a buffer, the text is at point of the buffer.  FROM and TO
6428    are positions in the buffer.
6429
6430    If it is a string, the text is at the beginning of the string.
6431    FROM and TO are indices to the string.
6432
6433    If it is nil, the text is at coding->source.  FROM and TO are
6434    indices to coding->source.
6435
6436    DST_OBJECT is a buffer, Qt, or Qnil.
6437
6438    If it is a buffer, the decoded text is inserted at point of the
6439    buffer.  If the buffer is the same as SRC_OBJECT, the source text
6440    is deleted.
6441
6442    If it is Qt, a string is made from the decoded text, and
6443    set in CODING->dst_object.
6444
6445    If it is Qnil, the decoded text is stored at CODING->destination.
6446    The caller must allocate CODING->dst_bytes bytes at
6447    CODING->destination by xmalloc.  If the decoded text is longer than
6448    CODING->dst_bytes, CODING->destination is relocated by xrealloc.
6449  */
6450
6451 void
6452 decode_coding_object (coding, src_object, from, from_byte, to, to_byte,
6453                       dst_object)
6454      struct coding_system *coding;
6455      Lisp_Object src_object;
6456      EMACS_INT from, from_byte, to, to_byte;
6457      Lisp_Object dst_object;
6458 {
6459   int count = specpdl_ptr - specpdl;
6460   unsigned char *destination;
6461   EMACS_INT dst_bytes;
6462   EMACS_INT chars = to - from;
6463   EMACS_INT bytes = to_byte - from_byte;
6464   Lisp_Object attrs;
6465   Lisp_Object buffer;
6466   int saved_pt = -1, saved_pt_byte;
6467
6468   buffer = Fcurrent_buffer ();
6469
6470   if (NILP (dst_object))
6471     {
6472       destination = coding->destination;
6473       dst_bytes = coding->dst_bytes;
6474     }
6475
6476   coding->src_object = src_object;
6477   coding->src_chars = chars;
6478   coding->src_bytes = bytes;
6479   coding->src_multibyte = chars < bytes;
6480
6481   if (STRINGP (src_object))
6482     {
6483       coding->src_pos = from;
6484       coding->src_pos_byte = from_byte;
6485     }
6486   else if (BUFFERP (src_object))
6487     {
6488       set_buffer_internal (XBUFFER (src_object));
6489       if (from != GPT)
6490         move_gap_both (from, from_byte);
6491       if (EQ (src_object, dst_object))
6492         {
6493           saved_pt = PT, saved_pt_byte = PT_BYTE;
6494           TEMP_SET_PT_BOTH (from, from_byte);
6495           del_range_both (from, from_byte, to, to_byte, 1);
6496           coding->src_pos = -chars;
6497           coding->src_pos_byte = -bytes;
6498         }
6499       else
6500         {
6501           coding->src_pos = from;
6502           coding->src_pos_byte = from_byte;
6503         }
6504     }
6505
6506   if (CODING_REQUIRE_DETECTION (coding))
6507     detect_coding (coding);
6508   attrs = CODING_ID_ATTRS (coding->id);
6509
6510   if (EQ (dst_object, Qt)
6511       || (! NILP (CODING_ATTR_POST_READ (attrs))
6512           && NILP (dst_object)))
6513     {
6514       coding->dst_object = code_conversion_save (1, 1);
6515       coding->dst_pos = BEG;
6516       coding->dst_pos_byte = BEG_BYTE;
6517       coding->dst_multibyte = 1;
6518     }
6519   else if (BUFFERP (dst_object))
6520     {
6521       code_conversion_save (0, 0);
6522       coding->dst_object = dst_object;
6523       coding->dst_pos = BUF_PT (XBUFFER (dst_object));
6524       coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
6525       coding->dst_multibyte
6526         = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
6527     }
6528   else
6529     {
6530       code_conversion_save (0, 0);
6531       coding->dst_object = Qnil;
6532       coding->dst_multibyte = 1;
6533     }
6534
6535   decode_coding (coding);
6536
6537   if (BUFFERP (coding->dst_object))
6538     set_buffer_internal (XBUFFER (coding->dst_object));
6539
6540   if (! NILP (CODING_ATTR_POST_READ (attrs)))
6541     {
6542       struct gcpro gcpro1, gcpro2;
6543       EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
6544       Lisp_Object val;
6545
6546       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
6547       GCPRO2 (coding->src_object, coding->dst_object);
6548       val = call1 (CODING_ATTR_POST_READ (attrs),
6549                    make_number (coding->produced_char));
6550       UNGCPRO;
6551       CHECK_NATNUM (val);
6552       coding->produced_char += Z - prev_Z;
6553       coding->produced += Z_BYTE - prev_Z_BYTE;
6554     }
6555
6556   if (EQ (dst_object, Qt))
6557     {
6558       coding->dst_object = Fbuffer_string ();
6559     }
6560   else if (NILP (dst_object) && BUFFERP (coding->dst_object))
6561     {
6562       set_buffer_internal (XBUFFER (coding->dst_object));
6563       if (dst_bytes < coding->produced)
6564         {
6565           destination
6566             = (unsigned char *) xrealloc (destination, coding->produced);
6567           if (! destination)
6568             {
6569               record_conversion_result (coding,
6570                                         CODING_RESULT_INSUFFICIENT_DST);
6571               unbind_to (count, Qnil);
6572               return;
6573             }
6574           if (BEGV < GPT && GPT < BEGV + coding->produced_char)
6575             move_gap_both (BEGV, BEGV_BYTE);
6576           bcopy (BEGV_ADDR, destination, coding->produced);
6577           coding->destination = destination;
6578         }
6579     }
6580
6581   if (saved_pt >= 0)
6582     {
6583       /* This is the case of:
6584          (BUFFERP (src_object) && EQ (src_object, dst_object))
6585          As we have moved PT while replacing the original buffer
6586          contents, we must recover it now.  */
6587       set_buffer_internal (XBUFFER (src_object));
6588       if (saved_pt < from)
6589         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
6590       else if (saved_pt < from + chars)
6591         TEMP_SET_PT_BOTH (from, from_byte);
6592       else if (! NILP (current_buffer->enable_multibyte_characters))
6593         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
6594                           saved_pt_byte + (coding->produced - bytes));
6595       else
6596         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
6597                           saved_pt_byte + (coding->produced - bytes));
6598     }
6599
6600   unbind_to (count, coding->dst_object);
6601 }
6602
6603
6604 void
6605 encode_coding_object (coding, src_object, from, from_byte, to, to_byte,
6606                       dst_object)
6607      struct coding_system *coding;
6608      Lisp_Object src_object;
6609      EMACS_INT from, from_byte, to, to_byte;
6610      Lisp_Object dst_object;
6611 {
6612   int count = specpdl_ptr - specpdl;
6613   EMACS_INT chars = to - from;
6614   EMACS_INT bytes = to_byte - from_byte;
6615   Lisp_Object attrs;
6616   Lisp_Object buffer;
6617   int saved_pt = -1, saved_pt_byte;
6618
6619   buffer = Fcurrent_buffer ();
6620
6621   coding->src_object = src_object;
6622   coding->src_chars = chars;
6623   coding->src_bytes = bytes;
6624   coding->src_multibyte = chars < bytes;
6625
6626   attrs = CODING_ID_ATTRS (coding->id);
6627
6628   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
6629     {
6630       coding->src_object = code_conversion_save (1, coding->src_multibyte);
6631       set_buffer_internal (XBUFFER (coding->src_object));
6632       if (STRINGP (src_object))
6633         insert_from_string (src_object, from, from_byte, chars, bytes, 0);
6634       else if (BUFFERP (src_object))
6635         insert_from_buffer (XBUFFER (src_object), from, chars, 0);
6636       else
6637         insert_1_both (coding->source + from, chars, bytes, 0, 0, 0);
6638
6639       if (EQ (src_object, dst_object))
6640         {
6641           set_buffer_internal (XBUFFER (src_object));
6642           saved_pt = PT, saved_pt_byte = PT_BYTE;
6643           del_range_both (from, from_byte, to, to_byte, 1);
6644           set_buffer_internal (XBUFFER (coding->src_object));
6645         }
6646
6647       call2 (CODING_ATTR_PRE_WRITE (attrs),
6648              make_number (BEG), make_number (Z));
6649       coding->src_object = Fcurrent_buffer ();
6650       if (BEG != GPT)
6651         move_gap_both (BEG, BEG_BYTE);
6652       coding->src_chars = Z - BEG;
6653       coding->src_bytes = Z_BYTE - BEG_BYTE;
6654       coding->src_pos = BEG;
6655       coding->src_pos_byte = BEG_BYTE;
6656       coding->src_multibyte = Z < Z_BYTE;
6657     }
6658   else if (STRINGP (src_object))
6659     {
6660       code_conversion_save (0, 0);
6661       coding->src_pos = from;
6662       coding->src_pos_byte = from_byte;
6663     }
6664   else if (BUFFERP (src_object))
6665     {
6666       code_conversion_save (0, 0);
6667       set_buffer_internal (XBUFFER (src_object));
6668       if (EQ (src_object, dst_object))
6669         {
6670           saved_pt = PT, saved_pt_byte = PT_BYTE;
6671           coding->src_object = del_range_1 (from, to, 1, 1);
6672           coding->src_pos = 0;
6673           coding->src_pos_byte = 0;
6674         }
6675       else
6676         {
6677           if (from < GPT && to >= GPT)
6678             move_gap_both (from, from_byte);
6679           coding->src_pos = from;
6680           coding->src_pos_byte = from_byte;
6681         }
6682     }
6683   else
6684     code_conversion_save (0, 0);
6685
6686   if (BUFFERP (dst_object))
6687     {
6688       coding->dst_object = dst_object;
6689       if (EQ (src_object, dst_object))
6690         {
6691           coding->dst_pos = from;
6692           coding->dst_pos_byte = from_byte;
6693         }
6694       else
6695         {
6696           coding->dst_pos = BUF_PT (XBUFFER (dst_object));
6697           coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
6698         }
6699       coding->dst_multibyte
6700         = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
6701     }
6702   else if (EQ (dst_object, Qt))
6703     {
6704       coding->dst_object = Qnil;
6705       coding->dst_bytes = coding->src_chars;
6706       if (coding->dst_bytes == 0)
6707         coding->dst_bytes = 1;
6708       coding->destination = (unsigned char *) xmalloc (coding->dst_bytes);
6709       coding->dst_multibyte = 0;
6710     }
6711   else
6712     {
6713       coding->dst_object = Qnil;
6714       coding->dst_multibyte = 0;
6715     }
6716
6717   encode_coding (coding);
6718
6719   if (EQ (dst_object, Qt))
6720     {
6721       if (BUFFERP (coding->dst_object))
6722         coding->dst_object = Fbuffer_string ();
6723       else
6724         {
6725           coding->dst_object
6726             = make_unibyte_string ((char *) coding->destination,
6727                                    coding->produced);
6728           xfree (coding->destination);
6729         }
6730     }
6731
6732   if (saved_pt >= 0)
6733     {
6734       /* This is the case of:
6735          (BUFFERP (src_object) && EQ (src_object, dst_object))
6736          As we have moved PT while replacing the original buffer
6737          contents, we must recover it now.  */
6738       set_buffer_internal (XBUFFER (src_object));
6739       if (saved_pt < from)
6740         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
6741       else if (saved_pt < from + chars)
6742         TEMP_SET_PT_BOTH (from, from_byte);
6743       else if (! NILP (current_buffer->enable_multibyte_characters))
6744         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
6745                           saved_pt_byte + (coding->produced - bytes));
6746       else
6747         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
6748                           saved_pt_byte + (coding->produced - bytes));
6749     }
6750
6751   unbind_to (count, Qnil);
6752 }
6753
6754
6755 Lisp_Object
6756 preferred_coding_system ()
6757 {
6758   int id = coding_categories[coding_priorities[0]].id;
6759
6760   return CODING_ID_NAME (id);
6761 }
6762
6763 \f
6764 #ifdef emacs
6765 /*** 8. Emacs Lisp library functions ***/
6766
6767 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
6768        doc: /* Return t if OBJECT is nil or a coding-system.
6769 See the documentation of `define-coding-system' for information
6770 about coding-system objects.  */)
6771      (obj)
6772      Lisp_Object obj;
6773 {
6774   return ((NILP (obj) || CODING_SYSTEM_P (obj)) ? Qt : Qnil);
6775 }
6776
6777 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
6778        Sread_non_nil_coding_system, 1, 1, 0,
6779        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.  */)
6780      (prompt)
6781      Lisp_Object prompt;
6782 {
6783   Lisp_Object val;
6784   do
6785     {
6786       val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6787                               Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
6788     }
6789   while (SCHARS (val) == 0);
6790   return (Fintern (val, Qnil));
6791 }
6792
6793 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
6794        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
6795 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.  */)
6796      (prompt, default_coding_system)
6797      Lisp_Object prompt, default_coding_system;
6798 {
6799   Lisp_Object val;
6800   if (SYMBOLP (default_coding_system))
6801     XSETSTRING (default_coding_system, XPNTR (SYMBOL_NAME (default_coding_system)));
6802   val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6803                           Qt, Qnil, Qcoding_system_history,
6804                           default_coding_system, Qnil);
6805   return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
6806 }
6807
6808 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
6809        1, 1, 0,
6810        doc: /* Check validity of CODING-SYSTEM.
6811 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.  */)
6812   (coding_system)
6813      Lisp_Object coding_system;
6814 {
6815   CHECK_SYMBOL (coding_system);
6816   if (!NILP (Fcoding_system_p (coding_system)))
6817     return coding_system;
6818   while (1)
6819     Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
6820 }
6821
6822 \f
6823 /* Detect how the bytes at SRC of length SRC_BYTES are encoded.  If
6824    HIGHEST is nonzero, return the coding system of the highest
6825    priority among the detected coding systems.  Otherwize return a
6826    list of detected coding systems sorted by their priorities.  If
6827    MULTIBYTEP is nonzero, it is assumed that the bytes are in correct
6828    multibyte form but contains only ASCII and eight-bit chars.
6829    Otherwise, the bytes are raw bytes.
6830
6831    CODING-SYSTEM controls the detection as below:
6832
6833    If it is nil, detect both text-format and eol-format.  If the
6834    text-format part of CODING-SYSTEM is already specified
6835    (e.g. `iso-latin-1'), detect only eol-format.  If the eol-format
6836    part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
6837    detect only text-format.  */
6838
6839 Lisp_Object
6840 detect_coding_system (src, src_chars, src_bytes, highest, multibytep,
6841                       coding_system)
6842      const unsigned char *src;
6843      int src_chars, src_bytes, highest;
6844      int multibytep;
6845      Lisp_Object coding_system;
6846 {
6847   const unsigned char *src_end = src + src_bytes;
6848   Lisp_Object attrs, eol_type;
6849   Lisp_Object val;
6850   struct coding_system coding;
6851   int id;
6852   struct coding_detection_info detect_info;
6853   enum coding_category base_category;
6854
6855   if (NILP (coding_system))
6856     coding_system = Qundecided;
6857   setup_coding_system (coding_system, &coding);
6858   attrs = CODING_ID_ATTRS (coding.id);
6859   eol_type = CODING_ID_EOL_TYPE (coding.id);
6860   coding_system = CODING_ATTR_BASE_NAME (attrs);
6861
6862   coding.source = src;
6863   coding.src_chars = src_chars;
6864   coding.src_bytes = src_bytes;
6865   coding.src_multibyte = multibytep;
6866   coding.consumed = 0;
6867   coding.mode |= CODING_MODE_LAST_BLOCK;
6868
6869   detect_info.checked = detect_info.found = detect_info.rejected = 0;
6870
6871   /* At first, detect text-format if necessary.  */
6872   base_category = XINT (CODING_ATTR_CATEGORY (attrs));
6873   if (base_category == coding_category_undecided)
6874     {
6875       enum coding_category category;
6876       struct coding_system *this;
6877       int c, i;
6878
6879       /* Skip all ASCII bytes except for a few ISO2022 controls.  */
6880       for (i = 0; src < src_end; i++, src++)
6881         {
6882           c = *src;
6883           if (c & 0x80 || (c < 0x20 && (c == ISO_CODE_ESC
6884                                         || c == ISO_CODE_SI
6885                                         || c == ISO_CODE_SO)))
6886             break;
6887         }
6888       coding.head_ascii = src - coding.source;
6889
6890       if (src < src_end)
6891         for (i = 0; i < coding_category_raw_text; i++)
6892           {
6893             category = coding_priorities[i];
6894             this = coding_categories + category;
6895
6896             if (this->id < 0)
6897               {
6898                 /* No coding system of this category is defined.  */
6899                 detect_info.rejected |= (1 << category);
6900               }
6901             else if (category >= coding_category_raw_text)
6902               continue;
6903             else if (detect_info.checked & (1 << category))
6904               {
6905                 if (highest
6906                     && (detect_info.found & (1 << category)))
6907                   break;
6908               }
6909             else
6910               {
6911                 if ((*(this->detector)) (&coding, &detect_info)
6912                     && highest
6913                     && (detect_info.found & (1 << category)))
6914                   {
6915                     if (category == coding_category_utf_16_auto)
6916                       {
6917                         if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6918                           category = coding_category_utf_16_le;
6919                         else
6920                           category = coding_category_utf_16_be;
6921                       }
6922                     break;
6923                   }
6924               }
6925           }
6926
6927       if (detect_info.rejected == CATEGORY_MASK_ANY)
6928         {
6929           detect_info.found = CATEGORY_MASK_RAW_TEXT;
6930           id = coding_categories[coding_category_raw_text].id;
6931           val = Fcons (make_number (id), Qnil);
6932         }
6933       else if (! detect_info.rejected && ! detect_info.found)
6934         {
6935           detect_info.found = CATEGORY_MASK_ANY;
6936           id = coding_categories[coding_category_undecided].id;
6937           val = Fcons (make_number (id), Qnil);
6938         }
6939       else if (highest)
6940         {
6941           if (detect_info.found)
6942             {
6943               detect_info.found = 1 << category;
6944               val = Fcons (make_number (this->id), Qnil);
6945             }
6946           else
6947             for (i = 0; i < coding_category_raw_text; i++)
6948               if (! (detect_info.rejected & (1 << coding_priorities[i])))
6949                 {
6950                   detect_info.found = 1 << coding_priorities[i];
6951                   id = coding_categories[coding_priorities[i]].id;
6952                   val = Fcons (make_number (id), Qnil);
6953                   break;
6954                 }
6955         }
6956       else
6957         {
6958           int mask = detect_info.rejected | detect_info.found;
6959           int found = 0;
6960           val = Qnil;
6961
6962           for (i = coding_category_raw_text - 1; i >= 0; i--)
6963             {
6964               category = coding_priorities[i];
6965               if (! (mask & (1 << category)))
6966                 {
6967                   found |= 1 << category;
6968                   id = coding_categories[category].id;
6969                   val = Fcons (make_number (id), val);
6970                 }
6971             }
6972           for (i = coding_category_raw_text - 1; i >= 0; i--)
6973             {
6974               category = coding_priorities[i];
6975               if (detect_info.found & (1 << category))
6976                 {
6977                   id = coding_categories[category].id;
6978                   val = Fcons (make_number (id), val);
6979                 }
6980             }
6981           detect_info.found |= found;
6982         }
6983     }
6984   else if (base_category == coding_category_utf_16_auto)
6985     {
6986       if (detect_coding_utf_16 (&coding, &detect_info))
6987         {
6988           enum coding_category category;
6989           struct coding_system *this;
6990
6991           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6992             this = coding_categories + coding_category_utf_16_le;
6993           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6994             this = coding_categories + coding_category_utf_16_be;
6995           else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
6996             this = coding_categories + coding_category_utf_16_be_nosig;
6997           else
6998             this = coding_categories + coding_category_utf_16_le_nosig;
6999           val = Fcons (make_number (this->id), Qnil);
7000         }
7001     }
7002   else
7003     {
7004       detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
7005       val = Fcons (make_number (coding.id), Qnil);
7006     }
7007
7008   /* Then, detect eol-format if necessary.  */
7009   {
7010     int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol;
7011     Lisp_Object tail;
7012
7013     if (VECTORP (eol_type))
7014       {
7015         if (detect_info.found & ~CATEGORY_MASK_UTF_16)
7016           normal_eol = detect_eol (coding.source, src_bytes,
7017                                    coding_category_raw_text);
7018         if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
7019                                  | CATEGORY_MASK_UTF_16_BE_NOSIG))
7020           utf_16_be_eol = detect_eol (coding.source, src_bytes,
7021                                       coding_category_utf_16_be);
7022         if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
7023                                  | CATEGORY_MASK_UTF_16_LE_NOSIG))
7024           utf_16_le_eol = detect_eol (coding.source, src_bytes,
7025                                       coding_category_utf_16_le);
7026       }
7027     else
7028       {
7029         if (EQ (eol_type, Qunix))
7030           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
7031         else if (EQ (eol_type, Qdos))
7032           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
7033         else
7034           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
7035       }
7036
7037     for (tail = val; CONSP (tail); tail = XCDR (tail))
7038       {
7039         enum coding_category category;
7040         int this_eol;
7041
7042         id = XINT (XCAR (tail));
7043         attrs = CODING_ID_ATTRS (id);
7044         category = XINT (CODING_ATTR_CATEGORY (attrs));
7045         eol_type = CODING_ID_EOL_TYPE (id);
7046         if (VECTORP (eol_type))
7047           {
7048             if (category == coding_category_utf_16_be
7049                 || category == coding_category_utf_16_be_nosig)
7050               this_eol = utf_16_be_eol;
7051             else if (category == coding_category_utf_16_le
7052                      || category == coding_category_utf_16_le_nosig)
7053               this_eol = utf_16_le_eol;
7054             else
7055               this_eol = normal_eol;
7056
7057             if (this_eol == EOL_SEEN_LF)
7058               XSETCAR (tail, AREF (eol_type, 0));
7059             else if (this_eol == EOL_SEEN_CRLF)
7060               XSETCAR (tail, AREF (eol_type, 1));
7061             else if (this_eol == EOL_SEEN_CR)
7062               XSETCAR (tail, AREF (eol_type, 2));
7063             else
7064               XSETCAR (tail, CODING_ID_NAME (id));
7065           }
7066         else
7067           XSETCAR (tail, CODING_ID_NAME (id));
7068       }
7069   }
7070
7071   return (highest ? XCAR (val) : val);
7072 }
7073
7074
7075 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
7076        2, 3, 0,
7077        doc: /* Detect coding system of the text in the region between START and END.
7078 Return a list of possible coding systems ordered by priority.
7079
7080 If only ASCII characters are found, it returns a list of single element
7081 `undecided' or its subsidiary coding system according to a detected
7082 end-of-line format.
7083
7084 If optional argument HIGHEST is non-nil, return the coding system of
7085 highest priority.  */)
7086      (start, end, highest)
7087      Lisp_Object start, end, highest;
7088 {
7089   int from, to;
7090   int from_byte, to_byte;
7091
7092   CHECK_NUMBER_COERCE_MARKER (start);
7093   CHECK_NUMBER_COERCE_MARKER (end);
7094
7095   validate_region (&start, &end);
7096   from = XINT (start), to = XINT (end);
7097   from_byte = CHAR_TO_BYTE (from);
7098   to_byte = CHAR_TO_BYTE (to);
7099
7100   if (from < GPT && to >= GPT)
7101     move_gap_both (to, to_byte);
7102
7103   return detect_coding_system (BYTE_POS_ADDR (from_byte),
7104                                to - from, to_byte - from_byte,
7105                                !NILP (highest),
7106                                !NILP (current_buffer
7107                                       ->enable_multibyte_characters),
7108                                Qnil);
7109 }
7110
7111 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
7112        1, 2, 0,
7113        doc: /* Detect coding system of the text in STRING.
7114 Return a list of possible coding systems ordered by priority.
7115
7116 If only ASCII characters are found, it returns a list of single element
7117 `undecided' or its subsidiary coding system according to a detected
7118 end-of-line format.
7119
7120 If optional argument HIGHEST is non-nil, return the coding system of
7121 highest priority.  */)
7122      (string, highest)
7123      Lisp_Object string, highest;
7124 {
7125   CHECK_STRING (string);
7126
7127   return detect_coding_system (SDATA (string),
7128                                SCHARS (string), SBYTES (string),
7129                                !NILP (highest), STRING_MULTIBYTE (string),
7130                                Qnil);
7131 }
7132
7133
7134 static INLINE int
7135 char_encodable_p (c, attrs)
7136      int c;
7137      Lisp_Object attrs;
7138 {
7139   Lisp_Object tail;
7140   struct charset *charset;
7141   Lisp_Object translation_table;
7142
7143   translation_table = CODING_ATTR_TRANS_TBL (attrs);
7144   if (! NILP (translation_table))
7145     c = translate_char (translation_table, c);
7146   for (tail = CODING_ATTR_CHARSET_LIST (attrs);
7147        CONSP (tail); tail = XCDR (tail))
7148     {
7149       charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
7150       if (CHAR_CHARSET_P (c, charset))
7151         break;
7152     }
7153   return (! NILP (tail));
7154 }
7155
7156
7157 /* Return a list of coding systems that safely encode the text between
7158    START and END.  If EXCLUDE is non-nil, it is a list of coding
7159    systems not to check.  The returned list doesn't contain any such
7160    coding systems.  In any case, if the text contains only ASCII or is
7161    unibyte, return t.  */
7162
7163 DEFUN ("find-coding-systems-region-internal",
7164        Ffind_coding_systems_region_internal,
7165        Sfind_coding_systems_region_internal, 2, 3, 0,
7166        doc: /* Internal use only.  */)
7167      (start, end, exclude)
7168      Lisp_Object start, end, exclude;
7169 {
7170   Lisp_Object coding_attrs_list, safe_codings;
7171   EMACS_INT start_byte, end_byte;
7172   const unsigned char *p, *pbeg, *pend;
7173   int c;
7174   Lisp_Object tail, elt;
7175
7176   if (STRINGP (start))
7177     {
7178       if (!STRING_MULTIBYTE (start)
7179           || SCHARS (start) == SBYTES (start))
7180         return Qt;
7181       start_byte = 0;
7182       end_byte = SBYTES (start);
7183     }
7184   else
7185     {
7186       CHECK_NUMBER_COERCE_MARKER (start);
7187       CHECK_NUMBER_COERCE_MARKER (end);
7188       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
7189         args_out_of_range (start, end);
7190       if (NILP (current_buffer->enable_multibyte_characters))
7191         return Qt;
7192       start_byte = CHAR_TO_BYTE (XINT (start));
7193       end_byte = CHAR_TO_BYTE (XINT (end));
7194       if (XINT (end) - XINT (start) == end_byte - start_byte)
7195         return Qt;
7196
7197       if (XINT (start) < GPT && XINT (end) > GPT)
7198         {
7199           if ((GPT - XINT (start)) < (XINT (end) - GPT))
7200             move_gap_both (XINT (start), start_byte);
7201           else
7202             move_gap_both (XINT (end), end_byte);
7203         }
7204     }
7205
7206   coding_attrs_list = Qnil;
7207   for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
7208     if (NILP (exclude)
7209         || NILP (Fmemq (XCAR (tail), exclude)))
7210       {
7211         Lisp_Object attrs;
7212
7213         attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
7214         if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs))
7215             && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
7216           {
7217             ASET (attrs, coding_attr_trans_tbl,
7218                   get_translation_table (attrs, 1));
7219             coding_attrs_list = Fcons (attrs, coding_attrs_list);
7220           }
7221       }
7222
7223   if (STRINGP (start))
7224     p = pbeg = SDATA (start);
7225   else
7226     p = pbeg = BYTE_POS_ADDR (start_byte);
7227   pend = p + (end_byte - start_byte);
7228
7229   while (p < pend && ASCII_BYTE_P (*p)) p++;
7230   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
7231
7232   while (p < pend)
7233     {
7234       if (ASCII_BYTE_P (*p))
7235         p++;
7236       else
7237         {
7238           c = STRING_CHAR_ADVANCE (p);
7239
7240           charset_map_loaded = 0;
7241           for (tail = coding_attrs_list; CONSP (tail);)
7242             {
7243               elt = XCAR (tail);
7244               if (NILP (elt))
7245                 tail = XCDR (tail);
7246               else if (char_encodable_p (c, elt))
7247                 tail = XCDR (tail);
7248               else if (CONSP (XCDR (tail)))
7249                 {
7250                   XSETCAR (tail, XCAR (XCDR (tail)));
7251                   XSETCDR (tail, XCDR (XCDR (tail)));
7252                 }
7253               else
7254                 {
7255                   XSETCAR (tail, Qnil);
7256                   tail = XCDR (tail);
7257                 }
7258             }
7259           if (charset_map_loaded)
7260             {
7261               EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
7262
7263               if (STRINGP (start))
7264                 pbeg = SDATA (start);
7265               else
7266                 pbeg = BYTE_POS_ADDR (start_byte);
7267               p = pbeg + p_offset;
7268               pend = pbeg + pend_offset;
7269             }
7270         }
7271     }
7272
7273   safe_codings = Qnil;
7274   for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
7275     if (! NILP (XCAR (tail)))
7276       safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
7277
7278   return safe_codings;
7279 }
7280
7281
7282 DEFUN ("unencodable-char-position", Funencodable_char_position,
7283        Sunencodable_char_position, 3, 5, 0,
7284        doc: /*
7285 Return position of first un-encodable character in a region.
7286 START and END specfiy the region and CODING-SYSTEM specifies the
7287 encoding to check.  Return nil if CODING-SYSTEM does encode the region.
7288
7289 If optional 4th argument COUNT is non-nil, it specifies at most how
7290 many un-encodable characters to search.  In this case, the value is a
7291 list of positions.
7292
7293 If optional 5th argument STRING is non-nil, it is a string to search
7294 for un-encodable characters.  In that case, START and END are indexes
7295 to the string.  */)
7296      (start, end, coding_system, count, string)
7297      Lisp_Object start, end, coding_system, count, string;
7298 {
7299   int n;
7300   struct coding_system coding;
7301   Lisp_Object attrs, charset_list, translation_table;
7302   Lisp_Object positions;
7303   int from, to;
7304   const unsigned char *p, *stop, *pend;
7305   int ascii_compatible;
7306
7307   setup_coding_system (Fcheck_coding_system (coding_system), &coding);
7308   attrs = CODING_ID_ATTRS (coding.id);
7309   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
7310     return Qnil;
7311   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
7312   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
7313   translation_table = get_translation_table (attrs, 1);
7314
7315   if (NILP (string))
7316     {
7317       validate_region (&start, &end);
7318       from = XINT (start);
7319       to = XINT (end);
7320       if (NILP (current_buffer->enable_multibyte_characters)
7321           || (ascii_compatible
7322               && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
7323         return Qnil;
7324       p = CHAR_POS_ADDR (from);
7325       pend = CHAR_POS_ADDR (to);
7326       if (from < GPT && to >= GPT)
7327         stop = GPT_ADDR;
7328       else
7329         stop = pend;
7330     }
7331   else
7332     {
7333       CHECK_STRING (string);
7334       CHECK_NATNUM (start);
7335       CHECK_NATNUM (end);
7336       from = XINT (start);
7337       to = XINT (end);
7338       if (from > to
7339           || to > SCHARS (string))
7340         args_out_of_range_3 (string, start, end);
7341       if (! STRING_MULTIBYTE (string))
7342         return Qnil;
7343       p = SDATA (string) + string_char_to_byte (string, from);
7344       stop = pend = SDATA (string) + string_char_to_byte (string, to);
7345       if (ascii_compatible && (to - from) == (pend - p))
7346         return Qnil;
7347     }
7348
7349   if (NILP (count))
7350     n = 1;
7351   else
7352     {
7353       CHECK_NATNUM (count);
7354       n = XINT (count);
7355     }
7356
7357   positions = Qnil;
7358   while (1)
7359     {
7360       int c;
7361
7362       if (ascii_compatible)
7363         while (p < stop && ASCII_BYTE_P (*p))
7364           p++, from++;
7365       if (p >= stop)
7366         {
7367           if (p >= pend)
7368             break;
7369           stop = pend;
7370           p = GAP_END_ADDR;
7371         }
7372
7373       c = STRING_CHAR_ADVANCE (p);
7374       if (! (ASCII_CHAR_P (c) && ascii_compatible)
7375           && ! char_charset (translate_char (translation_table, c),
7376                              charset_list, NULL))
7377         {
7378           positions = Fcons (make_number (from), positions);
7379           n--;
7380           if (n == 0)
7381             break;
7382         }
7383
7384       from++;
7385     }
7386
7387   return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
7388 }
7389
7390
7391 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
7392        Scheck_coding_systems_region, 3, 3, 0,
7393        doc: /* Check if the region is encodable by coding systems.
7394
7395 START and END are buffer positions specifying the region.
7396 CODING-SYSTEM-LIST is a list of coding systems to check.
7397
7398 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
7399 CODING-SYSTEM is a member of CODING-SYSTEM-LIst and can't encode the
7400 whole region, POS0, POS1, ... are buffer positions where non-encodable
7401 characters are found.
7402
7403 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
7404 value is nil.
7405
7406 START may be a string.  In that case, check if the string is
7407 encodable, and the value contains indices to the string instead of
7408 buffer positions.  END is ignored.  */)
7409      (start, end, coding_system_list)
7410      Lisp_Object start, end, coding_system_list;
7411 {
7412   Lisp_Object list;
7413   EMACS_INT start_byte, end_byte;
7414   int pos;
7415   const unsigned char *p, *pbeg, *pend;
7416   int c;
7417   Lisp_Object tail, elt, attrs;
7418
7419   if (STRINGP (start))
7420     {
7421       if (!STRING_MULTIBYTE (start)
7422           && SCHARS (start) != SBYTES (start))
7423         return Qnil;
7424       start_byte = 0;
7425       end_byte = SBYTES (start);
7426       pos = 0;
7427     }
7428   else
7429     {
7430       CHECK_NUMBER_COERCE_MARKER (start);
7431       CHECK_NUMBER_COERCE_MARKER (end);
7432       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
7433         args_out_of_range (start, end);
7434       if (NILP (current_buffer->enable_multibyte_characters))
7435         return Qnil;
7436       start_byte = CHAR_TO_BYTE (XINT (start));
7437       end_byte = CHAR_TO_BYTE (XINT (end));
7438       if (XINT (end) - XINT (start) == end_byte - start_byte)
7439         return Qt;
7440
7441       if (XINT (start) < GPT && XINT (end) > GPT)
7442         {
7443           if ((GPT - XINT (start)) < (XINT (end) - GPT))
7444             move_gap_both (XINT (start), start_byte);
7445           else
7446             move_gap_both (XINT (end), end_byte);
7447         }
7448       pos = XINT (start);
7449     }
7450
7451   list = Qnil;
7452   for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
7453     {
7454       elt = XCAR (tail);
7455       attrs = AREF (CODING_SYSTEM_SPEC (elt), 0);
7456       ASET (attrs, coding_attr_trans_tbl, get_translation_table (attrs, 1));
7457       list = Fcons (Fcons (elt, Fcons (attrs, Qnil)), list);
7458     }
7459
7460   if (STRINGP (start))
7461     p = pbeg = SDATA (start);
7462   else
7463     p = pbeg = BYTE_POS_ADDR (start_byte);
7464   pend = p + (end_byte - start_byte);
7465
7466   while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
7467   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
7468
7469   while (p < pend)
7470     {
7471       if (ASCII_BYTE_P (*p))
7472         p++;
7473       else
7474         {
7475           c = STRING_CHAR_ADVANCE (p);
7476
7477           charset_map_loaded = 0;
7478           for (tail = list; CONSP (tail); tail = XCDR (tail))
7479             {
7480               elt = XCDR (XCAR (tail));
7481               if (! char_encodable_p (c, XCAR (elt)))
7482                 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
7483             }
7484           if (charset_map_loaded)
7485             {
7486               EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
7487
7488               if (STRINGP (start))
7489                 pbeg = SDATA (start);
7490               else
7491                 pbeg = BYTE_POS_ADDR (start_byte);
7492               p = pbeg + p_offset;
7493               pend = pbeg + pend_offset;
7494             }
7495         }
7496       pos++;
7497     }
7498
7499   tail = list;
7500   list = Qnil;
7501   for (; CONSP (tail); tail = XCDR (tail))
7502     {
7503       elt = XCAR (tail);
7504       if (CONSP (XCDR (XCDR (elt))))
7505         list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
7506                       list);
7507     }
7508
7509   return list;
7510 }
7511
7512
7513 Lisp_Object
7514 code_convert_region (start, end, coding_system, dst_object, encodep, norecord)
7515      Lisp_Object start, end, coding_system, dst_object;
7516      int encodep, norecord;
7517 {
7518   struct coding_system coding;
7519   EMACS_INT from, from_byte, to, to_byte;
7520   Lisp_Object src_object;
7521
7522   CHECK_NUMBER_COERCE_MARKER (start);
7523   CHECK_NUMBER_COERCE_MARKER (end);
7524   if (NILP (coding_system))
7525     coding_system = Qno_conversion;
7526   else
7527     CHECK_CODING_SYSTEM (coding_system);
7528   src_object = Fcurrent_buffer ();
7529   if (NILP (dst_object))
7530     dst_object = src_object;
7531   else if (! EQ (dst_object, Qt))
7532     CHECK_BUFFER (dst_object);
7533
7534   validate_region (&start, &end);
7535   from = XFASTINT (start);
7536   from_byte = CHAR_TO_BYTE (from);
7537   to = XFASTINT (end);
7538   to_byte = CHAR_TO_BYTE (to);
7539
7540   setup_coding_system (coding_system, &coding);
7541   coding.mode |= CODING_MODE_LAST_BLOCK;
7542
7543   if (encodep)
7544     encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
7545                           dst_object);
7546   else
7547     decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
7548                           dst_object);
7549   if (! norecord)
7550     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
7551
7552   return (BUFFERP (dst_object)
7553           ? make_number (coding.produced_char)
7554           : coding.dst_object);
7555 }
7556
7557
7558 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
7559        3, 4, "r\nzCoding system: ",
7560        doc: /* Decode the current region from the specified coding system.
7561 When called from a program, takes four arguments:
7562         START, END, CODING-SYSTEM, and DESTINATION.
7563 START and END are buffer positions.
7564
7565 Optional 4th arguments DESTINATION specifies where the decoded text goes.
7566 If nil, the region between START and END is replace by the decoded text.
7567 If buffer, the decoded text is inserted in the buffer.
7568 If t, the decoded text is returned.
7569
7570 This function sets `last-coding-system-used' to the precise coding system
7571 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7572 not fully specified.)
7573 It returns the length of the decoded text.  */)
7574      (start, end, coding_system, destination)
7575      Lisp_Object start, end, coding_system, destination;
7576 {
7577   return code_convert_region (start, end, coding_system, destination, 0, 0);
7578 }
7579
7580 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
7581        3, 4, "r\nzCoding system: ",
7582        doc: /* Encode the current region by specified coding system.
7583 When called from a program, takes three arguments:
7584 START, END, and CODING-SYSTEM.  START and END are buffer positions.
7585
7586 Optional 4th arguments DESTINATION specifies where the encoded text goes.
7587 If nil, the region between START and END is replace by the encoded text.
7588 If buffer, the encoded text is inserted in the buffer.
7589 If t, the encoded text is returned.
7590
7591 This function sets `last-coding-system-used' to the precise coding system
7592 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7593 not fully specified.)
7594 It returns the length of the encoded text.  */)
7595   (start, end, coding_system, destination)
7596      Lisp_Object start, end, coding_system, destination;
7597 {
7598   return code_convert_region (start, end, coding_system, destination, 1, 0);
7599 }
7600
7601 Lisp_Object
7602 code_convert_string (string, coding_system, dst_object,
7603                      encodep, nocopy, norecord)
7604      Lisp_Object string, coding_system, dst_object;
7605      int encodep, nocopy, norecord;
7606 {
7607   struct coding_system coding;
7608   EMACS_INT chars, bytes;
7609
7610   CHECK_STRING (string);
7611   if (NILP (coding_system))
7612     {
7613       if (! norecord)
7614         Vlast_coding_system_used = Qno_conversion;
7615       if (NILP (dst_object))
7616         return (nocopy ? Fcopy_sequence (string) : string);
7617     }
7618
7619   if (NILP (coding_system))
7620     coding_system = Qno_conversion;
7621   else
7622     CHECK_CODING_SYSTEM (coding_system);
7623   if (NILP (dst_object))
7624     dst_object = Qt;
7625   else if (! EQ (dst_object, Qt))
7626     CHECK_BUFFER (dst_object);
7627
7628   setup_coding_system (coding_system, &coding);
7629   coding.mode |= CODING_MODE_LAST_BLOCK;
7630   chars = SCHARS (string);
7631   bytes = SBYTES (string);
7632   if (encodep)
7633     encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
7634   else
7635     decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
7636   if (! norecord)
7637     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
7638
7639   return (BUFFERP (dst_object)
7640           ? make_number (coding.produced_char)
7641           : coding.dst_object);
7642 }
7643
7644
7645 /* Encode or decode STRING according to CODING_SYSTEM.
7646    Do not set Vlast_coding_system_used.
7647
7648    This function is called only from macros DECODE_FILE and
7649    ENCODE_FILE, thus we ignore character composition.  */
7650
7651 Lisp_Object
7652 code_convert_string_norecord (string, coding_system, encodep)
7653      Lisp_Object string, coding_system;
7654      int encodep;
7655 {
7656   return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
7657 }
7658
7659
7660 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
7661        2, 4, 0,
7662        doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
7663
7664 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
7665 if the decoding operation is trivial.
7666
7667 Optional fourth arg BUFFER non-nil meant that the decoded text is
7668 inserted in BUFFER instead of returned as a string.  In this case,
7669 the return value is BUFFER.
7670
7671 This function sets `last-coding-system-used' to the precise coding system
7672 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7673 not fully specified.  */)
7674   (string, coding_system, nocopy, buffer)
7675      Lisp_Object string, coding_system, nocopy, buffer;
7676 {
7677   return code_convert_string (string, coding_system, buffer,
7678                               0, ! NILP (nocopy), 0);
7679 }
7680
7681 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
7682        2, 4, 0,
7683        doc: /* Encode STRING to CODING-SYSTEM, and return the result.
7684
7685 Optional third arg NOCOPY non-nil means it is OK to return STRING
7686 itself if the encoding operation is trivial.
7687
7688 Optional fourth arg BUFFER non-nil meant that the encoded text is
7689 inserted in BUFFER instead of returned as a string.  In this case,
7690 the return value is BUFFER.
7691
7692 This function sets `last-coding-system-used' to the precise coding system
7693 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7694 not fully specified.)  */)
7695      (string, coding_system, nocopy, buffer)
7696      Lisp_Object string, coding_system, nocopy, buffer;
7697 {
7698   return code_convert_string (string, coding_system, buffer,
7699                               1, ! NILP (nocopy), 1);
7700 }
7701
7702 \f
7703 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
7704        doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
7705 Return the corresponding character.  */)
7706      (code)
7707      Lisp_Object code;
7708 {
7709   Lisp_Object spec, attrs, val;
7710   struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
7711   int c;
7712
7713   CHECK_NATNUM (code);
7714   c = XFASTINT (code);
7715   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
7716   attrs = AREF (spec, 0);
7717
7718   if (ASCII_BYTE_P (c)
7719       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
7720     return code;
7721
7722   val = CODING_ATTR_CHARSET_LIST (attrs);
7723   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
7724   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
7725   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
7726
7727   if (c <= 0x7F)
7728     charset = charset_roman;
7729   else if (c >= 0xA0 && c < 0xDF)
7730     {
7731       charset = charset_kana;
7732       c -= 0x80;
7733     }
7734   else
7735     {
7736       int s1 = c >> 8, s2 = c & 0xFF;
7737
7738       if (s1 < 0x81 || (s1 > 0x9F && s1 < 0xE0) || s1 > 0xEF
7739           || s2 < 0x40 || s2 == 0x7F || s2 > 0xFC)
7740         error ("Invalid code: %d", code);
7741       SJIS_TO_JIS (c);
7742       charset = charset_kanji;
7743     }
7744   c = DECODE_CHAR (charset, c);
7745   if (c < 0)
7746     error ("Invalid code: %d", code);
7747   return make_number (c);
7748 }
7749
7750
7751 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
7752        doc: /* Encode a Japanese character CHAR to shift_jis encoding.
7753 Return the corresponding code in SJIS.  */)
7754      (ch)
7755     Lisp_Object ch;
7756 {
7757   Lisp_Object spec, attrs, charset_list;
7758   int c;
7759   struct charset *charset;
7760   unsigned code;
7761
7762   CHECK_CHARACTER (ch);
7763   c = XFASTINT (ch);
7764   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
7765   attrs = AREF (spec, 0);
7766
7767   if (ASCII_CHAR_P (c)
7768       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
7769     return ch;
7770
7771   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
7772   charset = char_charset (c, charset_list, &code);
7773   if (code == CHARSET_INVALID_CODE (charset))
7774     error ("Can't encode by shift_jis encoding: %d", c);
7775   JIS_TO_SJIS (code);
7776
7777   return make_number (code);
7778 }
7779
7780 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
7781        doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
7782 Return the corresponding character.  */)
7783      (code)
7784      Lisp_Object code;
7785 {
7786   Lisp_Object spec, attrs, val;
7787   struct charset *charset_roman, *charset_big5, *charset;
7788   int c;
7789
7790   CHECK_NATNUM (code);
7791   c = XFASTINT (code);
7792   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
7793   attrs = AREF (spec, 0);
7794
7795   if (ASCII_BYTE_P (c)
7796       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
7797     return code;
7798
7799   val = CODING_ATTR_CHARSET_LIST (attrs);
7800   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
7801   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
7802
7803   if (c <= 0x7F)
7804     charset = charset_roman;
7805   else
7806     {
7807       int b1 = c >> 8, b2 = c & 0x7F;
7808       if (b1 < 0xA1 || b1 > 0xFE
7809           || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
7810         error ("Invalid code: %d", code);
7811       charset = charset_big5;
7812     }
7813   c = DECODE_CHAR (charset, (unsigned )c);
7814   if (c < 0)
7815     error ("Invalid code: %d", code);
7816   return make_number (c);
7817 }
7818
7819 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
7820        doc: /* Encode the Big5 character CHAR to BIG5 coding system.
7821 Return the corresponding character code in Big5.  */)
7822      (ch)
7823      Lisp_Object ch;
7824 {
7825   Lisp_Object spec, attrs, charset_list;
7826   struct charset *charset;
7827   int c;
7828   unsigned code;
7829
7830   CHECK_CHARACTER (ch);
7831   c = XFASTINT (ch);
7832   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
7833   attrs = AREF (spec, 0);
7834   if (ASCII_CHAR_P (c)
7835       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
7836     return ch;
7837
7838   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
7839   charset = char_charset (c, charset_list, &code);
7840   if (code == CHARSET_INVALID_CODE (charset))
7841     error ("Can't encode by Big5 encoding: %d", c);
7842
7843   return make_number (code);
7844 }
7845
7846 \f
7847 DEFUN ("set-terminal-coding-system-internal",
7848        Fset_terminal_coding_system_internal,
7849        Sset_terminal_coding_system_internal, 1, 1, 0,
7850        doc: /* Internal use only.  */)
7851      (coding_system)
7852      Lisp_Object coding_system;
7853 {
7854   CHECK_SYMBOL (coding_system);
7855   setup_coding_system (Fcheck_coding_system (coding_system),
7856                         &terminal_coding);
7857
7858   /* We had better not send unsafe characters to terminal.  */
7859   terminal_coding.mode |= CODING_MODE_SAFE_ENCODING;
7860   /* Characer composition should be disabled.  */
7861   terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7862   terminal_coding.src_multibyte = 1;
7863   terminal_coding.dst_multibyte = 0;
7864   return Qnil;
7865 }
7866
7867 DEFUN ("set-safe-terminal-coding-system-internal",
7868        Fset_safe_terminal_coding_system_internal,
7869        Sset_safe_terminal_coding_system_internal, 1, 1, 0,
7870        doc: /* Internal use only.  */)
7871      (coding_system)
7872      Lisp_Object coding_system;
7873 {
7874   CHECK_SYMBOL (coding_system);
7875   setup_coding_system (Fcheck_coding_system (coding_system),
7876                        &safe_terminal_coding);
7877   /* Characer composition should be disabled.  */
7878   safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7879   safe_terminal_coding.src_multibyte = 1;
7880   safe_terminal_coding.dst_multibyte = 0;
7881   return Qnil;
7882 }
7883
7884 DEFUN ("terminal-coding-system",
7885        Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
7886        doc: /* Return coding system specified for terminal output.  */)
7887      ()
7888 {
7889   return CODING_ID_NAME (terminal_coding.id);
7890 }
7891
7892 DEFUN ("set-keyboard-coding-system-internal",
7893        Fset_keyboard_coding_system_internal,
7894        Sset_keyboard_coding_system_internal, 1, 1, 0,
7895        doc: /* Internal use only.  */)
7896      (coding_system)
7897      Lisp_Object coding_system;
7898 {
7899   CHECK_SYMBOL (coding_system);
7900   setup_coding_system (Fcheck_coding_system (coding_system),
7901                        &keyboard_coding);
7902   /* Characer composition should be disabled.  */
7903   keyboard_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7904   return Qnil;
7905 }
7906
7907 DEFUN ("keyboard-coding-system",
7908        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
7909        doc: /* Return coding system specified for decoding keyboard input.  */)
7910      ()
7911 {
7912   return CODING_ID_NAME (keyboard_coding.id);
7913 }
7914
7915 \f
7916 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
7917        Sfind_operation_coding_system,  1, MANY, 0,
7918        doc: /* Choose a coding system for an operation based on the target name.
7919 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
7920 DECODING-SYSTEM is the coding system to use for decoding
7921 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
7922 for encoding (in case OPERATION does encoding).
7923
7924 The first argument OPERATION specifies an I/O primitive:
7925   For file I/O, `insert-file-contents' or `write-region'.
7926   For process I/O, `call-process', `call-process-region', or `start-process'.
7927   For network I/O, `open-network-stream'.
7928
7929 The remaining arguments should be the same arguments that were passed
7930 to the primitive.  Depending on which primitive, one of those arguments
7931 is selected as the TARGET.  For example, if OPERATION does file I/O,
7932 whichever argument specifies the file name is TARGET.
7933
7934 TARGET has a meaning which depends on OPERATION:
7935   For file I/O, TARGET is a file name.
7936   For process I/O, TARGET is a process name.
7937   For network I/O, TARGET is a service name or a port number
7938
7939 This function looks up what specified for TARGET in,
7940 `file-coding-system-alist', `process-coding-system-alist',
7941 or `network-coding-system-alist' depending on OPERATION.
7942 They may specify a coding system, a cons of coding systems,
7943 or a function symbol to call.
7944 In the last case, we call the function with one argument,
7945 which is a list of all the arguments given to this function.
7946
7947 usage: (find-operation-coding-system OPERATION ARGUMENTS ...)  */)
7948      (nargs, args)
7949      int nargs;
7950      Lisp_Object *args;
7951 {
7952   Lisp_Object operation, target_idx, target, val;
7953   register Lisp_Object chain;
7954
7955   if (nargs < 2)
7956     error ("Too few arguments");
7957   operation = args[0];
7958   if (!SYMBOLP (operation)
7959       || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
7960     error ("Invalid first arguement");
7961   if (nargs < 1 + XINT (target_idx))
7962     error ("Too few arguments for operation: %s",
7963            SDATA (SYMBOL_NAME (operation)));
7964   target = args[XINT (target_idx) + 1];
7965   if (!(STRINGP (target)
7966         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
7967     error ("Invalid %dth argument", XINT (target_idx) + 1);
7968
7969   chain = ((EQ (operation, Qinsert_file_contents)
7970             || EQ (operation, Qwrite_region))
7971            ? Vfile_coding_system_alist
7972            : (EQ (operation, Qopen_network_stream)
7973               ? Vnetwork_coding_system_alist
7974               : Vprocess_coding_system_alist));
7975   if (NILP (chain))
7976     return Qnil;
7977
7978   for (; CONSP (chain); chain = XCDR (chain))
7979     {
7980       Lisp_Object elt;
7981
7982       elt = XCAR (chain);
7983       if (CONSP (elt)
7984           && ((STRINGP (target)
7985                && STRINGP (XCAR (elt))
7986                && fast_string_match (XCAR (elt), target) >= 0)
7987               || (INTEGERP (target) && EQ (target, XCAR (elt)))))
7988         {
7989           val = XCDR (elt);
7990           /* Here, if VAL is both a valid coding system and a valid
7991              function symbol, we return VAL as a coding system.  */
7992           if (CONSP (val))
7993             return val;
7994           if (! SYMBOLP (val))
7995             return Qnil;
7996           if (! NILP (Fcoding_system_p (val)))
7997             return Fcons (val, val);
7998           if (! NILP (Ffboundp (val)))
7999             {
8000               val = call1 (val, Flist (nargs, args));
8001               if (CONSP (val))
8002                 return val;
8003               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
8004                 return Fcons (val, val);
8005             }
8006           return Qnil;
8007         }
8008     }
8009   return Qnil;
8010 }
8011
8012 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
8013        Sset_coding_system_priority, 0, MANY, 0,
8014        doc: /* Assign higher priority to the coding systems given as arguments.
8015 If multiple coding systems belongs to the same category,
8016 all but the first one are ignored.
8017
8018 usage: (set-coding-system-priority ...)  */)
8019      (nargs, args)
8020      int nargs;
8021      Lisp_Object *args;
8022 {
8023   int i, j;
8024   int changed[coding_category_max];
8025   enum coding_category priorities[coding_category_max];
8026
8027   bzero (changed, sizeof changed);
8028
8029   for (i = j = 0; i < nargs; i++)
8030     {
8031       enum coding_category category;
8032       Lisp_Object spec, attrs;
8033
8034       CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
8035       attrs = AREF (spec, 0);
8036       category = XINT (CODING_ATTR_CATEGORY (attrs));
8037       if (changed[category])
8038         /* Ignore this coding system because a coding system of the
8039            same category already had a higher priority.  */
8040         continue;
8041       changed[category] = 1;
8042       priorities[j++] = category;
8043       if (coding_categories[category].id >= 0
8044           && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
8045         setup_coding_system (args[i], &coding_categories[category]);
8046       Fset (AREF (Vcoding_category_table, category), args[i]);
8047     }
8048
8049   /* Now we have decided top J priorities.  Reflect the order of the
8050      original priorities to the remaining priorities.  */
8051
8052   for (i = j, j = 0; i < coding_category_max; i++, j++)
8053     {
8054       while (j < coding_category_max
8055              && changed[coding_priorities[j]])
8056         j++;
8057       if (j == coding_category_max)
8058         abort ();
8059       priorities[i] = coding_priorities[j];
8060     }
8061
8062   bcopy (priorities, coding_priorities, sizeof priorities);
8063
8064   /* Update `coding-category-list'.  */
8065   Vcoding_category_list = Qnil;
8066   for (i = coding_category_max - 1; i >= 0; i--)
8067     Vcoding_category_list
8068       = Fcons (AREF (Vcoding_category_table, priorities[i]),
8069                Vcoding_category_list);
8070
8071   return Qnil;
8072 }
8073
8074 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
8075        Scoding_system_priority_list, 0, 1, 0,
8076        doc: /* Return a list of coding systems ordered by their priorities.
8077 HIGHESTP non-nil means just return the highest priority one.  */)
8078      (highestp)
8079      Lisp_Object highestp;
8080 {
8081   int i;
8082   Lisp_Object val;
8083
8084   for (i = 0, val = Qnil; i < coding_category_max; i++)
8085     {
8086       enum coding_category category = coding_priorities[i];
8087       int id = coding_categories[category].id;
8088       Lisp_Object attrs;
8089
8090       if (id < 0)
8091         continue;
8092       attrs = CODING_ID_ATTRS (id);
8093       if (! NILP (highestp))
8094         return CODING_ATTR_BASE_NAME (attrs);
8095       val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
8096     }
8097   return Fnreverse (val);
8098 }
8099
8100 static char *suffixes[] = { "-unix", "-dos", "-mac" };
8101
8102 static Lisp_Object
8103 make_subsidiaries (base)
8104      Lisp_Object base;
8105 {
8106   Lisp_Object subsidiaries;
8107   int base_name_len = SBYTES (SYMBOL_NAME (base));
8108   char *buf = (char *) alloca (base_name_len + 6);
8109   int i;
8110
8111   bcopy (SDATA (SYMBOL_NAME (base)), buf, base_name_len);
8112   subsidiaries = Fmake_vector (make_number (3), Qnil);
8113   for (i = 0; i < 3; i++)
8114     {
8115       bcopy (suffixes[i], buf + base_name_len, strlen (suffixes[i]) + 1);
8116       ASET (subsidiaries, i, intern (buf));
8117     }
8118   return subsidiaries;
8119 }
8120
8121
8122 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
8123        Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
8124        doc: /* For internal use only.
8125 usage: (define-coding-system-internal ...)  */)
8126      (nargs, args)
8127      int nargs;
8128      Lisp_Object *args;
8129 {
8130   Lisp_Object name;
8131   Lisp_Object spec_vec;         /* [ ATTRS ALIASE EOL_TYPE ] */
8132   Lisp_Object attrs;            /* Vector of attributes.  */
8133   Lisp_Object eol_type;
8134   Lisp_Object aliases;
8135   Lisp_Object coding_type, charset_list, safe_charsets;
8136   enum coding_category category;
8137   Lisp_Object tail, val;
8138   int max_charset_id = 0;
8139   int i;
8140
8141   if (nargs < coding_arg_max)
8142     goto short_args;
8143
8144   attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
8145
8146   name = args[coding_arg_name];
8147   CHECK_SYMBOL (name);
8148   CODING_ATTR_BASE_NAME (attrs) = name;
8149
8150   val = args[coding_arg_mnemonic];
8151   if (! STRINGP (val))
8152     CHECK_CHARACTER (val);
8153   CODING_ATTR_MNEMONIC (attrs) = val;
8154
8155   coding_type = args[coding_arg_coding_type];
8156   CHECK_SYMBOL (coding_type);
8157   CODING_ATTR_TYPE (attrs) = coding_type;
8158
8159   charset_list = args[coding_arg_charset_list];
8160   if (SYMBOLP (charset_list))
8161     {
8162       if (EQ (charset_list, Qiso_2022))
8163         {
8164           if (! EQ (coding_type, Qiso_2022))
8165             error ("Invalid charset-list");
8166           charset_list = Viso_2022_charset_list;
8167         }
8168       else if (EQ (charset_list, Qemacs_mule))
8169         {
8170           if (! EQ (coding_type, Qemacs_mule))
8171             error ("Invalid charset-list");
8172           charset_list = Vemacs_mule_charset_list;
8173         }
8174       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
8175         if (max_charset_id < XFASTINT (XCAR (tail)))
8176           max_charset_id = XFASTINT (XCAR (tail));
8177     }
8178   else
8179     {
8180       charset_list = Fcopy_sequence (charset_list);
8181       for (tail = charset_list; !NILP (tail); tail = Fcdr (tail))
8182         {
8183           struct charset *charset;
8184
8185           val = Fcar (tail);
8186           CHECK_CHARSET_GET_CHARSET (val, charset);
8187           if (EQ (coding_type, Qiso_2022)
8188               ? CHARSET_ISO_FINAL (charset) < 0
8189               : EQ (coding_type, Qemacs_mule)
8190               ? CHARSET_EMACS_MULE_ID (charset) < 0
8191               : 0)
8192             error ("Can't handle charset `%s'",
8193                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8194
8195           XSETCAR (tail, make_number (charset->id));
8196           if (max_charset_id < charset->id)
8197             max_charset_id = charset->id;
8198         }
8199     }
8200   CODING_ATTR_CHARSET_LIST (attrs) = charset_list;
8201
8202   safe_charsets = Fmake_string (make_number (max_charset_id + 1),
8203                                 make_number (255));
8204   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
8205     SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
8206   CODING_ATTR_SAFE_CHARSETS (attrs) = safe_charsets;
8207
8208   CODING_ATTR_ASCII_COMPAT (attrs) = args[coding_arg_ascii_compatible_p];
8209
8210   val = args[coding_arg_decode_translation_table];
8211   if (! CHAR_TABLE_P (val) && ! CONSP (val))
8212     CHECK_SYMBOL (val);
8213   CODING_ATTR_DECODE_TBL (attrs) = val;
8214
8215   val = args[coding_arg_encode_translation_table];
8216   if (! CHAR_TABLE_P (val) && ! CONSP (val))
8217     CHECK_SYMBOL (val);
8218   CODING_ATTR_ENCODE_TBL (attrs) = val;
8219
8220   val = args[coding_arg_post_read_conversion];
8221   CHECK_SYMBOL (val);
8222   CODING_ATTR_POST_READ (attrs) = val;
8223
8224   val = args[coding_arg_pre_write_conversion];
8225   CHECK_SYMBOL (val);
8226   CODING_ATTR_PRE_WRITE (attrs) = val;
8227
8228   val = args[coding_arg_default_char];
8229   if (NILP (val))
8230     CODING_ATTR_DEFAULT_CHAR (attrs) = make_number (' ');
8231   else
8232     {
8233       CHECK_CHARACTER (val);
8234       CODING_ATTR_DEFAULT_CHAR (attrs) = val;
8235     }
8236
8237   val = args[coding_arg_for_unibyte];
8238   CODING_ATTR_FOR_UNIBYTE (attrs) = NILP (val) ? Qnil : Qt;
8239
8240   val = args[coding_arg_plist];
8241   CHECK_LIST (val);
8242   CODING_ATTR_PLIST (attrs) = val;
8243
8244   if (EQ (coding_type, Qcharset))
8245     {
8246       /* Generate a lisp vector of 256 elements.  Each element is nil,
8247          integer, or a list of charset IDs.
8248
8249          If Nth element is nil, the byte code N is invalid in this
8250          coding system.
8251
8252          If Nth element is a number NUM, N is the first byte of a
8253          charset whose ID is NUM.
8254
8255          If Nth element is a list of charset IDs, N is the first byte
8256          of one of them.  The list is sorted by dimensions of the
8257          charsets.  A charset of smaller dimension comes firtst. */
8258       val = Fmake_vector (make_number (256), Qnil);
8259
8260       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
8261         {
8262           struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
8263           int dim = CHARSET_DIMENSION (charset);
8264           int idx = (dim - 1) * 4;
8265
8266           if (CHARSET_ASCII_COMPATIBLE_P (charset))
8267             CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8268
8269           for (i = charset->code_space[idx];
8270                i <= charset->code_space[idx + 1]; i++)
8271             {
8272               Lisp_Object tmp, tmp2;
8273               int dim2;
8274
8275               tmp = AREF (val, i);
8276               if (NILP (tmp))
8277                 tmp = XCAR (tail);
8278               else if (NUMBERP (tmp))
8279                 {
8280                   dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
8281                   if (dim < dim2)
8282                     tmp = Fcons (XCAR (tail), Fcons (tmp, Qnil));
8283                   else
8284                     tmp = Fcons (tmp, Fcons (XCAR (tail), Qnil));
8285                 }
8286               else
8287                 {
8288                   for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
8289                     {
8290                       dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
8291                       if (dim < dim2)
8292                         break;
8293                     }
8294                   if (NILP (tmp2))
8295                     tmp = nconc2 (tmp, Fcons (XCAR (tail), Qnil));
8296                   else
8297                     {
8298                       XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
8299                       XSETCAR (tmp2, XCAR (tail));
8300                     }
8301                 }
8302               ASET (val, i, tmp);
8303             }
8304         }
8305       ASET (attrs, coding_attr_charset_valids, val);
8306       category = coding_category_charset;
8307     }
8308   else if (EQ (coding_type, Qccl))
8309     {
8310       Lisp_Object valids;
8311
8312       if (nargs < coding_arg_ccl_max)
8313         goto short_args;
8314
8315       val = args[coding_arg_ccl_decoder];
8316       CHECK_CCL_PROGRAM (val);
8317       if (VECTORP (val))
8318         val = Fcopy_sequence (val);
8319       ASET (attrs, coding_attr_ccl_decoder, val);
8320
8321       val = args[coding_arg_ccl_encoder];
8322       CHECK_CCL_PROGRAM (val);
8323       if (VECTORP (val))
8324         val = Fcopy_sequence (val);
8325       ASET (attrs, coding_attr_ccl_encoder, val);
8326
8327       val = args[coding_arg_ccl_valids];
8328       valids = Fmake_string (make_number (256), make_number (0));
8329       for (tail = val; !NILP (tail); tail = Fcdr (tail))
8330         {
8331           int from, to;
8332
8333           val = Fcar (tail);
8334           if (INTEGERP (val))
8335             {
8336               from = to = XINT (val);
8337               if (from < 0 || from > 255)
8338                 args_out_of_range_3 (val, make_number (0), make_number (255));
8339             }
8340           else
8341             {
8342               CHECK_CONS (val);
8343               CHECK_NATNUM_CAR (val);
8344               CHECK_NATNUM_CDR (val);
8345               from = XINT (XCAR (val));
8346               if (from > 255)
8347                 args_out_of_range_3 (XCAR (val),
8348                                      make_number (0), make_number (255));
8349               to = XINT (XCDR (val));
8350               if (to < from || to > 255)
8351                 args_out_of_range_3 (XCDR (val),
8352                                      XCAR (val), make_number (255));
8353             }
8354           for (i = from; i <= to; i++)
8355             SSET (valids, i, 1);
8356         }
8357       ASET (attrs, coding_attr_ccl_valids, valids);
8358
8359       category = coding_category_ccl;
8360     }
8361   else if (EQ (coding_type, Qutf_16))
8362     {
8363       Lisp_Object bom, endian;
8364
8365       CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
8366
8367       if (nargs < coding_arg_utf16_max)
8368         goto short_args;
8369
8370       bom = args[coding_arg_utf16_bom];
8371       if (! NILP (bom) && ! EQ (bom, Qt))
8372         {
8373           CHECK_CONS (bom);
8374           val = XCAR (bom);
8375           CHECK_CODING_SYSTEM (val);
8376           val = XCDR (bom);
8377           CHECK_CODING_SYSTEM (val);
8378         }
8379       ASET (attrs, coding_attr_utf_16_bom, bom);
8380
8381       endian = args[coding_arg_utf16_endian];
8382       CHECK_SYMBOL (endian);
8383       if (NILP (endian))
8384         endian = Qbig;
8385       else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
8386         error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
8387       ASET (attrs, coding_attr_utf_16_endian, endian);
8388
8389       category = (CONSP (bom)
8390                   ? coding_category_utf_16_auto
8391                   : NILP (bom)
8392                   ? (EQ (endian, Qbig)
8393                      ? coding_category_utf_16_be_nosig
8394                      : coding_category_utf_16_le_nosig)
8395                   : (EQ (endian, Qbig)
8396                      ? coding_category_utf_16_be
8397                      : coding_category_utf_16_le));
8398     }
8399   else if (EQ (coding_type, Qiso_2022))
8400     {
8401       Lisp_Object initial, reg_usage, request, flags;
8402       int i;
8403
8404       if (nargs < coding_arg_iso2022_max)
8405         goto short_args;
8406
8407       initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
8408       CHECK_VECTOR (initial);
8409       for (i = 0; i < 4; i++)
8410         {
8411           val = Faref (initial, make_number (i));
8412           if (! NILP (val))
8413             {
8414               struct charset *charset;
8415
8416               CHECK_CHARSET_GET_CHARSET (val, charset);
8417               ASET (initial, i, make_number (CHARSET_ID (charset)));
8418               if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
8419                 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8420             }
8421           else
8422             ASET (initial, i, make_number (-1));
8423         }
8424
8425       reg_usage = args[coding_arg_iso2022_reg_usage];
8426       CHECK_CONS (reg_usage);
8427       CHECK_NUMBER_CAR (reg_usage);
8428       CHECK_NUMBER_CDR (reg_usage);
8429
8430       request = Fcopy_sequence (args[coding_arg_iso2022_request]);
8431       for (tail = request; ! NILP (tail); tail = Fcdr (tail))
8432         {
8433           int id;
8434           Lisp_Object tmp;
8435
8436           val = Fcar (tail);
8437           CHECK_CONS (val);
8438           tmp = XCAR (val);
8439           CHECK_CHARSET_GET_ID (tmp, id);
8440           CHECK_NATNUM_CDR (val);
8441           if (XINT (XCDR (val)) >= 4)
8442             error ("Invalid graphic register number: %d", XINT (XCDR (val)));
8443           XSETCAR (val, make_number (id));
8444         }
8445
8446       flags = args[coding_arg_iso2022_flags];
8447       CHECK_NATNUM (flags);
8448       i = XINT (flags);
8449       if (EQ (args[coding_arg_charset_list], Qiso_2022))
8450         flags = make_number (i | CODING_ISO_FLAG_FULL_SUPPORT);
8451
8452       ASET (attrs, coding_attr_iso_initial, initial);
8453       ASET (attrs, coding_attr_iso_usage, reg_usage);
8454       ASET (attrs, coding_attr_iso_request, request);
8455       ASET (attrs, coding_attr_iso_flags, flags);
8456       setup_iso_safe_charsets (attrs);
8457
8458       if (i & CODING_ISO_FLAG_SEVEN_BITS)
8459         category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
8460                           | CODING_ISO_FLAG_SINGLE_SHIFT))
8461                     ? coding_category_iso_7_else
8462                     : EQ (args[coding_arg_charset_list], Qiso_2022)
8463                     ? coding_category_iso_7
8464                     : coding_category_iso_7_tight);
8465       else
8466         {
8467           int id = XINT (AREF (initial, 1));
8468
8469           category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
8470                        || EQ (args[coding_arg_charset_list], Qiso_2022)
8471                        || id < 0)
8472                       ? coding_category_iso_8_else
8473                       : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
8474                       ? coding_category_iso_8_1
8475                       : coding_category_iso_8_2);
8476         }
8477       if (category != coding_category_iso_8_1
8478           && category != coding_category_iso_8_2)
8479         CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
8480     }
8481   else if (EQ (coding_type, Qemacs_mule))
8482     {
8483       if (EQ (args[coding_arg_charset_list], Qemacs_mule))
8484         ASET (attrs, coding_attr_emacs_mule_full, Qt);
8485       CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8486       category = coding_category_emacs_mule;
8487     }
8488   else if (EQ (coding_type, Qshift_jis))
8489     {
8490
8491       struct charset *charset;
8492
8493       if (XINT (Flength (charset_list)) != 3
8494           && XINT (Flength (charset_list)) != 4)
8495         error ("There should be three or four charsets");
8496
8497       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8498       if (CHARSET_DIMENSION (charset) != 1)
8499         error ("Dimension of charset %s is not one",
8500                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8501       if (CHARSET_ASCII_COMPATIBLE_P (charset))
8502         CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8503
8504       charset_list = XCDR (charset_list);
8505       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8506       if (CHARSET_DIMENSION (charset) != 1)
8507         error ("Dimension of charset %s is not one",
8508                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8509
8510       charset_list = XCDR (charset_list);
8511       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8512       if (CHARSET_DIMENSION (charset) != 2)
8513         error ("Dimension of charset %s is not two",
8514                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8515
8516       charset_list = XCDR (charset_list);
8517       if (! NILP (charset_list))
8518         {
8519           charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8520           if (CHARSET_DIMENSION (charset) != 2)
8521             error ("Dimension of charset %s is not two",
8522                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8523         }
8524
8525       category = coding_category_sjis;
8526       Vsjis_coding_system = name;
8527     }
8528   else if (EQ (coding_type, Qbig5))
8529     {
8530       struct charset *charset;
8531
8532       if (XINT (Flength (charset_list)) != 2)
8533         error ("There should be just two charsets");
8534
8535       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8536       if (CHARSET_DIMENSION (charset) != 1)
8537         error ("Dimension of charset %s is not one",
8538                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8539       if (CHARSET_ASCII_COMPATIBLE_P (charset))
8540         CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8541
8542       charset_list = XCDR (charset_list);
8543       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8544       if (CHARSET_DIMENSION (charset) != 2)
8545         error ("Dimension of charset %s is not two",
8546                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8547
8548       category = coding_category_big5;
8549       Vbig5_coding_system = name;
8550     }
8551   else if (EQ (coding_type, Qraw_text))
8552     {
8553       category = coding_category_raw_text;
8554       CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8555     }
8556   else if (EQ (coding_type, Qutf_8))
8557     {
8558       category = coding_category_utf_8;
8559       CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8560     }
8561   else if (EQ (coding_type, Qundecided))
8562     category = coding_category_undecided;
8563   else
8564     error ("Invalid coding system type: %s",
8565            SDATA (SYMBOL_NAME (coding_type)));
8566
8567   CODING_ATTR_CATEGORY (attrs) = make_number (category);
8568   CODING_ATTR_PLIST (attrs)
8569     = Fcons (QCcategory, Fcons (AREF (Vcoding_category_table, category),
8570                                 CODING_ATTR_PLIST (attrs)));
8571
8572   eol_type = args[coding_arg_eol_type];
8573   if (! NILP (eol_type)
8574       && ! EQ (eol_type, Qunix)
8575       && ! EQ (eol_type, Qdos)
8576       && ! EQ (eol_type, Qmac))
8577     error ("Invalid eol-type");
8578
8579   aliases = Fcons (name, Qnil);
8580
8581   if (NILP (eol_type))
8582     {
8583       eol_type = make_subsidiaries (name);
8584       for (i = 0; i < 3; i++)
8585         {
8586           Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
8587
8588           this_name = AREF (eol_type, i);
8589           this_aliases = Fcons (this_name, Qnil);
8590           this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
8591           this_spec = Fmake_vector (make_number (3), attrs);
8592           ASET (this_spec, 1, this_aliases);
8593           ASET (this_spec, 2, this_eol_type);
8594           Fputhash (this_name, this_spec, Vcoding_system_hash_table);
8595           Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
8596           Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
8597                                         Vcoding_system_alist);
8598         }
8599     }
8600
8601   spec_vec = Fmake_vector (make_number (3), attrs);
8602   ASET (spec_vec, 1, aliases);
8603   ASET (spec_vec, 2, eol_type);
8604
8605   Fputhash (name, spec_vec, Vcoding_system_hash_table);
8606   Vcoding_system_list = Fcons (name, Vcoding_system_list);
8607   Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
8608                                 Vcoding_system_alist);
8609
8610   {
8611     int id = coding_categories[category].id;
8612
8613     if (id < 0 || EQ (name, CODING_ID_NAME (id)))
8614       setup_coding_system (name, &coding_categories[category]);
8615   }
8616
8617   return Qnil;
8618
8619  short_args:
8620   return Fsignal (Qwrong_number_of_arguments,
8621                   Fcons (intern ("define-coding-system-internal"),
8622                          make_number (nargs)));
8623 }
8624
8625
8626 DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
8627        3, 3, 0,
8628        doc: /* Change value in CODING-SYSTEM's property list PROP to VAL.  */)
8629   (coding_system, prop, val)
8630      Lisp_Object coding_system, prop, val;
8631 {
8632   Lisp_Object spec, attrs, plist;
8633
8634   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
8635   attrs = AREF (spec, 0);
8636   if (EQ (prop, QCmnemonic))
8637     {
8638       if (! STRINGP (val))
8639         CHECK_CHARACTER (val);
8640       CODING_ATTR_MNEMONIC (attrs) = val;
8641     }
8642   else if (EQ (prop, QCdefalut_char))
8643     {
8644       if (NILP (val))
8645         val = make_number (' ');
8646       else
8647         CHECK_CHARACTER (val);
8648       CODING_ATTR_DEFAULT_CHAR (attrs) = val;
8649     }
8650   else if (EQ (prop, QCdecode_translation_table))
8651     {
8652       if (! CHAR_TABLE_P (val) && ! CONSP (val))
8653         CHECK_SYMBOL (val);
8654       CODING_ATTR_DECODE_TBL (attrs) = val;
8655     }
8656   else if (EQ (prop, QCencode_translation_table))
8657     {
8658       if (! CHAR_TABLE_P (val) && ! CONSP (val))
8659         CHECK_SYMBOL (val);
8660       CODING_ATTR_ENCODE_TBL (attrs) = val;
8661     }
8662   else if (EQ (prop, QCpost_read_conversion))
8663     {
8664       CHECK_SYMBOL (val);
8665       CODING_ATTR_POST_READ (attrs) = val;
8666     }
8667   else if (EQ (prop, QCpre_write_conversion))
8668     {
8669       CHECK_SYMBOL (val);
8670       CODING_ATTR_PRE_WRITE (attrs) = val;
8671     }
8672
8673   CODING_ATTR_PLIST (attrs)
8674     = Fplist_put (CODING_ATTR_PLIST (attrs), prop, val);
8675   return val;
8676 }
8677
8678
8679 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
8680        Sdefine_coding_system_alias, 2, 2, 0,
8681        doc: /* Define ALIAS as an alias for CODING-SYSTEM.  */)
8682      (alias, coding_system)
8683      Lisp_Object alias, coding_system;
8684 {
8685   Lisp_Object spec, aliases, eol_type;
8686
8687   CHECK_SYMBOL (alias);
8688   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
8689   aliases = AREF (spec, 1);
8690   /* ALISES should be a list of length more than zero, and the first
8691      element is a base coding system.  Append ALIAS at the tail of the
8692      list.  */
8693   while (!NILP (XCDR (aliases)))
8694     aliases = XCDR (aliases);
8695   XSETCDR (aliases, Fcons (alias, Qnil));
8696
8697   eol_type = AREF (spec, 2);
8698   if (VECTORP (eol_type))
8699     {
8700       Lisp_Object subsidiaries;
8701       int i;
8702
8703       subsidiaries = make_subsidiaries (alias);
8704       for (i = 0; i < 3; i++)
8705         Fdefine_coding_system_alias (AREF (subsidiaries, i),
8706                                      AREF (eol_type, i));
8707     }
8708
8709   Fputhash (alias, spec, Vcoding_system_hash_table);
8710   Vcoding_system_list = Fcons (alias, Vcoding_system_list);
8711   Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
8712                                 Vcoding_system_alist);
8713
8714   return Qnil;
8715 }
8716
8717 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
8718        1, 1, 0,
8719        doc: /* Return the base of CODING-SYSTEM.
8720 Any alias or subsidiary coding system is not a base coding system.  */)
8721   (coding_system)
8722      Lisp_Object coding_system;
8723 {
8724   Lisp_Object spec, attrs;
8725
8726   if (NILP (coding_system))
8727     return (Qno_conversion);
8728   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
8729   attrs = AREF (spec, 0);
8730   return CODING_ATTR_BASE_NAME (attrs);
8731 }
8732
8733 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
8734        1, 1, 0,
8735        doc: "Return the property list of CODING-SYSTEM.")
8736      (coding_system)
8737      Lisp_Object coding_system;
8738 {
8739   Lisp_Object spec, attrs;
8740
8741   if (NILP (coding_system))
8742     coding_system = Qno_conversion;
8743   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
8744   attrs = AREF (spec, 0);
8745   return CODING_ATTR_PLIST (attrs);
8746 }
8747
8748
8749 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
8750        1, 1, 0,
8751        doc: /* Return the list of aliases of CODING-SYSTEM.  */)
8752      (coding_system)
8753      Lisp_Object coding_system;
8754 {
8755   Lisp_Object spec;
8756
8757   if (NILP (coding_system))
8758     coding_system = Qno_conversion;
8759   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
8760   return AREF (spec, 1);
8761 }
8762
8763 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
8764        Scoding_system_eol_type, 1, 1, 0,
8765        doc: /* Return eol-type of CODING-SYSTEM.
8766 An eol-type is integer 0, 1, 2, or a vector of coding systems.
8767
8768 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
8769 and CR respectively.
8770
8771 A vector value indicates that a format of end-of-line should be
8772 detected automatically.  Nth element of the vector is the subsidiary
8773 coding system whose eol-type is N.  */)
8774      (coding_system)
8775      Lisp_Object coding_system;
8776 {
8777   Lisp_Object spec, eol_type;
8778   int n;
8779
8780   if (NILP (coding_system))
8781     coding_system = Qno_conversion;
8782   if (! CODING_SYSTEM_P (coding_system))
8783     return Qnil;
8784   spec = CODING_SYSTEM_SPEC (coding_system);
8785   eol_type = AREF (spec, 2);
8786   if (VECTORP (eol_type))
8787     return Fcopy_sequence (eol_type);
8788   n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
8789   return make_number (n);
8790 }
8791
8792 #endif /* emacs */
8793
8794 \f
8795 /*** 9. Post-amble ***/
8796
8797 void
8798 init_coding_once ()
8799 {
8800   int i;
8801
8802   for (i = 0; i < coding_category_max; i++)
8803     {
8804       coding_categories[i].id = -1;
8805       coding_priorities[i] = i;
8806     }
8807
8808   /* ISO2022 specific initialize routine.  */
8809   for (i = 0; i < 0x20; i++)
8810     iso_code_class[i] = ISO_control_0;
8811   for (i = 0x21; i < 0x7F; i++)
8812     iso_code_class[i] = ISO_graphic_plane_0;
8813   for (i = 0x80; i < 0xA0; i++)
8814     iso_code_class[i] = ISO_control_1;
8815   for (i = 0xA1; i < 0xFF; i++)
8816     iso_code_class[i] = ISO_graphic_plane_1;
8817   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
8818   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
8819   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
8820   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
8821   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
8822   iso_code_class[ISO_CODE_ESC] = ISO_escape;
8823   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
8824   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
8825   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
8826
8827   for (i = 0; i < 256; i++)
8828     {
8829       emacs_mule_bytes[i] = 1;
8830     }
8831   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
8832   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
8833   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
8834   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
8835 }
8836
8837 #ifdef emacs
8838
8839 void
8840 syms_of_coding ()
8841 {
8842   staticpro (&Vcoding_system_hash_table);
8843   {
8844     Lisp_Object args[2];
8845     args[0] = QCtest;
8846     args[1] = Qeq;
8847     Vcoding_system_hash_table = Fmake_hash_table (2, args);
8848   }
8849
8850   staticpro (&Vsjis_coding_system);
8851   Vsjis_coding_system = Qnil;
8852
8853   staticpro (&Vbig5_coding_system);
8854   Vbig5_coding_system = Qnil;
8855
8856   staticpro (&Vcode_conversion_reused_workbuf);
8857   Vcode_conversion_reused_workbuf = Qnil;
8858
8859   staticpro (&Vcode_conversion_workbuf_name);
8860   Vcode_conversion_workbuf_name = build_string (" *code-conversion-work*");
8861
8862   reused_workbuf_in_use = 0;
8863
8864   DEFSYM (Qcharset, "charset");
8865   DEFSYM (Qtarget_idx, "target-idx");
8866   DEFSYM (Qcoding_system_history, "coding-system-history");
8867   Fset (Qcoding_system_history, Qnil);
8868
8869   /* Target FILENAME is the first argument.  */
8870   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
8871   /* Target FILENAME is the third argument.  */
8872   Fput (Qwrite_region, Qtarget_idx, make_number (2));
8873
8874   DEFSYM (Qcall_process, "call-process");
8875   /* Target PROGRAM is the first argument.  */
8876   Fput (Qcall_process, Qtarget_idx, make_number (0));
8877
8878   DEFSYM (Qcall_process_region, "call-process-region");
8879   /* Target PROGRAM is the third argument.  */
8880   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
8881
8882   DEFSYM (Qstart_process, "start-process");
8883   /* Target PROGRAM is the third argument.  */
8884   Fput (Qstart_process, Qtarget_idx, make_number (2));
8885
8886   DEFSYM (Qopen_network_stream, "open-network-stream");
8887   /* Target SERVICE is the fourth argument.  */
8888   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
8889
8890   DEFSYM (Qcoding_system, "coding-system");
8891   DEFSYM (Qcoding_aliases, "coding-aliases");
8892
8893   DEFSYM (Qeol_type, "eol-type");
8894   DEFSYM (Qunix, "unix");
8895   DEFSYM (Qdos, "dos");
8896
8897   DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
8898   DEFSYM (Qpost_read_conversion, "post-read-conversion");
8899   DEFSYM (Qpre_write_conversion, "pre-write-conversion");
8900   DEFSYM (Qdefault_char, "default-char");
8901   DEFSYM (Qundecided, "undecided");
8902   DEFSYM (Qno_conversion, "no-conversion");
8903   DEFSYM (Qraw_text, "raw-text");
8904
8905   DEFSYM (Qiso_2022, "iso-2022");
8906
8907   DEFSYM (Qutf_8, "utf-8");
8908   DEFSYM (Qutf_8_emacs, "utf-8-emacs");
8909
8910   DEFSYM (Qutf_16, "utf-16");
8911   DEFSYM (Qbig, "big");
8912   DEFSYM (Qlittle, "little");
8913
8914   DEFSYM (Qshift_jis, "shift-jis");
8915   DEFSYM (Qbig5, "big5");
8916
8917   DEFSYM (Qcoding_system_p, "coding-system-p");
8918
8919   DEFSYM (Qcoding_system_error, "coding-system-error");
8920   Fput (Qcoding_system_error, Qerror_conditions,
8921         Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
8922   Fput (Qcoding_system_error, Qerror_message,
8923         build_string ("Invalid coding system"));
8924
8925   /* Intern this now in case it isn't already done.
8926      Setting this variable twice is harmless.
8927      But don't staticpro it here--that is done in alloc.c.  */
8928   Qchar_table_extra_slots = intern ("char-table-extra-slots");
8929
8930   DEFSYM (Qtranslation_table, "translation-table");
8931   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (1));
8932   DEFSYM (Qtranslation_table_id, "translation-table-id");
8933   DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
8934   DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
8935
8936   DEFSYM (Qvalid_codes, "valid-codes");
8937
8938   DEFSYM (Qemacs_mule, "emacs-mule");
8939
8940   DEFSYM (QCcategory, ":category");
8941   DEFSYM (QCmnemonic, ":mnemonic");
8942   DEFSYM (QCdefalut_char, ":default-char");
8943   DEFSYM (QCdecode_translation_table, ":decode-translation-table");
8944   DEFSYM (QCencode_translation_table, ":encode-translation-table");
8945   DEFSYM (QCpost_read_conversion, ":post-read-conversion");
8946   DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
8947
8948   Vcoding_category_table
8949     = Fmake_vector (make_number (coding_category_max), Qnil);
8950   staticpro (&Vcoding_category_table);
8951   /* Followings are target of code detection.  */
8952   ASET (Vcoding_category_table, coding_category_iso_7,
8953         intern ("coding-category-iso-7"));
8954   ASET (Vcoding_category_table, coding_category_iso_7_tight,
8955         intern ("coding-category-iso-7-tight"));
8956   ASET (Vcoding_category_table, coding_category_iso_8_1,
8957         intern ("coding-category-iso-8-1"));
8958   ASET (Vcoding_category_table, coding_category_iso_8_2,
8959         intern ("coding-category-iso-8-2"));
8960   ASET (Vcoding_category_table, coding_category_iso_7_else,
8961         intern ("coding-category-iso-7-else"));
8962   ASET (Vcoding_category_table, coding_category_iso_8_else,
8963         intern ("coding-category-iso-8-else"));
8964   ASET (Vcoding_category_table, coding_category_utf_8,
8965         intern ("coding-category-utf-8"));
8966   ASET (Vcoding_category_table, coding_category_utf_16_be,
8967         intern ("coding-category-utf-16-be"));
8968   ASET (Vcoding_category_table, coding_category_utf_16_auto,
8969         intern ("coding-category-utf-16-auto"));
8970   ASET (Vcoding_category_table, coding_category_utf_16_le,
8971         intern ("coding-category-utf-16-le"));
8972   ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
8973         intern ("coding-category-utf-16-be-nosig"));
8974   ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
8975         intern ("coding-category-utf-16-le-nosig"));
8976   ASET (Vcoding_category_table, coding_category_charset,
8977         intern ("coding-category-charset"));
8978   ASET (Vcoding_category_table, coding_category_sjis,
8979         intern ("coding-category-sjis"));
8980   ASET (Vcoding_category_table, coding_category_big5,
8981         intern ("coding-category-big5"));
8982   ASET (Vcoding_category_table, coding_category_ccl,
8983         intern ("coding-category-ccl"));
8984   ASET (Vcoding_category_table, coding_category_emacs_mule,
8985         intern ("coding-category-emacs-mule"));
8986   /* Followings are NOT target of code detection.  */
8987   ASET (Vcoding_category_table, coding_category_raw_text,
8988         intern ("coding-category-raw-text"));
8989   ASET (Vcoding_category_table, coding_category_undecided,
8990         intern ("coding-category-undecided"));
8991
8992   DEFSYM (Qinsufficient_source, "insufficient-source");
8993   DEFSYM (Qinconsistent_eol, "inconsistent-eol");
8994   DEFSYM (Qinvalid_source, "invalid-source");
8995   DEFSYM (Qinterrupted, "interrupted");
8996   DEFSYM (Qinsufficient_memory, "insufficient-memory");
8997
8998   defsubr (&Scoding_system_p);
8999   defsubr (&Sread_coding_system);
9000   defsubr (&Sread_non_nil_coding_system);
9001   defsubr (&Scheck_coding_system);
9002   defsubr (&Sdetect_coding_region);
9003   defsubr (&Sdetect_coding_string);
9004   defsubr (&Sfind_coding_systems_region_internal);
9005   defsubr (&Sunencodable_char_position);
9006   defsubr (&Scheck_coding_systems_region);
9007   defsubr (&Sdecode_coding_region);
9008   defsubr (&Sencode_coding_region);
9009   defsubr (&Sdecode_coding_string);
9010   defsubr (&Sencode_coding_string);
9011   defsubr (&Sdecode_sjis_char);
9012   defsubr (&Sencode_sjis_char);
9013   defsubr (&Sdecode_big5_char);
9014   defsubr (&Sencode_big5_char);
9015   defsubr (&Sset_terminal_coding_system_internal);
9016   defsubr (&Sset_safe_terminal_coding_system_internal);
9017   defsubr (&Sterminal_coding_system);
9018   defsubr (&Sset_keyboard_coding_system_internal);
9019   defsubr (&Skeyboard_coding_system);
9020   defsubr (&Sfind_operation_coding_system);
9021   defsubr (&Sset_coding_system_priority);
9022   defsubr (&Sdefine_coding_system_internal);
9023   defsubr (&Sdefine_coding_system_alias);
9024   defsubr (&Scoding_system_put);
9025   defsubr (&Scoding_system_base);
9026   defsubr (&Scoding_system_plist);
9027   defsubr (&Scoding_system_aliases);
9028   defsubr (&Scoding_system_eol_type);
9029   defsubr (&Scoding_system_priority_list);
9030
9031   DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
9032                doc: /* List of coding systems.
9033
9034 Do not alter the value of this variable manually.  This variable should be
9035 updated by the functions `define-coding-system' and
9036 `define-coding-system-alias'.  */);
9037   Vcoding_system_list = Qnil;
9038
9039   DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
9040                doc: /* Alist of coding system names.
9041 Each element is one element list of coding system name.
9042 This variable is given to `completing-read' as TABLE argument.
9043
9044 Do not alter the value of this variable manually.  This variable should be
9045 updated by the functions `make-coding-system' and
9046 `define-coding-system-alias'.  */);
9047   Vcoding_system_alist = Qnil;
9048
9049   DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
9050                doc: /* List of coding-categories (symbols) ordered by priority.
9051
9052 On detecting a coding system, Emacs tries code detection algorithms
9053 associated with each coding-category one by one in this order.  When
9054 one algorithm agrees with a byte sequence of source text, the coding
9055 system bound to the corresponding coding-category is selected.  */);
9056   {
9057     int i;
9058
9059     Vcoding_category_list = Qnil;
9060     for (i = coding_category_max - 1; i >= 0; i--)
9061       Vcoding_category_list
9062         = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
9063                  Vcoding_category_list);
9064   }
9065
9066   DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
9067                doc: /* Specify the coding system for read operations.
9068 It is useful to bind this variable with `let', but do not set it globally.
9069 If the value is a coding system, it is used for decoding on read operation.
9070 If not, an appropriate element is used from one of the coding system alists:
9071 There are three such tables, `file-coding-system-alist',
9072 `process-coding-system-alist', and `network-coding-system-alist'.  */);
9073   Vcoding_system_for_read = Qnil;
9074
9075   DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
9076                doc: /* Specify the coding system for write operations.
9077 Programs bind this variable with `let', but you should not set it globally.
9078 If the value is a coding system, it is used for encoding of output,
9079 when writing it to a file and when sending it to a file or subprocess.
9080
9081 If this does not specify a coding system, an appropriate element
9082 is used from one of the coding system alists:
9083 There are three such tables, `file-coding-system-alist',
9084 `process-coding-system-alist', and `network-coding-system-alist'.
9085 For output to files, if the above procedure does not specify a coding system,
9086 the value of `buffer-file-coding-system' is used.  */);
9087   Vcoding_system_for_write = Qnil;
9088
9089   DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
9090                doc: /*
9091 Coding system used in the latest file or process I/O.  */);
9092   Vlast_coding_system_used = Qnil;
9093
9094   DEFVAR_LISP ("last-code-conversion-error", &Vlast_code_conversion_error,
9095                doc: /*
9096 Error status of the last code conversion.
9097
9098 When an error was detected in the last code conversion, this variable
9099 is set to one of the following symbols.
9100   `insufficient-source'
9101   `inconsistent-eol'
9102   `invalid-source'
9103   `interrupted'
9104   `insufficient-memory'
9105 When no error was detected, the value doesn't change.  So, to check
9106 the error status of a code conversion by this variable, you must
9107 explicitly set this variable to nil before performing code
9108 conversion.  */);
9109   Vlast_code_conversion_error = Qnil;
9110
9111   DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
9112                doc: /*
9113 *Non-nil means always inhibit code conversion of end-of-line format.
9114 See info node `Coding Systems' and info node `Text and Binary' concerning
9115 such conversion.  */);
9116   inhibit_eol_conversion = 0;
9117
9118   DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
9119                doc: /*
9120 Non-nil means process buffer inherits coding system of process output.
9121 Bind it to t if the process output is to be treated as if it were a file
9122 read from some filesystem.  */);
9123   inherit_process_coding_system = 0;
9124
9125   DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
9126                doc: /*
9127 Alist to decide a coding system to use for a file I/O operation.
9128 The format is ((PATTERN . VAL) ...),
9129 where PATTERN is a regular expression matching a file name,
9130 VAL is a coding system, a cons of coding systems, or a function symbol.
9131 If VAL is a coding system, it is used for both decoding and encoding
9132 the file contents.
9133 If VAL is a cons of coding systems, the car part is used for decoding,
9134 and the cdr part is used for encoding.
9135 If VAL is a function symbol, the function must return a coding system
9136 or a cons of coding systems which are used as above.  The function gets
9137 the arguments with which `find-operation-coding-systems' was called.
9138
9139 See also the function `find-operation-coding-system'
9140 and the variable `auto-coding-alist'.  */);
9141   Vfile_coding_system_alist = Qnil;
9142
9143   DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
9144                doc: /*
9145 Alist to decide a coding system to use for a process I/O operation.
9146 The format is ((PATTERN . VAL) ...),
9147 where PATTERN is a regular expression matching a program name,
9148 VAL is a coding system, a cons of coding systems, or a function symbol.
9149 If VAL is a coding system, it is used for both decoding what received
9150 from the program and encoding what sent to the program.
9151 If VAL is a cons of coding systems, the car part is used for decoding,
9152 and the cdr part is used for encoding.
9153 If VAL is a function symbol, the function must return a coding system
9154 or a cons of coding systems which are used as above.
9155
9156 See also the function `find-operation-coding-system'.  */);
9157   Vprocess_coding_system_alist = Qnil;
9158
9159   DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
9160                doc: /*
9161 Alist to decide a coding system to use for a network I/O operation.
9162 The format is ((PATTERN . VAL) ...),
9163 where PATTERN is a regular expression matching a network service name
9164 or is a port number to connect to,
9165 VAL is a coding system, a cons of coding systems, or a function symbol.
9166 If VAL is a coding system, it is used for both decoding what received
9167 from the network stream and encoding what sent to the network stream.
9168 If VAL is a cons of coding systems, the car part is used for decoding,
9169 and the cdr part is used for encoding.
9170 If VAL is a function symbol, the function must return a coding system
9171 or a cons of coding systems which are used as above.
9172
9173 See also the function `find-operation-coding-system'.  */);
9174   Vnetwork_coding_system_alist = Qnil;
9175
9176   DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
9177                doc: /* Coding system to use with system messages.
9178 Also used for decoding keyboard input on X Window system.  */);
9179   Vlocale_coding_system = Qnil;
9180
9181   /* The eol mnemonics are reset in startup.el system-dependently.  */
9182   DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
9183                doc: /*
9184 *String displayed in mode line for UNIX-like (LF) end-of-line format.  */);
9185   eol_mnemonic_unix = build_string (":");
9186
9187   DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
9188                doc: /*
9189 *String displayed in mode line for DOS-like (CRLF) end-of-line format.  */);
9190   eol_mnemonic_dos = build_string ("\\");
9191
9192   DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
9193                doc: /*
9194 *String displayed in mode line for MAC-like (CR) end-of-line format.  */);
9195   eol_mnemonic_mac = build_string ("/");
9196
9197   DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
9198                doc: /*
9199 *String displayed in mode line when end-of-line format is not yet determined.  */);
9200   eol_mnemonic_undecided = build_string (":");
9201
9202   DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
9203                doc: /*
9204 *Non-nil enables character translation while encoding and decoding.  */);
9205   Venable_character_translation = Qt;
9206
9207   DEFVAR_LISP ("standard-translation-table-for-decode",
9208                &Vstandard_translation_table_for_decode,
9209                doc: /* Table for translating characters while decoding.  */);
9210   Vstandard_translation_table_for_decode = Qnil;
9211
9212   DEFVAR_LISP ("standard-translation-table-for-encode",
9213                &Vstandard_translation_table_for_encode,
9214                doc: /* Table for translating characters while encoding.  */);
9215   Vstandard_translation_table_for_encode = Qnil;
9216
9217   DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_table,
9218                doc: /* Alist of charsets vs revision numbers.
9219 While encoding, if a charset (car part of an element) is found,
9220 designate it with the escape sequence identifying revision (cdr part
9221 of the element).  */);
9222   Vcharset_revision_table = Qnil;
9223
9224   DEFVAR_LISP ("default-process-coding-system",
9225                &Vdefault_process_coding_system,
9226                doc: /* Cons of coding systems used for process I/O by default.
9227 The car part is used for decoding a process output,
9228 the cdr part is used for encoding a text to be sent to a process.  */);
9229   Vdefault_process_coding_system = Qnil;
9230
9231   DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
9232                doc: /*
9233 Table of extra Latin codes in the range 128..159 (inclusive).
9234 This is a vector of length 256.
9235 If Nth element is non-nil, the existence of code N in a file
9236 \(or output of subprocess) doesn't prevent it to be detected as
9237 a coding system of ISO 2022 variant which has a flag
9238 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
9239 or reading output of a subprocess.
9240 Only 128th through 159th elements has a meaning.  */);
9241   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
9242
9243   DEFVAR_LISP ("select-safe-coding-system-function",
9244                &Vselect_safe_coding_system_function,
9245                doc: /*
9246 Function to call to select safe coding system for encoding a text.
9247
9248 If set, this function is called to force a user to select a proper
9249 coding system which can encode the text in the case that a default
9250 coding system used in each operation can't encode the text.
9251
9252 The default value is `select-safe-coding-system' (which see).  */);
9253   Vselect_safe_coding_system_function = Qnil;
9254
9255   DEFVAR_BOOL ("coding-system-require-warning",
9256                &coding_system_require_warning,
9257                doc: /* Internal use only.
9258 If non-nil, on writing a file, `select-safe-coding-system-function' is
9259 called even if `coding-system-for-write' is non-nil.  The command
9260 `universal-coding-system-argument' binds this variable to t temporarily.  */);
9261   coding_system_require_warning = 0;
9262
9263
9264   DEFVAR_BOOL ("inhibit-iso-escape-detection",
9265                &inhibit_iso_escape_detection,
9266                doc: /*
9267 If non-nil, Emacs ignores ISO2022's escape sequence on code detection.
9268
9269 By default, on reading a file, Emacs tries to detect how the text is
9270 encoded.  This code detection is sensitive to escape sequences.  If
9271 the sequence is valid as ISO2022, the code is determined as one of
9272 the ISO2022 encodings, and the file is decoded by the corresponding
9273 coding system (e.g. `iso-2022-7bit').
9274
9275 However, there may be a case that you want to read escape sequences in
9276 a file as is.  In such a case, you can set this variable to non-nil.
9277 Then, as the code detection ignores any escape sequences, no file is
9278 detected as encoded in some ISO2022 encoding.  The result is that all
9279 escape sequences become visible in a buffer.
9280
9281 The default value is nil, and it is strongly recommended not to change
9282 it.  That is because many Emacs Lisp source files that contain
9283 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
9284 in Emacs's distribution, and they won't be decoded correctly on
9285 reading if you suppress escape sequence detection.
9286
9287 The other way to read escape sequences in a file without decoding is
9288 to explicitly specify some coding system that doesn't use ISO2022's
9289 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument].  */);
9290   inhibit_iso_escape_detection = 0;
9291
9292   DEFVAR_LISP ("translation-table-for-input", &Vtranslation_table_for_input,
9293                doc: /* Char table for translating self-inserting characters.
9294 This is applied to the result of input methods, not their input.  See also
9295 `keyboard-translate-table'.  */);
9296     Vtranslation_table_for_input = Qnil;
9297
9298   {
9299     Lisp_Object args[coding_arg_max];
9300     Lisp_Object plist[16];
9301     int i;
9302
9303     for (i = 0; i < coding_arg_max; i++)
9304       args[i] = Qnil;
9305
9306     plist[0] = intern (":name");
9307     plist[1] = args[coding_arg_name] = Qno_conversion;
9308     plist[2] = intern (":mnemonic");
9309     plist[3] = args[coding_arg_mnemonic] = make_number ('=');
9310     plist[4] = intern (":coding-type");
9311     plist[5] = args[coding_arg_coding_type] = Qraw_text;
9312     plist[6] = intern (":ascii-compatible-p");
9313     plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
9314     plist[8] = intern (":default-char");
9315     plist[9] = args[coding_arg_default_char] = make_number (0);
9316     plist[10] = intern (":for-unibyte");
9317     plist[11] = args[coding_arg_for_unibyte] = Qt;
9318     plist[12] = intern (":docstring");
9319     plist[13] = build_string ("Do no conversion.\n\
9320 \n\
9321 When you visit a file with this coding, the file is read into a\n\
9322 unibyte buffer as is, thus each byte of a file is treated as a\n\
9323 character.");
9324     plist[14] = intern (":eol-type");
9325     plist[15] = args[coding_arg_eol_type] = Qunix;
9326     args[coding_arg_plist] = Flist (16, plist);
9327     Fdefine_coding_system_internal (coding_arg_max, args);
9328   }
9329
9330   setup_coding_system (Qno_conversion, &keyboard_coding);
9331   setup_coding_system (Qno_conversion, &terminal_coding);
9332   setup_coding_system (Qno_conversion, &safe_terminal_coding);
9333
9334   {
9335     int i;
9336
9337     for (i = 0; i < coding_category_max; i++)
9338       Fset (AREF (Vcoding_category_table, i), Qno_conversion);
9339   }
9340 }
9341
9342 char *
9343 emacs_strerror (error_number)
9344      int error_number;
9345 {
9346   char *str;
9347
9348   synchronize_system_messages_locale ();
9349   str = strerror (error_number);
9350
9351   if (! NILP (Vlocale_coding_system))
9352     {
9353       Lisp_Object dec = code_convert_string_norecord (build_string (str),
9354                                                       Vlocale_coding_system,
9355                                                       0);
9356       str = (char *) SDATA (dec);
9357     }
9358
9359   return str;
9360 }
9361
9362 #endif /* emacs */