src/coding.c

   1 /* Coding system handler (conversion, detection, and etc).
   2    Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
   3    Licensed to the Free Software Foundation.
   4    Copyright (C) 2001 Free Software Foundation, Inc.
   5    Copyright (C) 2001, 2002
   6      National Institute of Advanced Industrial Science and Technology (AIST)
   7      Registration Number H13PRO009
   8
   9 This file is part of GNU Emacs.
  10
  11 GNU Emacs is free software; you can redistribute it and/or modify
  12 it under the terms of the GNU General Public License as published by
  13 the Free Software Foundation; either version 2, or (at your option)
  14 any later version.
  15
  16 GNU Emacs is distributed in the hope that it will be useful,
  17 but WITHOUT ANY WARRANTY; without even the implied warranty of
  18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  19 GNU General Public License for more details.
  20
  21 You should have received a copy of the GNU General Public License
  22 along with GNU Emacs; see the file COPYING.  If not, write to
  23 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  24 Boston, MA 02111-1307, USA.  */
  25
  26 /*** TABLE OF CONTENTS ***
  27
  28   0. General comments
  29   1. Preamble
  30   2. Emacs' internal format (emacs-utf-8) handlers
  31   3. UTF-8 handlers
  32   4. UTF-16 handlers
  33   5. Charset-base coding systems handlers
  34   6. emacs-mule (old Emacs' internal format) handlers
  35   7. ISO2022 handlers
  36   8. Shift-JIS and BIG5 handlers
  37   9. CCL handlers
  38   10. C library functions
  39   11. Emacs Lisp library functions
  40   12. Postamble
  41
  42 */
  43
  44 /*** 0. General comments ***
  45
  46
  47 CODING SYSTEM
  48
  49   A coding system is an object for an encoding mechanism that contains
  50   information about how to convert byte sequences to character
  51   sequences and vice versa.  When we say "decode", it means converting
  52   a byte sequence of a specific coding system into a character
  53   sequence that is represented by Emacs' internal coding system
  54   `emacs-utf-8', and when we say "encode", it means converting a
  55   character sequence of emacs-utf-8 to a byte sequence of a specific
  56   coding system.
  57
  58   In Emacs Lisp, a coding system is represented by a Lisp symbol.  In
  59   C level, a coding system is represented by a vector of attributes
  60   stored in the hash table Vcharset_hash_table.  The conversion from
  61   coding system symbol to attributes vector is done by looking up
  62   Vcharset_hash_table by the symbol.
  63
  64   Coding systems are classified into the following types depending on
  65   the encoding mechanism.  Here's a brief description of the types.
  66
  67   o UTF-8
  68
  69   o UTF-16
  70
  71   o Charset-base coding system
  72
  73   A coding system defined by one or more (coded) character sets.
  74   Decoding and encoding are done by a code converter defined for each
  75   character set.
  76
  77   o Old Emacs internal format (emacs-mule)
  78
  79   The coding system adopted by old versions of Emacs (20 and 21).
  80
  81   o ISO2022-base coding system
  82
  83   The most famous coding system for multiple character sets.  X's
  84   Compound Text, various EUCs (Extended Unix Code), and coding systems
  85   used in the Internet communication such as ISO-2022-JP are all
  86   variants of ISO2022.
  87
  88   o SJIS (or Shift-JIS or MS-Kanji-Code)
  89
  90   A coding system to encode character sets: ASCII, JISX0201, and
  91   JISX0208.  Widely used for PC's in Japan.  Details are described in
  92   section 8.
  93
  94   o BIG5
  95
  96   A coding system to encode character sets: ASCII and Big5.  Widely
  97   used by Chinese (mainly in Taiwan and Hong Kong).  Details are
  98   described in section 8.  In this file, when we write "big5" (all
  99   lowercase), we mean the coding system, and when we write "Big5"
 100   (capitalized), we mean the character set.
 101
 102   o CCL
 103
 104   If a user wants to decode/encode text encoded in a coding system
 105   not listed above, he can supply a decoder and an encoder for it in
 106   CCL (Code Conversion Language) programs.  Emacs executes the CCL
 107   program while decoding/encoding.
 108
 109   o Raw-text
 110
 111   A coding system for a text containing raw eight-bit data.  Emacs
 112   treats each byte of source text as a character (except for
 113   end-of-line conversion).
 114
 115   o No-conversion
 116
 117   Like raw text, but don't do end-of-line conversion.
 118
 119
 120 END-OF-LINE FORMAT
 121
 122   How text end-of-line is encoded depends on operating system.  For
 123   instance, Unix's format is just one byte of LF (line-feed) code,
 124   whereas DOS's format is two-byte sequence of `carriage-return' and
 125   `line-feed' codes.  MacOS's format is usually one byte of
 126   `carriage-return'.
 127
 128   Since text character encoding and end-of-line encoding are
 129   independent, any coding system described above can take any format
 130   of end-of-line (except for no-conversion).
 131
 132 STRUCT CODING_SYSTEM
 133
 134   Before using a coding system for code conversion (i.e. decoding and
 135   encoding), we setup a structure of type `struct coding_system'.
 136   This structure keeps various information about a specific code
 137   conversion (e.g. the location of source and destination data).
 138
 139 */
 140
 141 /* COMMON MACROS */
 142
 143
 144 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 145
 146   These functions check if a byte sequence specified as a source in
 147   CODING conforms to the format of XXX.  Return 1 if the data contains
 148   a byte sequence which can be decoded into non-ASCII characters by
 149   the coding system.  Otherwize (i.e. the data contains only ASCII
 150   characters or invalid sequence) return 0.
 151
 152   It also resets some bits of an integer pointed by MASK.  The macros
 153   CATEGORY_MASK_XXX specifies each bit of this integer.
 154
 155   Below is the template of these functions.  */
 156
 157 #if 0
 158 static int
 159 detect_coding_XXX (coding, mask)
 160      struct coding_system *coding;
 161      int *mask;
 162 {
 163   unsigned char *src = coding->source;
 164   unsigned char *src_end = coding->source + coding->src_bytes;
 165   int multibytep = coding->src_multibyte;
 166   int c;
 167   int found = 0;
 168   ...;
 169
 170   while (1)
 171     {
 172       /* Get one byte from the source.  If the souce is exausted, jump
 173          to no_more_source:.  */
 174       ONE_MORE_BYTE (c);
 175       /* Check if it conforms to XXX.  If not, break the loop.  */
 176     }
 177   /* As the data is invalid for XXX, reset a proper bits.  */
 178   *mask &= ~CODING_CATEGORY_XXX;
 179   return 0;
 180  no_more_source:
 181   /* The source exausted.  */
 182   if (!found)
 183     /* ASCII characters only. */
 184     return 0;
 185   /* Some data should be decoded into non-ASCII characters.  */
 186   *mask &= CODING_CATEGORY_XXX;
 187   return 1;
 188 }
 189 #endif
 190
 191 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 192
 193   These functions decode a byte sequence specified as a source by
 194   CODING.  The resulting multibyte text goes to a place pointed to by
 195   CODING->charbuf, the length of which should not exceed
 196   CODING->charbuf_size;
 197
 198   These functions set the information of original and decoded texts in
 199   CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
 200   They also set CODING->result to one of CODING_RESULT_XXX indicating
 201   how the decoding is finished.
 202
 203   Below is the template of these functions.  */
 204
 205 #if 0
 206 static void
 207 decode_coding_XXXX (coding)
 208      struct coding_system *coding;
 209 {
 210   unsigned char *src = coding->source + coding->consumed;
 211   unsigned char *src_end = coding->source + coding->src_bytes;
 212   /* SRC_BASE remembers the start position in source in each loop.
 213      The loop will be exited when there's not enough source code, or
 214      when there's no room in CHARBUF for a decoded character.  */
 215   unsigned char *src_base;
 216   /* A buffer to produce decoded characters.  */
 217   int *charbuf = coding->charbuf;
 218   int *charbuf_end = charbuf + coding->charbuf_size;
 219   int multibytep = coding->src_multibyte;
 220
 221   while (1)
 222     {
 223       src_base = src;
 224       if (charbuf < charbuf_end)
 225         /* No more room to produce a decoded character.  */
 226         break;
 227       ONE_MORE_BYTE (c);
 228       /* Decode it. */
 229     }
 230
 231  no_more_source:
 232   if (src_base < src_end
 233       && coding->mode & CODING_MODE_LAST_BLOCK)
 234     /* If the source ends by partial bytes to construct a character,
 235        treat them as eight-bit raw data.  */
 236     while (src_base < src_end && charbuf < charbuf_end)
 237       *charbuf++ = *src_base++;
 238   /* Remember how many bytes and characters we consumed.  If the
 239      source is multibyte, the bytes and chars are not identical.  */
 240   coding->consumed = coding->consumed_char = src_base - coding->source;
 241   /* Remember how many characters we produced.  */
 242   coding->charbuf_used = charbuf - coding->charbuf;
 243 }
 244 #endif
 245
 246 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 247
 248   These functions encode SRC_BYTES length text at SOURCE of Emacs'
 249   internal multibyte format by CODING.  The resulting byte sequence
 250   goes to a place pointed to by DESTINATION, the length of which
 251   should not exceed DST_BYTES.
 252
 253   These functions set the information of original and encoded texts in
 254   the members produced, produced_char, consumed, and consumed_char of
 255   the structure *CODING.  They also set the member result to one of
 256   CODING_RESULT_XXX indicating how the encoding finished.
 257
 258   DST_BYTES zero means that source area and destination area are
 259   overlapped, which means that we can produce a encoded text until it
 260   reaches at the head of not-yet-encoded source text.
 261
 262   Below is a template of these functions.  */
 263 #if 0
 264 static void
 265 encode_coding_XXX (coding)
 266      struct coding_system *coding;
 267 {
 268   int multibytep = coding->dst_multibyte;
 269   int *charbuf = coding->charbuf;
 270   int *charbuf_end = charbuf->charbuf + coding->charbuf_used;
 271   unsigned char *dst = coding->destination + coding->produced;
 272   unsigned char *dst_end = coding->destination + coding->dst_bytes;
 273   unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
 274   int produced_chars = 0;
 275
 276   for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
 277     {
 278       int c = *charbuf;
 279       /* Encode C into DST, and increment DST.  */
 280     }
 281  label_no_more_destination:
 282   /* How many chars and bytes we produced.  */
 283   coding->produced_char += produced_chars;
 284   coding->produced = dst - coding->destination;
 285 }
 286 #endif
 287
 288 \f
 289 /*** 1. Preamble ***/
 290
 291 #include <config.h>
 292 #include <stdio.h>
 293
 294 #include "lisp.h"
 295 #include "buffer.h"
 296 #include "character.h"
 297 #include "charset.h"
 298 #include "ccl.h"
 299 #include "composite.h"
 300 #include "coding.h"
 301 #include "window.h"
 302
 303 Lisp_Object Vcoding_system_hash_table;
 304
 305 Lisp_Object Qcoding_system, Qcoding_aliases, Qeol_type;
 306 Lisp_Object Qunix, Qdos, Qmac;
 307 Lisp_Object Qbuffer_file_coding_system;
 308 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
 309 Lisp_Object Qdefault_char;
 310 Lisp_Object Qno_conversion, Qundecided;
 311 Lisp_Object Qcharset, Qiso_2022, Qutf_8, Qutf_16, Qshift_jis, Qbig5;
 312 Lisp_Object Qutf_16_be_nosig, Qutf_16_be, Qutf_16_le_nosig, Qutf_16_le;
 313 Lisp_Object Qsignature, Qendian, Qbig, Qlittle;
 314 Lisp_Object Qcoding_system_history;
 315 Lisp_Object Qvalid_codes;
 316
 317 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
 318 Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
 319 Lisp_Object Qstart_process, Qopen_network_stream;
 320 Lisp_Object Qtarget_idx;
 321
 322 Lisp_Object Vselect_safe_coding_system_function;
 323
 324 /* Mnemonic string for each format of end-of-line.  */
 325 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
 326 /* Mnemonic string to indicate format of end-of-line is not yet
 327    decided.  */
 328 Lisp_Object eol_mnemonic_undecided;
 329
 330 #ifdef emacs
 331
 332 Lisp_Object Vcoding_system_list, Vcoding_system_alist;
 333
 334 Lisp_Object Qcoding_system_p, Qcoding_system_error;
 335
 336 /* Coding system emacs-mule and raw-text are for converting only
 337    end-of-line format.  */
 338 Lisp_Object Qemacs_mule, Qraw_text;
 339
 340 /* Coding-systems are handed between Emacs Lisp programs and C internal
 341    routines by the following three variables.  */
 342 /* Coding-system for reading files and receiving data from process.  */
 343 Lisp_Object Vcoding_system_for_read;
 344 /* Coding-system for writing files and sending data to process.  */
 345 Lisp_Object Vcoding_system_for_write;
 346 /* Coding-system actually used in the latest I/O.  */
 347 Lisp_Object Vlast_coding_system_used;
 348
 349 /* A vector of length 256 which contains information about special
 350    Latin codes (especially for dealing with Microsoft codes).  */
 351 Lisp_Object Vlatin_extra_code_table;
 352
 353 /* Flag to inhibit code conversion of end-of-line format.  */
 354 int inhibit_eol_conversion;
 355
 356 /* Flag to inhibit ISO2022 escape sequence detection.  */
 357 int inhibit_iso_escape_detection;
 358
 359 /* Flag to make buffer-file-coding-system inherit from process-coding.  */
 360 int inherit_process_coding_system;
 361
 362 /* Coding system to be used to encode text for terminal display.  */
 363 struct coding_system terminal_coding;
 364
 365 /* Coding system to be used to encode text for terminal display when
 366    terminal coding system is nil.  */
 367 struct coding_system safe_terminal_coding;
 368
 369 /* Coding system of what is sent from terminal keyboard.  */
 370 struct coding_system keyboard_coding;
 371
 372 Lisp_Object Vfile_coding_system_alist;
 373 Lisp_Object Vprocess_coding_system_alist;
 374 Lisp_Object Vnetwork_coding_system_alist;
 375
 376 Lisp_Object Vlocale_coding_system;
 377
 378 #endif /* emacs */
 379
 380 /* Flag to tell if we look up translation table on character code
 381    conversion.  */
 382 Lisp_Object Venable_character_translation;
 383 /* Standard translation table to look up on decoding (reading).  */
 384 Lisp_Object Vstandard_translation_table_for_decode;
 385 /* Standard translation table to look up on encoding (writing).  */
 386 Lisp_Object Vstandard_translation_table_for_encode;
 387
 388 Lisp_Object Qtranslation_table;
 389 Lisp_Object Qtranslation_table_id;
 390 Lisp_Object Qtranslation_table_for_decode;
 391 Lisp_Object Qtranslation_table_for_encode;
 392
 393 /* Alist of charsets vs revision number.  */
 394 static Lisp_Object Vcharset_revision_table;
 395
 396 /* Default coding systems used for process I/O.  */
 397 Lisp_Object Vdefault_process_coding_system;
 398
 399 /* Global flag to tell that we can't call post-read-conversion and
 400    pre-write-conversion functions.  Usually the value is zero, but it
 401    is set to 1 temporarily while such functions are running.  This is
 402    to avoid infinite recursive call.  */
 403 static int inhibit_pre_post_conversion;
 404
 405 /* Two special coding systems.  */
 406 Lisp_Object Vsjis_coding_system;
 407 Lisp_Object Vbig5_coding_system;
 408
 409
 410 static int detect_coding_utf_8 P_ ((struct coding_system *, int *));
 411 static void decode_coding_utf_8 P_ ((struct coding_system *));
 412 static int encode_coding_utf_8 P_ ((struct coding_system *));
 413
 414 static int detect_coding_utf_16 P_ ((struct coding_system *, int *));
 415 static void decode_coding_utf_16 P_ ((struct coding_system *));
 416 static int encode_coding_utf_16 P_ ((struct coding_system *));
 417
 418 static int detect_coding_iso_2022 P_ ((struct coding_system *, int *));
 419 static void decode_coding_iso_2022 P_ ((struct coding_system *));
 420 static int encode_coding_iso_2022 P_ ((struct coding_system *));
 421
 422 static int detect_coding_emacs_mule P_ ((struct coding_system *, int *));
 423 static void decode_coding_emacs_mule P_ ((struct coding_system *));
 424 static int encode_coding_emacs_mule P_ ((struct coding_system *));
 425
 426 static int detect_coding_sjis P_ ((struct coding_system *, int *));
 427 static void decode_coding_sjis P_ ((struct coding_system *));
 428 static int encode_coding_sjis P_ ((struct coding_system *));
 429
 430 static int detect_coding_big5 P_ ((struct coding_system *, int *));
 431 static void decode_coding_big5 P_ ((struct coding_system *));
 432 static int encode_coding_big5 P_ ((struct coding_system *));
 433
 434 static int detect_coding_ccl P_ ((struct coding_system *, int *));
 435 static void decode_coding_ccl P_ ((struct coding_system *));
 436 static int encode_coding_ccl P_ ((struct coding_system *));
 437
 438 static void decode_coding_raw_text P_ ((struct coding_system *));
 439 static int encode_coding_raw_text P_ ((struct coding_system *));
 440
 441
 442 /* ISO2022 section */
 443
 444 #define CODING_ISO_INITIAL(coding, reg)                 \
 445   (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id),    \
 446                      coding_attr_iso_initial),          \
 447                reg)))
 448
 449
 450 #define CODING_ISO_REQUEST(coding, charset_id)  \
 451   ((charset_id <= (coding)->max_charset_id      \
 452     ? (coding)->safe_charsets[charset_id]       \
 453     : -1))
 454
 455
 456 #define CODING_ISO_FLAGS(coding)        \
 457   ((coding)->spec.iso_2022.flags)
 458 #define CODING_ISO_DESIGNATION(coding, reg)     \
 459   ((coding)->spec.iso_2022.current_designation[reg])
 460 #define CODING_ISO_INVOCATION(coding, plane)    \
 461   ((coding)->spec.iso_2022.current_invocation[plane])
 462 #define CODING_ISO_SINGLE_SHIFTING(coding)      \
 463   ((coding)->spec.iso_2022.single_shifting)
 464 #define CODING_ISO_BOL(coding)  \
 465   ((coding)->spec.iso_2022.bol)
 466 #define CODING_ISO_INVOKED_CHARSET(coding, plane)       \
 467   CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
 468
 469 /* Control characters of ISO2022.  */
 470                         /* code */      /* function */
 471 #define ISO_CODE_LF     0x0A            /* line-feed */
 472 #define ISO_CODE_CR     0x0D            /* carriage-return */
 473 #define ISO_CODE_SO     0x0E            /* shift-out */
 474 #define ISO_CODE_SI     0x0F            /* shift-in */
 475 #define ISO_CODE_SS2_7  0x19            /* single-shift-2 for 7-bit code */
 476 #define ISO_CODE_ESC    0x1B            /* escape */
 477 #define ISO_CODE_SS2    0x8E            /* single-shift-2 */
 478 #define ISO_CODE_SS3    0x8F            /* single-shift-3 */
 479 #define ISO_CODE_CSI    0x9B            /* control-sequence-introducer */
 480
 481 /* All code (1-byte) of ISO2022 is classified into one of the
 482    followings.  */
 483 enum iso_code_class_type
 484   {
 485     ISO_control_0,              /* Control codes in the range
 486                                    0x00..0x1F and 0x7F, except for the
 487                                    following 5 codes.  */
 488     ISO_carriage_return,        /* ISO_CODE_CR (0x0D) */
 489     ISO_shift_out,              /* ISO_CODE_SO (0x0E) */
 490     ISO_shift_in,               /* ISO_CODE_SI (0x0F) */
 491     ISO_single_shift_2_7,       /* ISO_CODE_SS2_7 (0x19) */
 492     ISO_escape,                 /* ISO_CODE_SO (0x1B) */
 493     ISO_control_1,              /* Control codes in the range
 494                                    0x80..0x9F, except for the
 495                                    following 3 codes.  */
 496     ISO_single_shift_2,         /* ISO_CODE_SS2 (0x8E) */
 497     ISO_single_shift_3,         /* ISO_CODE_SS3 (0x8F) */
 498     ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
 499     ISO_0x20_or_0x7F,           /* Codes of the values 0x20 or 0x7F.  */
 500     ISO_graphic_plane_0,        /* Graphic codes in the range 0x21..0x7E.  */
 501     ISO_0xA0_or_0xFF,           /* Codes of the values 0xA0 or 0xFF.  */
 502     ISO_graphic_plane_1         /* Graphic codes in the range 0xA1..0xFE.  */
 503   };
 504
 505 /** The macros CODING_ISO_FLAG_XXX defines a flag bit of the
 506     `iso-flags' attribute of an iso2022 coding system.  */
 507
 508 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
 509    instead of the correct short-form sequence (e.g. ESC $ A).  */
 510 #define CODING_ISO_FLAG_LONG_FORM       0x0001
 511
 512 /* If set, reset graphic planes and registers at end-of-line to the
 513    initial state.  */
 514 #define CODING_ISO_FLAG_RESET_AT_EOL    0x0002
 515
 516 /* If set, reset graphic planes and registers before any control
 517    characters to the initial state.  */
 518 #define CODING_ISO_FLAG_RESET_AT_CNTL   0x0004
 519
 520 /* If set, encode by 7-bit environment.  */
 521 #define CODING_ISO_FLAG_SEVEN_BITS      0x0008
 522
 523 /* If set, use locking-shift function.  */
 524 #define CODING_ISO_FLAG_LOCKING_SHIFT   0x0010
 525
 526 /* If set, use single-shift function.  Overwrite
 527    CODING_ISO_FLAG_LOCKING_SHIFT.  */
 528 #define CODING_ISO_FLAG_SINGLE_SHIFT    0x0020
 529
 530 /* If set, use designation escape sequence.  */
 531 #define CODING_ISO_FLAG_DESIGNATION     0x0040
 532
 533 /* If set, produce revision number sequence.  */
 534 #define CODING_ISO_FLAG_REVISION        0x0080
 535
 536 /* If set, produce ISO6429's direction specifying sequence.  */
 537 #define CODING_ISO_FLAG_DIRECTION       0x0100
 538
 539 /* If set, assume designation states are reset at beginning of line on
 540    output.  */
 541 #define CODING_ISO_FLAG_INIT_AT_BOL     0x0200
 542
 543 /* If set, designation sequence should be placed at beginning of line
 544    on output.  */
 545 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
 546
 547 /* If set, do not encode unsafe charactes on output.  */
 548 #define CODING_ISO_FLAG_SAFE            0x0800
 549
 550 /* If set, extra latin codes (128..159) are accepted as a valid code
 551    on input.  */
 552 #define CODING_ISO_FLAG_LATIN_EXTRA     0x1000
 553
 554 #define CODING_ISO_FLAG_COMPOSITION     0x2000
 555
 556 #define CODING_ISO_FLAG_EUC_TW_SHIFT    0x4000
 557
 558 #define CODING_ISO_FLAG_FULL_SUPPORT    0x8000
 559
 560 /* A character to be produced on output if encoding of the original
 561    character is prohibited by CODING_ISO_FLAG_SAFE.  */
 562 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION  '?'
 563
 564
 565 /* UTF-16 section */
 566 #define CODING_UTF_16_BOM(coding)       \
 567   ((coding)->spec.utf_16.bom)
 568
 569 #define CODING_UTF_16_ENDIAN(coding)    \
 570   ((coding)->spec.utf_16.endian)
 571
 572 #define CODING_UTF_16_SURROGATE(coding) \
 573   ((coding)->spec.utf_16.surrogate)
 574
 575
 576 /* CCL section */
 577 #define CODING_CCL_DECODER(coding)      \
 578   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
 579 #define CODING_CCL_ENCODER(coding)      \
 580   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
 581 #define CODING_CCL_VALIDS(coding)                                          \
 582   (XSTRING (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)) \
 583    ->data)
 584
 585 /* Index for each coding category in `coding_category_table' */
 586
 587 enum coding_category
 588   {
 589     coding_category_iso_7,
 590     coding_category_iso_7_tight,
 591     coding_category_iso_8_1,
 592     coding_category_iso_8_2,
 593     coding_category_iso_7_else,
 594     coding_category_iso_8_else,
 595     coding_category_utf_8,
 596     coding_category_utf_16_auto,
 597     coding_category_utf_16_be,
 598     coding_category_utf_16_le,
 599     coding_category_utf_16_be_nosig,
 600     coding_category_utf_16_le_nosig,
 601     coding_category_charset,
 602     coding_category_sjis,
 603     coding_category_big5,
 604     coding_category_ccl,
 605     coding_category_emacs_mule,
 606     /* All above are targets of code detection.  */
 607     coding_category_raw_text,
 608     coding_category_undecided,
 609     coding_category_max
 610   };
 611
 612 /* Definitions of flag bits used in detect_coding_XXXX.  */
 613 #define CATEGORY_MASK_ISO_7             (1 << coding_category_iso_7)
 614 #define CATEGORY_MASK_ISO_7_TIGHT       (1 << coding_category_iso_7_tight)
 615 #define CATEGORY_MASK_ISO_8_1           (1 << coding_category_iso_8_1)
 616 #define CATEGORY_MASK_ISO_8_2           (1 << coding_category_iso_8_2)
 617 #define CATEGORY_MASK_ISO_7_ELSE        (1 << coding_category_iso_7_else)
 618 #define CATEGORY_MASK_ISO_8_ELSE        (1 << coding_category_iso_8_else)
 619 #define CATEGORY_MASK_UTF_8             (1 << coding_category_utf_8)
 620 #define CATEGORY_MASK_UTF_16_BE         (1 << coding_category_utf_16_be)
 621 #define CATEGORY_MASK_UTF_16_LE         (1 << coding_category_utf_16_le)
 622 #define CATEGORY_MASK_UTF_16_BE_NOSIG   (1 << coding_category_utf_16_be_nosig)
 623 #define CATEGORY_MASK_UTF_16_LE_NOSIG   (1 << coding_category_utf_16_le_nosig)
 624 #define CATEGORY_MASK_CHARSET           (1 << coding_category_charset)
 625 #define CATEGORY_MASK_SJIS              (1 << coding_category_sjis)
 626 #define CATEGORY_MASK_BIG5              (1 << coding_category_big5)
 627 #define CATEGORY_MASK_CCL               (1 << coding_category_ccl)
 628 #define CATEGORY_MASK_EMACS_MULE        (1 << coding_category_emacs_mule)
 629
 630 /* This value is returned if detect_coding_mask () find nothing other
 631    than ASCII characters.  */
 632 #define CATEGORY_MASK_ANY               \
 633   (CATEGORY_MASK_ISO_7                  \
 634    | CATEGORY_MASK_ISO_7_TIGHT          \
 635    | CATEGORY_MASK_ISO_8_1              \
 636    | CATEGORY_MASK_ISO_8_2              \
 637    | CATEGORY_MASK_ISO_7_ELSE           \
 638    | CATEGORY_MASK_ISO_8_ELSE           \
 639    | CATEGORY_MASK_UTF_8                \
 640    | CATEGORY_MASK_UTF_16_BE            \
 641    | CATEGORY_MASK_UTF_16_LE            \
 642    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 643    | CATEGORY_MASK_UTF_16_LE_NOSIG      \
 644    | CATEGORY_MASK_CHARSET              \
 645    | CATEGORY_MASK_SJIS                 \
 646    | CATEGORY_MASK_BIG5                 \
 647    | CATEGORY_MASK_CCL                  \
 648    | CATEGORY_MASK_EMACS_MULE)
 649
 650
 651 #define CATEGORY_MASK_ISO_7BIT \
 652   (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
 653
 654 #define CATEGORY_MASK_ISO_8BIT \
 655   (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
 656
 657 #define CATEGORY_MASK_ISO_ELSE \
 658   (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
 659
 660 #define CATEGORY_MASK_ISO_ESCAPE        \
 661   (CATEGORY_MASK_ISO_7                  \
 662    | CATEGORY_MASK_ISO_7_TIGHT          \
 663    | CATEGORY_MASK_ISO_7_ELSE           \
 664    | CATEGORY_MASK_ISO_8_ELSE)
 665
 666 #define CATEGORY_MASK_ISO       \
 667   (  CATEGORY_MASK_ISO_7BIT     \
 668      | CATEGORY_MASK_ISO_8BIT   \
 669      | CATEGORY_MASK_ISO_ELSE)
 670
 671 #define CATEGORY_MASK_UTF_16            \
 672   (CATEGORY_MASK_UTF_16_BE              \
 673    | CATEGORY_MASK_UTF_16_LE            \
 674    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 675    | CATEGORY_MASK_UTF_16_LE_NOSIG)
 676
 677
 678 /* List of symbols `coding-category-xxx' ordered by priority.  This
 679    variable is exposed to Emacs Lisp.  */
 680 static Lisp_Object Vcoding_category_list;
 681
 682 /* Table of coding categories (Lisp symbols).  This variable is for
 683    internal use oly.  */
 684 static Lisp_Object Vcoding_category_table;
 685
 686 /* Table of coding-categories ordered by priority.  */
 687 static enum coding_category coding_priorities[coding_category_max];
 688
 689 /* Nth element is a coding context for the coding system bound to the
 690    Nth coding category.  */
 691 static struct coding_system coding_categories[coding_category_max];
 692
 693 static int detected_mask[coding_category_raw_text] =
 694   { CATEGORY_MASK_ISO,
 695     CATEGORY_MASK_ISO,
 696     CATEGORY_MASK_ISO,
 697     CATEGORY_MASK_ISO,
 698     CATEGORY_MASK_ISO,
 699     CATEGORY_MASK_ISO,
 700     CATEGORY_MASK_UTF_8,
 701     CATEGORY_MASK_UTF_16,
 702     CATEGORY_MASK_UTF_16,
 703     CATEGORY_MASK_UTF_16,
 704     CATEGORY_MASK_UTF_16,
 705     CATEGORY_MASK_UTF_16,
 706     CATEGORY_MASK_CHARSET,
 707     CATEGORY_MASK_SJIS,
 708     CATEGORY_MASK_BIG5,
 709     CATEGORY_MASK_CCL,
 710     CATEGORY_MASK_EMACS_MULE
 711   };
 712
 713 /*** Commonly used macros and functions ***/
 714
 715 #ifndef min
 716 #define min(a, b) ((a) < (b) ? (a) : (b))
 717 #endif
 718 #ifndef max
 719 #define max(a, b) ((a) > (b) ? (a) : (b))
 720 #endif
 721
 722 #define CODING_GET_INFO(coding, attrs, eol_type, charset_list)  \
 723   do {                                                          \
 724     attrs = CODING_ID_ATTRS (coding->id);                       \
 725     eol_type = CODING_ID_EOL_TYPE (coding->id);                 \
 726     if (VECTORP (eol_type))                                     \
 727       eol_type = Qunix;                                         \
 728     charset_list = CODING_ATTR_CHARSET_LIST (attrs);            \
 729   } while (0)
 730
 731
 732 /* Safely get one byte from the source text pointed by SRC which ends
 733    at SRC_END, and set C to that byte.  If there are not enough bytes
 734    in the source, it jumps to `no_more_source'.  The caller
 735    should declare and set these variables appropriately in advance:
 736         src, src_end, multibytep
 737 */
 738
 739 #define ONE_MORE_BYTE(c)                                        \
 740   do {                                                          \
 741     if (src == src_end)                                         \
 742       {                                                         \
 743         if (src_base < src)                                     \
 744           coding->result = CODING_RESULT_INSUFFICIENT_SRC;      \
 745         goto no_more_source;                                    \
 746       }                                                         \
 747     c = *src++;                                                 \
 748     if (multibytep && (c & 0x80))                               \
 749       {                                                         \
 750         if ((c & 0xFE) != 0xC0)                                 \
 751           error ("Undecodable char found");                     \
 752         c = ((c & 1) << 6) | *src++;                            \
 753       }                                                         \
 754     consumed_chars++;                                           \
 755   } while (0)
 756
 757
 758 #define ONE_MORE_BYTE_NO_CHECK(c)               \
 759   do {                                          \
 760     c = *src++;                                 \
 761     if (multibytep && (c & 0x80))               \
 762       {                                         \
 763         if ((c & 0xFE) != 0xC0)                 \
 764           error ("Undecodable char found");     \
 765         c = ((c & 1) << 6) | *src++;            \
 766       }                                         \
 767   } while (0)
 768
 769
 770 /* Store a byte C in the place pointed by DST and increment DST to the
 771    next free point, and increment PRODUCED_CHARS.  The caller should
 772    assure that C is 0..127, and declare and set the variable `dst'
 773    appropriately in advance.
 774 */
 775
 776
 777 #define EMIT_ONE_ASCII_BYTE(c)  \
 778   do {                          \
 779     produced_chars++;           \
 780     *dst++ = (c);               \
 781   } while (0)
 782
 783
 784 /* Like EMIT_ONE_ASCII_BYTE byt store two bytes; C1 and C2.  */
 785
 786 #define EMIT_TWO_ASCII_BYTES(c1, c2)    \
 787   do {                                  \
 788     produced_chars += 2;                \
 789     *dst++ = (c1), *dst++ = (c2);       \
 790   } while (0)
 791
 792
 793 /* Store a byte C in the place pointed by DST and increment DST to the
 794    next free point, and increment PRODUCED_CHARS.  If MULTIBYTEP is
 795    nonzero, store in an appropriate multibyte from.  The caller should
 796    declare and set the variables `dst' and `multibytep' appropriately
 797    in advance.  */
 798
 799 #define EMIT_ONE_BYTE(c)                \
 800   do {                                  \
 801     produced_chars++;                   \
 802     if (multibytep)                     \
 803       {                                 \
 804         int ch = (c);                   \
 805         if (ch >= 0x80)                 \
 806           ch = BYTE8_TO_CHAR (ch);      \
 807         CHAR_STRING_ADVANCE (ch, dst);  \
 808       }                                 \
 809     else                                \
 810       *dst++ = (c);                     \
 811   } while (0)
 812
 813
 814 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2.  */
 815
 816 #define EMIT_TWO_BYTES(c1, c2)          \
 817   do {                                  \
 818     produced_chars += 2;                \
 819     if (multibytep)                     \
 820       {                                 \
 821         int ch;                         \
 822                                         \
 823         ch = (c1);                      \
 824         if (ch >= 0x80)                 \
 825           ch = BYTE8_TO_CHAR (ch);      \
 826         CHAR_STRING_ADVANCE (ch, dst);  \
 827         ch = (c2);                      \
 828         if (ch >= 0x80)                 \
 829           ch = BYTE8_TO_CHAR (ch);      \
 830         CHAR_STRING_ADVANCE (ch, dst);  \
 831       }                                 \
 832     else                                \
 833       {                                 \
 834         *dst++ = (c1);                  \
 835         *dst++ = (c2);                  \
 836       }                                 \
 837   } while (0)
 838
 839
 840 #define EMIT_THREE_BYTES(c1, c2, c3)    \
 841   do {                                  \
 842     EMIT_ONE_BYTE (c1);                 \
 843     EMIT_TWO_BYTES (c2, c3);            \
 844   } while (0)
 845
 846
 847 #define EMIT_FOUR_BYTES(c1, c2, c3, c4)         \
 848   do {                                          \
 849     EMIT_TWO_BYTES (c1, c2);                    \
 850     EMIT_TWO_BYTES (c3, c4);                    \
 851   } while (0)
 852
 853
 854 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
 855   do {                                                                       \
 856     charset_map_loaded = 0;                                                  \
 857     c = DECODE_CHAR (charset, code);                                         \
 858     if (charset_map_loaded)                                                  \
 859       {                                                                      \
 860         unsigned char *orig = coding->source;                                \
 861         EMACS_INT offset;                                                    \
 862                                                                              \
 863         coding_set_source (coding);                                          \
 864         offset = coding->source - orig;                                      \
 865         src += offset;                                                       \
 866         src_base += offset;                                                  \
 867         src_end += offset;                                                   \
 868       }                                                                      \
 869   } while (0)
 870
 871
 872 #define ASSURE_DESTINATION(bytes)                               \
 873   do {                                                          \
 874     if (dst + (bytes) >= dst_end)                               \
 875       {                                                         \
 876         int more_bytes = charbuf_end - charbuf + (bytes);       \
 877                                                                 \
 878         dst = alloc_destination (coding, more_bytes, dst);      \
 879         dst_end = coding->destination + coding->dst_bytes;      \
 880       }                                                         \
 881   } while (0)
 882
 883
 884
 885 static void
 886 coding_set_source (coding)
 887      struct coding_system *coding;
 888 {
 889   if (BUFFERP (coding->src_object))
 890     {
 891       if (coding->src_pos < 0)
 892         coding->source = GAP_END_ADDR + coding->src_pos_byte;
 893       else
 894         {
 895           struct buffer *buf = XBUFFER (coding->src_object);
 896           EMACS_INT gpt_byte = BUF_GPT_BYTE (buf);
 897           unsigned char *beg_addr = BUF_BEG_ADDR (buf);
 898
 899           coding->source = beg_addr + coding->src_pos_byte - 1;
 900           if (coding->src_pos_byte >= gpt_byte)
 901             coding->source += BUF_GAP_SIZE (buf);
 902         }
 903     }
 904   else if (STRINGP (coding->src_object))
 905     {
 906       coding->source = (XSTRING (coding->src_object)->data
 907                         + coding->src_pos_byte);
 908     }
 909   else
 910     /* Otherwise, the source is C string and is never relocated
 911        automatically.  Thus we don't have to update anything.  */
 912     ;
 913 }
 914
 915 static void
 916 coding_set_destination (coding)
 917      struct coding_system *coding;
 918 {
 919   if (BUFFERP (coding->dst_object))
 920     {
 921       /* We are sure that coding->dst_pos_byte is before the gap of the
 922          buffer. */
 923       coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
 924                              + coding->dst_pos_byte - 1);
 925       if (coding->src_pos < 0)
 926         coding->dst_bytes = (GAP_END_ADDR
 927                              - (coding->src_bytes - coding->consumed)
 928                              - coding->destination);
 929       else
 930         coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
 931                              - coding->destination);
 932     }
 933   else
 934     /* Otherwise, the destination is C string and is never relocated
 935        automatically.  Thus we don't have to update anything.  */
 936     ;
 937 }
 938
 939
 940 static void
 941 coding_alloc_by_realloc (coding, bytes)
 942      struct coding_system *coding;
 943      EMACS_INT bytes;
 944 {
 945   coding->destination = (unsigned char *) xrealloc (coding->destination,
 946                                                     coding->dst_bytes + bytes);
 947   coding->dst_bytes += bytes;
 948 }
 949
 950 static void
 951 coding_alloc_by_making_gap (coding, bytes)
 952      struct coding_system *coding;
 953      EMACS_INT bytes;
 954 {
 955   if (BUFFERP (coding->dst_object)
 956       && EQ (coding->src_object, coding->dst_object))
 957     {
 958       EMACS_INT add = coding->src_bytes - coding->consumed;
 959
 960       GAP_SIZE -= add; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
 961       make_gap (bytes);
 962       GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
 963     }
 964   else
 965     {
 966       Lisp_Object this_buffer;
 967
 968       this_buffer = Fcurrent_buffer ();
 969       set_buffer_internal (XBUFFER (coding->dst_object));
 970       make_gap (bytes);
 971       set_buffer_internal (XBUFFER (this_buffer));
 972     }
 973 }
 974
 975
 976 static unsigned char *
 977 alloc_destination (coding, nbytes, dst)
 978      struct coding_system *coding;
 979      int nbytes;
 980      unsigned char *dst;
 981 {
 982   EMACS_INT offset = dst - coding->destination;
 983
 984   if (BUFFERP (coding->dst_object))
 985     coding_alloc_by_making_gap (coding, nbytes);
 986   else
 987     coding_alloc_by_realloc (coding, nbytes);
 988   coding->result = CODING_RESULT_SUCCESS;
 989   coding_set_destination (coding);
 990   dst = coding->destination + offset;
 991   return dst;
 992 }
 993
 994 \f
 995 /*** 2. Emacs' internal format (emacs-utf-8) ***/
 996
 997
 998
 999 \f
1000 /*** 3. UTF-8 ***/
1001
1002 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1003    Check if a text is encoded in UTF-8.  If it is, return
1004    CATEGORY_MASK_UTF_8, else return 0.  */
1005
1006 #define UTF_8_1_OCTET_P(c)         ((c) < 0x80)
1007 #define UTF_8_EXTRA_OCTET_P(c)     (((c) & 0xC0) == 0x80)
1008 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
1009 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
1010 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
1011 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
1012
1013 static int
1014 detect_coding_utf_8 (coding, mask)
1015      struct coding_system *coding;
1016      int *mask;
1017 {
1018   unsigned char *src = coding->source, *src_base = src;
1019   unsigned char *src_end = coding->source + coding->src_bytes;
1020   int multibytep = coding->src_multibyte;
1021   int consumed_chars = 0;
1022   int found = 0;
1023
1024   /* A coding system of this category is always ASCII compatible.  */
1025   src += coding->head_ascii;
1026
1027   while (1)
1028     {
1029       int c, c1, c2, c3, c4;
1030
1031       ONE_MORE_BYTE (c);
1032       if (UTF_8_1_OCTET_P (c))
1033         continue;
1034       ONE_MORE_BYTE (c1);
1035       if (! UTF_8_EXTRA_OCTET_P (c1))
1036         break;
1037       if (UTF_8_2_OCTET_LEADING_P (c))
1038         {
1039           found++;
1040           continue;
1041         }
1042       ONE_MORE_BYTE (c2);
1043       if (! UTF_8_EXTRA_OCTET_P (c2))
1044         break;
1045       if (UTF_8_3_OCTET_LEADING_P (c))
1046         {
1047           found++;
1048           continue;
1049         }
1050       ONE_MORE_BYTE (c3);
1051       if (! UTF_8_EXTRA_OCTET_P (c3))
1052         break;
1053       if (UTF_8_4_OCTET_LEADING_P (c))
1054         {
1055           found++;
1056           continue;
1057         }
1058       ONE_MORE_BYTE (c4);
1059       if (! UTF_8_EXTRA_OCTET_P (c4))
1060         break;
1061       if (UTF_8_5_OCTET_LEADING_P (c))
1062         {
1063           found++;
1064           continue;
1065         }
1066       break;
1067     }
1068   *mask &= ~CATEGORY_MASK_UTF_8;
1069   return 0;
1070
1071  no_more_source:
1072   if (! found)
1073     return 0;
1074   *mask &= CATEGORY_MASK_UTF_8;
1075   return 1;
1076 }
1077
1078
1079 static void
1080 decode_coding_utf_8 (coding)
1081      struct coding_system *coding;
1082 {
1083   unsigned char *src = coding->source + coding->consumed;
1084   unsigned char *src_end = coding->source + coding->src_bytes;
1085   unsigned char *src_base;
1086   int *charbuf = coding->charbuf;
1087   int *charbuf_end = charbuf + coding->charbuf_size;
1088   int consumed_chars = 0, consumed_chars_base;
1089   int multibytep = coding->src_multibyte;
1090   Lisp_Object attr, eol_type, charset_list;
1091
1092   CODING_GET_INFO (coding, attr, eol_type, charset_list);
1093
1094   while (1)
1095     {
1096       int c, c1, c2, c3, c4, c5;
1097
1098       src_base = src;
1099       consumed_chars_base = consumed_chars;
1100
1101       if (charbuf >= charbuf_end)
1102         break;
1103
1104       ONE_MORE_BYTE (c1);
1105       if (UTF_8_1_OCTET_P(c1))
1106         {
1107           c = c1;
1108           if (c == '\r')
1109             {
1110               if (EQ (eol_type, Qdos))
1111                 {
1112                   if (src == src_end)
1113                     goto no_more_source;
1114                   if (*src == '\n')
1115                     ONE_MORE_BYTE (c);
1116                 }
1117               else if (EQ (eol_type, Qmac))
1118                 c = '\n';
1119             }
1120         }
1121       else
1122         {
1123           ONE_MORE_BYTE (c2);
1124           if (! UTF_8_EXTRA_OCTET_P (c2))
1125             goto invalid_code;
1126           if (UTF_8_2_OCTET_LEADING_P (c1))
1127             c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1128           else
1129             {
1130               ONE_MORE_BYTE (c3);
1131               if (! UTF_8_EXTRA_OCTET_P (c3))
1132                 goto invalid_code;
1133               if (UTF_8_3_OCTET_LEADING_P (c1))
1134                 c = (((c1 & 0xF) << 12)
1135                      | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
1136               else
1137                 {
1138                   ONE_MORE_BYTE (c4);
1139                   if (! UTF_8_EXTRA_OCTET_P (c4))
1140                     goto invalid_code;
1141                   if (UTF_8_4_OCTET_LEADING_P (c1))
1142                     c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1143                          | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
1144                   else
1145                     {
1146                       ONE_MORE_BYTE (c5);
1147                       if (! UTF_8_EXTRA_OCTET_P (c5))
1148                         goto invalid_code;
1149                       if (UTF_8_5_OCTET_LEADING_P (c1))
1150                         {
1151                           c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1152                                | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1153                                | (c5 & 0x3F));
1154                           if (c > MAX_CHAR)
1155                             goto invalid_code;
1156                         }
1157                       else
1158                         goto invalid_code;
1159                     }
1160                 }
1161             }
1162         }
1163
1164       *charbuf++ = c;
1165       continue;
1166
1167     invalid_code:
1168       src = src_base;
1169       consumed_chars = consumed_chars_base;
1170       ONE_MORE_BYTE (c);
1171       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1172       coding->errors++;
1173     }
1174
1175  no_more_source:
1176   coding->consumed_char += consumed_chars_base;
1177   coding->consumed = src_base - coding->source;
1178   coding->charbuf_used = charbuf - coding->charbuf;
1179 }
1180
1181
1182 static int
1183 encode_coding_utf_8 (coding)
1184      struct coding_system *coding;
1185 {
1186   int multibytep = coding->dst_multibyte;
1187   int *charbuf = coding->charbuf;
1188   int *charbuf_end = charbuf + coding->charbuf_used;
1189   unsigned char *dst = coding->destination + coding->produced;
1190   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1191   int produced_chars = 0;
1192   int c;
1193
1194   if (multibytep)
1195     {
1196       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1197
1198       while (charbuf < charbuf_end)
1199         {
1200           unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1201
1202           ASSURE_DESTINATION (safe_room);
1203           c = *charbuf++;
1204           CHAR_STRING_ADVANCE (c, pend);
1205           for (p = str; p < pend; p++)
1206             EMIT_ONE_BYTE (*p);
1207         }
1208     }
1209   else
1210     {
1211       int safe_room = MAX_MULTIBYTE_LENGTH;
1212
1213       while (charbuf < charbuf_end)
1214         {
1215           ASSURE_DESTINATION (safe_room);
1216           c = *charbuf++;
1217           dst += CHAR_STRING (c, dst);
1218           produced_chars++;
1219         }
1220     }
1221   coding->result = CODING_RESULT_SUCCESS;
1222   coding->produced_char += produced_chars;
1223   coding->produced = dst - coding->destination;
1224   return 0;
1225 }
1226
1227
1228 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1229    Check if a text is encoded in UTF-16 Big Endian (endian == 1) or
1230    Little Endian (otherwise).  If it is, return
1231    CATEGORY_MASK_UTF_16_BE or CATEGORY_MASK_UTF_16_LE,
1232    else return 0.  */
1233
1234 #define UTF_16_HIGH_SURROGATE_P(val) \
1235   (((val) & 0xFC00) == 0xD800)
1236
1237 #define UTF_16_LOW_SURROGATE_P(val) \
1238   (((val) & 0xFC00) == 0xDC00)
1239
1240 #define UTF_16_INVALID_P(val)   \
1241   (((val) == 0xFFFE)            \
1242    || ((val) == 0xFFFF)         \
1243    || UTF_16_LOW_SURROGATE_P (val))
1244
1245
1246 static int
1247 detect_coding_utf_16 (coding, mask)
1248      struct coding_system *coding;
1249      int *mask;
1250 {
1251   unsigned char *src = coding->source, *src_base = src;
1252   unsigned char *src_end = coding->source + coding->src_bytes;
1253   int multibytep = coding->src_multibyte;
1254   int consumed_chars = 0;
1255   int c1, c2;
1256
1257   ONE_MORE_BYTE (c1);
1258   ONE_MORE_BYTE (c2);
1259
1260   if ((c1 == 0xFF) && (c2 == 0xFE))
1261     {
1262       *mask &= CATEGORY_MASK_UTF_16_LE;
1263       return 1;
1264     }
1265   else if ((c1 == 0xFE) && (c2 == 0xFF))
1266     {
1267       *mask &= CATEGORY_MASK_UTF_16_BE;
1268       return 1;
1269     }
1270  no_more_source:
1271   return 0;
1272 }
1273
1274 static void
1275 decode_coding_utf_16 (coding)
1276      struct coding_system *coding;
1277 {
1278   unsigned char *src = coding->source + coding->consumed;
1279   unsigned char *src_end = coding->source + coding->src_bytes;
1280   unsigned char *src_base;
1281   int *charbuf = coding->charbuf;
1282   int *charbuf_end = charbuf + coding->charbuf_size;
1283   int consumed_chars = 0, consumed_chars_base;
1284   int multibytep = coding->src_multibyte;
1285   enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding);
1286   enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1287   int surrogate = CODING_UTF_16_SURROGATE (coding);
1288   Lisp_Object attr, eol_type, charset_list;
1289
1290   CODING_GET_INFO (coding, attr, eol_type, charset_list);
1291
1292   if (bom != utf_16_without_bom)
1293     {
1294       int c, c1, c2;
1295
1296       src_base = src;
1297       ONE_MORE_BYTE (c1);
1298       ONE_MORE_BYTE (c2);
1299       c = (c1 << 8) | c2;
1300       if (bom == utf_16_with_bom)
1301         {
1302           if (endian == utf_16_big_endian
1303               ? c != 0xFFFE : c != 0xFEFF)
1304             {
1305               /* We are sure that there's enouph room at CHARBUF.  */
1306               *charbuf++ = c1;
1307               *charbuf++ = c2;
1308               coding->errors++;
1309             }
1310         }
1311       else
1312         {
1313           if (c == 0xFFFE)
1314             CODING_UTF_16_ENDIAN (coding)
1315               = endian = utf_16_big_endian;
1316           else if (c == 0xFEFF)
1317             CODING_UTF_16_ENDIAN (coding)
1318               = endian = utf_16_little_endian;
1319           else
1320             {
1321               CODING_UTF_16_ENDIAN (coding)
1322                 = endian = utf_16_big_endian;
1323               src = src_base;
1324             }
1325         }
1326       CODING_UTF_16_BOM (coding) = utf_16_with_bom;
1327     }
1328
1329   while (1)
1330     {
1331       int c, c1, c2;
1332
1333       src_base = src;
1334       consumed_chars_base = consumed_chars;
1335
1336       if (charbuf + 2 >= charbuf_end)
1337         break;
1338
1339       ONE_MORE_BYTE (c1);
1340       ONE_MORE_BYTE (c2);
1341       c = (endian == utf_16_big_endian
1342            ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1343       if (surrogate)
1344         {
1345           if (! UTF_16_LOW_SURROGATE_P (c))
1346             {
1347               if (endian == utf_16_big_endian)
1348                 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1349               else
1350                 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1351               *charbuf++ = c1;
1352               *charbuf++ = c2;
1353               coding->errors++;
1354               if (UTF_16_HIGH_SURROGATE_P (c))
1355                 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1356               else
1357                 *charbuf++ = c;
1358             }
1359           else
1360             {
1361               c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1362               CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1363               *charbuf++ = c;
1364             }
1365         }
1366       else
1367         {
1368           if (UTF_16_HIGH_SURROGATE_P (c))
1369             CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1370           else
1371             *charbuf++ = c;
1372         }
1373     }
1374
1375  no_more_source:
1376   coding->consumed_char += consumed_chars_base;
1377   coding->consumed = src_base - coding->source;
1378   coding->charbuf_used = charbuf - coding->charbuf;
1379 }
1380
1381 static int
1382 encode_coding_utf_16 (coding)
1383      struct coding_system *coding;
1384 {
1385   int multibytep = coding->dst_multibyte;
1386   int *charbuf = coding->charbuf;
1387   int *charbuf_end = charbuf + coding->charbuf_used;
1388   unsigned char *dst = coding->destination + coding->produced;
1389   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1390   int safe_room = 8;
1391   enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding);
1392   int big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1393   int produced_chars = 0;
1394   Lisp_Object attrs, eol_type, charset_list;
1395   int c;
1396
1397   CODING_GET_INFO (coding, attrs, eol_type, charset_list);
1398
1399   if (bom == utf_16_with_bom)
1400     {
1401       ASSURE_DESTINATION (safe_room);
1402       if (big_endian)
1403         EMIT_TWO_BYTES (0xFF, 0xFE);
1404       else
1405         EMIT_TWO_BYTES (0xFE, 0xFF);
1406       CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1407     }
1408
1409   while (charbuf < charbuf_end)
1410     {
1411       ASSURE_DESTINATION (safe_room);
1412       c = *charbuf++;
1413       if (c >= MAX_UNICODE_CHAR)
1414         c = coding->default_char;
1415
1416       if (c < 0x10000)
1417         {
1418           if (big_endian)
1419             EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1420           else
1421             EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1422         }
1423       else
1424         {
1425           int c1, c2;
1426
1427           c -= 0x10000;
1428           c1 = (c >> 10) + 0xD800;
1429           c2 = (c & 0x3FF) + 0xDC00;
1430           if (big_endian)
1431             EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1432           else
1433             EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1434         }
1435     }
1436   coding->result = CODING_RESULT_SUCCESS;
1437   coding->produced = dst - coding->destination;
1438   coding->produced_char += produced_chars;
1439   return 0;
1440 }
1441
1442 \f
1443 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1444
1445 /* Emacs' internal format for representation of multiple character
1446    sets is a kind of multi-byte encoding, i.e. characters are
1447    represented by variable-length sequences of one-byte codes.
1448
1449    ASCII characters and control characters (e.g. `tab', `newline') are
1450    represented by one-byte sequences which are their ASCII codes, in
1451    the range 0x00 through 0x7F.
1452
1453    8-bit characters of the range 0x80..0x9F are represented by
1454    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1455    code + 0x20).
1456
1457    8-bit characters of the range 0xA0..0xFF are represented by
1458    one-byte sequences which are their 8-bit code.
1459
1460    The other characters are represented by a sequence of `base
1461    leading-code', optional `extended leading-code', and one or two
1462    `position-code's.  The length of the sequence is determined by the
1463    base leading-code.  Leading-code takes the range 0x81 through 0x9D,
1464    whereas extended leading-code and position-code take the range 0xA0
1465    through 0xFF.  See `charset.h' for more details about leading-code
1466    and position-code.
1467
1468    --- CODE RANGE of Emacs' internal format ---
1469    character set        range
1470    -------------        -----
1471    ascii                0x00..0x7F
1472    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1473    eight-bit-graphic    0xA0..0xBF
1474    ELSE                 0x81..0x9D + [0xA0..0xFF]+
1475    ---------------------------------------------
1476
1477    As this is the internal character representation, the format is
1478    usually not used externally (i.e. in a file or in a data sent to a
1479    process).  But, it is possible to have a text externally in this
1480    format (i.e. by encoding by the coding system `emacs-mule').
1481
1482    In that case, a sequence of one-byte codes has a slightly different
1483    form.
1484
1485    At first, all characters in eight-bit-control are represented by
1486    one-byte sequences which are their 8-bit code.
1487
1488    Next, character composition data are represented by the byte
1489    sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1490    where,
1491         METHOD is 0xF0 plus one of composition method (enum
1492         composition_method),
1493
1494         BYTES is 0xA0 plus a byte length of this composition data,
1495
1496         CHARS is 0x20 plus a number of characters composed by this
1497         data,
1498
1499         COMPONENTs are characters of multibye form or composition
1500         rules encoded by two-byte of ASCII codes.
1501
1502    In addition, for backward compatibility, the following formats are
1503    also recognized as composition data on decoding.
1504
1505    0x80 MSEQ ...
1506    0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1507
1508    Here,
1509         MSEQ is a multibyte form but in these special format:
1510           ASCII: 0xA0 ASCII_CODE+0x80,
1511           other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1512         RULE is a one byte code of the range 0xA0..0xF0 that
1513         represents a composition rule.
1514   */
1515
1516 char emacs_mule_bytes[256];
1517
1518 /* Leading-code followed by extended leading-code.  */
1519 #define LEADING_CODE_PRIVATE_11 0x9A /* for private DIMENSION1 of 1-column */
1520 #define LEADING_CODE_PRIVATE_12 0x9B /* for private DIMENSION1 of 2-column */
1521 #define LEADING_CODE_PRIVATE_21 0x9C /* for private DIMENSION2 of 1-column */
1522 #define LEADING_CODE_PRIVATE_22 0x9D /* for private DIMENSION2 of 2-column */
1523
1524
1525 int
1526 emacs_mule_char (coding, composition, nbytes, nchars)
1527      struct coding_system *coding;
1528      int composition;
1529      int *nbytes, *nchars;
1530 {
1531   unsigned char *src = coding->source + coding->consumed;
1532   unsigned char *src_end = coding->source + coding->src_bytes;
1533   int multibytep = coding->src_multibyte;
1534   unsigned char *src_base = src;
1535   struct charset *charset;
1536   unsigned code;
1537   int c;
1538   int consumed_chars = 0;
1539
1540   ONE_MORE_BYTE (c);
1541   if (composition)
1542     {
1543       c -= 0x20;
1544       if (c == 0x80)
1545         {
1546           ONE_MORE_BYTE (c);
1547           if (c < 0xA0)
1548             goto invalid_code;
1549           *nbytes = src - src_base;
1550           *nchars = consumed_chars;
1551           return (c - 0x80);
1552         }
1553     }
1554
1555   switch (emacs_mule_bytes[c])
1556     {
1557     case 2:
1558       if (! (charset = emacs_mule_charset[c]))
1559         goto invalid_code;
1560       ONE_MORE_BYTE (c);
1561       code = c & 0x7F;
1562       break;
1563
1564     case 3:
1565       if (c == LEADING_CODE_PRIVATE_11
1566           || c == LEADING_CODE_PRIVATE_12)
1567         {
1568           ONE_MORE_BYTE (c);
1569           if (! (charset = emacs_mule_charset[c]))
1570             goto invalid_code;
1571           ONE_MORE_BYTE (c);
1572           code = c & 0x7F;
1573         }
1574       else
1575         {
1576           if (! (charset = emacs_mule_charset[c]))
1577             goto invalid_code;
1578           ONE_MORE_BYTE (c);
1579           code = (c & 0x7F) << 7;
1580           ONE_MORE_BYTE (c);
1581           code |= c & 0x7F;
1582         }
1583       break;
1584
1585     case 4:
1586       if (! (charset = emacs_mule_charset[c]))
1587         goto invalid_code;
1588       ONE_MORE_BYTE (c);
1589       code = (c & 0x7F) << 7;
1590       ONE_MORE_BYTE (c);
1591       code |= c & 0x7F;
1592       break;
1593
1594     case 1:
1595       code = c;
1596       charset = CHARSET_FROM_ID (ASCII_BYTE_P (code) ? charset_ascii
1597                                  : code < 0xA0 ? charset_8_bit_control
1598                                  : charset_8_bit_graphic);
1599       break;
1600
1601     default:
1602       abort ();
1603     }
1604   c = DECODE_CHAR (charset, code);
1605   if (c < 0)
1606     goto invalid_code;
1607   *nbytes = src - src_base;
1608   *nchars = consumed_chars;
1609   return c;
1610
1611  no_more_source:
1612   return -2;
1613
1614  invalid_code:
1615   return -1;
1616 }
1617
1618
1619 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1620    Check if a text is encoded in `emacs-mule'.  */
1621
1622 static int
1623 detect_coding_emacs_mule (coding, mask)
1624      struct coding_system *coding;
1625      int *mask;
1626 {
1627   unsigned char *src = coding->source, *src_base = src;
1628   unsigned char *src_end = coding->source + coding->src_bytes;
1629   int multibytep = coding->src_multibyte;
1630   int consumed_chars = 0;
1631   int c;
1632   int found = 0;
1633
1634   /* A coding system of this category is always ASCII compatible.  */
1635   src += coding->head_ascii;
1636
1637   while (1)
1638     {
1639       ONE_MORE_BYTE (c);
1640
1641       if (c == 0x80)
1642         {
1643           /* Perhaps the start of composite character.  We simple skip
1644              it because analyzing it is too heavy for detecting.  But,
1645              at least, we check that the composite character
1646              constitues of more than 4 bytes.  */
1647           unsigned char *src_base;
1648
1649         repeat:
1650           src_base = src;
1651           do
1652             {
1653               ONE_MORE_BYTE (c);
1654             }
1655           while (c >= 0xA0);
1656
1657           if (src - src_base <= 4)
1658             break;
1659           found = 1;
1660           if (c == 0x80)
1661             goto repeat;
1662         }
1663
1664       if (c < 0x80)
1665         {
1666           if (c < 0x20
1667               && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
1668             break;
1669         }
1670       else
1671         {
1672           unsigned char *src_base = src - 1;
1673
1674           do
1675             {
1676               ONE_MORE_BYTE (c);
1677             }
1678           while (c >= 0xA0);
1679           if (src - src_base != emacs_mule_bytes[*src_base])
1680             break;
1681           found = 1;
1682         }
1683     }
1684   *mask &= ~CATEGORY_MASK_EMACS_MULE;
1685   return 0;
1686
1687  no_more_source:
1688   if (!found)
1689     return 0;
1690   *mask &= CATEGORY_MASK_EMACS_MULE;
1691   return 1;
1692 }
1693
1694
1695 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
1696
1697 /* Decode a character represented as a component of composition
1698    sequence of Emacs 20/21 style at SRC.  Set C to that character and
1699    update SRC to the head of next character (or an encoded composition
1700    rule).  If SRC doesn't points a composition component, set C to -1.
1701    If SRC points an invalid byte sequence, global exit by a return
1702    value 0.  */
1703
1704 #define DECODE_EMACS_MULE_COMPOSITION_CHAR(buf)                 \
1705   if (1)                                                        \
1706     {                                                           \
1707       int c;                                                    \
1708       int nbytes, nchars;                                       \
1709                                                                 \
1710       if (src == src_end)                                       \
1711         break;                                                  \
1712       c = emacs_mule_char (coding, 1, &nbytes, &nchars);        \
1713       if (c < 0)                                                \
1714         {                                                       \
1715           if (c == -2)                                          \
1716             break;                                              \
1717           goto invalid_code;                                    \
1718         }                                                       \
1719       *buf++ = c;                                               \
1720       src += nbytes;                                            \
1721       consumed_chars += nchars;                                 \
1722     }                                                           \
1723   else
1724
1725
1726 /* Decode a composition rule represented as a component of composition
1727    sequence of Emacs 20 style at SRC.  Set C to the rule.  If SRC
1728    points an invalid byte sequence, set C to -1.  */
1729
1730 #define DECODE_EMACS_MULE_COMPOSITION_RULE(buf)         \
1731   do {                                                  \
1732     int c, gref, nref;                                  \
1733                                                         \
1734     if (src < src_end)                                  \
1735       goto invalid_code;                                \
1736     ONE_MORE_BYTE_NO_CHECK (c);                         \
1737     c -= 0xA0;                                          \
1738     if (c < 0 || c >= 81)                               \
1739       goto invalid_code;                                \
1740                                                         \
1741     gref = c / 9, nref = c % 9;                         \
1742     *buf++ = COMPOSITION_ENCODE_RULE (gref, nref);      \
1743   } while (0)
1744
1745
1746 #define ADD_COMPOSITION_DATA(buf, method, nchars)       \
1747   do {                                                  \
1748     *buf++ = -5;                                        \
1749     *buf++ = coding->produced_char + char_offset;       \
1750     *buf++ = CODING_ANNOTATE_COMPOSITION_MASK;          \
1751     *buf++ = method;                                    \
1752     *buf++ = nchars;                                    \
1753   } while (0)
1754
1755
1756 #define DECODE_EMACS_MULE_21_COMPOSITION(c)                             \
1757   do {                                                                  \
1758     /* Emacs 21 style format.  The first three bytes at SRC are         \
1759        (METHOD - 0xF0), (BYTES - 0xA0), (CHARS - 0xA0), where BYTES is  \
1760        the byte length of this composition information, CHARS is the    \
1761        number of characters composed by this composition.  */           \
1762     enum composition_method method = c - 0xF0;                          \
1763     int consumed_chars_limit;                                           \
1764     int nbytes, nchars;                                                 \
1765                                                                         \
1766     ONE_MORE_BYTE (c);                                                  \
1767     nbytes = c - 0xA0;                                                  \
1768     if (nbytes < 3)                                                     \
1769       goto invalid_code;                                                \
1770     ONE_MORE_BYTE (c);                                                  \
1771     nchars = c - 0xA0;                                                  \
1772     ADD_COMPOSITION_DATA (charbuf, method, nchars);                     \
1773     consumed_chars_limit = consumed_chars_base + nbytes;                \
1774     if (method != COMPOSITION_RELATIVE)                                 \
1775       {                                                                 \
1776         int i = 0;                                                      \
1777         while (consumed_chars < consumed_chars_limit)                   \
1778           {                                                             \
1779             if (i % 2 && method != COMPOSITION_WITH_ALTCHARS)           \
1780               DECODE_EMACS_MULE_COMPOSITION_RULE (charbuf);             \
1781             else                                                        \
1782               DECODE_EMACS_MULE_COMPOSITION_CHAR (charbuf);             \
1783           }                                                             \
1784         if (consumed_chars < consumed_chars_limit)                      \
1785           goto invalid_code;                                            \
1786       }                                                                 \
1787   } while (0)
1788
1789
1790 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION(c)            \
1791   do {                                                          \
1792     /* Emacs 20 style format for relative composition.  */      \
1793     /* Store multibyte form of characters to be composed.  */   \
1794     int components[MAX_COMPOSITION_COMPONENTS * 2 - 1];         \
1795     int *buf = components;                                      \
1796     int i, j;                                                   \
1797                                                                 \
1798     src = src_base;                                             \
1799     ONE_MORE_BYTE (c);          /* skip 0x80 */                 \
1800     for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++)            \
1801       DECODE_EMACS_MULE_COMPOSITION_CHAR (buf);                 \
1802     if (i < 2)                                                  \
1803       goto invalid_code;                                        \
1804     ADD_COMPOSITION_DATA (charbuf, COMPOSITION_RELATIVE, i);    \
1805     for (j = 0; j < i; j++)                                     \
1806       *charbuf++ = components[j];                               \
1807   } while (0)
1808
1809
1810 #define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION(c)            \
1811   do {                                                          \
1812     /* Emacs 20 style format for rule-base composition.  */     \
1813     /* Store multibyte form of characters to be composed.  */   \
1814     int components[MAX_COMPOSITION_COMPONENTS * 2 - 1];         \
1815     int *buf = components;                                      \
1816     int i, j;                                                   \
1817                                                                 \
1818     DECODE_EMACS_MULE_COMPOSITION_CHAR (buf);                   \
1819     for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++)            \
1820       {                                                         \
1821         DECODE_EMACS_MULE_COMPOSITION_RULE (buf);               \
1822         DECODE_EMACS_MULE_COMPOSITION_CHAR (buf);               \
1823       }                                                         \
1824     if (i < 1 || (buf - components) % 2 == 0)                   \
1825       goto invalid_code;                                        \
1826     if (charbuf + i + (i / 2) + 1 < charbuf_end)                \
1827       goto no_more_source;                                      \
1828     ADD_COMPOSITION_DATA (buf, COMPOSITION_WITH_RULE, i);       \
1829     for (j = 0; j < i; j++)                                     \
1830       *charbuf++ = components[j];                               \
1831     for (j = 0; j < i; j += 2)                                  \
1832       *charbuf++ = components[j];                               \
1833   } while (0)
1834
1835
1836 static void
1837 decode_coding_emacs_mule (coding)
1838      struct coding_system *coding;
1839 {
1840   unsigned char *src = coding->source + coding->consumed;
1841   unsigned char *src_end = coding->source + coding->src_bytes;
1842   unsigned char *src_base;
1843   int *charbuf = coding->charbuf;
1844   int *charbuf_end = charbuf + coding->charbuf_size;
1845   int consumed_chars = 0, consumed_chars_base;
1846   int char_offset = 0;
1847   int multibytep = coding->src_multibyte;
1848   Lisp_Object attrs, eol_type, charset_list;
1849
1850   CODING_GET_INFO (coding, attrs, eol_type, charset_list);
1851
1852   while (1)
1853     {
1854       int c;
1855
1856       src_base = src;
1857       consumed_chars_base = consumed_chars;
1858
1859       if (charbuf >= charbuf_end)
1860         break;
1861
1862       ONE_MORE_BYTE (c);
1863
1864       if (c < 0x80)
1865         {
1866           if (c == '\r')
1867             {
1868               if (EQ (eol_type, Qdos))
1869                 {
1870                   if (src == src_end)
1871                     goto no_more_source;
1872                   if (*src == '\n')
1873                     ONE_MORE_BYTE (c);
1874                 }
1875               else if (EQ (eol_type, Qmac))
1876                 c = '\n';
1877             }
1878           *charbuf++ = c;
1879           char_offset++;
1880         }
1881       else if (c == 0x80)
1882         {
1883           if (charbuf + 5 + (MAX_COMPOSITION_COMPONENTS * 2) - 1 > charbuf_end)
1884             break;
1885           ONE_MORE_BYTE (c);
1886           if (c - 0xF0 >= COMPOSITION_RELATIVE
1887               && c - 0xF0 <= COMPOSITION_WITH_RULE_ALTCHARS)
1888             DECODE_EMACS_MULE_21_COMPOSITION (c);
1889           else if (c < 0xC0)
1890             DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION (c);
1891           else if (c == 0xFF)
1892             DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION (c);
1893           else
1894             goto invalid_code;
1895         }
1896       else if (c < 0xA0 && emacs_mule_bytes[c] > 1)
1897         {
1898           int nbytes, nchars;
1899           src--;
1900           c = emacs_mule_char (coding, 0, &nbytes, &nchars);
1901           if (c < 0)
1902             {
1903               if (c == -2)
1904                 break;
1905               goto invalid_code;
1906             }
1907           *charbuf++ = c;
1908           char_offset++;
1909         }
1910       continue;
1911
1912     invalid_code:
1913       src = src_base;
1914       consumed_chars = consumed_chars_base;
1915       ONE_MORE_BYTE (c);
1916       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1917       coding->errors++;
1918     }
1919
1920  no_more_source:
1921   coding->consumed_char += consumed_chars_base;
1922   coding->consumed = src_base - coding->source;
1923   coding->charbuf_used = charbuf - coding->charbuf;
1924 }
1925
1926
1927 #define EMACS_MULE_LEADING_CODES(id, codes)     \
1928   do {                                          \
1929     if (id < 0xA0)                              \
1930       codes[0] = id, codes[1] = 0;              \
1931     else if (id < 0xE0)                         \
1932       codes[0] = 0x9A, codes[1] = id;           \
1933     else if (id < 0xF0)                         \
1934       codes[0] = 0x9B, codes[1] = id;           \
1935     else if (id < 0xF5)                         \
1936       codes[0] = 0x9C, codes[1] = id;           \
1937     else                                        \
1938       codes[0] = 0x9D, codes[1] = id;           \
1939   } while (0);
1940
1941
1942 static int
1943 encode_coding_emacs_mule (coding)
1944      struct coding_system *coding;
1945 {
1946   int multibytep = coding->dst_multibyte;
1947   int *charbuf = coding->charbuf;
1948   int *charbuf_end = charbuf + coding->charbuf_used;
1949   unsigned char *dst = coding->destination + coding->produced;
1950   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1951   int safe_room = 8;
1952   int produced_chars = 0;
1953   Lisp_Object attrs, eol_type, charset_list;
1954   int c;
1955
1956   CODING_GET_INFO (coding, attrs, eol_type, charset_list);
1957
1958   while (charbuf < charbuf_end)
1959     {
1960       ASSURE_DESTINATION (safe_room);
1961       c = *charbuf++;
1962       if (ASCII_CHAR_P (c))
1963         EMIT_ONE_ASCII_BYTE (c);
1964       else
1965         {
1966           struct charset *charset;
1967           unsigned code;
1968           int dimension;
1969           int emacs_mule_id;
1970           unsigned char leading_codes[2];
1971
1972           charset = char_charset (c, charset_list, &code);
1973           if (! charset)
1974             {
1975               c = coding->default_char;
1976               if (ASCII_CHAR_P (c))
1977                 {
1978                   EMIT_ONE_ASCII_BYTE (c);
1979                   continue;
1980                 }
1981               charset = char_charset (c, charset_list, &code);
1982             }
1983           dimension = CHARSET_DIMENSION (charset);
1984           emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
1985           EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
1986           EMIT_ONE_BYTE (leading_codes[0]);
1987           if (leading_codes[1])
1988             EMIT_ONE_BYTE (leading_codes[1]);
1989           if (dimension == 1)
1990             EMIT_ONE_BYTE (code);
1991           else
1992             {
1993               EMIT_ONE_BYTE (code >> 8);
1994               EMIT_ONE_BYTE (code & 0xFF);
1995             }
1996         }
1997     }
1998   coding->result = CODING_RESULT_SUCCESS;
1999   coding->produced_char += produced_chars;
2000   coding->produced = dst - coding->destination;
2001   return 0;
2002 }
2003
2004 \f
2005 /*** 7. ISO2022 handlers ***/
2006
2007 /* The following note describes the coding system ISO2022 briefly.
2008    Since the intention of this note is to help understand the
2009    functions in this file, some parts are NOT ACCURATE or OVERLY
2010    SIMPLIFIED.  For thorough understanding, please refer to the
2011    original document of ISO2022.
2012
2013    ISO2022 provides many mechanisms to encode several character sets
2014    in 7-bit and 8-bit environments.  For 7-bite environments, all text
2015    is encoded using bytes less than 128.  This may make the encoded
2016    text a little bit longer, but the text passes more easily through
2017    several gateways, some of which strip off MSB (Most Signigant Bit).
2018
2019    There are two kinds of character sets: control character set and
2020    graphic character set.  The former contains control characters such
2021    as `newline' and `escape' to provide control functions (control
2022    functions are also provided by escape sequences).  The latter
2023    contains graphic characters such as 'A' and '-'.  Emacs recognizes
2024    two control character sets and many graphic character sets.
2025
2026    Graphic character sets are classified into one of the following
2027    four classes, according to the number of bytes (DIMENSION) and
2028    number of characters in one dimension (CHARS) of the set:
2029    - DIMENSION1_CHARS94
2030    - DIMENSION1_CHARS96
2031    - DIMENSION2_CHARS94
2032    - DIMENSION2_CHARS96
2033
2034    In addition, each character set is assigned an identification tag,
2035    unique for each set, called "final character" (denoted as <F>
2036    hereafter).  The <F> of each character set is decided by ECMA(*)
2037    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
2038    (0x30..0x3F are for private use only).
2039
2040    Note (*): ECMA = European Computer Manufacturers Association
2041
2042    Here are examples of graphic character set [NAME(<F>)]:
2043         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2044         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2045         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2046         o DIMENSION2_CHARS96 -- none for the moment
2047
2048    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2049         C0 [0x00..0x1F] -- control character plane 0
2050         GL [0x20..0x7F] -- graphic character plane 0
2051         C1 [0x80..0x9F] -- control character plane 1
2052         GR [0xA0..0xFF] -- graphic character plane 1
2053
2054    A control character set is directly designated and invoked to C0 or
2055    C1 by an escape sequence.  The most common case is that:
2056    - ISO646's  control character set is designated/invoked to C0, and
2057    - ISO6429's control character set is designated/invoked to C1,
2058    and usually these designations/invocations are omitted in encoded
2059    text.  In a 7-bit environment, only C0 can be used, and a control
2060    character for C1 is encoded by an appropriate escape sequence to
2061    fit into the environment.  All control characters for C1 are
2062    defined to have corresponding escape sequences.
2063
2064    A graphic character set is at first designated to one of four
2065    graphic registers (G0 through G3), then these graphic registers are
2066    invoked to GL or GR.  These designations and invocations can be
2067    done independently.  The most common case is that G0 is invoked to
2068    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
2069    these invocations and designations are omitted in encoded text.
2070    In a 7-bit environment, only GL can be used.
2071
2072    When a graphic character set of CHARS94 is invoked to GL, codes
2073    0x20 and 0x7F of the GL area work as control characters SPACE and
2074    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2075    be used.
2076
2077    There are two ways of invocation: locking-shift and single-shift.
2078    With locking-shift, the invocation lasts until the next different
2079    invocation, whereas with single-shift, the invocation affects the
2080    following character only and doesn't affect the locking-shift
2081    state.  Invocations are done by the following control characters or
2082    escape sequences:
2083
2084    ----------------------------------------------------------------------
2085    abbrev  function                  cntrl escape seq   description
2086    ----------------------------------------------------------------------
2087    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
2088    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
2089    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
2090    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
2091    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
2092    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
2093    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
2094    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
2095    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
2096    ----------------------------------------------------------------------
2097    (*) These are not used by any known coding system.
2098
2099    Control characters for these functions are defined by macros
2100    ISO_CODE_XXX in `coding.h'.
2101
2102    Designations are done by the following escape sequences:
2103    ----------------------------------------------------------------------
2104    escape sequence      description
2105    ----------------------------------------------------------------------
2106    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
2107    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
2108    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
2109    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
2110    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
2111    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
2112    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
2113    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
2114    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
2115    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
2116    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
2117    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
2118    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
2119    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
2120    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
2121    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
2122    ----------------------------------------------------------------------
2123
2124    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2125    of dimension 1, chars 94, and final character <F>, etc...
2126
2127    Note (*): Although these designations are not allowed in ISO2022,
2128    Emacs accepts them on decoding, and produces them on encoding
2129    CHARS96 character sets in a coding system which is characterized as
2130    7-bit environment, non-locking-shift, and non-single-shift.
2131
2132    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2133    '(' must be omitted.  We refer to this as "short-form" hereafter.
2134
2135    Now you may notice that there are a lot of ways for encoding the
2136    same multilingual text in ISO2022.  Actually, there exist many
2137    coding systems such as Compound Text (used in X11's inter client
2138    communication, ISO-2022-JP (used in Japanese internet), ISO-2022-KR
2139    (used in Korean internet), EUC (Extended UNIX Code, used in Asian
2140    localized platforms), and all of these are variants of ISO2022.
2141
2142    In addition to the above, Emacs handles two more kinds of escape
2143    sequences: ISO6429's direction specification and Emacs' private
2144    sequence for specifying character composition.
2145
2146    ISO6429's direction specification takes the following form:
2147         o CSI ']'      -- end of the current direction
2148         o CSI '0' ']'  -- end of the current direction
2149         o CSI '1' ']'  -- start of left-to-right text
2150         o CSI '2' ']'  -- start of right-to-left text
2151    The control character CSI (0x9B: control sequence introducer) is
2152    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2153
2154    Character composition specification takes the following form:
2155         o ESC '0' -- start relative composition
2156         o ESC '1' -- end composition
2157         o ESC '2' -- start rule-base composition (*)
2158         o ESC '3' -- start relative composition with alternate chars  (**)
2159         o ESC '4' -- start rule-base composition with alternate chars  (**)
2160   Since these are not standard escape sequences of any ISO standard,
2161   the use of them for these meaning is restricted to Emacs only.
2162
2163   (*) This form is used only in Emacs 20.5 and the older versions,
2164   but the newer versions can safely decode it.
2165   (**) This form is used only in Emacs 21.1 and the newer versions,
2166   and the older versions can't decode it.
2167
2168   Here's a list of examples usages of these composition escape
2169   sequences (categorized by `enum composition_method').
2170
2171   COMPOSITION_RELATIVE:
2172         ESC 0 CHAR [ CHAR ] ESC 1
2173   COMPOSITOIN_WITH_RULE:
2174         ESC 2 CHAR [ RULE CHAR ] ESC 1
2175   COMPOSITION_WITH_ALTCHARS:
2176         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2177   COMPOSITION_WITH_RULE_ALTCHARS:
2178         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2179
2180 enum iso_code_class_type iso_code_class[256];
2181
2182 #define SAFE_CHARSET_P(coding, id)      \
2183   ((id) <= (coding)->max_charset_id     \
2184    && (coding)->safe_charsets[id] >= 0)
2185
2186
2187 #define SHIFT_OUT_OK(category)  \
2188   (CODING_ISO_INITIAL (&coding_categories[category], 1) >= 0)
2189
2190 static void
2191 setup_iso_safe_charsets (Lisp_Object attrs)
2192 {
2193   Lisp_Object charset_list, safe_charsets;
2194   Lisp_Object request;
2195   Lisp_Object reg_usage;
2196   Lisp_Object tail;
2197   int reg94, reg96;
2198   int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2199   int max_charset_id;
2200
2201   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2202   if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2203       && ! EQ (charset_list, Viso_2022_charset_list))
2204     {
2205       CODING_ATTR_CHARSET_LIST (attrs)
2206         = charset_list = Viso_2022_charset_list;
2207       ASET (attrs, coding_attr_safe_charsets, Qnil);
2208     }
2209
2210   if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2211     return;
2212
2213   max_charset_id = 0;
2214   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2215     {
2216       int id = XINT (XCAR (tail));
2217       if (max_charset_id < id)
2218         max_charset_id = id;
2219     }
2220
2221   safe_charsets = Fmake_string (make_number (max_charset_id + 1),
2222                                 make_number (255));
2223   request = AREF (attrs, coding_attr_iso_request);
2224   reg_usage = AREF (attrs, coding_attr_iso_usage);
2225   reg94 = XINT (XCAR (reg_usage));
2226   reg96 = XINT (XCDR (reg_usage));
2227
2228   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2229     {
2230       Lisp_Object id;
2231       Lisp_Object reg;
2232       struct charset *charset;
2233
2234       id = XCAR (tail);
2235       charset = CHARSET_FROM_ID (XINT (id));
2236       reg = Fcdr (Fassq (request, id));
2237       if (! NILP (reg))
2238         XSTRING (safe_charsets)->data[XINT (id)] = XINT (reg);
2239       else if (charset->iso_chars_96)
2240         {
2241           if (reg96 < 4)
2242             XSTRING (safe_charsets)->data[XINT (id)] = reg96;
2243         }
2244       else
2245         {
2246           if (reg94 < 4)
2247             XSTRING (safe_charsets)->data[XINT (id)] = reg94;
2248         }
2249     }
2250   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2251 }
2252
2253
2254 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2255    Check if a text is encoded in ISO2022.  If it is, returns an
2256    integer in which appropriate flag bits any of:
2257         CATEGORY_MASK_ISO_7
2258         CATEGORY_MASK_ISO_7_TIGHT
2259         CATEGORY_MASK_ISO_8_1
2260         CATEGORY_MASK_ISO_8_2
2261         CATEGORY_MASK_ISO_7_ELSE
2262         CATEGORY_MASK_ISO_8_ELSE
2263    are set.  If a code which should never appear in ISO2022 is found,
2264    returns 0.  */
2265
2266 static int
2267 detect_coding_iso_2022 (coding, mask)
2268      struct coding_system *coding;
2269      int *mask;
2270 {
2271   unsigned char *src = coding->source, *src_base = src;
2272   unsigned char *src_end = coding->source + coding->src_bytes;
2273   int multibytep = coding->src_multibyte;
2274   int mask_iso = CATEGORY_MASK_ISO;
2275   int mask_found = 0, mask_8bit_found = 0;
2276   int reg[4], shift_out = 0, single_shifting = 0;
2277   int id;
2278   int c, c1;
2279   int consumed_chars = 0;
2280   int i;
2281
2282   for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
2283     {
2284       struct coding_system *this = &(coding_categories[i]);
2285       Lisp_Object attrs, val;
2286
2287       attrs = CODING_ID_ATTRS (this->id);
2288       if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2289           && ! EQ (CODING_ATTR_SAFE_CHARSETS (attrs), Viso_2022_charset_list))
2290         setup_iso_safe_charsets (attrs);
2291       val = CODING_ATTR_SAFE_CHARSETS (attrs);
2292       this->max_charset_id = XSTRING (val)->size - 1;
2293       this->safe_charsets = (char *) XSTRING (val)->data;
2294     }
2295
2296   /* A coding system of this category is always ASCII compatible.  */
2297   src += coding->head_ascii;
2298
2299   reg[0] = charset_ascii, reg[1] = reg[2] = reg[3] = -1;
2300   while (mask_iso && src < src_end)
2301     {
2302       ONE_MORE_BYTE (c);
2303       switch (c)
2304         {
2305         case ISO_CODE_ESC:
2306           if (inhibit_iso_escape_detection)
2307             break;
2308           single_shifting = 0;
2309           ONE_MORE_BYTE (c);
2310           if (c >= '(' && c <= '/')
2311             {
2312               /* Designation sequence for a charset of dimension 1.  */
2313               ONE_MORE_BYTE (c1);
2314               if (c1 < ' ' || c1 >= 0x80
2315                   || (id = iso_charset_table[0][c >= ','][c1]) < 0)
2316                 /* Invalid designation sequence.  Just ignore.  */
2317                 break;
2318               reg[(c - '(') % 4] = id;
2319             }
2320           else if (c == '$')
2321             {
2322               /* Designation sequence for a charset of dimension 2.  */
2323               ONE_MORE_BYTE (c);
2324               if (c >= '@' && c <= 'B')
2325                 /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
2326                 reg[0] = id = iso_charset_table[1][0][c];
2327               else if (c >= '(' && c <= '/')
2328                 {
2329                   ONE_MORE_BYTE (c1);
2330                   if (c1 < ' ' || c1 >= 0x80
2331                       || (id = iso_charset_table[1][c >= ','][c1]) < 0)
2332                     /* Invalid designation sequence.  Just ignore.  */
2333                     break;
2334                   reg[(c - '(') % 4] = id;
2335                 }
2336               else
2337                 /* Invalid designation sequence.  Just ignore.  */
2338                 break;
2339             }
2340           else if (c == 'N' || c == 'O')
2341             {
2342               /* ESC <Fe> for SS2 or SS3.  */
2343               mask_iso &= CATEGORY_MASK_ISO_7_ELSE;
2344               break;
2345             }
2346           else if (c >= '0' && c <= '4')
2347             {
2348               /* ESC <Fp> for start/end composition.  */
2349               mask_found |= CATEGORY_MASK_ISO;
2350               break;
2351             }
2352           else
2353             {
2354               /* Invalid escape sequence.  */
2355               mask_iso &= ~CATEGORY_MASK_ISO_ESCAPE;
2356               break;
2357             }
2358
2359           /* We found a valid designation sequence for CHARSET.  */
2360           mask_iso &= ~CATEGORY_MASK_ISO_8BIT;
2361           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
2362                               id))
2363             mask_found |= CATEGORY_MASK_ISO_7;
2364           else
2365             mask_iso &= ~CATEGORY_MASK_ISO_7;
2366           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
2367                               id))
2368             mask_found |= CATEGORY_MASK_ISO_7_TIGHT;
2369           else
2370             mask_iso &= ~CATEGORY_MASK_ISO_7_TIGHT;
2371           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
2372                               id))
2373             mask_found |= CATEGORY_MASK_ISO_7_ELSE;
2374           else
2375             mask_iso &= ~CATEGORY_MASK_ISO_7_ELSE;
2376           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
2377                               id))
2378             mask_found |= CATEGORY_MASK_ISO_8_ELSE;
2379           else
2380             mask_iso &= ~CATEGORY_MASK_ISO_8_ELSE;
2381           break;
2382
2383         case ISO_CODE_SO:
2384           if (inhibit_iso_escape_detection)
2385             break;
2386           single_shifting = 0;
2387           if (shift_out == 0
2388               && (reg[1] >= 0
2389                   || SHIFT_OUT_OK (coding_category_iso_7_else)
2390                   || SHIFT_OUT_OK (coding_category_iso_8_else)))
2391             {
2392               /* Locking shift out.  */
2393               mask_iso &= ~CATEGORY_MASK_ISO_7BIT;
2394               mask_found |= CATEGORY_MASK_ISO_ELSE;
2395             }
2396           break;
2397
2398         case ISO_CODE_SI:
2399           if (inhibit_iso_escape_detection)
2400             break;
2401           single_shifting = 0;
2402           if (shift_out == 1)
2403             {
2404               /* Locking shift in.  */
2405               mask_iso &= ~CATEGORY_MASK_ISO_7BIT;
2406               mask_found |= CATEGORY_MASK_ISO_ELSE;
2407             }
2408           break;
2409
2410         case ISO_CODE_CSI:
2411           single_shifting = 0;
2412         case ISO_CODE_SS2:
2413         case ISO_CODE_SS3:
2414           {
2415             int newmask = CATEGORY_MASK_ISO_8_ELSE;
2416
2417             if (inhibit_iso_escape_detection)
2418               break;
2419             if (c != ISO_CODE_CSI)
2420               {
2421                 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2422                     & CODING_ISO_FLAG_SINGLE_SHIFT)
2423                   newmask |= CATEGORY_MASK_ISO_8_1;
2424                 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
2425                     & CODING_ISO_FLAG_SINGLE_SHIFT)
2426                   newmask |= CATEGORY_MASK_ISO_8_2;
2427                 single_shifting = 1;
2428               }
2429             if (VECTORP (Vlatin_extra_code_table)
2430                 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
2431               {
2432                 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2433                     & CODING_ISO_FLAG_LATIN_EXTRA)
2434                   newmask |= CATEGORY_MASK_ISO_8_1;
2435                 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
2436                     & CODING_ISO_FLAG_LATIN_EXTRA)
2437                   newmask |= CATEGORY_MASK_ISO_8_2;
2438               }
2439             mask_iso &= newmask;
2440             mask_found |= newmask;
2441           }
2442           break;
2443
2444         default:
2445           if (c < 0x80)
2446             {
2447               single_shifting = 0;
2448               break;
2449             }
2450           else if (c < 0xA0)
2451             {
2452               single_shifting = 0;
2453               mask_8bit_found = 1;
2454               if (VECTORP (Vlatin_extra_code_table)
2455                   && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
2456                 {
2457                   int newmask = 0;
2458
2459                   if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2460                       & CODING_ISO_FLAG_LATIN_EXTRA)
2461                     newmask |= CATEGORY_MASK_ISO_8_1;
2462                   if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
2463                       & CODING_ISO_FLAG_LATIN_EXTRA)
2464                     newmask |= CATEGORY_MASK_ISO_8_2;
2465                   mask_iso &= newmask;
2466                   mask_found |= newmask;
2467                 }
2468               else
2469                 return 0;
2470             }
2471           else
2472             {
2473               mask_iso &= ~(CATEGORY_MASK_ISO_7BIT
2474                             | CATEGORY_MASK_ISO_7_ELSE);
2475               mask_found |= CATEGORY_MASK_ISO_8_1;
2476               mask_8bit_found = 1;
2477               /* Check the length of succeeding codes of the range
2478                  0xA0..0FF.  If the byte length is odd, we exclude
2479                  CATEGORY_MASK_ISO_8_2.  We can check this only
2480                  when we are not single shifting.  */
2481               if (!single_shifting
2482                   && mask_iso & CATEGORY_MASK_ISO_8_2)
2483                 {
2484                   int i = 1;
2485                   while (src < src_end)
2486                     {
2487                       ONE_MORE_BYTE (c);
2488                       if (c < 0xA0)
2489                         break;
2490                       i++;
2491                     }
2492
2493                   if (i & 1 && src < src_end)
2494                     mask_iso &= ~CATEGORY_MASK_ISO_8_2;
2495                   else
2496                     mask_found |= CATEGORY_MASK_ISO_8_2;
2497                 }
2498             }
2499           break;
2500         }
2501     }
2502  no_more_source:
2503   if (!mask_iso)
2504     {
2505       *mask &= ~CATEGORY_MASK_ISO;
2506       return 0;
2507     }
2508   if (!mask_found)
2509     return 0;
2510   *mask &= mask_iso & mask_found;
2511   if (! mask_8bit_found)
2512     *mask &= ~(CATEGORY_MASK_ISO_8BIT | CATEGORY_MASK_ISO_8_ELSE);
2513   return 1;
2514 }
2515
2516
2517 /* Set designation state into CODING.  */
2518 #define DECODE_DESIGNATION(reg, dim, chars_96, final)                   \
2519   do {                                                                  \
2520     int id, prev;                                                       \
2521                                                                         \
2522     if (final < '0' || final >= 128                                     \
2523         || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0)        \
2524         || !SAFE_CHARSET_P (coding, id))                                \
2525       {                                                                 \
2526         CODING_ISO_DESIGNATION (coding, reg) = -2;                      \
2527         goto invalid_code;                                              \
2528       }                                                                 \
2529     prev = CODING_ISO_DESIGNATION (coding, reg);                        \
2530     CODING_ISO_DESIGNATION (coding, reg) = id;                          \
2531     /* If there was an invalid designation to REG previously, and this  \
2532        designation is ASCII to REG, we should keep this designation     \
2533        sequence.  */                                                    \
2534     if (prev == -2 && id == charset_ascii)                              \
2535       goto invalid_code;                                                \
2536   } while (0)
2537
2538
2539 #define MAYBE_FINISH_COMPOSITION()                              \
2540   do {                                                          \
2541     int i;                                                      \
2542     if (composition_state == COMPOSING_NO)                      \
2543       break;                                                    \
2544     /* It is assured that we have enough room for producing     \
2545        characters stored in the table `components'.  */         \
2546     if (charbuf + component_idx > charbuf_end)                  \
2547       goto no_more_source;                                      \
2548     composition_state = COMPOSING_NO;                           \
2549     if (method == COMPOSITION_RELATIVE                          \
2550         || method == COMPOSITION_WITH_ALTCHARS)                 \
2551       {                                                         \
2552         for (i = 0; i < component_idx; i++)                     \
2553           *charbuf++ = components[i];                           \
2554         char_offset += component_idx;                           \
2555       }                                                         \
2556     else                                                        \
2557       {                                                         \
2558         for (i = 0; i < component_idx; i += 2)                  \
2559           *charbuf++ = components[i];                           \
2560         char_offset += (component_idx / 2) + 1;                 \
2561       }                                                         \
2562   } while (0)
2563
2564
2565 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
2566    ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
2567    ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
2568    ESC 3 : altchar composition :  ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
2569    ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1
2570   */
2571
2572 #define DECODE_COMPOSITION_START(c1)                                    \
2573   do {                                                                  \
2574     if (c1 == '0'                                                       \
2575         && composition_state == COMPOSING_COMPONENT_CHAR)               \
2576       {                                                                 \
2577         component_len = component_idx;                                  \
2578         composition_state = COMPOSING_CHAR;                             \
2579       }                                                                 \
2580     else                                                                \
2581       {                                                                 \
2582         unsigned char *p;                                               \
2583                                                                         \
2584         MAYBE_FINISH_COMPOSITION ();                                    \
2585         if (charbuf + MAX_COMPOSITION_COMPONENTS > charbuf_end)         \
2586           goto no_more_source;                                          \
2587         for (p = src; p < src_end - 1; p++)                             \
2588           if (*p == ISO_CODE_ESC && p[1] == '1')                        \
2589             break;                                                      \
2590         if (p == src_end - 1)                                           \
2591           {                                                             \
2592             if (coding->mode & CODING_MODE_LAST_BLOCK)                  \
2593               goto invalid_code;                                        \
2594             goto no_more_source;                                        \
2595           }                                                             \
2596                                                                         \
2597         /* This is surely the start of a composition.  */               \
2598         method = (c1 == '0' ? COMPOSITION_RELATIVE                      \
2599                   : c1 == '2' ? COMPOSITION_WITH_RULE                   \
2600                   : c1 == '3' ? COMPOSITION_WITH_ALTCHARS               \
2601                   : COMPOSITION_WITH_RULE_ALTCHARS);                    \
2602         composition_state = (c1 <= '2' ? COMPOSING_CHAR                 \
2603                              : COMPOSING_COMPONENT_CHAR);               \
2604         component_idx = component_len = 0;                              \
2605       }                                                                 \
2606   } while (0)
2607
2608
2609 /* Handle compositoin end sequence ESC 1.  */
2610
2611 #define DECODE_COMPOSITION_END()                                        \
2612   do {                                                                  \
2613     int nchars = (component_len > 0 ? component_idx - component_len     \
2614                   : method == COMPOSITION_RELATIVE ? component_idx      \
2615                   : (component_idx + 1) / 2);                           \
2616     int i;                                                              \
2617     int *saved_charbuf = charbuf;                                       \
2618                                                                         \
2619     ADD_COMPOSITION_DATA (charbuf, method, nchars);                     \
2620     if (method != COMPOSITION_RELATIVE)                                 \
2621       {                                                                 \
2622         if (component_len == 0)                                         \
2623           for (i = 0; i < component_idx; i++)                           \
2624             *charbuf++ = components[i];                                 \
2625         else                                                            \
2626           for (i = 0; i < component_len; i++)                           \
2627             *charbuf++ = components[i];                                 \
2628         *saved_charbuf = saved_charbuf - charbuf;                       \
2629       }                                                                 \
2630     if (method == COMPOSITION_WITH_RULE)                                \
2631       for (i = 0; i < component_idx; i += 2, char_offset++)             \
2632         *charbuf++ = components[i];                                     \
2633     else                                                                \
2634       for (i = component_len; i < component_idx; i++, char_offset++)    \
2635         *charbuf++ = components[i];                                     \
2636     coding->annotated = 1;                                              \
2637     composition_state = COMPOSING_NO;                                   \
2638   } while (0)
2639
2640
2641 /* Decode a composition rule from the byte C1 (and maybe one more byte
2642    from SRC) and store one encoded composition rule in
2643    coding->cmp_data.  */
2644
2645 #define DECODE_COMPOSITION_RULE(c1)                                     \
2646   do {                                                                  \
2647     (c1) -= 32;                                                         \
2648     if (c1 < 81)                /* old format (before ver.21) */        \
2649       {                                                                 \
2650         int gref = (c1) / 9;                                            \
2651         int nref = (c1) % 9;                                            \
2652         if (gref == 4) gref = 10;                                       \
2653         if (nref == 4) nref = 10;                                       \
2654         c1 = COMPOSITION_ENCODE_RULE (gref, nref);                      \
2655       }                                                                 \
2656     else if (c1 < 93)           /* new format (after ver.21) */         \
2657       {                                                                 \
2658         ONE_MORE_BYTE (c2);                                             \
2659         c1 = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32);                \
2660       }                                                                 \
2661     else                                                                \
2662       c1 = 0;                                                           \
2663   } while (0)
2664
2665
2666 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
2667
2668 static void
2669 decode_coding_iso_2022 (coding)
2670      struct coding_system *coding;
2671 {
2672   unsigned char *src = coding->source + coding->consumed;
2673   unsigned char *src_end = coding->source + coding->src_bytes;
2674   unsigned char *src_base;
2675   int *charbuf = coding->charbuf;
2676   int *charbuf_end = charbuf + coding->charbuf_size - 4;
2677   int consumed_chars = 0, consumed_chars_base;
2678   int char_offset = 0;
2679   int multibytep = coding->src_multibyte;
2680   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
2681   int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2682   int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
2683   struct charset *charset;
2684   int c;
2685   /* For handling composition sequence.  */
2686 #define COMPOSING_NO                    0
2687 #define COMPOSING_CHAR                  1
2688 #define COMPOSING_RULE                  2
2689 #define COMPOSING_COMPONENT_CHAR        3
2690 #define COMPOSING_COMPONENT_RULE        4
2691
2692   int composition_state = COMPOSING_NO;
2693   enum composition_method method;
2694   int components[MAX_COMPOSITION_COMPONENTS * 2 + 1];
2695   int component_idx;
2696   int component_len;
2697   Lisp_Object attrs, eol_type, charset_list;
2698
2699   CODING_GET_INFO (coding, attrs, eol_type, charset_list);
2700   setup_iso_safe_charsets (attrs);
2701
2702   while (1)
2703     {
2704       int c1, c2;
2705
2706       src_base = src;
2707       consumed_chars_base = consumed_chars;
2708
2709       if (charbuf >= charbuf_end)
2710         break;
2711
2712       ONE_MORE_BYTE (c1);
2713
2714       /* We produce no character or one character.  */
2715       switch (iso_code_class [c1])
2716         {
2717         case ISO_0x20_or_0x7F:
2718           if (composition_state != COMPOSING_NO)
2719             {
2720               if (composition_state == COMPOSING_RULE
2721                   || composition_state == COMPOSING_COMPONENT_RULE)
2722                 {
2723                   DECODE_COMPOSITION_RULE (c1);
2724                   components[component_idx++] = c1;
2725                   composition_state--;
2726                   continue;
2727                 }
2728               else if (method == COMPOSITION_WITH_RULE)
2729                 composition_state = COMPOSING_RULE;
2730               else if (method == COMPOSITION_WITH_RULE_ALTCHARS
2731                        && composition_state == COMPOSING_COMPONENT_CHAR)
2732                 composition_state = COMPOSING_COMPONENT_CHAR;
2733             }
2734           if (charset_id_0 < 0
2735               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
2736             {
2737               /* This is SPACE or DEL.  */
2738               charset = CHARSET_FROM_ID (charset_ascii);
2739               break;
2740             }
2741           /* This is a graphic character, we fall down ...  */
2742
2743         case ISO_graphic_plane_0:
2744           if (composition_state == COMPOSING_RULE)
2745             {
2746               DECODE_COMPOSITION_RULE (c1);
2747               components[component_idx++] = c1;
2748               composition_state = COMPOSING_CHAR;
2749             }
2750           charset = CHARSET_FROM_ID (charset_id_0);
2751           break;
2752
2753         case ISO_0xA0_or_0xFF:
2754           if (charset_id_1 < 0
2755               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
2756               || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
2757             goto invalid_code;
2758           /* This is a graphic character, we fall down ... */
2759
2760         case ISO_graphic_plane_1:
2761           if (charset_id_1 < 0)
2762             goto invalid_code;
2763           charset = CHARSET_FROM_ID (charset_id_1);
2764           break;
2765
2766         case ISO_carriage_return:
2767           if (c1 == '\r')
2768             {
2769               if (EQ (eol_type, Qdos))
2770                 {
2771                   if (src == src_end)
2772                     goto no_more_source;
2773                   if (*src == '\n')
2774                     ONE_MORE_BYTE (c1);
2775                 }
2776               else if (EQ (eol_type, Qmac))
2777                 c1 = '\n';
2778             }
2779           /* fall through */
2780
2781         case ISO_control_0:
2782           MAYBE_FINISH_COMPOSITION ();
2783           charset = CHARSET_FROM_ID (charset_ascii);
2784           break;
2785
2786         case ISO_control_1:
2787           MAYBE_FINISH_COMPOSITION ();
2788           goto invalid_code;
2789
2790         case ISO_shift_out:
2791           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
2792               || CODING_ISO_DESIGNATION (coding, 1) < 0)
2793             goto invalid_code;
2794           CODING_ISO_INVOCATION (coding, 0) = 1;
2795           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2796           continue;
2797
2798         case ISO_shift_in:
2799           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
2800             goto invalid_code;
2801           CODING_ISO_INVOCATION (coding, 0) = 0;
2802           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2803           continue;
2804
2805         case ISO_single_shift_2_7:
2806         case ISO_single_shift_2:
2807           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
2808             goto invalid_code;
2809           /* SS2 is handled as an escape sequence of ESC 'N' */
2810           c1 = 'N';
2811           goto label_escape_sequence;
2812
2813         case ISO_single_shift_3:
2814           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
2815             goto invalid_code;
2816           /* SS2 is handled as an escape sequence of ESC 'O' */
2817           c1 = 'O';
2818           goto label_escape_sequence;
2819
2820         case ISO_control_sequence_introducer:
2821           /* CSI is handled as an escape sequence of ESC '[' ...  */
2822           c1 = '[';
2823           goto label_escape_sequence;
2824
2825         case ISO_escape:
2826           ONE_MORE_BYTE (c1);
2827         label_escape_sequence:
2828           /* Escape sequences handled here are invocation,
2829              designation, direction specification, and character
2830              composition specification.  */
2831           switch (c1)
2832             {
2833             case '&':           /* revision of following character set */
2834               ONE_MORE_BYTE (c1);
2835               if (!(c1 >= '@' && c1 <= '~'))
2836                 goto invalid_code;
2837               ONE_MORE_BYTE (c1);
2838               if (c1 != ISO_CODE_ESC)
2839                 goto invalid_code;
2840               ONE_MORE_BYTE (c1);
2841               goto label_escape_sequence;
2842
2843             case '$':           /* designation of 2-byte character set */
2844               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
2845                 goto invalid_code;
2846               ONE_MORE_BYTE (c1);
2847               if (c1 >= '@' && c1 <= 'B')
2848                 {       /* designation of JISX0208.1978, GB2312.1980,
2849                            or JISX0208.1980 */
2850                   DECODE_DESIGNATION (0, 2, 0, c1);
2851                 }
2852               else if (c1 >= 0x28 && c1 <= 0x2B)
2853                 {       /* designation of DIMENSION2_CHARS94 character set */
2854                   ONE_MORE_BYTE (c2);
2855                   DECODE_DESIGNATION (c1 - 0x28, 2, 0, c2);
2856                 }
2857               else if (c1 >= 0x2C && c1 <= 0x2F)
2858                 {       /* designation of DIMENSION2_CHARS96 character set */
2859                   ONE_MORE_BYTE (c2);
2860                   DECODE_DESIGNATION (c1 - 0x2C, 2, 1, c2);
2861                 }
2862               else
2863                 goto invalid_code;
2864               /* We must update these variables now.  */
2865               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2866               charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
2867               continue;
2868
2869             case 'n':           /* invocation of locking-shift-2 */
2870               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
2871                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
2872                 goto invalid_code;
2873               CODING_ISO_INVOCATION (coding, 0) = 2;
2874               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2875               continue;
2876
2877             case 'o':           /* invocation of locking-shift-3 */
2878               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
2879                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
2880                 goto invalid_code;
2881               CODING_ISO_INVOCATION (coding, 0) = 3;
2882               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2883               continue;
2884
2885             case 'N':           /* invocation of single-shift-2 */
2886               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
2887                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
2888                 goto invalid_code;
2889               charset = CHARSET_FROM_ID (CODING_ISO_DESIGNATION (coding, 2));
2890               ONE_MORE_BYTE (c1);
2891               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
2892                 goto invalid_code;
2893               break;
2894
2895             case 'O':           /* invocation of single-shift-3 */
2896               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
2897                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
2898                 goto invalid_code;
2899               charset = CHARSET_FROM_ID (CODING_ISO_DESIGNATION (coding, 3));
2900               ONE_MORE_BYTE (c1);
2901               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
2902                 goto invalid_code;
2903               break;
2904
2905             case '0': case '2': case '3': case '4': /* start composition */
2906               if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
2907                 goto invalid_code;
2908               DECODE_COMPOSITION_START (c1);
2909               continue;
2910
2911             case '1':           /* end composition */
2912               if (composition_state == COMPOSING_NO)
2913                 goto invalid_code;
2914               DECODE_COMPOSITION_END ();
2915               continue;
2916
2917             case '[':           /* specification of direction */
2918               if (! CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION)
2919                 goto invalid_code;
2920               /* For the moment, nested direction is not supported.
2921                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
2922                  left-to-right, and nozero means right-to-left.  */
2923               ONE_MORE_BYTE (c1);
2924               switch (c1)
2925                 {
2926                 case ']':       /* end of the current direction */
2927                   coding->mode &= ~CODING_MODE_DIRECTION;
2928
2929                 case '0':       /* end of the current direction */
2930                 case '1':       /* start of left-to-right direction */
2931                   ONE_MORE_BYTE (c1);
2932                   if (c1 == ']')
2933                     coding->mode &= ~CODING_MODE_DIRECTION;
2934                   else
2935                     goto invalid_code;
2936                   break;
2937
2938                 case '2':       /* start of right-to-left direction */
2939                   ONE_MORE_BYTE (c1);
2940                   if (c1 == ']')
2941                     coding->mode |= CODING_MODE_DIRECTION;
2942                   else
2943                     goto invalid_code;
2944                   break;
2945
2946                 default:
2947                   goto invalid_code;
2948                 }
2949               continue;
2950
2951             default:
2952               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
2953                 goto invalid_code;
2954               if (c1 >= 0x28 && c1 <= 0x2B)
2955                 {       /* designation of DIMENSION1_CHARS94 character set */
2956                   ONE_MORE_BYTE (c2);
2957                   DECODE_DESIGNATION (c1 - 0x28, 1, 0, c2);
2958                 }
2959               else if (c1 >= 0x2C && c1 <= 0x2F)
2960                 {       /* designation of DIMENSION1_CHARS96 character set */
2961                   ONE_MORE_BYTE (c2);
2962                   DECODE_DESIGNATION (c1 - 0x2C, 1, 1, c2);
2963                 }
2964               else
2965                 goto invalid_code;
2966               /* We must update these variables now.  */
2967               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2968               charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
2969               continue;
2970             }
2971         }
2972
2973       /* Now we know CHARSET and 1st position code C1 of a character.
2974          Produce a decoded character while getting 2nd position code
2975          C2 if necessary.  */
2976       c1 &= 0x7F;
2977       if (CHARSET_DIMENSION (charset) > 1)
2978         {
2979           ONE_MORE_BYTE (c2);
2980           if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
2981             /* C2 is not in a valid range.  */
2982             goto invalid_code;
2983           c1 = (c1 << 8) | (c2 & 0x7F);
2984           if (CHARSET_DIMENSION (charset) > 2)
2985             {
2986               ONE_MORE_BYTE (c2);
2987               if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
2988                 /* C2 is not in a valid range.  */
2989                 goto invalid_code;
2990               c1 = (c1 << 8) | (c2 & 0x7F);
2991             }
2992         }
2993
2994       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
2995       if (c < 0)
2996         {
2997           MAYBE_FINISH_COMPOSITION ();
2998           for (; src_base < src; src_base++, char_offset++)
2999             {
3000               if (ASCII_BYTE_P (*src_base))
3001                 *charbuf++ = *src_base;
3002               else
3003                 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3004             }
3005         }
3006       else if (composition_state == COMPOSING_NO)
3007         {
3008           *charbuf++ = c;
3009           char_offset++;
3010         }
3011       else
3012         components[component_idx++] = c;
3013       continue;
3014
3015     invalid_code:
3016       MAYBE_FINISH_COMPOSITION ();
3017       src = src_base;
3018       consumed_chars = consumed_chars_base;
3019       ONE_MORE_BYTE (c);
3020       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
3021       coding->errors++;
3022     }
3023
3024  no_more_source:
3025   coding->consumed_char += consumed_chars_base;
3026   coding->consumed = src_base - coding->source;
3027   coding->charbuf_used = charbuf - coding->charbuf;
3028 }
3029
3030
3031 /* ISO2022 encoding stuff.  */
3032
3033 /*
3034    It is not enough to say just "ISO2022" on encoding, we have to
3035    specify more details.  In Emacs, each coding system of ISO2022
3036    variant has the following specifications:
3037         1. Initial designation to G0 thru G3.
3038         2. Allows short-form designation?
3039         3. ASCII should be designated to G0 before control characters?
3040         4. ASCII should be designated to G0 at end of line?
3041         5. 7-bit environment or 8-bit environment?
3042         6. Use locking-shift?
3043         7. Use Single-shift?
3044    And the following two are only for Japanese:
3045         8. Use ASCII in place of JIS0201-1976-Roman?
3046         9. Use JISX0208-1983 in place of JISX0208-1978?
3047    These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
3048    defined by macros CODING_ISO_FLAG_XXX.  See `coding.h' for more
3049    details.
3050 */
3051
3052 /* Produce codes (escape sequence) for designating CHARSET to graphic
3053    register REG at DST, and increment DST.  If <final-char> of CHARSET is
3054    '@', 'A', or 'B' and the coding system CODING allows, produce
3055    designation sequence of short-form.  */
3056
3057 #define ENCODE_DESIGNATION(charset, reg, coding)                        \
3058   do {                                                                  \
3059     unsigned char final_char = CHARSET_ISO_FINAL (charset);             \
3060     char *intermediate_char_94 = "()*+";                                \
3061     char *intermediate_char_96 = ",-./";                                \
3062     int revision = -1;                                                  \
3063     int c;                                                              \
3064                                                                         \
3065     if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION)           \
3066       revision = XINT (CHARSET_ISO_REVISION (charset));                 \
3067                                                                         \
3068     if (revision >= 0)                                                  \
3069       {                                                                 \
3070         EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&');                       \
3071         EMIT_ONE_BYTE ('@' + revision);                                 \
3072       }                                                                 \
3073     EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC);                                 \
3074     if (CHARSET_DIMENSION (charset) == 1)                               \
3075       {                                                                 \
3076         if (! CHARSET_ISO_CHARS_96 (charset))                           \
3077           c = intermediate_char_94[reg];                                \
3078         else                                                            \
3079           c = intermediate_char_96[reg];                                \
3080         EMIT_ONE_ASCII_BYTE (c);                                        \
3081       }                                                                 \
3082     else                                                                \
3083       {                                                                 \
3084         EMIT_ONE_ASCII_BYTE ('$');                                      \
3085         if (! CHARSET_ISO_CHARS_96 (charset))                           \
3086           {                                                             \
3087             if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM   \
3088                 || reg != 0                                             \
3089                 || final_char < '@' || final_char > 'B')                \
3090               EMIT_ONE_ASCII_BYTE (intermediate_char_94[reg]);          \
3091           }                                                             \
3092         else                                                            \
3093           EMIT_ONE_ASCII_BYTE (intermediate_char_96[reg]);              \
3094       }                                                                 \
3095     EMIT_ONE_ASCII_BYTE (final_char);                                   \
3096                                                                         \
3097     CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset);        \
3098   } while (0)
3099
3100
3101 /* The following two macros produce codes (control character or escape
3102    sequence) for ISO2022 single-shift functions (single-shift-2 and
3103    single-shift-3).  */
3104
3105 #define ENCODE_SINGLE_SHIFT_2                                           \
3106   do {                                                                  \
3107     if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)         \
3108       EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N');                         \
3109     else                                                                \
3110       EMIT_ONE_BYTE (ISO_CODE_SS2);                                     \
3111     CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
3112   } while (0)
3113
3114
3115 #define ENCODE_SINGLE_SHIFT_3                                           \
3116   do {                                                                  \
3117     if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)         \
3118       EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O');                         \
3119     else                                                                \
3120       EMIT_ONE_BYTE (ISO_CODE_SS3);                                     \
3121     CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
3122   } while (0)
3123
3124
3125 /* The following four macros produce codes (control character or
3126    escape sequence) for ISO2022 locking-shift functions (shift-in,
3127    shift-out, locking-shift-2, and locking-shift-3).  */
3128
3129 #define ENCODE_SHIFT_IN                                 \
3130   do {                                                  \
3131     EMIT_ONE_ASCII_BYTE (ISO_CODE_SI);                  \
3132     CODING_ISO_INVOCATION (coding, 0) = 0;              \
3133   } while (0)
3134
3135
3136 #define ENCODE_SHIFT_OUT                                \
3137   do {                                                  \
3138     EMIT_ONE_ASCII_BYTE (ISO_CODE_SO);                  \
3139     CODING_ISO_INVOCATION (coding, 0) = 1;              \
3140   } while (0)
3141
3142
3143 #define ENCODE_LOCKING_SHIFT_2                          \
3144   do {                                                  \
3145     EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');           \
3146     CODING_ISO_INVOCATION (coding, 0) = 2;              \
3147   } while (0)
3148
3149
3150 #define ENCODE_LOCKING_SHIFT_3                          \
3151   do {                                                  \
3152     EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');           \
3153     CODING_ISO_INVOCATION (coding, 0) = 3;              \
3154   } while (0)
3155
3156
3157 /* Produce codes for a DIMENSION1 character whose character set is
3158    CHARSET and whose position-code is C1.  Designation and invocation
3159    sequences are also produced in advance if necessary.  */
3160
3161 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
3162   do {                                                                  \
3163     int id = CHARSET_ID (charset);                                      \
3164     if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
3165       {                                                                 \
3166         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
3167           EMIT_ONE_ASCII_BYTE (c1 & 0x7F);                              \
3168         else                                                            \
3169           EMIT_ONE_BYTE (c1 | 0x80);                                    \
3170         CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
3171         break;                                                          \
3172       }                                                                 \
3173     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
3174       {                                                                 \
3175         EMIT_ONE_ASCII_BYTE (c1 & 0x7F);                                \
3176         break;                                                          \
3177       }                                                                 \
3178     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
3179       {                                                                 \
3180         EMIT_ONE_BYTE (c1 | 0x80);                                      \
3181         break;                                                          \
3182       }                                                                 \
3183     else                                                                \
3184       /* Since CHARSET is not yet invoked to any graphic planes, we     \
3185          must invoke it, or, at first, designate it to some graphic     \
3186          register.  Then repeat the loop to actually produce the        \
3187          character.  */                                                 \
3188       dst = encode_invocation_designation (charset, coding, dst,        \
3189                                            &produced_chars);            \
3190   } while (1)
3191
3192
3193 /* Produce codes for a DIMENSION2 character whose character set is
3194    CHARSET and whose position-codes are C1 and C2.  Designation and
3195    invocation codes are also produced in advance if necessary.  */
3196
3197 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
3198   do {                                                                  \
3199     int id = CHARSET_ID (charset);                                      \
3200     if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
3201       {                                                                 \
3202         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
3203           EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);              \
3204         else                                                            \
3205           EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                    \
3206         CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
3207         break;                                                          \
3208       }                                                                 \
3209     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
3210       {                                                                 \
3211         EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);                \
3212         break;                                                          \
3213       }                                                                 \
3214     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
3215       {                                                                 \
3216         EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                      \
3217         break;                                                          \
3218       }                                                                 \
3219     else                                                                \
3220       /* Since CHARSET is not yet invoked to any graphic planes, we     \
3221          must invoke it, or, at first, designate it to some graphic     \
3222          register.  Then repeat the loop to actually produce the        \
3223          character.  */                                                 \
3224       dst = encode_invocation_designation (charset, coding, dst,        \
3225                                            &produced_chars);            \
3226   } while (1)
3227
3228
3229 #define ENCODE_ISO_CHARACTER(charset, c)                                   \
3230   do {                                                                     \
3231     int code = ENCODE_CHAR ((charset),(c));                                \
3232                                                                            \
3233     if (CHARSET_DIMENSION (charset) == 1)                                  \
3234       ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code);                   \
3235     else                                                                   \
3236       ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
3237   } while (0)
3238
3239
3240 /* Produce designation and invocation codes at a place pointed by DST
3241    to use CHARSET.  The element `spec.iso_2022' of *CODING is updated.
3242    Return new DST.  */
3243
3244 unsigned char *
3245 encode_invocation_designation (charset, coding, dst, p_nchars)
3246      struct charset *charset;
3247      struct coding_system *coding;
3248      unsigned char *dst;
3249      int *p_nchars;
3250 {
3251   int multibytep = coding->dst_multibyte;
3252   int produced_chars = *p_nchars;
3253   int reg;                      /* graphic register number */
3254   int id = CHARSET_ID (charset);
3255
3256   /* At first, check designations.  */
3257   for (reg = 0; reg < 4; reg++)
3258     if (id == CODING_ISO_DESIGNATION (coding, reg))
3259       break;
3260
3261   if (reg >= 4)
3262     {
3263       /* CHARSET is not yet designated to any graphic registers.  */
3264       /* At first check the requested designation.  */
3265       reg = CODING_ISO_REQUEST (coding, id);
3266       if (reg < 0)
3267         /* Since CHARSET requests no special designation, designate it
3268            to graphic register 0.  */
3269         reg = 0;
3270
3271       ENCODE_DESIGNATION (charset, reg, coding);
3272     }
3273
3274   if (CODING_ISO_INVOCATION (coding, 0) != reg
3275       && CODING_ISO_INVOCATION (coding, 1) != reg)
3276     {
3277       /* Since the graphic register REG is not invoked to any graphic
3278          planes, invoke it to graphic plane 0.  */
3279       switch (reg)
3280         {
3281         case 0:                 /* graphic register 0 */
3282           ENCODE_SHIFT_IN;
3283           break;
3284
3285         case 1:                 /* graphic register 1 */
3286           ENCODE_SHIFT_OUT;
3287           break;
3288
3289         case 2:                 /* graphic register 2 */
3290           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3291             ENCODE_SINGLE_SHIFT_2;
3292           else
3293             ENCODE_LOCKING_SHIFT_2;
3294           break;
3295
3296         case 3:                 /* graphic register 3 */
3297           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3298             ENCODE_SINGLE_SHIFT_3;
3299           else
3300             ENCODE_LOCKING_SHIFT_3;
3301           break;
3302         }
3303     }
3304
3305   *p_nchars = produced_chars;
3306   return dst;
3307 }
3308
3309 /* The following three macros produce codes for indicating direction
3310    of text.  */
3311 #define ENCODE_CONTROL_SEQUENCE_INTRODUCER                              \
3312   do {                                                                  \
3313     if (CODING_ISO_FLAGS (coding) == CODING_ISO_FLAG_SEVEN_BITS)        \
3314       EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '[');                         \
3315     else                                                                \
3316       EMIT_ONE_BYTE (ISO_CODE_CSI);                                     \
3317   } while (0)
3318
3319
3320 #define ENCODE_DIRECTION_R2L()                  \
3321   do {                                          \
3322     ENCODE_CONTROL_SEQUENCE_INTRODUCER (dst);   \
3323     EMIT_TWO_ASCII_BYTES ('2', ']');            \
3324   } while (0)
3325
3326
3327 #define ENCODE_DIRECTION_L2R()                  \
3328   do {                                          \
3329     ENCODE_CONTROL_SEQUENCE_INTRODUCER (dst);   \
3330     EMIT_TWO_ASCII_BYTES ('0', ']');            \
3331   } while (0)
3332
3333
3334 /* Produce codes for designation and invocation to reset the graphic
3335    planes and registers to initial state.  */
3336 #define ENCODE_RESET_PLANE_AND_REGISTER()                               \
3337   do {                                                                  \
3338     int reg;                                                            \
3339     struct charset *charset;                                            \
3340                                                                         \
3341     if (CODING_ISO_INVOCATION (coding, 0) != 0)                         \
3342       ENCODE_SHIFT_IN;                                                  \
3343     for (reg = 0; reg < 4; reg++)                                       \
3344       if (CODING_ISO_INITIAL (coding, reg) >= 0                         \
3345           && (CODING_ISO_DESIGNATION (coding, reg)                      \
3346               != CODING_ISO_INITIAL (coding, reg)))                     \
3347         {                                                               \
3348           charset = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg)); \
3349           ENCODE_DESIGNATION (charset, reg, coding);                    \
3350         }                                                               \
3351   } while (0)
3352
3353
3354 /* Produce designation sequences of charsets in the line started from
3355    SRC to a place pointed by DST, and return updated DST.
3356
3357    If the current block ends before any end-of-line, we may fail to
3358    find all the necessary designations.  */
3359
3360 static unsigned char *
3361 encode_designation_at_bol (coding, charbuf, charbuf_end, dst)
3362      struct coding_system *coding;
3363      int *charbuf, *charbuf_end;
3364      unsigned char *dst;
3365 {
3366   struct charset *charset;
3367   /* Table of charsets to be designated to each graphic register.  */
3368   int r[4];
3369   int c, found = 0, reg;
3370   int produced_chars = 0;
3371   int multibytep = coding->dst_multibyte;
3372   Lisp_Object attrs;
3373   Lisp_Object charset_list;
3374
3375   attrs = CODING_ID_ATTRS (coding->id);
3376   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
3377   if (EQ (charset_list, Qiso_2022))
3378     charset_list = Viso_2022_charset_list;
3379
3380   for (reg = 0; reg < 4; reg++)
3381     r[reg] = -1;
3382
3383   while (found < 4)
3384     {
3385       int id;
3386
3387       c = *charbuf++;
3388       if (c == '\n')
3389         break;
3390       charset = char_charset (c, charset_list, NULL);
3391       id = CHARSET_ID (charset);
3392       reg = CODING_ISO_REQUEST (coding, id);
3393       if (reg >= 0 && r[reg] < 0)
3394         {
3395           found++;
3396           r[reg] = id;
3397         }
3398     }
3399
3400   if (found)
3401     {
3402       for (reg = 0; reg < 4; reg++)
3403         if (r[reg] >= 0
3404             && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
3405           ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
3406     }
3407
3408   return dst;
3409 }
3410
3411 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".  */
3412
3413 static int
3414 encode_coding_iso_2022 (coding)
3415      struct coding_system *coding;
3416 {
3417   int multibytep = coding->dst_multibyte;
3418   int *charbuf = coding->charbuf;
3419   int *charbuf_end = charbuf + coding->charbuf_used;
3420   unsigned char *dst = coding->destination + coding->produced;
3421   unsigned char *dst_end = coding->destination + coding->dst_bytes;
3422   int safe_room = 16;
3423   int bol_designation
3424     = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
3425        && CODING_ISO_BOL (coding));
3426   int produced_chars = 0;
3427   Lisp_Object attrs, eol_type, charset_list;
3428   int ascii_compatible;
3429   int c;
3430
3431   CODING_GET_INFO (coding, attrs, eol_type, charset_list);
3432   setup_iso_safe_charsets (attrs);
3433   coding->safe_charsets
3434     = (char *) XSTRING (CODING_ATTR_SAFE_CHARSETS(attrs))->data;
3435
3436   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
3437
3438   while (charbuf < charbuf_end)
3439     {
3440       ASSURE_DESTINATION (safe_room);
3441
3442       if (bol_designation)
3443         {
3444           unsigned char *dst_prev = dst;
3445
3446           /* We have to produce designation sequences if any now.  */
3447           dst = encode_designation_at_bol (coding, charbuf, charbuf_end, dst);
3448           bol_designation = 0;
3449           /* We are sure that designation sequences are all ASCII bytes.  */
3450           produced_chars += dst - dst_prev;
3451         }
3452
3453       c = *charbuf++;
3454
3455       /* Now encode the character C.  */
3456       if (c < 0x20 || c == 0x7F)
3457         {
3458           if (c == '\n'
3459               || (c == '\r' && EQ (eol_type, Qmac)))
3460             {
3461               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
3462                 ENCODE_RESET_PLANE_AND_REGISTER ();
3463               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
3464                 {
3465                   int i;
3466
3467                   for (i = 0; i < 4; i++)
3468                     CODING_ISO_DESIGNATION (coding, i)
3469                       = CODING_ISO_INITIAL (coding, i);
3470                 }
3471               bol_designation
3472                 = CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL;
3473             }
3474           else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
3475             ENCODE_RESET_PLANE_AND_REGISTER ();
3476           EMIT_ONE_ASCII_BYTE (c);
3477         }
3478       else if (ASCII_CHAR_P (c))
3479         {
3480           if (ascii_compatible)
3481             EMIT_ONE_ASCII_BYTE (c);
3482           else
3483             ENCODE_ISO_CHARACTER (CHARSET_FROM_ID (charset_ascii), c);
3484         }
3485       else
3486         {
3487           struct charset *charset = char_charset (c, charset_list, NULL);
3488
3489           if (!charset)
3490             {
3491               if (coding->mode & CODING_MODE_SAFE_ENCODING)
3492                 {
3493                   c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
3494                   charset = CHARSET_FROM_ID (charset_ascii);
3495                 }
3496               else
3497                 {
3498                   c = coding->default_char;
3499                   charset = char_charset (c, charset_list, NULL);
3500                 }
3501             }
3502           ENCODE_ISO_CHARACTER (charset, c);
3503         }
3504     }
3505
3506   if (coding->mode & CODING_MODE_LAST_BLOCK
3507       && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
3508     {
3509       ASSURE_DESTINATION (safe_room);
3510       ENCODE_RESET_PLANE_AND_REGISTER ();
3511     }
3512   coding->result = CODING_RESULT_SUCCESS;
3513   CODING_ISO_BOL (coding) = bol_designation;
3514   coding->produced_char += produced_chars;
3515   coding->produced = dst - coding->destination;
3516   return 0;
3517 }
3518
3519 \f
3520 /*** 8,9. SJIS and BIG5 handlers ***/
3521
3522 /* Although SJIS and BIG5 are not ISO's coding system, they are used
3523    quite widely.  So, for the moment, Emacs supports them in the bare
3524    C code.  But, in the future, they may be supported only by CCL.  */
3525
3526 /* SJIS is a coding system encoding three character sets: ASCII, right
3527    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
3528    as is.  A character of charset katakana-jisx0201 is encoded by
3529    "position-code + 0x80".  A character of charset japanese-jisx0208
3530    is encoded in 2-byte but two position-codes are divided and shifted
3531    so that it fit in the range below.
3532
3533    --- CODE RANGE of SJIS ---
3534    (character set)      (range)
3535    ASCII                0x00 .. 0x7F
3536    KATAKANA-JISX0201    0xA0 .. 0xDF
3537    JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
3538             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
3539    -------------------------------
3540
3541 */
3542
3543 /* BIG5 is a coding system encoding two character sets: ASCII and
3544    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
3545    character set and is encoded in two-byte.
3546
3547    --- CODE RANGE of BIG5 ---
3548    (character set)      (range)
3549    ASCII                0x00 .. 0x7F
3550    Big5 (1st byte)      0xA1 .. 0xFE
3551         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
3552    --------------------------
3553
3554   */
3555
3556 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3557    Check if a text is encoded in SJIS.  If it is, return
3558    CATEGORY_MASK_SJIS, else return 0.  */
3559
3560 static int
3561 detect_coding_sjis (coding, mask)
3562      struct coding_system *coding;
3563      int *mask;
3564 {
3565   unsigned char *src = coding->source, *src_base = src;
3566   unsigned char *src_end = coding->source + coding->src_bytes;
3567   int multibytep = coding->src_multibyte;
3568   int consumed_chars = 0;
3569   int found = 0;
3570   int c;
3571
3572   /* A coding system of this category is always ASCII compatible.  */
3573   src += coding->head_ascii;
3574
3575   while (1)
3576     {
3577       ONE_MORE_BYTE (c);
3578       if (c < 0x80)
3579         continue;
3580       if ((c >= 0x81 && c <= 0x9F) || (c >= 0xE0 && c <= 0xEF))
3581         {
3582           ONE_MORE_BYTE (c);
3583           if (c < 0x40 || c == 0x7F || c > 0xFC)
3584             break;
3585           found = 1;
3586         }
3587       else if (c >= 0xA0 && c < 0xE0)
3588         found = 1;
3589       else
3590         break;
3591     }
3592   *mask &= ~CATEGORY_MASK_SJIS;
3593   return 0;
3594
3595  no_more_source:
3596   if (!found)
3597     return 0;
3598   *mask &= CATEGORY_MASK_SJIS;
3599   return 1;
3600 }
3601
3602 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3603    Check if a text is encoded in BIG5.  If it is, return
3604    CATEGORY_MASK_BIG5, else return 0.  */
3605
3606 static int
3607 detect_coding_big5 (coding, mask)
3608      struct coding_system *coding;
3609      int *mask;
3610 {
3611   unsigned char *src = coding->source, *src_base = src;
3612   unsigned char *src_end = coding->source + coding->src_bytes;
3613   int multibytep = coding->src_multibyte;
3614   int consumed_chars = 0;
3615   int found = 0;
3616   int c;
3617
3618   /* A coding system of this category is always ASCII compatible.  */
3619   src += coding->head_ascii;
3620
3621   while (1)
3622     {
3623       ONE_MORE_BYTE (c);
3624       if (c < 0x80)
3625         continue;
3626       if (c >= 0xA1)
3627         {
3628           ONE_MORE_BYTE (c);
3629           if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
3630             return 0;
3631           found = 1;
3632         }
3633       else
3634         break;
3635     }
3636   *mask &= ~CATEGORY_MASK_BIG5;
3637   return 0;
3638
3639  no_more_source:
3640   if (!found)
3641     return 0;
3642   *mask &= CATEGORY_MASK_BIG5;
3643   return 1;
3644 }
3645
3646 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
3647    If SJIS_P is 1, decode SJIS text, else decode BIG5 test.  */
3648
3649 static void
3650 decode_coding_sjis (coding)
3651      struct coding_system *coding;
3652 {
3653   unsigned char *src = coding->source + coding->consumed;
3654   unsigned char *src_end = coding->source + coding->src_bytes;
3655   unsigned char *src_base;
3656   int *charbuf = coding->charbuf;
3657   int *charbuf_end = charbuf + coding->charbuf_size;
3658   int consumed_chars = 0, consumed_chars_base;
3659   int multibytep = coding->src_multibyte;
3660   struct charset *charset_roman, *charset_kanji, *charset_kana;
3661   Lisp_Object attrs, eol_type, charset_list, val;
3662
3663   CODING_GET_INFO (coding, attrs, eol_type, charset_list);
3664
3665   val = charset_list;
3666   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
3667   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
3668   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val)));
3669
3670   while (1)
3671     {
3672       int c, c1;
3673
3674       src_base = src;
3675       consumed_chars_base = consumed_chars;
3676
3677       if (charbuf >= charbuf_end)
3678         break;
3679
3680       ONE_MORE_BYTE (c);
3681
3682       if (c == '\r')
3683         {
3684           if (EQ (eol_type, Qdos))
3685             {
3686               if (src == src_end)
3687                 goto no_more_source;
3688               if (*src == '\n')
3689                 ONE_MORE_BYTE (c);
3690             }
3691           else if (EQ (eol_type, Qmac))
3692             c = '\n';
3693         }
3694       else
3695         {
3696           struct charset *charset;
3697
3698           if (c < 0x80)
3699             charset = charset_roman;
3700           else
3701             {
3702               if (c >= 0xF0)
3703                 goto invalid_code;
3704               if (c < 0xA0 || c >= 0xE0)
3705                 {
3706                   /* SJIS -> JISX0208 */
3707                   ONE_MORE_BYTE (c1);
3708                   if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
3709                     goto invalid_code;
3710                   c = (c << 8) | c1;
3711                   SJIS_TO_JIS (c);
3712                   charset = charset_kanji;
3713                 }
3714               else
3715                 /* SJIS -> JISX0201-Kana */
3716                 charset = charset_kana;
3717             }
3718           CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
3719         }
3720       *charbuf++ = c;
3721       continue;
3722
3723     invalid_code:
3724       src = src_base;
3725       consumed_chars = consumed_chars_base;
3726       ONE_MORE_BYTE (c);
3727       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
3728       coding->errors++;
3729     }
3730
3731  no_more_source:
3732   coding->consumed_char += consumed_chars_base;
3733   coding->consumed = src_base - coding->source;
3734   coding->charbuf_used = charbuf - coding->charbuf;
3735 }
3736
3737 static void
3738 decode_coding_big5 (coding)
3739      struct coding_system *coding;
3740 {
3741   unsigned char *src = coding->source + coding->consumed;
3742   unsigned char *src_end = coding->source + coding->src_bytes;
3743   unsigned char *src_base;
3744   int *charbuf = coding->charbuf;
3745   int *charbuf_end = charbuf + coding->charbuf_size;
3746   int consumed_chars = 0, consumed_chars_base;
3747   int multibytep = coding->src_multibyte;
3748   struct charset *charset_roman, *charset_big5;
3749   Lisp_Object attrs, eol_type, charset_list, val;
3750
3751   CODING_GET_INFO (coding, attrs, eol_type, charset_list);
3752   val = charset_list;
3753   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
3754   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
3755
3756   while (1)
3757     {
3758       int c, c1;
3759
3760       src_base = src;
3761       consumed_chars_base = consumed_chars;
3762
3763       if (charbuf >= charbuf_end)
3764         break;
3765
3766       ONE_MORE_BYTE (c);
3767
3768       if (c == '\r')
3769         {
3770           if (EQ (eol_type, Qdos))
3771             {
3772               if (src == src_end)
3773                 goto no_more_source;
3774               if (*src == '\n')
3775                 ONE_MORE_BYTE (c);
3776             }
3777           else if (EQ (eol_type, Qmac))
3778             c = '\n';
3779         }
3780       else
3781         {
3782           struct charset *charset;
3783           if (c < 0x80)
3784             charset = charset_roman;
3785           else
3786             {
3787               /* BIG5 -> Big5 */
3788               if (c < 0xA1 || c > 0xFE)
3789                 goto invalid_code;
3790               ONE_MORE_BYTE (c1);
3791               if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
3792                 goto invalid_code;
3793               c = c << 8 | c1;
3794               charset = charset_big5;
3795             }
3796           CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
3797         }
3798
3799       *charbuf++ = c;
3800       continue;
3801
3802     invalid_code:
3803       src = src_base;
3804       consumed_chars = consumed_chars_base;
3805       ONE_MORE_BYTE (c);
3806       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
3807       coding->errors++;
3808     }
3809
3810  no_more_source:
3811   coding->consumed_char += consumed_chars_base;
3812   coding->consumed = src_base - coding->source;
3813   coding->charbuf_used = charbuf - coding->charbuf;
3814 }
3815
3816 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
3817    This function can encode charsets `ascii', `katakana-jisx0201',
3818    `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'.  We
3819    are sure that all these charsets are registered as official charset
3820    (i.e. do not have extended leading-codes).  Characters of other
3821    charsets are produced without any encoding.  If SJIS_P is 1, encode
3822    SJIS text, else encode BIG5 text.  */
3823
3824 static int
3825 encode_coding_sjis (coding)
3826      struct coding_system *coding;
3827 {
3828   int multibytep = coding->dst_multibyte;
3829   int *charbuf = coding->charbuf;
3830   int *charbuf_end = charbuf + coding->charbuf_used;
3831   unsigned char *dst = coding->destination + coding->produced;
3832   unsigned char *dst_end = coding->destination + coding->dst_bytes;
3833   int safe_room = 4;
3834   int produced_chars = 0;
3835   Lisp_Object attrs, eol_type, charset_list, val;
3836   int ascii_compatible;
3837   struct charset *charset_roman, *charset_kanji, *charset_kana;
3838   int c;
3839
3840   CODING_GET_INFO (coding, attrs, eol_type, charset_list);
3841   val = charset_list;
3842   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
3843   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
3844   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
3845
3846   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
3847
3848   while (charbuf < charbuf_end)
3849     {
3850       ASSURE_DESTINATION (safe_room);
3851       c = *charbuf++;
3852       /* Now encode the character C.  */
3853       if (ASCII_CHAR_P (c) && ascii_compatible)
3854         EMIT_ONE_ASCII_BYTE (c);
3855       else
3856         {
3857           unsigned code;
3858           struct charset *charset = char_charset (c, charset_list, &code);
3859
3860           if (!charset)
3861             {
3862               if (coding->mode & CODING_MODE_SAFE_ENCODING)
3863                 {
3864                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
3865                   charset = CHARSET_FROM_ID (charset_ascii);
3866                 }
3867               else
3868                 {
3869                   c = coding->default_char;
3870                   charset = char_charset (c, charset_list, &code);
3871                 }
3872             }
3873           if (code == CHARSET_INVALID_CODE (charset))
3874             abort ();
3875           if (charset == charset_kanji)
3876             {
3877               int c1, c2;
3878               JIS_TO_SJIS (code);
3879               c1 = code >> 8, c2 = code & 0xFF;
3880               EMIT_TWO_BYTES (c1, c2);
3881             }
3882           else if (charset == charset_kana)
3883             EMIT_ONE_BYTE (code | 0x80);
3884           else
3885             EMIT_ONE_ASCII_BYTE (code & 0x7F);
3886         }
3887     }
3888   coding->result = CODING_RESULT_SUCCESS;
3889   coding->produced_char += produced_chars;
3890   coding->produced = dst - coding->destination;
3891   return 0;
3892 }
3893
3894 static int
3895 encode_coding_big5 (coding)
3896      struct coding_system *coding;
3897 {
3898   int multibytep = coding->dst_multibyte;
3899   int *charbuf = coding->charbuf;
3900   int *charbuf_end = charbuf + coding->charbuf_used;
3901   unsigned char *dst = coding->destination + coding->produced;
3902   unsigned char *dst_end = coding->destination + coding->dst_bytes;
3903   int safe_room = 4;
3904   int produced_chars = 0;
3905   Lisp_Object attrs, eol_type, charset_list, val;
3906   int ascii_compatible;
3907   struct charset *charset_roman, *charset_big5;
3908   int c;
3909
3910   CODING_GET_INFO (coding, attrs, eol_type, charset_list);
3911   val = charset_list;
3912   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
3913   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
3914   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
3915
3916   while (charbuf < charbuf_end)
3917     {
3918       ASSURE_DESTINATION (safe_room);
3919       c = *charbuf++;
3920       /* Now encode the character C.  */
3921       if (ASCII_CHAR_P (c) && ascii_compatible)
3922         EMIT_ONE_ASCII_BYTE (c);
3923       else
3924         {
3925           unsigned code;
3926           struct charset *charset = char_charset (c, charset_list, &code);
3927
3928           if (! charset)
3929             {
3930               if (coding->mode & CODING_MODE_SAFE_ENCODING)
3931                 {
3932                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
3933                   charset = CHARSET_FROM_ID (charset_ascii);
3934                 }
3935               else
3936                 {
3937                   c = coding->default_char;
3938                   charset = char_charset (c, charset_list, &code);
3939                 }
3940             }
3941           if (code == CHARSET_INVALID_CODE (charset))
3942             abort ();
3943           if (charset == charset_big5)
3944             {
3945               int c1, c2;
3946
3947               c1 = code >> 8, c2 = code & 0xFF;
3948               EMIT_TWO_BYTES (c1, c2);
3949             }
3950           else
3951             EMIT_ONE_ASCII_BYTE (code & 0x7F);
3952         }
3953     }
3954   coding->result = CODING_RESULT_SUCCESS;
3955   coding->produced_char += produced_chars;
3956   coding->produced = dst - coding->destination;
3957   return 0;
3958 }
3959
3960 \f
3961 /*** 10. CCL handlers ***/
3962
3963 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3964    Check if a text is encoded in a coding system of which
3965    encoder/decoder are written in CCL program.  If it is, return
3966    CATEGORY_MASK_CCL, else return 0.  */
3967
3968 static int
3969 detect_coding_ccl (coding, mask)
3970      struct coding_system *coding;
3971      int *mask;
3972 {
3973   unsigned char *src = coding->source, *src_base = src;
3974   unsigned char *src_end = coding->source + coding->src_bytes;
3975   int multibytep = coding->src_multibyte;
3976   int consumed_chars = 0;
3977   int found = 0;
3978   unsigned char *valids = CODING_CCL_VALIDS (coding);
3979   int head_ascii = coding->head_ascii;
3980   Lisp_Object attrs;
3981
3982   coding = &coding_categories[coding_category_ccl];
3983   attrs = CODING_ID_ATTRS (coding->id);
3984   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
3985     src += head_ascii;
3986
3987   while (1)
3988     {
3989       int c;
3990       ONE_MORE_BYTE (c);
3991       if (! valids[c])
3992         break;
3993       if (!found && valids[c] > 1)
3994         found = 1;
3995     }
3996   *mask &= ~CATEGORY_MASK_CCL;
3997   return 0;
3998
3999  no_more_source:
4000   if (!found)
4001     return 0;
4002   *mask &= CATEGORY_MASK_CCL;
4003   return 1;
4004 }
4005
4006 static void
4007 decode_coding_ccl (coding)
4008      struct coding_system *coding;
4009 {
4010   unsigned char *src = coding->source + coding->consumed;
4011   unsigned char *src_end = coding->source + coding->src_bytes;
4012   int *charbuf = coding->charbuf;
4013   int *charbuf_end = charbuf + coding->charbuf_size;
4014   int consumed_chars = 0;
4015   int multibytep = coding->src_multibyte;
4016   struct ccl_program ccl;
4017   int source_charbuf[1024];
4018   int source_byteidx[1024];
4019
4020   setup_ccl_program (&ccl, CODING_CCL_DECODER (coding));
4021
4022   while (src < src_end)
4023     {
4024       unsigned char *p = src;
4025       int *source, *source_end;
4026       int i = 0;
4027
4028       if (multibytep)
4029         while (i < 1024 && p < src_end)
4030           {
4031             source_byteidx[i] = p - src;
4032             source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
4033           }
4034       else
4035         while (i < 1024 && p < src_end)
4036           source_charbuf[i++] = *p++;
4037
4038       if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
4039         ccl.last_block = 1;
4040
4041       source = source_charbuf;
4042       source_end = source + i;
4043       while (source < source_end)
4044         {
4045           ccl_driver (&ccl, source, charbuf,
4046                       source_end - source, charbuf_end - charbuf);
4047           source += ccl.consumed;
4048           charbuf += ccl.produced;
4049           if (ccl.status != CCL_STAT_SUSPEND_BY_DST)
4050             break;
4051         }
4052       if (source < source_end)
4053         src += source_byteidx[source - source_charbuf];
4054       else
4055         src = p;
4056       consumed_chars += source - source_charbuf;
4057
4058       if (ccl.status != CCL_STAT_SUSPEND_BY_SRC
4059           && ccl.status != CODING_RESULT_INSUFFICIENT_SRC)
4060         break;
4061     }
4062
4063   switch (ccl.status)
4064     {
4065     case CCL_STAT_SUSPEND_BY_SRC:
4066       coding->result = CODING_RESULT_INSUFFICIENT_SRC;
4067       break;
4068     case CCL_STAT_SUSPEND_BY_DST:
4069       break;
4070     case CCL_STAT_QUIT:
4071     case CCL_STAT_INVALID_CMD:
4072       coding->result = CODING_RESULT_INTERRUPT;
4073       break;
4074     default:
4075       coding->result = CODING_RESULT_SUCCESS;
4076       break;
4077     }
4078   coding->consumed_char += consumed_chars;
4079   coding->consumed = src - coding->source;
4080   coding->charbuf_used = charbuf - coding->charbuf;
4081 }
4082
4083 static int
4084 encode_coding_ccl (coding)
4085      struct coding_system *coding;
4086 {
4087   struct ccl_program ccl;
4088   int multibytep = coding->dst_multibyte;
4089   int *charbuf = coding->charbuf;
4090   int *charbuf_end = charbuf + coding->charbuf_used;
4091   unsigned char *dst = coding->destination + coding->produced;
4092   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4093   unsigned char *adjusted_dst_end = dst_end - 1;
4094   int destination_charbuf[1024];
4095   int i, produced_chars = 0;
4096
4097   setup_ccl_program (&ccl, CODING_CCL_ENCODER (coding));
4098
4099   ccl.last_block = coding->mode & CODING_MODE_LAST_BLOCK;
4100   ccl.dst_multibyte = coding->dst_multibyte;
4101
4102   while (charbuf < charbuf_end && dst < adjusted_dst_end)
4103     {
4104       int dst_bytes = dst_end - dst;
4105       if (dst_bytes > 1024)
4106         dst_bytes = 1024;
4107
4108       ccl_driver (&ccl, charbuf, destination_charbuf,
4109                   charbuf_end - charbuf, dst_bytes);
4110       charbuf += ccl.consumed;
4111       if (multibytep)
4112         for (i = 0; i < ccl.produced; i++)
4113           EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
4114       else
4115         {
4116           for (i = 0; i < ccl.produced; i++)
4117             *dst++ = destination_charbuf[i] & 0xFF;
4118           produced_chars += ccl.produced;
4119         }
4120     }
4121
4122   switch (ccl.status)
4123     {
4124     case CCL_STAT_SUSPEND_BY_SRC:
4125       coding->result = CODING_RESULT_INSUFFICIENT_SRC;
4126       break;
4127     case CCL_STAT_SUSPEND_BY_DST:
4128       coding->result = CODING_RESULT_INSUFFICIENT_DST;
4129       break;
4130     case CCL_STAT_QUIT:
4131     case CCL_STAT_INVALID_CMD:
4132       coding->result = CODING_RESULT_INTERRUPT;
4133       break;
4134     default:
4135       coding->result = CODING_RESULT_SUCCESS;
4136       break;
4137     }
4138
4139   coding->produced_char += produced_chars;
4140   coding->produced = dst - coding->destination;
4141   return 0;
4142 }
4143
4144
4145 \f
4146 /*** 10, 11. no-conversion handlers ***/
4147
4148 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
4149
4150 static void
4151 decode_coding_raw_text (coding)
4152      struct coding_system *coding;
4153 {
4154   coding->chars_at_source = 1;
4155   coding->consumed_char = 0;
4156   coding->consumed = 0;
4157   coding->result = CODING_RESULT_SUCCESS;
4158 }
4159
4160 static int
4161 encode_coding_raw_text (coding)
4162      struct coding_system *coding;
4163 {
4164   int multibytep = coding->dst_multibyte;
4165   int *charbuf = coding->charbuf;
4166   int *charbuf_end = coding->charbuf + coding->charbuf_used;
4167   unsigned char *dst = coding->destination + coding->produced;
4168   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4169   int produced_chars = 0;
4170   int c;
4171
4172   if (multibytep)
4173     {
4174       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
4175
4176       if (coding->src_multibyte)
4177         while (charbuf < charbuf_end)
4178           {
4179             ASSURE_DESTINATION (safe_room);
4180             c = *charbuf++;
4181             if (ASCII_CHAR_P (c))
4182               EMIT_ONE_ASCII_BYTE (c);
4183             else if (CHAR_BYTE8_P (c))
4184               {
4185                 c = CHAR_TO_BYTE8 (c);
4186                 EMIT_ONE_BYTE (c);
4187               }
4188             else
4189               {
4190                 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
4191
4192                 CHAR_STRING_ADVANCE (c, p1);
4193                 while (p0 < p1)
4194                   EMIT_ONE_BYTE (*p0);
4195               }
4196           }
4197       else
4198         while (charbuf < charbuf_end)
4199           {
4200             ASSURE_DESTINATION (safe_room);
4201             c = *charbuf++;
4202             EMIT_ONE_BYTE (c);
4203           }
4204     }
4205   else
4206     {
4207       if (coding->src_multibyte)
4208         {
4209           int safe_room = MAX_MULTIBYTE_LENGTH;
4210
4211           while (charbuf < charbuf_end)
4212             {
4213               ASSURE_DESTINATION (safe_room);
4214               c = *charbuf++;
4215               if (ASCII_CHAR_P (c))
4216                 *dst++ = c;
4217               else if (CHAR_BYTE8_P (c))
4218                 *dst++ = CHAR_TO_BYTE8 (c);
4219               else
4220                 CHAR_STRING_ADVANCE (c, dst);
4221               produced_chars++;
4222             }
4223         }
4224       else
4225         {
4226           ASSURE_DESTINATION (charbuf_end - charbuf);
4227           while (charbuf < charbuf_end && dst < dst_end)
4228             *dst++ = *charbuf++;
4229           produced_chars = dst - (coding->destination + coding->dst_bytes);
4230         }
4231     }
4232   coding->result = CODING_RESULT_SUCCESS;
4233   coding->produced_char += produced_chars;
4234   coding->produced = dst - coding->destination;
4235   return 0;
4236 }
4237
4238 static int
4239 detect_coding_charset (coding, mask)
4240      struct coding_system *coding;
4241      int *mask;
4242 {
4243   unsigned char *src = coding->source, *src_base = src;
4244   unsigned char *src_end = coding->source + coding->src_bytes;
4245   int multibytep = coding->src_multibyte;
4246   int consumed_chars = 0;
4247   Lisp_Object attrs, valids;
4248
4249   coding = &coding_categories[coding_category_charset];
4250   attrs = CODING_ID_ATTRS (coding->id);
4251   valids = AREF (attrs, coding_attr_charset_valids);
4252
4253   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
4254     src += coding->head_ascii;
4255
4256   while (1)
4257     {
4258       int c;
4259
4260       ONE_MORE_BYTE (c);
4261       if (NILP (AREF (valids, c)))
4262         break;
4263     }
4264   *mask &= ~CATEGORY_MASK_CHARSET;
4265   return 0;
4266
4267  no_more_source:
4268   *mask &= CATEGORY_MASK_CHARSET;
4269   return 1;
4270 }
4271
4272 static void
4273 decode_coding_charset (coding)
4274      struct coding_system *coding;
4275 {
4276   unsigned char *src = coding->source + coding->consumed;
4277   unsigned char *src_end = coding->source + coding->src_bytes;
4278   unsigned char *src_base;
4279   int *charbuf = coding->charbuf;
4280   int *charbuf_end = charbuf + coding->charbuf_size;
4281   int consumed_chars = 0, consumed_chars_base;
4282   int multibytep = coding->src_multibyte;
4283   Lisp_Object attrs, eol_type, charset_list, valids;
4284
4285   CODING_GET_INFO (coding, attrs, eol_type, charset_list);
4286   valids = AREF (attrs, coding_attr_charset_valids);
4287
4288   while (1)
4289     {
4290       int c;
4291
4292       src_base = src;
4293       consumed_chars_base = consumed_chars;
4294
4295       if (charbuf >= charbuf_end)
4296         break;
4297
4298       ONE_MORE_BYTE (c);
4299       if (c == '\r')
4300         {
4301           if (EQ (eol_type, Qdos))
4302             {
4303               if (src < src_end
4304                   && *src == '\n')
4305                 ONE_MORE_BYTE (c);
4306             }
4307           else if (EQ (eol_type, Qmac))
4308             c = '\n';
4309         }
4310       else
4311         {
4312           Lisp_Object val;
4313           struct charset *charset;
4314           int c1;
4315
4316           val = AREF (valids, c);
4317           if (NILP (val))
4318             goto invalid_code;
4319           charset = CHARSET_FROM_ID (XFASTINT (val));
4320           if (CHARSET_DIMENSION (charset) > 1)
4321             {
4322               ONE_MORE_BYTE (c1);
4323               c = (c << 8) | c1;
4324               if (CHARSET_DIMENSION (charset) > 2)
4325                 {
4326                   ONE_MORE_BYTE (c1);
4327                   c = (c << 8) | c1;
4328                   if (CHARSET_DIMENSION (charset) > 3)
4329                     {
4330                       ONE_MORE_BYTE (c1);
4331                       c = (c << 8) | c1;
4332                     }
4333                 }
4334             }
4335           CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4336           if (c < 0)
4337             goto invalid_code;
4338         }
4339       *charbuf++ = c;
4340       continue;
4341
4342     invalid_code:
4343       src = src_base;
4344       consumed_chars = consumed_chars_base;
4345       ONE_MORE_BYTE (c);
4346       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
4347       coding->errors++;
4348     }
4349
4350  no_more_source:
4351   coding->consumed_char += consumed_chars_base;
4352   coding->consumed = src_base - coding->source;
4353   coding->charbuf_used = charbuf - coding->charbuf;
4354 }
4355
4356 static int
4357 encode_coding_charset (coding)
4358      struct coding_system *coding;
4359 {
4360   int multibytep = coding->dst_multibyte;
4361   int *charbuf = coding->charbuf;
4362   int *charbuf_end = charbuf + coding->charbuf_used;
4363   unsigned char *dst = coding->destination + coding->produced;
4364   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4365   int safe_room = MAX_MULTIBYTE_LENGTH;
4366   int produced_chars = 0;
4367   Lisp_Object attrs, eol_type, charset_list;
4368   int ascii_compatible;
4369   int c;
4370
4371   CODING_GET_INFO (coding, attrs, eol_type, charset_list);
4372   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4373
4374   while (charbuf < charbuf_end)
4375     {
4376       struct charset *charset;
4377       unsigned code;
4378
4379       ASSURE_DESTINATION (safe_room);
4380       c = *charbuf++;
4381       if (ascii_compatible && ASCII_CHAR_P (c))
4382         EMIT_ONE_ASCII_BYTE (c);
4383       else
4384         {
4385           charset = char_charset (c, charset_list, &code);
4386           if (charset)
4387             {
4388               if (CHARSET_DIMENSION (charset) == 1)
4389                 EMIT_ONE_BYTE (code);
4390               else if (CHARSET_DIMENSION (charset) == 2)
4391                 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
4392               else if (CHARSET_DIMENSION (charset) == 3)
4393                 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
4394               else
4395                 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
4396                                  (code >> 8) & 0xFF, code & 0xFF);
4397             }
4398           else
4399             {
4400               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4401                 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4402               else
4403                 c = coding->default_char;
4404               EMIT_ONE_BYTE (c);
4405             }
4406         }
4407     }
4408
4409   coding->result = CODING_RESULT_SUCCESS;
4410   coding->produced_char += produced_chars;
4411   coding->produced = dst - coding->destination;
4412   return 0;
4413 }
4414
4415 \f
4416 /*** 7. C library functions ***/
4417
4418 /* In Emacs Lisp, coding system is represented by a Lisp symbol which
4419    has a property `coding-system'.  The value of this property is a
4420    vector of length 5 (called as coding-vector).  Among elements of
4421    this vector, the first (element[0]) and the fifth (element[4])
4422    carry important information for decoding/encoding.  Before
4423    decoding/encoding, this information should be set in fields of a
4424    structure of type `coding_system'.
4425
4426    A value of property `coding-system' can be a symbol of another
4427    subsidiary coding-system.  In that case, Emacs gets coding-vector
4428    from that symbol.
4429
4430    `element[0]' contains information to be set in `coding->type'.  The
4431    value and its meaning is as follows:
4432
4433    0 -- coding_type_emacs_mule
4434    1 -- coding_type_sjis
4435    2 -- coding_type_iso_2022
4436    3 -- coding_type_big5
4437    4 -- coding_type_ccl encoder/decoder written in CCL
4438    nil -- coding_type_no_conversion
4439    t -- coding_type_undecided (automatic conversion on decoding,
4440                                no-conversion on encoding)
4441
4442    `element[4]' contains information to be set in `coding->flags' and
4443    `coding->spec'.  The meaning varies by `coding->type'.
4444
4445    If `coding->type' is `coding_type_iso_2022', element[4] is a vector
4446    of length 32 (of which the first 13 sub-elements are used now).
4447    Meanings of these sub-elements are:
4448
4449    sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso_2022'
4450         If the value is an integer of valid charset, the charset is
4451         assumed to be designated to graphic register N initially.
4452
4453         If the value is minus, it is a minus value of charset which
4454         reserves graphic register N, which means that the charset is
4455         not designated initially but should be designated to graphic
4456         register N just before encoding a character in that charset.
4457
4458         If the value is nil, graphic register N is never used on
4459         encoding.
4460
4461    sub-element[N] where N is 4 through 11: to be set in `coding->flags'
4462         Each value takes t or nil.  See the section ISO2022 of
4463         `coding.h' for more information.
4464
4465    If `coding->type' is `coding_type_big5', element[4] is t to denote
4466    BIG5-ETen or nil to denote BIG5-HKU.
4467
4468    If `coding->type' takes the other value, element[4] is ignored.
4469
4470    Emacs Lisp's coding system also carries information about format of
4471    end-of-line in a value of property `eol-type'.  If the value is
4472    integer, 0 means eol_lf, 1 means eol_crlf, and 2 means eol_cr.  If
4473    it is not integer, it should be a vector of subsidiary coding
4474    systems of which property `eol-type' has one of above values.
4475
4476 */
4477
4478 /* Setup coding context CODING from information about CODING_SYSTEM.
4479    If CODING_SYSTEM is nil, `no-conversion' is assumed.  If
4480    CODING_SYSTEM is invalid, signal an error.  */
4481
4482 void
4483 setup_coding_system (coding_system, coding)
4484      Lisp_Object coding_system;
4485      struct coding_system *coding;
4486 {
4487   Lisp_Object attrs;
4488   Lisp_Object eol_type;
4489   Lisp_Object coding_type;
4490   Lisp_Object val;
4491
4492   if (NILP (coding_system))
4493     coding_system = Qno_conversion;
4494
4495   CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
4496
4497   attrs = CODING_ID_ATTRS (coding->id);
4498   eol_type = CODING_ID_EOL_TYPE (coding->id);
4499
4500   coding->mode = 0;
4501   coding->head_ascii = -1;
4502   coding->common_flags
4503     = (VECTORP (eol_type) ? CODING_REQUIRE_DETECTION_MASK : 0);
4504
4505   val = CODING_ATTR_SAFE_CHARSETS (attrs);
4506   coding->max_charset_id = XSTRING (val)->size - 1;
4507   coding->safe_charsets = (char *) XSTRING (val)->data;
4508   coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
4509
4510   coding_type = CODING_ATTR_TYPE (attrs);
4511   if (EQ (coding_type, Qundecided))
4512     {
4513       coding->detector = NULL;
4514       coding->decoder = decode_coding_raw_text;
4515       coding->encoder = encode_coding_raw_text;
4516       coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
4517     }
4518   else if (EQ (coding_type, Qiso_2022))
4519     {
4520       int i;
4521       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
4522
4523       /* Invoke graphic register 0 to plane 0.  */
4524       CODING_ISO_INVOCATION (coding, 0) = 0;
4525       /* Invoke graphic register 1 to plane 1 if we can use 8-bit.  */
4526       CODING_ISO_INVOCATION (coding, 1)
4527         = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
4528       /* Setup the initial status of designation.  */
4529       for (i = 0; i < 4; i++)
4530         CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
4531       /* Not single shifting initially.  */
4532       CODING_ISO_SINGLE_SHIFTING (coding) = 0;
4533       /* Beginning of buffer should also be regarded as bol. */
4534       CODING_ISO_BOL (coding) = 1;
4535       coding->detector = detect_coding_iso_2022;
4536       coding->decoder = decode_coding_iso_2022;
4537       coding->encoder = encode_coding_iso_2022;
4538       if (flags & CODING_ISO_FLAG_SAFE)
4539         coding->mode |= CODING_MODE_SAFE_ENCODING;
4540       coding->common_flags
4541         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
4542             | CODING_REQUIRE_FLUSHING_MASK);
4543       if (flags & CODING_ISO_FLAG_COMPOSITION)
4544         coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
4545       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
4546         {
4547           setup_iso_safe_charsets (attrs);
4548           val = CODING_ATTR_SAFE_CHARSETS (attrs);
4549           coding->max_charset_id = XSTRING (val)->size - 1;
4550           coding->safe_charsets = (char *) XSTRING (val)->data;
4551         }
4552       CODING_ISO_FLAGS (coding) = flags;
4553     }
4554   else if (EQ (coding_type, Qcharset))
4555     {
4556       coding->detector = detect_coding_charset;
4557       coding->decoder = decode_coding_charset;
4558       coding->encoder = encode_coding_charset;
4559       coding->common_flags
4560         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4561     }
4562   else if (EQ (coding_type, Qutf_8))
4563     {
4564       coding->detector = detect_coding_utf_8;
4565       coding->decoder = decode_coding_utf_8;
4566       coding->encoder = encode_coding_utf_8;
4567       coding->common_flags
4568         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4569     }
4570   else if (EQ (coding_type, Qutf_16))
4571     {
4572       val = AREF (attrs, coding_attr_utf_16_bom);
4573       CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_16_detect_bom
4574                                     : EQ (val, Qt) ? utf_16_with_bom
4575                                     : utf_16_without_bom);
4576       val = AREF (attrs, coding_attr_utf_16_endian);
4577       CODING_UTF_16_ENDIAN (coding) = (NILP (val) ? utf_16_big_endian
4578                                        : utf_16_little_endian);
4579       CODING_UTF_16_SURROGATE (coding) = 0;
4580       coding->detector = detect_coding_utf_16;
4581       coding->decoder = decode_coding_utf_16;
4582       coding->encoder = encode_coding_utf_16;
4583       coding->common_flags
4584         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4585     }
4586   else if (EQ (coding_type, Qccl))
4587     {
4588       coding->detector = detect_coding_ccl;
4589       coding->decoder = decode_coding_ccl;
4590       coding->encoder = encode_coding_ccl;
4591       coding->common_flags
4592         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
4593             | CODING_REQUIRE_FLUSHING_MASK);
4594     }
4595   else if (EQ (coding_type, Qemacs_mule))
4596     {
4597       coding->detector = detect_coding_emacs_mule;
4598       coding->decoder = decode_coding_emacs_mule;
4599       coding->encoder = encode_coding_emacs_mule;
4600       coding->common_flags
4601         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4602       if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
4603           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
4604         {
4605           Lisp_Object tail, safe_charsets;
4606           int max_charset_id = 0;
4607
4608           for (tail = Vemacs_mule_charset_list; CONSP (tail);
4609                tail = XCDR (tail))
4610             if (max_charset_id < XFASTINT (XCAR (tail)))
4611               max_charset_id = XFASTINT (XCAR (tail));
4612           safe_charsets = Fmake_string (make_number (max_charset_id + 1),
4613                                         make_number (255));
4614           for (tail = Vemacs_mule_charset_list; CONSP (tail);
4615                tail = XCDR (tail))
4616             XSTRING (safe_charsets)->data[XFASTINT (XCAR (tail))] = 0;
4617           coding->max_charset_id = max_charset_id;
4618           coding->safe_charsets = (char *) XSTRING (safe_charsets)->data;
4619         }
4620     }
4621   else if (EQ (coding_type, Qshift_jis))
4622     {
4623       coding->detector = detect_coding_sjis;
4624       coding->decoder = decode_coding_sjis;
4625       coding->encoder = encode_coding_sjis;
4626       coding->common_flags
4627         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4628     }
4629   else if (EQ (coding_type, Qbig5))
4630     {
4631       coding->detector = detect_coding_big5;
4632       coding->decoder = decode_coding_big5;
4633       coding->encoder = encode_coding_big5;
4634       coding->common_flags
4635         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4636     }
4637   else                          /* EQ (coding_type, Qraw_text) */
4638     {
4639       coding->detector = NULL;
4640       coding->decoder = decode_coding_raw_text;
4641       coding->encoder = encode_coding_raw_text;
4642       coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
4643     }
4644
4645   return;
4646 }
4647
4648 /* Return raw-text or one of its subsidiaries that has the same
4649    eol_type as CODING-SYSTEM.  */
4650
4651 Lisp_Object
4652 raw_text_coding_system (coding_system)
4653      Lisp_Object coding_system;
4654 {
4655   Lisp_Object spec, attrs;
4656   Lisp_Object eol_type, raw_text_eol_type;
4657
4658   spec = CODING_SYSTEM_SPEC (coding_system);
4659   attrs = AREF (spec, 0);
4660
4661   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
4662     return coding_system;
4663
4664   eol_type = AREF (spec, 2);
4665   if (VECTORP (eol_type))
4666     return Qraw_text;
4667   spec = CODING_SYSTEM_SPEC (Qraw_text);
4668   raw_text_eol_type = AREF (spec, 2);
4669   return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
4670           : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
4671           : AREF (raw_text_eol_type, 2));
4672 }
4673
4674
4675 /* If CODING_SYSTEM doesn't specify end-of-line format but PARENT
4676    does, return one of the subsidiary that has the same eol-spec as
4677    PARENT.  Otherwise, return CODING_SYSTEM.  */
4678
4679 Lisp_Object
4680 coding_inherit_eol_type (coding_system, parent)
4681      Lisp_Object coding_system, parent;
4682 {
4683   Lisp_Object spec, attrs, eol_type;
4684
4685   spec = CODING_SYSTEM_SPEC (coding_system);
4686   attrs = AREF (spec, 0);
4687   eol_type = AREF (spec, 2);
4688   if (VECTORP (eol_type))
4689     {
4690       Lisp_Object parent_spec;
4691       Lisp_Object parent_eol_type;
4692
4693       parent_spec
4694         = CODING_SYSTEM_SPEC (buffer_defaults.buffer_file_coding_system);
4695       parent_eol_type = AREF (parent_spec, 2);
4696       if (EQ (parent_eol_type, Qunix))
4697         coding_system = AREF (eol_type, 0);
4698       else if (EQ (parent_eol_type, Qdos))
4699         coding_system = AREF (eol_type, 1);
4700       else if (EQ (parent_eol_type, Qmac))
4701         coding_system = AREF (eol_type, 2);
4702     }
4703   return coding_system;
4704 }
4705
4706 /* Emacs has a mechanism to automatically detect a coding system if it
4707    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
4708    it's impossible to distinguish some coding systems accurately
4709    because they use the same range of codes.  So, at first, coding
4710    systems are categorized into 7, those are:
4711
4712    o coding-category-emacs-mule
4713
4714         The category for a coding system which has the same code range
4715         as Emacs' internal format.  Assigned the coding-system (Lisp
4716         symbol) `emacs-mule' by default.
4717
4718    o coding-category-sjis
4719
4720         The category for a coding system which has the same code range
4721         as SJIS.  Assigned the coding-system (Lisp
4722         symbol) `japanese-shift-jis' by default.
4723
4724    o coding-category-iso-7
4725
4726         The category for a coding system which has the same code range
4727         as ISO2022 of 7-bit environment.  This doesn't use any locking
4728         shift and single shift functions.  This can encode/decode all
4729         charsets.  Assigned the coding-system (Lisp symbol)
4730         `iso-2022-7bit' by default.
4731
4732    o coding-category-iso-7-tight
4733
4734         Same as coding-category-iso-7 except that this can
4735         encode/decode only the specified charsets.
4736
4737    o coding-category-iso-8-1
4738
4739         The category for a coding system which has the same code range
4740         as ISO2022 of 8-bit environment and graphic plane 1 used only
4741         for DIMENSION1 charset.  This doesn't use any locking shift
4742         and single shift functions.  Assigned the coding-system (Lisp
4743         symbol) `iso-latin-1' by default.
4744
4745    o coding-category-iso-8-2
4746
4747         The category for a coding system which has the same code range
4748         as ISO2022 of 8-bit environment and graphic plane 1 used only
4749         for DIMENSION2 charset.  This doesn't use any locking shift
4750         and single shift functions.  Assigned the coding-system (Lisp
4751         symbol) `japanese-iso-8bit' by default.
4752
4753    o coding-category-iso-7-else
4754
4755         The category for a coding system which has the same code range
4756         as ISO2022 of 7-bit environemnt but uses locking shift or
4757         single shift functions.  Assigned the coding-system (Lisp
4758         symbol) `iso-2022-7bit-lock' by default.
4759
4760    o coding-category-iso-8-else
4761
4762         The category for a coding system which has the same code range
4763         as ISO2022 of 8-bit environemnt but uses locking shift or
4764         single shift functions.  Assigned the coding-system (Lisp
4765         symbol) `iso-2022-8bit-ss2' by default.
4766
4767    o coding-category-big5
4768
4769         The category for a coding system which has the same code range
4770         as BIG5.  Assigned the coding-system (Lisp symbol)
4771         `cn-big5' by default.
4772
4773    o coding-category-utf-8
4774
4775         The category for a coding system which has the same code range
4776         as UTF-8 (cf. RFC2279).  Assigned the coding-system (Lisp
4777         symbol) `utf-8' by default.
4778
4779    o coding-category-utf-16-be
4780
4781         The category for a coding system in which a text has an
4782         Unicode signature (cf. Unicode Standard) in the order of BIG
4783         endian at the head.  Assigned the coding-system (Lisp symbol)
4784         `utf-16-be' by default.
4785
4786    o coding-category-utf-16-le
4787
4788         The category for a coding system in which a text has an
4789         Unicode signature (cf. Unicode Standard) in the order of
4790         LITTLE endian at the head.  Assigned the coding-system (Lisp
4791         symbol) `utf-16-le' by default.
4792
4793    o coding-category-ccl
4794
4795         The category for a coding system of which encoder/decoder is
4796         written in CCL programs.  The default value is nil, i.e., no
4797         coding system is assigned.
4798
4799    o coding-category-binary
4800
4801         The category for a coding system not categorized in any of the
4802         above.  Assigned the coding-system (Lisp symbol)
4803         `no-conversion' by default.
4804
4805    Each of them is a Lisp symbol and the value is an actual
4806    `coding-system's (this is also a Lisp symbol) assigned by a user.
4807    What Emacs does actually is to detect a category of coding system.
4808    Then, it uses a `coding-system' assigned to it.  If Emacs can't
4809    decide only one possible category, it selects a category of the
4810    highest priority.  Priorities of categories are also specified by a
4811    user in a Lisp variable `coding-category-list'.
4812
4813 */
4814
4815 #define EOL_SEEN_NONE   0
4816 #define EOL_SEEN_LF     1
4817 #define EOL_SEEN_CR     2
4818 #define EOL_SEEN_CRLF   4
4819
4820 /* Detect how end-of-line of a text of length CODING->src_bytes
4821    pointed by CODING->source is encoded.  Return one of
4822    EOL_SEEN_XXX.  */
4823
4824 #define MAX_EOL_CHECK_COUNT 3
4825
4826 static int
4827 detect_eol (coding, source, src_bytes)
4828      struct coding_system *coding;
4829      unsigned char *source;
4830      EMACS_INT src_bytes;
4831 {
4832   Lisp_Object attrs, coding_type;
4833   unsigned char *src = source, *src_end = src + src_bytes;
4834   unsigned char c;
4835   int total  = 0;
4836   int eol_seen = EOL_SEEN_NONE;
4837
4838   attrs = CODING_ID_ATTRS (coding->id);
4839   coding_type = CODING_ATTR_TYPE (attrs);
4840
4841   if (EQ (coding_type, Qccl))
4842     {
4843       int msb, lsb;
4844
4845       msb = coding->spec.utf_16.endian == utf_16_little_endian;
4846       lsb = 1 - msb;
4847
4848       while (src + 1 < src_end)
4849         {
4850           c = src[lsb];
4851           if (src[msb] == 0 && (c == '\n' || c == '\r'))
4852             {
4853               int this_eol;
4854
4855               if (c == '\n')
4856                 this_eol = EOL_SEEN_LF;
4857               else if (src + 3 >= src_end
4858                        || src[msb + 2] != 0
4859                        || src[lsb + 2] != '\n')
4860                 this_eol = EOL_SEEN_CR;
4861               else
4862                 this_eol = EOL_SEEN_CRLF;
4863
4864               if (eol_seen == EOL_SEEN_NONE)
4865                 /* This is the first end-of-line.  */
4866                 eol_seen = this_eol;
4867               else if (eol_seen != this_eol)
4868                 {
4869                   /* The found type is different from what found before.  */
4870                   eol_seen = EOL_SEEN_LF;
4871                   break;
4872                 }
4873               if (++total == MAX_EOL_CHECK_COUNT)
4874                 break;
4875             }
4876           src += 2;
4877         }
4878     }
4879   else
4880     {
4881       while (src < src_end)
4882         {
4883           c = *src++;
4884           if (c == '\n' || c == '\r')
4885             {
4886               int this_eol;
4887
4888               if (c == '\n')
4889                 this_eol = EOL_SEEN_LF;
4890               else if (src >= src_end || *src != '\n')
4891                 this_eol = EOL_SEEN_CR;
4892               else
4893                 this_eol = EOL_SEEN_CRLF, src++;
4894
4895               if (eol_seen == EOL_SEEN_NONE)
4896                 /* This is the first end-of-line.  */
4897                 eol_seen = this_eol;
4898               else if (eol_seen != this_eol)
4899                 {
4900                   /* The found type is different from what found before.  */
4901                   eol_seen = EOL_SEEN_LF;
4902                   break;
4903                 }
4904               if (++total == MAX_EOL_CHECK_COUNT)
4905                 break;
4906             }
4907         }
4908     }
4909   return eol_seen;
4910 }
4911
4912
4913 static void
4914 adjust_coding_eol_type (coding, eol_seen)
4915      struct coding_system *coding;
4916      int eol_seen;
4917 {
4918   Lisp_Object eol_type;
4919
4920   eol_type = CODING_ID_EOL_TYPE (coding->id);
4921   if (eol_seen & EOL_SEEN_LF)
4922     coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
4923   else if (eol_type & EOL_SEEN_CRLF)
4924     coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
4925   else if (eol_type & EOL_SEEN_CR)
4926     coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
4927 }
4928
4929 /* Detect how a text specified in CODING is encoded.  If a coding
4930    system is detected, update fields of CODING by the detected coding
4931    system.  */
4932
4933 void
4934 detect_coding (coding)
4935      struct coding_system *coding;
4936 {
4937   unsigned char *src, *src_end;
4938   Lisp_Object attrs, coding_type;
4939
4940   coding->consumed = coding->consumed_char = 0;
4941   coding->produced = coding->produced_char = 0;
4942   coding_set_source (coding);
4943
4944   src_end = coding->source + coding->src_bytes;
4945
4946   /* If we have not yet decided the text encoding type, detect it
4947      now.  */
4948   if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
4949     {
4950       int mask = CATEGORY_MASK_ANY;
4951       int c, i;
4952
4953       for (src = coding->source; src < src_end; src++)
4954         {
4955           c = *src;
4956           if (c & 0x80 || (c < 0x20 && (c == ISO_CODE_ESC
4957                                         || c == ISO_CODE_SI
4958                                         || c == ISO_CODE_SO)))
4959             break;
4960         }
4961       coding->head_ascii = src - (coding->source + coding->consumed);
4962
4963       if (coding->head_ascii < coding->src_bytes)
4964         {
4965           int detected = 0;
4966
4967           for (i = 0; i < coding_category_raw_text; i++)
4968             {
4969               enum coding_category category = coding_priorities[i];
4970               struct coding_system *this = coding_categories + category;
4971
4972               if (category >= coding_category_raw_text
4973                   || detected & (1 << category))
4974                 continue;
4975
4976               if (this->id < 0)
4977                 {
4978                   /* No coding system of this category is defined.  */
4979                   mask &= ~(1 << category);
4980                 }
4981               else
4982                 {
4983                   detected |= detected_mask[category];
4984                   if ((*(this->detector)) (coding, &mask))
4985                     break;
4986                 }
4987             }
4988           if (! mask)
4989             setup_coding_system (Qraw_text, coding);
4990           else if (mask != CATEGORY_MASK_ANY)
4991             for (i = 0; i < coding_category_raw_text; i++)
4992               {
4993                 enum coding_category category = coding_priorities[i];
4994                 struct coding_system *this = coding_categories + category;
4995
4996                 if (mask & (1 << category))
4997                   {
4998                     setup_coding_system (CODING_ID_NAME (this->id), coding);
4999                     break;
5000                   }
5001               }
5002         }
5003     }
5004
5005   attrs = CODING_ID_ATTRS (coding->id);
5006   coding_type = CODING_ATTR_TYPE (attrs);
5007
5008   /* If we have not yet decided the EOL type, detect it now.  But, the
5009      detection is impossible for a CCL based coding system, in which
5010      case, we detct the EOL type after decoding.  */
5011   if (VECTORP (CODING_ID_EOL_TYPE (coding->id))
5012       && ! EQ (coding_type, Qccl))
5013     {
5014       int eol_seen = detect_eol (coding, coding->source, coding->src_bytes);
5015
5016       if (eol_seen != EOL_SEEN_NONE)
5017         adjust_coding_eol_type (coding, eol_seen);
5018     }
5019 }
5020
5021
5022 static void
5023 decode_eol (coding)
5024      struct coding_system *coding;
5025 {
5026   if (VECTORP (CODING_ID_EOL_TYPE (coding->id)))
5027     {
5028       unsigned char *p = CHAR_POS_ADDR (coding->dst_pos);
5029       unsigned char *pend = p + coding->produced;
5030       int eol_seen = EOL_SEEN_NONE;
5031
5032       for (; p < pend; p++)
5033         {
5034           if (*p == '\n')
5035             eol_seen |= EOL_SEEN_LF;
5036           else if (*p == '\r')
5037             {
5038               if (p + 1 < pend && *(p + 1) == '\n')
5039                 {
5040                   eol_seen |= EOL_SEEN_CRLF;
5041                   p++;
5042                 }
5043               else
5044                 eol_seen |= EOL_SEEN_CR;
5045             }
5046         }
5047       if (eol_seen != EOL_SEEN_NONE)
5048         adjust_coding_eol_type (coding, eol_seen);
5049     }
5050
5051   if (EQ (CODING_ID_EOL_TYPE (coding->id), Qmac))
5052     {
5053       unsigned char *p = CHAR_POS_ADDR (coding->dst_pos);
5054       unsigned char *pend = p + coding->produced;
5055
5056       for (; p < pend; p++)
5057         if (*p == '\r')
5058           *p = '\n';
5059     }
5060   else if (EQ (CODING_ID_EOL_TYPE (coding->id), Qdos))
5061     {
5062       unsigned char *p, *pbeg, *pend;
5063       Lisp_Object undo_list;
5064
5065       move_gap_both (coding->dst_pos + coding->produced_char,
5066                      coding->dst_pos_byte + coding->produced);
5067       undo_list = current_buffer->undo_list;
5068       current_buffer->undo_list = Qt;
5069       del_range_2 (coding->dst_pos, coding->dst_pos_byte, GPT, GPT_BYTE, Qnil);
5070       current_buffer->undo_list = undo_list;
5071       pbeg = GPT_ADDR;
5072       pend = pbeg + coding->produced;
5073
5074       for (p = pend - 1; p >= pbeg; p--)
5075         if (*p == '\r')
5076           {
5077             safe_bcopy ((char *) (p + 1), (char *) p, pend - p - 1);
5078             pend--;
5079           }
5080       coding->produced_char -= coding->produced - (pend - pbeg);
5081       coding->produced = pend - pbeg;
5082       insert_from_gap (coding->produced_char, coding->produced);
5083     }
5084 }
5085
5086 static void
5087 translate_chars (coding, table)
5088      struct coding_system *coding;
5089      Lisp_Object table;
5090 {
5091   int *charbuf = coding->charbuf;
5092   int *charbuf_end = charbuf + coding->charbuf_used;
5093   int c;
5094
5095   if (coding->chars_at_source)
5096     return;
5097
5098   while (charbuf < charbuf_end)
5099     {
5100       c = *charbuf;
5101       if (c < 0)
5102         charbuf += c;
5103       else
5104         *charbuf++ = translate_char (table, c);
5105     }
5106 }
5107
5108 static int
5109 produce_chars (coding)
5110      struct coding_system *coding;
5111 {
5112   unsigned char *dst = coding->destination + coding->produced;
5113   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5114   int produced;
5115   int produced_chars = 0;
5116
5117   if (! coding->chars_at_source)
5118     {
5119       /* Characters are in coding->charbuf.  */
5120       int *buf = coding->charbuf;
5121       int *buf_end = buf + coding->charbuf_used;
5122       unsigned char *adjusted_dst_end;
5123
5124       if (BUFFERP (coding->src_object)
5125           && EQ (coding->src_object, coding->dst_object))
5126         dst_end = coding->source + coding->consumed;
5127       adjusted_dst_end = dst_end - MAX_MULTIBYTE_LENGTH;
5128
5129       while (buf < buf_end)
5130         {
5131           int c = *buf++;
5132
5133           if (dst >= adjusted_dst_end)
5134             {
5135               dst = alloc_destination (coding,
5136                                        buf_end - buf + MAX_MULTIBYTE_LENGTH,
5137                                        dst);
5138               dst_end = coding->destination + coding->dst_bytes;
5139               adjusted_dst_end = dst_end - MAX_MULTIBYTE_LENGTH;
5140             }
5141           if (c >= 0)
5142             {
5143               if (coding->dst_multibyte
5144                   || ! CHAR_BYTE8_P (c))
5145                 CHAR_STRING_ADVANCE (c, dst);
5146               else
5147                 *dst++ = CHAR_TO_BYTE8 (c);
5148               produced_chars++;
5149             }
5150           else
5151             /* This is an annotation data.  */
5152             buf -= c + 1;
5153         }
5154     }
5155   else
5156     {
5157       unsigned char *src = coding->source;
5158       unsigned char *src_end = src + coding->src_bytes;
5159       Lisp_Object eol_type;
5160
5161       eol_type = CODING_ID_EOL_TYPE (coding->id);
5162
5163       if (coding->src_multibyte != coding->dst_multibyte)
5164         {
5165           if (coding->src_multibyte)
5166             {
5167               int multibytep = 1;
5168               int consumed_chars;
5169
5170               while (1)
5171                 {
5172                   unsigned char *src_base = src;
5173                   int c;
5174
5175                   ONE_MORE_BYTE (c);
5176                   if (c == '\r')
5177                     {
5178                       if (EQ (eol_type, Qdos))
5179                         {
5180                           if (src < src_end
5181                               && *src == '\n')
5182                             c = *src++;
5183                         }
5184                       else if (EQ (eol_type, Qmac))
5185                         c = '\n';
5186                     }
5187                   if (dst == dst_end)
5188                     {
5189                       coding->consumed = src - coding->source;
5190
5191                     if (EQ (coding->src_object, coding->dst_object))
5192                       dst_end = src;
5193                     if (dst == dst_end)
5194                       {
5195                         dst = alloc_destination (coding, src_end - src + 1,
5196                                                  dst);
5197                         dst_end = coding->destination + coding->dst_bytes;
5198                         coding_set_source (coding);
5199                         src = coding->source + coding->consumed;
5200                         src_end = coding->source + coding->src_bytes;
5201                       }
5202                     }
5203                   *dst++ = c;
5204                   produced_chars++;
5205                 }
5206             no_more_source:
5207               ;
5208             }
5209           else
5210             while (src < src_end)
5211               {
5212                 int multibytep = 1;
5213                 int c = *src++;
5214
5215                 if (c == '\r')
5216                   {
5217                     if (EQ (eol_type, Qdos))
5218                       {
5219                         if (src < src_end
5220                             && *src == '\n')
5221                           c = *src++;
5222                       }
5223                     else if (EQ (eol_type, Qmac))
5224                       c = '\n';
5225                   }
5226                 if (dst >= dst_end - 1)
5227                   {
5228                     coding->consumed = src - coding->source;
5229
5230                     if (EQ (coding->src_object, coding->dst_object))
5231                       dst_end = src;
5232                     if (dst >= dst_end - 1)
5233                       {
5234                         dst = alloc_destination (coding, src_end - src + 2,
5235                                                  dst);
5236                         dst_end = coding->destination + coding->dst_bytes;
5237                         coding_set_source (coding);
5238                         src = coding->source + coding->consumed;
5239                         src_end = coding->source + coding->src_bytes;
5240                       }
5241                   }
5242                 EMIT_ONE_BYTE (c);
5243               }
5244         }
5245       else
5246         {
5247           if (!EQ (coding->src_object, coding->dst_object))
5248             {
5249               int require = coding->src_bytes - coding->dst_bytes;
5250
5251               if (require > 0)
5252                 {
5253                   EMACS_INT offset = src - coding->source;
5254
5255                   dst = alloc_destination (coding, require, dst);
5256                   coding_set_source (coding);
5257                   src = coding->source + offset;
5258                   src_end = coding->source + coding->src_bytes;
5259                 }
5260             }
5261           produced_chars = coding->src_chars;
5262           while (src < src_end)
5263             {
5264               int c = *src++;
5265
5266               if (c == '\r')
5267                 {
5268                   if (EQ (eol_type, Qdos))
5269                     {
5270                       if (src < src_end
5271                           && *src == '\n')
5272                         c = *src++;
5273                       produced_chars--;
5274                     }
5275                   else if (EQ (eol_type, Qmac))
5276                     c = '\n';
5277                 }
5278               *dst++ = c;
5279             }
5280         }
5281       coding->consumed = coding->src_bytes;
5282       coding->consumed_char = coding->src_chars;
5283     }
5284
5285   produced = dst - (coding->destination + coding->produced);
5286   if (BUFFERP (coding->dst_object))
5287     insert_from_gap (produced_chars, produced);
5288   coding->produced += produced;
5289   coding->produced_char += produced_chars;
5290   return produced_chars;
5291 }
5292
5293 /* [ -LENGTH CHAR_POS_OFFSET MASK METHOD COMP_LEN ]
5294         or
5295    [ -LENGTH CHAR_POS_OFFSET MASK METHOD COMP_LEN COMPONENTS... ]
5296  */
5297
5298 static INLINE void
5299 produce_composition (coding, charbuf)
5300      struct coding_system *coding;
5301      int *charbuf;
5302 {
5303   Lisp_Object buffer;
5304   int len;
5305   EMACS_INT pos;
5306   enum composition_method method;
5307   int cmp_len;
5308   Lisp_Object components;
5309
5310   buffer = coding->dst_object;
5311   len = -charbuf[0];
5312   pos = coding->dst_pos + charbuf[1];
5313   method = (enum composition_method) (charbuf[3]);
5314   cmp_len = charbuf[4];
5315
5316   if (method == COMPOSITION_RELATIVE)
5317     components = Qnil;
5318   else
5319     {
5320       Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
5321       int i;
5322
5323       len -= 5;
5324       charbuf += 5;
5325       for (i = 0; i < len; i++)
5326         args[i] = make_number (charbuf[i]);
5327       components = (method == COMPOSITION_WITH_ALTCHARS
5328                     ? Fstring (len, args) : Fvector (len, args));
5329     }
5330   compose_text (pos, pos + cmp_len, components, Qnil, Qnil);
5331 }
5332
5333 static int *
5334 save_composition_data (buf, buf_end, prop)
5335      int *buf, *buf_end;
5336      Lisp_Object prop;
5337 {
5338   enum composition_method method = COMPOSITION_METHOD (prop);
5339   int cmp_len = COMPOSITION_LENGTH (prop);
5340
5341   if (buf + 4 + (MAX_COMPOSITION_COMPONENTS * 2 - 1) > buf_end)
5342     return NULL;
5343
5344   buf[1] = CODING_ANNOTATE_COMPOSITION_MASK;
5345   buf[2] = method;
5346   buf[3] = cmp_len;
5347
5348   if (method == COMPOSITION_RELATIVE)
5349     buf[0] = 4;
5350   else
5351     {
5352       Lisp_Object components;
5353       int len, i;
5354
5355       components = COMPOSITION_COMPONENTS (prop);
5356       if (VECTORP (components))
5357         {
5358           len = XVECTOR (components)->size;
5359           for (i = 0; i < len; i++)
5360             buf[4 + i] = XINT (AREF (components, i));
5361         }
5362       else if (STRINGP (components))
5363         {
5364           int i_byte;
5365
5366           len = XSTRING (components)->size;
5367           i = i_byte = 0;
5368           while (i < len)
5369             FETCH_STRING_CHAR_ADVANCE (buf[4 + i], components, i, i_byte);
5370         }
5371       else if (INTEGERP (components))
5372         {
5373           len = 1;
5374           buf[4] = XINT (components);
5375         }
5376       else if (CONSP (components))
5377         {
5378           for (len = 0; CONSP (components);
5379                len++, components = XCDR (components))
5380             buf[4 + len] = XINT (XCAR (components));
5381         }
5382       else
5383         abort ();
5384       buf[0] = 4 + len;
5385     }
5386   return (buf + buf[0]);
5387 }
5388
5389 #define CHARBUF_SIZE 0x4000
5390
5391 #define ALLOC_CONVERSION_WORK_AREA(coding)                              \
5392   do {                                                                  \
5393     int size = CHARBUF_SIZE;;                                           \
5394                                                                         \
5395     coding->charbuf = NULL;                                             \
5396     while (size > 1024)                                                 \
5397       {                                                                 \
5398         coding->charbuf = (int *) alloca (sizeof (int) * size);         \
5399         if (coding->charbuf)                                            \
5400           break;                                                        \
5401         size >>= 1;                                                     \
5402       }                                                                 \
5403     if (! coding->charbuf)                                              \
5404       {                                                                 \
5405         coding->result = CODING_RESULT_INSUFFICIENT_MEM;                \
5406         return coding->result;                                          \
5407       }                                                                 \
5408     coding->charbuf_size = size;                                        \
5409   } while (0)
5410
5411
5412 static void
5413 produce_annotation (coding)
5414      struct coding_system *coding;
5415 {
5416   int *charbuf = coding->charbuf;
5417   int *charbuf_end = charbuf + coding->charbuf_used;
5418
5419   while (charbuf < charbuf_end)
5420     {
5421       if (*charbuf >= 0)
5422         charbuf++;
5423       else
5424         {
5425           int len = -*charbuf;
5426           switch (charbuf[2])
5427             {
5428             case CODING_ANNOTATE_COMPOSITION_MASK:
5429               produce_composition (coding, charbuf);
5430               break;
5431             default:
5432               abort ();
5433             }
5434           charbuf += len;
5435         }
5436     }
5437 }
5438
5439 /* Decode the data at CODING->src_object into CODING->dst_object.
5440    CODING->src_object is a buffer, a string, or nil.
5441    CODING->dst_object is a buffer.
5442
5443    If CODING->src_object is a buffer, it must be the current buffer.
5444    In this case, if CODING->src_pos is positive, it is a position of
5445    the source text in the buffer, otherwise, the source text is in the
5446    gap area of the buffer, and CODING->src_pos specifies the offset of
5447    the text from GPT (which must be the same as PT).  If this is the
5448    same buffer as CODING->dst_object, CODING->src_pos must be
5449    negative.
5450
5451    If CODING->src_object is a string, CODING->src_pos in an index to
5452    that string.
5453
5454    If CODING->src_object is nil, CODING->source must already point to
5455    the non-relocatable memory area.  In this case, CODING->src_pos is
5456    an offset from CODING->source.
5457
5458    The decoded data is inserted at the current point of the buffer
5459    CODING->dst_object.
5460 */
5461
5462 static int
5463 decode_coding (coding)
5464      struct coding_system *coding;
5465 {
5466   Lisp_Object attrs;
5467
5468   if (BUFFERP (coding->src_object)
5469       && coding->src_pos > 0
5470       && coding->src_pos < GPT
5471       && coding->src_pos + coding->src_chars > GPT)
5472     move_gap_both (coding->src_pos, coding->src_pos_byte);
5473
5474   if (BUFFERP (coding->dst_object))
5475     {
5476       if (current_buffer != XBUFFER (coding->dst_object))
5477         set_buffer_internal (XBUFFER (coding->dst_object));
5478       if (GPT != PT)
5479         move_gap_both (PT, PT_BYTE);
5480     }
5481
5482   coding->consumed = coding->consumed_char = 0;
5483   coding->produced = coding->produced_char = 0;
5484   coding->chars_at_source = 0;
5485   coding->result = CODING_RESULT_SUCCESS;
5486   coding->errors = 0;
5487
5488   ALLOC_CONVERSION_WORK_AREA (coding);
5489
5490   attrs = CODING_ID_ATTRS (coding->id);
5491
5492   do
5493     {
5494       coding_set_source (coding);
5495       coding->annotated = 0;
5496       (*(coding->decoder)) (coding);
5497       if (!NILP (CODING_ATTR_DECODE_TBL (attrs)))
5498         translate_chars (CODING_ATTR_DECODE_TBL (attrs), coding);
5499       coding_set_destination (coding);
5500       produce_chars (coding);
5501       if (coding->annotated)
5502         produce_annotation (coding);
5503     }
5504   while (coding->consumed < coding->src_bytes
5505          && ! coding->result);
5506
5507   if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qccl)
5508       && SYMBOLP (CODING_ID_EOL_TYPE (coding->id))
5509       && ! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix))
5510     decode_eol (coding);
5511
5512   coding->carryover_bytes = 0;
5513   if (coding->consumed < coding->src_bytes)
5514     {
5515       int nbytes = coding->src_bytes - coding->consumed;
5516       unsigned char *src;
5517
5518       coding_set_source (coding);
5519       coding_set_destination (coding);
5520       src = coding->source + coding->consumed;
5521
5522       if (coding->mode & CODING_MODE_LAST_BLOCK)
5523         {
5524           /* Flush out unprocessed data as binary chars.  We are sure
5525              that the number of data is less than the size of
5526              coding->charbuf.  */
5527           int *charbuf = coding->charbuf;
5528
5529           while (nbytes-- > 0)
5530             {
5531               int c = *src++;
5532               *charbuf++ =  (c & 0x80 ? - c : c);
5533             }
5534           produce_chars (coding);
5535         }
5536       else
5537         {
5538           /* Record unprocessed bytes in coding->carryover.  We are
5539              sure that the number of data is less than the size of
5540              coding->carryover.  */
5541           unsigned char *p = coding->carryover;
5542
5543           coding->carryover_bytes = nbytes;
5544           while (nbytes-- > 0)
5545             *p++ = *src++;
5546         }
5547       coding->consumed = coding->src_bytes;
5548     }
5549
5550   return coding->result;
5551 }
5552
5553 static void
5554 consume_chars (coding)
5555      struct coding_system *coding;
5556 {
5557   int *buf = coding->charbuf;
5558   /* -1 is to compensate for CRLF.  */
5559   int *buf_end = coding->charbuf + coding->charbuf_size - 1;
5560   unsigned char *src = coding->source + coding->consumed;
5561   int pos = coding->src_pos + coding->consumed_char;
5562   int end_pos = coding->src_pos + coding->src_chars;
5563   int multibytep = coding->src_multibyte;
5564   Lisp_Object eol_type;
5565   int c;
5566   int start, end, stop;
5567   Lisp_Object object, prop;
5568
5569   eol_type = CODING_ID_EOL_TYPE (coding->id);
5570   if (VECTORP (eol_type))
5571     eol_type = Qunix;
5572
5573   object = coding->src_object;
5574
5575   /* Note: composition handling is not yet implemented.  */
5576   coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
5577
5578   if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK
5579       && find_composition (pos, end_pos, &start, &end, &prop, object)
5580       && end <= end_pos
5581       && (start >= pos
5582           || (find_composition (end, end_pos, &start, &end, &prop, object)
5583               && end <= end_pos)))
5584     stop = start;
5585   else
5586     stop = end_pos;
5587
5588   while (buf < buf_end)
5589     {
5590       if (pos == stop)
5591         {
5592           int *p;
5593
5594           if (pos == end_pos)
5595             break;
5596           p = save_composition_data (buf, buf_end, prop);
5597           if (p == NULL)
5598             break;
5599           buf = p;
5600           if (find_composition (end, end_pos, &start, &end, &prop, object)
5601               && end <= end_pos)
5602             stop = start;
5603           else
5604             stop = end_pos;
5605         }
5606
5607       if (! multibytep)
5608         c = *src++;
5609       else
5610         c = STRING_CHAR_ADVANCE (src);
5611       if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
5612         c = '\n';
5613       if (! EQ (eol_type, Qunix))
5614         {
5615           if (c == '\n')
5616             {
5617               if (EQ (eol_type, Qdos))
5618                 *buf++ = '\r';
5619               else
5620                 c = '\r';
5621             }
5622         }
5623       *buf++ = c;
5624       pos++;
5625     }
5626
5627   coding->consumed = src - coding->source;
5628   coding->consumed_char = pos - coding->src_pos;
5629   coding->charbuf_used = buf - coding->charbuf;
5630   coding->chars_at_source = 0;
5631 }
5632
5633
5634 /* Encode the text at CODING->src_object into CODING->dst_object.
5635    CODING->src_object is a buffer or a string.
5636    CODING->dst_object is a buffer or nil.
5637
5638    If CODING->src_object is a buffer, it must be the current buffer.
5639    In this case, if CODING->src_pos is positive, it is a position of
5640    the source text in the buffer, otherwise. the source text is in the
5641    gap area of the buffer, and coding->src_pos specifies the offset of
5642    the text from GPT (which must be the same as PT).  If this is the
5643    same buffer as CODING->dst_object, CODING->src_pos must be
5644    negative and CODING should not have `pre-write-conversion'.
5645
5646    If CODING->src_object is a string, CODING should not have
5647    `pre-write-conversion'.
5648
5649    If CODING->dst_object is a buffer, the encoded data is inserted at
5650    the current point of that buffer.
5651
5652    If CODING->dst_object is nil, the encoded data is placed at the
5653    memory area specified by CODING->destination.  */
5654
5655 static int
5656 encode_coding (coding)
5657      struct coding_system *coding;
5658 {
5659   Lisp_Object attrs;
5660
5661   attrs = CODING_ID_ATTRS (coding->id);
5662
5663   if (BUFFERP (coding->dst_object))
5664     {
5665       set_buffer_internal (XBUFFER (coding->dst_object));
5666       coding->dst_multibyte
5667         = ! NILP (current_buffer->enable_multibyte_characters);
5668     }
5669
5670   coding->consumed = coding->consumed_char = 0;
5671   coding->produced = coding->produced_char = 0;
5672   coding->result = CODING_RESULT_SUCCESS;
5673   coding->errors = 0;
5674
5675   ALLOC_CONVERSION_WORK_AREA (coding);
5676
5677   do {
5678     coding_set_source (coding);
5679     consume_chars (coding);
5680
5681     if (!NILP (CODING_ATTR_ENCODE_TBL (attrs)))
5682       translate_chars (CODING_ATTR_ENCODE_TBL (attrs), coding);
5683
5684     coding_set_destination (coding);
5685     (*(coding->encoder)) (coding);
5686   } while (coding->consumed_char < coding->src_chars);
5687
5688   if (BUFFERP (coding->dst_object))
5689     insert_from_gap (coding->produced_char, coding->produced);
5690
5691   return (coding->result);
5692 }
5693
5694 /* Work buffer */
5695
5696 /* List of currently used working buffer.  */
5697 Lisp_Object Vcode_conversion_work_buf_list;
5698
5699 /* A working buffer used by the top level conversion.  */
5700 Lisp_Object Vcode_conversion_reused_work_buf;
5701
5702
5703 /* Return a working buffer that can be freely used by the following
5704    code conversion.  MULTIBYTEP specifies the multibyteness of the
5705    buffer.  */
5706
5707 Lisp_Object
5708 make_conversion_work_buffer (multibytep)
5709      int multibytep;
5710 {
5711   struct buffer *current = current_buffer;
5712   Lisp_Object buf;
5713
5714   if (NILP (Vcode_conversion_work_buf_list))
5715     {
5716       if (NILP (Vcode_conversion_reused_work_buf))
5717         Vcode_conversion_reused_work_buf
5718           = Fget_buffer_create (build_string (" *code-conversion-work*"));
5719       Vcode_conversion_work_buf_list
5720         = Fcons (Vcode_conversion_reused_work_buf, Qnil);
5721     }
5722   else
5723     {
5724       int depth = Flength (Vcode_conversion_work_buf_list);
5725       char str[128];
5726
5727       sprintf (str, " *code-conversion-work*<%d>", depth);
5728       Vcode_conversion_work_buf_list
5729         = Fcons (Fget_buffer_create (build_string (str)),
5730                  Vcode_conversion_work_buf_list);
5731     }
5732
5733   buf = XCAR (Vcode_conversion_work_buf_list);
5734   set_buffer_internal (XBUFFER (buf));
5735   current_buffer->undo_list = Qt;
5736   Ferase_buffer ();
5737   Fset_buffer_multibyte (multibytep ? Qt : Qnil);
5738   set_buffer_internal (current);
5739   return buf;
5740 }
5741
5742 static struct coding_system *saved_coding;
5743
5744 Lisp_Object
5745 code_conversion_restore (info)
5746      Lisp_Object info;
5747 {
5748   int depth = Flength (Vcode_conversion_work_buf_list);
5749   Lisp_Object buf;
5750
5751   if (depth > 0)
5752     {
5753       buf = XCAR (Vcode_conversion_work_buf_list);
5754       Vcode_conversion_work_buf_list = XCDR (Vcode_conversion_work_buf_list);
5755       if (depth > 1 && !NILP (Fbuffer_live_p (buf)))
5756         Fkill_buffer (buf);
5757     }
5758
5759   if (saved_coding->dst_object == Qt
5760       && saved_coding->destination)
5761     xfree (saved_coding->destination);
5762
5763   return save_excursion_restore (info);
5764 }
5765
5766
5767 int
5768 decode_coding_gap (coding, chars, bytes)
5769      struct coding_system *coding;
5770      EMACS_INT chars, bytes;
5771 {
5772   int count = specpdl_ptr - specpdl;
5773
5774   saved_coding = coding;
5775   record_unwind_protect (code_conversion_restore, save_excursion_save ());
5776
5777   coding->src_object = Fcurrent_buffer ();
5778   coding->src_chars = chars;
5779   coding->src_bytes = bytes;
5780   coding->src_pos = -chars;
5781   coding->src_pos_byte = -bytes;
5782   coding->src_multibyte = chars < bytes;
5783   coding->dst_object = coding->src_object;
5784   coding->dst_pos = PT;
5785   coding->dst_pos_byte = PT_BYTE;
5786   coding->dst_multibyte = ! NILP (current_buffer->enable_multibyte_characters);
5787
5788   if (CODING_REQUIRE_DETECTION (coding))
5789     detect_coding (coding);
5790
5791   decode_coding (coding);
5792
5793   unbind_to (count, Qnil);
5794   return coding->result;
5795 }
5796
5797 int
5798 encode_coding_gap (coding, chars, bytes)
5799      struct coding_system *coding;
5800      EMACS_INT chars, bytes;
5801 {
5802   int count = specpdl_ptr - specpdl;
5803   Lisp_Object buffer;
5804
5805   saved_coding = coding;
5806   record_unwind_protect (code_conversion_restore, save_excursion_save ());
5807
5808   buffer = Fcurrent_buffer ();
5809   coding->src_object = buffer;
5810   coding->src_chars = chars;
5811   coding->src_bytes = bytes;
5812   coding->src_pos = -chars;
5813   coding->src_pos_byte = -bytes;
5814   coding->src_multibyte = chars < bytes;
5815   coding->dst_object = coding->src_object;
5816   coding->dst_pos = PT;
5817   coding->dst_pos_byte = PT_BYTE;
5818
5819   encode_coding (coding);
5820
5821   unbind_to (count, Qnil);
5822   return coding->result;
5823 }
5824
5825
5826 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
5827    SRC_OBJECT into DST_OBJECT by coding context CODING.
5828
5829    SRC_OBJECT is a buffer, a string, or Qnil.
5830
5831    If it is a buffer, the text is at point of the buffer.  FROM and TO
5832    are positions in the buffer.
5833
5834    If it is a string, the text is at the beginning of the string.
5835    FROM and TO are indices to the string.
5836
5837    If it is nil, the text is at coding->source.  FROM and TO are
5838    indices to coding->source.
5839
5840    DST_OBJECT is a buffer, Qt, or Qnil.
5841
5842    If it is a buffer, the decoded text is inserted at point of the
5843    buffer.  If the buffer is the same as SRC_OBJECT, the source text
5844    is deleted.
5845
5846    If it is Qt, a string is made from the decoded text, and
5847    set in CODING->dst_object.
5848
5849    If it is Qnil, the decoded text is stored at CODING->destination.
5850    The called must allocate CODING->dst_bytes bytes at
5851    CODING->destination by xmalloc.  If the decoded text is longer than
5852    CODING->dst_bytes, CODING->destination is relocated by xrealloc.
5853  */
5854
5855 void
5856 decode_coding_object (coding, src_object, from, from_byte, to, to_byte,
5857                       dst_object)
5858      struct coding_system *coding;
5859      Lisp_Object src_object;
5860      EMACS_INT from, from_byte, to, to_byte;
5861      Lisp_Object dst_object;
5862 {
5863   int count = specpdl_ptr - specpdl;
5864   unsigned char *destination;
5865   EMACS_INT dst_bytes;
5866   EMACS_INT chars = to - from;
5867   EMACS_INT bytes = to_byte - from_byte;
5868   Lisp_Object attrs;
5869
5870   saved_coding = coding;
5871   record_unwind_protect (code_conversion_restore, save_excursion_save ());
5872
5873   if (NILP (dst_object))
5874     {
5875       destination = coding->destination;
5876       dst_bytes = coding->dst_bytes;
5877     }
5878
5879   coding->src_object = src_object;
5880   coding->src_chars = chars;
5881   coding->src_bytes = bytes;
5882   coding->src_multibyte = chars < bytes;
5883
5884   if (STRINGP (src_object))
5885     {
5886       coding->src_pos = from;
5887       coding->src_pos_byte = from_byte;
5888     }
5889   else if (BUFFERP (src_object))
5890     {
5891       set_buffer_internal (XBUFFER (src_object));
5892       if (from != GPT)
5893         move_gap_both (from, from_byte);
5894       if (EQ (src_object, dst_object))
5895         {
5896           TEMP_SET_PT_BOTH (from, from_byte);
5897           del_range_both (from, from_byte, to, to_byte, 1);
5898           coding->src_pos = -chars;
5899           coding->src_pos_byte = -bytes;
5900         }
5901       else
5902         {
5903           coding->src_pos = from;
5904           coding->src_pos_byte = from_byte;
5905         }
5906     }
5907
5908   if (CODING_REQUIRE_DETECTION (coding))
5909     detect_coding (coding);
5910   attrs = CODING_ID_ATTRS (coding->id);
5911
5912   if (! NILP (CODING_ATTR_POST_READ (attrs))
5913       || EQ (dst_object, Qt))
5914     {
5915       coding->dst_object = make_conversion_work_buffer (1);
5916       coding->dst_pos = BEG;
5917       coding->dst_pos_byte = BEG_BYTE;
5918       coding->dst_multibyte = 1;
5919     }
5920   else if (BUFFERP (dst_object))
5921     {
5922       coding->dst_object = dst_object;
5923       coding->dst_pos = BUF_PT (XBUFFER (dst_object));
5924       coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
5925       coding->dst_multibyte
5926         = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
5927     }
5928   else
5929     {
5930       coding->dst_object = Qnil;
5931       coding->dst_multibyte = 1;
5932     }
5933
5934   decode_coding (coding);
5935
5936   if (BUFFERP (coding->dst_object))
5937     set_buffer_internal (XBUFFER (coding->dst_object));
5938
5939   if (! NILP (CODING_ATTR_POST_READ (attrs)))
5940     {
5941       struct gcpro gcpro1, gcpro2;
5942       EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
5943       Lisp_Object val;
5944
5945       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
5946       GCPRO2 (coding->src_object, coding->dst_object);
5947       val = call1 (CODING_ATTR_POST_READ (attrs),
5948                    make_number (coding->produced_char));
5949       UNGCPRO;
5950       CHECK_NATNUM (val);
5951       coding->produced_char += Z - prev_Z;
5952       coding->produced += Z_BYTE - prev_Z_BYTE;
5953     }
5954
5955   if (EQ (dst_object, Qt))
5956     {
5957       coding->dst_object = Fbuffer_string ();
5958     }
5959   else if (NILP (dst_object) && BUFFERP (coding->dst_object))
5960     {
5961       set_buffer_internal (XBUFFER (coding->dst_object));
5962       if (dst_bytes < coding->produced)
5963         {
5964           destination
5965             = (unsigned char *) xrealloc (destination, coding->produced);
5966           if (! destination)
5967             {
5968               coding->result = CODING_RESULT_INSUFFICIENT_DST;
5969               unbind_to (count, Qnil);
5970               return;
5971             }
5972           if (BEGV < GPT && GPT < BEGV + coding->produced_char)
5973             move_gap_both (BEGV, BEGV_BYTE);
5974           bcopy (BEGV_ADDR, destination, coding->produced);
5975           coding->destination = destination;
5976         }
5977     }
5978
5979   unbind_to (count, Qnil);
5980 }
5981
5982
5983 void
5984 encode_coding_object (coding, src_object, from, from_byte, to, to_byte,
5985                       dst_object)
5986      struct coding_system *coding;
5987      Lisp_Object src_object;
5988      EMACS_INT from, from_byte, to, to_byte;
5989      Lisp_Object dst_object;
5990 {
5991   int count = specpdl_ptr - specpdl;
5992   EMACS_INT chars = to - from;
5993   EMACS_INT bytes = to_byte - from_byte;
5994   Lisp_Object attrs;
5995
5996   saved_coding = coding;
5997   record_unwind_protect (code_conversion_restore, save_excursion_save ());
5998
5999   coding->src_object = src_object;
6000   coding->src_chars = chars;
6001   coding->src_bytes = bytes;
6002   coding->src_multibyte = chars < bytes;
6003
6004   attrs = CODING_ID_ATTRS (coding->id);
6005
6006   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
6007     {
6008       coding->src_object = make_conversion_work_buffer (coding->src_multibyte);
6009       set_buffer_internal (XBUFFER (coding->src_object));
6010       if (STRINGP (src_object))
6011         insert_from_string (src_object, from, from_byte, chars, bytes, 0);
6012       else if (BUFFERP (src_object))
6013         insert_from_buffer (XBUFFER (src_object), from, chars, 0);
6014       else
6015         insert_1_both (coding->source + from, chars, bytes, 0, 0, 0);
6016
6017       if (EQ (src_object, dst_object))
6018         {
6019           set_buffer_internal (XBUFFER (src_object));
6020           del_range_both (from, from_byte, to, to_byte, 1);
6021           set_buffer_internal (XBUFFER (coding->src_object));
6022         }
6023
6024       call2 (CODING_ATTR_PRE_WRITE (attrs),
6025              make_number (BEG), make_number (Z));
6026       coding->src_object = Fcurrent_buffer ();
6027       if (BEG != GPT)
6028         move_gap_both (BEG, BEG_BYTE);
6029       coding->src_chars = Z - BEG;
6030       coding->src_bytes = Z_BYTE - BEG_BYTE;
6031       coding->src_pos = BEG;
6032       coding->src_pos_byte = BEG_BYTE;
6033       coding->src_multibyte = Z < Z_BYTE;
6034     }
6035   else if (STRINGP (src_object))
6036     {
6037       coding->src_pos = from;
6038       coding->src_pos_byte = from_byte;
6039     }
6040   else if (BUFFERP (src_object))
6041     {
6042       set_buffer_internal (XBUFFER (src_object));
6043       if (from != GPT)
6044         move_gap_both (from, from_byte);
6045       if (EQ (src_object, dst_object))
6046         {
6047           del_range_both (from, from_byte, to, to_byte, 1);
6048           coding->src_pos = -chars;
6049           coding->src_pos_byte = -bytes;
6050         }
6051       else
6052         {
6053           coding->src_pos = from;
6054           coding->src_pos_byte = from_byte;
6055         }
6056     }
6057
6058   if (BUFFERP (dst_object))
6059     {
6060       coding->dst_object = dst_object;
6061       coding->dst_pos = BUF_PT (XBUFFER (dst_object));
6062       coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
6063       coding->dst_multibyte
6064         = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
6065     }
6066   else if (EQ (dst_object, Qt))
6067     {
6068       coding->dst_object = Qnil;
6069       coding->dst_bytes = coding->src_chars;
6070       if (coding->dst_bytes == 0)
6071         coding->dst_bytes = 1;
6072       coding->destination = (unsigned char *) xmalloc (coding->dst_bytes);
6073       coding->dst_multibyte = 0;
6074     }
6075   else
6076     {
6077       coding->dst_object = Qnil;
6078       coding->dst_multibyte = 0;
6079     }
6080
6081   encode_coding (coding);
6082
6083   if (EQ (dst_object, Qt))
6084     {
6085       if (BUFFERP (coding->dst_object))
6086         coding->dst_object = Fbuffer_string ();
6087       else
6088         {
6089           coding->dst_object
6090             = make_unibyte_string ((char *) coding->destination,
6091                                    coding->produced);
6092           xfree (coding->destination);
6093         }
6094     }
6095
6096   unbind_to (count, Qnil);
6097 }
6098
6099
6100 Lisp_Object
6101 preferred_coding_system ()
6102 {
6103   int id = coding_categories[coding_priorities[0]].id;
6104
6105   return CODING_ID_NAME (id);
6106 }
6107
6108 \f
6109 #ifdef emacs
6110 /*** 8. Emacs Lisp library functions ***/
6111
6112 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
6113        doc: /* Return t if OBJECT is nil or a coding-system.
6114 See the documentation of `define-coding-system' for information
6115 about coding-system objects.  */)
6116      (obj)
6117      Lisp_Object obj;
6118 {
6119   return ((NILP (obj) || CODING_SYSTEM_P (obj)) ? Qt : Qnil);
6120 }
6121
6122 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
6123        Sread_non_nil_coding_system, 1, 1, 0,
6124        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.  */)
6125      (prompt)
6126      Lisp_Object prompt;
6127 {
6128   Lisp_Object val;
6129   do
6130     {
6131       val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6132                               Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
6133     }
6134   while (XSTRING (val)->size == 0);
6135   return (Fintern (val, Qnil));
6136 }
6137
6138 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
6139        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
6140 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.  */)
6141      (prompt, default_coding_system)
6142      Lisp_Object prompt, default_coding_system;
6143 {
6144   Lisp_Object val;
6145   if (SYMBOLP (default_coding_system))
6146     XSETSTRING (default_coding_system, XSYMBOL (default_coding_system)->name);
6147   val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6148                           Qt, Qnil, Qcoding_system_history,
6149                           default_coding_system, Qnil);
6150   return (XSTRING (val)->size == 0 ? Qnil : Fintern (val, Qnil));
6151 }
6152
6153 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
6154        1, 1, 0,
6155        doc: /* Check validity of CODING-SYSTEM.
6156 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
6157 It is valid if it is a symbol with a non-nil `coding-system' property.
6158 The value of property should be a vector of length 5.  */)
6159   (coding_system)
6160      Lisp_Object coding_system;
6161 {
6162   CHECK_SYMBOL (coding_system);
6163   if (!NILP (Fcoding_system_p (coding_system)))
6164     return coding_system;
6165   while (1)
6166     Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
6167 }
6168
6169 \f
6170 Lisp_Object
6171 detect_coding_system (src, src_bytes, highest, multibytep, coding_system)
6172      unsigned char *src;
6173      int src_bytes, highest;
6174      int multibytep;
6175      Lisp_Object coding_system;
6176 {
6177   unsigned char *src_end = src + src_bytes;
6178   int mask = CATEGORY_MASK_ANY;
6179   int detected = 0;
6180   int c, i;
6181   Lisp_Object attrs, eol_type;
6182   Lisp_Object val;
6183   struct coding_system coding;
6184
6185   if (NILP (coding_system))
6186     coding_system = Qundecided;
6187   setup_coding_system (coding_system, &coding);
6188   attrs = CODING_ID_ATTRS (coding.id);
6189   eol_type = CODING_ID_EOL_TYPE (coding.id);
6190
6191   coding.source = src;
6192   coding.src_bytes = src_bytes;
6193   coding.src_multibyte = multibytep;
6194   coding.consumed = 0;
6195
6196   if (XINT (CODING_ATTR_CATEGORY (attrs)) != coding_category_undecided)
6197     {
6198       mask = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
6199     }
6200   else
6201     {
6202       coding_system = Qnil;
6203       for (; src < src_end; src++)
6204         {
6205           c = *src;
6206           if (c & 0x80 || (c < 0x20 && (c == ISO_CODE_ESC
6207                                         || c == ISO_CODE_SI
6208                                         || c == ISO_CODE_SO)))
6209             break;
6210         }
6211       coding.head_ascii = src - coding.source;
6212
6213       if (src < src_end)
6214         for (i = 0; i < coding_category_raw_text; i++)
6215           {
6216             enum coding_category category = coding_priorities[i];
6217             struct coding_system *this = coding_categories + category;
6218
6219             if (category >= coding_category_raw_text
6220                 || detected & (1 << category))
6221               continue;
6222
6223             if (this->id < 0)
6224               {
6225                 /* No coding system of this category is defined.  */
6226                 mask &= ~(1 << category);
6227               }
6228             else
6229               {
6230                 detected |= detected_mask[category];
6231                 if ((*(coding_categories[category].detector)) (&coding, &mask)
6232                     && highest)
6233                   {
6234                     mask &= detected_mask[category];
6235                     break;
6236                   }
6237               }
6238           }
6239     }
6240
6241   if (!mask)
6242     val = Fcons (make_number (coding_category_raw_text), Qnil);
6243   else if (mask == CATEGORY_MASK_ANY)
6244     val = Fcons (make_number (coding_category_undecided), Qnil);
6245   else if (highest)
6246     {
6247       for (i = 0; i < coding_category_raw_text; i++)
6248         if (mask & (1 << coding_priorities[i]))
6249           {
6250             val = Fcons (make_number (coding_priorities[i]), Qnil);
6251             break;
6252           }
6253     }
6254   else
6255     {
6256       val = Qnil;
6257       for (i = coding_category_raw_text - 1; i >= 0; i--)
6258         if (mask & (1 << coding_priorities[i]))
6259           val = Fcons (make_number (coding_priorities[i]), val);
6260     }
6261
6262   {
6263     int one_byte_eol = -1, two_byte_eol = -1;
6264     Lisp_Object tail;
6265
6266     for (tail = val; CONSP (tail); tail = XCDR (tail))
6267       {
6268         struct coding_system *this
6269           = (NILP (coding_system) ? coding_categories + XINT (XCAR (tail))
6270              : &coding);
6271         int this_eol;
6272
6273         attrs = CODING_ID_ATTRS (this->id);
6274         eol_type = CODING_ID_EOL_TYPE (this->id);
6275         XSETCAR (tail, CODING_ID_NAME (this->id));
6276         if (VECTORP (eol_type))
6277           {
6278             if (EQ (CODING_ATTR_TYPE (attrs), Qutf_16))
6279               {
6280                 if (two_byte_eol < 0)
6281                   two_byte_eol = detect_eol (this, coding.source, src_bytes);
6282                 this_eol = two_byte_eol;
6283               }
6284             else
6285               {
6286                 if (one_byte_eol < 0)
6287                   one_byte_eol =detect_eol (this, coding.source, src_bytes);
6288                 this_eol = one_byte_eol;
6289               }
6290             if (this_eol == EOL_SEEN_LF)
6291               XSETCAR (tail, AREF (eol_type, 0));
6292             else if (this_eol == EOL_SEEN_CRLF)
6293               XSETCAR (tail, AREF (eol_type, 1));
6294             else if (this_eol == EOL_SEEN_CR)
6295               XSETCAR (tail, AREF (eol_type, 2));
6296           }
6297       }
6298   }
6299
6300   return (highest ? XCAR (val) : val);
6301 }
6302
6303
6304 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
6305        2, 3, 0,
6306        doc: /* Detect coding system of the text in the region between START and END.
6307 Return a list of possible coding systems ordered by priority.
6308
6309 If only ASCII characters are found, it returns a list of single element
6310 `undecided' or its subsidiary coding system according to a detected
6311 end-of-line format.
6312
6313 If optional argument HIGHEST is non-nil, return the coding system of
6314 highest priority.  */)
6315      (start, end, highest)
6316      Lisp_Object start, end, highest;
6317 {
6318   int from, to;
6319   int from_byte, to_byte;
6320
6321   CHECK_NUMBER_COERCE_MARKER (start);
6322   CHECK_NUMBER_COERCE_MARKER (end);
6323
6324   validate_region (&start, &end);
6325   from = XINT (start), to = XINT (end);
6326   from_byte = CHAR_TO_BYTE (from);
6327   to_byte = CHAR_TO_BYTE (to);
6328
6329   if (from < GPT && to >= GPT)
6330     move_gap_both (to, to_byte);
6331
6332   return detect_coding_system (BYTE_POS_ADDR (from_byte),
6333                                to_byte - from_byte,
6334                                !NILP (highest),
6335                                !NILP (current_buffer
6336                                       ->enable_multibyte_characters),
6337                                Qnil);
6338 }
6339
6340 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
6341        1, 2, 0,
6342        doc: /* Detect coding system of the text in STRING.
6343 Return a list of possible coding systems ordered by priority.
6344
6345 If only ASCII characters are found, it returns a list of single element
6346 `undecided' or its subsidiary coding system according to a detected
6347 end-of-line format.
6348
6349 If optional argument HIGHEST is non-nil, return the coding system of
6350 highest priority.  */)
6351      (string, highest)
6352      Lisp_Object string, highest;
6353 {
6354   CHECK_STRING (string);
6355
6356   return detect_coding_system (XSTRING (string)->data,
6357                                STRING_BYTES (XSTRING (string)),
6358                                !NILP (highest),
6359                                STRING_MULTIBYTE (string),
6360                                Qnil);
6361 }
6362
6363
6364 static INLINE int
6365 char_encodable_p (c, attrs)
6366      int c;
6367      Lisp_Object attrs;
6368 {
6369   Lisp_Object tail;
6370   struct charset *charset;
6371
6372   for (tail = CODING_ATTR_CHARSET_LIST (attrs);
6373        CONSP (tail); tail = XCDR (tail))
6374     {
6375       charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
6376       if (CHAR_CHARSET_P (c, charset))
6377         break;
6378     }
6379   return (! NILP (tail));
6380 }
6381
6382
6383 /* Return a list of coding systems that safely encode the text between
6384    START and END.  If EXCLUDE is non-nil, it is a list of coding
6385    systems not to check.  The returned list doesn't contain any such
6386    coding systems.  In any case, If the text contains only ASCII or is
6387    unibyte, return t.  */
6388
6389 DEFUN ("find-coding-systems-region-internal",
6390        Ffind_coding_systems_region_internal,
6391        Sfind_coding_systems_region_internal, 2, 3, 0,
6392        doc: /* Internal use only.  */)
6393      (start, end, exclude)
6394      Lisp_Object start, end, exclude;
6395 {
6396   Lisp_Object coding_attrs_list, safe_codings;
6397   EMACS_INT start_byte, end_byte;
6398   unsigned char *p, *pbeg, *pend;
6399   int c;
6400   Lisp_Object tail, elt;
6401
6402   if (STRINGP (start))
6403     {
6404       if (!STRING_MULTIBYTE (start)
6405           && XSTRING (start)->size != STRING_BYTES (XSTRING (start)))
6406         return Qt;
6407       start_byte = 0;
6408       end_byte = STRING_BYTES (XSTRING (start));
6409     }
6410   else
6411     {
6412       CHECK_NUMBER_COERCE_MARKER (start);
6413       CHECK_NUMBER_COERCE_MARKER (end);
6414       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
6415         args_out_of_range (start, end);
6416       if (NILP (current_buffer->enable_multibyte_characters))
6417         return Qt;
6418       start_byte = CHAR_TO_BYTE (XINT (start));
6419       end_byte = CHAR_TO_BYTE (XINT (end));
6420       if (XINT (end) - XINT (start) == end_byte - start_byte)
6421         return Qt;
6422
6423       if (start < GPT && end > GPT)
6424         {
6425           if ((GPT - start) < (end - GPT))
6426             move_gap_both (start, start_byte);
6427           else
6428             move_gap_both (end, end_byte);
6429         }
6430     }
6431
6432   coding_attrs_list = Qnil;
6433   for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
6434     if (NILP (exclude)
6435         || NILP (Fmemq (XCAR (tail), exclude)))
6436       {
6437         Lisp_Object attrs;
6438
6439         attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
6440         if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs))
6441             && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
6442           coding_attrs_list = Fcons (attrs, coding_attrs_list);
6443       }
6444
6445   if (STRINGP (start))
6446     p = pbeg = XSTRING (start)->data;
6447   else
6448     p = pbeg = BYTE_POS_ADDR (start_byte);
6449   pend = p + (end_byte - start_byte);
6450
6451   while (p < pend && ASCII_BYTE_P (*p)) p++;
6452   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
6453
6454   while (p < pend)
6455     {
6456       if (ASCII_BYTE_P (*p))
6457         p++;
6458       else
6459         {
6460           c = STRING_CHAR_ADVANCE (p);
6461
6462           charset_map_loaded = 0;
6463           for (tail = coding_attrs_list; CONSP (tail);)
6464             {
6465               elt = XCAR (tail);
6466               if (NILP (elt))
6467                 tail = XCDR (tail);
6468               else if (char_encodable_p (c, elt))
6469                 tail = XCDR (tail);
6470               else if (CONSP (XCDR (tail)))
6471                 {
6472                   XSETCAR (tail, XCAR (XCDR (tail)));
6473                   XSETCDR (tail, XCDR (XCDR (tail)));
6474                 }
6475               else
6476                 {
6477                   XSETCAR (tail, Qnil);
6478                   tail = XCDR (tail);
6479                 }
6480             }
6481           if (charset_map_loaded)
6482             {
6483               EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
6484
6485               if (STRINGP (start))
6486                 pbeg = XSTRING (start)->data;
6487               else
6488                 pbeg = BYTE_POS_ADDR (start_byte);
6489               p = pbeg + p_offset;
6490               pend = pbeg + pend_offset;
6491             }
6492         }
6493     }
6494
6495   safe_codings = Qnil;
6496   for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
6497     if (! NILP (XCAR (tail)))
6498       safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
6499
6500   return safe_codings;
6501 }
6502
6503
6504 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
6505        Scheck_coding_systems_region, 3, 3, 0,
6506        doc: /* Check if the region is encodable by coding systems.
6507
6508 START and END are buffer positions specifying the region.
6509 CODING-SYSTEM-LIST is a list of coding systems to check.
6510
6511 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
6512 CODING-SYSTEM is a member of CODING-SYSTEM-LIst and can't encode the
6513 whole region, POS0, POS1, ... are buffer positions where non-encodable
6514 characters are found.
6515
6516 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
6517 value is nil.
6518
6519 START may be a string.  In that case, check if the string is
6520 encodable, and the value contains indices to the string instead of
6521 buffer positions.  END is ignored.  */)
6522      (start, end, coding_system_list)
6523      Lisp_Object start, end, coding_system_list;
6524 {
6525   Lisp_Object list;
6526   EMACS_INT start_byte, end_byte;
6527   int pos;
6528   unsigned char *p, *pbeg, *pend;
6529   int c;
6530   Lisp_Object tail, elt;
6531
6532   if (STRINGP (start))
6533     {
6534       if (!STRING_MULTIBYTE (start)
6535           && XSTRING (start)->size != STRING_BYTES (XSTRING (start)))
6536         return Qnil;
6537       start_byte = 0;
6538       end_byte = STRING_BYTES (XSTRING (start));
6539       pos = 0;
6540     }
6541   else
6542     {
6543       CHECK_NUMBER_COERCE_MARKER (start);
6544       CHECK_NUMBER_COERCE_MARKER (end);
6545       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
6546         args_out_of_range (start, end);
6547       if (NILP (current_buffer->enable_multibyte_characters))
6548         return Qnil;
6549       start_byte = CHAR_TO_BYTE (XINT (start));
6550       end_byte = CHAR_TO_BYTE (XINT (end));
6551       if (XINT (end) - XINT (start) == end_byte - start_byte)
6552         return Qt;
6553
6554       if (start < GPT && end > GPT)
6555         {
6556           if ((GPT - start) < (end - GPT))
6557             move_gap_both (start, start_byte);
6558           else
6559             move_gap_both (end, end_byte);
6560         }
6561       pos = start;
6562     }
6563
6564   list = Qnil;
6565   for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
6566     {
6567       elt = XCAR (tail);
6568       list = Fcons (Fcons (elt, Fcons (AREF (CODING_SYSTEM_SPEC (elt), 0),
6569                                        Qnil)),
6570                     list);
6571     }
6572
6573   if (STRINGP (start))
6574     p = pbeg = XSTRING (start)->data;
6575   else
6576     p = pbeg = BYTE_POS_ADDR (start_byte);
6577   pend = p + (end_byte - start_byte);
6578
6579   while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
6580   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
6581
6582   while (p < pend)
6583     {
6584       if (ASCII_BYTE_P (*p))
6585         p++;
6586       else
6587         {
6588           c = STRING_CHAR_ADVANCE (p);
6589
6590           charset_map_loaded = 0;
6591           for (tail = list; CONSP (tail); tail = XCDR (tail))
6592             {
6593               elt = XCDR (XCAR (tail));
6594               if (! char_encodable_p (c, XCAR (elt)))
6595                 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
6596             }
6597           if (charset_map_loaded)
6598             {
6599               EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
6600
6601               if (STRINGP (start))
6602                 pbeg = XSTRING (start)->data;
6603               else
6604                 pbeg = BYTE_POS_ADDR (start_byte);
6605               p = pbeg + p_offset;
6606               pend = pbeg + pend_offset;
6607             }
6608         }
6609       pos++;
6610     }
6611
6612   tail = list;
6613   list = Qnil;
6614   for (; CONSP (tail); tail = XCDR (tail))
6615     {
6616       elt = XCAR (tail);
6617       if (CONSP (XCDR (XCDR (elt))))
6618         list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
6619                       list);
6620     }
6621
6622   return list;
6623 }
6624
6625
6626
6627 Lisp_Object
6628 code_convert_region (start, end, coding_system, dst_object, encodep, norecord)
6629      Lisp_Object start, end, coding_system, dst_object;
6630      int encodep, norecord;
6631 {
6632   struct coding_system coding;
6633   EMACS_INT from, from_byte, to, to_byte;
6634   Lisp_Object src_object;
6635
6636   CHECK_NUMBER_COERCE_MARKER (start);
6637   CHECK_NUMBER_COERCE_MARKER (end);
6638   if (NILP (coding_system))
6639     coding_system = Qno_conversion;
6640   else
6641     CHECK_CODING_SYSTEM (coding_system);
6642   src_object = Fcurrent_buffer ();
6643   if (NILP (dst_object))
6644     dst_object = src_object;
6645   else if (! EQ (dst_object, Qt))
6646     CHECK_BUFFER (dst_object);
6647
6648   validate_region (&start, &end);
6649   from = XFASTINT (start);
6650   from_byte = CHAR_TO_BYTE (from);
6651   to = XFASTINT (end);
6652   to_byte = CHAR_TO_BYTE (to);
6653
6654   setup_coding_system (coding_system, &coding);
6655   coding.mode |= CODING_MODE_LAST_BLOCK;
6656
6657   if (encodep)
6658     encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
6659                           dst_object);
6660   else
6661     decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
6662                           dst_object);
6663   if (! norecord)
6664     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
6665
6666   if (coding.result != CODING_RESULT_SUCCESS)
6667     error ("Code conversion error: %d", coding.result);
6668
6669   return (BUFFERP (dst_object)
6670           ? make_number (coding.produced_char)
6671           : coding.dst_object);
6672 }
6673
6674
6675 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
6676        3, 4, "r\nzCoding system: ",
6677        doc: /* Decode the current region from the specified coding system.
6678 When called from a program, takes four arguments:
6679         START, END, CODING-SYSTEM, and DESTINATION.
6680 START and END are buffer positions.
6681
6682 Optional 4th arguments DESTINATION specifies where the decoded text goes.
6683 If nil, the region between START and END is replace by the decoded text.
6684 If buffer, the decoded text is inserted in the buffer.
6685 If t, the decoded text is returned.
6686
6687 This function sets `last-coding-system-used' to the precise coding system
6688 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
6689 not fully specified.)
6690 It returns the length of the decoded text.  */)
6691      (start, end, coding_system, destination)
6692      Lisp_Object start, end, coding_system, destination;
6693 {
6694   return code_convert_region (start, end, coding_system, destination, 0, 0);
6695 }
6696
6697 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
6698        3, 4, "r\nzCoding system: ",
6699        doc: /* Encode the current region by specified coding system.
6700 When called from a program, takes three arguments:
6701 START, END, and CODING-SYSTEM.  START and END are buffer positions.
6702
6703 Optional 4th arguments DESTINATION specifies where the encoded text goes.
6704 If nil, the region between START and END is replace by the encoded text.
6705 If buffer, the encoded text is inserted in the buffer.
6706 If t, the encoded text is returned.
6707
6708 This function sets `last-coding-system-used' to the precise coding system
6709 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
6710 not fully specified.)
6711 It returns the length of the encoded text.  */)
6712   (start, end, coding_system, destination)
6713      Lisp_Object start, end, coding_system, destination;
6714 {
6715   return code_convert_region (start, end, coding_system, destination, 1, 0);
6716 }
6717
6718 Lisp_Object
6719 code_convert_string (string, coding_system, dst_object,
6720                      encodep, nocopy, norecord)
6721      Lisp_Object string, coding_system, dst_object;
6722      int encodep, nocopy, norecord;
6723 {
6724   struct coding_system coding;
6725   EMACS_INT chars, bytes;
6726
6727   CHECK_STRING (string);
6728   if (NILP (coding_system))
6729     {
6730       if (! norecord)
6731         Vlast_coding_system_used = Qno_conversion;
6732       if (NILP (dst_object))
6733         return (nocopy ? Fcopy_sequence (string) : string);
6734     }
6735
6736   if (NILP (coding_system))
6737     coding_system = Qno_conversion;
6738   else
6739     CHECK_CODING_SYSTEM (coding_system);
6740   if (NILP (dst_object))
6741     dst_object = Qt;
6742   else if (! EQ (dst_object, Qt))
6743     CHECK_BUFFER (dst_object);
6744
6745   setup_coding_system (coding_system, &coding);
6746   coding.mode |= CODING_MODE_LAST_BLOCK;
6747   chars = XSTRING (string)->size;
6748   bytes = STRING_BYTES (XSTRING (string));
6749   if (encodep)
6750     encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
6751   else
6752     decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
6753   if (! norecord)
6754     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
6755
6756   if (coding.result != CODING_RESULT_SUCCESS)
6757     error ("Code conversion error: %d", coding.result);
6758
6759   return (BUFFERP (dst_object)
6760           ? make_number (coding.produced_char)
6761           : coding.dst_object);
6762 }
6763
6764
6765 /* Encode or decode STRING according to CODING_SYSTEM.
6766    Do not set Vlast_coding_system_used.
6767
6768    This function is called only from macros DECODE_FILE and
6769    ENCODE_FILE, thus we ignore character composition.  */
6770
6771 Lisp_Object
6772 code_convert_string_norecord (string, coding_system, encodep)
6773      Lisp_Object string, coding_system;
6774      int encodep;
6775 {
6776   return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
6777 }
6778
6779
6780 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
6781        2, 4, 0,
6782        doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
6783
6784 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
6785 if the decoding operation is trivial.
6786
6787 Optional fourth arg BUFFER non-nil meant that the decoded text is
6788 inserted in BUFFER instead of returned as a astring.  In this case,
6789 the return value is BUFFER.
6790
6791 This function sets `last-coding-system-used' to the precise coding system
6792 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
6793 not fully specified.  */)
6794   (string, coding_system, nocopy, buffer)
6795      Lisp_Object string, coding_system, nocopy, buffer;
6796 {
6797   return code_convert_string (string, coding_system, buffer,
6798                               0, ! NILP (nocopy), 0);
6799 }
6800
6801 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
6802        2, 4, 0,
6803        doc: /* Encode STRING to CODING-SYSTEM, and return the result.
6804
6805 Optional third arg NOCOPY non-nil means it is OK to return STRING
6806 itself if the encoding operation is trivial.
6807
6808 Optional fourth arg BUFFER non-nil meant that the encoded text is
6809 inserted in BUFFER instead of returned as a astring.  In this case,
6810 the return value is BUFFER.
6811
6812 This function sets `last-coding-system-used' to the precise coding system
6813 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
6814 not fully specified.)  */)
6815      (string, coding_system, nocopy, buffer)
6816      Lisp_Object string, coding_system, nocopy, buffer;
6817 {
6818   return code_convert_string (string, coding_system, buffer,
6819                               nocopy, ! NILP (nocopy), 1);
6820 }
6821
6822 \f
6823 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
6824        doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
6825 Return the corresponding character.  */)
6826      (code)
6827      Lisp_Object code;
6828 {
6829   Lisp_Object spec, attrs, val;
6830   struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
6831   int c;
6832
6833   CHECK_NATNUM (code);
6834   c = XFASTINT (code);
6835   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
6836   attrs = AREF (spec, 0);
6837
6838   if (ASCII_BYTE_P (c)
6839       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
6840     return code;
6841
6842   val = CODING_ATTR_CHARSET_LIST (attrs);
6843   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
6844   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
6845   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
6846
6847   if (c <= 0x7F)
6848     charset = charset_roman;
6849   else if (c >= 0xA0 && c < 0xDF)
6850     {
6851       charset = charset_kana;
6852       c -= 0x80;
6853     }
6854   else
6855     {
6856       int s1 = c >> 8, s2 = c & 0xFF;
6857
6858       if (s1 < 0x81 || (s1 > 0x9F && s1 < 0xE0) || s1 > 0xEF
6859           || s2 < 0x40 || s2 == 0x7F || s2 > 0xFC)
6860         error ("Invalid code: %d", code);
6861       SJIS_TO_JIS (c);
6862       charset = charset_kanji;
6863     }
6864   c = DECODE_CHAR (charset, c);
6865   if (c < 0)
6866     error ("Invalid code: %d", code);
6867   return make_number (c);
6868 }
6869
6870
6871 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
6872        doc: /* Encode a Japanese character CHAR to shift_jis encoding.
6873 Return the corresponding code in SJIS.  */)
6874      (ch)
6875     Lisp_Object ch;
6876 {
6877   Lisp_Object spec, attrs, charset_list;
6878   int c;
6879   struct charset *charset;
6880   unsigned code;
6881
6882   CHECK_CHARACTER (ch);
6883   c = XFASTINT (ch);
6884   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
6885   attrs = AREF (spec, 0);
6886
6887   if (ASCII_CHAR_P (c)
6888       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
6889     return ch;
6890
6891   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
6892   charset = char_charset (c, charset_list, &code);
6893   if (code == CHARSET_INVALID_CODE (charset))
6894     error ("Can't encode by shift_jis encoding: %d", c);
6895   JIS_TO_SJIS (code);
6896
6897   return make_number (code);
6898 }
6899
6900 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
6901        doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
6902 Return the corresponding character.  */)
6903      (code)
6904      Lisp_Object code;
6905 {
6906   Lisp_Object spec, attrs, val;
6907   struct charset *charset_roman, *charset_big5, *charset;
6908   int c;
6909
6910   CHECK_NATNUM (code);
6911   c = XFASTINT (code);
6912   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
6913   attrs = AREF (spec, 0);
6914
6915   if (ASCII_BYTE_P (c)
6916       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
6917     return code;
6918
6919   val = CODING_ATTR_CHARSET_LIST (attrs);
6920   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
6921   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
6922
6923   if (c <= 0x7F)
6924     charset = charset_roman;
6925   else
6926     {
6927       int b1 = c >> 8, b2 = c & 0x7F;
6928       if (b1 < 0xA1 || b1 > 0xFE
6929           || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
6930         error ("Invalid code: %d", code);
6931       charset = charset_big5;
6932     }
6933   c = DECODE_CHAR (charset, (unsigned )c);
6934   if (c < 0)
6935     error ("Invalid code: %d", code);
6936   return make_number (c);
6937 }
6938
6939 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
6940        doc: /* Encode the Big5 character CHAR to BIG5 coding system.
6941 Return the corresponding character code in Big5.  */)
6942      (ch)
6943      Lisp_Object ch;
6944 {
6945   Lisp_Object spec, attrs, charset_list;
6946   struct charset *charset;
6947   int c;
6948   unsigned code;
6949
6950   CHECK_CHARACTER (ch);
6951   c = XFASTINT (ch);
6952   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
6953   attrs = AREF (spec, 0);
6954   if (ASCII_CHAR_P (c)
6955       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
6956     return ch;
6957
6958   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
6959   charset = char_charset (c, charset_list, &code);
6960   if (code == CHARSET_INVALID_CODE (charset))
6961     error ("Can't encode by Big5 encoding: %d", c);
6962
6963   return make_number (code);
6964 }
6965
6966 \f
6967 DEFUN ("set-terminal-coding-system-internal",
6968        Fset_terminal_coding_system_internal,
6969        Sset_terminal_coding_system_internal, 1, 1, 0,
6970        doc: /* Internal use only.  */)
6971      (coding_system)
6972      Lisp_Object coding_system;
6973 {
6974   CHECK_SYMBOL (coding_system);
6975   setup_coding_system (Fcheck_coding_system (coding_system),
6976                         &terminal_coding);
6977
6978   /* We had better not send unsafe characters to terminal.  */
6979   terminal_coding.mode |= CODING_MODE_SAFE_ENCODING;
6980   /* Characer composition should be disabled.  */
6981   terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
6982   terminal_coding.src_multibyte = 1;
6983   terminal_coding.dst_multibyte = 0;
6984   return Qnil;
6985 }
6986
6987 DEFUN ("set-safe-terminal-coding-system-internal",
6988        Fset_safe_terminal_coding_system_internal,
6989        Sset_safe_terminal_coding_system_internal, 1, 1, 0,
6990        doc: /* Internal use only.  */)
6991      (coding_system)
6992      Lisp_Object coding_system;
6993 {
6994   CHECK_SYMBOL (coding_system);
6995   setup_coding_system (Fcheck_coding_system (coding_system),
6996                        &safe_terminal_coding);
6997   /* Characer composition should be disabled.  */
6998   safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
6999   safe_terminal_coding.src_multibyte = 1;
7000   safe_terminal_coding.dst_multibyte = 0;
7001   return Qnil;
7002 }
7003
7004 DEFUN ("terminal-coding-system",
7005        Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
7006        doc: /* Return coding system specified for terminal output.  */)
7007      ()
7008 {
7009   return CODING_ID_NAME (terminal_coding.id);
7010 }
7011
7012 DEFUN ("set-keyboard-coding-system-internal",
7013        Fset_keyboard_coding_system_internal,
7014        Sset_keyboard_coding_system_internal, 1, 1, 0,
7015        doc: /* Internal use only.  */)
7016      (coding_system)
7017      Lisp_Object coding_system;
7018 {
7019   CHECK_SYMBOL (coding_system);
7020   setup_coding_system (Fcheck_coding_system (coding_system),
7021                        &keyboard_coding);
7022   /* Characer composition should be disabled.  */
7023   keyboard_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7024   return Qnil;
7025 }
7026
7027 DEFUN ("keyboard-coding-system",
7028        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
7029        doc: /* Return coding system specified for decoding keyboard input.  */)
7030      ()
7031 {
7032   return CODING_ID_NAME (keyboard_coding.id);
7033 }
7034
7035 \f
7036 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
7037        Sfind_operation_coding_system,  1, MANY, 0,
7038        doc: /* Choose a coding system for an operation based on the target name.
7039 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
7040 DECODING-SYSTEM is the coding system to use for decoding
7041 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
7042 for encoding (in case OPERATION does encoding).
7043
7044 The first argument OPERATION specifies an I/O primitive:
7045   For file I/O, `insert-file-contents' or `write-region'.
7046   For process I/O, `call-process', `call-process-region', or `start-process'.
7047   For network I/O, `open-network-stream'.
7048
7049 The remaining arguments should be the same arguments that were passed
7050 to the primitive.  Depending on which primitive, one of those arguments
7051 is selected as the TARGET.  For example, if OPERATION does file I/O,
7052 whichever argument specifies the file name is TARGET.
7053
7054 TARGET has a meaning which depends on OPERATION:
7055   For file I/O, TARGET is a file name.
7056   For process I/O, TARGET is a process name.
7057   For network I/O, TARGET is a service name or a port number
7058
7059 This function looks up what specified for TARGET in,
7060 `file-coding-system-alist', `process-coding-system-alist',
7061 or `network-coding-system-alist' depending on OPERATION.
7062 They may specify a coding system, a cons of coding systems,
7063 or a function symbol to call.
7064 In the last case, we call the function with one argument,
7065 which is a list of all the arguments given to this function.
7066
7067 usage: (find-operation-coding-system OPERATION ARGUMENTS ...)  */)
7068      (nargs, args)
7069      int nargs;
7070      Lisp_Object *args;
7071 {
7072   Lisp_Object operation, target_idx, target, val;
7073   register Lisp_Object chain;
7074
7075   if (nargs < 2)
7076     error ("Too few arguments");
7077   operation = args[0];
7078   if (!SYMBOLP (operation)
7079       || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
7080     error ("Invalid first arguement");
7081   if (nargs < 1 + XINT (target_idx))
7082     error ("Too few arguments for operation: %s",
7083            XSYMBOL (operation)->name->data);
7084   target = args[XINT (target_idx) + 1];
7085   if (!(STRINGP (target)
7086         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
7087     error ("Invalid %dth argument", XINT (target_idx) + 1);
7088
7089   chain = ((EQ (operation, Qinsert_file_contents)
7090             || EQ (operation, Qwrite_region))
7091            ? Vfile_coding_system_alist
7092            : (EQ (operation, Qopen_network_stream)
7093               ? Vnetwork_coding_system_alist
7094               : Vprocess_coding_system_alist));
7095   if (NILP (chain))
7096     return Qnil;
7097
7098   for (; CONSP (chain); chain = XCDR (chain))
7099     {
7100       Lisp_Object elt;
7101
7102       elt = XCAR (chain);
7103       if (CONSP (elt)
7104           && ((STRINGP (target)
7105                && STRINGP (XCAR (elt))
7106                && fast_string_match (XCAR (elt), target) >= 0)
7107               || (INTEGERP (target) && EQ (target, XCAR (elt)))))
7108         {
7109           val = XCDR (elt);
7110           /* Here, if VAL is both a valid coding system and a valid
7111              function symbol, we return VAL as a coding system.  */
7112           if (CONSP (val))
7113             return val;
7114           if (! SYMBOLP (val))
7115             return Qnil;
7116           if (! NILP (Fcoding_system_p (val)))
7117             return Fcons (val, val);
7118           if (! NILP (Ffboundp (val)))
7119             {
7120               val = call1 (val, Flist (nargs, args));
7121               if (CONSP (val))
7122                 return val;
7123               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
7124                 return Fcons (val, val);
7125             }
7126           return Qnil;
7127         }
7128     }
7129   return Qnil;
7130 }
7131
7132 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
7133        Sset_coding_system_priority, 1, MANY, 0,
7134        doc: /* Assign higher priority to coding systems given as arguments.
7135 usage: (set-coding-system-priority CODING-SYSTEM ...)  */)
7136      (nargs, args)
7137      int nargs;
7138      Lisp_Object *args;
7139 {
7140   int i, j;
7141   int changed[coding_category_max];
7142   enum coding_category priorities[coding_category_max];
7143
7144   bzero (changed, sizeof changed);
7145
7146   for (i = j = 0; i < nargs; i++)
7147     {
7148       enum coding_category category;
7149       Lisp_Object spec, attrs;
7150
7151       CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
7152       attrs = AREF (spec, 0);
7153       category = XINT (CODING_ATTR_CATEGORY (attrs));
7154       if (changed[category])
7155         /* Ignore this coding system because a coding system of the
7156            same category already had a higher priority.  */
7157         continue;
7158       changed[category] = 1;
7159       priorities[j++] = category;
7160       if (coding_categories[category].id >= 0
7161           && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
7162         setup_coding_system (args[i], &coding_categories[category]);
7163     }
7164
7165   /* Now we have decided top J priorities.  Reflect the order of the
7166      original priorities to the remaining priorities.  */
7167
7168   for (i = j, j = 0; i < coding_category_max; i++, j++)
7169     {
7170       while (j < coding_category_max
7171              && changed[coding_priorities[j]])
7172         j++;
7173       if (j == coding_category_max)
7174         abort ();
7175       priorities[i] = coding_priorities[j];
7176     }
7177
7178   bcopy (priorities, coding_priorities, sizeof priorities);
7179   return Qnil;
7180 }
7181
7182 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
7183        Scoding_system_priority_list, 0, 1, 0,
7184        doc: /* Return a list of coding systems ordered by their priorities.  */)
7185      (highestp)
7186      Lisp_Object highestp;
7187 {
7188   int i;
7189   Lisp_Object val;
7190
7191   for (i = 0, val = Qnil; i < coding_category_max; i++)
7192     {
7193       enum coding_category category = coding_priorities[i];
7194       int id = coding_categories[category].id;
7195       Lisp_Object attrs;
7196
7197       if (id < 0)
7198         continue;
7199       attrs = CODING_ID_ATTRS (id);
7200       if (! NILP (highestp))
7201         return CODING_ATTR_BASE_NAME (attrs);
7202       val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
7203     }
7204   return Fnreverse (val);
7205 }
7206
7207 static Lisp_Object
7208 make_subsidiaries (base)
7209      Lisp_Object base;
7210 {
7211   Lisp_Object subsidiaries;
7212   char *suffixes[] = { "-unix", "-dos", "-mac" };
7213   int base_name_len = STRING_BYTES (XSYMBOL (base)->name);
7214   char *buf = (char *) alloca (base_name_len + 6);
7215   int i;
7216
7217   bcopy (XSYMBOL (base)->name->data, buf, base_name_len);
7218   subsidiaries = Fmake_vector (make_number (3), Qnil);
7219   for (i = 0; i < 3; i++)
7220     {
7221       bcopy (suffixes[i], buf + base_name_len, strlen (suffixes[i]) + 1);
7222       ASET (subsidiaries, i, intern (buf));
7223     }
7224   return subsidiaries;
7225 }
7226
7227
7228 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
7229        Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
7230        doc: /* For internal use only.
7231 usage: (define-coding-system-internal ...)  */)
7232      (nargs, args)
7233      int nargs;
7234      Lisp_Object *args;
7235 {
7236   Lisp_Object name;
7237   Lisp_Object spec_vec;         /* [ ATTRS ALIASE EOL_TYPE ] */
7238   Lisp_Object attrs;            /* Vector of attributes.  */
7239   Lisp_Object eol_type;
7240   Lisp_Object aliases;
7241   Lisp_Object coding_type, charset_list, safe_charsets;
7242   enum coding_category category;
7243   Lisp_Object tail, val;
7244   int max_charset_id = 0;
7245   int i;
7246
7247   if (nargs < coding_arg_max)
7248     goto short_args;
7249
7250   attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
7251
7252   name = args[coding_arg_name];
7253   CHECK_SYMBOL (name);
7254   CODING_ATTR_BASE_NAME (attrs) = name;
7255
7256   val = args[coding_arg_mnemonic];
7257   if (! STRINGP (val))
7258     CHECK_CHARACTER (val);
7259   CODING_ATTR_MNEMONIC (attrs) = val;
7260
7261   coding_type = args[coding_arg_coding_type];
7262   CHECK_SYMBOL (coding_type);
7263   CODING_ATTR_TYPE (attrs) = coding_type;
7264
7265   charset_list = args[coding_arg_charset_list];
7266   if (SYMBOLP (charset_list))
7267     {
7268       if (EQ (charset_list, Qiso_2022))
7269         {
7270           if (! EQ (coding_type, Qiso_2022))
7271             error ("Invalid charset-list");
7272           charset_list = Viso_2022_charset_list;
7273         }
7274       else if (EQ (charset_list, Qemacs_mule))
7275         {
7276           if (! EQ (coding_type, Qemacs_mule))
7277             error ("Invalid charset-list");
7278           charset_list = Vemacs_mule_charset_list;
7279         }
7280       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
7281         if (max_charset_id < XFASTINT (XCAR (tail)))
7282           max_charset_id = XFASTINT (XCAR (tail));
7283     }
7284   else
7285     {
7286       charset_list = Fcopy_sequence (charset_list);
7287       for (tail = charset_list; !NILP (tail); tail = Fcdr (tail))
7288         {
7289           struct charset *charset;
7290
7291           val = Fcar (tail);
7292           CHECK_CHARSET_GET_CHARSET (val, charset);
7293           if (EQ (coding_type, Qiso_2022)
7294               ? CHARSET_ISO_FINAL (charset) < 0
7295               : EQ (coding_type, Qemacs_mule)
7296               ? CHARSET_EMACS_MULE_ID (charset) < 0
7297               : 0)
7298             error ("Can't handle charset `%s'",
7299                    XSYMBOL (CHARSET_NAME (charset))->name->data);
7300
7301           XCAR (tail) = make_number (charset->id);
7302           if (max_charset_id < charset->id)
7303             max_charset_id = charset->id;
7304         }
7305     }
7306   CODING_ATTR_CHARSET_LIST (attrs) = charset_list;
7307
7308   safe_charsets = Fmake_string (make_number (max_charset_id + 1),
7309                                 make_number (255));
7310   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
7311     XSTRING (safe_charsets)->data[XFASTINT (XCAR (tail))] = 0;
7312   CODING_ATTR_SAFE_CHARSETS (attrs) = safe_charsets;
7313
7314   val = args[coding_arg_decode_translation_table];
7315   if (! NILP (val))
7316     CHECK_CHAR_TABLE (val);
7317   CODING_ATTR_DECODE_TBL (attrs) = val;
7318
7319   val = args[coding_arg_encode_translation_table];
7320   if (! NILP (val))
7321     CHECK_CHAR_TABLE (val);
7322   CODING_ATTR_ENCODE_TBL (attrs) = val;
7323
7324   val = args[coding_arg_post_read_conversion];
7325   CHECK_SYMBOL (val);
7326   CODING_ATTR_POST_READ (attrs) = val;
7327
7328   val = args[coding_arg_pre_write_conversion];
7329   CHECK_SYMBOL (val);
7330   CODING_ATTR_PRE_WRITE (attrs) = val;
7331
7332   val = args[coding_arg_default_char];
7333   if (NILP (val))
7334     CODING_ATTR_DEFAULT_CHAR (attrs) = make_number (' ');
7335   else
7336     {
7337       CHECK_CHARACTER (val);
7338       CODING_ATTR_DEFAULT_CHAR (attrs) = val;
7339     }
7340
7341   val = args[coding_arg_plist];
7342   CHECK_LIST (val);
7343   CODING_ATTR_PLIST (attrs) = val;
7344
7345   if (EQ (coding_type, Qcharset))
7346     {
7347       val = Fmake_vector (make_number (256), Qnil);
7348
7349       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
7350         {
7351           struct charset *charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
7352           int idx = (CHARSET_DIMENSION (charset) - 1) * 4;
7353
7354           for (i = charset->code_space[idx];
7355                i <= charset->code_space[idx + 1]; i++)
7356             {
7357               if (NILP (AREF (val, i)))
7358                 ASET (val, i, XCAR (tail));
7359               else
7360                 error ("Charsets conflicts in the first byte");
7361             }
7362         }
7363       ASET (attrs, coding_attr_charset_valids, val);
7364       category = coding_category_charset;
7365     }
7366   else if (EQ (coding_type, Qccl))
7367     {
7368       Lisp_Object valids;
7369
7370       if (nargs < coding_arg_ccl_max)
7371         goto short_args;
7372
7373       val = args[coding_arg_ccl_decoder];
7374       CHECK_CCL_PROGRAM (val);
7375       if (VECTORP (val))
7376         val = Fcopy_sequence (val);
7377       ASET (attrs, coding_attr_ccl_decoder, val);
7378
7379       val = args[coding_arg_ccl_encoder];
7380       CHECK_CCL_PROGRAM (val);
7381       if (VECTORP (val))
7382         val = Fcopy_sequence (val);
7383       ASET (attrs, coding_attr_ccl_encoder, val);
7384
7385       val = args[coding_arg_ccl_valids];
7386       valids = Fmake_string (make_number (256), make_number (0));
7387       for (tail = val; !NILP (tail); tail = Fcdr (tail))
7388         {
7389           val = Fcar (tail);
7390           if (INTEGERP (val))
7391             ASET (valids, XINT (val), 1);
7392           else
7393             {
7394               int from, to;
7395
7396               CHECK_CONS (val);
7397               CHECK_NUMBER (XCAR (val));
7398               CHECK_NUMBER (XCDR (val));
7399               from = XINT (XCAR (val));
7400               to = XINT (XCDR (val));
7401               for (i = from; i <= to; i++)
7402                 ASET (valids, i, 1);
7403             }
7404         }
7405       ASET (attrs, coding_attr_ccl_valids, valids);
7406
7407       category = coding_category_ccl;
7408     }
7409   else if (EQ (coding_type, Qutf_16))
7410     {
7411       Lisp_Object bom, endian;
7412
7413       if (nargs < coding_arg_utf16_max)
7414         goto short_args;
7415
7416       bom = args[coding_arg_utf16_bom];
7417       if (! NILP (bom) && ! EQ (bom, Qt))
7418         {
7419           CHECK_CONS (bom);
7420           CHECK_CODING_SYSTEM (XCAR (bom));
7421           CHECK_CODING_SYSTEM (XCDR (bom));
7422         }
7423       ASET (attrs, coding_attr_utf_16_bom, bom);
7424
7425       endian = args[coding_arg_utf16_endian];
7426       ASET (attrs, coding_attr_utf_16_endian, endian);
7427
7428       category = (CONSP (bom)
7429                   ? coding_category_utf_16_auto
7430                   : NILP (bom)
7431                   ? (NILP (endian)
7432                      ? coding_category_utf_16_be_nosig
7433                      : coding_category_utf_16_le_nosig)
7434                   : (NILP (endian)
7435                      ? coding_category_utf_16_be
7436                      : coding_category_utf_16_le));
7437     }
7438   else if (EQ (coding_type, Qiso_2022))
7439     {
7440       Lisp_Object initial, reg_usage, request, flags;
7441       int i, id;
7442
7443       if (nargs < coding_arg_iso2022_max)
7444         goto short_args;
7445
7446       initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
7447       CHECK_VECTOR (initial);
7448       for (i = 0; i < 4; i++)
7449         {
7450           val = Faref (initial, make_number (i));
7451           if (! NILP (val))
7452             {
7453               CHECK_CHARSET_GET_ID (val, id);
7454               ASET (initial, i, make_number (id));
7455             }
7456           else
7457             ASET (initial, i, make_number (-1));
7458         }
7459
7460       reg_usage = args[coding_arg_iso2022_reg_usage];
7461       CHECK_CONS (reg_usage);
7462       CHECK_NATNUM (XCAR (reg_usage));
7463       CHECK_NATNUM (XCDR (reg_usage));
7464
7465       request = Fcopy_sequence (args[coding_arg_iso2022_request]);
7466       for (tail = request; ! NILP (tail); tail = Fcdr (tail))
7467         {
7468           int id;
7469
7470           val = Fcar (tail);
7471           CHECK_CONS (val);
7472           CHECK_CHARSET_GET_ID (XCAR (val), id);
7473           CHECK_NATNUM (XCDR (val));
7474           if (XINT (XCDR (val)) >= 4)
7475             error ("Invalid graphic register number: %d", XINT (XCDR (val)));
7476           XCAR (val) = make_number (id);
7477         }
7478
7479       flags = args[coding_arg_iso2022_flags];
7480       CHECK_NATNUM (flags);
7481       i = XINT (flags);
7482       if (EQ (args[coding_arg_charset_list], Qiso_2022))
7483         flags = make_number (i | CODING_ISO_FLAG_FULL_SUPPORT);
7484
7485       ASET (attrs, coding_attr_iso_initial, initial);
7486       ASET (attrs, coding_attr_iso_usage, reg_usage);
7487       ASET (attrs, coding_attr_iso_request, request);
7488       ASET (attrs, coding_attr_iso_flags, flags);
7489       setup_iso_safe_charsets (attrs);
7490
7491       if (i & CODING_ISO_FLAG_SEVEN_BITS)
7492         category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
7493                           | CODING_ISO_FLAG_SINGLE_SHIFT))
7494                     ? coding_category_iso_7_else
7495                     : EQ (args[coding_arg_charset_list], Qiso_2022)
7496                     ? coding_category_iso_7
7497                     : coding_category_iso_7_tight);
7498       else
7499         {
7500           int id = XINT (AREF (initial, 1));
7501
7502           category = (((i & (CODING_ISO_FLAG_LOCKING_SHIFT
7503                              | CODING_ISO_FLAG_SINGLE_SHIFT))
7504                        || EQ (args[coding_arg_charset_list], Qiso_2022)
7505                        || id < 0)
7506                       ? coding_category_iso_8_else
7507                       : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
7508                       ? coding_category_iso_8_1
7509                       : coding_category_iso_8_2);
7510         }
7511     }
7512   else if (EQ (coding_type, Qemacs_mule))
7513     {
7514       if (EQ (args[coding_arg_charset_list], Qemacs_mule))
7515         ASET (attrs, coding_attr_emacs_mule_full, Qt);
7516
7517       category = coding_category_emacs_mule;
7518     }
7519   else if (EQ (coding_type, Qshift_jis))
7520     {
7521
7522       struct charset *charset;
7523
7524       if (XINT (Flength (charset_list)) != 3)
7525         error ("There should be just three charsets");
7526
7527       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
7528       if (CHARSET_DIMENSION (charset) != 1)
7529         error ("Dimension of charset %s is not one",
7530                XSYMBOL (CHARSET_NAME (charset))->name->data);
7531
7532       charset_list = XCDR (charset_list);
7533       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
7534       if (CHARSET_DIMENSION (charset) != 1)
7535         error ("Dimension of charset %s is not one",
7536                XSYMBOL (CHARSET_NAME (charset))->name->data);
7537
7538       charset_list = XCDR (charset_list);
7539       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
7540       if (CHARSET_DIMENSION (charset) != 2)
7541         error ("Dimension of charset %s is not two",
7542                XSYMBOL (CHARSET_NAME (charset))->name->data);
7543
7544       category = coding_category_sjis;
7545       Vsjis_coding_system = name;
7546     }
7547   else if (EQ (coding_type, Qbig5))
7548     {
7549       struct charset *charset;
7550
7551       if (XINT (Flength (charset_list)) != 2)
7552         error ("There should be just two charsets");
7553
7554       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
7555       if (CHARSET_DIMENSION (charset) != 1)
7556         error ("Dimension of charset %s is not one",
7557                XSYMBOL (CHARSET_NAME (charset))->name->data);
7558
7559       charset_list = XCDR (charset_list);
7560       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
7561       if (CHARSET_DIMENSION (charset) != 2)
7562         error ("Dimension of charset %s is not two",
7563                XSYMBOL (CHARSET_NAME (charset))->name->data);
7564
7565       category = coding_category_big5;
7566       Vbig5_coding_system = name;
7567     }
7568   else if (EQ (coding_type, Qraw_text))
7569     category = coding_category_raw_text;
7570   else if (EQ (coding_type, Qutf_8))
7571     category = coding_category_utf_8;
7572   else if (EQ (coding_type, Qundecided))
7573     category = coding_category_undecided;
7574   else
7575     error ("Invalid coding system type: %s",
7576            XSYMBOL (coding_type)->name->data);
7577
7578   CODING_ATTR_CATEGORY (attrs) = make_number (category);
7579
7580   eol_type = args[coding_arg_eol_type];
7581   if (! NILP (eol_type)
7582       && ! EQ (eol_type, Qunix)
7583       && ! EQ (eol_type, Qdos)
7584       && ! EQ (eol_type, Qmac))
7585     error ("Invalid eol-type");
7586
7587   aliases = Fcons (name, Qnil);
7588
7589   if (NILP (eol_type))
7590     {
7591       eol_type = make_subsidiaries (name);
7592       for (i = 0; i < 3; i++)
7593         {
7594           Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
7595
7596           this_name = AREF (eol_type, i);
7597           this_aliases = Fcons (this_name, Qnil);
7598           this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
7599           this_spec = Fmake_vector (make_number (3), attrs);
7600           ASET (this_spec, 1, this_aliases);
7601           ASET (this_spec, 2, this_eol_type);
7602           Fputhash (this_name, this_spec, Vcoding_system_hash_table);
7603           Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
7604           Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
7605                                         Vcoding_system_alist);
7606         }
7607     }
7608
7609   spec_vec = Fmake_vector (make_number (3), attrs);
7610   ASET (spec_vec, 1, aliases);
7611   ASET (spec_vec, 2, eol_type);
7612
7613   Fputhash (name, spec_vec, Vcoding_system_hash_table);
7614   Vcoding_system_list = Fcons (name, Vcoding_system_list);
7615   Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
7616                                 Vcoding_system_alist);
7617
7618   {
7619     int id = coding_categories[category].id;
7620
7621     if (id < 0 || EQ (name, CODING_ID_NAME (id)))
7622       setup_coding_system (name, &coding_categories[category]);
7623   }
7624
7625   return Qnil;
7626
7627  short_args:
7628   return Fsignal (Qwrong_number_of_arguments,
7629                   Fcons (intern ("define-coding-system-internal"),
7630                          make_number (nargs)));
7631 }
7632
7633 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
7634        Sdefine_coding_system_alias, 2, 2, 0,
7635        doc: /* Define ALIAS as an alias for CODING-SYSTEM.  */)
7636      (alias, coding_system)
7637      Lisp_Object alias, coding_system;
7638 {
7639   Lisp_Object spec, aliases, eol_type;
7640
7641   CHECK_SYMBOL (alias);
7642   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
7643   aliases = AREF (spec, 1);
7644   while (!NILP (XCDR (aliases)))
7645     aliases = XCDR (aliases);
7646   XCDR (aliases) = Fcons (alias, Qnil);
7647
7648   eol_type = AREF (spec, 2);
7649   if (VECTORP (eol_type))
7650     {
7651       Lisp_Object subsidiaries;
7652       int i;
7653
7654       subsidiaries = make_subsidiaries (alias);
7655       for (i = 0; i < 3; i++)
7656         Fdefine_coding_system_alias (AREF (subsidiaries, i),
7657                                      AREF (eol_type, i));
7658
7659       ASET (spec, 2, subsidiaries);
7660     }
7661
7662   Fputhash (alias, spec, Vcoding_system_hash_table);
7663   Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
7664                                 Vcoding_system_alist);
7665
7666   return Qnil;
7667 }
7668
7669 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
7670        1, 1, 0,
7671        doc: /* Return the base of CODING-SYSTEM.
7672 Any alias or subsidiary coding systems are not base coding system.  */)
7673   (coding_system)
7674      Lisp_Object coding_system;
7675 {
7676   Lisp_Object spec, attrs;
7677
7678   if (NILP (coding_system))
7679     return (Qno_conversion);
7680   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
7681   attrs = AREF (spec, 0);
7682   return CODING_ATTR_BASE_NAME (attrs);
7683 }
7684
7685 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
7686        1, 1, 0,
7687        doc: "Return the property list of CODING-SYSTEM.")
7688      (coding_system)
7689      Lisp_Object coding_system;
7690 {
7691   Lisp_Object spec, attrs;
7692
7693   if (NILP (coding_system))
7694     coding_system = Qno_conversion;
7695   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
7696   attrs = AREF (spec, 0);
7697   return CODING_ATTR_PLIST (attrs);
7698 }
7699
7700
7701 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
7702        1, 1, 0,
7703        doc: /* Return the list of aliases of CODING-SYSTEM.
7704 A base coding system is what made by `define-coding-system'.
7705 Any alias nor subsidiary coding systems are not base coding system.  */)
7706      (coding_system)
7707      Lisp_Object coding_system;
7708 {
7709   Lisp_Object spec;
7710
7711   if (NILP (coding_system))
7712     coding_system = Qno_conversion;
7713   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
7714   return AREF (spec, 2);
7715 }
7716
7717 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
7718        Scoding_system_eol_type, 1, 1, 0,
7719        doc: /* Return eol-type of CODING-SYSTEM.
7720 An eol-type is integer 0, 1, 2, or a vector of coding systems.
7721
7722 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
7723 and CR respectively.
7724
7725 A vector value indicates that a format of end-of-line should be
7726 detected automatically.  Nth element of the vector is the subsidiary
7727 coding system whose eol-type is N.  */)
7728      (coding_system)
7729      Lisp_Object coding_system;
7730 {
7731   Lisp_Object spec, eol_type;
7732   int n;
7733
7734   if (NILP (coding_system))
7735     coding_system = Qno_conversion;
7736   if (! CODING_SYSTEM_P (coding_system))
7737     return Qnil;
7738   spec = CODING_SYSTEM_SPEC (coding_system);
7739   eol_type = AREF (spec, 2);
7740   if (VECTORP (eol_type))
7741     return Fcopy_sequence (eol_type);
7742   n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
7743   return make_number (n);
7744 }
7745
7746 #endif /* emacs */
7747
7748 \f
7749 /*** 9. Post-amble ***/
7750
7751 void
7752 init_coding_once ()
7753 {
7754   int i;
7755
7756   for (i = 0; i < coding_category_max; i++)
7757     {
7758       coding_categories[i].id = -1;
7759       coding_priorities[i] = i;
7760     }
7761
7762   /* ISO2022 specific initialize routine.  */
7763   for (i = 0; i < 0x20; i++)
7764     iso_code_class[i] = ISO_control_0;
7765   for (i = 0x21; i < 0x7F; i++)
7766     iso_code_class[i] = ISO_graphic_plane_0;
7767   for (i = 0x80; i < 0xA0; i++)
7768     iso_code_class[i] = ISO_control_1;
7769   for (i = 0xA1; i < 0xFF; i++)
7770     iso_code_class[i] = ISO_graphic_plane_1;
7771   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
7772   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
7773   iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
7774   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
7775   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
7776   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
7777   iso_code_class[ISO_CODE_ESC] = ISO_escape;
7778   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
7779   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
7780   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
7781
7782   inhibit_pre_post_conversion = 0;
7783
7784   for (i = 0; i < 256; i++)
7785     {
7786       emacs_mule_bytes[i] = 1;
7787     }
7788 }
7789
7790 #ifdef emacs
7791
7792 void
7793 syms_of_coding ()
7794 {
7795   staticpro (&Vcoding_system_hash_table);
7796   Vcoding_system_hash_table = Fmakehash (Qeq);
7797
7798   staticpro (&Vsjis_coding_system);
7799   Vsjis_coding_system = Qnil;
7800
7801   staticpro (&Vbig5_coding_system);
7802   Vbig5_coding_system = Qnil;
7803
7804   staticpro (&Vcode_conversion_work_buf_list);
7805   Vcode_conversion_work_buf_list = Qnil;
7806
7807   staticpro (&Vcode_conversion_reused_work_buf);
7808   Vcode_conversion_reused_work_buf = Qnil;
7809
7810   DEFSYM (Qcharset, "charset");
7811   DEFSYM (Qtarget_idx, "target-idx");
7812   DEFSYM (Qcoding_system_history, "coding-system-history");
7813   Fset (Qcoding_system_history, Qnil);
7814
7815   /* Target FILENAME is the first argument.  */
7816   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
7817   /* Target FILENAME is the third argument.  */
7818   Fput (Qwrite_region, Qtarget_idx, make_number (2));
7819
7820   DEFSYM (Qcall_process, "call-process");
7821   /* Target PROGRAM is the first argument.  */
7822   Fput (Qcall_process, Qtarget_idx, make_number (0));
7823
7824   DEFSYM (Qcall_process_region, "call-process-region");
7825   /* Target PROGRAM is the third argument.  */
7826   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
7827
7828   DEFSYM (Qstart_process, "start-process");
7829   /* Target PROGRAM is the third argument.  */
7830   Fput (Qstart_process, Qtarget_idx, make_number (2));
7831
7832   DEFSYM (Qopen_network_stream, "open-network-stream");
7833   /* Target SERVICE is the fourth argument.  */
7834   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
7835
7836   DEFSYM (Qcoding_system, "coding-system");
7837   DEFSYM (Qcoding_aliases, "coding-aliases");
7838
7839   DEFSYM (Qeol_type, "eol-type");
7840   DEFSYM (Qunix, "unix");
7841   DEFSYM (Qdos, "dos");
7842   DEFSYM (Qmac, "mac");
7843
7844   DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
7845   DEFSYM (Qpost_read_conversion, "post-read-conversion");
7846   DEFSYM (Qpre_write_conversion, "pre-write-conversion");
7847   DEFSYM (Qdefault_char, "default-char");
7848   DEFSYM (Qundecided, "undecided");
7849   DEFSYM (Qno_conversion, "no-conversion");
7850   DEFSYM (Qraw_text, "raw-text");
7851
7852   DEFSYM (Qiso_2022, "iso-2022");
7853
7854   DEFSYM (Qutf_8, "utf-8");
7855
7856   DEFSYM (Qutf_16, "utf-16");
7857   DEFSYM (Qutf_16_be, "utf-16-be");
7858   DEFSYM (Qutf_16_be_nosig, "utf-16-be-nosig");
7859   DEFSYM (Qutf_16_le, "utf-16-l3");
7860   DEFSYM (Qutf_16_le_nosig, "utf-16-le-nosig");
7861   DEFSYM (Qsignature, "signature");
7862   DEFSYM (Qendian, "endian");
7863   DEFSYM (Qbig, "big");
7864   DEFSYM (Qlittle, "little");
7865
7866   DEFSYM (Qshift_jis, "shift-jis");
7867   DEFSYM (Qbig5, "big5");
7868
7869   DEFSYM (Qcoding_system_p, "coding-system-p");
7870
7871   DEFSYM (Qcoding_system_error, "coding-system-error");
7872   Fput (Qcoding_system_error, Qerror_conditions,
7873         Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
7874   Fput (Qcoding_system_error, Qerror_message,
7875         build_string ("Invalid coding system"));
7876
7877   /* Intern this now in case it isn't already done.
7878      Setting this variable twice is harmless.
7879      But don't staticpro it here--that is done in alloc.c.  */
7880   Qchar_table_extra_slots = intern ("char-table-extra-slots");
7881
7882   DEFSYM (Qtranslation_table, "translation-table");
7883   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (1));
7884   DEFSYM (Qtranslation_table_id, "translation-table-id");
7885   DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
7886   DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
7887
7888   DEFSYM (Qvalid_codes, "valid-codes");
7889
7890   DEFSYM (Qemacs_mule, "emacs-mule");
7891
7892   Vcoding_category_table
7893     = Fmake_vector (make_number (coding_category_max), Qnil);
7894   staticpro (&Vcoding_category_table);
7895   /* Followings are target of code detection.  */
7896   ASET (Vcoding_category_table, coding_category_iso_7,
7897         intern ("coding-category-iso-7"));
7898   ASET (Vcoding_category_table, coding_category_iso_7_tight,
7899         intern ("coding-category-iso-7-tight"));
7900   ASET (Vcoding_category_table, coding_category_iso_8_1,
7901         intern ("coding-category-iso-8-1"));
7902   ASET (Vcoding_category_table, coding_category_iso_8_2,
7903         intern ("coding-category-iso-8-2"));
7904   ASET (Vcoding_category_table, coding_category_iso_7_else,
7905         intern ("coding-category-iso-7-else"));
7906   ASET (Vcoding_category_table, coding_category_iso_8_else,
7907         intern ("coding-category-iso-8-else"));
7908   ASET (Vcoding_category_table, coding_category_utf_8,
7909         intern ("coding-category-utf-8"));
7910   ASET (Vcoding_category_table, coding_category_utf_16_be,
7911         intern ("coding-category-utf-16-be"));
7912   ASET (Vcoding_category_table, coding_category_utf_16_le,
7913         intern ("coding-category-utf-16-le"));
7914   ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
7915         intern ("coding-category-utf-16-be-nosig"));
7916   ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
7917         intern ("coding-category-utf-16-le-nosig"));
7918   ASET (Vcoding_category_table, coding_category_charset,
7919         intern ("coding-category-charset"));
7920   ASET (Vcoding_category_table, coding_category_sjis,
7921         intern ("coding-category-sjis"));
7922   ASET (Vcoding_category_table, coding_category_big5,
7923         intern ("coding-category-big5"));
7924   ASET (Vcoding_category_table, coding_category_ccl,
7925         intern ("coding-category-ccl"));
7926   ASET (Vcoding_category_table, coding_category_emacs_mule,
7927         intern ("coding-category-emacs-mule"));
7928   /* Followings are NOT target of code detection.  */
7929   ASET (Vcoding_category_table, coding_category_raw_text,
7930         intern ("coding-category-raw-text"));
7931   ASET (Vcoding_category_table, coding_category_undecided,
7932         intern ("coding-category-undecided"));
7933
7934   defsubr (&Scoding_system_p);
7935   defsubr (&Sread_coding_system);
7936   defsubr (&Sread_non_nil_coding_system);
7937   defsubr (&Scheck_coding_system);
7938   defsubr (&Sdetect_coding_region);
7939   defsubr (&Sdetect_coding_string);
7940   defsubr (&Sfind_coding_systems_region_internal);
7941   defsubr (&Scheck_coding_systems_region);
7942   defsubr (&Sdecode_coding_region);
7943   defsubr (&Sencode_coding_region);
7944   defsubr (&Sdecode_coding_string);
7945   defsubr (&Sencode_coding_string);
7946   defsubr (&Sdecode_sjis_char);
7947   defsubr (&Sencode_sjis_char);
7948   defsubr (&Sdecode_big5_char);
7949   defsubr (&Sencode_big5_char);
7950   defsubr (&Sset_terminal_coding_system_internal);
7951   defsubr (&Sset_safe_terminal_coding_system_internal);
7952   defsubr (&Sterminal_coding_system);
7953   defsubr (&Sset_keyboard_coding_system_internal);
7954   defsubr (&Skeyboard_coding_system);
7955   defsubr (&Sfind_operation_coding_system);
7956   defsubr (&Sset_coding_system_priority);
7957   defsubr (&Sdefine_coding_system_internal);
7958   defsubr (&Sdefine_coding_system_alias);
7959   defsubr (&Scoding_system_base);
7960   defsubr (&Scoding_system_plist);
7961   defsubr (&Scoding_system_aliases);
7962   defsubr (&Scoding_system_eol_type);
7963   defsubr (&Scoding_system_priority_list);
7964
7965   DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
7966                doc: /* List of coding systems.
7967
7968 Do not alter the value of this variable manually.  This variable should be
7969 updated by the functions `define-coding-system' and
7970 `define-coding-system-alias'.  */);
7971   Vcoding_system_list = Qnil;
7972
7973   DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
7974                doc: /* Alist of coding system names.
7975 Each element is one element list of coding system name.
7976 This variable is given to `completing-read' as TABLE argument.
7977
7978 Do not alter the value of this variable manually.  This variable should be
7979 updated by the functions `make-coding-system' and
7980 `define-coding-system-alias'.  */);
7981   Vcoding_system_alist = Qnil;
7982
7983   DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
7984                doc: /* List of coding-categories (symbols) ordered by priority.
7985
7986 On detecting a coding system, Emacs tries code detection algorithms
7987 associated with each coding-category one by one in this order.  When
7988 one algorithm agrees with a byte sequence of source text, the coding
7989 system bound to the corresponding coding-category is selected.  */);
7990   {
7991     int i;
7992
7993     Vcoding_category_list = Qnil;
7994     for (i = coding_category_max - 1; i >= 0; i--)
7995       Vcoding_category_list
7996         = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
7997                  Vcoding_category_list);
7998   }
7999
8000   DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
8001                doc: /* Specify the coding system for read operations.
8002 It is useful to bind this variable with `let', but do not set it globally.
8003 If the value is a coding system, it is used for decoding on read operation.
8004 If not, an appropriate element is used from one of the coding system alists:
8005 There are three such tables, `file-coding-system-alist',
8006 `process-coding-system-alist', and `network-coding-system-alist'.  */);
8007   Vcoding_system_for_read = Qnil;
8008
8009   DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
8010                doc: /* Specify the coding system for write operations.
8011 Programs bind this variable with `let', but you should not set it globally.
8012 If the value is a coding system, it is used for encoding of output,
8013 when writing it to a file and when sending it to a file or subprocess.
8014
8015 If this does not specify a coding system, an appropriate element
8016 is used from one of the coding system alists:
8017 There are three such tables, `file-coding-system-alist',
8018 `process-coding-system-alist', and `network-coding-system-alist'.
8019 For output to files, if the above procedure does not specify a coding system,
8020 the value of `buffer-file-coding-system' is used.  */);
8021   Vcoding_system_for_write = Qnil;
8022
8023   DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
8024                doc: /*
8025 Coding system used in the latest file or process I/O.  */);
8026   Vlast_coding_system_used = Qnil;
8027
8028   DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
8029                doc: /*
8030 *Non-nil means always inhibit code conversion of end-of-line format.
8031 See info node `Coding Systems' and info node `Text and Binary' concerning
8032 such conversion.  */);
8033   inhibit_eol_conversion = 0;
8034
8035   DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
8036                doc: /*
8037 Non-nil means process buffer inherits coding system of process output.
8038 Bind it to t if the process output is to be treated as if it were a file
8039 read from some filesystem.  */);
8040   inherit_process_coding_system = 0;
8041
8042   DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
8043                doc: /*
8044 Alist to decide a coding system to use for a file I/O operation.
8045 The format is ((PATTERN . VAL) ...),
8046 where PATTERN is a regular expression matching a file name,
8047 VAL is a coding system, a cons of coding systems, or a function symbol.
8048 If VAL is a coding system, it is used for both decoding and encoding
8049 the file contents.
8050 If VAL is a cons of coding systems, the car part is used for decoding,
8051 and the cdr part is used for encoding.
8052 If VAL is a function symbol, the function must return a coding system
8053 or a cons of coding systems which are used as above.  The function gets
8054 the arguments with which `find-operation-coding-systems' was called.
8055
8056 See also the function `find-operation-coding-system'
8057 and the variable `auto-coding-alist'.  */);
8058   Vfile_coding_system_alist = Qnil;
8059
8060   DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
8061                doc: /*
8062 Alist to decide a coding system to use for a process I/O operation.
8063 The format is ((PATTERN . VAL) ...),
8064 where PATTERN is a regular expression matching a program name,
8065 VAL is a coding system, a cons of coding systems, or a function symbol.
8066 If VAL is a coding system, it is used for both decoding what received
8067 from the program and encoding what sent to the program.
8068 If VAL is a cons of coding systems, the car part is used for decoding,
8069 and the cdr part is used for encoding.
8070 If VAL is a function symbol, the function must return a coding system
8071 or a cons of coding systems which are used as above.
8072
8073 See also the function `find-operation-coding-system'.  */);
8074   Vprocess_coding_system_alist = Qnil;
8075
8076   DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
8077                doc: /*
8078 Alist to decide a coding system to use for a network I/O operation.
8079 The format is ((PATTERN . VAL) ...),
8080 where PATTERN is a regular expression matching a network service name
8081 or is a port number to connect to,
8082 VAL is a coding system, a cons of coding systems, or a function symbol.
8083 If VAL is a coding system, it is used for both decoding what received
8084 from the network stream and encoding what sent to the network stream.
8085 If VAL is a cons of coding systems, the car part is used for decoding,
8086 and the cdr part is used for encoding.
8087 If VAL is a function symbol, the function must return a coding system
8088 or a cons of coding systems which are used as above.
8089
8090 See also the function `find-operation-coding-system'.  */);
8091   Vnetwork_coding_system_alist = Qnil;
8092
8093   DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
8094                doc: /* Coding system to use with system messages.
8095 Also used for decoding keyboard input on X Window system.  */);
8096   Vlocale_coding_system = Qnil;
8097
8098   /* The eol mnemonics are reset in startup.el system-dependently.  */
8099   DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
8100                doc: /*
8101 *String displayed in mode line for UNIX-like (LF) end-of-line format.  */);
8102   eol_mnemonic_unix = build_string (":");
8103
8104   DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
8105                doc: /*
8106 *String displayed in mode line for DOS-like (CRLF) end-of-line format.  */);
8107   eol_mnemonic_dos = build_string ("\\");
8108
8109   DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
8110                doc: /*
8111 *String displayed in mode line for MAC-like (CR) end-of-line format.  */);
8112   eol_mnemonic_mac = build_string ("/");
8113
8114   DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
8115                doc: /*
8116 *String displayed in mode line when end-of-line format is not yet determined.  */);
8117   eol_mnemonic_undecided = build_string (":");
8118
8119   DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
8120                doc: /*
8121 *Non-nil enables character translation while encoding and decoding.  */);
8122   Venable_character_translation = Qt;
8123
8124   DEFVAR_LISP ("standard-translation-table-for-decode",
8125                &Vstandard_translation_table_for_decode,
8126                doc: /* Table for translating characters while decoding.  */);
8127   Vstandard_translation_table_for_decode = Qnil;
8128
8129   DEFVAR_LISP ("standard-translation-table-for-encode",
8130                &Vstandard_translation_table_for_encode,
8131                doc: /* Table for translating characters while encoding.  */);
8132   Vstandard_translation_table_for_encode = Qnil;
8133
8134   DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_table,
8135                doc: /* Alist of charsets vs revision numbers.
8136 While encoding, if a charset (car part of an element) is found,
8137 designate it with the escape sequence identifying revision (cdr part
8138 of the element).  */);
8139   Vcharset_revision_table = Qnil;
8140
8141   DEFVAR_LISP ("default-process-coding-system",
8142                &Vdefault_process_coding_system,
8143                doc: /* Cons of coding systems used for process I/O by default.
8144 The car part is used for decoding a process output,
8145 the cdr part is used for encoding a text to be sent to a process.  */);
8146   Vdefault_process_coding_system = Qnil;
8147
8148   DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
8149                doc: /*
8150 Table of extra Latin codes in the range 128..159 (inclusive).
8151 This is a vector of length 256.
8152 If Nth element is non-nil, the existence of code N in a file
8153 \(or output of subprocess) doesn't prevent it to be detected as
8154 a coding system of ISO 2022 variant which has a flag
8155 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
8156 or reading output of a subprocess.
8157 Only 128th through 159th elements has a meaning.  */);
8158   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
8159
8160   DEFVAR_LISP ("select-safe-coding-system-function",
8161                &Vselect_safe_coding_system_function,
8162                doc: /*
8163 Function to call to select safe coding system for encoding a text.
8164
8165 If set, this function is called to force a user to select a proper
8166 coding system which can encode the text in the case that a default
8167 coding system used in each operation can't encode the text.
8168
8169 The default value is `select-safe-coding-system' (which see).  */);
8170   Vselect_safe_coding_system_function = Qnil;
8171
8172   DEFVAR_BOOL ("inhibit-iso-escape-detection",
8173                &inhibit_iso_escape_detection,
8174                doc: /*
8175 If non-nil, Emacs ignores ISO2022's escape sequence on code detection.
8176
8177 By default, on reading a file, Emacs tries to detect how the text is
8178 encoded.  This code detection is sensitive to escape sequences.  If
8179 the sequence is valid as ISO2022, the code is determined as one of
8180 the ISO2022 encodings, and the file is decoded by the corresponding
8181 coding system (e.g. `iso-2022-7bit').
8182
8183 However, there may be a case that you want to read escape sequences in
8184 a file as is.  In such a case, you can set this variable to non-nil.
8185 Then, as the code detection ignores any escape sequences, no file is
8186 detected as encoded in some ISO2022 encoding.  The result is that all
8187 escape sequences become visible in a buffer.
8188
8189 The default value is nil, and it is strongly recommended not to change
8190 it.  That is because many Emacs Lisp source files that contain
8191 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
8192 in Emacs's distribution, and they won't be decoded correctly on
8193 reading if you suppress escape sequence detection.
8194
8195 The other way to read escape sequences in a file without decoding is
8196 to explicitly specify some coding system that doesn't use ISO2022's
8197 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument].  */);
8198   inhibit_iso_escape_detection = 0;
8199
8200   {
8201     Lisp_Object args[coding_arg_max];
8202     Lisp_Object plist[14];
8203     int i;
8204
8205     for (i = 0; i < coding_arg_max; i++)
8206       args[i] = Qnil;
8207
8208     plist[0] = intern (":name");
8209     plist[1] = args[coding_arg_name] = Qno_conversion;
8210     plist[2] = intern (":mnemonic");
8211     plist[3] = args[coding_arg_mnemonic] = make_number ('=');
8212     plist[4] = intern (":coding-type");
8213     plist[5] = args[coding_arg_coding_type] = Qraw_text;
8214     plist[6] = intern (":ascii-compatible-p");
8215     plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
8216     plist[8] = intern (":default-char");
8217     plist[9] = args[coding_arg_default_char] = make_number (0);
8218     plist[10] = intern (":docstring");
8219     plist[11] = build_string ("Do no conversion.\n\
8220 \n\
8221 When you visit a file with this coding, the file is read into a\n\
8222 unibyte buffer as is, thus each byte of a file is treated as a\n\
8223 character.");
8224     plist[12] = intern (":eol-type");
8225     plist[13] = args[coding_arg_eol_type] = Qunix;
8226     args[coding_arg_plist] = Flist (14, plist);
8227     Fdefine_coding_system_internal (coding_arg_max, args);
8228   }
8229
8230   setup_coding_system (Qno_conversion, &keyboard_coding);
8231   setup_coding_system (Qno_conversion, &terminal_coding);
8232   setup_coding_system (Qno_conversion, &safe_terminal_coding);
8233 }
8234
8235 char *
8236 emacs_strerror (error_number)
8237      int error_number;
8238 {
8239   char *str;
8240
8241   synchronize_system_messages_locale ();
8242   str = strerror (error_number);
8243
8244   if (! NILP (Vlocale_coding_system))
8245     {
8246       Lisp_Object dec = code_convert_string_norecord (build_string (str),
8247                                                       Vlocale_coding_system,
8248                                                       0);
8249       str = (char *) XSTRING (dec)->data;
8250     }
8251
8252   return str;
8253 }
8254
8255 #endif /* emacs */