*** empty log message ***
[bpt/emacs.git] / src / coding.c
CommitLineData
4ed46869 1/* Coding system handler (conversion, detection, etc.).
ff955d90 2 Copyright (C) 1995, 1997, 1998, 2002 Electrotechnical Laboratory, JAPAN.
203cb916 3 Licensed to the Free Software Foundation.
ea9d458b 4 Copyright (C) 2001,2002 Free Software Foundation, Inc.
4ed46869 5
369314dc
KH
6This file is part of GNU Emacs.
7
8GNU Emacs is free software; you can redistribute it and/or modify
9it under the terms of the GNU General Public License as published by
10the Free Software Foundation; either version 2, or (at your option)
11any later version.
4ed46869 12
369314dc
KH
13GNU Emacs is distributed in the hope that it will be useful,
14but WITHOUT ANY WARRANTY; without even the implied warranty of
15MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16GNU General Public License for more details.
4ed46869 17
369314dc
KH
18You should have received a copy of the GNU General Public License
19along with GNU Emacs; see the file COPYING. If not, write to
20the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
21Boston, MA 02111-1307, USA. */
4ed46869
KH
22
23/*** TABLE OF CONTENTS ***
24
b73bfc1c 25 0. General comments
4ed46869 26 1. Preamble
0ef69138 27 2. Emacs' internal format (emacs-mule) handlers
4ed46869
KH
28 3. ISO2022 handlers
29 4. Shift-JIS and BIG5 handlers
1397dc18
KH
30 5. CCL handlers
31 6. End-of-line handlers
32 7. C library functions
33 8. Emacs Lisp library functions
34 9. Post-amble
4ed46869
KH
35
36*/
37
b73bfc1c
KH
38/*** 0. General comments ***/
39
40
cfb43547 41/*** GENERAL NOTE on CODING SYSTEMS ***
4ed46869 42
cfb43547 43 A coding system is an encoding mechanism for one or more character
4ed46869
KH
44 sets. Here's a list of coding systems which Emacs can handle. When
45 we say "decode", it means converting some other coding system to
cfb43547 46 Emacs' internal format (emacs-mule), and when we say "encode",
0ef69138
KH
47 it means converting the coding system emacs-mule to some other
48 coding system.
4ed46869 49
0ef69138 50 0. Emacs' internal format (emacs-mule)
4ed46869 51
cfb43547 52 Emacs itself holds a multi-lingual character in buffers and strings
f4dee582 53 in a special format. Details are described in section 2.
4ed46869
KH
54
55 1. ISO2022
56
57 The most famous coding system for multiple character sets. X's
f4dee582
RS
58 Compound Text, various EUCs (Extended Unix Code), and coding
59 systems used in Internet communication such as ISO-2022-JP are
60 all variants of ISO2022. Details are described in section 3.
4ed46869
KH
61
62 2. SJIS (or Shift-JIS or MS-Kanji-Code)
93dec019 63
4ed46869
KH
64 A coding system to encode character sets: ASCII, JISX0201, and
65 JISX0208. Widely used for PC's in Japan. Details are described in
f4dee582 66 section 4.
4ed46869
KH
67
68 3. BIG5
69
cfb43547
DL
70 A coding system to encode the character sets ASCII and Big5. Widely
71 used for Chinese (mainly in Taiwan and Hong Kong). Details are
f4dee582
RS
72 described in section 4. In this file, when we write "BIG5"
73 (all uppercase), we mean the coding system, and when we write
74 "Big5" (capitalized), we mean the character set.
4ed46869 75
27901516
KH
76 4. Raw text
77
cfb43547
DL
78 A coding system for text containing random 8-bit code. Emacs does
79 no code conversion on such text except for end-of-line format.
27901516
KH
80
81 5. Other
4ed46869 82
cfb43547
DL
83 If a user wants to read/write text encoded in a coding system not
84 listed above, he can supply a decoder and an encoder for it as CCL
4ed46869
KH
85 (Code Conversion Language) programs. Emacs executes the CCL program
86 while reading/writing.
87
d46c5b12
KH
88 Emacs represents a coding system by a Lisp symbol that has a property
89 `coding-system'. But, before actually using the coding system, the
4ed46869 90 information about it is set in a structure of type `struct
f4dee582 91 coding_system' for rapid processing. See section 6 for more details.
4ed46869
KH
92
93*/
94
95/*** GENERAL NOTES on END-OF-LINE FORMAT ***
96
cfb43547
DL
97 How end-of-line of text is encoded depends on the operating system.
98 For instance, Unix's format is just one byte of `line-feed' code,
f4dee582 99 whereas DOS's format is a two-byte sequence of `carriage-return' and
d46c5b12
KH
100 `line-feed' codes. MacOS's format is usually one byte of
101 `carriage-return'.
4ed46869 102
cfb43547
DL
103 Since text character encoding and end-of-line encoding are
104 independent, any coding system described above can have any
105 end-of-line format. So Emacs has information about end-of-line
106 format in each coding-system. See section 6 for more details.
4ed46869
KH
107
108*/
109
110/*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
111
112 These functions check if a text between SRC and SRC_END is encoded
113 in the coding system category XXX. Each returns an integer value in
cfb43547 114 which appropriate flag bits for the category XXX are set. The flag
4ed46869 115 bits are defined in macros CODING_CATEGORY_MASK_XXX. Below is the
cfb43547 116 template for these functions. If MULTIBYTEP is nonzero, 8-bit codes
0a28aafb 117 of the range 0x80..0x9F are in multibyte form. */
4ed46869
KH
118#if 0
119int
0a28aafb 120detect_coding_emacs_mule (src, src_end, multibytep)
4ed46869 121 unsigned char *src, *src_end;
0a28aafb 122 int multibytep;
4ed46869
KH
123{
124 ...
125}
126#endif
127
128/*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
129
b73bfc1c
KH
130 These functions decode SRC_BYTES length of unibyte text at SOURCE
131 encoded in CODING to Emacs' internal format. The resulting
132 multibyte text goes to a place pointed to by DESTINATION, the length
133 of which should not exceed DST_BYTES.
d46c5b12 134
cfb43547
DL
135 These functions set the information about original and decoded texts
136 in the members `produced', `produced_char', `consumed', and
137 `consumed_char' of the structure *CODING. They also set the member
138 `result' to one of CODING_FINISH_XXX indicating how the decoding
139 finished.
d46c5b12 140
cfb43547 141 DST_BYTES zero means that the source area and destination area are
d46c5b12 142 overlapped, which means that we can produce a decoded text until it
cfb43547 143 reaches the head of the not-yet-decoded source text.
d46c5b12 144
cfb43547 145 Below is a template for these functions. */
4ed46869 146#if 0
b73bfc1c 147static void
d46c5b12 148decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
4ed46869
KH
149 struct coding_system *coding;
150 unsigned char *source, *destination;
151 int src_bytes, dst_bytes;
4ed46869
KH
152{
153 ...
154}
155#endif
156
157/*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
158
cfb43547 159 These functions encode SRC_BYTES length text at SOURCE from Emacs'
b73bfc1c
KH
160 internal multibyte format to CODING. The resulting unibyte text
161 goes to a place pointed to by DESTINATION, the length of which
162 should not exceed DST_BYTES.
d46c5b12 163
cfb43547
DL
164 These functions set the information about original and encoded texts
165 in the members `produced', `produced_char', `consumed', and
166 `consumed_char' of the structure *CODING. They also set the member
167 `result' to one of CODING_FINISH_XXX indicating how the encoding
168 finished.
d46c5b12 169
cfb43547
DL
170 DST_BYTES zero means that the source area and destination area are
171 overlapped, which means that we can produce encoded text until it
172 reaches the head of the not-yet-encoded source text.
d46c5b12 173
cfb43547 174 Below is a template for these functions. */
4ed46869 175#if 0
b73bfc1c 176static void
d46c5b12 177encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
4ed46869
KH
178 struct coding_system *coding;
179 unsigned char *source, *destination;
180 int src_bytes, dst_bytes;
4ed46869
KH
181{
182 ...
183}
184#endif
185
186/*** COMMONLY USED MACROS ***/
187
b73bfc1c
KH
/* The following two macros ONE_MORE_BYTE and TWO_MORE_BYTES safely
   get one and two bytes from the source text respectively.  If there
   are not enough bytes in the source, they set `coding->result' to
   CODING_FINISH_INSUFFICIENT_SRC and jump to `label_end_of_loop'.
   The caller should set variables `coding', `src' and `src_end' to
   appropriate pointers in advance.  These macros are called from
   decoding routines `decode_coding_XXX', thus it is assumed that the
   source text is unibyte.  */

#define ONE_MORE_BYTE(c1)                                       \
  do {                                                          \
    if (src < src_end)                                          \
      c1 = *src++;                                              \
    else                                                        \
      {                                                         \
        coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
        goto label_end_of_loop;                                 \
      }                                                         \
  } while (0)
205
b73bfc1c
KH
/* Fetch the next two source bytes into C1 and C2, advancing `src' by
   two.  If fewer than two bytes remain before `src_end', consume
   nothing, set `coding->result' to CODING_FINISH_INSUFFICIENT_SRC and
   jump to `label_end_of_loop'.  */

#define TWO_MORE_BYTES(c1, c2)                                  \
  do {                                                          \
    if (src + 1 < src_end)                                      \
      {                                                         \
        c1 = *src++;                                            \
        c2 = *src++;                                            \
      }                                                         \
    else                                                        \
      {                                                         \
        coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
        goto label_end_of_loop;                                 \
      }                                                         \
  } while (0)
216
4ed46869 217
0a28aafb
KH
/* Like ONE_MORE_BYTE, but 8-bit bytes of data at SRC are in multibyte
   form if MULTIBYTEP is nonzero.  In that case a byte equal to
   LEADING_CODE_8_BIT_CONTROL is followed by (8-bit code + 0x20), and
   C1 is set to the original 8-bit code.

   If the source ends before the first byte, or between the leading
   code and its following byte, set `coding->result' to
   CODING_FINISH_INSUFFICIENT_SRC and jump to `label_end_of_loop'
   rather than reading beyond `src_end'.  */

#define ONE_MORE_BYTE_CHECK_MULTIBYTE(c1, multibytep)           \
  do {                                                          \
    if (src >= src_end)                                         \
      {                                                         \
        coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
        goto label_end_of_loop;                                 \
      }                                                         \
    c1 = *src++;                                                \
    if ((multibytep) && c1 == LEADING_CODE_8_BIT_CONTROL)       \
      {                                                         \
        if (src >= src_end)                                     \
          {                                                     \
            coding->result = CODING_FINISH_INSUFFICIENT_SRC;    \
            goto label_end_of_loop;                             \
          }                                                     \
        c1 = *src++ - 0x20;                                     \
      }                                                         \
  } while (0)
232
b73bfc1c
KH
/* Set C to the next character of the source text at `src' and advance
   `src' past it.  When no source is left, set `coding->result' to
   CODING_FINISH_INSUFFICIENT_SRC and jump to `label_end_of_loop'.
   The caller must have set up `coding', `src', `src_end' and
   `translation_table'.

   This macro is used in encoding routines `encode_coding_XXX', so the
   source is taken to be in multibyte form, except possibly for 8-bit
   characters: those are in multibyte form when coding->src_multibyte
   is nonzero, otherwise each is a single byte.  If
   `translation_table' is non-nil, C is translated through it.  */

#define ONE_MORE_CHAR(c)                                        \
  do {                                                          \
    int len = src_end - src;                                    \
    int bytes;                                                  \
    if (len <= 0)                                               \
      {                                                         \
        coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
        goto label_end_of_loop;                                 \
      }                                                         \
    if (! coding->src_multibyte                                 \
        && ! UNIBYTE_STR_AS_MULTIBYTE_P (src, len, bytes))      \
      {                                                         \
        c = *src;                                               \
        bytes = 1;                                              \
      }                                                         \
    else                                                        \
      c = STRING_CHAR_AND_LENGTH (src, len, bytes);             \
    if (! NILP (translation_table))                             \
      c = translate_char (translation_table, c, -1, 0, 0);      \
    src += bytes;                                               \
  } while (0)
261
4ed46869 262
/* Produce the multibyte form of character C at `dst'.  If there is
   not enough room at `dst', set `coding->result' to
   CODING_FINISH_INSUFFICIENT_DST and jump to `label_end_of_loop'.
   The room available ends at `dst_end' when `dst_bytes' is nonzero,
   and at `src' otherwise.

   While a composition sequence is being decoded, C may instead (or
   in addition) be a component of the current composition; it is then
   recorded in coding->cmp_data->data through
   CODING_ADD_COMPOSITION_COMPONENT, and
   coding->composition_rule_follows is updated.

   This macro is used in decoding routines.  */

#define EMIT_CHAR(c)                                                    \
  do {                                                                  \
    if (! COMPOSING_P (coding)                                          \
        || coding->composing == COMPOSITION_RELATIVE                    \
        || coding->composing == COMPOSITION_WITH_RULE)                  \
      {                                                                 \
        int bytes = CHAR_BYTES (c);                                     \
        if (dst_bytes                                                   \
            ? (dst + bytes) > dst_end                                   \
            : (dst + bytes) > src)                                      \
          {                                                             \
            coding->result = CODING_FINISH_INSUFFICIENT_DST;            \
            goto label_end_of_loop;                                     \
          }                                                             \
        dst += CHAR_STRING (c, dst);                                    \
        coding->produced_char++;                                        \
      }                                                                 \
                                                                        \
    if (COMPOSING_P (coding)                                            \
        && coding->composing != COMPOSITION_RELATIVE)                   \
      {                                                                 \
        CODING_ADD_COMPOSITION_COMPONENT (coding, c);                   \
        coding->composition_rule_follows                                \
          = (coding->composing != COMPOSITION_WITH_ALTCHARS);           \
      }                                                                 \
  } while (0)
297
4ed46869 298
b73bfc1c
KH
/* Store the byte C at `dst' and advance `dst'.  If `dst' has already
   reached the end of the available room (`dst_end' when `dst_bytes'
   is nonzero, `src' otherwise), set `coding->result' to
   CODING_FINISH_INSUFFICIENT_DST and jump to `label_end_of_loop'.  */

#define EMIT_ONE_BYTE(c)                                        \
  do {                                                          \
    if (dst < (dst_bytes ? dst_end : src))                      \
      *dst++ = c;                                               \
    else                                                        \
      {                                                         \
        coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
        goto label_end_of_loop;                                 \
      }                                                         \
  } while (0)
308
/* Store the bytes C1 and C2, in that order, at `dst' and advance
   `dst' by two.  If fewer than two bytes of room remain (the room
   ends at `dst_end' when `dst_bytes' is nonzero, at `src' otherwise),
   store nothing, set `coding->result' to
   CODING_FINISH_INSUFFICIENT_DST and jump to `label_end_of_loop'.  */

#define EMIT_TWO_BYTES(c1, c2)                                  \
  do {                                                          \
    if (dst + 2 > (dst_bytes ? dst_end : src))                  \
      {                                                         \
        coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
        goto label_end_of_loop;                                 \
      }                                                         \
    *dst++ = c1;                                                \
    *dst++ = c2;                                                \
  } while (0)
318
/* Copy the bytes in the range [FROM, TO) to `dst', advancing both
   FROM and `dst'.  FROM must be a modifiable lvalue; it is left equal
   to TO on success.  If the whole range does not fit in the available
   room (which ends at `dst_end' when `dst_bytes' is nonzero, at `src'
   otherwise), copy nothing, set `coding->result' to
   CODING_FINISH_INSUFFICIENT_DST and jump to `label_end_of_loop'.

   FROM and TO are parenthesized in the expansion so that arguments
   such as `*pp' or a conditional expression are handled correctly.  */

#define EMIT_BYTES(from, to)                                            \
  do {                                                                  \
    if (dst + ((to) - (from)) > (dst_bytes ? dst_end : src))            \
      {                                                                 \
        coding->result = CODING_FINISH_INSUFFICIENT_DST;                \
        goto label_end_of_loop;                                         \
      }                                                                 \
    while ((from) < (to))                                               \
      *dst++ = *(from)++;                                               \
  } while (0)
329
330\f
331/*** 1. Preamble ***/
332
68c45bf0
PE
333#ifdef emacs
334#include <config.h>
335#endif
336
4ed46869
KH
337#include <stdio.h>
338
339#ifdef emacs
340
4ed46869
KH
341#include "lisp.h"
342#include "buffer.h"
343#include "charset.h"
ec6d2bb8 344#include "composite.h"
4ed46869
KH
345#include "ccl.h"
346#include "coding.h"
347#include "window.h"
348
349#else /* not emacs */
350
351#include "mulelib.h"
352
353#endif /* not emacs */
354
355Lisp_Object Qcoding_system, Qeol_type;
356Lisp_Object Qbuffer_file_coding_system;
357Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
27901516 358Lisp_Object Qno_conversion, Qundecided;
bb0115a2 359Lisp_Object Qcoding_system_history;
05e6f5dc 360Lisp_Object Qsafe_chars;
1397dc18 361Lisp_Object Qvalid_codes;
4ed46869
KH
362
363extern Lisp_Object Qinsert_file_contents, Qwrite_region;
364Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
365Lisp_Object Qstart_process, Qopen_network_stream;
366Lisp_Object Qtarget_idx;
367
d46c5b12
KH
368Lisp_Object Vselect_safe_coding_system_function;
369
5d5bf4d8
KH
370int coding_system_require_warning;
371
7722baf9
EZ
372/* Mnemonic string for each format of end-of-line. */
373Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
374/* Mnemonic string to indicate format of end-of-line is not yet
4ed46869 375 decided. */
7722baf9 376Lisp_Object eol_mnemonic_undecided;
4ed46869 377
9ce27fde
KH
378/* Format of end-of-line decided by system. This is CODING_EOL_LF on
379 Unix, CODING_EOL_CRLF on DOS/Windows, and CODING_EOL_CR on Mac. */
380int system_eol_type;
381
4ed46869
KH
382#ifdef emacs
383
6b89e3aa
KH
384/* Information about which coding system is safe for which chars.
385 The value has the form (GENERIC-LIST . NON-GENERIC-ALIST).
386
387 GENERIC-LIST is a list of generic coding systems which can encode
388 any characters.
389
390 NON-GENERIC-ALIST is an alist of non generic coding systems vs the
391 corresponding char table that contains safe chars. */
392Lisp_Object Vcoding_system_safe_chars;
393
4608c386
KH
394Lisp_Object Vcoding_system_list, Vcoding_system_alist;
395
396Lisp_Object Qcoding_system_p, Qcoding_system_error;
4ed46869 397
d46c5b12
KH
398/* Coding system emacs-mule and raw-text are for converting only
399 end-of-line format. */
400Lisp_Object Qemacs_mule, Qraw_text;
9ce27fde 401
4ed46869
KH
402/* Coding-systems are handed between Emacs Lisp programs and C internal
403 routines by the following three variables. */
404/* Coding-system for reading files and receiving data from process. */
405Lisp_Object Vcoding_system_for_read;
406/* Coding-system for writing files and sending data to process. */
407Lisp_Object Vcoding_system_for_write;
408/* Coding-system actually used in the latest I/O. */
409Lisp_Object Vlast_coding_system_used;
410
c4825358 411/* A vector of length 256 which contains information about special
94487c4e 412 Latin codes (especially for dealing with Microsoft codes). */
3f003981 413Lisp_Object Vlatin_extra_code_table;
c4825358 414
9ce27fde
KH
415/* Flag to inhibit code conversion of end-of-line format. */
416int inhibit_eol_conversion;
417
74383408
KH
418/* Flag to inhibit ISO2022 escape sequence detection. */
419int inhibit_iso_escape_detection;
420
ed29121d
EZ
421/* Flag to make buffer-file-coding-system inherit from process-coding. */
422int inherit_process_coding_system;
423
c4825358 424/* Coding system to be used to encode text for terminal display. */
4ed46869
KH
425struct coding_system terminal_coding;
426
c4825358
KH
427/* Coding system to be used to encode text for terminal display when
428 terminal coding system is nil. */
429struct coding_system safe_terminal_coding;
430
431/* Coding system of what is sent from terminal keyboard. */
4ed46869
KH
432struct coding_system keyboard_coding;
433
6bc51348
KH
434/* Default coding system to be used to write a file. */
435struct coding_system default_buffer_file_coding;
436
02ba4723
KH
437Lisp_Object Vfile_coding_system_alist;
438Lisp_Object Vprocess_coding_system_alist;
439Lisp_Object Vnetwork_coding_system_alist;
4ed46869 440
68c45bf0
PE
441Lisp_Object Vlocale_coding_system;
442
4ed46869
KH
443#endif /* emacs */
444
d46c5b12 445Lisp_Object Qcoding_category, Qcoding_category_index;
4ed46869
KH
446
447/* List of symbols `coding-category-xxx' ordered by priority. */
448Lisp_Object Vcoding_category_list;
449
d46c5b12
KH
450/* Table of coding categories (Lisp symbols). */
451Lisp_Object Vcoding_category_table;
4ed46869
KH
452
453/* Table of names of symbol for each coding-category. */
454char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
0ef69138 455 "coding-category-emacs-mule",
4ed46869
KH
456 "coding-category-sjis",
457 "coding-category-iso-7",
d46c5b12 458 "coding-category-iso-7-tight",
4ed46869
KH
459 "coding-category-iso-8-1",
460 "coding-category-iso-8-2",
7717c392
KH
461 "coding-category-iso-7-else",
462 "coding-category-iso-8-else",
89fa8b36 463 "coding-category-ccl",
4ed46869 464 "coding-category-big5",
fa42c37f
KH
465 "coding-category-utf-8",
466 "coding-category-utf-16-be",
467 "coding-category-utf-16-le",
27901516 468 "coding-category-raw-text",
89fa8b36 469 "coding-category-binary"
4ed46869
KH
470};
471
66cfb530 472/* Table of pointers to coding systems corresponding to each coding
d46c5b12
KH
473 category. */
474struct coding_system *coding_system_table[CODING_CATEGORY_IDX_MAX];
475
66cfb530 476/* Table of coding category masks. Nth element is a mask for a coding
8ca3766a 477 category of which priority is Nth. */
66cfb530
KH
478static
479int coding_priorities[CODING_CATEGORY_IDX_MAX];
480
f967223b
KH
481/* Flag to tell if we look up translation table on character code
482 conversion. */
84fbb8a0 483Lisp_Object Venable_character_translation;
f967223b
KH
484/* Standard translation table to look up on decoding (reading). */
485Lisp_Object Vstandard_translation_table_for_decode;
486/* Standard translation table to look up on encoding (writing). */
487Lisp_Object Vstandard_translation_table_for_encode;
84fbb8a0 488
f967223b
KH
489Lisp_Object Qtranslation_table;
490Lisp_Object Qtranslation_table_id;
491Lisp_Object Qtranslation_table_for_decode;
492Lisp_Object Qtranslation_table_for_encode;
4ed46869
KH
493
494/* Alist of charsets vs revision number. */
495Lisp_Object Vcharset_revision_alist;
496
02ba4723
KH
497/* Default coding systems used for process I/O. */
498Lisp_Object Vdefault_process_coding_system;
499
002fdb44
DL
500/* Char table for translating Quail and self-inserting input. */
501Lisp_Object Vtranslation_table_for_input;
502
b843d1ae
KH
503/* Global flag to tell that we can't call post-read-conversion and
504 pre-write-conversion functions. Usually the value is zero, but it
505 is set to 1 temporarily while such functions are running. This is
506 to avoid infinite recursive call. */
507static int inhibit_pre_post_conversion;
508
05e6f5dc
KH
509/* Char-table containing safe coding systems of each character. */
510Lisp_Object Vchar_coding_system_table;
511Lisp_Object Qchar_coding_system;
512
6b89e3aa
KH
513/* Return `safe-chars' property of CODING_SYSTEM (symbol). Don't check
514 its validity. */
05e6f5dc
KH
515
516Lisp_Object
6b89e3aa
KH
517coding_safe_chars (coding_system)
518 Lisp_Object coding_system;
05e6f5dc
KH
519{
520 Lisp_Object coding_spec, plist, safe_chars;
93dec019 521
6b89e3aa 522 coding_spec = Fget (coding_system, Qcoding_system);
05e6f5dc
KH
523 plist = XVECTOR (coding_spec)->contents[3];
524 safe_chars = Fplist_get (XVECTOR (coding_spec)->contents[3], Qsafe_chars);
525 return (CHAR_TABLE_P (safe_chars) ? safe_chars : Qt);
526}
527
/* Nonzero if character C is safe according to SAFE_CHARS.  SAFE_CHARS
   is either Qt, in which case every character is safe, or a char
   table in which the entry for a safe character is non-nil.  */

#define CODING_SAFE_CHAR_P(safe_chars, c) \
  (EQ (safe_chars, Qt) ? 1 : !NILP (CHAR_TABLE_REF (safe_chars, c)))
530
4ed46869 531\f
0ef69138 532/*** 2. Emacs internal format (emacs-mule) handlers ***/
4ed46869 533
aa72b389
KH
534/* Emacs' internal format for representation of multiple character
535 sets is a kind of multi-byte encoding, i.e. characters are
536 represented by variable-length sequences of one-byte codes.
b73bfc1c
KH
537
538 ASCII characters and control characters (e.g. `tab', `newline') are
539 represented by one-byte sequences which are their ASCII codes, in
540 the range 0x00 through 0x7F.
541
542 8-bit characters of the range 0x80..0x9F are represented by
543 two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
544 code + 0x20).
545
546 8-bit characters of the range 0xA0..0xFF are represented by
547 one-byte sequences which are their 8-bit code.
548
549 The other characters are represented by a sequence of `base
550 leading-code', optional `extended leading-code', and one or two
551 `position-code's. The length of the sequence is determined by the
aa72b389 552 base leading-code. Leading-code takes the range 0x81 through 0x9D,
b73bfc1c
KH
553 whereas extended leading-code and position-code take the range 0xA0
554 through 0xFF. See `charset.h' for more details about leading-code
555 and position-code.
f4dee582 556
4ed46869 557 --- CODE RANGE of Emacs' internal format ---
b73bfc1c
KH
558 character set range
559 ------------- -----
560 ascii 0x00..0x7F
561 eight-bit-control LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
562 eight-bit-graphic 0xA0..0xFF
aa72b389 563 ELSE 0x81..0x9D + [0xA0..0xFF]+
4ed46869
KH
564 ---------------------------------------------
565
aa72b389
KH
566 As this is the internal character representation, the format is
567 usually not used externally (i.e. in a file or in a data sent to a
568 process). But, it is possible to have a text externally in this
569 format (i.e. by encoding by the coding system `emacs-mule').
570
571 In that case, a sequence of one-byte codes has a slightly different
572 form.
573
ae5145c2 574 Firstly, all characters in eight-bit-control are represented by
aa72b389
KH
575 one-byte sequences which are their 8-bit code.
576
577 Next, character composition data are represented by the byte
578 sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
579 where,
580 METHOD is 0xF0 plus one of composition method (enum
581 composition_method),
582
ae5145c2 583 BYTES is 0xA0 plus the byte length of these composition data,
aa72b389 584
ae5145c2 585 CHARS is 0xA0 plus the number of characters composed by these
aa72b389
KH
586 data,
587
8ca3766a 588 COMPONENTs are characters of multibyte form or composition
aa72b389
KH
589 rules encoded by two bytes of ASCII codes.
590
591 In addition, for backward compatibility, the following formats are
592 also recognized as composition data on decoding.
593
594 0x80 MSEQ ...
595 0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
596
597 Here,
598 MSEQ is a multibyte form, but in one of these special formats:
599 ASCII: 0xA0 ASCII_CODE+0x80,
600 other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
601 RULE is a one byte code of the range 0xA0..0xF0 that
602 represents a composition rule.
4ed46869
KH
603 */
604
605enum emacs_code_class_type emacs_code_class[256];
606
4ed46869
KH
607/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
608 Check if a text is encoded in Emacs' internal format. If it is,
d46c5b12 609 return CODING_CATEGORY_MASK_EMACS_MULE, else return 0. */
4ed46869 610
0a28aafb
KH
611static int
612detect_coding_emacs_mule (src, src_end, multibytep)
b73bfc1c 613 unsigned char *src, *src_end;
0a28aafb 614 int multibytep;
4ed46869
KH
615{
616 unsigned char c;
617 int composing = 0;
b73bfc1c
KH
618 /* Dummy for ONE_MORE_BYTE. */
619 struct coding_system dummy_coding;
620 struct coding_system *coding = &dummy_coding;
4ed46869 621
b73bfc1c 622 while (1)
4ed46869 623 {
0a28aafb 624 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
4ed46869
KH
625
626 if (composing)
627 {
628 if (c < 0xA0)
629 composing = 0;
b73bfc1c
KH
630 else if (c == 0xA0)
631 {
0a28aafb 632 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
b73bfc1c
KH
633 c &= 0x7F;
634 }
4ed46869
KH
635 else
636 c -= 0x20;
637 }
638
b73bfc1c 639 if (c < 0x20)
4ed46869 640 {
4ed46869
KH
641 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
642 return 0;
b73bfc1c
KH
643 }
644 else if (c >= 0x80 && c < 0xA0)
645 {
646 if (c == 0x80)
647 /* Old leading code for a composite character. */
648 composing = 1;
649 else
650 {
651 unsigned char *src_base = src - 1;
652 int bytes;
4ed46869 653
b73bfc1c
KH
654 if (!UNIBYTE_STR_AS_MULTIBYTE_P (src_base, src_end - src_base,
655 bytes))
656 return 0;
657 src = src_base + bytes;
658 }
659 }
660 }
661 label_end_of_loop:
662 return CODING_CATEGORY_MASK_EMACS_MULE;
663}
4ed46869 664
4ed46869 665
aa72b389
KH
/* Open a new composition record in CODING->cmp_data.  The record
   takes four slots starting at the current `used' index, which is
   remembered in CODING->cmp_data_start:
     slot 0: -1 (overwritten later by CODING_ADD_COMPOSITION_END),
     slot 1: the starting position START plus `char_offset',
     slot 2: left untouched here,
     slot 3: METHOD.  */

#define CODING_ADD_COMPOSITION_START(coding, start, method)     \
  do {                                                          \
    struct composition_data *cmp_data = coding->cmp_data;       \
    int *data = &cmp_data->data[cmp_data->used];                \
    coding->cmp_data_start = cmp_data->used;                    \
    data[0] = -1;                                               \
    data[1] = cmp_data->char_offset + start;                    \
    data[3] = (int) method;                                     \
    cmp_data->used += 4;                                        \
  } while (0)
678
/* Close the composition record that starts at index
   CODING->cmp_data_start of CODING->cmp_data: store the number of
   slots used by the record in its slot 0, and the ending position
   END plus `char_offset' in its slot 2.  */

#define CODING_ADD_COMPOSITION_END(coding, end)                 \
  do {                                                          \
    struct composition_data *cmp_data = coding->cmp_data;       \
    int *data = &cmp_data->data[coding->cmp_data_start];        \
    data[0] = cmp_data->used - coding->cmp_data_start;          \
    data[2] = cmp_data->char_offset + end;                      \
  } while (0)
688
/* Append one COMPONENT (alternate character or composition rule) to
   the composition record being built in CODING->cmp_data.  When the
   record thereby reaches COMPOSITION_DATA_MAX_BUNCH_LENGTH slots, it
   is closed at CODING->produced_char and CODING->composing is reset
   to COMPOSITION_NO.  */

#define CODING_ADD_COMPOSITION_COMPONENT(coding, component)             \
  do {                                                                  \
    coding->cmp_data->data[coding->cmp_data->used++] = component;       \
    if (COMPOSITION_DATA_MAX_BUNCH_LENGTH                               \
        == coding->cmp_data->used - coding->cmp_data_start)             \
      {                                                                 \
        CODING_ADD_COMPOSITION_END (coding, coding->produced_char);     \
        coding->composing = COMPOSITION_NO;                             \
      }                                                                 \
  } while (0)
aa72b389
KH
701
702
/* Yield the byte at `src' and advance `src' by one.  If `src' is not
   less than `src_end', yield -1 instead and leave `src' unchanged.  */

#define SAFE_ONE_MORE_BYTE() (src < src_end ? *src++ : -1)
707
708
/* Decode a character represented as a component of composition
   sequence of Emacs 20 style at `src'.  Set C to that character,
   store its multibyte form sequence at P, and set P to the end of
   that sequence.  If no valid character is found, set C to -1.

   A component has one of two forms:
     ASCII: 0xA0 ASCII_CODE+0x80,
     other: LEADING_CODE+0x20 FOLLOWING-BYTE ...

   So, for the ASCII form, 0x80 is subtracted from the second byte to
   recover the ASCII code.  */

#define DECODE_EMACS_MULE_COMPOSITION_CHAR(c, p)                \
  do {                                                          \
    int bytes;                                                  \
                                                                \
    c = SAFE_ONE_MORE_BYTE ();                                  \
    if (c < 0)                                                  \
      break;                                                    \
    if (CHAR_HEAD_P (c))                                        \
      c = -1;                                                   \
    else if (c == 0xA0)                                         \
      {                                                         \
        c = SAFE_ONE_MORE_BYTE ();                              \
        if (c < 0xA0)                                           \
          c = -1;                                               \
        else                                                    \
          {                                                     \
            c -= 0x80;                                          \
            *p++ = c;                                           \
          }                                                     \
      }                                                         \
    else if (BASE_LEADING_CODE_P (c - 0x20))                    \
      {                                                         \
        unsigned char *p0 = p;                                  \
                                                                \
        c -= 0x20;                                              \
        *p++ = c;                                               \
        bytes = BYTES_BY_CHAR_HEAD (c);                         \
        while (--bytes)                                         \
          {                                                     \
            c = SAFE_ONE_MORE_BYTE ();                          \
            if (c < 0)                                          \
              break;                                            \
            *p++ = c;                                           \
          }                                                     \
        if (UNIBYTE_STR_AS_MULTIBYTE_P (p0, p - p0, bytes))     \
          c = STRING_CHAR (p0, bytes);                          \
        else                                                    \
          c = -1;                                               \
      }                                                         \
    else                                                        \
      c = -1;                                                   \
  } while (0)
756
757
/* Decode a composition rule represented as a component of composition
   sequence of Emacs 20 style at `src'.  The rule is one byte whose
   value minus 0xA0 must be in the range 0..80; that value is split
   into `gref' (quotient by 9) and `nref' (remainder by 9), which must
   be variables of the caller, and C is set to
   COMPOSITION_ENCODE_RULE (gref, nref).  If no valid rule is found,
   set C to -1.  */

#define DECODE_EMACS_MULE_COMPOSITION_RULE(c)           \
  do {                                                  \
    c = SAFE_ONE_MORE_BYTE ();                          \
    c -= 0xA0;                                          \
    if (c >= 0 && c < 81)                               \
      {                                                 \
        gref = c / 9;                                   \
        nref = c % 9;                                   \
        c = COMPOSITION_ENCODE_RULE (gref, nref);       \
      }                                                 \
    else                                                \
      c = -1;                                           \
  } while (0)
774
775
776/* Decode composition sequence encoded by `emacs-mule' at the source
777 pointed by SRC. SRC_END is the end of source. Store information
778 of the composition in CODING->cmp_data.
779
780 For backward compatibility, decode also a composition sequence of
781 Emacs 20 style. In that case, the composition sequence contains
782 characters that should be extracted into a buffer or string. Store
783 those characters at *DESTINATION in multibyte form.
784
785 If we encounter an invalid byte sequence, return 0.
786 If we encounter an insufficient source or destination, or
787 insufficient space in CODING->cmp_data, return 1.
788 Otherwise, return consumed bytes in the source.
789
790*/
791static INLINE int
792decode_composition_emacs_mule (coding, src, src_end,
793 destination, dst_end, dst_bytes)
794 struct coding_system *coding;
795 unsigned char *src, *src_end, **destination, *dst_end;
796 int dst_bytes;
797{
798 unsigned char *dst = *destination;
799 int method, data_len, nchars;
800 unsigned char *src_base = src++;
8ca3766a 801 /* Store components of composition. */
aa72b389
KH
802 int component[COMPOSITION_DATA_MAX_BUNCH_LENGTH];
803 int ncomponent;
804 /* Store multibyte form of characters to be composed. This is for
805 Emacs 20 style composition sequence. */
806 unsigned char buf[MAX_COMPOSITION_COMPONENTS * MAX_MULTIBYTE_LENGTH];
807 unsigned char *bufp = buf;
808 int c, i, gref, nref;
809
810 if (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH
811 >= COMPOSITION_DATA_SIZE)
812 {
813 coding->result = CODING_FINISH_INSUFFICIENT_CMP;
814 return -1;
815 }
816
817 ONE_MORE_BYTE (c);
818 if (c - 0xF0 >= COMPOSITION_RELATIVE
819 && c - 0xF0 <= COMPOSITION_WITH_RULE_ALTCHARS)
820 {
821 int with_rule;
822
823 method = c - 0xF0;
824 with_rule = (method == COMPOSITION_WITH_RULE
825 || method == COMPOSITION_WITH_RULE_ALTCHARS);
826 ONE_MORE_BYTE (c);
827 data_len = c - 0xA0;
828 if (data_len < 4
829 || src_base + data_len > src_end)
830 return 0;
831 ONE_MORE_BYTE (c);
832 nchars = c - 0xA0;
833 if (c < 1)
834 return 0;
835 for (ncomponent = 0; src < src_base + data_len; ncomponent++)
836 {
b1887814
RS
837 /* If it is longer than this, it can't be valid. */
838 if (ncomponent >= COMPOSITION_DATA_MAX_BUNCH_LENGTH)
839 return 0;
840
aa72b389
KH
841 if (ncomponent % 2 && with_rule)
842 {
843 ONE_MORE_BYTE (gref);
844 gref -= 32;
845 ONE_MORE_BYTE (nref);
846 nref -= 32;
847 c = COMPOSITION_ENCODE_RULE (gref, nref);
848 }
849 else
850 {
851 int bytes;
852 if (UNIBYTE_STR_AS_MULTIBYTE_P (src, src_end - src, bytes))
853 c = STRING_CHAR (src, bytes);
854 else
855 c = *src, bytes = 1;
856 src += bytes;
857 }
858 component[ncomponent] = c;
859 }
860 }
861 else
862 {
863 /* This may be an old Emacs 20 style format. See the comment at
864 the section 2 of this file. */
865 while (src < src_end && !CHAR_HEAD_P (*src)) src++;
866 if (src == src_end
867 && !(coding->mode & CODING_MODE_LAST_BLOCK))
868 goto label_end_of_loop;
869
870 src_end = src;
871 src = src_base + 1;
872 if (c < 0xC0)
873 {
874 method = COMPOSITION_RELATIVE;
875 for (ncomponent = 0; ncomponent < MAX_COMPOSITION_COMPONENTS;)
876 {
877 DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp);
878 if (c < 0)
879 break;
880 component[ncomponent++] = c;
881 }
882 if (ncomponent < 2)
883 return 0;
884 nchars = ncomponent;
885 }
886 else if (c == 0xFF)
887 {
888 method = COMPOSITION_WITH_RULE;
889 src++;
890 DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp);
891 if (c < 0)
892 return 0;
893 component[0] = c;
894 for (ncomponent = 1;
895 ncomponent < MAX_COMPOSITION_COMPONENTS * 2 - 1;)
896 {
897 DECODE_EMACS_MULE_COMPOSITION_RULE (c);
898 if (c < 0)
899 break;
900 component[ncomponent++] = c;
901 DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp);
902 if (c < 0)
903 break;
904 component[ncomponent++] = c;
905 }
906 if (ncomponent < 3)
907 return 0;
908 nchars = (ncomponent + 1) / 2;
909 }
910 else
911 return 0;
912 }
913
914 if (buf == bufp || dst + (bufp - buf) <= (dst_bytes ? dst_end : src))
915 {
916 CODING_ADD_COMPOSITION_START (coding, coding->produced_char, method);
917 for (i = 0; i < ncomponent; i++)
918 CODING_ADD_COMPOSITION_COMPONENT (coding, component[i]);
93dec019 919 CODING_ADD_COMPOSITION_END (coding, coding->produced_char + nchars);
aa72b389
KH
920 if (buf < bufp)
921 {
922 unsigned char *p = buf;
923 EMIT_BYTES (p, bufp);
924 *destination += bufp - buf;
925 coding->produced_char += nchars;
926 }
927 return (src - src_base);
928 }
929 label_end_of_loop:
930 return -1;
931}
932
b73bfc1c 933/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
4ed46869 934
b73bfc1c
KH
/* Decode SRC_BYTES bytes of emacs-mule text at SOURCE into DESTINATION.

   Each iteration handles one unit of input:
   - '\r' is converted according to CODING->eol_type,
   - '\n' is copied, or stops the loop with
     CODING_FINISH_INCONSISTENT_EOL,
   - 0x80 is handed to decode_composition_emacs_mule when
     CODING->cmp_data is non-null,
   - a sequence accepted by UNIBYTE_STR_AS_MULTIBYTE_P is copied as is,
   - any other byte is expanded into TMP with CHAR_STRING.

   Before returning, CODING->consumed, CODING->consumed_char and
   CODING->produced are stored; CODING->produced_char counts the
   characters written.  CODING->result is assigned here for an
   inconsistent end-of-line and for insufficient destination space.  */
935static void
936decode_coding_emacs_mule (coding, source, destination, src_bytes, dst_bytes)
937 struct coding_system *coding;
938 unsigned char *source, *destination;
939 int src_bytes, dst_bytes;
940{
941 unsigned char *src = source;
942 unsigned char *src_end = source + src_bytes;
943 unsigned char *dst = destination;
944 unsigned char *dst_end = destination + dst_bytes;
945 /* SRC_BASE remembers the start position in source in each loop.
946 The loop will be exited when there's not enough source code, or
947 when there's not enough destination area to produce a
948 character. */
949 unsigned char *src_base;
4ed46869 950
b73bfc1c 951 coding->produced_char = 0;
8a33cf7b 952 while ((src_base = src) < src_end)
b73bfc1c
KH
953 {
954 unsigned char tmp[MAX_MULTIBYTE_LENGTH], *p;
955 int bytes;
ec6d2bb8 956
4af310db
EZ
957 if (*src == '\r')
958 {
2bcdf662 959 int c = *src++;
4af310db 960
4af310db
EZ
/* With the CR convention a CR is a newline.  With CRLF, look at the
   next byte: if it is LF that LF is emitted, otherwise the byte is
   un-read and the CR itself is emitted.  ONE_MORE_BYTE is defined
   outside this chunk; presumably it jumps to label_end_of_loop when
   the source is exhausted -- TODO confirm.  */
961 if (coding->eol_type == CODING_EOL_CR)
962 c = '\n';
963 else if (coding->eol_type == CODING_EOL_CRLF)
964 {
965 ONE_MORE_BYTE (c);
966 if (c != '\n')
967 {
4af310db
EZ
968 src--;
969 c = '\r';
970 }
971 }
/* NOTE(review): this store, like the one in the '\n' branch below,
   is not preceded by the destination-space check made at the bottom
   of the loop.  */
972 *dst++ = c;
973 coding->produced_char++;
974 continue;
975 }
976 else if (*src == '\n')
977 {
978 if ((coding->eol_type == CODING_EOL_CR
979 || coding->eol_type == CODING_EOL_CRLF)
980 && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
981 {
982 coding->result = CODING_FINISH_INCONSISTENT_EOL;
983 goto label_end_of_loop;
984 }
985 *dst++ = *src++;
986 coding->produced_char++;
987 continue;
988 }
3089d25c 989 else if (*src == 0x80 && coding->cmp_data)
aa72b389
KH
990 {
991 /* Start of composition data. */
992 int consumed = decode_composition_emacs_mule (coding, src, src_end,
993 &dst, dst_end,
994 dst_bytes);
/* A negative value stops decoding with the bytes from SRC_BASE on
   left unconsumed; a positive value is the number of source bytes
   handled; zero means the 0x80 byte is treated as an ordinary
   byte.  */
995 if (consumed < 0)
996 goto label_end_of_loop;
997 else if (consumed > 0)
998 {
999 src += consumed;
1000 continue;
1001 }
1002 bytes = CHAR_STRING (*src, tmp);
1003 p = tmp;
1004 src++;
1005 }
4af310db 1006 else if (UNIBYTE_STR_AS_MULTIBYTE_P (src, src_end - src, bytes))
b73bfc1c
KH
1007 {
1008 p = src;
1009 src += bytes;
1010 }
1011 else
1012 {
1013 bytes = CHAR_STRING (*src, tmp);
1014 p = tmp;
1015 src++;
1016 }
/* When DST_BYTES is zero the limit is SRC rather than DST_END
   (presumably source and destination share one buffer -- TODO
   confirm).  */
1017 if (dst + bytes >= (dst_bytes ? dst_end : src))
1018 {
1019 coding->result = CODING_FINISH_INSUFFICIENT_DST;
4ed46869
KH
1020 break;
1021 }
b73bfc1c
KH
1022 while (bytes--) *dst++ = *p++;
1023 coding->produced_char++;
4ed46869 1024 }
4af310db 1025 label_end_of_loop:
b73bfc1c
KH
/* Only the input before SRC_BASE counts as consumed.  Note that
   consumed_char receives the same byte count as consumed.  */
1026 coding->consumed = coding->consumed_char = src_base - source;
1027 coding->produced = dst - destination;
4ed46869
KH
1028}
1029
b73bfc1c 1030
aa72b389
KH
1031/* Encode composition data stored at DATA into a special byte sequence
   starting with 0x80.  Update CODING->cmp_data_start and maybe
   CODING->cmp_data for the next call.

   The sequence is built in a local buffer as:
     0x80
     0xF0 + METHOD (DATA[3])
     0xA0 + length in bytes of the whole sequence
     0xA0 + number of composed characters (DATA[2] - DATA[1])
   followed by the components.  Characters are written with
   CHAR_STRING; for COMPOSITION_WITH_RULE and
   COMPOSITION_WITH_RULE_ALTCHARS each rule is written as the two
   bytes 0x20 + GREF and 0x20 + NREF.  DATA[0] is the number of ints
   in this composition record.

   The macro uses the variables dst, dst_end, dst_bytes and src and
   the label label_end_of_loop of the function in which it is
   expanded.  If the space check fails, CODING->result is set to
   CODING_FINISH_INSUFFICIENT_DST and control jumps to that label
   without writing anything.

   NOTE(review): the length byte is 0xA0 + (p - buf) stored in an
   unsigned char, and nothing here bounds (p - buf); confirm that the
   callers' component limits keep it below 0x100.  */
1034
1035#define ENCODE_COMPOSITION_EMACS_MULE(coding, data) \
1036 do { \
1037 unsigned char buf[1024], *p0 = buf, *p; \
1038 int len = data[0]; \
1039 int i; \
1040 \
1041 buf[0] = 0x80; \
1042 buf[1] = 0xF0 + data[3]; /* METHOD */ \
1043 buf[3] = 0xA0 + (data[2] - data[1]); /* COMPOSED-CHARS */ \
1044 p = buf + 4; \
1045 if (data[3] == COMPOSITION_WITH_RULE \
1046 || data[3] == COMPOSITION_WITH_RULE_ALTCHARS) \
1047 { \
1048 p += CHAR_STRING (data[4], p); \
1049 for (i = 5; i < len; i += 2) \
1050 { \
1051 int gref, nref; \
1052 COMPOSITION_DECODE_RULE (data[i], gref, nref); \
1053 *p++ = 0x20 + gref; \
1054 *p++ = 0x20 + nref; \
1055 p += CHAR_STRING (data[i + 1], p); \
1056 } \
1057 } \
1058 else \
1059 { \
1060 for (i = 4; i < len; i++) \
1061 p += CHAR_STRING (data[i], p); \
1062 } \
1063 buf[2] = 0xA0 + (p - buf); /* COMPONENTS-BYTES */ \
1064 \
1065 if (dst + (p - buf) + 4 > (dst_bytes ? dst_end : src)) \
1066 { \
1067 coding->result = CODING_FINISH_INSUFFICIENT_DST; \
1068 goto label_end_of_loop; \
1069 } \
1070 while (p0 < p) \
1071 *dst++ = *p0++; \
1072 coding->cmp_data_start += data[0]; \
1073 if (coding->cmp_data_start == coding->cmp_data->used \
1074 && coding->cmp_data->next) \
1075 { \
1076 coding->cmp_data = coding->cmp_data->next; \
1077 coding->cmp_data_start = 0; \
1078 } \
1079 } while (0)
93dec019 1080
aa72b389 1081
a4244313 1082static void encode_eol P_ ((struct coding_system *, const unsigned char *,
aa72b389
KH
1083 unsigned char *, int, int));
1084
/* Encode SRC_BYTES bytes at SOURCE into DESTINATION in emacs-mule
   format, inserting composition information where a composition
   starts.

   When CODING has no composition data the whole job is delegated to
   encode_eol.  Otherwise the text is processed one character at a
   time: a composition record whose start position (DATA[1]) equals
   the current character position is written first with
   ENCODE_COMPOSITION_EMACS_MULE, then the character itself is
   emitted, with '\n' converted according to CODING->eol_type.

   The `while (1)' loop has no break; it is left only through jumps
   to label_end_of_loop made by the macros it uses
   (ENCODE_COMPOSITION_EMACS_MULE does so when the destination is
   full; ONE_MORE_CHAR and the EMIT_ macros are defined outside this
   chunk and presumably do the same -- TODO confirm).  */
1085static void
1086encode_coding_emacs_mule (coding, source, destination, src_bytes, dst_bytes)
1087 struct coding_system *coding;
1088 unsigned char *source, *destination;
1089 int src_bytes, dst_bytes;
1090{
1091 unsigned char *src = source;
1092 unsigned char *src_end = source + src_bytes;
1093 unsigned char *dst = destination;
1094 unsigned char *dst_end = destination + dst_bytes;
1095 unsigned char *src_base;
1096 int c;
1097 int char_offset;
1098 int *data;
1099
/* TRANSLATION_TABLE is not referenced by name again in this function;
   presumably ONE_MORE_CHAR reads it -- TODO confirm.  */
1100 Lisp_Object translation_table;
1101
1102 translation_table = Qnil;
1103
1104 /* Optimization for the case that there's no composition. */
1105 if (!coding->cmp_data || coding->cmp_data->used == 0)
1106 {
1107 encode_eol (coding, source, destination, src_bytes, dst_bytes);
1108 return;
1109 }
1110
1111 char_offset = coding->cmp_data->char_offset;
1112 data = coding->cmp_data->data + coding->cmp_data_start;
1113 while (1)
1114 {
1115 src_base = src;
1116
1117 /* If SRC starts a composition, encode the information about the
1118 composition in advance. */
1119 if (coding->cmp_data_start < coding->cmp_data->used
1120 && char_offset + coding->consumed_char == data[1])
1121 {
1122 ENCODE_COMPOSITION_EMACS_MULE (coding, data);
/* The macro may have advanced to the next composition block, so
   reload both values from CODING.  */
1123 char_offset = coding->cmp_data->char_offset;
1124 data = coding->cmp_data->data + coding->cmp_data_start;
1125 }
1126
1127 ONE_MORE_CHAR (c);
1128 if (c == '\n' && (coding->eol_type == CODING_EOL_CRLF
1129 || coding->eol_type == CODING_EOL_CR))
1130 {
1131 if (coding->eol_type == CODING_EOL_CRLF)
1132 EMIT_TWO_BYTES ('\r', c);
1133 else
1134 EMIT_ONE_BYTE ('\r');
1135 }
1136 else if (SINGLE_BYTE_CHAR_P (c))
1137 EMIT_ONE_BYTE (c);
1138 else
/* A multibyte character is copied verbatim from the source.  */
1139 EMIT_BYTES (src_base, src);
/* NOTE(review): consumed_char is read and incremented here but never
   reset in this function; presumably the caller initializes it.  */
1140 coding->consumed_char++;
1141 }
1142 label_end_of_loop:
1143 coding->consumed = src_base - source;
1144 coding->produced = coding->produced_char = dst - destination;
1145 return;
1146}
b73bfc1c 1147
4ed46869
KH
1148\f
1149/*** 3. ISO2022 handlers ***/
1150
1151/* The following note describes the coding system ISO2022 briefly.
39787efd 1152 Since the intention of this note is to help understand the
cfb43547 1153 functions in this file, some parts are NOT ACCURATE or are OVERLY
39787efd 1154 SIMPLIFIED. For thorough understanding, please refer to the
cfb43547
DL
1155 original document of ISO2022. This is equivalent to the standard
1156 ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
4ed46869
KH
1157
1158 ISO2022 provides many mechanisms to encode several character sets
cfb43547 1159 in 7-bit and 8-bit environments. For 7-bit environments, all text
39787efd
KH
1160 is encoded using bytes less than 128. This may make the encoded
1161 text a little bit longer, but the text passes more easily through
cfb43547 1162 several types of gateway, some of which strip off the MSB (Most
8ca3766a 1163 Significant Bit).
b73bfc1c 1164
cfb43547
DL
1165 There are two kinds of character sets: control character sets and
1166 graphic character sets. The former contain control characters such
4ed46869 1167 as `newline' and `escape' to provide control functions (control
39787efd 1168 functions are also provided by escape sequences). The latter
cfb43547 1169 contain graphic characters such as 'A' and '-'. Emacs recognizes
4ed46869
KH
1170 two control character sets and many graphic character sets.
1171
1172 Graphic character sets are classified into one of the following
39787efd
KH
1173 four classes, according to the number of bytes (DIMENSION) and
1174 number of characters in one dimension (CHARS) of the set:
1175 - DIMENSION1_CHARS94
1176 - DIMENSION1_CHARS96
1177 - DIMENSION2_CHARS94
1178 - DIMENSION2_CHARS96
1179
1180 In addition, each character set is assigned an identification tag,
cfb43547 1181 unique for each set, called the "final character" (denoted as <F>
39787efd
KH
1182 hereafter). The <F> of each character set is decided by ECMA(*)
1183 when it is registered in ISO. The code range of <F> is 0x30..0x7F
1184 (0x30..0x3F are for private use only).
4ed46869
KH
1185
1186 Note (*): ECMA = European Computer Manufacturers Association
1187
cfb43547 1188 Here are examples of graphic character sets [NAME(<F>)]:
4ed46869
KH
1189 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
1190 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
1191 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
1192 o DIMENSION2_CHARS96 -- none for the moment
1193
39787efd 1194 A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
4ed46869
KH
1195 C0 [0x00..0x1F] -- control character plane 0
1196 GL [0x20..0x7F] -- graphic character plane 0
1197 C1 [0x80..0x9F] -- control character plane 1
1198 GR [0xA0..0xFF] -- graphic character plane 1
1199
1200 A control character set is directly designated and invoked to C0 or
39787efd
KH
1201 C1 by an escape sequence. The most common case is that:
1202 - ISO646's control character set is designated/invoked to C0, and
1203 - ISO6429's control character set is designated/invoked to C1,
1204 and usually these designations/invocations are omitted in encoded
1205 text. In a 7-bit environment, only C0 can be used, and a control
1206 character for C1 is encoded by an appropriate escape sequence to
1207 fit into the environment. All control characters for C1 are
1208 defined to have corresponding escape sequences.
4ed46869
KH
1209
1210 A graphic character set is at first designated to one of four
1211 graphic registers (G0 through G3), then these graphic registers are
1212 invoked to GL or GR. These designations and invocations can be
1213 done independently. The most common case is that G0 is invoked to
39787efd
KH
1214 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
1215 these invocations and designations are omitted in encoded text.
1216 In a 7-bit environment, only GL can be used.
4ed46869 1217
39787efd
KH
1218 When a graphic character set of CHARS94 is invoked to GL, codes
1219 0x20 and 0x7F of the GL area work as control characters SPACE and
1220 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
1221 be used.
4ed46869
KH
1222
1223 There are two ways of invocation: locking-shift and single-shift.
1224 With locking-shift, the invocation lasts until the next different
39787efd
KH
1225 invocation, whereas with single-shift, the invocation affects the
1226 following character only and doesn't affect the locking-shift
1227 state. Invocations are done by the following control characters or
1228 escape sequences:
4ed46869
KH
1229
1230 ----------------------------------------------------------------------
39787efd 1231 abbrev function cntrl escape seq description
4ed46869 1232 ----------------------------------------------------------------------
39787efd
KH
1233 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
1234 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
1235 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
1236 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
1237 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
1238 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
1239 LS3R (locking-shift-3 right) none ESC '|' invoke G3 into GR (*)
1240 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
1241 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
4ed46869 1242 ----------------------------------------------------------------------
39787efd
KH
1243 (*) These are not used by any known coding system.
1244
1245 Control characters for these functions are defined by macros
1246 ISO_CODE_XXX in `coding.h'.
4ed46869 1247
39787efd 1248 Designations are done by the following escape sequences:
4ed46869
KH
1249 ----------------------------------------------------------------------
1250 escape sequence description
1251 ----------------------------------------------------------------------
1252 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
1253 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
1254 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
1255 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
1256 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
1257 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
1258 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
1259 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
1260 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
1261 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
1262 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
1263 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
1264 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
1265 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
1266 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
1267 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
1268 ----------------------------------------------------------------------
1269
1270 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
39787efd 1271 of dimension 1, chars 94, and final character <F>, etc...
4ed46869
KH
1272
1273 Note (*): Although these designations are not allowed in ISO2022,
1274 Emacs accepts them on decoding, and produces them on encoding
39787efd 1275 CHARS96 character sets in a coding system which is characterized as
4ed46869
KH
1276 7-bit environment, non-locking-shift, and non-single-shift.
1277
1278 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
39787efd 1279 '(' can be omitted. We refer to this as "short-form" hereafter.
4ed46869 1280
cfb43547 1281 Now you may notice that there are a lot of ways of encoding the
39787efd
KH
1282 same multilingual text in ISO2022. Actually, there exist many
1283 coding systems such as Compound Text (used in X11's inter client
8ca3766a
DL
1284 communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
1285 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
4ed46869
KH
1286 localized platforms), and all of these are variants of ISO2022.
1287
1288 In addition to the above, Emacs handles two more kinds of escape
1289 sequences: ISO6429's direction specification and Emacs' private
1290 sequence for specifying character composition.
1291
39787efd 1292 ISO6429's direction specification takes the following form:
4ed46869
KH
1293 o CSI ']' -- end of the current direction
1294 o CSI '0' ']' -- end of the current direction
1295 o CSI '1' ']' -- start of left-to-right text
1296 o CSI '2' ']' -- start of right-to-left text
1297 The control character CSI (0x9B: control sequence introducer) is
39787efd
KH
1298 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
1299
1300 Character composition specification takes the following form:
ec6d2bb8
KH
1301 o ESC '0' -- start relative composition
1302 o ESC '1' -- end composition
1303 o ESC '2' -- start rule-base composition (*)
1304 o ESC '3' -- start relative composition with alternate chars (**)
1305 o ESC '4' -- start rule-base composition with alternate chars (**)
b73bfc1c 1306 Since these are not standard escape sequences of any ISO standard,
cfb43547 1307 the use of them with these meanings is restricted to Emacs only.
ec6d2bb8 1308
cfb43547 1309 (*) This form is used only in Emacs 20.5 and older versions,
b73bfc1c 1310 but the newer versions can safely decode it.
cfb43547 1311 (**) This form is used only in Emacs 21.1 and newer versions,
b73bfc1c 1312 and the older versions can't decode it.
ec6d2bb8 1313
cfb43547 1314 Here's a list of example usages of these composition escape
b73bfc1c 1315 sequences (categorized by `enum composition_method').
ec6d2bb8 1316
b73bfc1c 1317 COMPOSITION_RELATIVE:
ec6d2bb8 1318 ESC 0 CHAR [ CHAR ] ESC 1
8ca3766a 1319 COMPOSITION_WITH_RULE:
ec6d2bb8 1320 ESC 2 CHAR [ RULE CHAR ] ESC 1
b73bfc1c 1321 COMPOSITION_WITH_ALTCHARS:
ec6d2bb8 1322 ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
b73bfc1c 1323 COMPOSITION_WITH_RULE_ALTCHARS:
ec6d2bb8 1324 ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
4ed46869
KH
1325
1326enum iso_code_class_type iso_code_class[256];
1327
05e6f5dc
KH
/* Non-zero if all of the following hold for the coding system stored
   in slot IDX of coding_system_table:
     - the slot is non-null,
     - CHARSET is CHARSET_ASCII, or character C is safe according to
       CODING_SAFE_CHAR_P for that coding system,
     - the coding system has a requested designation for CHARSET.

   As a side effect the variable `safe_chars' of the function in which
   the macro is expanded is assigned when the slot is non-null and
   CHARSET is not ASCII.  IDX is evaluated several times.

   CHARSET is parenthesized where it is an operand of `==', so that an
   expression argument (e.g. a conditional expression) is compared as
   a whole.  */
#define CHARSET_OK(idx, charset, c) \
  (coding_system_table[idx] \
   && ((charset) == CHARSET_ASCII \
       || (safe_chars = coding_safe_chars (coding_system_table[idx]->symbol), \
           CODING_SAFE_CHAR_P (safe_chars, c))) \
   && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding_system_table[idx], \
                                              charset) \
       != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
d46c5b12
KH
1336
/* Non-zero if CODING_SPEC_ISO_INITIAL_DESIGNATION, applied to the
   coding system in slot IDX of coding_system_table with 1 as its
   second argument (presumably graphic register G1 -- TODO confirm),
   is non-negative.  The slot itself is not checked for null here.  */
1337#define SHIFT_OUT_OK(idx) \
1338 (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding_system_table[idx], 1) >= 0)
1339
b6871cc7
KH
/* Non-zero unless the `composing' member of the coding system in slot
   IDX of coding_system_table is COMPOSITION_DISABLED.  The slot is
   dereferenced without a null check.  */
1340#define COMPOSITION_OK(idx) \
1341 (coding_system_table[idx]->composing != COMPOSITION_DISABLED)
1342
4ed46869 1343/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
   Check if a text is encoded in ISO2022.  If it is, return an
   integer in which the appropriate flag bits among:
	CODING_CATEGORY_MASK_ISO_7
	CODING_CATEGORY_MASK_ISO_7_TIGHT
	CODING_CATEGORY_MASK_ISO_8_1
	CODING_CATEGORY_MASK_ISO_8_2
	CODING_CATEGORY_MASK_ISO_7_ELSE
	CODING_CATEGORY_MASK_ISO_8_ELSE
   are set.  If a code which should never appear in ISO2022 is found,
   return 0.

   The text scanned is SRC up to (not including) SRC_END; MULTIBYTEP
   is passed through to ONE_MORE_BYTE_CHECK_MULTIBYTE.  Scanning stops
   early once every category has been ruled out.  When
   `inhibit_iso_escape_detection' is non-zero, ESC, SO, SI, CSI, SS2
   and SS3 are ignored.  */
1354
0a28aafb
KH
1355static int
1356detect_coding_iso2022 (src, src_end, multibytep)
4ed46869 1357 unsigned char *src, *src_end;
0a28aafb 1358 int multibytep;
4ed46869 1359{
d46c5b12
KH
/* MASK holds the categories not yet ruled out; MASK_FOUND holds the
   categories for which positive evidence has been seen.  The result
   is their intersection.  */
1360 int mask = CODING_CATEGORY_MASK_ISO;
1361 int mask_found = 0;
f46869e4 1362 int reg[4], shift_out = 0, single_shifting = 0;
da55a2b7 1363 int c, c1, charset;
b73bfc1c
KH
1364 /* Dummy for ONE_MORE_BYTE. */
1365 struct coding_system dummy_coding;
1366 struct coding_system *coding = &dummy_coding;
05e6f5dc 1367 Lisp_Object safe_chars;
3f003981 1368
/* REG[n] records the charset designated to graphic register n by the
   escape sequences seen so far; -1 means none.  */
d46c5b12 1369 reg[0] = CHARSET_ASCII, reg[1] = reg[2] = reg[3] = -1;
3f003981 1370 while (mask && src < src_end)
4ed46869 1371 {
0a28aafb 1372 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
/* `retry' is re-entered from the 0xA0..0xFF scan below with a byte
   that has already been read into C.  */
8d239c89 1373 retry:
4ed46869
KH
1374 switch (c)
1375 {
1376 case ISO_CODE_ESC:
74383408
KH
1377 if (inhibit_iso_escape_detection)
1378 break;
f46869e4 1379 single_shifting = 0;
0a28aafb 1380 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
d46c5b12 1381 if (c >= '(' && c <= '/')
4ed46869 1382 {
bf9cdd4e 1383 /* Designation sequence for a charset of dimension 1. */
0a28aafb 1384 ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep);
d46c5b12
KH
1385 if (c1 < ' ' || c1 >= 0x80
1386 || (charset = iso_charset_table[0][c >= ','][c1]) < 0)
1387 /* Invalid designation sequence. Just ignore. */
1388 break;
1389 reg[(c - '(') % 4] = charset;
bf9cdd4e
KH
1390 }
1391 else if (c == '$')
1392 {
1393 /* Designation sequence for a charset of dimension 2. */
0a28aafb 1394 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
bf9cdd4e
KH
1395 if (c >= '@' && c <= 'B')
1396 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
d46c5b12 1397 reg[0] = charset = iso_charset_table[1][0][c];
bf9cdd4e 1398 else if (c >= '(' && c <= '/')
bcf26d6a 1399 {
0a28aafb 1400 ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep);
d46c5b12
KH
1401 if (c1 < ' ' || c1 >= 0x80
1402 || (charset = iso_charset_table[1][c >= ','][c1]) < 0)
1403 /* Invalid designation sequence. Just ignore. */
1404 break;
1405 reg[(c - '(') % 4] = charset;
bcf26d6a 1406 }
bf9cdd4e 1407 else
d46c5b12
KH
1408 /* Invalid designation sequence. Just ignore. */
1409 break;
1410 }
ae9ff118 1411 else if (c == 'N' || c == 'O')
d46c5b12 1412 {
ae9ff118
KH
1413 /* ESC <Fe> for SS2 or SS3. */
1414 mask &= CODING_CATEGORY_MASK_ISO_7_ELSE;
d46c5b12 1415 break;
4ed46869 1416 }
ec6d2bb8
KH
1417 else if (c >= '0' && c <= '4')
1418 {
1419 /* ESC <Fp> for start/end composition. */
b6871cc7
KH
1420 if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_7))
1421 mask_found |= CODING_CATEGORY_MASK_ISO_7;
1422 else
1423 mask &= ~CODING_CATEGORY_MASK_ISO_7;
1424 if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT))
1425 mask_found |= CODING_CATEGORY_MASK_ISO_7_TIGHT;
1426 else
1427 mask &= ~CODING_CATEGORY_MASK_ISO_7_TIGHT;
1428 if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_8_1))
1429 mask_found |= CODING_CATEGORY_MASK_ISO_8_1;
1430 else
1431 mask &= ~CODING_CATEGORY_MASK_ISO_8_1;
1432 if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_8_2))
1433 mask_found |= CODING_CATEGORY_MASK_ISO_8_2;
1434 else
1435 mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
1436 if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_7_ELSE))
1437 mask_found |= CODING_CATEGORY_MASK_ISO_7_ELSE;
1438 else
1439 mask &= ~CODING_CATEGORY_MASK_ISO_7_ELSE;
1440 if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_8_ELSE))
1441 mask_found |= CODING_CATEGORY_MASK_ISO_8_ELSE;
1442 else
1443 mask &= ~CODING_CATEGORY_MASK_ISO_8_ELSE;
ec6d2bb8
KH
1444 break;
1445 }
bf9cdd4e 1446 else
d46c5b12
KH
1447 /* Invalid escape sequence. Just ignore. */
1448 break;
1449
1450 /* We found a valid designation sequence for CHARSET. */
1451 mask &= ~CODING_CATEGORY_MASK_ISO_8BIT;
05e6f5dc
KH
1452 c = MAKE_CHAR (charset, 0, 0);
1453 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7, charset, c))
d46c5b12
KH
1454 mask_found |= CODING_CATEGORY_MASK_ISO_7;
1455 else
1456 mask &= ~CODING_CATEGORY_MASK_ISO_7;
05e6f5dc 1457 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT, charset, c))
d46c5b12
KH
1458 mask_found |= CODING_CATEGORY_MASK_ISO_7_TIGHT;
1459 else
1460 mask &= ~CODING_CATEGORY_MASK_ISO_7_TIGHT;
05e6f5dc 1461 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_ELSE, charset, c))
ae9ff118
KH
1462 mask_found |= CODING_CATEGORY_MASK_ISO_7_ELSE;
1463 else
d46c5b12 1464 mask &= ~CODING_CATEGORY_MASK_ISO_7_ELSE;
05e6f5dc 1465 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_8_ELSE, charset, c))
ae9ff118
KH
1466 mask_found |= CODING_CATEGORY_MASK_ISO_8_ELSE;
1467 else
d46c5b12 1468 mask &= ~CODING_CATEGORY_MASK_ISO_8_ELSE;
4ed46869
KH
1469 break;
1470
4ed46869 1471 case ISO_CODE_SO:
74383408
KH
1472 if (inhibit_iso_escape_detection)
1473 break;
f46869e4 1474 single_shifting = 0;
d46c5b12
KH
/* NOTE(review): SHIFT_OUT is initialized to 0 and never assigned in
   this function, so this test on it is always true and the
   `shift_out == 1' test in the ISO_CODE_SI case is never true.  */
1475 if (shift_out == 0
1476 && (reg[1] >= 0
1477 || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_7_ELSE)
1478 || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_8_ELSE)))
1479 {
1480 /* Locking shift out. */
1481 mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
1482 mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
1483 }
e0e989f6 1484 break;
93dec019 1485
d46c5b12 1486 case ISO_CODE_SI:
74383408
KH
1487 if (inhibit_iso_escape_detection)
1488 break;
f46869e4 1489 single_shifting = 0;
d46c5b12
KH
1490 if (shift_out == 1)
1491 {
1492 /* Locking shift in. */
1493 mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
1494 mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
1495 }
1496 break;
1497
4ed46869 1498 case ISO_CODE_CSI:
f46869e4 1499 single_shifting = 0;
/* Falls through: CSI shares the handling of SS2 and SS3 below.  */
4ed46869
KH
1500 case ISO_CODE_SS2:
1501 case ISO_CODE_SS3:
3f003981
KH
1502 {
1503 int newmask = CODING_CATEGORY_MASK_ISO_8_ELSE;
1504
74383408
KH
1505 if (inhibit_iso_escape_detection)
1506 break;
70c22245
KH
1507 if (c != ISO_CODE_CSI)
1508 {
d46c5b12
KH
1509 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1510 & CODING_FLAG_ISO_SINGLE_SHIFT)
70c22245 1511 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
d46c5b12
KH
1512 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1513 & CODING_FLAG_ISO_SINGLE_SHIFT)
70c22245 1514 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
f46869e4 1515 single_shifting = 1;
70c22245 1516 }
3f003981
KH
1517 if (VECTORP (Vlatin_extra_code_table)
1518 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
1519 {
d46c5b12
KH
1520 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1521 & CODING_FLAG_ISO_LATIN_EXTRA)
3f003981 1522 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
d46c5b12
KH
1523 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1524 & CODING_FLAG_ISO_LATIN_EXTRA)
3f003981
KH
1525 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
1526 }
1527 mask &= newmask;
d46c5b12 1528 mask_found |= newmask;
3f003981
KH
1529 }
1530 break;
4ed46869
KH
1531
1532 default:
1533 if (c < 0x80)
f46869e4
KH
1534 {
1535 single_shifting = 0;
1536 break;
1537 }
4ed46869 1538 else if (c < 0xA0)
c4825358 1539 {
f46869e4 1540 single_shifting = 0;
3f003981
KH
1541 if (VECTORP (Vlatin_extra_code_table)
1542 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
c4825358 1543 {
3f003981
KH
1544 int newmask = 0;
1545
d46c5b12
KH
1546 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1547 & CODING_FLAG_ISO_LATIN_EXTRA)
3f003981 1548 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
d46c5b12
KH
1549 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1550 & CODING_FLAG_ISO_LATIN_EXTRA)
3f003981
KH
1551 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
1552 mask &= newmask;
d46c5b12 1553 mask_found |= newmask;
c4825358 1554 }
3f003981
KH
/* A byte below 0xA0 that reached the default case and is not listed
   in Vlatin_extra_code_table: the text is rejected outright.  */
1555 else
1556 return 0;
c4825358 1557 }
4ed46869
KH
1558 else
1559 {
d46c5b12 1560 mask &= ~(CODING_CATEGORY_MASK_ISO_7BIT
7717c392 1561 | CODING_CATEGORY_MASK_ISO_7_ELSE);
d46c5b12 1562 mask_found |= CODING_CATEGORY_MASK_ISO_8_1;
f46869e4
KH
1563 /* Check the length of succeeding codes of the range
1564 0xA0..0xFF. If the byte length is odd, we exclude
1565 CODING_CATEGORY_MASK_ISO_8_2. We can check this only
1566 when we are not single shifting. */
b73bfc1c
KH
1567 if (!single_shifting
1568 && mask & CODING_CATEGORY_MASK_ISO_8_2)
f46869e4 1569 {
/* I counts the bytes of the run, starting with the byte already in
   C.  C is reset to -1 so that it is non-negative afterwards only if
   the loop below read at least one more byte.  */
e17de821 1570 int i = 1;
8d239c89
KH
1571
1572 c = -1;
b73bfc1c
KH
1573 while (src < src_end)
1574 {
0a28aafb 1575 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
b73bfc1c
KH
1576 if (c < 0xA0)
1577 break;
1578 i++;
1579 }
1580
1581 if (i & 1 && src < src_end)
f46869e4
KH
1582 mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
1583 else
1584 mask_found |= CODING_CATEGORY_MASK_ISO_8_2;
8d239c89
KH
1585 if (c >= 0)
1586 /* This means that we have read one extra byte. */
1587 goto retry;
f46869e4 1588 }
4ed46869
KH
1589 }
1590 break;
1591 }
1592 }
/* No `goto label_end_of_loop' appears in this function's own text;
   presumably ONE_MORE_BYTE_CHECK_MULTIBYTE jumps here when the source
   runs out -- TODO confirm.  */
b73bfc1c 1593 label_end_of_loop:
d46c5b12 1594 return (mask & mask_found);
4ed46869
KH
1595}
1596
b73bfc1c
KH
1597/* Decode a character whose charset is CHARSET, whose 1st position
   code is C1 and whose 2nd position code is C2, and return the
   decoded character code.  If the variable `translation_table' of
   the function in which the macro is expanded is non-nil, return
   instead the code produced by translate_char with that table.  */
ec6d2bb8 1601
b73bfc1c
KH
1602#define DECODE_ISO_CHARACTER(charset, c1, c2) \
1603 (NILP (translation_table) \
1604 ? MAKE_CHAR (charset, c1, c2) \
1605 : translate_char (translation_table, -1, charset, c1, c2))
4ed46869
KH
1606
/* Set designation state into CODING.

   Look up the charset identified by DIMENSION, CHARS and FINAL_CHAR
   with ISO_CHARSET_TABLE and designate it to graphic register REG of
   `coding'.  The designation is accepted when the charset exists and
   either REG is the register requested for it or the charset's first
   character is safe according to CODING_SAFE_CHAR_P.  Otherwise REG
   is recorded in last_invalid_designation_register and control jumps
   to label_invalid_code.  A FINAL_CHAR outside '0'..127 also jumps to
   label_invalid_code.

   A designation of ASCII to register 0 that follows an invalid
   designation to register 0 is also sent to label_invalid_code (after
   resetting last_invalid_designation_register to -1), so that the
   sequence is inserted as is.

   The macro uses `coding', `safe_chars' and the label
   `label_invalid_code' of the function in which it is expanded.  REG
   and FINAL_CHAR are evaluated more than once, and are parenthesized
   wherever they are operands so that expression arguments keep their
   meaning.  The arguments must not mention variables named `charset'
   or `c': the macro declares locals with those names before the
   arguments are evaluated.  */
#define DECODE_DESIGNATION(reg, dimension, chars, final_char) \
  do { \
    int charset, c; \
 \
    if ((final_char) < '0' || (final_char) >= 128) \
      goto label_invalid_code; \
    charset = ISO_CHARSET_TABLE (make_number (dimension), \
                                 make_number (chars), \
                                 make_number (final_char)); \
    c = MAKE_CHAR (charset, 0, 0); \
    if (charset >= 0 \
        && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) == (reg) \
            || CODING_SAFE_CHAR_P (safe_chars, c))) \
      { \
        if (coding->spec.iso2022.last_invalid_designation_register == 0 \
            && (reg) == 0 \
            && charset == CHARSET_ASCII) \
          { \
            /* We should insert this designation sequence as is so \
               that it is surely written back to a file.  */ \
            coding->spec.iso2022.last_invalid_designation_register = -1; \
            goto label_invalid_code; \
          } \
        coding->spec.iso2022.last_invalid_designation_register = -1; \
        if ((coding->mode & CODING_MODE_DIRECTION) \
            && CHARSET_REVERSE_CHARSET (charset) >= 0) \
          charset = CHARSET_REVERSE_CHARSET (charset); \
        CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset; \
      } \
    else \
      { \
        coding->spec.iso2022.last_invalid_designation_register = (reg); \
        goto label_invalid_code; \
      } \
  } while (0)
1643
ec6d2bb8
KH
1644/* Allocate a memory block for storing information about compositions.
1645 The block is chained to the already allocated blocks. */
d46c5b12 1646
33fb63eb 1647void
ec6d2bb8 1648coding_allocate_composition_data (coding, char_offset)
d46c5b12 1649 struct coding_system *coding;
ec6d2bb8 1650 int char_offset;
d46c5b12 1651{
ec6d2bb8
KH
1652 struct composition_data *cmp_data
1653 = (struct composition_data *) xmalloc (sizeof *cmp_data);
1654
1655 cmp_data->char_offset = char_offset;
1656 cmp_data->used = 0;
1657 cmp_data->prev = coding->cmp_data;
1658 cmp_data->next = NULL;
1659 if (coding->cmp_data)
1660 coding->cmp_data->next = cmp_data;
1661 coding->cmp_data = cmp_data;
1662 coding->cmp_data_start = 0;
1663}
d46c5b12 1664
aa72b389
KH
/* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
	ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
	ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
	ESC 3 : altchar composition :  ESC 3 ALT ... ESC 0 CHAR ... ESC 1
	ESC 4 : alt&rule composition : ESC 4 ALT RULE .. ALT ESC 0 CHAR ... ESC 1

   C1 is the byte that followed ESC.  Three cases:
   - composition is disabled for `coding': the two bytes ESC and
     (C1 & 0x7f) are written to dst and counted as two characters;
   - no composition is in progress: a new composition is started with
     the method selected by C1 ('0', '2', '3', anything else meaning
     COMPOSITION_WITH_RULE_ALTCHARS);
   - a composition is in progress: the two ALTCHARS methods switch to
     COMPOSITION_RELATIVE, other methods are left unchanged.

   The macro uses `coding', `dst' and the label `label_end_of_loop' of
   the function in which it is expanded.  C1 is evaluated more than
   once and is parenthesized wherever it is an operand, so that an
   expression argument keeps its meaning.  */
#define DECODE_COMPOSITION_START(c1) \
  do { \
    if (coding->composing == COMPOSITION_DISABLED) \
      { \
        *dst++ = ISO_CODE_ESC; \
        *dst++ = (c1) & 0x7f; \
        coding->produced_char += 2; \
      } \
    else if (!COMPOSING_P (coding)) \
      { \
        /* This is surely the start of a composition.  We must be sure \
           that coding->cmp_data has enough space to store the \
           information about the composition.  If not, terminate the \
           current decoding loop, allocate one more memory block for \
           coding->cmp_data in the caller, then start the decoding \
           loop again.  We can't allocate memory here directly because \
           it may cause buffer/string relocation.  */ \
        if (!coding->cmp_data \
            || (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH \
                >= COMPOSITION_DATA_SIZE)) \
          { \
            coding->result = CODING_FINISH_INSUFFICIENT_CMP; \
            goto label_end_of_loop; \
          } \
        coding->composing = ((c1) == '0' ? COMPOSITION_RELATIVE \
                             : (c1) == '2' ? COMPOSITION_WITH_RULE \
                             : (c1) == '3' ? COMPOSITION_WITH_ALTCHARS \
                             : COMPOSITION_WITH_RULE_ALTCHARS); \
        CODING_ADD_COMPOSITION_START (coding, coding->produced_char, \
                                      coding->composing); \
        coding->composition_rule_follows = 0; \
      } \
    else \
      { \
        /* We are already handling a composition.  If the method is \
           one of the following two, the codes following the current \
           escape sequence are actual characters stored in a buffer.  */ \
        if (coding->composing == COMPOSITION_WITH_ALTCHARS \
            || coding->composing == COMPOSITION_WITH_RULE_ALTCHARS) \
          { \
            coding->composing = COMPOSITION_RELATIVE; \
            coding->composition_rule_follows = 0; \
          } \
      } \
  } while (0)
1717
8ca3766a 1718/* Handle composition end sequence ESC 1.

   C1 is the byte that followed ESC.  If no composition is in
   progress for `coding', the two bytes ESC and C1 are written to dst
   and counted as two characters.  Otherwise the end of the
   composition is recorded at the current produced_char position and
   coding->composing is reset to COMPOSITION_NO.

   The macro uses `coding' and `dst' of the function in which it is
   expanded.  */
ec6d2bb8
KH
1719
1720#define DECODE_COMPOSITION_END(c1) \
1721 do { \
93dec019 1722 if (! COMPOSING_P (coding)) \
ec6d2bb8
KH
1723 { \
1724 *dst++ = ISO_CODE_ESC; \
1725 *dst++ = c1; \
1726 coding->produced_char += 2; \
1727 } \
1728 else \
1729 { \
1730 CODING_ADD_COMPOSITION_END (coding, coding->produced_char); \
1731 coding->composing = COMPOSITION_NO; \
1732 } \
1733 } while (0)
1734
/* Decode one composition rule starting at byte C1 and append the
   encoded rule to coding->cmp_data.  C1 is modified in place (32 is
   subtracted).  Values below 81 use the one-byte old format; values
   81..92 use the two-byte new format, whose second byte is fetched
   from SRC into the variable C2 of the expansion site.  Any other
   value records rule 0.  */

#define DECODE_COMPOSITION_RULE(c1)                                     \
  do {                                                                  \
    int rule = 0;                                                       \
    (c1) -= 32;                                                         \
    if (c1 >= 81)                                                       \
      {                                                                 \
        if (c1 < 93)            /* new format (after ver.21) */         \
          {                                                             \
            ONE_MORE_BYTE (c2);                                         \
            rule = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32);          \
          }                                                             \
      }                                                                 \
    else                        /* old format (before ver.21) */        \
      {                                                                 \
        int gref = (c1) / 9;                                            \
        int nref = (c1) % 9;                                            \
        if (gref == 4)                                                  \
          gref = 10;                                                    \
        if (nref == 4)                                                  \
          nref = 10;                                                    \
        rule = COMPOSITION_ENCODE_RULE (gref, nref);                    \
      }                                                                 \
    CODING_ADD_COMPOSITION_COMPONENT (coding, rule);                    \
    coding->composition_rule_follows = 0;                               \
  } while (0)
88993dfd 1759
d46c5b12 1760
4ed46869
KH
1761/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
1762
b73bfc1c 1763static void
d46c5b12 1764decode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
4ed46869
KH
1765 struct coding_system *coding;
1766 unsigned char *source, *destination;
1767 int src_bytes, dst_bytes;
4ed46869
KH
1768{
1769 unsigned char *src = source;
1770 unsigned char *src_end = source + src_bytes;
1771 unsigned char *dst = destination;
1772 unsigned char *dst_end = destination + dst_bytes;
4ed46869
KH
1773 /* Charsets invoked to graphic plane 0 and 1 respectively. */
1774 int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1775 int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
b73bfc1c
KH
1776 /* SRC_BASE remembers the start position in source in each loop.
1777 The loop will be exited when there's not enough source code
1778 (within macro ONE_MORE_BYTE), or when there's not enough
1779 destination area to produce a character (within macro
1780 EMIT_CHAR). */
1781 unsigned char *src_base;
1782 int c, charset;
1783 Lisp_Object translation_table;
05e6f5dc
KH
1784 Lisp_Object safe_chars;
1785
6b89e3aa 1786 safe_chars = coding_safe_chars (coding->symbol);
bdd9fb48 1787
b73bfc1c
KH
1788 if (NILP (Venable_character_translation))
1789 translation_table = Qnil;
1790 else
1791 {
1792 translation_table = coding->translation_table_for_decode;
1793 if (NILP (translation_table))
1794 translation_table = Vstandard_translation_table_for_decode;
1795 }
4ed46869 1796
b73bfc1c
KH
1797 coding->result = CODING_FINISH_NORMAL;
1798
1799 while (1)
4ed46869 1800 {
b73bfc1c
KH
1801 int c1, c2;
1802
1803 src_base = src;
1804 ONE_MORE_BYTE (c1);
4ed46869 1805
ec6d2bb8 1806 /* We produce no character or one character. */
4ed46869
KH
1807 switch (iso_code_class [c1])
1808 {
1809 case ISO_0x20_or_0x7F:
ec6d2bb8
KH
1810 if (COMPOSING_P (coding) && coding->composition_rule_follows)
1811 {
1812 DECODE_COMPOSITION_RULE (c1);
b73bfc1c 1813 continue;
ec6d2bb8
KH
1814 }
1815 if (charset0 < 0 || CHARSET_CHARS (charset0) == 94)
4ed46869
KH
1816 {
1817 /* This is SPACE or DEL. */
b73bfc1c 1818 charset = CHARSET_ASCII;
4ed46869
KH
1819 break;
1820 }
1821 /* This is a graphic character, we fall down ... */
1822
1823 case ISO_graphic_plane_0:
ec6d2bb8 1824 if (COMPOSING_P (coding) && coding->composition_rule_follows)
b73bfc1c
KH
1825 {
1826 DECODE_COMPOSITION_RULE (c1);
1827 continue;
1828 }
1829 charset = charset0;
4ed46869
KH
1830 break;
1831
1832 case ISO_0xA0_or_0xFF:
d46c5b12
KH
1833 if (charset1 < 0 || CHARSET_CHARS (charset1) == 94
1834 || coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
fb88bf2d 1835 goto label_invalid_code;
4ed46869
KH
1836 /* This is a graphic character, we fall down ... */
1837
1838 case ISO_graphic_plane_1:
b73bfc1c 1839 if (charset1 < 0)
fb88bf2d 1840 goto label_invalid_code;
b73bfc1c 1841 charset = charset1;
4ed46869
KH
1842 break;
1843
b73bfc1c 1844 case ISO_control_0:
ec6d2bb8
KH
1845 if (COMPOSING_P (coding))
1846 DECODE_COMPOSITION_END ('1');
1847
4ed46869
KH
1848 /* All ISO2022 control characters in this class have the
1849 same representation in Emacs internal format. */
d46c5b12
KH
1850 if (c1 == '\n'
1851 && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1852 && (coding->eol_type == CODING_EOL_CR
1853 || coding->eol_type == CODING_EOL_CRLF))
1854 {
b73bfc1c
KH
1855 coding->result = CODING_FINISH_INCONSISTENT_EOL;
1856 goto label_end_of_loop;
d46c5b12 1857 }
b73bfc1c 1858 charset = CHARSET_ASCII;
4ed46869
KH
1859 break;
1860
b73bfc1c
KH
1861 case ISO_control_1:
1862 if (COMPOSING_P (coding))
1863 DECODE_COMPOSITION_END ('1');
1864 goto label_invalid_code;
1865
4ed46869 1866 case ISO_carriage_return:
ec6d2bb8
KH
1867 if (COMPOSING_P (coding))
1868 DECODE_COMPOSITION_END ('1');
1869
4ed46869 1870 if (coding->eol_type == CODING_EOL_CR)
b73bfc1c 1871 c1 = '\n';
4ed46869
KH
1872 else if (coding->eol_type == CODING_EOL_CRLF)
1873 {
1874 ONE_MORE_BYTE (c1);
b73bfc1c 1875 if (c1 != ISO_CODE_LF)
4ed46869
KH
1876 {
1877 src--;
b73bfc1c 1878 c1 = '\r';
4ed46869
KH
1879 }
1880 }
b73bfc1c 1881 charset = CHARSET_ASCII;
4ed46869
KH
1882 break;
1883
1884 case ISO_shift_out:
d46c5b12
KH
1885 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1886 || CODING_SPEC_ISO_DESIGNATION (coding, 1) < 0)
1887 goto label_invalid_code;
4ed46869
KH
1888 CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
1889 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
b73bfc1c 1890 continue;
4ed46869
KH
1891
1892 case ISO_shift_in:
d46c5b12
KH
1893 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
1894 goto label_invalid_code;
4ed46869
KH
1895 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
1896 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
b73bfc1c 1897 continue;
4ed46869
KH
1898
1899 case ISO_single_shift_2_7:
1900 case ISO_single_shift_2:
d46c5b12
KH
1901 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1902 goto label_invalid_code;
4ed46869
KH
1903 /* SS2 is handled as an escape sequence of ESC 'N' */
1904 c1 = 'N';
1905 goto label_escape_sequence;
1906
1907 case ISO_single_shift_3:
d46c5b12
KH
1908 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1909 goto label_invalid_code;
4ed46869
KH
1910 /* SS2 is handled as an escape sequence of ESC 'O' */
1911 c1 = 'O';
1912 goto label_escape_sequence;
1913
1914 case ISO_control_sequence_introducer:
1915 /* CSI is handled as an escape sequence of ESC '[' ... */
1916 c1 = '[';
1917 goto label_escape_sequence;
1918
1919 case ISO_escape:
1920 ONE_MORE_BYTE (c1);
1921 label_escape_sequence:
1922 /* Escape sequences handled by Emacs are invocation,
1923 designation, direction specification, and character
1924 composition specification. */
1925 switch (c1)
1926 {
1927 case '&': /* revision of following character set */
1928 ONE_MORE_BYTE (c1);
1929 if (!(c1 >= '@' && c1 <= '~'))
d46c5b12 1930 goto label_invalid_code;
4ed46869
KH
1931 ONE_MORE_BYTE (c1);
1932 if (c1 != ISO_CODE_ESC)
d46c5b12 1933 goto label_invalid_code;
4ed46869
KH
1934 ONE_MORE_BYTE (c1);
1935 goto label_escape_sequence;
1936
1937 case '$': /* designation of 2-byte character set */
d46c5b12
KH
1938 if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
1939 goto label_invalid_code;
4ed46869
KH
1940 ONE_MORE_BYTE (c1);
1941 if (c1 >= '@' && c1 <= 'B')
1942 { /* designation of JISX0208.1978, GB2312.1980,
88993dfd 1943 or JISX0208.1980 */
4ed46869
KH
1944 DECODE_DESIGNATION (0, 2, 94, c1);
1945 }
1946 else if (c1 >= 0x28 && c1 <= 0x2B)
1947 { /* designation of DIMENSION2_CHARS94 character set */
1948 ONE_MORE_BYTE (c2);
1949 DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
1950 }
1951 else if (c1 >= 0x2C && c1 <= 0x2F)
1952 { /* designation of DIMENSION2_CHARS96 character set */
1953 ONE_MORE_BYTE (c2);
1954 DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
1955 }
1956 else
d46c5b12 1957 goto label_invalid_code;
b73bfc1c
KH
1958 /* We must update these variables now. */
1959 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1960 charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1961 continue;
4ed46869
KH
1962
1963 case 'n': /* invocation of locking-shift-2 */
d46c5b12
KH
1964 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1965 || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
1966 goto label_invalid_code;
4ed46869 1967 CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
e0e989f6 1968 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
b73bfc1c 1969 continue;
4ed46869
KH
1970
1971 case 'o': /* invocation of locking-shift-3 */
d46c5b12
KH
1972 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1973 || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
1974 goto label_invalid_code;
4ed46869 1975 CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
e0e989f6 1976 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
b73bfc1c 1977 continue;
4ed46869
KH
1978
1979 case 'N': /* invocation of single-shift-2 */
d46c5b12
KH
1980 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1981 || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
1982 goto label_invalid_code;
4ed46869 1983 charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
b73bfc1c 1984 ONE_MORE_BYTE (c1);
e7046a18
KH
1985 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
1986 goto label_invalid_code;
4ed46869
KH
1987 break;
1988
1989 case 'O': /* invocation of single-shift-3 */
d46c5b12
KH
1990 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1991 || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
1992 goto label_invalid_code;
4ed46869 1993 charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
b73bfc1c 1994 ONE_MORE_BYTE (c1);
e7046a18
KH
1995 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
1996 goto label_invalid_code;
4ed46869
KH
1997 break;
1998
ec6d2bb8
KH
1999 case '0': case '2': case '3': case '4': /* start composition */
2000 DECODE_COMPOSITION_START (c1);
b73bfc1c 2001 continue;
4ed46869 2002
ec6d2bb8
KH
2003 case '1': /* end composition */
2004 DECODE_COMPOSITION_END (c1);
b73bfc1c 2005 continue;
4ed46869
KH
2006
2007 case '[': /* specification of direction */
d46c5b12
KH
2008 if (coding->flags & CODING_FLAG_ISO_NO_DIRECTION)
2009 goto label_invalid_code;
4ed46869 2010 /* For the moment, nested direction is not supported.
d46c5b12 2011 So, `coding->mode & CODING_MODE_DIRECTION' zero means
8ca3766a 2012 left-to-right, and nonzero means right-to-left. */
4ed46869
KH
2013 ONE_MORE_BYTE (c1);
2014 switch (c1)
2015 {
2016 case ']': /* end of the current direction */
d46c5b12 2017 coding->mode &= ~CODING_MODE_DIRECTION;
4ed46869
KH
2018
2019 case '0': /* end of the current direction */
2020 case '1': /* start of left-to-right direction */
2021 ONE_MORE_BYTE (c1);
2022 if (c1 == ']')
d46c5b12 2023 coding->mode &= ~CODING_MODE_DIRECTION;
4ed46869 2024 else
d46c5b12 2025 goto label_invalid_code;
4ed46869
KH
2026 break;
2027
2028 case '2': /* start of right-to-left direction */
2029 ONE_MORE_BYTE (c1);
2030 if (c1 == ']')
d46c5b12 2031 coding->mode |= CODING_MODE_DIRECTION;
4ed46869 2032 else
d46c5b12 2033 goto label_invalid_code;
4ed46869
KH
2034 break;
2035
2036 default:
d46c5b12 2037 goto label_invalid_code;
4ed46869 2038 }
b73bfc1c 2039 continue;
4ed46869
KH
2040
2041 default:
d46c5b12
KH
2042 if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
2043 goto label_invalid_code;
4ed46869
KH
2044 if (c1 >= 0x28 && c1 <= 0x2B)
2045 { /* designation of DIMENSION1_CHARS94 character set */
2046 ONE_MORE_BYTE (c2);
2047 DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
2048 }
2049 else if (c1 >= 0x2C && c1 <= 0x2F)
2050 { /* designation of DIMENSION1_CHARS96 character set */
2051 ONE_MORE_BYTE (c2);
2052 DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
2053 }
2054 else
b73bfc1c
KH
2055 goto label_invalid_code;
2056 /* We must update these variables now. */
2057 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
2058 charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
2059 continue;
4ed46869 2060 }
b73bfc1c 2061 }
4ed46869 2062
b73bfc1c
KH
2063 /* Now we know CHARSET and 1st position code C1 of a character.
2064 Produce a multibyte sequence for that character while getting
2065 2nd position code C2 if necessary. */
2066 if (CHARSET_DIMENSION (charset) == 2)
2067 {
2068 ONE_MORE_BYTE (c2);
2069 if (c1 < 0x80 ? c2 < 0x20 || c2 >= 0x80 : c2 < 0xA0)
2070 /* C2 is not in a valid range. */
2071 goto label_invalid_code;
4ed46869 2072 }
b73bfc1c
KH
2073 c = DECODE_ISO_CHARACTER (charset, c1, c2);
2074 EMIT_CHAR (c);
4ed46869
KH
2075 continue;
2076
b73bfc1c
KH
2077 label_invalid_code:
2078 coding->errors++;
2079 if (COMPOSING_P (coding))
2080 DECODE_COMPOSITION_END ('1');
4ed46869 2081 src = src_base;
b73bfc1c
KH
2082 c = *src++;
2083 EMIT_CHAR (c);
4ed46869 2084 }
fb88bf2d 2085
b73bfc1c
KH
2086 label_end_of_loop:
2087 coding->consumed = coding->consumed_char = src_base - source;
d46c5b12 2088 coding->produced = dst - destination;
b73bfc1c 2089 return;
4ed46869
KH
2090}
2091
b73bfc1c 2092
f4dee582 2093/* ISO2022 encoding stuff. */
4ed46869
KH
2094
2095/*
f4dee582 2096 It is not enough to say just "ISO2022" on encoding, we have to
cfb43547 2097 specify more details. In Emacs, each ISO2022 coding system
4ed46869 2098 variant has the following specifications:
8ca3766a 2099 1. Initial designation to G0 through G3.
4ed46869
KH
2100 2. Allows short-form designation?
2101 3. ASCII should be designated to G0 before control characters?
2102 4. ASCII should be designated to G0 at end of line?
2103 5. 7-bit environment or 8-bit environment?
2104 6. Use locking-shift?
2105 7. Use Single-shift?
2106 And the following two are only for Japanese:
2107 8. Use ASCII in place of JIS0201-1976-Roman?
2108 9. Use JISX0208-1983 in place of JISX0208-1978?
2109 These specifications are encoded in `coding->flags' as flag bits
2110 defined by macros CODING_FLAG_ISO_XXX. See `coding.h' for more
f4dee582 2111 details.
4ed46869
KH
2112*/
2113
/* Write at DST the escape sequence that designates CHARSET to graphic
   register REG, advance DST, and record the designation in CODING.

   The sequence is: an optional revision prefix `ESC & <rev>' (only
   when the revision number is below 255), then ESC, then `$' for a
   charset whose dimension is not 1, then an intermediate character
   selected by REG and by whether the charset has 94 characters, then
   the charset's final character.  For a multi-dimension 94-charset
   the intermediate character is omitted (short form) when CODING
   allows it, REG is 0, and the final character is '@', 'A', or 'B'.  */

#define ENCODE_DESIGNATION(charset, reg, coding)                        \
  do {                                                                  \
    unsigned char final_char = CHARSET_ISO_FINAL_CHAR (charset);        \
    char *intermediate_char_94 = "()*+";                                \
    char *intermediate_char_96 = ",-./";                                \
    int revision = CODING_SPEC_ISO_REVISION_NUMBER(coding, charset);    \
                                                                        \
    if (revision < 255)                                                 \
      {                                                                 \
        *dst++ = ISO_CODE_ESC;                                          \
        *dst++ = '&';                                                   \
        *dst++ = '@' + revision;                                        \
      }                                                                 \
    *dst++ = ISO_CODE_ESC;                                              \
    if (CHARSET_DIMENSION (charset) != 1)                               \
      *dst++ = '$';                                                     \
    if (CHARSET_CHARS (charset) != 94)                                  \
      *dst++ = (unsigned char) (intermediate_char_96[reg]);             \
    else if (CHARSET_DIMENSION (charset) == 1                           \
             || ! (coding->flags & CODING_FLAG_ISO_SHORT_FORM)          \
             || reg != 0                                                \
             || final_char < '@' || final_char > 'B')                   \
      *dst++ = (unsigned char) (intermediate_char_94[reg]);             \
    *dst++ = final_char;                                                \
    CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;                \
  } while (0)
2156
/* Emit the ISO2022 single-shift-2 / single-shift-3 function at DST
   and flag CODING as being in single-shifting state.  When the
   seven-bit flag is set the two-byte escape form (ESC N / ESC O) is
   written; otherwise the single control code is written.  */

#define ENCODE_SINGLE_SHIFT_2                                   \
  do {                                                          \
    if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS))         \
      *dst++ = ISO_CODE_SS2;                                    \
    else                                                        \
      {                                                         \
        *dst++ = ISO_CODE_ESC;                                  \
        *dst++ = 'N';                                           \
      }                                                         \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;               \
  } while (0)

#define ENCODE_SINGLE_SHIFT_3                                   \
  do {                                                          \
    if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS))         \
      *dst++ = ISO_CODE_SS3;                                    \
    else                                                        \
      {                                                         \
        *dst++ = ISO_CODE_ESC;                                  \
        *dst++ = 'O';                                           \
      }                                                         \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;               \
  } while (0)
2178
/* Emit one of the four ISO2022 locking-shift functions at DST
   (shift-in, shift-out, locking-shift-2, locking-shift-3) and record
   in CODING which graphic register (0, 1, 2, or 3 respectively) is
   now invoked to graphic plane 0.  */

#define ENCODE_SHIFT_IN                                 \
  do {                                                  \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;         \
    *dst++ = ISO_CODE_SI;                               \
  } while (0)

#define ENCODE_SHIFT_OUT                                \
  do {                                                  \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;         \
    *dst++ = ISO_CODE_SO;                               \
  } while (0)

#define ENCODE_LOCKING_SHIFT_2                          \
  do {                                                  \
    *dst++ = ISO_CODE_ESC;                              \
    *dst++ = 'n';                                       \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;         \
  } while (0)

#define ENCODE_LOCKING_SHIFT_3                          \
  do {                                                  \
    *dst++ = ISO_CODE_ESC;                              \
    *dst++ = 'o';                                       \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;         \
  } while (0)
2206
f4dee582
RS
/* Emit the single position code C1 of a DIMENSION1 character of
   CHARSET at DST.

   The body is a loop: if a single shift is pending, or CHARSET is
   currently invoked to graphic plane 0 or 1, the byte is written
   (with the high bit cleared or set as appropriate) and the loop is
   left.  Otherwise encode_invocation_designation is called to emit
   the needed designation/invocation, and the checks are repeated.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
  do {                                                                  \
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
      {                                                                 \
        /* The byte following a single shift.  */                       \
        *dst++ = ((coding->flags & CODING_FLAG_ISO_SEVEN_BITS)          \
                  ? (c1 & 0x7F) : (c1 | 0x80));                         \
        CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
        break;                                                          \
      }                                                                 \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))           \
      {                                                                 \
        *dst++ = c1 & 0x7F;                                             \
        break;                                                          \
      }                                                                 \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))           \
      {                                                                 \
        *dst++ = c1 | 0x80;                                             \
        break;                                                          \
      }                                                                 \
    /* CHARSET is not invoked to any graphic plane yet: invoke it     \
       (designating it to a register first if needed), then go round  \
       again to actually produce the character.  */                    \
    dst = encode_invocation_designation (charset, coding, dst);         \
  } while (1)
2239
f4dee582
RS
/* Emit the two position codes C1 and C2 of a DIMENSION2 character of
   CHARSET at DST.

   The body is a loop: if a single shift is pending, or CHARSET is
   currently invoked to graphic plane 0 or 1, both bytes are written
   (with the high bits cleared or set as appropriate) and the loop is
   left.  Otherwise encode_invocation_designation is called to emit
   the needed designation/invocation, and the checks are repeated.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
  do {                                                                  \
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
      {                                                                 \
        /* The bytes following a single shift.  */                      \
        if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                 \
          {                                                             \
            *dst++ = c1 & 0x7F;                                         \
            *dst++ = c2 & 0x7F;                                         \
          }                                                             \
        else                                                            \
          {                                                             \
            *dst++ = c1 | 0x80;                                         \
            *dst++ = c2 | 0x80;                                         \
          }                                                             \
        CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
        break;                                                          \
      }                                                                 \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))           \
      {                                                                 \
        *dst++ = c1 & 0x7F;                                             \
        *dst++ = c2 & 0x7F;                                             \
        break;                                                          \
      }                                                                 \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))           \
      {                                                                 \
        *dst++ = c1 | 0x80;                                             \
        *dst++ = c2 | 0x80;                                             \
        break;                                                          \
      }                                                                 \
    /* CHARSET is not invoked to any graphic plane yet: invoke it     \
       (designating it to a register first if needed), then go round  \
       again to actually produce the character.  */                    \
    dst = encode_invocation_designation (charset, coding, dst);         \
  } while (1)
2272
05e6f5dc
KH
/* Encode character C at DST.  C is split into its charset and
   position codes.  If the charset is not defined, the position codes
   are written raw (the second one only when it is non-negative).
   Otherwise the character goes through the DIMENSION1 or DIMENSION2
   encoder, after the two Japanese substitutions selected by
   coding->flags: latin-jisx0201 in place of ASCII, and jisx0208-1978
   in place of jisx0208.  */

#define ENCODE_ISO_CHARACTER(c)                                         \
  do {                                                                  \
    int charset, c1, c2;                                                \
                                                                        \
    SPLIT_CHAR (c, charset, c1, c2);                                    \
    if (! CHARSET_DEFINED_P (charset))                                  \
      {                                                                 \
        *dst++ = c1;                                                    \
        if (c2 >= 0)                                                    \
          *dst++ = c2;                                                  \
      }                                                                 \
    else if (CHARSET_DIMENSION (charset) == 1)                          \
      {                                                                 \
        if (charset == CHARSET_ASCII                                    \
            && (coding->flags & CODING_FLAG_ISO_USE_ROMAN))             \
          charset = charset_latin_jisx0201;                             \
        ENCODE_ISO_CHARACTER_DIMENSION1 (charset, c1);                  \
      }                                                                 \
    else                                                                \
      {                                                                 \
        if (charset == charset_jisx0208                                 \
            && (coding->flags & CODING_FLAG_ISO_USE_OLDJIS))            \
          charset = charset_jisx0208_1978;                              \
        ENCODE_ISO_CHARACTER_DIMENSION2 (charset, c1, c2);              \
      }                                                                 \
  } while (0)
2302
2303
/* Encode the substitution character
   CODING_INHIBIT_CHARACTER_SUBSTITUTION in place of character C: once
   always, and a second time when the charset of C has a width greater
   than one column.  */

#define ENCODE_UNSAFE_CHARACTER(c)                                        \
  do {                                                                    \
    ENCODE_ISO_CHARACTER (CODING_INHIBIT_CHARACTER_SUBSTITUTION);         \
    if (CHARSET_WIDTH (CHAR_CHARSET (c)) >= 2)                            \
      {                                                                   \
        ENCODE_ISO_CHARACTER (CODING_INHIBIT_CHARACTER_SUBSTITUTION);     \
      }                                                                   \
  } while (0)
bdd9fb48 2312
05e6f5dc 2313
4ed46869
KH
2314/* Produce designation and invocation codes at a place pointed by DST
2315 to use CHARSET. The element `spec.iso2022' of *CODING is updated.
2316 Return new DST. */
2317
2318unsigned char *
2319encode_invocation_designation (charset, coding, dst)
2320 int charset;
2321 struct coding_system *coding;
2322 unsigned char *dst;
2323{
2324 int reg; /* graphic register number */
2325
2326 /* At first, check designations. */
2327 for (reg = 0; reg < 4; reg++)
2328 if (charset == CODING_SPEC_ISO_DESIGNATION (coding, reg))
2329 break;
2330
2331 if (reg >= 4)
2332 {
2333 /* CHARSET is not yet designated to any graphic registers. */
2334 /* At first check the requested designation. */
2335 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1ba9e4ab
KH
2336 if (reg == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
2337 /* Since CHARSET requests no special designation, designate it
2338 to graphic register 0. */
4ed46869
KH
2339 reg = 0;
2340
2341 ENCODE_DESIGNATION (charset, reg, coding);
2342 }
2343
2344 if (CODING_SPEC_ISO_INVOCATION (coding, 0) != reg
2345 && CODING_SPEC_ISO_INVOCATION (coding, 1) != reg)
2346 {
2347 /* Since the graphic register REG is not invoked to any graphic
2348 planes, invoke it to graphic plane 0. */
2349 switch (reg)
2350 {
2351 case 0: /* graphic register 0 */
2352 ENCODE_SHIFT_IN;
2353 break;
2354
2355 case 1: /* graphic register 1 */
2356 ENCODE_SHIFT_OUT;
2357 break;
2358
2359 case 2: /* graphic register 2 */
2360 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2361 ENCODE_SINGLE_SHIFT_2;
2362 else
2363 ENCODE_LOCKING_SHIFT_2;
2364 break;
2365
2366 case 3: /* graphic register 3 */
2367 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2368 ENCODE_SINGLE_SHIFT_3;
2369 else
2370 ENCODE_LOCKING_SHIFT_3;
2371 break;
2372 }
2373 }
b73bfc1c 2374
4ed46869
KH
2375 return dst;
2376}
2377
ec6d2bb8
KH
/* Write the two-byte code for encoded composition rule RULE at DST:
   first the global reference offset by 32 + 81, then the new
   reference offset by 32.  */

#define ENCODE_COMPOSITION_RULE(rule)           \
  do {                                          \
    int gref, nref;                             \
    COMPOSITION_DECODE_RULE (rule, gref, nref); \
    *dst++ = (32 + 81) + gref;                  \
    *dst++ = 32 + nref;                         \
  } while (0)
2387
/* Write the escape sequence that starts a composition (ESC 0, ESC 3,
   or ESC 4) at DST.  DATA points to an array of integers describing
   the composition (see the comment in coding.h for its format); the
   composition method is taken from DATA[3] and stored in
   coding->composing.  For any method other than COMPOSITION_RELATIVE,
   coding->cmp_data_index is also positioned four entries past
   coding->cmp_data_start and composition_rule_follows is cleared.  */

#define ENCODE_COMPOSITION_START(coding, data)                          \
  do {                                                                  \
    coding->composing = data[3];                                        \
    *dst++ = ISO_CODE_ESC;                                              \
    if (coding->composing != COMPOSITION_RELATIVE)                      \
      {                                                                 \
        if (coding->composing == COMPOSITION_WITH_ALTCHARS)             \
          *dst++ = '3';                                                 \
        else                                                            \
          *dst++ = '4';                                                 \
        coding->cmp_data_index = coding->cmp_data_start + 4;            \
        coding->composition_rule_follows = 0;                           \
      }                                                                 \
    else                                                                \
      *dst++ = '0';                                                     \
  } while (0)
2407
/* Write ESC 1, the end of the current composition, at DST and leave
   composing state.  coding->cmp_data_start is advanced by DATA[0];
   when that reaches the used length of the current composition data
   block and another block follows, switch to the next block and
   restart at its beginning.  */

#define ENCODE_COMPOSITION_END(coding, data)                            \
  do {                                                                  \
    coding->composing = COMPOSITION_NO;                                 \
    *dst++ = ISO_CODE_ESC;                                              \
    *dst++ = '1';                                                       \
    coding->cmp_data_start += data[0];                                  \
    if (coding->cmp_data_start == coding->cmp_data->used)               \
      {                                                                 \
        if (coding->cmp_data->next)                                     \
          {                                                             \
            coding->cmp_data = coding->cmp_data->next;                  \
            coding->cmp_data_start = 0;                                 \
          }                                                             \
      }                                                                 \
  } while (0)
2423
/* Write the composition start sequence ESC 0 at DST and switch
   coding->composing to COMPOSITION_RELATIVE.  Here the sequence does
   not open a new composition: it marks that the components (alternate
   chars and composition rules) of the current composition have just
   been produced and that the actual text follows in SRC.  */

#define ENCODE_COMPOSITION_FAKE_START(coding)   \
  do {                                          \
    coding->composing = COMPOSITION_RELATIVE;   \
    *dst++ = ISO_CODE_ESC;                      \
    *dst++ = '0';                               \
  } while (0)
4ed46869
KH
2435
/* The following three macros produce codes for indicating direction
   of text.  Each one is a complete statement that writes at DST.

   ENCODE_CONTROL_SEQUENCE_INTRODUCER writes the two-byte form ESC [
   when the seven-bit flag is set in coding->flags, and the single
   byte ISO_CODE_CSI otherwise.  The flag is tested with `&', like
   every other CODING_FLAG_ISO_SEVEN_BITS check in this file, so that
   other flag bits being set does not select the 8-bit form.

   ENCODE_DIRECTION_R2L and ENCODE_DIRECTION_L2R write the introducer
   followed by `2 ]' and `0 ]' respectively.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER              \
  do {                                                  \
    if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)     \
      {                                                 \
        *dst++ = ISO_CODE_ESC;                          \
        *dst++ = '[';                                   \
      }                                                 \
    else                                                \
      *dst++ = ISO_CODE_CSI;                            \
  } while (0)

#define ENCODE_DIRECTION_R2L                    \
  do {                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
    *dst++ = '2';                               \
    *dst++ = ']';                               \
  } while (0)

#define ENCODE_DIRECTION_L2R                    \
  do {                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
    *dst++ = '0';                               \
    *dst++ = ']';                               \
  } while (0)
4ed46869
KH
2451
/* Write at DST the codes that bring the graphic planes and registers
   back to their initial state: a shift-in when graphic plane 0 does
   not currently invoke register 0, then a designation for every
   register whose initial designation is non-negative and differs from
   its current designation.  */
#define ENCODE_RESET_PLANE_AND_REGISTER                                     \
  do {                                                                      \
    int reg = 0;                                                            \
    if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0)                        \
      ENCODE_SHIFT_IN;                                                      \
    while (reg < 4)                                                         \
      {                                                                     \
        if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg) >= 0          \
            && (CODING_SPEC_ISO_DESIGNATION (coding, reg)                   \
                != CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg)))      \
          ENCODE_DESIGNATION                                                \
            (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg), reg, coding); \
        reg++;                                                              \
      }                                                                     \
  } while (0)
2466
bdd9fb48 2467/* Produce designation sequences of charsets in the line started from
b73bfc1c 2468 SRC to a place pointed by DST, and return updated DST.
bdd9fb48
KH
2469
2470 If the current block ends before any end-of-line, we may fail to
d46c5b12
KH
2471 find all the necessary designations. */
2472
b73bfc1c
KH
2473static unsigned char *
2474encode_designation_at_bol (coding, translation_table, src, src_end, dst)
e0e989f6 2475 struct coding_system *coding;
b73bfc1c
KH
2476 Lisp_Object translation_table;
2477 unsigned char *src, *src_end, *dst;
e0e989f6 2478{
bdd9fb48
KH
2479 int charset, c, found = 0, reg;
2480 /* Table of charsets to be designated to each graphic register. */
2481 int r[4];
bdd9fb48
KH
2482
2483 for (reg = 0; reg < 4; reg++)
2484 r[reg] = -1;
2485
b73bfc1c 2486 while (found < 4)
e0e989f6 2487 {
b73bfc1c
KH
2488 ONE_MORE_CHAR (c);
2489 if (c == '\n')
2490 break;
93dec019 2491
b73bfc1c 2492 charset = CHAR_CHARSET (c);
e0e989f6 2493 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
d46c5b12 2494 if (reg != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION && r[reg] < 0)
bdd9fb48
KH
2495 {
2496 found++;
2497 r[reg] = charset;
2498 }
bdd9fb48
KH
2499 }
2500
b73bfc1c 2501 label_end_of_loop:
bdd9fb48
KH
2502 if (found)
2503 {
2504 for (reg = 0; reg < 4; reg++)
2505 if (r[reg] >= 0
2506 && CODING_SPEC_ISO_DESIGNATION (coding, reg) != r[reg])
2507 ENCODE_DESIGNATION (r[reg], reg, coding);
e0e989f6 2508 }
b73bfc1c
KH
2509
2510 return dst;
e0e989f6
KH
2511}
2512
4ed46869
KH
2513/* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
2514
b73bfc1c 2515static void
d46c5b12 2516encode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
4ed46869
KH
2517 struct coding_system *coding;
2518 unsigned char *source, *destination;
2519 int src_bytes, dst_bytes;
4ed46869
KH
2520{
2521 unsigned char *src = source;
2522 unsigned char *src_end = source + src_bytes;
2523 unsigned char *dst = destination;
2524 unsigned char *dst_end = destination + dst_bytes;
b73bfc1c 2525 /* Since the maximum bytes produced by each loop is 20, we subtract 19
4ed46869
KH
2526 from DST_END to assure overflow checking is necessary only at the
2527 head of loop. */
b73bfc1c
KH
2528 unsigned char *adjusted_dst_end = dst_end - 19;
2529 /* SRC_BASE remembers the start position in source in each loop.
2530 The loop will be exited when there's not enough source text to
2531 analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
2532 there's not enough destination area to produce encoded codes
2533 (within macro EMIT_BYTES). */
2534 unsigned char *src_base;
2535 int c;
2536 Lisp_Object translation_table;
05e6f5dc
KH
2537 Lisp_Object safe_chars;
2538
6b89e3aa 2539 safe_chars = coding_safe_chars (coding->symbol);
bdd9fb48 2540
b73bfc1c
KH
2541 if (NILP (Venable_character_translation))
2542 translation_table = Qnil;
2543 else
2544 {
2545 translation_table = coding->translation_table_for_encode;
2546 if (NILP (translation_table))
2547 translation_table = Vstandard_translation_table_for_encode;
2548 }
4ed46869 2549
d46c5b12 2550 coding->consumed_char = 0;
b73bfc1c
KH
2551 coding->errors = 0;
2552 while (1)
4ed46869 2553 {
b73bfc1c
KH
2554 src_base = src;
2555
2556 if (dst >= (dst_bytes ? adjusted_dst_end : (src - 19)))
2557 {
2558 coding->result = CODING_FINISH_INSUFFICIENT_DST;
2559 break;
2560 }
4ed46869 2561
e0e989f6
KH
2562 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL
2563 && CODING_SPEC_ISO_BOL (coding))
2564 {
bdd9fb48 2565 /* We have to produce designation sequences if any now. */
b73bfc1c
KH
2566 dst = encode_designation_at_bol (coding, translation_table,
2567 src, src_end, dst);
e0e989f6
KH
2568 CODING_SPEC_ISO_BOL (coding) = 0;
2569 }
2570
ec6d2bb8
KH
2571 /* Check composition start and end. */
2572 if (coding->composing != COMPOSITION_DISABLED
2573 && coding->cmp_data_start < coding->cmp_data->used)
4ed46869 2574 {
ec6d2bb8
KH
2575 struct composition_data *cmp_data = coding->cmp_data;
2576 int *data = cmp_data->data + coding->cmp_data_start;
2577 int this_pos = cmp_data->char_offset + coding->consumed_char;
2578
2579 if (coding->composing == COMPOSITION_RELATIVE)
4ed46869 2580 {
ec6d2bb8
KH
2581 if (this_pos == data[2])
2582 {
2583 ENCODE_COMPOSITION_END (coding, data);
2584 cmp_data = coding->cmp_data;
2585 data = cmp_data->data + coding->cmp_data_start;
2586 }
4ed46869 2587 }
ec6d2bb8 2588 else if (COMPOSING_P (coding))
4ed46869 2589 {
ec6d2bb8
KH
2590 /* COMPOSITION_WITH_ALTCHARS or COMPOSITION_WITH_RULE_ALTCHAR */
2591 if (coding->cmp_data_index == coding->cmp_data_start + data[0])
2592 /* We have consumed components of the composition.
8ca3766a 2593 What follows in SRC is the composition's base
ec6d2bb8
KH
2594 text. */
2595 ENCODE_COMPOSITION_FAKE_START (coding);
2596 else
4ed46869 2597 {
ec6d2bb8
KH
2598 int c = cmp_data->data[coding->cmp_data_index++];
2599 if (coding->composition_rule_follows)
2600 {
2601 ENCODE_COMPOSITION_RULE (c);
2602 coding->composition_rule_follows = 0;
2603 }
2604 else
2605 {
05e6f5dc
KH
2606 if (coding->flags & CODING_FLAG_ISO_SAFE
2607 && ! CODING_SAFE_CHAR_P (safe_chars, c))
2608 ENCODE_UNSAFE_CHARACTER (c);
2609 else
2610 ENCODE_ISO_CHARACTER (c);
ec6d2bb8
KH
2611 if (coding->composing == COMPOSITION_WITH_RULE_ALTCHARS)
2612 coding->composition_rule_follows = 1;
2613 }
4ed46869
KH
2614 continue;
2615 }
ec6d2bb8
KH
2616 }
2617 if (!COMPOSING_P (coding))
2618 {
2619 if (this_pos == data[1])
4ed46869 2620 {
ec6d2bb8
KH
2621 ENCODE_COMPOSITION_START (coding, data);
2622 continue;
4ed46869 2623 }
4ed46869
KH
2624 }
2625 }
ec6d2bb8 2626
b73bfc1c 2627 ONE_MORE_CHAR (c);
4ed46869 2628
b73bfc1c
KH
2629 /* Now encode the character C. */
2630 if (c < 0x20 || c == 0x7F)
2631 {
2632 if (c == '\r')
19a8d9e0 2633 {
b73bfc1c
KH
2634 if (! (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
2635 {
2636 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
2637 ENCODE_RESET_PLANE_AND_REGISTER;
2638 *dst++ = c;
2639 continue;
2640 }
2641 /* fall down to treat '\r' as '\n' ... */
2642 c = '\n';
19a8d9e0 2643 }
b73bfc1c 2644 if (c == '\n')
19a8d9e0 2645 {
b73bfc1c
KH
2646 if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
2647 ENCODE_RESET_PLANE_AND_REGISTER;
2648 if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL)
2649 bcopy (coding->spec.iso2022.initial_designation,
2650 coding->spec.iso2022.current_designation,
2651 sizeof coding->spec.iso2022.initial_designation);
2652 if (coding->eol_type == CODING_EOL_LF
2653 || coding->eol_type == CODING_EOL_UNDECIDED)
2654 *dst++ = ISO_CODE_LF;
2655 else if (coding->eol_type == CODING_EOL_CRLF)
2656 *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
2657 else
2658 *dst++ = ISO_CODE_CR;
2659 CODING_SPEC_ISO_BOL (coding) = 1;
19a8d9e0 2660 }
93dec019 2661 else
19a8d9e0 2662 {
b73bfc1c
KH
2663 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
2664 ENCODE_RESET_PLANE_AND_REGISTER;
2665 *dst++ = c;
19a8d9e0 2666 }
4ed46869 2667 }
b73bfc1c 2668 else if (ASCII_BYTE_P (c))
05e6f5dc 2669 ENCODE_ISO_CHARACTER (c);
b73bfc1c 2670 else if (SINGLE_BYTE_CHAR_P (c))
88993dfd 2671 {
b73bfc1c
KH
2672 *dst++ = c;
2673 coding->errors++;
88993dfd 2674 }
05e6f5dc
KH
2675 else if (coding->flags & CODING_FLAG_ISO_SAFE
2676 && ! CODING_SAFE_CHAR_P (safe_chars, c))
2677 ENCODE_UNSAFE_CHARACTER (c);
b73bfc1c 2678 else
05e6f5dc 2679 ENCODE_ISO_CHARACTER (c);
b73bfc1c
KH
2680
2681 coding->consumed_char++;
84fbb8a0 2682 }
b73bfc1c
KH
2683
2684 label_end_of_loop:
2685 coding->consumed = src_base - source;
d46c5b12 2686 coding->produced = coding->produced_char = dst - destination;
4ed46869
KH
2687}
2688
2689\f
2690/*** 4. SJIS and BIG5 handlers ***/
2691
cfb43547 2692/* Although SJIS and BIG5 are not ISO coding systems, they are used
4ed46869
KH
2693 quite widely. So, for the moment, Emacs supports them in the bare
2694 C code. But, in the future, they may be supported only by CCL. */
2695
2696/* SJIS is a coding system encoding three character sets: ASCII, right
2697 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
2698 as is. A character of charset katakana-jisx0201 is encoded by
2699 "position-code + 0x80". A character of charset japanese-jisx0208
2700 is encoded in 2-byte but two position-codes are divided and shifted
cfb43547 2701 so that it fits in the range below.
4ed46869
KH
2702
2703 --- CODE RANGE of SJIS ---
2704 (character set) (range)
2705 ASCII 0x00 .. 0x7F
682169fe 2706 KATAKANA-JISX0201 0xA1 .. 0xDF
c28a9453 2707 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF
d14d03ac 2708 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
4ed46869
KH
2709 -------------------------------
2710
2711*/
2712
2713/* BIG5 is a coding system encoding two character sets: ASCII and
2714 Big5. An ASCII character is encoded as is. Big5 is a two-byte
cfb43547 2715 character set and is encoded in two bytes.
4ed46869
KH
2716
2717 --- CODE RANGE of BIG5 ---
2718 (character set) (range)
2719 ASCII 0x00 .. 0x7F
2720 Big5 (1st byte) 0xA1 .. 0xFE
2721 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
2722 --------------------------
2723
2724 Since the number of characters in Big5 is larger than maximum
2725 characters in Emacs' charset (96x96), it can't be handled as one
2726 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
2727 and `charset-big5-2'. Both are DIMENSION2 and CHARS94. The former
2728 contains frequently used characters and the latter contains less
2729 frequently used characters. */
2730
2731/* Macros to decode or encode a character of Big5 in BIG5. B1 and B2
2732 are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
f458a8e0 2733 C1 and C2 are the 1st and 2nd position-codes of Emacs' internal
4ed46869
KH
2734 format. CHARSET is `charset_big5_1' or `charset_big5_2'. */
2735
2736/* Number of Big5 characters which have the same code in 1st byte. */
#define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)  /* 94 + 63 = 157 */

/* Convert the BIG5 byte pair (B1, B2) into CHARSET and the
   position-codes (C1, C2).  Every macro parameter is parenthesized so
   that expression arguments (e.g. `x | 0x80') are evaluated as a
   unit.  B1 and B2 are fully consumed before C1 and C2 are stored, so
   a caller may pass the same variables for both pairs.  */
#define DECODE_BIG5(b1, b2, charset, c1, c2)                            \
  do {                                                                  \
    unsigned int temp                                                   \
      = ((b1) - 0xA1) * BIG5_SAME_ROW                                   \
        + (b2) - ((b2) < 0x7F ? 0x40 : 0x62);                           \
    if ((b1) < 0xC9)                                                    \
      (charset) = charset_big5_1;                                       \
    else                                                                \
      {                                                                 \
        (charset) = charset_big5_2;                                     \
        temp -= (0xC9 - 0xA1) * BIG5_SAME_ROW;                          \
      }                                                                 \
    (c1) = temp / (0xFF - 0xA1) + 0x21;                                 \
    (c2) = temp % (0xFF - 0xA1) + 0x21;                                 \
  } while (0)

/* Inverse of DECODE_BIG5: convert CHARSET and the position-codes
   (C1, C2) into the BIG5 byte pair (B1, B2).  C1 and C2 are fully
   consumed before B1 and B2 are stored, so the same variables may be
   passed for both pairs.  */
#define ENCODE_BIG5(charset, c1, c2, b1, b2)                            \
  do {                                                                  \
    unsigned int temp = ((c1) - 0x21) * (0xFF - 0xA1) + ((c2) - 0x21);  \
    if ((charset) == charset_big5_2)                                    \
      temp += BIG5_SAME_ROW * (0xC9 - 0xA1);                            \
    (b1) = temp / BIG5_SAME_ROW + 0xA1;                                 \
    (b2) = temp % BIG5_SAME_ROW;                                        \
    /* Offsets 0..62 map to 0x40..0x7E, 63..156 map to 0xA1..0xFE.  */  \
    (b2) += (b2) < 0x3F ? 0x40 : 0x62;                                  \
  } while (0)
2763
2764/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2765 Check if a text is encoded in SJIS. If it is, return
2766 CODING_CATEGORY_MASK_SJIS, else return 0. */
2767
0a28aafb
KH
2768static int
2769detect_coding_sjis (src, src_end, multibytep)
4ed46869 2770 unsigned char *src, *src_end;
0a28aafb 2771 int multibytep;
4ed46869 2772{
b73bfc1c
KH
2773 int c;
2774 /* Dummy for ONE_MORE_BYTE. */
2775 struct coding_system dummy_coding;
2776 struct coding_system *coding = &dummy_coding;
4ed46869 2777
b73bfc1c 2778 while (1)
4ed46869 2779 {
0a28aafb 2780 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
682169fe
KH
2781 if (c < 0x80)
2782 continue;
2783 if (c == 0x80 || c == 0xA0 || c > 0xEF)
2784 return 0;
2785 if (c <= 0x9F || c >= 0xE0)
4ed46869 2786 {
682169fe
KH
2787 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2788 if (c < 0x40 || c == 0x7F || c > 0xFC)
4ed46869
KH
2789 return 0;
2790 }
2791 }
b73bfc1c 2792 label_end_of_loop:
4ed46869
KH
2793 return CODING_CATEGORY_MASK_SJIS;
2794}
2795
2796/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2797 Check if a text is encoded in BIG5. If it is, return
2798 CODING_CATEGORY_MASK_BIG5, else return 0. */
2799
0a28aafb
KH
2800static int
2801detect_coding_big5 (src, src_end, multibytep)
4ed46869 2802 unsigned char *src, *src_end;
0a28aafb 2803 int multibytep;
4ed46869 2804{
b73bfc1c
KH
2805 int c;
2806 /* Dummy for ONE_MORE_BYTE. */
2807 struct coding_system dummy_coding;
2808 struct coding_system *coding = &dummy_coding;
4ed46869 2809
b73bfc1c 2810 while (1)
4ed46869 2811 {
0a28aafb 2812 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
682169fe
KH
2813 if (c < 0x80)
2814 continue;
2815 if (c < 0xA1 || c > 0xFE)
2816 return 0;
2817 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2818 if (c < 0x40 || (c > 0x7F && c < 0xA1) || c > 0xFE)
2819 return 0;
4ed46869 2820 }
b73bfc1c 2821 label_end_of_loop:
4ed46869
KH
2822 return CODING_CATEGORY_MASK_BIG5;
2823}
2824
fa42c37f
KH
2825/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2826 Check if a text is encoded in UTF-8. If it is, return
2827 CODING_CATEGORY_MASK_UTF_8, else return 0. */
2828
/* Nonzero if the bits of C selected by MASK are equal to BITS.  */
#define UTF_8_PREFIX_P(c, mask, bits) (((c) & (mask)) == (bits))

/* 0xxxxxxx: a complete one-octet sequence.  */
#define UTF_8_1_OCTET_P(c)         ((c) < 0x80)
/* 10xxxxxx: a continuation octet.  */
#define UTF_8_EXTRA_OCTET_P(c)     UTF_8_PREFIX_P (c, 0xC0, 0x80)
/* 110xxxxx .. 1111110x: leading octet of a 2- to 6-octet sequence.  */
#define UTF_8_2_OCTET_LEADING_P(c) UTF_8_PREFIX_P (c, 0xE0, 0xC0)
#define UTF_8_3_OCTET_LEADING_P(c) UTF_8_PREFIX_P (c, 0xF0, 0xE0)
#define UTF_8_4_OCTET_LEADING_P(c) UTF_8_PREFIX_P (c, 0xF8, 0xF0)
#define UTF_8_5_OCTET_LEADING_P(c) UTF_8_PREFIX_P (c, 0xFC, 0xF8)
#define UTF_8_6_OCTET_LEADING_P(c) UTF_8_PREFIX_P (c, 0xFE, 0xFC)
2836
0a28aafb
KH
2837static int
2838detect_coding_utf_8 (src, src_end, multibytep)
fa42c37f 2839 unsigned char *src, *src_end;
0a28aafb 2840 int multibytep;
fa42c37f
KH
2841{
2842 unsigned char c;
2843 int seq_maybe_bytes;
b73bfc1c
KH
2844 /* Dummy for ONE_MORE_BYTE. */
2845 struct coding_system dummy_coding;
2846 struct coding_system *coding = &dummy_coding;
fa42c37f 2847
b73bfc1c 2848 while (1)
fa42c37f 2849 {
0a28aafb 2850 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
fa42c37f
KH
2851 if (UTF_8_1_OCTET_P (c))
2852 continue;
2853 else if (UTF_8_2_OCTET_LEADING_P (c))
2854 seq_maybe_bytes = 1;
2855 else if (UTF_8_3_OCTET_LEADING_P (c))
2856 seq_maybe_bytes = 2;
2857 else if (UTF_8_4_OCTET_LEADING_P (c))
2858 seq_maybe_bytes = 3;
2859 else if (UTF_8_5_OCTET_LEADING_P (c))
2860 seq_maybe_bytes = 4;
2861 else if (UTF_8_6_OCTET_LEADING_P (c))
2862 seq_maybe_bytes = 5;
2863 else
2864 return 0;
2865
2866 do
2867 {
0a28aafb 2868 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
fa42c37f
KH
2869 if (!UTF_8_EXTRA_OCTET_P (c))
2870 return 0;
2871 seq_maybe_bytes--;
2872 }
2873 while (seq_maybe_bytes > 0);
2874 }
2875
b73bfc1c 2876 label_end_of_loop:
fa42c37f
KH
2877 return CODING_CATEGORY_MASK_UTF_8;
2878}
2879
2880/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2881 Check if a text is encoded in UTF-16 Big Endian (endian == 1) or
2882 Little Endian (otherwise). If it is, return
2883 CODING_CATEGORY_MASK_UTF_16_BE or CODING_CATEGORY_MASK_UTF_16_LE,
2884 else return 0. */
2885
/* Nonzero if VAL is one of the two code units that are never valid
   characters (0xFFFE and 0xFFFF).  */
#define UTF_16_INVALID_P(val)   \
  (((val) == 0xFFFE)            \
   || ((val) == 0xFFFF))

/* Nonzero if VAL is a high (leading) surrogate, 0xD800..0xDBFF.  The
   top six bits must be exactly 110110, hence the 0xFC00 mask; masking
   with 0xD800 would also accept low surrogates and unrelated values
   such as 0xF900 or 0xFFFF.  */
#define UTF_16_HIGH_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xD800)

/* Nonzero if VAL is a low (trailing) surrogate, 0xDC00..0xDFFF.  The
   top six bits must be exactly 110111.  */
#define UTF_16_LOW_SURROGATE_P(val)  \
  (((val) & 0xFC00) == 0xDC00)
2895
0a28aafb
KH
2896static int
2897detect_coding_utf_16 (src, src_end, multibytep)
fa42c37f 2898 unsigned char *src, *src_end;
0a28aafb 2899 int multibytep;
fa42c37f 2900{
b73bfc1c
KH
2901 unsigned char c1, c2;
2902 /* Dummy for TWO_MORE_BYTES. */
2903 struct coding_system dummy_coding;
2904 struct coding_system *coding = &dummy_coding;
fa42c37f 2905
0a28aafb
KH
2906 ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep);
2907 ONE_MORE_BYTE_CHECK_MULTIBYTE (c2, multibytep);
b73bfc1c
KH
2908
2909 if ((c1 == 0xFF) && (c2 == 0xFE))
fa42c37f 2910 return CODING_CATEGORY_MASK_UTF_16_LE;
b73bfc1c 2911 else if ((c1 == 0xFE) && (c2 == 0xFF))
fa42c37f
KH
2912 return CODING_CATEGORY_MASK_UTF_16_BE;
2913
b73bfc1c 2914 label_end_of_loop:
fa42c37f
KH
2915 return 0;
2916}
2917
4ed46869
KH
2918/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
2919 If SJIS_P is 1, decode SJIS text, else decode BIG5 text. */
2920
b73bfc1c 2921static void
4ed46869 2922decode_coding_sjis_big5 (coding, source, destination,
d46c5b12 2923 src_bytes, dst_bytes, sjis_p)
4ed46869
KH
2924 struct coding_system *coding;
2925 unsigned char *source, *destination;
2926 int src_bytes, dst_bytes;
4ed46869
KH
2927 int sjis_p;
2928{
2929 unsigned char *src = source;
2930 unsigned char *src_end = source + src_bytes;
2931 unsigned char *dst = destination;
2932 unsigned char *dst_end = destination + dst_bytes;
b73bfc1c
KH
2933 /* SRC_BASE remembers the start position in source in each loop.
2934 The loop will be exited when there's not enough source code
2935 (within macro ONE_MORE_BYTE), or when there's not enough
2936 destination area to produce a character (within macro
2937 EMIT_CHAR). */
2938 unsigned char *src_base;
2939 Lisp_Object translation_table;
a5d301df 2940
b73bfc1c
KH
2941 if (NILP (Venable_character_translation))
2942 translation_table = Qnil;
2943 else
2944 {
2945 translation_table = coding->translation_table_for_decode;
2946 if (NILP (translation_table))
2947 translation_table = Vstandard_translation_table_for_decode;
2948 }
4ed46869 2949
d46c5b12 2950 coding->produced_char = 0;
b73bfc1c 2951 while (1)
4ed46869 2952 {
b73bfc1c
KH
2953 int c, charset, c1, c2;
2954
2955 src_base = src;
2956 ONE_MORE_BYTE (c1);
2957
2958 if (c1 < 0x80)
4ed46869 2959 {
b73bfc1c
KH
2960 charset = CHARSET_ASCII;
2961 if (c1 < 0x20)
4ed46869 2962 {
b73bfc1c 2963 if (c1 == '\r')
d46c5b12 2964 {
b73bfc1c 2965 if (coding->eol_type == CODING_EOL_CRLF)
d46c5b12 2966 {
b73bfc1c
KH
2967 ONE_MORE_BYTE (c2);
2968 if (c2 == '\n')
2969 c1 = c2;
b73bfc1c
KH
2970 else
2971 /* To process C2 again, SRC is subtracted by 1. */
2972 src--;
d46c5b12 2973 }
b73bfc1c
KH
2974 else if (coding->eol_type == CODING_EOL_CR)
2975 c1 = '\n';
2976 }
2977 else if (c1 == '\n'
2978 && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2979 && (coding->eol_type == CODING_EOL_CR
2980 || coding->eol_type == CODING_EOL_CRLF))
2981 {
2982 coding->result = CODING_FINISH_INCONSISTENT_EOL;
2983 goto label_end_of_loop;
d46c5b12 2984 }
4ed46869 2985 }
4ed46869 2986 }
54f78171 2987 else
b73bfc1c 2988 {
4ed46869
KH
2989 if (sjis_p)
2990 {
682169fe 2991 if (c1 == 0x80 || c1 == 0xA0 || c1 > 0xEF)
b73bfc1c 2992 goto label_invalid_code;
682169fe 2993 if (c1 <= 0x9F || c1 >= 0xE0)
fb88bf2d 2994 {
54f78171
KH
2995 /* SJIS -> JISX0208 */
2996 ONE_MORE_BYTE (c2);
b73bfc1c
KH
2997 if (c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
2998 goto label_invalid_code;
2999 DECODE_SJIS (c1, c2, c1, c2);
3000 charset = charset_jisx0208;
5e34de15 3001 }
fb88bf2d 3002 else
b73bfc1c
KH
3003 /* SJIS -> JISX0201-Kana */
3004 charset = charset_katakana_jisx0201;
4ed46869 3005 }
fb88bf2d 3006 else
fb88bf2d 3007 {
54f78171 3008 /* BIG5 -> Big5 */
682169fe 3009 if (c1 < 0xA0 || c1 > 0xFE)
b73bfc1c
KH
3010 goto label_invalid_code;
3011 ONE_MORE_BYTE (c2);
3012 if (c2 < 0x40 || (c2 > 0x7E && c2 < 0xA1) || c2 > 0xFE)
3013 goto label_invalid_code;
3014 DECODE_BIG5 (c1, c2, charset, c1, c2);
4ed46869
KH
3015 }
3016 }
4ed46869 3017
b73bfc1c
KH
3018 c = DECODE_ISO_CHARACTER (charset, c1, c2);
3019 EMIT_CHAR (c);
fb88bf2d
KH
3020 continue;
3021
b73bfc1c
KH
3022 label_invalid_code:
3023 coding->errors++;
4ed46869 3024 src = src_base;
b73bfc1c
KH
3025 c = *src++;
3026 EMIT_CHAR (c);
fb88bf2d 3027 }
d46c5b12 3028
b73bfc1c
KH
3029 label_end_of_loop:
3030 coding->consumed = coding->consumed_char = src_base - source;
d46c5b12 3031 coding->produced = dst - destination;
b73bfc1c 3032 return;
4ed46869
KH
3033}
3034
3035/* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
b73bfc1c
KH
3036 This function can encode charsets `ascii', `katakana-jisx0201',
3037 `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'. We
3038 are sure that all these charsets are registered as official charset
4ed46869
KH
3039 (i.e. do not have extended leading-codes). Characters of other
3040 charsets are produced without any encoding. If SJIS_P is 1, encode
3041 SJIS text, else encode BIG5 text. */
3042
b73bfc1c 3043static void
4ed46869 3044encode_coding_sjis_big5 (coding, source, destination,
d46c5b12 3045 src_bytes, dst_bytes, sjis_p)
4ed46869
KH
3046 struct coding_system *coding;
3047 unsigned char *source, *destination;
3048 int src_bytes, dst_bytes;
4ed46869
KH
3049 int sjis_p;
3050{
3051 unsigned char *src = source;
3052 unsigned char *src_end = source + src_bytes;
3053 unsigned char *dst = destination;
3054 unsigned char *dst_end = destination + dst_bytes;
b73bfc1c
KH
3055 /* SRC_BASE remembers the start position in source in each loop.
3056 The loop will be exited when there's not enough source text to
3057 analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
3058 there's not enough destination area to produce encoded codes
3059 (within macro EMIT_BYTES). */
3060 unsigned char *src_base;
3061 Lisp_Object translation_table;
4ed46869 3062
b73bfc1c
KH
3063 if (NILP (Venable_character_translation))
3064 translation_table = Qnil;
3065 else
4ed46869 3066 {
39658efc 3067 translation_table = coding->translation_table_for_encode;
b73bfc1c 3068 if (NILP (translation_table))
39658efc 3069 translation_table = Vstandard_translation_table_for_encode;
b73bfc1c 3070 }
a5d301df 3071
b73bfc1c
KH
3072 while (1)
3073 {
3074 int c, charset, c1, c2;
4ed46869 3075
b73bfc1c
KH
3076 src_base = src;
3077 ONE_MORE_CHAR (c);
93dec019 3078
b73bfc1c
KH
3079 /* Now encode the character C. */
3080 if (SINGLE_BYTE_CHAR_P (c))
3081 {
3082 switch (c)
4ed46869 3083 {
b73bfc1c 3084 case '\r':
7371fe0a 3085 if (!(coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
b73bfc1c
KH
3086 {
3087 EMIT_ONE_BYTE (c);
3088 break;
3089 }
3090 c = '\n';
3091 case '\n':
3092 if (coding->eol_type == CODING_EOL_CRLF)
3093 {
3094 EMIT_TWO_BYTES ('\r', c);
3095 break;
3096 }
3097 else if (coding->eol_type == CODING_EOL_CR)
3098 c = '\r';
3099 default:
3100 EMIT_ONE_BYTE (c);
3101 }
3102 }
3103 else
3104 {
3105 SPLIT_CHAR (c, charset, c1, c2);
3106 if (sjis_p)
3107 {
3108 if (charset == charset_jisx0208
3109 || charset == charset_jisx0208_1978)
3110 {
3111 ENCODE_SJIS (c1, c2, c1, c2);
3112 EMIT_TWO_BYTES (c1, c2);
3113 }
39658efc
KH
3114 else if (charset == charset_katakana_jisx0201)
3115 EMIT_ONE_BYTE (c1 | 0x80);
fc53a214
KH
3116 else if (charset == charset_latin_jisx0201)
3117 EMIT_ONE_BYTE (c1);
b73bfc1c
KH
3118 else
3119 /* There's no way other than producing the internal
3120 codes as is. */
3121 EMIT_BYTES (src_base, src);
4ed46869 3122 }
4ed46869 3123 else
b73bfc1c
KH
3124 {
3125 if (charset == charset_big5_1 || charset == charset_big5_2)
3126 {
3127 ENCODE_BIG5 (charset, c1, c2, c1, c2);
3128 EMIT_TWO_BYTES (c1, c2);
3129 }
3130 else
3131 /* There's no way other than producing the internal
3132 codes as is. */
3133 EMIT_BYTES (src_base, src);
3134 }
4ed46869 3135 }
b73bfc1c 3136 coding->consumed_char++;
4ed46869
KH
3137 }
3138
b73bfc1c
KH
3139 label_end_of_loop:
3140 coding->consumed = src_base - source;
d46c5b12 3141 coding->produced = coding->produced_char = dst - destination;
4ed46869
KH
3142}
3143
3144\f
1397dc18
KH
3145/*** 5. CCL handlers ***/
3146
3147/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3148 Check if a text is encoded in a coding system of which
3149 encoder/decoder are written in CCL program. If it is, return
3150 CODING_CATEGORY_MASK_CCL, else return 0. */
3151
0a28aafb
KH
3152static int
3153detect_coding_ccl (src, src_end, multibytep)
1397dc18 3154 unsigned char *src, *src_end;
0a28aafb 3155 int multibytep;
1397dc18
KH
3156{
3157 unsigned char *valid;
b73bfc1c
KH
3158 int c;
3159 /* Dummy for ONE_MORE_BYTE. */
3160 struct coding_system dummy_coding;
3161 struct coding_system *coding = &dummy_coding;
1397dc18
KH
3162
3163 /* No coding system is assigned to coding-category-ccl. */
3164 if (!coding_system_table[CODING_CATEGORY_IDX_CCL])
3165 return 0;
3166
3167 valid = coding_system_table[CODING_CATEGORY_IDX_CCL]->spec.ccl.valid_codes;
b73bfc1c 3168 while (1)
1397dc18 3169 {
0a28aafb 3170 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
b73bfc1c
KH
3171 if (! valid[c])
3172 return 0;
1397dc18 3173 }
b73bfc1c 3174 label_end_of_loop:
1397dc18
KH
3175 return CODING_CATEGORY_MASK_CCL;
3176}
3177
3178\f
3179/*** 6. End-of-line handlers ***/
4ed46869 3180
b73bfc1c 3181/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
4ed46869 3182
b73bfc1c 3183static void
d46c5b12 3184decode_eol (coding, source, destination, src_bytes, dst_bytes)
4ed46869
KH
3185 struct coding_system *coding;
3186 unsigned char *source, *destination;
3187 int src_bytes, dst_bytes;
4ed46869
KH
3188{
3189 unsigned char *src = source;
4ed46869 3190 unsigned char *dst = destination;
b73bfc1c
KH
3191 unsigned char *src_end = src + src_bytes;
3192 unsigned char *dst_end = dst + dst_bytes;
3193 Lisp_Object translation_table;
3194 /* SRC_BASE remembers the start position in source in each loop.
3195 The loop will be exited when there's not enough source code
3196 (within macro ONE_MORE_BYTE), or when there's not enough
3197 destination area to produce a character (within macro
3198 EMIT_CHAR). */
3199 unsigned char *src_base;
3200 int c;
3201
3202 translation_table = Qnil;
4ed46869
KH
3203 switch (coding->eol_type)
3204 {
3205 case CODING_EOL_CRLF:
b73bfc1c 3206 while (1)
d46c5b12 3207 {
b73bfc1c
KH
3208 src_base = src;
3209 ONE_MORE_BYTE (c);
3210 if (c == '\r')
fb88bf2d 3211 {
b73bfc1c
KH
3212 ONE_MORE_BYTE (c);
3213 if (c != '\n')
3214 {
b73bfc1c
KH
3215 src--;
3216 c = '\r';
3217 }
fb88bf2d 3218 }
b73bfc1c
KH
3219 else if (c == '\n'
3220 && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL))
d46c5b12 3221 {
b73bfc1c
KH
3222 coding->result = CODING_FINISH_INCONSISTENT_EOL;
3223 goto label_end_of_loop;
d46c5b12 3224 }
b73bfc1c 3225 EMIT_CHAR (c);
d46c5b12 3226 }
b73bfc1c
KH
3227 break;
3228
3229 case CODING_EOL_CR:
3230 while (1)
d46c5b12 3231 {
b73bfc1c
KH
3232 src_base = src;
3233 ONE_MORE_BYTE (c);
3234 if (c == '\n')
3235 {
3236 if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
3237 {
3238 coding->result = CODING_FINISH_INCONSISTENT_EOL;
3239 goto label_end_of_loop;
3240 }
3241 }
3242 else if (c == '\r')
3243 c = '\n';
3244 EMIT_CHAR (c);
d46c5b12 3245 }
4ed46869
KH
3246 break;
3247
b73bfc1c
KH
3248 default: /* no need for EOL handling */
3249 while (1)
d46c5b12 3250 {
b73bfc1c
KH
3251 src_base = src;
3252 ONE_MORE_BYTE (c);
3253 EMIT_CHAR (c);
d46c5b12 3254 }
4ed46869
KH
3255 }
3256
b73bfc1c
KH
3257 label_end_of_loop:
3258 coding->consumed = coding->consumed_char = src_base - source;
3259 coding->produced = dst - destination;
3260 return;
4ed46869
KH
3261}
3262
3263/* See "GENERAL NOTES about `encode_coding_XXX ()' functions". Encode
b73bfc1c 3264 format of end-of-line according to `coding->eol_type'. It also
8ca3766a 3265 converts multibyte form 8-bit characters to unibyte if
b73bfc1c
KH
3266 CODING->src_multibyte is nonzero. If `coding->mode &
3267 CODING_MODE_SELECTIVE_DISPLAY' is nonzero, code '\r' in source text
3268 also means end-of-line. */
4ed46869 3269
b73bfc1c 3270static void
d46c5b12 3271encode_eol (coding, source, destination, src_bytes, dst_bytes)
4ed46869 3272 struct coding_system *coding;
a4244313
KR
3273 const unsigned char *source;
3274 unsigned char *destination;
4ed46869 3275 int src_bytes, dst_bytes;
4ed46869 3276{
a4244313 3277 const unsigned char *src = source;
4ed46869 3278 unsigned char *dst = destination;
a4244313 3279 const unsigned char *src_end = src + src_bytes;
b73bfc1c
KH
3280 unsigned char *dst_end = dst + dst_bytes;
3281 Lisp_Object translation_table;
3282 /* SRC_BASE remembers the start position in source in each loop.
3283 The loop will be exited when there's not enough source text to
3284 analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
3285 there's not enough destination area to produce encoded codes
3286 (within macro EMIT_BYTES). */
a4244313
KR
3287 const unsigned char *src_base;
3288 unsigned char *tmp;
b73bfc1c
KH
3289 int c;
3290 int selective_display = coding->mode & CODING_MODE_SELECTIVE_DISPLAY;
3291
3292 translation_table = Qnil;
3293 if (coding->src_multibyte
3294 && *(src_end - 1) == LEADING_CODE_8_BIT_CONTROL)
3295 {
3296 src_end--;
3297 src_bytes--;
3298 coding->result = CODING_FINISH_INSUFFICIENT_SRC;
3299 }
fb88bf2d 3300
d46c5b12
KH
3301 if (coding->eol_type == CODING_EOL_CRLF)
3302 {
b73bfc1c 3303 while (src < src_end)
d46c5b12 3304 {
b73bfc1c 3305 src_base = src;
d46c5b12 3306 c = *src++;
b73bfc1c
KH
3307 if (c >= 0x20)
3308 EMIT_ONE_BYTE (c);
3309 else if (c == '\n' || (c == '\r' && selective_display))
3310 EMIT_TWO_BYTES ('\r', '\n');
d46c5b12 3311 else
b73bfc1c 3312 EMIT_ONE_BYTE (c);
d46c5b12 3313 }
ff2b1ea9 3314 src_base = src;
b73bfc1c 3315 label_end_of_loop:
005f0d35 3316 ;
d46c5b12
KH
3317 }
3318 else
4ed46869 3319 {
78a629d2 3320 if (!dst_bytes || src_bytes <= dst_bytes)
4ed46869 3321 {
b73bfc1c
KH
3322 safe_bcopy (src, dst, src_bytes);
3323 src_base = src_end;
3324 dst += src_bytes;
d46c5b12 3325 }
d46c5b12 3326 else
b73bfc1c
KH
3327 {
3328 if (coding->src_multibyte
3329 && *(src + dst_bytes - 1) == LEADING_CODE_8_BIT_CONTROL)
3330 dst_bytes--;
3331 safe_bcopy (src, dst, dst_bytes);
3332 src_base = src + dst_bytes;
3333 dst = destination + dst_bytes;
3334 coding->result = CODING_FINISH_INSUFFICIENT_DST;
3335 }
993824c9 3336 if (coding->eol_type == CODING_EOL_CR)
d46c5b12 3337 {
a4244313
KR
3338 for (tmp = destination; tmp < dst; tmp++)
3339 if (*tmp == '\n') *tmp = '\r';
d46c5b12 3340 }
b73bfc1c 3341 else if (selective_display)
d46c5b12 3342 {
a4244313
KR
3343 for (tmp = destination; tmp < dst; tmp++)
3344 if (*tmp == '\r') *tmp = '\n';
4ed46869 3345 }
4ed46869 3346 }
b73bfc1c
KH
3347 if (coding->src_multibyte)
3348 dst = destination + str_as_unibyte (destination, dst - destination);
4ed46869 3349
b73bfc1c
KH
3350 coding->consumed = src_base - source;
3351 coding->produced = dst - destination;
78a629d2 3352 coding->produced_char = coding->produced;
4ed46869
KH
3353}
3354
3355\f
1397dc18 3356/*** 7. C library functions ***/
4ed46869 3357
cfb43547 3358/* In Emacs Lisp, a coding system is represented by a Lisp symbol which
4ed46869 3359 has a property `coding-system'. The value of this property is a
cfb43547 3360 vector of length 5 (called the coding-vector). Among elements of
4ed46869
KH
3361 this vector, the first (element[0]) and the fifth (element[4])
3362 carry important information for decoding/encoding. Before
3363 decoding/encoding, this information should be set in fields of a
3364 structure of type `coding_system'.
3365
cfb43547 3366 The value of the property `coding-system' can be a symbol of another
4ed46869
KH
3367 subsidiary coding-system. In that case, Emacs gets coding-vector
3368 from that symbol.
3369
3370 `element[0]' contains information to be set in `coding->type'. The
3371 value and its meaning is as follows:
3372
0ef69138
KH
3373 0 -- coding_type_emacs_mule
3374 1 -- coding_type_sjis
3375 2 -- coding_type_iso2022
3376 3 -- coding_type_big5
3377 4 -- coding_type_ccl encoder/decoder written in CCL
3378 nil -- coding_type_no_conversion
3379 t -- coding_type_undecided (automatic conversion on decoding,
3380 no-conversion on encoding)
4ed46869
KH
3381
3382 `element[4]' contains information to be set in `coding->flags' and
3383 `coding->spec'. The meaning varies by `coding->type'.
3384
3385 If `coding->type' is `coding_type_iso2022', element[4] is a vector
3386 of length 32 (of which the first 13 sub-elements are used now).
3387 Meanings of these sub-elements are:
3388
3389 sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
3390 If the value is an integer of valid charset, the charset is
3391 assumed to be designated to graphic register N initially.
3392
3393 If the value is minus, it is a minus value of charset which
3394 reserves graphic register N, which means that the charset is
3395 not designated initially but should be designated to graphic
3396 register N just before encoding a character in that charset.
3397
3398 If the value is nil, graphic register N is never used on
3399 encoding.
93dec019 3400
4ed46869
KH
3401 sub-element[N] where N is 4 through 11: to be set in `coding->flags'
3402 Each value takes t or nil. See the section ISO2022 of
3403 `coding.h' for more information.
3404
3405 If `coding->type' is `coding_type_big5', element[4] is t to denote
3406 BIG5-ETen or nil to denote BIG5-HKU.
3407
3408 If `coding->type' takes the other value, element[4] is ignored.
3409
cfb43547 3410 Emacs Lisp's coding systems also carry information about format of
4ed46869
KH
3411 end-of-line in a value of property `eol-type'. If the value is
3412 integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
3413 means CODING_EOL_CR. If it is not integer, it should be a vector
3414 of subsidiary coding systems of which property `eol-type' has one
cfb43547 3415 of the above values.
4ed46869
KH
3416
3417*/
3418
3419/* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
3420 and set it in CODING. If CODING_SYSTEM_SYMBOL is invalid, CODING
3421 is setup so that no conversion is necessary and return -1, else
3422 return 0. */
3423
3424int
e0e989f6
KH
3425setup_coding_system (coding_system, coding)
3426 Lisp_Object coding_system;
4ed46869
KH
3427 struct coding_system *coding;
3428{
d46c5b12 3429 Lisp_Object coding_spec, coding_type, eol_type, plist;
4608c386 3430 Lisp_Object val;
4ed46869 3431
c07c8e12
KH
3432 /* At first, zero clear all members. */
3433 bzero (coding, sizeof (struct coding_system));
3434
d46c5b12 3435 /* Initialize some fields required for all kinds of coding systems. */
774324d6 3436 coding->symbol = coding_system;
d46c5b12
KH
3437 coding->heading_ascii = -1;
3438 coding->post_read_conversion = coding->pre_write_conversion = Qnil;
ec6d2bb8
KH
3439 coding->composing = COMPOSITION_DISABLED;
3440 coding->cmp_data = NULL;
1f5dbf34
KH
3441
3442 if (NILP (coding_system))
3443 goto label_invalid_coding_system;
3444
4608c386 3445 coding_spec = Fget (coding_system, Qcoding_system);
1f5dbf34 3446
4608c386
KH
3447 if (!VECTORP (coding_spec)
3448 || XVECTOR (coding_spec)->size != 5
3449 || !CONSP (XVECTOR (coding_spec)->contents[3]))
4ed46869 3450 goto label_invalid_coding_system;
4608c386 3451
d46c5b12
KH
3452 eol_type = inhibit_eol_conversion ? Qnil : Fget (coding_system, Qeol_type);
3453 if (VECTORP (eol_type))
3454 {
3455 coding->eol_type = CODING_EOL_UNDECIDED;
3456 coding->common_flags = CODING_REQUIRE_DETECTION_MASK;
3457 }
3458 else if (XFASTINT (eol_type) == 1)
3459 {
3460 coding->eol_type = CODING_EOL_CRLF;
3461 coding->common_flags
3462 = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3463 }
3464 else if (XFASTINT (eol_type) == 2)
3465 {
3466 coding->eol_type = CODING_EOL_CR;
3467 coding->common_flags
3468 = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3469 }
3470 else
3471 coding->eol_type = CODING_EOL_LF;
3472
3473 coding_type = XVECTOR (coding_spec)->contents[0];
3474 /* Try short cut. */
3475 if (SYMBOLP (coding_type))
3476 {
3477 if (EQ (coding_type, Qt))
3478 {
3479 coding->type = coding_type_undecided;
3480 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
3481 }
3482 else
3483 coding->type = coding_type_no_conversion;
9b96232f
KH
3484 /* Initialize this member. Any thing other than
3485 CODING_CATEGORY_IDX_UTF_16_BE and
3486 CODING_CATEGORY_IDX_UTF_16_LE are ok because they have
3487 special treatment in detect_eol. */
3488 coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE;
3489
d46c5b12
KH
3490 return 0;
3491 }
3492
d46c5b12
KH
3493 /* Get values of coding system properties:
3494 `post-read-conversion', `pre-write-conversion',
f967223b 3495 `translation-table-for-decode', `translation-table-for-encode'. */
4608c386 3496 plist = XVECTOR (coding_spec)->contents[3];
b843d1ae 3497 /* Pre & post conversion functions should be disabled if
8ca3766a 3498 inhibit_eol_conversion is nonzero. This is the case that a code
b843d1ae
KH
3499 conversion function is called while those functions are running. */
3500 if (! inhibit_pre_post_conversion)
3501 {
3502 coding->post_read_conversion = Fplist_get (plist, Qpost_read_conversion);
3503 coding->pre_write_conversion = Fplist_get (plist, Qpre_write_conversion);
3504 }
f967223b 3505 val = Fplist_get (plist, Qtranslation_table_for_decode);
4608c386 3506 if (SYMBOLP (val))
f967223b
KH
3507 val = Fget (val, Qtranslation_table_for_decode);
3508 coding->translation_table_for_decode = CHAR_TABLE_P (val) ? val : Qnil;
3509 val = Fplist_get (plist, Qtranslation_table_for_encode);
4608c386 3510 if (SYMBOLP (val))
f967223b
KH
3511 val = Fget (val, Qtranslation_table_for_encode);
3512 coding->translation_table_for_encode = CHAR_TABLE_P (val) ? val : Qnil;
d46c5b12
KH
3513 val = Fplist_get (plist, Qcoding_category);
3514 if (!NILP (val))
3515 {
3516 val = Fget (val, Qcoding_category_index);
3517 if (INTEGERP (val))
3518 coding->category_idx = XINT (val);
3519 else
3520 goto label_invalid_coding_system;
3521 }
3522 else
3523 goto label_invalid_coding_system;
93dec019 3524
ec6d2bb8
KH
3525 /* If the coding system has non-nil `composition' property, enable
3526 composition handling. */
3527 val = Fplist_get (plist, Qcomposition);
3528 if (!NILP (val))
3529 coding->composing = COMPOSITION_NO;
3530
d46c5b12 3531 switch (XFASTINT (coding_type))
4ed46869
KH
3532 {
3533 case 0:
0ef69138 3534 coding->type = coding_type_emacs_mule;
aa72b389
KH
3535 coding->common_flags
3536 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
c952af22
KH
3537 if (!NILP (coding->post_read_conversion))
3538 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
3539 if (!NILP (coding->pre_write_conversion))
3540 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
4ed46869
KH
3541 break;
3542
3543 case 1:
3544 coding->type = coding_type_sjis;
c952af22
KH
3545 coding->common_flags
3546 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
4ed46869
KH
3547 break;
3548
3549 case 2:
3550 coding->type = coding_type_iso2022;
c952af22
KH
3551 coding->common_flags
3552 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
4ed46869 3553 {
70c22245 3554 Lisp_Object val, temp;
4ed46869 3555 Lisp_Object *flags;
d46c5b12 3556 int i, charset, reg_bits = 0;
4ed46869 3557
4608c386 3558 val = XVECTOR (coding_spec)->contents[4];
f44d27ce 3559
4ed46869
KH
3560 if (!VECTORP (val) || XVECTOR (val)->size != 32)
3561 goto label_invalid_coding_system;
3562
3563 flags = XVECTOR (val)->contents;
3564 coding->flags
3565 = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
3566 | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
3567 | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
3568 | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
3569 | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
3570 | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
3571 | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
3572 | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
e0e989f6
KH
3573 | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION)
3574 | (NILP (flags[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL)
c4825358
KH
3575 | (NILP (flags[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL)
3576 | (NILP (flags[15]) ? 0 : CODING_FLAG_ISO_SAFE)
3f003981 3577 | (NILP (flags[16]) ? 0 : CODING_FLAG_ISO_LATIN_EXTRA)
c4825358 3578 );
4ed46869
KH
3579
3580 /* Invoke graphic register 0 to plane 0. */
3581 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
3582 /* Invoke graphic register 1 to plane 1 if we can use full 8-bit. */
3583 CODING_SPEC_ISO_INVOCATION (coding, 1)
3584 = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
3585 /* Not single shifting at first. */
6e85d753 3586 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;
e0e989f6 3587 /* Beginning of buffer should also be regarded as bol. */
6e85d753 3588 CODING_SPEC_ISO_BOL (coding) = 1;
4ed46869 3589
70c22245
KH
3590 for (charset = 0; charset <= MAX_CHARSET; charset++)
3591 CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = 255;
3592 val = Vcharset_revision_alist;
3593 while (CONSP (val))
3594 {
03699b14 3595 charset = get_charset_id (Fcar_safe (XCAR (val)));
70c22245 3596 if (charset >= 0
03699b14 3597 && (temp = Fcdr_safe (XCAR (val)), INTEGERP (temp))
70c22245
KH
3598 && (i = XINT (temp), (i >= 0 && (i + '@') < 128)))
3599 CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = i;
03699b14 3600 val = XCDR (val);
70c22245
KH
3601 }
3602
4ed46869
KH
3603 /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
3604 FLAGS[REG] can be one of below:
3605 integer CHARSET: CHARSET occupies register I,
3606 t: designate nothing to REG initially, but can be used
3607 by any charsets,
3608 list of integer, nil, or t: designate the first
3609 element (if integer) to REG initially, the remaining
3610 elements (if integer) is designated to REG on request,
d46c5b12 3611 if an element is t, REG can be used by any charsets,
4ed46869 3612 nil: REG is never used. */
467e7675 3613 for (charset = 0; charset <= MAX_CHARSET; charset++)
1ba9e4ab
KH
3614 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3615 = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION;
4ed46869
KH
3616 for (i = 0; i < 4; i++)
3617 {
87323294
PJ
3618 if ((INTEGERP (flags[i])
3619 && (charset = XINT (flags[i]), CHARSET_VALID_P (charset)))
e0e989f6 3620 || (charset = get_charset_id (flags[i])) >= 0)
4ed46869
KH
3621 {
3622 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3623 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
3624 }
3625 else if (EQ (flags[i], Qt))
3626 {
3627 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
d46c5b12
KH
3628 reg_bits |= 1 << i;
3629 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
4ed46869
KH
3630 }
3631 else if (CONSP (flags[i]))
3632 {
84d60297
RS
3633 Lisp_Object tail;
3634 tail = flags[i];
4ed46869 3635
d46c5b12 3636 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
87323294
PJ
3637 if ((INTEGERP (XCAR (tail))
3638 && (charset = XINT (XCAR (tail)),
3639 CHARSET_VALID_P (charset)))
03699b14 3640 || (charset = get_charset_id (XCAR (tail))) >= 0)
4ed46869
KH
3641 {
3642 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3643 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
3644 }
3645 else
3646 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
03699b14 3647 tail = XCDR (tail);
4ed46869
KH
3648 while (CONSP (tail))
3649 {
87323294
PJ
3650 if ((INTEGERP (XCAR (tail))
3651 && (charset = XINT (XCAR (tail)),
3652 CHARSET_VALID_P (charset)))
03699b14 3653 || (charset = get_charset_id (XCAR (tail))) >= 0)
70c22245
KH
3654 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3655 = i;
03699b14 3656 else if (EQ (XCAR (tail), Qt))
d46c5b12 3657 reg_bits |= 1 << i;
03699b14 3658 tail = XCDR (tail);
4ed46869
KH
3659 }
3660 }
3661 else
3662 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
93dec019 3663
4ed46869
KH
3664 CODING_SPEC_ISO_DESIGNATION (coding, i)
3665 = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
3666 }
3667
d46c5b12 3668 if (reg_bits && ! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
4ed46869
KH
3669 {
3670 /* REG 1 can be used only by locking shift in 7-bit env. */
3671 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
d46c5b12 3672 reg_bits &= ~2;
4ed46869
KH
3673 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
3674 /* Without any shifting, only REG 0 and 1 can be used. */
d46c5b12 3675 reg_bits &= 3;
4ed46869
KH
3676 }
3677
d46c5b12
KH
3678 if (reg_bits)
3679 for (charset = 0; charset <= MAX_CHARSET; charset++)
6e85d753 3680 {
928a85c1 3681 if (CHARSET_DEFINED_P (charset)
96148065
KH
3682 && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3683 == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
d46c5b12
KH
3684 {
3685 /* There exist some default graphic registers to be
96148065 3686 used by CHARSET. */
d46c5b12
KH
3687
3688 /* We had better avoid designating a charset of
3689 CHARS96 to REG 0 as far as possible. */
3690 if (CHARSET_CHARS (charset) == 96)
3691 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3692 = (reg_bits & 2
3693 ? 1 : (reg_bits & 4 ? 2 : (reg_bits & 8 ? 3 : 0)));
3694 else
3695 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3696 = (reg_bits & 1
3697 ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
3698 }
6e85d753 3699 }
4ed46869 3700 }
c952af22 3701 coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
d46c5b12 3702 coding->spec.iso2022.last_invalid_designation_register = -1;
4ed46869
KH
3703 break;
3704
3705 case 3:
3706 coding->type = coding_type_big5;
c952af22
KH
3707 coding->common_flags
3708 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
4ed46869 3709 coding->flags
4608c386 3710 = (NILP (XVECTOR (coding_spec)->contents[4])
4ed46869
KH
3711 ? CODING_FLAG_BIG5_HKU
3712 : CODING_FLAG_BIG5_ETEN);
3713 break;
3714
3715 case 4:
3716 coding->type = coding_type_ccl;
c952af22
KH
3717 coding->common_flags
3718 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
4ed46869 3719 {
84d60297 3720 val = XVECTOR (coding_spec)->contents[4];
ef4ced28
KH
3721 if (! CONSP (val)
3722 || setup_ccl_program (&(coding->spec.ccl.decoder),
03699b14 3723 XCAR (val)) < 0
ef4ced28 3724 || setup_ccl_program (&(coding->spec.ccl.encoder),
03699b14 3725 XCDR (val)) < 0)
4ed46869 3726 goto label_invalid_coding_system;
1397dc18
KH
3727
3728 bzero (coding->spec.ccl.valid_codes, 256);
3729 val = Fplist_get (plist, Qvalid_codes);
3730 if (CONSP (val))
3731 {
3732 Lisp_Object this;
3733
03699b14 3734 for (; CONSP (val); val = XCDR (val))
1397dc18 3735 {
03699b14 3736 this = XCAR (val);
1397dc18
KH
3737 if (INTEGERP (this)
3738 && XINT (this) >= 0 && XINT (this) < 256)
3739 coding->spec.ccl.valid_codes[XINT (this)] = 1;
3740 else if (CONSP (this)
03699b14
KR
3741 && INTEGERP (XCAR (this))
3742 && INTEGERP (XCDR (this)))
1397dc18 3743 {
03699b14
KR
3744 int start = XINT (XCAR (this));
3745 int end = XINT (XCDR (this));
1397dc18
KH
3746
3747 if (start >= 0 && start <= end && end < 256)
e133c8fa 3748 while (start <= end)
1397dc18
KH
3749 coding->spec.ccl.valid_codes[start++] = 1;
3750 }
3751 }
3752 }
4ed46869 3753 }
c952af22 3754 coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
aaaf0b1e 3755 coding->spec.ccl.cr_carryover = 0;
1c3478b0 3756 coding->spec.ccl.eight_bit_carryover[0] = 0;
4ed46869
KH
3757 break;
3758
27901516
KH
3759 case 5:
3760 coding->type = coding_type_raw_text;
3761 break;
3762
4ed46869 3763 default:
d46c5b12 3764 goto label_invalid_coding_system;
4ed46869
KH
3765 }
3766 return 0;
3767
3768 label_invalid_coding_system:
3769 coding->type = coding_type_no_conversion;
d46c5b12 3770 coding->category_idx = CODING_CATEGORY_IDX_BINARY;
c952af22 3771 coding->common_flags = 0;
dec137e5 3772 coding->eol_type = CODING_EOL_LF;
d46c5b12 3773 coding->pre_write_conversion = coding->post_read_conversion = Qnil;
4ed46869
KH
3774 return -1;
3775}
3776
ec6d2bb8
KH
3777/* Free memory blocks allocated for storing composition information. */
3778
3779void
3780coding_free_composition_data (coding)
3781 struct coding_system *coding;
3782{
3783 struct composition_data *cmp_data = coding->cmp_data, *next;
3784
3785 if (!cmp_data)
3786 return;
3787 /* Memory blocks are chained. At first, rewind to the first, then,
3788 free blocks one by one. */
3789 while (cmp_data->prev)
3790 cmp_data = cmp_data->prev;
3791 while (cmp_data)
3792 {
3793 next = cmp_data->next;
3794 xfree (cmp_data);
3795 cmp_data = next;
3796 }
3797 coding->cmp_data = NULL;
3798}
3799
3800/* Set `char_offset' member of all memory blocks pointed by
3801 coding->cmp_data to POS. */
3802
3803void
3804coding_adjust_composition_offset (coding, pos)
3805 struct coding_system *coding;
3806 int pos;
3807{
3808 struct composition_data *cmp_data;
3809
3810 for (cmp_data = coding->cmp_data; cmp_data; cmp_data = cmp_data->next)
3811 cmp_data->char_offset = pos;
3812}
3813
54f78171
KH
3814/* Setup raw-text or one of its subsidiaries in the structure
3815 coding_system CODING according to the already setup value eol_type
3816 in CODING. CODING should be setup for some coding system in
3817 advance. */
3818
3819void
3820setup_raw_text_coding_system (coding)
3821 struct coding_system *coding;
3822{
3823 if (coding->type != coding_type_raw_text)
3824 {
3825 coding->symbol = Qraw_text;
3826 coding->type = coding_type_raw_text;
3827 if (coding->eol_type != CODING_EOL_UNDECIDED)
3828 {
84d60297
RS
3829 Lisp_Object subsidiaries;
3830 subsidiaries = Fget (Qraw_text, Qeol_type);
54f78171
KH
3831
3832 if (VECTORP (subsidiaries)
3833 && XVECTOR (subsidiaries)->size == 3)
3834 coding->symbol
3835 = XVECTOR (subsidiaries)->contents[coding->eol_type];
3836 }
716e0b0a 3837 setup_coding_system (coding->symbol, coding);
54f78171
KH
3838 }
3839 return;
3840}
3841
4ed46869
KH
3842/* Emacs has a mechanism to automatically detect a coding system if it
3843 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
3844 it's impossible to distinguish some coding systems accurately
3845 because they use the same range of codes. So, at first, coding
3846 systems are categorized into the following categories:
3847
0ef69138 3848 o coding-category-emacs-mule
4ed46869
KH
3849
3850 The category for a coding system which has the same code range
3851 as Emacs' internal format. Assigned the coding-system (Lisp
0ef69138 3852 symbol) `emacs-mule' by default.
4ed46869
KH
3853
3854 o coding-category-sjis
3855
3856 The category for a coding system which has the same code range
3857 as SJIS. Assigned the coding-system (Lisp
7717c392 3858 symbol) `japanese-shift-jis' by default.
4ed46869
KH
3859
3860 o coding-category-iso-7
3861
3862 The category for a coding system which has the same code range
7717c392 3863 as ISO2022 of 7-bit environment. This doesn't use any locking
d46c5b12
KH
3864 shift and single shift functions. This can encode/decode all
3865 charsets. Assigned the coding-system (Lisp symbol)
3866 `iso-2022-7bit' by default.
3867
3868 o coding-category-iso-7-tight
3869
3870 Same as coding-category-iso-7 except that this can
3871 encode/decode only the specified charsets.
4ed46869
KH
3872
3873 o coding-category-iso-8-1
3874
3875 The category for a coding system which has the same code range
3876 as ISO2022 of 8-bit environment and graphic plane 1 used only
7717c392
KH
3877 for DIMENSION1 charset. This doesn't use any locking shift
3878 and single shift functions. Assigned the coding-system (Lisp
3879 symbol) `iso-latin-1' by default.
4ed46869
KH
3880
3881 o coding-category-iso-8-2
3882
3883 The category for a coding system which has the same code range
3884 as ISO2022 of 8-bit environment and graphic plane 1 used only
7717c392
KH
3885 for DIMENSION2 charset. This doesn't use any locking shift
3886 and single shift functions. Assigned the coding-system (Lisp
3887 symbol) `japanese-iso-8bit' by default.
4ed46869 3888
7717c392 3889 o coding-category-iso-7-else
4ed46869
KH
3890
3891 The category for a coding system which has the same code range
8ca3766a 3892 as ISO2022 of 7-bit environment but uses locking shift or
7717c392
KH
3893 single shift functions. Assigned the coding-system (Lisp
3894 symbol) `iso-2022-7bit-lock' by default.
3895
3896 o coding-category-iso-8-else
3897
3898 The category for a coding system which has the same code range
8ca3766a 3899 as ISO2022 of 8-bit environment but uses locking shift or
7717c392
KH
3900 single shift functions. Assigned the coding-system (Lisp
3901 symbol) `iso-2022-8bit-ss2' by default.
4ed46869
KH
3902
3903 o coding-category-big5
3904
3905 The category for a coding system which has the same code range
3906 as BIG5. Assigned the coding-system (Lisp symbol)
e0e989f6 3907 `cn-big5' by default.
4ed46869 3908
fa42c37f
KH
3909 o coding-category-utf-8
3910
3911 The category for a coding system which has the same code range
3912 as UTF-8 (cf. RFC2279). Assigned the coding-system (Lisp
3913 symbol) `utf-8' by default.
3914
3915 o coding-category-utf-16-be
3916
3917 The category for a coding system in which a text has a
3918 Unicode signature (cf. Unicode Standard) in the order of BIG
3919 endian at the head. Assigned the coding-system (Lisp symbol)
3920 `utf-16-be' by default.
3921
3922 o coding-category-utf-16-le
3923
3924 The category for a coding system in which a text has a
3925 Unicode signature (cf. Unicode Standard) in the order of
3926 LITTLE endian at the head. Assigned the coding-system (Lisp
3927 symbol) `utf-16-le' by default.
3928
1397dc18
KH
3929 o coding-category-ccl
3930
3931 The category for a coding system of which encoder/decoder is
3932 written in CCL programs. The default value is nil, i.e., no
3933 coding system is assigned.
3934
4ed46869
KH
3935 o coding-category-binary
3936
3937 The category for a coding system not categorized in any of the
3938 above. Assigned the coding-system (Lisp symbol)
e0e989f6 3939 `no-conversion' by default.
4ed46869
KH
3940
3941 Each of them is a Lisp symbol and the value is an actual
cfb43547 3942 `coding-system' (this is also a Lisp symbol) assigned by a user.
4ed46869
KH
3943 What Emacs does actually is to detect a category of coding system.
3944 Then, it uses a `coding-system' assigned to it. If Emacs can't
cfb43547 3945 decide a single possible category, it selects a category of the
4ed46869
KH
3946 highest priority. Priorities of categories are also specified by a
3947 user in a Lisp variable `coding-category-list'.
3948
3949*/
3950
66cfb530
KH
3951static
3952int ascii_skip_code[256];
3953
d46c5b12 3954/* Detect how a text of length SRC_BYTES pointed by SOURCE is encoded.
4ed46869
KH
3955 If it detects possible coding systems, return an integer in which
3956 appropriate flag bits are set. Flag bits are defined by macros
fa42c37f
KH
3957 CODING_CATEGORY_MASK_XXX in `coding.h'. If PRIORITIES is non-NULL,
3958 it should point the table `coding_priorities'. In that case, only
3959 the flag bit for a coding system of the highest priority is set in
0a28aafb
KH
3960 the returned value. If MULTIBYTEP is nonzero, 8-bit codes of the
3961 range 0x80..0x9F are in multibyte form.
4ed46869 3962
d46c5b12
KH
3963 How many ASCII characters are at the head is returned as *SKIP. */
3964
3965static int
0a28aafb 3966detect_coding_mask (source, src_bytes, priorities, skip, multibytep)
d46c5b12
KH
3967 unsigned char *source;
3968 int src_bytes, *priorities, *skip;
0a28aafb 3969 int multibytep;
4ed46869
KH
3970{
3971 register unsigned char c;
d46c5b12 3972 unsigned char *src = source, *src_end = source + src_bytes;
fa42c37f 3973 unsigned int mask, utf16_examined_p, iso2022_examined_p;
da55a2b7 3974 int i;
4ed46869
KH
3975
3976 /* At first, skip all ASCII characters and control characters except
3977 for three ISO2022 specific control characters. */
66cfb530
KH
3978 ascii_skip_code[ISO_CODE_SO] = 0;
3979 ascii_skip_code[ISO_CODE_SI] = 0;
3980 ascii_skip_code[ISO_CODE_ESC] = 0;
3981
bcf26d6a 3982 label_loop_detect_coding:
66cfb530 3983 while (src < src_end && ascii_skip_code[*src]) src++;
d46c5b12 3984 *skip = src - source;
4ed46869
KH
3985
3986 if (src >= src_end)
3987 /* We found nothing other than ASCII. There's nothing to do. */
d46c5b12 3988 return 0;
4ed46869 3989
8a8147d6 3990 c = *src;
4ed46869
KH
3991 /* The text seems to be encoded in some multilingual coding system.
3992 Now, try to find in which coding system the text is encoded. */
3993 if (c < 0x80)
bcf26d6a
KH
3994 {
3995 /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
3996 /* C is an ISO2022 specific control code of C0. */
0a28aafb 3997 mask = detect_coding_iso2022 (src, src_end, multibytep);
1b2af4b0 3998 if (mask == 0)
d46c5b12
KH
3999 {
4000 /* No valid ISO2022 code follows C. Try again. */
4001 src++;
66cfb530
KH
4002 if (c == ISO_CODE_ESC)
4003 ascii_skip_code[ISO_CODE_ESC] = 1;
4004 else
4005 ascii_skip_code[ISO_CODE_SO] = ascii_skip_code[ISO_CODE_SI] = 1;
d46c5b12
KH
4006 goto label_loop_detect_coding;
4007 }
4008 if (priorities)
fa42c37f
KH
4009 {
4010 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
4011 {
4012 if (mask & priorities[i])
4013 return priorities[i];
4014 }
4015 return CODING_CATEGORY_MASK_RAW_TEXT;
4016 }
bcf26d6a 4017 }
d46c5b12 4018 else
c4825358 4019 {
d46c5b12 4020 int try;
4ed46869 4021
0a28aafb 4022 if (multibytep && c == LEADING_CODE_8_BIT_CONTROL)
67091e59 4023 c = src[1] - 0x20;
0a28aafb 4024
d46c5b12
KH
4025 if (c < 0xA0)
4026 {
4027 /* C is the first byte of SJIS character code,
fa42c37f
KH
4028 or a leading-code of Emacs' internal format (emacs-mule),
4029 or the first byte of UTF-16. */
4030 try = (CODING_CATEGORY_MASK_SJIS
4031 | CODING_CATEGORY_MASK_EMACS_MULE
4032 | CODING_CATEGORY_MASK_UTF_16_BE
4033 | CODING_CATEGORY_MASK_UTF_16_LE);
d46c5b12
KH
4034
4035 /* Or, if C is a special latin extra code,
93dec019 4036 or is an ISO2022 specific control code of C1 (SS2 or SS3),
d46c5b12
KH
4037 or is an ISO2022 control-sequence-introducer (CSI),
4038 we should also consider the possibility of ISO2022 codings. */
4039 if ((VECTORP (Vlatin_extra_code_table)
4040 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
4041 || (c == ISO_CODE_SS2 || c == ISO_CODE_SS3)
4042 || (c == ISO_CODE_CSI
4043 && (src < src_end
4044 && (*src == ']'
4045 || ((*src == '0' || *src == '1' || *src == '2')
4046 && src + 1 < src_end
4047 && src[1] == ']')))))
4048 try |= (CODING_CATEGORY_MASK_ISO_8_ELSE
4049 | CODING_CATEGORY_MASK_ISO_8BIT);
4050 }
c4825358 4051 else
d46c5b12
KH
4052 /* C is a character of ISO2022 in graphic plane right,
4053 or a SJIS's 1-byte character code (i.e. JISX0201),
fa42c37f
KH
4054 or the first byte of BIG5's 2-byte code,
4055 or the first byte of UTF-8/16. */
d46c5b12
KH
4056 try = (CODING_CATEGORY_MASK_ISO_8_ELSE
4057 | CODING_CATEGORY_MASK_ISO_8BIT
4058 | CODING_CATEGORY_MASK_SJIS
fa42c37f
KH
4059 | CODING_CATEGORY_MASK_BIG5
4060 | CODING_CATEGORY_MASK_UTF_8
4061 | CODING_CATEGORY_MASK_UTF_16_BE
4062 | CODING_CATEGORY_MASK_UTF_16_LE);
d46c5b12 4063
1397dc18
KH
4064 /* Or, we may have to consider the possibility of CCL. */
4065 if (coding_system_table[CODING_CATEGORY_IDX_CCL]
4066 && (coding_system_table[CODING_CATEGORY_IDX_CCL]
4067 ->spec.ccl.valid_codes)[c])
4068 try |= CODING_CATEGORY_MASK_CCL;
4069
d46c5b12 4070 mask = 0;
fa42c37f 4071 utf16_examined_p = iso2022_examined_p = 0;
d46c5b12
KH
4072 if (priorities)
4073 {
4074 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
4075 {
fa42c37f
KH
4076 if (!iso2022_examined_p
4077 && (priorities[i] & try & CODING_CATEGORY_MASK_ISO))
4078 {
0192762c 4079 mask |= detect_coding_iso2022 (src, src_end, multibytep);
fa42c37f
KH
4080 iso2022_examined_p = 1;
4081 }
5ab13dd0 4082 else if (priorities[i] & try & CODING_CATEGORY_MASK_SJIS)
0a28aafb 4083 mask |= detect_coding_sjis (src, src_end, multibytep);
fa42c37f 4084 else if (priorities[i] & try & CODING_CATEGORY_MASK_UTF_8)
0a28aafb 4085 mask |= detect_coding_utf_8 (src, src_end, multibytep);
fa42c37f
KH
4086 else if (!utf16_examined_p
4087 && (priorities[i] & try &
4088 CODING_CATEGORY_MASK_UTF_16_BE_LE))
4089 {
0a28aafb 4090 mask |= detect_coding_utf_16 (src, src_end, multibytep);
fa42c37f
KH
4091 utf16_examined_p = 1;
4092 }
5ab13dd0 4093 else if (priorities[i] & try & CODING_CATEGORY_MASK_BIG5)
0a28aafb 4094 mask |= detect_coding_big5 (src, src_end, multibytep);
5ab13dd0 4095 else if (priorities[i] & try & CODING_CATEGORY_MASK_EMACS_MULE)
0a28aafb 4096 mask |= detect_coding_emacs_mule (src, src_end, multibytep);
89fa8b36 4097 else if (priorities[i] & try & CODING_CATEGORY_MASK_CCL)
0a28aafb 4098 mask |= detect_coding_ccl (src, src_end, multibytep);
5ab13dd0 4099 else if (priorities[i] & CODING_CATEGORY_MASK_RAW_TEXT)
fa42c37f 4100 mask |= CODING_CATEGORY_MASK_RAW_TEXT;
5ab13dd0 4101 else if (priorities[i] & CODING_CATEGORY_MASK_BINARY)
fa42c37f
KH
4102 mask |= CODING_CATEGORY_MASK_BINARY;
4103 if (mask & priorities[i])
4104 return priorities[i];
d46c5b12
KH
4105 }
4106 return CODING_CATEGORY_MASK_RAW_TEXT;
4107 }
4108 if (try & CODING_CATEGORY_MASK_ISO)
0a28aafb 4109 mask |= detect_coding_iso2022 (src, src_end, multibytep);
d46c5b12 4110 if (try & CODING_CATEGORY_MASK_SJIS)
0a28aafb 4111 mask |= detect_coding_sjis (src, src_end, multibytep);
d46c5b12 4112 if (try & CODING_CATEGORY_MASK_BIG5)
0a28aafb 4113 mask |= detect_coding_big5 (src, src_end, multibytep);
fa42c37f 4114 if (try & CODING_CATEGORY_MASK_UTF_8)
0a28aafb 4115 mask |= detect_coding_utf_8 (src, src_end, multibytep);
fa42c37f 4116 if (try & CODING_CATEGORY_MASK_UTF_16_BE_LE)
0a28aafb 4117 mask |= detect_coding_utf_16 (src, src_end, multibytep);
d46c5b12 4118 if (try & CODING_CATEGORY_MASK_EMACS_MULE)
0a28aafb 4119 mask |= detect_coding_emacs_mule (src, src_end, multibytep);
1397dc18 4120 if (try & CODING_CATEGORY_MASK_CCL)
0a28aafb 4121 mask |= detect_coding_ccl (src, src_end, multibytep);
c4825358 4122 }
5ab13dd0 4123 return (mask | CODING_CATEGORY_MASK_RAW_TEXT | CODING_CATEGORY_MASK_BINARY);
4ed46869
KH
4124}
4125
4126/* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
4127 The information of the detected coding system is set in CODING. */
4128
4129void
4130detect_coding (coding, src, src_bytes)
4131 struct coding_system *coding;
a4244313 4132 const unsigned char *src;
4ed46869
KH
4133 int src_bytes;
4134{
d46c5b12 4135 unsigned int idx;
da55a2b7 4136 int skip, mask;
84d60297 4137 Lisp_Object val;
4ed46869 4138
84d60297 4139 val = Vcoding_category_list;
64c1e55f
KH
4140 mask = detect_coding_mask (src, src_bytes, coding_priorities, &skip,
4141 coding->src_multibyte);
d46c5b12 4142 coding->heading_ascii = skip;
4ed46869 4143
d46c5b12
KH
4144 if (!mask) return;
4145
4146 /* We found a single coding system of the highest priority in MASK. */
4147 idx = 0;
4148 while (mask && ! (mask & 1)) mask >>= 1, idx++;
4149 if (! mask)
4150 idx = CODING_CATEGORY_IDX_RAW_TEXT;
4ed46869 4151
f5c1dd0d 4152 val = SYMBOL_VALUE (XVECTOR (Vcoding_category_table)->contents[idx]);
d46c5b12
KH
4153
4154 if (coding->eol_type != CODING_EOL_UNDECIDED)
27901516 4155 {
84d60297 4156 Lisp_Object tmp;
d46c5b12 4157
84d60297 4158 tmp = Fget (val, Qeol_type);
d46c5b12
KH
4159 if (VECTORP (tmp))
4160 val = XVECTOR (tmp)->contents[coding->eol_type];
4ed46869 4161 }
b73bfc1c
KH
4162
4163 /* Setup this new coding system while preserving some slots. */
4164 {
4165 int src_multibyte = coding->src_multibyte;
4166 int dst_multibyte = coding->dst_multibyte;
4167
4168 setup_coding_system (val, coding);
4169 coding->src_multibyte = src_multibyte;
4170 coding->dst_multibyte = dst_multibyte;
4171 coding->heading_ascii = skip;
4172 }
4ed46869
KH
4173}
4174
d46c5b12
KH
4175/* Detect how end-of-line of a text of length SRC_BYTES pointed by
4176 SOURCE is encoded. Return one of CODING_EOL_LF, CODING_EOL_CRLF,
4177 CODING_EOL_CR, and CODING_EOL_UNDECIDED.
4178
4179 How many non-eol characters are at the head is returned as *SKIP. */
4ed46869 4180
bc4bc72a
RS
4181#define MAX_EOL_CHECK_COUNT 3
4182
d46c5b12
KH
4183static int
4184detect_eol_type (source, src_bytes, skip)
4185 unsigned char *source;
4186 int src_bytes, *skip;
4ed46869 4187{
d46c5b12 4188 unsigned char *src = source, *src_end = src + src_bytes;
4ed46869 4189 unsigned char c;
bc4bc72a
RS
4190 int total = 0; /* How many end-of-lines are found so far. */
4191 int eol_type = CODING_EOL_UNDECIDED;
4192 int this_eol_type;
4ed46869 4193
d46c5b12
KH
4194 *skip = 0;
4195
bc4bc72a 4196 while (src < src_end && total < MAX_EOL_CHECK_COUNT)
4ed46869
KH
4197 {
4198 c = *src++;
bc4bc72a 4199 if (c == '\n' || c == '\r')
4ed46869 4200 {
d46c5b12
KH
4201 if (*skip == 0)
4202 *skip = src - 1 - source;
bc4bc72a
RS
4203 total++;
4204 if (c == '\n')
4205 this_eol_type = CODING_EOL_LF;
4206 else if (src >= src_end || *src != '\n')
4207 this_eol_type = CODING_EOL_CR;
4ed46869 4208 else
bc4bc72a
RS
4209 this_eol_type = CODING_EOL_CRLF, src++;
4210
4211 if (eol_type == CODING_EOL_UNDECIDED)
4212 /* This is the first end-of-line. */
4213 eol_type = this_eol_type;
4214 else if (eol_type != this_eol_type)
d46c5b12
KH
4215 {
4216 /* The found type is different from what found before. */
4217 eol_type = CODING_EOL_INCONSISTENT;
4218 break;
4219 }
4ed46869
KH
4220 }
4221 }
bc4bc72a 4222
d46c5b12
KH
4223 if (*skip == 0)
4224 *skip = src_end - source;
85a02ca4 4225 return eol_type;
4ed46869
KH
4226}
4227
fa42c37f
KH
4228/* Like detect_eol_type, but detect EOL type in 2-octet
4229 big-endian/little-endian format for coding systems utf-16-be and
4230 utf-16-le. */
4231
4232static int
4233detect_eol_type_in_2_octet_form (source, src_bytes, skip, big_endian_p)
4234 unsigned char *source;
cfb43547 4235 int src_bytes, *skip, big_endian_p;
fa42c37f
KH
4236{
4237 unsigned char *src = source, *src_end = src + src_bytes;
4238 unsigned int c1, c2;
4239 int total = 0; /* How many end-of-lines are found so far. */
4240 int eol_type = CODING_EOL_UNDECIDED;
4241 int this_eol_type;
4242 int msb, lsb;
4243
4244 if (big_endian_p)
4245 msb = 0, lsb = 1;
4246 else
4247 msb = 1, lsb = 0;
4248
4249 *skip = 0;
4250
4251 while ((src + 1) < src_end && total < MAX_EOL_CHECK_COUNT)
4252 {
4253 c1 = (src[msb] << 8) | (src[lsb]);
4254 src += 2;
4255
4256 if (c1 == '\n' || c1 == '\r')
4257 {
4258 if (*skip == 0)
4259 *skip = src - 2 - source;
4260 total++;
4261 if (c1 == '\n')
4262 {
4263 this_eol_type = CODING_EOL_LF;
4264 }
4265 else
4266 {
4267 if ((src + 1) >= src_end)
4268 {
4269 this_eol_type = CODING_EOL_CR;
4270 }
4271 else
4272 {
4273 c2 = (src[msb] << 8) | (src[lsb]);
4274 if (c2 == '\n')
4275 this_eol_type = CODING_EOL_CRLF, src += 2;
4276 else
4277 this_eol_type = CODING_EOL_CR;
4278 }
4279 }
4280
4281 if (eol_type == CODING_EOL_UNDECIDED)
4282 /* This is the first end-of-line. */
4283 eol_type = this_eol_type;
4284 else if (eol_type != this_eol_type)
4285 {
4286 /* The found type is different from what found before. */
4287 eol_type = CODING_EOL_INCONSISTENT;
4288 break;
4289 }
4290 }
4291 }
4292
4293 if (*skip == 0)
4294 *skip = src_end - source;
4295 return eol_type;
4296}
4297
4ed46869
KH
4298/* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
4299 is encoded. If it detects an appropriate format of end-of-line, it
4300 sets the information in *CODING. */
4301
4302void
4303detect_eol (coding, src, src_bytes)
4304 struct coding_system *coding;
a4244313 4305 const unsigned char *src;
4ed46869
KH
4306 int src_bytes;
4307{
4608c386 4308 Lisp_Object val;
d46c5b12 4309 int skip;
fa42c37f
KH
4310 int eol_type;
4311
4312 switch (coding->category_idx)
4313 {
4314 case CODING_CATEGORY_IDX_UTF_16_BE:
4315 eol_type = detect_eol_type_in_2_octet_form (src, src_bytes, &skip, 1);
4316 break;
4317 case CODING_CATEGORY_IDX_UTF_16_LE:
4318 eol_type = detect_eol_type_in_2_octet_form (src, src_bytes, &skip, 0);
4319 break;
4320 default:
4321 eol_type = detect_eol_type (src, src_bytes, &skip);
4322 break;
4323 }
d46c5b12
KH
4324
4325 if (coding->heading_ascii > skip)
4326 coding->heading_ascii = skip;
4327 else
4328 skip = coding->heading_ascii;
4ed46869 4329
0ef69138 4330 if (eol_type == CODING_EOL_UNDECIDED)
4ed46869 4331 return;
27901516
KH
4332 if (eol_type == CODING_EOL_INCONSISTENT)
4333 {
4334#if 0
4335 /* This code is suppressed until we find a better way to
992f23f2 4336 distinguish raw text file and binary file. */
27901516
KH
4337
4338 /* If we have already detected that the coding is raw-text, the
4339 coding should actually be no-conversion. */
4340 if (coding->type == coding_type_raw_text)
4341 {
4342 setup_coding_system (Qno_conversion, coding);
4343 return;
4344 }
4345 /* Else, let's decode only text code anyway. */
4346#endif /* 0 */
1b2af4b0 4347 eol_type = CODING_EOL_LF;
27901516
KH
4348 }
4349
4608c386 4350 val = Fget (coding->symbol, Qeol_type);
4ed46869 4351 if (VECTORP (val) && XVECTOR (val)->size == 3)
d46c5b12 4352 {
b73bfc1c
KH
4353 int src_multibyte = coding->src_multibyte;
4354 int dst_multibyte = coding->dst_multibyte;
1cd6b64c 4355 struct composition_data *cmp_data = coding->cmp_data;
b73bfc1c 4356
d46c5b12 4357 setup_coding_system (XVECTOR (val)->contents[eol_type], coding);
b73bfc1c
KH
4358 coding->src_multibyte = src_multibyte;
4359 coding->dst_multibyte = dst_multibyte;
d46c5b12 4360 coding->heading_ascii = skip;
1cd6b64c 4361 coding->cmp_data = cmp_data;
d46c5b12
KH
4362 }
4363}
4364
/* Extra room (in bytes) added to every conversion buffer size
   estimate.  */
#define CONVERSION_BUFFER_EXTRA_ROOM 256

/* Factor by which the byte length of a text is multiplied to estimate
   the size of the buffer needed for decoding it with CODING (a pointer
   to struct coding_system): 3 for ISO2022, the magnification recorded
   in the CCL decoder program for CCL, and 2 otherwise.  The argument
   is parenthesized so that any pointer expression may be passed.  */
#define DECODING_BUFFER_MAG(coding)				\
  ((coding)->type == coding_type_iso2022			\
   ? 3								\
   : ((coding)->type == coding_type_ccl				\
      ? (coding)->spec.ccl.decoder.buf_magnification		\
      : 2))
d46c5b12
KH
4373
4374/* Return maximum size (bytes) of a buffer enough for decoding
4375 SRC_BYTES of text encoded in CODING. */
4376
4377int
4378decoding_buffer_size (coding, src_bytes)
4379 struct coding_system *coding;
4380 int src_bytes;
4381{
4382 return (src_bytes * DECODING_BUFFER_MAG (coding)
4383 + CONVERSION_BUFFER_EXTRA_ROOM);
4384}
4385
4386/* Return maximum size (bytes) of a buffer enough for encoding
4387 SRC_BYTES of text to CODING. */
4388
4389int
4390encoding_buffer_size (coding, src_bytes)
4391 struct coding_system *coding;
4392 int src_bytes;
4393{
4394 int magnification;
4395
4396 if (coding->type == coding_type_ccl)
4397 magnification = coding->spec.ccl.encoder.buf_magnification;
b73bfc1c 4398 else if (CODING_REQUIRE_ENCODING (coding))
d46c5b12 4399 magnification = 3;
b73bfc1c
KH
4400 else
4401 magnification = 1;
d46c5b12
KH
4402
4403 return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
4404}
4405
73be902c
KH
/* Working buffer for code conversion.  */
struct conversion_buffer
{
  int size;			/* Number of bytes allocated for DATA.  */
  int on_stack;			/* 1 if DATA was allocated by alloca.  */
  unsigned char *data;
};

/* Don't use alloca for allocating memory space larger than this, lest
   we overflow their stack.  */
#define MAX_ALLOCA (16*1024)

/* Allocate LEN bytes of memory for BUF (struct conversion_buffer, not
   a pointer to one).  Requests smaller than MAX_ALLOCA are served by
   alloca, larger ones by xmalloc; BUF.on_stack records which.

   This has to stay a macro: memory obtained from alloca belongs to
   the frame of the function in which the macro is expanded.  LEN is
   evaluated more than once, so it must not have side effects.  */
#define allocate_conversion_buffer(buf, len)			\
  do {								\
    if ((len) < MAX_ALLOCA)					\
      {								\
	(buf).data = (unsigned char *) alloca (len);		\
	(buf).on_stack = 1;					\
      }								\
    else							\
      {								\
	(buf).data = (unsigned char *) xmalloc (len);		\
	(buf).on_stack = 0;					\
      }								\
    (buf).size = (len);						\
  } while (0)
d46c5b12 4433
73be902c
KH
4434/* Double the allocated memory for *BUF. */
4435static void
4436extend_conversion_buffer (buf)
4437 struct conversion_buffer *buf;
d46c5b12 4438{
73be902c 4439 if (buf->on_stack)
d46c5b12 4440 {
73be902c
KH
4441 unsigned char *save = buf->data;
4442 buf->data = (unsigned char *) xmalloc (buf->size * 2);
4443 bcopy (save, buf->data, buf->size);
4444 buf->on_stack = 0;
d46c5b12 4445 }
73be902c
KH
4446 else
4447 {
4448 buf->data = (unsigned char *) xrealloc (buf->data, buf->size * 2);
4449 }
4450 buf->size *= 2;
4451}
4452
4453/* Free the allocated memory for BUF if it is not on stack. */
4454static void
4455free_conversion_buffer (buf)
4456 struct conversion_buffer *buf;
4457{
4458 if (!buf->on_stack)
4459 xfree (buf->data);
d46c5b12
KH
4460}
4461
4462int
4463ccl_coding_driver (coding, source, destination, src_bytes, dst_bytes, encodep)
4464 struct coding_system *coding;
4465 unsigned char *source, *destination;
4466 int src_bytes, dst_bytes, encodep;
4467{
4468 struct ccl_program *ccl
4469 = encodep ? &coding->spec.ccl.encoder : &coding->spec.ccl.decoder;
1c3478b0 4470 unsigned char *dst = destination;
d46c5b12 4471
bd64290d 4472 ccl->suppress_error = coding->suppress_error;
ae9ff118 4473 ccl->last_block = coding->mode & CODING_MODE_LAST_BLOCK;
aaaf0b1e 4474 if (encodep)
80e0ca99
KH
4475 {
4476 /* On encoding, EOL format is converted within ccl_driver. For
4477 that, setup proper information in the structure CCL. */
4478 ccl->eol_type = coding->eol_type;
4479 if (ccl->eol_type ==CODING_EOL_UNDECIDED)
4480 ccl->eol_type = CODING_EOL_LF;
4481 ccl->cr_consumed = coding->spec.ccl.cr_carryover;
4482 }
7272d75c 4483 ccl->multibyte = coding->src_multibyte;
1c3478b0
KH
4484 if (coding->spec.ccl.eight_bit_carryover[0] != 0)
4485 {
4486 /* Move carryover bytes to DESTINATION. */
4487 unsigned char *p = coding->spec.ccl.eight_bit_carryover;
4488 while (*p)
4489 *dst++ = *p++;
4490 coding->spec.ccl.eight_bit_carryover[0] = 0;
4491 if (dst_bytes)
4492 dst_bytes -= dst - destination;
4493 }
4494
4495 coding->produced = (ccl_driver (ccl, source, dst, src_bytes, dst_bytes,
4496 &(coding->consumed))
4497 + dst - destination);
4498
b73bfc1c 4499 if (encodep)
80e0ca99
KH
4500 {
4501 coding->produced_char = coding->produced;
4502 coding->spec.ccl.cr_carryover = ccl->cr_consumed;
4503 }
ade8d05e
KH
4504 else if (!ccl->eight_bit_control)
4505 {
4506 /* The produced bytes forms a valid multibyte sequence. */
4507 coding->produced_char
4508 = multibyte_chars_in_text (destination, coding->produced);
4509 coding->spec.ccl.eight_bit_carryover[0] = 0;
4510 }
b73bfc1c
KH
4511 else
4512 {
1c3478b0
KH
4513 /* On decoding, the destination should always multibyte. But,
4514 CCL program might have been generated an invalid multibyte
4515 sequence. Here we make such a sequence valid as
4516 multibyte. */
b73bfc1c
KH
4517 int bytes
4518 = dst_bytes ? dst_bytes : source + coding->consumed - destination;
1c3478b0
KH
4519
4520 if ((coding->consumed < src_bytes
4521 || !ccl->last_block)
4522 && coding->produced >= 1
4523 && destination[coding->produced - 1] >= 0x80)
4524 {
4525 /* We should not convert the tailing 8-bit codes to
4526 multibyte form even if they doesn't form a valid
4527 multibyte sequence. They may form a valid sequence in
4528 the next call. */
4529 int carryover = 0;
4530
4531 if (destination[coding->produced - 1] < 0xA0)
4532 carryover = 1;
4533 else if (coding->produced >= 2)
4534 {
4535 if (destination[coding->produced - 2] >= 0x80)
4536 {
4537 if (destination[coding->produced - 2] < 0xA0)
4538 carryover = 2;
4539 else if (coding->produced >= 3
4540 && destination[coding->produced - 3] >= 0x80
4541 && destination[coding->produced - 3] < 0xA0)
4542 carryover = 3;
4543 }
4544 }
4545 if (carryover > 0)
4546 {
4547 BCOPY_SHORT (destination + coding->produced - carryover,
4548 coding->spec.ccl.eight_bit_carryover,
4549 carryover);
4550 coding->spec.ccl.eight_bit_carryover[carryover] = 0;
4551 coding->produced -= carryover;
4552 }
4553 }
b73bfc1c
KH
4554 coding->produced = str_as_multibyte (destination, bytes,
4555 coding->produced,
4556 &(coding->produced_char));
4557 }
69f76525 4558
d46c5b12
KH
4559 switch (ccl->status)
4560 {
4561 case CCL_STAT_SUSPEND_BY_SRC:
73be902c 4562 coding->result = CODING_FINISH_INSUFFICIENT_SRC;
d46c5b12
KH
4563 break;
4564 case CCL_STAT_SUSPEND_BY_DST:
73be902c 4565 coding->result = CODING_FINISH_INSUFFICIENT_DST;
d46c5b12 4566 break;
9864ebce
KH
4567 case CCL_STAT_QUIT:
4568 case CCL_STAT_INVALID_CMD:
73be902c 4569 coding->result = CODING_FINISH_INTERRUPT;
9864ebce 4570 break;
d46c5b12 4571 default:
73be902c 4572 coding->result = CODING_FINISH_NORMAL;
d46c5b12
KH
4573 break;
4574 }
73be902c 4575 return coding->result;
4ed46869
KH
4576}
4577
aaaf0b1e
KH
4578/* Decode EOL format of the text at PTR of BYTES length destructively
4579 according to CODING->eol_type. This is called after the CCL
4580 program produced a decoded text at PTR. If we do CRLF->LF
4581 conversion, update CODING->produced and CODING->produced_char. */
4582
4583static void
4584decode_eol_post_ccl (coding, ptr, bytes)
4585 struct coding_system *coding;
4586 unsigned char *ptr;
4587 int bytes;
4588{
4589 Lisp_Object val, saved_coding_symbol;
4590 unsigned char *pend = ptr + bytes;
4591 int dummy;
4592
4593 /* Remember the current coding system symbol. We set it back when
4594 an inconsistent EOL is found so that `last-coding-system-used' is
4595 set to the coding system that doesn't specify EOL conversion. */
4596 saved_coding_symbol = coding->symbol;
4597
4598 coding->spec.ccl.cr_carryover = 0;
4599 if (coding->eol_type == CODING_EOL_UNDECIDED)
4600 {
4601 /* Here, to avoid the call of setup_coding_system, we directly
4602 call detect_eol_type. */
4603 coding->eol_type = detect_eol_type (ptr, bytes, &dummy);
74b01b80
EZ
4604 if (coding->eol_type == CODING_EOL_INCONSISTENT)
4605 coding->eol_type = CODING_EOL_LF;
4606 if (coding->eol_type != CODING_EOL_UNDECIDED)
4607 {
4608 val = Fget (coding->symbol, Qeol_type);
4609 if (VECTORP (val) && XVECTOR (val)->size == 3)
4610 coding->symbol = XVECTOR (val)->contents[coding->eol_type];
4611 }
aaaf0b1e
KH
4612 coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4613 }
4614
74b01b80
EZ
4615 if (coding->eol_type == CODING_EOL_LF
4616 || coding->eol_type == CODING_EOL_UNDECIDED)
aaaf0b1e
KH
4617 {
4618 /* We have nothing to do. */
4619 ptr = pend;
4620 }
4621 else if (coding->eol_type == CODING_EOL_CRLF)
4622 {
4623 unsigned char *pstart = ptr, *p = ptr;
4624
4625 if (! (coding->mode & CODING_MODE_LAST_BLOCK)
4626 && *(pend - 1) == '\r')
4627 {
4628 /* If the last character is CR, we can't handle it here
4629 because LF will be in the not-yet-decoded source text.
9861e777 4630 Record that the CR is not yet processed. */
aaaf0b1e
KH
4631 coding->spec.ccl.cr_carryover = 1;
4632 coding->produced--;
4633 coding->produced_char--;
4634 pend--;
4635 }
4636 while (ptr < pend)
4637 {
4638 if (*ptr == '\r')
4639 {
4640 if (ptr + 1 < pend && *(ptr + 1) == '\n')
4641 {
4642 *p++ = '\n';
4643 ptr += 2;
4644 }
4645 else
4646 {
4647 if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4648 goto undo_eol_conversion;
4649 *p++ = *ptr++;
4650 }
4651 }
4652 else if (*ptr == '\n'
4653 && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4654 goto undo_eol_conversion;
4655 else
4656 *p++ = *ptr++;
4657 continue;
4658
4659 undo_eol_conversion:
4660 /* We have faced with inconsistent EOL format at PTR.
4661 Convert all LFs before PTR back to CRLFs. */
4662 for (p--, ptr--; p >= pstart; p--)
4663 {
4664 if (*p == '\n')
4665 *ptr-- = '\n', *ptr-- = '\r';
4666 else
4667 *ptr-- = *p;
4668 }
4669 /* If carryover is recorded, cancel it because we don't
4670 convert CRLF anymore. */
4671 if (coding->spec.ccl.cr_carryover)
4672 {
4673 coding->spec.ccl.cr_carryover = 0;
4674 coding->produced++;
4675 coding->produced_char++;
4676 pend++;
4677 }
4678 p = ptr = pend;
4679 coding->eol_type = CODING_EOL_LF;
4680 coding->symbol = saved_coding_symbol;
4681 }
4682 if (p < pend)
4683 {
4684 /* As each two-byte sequence CRLF was converted to LF, (PEND
4685 - P) is the number of deleted characters. */
4686 coding->produced -= pend - p;
4687 coding->produced_char -= pend - p;
4688 }
4689 }
4690 else /* i.e. coding->eol_type == CODING_EOL_CR */
4691 {
4692 unsigned char *p = ptr;
4693
4694 for (; ptr < pend; ptr++)
4695 {
4696 if (*ptr == '\r')
4697 *ptr = '\n';
4698 else if (*ptr == '\n'
4699 && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4700 {
4701 for (; p < ptr; p++)
4702 {
4703 if (*p == '\n')
4704 *p = '\r';
4705 }
4706 ptr = pend;
4707 coding->eol_type = CODING_EOL_LF;
4708 coding->symbol = saved_coding_symbol;
4709 }
4710 }
4711 }
4712}
4713
4ed46869
KH
4714/* See "GENERAL NOTES about `decode_coding_XXX ()' functions". Before
4715 decoding, it may detect coding system and format of end-of-line if
b73bfc1c
KH
4716 those are not yet decided. The source should be unibyte, the
4717 result is multibyte if CODING->dst_multibyte is nonzero, else
4718 unibyte. */
4ed46869
KH
4719
4720int
d46c5b12 4721decode_coding (coding, source, destination, src_bytes, dst_bytes)
4ed46869 4722 struct coding_system *coding;
a4244313
KR
4723 const unsigned char *source;
4724 unsigned char *destination;
4ed46869 4725 int src_bytes, dst_bytes;
4ed46869 4726{
9861e777
EZ
4727 int extra = 0;
4728
0ef69138 4729 if (coding->type == coding_type_undecided)
4ed46869
KH
4730 detect_coding (coding, source, src_bytes);
4731
aaaf0b1e
KH
4732 if (coding->eol_type == CODING_EOL_UNDECIDED
4733 && coding->type != coding_type_ccl)
8844fa83
KH
4734 {
4735 detect_eol (coding, source, src_bytes);
4736 /* We had better recover the original eol format if we
8ca3766a 4737 encounter an inconsistent eol format while decoding. */
8844fa83
KH
4738 coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4739 }
4ed46869 4740
b73bfc1c
KH
4741 coding->produced = coding->produced_char = 0;
4742 coding->consumed = coding->consumed_char = 0;
4743 coding->errors = 0;
4744 coding->result = CODING_FINISH_NORMAL;
4745
4ed46869
KH
4746 switch (coding->type)
4747 {
4ed46869 4748 case coding_type_sjis:
b73bfc1c
KH
4749 decode_coding_sjis_big5 (coding, source, destination,
4750 src_bytes, dst_bytes, 1);
4ed46869
KH
4751 break;
4752
4753 case coding_type_iso2022:
b73bfc1c
KH
4754 decode_coding_iso2022 (coding, source, destination,
4755 src_bytes, dst_bytes);
4ed46869
KH
4756 break;
4757
4758 case coding_type_big5:
b73bfc1c
KH
4759 decode_coding_sjis_big5 (coding, source, destination,
4760 src_bytes, dst_bytes, 0);
4761 break;
4762
4763 case coding_type_emacs_mule:
4764 decode_coding_emacs_mule (coding, source, destination,
4765 src_bytes, dst_bytes);
4ed46869
KH
4766 break;
4767
4768 case coding_type_ccl:
aaaf0b1e
KH
4769 if (coding->spec.ccl.cr_carryover)
4770 {
9861e777
EZ
4771 /* Put the CR which was not processed by the previous call
4772 of decode_eol_post_ccl in DESTINATION. It will be
4773 decoded together with the following LF by the call to
4774 decode_eol_post_ccl below. */
aaaf0b1e
KH
4775 *destination = '\r';
4776 coding->produced++;
4777 coding->produced_char++;
4778 dst_bytes--;
9861e777 4779 extra = coding->spec.ccl.cr_carryover;
aaaf0b1e 4780 }
9861e777 4781 ccl_coding_driver (coding, source, destination + extra,
b73bfc1c 4782 src_bytes, dst_bytes, 0);
aaaf0b1e 4783 if (coding->eol_type != CODING_EOL_LF)
9861e777
EZ
4784 {
4785 coding->produced += extra;
4786 coding->produced_char += extra;
4787 decode_eol_post_ccl (coding, destination, coding->produced);
4788 }
d46c5b12
KH
4789 break;
4790
b73bfc1c
KH
4791 default:
4792 decode_eol (coding, source, destination, src_bytes, dst_bytes);
4793 }
4794
4795 if (coding->result == CODING_FINISH_INSUFFICIENT_SRC
e7c9eef9 4796 && coding->mode & CODING_MODE_LAST_BLOCK
b73bfc1c
KH
4797 && coding->consumed == src_bytes)
4798 coding->result = CODING_FINISH_NORMAL;
4799
4800 if (coding->mode & CODING_MODE_LAST_BLOCK
4801 && coding->result == CODING_FINISH_INSUFFICIENT_SRC)
4802 {
a4244313 4803 const unsigned char *src = source + coding->consumed;
b73bfc1c
KH
4804 unsigned char *dst = destination + coding->produced;
4805
4806 src_bytes -= coding->consumed;
bb10be8b 4807 coding->errors++;
b73bfc1c
KH
4808 if (COMPOSING_P (coding))
4809 DECODE_COMPOSITION_END ('1');
4810 while (src_bytes--)
d46c5b12 4811 {
b73bfc1c
KH
4812 int c = *src++;
4813 dst += CHAR_STRING (c, dst);
4814 coding->produced_char++;
d46c5b12 4815 }
b73bfc1c
KH
4816 coding->consumed = coding->consumed_char = src - source;
4817 coding->produced = dst - destination;
73be902c 4818 coding->result = CODING_FINISH_NORMAL;
4ed46869
KH
4819 }
4820
b73bfc1c
KH
4821 if (!coding->dst_multibyte)
4822 {
4823 coding->produced = str_as_unibyte (destination, coding->produced);
4824 coding->produced_char = coding->produced;
4825 }
4ed46869 4826
b73bfc1c
KH
4827 return coding->result;
4828}
52d41803 4829
b73bfc1c
KH
4830/* See "GENERAL NOTES about `encode_coding_XXX ()' functions". The
4831 multibyteness of the source is CODING->src_multibyte, the
4832 multibyteness of the result is always unibyte. */
4ed46869
KH
4833
4834int
d46c5b12 4835encode_coding (coding, source, destination, src_bytes, dst_bytes)
4ed46869 4836 struct coding_system *coding;
a4244313
KR
4837 const unsigned char *source;
4838 unsigned char *destination;
4ed46869 4839 int src_bytes, dst_bytes;
4ed46869 4840{
b73bfc1c
KH
4841 coding->produced = coding->produced_char = 0;
4842 coding->consumed = coding->consumed_char = 0;
4843 coding->errors = 0;
4844 coding->result = CODING_FINISH_NORMAL;
4ed46869 4845
d46c5b12
KH
4846 switch (coding->type)
4847 {
4ed46869 4848 case coding_type_sjis:
b73bfc1c
KH
4849 encode_coding_sjis_big5 (coding, source, destination,
4850 src_bytes, dst_bytes, 1);
4ed46869
KH
4851 break;
4852
4853 case coding_type_iso2022:
b73bfc1c
KH
4854 encode_coding_iso2022 (coding, source, destination,
4855 src_bytes, dst_bytes);
4ed46869
KH
4856 break;
4857
4858 case coding_type_big5:
b73bfc1c
KH
4859 encode_coding_sjis_big5 (coding, source, destination,
4860 src_bytes, dst_bytes, 0);
4861 break;
4862
4863 case coding_type_emacs_mule:
4864 encode_coding_emacs_mule (coding, source, destination,
4865 src_bytes, dst_bytes);
4ed46869
KH
4866 break;
4867
4868 case coding_type_ccl:
b73bfc1c
KH
4869 ccl_coding_driver (coding, source, destination,
4870 src_bytes, dst_bytes, 1);
d46c5b12
KH
4871 break;
4872
b73bfc1c
KH
4873 default:
4874 encode_eol (coding, source, destination, src_bytes, dst_bytes);
4875 }
4876
73be902c
KH
4877 if (coding->mode & CODING_MODE_LAST_BLOCK
4878 && coding->result == CODING_FINISH_INSUFFICIENT_SRC)
b73bfc1c 4879 {
a4244313 4880 const unsigned char *src = source + coding->consumed;
b73bfc1c
KH
4881 unsigned char *dst = destination + coding->produced;
4882
4883 if (coding->type == coding_type_iso2022)
4884 ENCODE_RESET_PLANE_AND_REGISTER;
4885 if (COMPOSING_P (coding))
4886 *dst++ = ISO_CODE_ESC, *dst++ = '1';
4887 if (coding->consumed < src_bytes)
d46c5b12 4888 {
b73bfc1c
KH
4889 int len = src_bytes - coding->consumed;
4890
fabf4a91 4891 BCOPY_SHORT (src, dst, len);
b73bfc1c
KH
4892 if (coding->src_multibyte)
4893 len = str_as_unibyte (dst, len);
4894 dst += len;
4895 coding->consumed = src_bytes;
d46c5b12 4896 }
b73bfc1c 4897 coding->produced = coding->produced_char = dst - destination;
73be902c 4898 coding->result = CODING_FINISH_NORMAL;
4ed46869
KH
4899 }
4900
bb10be8b
KH
4901 if (coding->result == CODING_FINISH_INSUFFICIENT_SRC
4902 && coding->consumed == src_bytes)
4903 coding->result = CODING_FINISH_NORMAL;
4904
b73bfc1c 4905 return coding->result;
4ed46869
KH
4906}
4907
fb88bf2d
KH
4908/* Scan text in the region between *BEG and *END (byte positions),
4909 skip characters which we don't have to decode by coding system
4910 CODING at the head and tail, then set *BEG and *END to the region
4911 of the text we actually have to convert. The caller should move
b73bfc1c
KH
4912 the gap out of the region in advance if the region is from a
4913 buffer.
4ed46869 4914
d46c5b12
KH
4915 If STR is not NULL, *BEG and *END are indices into STR. */
4916
4917static void
4918shrink_decoding_region (beg, end, coding, str)
4919 int *beg, *end;
4920 struct coding_system *coding;
4921 unsigned char *str;
4922{
fb88bf2d 4923 unsigned char *begp_orig, *begp, *endp_orig, *endp, c;
d46c5b12 4924 int eol_conversion;
88993dfd 4925 Lisp_Object translation_table;
d46c5b12
KH
4926
4927 if (coding->type == coding_type_ccl
4928 || coding->type == coding_type_undecided
b73bfc1c
KH
4929 || coding->eol_type != CODING_EOL_LF
4930 || !NILP (coding->post_read_conversion)
4931 || coding->composing != COMPOSITION_DISABLED)
d46c5b12
KH
4932 {
4933 /* We can't skip any data. */
4934 return;
4935 }
b73bfc1c
KH
4936 if (coding->type == coding_type_no_conversion
4937 || coding->type == coding_type_raw_text
4938 || coding->type == coding_type_emacs_mule)
d46c5b12 4939 {
fb88bf2d
KH
4940 /* We need no conversion, but don't have to skip any data here.
4941 Decoding routine handles them effectively anyway. */
d46c5b12
KH
4942 return;
4943 }
4944
88993dfd
KH
4945 translation_table = coding->translation_table_for_decode;
4946 if (NILP (translation_table) && !NILP (Venable_character_translation))
4947 translation_table = Vstandard_translation_table_for_decode;
4948 if (CHAR_TABLE_P (translation_table))
4949 {
4950 int i;
4951 for (i = 0; i < 128; i++)
4952 if (!NILP (CHAR_TABLE_REF (translation_table, i)))
4953 break;
4954 if (i < 128)
fa46990e 4955 /* Some ASCII character should be translated. We give up
88993dfd
KH
4956 shrinking. */
4957 return;
4958 }
4959
b73bfc1c 4960 if (coding->heading_ascii >= 0)
d46c5b12
KH
4961 /* Detection routine has already found how much we can skip at the
4962 head. */
4963 *beg += coding->heading_ascii;
4964
4965 if (str)
4966 {
4967 begp_orig = begp = str + *beg;
4968 endp_orig = endp = str + *end;
4969 }
4970 else
4971 {
fb88bf2d 4972 begp_orig = begp = BYTE_POS_ADDR (*beg);
d46c5b12
KH
4973 endp_orig = endp = begp + *end - *beg;
4974 }
4975
fa46990e
DL
4976 eol_conversion = (coding->eol_type == CODING_EOL_CR
4977 || coding->eol_type == CODING_EOL_CRLF);
4978
d46c5b12
KH
4979 switch (coding->type)
4980 {
d46c5b12
KH
4981 case coding_type_sjis:
4982 case coding_type_big5:
4983 /* We can skip all ASCII characters at the head. */
4984 if (coding->heading_ascii < 0)
4985 {
4986 if (eol_conversion)
de9d083c 4987 while (begp < endp && *begp < 0x80 && *begp != '\r') begp++;
d46c5b12
KH
4988 else
4989 while (begp < endp && *begp < 0x80) begp++;
4990 }
4991 /* We can skip all ASCII characters at the tail except for the
4992 second byte of SJIS or BIG5 code. */
4993 if (eol_conversion)
de9d083c 4994 while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\r') endp--;
d46c5b12
KH
4995 else
4996 while (begp < endp && endp[-1] < 0x80) endp--;
ee59c65f
RS
4997 /* Do not consider LF as ascii if preceded by CR, since that
4998 confuses eol decoding. */
4999 if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
5000 endp++;
d46c5b12
KH
5001 if (begp < endp && endp < endp_orig && endp[-1] >= 0x80)
5002 endp++;
5003 break;
5004
b73bfc1c 5005 case coding_type_iso2022:
622fece5
KH
5006 if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
5007 /* We can't skip any data. */
5008 break;
d46c5b12
KH
5009 if (coding->heading_ascii < 0)
5010 {
d46c5b12
KH
5011 /* We can skip all ASCII characters at the head except for a
5012 few control codes. */
5013 while (begp < endp && (c = *begp) < 0x80
5014 && c != ISO_CODE_CR && c != ISO_CODE_SO
5015 && c != ISO_CODE_SI && c != ISO_CODE_ESC
5016 && (!eol_conversion || c != ISO_CODE_LF))
5017 begp++;
5018 }
5019 switch (coding->category_idx)
5020 {
5021 case CODING_CATEGORY_IDX_ISO_8_1:
5022 case CODING_CATEGORY_IDX_ISO_8_2:
5023 /* We can skip all ASCII characters at the tail. */
5024 if (eol_conversion)
de9d083c 5025 while (begp < endp && (c = endp[-1]) < 0x80 && c != '\r') endp--;
d46c5b12
KH
5026 else
5027 while (begp < endp && endp[-1] < 0x80) endp--;
ee59c65f
RS
5028 /* Do not consider LF as ascii if preceded by CR, since that
5029 confuses eol decoding. */
5030 if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
5031 endp++;
d46c5b12
KH
5032 break;
5033
5034 case CODING_CATEGORY_IDX_ISO_7:
5035 case CODING_CATEGORY_IDX_ISO_7_TIGHT:
de79a6a5 5036 {
8ca3766a 5037 /* We can skip all characters at the tail except for 8-bit
de79a6a5
KH
5038 codes and ESC and the following 2-byte at the tail. */
5039 unsigned char *eight_bit = NULL;
5040
5041 if (eol_conversion)
5042 while (begp < endp
5043 && (c = endp[-1]) != ISO_CODE_ESC && c != '\r')
5044 {
5045 if (!eight_bit && c & 0x80) eight_bit = endp;
5046 endp--;
5047 }
5048 else
5049 while (begp < endp
5050 && (c = endp[-1]) != ISO_CODE_ESC)
5051 {
5052 if (!eight_bit && c & 0x80) eight_bit = endp;
5053 endp--;
5054 }
5055 /* Do not consider LF as ascii if preceded by CR, since that
5056 confuses eol decoding. */
5057 if (begp < endp && endp < endp_orig
5058 && endp[-1] == '\r' && endp[0] == '\n')
5059 endp++;
5060 if (begp < endp && endp[-1] == ISO_CODE_ESC)
5061 {
5062 if (endp + 1 < endp_orig && end[0] == '(' && end[1] == 'B')
5063 /* This is an ASCII designation sequence. We can
5064 surely skip the tail. But, if we have
5065 encountered an 8-bit code, skip only the codes
5066 after that. */
5067 endp = eight_bit ? eight_bit : endp + 2;
5068 else
5069 /* Hmmm, we can't skip the tail. */
5070 endp = endp_orig;
5071 }
5072 else if (eight_bit)
5073 endp = eight_bit;
5074 }
d46c5b12 5075 }
b73bfc1c
KH
5076 break;
5077
5078 default:
5079 abort ();
d46c5b12
KH
5080 }
5081 *beg += begp - begp_orig;
5082 *end += endp - endp_orig;
5083 return;
5084}
5085
5086/* Like shrink_decoding_region but for encoding. */
5087
5088static void
5089shrink_encoding_region (beg, end, coding, str)
5090 int *beg, *end;
5091 struct coding_system *coding;
5092 unsigned char *str;
5093{
5094 unsigned char *begp_orig, *begp, *endp_orig, *endp;
5095 int eol_conversion;
88993dfd 5096 Lisp_Object translation_table;
d46c5b12 5097
b73bfc1c
KH
5098 if (coding->type == coding_type_ccl
5099 || coding->eol_type == CODING_EOL_CRLF
5100 || coding->eol_type == CODING_EOL_CR
87323294 5101 || (coding->cmp_data && coding->cmp_data->used > 0))
d46c5b12 5102 {
b73bfc1c
KH
5103 /* We can't skip any data. */
5104 return;
5105 }
5106 if (coding->type == coding_type_no_conversion
5107 || coding->type == coding_type_raw_text
5108 || coding->type == coding_type_emacs_mule
5109 || coding->type == coding_type_undecided)
5110 {
5111 /* We need no conversion, but don't have to skip any data here.
5112 Encoding routine handles them effectively anyway. */
d46c5b12
KH
5113 return;
5114 }
5115
88993dfd
KH
5116 translation_table = coding->translation_table_for_encode;
5117 if (NILP (translation_table) && !NILP (Venable_character_translation))
5118 translation_table = Vstandard_translation_table_for_encode;
5119 if (CHAR_TABLE_P (translation_table))
5120 {
5121 int i;
5122 for (i = 0; i < 128; i++)
5123 if (!NILP (CHAR_TABLE_REF (translation_table, i)))
5124 break;
5125 if (i < 128)
8ca3766a 5126 /* Some ASCII character should be translated. We give up
88993dfd
KH
5127 shrinking. */
5128 return;
5129 }
5130
d46c5b12
KH
5131 if (str)
5132 {
5133 begp_orig = begp = str + *beg;
5134 endp_orig = endp = str + *end;
5135 }
5136 else
5137 {
fb88bf2d 5138 begp_orig = begp = BYTE_POS_ADDR (*beg);
d46c5b12
KH
5139 endp_orig = endp = begp + *end - *beg;
5140 }
5141
5142 eol_conversion = (coding->eol_type == CODING_EOL_CR
5143 || coding->eol_type == CODING_EOL_CRLF);
5144
5145 /* Here, we don't have to check coding->pre_write_conversion because
5146 the caller is expected to have handled it already. */
5147 switch (coding->type)
5148 {
d46c5b12 5149 case coding_type_iso2022:
622fece5
KH
5150 if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
5151 /* We can't skip any data. */
5152 break;
d46c5b12
KH
5153 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL)
5154 {
93dec019 5155 unsigned char *bol = begp;
d46c5b12
KH
5156 while (begp < endp && *begp < 0x80)
5157 {
5158 begp++;
5159 if (begp[-1] == '\n')
5160 bol = begp;
5161 }
5162 begp = bol;
5163 goto label_skip_tail;
5164 }
5165 /* fall down ... */
5166
b73bfc1c
KH
5167 case coding_type_sjis:
5168 case coding_type_big5:
d46c5b12
KH
5169 /* We can skip all ASCII characters at the head and tail. */
5170 if (eol_conversion)
5171 while (begp < endp && *begp < 0x80 && *begp != '\n') begp++;
5172 else
5173 while (begp < endp && *begp < 0x80) begp++;
5174 label_skip_tail:
5175 if (eol_conversion)
5176 while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\n') endp--;
5177 else
5178 while (begp < endp && *(endp - 1) < 0x80) endp--;
5179 break;
b73bfc1c
KH
5180
5181 default:
5182 abort ();
d46c5b12
KH
5183 }
5184
5185 *beg += begp - begp_orig;
5186 *end += endp - endp_orig;
5187 return;
5188}
5189
88993dfd
KH
/* Shrinking a conversion region has some overhead of its own, so we
   try it only when the length of the region is greater than this
   value.  */
static int shrink_conversion_region_threshhold = 1024;

/* Shrink the region between *BEG and *END for conversion by CODING,
   using shrink_encoding_region if ENCODEP is nonzero and
   shrink_decoding_region otherwise.  Do nothing if the region is not
   longer than shrink_conversion_region_threshhold.  */
#define SHRINK_CONVERSION_REGION(beg, end, coding, str, encodep)        \
  do {                                                                  \
    if (*(end) - *(beg) > shrink_conversion_region_threshhold)          \
      {                                                                 \
        if (!(encodep))                                                 \
          shrink_decoding_region (beg, end, coding, str);               \
        else                                                            \
          shrink_encoding_region (beg, end, coding, str);               \
      }                                                                 \
  } while (0)
5203
b843d1ae
KH
5204static Lisp_Object
5205code_convert_region_unwind (dummy)
5206 Lisp_Object dummy;
5207{
5208 inhibit_pre_post_conversion = 0;
5209 return Qnil;
5210}
5211
ec6d2bb8
KH
5212/* Store information about all compositions in the range FROM and TO
5213 of OBJ in memory blocks pointed by CODING->cmp_data. OBJ is a
5214 buffer or a string, defaults to the current buffer. */
5215
5216void
5217coding_save_composition (coding, from, to, obj)
5218 struct coding_system *coding;
5219 int from, to;
5220 Lisp_Object obj;
5221{
5222 Lisp_Object prop;
5223 int start, end;
5224
91bee881
KH
5225 if (coding->composing == COMPOSITION_DISABLED)
5226 return;
5227 if (!coding->cmp_data)
5228 coding_allocate_composition_data (coding, from);
ec6d2bb8
KH
5229 if (!find_composition (from, to, &start, &end, &prop, obj)
5230 || end > to)
5231 return;
5232 if (start < from
5233 && (!find_composition (end, to, &start, &end, &prop, obj)
5234 || end > to))
5235 return;
5236 coding->composing = COMPOSITION_NO;
ec6d2bb8
KH
5237 do
5238 {
5239 if (COMPOSITION_VALID_P (start, end, prop))
5240 {
5241 enum composition_method method = COMPOSITION_METHOD (prop);
5242 if (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH
5243 >= COMPOSITION_DATA_SIZE)
5244 coding_allocate_composition_data (coding, from);
5245 /* For relative composition, we remember start and end
5246 positions, for the other compositions, we also remember
5247 components. */
5248 CODING_ADD_COMPOSITION_START (coding, start - from, method);
5249 if (method != COMPOSITION_RELATIVE)
5250 {
5251 /* We must store a*/
5252 Lisp_Object val, ch;
5253
5254 val = COMPOSITION_COMPONENTS (prop);
5255 if (CONSP (val))
5256 while (CONSP (val))
5257 {
5258 ch = XCAR (val), val = XCDR (val);
5259 CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (ch));
5260 }
5261 else if (VECTORP (val) || STRINGP (val))
5262 {
5263 int len = (VECTORP (val)
d5db4077 5264 ? XVECTOR (val)->size : SCHARS (val));
ec6d2bb8
KH
5265 int i;
5266 for (i = 0; i < len; i++)
5267 {
5268 ch = (STRINGP (val)
5269 ? Faref (val, make_number (i))
5270 : XVECTOR (val)->contents[i]);
5271 CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (ch));
5272 }
5273 }
5274 else /* INTEGERP (val) */
5275 CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (val));
5276 }
5277 CODING_ADD_COMPOSITION_END (coding, end - from);
5278 }
5279 start = end;
5280 }
5281 while (start < to
5282 && find_composition (start, to, &start, &end, &prop, obj)
5283 && end <= to);
5284
5285 /* Make coding->cmp_data point to the first memory block. */
5286 while (coding->cmp_data->prev)
5287 coding->cmp_data = coding->cmp_data->prev;
5288 coding->cmp_data_start = 0;
5289}
5290
5291/* Reflect the saved information about compositions to OBJ.
8ca3766a 5292 CODING->cmp_data points to a memory block for the information. OBJ
ec6d2bb8
KH
5293 is a buffer or a string, defaults to the current buffer. */
5294
33fb63eb 5295void
ec6d2bb8
KH
5296coding_restore_composition (coding, obj)
5297 struct coding_system *coding;
5298 Lisp_Object obj;
5299{
5300 struct composition_data *cmp_data = coding->cmp_data;
5301
5302 if (!cmp_data)
5303 return;
5304
5305 while (cmp_data->prev)
5306 cmp_data = cmp_data->prev;
5307
5308 while (cmp_data)
5309 {
5310 int i;
5311
78108bcd
KH
5312 for (i = 0; i < cmp_data->used && cmp_data->data[i] > 0;
5313 i += cmp_data->data[i])
ec6d2bb8
KH
5314 {
5315 int *data = cmp_data->data + i;
5316 enum composition_method method = (enum composition_method) data[3];
5317 Lisp_Object components;
5318
5319 if (method == COMPOSITION_RELATIVE)
5320 components = Qnil;
5321 else
5322 {
5323 int len = data[0] - 4, j;
5324 Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
5325
b6871cc7
KH
5326 if (method == COMPOSITION_WITH_RULE_ALTCHARS
5327 && len % 2 == 0)
5328 len --;
ec6d2bb8
KH
5329 for (j = 0; j < len; j++)
5330 args[j] = make_number (data[4 + j]);
5331 components = (method == COMPOSITION_WITH_ALTCHARS
5332 ? Fstring (len, args) : Fvector (len, args));
5333 }
5334 compose_text (data[1], data[2], components, Qnil, obj);
5335 }
5336 cmp_data = cmp_data->next;
5337 }
5338}
5339
d46c5b12 5340/* Decode (if ENCODEP is zero) or encode (if ENCODEP is nonzero) the
fb88bf2d
KH
5341 text from FROM to TO (byte positions are FROM_BYTE and TO_BYTE) by
5342 coding system CODING, and return the status code of code conversion
5343 (currently, this value has no meaning).
5344
5345 How many characters (and bytes) are converted to how many
5346 characters (and bytes) are recorded in members of the structure
5347 CODING.
d46c5b12 5348
6e44253b 5349 If REPLACE is nonzero, we do various things as if the original text
d46c5b12 5350 is deleted and a new text is inserted. See the comments in
b73bfc1c
KH
5351 replace_range (insdel.c) to know what we are doing.
5352
5353 If REPLACE is zero, it is assumed that the source text is unibyte.
8ca3766a 5354 Otherwise, it is assumed that the source text is multibyte. */
4ed46869
KH
5355
5356int
6e44253b
KH
5357code_convert_region (from, from_byte, to, to_byte, coding, encodep, replace)
5358 int from, from_byte, to, to_byte, encodep, replace;
4ed46869 5359 struct coding_system *coding;
4ed46869 5360{
fb88bf2d 5361 int len = to - from, len_byte = to_byte - from_byte;
72d1a715 5362 int nchars_del = 0, nbytes_del = 0;
fb88bf2d 5363 int require, inserted, inserted_byte;
4b39528c 5364 int head_skip, tail_skip, total_skip = 0;
84d60297 5365 Lisp_Object saved_coding_symbol;
fb88bf2d 5366 int first = 1;
fb88bf2d 5367 unsigned char *src, *dst;
84d60297 5368 Lisp_Object deletion;
e133c8fa 5369 int orig_point = PT, orig_len = len;
6abb9bd9 5370 int prev_Z;
b73bfc1c
KH
5371 int multibyte_p = !NILP (current_buffer->enable_multibyte_characters);
5372
84d60297 5373 deletion = Qnil;
8844fa83 5374 saved_coding_symbol = coding->symbol;
d46c5b12 5375
83fa074f 5376 if (from < PT && PT < to)
e133c8fa
KH
5377 {
5378 TEMP_SET_PT_BOTH (from, from_byte);
5379 orig_point = from;
5380 }
83fa074f 5381
6e44253b 5382 if (replace)
d46c5b12 5383 {
fb88bf2d 5384 int saved_from = from;
e077cc80 5385 int saved_inhibit_modification_hooks;
fb88bf2d 5386
d46c5b12 5387 prepare_to_modify_buffer (from, to, &from);
fb88bf2d
KH
5388 if (saved_from != from)
5389 {
5390 to = from + len;
b73bfc1c 5391 from_byte = CHAR_TO_BYTE (from), to_byte = CHAR_TO_BYTE (to);
fb88bf2d
KH
5392 len_byte = to_byte - from_byte;
5393 }
e077cc80
KH
5394
5395 /* The code conversion routine can not preserve text properties
5396 for now. So, we must remove all text properties in the
5397 region. Here, we must suppress all modification hooks. */
5398 saved_inhibit_modification_hooks = inhibit_modification_hooks;
5399 inhibit_modification_hooks = 1;
5400 Fset_text_properties (make_number (from), make_number (to), Qnil, Qnil);
5401 inhibit_modification_hooks = saved_inhibit_modification_hooks;
d46c5b12 5402 }
d46c5b12
KH
5403
5404 if (! encodep && CODING_REQUIRE_DETECTION (coding))
5405 {
12410ef1 5406 /* We must detect encoding of text and eol format. */
d46c5b12
KH
5407
5408 if (from < GPT && to > GPT)
5409 move_gap_both (from, from_byte);
5410 if (coding->type == coding_type_undecided)
5411 {
fb88bf2d 5412 detect_coding (coding, BYTE_POS_ADDR (from_byte), len_byte);
d46c5b12 5413 if (coding->type == coding_type_undecided)
62b3ef1d
KH
5414 {
5415 /* It seems that the text contains only ASCII, but we
d9aef30f 5416 should not leave it undecided because the deeper
62b3ef1d
KH
5417 decoding routine (decode_coding) tries to detect the
5418 encodings again in vain. */
5419 coding->type = coding_type_emacs_mule;
5420 coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE;
d280ccb6
KH
5421 /* As emacs-mule decoder will handle composition, we
5422 need this setting to allocate coding->cmp_data
5423 later. */
5424 coding->composing = COMPOSITION_NO;
62b3ef1d 5425 }
d46c5b12 5426 }
aaaf0b1e
KH
5427 if (coding->eol_type == CODING_EOL_UNDECIDED
5428 && coding->type != coding_type_ccl)
d46c5b12 5429 {
d46c5b12
KH
5430 detect_eol (coding, BYTE_POS_ADDR (from_byte), len_byte);
5431 if (coding->eol_type == CODING_EOL_UNDECIDED)
5432 coding->eol_type = CODING_EOL_LF;
5433 /* We had better recover the original eol format if we
8ca3766a 5434 encounter an inconsistent eol format while decoding. */
d46c5b12
KH
5435 coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
5436 }
5437 }
5438
d46c5b12
KH
5439 /* Now we convert the text. */
5440
5441 /* For encoding, we must process pre-write-conversion in advance. */
b73bfc1c
KH
5442 if (! inhibit_pre_post_conversion
5443 && encodep
d46c5b12
KH
5444 && SYMBOLP (coding->pre_write_conversion)
5445 && ! NILP (Ffboundp (coding->pre_write_conversion)))
5446 {
2b4f9037
KH
5447 /* The function in pre-write-conversion may put a new text in a
5448 new buffer. */
0007bdd0
KH
5449 struct buffer *prev = current_buffer;
5450 Lisp_Object new;
d46c5b12 5451
b843d1ae
KH
5452 record_unwind_protect (code_convert_region_unwind, Qnil);
5453 /* We should not call any more pre-write/post-read-conversion
5454 functions while this pre-write-conversion is running. */
5455 inhibit_pre_post_conversion = 1;
b39f748c
AS
5456 call2 (coding->pre_write_conversion,
5457 make_number (from), make_number (to));
b843d1ae
KH
5458 inhibit_pre_post_conversion = 0;
5459 /* Discard the unwind protect. */
5460 specpdl_ptr--;
5461
d46c5b12
KH
5462 if (current_buffer != prev)
5463 {
5464 len = ZV - BEGV;
0007bdd0 5465 new = Fcurrent_buffer ();
d46c5b12 5466 set_buffer_internal_1 (prev);
7dae4502 5467 del_range_2 (from, from_byte, to, to_byte, 0);
e133c8fa 5468 TEMP_SET_PT_BOTH (from, from_byte);
0007bdd0
KH
5469 insert_from_buffer (XBUFFER (new), 1, len, 0);
5470 Fkill_buffer (new);
e133c8fa
KH
5471 if (orig_point >= to)
5472 orig_point += len - orig_len;
5473 else if (orig_point > from)
5474 orig_point = from;
5475 orig_len = len;
d46c5b12 5476 to = from + len;
b73bfc1c
KH
5477 from_byte = CHAR_TO_BYTE (from);
5478 to_byte = CHAR_TO_BYTE (to);
d46c5b12 5479 len_byte = to_byte - from_byte;
e133c8fa 5480 TEMP_SET_PT_BOTH (from, from_byte);
d46c5b12
KH
5481 }
5482 }
5483
12410ef1 5484 if (replace)
72d1a715
RS
5485 {
5486 if (! EQ (current_buffer->undo_list, Qt))
5487 deletion = make_buffer_string_both (from, from_byte, to, to_byte, 1);
5488 else
5489 {
5490 nchars_del = to - from;
5491 nbytes_del = to_byte - from_byte;
5492 }
5493 }
12410ef1 5494
ec6d2bb8
KH
5495 if (coding->composing != COMPOSITION_DISABLED)
5496 {
5497 if (encodep)
5498 coding_save_composition (coding, from, to, Fcurrent_buffer ());
5499 else
5500 coding_allocate_composition_data (coding, from);
5501 }
fb88bf2d 5502
b73bfc1c 5503 /* Try to skip the heading and tailing ASCIIs. */
4956c225
KH
5504 if (coding->type != coding_type_ccl)
5505 {
5506 int from_byte_orig = from_byte, to_byte_orig = to_byte;
ec6d2bb8 5507
4956c225
KH
5508 if (from < GPT && GPT < to)
5509 move_gap_both (from, from_byte);
5510 SHRINK_CONVERSION_REGION (&from_byte, &to_byte, coding, NULL, encodep);
5511 if (from_byte == to_byte
5512 && (encodep || NILP (coding->post_read_conversion))
5513 && ! CODING_REQUIRE_FLUSHING (coding))
5514 {
5515 coding->produced = len_byte;
5516 coding->produced_char = len;
5517 if (!replace)
5518 /* We must record and adjust for this new text now. */
5519 adjust_after_insert (from, from_byte_orig, to, to_byte_orig, len);
5520 return 0;
5521 }
5522
5523 head_skip = from_byte - from_byte_orig;
5524 tail_skip = to_byte_orig - to_byte;
5525 total_skip = head_skip + tail_skip;
5526 from += head_skip;
5527 to -= tail_skip;
5528 len -= total_skip; len_byte -= total_skip;
5529 }
d46c5b12 5530
8ca3766a 5531 /* For conversion, we must put the gap before the text in addition to
fb88bf2d
KH
5532 making the gap larger for efficient decoding. The required gap
5533 size starts from 2000 which is the magic number used in make_gap.
5534 But, after one batch of conversion, it will be incremented if we
5535 find that it is not enough . */
d46c5b12
KH
5536 require = 2000;
5537
5538 if (GAP_SIZE < require)
5539 make_gap (require - GAP_SIZE);
5540 move_gap_both (from, from_byte);
5541
d46c5b12 5542 inserted = inserted_byte = 0;
fb88bf2d
KH
5543
5544 GAP_SIZE += len_byte;
5545 ZV -= len;
5546 Z -= len;
5547 ZV_BYTE -= len_byte;
5548 Z_BYTE -= len_byte;
5549
d9f9a1bc
GM
5550 if (GPT - BEG < BEG_UNCHANGED)
5551 BEG_UNCHANGED = GPT - BEG;
5552 if (Z - GPT < END_UNCHANGED)
5553 END_UNCHANGED = Z - GPT;
f2558efd 5554
b73bfc1c
KH
5555 if (!encodep && coding->src_multibyte)
5556 {
5557 /* Decoding routines expects that the source text is unibyte.
5558 We must convert 8-bit characters of multibyte form to
5559 unibyte. */
5560 int len_byte_orig = len_byte;
5561 len_byte = str_as_unibyte (GAP_END_ADDR - len_byte, len_byte);
5562 if (len_byte < len_byte_orig)
5563 safe_bcopy (GAP_END_ADDR - len_byte_orig, GAP_END_ADDR - len_byte,
5564 len_byte);
5565 coding->src_multibyte = 0;
5566 }
5567
d46c5b12
KH
5568 for (;;)
5569 {
fb88bf2d 5570 int result;
d46c5b12 5571
ec6d2bb8 5572 /* The buffer memory is now:
b73bfc1c
KH
5573 +--------+converted-text+---------+-------original-text-------+---+
5574 |<-from->|<--inserted-->|---------|<--------len_byte--------->|---|
5575 |<---------------------- GAP ----------------------->| */
ec6d2bb8
KH
5576 src = GAP_END_ADDR - len_byte;
5577 dst = GPT_ADDR + inserted_byte;
5578
d46c5b12 5579 if (encodep)
fb88bf2d 5580 result = encode_coding (coding, src, dst, len_byte, 0);
d46c5b12 5581 else
0e79d667
RS
5582 {
5583 if (coding->composing != COMPOSITION_DISABLED)
5584 coding->cmp_data->char_offset = from + inserted;
5585 result = decode_coding (coding, src, dst, len_byte, 0);
5586 }
ec6d2bb8
KH
5587
5588 /* The buffer memory is now:
b73bfc1c
KH
5589 +--------+-------converted-text----+--+------original-text----+---+
5590 |<-from->|<-inserted->|<-produced->|--|<-(len_byte-consumed)->|---|
5591 |<---------------------- GAP ----------------------->| */
ec6d2bb8 5592
d46c5b12
KH
5593 inserted += coding->produced_char;
5594 inserted_byte += coding->produced;
d46c5b12 5595 len_byte -= coding->consumed;
ec6d2bb8
KH
5596
5597 if (result == CODING_FINISH_INSUFFICIENT_CMP)
5598 {
5599 coding_allocate_composition_data (coding, from + inserted);
5600 continue;
5601 }
5602
fb88bf2d 5603 src += coding->consumed;
3636f7a3 5604 dst += coding->produced;
d46c5b12 5605
9864ebce
KH
5606 if (result == CODING_FINISH_NORMAL)
5607 {
5608 src += len_byte;
5609 break;
5610 }
d46c5b12
KH
5611 if (! encodep && result == CODING_FINISH_INCONSISTENT_EOL)
5612 {
fb88bf2d 5613 unsigned char *pend = dst, *p = pend - inserted_byte;
38edf7d4 5614 Lisp_Object eol_type;
d46c5b12
KH
5615
5616 /* Encode LFs back to the original eol format (CR or CRLF). */
5617 if (coding->eol_type == CODING_EOL_CR)
5618 {
5619 while (p < pend) if (*p++ == '\n') p[-1] = '\r';
5620 }
5621 else
5622 {
d46c5b12
KH
5623 int count = 0;
5624
fb88bf2d
KH
5625 while (p < pend) if (*p++ == '\n') count++;
5626 if (src - dst < count)
d46c5b12 5627 {
38edf7d4 5628 /* We don't have sufficient room for encoding LFs
fb88bf2d
KH
5629 back to CRLF. We must record converted and
5630 not-yet-converted text back to the buffer
5631 content, enlarge the gap, then record them out of
5632 the buffer contents again. */
5633 int add = len_byte + inserted_byte;
5634
5635 GAP_SIZE -= add;
5636 ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
5637 GPT += inserted_byte; GPT_BYTE += inserted_byte;
5638 make_gap (count - GAP_SIZE);
5639 GAP_SIZE += add;
5640 ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
5641 GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5642 /* Don't forget to update SRC, DST, and PEND. */
5643 src = GAP_END_ADDR - len_byte;
5644 dst = GPT_ADDR + inserted_byte;
5645 pend = dst;
d46c5b12 5646 }
d46c5b12
KH
5647 inserted += count;
5648 inserted_byte += count;
fb88bf2d
KH
5649 coding->produced += count;
5650 p = dst = pend + count;
5651 while (count)
5652 {
5653 *--p = *--pend;
5654 if (*p == '\n') count--, *--p = '\r';
5655 }
d46c5b12
KH
5656 }
5657
5658 /* Suppress eol-format conversion in the further conversion. */
5659 coding->eol_type = CODING_EOL_LF;
5660
38edf7d4
KH
5661 /* Set the coding system symbol to that for Unix-like EOL. */
5662 eol_type = Fget (saved_coding_symbol, Qeol_type);
5663 if (VECTORP (eol_type)
5664 && XVECTOR (eol_type)->size == 3
5665 && SYMBOLP (XVECTOR (eol_type)->contents[CODING_EOL_LF]))
5666 coding->symbol = XVECTOR (eol_type)->contents[CODING_EOL_LF];
5667 else
5668 coding->symbol = saved_coding_symbol;
93dec019 5669
fb88bf2d 5670 continue;
d46c5b12
KH
5671 }
5672 if (len_byte <= 0)
944bd420
KH
5673 {
5674 if (coding->type != coding_type_ccl
5675 || coding->mode & CODING_MODE_LAST_BLOCK)
5676 break;
5677 coding->mode |= CODING_MODE_LAST_BLOCK;
5678 continue;
5679 }
d46c5b12
KH
5680 if (result == CODING_FINISH_INSUFFICIENT_SRC)
5681 {
5682 /* The source text ends in invalid codes. Let's just
5683 make them valid buffer contents, and finish conversion. */
70ad9fc4
GM
5684 if (multibyte_p)
5685 {
5686 unsigned char *start = dst;
93dec019 5687
70ad9fc4
GM
5688 inserted += len_byte;
5689 while (len_byte--)
5690 {
5691 int c = *src++;
5692 dst += CHAR_STRING (c, dst);
5693 }
5694
5695 inserted_byte += dst - start;
5696 }
5697 else
5698 {
5699 inserted += len_byte;
5700 inserted_byte += len_byte;
5701 while (len_byte--)
5702 *dst++ = *src++;
5703 }
d46c5b12
KH
5704 break;
5705 }
9864ebce
KH
5706 if (result == CODING_FINISH_INTERRUPT)
5707 {
5708 /* The conversion procedure was interrupted by a user. */
9864ebce
KH
5709 break;
5710 }
5711 /* Now RESULT == CODING_FINISH_INSUFFICIENT_DST */
5712 if (coding->consumed < 1)
5713 {
5714 /* It's quite strange to require more memory without
5715 consuming any bytes. Perhaps CCL program bug. */
9864ebce
KH
5716 break;
5717 }
fb88bf2d
KH
5718 if (first)
5719 {
5720 /* We have just done the first batch of conversion which was
8ca3766a 5721 stopped because of insufficient gap. Let's reconsider the
fb88bf2d
KH
5722 required gap size (i.e. SRT - DST) now.
5723
5724 We have converted ORIG bytes (== coding->consumed) into
5725 NEW bytes (coding->produced). To convert the remaining
5726 LEN bytes, we may need REQUIRE bytes of gap, where:
5727 REQUIRE + LEN_BYTE = LEN_BYTE * (NEW / ORIG)
5728 REQUIRE = LEN_BYTE * (NEW - ORIG) / ORIG
5729 Here, we are sure that NEW >= ORIG. */
b3385c28
KH
5730 float ratio;
5731
5732 if (coding->produced <= coding->consumed)
5733 {
5734 /* This happens because of CCL-based coding system with
5735 eol-type CRLF. */
5736 require = 0;
5737 }
5738 else
5739 {
5740 ratio = (coding->produced - coding->consumed) / coding->consumed;
5741 require = len_byte * ratio;
5742 }
fb88bf2d
KH
5743 first = 0;
5744 }
5745 if ((src - dst) < (require + 2000))
5746 {
5747 /* See the comment above the previous call of make_gap. */
5748 int add = len_byte + inserted_byte;
5749
5750 GAP_SIZE -= add;
5751 ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
5752 GPT += inserted_byte; GPT_BYTE += inserted_byte;
5753 make_gap (require + 2000);
5754 GAP_SIZE += add;
5755 ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
5756 GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
fb88bf2d 5757 }
d46c5b12 5758 }
fb88bf2d
KH
5759 if (src - dst > 0) *dst = 0; /* Put an anchor. */
5760
b73bfc1c
KH
5761 if (encodep && coding->dst_multibyte)
5762 {
5763 /* The output is unibyte. We must convert 8-bit characters to
5764 multibyte form. */
5765 if (inserted_byte * 2 > GAP_SIZE)
5766 {
5767 GAP_SIZE -= inserted_byte;
5768 ZV += inserted_byte; Z += inserted_byte;
5769 ZV_BYTE += inserted_byte; Z_BYTE += inserted_byte;
5770 GPT += inserted_byte; GPT_BYTE += inserted_byte;
5771 make_gap (inserted_byte - GAP_SIZE);
5772 GAP_SIZE += inserted_byte;
5773 ZV -= inserted_byte; Z -= inserted_byte;
5774 ZV_BYTE -= inserted_byte; Z_BYTE -= inserted_byte;
5775 GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5776 }
5777 inserted_byte = str_to_multibyte (GPT_ADDR, GAP_SIZE, inserted_byte);
5778 }
7553d0e1 5779
93dec019 5780 /* If we shrank the conversion area, adjust it now. */
12410ef1
KH
5781 if (total_skip > 0)
5782 {
5783 if (tail_skip > 0)
5784 safe_bcopy (GAP_END_ADDR, GPT_ADDR + inserted_byte, tail_skip);
5785 inserted += total_skip; inserted_byte += total_skip;
5786 GAP_SIZE += total_skip;
5787 GPT -= head_skip; GPT_BYTE -= head_skip;
5788 ZV -= total_skip; ZV_BYTE -= total_skip;
5789 Z -= total_skip; Z_BYTE -= total_skip;
5790 from -= head_skip; from_byte -= head_skip;
5791 to += tail_skip; to_byte += tail_skip;
5792 }
5793
6abb9bd9 5794 prev_Z = Z;
72d1a715
RS
5795 if (! EQ (current_buffer->undo_list, Qt))
5796 adjust_after_replace (from, from_byte, deletion, inserted, inserted_byte);
5797 else
5798 adjust_after_replace_noundo (from, from_byte, nchars_del, nbytes_del,
5799 inserted, inserted_byte);
6abb9bd9 5800 inserted = Z - prev_Z;
4ed46869 5801
ec6d2bb8
KH
5802 if (!encodep && coding->cmp_data && coding->cmp_data->used)
5803 coding_restore_composition (coding, Fcurrent_buffer ());
5804 coding_free_composition_data (coding);
5805
b73bfc1c
KH
5806 if (! inhibit_pre_post_conversion
5807 && ! encodep && ! NILP (coding->post_read_conversion))
d46c5b12 5808 {
2b4f9037 5809 Lisp_Object val;
4ed46869 5810
e133c8fa
KH
5811 if (from != PT)
5812 TEMP_SET_PT_BOTH (from, from_byte);
6abb9bd9 5813 prev_Z = Z;
b843d1ae
KH
5814 record_unwind_protect (code_convert_region_unwind, Qnil);
5815 /* We should not call any more pre-write/post-read-conversion
5816 functions while this post-read-conversion is running. */
5817 inhibit_pre_post_conversion = 1;
2b4f9037 5818 val = call1 (coding->post_read_conversion, make_number (inserted));
b843d1ae
KH
5819 inhibit_pre_post_conversion = 0;
5820 /* Discard the unwind protect. */
5821 specpdl_ptr--;
b7826503 5822 CHECK_NUMBER (val);
944bd420 5823 inserted += Z - prev_Z;
e133c8fa
KH
5824 }
5825
5826 if (orig_point >= from)
5827 {
5828 if (orig_point >= from + orig_len)
5829 orig_point += inserted - orig_len;
5830 else
5831 orig_point = from;
5832 TEMP_SET_PT (orig_point);
d46c5b12 5833 }
4ed46869 5834
ec6d2bb8
KH
5835 if (replace)
5836 {
5837 signal_after_change (from, to - from, inserted);
e19539f1 5838 update_compositions (from, from + inserted, CHECK_BORDER);
ec6d2bb8 5839 }
2b4f9037 5840
fb88bf2d 5841 {
12410ef1
KH
5842 coding->consumed = to_byte - from_byte;
5843 coding->consumed_char = to - from;
5844 coding->produced = inserted_byte;
5845 coding->produced_char = inserted;
fb88bf2d 5846 }
7553d0e1 5847
fb88bf2d 5848 return 0;
d46c5b12
KH
5849}
5850
5851Lisp_Object
b73bfc1c
KH
5852run_pre_post_conversion_on_str (str, coding, encodep)
5853 Lisp_Object str;
5854 struct coding_system *coding;
5855 int encodep;
5856{
aed13378 5857 int count = SPECPDL_INDEX ();
cf3b32fc 5858 struct gcpro gcpro1, gcpro2;
b73bfc1c 5859 int multibyte = STRING_MULTIBYTE (str);
3fd9494b
RS
5860 Lisp_Object buffer;
5861 struct buffer *buf;
cf3b32fc 5862 Lisp_Object old_deactivate_mark;
b73bfc1c
KH
5863
5864 record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
5865 record_unwind_protect (code_convert_region_unwind, Qnil);
cf3b32fc
RS
5866 /* It is not crucial to specbind this. */
5867 old_deactivate_mark = Vdeactivate_mark;
5868 GCPRO2 (str, old_deactivate_mark);
3fd9494b
RS
5869
5870 buffer = Fget_buffer_create (build_string (" *code-converting-work*"));
5871 buf = XBUFFER (buffer);
5872
5873 buf->directory = current_buffer->directory;
5874 buf->read_only = Qnil;
5875 buf->filename = Qnil;
5876 buf->undo_list = Qt;
5877 buf->overlays_before = Qnil;
5878 buf->overlays_after = Qnil;
5879
5880 set_buffer_internal (buf);
b73bfc1c
KH
5881 /* We must insert the contents of STR as is without
5882 unibyte<->multibyte conversion. For that, we adjust the
5883 multibyteness of the working buffer to that of STR. */
5884 Ferase_buffer ();
3fd9494b
RS
5885 buf->enable_multibyte_characters = multibyte ? Qt : Qnil;
5886
b73bfc1c 5887 insert_from_string (str, 0, 0,
d5db4077 5888 SCHARS (str), SBYTES (str), 0);
b73bfc1c
KH
5889 UNGCPRO;
5890 inhibit_pre_post_conversion = 1;
5891 if (encodep)
5892 call2 (coding->pre_write_conversion, make_number (BEG), make_number (Z));
5893 else
6bac5b12
KH
5894 {
5895 TEMP_SET_PT_BOTH (BEG, BEG_BYTE);
5896 call1 (coding->post_read_conversion, make_number (Z - BEG));
5897 }
b73bfc1c 5898 inhibit_pre_post_conversion = 0;
cf3b32fc 5899 Vdeactivate_mark = old_deactivate_mark;
78108bcd 5900 str = make_buffer_string (BEG, Z, 1);
b73bfc1c
KH
5901 return unbind_to (count, str);
5902}
5903
5904Lisp_Object
5905decode_coding_string (str, coding, nocopy)
d46c5b12 5906 Lisp_Object str;
4ed46869 5907 struct coding_system *coding;
b73bfc1c 5908 int nocopy;
4ed46869 5909{
d46c5b12 5910 int len;
73be902c 5911 struct conversion_buffer buf;
da55a2b7 5912 int from, to_byte;
84d60297 5913 Lisp_Object saved_coding_symbol;
d46c5b12 5914 int result;
78108bcd 5915 int require_decoding;
73be902c
KH
5916 int shrinked_bytes = 0;
5917 Lisp_Object newstr;
2391eaa4 5918 int consumed, consumed_char, produced, produced_char;
4ed46869 5919
b73bfc1c 5920 from = 0;
d5db4077 5921 to_byte = SBYTES (str);
4ed46869 5922
8844fa83 5923 saved_coding_symbol = coding->symbol;
764ca8da
KH
5924 coding->src_multibyte = STRING_MULTIBYTE (str);
5925 coding->dst_multibyte = 1;
b73bfc1c 5926 if (CODING_REQUIRE_DETECTION (coding))
d46c5b12
KH
5927 {
5928 /* See the comments in code_convert_region. */
5929 if (coding->type == coding_type_undecided)
5930 {
d5db4077 5931 detect_coding (coding, SDATA (str), to_byte);
d46c5b12 5932 if (coding->type == coding_type_undecided)
d280ccb6
KH
5933 {
5934 coding->type = coding_type_emacs_mule;
5935 coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE;
5936 /* As emacs-mule decoder will handle composition, we
5937 need this setting to allocate coding->cmp_data
5938 later. */
5939 coding->composing = COMPOSITION_NO;
5940 }
d46c5b12 5941 }
aaaf0b1e
KH
5942 if (coding->eol_type == CODING_EOL_UNDECIDED
5943 && coding->type != coding_type_ccl)
d46c5b12
KH
5944 {
5945 saved_coding_symbol = coding->symbol;
d5db4077 5946 detect_eol (coding, SDATA (str), to_byte);
d46c5b12
KH
5947 if (coding->eol_type == CODING_EOL_UNDECIDED)
5948 coding->eol_type = CODING_EOL_LF;
5949 /* We had better recover the original eol format if we
8ca3766a 5950 encounter an inconsistent eol format while decoding. */
d46c5b12
KH
5951 coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
5952 }
5953 }
4ed46869 5954
764ca8da
KH
5955 if (coding->type == coding_type_no_conversion
5956 || coding->type == coding_type_raw_text)
5957 coding->dst_multibyte = 0;
5958
78108bcd 5959 require_decoding = CODING_REQUIRE_DECODING (coding);
ec6d2bb8 5960
b73bfc1c 5961 if (STRING_MULTIBYTE (str))
d46c5b12 5962 {
b73bfc1c
KH
5963 /* Decoding routines expect the source text to be unibyte. */
5964 str = Fstring_as_unibyte (str);
d5db4077 5965 to_byte = SBYTES (str);
b73bfc1c 5966 nocopy = 1;
764ca8da 5967 coding->src_multibyte = 0;
b73bfc1c 5968 }
ec6d2bb8 5969
b73bfc1c 5970 /* Try to skip the heading and tailing ASCIIs. */
78108bcd 5971 if (require_decoding && coding->type != coding_type_ccl)
4956c225 5972 {
d5db4077 5973 SHRINK_CONVERSION_REGION (&from, &to_byte, coding, SDATA (str),
4956c225
KH
5974 0);
5975 if (from == to_byte)
78108bcd 5976 require_decoding = 0;
d5db4077 5977 shrinked_bytes = from + (SBYTES (str) - to_byte);
4956c225 5978 }
b73bfc1c 5979
78108bcd
KH
5980 if (!require_decoding)
5981 {
d5db4077
KR
5982 coding->consumed = SBYTES (str);
5983 coding->consumed_char = SCHARS (str);
78108bcd
KH
5984 if (coding->dst_multibyte)
5985 {
5986 str = Fstring_as_multibyte (str);
5987 nocopy = 1;
5988 }
d5db4077
KR
5989 coding->produced = SBYTES (str);
5990 coding->produced_char = SCHARS (str);
78108bcd
KH
5991 return (nocopy ? str : Fcopy_sequence (str));
5992 }
5993
5994 if (coding->composing != COMPOSITION_DISABLED)
5995 coding_allocate_composition_data (coding, from);
b73bfc1c 5996 len = decoding_buffer_size (coding, to_byte - from);
73be902c 5997 allocate_conversion_buffer (buf, len);
4ed46869 5998
2391eaa4 5999 consumed = consumed_char = produced = produced_char = 0;
73be902c 6000 while (1)
4ed46869 6001 {
d5db4077 6002 result = decode_coding (coding, SDATA (str) + from + consumed,
73be902c
KH
6003 buf.data + produced, to_byte - from - consumed,
6004 buf.size - produced);
6005 consumed += coding->consumed;
2391eaa4 6006 consumed_char += coding->consumed_char;
73be902c
KH
6007 produced += coding->produced;
6008 produced_char += coding->produced_char;
2391eaa4
KH
6009 if (result == CODING_FINISH_NORMAL
6010 || (result == CODING_FINISH_INSUFFICIENT_SRC
6011 && coding->consumed == 0))
73be902c
KH
6012 break;
6013 if (result == CODING_FINISH_INSUFFICIENT_CMP)
6014 coding_allocate_composition_data (coding, from + produced_char);
6015 else if (result == CODING_FINISH_INSUFFICIENT_DST)
6016 extend_conversion_buffer (&buf);
6017 else if (result == CODING_FINISH_INCONSISTENT_EOL)
6018 {
8844fa83
KH
6019 Lisp_Object eol_type;
6020
73be902c
KH
6021 /* Recover the original EOL format. */
6022 if (coding->eol_type == CODING_EOL_CR)
6023 {
6024 unsigned char *p;
6025 for (p = buf.data; p < buf.data + produced; p++)
6026 if (*p == '\n') *p = '\r';
6027 }
6028 else if (coding->eol_type == CODING_EOL_CRLF)
6029 {
6030 int num_eol = 0;
6031 unsigned char *p0, *p1;
6032 for (p0 = buf.data, p1 = p0 + produced; p0 < p1; p0++)
6033 if (*p0 == '\n') num_eol++;
6034 if (produced + num_eol >= buf.size)
6035 extend_conversion_buffer (&buf);
6036 for (p0 = buf.data + produced, p1 = p0 + num_eol; p0 > buf.data;)
6037 {
6038 *--p1 = *--p0;
6039 if (*p0 == '\n') *--p1 = '\r';
6040 }
6041 produced += num_eol;
6042 produced_char += num_eol;
93dec019 6043 }
8844fa83 6044 /* Suppress eol-format conversion in the further conversion. */
73be902c 6045 coding->eol_type = CODING_EOL_LF;
8844fa83
KH
6046
6047 /* Set the coding system symbol to that for Unix-like EOL. */
6048 eol_type = Fget (saved_coding_symbol, Qeol_type);
6049 if (VECTORP (eol_type)
6050 && XVECTOR (eol_type)->size == 3
6051 && SYMBOLP (XVECTOR (eol_type)->contents[CODING_EOL_LF]))
6052 coding->symbol = XVECTOR (eol_type)->contents[CODING_EOL_LF];
6053 else
6054 coding->symbol = saved_coding_symbol;
6055
6056
73be902c 6057 }
4ed46869 6058 }
d46c5b12 6059
2391eaa4
KH
6060 coding->consumed = consumed;
6061 coding->consumed_char = consumed_char;
6062 coding->produced = produced;
6063 coding->produced_char = produced_char;
6064
78108bcd 6065 if (coding->dst_multibyte)
73be902c
KH
6066 newstr = make_uninit_multibyte_string (produced_char + shrinked_bytes,
6067 produced + shrinked_bytes);
78108bcd 6068 else
73be902c
KH
6069 newstr = make_uninit_string (produced + shrinked_bytes);
6070 if (from > 0)
a4244313
KR
6071 STRING_COPYIN (newstr, 0, SDATA (str), from);
6072 STRING_COPYIN (newstr, from, buf.data, produced);
73be902c 6073 if (shrinked_bytes > from)
a4244313
KR
6074 STRING_COPYIN (newstr, from + produced,
6075 SDATA (str) + to_byte,
6076 shrinked_bytes - from);
73be902c 6077 free_conversion_buffer (&buf);
b73bfc1c
KH
6078
6079 if (coding->cmp_data && coding->cmp_data->used)
73be902c 6080 coding_restore_composition (coding, newstr);
b73bfc1c
KH
6081 coding_free_composition_data (coding);
6082
6083 if (SYMBOLP (coding->post_read_conversion)
6084 && !NILP (Ffboundp (coding->post_read_conversion)))
73be902c 6085 newstr = run_pre_post_conversion_on_str (newstr, coding, 0);
b73bfc1c 6086
73be902c 6087 return newstr;
b73bfc1c
KH
6088}
6089
6090Lisp_Object
6091encode_coding_string (str, coding, nocopy)
6092 Lisp_Object str;
6093 struct coding_system *coding;
6094 int nocopy;
6095{
6096 int len;
73be902c 6097 struct conversion_buffer buf;
b73bfc1c 6098 int from, to, to_byte;
b73bfc1c 6099 int result;
73be902c
KH
6100 int shrinked_bytes = 0;
6101 Lisp_Object newstr;
2391eaa4 6102 int consumed, consumed_char, produced, produced_char;
b73bfc1c
KH
6103
6104 if (SYMBOLP (coding->pre_write_conversion)
6105 && !NILP (Ffboundp (coding->pre_write_conversion)))
6bac5b12 6106 str = run_pre_post_conversion_on_str (str, coding, 1);
b73bfc1c
KH
6107
6108 from = 0;
d5db4077
KR
6109 to = SCHARS (str);
6110 to_byte = SBYTES (str);
b73bfc1c 6111
e2c06b17
KH
6112 /* Encoding routines determine the multibyteness of the source text
6113 by coding->src_multibyte. */
6114 coding->src_multibyte = STRING_MULTIBYTE (str);
6115 coding->dst_multibyte = 0;
b73bfc1c 6116 if (! CODING_REQUIRE_ENCODING (coding))
826bfb8b 6117 {
d5db4077
KR
6118 coding->consumed = SBYTES (str);
6119 coding->consumed_char = SCHARS (str);
b73bfc1c
KH
6120 if (STRING_MULTIBYTE (str))
6121 {
6122 str = Fstring_as_unibyte (str);
6123 nocopy = 1;
6124 }
d5db4077
KR
6125 coding->produced = SBYTES (str);
6126 coding->produced_char = SCHARS (str);
b73bfc1c 6127 return (nocopy ? str : Fcopy_sequence (str));
826bfb8b
KH
6128 }
6129
b73bfc1c
KH
6130 if (coding->composing != COMPOSITION_DISABLED)
6131 coding_save_composition (coding, from, to, str);
ec6d2bb8 6132
b73bfc1c 6133 /* Try to skip the heading and tailing ASCIIs. */
4956c225
KH
6134 if (coding->type != coding_type_ccl)
6135 {
d5db4077 6136 SHRINK_CONVERSION_REGION (&from, &to_byte, coding, SDATA (str),
4956c225
KH
6137 1);
6138 if (from == to_byte)
6139 return (nocopy ? str : Fcopy_sequence (str));
d5db4077 6140 shrinked_bytes = from + (SBYTES (str) - to_byte);
4956c225 6141 }
b73bfc1c
KH
6142
6143 len = encoding_buffer_size (coding, to_byte - from);
73be902c
KH
6144 allocate_conversion_buffer (buf, len);
6145
2391eaa4 6146 consumed = consumed_char = produced = produced_char = 0;
73be902c
KH
6147 while (1)
6148 {
d5db4077 6149 result = encode_coding (coding, SDATA (str) + from + consumed,
73be902c
KH
6150 buf.data + produced, to_byte - from - consumed,
6151 buf.size - produced);
6152 consumed += coding->consumed;
2391eaa4 6153 consumed_char += coding->consumed_char;
13004bef 6154 produced += coding->produced;
2391eaa4
KH
6155 produced_char += coding->produced_char;
6156 if (result == CODING_FINISH_NORMAL
6157 || (result == CODING_FINISH_INSUFFICIENT_SRC
6158 && coding->consumed == 0))
73be902c
KH
6159 break;
6160 /* Now result should be CODING_FINISH_INSUFFICIENT_DST. */
6161 extend_conversion_buffer (&buf);
6162 }
6163
2391eaa4
KH
6164 coding->consumed = consumed;
6165 coding->consumed_char = consumed_char;
6166 coding->produced = produced;
6167 coding->produced_char = produced_char;
6168
73be902c 6169 newstr = make_uninit_string (produced + shrinked_bytes);
b73bfc1c 6170 if (from > 0)
a4244313
KR
6171 STRING_COPYIN (newstr, 0, SDATA (str), from);
6172 STRING_COPYIN (newstr, from, buf.data, produced);
73be902c 6173 if (shrinked_bytes > from)
a4244313
KR
6174 STRING_COPYIN (newstr, from + produced,
6175 SDATA (str) + to_byte,
6176 shrinked_bytes - from);
73be902c
KH
6177
6178 free_conversion_buffer (&buf);
ec6d2bb8 6179 coding_free_composition_data (coding);
b73bfc1c 6180
73be902c 6181 return newstr;
4ed46869
KH
6182}
6183
6184\f
6185#ifdef emacs
1397dc18 6186/*** 8. Emacs Lisp library functions ***/
4ed46869 6187
4ed46869 6188DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
48b0f3ae
PJ
6189 doc: /* Return t if OBJECT is nil or a coding-system.
6190See the documentation of `make-coding-system' for information
6191about coding-system objects. */)
6192 (obj)
4ed46869
KH
6193 Lisp_Object obj;
6194{
4608c386
KH
6195 if (NILP (obj))
6196 return Qt;
6197 if (!SYMBOLP (obj))
6198 return Qnil;
6199 /* Get coding-spec vector for OBJ. */
6200 obj = Fget (obj, Qcoding_system);
6201 return ((VECTORP (obj) && XVECTOR (obj)->size == 5)
6202 ? Qt : Qnil);
4ed46869
KH
6203}
6204
9d991de8
RS
6205DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
6206 Sread_non_nil_coding_system, 1, 1, 0,
48b0f3ae
PJ
6207 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT. */)
6208 (prompt)
4ed46869
KH
6209 Lisp_Object prompt;
6210{
e0e989f6 6211 Lisp_Object val;
9d991de8
RS
6212 do
6213 {
4608c386
KH
6214 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6215 Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
9d991de8 6216 }
d5db4077 6217 while (SCHARS (val) == 0);
e0e989f6 6218 return (Fintern (val, Qnil));
4ed46869
KH
6219}
6220
9b787f3e 6221DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
48b0f3ae
PJ
6222 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
6223If the user enters null input, return second argument DEFAULT-CODING-SYSTEM. */)
6224 (prompt, default_coding_system)
9b787f3e 6225 Lisp_Object prompt, default_coding_system;
4ed46869 6226{
f44d27ce 6227 Lisp_Object val;
9b787f3e 6228 if (SYMBOLP (default_coding_system))
57d25e6f 6229 default_coding_system = SYMBOL_NAME (default_coding_system);
4608c386 6230 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
9b787f3e
RS
6231 Qt, Qnil, Qcoding_system_history,
6232 default_coding_system, Qnil);
d5db4077 6233 return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
4ed46869
KH
6234}
6235
6236DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
6237 1, 1, 0,
48b0f3ae
PJ
6238 doc: /* Check validity of CODING-SYSTEM.
6239If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
6240It is valid if it is a symbol with a non-nil `coding-system' property.
6241The value of property should be a vector of length 5. */)
6242 (coding_system)
4ed46869
KH
6243 Lisp_Object coding_system;
6244{
b7826503 6245 CHECK_SYMBOL (coding_system);
4ed46869
KH
6246 if (!NILP (Fcoding_system_p (coding_system)))
6247 return coding_system;
6248 while (1)
02ba4723 6249 Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
4ed46869 6250}
3a73fa5d 6251\f
d46c5b12 6252Lisp_Object
0a28aafb 6253detect_coding_system (src, src_bytes, highest, multibytep)
a4244313 6254 const unsigned char *src;
d46c5b12 6255 int src_bytes, highest;
0a28aafb 6256 int multibytep;
4ed46869
KH
6257{
6258 int coding_mask, eol_type;
d46c5b12
KH
6259 Lisp_Object val, tmp;
6260 int dummy;
4ed46869 6261
0a28aafb 6262 coding_mask = detect_coding_mask (src, src_bytes, NULL, &dummy, multibytep);
d46c5b12
KH
6263 eol_type = detect_eol_type (src, src_bytes, &dummy);
6264 if (eol_type == CODING_EOL_INCONSISTENT)
25b02698 6265 eol_type = CODING_EOL_UNDECIDED;
4ed46869 6266
d46c5b12 6267 if (!coding_mask)
4ed46869 6268 {
27901516 6269 val = Qundecided;
d46c5b12 6270 if (eol_type != CODING_EOL_UNDECIDED)
4ed46869 6271 {
f44d27ce
RS
6272 Lisp_Object val2;
6273 val2 = Fget (Qundecided, Qeol_type);
4ed46869
KH
6274 if (VECTORP (val2))
6275 val = XVECTOR (val2)->contents[eol_type];
6276 }
80e803b4 6277 return (highest ? val : Fcons (val, Qnil));
4ed46869 6278 }
4ed46869 6279
d46c5b12
KH
6280 /* At first, gather possible coding systems in VAL. */
6281 val = Qnil;
fa42c37f 6282 for (tmp = Vcoding_category_list; CONSP (tmp); tmp = XCDR (tmp))
4ed46869 6283 {
fa42c37f
KH
6284 Lisp_Object category_val, category_index;
6285
6286 category_index = Fget (XCAR (tmp), Qcoding_category_index);
6287 category_val = Fsymbol_value (XCAR (tmp));
6288 if (!NILP (category_val)
6289 && NATNUMP (category_index)
6290 && (coding_mask & (1 << XFASTINT (category_index))))
4ed46869 6291 {
fa42c37f 6292 val = Fcons (category_val, val);
d46c5b12
KH
6293 if (highest)
6294 break;
4ed46869
KH
6295 }
6296 }
d46c5b12
KH
6297 if (!highest)
6298 val = Fnreverse (val);
4ed46869 6299
65059037 6300 /* Then, replace the elements with subsidiary coding systems. */
fa42c37f 6301 for (tmp = val; CONSP (tmp); tmp = XCDR (tmp))
4ed46869 6302 {
65059037
RS
6303 if (eol_type != CODING_EOL_UNDECIDED
6304 && eol_type != CODING_EOL_INCONSISTENT)
4ed46869 6305 {
d46c5b12 6306 Lisp_Object eol;
03699b14 6307 eol = Fget (XCAR (tmp), Qeol_type);
d46c5b12 6308 if (VECTORP (eol))
f3fbd155 6309 XSETCAR (tmp, XVECTOR (eol)->contents[eol_type]);
4ed46869
KH
6310 }
6311 }
03699b14 6312 return (highest ? XCAR (val) : val);
93dec019 6313}
4ed46869 6314
d46c5b12
KH
6315DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
6316 2, 3, 0,
48b0f3ae
PJ
6317 doc: /* Detect coding system of the text in the region between START and END.
6318Return a list of possible coding systems ordered by priority.
6319
6320If only ASCII characters are found, it returns a list of single element
6321`undecided' or its subsidiary coding system according to a detected
6322end-of-line format.
6323
6324If optional argument HIGHEST is non-nil, return the coding system of
6325highest priority. */)
6326 (start, end, highest)
d46c5b12
KH
6327 Lisp_Object start, end, highest;
6328{
6329 int from, to;
6330 int from_byte, to_byte;
682169fe 6331 int include_anchor_byte = 0;
6289dd10 6332
b7826503
PJ
6333 CHECK_NUMBER_COERCE_MARKER (start);
6334 CHECK_NUMBER_COERCE_MARKER (end);
4ed46869 6335
d46c5b12
KH
6336 validate_region (&start, &end);
6337 from = XINT (start), to = XINT (end);
6338 from_byte = CHAR_TO_BYTE (from);
6339 to_byte = CHAR_TO_BYTE (to);
6289dd10 6340
d46c5b12
KH
6341 if (from < GPT && to >= GPT)
6342 move_gap_both (to, to_byte);
c210f766
KH
6343 /* If we an anchor byte `\0' follows the region, we include it in
6344 the detecting source. Then code detectors can handle the tailing
6345 byte sequence more accurately.
6346
7d0393cf 6347 Fix me: This is not a perfect solution. It is better that we
c210f766
KH
6348 add one more argument, say LAST_BLOCK, to all detect_coding_XXX.
6349 */
682169fe
KH
6350 if (to == Z || (to == GPT && GAP_SIZE > 0))
6351 include_anchor_byte = 1;
d46c5b12 6352 return detect_coding_system (BYTE_POS_ADDR (from_byte),
682169fe 6353 to_byte - from_byte + include_anchor_byte,
0a28aafb
KH
6354 !NILP (highest),
6355 !NILP (current_buffer
6356 ->enable_multibyte_characters));
d46c5b12 6357}
6289dd10 6358
d46c5b12
KH
6359DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
6360 1, 2, 0,
48b0f3ae
PJ
6361 doc: /* Detect coding system of the text in STRING.
6362Return a list of possible coding systems ordered by priority.
6363
6364If only ASCII characters are found, it returns a list of single element
6365`undecided' or its subsidiary coding system according to a detected
6366end-of-line format.
6367
6368If optional argument HIGHEST is non-nil, return the coding system of
6369highest priority. */)
6370 (string, highest)
d46c5b12
KH
6371 Lisp_Object string, highest;
6372{
b7826503 6373 CHECK_STRING (string);
4ed46869 6374
d5db4077 6375 return detect_coding_system (SDATA (string),
682169fe
KH
6376 /* "+ 1" is to include the anchor byte
6377 `\0'. With this, code detectors can
c210f766
KH
6378 handle the tailing bytes more
6379 accurately. */
d5db4077 6380 SBYTES (string) + 1,
0a28aafb
KH
6381 !NILP (highest),
6382 STRING_MULTIBYTE (string));
4ed46869
KH
6383}
6384
05e6f5dc
KH
6385/* Return an intersection of lists L1 and L2. */
6386
6387static Lisp_Object
6388intersection (l1, l2)
6389 Lisp_Object l1, l2;
6390{
eef762fc 6391 Lisp_Object val = Fcons (Qnil, Qnil), tail;
05e6f5dc 6392
eef762fc 6393 for (tail = val; CONSP (l1); l1 = XCDR (l1))
05e6f5dc
KH
6394 {
6395 if (!NILP (Fmemq (XCAR (l1), l2)))
eef762fc
AS
6396 {
6397 XSETCDR (tail, Fcons (XCAR (l1), Qnil));
6398 tail = XCDR (tail);
6399 }
05e6f5dc 6400 }
eef762fc 6401 return XCDR (val);
05e6f5dc
KH
6402}
6403
6404
6405/* Subroutine for Fsafe_coding_systems_region_internal.
6406
6407 Return a list of coding systems that safely encode the multibyte
6408 text between P and PEND. SAFE_CODINGS, if non-nil, is a list of
6409 possible coding systems. If it is nil, it means that we have not
6410 yet found any coding systems.
6411
6412 WORK_TABLE is a copy of the char-table Vchar_coding_system_table. An
6413 element of WORK_TABLE is set to t once the element is looked up.
6414
6415 If a non-ASCII single byte char is found, set
6416 *single_byte_char_found to 1. */
6417
6418static Lisp_Object
6419find_safe_codings (p, pend, safe_codings, work_table, single_byte_char_found)
6420 unsigned char *p, *pend;
6421 Lisp_Object safe_codings, work_table;
6422 int *single_byte_char_found;
6423{
6424 int c, len, idx;
6425 Lisp_Object val;
6426
6427 while (p < pend)
6428 {
6429 c = STRING_CHAR_AND_LENGTH (p, pend - p, len);
6430 p += len;
6431 if (ASCII_BYTE_P (c))
6432 /* We can ignore ASCII characters here. */
6433 continue;
6434 if (SINGLE_BYTE_CHAR_P (c))
6435 *single_byte_char_found = 1;
6436 if (NILP (safe_codings))
6437 continue;
6438 /* Check the safe coding systems for C. */
6439 val = char_table_ref_and_index (work_table, c, &idx);
6440 if (EQ (val, Qt))
6441 /* This element was already checked. Ignore it. */
6442 continue;
6443 /* Remember that we checked this element. */
975f250a 6444 CHAR_TABLE_SET (work_table, make_number (idx), Qt);
05e6f5dc
KH
6445
6446 /* If there are some safe coding systems for C and we have
6447 already found the other set of coding systems for the
6448 different characters, get the intersection of them. */
6449 if (!EQ (safe_codings, Qt) && !NILP (val))
6450 val = intersection (safe_codings, val);
6451 safe_codings = val;
6452 }
6453 return safe_codings;
6454}
6455
6456
6457/* Return a list of coding systems that safely encode the text between
6458 START and END. If the text contains only ASCII or is unibyte,
6459 return t. */
6460
6461DEFUN ("find-coding-systems-region-internal",
6462 Ffind_coding_systems_region_internal,
6463 Sfind_coding_systems_region_internal, 2, 2, 0,
48b0f3ae
PJ
6464 doc: /* Internal use only. */)
6465 (start, end)
05e6f5dc
KH
6466 Lisp_Object start, end;
6467{
6468 Lisp_Object work_table, safe_codings;
6469 int non_ascii_p = 0;
6470 int single_byte_char_found = 0;
a90f2c35 6471 const unsigned char *p1, *p1end, *p2, *p2end, *p;
05e6f5dc
KH
6472
6473 if (STRINGP (start))
6474 {
6475 if (!STRING_MULTIBYTE (start))
6476 return Qt;
d5db4077 6477 p1 = SDATA (start), p1end = p1 + SBYTES (start);
05e6f5dc 6478 p2 = p2end = p1end;
d5db4077 6479 if (SCHARS (start) != SBYTES (start))
05e6f5dc
KH
6480 non_ascii_p = 1;
6481 }
6482 else
6483 {
6484 int from, to, stop;
6485
b7826503
PJ
6486 CHECK_NUMBER_COERCE_MARKER (start);
6487 CHECK_NUMBER_COERCE_MARKER (end);
05e6f5dc
KH
6488 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
6489 args_out_of_range (start, end);
6490 if (NILP (current_buffer->enable_multibyte_characters))
6491 return Qt;
6492 from = CHAR_TO_BYTE (XINT (start));
6493 to = CHAR_TO_BYTE (XINT (end));
6494 stop = from < GPT_BYTE && GPT_BYTE < to ? GPT_BYTE : to;
6495 p1 = BYTE_POS_ADDR (from), p1end = p1 + (stop - from);
6496 if (stop == to)
6497 p2 = p2end = p1end;
6498 else
6499 p2 = BYTE_POS_ADDR (stop), p2end = p2 + (to - stop);
6500 if (XINT (end) - XINT (start) != to - from)
6501 non_ascii_p = 1;
6502 }
6503
6504 if (!non_ascii_p)
6505 {
6506 /* We are sure that the text contains no multibyte character.
6507 Check if it contains eight-bit-graphic. */
6508 p = p1;
6509 for (p = p1; p < p1end && ASCII_BYTE_P (*p); p++);
6510 if (p == p1end)
6511 {
93dec019 6512 for (p = p2; p < p2end && ASCII_BYTE_P (*p); p++);
05e6f5dc
KH
6513 if (p == p2end)
6514 return Qt;
6515 }
6516 }
6517
6518 /* The text contains non-ASCII characters. */
6519 work_table = Fcopy_sequence (Vchar_coding_system_table);
6520 safe_codings = find_safe_codings (p1, p1end, Qt, work_table,
6521 &single_byte_char_found);
6522 if (p2 < p2end)
6523 safe_codings = find_safe_codings (p2, p2end, safe_codings, work_table,
6524 &single_byte_char_found);
6525
176c92e6
SM
6526 if (EQ (safe_codings, Qt))
6527 ; /* Nothing to be done. */
6528 else if (!single_byte_char_found)
05e6f5dc
KH
6529 {
6530 /* Append generic coding systems. */
6531 Lisp_Object args[2];
6532 args[0] = safe_codings;
6533 args[1] = Fchar_table_extra_slot (Vchar_coding_system_table,
6534 make_number (0));
975f250a 6535 safe_codings = Fappend (2, args);
05e6f5dc
KH
6536 }
6537 else
109a5acb
KH
6538 safe_codings = Fcons (Qraw_text,
6539 Fcons (Qemacs_mule,
6540 Fcons (Qno_conversion, safe_codings)));
05e6f5dc
KH
6541 return safe_codings;
6542}
6543
6544
6b89e3aa
KH
6545static Lisp_Object
6546find_safe_codings_2 (p, pend, safe_codings, work_table, single_byte_char_found)
6547 unsigned char *p, *pend;
6548 Lisp_Object safe_codings, work_table;
6549 int *single_byte_char_found;
6550{
6551 int c, len, i;
6552 Lisp_Object val, ch;
6553 Lisp_Object prev, tail;
177c0ea7 6554
6b89e3aa
KH
6555 while (p < pend)
6556 {
6557 c = STRING_CHAR_AND_LENGTH (p, pend - p, len);
6558 p += len;
6559 if (ASCII_BYTE_P (c))
6560 /* We can ignore ASCII characters here. */
6561 continue;
6562 if (SINGLE_BYTE_CHAR_P (c))
6563 *single_byte_char_found = 1;
6564 if (NILP (safe_codings))
6565 /* Already all coding systems are excluded. */
6566 continue;
6567 /* Check the safe coding systems for C. */
6568 ch = make_number (c);
6569 val = Faref (work_table, ch);
6570 if (EQ (val, Qt))
6571 /* This element was already checked. Ignore it. */
6572 continue;
6573 /* Remember that we checked this element. */
6574 Faset (work_table, ch, Qt);
6575
6576 for (prev = tail = safe_codings; CONSP (tail); tail = XCDR (tail))
6577 {
6578 val = XCAR (tail);
6579 if (NILP (Faref (XCDR (val), ch)))
6580 {
6581 /* Exclued this coding system from SAFE_CODINGS. */
6582 if (EQ (tail, safe_codings))
6583 safe_codings = XCDR (safe_codings);
6584 else
6585 XSETCDR (prev, XCDR (tail));
6586 }
6587 else
6588 prev = tail;
6589 }
6590 }
6591 return safe_codings;
6592}
6593
6594DEFUN ("find-coding-systems-region-internal-2",
6595 Ffind_coding_systems_region_internal_2,
6596 Sfind_coding_systems_region_internal_2, 2, 2, 0,
6597 doc: /* Internal use only. */)
6598 (start, end)
6599 Lisp_Object start, end;
6600{
6601 Lisp_Object work_table, safe_codings;
6602 int non_ascii_p = 0;
6603 int single_byte_char_found = 0;
6604 const unsigned char *p1, *p1end, *p2, *p2end, *p;
6605
6606 if (STRINGP (start))
6607 {
6608 if (!STRING_MULTIBYTE (start))
6609 return Qt;
6610 p1 = SDATA (start), p1end = p1 + SBYTES (start);
6611 p2 = p2end = p1end;
6612 if (SCHARS (start) != SBYTES (start))
6613 non_ascii_p = 1;
6614 }
6615 else
6616 {
6617 int from, to, stop;
6618
6619 CHECK_NUMBER_COERCE_MARKER (start);
6620 CHECK_NUMBER_COERCE_MARKER (end);
6621 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
6622 args_out_of_range (start, end);
6623 if (NILP (current_buffer->enable_multibyte_characters))
6624 return Qt;
6625 from = CHAR_TO_BYTE (XINT (start));
6626 to = CHAR_TO_BYTE (XINT (end));
6627 stop = from < GPT_BYTE && GPT_BYTE < to ? GPT_BYTE : to;
6628 p1 = BYTE_POS_ADDR (from), p1end = p1 + (stop - from);
6629 if (stop == to)
6630 p2 = p2end = p1end;
6631 else
6632 p2 = BYTE_POS_ADDR (stop), p2end = p2 + (to - stop);
6633 if (XINT (end) - XINT (start) != to - from)
6634 non_ascii_p = 1;
6635 }
6636
6637 if (!non_ascii_p)
6638 {
6639 /* We are sure that the text contains no multibyte character.
6640 Check if it contains eight-bit-graphic. */
6641 p = p1;
6642 for (p = p1; p < p1end && ASCII_BYTE_P (*p); p++);
6643 if (p == p1end)
6644 {
6645 for (p = p2; p < p2end && ASCII_BYTE_P (*p); p++);
6646 if (p == p2end)
6647 return Qt;
6648 }
6649 }
6650
6651 /* The text contains non-ASCII characters. */
6652
6653 work_table = Fmake_char_table (Qchar_coding_system, Qnil);
6654 safe_codings = Fcopy_sequence (XCDR (Vcoding_system_safe_chars));
6655
6656 safe_codings = find_safe_codings_2 (p1, p1end, safe_codings, work_table,
6657 &single_byte_char_found);
6658 if (p2 < p2end)
6659 safe_codings = find_safe_codings_2 (p2, p2end, safe_codings, work_table,
6660 &single_byte_char_found);
6661 if (EQ (safe_codings, XCDR (Vcoding_system_safe_chars)))
6662 safe_codings = Qt;
6663 else
6664 {
6665 /* Turn safe_codings to a list of coding systems... */
6666 Lisp_Object val;
6667
6668 if (single_byte_char_found)
6669 /* ... and append these for eight-bit chars. */
6670 val = Fcons (Qraw_text,
6671 Fcons (Qemacs_mule, Fcons (Qno_conversion, Qnil)));
6672 else
6673 /* ... and append generic coding systems. */
6674 val = Fcopy_sequence (XCAR (Vcoding_system_safe_chars));
177c0ea7 6675
6b89e3aa
KH
6676 for (; CONSP (safe_codings); safe_codings = XCDR (safe_codings))
6677 val = Fcons (XCAR (XCAR (safe_codings)), val);
6678 safe_codings = val;
6679 }
6680
6681 return safe_codings;
6682}
6683
6684
068a9dbd
KH
6685/* Search from position POS for such characters that are unencodable
6686 accoding to SAFE_CHARS, and return a list of their positions. P
6687 points where in the memory the character at POS exists. Limit the
6688 search at PEND or when Nth unencodable characters are found.
6689
6690 If SAFE_CHARS is a char table, an element for an unencodable
6691 character is nil.
6692
6693 If SAFE_CHARS is nil, all non-ASCII characters are unencodable.
6694
6695 Otherwise, SAFE_CHARS is t, and only eight-bit-contrl and
6696 eight-bit-graphic characters are unencodable. */
6697
6698static Lisp_Object
6699unencodable_char_position (safe_chars, pos, p, pend, n)
6700 Lisp_Object safe_chars;
6701 int pos;
6702 unsigned char *p, *pend;
6703 int n;
6704{
6705 Lisp_Object pos_list;
6706
6707 pos_list = Qnil;
6708 while (p < pend)
6709 {
6710 int len;
6711 int c = STRING_CHAR_AND_LENGTH (p, MAX_MULTIBYTE_LENGTH, len);
7d0393cf 6712
068a9dbd
KH
6713 if (c >= 128
6714 && (CHAR_TABLE_P (safe_chars)
6715 ? NILP (CHAR_TABLE_REF (safe_chars, c))
6716 : (NILP (safe_chars) || c < 256)))
6717 {
6718 pos_list = Fcons (make_number (pos), pos_list);
6719 if (--n <= 0)
6720 break;
6721 }
6722 pos++;
6723 p += len;
6724 }
6725 return Fnreverse (pos_list);
6726}
6727
6728
6729DEFUN ("unencodable-char-position", Funencodable_char_position,
6730 Sunencodable_char_position, 3, 5, 0,
6731 doc: /*
6732Return position of first un-encodable character in a region.
6733START and END specfiy the region and CODING-SYSTEM specifies the
6734encoding to check. Return nil if CODING-SYSTEM does encode the region.
6735
6736If optional 4th argument COUNT is non-nil, it specifies at most how
6737many un-encodable characters to search. In this case, the value is a
6738list of positions.
6739
6740If optional 5th argument STRING is non-nil, it is a string to search
6741for un-encodable characters. In that case, START and END are indexes
6742to the string. */)
6743 (start, end, coding_system, count, string)
6744 Lisp_Object start, end, coding_system, count, string;
6745{
6746 int n;
6747 Lisp_Object safe_chars;
6748 struct coding_system coding;
6749 Lisp_Object positions;
6750 int from, to;
6751 unsigned char *p, *pend;
6752
6753 if (NILP (string))
6754 {
6755 validate_region (&start, &end);
6756 from = XINT (start);
6757 to = XINT (end);
6758 if (NILP (current_buffer->enable_multibyte_characters))
6759 return Qnil;
6760 p = CHAR_POS_ADDR (from);
200c93e2
KH
6761 if (to == GPT)
6762 pend = GPT_ADDR;
6763 else
6764 pend = CHAR_POS_ADDR (to);
068a9dbd
KH
6765 }
6766 else
6767 {
6768 CHECK_STRING (string);
6769 CHECK_NATNUM (start);
6770 CHECK_NATNUM (end);
6771 from = XINT (start);
6772 to = XINT (end);
6773 if (from > to
6774 || to > SCHARS (string))
6775 args_out_of_range_3 (string, start, end);
6776 if (! STRING_MULTIBYTE (string))
6777 return Qnil;
6778 p = SDATA (string) + string_char_to_byte (string, from);
6779 pend = SDATA (string) + string_char_to_byte (string, to);
6780 }
6781
6782 setup_coding_system (Fcheck_coding_system (coding_system), &coding);
6783
6784 if (NILP (count))
6785 n = 1;
6786 else
6787 {
6788 CHECK_NATNUM (count);
6789 n = XINT (count);
6790 }
6791
6792 if (coding.type == coding_type_no_conversion
6793 || coding.type == coding_type_raw_text)
6794 return Qnil;
6795
6796 if (coding.type == coding_type_undecided)
6797 safe_chars = Qnil;
6798 else
6b89e3aa 6799 safe_chars = coding_safe_chars (coding_system);
068a9dbd
KH
6800
6801 if (STRINGP (string)
6802 || from >= GPT || to <= GPT)
6803 positions = unencodable_char_position (safe_chars, from, p, pend, n);
6804 else
6805 {
6806 Lisp_Object args[2];
6807
6808 args[0] = unencodable_char_position (safe_chars, from, p, GPT_ADDR, n);
96d2e64d 6809 n -= XINT (Flength (args[0]));
068a9dbd
KH
6810 if (n <= 0)
6811 positions = args[0];
6812 else
6813 {
6814 args[1] = unencodable_char_position (safe_chars, GPT, GAP_END_ADDR,
6815 pend, n);
6816 positions = Fappend (2, args);
6817 }
6818 }
6819
6820 return (NILP (count) ? Fcar (positions) : positions);
6821}
6822
6823
4031e2bf
KH
6824Lisp_Object
6825code_convert_region1 (start, end, coding_system, encodep)
d46c5b12 6826 Lisp_Object start, end, coding_system;
4031e2bf 6827 int encodep;
3a73fa5d
RS
6828{
6829 struct coding_system coding;
da55a2b7 6830 int from, to;
3a73fa5d 6831
b7826503
PJ
6832 CHECK_NUMBER_COERCE_MARKER (start);
6833 CHECK_NUMBER_COERCE_MARKER (end);
6834 CHECK_SYMBOL (coding_system);
3a73fa5d 6835
d46c5b12
KH
6836 validate_region (&start, &end);
6837 from = XFASTINT (start);
6838 to = XFASTINT (end);
6839
3a73fa5d 6840 if (NILP (coding_system))
d46c5b12
KH
6841 return make_number (to - from);
6842
3a73fa5d 6843 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
d5db4077 6844 error ("Invalid coding system: %s", SDATA (SYMBOL_NAME (coding_system)));
3a73fa5d 6845
d46c5b12 6846 coding.mode |= CODING_MODE_LAST_BLOCK;
b73bfc1c
KH
6847 coding.src_multibyte = coding.dst_multibyte
6848 = !NILP (current_buffer->enable_multibyte_characters);
fb88bf2d
KH
6849 code_convert_region (from, CHAR_TO_BYTE (from), to, CHAR_TO_BYTE (to),
6850 &coding, encodep, 1);
f072a3e8 6851 Vlast_coding_system_used = coding.symbol;
fb88bf2d 6852 return make_number (coding.produced_char);
4031e2bf
KH
6853}
6854
6855DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
6856 3, 3, "r\nzCoding system: ",
48b0f3ae
PJ
6857 doc: /* Decode the current region from the specified coding system.
6858When called from a program, takes three arguments:
6859START, END, and CODING-SYSTEM. START and END are buffer positions.
6860This function sets `last-coding-system-used' to the precise coding system
6861used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
6862not fully specified.)
6863It returns the length of the decoded text. */)
6864 (start, end, coding_system)
4031e2bf
KH
6865 Lisp_Object start, end, coding_system;
6866{
6867 return code_convert_region1 (start, end, coding_system, 0);
3a73fa5d
RS
6868}
6869
6870DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
6871 3, 3, "r\nzCoding system: ",
48b0f3ae
PJ
6872 doc: /* Encode the current region into the specified coding system.
6873When called from a program, takes three arguments:
6874START, END, and CODING-SYSTEM. START and END are buffer positions.
6875This function sets `last-coding-system-used' to the precise coding system
6876used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
6877not fully specified.)
6878It returns the length of the encoded text. */)
6879 (start, end, coding_system)
d46c5b12 6880 Lisp_Object start, end, coding_system;
3a73fa5d 6881{
4031e2bf
KH
6882 return code_convert_region1 (start, end, coding_system, 1);
6883}
3a73fa5d 6884
4031e2bf
KH
6885Lisp_Object
6886code_convert_string1 (string, coding_system, nocopy, encodep)
6887 Lisp_Object string, coding_system, nocopy;
6888 int encodep;
6889{
6890 struct coding_system coding;
3a73fa5d 6891
b7826503
PJ
6892 CHECK_STRING (string);
6893 CHECK_SYMBOL (coding_system);
4ed46869 6894
d46c5b12 6895 if (NILP (coding_system))
4031e2bf 6896 return (NILP (nocopy) ? Fcopy_sequence (string) : string);
4ed46869 6897
d46c5b12 6898 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
d5db4077 6899 error ("Invalid coding system: %s", SDATA (SYMBOL_NAME (coding_system)));
5f1cd180 6900
d46c5b12 6901 coding.mode |= CODING_MODE_LAST_BLOCK;
b73bfc1c
KH
6902 string = (encodep
6903 ? encode_coding_string (string, &coding, !NILP (nocopy))
6904 : decode_coding_string (string, &coding, !NILP (nocopy)));
f072a3e8 6905 Vlast_coding_system_used = coding.symbol;
ec6d2bb8
KH
6906
6907 return string;
4ed46869
KH
6908}
6909
4ed46869 6910DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
e0e989f6 6911 2, 3, 0,
48b0f3ae
PJ
6912 doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
6913Optional arg NOCOPY non-nil means it is OK to return STRING itself
6914if the decoding operation is trivial.
6915This function sets `last-coding-system-used' to the precise coding system
6916used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
6917not fully specified.) */)
6918 (string, coding_system, nocopy)
e0e989f6 6919 Lisp_Object string, coding_system, nocopy;
4ed46869 6920{
f072a3e8 6921 return code_convert_string1 (string, coding_system, nocopy, 0);
4ed46869
KH
6922}
6923
6924DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
e0e989f6 6925 2, 3, 0,
48b0f3ae
PJ
6926 doc: /* Encode STRING to CODING-SYSTEM, and return the result.
6927Optional arg NOCOPY non-nil means it is OK to return STRING itself
6928if the encoding operation is trivial.
6929This function sets `last-coding-system-used' to the precise coding system
6930used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
6931not fully specified.) */)
6932 (string, coding_system, nocopy)
e0e989f6 6933 Lisp_Object string, coding_system, nocopy;
4ed46869 6934{
f072a3e8 6935 return code_convert_string1 (string, coding_system, nocopy, 1);
4ed46869 6936}
4031e2bf 6937
ecec61c1 6938/* Encode or decode STRING according to CODING_SYSTEM.
ec6d2bb8
KH
6939 Do not set Vlast_coding_system_used.
6940
6941 This function is called only from macros DECODE_FILE and
6942 ENCODE_FILE, thus we ignore character composition. */
ecec61c1
KH
6943
6944Lisp_Object
6945code_convert_string_norecord (string, coding_system, encodep)
6946 Lisp_Object string, coding_system;
6947 int encodep;
6948{
6949 struct coding_system coding;
6950
b7826503
PJ
6951 CHECK_STRING (string);
6952 CHECK_SYMBOL (coding_system);
ecec61c1
KH
6953
6954 if (NILP (coding_system))
6955 return string;
6956
6957 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
d5db4077 6958 error ("Invalid coding system: %s", SDATA (SYMBOL_NAME (coding_system)));
ecec61c1 6959
ec6d2bb8 6960 coding.composing = COMPOSITION_DISABLED;
ecec61c1 6961 coding.mode |= CODING_MODE_LAST_BLOCK;
b73bfc1c
KH
6962 return (encodep
6963 ? encode_coding_string (string, &coding, 1)
6964 : decode_coding_string (string, &coding, 1));
ecec61c1 6965}
3a73fa5d 6966\f
4ed46869 6967DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
48b0f3ae
PJ
6968 doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
6969Return the corresponding character. */)
6970 (code)
4ed46869
KH
6971 Lisp_Object code;
6972{
6973 unsigned char c1, c2, s1, s2;
6974 Lisp_Object val;
6975
b7826503 6976 CHECK_NUMBER (code);
4ed46869 6977 s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
55ab7be3
KH
6978 if (s1 == 0)
6979 {
c28a9453
KH
6980 if (s2 < 0x80)
6981 XSETFASTINT (val, s2);
6982 else if (s2 >= 0xA0 || s2 <= 0xDF)
b73bfc1c 6983 XSETFASTINT (val, MAKE_CHAR (charset_katakana_jisx0201, s2, 0));
c28a9453 6984 else
9da8350f 6985 error ("Invalid Shift JIS code: %x", XFASTINT (code));
55ab7be3
KH
6986 }
6987 else
6988 {
87323294 6989 if ((s1 < 0x80 || (s1 > 0x9F && s1 < 0xE0) || s1 > 0xEF)
55ab7be3 6990 || (s2 < 0x40 || s2 == 0x7F || s2 > 0xFC))
9da8350f 6991 error ("Invalid Shift JIS code: %x", XFASTINT (code));
55ab7be3 6992 DECODE_SJIS (s1, s2, c1, c2);
b73bfc1c 6993 XSETFASTINT (val, MAKE_CHAR (charset_jisx0208, c1, c2));
55ab7be3 6994 }
4ed46869
KH
6995 return val;
6996}
6997
6998DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
48b0f3ae
PJ
6999 doc: /* Encode a Japanese character CHAR to shift_jis encoding.
7000Return the corresponding code in SJIS. */)
7001 (ch)
4ed46869
KH
7002 Lisp_Object ch;
7003{
bcf26d6a 7004 int charset, c1, c2, s1, s2;
4ed46869
KH
7005 Lisp_Object val;
7006
b7826503 7007 CHECK_NUMBER (ch);
4ed46869 7008 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
c28a9453
KH
7009 if (charset == CHARSET_ASCII)
7010 {
7011 val = ch;
7012 }
7013 else if (charset == charset_jisx0208
7014 && c1 > 0x20 && c1 < 0x7F && c2 > 0x20 && c2 < 0x7F)
4ed46869
KH
7015 {
7016 ENCODE_SJIS (c1, c2, s1, s2);
bcf26d6a 7017 XSETFASTINT (val, (s1 << 8) | s2);
4ed46869 7018 }
55ab7be3
KH
7019 else if (charset == charset_katakana_jisx0201
7020 && c1 > 0x20 && c2 < 0xE0)
7021 {
7022 XSETFASTINT (val, c1 | 0x80);
7023 }
4ed46869 7024 else
55ab7be3 7025 error ("Can't encode to shift_jis: %d", XFASTINT (ch));
4ed46869
KH
7026 return val;
7027}
7028
7029DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
48b0f3ae
PJ
7030 doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
7031Return the corresponding character. */)
7032 (code)
4ed46869
KH
7033 Lisp_Object code;
7034{
7035 int charset;
7036 unsigned char b1, b2, c1, c2;
7037 Lisp_Object val;
7038
b7826503 7039 CHECK_NUMBER (code);
4ed46869 7040 b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF;
c28a9453
KH
7041 if (b1 == 0)
7042 {
7043 if (b2 >= 0x80)
9da8350f 7044 error ("Invalid BIG5 code: %x", XFASTINT (code));
c28a9453
KH
7045 val = code;
7046 }
7047 else
7048 {
7049 if ((b1 < 0xA1 || b1 > 0xFE)
7050 || (b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE))
9da8350f 7051 error ("Invalid BIG5 code: %x", XFASTINT (code));
c28a9453 7052 DECODE_BIG5 (b1, b2, charset, c1, c2);
b73bfc1c 7053 XSETFASTINT (val, MAKE_CHAR (charset, c1, c2));
c28a9453 7054 }
4ed46869
KH
7055 return val;
7056}
7057
7058DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
48b0f3ae
PJ
7059 doc: /* Encode the Big5 character CHAR to BIG5 coding system.
7060Return the corresponding character code in Big5. */)
7061 (ch)
4ed46869
KH
7062 Lisp_Object ch;
7063{
bcf26d6a 7064 int charset, c1, c2, b1, b2;
4ed46869
KH
7065 Lisp_Object val;
7066
b7826503 7067 CHECK_NUMBER (ch);
4ed46869 7068 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
c28a9453
KH
7069 if (charset == CHARSET_ASCII)
7070 {
7071 val = ch;
7072 }
7073 else if ((charset == charset_big5_1
7074 && (XFASTINT (ch) >= 0x250a1 && XFASTINT (ch) <= 0x271ec))
7075 || (charset == charset_big5_2
7076 && XFASTINT (ch) >= 0x290a1 && XFASTINT (ch) <= 0x2bdb2))
4ed46869
KH
7077 {
7078 ENCODE_BIG5 (charset, c1, c2, b1, b2);
bcf26d6a 7079 XSETFASTINT (val, (b1 << 8) | b2);
4ed46869
KH
7080 }
7081 else
c28a9453 7082 error ("Can't encode to Big5: %d", XFASTINT (ch));
4ed46869
KH
7083 return val;
7084}
3a73fa5d 7085\f
002fdb44 7086DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
48b0f3ae
PJ
7087 Sset_terminal_coding_system_internal, 1, 1, 0,
7088 doc: /* Internal use only. */)
7089 (coding_system)
4ed46869
KH
7090 Lisp_Object coding_system;
7091{
b7826503 7092 CHECK_SYMBOL (coding_system);
4ed46869 7093 setup_coding_system (Fcheck_coding_system (coding_system), &terminal_coding);
70c22245 7094 /* We had better not send unsafe characters to terminal. */
6e85d753 7095 terminal_coding.flags |= CODING_FLAG_ISO_SAFE;
8ca3766a 7096 /* Character composition should be disabled. */
ec6d2bb8 7097 terminal_coding.composing = COMPOSITION_DISABLED;
bd64290d
KH
7098 /* Error notification should be suppressed. */
7099 terminal_coding.suppress_error = 1;
b73bfc1c
KH
7100 terminal_coding.src_multibyte = 1;
7101 terminal_coding.dst_multibyte = 0;
4ed46869
KH
7102 return Qnil;
7103}
7104
002fdb44 7105DEFUN ("set-safe-terminal-coding-system-internal", Fset_safe_terminal_coding_system_internal,
48b0f3ae 7106 Sset_safe_terminal_coding_system_internal, 1, 1, 0,
ddb67bdc 7107 doc: /* Internal use only. */)
48b0f3ae 7108 (coding_system)
c4825358
KH
7109 Lisp_Object coding_system;
7110{
b7826503 7111 CHECK_SYMBOL (coding_system);
c4825358
KH
7112 setup_coding_system (Fcheck_coding_system (coding_system),
7113 &safe_terminal_coding);
8ca3766a 7114 /* Character composition should be disabled. */
ec6d2bb8 7115 safe_terminal_coding.composing = COMPOSITION_DISABLED;
bd64290d
KH
7116 /* Error notification should be suppressed. */
7117 terminal_coding.suppress_error = 1;
b73bfc1c
KH
7118 safe_terminal_coding.src_multibyte = 1;
7119 safe_terminal_coding.dst_multibyte = 0;
c4825358
KH
7120 return Qnil;
7121}
7122
002fdb44
DL
7123DEFUN ("terminal-coding-system", Fterminal_coding_system,
7124 Sterminal_coding_system, 0, 0, 0,
48b0f3ae
PJ
7125 doc: /* Return coding system specified for terminal output. */)
7126 ()
4ed46869
KH
7127{
7128 return terminal_coding.symbol;
7129}
7130
002fdb44 7131DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
48b0f3ae
PJ
7132 Sset_keyboard_coding_system_internal, 1, 1, 0,
7133 doc: /* Internal use only. */)
7134 (coding_system)
4ed46869
KH
7135 Lisp_Object coding_system;
7136{
b7826503 7137 CHECK_SYMBOL (coding_system);
4ed46869 7138 setup_coding_system (Fcheck_coding_system (coding_system), &keyboard_coding);
8ca3766a 7139 /* Character composition should be disabled. */
ec6d2bb8 7140 keyboard_coding.composing = COMPOSITION_DISABLED;
4ed46869
KH
7141 return Qnil;
7142}
7143
002fdb44
DL
7144DEFUN ("keyboard-coding-system", Fkeyboard_coding_system,
7145 Skeyboard_coding_system, 0, 0, 0,
48b0f3ae
PJ
7146 doc: /* Return coding system specified for decoding keyboard input. */)
7147 ()
4ed46869
KH
7148{
7149 return keyboard_coding.symbol;
7150}
7151
7152\f
a5d301df
KH
7153DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
7154 Sfind_operation_coding_system, 1, MANY, 0,
48b0f3ae
PJ
7155 doc: /* Choose a coding system for an operation based on the target name.
7156The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
7157DECODING-SYSTEM is the coding system to use for decoding
7158\(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
7159for encoding (in case OPERATION does encoding).
7160
7161The first argument OPERATION specifies an I/O primitive:
7162 For file I/O, `insert-file-contents' or `write-region'.
7163 For process I/O, `call-process', `call-process-region', or `start-process'.
7164 For network I/O, `open-network-stream'.
7165
7166The remaining arguments should be the same arguments that were passed
7167to the primitive. Depending on which primitive, one of those arguments
7168is selected as the TARGET. For example, if OPERATION does file I/O,
7169whichever argument specifies the file name is TARGET.
7170
7171TARGET has a meaning which depends on OPERATION:
7172 For file I/O, TARGET is a file name.
7173 For process I/O, TARGET is a process name.
7174 For network I/O, TARGET is a service name or a port number
7175
7176This function looks up what specified for TARGET in,
7177`file-coding-system-alist', `process-coding-system-alist',
7178or `network-coding-system-alist' depending on OPERATION.
7179They may specify a coding system, a cons of coding systems,
7180or a function symbol to call.
7181In the last case, we call the function with one argument,
7182which is a list of all the arguments given to this function.
7183
7184usage: (find-operation-coding-system OPERATION ARGUMENTS ...) */)
7185 (nargs, args)
4ed46869
KH
7186 int nargs;
7187 Lisp_Object *args;
7188{
7189 Lisp_Object operation, target_idx, target, val;
7190 register Lisp_Object chain;
7191
7192 if (nargs < 2)
7193 error ("Too few arguments");
7194 operation = args[0];
7195 if (!SYMBOLP (operation)
7196 || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
8ca3766a 7197 error ("Invalid first argument");
4ed46869
KH
7198 if (nargs < 1 + XINT (target_idx))
7199 error ("Too few arguments for operation: %s",
d5db4077 7200 SDATA (SYMBOL_NAME (operation)));
7f787cfd
KH
7201 /* For write-region, if the 6th argument (i.e. VISIT, the 5th
7202 argument to write-region) is string, it must be treated as a
7203 target file name. */
7204 if (EQ (operation, Qwrite_region)
7205 && nargs > 5
7206 && STRINGP (args[5]))
d90ed3b4 7207 target_idx = make_number (4);
4ed46869
KH
7208 target = args[XINT (target_idx) + 1];
7209 if (!(STRINGP (target)
7210 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
8ca3766a 7211 error ("Invalid argument %d", XINT (target_idx) + 1);
4ed46869 7212
2e34157c
RS
7213 chain = ((EQ (operation, Qinsert_file_contents)
7214 || EQ (operation, Qwrite_region))
02ba4723 7215 ? Vfile_coding_system_alist
2e34157c 7216 : (EQ (operation, Qopen_network_stream)
02ba4723
KH
7217 ? Vnetwork_coding_system_alist
7218 : Vprocess_coding_system_alist));
4ed46869
KH
7219 if (NILP (chain))
7220 return Qnil;
7221
03699b14 7222 for (; CONSP (chain); chain = XCDR (chain))
4ed46869 7223 {
f44d27ce 7224 Lisp_Object elt;
03699b14 7225 elt = XCAR (chain);
4ed46869
KH
7226
7227 if (CONSP (elt)
7228 && ((STRINGP (target)
03699b14
KR
7229 && STRINGP (XCAR (elt))
7230 && fast_string_match (XCAR (elt), target) >= 0)
7231 || (INTEGERP (target) && EQ (target, XCAR (elt)))))
02ba4723 7232 {
03699b14 7233 val = XCDR (elt);
b19fd4c5
KH
7234 /* Here, if VAL is both a valid coding system and a valid
7235 function symbol, we return VAL as a coding system. */
02ba4723
KH
7236 if (CONSP (val))
7237 return val;
7238 if (! SYMBOLP (val))
7239 return Qnil;
7240 if (! NILP (Fcoding_system_p (val)))
7241 return Fcons (val, val);
b19fd4c5
KH
7242 if (! NILP (Ffboundp (val)))
7243 {
7244 val = call1 (val, Flist (nargs, args));
7245 if (CONSP (val))
7246 return val;
7247 if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
7248 return Fcons (val, val);
7249 }
02ba4723
KH
7250 return Qnil;
7251 }
4ed46869
KH
7252 }
7253 return Qnil;
7254}
7255
1397dc18
KH
7256DEFUN ("update-coding-systems-internal", Fupdate_coding_systems_internal,
7257 Supdate_coding_systems_internal, 0, 0, 0,
48b0f3ae
PJ
7258 doc: /* Update internal database for ISO2022 and CCL based coding systems.
7259When values of any coding categories are changed, you must
7260call this function. */)
7261 ()
d46c5b12
KH
7262{
7263 int i;
7264
fa42c37f 7265 for (i = CODING_CATEGORY_IDX_EMACS_MULE; i < CODING_CATEGORY_IDX_MAX; i++)
d46c5b12 7266 {
1397dc18
KH
7267 Lisp_Object val;
7268
f5c1dd0d 7269 val = SYMBOL_VALUE (XVECTOR (Vcoding_category_table)->contents[i]);
1397dc18
KH
7270 if (!NILP (val))
7271 {
7272 if (! coding_system_table[i])
7273 coding_system_table[i] = ((struct coding_system *)
7274 xmalloc (sizeof (struct coding_system)));
7275 setup_coding_system (val, coding_system_table[i]);
7276 }
7277 else if (coding_system_table[i])
7278 {
7279 xfree (coding_system_table[i]);
7280 coding_system_table[i] = NULL;
7281 }
d46c5b12 7282 }
1397dc18 7283
d46c5b12
KH
7284 return Qnil;
7285}
7286
66cfb530
KH
7287DEFUN ("set-coding-priority-internal", Fset_coding_priority_internal,
7288 Sset_coding_priority_internal, 0, 0, 0,
48b0f3ae
PJ
7289 doc: /* Update internal database for the current value of `coding-category-list'.
7290This function is internal use only. */)
7291 ()
66cfb530
KH
7292{
7293 int i = 0, idx;
84d60297
RS
7294 Lisp_Object val;
7295
7296 val = Vcoding_category_list;
66cfb530
KH
7297
7298 while (CONSP (val) && i < CODING_CATEGORY_IDX_MAX)
7299 {
03699b14 7300 if (! SYMBOLP (XCAR (val)))
66cfb530 7301 break;
03699b14 7302 idx = XFASTINT (Fget (XCAR (val), Qcoding_category_index));
66cfb530
KH
7303 if (idx >= CODING_CATEGORY_IDX_MAX)
7304 break;
7305 coding_priorities[i++] = (1 << idx);
03699b14 7306 val = XCDR (val);
66cfb530
KH
7307 }
7308 /* If coding-category-list is valid and contains all coding
7309 categories, `i' should be CODING_CATEGORY_IDX_MAX now. If not,
fa42c37f 7310 the following code saves Emacs from crashing. */
66cfb530
KH
7311 while (i < CODING_CATEGORY_IDX_MAX)
7312 coding_priorities[i++] = CODING_CATEGORY_MASK_RAW_TEXT;
7313
7314 return Qnil;
7315}
7316
6b89e3aa
KH
7317DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
7318 Sdefine_coding_system_internal, 1, 1, 0,
7319 doc: /* Register CODING-SYSTEM as a base coding system.
7320This function is internal use only. */)
7321 (coding_system)
7322 Lisp_Object coding_system;
7323{
7324 Lisp_Object safe_chars, slot;
7325
7326 if (NILP (Fcheck_coding_system (coding_system)))
7327 Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
7328 safe_chars = coding_safe_chars (coding_system);
7329 if (! EQ (safe_chars, Qt) && ! CHAR_TABLE_P (safe_chars))
7330 error ("No valid safe-chars property for %s",
7331 SDATA (SYMBOL_NAME (coding_system)));
7332 if (EQ (safe_chars, Qt))
7333 {
7334 if (NILP (Fmemq (coding_system, XCAR (Vcoding_system_safe_chars))))
7335 XSETCAR (Vcoding_system_safe_chars,
7336 Fcons (coding_system, XCAR (Vcoding_system_safe_chars)));
7337 }
7338 else
7339 {
7340 slot = Fassq (coding_system, XCDR (Vcoding_system_safe_chars));
7341 if (NILP (slot))
7342 XSETCDR (Vcoding_system_safe_chars,
7343 nconc2 (XCDR (Vcoding_system_safe_chars),
7344 Fcons (Fcons (coding_system, safe_chars), Qnil)));
7345 else
7346 XSETCDR (slot, safe_chars);
7347 }
7348 return Qnil;
7349}
7350
4ed46869
KH
7351#endif /* emacs */
7352
7353\f
1397dc18 7354/*** 9. Post-amble ***/
4ed46869 7355
dfcf069d 7356void
4ed46869
KH
7357init_coding_once ()
7358{
7359 int i;
7360
93dec019 7361 /* Emacs' internal format specific initialize routine. */
4ed46869
KH
7362 for (i = 0; i <= 0x20; i++)
7363 emacs_code_class[i] = EMACS_control_code;
7364 emacs_code_class[0x0A] = EMACS_linefeed_code;
7365 emacs_code_class[0x0D] = EMACS_carriage_return_code;
7366 for (i = 0x21 ; i < 0x7F; i++)
7367 emacs_code_class[i] = EMACS_ascii_code;
7368 emacs_code_class[0x7F] = EMACS_control_code;
ec6d2bb8 7369 for (i = 0x80; i < 0xFF; i++)
4ed46869
KH
7370 emacs_code_class[i] = EMACS_invalid_code;
7371 emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
7372 emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
7373 emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
7374 emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;
7375
7376 /* ISO2022 specific initialize routine. */
7377 for (i = 0; i < 0x20; i++)
b73bfc1c 7378 iso_code_class[i] = ISO_control_0;
4ed46869
KH
7379 for (i = 0x21; i < 0x7F; i++)
7380 iso_code_class[i] = ISO_graphic_plane_0;
7381 for (i = 0x80; i < 0xA0; i++)
b73bfc1c 7382 iso_code_class[i] = ISO_control_1;
4ed46869
KH
7383 for (i = 0xA1; i < 0xFF; i++)
7384 iso_code_class[i] = ISO_graphic_plane_1;
7385 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
7386 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
7387 iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
7388 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
7389 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
7390 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
7391 iso_code_class[ISO_CODE_ESC] = ISO_escape;
7392 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
7393 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
7394 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
7395
e0e989f6
KH
7396 setup_coding_system (Qnil, &keyboard_coding);
7397 setup_coding_system (Qnil, &terminal_coding);
c4825358 7398 setup_coding_system (Qnil, &safe_terminal_coding);
6bc51348 7399 setup_coding_system (Qnil, &default_buffer_file_coding);
9ce27fde 7400
d46c5b12
KH
7401 bzero (coding_system_table, sizeof coding_system_table);
7402
66cfb530
KH
7403 bzero (ascii_skip_code, sizeof ascii_skip_code);
7404 for (i = 0; i < 128; i++)
7405 ascii_skip_code[i] = 1;
7406
9ce27fde
KH
7407#if defined (MSDOS) || defined (WINDOWSNT)
7408 system_eol_type = CODING_EOL_CRLF;
7409#else
7410 system_eol_type = CODING_EOL_LF;
7411#endif
b843d1ae
KH
7412
7413 inhibit_pre_post_conversion = 0;
e0e989f6
KH
7414}
7415
7416#ifdef emacs
7417
dfcf069d 7418void
e0e989f6
KH
7419syms_of_coding ()
7420{
7421 Qtarget_idx = intern ("target-idx");
7422 staticpro (&Qtarget_idx);
7423
bb0115a2
RS
7424 Qcoding_system_history = intern ("coding-system-history");
7425 staticpro (&Qcoding_system_history);
7426 Fset (Qcoding_system_history, Qnil);
7427
9ce27fde 7428 /* Target FILENAME is the first argument. */
e0e989f6 7429 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
9ce27fde 7430 /* Target FILENAME is the third argument. */
e0e989f6
KH
7431 Fput (Qwrite_region, Qtarget_idx, make_number (2));
7432
7433 Qcall_process = intern ("call-process");
7434 staticpro (&Qcall_process);
9ce27fde 7435 /* Target PROGRAM is the first argument. */
e0e989f6
KH
7436 Fput (Qcall_process, Qtarget_idx, make_number (0));
7437
7438 Qcall_process_region = intern ("call-process-region");
7439 staticpro (&Qcall_process_region);
9ce27fde 7440 /* Target PROGRAM is the third argument. */
e0e989f6
KH
7441 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
7442
7443 Qstart_process = intern ("start-process");
7444 staticpro (&Qstart_process);
9ce27fde 7445 /* Target PROGRAM is the third argument. */
e0e989f6
KH
7446 Fput (Qstart_process, Qtarget_idx, make_number (2));
7447
7448 Qopen_network_stream = intern ("open-network-stream");
7449 staticpro (&Qopen_network_stream);
9ce27fde 7450 /* Target SERVICE is the fourth argument. */
e0e989f6
KH
7451 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
7452
4ed46869
KH
7453 Qcoding_system = intern ("coding-system");
7454 staticpro (&Qcoding_system);
7455
7456 Qeol_type = intern ("eol-type");
7457 staticpro (&Qeol_type);
7458
7459 Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
7460 staticpro (&Qbuffer_file_coding_system);
7461
7462 Qpost_read_conversion = intern ("post-read-conversion");
7463 staticpro (&Qpost_read_conversion);
7464
7465 Qpre_write_conversion = intern ("pre-write-conversion");
7466 staticpro (&Qpre_write_conversion);
7467
27901516
KH
7468 Qno_conversion = intern ("no-conversion");
7469 staticpro (&Qno_conversion);
7470
7471 Qundecided = intern ("undecided");
7472 staticpro (&Qundecided);
7473
4ed46869
KH
7474 Qcoding_system_p = intern ("coding-system-p");
7475 staticpro (&Qcoding_system_p);
7476
7477 Qcoding_system_error = intern ("coding-system-error");
7478 staticpro (&Qcoding_system_error);
7479
7480 Fput (Qcoding_system_error, Qerror_conditions,
7481 Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
7482 Fput (Qcoding_system_error, Qerror_message,
9ce27fde 7483 build_string ("Invalid coding system"));
4ed46869 7484
d46c5b12
KH
7485 Qcoding_category = intern ("coding-category");
7486 staticpro (&Qcoding_category);
4ed46869
KH
7487 Qcoding_category_index = intern ("coding-category-index");
7488 staticpro (&Qcoding_category_index);
7489
d46c5b12
KH
7490 Vcoding_category_table
7491 = Fmake_vector (make_number (CODING_CATEGORY_IDX_MAX), Qnil);
7492 staticpro (&Vcoding_category_table);
4ed46869
KH
7493 {
7494 int i;
7495 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
7496 {
d46c5b12
KH
7497 XVECTOR (Vcoding_category_table)->contents[i]
7498 = intern (coding_category_name[i]);
7499 Fput (XVECTOR (Vcoding_category_table)->contents[i],
7500 Qcoding_category_index, make_number (i));
4ed46869
KH
7501 }
7502 }
7503
6b89e3aa
KH
7504 Vcoding_system_safe_chars = Fcons (Qnil, Qnil);
7505 staticpro (&Vcoding_system_safe_chars);
7506
f967223b
KH
7507 Qtranslation_table = intern ("translation-table");
7508 staticpro (&Qtranslation_table);
1397dc18 7509 Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (1));
bdd9fb48 7510
f967223b
KH
7511 Qtranslation_table_id = intern ("translation-table-id");
7512 staticpro (&Qtranslation_table_id);
84fbb8a0 7513
f967223b
KH
7514 Qtranslation_table_for_decode = intern ("translation-table-for-decode");
7515 staticpro (&Qtranslation_table_for_decode);
a5d301df 7516
f967223b
KH
7517 Qtranslation_table_for_encode = intern ("translation-table-for-encode");
7518 staticpro (&Qtranslation_table_for_encode);
a5d301df 7519
05e6f5dc
KH
7520 Qsafe_chars = intern ("safe-chars");
7521 staticpro (&Qsafe_chars);
7522
7523 Qchar_coding_system = intern ("char-coding-system");
7524 staticpro (&Qchar_coding_system);
7525
7526 /* Intern this now in case it isn't already done.
7527 Setting this variable twice is harmless.
7528 But don't staticpro it here--that is done in alloc.c. */
7529 Qchar_table_extra_slots = intern ("char-table-extra-slots");
7530 Fput (Qsafe_chars, Qchar_table_extra_slots, make_number (0));
0192762c 7531 Fput (Qchar_coding_system, Qchar_table_extra_slots, make_number (2));
70c22245 7532
1397dc18
KH
7533 Qvalid_codes = intern ("valid-codes");
7534 staticpro (&Qvalid_codes);
7535
9ce27fde
KH
7536 Qemacs_mule = intern ("emacs-mule");
7537 staticpro (&Qemacs_mule);
7538
d46c5b12
KH
7539 Qraw_text = intern ("raw-text");
7540 staticpro (&Qraw_text);
7541
4ed46869
KH
7542 defsubr (&Scoding_system_p);
7543 defsubr (&Sread_coding_system);
7544 defsubr (&Sread_non_nil_coding_system);
7545 defsubr (&Scheck_coding_system);
7546 defsubr (&Sdetect_coding_region);
d46c5b12 7547 defsubr (&Sdetect_coding_string);
05e6f5dc 7548 defsubr (&Sfind_coding_systems_region_internal);
6b89e3aa 7549 defsubr (&Sfind_coding_systems_region_internal_2);
068a9dbd 7550 defsubr (&Sunencodable_char_position);
4ed46869
KH
7551 defsubr (&Sdecode_coding_region);
7552 defsubr (&Sencode_coding_region);
7553 defsubr (&Sdecode_coding_string);
7554 defsubr (&Sencode_coding_string);
7555 defsubr (&Sdecode_sjis_char);
7556 defsubr (&Sencode_sjis_char);
7557 defsubr (&Sdecode_big5_char);
7558 defsubr (&Sencode_big5_char);
1ba9e4ab 7559 defsubr (&Sset_terminal_coding_system_internal);
c4825358 7560 defsubr (&Sset_safe_terminal_coding_system_internal);
4ed46869 7561 defsubr (&Sterminal_coding_system);
1ba9e4ab 7562 defsubr (&Sset_keyboard_coding_system_internal);
4ed46869 7563 defsubr (&Skeyboard_coding_system);
a5d301df 7564 defsubr (&Sfind_operation_coding_system);
1397dc18 7565 defsubr (&Supdate_coding_systems_internal);
66cfb530 7566 defsubr (&Sset_coding_priority_internal);
6b89e3aa 7567 defsubr (&Sdefine_coding_system_internal);
4ed46869 7568
4608c386 7569 DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
48b0f3ae
PJ
7570 doc: /* List of coding systems.
7571
7572Do not alter the value of this variable manually. This variable should be
7573updated by the functions `make-coding-system' and
7574`define-coding-system-alias'. */);
4608c386
KH
7575 Vcoding_system_list = Qnil;
7576
7577 DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
48b0f3ae
PJ
7578 doc: /* Alist of coding system names.
7579Each element is one element list of coding system name.
7580This variable is given to `completing-read' as TABLE argument.
7581
7582Do not alter the value of this variable manually. This variable should be
7583updated by the functions `make-coding-system' and
7584`define-coding-system-alias'. */);
4608c386
KH
7585 Vcoding_system_alist = Qnil;
7586
4ed46869 7587 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
48b0f3ae
PJ
7588 doc: /* List of coding-categories (symbols) ordered by priority.
7589
7590On detecting a coding system, Emacs tries code detection algorithms
7591associated with each coding-category one by one in this order. When
7592one algorithm agrees with a byte sequence of source text, the coding
7593system bound to the corresponding coding-category is selected. */);
4ed46869
KH
7594 {
7595 int i;
7596
7597 Vcoding_category_list = Qnil;
7598 for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
7599 Vcoding_category_list
d46c5b12
KH
7600 = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
7601 Vcoding_category_list);
4ed46869
KH
7602 }
7603
7604 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
48b0f3ae
PJ
7605 doc: /* Specify the coding system for read operations.
7606It is useful to bind this variable with `let', but do not set it globally.
7607If the value is a coding system, it is used for decoding on read operation.
7608If not, an appropriate element is used from one of the coding system alists:
7609There are three such tables, `file-coding-system-alist',
7610`process-coding-system-alist', and `network-coding-system-alist'. */);
4ed46869
KH
7611 Vcoding_system_for_read = Qnil;
7612
7613 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
48b0f3ae
PJ
7614 doc: /* Specify the coding system for write operations.
7615Programs bind this variable with `let', but you should not set it globally.
7616If the value is a coding system, it is used for encoding of output,
7617when writing it to a file and when sending it to a file or subprocess.
7618
7619If this does not specify a coding system, an appropriate element
7620is used from one of the coding system alists:
7621There are three such tables, `file-coding-system-alist',
7622`process-coding-system-alist', and `network-coding-system-alist'.
7623For output to files, if the above procedure does not specify a coding system,
7624the value of `buffer-file-coding-system' is used. */);
4ed46869
KH
7625 Vcoding_system_for_write = Qnil;
7626
7627 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
48b0f3ae 7628 doc: /* Coding system used in the latest file or process I/O. */);
4ed46869
KH
7629 Vlast_coding_system_used = Qnil;
7630
9ce27fde 7631 DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
48b0f3ae
PJ
7632 doc: /* *Non-nil means always inhibit code conversion of end-of-line format.
7633See info node `Coding Systems' and info node `Text and Binary' concerning
7634such conversion. */);
9ce27fde
KH
7635 inhibit_eol_conversion = 0;
7636
ed29121d 7637 DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
48b0f3ae
PJ
7638 doc: /* Non-nil means process buffer inherits coding system of process output.
7639Bind it to t if the process output is to be treated as if it were a file
7640read from some filesystem. */);
ed29121d
EZ
7641 inherit_process_coding_system = 0;
7642
02ba4723 7643 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
48b0f3ae
PJ
7644 doc: /* Alist to decide a coding system to use for a file I/O operation.
7645The format is ((PATTERN . VAL) ...),
7646where PATTERN is a regular expression matching a file name,
7647VAL is a coding system, a cons of coding systems, or a function symbol.
7648If VAL is a coding system, it is used for both decoding and encoding
7649the file contents.
7650If VAL is a cons of coding systems, the car part is used for decoding,
7651and the cdr part is used for encoding.
7652If VAL is a function symbol, the function must return a coding system
0192762c 7653or a cons of coding systems which are used as above. The function gets
ff955d90 7654the arguments with which `find-operation-coding-system' was called.
48b0f3ae
PJ
7655
7656See also the function `find-operation-coding-system'
7657and the variable `auto-coding-alist'. */);
02ba4723
KH
7658 Vfile_coding_system_alist = Qnil;
7659
7660 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
48b0f3ae
PJ
7661 doc: /* Alist to decide a coding system to use for a process I/O operation.
7662The format is ((PATTERN . VAL) ...),
7663where PATTERN is a regular expression matching a program name,
7664VAL is a coding system, a cons of coding systems, or a function symbol.
7665If VAL is a coding system, it is used for both decoding what received
7666from the program and encoding what sent to the program.
7667If VAL is a cons of coding systems, the car part is used for decoding,
7668and the cdr part is used for encoding.
7669If VAL is a function symbol, the function must return a coding system
7670or a cons of coding systems which are used as above.
7671
7672See also the function `find-operation-coding-system'. */);
02ba4723
KH
7673 Vprocess_coding_system_alist = Qnil;
7674
7675 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
48b0f3ae
PJ
7676 doc: /* Alist to decide a coding system to use for a network I/O operation.
7677The format is ((PATTERN . VAL) ...),
7678where PATTERN is a regular expression matching a network service name
7679or is a port number to connect to,
7680VAL is a coding system, a cons of coding systems, or a function symbol.
7681If VAL is a coding system, it is used for both decoding what received
7682from the network stream and encoding what sent to the network stream.
7683If VAL is a cons of coding systems, the car part is used for decoding,
7684and the cdr part is used for encoding.
7685If VAL is a function symbol, the function must return a coding system
7686or a cons of coding systems which are used as above.
7687
7688See also the function `find-operation-coding-system'. */);
02ba4723 7689 Vnetwork_coding_system_alist = Qnil;
4ed46869 7690
68c45bf0 7691 DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
75205970
RS
7692 doc: /* Coding system to use with system messages.
7693Also used for decoding keyboard input on X Window system. */);
68c45bf0
PE
7694 Vlocale_coding_system = Qnil;
7695
005f0d35 7696 /* The eol mnemonics are reset in startup.el system-dependently. */
7722baf9 7697 DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
48b0f3ae 7698 doc: /* *String displayed in mode line for UNIX-like (LF) end-of-line format. */);
7722baf9 7699 eol_mnemonic_unix = build_string (":");
4ed46869 7700
7722baf9 7701 DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
48b0f3ae 7702 doc: /* *String displayed in mode line for DOS-like (CRLF) end-of-line format. */);
7722baf9 7703 eol_mnemonic_dos = build_string ("\\");
4ed46869 7704
7722baf9 7705 DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
48b0f3ae 7706 doc: /* *String displayed in mode line for MAC-like (CR) end-of-line format. */);
7722baf9 7707 eol_mnemonic_mac = build_string ("/");
4ed46869 7708
7722baf9 7709 DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
48b0f3ae 7710 doc: /* *String displayed in mode line when end-of-line format is not yet determined. */);
7722baf9 7711 eol_mnemonic_undecided = build_string (":");
4ed46869 7712
84fbb8a0 7713 DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
48b0f3ae 7714 doc: /* *Non-nil enables character translation while encoding and decoding. */);
84fbb8a0 7715 Venable_character_translation = Qt;
bdd9fb48 7716
f967223b 7717 DEFVAR_LISP ("standard-translation-table-for-decode",
48b0f3ae
PJ
7718 &Vstandard_translation_table_for_decode,
7719 doc: /* Table for translating characters while decoding. */);
f967223b 7720 Vstandard_translation_table_for_decode = Qnil;
bdd9fb48 7721
f967223b 7722 DEFVAR_LISP ("standard-translation-table-for-encode",
48b0f3ae
PJ
7723 &Vstandard_translation_table_for_encode,
7724 doc: /* Table for translating characters while encoding. */);
f967223b 7725 Vstandard_translation_table_for_encode = Qnil;
4ed46869
KH
7726
7727 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
48b0f3ae
PJ
7728 doc: /* Alist of charsets vs revision numbers.
7729While encoding, if a charset (car part of an element) is found,
7730designate it with the escape sequence identifying revision (cdr part of the element). */);
4ed46869 7731 Vcharset_revision_alist = Qnil;
02ba4723
KH
7732
7733 DEFVAR_LISP ("default-process-coding-system",
7734 &Vdefault_process_coding_system,
48b0f3ae
PJ
7735 doc: /* Cons of coding systems used for process I/O by default.
7736The car part is used for decoding a process output,
7737the cdr part is used for encoding a text to be sent to a process. */);
02ba4723 7738 Vdefault_process_coding_system = Qnil;
c4825358 7739
3f003981 7740 DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
48b0f3ae
PJ
7741 doc: /* Table of extra Latin codes in the range 128..159 (inclusive).
7742This is a vector of length 256.
7743If Nth element is non-nil, the existence of code N in a file
7744\(or output of subprocess) doesn't prevent it to be detected as
7745a coding system of ISO 2022 variant which has a flag
7746`accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
7747or reading output of a subprocess.
7748Only 128th through 159th elements has a meaning. */);
3f003981 7749 Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
d46c5b12
KH
7750
7751 DEFVAR_LISP ("select-safe-coding-system-function",
7752 &Vselect_safe_coding_system_function,
48b0f3ae
PJ
7753 doc: /* Function to call to select safe coding system for encoding a text.
7754
7755If set, this function is called to force a user to select a proper
7756coding system which can encode the text in the case that a default
7757coding system used in each operation can't encode the text.
7758
7759The default value is `select-safe-coding-system' (which see). */);
d46c5b12
KH
7760 Vselect_safe_coding_system_function = Qnil;
7761
5d5bf4d8
KH
7762 DEFVAR_BOOL ("coding-system-require-warning",
7763 &coding_system_require_warning,
7764 doc: /* Internal use only.
6b89e3aa
KH
7765If non-nil, on writing a file, `select-safe-coding-system-function' is
7766called even if `coding-system-for-write' is non-nil. The command
7767`universal-coding-system-argument' binds this variable to t temporarily. */);
5d5bf4d8
KH
7768 coding_system_require_warning = 0;
7769
7770
05e6f5dc 7771 DEFVAR_LISP ("char-coding-system-table", &Vchar_coding_system_table,
48b0f3ae
PJ
7772 doc: /* Char-table containing safe coding systems of each characters.
7773Each element doesn't include such generic coding systems that can
5f90b4fb 7774encode any characters. They are in the first extra slot. */);
05e6f5dc
KH
7775 Vchar_coding_system_table = Fmake_char_table (Qchar_coding_system, Qnil);
7776
22ab2303 7777 DEFVAR_BOOL ("inhibit-iso-escape-detection",
74383408 7778 &inhibit_iso_escape_detection,
48b0f3ae
PJ
7779 doc: /* If non-nil, Emacs ignores ISO2022's escape sequence on code detection.
7780
7781By default, on reading a file, Emacs tries to detect how the text is
7782encoded. This code detection is sensitive to escape sequences. If
7783the sequence is valid as ISO2022, the code is determined as one of
7784the ISO2022 encodings, and the file is decoded by the corresponding
7785coding system (e.g. `iso-2022-7bit').
7786
7787However, there may be a case that you want to read escape sequences in
7788a file as is. In such a case, you can set this variable to non-nil.
7789Then, as the code detection ignores any escape sequences, no file is
7790detected as encoded in some ISO2022 encoding. The result is that all
7791escape sequences become visible in a buffer.
7792
7793The default value is nil, and it is strongly recommended not to change
7794it. That is because many Emacs Lisp source files that contain
7795non-ASCII characters are encoded by the coding system `iso-2022-7bit'
7796in Emacs's distribution, and they won't be decoded correctly on
7797reading if you suppress escape sequence detection.
7798
7799The other way to read escape sequences in a file without decoding is
7800to explicitly specify some coding system that doesn't use ISO2022's
7801escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument]. */);
74383408 7802 inhibit_iso_escape_detection = 0;
002fdb44
DL
7803
7804 DEFVAR_LISP ("translation-table-for-input", &Vtranslation_table_for_input,
15c8f9d1
DL
7805 doc: /* Char table for translating self-inserting characters.
7806This is applied to the result of input methods, not their input. See also
7807`keyboard-translate-table'. */);
002fdb44 7808 Vtranslation_table_for_input = Qnil;
4ed46869
KH
7809}
7810
68c45bf0
PE
7811char *
7812emacs_strerror (error_number)
7813 int error_number;
7814{
7815 char *str;
7816
ca9c0567 7817 synchronize_system_messages_locale ();
68c45bf0
PE
7818 str = strerror (error_number);
7819
7820 if (! NILP (Vlocale_coding_system))
7821 {
7822 Lisp_Object dec = code_convert_string_norecord (build_string (str),
7823 Vlocale_coding_system,
7824 0);
d5db4077 7825 str = (char *) SDATA (dec);
68c45bf0
PE
7826 }
7827
7828 return str;
7829}
7830
4ed46869 7831#endif /* emacs */
c2f94ebc 7832