Merged in changes from CVS trunk.
[bpt/emacs.git] / src / coding.c
CommitLineData
4ed46869 1/* Coding system handler (conversion, detection, and etc).
f1ce3dcf 2 Copyright (C) 1995,97,1998,2002,2003 Electrotechnical Laboratory, JAPAN.
203cb916 3 Licensed to the Free Software Foundation.
58f99379 4 Copyright (C) 2001,2002,2003 Free Software Foundation, Inc.
4ed46869 5
369314dc
KH
6This file is part of GNU Emacs.
7
8GNU Emacs is free software; you can redistribute it and/or modify
9it under the terms of the GNU General Public License as published by
10the Free Software Foundation; either version 2, or (at your option)
11any later version.
4ed46869 12
369314dc
KH
13GNU Emacs is distributed in the hope that it will be useful,
14but WITHOUT ANY WARRANTY; without even the implied warranty of
15MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16GNU General Public License for more details.
4ed46869 17
369314dc
KH
18You should have received a copy of the GNU General Public License
19along with GNU Emacs; see the file COPYING. If not, write to
20the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
21Boston, MA 02111-1307, USA. */
4ed46869
KH
22
23/*** TABLE OF CONTENTS ***
24
b73bfc1c 25 0. General comments
4ed46869 26 1. Preamble
0ef69138 27 2. Emacs' internal format (emacs-mule) handlers
4ed46869
KH
28 3. ISO2022 handlers
29 4. Shift-JIS and BIG5 handlers
1397dc18
KH
30 5. CCL handlers
31 6. End-of-line handlers
32 7. C library functions
33 8. Emacs Lisp library functions
34 9. Post-amble
4ed46869
KH
35
36*/
37
b73bfc1c
KH
38/*** 0. General comments ***/
39
40
cfb43547 41/*** GENERAL NOTE on CODING SYSTEMS ***
4ed46869 42
cfb43547 43 A coding system is an encoding mechanism for one or more character
4ed46869
KH
44 sets. Here's a list of coding systems which Emacs can handle. When
45 we say "decode", it means converting some other coding system to
cfb43547 46 Emacs' internal format (emacs-mule), and when we say "encode",
0ef69138
KH
47 it means converting the coding system emacs-mule to some other
48 coding system.
4ed46869 49
0ef69138 50 0. Emacs' internal format (emacs-mule)
4ed46869 51
cfb43547 52 Emacs itself holds a multi-lingual character in buffers and strings
f4dee582 53 in a special format. Details are described in section 2.
4ed46869
KH
54
55 1. ISO2022
56
57 The most famous coding system for multiple character sets. X's
f4dee582
RS
58 Compound Text, various EUCs (Extended Unix Code), and coding
59 systems used in Internet communication such as ISO-2022-JP are
60 all variants of ISO2022. Details are described in section 3.
4ed46869
KH
61
62 2. SJIS (or Shift-JIS or MS-Kanji-Code)
93dec019 63
4ed46869
KH
64 A coding system to encode character sets: ASCII, JISX0201, and
65 JISX0208. Widely used for PC's in Japan. Details are described in
f4dee582 66 section 4.
4ed46869
KH
67
68 3. BIG5
69
cfb43547
DL
70 A coding system to encode the character sets ASCII and Big5. Widely
71 used for Chinese (mainly in Taiwan and Hong Kong). Details are
f4dee582
RS
72 described in section 4. In this file, when we write "BIG5"
73 (all uppercase), we mean the coding system, and when we write
74 "Big5" (capitalized), we mean the character set.
4ed46869 75
27901516
KH
76 4. Raw text
77
cfb43547
DL
78 A coding system for text containing random 8-bit code. Emacs does
79 no code conversion on such text except for end-of-line format.
27901516
KH
80
81 5. Other
4ed46869 82
cfb43547
DL
83 If a user wants to read/write text encoded in a coding system not
84 listed above, he can supply a decoder and an encoder for it as CCL
4ed46869
KH
85 (Code Conversion Language) programs. Emacs executes the CCL program
86 while reading/writing.
87
d46c5b12
KH
88 Emacs represents a coding system by a Lisp symbol that has a property
89 `coding-system'. But, before actually using the coding system, the
4ed46869 90 information about it is set in a structure of type `struct
f4dee582 91 coding_system' for rapid processing. See section 6 for more details.
4ed46869
KH
92
93*/
94
95/*** GENERAL NOTES on END-OF-LINE FORMAT ***
96
cfb43547
DL
97 How end-of-line of text is encoded depends on the operating system.
98 For instance, Unix's format is just one byte of `line-feed' code,
f4dee582 99 whereas DOS's format is two-byte sequence of `carriage-return' and
d46c5b12
KH
100 `line-feed' codes. MacOS's format is usually one byte of
101 `carriage-return'.
4ed46869 102
cfb43547
DL
103 Since text character encoding and end-of-line encoding are
104 independent, any coding system described above can have any
105 end-of-line format. So Emacs has information about end-of-line
106 format in each coding-system. See section 6 for more details.
4ed46869
KH
107
108*/
109
110/*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
111
112 These functions check if a text between SRC and SRC_END is encoded
113 in the coding system category XXX. Each returns an integer value in
cfb43547 114 which appropriate flag bits for the category XXX are set. The flag
4ed46869 115 bits are defined in macros CODING_CATEGORY_MASK_XXX. Below is the
cfb43547 116 template for these functions. If MULTIBYTEP is nonzero, 8-bit codes
0a28aafb 117 of the range 0x80..0x9F are in multibyte form. */
4ed46869
KH
118#if 0
119int
0a28aafb 120detect_coding_emacs_mule (src, src_end, multibytep)
4ed46869 121 unsigned char *src, *src_end;
0a28aafb 122 int multibytep;
4ed46869
KH
123{
124 ...
125}
126#endif
127
128/*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
129
b73bfc1c
KH
130 These functions decode SRC_BYTES length of unibyte text at SOURCE
131 encoded in CODING to Emacs' internal format. The resulting
132 multibyte text goes to a place pointed to by DESTINATION, the length
133 of which should not exceed DST_BYTES.
d46c5b12 134
cfb43547
DL
135 These functions set the information about original and decoded texts
136 in the members `produced', `produced_char', `consumed', and
137 `consumed_char' of the structure *CODING. They also set the member
138 `result' to one of CODING_FINISH_XXX indicating how the decoding
139 finished.
d46c5b12 140
cfb43547 141 DST_BYTES zero means that the source area and destination area are
d46c5b12 142 overlapped, which means that we can produce a decoded text until it
cfb43547 143 reaches the head of the not-yet-decoded source text.
d46c5b12 144
cfb43547 145 Below is a template for these functions. */
4ed46869 146#if 0
b73bfc1c 147static void
d46c5b12 148decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
4ed46869
KH
149 struct coding_system *coding;
150 unsigned char *source, *destination;
151 int src_bytes, dst_bytes;
4ed46869
KH
152{
153 ...
154}
155#endif
156
157/*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
158
cfb43547 159 These functions encode SRC_BYTES length text at SOURCE from Emacs'
b73bfc1c
KH
160 internal multibyte format to CODING. The resulting unibyte text
161 goes to a place pointed to by DESTINATION, the length of which
162 should not exceed DST_BYTES.
d46c5b12 163
cfb43547
DL
164 These functions set the information about original and encoded texts
165 in the members `produced', `produced_char', `consumed', and
166 `consumed_char' of the structure *CODING. They also set the member
167 `result' to one of CODING_FINISH_XXX indicating how the encoding
168 finished.
d46c5b12 169
cfb43547
DL
170 DST_BYTES zero means that the source area and destination area are
171 overlapped, which means that we can produce encoded text until it
172 reaches the head of the not-yet-encoded source text.
d46c5b12 173
cfb43547 174 Below is a template for these functions. */
4ed46869 175#if 0
b73bfc1c 176static void
d46c5b12 177encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
4ed46869
KH
178 struct coding_system *coding;
179 unsigned char *source, *destination;
180 int src_bytes, dst_bytes;
4ed46869
KH
181{
182 ...
183}
184#endif
185
186/*** COMMONLY USED MACROS ***/
187
b73bfc1c
KH
188/* The following two macros ONE_MORE_BYTE and TWO_MORE_BYTES safely
189 get one and two bytes from the source text respectively.
190 If there are not enough bytes in the source, they jump to
191 `label_end_of_loop'. The caller should set variables `coding',
192 `src' and `src_end' to appropriate pointers in advance. These
193 macros are called from decoding routines `decode_coding_XXX', thus
194 it is assumed that the source text is unibyte. */
4ed46869 195
b73bfc1c
KH
/* Fetch the next source byte into C1 and advance `src'.  If the
   source is exhausted, store CODING_FINISH_INSUFFICIENT_SRC in
   `coding->result' and jump to `label_end_of_loop'.  */
#define ONE_MORE_BYTE(c1)                                       \
  do {                                                          \
    if (src < src_end)                                          \
      {                                                         \
        c1 = *src++;                                            \
      }                                                         \
    else                                                        \
      {                                                         \
        coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
        goto label_end_of_loop;                                 \
      }                                                         \
  } while (0)
205
b73bfc1c
KH
/* Fetch the next two source bytes into C1 and C2 and advance `src'
   past them.  If fewer than two bytes remain, store
   CODING_FINISH_INSUFFICIENT_SRC in `coding->result' and jump to
   `label_end_of_loop' without consuming anything.  */
#define TWO_MORE_BYTES(c1, c2)                                  \
  do {                                                          \
    if (src + 1 < src_end)                                      \
      {                                                         \
        c1 = *src++;                                            \
        c2 = *src++;                                            \
      }                                                         \
    else                                                        \
      {                                                         \
        coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
        goto label_end_of_loop;                                 \
      }                                                         \
  } while (0)
216
4ed46869 217
0a28aafb
KH
218/* Like ONE_MORE_BYTE, but 8-bit bytes of data at SRC are in multibyte
219 form if MULTIBYTEP is nonzero. */
220
/* Fetch the next source byte into C1 like ONE_MORE_BYTE.  In
   addition, when MULTIBYTEP is nonzero and the byte just read is
   LEADING_CODE_8_BIT_CONTROL, consume the following byte too and set
   C1 to that byte minus 0x20.

   Both reads are bounds-checked against `src_end': if the source
   ends right after the leading code, CODING_FINISH_INSUFFICIENT_SRC
   is stored in `coding->result' and control jumps to
   `label_end_of_loop' instead of reading past the end.  */
#define ONE_MORE_BYTE_CHECK_MULTIBYTE(c1, multibytep)           \
  do {                                                          \
    if (src >= src_end)                                         \
      {                                                         \
        coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
        goto label_end_of_loop;                                 \
      }                                                         \
    c1 = *src++;                                                \
    if ((multibytep) && c1 == LEADING_CODE_8_BIT_CONTROL)       \
      {                                                         \
        /* The trailing byte must exist before it is read.  */  \
        if (src >= src_end)                                     \
          {                                                     \
            coding->result = CODING_FINISH_INSUFFICIENT_SRC;    \
            goto label_end_of_loop;                             \
          }                                                     \
        c1 = *src++ - 0x20;                                     \
      }                                                         \
  } while (0)
232
b73bfc1c
KH
233/* Set C to the next character at the source text pointed by `src'.
234 If there are not enough characters in the source, jump to
235 `label_end_of_loop'. The caller should set variables `coding'
236 `src', `src_end', and `translation_table' to appropriate pointers
237 in advance. This macro is used in encoding routines
238 `encode_coding_XXX', thus it assumes that the source text is in
239 multibyte form except for 8-bit characters. 8-bit characters are
240 in multibyte form if coding->src_multibyte is nonzero, else they
241 are represented by a single byte. */
4ed46869 242
b73bfc1c
KH
/* Read the next character at `src' into C, pass it through
   `translation_table' when that is non-nil, and advance `src' by the
   number of bytes consumed.  With no bytes left, store
   CODING_FINISH_INSUFFICIENT_SRC in `coding->result' and jump to
   `label_end_of_loop'.  */
#define ONE_MORE_CHAR(c)                                        \
  do {                                                          \
    int len = src_end - src;                                    \
    int bytes;                                                  \
                                                                \
    if (len <= 0)                                               \
      {                                                         \
        coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
        goto label_end_of_loop;                                 \
      }                                                         \
    if (! coding->src_multibyte                                 \
        && ! UNIBYTE_STR_AS_MULTIBYTE_P (src, len, bytes))      \
      {                                                         \
        /* A raw byte: it stands for itself.  */                \
        c = *src;                                               \
        bytes = 1;                                              \
      }                                                         \
    else                                                        \
      {                                                         \
        c = STRING_CHAR_AND_LENGTH (src, len, bytes);           \
      }                                                         \
    if (!NILP (translation_table))                              \
      {                                                         \
        c = translate_char (translation_table, c, -1, 0, 0);    \
      }                                                         \
    src += bytes;                                               \
  } while (0)
261
4ed46869 262
8ca3766a 263/* Produce a multibyte form of character C to `dst'. Jump to
b73bfc1c
KH
264 `label_end_of_loop' if there's not enough space at `dst'.
265
cfb43547 266 If we are now in the middle of a composition sequence, the decoded
b73bfc1c
KH
267 character may be ALTCHAR (for the current composition). In that
268 case, the character goes to coding->cmp_data->data instead of
269 `dst'.
270
271 This macro is used in decoding routines. */
272
#define EMIT_CHAR(c)                                                    \
  do {                                                                  \
    /* Not composing, or composing in RELATIVE / WITH_RULE mode: the    \
       multibyte form of C is written to `dst'.  */                     \
    if (! COMPOSING_P (coding)                                          \
        || coding->composing == COMPOSITION_RELATIVE                    \
        || coding->composing == COMPOSITION_WITH_RULE)                  \
      {                                                                 \
        int bytes = CHAR_BYTES (c);                                     \
        /* When DST_BYTES is zero the limit is `src' instead of         \
           `dst_end'.  */                                               \
        if ((dst + bytes) > (dst_bytes ? dst_end : src))                \
          {                                                             \
            coding->result = CODING_FINISH_INSUFFICIENT_DST;            \
            goto label_end_of_loop;                                     \
          }                                                             \
        dst += CHAR_STRING (c, dst);                                    \
        coding->produced_char++;                                        \
      }                                                                 \
                                                                        \
    /* Composing in any mode other than RELATIVE: C is also recorded    \
       as a composition component.  `composition_rule_follows' is       \
       then set from the value of `coding->composing' as it stands      \
       after that call.  */                                             \
    if (COMPOSING_P (coding)                                            \
        && coding->composing != COMPOSITION_RELATIVE)                   \
      {                                                                 \
        CODING_ADD_COMPOSITION_COMPONENT (coding, c);                   \
        coding->composition_rule_follows                                \
          = coding->composing != COMPOSITION_WITH_ALTCHARS;             \
      }                                                                 \
  } while (0)
297
4ed46869 298
b73bfc1c
KH
/* Append the single byte C at `dst'.  If `dst' has reached its limit
   (`dst_end', or `src' when `dst_bytes' is zero), store
   CODING_FINISH_INSUFFICIENT_DST in `coding->result' and jump to
   `label_end_of_loop'.  */
#define EMIT_ONE_BYTE(c)                                        \
  do {                                                          \
    if (dst < (dst_bytes ? dst_end : src))                      \
      {                                                         \
        *dst++ = c;                                             \
      }                                                         \
    else                                                        \
      {                                                         \
        coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
        goto label_end_of_loop;                                 \
      }                                                         \
  } while (0)
308
/* Append the bytes C1 and C2 at `dst'.  If fewer than two bytes fit
   before the limit (`dst_end', or `src' when `dst_bytes' is zero),
   store CODING_FINISH_INSUFFICIENT_DST in `coding->result' and jump
   to `label_end_of_loop' without writing anything.  */
#define EMIT_TWO_BYTES(c1, c2)                                  \
  do {                                                          \
    if (dst + 2 <= (dst_bytes ? dst_end : src))                 \
      {                                                         \
        *dst++ = c1;                                            \
        *dst++ = c2;                                            \
      }                                                         \
    else                                                        \
      {                                                         \
        coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
        goto label_end_of_loop;                                 \
      }                                                         \
  } while (0)
318
/* Copy the bytes in [FROM, TO) to `dst', advancing both FROM and
   `dst'.  If they do not all fit before the limit (`dst_end', or
   `src' when `dst_bytes' is zero), store
   CODING_FINISH_INSUFFICIENT_DST in `coding->result' and jump to
   `label_end_of_loop' without copying anything.  */
#define EMIT_BYTES(from, to)                                    \
  do {                                                          \
    if (dst + (to - from) > (dst_bytes ? dst_end : src))        \
      {                                                         \
        coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
        goto label_end_of_loop;                                 \
      }                                                         \
    for (; from < to; from++)                                   \
      *dst++ = *from;                                           \
  } while (0)
329
330\f
331/*** 1. Preamble ***/
332
68c45bf0
PE
333#ifdef emacs
334#include <config.h>
335#endif
336
4ed46869
KH
337#include <stdio.h>
338
339#ifdef emacs
340
4ed46869
KH
341#include "lisp.h"
342#include "buffer.h"
343#include "charset.h"
ec6d2bb8 344#include "composite.h"
4ed46869
KH
345#include "ccl.h"
346#include "coding.h"
347#include "window.h"
66638433 348#include "intervals.h"
b8299c66
KL
349#include "frame.h"
350#include "termhooks.h"
4ed46869
KH
351
352#else /* not emacs */
353
354#include "mulelib.h"
355
356#endif /* not emacs */
357
358Lisp_Object Qcoding_system, Qeol_type;
359Lisp_Object Qbuffer_file_coding_system;
360Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
27901516 361Lisp_Object Qno_conversion, Qundecided;
bb0115a2 362Lisp_Object Qcoding_system_history;
05e6f5dc 363Lisp_Object Qsafe_chars;
1397dc18 364Lisp_Object Qvalid_codes;
4ed46869
KH
365
366extern Lisp_Object Qinsert_file_contents, Qwrite_region;
367Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
368Lisp_Object Qstart_process, Qopen_network_stream;
369Lisp_Object Qtarget_idx;
370
a362520d
KH
371/* If a symbol has this property, evaluate the value to define the
372 symbol as a coding system. */
373Lisp_Object Qcoding_system_define_form;
374
d46c5b12
KH
375Lisp_Object Vselect_safe_coding_system_function;
376
5d5bf4d8
KH
377int coding_system_require_warning;
378
7722baf9
EZ
379/* Mnemonic string for each format of end-of-line. */
380Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
381/* Mnemonic string to indicate format of end-of-line is not yet
4ed46869 382 decided. */
7722baf9 383Lisp_Object eol_mnemonic_undecided;
4ed46869 384
9ce27fde
KH
385/* Format of end-of-line decided by system. This is CODING_EOL_LF on
386 Unix, CODING_EOL_CRLF on DOS/Windows, and CODING_EOL_CR on Mac. */
387int system_eol_type;
388
4ed46869
KH
389#ifdef emacs
390
6b89e3aa
KH
391/* Information about which coding system is safe for which chars.
392 The value has the form (GENERIC-LIST . NON-GENERIC-ALIST).
393
394 GENERIC-LIST is a list of generic coding systems which can encode
395 any characters.
396
397 NON-GENERIC-ALIST is an alist of non generic coding systems vs the
398 corresponding char table that contains safe chars. */
399Lisp_Object Vcoding_system_safe_chars;
400
4608c386
KH
401Lisp_Object Vcoding_system_list, Vcoding_system_alist;
402
403Lisp_Object Qcoding_system_p, Qcoding_system_error;
4ed46869 404
d46c5b12
KH
405/* Coding system emacs-mule and raw-text are for converting only
406 end-of-line format. */
407Lisp_Object Qemacs_mule, Qraw_text;
9ce27fde 408
ecf488bc
DL
409Lisp_Object Qutf_8;
410
4ed46869
KH
411/* Coding-systems are handed between Emacs Lisp programs and C internal
412 routines by the following three variables. */
413/* Coding-system for reading files and receiving data from process. */
414Lisp_Object Vcoding_system_for_read;
415/* Coding-system for writing files and sending data to process. */
416Lisp_Object Vcoding_system_for_write;
417/* Coding-system actually used in the latest I/O. */
418Lisp_Object Vlast_coding_system_used;
419
c4825358 420/* A vector of length 256 which contains information about special
94487c4e 421 Latin codes (especially for dealing with Microsoft codes). */
3f003981 422Lisp_Object Vlatin_extra_code_table;
c4825358 423
9ce27fde
KH
424/* Flag to inhibit code conversion of end-of-line format. */
425int inhibit_eol_conversion;
426
74383408
KH
427/* Flag to inhibit ISO2022 escape sequence detection. */
428int inhibit_iso_escape_detection;
429
ed29121d
EZ
430/* Flag to make buffer-file-coding-system inherit from process-coding. */
431int inherit_process_coding_system;
432
c4825358
KH
433/* Coding system to be used to encode text for terminal display when
434 terminal coding system is nil. */
435struct coding_system safe_terminal_coding;
436
6bc51348
KH
437/* Default coding system to be used to write a file. */
438struct coding_system default_buffer_file_coding;
439
02ba4723
KH
440Lisp_Object Vfile_coding_system_alist;
441Lisp_Object Vprocess_coding_system_alist;
442Lisp_Object Vnetwork_coding_system_alist;
4ed46869 443
68c45bf0
PE
444Lisp_Object Vlocale_coding_system;
445
4ed46869
KH
446#endif /* emacs */
447
d46c5b12 448Lisp_Object Qcoding_category, Qcoding_category_index;
4ed46869
KH
449
450/* List of symbols `coding-category-xxx' ordered by priority. */
451Lisp_Object Vcoding_category_list;
452
d46c5b12
KH
453/* Table of coding categories (Lisp symbols). */
454Lisp_Object Vcoding_category_table;
4ed46869
KH
455
456/* Table of names of symbol for each coding-category. */
457char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
0ef69138 458 "coding-category-emacs-mule",
4ed46869
KH
459 "coding-category-sjis",
460 "coding-category-iso-7",
d46c5b12 461 "coding-category-iso-7-tight",
4ed46869
KH
462 "coding-category-iso-8-1",
463 "coding-category-iso-8-2",
7717c392
KH
464 "coding-category-iso-7-else",
465 "coding-category-iso-8-else",
89fa8b36 466 "coding-category-ccl",
4ed46869 467 "coding-category-big5",
fa42c37f
KH
468 "coding-category-utf-8",
469 "coding-category-utf-16-be",
470 "coding-category-utf-16-le",
27901516 471 "coding-category-raw-text",
89fa8b36 472 "coding-category-binary"
4ed46869
KH
473};
474
66cfb530 475/* Table of pointers to coding systems corresponding to each coding
d46c5b12
KH
476 categories. */
477struct coding_system *coding_system_table[CODING_CATEGORY_IDX_MAX];
478
66cfb530 479/* Table of coding category masks. Nth element is a mask for a coding
8ca3766a 480 category of which priority is Nth. */
66cfb530
KH
481static
482int coding_priorities[CODING_CATEGORY_IDX_MAX];
483
f967223b
KH
484/* Flag to tell if we look up translation table on character code
485 conversion. */
84fbb8a0 486Lisp_Object Venable_character_translation;
f967223b
KH
487/* Standard translation table to look up on decoding (reading). */
488Lisp_Object Vstandard_translation_table_for_decode;
489/* Standard translation table to look up on encoding (writing). */
490Lisp_Object Vstandard_translation_table_for_encode;
84fbb8a0 491
f967223b
KH
492Lisp_Object Qtranslation_table;
493Lisp_Object Qtranslation_table_id;
494Lisp_Object Qtranslation_table_for_decode;
495Lisp_Object Qtranslation_table_for_encode;
4ed46869
KH
496
497/* Alist of charsets vs revision number. */
498Lisp_Object Vcharset_revision_alist;
499
02ba4723
KH
500/* Default coding systems used for process I/O. */
501Lisp_Object Vdefault_process_coding_system;
502
002fdb44
DL
503/* Char table for translating Quail and self-inserting input. */
504Lisp_Object Vtranslation_table_for_input;
505
b843d1ae
KH
506/* Global flag to tell that we can't call post-read-conversion and
507 pre-write-conversion functions. Usually the value is zero, but it
508 is set to 1 temporarily while such functions are running. This is
509 to avoid infinite recursive call. */
510static int inhibit_pre_post_conversion;
511
05e6f5dc
KH
512Lisp_Object Qchar_coding_system;
513
6b89e3aa
KH
514/* Return `safe-chars' property of CODING_SYSTEM (symbol). Don't check
515 its validity. */
05e6f5dc
KH
516
517Lisp_Object
6b89e3aa
KH
518coding_safe_chars (coding_system)
519 Lisp_Object coding_system;
05e6f5dc
KH
520{
521 Lisp_Object coding_spec, plist, safe_chars;
93dec019 522
6b89e3aa 523 coding_spec = Fget (coding_system, Qcoding_system);
05e6f5dc
KH
524 plist = XVECTOR (coding_spec)->contents[3];
525 safe_chars = Fplist_get (XVECTOR (coding_spec)->contents[3], Qsafe_chars);
526 return (CHAR_TABLE_P (safe_chars) ? safe_chars : Qt);
527}
528
/* Nonzero if character C counts as safe for SAFE_CHARS: either
   SAFE_CHARS is Qt, or the entry for C in SAFE_CHARS is non-nil.
   The Qt test comes first, so CHAR_TABLE_REF is applied only when
   SAFE_CHARS is not Qt.  */
#define CODING_SAFE_CHAR_P(safe_chars, c) \
  (EQ (safe_chars, Qt) || !NILP (CHAR_TABLE_REF (safe_chars, c)))
531
4ed46869 532\f
0ef69138 533/*** 2. Emacs internal format (emacs-mule) handlers ***/
4ed46869 534
aa72b389
KH
535/* Emacs' internal format for representation of multiple character
536 sets is a kind of multi-byte encoding, i.e. characters are
537 represented by variable-length sequences of one-byte codes.
b73bfc1c
KH
538
539 ASCII characters and control characters (e.g. `tab', `newline') are
540 represented by one-byte sequences which are their ASCII codes, in
541 the range 0x00 through 0x7F.
542
543 8-bit characters of the range 0x80..0x9F are represented by
544 two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
545 code + 0x20).
546
547 8-bit characters of the range 0xA0..0xFF are represented by
548 one-byte sequences which are their 8-bit code.
549
550 The other characters are represented by a sequence of `base
551 leading-code', optional `extended leading-code', and one or two
552 `position-code's. The length of the sequence is determined by the
aa72b389 553 base leading-code. Leading-code takes the range 0x81 through 0x9D,
b73bfc1c
KH
554 whereas extended leading-code and position-code take the range 0xA0
555 through 0xFF. See `charset.h' for more details about leading-code
556 and position-code.
f4dee582 557
4ed46869 558 --- CODE RANGE of Emacs' internal format ---
b73bfc1c
KH
559 character set range
560 ------------- -----
561 ascii 0x00..0x7F
562 eight-bit-control LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
563 eight-bit-graphic 0xA0..0xFF
aa72b389 564 ELSE 0x81..0x9D + [0xA0..0xFF]+
4ed46869
KH
565 ---------------------------------------------
566
aa72b389
KH
567 As this is the internal character representation, the format is
568 usually not used externally (i.e. in a file or in a data sent to a
569 process). But, it is possible to have a text externally in this
570 format (i.e. by encoding by the coding system `emacs-mule').
571
572 In that case, a sequence of one-byte codes has a slightly different
573 form.
574
ae5145c2 575 Firstly, all characters in eight-bit-control are represented by
aa72b389
KH
576 one-byte sequences which are their 8-bit code.
577
578 Next, character composition data are represented by the byte
579 sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
580 where,
581 METHOD is 0xF0 plus one of composition method (enum
582 composition_method),
583
ae5145c2 584 BYTES is 0xA0 plus the byte length of these composition data,
aa72b389 585
ae5145c2 586 CHARS is 0xA0 plus the number of characters composed by these
aa72b389
KH
587 data,
588
8ca3766a 589 COMPONENTs are characters of multibyte form or composition
aa72b389
KH
590 rules encoded by two-byte of ASCII codes.
591
592 In addition, for backward compatibility, the following formats are
593 also recognized as composition data on decoding.
594
595 0x80 MSEQ ...
596 0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
597
598 Here,
599 MSEQ is a multibyte form but in these special format:
600 ASCII: 0xA0 ASCII_CODE+0x80,
601 other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
602 RULE is a one byte code of the range 0xA0..0xF0 that
603 represents a composition rule.
4ed46869
KH
604 */
605
606enum emacs_code_class_type emacs_code_class[256];
607
4ed46869
KH
608/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
609 Check if a text is encoded in Emacs' internal format. If it is,
d46c5b12 610 return CODING_CATEGORY_MASK_EMACS_MULE, else return 0. */
4ed46869 611
0a28aafb
KH
612static int
613detect_coding_emacs_mule (src, src_end, multibytep)
b73bfc1c 614 unsigned char *src, *src_end;
0a28aafb 615 int multibytep;
4ed46869
KH
616{
617 unsigned char c;
618 int composing = 0;
b73bfc1c
KH
619 /* Dummy for ONE_MORE_BYTE. */
620 struct coding_system dummy_coding;
621 struct coding_system *coding = &dummy_coding;
4ed46869 622
b73bfc1c 623 while (1)
4ed46869 624 {
0a28aafb 625 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
4ed46869
KH
626
627 if (composing)
628 {
629 if (c < 0xA0)
630 composing = 0;
b73bfc1c
KH
631 else if (c == 0xA0)
632 {
0a28aafb 633 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
b73bfc1c
KH
634 c &= 0x7F;
635 }
4ed46869
KH
636 else
637 c -= 0x20;
638 }
639
b73bfc1c 640 if (c < 0x20)
4ed46869 641 {
4ed46869
KH
642 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
643 return 0;
b73bfc1c
KH
644 }
645 else if (c >= 0x80 && c < 0xA0)
646 {
647 if (c == 0x80)
648 /* Old leading code for a composite character. */
649 composing = 1;
650 else
651 {
652 unsigned char *src_base = src - 1;
653 int bytes;
4ed46869 654
b73bfc1c
KH
655 if (!UNIBYTE_STR_AS_MULTIBYTE_P (src_base, src_end - src_base,
656 bytes))
657 return 0;
658 src = src_base + bytes;
659 }
660 }
661 }
662 label_end_of_loop:
663 return CODING_CATEGORY_MASK_EMACS_MULE;
664}
4ed46869 665
4ed46869 666
aa72b389
KH
667/* Record the starting position START and METHOD of one composition. */
668
/* Open a new 4-int record at the current end of
   CODING->cmp_data->data: slot 0 is set to -1, slot 1 to the start
   position (char_offset + START), slot 3 to METHOD.  Slot 2 is left
   untouched here.  The record's index is remembered in
   CODING->cmp_data_start.  */
#define CODING_ADD_COMPOSITION_START(coding, start, method)     \
  do {                                                          \
    struct composition_data *cd = coding->cmp_data;             \
    int *record = &cd->data[cd->used];                          \
                                                                \
    coding->cmp_data_start = cd->used;                          \
    record[0] = -1;                                             \
    record[1] = cd->char_offset + start;                        \
    record[3] = (int) method;                                   \
    cd->used += 4;                                              \
  } while (0)
679
680/* Record the ending position END of the current composition. */
681
/* Close the record that starts at CODING->cmp_data_start: slot 0
   receives the record's length in ints, slot 2 the end position
   (char_offset + END).  */
#define CODING_ADD_COMPOSITION_END(coding, end)                 \
  do {                                                          \
    struct composition_data *cd = coding->cmp_data;             \
    int *record = &cd->data[coding->cmp_data_start];            \
                                                                \
    record[0] = cd->used - coding->cmp_data_start;              \
    record[2] = cd->char_offset + end;                          \
  } while (0)
689
690/* Record one COMPONENT (alternate character or composition rule). */
691
b6871cc7
KH
/* Append COMPONENT to CODING->cmp_data->data.  When the current
   record thereby reaches COMPOSITION_DATA_MAX_BUNCH_LENGTH ints,
   close it at CODING->produced_char and reset CODING->composing to
   COMPOSITION_NO.  */
#define CODING_ADD_COMPOSITION_COMPONENT(coding, component)             \
  do {                                                                  \
    coding->cmp_data->data[coding->cmp_data->used++] = component;       \
    if (COMPOSITION_DATA_MAX_BUNCH_LENGTH                               \
        == coding->cmp_data->used - coding->cmp_data_start)             \
      {                                                                 \
        CODING_ADD_COMPOSITION_END (coding, coding->produced_char);     \
        coding->composing = COMPOSITION_NO;                             \
      }                                                                 \
  } while (0)
aa72b389
KH
702
703
704/* Get one byte from the data pointed to by SRC and increment SRC. If SRC
8ca3766a 705 is not less than SRC_END, return -1 without incrementing SRC. */
aa72b389
KH
706
#define SAFE_ONE_MORE_BYTE() (src < src_end ? *src++ : -1)
708
709
710/* Decode a character represented as a component of composition
711 sequence of Emacs 20 style at SRC. Set C to that character, store
712 its multibyte form sequence at P, and set P to the end of that
713 sequence. If no valid character is found, set C to -1. */
714
/* Decode one component character of an Emacs 20 style composition
   sequence at `src'.  On success C is the character and its
   multibyte form has been appended at P (P is advanced).  C is set
   to -1 when the bytes do not form a valid component; C is negative
   too when the source is already exhausted.

   Two encodings are recognized:
     0xA0 ASCII_CODE+0x80           -- an ASCII character,
     LEADING_CODE+0x20 BYTE ...     -- any other character.  */
#define DECODE_EMACS_MULE_COMPOSITION_CHAR(c, p)                \
  do {                                                          \
    int bytes;                                                  \
                                                                \
    c = SAFE_ONE_MORE_BYTE ();                                  \
    if (c < 0)                                                  \
      break;                                                    \
    if (CHAR_HEAD_P (c))                                        \
      c = -1;                                                   \
    else if (c == 0xA0)                                         \
      {                                                         \
        c = SAFE_ONE_MORE_BYTE ();                              \
        if (c < 0xA0)                                           \
          c = -1;                                               \
        else                                                    \
          {                                                     \
            /* The byte is ASCII_CODE + 0x80; strip that 0x80   \
               offset to recover the ASCII code.  */            \
            c -= 0x80;                                          \
            *p++ = c;                                           \
          }                                                     \
      }                                                         \
    else if (BASE_LEADING_CODE_P (c - 0x20))                    \
      {                                                         \
        unsigned char *p0 = p;                                  \
                                                                \
        /* Undo the +0x20 shift on the leading code, then copy  \
           the bytes that belong to it.  */                     \
        c -= 0x20;                                              \
        *p++ = c;                                               \
        bytes = BYTES_BY_CHAR_HEAD (c);                         \
        while (--bytes)                                         \
          {                                                     \
            c = SAFE_ONE_MORE_BYTE ();                          \
            if (c < 0)                                          \
              break;                                            \
            *p++ = c;                                           \
          }                                                     \
        if (UNIBYTE_STR_AS_MULTIBYTE_P (p0, p - p0, bytes)      \
            || (coding->flags /* We are recovering a file.  */  \
                && p0[0] == LEADING_CODE_8_BIT_CONTROL          \
                && ! CHAR_HEAD_P (p0[1])))                      \
          c = STRING_CHAR (p0, bytes);                          \
        else                                                    \
          c = -1;                                               \
      }                                                         \
    else                                                        \
      c = -1;                                                   \
  } while (0)
760
761
762/* Decode a composition rule represented as a component of composition
763 sequence of Emacs 20 style at SRC. Set C to the rule. If no
764 valid rule is found, set C to -1. */
765
/* Read one rule byte from `src'.  A byte in 0xA0 .. 0xA0+80 is split
   into `gref' (quotient by 9) and `nref' (remainder by 9) and C is
   set to COMPOSITION_ENCODE_RULE (gref, nref); anything else,
   including an exhausted source, sets C to -1.  */
#define DECODE_EMACS_MULE_COMPOSITION_RULE(c)           \
  do {                                                  \
    c = SAFE_ONE_MORE_BYTE ();                          \
    c -= 0xA0;                                          \
    if (c >= 0 && c < 81)                               \
      {                                                 \
        gref = c / 9;                                   \
        nref = c % 9;                                   \
        c = COMPOSITION_ENCODE_RULE (gref, nref);       \
      }                                                 \
    else                                                \
      {                                                 \
        c = -1;                                         \
      }                                                 \
  } while (0)
778
779
780/* Decode composition sequence encoded by `emacs-mule' at the source
781 pointed by SRC. SRC_END is the end of source. Store information
782 of the composition in CODING->cmp_data.
783
784 For backward compatibility, decode also a composition sequence of
785 Emacs 20 style. In that case, the composition sequence contains
786 characters that should be extracted into a buffer or string. Store
787 those characters at *DESTINATION in multibyte form.
788
789 If we encounter an invalid byte sequence, return 0.
790 If we encounter an insufficient source or destination, or
791 insufficient space in CODING->cmp_data, return 1.
792 Otherwise, return consumed bytes in the source.
793
794*/
795static INLINE int
796decode_composition_emacs_mule (coding, src, src_end,
797 destination, dst_end, dst_bytes)
798 struct coding_system *coding;
799 unsigned char *src, *src_end, **destination, *dst_end;
800 int dst_bytes;
801{
802 unsigned char *dst = *destination;
803 int method, data_len, nchars;
804 unsigned char *src_base = src++;
8ca3766a 805 /* Store components of composition. */
aa72b389
KH
806 int component[COMPOSITION_DATA_MAX_BUNCH_LENGTH];
807 int ncomponent;
808 /* Store multibyte form of characters to be composed. This is for
809 Emacs 20 style composition sequence. */
810 unsigned char buf[MAX_COMPOSITION_COMPONENTS * MAX_MULTIBYTE_LENGTH];
811 unsigned char *bufp = buf;
812 int c, i, gref, nref;
813
814 if (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH
815 >= COMPOSITION_DATA_SIZE)
816 {
817 coding->result = CODING_FINISH_INSUFFICIENT_CMP;
818 return -1;
819 }
820
821 ONE_MORE_BYTE (c);
822 if (c - 0xF0 >= COMPOSITION_RELATIVE
823 && c - 0xF0 <= COMPOSITION_WITH_RULE_ALTCHARS)
824 {
825 int with_rule;
826
827 method = c - 0xF0;
828 with_rule = (method == COMPOSITION_WITH_RULE
829 || method == COMPOSITION_WITH_RULE_ALTCHARS);
830 ONE_MORE_BYTE (c);
831 data_len = c - 0xA0;
832 if (data_len < 4
833 || src_base + data_len > src_end)
834 return 0;
835 ONE_MORE_BYTE (c);
836 nchars = c - 0xA0;
837 if (c < 1)
838 return 0;
839 for (ncomponent = 0; src < src_base + data_len; ncomponent++)
840 {
b1887814
RS
841 /* If it is longer than this, it can't be valid. */
842 if (ncomponent >= COMPOSITION_DATA_MAX_BUNCH_LENGTH)
843 return 0;
844
aa72b389
KH
845 if (ncomponent % 2 && with_rule)
846 {
847 ONE_MORE_BYTE (gref);
848 gref -= 32;
849 ONE_MORE_BYTE (nref);
850 nref -= 32;
851 c = COMPOSITION_ENCODE_RULE (gref, nref);
852 }
853 else
854 {
855 int bytes;
fd3ae0b9
KH
856 if (UNIBYTE_STR_AS_MULTIBYTE_P (src, src_end - src, bytes)
857 || (coding->flags /* We are recovering a file. */
858 && src[0] == LEADING_CODE_8_BIT_CONTROL
859 && ! CHAR_HEAD_P (src[1])))
aa72b389
KH
860 c = STRING_CHAR (src, bytes);
861 else
862 c = *src, bytes = 1;
863 src += bytes;
864 }
865 component[ncomponent] = c;
866 }
867 }
868 else
869 {
870 /* This may be an old Emacs 20 style format. See the comment at
871 the section 2 of this file. */
872 while (src < src_end && !CHAR_HEAD_P (*src)) src++;
873 if (src == src_end
874 && !(coding->mode & CODING_MODE_LAST_BLOCK))
875 goto label_end_of_loop;
876
877 src_end = src;
878 src = src_base + 1;
879 if (c < 0xC0)
880 {
881 method = COMPOSITION_RELATIVE;
882 for (ncomponent = 0; ncomponent < MAX_COMPOSITION_COMPONENTS;)
883 {
884 DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp);
885 if (c < 0)
886 break;
887 component[ncomponent++] = c;
888 }
889 if (ncomponent < 2)
890 return 0;
891 nchars = ncomponent;
892 }
893 else if (c == 0xFF)
894 {
895 method = COMPOSITION_WITH_RULE;
896 src++;
897 DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp);
898 if (c < 0)
899 return 0;
900 component[0] = c;
901 for (ncomponent = 1;
902 ncomponent < MAX_COMPOSITION_COMPONENTS * 2 - 1;)
903 {
904 DECODE_EMACS_MULE_COMPOSITION_RULE (c);
905 if (c < 0)
906 break;
907 component[ncomponent++] = c;
908 DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp);
909 if (c < 0)
910 break;
911 component[ncomponent++] = c;
912 }
913 if (ncomponent < 3)
914 return 0;
915 nchars = (ncomponent + 1) / 2;
916 }
917 else
918 return 0;
919 }
920
921 if (buf == bufp || dst + (bufp - buf) <= (dst_bytes ? dst_end : src))
922 {
923 CODING_ADD_COMPOSITION_START (coding, coding->produced_char, method);
924 for (i = 0; i < ncomponent; i++)
925 CODING_ADD_COMPOSITION_COMPONENT (coding, component[i]);
93dec019 926 CODING_ADD_COMPOSITION_END (coding, coding->produced_char + nchars);
aa72b389
KH
927 if (buf < bufp)
928 {
929 unsigned char *p = buf;
930 EMIT_BYTES (p, bufp);
931 *destination += bufp - buf;
932 coding->produced_char += nchars;
933 }
934 return (src - src_base);
935 }
936 label_end_of_loop:
937 return -1;
938}
939
b73bfc1c 940/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
4ed46869 941
b73bfc1c
KH
942static void
943decode_coding_emacs_mule (coding, source, destination, src_bytes, dst_bytes)
944 struct coding_system *coding;
945 unsigned char *source, *destination;
946 int src_bytes, dst_bytes;
947{
948 unsigned char *src = source;
949 unsigned char *src_end = source + src_bytes;
950 unsigned char *dst = destination;
951 unsigned char *dst_end = destination + dst_bytes;
952 /* SRC_BASE remembers the start position in source in each loop.
953 The loop will be exited when there's not enough source code, or
954 when there's not enough destination area to produce a
955 character. */
956 unsigned char *src_base;
4ed46869 957
b73bfc1c 958 coding->produced_char = 0;
8a33cf7b 959 while ((src_base = src) < src_end)
b73bfc1c
KH
960 {
961 unsigned char tmp[MAX_MULTIBYTE_LENGTH], *p;
962 int bytes;
ec6d2bb8 963
4af310db
EZ
964 if (*src == '\r')
965 {
2bcdf662 966 int c = *src++;
4af310db 967
4af310db
EZ
968 if (coding->eol_type == CODING_EOL_CR)
969 c = '\n';
970 else if (coding->eol_type == CODING_EOL_CRLF)
971 {
972 ONE_MORE_BYTE (c);
973 if (c != '\n')
974 {
4af310db
EZ
975 src--;
976 c = '\r';
977 }
978 }
979 *dst++ = c;
980 coding->produced_char++;
981 continue;
982 }
983 else if (*src == '\n')
984 {
985 if ((coding->eol_type == CODING_EOL_CR
986 || coding->eol_type == CODING_EOL_CRLF)
987 && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
988 {
989 coding->result = CODING_FINISH_INCONSISTENT_EOL;
990 goto label_end_of_loop;
991 }
992 *dst++ = *src++;
993 coding->produced_char++;
994 continue;
995 }
3089d25c 996 else if (*src == 0x80 && coding->cmp_data)
aa72b389
KH
997 {
998 /* Start of composition data. */
999 int consumed = decode_composition_emacs_mule (coding, src, src_end,
1000 &dst, dst_end,
1001 dst_bytes);
1002 if (consumed < 0)
1003 goto label_end_of_loop;
1004 else if (consumed > 0)
1005 {
1006 src += consumed;
1007 continue;
1008 }
1009 bytes = CHAR_STRING (*src, tmp);
1010 p = tmp;
1011 src++;
1012 }
fd3ae0b9
KH
1013 else if (UNIBYTE_STR_AS_MULTIBYTE_P (src, src_end - src, bytes)
1014 || (coding->flags /* We are recovering a file. */
1015 && src[0] == LEADING_CODE_8_BIT_CONTROL
1016 && ! CHAR_HEAD_P (src[1])))
b73bfc1c
KH
1017 {
1018 p = src;
1019 src += bytes;
1020 }
1021 else
1022 {
6eced09c
KH
1023 int i, c;
1024
1025 bytes = BYTES_BY_CHAR_HEAD (*src);
b73bfc1c 1026 src++;
6eced09c
KH
1027 for (i = 1; i < bytes; i++)
1028 {
1029 ONE_MORE_BYTE (c);
1030 if (CHAR_HEAD_P (c))
1031 break;
1032 }
1033 if (i < bytes)
1034 {
1035 bytes = CHAR_STRING (*src_base, tmp);
1036 p = tmp;
1037 src = src_base + 1;
1038 }
1039 else
1040 {
1041 p = src_base;
1042 }
b73bfc1c
KH
1043 }
1044 if (dst + bytes >= (dst_bytes ? dst_end : src))
1045 {
1046 coding->result = CODING_FINISH_INSUFFICIENT_DST;
4ed46869
KH
1047 break;
1048 }
b73bfc1c
KH
1049 while (bytes--) *dst++ = *p++;
1050 coding->produced_char++;
4ed46869 1051 }
4af310db 1052 label_end_of_loop:
b73bfc1c
KH
1053 coding->consumed = coding->consumed_char = src_base - source;
1054 coding->produced = dst - destination;
4ed46869
KH
1055}
1056
b73bfc1c 1057
aa72b389
KH
/* Encode the composition record at DATA into a special byte sequence
   starting with 0x80 and append it at DST.  The sequence is

        0x80  0xF0+METHOD  0xA0+BYTES  0xA0+CHARS  COMPONENT ...

   where METHOD is DATA[3], CHARS is DATA[2] - DATA[1], and BYTES is
   the length of the whole sequence including these four bytes.  The
   components are taken from DATA[4] up to DATA[DATA[0] - 1]; for
   COMPOSITION_WITH_RULE and COMPOSITION_WITH_RULE_ALTCHARS they
   alternate CHAR RULE CHAR ..., each rule being written as the two
   bytes 0x20+GREF and 0x20+NREF.

   If the destination has too little room, set CODING->result to
   CODING_FINISH_INSUFFICIENT_DST and jump to label_end_of_loop.
   Otherwise advance CODING->cmp_data_start past the record, and move
   CODING->cmp_data to the next block when the current one is used up
   and a next block exists.  */

#define ENCODE_COMPOSITION_EMACS_MULE(coding, data)                     \
  do {                                                                  \
    unsigned char cmp_bytes[1024];                                      \
    unsigned char *cmp_head = cmp_bytes;                                \
    unsigned char *cmp_tail = cmp_bytes + 4;                            \
    int cmp_count = (data)[0];                                          \
    int cmp_idx;                                                        \
                                                                        \
    cmp_bytes[0] = 0x80;                                                \
    cmp_bytes[1] = 0xF0 + (data)[3];    /* METHOD */                    \
    cmp_bytes[3] = 0xA0 + ((data)[2] - (data)[1]); /* COMPOSED-CHARS */ \
    if ((data)[3] != COMPOSITION_WITH_RULE                              \
        && (data)[3] != COMPOSITION_WITH_RULE_ALTCHARS)                 \
      {                                                                 \
        /* Characters only.  */                                         \
        cmp_idx = 4;                                                    \
        while (cmp_idx < cmp_count)                                     \
          {                                                             \
            cmp_tail += CHAR_STRING ((data)[cmp_idx], cmp_tail);        \
            cmp_idx++;                                                  \
          }                                                             \
      }                                                                 \
    else                                                                \
      {                                                                 \
        /* First character, then (rule, character) pairs.  */           \
        cmp_tail += CHAR_STRING ((data)[4], cmp_tail);                  \
        for (cmp_idx = 5; cmp_idx < cmp_count; cmp_idx += 2)            \
          {                                                             \
            int gref, nref;                                             \
            COMPOSITION_DECODE_RULE ((data)[cmp_idx], gref, nref);      \
            *cmp_tail++ = 0x20 + gref;                                  \
            *cmp_tail++ = 0x20 + nref;                                  \
            cmp_tail += CHAR_STRING ((data)[cmp_idx + 1], cmp_tail);    \
          }                                                             \
      }                                                                 \
    cmp_bytes[2] = 0xA0 + (cmp_tail - cmp_bytes); /* COMPONENTS-BYTES */ \
                                                                        \
    if (dst + (cmp_tail - cmp_bytes) + 4 > (dst_bytes ? dst_end : src)) \
      {                                                                 \
        (coding)->result = CODING_FINISH_INSUFFICIENT_DST;              \
        goto label_end_of_loop;                                         \
      }                                                                 \
    for (; cmp_head < cmp_tail; cmp_head++)                             \
      *dst++ = *cmp_head;                                               \
    (coding)->cmp_data_start += cmp_count;                              \
    if ((coding)->cmp_data->next                                        \
        && (coding)->cmp_data_start == (coding)->cmp_data->used)        \
      {                                                                 \
        (coding)->cmp_data = (coding)->cmp_data->next;                  \
        (coding)->cmp_data_start = 0;                                   \
      }                                                                 \
  } while (0)
93dec019 1107
aa72b389 1108
/* Forward declaration: encode_coding_emacs_mule below hands the whole
   job to encode_eol when there is no composition data to encode.  */
a4244313 1109static void encode_eol P_ ((struct coding_system *, const unsigned char *,
aa72b389
KH
1110 unsigned char *, int, int));
1111
1112static void
1113encode_coding_emacs_mule (coding, source, destination, src_bytes, dst_bytes)
1114 struct coding_system *coding;
1115 unsigned char *source, *destination;
1116 int src_bytes, dst_bytes;
1117{
1118 unsigned char *src = source;
1119 unsigned char *src_end = source + src_bytes;
1120 unsigned char *dst = destination;
1121 unsigned char *dst_end = destination + dst_bytes;
1122 unsigned char *src_base;
1123 int c;
1124 int char_offset;
1125 int *data;
1126
1127 Lisp_Object translation_table;
1128
1129 translation_table = Qnil;
1130
1131 /* Optimization for the case that there's no composition. */
1132 if (!coding->cmp_data || coding->cmp_data->used == 0)
1133 {
1134 encode_eol (coding, source, destination, src_bytes, dst_bytes);
1135 return;
1136 }
1137
1138 char_offset = coding->cmp_data->char_offset;
1139 data = coding->cmp_data->data + coding->cmp_data_start;
1140 while (1)
1141 {
1142 src_base = src;
1143
1144 /* If SRC starts a composition, encode the information about the
1145 composition in advance. */
1146 if (coding->cmp_data_start < coding->cmp_data->used
1147 && char_offset + coding->consumed_char == data[1])
1148 {
1149 ENCODE_COMPOSITION_EMACS_MULE (coding, data);
1150 char_offset = coding->cmp_data->char_offset;
1151 data = coding->cmp_data->data + coding->cmp_data_start;
1152 }
1153
1154 ONE_MORE_CHAR (c);
1155 if (c == '\n' && (coding->eol_type == CODING_EOL_CRLF
1156 || coding->eol_type == CODING_EOL_CR))
1157 {
1158 if (coding->eol_type == CODING_EOL_CRLF)
1159 EMIT_TWO_BYTES ('\r', c);
1160 else
1161 EMIT_ONE_BYTE ('\r');
1162 }
1163 else if (SINGLE_BYTE_CHAR_P (c))
fd3ae0b9
KH
1164 {
1165 if (coding->flags && ! ASCII_BYTE_P (c))
1166 {
1167 /* As we are auto saving, retain the multibyte form for
1168 8-bit chars. */
1169 unsigned char buf[MAX_MULTIBYTE_LENGTH];
1170 int bytes = CHAR_STRING (c, buf);
1171
1172 if (bytes == 1)
1173 EMIT_ONE_BYTE (buf[0]);
1174 else
1175 EMIT_TWO_BYTES (buf[0], buf[1]);
1176 }
1177 else
1178 EMIT_ONE_BYTE (c);
1179 }
aa72b389
KH
1180 else
1181 EMIT_BYTES (src_base, src);
1182 coding->consumed_char++;
1183 }
1184 label_end_of_loop:
1185 coding->consumed = src_base - source;
1186 coding->produced = coding->produced_char = dst - destination;
1187 return;
1188}
b73bfc1c 1189
4ed46869
KH
1190\f
1191/*** 3. ISO2022 handlers ***/
1192
1193/* The following note describes the coding system ISO2022 briefly.
39787efd 1194 Since the intention of this note is to help understand the
cfb43547 1195 functions in this file, some parts are NOT ACCURATE or are OVERLY
39787efd 1196 SIMPLIFIED. For thorough understanding, please refer to the
cfb43547
DL
1197 original document of ISO2022. This is equivalent to the standard
1198 ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
4ed46869
KH
1199
1200 ISO2022 provides many mechanisms to encode several character sets
cfb43547 1201 in 7-bit and 8-bit environments. For 7-bit environments, all text
39787efd
KH
1202 is encoded using bytes less than 128. This may make the encoded
1203 text a little bit longer, but the text passes more easily through
cfb43547 1204 several types of gateway, some of which strip off the MSB (Most
8ca3766a 1205 Significant Bit).
b73bfc1c 1206
cfb43547
DL
1207 There are two kinds of character sets: control character sets and
1208 graphic character sets. The former contain control characters such
4ed46869 1209 as `newline' and `escape' to provide control functions (control
39787efd 1210 functions are also provided by escape sequences). The latter
cfb43547 1211 contain graphic characters such as 'A' and '-'. Emacs recognizes
4ed46869
KH
1212 two control character sets and many graphic character sets.
1213
1214 Graphic character sets are classified into one of the following
39787efd
KH
1215 four classes, according to the number of bytes (DIMENSION) and
1216 number of characters in one dimension (CHARS) of the set:
1217 - DIMENSION1_CHARS94
1218 - DIMENSION1_CHARS96
1219 - DIMENSION2_CHARS94
1220 - DIMENSION2_CHARS96
1221
1222 In addition, each character set is assigned an identification tag,
cfb43547 1223 unique for each set, called the "final character" (denoted as <F>
39787efd
KH
1224 hereafter). The <F> of each character set is decided by ECMA(*)
1225 when it is registered in ISO. The code range of <F> is 0x30..0x7F
1226 (0x30..0x3F are for private use only).
4ed46869
KH
1227
1228 Note (*): ECMA = European Computer Manufacturers Association
1229
cfb43547 1230 Here are examples of graphic character sets [NAME(<F>)]:
4ed46869
KH
1231 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
1232 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
1233 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
1234 o DIMENSION2_CHARS96 -- none for the moment
1235
39787efd 1236 A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
4ed46869
KH
1237 C0 [0x00..0x1F] -- control character plane 0
1238 GL [0x20..0x7F] -- graphic character plane 0
1239 C1 [0x80..0x9F] -- control character plane 1
1240 GR [0xA0..0xFF] -- graphic character plane 1
1241
1242 A control character set is directly designated and invoked to C0 or
39787efd
KH
1243 C1 by an escape sequence. The most common case is that:
1244 - ISO646's control character set is designated/invoked to C0, and
1245 - ISO6429's control character set is designated/invoked to C1,
1246 and usually these designations/invocations are omitted in encoded
1247 text. In a 7-bit environment, only C0 can be used, and a control
1248 character for C1 is encoded by an appropriate escape sequence to
1249 fit into the environment. All control characters for C1 are
1250 defined to have corresponding escape sequences.
4ed46869
KH
1251
1252 A graphic character set is at first designated to one of four
1253 graphic registers (G0 through G3), then these graphic registers are
1254 invoked to GL or GR. These designations and invocations can be
1255 done independently. The most common case is that G0 is invoked to
39787efd
KH
1256 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
1257 these invocations and designations are omitted in encoded text.
1258 In a 7-bit environment, only GL can be used.
4ed46869 1259
39787efd
KH
1260 When a graphic character set of CHARS94 is invoked to GL, codes
1261 0x20 and 0x7F of the GL area work as control characters SPACE and
1262 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
1263 be used.
4ed46869
KH
1264
1265 There are two ways of invocation: locking-shift and single-shift.
1266 With locking-shift, the invocation lasts until the next different
39787efd
KH
1267 invocation, whereas with single-shift, the invocation affects the
1268 following character only and doesn't affect the locking-shift
1269 state. Invocations are done by the following control characters or
1270 escape sequences:
4ed46869
KH
1271
1272 ----------------------------------------------------------------------
39787efd 1273 abbrev function cntrl escape seq description
4ed46869 1274 ----------------------------------------------------------------------
39787efd
KH
1275 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
1276 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
1277 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
1278 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
1279 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
1280 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
1281 LS3R (locking-shift-3 right) none ESC '|' invoke G3 into GR (*)
1282 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
1283 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
4ed46869 1284 ----------------------------------------------------------------------
39787efd
KH
1285 (*) These are not used by any known coding system.
1286
1287 Control characters for these functions are defined by macros
1288 ISO_CODE_XXX in `coding.h'.
4ed46869 1289
39787efd 1290 Designations are done by the following escape sequences:
4ed46869
KH
1291 ----------------------------------------------------------------------
1292 escape sequence description
1293 ----------------------------------------------------------------------
1294 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
1295 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
1296 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
1297 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
1298 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
1299 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
1300 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
1301 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
1302 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
1303 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
1304 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
1305 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
1306 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
1307 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
1308 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
1309 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
1310 ----------------------------------------------------------------------
1311
1312 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
39787efd 1313 of dimension 1, chars 94, and final character <F>, etc...
4ed46869
KH
1314
1315 Note (*): Although these designations are not allowed in ISO2022,
1316 Emacs accepts them on decoding, and produces them on encoding
39787efd 1317 CHARS96 character sets in a coding system which is characterized as
4ed46869
KH
1318 7-bit environment, non-locking-shift, and non-single-shift.
1319
1320 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
39787efd 1321 '(' can be omitted. We refer to this as "short-form" hereafter.
4ed46869 1322
cfb43547 1323 Now you may notice that there are a lot of ways of encoding the
39787efd
KH
1324 same multilingual text in ISO2022. Actually, there exist many
1325 coding systems such as Compound Text (used in X11's inter client
8ca3766a
DL
1326 communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
1327 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
4ed46869
KH
1328 localized platforms), and all of these are variants of ISO2022.
1329
1330 In addition to the above, Emacs handles two more kinds of escape
1331 sequences: ISO6429's direction specification and Emacs' private
1332 sequence for specifying character composition.
1333
39787efd 1334 ISO6429's direction specification takes the following form:
4ed46869
KH
1335 o CSI ']' -- end of the current direction
1336 o CSI '0' ']' -- end of the current direction
1337 o CSI '1' ']' -- start of left-to-right text
1338 o CSI '2' ']' -- start of right-to-left text
1339 The control character CSI (0x9B: control sequence introducer) is
39787efd
KH
1340 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
1341
1342 Character composition specification takes the following form:
ec6d2bb8
KH
1343 o ESC '0' -- start relative composition
1344 o ESC '1' -- end composition
1345 o ESC '2' -- start rule-base composition (*)
1346 o ESC '3' -- start relative composition with alternate chars (**)
1347 o ESC '4' -- start rule-base composition with alternate chars (**)
b73bfc1c 1348 Since these are not standard escape sequences of any ISO standard,
cfb43547 1349 the use of them with these meanings is restricted to Emacs only.
ec6d2bb8 1350
cfb43547 1351 (*) This form is used only in Emacs 20.5 and older versions,
b73bfc1c 1352 but the newer versions can safely decode it.
cfb43547 1353 (**) This form is used only in Emacs 21.1 and newer versions,
b73bfc1c 1354 and the older versions can't decode it.
ec6d2bb8 1355
cfb43547 1356 Here's a list of example usages of these composition escape
b73bfc1c 1357 sequences (categorized by `enum composition_method').
ec6d2bb8 1358
b73bfc1c 1359 COMPOSITION_RELATIVE:
ec6d2bb8 1360 ESC 0 CHAR [ CHAR ] ESC 1
8ca3766a 1361 COMPOSITION_WITH_RULE:
ec6d2bb8 1362 ESC 2 CHAR [ RULE CHAR ] ESC 1
b73bfc1c 1363 COMPOSITION_WITH_ALTCHARS:
ec6d2bb8 1364 ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
b73bfc1c 1365 COMPOSITION_WITH_RULE_ALTCHARS:
ec6d2bb8 1366 ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
4ed46869
KH
1367
/* Table of 256 `enum iso_code_class_type' values -- presumably one
   per possible byte value, giving that byte's ISO2022 code class;
   verify against the code that fills it in.  */
1368enum iso_code_class_type iso_code_class[256];
1369
05e6f5dc
KH
/* The three predicates below ask whether the coding system registered
   for coding category IDX (coding_system_table[IDX]) is compatible
   with something just seen in the text.  An entry of
   coding_system_table may be a null pointer, so every predicate
   yields 0 for a null entry instead of dereferencing it.  */

/* Non-zero if the coding system for category IDX can take character C
   of CHARSET: CHARSET is ASCII or C is one of the coding system's safe
   characters, and a designation is requested for CHARSET.  As a side
   effect this may assign to the variable `safe_chars' of the
   enclosing scope.  */
#define CHARSET_OK(idx, charset, c)                                     \
  (coding_system_table[idx]                                             \
   && ((charset) == CHARSET_ASCII                                       \
       || (safe_chars = coding_safe_chars (coding_system_table[idx]->symbol), \
           CODING_SAFE_CHAR_P (safe_chars, c)))                         \
   && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding_system_table[idx], \
                                              charset)                  \
       != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))

/* Non-zero if the coding system for category IDX exists and has an
   initial designation (>= 0) for graphic register 1.  */
#define SHIFT_OUT_OK(idx)                                               \
  (coding_system_table[idx]                                             \
   && (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding_system_table[idx], 1) \
       >= 0))

/* Non-zero if the coding system for category IDX exists and does not
   have composition handling disabled.  */
#define COMPOSITION_OK(idx)                                             \
  (coding_system_table[idx]                                             \
   && coding_system_table[idx]->composing != COMPOSITION_DISABLED)
1384
4ed46869 1385/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
   Check if the text from SRC up to SRC_END is encoded in ISO2022.
   If it is, return an integer in which the appropriate flag bits
   among:
	CODING_CATEGORY_MASK_ISO_7
	CODING_CATEGORY_MASK_ISO_7_TIGHT
	CODING_CATEGORY_MASK_ISO_8_1
	CODING_CATEGORY_MASK_ISO_8_2
	CODING_CATEGORY_MASK_ISO_7_ELSE
	CODING_CATEGORY_MASK_ISO_8_ELSE
   are set.  If a code which should never appear in ISO2022 is found,
   return 0.

   MULTIBYTEP is only passed on to ONE_MORE_BYTE_CHECK_MULTIBYTE.
   The value returned at the end is MASK & MASK_FOUND: MASK starts as
   CODING_CATEGORY_MASK_ISO and has bits cleared as the text rules
   categories out, while MASK_FOUND collects the categories for which
   some evidence was seen.  */
1396
0a28aafb
KH
1397static int
1398detect_coding_iso2022 (src, src_end, multibytep)
4ed46869 1399 unsigned char *src, *src_end;
0a28aafb 1400 int multibytep;
4ed46869 1401{
d46c5b12
KH
1402 int mask = CODING_CATEGORY_MASK_ISO;
1403 int mask_found = 0;
f46869e4 1404 int reg[4], shift_out = 0, single_shifting = 0;
da55a2b7 1405 int c, c1, charset;
b73bfc1c
KH
1406 /* Dummy for ONE_MORE_BYTE. */
1407 struct coding_system dummy_coding;
1408 struct coding_system *coding = &dummy_coding;
05e6f5dc 1409 Lisp_Object safe_chars;
3f003981 1410
/* REG[n] is the charset seen designated to graphic register n so far:
   register 0 starts as ASCII, the others as -1 (nothing designated).  */
d46c5b12 1411 reg[0] = CHARSET_ASCII, reg[1] = reg[2] = reg[3] = -1;
3f003981 1412 while (mask && src < src_end)
4ed46869 1413 {
/* ONE_MORE_BYTE_CHECK_MULTIBYTE is defined elsewhere; presumably it
   jumps to label_end_of_loop when the source runs out -- verify.  */
0a28aafb 1414 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
8d239c89 1415 retry:
4ed46869
KH
1416 switch (c)
1417 {
1418 case ISO_CODE_ESC:
74383408
KH
1419 if (inhibit_iso_escape_detection)
1420 break;
f46869e4 1421 single_shifting = 0;
0a28aafb 1422 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
d46c5b12 1423 if (c >= '(' && c <= '/')
4ed46869 1424 {
bf9cdd4e 1425 /* Designation sequence for a charset of dimension 1. */
0a28aafb 1426 ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep);
d46c5b12
KH
1427 if (c1 < ' ' || c1 >= 0x80
1428 || (charset = iso_charset_table[0][c >= ','][c1]) < 0)
1429 /* Invalid designation sequence. Just ignore. */
1430 break;
1431 reg[(c - '(') % 4] = charset;
bf9cdd4e
KH
1432 }
1433 else if (c == '$')
1434 {
1435 /* Designation sequence for a charset of dimension 2. */
0a28aafb 1436 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
bf9cdd4e
KH
1437 if (c >= '@' && c <= 'B')
1438 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
d46c5b12 1439 reg[0] = charset = iso_charset_table[1][0][c];
bf9cdd4e 1440 else if (c >= '(' && c <= '/')
bcf26d6a 1441 {
0a28aafb 1442 ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep);
d46c5b12
KH
1443 if (c1 < ' ' || c1 >= 0x80
1444 || (charset = iso_charset_table[1][c >= ','][c1]) < 0)
1445 /* Invalid designation sequence. Just ignore. */
1446 break;
1447 reg[(c - '(') % 4] = charset;
bcf26d6a 1448 }
bf9cdd4e 1449 else
d46c5b12
KH
1450 /* Invalid designation sequence. Just ignore. */
1451 break;
1452 }
ae9ff118 1453 else if (c == 'N' || c == 'O')
d46c5b12 1454 {
ae9ff118
KH
1455 /* ESC <Fe> for SS2 or SS3. */
1456 mask &= CODING_CATEGORY_MASK_ISO_7_ELSE;
d46c5b12 1457 break;
4ed46869 1458 }
ec6d2bb8
KH
1459 else if (c >= '0' && c <= '4')
1460 {
1461 /* ESC <Fp> for start/end composition. */
/* For each ISO category: keep it as found if its coding system allows
   composition, otherwise rule the category out.  */
b6871cc7
KH
1462 if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_7))
1463 mask_found |= CODING_CATEGORY_MASK_ISO_7;
1464 else
1465 mask &= ~CODING_CATEGORY_MASK_ISO_7;
1466 if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT))
1467 mask_found |= CODING_CATEGORY_MASK_ISO_7_TIGHT;
1468 else
1469 mask &= ~CODING_CATEGORY_MASK_ISO_7_TIGHT;
1470 if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_8_1))
1471 mask_found |= CODING_CATEGORY_MASK_ISO_8_1;
1472 else
1473 mask &= ~CODING_CATEGORY_MASK_ISO_8_1;
1474 if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_8_2))
1475 mask_found |= CODING_CATEGORY_MASK_ISO_8_2;
1476 else
1477 mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
1478 if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_7_ELSE))
1479 mask_found |= CODING_CATEGORY_MASK_ISO_7_ELSE;
1480 else
1481 mask &= ~CODING_CATEGORY_MASK_ISO_7_ELSE;
1482 if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_8_ELSE))
1483 mask_found |= CODING_CATEGORY_MASK_ISO_8_ELSE;
1484 else
1485 mask &= ~CODING_CATEGORY_MASK_ISO_8_ELSE;
ec6d2bb8
KH
1486 break;
1487 }
bf9cdd4e 1488 else
d46c5b12
KH
1489 /* Invalid escape sequence. Just ignore. */
1490 break;
1491
1492 /* We found a valid designation sequence for CHARSET. */
1493 mask &= ~CODING_CATEGORY_MASK_ISO_8BIT;
05e6f5dc
KH
1494 c = MAKE_CHAR (charset, 0, 0);
1495 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7, charset, c))
d46c5b12
KH
1496 mask_found |= CODING_CATEGORY_MASK_ISO_7;
1497 else
1498 mask &= ~CODING_CATEGORY_MASK_ISO_7;
05e6f5dc 1499 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT, charset, c))
d46c5b12
KH
1500 mask_found |= CODING_CATEGORY_MASK_ISO_7_TIGHT;
1501 else
1502 mask &= ~CODING_CATEGORY_MASK_ISO_7_TIGHT;
05e6f5dc 1503 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_ELSE, charset, c))
ae9ff118
KH
1504 mask_found |= CODING_CATEGORY_MASK_ISO_7_ELSE;
1505 else
d46c5b12 1506 mask &= ~CODING_CATEGORY_MASK_ISO_7_ELSE;
05e6f5dc 1507 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_8_ELSE, charset, c))
ae9ff118
KH
1508 mask_found |= CODING_CATEGORY_MASK_ISO_8_ELSE;
1509 else
d46c5b12 1510 mask &= ~CODING_CATEGORY_MASK_ISO_8_ELSE;
4ed46869
KH
1511 break;
1512
4ed46869 1513 case ISO_CODE_SO:
74383408
KH
1514 if (inhibit_iso_escape_detection)
1515 break;
f46869e4 1516 single_shifting = 0;
/* NOTE(review): shift_out is initialized to 0 and never assigned in
   this function, so `shift_out == 0' below always holds and the
   `shift_out == 1' test under ISO_CODE_SI can never succeed.  */
d46c5b12
KH
1517 if (shift_out == 0
1518 && (reg[1] >= 0
1519 || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_7_ELSE)
1520 || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_8_ELSE)))
1521 {
1522 /* Locking shift out. */
1523 mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
1524 mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
1525 }
e0e989f6 1526 break;
93dec019 1527
d46c5b12 1528 case ISO_CODE_SI:
74383408
KH
1529 if (inhibit_iso_escape_detection)
1530 break;
f46869e4 1531 single_shifting = 0;
d46c5b12
KH
1532 if (shift_out == 1)
1533 {
1534 /* Locking shift in. */
1535 mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
1536 mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
1537 }
1538 break;
1539
4ed46869 1540 case ISO_CODE_CSI:
f46869e4 1541 single_shifting = 0;
/* No break here: CSI falls through and shares the handling below with
   SS2 and SS3.  */
4ed46869
KH
1542 case ISO_CODE_SS2:
1543 case ISO_CODE_SS3:
3f003981
KH
1544 {
1545 int newmask = CODING_CATEGORY_MASK_ISO_8_ELSE;
1546
74383408
KH
1547 if (inhibit_iso_escape_detection)
1548 break;
70c22245
KH
1549 if (c != ISO_CODE_CSI)
1550 {
d46c5b12
KH
1551 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1552 & CODING_FLAG_ISO_SINGLE_SHIFT)
70c22245 1553 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
d46c5b12
KH
1554 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1555 & CODING_FLAG_ISO_SINGLE_SHIFT)
70c22245 1556 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
f46869e4 1557 single_shifting = 1;
70c22245 1558 }
3f003981
KH
1559 if (VECTORP (Vlatin_extra_code_table)
1560 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
1561 {
d46c5b12
KH
1562 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1563 & CODING_FLAG_ISO_LATIN_EXTRA)
3f003981 1564 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
d46c5b12
KH
1565 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1566 & CODING_FLAG_ISO_LATIN_EXTRA)
3f003981
KH
1567 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
1568 }
1569 mask &= newmask;
d46c5b12 1570 mask_found |= newmask;
3f003981
KH
1571 }
1572 break;
4ed46869
KH
1573
1574 default:
1575 if (c < 0x80)
f46869e4
KH
1576 {
1577 single_shifting = 0;
1578 break;
1579 }
4ed46869 1580 else if (c < 0xA0)
c4825358 1581 {
f46869e4 1582 single_shifting = 0;
/* A byte in 0x80..0x9F is accepted only if Vlatin_extra_code_table
   lists it; otherwise the text is declared not ISO2022.  */
3f003981
KH
1583 if (VECTORP (Vlatin_extra_code_table)
1584 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
c4825358 1585 {
3f003981
KH
1586 int newmask = 0;
1587
d46c5b12
KH
1588 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1589 & CODING_FLAG_ISO_LATIN_EXTRA)
3f003981 1590 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
d46c5b12
KH
1591 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1592 & CODING_FLAG_ISO_LATIN_EXTRA)
3f003981
KH
1593 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
1594 mask &= newmask;
d46c5b12 1595 mask_found |= newmask;
c4825358 1596 }
3f003981
KH
1597 else
1598 return 0;
c4825358 1599 }
4ed46869
KH
1600 else
1601 {
d46c5b12 1602 mask &= ~(CODING_CATEGORY_MASK_ISO_7BIT
7717c392 1603 | CODING_CATEGORY_MASK_ISO_7_ELSE);
d46c5b12 1604 mask_found |= CODING_CATEGORY_MASK_ISO_8_1;
f46869e4
KH
1605 /* Check the length of succeeding codes of the range
1606 0xA0..0xFF. If the byte length is odd, we exclude
1607 CODING_CATEGORY_MASK_ISO_8_2. We can check this only
1608 when we are not single shifting. */
b73bfc1c
KH
1609 if (!single_shifting
1610 && mask & CODING_CATEGORY_MASK_ISO_8_2)
f46869e4 1611 {
e17de821 1612 int i = 1;
8d239c89
KH
1613
1614 c = -1;
b73bfc1c
KH
1615 while (src < src_end)
1616 {
0a28aafb 1617 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
b73bfc1c
KH
1618 if (c < 0xA0)
1619 break;
1620 i++;
1621 }
1622
1623 if (i & 1 && src < src_end)
f46869e4
KH
1624 mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
1625 else
1626 mask_found |= CODING_CATEGORY_MASK_ISO_8_2;
8d239c89
KH
1627 if (c >= 0)
1628 /* This means that we have read one extra byte. */
1629 goto retry;
f46869e4 1630 }
4ed46869
KH
1631 }
1632 break;
1633 }
1634 }
b73bfc1c 1635 label_end_of_loop:
/* Report only the categories that are both still possible (MASK) and
   positively indicated by the text (MASK_FOUND).  */
d46c5b12 1636 return (mask & mask_found);
4ed46869
KH
1637}
1638
b73bfc1c
KH
/* Yield the character code of the character whose charset is CHARSET,
   whose 1st position code is C1 and whose 2nd position code is C2.
   If the variable `translation_table' is non-nil, yield the code
   translated through that table instead.  */

#define DECODE_ISO_CHARACTER(charset, c1, c2)                           \
  (!NILP (translation_table)                                            \
   ? translate_char (translation_table, -1, charset, c1, c2)            \
   : MAKE_CHAR (charset, c1, c2))
4ed46869
KH
1648
/* Set designation state into CODING: record that the charset
   identified by DIMENSION, CHARS and FINAL_CHAR is designated to
   graphic register REG.

   Control jumps to label_invalid_code when FINAL_CHAR is outside
   '0'..127, when no acceptable charset is found (the register is then
   remembered in last_invalid_designation_register), or when ASCII is
   designated to register 0 right after an invalid designation to
   register 0.  Uses `coding' and `safe_chars' from the enclosing
   scope.  */

#define DECODE_DESIGNATION(reg, dimension, chars, final_char)           \
  do {                                                                  \
    int charset, c;                                                     \
                                                                        \
    if (! ((final_char) >= '0' && (final_char) < 128))                  \
      goto label_invalid_code;                                          \
    charset = ISO_CHARSET_TABLE (make_number (dimension),               \
                                 make_number (chars),                   \
                                 make_number (final_char));             \
    c = MAKE_CHAR (charset, 0, 0);                                      \
    if (charset < 0                                                     \
        || (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)     \
            != (reg)                                                    \
            && ! CODING_SAFE_CHAR_P (safe_chars, c)))                   \
      {                                                                 \
        /* Unknown or unwanted charset: remember the register.  */      \
        coding->spec.iso2022.last_invalid_designation_register = (reg); \
        goto label_invalid_code;                                        \
      }                                                                 \
    if (coding->spec.iso2022.last_invalid_designation_register == 0     \
        && (reg) == 0                                                   \
        && charset == CHARSET_ASCII)                                    \
      {                                                                 \
        /* We should insert this designation sequence as is so          \
           that it is surely written back to a file.  */                \
        coding->spec.iso2022.last_invalid_designation_register = -1;    \
        goto label_invalid_code;                                        \
      }                                                                 \
    coding->spec.iso2022.last_invalid_designation_register = -1;        \
    if ((coding->mode & CODING_MODE_DIRECTION)                          \
        && CHARSET_REVERSE_CHARSET (charset) >= 0)                      \
      charset = CHARSET_REVERSE_CHARSET (charset);                      \
    CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;                \
  } while (0)
1685
ec6d2bb8
KH
1686/* Allocate a memory block for storing information about compositions.
1687 The block is chained to the already allocated blocks. */
d46c5b12 1688
33fb63eb 1689void
ec6d2bb8 1690coding_allocate_composition_data (coding, char_offset)
d46c5b12 1691 struct coding_system *coding;
ec6d2bb8 1692 int char_offset;
d46c5b12 1693{
ec6d2bb8
KH
1694 struct composition_data *cmp_data
1695 = (struct composition_data *) xmalloc (sizeof *cmp_data);
1696
1697 cmp_data->char_offset = char_offset;
1698 cmp_data->used = 0;
1699 cmp_data->prev = coding->cmp_data;
1700 cmp_data->next = NULL;
1701 if (coding->cmp_data)
1702 coding->cmp_data->next = cmp_data;
1703 coding->cmp_data = cmp_data;
1704 coding->cmp_data_start = 0;
4307d534 1705 coding->composing = COMPOSITION_NO;
ec6d2bb8 1706}
d46c5b12 1707
aa72b389
KH
/* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
   ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
   ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
   ESC 3 : altchar composition :  ESC 3 ALT ... ESC 0 CHAR ... ESC 1
   ESC 4 : alt&rule composition : ESC 4 ALT RULE .. ALT ESC 0 CHAR ... ESC 1

   C1 is the byte that followed ESC.  When composition is disabled for
   CODING, the two bytes are written to DST unchanged (C1 masked with
   0x7f).  Uses `coding' and `dst' from the enclosing scope and may
   jump to label_end_of_loop.  */

#define DECODE_COMPOSITION_START(c1)                                    \
  do {                                                                  \
    if (coding->composing == COMPOSITION_DISABLED)                      \
      {                                                                 \
        *dst++ = ISO_CODE_ESC;                                          \
        *dst++ = (c1) & 0x7f;                                           \
        coding->produced_char += 2;                                     \
      }                                                                 \
    else if (COMPOSING_P (coding))                                      \
      {                                                                 \
        /* We are already handling a composition.  If the method is     \
           one of the following two, the codes following the current    \
           escape sequence are actual characters stored in a buffer.  */ \
        if (coding->composing == COMPOSITION_WITH_ALTCHARS              \
            || coding->composing == COMPOSITION_WITH_RULE_ALTCHARS)     \
          {                                                             \
            coding->composing = COMPOSITION_RELATIVE;                   \
            coding->composition_rule_follows = 0;                       \
          }                                                             \
      }                                                                 \
    else                                                                \
      {                                                                 \
        /* This is surely the start of a composition.  We must be sure  \
           that coding->cmp_data has enough space to store the          \
           information about the composition.  If not, terminate the    \
           current decoding loop, allocate one more memory block for    \
           coding->cmp_data in the caller, then start the decoding      \
           loop again.  We can't allocate memory here directly because  \
           it may cause buffer/string relocation.  */                   \
        if (!coding->cmp_data                                           \
            || (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH \
                >= COMPOSITION_DATA_SIZE))                              \
          {                                                             \
            coding->result = CODING_FINISH_INSUFFICIENT_CMP;            \
            goto label_end_of_loop;                                     \
          }                                                             \
        if ((c1) == '0')                                                \
          coding->composing = COMPOSITION_RELATIVE;                     \
        else if ((c1) == '2')                                           \
          coding->composing = COMPOSITION_WITH_RULE;                    \
        else if ((c1) == '3')                                           \
          coding->composing = COMPOSITION_WITH_ALTCHARS;                \
        else                                                            \
          coding->composing = COMPOSITION_WITH_RULE_ALTCHARS;           \
        CODING_ADD_COMPOSITION_START (coding, coding->produced_char,    \
                                      coding->composing);               \
        coding->composition_rule_follows = 0;                           \
      }                                                                 \
  } while (0)
1760
/* Handle composition end sequence ESC 1.  C1 is the byte that
   followed ESC.

   When a composition is in progress, its end is recorded at the
   current coding->produced_char and the composing state goes back to
   COMPOSITION_NO.  Otherwise ESC and C1 are copied to DST as they
   are and counted as two produced characters.  Uses the variables
   `coding' and `dst' of the expansion site.  */

#define DECODE_COMPOSITION_END(c1)                                      \
  do {                                                                  \
    if (COMPOSING_P (coding))                                           \
      {                                                                 \
        CODING_ADD_COMPOSITION_END (coding, coding->produced_char);     \
        coding->composing = COMPOSITION_NO;                             \
      }                                                                 \
    else                                                                \
      {                                                                 \
        /* Not composing: pass the escape sequence through.  */         \
        dst[0] = ISO_CODE_ESC;                                          \
        dst[1] = c1;                                                    \
        dst += 2;                                                       \
        coding->produced_char += 2;                                     \
      }                                                                 \
  } while (0)
1777
/* Decode a composition rule from the byte C1 (and maybe one more byte
   from SRC) and store one encoded composition rule in
   coding->cmp_data.

   C1 must be an lvalue: 32 is subtracted from it in place.  The
   second byte, when needed, is read with ONE_MORE_BYTE into the
   variable `c2' of the expansion site.  A value of C1 that matches
   neither format stores rule 0.  In all cases
   coding->composition_rule_follows is cleared.  */

#define DECODE_COMPOSITION_RULE(c1)                                     \
  do {                                                                  \
    int rule;                                                           \
    (c1) -= 32;                                                         \
    if ((c1) >= 93)                                                     \
      /* Neither the old nor the new format.  */                        \
      rule = 0;                                                         \
    else if ((c1) >= 81)        /* new format (after ver.21) */         \
      {                                                                 \
        ONE_MORE_BYTE (c2);                                             \
        rule = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32);              \
      }                                                                 \
    else                        /* old format (before ver.21) */        \
      {                                                                 \
        int gref = (c1) / 9, nref = (c1) % 9;                           \
        if (gref == 4)                                                  \
          gref = 10;                                                    \
        if (nref == 4)                                                  \
          nref = 10;                                                    \
        rule = COMPOSITION_ENCODE_RULE (gref, nref);                    \
      }                                                                 \
    coding->composition_rule_follows = 0;                               \
    CODING_ADD_COMPOSITION_COMPONENT (coding, rule);                    \
  } while (0)
88993dfd 1802
d46c5b12 1803
4ed46869
KH
1804/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
1805
b73bfc1c 1806static void
d46c5b12 1807decode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
4ed46869
KH
1808 struct coding_system *coding;
1809 unsigned char *source, *destination;
1810 int src_bytes, dst_bytes;
4ed46869
KH
1811{
1812 unsigned char *src = source;
1813 unsigned char *src_end = source + src_bytes;
1814 unsigned char *dst = destination;
1815 unsigned char *dst_end = destination + dst_bytes;
4ed46869
KH
1816 /* Charsets invoked to graphic plane 0 and 1 respectively. */
1817 int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1818 int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
b73bfc1c
KH
1819 /* SRC_BASE remembers the start position in source in each loop.
1820 The loop will be exited when there's not enough source code
1821 (within macro ONE_MORE_BYTE), or when there's not enough
1822 destination area to produce a character (within macro
1823 EMIT_CHAR). */
1824 unsigned char *src_base;
1825 int c, charset;
1826 Lisp_Object translation_table;
05e6f5dc
KH
1827 Lisp_Object safe_chars;
1828
6b89e3aa 1829 safe_chars = coding_safe_chars (coding->symbol);
bdd9fb48 1830
b73bfc1c
KH
1831 if (NILP (Venable_character_translation))
1832 translation_table = Qnil;
1833 else
1834 {
1835 translation_table = coding->translation_table_for_decode;
1836 if (NILP (translation_table))
1837 translation_table = Vstandard_translation_table_for_decode;
1838 }
4ed46869 1839
b73bfc1c
KH
1840 coding->result = CODING_FINISH_NORMAL;
1841
1842 while (1)
4ed46869 1843 {
85478bc6 1844 int c1, c2 = 0;
b73bfc1c
KH
1845
1846 src_base = src;
1847 ONE_MORE_BYTE (c1);
4ed46869 1848
ec6d2bb8 1849 /* We produce no character or one character. */
4ed46869
KH
1850 switch (iso_code_class [c1])
1851 {
1852 case ISO_0x20_or_0x7F:
ec6d2bb8
KH
1853 if (COMPOSING_P (coding) && coding->composition_rule_follows)
1854 {
1855 DECODE_COMPOSITION_RULE (c1);
b73bfc1c 1856 continue;
ec6d2bb8
KH
1857 }
1858 if (charset0 < 0 || CHARSET_CHARS (charset0) == 94)
4ed46869
KH
1859 {
1860 /* This is SPACE or DEL. */
b73bfc1c 1861 charset = CHARSET_ASCII;
4ed46869
KH
1862 break;
1863 }
1864 /* This is a graphic character, we fall down ... */
1865
1866 case ISO_graphic_plane_0:
ec6d2bb8 1867 if (COMPOSING_P (coding) && coding->composition_rule_follows)
b73bfc1c
KH
1868 {
1869 DECODE_COMPOSITION_RULE (c1);
1870 continue;
1871 }
1872 charset = charset0;
4ed46869
KH
1873 break;
1874
1875 case ISO_0xA0_or_0xFF:
d46c5b12
KH
1876 if (charset1 < 0 || CHARSET_CHARS (charset1) == 94
1877 || coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
fb88bf2d 1878 goto label_invalid_code;
4ed46869
KH
1879 /* This is a graphic character, we fall down ... */
1880
1881 case ISO_graphic_plane_1:
b73bfc1c 1882 if (charset1 < 0)
fb88bf2d 1883 goto label_invalid_code;
b73bfc1c 1884 charset = charset1;
4ed46869
KH
1885 break;
1886
b73bfc1c 1887 case ISO_control_0:
ec6d2bb8
KH
1888 if (COMPOSING_P (coding))
1889 DECODE_COMPOSITION_END ('1');
1890
4ed46869
KH
1891 /* All ISO2022 control characters in this class have the
1892 same representation in Emacs internal format. */
d46c5b12
KH
1893 if (c1 == '\n'
1894 && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1895 && (coding->eol_type == CODING_EOL_CR
1896 || coding->eol_type == CODING_EOL_CRLF))
1897 {
b73bfc1c
KH
1898 coding->result = CODING_FINISH_INCONSISTENT_EOL;
1899 goto label_end_of_loop;
d46c5b12 1900 }
b73bfc1c 1901 charset = CHARSET_ASCII;
4ed46869
KH
1902 break;
1903
b73bfc1c
KH
1904 case ISO_control_1:
1905 if (COMPOSING_P (coding))
1906 DECODE_COMPOSITION_END ('1');
1907 goto label_invalid_code;
1908
4ed46869 1909 case ISO_carriage_return:
ec6d2bb8
KH
1910 if (COMPOSING_P (coding))
1911 DECODE_COMPOSITION_END ('1');
1912
4ed46869 1913 if (coding->eol_type == CODING_EOL_CR)
b73bfc1c 1914 c1 = '\n';
4ed46869
KH
1915 else if (coding->eol_type == CODING_EOL_CRLF)
1916 {
1917 ONE_MORE_BYTE (c1);
b73bfc1c 1918 if (c1 != ISO_CODE_LF)
4ed46869
KH
1919 {
1920 src--;
b73bfc1c 1921 c1 = '\r';
4ed46869
KH
1922 }
1923 }
b73bfc1c 1924 charset = CHARSET_ASCII;
4ed46869
KH
1925 break;
1926
1927 case ISO_shift_out:
d46c5b12
KH
1928 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1929 || CODING_SPEC_ISO_DESIGNATION (coding, 1) < 0)
1930 goto label_invalid_code;
4ed46869
KH
1931 CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
1932 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
b73bfc1c 1933 continue;
4ed46869
KH
1934
1935 case ISO_shift_in:
d46c5b12
KH
1936 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
1937 goto label_invalid_code;
4ed46869
KH
1938 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
1939 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
b73bfc1c 1940 continue;
4ed46869
KH
1941
1942 case ISO_single_shift_2_7:
1943 case ISO_single_shift_2:
d46c5b12
KH
1944 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1945 goto label_invalid_code;
4ed46869
KH
1946 /* SS2 is handled as an escape sequence of ESC 'N' */
1947 c1 = 'N';
1948 goto label_escape_sequence;
1949
1950 case ISO_single_shift_3:
d46c5b12
KH
1951 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1952 goto label_invalid_code;
4ed46869
KH
1953 /* SS2 is handled as an escape sequence of ESC 'O' */
1954 c1 = 'O';
1955 goto label_escape_sequence;
1956
1957 case ISO_control_sequence_introducer:
1958 /* CSI is handled as an escape sequence of ESC '[' ... */
1959 c1 = '[';
1960 goto label_escape_sequence;
1961
1962 case ISO_escape:
1963 ONE_MORE_BYTE (c1);
1964 label_escape_sequence:
1965 /* Escape sequences handled by Emacs are invocation,
1966 designation, direction specification, and character
1967 composition specification. */
1968 switch (c1)
1969 {
1970 case '&': /* revision of following character set */
1971 ONE_MORE_BYTE (c1);
1972 if (!(c1 >= '@' && c1 <= '~'))
d46c5b12 1973 goto label_invalid_code;
4ed46869
KH
1974 ONE_MORE_BYTE (c1);
1975 if (c1 != ISO_CODE_ESC)
d46c5b12 1976 goto label_invalid_code;
4ed46869
KH
1977 ONE_MORE_BYTE (c1);
1978 goto label_escape_sequence;
1979
1980 case '$': /* designation of 2-byte character set */
d46c5b12
KH
1981 if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
1982 goto label_invalid_code;
4ed46869
KH
1983 ONE_MORE_BYTE (c1);
1984 if (c1 >= '@' && c1 <= 'B')
1985 { /* designation of JISX0208.1978, GB2312.1980,
88993dfd 1986 or JISX0208.1980 */
4ed46869
KH
1987 DECODE_DESIGNATION (0, 2, 94, c1);
1988 }
1989 else if (c1 >= 0x28 && c1 <= 0x2B)
1990 { /* designation of DIMENSION2_CHARS94 character set */
1991 ONE_MORE_BYTE (c2);
1992 DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
1993 }
1994 else if (c1 >= 0x2C && c1 <= 0x2F)
1995 { /* designation of DIMENSION2_CHARS96 character set */
1996 ONE_MORE_BYTE (c2);
1997 DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
1998 }
1999 else
d46c5b12 2000 goto label_invalid_code;
b73bfc1c
KH
2001 /* We must update these variables now. */
2002 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
2003 charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
2004 continue;
4ed46869
KH
2005
2006 case 'n': /* invocation of locking-shift-2 */
d46c5b12
KH
2007 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
2008 || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
2009 goto label_invalid_code;
4ed46869 2010 CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
e0e989f6 2011 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
b73bfc1c 2012 continue;
4ed46869
KH
2013
2014 case 'o': /* invocation of locking-shift-3 */
d46c5b12
KH
2015 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
2016 || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
2017 goto label_invalid_code;
4ed46869 2018 CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
e0e989f6 2019 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
b73bfc1c 2020 continue;
4ed46869
KH
2021
2022 case 'N': /* invocation of single-shift-2 */
d46c5b12
KH
2023 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2024 || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
2025 goto label_invalid_code;
4ed46869 2026 charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
b73bfc1c 2027 ONE_MORE_BYTE (c1);
e7046a18
KH
2028 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
2029 goto label_invalid_code;
4ed46869
KH
2030 break;
2031
2032 case 'O': /* invocation of single-shift-3 */
d46c5b12
KH
2033 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2034 || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
2035 goto label_invalid_code;
4ed46869 2036 charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
b73bfc1c 2037 ONE_MORE_BYTE (c1);
e7046a18
KH
2038 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
2039 goto label_invalid_code;
4ed46869
KH
2040 break;
2041
ec6d2bb8
KH
2042 case '0': case '2': case '3': case '4': /* start composition */
2043 DECODE_COMPOSITION_START (c1);
b73bfc1c 2044 continue;
4ed46869 2045
ec6d2bb8
KH
2046 case '1': /* end composition */
2047 DECODE_COMPOSITION_END (c1);
b73bfc1c 2048 continue;
4ed46869
KH
2049
2050 case '[': /* specification of direction */
d46c5b12
KH
2051 if (coding->flags & CODING_FLAG_ISO_NO_DIRECTION)
2052 goto label_invalid_code;
4ed46869 2053 /* For the moment, nested direction is not supported.
d46c5b12 2054 So, `coding->mode & CODING_MODE_DIRECTION' zero means
8ca3766a 2055 left-to-right, and nonzero means right-to-left. */
4ed46869
KH
2056 ONE_MORE_BYTE (c1);
2057 switch (c1)
2058 {
2059 case ']': /* end of the current direction */
d46c5b12 2060 coding->mode &= ~CODING_MODE_DIRECTION;
4ed46869
KH
2061
2062 case '0': /* end of the current direction */
2063 case '1': /* start of left-to-right direction */
2064 ONE_MORE_BYTE (c1);
2065 if (c1 == ']')
d46c5b12 2066 coding->mode &= ~CODING_MODE_DIRECTION;
4ed46869 2067 else
d46c5b12 2068 goto label_invalid_code;
4ed46869
KH
2069 break;
2070
2071 case '2': /* start of right-to-left direction */
2072 ONE_MORE_BYTE (c1);
2073 if (c1 == ']')
d46c5b12 2074 coding->mode |= CODING_MODE_DIRECTION;
4ed46869 2075 else
d46c5b12 2076 goto label_invalid_code;
4ed46869
KH
2077 break;
2078
2079 default:
d46c5b12 2080 goto label_invalid_code;
4ed46869 2081 }
b73bfc1c 2082 continue;
4ed46869 2083
103e0180
KH
2084 case '%':
2085 if (COMPOSING_P (coding))
2086 DECODE_COMPOSITION_END ('1');
2087 ONE_MORE_BYTE (c1);
2088 if (c1 == '/')
2089 {
2090 /* CTEXT extended segment:
2091 ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
2092 We keep these bytes as is for the moment.
2093 They may be decoded by post-read-conversion. */
2094 int dim, M, L;
2095 int size, required;
2096 int produced_chars;
43e4a82f 2097
103e0180
KH
2098 ONE_MORE_BYTE (dim);
2099 ONE_MORE_BYTE (M);
2100 ONE_MORE_BYTE (L);
2101 size = ((M - 128) * 128) + (L - 128);
2102 required = 8 + size * 2;
2103 if (dst + required > (dst_bytes ? dst_end : src))
2104 goto label_end_of_loop;
2105 *dst++ = ISO_CODE_ESC;
2106 *dst++ = '%';
2107 *dst++ = '/';
2108 *dst++ = dim;
2109 produced_chars = 4;
2110 dst += CHAR_STRING (M, dst), produced_chars++;
2111 dst += CHAR_STRING (L, dst), produced_chars++;
2112 while (size-- > 0)
2113 {
2114 ONE_MORE_BYTE (c1);
2115 dst += CHAR_STRING (c1, dst), produced_chars++;
2116 }
2117 coding->produced_char += produced_chars;
2118 }
2119 else if (c1 == 'G')
2120 {
2121 unsigned char *d = dst;
2122 int produced_chars;
2123
2124 /* XFree86 extension for embedding UTF-8 in CTEXT:
2125 ESC % G --UTF-8-BYTES-- ESC % @
2126 We keep these bytes as is for the moment.
2127 They may be decoded by post-read-conversion. */
2128 if (d + 6 > (dst_bytes ? dst_end : src))
2129 goto label_end_of_loop;
2130 *d++ = ISO_CODE_ESC;
2131 *d++ = '%';
2132 *d++ = 'G';
2133 produced_chars = 3;
2134 while (d + 1 < (dst_bytes ? dst_end : src))
2135 {
2136 ONE_MORE_BYTE (c1);
2137 if (c1 == ISO_CODE_ESC
2138 && src + 1 < src_end
2139 && src[0] == '%'
2140 && src[1] == '@')
47dc91ad
KH
2141 {
2142 src += 2;
2143 break;
2144 }
103e0180
KH
2145 d += CHAR_STRING (c1, d), produced_chars++;
2146 }
2147 if (d + 3 > (dst_bytes ? dst_end : src))
2148 goto label_end_of_loop;
2149 *d++ = ISO_CODE_ESC;
2150 *d++ = '%';
2151 *d++ = '@';
2152 dst = d;
2153 coding->produced_char += produced_chars + 3;
2154 }
2155 else
2156 goto label_invalid_code;
2157 continue;
2158
4ed46869 2159 default:
d46c5b12
KH
2160 if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
2161 goto label_invalid_code;
4ed46869
KH
2162 if (c1 >= 0x28 && c1 <= 0x2B)
2163 { /* designation of DIMENSION1_CHARS94 character set */
2164 ONE_MORE_BYTE (c2);
2165 DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
2166 }
2167 else if (c1 >= 0x2C && c1 <= 0x2F)
2168 { /* designation of DIMENSION1_CHARS96 character set */
2169 ONE_MORE_BYTE (c2);
2170 DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
2171 }
2172 else
b73bfc1c
KH
2173 goto label_invalid_code;
2174 /* We must update these variables now. */
2175 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
2176 charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
2177 continue;
4ed46869 2178 }
b73bfc1c 2179 }
4ed46869 2180
b73bfc1c
KH
2181 /* Now we know CHARSET and 1st position code C1 of a character.
2182 Produce a multibyte sequence for that character while getting
2183 2nd position code C2 if necessary. */
2184 if (CHARSET_DIMENSION (charset) == 2)
2185 {
2186 ONE_MORE_BYTE (c2);
2187 if (c1 < 0x80 ? c2 < 0x20 || c2 >= 0x80 : c2 < 0xA0)
2188 /* C2 is not in a valid range. */
2189 goto label_invalid_code;
4ed46869 2190 }
b73bfc1c
KH
2191 c = DECODE_ISO_CHARACTER (charset, c1, c2);
2192 EMIT_CHAR (c);
4ed46869
KH
2193 continue;
2194
b73bfc1c
KH
2195 label_invalid_code:
2196 coding->errors++;
2197 if (COMPOSING_P (coding))
2198 DECODE_COMPOSITION_END ('1');
4ed46869 2199 src = src_base;
b73bfc1c
KH
2200 c = *src++;
2201 EMIT_CHAR (c);
4ed46869 2202 }
fb88bf2d 2203
b73bfc1c
KH
2204 label_end_of_loop:
2205 coding->consumed = coding->consumed_char = src_base - source;
d46c5b12 2206 coding->produced = dst - destination;
b73bfc1c 2207 return;
4ed46869
KH
2208}
2209
b73bfc1c 2210
f4dee582 2211/* ISO2022 encoding stuff. */
4ed46869
KH
2212
2213/*
f4dee582 2214 It is not enough to say just "ISO2022" on encoding, we have to
cfb43547 2215 specify more details. In Emacs, each ISO2022 coding system
4ed46869 2216 variant has the following specifications:
8ca3766a 2217 1. Initial designation to G0 through G3.
4ed46869
KH
2218 2. Allows short-form designation?
2219 3. ASCII should be designated to G0 before control characters?
2220 4. ASCII should be designated to G0 at end of line?
2221 5. 7-bit environment or 8-bit environment?
2222 6. Use locking-shift?
2223 7. Use Single-shift?
2224 And the following two are only for Japanese:
2225 8. Use ASCII in place of JIS0201-1976-Roman?
2226 9. Use JISX0208-1983 in place of JISX0208-1978?
2227 These specifications are encoded in `coding->flags' as flag bits
2228 defined by macros CODING_FLAG_ISO_XXX. See `coding.h' for more
f4dee582 2229 details.
4ed46869
KH
2230*/
2231
/* Produce codes (escape sequence) for designating CHARSET to graphic
   register REG at DST, and increment DST.  If <final-char> of CHARSET is
   '@', 'A', or 'B' and the coding system CODING allows, produce
   designation sequence of short-form.

   When the revision number recorded for CHARSET is below 255, the
   sequence is preceded by ESC & and the revision code.  The
   designation is also recorded in CODING.  Uses the variable `dst'
   of the expansion site.  */

#define ENCODE_DESIGNATION(charset, reg, coding)                           \
  do {                                                                     \
    unsigned char final_char = CHARSET_ISO_FINAL_CHAR (charset);           \
    int revision = CODING_SPEC_ISO_REVISION_NUMBER(coding, charset);       \
    /* Intermediate characters indexed by graphic register, for a         \
       94-char set or a 96-char set.  */                                   \
    const char *intermediates                                              \
      = CHARSET_CHARS (charset) == 94 ? "()*+" : ",-./";                   \
    /* Nonzero if the intermediate character is to be left out.  */        \
    int short_form = 0;                                                    \
                                                                           \
    if (revision < 255)                                                    \
      {                                                                    \
        *dst++ = ISO_CODE_ESC;                                             \
        *dst++ = '&';                                                      \
        *dst++ = '@' + revision;                                           \
      }                                                                    \
    *dst++ = ISO_CODE_ESC;                                                 \
    if (CHARSET_DIMENSION (charset) != 1)                                  \
      {                                                                    \
        *dst++ = '$';                                                      \
        short_form = (CHARSET_CHARS (charset) == 94                        \
                      && (coding->flags & CODING_FLAG_ISO_SHORT_FORM)      \
                      && reg == 0                                          \
                      && final_char >= '@' && final_char <= 'B');          \
      }                                                                    \
    if (! short_form)                                                      \
      *dst++ = (unsigned char) (intermediates[reg]);                       \
    *dst++ = final_char;                                                   \
    CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;                   \
  } while (0)
2274
2275/* The following two macros produce codes (control character or escape
2276 sequence) for ISO2022 single-shift functions (single-shift-2 and
2277 single-shift-3). */
2278
/* Produce the code for single-shift-2 at DST: ESC N when CODING is
   flagged as 7-bit, ISO_CODE_SS2 otherwise.  Then mark CODING as
   single-shifting.  */
#define ENCODE_SINGLE_SHIFT_2                                   \
  do {                                                          \
    if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS))         \
      *dst++ = ISO_CODE_SS2;                                    \
    else                                                        \
      {                                                         \
        *dst++ = ISO_CODE_ESC;                                  \
        *dst++ = 'N';                                           \
      }                                                         \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;               \
  } while (0)
2287
fb88bf2d
KH
/* Produce the code for single-shift-3 at DST: ESC O when CODING is
   flagged as 7-bit, ISO_CODE_SS3 otherwise.  Then mark CODING as
   single-shifting.  */
#define ENCODE_SINGLE_SHIFT_3                                   \
  do {                                                          \
    if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS))         \
      *dst++ = ISO_CODE_SS3;                                    \
    else                                                        \
      {                                                         \
        *dst++ = ISO_CODE_ESC;                                  \
        *dst++ = 'O';                                           \
      }                                                         \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;               \
  } while (0)
2296
2297/* The following four macros produce codes (control character or
2298 escape sequence) for ISO2022 locking-shift functions (shift-in,
2299 shift-out, locking-shift-2, and locking-shift-3). */
2300
b73bfc1c
KH
/* Produce the shift-in code (ISO_CODE_SI) at DST and record that
   graphic register 0 is invoked to graphic plane 0.  */
#define ENCODE_SHIFT_IN                                 \
  do {                                                  \
    dst[0] = ISO_CODE_SI;                               \
    dst += 1;                                           \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;         \
  } while (0)
2306
b73bfc1c
KH
/* Produce the shift-out code (ISO_CODE_SO) at DST and record that
   graphic register 1 is invoked to graphic plane 0.  */
#define ENCODE_SHIFT_OUT                                \
  do {                                                  \
    dst[0] = ISO_CODE_SO;                               \
    dst += 1;                                           \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;         \
  } while (0)
2312
/* Produce the locking-shift-2 sequence (ESC n) at DST and record that
   graphic register 2 is invoked to graphic plane 0.  */
#define ENCODE_LOCKING_SHIFT_2                          \
  do {                                                  \
    *dst++ = ISO_CODE_ESC;                              \
    *dst++ = 'n';                                       \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;         \
  } while (0)
2318
b73bfc1c
KH
/* Produce the locking-shift-3 sequence (ESC o) at DST and record that
   graphic register 3 is invoked to graphic plane 0.  */
#define ENCODE_LOCKING_SHIFT_3                          \
  do {                                                  \
    *dst++ = ISO_CODE_ESC;                              \
    *dst++ = 'o';                                       \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;         \
  } while (0)
2324
f4dee582
RS
/* Produce codes for a DIMENSION1 character whose character set is
   CHARSET and whose position-code is C1.  Designation and invocation
   sequences are also produced in advance if necessary.

   Uses the variables `coding' and `dst' of the expansion site.  The
   body is a loop: when CHARSET is invoked to neither graphic plane,
   encode_invocation_designation is called and the tests are repeated.
   CHARSET and C1 may be evaluated more than once, so they must not
   have side effects.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                       \
  do {                                                                     \
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                          \
      {                                                                    \
        /* A single-shift code was just produced; this character          \
           uses it up.  */                                                 \
        if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                    \
          *dst++ = (c1) & 0x7F;                                            \
        else                                                               \
          *dst++ = (c1) | 0x80;                                            \
        CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                      \
        break;                                                             \
      }                                                                    \
    else if ((charset) == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))       \
      {                                                                    \
        *dst++ = (c1) & 0x7F;                                              \
        break;                                                             \
      }                                                                    \
    else if ((charset) == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))       \
      {                                                                    \
        *dst++ = (c1) | 0x80;                                              \
        break;                                                             \
      }                                                                    \
    else                                                                   \
      /* Since CHARSET is not yet invoked to any graphic planes, we        \
         must invoke it, or, at first, designate it to some graphic        \
         register.  Then repeat the loop to actually produce the           \
         character.  */                                                    \
      dst = encode_invocation_designation (charset, coding, dst);          \
  } while (1)
2357
f4dee582
RS
/* Produce codes for a DIMENSION2 character whose character set is
   CHARSET and whose position-codes are C1 and C2.  Designation and
   invocation codes are also produced in advance if necessary.

   Uses the variables `coding' and `dst' of the expansion site.  The
   body is a loop: when CHARSET is invoked to neither graphic plane,
   encode_invocation_designation is called and the tests are repeated.
   CHARSET, C1 and C2 may be evaluated more than once, so they must
   not have side effects.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                   \
  do {                                                                     \
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                          \
      {                                                                    \
        /* A single-shift code was just produced; this character          \
           uses it up.  */                                                 \
        if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                    \
          *dst++ = (c1) & 0x7F, *dst++ = (c2) & 0x7F;                      \
        else                                                               \
          *dst++ = (c1) | 0x80, *dst++ = (c2) | 0x80;                      \
        CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                      \
        break;                                                             \
      }                                                                    \
    else if ((charset) == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))       \
      {                                                                    \
        *dst++ = (c1) & 0x7F, *dst++ = (c2) & 0x7F;                        \
        break;                                                             \
      }                                                                    \
    else if ((charset) == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))       \
      {                                                                    \
        *dst++ = (c1) | 0x80, *dst++ = (c2) | 0x80;                        \
        break;                                                             \
      }                                                                    \
    else                                                                   \
      /* Since CHARSET is not yet invoked to any graphic planes, we        \
         must invoke it, or, at first, designate it to some graphic        \
         register.  Then repeat the loop to actually produce the           \
         character.  */                                                    \
      dst = encode_invocation_designation (charset, coding, dst);          \
  } while (1)
2390
05e6f5dc
KH
/* Produce codes at DST for character C.

   C is split into its charset and position codes.  If the charset is
   not defined, the position codes are stored as they are (the second
   one only when it is not negative).  Otherwise the character is
   encoded as a DIMENSION1 or DIMENSION2 character; before that,
   ASCII is replaced by charset_latin_jisx0201 when CODING has
   CODING_FLAG_ISO_USE_ROMAN, and charset_jisx0208 by
   charset_jisx0208_1978 when it has CODING_FLAG_ISO_USE_OLDJIS.
   Uses the variables `coding' and `dst' of the expansion site.  */

#define ENCODE_ISO_CHARACTER(c)                                         \
  do {                                                                  \
    int charset, c1, c2;                                                \
                                                                        \
    SPLIT_CHAR (c, charset, c1, c2);                                    \
    if (! CHARSET_DEFINED_P (charset))                                  \
      {                                                                 \
        *dst++ = c1;                                                    \
        if (c2 >= 0)                                                    \
          *dst++ = c2;                                                  \
      }                                                                 \
    else if (CHARSET_DIMENSION (charset) == 1)                          \
      {                                                                 \
        if ((coding->flags & CODING_FLAG_ISO_USE_ROMAN)                 \
            && charset == CHARSET_ASCII)                                \
          charset = charset_latin_jisx0201;                             \
        ENCODE_ISO_CHARACTER_DIMENSION1 (charset, c1);                  \
      }                                                                 \
    else                                                                \
      {                                                                 \
        if ((coding->flags & CODING_FLAG_ISO_USE_OLDJIS)                \
            && charset == charset_jisx0208)                             \
          charset = charset_jisx0208_1978;                              \
        ENCODE_ISO_CHARACTER_DIMENSION2 (charset, c1, c2);              \
      }                                                                 \
  } while (0)
2420
2421
/* Instead of encoding character C, produce
   CODING_REPLACEMENT_CHARACTER once, or twice when the width of C's
   charset is greater than 1.  */

#define ENCODE_UNSAFE_CHARACTER(c)                                      \
  do {                                                                  \
    int marks = (CHARSET_WIDTH (CHAR_CHARSET (c)) > 1) ? 2 : 1;         \
    do                                                                  \
      {                                                                 \
        ENCODE_ISO_CHARACTER (CODING_REPLACEMENT_CHARACTER);            \
      }                                                                 \
    while (--marks > 0);                                                \
  } while (0)
bdd9fb48 2430
05e6f5dc 2431
4ed46869
KH
2432/* Produce designation and invocation codes at a place pointed by DST
2433 to use CHARSET. The element `spec.iso2022' of *CODING is updated.
2434 Return new DST. */
2435
2436unsigned char *
2437encode_invocation_designation (charset, coding, dst)
2438 int charset;
2439 struct coding_system *coding;
2440 unsigned char *dst;
2441{
2442 int reg; /* graphic register number */
2443
2444 /* At first, check designations. */
2445 for (reg = 0; reg < 4; reg++)
2446 if (charset == CODING_SPEC_ISO_DESIGNATION (coding, reg))
2447 break;
2448
2449 if (reg >= 4)
2450 {
2451 /* CHARSET is not yet designated to any graphic registers. */
2452 /* At first check the requested designation. */
2453 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1ba9e4ab
KH
2454 if (reg == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
2455 /* Since CHARSET requests no special designation, designate it
2456 to graphic register 0. */
4ed46869
KH
2457 reg = 0;
2458
2459 ENCODE_DESIGNATION (charset, reg, coding);
2460 }
2461
2462 if (CODING_SPEC_ISO_INVOCATION (coding, 0) != reg
2463 && CODING_SPEC_ISO_INVOCATION (coding, 1) != reg)
2464 {
2465 /* Since the graphic register REG is not invoked to any graphic
2466 planes, invoke it to graphic plane 0. */
2467 switch (reg)
2468 {
2469 case 0: /* graphic register 0 */
2470 ENCODE_SHIFT_IN;
2471 break;
2472
2473 case 1: /* graphic register 1 */
2474 ENCODE_SHIFT_OUT;
2475 break;
2476
2477 case 2: /* graphic register 2 */
2478 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2479 ENCODE_SINGLE_SHIFT_2;
2480 else
2481 ENCODE_LOCKING_SHIFT_2;
2482 break;
2483
2484 case 3: /* graphic register 3 */
2485 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2486 ENCODE_SINGLE_SHIFT_3;
2487 else
2488 ENCODE_LOCKING_SHIFT_3;
2489 break;
2490 }
2491 }
b73bfc1c 2492
4ed46869
KH
2493 return dst;
2494}
2495
ec6d2bb8
KH
/* Produce 2-byte codes for encoded composition rule RULE at DST:
   32 + 81 + GREF followed by 32 + NREF, where GREF and NREF are the
   two values COMPOSITION_DECODE_RULE extracts from RULE.  */

#define ENCODE_COMPOSITION_RULE(rule)                   \
  do {                                                  \
    int gref, nref;                                     \
    COMPOSITION_DECODE_RULE (rule, gref, nref);         \
    dst[0] = 32 + 81 + gref;                            \
    dst[1] = 32 + nref;                                 \
    dst += 2;                                           \
  } while (0)
2505
/* Produce codes for indicating the start of a composition sequence
   (ESC 0, ESC 3, or ESC 4).  DATA points to an array of integers
   which specify information about the composition.  See the comment
   in coding.h for the format of DATA.

   DATA[3] becomes the current composition method of CODING.  For a
   method other than COMPOSITION_RELATIVE, coding->cmp_data_index is
   set to four elements past coding->cmp_data_start and
   coding->composition_rule_follows is cleared.  */

#define ENCODE_COMPOSITION_START(coding, data)                          \
  do {                                                                  \
    coding->composing = data[3];                                        \
    *dst++ = ISO_CODE_ESC;                                              \
    if (coding->composing != COMPOSITION_RELATIVE)                      \
      {                                                                 \
        if (coding->composing == COMPOSITION_WITH_ALTCHARS)             \
          *dst++ = '3';                                                 \
        else                                                            \
          *dst++ = '4';                                                 \
        coding->composition_rule_follows = 0;                           \
        coding->cmp_data_index = coding->cmp_data_start + 4;            \
      }                                                                 \
    else                                                                \
      *dst++ = '0';                                                     \
  } while (0)
2525
/* Produce codes for indicating the end of the current composition
   (ESC 1), advance coding->cmp_data_start by DATA[0], and reset the
   composing state to COMPOSITION_NO.  When the current composition
   data block is used up and another block follows, move on to that
   block.  */

#define ENCODE_COMPOSITION_END(coding, data)                            \
  do {                                                                  \
    dst[0] = ISO_CODE_ESC;                                              \
    dst[1] = '1';                                                       \
    dst += 2;                                                           \
    coding->composing = COMPOSITION_NO;                                 \
    coding->cmp_data_start += data[0];                                  \
    if (coding->cmp_data->next                                          \
        && coding->cmp_data_start == coding->cmp_data->used)            \
      {                                                                 \
        coding->cmp_data_start = 0;                                     \
        coding->cmp_data = coding->cmp_data->next;                      \
      }                                                                 \
  } while (0)
2541
/* Produce composition start sequence ESC 0.  Here, this sequence
   doesn't mean the start of a new composition but means that we have
   just produced components (alternate chars and composition rules) of
   the composition and the actual text follows in SRC.  The composing
   state of CODING becomes COMPOSITION_RELATIVE.  */

#define ENCODE_COMPOSITION_FAKE_START(coding)           \
  do {                                                  \
    coding->composing = COMPOSITION_RELATIVE;           \
    dst[0] = ISO_CODE_ESC;                              \
    dst[1] = '0';                                       \
    dst += 2;                                           \
  } while (0)
4ed46869
KH
2553
2554/* The following three macros produce codes for indicating direction
2555 of text. */
b73bfc1c
KH
/* Produce a control sequence introducer at DST: ESC [ when CODING
   has CODING_FLAG_ISO_SEVEN_BITS set, ISO_CODE_CSI otherwise.  The
   flag is tested as a bit, because coding->flags carries other flag
   bits as well.  Uses the variables `coding' and `dst' of the
   expansion site.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER              \
  do {                                                  \
    if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)     \
      *dst++ = ISO_CODE_ESC, *dst++ = '[';              \
    else                                                \
      *dst++ = ISO_CODE_CSI;                            \
  } while (0)
2563
/* Produce the codes announcing right-to-left text at DST: a control
   sequence introducer followed by `2' and `]'.
   ENCODE_CONTROL_SEQUENCE_INTRODUCER is an object-like macro that
   expands to a statement, so it is invoked without arguments inside
   a statement block.  */
#define ENCODE_DIRECTION_R2L                    \
  do {                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
    *dst++ = '2';                               \
    *dst++ = ']';                               \
  } while (0)
4ed46869
KH
2566
/* Produce the codes announcing the return to left-to-right text at
   DST: a control sequence introducer followed by `0' and `]'.
   ENCODE_CONTROL_SEQUENCE_INTRODUCER is an object-like macro that
   expands to a statement, so it is invoked without arguments inside
   a statement block.  */
#define ENCODE_DIRECTION_L2R                    \
  do {                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
    *dst++ = '0';                               \
    *dst++ = ']';                               \
  } while (0)
4ed46869
KH
2569
/* Produce codes for designation and invocation to reset the graphic
   planes and registers to initial state.

   A shift-in code is produced unless graphic register 0 is invoked
   to graphic plane 0.  Then every graphic register that has an
   initial designation (a value not less than 0) different from its
   current one is designated anew.  Uses the variables `coding' and
   `dst' of the expansion site.  */
#define ENCODE_RESET_PLANE_AND_REGISTER                                     \
  do {                                                                      \
    int reg = 0;                                                            \
    if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0)                        \
      ENCODE_SHIFT_IN;                                                      \
    while (reg < 4)                                                         \
      {                                                                     \
        int initial = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg);    \
        if (initial >= 0                                                    \
            && initial != CODING_SPEC_ISO_DESIGNATION (coding, reg))        \
          ENCODE_DESIGNATION (initial, reg, coding);                        \
        reg++;                                                              \
      }                                                                     \
  } while (0)
2584
bdd9fb48 2585/* Produce designation sequences of charsets in the line started from
b73bfc1c 2586 SRC to a place pointed by DST, and return updated DST.
bdd9fb48
KH
2587
2588 If the current block ends before any end-of-line, we may fail to
d46c5b12
KH
2589 find all the necessary designations. */
2590
b73bfc1c
KH
2591static unsigned char *
2592encode_designation_at_bol (coding, translation_table, src, src_end, dst)
e0e989f6 2593 struct coding_system *coding;
b73bfc1c
KH
2594 Lisp_Object translation_table;
2595 unsigned char *src, *src_end, *dst;
e0e989f6 2596{
bdd9fb48
KH
2597 int charset, c, found = 0, reg;
2598 /* Table of charsets to be designated to each graphic register. */
2599 int r[4];
bdd9fb48
KH
2600
2601 for (reg = 0; reg < 4; reg++)
2602 r[reg] = -1;
2603
b73bfc1c 2604 while (found < 4)
e0e989f6 2605 {
b73bfc1c
KH
2606 ONE_MORE_CHAR (c);
2607 if (c == '\n')
2608 break;
93dec019 2609
b73bfc1c 2610 charset = CHAR_CHARSET (c);
e0e989f6 2611 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
d46c5b12 2612 if (reg != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION && r[reg] < 0)
bdd9fb48
KH
2613 {
2614 found++;
2615 r[reg] = charset;
2616 }
bdd9fb48
KH
2617 }
2618
b73bfc1c 2619 label_end_of_loop:
bdd9fb48
KH
2620 if (found)
2621 {
2622 for (reg = 0; reg < 4; reg++)
2623 if (r[reg] >= 0
2624 && CODING_SPEC_ISO_DESIGNATION (coding, reg) != r[reg])
2625 ENCODE_DESIGNATION (r[reg], reg, coding);
e0e989f6 2626 }
b73bfc1c
KH
2627
2628 return dst;
e0e989f6
KH
2629}
2630
4ed46869
KH
2631/* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
2632
b73bfc1c 2633static void
d46c5b12 2634encode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
4ed46869
KH
2635 struct coding_system *coding;
2636 unsigned char *source, *destination;
2637 int src_bytes, dst_bytes;
4ed46869
KH
2638{
2639 unsigned char *src = source;
2640 unsigned char *src_end = source + src_bytes;
2641 unsigned char *dst = destination;
2642 unsigned char *dst_end = destination + dst_bytes;
b73bfc1c 2643 /* Since the maximum bytes produced by each loop is 20, we subtract 19
4ed46869
KH
2644 from DST_END to assure overflow checking is necessary only at the
2645 head of loop. */
b73bfc1c
KH
2646 unsigned char *adjusted_dst_end = dst_end - 19;
2647 /* SRC_BASE remembers the start position in source in each loop.
2648 The loop will be exited when there's not enough source text to
2649 analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
2650 there's not enough destination area to produce encoded codes
2651 (within macro EMIT_BYTES). */
2652 unsigned char *src_base;
2653 int c;
2654 Lisp_Object translation_table;
05e6f5dc
KH
2655 Lisp_Object safe_chars;
2656
0eecad43
KH
2657 if (coding->flags & CODING_FLAG_ISO_SAFE)
2658 coding->mode |= CODING_MODE_INHIBIT_UNENCODABLE_CHAR;
2659
6b89e3aa 2660 safe_chars = coding_safe_chars (coding->symbol);
bdd9fb48 2661
b73bfc1c
KH
2662 if (NILP (Venable_character_translation))
2663 translation_table = Qnil;
2664 else
2665 {
2666 translation_table = coding->translation_table_for_encode;
2667 if (NILP (translation_table))
2668 translation_table = Vstandard_translation_table_for_encode;
2669 }
4ed46869 2670
d46c5b12 2671 coding->consumed_char = 0;
b73bfc1c
KH
2672 coding->errors = 0;
2673 while (1)
4ed46869 2674 {
b73bfc1c
KH
2675 src_base = src;
2676
2677 if (dst >= (dst_bytes ? adjusted_dst_end : (src - 19)))
2678 {
2679 coding->result = CODING_FINISH_INSUFFICIENT_DST;
2680 break;
2681 }
4ed46869 2682
e0e989f6
KH
2683 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL
2684 && CODING_SPEC_ISO_BOL (coding))
2685 {
bdd9fb48 2686 /* We have to produce designation sequences if any now. */
b73bfc1c
KH
2687 dst = encode_designation_at_bol (coding, translation_table,
2688 src, src_end, dst);
e0e989f6
KH
2689 CODING_SPEC_ISO_BOL (coding) = 0;
2690 }
2691
ec6d2bb8
KH
2692 /* Check composition start and end. */
2693 if (coding->composing != COMPOSITION_DISABLED
2694 && coding->cmp_data_start < coding->cmp_data->used)
4ed46869 2695 {
ec6d2bb8
KH
2696 struct composition_data *cmp_data = coding->cmp_data;
2697 int *data = cmp_data->data + coding->cmp_data_start;
2698 int this_pos = cmp_data->char_offset + coding->consumed_char;
2699
2700 if (coding->composing == COMPOSITION_RELATIVE)
4ed46869 2701 {
ec6d2bb8
KH
2702 if (this_pos == data[2])
2703 {
2704 ENCODE_COMPOSITION_END (coding, data);
2705 cmp_data = coding->cmp_data;
2706 data = cmp_data->data + coding->cmp_data_start;
2707 }
4ed46869 2708 }
ec6d2bb8 2709 else if (COMPOSING_P (coding))
4ed46869 2710 {
ec6d2bb8
KH
2711 /* COMPOSITION_WITH_ALTCHARS or COMPOSITION_WITH_RULE_ALTCHAR */
2712 if (coding->cmp_data_index == coding->cmp_data_start + data[0])
2713 /* We have consumed components of the composition.
8ca3766a 2714 What follows in SRC is the composition's base
ec6d2bb8
KH
2715 text. */
2716 ENCODE_COMPOSITION_FAKE_START (coding);
2717 else
4ed46869 2718 {
ec6d2bb8
KH
2719 int c = cmp_data->data[coding->cmp_data_index++];
2720 if (coding->composition_rule_follows)
2721 {
2722 ENCODE_COMPOSITION_RULE (c);
2723 coding->composition_rule_follows = 0;
2724 }
2725 else
2726 {
0eecad43 2727 if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR
05e6f5dc
KH
2728 && ! CODING_SAFE_CHAR_P (safe_chars, c))
2729 ENCODE_UNSAFE_CHARACTER (c);
2730 else
2731 ENCODE_ISO_CHARACTER (c);
ec6d2bb8
KH
2732 if (coding->composing == COMPOSITION_WITH_RULE_ALTCHARS)
2733 coding->composition_rule_follows = 1;
2734 }
4ed46869
KH
2735 continue;
2736 }
ec6d2bb8
KH
2737 }
2738 if (!COMPOSING_P (coding))
2739 {
2740 if (this_pos == data[1])
4ed46869 2741 {
ec6d2bb8
KH
2742 ENCODE_COMPOSITION_START (coding, data);
2743 continue;
4ed46869 2744 }
4ed46869
KH
2745 }
2746 }
ec6d2bb8 2747
b73bfc1c 2748 ONE_MORE_CHAR (c);
4ed46869 2749
b73bfc1c
KH
2750 /* Now encode the character C. */
2751 if (c < 0x20 || c == 0x7F)
2752 {
2753 if (c == '\r')
19a8d9e0 2754 {
b73bfc1c
KH
2755 if (! (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
2756 {
2757 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
2758 ENCODE_RESET_PLANE_AND_REGISTER;
2759 *dst++ = c;
2760 continue;
2761 }
2762 /* fall down to treat '\r' as '\n' ... */
2763 c = '\n';
19a8d9e0 2764 }
b73bfc1c 2765 if (c == '\n')
19a8d9e0 2766 {
b73bfc1c
KH
2767 if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
2768 ENCODE_RESET_PLANE_AND_REGISTER;
2769 if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL)
2770 bcopy (coding->spec.iso2022.initial_designation,
2771 coding->spec.iso2022.current_designation,
2772 sizeof coding->spec.iso2022.initial_designation);
2773 if (coding->eol_type == CODING_EOL_LF
2774 || coding->eol_type == CODING_EOL_UNDECIDED)
2775 *dst++ = ISO_CODE_LF;
2776 else if (coding->eol_type == CODING_EOL_CRLF)
2777 *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
2778 else
2779 *dst++ = ISO_CODE_CR;
2780 CODING_SPEC_ISO_BOL (coding) = 1;
19a8d9e0 2781 }
93dec019 2782 else
19a8d9e0 2783 {
b73bfc1c
KH
2784 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
2785 ENCODE_RESET_PLANE_AND_REGISTER;
2786 *dst++ = c;
19a8d9e0 2787 }
4ed46869 2788 }
b73bfc1c 2789 else if (ASCII_BYTE_P (c))
05e6f5dc 2790 ENCODE_ISO_CHARACTER (c);
b73bfc1c 2791 else if (SINGLE_BYTE_CHAR_P (c))
88993dfd 2792 {
b73bfc1c
KH
2793 *dst++ = c;
2794 coding->errors++;
88993dfd 2795 }
0eecad43 2796 else if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR
05e6f5dc
KH
2797 && ! CODING_SAFE_CHAR_P (safe_chars, c))
2798 ENCODE_UNSAFE_CHARACTER (c);
b73bfc1c 2799 else
05e6f5dc 2800 ENCODE_ISO_CHARACTER (c);
b73bfc1c
KH
2801
2802 coding->consumed_char++;
84fbb8a0 2803 }
b73bfc1c
KH
2804
2805 label_end_of_loop:
2806 coding->consumed = src_base - source;
d46c5b12 2807 coding->produced = coding->produced_char = dst - destination;
4ed46869
KH
2808}
2809
2810\f
2811/*** 4. SJIS and BIG5 handlers ***/
2812
cfb43547 2813/* Although SJIS and BIG5 are not ISO coding systems, they are used
4ed46869
KH
2814 quite widely. So, for the moment, Emacs supports them in the bare
2815 C code. But, in the future, they may be supported only by CCL. */
2816
2817/* SJIS is a coding system encoding three character sets: ASCII, right
2818 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
2819 as is. A character of charset katakana-jisx0201 is encoded by
2820 "position-code + 0x80". A character of charset japanese-jisx0208
2821 is encoded in 2-byte but two position-codes are divided and shifted
cfb43547 2822 so that it fits in the range below.
4ed46869
KH
2823
2824 --- CODE RANGE of SJIS ---
2825 (character set) (range)
2826 ASCII 0x00 .. 0x7F
682169fe 2827 KATAKANA-JISX0201 0xA1 .. 0xDF
c28a9453 2828 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF
d14d03ac 2829 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
4ed46869
KH
2830 -------------------------------
2831
2832*/
2833
2834/* BIG5 is a coding system encoding two character sets: ASCII and
2835 Big5. An ASCII character is encoded as is. Big5 is a two-byte
cfb43547 2836 character set and is encoded in two bytes.
4ed46869
KH
2837
2838 --- CODE RANGE of BIG5 ---
2839 (character set) (range)
2840 ASCII 0x00 .. 0x7F
2841 Big5 (1st byte) 0xA1 .. 0xFE
2842 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
2843 --------------------------
2844
2845 Since the number of characters in Big5 is larger than maximum
2846 characters in Emacs' charset (96x96), it can't be handled as one
2847 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
2848 and `charset-big5-2'. Both are DIMENSION2 and CHARS94. The former
2849 contains frequently used characters and the latter contains less
2850 frequently used characters. */
2851
2852/* Macros to decode or encode a character of Big5 in BIG5. B1 and B2
2853 are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
f458a8e0 2854 C1 and C2 are the 1st and 2nd position-codes of Emacs' internal
4ed46869
KH
2855 format. CHARSET is `charset_big5_1' or `charset_big5_2'. */
2856
/* Number of Big5 characters which have the same code in 1st byte.
   (94 second bytes in 0xA1..0xFE plus 63 in 0x40..0x7E.)  */
#define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)

/* Convert the BIG5 byte pair B1, B2 into CHARSET (`charset_big5_1'
   or `charset_big5_2') and the position codes C1, C2.  All of B1 and
   B2 are read before C1 or C2 is written, so a caller may pass the
   same variables for (B1, B2) and (C1, C2).  Every argument is
   parenthesized so that expression arguments expand correctly.  */
#define DECODE_BIG5(b1, b2, charset, c1, c2)                            \
  do {                                                                  \
    unsigned int temp                                                   \
      = ((b1) - 0xA1) * BIG5_SAME_ROW                                   \
        + (b2) - ((b2) < 0x7F ? 0x40 : 0x62);                           \
    if ((b1) < 0xC9)                                                    \
      (charset) = charset_big5_1;                                       \
    else                                                                \
      {                                                                 \
        (charset) = charset_big5_2;                                     \
        temp -= (0xC9 - 0xA1) * BIG5_SAME_ROW;                          \
      }                                                                 \
    (c1) = temp / (0xFF - 0xA1) + 0x21;                                 \
    (c2) = temp % (0xFF - 0xA1) + 0x21;                                 \
  } while (0)

/* Inverse of DECODE_BIG5: convert CHARSET and position codes C1, C2
   into the BIG5 byte pair B1, B2.  C1 and C2 are read before B1 or
   B2 is written, so the same variables may be passed for both.  */
#define ENCODE_BIG5(charset, c1, c2, b1, b2)                            \
  do {                                                                  \
    unsigned int temp                                                   \
      = ((c1) - 0x21) * (0xFF - 0xA1) + ((c2) - 0x21);                  \
    if ((charset) == charset_big5_2)                                    \
      temp += BIG5_SAME_ROW * (0xC9 - 0xA1);                            \
    (b1) = temp / BIG5_SAME_ROW + 0xA1;                                 \
    (b2) = temp % BIG5_SAME_ROW;                                        \
    (b2) += (b2) < 0x3F ? 0x40 : 0x62;                                  \
  } while (0)
2884
2885/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2886 Check if a text is encoded in SJIS. If it is, return
2887 CODING_CATEGORY_MASK_SJIS, else return 0. */
2888
0a28aafb
KH
2889static int
2890detect_coding_sjis (src, src_end, multibytep)
4ed46869 2891 unsigned char *src, *src_end;
0a28aafb 2892 int multibytep;
4ed46869 2893{
b73bfc1c
KH
2894 int c;
2895 /* Dummy for ONE_MORE_BYTE. */
2896 struct coding_system dummy_coding;
2897 struct coding_system *coding = &dummy_coding;
4ed46869 2898
b73bfc1c 2899 while (1)
4ed46869 2900 {
0a28aafb 2901 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
682169fe
KH
2902 if (c < 0x80)
2903 continue;
2904 if (c == 0x80 || c == 0xA0 || c > 0xEF)
2905 return 0;
2906 if (c <= 0x9F || c >= 0xE0)
4ed46869 2907 {
682169fe
KH
2908 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2909 if (c < 0x40 || c == 0x7F || c > 0xFC)
4ed46869
KH
2910 return 0;
2911 }
2912 }
b73bfc1c 2913 label_end_of_loop:
4ed46869
KH
2914 return CODING_CATEGORY_MASK_SJIS;
2915}
2916
2917/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2918 Check if a text is encoded in BIG5. If it is, return
2919 CODING_CATEGORY_MASK_BIG5, else return 0. */
2920
0a28aafb
KH
2921static int
2922detect_coding_big5 (src, src_end, multibytep)
4ed46869 2923 unsigned char *src, *src_end;
0a28aafb 2924 int multibytep;
4ed46869 2925{
b73bfc1c
KH
2926 int c;
2927 /* Dummy for ONE_MORE_BYTE. */
2928 struct coding_system dummy_coding;
2929 struct coding_system *coding = &dummy_coding;
4ed46869 2930
b73bfc1c 2931 while (1)
4ed46869 2932 {
0a28aafb 2933 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
682169fe
KH
2934 if (c < 0x80)
2935 continue;
2936 if (c < 0xA1 || c > 0xFE)
2937 return 0;
2938 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2939 if (c < 0x40 || (c > 0x7F && c < 0xA1) || c > 0xFE)
2940 return 0;
4ed46869 2941 }
b73bfc1c 2942 label_end_of_loop:
4ed46869
KH
2943 return CODING_CATEGORY_MASK_BIG5;
2944}
2945
fa42c37f
KH
2946/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2947 Check if a text is encoded in UTF-8. If it is, return
2948 CODING_CATEGORY_MASK_UTF_8, else return 0. */
2949
/* Nonzero if the bits of C selected by MASK equal PREFIX.  */
#define UTF_8_PREFIX_P(c, mask, prefix) (((c) & (mask)) == (prefix))

/* Classification of a single octet C of a UTF-8 sequence.  */
#define UTF_8_1_OCTET_P(c)         ((c) < 0x80)                    /* 0xxxxxxx */
#define UTF_8_EXTRA_OCTET_P(c)     UTF_8_PREFIX_P (c, 0xC0, 0x80)  /* 10xxxxxx */
#define UTF_8_2_OCTET_LEADING_P(c) UTF_8_PREFIX_P (c, 0xE0, 0xC0)  /* 110xxxxx */
#define UTF_8_3_OCTET_LEADING_P(c) UTF_8_PREFIX_P (c, 0xF0, 0xE0)  /* 1110xxxx */
#define UTF_8_4_OCTET_LEADING_P(c) UTF_8_PREFIX_P (c, 0xF8, 0xF0)  /* 11110xxx */
#define UTF_8_5_OCTET_LEADING_P(c) UTF_8_PREFIX_P (c, 0xFC, 0xF8)  /* 111110xx */
#define UTF_8_6_OCTET_LEADING_P(c) UTF_8_PREFIX_P (c, 0xFE, 0xFC)  /* 1111110x */
2957
0a28aafb
KH
2958static int
2959detect_coding_utf_8 (src, src_end, multibytep)
fa42c37f 2960 unsigned char *src, *src_end;
0a28aafb 2961 int multibytep;
fa42c37f
KH
2962{
2963 unsigned char c;
2964 int seq_maybe_bytes;
b73bfc1c
KH
2965 /* Dummy for ONE_MORE_BYTE. */
2966 struct coding_system dummy_coding;
2967 struct coding_system *coding = &dummy_coding;
fa42c37f 2968
b73bfc1c 2969 while (1)
fa42c37f 2970 {
0a28aafb 2971 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
fa42c37f
KH
2972 if (UTF_8_1_OCTET_P (c))
2973 continue;
2974 else if (UTF_8_2_OCTET_LEADING_P (c))
2975 seq_maybe_bytes = 1;
2976 else if (UTF_8_3_OCTET_LEADING_P (c))
2977 seq_maybe_bytes = 2;
2978 else if (UTF_8_4_OCTET_LEADING_P (c))
2979 seq_maybe_bytes = 3;
2980 else if (UTF_8_5_OCTET_LEADING_P (c))
2981 seq_maybe_bytes = 4;
2982 else if (UTF_8_6_OCTET_LEADING_P (c))
2983 seq_maybe_bytes = 5;
2984 else
2985 return 0;
2986
2987 do
2988 {
0a28aafb 2989 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
fa42c37f
KH
2990 if (!UTF_8_EXTRA_OCTET_P (c))
2991 return 0;
2992 seq_maybe_bytes--;
2993 }
2994 while (seq_maybe_bytes > 0);
2995 }
2996
b73bfc1c 2997 label_end_of_loop:
fa42c37f
KH
2998 return CODING_CATEGORY_MASK_UTF_8;
2999}
3000
3001/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3002 Check if a text is encoded in UTF-16 Big Endian (endian == 1) or
3003 Little Endian (otherwise). If it is, return
3004 CODING_CATEGORY_MASK_UTF_16_BE or CODING_CATEGORY_MASK_UTF_16_LE,
3005 else return 0. */
3006
/* Nonzero if the 16-bit code unit VAL is one of the two values
   0xFFFE and 0xFFFF.  */
#define UTF_16_INVALID_P(val)   \
  (((val) == 0xFFFE)            \
   || ((val) == 0xFFFF))

/* Nonzero if VAL is a high (leading) surrogate, 0xD800..0xDBFF.
   The top six bits select the surrogate block, hence the 0xFC00
   mask; masking with 0xD800 itself would also accept unrelated
   values such as 0xF900.  */
#define UTF_16_HIGH_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xD800)

/* Nonzero if VAL is a low (trailing) surrogate, 0xDC00..0xDFFF.  */
#define UTF_16_LOW_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xDC00)
3016
0a28aafb
KH
3017static int
3018detect_coding_utf_16 (src, src_end, multibytep)
fa42c37f 3019 unsigned char *src, *src_end;
0a28aafb 3020 int multibytep;
fa42c37f 3021{
b73bfc1c 3022 unsigned char c1, c2;
1c7457e2 3023 /* Dummy for ONE_MORE_BYTE_CHECK_MULTIBYTE. */
b73bfc1c
KH
3024 struct coding_system dummy_coding;
3025 struct coding_system *coding = &dummy_coding;
fa42c37f 3026
0a28aafb
KH
3027 ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep);
3028 ONE_MORE_BYTE_CHECK_MULTIBYTE (c2, multibytep);
b73bfc1c
KH
3029
3030 if ((c1 == 0xFF) && (c2 == 0xFE))
fa42c37f 3031 return CODING_CATEGORY_MASK_UTF_16_LE;
b73bfc1c 3032 else if ((c1 == 0xFE) && (c2 == 0xFF))
fa42c37f
KH
3033 return CODING_CATEGORY_MASK_UTF_16_BE;
3034
b73bfc1c 3035 label_end_of_loop:
fa42c37f
KH
3036 return 0;
3037}
3038
4ed46869
KH
3039/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
3040 If SJIS_P is 1, decode SJIS text, else decode BIG5 test. */
3041
b73bfc1c 3042static void
4ed46869 3043decode_coding_sjis_big5 (coding, source, destination,
d46c5b12 3044 src_bytes, dst_bytes, sjis_p)
4ed46869
KH
3045 struct coding_system *coding;
3046 unsigned char *source, *destination;
3047 int src_bytes, dst_bytes;
4ed46869
KH
3048 int sjis_p;
3049{
3050 unsigned char *src = source;
3051 unsigned char *src_end = source + src_bytes;
3052 unsigned char *dst = destination;
3053 unsigned char *dst_end = destination + dst_bytes;
b73bfc1c
KH
3054 /* SRC_BASE remembers the start position in source in each loop.
3055 The loop will be exited when there's not enough source code
3056 (within macro ONE_MORE_BYTE), or when there's not enough
3057 destination area to produce a character (within macro
3058 EMIT_CHAR). */
3059 unsigned char *src_base;
3060 Lisp_Object translation_table;
a5d301df 3061
b73bfc1c
KH
3062 if (NILP (Venable_character_translation))
3063 translation_table = Qnil;
3064 else
3065 {
3066 translation_table = coding->translation_table_for_decode;
3067 if (NILP (translation_table))
3068 translation_table = Vstandard_translation_table_for_decode;
3069 }
4ed46869 3070
d46c5b12 3071 coding->produced_char = 0;
b73bfc1c 3072 while (1)
4ed46869 3073 {
85478bc6 3074 int c, charset, c1, c2 = 0;
b73bfc1c
KH
3075
3076 src_base = src;
3077 ONE_MORE_BYTE (c1);
3078
3079 if (c1 < 0x80)
4ed46869 3080 {
b73bfc1c
KH
3081 charset = CHARSET_ASCII;
3082 if (c1 < 0x20)
4ed46869 3083 {
b73bfc1c 3084 if (c1 == '\r')
d46c5b12 3085 {
b73bfc1c 3086 if (coding->eol_type == CODING_EOL_CRLF)
d46c5b12 3087 {
b73bfc1c
KH
3088 ONE_MORE_BYTE (c2);
3089 if (c2 == '\n')
3090 c1 = c2;
b73bfc1c
KH
3091 else
3092 /* To process C2 again, SRC is subtracted by 1. */
3093 src--;
d46c5b12 3094 }
b73bfc1c
KH
3095 else if (coding->eol_type == CODING_EOL_CR)
3096 c1 = '\n';
3097 }
3098 else if (c1 == '\n'
3099 && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
3100 && (coding->eol_type == CODING_EOL_CR
3101 || coding->eol_type == CODING_EOL_CRLF))
3102 {
3103 coding->result = CODING_FINISH_INCONSISTENT_EOL;
3104 goto label_end_of_loop;
d46c5b12 3105 }
4ed46869 3106 }
4ed46869 3107 }
54f78171 3108 else
b73bfc1c 3109 {
4ed46869
KH
3110 if (sjis_p)
3111 {
682169fe 3112 if (c1 == 0x80 || c1 == 0xA0 || c1 > 0xEF)
b73bfc1c 3113 goto label_invalid_code;
682169fe 3114 if (c1 <= 0x9F || c1 >= 0xE0)
fb88bf2d 3115 {
54f78171
KH
3116 /* SJIS -> JISX0208 */
3117 ONE_MORE_BYTE (c2);
b73bfc1c
KH
3118 if (c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
3119 goto label_invalid_code;
3120 DECODE_SJIS (c1, c2, c1, c2);
3121 charset = charset_jisx0208;
5e34de15 3122 }
fb88bf2d 3123 else
b73bfc1c
KH
3124 /* SJIS -> JISX0201-Kana */
3125 charset = charset_katakana_jisx0201;
4ed46869 3126 }
fb88bf2d 3127 else
fb88bf2d 3128 {
54f78171 3129 /* BIG5 -> Big5 */
682169fe 3130 if (c1 < 0xA0 || c1 > 0xFE)
b73bfc1c
KH
3131 goto label_invalid_code;
3132 ONE_MORE_BYTE (c2);
3133 if (c2 < 0x40 || (c2 > 0x7E && c2 < 0xA1) || c2 > 0xFE)
3134 goto label_invalid_code;
3135 DECODE_BIG5 (c1, c2, charset, c1, c2);
4ed46869
KH
3136 }
3137 }
4ed46869 3138
b73bfc1c
KH
3139 c = DECODE_ISO_CHARACTER (charset, c1, c2);
3140 EMIT_CHAR (c);
fb88bf2d
KH
3141 continue;
3142
b73bfc1c
KH
3143 label_invalid_code:
3144 coding->errors++;
4ed46869 3145 src = src_base;
b73bfc1c
KH
3146 c = *src++;
3147 EMIT_CHAR (c);
fb88bf2d 3148 }
d46c5b12 3149
b73bfc1c
KH
3150 label_end_of_loop:
3151 coding->consumed = coding->consumed_char = src_base - source;
d46c5b12 3152 coding->produced = dst - destination;
b73bfc1c 3153 return;
4ed46869
KH
3154}
3155
3156/* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
b73bfc1c
KH
3157 This function can encode charsets `ascii', `katakana-jisx0201',
3158 `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'. We
3159 are sure that all these charsets are registered as official charset
4ed46869
KH
3160 (i.e. do not have extended leading-codes). Characters of other
3161 charsets are produced without any encoding. If SJIS_P is 1, encode
3162 SJIS text, else encode BIG5 text. */
3163
b73bfc1c 3164static void
4ed46869 3165encode_coding_sjis_big5 (coding, source, destination,
d46c5b12 3166 src_bytes, dst_bytes, sjis_p)
4ed46869
KH
3167 struct coding_system *coding;
3168 unsigned char *source, *destination;
3169 int src_bytes, dst_bytes;
4ed46869
KH
3170 int sjis_p;
3171{
3172 unsigned char *src = source;
3173 unsigned char *src_end = source + src_bytes;
3174 unsigned char *dst = destination;
3175 unsigned char *dst_end = destination + dst_bytes;
b73bfc1c
KH
3176 /* SRC_BASE remembers the start position in source in each loop.
3177 The loop will be exited when there's not enough source text to
3178 analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
3179 there's not enough destination area to produce encoded codes
3180 (within macro EMIT_BYTES). */
3181 unsigned char *src_base;
3182 Lisp_Object translation_table;
4ed46869 3183
b73bfc1c
KH
3184 if (NILP (Venable_character_translation))
3185 translation_table = Qnil;
3186 else
4ed46869 3187 {
39658efc 3188 translation_table = coding->translation_table_for_encode;
b73bfc1c 3189 if (NILP (translation_table))
39658efc 3190 translation_table = Vstandard_translation_table_for_encode;
b73bfc1c 3191 }
a5d301df 3192
b73bfc1c
KH
3193 while (1)
3194 {
3195 int c, charset, c1, c2;
4ed46869 3196
b73bfc1c
KH
3197 src_base = src;
3198 ONE_MORE_CHAR (c);
93dec019 3199
b73bfc1c
KH
3200 /* Now encode the character C. */
3201 if (SINGLE_BYTE_CHAR_P (c))
3202 {
3203 switch (c)
4ed46869 3204 {
b73bfc1c 3205 case '\r':
7371fe0a 3206 if (!(coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
b73bfc1c
KH
3207 {
3208 EMIT_ONE_BYTE (c);
3209 break;
3210 }
3211 c = '\n';
3212 case '\n':
3213 if (coding->eol_type == CODING_EOL_CRLF)
3214 {
3215 EMIT_TWO_BYTES ('\r', c);
3216 break;
3217 }
3218 else if (coding->eol_type == CODING_EOL_CR)
3219 c = '\r';
3220 default:
3221 EMIT_ONE_BYTE (c);
3222 }
3223 }
3224 else
3225 {
3226 SPLIT_CHAR (c, charset, c1, c2);
3227 if (sjis_p)
3228 {
3229 if (charset == charset_jisx0208
3230 || charset == charset_jisx0208_1978)
3231 {
3232 ENCODE_SJIS (c1, c2, c1, c2);
3233 EMIT_TWO_BYTES (c1, c2);
3234 }
39658efc
KH
3235 else if (charset == charset_katakana_jisx0201)
3236 EMIT_ONE_BYTE (c1 | 0x80);
fc53a214
KH
3237 else if (charset == charset_latin_jisx0201)
3238 EMIT_ONE_BYTE (c1);
0eecad43
KH
3239 else if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR)
3240 {
3241 EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER);
3242 if (CHARSET_WIDTH (charset) > 1)
3243 EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER);
3244 }
b73bfc1c
KH
3245 else
3246 /* There's no way other than producing the internal
3247 codes as is. */
3248 EMIT_BYTES (src_base, src);
4ed46869 3249 }
4ed46869 3250 else
b73bfc1c
KH
3251 {
3252 if (charset == charset_big5_1 || charset == charset_big5_2)
3253 {
3254 ENCODE_BIG5 (charset, c1, c2, c1, c2);
3255 EMIT_TWO_BYTES (c1, c2);
3256 }
0eecad43
KH
3257 else if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR)
3258 {
3259 EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER);
3260 if (CHARSET_WIDTH (charset) > 1)
3261 EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER);
3262 }
b73bfc1c
KH
3263 else
3264 /* There's no way other than producing the internal
3265 codes as is. */
3266 EMIT_BYTES (src_base, src);
3267 }
4ed46869 3268 }
b73bfc1c 3269 coding->consumed_char++;
4ed46869
KH
3270 }
3271
b73bfc1c
KH
3272 label_end_of_loop:
3273 coding->consumed = src_base - source;
d46c5b12 3274 coding->produced = coding->produced_char = dst - destination;
4ed46869
KH
3275}
3276
3277\f
1397dc18
KH
3278/*** 5. CCL handlers ***/
3279
3280/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3281 Check if a text is encoded in a coding system of which
3282 encoder/decoder are written in CCL program. If it is, return
3283 CODING_CATEGORY_MASK_CCL, else return 0. */
3284
0a28aafb
KH
3285static int
3286detect_coding_ccl (src, src_end, multibytep)
1397dc18 3287 unsigned char *src, *src_end;
0a28aafb 3288 int multibytep;
1397dc18
KH
3289{
3290 unsigned char *valid;
b73bfc1c
KH
3291 int c;
3292 /* Dummy for ONE_MORE_BYTE. */
3293 struct coding_system dummy_coding;
3294 struct coding_system *coding = &dummy_coding;
1397dc18
KH
3295
3296 /* No coding system is assigned to coding-category-ccl. */
3297 if (!coding_system_table[CODING_CATEGORY_IDX_CCL])
3298 return 0;
3299
3300 valid = coding_system_table[CODING_CATEGORY_IDX_CCL]->spec.ccl.valid_codes;
b73bfc1c 3301 while (1)
1397dc18 3302 {
0a28aafb 3303 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
b73bfc1c
KH
3304 if (! valid[c])
3305 return 0;
1397dc18 3306 }
b73bfc1c 3307 label_end_of_loop:
1397dc18
KH
3308 return CODING_CATEGORY_MASK_CCL;
3309}
3310
3311\f
3312/*** 6. End-of-line handlers ***/
4ed46869 3313
b73bfc1c 3314/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
4ed46869 3315
b73bfc1c 3316static void
d46c5b12 3317decode_eol (coding, source, destination, src_bytes, dst_bytes)
4ed46869
KH
3318 struct coding_system *coding;
3319 unsigned char *source, *destination;
3320 int src_bytes, dst_bytes;
4ed46869
KH
3321{
3322 unsigned char *src = source;
4ed46869 3323 unsigned char *dst = destination;
b73bfc1c
KH
3324 unsigned char *src_end = src + src_bytes;
3325 unsigned char *dst_end = dst + dst_bytes;
3326 Lisp_Object translation_table;
3327 /* SRC_BASE remembers the start position in source in each loop.
3328 The loop will be exited when there's not enough source code
3329 (within macro ONE_MORE_BYTE), or when there's not enough
3330 destination area to produce a character (within macro
3331 EMIT_CHAR). */
3332 unsigned char *src_base;
3333 int c;
3334
3335 translation_table = Qnil;
4ed46869
KH
3336 switch (coding->eol_type)
3337 {
3338 case CODING_EOL_CRLF:
b73bfc1c 3339 while (1)
d46c5b12 3340 {
b73bfc1c
KH
3341 src_base = src;
3342 ONE_MORE_BYTE (c);
3343 if (c == '\r')
fb88bf2d 3344 {
b73bfc1c
KH
3345 ONE_MORE_BYTE (c);
3346 if (c != '\n')
3347 {
b73bfc1c
KH
3348 src--;
3349 c = '\r';
3350 }
fb88bf2d 3351 }
b73bfc1c
KH
3352 else if (c == '\n'
3353 && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL))
d46c5b12 3354 {
b73bfc1c
KH
3355 coding->result = CODING_FINISH_INCONSISTENT_EOL;
3356 goto label_end_of_loop;
d46c5b12 3357 }
b73bfc1c 3358 EMIT_CHAR (c);
d46c5b12 3359 }
b73bfc1c
KH
3360 break;
3361
3362 case CODING_EOL_CR:
3363 while (1)
d46c5b12 3364 {
b73bfc1c
KH
3365 src_base = src;
3366 ONE_MORE_BYTE (c);
3367 if (c == '\n')
3368 {
3369 if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
3370 {
3371 coding->result = CODING_FINISH_INCONSISTENT_EOL;
3372 goto label_end_of_loop;
3373 }
3374 }
3375 else if (c == '\r')
3376 c = '\n';
3377 EMIT_CHAR (c);
d46c5b12 3378 }
4ed46869
KH
3379 break;
3380
b73bfc1c
KH
3381 default: /* no need for EOL handling */
3382 while (1)
d46c5b12 3383 {
b73bfc1c
KH
3384 src_base = src;
3385 ONE_MORE_BYTE (c);
3386 EMIT_CHAR (c);
d46c5b12 3387 }
4ed46869
KH
3388 }
3389
b73bfc1c
KH
3390 label_end_of_loop:
3391 coding->consumed = coding->consumed_char = src_base - source;
3392 coding->produced = dst - destination;
3393 return;
4ed46869
KH
3394}
3395
3396/* See "GENERAL NOTES about `encode_coding_XXX ()' functions". Encode
b73bfc1c 3397 format of end-of-line according to `coding->eol_type'. It also
8ca3766a 3398 convert multibyte form 8-bit characters to unibyte if
b73bfc1c
KH
3399 CODING->src_multibyte is nonzero. If `coding->mode &
3400 CODING_MODE_SELECTIVE_DISPLAY' is nonzero, code '\r' in source text
3401 also means end-of-line. */
4ed46869 3402
b73bfc1c 3403static void
d46c5b12 3404encode_eol (coding, source, destination, src_bytes, dst_bytes)
4ed46869 3405 struct coding_system *coding;
a4244313
KR
3406 const unsigned char *source;
3407 unsigned char *destination;
4ed46869 3408 int src_bytes, dst_bytes;
4ed46869 3409{
a4244313 3410 const unsigned char *src = source;
4ed46869 3411 unsigned char *dst = destination;
a4244313 3412 const unsigned char *src_end = src + src_bytes;
b73bfc1c
KH
3413 unsigned char *dst_end = dst + dst_bytes;
3414 Lisp_Object translation_table;
3415 /* SRC_BASE remembers the start position in source in each loop.
3416 The loop will be exited when there's not enough source text to
3417 analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
3418 there's not enough destination area to produce encoded codes
3419 (within macro EMIT_BYTES). */
a4244313
KR
3420 const unsigned char *src_base;
3421 unsigned char *tmp;
b73bfc1c
KH
3422 int c;
3423 int selective_display = coding->mode & CODING_MODE_SELECTIVE_DISPLAY;
3424
3425 translation_table = Qnil;
3426 if (coding->src_multibyte
3427 && *(src_end - 1) == LEADING_CODE_8_BIT_CONTROL)
3428 {
3429 src_end--;
3430 src_bytes--;
3431 coding->result = CODING_FINISH_INSUFFICIENT_SRC;
3432 }
fb88bf2d 3433
d46c5b12
KH
3434 if (coding->eol_type == CODING_EOL_CRLF)
3435 {
b73bfc1c 3436 while (src < src_end)
d46c5b12 3437 {
b73bfc1c 3438 src_base = src;
d46c5b12 3439 c = *src++;
b73bfc1c
KH
3440 if (c >= 0x20)
3441 EMIT_ONE_BYTE (c);
3442 else if (c == '\n' || (c == '\r' && selective_display))
3443 EMIT_TWO_BYTES ('\r', '\n');
d46c5b12 3444 else
b73bfc1c 3445 EMIT_ONE_BYTE (c);
d46c5b12 3446 }
ff2b1ea9 3447 src_base = src;
b73bfc1c 3448 label_end_of_loop:
005f0d35 3449 ;
d46c5b12
KH
3450 }
3451 else
4ed46869 3452 {
78a629d2 3453 if (!dst_bytes || src_bytes <= dst_bytes)
4ed46869 3454 {
b73bfc1c
KH
3455 safe_bcopy (src, dst, src_bytes);
3456 src_base = src_end;
3457 dst += src_bytes;
d46c5b12 3458 }
d46c5b12 3459 else
b73bfc1c
KH
3460 {
3461 if (coding->src_multibyte
3462 && *(src + dst_bytes - 1) == LEADING_CODE_8_BIT_CONTROL)
3463 dst_bytes--;
3464 safe_bcopy (src, dst, dst_bytes);
3465 src_base = src + dst_bytes;
3466 dst = destination + dst_bytes;
3467 coding->result = CODING_FINISH_INSUFFICIENT_DST;
3468 }
993824c9 3469 if (coding->eol_type == CODING_EOL_CR)
d46c5b12 3470 {
a4244313
KR
3471 for (tmp = destination; tmp < dst; tmp++)
3472 if (*tmp == '\n') *tmp = '\r';
d46c5b12 3473 }
b73bfc1c 3474 else if (selective_display)
d46c5b12 3475 {
a4244313
KR
3476 for (tmp = destination; tmp < dst; tmp++)
3477 if (*tmp == '\r') *tmp = '\n';
4ed46869 3478 }
4ed46869 3479 }
b73bfc1c
KH
3480 if (coding->src_multibyte)
3481 dst = destination + str_as_unibyte (destination, dst - destination);
4ed46869 3482
b73bfc1c
KH
3483 coding->consumed = src_base - source;
3484 coding->produced = dst - destination;
78a629d2 3485 coding->produced_char = coding->produced;
4ed46869
KH
3486}
3487
3488\f
1397dc18 3489/*** 7. C library functions ***/
4ed46869 3490
cfb43547 3491/* In Emacs Lisp, a coding system is represented by a Lisp symbol which
4ed46869 3492 has a property `coding-system'. The value of this property is a
cfb43547 3493 vector of length 5 (called the coding-vector). Among elements of
4ed46869
KH
3494 this vector, the first (element[0]) and the fifth (element[4])
3495 carry important information for decoding/encoding. Before
3496 decoding/encoding, this information should be set in fields of a
3497 structure of type `coding_system'.
3498
cfb43547 3499 The value of the property `coding-system' can be a symbol of another
4ed46869
KH
3500 subsidiary coding-system. In that case, Emacs gets coding-vector
3501 from that symbol.
3502
3503 `element[0]' contains information to be set in `coding->type'. The
3504 value and its meaning is as follows:
3505
0ef69138
KH
3506 0 -- coding_type_emacs_mule
3507 1 -- coding_type_sjis
3508 2 -- coding_type_iso2022
3509 3 -- coding_type_big5
3510 4 -- coding_type_ccl encoder/decoder written in CCL
3511 nil -- coding_type_no_conversion
3512 t -- coding_type_undecided (automatic conversion on decoding,
3513 no-conversion on encoding)
4ed46869
KH
3514
3515 `element[4]' contains information to be set in `coding->flags' and
3516 `coding->spec'. The meaning varies by `coding->type'.
3517
3518 If `coding->type' is `coding_type_iso2022', element[4] is a vector
3519 of length 32 (of which the first 13 sub-elements are used now).
3520 Meanings of these sub-elements are:
3521
3522 sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
3523 If the value is an integer of valid charset, the charset is
3524 assumed to be designated to graphic register N initially.
3525
3526 If the value is minus, it is a minus value of charset which
3527 reserves graphic register N, which means that the charset is
3528 not designated initially but should be designated to graphic
3529 register N just before encoding a character in that charset.
3530
3531 If the value is nil, graphic register N is never used on
3532 encoding.
93dec019 3533
4ed46869
KH
3534 sub-element[N] where N is 4 through 11: to be set in `coding->flags'
3535 Each value takes t or nil. See the section ISO2022 of
3536 `coding.h' for more information.
3537
3538 If `coding->type' is `coding_type_big5', element[4] is t to denote
3539 BIG5-ETen or nil to denote BIG5-HKU.
3540
3541 If `coding->type' takes the other value, element[4] is ignored.
3542
cfb43547 3543 Emacs Lisp's coding systems also carry information about format of
4ed46869
KH
3544 end-of-line in a value of property `eol-type'. If the value is
3545 integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
3546 means CODING_EOL_CR. If it is not integer, it should be a vector
3547 of subsidiary coding systems of which property `eol-type' has one
cfb43547 3548 of the above values.
4ed46869
KH
3549
3550*/
3551
3552/* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
3553 and set it in CODING. If CODING_SYSTEM_SYMBOL is invalid, CODING
3554 is setup so that no conversion is necessary and return -1, else
3555 return 0. */
3556
3557int
e0e989f6
KH
3558setup_coding_system (coding_system, coding)
3559 Lisp_Object coding_system;
4ed46869
KH
3560 struct coding_system *coding;
3561{
d46c5b12 3562 Lisp_Object coding_spec, coding_type, eol_type, plist;
4608c386 3563 Lisp_Object val;
4ed46869 3564
c07c8e12
KH
3565 /* At first, zero clear all members. */
3566 bzero (coding, sizeof (struct coding_system));
3567
d46c5b12 3568 /* Initialize some fields required for all kinds of coding systems. */
774324d6 3569 coding->symbol = coding_system;
d46c5b12
KH
3570 coding->heading_ascii = -1;
3571 coding->post_read_conversion = coding->pre_write_conversion = Qnil;
ec6d2bb8
KH
3572 coding->composing = COMPOSITION_DISABLED;
3573 coding->cmp_data = NULL;
1f5dbf34
KH
3574
3575 if (NILP (coding_system))
3576 goto label_invalid_coding_system;
3577
4608c386 3578 coding_spec = Fget (coding_system, Qcoding_system);
1f5dbf34 3579
4608c386
KH
3580 if (!VECTORP (coding_spec)
3581 || XVECTOR (coding_spec)->size != 5
3582 || !CONSP (XVECTOR (coding_spec)->contents[3]))
4ed46869 3583 goto label_invalid_coding_system;
4608c386 3584
d46c5b12
KH
3585 eol_type = inhibit_eol_conversion ? Qnil : Fget (coding_system, Qeol_type);
3586 if (VECTORP (eol_type))
3587 {
3588 coding->eol_type = CODING_EOL_UNDECIDED;
3589 coding->common_flags = CODING_REQUIRE_DETECTION_MASK;
3590 }
3591 else if (XFASTINT (eol_type) == 1)
3592 {
3593 coding->eol_type = CODING_EOL_CRLF;
3594 coding->common_flags
3595 = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3596 }
3597 else if (XFASTINT (eol_type) == 2)
3598 {
3599 coding->eol_type = CODING_EOL_CR;
3600 coding->common_flags
3601 = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3602 }
3603 else
3604 coding->eol_type = CODING_EOL_LF;
3605
3606 coding_type = XVECTOR (coding_spec)->contents[0];
3607 /* Try short cut. */
3608 if (SYMBOLP (coding_type))
3609 {
3610 if (EQ (coding_type, Qt))
3611 {
3612 coding->type = coding_type_undecided;
3613 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
3614 }
3615 else
3616 coding->type = coding_type_no_conversion;
9b96232f
KH
3617 /* Initialize this member. Any thing other than
3618 CODING_CATEGORY_IDX_UTF_16_BE and
3619 CODING_CATEGORY_IDX_UTF_16_LE are ok because they have
3620 special treatment in detect_eol. */
3621 coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE;
3622
d46c5b12
KH
3623 return 0;
3624 }
3625
d46c5b12
KH
3626 /* Get values of coding system properties:
3627 `post-read-conversion', `pre-write-conversion',
f967223b 3628 `translation-table-for-decode', `translation-table-for-encode'. */
4608c386 3629 plist = XVECTOR (coding_spec)->contents[3];
b843d1ae 3630 /* Pre & post conversion functions should be disabled if
8ca3766a 3631 inhibit_eol_conversion is nonzero. This is the case that a code
b843d1ae
KH
3632 conversion function is called while those functions are running. */
3633 if (! inhibit_pre_post_conversion)
3634 {
3635 coding->post_read_conversion = Fplist_get (plist, Qpost_read_conversion);
3636 coding->pre_write_conversion = Fplist_get (plist, Qpre_write_conversion);
3637 }
f967223b 3638 val = Fplist_get (plist, Qtranslation_table_for_decode);
4608c386 3639 if (SYMBOLP (val))
f967223b
KH
3640 val = Fget (val, Qtranslation_table_for_decode);
3641 coding->translation_table_for_decode = CHAR_TABLE_P (val) ? val : Qnil;
3642 val = Fplist_get (plist, Qtranslation_table_for_encode);
4608c386 3643 if (SYMBOLP (val))
f967223b
KH
3644 val = Fget (val, Qtranslation_table_for_encode);
3645 coding->translation_table_for_encode = CHAR_TABLE_P (val) ? val : Qnil;
d46c5b12
KH
3646 val = Fplist_get (plist, Qcoding_category);
3647 if (!NILP (val))
3648 {
3649 val = Fget (val, Qcoding_category_index);
3650 if (INTEGERP (val))
3651 coding->category_idx = XINT (val);
3652 else
3653 goto label_invalid_coding_system;
3654 }
3655 else
3656 goto label_invalid_coding_system;
93dec019 3657
ec6d2bb8
KH
3658 /* If the coding system has non-nil `composition' property, enable
3659 composition handling. */
3660 val = Fplist_get (plist, Qcomposition);
3661 if (!NILP (val))
3662 coding->composing = COMPOSITION_NO;
3663
d46c5b12 3664 switch (XFASTINT (coding_type))
4ed46869
KH
3665 {
3666 case 0:
0ef69138 3667 coding->type = coding_type_emacs_mule;
aa72b389
KH
3668 coding->common_flags
3669 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
c952af22
KH
3670 if (!NILP (coding->post_read_conversion))
3671 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
3672 if (!NILP (coding->pre_write_conversion))
3673 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
4ed46869
KH
3674 break;
3675
3676 case 1:
3677 coding->type = coding_type_sjis;
c952af22
KH
3678 coding->common_flags
3679 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
4ed46869
KH
3680 break;
3681
3682 case 2:
3683 coding->type = coding_type_iso2022;
c952af22
KH
3684 coding->common_flags
3685 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
4ed46869 3686 {
70c22245 3687 Lisp_Object val, temp;
4ed46869 3688 Lisp_Object *flags;
d46c5b12 3689 int i, charset, reg_bits = 0;
4ed46869 3690
4608c386 3691 val = XVECTOR (coding_spec)->contents[4];
f44d27ce 3692
4ed46869
KH
3693 if (!VECTORP (val) || XVECTOR (val)->size != 32)
3694 goto label_invalid_coding_system;
3695
3696 flags = XVECTOR (val)->contents;
3697 coding->flags
3698 = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
3699 | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
3700 | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
3701 | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
3702 | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
3703 | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
3704 | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
3705 | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
e0e989f6
KH
3706 | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION)
3707 | (NILP (flags[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL)
c4825358
KH
3708 | (NILP (flags[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL)
3709 | (NILP (flags[15]) ? 0 : CODING_FLAG_ISO_SAFE)
3f003981 3710 | (NILP (flags[16]) ? 0 : CODING_FLAG_ISO_LATIN_EXTRA)
c4825358 3711 );
4ed46869
KH
3712
3713 /* Invoke graphic register 0 to plane 0. */
3714 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
3715 /* Invoke graphic register 1 to plane 1 if we can use full 8-bit. */
3716 CODING_SPEC_ISO_INVOCATION (coding, 1)
3717 = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
3718 /* Not single shifting at first. */
6e85d753 3719 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;
e0e989f6 3720 /* Beginning of buffer should also be regarded as bol. */
6e85d753 3721 CODING_SPEC_ISO_BOL (coding) = 1;
4ed46869 3722
70c22245
KH
3723 for (charset = 0; charset <= MAX_CHARSET; charset++)
3724 CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = 255;
3725 val = Vcharset_revision_alist;
3726 while (CONSP (val))
3727 {
03699b14 3728 charset = get_charset_id (Fcar_safe (XCAR (val)));
70c22245 3729 if (charset >= 0
03699b14 3730 && (temp = Fcdr_safe (XCAR (val)), INTEGERP (temp))
70c22245
KH
3731 && (i = XINT (temp), (i >= 0 && (i + '@') < 128)))
3732 CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = i;
03699b14 3733 val = XCDR (val);
70c22245
KH
3734 }
3735
4ed46869
KH
3736 /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
3737 FLAGS[REG] can be one of below:
3738 integer CHARSET: CHARSET occupies register I,
3739 t: designate nothing to REG initially, but can be used
3740 by any charsets,
3741 list of integer, nil, or t: designate the first
3742 element (if integer) to REG initially, the remaining
3743 elements (if integer) is designated to REG on request,
d46c5b12 3744 if an element is t, REG can be used by any charsets,
4ed46869 3745 nil: REG is never used. */
467e7675 3746 for (charset = 0; charset <= MAX_CHARSET; charset++)
1ba9e4ab
KH
3747 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3748 = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION;
4ed46869
KH
3749 for (i = 0; i < 4; i++)
3750 {
87323294
PJ
3751 if ((INTEGERP (flags[i])
3752 && (charset = XINT (flags[i]), CHARSET_VALID_P (charset)))
e0e989f6 3753 || (charset = get_charset_id (flags[i])) >= 0)
4ed46869
KH
3754 {
3755 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3756 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
3757 }
3758 else if (EQ (flags[i], Qt))
3759 {
3760 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
d46c5b12
KH
3761 reg_bits |= 1 << i;
3762 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
4ed46869
KH
3763 }
3764 else if (CONSP (flags[i]))
3765 {
84d60297
RS
3766 Lisp_Object tail;
3767 tail = flags[i];
4ed46869 3768
d46c5b12 3769 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
87323294
PJ
3770 if ((INTEGERP (XCAR (tail))
3771 && (charset = XINT (XCAR (tail)),
3772 CHARSET_VALID_P (charset)))
03699b14 3773 || (charset = get_charset_id (XCAR (tail))) >= 0)
4ed46869
KH
3774 {
3775 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3776 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
3777 }
3778 else
3779 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
03699b14 3780 tail = XCDR (tail);
4ed46869
KH
3781 while (CONSP (tail))
3782 {
87323294
PJ
3783 if ((INTEGERP (XCAR (tail))
3784 && (charset = XINT (XCAR (tail)),
3785 CHARSET_VALID_P (charset)))
03699b14 3786 || (charset = get_charset_id (XCAR (tail))) >= 0)
70c22245
KH
3787 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3788 = i;
03699b14 3789 else if (EQ (XCAR (tail), Qt))
d46c5b12 3790 reg_bits |= 1 << i;
03699b14 3791 tail = XCDR (tail);
4ed46869
KH
3792 }
3793 }
3794 else
3795 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
93dec019 3796
4ed46869
KH
3797 CODING_SPEC_ISO_DESIGNATION (coding, i)
3798 = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
3799 }
3800
d46c5b12 3801 if (reg_bits && ! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
4ed46869
KH
3802 {
3803 /* REG 1 can be used only by locking shift in 7-bit env. */
3804 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
d46c5b12 3805 reg_bits &= ~2;
4ed46869
KH
3806 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
3807 /* Without any shifting, only REG 0 and 1 can be used. */
d46c5b12 3808 reg_bits &= 3;
4ed46869
KH
3809 }
3810
d46c5b12
KH
3811 if (reg_bits)
3812 for (charset = 0; charset <= MAX_CHARSET; charset++)
6e85d753 3813 {
928a85c1 3814 if (CHARSET_DEFINED_P (charset)
96148065
KH
3815 && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3816 == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
d46c5b12
KH
3817 {
3818 /* There exist some default graphic registers to be
96148065 3819 used by CHARSET. */
d46c5b12
KH
3820
3821 /* We had better avoid designating a charset of
3822 CHARS96 to REG 0 as far as possible. */
3823 if (CHARSET_CHARS (charset) == 96)
3824 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3825 = (reg_bits & 2
3826 ? 1 : (reg_bits & 4 ? 2 : (reg_bits & 8 ? 3 : 0)));
3827 else
3828 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3829 = (reg_bits & 1
3830 ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
3831 }
6e85d753 3832 }
4ed46869 3833 }
c952af22 3834 coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
d46c5b12 3835 coding->spec.iso2022.last_invalid_designation_register = -1;
4ed46869
KH
3836 break;
3837
3838 case 3:
3839 coding->type = coding_type_big5;
c952af22
KH
3840 coding->common_flags
3841 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
4ed46869 3842 coding->flags
4608c386 3843 = (NILP (XVECTOR (coding_spec)->contents[4])
4ed46869
KH
3844 ? CODING_FLAG_BIG5_HKU
3845 : CODING_FLAG_BIG5_ETEN);
3846 break;
3847
3848 case 4:
3849 coding->type = coding_type_ccl;
c952af22
KH
3850 coding->common_flags
3851 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
4ed46869 3852 {
84d60297 3853 val = XVECTOR (coding_spec)->contents[4];
ef4ced28
KH
3854 if (! CONSP (val)
3855 || setup_ccl_program (&(coding->spec.ccl.decoder),
03699b14 3856 XCAR (val)) < 0
ef4ced28 3857 || setup_ccl_program (&(coding->spec.ccl.encoder),
03699b14 3858 XCDR (val)) < 0)
4ed46869 3859 goto label_invalid_coding_system;
1397dc18
KH
3860
3861 bzero (coding->spec.ccl.valid_codes, 256);
3862 val = Fplist_get (plist, Qvalid_codes);
3863 if (CONSP (val))
3864 {
3865 Lisp_Object this;
3866
03699b14 3867 for (; CONSP (val); val = XCDR (val))
1397dc18 3868 {
03699b14 3869 this = XCAR (val);
1397dc18
KH
3870 if (INTEGERP (this)
3871 && XINT (this) >= 0 && XINT (this) < 256)
3872 coding->spec.ccl.valid_codes[XINT (this)] = 1;
3873 else if (CONSP (this)
03699b14
KR
3874 && INTEGERP (XCAR (this))
3875 && INTEGERP (XCDR (this)))
1397dc18 3876 {
03699b14
KR
3877 int start = XINT (XCAR (this));
3878 int end = XINT (XCDR (this));
1397dc18
KH
3879
3880 if (start >= 0 && start <= end && end < 256)
e133c8fa 3881 while (start <= end)
1397dc18
KH
3882 coding->spec.ccl.valid_codes[start++] = 1;
3883 }
3884 }
3885 }
4ed46869 3886 }
c952af22 3887 coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
aaaf0b1e 3888 coding->spec.ccl.cr_carryover = 0;
1c3478b0 3889 coding->spec.ccl.eight_bit_carryover[0] = 0;
4ed46869
KH
3890 break;
3891
27901516
KH
3892 case 5:
3893 coding->type = coding_type_raw_text;
3894 break;
3895
4ed46869 3896 default:
d46c5b12 3897 goto label_invalid_coding_system;
4ed46869
KH
3898 }
3899 return 0;
3900
3901 label_invalid_coding_system:
3902 coding->type = coding_type_no_conversion;
d46c5b12 3903 coding->category_idx = CODING_CATEGORY_IDX_BINARY;
c952af22 3904 coding->common_flags = 0;
dec137e5 3905 coding->eol_type = CODING_EOL_LF;
d46c5b12 3906 coding->pre_write_conversion = coding->post_read_conversion = Qnil;
4ed46869
KH
3907 return -1;
3908}
3909
ec6d2bb8
KH
3910/* Free memory blocks allocated for storing composition information. */
3911
3912void
3913coding_free_composition_data (coding)
3914 struct coding_system *coding;
3915{
3916 struct composition_data *cmp_data = coding->cmp_data, *next;
3917
3918 if (!cmp_data)
3919 return;
3920 /* Memory blocks are chained. At first, rewind to the first, then,
3921 free blocks one by one. */
3922 while (cmp_data->prev)
3923 cmp_data = cmp_data->prev;
3924 while (cmp_data)
3925 {
3926 next = cmp_data->next;
3927 xfree (cmp_data);
3928 cmp_data = next;
3929 }
3930 coding->cmp_data = NULL;
3931}
3932
3933/* Set `char_offset' member of all memory blocks pointed by
3934 coding->cmp_data to POS. */
3935
3936void
3937coding_adjust_composition_offset (coding, pos)
3938 struct coding_system *coding;
3939 int pos;
3940{
3941 struct composition_data *cmp_data;
3942
3943 for (cmp_data = coding->cmp_data; cmp_data; cmp_data = cmp_data->next)
3944 cmp_data->char_offset = pos;
3945}
3946
54f78171
KH
3947/* Setup raw-text or one of its subsidiaries in the structure
3948 coding_system CODING according to the already setup value eol_type
3949 in CODING. CODING should be setup for some coding system in
3950 advance. */
3951
3952void
3953setup_raw_text_coding_system (coding)
3954 struct coding_system *coding;
3955{
3956 if (coding->type != coding_type_raw_text)
3957 {
3958 coding->symbol = Qraw_text;
3959 coding->type = coding_type_raw_text;
3960 if (coding->eol_type != CODING_EOL_UNDECIDED)
3961 {
84d60297
RS
3962 Lisp_Object subsidiaries;
3963 subsidiaries = Fget (Qraw_text, Qeol_type);
54f78171
KH
3964
3965 if (VECTORP (subsidiaries)
3966 && XVECTOR (subsidiaries)->size == 3)
3967 coding->symbol
3968 = XVECTOR (subsidiaries)->contents[coding->eol_type];
3969 }
716e0b0a 3970 setup_coding_system (coding->symbol, coding);
54f78171
KH
3971 }
3972 return;
3973}
3974
4ed46869
KH
3975/* Emacs has a mechanism to automatically detect a coding system if it
3976 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
3977 it's impossible to distinguish some coding systems accurately
3978 because they use the same range of codes. So, at first, coding
3979 systems are categorized into the following categories:
3980
0ef69138 3981 o coding-category-emacs-mule
4ed46869
KH
3982
3983 The category for a coding system which has the same code range
3984 as Emacs' internal format. Assigned the coding-system (Lisp
0ef69138 3985 symbol) `emacs-mule' by default.
4ed46869
KH
3986
3987 o coding-category-sjis
3988
3989 The category for a coding system which has the same code range
3990 as SJIS. Assigned the coding-system (Lisp
7717c392 3991 symbol) `japanese-shift-jis' by default.
4ed46869
KH
3992
3993 o coding-category-iso-7
3994
3995 The category for a coding system which has the same code range
7717c392 3996 as ISO2022 of 7-bit environment. This doesn't use any locking
d46c5b12
KH
3997 shift and single shift functions. This can encode/decode all
3998 charsets. Assigned the coding-system (Lisp symbol)
3999 `iso-2022-7bit' by default.
4000
4001 o coding-category-iso-7-tight
4002
4003 Same as coding-category-iso-7 except that this can
4004 encode/decode only the specified charsets.
4ed46869
KH
4005
4006 o coding-category-iso-8-1
4007
4008 The category for a coding system which has the same code range
4009 as ISO2022 of 8-bit environment and graphic plane 1 used only
7717c392
KH
4010 for DIMENSION1 charset. This doesn't use any locking shift
4011 and single shift functions. Assigned the coding-system (Lisp
4012 symbol) `iso-latin-1' by default.
4ed46869
KH
4013
4014 o coding-category-iso-8-2
4015
4016 The category for a coding system which has the same code range
4017 as ISO2022 of 8-bit environment and graphic plane 1 used only
7717c392
KH
4018 for DIMENSION2 charset. This doesn't use any locking shift
4019 and single shift functions. Assigned the coding-system (Lisp
4020 symbol) `japanese-iso-8bit' by default.
4ed46869 4021
7717c392 4022 o coding-category-iso-7-else
4ed46869
KH
4023
4024 The category for a coding system which has the same code range
8ca3766a 4025 as ISO2022 of 7-bit environment but uses locking shift or
7717c392
KH
4026 single shift functions. Assigned the coding-system (Lisp
4027 symbol) `iso-2022-7bit-lock' by default.
4028
4029 o coding-category-iso-8-else
4030
4031 The category for a coding system which has the same code range
8ca3766a 4032 as ISO2022 of 8-bit environment but uses locking shift or
7717c392
KH
4033 single shift functions. Assigned the coding-system (Lisp
4034 symbol) `iso-2022-8bit-ss2' by default.
4ed46869
KH
4035
4036 o coding-category-big5
4037
4038 The category for a coding system which has the same code range
4039 as BIG5. Assigned the coding-system (Lisp symbol)
e0e989f6 4040 `cn-big5' by default.
4ed46869 4041
fa42c37f
KH
4042 o coding-category-utf-8
4043
4044 The category for a coding system which has the same code range
38b92c42 4045 as UTF-8 (cf. RFC3629). Assigned the coding-system (Lisp
fa42c37f
KH
4046 symbol) `utf-8' by default.
4047
4048 o coding-category-utf-16-be
4049
4050 The category for a coding system in which a text has an
4051 Unicode signature (cf. Unicode Standard) in the order of BIG
4052 endian at the head. Assigned the coding-system (Lisp symbol)
4053 `utf-16-be' by default.
4054
4055 o coding-category-utf-16-le
4056
4057 The category for a coding system in which a text has an
4058 Unicode signature (cf. Unicode Standard) in the order of
4059 LITTLE endian at the head. Assigned the coding-system (Lisp
4060 symbol) `utf-16-le' by default.
4061
1397dc18
KH
4062 o coding-category-ccl
4063
4064 The category for a coding system of which encoder/decoder is
4065 written in CCL programs. The default value is nil, i.e., no
4066 coding system is assigned.
4067
4ed46869
KH
4068 o coding-category-binary
4069
4070 The category for a coding system not categorized in any of the
4071 above. Assigned the coding-system (Lisp symbol)
e0e989f6 4072 `no-conversion' by default.
4ed46869
KH
4073
4074 Each of them is a Lisp symbol and the value is an actual
cfb43547 4075 `coding-system' (this is also a Lisp symbol) assigned by a user.
4ed46869
KH
4076 What Emacs does actually is to detect a category of coding system.
4077 Then, it uses a `coding-system' assigned to it. If Emacs can't
cfb43547 4078 decide a single possible category, it selects a category of the
4ed46869
KH
4079 highest priority. Priorities of categories are also specified by a
4080 user in a Lisp variable `coding-category-list'.
4081
4082*/
4083
66cfb530
KH
4084static
4085int ascii_skip_code[256];
4086
d46c5b12 4087/* Detect how a text of length SRC_BYTES pointed by SOURCE is encoded.
4ed46869
KH
4088 If it detects possible coding systems, return an integer in which
4089 appropriate flag bits are set. Flag bits are defined by macros
fa42c37f
KH
4090 CODING_CATEGORY_MASK_XXX in `coding.h'. If PRIORITIES is non-NULL,
4091 it should point the table `coding_priorities'. In that case, only
4092 the flag bit for a coding system of the highest priority is set in
0a28aafb
KH
4093 the returned value. If MULTIBYTEP is nonzero, 8-bit codes of the
4094 range 0x80..0x9F are in multibyte form.
4ed46869 4095
d46c5b12
KH
4096 How many ASCII characters are at the head is returned as *SKIP. */
4097
4098static int
0a28aafb 4099detect_coding_mask (source, src_bytes, priorities, skip, multibytep)
d46c5b12
KH
4100 unsigned char *source;
4101 int src_bytes, *priorities, *skip;
0a28aafb 4102 int multibytep;
4ed46869
KH
4103{
4104 register unsigned char c;
d46c5b12 4105 unsigned char *src = source, *src_end = source + src_bytes;
fa42c37f 4106 unsigned int mask, utf16_examined_p, iso2022_examined_p;
da55a2b7 4107 int i;
4ed46869
KH
4108
4109 /* At first, skip all ASCII characters and control characters except
4110 for three ISO2022 specific control characters. */
66cfb530
KH
4111 ascii_skip_code[ISO_CODE_SO] = 0;
4112 ascii_skip_code[ISO_CODE_SI] = 0;
4113 ascii_skip_code[ISO_CODE_ESC] = 0;
4114
bcf26d6a 4115 label_loop_detect_coding:
66cfb530 4116 while (src < src_end && ascii_skip_code[*src]) src++;
d46c5b12 4117 *skip = src - source;
4ed46869
KH
4118
4119 if (src >= src_end)
4120 /* We found nothing other than ASCII. There's nothing to do. */
d46c5b12 4121 return 0;
4ed46869 4122
8a8147d6 4123 c = *src;
4ed46869
KH
4124 /* The text seems to be encoded in some multilingual coding system.
4125 Now, try to find in which coding system the text is encoded. */
4126 if (c < 0x80)
bcf26d6a
KH
4127 {
4128 /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
4129 /* C is an ISO2022 specific control code of C0. */
0a28aafb 4130 mask = detect_coding_iso2022 (src, src_end, multibytep);
1b2af4b0 4131 if (mask == 0)
d46c5b12
KH
4132 {
4133 /* No valid ISO2022 code follows C. Try again. */
4134 src++;
66cfb530
KH
4135 if (c == ISO_CODE_ESC)
4136 ascii_skip_code[ISO_CODE_ESC] = 1;
4137 else
4138 ascii_skip_code[ISO_CODE_SO] = ascii_skip_code[ISO_CODE_SI] = 1;
d46c5b12
KH
4139 goto label_loop_detect_coding;
4140 }
4141 if (priorities)
fa42c37f
KH
4142 {
4143 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
4144 {
4145 if (mask & priorities[i])
4146 return priorities[i];
4147 }
4148 return CODING_CATEGORY_MASK_RAW_TEXT;
4149 }
bcf26d6a 4150 }
d46c5b12 4151 else
c4825358 4152 {
d46c5b12 4153 int try;
4ed46869 4154
0a28aafb 4155 if (multibytep && c == LEADING_CODE_8_BIT_CONTROL)
67091e59 4156 c = src[1] - 0x20;
0a28aafb 4157
d46c5b12
KH
4158 if (c < 0xA0)
4159 {
4160 /* C is the first byte of SJIS character code,
fa42c37f
KH
4161 or a leading-code of Emacs' internal format (emacs-mule),
4162 or the first byte of UTF-16. */
4163 try = (CODING_CATEGORY_MASK_SJIS
4164 | CODING_CATEGORY_MASK_EMACS_MULE
4165 | CODING_CATEGORY_MASK_UTF_16_BE
4166 | CODING_CATEGORY_MASK_UTF_16_LE);
d46c5b12
KH
4167
4168 /* Or, if C is a special latin extra code,
93dec019 4169 or is an ISO2022 specific control code of C1 (SS2 or SS3),
d46c5b12
KH
4170 or is an ISO2022 control-sequence-introducer (CSI),
4171 we should also consider the possibility of ISO2022 codings. */
4172 if ((VECTORP (Vlatin_extra_code_table)
4173 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
4174 || (c == ISO_CODE_SS2 || c == ISO_CODE_SS3)
4175 || (c == ISO_CODE_CSI
4176 && (src < src_end
4177 && (*src == ']'
4178 || ((*src == '0' || *src == '1' || *src == '2')
4179 && src + 1 < src_end
4180 && src[1] == ']')))))
4181 try |= (CODING_CATEGORY_MASK_ISO_8_ELSE
4182 | CODING_CATEGORY_MASK_ISO_8BIT);
4183 }
c4825358 4184 else
d46c5b12
KH
4185 /* C is a character of ISO2022 in graphic plane right,
4186 or a SJIS's 1-byte character code (i.e. JISX0201),
fa42c37f
KH
4187 or the first byte of BIG5's 2-byte code,
4188 or the first byte of UTF-8/16. */
d46c5b12
KH
4189 try = (CODING_CATEGORY_MASK_ISO_8_ELSE
4190 | CODING_CATEGORY_MASK_ISO_8BIT
4191 | CODING_CATEGORY_MASK_SJIS
fa42c37f
KH
4192 | CODING_CATEGORY_MASK_BIG5
4193 | CODING_CATEGORY_MASK_UTF_8
4194 | CODING_CATEGORY_MASK_UTF_16_BE
4195 | CODING_CATEGORY_MASK_UTF_16_LE);
d46c5b12 4196
1397dc18
KH
4197 /* Or, we may have to consider the possibility of CCL. */
4198 if (coding_system_table[CODING_CATEGORY_IDX_CCL]
4199 && (coding_system_table[CODING_CATEGORY_IDX_CCL]
4200 ->spec.ccl.valid_codes)[c])
4201 try |= CODING_CATEGORY_MASK_CCL;
4202
d46c5b12 4203 mask = 0;
fa42c37f 4204 utf16_examined_p = iso2022_examined_p = 0;
d46c5b12
KH
4205 if (priorities)
4206 {
4207 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
4208 {
fa42c37f
KH
4209 if (!iso2022_examined_p
4210 && (priorities[i] & try & CODING_CATEGORY_MASK_ISO))
4211 {
0192762c 4212 mask |= detect_coding_iso2022 (src, src_end, multibytep);
fa42c37f
KH
4213 iso2022_examined_p = 1;
4214 }
5ab13dd0 4215 else if (priorities[i] & try & CODING_CATEGORY_MASK_SJIS)
0a28aafb 4216 mask |= detect_coding_sjis (src, src_end, multibytep);
fa42c37f 4217 else if (priorities[i] & try & CODING_CATEGORY_MASK_UTF_8)
0a28aafb 4218 mask |= detect_coding_utf_8 (src, src_end, multibytep);
fa42c37f
KH
4219 else if (!utf16_examined_p
4220 && (priorities[i] & try &
4221 CODING_CATEGORY_MASK_UTF_16_BE_LE))
4222 {
0a28aafb 4223 mask |= detect_coding_utf_16 (src, src_end, multibytep);
fa42c37f
KH
4224 utf16_examined_p = 1;
4225 }
5ab13dd0 4226 else if (priorities[i] & try & CODING_CATEGORY_MASK_BIG5)
0a28aafb 4227 mask |= detect_coding_big5 (src, src_end, multibytep);
5ab13dd0 4228 else if (priorities[i] & try & CODING_CATEGORY_MASK_EMACS_MULE)
0a28aafb 4229 mask |= detect_coding_emacs_mule (src, src_end, multibytep);
89fa8b36 4230 else if (priorities[i] & try & CODING_CATEGORY_MASK_CCL)
0a28aafb 4231 mask |= detect_coding_ccl (src, src_end, multibytep);
5ab13dd0 4232 else if (priorities[i] & CODING_CATEGORY_MASK_RAW_TEXT)
fa42c37f 4233 mask |= CODING_CATEGORY_MASK_RAW_TEXT;
5ab13dd0 4234 else if (priorities[i] & CODING_CATEGORY_MASK_BINARY)
fa42c37f
KH
4235 mask |= CODING_CATEGORY_MASK_BINARY;
4236 if (mask & priorities[i])
4237 return priorities[i];
d46c5b12
KH
4238 }
4239 return CODING_CATEGORY_MASK_RAW_TEXT;
4240 }
4241 if (try & CODING_CATEGORY_MASK_ISO)
0a28aafb 4242 mask |= detect_coding_iso2022 (src, src_end, multibytep);
d46c5b12 4243 if (try & CODING_CATEGORY_MASK_SJIS)
0a28aafb 4244 mask |= detect_coding_sjis (src, src_end, multibytep);
d46c5b12 4245 if (try & CODING_CATEGORY_MASK_BIG5)
0a28aafb 4246 mask |= detect_coding_big5 (src, src_end, multibytep);
fa42c37f 4247 if (try & CODING_CATEGORY_MASK_UTF_8)
0a28aafb 4248 mask |= detect_coding_utf_8 (src, src_end, multibytep);
fa42c37f 4249 if (try & CODING_CATEGORY_MASK_UTF_16_BE_LE)
0a28aafb 4250 mask |= detect_coding_utf_16 (src, src_end, multibytep);
d46c5b12 4251 if (try & CODING_CATEGORY_MASK_EMACS_MULE)
0a28aafb 4252 mask |= detect_coding_emacs_mule (src, src_end, multibytep);
1397dc18 4253 if (try & CODING_CATEGORY_MASK_CCL)
0a28aafb 4254 mask |= detect_coding_ccl (src, src_end, multibytep);
c4825358 4255 }
5ab13dd0 4256 return (mask | CODING_CATEGORY_MASK_RAW_TEXT | CODING_CATEGORY_MASK_BINARY);
4ed46869
KH
4257}
4258
4259/* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
4260 The information of the detected coding system is set in CODING. */
4261
4262void
4263detect_coding (coding, src, src_bytes)
4264 struct coding_system *coding;
a4244313 4265 const unsigned char *src;
4ed46869
KH
4266 int src_bytes;
4267{
d46c5b12 4268 unsigned int idx;
da55a2b7 4269 int skip, mask;
84d60297 4270 Lisp_Object val;
4ed46869 4271
84d60297 4272 val = Vcoding_category_list;
64c1e55f
KH
4273 mask = detect_coding_mask (src, src_bytes, coding_priorities, &skip,
4274 coding->src_multibyte);
d46c5b12 4275 coding->heading_ascii = skip;
4ed46869 4276
d46c5b12
KH
4277 if (!mask) return;
4278
4279 /* We found a single coding system of the highest priority in MASK. */
4280 idx = 0;
4281 while (mask && ! (mask & 1)) mask >>= 1, idx++;
4282 if (! mask)
4283 idx = CODING_CATEGORY_IDX_RAW_TEXT;
4ed46869 4284
f5c1dd0d 4285 val = SYMBOL_VALUE (XVECTOR (Vcoding_category_table)->contents[idx]);
d46c5b12
KH
4286
4287 if (coding->eol_type != CODING_EOL_UNDECIDED)
27901516 4288 {
84d60297 4289 Lisp_Object tmp;
d46c5b12 4290
84d60297 4291 tmp = Fget (val, Qeol_type);
d46c5b12
KH
4292 if (VECTORP (tmp))
4293 val = XVECTOR (tmp)->contents[coding->eol_type];
4ed46869 4294 }
b73bfc1c
KH
4295
4296 /* Setup this new coding system while preserving some slots. */
4297 {
4298 int src_multibyte = coding->src_multibyte;
4299 int dst_multibyte = coding->dst_multibyte;
4300
4301 setup_coding_system (val, coding);
4302 coding->src_multibyte = src_multibyte;
4303 coding->dst_multibyte = dst_multibyte;
4304 coding->heading_ascii = skip;
4305 }
4ed46869
KH
4306}
4307
d46c5b12
KH
4308/* Detect how end-of-line of a text of length SRC_BYTES pointed by
4309 SOURCE is encoded. Return one of CODING_EOL_LF, CODING_EOL_CRLF,
4310 CODING_EOL_CR, and CODING_EOL_UNDECIDED.
4311
4312 How many non-eol characters are at the head is returned as *SKIP. */
4ed46869 4313
bc4bc72a
RS
4314#define MAX_EOL_CHECK_COUNT 3
4315
d46c5b12
KH
4316static int
4317detect_eol_type (source, src_bytes, skip)
4318 unsigned char *source;
4319 int src_bytes, *skip;
4ed46869 4320{
d46c5b12 4321 unsigned char *src = source, *src_end = src + src_bytes;
4ed46869 4322 unsigned char c;
bc4bc72a
RS
4323 int total = 0; /* How many end-of-lines are found so far. */
4324 int eol_type = CODING_EOL_UNDECIDED;
4325 int this_eol_type;
4ed46869 4326
d46c5b12
KH
4327 *skip = 0;
4328
bc4bc72a 4329 while (src < src_end && total < MAX_EOL_CHECK_COUNT)
4ed46869
KH
4330 {
4331 c = *src++;
bc4bc72a 4332 if (c == '\n' || c == '\r')
4ed46869 4333 {
d46c5b12
KH
4334 if (*skip == 0)
4335 *skip = src - 1 - source;
bc4bc72a
RS
4336 total++;
4337 if (c == '\n')
4338 this_eol_type = CODING_EOL_LF;
4339 else if (src >= src_end || *src != '\n')
4340 this_eol_type = CODING_EOL_CR;
4ed46869 4341 else
bc4bc72a
RS
4342 this_eol_type = CODING_EOL_CRLF, src++;
4343
4344 if (eol_type == CODING_EOL_UNDECIDED)
4345 /* This is the first end-of-line. */
4346 eol_type = this_eol_type;
4347 else if (eol_type != this_eol_type)
d46c5b12
KH
4348 {
4349 /* The found type is different from what found before. */
4350 eol_type = CODING_EOL_INCONSISTENT;
4351 break;
4352 }
4ed46869
KH
4353 }
4354 }
bc4bc72a 4355
d46c5b12
KH
4356 if (*skip == 0)
4357 *skip = src_end - source;
85a02ca4 4358 return eol_type;
4ed46869
KH
4359}
4360
fa42c37f
KH
4361/* Like detect_eol_type, but detect EOL type in 2-octet
4362 big-endian/little-endian format for coding systems utf-16-be and
4363 utf-16-le. */
4364
4365static int
4366detect_eol_type_in_2_octet_form (source, src_bytes, skip, big_endian_p)
4367 unsigned char *source;
cfb43547 4368 int src_bytes, *skip, big_endian_p;
fa42c37f
KH
4369{
4370 unsigned char *src = source, *src_end = src + src_bytes;
4371 unsigned int c1, c2;
4372 int total = 0; /* How many end-of-lines are found so far. */
4373 int eol_type = CODING_EOL_UNDECIDED;
4374 int this_eol_type;
4375 int msb, lsb;
4376
4377 if (big_endian_p)
4378 msb = 0, lsb = 1;
4379 else
4380 msb = 1, lsb = 0;
4381
4382 *skip = 0;
4383
4384 while ((src + 1) < src_end && total < MAX_EOL_CHECK_COUNT)
4385 {
4386 c1 = (src[msb] << 8) | (src[lsb]);
4387 src += 2;
4388
4389 if (c1 == '\n' || c1 == '\r')
4390 {
4391 if (*skip == 0)
4392 *skip = src - 2 - source;
4393 total++;
4394 if (c1 == '\n')
4395 {
4396 this_eol_type = CODING_EOL_LF;
4397 }
4398 else
4399 {
4400 if ((src + 1) >= src_end)
4401 {
4402 this_eol_type = CODING_EOL_CR;
4403 }
4404 else
4405 {
4406 c2 = (src[msb] << 8) | (src[lsb]);
4407 if (c2 == '\n')
4408 this_eol_type = CODING_EOL_CRLF, src += 2;
4409 else
4410 this_eol_type = CODING_EOL_CR;
4411 }
4412 }
4413
4414 if (eol_type == CODING_EOL_UNDECIDED)
4415 /* This is the first end-of-line. */
4416 eol_type = this_eol_type;
4417 else if (eol_type != this_eol_type)
4418 {
4419 /* The found type is different from what found before. */
4420 eol_type = CODING_EOL_INCONSISTENT;
4421 break;
4422 }
4423 }
4424 }
4425
4426 if (*skip == 0)
4427 *skip = src_end - source;
4428 return eol_type;
4429}
4430
4ed46869
KH
4431/* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
4432 is encoded. If it detects an appropriate format of end-of-line, it
4433 sets the information in *CODING. */
4434
4435void
4436detect_eol (coding, src, src_bytes)
4437 struct coding_system *coding;
a4244313 4438 const unsigned char *src;
4ed46869
KH
4439 int src_bytes;
4440{
4608c386 4441 Lisp_Object val;
d46c5b12 4442 int skip;
fa42c37f
KH
4443 int eol_type;
4444
4445 switch (coding->category_idx)
4446 {
4447 case CODING_CATEGORY_IDX_UTF_16_BE:
4448 eol_type = detect_eol_type_in_2_octet_form (src, src_bytes, &skip, 1);
4449 break;
4450 case CODING_CATEGORY_IDX_UTF_16_LE:
4451 eol_type = detect_eol_type_in_2_octet_form (src, src_bytes, &skip, 0);
4452 break;
4453 default:
4454 eol_type = detect_eol_type (src, src_bytes, &skip);
4455 break;
4456 }
d46c5b12
KH
4457
4458 if (coding->heading_ascii > skip)
4459 coding->heading_ascii = skip;
4460 else
4461 skip = coding->heading_ascii;
4ed46869 4462
0ef69138 4463 if (eol_type == CODING_EOL_UNDECIDED)
4ed46869 4464 return;
27901516
KH
4465 if (eol_type == CODING_EOL_INCONSISTENT)
4466 {
4467#if 0
4468 /* This code is suppressed until we find a better way to
992f23f2 4469 distinguish raw text file and binary file. */
27901516
KH
4470
4471 /* If we have already detected that the coding is raw-text, the
4472 coding should actually be no-conversion. */
4473 if (coding->type == coding_type_raw_text)
4474 {
4475 setup_coding_system (Qno_conversion, coding);
4476 return;
4477 }
4478 /* Else, let's decode only text code anyway. */
4479#endif /* 0 */
1b2af4b0 4480 eol_type = CODING_EOL_LF;
27901516
KH
4481 }
4482
4608c386 4483 val = Fget (coding->symbol, Qeol_type);
4ed46869 4484 if (VECTORP (val) && XVECTOR (val)->size == 3)
d46c5b12 4485 {
b73bfc1c
KH
4486 int src_multibyte = coding->src_multibyte;
4487 int dst_multibyte = coding->dst_multibyte;
1cd6b64c 4488 struct composition_data *cmp_data = coding->cmp_data;
b73bfc1c 4489
d46c5b12 4490 setup_coding_system (XVECTOR (val)->contents[eol_type], coding);
b73bfc1c
KH
4491 coding->src_multibyte = src_multibyte;
4492 coding->dst_multibyte = dst_multibyte;
d46c5b12 4493 coding->heading_ascii = skip;
1cd6b64c 4494 coding->cmp_data = cmp_data;
d46c5b12
KH
4495 }
4496}
4497
/* Number of spare bytes added to every conversion buffer size
   computed below.  */
#define CONVERSION_BUFFER_EXTRA_ROOM 256

/* Factor by which the size of source text is multiplied to get the
   size of a decoding buffer for CODING (a pointer to struct
   coding_system): 3 for ISO-2022, the CCL decoder's own
   buf_magnification for CCL, and 2 for everything else.

   CODING is parenthesized at each use so that any pointer-valued
   expression may be passed, not only a plain identifier.  It is
   evaluated more than once, so it must not have side effects.  */
#define DECODING_BUFFER_MAG(coding)                             \
  ((coding)->type == coding_type_iso2022                        \
   ? 3                                                          \
   : ((coding)->type == coding_type_ccl                         \
      ? (coding)->spec.ccl.decoder.buf_magnification            \
      : 2))
d46c5b12
KH
4506
4507/* Return maximum size (bytes) of a buffer enough for decoding
4508 SRC_BYTES of text encoded in CODING. */
4509
4510int
4511decoding_buffer_size (coding, src_bytes)
4512 struct coding_system *coding;
4513 int src_bytes;
4514{
4515 return (src_bytes * DECODING_BUFFER_MAG (coding)
4516 + CONVERSION_BUFFER_EXTRA_ROOM);
4517}
4518
4519/* Return maximum size (bytes) of a buffer enough for encoding
4520 SRC_BYTES of text to CODING. */
4521
4522int
4523encoding_buffer_size (coding, src_bytes)
4524 struct coding_system *coding;
4525 int src_bytes;
4526{
4527 int magnification;
4528
4529 if (coding->type == coding_type_ccl)
a84f1519
KH
4530 {
4531 magnification = coding->spec.ccl.encoder.buf_magnification;
4532 if (coding->eol_type == CODING_EOL_CRLF)
4533 magnification *= 2;
4534 }
b73bfc1c 4535 else if (CODING_REQUIRE_ENCODING (coding))
d46c5b12 4536 magnification = 3;
b73bfc1c
KH
4537 else
4538 magnification = 1;
d46c5b12
KH
4539
4540 return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
4541}
4542
73be902c
KH
/* Working buffer for code conversion.  */
struct conversion_buffer
{
  /* Size of DATA in bytes.  */
  int size;
  /* 1 if DATA was allocated by alloca, 0 if it was allocated on the
     heap.  */
  int on_stack;
  /* The buffer itself.  */
  unsigned char *data;
};
d46c5b12 4550
73be902c
KH
/* Allocate LEN bytes of memory for BUF (an lvalue of type struct
   conversion_buffer).  The memory comes from alloca when LEN is less
   than MAX_ALLOCA, and from xmalloc otherwise; BUF.on_stack records
   which.

   LEN is evaluated exactly once and BUF is parenthesized, so both
   may be arbitrary expressions.  Because of the alloca, the memory
   is only valid inside the function that uses this macro.  */
#define allocate_conversion_buffer(buf, len)                            \
  do {                                                                  \
    int conversion_buffer_len_ = (len);                                 \
    if (conversion_buffer_len_ < MAX_ALLOCA)                            \
      {                                                                 \
        (buf).data = (unsigned char *) alloca (conversion_buffer_len_); \
        (buf).on_stack = 1;                                             \
      }                                                                 \
    else                                                                \
      {                                                                 \
        (buf).data                                                      \
          = (unsigned char *) xmalloc (conversion_buffer_len_);         \
        (buf).on_stack = 0;                                             \
      }                                                                 \
    (buf).size = conversion_buffer_len_;                                \
  } while (0)
d46c5b12 4566
73be902c
KH
4567/* Double the allocated memory for *BUF. */
4568static void
4569extend_conversion_buffer (buf)
4570 struct conversion_buffer *buf;
d46c5b12 4571{
73be902c 4572 if (buf->on_stack)
d46c5b12 4573 {
73be902c
KH
4574 unsigned char *save = buf->data;
4575 buf->data = (unsigned char *) xmalloc (buf->size * 2);
4576 bcopy (save, buf->data, buf->size);
4577 buf->on_stack = 0;
d46c5b12 4578 }
73be902c
KH
4579 else
4580 {
4581 buf->data = (unsigned char *) xrealloc (buf->data, buf->size * 2);
4582 }
4583 buf->size *= 2;
4584}
4585
4586/* Free the allocated memory for BUF if it is not on stack. */
4587static void
4588free_conversion_buffer (buf)
4589 struct conversion_buffer *buf;
4590{
4591 if (!buf->on_stack)
4592 xfree (buf->data);
d46c5b12
KH
4593}
4594
4595int
4596ccl_coding_driver (coding, source, destination, src_bytes, dst_bytes, encodep)
4597 struct coding_system *coding;
4598 unsigned char *source, *destination;
4599 int src_bytes, dst_bytes, encodep;
4600{
4601 struct ccl_program *ccl
4602 = encodep ? &coding->spec.ccl.encoder : &coding->spec.ccl.decoder;
1c3478b0 4603 unsigned char *dst = destination;
d46c5b12 4604
bd64290d 4605 ccl->suppress_error = coding->suppress_error;
ae9ff118 4606 ccl->last_block = coding->mode & CODING_MODE_LAST_BLOCK;
aaaf0b1e 4607 if (encodep)
80e0ca99
KH
4608 {
4609 /* On encoding, EOL format is converted within ccl_driver. For
4610 that, setup proper information in the structure CCL. */
4611 ccl->eol_type = coding->eol_type;
4612 if (ccl->eol_type ==CODING_EOL_UNDECIDED)
4613 ccl->eol_type = CODING_EOL_LF;
4614 ccl->cr_consumed = coding->spec.ccl.cr_carryover;
b671ed5e 4615 ccl->eight_bit_control = coding->dst_multibyte;
80e0ca99 4616 }
b671ed5e
KH
4617 else
4618 ccl->eight_bit_control = 1;
7272d75c 4619 ccl->multibyte = coding->src_multibyte;
1c3478b0
KH
4620 if (coding->spec.ccl.eight_bit_carryover[0] != 0)
4621 {
4622 /* Move carryover bytes to DESTINATION. */
4623 unsigned char *p = coding->spec.ccl.eight_bit_carryover;
4624 while (*p)
4625 *dst++ = *p++;
4626 coding->spec.ccl.eight_bit_carryover[0] = 0;
4627 if (dst_bytes)
4628 dst_bytes -= dst - destination;
4629 }
4630
4631 coding->produced = (ccl_driver (ccl, source, dst, src_bytes, dst_bytes,
4632 &(coding->consumed))
4633 + dst - destination);
4634
b73bfc1c 4635 if (encodep)
80e0ca99
KH
4636 {
4637 coding->produced_char = coding->produced;
4638 coding->spec.ccl.cr_carryover = ccl->cr_consumed;
4639 }
ade8d05e
KH
4640 else if (!ccl->eight_bit_control)
4641 {
4642 /* The produced bytes forms a valid multibyte sequence. */
4643 coding->produced_char
4644 = multibyte_chars_in_text (destination, coding->produced);
4645 coding->spec.ccl.eight_bit_carryover[0] = 0;
4646 }
b73bfc1c
KH
4647 else
4648 {
1c3478b0
KH
4649 /* On decoding, the destination should always multibyte. But,
4650 CCL program might have been generated an invalid multibyte
4651 sequence. Here we make such a sequence valid as
4652 multibyte. */
b73bfc1c
KH
4653 int bytes
4654 = dst_bytes ? dst_bytes : source + coding->consumed - destination;
1c3478b0
KH
4655
4656 if ((coding->consumed < src_bytes
4657 || !ccl->last_block)
4658 && coding->produced >= 1
4659 && destination[coding->produced - 1] >= 0x80)
4660 {
4661 /* We should not convert the tailing 8-bit codes to
4662 multibyte form even if they doesn't form a valid
4663 multibyte sequence. They may form a valid sequence in
4664 the next call. */
4665 int carryover = 0;
4666
4667 if (destination[coding->produced - 1] < 0xA0)
4668 carryover = 1;
4669 else if (coding->produced >= 2)
4670 {
4671 if (destination[coding->produced - 2] >= 0x80)
4672 {
4673 if (destination[coding->produced - 2] < 0xA0)
4674 carryover = 2;
4675 else if (coding->produced >= 3
4676 && destination[coding->produced - 3] >= 0x80
4677 && destination[coding->produced - 3] < 0xA0)
4678 carryover = 3;
4679 }
4680 }
4681 if (carryover > 0)
4682 {
4683 BCOPY_SHORT (destination + coding->produced - carryover,
4684 coding->spec.ccl.eight_bit_carryover,
4685 carryover);
4686 coding->spec.ccl.eight_bit_carryover[carryover] = 0;
4687 coding->produced -= carryover;
4688 }
4689 }
b73bfc1c
KH
4690 coding->produced = str_as_multibyte (destination, bytes,
4691 coding->produced,
4692 &(coding->produced_char));
4693 }
69f76525 4694
d46c5b12
KH
4695 switch (ccl->status)
4696 {
4697 case CCL_STAT_SUSPEND_BY_SRC:
73be902c 4698 coding->result = CODING_FINISH_INSUFFICIENT_SRC;
d46c5b12
KH
4699 break;
4700 case CCL_STAT_SUSPEND_BY_DST:
73be902c 4701 coding->result = CODING_FINISH_INSUFFICIENT_DST;
d46c5b12 4702 break;
9864ebce
KH
4703 case CCL_STAT_QUIT:
4704 case CCL_STAT_INVALID_CMD:
73be902c 4705 coding->result = CODING_FINISH_INTERRUPT;
9864ebce 4706 break;
d46c5b12 4707 default:
73be902c 4708 coding->result = CODING_FINISH_NORMAL;
d46c5b12
KH
4709 break;
4710 }
73be902c 4711 return coding->result;
4ed46869
KH
4712}
4713
aaaf0b1e
KH
4714/* Decode EOL format of the text at PTR of BYTES length destructively
4715 according to CODING->eol_type. This is called after the CCL
4716 program produced a decoded text at PTR. If we do CRLF->LF
4717 conversion, update CODING->produced and CODING->produced_char. */
4718
4719static void
4720decode_eol_post_ccl (coding, ptr, bytes)
4721 struct coding_system *coding;
4722 unsigned char *ptr;
4723 int bytes;
4724{
4725 Lisp_Object val, saved_coding_symbol;
4726 unsigned char *pend = ptr + bytes;
4727 int dummy;
4728
4729 /* Remember the current coding system symbol. We set it back when
4730 an inconsistent EOL is found so that `last-coding-system-used' is
4731 set to the coding system that doesn't specify EOL conversion. */
4732 saved_coding_symbol = coding->symbol;
4733
4734 coding->spec.ccl.cr_carryover = 0;
4735 if (coding->eol_type == CODING_EOL_UNDECIDED)
4736 {
4737 /* Here, to avoid the call of setup_coding_system, we directly
4738 call detect_eol_type. */
4739 coding->eol_type = detect_eol_type (ptr, bytes, &dummy);
74b01b80
EZ
4740 if (coding->eol_type == CODING_EOL_INCONSISTENT)
4741 coding->eol_type = CODING_EOL_LF;
4742 if (coding->eol_type != CODING_EOL_UNDECIDED)
4743 {
4744 val = Fget (coding->symbol, Qeol_type);
4745 if (VECTORP (val) && XVECTOR (val)->size == 3)
4746 coding->symbol = XVECTOR (val)->contents[coding->eol_type];
4747 }
aaaf0b1e
KH
4748 coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4749 }
4750
74b01b80
EZ
4751 if (coding->eol_type == CODING_EOL_LF
4752 || coding->eol_type == CODING_EOL_UNDECIDED)
aaaf0b1e
KH
4753 {
4754 /* We have nothing to do. */
4755 ptr = pend;
4756 }
4757 else if (coding->eol_type == CODING_EOL_CRLF)
4758 {
4759 unsigned char *pstart = ptr, *p = ptr;
4760
4761 if (! (coding->mode & CODING_MODE_LAST_BLOCK)
4762 && *(pend - 1) == '\r')
4763 {
4764 /* If the last character is CR, we can't handle it here
4765 because LF will be in the not-yet-decoded source text.
9861e777 4766 Record that the CR is not yet processed. */
aaaf0b1e
KH
4767 coding->spec.ccl.cr_carryover = 1;
4768 coding->produced--;
4769 coding->produced_char--;
4770 pend--;
4771 }
4772 while (ptr < pend)
4773 {
4774 if (*ptr == '\r')
4775 {
4776 if (ptr + 1 < pend && *(ptr + 1) == '\n')
4777 {
4778 *p++ = '\n';
4779 ptr += 2;
4780 }
4781 else
4782 {
4783 if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4784 goto undo_eol_conversion;
4785 *p++ = *ptr++;
4786 }
4787 }
4788 else if (*ptr == '\n'
4789 && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4790 goto undo_eol_conversion;
4791 else
4792 *p++ = *ptr++;
4793 continue;
4794
4795 undo_eol_conversion:
4796 /* We have faced with inconsistent EOL format at PTR.
4797 Convert all LFs before PTR back to CRLFs. */
4798 for (p--, ptr--; p >= pstart; p--)
4799 {
4800 if (*p == '\n')
4801 *ptr-- = '\n', *ptr-- = '\r';
4802 else
4803 *ptr-- = *p;
4804 }
4805 /* If carryover is recorded, cancel it because we don't
4806 convert CRLF anymore. */
4807 if (coding->spec.ccl.cr_carryover)
4808 {
4809 coding->spec.ccl.cr_carryover = 0;
4810 coding->produced++;
4811 coding->produced_char++;
4812 pend++;
4813 }
4814 p = ptr = pend;
4815 coding->eol_type = CODING_EOL_LF;
4816 coding->symbol = saved_coding_symbol;
4817 }
4818 if (p < pend)
4819 {
4820 /* As each two-byte sequence CRLF was converted to LF, (PEND
4821 - P) is the number of deleted characters. */
4822 coding->produced -= pend - p;
4823 coding->produced_char -= pend - p;
4824 }
4825 }
4826 else /* i.e. coding->eol_type == CODING_EOL_CR */
4827 {
4828 unsigned char *p = ptr;
4829
4830 for (; ptr < pend; ptr++)
4831 {
4832 if (*ptr == '\r')
4833 *ptr = '\n';
4834 else if (*ptr == '\n'
4835 && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4836 {
4837 for (; p < ptr; p++)
4838 {
4839 if (*p == '\n')
4840 *p = '\r';
4841 }
4842 ptr = pend;
4843 coding->eol_type = CODING_EOL_LF;
4844 coding->symbol = saved_coding_symbol;
4845 }
4846 }
4847 }
4848}
4849
4ed46869
KH
4850/* See "GENERAL NOTES about `decode_coding_XXX ()' functions". Before
4851 decoding, it may detect coding system and format of end-of-line if
b73bfc1c
KH
4852 those are not yet decided. The source should be unibyte, the
4853 result is multibyte if CODING->dst_multibyte is nonzero, else
4854 unibyte. */
4ed46869
KH
4855
4856int
d46c5b12 4857decode_coding (coding, source, destination, src_bytes, dst_bytes)
4ed46869 4858 struct coding_system *coding;
a4244313
KR
4859 const unsigned char *source;
4860 unsigned char *destination;
4ed46869 4861 int src_bytes, dst_bytes;
4ed46869 4862{
9861e777
EZ
4863 int extra = 0;
4864
0ef69138 4865 if (coding->type == coding_type_undecided)
4ed46869
KH
4866 detect_coding (coding, source, src_bytes);
4867
aaaf0b1e
KH
4868 if (coding->eol_type == CODING_EOL_UNDECIDED
4869 && coding->type != coding_type_ccl)
8844fa83
KH
4870 {
4871 detect_eol (coding, source, src_bytes);
4872 /* We had better recover the original eol format if we
8ca3766a 4873 encounter an inconsistent eol format while decoding. */
8844fa83
KH
4874 coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4875 }
4ed46869 4876
b73bfc1c
KH
4877 coding->produced = coding->produced_char = 0;
4878 coding->consumed = coding->consumed_char = 0;
4879 coding->errors = 0;
4880 coding->result = CODING_FINISH_NORMAL;
4881
4ed46869
KH
4882 switch (coding->type)
4883 {
4ed46869 4884 case coding_type_sjis:
b73bfc1c
KH
4885 decode_coding_sjis_big5 (coding, source, destination,
4886 src_bytes, dst_bytes, 1);
4ed46869
KH
4887 break;
4888
4889 case coding_type_iso2022:
b73bfc1c
KH
4890 decode_coding_iso2022 (coding, source, destination,
4891 src_bytes, dst_bytes);
4ed46869
KH
4892 break;
4893
4894 case coding_type_big5:
b73bfc1c
KH
4895 decode_coding_sjis_big5 (coding, source, destination,
4896 src_bytes, dst_bytes, 0);
4897 break;
4898
4899 case coding_type_emacs_mule:
4900 decode_coding_emacs_mule (coding, source, destination,
4901 src_bytes, dst_bytes);
4ed46869
KH
4902 break;
4903
4904 case coding_type_ccl:
aaaf0b1e
KH
4905 if (coding->spec.ccl.cr_carryover)
4906 {
9861e777
EZ
4907 /* Put the CR which was not processed by the previous call
4908 of decode_eol_post_ccl in DESTINATION. It will be
4909 decoded together with the following LF by the call to
4910 decode_eol_post_ccl below. */
aaaf0b1e
KH
4911 *destination = '\r';
4912 coding->produced++;
4913 coding->produced_char++;
4914 dst_bytes--;
9861e777 4915 extra = coding->spec.ccl.cr_carryover;
aaaf0b1e 4916 }
9861e777 4917 ccl_coding_driver (coding, source, destination + extra,
b73bfc1c 4918 src_bytes, dst_bytes, 0);
aaaf0b1e 4919 if (coding->eol_type != CODING_EOL_LF)
9861e777
EZ
4920 {
4921 coding->produced += extra;
4922 coding->produced_char += extra;
4923 decode_eol_post_ccl (coding, destination, coding->produced);
4924 }
d46c5b12
KH
4925 break;
4926
b73bfc1c
KH
4927 default:
4928 decode_eol (coding, source, destination, src_bytes, dst_bytes);
4929 }
4930
4931 if (coding->result == CODING_FINISH_INSUFFICIENT_SRC
e7c9eef9 4932 && coding->mode & CODING_MODE_LAST_BLOCK
b73bfc1c
KH
4933 && coding->consumed == src_bytes)
4934 coding->result = CODING_FINISH_NORMAL;
4935
4936 if (coding->mode & CODING_MODE_LAST_BLOCK
4937 && coding->result == CODING_FINISH_INSUFFICIENT_SRC)
4938 {
a4244313 4939 const unsigned char *src = source + coding->consumed;
b73bfc1c
KH
4940 unsigned char *dst = destination + coding->produced;
4941
4942 src_bytes -= coding->consumed;
bb10be8b 4943 coding->errors++;
b73bfc1c
KH
4944 if (COMPOSING_P (coding))
4945 DECODE_COMPOSITION_END ('1');
4946 while (src_bytes--)
d46c5b12 4947 {
b73bfc1c
KH
4948 int c = *src++;
4949 dst += CHAR_STRING (c, dst);
4950 coding->produced_char++;
d46c5b12 4951 }
b73bfc1c
KH
4952 coding->consumed = coding->consumed_char = src - source;
4953 coding->produced = dst - destination;
73be902c 4954 coding->result = CODING_FINISH_NORMAL;
4ed46869
KH
4955 }
4956
b73bfc1c
KH
4957 if (!coding->dst_multibyte)
4958 {
4959 coding->produced = str_as_unibyte (destination, coding->produced);
4960 coding->produced_char = coding->produced;
4961 }
4ed46869 4962
b73bfc1c
KH
4963 return coding->result;
4964}
52d41803 4965
b73bfc1c
KH
4966/* See "GENERAL NOTES about `encode_coding_XXX ()' functions". The
4967 multibyteness of the source is CODING->src_multibyte, the
4968 multibyteness of the result is always unibyte. */
4ed46869
KH
4969
4970int
d46c5b12 4971encode_coding (coding, source, destination, src_bytes, dst_bytes)
4ed46869 4972 struct coding_system *coding;
a4244313
KR
4973 const unsigned char *source;
4974 unsigned char *destination;
4ed46869 4975 int src_bytes, dst_bytes;
4ed46869 4976{
b73bfc1c
KH
4977 coding->produced = coding->produced_char = 0;
4978 coding->consumed = coding->consumed_char = 0;
4979 coding->errors = 0;
4980 coding->result = CODING_FINISH_NORMAL;
4ed46869 4981
d46c5b12
KH
4982 switch (coding->type)
4983 {
4ed46869 4984 case coding_type_sjis:
b73bfc1c
KH
4985 encode_coding_sjis_big5 (coding, source, destination,
4986 src_bytes, dst_bytes, 1);
4ed46869
KH
4987 break;
4988
4989 case coding_type_iso2022:
b73bfc1c
KH
4990 encode_coding_iso2022 (coding, source, destination,
4991 src_bytes, dst_bytes);
4ed46869
KH
4992 break;
4993
4994 case coding_type_big5:
b73bfc1c
KH
4995 encode_coding_sjis_big5 (coding, source, destination,
4996 src_bytes, dst_bytes, 0);
4997 break;
4998
4999 case coding_type_emacs_mule:
5000 encode_coding_emacs_mule (coding, source, destination,
5001 src_bytes, dst_bytes);
4ed46869
KH
5002 break;
5003
5004 case coding_type_ccl:
b73bfc1c
KH
5005 ccl_coding_driver (coding, source, destination,
5006 src_bytes, dst_bytes, 1);
d46c5b12
KH
5007 break;
5008
b73bfc1c
KH
5009 default:
5010 encode_eol (coding, source, destination, src_bytes, dst_bytes);
5011 }
5012
73be902c
KH
5013 if (coding->mode & CODING_MODE_LAST_BLOCK
5014 && coding->result == CODING_FINISH_INSUFFICIENT_SRC)
b73bfc1c 5015 {
a4244313 5016 const unsigned char *src = source + coding->consumed;
b73bfc1c
KH
5017 unsigned char *dst = destination + coding->produced;
5018
5019 if (coding->type == coding_type_iso2022)
5020 ENCODE_RESET_PLANE_AND_REGISTER;
5021 if (COMPOSING_P (coding))
5022 *dst++ = ISO_CODE_ESC, *dst++ = '1';
5023 if (coding->consumed < src_bytes)
d46c5b12 5024 {
b73bfc1c
KH
5025 int len = src_bytes - coding->consumed;
5026
fabf4a91 5027 BCOPY_SHORT (src, dst, len);
b73bfc1c
KH
5028 if (coding->src_multibyte)
5029 len = str_as_unibyte (dst, len);
5030 dst += len;
5031 coding->consumed = src_bytes;
d46c5b12 5032 }
b73bfc1c 5033 coding->produced = coding->produced_char = dst - destination;
73be902c 5034 coding->result = CODING_FINISH_NORMAL;
4ed46869
KH
5035 }
5036
bb10be8b
KH
5037 if (coding->result == CODING_FINISH_INSUFFICIENT_SRC
5038 && coding->consumed == src_bytes)
5039 coding->result = CODING_FINISH_NORMAL;
5040
b73bfc1c 5041 return coding->result;
4ed46869
KH
5042}
5043
fb88bf2d
KH
5044/* Scan text in the region between *BEG and *END (byte positions),
5045 skip characters which we don't have to decode by coding system
5046 CODING at the head and tail, then set *BEG and *END to the region
5047 of the text we actually have to convert. The caller should move
b73bfc1c
KH
5048 the gap out of the region in advance if the region is from a
5049 buffer.
4ed46869 5050
d46c5b12
KH
5051 If STR is not NULL, *BEG and *END are indices into STR. */
5052
5053static void
5054shrink_decoding_region (beg, end, coding, str)
5055 int *beg, *end;
5056 struct coding_system *coding;
5057 unsigned char *str;
5058{
fb88bf2d 5059 unsigned char *begp_orig, *begp, *endp_orig, *endp, c;
d46c5b12 5060 int eol_conversion;
88993dfd 5061 Lisp_Object translation_table;
d46c5b12
KH
5062
5063 if (coding->type == coding_type_ccl
5064 || coding->type == coding_type_undecided
b73bfc1c
KH
5065 || coding->eol_type != CODING_EOL_LF
5066 || !NILP (coding->post_read_conversion)
5067 || coding->composing != COMPOSITION_DISABLED)
d46c5b12
KH
5068 {
5069 /* We can't skip any data. */
5070 return;
5071 }
b73bfc1c
KH
5072 if (coding->type == coding_type_no_conversion
5073 || coding->type == coding_type_raw_text
5074 || coding->type == coding_type_emacs_mule)
d46c5b12 5075 {
fb88bf2d
KH
5076 /* We need no conversion, but don't have to skip any data here.
5077 Decoding routine handles them effectively anyway. */
d46c5b12
KH
5078 return;
5079 }
5080
88993dfd
KH
5081 translation_table = coding->translation_table_for_decode;
5082 if (NILP (translation_table) && !NILP (Venable_character_translation))
5083 translation_table = Vstandard_translation_table_for_decode;
5084 if (CHAR_TABLE_P (translation_table))
5085 {
5086 int i;
5087 for (i = 0; i < 128; i++)
5088 if (!NILP (CHAR_TABLE_REF (translation_table, i)))
5089 break;
5090 if (i < 128)
fa46990e 5091 /* Some ASCII character should be translated. We give up
88993dfd
KH
5092 shrinking. */
5093 return;
5094 }
5095
b73bfc1c 5096 if (coding->heading_ascii >= 0)
d46c5b12
KH
5097 /* Detection routine has already found how much we can skip at the
5098 head. */
5099 *beg += coding->heading_ascii;
5100
5101 if (str)
5102 {
5103 begp_orig = begp = str + *beg;
5104 endp_orig = endp = str + *end;
5105 }
5106 else
5107 {
fb88bf2d 5108 begp_orig = begp = BYTE_POS_ADDR (*beg);
d46c5b12
KH
5109 endp_orig = endp = begp + *end - *beg;
5110 }
5111
fa46990e
DL
5112 eol_conversion = (coding->eol_type == CODING_EOL_CR
5113 || coding->eol_type == CODING_EOL_CRLF);
5114
d46c5b12
KH
5115 switch (coding->type)
5116 {
d46c5b12
KH
5117 case coding_type_sjis:
5118 case coding_type_big5:
5119 /* We can skip all ASCII characters at the head. */
5120 if (coding->heading_ascii < 0)
5121 {
5122 if (eol_conversion)
de9d083c 5123 while (begp < endp && *begp < 0x80 && *begp != '\r') begp++;
d46c5b12
KH
5124 else
5125 while (begp < endp && *begp < 0x80) begp++;
5126 }
5127 /* We can skip all ASCII characters at the tail except for the
5128 second byte of SJIS or BIG5 code. */
5129 if (eol_conversion)
de9d083c 5130 while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\r') endp--;
d46c5b12
KH
5131 else
5132 while (begp < endp && endp[-1] < 0x80) endp--;
ee59c65f
RS
5133 /* Do not consider LF as ascii if preceded by CR, since that
5134 confuses eol decoding. */
5135 if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
5136 endp++;
d46c5b12
KH
5137 if (begp < endp && endp < endp_orig && endp[-1] >= 0x80)
5138 endp++;
5139 break;
5140
b73bfc1c 5141 case coding_type_iso2022:
622fece5
KH
5142 if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
5143 /* We can't skip any data. */
5144 break;
d46c5b12
KH
5145 if (coding->heading_ascii < 0)
5146 {
d46c5b12
KH
5147 /* We can skip all ASCII characters at the head except for a
5148 few control codes. */
5149 while (begp < endp && (c = *begp) < 0x80
5150 && c != ISO_CODE_CR && c != ISO_CODE_SO
5151 && c != ISO_CODE_SI && c != ISO_CODE_ESC
5152 && (!eol_conversion || c != ISO_CODE_LF))
5153 begp++;
5154 }
5155 switch (coding->category_idx)
5156 {
5157 case CODING_CATEGORY_IDX_ISO_8_1:
5158 case CODING_CATEGORY_IDX_ISO_8_2:
5159 /* We can skip all ASCII characters at the tail. */
5160 if (eol_conversion)
de9d083c 5161 while (begp < endp && (c = endp[-1]) < 0x80 && c != '\r') endp--;
d46c5b12
KH
5162 else
5163 while (begp < endp && endp[-1] < 0x80) endp--;
ee59c65f
RS
5164 /* Do not consider LF as ascii if preceded by CR, since that
5165 confuses eol decoding. */
5166 if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
5167 endp++;
d46c5b12
KH
5168 break;
5169
5170 case CODING_CATEGORY_IDX_ISO_7:
5171 case CODING_CATEGORY_IDX_ISO_7_TIGHT:
de79a6a5 5172 {
8ca3766a 5173 /* We can skip all characters at the tail except for 8-bit
de79a6a5
KH
5174 codes and ESC and the following 2-byte at the tail. */
5175 unsigned char *eight_bit = NULL;
5176
5177 if (eol_conversion)
5178 while (begp < endp
5179 && (c = endp[-1]) != ISO_CODE_ESC && c != '\r')
5180 {
5181 if (!eight_bit && c & 0x80) eight_bit = endp;
5182 endp--;
5183 }
5184 else
5185 while (begp < endp
5186 && (c = endp[-1]) != ISO_CODE_ESC)
5187 {
5188 if (!eight_bit && c & 0x80) eight_bit = endp;
5189 endp--;
5190 }
5191 /* Do not consider LF as ascii if preceded by CR, since that
5192 confuses eol decoding. */
5193 if (begp < endp && endp < endp_orig
5194 && endp[-1] == '\r' && endp[0] == '\n')
5195 endp++;
5196 if (begp < endp && endp[-1] == ISO_CODE_ESC)
5197 {
5198 if (endp + 1 < endp_orig && end[0] == '(' && end[1] == 'B')
5199 /* This is an ASCII designation sequence. We can
5200 surely skip the tail. But, if we have
5201 encountered an 8-bit code, skip only the codes
5202 after that. */
5203 endp = eight_bit ? eight_bit : endp + 2;
5204 else
5205 /* Hmmm, we can't skip the tail. */
5206 endp = endp_orig;
5207 }
5208 else if (eight_bit)
5209 endp = eight_bit;
5210 }
d46c5b12 5211 }
b73bfc1c
KH
5212 break;
5213
5214 default:
5215 abort ();
d46c5b12
KH
5216 }
5217 *beg += begp - begp_orig;
5218 *end += endp - endp_orig;
5219 return;
5220}
5221
5222/* Like shrink_decoding_region but for encoding. */
5223
5224static void
5225shrink_encoding_region (beg, end, coding, str)
5226 int *beg, *end;
5227 struct coding_system *coding;
5228 unsigned char *str;
5229{
5230 unsigned char *begp_orig, *begp, *endp_orig, *endp;
5231 int eol_conversion;
88993dfd 5232 Lisp_Object translation_table;
d46c5b12 5233
b73bfc1c
KH
5234 if (coding->type == coding_type_ccl
5235 || coding->eol_type == CODING_EOL_CRLF
5236 || coding->eol_type == CODING_EOL_CR
87323294 5237 || (coding->cmp_data && coding->cmp_data->used > 0))
d46c5b12 5238 {
b73bfc1c
KH
5239 /* We can't skip any data. */
5240 return;
5241 }
5242 if (coding->type == coding_type_no_conversion
5243 || coding->type == coding_type_raw_text
5244 || coding->type == coding_type_emacs_mule
5245 || coding->type == coding_type_undecided)
5246 {
5247 /* We need no conversion, but don't have to skip any data here.
5248 Encoding routine handles them effectively anyway. */
d46c5b12
KH
5249 return;
5250 }
5251
88993dfd
KH
5252 translation_table = coding->translation_table_for_encode;
5253 if (NILP (translation_table) && !NILP (Venable_character_translation))
5254 translation_table = Vstandard_translation_table_for_encode;
5255 if (CHAR_TABLE_P (translation_table))
5256 {
5257 int i;
5258 for (i = 0; i < 128; i++)
5259 if (!NILP (CHAR_TABLE_REF (translation_table, i)))
5260 break;
5261 if (i < 128)
8ca3766a 5262 /* Some ASCII character should be translated. We give up
88993dfd
KH
5263 shrinking. */
5264 return;
5265 }
5266
d46c5b12
KH
5267 if (str)
5268 {
5269 begp_orig = begp = str + *beg;
5270 endp_orig = endp = str + *end;
5271 }
5272 else
5273 {
fb88bf2d 5274 begp_orig = begp = BYTE_POS_ADDR (*beg);
d46c5b12
KH
5275 endp_orig = endp = begp + *end - *beg;
5276 }
5277
5278 eol_conversion = (coding->eol_type == CODING_EOL_CR
5279 || coding->eol_type == CODING_EOL_CRLF);
5280
5281 /* Here, we don't have to check coding->pre_write_conversion because
5282 the caller is expected to have handled it already. */
5283 switch (coding->type)
5284 {
d46c5b12 5285 case coding_type_iso2022:
622fece5
KH
5286 if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
5287 /* We can't skip any data. */
5288 break;
d46c5b12
KH
5289 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL)
5290 {
93dec019 5291 unsigned char *bol = begp;
d46c5b12
KH
5292 while (begp < endp && *begp < 0x80)
5293 {
5294 begp++;
5295 if (begp[-1] == '\n')
5296 bol = begp;
5297 }
5298 begp = bol;
5299 goto label_skip_tail;
5300 }
5301 /* fall down ... */
5302
b73bfc1c
KH
5303 case coding_type_sjis:
5304 case coding_type_big5:
d46c5b12
KH
5305 /* We can skip all ASCII characters at the head and tail. */
5306 if (eol_conversion)
5307 while (begp < endp && *begp < 0x80 && *begp != '\n') begp++;
5308 else
5309 while (begp < endp && *begp < 0x80) begp++;
5310 label_skip_tail:
5311 if (eol_conversion)
5312 while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\n') endp--;
5313 else
5314 while (begp < endp && *(endp - 1) < 0x80) endp--;
5315 break;
b73bfc1c
KH
5316
5317 default:
5318 abort ();
d46c5b12
KH
5319 }
5320
5321 *beg += begp - begp_orig;
5322 *end += endp - endp_orig;
5323 return;
5324}
5325
88993dfd
KH
/* Shrinking a conversion region has some overhead of its own, so it
   is attempted only for regions longer than this number of bytes.  */
static int shrink_conversion_region_threshhold = 1024;

/* Narrow the region between *BEG and *END with
   shrink_encoding_region (if ENCODEP is nonzero) or
   shrink_decoding_region (otherwise), provided that the region is
   longer than shrink_conversion_region_threshhold.  BEG and END are
   pointers to the positions; CODING and STR are handed on as they
   are.  */
#define SHRINK_CONVERSION_REGION(beg, end, coding, str, encodep)        \
  do {                                                                  \
    if (*(end) - *(beg) > shrink_conversion_region_threshhold)          \
      {                                                                 \
        if (encodep)                                                    \
          shrink_encoding_region (beg, end, coding, str);               \
        else                                                            \
          shrink_decoding_region (beg, end, coding, str);               \
      }                                                                 \
  } while (0)
5339
b843d1ae 5340static Lisp_Object
1c7457e2
KH
5341code_convert_region_unwind (arg)
5342 Lisp_Object arg;
b843d1ae
KH
5343{
5344 inhibit_pre_post_conversion = 0;
1c7457e2 5345 Vlast_coding_system_used = arg;
b843d1ae
KH
5346 return Qnil;
5347}
5348
ec6d2bb8
KH
5349/* Store information about all compositions in the range FROM and TO
5350 of OBJ in memory blocks pointed by CODING->cmp_data. OBJ is a
5351 buffer or a string, defaults to the current buffer. */
5352
5353void
5354coding_save_composition (coding, from, to, obj)
5355 struct coding_system *coding;
5356 int from, to;
5357 Lisp_Object obj;
5358{
5359 Lisp_Object prop;
5360 int start, end;
5361
91bee881
KH
5362 if (coding->composing == COMPOSITION_DISABLED)
5363 return;
5364 if (!coding->cmp_data)
5365 coding_allocate_composition_data (coding, from);
ec6d2bb8
KH
5366 if (!find_composition (from, to, &start, &end, &prop, obj)
5367 || end > to)
5368 return;
5369 if (start < from
5370 && (!find_composition (end, to, &start, &end, &prop, obj)
5371 || end > to))
5372 return;
5373 coding->composing = COMPOSITION_NO;
ec6d2bb8
KH
5374 do
5375 {
5376 if (COMPOSITION_VALID_P (start, end, prop))
5377 {
5378 enum composition_method method = COMPOSITION_METHOD (prop);
5379 if (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH
5380 >= COMPOSITION_DATA_SIZE)
5381 coding_allocate_composition_data (coding, from);
5382 /* For relative composition, we remember start and end
5383 positions, for the other compositions, we also remember
5384 components. */
5385 CODING_ADD_COMPOSITION_START (coding, start - from, method);
5386 if (method != COMPOSITION_RELATIVE)
5387 {
5388 /* We must store a*/
5389 Lisp_Object val, ch;
5390
5391 val = COMPOSITION_COMPONENTS (prop);
5392 if (CONSP (val))
5393 while (CONSP (val))
5394 {
5395 ch = XCAR (val), val = XCDR (val);
5396 CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (ch));
5397 }
5398 else if (VECTORP (val) || STRINGP (val))
5399 {
5400 int len = (VECTORP (val)
d5db4077 5401 ? XVECTOR (val)->size : SCHARS (val));
ec6d2bb8
KH
5402 int i;
5403 for (i = 0; i < len; i++)
5404 {
5405 ch = (STRINGP (val)
5406 ? Faref (val, make_number (i))
5407 : XVECTOR (val)->contents[i]);
5408 CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (ch));
5409 }
5410 }
5411 else /* INTEGERP (val) */
5412 CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (val));
5413 }
5414 CODING_ADD_COMPOSITION_END (coding, end - from);
5415 }
5416 start = end;
5417 }
5418 while (start < to
5419 && find_composition (start, to, &start, &end, &prop, obj)
5420 && end <= to);
5421
5422 /* Make coding->cmp_data point to the first memory block. */
5423 while (coding->cmp_data->prev)
5424 coding->cmp_data = coding->cmp_data->prev;
5425 coding->cmp_data_start = 0;
5426}
5427
5428/* Reflect the saved information about compositions to OBJ.
8ca3766a 5429 CODING->cmp_data points to a memory block for the information. OBJ
ec6d2bb8
KH
5430 is a buffer or a string, defaults to the current buffer. */
5431
33fb63eb 5432void
ec6d2bb8
KH
5433coding_restore_composition (coding, obj)
5434 struct coding_system *coding;
5435 Lisp_Object obj;
5436{
5437 struct composition_data *cmp_data = coding->cmp_data;
5438
5439 if (!cmp_data)
5440 return;
5441
5442 while (cmp_data->prev)
5443 cmp_data = cmp_data->prev;
5444
5445 while (cmp_data)
5446 {
5447 int i;
5448
78108bcd
KH
5449 for (i = 0; i < cmp_data->used && cmp_data->data[i] > 0;
5450 i += cmp_data->data[i])
ec6d2bb8
KH
5451 {
5452 int *data = cmp_data->data + i;
5453 enum composition_method method = (enum composition_method) data[3];
5454 Lisp_Object components;
5455
4307d534
KH
5456 if (data[0] < 0 || i + data[0] > cmp_data->used)
5457 /* Invalid composition data. */
5458 break;
5459
ec6d2bb8
KH
5460 if (method == COMPOSITION_RELATIVE)
5461 components = Qnil;
5462 else
5463 {
5464 int len = data[0] - 4, j;
5465 Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
5466
b6871cc7
KH
5467 if (method == COMPOSITION_WITH_RULE_ALTCHARS
5468 && len % 2 == 0)
5469 len --;
09721b31
KH
5470 if (len < 1)
5471 /* Invalid composition data. */
5472 break;
ec6d2bb8
KH
5473 for (j = 0; j < len; j++)
5474 args[j] = make_number (data[4 + j]);
5475 components = (method == COMPOSITION_WITH_ALTCHARS
316d4bf9
SM
5476 ? Fstring (len, args)
5477 : Fvector (len, args));
ec6d2bb8
KH
5478 }
5479 compose_text (data[1], data[2], components, Qnil, obj);
5480 }
5481 cmp_data = cmp_data->next;
5482 }
5483}
5484
d46c5b12 5485/* Decode (if ENCODEP is zero) or encode (if ENCODEP is nonzero) the
fb88bf2d
KH
5486 text from FROM to TO (byte positions are FROM_BYTE and TO_BYTE) by
5487 coding system CODING, and return the status code of code conversion
5488 (currently, this value has no meaning).
5489
5490 How many characters (and bytes) are converted to how many
5491 characters (and bytes) are recorded in members of the structure
5492 CODING.
d46c5b12 5493
6e44253b 5494 If REPLACE is nonzero, we do various things as if the original text
d46c5b12 5495 is deleted and a new text is inserted. See the comments in
b73bfc1c
KH
5496 replace_range (insdel.c) to know what we are doing.
5497
5498 If REPLACE is zero, it is assumed that the source text is unibyte.
8ca3766a 5499 Otherwise, it is assumed that the source text is multibyte. */
4ed46869
KH
5500
5501int
6e44253b
KH
5502code_convert_region (from, from_byte, to, to_byte, coding, encodep, replace)
5503 int from, from_byte, to, to_byte, encodep, replace;
4ed46869 5504 struct coding_system *coding;
4ed46869 5505{
fb88bf2d 5506 int len = to - from, len_byte = to_byte - from_byte;
72d1a715 5507 int nchars_del = 0, nbytes_del = 0;
fb88bf2d 5508 int require, inserted, inserted_byte;
4b39528c 5509 int head_skip, tail_skip, total_skip = 0;
84d60297 5510 Lisp_Object saved_coding_symbol;
fb88bf2d 5511 int first = 1;
fb88bf2d 5512 unsigned char *src, *dst;
84d60297 5513 Lisp_Object deletion;
e133c8fa 5514 int orig_point = PT, orig_len = len;
6abb9bd9 5515 int prev_Z;
b73bfc1c
KH
5516 int multibyte_p = !NILP (current_buffer->enable_multibyte_characters);
5517
84d60297 5518 deletion = Qnil;
8844fa83 5519 saved_coding_symbol = coding->symbol;
d46c5b12 5520
83fa074f 5521 if (from < PT && PT < to)
e133c8fa
KH
5522 {
5523 TEMP_SET_PT_BOTH (from, from_byte);
5524 orig_point = from;
5525 }
83fa074f 5526
6e44253b 5527 if (replace)
d46c5b12 5528 {
fb88bf2d 5529 int saved_from = from;
e077cc80 5530 int saved_inhibit_modification_hooks;
fb88bf2d 5531
d46c5b12 5532 prepare_to_modify_buffer (from, to, &from);
fb88bf2d
KH
5533 if (saved_from != from)
5534 {
5535 to = from + len;
b73bfc1c 5536 from_byte = CHAR_TO_BYTE (from), to_byte = CHAR_TO_BYTE (to);
fb88bf2d
KH
5537 len_byte = to_byte - from_byte;
5538 }
e077cc80
KH
5539
5540 /* The code conversion routine can not preserve text properties
5541 for now. So, we must remove all text properties in the
5542 region. Here, we must suppress all modification hooks. */
5543 saved_inhibit_modification_hooks = inhibit_modification_hooks;
5544 inhibit_modification_hooks = 1;
5545 Fset_text_properties (make_number (from), make_number (to), Qnil, Qnil);
5546 inhibit_modification_hooks = saved_inhibit_modification_hooks;
d46c5b12 5547 }
d46c5b12
KH
5548
5549 if (! encodep && CODING_REQUIRE_DETECTION (coding))
5550 {
12410ef1 5551 /* We must detect encoding of text and eol format. */
d46c5b12
KH
5552
5553 if (from < GPT && to > GPT)
5554 move_gap_both (from, from_byte);
5555 if (coding->type == coding_type_undecided)
5556 {
fb88bf2d 5557 detect_coding (coding, BYTE_POS_ADDR (from_byte), len_byte);
d46c5b12 5558 if (coding->type == coding_type_undecided)
62b3ef1d
KH
5559 {
5560 /* It seems that the text contains only ASCII, but we
d9aef30f 5561 should not leave it undecided because the deeper
62b3ef1d
KH
5562 decoding routine (decode_coding) tries to detect the
5563 encodings again in vain. */
5564 coding->type = coding_type_emacs_mule;
5565 coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE;
d280ccb6
KH
5566 /* As emacs-mule decoder will handle composition, we
5567 need this setting to allocate coding->cmp_data
5568 later. */
5569 coding->composing = COMPOSITION_NO;
62b3ef1d 5570 }
d46c5b12 5571 }
aaaf0b1e
KH
5572 if (coding->eol_type == CODING_EOL_UNDECIDED
5573 && coding->type != coding_type_ccl)
d46c5b12 5574 {
d46c5b12
KH
5575 detect_eol (coding, BYTE_POS_ADDR (from_byte), len_byte);
5576 if (coding->eol_type == CODING_EOL_UNDECIDED)
5577 coding->eol_type = CODING_EOL_LF;
5578 /* We had better recover the original eol format if we
8ca3766a 5579 encounter an inconsistent eol format while decoding. */
d46c5b12
KH
5580 coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
5581 }
5582 }
5583
d46c5b12
KH
5584 /* Now we convert the text. */
5585
5586 /* For encoding, we must process pre-write-conversion in advance. */
b73bfc1c
KH
5587 if (! inhibit_pre_post_conversion
5588 && encodep
d46c5b12
KH
5589 && SYMBOLP (coding->pre_write_conversion)
5590 && ! NILP (Ffboundp (coding->pre_write_conversion)))
5591 {
2b4f9037
KH
5592 /* The function in pre-write-conversion may put a new text in a
5593 new buffer. */
0007bdd0
KH
5594 struct buffer *prev = current_buffer;
5595 Lisp_Object new;
d46c5b12 5596
1c7457e2 5597 record_unwind_protect (code_convert_region_unwind,
24a948a7 5598 Vlast_coding_system_used);
b843d1ae
KH
5599 /* We should not call any more pre-write/post-read-conversion
5600 functions while this pre-write-conversion is running. */
5601 inhibit_pre_post_conversion = 1;
b39f748c
AS
5602 call2 (coding->pre_write_conversion,
5603 make_number (from), make_number (to));
b843d1ae
KH
5604 inhibit_pre_post_conversion = 0;
5605 /* Discard the unwind protect. */
5606 specpdl_ptr--;
5607
d46c5b12
KH
5608 if (current_buffer != prev)
5609 {
5610 len = ZV - BEGV;
0007bdd0 5611 new = Fcurrent_buffer ();
d46c5b12 5612 set_buffer_internal_1 (prev);
7dae4502 5613 del_range_2 (from, from_byte, to, to_byte, 0);
e133c8fa 5614 TEMP_SET_PT_BOTH (from, from_byte);
0007bdd0
KH
5615 insert_from_buffer (XBUFFER (new), 1, len, 0);
5616 Fkill_buffer (new);
e133c8fa
KH
5617 if (orig_point >= to)
5618 orig_point += len - orig_len;
5619 else if (orig_point > from)
5620 orig_point = from;
5621 orig_len = len;
d46c5b12 5622 to = from + len;
b73bfc1c
KH
5623 from_byte = CHAR_TO_BYTE (from);
5624 to_byte = CHAR_TO_BYTE (to);
d46c5b12 5625 len_byte = to_byte - from_byte;
e133c8fa 5626 TEMP_SET_PT_BOTH (from, from_byte);
d46c5b12
KH
5627 }
5628 }
5629
12410ef1 5630 if (replace)
72d1a715
RS
5631 {
5632 if (! EQ (current_buffer->undo_list, Qt))
5633 deletion = make_buffer_string_both (from, from_byte, to, to_byte, 1);
5634 else
5635 {
5636 nchars_del = to - from;
5637 nbytes_del = to_byte - from_byte;
5638 }
5639 }
12410ef1 5640
ec6d2bb8
KH
5641 if (coding->composing != COMPOSITION_DISABLED)
5642 {
5643 if (encodep)
5644 coding_save_composition (coding, from, to, Fcurrent_buffer ());
5645 else
5646 coding_allocate_composition_data (coding, from);
5647 }
fb88bf2d 5648
b73bfc1c 5649 /* Try to skip the heading and tailing ASCIIs. */
4956c225
KH
5650 if (coding->type != coding_type_ccl)
5651 {
5652 int from_byte_orig = from_byte, to_byte_orig = to_byte;
ec6d2bb8 5653
4956c225
KH
5654 if (from < GPT && GPT < to)
5655 move_gap_both (from, from_byte);
5656 SHRINK_CONVERSION_REGION (&from_byte, &to_byte, coding, NULL, encodep);
5657 if (from_byte == to_byte
5658 && (encodep || NILP (coding->post_read_conversion))
5659 && ! CODING_REQUIRE_FLUSHING (coding))
5660 {
5661 coding->produced = len_byte;
5662 coding->produced_char = len;
5663 if (!replace)
5664 /* We must record and adjust for this new text now. */
5665 adjust_after_insert (from, from_byte_orig, to, to_byte_orig, len);
5666 return 0;
5667 }
5668
5669 head_skip = from_byte - from_byte_orig;
5670 tail_skip = to_byte_orig - to_byte;
5671 total_skip = head_skip + tail_skip;
5672 from += head_skip;
5673 to -= tail_skip;
5674 len -= total_skip; len_byte -= total_skip;
5675 }
d46c5b12 5676
8ca3766a 5677 /* For conversion, we must put the gap before the text in addition to
fb88bf2d
KH
5678 making the gap larger for efficient decoding. The required gap
5679 size starts from 2000 which is the magic number used in make_gap.
5680 But, after one batch of conversion, it will be incremented if we
5681 find that it is not enough . */
d46c5b12
KH
5682 require = 2000;
5683
5684 if (GAP_SIZE < require)
5685 make_gap (require - GAP_SIZE);
5686 move_gap_both (from, from_byte);
5687
d46c5b12 5688 inserted = inserted_byte = 0;
fb88bf2d
KH
5689
5690 GAP_SIZE += len_byte;
5691 ZV -= len;
5692 Z -= len;
5693 ZV_BYTE -= len_byte;
5694 Z_BYTE -= len_byte;
5695
d9f9a1bc
GM
5696 if (GPT - BEG < BEG_UNCHANGED)
5697 BEG_UNCHANGED = GPT - BEG;
5698 if (Z - GPT < END_UNCHANGED)
5699 END_UNCHANGED = Z - GPT;
f2558efd 5700
b73bfc1c
KH
5701 if (!encodep && coding->src_multibyte)
5702 {
5703 /* Decoding routines expects that the source text is unibyte.
5704 We must convert 8-bit characters of multibyte form to
5705 unibyte. */
5706 int len_byte_orig = len_byte;
5707 len_byte = str_as_unibyte (GAP_END_ADDR - len_byte, len_byte);
5708 if (len_byte < len_byte_orig)
5709 safe_bcopy (GAP_END_ADDR - len_byte_orig, GAP_END_ADDR - len_byte,
5710 len_byte);
5711 coding->src_multibyte = 0;
5712 }
5713
d46c5b12
KH
5714 for (;;)
5715 {
fb88bf2d 5716 int result;
d46c5b12 5717
ec6d2bb8 5718 /* The buffer memory is now:
b73bfc1c
KH
5719 +--------+converted-text+---------+-------original-text-------+---+
5720 |<-from->|<--inserted-->|---------|<--------len_byte--------->|---|
5721 |<---------------------- GAP ----------------------->| */
ec6d2bb8
KH
5722 src = GAP_END_ADDR - len_byte;
5723 dst = GPT_ADDR + inserted_byte;
5724
d46c5b12 5725 if (encodep)
fb88bf2d 5726 result = encode_coding (coding, src, dst, len_byte, 0);
d46c5b12 5727 else
0e79d667
RS
5728 {
5729 if (coding->composing != COMPOSITION_DISABLED)
5730 coding->cmp_data->char_offset = from + inserted;
5731 result = decode_coding (coding, src, dst, len_byte, 0);
5732 }
ec6d2bb8
KH
5733
5734 /* The buffer memory is now:
b73bfc1c
KH
5735 +--------+-------converted-text----+--+------original-text----+---+
5736 |<-from->|<-inserted->|<-produced->|--|<-(len_byte-consumed)->|---|
5737 |<---------------------- GAP ----------------------->| */
ec6d2bb8 5738
d46c5b12
KH
5739 inserted += coding->produced_char;
5740 inserted_byte += coding->produced;
d46c5b12 5741 len_byte -= coding->consumed;
ec6d2bb8
KH
5742
5743 if (result == CODING_FINISH_INSUFFICIENT_CMP)
5744 {
5745 coding_allocate_composition_data (coding, from + inserted);
5746 continue;
5747 }
5748
fb88bf2d 5749 src += coding->consumed;
3636f7a3 5750 dst += coding->produced;
d46c5b12 5751
9864ebce
KH
5752 if (result == CODING_FINISH_NORMAL)
5753 {
5754 src += len_byte;
5755 break;
5756 }
d46c5b12
KH
5757 if (! encodep && result == CODING_FINISH_INCONSISTENT_EOL)
5758 {
fb88bf2d 5759 unsigned char *pend = dst, *p = pend - inserted_byte;
38edf7d4 5760 Lisp_Object eol_type;
d46c5b12
KH
5761
5762 /* Encode LFs back to the original eol format (CR or CRLF). */
5763 if (coding->eol_type == CODING_EOL_CR)
5764 {
5765 while (p < pend) if (*p++ == '\n') p[-1] = '\r';
5766 }
5767 else
5768 {
d46c5b12
KH
5769 int count = 0;
5770
fb88bf2d
KH
5771 while (p < pend) if (*p++ == '\n') count++;
5772 if (src - dst < count)
d46c5b12 5773 {
38edf7d4 5774 /* We don't have sufficient room for encoding LFs
fb88bf2d
KH
5775 back to CRLF. We must record converted and
5776 not-yet-converted text back to the buffer
5777 content, enlarge the gap, then record them out of
5778 the buffer contents again. */
5779 int add = len_byte + inserted_byte;
5780
5781 GAP_SIZE -= add;
5782 ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
5783 GPT += inserted_byte; GPT_BYTE += inserted_byte;
5784 make_gap (count - GAP_SIZE);
5785 GAP_SIZE += add;
5786 ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
5787 GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5788 /* Don't forget to update SRC, DST, and PEND. */
5789 src = GAP_END_ADDR - len_byte;
5790 dst = GPT_ADDR + inserted_byte;
5791 pend = dst;
d46c5b12 5792 }
d46c5b12
KH
5793 inserted += count;
5794 inserted_byte += count;
fb88bf2d
KH
5795 coding->produced += count;
5796 p = dst = pend + count;
5797 while (count)
5798 {
5799 *--p = *--pend;
5800 if (*p == '\n') count--, *--p = '\r';
5801 }
d46c5b12
KH
5802 }
5803
5804 /* Suppress eol-format conversion in the further conversion. */
5805 coding->eol_type = CODING_EOL_LF;
5806
38edf7d4
KH
5807 /* Set the coding system symbol to that for Unix-like EOL. */
5808 eol_type = Fget (saved_coding_symbol, Qeol_type);
5809 if (VECTORP (eol_type)
5810 && XVECTOR (eol_type)->size == 3
5811 && SYMBOLP (XVECTOR (eol_type)->contents[CODING_EOL_LF]))
5812 coding->symbol = XVECTOR (eol_type)->contents[CODING_EOL_LF];
5813 else
5814 coding->symbol = saved_coding_symbol;
93dec019 5815
fb88bf2d 5816 continue;
d46c5b12
KH
5817 }
5818 if (len_byte <= 0)
944bd420
KH
5819 {
5820 if (coding->type != coding_type_ccl
5821 || coding->mode & CODING_MODE_LAST_BLOCK)
5822 break;
5823 coding->mode |= CODING_MODE_LAST_BLOCK;
5824 continue;
5825 }
d46c5b12
KH
5826 if (result == CODING_FINISH_INSUFFICIENT_SRC)
5827 {
5828 /* The source text ends in invalid codes. Let's just
5829 make them valid buffer contents, and finish conversion. */
70ad9fc4
GM
5830 if (multibyte_p)
5831 {
5832 unsigned char *start = dst;
93dec019 5833
70ad9fc4
GM
5834 inserted += len_byte;
5835 while (len_byte--)
5836 {
5837 int c = *src++;
5838 dst += CHAR_STRING (c, dst);
5839 }
5840
5841 inserted_byte += dst - start;
5842 }
5843 else
5844 {
5845 inserted += len_byte;
5846 inserted_byte += len_byte;
5847 while (len_byte--)
5848 *dst++ = *src++;
5849 }
d46c5b12
KH
5850 break;
5851 }
9864ebce
KH
5852 if (result == CODING_FINISH_INTERRUPT)
5853 {
5854 /* The conversion procedure was interrupted by a user. */
9864ebce
KH
5855 break;
5856 }
5857 /* Now RESULT == CODING_FINISH_INSUFFICIENT_DST */
5858 if (coding->consumed < 1)
5859 {
5860 /* It's quite strange to require more memory without
5861 consuming any bytes. Perhaps CCL program bug. */
9864ebce
KH
5862 break;
5863 }
fb88bf2d
KH
5864 if (first)
5865 {
5866 /* We have just done the first batch of conversion which was
8ca3766a 5867 stopped because of insufficient gap. Let's reconsider the
fb88bf2d
KH
5868 required gap size (i.e. SRT - DST) now.
5869
5870 We have converted ORIG bytes (== coding->consumed) into
5871 NEW bytes (coding->produced). To convert the remaining
5872 LEN bytes, we may need REQUIRE bytes of gap, where:
5873 REQUIRE + LEN_BYTE = LEN_BYTE * (NEW / ORIG)
5874 REQUIRE = LEN_BYTE * (NEW - ORIG) / ORIG
5875 Here, we are sure that NEW >= ORIG. */
b3385c28
KH
5876 float ratio;
5877
5878 if (coding->produced <= coding->consumed)
5879 {
5880 /* This happens because of CCL-based coding system with
5881 eol-type CRLF. */
5882 require = 0;
5883 }
5884 else
5885 {
5886 ratio = (coding->produced - coding->consumed) / coding->consumed;
5887 require = len_byte * ratio;
5888 }
fb88bf2d
KH
5889 first = 0;
5890 }
5891 if ((src - dst) < (require + 2000))
5892 {
5893 /* See the comment above the previous call of make_gap. */
5894 int add = len_byte + inserted_byte;
5895
5896 GAP_SIZE -= add;
5897 ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
5898 GPT += inserted_byte; GPT_BYTE += inserted_byte;
5899 make_gap (require + 2000);
5900 GAP_SIZE += add;
5901 ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
5902 GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
fb88bf2d 5903 }
d46c5b12 5904 }
fb88bf2d
KH
5905 if (src - dst > 0) *dst = 0; /* Put an anchor. */
5906
b73bfc1c
KH
5907 if (encodep && coding->dst_multibyte)
5908 {
5909 /* The output is unibyte. We must convert 8-bit characters to
5910 multibyte form. */
5911 if (inserted_byte * 2 > GAP_SIZE)
5912 {
5913 GAP_SIZE -= inserted_byte;
5914 ZV += inserted_byte; Z += inserted_byte;
5915 ZV_BYTE += inserted_byte; Z_BYTE += inserted_byte;
5916 GPT += inserted_byte; GPT_BYTE += inserted_byte;
5917 make_gap (inserted_byte - GAP_SIZE);
5918 GAP_SIZE += inserted_byte;
5919 ZV -= inserted_byte; Z -= inserted_byte;
5920 ZV_BYTE -= inserted_byte; Z_BYTE -= inserted_byte;
5921 GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5922 }
5923 inserted_byte = str_to_multibyte (GPT_ADDR, GAP_SIZE, inserted_byte);
5924 }
7553d0e1 5925
93dec019 5926 /* If we shrank the conversion area, adjust it now. */
12410ef1
KH
5927 if (total_skip > 0)
5928 {
5929 if (tail_skip > 0)
5930 safe_bcopy (GAP_END_ADDR, GPT_ADDR + inserted_byte, tail_skip);
5931 inserted += total_skip; inserted_byte += total_skip;
5932 GAP_SIZE += total_skip;
5933 GPT -= head_skip; GPT_BYTE -= head_skip;
5934 ZV -= total_skip; ZV_BYTE -= total_skip;
5935 Z -= total_skip; Z_BYTE -= total_skip;
5936 from -= head_skip; from_byte -= head_skip;
5937 to += tail_skip; to_byte += tail_skip;
5938 }
5939
6abb9bd9 5940 prev_Z = Z;
72d1a715
RS
5941 if (! EQ (current_buffer->undo_list, Qt))
5942 adjust_after_replace (from, from_byte, deletion, inserted, inserted_byte);
5943 else
5944 adjust_after_replace_noundo (from, from_byte, nchars_del, nbytes_del,
5945 inserted, inserted_byte);
6abb9bd9 5946 inserted = Z - prev_Z;
4ed46869 5947
ec6d2bb8
KH
5948 if (!encodep && coding->cmp_data && coding->cmp_data->used)
5949 coding_restore_composition (coding, Fcurrent_buffer ());
5950 coding_free_composition_data (coding);
5951
b73bfc1c
KH
5952 if (! inhibit_pre_post_conversion
5953 && ! encodep && ! NILP (coding->post_read_conversion))
d46c5b12 5954 {
2b4f9037 5955 Lisp_Object val;
1c7457e2 5956 Lisp_Object saved_coding_system;
4ed46869 5957
e133c8fa
KH
5958 if (from != PT)
5959 TEMP_SET_PT_BOTH (from, from_byte);
6abb9bd9 5960 prev_Z = Z;
1c7457e2
KH
5961 record_unwind_protect (code_convert_region_unwind,
5962 Vlast_coding_system_used);
5963 saved_coding_system = Vlast_coding_system_used;
5964 Vlast_coding_system_used = coding->symbol;
b843d1ae
KH
5965 /* We should not call any more pre-write/post-read-conversion
5966 functions while this post-read-conversion is running. */
5967 inhibit_pre_post_conversion = 1;
2b4f9037 5968 val = call1 (coding->post_read_conversion, make_number (inserted));
b843d1ae 5969 inhibit_pre_post_conversion = 0;
1c7457e2
KH
5970 coding->symbol = Vlast_coding_system_used;
5971 Vlast_coding_system_used = saved_coding_system;
b843d1ae
KH
5972 /* Discard the unwind protect. */
5973 specpdl_ptr--;
b7826503 5974 CHECK_NUMBER (val);
944bd420 5975 inserted += Z - prev_Z;
e133c8fa
KH
5976 }
5977
5978 if (orig_point >= from)
5979 {
5980 if (orig_point >= from + orig_len)
5981 orig_point += inserted - orig_len;
5982 else
5983 orig_point = from;
5984 TEMP_SET_PT (orig_point);
d46c5b12 5985 }
4ed46869 5986
ec6d2bb8
KH
5987 if (replace)
5988 {
5989 signal_after_change (from, to - from, inserted);
e19539f1 5990 update_compositions (from, from + inserted, CHECK_BORDER);
ec6d2bb8 5991 }
2b4f9037 5992
fb88bf2d 5993 {
12410ef1
KH
5994 coding->consumed = to_byte - from_byte;
5995 coding->consumed_char = to - from;
5996 coding->produced = inserted_byte;
5997 coding->produced_char = inserted;
fb88bf2d 5998 }
7553d0e1 5999
fb88bf2d 6000 return 0;
d46c5b12
KH
6001}
6002
6003Lisp_Object
b73bfc1c
KH
6004run_pre_post_conversion_on_str (str, coding, encodep)
6005 Lisp_Object str;
6006 struct coding_system *coding;
6007 int encodep;
6008{
aed13378 6009 int count = SPECPDL_INDEX ();
cf3b32fc 6010 struct gcpro gcpro1, gcpro2;
b73bfc1c 6011 int multibyte = STRING_MULTIBYTE (str);
3fd9494b
RS
6012 Lisp_Object buffer;
6013 struct buffer *buf;
cf3b32fc 6014 Lisp_Object old_deactivate_mark;
b73bfc1c
KH
6015
6016 record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
1c7457e2
KH
6017 record_unwind_protect (code_convert_region_unwind,
6018 Vlast_coding_system_used);
cf3b32fc
RS
6019 /* It is not crucial to specbind this. */
6020 old_deactivate_mark = Vdeactivate_mark;
6021 GCPRO2 (str, old_deactivate_mark);
3fd9494b
RS
6022
6023 buffer = Fget_buffer_create (build_string (" *code-converting-work*"));
6024 buf = XBUFFER (buffer);
6025
6ed0af67 6026 delete_all_overlays (buf);
3fd9494b
RS
6027 buf->directory = current_buffer->directory;
6028 buf->read_only = Qnil;
6029 buf->filename = Qnil;
6030 buf->undo_list = Qt;
6ed0af67
SM
6031 eassert (buf->overlays_before == NULL);
6032 eassert (buf->overlays_after == NULL);
3fd9494b
RS
6033
6034 set_buffer_internal (buf);
b73bfc1c
KH
6035 /* We must insert the contents of STR as is without
6036 unibyte<->multibyte conversion. For that, we adjust the
6037 multibyteness of the working buffer to that of STR. */
6038 Ferase_buffer ();
3fd9494b
RS
6039 buf->enable_multibyte_characters = multibyte ? Qt : Qnil;
6040
b73bfc1c 6041 insert_from_string (str, 0, 0,
d5db4077 6042 SCHARS (str), SBYTES (str), 0);
b73bfc1c
KH
6043 UNGCPRO;
6044 inhibit_pre_post_conversion = 1;
6045 if (encodep)
6046 call2 (coding->pre_write_conversion, make_number (BEG), make_number (Z));
6047 else
6bac5b12 6048 {
1c7457e2 6049 Vlast_coding_system_used = coding->symbol;
6bac5b12
KH
6050 TEMP_SET_PT_BOTH (BEG, BEG_BYTE);
6051 call1 (coding->post_read_conversion, make_number (Z - BEG));
1c7457e2 6052 coding->symbol = Vlast_coding_system_used;
6bac5b12 6053 }
b73bfc1c 6054 inhibit_pre_post_conversion = 0;
cf3b32fc 6055 Vdeactivate_mark = old_deactivate_mark;
78108bcd 6056 str = make_buffer_string (BEG, Z, 1);
b73bfc1c
KH
6057 return unbind_to (count, str);
6058}
6059
6060Lisp_Object
6061decode_coding_string (str, coding, nocopy)
d46c5b12 6062 Lisp_Object str;
4ed46869 6063 struct coding_system *coding;
b73bfc1c 6064 int nocopy;
4ed46869 6065{
d46c5b12 6066 int len;
73be902c 6067 struct conversion_buffer buf;
da55a2b7 6068 int from, to_byte;
84d60297 6069 Lisp_Object saved_coding_symbol;
d46c5b12 6070 int result;
78108bcd 6071 int require_decoding;
73be902c
KH
6072 int shrinked_bytes = 0;
6073 Lisp_Object newstr;
2391eaa4 6074 int consumed, consumed_char, produced, produced_char;
4ed46869 6075
b73bfc1c 6076 from = 0;
d5db4077 6077 to_byte = SBYTES (str);
4ed46869 6078
8844fa83 6079 saved_coding_symbol = coding->symbol;
764ca8da
KH
6080 coding->src_multibyte = STRING_MULTIBYTE (str);
6081 coding->dst_multibyte = 1;
b73bfc1c 6082 if (CODING_REQUIRE_DETECTION (coding))
d46c5b12
KH
6083 {
6084 /* See the comments in code_convert_region. */
6085 if (coding->type == coding_type_undecided)
6086 {
d5db4077 6087 detect_coding (coding, SDATA (str), to_byte);
d46c5b12 6088 if (coding->type == coding_type_undecided)
d280ccb6
KH
6089 {
6090 coding->type = coding_type_emacs_mule;
6091 coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE;
6092 /* As emacs-mule decoder will handle composition, we
6093 need this setting to allocate coding->cmp_data
6094 later. */
6095 coding->composing = COMPOSITION_NO;
6096 }
d46c5b12 6097 }
aaaf0b1e
KH
6098 if (coding->eol_type == CODING_EOL_UNDECIDED
6099 && coding->type != coding_type_ccl)
d46c5b12
KH
6100 {
6101 saved_coding_symbol = coding->symbol;
d5db4077 6102 detect_eol (coding, SDATA (str), to_byte);
d46c5b12
KH
6103 if (coding->eol_type == CODING_EOL_UNDECIDED)
6104 coding->eol_type = CODING_EOL_LF;
6105 /* We had better recover the original eol format if we
8ca3766a 6106 encounter an inconsistent eol format while decoding. */
d46c5b12
KH
6107 coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
6108 }
6109 }
4ed46869 6110
764ca8da
KH
6111 if (coding->type == coding_type_no_conversion
6112 || coding->type == coding_type_raw_text)
6113 coding->dst_multibyte = 0;
6114
78108bcd 6115 require_decoding = CODING_REQUIRE_DECODING (coding);
ec6d2bb8 6116
b73bfc1c 6117 if (STRING_MULTIBYTE (str))
d46c5b12 6118 {
b73bfc1c
KH
6119 /* Decoding routines expect the source text to be unibyte. */
6120 str = Fstring_as_unibyte (str);
d5db4077 6121 to_byte = SBYTES (str);
b73bfc1c 6122 nocopy = 1;
764ca8da 6123 coding->src_multibyte = 0;
b73bfc1c 6124 }
ec6d2bb8 6125
b73bfc1c 6126 /* Try to skip the heading and tailing ASCIIs. */
78108bcd 6127 if (require_decoding && coding->type != coding_type_ccl)
4956c225 6128 {
d5db4077 6129 SHRINK_CONVERSION_REGION (&from, &to_byte, coding, SDATA (str),
4956c225
KH
6130 0);
6131 if (from == to_byte)
78108bcd 6132 require_decoding = 0;
d5db4077 6133 shrinked_bytes = from + (SBYTES (str) - to_byte);
4956c225 6134 }
b73bfc1c 6135
439ad9ea
KH
6136 if (!require_decoding
6137 && !(SYMBOLP (coding->post_read_conversion)
6138 && !NILP (Ffboundp (coding->post_read_conversion))))
78108bcd 6139 {
d5db4077
KR
6140 coding->consumed = SBYTES (str);
6141 coding->consumed_char = SCHARS (str);
78108bcd
KH
6142 if (coding->dst_multibyte)
6143 {
6144 str = Fstring_as_multibyte (str);
6145 nocopy = 1;
6146 }
d5db4077
KR
6147 coding->produced = SBYTES (str);
6148 coding->produced_char = SCHARS (str);
78108bcd
KH
6149 return (nocopy ? str : Fcopy_sequence (str));
6150 }
6151
6152 if (coding->composing != COMPOSITION_DISABLED)
6153 coding_allocate_composition_data (coding, from);
b73bfc1c 6154 len = decoding_buffer_size (coding, to_byte - from);
73be902c 6155 allocate_conversion_buffer (buf, len);
4ed46869 6156
2391eaa4 6157 consumed = consumed_char = produced = produced_char = 0;
73be902c 6158 while (1)
4ed46869 6159 {
d5db4077 6160 result = decode_coding (coding, SDATA (str) + from + consumed,
73be902c
KH
6161 buf.data + produced, to_byte - from - consumed,
6162 buf.size - produced);
6163 consumed += coding->consumed;
2391eaa4 6164 consumed_char += coding->consumed_char;
73be902c
KH
6165 produced += coding->produced;
6166 produced_char += coding->produced_char;
2391eaa4
KH
6167 if (result == CODING_FINISH_NORMAL
6168 || (result == CODING_FINISH_INSUFFICIENT_SRC
6169 && coding->consumed == 0))
73be902c
KH
6170 break;
6171 if (result == CODING_FINISH_INSUFFICIENT_CMP)
6172 coding_allocate_composition_data (coding, from + produced_char);
6173 else if (result == CODING_FINISH_INSUFFICIENT_DST)
6174 extend_conversion_buffer (&buf);
6175 else if (result == CODING_FINISH_INCONSISTENT_EOL)
6176 {
8844fa83
KH
6177 Lisp_Object eol_type;
6178
73be902c
KH
6179 /* Recover the original EOL format. */
6180 if (coding->eol_type == CODING_EOL_CR)
6181 {
6182 unsigned char *p;
6183 for (p = buf.data; p < buf.data + produced; p++)
6184 if (*p == '\n') *p = '\r';
6185 }
6186 else if (coding->eol_type == CODING_EOL_CRLF)
6187 {
6188 int num_eol = 0;
6189 unsigned char *p0, *p1;
6190 for (p0 = buf.data, p1 = p0 + produced; p0 < p1; p0++)
6191 if (*p0 == '\n') num_eol++;
6192 if (produced + num_eol >= buf.size)
6193 extend_conversion_buffer (&buf);
6194 for (p0 = buf.data + produced, p1 = p0 + num_eol; p0 > buf.data;)
6195 {
6196 *--p1 = *--p0;
6197 if (*p0 == '\n') *--p1 = '\r';
6198 }
6199 produced += num_eol;
6200 produced_char += num_eol;
93dec019 6201 }
8844fa83 6202 /* Suppress eol-format conversion in the further conversion. */
73be902c 6203 coding->eol_type = CODING_EOL_LF;
8844fa83
KH
6204
6205 /* Set the coding system symbol to that for Unix-like EOL. */
6206 eol_type = Fget (saved_coding_symbol, Qeol_type);
6207 if (VECTORP (eol_type)
6208 && XVECTOR (eol_type)->size == 3
6209 && SYMBOLP (XVECTOR (eol_type)->contents[CODING_EOL_LF]))
6210 coding->symbol = XVECTOR (eol_type)->contents[CODING_EOL_LF];
6211 else
6212 coding->symbol = saved_coding_symbol;
6213
6214
73be902c 6215 }
4ed46869 6216 }
d46c5b12 6217
2391eaa4
KH
6218 coding->consumed = consumed;
6219 coding->consumed_char = consumed_char;
6220 coding->produced = produced;
6221 coding->produced_char = produced_char;
6222
78108bcd 6223 if (coding->dst_multibyte)
73be902c
KH
6224 newstr = make_uninit_multibyte_string (produced_char + shrinked_bytes,
6225 produced + shrinked_bytes);
78108bcd 6226 else
73be902c
KH
6227 newstr = make_uninit_string (produced + shrinked_bytes);
6228 if (from > 0)
a4244313
KR
6229 STRING_COPYIN (newstr, 0, SDATA (str), from);
6230 STRING_COPYIN (newstr, from, buf.data, produced);
73be902c 6231 if (shrinked_bytes > from)
a4244313
KR
6232 STRING_COPYIN (newstr, from + produced,
6233 SDATA (str) + to_byte,
6234 shrinked_bytes - from);
73be902c 6235 free_conversion_buffer (&buf);
b73bfc1c 6236
160a708c
KH
6237 coding->consumed += shrinked_bytes;
6238 coding->consumed_char += shrinked_bytes;
6239 coding->produced += shrinked_bytes;
6240 coding->produced_char += shrinked_bytes;
6241
b73bfc1c 6242 if (coding->cmp_data && coding->cmp_data->used)
73be902c 6243 coding_restore_composition (coding, newstr);
b73bfc1c
KH
6244 coding_free_composition_data (coding);
6245
6246 if (SYMBOLP (coding->post_read_conversion)
6247 && !NILP (Ffboundp (coding->post_read_conversion)))
73be902c 6248 newstr = run_pre_post_conversion_on_str (newstr, coding, 0);
b73bfc1c 6249
73be902c 6250 return newstr;
b73bfc1c
KH
6251}
6252
6253Lisp_Object
6254encode_coding_string (str, coding, nocopy)
6255 Lisp_Object str;
6256 struct coding_system *coding;
6257 int nocopy;
6258{
6259 int len;
73be902c 6260 struct conversion_buffer buf;
b73bfc1c 6261 int from, to, to_byte;
b73bfc1c 6262 int result;
73be902c
KH
6263 int shrinked_bytes = 0;
6264 Lisp_Object newstr;
2391eaa4 6265 int consumed, consumed_char, produced, produced_char;
b73bfc1c
KH
6266
6267 if (SYMBOLP (coding->pre_write_conversion)
6268 && !NILP (Ffboundp (coding->pre_write_conversion)))
6bac5b12 6269 str = run_pre_post_conversion_on_str (str, coding, 1);
b73bfc1c
KH
6270
6271 from = 0;
d5db4077
KR
6272 to = SCHARS (str);
6273 to_byte = SBYTES (str);
b73bfc1c 6274
e2c06b17
KH
6275 /* Encoding routines determine the multibyteness of the source text
6276 by coding->src_multibyte. */
6277 coding->src_multibyte = STRING_MULTIBYTE (str);
6278 coding->dst_multibyte = 0;
b73bfc1c 6279 if (! CODING_REQUIRE_ENCODING (coding))
826bfb8b 6280 {
d5db4077
KR
6281 coding->consumed = SBYTES (str);
6282 coding->consumed_char = SCHARS (str);
b73bfc1c
KH
6283 if (STRING_MULTIBYTE (str))
6284 {
6285 str = Fstring_as_unibyte (str);
6286 nocopy = 1;
6287 }
d5db4077
KR
6288 coding->produced = SBYTES (str);
6289 coding->produced_char = SCHARS (str);
b73bfc1c 6290 return (nocopy ? str : Fcopy_sequence (str));
826bfb8b
KH
6291 }
6292
b73bfc1c
KH
6293 if (coding->composing != COMPOSITION_DISABLED)
6294 coding_save_composition (coding, from, to, str);
ec6d2bb8 6295
b73bfc1c 6296 /* Try to skip the heading and tailing ASCIIs. */
4956c225
KH
6297 if (coding->type != coding_type_ccl)
6298 {
d5db4077 6299 SHRINK_CONVERSION_REGION (&from, &to_byte, coding, SDATA (str),
4956c225
KH
6300 1);
6301 if (from == to_byte)
6302 return (nocopy ? str : Fcopy_sequence (str));
d5db4077 6303 shrinked_bytes = from + (SBYTES (str) - to_byte);
4956c225 6304 }
b73bfc1c
KH
6305
6306 len = encoding_buffer_size (coding, to_byte - from);
73be902c
KH
6307 allocate_conversion_buffer (buf, len);
6308
2391eaa4 6309 consumed = consumed_char = produced = produced_char = 0;
73be902c
KH
6310 while (1)
6311 {
d5db4077 6312 result = encode_coding (coding, SDATA (str) + from + consumed,
73be902c
KH
6313 buf.data + produced, to_byte - from - consumed,
6314 buf.size - produced);
6315 consumed += coding->consumed;
2391eaa4 6316 consumed_char += coding->consumed_char;
13004bef 6317 produced += coding->produced;
2391eaa4
KH
6318 produced_char += coding->produced_char;
6319 if (result == CODING_FINISH_NORMAL
230779b9 6320 || result == CODING_FINISH_INTERRUPT
2391eaa4
KH
6321 || (result == CODING_FINISH_INSUFFICIENT_SRC
6322 && coding->consumed == 0))
73be902c
KH
6323 break;
6324 /* Now result should be CODING_FINISH_INSUFFICIENT_DST. */
6325 extend_conversion_buffer (&buf);
6326 }
6327
2391eaa4
KH
6328 coding->consumed = consumed;
6329 coding->consumed_char = consumed_char;
6330 coding->produced = produced;
6331 coding->produced_char = produced_char;
6332
73be902c 6333 newstr = make_uninit_string (produced + shrinked_bytes);
b73bfc1c 6334 if (from > 0)
a4244313
KR
6335 STRING_COPYIN (newstr, 0, SDATA (str), from);
6336 STRING_COPYIN (newstr, from, buf.data, produced);
73be902c 6337 if (shrinked_bytes > from)
a4244313
KR
6338 STRING_COPYIN (newstr, from + produced,
6339 SDATA (str) + to_byte,
6340 shrinked_bytes - from);
73be902c
KH
6341
6342 free_conversion_buffer (&buf);
ec6d2bb8 6343 coding_free_composition_data (coding);
b73bfc1c 6344
73be902c 6345 return newstr;
4ed46869
KH
6346}
6347
6348\f
6349#ifdef emacs
1397dc18 6350/*** 8. Emacs Lisp library functions ***/
4ed46869 6351
4ed46869 6352DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
48b0f3ae
PJ
6353 doc: /* Return t if OBJECT is nil or a coding-system.
6354See the documentation of `make-coding-system' for information
6355about coding-system objects. */)
6356 (obj)
4ed46869
KH
6357 Lisp_Object obj;
6358{
4608c386
KH
6359 if (NILP (obj))
6360 return Qt;
6361 if (!SYMBOLP (obj))
6362 return Qnil;
c2164d91
KH
6363 if (! NILP (Fget (obj, Qcoding_system_define_form)))
6364 return Qt;
4608c386
KH
6365 /* Get coding-spec vector for OBJ. */
6366 obj = Fget (obj, Qcoding_system);
6367 return ((VECTORP (obj) && XVECTOR (obj)->size == 5)
6368 ? Qt : Qnil);
4ed46869
KH
6369}
6370
9d991de8
RS
6371DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
6372 Sread_non_nil_coding_system, 1, 1, 0,
48b0f3ae
PJ
6373 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT. */)
6374 (prompt)
4ed46869
KH
6375 Lisp_Object prompt;
6376{
e0e989f6 6377 Lisp_Object val;
9d991de8
RS
6378 do
6379 {
4608c386
KH
6380 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6381 Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
9d991de8 6382 }
d5db4077 6383 while (SCHARS (val) == 0);
e0e989f6 6384 return (Fintern (val, Qnil));
4ed46869
KH
6385}
6386
9b787f3e 6387DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
48b0f3ae
PJ
6388 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
6389If the user enters null input, return second argument DEFAULT-CODING-SYSTEM. */)
6390 (prompt, default_coding_system)
9b787f3e 6391 Lisp_Object prompt, default_coding_system;
4ed46869 6392{
f44d27ce 6393 Lisp_Object val;
9b787f3e 6394 if (SYMBOLP (default_coding_system))
57d25e6f 6395 default_coding_system = SYMBOL_NAME (default_coding_system);
4608c386 6396 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
9b787f3e
RS
6397 Qt, Qnil, Qcoding_system_history,
6398 default_coding_system, Qnil);
d5db4077 6399 return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
4ed46869
KH
6400}
6401
6402DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
6403 1, 1, 0,
48b0f3ae
PJ
6404 doc: /* Check validity of CODING-SYSTEM.
6405If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
303cdc2d 6406It is valid if it is nil or a symbol with a non-nil `coding-system' property.
de1d1a40 6407The value of this property should be a vector of length 5. */)
48b0f3ae 6408 (coding_system)
4ed46869
KH
6409 Lisp_Object coding_system;
6410{
a362520d
KH
6411 Lisp_Object define_form;
6412
6413 define_form = Fget (coding_system, Qcoding_system_define_form);
6414 if (! NILP (define_form))
6415 {
6416 Fput (coding_system, Qcoding_system_define_form, Qnil);
6417 safe_eval (define_form);
6418 }
4ed46869
KH
6419 if (!NILP (Fcoding_system_p (coding_system)))
6420 return coding_system;
6421 while (1)
02ba4723 6422 Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
4ed46869 6423}
3a73fa5d 6424\f
d46c5b12 6425Lisp_Object
0a28aafb 6426detect_coding_system (src, src_bytes, highest, multibytep)
a4244313 6427 const unsigned char *src;
d46c5b12 6428 int src_bytes, highest;
0a28aafb 6429 int multibytep;
4ed46869
KH
6430{
6431 int coding_mask, eol_type;
d46c5b12
KH
6432 Lisp_Object val, tmp;
6433 int dummy;
4ed46869 6434
0a28aafb 6435 coding_mask = detect_coding_mask (src, src_bytes, NULL, &dummy, multibytep);
d46c5b12
KH
6436 eol_type = detect_eol_type (src, src_bytes, &dummy);
6437 if (eol_type == CODING_EOL_INCONSISTENT)
25b02698 6438 eol_type = CODING_EOL_UNDECIDED;
4ed46869 6439
d46c5b12 6440 if (!coding_mask)
4ed46869 6441 {
27901516 6442 val = Qundecided;
d46c5b12 6443 if (eol_type != CODING_EOL_UNDECIDED)
4ed46869 6444 {
f44d27ce
RS
6445 Lisp_Object val2;
6446 val2 = Fget (Qundecided, Qeol_type);
4ed46869
KH
6447 if (VECTORP (val2))
6448 val = XVECTOR (val2)->contents[eol_type];
6449 }
80e803b4 6450 return (highest ? val : Fcons (val, Qnil));
4ed46869 6451 }
4ed46869 6452
d46c5b12
KH
6453 /* At first, gather possible coding systems in VAL. */
6454 val = Qnil;
fa42c37f 6455 for (tmp = Vcoding_category_list; CONSP (tmp); tmp = XCDR (tmp))
4ed46869 6456 {
fa42c37f
KH
6457 Lisp_Object category_val, category_index;
6458
6459 category_index = Fget (XCAR (tmp), Qcoding_category_index);
6460 category_val = Fsymbol_value (XCAR (tmp));
6461 if (!NILP (category_val)
6462 && NATNUMP (category_index)
6463 && (coding_mask & (1 << XFASTINT (category_index))))
4ed46869 6464 {
fa42c37f 6465 val = Fcons (category_val, val);
d46c5b12
KH
6466 if (highest)
6467 break;
4ed46869
KH
6468 }
6469 }
d46c5b12
KH
6470 if (!highest)
6471 val = Fnreverse (val);
4ed46869 6472
65059037 6473 /* Then, replace the elements with subsidiary coding systems. */
fa42c37f 6474 for (tmp = val; CONSP (tmp); tmp = XCDR (tmp))
4ed46869 6475 {
65059037
RS
6476 if (eol_type != CODING_EOL_UNDECIDED
6477 && eol_type != CODING_EOL_INCONSISTENT)
4ed46869 6478 {
d46c5b12 6479 Lisp_Object eol;
03699b14 6480 eol = Fget (XCAR (tmp), Qeol_type);
d46c5b12 6481 if (VECTORP (eol))
f3fbd155 6482 XSETCAR (tmp, XVECTOR (eol)->contents[eol_type]);
4ed46869
KH
6483 }
6484 }
03699b14 6485 return (highest ? XCAR (val) : val);
93dec019 6486}
4ed46869 6487
d46c5b12
KH
6488DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
6489 2, 3, 0,
40fd536c
KH
6490 doc: /* Detect how the byte sequence in the region is encoded.
6491Return a list of possible coding systems used on decoding a byte
6492sequence containing the bytes in the region between START and END when
6493the coding system `undecided' is specified. The list is ordered by
6494priority decided in the current language environment.
48b0f3ae
PJ
6495
6496If only ASCII characters are found, it returns a list of single element
6497`undecided' or its subsidiary coding system according to a detected
6498end-of-line format.
6499
6500If optional argument HIGHEST is non-nil, return the coding system of
6501highest priority. */)
6502 (start, end, highest)
d46c5b12
KH
6503 Lisp_Object start, end, highest;
6504{
6505 int from, to;
6506 int from_byte, to_byte;
682169fe 6507 int include_anchor_byte = 0;
6289dd10 6508
b7826503
PJ
6509 CHECK_NUMBER_COERCE_MARKER (start);
6510 CHECK_NUMBER_COERCE_MARKER (end);
4ed46869 6511
d46c5b12
KH
6512 validate_region (&start, &end);
6513 from = XINT (start), to = XINT (end);
6514 from_byte = CHAR_TO_BYTE (from);
6515 to_byte = CHAR_TO_BYTE (to);
6289dd10 6516
d46c5b12
KH
6517 if (from < GPT && to >= GPT)
6518 move_gap_both (to, to_byte);
c210f766
KH
6519 /* If we an anchor byte `\0' follows the region, we include it in
6520 the detecting source. Then code detectors can handle the tailing
6521 byte sequence more accurately.
6522
7d0393cf 6523 Fix me: This is not a perfect solution. It is better that we
c210f766
KH
6524 add one more argument, say LAST_BLOCK, to all detect_coding_XXX.
6525 */
682169fe
KH
6526 if (to == Z || (to == GPT && GAP_SIZE > 0))
6527 include_anchor_byte = 1;
d46c5b12 6528 return detect_coding_system (BYTE_POS_ADDR (from_byte),
682169fe 6529 to_byte - from_byte + include_anchor_byte,
0a28aafb
KH
6530 !NILP (highest),
6531 !NILP (current_buffer
6532 ->enable_multibyte_characters));
d46c5b12 6533}
6289dd10 6534
d46c5b12
KH
6535DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
6536 1, 2, 0,
eec1f3c7
KH
6537 doc: /* Detect how the byte sequence in STRING is encoded.
6538Return a list of possible coding systems used on decoding a byte
6539sequence containing the bytes in STRING when the coding system
6540`undecided' is specified. The list is ordered by priority decided in
6541the current language environment.
48b0f3ae
PJ
6542
6543If only ASCII characters are found, it returns a list of single element
6544`undecided' or its subsidiary coding system according to a detected
6545end-of-line format.
6546
6547If optional argument HIGHEST is non-nil, return the coding system of
6548highest priority. */)
6549 (string, highest)
d46c5b12
KH
6550 Lisp_Object string, highest;
6551{
b7826503 6552 CHECK_STRING (string);
4ed46869 6553
d5db4077 6554 return detect_coding_system (SDATA (string),
682169fe
KH
6555 /* "+ 1" is to include the anchor byte
6556 `\0'. With this, code detectors can
c210f766
KH
6557 handle the tailing bytes more
6558 accurately. */
d5db4077 6559 SBYTES (string) + 1,
0a28aafb
KH
6560 !NILP (highest),
6561 STRING_MULTIBYTE (string));
4ed46869
KH
6562}
6563
05e6f5dc
KH
6564/* Subroutine for Fsafe_coding_systems_region_internal.
6565
6566 Return a list of coding systems that safely encode the multibyte
b666620c 6567 text between P and PEND. SAFE_CODINGS, if non-nil, is an alist of
05e6f5dc
KH
6568 possible coding systems. If it is nil, it means that we have not
6569 yet found any coding systems.
6570
12d5b185
KH
6571 WORK_TABLE a char-table of which element is set to t once the
6572 element is looked up.
05e6f5dc
KH
6573
6574 If a non-ASCII single byte char is found, set
6575 *single_byte_char_found to 1. */
6576
6577static Lisp_Object
6578find_safe_codings (p, pend, safe_codings, work_table, single_byte_char_found)
6579 unsigned char *p, *pend;
6580 Lisp_Object safe_codings, work_table;
6581 int *single_byte_char_found;
6b89e3aa 6582{
f1ce3dcf 6583 int c, len;
6b89e3aa
KH
6584 Lisp_Object val, ch;
6585 Lisp_Object prev, tail;
177c0ea7 6586
12d5b185
KH
6587 if (NILP (safe_codings))
6588 goto done_safe_codings;
6b89e3aa
KH
6589 while (p < pend)
6590 {
6591 c = STRING_CHAR_AND_LENGTH (p, pend - p, len);
6592 p += len;
6593 if (ASCII_BYTE_P (c))
6594 /* We can ignore ASCII characters here. */
6595 continue;
6596 if (SINGLE_BYTE_CHAR_P (c))
6597 *single_byte_char_found = 1;
6b89e3aa
KH
6598 /* Check the safe coding systems for C. */
6599 ch = make_number (c);
6600 val = Faref (work_table, ch);
6601 if (EQ (val, Qt))
6602 /* This element was already checked. Ignore it. */
6603 continue;
6604 /* Remember that we checked this element. */
6605 Faset (work_table, ch, Qt);
6606
6607 for (prev = tail = safe_codings; CONSP (tail); tail = XCDR (tail))
6608 {
b666620c
KH
6609 Lisp_Object elt, translation_table, hash_table, accept_latin_extra;
6610 int encodable;
6611
6612 elt = XCAR (tail);
6613 if (CONSP (XCDR (elt)))
6614 {
6615 /* This entry has this format now:
6616 ( CODING SAFE-CHARS TRANSLATION-TABLE HASH-TABLE
6617 ACCEPT-LATIN-EXTRA ) */
6618 val = XCDR (elt);
6619 encodable = ! NILP (Faref (XCAR (val), ch));
6620 if (! encodable)
6621 {
6622 val = XCDR (val);
6623 translation_table = XCAR (val);
6624 hash_table = XCAR (XCDR (val));
6625 accept_latin_extra = XCAR (XCDR (XCDR (val)));
6626 }
6627 }
6628 else
6629 {
6630 /* This entry has this format now: ( CODING . SAFE-CHARS) */
6631 encodable = ! NILP (Faref (XCDR (elt), ch));
6632 if (! encodable)
6633 {
6634 /* Transform the format to:
6635 ( CODING SAFE-CHARS TRANSLATION-TABLE HASH-TABLE
6636 ACCEPT-LATIN-EXTRA ) */
6637 val = Fget (XCAR (elt), Qcoding_system);
6638 translation_table
6639 = Fplist_get (AREF (val, 3),
6640 Qtranslation_table_for_encode);
6641 if (SYMBOLP (translation_table))
6642 translation_table = Fget (translation_table,
6643 Qtranslation_table);
6644 hash_table
6645 = (CHAR_TABLE_P (translation_table)
6646 ? XCHAR_TABLE (translation_table)->extras[1]
6647 : Qnil);
6648 accept_latin_extra
6649 = ((EQ (AREF (val, 0), make_number (2))
6650 && VECTORP (AREF (val, 4)))
58f99379 6651 ? AREF (AREF (val, 4), 16)
b666620c
KH
6652 : Qnil);
6653 XSETCAR (tail, list5 (XCAR (elt), XCDR (elt),
6654 translation_table, hash_table,
6655 accept_latin_extra));
6656 }
6657 }
43e4a82f 6658
b666620c
KH
6659 if (! encodable
6660 && ((CHAR_TABLE_P (translation_table)
6661 && ! NILP (Faref (translation_table, ch)))
6662 || (HASH_TABLE_P (hash_table)
6663 && ! NILP (Fgethash (ch, hash_table, Qnil)))
6664 || (SINGLE_BYTE_CHAR_P (c)
6665 && ! NILP (accept_latin_extra)
6666 && VECTORP (Vlatin_extra_code_table)
6667 && ! NILP (AREF (Vlatin_extra_code_table, c)))))
6668 encodable = 1;
6669 if (encodable)
6670 prev = tail;
6671 else
6b89e3aa 6672 {
7c695ab9 6673 /* Exclude this coding system from SAFE_CODINGS. */
6b89e3aa 6674 if (EQ (tail, safe_codings))
12d5b185
KH
6675 {
6676 safe_codings = XCDR (safe_codings);
6677 if (NILP (safe_codings))
6678 goto done_safe_codings;
6679 }
6b89e3aa
KH
6680 else
6681 XSETCDR (prev, XCDR (tail));
6682 }
6b89e3aa
KH
6683 }
6684 }
12d5b185
KH
6685
6686 done_safe_codings:
6687 /* If the above loop was terminated before P reaches PEND, it means
6688 SAFE_CODINGS was set to nil. If we have not yet found an
6689 non-ASCII single-byte char, check it now. */
6690 if (! *single_byte_char_found)
6691 while (p < pend)
6692 {
6693 c = STRING_CHAR_AND_LENGTH (p, pend - p, len);
6694 p += len;
6695 if (! ASCII_BYTE_P (c)
6696 && SINGLE_BYTE_CHAR_P (c))
6697 {
6698 *single_byte_char_found = 1;
6699 break;
6700 }
6701 }
6b89e3aa
KH
6702 return safe_codings;
6703}
6704
067a6a66
KH
6705DEFUN ("find-coding-systems-region-internal",
6706 Ffind_coding_systems_region_internal,
6707 Sfind_coding_systems_region_internal, 2, 2, 0,
6b89e3aa
KH
6708 doc: /* Internal use only. */)
6709 (start, end)
6710 Lisp_Object start, end;
6711{
6712 Lisp_Object work_table, safe_codings;
6713 int non_ascii_p = 0;
6714 int single_byte_char_found = 0;
6715 const unsigned char *p1, *p1end, *p2, *p2end, *p;
6716
6717 if (STRINGP (start))
6718 {
6719 if (!STRING_MULTIBYTE (start))
6720 return Qt;
6721 p1 = SDATA (start), p1end = p1 + SBYTES (start);
6722 p2 = p2end = p1end;
6723 if (SCHARS (start) != SBYTES (start))
6724 non_ascii_p = 1;
6725 }
6726 else
6727 {
6728 int from, to, stop;
6729
6730 CHECK_NUMBER_COERCE_MARKER (start);
6731 CHECK_NUMBER_COERCE_MARKER (end);
6732 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
6733 args_out_of_range (start, end);
6734 if (NILP (current_buffer->enable_multibyte_characters))
6735 return Qt;
6736 from = CHAR_TO_BYTE (XINT (start));
6737 to = CHAR_TO_BYTE (XINT (end));
6738 stop = from < GPT_BYTE && GPT_BYTE < to ? GPT_BYTE : to;
6739 p1 = BYTE_POS_ADDR (from), p1end = p1 + (stop - from);
6740 if (stop == to)
6741 p2 = p2end = p1end;
6742 else
6743 p2 = BYTE_POS_ADDR (stop), p2end = p2 + (to - stop);
6744 if (XINT (end) - XINT (start) != to - from)
6745 non_ascii_p = 1;
6746 }
6747
6748 if (!non_ascii_p)
6749 {
6750 /* We are sure that the text contains no multibyte character.
6751 Check if it contains eight-bit-graphic. */
6752 p = p1;
6753 for (p = p1; p < p1end && ASCII_BYTE_P (*p); p++);
6754 if (p == p1end)
6755 {
6756 for (p = p2; p < p2end && ASCII_BYTE_P (*p); p++);
6757 if (p == p2end)
6758 return Qt;
6759 }
6760 }
6761
6762 /* The text contains non-ASCII characters. */
6763
6764 work_table = Fmake_char_table (Qchar_coding_system, Qnil);
6765 safe_codings = Fcopy_sequence (XCDR (Vcoding_system_safe_chars));
6766
067a6a66
KH
6767 safe_codings = find_safe_codings (p1, p1end, safe_codings, work_table,
6768 &single_byte_char_found);
6b89e3aa 6769 if (p2 < p2end)
067a6a66
KH
6770 safe_codings = find_safe_codings (p2, p2end, safe_codings, work_table,
6771 &single_byte_char_found);
6b89e3aa
KH
6772 if (EQ (safe_codings, XCDR (Vcoding_system_safe_chars)))
6773 safe_codings = Qt;
6774 else
6775 {
6776 /* Turn safe_codings to a list of coding systems... */
6777 Lisp_Object val;
6778
6779 if (single_byte_char_found)
6780 /* ... and append these for eight-bit chars. */
6781 val = Fcons (Qraw_text,
6782 Fcons (Qemacs_mule, Fcons (Qno_conversion, Qnil)));
6783 else
6784 /* ... and append generic coding systems. */
6785 val = Fcopy_sequence (XCAR (Vcoding_system_safe_chars));
177c0ea7 6786
6b89e3aa
KH
6787 for (; CONSP (safe_codings); safe_codings = XCDR (safe_codings))
6788 val = Fcons (XCAR (XCAR (safe_codings)), val);
6789 safe_codings = val;
6790 }
6791
6792 return safe_codings;
6793}
6794
6795
068a9dbd
KH
6796/* Search from position POS for such characters that are unencodable
6797 accoding to SAFE_CHARS, and return a list of their positions. P
6798 points where in the memory the character at POS exists. Limit the
6799 search at PEND or when Nth unencodable characters are found.
6800
6801 If SAFE_CHARS is a char table, an element for an unencodable
6802 character is nil.
6803
6804 If SAFE_CHARS is nil, all non-ASCII characters are unencodable.
6805
6806 Otherwise, SAFE_CHARS is t, and only eight-bit-contrl and
6807 eight-bit-graphic characters are unencodable. */
6808
6809static Lisp_Object
6810unencodable_char_position (safe_chars, pos, p, pend, n)
6811 Lisp_Object safe_chars;
6812 int pos;
6813 unsigned char *p, *pend;
6814 int n;
6815{
6816 Lisp_Object pos_list;
6817
6818 pos_list = Qnil;
6819 while (p < pend)
6820 {
6821 int len;
6822 int c = STRING_CHAR_AND_LENGTH (p, MAX_MULTIBYTE_LENGTH, len);
7d0393cf 6823
068a9dbd
KH
6824 if (c >= 128
6825 && (CHAR_TABLE_P (safe_chars)
6826 ? NILP (CHAR_TABLE_REF (safe_chars, c))
6827 : (NILP (safe_chars) || c < 256)))
6828 {
6829 pos_list = Fcons (make_number (pos), pos_list);
6830 if (--n <= 0)
6831 break;
6832 }
6833 pos++;
6834 p += len;
6835 }
6836 return Fnreverse (pos_list);
6837}
6838
6839
6840DEFUN ("unencodable-char-position", Funencodable_char_position,
6841 Sunencodable_char_position, 3, 5, 0,
6842 doc: /*
6843Return position of first un-encodable character in a region.
6844START and END specfiy the region and CODING-SYSTEM specifies the
6845encoding to check. Return nil if CODING-SYSTEM does encode the region.
6846
6847If optional 4th argument COUNT is non-nil, it specifies at most how
6848many un-encodable characters to search. In this case, the value is a
6849list of positions.
6850
6851If optional 5th argument STRING is non-nil, it is a string to search
6852for un-encodable characters. In that case, START and END are indexes
6853to the string. */)
6854 (start, end, coding_system, count, string)
6855 Lisp_Object start, end, coding_system, count, string;
6856{
6857 int n;
6858 Lisp_Object safe_chars;
6859 struct coding_system coding;
6860 Lisp_Object positions;
6861 int from, to;
6862 unsigned char *p, *pend;
6863
6864 if (NILP (string))
6865 {
6866 validate_region (&start, &end);
6867 from = XINT (start);
6868 to = XINT (end);
6869 if (NILP (current_buffer->enable_multibyte_characters))
6870 return Qnil;
6871 p = CHAR_POS_ADDR (from);
200c93e2
KH
6872 if (to == GPT)
6873 pend = GPT_ADDR;
6874 else
6875 pend = CHAR_POS_ADDR (to);
068a9dbd
KH
6876 }
6877 else
6878 {
6879 CHECK_STRING (string);
6880 CHECK_NATNUM (start);
6881 CHECK_NATNUM (end);
6882 from = XINT (start);
6883 to = XINT (end);
6884 if (from > to
6885 || to > SCHARS (string))
6886 args_out_of_range_3 (string, start, end);
6887 if (! STRING_MULTIBYTE (string))
6888 return Qnil;
6889 p = SDATA (string) + string_char_to_byte (string, from);
6890 pend = SDATA (string) + string_char_to_byte (string, to);
6891 }
6892
6893 setup_coding_system (Fcheck_coding_system (coding_system), &coding);
6894
6895 if (NILP (count))
6896 n = 1;
6897 else
6898 {
6899 CHECK_NATNUM (count);
6900 n = XINT (count);
6901 }
6902
6903 if (coding.type == coding_type_no_conversion
6904 || coding.type == coding_type_raw_text)
6905 return Qnil;
6906
6907 if (coding.type == coding_type_undecided)
6908 safe_chars = Qnil;
6909 else
6b89e3aa 6910 safe_chars = coding_safe_chars (coding_system);
068a9dbd
KH
6911
6912 if (STRINGP (string)
6913 || from >= GPT || to <= GPT)
6914 positions = unencodable_char_position (safe_chars, from, p, pend, n);
6915 else
6916 {
6917 Lisp_Object args[2];
6918
6919 args[0] = unencodable_char_position (safe_chars, from, p, GPT_ADDR, n);
96d2e64d 6920 n -= XINT (Flength (args[0]));
068a9dbd
KH
6921 if (n <= 0)
6922 positions = args[0];
6923 else
6924 {
6925 args[1] = unencodable_char_position (safe_chars, GPT, GAP_END_ADDR,
6926 pend, n);
6927 positions = Fappend (2, args);
6928 }
6929 }
6930
6931 return (NILP (count) ? Fcar (positions) : positions);
6932}
6933
6934
4031e2bf
KH
6935Lisp_Object
6936code_convert_region1 (start, end, coding_system, encodep)
d46c5b12 6937 Lisp_Object start, end, coding_system;
4031e2bf 6938 int encodep;
3a73fa5d
RS
6939{
6940 struct coding_system coding;
da55a2b7 6941 int from, to;
3a73fa5d 6942
b7826503
PJ
6943 CHECK_NUMBER_COERCE_MARKER (start);
6944 CHECK_NUMBER_COERCE_MARKER (end);
6945 CHECK_SYMBOL (coding_system);
3a73fa5d 6946
d46c5b12
KH
6947 validate_region (&start, &end);
6948 from = XFASTINT (start);
6949 to = XFASTINT (end);
6950
3a73fa5d 6951 if (NILP (coding_system))
d46c5b12
KH
6952 return make_number (to - from);
6953
3a73fa5d 6954 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
d5db4077 6955 error ("Invalid coding system: %s", SDATA (SYMBOL_NAME (coding_system)));
3a73fa5d 6956
d46c5b12 6957 coding.mode |= CODING_MODE_LAST_BLOCK;
b73bfc1c
KH
6958 coding.src_multibyte = coding.dst_multibyte
6959 = !NILP (current_buffer->enable_multibyte_characters);
fb88bf2d
KH
6960 code_convert_region (from, CHAR_TO_BYTE (from), to, CHAR_TO_BYTE (to),
6961 &coding, encodep, 1);
f072a3e8 6962 Vlast_coding_system_used = coding.symbol;
fb88bf2d 6963 return make_number (coding.produced_char);
4031e2bf
KH
6964}
6965
6966DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
6967 3, 3, "r\nzCoding system: ",
48b0f3ae
PJ
6968 doc: /* Decode the current region from the specified coding system.
6969When called from a program, takes three arguments:
6970START, END, and CODING-SYSTEM. START and END are buffer positions.
6971This function sets `last-coding-system-used' to the precise coding system
6972used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
6973not fully specified.)
6974It returns the length of the decoded text. */)
6975 (start, end, coding_system)
4031e2bf
KH
6976 Lisp_Object start, end, coding_system;
6977{
6978 return code_convert_region1 (start, end, coding_system, 0);
3a73fa5d
RS
6979}
6980
6981DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
6982 3, 3, "r\nzCoding system: ",
48b0f3ae
PJ
6983 doc: /* Encode the current region into the specified coding system.
6984When called from a program, takes three arguments:
6985START, END, and CODING-SYSTEM. START and END are buffer positions.
6986This function sets `last-coding-system-used' to the precise coding system
6987used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
6988not fully specified.)
6989It returns the length of the encoded text. */)
6990 (start, end, coding_system)
d46c5b12 6991 Lisp_Object start, end, coding_system;
3a73fa5d 6992{
4031e2bf
KH
6993 return code_convert_region1 (start, end, coding_system, 1);
6994}
3a73fa5d 6995
4031e2bf
KH
6996Lisp_Object
6997code_convert_string1 (string, coding_system, nocopy, encodep)
6998 Lisp_Object string, coding_system, nocopy;
6999 int encodep;
7000{
7001 struct coding_system coding;
3a73fa5d 7002
b7826503
PJ
7003 CHECK_STRING (string);
7004 CHECK_SYMBOL (coding_system);
4ed46869 7005
d46c5b12 7006 if (NILP (coding_system))
4031e2bf 7007 return (NILP (nocopy) ? Fcopy_sequence (string) : string);
4ed46869 7008
d46c5b12 7009 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
d5db4077 7010 error ("Invalid coding system: %s", SDATA (SYMBOL_NAME (coding_system)));
5f1cd180 7011
d46c5b12 7012 coding.mode |= CODING_MODE_LAST_BLOCK;
b73bfc1c
KH
7013 string = (encodep
7014 ? encode_coding_string (string, &coding, !NILP (nocopy))
7015 : decode_coding_string (string, &coding, !NILP (nocopy)));
f072a3e8 7016 Vlast_coding_system_used = coding.symbol;
ec6d2bb8
KH
7017
7018 return string;
4ed46869
KH
7019}
7020
4ed46869 7021DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
e0e989f6 7022 2, 3, 0,
48b0f3ae
PJ
7023 doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
7024Optional arg NOCOPY non-nil means it is OK to return STRING itself
7025if the decoding operation is trivial.
7026This function sets `last-coding-system-used' to the precise coding system
7027used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7028not fully specified.) */)
7029 (string, coding_system, nocopy)
e0e989f6 7030 Lisp_Object string, coding_system, nocopy;
4ed46869 7031{
f072a3e8 7032 return code_convert_string1 (string, coding_system, nocopy, 0);
4ed46869
KH
7033}
7034
7035DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
e0e989f6 7036 2, 3, 0,
48b0f3ae
PJ
7037 doc: /* Encode STRING to CODING-SYSTEM, and return the result.
7038Optional arg NOCOPY non-nil means it is OK to return STRING itself
7039if the encoding operation is trivial.
7040This function sets `last-coding-system-used' to the precise coding system
7041used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7042not fully specified.) */)
7043 (string, coding_system, nocopy)
e0e989f6 7044 Lisp_Object string, coding_system, nocopy;
4ed46869 7045{
f072a3e8 7046 return code_convert_string1 (string, coding_system, nocopy, 1);
4ed46869 7047}
4031e2bf 7048
ecec61c1 7049/* Encode or decode STRING according to CODING_SYSTEM.
ec6d2bb8
KH
7050 Do not set Vlast_coding_system_used.
7051
7052 This function is called only from macros DECODE_FILE and
7053 ENCODE_FILE, thus we ignore character composition. */
ecec61c1
KH
7054
7055Lisp_Object
7056code_convert_string_norecord (string, coding_system, encodep)
7057 Lisp_Object string, coding_system;
7058 int encodep;
7059{
7060 struct coding_system coding;
7061
b7826503
PJ
7062 CHECK_STRING (string);
7063 CHECK_SYMBOL (coding_system);
ecec61c1
KH
7064
7065 if (NILP (coding_system))
7066 return string;
7067
7068 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
d5db4077 7069 error ("Invalid coding system: %s", SDATA (SYMBOL_NAME (coding_system)));
ecec61c1 7070
ec6d2bb8 7071 coding.composing = COMPOSITION_DISABLED;
ecec61c1 7072 coding.mode |= CODING_MODE_LAST_BLOCK;
b73bfc1c
KH
7073 return (encodep
7074 ? encode_coding_string (string, &coding, 1)
7075 : decode_coding_string (string, &coding, 1));
ecec61c1 7076}
3a73fa5d 7077\f
4ed46869 7078DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
48b0f3ae
PJ
7079 doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
7080Return the corresponding character. */)
7081 (code)
4ed46869
KH
7082 Lisp_Object code;
7083{
7084 unsigned char c1, c2, s1, s2;
7085 Lisp_Object val;
7086
b7826503 7087 CHECK_NUMBER (code);
4ed46869 7088 s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
55ab7be3
KH
7089 if (s1 == 0)
7090 {
c28a9453
KH
7091 if (s2 < 0x80)
7092 XSETFASTINT (val, s2);
7093 else if (s2 >= 0xA0 || s2 <= 0xDF)
b73bfc1c 7094 XSETFASTINT (val, MAKE_CHAR (charset_katakana_jisx0201, s2, 0));
c28a9453 7095 else
9da8350f 7096 error ("Invalid Shift JIS code: %x", XFASTINT (code));
55ab7be3
KH
7097 }
7098 else
7099 {
87323294 7100 if ((s1 < 0x80 || (s1 > 0x9F && s1 < 0xE0) || s1 > 0xEF)
55ab7be3 7101 || (s2 < 0x40 || s2 == 0x7F || s2 > 0xFC))
9da8350f 7102 error ("Invalid Shift JIS code: %x", XFASTINT (code));
55ab7be3 7103 DECODE_SJIS (s1, s2, c1, c2);
b73bfc1c 7104 XSETFASTINT (val, MAKE_CHAR (charset_jisx0208, c1, c2));
55ab7be3 7105 }
4ed46869
KH
7106 return val;
7107}
7108
7109DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
48b0f3ae
PJ
7110 doc: /* Encode a Japanese character CHAR to shift_jis encoding.
7111Return the corresponding code in SJIS. */)
7112 (ch)
4ed46869
KH
7113 Lisp_Object ch;
7114{
bcf26d6a 7115 int charset, c1, c2, s1, s2;
4ed46869
KH
7116 Lisp_Object val;
7117
b7826503 7118 CHECK_NUMBER (ch);
4ed46869 7119 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
c28a9453
KH
7120 if (charset == CHARSET_ASCII)
7121 {
7122 val = ch;
7123 }
7124 else if (charset == charset_jisx0208
7125 && c1 > 0x20 && c1 < 0x7F && c2 > 0x20 && c2 < 0x7F)
4ed46869
KH
7126 {
7127 ENCODE_SJIS (c1, c2, s1, s2);
bcf26d6a 7128 XSETFASTINT (val, (s1 << 8) | s2);
4ed46869 7129 }
55ab7be3
KH
7130 else if (charset == charset_katakana_jisx0201
7131 && c1 > 0x20 && c2 < 0xE0)
7132 {
7133 XSETFASTINT (val, c1 | 0x80);
7134 }
4ed46869 7135 else
55ab7be3 7136 error ("Can't encode to shift_jis: %d", XFASTINT (ch));
4ed46869
KH
7137 return val;
7138}
7139
7140DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
48b0f3ae
PJ
7141 doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
7142Return the corresponding character. */)
7143 (code)
4ed46869
KH
7144 Lisp_Object code;
7145{
7146 int charset;
7147 unsigned char b1, b2, c1, c2;
7148 Lisp_Object val;
7149
b7826503 7150 CHECK_NUMBER (code);
4ed46869 7151 b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF;
c28a9453
KH
7152 if (b1 == 0)
7153 {
7154 if (b2 >= 0x80)
9da8350f 7155 error ("Invalid BIG5 code: %x", XFASTINT (code));
c28a9453
KH
7156 val = code;
7157 }
7158 else
7159 {
7160 if ((b1 < 0xA1 || b1 > 0xFE)
7161 || (b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE))
9da8350f 7162 error ("Invalid BIG5 code: %x", XFASTINT (code));
c28a9453 7163 DECODE_BIG5 (b1, b2, charset, c1, c2);
b73bfc1c 7164 XSETFASTINT (val, MAKE_CHAR (charset, c1, c2));
c28a9453 7165 }
4ed46869
KH
7166 return val;
7167}
7168
7169DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
48b0f3ae
PJ
7170 doc: /* Encode the Big5 character CHAR to BIG5 coding system.
7171Return the corresponding character code in Big5. */)
7172 (ch)
4ed46869
KH
7173 Lisp_Object ch;
7174{
bcf26d6a 7175 int charset, c1, c2, b1, b2;
4ed46869
KH
7176 Lisp_Object val;
7177
b7826503 7178 CHECK_NUMBER (ch);
4ed46869 7179 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
c28a9453
KH
7180 if (charset == CHARSET_ASCII)
7181 {
7182 val = ch;
7183 }
7184 else if ((charset == charset_big5_1
7185 && (XFASTINT (ch) >= 0x250a1 && XFASTINT (ch) <= 0x271ec))
7186 || (charset == charset_big5_2
7187 && XFASTINT (ch) >= 0x290a1 && XFASTINT (ch) <= 0x2bdb2))
4ed46869
KH
7188 {
7189 ENCODE_BIG5 (charset, c1, c2, b1, b2);
bcf26d6a 7190 XSETFASTINT (val, (b1 << 8) | b2);
4ed46869
KH
7191 }
7192 else
c28a9453 7193 error ("Can't encode to Big5: %d", XFASTINT (ch));
4ed46869
KH
7194 return val;
7195}
3a73fa5d 7196\f
002fdb44 7197DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
48b0f3ae
PJ
7198 Sset_terminal_coding_system_internal, 1, 1, 0,
7199 doc: /* Internal use only. */)
7200 (coding_system)
4ed46869
KH
7201 Lisp_Object coding_system;
7202{
b8299c66 7203 struct coding_system *terminal_coding = FRAME_TERMINAL_CODING (SELECTED_FRAME ());
b7826503 7204 CHECK_SYMBOL (coding_system);
b8299c66 7205 setup_coding_system (Fcheck_coding_system (coding_system), terminal_coding);
70c22245 7206 /* We had better not send unsafe characters to terminal. */
b8299c66 7207 terminal_coding->mode |= CODING_MODE_INHIBIT_UNENCODABLE_CHAR;
8ca3766a 7208 /* Character composition should be disabled. */
b8299c66 7209 terminal_coding->composing = COMPOSITION_DISABLED;
bd64290d 7210 /* Error notification should be suppressed. */
b8299c66
KL
7211 terminal_coding->suppress_error = 1;
7212 terminal_coding->src_multibyte = 1;
7213 terminal_coding->dst_multibyte = 0;
4ed46869
KH
7214 return Qnil;
7215}
7216
002fdb44 7217DEFUN ("set-safe-terminal-coding-system-internal", Fset_safe_terminal_coding_system_internal,
48b0f3ae 7218 Sset_safe_terminal_coding_system_internal, 1, 1, 0,
ddb67bdc 7219 doc: /* Internal use only. */)
48b0f3ae 7220 (coding_system)
c4825358
KH
7221 Lisp_Object coding_system;
7222{
b7826503 7223 CHECK_SYMBOL (coding_system);
c4825358
KH
7224 setup_coding_system (Fcheck_coding_system (coding_system),
7225 &safe_terminal_coding);
8ca3766a 7226 /* Character composition should be disabled. */
ec6d2bb8 7227 safe_terminal_coding.composing = COMPOSITION_DISABLED;
bd64290d 7228 /* Error notification should be suppressed. */
b8299c66 7229 safe_terminal_coding.suppress_error = 1;
b73bfc1c
KH
7230 safe_terminal_coding.src_multibyte = 1;
7231 safe_terminal_coding.dst_multibyte = 0;
c4825358
KH
7232 return Qnil;
7233}
7234
002fdb44
DL
7235DEFUN ("terminal-coding-system", Fterminal_coding_system,
7236 Sterminal_coding_system, 0, 0, 0,
48b0f3ae
PJ
7237 doc: /* Return coding system specified for terminal output. */)
7238 ()
4ed46869 7239{
b8299c66 7240 return FRAME_TERMINAL_CODING (SELECTED_FRAME ())->symbol;
4ed46869
KH
7241}
7242
002fdb44 7243DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
48b0f3ae
PJ
7244 Sset_keyboard_coding_system_internal, 1, 1, 0,
7245 doc: /* Internal use only. */)
7246 (coding_system)
4ed46869
KH
7247 Lisp_Object coding_system;
7248{
b7826503 7249 CHECK_SYMBOL (coding_system);
b8299c66
KL
7250 setup_coding_system (Fcheck_coding_system (coding_system),
7251 FRAME_KEYBOARD_CODING (SELECTED_FRAME ()));
8ca3766a 7252 /* Character composition should be disabled. */
b8299c66 7253 FRAME_KEYBOARD_CODING (SELECTED_FRAME ())->composing = COMPOSITION_DISABLED;
4ed46869
KH
7254 return Qnil;
7255}
7256
002fdb44
DL
7257DEFUN ("keyboard-coding-system", Fkeyboard_coding_system,
7258 Skeyboard_coding_system, 0, 0, 0,
48b0f3ae
PJ
7259 doc: /* Return coding system specified for decoding keyboard input. */)
7260 ()
4ed46869 7261{
b8299c66 7262 return FRAME_KEYBOARD_CODING (SELECTED_FRAME ())->symbol;
4ed46869
KH
7263}
7264
7265\f
a5d301df
KH
7266DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
7267 Sfind_operation_coding_system, 1, MANY, 0,
48b0f3ae
PJ
7268 doc: /* Choose a coding system for an operation based on the target name.
7269The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
7270DECODING-SYSTEM is the coding system to use for decoding
7271\(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
7272for encoding (in case OPERATION does encoding).
7273
7274The first argument OPERATION specifies an I/O primitive:
7275 For file I/O, `insert-file-contents' or `write-region'.
7276 For process I/O, `call-process', `call-process-region', or `start-process'.
7277 For network I/O, `open-network-stream'.
7278
7279The remaining arguments should be the same arguments that were passed
7280to the primitive. Depending on which primitive, one of those arguments
7281is selected as the TARGET. For example, if OPERATION does file I/O,
7282whichever argument specifies the file name is TARGET.
7283
7284TARGET has a meaning which depends on OPERATION:
7285 For file I/O, TARGET is a file name.
7286 For process I/O, TARGET is a process name.
7287 For network I/O, TARGET is a service name or a port number
7288
7289This function looks up what specified for TARGET in,
7290`file-coding-system-alist', `process-coding-system-alist',
7291or `network-coding-system-alist' depending on OPERATION.
7292They may specify a coding system, a cons of coding systems,
7293or a function symbol to call.
7294In the last case, we call the function with one argument,
7295which is a list of all the arguments given to this function.
7296
7297usage: (find-operation-coding-system OPERATION ARGUMENTS ...) */)
7298 (nargs, args)
4ed46869
KH
7299 int nargs;
7300 Lisp_Object *args;
7301{
7302 Lisp_Object operation, target_idx, target, val;
7303 register Lisp_Object chain;
7304
7305 if (nargs < 2)
7306 error ("Too few arguments");
7307 operation = args[0];
7308 if (!SYMBOLP (operation)
7309 || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
8ca3766a 7310 error ("Invalid first argument");
4ed46869
KH
7311 if (nargs < 1 + XINT (target_idx))
7312 error ("Too few arguments for operation: %s",
d5db4077 7313 SDATA (SYMBOL_NAME (operation)));
7f787cfd
KH
7314 /* For write-region, if the 6th argument (i.e. VISIT, the 5th
7315 argument to write-region) is string, it must be treated as a
7316 target file name. */
7317 if (EQ (operation, Qwrite_region)
7318 && nargs > 5
7319 && STRINGP (args[5]))
d90ed3b4 7320 target_idx = make_number (4);
4ed46869
KH
7321 target = args[XINT (target_idx) + 1];
7322 if (!(STRINGP (target)
7323 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
8ca3766a 7324 error ("Invalid argument %d", XINT (target_idx) + 1);
4ed46869 7325
2e34157c
RS
7326 chain = ((EQ (operation, Qinsert_file_contents)
7327 || EQ (operation, Qwrite_region))
02ba4723 7328 ? Vfile_coding_system_alist
2e34157c 7329 : (EQ (operation, Qopen_network_stream)
02ba4723
KH
7330 ? Vnetwork_coding_system_alist
7331 : Vprocess_coding_system_alist));
4ed46869
KH
7332 if (NILP (chain))
7333 return Qnil;
7334
03699b14 7335 for (; CONSP (chain); chain = XCDR (chain))
4ed46869 7336 {
f44d27ce 7337 Lisp_Object elt;
03699b14 7338 elt = XCAR (chain);
4ed46869
KH
7339
7340 if (CONSP (elt)
7341 && ((STRINGP (target)
03699b14
KR
7342 && STRINGP (XCAR (elt))
7343 && fast_string_match (XCAR (elt), target) >= 0)
7344 || (INTEGERP (target) && EQ (target, XCAR (elt)))))
02ba4723 7345 {
03699b14 7346 val = XCDR (elt);
b19fd4c5
KH
7347 /* Here, if VAL is both a valid coding system and a valid
7348 function symbol, we return VAL as a coding system. */
02ba4723
KH
7349 if (CONSP (val))
7350 return val;
7351 if (! SYMBOLP (val))
7352 return Qnil;
7353 if (! NILP (Fcoding_system_p (val)))
7354 return Fcons (val, val);
b19fd4c5
KH
7355 if (! NILP (Ffboundp (val)))
7356 {
7357 val = call1 (val, Flist (nargs, args));
7358 if (CONSP (val))
7359 return val;
7360 if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
7361 return Fcons (val, val);
7362 }
02ba4723
KH
7363 return Qnil;
7364 }
4ed46869
KH
7365 }
7366 return Qnil;
7367}
7368
1397dc18
KH
7369DEFUN ("update-coding-systems-internal", Fupdate_coding_systems_internal,
7370 Supdate_coding_systems_internal, 0, 0, 0,
48b0f3ae
PJ
7371 doc: /* Update internal database for ISO2022 and CCL based coding systems.
7372When values of any coding categories are changed, you must
7373call this function. */)
7374 ()
d46c5b12
KH
7375{
7376 int i;
7377
fa42c37f 7378 for (i = CODING_CATEGORY_IDX_EMACS_MULE; i < CODING_CATEGORY_IDX_MAX; i++)
d46c5b12 7379 {
1397dc18
KH
7380 Lisp_Object val;
7381
f5c1dd0d 7382 val = SYMBOL_VALUE (XVECTOR (Vcoding_category_table)->contents[i]);
1397dc18
KH
7383 if (!NILP (val))
7384 {
7385 if (! coding_system_table[i])
7386 coding_system_table[i] = ((struct coding_system *)
7387 xmalloc (sizeof (struct coding_system)));
7388 setup_coding_system (val, coding_system_table[i]);
7389 }
7390 else if (coding_system_table[i])
7391 {
7392 xfree (coding_system_table[i]);
7393 coding_system_table[i] = NULL;
7394 }
d46c5b12 7395 }
1397dc18 7396
d46c5b12
KH
7397 return Qnil;
7398}
7399
66cfb530
KH
7400DEFUN ("set-coding-priority-internal", Fset_coding_priority_internal,
7401 Sset_coding_priority_internal, 0, 0, 0,
48b0f3ae
PJ
7402 doc: /* Update internal database for the current value of `coding-category-list'.
7403This function is internal use only. */)
7404 ()
66cfb530
KH
7405{
7406 int i = 0, idx;
84d60297
RS
7407 Lisp_Object val;
7408
7409 val = Vcoding_category_list;
66cfb530
KH
7410
7411 while (CONSP (val) && i < CODING_CATEGORY_IDX_MAX)
7412 {
03699b14 7413 if (! SYMBOLP (XCAR (val)))
66cfb530 7414 break;
03699b14 7415 idx = XFASTINT (Fget (XCAR (val), Qcoding_category_index));
66cfb530
KH
7416 if (idx >= CODING_CATEGORY_IDX_MAX)
7417 break;
7418 coding_priorities[i++] = (1 << idx);
03699b14 7419 val = XCDR (val);
66cfb530
KH
7420 }
7421 /* If coding-category-list is valid and contains all coding
7422 categories, `i' should be CODING_CATEGORY_IDX_MAX now. If not,
fa42c37f 7423 the following code saves Emacs from crashing. */
66cfb530
KH
7424 while (i < CODING_CATEGORY_IDX_MAX)
7425 coding_priorities[i++] = CODING_CATEGORY_MASK_RAW_TEXT;
7426
7427 return Qnil;
7428}
7429
6b89e3aa
KH
7430DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
7431 Sdefine_coding_system_internal, 1, 1, 0,
7432 doc: /* Register CODING-SYSTEM as a base coding system.
7433This function is internal use only. */)
7434 (coding_system)
7435 Lisp_Object coding_system;
7436{
7437 Lisp_Object safe_chars, slot;
7438
7439 if (NILP (Fcheck_coding_system (coding_system)))
7440 Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
7441 safe_chars = coding_safe_chars (coding_system);
7442 if (! EQ (safe_chars, Qt) && ! CHAR_TABLE_P (safe_chars))
7443 error ("No valid safe-chars property for %s",
7444 SDATA (SYMBOL_NAME (coding_system)));
7445 if (EQ (safe_chars, Qt))
7446 {
7447 if (NILP (Fmemq (coding_system, XCAR (Vcoding_system_safe_chars))))
7448 XSETCAR (Vcoding_system_safe_chars,
7449 Fcons (coding_system, XCAR (Vcoding_system_safe_chars)));
7450 }
7451 else
7452 {
7453 slot = Fassq (coding_system, XCDR (Vcoding_system_safe_chars));
7454 if (NILP (slot))
7455 XSETCDR (Vcoding_system_safe_chars,
7456 nconc2 (XCDR (Vcoding_system_safe_chars),
7457 Fcons (Fcons (coding_system, safe_chars), Qnil)));
7458 else
7459 XSETCDR (slot, safe_chars);
7460 }
7461 return Qnil;
7462}
7463
4ed46869
KH
7464#endif /* emacs */
7465
7466\f
1397dc18 7467/*** 9. Post-amble ***/
4ed46869 7468
dfcf069d 7469void
4ed46869
KH
7470init_coding_once ()
7471{
7472 int i;
7473
93dec019 7474 /* Emacs' internal format specific initialize routine. */
4ed46869
KH
7475 for (i = 0; i <= 0x20; i++)
7476 emacs_code_class[i] = EMACS_control_code;
7477 emacs_code_class[0x0A] = EMACS_linefeed_code;
7478 emacs_code_class[0x0D] = EMACS_carriage_return_code;
7479 for (i = 0x21 ; i < 0x7F; i++)
7480 emacs_code_class[i] = EMACS_ascii_code;
7481 emacs_code_class[0x7F] = EMACS_control_code;
ec6d2bb8 7482 for (i = 0x80; i < 0xFF; i++)
4ed46869
KH
7483 emacs_code_class[i] = EMACS_invalid_code;
7484 emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
7485 emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
7486 emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
7487 emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;
7488
7489 /* ISO2022 specific initialize routine. */
7490 for (i = 0; i < 0x20; i++)
b73bfc1c 7491 iso_code_class[i] = ISO_control_0;
4ed46869
KH
7492 for (i = 0x21; i < 0x7F; i++)
7493 iso_code_class[i] = ISO_graphic_plane_0;
7494 for (i = 0x80; i < 0xA0; i++)
b73bfc1c 7495 iso_code_class[i] = ISO_control_1;
4ed46869
KH
7496 for (i = 0xA1; i < 0xFF; i++)
7497 iso_code_class[i] = ISO_graphic_plane_1;
7498 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
7499 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
7500 iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
7501 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
7502 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
7503 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
7504 iso_code_class[ISO_CODE_ESC] = ISO_escape;
7505 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
7506 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
7507 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
7508
c4825358 7509 setup_coding_system (Qnil, &safe_terminal_coding);
6bc51348 7510 setup_coding_system (Qnil, &default_buffer_file_coding);
9ce27fde 7511
d46c5b12
KH
7512 bzero (coding_system_table, sizeof coding_system_table);
7513
66cfb530
KH
7514 bzero (ascii_skip_code, sizeof ascii_skip_code);
7515 for (i = 0; i < 128; i++)
7516 ascii_skip_code[i] = 1;
7517
9ce27fde
KH
7518#if defined (MSDOS) || defined (WINDOWSNT)
7519 system_eol_type = CODING_EOL_CRLF;
7520#else
7521 system_eol_type = CODING_EOL_LF;
7522#endif
b843d1ae
KH
7523
7524 inhibit_pre_post_conversion = 0;
e0e989f6
KH
7525}
7526
7527#ifdef emacs
7528
/* Set up the Lisp-visible state of this file: intern and staticpro
   the symbols used here, record the `target-idx' property of each I/O
   primitive, register the subrs with defsubr, and define the Lisp
   variables together with their initial values.  */
dfcf069d 7529void
e0e989f6
KH
7530syms_of_coding ()
7531{
/* `target-idx' is the property `find-operation-coding-system' reads to
   learn which argument of an I/O primitive is its TARGET.  */
7532 Qtarget_idx = intern ("target-idx");
7533 staticpro (&Qtarget_idx);
7534
bb0115a2
RS
7535 Qcoding_system_history = intern ("coding-system-history");
7536 staticpro (&Qcoding_system_history);
7537 Fset (Qcoding_system_history, Qnil);
7538
9ce27fde 7539 /* Target FILENAME is the first argument. */
e0e989f6 7540 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
9ce27fde 7541 /* Target FILENAME is the third argument. */
e0e989f6
KH
7542 Fput (Qwrite_region, Qtarget_idx, make_number (2));
7543
7544 Qcall_process = intern ("call-process");
7545 staticpro (&Qcall_process);
9ce27fde 7546 /* Target PROGRAM is the first argument. */
e0e989f6
KH
7547 Fput (Qcall_process, Qtarget_idx, make_number (0));
7548
7549 Qcall_process_region = intern ("call-process-region");
7550 staticpro (&Qcall_process_region);
9ce27fde 7551 /* Target PROGRAM is the third argument. */
e0e989f6
KH
7552 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
7553
7554 Qstart_process = intern ("start-process");
7555 staticpro (&Qstart_process);
9ce27fde 7556 /* Target PROGRAM is the third argument. */
e0e989f6
KH
7557 Fput (Qstart_process, Qtarget_idx, make_number (2));
7558
7559 Qopen_network_stream = intern ("open-network-stream");
7560 staticpro (&Qopen_network_stream);
9ce27fde 7561 /* Target SERVICE is the fourth argument. */
e0e989f6
KH
7562 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
7563
4ed46869
KH
7564 Qcoding_system = intern ("coding-system");
7565 staticpro (&Qcoding_system);
7566
7567 Qeol_type = intern ("eol-type");
7568 staticpro (&Qeol_type);
7569
7570 Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
7571 staticpro (&Qbuffer_file_coding_system);
7572
7573 Qpost_read_conversion = intern ("post-read-conversion");
7574 staticpro (&Qpost_read_conversion);
7575
7576 Qpre_write_conversion = intern ("pre-write-conversion");
7577 staticpro (&Qpre_write_conversion);
7578
27901516
KH
7579 Qno_conversion = intern ("no-conversion");
7580 staticpro (&Qno_conversion);
7581
7582 Qundecided = intern ("undecided");
7583 staticpro (&Qundecided);
7584
4ed46869
KH
7585 Qcoding_system_p = intern ("coding-system-p");
7586 staticpro (&Qcoding_system_p);
7587
7588 Qcoding_system_error = intern ("coding-system-error");
7589 staticpro (&Qcoding_system_error);
7590
/* Give `coding-system-error' its error conditions (itself and `error')
   and its error message.  */
7591 Fput (Qcoding_system_error, Qerror_conditions,
7592 Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
7593 Fput (Qcoding_system_error, Qerror_message,
9ce27fde 7594 build_string ("Invalid coding system"));
4ed46869 7595
d46c5b12
KH
7596 Qcoding_category = intern ("coding-category");
7597 staticpro (&Qcoding_category);
4ed46869
KH
7598 Qcoding_category_index = intern ("coding-category-index");
7599 staticpro (&Qcoding_category_index);
7600
d46c5b12
KH
7601 Vcoding_category_table
7602 = Fmake_vector (make_number (CODING_CATEGORY_IDX_MAX), Qnil);
7603 staticpro (&Vcoding_category_table);
/* Intern one symbol per coding category, store it in the table at its
   index, and record that index in the symbol's `coding-category-index'
   property.  */
4ed46869
KH
7604 {
7605 int i;
7606 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
7607 {
d46c5b12
KH
7608 XVECTOR (Vcoding_category_table)->contents[i]
7609 = intern (coding_category_name[i]);
7610 Fput (XVECTOR (Vcoding_category_table)->contents[i],
7611 Qcoding_category_index, make_number (i));
4ed46869
KH
7612 }
7613 }
7614
/* Starts out as the cons (nil); `define-coding-system-internal'
   updates its car and cdr.  */
6b89e3aa
KH
7615 Vcoding_system_safe_chars = Fcons (Qnil, Qnil);
7616 staticpro (&Vcoding_system_safe_chars);
7617
f967223b
KH
7618 Qtranslation_table = intern ("translation-table");
7619 staticpro (&Qtranslation_table);
b666620c 7620 Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
bdd9fb48 7621
f967223b
KH
7622 Qtranslation_table_id = intern ("translation-table-id");
7623 staticpro (&Qtranslation_table_id);
84fbb8a0 7624
f967223b
KH
7625 Qtranslation_table_for_decode = intern ("translation-table-for-decode");
7626 staticpro (&Qtranslation_table_for_decode);
a5d301df 7627
f967223b
KH
7628 Qtranslation_table_for_encode = intern ("translation-table-for-encode");
7629 staticpro (&Qtranslation_table_for_encode);
a5d301df 7630
05e6f5dc
KH
7631 Qsafe_chars = intern ("safe-chars");
7632 staticpro (&Qsafe_chars);
7633
7634 Qchar_coding_system = intern ("char-coding-system");
7635 staticpro (&Qchar_coding_system);
7636
7637 /* Intern this now in case it isn't already done.
7638 Setting this variable twice is harmless.
7639 But don't staticpro it here--that is done in alloc.c. */
7640 Qchar_table_extra_slots = intern ("char-table-extra-slots");
7641 Fput (Qsafe_chars, Qchar_table_extra_slots, make_number (0));
067a6a66 7642 Fput (Qchar_coding_system, Qchar_table_extra_slots, make_number (0));
70c22245 7643
1397dc18
KH
7644 Qvalid_codes = intern ("valid-codes");
7645 staticpro (&Qvalid_codes);
7646
9ce27fde
KH
7647 Qemacs_mule = intern ("emacs-mule");
7648 staticpro (&Qemacs_mule);
7649
d46c5b12
KH
7650 Qraw_text = intern ("raw-text");
7651 staticpro (&Qraw_text);
7652
ecf488bc
DL
7653 Qutf_8 = intern ("utf-8");
7654 staticpro (&Qutf_8);
7655
a362520d
KH
7656 Qcoding_system_define_form = intern ("coding-system-define-form");
7657 staticpro (&Qcoding_system_define_form);
7658
/* Register the primitives defined in this file.  */
4ed46869
KH
7659 defsubr (&Scoding_system_p);
7660 defsubr (&Sread_coding_system);
7661 defsubr (&Sread_non_nil_coding_system);
7662 defsubr (&Scheck_coding_system);
7663 defsubr (&Sdetect_coding_region);
d46c5b12 7664 defsubr (&Sdetect_coding_string);
05e6f5dc 7665 defsubr (&Sfind_coding_systems_region_internal);
068a9dbd 7666 defsubr (&Sunencodable_char_position);
4ed46869
KH
7667 defsubr (&Sdecode_coding_region);
7668 defsubr (&Sencode_coding_region);
7669 defsubr (&Sdecode_coding_string);
7670 defsubr (&Sencode_coding_string);
7671 defsubr (&Sdecode_sjis_char);
7672 defsubr (&Sencode_sjis_char);
7673 defsubr (&Sdecode_big5_char);
7674 defsubr (&Sencode_big5_char);
1ba9e4ab 7675 defsubr (&Sset_terminal_coding_system_internal);
c4825358 7676 defsubr (&Sset_safe_terminal_coding_system_internal);
4ed46869 7677 defsubr (&Sterminal_coding_system);
1ba9e4ab 7678 defsubr (&Sset_keyboard_coding_system_internal);
4ed46869 7679 defsubr (&Skeyboard_coding_system);
a5d301df 7680 defsubr (&Sfind_operation_coding_system);
1397dc18 7681 defsubr (&Supdate_coding_systems_internal);
66cfb530 7682 defsubr (&Sset_coding_priority_internal);
6b89e3aa 7683 defsubr (&Sdefine_coding_system_internal);
4ed46869 7684
4608c386 7685 DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
48b0f3ae
PJ
7686 doc: /* List of coding systems.
7687
7688Do not alter the value of this variable manually. This variable should be
7689updated by the functions `make-coding-system' and
7690`define-coding-system-alias'. */);
4608c386
KH
7691 Vcoding_system_list = Qnil;
7692
7693 DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
48b0f3ae
PJ
7694 doc: /* Alist of coding system names.
7695Each element is one element list of coding system name.
7696This variable is given to `completing-read' as TABLE argument.
7697
7698Do not alter the value of this variable manually. This variable should be
7699updated by the functions `make-coding-system' and
7700`define-coding-system-alias'. */);
4608c386
KH
7701 Vcoding_system_alist = Qnil;
7702
4ed46869 7703 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
48b0f3ae
PJ
7704 doc: /* List of coding-categories (symbols) ordered by priority.
7705
7706On detecting a coding system, Emacs tries code detection algorithms
7707associated with each coding-category one by one in this order. When
7708one algorithm agrees with a byte sequence of source text, the coding
7709system bound to the corresponding coding-category is selected. */);
4ed46869
KH
7710 {
7711 int i;
7712
/* Walk the table from its last entry to its first, consing each symbol
   onto the front, so the finished list is in ascending index order.  */
7713 Vcoding_category_list = Qnil;
7714 for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
7715 Vcoding_category_list
d46c5b12
KH
7716 = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
7717 Vcoding_category_list);
4ed46869
KH
7718 }
7719
7720 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
48b0f3ae
PJ
7721 doc: /* Specify the coding system for read operations.
7722It is useful to bind this variable with `let', but do not set it globally.
7723If the value is a coding system, it is used for decoding on read operation.
7724If not, an appropriate element is used from one of the coding system alists:
7725There are three such tables, `file-coding-system-alist',
7726`process-coding-system-alist', and `network-coding-system-alist'. */);
4ed46869
KH
7727 Vcoding_system_for_read = Qnil;
7728
7729 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
48b0f3ae
PJ
7730 doc: /* Specify the coding system for write operations.
7731Programs bind this variable with `let', but you should not set it globally.
7732If the value is a coding system, it is used for encoding of output,
7733when writing it to a file and when sending it to a file or subprocess.
7734
7735If this does not specify a coding system, an appropriate element
7736is used from one of the coding system alists:
7737There are three such tables, `file-coding-system-alist',
7738`process-coding-system-alist', and `network-coding-system-alist'.
7739For output to files, if the above procedure does not specify a coding system,
7740the value of `buffer-file-coding-system' is used. */);
4ed46869
KH
7741 Vcoding_system_for_write = Qnil;
7742
7743 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
7c695ab9
DL
7744 doc: /* Coding system used in the latest file or process I/O.
7745Also set by `encode-coding-region', `decode-coding-region',
7746`encode-coding-string' and `decode-coding-string'. */);
4ed46869
KH
7747 Vlast_coding_system_used = Qnil;
7748
9ce27fde 7749 DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
48b0f3ae
PJ
7750 doc: /* *Non-nil means always inhibit code conversion of end-of-line format.
7751See info node `Coding Systems' and info node `Text and Binary' concerning
7752such conversion. */);
9ce27fde
KH
7753 inhibit_eol_conversion = 0;
7754
ed29121d 7755 DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
48b0f3ae
PJ
7756 doc: /* Non-nil means process buffer inherits coding system of process output.
7757Bind it to t if the process output is to be treated as if it were a file
7758read from some filesystem. */);
ed29121d
EZ
7759 inherit_process_coding_system = 0;
7760
02ba4723 7761 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
48b0f3ae
PJ
7762 doc: /* Alist to decide a coding system to use for a file I/O operation.
7763The format is ((PATTERN . VAL) ...),
7764where PATTERN is a regular expression matching a file name,
7765VAL is a coding system, a cons of coding systems, or a function symbol.
7766If VAL is a coding system, it is used for both decoding and encoding
7767the file contents.
7768If VAL is a cons of coding systems, the car part is used for decoding,
7769and the cdr part is used for encoding.
7770If VAL is a function symbol, the function must return a coding system
0192762c 7771or a cons of coding systems which are used as above. The function gets
ff955d90 7772the arguments with which `find-operation-coding-system' was called.
48b0f3ae
PJ
7773
7774See also the function `find-operation-coding-system'
7775and the variable `auto-coding-alist'. */);
02ba4723
KH
7776 Vfile_coding_system_alist = Qnil;
7777
7778 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
48b0f3ae
PJ
7779 doc: /* Alist to decide a coding system to use for a process I/O operation.
7780The format is ((PATTERN . VAL) ...),
7781where PATTERN is a regular expression matching a program name,
7782VAL is a coding system, a cons of coding systems, or a function symbol.
7783If VAL is a coding system, it is used for both decoding what received
7784from the program and encoding what sent to the program.
7785If VAL is a cons of coding systems, the car part is used for decoding,
7786and the cdr part is used for encoding.
7787If VAL is a function symbol, the function must return a coding system
7788or a cons of coding systems which are used as above.
7789
7790See also the function `find-operation-coding-system'. */);
02ba4723
KH
7791 Vprocess_coding_system_alist = Qnil;
7792
7793 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
48b0f3ae
PJ
7794 doc: /* Alist to decide a coding system to use for a network I/O operation.
7795The format is ((PATTERN . VAL) ...),
7796where PATTERN is a regular expression matching a network service name
7797or is a port number to connect to,
7798VAL is a coding system, a cons of coding systems, or a function symbol.
7799If VAL is a coding system, it is used for both decoding what received
7800from the network stream and encoding what sent to the network stream.
7801If VAL is a cons of coding systems, the car part is used for decoding,
7802and the cdr part is used for encoding.
7803If VAL is a function symbol, the function must return a coding system
7804or a cons of coding systems which are used as above.
7805
7806See also the function `find-operation-coding-system'. */);
02ba4723 7807 Vnetwork_coding_system_alist = Qnil;
4ed46869 7808
68c45bf0 7809 DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
75205970
RS
7810 doc: /* Coding system to use with system messages.
7811Also used for decoding keyboard input on X Window system. */);
68c45bf0
PE
7812 Vlocale_coding_system = Qnil;
7813
005f0d35 7814 /* The eol mnemonics are reset in startup.el system-dependently. */
7722baf9 7815 DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
48b0f3ae 7816 doc: /* *String displayed in mode line for UNIX-like (LF) end-of-line format. */);
7722baf9 7817 eol_mnemonic_unix = build_string (":");
4ed46869 7818
7722baf9 7819 DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
48b0f3ae 7820 doc: /* *String displayed in mode line for DOS-like (CRLF) end-of-line format. */);
7722baf9 7821 eol_mnemonic_dos = build_string ("\\");
4ed46869 7822
7722baf9 7823 DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
48b0f3ae 7824 doc: /* *String displayed in mode line for MAC-like (CR) end-of-line format. */);
7722baf9 7825 eol_mnemonic_mac = build_string ("/");
4ed46869 7826
7722baf9 7827 DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
48b0f3ae 7828 doc: /* *String displayed in mode line when end-of-line format is not yet determined. */);
7722baf9 7829 eol_mnemonic_undecided = build_string (":");
4ed46869 7830
84fbb8a0 7831 DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
48b0f3ae 7832 doc: /* *Non-nil enables character translation while encoding and decoding. */);
84fbb8a0 7833 Venable_character_translation = Qt;
bdd9fb48 7834
f967223b 7835 DEFVAR_LISP ("standard-translation-table-for-decode",
48b0f3ae
PJ
7836 &Vstandard_translation_table_for_decode,
7837 doc: /* Table for translating characters while decoding. */);
f967223b 7838 Vstandard_translation_table_for_decode = Qnil;
bdd9fb48 7839
f967223b 7840 DEFVAR_LISP ("standard-translation-table-for-encode",
48b0f3ae
PJ
7841 &Vstandard_translation_table_for_encode,
7842 doc: /* Table for translating characters while encoding. */);
f967223b 7843 Vstandard_translation_table_for_encode = Qnil;
4ed46869
KH
7844
7845 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
48b0f3ae
PJ
7846 doc: /* Alist of charsets vs revision numbers.
7847While encoding, if a charset (car part of an element) is found,
7848designate it with the escape sequence identifying revision (cdr part of the element). */);
4ed46869 7849 Vcharset_revision_alist = Qnil;
02ba4723
KH
7850
7851 DEFVAR_LISP ("default-process-coding-system",
7852 &Vdefault_process_coding_system,
48b0f3ae
PJ
7853 doc: /* Cons of coding systems used for process I/O by default.
7854The car part is used for decoding a process output,
7855the cdr part is used for encoding a text to be sent to a process. */);
02ba4723 7856 Vdefault_process_coding_system = Qnil;
c4825358 7857
3f003981 7858 DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
48b0f3ae
PJ
7859 doc: /* Table of extra Latin codes in the range 128..159 (inclusive).
7860This is a vector of length 256.
7861If Nth element is non-nil, the existence of code N in a file
7862\(or output of subprocess) doesn't prevent it to be detected as
7863a coding system of ISO 2022 variant which has a flag
7864`accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
7865or reading output of a subprocess.
7866Only 128th through 159th elements has a meaning. */);
3f003981 7867 Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
d46c5b12
KH
7868
7869 DEFVAR_LISP ("select-safe-coding-system-function",
7870 &Vselect_safe_coding_system_function,
48b0f3ae
PJ
7871 doc: /* Function to call to select safe coding system for encoding a text.
7872
7873If set, this function is called to force a user to select a proper
7874coding system which can encode the text in the case that a default
7875coding system used in each operation can't encode the text.
7876
7877The default value is `select-safe-coding-system' (which see). */);
d46c5b12
KH
7878 Vselect_safe_coding_system_function = Qnil;
7879
5d5bf4d8
KH
7880 DEFVAR_BOOL ("coding-system-require-warning",
7881 &coding_system_require_warning,
7882 doc: /* Internal use only.
6b89e3aa
KH
7883If non-nil, on writing a file, `select-safe-coding-system-function' is
7884called even if `coding-system-for-write' is non-nil. The command
7885`universal-coding-system-argument' binds this variable to t temporarily. */);
5d5bf4d8
KH
7886 coding_system_require_warning = 0;
7887
7888
22ab2303 7889 DEFVAR_BOOL ("inhibit-iso-escape-detection",
74383408 7890 &inhibit_iso_escape_detection,
48b0f3ae
PJ
7891 doc: /* If non-nil, Emacs ignores ISO2022's escape sequence on code detection.
7892
7893By default, on reading a file, Emacs tries to detect how the text is
7894encoded. This code detection is sensitive to escape sequences. If
7895the sequence is valid as ISO2022, the code is determined as one of
7896the ISO2022 encodings, and the file is decoded by the corresponding
7897coding system (e.g. `iso-2022-7bit').
7898
7899However, there may be a case that you want to read escape sequences in
7900a file as is. In such a case, you can set this variable to non-nil.
7901Then, as the code detection ignores any escape sequences, no file is
7902detected as encoded in some ISO2022 encoding. The result is that all
7903escape sequences become visible in a buffer.
7904
7905The default value is nil, and it is strongly recommended not to change
7906it. That is because many Emacs Lisp source files that contain
7907non-ASCII characters are encoded by the coding system `iso-2022-7bit'
7908in Emacs's distribution, and they won't be decoded correctly on
7909reading if you suppress escape sequence detection.
7910
7911The other way to read escape sequences in a file without decoding is
7912to explicitly specify some coding system that doesn't use ISO2022's
7913escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument]. */);
74383408 7914 inhibit_iso_escape_detection = 0;
002fdb44
DL
7915
7916 DEFVAR_LISP ("translation-table-for-input", &Vtranslation_table_for_input,
15c8f9d1
DL
7917 doc: /* Char table for translating self-inserting characters.
7918This is applied to the result of input methods, not their input. See also
7919`keyboard-translate-table'. */);
002fdb44 7920 Vtranslation_table_for_input = Qnil;
4ed46869
KH
7921}
7922
68c45bf0
PE
7923char *
7924emacs_strerror (error_number)
7925 int error_number;
7926{
7927 char *str;
7928
ca9c0567 7929 synchronize_system_messages_locale ();
68c45bf0
PE
7930 str = strerror (error_number);
7931
7932 if (! NILP (Vlocale_coding_system))
7933 {
7934 Lisp_Object dec = code_convert_string_norecord (build_string (str),
7935 Vlocale_coding_system,
7936 0);
d5db4077 7937 str = (char *) SDATA (dec);
68c45bf0
PE
7938 }
7939
7940 return str;
7941}
7942
4ed46869 7943#endif /* emacs */
c2f94ebc 7944
ab5796a9
MB
7945/* arch-tag: 3a3a2b01-5ff6-4071-9afe-f5b808d9229d
7946 (do not change this comment) */