(openp): Initialized encoded_fn before GCPRO it.
[bpt/emacs.git] / src / coding.c
CommitLineData
4ed46869 1/* Coding system handler (conversion, detection, and etc).
0b5538bd 2 Copyright (C) 2001, 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
ce03bf76
KH
3 Copyright (C) 1995, 1997, 1998, 2002, 2003, 2004, 2005
4 National Institute of Advanced Industrial Science and Technology (AIST)
5 Registration Number H14PRO021
4ed46869 6
369314dc
KH
7This file is part of GNU Emacs.
8
9GNU Emacs is free software; you can redistribute it and/or modify
10it under the terms of the GNU General Public License as published by
11the Free Software Foundation; either version 2, or (at your option)
12any later version.
4ed46869 13
369314dc
KH
14GNU Emacs is distributed in the hope that it will be useful,
15but WITHOUT ANY WARRANTY; without even the implied warranty of
16MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17GNU General Public License for more details.
4ed46869 18
369314dc
KH
19You should have received a copy of the GNU General Public License
20along with GNU Emacs; see the file COPYING. If not, write to
4fc5845f
LK
21the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
22Boston, MA 02110-1301, USA. */
4ed46869
KH
23
24/*** TABLE OF CONTENTS ***
25
b73bfc1c 26 0. General comments
4ed46869 27 1. Preamble
0ef69138 28 2. Emacs' internal format (emacs-mule) handlers
4ed46869
KH
29 3. ISO2022 handlers
30 4. Shift-JIS and BIG5 handlers
1397dc18
KH
31 5. CCL handlers
32 6. End-of-line handlers
33 7. C library functions
34 8. Emacs Lisp library functions
35 9. Post-amble
4ed46869
KH
36
37*/
38
b73bfc1c
KH
39/*** 0. General comments ***/
40
41
cfb43547 42/*** GENERAL NOTE on CODING SYSTEMS ***
4ed46869 43
cfb43547 44 A coding system is an encoding mechanism for one or more character
4ed46869
KH
45 sets. Here's a list of coding systems which Emacs can handle. When
46 we say "decode", it means converting some other coding system to
cfb43547 47 Emacs' internal format (emacs-mule), and when we say "encode",
0ef69138
KH
48 it means converting the coding system emacs-mule to some other
49 coding system.
4ed46869 50
0ef69138 51 0. Emacs' internal format (emacs-mule)
4ed46869 52
cfb43547 53 Emacs itself holds a multi-lingual character in buffers and strings
f4dee582 54 in a special format. Details are described in section 2.
4ed46869
KH
55
56 1. ISO2022
57
58 The most famous coding system for multiple character sets. X's
f4dee582
RS
59 Compound Text, various EUCs (Extended Unix Code), and coding
60 systems used in Internet communication such as ISO-2022-JP are
61 all variants of ISO2022. Details are described in section 3.
4ed46869
KH
62
63 2. SJIS (or Shift-JIS or MS-Kanji-Code)
93dec019 64
4ed46869
KH
65 A coding system to encode character sets: ASCII, JISX0201, and
66 JISX0208. Widely used for PC's in Japan. Details are described in
f4dee582 67 section 4.
4ed46869
KH
68
69 3. BIG5
70
cfb43547
DL
71 A coding system to encode the character sets ASCII and Big5. Widely
72 used for Chinese (mainly in Taiwan and Hong Kong). Details are
f4dee582
RS
73 described in section 4. In this file, when we write "BIG5"
74 (all uppercase), we mean the coding system, and when we write
75 "Big5" (capitalized), we mean the character set.
4ed46869 76
27901516
KH
77 4. Raw text
78
cfb43547
DL
79 A coding system for text containing random 8-bit code. Emacs does
80 no code conversion on such text except for end-of-line format.
27901516
KH
81
82 5. Other
4ed46869 83
cfb43547
DL
84 If a user wants to read/write text encoded in a coding system not
85 listed above, he can supply a decoder and an encoder for it as CCL
4ed46869
KH
86 (Code Conversion Language) programs. Emacs executes the CCL program
87 while reading/writing.
88
d46c5b12
KH
89 Emacs represents a coding system by a Lisp symbol that has a property
90 `coding-system'. But, before actually using the coding system, the
4ed46869 91 information about it is set in a structure of type `struct
f4dee582 92 coding_system' for rapid processing. See section 6 for more details.
4ed46869
KH
93
94*/
95
96/*** GENERAL NOTES on END-OF-LINE FORMAT ***
97
cfb43547
DL
98 How end-of-line of text is encoded depends on the operating system.
99 For instance, Unix's format is just one byte of `line-feed' code,
f4dee582 100 whereas DOS's format is two-byte sequence of `carriage-return' and
d46c5b12
KH
101 `line-feed' codes. MacOS's format is usually one byte of
102 `carriage-return'.
4ed46869 103
cfb43547
DL
104 Since text character encoding and end-of-line encoding are
105 independent, any coding system described above can have any
106 end-of-line format. So Emacs has information about end-of-line
107 format in each coding-system. See section 6 for more details.
4ed46869
KH
108
109*/
110
111/*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
112
113 These functions check if a text between SRC and SRC_END is encoded
114 in the coding system category XXX. Each returns an integer value in
cfb43547 115 which appropriate flag bits for the category XXX are set. The flag
4ed46869 116 bits are defined in macros CODING_CATEGORY_MASK_XXX. Below is the
cfb43547 117 template for these functions. If MULTIBYTEP is nonzero, 8-bit codes
0a28aafb 118 of the range 0x80..0x9F are in multibyte form. */
4ed46869
KH
119#if 0
120int
0a28aafb 121detect_coding_emacs_mule (src, src_end, multibytep)
4ed46869 122 unsigned char *src, *src_end;
0a28aafb 123 int multibytep;
4ed46869
KH
124{
125 ...
126}
127#endif
128
129/*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
130
b73bfc1c
KH
131 These functions decode SRC_BYTES length of unibyte text at SOURCE
132 encoded in CODING to Emacs' internal format. The resulting
133 multibyte text goes to a place pointed to by DESTINATION, the length
134 of which should not exceed DST_BYTES.
d46c5b12 135
cfb43547
DL
136 These functions set the information about original and decoded texts
137 in the members `produced', `produced_char', `consumed', and
138 `consumed_char' of the structure *CODING. They also set the member
139 `result' to one of CODING_FINISH_XXX indicating how the decoding
140 finished.
d46c5b12 141
cfb43547 142 DST_BYTES zero means that the source area and destination area are
d46c5b12 143 overlapped, which means that we can produce a decoded text until it
cfb43547 144 reaches the head of the not-yet-decoded source text.
d46c5b12 145
cfb43547 146 Below is a template for these functions. */
4ed46869 147#if 0
b73bfc1c 148static void
d46c5b12 149decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
4ed46869 150 struct coding_system *coding;
5bdca8af
DN
151 const unsigned char *source;
152 unsigned char *destination;
4ed46869 153 int src_bytes, dst_bytes;
4ed46869
KH
154{
155 ...
156}
157#endif
158
159/*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
160
cfb43547 161 These functions encode SRC_BYTES length text at SOURCE from Emacs'
b73bfc1c
KH
162 internal multibyte format to CODING. The resulting unibyte text
163 goes to a place pointed to by DESTINATION, the length of which
164 should not exceed DST_BYTES.
d46c5b12 165
cfb43547
DL
166 These functions set the information about original and encoded texts
167 in the members `produced', `produced_char', `consumed', and
168 `consumed_char' of the structure *CODING. They also set the member
169 `result' to one of CODING_FINISH_XXX indicating how the encoding
170 finished.
d46c5b12 171
cfb43547
DL
172 DST_BYTES zero means that the source area and destination area are
173 overlapped, which means that we can produce encoded text until it
174 reaches the head of the not-yet-encoded source text.
d46c5b12 175
cfb43547 176 Below is a template for these functions. */
4ed46869 177#if 0
b73bfc1c 178static void
d46c5b12 179encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
4ed46869
KH
180 struct coding_system *coding;
181 unsigned char *source, *destination;
182 int src_bytes, dst_bytes;
4ed46869
KH
183{
184 ...
185}
186#endif
187
188/*** COMMONLY USED MACROS ***/
189
b73bfc1c
KH
190/* The following two macros ONE_MORE_BYTE and TWO_MORE_BYTES safely
191 get one and two bytes from the source text respectively.
192 If there are not enough bytes in the source, they jump to
193 `label_end_of_loop'. The caller should set variables `coding',
194 `src' and `src_end' to appropriate pointer in advance. These
195 macros are called from decoding routines `decode_coding_XXX', thus
196 it is assumed that the source text is unibyte. */
4ed46869 197
b73bfc1c
KH
/* Fetch the next source byte into C1.  When the source is exhausted,
   record CODING_FINISH_INSUFFICIENT_SRC in `coding->result' and jump
   to `label_end_of_loop' instead.  */
#define ONE_MORE_BYTE(c1)					\
  do {								\
    if (src < src_end)						\
      c1 = *src++;						\
    else							\
      {								\
	coding->result = CODING_FINISH_INSUFFICIENT_SRC;	\
	goto label_end_of_loop;					\
      }								\
  } while (0)
207
b73bfc1c
KH
/* Fetch the next two source bytes into C1 and C2.  When fewer than
   two bytes remain, record CODING_FINISH_INSUFFICIENT_SRC in
   `coding->result' and jump to `label_end_of_loop' without consuming
   anything.  The remaining length is compared (rather than `src + 1')
   so that no pointer beyond one-past-the-end is ever formed.  */
#define TWO_MORE_BYTES(c1, c2)					\
  do {								\
    if (src_end - src < 2)					\
      {								\
	coding->result = CODING_FINISH_INSUFFICIENT_SRC;	\
	goto label_end_of_loop;					\
      }								\
    c1 = *src++;						\
    c2 = *src++;						\
  } while (0)
218
4ed46869 219
0a28aafb
KH
220/* Like ONE_MORE_BYTE, but 8-bit bytes of data at SRC are in multibyte
221 form if MULTIBYTEP is nonzero. */
222
/* When MULTIBYTEP is nonzero and the fetched byte is
   LEADING_CODE_8_BIT_CONTROL, the real 8-bit code is carried by the
   following byte (offset by 0x20).  That trailing byte is bounds
   checked too, so a leading code at the very end of the source is
   reported as CODING_FINISH_INSUFFICIENT_SRC instead of reading past
   SRC_END.  */
#define ONE_MORE_BYTE_CHECK_MULTIBYTE(c1, multibytep)		\
  do {								\
    if (src >= src_end)						\
      {								\
	coding->result = CODING_FINISH_INSUFFICIENT_SRC;	\
	goto label_end_of_loop;					\
      }								\
    c1 = *src++;						\
    if ((multibytep) && c1 == LEADING_CODE_8_BIT_CONTROL)	\
      {								\
	if (src >= src_end)					\
	  {							\
	    coding->result = CODING_FINISH_INSUFFICIENT_SRC;	\
	    goto label_end_of_loop;				\
	  }							\
	c1 = *src++ - 0x20;					\
      }								\
  } while (0)
234
b73bfc1c
KH
235/* Set C to the next character at the source text pointed by `src'.
236 If there are not enough characters in the source, jump to
237 `label_end_of_loop'. The caller should set variables `coding'
238 `src', `src_end', and `translation_table' to appropriate pointers
239 in advance. This macro is used in encoding routines
240 `encode_coding_XXX', thus it assumes that the source text is in
241 multibyte form except for 8-bit characters. 8-bit characters are
242 in multibyte form if coding->src_multibyte is nonzero, else they
243 are represented by a single byte. */
4ed46869 244
b73bfc1c
KH
/* Fetch the next character into C, advance `src' past it, and run it
   through `translation_table' when that table is non-nil.  Jump to
   `label_end_of_loop' with CODING_FINISH_INSUFFICIENT_SRC when no
   source is left.  */
#define ONE_MORE_CHAR(c)						\
  do {									\
    int bytes;								\
    int len = src_end - src;						\
									\
    if (len <= 0)							\
      {									\
	coding->result = CODING_FINISH_INSUFFICIENT_SRC;		\
	goto label_end_of_loop;						\
      }									\
    if (! coding->src_multibyte						\
	&& ! UNIBYTE_STR_AS_MULTIBYTE_P (src, len, bytes))		\
      {									\
	/* A lone byte that is not a valid multibyte sequence.  */	\
	c = *src;							\
	bytes = 1;							\
      }									\
    else								\
      c = STRING_CHAR_AND_LENGTH (src, len, bytes);			\
    if (! NILP (translation_table))					\
      c = translate_char (translation_table, c, -1, 0, 0);		\
    src += bytes;							\
  } while (0)
263
4ed46869 264
8ca3766a 265/* Produce a multibyte form of character C to `dst'. Jump to
b73bfc1c
KH
266 `label_end_of_loop' if there's not enough space at `dst'.
267
cfb43547 268 If we are now in the middle of a composition sequence, the decoded
b73bfc1c
KH
269 character may be ALTCHAR (for the current composition). In that
270 case, the character goes to coding->cmp_data->data instead of
271 `dst'.
272
273 This macro is used in decoding routines. */
274
#define EMIT_CHAR(c)							\
  do {									\
    /* The character itself goes to `dst' unless we are composing	\
       with alternate characters.  */					\
    int emit_to_dst_p							\
      = (! COMPOSING_P (coding)						\
	 || coding->composing == COMPOSITION_RELATIVE			\
	 || coding->composing == COMPOSITION_WITH_RULE);		\
									\
    if (emit_to_dst_p)							\
      {									\
	int nbytes = CHAR_BYTES (c);					\
									\
	if ((dst_bytes ? dst_end : src) < (dst + nbytes))		\
	  {								\
	    coding->result = CODING_FINISH_INSUFFICIENT_DST;		\
	    goto label_end_of_loop;					\
	  }								\
	dst += CHAR_STRING (c, dst);					\
	coding->produced_char++;					\
      }									\
									\
    /* Every composition method except the relative one also records	\
       the character as a composition component.  */			\
    if (COMPOSING_P (coding)						\
	&& coding->composing != COMPOSITION_RELATIVE)			\
      {									\
	CODING_ADD_COMPOSITION_COMPONENT (coding, c);			\
	coding->composition_rule_follows				\
	  = (coding->composing != COMPOSITION_WITH_ALTCHARS);		\
      }									\
  } while (0)
299
4ed46869 300
b73bfc1c
KH
/* Store the single byte C at `dst' and advance `dst'.  Jump to
   `label_end_of_loop' with CODING_FINISH_INSUFFICIENT_DST when the
   destination is full.  The limit is `dst_end', or `src' when
   DST_BYTES is zero.  */
#define EMIT_ONE_BYTE(c)					\
  do {								\
    if (dst >= (dst_bytes ? dst_end : src))			\
      {								\
	coding->result = CODING_FINISH_INSUFFICIENT_DST;	\
	goto label_end_of_loop;					\
      }								\
    *dst++ = (c);						\
  } while (0)
310
/* Store the bytes C1 and C2 at `dst' and advance `dst' by two.  Jump
   to `label_end_of_loop' with CODING_FINISH_INSUFFICIENT_DST when
   fewer than two bytes of room are left; nothing is written then.  */
#define EMIT_TWO_BYTES(c1, c2)					\
  do {								\
    if (dst + 2 > (dst_bytes ? dst_end : src))			\
      {								\
	coding->result = CODING_FINISH_INSUFFICIENT_DST;	\
	goto label_end_of_loop;					\
      }								\
    *dst++ = (c1);						\
    *dst++ = (c2);						\
  } while (0)
320
/* Copy the bytes in [FROM, TO) to `dst', advancing both `dst' and
   FROM (FROM must therefore be a modifiable pointer lvalue).  Jump to
   `label_end_of_loop' with CODING_FINISH_INSUFFICIENT_DST, copying
   nothing, when the whole range does not fit.  */
#define EMIT_BYTES(from, to)					\
  do {								\
    if (dst + ((to) - (from)) > (dst_bytes ? dst_end : src))	\
      {								\
	coding->result = CODING_FINISH_INSUFFICIENT_DST;	\
	goto label_end_of_loop;					\
      }								\
    while ((from) < (to))					\
      *dst++ = *(from)++;					\
  } while (0)
331
332\f
333/*** 1. Preamble ***/
334
68c45bf0
PE
335#ifdef emacs
336#include <config.h>
337#endif
338
4ed46869
KH
339#include <stdio.h>
340
341#ifdef emacs
342
4ed46869
KH
343#include "lisp.h"
344#include "buffer.h"
345#include "charset.h"
ec6d2bb8 346#include "composite.h"
4ed46869
KH
347#include "ccl.h"
348#include "coding.h"
349#include "window.h"
66638433 350#include "intervals.h"
4ed46869
KH
351
352#else /* not emacs */
353
354#include "mulelib.h"
355
356#endif /* not emacs */
357
358Lisp_Object Qcoding_system, Qeol_type;
359Lisp_Object Qbuffer_file_coding_system;
360Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
27901516 361Lisp_Object Qno_conversion, Qundecided;
bb0115a2 362Lisp_Object Qcoding_system_history;
05e6f5dc 363Lisp_Object Qsafe_chars;
1397dc18 364Lisp_Object Qvalid_codes;
4ed46869
KH
365
366extern Lisp_Object Qinsert_file_contents, Qwrite_region;
387f6ba5 367Lisp_Object Qcall_process, Qcall_process_region;
4ed46869
KH
368Lisp_Object Qstart_process, Qopen_network_stream;
369Lisp_Object Qtarget_idx;
370
a362520d
KH
371/* If a symbol has this property, evaluate the value to define the
372 symbol as a coding system. */
373Lisp_Object Qcoding_system_define_form;
374
d46c5b12
KH
375Lisp_Object Vselect_safe_coding_system_function;
376
5d5bf4d8
KH
377int coding_system_require_warning;
378
7722baf9
EZ
379/* Mnemonic string for each format of end-of-line. */
380Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
381/* Mnemonic string to indicate format of end-of-line is not yet
4ed46869 382 decided. */
7722baf9 383Lisp_Object eol_mnemonic_undecided;
4ed46869 384
9ce27fde
KH
385/* Format of end-of-line decided by system. This is CODING_EOL_LF on
386 Unix, CODING_EOL_CRLF on DOS/Windows, and CODING_EOL_CR on Mac. */
387int system_eol_type;
388
4ed46869
KH
389#ifdef emacs
390
6b89e3aa
KH
391/* Information about which coding system is safe for which chars.
392 The value has the form (GENERIC-LIST . NON-GENERIC-ALIST).
393
394 GENERIC-LIST is a list of generic coding systems which can encode
395 any characters.
396
397 NON-GENERIC-ALIST is an alist of non generic coding systems vs the
398 corresponding char table that contains safe chars. */
399Lisp_Object Vcoding_system_safe_chars;
400
4608c386
KH
401Lisp_Object Vcoding_system_list, Vcoding_system_alist;
402
403Lisp_Object Qcoding_system_p, Qcoding_system_error;
4ed46869 404
d46c5b12
KH
405/* Coding system emacs-mule and raw-text are for converting only
406 end-of-line format. */
407Lisp_Object Qemacs_mule, Qraw_text;
9ce27fde 408
ecf488bc
DL
409Lisp_Object Qutf_8;
410
4ed46869
KH
411/* Coding-systems are handed between Emacs Lisp programs and C internal
412 routines by the following three variables. */
413/* Coding-system for reading files and receiving data from process. */
414Lisp_Object Vcoding_system_for_read;
415/* Coding-system for writing files and sending data to process. */
416Lisp_Object Vcoding_system_for_write;
417/* Coding-system actually used in the latest I/O. */
418Lisp_Object Vlast_coding_system_used;
419
c4825358 420/* A vector of length 256 which contains information about special
94487c4e 421 Latin codes (especially for dealing with Microsoft codes). */
3f003981 422Lisp_Object Vlatin_extra_code_table;
c4825358 423
9ce27fde
KH
424/* Flag to inhibit code conversion of end-of-line format. */
425int inhibit_eol_conversion;
426
74383408
KH
427/* Flag to inhibit ISO2022 escape sequence detection. */
428int inhibit_iso_escape_detection;
429
ed29121d
EZ
430/* Flag to make buffer-file-coding-system inherit from process-coding. */
431int inherit_process_coding_system;
432
c4825358 433/* Coding system to be used to encode text for terminal display. */
4ed46869
KH
434struct coding_system terminal_coding;
435
c4825358
KH
436/* Coding system to be used to encode text for terminal display when
437 terminal coding system is nil. */
438struct coding_system safe_terminal_coding;
439
440/* Coding system of what is sent from terminal keyboard. */
4ed46869
KH
441struct coding_system keyboard_coding;
442
6bc51348
KH
443/* Default coding system to be used to write a file. */
444struct coding_system default_buffer_file_coding;
445
02ba4723
KH
446Lisp_Object Vfile_coding_system_alist;
447Lisp_Object Vprocess_coding_system_alist;
448Lisp_Object Vnetwork_coding_system_alist;
4ed46869 449
68c45bf0
PE
450Lisp_Object Vlocale_coding_system;
451
4ed46869
KH
452#endif /* emacs */
453
d46c5b12 454Lisp_Object Qcoding_category, Qcoding_category_index;
4ed46869
KH
455
456/* List of symbols `coding-category-xxx' ordered by priority. */
457Lisp_Object Vcoding_category_list;
458
d46c5b12
KH
459/* Table of coding categories (Lisp symbols). */
460Lisp_Object Vcoding_category_table;
4ed46869
KH
461
462/* Table of names of symbol for each coding-category. */
463char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
0ef69138 464 "coding-category-emacs-mule",
4ed46869
KH
465 "coding-category-sjis",
466 "coding-category-iso-7",
d46c5b12 467 "coding-category-iso-7-tight",
4ed46869
KH
468 "coding-category-iso-8-1",
469 "coding-category-iso-8-2",
7717c392
KH
470 "coding-category-iso-7-else",
471 "coding-category-iso-8-else",
89fa8b36 472 "coding-category-ccl",
4ed46869 473 "coding-category-big5",
fa42c37f
KH
474 "coding-category-utf-8",
475 "coding-category-utf-16-be",
476 "coding-category-utf-16-le",
27901516 477 "coding-category-raw-text",
89fa8b36 478 "coding-category-binary"
4ed46869
KH
479};
480
66cfb530 481/* Table of pointers to coding systems corresponding to each coding
d46c5b12
KH
482 categories. */
483struct coding_system *coding_system_table[CODING_CATEGORY_IDX_MAX];
484
66cfb530 485/* Table of coding category masks. Nth element is a mask for a coding
8ca3766a 486 category of which priority is Nth. */
66cfb530
KH
487static
488int coding_priorities[CODING_CATEGORY_IDX_MAX];
489
f967223b
KH
490/* Flag to tell if we look up translation table on character code
491 conversion. */
84fbb8a0 492Lisp_Object Venable_character_translation;
f967223b
KH
493/* Standard translation table to look up on decoding (reading). */
494Lisp_Object Vstandard_translation_table_for_decode;
495/* Standard translation table to look up on encoding (writing). */
496Lisp_Object Vstandard_translation_table_for_encode;
84fbb8a0 497
f967223b
KH
498Lisp_Object Qtranslation_table;
499Lisp_Object Qtranslation_table_id;
500Lisp_Object Qtranslation_table_for_decode;
501Lisp_Object Qtranslation_table_for_encode;
4ed46869
KH
502
503/* Alist of charsets vs revision number. */
504Lisp_Object Vcharset_revision_alist;
505
02ba4723
KH
506/* Default coding systems used for process I/O. */
507Lisp_Object Vdefault_process_coding_system;
508
002fdb44
DL
509/* Char table for translating Quail and self-inserting input. */
510Lisp_Object Vtranslation_table_for_input;
511
b843d1ae
KH
512/* Global flag to tell that we can't call post-read-conversion and
513 pre-write-conversion functions. Usually the value is zero, but it
514 is set to 1 temporarily while such functions are running. This is
515 to avoid infinite recursive call. */
516static int inhibit_pre_post_conversion;
517
05e6f5dc
KH
518Lisp_Object Qchar_coding_system;
519
6b89e3aa
KH
520/* Return `safe-chars' property of CODING_SYSTEM (symbol). Don't check
521 its validity. */
05e6f5dc
KH
522
523Lisp_Object
6b89e3aa
KH
524coding_safe_chars (coding_system)
525 Lisp_Object coding_system;
05e6f5dc
KH
526{
527 Lisp_Object coding_spec, plist, safe_chars;
93dec019 528
6b89e3aa 529 coding_spec = Fget (coding_system, Qcoding_system);
05e6f5dc
KH
530 plist = XVECTOR (coding_spec)->contents[3];
531 safe_chars = Fplist_get (XVECTOR (coding_spec)->contents[3], Qsafe_chars);
532 return (CHAR_TABLE_P (safe_chars) ? safe_chars : Qt);
533}
534
/* Nonzero if C is safe according to SAFE_CHARS: either SAFE_CHARS is
   t, or it is a char table holding a non-nil entry for C.  */
#define CODING_SAFE_CHAR_P(safe_chars, c)			\
  (EQ (safe_chars, Qt)						\
   ? 1								\
   : !NILP (CHAR_TABLE_REF (safe_chars, c)))
537
4ed46869 538\f
0ef69138 539/*** 2. Emacs internal format (emacs-mule) handlers ***/
4ed46869 540
aa72b389
KH
541/* Emacs' internal format for representation of multiple character
542 sets is a kind of multi-byte encoding, i.e. characters are
543 represented by variable-length sequences of one-byte codes.
b73bfc1c
KH
544
545 ASCII characters and control characters (e.g. `tab', `newline') are
546 represented by one-byte sequences which are their ASCII codes, in
547 the range 0x00 through 0x7F.
548
549 8-bit characters of the range 0x80..0x9F are represented by
550 two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
551 code + 0x20).
552
553 8-bit characters of the range 0xA0..0xFF are represented by
554 one-byte sequences which are their 8-bit code.
555
556 The other characters are represented by a sequence of `base
557 leading-code', optional `extended leading-code', and one or two
558 `position-code's. The length of the sequence is determined by the
aa72b389 559 base leading-code. Leading-code takes the range 0x81 through 0x9D,
b73bfc1c
KH
560 whereas extended leading-code and position-code take the range 0xA0
561 through 0xFF. See `charset.h' for more details about leading-code
562 and position-code.
f4dee582 563
4ed46869 564 --- CODE RANGE of Emacs' internal format ---
b73bfc1c
KH
565 character set range
566 ------------- -----
567 ascii 0x00..0x7F
568 eight-bit-control LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
569 eight-bit-graphic 0xA0..0xFF
aa72b389 570 ELSE 0x81..0x9D + [0xA0..0xFF]+
4ed46869
KH
571 ---------------------------------------------
572
aa72b389
KH
573 As this is the internal character representation, the format is
574 usually not used externally (i.e. in a file or in a data sent to a
575 process). But, it is possible to have a text externally in this
576 format (i.e. by encoding by the coding system `emacs-mule').
577
578 In that case, a sequence of one-byte codes has a slightly different
579 form.
580
ae5145c2 581 Firstly, all characters in eight-bit-control are represented by
aa72b389
KH
582 one-byte sequences which are their 8-bit code.
583
584 Next, character composition data are represented by the byte
585 sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
586 where,
587 METHOD is 0xF0 plus one of composition method (enum
588 composition_method),
589
ae5145c2 590 BYTES is 0xA0 plus the byte length of these composition data,
aa72b389 591
ae5145c2 592 CHARS is 0xA0 plus the number of characters composed by these
aa72b389
KH
593 data,
594
8ca3766a 595 COMPONENTs are characters of multibyte form or composition
aa72b389
KH
596 rules encoded by two-byte of ASCII codes.
597
598 In addition, for backward compatibility, the following formats are
599 also recognized as composition data on decoding.
600
601 0x80 MSEQ ...
602 0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
603
604 Here,
605 MSEQ is a multibyte form but in one of these special formats:
606 ASCII: 0xA0 ASCII_CODE+0x80,
607 other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
608 RULE is a one byte code of the range 0xA0..0xF0 that
609 represents a composition rule.
4ed46869
KH
610 */
611
612enum emacs_code_class_type emacs_code_class[256];
613
4ed46869
KH
614/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
615 Check if a text is encoded in Emacs' internal format. If it is,
d46c5b12 616 return CODING_CATEGORY_MASK_EMACS_MULE, else return 0. */
4ed46869 617
0a28aafb
KH
618static int
619detect_coding_emacs_mule (src, src_end, multibytep)
b73bfc1c 620 unsigned char *src, *src_end;
0a28aafb 621 int multibytep;
4ed46869
KH
622{
623 unsigned char c;
624 int composing = 0;
b73bfc1c
KH
625 /* Dummy for ONE_MORE_BYTE. */
626 struct coding_system dummy_coding;
627 struct coding_system *coding = &dummy_coding;
4ed46869 628
b73bfc1c 629 while (1)
4ed46869 630 {
0a28aafb 631 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
4ed46869
KH
632
633 if (composing)
634 {
635 if (c < 0xA0)
636 composing = 0;
b73bfc1c
KH
637 else if (c == 0xA0)
638 {
0a28aafb 639 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
b73bfc1c
KH
640 c &= 0x7F;
641 }
4ed46869
KH
642 else
643 c -= 0x20;
644 }
645
b73bfc1c 646 if (c < 0x20)
4ed46869 647 {
4ed46869
KH
648 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
649 return 0;
b73bfc1c
KH
650 }
651 else if (c >= 0x80 && c < 0xA0)
652 {
653 if (c == 0x80)
654 /* Old leading code for a composite character. */
655 composing = 1;
656 else
657 {
658 unsigned char *src_base = src - 1;
659 int bytes;
4ed46869 660
b73bfc1c
KH
661 if (!UNIBYTE_STR_AS_MULTIBYTE_P (src_base, src_end - src_base,
662 bytes))
663 return 0;
664 src = src_base + bytes;
665 }
666 }
667 }
668 label_end_of_loop:
669 return CODING_CATEGORY_MASK_EMACS_MULE;
670}
4ed46869 671
4ed46869 672
aa72b389
KH
673/* Record the starting position START and METHOD of one composition. */
674
/* Opens a four-slot record in CODING->cmp_data: slot 0 is set to -1
   as a placeholder, slot 1 to the start position, slot 3 to METHOD.
   Slots 0 and 2 are filled in by CODING_ADD_COMPOSITION_END.  */
#define CODING_ADD_COMPOSITION_START(coding, start, method)	\
  do {								\
    struct composition_data *cmp_data = (coding)->cmp_data;	\
    int *data = cmp_data->data + cmp_data->used;		\
    (coding)->cmp_data_start = cmp_data->used;			\
    data[0] = -1;						\
    data[1] = cmp_data->char_offset + (start);			\
    data[3] = (int) (method);					\
    cmp_data->used += 4;					\
  } while (0)
685
686/* Record the ending position END of the current composition. */
687
/* Closes the record that begins at CODING->cmp_data_start: slot 0
   receives the record length and slot 2 the end position.  */
#define CODING_ADD_COMPOSITION_END(coding, end)			\
  do {								\
    struct composition_data *cmp_data = (coding)->cmp_data;	\
    int *data = cmp_data->data + (coding)->cmp_data_start;	\
    data[0] = cmp_data->used - (coding)->cmp_data_start;	\
    data[2] = cmp_data->char_offset + (end);			\
  } while (0)
695
696/* Record one COMPONENT (alternate character or composition rule). */
697
b6871cc7
KH
/* Appends COMPONENT to the current record.  Once the record reaches
   COMPOSITION_DATA_MAX_BUNCH_LENGTH entries it is closed at the
   current `produced_char' and composing is switched off.  */
#define CODING_ADD_COMPOSITION_COMPONENT(coding, component)		\
  do {									\
    (coding)->cmp_data->data[(coding)->cmp_data->used++] = (component);	\
    if ((coding)->cmp_data->used - (coding)->cmp_data_start		\
	== COMPOSITION_DATA_MAX_BUNCH_LENGTH)				\
      {									\
	CODING_ADD_COMPOSITION_END (coding, (coding)->produced_char);	\
	(coding)->composing = COMPOSITION_NO;				\
      }									\
  } while (0)
aa72b389
KH
708
709
710/* Get one byte from a data pointed by SRC and increment SRC. If SRC
8ca3766a 711 is not less than SRC_END, return -1 without incrementing SRC. */
aa72b389
KH
712
#define SAFE_ONE_MORE_BYTE() (src < src_end ? *src++ : -1)
714
715
716/* Decode a character represented as a component of composition
717 sequence of Emacs 20 style at SRC. Set C to that character, store
718 its multibyte form sequence at P, and set P to the end of that
719 sequence. If no valid character is found, set C to -1. */
720
#define DECODE_EMACS_MULE_COMPOSITION_CHAR(c, p)			\
  do {									\
    int bytes;								\
									\
    c = SAFE_ONE_MORE_BYTE ();						\
    if (c < 0)								\
      break;								\
    if (CHAR_HEAD_P (c))						\
      c = -1;								\
    else if (c == 0xA0)							\
      {									\
	/* An ASCII component: 0xA0 followed by ASCII_CODE+0x80, so	\
	   the code is recovered by subtracting 0x80.  */		\
	c = SAFE_ONE_MORE_BYTE ();					\
	if (c < 0xA0)							\
	  c = -1;							\
	else								\
	  {								\
	    c -= 0x80;							\
	    *p++ = c;							\
	  }								\
      }									\
    else if (BASE_LEADING_CODE_P (c - 0x20))				\
      {									\
	/* Any other component: the leading code is stored with 0x20	\
	   added, followed by the remaining bytes unchanged.  */	\
	unsigned char *p0 = p;						\
									\
	c -= 0x20;							\
	*p++ = c;							\
	bytes = BYTES_BY_CHAR_HEAD (c);					\
	while (--bytes)							\
	  {								\
	    c = SAFE_ONE_MORE_BYTE ();					\
	    if (c < 0)							\
	      break;							\
	    *p++ = c;							\
	  }								\
	if (UNIBYTE_STR_AS_MULTIBYTE_P (p0, p - p0, bytes)		\
	    || (coding->flags /* We are recovering a file.  */		\
		&& p0[0] == LEADING_CODE_8_BIT_CONTROL			\
		&& ! CHAR_HEAD_P (p0[1])))				\
	  c = STRING_CHAR (p0, bytes);					\
	else								\
	  c = -1;							\
      }									\
    else								\
      c = -1;								\
  } while (0)
766
767
768/* Decode a composition rule represented as a component of composition
769 sequence of Emacs 20 style at SRC. Set C to the rule. If not
770 valid rule is found, set C to -1. */
771
/* The rule byte is 0xA0 plus (GREF * 9 + NREF); the caller must
   provide the variables `gref' and `nref'.  */
#define DECODE_EMACS_MULE_COMPOSITION_RULE(c)		\
  do {							\
    c = SAFE_ONE_MORE_BYTE ();				\
    c -= 0xA0;						\
    if (c >= 0 && c < 81)				\
      {							\
	gref = c / 9;					\
	nref = c % 9;					\
	c = COMPOSITION_ENCODE_RULE (gref, nref);	\
      }							\
    else						\
      c = -1;						\
  } while (0)
784
785
786/* Decode composition sequence encoded by `emacs-mule' at the source
787 pointed by SRC. SRC_END is the end of source. Store information
788 of the composition in CODING->cmp_data.
789
790 For backward compatibility, decode also a composition sequence of
791 Emacs 20 style. In that case, the composition sequence contains
792 characters that should be extracted into a buffer or string. Store
793 those characters at *DESTINATION in multibyte form.
794
795 If we encounter an invalid byte sequence, return 0.
796 If we encounter an insufficient source or destination, or
797 insufficient space in CODING->cmp_data, return -1.
798 Otherwise, return consumed bytes in the source.
799
800*/
801static INLINE int
802decode_composition_emacs_mule (coding, src, src_end,
803 destination, dst_end, dst_bytes)
804 struct coding_system *coding;
5bdca8af
DN
805 const unsigned char *src, *src_end;
806 unsigned char **destination, *dst_end;
aa72b389
KH
807 int dst_bytes;
808{
809 unsigned char *dst = *destination;
810 int method, data_len, nchars;
5bdca8af 811 const unsigned char *src_base = src++;
8ca3766a 812 /* Store components of composition. */
aa72b389
KH
813 int component[COMPOSITION_DATA_MAX_BUNCH_LENGTH];
814 int ncomponent;
815 /* Store multibyte form of characters to be composed. This is for
816 Emacs 20 style composition sequence. */
817 unsigned char buf[MAX_COMPOSITION_COMPONENTS * MAX_MULTIBYTE_LENGTH];
818 unsigned char *bufp = buf;
819 int c, i, gref, nref;
820
821 if (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH
822 >= COMPOSITION_DATA_SIZE)
823 {
824 coding->result = CODING_FINISH_INSUFFICIENT_CMP;
825 return -1;
826 }
827
828 ONE_MORE_BYTE (c);
829 if (c - 0xF0 >= COMPOSITION_RELATIVE
830 && c - 0xF0 <= COMPOSITION_WITH_RULE_ALTCHARS)
831 {
832 int with_rule;
833
834 method = c - 0xF0;
835 with_rule = (method == COMPOSITION_WITH_RULE
836 || method == COMPOSITION_WITH_RULE_ALTCHARS);
837 ONE_MORE_BYTE (c);
838 data_len = c - 0xA0;
839 if (data_len < 4
840 || src_base + data_len > src_end)
841 return 0;
842 ONE_MORE_BYTE (c);
843 nchars = c - 0xA0;
844 if (c < 1)
845 return 0;
846 for (ncomponent = 0; src < src_base + data_len; ncomponent++)
847 {
b1887814
RS
848 /* If it is longer than this, it can't be valid. */
849 if (ncomponent >= COMPOSITION_DATA_MAX_BUNCH_LENGTH)
850 return 0;
851
aa72b389
KH
852 if (ncomponent % 2 && with_rule)
853 {
854 ONE_MORE_BYTE (gref);
855 gref -= 32;
856 ONE_MORE_BYTE (nref);
857 nref -= 32;
858 c = COMPOSITION_ENCODE_RULE (gref, nref);
859 }
860 else
861 {
862 int bytes;
fd3ae0b9
KH
863 if (UNIBYTE_STR_AS_MULTIBYTE_P (src, src_end - src, bytes)
864 || (coding->flags /* We are recovering a file. */
865 && src[0] == LEADING_CODE_8_BIT_CONTROL
866 && ! CHAR_HEAD_P (src[1])))
aa72b389
KH
867 c = STRING_CHAR (src, bytes);
868 else
869 c = *src, bytes = 1;
870 src += bytes;
871 }
872 component[ncomponent] = c;
873 }
874 }
999a0fe5 875 else if (c >= 0x80)
aa72b389
KH
876 {
877 /* This may be an old Emacs 20 style format. See the comment at
878 the section 2 of this file. */
879 while (src < src_end && !CHAR_HEAD_P (*src)) src++;
880 if (src == src_end
881 && !(coding->mode & CODING_MODE_LAST_BLOCK))
882 goto label_end_of_loop;
883
884 src_end = src;
885 src = src_base + 1;
886 if (c < 0xC0)
887 {
888 method = COMPOSITION_RELATIVE;
889 for (ncomponent = 0; ncomponent < MAX_COMPOSITION_COMPONENTS;)
890 {
891 DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp);
892 if (c < 0)
893 break;
894 component[ncomponent++] = c;
895 }
896 if (ncomponent < 2)
897 return 0;
898 nchars = ncomponent;
899 }
900 else if (c == 0xFF)
901 {
902 method = COMPOSITION_WITH_RULE;
903 src++;
904 DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp);
905 if (c < 0)
906 return 0;
907 component[0] = c;
908 for (ncomponent = 1;
909 ncomponent < MAX_COMPOSITION_COMPONENTS * 2 - 1;)
910 {
911 DECODE_EMACS_MULE_COMPOSITION_RULE (c);
912 if (c < 0)
913 break;
914 component[ncomponent++] = c;
915 DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp);
916 if (c < 0)
917 break;
918 component[ncomponent++] = c;
919 }
920 if (ncomponent < 3)
921 return 0;
922 nchars = (ncomponent + 1) / 2;
923 }
924 else
925 return 0;
926 }
999a0fe5
KH
927 else
928 return 0;
aa72b389
KH
929
930 if (buf == bufp || dst + (bufp - buf) <= (dst_bytes ? dst_end : src))
931 {
932 CODING_ADD_COMPOSITION_START (coding, coding->produced_char, method);
933 for (i = 0; i < ncomponent; i++)
934 CODING_ADD_COMPOSITION_COMPONENT (coding, component[i]);
93dec019 935 CODING_ADD_COMPOSITION_END (coding, coding->produced_char + nchars);
aa72b389
KH
936 if (buf < bufp)
937 {
938 unsigned char *p = buf;
939 EMIT_BYTES (p, bufp);
940 *destination += bufp - buf;
941 coding->produced_char += nchars;
942 }
943 return (src - src_base);
944 }
945 label_end_of_loop:
946 return -1;
947}
948
b73bfc1c 949/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".

   Decode the SRC_BYTES bytes at SOURCE (emacs-mule format) into
   DESTINATION, one character per loop iteration:

   - '\r' is written as '\n' when CODING->eol_type is CODING_EOL_CR.
     Under CODING_EOL_CRLF the next byte is read; if it is '\n' that
     '\n' is written for the pair, otherwise the read is undone and
     '\r' is written.  With any other eol_type '\r' is copied.
   - '\n' is copied, except that when eol_type is CR or CRLF and
     CODING_MODE_INHIBIT_INCONSISTENT_EOL is set, CODING->result
     becomes CODING_FINISH_INCONSISTENT_EOL and decoding stops.
   - 0x80, when CODING->cmp_data is non-null, is handed to
     decode_composition_emacs_mule.
   - Any other byte sequence is copied as is when it forms a whole
     multibyte character; otherwise only its first byte is emitted,
     in the form produced by CHAR_STRING.

   On exit CODING->consumed and CODING->consumed_char are both set to
   src_base - source, and CODING->produced to dst - destination.

   ONE_MORE_BYTE is defined outside this excerpt; it presumably jumps
   to label_end_of_loop when the source is exhausted -- confirm.

   NOTE(review): the '\r' and '\n' branches store into DST and
   `continue' before reaching the destination-space test that guards
   the general path; confirm the callers guarantee enough room.  */
4ed46869 950
b73bfc1c
KH
951static void
952decode_coding_emacs_mule (coding, source, destination, src_bytes, dst_bytes)
953 struct coding_system *coding;
5bdca8af
DN
954 const unsigned char *source;
955 unsigned char *destination;
b73bfc1c
KH
956 int src_bytes, dst_bytes;
957{
5bdca8af
DN
958 const unsigned char *src = source;
959 const unsigned char *src_end = source + src_bytes;
b73bfc1c
KH
960 unsigned char *dst = destination;
961 unsigned char *dst_end = destination + dst_bytes;
962 /* SRC_BASE remembers the start position in source in each loop.
963 The loop will be exited when there's not enough source code, or
964 when there's not enough destination area to produce a
965 character. */
5bdca8af 966 const unsigned char *src_base;
4ed46869 967
b73bfc1c 968 coding->produced_char = 0;
8a33cf7b 969 while ((src_base = src) < src_end)
b73bfc1c 970 {
5bdca8af
DN
971 unsigned char tmp[MAX_MULTIBYTE_LENGTH];
972 const unsigned char *p;
b73bfc1c 973 int bytes;
ec6d2bb8 974
4af310db
EZ
975 if (*src == '\r')
976 {
2bcdf662 977 int c = *src++;
4af310db 978
4af310db
EZ
979 if (coding->eol_type == CODING_EOL_CR)
980 c = '\n';
981 else if (coding->eol_type == CODING_EOL_CRLF)
982 {
983 ONE_MORE_BYTE (c);
984 if (c != '\n')
985 {
4af310db
EZ
986 src--;
987 c = '\r';
988 }
989 }
990 *dst++ = c;
991 coding->produced_char++;
992 continue;
993 }
994 else if (*src == '\n')
995 {
996 if ((coding->eol_type == CODING_EOL_CR
997 || coding->eol_type == CODING_EOL_CRLF)
998 && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
999 {
1000 coding->result = CODING_FINISH_INCONSISTENT_EOL;
1001 goto label_end_of_loop;
1002 }
1003 *dst++ = *src++;
1004 coding->produced_char++;
1005 continue;
1006 }
3089d25c 1007 else if (*src == 0x80 && coding->cmp_data)
aa72b389
KH
1008 {
1009 /* Start of composition data. */
1010 int consumed = decode_composition_emacs_mule (coding, src, src_end,
1011 &dst, dst_end,
1012 dst_bytes);
1013 if (consumed < 0)
1014 goto label_end_of_loop;
1015 else if (consumed > 0)
1016 {
1017 src += consumed;
1018 continue;
1019 }
 /* CONSUMED is 0 here: emit the 0x80 byte by itself, in the form
    produced by CHAR_STRING. */
1020 bytes = CHAR_STRING (*src, tmp);
1021 p = tmp;
1022 src++;
1023 }
fd3ae0b9
KH
1024 else if (UNIBYTE_STR_AS_MULTIBYTE_P (src, src_end - src, bytes)
1025 || (coding->flags /* We are recovering a file. */
1026 && src[0] == LEADING_CODE_8_BIT_CONTROL
1027 && ! CHAR_HEAD_P (src[1])))
b73bfc1c
KH
1028 {
1029 p = src;
1030 src += bytes;
1031 }
1032 else
1033 {
6eced09c
KH
1034 int i, c;
1035
1036 bytes = BYTES_BY_CHAR_HEAD (*src);
b73bfc1c 1037 src++;
6eced09c
KH
1038 for (i = 1; i < bytes; i++)
1039 {
1040 ONE_MORE_BYTE (c);
1041 if (CHAR_HEAD_P (c))
1042 break;
1043 }
1044 if (i < bytes)
1045 {
 /* A character head showed up before the sequence was complete:
    emit only the first byte and resume scanning right after it. */
1046 bytes = CHAR_STRING (*src_base, tmp);
1047 p = tmp;
1048 src = src_base + 1;
1049 }
1050 else
1051 {
1052 p = src_base;
1053 }
b73bfc1c
KH
1054 }
1055 if (dst + bytes >= (dst_bytes ? dst_end : src))
1056 {
1057 coding->result = CODING_FINISH_INSUFFICIENT_DST;
4ed46869
KH
1058 break;
1059 }
b73bfc1c
KH
1060 while (bytes--) *dst++ = *p++;
1061 coding->produced_char++;
4ed46869 1062 }
4af310db 1063 label_end_of_loop:
b73bfc1c
KH
1064 coding->consumed = coding->consumed_char = src_base - source;
1065 coding->produced = dst - destination;
4ed46869
KH
1066}
1067
b73bfc1c 1068
aa72b389
KH
1069/* Encode composition data stored at DATA into a special byte sequence
1070 starting with 0x80. Update CODING->cmp_data_start and maybe
1071 CODING->cmp_data for the next call.

   Layout of the sequence built in the local buffer:
     byte 0   0x80
     byte 1   0xF0 + DATA[3] (the composition method)
     byte 2   0xA0 + length in bytes of the whole sequence, these
              four header bytes included (p - buf)
     byte 3   0xA0 + (DATA[2] - DATA[1])
     rest     the components from DATA[4] on: each character in the
              form written by CHAR_STRING and, for the two WITH_RULE
              methods, each rule as the two bytes 0x20 + gref and
              0x20 + nref.

   DATA[0] is taken as the length of the record; it bounds the
   component loop and is added to CODING->cmp_data_start.  When that
   reaches CODING->cmp_data->used and a next block exists, the macro
   moves CODING->cmp_data to it and resets cmp_data_start to 0.

   The expansion uses `dst', `dst_end', `dst_bytes', `src' and the
   label `label_end_of_loop' of the enclosing function.  When the
   destination is too small it sets CODING->result to
   CODING_FINISH_INSUFFICIENT_DST and jumps to that label without
   writing anything.

   NOTE(review): the space test asks for (p - buf) + 4 bytes although
   only p - buf bytes are copied, and nothing visible here checks that
   the components fit in the 1024-byte buffer -- confirm both.  */
1072
1073#define ENCODE_COMPOSITION_EMACS_MULE(coding, data) \
1074 do { \
1075 unsigned char buf[1024], *p0 = buf, *p; \
1076 int len = data[0]; \
1077 int i; \
1078 \
1079 buf[0] = 0x80; \
1080 buf[1] = 0xF0 + data[3]; /* METHOD */ \
1081 buf[3] = 0xA0 + (data[2] - data[1]); /* COMPOSED-CHARS */ \
1082 p = buf + 4; \
1083 if (data[3] == COMPOSITION_WITH_RULE \
1084 || data[3] == COMPOSITION_WITH_RULE_ALTCHARS) \
1085 { \
1086 p += CHAR_STRING (data[4], p); \
1087 for (i = 5; i < len; i += 2) \
1088 { \
1089 int gref, nref; \
1090 COMPOSITION_DECODE_RULE (data[i], gref, nref); \
1091 *p++ = 0x20 + gref; \
1092 *p++ = 0x20 + nref; \
1093 p += CHAR_STRING (data[i + 1], p); \
1094 } \
1095 } \
1096 else \
1097 { \
1098 for (i = 4; i < len; i++) \
1099 p += CHAR_STRING (data[i], p); \
1100 } \
1101 buf[2] = 0xA0 + (p - buf); /* COMPONENTS-BYTES */ \
1102 \
1103 if (dst + (p - buf) + 4 > (dst_bytes ? dst_end : src)) \
1104 { \
1105 coding->result = CODING_FINISH_INSUFFICIENT_DST; \
1106 goto label_end_of_loop; \
1107 } \
1108 while (p0 < p) \
1109 *dst++ = *p0++; \
1110 coding->cmp_data_start += data[0]; \
1111 if (coding->cmp_data_start == coding->cmp_data->used \
1112 && coding->cmp_data->next) \
1113 { \
1114 coding->cmp_data = coding->cmp_data->next; \
1115 coding->cmp_data_start = 0; \
1116 } \
1117 } while (0)
93dec019 1118
aa72b389 1119
a4244313 1120static void encode_eol P_ ((struct coding_system *, const unsigned char *,
aa72b389
KH
1121 unsigned char *, int, int));
1122
/* Encode the SRC_BYTES bytes at SOURCE into DESTINATION, inserting
   the composition information recorded in CODING->cmp_data.

   When CODING->cmp_data is null or holds no data, the whole job is
   delegated to encode_eol.  Otherwise, for each character:
   - if a composition record starts at the current character position
     (char_offset + CODING->consumed_char == data[1]), its header is
     emitted first by ENCODE_COMPOSITION_EMACS_MULE;
   - '\n' is written as '\r' '\n' under CODING_EOL_CRLF and as '\r'
     under CODING_EOL_CR;
   - a single-byte character is written as is, except that when
     CODING->flags is nonzero and the character is not ASCII it is
     written in the form produced by CHAR_STRING;
   - every other character is copied byte for byte from the source.

   The `while (1)' loop has no exit of its own.  It is left through
   label_end_of_loop, reached from ENCODE_COMPOSITION_EMACS_MULE when
   the destination is too small and presumably from ONE_MORE_CHAR and
   the EMIT_* macros (defined outside this excerpt) -- confirm.

   On exit CODING->consumed is src_base - source, and CODING->produced
   and CODING->produced_char are both dst - destination.  */
1123static void
1124encode_coding_emacs_mule (coding, source, destination, src_bytes, dst_bytes)
1125 struct coding_system *coding;
5bdca8af
DN
1126 const unsigned char *source;
1127 unsigned char *destination;
aa72b389
KH
1128 int src_bytes, dst_bytes;
1129{
5bdca8af
DN
1130 const unsigned char *src = source;
1131 const unsigned char *src_end = source + src_bytes;
aa72b389
KH
1132 unsigned char *dst = destination;
1133 unsigned char *dst_end = destination + dst_bytes;
5bdca8af 1134 const unsigned char *src_base;
aa72b389
KH
1135 int c;
1136 int char_offset;
1137 int *data;
1138
 /* Not referenced again in the visible body; presumably read by
    ONE_MORE_CHAR -- confirm. */
1139 Lisp_Object translation_table;
1140
1141 translation_table = Qnil;
1142
1143 /* Optimization for the case that there's no composition. */
1144 if (!coding->cmp_data || coding->cmp_data->used == 0)
1145 {
1146 encode_eol (coding, source, destination, src_bytes, dst_bytes);
1147 return;
1148 }
1149
1150 char_offset = coding->cmp_data->char_offset;
1151 data = coding->cmp_data->data + coding->cmp_data_start;
1152 while (1)
1153 {
1154 src_base = src;
1155
1156 /* If SRC starts a composition, encode the information about the
1157 composition in advance. */
1158 if (coding->cmp_data_start < coding->cmp_data->used
1159 && char_offset + coding->consumed_char == data[1])
1160 {
1161 ENCODE_COMPOSITION_EMACS_MULE (coding, data);
 /* The macro may have advanced cmp_data_start or switched to the
    next cmp_data block, so reload both cached values. */
1162 char_offset = coding->cmp_data->char_offset;
1163 data = coding->cmp_data->data + coding->cmp_data_start;
1164 }
1165
1166 ONE_MORE_CHAR (c);
1167 if (c == '\n' && (coding->eol_type == CODING_EOL_CRLF
1168 || coding->eol_type == CODING_EOL_CR))
1169 {
1170 if (coding->eol_type == CODING_EOL_CRLF)
1171 EMIT_TWO_BYTES ('\r', c);
1172 else
1173 EMIT_ONE_BYTE ('\r');
1174 }
1175 else if (SINGLE_BYTE_CHAR_P (c))
fd3ae0b9
KH
1176 {
1177 if (coding->flags && ! ASCII_BYTE_P (c))
1178 {
1179 /* As we are auto saving, retain the multibyte form for
1180 8-bit chars. */
1181 unsigned char buf[MAX_MULTIBYTE_LENGTH];
1182 int bytes = CHAR_STRING (c, buf);
1183
1184 if (bytes == 1)
1185 EMIT_ONE_BYTE (buf[0]);
1186 else
1187 EMIT_TWO_BYTES (buf[0], buf[1]);
1188 }
1189 else
1190 EMIT_ONE_BYTE (c);
1191 }
aa72b389
KH
1192 else
1193 EMIT_BYTES (src_base, src);
1194 coding->consumed_char++;
1195 }
1196 label_end_of_loop:
1197 coding->consumed = src_base - source;
1198 coding->produced = coding->produced_char = dst - destination;
1199 return;
1200}
b73bfc1c 1201
4ed46869
KH
1202\f
1203/*** 3. ISO2022 handlers ***/
1204
1205/* The following note describes the coding system ISO2022 briefly.
39787efd 1206 Since the intention of this note is to help understand the
cfb43547 1207 functions in this file, some parts are NOT ACCURATE or are OVERLY
39787efd 1208 SIMPLIFIED. For thorough understanding, please refer to the
cfb43547
DL
1209 original document of ISO2022. This is equivalent to the standard
1210 ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
4ed46869
KH
1211
1212 ISO2022 provides many mechanisms to encode several character sets
cfb43547 1213 in 7-bit and 8-bit environments. For 7-bit environments, all text
39787efd
KH
1214 is encoded using bytes less than 128. This may make the encoded
1215 text a little bit longer, but the text passes more easily through
cfb43547 1216 several types of gateway, some of which strip off the MSB (Most
8ca3766a 1217 Significant Bit).
b73bfc1c 1218
cfb43547
DL
1219 There are two kinds of character sets: control character sets and
1220 graphic character sets. The former contain control characters such
4ed46869 1221 as `newline' and `escape' to provide control functions (control
39787efd 1222 functions are also provided by escape sequences). The latter
cfb43547 1223 contain graphic characters such as 'A' and '-'. Emacs recognizes
4ed46869
KH
1224 two control character sets and many graphic character sets.
1225
1226 Graphic character sets are classified into one of the following
39787efd
KH
1227 four classes, according to the number of bytes (DIMENSION) and
1228 number of characters in one dimension (CHARS) of the set:
1229 - DIMENSION1_CHARS94
1230 - DIMENSION1_CHARS96
1231 - DIMENSION2_CHARS94
1232 - DIMENSION2_CHARS96
1233
1234 In addition, each character set is assigned an identification tag,
cfb43547 1235 unique for each set, called the "final character" (denoted as <F>
39787efd
KH
1236 hereafter). The <F> of each character set is decided by ECMA(*)
1237 when it is registered in ISO. The code range of <F> is 0x30..0x7F
1238 (0x30..0x3F are for private use only).
4ed46869
KH
1239
1240 Note (*): ECMA = European Computer Manufacturers Association
1241
cfb43547 1242 Here are examples of graphic character sets [NAME(<F>)]:
4ed46869
KH
1243 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
1244 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
1245 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
1246 o DIMENSION2_CHARS96 -- none for the moment
1247
39787efd 1248 A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
4ed46869
KH
1249 C0 [0x00..0x1F] -- control character plane 0
1250 GL [0x20..0x7F] -- graphic character plane 0
1251 C1 [0x80..0x9F] -- control character plane 1
1252 GR [0xA0..0xFF] -- graphic character plane 1
1253
1254 A control character set is directly designated and invoked to C0 or
39787efd
KH
1255 C1 by an escape sequence. The most common case is that:
1256 - ISO646's control character set is designated/invoked to C0, and
1257 - ISO6429's control character set is designated/invoked to C1,
1258 and usually these designations/invocations are omitted in encoded
1259 text. In a 7-bit environment, only C0 can be used, and a control
1260 character for C1 is encoded by an appropriate escape sequence to
1261 fit into the environment. All control characters for C1 are
1262 defined to have corresponding escape sequences.
4ed46869
KH
1263
1264 A graphic character set is at first designated to one of four
1265 graphic registers (G0 through G3), then these graphic registers are
1266 invoked to GL or GR. These designations and invocations can be
1267 done independently. The most common case is that G0 is invoked to
39787efd
KH
1268 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
1269 these invocations and designations are omitted in encoded text.
1270 In a 7-bit environment, only GL can be used.
4ed46869 1271
39787efd
KH
1272 When a graphic character set of CHARS94 is invoked to GL, codes
1273 0x20 and 0x7F of the GL area work as control characters SPACE and
1274 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
1275 be used.
4ed46869
KH
1276
1277 There are two ways of invocation: locking-shift and single-shift.
1278 With locking-shift, the invocation lasts until the next different
39787efd
KH
1279 invocation, whereas with single-shift, the invocation affects the
1280 following character only and doesn't affect the locking-shift
1281 state. Invocations are done by the following control characters or
1282 escape sequences:
4ed46869
KH
1283
1284 ----------------------------------------------------------------------
39787efd 1285 abbrev function cntrl escape seq description
4ed46869 1286 ----------------------------------------------------------------------
39787efd
KH
1287 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
1288 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
1289 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
1290 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
1291 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
1292 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
1293 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
1294 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
1295 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
4ed46869 1296 ----------------------------------------------------------------------
39787efd
KH
1297 (*) These are not used by any known coding system.
1298
1299 Control characters for these functions are defined by macros
1300 ISO_CODE_XXX in `coding.h'.
4ed46869 1301
39787efd 1302 Designations are done by the following escape sequences:
4ed46869
KH
1303 ----------------------------------------------------------------------
1304 escape sequence description
1305 ----------------------------------------------------------------------
1306 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
1307 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
1308 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
1309 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
1310 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
1311 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
1312 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
1313 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
1314 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
1315 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
1316 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
1317 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
1318 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
1319 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
1320 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
1321 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
1322 ----------------------------------------------------------------------
1323
1324 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
39787efd 1325 of dimension 1, chars 94, and final character <F>, etc...
4ed46869
KH
1326
1327 Note (*): Although these designations are not allowed in ISO2022,
1328 Emacs accepts them on decoding, and produces them on encoding
39787efd 1329 CHARS96 character sets in a coding system which is characterized as
4ed46869
KH
1330 7-bit environment, non-locking-shift, and non-single-shift.
1331
1332 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
39787efd 1333 '(' can be omitted. We refer to this as "short-form" hereafter.
4ed46869 1334
cfb43547 1335 Now you may notice that there are a lot of ways of encoding the
39787efd
KH
1336 same multilingual text in ISO2022. Actually, there exist many
1337 coding systems such as Compound Text (used in X11's inter client
8ca3766a
DL
1338 communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
1339 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
4ed46869
KH
1340 localized platforms), and all of these are variants of ISO2022.
1341
1342 In addition to the above, Emacs handles two more kinds of escape
1343 sequences: ISO6429's direction specification and Emacs' private
1344 sequence for specifying character composition.
1345
39787efd 1346 ISO6429's direction specification takes the following form:
4ed46869
KH
1347 o CSI ']' -- end of the current direction
1348 o CSI '0' ']' -- end of the current direction
1349 o CSI '1' ']' -- start of left-to-right text
1350 o CSI '2' ']' -- start of right-to-left text
1351 The control character CSI (0x9B: control sequence introducer) is
39787efd
KH
1352 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
1353
1354 Character composition specification takes the following form:
ec6d2bb8
KH
1355 o ESC '0' -- start relative composition
1356 o ESC '1' -- end composition
1357 o ESC '2' -- start rule-base composition (*)
1358 o ESC '3' -- start relative composition with alternate chars (**)
1359 o ESC '4' -- start rule-base composition with alternate chars (**)
b73bfc1c 1360 Since these are not standard escape sequences of any ISO standard,
cfb43547 1361 the use of them with these meanings is restricted to Emacs only.
ec6d2bb8 1362
cfb43547 1363 (*) This form is used only in Emacs 20.5 and older versions,
b73bfc1c 1364 but the newer versions can safely decode it.
cfb43547 1365 (**) This form is used only in Emacs 21.1 and newer versions,
b73bfc1c 1366 and the older versions can't decode it.
ec6d2bb8 1367
cfb43547 1368 Here's a list of example usages of these composition escape
b73bfc1c 1369 sequences (categorized by `enum composition_method').
ec6d2bb8 1370
b73bfc1c 1371 COMPOSITION_RELATIVE:
ec6d2bb8 1372 ESC 0 CHAR [ CHAR ] ESC 1
8ca3766a 1373 COMPOSITION_WITH_RULE:
ec6d2bb8 1374 ESC 2 CHAR [ RULE CHAR ] ESC 1
b73bfc1c 1375 COMPOSITION_WITH_ALTCHARS:
ec6d2bb8 1376 ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
b73bfc1c 1377 COMPOSITION_WITH_RULE_ALTCHARS:
ec6d2bb8 1378 ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
4ed46869
KH
1379
1380enum iso_code_class_type iso_code_class[256];
1381
05e6f5dc
KH
/* Nonzero if the coding system at coding_system_table[IDX] exists,
   can carry character C of CHARSET (CHARSET is CHARSET_ASCII, or C
   passes CODING_SAFE_CHAR_P for that system's safe chars), and has a
   requested designation for CHARSET other than
   CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION.  As a side effect the
   expansion assigns to a variable named `safe_chars', which must be
   declared in the enclosing scope.  */
1382#define CHARSET_OK(idx, charset, c) \
1383 (coding_system_table[idx] \
1384 && (charset == CHARSET_ASCII \
6b89e3aa 1385 || (safe_chars = coding_safe_chars (coding_system_table[idx]->symbol), \
05e6f5dc
KH
1386 CODING_SAFE_CHAR_P (safe_chars, c))) \
1387 && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding_system_table[idx], \
1388 charset) \
1389 != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
d46c5b12
KH
1390
/* Nonzero if the coding system at coding_system_table[IDX] has an
   initial designation (a value >= 0) for graphic register 1.
   NOTE(review): unlike CHARSET_OK, this does not test
   coding_system_table[IDX] for null before using it -- confirm the
   entry is always set.  */
1391#define SHIFT_OUT_OK(idx) \
1392 (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding_system_table[idx], 1) >= 0)
1393
b6871cc7
KH
/* Nonzero unless the `composing' member of the coding system at
   coding_system_table[IDX] is COMPOSITION_DISABLED.
   NOTE(review): dereferences coding_system_table[IDX] without the
   null test that CHARSET_OK performs -- confirm the entry is always
   set.  */
1394#define COMPOSITION_OK(idx) \
1395 (coding_system_table[idx]->composing != COMPOSITION_DISABLED)
1396
4ed46869 1397/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
cfb43547 1398 Check if a text is encoded in ISO2022. If it is, return an
4ed46869
KH
1399 integer in which the appropriate flag bits, any of:
1400 CODING_CATEGORY_MASK_ISO_7
d46c5b12 1401 CODING_CATEGORY_MASK_ISO_7_TIGHT
4ed46869
KH
1402 CODING_CATEGORY_MASK_ISO_8_1
1403 CODING_CATEGORY_MASK_ISO_8_2
7717c392
KH
1404 CODING_CATEGORY_MASK_ISO_7_ELSE
1405 CODING_CATEGORY_MASK_ISO_8_ELSE
4ed46869
KH
1406 are set. If a code which should never appear in ISO2022 is found,
1407 return 0.

   The text to examine is SRC up to (not including) SRC_END;
   MULTIBYTEP is passed to ONE_MORE_BYTE_CHECK_MULTIBYTE for every
   byte read.  MASK holds the categories that are still possible and
   MASK_FOUND the categories for which some evidence was seen; the
   value returned at the end is MASK & MASK_FOUND.

   NOTE(review): SHIFT_OUT is initialized to 0 and never assigned in
   this function, so the `shift_out == 1' test under ISO_CODE_SI
   cannot succeed as written -- confirm whether that is intended.  */
1408
0a28aafb
KH
1409static int
1410detect_coding_iso2022 (src, src_end, multibytep)
4ed46869 1411 unsigned char *src, *src_end;
0a28aafb 1412 int multibytep;
4ed46869 1413{
d46c5b12
KH
1414 int mask = CODING_CATEGORY_MASK_ISO;
1415 int mask_found = 0;
f46869e4 1416 int reg[4], shift_out = 0, single_shifting = 0;
da55a2b7 1417 int c, c1, charset;
b73bfc1c
KH
1418 /* Dummy for ONE_MORE_BYTE. */
1419 struct coding_system dummy_coding;
1420 struct coding_system *coding = &dummy_coding;
05e6f5dc 1421 Lisp_Object safe_chars;
3f003981 1422
d46c5b12 1423 reg[0] = CHARSET_ASCII, reg[1] = reg[2] = reg[3] = -1;
3f003981 1424 while (mask && src < src_end)
4ed46869 1425 {
0a28aafb 1426 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
8d239c89 1427 retry:
4ed46869
KH
1428 switch (c)
1429 {
1430 case ISO_CODE_ESC:
74383408
KH
1431 if (inhibit_iso_escape_detection)
1432 break;
f46869e4 1433 single_shifting = 0;
0a28aafb 1434 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
d46c5b12 1435 if (c >= '(' && c <= '/')
4ed46869 1436 {
bf9cdd4e 1437 /* Designation sequence for a charset of dimension 1. */
0a28aafb 1438 ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep);
d46c5b12
KH
1439 if (c1 < ' ' || c1 >= 0x80
1440 || (charset = iso_charset_table[0][c >= ','][c1]) < 0)
1441 /* Invalid designation sequence. Just ignore. */
1442 break;
1443 reg[(c - '(') % 4] = charset;
bf9cdd4e
KH
1444 }
1445 else if (c == '$')
1446 {
1447 /* Designation sequence for a charset of dimension 2. */
0a28aafb 1448 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
bf9cdd4e
KH
1449 if (c >= '@' && c <= 'B')
1450 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
d46c5b12 1451 reg[0] = charset = iso_charset_table[1][0][c];
bf9cdd4e 1452 else if (c >= '(' && c <= '/')
bcf26d6a 1453 {
0a28aafb 1454 ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep);
d46c5b12
KH
1455 if (c1 < ' ' || c1 >= 0x80
1456 || (charset = iso_charset_table[1][c >= ','][c1]) < 0)
1457 /* Invalid designation sequence. Just ignore. */
1458 break;
1459 reg[(c - '(') % 4] = charset;
bcf26d6a 1460 }
bf9cdd4e 1461 else
d46c5b12
KH
1462 /* Invalid designation sequence. Just ignore. */
1463 break;
1464 }
ae9ff118 1465 else if (c == 'N' || c == 'O')
d46c5b12 1466 {
ae9ff118
KH
1467 /* ESC <Fe> for SS2 or SS3. */
1468 mask &= CODING_CATEGORY_MASK_ISO_7_ELSE;
d46c5b12 1469 break;
4ed46869 1470 }
ec6d2bb8
KH
1471 else if (c >= '0' && c <= '4')
1472 {
1473 /* ESC <Fp> for start/end composition. */
b6871cc7
KH
1474 if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_7))
1475 mask_found |= CODING_CATEGORY_MASK_ISO_7;
1476 else
1477 mask &= ~CODING_CATEGORY_MASK_ISO_7;
1478 if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT))
1479 mask_found |= CODING_CATEGORY_MASK_ISO_7_TIGHT;
1480 else
1481 mask &= ~CODING_CATEGORY_MASK_ISO_7_TIGHT;
1482 if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_8_1))
1483 mask_found |= CODING_CATEGORY_MASK_ISO_8_1;
1484 else
1485 mask &= ~CODING_CATEGORY_MASK_ISO_8_1;
1486 if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_8_2))
1487 mask_found |= CODING_CATEGORY_MASK_ISO_8_2;
1488 else
1489 mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
1490 if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_7_ELSE))
1491 mask_found |= CODING_CATEGORY_MASK_ISO_7_ELSE;
1492 else
1493 mask &= ~CODING_CATEGORY_MASK_ISO_7_ELSE;
1494 if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_8_ELSE))
1495 mask_found |= CODING_CATEGORY_MASK_ISO_8_ELSE;
1496 else
1497 mask &= ~CODING_CATEGORY_MASK_ISO_8_ELSE;
ec6d2bb8
KH
1498 break;
1499 }
bf9cdd4e 1500 else
d46c5b12
KH
1501 /* Invalid escape sequence. Just ignore. */
1502 break;
1503
1504 /* We found a valid designation sequence for CHARSET. */
1505 mask &= ~CODING_CATEGORY_MASK_ISO_8BIT;
05e6f5dc
KH
1506 c = MAKE_CHAR (charset, 0, 0);
1507 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7, charset, c))
d46c5b12
KH
1508 mask_found |= CODING_CATEGORY_MASK_ISO_7;
1509 else
1510 mask &= ~CODING_CATEGORY_MASK_ISO_7;
05e6f5dc 1511 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT, charset, c))
d46c5b12
KH
1512 mask_found |= CODING_CATEGORY_MASK_ISO_7_TIGHT;
1513 else
1514 mask &= ~CODING_CATEGORY_MASK_ISO_7_TIGHT;
05e6f5dc 1515 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_ELSE, charset, c))
ae9ff118
KH
1516 mask_found |= CODING_CATEGORY_MASK_ISO_7_ELSE;
1517 else
d46c5b12 1518 mask &= ~CODING_CATEGORY_MASK_ISO_7_ELSE;
05e6f5dc 1519 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_8_ELSE, charset, c))
ae9ff118
KH
1520 mask_found |= CODING_CATEGORY_MASK_ISO_8_ELSE;
1521 else
d46c5b12 1522 mask &= ~CODING_CATEGORY_MASK_ISO_8_ELSE;
4ed46869
KH
1523 break;
1524
4ed46869 1525 case ISO_CODE_SO:
74383408
KH
1526 if (inhibit_iso_escape_detection)
1527 break;
f46869e4 1528 single_shifting = 0;
d46c5b12
KH
1529 if (shift_out == 0
1530 && (reg[1] >= 0
1531 || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_7_ELSE)
1532 || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_8_ELSE)))
1533 {
1534 /* Locking shift out. */
1535 mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
1536 mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
1537 }
e0e989f6 1538 break;
93dec019 1539
d46c5b12 1540 case ISO_CODE_SI:
74383408
KH
1541 if (inhibit_iso_escape_detection)
1542 break;
f46869e4 1543 single_shifting = 0;
 /* NOTE(review): nothing in this function sets shift_out to 1. */
d46c5b12
KH
1544 if (shift_out == 1)
1545 {
1546 /* Locking shift in. */
1547 mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
1548 mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
1549 }
1550 break;
1551
4ed46869 1552 case ISO_CODE_CSI:
f46869e4 1553 single_shifting = 0;
 /* Fall through. */
4ed46869
KH
1554 case ISO_CODE_SS2:
1555 case ISO_CODE_SS3:
3f003981
KH
1556 {
1557 int newmask = CODING_CATEGORY_MASK_ISO_8_ELSE;
1558
74383408
KH
1559 if (inhibit_iso_escape_detection)
1560 break;
70c22245
KH
1561 if (c != ISO_CODE_CSI)
1562 {
d46c5b12
KH
1563 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1564 & CODING_FLAG_ISO_SINGLE_SHIFT)
70c22245 1565 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
d46c5b12
KH
1566 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1567 & CODING_FLAG_ISO_SINGLE_SHIFT)
70c22245 1568 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
f46869e4 1569 single_shifting = 1;
70c22245 1570 }
3f003981
KH
1571 if (VECTORP (Vlatin_extra_code_table)
1572 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
1573 {
d46c5b12
KH
1574 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1575 & CODING_FLAG_ISO_LATIN_EXTRA)
3f003981 1576 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
d46c5b12
KH
1577 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1578 & CODING_FLAG_ISO_LATIN_EXTRA)
3f003981
KH
1579 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
1580 }
1581 mask &= newmask;
d46c5b12 1582 mask_found |= newmask;
3f003981
KH
1583 }
1584 break;
4ed46869
KH
1585
1586 default:
1587 if (c < 0x80)
f46869e4
KH
1588 {
1589 single_shifting = 0;
1590 break;
1591 }
4ed46869 1592 else if (c < 0xA0)
c4825358 1593 {
f46869e4 1594 single_shifting = 0;
3f003981
KH
1595 if (VECTORP (Vlatin_extra_code_table)
1596 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
c4825358 1597 {
3f003981
KH
1598 int newmask = 0;
1599
d46c5b12
KH
1600 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1601 & CODING_FLAG_ISO_LATIN_EXTRA)
3f003981 1602 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
d46c5b12
KH
1603 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1604 & CODING_FLAG_ISO_LATIN_EXTRA)
3f003981
KH
1605 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
1606 mask &= newmask;
d46c5b12 1607 mask_found |= newmask;
c4825358 1608 }
3f003981
KH
1609 else
1610 return 0;
c4825358 1611 }
4ed46869
KH
1612 else
1613 {
d46c5b12 1614 mask &= ~(CODING_CATEGORY_MASK_ISO_7BIT
7717c392 1615 | CODING_CATEGORY_MASK_ISO_7_ELSE);
d46c5b12 1616 mask_found |= CODING_CATEGORY_MASK_ISO_8_1;
f46869e4
KH
1617 /* Check the length of succeeding codes of the range
1618 0xA0..0xFF. If the byte length is odd, we exclude
1619 CODING_CATEGORY_MASK_ISO_8_2. We can check this only
1620 when we are not single shifting. */
b73bfc1c
KH
1621 if (!single_shifting
1622 && mask & CODING_CATEGORY_MASK_ISO_8_2)
f46869e4 1623 {
e17de821 1624 int i = 1;
8d239c89
KH
1625
1626 c = -1;
b73bfc1c
KH
1627 while (src < src_end)
1628 {
0a28aafb 1629 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
b73bfc1c
KH
1630 if (c < 0xA0)
1631 break;
1632 i++;
1633 }
1634
1635 if (i & 1 && src < src_end)
f46869e4
KH
1636 mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
1637 else
1638 mask_found |= CODING_CATEGORY_MASK_ISO_8_2;
8d239c89
KH
1639 if (c >= 0)
1640 /* This means that we have read one extra byte. */
1641 goto retry;
f46869e4 1642 }
4ed46869
KH
1643 }
1644 break;
1645 }
1646 }
b73bfc1c 1647 label_end_of_loop:
d46c5b12 1648 return (mask & mask_found);
4ed46869
KH
1649}
1650
b73bfc1c
KH
1651/* Decode a character of which charset is CHARSET, the 1st position
1652 code is C1, the 2nd position code is C2, and return the decoded
1653 character code. If the variable `translation_table' is non-nil,
1654 return the code translated through it by translate_char instead.
   `translation_table' is taken from the scope where the macro is
   expanded. */
ec6d2bb8 1655
b73bfc1c
KH
1656#define DECODE_ISO_CHARACTER(charset, c1, c2) \
1657 (NILP (translation_table) \
1658 ? MAKE_CHAR (charset, c1, c2) \
1659 : translate_char (translation_table, -1, charset, c1, c2))
4ed46869
KH
1660
1661/* Set designation state into CODING.

   Look up, through ISO_CHARSET_TABLE, the charset identified by
   DIMENSION, CHARS and FINAL_CHAR, and designate it to graphic
   register REG of `coding'.

   The expansion jumps to `label_invalid_code' in the enclosing
   function when:
   - FINAL_CHAR is below '0' or is 128 or above;
   - the charset is unknown (negative), or it is neither the one
     requested for REG nor safe according to `safe_chars'; REG is
     then remembered in
     coding->spec.iso2022.last_invalid_designation_register;
   - the designation is ASCII to register 0 and the remembered
     invalid register is 0 (the remembered value is reset to -1
     first).

   Otherwise the remembered invalid register is reset to -1 and the
   charset is stored with CODING_SPEC_ISO_DESIGNATION, after being
   replaced by its reverse charset when CODING_MODE_DIRECTION is set
   and a reverse charset exists.

   `coding' and `safe_chars' are taken from the scope of expansion.
   Note that MAKE_CHAR (charset, 0, 0) is evaluated before the
   `charset >= 0' test. */
d46c5b12
KH
1662#define DECODE_DESIGNATION(reg, dimension, chars, final_char) \
1663 do { \
05e6f5dc 1664 int charset, c; \
944bd420
KH
1665 \
1666 if (final_char < '0' || final_char >= 128) \
1667 goto label_invalid_code; \
1668 charset = ISO_CHARSET_TABLE (make_number (dimension), \
1669 make_number (chars), \
1670 make_number (final_char)); \
05e6f5dc 1671 c = MAKE_CHAR (charset, 0, 0); \
d46c5b12 1672 if (charset >= 0 \
704c5781 1673 && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) == reg \
05e6f5dc 1674 || CODING_SAFE_CHAR_P (safe_chars, c))) \
d46c5b12
KH
1675 { \
1676 if (coding->spec.iso2022.last_invalid_designation_register == 0 \
1677 && reg == 0 \
1678 && charset == CHARSET_ASCII) \
1679 { \
1680 /* We should insert this designation sequence as is so \
1681 that it is surely written back to a file. */ \
1682 coding->spec.iso2022.last_invalid_designation_register = -1; \
1683 goto label_invalid_code; \
1684 } \
1685 coding->spec.iso2022.last_invalid_designation_register = -1; \
1686 if ((coding->mode & CODING_MODE_DIRECTION) \
1687 && CHARSET_REVERSE_CHARSET (charset) >= 0) \
1688 charset = CHARSET_REVERSE_CHARSET (charset); \
1689 CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset; \
1690 } \
1691 else \
1692 { \
1693 coding->spec.iso2022.last_invalid_designation_register = reg; \
1694 goto label_invalid_code; \
1695 } \
4ed46869
KH
1696 } while (0)
1697
ec6d2bb8
KH
1698/* Allocate a memory block for storing information about compositions.
1699 The block is chained to the already allocated blocks.

   The new block is obtained with xmalloc, records CHAR_OFFSET, starts
   empty (`used' is 0) and is linked after the block CODING->cmp_data
   currently points to, if any.  It then becomes CODING->cmp_data;
   CODING->cmp_data_start is reset to 0 and CODING->composing to
   COMPOSITION_NO. */
d46c5b12 1700
33fb63eb 1701void
ec6d2bb8 1702coding_allocate_composition_data (coding, char_offset)
d46c5b12 1703 struct coding_system *coding;
ec6d2bb8 1704 int char_offset;
d46c5b12 1705{
ec6d2bb8
KH
1706 struct composition_data *cmp_data
1707 = (struct composition_data *) xmalloc (sizeof *cmp_data);
1708
1709 cmp_data->char_offset = char_offset;
1710 cmp_data->used = 0;
 /* Link the new block in both directions with the current one. */
1711 cmp_data->prev = coding->cmp_data;
1712 cmp_data->next = NULL;
1713 if (coding->cmp_data)
1714 coding->cmp_data->next = cmp_data;
 /* Make the new block the current one. */
1715 coding->cmp_data = cmp_data;
1716 coding->cmp_data_start = 0;
4307d534 1717 coding->composing = COMPOSITION_NO;
ec6d2bb8 1718}
d46c5b12 1719
aa72b389
KH
1720/* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
1721 ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
1722 ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
1723 ESC 3 : altchar composition : ESC 3 ALT ... ESC 0 CHAR ... ESC 1
1724 ESC 4 : alt&rule composition : ESC 4 ALT RULE .. ALT ESC 0 CHAR ... ESC 1

   C1 is the byte that followed ESC.  Three cases:
   - coding->composing is COMPOSITION_DISABLED: ESC and C1 & 0x7f are
     written to `dst' and coding->produced_char grows by 2.
   - No composition is in progress: coding->composing is set from C1
     ('0' relative, '2' with rule, '3' with altchars, anything else
     with rule and altchars), the start is recorded with
     CODING_ADD_COMPOSITION_START and
     coding->composition_rule_follows is cleared.  If
     coding->cmp_data is null or lacks room for
     COMPOSITION_DATA_MAX_BUNCH_LENGTH more entries, coding->result
     is set to CODING_FINISH_INSUFFICIENT_CMP and control jumps to
     `label_end_of_loop' instead.
   - A composition is already in progress: if its method is one of
     the two ALTCHARS methods it is switched to COMPOSITION_RELATIVE
     and coding->composition_rule_follows is cleared; otherwise
     nothing is done.

   `coding', `dst' and `label_end_of_loop' are taken from the scope of
   expansion.  NOTE(review): the two-byte write in the first case is
   not preceded by any destination-space test in this macro --
   confirm the caller reserves room.
1725 */
ec6d2bb8 1726
33fb63eb
KH
1727#define DECODE_COMPOSITION_START(c1) \
1728 do { \
1729 if (coding->composing == COMPOSITION_DISABLED) \
1730 { \
1731 *dst++ = ISO_CODE_ESC; \
1732 *dst++ = c1 & 0x7f; \
1733 coding->produced_char += 2; \
1734 } \
1735 else if (!COMPOSING_P (coding)) \
1736 { \
1737 /* This is surely the start of a composition. We must be sure \
1738 that coding->cmp_data has enough space to store the \
1739 information about the composition. If not, terminate the \
1740 current decoding loop, allocate one more memory block for \
8ca3766a 1741 coding->cmp_data in the caller, then start the decoding \
33fb63eb
KH
1742 loop again. We can't allocate memory here directly because \
1743 it may cause buffer/string relocation. */ \
1744 if (!coding->cmp_data \
1745 || (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH \
1746 >= COMPOSITION_DATA_SIZE)) \
1747 { \
1748 coding->result = CODING_FINISH_INSUFFICIENT_CMP; \
1749 goto label_end_of_loop; \
1750 } \
1751 coding->composing = (c1 == '0' ? COMPOSITION_RELATIVE \
1752 : c1 == '2' ? COMPOSITION_WITH_RULE \
1753 : c1 == '3' ? COMPOSITION_WITH_ALTCHARS \
1754 : COMPOSITION_WITH_RULE_ALTCHARS); \
1755 CODING_ADD_COMPOSITION_START (coding, coding->produced_char, \
1756 coding->composing); \
1757 coding->composition_rule_follows = 0; \
1758 } \
1759 else \
1760 { \
1761 /* We are already handling a composition. If the method is \
1762 the following two, the codes following the current escape \
1763 sequence are actual characters stored in a buffer. */ \
1764 if (coding->composing == COMPOSITION_WITH_ALTCHARS \
1765 || coding->composing == COMPOSITION_WITH_RULE_ALTCHARS) \
1766 { \
1767 coding->composing = COMPOSITION_RELATIVE; \
1768 coding->composition_rule_follows = 0; \
1769 } \
1770 } \
ec6d2bb8
KH
1771 } while (0)
1772
/* Handle the composition end sequence ESC 1.

   While a composition is being decoded, record its end at the
   current character position and leave the composing state.
   Otherwise copy ESC and C1 to the output as they are.  Uses the
   variables `coding' and `dst' of the expanding function.  */

#define DECODE_COMPOSITION_END(c1)					\
  do {									\
    if (COMPOSING_P (coding))						\
      {									\
	CODING_ADD_COMPOSITION_END (coding, coding->produced_char);	\
	coding->composing = COMPOSITION_NO;				\
      }									\
    else								\
      {									\
	*dst++ = ISO_CODE_ESC;						\
	*dst++ = c1;							\
	coding->produced_char += 2;					\
      }									\
  } while (0)
1789
/* Decode a composition rule that starts with the byte C1 and store
   the encoded rule as one component in coding->cmp_data.

   C1 is modified (32 is subtracted from it).  A resulting value
   below 81 is the old one-byte format; a value from 81 to 92 is the
   new format, for which one more byte is read into the variable `c2'
   of the expanding function.  For any other value the rule stored
   is 0.  */

#define DECODE_COMPOSITION_RULE(c1)					\
  do {									\
    int rule = 0;							\
									\
    (c1) -= 32;								\
    if (c1 < 81)		/* old format (before ver.21) */	\
      {									\
	int gref = (c1) / 9;						\
	int nref = (c1) % 9;						\
									\
	if (gref == 4)							\
	  gref = 10;							\
	if (nref == 4)							\
	  nref = 10;							\
	rule = COMPOSITION_ENCODE_RULE (gref, nref);			\
      }									\
    else if (c1 < 93)		/* new format (after ver.21) */		\
      {									\
	ONE_MORE_BYTE (c2);						\
	rule = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32);		\
      }									\
    CODING_ADD_COMPOSITION_COMPONENT (coding, rule);			\
    coding->composition_rule_follows = 0;				\
  } while (0)
88993dfd 1814
d46c5b12 1815
4ed46869
KH
1816/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
1817
b73bfc1c 1818static void
d46c5b12 1819decode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
4ed46869 1820 struct coding_system *coding;
5bdca8af
DN
1821 const unsigned char *source;
1822 unsigned char *destination;
4ed46869 1823 int src_bytes, dst_bytes;
4ed46869 1824{
5bdca8af
DN
1825 const unsigned char *src = source;
1826 const unsigned char *src_end = source + src_bytes;
4ed46869
KH
1827 unsigned char *dst = destination;
1828 unsigned char *dst_end = destination + dst_bytes;
4ed46869
KH
1829 /* Charsets invoked to graphic plane 0 and 1 respectively. */
1830 int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1831 int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
b73bfc1c
KH
1832 /* SRC_BASE remembers the start position in source in each loop.
1833 The loop will be exited when there's not enough source code
1834 (within macro ONE_MORE_BYTE), or when there's not enough
1835 destination area to produce a character (within macro
1836 EMIT_CHAR). */
5bdca8af 1837 const unsigned char *src_base;
b73bfc1c
KH
1838 int c, charset;
1839 Lisp_Object translation_table;
05e6f5dc
KH
1840 Lisp_Object safe_chars;
1841
6b89e3aa 1842 safe_chars = coding_safe_chars (coding->symbol);
bdd9fb48 1843
b73bfc1c
KH
1844 if (NILP (Venable_character_translation))
1845 translation_table = Qnil;
1846 else
1847 {
1848 translation_table = coding->translation_table_for_decode;
1849 if (NILP (translation_table))
1850 translation_table = Vstandard_translation_table_for_decode;
1851 }
4ed46869 1852
b73bfc1c
KH
1853 coding->result = CODING_FINISH_NORMAL;
1854
1855 while (1)
4ed46869 1856 {
85478bc6 1857 int c1, c2 = 0;
b73bfc1c
KH
1858
1859 src_base = src;
1860 ONE_MORE_BYTE (c1);
4ed46869 1861
ec6d2bb8 1862 /* We produce no character or one character. */
4ed46869
KH
1863 switch (iso_code_class [c1])
1864 {
1865 case ISO_0x20_or_0x7F:
ec6d2bb8
KH
1866 if (COMPOSING_P (coding) && coding->composition_rule_follows)
1867 {
1868 DECODE_COMPOSITION_RULE (c1);
b73bfc1c 1869 continue;
ec6d2bb8
KH
1870 }
1871 if (charset0 < 0 || CHARSET_CHARS (charset0) == 94)
4ed46869
KH
1872 {
1873 /* This is SPACE or DEL. */
b73bfc1c 1874 charset = CHARSET_ASCII;
4ed46869
KH
1875 break;
1876 }
1877 /* This is a graphic character, we fall down ... */
1878
1879 case ISO_graphic_plane_0:
ec6d2bb8 1880 if (COMPOSING_P (coding) && coding->composition_rule_follows)
b73bfc1c
KH
1881 {
1882 DECODE_COMPOSITION_RULE (c1);
1883 continue;
1884 }
1885 charset = charset0;
4ed46869
KH
1886 break;
1887
1888 case ISO_0xA0_or_0xFF:
d46c5b12
KH
1889 if (charset1 < 0 || CHARSET_CHARS (charset1) == 94
1890 || coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
fb88bf2d 1891 goto label_invalid_code;
4ed46869
KH
1892 /* This is a graphic character, we fall down ... */
1893
1894 case ISO_graphic_plane_1:
b73bfc1c 1895 if (charset1 < 0)
fb88bf2d 1896 goto label_invalid_code;
b73bfc1c 1897 charset = charset1;
4ed46869
KH
1898 break;
1899
b73bfc1c 1900 case ISO_control_0:
ec6d2bb8
KH
1901 if (COMPOSING_P (coding))
1902 DECODE_COMPOSITION_END ('1');
1903
4ed46869
KH
1904 /* All ISO2022 control characters in this class have the
1905 same representation in Emacs internal format. */
d46c5b12
KH
1906 if (c1 == '\n'
1907 && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1908 && (coding->eol_type == CODING_EOL_CR
1909 || coding->eol_type == CODING_EOL_CRLF))
1910 {
b73bfc1c
KH
1911 coding->result = CODING_FINISH_INCONSISTENT_EOL;
1912 goto label_end_of_loop;
d46c5b12 1913 }
b73bfc1c 1914 charset = CHARSET_ASCII;
4ed46869
KH
1915 break;
1916
b73bfc1c
KH
1917 case ISO_control_1:
1918 if (COMPOSING_P (coding))
1919 DECODE_COMPOSITION_END ('1');
1920 goto label_invalid_code;
1921
4ed46869 1922 case ISO_carriage_return:
ec6d2bb8
KH
1923 if (COMPOSING_P (coding))
1924 DECODE_COMPOSITION_END ('1');
1925
4ed46869 1926 if (coding->eol_type == CODING_EOL_CR)
b73bfc1c 1927 c1 = '\n';
4ed46869
KH
1928 else if (coding->eol_type == CODING_EOL_CRLF)
1929 {
1930 ONE_MORE_BYTE (c1);
b73bfc1c 1931 if (c1 != ISO_CODE_LF)
4ed46869
KH
1932 {
1933 src--;
b73bfc1c 1934 c1 = '\r';
4ed46869
KH
1935 }
1936 }
b73bfc1c 1937 charset = CHARSET_ASCII;
4ed46869
KH
1938 break;
1939
1940 case ISO_shift_out:
d46c5b12
KH
1941 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1942 || CODING_SPEC_ISO_DESIGNATION (coding, 1) < 0)
1943 goto label_invalid_code;
4ed46869
KH
1944 CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
1945 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
b73bfc1c 1946 continue;
4ed46869
KH
1947
1948 case ISO_shift_in:
d46c5b12
KH
1949 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
1950 goto label_invalid_code;
4ed46869
KH
1951 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
1952 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
b73bfc1c 1953 continue;
4ed46869
KH
1954
1955 case ISO_single_shift_2_7:
1956 case ISO_single_shift_2:
d46c5b12
KH
1957 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1958 goto label_invalid_code;
4ed46869
KH
1959 /* SS2 is handled as an escape sequence of ESC 'N' */
1960 c1 = 'N';
1961 goto label_escape_sequence;
1962
1963 case ISO_single_shift_3:
d46c5b12
KH
1964 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1965 goto label_invalid_code;
4ed46869
KH
1966 /* SS2 is handled as an escape sequence of ESC 'O' */
1967 c1 = 'O';
1968 goto label_escape_sequence;
1969
1970 case ISO_control_sequence_introducer:
1971 /* CSI is handled as an escape sequence of ESC '[' ... */
1972 c1 = '[';
1973 goto label_escape_sequence;
1974
1975 case ISO_escape:
1976 ONE_MORE_BYTE (c1);
1977 label_escape_sequence:
1978 /* Escape sequences handled by Emacs are invocation,
1979 designation, direction specification, and character
1980 composition specification. */
1981 switch (c1)
1982 {
1983 case '&': /* revision of following character set */
1984 ONE_MORE_BYTE (c1);
1985 if (!(c1 >= '@' && c1 <= '~'))
d46c5b12 1986 goto label_invalid_code;
4ed46869
KH
1987 ONE_MORE_BYTE (c1);
1988 if (c1 != ISO_CODE_ESC)
d46c5b12 1989 goto label_invalid_code;
4ed46869
KH
1990 ONE_MORE_BYTE (c1);
1991 goto label_escape_sequence;
1992
1993 case '$': /* designation of 2-byte character set */
d46c5b12
KH
1994 if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
1995 goto label_invalid_code;
4ed46869
KH
1996 ONE_MORE_BYTE (c1);
1997 if (c1 >= '@' && c1 <= 'B')
1998 { /* designation of JISX0208.1978, GB2312.1980,
88993dfd 1999 or JISX0208.1980 */
4ed46869
KH
2000 DECODE_DESIGNATION (0, 2, 94, c1);
2001 }
2002 else if (c1 >= 0x28 && c1 <= 0x2B)
2003 { /* designation of DIMENSION2_CHARS94 character set */
2004 ONE_MORE_BYTE (c2);
2005 DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
2006 }
2007 else if (c1 >= 0x2C && c1 <= 0x2F)
2008 { /* designation of DIMENSION2_CHARS96 character set */
2009 ONE_MORE_BYTE (c2);
2010 DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
2011 }
2012 else
d46c5b12 2013 goto label_invalid_code;
b73bfc1c
KH
2014 /* We must update these variables now. */
2015 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
2016 charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
2017 continue;
4ed46869
KH
2018
2019 case 'n': /* invocation of locking-shift-2 */
d46c5b12
KH
2020 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
2021 || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
2022 goto label_invalid_code;
4ed46869 2023 CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
e0e989f6 2024 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
b73bfc1c 2025 continue;
4ed46869
KH
2026
2027 case 'o': /* invocation of locking-shift-3 */
d46c5b12
KH
2028 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
2029 || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
2030 goto label_invalid_code;
4ed46869 2031 CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
e0e989f6 2032 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
b73bfc1c 2033 continue;
4ed46869
KH
2034
2035 case 'N': /* invocation of single-shift-2 */
d46c5b12
KH
2036 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2037 || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
2038 goto label_invalid_code;
4ed46869 2039 charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
b73bfc1c 2040 ONE_MORE_BYTE (c1);
e7046a18
KH
2041 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
2042 goto label_invalid_code;
4ed46869
KH
2043 break;
2044
2045 case 'O': /* invocation of single-shift-3 */
d46c5b12
KH
2046 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2047 || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
2048 goto label_invalid_code;
4ed46869 2049 charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
b73bfc1c 2050 ONE_MORE_BYTE (c1);
e7046a18
KH
2051 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
2052 goto label_invalid_code;
4ed46869
KH
2053 break;
2054
ec6d2bb8
KH
2055 case '0': case '2': case '3': case '4': /* start composition */
2056 DECODE_COMPOSITION_START (c1);
b73bfc1c 2057 continue;
4ed46869 2058
ec6d2bb8
KH
2059 case '1': /* end composition */
2060 DECODE_COMPOSITION_END (c1);
b73bfc1c 2061 continue;
4ed46869
KH
2062
2063 case '[': /* specification of direction */
d46c5b12
KH
2064 if (coding->flags & CODING_FLAG_ISO_NO_DIRECTION)
2065 goto label_invalid_code;
4ed46869 2066 /* For the moment, nested direction is not supported.
d46c5b12 2067 So, `coding->mode & CODING_MODE_DIRECTION' zero means
8ca3766a 2068 left-to-right, and nonzero means right-to-left. */
4ed46869
KH
2069 ONE_MORE_BYTE (c1);
2070 switch (c1)
2071 {
2072 case ']': /* end of the current direction */
d46c5b12 2073 coding->mode &= ~CODING_MODE_DIRECTION;
4ed46869
KH
2074
2075 case '0': /* end of the current direction */
2076 case '1': /* start of left-to-right direction */
2077 ONE_MORE_BYTE (c1);
2078 if (c1 == ']')
d46c5b12 2079 coding->mode &= ~CODING_MODE_DIRECTION;
4ed46869 2080 else
d46c5b12 2081 goto label_invalid_code;
4ed46869
KH
2082 break;
2083
2084 case '2': /* start of right-to-left direction */
2085 ONE_MORE_BYTE (c1);
2086 if (c1 == ']')
d46c5b12 2087 coding->mode |= CODING_MODE_DIRECTION;
4ed46869 2088 else
d46c5b12 2089 goto label_invalid_code;
4ed46869
KH
2090 break;
2091
2092 default:
d46c5b12 2093 goto label_invalid_code;
4ed46869 2094 }
b73bfc1c 2095 continue;
4ed46869 2096
103e0180
KH
2097 case '%':
2098 if (COMPOSING_P (coding))
2099 DECODE_COMPOSITION_END ('1');
2100 ONE_MORE_BYTE (c1);
2101 if (c1 == '/')
2102 {
2103 /* CTEXT extended segment:
2104 ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
2105 We keep these bytes as is for the moment.
2106 They may be decoded by post-read-conversion. */
2107 int dim, M, L;
2108 int size, required;
2109 int produced_chars;
43e4a82f 2110
103e0180
KH
2111 ONE_MORE_BYTE (dim);
2112 ONE_MORE_BYTE (M);
2113 ONE_MORE_BYTE (L);
2114 size = ((M - 128) * 128) + (L - 128);
2115 required = 8 + size * 2;
2116 if (dst + required > (dst_bytes ? dst_end : src))
2117 goto label_end_of_loop;
2118 *dst++ = ISO_CODE_ESC;
2119 *dst++ = '%';
2120 *dst++ = '/';
2121 *dst++ = dim;
2122 produced_chars = 4;
2123 dst += CHAR_STRING (M, dst), produced_chars++;
2124 dst += CHAR_STRING (L, dst), produced_chars++;
2125 while (size-- > 0)
2126 {
2127 ONE_MORE_BYTE (c1);
2128 dst += CHAR_STRING (c1, dst), produced_chars++;
2129 }
2130 coding->produced_char += produced_chars;
2131 }
2132 else if (c1 == 'G')
2133 {
2134 unsigned char *d = dst;
2135 int produced_chars;
2136
2137 /* XFree86 extension for embedding UTF-8 in CTEXT:
2138 ESC % G --UTF-8-BYTES-- ESC % @
2139 We keep these bytes as is for the moment.
2140 They may be decoded by post-read-conversion. */
2141 if (d + 6 > (dst_bytes ? dst_end : src))
2142 goto label_end_of_loop;
2143 *d++ = ISO_CODE_ESC;
2144 *d++ = '%';
2145 *d++ = 'G';
2146 produced_chars = 3;
2147 while (d + 1 < (dst_bytes ? dst_end : src))
2148 {
2149 ONE_MORE_BYTE (c1);
2150 if (c1 == ISO_CODE_ESC
2151 && src + 1 < src_end
2152 && src[0] == '%'
2153 && src[1] == '@')
47dc91ad
KH
2154 {
2155 src += 2;
2156 break;
2157 }
103e0180
KH
2158 d += CHAR_STRING (c1, d), produced_chars++;
2159 }
2160 if (d + 3 > (dst_bytes ? dst_end : src))
2161 goto label_end_of_loop;
2162 *d++ = ISO_CODE_ESC;
2163 *d++ = '%';
2164 *d++ = '@';
2165 dst = d;
2166 coding->produced_char += produced_chars + 3;
2167 }
2168 else
2169 goto label_invalid_code;
2170 continue;
2171
4ed46869 2172 default:
d46c5b12
KH
2173 if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
2174 goto label_invalid_code;
4ed46869
KH
2175 if (c1 >= 0x28 && c1 <= 0x2B)
2176 { /* designation of DIMENSION1_CHARS94 character set */
2177 ONE_MORE_BYTE (c2);
2178 DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
2179 }
2180 else if (c1 >= 0x2C && c1 <= 0x2F)
2181 { /* designation of DIMENSION1_CHARS96 character set */
2182 ONE_MORE_BYTE (c2);
2183 DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
2184 }
2185 else
b73bfc1c
KH
2186 goto label_invalid_code;
2187 /* We must update these variables now. */
2188 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
2189 charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
2190 continue;
4ed46869 2191 }
b73bfc1c 2192 }
4ed46869 2193
b73bfc1c
KH
2194 /* Now we know CHARSET and 1st position code C1 of a character.
2195 Produce a multibyte sequence for that character while getting
2196 2nd position code C2 if necessary. */
2197 if (CHARSET_DIMENSION (charset) == 2)
2198 {
2199 ONE_MORE_BYTE (c2);
2200 if (c1 < 0x80 ? c2 < 0x20 || c2 >= 0x80 : c2 < 0xA0)
2201 /* C2 is not in a valid range. */
2202 goto label_invalid_code;
4ed46869 2203 }
b73bfc1c
KH
2204 c = DECODE_ISO_CHARACTER (charset, c1, c2);
2205 EMIT_CHAR (c);
4ed46869
KH
2206 continue;
2207
b73bfc1c
KH
2208 label_invalid_code:
2209 coding->errors++;
2210 if (COMPOSING_P (coding))
2211 DECODE_COMPOSITION_END ('1');
4ed46869 2212 src = src_base;
b73bfc1c 2213 c = *src++;
2d4430a8
KH
2214 if (! NILP (translation_table))
2215 c = translate_char (translation_table, c, 0, 0, 0);
b73bfc1c 2216 EMIT_CHAR (c);
4ed46869 2217 }
fb88bf2d 2218
b73bfc1c
KH
2219 label_end_of_loop:
2220 coding->consumed = coding->consumed_char = src_base - source;
d46c5b12 2221 coding->produced = dst - destination;
b73bfc1c 2222 return;
4ed46869
KH
2223}
2224
b73bfc1c 2225
f4dee582 2226/* ISO2022 encoding stuff. */
4ed46869
KH
2227
2228/*
f4dee582 2229 It is not enough to say just "ISO2022" on encoding, we have to
cfb43547 2230 specify more details. In Emacs, each ISO2022 coding system
4ed46869 2231 variant has the following specifications:
8ca3766a 2232 1. Initial designation to G0 through G3.
4ed46869
KH
2233 2. Allows short-form designation?
2234 3. ASCII should be designated to G0 before control characters?
2235 4. ASCII should be designated to G0 at end of line?
2236 5. 7-bit environment or 8-bit environment?
2237 6. Use locking-shift?
2238 7. Use Single-shift?
2239 And the following two are only for Japanese:
2240 8. Use ASCII in place of JIS0201-1976-Roman?
2241 9. Use JISX0208-1983 in place of JISX0208-1978?
2242 These specifications are encoded in `coding->flags' as flag bits
2243 defined by macros CODING_FLAG_ISO_XXX. See `coding.h' for more
f4dee582 2244 details.
4ed46869
KH
2245*/
2246
/* Produce codes (escape sequence) for designating CHARSET to graphic
   register REG at DST, and increment DST.  The sequence is preceded
   by ESC '&' <revision> when the revision number CODING records for
   CHARSET is below 255.  If <final-char> of CHARSET is '@', 'A', or
   'B', CHARSET is a 2-dimensional 94-character set, REG is 0, and the
   coding system CODING allows, produce the short-form designation
   sequence, i.e. omit the intermediate character.  CODING's record
   of the charset designated to REG is updated.  */

#define ENCODE_DESIGNATION(charset, reg, coding)			\
  do {									\
    unsigned char final_char = CHARSET_ISO_FINAL_CHAR (charset);	\
    int revision = CODING_SPEC_ISO_REVISION_NUMBER(coding, charset);	\
    /* Intermediate characters for registers 0..3, for a 94 or a	\
       96 character set respectively.  */				\
    char *intermediate_chars						\
      = (CHARSET_CHARS (charset) == 94 ? "()*+" : ",-./");		\
									\
    if (revision < 255)							\
      {									\
	*dst++ = ISO_CODE_ESC;						\
	*dst++ = '&';							\
	*dst++ = '@' + revision;					\
      }									\
    *dst++ = ISO_CODE_ESC;						\
    if (CHARSET_DIMENSION (charset) == 1)				\
      *dst++ = (unsigned char) (intermediate_chars[reg]);		\
    else								\
      {									\
	*dst++ = '$';							\
	/* Only a 94x94 set whose final character is '@'..'B' may	\
	   omit the intermediate character, and only for register 0	\
	   when the short form is allowed.  */				\
	if (CHARSET_CHARS (charset) != 94				\
	    || ! (coding->flags & CODING_FLAG_ISO_SHORT_FORM)		\
	    || reg != 0							\
	    || final_char < '@' || final_char > 'B')			\
	  *dst++ = (unsigned char) (intermediate_chars[reg]);		\
      }									\
    *dst++ = final_char;						\
    CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;		\
  } while (0)
2289
/* The following two macros produce codes for the ISO2022
   single-shift functions single-shift-2 and single-shift-3 at DST:
   the escape sequence ESC 'N' / ESC 'O' when the coding system is
   flagged as 7-bit, the control character SS2 / SS3 otherwise.
   Both record in CODING that a single shift is pending.  */

#define ENCODE_SINGLE_SHIFT_2				\
  do {							\
    if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)	\
      {							\
	*dst++ = ISO_CODE_ESC;				\
	*dst++ = 'N';					\
      }							\
    else						\
      {							\
	*dst++ = ISO_CODE_SS2;				\
      }							\
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;	\
  } while (0)

#define ENCODE_SINGLE_SHIFT_3				\
  do {							\
    if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)	\
      {							\
	*dst++ = ISO_CODE_ESC;				\
	*dst++ = 'O';					\
      }							\
    else						\
      {							\
	*dst++ = ISO_CODE_SS3;				\
      }							\
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;	\
  } while (0)
2311
/* The following four macros produce codes (control character or
   escape sequence) for the ISO2022 locking-shift functions
   shift-in, shift-out, locking-shift-2, and locking-shift-3 at DST.
   Each one records in CODING which graphic register (0, 1, 2, or 3
   respectively) is now invoked to graphic plane 0.  */

#define ENCODE_SHIFT_IN					\
  do {							\
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;		\
    *dst++ = ISO_CODE_SI;				\
  } while (0)

#define ENCODE_SHIFT_OUT				\
  do {							\
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;		\
    *dst++ = ISO_CODE_SO;				\
  } while (0)

#define ENCODE_LOCKING_SHIFT_2				\
  do {							\
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;		\
    *dst++ = ISO_CODE_ESC;				\
    *dst++ = 'n';					\
  } while (0)

#define ENCODE_LOCKING_SHIFT_3				\
  do {							\
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;		\
    *dst++ = ISO_CODE_ESC;				\
    *dst++ = 'o';					\
  } while (0)
2339
f4dee582
RS
/* Produce the code for a DIMENSION1 character whose character set is
   CHARSET and whose position-code is C1.  If CHARSET is not invoked
   to graphic plane 0 or 1 yet, the necessary designation and
   invocation sequences are produced first.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)			\
  do {									\
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))			\
      {									\
	/* A single-shift code was just produced: this character	\
	   goes right after it and ends the single shift.  */		\
	if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)			\
	  *dst++ = c1 & 0x7F;						\
	else								\
	  *dst++ = c1 | 0x80;						\
	CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;			\
	break;								\
      }									\
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))		\
      {									\
	/* CHARSET is invoked to graphic plane 0.  */			\
	*dst++ = c1 & 0x7F;						\
	break;								\
      }									\
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))		\
      {									\
	/* CHARSET is invoked to graphic plane 1.  */			\
	*dst++ = c1 | 0x80;						\
	break;								\
      }									\
    /* Since CHARSET is not yet invoked to any graphic planes, we	\
       must invoke it, or, at first, designate it to some graphic	\
       register.  Then repeat the loop to actually produce the		\
       character.  */							\
    dst = encode_invocation_designation (charset, coding, dst);	\
  } while (1)
2372
f4dee582
RS
/* Produce the codes for a DIMENSION2 character whose character set
   is CHARSET and whose position-codes are C1 and C2.  If CHARSET is
   not invoked to graphic plane 0 or 1 yet, the necessary designation
   and invocation sequences are produced first.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)		\
  do {									\
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))			\
      {									\
	/* A single-shift code was just produced: this character	\
	   goes right after it and ends the single shift.  */		\
	if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)			\
	  *dst++ = c1 & 0x7F, *dst++ = c2 & 0x7F;			\
	else								\
	  *dst++ = c1 | 0x80, *dst++ = c2 | 0x80;			\
	CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;			\
	break;								\
      }									\
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))		\
      {									\
	/* CHARSET is invoked to graphic plane 0.  */			\
	*dst++ = c1 & 0x7F, *dst++ = c2 & 0x7F;				\
	break;								\
      }									\
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))		\
      {									\
	/* CHARSET is invoked to graphic plane 1.  */			\
	*dst++ = c1 | 0x80, *dst++ = c2 | 0x80;				\
	break;								\
      }									\
    /* Since CHARSET is not yet invoked to any graphic planes, we	\
       must invoke it, or, at first, designate it to some graphic	\
       register.  Then repeat the loop to actually produce the		\
       character.  */							\
    dst = encode_invocation_designation (charset, coding, dst);	\
  } while (1)
2405
05e6f5dc
KH
/* Produce codes for the character C at DST.  C is split into its
   charset and position codes.  If that charset is not defined, the
   position codes are written as they are (the second one only when
   it is not negative).  Otherwise the character is encoded with the
   DIMENSION1 or DIMENSION2 macro above, after these substitutions
   requested by CODING's flags: latin-jisx0201 for ASCII
   (CODING_FLAG_ISO_USE_ROMAN) and jisx0208-1978 for jisx0208
   (CODING_FLAG_ISO_USE_OLDJIS).  */

#define ENCODE_ISO_CHARACTER(c)						\
  do {									\
    int charset, c1, c2;						\
									\
    SPLIT_CHAR (c, charset, c1, c2);					\
    if (! CHARSET_DEFINED_P (charset))					\
      {									\
	*dst++ = c1;							\
	if (c2 >= 0)							\
	  *dst++ = c2;							\
      }									\
    else if (CHARSET_DIMENSION (charset) == 1)				\
      {									\
	if (charset == CHARSET_ASCII					\
	    && coding->flags & CODING_FLAG_ISO_USE_ROMAN)		\
	  charset = charset_latin_jisx0201;				\
	ENCODE_ISO_CHARACTER_DIMENSION1 (charset, c1);			\
      }									\
    else								\
      {									\
	if (charset == charset_jisx0208					\
	    && coding->flags & CODING_FLAG_ISO_USE_OLDJIS)		\
	  charset = charset_jisx0208_1978;				\
	ENCODE_ISO_CHARACTER_DIMENSION2 (charset, c1, c2);		\
      }									\
  } while (0)
2435
2436
/* Instead of encoding character C, produce
   CODING_REPLACEMENT_CHARACTER: twice if the width of C's charset
   is greater than 1, once otherwise.  */

#define ENCODE_UNSAFE_CHARACTER(c)					\
  do {									\
    int n_marks = (CHARSET_WIDTH (CHAR_CHARSET (c)) > 1 ? 2 : 1);	\
									\
    while (n_marks-- > 0)						\
      ENCODE_ISO_CHARACTER (CODING_REPLACEMENT_CHARACTER);		\
  } while (0)
bdd9fb48 2445
05e6f5dc 2446
4ed46869
KH
2447/* Produce designation and invocation codes at a place pointed by DST
2448 to use CHARSET. The element `spec.iso2022' of *CODING is updated.
2449 Return new DST. */
2450
2451unsigned char *
2452encode_invocation_designation (charset, coding, dst)
2453 int charset;
2454 struct coding_system *coding;
2455 unsigned char *dst;
2456{
2457 int reg; /* graphic register number */
2458
2459 /* At first, check designations. */
2460 for (reg = 0; reg < 4; reg++)
2461 if (charset == CODING_SPEC_ISO_DESIGNATION (coding, reg))
2462 break;
2463
2464 if (reg >= 4)
2465 {
2466 /* CHARSET is not yet designated to any graphic registers. */
2467 /* At first check the requested designation. */
2468 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1ba9e4ab
KH
2469 if (reg == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
2470 /* Since CHARSET requests no special designation, designate it
2471 to graphic register 0. */
4ed46869
KH
2472 reg = 0;
2473
2474 ENCODE_DESIGNATION (charset, reg, coding);
2475 }
2476
2477 if (CODING_SPEC_ISO_INVOCATION (coding, 0) != reg
2478 && CODING_SPEC_ISO_INVOCATION (coding, 1) != reg)
2479 {
2480 /* Since the graphic register REG is not invoked to any graphic
2481 planes, invoke it to graphic plane 0. */
2482 switch (reg)
2483 {
2484 case 0: /* graphic register 0 */
2485 ENCODE_SHIFT_IN;
2486 break;
2487
2488 case 1: /* graphic register 1 */
2489 ENCODE_SHIFT_OUT;
2490 break;
2491
2492 case 2: /* graphic register 2 */
2493 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2494 ENCODE_SINGLE_SHIFT_2;
2495 else
2496 ENCODE_LOCKING_SHIFT_2;
2497 break;
2498
2499 case 3: /* graphic register 3 */
2500 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2501 ENCODE_SINGLE_SHIFT_3;
2502 else
2503 ENCODE_LOCKING_SHIFT_3;
2504 break;
2505 }
2506 }
b73bfc1c 2507
4ed46869
KH
2508 return dst;
2509}
2510
ec6d2bb8
KH
/* Produce the 2-byte code for the encoded composition rule RULE at
   DST: first 32 + 81 + GREF, then 32 + NREF, where GREF and NREF
   are the two references decoded from RULE.  */

#define ENCODE_COMPOSITION_RULE(rule)			\
  do {							\
    int gref, nref;					\
							\
    COMPOSITION_DECODE_RULE (rule, gref, nref);		\
    dst[0] = 32 + 81 + gref;				\
    dst[1] = 32 + nref;					\
    dst += 2;						\
  } while (0)
2520
/* Produce codes for indicating the start of a composition sequence
   (ESC 0, ESC 3, or ESC 4).  DATA points to an array of integers
   which specify information about the composition; DATA[3] is the
   composition method and becomes CODING->composing.  See the comment
   in coding.h for the format of DATA.  */

#define ENCODE_COMPOSITION_START(coding, data)				\
  do {									\
    coding->composing = data[3];					\
    *dst++ = ISO_CODE_ESC;						\
    if (coding->composing != COMPOSITION_RELATIVE)			\
      {									\
	if (coding->composing == COMPOSITION_WITH_ALTCHARS)		\
	  *dst++ = '3';							\
	else								\
	  *dst++ = '4';							\
	/* Set the component index four elements past the start of	\
	   this composition's data.  */					\
	coding->composition_rule_follows = 0;				\
	coding->cmp_data_index = coding->cmp_data_start + 4;		\
      }									\
    else								\
      *dst++ = '0';							\
  } while (0)
2540
/* Produce codes (ESC 1) for indicating the end of the current
   composition, and advance CODING->cmp_data_start by DATA[0].  If
   that reaches the end of the used part of the current composition
   data block and a next block exists, switch to the start of that
   next block.  */

#define ENCODE_COMPOSITION_END(coding, data)				\
  do {									\
    *dst++ = ISO_CODE_ESC;						\
    *dst++ = '1';							\
    coding->cmp_data_start += data[0];					\
    coding->composing = COMPOSITION_NO;					\
    if (coding->cmp_data_start == coding->cmp_data->used)		\
      {									\
	if (coding->cmp_data->next)					\
	  {								\
	    coding->cmp_data = coding->cmp_data->next;			\
	    coding->cmp_data_start = 0;					\
	  }								\
      }									\
  } while (0)
2556
/* Produce the composition start sequence ESC 0 and switch CODING to
   relative composition.  Here, this sequence doesn't mean the start
   of a new composition but means that we have just produced
   components (alternate chars and composition rules) of the
   composition and the actual text follows in SRC.  */

#define ENCODE_COMPOSITION_FAKE_START(coding)		\
  do {							\
    coding->composing = COMPOSITION_RELATIVE;		\
    *dst++ = ISO_CODE_ESC;				\
    *dst++ = '0';					\
  } while (0)
4ed46869
KH
2568
/* The following three macros produce codes for indicating direction
   of text.  Each of them is a single statement that writes at DST
   and uses the variable `coding' of the expanding function.

   ENCODE_CONTROL_SEQUENCE_INTRODUCER produces the control sequence
   introducer: ESC '[' when CODING_FLAG_ISO_SEVEN_BITS is set in
   coding->flags, the control character CSI otherwise.  The flag is
   tested with `&', like everywhere else in this file; comparing the
   whole flag word with `==' selected CSI whenever any other flag
   was set as well.

   ENCODE_DIRECTION_R2L and ENCODE_DIRECTION_L2R produce the
   introducer followed by '2' ']' and '0' ']' respectively.  */

#define ENCODE_CONTROL_SEQUENCE_INTRODUCER		\
  do {							\
    if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)	\
      {							\
	*dst++ = ISO_CODE_ESC;				\
	*dst++ = '[';					\
      }							\
    else						\
      *dst++ = ISO_CODE_CSI;				\
  } while (0)

#define ENCODE_DIRECTION_R2L				\
  do {							\
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;			\
    *dst++ = '2';					\
    *dst++ = ']';					\
  } while (0)

#define ENCODE_DIRECTION_L2R				\
  do {							\
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;			\
    *dst++ = '0';					\
    *dst++ = ']';					\
  } while (0)
4ed46869
KH
2584
/* Produce codes for designation and invocation to reset the graphic
   planes and registers to initial state: a shift-in when a register
   other than 0 is invoked to graphic plane 0, then a designation
   for each graphic register that has an initial designation
   different from its current one.  */

#define ENCODE_RESET_PLANE_AND_REGISTER					      \
  do {									      \
    int reg;								      \
									      \
    if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0)			      \
      ENCODE_SHIFT_IN;							      \
    for (reg = 0; reg < 4; reg++)					      \
      {									      \
	if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg) < 0)	      \
	  continue;							      \
	if (CODING_SPEC_ISO_DESIGNATION (coding, reg)			      \
	    == CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg))	      \
	  continue;							      \
	ENCODE_DESIGNATION						      \
	  (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg), reg, coding);  \
      }									      \
  } while (0)
2599
bdd9fb48 2600/* Produce designation sequences of charsets in the line started from
b73bfc1c 2601 SRC to a place pointed by DST, and return updated DST.
bdd9fb48
KH
2602
2603 If the current block ends before any end-of-line, we may fail to
d46c5b12
KH
2604 find all the necessary designations. */
2605
b73bfc1c
KH
2606static unsigned char *
2607encode_designation_at_bol (coding, translation_table, src, src_end, dst)
e0e989f6 2608 struct coding_system *coding;
b73bfc1c 2609 Lisp_Object translation_table;
5bdca8af
DN
2610 const unsigned char *src, *src_end;
2611 unsigned char *dst;
e0e989f6 2612{
bdd9fb48
KH
2613 int charset, c, found = 0, reg;
2614 /* Table of charsets to be designated to each graphic register. */
2615 int r[4];
bdd9fb48
KH
2616
2617 for (reg = 0; reg < 4; reg++)
2618 r[reg] = -1;
2619
b73bfc1c 2620 while (found < 4)
e0e989f6 2621 {
b73bfc1c
KH
2622 ONE_MORE_CHAR (c);
2623 if (c == '\n')
2624 break;
93dec019 2625
b73bfc1c 2626 charset = CHAR_CHARSET (c);
e0e989f6 2627 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
d46c5b12 2628 if (reg != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION && r[reg] < 0)
bdd9fb48
KH
2629 {
2630 found++;
2631 r[reg] = charset;
2632 }
bdd9fb48
KH
2633 }
2634
b73bfc1c 2635 label_end_of_loop:
bdd9fb48
KH
2636 if (found)
2637 {
2638 for (reg = 0; reg < 4; reg++)
2639 if (r[reg] >= 0
2640 && CODING_SPEC_ISO_DESIGNATION (coding, reg) != r[reg])
2641 ENCODE_DESIGNATION (r[reg], reg, coding);
e0e989f6 2642 }
b73bfc1c
KH
2643
2644 return dst;
e0e989f6
KH
2645}
2646
4ed46869
KH
2647/* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
2648
b73bfc1c 2649static void
d46c5b12 2650encode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
4ed46869 2651 struct coding_system *coding;
5bdca8af
DN
2652 const unsigned char *source;
2653 unsigned char *destination;
4ed46869 2654 int src_bytes, dst_bytes;
4ed46869 2655{
5bdca8af
DN
2656 const unsigned char *src = source;
2657 const unsigned char *src_end = source + src_bytes;
4ed46869
KH
2658 unsigned char *dst = destination;
2659 unsigned char *dst_end = destination + dst_bytes;
b73bfc1c 2660 /* Since the maximum bytes produced by each loop is 20, we subtract 19
4ed46869
KH
2661 from DST_END to assure overflow checking is necessary only at the
2662 head of loop. */
b73bfc1c
KH
2663 unsigned char *adjusted_dst_end = dst_end - 19;
2664 /* SRC_BASE remembers the start position in source in each loop.
2665 The loop will be exited when there's not enough source text to
2666 analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
2667 there's not enough destination area to produce encoded codes
2668 (within macro EMIT_BYTES). */
5bdca8af 2669 const unsigned char *src_base;
b73bfc1c
KH
2670 int c;
2671 Lisp_Object translation_table;
05e6f5dc
KH
2672 Lisp_Object safe_chars;
2673
0eecad43
KH
2674 if (coding->flags & CODING_FLAG_ISO_SAFE)
2675 coding->mode |= CODING_MODE_INHIBIT_UNENCODABLE_CHAR;
2676
6b89e3aa 2677 safe_chars = coding_safe_chars (coding->symbol);
bdd9fb48 2678
b73bfc1c
KH
2679 if (NILP (Venable_character_translation))
2680 translation_table = Qnil;
2681 else
2682 {
2683 translation_table = coding->translation_table_for_encode;
2684 if (NILP (translation_table))
2685 translation_table = Vstandard_translation_table_for_encode;
2686 }
4ed46869 2687
d46c5b12 2688 coding->consumed_char = 0;
b73bfc1c
KH
2689 coding->errors = 0;
2690 while (1)
4ed46869 2691 {
b73bfc1c
KH
2692 src_base = src;
2693
2694 if (dst >= (dst_bytes ? adjusted_dst_end : (src - 19)))
2695 {
2696 coding->result = CODING_FINISH_INSUFFICIENT_DST;
2697 break;
2698 }
4ed46869 2699
e0e989f6
KH
2700 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL
2701 && CODING_SPEC_ISO_BOL (coding))
2702 {
bdd9fb48 2703 /* We have to produce designation sequences if any now. */
b73bfc1c
KH
2704 dst = encode_designation_at_bol (coding, translation_table,
2705 src, src_end, dst);
e0e989f6
KH
2706 CODING_SPEC_ISO_BOL (coding) = 0;
2707 }
2708
ec6d2bb8
KH
2709 /* Check composition start and end. */
2710 if (coding->composing != COMPOSITION_DISABLED
2711 && coding->cmp_data_start < coding->cmp_data->used)
4ed46869 2712 {
ec6d2bb8
KH
2713 struct composition_data *cmp_data = coding->cmp_data;
2714 int *data = cmp_data->data + coding->cmp_data_start;
2715 int this_pos = cmp_data->char_offset + coding->consumed_char;
2716
2717 if (coding->composing == COMPOSITION_RELATIVE)
4ed46869 2718 {
ec6d2bb8
KH
2719 if (this_pos == data[2])
2720 {
2721 ENCODE_COMPOSITION_END (coding, data);
2722 cmp_data = coding->cmp_data;
2723 data = cmp_data->data + coding->cmp_data_start;
2724 }
4ed46869 2725 }
ec6d2bb8 2726 else if (COMPOSING_P (coding))
4ed46869 2727 {
ec6d2bb8
KH
2728 /* COMPOSITION_WITH_ALTCHARS or COMPOSITION_WITH_RULE_ALTCHAR */
2729 if (coding->cmp_data_index == coding->cmp_data_start + data[0])
2730 /* We have consumed components of the composition.
8ca3766a 2731 What follows in SRC is the composition's base
ec6d2bb8
KH
2732 text. */
2733 ENCODE_COMPOSITION_FAKE_START (coding);
2734 else
4ed46869 2735 {
ec6d2bb8
KH
2736 int c = cmp_data->data[coding->cmp_data_index++];
2737 if (coding->composition_rule_follows)
2738 {
2739 ENCODE_COMPOSITION_RULE (c);
2740 coding->composition_rule_follows = 0;
2741 }
2742 else
2743 {
0eecad43 2744 if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR
05e6f5dc
KH
2745 && ! CODING_SAFE_CHAR_P (safe_chars, c))
2746 ENCODE_UNSAFE_CHARACTER (c);
2747 else
2748 ENCODE_ISO_CHARACTER (c);
ec6d2bb8
KH
2749 if (coding->composing == COMPOSITION_WITH_RULE_ALTCHARS)
2750 coding->composition_rule_follows = 1;
2751 }
4ed46869
KH
2752 continue;
2753 }
ec6d2bb8
KH
2754 }
2755 if (!COMPOSING_P (coding))
2756 {
2757 if (this_pos == data[1])
4ed46869 2758 {
ec6d2bb8
KH
2759 ENCODE_COMPOSITION_START (coding, data);
2760 continue;
4ed46869 2761 }
4ed46869
KH
2762 }
2763 }
ec6d2bb8 2764
b73bfc1c 2765 ONE_MORE_CHAR (c);
4ed46869 2766
b73bfc1c
KH
2767 /* Now encode the character C. */
2768 if (c < 0x20 || c == 0x7F)
2769 {
2770 if (c == '\r')
19a8d9e0 2771 {
b73bfc1c
KH
2772 if (! (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
2773 {
2774 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
2775 ENCODE_RESET_PLANE_AND_REGISTER;
2776 *dst++ = c;
2777 continue;
2778 }
2779 /* fall down to treat '\r' as '\n' ... */
2780 c = '\n';
19a8d9e0 2781 }
b73bfc1c 2782 if (c == '\n')
19a8d9e0 2783 {
b73bfc1c
KH
2784 if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
2785 ENCODE_RESET_PLANE_AND_REGISTER;
2786 if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL)
2787 bcopy (coding->spec.iso2022.initial_designation,
2788 coding->spec.iso2022.current_designation,
2789 sizeof coding->spec.iso2022.initial_designation);
2790 if (coding->eol_type == CODING_EOL_LF
2791 || coding->eol_type == CODING_EOL_UNDECIDED)
2792 *dst++ = ISO_CODE_LF;
2793 else if (coding->eol_type == CODING_EOL_CRLF)
2794 *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
2795 else
2796 *dst++ = ISO_CODE_CR;
2797 CODING_SPEC_ISO_BOL (coding) = 1;
19a8d9e0 2798 }
93dec019 2799 else
19a8d9e0 2800 {
b73bfc1c
KH
2801 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
2802 ENCODE_RESET_PLANE_AND_REGISTER;
2803 *dst++ = c;
19a8d9e0 2804 }
4ed46869 2805 }
b73bfc1c 2806 else if (ASCII_BYTE_P (c))
05e6f5dc 2807 ENCODE_ISO_CHARACTER (c);
b73bfc1c 2808 else if (SINGLE_BYTE_CHAR_P (c))
88993dfd 2809 {
b73bfc1c
KH
2810 *dst++ = c;
2811 coding->errors++;
88993dfd 2812 }
0eecad43 2813 else if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR
05e6f5dc
KH
2814 && ! CODING_SAFE_CHAR_P (safe_chars, c))
2815 ENCODE_UNSAFE_CHARACTER (c);
b73bfc1c 2816 else
05e6f5dc 2817 ENCODE_ISO_CHARACTER (c);
b73bfc1c
KH
2818
2819 coding->consumed_char++;
84fbb8a0 2820 }
b73bfc1c
KH
2821
2822 label_end_of_loop:
2823 coding->consumed = src_base - source;
d46c5b12 2824 coding->produced = coding->produced_char = dst - destination;
4ed46869
KH
2825}
2826
2827\f
2828/*** 4. SJIS and BIG5 handlers ***/
2829
cfb43547 2830/* Although SJIS and BIG5 are not ISO coding systems, they are used
4ed46869
KH
2831 quite widely. So, for the moment, Emacs supports them in the bare
2832 C code. But, in the future, they may be supported only by CCL. */
2833
2834/* SJIS is a coding system encoding three character sets: ASCII, right
2835 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
2836 as is. A character of charset katakana-jisx0201 is encoded by
2837 "position-code + 0x80". A character of charset japanese-jisx0208
2838 is encoded in 2-byte but two position-codes are divided and shifted
cfb43547 2839 so that it fits in the range below.
4ed46869
KH
2840
2841 --- CODE RANGE of SJIS ---
2842 (character set) (range)
2843 ASCII 0x00 .. 0x7F
682169fe 2844 KATAKANA-JISX0201 0xA1 .. 0xDF
c28a9453 2845 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF
d14d03ac 2846 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
4ed46869
KH
2847 -------------------------------
2848
2849*/
2850
2851/* BIG5 is a coding system encoding two character sets: ASCII and
2852 Big5. An ASCII character is encoded as is. Big5 is a two-byte
cfb43547 2853 character set and is encoded in two bytes.
4ed46869
KH
2854
2855 --- CODE RANGE of BIG5 ---
2856 (character set) (range)
2857 ASCII 0x00 .. 0x7F
2858 Big5 (1st byte) 0xA1 .. 0xFE
2859 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
2860 --------------------------
2861
2862 Since the number of characters in Big5 is larger than maximum
2863 characters in Emacs' charset (96x96), it can't be handled as one
2864 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
2865 and `charset-big5-2'. Both are DIMENSION2 and CHARS94. The former
2866 contains frequently used characters and the latter contains less
2867 frequently used characters. */
2868
/* Macros to decode or encode a character of Big5 in BIG5.  B1 and B2
   are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
   C1 and C2 are the 1st and 2nd position-codes of Emacs' internal
   format.  CHARSET is `charset_big5_1' or `charset_big5_2'.

   The input arguments may be arbitrary expressions, but they are
   evaluated more than once, so they must not have side effects.  The
   output arguments must be lvalues; they are assigned only after all
   inputs have been read, so the same variables may be passed for
   input and output (e.g. DECODE_BIG5 (c1, c2, charset, c1, c2)).  */

/* Number of Big5 characters which have the same code in 1st byte.  */
#define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)

/* Set CHARSET, C1 and C2 from the BIG5 byte pair B1, B2.  The pair is
   first turned into a linear index (trail bytes 0x40..0x7E come
   before 0xA1..0xFE within a row); lead bytes below 0xC9 belong to
   charset_big5_1 and the rest to charset_big5_2.  */
#define DECODE_BIG5(b1, b2, charset, c1, c2)				\
  do {									\
    unsigned int temp							\
      = (((b1) - 0xA1) * BIG5_SAME_ROW					\
	 + (b2) - ((b2) < 0x7F ? 0x40 : 0x62));				\
    if ((b1) < 0xC9)							\
      (charset) = charset_big5_1;					\
    else								\
      {									\
	(charset) = charset_big5_2;					\
	temp -= (0xC9 - 0xA1) * BIG5_SAME_ROW;				\
      }									\
    (c1) = temp / (0xFF - 0xA1) + 0x21;					\
    (c2) = temp % (0xFF - 0xA1) + 0x21;					\
  } while (0)

/* Inverse of DECODE_BIG5: set the BIG5 byte pair B1, B2 from CHARSET
   and the position-codes C1, C2.  */
#define ENCODE_BIG5(charset, c1, c2, b1, b2)				\
  do {									\
    unsigned int temp							\
      = ((c1) - 0x21) * (0xFF - 0xA1) + ((c2) - 0x21);			\
    if ((charset) == charset_big5_2)					\
      temp += BIG5_SAME_ROW * (0xC9 - 0xA1);				\
    (b1) = temp / BIG5_SAME_ROW + 0xA1;					\
    (b2) = temp % BIG5_SAME_ROW;					\
    (b2) += (b2) < 0x3F ? 0x40 : 0x62;					\
  } while (0)
2901
2902/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2903 Check if a text is encoded in SJIS. If it is, return
2904 CODING_CATEGORY_MASK_SJIS, else return 0. */
2905
0a28aafb
KH
2906static int
2907detect_coding_sjis (src, src_end, multibytep)
4ed46869 2908 unsigned char *src, *src_end;
0a28aafb 2909 int multibytep;
4ed46869 2910{
b73bfc1c
KH
2911 int c;
2912 /* Dummy for ONE_MORE_BYTE. */
2913 struct coding_system dummy_coding;
2914 struct coding_system *coding = &dummy_coding;
4ed46869 2915
b73bfc1c 2916 while (1)
4ed46869 2917 {
0a28aafb 2918 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
682169fe
KH
2919 if (c < 0x80)
2920 continue;
2921 if (c == 0x80 || c == 0xA0 || c > 0xEF)
2922 return 0;
2923 if (c <= 0x9F || c >= 0xE0)
4ed46869 2924 {
682169fe
KH
2925 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2926 if (c < 0x40 || c == 0x7F || c > 0xFC)
4ed46869
KH
2927 return 0;
2928 }
2929 }
b73bfc1c 2930 label_end_of_loop:
4ed46869
KH
2931 return CODING_CATEGORY_MASK_SJIS;
2932}
2933
2934/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2935 Check if a text is encoded in BIG5. If it is, return
2936 CODING_CATEGORY_MASK_BIG5, else return 0. */
2937
0a28aafb
KH
2938static int
2939detect_coding_big5 (src, src_end, multibytep)
4ed46869 2940 unsigned char *src, *src_end;
0a28aafb 2941 int multibytep;
4ed46869 2942{
b73bfc1c
KH
2943 int c;
2944 /* Dummy for ONE_MORE_BYTE. */
2945 struct coding_system dummy_coding;
2946 struct coding_system *coding = &dummy_coding;
4ed46869 2947
b73bfc1c 2948 while (1)
4ed46869 2949 {
0a28aafb 2950 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
682169fe
KH
2951 if (c < 0x80)
2952 continue;
2953 if (c < 0xA1 || c > 0xFE)
2954 return 0;
2955 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2956 if (c < 0x40 || (c > 0x7F && c < 0xA1) || c > 0xFE)
2957 return 0;
4ed46869 2958 }
b73bfc1c 2959 label_end_of_loop:
4ed46869
KH
2960 return CODING_CATEGORY_MASK_BIG5;
2961}
2962
fa42c37f
KH
2963/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2964 Check if a text is encoded in UTF-8. If it is, return
2965 CODING_CATEGORY_MASK_UTF_8, else return 0. */
2966
/* Predicates classifying a single octet C of a UTF-8 byte sequence.
   Each one tests the fixed high-order bits of the octet: MASK selects
   those bits and BITS is the value they must have.  */
#define UTF_8_OCTET_MATCH_P(c, mask, bits) (((c) & (mask)) == (bits))

/* C is a complete one-octet character.  */
#define UTF_8_1_OCTET_P(c)         ((c) < 0x80)
/* C is a continuation octet (10xxxxxx).  */
#define UTF_8_EXTRA_OCTET_P(c)     UTF_8_OCTET_MATCH_P (c, 0xC0, 0x80)
/* C is the leading octet of a sequence of 2 to 6 octets.  */
#define UTF_8_2_OCTET_LEADING_P(c) UTF_8_OCTET_MATCH_P (c, 0xE0, 0xC0)
#define UTF_8_3_OCTET_LEADING_P(c) UTF_8_OCTET_MATCH_P (c, 0xF0, 0xE0)
#define UTF_8_4_OCTET_LEADING_P(c) UTF_8_OCTET_MATCH_P (c, 0xF8, 0xF0)
#define UTF_8_5_OCTET_LEADING_P(c) UTF_8_OCTET_MATCH_P (c, 0xFC, 0xF8)
#define UTF_8_6_OCTET_LEADING_P(c) UTF_8_OCTET_MATCH_P (c, 0xFE, 0xFC)
2974
0a28aafb
KH
2975static int
2976detect_coding_utf_8 (src, src_end, multibytep)
fa42c37f 2977 unsigned char *src, *src_end;
0a28aafb 2978 int multibytep;
fa42c37f
KH
2979{
2980 unsigned char c;
2981 int seq_maybe_bytes;
b73bfc1c
KH
2982 /* Dummy for ONE_MORE_BYTE. */
2983 struct coding_system dummy_coding;
2984 struct coding_system *coding = &dummy_coding;
fa42c37f 2985
b73bfc1c 2986 while (1)
fa42c37f 2987 {
0a28aafb 2988 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
fa42c37f
KH
2989 if (UTF_8_1_OCTET_P (c))
2990 continue;
2991 else if (UTF_8_2_OCTET_LEADING_P (c))
2992 seq_maybe_bytes = 1;
2993 else if (UTF_8_3_OCTET_LEADING_P (c))
2994 seq_maybe_bytes = 2;
2995 else if (UTF_8_4_OCTET_LEADING_P (c))
2996 seq_maybe_bytes = 3;
2997 else if (UTF_8_5_OCTET_LEADING_P (c))
2998 seq_maybe_bytes = 4;
2999 else if (UTF_8_6_OCTET_LEADING_P (c))
3000 seq_maybe_bytes = 5;
3001 else
3002 return 0;
3003
3004 do
3005 {
0a28aafb 3006 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
fa42c37f
KH
3007 if (!UTF_8_EXTRA_OCTET_P (c))
3008 return 0;
3009 seq_maybe_bytes--;
3010 }
3011 while (seq_maybe_bytes > 0);
3012 }
3013
b73bfc1c 3014 label_end_of_loop:
fa42c37f
KH
3015 return CODING_CATEGORY_MASK_UTF_8;
3016}
3017
/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
   Check if a text is encoded in UTF-16 Big Endian or Little Endian.
   If it is, return CODING_CATEGORY_MASK_UTF_16_BE or
   CODING_CATEGORY_MASK_UTF_16_LE, else return 0.  */

/* Nonzero if the 16-bit value VAL is 0xFFFE or 0xFFFF.  */
#define UTF_16_INVALID_P(val)	\
  (((val) == 0xFFFE)		\
   || ((val) == 0xFFFF))

/* Nonzero if the 16-bit value VAL is a high (leading) surrogate,
   i.e. in the range 0xD800..0xDBFF.  The top six bits identify the
   range, hence the 0xFC00 mask.  */
#define UTF_16_HIGH_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xD800)

/* Nonzero if the 16-bit value VAL is a low (trailing) surrogate,
   i.e. in the range 0xDC00..0xDFFF.  */
#define UTF_16_LOW_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xDC00)
3033
0a28aafb
KH
3034static int
3035detect_coding_utf_16 (src, src_end, multibytep)
fa42c37f 3036 unsigned char *src, *src_end;
0a28aafb 3037 int multibytep;
fa42c37f 3038{
b73bfc1c 3039 unsigned char c1, c2;
1c7457e2 3040 /* Dummy for ONE_MORE_BYTE_CHECK_MULTIBYTE. */
b73bfc1c
KH
3041 struct coding_system dummy_coding;
3042 struct coding_system *coding = &dummy_coding;
fa42c37f 3043
0a28aafb
KH
3044 ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep);
3045 ONE_MORE_BYTE_CHECK_MULTIBYTE (c2, multibytep);
b73bfc1c
KH
3046
3047 if ((c1 == 0xFF) && (c2 == 0xFE))
fa42c37f 3048 return CODING_CATEGORY_MASK_UTF_16_LE;
b73bfc1c 3049 else if ((c1 == 0xFE) && (c2 == 0xFF))
fa42c37f
KH
3050 return CODING_CATEGORY_MASK_UTF_16_BE;
3051
b73bfc1c 3052 label_end_of_loop:
fa42c37f
KH
3053 return 0;
3054}
3055
4ed46869
KH
3056/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
3057 If SJIS_P is 1, decode SJIS text, else decode BIG5 test. */
3058
b73bfc1c 3059static void
4ed46869 3060decode_coding_sjis_big5 (coding, source, destination,
d46c5b12 3061 src_bytes, dst_bytes, sjis_p)
4ed46869 3062 struct coding_system *coding;
5bdca8af
DN
3063 const unsigned char *source;
3064 unsigned char *destination;
4ed46869 3065 int src_bytes, dst_bytes;
4ed46869
KH
3066 int sjis_p;
3067{
5bdca8af
DN
3068 const unsigned char *src = source;
3069 const unsigned char *src_end = source + src_bytes;
4ed46869
KH
3070 unsigned char *dst = destination;
3071 unsigned char *dst_end = destination + dst_bytes;
b73bfc1c
KH
3072 /* SRC_BASE remembers the start position in source in each loop.
3073 The loop will be exited when there's not enough source code
3074 (within macro ONE_MORE_BYTE), or when there's not enough
3075 destination area to produce a character (within macro
3076 EMIT_CHAR). */
5bdca8af 3077 const unsigned char *src_base;
b73bfc1c 3078 Lisp_Object translation_table;
a5d301df 3079
b73bfc1c
KH
3080 if (NILP (Venable_character_translation))
3081 translation_table = Qnil;
3082 else
3083 {
3084 translation_table = coding->translation_table_for_decode;
3085 if (NILP (translation_table))
3086 translation_table = Vstandard_translation_table_for_decode;
3087 }
4ed46869 3088
d46c5b12 3089 coding->produced_char = 0;
b73bfc1c 3090 while (1)
4ed46869 3091 {
85478bc6 3092 int c, charset, c1, c2 = 0;
b73bfc1c
KH
3093
3094 src_base = src;
3095 ONE_MORE_BYTE (c1);
3096
3097 if (c1 < 0x80)
4ed46869 3098 {
b73bfc1c
KH
3099 charset = CHARSET_ASCII;
3100 if (c1 < 0x20)
4ed46869 3101 {
b73bfc1c 3102 if (c1 == '\r')
d46c5b12 3103 {
b73bfc1c 3104 if (coding->eol_type == CODING_EOL_CRLF)
d46c5b12 3105 {
b73bfc1c
KH
3106 ONE_MORE_BYTE (c2);
3107 if (c2 == '\n')
3108 c1 = c2;
b73bfc1c
KH
3109 else
3110 /* To process C2 again, SRC is subtracted by 1. */
3111 src--;
d46c5b12 3112 }
b73bfc1c
KH
3113 else if (coding->eol_type == CODING_EOL_CR)
3114 c1 = '\n';
3115 }
3116 else if (c1 == '\n'
3117 && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
3118 && (coding->eol_type == CODING_EOL_CR
3119 || coding->eol_type == CODING_EOL_CRLF))
3120 {
3121 coding->result = CODING_FINISH_INCONSISTENT_EOL;
3122 goto label_end_of_loop;
d46c5b12 3123 }
4ed46869 3124 }
4ed46869 3125 }
54f78171 3126 else
b73bfc1c 3127 {
4ed46869
KH
3128 if (sjis_p)
3129 {
682169fe 3130 if (c1 == 0x80 || c1 == 0xA0 || c1 > 0xEF)
b73bfc1c 3131 goto label_invalid_code;
682169fe 3132 if (c1 <= 0x9F || c1 >= 0xE0)
fb88bf2d 3133 {
54f78171
KH
3134 /* SJIS -> JISX0208 */
3135 ONE_MORE_BYTE (c2);
b73bfc1c
KH
3136 if (c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
3137 goto label_invalid_code;
3138 DECODE_SJIS (c1, c2, c1, c2);
3139 charset = charset_jisx0208;
5e34de15 3140 }
fb88bf2d 3141 else
b73bfc1c
KH
3142 /* SJIS -> JISX0201-Kana */
3143 charset = charset_katakana_jisx0201;
4ed46869 3144 }
fb88bf2d 3145 else
fb88bf2d 3146 {
54f78171 3147 /* BIG5 -> Big5 */
682169fe 3148 if (c1 < 0xA0 || c1 > 0xFE)
b73bfc1c
KH
3149 goto label_invalid_code;
3150 ONE_MORE_BYTE (c2);
3151 if (c2 < 0x40 || (c2 > 0x7E && c2 < 0xA1) || c2 > 0xFE)
3152 goto label_invalid_code;
3153 DECODE_BIG5 (c1, c2, charset, c1, c2);
4ed46869
KH
3154 }
3155 }
4ed46869 3156
b73bfc1c
KH
3157 c = DECODE_ISO_CHARACTER (charset, c1, c2);
3158 EMIT_CHAR (c);
fb88bf2d
KH
3159 continue;
3160
b73bfc1c
KH
3161 label_invalid_code:
3162 coding->errors++;
4ed46869 3163 src = src_base;
b73bfc1c
KH
3164 c = *src++;
3165 EMIT_CHAR (c);
fb88bf2d 3166 }
d46c5b12 3167
b73bfc1c
KH
3168 label_end_of_loop:
3169 coding->consumed = coding->consumed_char = src_base - source;
d46c5b12 3170 coding->produced = dst - destination;
b73bfc1c 3171 return;
4ed46869
KH
3172}
3173
3174/* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
b73bfc1c
KH
3175 This function can encode charsets `ascii', `katakana-jisx0201',
3176 `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'. We
3177 are sure that all these charsets are registered as official charset
4ed46869
KH
3178 (i.e. do not have extended leading-codes). Characters of other
3179 charsets are produced without any encoding. If SJIS_P is 1, encode
3180 SJIS text, else encode BIG5 text. */
3181
b73bfc1c 3182static void
4ed46869 3183encode_coding_sjis_big5 (coding, source, destination,
d46c5b12 3184 src_bytes, dst_bytes, sjis_p)
4ed46869
KH
3185 struct coding_system *coding;
3186 unsigned char *source, *destination;
3187 int src_bytes, dst_bytes;
4ed46869
KH
3188 int sjis_p;
3189{
3190 unsigned char *src = source;
3191 unsigned char *src_end = source + src_bytes;
3192 unsigned char *dst = destination;
3193 unsigned char *dst_end = destination + dst_bytes;
b73bfc1c
KH
3194 /* SRC_BASE remembers the start position in source in each loop.
3195 The loop will be exited when there's not enough source text to
3196 analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
3197 there's not enough destination area to produce encoded codes
3198 (within macro EMIT_BYTES). */
3199 unsigned char *src_base;
3200 Lisp_Object translation_table;
4ed46869 3201
b73bfc1c
KH
3202 if (NILP (Venable_character_translation))
3203 translation_table = Qnil;
3204 else
4ed46869 3205 {
39658efc 3206 translation_table = coding->translation_table_for_encode;
b73bfc1c 3207 if (NILP (translation_table))
39658efc 3208 translation_table = Vstandard_translation_table_for_encode;
b73bfc1c 3209 }
a5d301df 3210
b73bfc1c
KH
3211 while (1)
3212 {
3213 int c, charset, c1, c2;
4ed46869 3214
b73bfc1c
KH
3215 src_base = src;
3216 ONE_MORE_CHAR (c);
93dec019 3217
b73bfc1c
KH
3218 /* Now encode the character C. */
3219 if (SINGLE_BYTE_CHAR_P (c))
3220 {
3221 switch (c)
4ed46869 3222 {
b73bfc1c 3223 case '\r':
7371fe0a 3224 if (!(coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
b73bfc1c
KH
3225 {
3226 EMIT_ONE_BYTE (c);
3227 break;
3228 }
3229 c = '\n';
3230 case '\n':
3231 if (coding->eol_type == CODING_EOL_CRLF)
3232 {
3233 EMIT_TWO_BYTES ('\r', c);
3234 break;
3235 }
3236 else if (coding->eol_type == CODING_EOL_CR)
3237 c = '\r';
3238 default:
3239 EMIT_ONE_BYTE (c);
3240 }
3241 }
3242 else
3243 {
3244 SPLIT_CHAR (c, charset, c1, c2);
3245 if (sjis_p)
3246 {
3247 if (charset == charset_jisx0208
3248 || charset == charset_jisx0208_1978)
3249 {
3250 ENCODE_SJIS (c1, c2, c1, c2);
3251 EMIT_TWO_BYTES (c1, c2);
3252 }
39658efc
KH
3253 else if (charset == charset_katakana_jisx0201)
3254 EMIT_ONE_BYTE (c1 | 0x80);
fc53a214
KH
3255 else if (charset == charset_latin_jisx0201)
3256 EMIT_ONE_BYTE (c1);
0eecad43
KH
3257 else if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR)
3258 {
3259 EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER);
3260 if (CHARSET_WIDTH (charset) > 1)
3261 EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER);
3262 }
b73bfc1c
KH
3263 else
3264 /* There's no way other than producing the internal
3265 codes as is. */
3266 EMIT_BYTES (src_base, src);
4ed46869 3267 }
4ed46869 3268 else
b73bfc1c
KH
3269 {
3270 if (charset == charset_big5_1 || charset == charset_big5_2)
3271 {
3272 ENCODE_BIG5 (charset, c1, c2, c1, c2);
3273 EMIT_TWO_BYTES (c1, c2);
3274 }
0eecad43
KH
3275 else if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR)
3276 {
3277 EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER);
3278 if (CHARSET_WIDTH (charset) > 1)
3279 EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER);
3280 }
b73bfc1c
KH
3281 else
3282 /* There's no way other than producing the internal
3283 codes as is. */
3284 EMIT_BYTES (src_base, src);
3285 }
4ed46869 3286 }
b73bfc1c 3287 coding->consumed_char++;
4ed46869
KH
3288 }
3289
b73bfc1c
KH
3290 label_end_of_loop:
3291 coding->consumed = src_base - source;
d46c5b12 3292 coding->produced = coding->produced_char = dst - destination;
4ed46869
KH
3293}
3294
3295\f
1397dc18
KH
3296/*** 5. CCL handlers ***/
3297
3298/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3299 Check if a text is encoded in a coding system of which
3300 encoder/decoder are written in CCL program. If it is, return
3301 CODING_CATEGORY_MASK_CCL, else return 0. */
3302
0a28aafb
KH
3303static int
3304detect_coding_ccl (src, src_end, multibytep)
1397dc18 3305 unsigned char *src, *src_end;
0a28aafb 3306 int multibytep;
1397dc18
KH
3307{
3308 unsigned char *valid;
b73bfc1c
KH
3309 int c;
3310 /* Dummy for ONE_MORE_BYTE. */
3311 struct coding_system dummy_coding;
3312 struct coding_system *coding = &dummy_coding;
1397dc18
KH
3313
3314 /* No coding system is assigned to coding-category-ccl. */
3315 if (!coding_system_table[CODING_CATEGORY_IDX_CCL])
3316 return 0;
3317
3318 valid = coding_system_table[CODING_CATEGORY_IDX_CCL]->spec.ccl.valid_codes;
b73bfc1c 3319 while (1)
1397dc18 3320 {
0a28aafb 3321 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
b73bfc1c
KH
3322 if (! valid[c])
3323 return 0;
1397dc18 3324 }
b73bfc1c 3325 label_end_of_loop:
1397dc18
KH
3326 return CODING_CATEGORY_MASK_CCL;
3327}
3328
3329\f
3330/*** 6. End-of-line handlers ***/
4ed46869 3331
b73bfc1c 3332/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
4ed46869 3333
b73bfc1c 3334static void
d46c5b12 3335decode_eol (coding, source, destination, src_bytes, dst_bytes)
4ed46869 3336 struct coding_system *coding;
5bdca8af
DN
3337 const unsigned char *source;
3338 unsigned char *destination;
4ed46869 3339 int src_bytes, dst_bytes;
4ed46869 3340{
5bdca8af 3341 const unsigned char *src = source;
4ed46869 3342 unsigned char *dst = destination;
5bdca8af 3343 const unsigned char *src_end = src + src_bytes;
b73bfc1c
KH
3344 unsigned char *dst_end = dst + dst_bytes;
3345 Lisp_Object translation_table;
3346 /* SRC_BASE remembers the start position in source in each loop.
3347 The loop will be exited when there's not enough source code
3348 (within macro ONE_MORE_BYTE), or when there's not enough
3349 destination area to produce a character (within macro
3350 EMIT_CHAR). */
5bdca8af 3351 const unsigned char *src_base;
b73bfc1c
KH
3352 int c;
3353
3354 translation_table = Qnil;
4ed46869
KH
3355 switch (coding->eol_type)
3356 {
3357 case CODING_EOL_CRLF:
b73bfc1c 3358 while (1)
d46c5b12 3359 {
b73bfc1c
KH
3360 src_base = src;
3361 ONE_MORE_BYTE (c);
3362 if (c == '\r')
fb88bf2d 3363 {
b73bfc1c
KH
3364 ONE_MORE_BYTE (c);
3365 if (c != '\n')
3366 {
b73bfc1c
KH
3367 src--;
3368 c = '\r';
3369 }
fb88bf2d 3370 }
b73bfc1c
KH
3371 else if (c == '\n'
3372 && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL))
d46c5b12 3373 {
b73bfc1c
KH
3374 coding->result = CODING_FINISH_INCONSISTENT_EOL;
3375 goto label_end_of_loop;
d46c5b12 3376 }
b73bfc1c 3377 EMIT_CHAR (c);
d46c5b12 3378 }
b73bfc1c
KH
3379 break;
3380
3381 case CODING_EOL_CR:
3382 while (1)
d46c5b12 3383 {
b73bfc1c
KH
3384 src_base = src;
3385 ONE_MORE_BYTE (c);
3386 if (c == '\n')
3387 {
3388 if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
3389 {
3390 coding->result = CODING_FINISH_INCONSISTENT_EOL;
3391 goto label_end_of_loop;
3392 }
3393 }
3394 else if (c == '\r')
3395 c = '\n';
3396 EMIT_CHAR (c);
d46c5b12 3397 }
4ed46869
KH
3398 break;
3399
b73bfc1c
KH
3400 default: /* no need for EOL handling */
3401 while (1)
d46c5b12 3402 {
b73bfc1c
KH
3403 src_base = src;
3404 ONE_MORE_BYTE (c);
3405 EMIT_CHAR (c);
d46c5b12 3406 }
4ed46869
KH
3407 }
3408
b73bfc1c
KH
3409 label_end_of_loop:
3410 coding->consumed = coding->consumed_char = src_base - source;
3411 coding->produced = dst - destination;
3412 return;
4ed46869
KH
3413}
3414
3415/* See "GENERAL NOTES about `encode_coding_XXX ()' functions". Encode
b73bfc1c 3416 format of end-of-line according to `coding->eol_type'. It also
8ca3766a 3417 convert multibyte form 8-bit characters to unibyte if
b73bfc1c
KH
3418 CODING->src_multibyte is nonzero. If `coding->mode &
3419 CODING_MODE_SELECTIVE_DISPLAY' is nonzero, code '\r' in source text
3420 also means end-of-line. */
4ed46869 3421
b73bfc1c 3422static void
d46c5b12 3423encode_eol (coding, source, destination, src_bytes, dst_bytes)
4ed46869 3424 struct coding_system *coding;
a4244313
KR
3425 const unsigned char *source;
3426 unsigned char *destination;
4ed46869 3427 int src_bytes, dst_bytes;
4ed46869 3428{
a4244313 3429 const unsigned char *src = source;
4ed46869 3430 unsigned char *dst = destination;
a4244313 3431 const unsigned char *src_end = src + src_bytes;
b73bfc1c
KH
3432 unsigned char *dst_end = dst + dst_bytes;
3433 Lisp_Object translation_table;
3434 /* SRC_BASE remembers the start position in source in each loop.
3435 The loop will be exited when there's not enough source text to
3436 analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
3437 there's not enough destination area to produce encoded codes
3438 (within macro EMIT_BYTES). */
a4244313
KR
3439 const unsigned char *src_base;
3440 unsigned char *tmp;
b73bfc1c
KH
3441 int c;
3442 int selective_display = coding->mode & CODING_MODE_SELECTIVE_DISPLAY;
3443
3444 translation_table = Qnil;
3445 if (coding->src_multibyte
3446 && *(src_end - 1) == LEADING_CODE_8_BIT_CONTROL)
3447 {
3448 src_end--;
3449 src_bytes--;
3450 coding->result = CODING_FINISH_INSUFFICIENT_SRC;
3451 }
fb88bf2d 3452
d46c5b12
KH
3453 if (coding->eol_type == CODING_EOL_CRLF)
3454 {
b73bfc1c 3455 while (src < src_end)
d46c5b12 3456 {
b73bfc1c 3457 src_base = src;
d46c5b12 3458 c = *src++;
b73bfc1c
KH
3459 if (c >= 0x20)
3460 EMIT_ONE_BYTE (c);
3461 else if (c == '\n' || (c == '\r' && selective_display))
3462 EMIT_TWO_BYTES ('\r', '\n');
d46c5b12 3463 else
b73bfc1c 3464 EMIT_ONE_BYTE (c);
d46c5b12 3465 }
ff2b1ea9 3466 src_base = src;
b73bfc1c 3467 label_end_of_loop:
005f0d35 3468 ;
d46c5b12
KH
3469 }
3470 else
4ed46869 3471 {
78a629d2 3472 if (!dst_bytes || src_bytes <= dst_bytes)
4ed46869 3473 {
b73bfc1c
KH
3474 safe_bcopy (src, dst, src_bytes);
3475 src_base = src_end;
3476 dst += src_bytes;
d46c5b12 3477 }
d46c5b12 3478 else
b73bfc1c
KH
3479 {
3480 if (coding->src_multibyte
3481 && *(src + dst_bytes - 1) == LEADING_CODE_8_BIT_CONTROL)
3482 dst_bytes--;
3483 safe_bcopy (src, dst, dst_bytes);
3484 src_base = src + dst_bytes;
3485 dst = destination + dst_bytes;
3486 coding->result = CODING_FINISH_INSUFFICIENT_DST;
3487 }
993824c9 3488 if (coding->eol_type == CODING_EOL_CR)
d46c5b12 3489 {
a4244313
KR
3490 for (tmp = destination; tmp < dst; tmp++)
3491 if (*tmp == '\n') *tmp = '\r';
d46c5b12 3492 }
b73bfc1c 3493 else if (selective_display)
d46c5b12 3494 {
a4244313
KR
3495 for (tmp = destination; tmp < dst; tmp++)
3496 if (*tmp == '\r') *tmp = '\n';
4ed46869 3497 }
4ed46869 3498 }
b73bfc1c
KH
3499 if (coding->src_multibyte)
3500 dst = destination + str_as_unibyte (destination, dst - destination);
4ed46869 3501
b73bfc1c
KH
3502 coding->consumed = src_base - source;
3503 coding->produced = dst - destination;
78a629d2 3504 coding->produced_char = coding->produced;
4ed46869
KH
3505}
3506
3507\f
1397dc18 3508/*** 7. C library functions ***/
4ed46869 3509
cfb43547 3510/* In Emacs Lisp, a coding system is represented by a Lisp symbol which
4ed46869 3511 has a property `coding-system'. The value of this property is a
cfb43547 3512 vector of length 5 (called the coding-vector). Among elements of
4ed46869
KH
3513 this vector, the first (element[0]) and the fifth (element[4])
3514 carry important information for decoding/encoding. Before
3515 decoding/encoding, this information should be set in fields of a
3516 structure of type `coding_system'.
3517
cfb43547 3518 The value of the property `coding-system' can be a symbol of another
4ed46869
KH
3519 subsidiary coding-system. In that case, Emacs gets coding-vector
3520 from that symbol.
3521
3522 `element[0]' contains information to be set in `coding->type'. The
3523 value and its meaning is as follows:
3524
0ef69138
KH
3525 0 -- coding_type_emacs_mule
3526 1 -- coding_type_sjis
3527 2 -- coding_type_iso2022
3528 3 -- coding_type_big5
3529 4 -- coding_type_ccl encoder/decoder written in CCL
3530 nil -- coding_type_no_conversion
3531 t -- coding_type_undecided (automatic conversion on decoding,
3532 no-conversion on encoding)
4ed46869
KH
3533
3534 `element[4]' contains information to be set in `coding->flags' and
3535 `coding->spec'. The meaning varies by `coding->type'.
3536
3537 If `coding->type' is `coding_type_iso2022', element[4] is a vector
3538 of length 32 (of which the first 13 sub-elements are used now).
3539 Meanings of these sub-elements are:
3540
3541 sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
3542 If the value is an integer of valid charset, the charset is
3543 assumed to be designated to graphic register N initially.
3544
3545 If the value is minus, it is a minus value of charset which
3546 reserves graphic register N, which means that the charset is
3547 not designated initially but should be designated to graphic
3548 register N just before encoding a character in that charset.
3549
3550 If the value is nil, graphic register N is never used on
3551 encoding.
93dec019 3552
4ed46869
KH
3553 sub-element[N] where N is 4 through 11: to be set in `coding->flags'
3554 Each value takes t or nil. See the section ISO2022 of
3555 `coding.h' for more information.
3556
3557 If `coding->type' is `coding_type_big5', element[4] is t to denote
3558 BIG5-ETen or nil to denote BIG5-HKU.
3559
3560 If `coding->type' takes the other value, element[4] is ignored.
3561
cfb43547 3562 Emacs Lisp's coding systems also carry information about format of
4ed46869
KH
3563 end-of-line in a value of property `eol-type'. If the value is
3564 integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
3565 means CODING_EOL_CR. If it is not integer, it should be a vector
3566 of subsidiary coding systems of which property `eol-type' has one
cfb43547 3567 of the above values.
4ed46869
KH
3568
3569*/
3570
3571/* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
3572 and set it in CODING. If CODING_SYSTEM_SYMBOL is invalid, CODING
3573 is setup so that no conversion is necessary and return -1, else
3574 return 0. */
3575
3576int
e0e989f6
KH
3577setup_coding_system (coding_system, coding)
3578 Lisp_Object coding_system;
4ed46869
KH
3579 struct coding_system *coding;
3580{
d46c5b12 3581 Lisp_Object coding_spec, coding_type, eol_type, plist;
4608c386 3582 Lisp_Object val;
4ed46869 3583
c07c8e12
KH
3584 /* At first, zero clear all members. */
3585 bzero (coding, sizeof (struct coding_system));
3586
d46c5b12 3587 /* Initialize some fields required for all kinds of coding systems. */
774324d6 3588 coding->symbol = coding_system;
d46c5b12
KH
3589 coding->heading_ascii = -1;
3590 coding->post_read_conversion = coding->pre_write_conversion = Qnil;
ec6d2bb8
KH
3591 coding->composing = COMPOSITION_DISABLED;
3592 coding->cmp_data = NULL;
1f5dbf34
KH
3593
3594 if (NILP (coding_system))
3595 goto label_invalid_coding_system;
3596
4608c386 3597 coding_spec = Fget (coding_system, Qcoding_system);
1f5dbf34 3598
4608c386
KH
3599 if (!VECTORP (coding_spec)
3600 || XVECTOR (coding_spec)->size != 5
3601 || !CONSP (XVECTOR (coding_spec)->contents[3]))
4ed46869 3602 goto label_invalid_coding_system;
4608c386 3603
d46c5b12
KH
3604 eol_type = inhibit_eol_conversion ? Qnil : Fget (coding_system, Qeol_type);
3605 if (VECTORP (eol_type))
3606 {
3607 coding->eol_type = CODING_EOL_UNDECIDED;
3608 coding->common_flags = CODING_REQUIRE_DETECTION_MASK;
3609 }
3610 else if (XFASTINT (eol_type) == 1)
3611 {
3612 coding->eol_type = CODING_EOL_CRLF;
3613 coding->common_flags
3614 = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3615 }
3616 else if (XFASTINT (eol_type) == 2)
3617 {
3618 coding->eol_type = CODING_EOL_CR;
3619 coding->common_flags
3620 = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3621 }
3622 else
3623 coding->eol_type = CODING_EOL_LF;
3624
3625 coding_type = XVECTOR (coding_spec)->contents[0];
3626 /* Try short cut. */
3627 if (SYMBOLP (coding_type))
3628 {
3629 if (EQ (coding_type, Qt))
3630 {
3631 coding->type = coding_type_undecided;
3632 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
3633 }
3634 else
3635 coding->type = coding_type_no_conversion;
9b96232f
KH
3636 /* Initialize this member. Any thing other than
3637 CODING_CATEGORY_IDX_UTF_16_BE and
3638 CODING_CATEGORY_IDX_UTF_16_LE are ok because they have
3639 special treatment in detect_eol. */
3640 coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE;
3641
d46c5b12
KH
3642 return 0;
3643 }
3644
d46c5b12
KH
3645 /* Get values of coding system properties:
3646 `post-read-conversion', `pre-write-conversion',
f967223b 3647 `translation-table-for-decode', `translation-table-for-encode'. */
4608c386 3648 plist = XVECTOR (coding_spec)->contents[3];
b843d1ae 3649 /* Pre & post conversion functions should be disabled if
8ca3766a 3650 inhibit_eol_conversion is nonzero. This is the case that a code
b843d1ae
KH
3651 conversion function is called while those functions are running. */
3652 if (! inhibit_pre_post_conversion)
3653 {
3654 coding->post_read_conversion = Fplist_get (plist, Qpost_read_conversion);
3655 coding->pre_write_conversion = Fplist_get (plist, Qpre_write_conversion);
3656 }
f967223b 3657 val = Fplist_get (plist, Qtranslation_table_for_decode);
4608c386 3658 if (SYMBOLP (val))
f967223b
KH
3659 val = Fget (val, Qtranslation_table_for_decode);
3660 coding->translation_table_for_decode = CHAR_TABLE_P (val) ? val : Qnil;
3661 val = Fplist_get (plist, Qtranslation_table_for_encode);
4608c386 3662 if (SYMBOLP (val))
f967223b
KH
3663 val = Fget (val, Qtranslation_table_for_encode);
3664 coding->translation_table_for_encode = CHAR_TABLE_P (val) ? val : Qnil;
d46c5b12
KH
3665 val = Fplist_get (plist, Qcoding_category);
3666 if (!NILP (val))
3667 {
3668 val = Fget (val, Qcoding_category_index);
3669 if (INTEGERP (val))
3670 coding->category_idx = XINT (val);
3671 else
3672 goto label_invalid_coding_system;
3673 }
3674 else
3675 goto label_invalid_coding_system;
93dec019 3676
ec6d2bb8
KH
3677 /* If the coding system has non-nil `composition' property, enable
3678 composition handling. */
3679 val = Fplist_get (plist, Qcomposition);
3680 if (!NILP (val))
3681 coding->composing = COMPOSITION_NO;
3682
d46c5b12 3683 switch (XFASTINT (coding_type))
4ed46869
KH
3684 {
3685 case 0:
0ef69138 3686 coding->type = coding_type_emacs_mule;
aa72b389
KH
3687 coding->common_flags
3688 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
c952af22
KH
3689 if (!NILP (coding->post_read_conversion))
3690 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
3691 if (!NILP (coding->pre_write_conversion))
3692 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
4ed46869
KH
3693 break;
3694
3695 case 1:
3696 coding->type = coding_type_sjis;
c952af22
KH
3697 coding->common_flags
3698 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
4ed46869
KH
3699 break;
3700
3701 case 2:
3702 coding->type = coding_type_iso2022;
c952af22
KH
3703 coding->common_flags
3704 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
4ed46869 3705 {
70c22245 3706 Lisp_Object val, temp;
4ed46869 3707 Lisp_Object *flags;
d46c5b12 3708 int i, charset, reg_bits = 0;
4ed46869 3709
4608c386 3710 val = XVECTOR (coding_spec)->contents[4];
f44d27ce 3711
4ed46869
KH
3712 if (!VECTORP (val) || XVECTOR (val)->size != 32)
3713 goto label_invalid_coding_system;
3714
3715 flags = XVECTOR (val)->contents;
3716 coding->flags
3717 = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
3718 | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
3719 | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
3720 | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
3721 | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
3722 | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
3723 | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
3724 | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
e0e989f6
KH
3725 | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION)
3726 | (NILP (flags[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL)
c4825358
KH
3727 | (NILP (flags[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL)
3728 | (NILP (flags[15]) ? 0 : CODING_FLAG_ISO_SAFE)
3f003981 3729 | (NILP (flags[16]) ? 0 : CODING_FLAG_ISO_LATIN_EXTRA)
c4825358 3730 );
4ed46869
KH
3731
3732 /* Invoke graphic register 0 to plane 0. */
3733 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
3734 /* Invoke graphic register 1 to plane 1 if we can use full 8-bit. */
3735 CODING_SPEC_ISO_INVOCATION (coding, 1)
3736 = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
3737 /* Not single shifting at first. */
6e85d753 3738 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;
e0e989f6 3739 /* Beginning of buffer should also be regarded as bol. */
6e85d753 3740 CODING_SPEC_ISO_BOL (coding) = 1;
4ed46869 3741
70c22245
KH
3742 for (charset = 0; charset <= MAX_CHARSET; charset++)
3743 CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = 255;
3744 val = Vcharset_revision_alist;
3745 while (CONSP (val))
3746 {
03699b14 3747 charset = get_charset_id (Fcar_safe (XCAR (val)));
70c22245 3748 if (charset >= 0
03699b14 3749 && (temp = Fcdr_safe (XCAR (val)), INTEGERP (temp))
70c22245
KH
3750 && (i = XINT (temp), (i >= 0 && (i + '@') < 128)))
3751 CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = i;
03699b14 3752 val = XCDR (val);
70c22245
KH
3753 }
3754
4ed46869
KH
3755 /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
3756 FLAGS[REG] can be one of below:
3757 integer CHARSET: CHARSET occupies register I,
3758 t: designate nothing to REG initially, but can be used
3759 by any charsets,
3760 list of integer, nil, or t: designate the first
3761 element (if integer) to REG initially, the remaining
3762 elements (if integer) is designated to REG on request,
d46c5b12 3763 if an element is t, REG can be used by any charsets,
4ed46869 3764 nil: REG is never used. */
467e7675 3765 for (charset = 0; charset <= MAX_CHARSET; charset++)
1ba9e4ab
KH
3766 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3767 = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION;
4ed46869
KH
3768 for (i = 0; i < 4; i++)
3769 {
87323294
PJ
3770 if ((INTEGERP (flags[i])
3771 && (charset = XINT (flags[i]), CHARSET_VALID_P (charset)))
e0e989f6 3772 || (charset = get_charset_id (flags[i])) >= 0)
4ed46869
KH
3773 {
3774 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3775 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
3776 }
3777 else if (EQ (flags[i], Qt))
3778 {
3779 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
d46c5b12
KH
3780 reg_bits |= 1 << i;
3781 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
4ed46869
KH
3782 }
3783 else if (CONSP (flags[i]))
3784 {
84d60297
RS
3785 Lisp_Object tail;
3786 tail = flags[i];
4ed46869 3787
d46c5b12 3788 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
87323294
PJ
3789 if ((INTEGERP (XCAR (tail))
3790 && (charset = XINT (XCAR (tail)),
3791 CHARSET_VALID_P (charset)))
03699b14 3792 || (charset = get_charset_id (XCAR (tail))) >= 0)
4ed46869
KH
3793 {
3794 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3795 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
3796 }
3797 else
3798 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
03699b14 3799 tail = XCDR (tail);
4ed46869
KH
3800 while (CONSP (tail))
3801 {
87323294
PJ
3802 if ((INTEGERP (XCAR (tail))
3803 && (charset = XINT (XCAR (tail)),
3804 CHARSET_VALID_P (charset)))
03699b14 3805 || (charset = get_charset_id (XCAR (tail))) >= 0)
70c22245
KH
3806 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3807 = i;
03699b14 3808 else if (EQ (XCAR (tail), Qt))
d46c5b12 3809 reg_bits |= 1 << i;
03699b14 3810 tail = XCDR (tail);
4ed46869
KH
3811 }
3812 }
3813 else
3814 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
93dec019 3815
4ed46869
KH
3816 CODING_SPEC_ISO_DESIGNATION (coding, i)
3817 = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
3818 }
3819
d46c5b12 3820 if (reg_bits && ! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
4ed46869
KH
3821 {
3822 /* REG 1 can be used only by locking shift in 7-bit env. */
3823 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
d46c5b12 3824 reg_bits &= ~2;
4ed46869
KH
3825 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
3826 /* Without any shifting, only REG 0 and 1 can be used. */
d46c5b12 3827 reg_bits &= 3;
4ed46869
KH
3828 }
3829
d46c5b12
KH
3830 if (reg_bits)
3831 for (charset = 0; charset <= MAX_CHARSET; charset++)
6e85d753 3832 {
928a85c1 3833 if (CHARSET_DEFINED_P (charset)
96148065
KH
3834 && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3835 == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
d46c5b12
KH
3836 {
3837 /* There exist some default graphic registers to be
96148065 3838 used by CHARSET. */
d46c5b12
KH
3839
3840 /* We had better avoid designating a charset of
3841 CHARS96 to REG 0 as far as possible. */
3842 if (CHARSET_CHARS (charset) == 96)
3843 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3844 = (reg_bits & 2
3845 ? 1 : (reg_bits & 4 ? 2 : (reg_bits & 8 ? 3 : 0)));
3846 else
3847 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3848 = (reg_bits & 1
3849 ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
3850 }
6e85d753 3851 }
4ed46869 3852 }
c952af22 3853 coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
d46c5b12 3854 coding->spec.iso2022.last_invalid_designation_register = -1;
4ed46869
KH
3855 break;
3856
3857 case 3:
3858 coding->type = coding_type_big5;
c952af22
KH
3859 coding->common_flags
3860 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
4ed46869 3861 coding->flags
4608c386 3862 = (NILP (XVECTOR (coding_spec)->contents[4])
4ed46869
KH
3863 ? CODING_FLAG_BIG5_HKU
3864 : CODING_FLAG_BIG5_ETEN);
3865 break;
3866
3867 case 4:
3868 coding->type = coding_type_ccl;
c952af22
KH
3869 coding->common_flags
3870 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
4ed46869 3871 {
84d60297 3872 val = XVECTOR (coding_spec)->contents[4];
ef4ced28
KH
3873 if (! CONSP (val)
3874 || setup_ccl_program (&(coding->spec.ccl.decoder),
03699b14 3875 XCAR (val)) < 0
ef4ced28 3876 || setup_ccl_program (&(coding->spec.ccl.encoder),
03699b14 3877 XCDR (val)) < 0)
4ed46869 3878 goto label_invalid_coding_system;
1397dc18
KH
3879
3880 bzero (coding->spec.ccl.valid_codes, 256);
3881 val = Fplist_get (plist, Qvalid_codes);
3882 if (CONSP (val))
3883 {
3884 Lisp_Object this;
3885
03699b14 3886 for (; CONSP (val); val = XCDR (val))
1397dc18 3887 {
03699b14 3888 this = XCAR (val);
1397dc18
KH
3889 if (INTEGERP (this)
3890 && XINT (this) >= 0 && XINT (this) < 256)
3891 coding->spec.ccl.valid_codes[XINT (this)] = 1;
3892 else if (CONSP (this)
03699b14
KR
3893 && INTEGERP (XCAR (this))
3894 && INTEGERP (XCDR (this)))
1397dc18 3895 {
03699b14
KR
3896 int start = XINT (XCAR (this));
3897 int end = XINT (XCDR (this));
1397dc18
KH
3898
3899 if (start >= 0 && start <= end && end < 256)
e133c8fa 3900 while (start <= end)
1397dc18
KH
3901 coding->spec.ccl.valid_codes[start++] = 1;
3902 }
3903 }
3904 }
4ed46869 3905 }
c952af22 3906 coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
aaaf0b1e 3907 coding->spec.ccl.cr_carryover = 0;
1c3478b0 3908 coding->spec.ccl.eight_bit_carryover[0] = 0;
4ed46869
KH
3909 break;
3910
27901516
KH
3911 case 5:
3912 coding->type = coding_type_raw_text;
3913 break;
3914
4ed46869 3915 default:
d46c5b12 3916 goto label_invalid_coding_system;
4ed46869
KH
3917 }
3918 return 0;
3919
3920 label_invalid_coding_system:
3921 coding->type = coding_type_no_conversion;
d46c5b12 3922 coding->category_idx = CODING_CATEGORY_IDX_BINARY;
c952af22 3923 coding->common_flags = 0;
dec137e5 3924 coding->eol_type = CODING_EOL_LF;
d46c5b12 3925 coding->pre_write_conversion = coding->post_read_conversion = Qnil;
4ed46869
KH
3926 return -1;
3927}
3928
ec6d2bb8
KH
3929/* Free memory blocks allocated for storing composition information. */
3930
3931void
3932coding_free_composition_data (coding)
3933 struct coding_system *coding;
3934{
3935 struct composition_data *cmp_data = coding->cmp_data, *next;
3936
3937 if (!cmp_data)
3938 return;
3939 /* Memory blocks are chained. At first, rewind to the first, then,
3940 free blocks one by one. */
3941 while (cmp_data->prev)
3942 cmp_data = cmp_data->prev;
3943 while (cmp_data)
3944 {
3945 next = cmp_data->next;
3946 xfree (cmp_data);
3947 cmp_data = next;
3948 }
3949 coding->cmp_data = NULL;
3950}
3951
3952/* Set `char_offset' member of all memory blocks pointed by
3953 coding->cmp_data to POS. */
3954
3955void
3956coding_adjust_composition_offset (coding, pos)
3957 struct coding_system *coding;
3958 int pos;
3959{
3960 struct composition_data *cmp_data;
3961
3962 for (cmp_data = coding->cmp_data; cmp_data; cmp_data = cmp_data->next)
3963 cmp_data->char_offset = pos;
3964}
3965
54f78171
KH
3966/* Setup raw-text or one of its subsidiaries in the structure
3967 coding_system CODING according to the already setup value eol_type
3968 in CODING. CODING should be setup for some coding system in
3969 advance. */
3970
3971void
3972setup_raw_text_coding_system (coding)
3973 struct coding_system *coding;
3974{
3975 if (coding->type != coding_type_raw_text)
3976 {
3977 coding->symbol = Qraw_text;
3978 coding->type = coding_type_raw_text;
3979 if (coding->eol_type != CODING_EOL_UNDECIDED)
3980 {
84d60297
RS
3981 Lisp_Object subsidiaries;
3982 subsidiaries = Fget (Qraw_text, Qeol_type);
54f78171
KH
3983
3984 if (VECTORP (subsidiaries)
3985 && XVECTOR (subsidiaries)->size == 3)
3986 coding->symbol
3987 = XVECTOR (subsidiaries)->contents[coding->eol_type];
3988 }
716e0b0a 3989 setup_coding_system (coding->symbol, coding);
54f78171
KH
3990 }
3991 return;
3992}
3993
4ed46869
KH
3994/* Emacs has a mechanism to automatically detect a coding system if it
3995 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
3996 it's impossible to distinguish some coding systems accurately
3997 because they use the same range of codes. So, at first, coding
3998 systems are categorized into the following categories:
3999
0ef69138 4000 o coding-category-emacs-mule
4ed46869
KH
4001
4002 The category for a coding system which has the same code range
4003 as Emacs' internal format. Assigned the coding-system (Lisp
0ef69138 4004 symbol) `emacs-mule' by default.
4ed46869
KH
4005
4006 o coding-category-sjis
4007
4008 The category for a coding system which has the same code range
4009 as SJIS. Assigned the coding-system (Lisp
7717c392 4010 symbol) `japanese-shift-jis' by default.
4ed46869
KH
4011
4012 o coding-category-iso-7
4013
4014 The category for a coding system which has the same code range
7717c392 4015 as ISO2022 of 7-bit environment. This doesn't use any locking
d46c5b12
KH
4016 shift and single shift functions. This can encode/decode all
4017 charsets. Assigned the coding-system (Lisp symbol)
4018 `iso-2022-7bit' by default.
4019
4020 o coding-category-iso-7-tight
4021
4022 Same as coding-category-iso-7 except that this can
4023 encode/decode only the specified charsets.
4ed46869
KH
4024
4025 o coding-category-iso-8-1
4026
4027 The category for a coding system which has the same code range
4028 as ISO2022 of 8-bit environment and graphic plane 1 used only
7717c392
KH
4029 for DIMENSION1 charset. This doesn't use any locking shift
4030 and single shift functions. Assigned the coding-system (Lisp
4031 symbol) `iso-latin-1' by default.
4ed46869
KH
4032
4033 o coding-category-iso-8-2
4034
4035 The category for a coding system which has the same code range
4036 as ISO2022 of 8-bit environment and graphic plane 1 used only
7717c392
KH
4037 for DIMENSION2 charset. This doesn't use any locking shift
4038 and single shift functions. Assigned the coding-system (Lisp
4039 symbol) `japanese-iso-8bit' by default.
4ed46869 4040
7717c392 4041 o coding-category-iso-7-else
4ed46869
KH
4042
4043 The category for a coding system which has the same code range
8ca3766a 4044 as ISO2022 of 7-bit environment but uses locking shift or
7717c392
KH
4045 single shift functions. Assigned the coding-system (Lisp
4046 symbol) `iso-2022-7bit-lock' by default.
4047
4048 o coding-category-iso-8-else
4049
4050 The category for a coding system which has the same code range
8ca3766a 4051 as ISO2022 of 8-bit environment but uses locking shift or
7717c392
KH
4052 single shift functions. Assigned the coding-system (Lisp
4053 symbol) `iso-2022-8bit-ss2' by default.
4ed46869
KH
4054
4055 o coding-category-big5
4056
4057 The category for a coding system which has the same code range
4058 as BIG5. Assigned the coding-system (Lisp symbol)
e0e989f6 4059 `cn-big5' by default.
4ed46869 4060
fa42c37f
KH
4061 o coding-category-utf-8
4062
4063 The category for a coding system which has the same code range
38b92c42 4064 as UTF-8 (cf. RFC3629). Assigned the coding-system (Lisp
fa42c37f
KH
4065 symbol) `utf-8' by default.
4066
4067 o coding-category-utf-16-be
4068
4069 The category for a coding system in which a text has an
4070 Unicode signature (cf. Unicode Standard) in the order of BIG
4071 endian at the head. Assigned the coding-system (Lisp symbol)
4072 `utf-16-be' by default.
4073
4074 o coding-category-utf-16-le
4075
4076 The category for a coding system in which a text has an
4077 Unicode signature (cf. Unicode Standard) in the order of
4078 LITTLE endian at the head. Assigned the coding-system (Lisp
4079 symbol) `utf-16-le' by default.
4080
1397dc18
KH
4081 o coding-category-ccl
4082
4083 The category for a coding system of which encoder/decoder is
4084 written in CCL programs. The default value is nil, i.e., no
4085 coding system is assigned.
4086
4ed46869
KH
4087 o coding-category-binary
4088
4089 The category for a coding system not categorized in any of the
4090 above. Assigned the coding-system (Lisp symbol)
e0e989f6 4091 `no-conversion' by default.
4ed46869
KH
4092
4093 Each of them is a Lisp symbol and the value is an actual
cfb43547 4094 `coding-system' (this is also a Lisp symbol) assigned by a user.
4ed46869
KH
4095 What Emacs does actually is to detect a category of coding system.
4096 Then, it uses a `coding-system' assigned to it. If Emacs can't
cfb43547 4097 decide a single possible category, it selects a category of the
4ed46869
KH
4098 highest priority. Priorities of categories are also specified by a
4099 user in a Lisp variable `coding-category-list'.
4100
4101*/
4102
66cfb530
KH
4103static
4104int ascii_skip_code[256];
4105
d46c5b12 4106/* Detect how a text of length SRC_BYTES pointed by SOURCE is encoded.
4ed46869
KH
4107 If it detects possible coding systems, return an integer in which
4108 appropriate flag bits are set. Flag bits are defined by macros
fa42c37f
KH
4109 CODING_CATEGORY_MASK_XXX in `coding.h'. If PRIORITIES is non-NULL,
4110 it should point the table `coding_priorities'. In that case, only
4111 the flag bit for a coding system of the highest priority is set in
0a28aafb
KH
4112 the returned value. If MULTIBYTEP is nonzero, 8-bit codes of the
4113 range 0x80..0x9F are in multibyte form.
4ed46869 4114
d46c5b12
KH
4115 How many ASCII characters are at the head is returned as *SKIP. */
4116
4117static int
0a28aafb 4118detect_coding_mask (source, src_bytes, priorities, skip, multibytep)
d46c5b12
KH
4119 unsigned char *source;
4120 int src_bytes, *priorities, *skip;
0a28aafb 4121 int multibytep;
4ed46869
KH
4122{
4123 register unsigned char c;
d46c5b12 4124 unsigned char *src = source, *src_end = source + src_bytes;
fa42c37f 4125 unsigned int mask, utf16_examined_p, iso2022_examined_p;
da55a2b7 4126 int i;
4ed46869
KH
4127
4128 /* At first, skip all ASCII characters and control characters except
4129 for three ISO2022 specific control characters. */
66cfb530
KH
4130 ascii_skip_code[ISO_CODE_SO] = 0;
4131 ascii_skip_code[ISO_CODE_SI] = 0;
4132 ascii_skip_code[ISO_CODE_ESC] = 0;
4133
bcf26d6a 4134 label_loop_detect_coding:
66cfb530 4135 while (src < src_end && ascii_skip_code[*src]) src++;
d46c5b12 4136 *skip = src - source;
4ed46869
KH
4137
4138 if (src >= src_end)
4139 /* We found nothing other than ASCII. There's nothing to do. */
d46c5b12 4140 return 0;
4ed46869 4141
8a8147d6 4142 c = *src;
4ed46869
KH
4143 /* The text seems to be encoded in some multilingual coding system.
4144 Now, try to find in which coding system the text is encoded. */
4145 if (c < 0x80)
bcf26d6a
KH
4146 {
4147 /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
4148 /* C is an ISO2022 specific control code of C0. */
0a28aafb 4149 mask = detect_coding_iso2022 (src, src_end, multibytep);
1b2af4b0 4150 if (mask == 0)
d46c5b12
KH
4151 {
4152 /* No valid ISO2022 code follows C. Try again. */
4153 src++;
66cfb530
KH
4154 if (c == ISO_CODE_ESC)
4155 ascii_skip_code[ISO_CODE_ESC] = 1;
4156 else
4157 ascii_skip_code[ISO_CODE_SO] = ascii_skip_code[ISO_CODE_SI] = 1;
d46c5b12
KH
4158 goto label_loop_detect_coding;
4159 }
4160 if (priorities)
fa42c37f
KH
4161 {
4162 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
4163 {
4164 if (mask & priorities[i])
4165 return priorities[i];
4166 }
4167 return CODING_CATEGORY_MASK_RAW_TEXT;
4168 }
bcf26d6a 4169 }
d46c5b12 4170 else
c4825358 4171 {
d46c5b12 4172 int try;
4ed46869 4173
0a28aafb 4174 if (multibytep && c == LEADING_CODE_8_BIT_CONTROL)
67091e59 4175 c = src[1] - 0x20;
0a28aafb 4176
d46c5b12
KH
4177 if (c < 0xA0)
4178 {
4179 /* C is the first byte of SJIS character code,
fa42c37f
KH
4180 or a leading-code of Emacs' internal format (emacs-mule),
4181 or the first byte of UTF-16. */
4182 try = (CODING_CATEGORY_MASK_SJIS
4183 | CODING_CATEGORY_MASK_EMACS_MULE
4184 | CODING_CATEGORY_MASK_UTF_16_BE
4185 | CODING_CATEGORY_MASK_UTF_16_LE);
d46c5b12
KH
4186
4187 /* Or, if C is a special latin extra code,
93dec019 4188 or is an ISO2022 specific control code of C1 (SS2 or SS3),
d46c5b12
KH
4189 or is an ISO2022 control-sequence-introducer (CSI),
4190 we should also consider the possibility of ISO2022 codings. */
4191 if ((VECTORP (Vlatin_extra_code_table)
4192 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
4193 || (c == ISO_CODE_SS2 || c == ISO_CODE_SS3)
4194 || (c == ISO_CODE_CSI
4195 && (src < src_end
4196 && (*src == ']'
4197 || ((*src == '0' || *src == '1' || *src == '2')
4198 && src + 1 < src_end
4199 && src[1] == ']')))))
4200 try |= (CODING_CATEGORY_MASK_ISO_8_ELSE
4201 | CODING_CATEGORY_MASK_ISO_8BIT);
4202 }
c4825358 4203 else
d46c5b12
KH
4204 /* C is a character of ISO2022 in graphic plane right,
4205 or a SJIS's 1-byte character code (i.e. JISX0201),
fa42c37f
KH
4206 or the first byte of BIG5's 2-byte code,
4207 or the first byte of UTF-8/16. */
d46c5b12
KH
4208 try = (CODING_CATEGORY_MASK_ISO_8_ELSE
4209 | CODING_CATEGORY_MASK_ISO_8BIT
4210 | CODING_CATEGORY_MASK_SJIS
fa42c37f
KH
4211 | CODING_CATEGORY_MASK_BIG5
4212 | CODING_CATEGORY_MASK_UTF_8
4213 | CODING_CATEGORY_MASK_UTF_16_BE
4214 | CODING_CATEGORY_MASK_UTF_16_LE);
d46c5b12 4215
1397dc18
KH
4216 /* Or, we may have to consider the possibility of CCL. */
4217 if (coding_system_table[CODING_CATEGORY_IDX_CCL]
4218 && (coding_system_table[CODING_CATEGORY_IDX_CCL]
4219 ->spec.ccl.valid_codes)[c])
4220 try |= CODING_CATEGORY_MASK_CCL;
4221
d46c5b12 4222 mask = 0;
fa42c37f 4223 utf16_examined_p = iso2022_examined_p = 0;
d46c5b12
KH
4224 if (priorities)
4225 {
4226 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
4227 {
fa42c37f
KH
4228 if (!iso2022_examined_p
4229 && (priorities[i] & try & CODING_CATEGORY_MASK_ISO))
4230 {
0192762c 4231 mask |= detect_coding_iso2022 (src, src_end, multibytep);
fa42c37f
KH
4232 iso2022_examined_p = 1;
4233 }
5ab13dd0 4234 else if (priorities[i] & try & CODING_CATEGORY_MASK_SJIS)
0a28aafb 4235 mask |= detect_coding_sjis (src, src_end, multibytep);
fa42c37f 4236 else if (priorities[i] & try & CODING_CATEGORY_MASK_UTF_8)
0a28aafb 4237 mask |= detect_coding_utf_8 (src, src_end, multibytep);
fa42c37f
KH
4238 else if (!utf16_examined_p
4239 && (priorities[i] & try &
4240 CODING_CATEGORY_MASK_UTF_16_BE_LE))
4241 {
0a28aafb 4242 mask |= detect_coding_utf_16 (src, src_end, multibytep);
fa42c37f
KH
4243 utf16_examined_p = 1;
4244 }
5ab13dd0 4245 else if (priorities[i] & try & CODING_CATEGORY_MASK_BIG5)
0a28aafb 4246 mask |= detect_coding_big5 (src, src_end, multibytep);
5ab13dd0 4247 else if (priorities[i] & try & CODING_CATEGORY_MASK_EMACS_MULE)
0a28aafb 4248 mask |= detect_coding_emacs_mule (src, src_end, multibytep);
89fa8b36 4249 else if (priorities[i] & try & CODING_CATEGORY_MASK_CCL)
0a28aafb 4250 mask |= detect_coding_ccl (src, src_end, multibytep);
5ab13dd0 4251 else if (priorities[i] & CODING_CATEGORY_MASK_RAW_TEXT)
fa42c37f 4252 mask |= CODING_CATEGORY_MASK_RAW_TEXT;
5ab13dd0 4253 else if (priorities[i] & CODING_CATEGORY_MASK_BINARY)
fa42c37f
KH
4254 mask |= CODING_CATEGORY_MASK_BINARY;
4255 if (mask & priorities[i])
4256 return priorities[i];
d46c5b12
KH
4257 }
4258 return CODING_CATEGORY_MASK_RAW_TEXT;
4259 }
4260 if (try & CODING_CATEGORY_MASK_ISO)
0a28aafb 4261 mask |= detect_coding_iso2022 (src, src_end, multibytep);
d46c5b12 4262 if (try & CODING_CATEGORY_MASK_SJIS)
0a28aafb 4263 mask |= detect_coding_sjis (src, src_end, multibytep);
d46c5b12 4264 if (try & CODING_CATEGORY_MASK_BIG5)
0a28aafb 4265 mask |= detect_coding_big5 (src, src_end, multibytep);
fa42c37f 4266 if (try & CODING_CATEGORY_MASK_UTF_8)
0a28aafb 4267 mask |= detect_coding_utf_8 (src, src_end, multibytep);
fa42c37f 4268 if (try & CODING_CATEGORY_MASK_UTF_16_BE_LE)
0a28aafb 4269 mask |= detect_coding_utf_16 (src, src_end, multibytep);
d46c5b12 4270 if (try & CODING_CATEGORY_MASK_EMACS_MULE)
0a28aafb 4271 mask |= detect_coding_emacs_mule (src, src_end, multibytep);
1397dc18 4272 if (try & CODING_CATEGORY_MASK_CCL)
0a28aafb 4273 mask |= detect_coding_ccl (src, src_end, multibytep);
c4825358 4274 }
5ab13dd0 4275 return (mask | CODING_CATEGORY_MASK_RAW_TEXT | CODING_CATEGORY_MASK_BINARY);
4ed46869
KH
4276}
4277
4278/* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
4279 The information of the detected coding system is set in CODING. */
4280
4281void
4282detect_coding (coding, src, src_bytes)
4283 struct coding_system *coding;
a4244313 4284 const unsigned char *src;
4ed46869
KH
4285 int src_bytes;
4286{
d46c5b12 4287 unsigned int idx;
da55a2b7 4288 int skip, mask;
84d60297 4289 Lisp_Object val;
4ed46869 4290
84d60297 4291 val = Vcoding_category_list;
64c1e55f
KH
4292 mask = detect_coding_mask (src, src_bytes, coding_priorities, &skip,
4293 coding->src_multibyte);
d46c5b12 4294 coding->heading_ascii = skip;
4ed46869 4295
d46c5b12
KH
4296 if (!mask) return;
4297
4298 /* We found a single coding system of the highest priority in MASK. */
4299 idx = 0;
4300 while (mask && ! (mask & 1)) mask >>= 1, idx++;
4301 if (! mask)
4302 idx = CODING_CATEGORY_IDX_RAW_TEXT;
4ed46869 4303
f5c1dd0d 4304 val = SYMBOL_VALUE (XVECTOR (Vcoding_category_table)->contents[idx]);
d46c5b12
KH
4305
4306 if (coding->eol_type != CODING_EOL_UNDECIDED)
27901516 4307 {
84d60297 4308 Lisp_Object tmp;
d46c5b12 4309
84d60297 4310 tmp = Fget (val, Qeol_type);
d46c5b12
KH
4311 if (VECTORP (tmp))
4312 val = XVECTOR (tmp)->contents[coding->eol_type];
4ed46869 4313 }
b73bfc1c
KH
4314
4315 /* Setup this new coding system while preserving some slots. */
4316 {
4317 int src_multibyte = coding->src_multibyte;
4318 int dst_multibyte = coding->dst_multibyte;
4319
4320 setup_coding_system (val, coding);
4321 coding->src_multibyte = src_multibyte;
4322 coding->dst_multibyte = dst_multibyte;
4323 coding->heading_ascii = skip;
4324 }
4ed46869
KH
4325}
4326
d46c5b12
KH
4327/* Detect how end-of-line of a text of length SRC_BYTES pointed by
4328 SOURCE is encoded. Return one of CODING_EOL_LF, CODING_EOL_CRLF,
4329 CODING_EOL_CR, and CODING_EOL_UNDECIDED.
4330
4331 How many non-eol characters are at the head is returned as *SKIP. */
4ed46869 4332
bc4bc72a
RS
4333#define MAX_EOL_CHECK_COUNT 3
4334
d46c5b12
KH
4335static int
4336detect_eol_type (source, src_bytes, skip)
4337 unsigned char *source;
4338 int src_bytes, *skip;
4ed46869 4339{
d46c5b12 4340 unsigned char *src = source, *src_end = src + src_bytes;
4ed46869 4341 unsigned char c;
bc4bc72a
RS
4342 int total = 0; /* How many end-of-lines are found so far. */
4343 int eol_type = CODING_EOL_UNDECIDED;
4344 int this_eol_type;
4ed46869 4345
d46c5b12
KH
4346 *skip = 0;
4347
bc4bc72a 4348 while (src < src_end && total < MAX_EOL_CHECK_COUNT)
4ed46869
KH
4349 {
4350 c = *src++;
bc4bc72a 4351 if (c == '\n' || c == '\r')
4ed46869 4352 {
d46c5b12
KH
4353 if (*skip == 0)
4354 *skip = src - 1 - source;
bc4bc72a
RS
4355 total++;
4356 if (c == '\n')
4357 this_eol_type = CODING_EOL_LF;
4358 else if (src >= src_end || *src != '\n')
4359 this_eol_type = CODING_EOL_CR;
4ed46869 4360 else
bc4bc72a
RS
4361 this_eol_type = CODING_EOL_CRLF, src++;
4362
4363 if (eol_type == CODING_EOL_UNDECIDED)
4364 /* This is the first end-of-line. */
4365 eol_type = this_eol_type;
4366 else if (eol_type != this_eol_type)
d46c5b12
KH
4367 {
4368 /* The found type is different from what found before. */
4369 eol_type = CODING_EOL_INCONSISTENT;
4370 break;
4371 }
4ed46869
KH
4372 }
4373 }
bc4bc72a 4374
d46c5b12
KH
4375 if (*skip == 0)
4376 *skip = src_end - source;
85a02ca4 4377 return eol_type;
4ed46869
KH
4378}
4379
fa42c37f
KH
4380/* Like detect_eol_type, but detect EOL type in 2-octet
4381 big-endian/little-endian format for coding systems utf-16-be and
4382 utf-16-le. */
4383
4384static int
4385detect_eol_type_in_2_octet_form (source, src_bytes, skip, big_endian_p)
4386 unsigned char *source;
cfb43547 4387 int src_bytes, *skip, big_endian_p;
fa42c37f
KH
4388{
4389 unsigned char *src = source, *src_end = src + src_bytes;
4390 unsigned int c1, c2;
4391 int total = 0; /* How many end-of-lines are found so far. */
4392 int eol_type = CODING_EOL_UNDECIDED;
4393 int this_eol_type;
4394 int msb, lsb;
4395
4396 if (big_endian_p)
4397 msb = 0, lsb = 1;
4398 else
4399 msb = 1, lsb = 0;
4400
4401 *skip = 0;
4402
4403 while ((src + 1) < src_end && total < MAX_EOL_CHECK_COUNT)
4404 {
4405 c1 = (src[msb] << 8) | (src[lsb]);
4406 src += 2;
4407
4408 if (c1 == '\n' || c1 == '\r')
4409 {
4410 if (*skip == 0)
4411 *skip = src - 2 - source;
4412 total++;
4413 if (c1 == '\n')
4414 {
4415 this_eol_type = CODING_EOL_LF;
4416 }
4417 else
4418 {
4419 if ((src + 1) >= src_end)
4420 {
4421 this_eol_type = CODING_EOL_CR;
4422 }
4423 else
4424 {
4425 c2 = (src[msb] << 8) | (src[lsb]);
4426 if (c2 == '\n')
4427 this_eol_type = CODING_EOL_CRLF, src += 2;
4428 else
4429 this_eol_type = CODING_EOL_CR;
4430 }
4431 }
4432
4433 if (eol_type == CODING_EOL_UNDECIDED)
4434 /* This is the first end-of-line. */
4435 eol_type = this_eol_type;
4436 else if (eol_type != this_eol_type)
4437 {
4438 /* The found type is different from what found before. */
4439 eol_type = CODING_EOL_INCONSISTENT;
4440 break;
4441 }
4442 }
4443 }
4444
4445 if (*skip == 0)
4446 *skip = src_end - source;
4447 return eol_type;
4448}
4449
4ed46869
KH
4450/* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
4451 is encoded. If it detects an appropriate format of end-of-line, it
4452 sets the information in *CODING. */
4453
4454void
4455detect_eol (coding, src, src_bytes)
4456 struct coding_system *coding;
a4244313 4457 const unsigned char *src;
4ed46869
KH
4458 int src_bytes;
4459{
4608c386 4460 Lisp_Object val;
d46c5b12 4461 int skip;
fa42c37f
KH
4462 int eol_type;
4463
4464 switch (coding->category_idx)
4465 {
4466 case CODING_CATEGORY_IDX_UTF_16_BE:
4467 eol_type = detect_eol_type_in_2_octet_form (src, src_bytes, &skip, 1);
4468 break;
4469 case CODING_CATEGORY_IDX_UTF_16_LE:
4470 eol_type = detect_eol_type_in_2_octet_form (src, src_bytes, &skip, 0);
4471 break;
4472 default:
4473 eol_type = detect_eol_type (src, src_bytes, &skip);
4474 break;
4475 }
d46c5b12
KH
4476
4477 if (coding->heading_ascii > skip)
4478 coding->heading_ascii = skip;
4479 else
4480 skip = coding->heading_ascii;
4ed46869 4481
0ef69138 4482 if (eol_type == CODING_EOL_UNDECIDED)
4ed46869 4483 return;
27901516
KH
4484 if (eol_type == CODING_EOL_INCONSISTENT)
4485 {
4486#if 0
4487 /* This code is suppressed until we find a better way to
992f23f2 4488 distinguish raw text file and binary file. */
27901516
KH
4489
4490 /* If we have already detected that the coding is raw-text, the
4491 coding should actually be no-conversion. */
4492 if (coding->type == coding_type_raw_text)
4493 {
4494 setup_coding_system (Qno_conversion, coding);
4495 return;
4496 }
4497 /* Else, let's decode only text code anyway. */
4498#endif /* 0 */
1b2af4b0 4499 eol_type = CODING_EOL_LF;
27901516
KH
4500 }
4501
4608c386 4502 val = Fget (coding->symbol, Qeol_type);
4ed46869 4503 if (VECTORP (val) && XVECTOR (val)->size == 3)
d46c5b12 4504 {
b73bfc1c
KH
4505 int src_multibyte = coding->src_multibyte;
4506 int dst_multibyte = coding->dst_multibyte;
1cd6b64c 4507 struct composition_data *cmp_data = coding->cmp_data;
b73bfc1c 4508
d46c5b12 4509 setup_coding_system (XVECTOR (val)->contents[eol_type], coding);
b73bfc1c
KH
4510 coding->src_multibyte = src_multibyte;
4511 coding->dst_multibyte = dst_multibyte;
d46c5b12 4512 coding->heading_ascii = skip;
1cd6b64c 4513 coding->cmp_data = cmp_data;
d46c5b12
KH
4514 }
4515}
4516
/* Number of bytes added to every conversion buffer size estimate.  */
#define CONVERSION_BUFFER_EXTRA_ROOM 256

/* Factor by which the byte length of a source text is multiplied to
   estimate the size of a decoding buffer for CODING (a pointer to
   struct coding_system): 3 for ISO 2022, the magnification recorded
   in the CCL decoder for CCL, 2 for everything else.  CODING is
   parenthesized so that any pointer expression may be passed; it may
   be evaluated more than once.  */
#define DECODING_BUFFER_MAG(coding)				\
  ((coding)->type == coding_type_iso2022			\
   ? 3								\
   : ((coding)->type == coding_type_ccl				\
      ? (coding)->spec.ccl.decoder.buf_magnification		\
      : 2))
d46c5b12
KH
4525
4526/* Return maximum size (bytes) of a buffer enough for decoding
4527 SRC_BYTES of text encoded in CODING. */
4528
4529int
4530decoding_buffer_size (coding, src_bytes)
4531 struct coding_system *coding;
4532 int src_bytes;
4533{
4534 return (src_bytes * DECODING_BUFFER_MAG (coding)
4535 + CONVERSION_BUFFER_EXTRA_ROOM);
4536}
4537
4538/* Return maximum size (bytes) of a buffer enough for encoding
4539 SRC_BYTES of text to CODING. */
4540
4541int
4542encoding_buffer_size (coding, src_bytes)
4543 struct coding_system *coding;
4544 int src_bytes;
4545{
4546 int magnification;
4547
4548 if (coding->type == coding_type_ccl)
a84f1519
KH
4549 {
4550 magnification = coding->spec.ccl.encoder.buf_magnification;
4551 if (coding->eol_type == CODING_EOL_CRLF)
4552 magnification *= 2;
4553 }
b73bfc1c 4554 else if (CODING_REQUIRE_ENCODING (coding))
d46c5b12 4555 magnification = 3;
b73bfc1c
KH
4556 else
4557 magnification = 1;
d46c5b12
KH
4558
4559 return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
4560}
4561
73be902c
KH
/* Scratch buffer used while converting text.  It is filled in by
   allocate_conversion_buffer, grown by extend_conversion_buffer and
   released by free_conversion_buffer.  */
struct conversion_buffer
{
  /* Number of bytes available at DATA.  */
  int size;
  /* 1 if DATA was obtained from alloca, 0 if from xmalloc/xrealloc.  */
  int on_stack;
  /* The storage itself.  */
  unsigned char *data;
};
d46c5b12 4569
73be902c
KH
/* Allocate LEN bytes of memory for BUF (an lvalue of type struct
   conversion_buffer).  Requests smaller than MAX_ALLOCA are served by
   alloca, so this must remain a macro: the memory belongs to the
   calling function's frame.  Larger requests use xmalloc.

   BUF and LEN are parenthesized so that arbitrary expressions may be
   passed.  LEN is evaluated more than once; do not pass an expression
   with side effects.  */
#define allocate_conversion_buffer(buf, len)			\
  do {								\
    if ((len) < MAX_ALLOCA)					\
      {								\
	(buf).data = (unsigned char *) alloca (len);		\
	(buf).on_stack = 1;					\
      }								\
    else							\
      {								\
	(buf).data = (unsigned char *) xmalloc (len);		\
	(buf).on_stack = 0;					\
      }								\
    (buf).size = (len);						\
  } while (0)
d46c5b12 4585
73be902c
KH
4586/* Double the allocated memory for *BUF. */
4587static void
4588extend_conversion_buffer (buf)
4589 struct conversion_buffer *buf;
d46c5b12 4590{
73be902c 4591 if (buf->on_stack)
d46c5b12 4592 {
73be902c
KH
4593 unsigned char *save = buf->data;
4594 buf->data = (unsigned char *) xmalloc (buf->size * 2);
4595 bcopy (save, buf->data, buf->size);
4596 buf->on_stack = 0;
d46c5b12 4597 }
73be902c
KH
4598 else
4599 {
4600 buf->data = (unsigned char *) xrealloc (buf->data, buf->size * 2);
4601 }
4602 buf->size *= 2;
4603}
4604
4605/* Free the allocated memory for BUF if it is not on stack. */
4606static void
4607free_conversion_buffer (buf)
4608 struct conversion_buffer *buf;
4609{
4610 if (!buf->on_stack)
4611 xfree (buf->data);
d46c5b12
KH
4612}
4613
4614int
4615ccl_coding_driver (coding, source, destination, src_bytes, dst_bytes, encodep)
4616 struct coding_system *coding;
4617 unsigned char *source, *destination;
4618 int src_bytes, dst_bytes, encodep;
4619{
4620 struct ccl_program *ccl
4621 = encodep ? &coding->spec.ccl.encoder : &coding->spec.ccl.decoder;
1c3478b0 4622 unsigned char *dst = destination;
d46c5b12 4623
bd64290d 4624 ccl->suppress_error = coding->suppress_error;
ae9ff118 4625 ccl->last_block = coding->mode & CODING_MODE_LAST_BLOCK;
aaaf0b1e 4626 if (encodep)
80e0ca99
KH
4627 {
4628 /* On encoding, EOL format is converted within ccl_driver. For
4629 that, setup proper information in the structure CCL. */
4630 ccl->eol_type = coding->eol_type;
4631 if (ccl->eol_type ==CODING_EOL_UNDECIDED)
4632 ccl->eol_type = CODING_EOL_LF;
4633 ccl->cr_consumed = coding->spec.ccl.cr_carryover;
b671ed5e 4634 ccl->eight_bit_control = coding->dst_multibyte;
80e0ca99 4635 }
b671ed5e
KH
4636 else
4637 ccl->eight_bit_control = 1;
7272d75c 4638 ccl->multibyte = coding->src_multibyte;
1c3478b0
KH
4639 if (coding->spec.ccl.eight_bit_carryover[0] != 0)
4640 {
4641 /* Move carryover bytes to DESTINATION. */
4642 unsigned char *p = coding->spec.ccl.eight_bit_carryover;
4643 while (*p)
4644 *dst++ = *p++;
4645 coding->spec.ccl.eight_bit_carryover[0] = 0;
4646 if (dst_bytes)
4647 dst_bytes -= dst - destination;
4648 }
4649
4650 coding->produced = (ccl_driver (ccl, source, dst, src_bytes, dst_bytes,
4651 &(coding->consumed))
4652 + dst - destination);
4653
b73bfc1c 4654 if (encodep)
80e0ca99
KH
4655 {
4656 coding->produced_char = coding->produced;
4657 coding->spec.ccl.cr_carryover = ccl->cr_consumed;
4658 }
ade8d05e
KH
4659 else if (!ccl->eight_bit_control)
4660 {
4661 /* The produced bytes forms a valid multibyte sequence. */
4662 coding->produced_char
4663 = multibyte_chars_in_text (destination, coding->produced);
4664 coding->spec.ccl.eight_bit_carryover[0] = 0;
4665 }
b73bfc1c
KH
4666 else
4667 {
1c3478b0
KH
4668 /* On decoding, the destination should always multibyte. But,
4669 CCL program might have been generated an invalid multibyte
4670 sequence. Here we make such a sequence valid as
4671 multibyte. */
b73bfc1c
KH
4672 int bytes
4673 = dst_bytes ? dst_bytes : source + coding->consumed - destination;
1c3478b0
KH
4674
4675 if ((coding->consumed < src_bytes
4676 || !ccl->last_block)
4677 && coding->produced >= 1
4678 && destination[coding->produced - 1] >= 0x80)
4679 {
4680 /* We should not convert the tailing 8-bit codes to
4681 multibyte form even if they doesn't form a valid
4682 multibyte sequence. They may form a valid sequence in
4683 the next call. */
4684 int carryover = 0;
4685
4686 if (destination[coding->produced - 1] < 0xA0)
4687 carryover = 1;
4688 else if (coding->produced >= 2)
4689 {
4690 if (destination[coding->produced - 2] >= 0x80)
4691 {
4692 if (destination[coding->produced - 2] < 0xA0)
4693 carryover = 2;
4694 else if (coding->produced >= 3
4695 && destination[coding->produced - 3] >= 0x80
4696 && destination[coding->produced - 3] < 0xA0)
4697 carryover = 3;
4698 }
4699 }
4700 if (carryover > 0)
4701 {
4702 BCOPY_SHORT (destination + coding->produced - carryover,
4703 coding->spec.ccl.eight_bit_carryover,
4704 carryover);
4705 coding->spec.ccl.eight_bit_carryover[carryover] = 0;
4706 coding->produced -= carryover;
4707 }
4708 }
b73bfc1c
KH
4709 coding->produced = str_as_multibyte (destination, bytes,
4710 coding->produced,
4711 &(coding->produced_char));
4712 }
69f76525 4713
d46c5b12
KH
4714 switch (ccl->status)
4715 {
4716 case CCL_STAT_SUSPEND_BY_SRC:
73be902c 4717 coding->result = CODING_FINISH_INSUFFICIENT_SRC;
d46c5b12
KH
4718 break;
4719 case CCL_STAT_SUSPEND_BY_DST:
73be902c 4720 coding->result = CODING_FINISH_INSUFFICIENT_DST;
d46c5b12 4721 break;
9864ebce
KH
4722 case CCL_STAT_QUIT:
4723 case CCL_STAT_INVALID_CMD:
73be902c 4724 coding->result = CODING_FINISH_INTERRUPT;
9864ebce 4725 break;
d46c5b12 4726 default:
73be902c 4727 coding->result = CODING_FINISH_NORMAL;
d46c5b12
KH
4728 break;
4729 }
73be902c 4730 return coding->result;
4ed46869
KH
4731}
4732
aaaf0b1e
KH
4733/* Decode EOL format of the text at PTR of BYTES length destructively
4734 according to CODING->eol_type. This is called after the CCL
4735 program produced a decoded text at PTR. If we do CRLF->LF
4736 conversion, update CODING->produced and CODING->produced_char. */
4737
4738static void
4739decode_eol_post_ccl (coding, ptr, bytes)
4740 struct coding_system *coding;
4741 unsigned char *ptr;
4742 int bytes;
4743{
4744 Lisp_Object val, saved_coding_symbol;
4745 unsigned char *pend = ptr + bytes;
4746 int dummy;
4747
4748 /* Remember the current coding system symbol. We set it back when
4749 an inconsistent EOL is found so that `last-coding-system-used' is
4750 set to the coding system that doesn't specify EOL conversion. */
4751 saved_coding_symbol = coding->symbol;
4752
4753 coding->spec.ccl.cr_carryover = 0;
4754 if (coding->eol_type == CODING_EOL_UNDECIDED)
4755 {
4756 /* Here, to avoid the call of setup_coding_system, we directly
4757 call detect_eol_type. */
4758 coding->eol_type = detect_eol_type (ptr, bytes, &dummy);
74b01b80
EZ
4759 if (coding->eol_type == CODING_EOL_INCONSISTENT)
4760 coding->eol_type = CODING_EOL_LF;
4761 if (coding->eol_type != CODING_EOL_UNDECIDED)
4762 {
4763 val = Fget (coding->symbol, Qeol_type);
4764 if (VECTORP (val) && XVECTOR (val)->size == 3)
4765 coding->symbol = XVECTOR (val)->contents[coding->eol_type];
4766 }
aaaf0b1e
KH
4767 coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4768 }
4769
74b01b80
EZ
4770 if (coding->eol_type == CODING_EOL_LF
4771 || coding->eol_type == CODING_EOL_UNDECIDED)
aaaf0b1e
KH
4772 {
4773 /* We have nothing to do. */
4774 ptr = pend;
4775 }
4776 else if (coding->eol_type == CODING_EOL_CRLF)
4777 {
4778 unsigned char *pstart = ptr, *p = ptr;
4779
4780 if (! (coding->mode & CODING_MODE_LAST_BLOCK)
4781 && *(pend - 1) == '\r')
4782 {
4783 /* If the last character is CR, we can't handle it here
4784 because LF will be in the not-yet-decoded source text.
9861e777 4785 Record that the CR is not yet processed. */
aaaf0b1e
KH
4786 coding->spec.ccl.cr_carryover = 1;
4787 coding->produced--;
4788 coding->produced_char--;
4789 pend--;
4790 }
4791 while (ptr < pend)
4792 {
4793 if (*ptr == '\r')
4794 {
4795 if (ptr + 1 < pend && *(ptr + 1) == '\n')
4796 {
4797 *p++ = '\n';
4798 ptr += 2;
4799 }
4800 else
4801 {
4802 if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4803 goto undo_eol_conversion;
4804 *p++ = *ptr++;
4805 }
4806 }
4807 else if (*ptr == '\n'
4808 && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4809 goto undo_eol_conversion;
4810 else
4811 *p++ = *ptr++;
4812 continue;
4813
4814 undo_eol_conversion:
4815 /* We have faced with inconsistent EOL format at PTR.
4816 Convert all LFs before PTR back to CRLFs. */
4817 for (p--, ptr--; p >= pstart; p--)
4818 {
4819 if (*p == '\n')
4820 *ptr-- = '\n', *ptr-- = '\r';
4821 else
4822 *ptr-- = *p;
4823 }
4824 /* If carryover is recorded, cancel it because we don't
4825 convert CRLF anymore. */
4826 if (coding->spec.ccl.cr_carryover)
4827 {
4828 coding->spec.ccl.cr_carryover = 0;
4829 coding->produced++;
4830 coding->produced_char++;
4831 pend++;
4832 }
4833 p = ptr = pend;
4834 coding->eol_type = CODING_EOL_LF;
4835 coding->symbol = saved_coding_symbol;
4836 }
4837 if (p < pend)
4838 {
4839 /* As each two-byte sequence CRLF was converted to LF, (PEND
4840 - P) is the number of deleted characters. */
4841 coding->produced -= pend - p;
4842 coding->produced_char -= pend - p;
4843 }
4844 }
4845 else /* i.e. coding->eol_type == CODING_EOL_CR */
4846 {
4847 unsigned char *p = ptr;
4848
4849 for (; ptr < pend; ptr++)
4850 {
4851 if (*ptr == '\r')
4852 *ptr = '\n';
4853 else if (*ptr == '\n'
4854 && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4855 {
4856 for (; p < ptr; p++)
4857 {
4858 if (*p == '\n')
4859 *p = '\r';
4860 }
4861 ptr = pend;
4862 coding->eol_type = CODING_EOL_LF;
4863 coding->symbol = saved_coding_symbol;
4864 }
4865 }
4866 }
4867}
4868
4ed46869
KH
4869/* See "GENERAL NOTES about `decode_coding_XXX ()' functions". Before
4870 decoding, it may detect coding system and format of end-of-line if
b73bfc1c
KH
4871 those are not yet decided. The source should be unibyte, the
4872 result is multibyte if CODING->dst_multibyte is nonzero, else
4873 unibyte. */
4ed46869
KH
4874
4875int
d46c5b12 4876decode_coding (coding, source, destination, src_bytes, dst_bytes)
4ed46869 4877 struct coding_system *coding;
4b982924 4878 const unsigned char *source;
a4244313 4879 unsigned char *destination;
4ed46869 4880 int src_bytes, dst_bytes;
4ed46869 4881{
9861e777
EZ
4882 int extra = 0;
4883
0ef69138 4884 if (coding->type == coding_type_undecided)
4ed46869
KH
4885 detect_coding (coding, source, src_bytes);
4886
aaaf0b1e
KH
4887 if (coding->eol_type == CODING_EOL_UNDECIDED
4888 && coding->type != coding_type_ccl)
8844fa83
KH
4889 {
4890 detect_eol (coding, source, src_bytes);
4891 /* We had better recover the original eol format if we
8ca3766a 4892 encounter an inconsistent eol format while decoding. */
8844fa83
KH
4893 coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4894 }
4ed46869 4895
b73bfc1c
KH
4896 coding->produced = coding->produced_char = 0;
4897 coding->consumed = coding->consumed_char = 0;
4898 coding->errors = 0;
4899 coding->result = CODING_FINISH_NORMAL;
4900
4ed46869
KH
4901 switch (coding->type)
4902 {
4ed46869 4903 case coding_type_sjis:
b73bfc1c
KH
4904 decode_coding_sjis_big5 (coding, source, destination,
4905 src_bytes, dst_bytes, 1);
4ed46869
KH
4906 break;
4907
4908 case coding_type_iso2022:
b73bfc1c
KH
4909 decode_coding_iso2022 (coding, source, destination,
4910 src_bytes, dst_bytes);
4ed46869
KH
4911 break;
4912
4913 case coding_type_big5:
b73bfc1c
KH
4914 decode_coding_sjis_big5 (coding, source, destination,
4915 src_bytes, dst_bytes, 0);
4916 break;
4917
4918 case coding_type_emacs_mule:
4919 decode_coding_emacs_mule (coding, source, destination,
4920 src_bytes, dst_bytes);
4ed46869
KH
4921 break;
4922
4923 case coding_type_ccl:
aaaf0b1e
KH
4924 if (coding->spec.ccl.cr_carryover)
4925 {
9861e777
EZ
4926 /* Put the CR which was not processed by the previous call
4927 of decode_eol_post_ccl in DESTINATION. It will be
4928 decoded together with the following LF by the call to
4929 decode_eol_post_ccl below. */
aaaf0b1e
KH
4930 *destination = '\r';
4931 coding->produced++;
4932 coding->produced_char++;
4933 dst_bytes--;
9861e777 4934 extra = coding->spec.ccl.cr_carryover;
aaaf0b1e 4935 }
9861e777 4936 ccl_coding_driver (coding, source, destination + extra,
b73bfc1c 4937 src_bytes, dst_bytes, 0);
aaaf0b1e 4938 if (coding->eol_type != CODING_EOL_LF)
9861e777
EZ
4939 {
4940 coding->produced += extra;
4941 coding->produced_char += extra;
4942 decode_eol_post_ccl (coding, destination, coding->produced);
4943 }
d46c5b12
KH
4944 break;
4945
b73bfc1c
KH
4946 default:
4947 decode_eol (coding, source, destination, src_bytes, dst_bytes);
4948 }
4949
4950 if (coding->result == CODING_FINISH_INSUFFICIENT_SRC
e7c9eef9 4951 && coding->mode & CODING_MODE_LAST_BLOCK
b73bfc1c
KH
4952 && coding->consumed == src_bytes)
4953 coding->result = CODING_FINISH_NORMAL;
4954
4955 if (coding->mode & CODING_MODE_LAST_BLOCK
4956 && coding->result == CODING_FINISH_INSUFFICIENT_SRC)
4957 {
a4244313 4958 const unsigned char *src = source + coding->consumed;
b73bfc1c
KH
4959 unsigned char *dst = destination + coding->produced;
4960
4961 src_bytes -= coding->consumed;
bb10be8b 4962 coding->errors++;
b73bfc1c
KH
4963 if (COMPOSING_P (coding))
4964 DECODE_COMPOSITION_END ('1');
4965 while (src_bytes--)
d46c5b12 4966 {
b73bfc1c
KH
4967 int c = *src++;
4968 dst += CHAR_STRING (c, dst);
4969 coding->produced_char++;
d46c5b12 4970 }
b73bfc1c
KH
4971 coding->consumed = coding->consumed_char = src - source;
4972 coding->produced = dst - destination;
73be902c 4973 coding->result = CODING_FINISH_NORMAL;
4ed46869
KH
4974 }
4975
b73bfc1c
KH
4976 if (!coding->dst_multibyte)
4977 {
4978 coding->produced = str_as_unibyte (destination, coding->produced);
4979 coding->produced_char = coding->produced;
4980 }
4ed46869 4981
b73bfc1c
KH
4982 return coding->result;
4983}
52d41803 4984
b73bfc1c
KH
4985/* See "GENERAL NOTES about `encode_coding_XXX ()' functions". The
4986 multibyteness of the source is CODING->src_multibyte, the
4987 multibyteness of the result is always unibyte. */
4ed46869
KH
4988
4989int
d46c5b12 4990encode_coding (coding, source, destination, src_bytes, dst_bytes)
4ed46869 4991 struct coding_system *coding;
a4244313
KR
4992 const unsigned char *source;
4993 unsigned char *destination;
4ed46869 4994 int src_bytes, dst_bytes;
4ed46869 4995{
b73bfc1c
KH
4996 coding->produced = coding->produced_char = 0;
4997 coding->consumed = coding->consumed_char = 0;
4998 coding->errors = 0;
4999 coding->result = CODING_FINISH_NORMAL;
4ed46869 5000
d46c5b12
KH
5001 switch (coding->type)
5002 {
4ed46869 5003 case coding_type_sjis:
b73bfc1c
KH
5004 encode_coding_sjis_big5 (coding, source, destination,
5005 src_bytes, dst_bytes, 1);
4ed46869
KH
5006 break;
5007
5008 case coding_type_iso2022:
b73bfc1c
KH
5009 encode_coding_iso2022 (coding, source, destination,
5010 src_bytes, dst_bytes);
4ed46869
KH
5011 break;
5012
5013 case coding_type_big5:
b73bfc1c
KH
5014 encode_coding_sjis_big5 (coding, source, destination,
5015 src_bytes, dst_bytes, 0);
5016 break;
5017
5018 case coding_type_emacs_mule:
5019 encode_coding_emacs_mule (coding, source, destination,
5020 src_bytes, dst_bytes);
4ed46869
KH
5021 break;
5022
5023 case coding_type_ccl:
b73bfc1c
KH
5024 ccl_coding_driver (coding, source, destination,
5025 src_bytes, dst_bytes, 1);
d46c5b12
KH
5026 break;
5027
b73bfc1c
KH
5028 default:
5029 encode_eol (coding, source, destination, src_bytes, dst_bytes);
5030 }
5031
73be902c
KH
5032 if (coding->mode & CODING_MODE_LAST_BLOCK
5033 && coding->result == CODING_FINISH_INSUFFICIENT_SRC)
b73bfc1c 5034 {
a4244313 5035 const unsigned char *src = source + coding->consumed;
b73bfc1c
KH
5036 unsigned char *dst = destination + coding->produced;
5037
5038 if (coding->type == coding_type_iso2022)
5039 ENCODE_RESET_PLANE_AND_REGISTER;
5040 if (COMPOSING_P (coding))
5041 *dst++ = ISO_CODE_ESC, *dst++ = '1';
5042 if (coding->consumed < src_bytes)
d46c5b12 5043 {
b73bfc1c
KH
5044 int len = src_bytes - coding->consumed;
5045
fabf4a91 5046 BCOPY_SHORT (src, dst, len);
b73bfc1c
KH
5047 if (coding->src_multibyte)
5048 len = str_as_unibyte (dst, len);
5049 dst += len;
5050 coding->consumed = src_bytes;
d46c5b12 5051 }
b73bfc1c 5052 coding->produced = coding->produced_char = dst - destination;
73be902c 5053 coding->result = CODING_FINISH_NORMAL;
4ed46869
KH
5054 }
5055
bb10be8b
KH
5056 if (coding->result == CODING_FINISH_INSUFFICIENT_SRC
5057 && coding->consumed == src_bytes)
5058 coding->result = CODING_FINISH_NORMAL;
5059
b73bfc1c 5060 return coding->result;
4ed46869
KH
5061}
5062
fb88bf2d
KH
5063/* Scan text in the region between *BEG and *END (byte positions),
5064 skip characters which we don't have to decode by coding system
5065 CODING at the head and tail, then set *BEG and *END to the region
5066 of the text we actually have to convert. The caller should move
b73bfc1c
KH
5067 the gap out of the region in advance if the region is from a
5068 buffer.
4ed46869 5069
d46c5b12
KH
5070 If STR is not NULL, *BEG and *END are indices into STR. */
5071
5072static void
5073shrink_decoding_region (beg, end, coding, str)
5074 int *beg, *end;
5075 struct coding_system *coding;
5076 unsigned char *str;
5077{
fb88bf2d 5078 unsigned char *begp_orig, *begp, *endp_orig, *endp, c;
d46c5b12 5079 int eol_conversion;
88993dfd 5080 Lisp_Object translation_table;
d46c5b12
KH
5081
5082 if (coding->type == coding_type_ccl
5083 || coding->type == coding_type_undecided
b73bfc1c
KH
5084 || coding->eol_type != CODING_EOL_LF
5085 || !NILP (coding->post_read_conversion)
5086 || coding->composing != COMPOSITION_DISABLED)
d46c5b12
KH
5087 {
5088 /* We can't skip any data. */
5089 return;
5090 }
b73bfc1c
KH
5091 if (coding->type == coding_type_no_conversion
5092 || coding->type == coding_type_raw_text
5093 || coding->type == coding_type_emacs_mule)
d46c5b12 5094 {
fb88bf2d
KH
5095 /* We need no conversion, but don't have to skip any data here.
5096 Decoding routine handles them effectively anyway. */
d46c5b12
KH
5097 return;
5098 }
5099
88993dfd
KH
5100 translation_table = coding->translation_table_for_decode;
5101 if (NILP (translation_table) && !NILP (Venable_character_translation))
5102 translation_table = Vstandard_translation_table_for_decode;
5103 if (CHAR_TABLE_P (translation_table))
5104 {
5105 int i;
5106 for (i = 0; i < 128; i++)
5107 if (!NILP (CHAR_TABLE_REF (translation_table, i)))
5108 break;
5109 if (i < 128)
fa46990e 5110 /* Some ASCII character should be translated. We give up
88993dfd
KH
5111 shrinking. */
5112 return;
5113 }
5114
b73bfc1c 5115 if (coding->heading_ascii >= 0)
d46c5b12
KH
5116 /* Detection routine has already found how much we can skip at the
5117 head. */
5118 *beg += coding->heading_ascii;
5119
5120 if (str)
5121 {
5122 begp_orig = begp = str + *beg;
5123 endp_orig = endp = str + *end;
5124 }
5125 else
5126 {
fb88bf2d 5127 begp_orig = begp = BYTE_POS_ADDR (*beg);
d46c5b12
KH
5128 endp_orig = endp = begp + *end - *beg;
5129 }
5130
fa46990e
DL
5131 eol_conversion = (coding->eol_type == CODING_EOL_CR
5132 || coding->eol_type == CODING_EOL_CRLF);
5133
d46c5b12
KH
5134 switch (coding->type)
5135 {
d46c5b12
KH
5136 case coding_type_sjis:
5137 case coding_type_big5:
5138 /* We can skip all ASCII characters at the head. */
5139 if (coding->heading_ascii < 0)
5140 {
5141 if (eol_conversion)
de9d083c 5142 while (begp < endp && *begp < 0x80 && *begp != '\r') begp++;
d46c5b12
KH
5143 else
5144 while (begp < endp && *begp < 0x80) begp++;
5145 }
5146 /* We can skip all ASCII characters at the tail except for the
5147 second byte of SJIS or BIG5 code. */
5148 if (eol_conversion)
de9d083c 5149 while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\r') endp--;
d46c5b12
KH
5150 else
5151 while (begp < endp && endp[-1] < 0x80) endp--;
ee59c65f
RS
5152 /* Do not consider LF as ascii if preceded by CR, since that
5153 confuses eol decoding. */
5154 if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
5155 endp++;
d46c5b12
KH
5156 if (begp < endp && endp < endp_orig && endp[-1] >= 0x80)
5157 endp++;
5158 break;
5159
b73bfc1c 5160 case coding_type_iso2022:
622fece5
KH
5161 if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
5162 /* We can't skip any data. */
5163 break;
d46c5b12
KH
5164 if (coding->heading_ascii < 0)
5165 {
d46c5b12
KH
5166 /* We can skip all ASCII characters at the head except for a
5167 few control codes. */
5168 while (begp < endp && (c = *begp) < 0x80
5169 && c != ISO_CODE_CR && c != ISO_CODE_SO
5170 && c != ISO_CODE_SI && c != ISO_CODE_ESC
5171 && (!eol_conversion || c != ISO_CODE_LF))
5172 begp++;
5173 }
5174 switch (coding->category_idx)
5175 {
5176 case CODING_CATEGORY_IDX_ISO_8_1:
5177 case CODING_CATEGORY_IDX_ISO_8_2:
5178 /* We can skip all ASCII characters at the tail. */
5179 if (eol_conversion)
de9d083c 5180 while (begp < endp && (c = endp[-1]) < 0x80 && c != '\r') endp--;
d46c5b12
KH
5181 else
5182 while (begp < endp && endp[-1] < 0x80) endp--;
ee59c65f
RS
5183 /* Do not consider LF as ascii if preceded by CR, since that
5184 confuses eol decoding. */
5185 if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
5186 endp++;
d46c5b12
KH
5187 break;
5188
5189 case CODING_CATEGORY_IDX_ISO_7:
5190 case CODING_CATEGORY_IDX_ISO_7_TIGHT:
de79a6a5 5191 {
8ca3766a 5192 /* We can skip all characters at the tail except for 8-bit
de79a6a5
KH
5193 codes and ESC and the following 2-byte at the tail. */
5194 unsigned char *eight_bit = NULL;
5195
5196 if (eol_conversion)
5197 while (begp < endp
5198 && (c = endp[-1]) != ISO_CODE_ESC && c != '\r')
5199 {
5200 if (!eight_bit && c & 0x80) eight_bit = endp;
5201 endp--;
5202 }
5203 else
5204 while (begp < endp
5205 && (c = endp[-1]) != ISO_CODE_ESC)
5206 {
5207 if (!eight_bit && c & 0x80) eight_bit = endp;
5208 endp--;
5209 }
5210 /* Do not consider LF as ascii if preceded by CR, since that
5211 confuses eol decoding. */
5212 if (begp < endp && endp < endp_orig
5213 && endp[-1] == '\r' && endp[0] == '\n')
5214 endp++;
5215 if (begp < endp && endp[-1] == ISO_CODE_ESC)
5216 {
5217 if (endp + 1 < endp_orig && end[0] == '(' && end[1] == 'B')
5218 /* This is an ASCII designation sequence. We can
5219 surely skip the tail. But, if we have
5220 encountered an 8-bit code, skip only the codes
5221 after that. */
5222 endp = eight_bit ? eight_bit : endp + 2;
5223 else
5224 /* Hmmm, we can't skip the tail. */
5225 endp = endp_orig;
5226 }
5227 else if (eight_bit)
5228 endp = eight_bit;
5229 }
d46c5b12 5230 }
b73bfc1c
KH
5231 break;
5232
5233 default:
5234 abort ();
d46c5b12
KH
5235 }
5236 *beg += begp - begp_orig;
5237 *end += endp - endp_orig;
5238 return;
5239}
5240
5241/* Like shrink_decoding_region but for encoding. */
5242
5243static void
5244shrink_encoding_region (beg, end, coding, str)
5245 int *beg, *end;
5246 struct coding_system *coding;
5247 unsigned char *str;
5248{
5249 unsigned char *begp_orig, *begp, *endp_orig, *endp;
5250 int eol_conversion;
88993dfd 5251 Lisp_Object translation_table;
d46c5b12 5252
b73bfc1c
KH
5253 if (coding->type == coding_type_ccl
5254 || coding->eol_type == CODING_EOL_CRLF
5255 || coding->eol_type == CODING_EOL_CR
87323294 5256 || (coding->cmp_data && coding->cmp_data->used > 0))
d46c5b12 5257 {
b73bfc1c
KH
5258 /* We can't skip any data. */
5259 return;
5260 }
5261 if (coding->type == coding_type_no_conversion
5262 || coding->type == coding_type_raw_text
5263 || coding->type == coding_type_emacs_mule
5264 || coding->type == coding_type_undecided)
5265 {
5266 /* We need no conversion, but don't have to skip any data here.
5267 Encoding routine handles them effectively anyway. */
d46c5b12
KH
5268 return;
5269 }
5270
88993dfd
KH
5271 translation_table = coding->translation_table_for_encode;
5272 if (NILP (translation_table) && !NILP (Venable_character_translation))
5273 translation_table = Vstandard_translation_table_for_encode;
5274 if (CHAR_TABLE_P (translation_table))
5275 {
5276 int i;
5277 for (i = 0; i < 128; i++)
5278 if (!NILP (CHAR_TABLE_REF (translation_table, i)))
5279 break;
5280 if (i < 128)
8ca3766a 5281 /* Some ASCII character should be translated. We give up
88993dfd
KH
5282 shrinking. */
5283 return;
5284 }
5285
d46c5b12
KH
5286 if (str)
5287 {
5288 begp_orig = begp = str + *beg;
5289 endp_orig = endp = str + *end;
5290 }
5291 else
5292 {
fb88bf2d 5293 begp_orig = begp = BYTE_POS_ADDR (*beg);
d46c5b12
KH
5294 endp_orig = endp = begp + *end - *beg;
5295 }
5296
5297 eol_conversion = (coding->eol_type == CODING_EOL_CR
5298 || coding->eol_type == CODING_EOL_CRLF);
5299
5300 /* Here, we don't have to check coding->pre_write_conversion because
5301 the caller is expected to have handled it already. */
5302 switch (coding->type)
5303 {
d46c5b12 5304 case coding_type_iso2022:
622fece5
KH
5305 if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
5306 /* We can't skip any data. */
5307 break;
d46c5b12
KH
5308 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL)
5309 {
93dec019 5310 unsigned char *bol = begp;
d46c5b12
KH
5311 while (begp < endp && *begp < 0x80)
5312 {
5313 begp++;
5314 if (begp[-1] == '\n')
5315 bol = begp;
5316 }
5317 begp = bol;
5318 goto label_skip_tail;
5319 }
5320 /* fall down ... */
5321
b73bfc1c
KH
5322 case coding_type_sjis:
5323 case coding_type_big5:
d46c5b12
KH
5324 /* We can skip all ASCII characters at the head and tail. */
5325 if (eol_conversion)
5326 while (begp < endp && *begp < 0x80 && *begp != '\n') begp++;
5327 else
5328 while (begp < endp && *begp < 0x80) begp++;
5329 label_skip_tail:
5330 if (eol_conversion)
5331 while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\n') endp--;
5332 else
5333 while (begp < endp && *(endp - 1) < 0x80) endp--;
5334 break;
b73bfc1c
KH
5335
5336 default:
5337 abort ();
d46c5b12
KH
5338 }
5339
5340 *beg += begp - begp_orig;
5341 *end += endp - endp_orig;
5342 return;
5343}
5344
88993dfd
KH
/* Shrinking a conversion region has a cost of its own, so it is
   attempted only for regions longer than this many bytes.  */
static int shrink_conversion_region_threshhold = 1024;

/* Shrink the region between *BEG and *END for CODING (see
   shrink_encoding_region and shrink_decoding_region; ENCODEP nonzero
   selects the former) if the region is longer than
   shrink_conversion_region_threshhold.  */
#define SHRINK_CONVERSION_REGION(beg, end, coding, str, encodep)	\
  do {									\
    if (*(end) - *(beg) > shrink_conversion_region_threshhold)		\
      {									\
	if (encodep)							\
	  {								\
	    shrink_encoding_region (beg, end, coding, str);		\
	  }								\
	else								\
	  {								\
	    shrink_decoding_region (beg, end, coding, str);		\
	  }								\
      }									\
  } while (0)
24a2b282
KH
5359/* ARG is (CODING BUFFER ...) where CODING is what to be set in
5360 Vlast_coding_system_used and the remaining elements are buffers to
16ef9c56 5361 kill. */
b843d1ae 5362static Lisp_Object
1c7457e2
KH
5363code_convert_region_unwind (arg)
5364 Lisp_Object arg;
b843d1ae 5365{
89aa725a
KH
5366 struct gcpro gcpro1;
5367 GCPRO1 (arg);
5368
b843d1ae 5369 inhibit_pre_post_conversion = 0;
16ef9c56 5370 Vlast_coding_system_used = XCAR (arg);
24a2b282
KH
5371 for (arg = XCDR (arg); ! NILP (arg); arg = XCDR (arg))
5372 Fkill_buffer (XCAR (arg));
89aa725a
KH
5373
5374 UNGCPRO;
b843d1ae
KH
5375 return Qnil;
5376}
5377
ec6d2bb8
KH
5378/* Store information about all compositions in the range FROM and TO
5379 of OBJ in memory blocks pointed by CODING->cmp_data. OBJ is a
5380 buffer or a string, defaults to the current buffer. */
5381
5382void
5383coding_save_composition (coding, from, to, obj)
5384 struct coding_system *coding;
5385 int from, to;
5386 Lisp_Object obj;
5387{
5388 Lisp_Object prop;
5389 int start, end;
5390
91bee881
KH
5391 if (coding->composing == COMPOSITION_DISABLED)
5392 return;
5393 if (!coding->cmp_data)
5394 coding_allocate_composition_data (coding, from);
ec6d2bb8
KH
5395 if (!find_composition (from, to, &start, &end, &prop, obj)
5396 || end > to)
5397 return;
5398 if (start < from
5399 && (!find_composition (end, to, &start, &end, &prop, obj)
5400 || end > to))
5401 return;
5402 coding->composing = COMPOSITION_NO;
ec6d2bb8
KH
5403 do
5404 {
5405 if (COMPOSITION_VALID_P (start, end, prop))
5406 {
5407 enum composition_method method = COMPOSITION_METHOD (prop);
5408 if (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH
5409 >= COMPOSITION_DATA_SIZE)
5410 coding_allocate_composition_data (coding, from);
5411 /* For relative composition, we remember start and end
5412 positions, for the other compositions, we also remember
5413 components. */
5414 CODING_ADD_COMPOSITION_START (coding, start - from, method);
5415 if (method != COMPOSITION_RELATIVE)
5416 {
5417 /* We must store a*/
5418 Lisp_Object val, ch;
5419
5420 val = COMPOSITION_COMPONENTS (prop);
5421 if (CONSP (val))
5422 while (CONSP (val))
5423 {
5424 ch = XCAR (val), val = XCDR (val);
5425 CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (ch));
5426 }
5427 else if (VECTORP (val) || STRINGP (val))
5428 {
5429 int len = (VECTORP (val)
d5db4077 5430 ? XVECTOR (val)->size : SCHARS (val));
ec6d2bb8
KH
5431 int i;
5432 for (i = 0; i < len; i++)
5433 {
5434 ch = (STRINGP (val)
5435 ? Faref (val, make_number (i))
5436 : XVECTOR (val)->contents[i]);
5437 CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (ch));
5438 }
5439 }
5440 else /* INTEGERP (val) */
5441 CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (val));
5442 }
5443 CODING_ADD_COMPOSITION_END (coding, end - from);
5444 }
5445 start = end;
5446 }
5447 while (start < to
5448 && find_composition (start, to, &start, &end, &prop, obj)
5449 && end <= to);
5450
5451 /* Make coding->cmp_data point to the first memory block. */
5452 while (coding->cmp_data->prev)
5453 coding->cmp_data = coding->cmp_data->prev;
5454 coding->cmp_data_start = 0;
5455}
5456
5457/* Reflect the saved information about compositions to OBJ.
8ca3766a 5458 CODING->cmp_data points to a memory block for the information. OBJ
ec6d2bb8
KH
5459 is a buffer or a string, defaults to the current buffer. */
5460
33fb63eb 5461void
ec6d2bb8
KH
5462coding_restore_composition (coding, obj)
5463 struct coding_system *coding;
5464 Lisp_Object obj;
5465{
5466 struct composition_data *cmp_data = coding->cmp_data;
5467
5468 if (!cmp_data)
5469 return;
5470
5471 while (cmp_data->prev)
5472 cmp_data = cmp_data->prev;
5473
5474 while (cmp_data)
5475 {
5476 int i;
5477
78108bcd
KH
5478 for (i = 0; i < cmp_data->used && cmp_data->data[i] > 0;
5479 i += cmp_data->data[i])
ec6d2bb8
KH
5480 {
5481 int *data = cmp_data->data + i;
5482 enum composition_method method = (enum composition_method) data[3];
5483 Lisp_Object components;
5484
4307d534
KH
5485 if (data[0] < 0 || i + data[0] > cmp_data->used)
5486 /* Invalid composition data. */
5487 break;
5488
ec6d2bb8
KH
5489 if (method == COMPOSITION_RELATIVE)
5490 components = Qnil;
5491 else
5492 {
5493 int len = data[0] - 4, j;
5494 Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
5495
b6871cc7
KH
5496 if (method == COMPOSITION_WITH_RULE_ALTCHARS
5497 && len % 2 == 0)
5498 len --;
09721b31
KH
5499 if (len < 1)
5500 /* Invalid composition data. */
5501 break;
ec6d2bb8
KH
5502 for (j = 0; j < len; j++)
5503 args[j] = make_number (data[4 + j]);
5504 components = (method == COMPOSITION_WITH_ALTCHARS
316d4bf9
SM
5505 ? Fstring (len, args)
5506 : Fvector (len, args));
ec6d2bb8
KH
5507 }
5508 compose_text (data[1], data[2], components, Qnil, obj);
5509 }
5510 cmp_data = cmp_data->next;
5511 }
5512}
5513
d46c5b12 5514/* Decode (if ENCODEP is zero) or encode (if ENCODEP is nonzero) the
   text from FROM to TO (byte positions are FROM_BYTE and TO_BYTE) of
   the current buffer by coding system CODING, and return the status
   code of code conversion (currently, this value has no meaning; the
   function always returns 0).

   How many characters (and bytes) are converted to how many
   characters (and bytes) are recorded in members of the structure
   CODING (consumed, consumed_char, produced and produced_char; the
   early return taken when the whole region can be skipped sets only
   produced and produced_char).

   If REPLACE is nonzero, we do various things as if the original text
   is deleted and a new text is inserted.  See the comments in
   replace_range (insdel.c) to know what we are doing.

   If REPLACE is zero, it is assumed that the source text is unibyte.
   Otherwise, it is assumed that the source text is multibyte.

   The conversion is done in place: the source text is temporarily
   accounted as part of the gap, and the converted text is written
   from the start of the gap (see the memory diagrams in the main
   loop below).  */
4ed46869
KH
5529
5530int
6e44253b
KH
5531code_convert_region (from, from_byte, to, to_byte, coding, encodep, replace)
5532 int from, from_byte, to, to_byte, encodep, replace;
4ed46869 5533 struct coding_system *coding;
4ed46869 5534{
fb88bf2d 5535 int len = to - from, len_byte = to_byte - from_byte;
72d1a715 5536 int nchars_del = 0, nbytes_del = 0;
fb88bf2d 5537 int require, inserted, inserted_byte;
/* HEAD_SKIP and TAIL_SKIP are set only when the ASCII-skipping step
   below runs; they are read later only if TOTAL_SKIP > 0.  */
4b39528c 5538 int head_skip, tail_skip, total_skip = 0;
84d60297 5539 Lisp_Object saved_coding_symbol;
fb88bf2d 5540 int first = 1;
fb88bf2d 5541 unsigned char *src, *dst;
84d60297 5542 Lisp_Object deletion;
e133c8fa 5543 int orig_point = PT, orig_len = len;
6abb9bd9 5544 int prev_Z;
b73bfc1c
KH
5545 int multibyte_p = !NILP (current_buffer->enable_multibyte_characters);
5546
84d60297 5547 deletion = Qnil;
8844fa83 5548 saved_coding_symbol = coding->symbol;
d46c5b12 5549
/* If point is strictly inside the region, move it to FROM and treat
   that as the original point from now on.  */
83fa074f 5550 if (from < PT && PT < to)
e133c8fa
KH
5551 {
5552 TEMP_SET_PT_BOTH (from, from_byte);
5553 orig_point = from;
5554 }
83fa074f 5555
6e44253b 5556 if (replace)
d46c5b12 5557 {
fb88bf2d 5558 int saved_from = from;
e077cc80 5559 int saved_inhibit_modification_hooks;
fb88bf2d 5560
d46c5b12 5561 prepare_to_modify_buffer (from, to, &from);
/* prepare_to_modify_buffer may have changed FROM; if so, recompute
   the positions that depend on it.  */
fb88bf2d
KH
5562 if (saved_from != from)
5563 {
5564 to = from + len;
b73bfc1c 5565 from_byte = CHAR_TO_BYTE (from), to_byte = CHAR_TO_BYTE (to);
fb88bf2d
KH
5566 len_byte = to_byte - from_byte;
5567 }
e077cc80
KH
5568
5569 /* The code conversion routine can not preserve text properties
5570 for now. So, we must remove all text properties in the
5571 region. Here, we must suppress all modification hooks. */
5572 saved_inhibit_modification_hooks = inhibit_modification_hooks;
5573 inhibit_modification_hooks = 1;
5574 Fset_text_properties (make_number (from), make_number (to), Qnil, Qnil);
5575 inhibit_modification_hooks = saved_inhibit_modification_hooks;
d46c5b12 5576 }
d46c5b12
KH
5577
5578 if (! encodep && CODING_REQUIRE_DETECTION (coding))
5579 {
12410ef1 5580 /* We must detect encoding of text and eol format. */
d46c5b12
KH
5581
5582 if (from < GPT && to > GPT)
5583 move_gap_both (from, from_byte);
5584 if (coding->type == coding_type_undecided)
5585 {
fb88bf2d 5586 detect_coding (coding, BYTE_POS_ADDR (from_byte), len_byte);
d46c5b12 5587 if (coding->type == coding_type_undecided)
62b3ef1d
KH
5588 {
5589 /* It seems that the text contains only ASCII, but we
d9aef30f 5590 should not leave it undecided because the deeper
62b3ef1d
KH
5591 decoding routine (decode_coding) tries to detect the
5592 encodings again in vain. */
5593 coding->type = coding_type_emacs_mule;
5594 coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE;
d280ccb6
KH
5595 /* As emacs-mule decoder will handle composition, we
5596 need this setting to allocate coding->cmp_data
5597 later. */
5598 coding->composing = COMPOSITION_NO;
62b3ef1d 5599 }
d46c5b12 5600 }
aaaf0b1e
KH
5601 if (coding->eol_type == CODING_EOL_UNDECIDED
5602 && coding->type != coding_type_ccl)
d46c5b12 5603 {
d46c5b12
KH
5604 detect_eol (coding, BYTE_POS_ADDR (from_byte), len_byte);
5605 if (coding->eol_type == CODING_EOL_UNDECIDED)
5606 coding->eol_type = CODING_EOL_LF;
5607 /* We had better recover the original eol format if we
8ca3766a 5608 encounter an inconsistent eol format while decoding. */
d46c5b12
KH
5609 coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
5610 }
5611 }
5612
d46c5b12
KH
5613 /* Now we convert the text. */
5614
5615 /* For encoding, we must process pre-write-conversion in advance. */
b73bfc1c
KH
5616 if (! inhibit_pre_post_conversion
5617 && encodep
d46c5b12
KH
5618 && SYMBOLP (coding->pre_write_conversion)
5619 && ! NILP (Ffboundp (coding->pre_write_conversion)))
5620 {
2b4f9037
KH
5621 /* The function in pre-write-conversion may put a new text in a
5622 new buffer. */
0007bdd0
KH
5623 struct buffer *prev = current_buffer;
5624 Lisp_Object new;
d46c5b12 5625
1c7457e2 5626 record_unwind_protect (code_convert_region_unwind,
16ef9c56 5627 Fcons (Vlast_coding_system_used, Qnil));
b843d1ae
KH
5628 /* We should not call any more pre-write/post-read-conversion
5629 functions while this pre-write-conversion is running. */
5630 inhibit_pre_post_conversion = 1;
b39f748c
AS
5631 call2 (coding->pre_write_conversion,
5632 make_number (from), make_number (to));
b843d1ae
KH
5633 inhibit_pre_post_conversion = 0;
5634 /* Discard the unwind protect. */
5635 specpdl_ptr--;
5636
/* The function left us in a different buffer: replace the region of
   the original buffer with LEN characters taken from the start of
   that buffer, kill it, and recompute the region and point
   bookkeeping for the new length.  */
d46c5b12
KH
5637 if (current_buffer != prev)
5638 {
5639 len = ZV - BEGV;
0007bdd0 5640 new = Fcurrent_buffer ();
d46c5b12 5641 set_buffer_internal_1 (prev);
7dae4502 5642 del_range_2 (from, from_byte, to, to_byte, 0);
e133c8fa 5643 TEMP_SET_PT_BOTH (from, from_byte);
0007bdd0
KH
5644 insert_from_buffer (XBUFFER (new), 1, len, 0);
5645 Fkill_buffer (new);
e133c8fa
KH
5646 if (orig_point >= to)
5647 orig_point += len - orig_len;
5648 else if (orig_point > from)
5649 orig_point = from;
5650 orig_len = len;
d46c5b12 5651 to = from + len;
b73bfc1c
KH
5652 from_byte = CHAR_TO_BYTE (from);
5653 to_byte = CHAR_TO_BYTE (to);
d46c5b12 5654 len_byte = to_byte - from_byte;
e133c8fa 5655 TEMP_SET_PT_BOTH (from, from_byte);
d46c5b12
KH
5656 }
5657 }
5658
/* Remember what is being replaced: the text itself when undo
   information is recorded, or only its size when undo is disabled
   (undo_list is t).  These are handed to adjust_after_replace or
   adjust_after_replace_noundo after the conversion.  */
12410ef1 5659 if (replace)
72d1a715
RS
5660 {
5661 if (! EQ (current_buffer->undo_list, Qt))
5662 deletion = make_buffer_string_both (from, from_byte, to, to_byte, 1);
5663 else
5664 {
5665 nchars_del = to - from;
5666 nbytes_del = to_byte - from_byte;
5667 }
5668 }
12410ef1 5669
ec6d2bb8
KH
5670 if (coding->composing != COMPOSITION_DISABLED)
5671 {
5672 if (encodep)
5673 coding_save_composition (coding, from, to, Fcurrent_buffer ());
5674 else
5675 coding_allocate_composition_data (coding, from);
5676 }
fb88bf2d 5677
ce559e6f
KH
5678 /* Try to skip the heading and tailing ASCIIs. We can't skip them
5679 if we must run CCL program or there are compositions to
5680 encode. */
5681 if (coding->type != coding_type_ccl
5682 && (! coding->cmp_data || coding->cmp_data->used == 0))
4956c225
KH
5683 {
5684 int from_byte_orig = from_byte, to_byte_orig = to_byte;
ec6d2bb8 5685
4956c225
KH
5686 if (from < GPT && GPT < to)
5687 move_gap_both (from, from_byte);
5688 SHRINK_CONVERSION_REGION (&from_byte, &to_byte, coding, NULL, encodep);
/* Nothing is left to convert and no post-read-conversion or flushing
   is needed: report the text as produced unchanged and return.  */
5689 if (from_byte == to_byte
5690 && (encodep || NILP (coding->post_read_conversion))
5691 && ! CODING_REQUIRE_FLUSHING (coding))
5692 {
5693 coding->produced = len_byte;
5694 coding->produced_char = len;
5695 if (!replace)
5696 /* We must record and adjust for this new text now. */
5697 adjust_after_insert (from, from_byte_orig, to, to_byte_orig, len);
ce559e6f 5698 coding_free_composition_data (coding);
4956c225
KH
5699 return 0;
5700 }
5701
/* The skipped bytes are subtracted from both the character and the
   byte counts below, i.e. each skipped byte is counted as one
   character.  */
5702 head_skip = from_byte - from_byte_orig;
5703 tail_skip = to_byte_orig - to_byte;
5704 total_skip = head_skip + tail_skip;
5705 from += head_skip;
5706 to -= tail_skip;
5707 len -= total_skip; len_byte -= total_skip;
5708 }
d46c5b12 5709
8ca3766a 5710 /* For conversion, we must put the gap before the text in addition to
fb88bf2d
KH
5711 making the gap larger for efficient decoding. The required gap
5712 size starts from 2000 which is the magic number used in make_gap.
5713 But, after one batch of conversion, it will be incremented if we
5714 find that it is not enough . */
d46c5b12
KH
5715 require = 2000;
5716
5717 if (GAP_SIZE < require)
5718 make_gap (require - GAP_SIZE);
5719 move_gap_both (from, from_byte);
5720
d46c5b12 5721 inserted = inserted_byte = 0;
fb88bf2d
KH
5722
/* Take the source text out of the buffer accounting: from here on it
   is addressed relative to GAP_END_ADDR (see SRC in the loop below)
   and the buffer sizes no longer include it.  */
5723 GAP_SIZE += len_byte;
5724 ZV -= len;
5725 Z -= len;
5726 ZV_BYTE -= len_byte;
5727 Z_BYTE -= len_byte;
5728
d9f9a1bc
GM
5729 if (GPT - BEG < BEG_UNCHANGED)
5730 BEG_UNCHANGED = GPT - BEG;
5731 if (Z - GPT < END_UNCHANGED)
5732 END_UNCHANGED = Z - GPT;
f2558efd 5733
b73bfc1c
KH
5734 if (!encodep && coding->src_multibyte)
5735 {
5736 /* Decoding routines expects that the source text is unibyte.
5737 We must convert 8-bit characters of multibyte form to
5738 unibyte. */
5739 int len_byte_orig = len_byte;
5740 len_byte = str_as_unibyte (GAP_END_ADDR - len_byte, len_byte);
5741 if (len_byte < len_byte_orig)
5742 safe_bcopy (GAP_END_ADDR - len_byte_orig, GAP_END_ADDR - len_byte,
5743 len_byte);
5744 coding->src_multibyte = 0;
5745 }
5746
/* Main conversion loop.  Each iteration converts as much as possible
   and then either finishes or repairs the reason it stopped (more
   composition data, more gap, inconsistent eol) and tries again.  */
d46c5b12
KH
5747 for (;;)
5748 {
fb88bf2d 5749 int result;
d46c5b12 5750
ec6d2bb8 5751 /* The buffer memory is now:
b73bfc1c
KH
5752 +--------+converted-text+---------+-------original-text-------+---+
5753 |<-from->|<--inserted-->|---------|<--------len_byte--------->|---|
5754 |<---------------------- GAP ----------------------->| */
ec6d2bb8
KH
5755 src = GAP_END_ADDR - len_byte;
5756 dst = GPT_ADDR + inserted_byte;
5757
d46c5b12 5758 if (encodep)
fb88bf2d 5759 result = encode_coding (coding, src, dst, len_byte, 0);
d46c5b12 5760 else
0e79d667
RS
5761 {
5762 if (coding->composing != COMPOSITION_DISABLED)
5763 coding->cmp_data->char_offset = from + inserted;
5764 result = decode_coding (coding, src, dst, len_byte, 0);
5765 }
ec6d2bb8
KH
5766
5767 /* The buffer memory is now:
b73bfc1c
KH
5768 +--------+-------converted-text----+--+------original-text----+---+
5769 |<-from->|<-inserted->|<-produced->|--|<-(len_byte-consumed)->|---|
5770 |<---------------------- GAP ----------------------->| */
ec6d2bb8 5771
d46c5b12
KH
5772 inserted += coding->produced_char;
5773 inserted_byte += coding->produced;
d46c5b12 5774 len_byte -= coding->consumed;
ec6d2bb8
KH
5775
/* Out of composition data space: allocate more and retry.  SRC and
   DST are recomputed at the top of the loop.  */
5776 if (result == CODING_FINISH_INSUFFICIENT_CMP)
5777 {
5778 coding_allocate_composition_data (coding, from + inserted);
5779 continue;
5780 }
5781
fb88bf2d 5782 src += coding->consumed;
3636f7a3 5783 dst += coding->produced;
d46c5b12 5784
9864ebce
KH
5785 if (result == CODING_FINISH_NORMAL)
5786 {
5787 src += len_byte;
5788 break;
5789 }
d46c5b12
KH
5790 if (! encodep && result == CODING_FINISH_INCONSISTENT_EOL)
5791 {
fb88bf2d 5792 unsigned char *pend = dst, *p = pend - inserted_byte;
38edf7d4 5793 Lisp_Object eol_type;
d46c5b12
KH
5794
5795 /* Encode LFs back to the original eol format (CR or CRLF). */
5796 if (coding->eol_type == CODING_EOL_CR)
5797 {
5798 while (p < pend) if (*p++ == '\n') p[-1] = '\r';
5799 }
5800 else
5801 {
d46c5b12
KH
5802 int count = 0;
5803
fb88bf2d
KH
5804 while (p < pend) if (*p++ == '\n') count++;
5805 if (src - dst < count)
d46c5b12 5806 {
38edf7d4 5807 /* We don't have sufficient room for encoding LFs
fb88bf2d
KH
5808 back to CRLF. We must record converted and
5809 not-yet-converted text back to the buffer
5810 content, enlarge the gap, then record them out of
5811 the buffer contents again. */
5812 int add = len_byte + inserted_byte;
5813
5814 GAP_SIZE -= add;
5815 ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
5816 GPT += inserted_byte; GPT_BYTE += inserted_byte;
5817 make_gap (count - GAP_SIZE);
5818 GAP_SIZE += add;
5819 ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
5820 GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5821 /* Don't forget to update SRC, DST, and PEND. */
5822 src = GAP_END_ADDR - len_byte;
5823 dst = GPT_ADDR + inserted_byte;
5824 pend = dst;
d46c5b12 5825 }
d46c5b12
KH
5826 inserted += count;
5827 inserted_byte += count;
fb88bf2d
KH
5828 coding->produced += count;
/* Shift the converted text up by COUNT bytes, working backwards and
   inserting a CR before every LF.  */
5829 p = dst = pend + count;
5830 while (count)
5831 {
5832 *--p = *--pend;
5833 if (*p == '\n') count--, *--p = '\r';
5834 }
d46c5b12
KH
5835 }
5836
5837 /* Suppress eol-format conversion in the further conversion. */
5838 coding->eol_type = CODING_EOL_LF;
5839
38edf7d4
KH
5840 /* Set the coding system symbol to that for Unix-like EOL. */
5841 eol_type = Fget (saved_coding_symbol, Qeol_type);
5842 if (VECTORP (eol_type)
5843 && XVECTOR (eol_type)->size == 3
5844 && SYMBOLP (XVECTOR (eol_type)->contents[CODING_EOL_LF]))
5845 coding->symbol = XVECTOR (eol_type)->contents[CODING_EOL_LF];
5846 else
5847 coding->symbol = saved_coding_symbol;
93dec019 5848
fb88bf2d 5849 continue;
d46c5b12
KH
5850 }
/* All source bytes are consumed.  A CCL coding system gets one more
   pass with CODING_MODE_LAST_BLOCK set before we stop.  */
5851 if (len_byte <= 0)
944bd420
KH
5852 {
5853 if (coding->type != coding_type_ccl
5854 || coding->mode & CODING_MODE_LAST_BLOCK)
5855 break;
5856 coding->mode |= CODING_MODE_LAST_BLOCK;
5857 continue;
5858 }
d46c5b12
KH
5859 if (result == CODING_FINISH_INSUFFICIENT_SRC)
5860 {
5861 /* The source text ends in invalid codes. Let's just
5862 make them valid buffer contents, and finish conversion. */
70ad9fc4
GM
5863 if (multibyte_p)
5864 {
5865 unsigned char *start = dst;
93dec019 5866
70ad9fc4
GM
5867 inserted += len_byte;
5868 while (len_byte--)
5869 {
5870 int c = *src++;
5871 dst += CHAR_STRING (c, dst);
5872 }
5873
5874 inserted_byte += dst - start;
5875 }
5876 else
5877 {
5878 inserted += len_byte;
5879 inserted_byte += len_byte;
5880 while (len_byte--)
5881 *dst++ = *src++;
5882 }
d46c5b12
KH
5883 break;
5884 }
9864ebce
KH
5885 if (result == CODING_FINISH_INTERRUPT)
5886 {
5887 /* The conversion procedure was interrupted by a user. */
9864ebce
KH
5888 break;
5889 }
5890 /* Now RESULT == CODING_FINISH_INSUFFICIENT_DST */
5891 if (coding->consumed < 1)
5892 {
5893 /* It's quite strange to require more memory without
5894 consuming any bytes. Perhaps CCL program bug. */
9864ebce
KH
5895 break;
5896 }
fb88bf2d
KH
5897 if (first)
5898 {
5899 /* We have just done the first batch of conversion which was
   stopped because of insufficient gap.  Let's reconsider the
   required gap size (i.e. SRC - DST) now.

   We have converted ORIG bytes (== coding->consumed) into
   NEW bytes (coding->produced).  To convert the remaining
   LEN bytes, we may need REQUIRE bytes of gap, where:
   REQUIRE + LEN_BYTE = LEN_BYTE * (NEW / ORIG)
   REQUIRE = LEN_BYTE * (NEW - ORIG) / ORIG
   Here, we are sure that NEW >= ORIG. */
b3385c28
KH
5909
5910 if (coding->produced <= coding->consumed)
5911 {
5912 /* This happens because of CCL-based coding system with
5913 eol-type CRLF. */
5914 require = 0;
5915 }
5916 else
5917 {
b3ebb2d4
KH
5918 float ratio = coding->produced - coding->consumed;
5919 ratio /= coding->consumed;
b3385c28
KH
5920 require = len_byte * ratio;
5921 }
fb88bf2d
KH
5922 first = 0;
5923 }
5924 if ((src - dst) < (require + 2000))
5925 {
5926 /* See the comment above the previous call of make_gap. */
5927 int add = len_byte + inserted_byte;
5928
5929 GAP_SIZE -= add;
5930 ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
5931 GPT += inserted_byte; GPT_BYTE += inserted_byte;
5932 make_gap (require + 2000);
5933 GAP_SIZE += add;
5934 ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
5935 GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
fb88bf2d 5936 }
d46c5b12 5937 }
fb88bf2d
KH
5938 if (src - dst > 0) *dst = 0; /* Put an anchor. */
5939
b73bfc1c
KH
5940 if (encodep && coding->dst_multibyte)
5941 {
5942 /* The output is unibyte. We must convert 8-bit characters to
5943 multibyte form. */
5944 if (inserted_byte * 2 > GAP_SIZE)
5945 {
5946 GAP_SIZE -= inserted_byte;
5947 ZV += inserted_byte; Z += inserted_byte;
5948 ZV_BYTE += inserted_byte; Z_BYTE += inserted_byte;
5949 GPT += inserted_byte; GPT_BYTE += inserted_byte;
5950 make_gap (inserted_byte - GAP_SIZE);
5951 GAP_SIZE += inserted_byte;
5952 ZV -= inserted_byte; Z -= inserted_byte;
5953 ZV_BYTE -= inserted_byte; Z_BYTE -= inserted_byte;
5954 GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5955 }
5956 inserted_byte = str_to_multibyte (GPT_ADDR, GAP_SIZE, inserted_byte);
5957 }
7553d0e1 5958
93dec019 5959 /* If we shrank the conversion area, adjust it now. */
12410ef1
KH
5960 if (total_skip > 0)
5961 {
5962 if (tail_skip > 0)
5963 safe_bcopy (GAP_END_ADDR, GPT_ADDR + inserted_byte, tail_skip);
5964 inserted += total_skip; inserted_byte += total_skip;
5965 GAP_SIZE += total_skip;
5966 GPT -= head_skip; GPT_BYTE -= head_skip;
5967 ZV -= total_skip; ZV_BYTE -= total_skip;
5968 Z -= total_skip; Z_BYTE -= total_skip;
5969 from -= head_skip; from_byte -= head_skip;
5970 to += tail_skip; to_byte += tail_skip;
5971 }
5972
/* Hand the converted text over to the buffer.  INSERTED is then
   recomputed as the change in Z across that call.  */
6abb9bd9 5973 prev_Z = Z;
72d1a715
RS
5974 if (! EQ (current_buffer->undo_list, Qt))
5975 adjust_after_replace (from, from_byte, deletion, inserted, inserted_byte);
5976 else
5977 adjust_after_replace_noundo (from, from_byte, nchars_del, nbytes_del,
5978 inserted, inserted_byte);
6abb9bd9 5979 inserted = Z - prev_Z;
4ed46869 5980
ec6d2bb8
KH
5981 if (!encodep && coding->cmp_data && coding->cmp_data->used)
5982 coding_restore_composition (coding, Fcurrent_buffer ());
5983 coding_free_composition_data (coding);
5984
/* For decoding, run post-read-conversion on the inserted text with
   point at FROM.  Vlast_coding_system_used is bound to
   CODING->symbol around the call, and whatever the function leaves
   there becomes the new CODING->symbol.  */
b73bfc1c
KH
5985 if (! inhibit_pre_post_conversion
5986 && ! encodep && ! NILP (coding->post_read_conversion))
d46c5b12 5987 {
2b4f9037 5988 Lisp_Object val;
1c7457e2 5989 Lisp_Object saved_coding_system;
4ed46869 5990
e133c8fa
KH
5991 if (from != PT)
5992 TEMP_SET_PT_BOTH (from, from_byte);
6abb9bd9 5993 prev_Z = Z;
1c7457e2 5994 record_unwind_protect (code_convert_region_unwind,
16ef9c56 5995 Fcons (Vlast_coding_system_used, Qnil));
1c7457e2
KH
5996 saved_coding_system = Vlast_coding_system_used;
5997 Vlast_coding_system_used = coding->symbol;
b843d1ae
KH
5998 /* We should not call any more pre-write/post-read-conversion
5999 functions while this post-read-conversion is running. */
6000 inhibit_pre_post_conversion = 1;
2b4f9037 6001 val = call1 (coding->post_read_conversion, make_number (inserted));
b843d1ae 6002 inhibit_pre_post_conversion = 0;
1c7457e2
KH
6003 coding->symbol = Vlast_coding_system_used;
6004 Vlast_coding_system_used = saved_coding_system;
b843d1ae
KH
6005 /* Discard the unwind protect. */
6006 specpdl_ptr--;
b7826503 6007 CHECK_NUMBER (val);
944bd420 6008 inserted += Z - prev_Z;
e133c8fa
KH
6009 }
6010
/* Restore point: shift it by the size change if it was at or after
   the end of the original region, otherwise put it at FROM.  */
6011 if (orig_point >= from)
6012 {
6013 if (orig_point >= from + orig_len)
6014 orig_point += inserted - orig_len;
6015 else
6016 orig_point = from;
6017 TEMP_SET_PT (orig_point);
d46c5b12 6018 }
4ed46869 6019
ec6d2bb8
KH
6020 if (replace)
6021 {
6022 signal_after_change (from, to - from, inserted);
e19539f1 6023 update_compositions (from, from + inserted, CHECK_BORDER);
ec6d2bb8 6024 }
2b4f9037 6025
/* Report the totals for the whole region back to the caller.  */
fb88bf2d 6026 {
12410ef1
KH
6027 coding->consumed = to_byte - from_byte;
6028 coding->consumed_char = to - from;
6029 coding->produced = inserted_byte;
6030 coding->produced_char = inserted;
fb88bf2d 6031 }
7553d0e1 6032
fb88bf2d 6033 return 0;
d46c5b12
KH
6034}
6035
2a47931b
KH
6036/* Name (or base name) of work buffer for code conversion.
   set_conversion_work_buffer creates or reuses the buffer of this
   name; when that buffer is already the current one, it uses the
   name as the base for generating a fresh buffer name instead.  */
6037static Lisp_Object Vcode_conversion_workbuf_name;
6038
6039/* Set the current buffer to the working buffer prepared for
6040 code-conversion. MULTIBYTE specifies the multibyteness of the
16ef9c56
KH
6041 buffer. Return the buffer we set if it must be killed after use.
6042 Otherwise return Qnil. */
2a47931b 6043
16ef9c56 6044static Lisp_Object
2a47931b
KH
6045set_conversion_work_buffer (multibyte)
6046 int multibyte;
6047{
16ef9c56 6048 Lisp_Object buffer, buffer_to_kill;
2a47931b
KH
6049 struct buffer *buf;
6050
6051 buffer = Fget_buffer_create (Vcode_conversion_workbuf_name);
6052 buf = XBUFFER (buffer);
16ef9c56
KH
6053 if (buf == current_buffer)
6054 {
6055 /* As we are already in the work buffer, we must generate a new
6056 buffer for the work. */
6057 Lisp_Object name;
6058
6059 name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
6060 buffer = buffer_to_kill = Fget_buffer_create (name);
6061 buf = XBUFFER (buffer);
6062 }
6063 else
6064 buffer_to_kill = Qnil;
6065
2a47931b
KH
6066 delete_all_overlays (buf);
6067 buf->directory = current_buffer->directory;
6068 buf->read_only = Qnil;
6069 buf->filename = Qnil;
6070 buf->undo_list = Qt;
6071 eassert (buf->overlays_before == NULL);
6072 eassert (buf->overlays_after == NULL);
6073 set_buffer_internal (buf);
6074 if (BEG != BEGV || Z != ZV)
6075 Fwiden ();
6076 del_range_2 (BEG, BEG_BYTE, Z, Z_BYTE, 0);
6077 buf->enable_multibyte_characters = multibyte ? Qt : Qnil;
16ef9c56 6078 return buffer_to_kill;
2a47931b
KH
6079}
6080
d46c5b12 6081Lisp_Object
b73bfc1c
KH
6082run_pre_post_conversion_on_str (str, coding, encodep)
6083 Lisp_Object str;
6084 struct coding_system *coding;
6085 int encodep;
6086{
aed13378 6087 int count = SPECPDL_INDEX ();
cf3b32fc 6088 struct gcpro gcpro1, gcpro2;
b73bfc1c 6089 int multibyte = STRING_MULTIBYTE (str);
cf3b32fc 6090 Lisp_Object old_deactivate_mark;
16ef9c56 6091 Lisp_Object buffer_to_kill;
24a2b282 6092 Lisp_Object unwind_arg;
b73bfc1c
KH
6093
6094 record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
cf3b32fc
RS
6095 /* It is not crucial to specbind this. */
6096 old_deactivate_mark = Vdeactivate_mark;
6097 GCPRO2 (str, old_deactivate_mark);
3fd9494b 6098
b73bfc1c
KH
6099 /* We must insert the contents of STR as is without
6100 unibyte<->multibyte conversion. For that, we adjust the
6101 multibyteness of the working buffer to that of STR. */
16ef9c56 6102 buffer_to_kill = set_conversion_work_buffer (multibyte);
24a2b282
KH
6103 if (NILP (buffer_to_kill))
6104 unwind_arg = Fcons (Vlast_coding_system_used, Qnil);
6105 else
6106 unwind_arg = list2 (Vlast_coding_system_used, buffer_to_kill);
6107 record_unwind_protect (code_convert_region_unwind, unwind_arg);
3fd9494b 6108
b73bfc1c 6109 insert_from_string (str, 0, 0,
d5db4077 6110 SCHARS (str), SBYTES (str), 0);
b73bfc1c
KH
6111 UNGCPRO;
6112 inhibit_pre_post_conversion = 1;
6113 if (encodep)
24a2b282
KH
6114 {
6115 struct buffer *prev = current_buffer;
6116
6117 call2 (coding->pre_write_conversion, make_number (BEG), make_number (Z));
6118 if (prev != current_buffer)
6119 /* We must kill the current buffer too. */
6120 Fsetcdr (unwind_arg, Fcons (Fcurrent_buffer (), XCDR (unwind_arg)));
6121 }
b73bfc1c 6122 else
6bac5b12 6123 {
1c7457e2 6124 Vlast_coding_system_used = coding->symbol;
6bac5b12
KH
6125 TEMP_SET_PT_BOTH (BEG, BEG_BYTE);
6126 call1 (coding->post_read_conversion, make_number (Z - BEG));
1c7457e2 6127 coding->symbol = Vlast_coding_system_used;
6bac5b12 6128 }
b73bfc1c 6129 inhibit_pre_post_conversion = 0;
cf3b32fc 6130 Vdeactivate_mark = old_deactivate_mark;
78108bcd 6131 str = make_buffer_string (BEG, Z, 1);
b73bfc1c
KH
6132 return unbind_to (count, str);
6133}
6134
2a47931b
KH
6135
6136/* Run pre-write-conversion function of CODING on NCHARS/NBYTES
6137 text in *STR. *SIZE is the allocated bytes for STR. As it
6138 is intended that this function is called from encode_terminal_code,
6139 the pre-write-conversion function is run by safe_call and thus
6140 "Error during redisplay: ..." is logged when an error occurs.
6141
6142 Store the resulting text in *STR and set CODING->produced_char and
6143 CODING->produced to the number of characters and bytes
6144 respectively. If the size of *STR is too small, enlarge it by
6145 xrealloc and update *STR and *SIZE. */
6146
6147void
6148run_pre_write_conversin_on_c_str (str, size, nchars, nbytes, coding)
6149 unsigned char **str;
6150 int *size, nchars, nbytes;
6151 struct coding_system *coding;
6152{
6153 struct gcpro gcpro1, gcpro2;
6154 struct buffer *cur = current_buffer;
24a2b282 6155 struct buffer *prev;
2a47931b
KH
6156 Lisp_Object old_deactivate_mark, old_last_coding_system_used;
6157 Lisp_Object args[3];
16ef9c56 6158 Lisp_Object buffer_to_kill;
2a47931b
KH
6159
6160 /* It is not crucial to specbind this. */
6161 old_deactivate_mark = Vdeactivate_mark;
6162 old_last_coding_system_used = Vlast_coding_system_used;
6163 GCPRO2 (old_deactivate_mark, old_last_coding_system_used);
6164
6165 /* We must insert the contents of STR as is without
6166 unibyte<->multibyte conversion. For that, we adjust the
6167 multibyteness of the working buffer to that of STR. */
16ef9c56 6168 buffer_to_kill = set_conversion_work_buffer (coding->src_multibyte);
2a47931b
KH
6169 insert_1_both (*str, nchars, nbytes, 0, 0, 0);
6170 UNGCPRO;
6171 inhibit_pre_post_conversion = 1;
24a2b282 6172 prev = current_buffer;
2a47931b
KH
6173 args[0] = coding->pre_write_conversion;
6174 args[1] = make_number (BEG);
6175 args[2] = make_number (Z);
6176 safe_call (3, args);
6177 inhibit_pre_post_conversion = 0;
6178 Vdeactivate_mark = old_deactivate_mark;
6179 Vlast_coding_system_used = old_last_coding_system_used;
6180 coding->produced_char = Z - BEG;
6181 coding->produced = Z_BYTE - BEG_BYTE;
6182 if (coding->produced > *size)
6183 {
6184 *size = coding->produced;
6185 *str = xrealloc (*str, *size);
6186 }
6187 if (BEG < GPT && GPT < Z)
6188 move_gap (BEG);
6189 bcopy (BEG_ADDR, *str, coding->produced);
6190 coding->src_multibyte
6191 = ! NILP (current_buffer->enable_multibyte_characters);
24a2b282
KH
6192 if (prev != current_buffer)
6193 Fkill_buffer (Fcurrent_buffer ());
2a47931b 6194 set_buffer_internal (cur);
16ef9c56
KH
6195 if (! NILP (buffer_to_kill))
6196 Fkill_buffer (buffer_to_kill);
2a47931b
KH
6197}
6198
6199
b73bfc1c
KH
6200Lisp_Object
6201decode_coding_string (str, coding, nocopy)
d46c5b12 6202 Lisp_Object str;
4ed46869 6203 struct coding_system *coding;
b73bfc1c 6204 int nocopy;
4ed46869 6205{
d46c5b12 6206 int len;
73be902c 6207 struct conversion_buffer buf;
da55a2b7 6208 int from, to_byte;
84d60297 6209 Lisp_Object saved_coding_symbol;
d46c5b12 6210 int result;
78108bcd 6211 int require_decoding;
73be902c
KH
6212 int shrinked_bytes = 0;
6213 Lisp_Object newstr;
2391eaa4 6214 int consumed, consumed_char, produced, produced_char;
4ed46869 6215
b73bfc1c 6216 from = 0;
d5db4077 6217 to_byte = SBYTES (str);
4ed46869 6218
8844fa83 6219 saved_coding_symbol = coding->symbol;
764ca8da
KH
6220 coding->src_multibyte = STRING_MULTIBYTE (str);
6221 coding->dst_multibyte = 1;
b73bfc1c 6222 if (CODING_REQUIRE_DETECTION (coding))
d46c5b12
KH
6223 {
6224 /* See the comments in code_convert_region. */
6225 if (coding->type == coding_type_undecided)
6226 {
d5db4077 6227 detect_coding (coding, SDATA (str), to_byte);
d46c5b12 6228 if (coding->type == coding_type_undecided)
d280ccb6
KH
6229 {
6230 coding->type = coding_type_emacs_mule;
6231 coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE;
6232 /* As emacs-mule decoder will handle composition, we
6233 need this setting to allocate coding->cmp_data
6234 later. */
6235 coding->composing = COMPOSITION_NO;
6236 }
d46c5b12 6237 }
aaaf0b1e
KH
6238 if (coding->eol_type == CODING_EOL_UNDECIDED
6239 && coding->type != coding_type_ccl)
d46c5b12
KH
6240 {
6241 saved_coding_symbol = coding->symbol;
d5db4077 6242 detect_eol (coding, SDATA (str), to_byte);
d46c5b12
KH
6243 if (coding->eol_type == CODING_EOL_UNDECIDED)
6244 coding->eol_type = CODING_EOL_LF;
6245 /* We had better recover the original eol format if we
8ca3766a 6246 encounter an inconsistent eol format while decoding. */
d46c5b12
KH
6247 coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
6248 }
6249 }
4ed46869 6250
764ca8da
KH
6251 if (coding->type == coding_type_no_conversion
6252 || coding->type == coding_type_raw_text)
6253 coding->dst_multibyte = 0;
6254
78108bcd 6255 require_decoding = CODING_REQUIRE_DECODING (coding);
ec6d2bb8 6256
b73bfc1c 6257 if (STRING_MULTIBYTE (str))
d46c5b12 6258 {
b73bfc1c
KH
6259 /* Decoding routines expect the source text to be unibyte. */
6260 str = Fstring_as_unibyte (str);
d5db4077 6261 to_byte = SBYTES (str);
b73bfc1c 6262 nocopy = 1;
764ca8da 6263 coding->src_multibyte = 0;
b73bfc1c 6264 }
ec6d2bb8 6265
b73bfc1c 6266 /* Try to skip the heading and tailing ASCIIs. */
78108bcd 6267 if (require_decoding && coding->type != coding_type_ccl)
4956c225 6268 {
d5db4077 6269 SHRINK_CONVERSION_REGION (&from, &to_byte, coding, SDATA (str),
4956c225
KH
6270 0);
6271 if (from == to_byte)
78108bcd 6272 require_decoding = 0;
d5db4077 6273 shrinked_bytes = from + (SBYTES (str) - to_byte);
4956c225 6274 }
b73bfc1c 6275
439ad9ea
KH
6276 if (!require_decoding
6277 && !(SYMBOLP (coding->post_read_conversion)
6278 && !NILP (Ffboundp (coding->post_read_conversion))))
78108bcd 6279 {
d5db4077
KR
6280 coding->consumed = SBYTES (str);
6281 coding->consumed_char = SCHARS (str);
78108bcd
KH
6282 if (coding->dst_multibyte)
6283 {
6284 str = Fstring_as_multibyte (str);
6285 nocopy = 1;
6286 }
d5db4077
KR
6287 coding->produced = SBYTES (str);
6288 coding->produced_char = SCHARS (str);
78108bcd
KH
6289 return (nocopy ? str : Fcopy_sequence (str));
6290 }
6291
6292 if (coding->composing != COMPOSITION_DISABLED)
6293 coding_allocate_composition_data (coding, from);
b73bfc1c 6294 len = decoding_buffer_size (coding, to_byte - from);
73be902c 6295 allocate_conversion_buffer (buf, len);
4ed46869 6296
2391eaa4 6297 consumed = consumed_char = produced = produced_char = 0;
73be902c 6298 while (1)
4ed46869 6299 {
d5db4077 6300 result = decode_coding (coding, SDATA (str) + from + consumed,
73be902c
KH
6301 buf.data + produced, to_byte - from - consumed,
6302 buf.size - produced);
6303 consumed += coding->consumed;
2391eaa4 6304 consumed_char += coding->consumed_char;
73be902c
KH
6305 produced += coding->produced;
6306 produced_char += coding->produced_char;
2391eaa4 6307 if (result == CODING_FINISH_NORMAL
c3912f23 6308 || result == CODING_FINISH_INTERRUPT
2391eaa4
KH
6309 || (result == CODING_FINISH_INSUFFICIENT_SRC
6310 && coding->consumed == 0))
73be902c
KH
6311 break;
6312 if (result == CODING_FINISH_INSUFFICIENT_CMP)
6313 coding_allocate_composition_data (coding, from + produced_char);
6314 else if (result == CODING_FINISH_INSUFFICIENT_DST)
6315 extend_conversion_buffer (&buf);
6316 else if (result == CODING_FINISH_INCONSISTENT_EOL)
6317 {
8844fa83
KH
6318 Lisp_Object eol_type;
6319
73be902c
KH
6320 /* Recover the original EOL format. */
6321 if (coding->eol_type == CODING_EOL_CR)
6322 {
6323 unsigned char *p;
6324 for (p = buf.data; p < buf.data + produced; p++)
6325 if (*p == '\n') *p = '\r';
6326 }
6327 else if (coding->eol_type == CODING_EOL_CRLF)
6328 {
6329 int num_eol = 0;
6330 unsigned char *p0, *p1;
6331 for (p0 = buf.data, p1 = p0 + produced; p0 < p1; p0++)
6332 if (*p0 == '\n') num_eol++;
6333 if (produced + num_eol >= buf.size)
6334 extend_conversion_buffer (&buf);
6335 for (p0 = buf.data + produced, p1 = p0 + num_eol; p0 > buf.data;)
6336 {
6337 *--p1 = *--p0;
6338 if (*p0 == '\n') *--p1 = '\r';
6339 }
6340 produced += num_eol;
6341 produced_char += num_eol;
93dec019 6342 }
8844fa83 6343 /* Suppress eol-format conversion in the further conversion. */
73be902c 6344 coding->eol_type = CODING_EOL_LF;
8844fa83
KH
6345
6346 /* Set the coding system symbol to that for Unix-like EOL. */
6347 eol_type = Fget (saved_coding_symbol, Qeol_type);
6348 if (VECTORP (eol_type)
6349 && XVECTOR (eol_type)->size == 3
6350 && SYMBOLP (XVECTOR (eol_type)->contents[CODING_EOL_LF]))
6351 coding->symbol = XVECTOR (eol_type)->contents[CODING_EOL_LF];
6352 else
6353 coding->symbol = saved_coding_symbol;
6354
6355
73be902c 6356 }
4ed46869 6357 }
d46c5b12 6358
2391eaa4
KH
6359 coding->consumed = consumed;
6360 coding->consumed_char = consumed_char;
6361 coding->produced = produced;
6362 coding->produced_char = produced_char;
6363
78108bcd 6364 if (coding->dst_multibyte)
73be902c
KH
6365 newstr = make_uninit_multibyte_string (produced_char + shrinked_bytes,
6366 produced + shrinked_bytes);
78108bcd 6367 else
73be902c
KH
6368 newstr = make_uninit_string (produced + shrinked_bytes);
6369 if (from > 0)
a4244313
KR
6370 STRING_COPYIN (newstr, 0, SDATA (str), from);
6371 STRING_COPYIN (newstr, from, buf.data, produced);
73be902c 6372 if (shrinked_bytes > from)
a4244313
KR
6373 STRING_COPYIN (newstr, from + produced,
6374 SDATA (str) + to_byte,
6375 shrinked_bytes - from);
73be902c 6376 free_conversion_buffer (&buf);
b73bfc1c 6377
160a708c
KH
6378 coding->consumed += shrinked_bytes;
6379 coding->consumed_char += shrinked_bytes;
6380 coding->produced += shrinked_bytes;
6381 coding->produced_char += shrinked_bytes;
6382
b73bfc1c 6383 if (coding->cmp_data && coding->cmp_data->used)
73be902c 6384 coding_restore_composition (coding, newstr);
b73bfc1c
KH
6385 coding_free_composition_data (coding);
6386
6387 if (SYMBOLP (coding->post_read_conversion)
6388 && !NILP (Ffboundp (coding->post_read_conversion)))
73be902c 6389 newstr = run_pre_post_conversion_on_str (newstr, coding, 0);
b73bfc1c 6390
73be902c 6391 return newstr;
b73bfc1c
KH
6392}
6393
6394Lisp_Object
6395encode_coding_string (str, coding, nocopy)
6396 Lisp_Object str;
6397 struct coding_system *coding;
6398 int nocopy;
6399{
6400 int len;
73be902c 6401 struct conversion_buffer buf;
b73bfc1c 6402 int from, to, to_byte;
b73bfc1c 6403 int result;
73be902c
KH
6404 int shrinked_bytes = 0;
6405 Lisp_Object newstr;
2391eaa4 6406 int consumed, consumed_char, produced, produced_char;
b73bfc1c
KH
6407
6408 if (SYMBOLP (coding->pre_write_conversion)
6409 && !NILP (Ffboundp (coding->pre_write_conversion)))
3bb917bf
KH
6410 {
6411 str = run_pre_post_conversion_on_str (str, coding, 1);
6412 /* As STR is just newly generated, we don't have to copy it
6413 anymore. */
6414 nocopy = 1;
6415 }
b73bfc1c
KH
6416
6417 from = 0;
d5db4077
KR
6418 to = SCHARS (str);
6419 to_byte = SBYTES (str);
b73bfc1c 6420
e2c06b17
KH
6421 /* Encoding routines determine the multibyteness of the source text
6422 by coding->src_multibyte. */
3bb917bf 6423 coding->src_multibyte = SCHARS (str) < SBYTES (str);
e2c06b17 6424 coding->dst_multibyte = 0;
b73bfc1c 6425 if (! CODING_REQUIRE_ENCODING (coding))
3bb917bf 6426 goto no_need_of_encoding;
826bfb8b 6427
b73bfc1c
KH
6428 if (coding->composing != COMPOSITION_DISABLED)
6429 coding_save_composition (coding, from, to, str);
ec6d2bb8 6430
ce559e6f
KH
6431 /* Try to skip the heading and tailing ASCIIs. We can't skip them
6432 if we must run CCL program or there are compositions to
6433 encode. */
6434 if (coding->type != coding_type_ccl
6435 && (! coding->cmp_data || coding->cmp_data->used == 0))
4956c225 6436 {
d5db4077 6437 SHRINK_CONVERSION_REGION (&from, &to_byte, coding, SDATA (str),
4956c225
KH
6438 1);
6439 if (from == to_byte)
ce559e6f
KH
6440 {
6441 coding_free_composition_data (coding);
3bb917bf 6442 goto no_need_of_encoding;
ce559e6f 6443 }
d5db4077 6444 shrinked_bytes = from + (SBYTES (str) - to_byte);
4956c225 6445 }
b73bfc1c
KH
6446
6447 len = encoding_buffer_size (coding, to_byte - from);
73be902c
KH
6448 allocate_conversion_buffer (buf, len);
6449
2391eaa4 6450 consumed = consumed_char = produced = produced_char = 0;
73be902c
KH
6451 while (1)
6452 {
d5db4077 6453 result = encode_coding (coding, SDATA (str) + from + consumed,
73be902c
KH
6454 buf.data + produced, to_byte - from - consumed,
6455 buf.size - produced);
6456 consumed += coding->consumed;
2391eaa4 6457 consumed_char += coding->consumed_char;
13004bef 6458 produced += coding->produced;
2391eaa4
KH
6459 produced_char += coding->produced_char;
6460 if (result == CODING_FINISH_NORMAL
230779b9 6461 || result == CODING_FINISH_INTERRUPT
2391eaa4
KH
6462 || (result == CODING_FINISH_INSUFFICIENT_SRC
6463 && coding->consumed == 0))
73be902c
KH
6464 break;
6465 /* Now result should be CODING_FINISH_INSUFFICIENT_DST. */
6466 extend_conversion_buffer (&buf);
6467 }
6468
2391eaa4
KH
6469 coding->consumed = consumed;
6470 coding->consumed_char = consumed_char;
6471 coding->produced = produced;
6472 coding->produced_char = produced_char;
6473
73be902c 6474 newstr = make_uninit_string (produced + shrinked_bytes);
b73bfc1c 6475 if (from > 0)
a4244313
KR
6476 STRING_COPYIN (newstr, 0, SDATA (str), from);
6477 STRING_COPYIN (newstr, from, buf.data, produced);
73be902c 6478 if (shrinked_bytes > from)
a4244313
KR
6479 STRING_COPYIN (newstr, from + produced,
6480 SDATA (str) + to_byte,
6481 shrinked_bytes - from);
73be902c
KH
6482
6483 free_conversion_buffer (&buf);
ec6d2bb8 6484 coding_free_composition_data (coding);
b73bfc1c 6485
73be902c 6486 return newstr;
3bb917bf
KH
6487
6488 no_need_of_encoding:
6489 coding->consumed = SBYTES (str);
6490 coding->consumed_char = SCHARS (str);
6491 if (STRING_MULTIBYTE (str))
6492 {
6493 if (nocopy)
6494 /* We are sure that STR doesn't contain a multibyte
6495 character. */
6496 STRING_SET_UNIBYTE (str);
6497 else
6498 {
6499 str = Fstring_as_unibyte (str);
6500 nocopy = 1;
6501 }
6502 }
6503 coding->produced = SBYTES (str);
6504 coding->produced_char = SCHARS (str);
6505 return (nocopy ? str : Fcopy_sequence (str));
4ed46869
KH
6506}
6507
6508\f
6509#ifdef emacs
1397dc18 6510/*** 8. Emacs Lisp library functions ***/
4ed46869 6511
4ed46869 6512DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
48b0f3ae
PJ
6513 doc: /* Return t if OBJECT is nil or a coding-system.
6514See the documentation of `make-coding-system' for information
6515about coding-system objects. */)
6516 (obj)
4ed46869
KH
6517 Lisp_Object obj;
6518{
4608c386
KH
6519 if (NILP (obj))
6520 return Qt;
6521 if (!SYMBOLP (obj))
6522 return Qnil;
c2164d91
KH
6523 if (! NILP (Fget (obj, Qcoding_system_define_form)))
6524 return Qt;
4608c386
KH
6525 /* Get coding-spec vector for OBJ. */
6526 obj = Fget (obj, Qcoding_system);
6527 return ((VECTORP (obj) && XVECTOR (obj)->size == 5)
6528 ? Qt : Qnil);
4ed46869
KH
6529}
6530
9d991de8
RS
6531DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
6532 Sread_non_nil_coding_system, 1, 1, 0,
48b0f3ae
PJ
6533 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT. */)
6534 (prompt)
4ed46869
KH
6535 Lisp_Object prompt;
6536{
e0e989f6 6537 Lisp_Object val;
9d991de8
RS
6538 do
6539 {
4608c386
KH
6540 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6541 Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
9d991de8 6542 }
d5db4077 6543 while (SCHARS (val) == 0);
e0e989f6 6544 return (Fintern (val, Qnil));
4ed46869
KH
6545}
6546
9b787f3e 6547DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
48b0f3ae
PJ
6548 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
6549If the user enters null input, return second argument DEFAULT-CODING-SYSTEM. */)
6550 (prompt, default_coding_system)
9b787f3e 6551 Lisp_Object prompt, default_coding_system;
4ed46869 6552{
f44d27ce 6553 Lisp_Object val;
9b787f3e 6554 if (SYMBOLP (default_coding_system))
57d25e6f 6555 default_coding_system = SYMBOL_NAME (default_coding_system);
4608c386 6556 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
9b787f3e
RS
6557 Qt, Qnil, Qcoding_system_history,
6558 default_coding_system, Qnil);
d5db4077 6559 return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
4ed46869
KH
6560}
6561
6562DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
6563 1, 1, 0,
48b0f3ae
PJ
6564 doc: /* Check validity of CODING-SYSTEM.
6565If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
303cdc2d 6566It is valid if it is nil or a symbol with a non-nil `coding-system' property.
de1d1a40 6567The value of this property should be a vector of length 5. */)
48b0f3ae 6568 (coding_system)
4ed46869
KH
6569 Lisp_Object coding_system;
6570{
a362520d
KH
6571 Lisp_Object define_form;
6572
6573 define_form = Fget (coding_system, Qcoding_system_define_form);
6574 if (! NILP (define_form))
6575 {
6576 Fput (coding_system, Qcoding_system_define_form, Qnil);
6577 safe_eval (define_form);
6578 }
4ed46869
KH
6579 if (!NILP (Fcoding_system_p (coding_system)))
6580 return coding_system;
6581 while (1)
02ba4723 6582 Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
4ed46869 6583}
3a73fa5d 6584\f
d46c5b12 6585Lisp_Object
0a28aafb 6586detect_coding_system (src, src_bytes, highest, multibytep)
a4244313 6587 const unsigned char *src;
d46c5b12 6588 int src_bytes, highest;
0a28aafb 6589 int multibytep;
4ed46869
KH
6590{
6591 int coding_mask, eol_type;
d46c5b12
KH
6592 Lisp_Object val, tmp;
6593 int dummy;
4ed46869 6594
0a28aafb 6595 coding_mask = detect_coding_mask (src, src_bytes, NULL, &dummy, multibytep);
d46c5b12
KH
6596 eol_type = detect_eol_type (src, src_bytes, &dummy);
6597 if (eol_type == CODING_EOL_INCONSISTENT)
25b02698 6598 eol_type = CODING_EOL_UNDECIDED;
4ed46869 6599
d46c5b12 6600 if (!coding_mask)
4ed46869 6601 {
27901516 6602 val = Qundecided;
d46c5b12 6603 if (eol_type != CODING_EOL_UNDECIDED)
4ed46869 6604 {
f44d27ce
RS
6605 Lisp_Object val2;
6606 val2 = Fget (Qundecided, Qeol_type);
4ed46869
KH
6607 if (VECTORP (val2))
6608 val = XVECTOR (val2)->contents[eol_type];
6609 }
80e803b4 6610 return (highest ? val : Fcons (val, Qnil));
4ed46869 6611 }
4ed46869 6612
d46c5b12
KH
6613 /* At first, gather possible coding systems in VAL. */
6614 val = Qnil;
fa42c37f 6615 for (tmp = Vcoding_category_list; CONSP (tmp); tmp = XCDR (tmp))
4ed46869 6616 {
fa42c37f
KH
6617 Lisp_Object category_val, category_index;
6618
6619 category_index = Fget (XCAR (tmp), Qcoding_category_index);
6620 category_val = Fsymbol_value (XCAR (tmp));
6621 if (!NILP (category_val)
6622 && NATNUMP (category_index)
6623 && (coding_mask & (1 << XFASTINT (category_index))))
4ed46869 6624 {
fa42c37f 6625 val = Fcons (category_val, val);
d46c5b12
KH
6626 if (highest)
6627 break;
4ed46869
KH
6628 }
6629 }
d46c5b12
KH
6630 if (!highest)
6631 val = Fnreverse (val);
4ed46869 6632
65059037 6633 /* Then, replace the elements with subsidiary coding systems. */
fa42c37f 6634 for (tmp = val; CONSP (tmp); tmp = XCDR (tmp))
4ed46869 6635 {
65059037
RS
6636 if (eol_type != CODING_EOL_UNDECIDED
6637 && eol_type != CODING_EOL_INCONSISTENT)
4ed46869 6638 {
d46c5b12 6639 Lisp_Object eol;
03699b14 6640 eol = Fget (XCAR (tmp), Qeol_type);
d46c5b12 6641 if (VECTORP (eol))
f3fbd155 6642 XSETCAR (tmp, XVECTOR (eol)->contents[eol_type]);
4ed46869
KH
6643 }
6644 }
03699b14 6645 return (highest ? XCAR (val) : val);
93dec019 6646}
4ed46869 6647
d46c5b12
KH
6648DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
6649 2, 3, 0,
40fd536c
KH
6650 doc: /* Detect how the byte sequence in the region is encoded.
6651Return a list of possible coding systems used on decoding a byte
6652sequence containing the bytes in the region between START and END when
6653the coding system `undecided' is specified. The list is ordered by
6654priority decided in the current language environment.
48b0f3ae
PJ
6655
6656If only ASCII characters are found, it returns a list of single element
6657`undecided' or its subsidiary coding system according to a detected
6658end-of-line format.
6659
6660If optional argument HIGHEST is non-nil, return the coding system of
6661highest priority. */)
6662 (start, end, highest)
d46c5b12
KH
6663 Lisp_Object start, end, highest;
6664{
6665 int from, to;
6666 int from_byte, to_byte;
682169fe 6667 int include_anchor_byte = 0;
6289dd10 6668
b7826503
PJ
6669 CHECK_NUMBER_COERCE_MARKER (start);
6670 CHECK_NUMBER_COERCE_MARKER (end);
4ed46869 6671
d46c5b12
KH
6672 validate_region (&start, &end);
6673 from = XINT (start), to = XINT (end);
6674 from_byte = CHAR_TO_BYTE (from);
6675 to_byte = CHAR_TO_BYTE (to);
6289dd10 6676
d46c5b12
KH
6677 if (from < GPT && to >= GPT)
6678 move_gap_both (to, to_byte);
c210f766
KH
6679 /* If we an anchor byte `\0' follows the region, we include it in
6680 the detecting source. Then code detectors can handle the tailing
6681 byte sequence more accurately.
6682
7d0393cf 6683 Fix me: This is not a perfect solution. It is better that we
c210f766
KH
6684 add one more argument, say LAST_BLOCK, to all detect_coding_XXX.
6685 */
682169fe
KH
6686 if (to == Z || (to == GPT && GAP_SIZE > 0))
6687 include_anchor_byte = 1;
d46c5b12 6688 return detect_coding_system (BYTE_POS_ADDR (from_byte),
682169fe 6689 to_byte - from_byte + include_anchor_byte,
0a28aafb
KH
6690 !NILP (highest),
6691 !NILP (current_buffer
6692 ->enable_multibyte_characters));
d46c5b12 6693}
6289dd10 6694
d46c5b12
KH
6695DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
6696 1, 2, 0,
eec1f3c7
KH
6697 doc: /* Detect how the byte sequence in STRING is encoded.
6698Return a list of possible coding systems used on decoding a byte
6699sequence containing the bytes in STRING when the coding system
6700`undecided' is specified. The list is ordered by priority decided in
6701the current language environment.
48b0f3ae
PJ
6702
6703If only ASCII characters are found, it returns a list of single element
6704`undecided' or its subsidiary coding system according to a detected
6705end-of-line format.
6706
6707If optional argument HIGHEST is non-nil, return the coding system of
6708highest priority. */)
6709 (string, highest)
d46c5b12
KH
6710 Lisp_Object string, highest;
6711{
b7826503 6712 CHECK_STRING (string);
4ed46869 6713
d5db4077 6714 return detect_coding_system (SDATA (string),
682169fe
KH
6715 /* "+ 1" is to include the anchor byte
6716 `\0'. With this, code detectors can
c210f766
KH
6717 handle the tailing bytes more
6718 accurately. */
d5db4077 6719 SBYTES (string) + 1,
0a28aafb
KH
6720 !NILP (highest),
6721 STRING_MULTIBYTE (string));
4ed46869
KH
6722}
6723
d12168d6 6724/* Subroutine for Ffind_coding_systems_region_internal.
05e6f5dc
KH
6725
6726 Return a list of coding systems that safely encode the multibyte
b666620c 6727 text between P and PEND. SAFE_CODINGS, if non-nil, is an alist of
05e6f5dc
KH
6728 possible coding systems. If it is nil, it means that we have not
6729 yet found any coding systems.
6730
12d5b185
KH
6731 WORK_TABLE a char-table of which element is set to t once the
6732 element is looked up.
05e6f5dc
KH
6733
6734 If a non-ASCII single byte char is found, set
6735 *single_byte_char_found to 1. */
6736
6737static Lisp_Object
6738find_safe_codings (p, pend, safe_codings, work_table, single_byte_char_found)
6739 unsigned char *p, *pend;
6740 Lisp_Object safe_codings, work_table;
6741 int *single_byte_char_found;
6b89e3aa 6742{
f1ce3dcf 6743 int c, len;
6b89e3aa
KH
6744 Lisp_Object val, ch;
6745 Lisp_Object prev, tail;
177c0ea7 6746
12d5b185
KH
6747 if (NILP (safe_codings))
6748 goto done_safe_codings;
6b89e3aa
KH
6749 while (p < pend)
6750 {
6751 c = STRING_CHAR_AND_LENGTH (p, pend - p, len);
6752 p += len;
6753 if (ASCII_BYTE_P (c))
6754 /* We can ignore ASCII characters here. */
6755 continue;
6756 if (SINGLE_BYTE_CHAR_P (c))
6757 *single_byte_char_found = 1;
6b89e3aa
KH
6758 /* Check the safe coding systems for C. */
6759 ch = make_number (c);
6760 val = Faref (work_table, ch);
6761 if (EQ (val, Qt))
6762 /* This element was already checked. Ignore it. */
6763 continue;
6764 /* Remember that we checked this element. */
6765 Faset (work_table, ch, Qt);
6766
6767 for (prev = tail = safe_codings; CONSP (tail); tail = XCDR (tail))
6768 {
b666620c
KH
6769 Lisp_Object elt, translation_table, hash_table, accept_latin_extra;
6770 int encodable;
6771
6772 elt = XCAR (tail);
6773 if (CONSP (XCDR (elt)))
6774 {
6775 /* This entry has this format now:
6776 ( CODING SAFE-CHARS TRANSLATION-TABLE HASH-TABLE
6777 ACCEPT-LATIN-EXTRA ) */
6778 val = XCDR (elt);
6779 encodable = ! NILP (Faref (XCAR (val), ch));
6780 if (! encodable)
6781 {
6782 val = XCDR (val);
6783 translation_table = XCAR (val);
6784 hash_table = XCAR (XCDR (val));
6785 accept_latin_extra = XCAR (XCDR (XCDR (val)));
6786 }
6787 }
6788 else
6789 {
6790 /* This entry has this format now: ( CODING . SAFE-CHARS) */
6791 encodable = ! NILP (Faref (XCDR (elt), ch));
6792 if (! encodable)
6793 {
6794 /* Transform the format to:
6795 ( CODING SAFE-CHARS TRANSLATION-TABLE HASH-TABLE
6796 ACCEPT-LATIN-EXTRA ) */
6797 val = Fget (XCAR (elt), Qcoding_system);
6798 translation_table
6799 = Fplist_get (AREF (val, 3),
6800 Qtranslation_table_for_encode);
6801 if (SYMBOLP (translation_table))
6802 translation_table = Fget (translation_table,
6803 Qtranslation_table);
6804 hash_table
6805 = (CHAR_TABLE_P (translation_table)
6806 ? XCHAR_TABLE (translation_table)->extras[1]
6807 : Qnil);
6808 accept_latin_extra
6809 = ((EQ (AREF (val, 0), make_number (2))
6810 && VECTORP (AREF (val, 4)))
58f99379 6811 ? AREF (AREF (val, 4), 16)
b666620c
KH
6812 : Qnil);
6813 XSETCAR (tail, list5 (XCAR (elt), XCDR (elt),
6814 translation_table, hash_table,
6815 accept_latin_extra));
6816 }
6817 }
43e4a82f 6818
b666620c
KH
6819 if (! encodable
6820 && ((CHAR_TABLE_P (translation_table)
6821 && ! NILP (Faref (translation_table, ch)))
6822 || (HASH_TABLE_P (hash_table)
6823 && ! NILP (Fgethash (ch, hash_table, Qnil)))
6824 || (SINGLE_BYTE_CHAR_P (c)
6825 && ! NILP (accept_latin_extra)
6826 && VECTORP (Vlatin_extra_code_table)
6827 && ! NILP (AREF (Vlatin_extra_code_table, c)))))
6828 encodable = 1;
6829 if (encodable)
6830 prev = tail;
6831 else
6b89e3aa 6832 {
7c695ab9 6833 /* Exclude this coding system from SAFE_CODINGS. */
6b89e3aa 6834 if (EQ (tail, safe_codings))
12d5b185
KH
6835 {
6836 safe_codings = XCDR (safe_codings);
6837 if (NILP (safe_codings))
6838 goto done_safe_codings;
6839 }
6b89e3aa
KH
6840 else
6841 XSETCDR (prev, XCDR (tail));
6842 }
6b89e3aa
KH
6843 }
6844 }
12d5b185
KH
6845
6846 done_safe_codings:
6847 /* If the above loop was terminated before P reaches PEND, it means
6848 SAFE_CODINGS was set to nil. If we have not yet found an
6849 non-ASCII single-byte char, check it now. */
6850 if (! *single_byte_char_found)
6851 while (p < pend)
6852 {
6853 c = STRING_CHAR_AND_LENGTH (p, pend - p, len);
6854 p += len;
6855 if (! ASCII_BYTE_P (c)
6856 && SINGLE_BYTE_CHAR_P (c))
6857 {
6858 *single_byte_char_found = 1;
6859 break;
6860 }
6861 }
6b89e3aa
KH
6862 return safe_codings;
6863}
6864
067a6a66
KH
6865DEFUN ("find-coding-systems-region-internal",
6866 Ffind_coding_systems_region_internal,
6867 Sfind_coding_systems_region_internal, 2, 2, 0,
6b89e3aa
KH
6868 doc: /* Internal use only. */)
6869 (start, end)
6870 Lisp_Object start, end;
6871{
6872 Lisp_Object work_table, safe_codings;
6873 int non_ascii_p = 0;
6874 int single_byte_char_found = 0;
6875 const unsigned char *p1, *p1end, *p2, *p2end, *p;
6876
6877 if (STRINGP (start))
6878 {
6879 if (!STRING_MULTIBYTE (start))
6880 return Qt;
6881 p1 = SDATA (start), p1end = p1 + SBYTES (start);
6882 p2 = p2end = p1end;
6883 if (SCHARS (start) != SBYTES (start))
6884 non_ascii_p = 1;
6885 }
6886 else
6887 {
6888 int from, to, stop;
6889
6890 CHECK_NUMBER_COERCE_MARKER (start);
6891 CHECK_NUMBER_COERCE_MARKER (end);
6892 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
6893 args_out_of_range (start, end);
6894 if (NILP (current_buffer->enable_multibyte_characters))
6895 return Qt;
6896 from = CHAR_TO_BYTE (XINT (start));
6897 to = CHAR_TO_BYTE (XINT (end));
6898 stop = from < GPT_BYTE && GPT_BYTE < to ? GPT_BYTE : to;
6899 p1 = BYTE_POS_ADDR (from), p1end = p1 + (stop - from);
6900 if (stop == to)
6901 p2 = p2end = p1end;
6902 else
6903 p2 = BYTE_POS_ADDR (stop), p2end = p2 + (to - stop);
6904 if (XINT (end) - XINT (start) != to - from)
6905 non_ascii_p = 1;
6906 }
6907
6908 if (!non_ascii_p)
6909 {
6910 /* We are sure that the text contains no multibyte character.
6911 Check if it contains eight-bit-graphic. */
6912 p = p1;
6913 for (p = p1; p < p1end && ASCII_BYTE_P (*p); p++);
6914 if (p == p1end)
6915 {
6916 for (p = p2; p < p2end && ASCII_BYTE_P (*p); p++);
6917 if (p == p2end)
6918 return Qt;
6919 }
6920 }
6921
6922 /* The text contains non-ASCII characters. */
6923
6924 work_table = Fmake_char_table (Qchar_coding_system, Qnil);
6925 safe_codings = Fcopy_sequence (XCDR (Vcoding_system_safe_chars));
6926
067a6a66
KH
6927 safe_codings = find_safe_codings (p1, p1end, safe_codings, work_table,
6928 &single_byte_char_found);
6b89e3aa 6929 if (p2 < p2end)
067a6a66
KH
6930 safe_codings = find_safe_codings (p2, p2end, safe_codings, work_table,
6931 &single_byte_char_found);
6b89e3aa
KH
6932 if (EQ (safe_codings, XCDR (Vcoding_system_safe_chars)))
6933 safe_codings = Qt;
6934 else
6935 {
6936 /* Turn safe_codings to a list of coding systems... */
6937 Lisp_Object val;
6938
6939 if (single_byte_char_found)
6940 /* ... and append these for eight-bit chars. */
6941 val = Fcons (Qraw_text,
6942 Fcons (Qemacs_mule, Fcons (Qno_conversion, Qnil)));
6943 else
6944 /* ... and append generic coding systems. */
6945 val = Fcopy_sequence (XCAR (Vcoding_system_safe_chars));
177c0ea7 6946
6b89e3aa
KH
6947 for (; CONSP (safe_codings); safe_codings = XCDR (safe_codings))
6948 val = Fcons (XCAR (XCAR (safe_codings)), val);
6949 safe_codings = val;
6950 }
6951
6952 return safe_codings;
6953}
6954
6955
068a9dbd
KH
6956/* Search from position POS for such characters that are unencodable
6957 accoding to SAFE_CHARS, and return a list of their positions. P
6958 points where in the memory the character at POS exists. Limit the
6959 search at PEND or when Nth unencodable characters are found.
6960
6961 If SAFE_CHARS is a char table, an element for an unencodable
6962 character is nil.
6963
6964 If SAFE_CHARS is nil, all non-ASCII characters are unencodable.
6965
6966 Otherwise, SAFE_CHARS is t, and only eight-bit-contrl and
6967 eight-bit-graphic characters are unencodable. */
6968
6969static Lisp_Object
6970unencodable_char_position (safe_chars, pos, p, pend, n)
6971 Lisp_Object safe_chars;
6972 int pos;
6973 unsigned char *p, *pend;
6974 int n;
6975{
6976 Lisp_Object pos_list;
6977
6978 pos_list = Qnil;
6979 while (p < pend)
6980 {
6981 int len;
6982 int c = STRING_CHAR_AND_LENGTH (p, MAX_MULTIBYTE_LENGTH, len);
7d0393cf 6983
068a9dbd
KH
6984 if (c >= 128
6985 && (CHAR_TABLE_P (safe_chars)
6986 ? NILP (CHAR_TABLE_REF (safe_chars, c))
6987 : (NILP (safe_chars) || c < 256)))
6988 {
6989 pos_list = Fcons (make_number (pos), pos_list);
6990 if (--n <= 0)
6991 break;
6992 }
6993 pos++;
6994 p += len;
6995 }
6996 return Fnreverse (pos_list);
6997}
6998
6999
7000DEFUN ("unencodable-char-position", Funencodable_char_position,
7001 Sunencodable_char_position, 3, 5, 0,
7002 doc: /*
7003Return position of first un-encodable character in a region.
7004START and END specfiy the region and CODING-SYSTEM specifies the
7005encoding to check. Return nil if CODING-SYSTEM does encode the region.
7006
7007If optional 4th argument COUNT is non-nil, it specifies at most how
7008many un-encodable characters to search. In this case, the value is a
7009list of positions.
7010
7011If optional 5th argument STRING is non-nil, it is a string to search
7012for un-encodable characters. In that case, START and END are indexes
7013to the string. */)
7014 (start, end, coding_system, count, string)
7015 Lisp_Object start, end, coding_system, count, string;
7016{
7017 int n;
7018 Lisp_Object safe_chars;
7019 struct coding_system coding;
7020 Lisp_Object positions;
7021 int from, to;
7022 unsigned char *p, *pend;
7023
7024 if (NILP (string))
7025 {
7026 validate_region (&start, &end);
7027 from = XINT (start);
7028 to = XINT (end);
7029 if (NILP (current_buffer->enable_multibyte_characters))
7030 return Qnil;
7031 p = CHAR_POS_ADDR (from);
200c93e2
KH
7032 if (to == GPT)
7033 pend = GPT_ADDR;
7034 else
7035 pend = CHAR_POS_ADDR (to);
068a9dbd
KH
7036 }
7037 else
7038 {
7039 CHECK_STRING (string);
7040 CHECK_NATNUM (start);
7041 CHECK_NATNUM (end);
7042 from = XINT (start);
7043 to = XINT (end);
7044 if (from > to
7045 || to > SCHARS (string))
7046 args_out_of_range_3 (string, start, end);
7047 if (! STRING_MULTIBYTE (string))
7048 return Qnil;
7049 p = SDATA (string) + string_char_to_byte (string, from);
7050 pend = SDATA (string) + string_char_to_byte (string, to);
7051 }
7052
7053 setup_coding_system (Fcheck_coding_system (coding_system), &coding);
7054
7055 if (NILP (count))
7056 n = 1;
7057 else
7058 {
7059 CHECK_NATNUM (count);
7060 n = XINT (count);
7061 }
7062
7063 if (coding.type == coding_type_no_conversion
7064 || coding.type == coding_type_raw_text)
7065 return Qnil;
7066
7067 if (coding.type == coding_type_undecided)
7068 safe_chars = Qnil;
7069 else
6b89e3aa 7070 safe_chars = coding_safe_chars (coding_system);
068a9dbd
KH
7071
7072 if (STRINGP (string)
7073 || from >= GPT || to <= GPT)
7074 positions = unencodable_char_position (safe_chars, from, p, pend, n);
7075 else
7076 {
7077 Lisp_Object args[2];
7078
7079 args[0] = unencodable_char_position (safe_chars, from, p, GPT_ADDR, n);
96d2e64d 7080 n -= XINT (Flength (args[0]));
068a9dbd
KH
7081 if (n <= 0)
7082 positions = args[0];
7083 else
7084 {
7085 args[1] = unencodable_char_position (safe_chars, GPT, GAP_END_ADDR,
7086 pend, n);
7087 positions = Fappend (2, args);
7088 }
7089 }
7090
7091 return (NILP (count) ? Fcar (positions) : positions);
7092}
7093
7094
4031e2bf
KH
7095Lisp_Object
7096code_convert_region1 (start, end, coding_system, encodep)
d46c5b12 7097 Lisp_Object start, end, coding_system;
4031e2bf 7098 int encodep;
3a73fa5d
RS
7099{
7100 struct coding_system coding;
da55a2b7 7101 int from, to;
3a73fa5d 7102
b7826503
PJ
7103 CHECK_NUMBER_COERCE_MARKER (start);
7104 CHECK_NUMBER_COERCE_MARKER (end);
7105 CHECK_SYMBOL (coding_system);
3a73fa5d 7106
d46c5b12
KH
7107 validate_region (&start, &end);
7108 from = XFASTINT (start);
7109 to = XFASTINT (end);
7110
3a73fa5d 7111 if (NILP (coding_system))
d46c5b12
KH
7112 return make_number (to - from);
7113
3a73fa5d 7114 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
d5db4077 7115 error ("Invalid coding system: %s", SDATA (SYMBOL_NAME (coding_system)));
3a73fa5d 7116
d46c5b12 7117 coding.mode |= CODING_MODE_LAST_BLOCK;
b73bfc1c
KH
7118 coding.src_multibyte = coding.dst_multibyte
7119 = !NILP (current_buffer->enable_multibyte_characters);
fb88bf2d
KH
7120 code_convert_region (from, CHAR_TO_BYTE (from), to, CHAR_TO_BYTE (to),
7121 &coding, encodep, 1);
f072a3e8 7122 Vlast_coding_system_used = coding.symbol;
fb88bf2d 7123 return make_number (coding.produced_char);
4031e2bf
KH
7124}
7125
7126DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
7127 3, 3, "r\nzCoding system: ",
48b0f3ae
PJ
7128 doc: /* Decode the current region from the specified coding system.
7129When called from a program, takes three arguments:
7130START, END, and CODING-SYSTEM. START and END are buffer positions.
7131This function sets `last-coding-system-used' to the precise coding system
7132used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7133not fully specified.)
7134It returns the length of the decoded text. */)
7135 (start, end, coding_system)
4031e2bf
KH
7136 Lisp_Object start, end, coding_system;
7137{
7138 return code_convert_region1 (start, end, coding_system, 0);
3a73fa5d
RS
7139}
7140
7141DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
7142 3, 3, "r\nzCoding system: ",
48b0f3ae
PJ
7143 doc: /* Encode the current region into the specified coding system.
7144When called from a program, takes three arguments:
7145START, END, and CODING-SYSTEM. START and END are buffer positions.
7146This function sets `last-coding-system-used' to the precise coding system
7147used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7148not fully specified.)
7149It returns the length of the encoded text. */)
7150 (start, end, coding_system)
d46c5b12 7151 Lisp_Object start, end, coding_system;
3a73fa5d 7152{
4031e2bf
KH
7153 return code_convert_region1 (start, end, coding_system, 1);
7154}
3a73fa5d 7155
4031e2bf
KH
7156Lisp_Object
7157code_convert_string1 (string, coding_system, nocopy, encodep)
7158 Lisp_Object string, coding_system, nocopy;
7159 int encodep;
7160{
7161 struct coding_system coding;
3a73fa5d 7162
b7826503
PJ
7163 CHECK_STRING (string);
7164 CHECK_SYMBOL (coding_system);
4ed46869 7165
d46c5b12 7166 if (NILP (coding_system))
4031e2bf 7167 return (NILP (nocopy) ? Fcopy_sequence (string) : string);
4ed46869 7168
d46c5b12 7169 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
d5db4077 7170 error ("Invalid coding system: %s", SDATA (SYMBOL_NAME (coding_system)));
5f1cd180 7171
d46c5b12 7172 coding.mode |= CODING_MODE_LAST_BLOCK;
b73bfc1c
KH
7173 string = (encodep
7174 ? encode_coding_string (string, &coding, !NILP (nocopy))
7175 : decode_coding_string (string, &coding, !NILP (nocopy)));
f072a3e8 7176 Vlast_coding_system_used = coding.symbol;
ec6d2bb8
KH
7177
7178 return string;
4ed46869
KH
7179}
7180
4ed46869 7181DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
e0e989f6 7182 2, 3, 0,
48b0f3ae
PJ
7183 doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
7184Optional arg NOCOPY non-nil means it is OK to return STRING itself
7185if the decoding operation is trivial.
7186This function sets `last-coding-system-used' to the precise coding system
7187used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7188not fully specified.) */)
7189 (string, coding_system, nocopy)
e0e989f6 7190 Lisp_Object string, coding_system, nocopy;
4ed46869 7191{
f072a3e8 7192 return code_convert_string1 (string, coding_system, nocopy, 0);
4ed46869
KH
7193}
7194
7195DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
e0e989f6 7196 2, 3, 0,
48b0f3ae
PJ
7197 doc: /* Encode STRING to CODING-SYSTEM, and return the result.
7198Optional arg NOCOPY non-nil means it is OK to return STRING itself
7199if the encoding operation is trivial.
7200This function sets `last-coding-system-used' to the precise coding system
7201used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7202not fully specified.) */)
7203 (string, coding_system, nocopy)
e0e989f6 7204 Lisp_Object string, coding_system, nocopy;
4ed46869 7205{
f072a3e8 7206 return code_convert_string1 (string, coding_system, nocopy, 1);
4ed46869 7207}
4031e2bf 7208
ecec61c1 7209/* Encode or decode STRING according to CODING_SYSTEM.
ec6d2bb8
KH
7210 Do not set Vlast_coding_system_used.
7211
7212 This function is called only from macros DECODE_FILE and
7213 ENCODE_FILE, thus we ignore character composition. */
ecec61c1
KH
7214
7215Lisp_Object
7216code_convert_string_norecord (string, coding_system, encodep)
7217 Lisp_Object string, coding_system;
7218 int encodep;
7219{
7220 struct coding_system coding;
7221
b7826503
PJ
7222 CHECK_STRING (string);
7223 CHECK_SYMBOL (coding_system);
ecec61c1
KH
7224
7225 if (NILP (coding_system))
7226 return string;
7227
7228 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
d5db4077 7229 error ("Invalid coding system: %s", SDATA (SYMBOL_NAME (coding_system)));
ecec61c1 7230
ec6d2bb8 7231 coding.composing = COMPOSITION_DISABLED;
ecec61c1 7232 coding.mode |= CODING_MODE_LAST_BLOCK;
b73bfc1c
KH
7233 return (encodep
7234 ? encode_coding_string (string, &coding, 1)
7235 : decode_coding_string (string, &coding, 1));
ecec61c1 7236}
3a73fa5d 7237\f
4ed46869 7238DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
48b0f3ae
PJ
7239 doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
7240Return the corresponding character. */)
7241 (code)
4ed46869
KH
7242 Lisp_Object code;
7243{
7244 unsigned char c1, c2, s1, s2;
7245 Lisp_Object val;
7246
b7826503 7247 CHECK_NUMBER (code);
4ed46869 7248 s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
55ab7be3
KH
7249 if (s1 == 0)
7250 {
c28a9453
KH
7251 if (s2 < 0x80)
7252 XSETFASTINT (val, s2);
7253 else if (s2 >= 0xA0 || s2 <= 0xDF)
b73bfc1c 7254 XSETFASTINT (val, MAKE_CHAR (charset_katakana_jisx0201, s2, 0));
c28a9453 7255 else
9da8350f 7256 error ("Invalid Shift JIS code: %x", XFASTINT (code));
55ab7be3
KH
7257 }
7258 else
7259 {
87323294 7260 if ((s1 < 0x80 || (s1 > 0x9F && s1 < 0xE0) || s1 > 0xEF)
55ab7be3 7261 || (s2 < 0x40 || s2 == 0x7F || s2 > 0xFC))
9da8350f 7262 error ("Invalid Shift JIS code: %x", XFASTINT (code));
55ab7be3 7263 DECODE_SJIS (s1, s2, c1, c2);
b73bfc1c 7264 XSETFASTINT (val, MAKE_CHAR (charset_jisx0208, c1, c2));
55ab7be3 7265 }
4ed46869
KH
7266 return val;
7267}
7268
7269DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
48b0f3ae
PJ
7270 doc: /* Encode a Japanese character CHAR to shift_jis encoding.
7271Return the corresponding code in SJIS. */)
7272 (ch)
4ed46869
KH
7273 Lisp_Object ch;
7274{
bcf26d6a 7275 int charset, c1, c2, s1, s2;
4ed46869
KH
7276 Lisp_Object val;
7277
b7826503 7278 CHECK_NUMBER (ch);
4ed46869 7279 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
c28a9453
KH
7280 if (charset == CHARSET_ASCII)
7281 {
7282 val = ch;
7283 }
7284 else if (charset == charset_jisx0208
7285 && c1 > 0x20 && c1 < 0x7F && c2 > 0x20 && c2 < 0x7F)
4ed46869
KH
7286 {
7287 ENCODE_SJIS (c1, c2, s1, s2);
bcf26d6a 7288 XSETFASTINT (val, (s1 << 8) | s2);
4ed46869 7289 }
55ab7be3
KH
7290 else if (charset == charset_katakana_jisx0201
7291 && c1 > 0x20 && c2 < 0xE0)
7292 {
7293 XSETFASTINT (val, c1 | 0x80);
7294 }
4ed46869 7295 else
55ab7be3 7296 error ("Can't encode to shift_jis: %d", XFASTINT (ch));
4ed46869
KH
7297 return val;
7298}
7299
7300DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
48b0f3ae
PJ
7301 doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
7302Return the corresponding character. */)
7303 (code)
4ed46869
KH
7304 Lisp_Object code;
7305{
7306 int charset;
7307 unsigned char b1, b2, c1, c2;
7308 Lisp_Object val;
7309
b7826503 7310 CHECK_NUMBER (code);
4ed46869 7311 b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF;
c28a9453
KH
7312 if (b1 == 0)
7313 {
7314 if (b2 >= 0x80)
9da8350f 7315 error ("Invalid BIG5 code: %x", XFASTINT (code));
c28a9453
KH
7316 val = code;
7317 }
7318 else
7319 {
7320 if ((b1 < 0xA1 || b1 > 0xFE)
7321 || (b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE))
9da8350f 7322 error ("Invalid BIG5 code: %x", XFASTINT (code));
c28a9453 7323 DECODE_BIG5 (b1, b2, charset, c1, c2);
b73bfc1c 7324 XSETFASTINT (val, MAKE_CHAR (charset, c1, c2));
c28a9453 7325 }
4ed46869
KH
7326 return val;
7327}
7328
7329DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
48b0f3ae
PJ
7330 doc: /* Encode the Big5 character CHAR to BIG5 coding system.
7331Return the corresponding character code in Big5. */)
7332 (ch)
4ed46869
KH
7333 Lisp_Object ch;
7334{
bcf26d6a 7335 int charset, c1, c2, b1, b2;
4ed46869
KH
7336 Lisp_Object val;
7337
b7826503 7338 CHECK_NUMBER (ch);
4ed46869 7339 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
c28a9453
KH
7340 if (charset == CHARSET_ASCII)
7341 {
7342 val = ch;
7343 }
7344 else if ((charset == charset_big5_1
7345 && (XFASTINT (ch) >= 0x250a1 && XFASTINT (ch) <= 0x271ec))
7346 || (charset == charset_big5_2
7347 && XFASTINT (ch) >= 0x290a1 && XFASTINT (ch) <= 0x2bdb2))
4ed46869
KH
7348 {
7349 ENCODE_BIG5 (charset, c1, c2, b1, b2);
bcf26d6a 7350 XSETFASTINT (val, (b1 << 8) | b2);
4ed46869
KH
7351 }
7352 else
c28a9453 7353 error ("Can't encode to Big5: %d", XFASTINT (ch));
4ed46869
KH
7354 return val;
7355}
3a73fa5d 7356\f
002fdb44 7357DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
48b0f3ae
PJ
7358 Sset_terminal_coding_system_internal, 1, 1, 0,
7359 doc: /* Internal use only. */)
7360 (coding_system)
4ed46869
KH
7361 Lisp_Object coding_system;
7362{
b7826503 7363 CHECK_SYMBOL (coding_system);
4ed46869 7364 setup_coding_system (Fcheck_coding_system (coding_system), &terminal_coding);
70c22245 7365 /* We had better not send unsafe characters to terminal. */
0eecad43 7366 terminal_coding.mode |= CODING_MODE_INHIBIT_UNENCODABLE_CHAR;
8ca3766a 7367 /* Character composition should be disabled. */
ec6d2bb8 7368 terminal_coding.composing = COMPOSITION_DISABLED;
bd64290d
KH
7369 /* Error notification should be suppressed. */
7370 terminal_coding.suppress_error = 1;
b73bfc1c
KH
7371 terminal_coding.src_multibyte = 1;
7372 terminal_coding.dst_multibyte = 0;
4ed46869
KH
7373 return Qnil;
7374}
7375
002fdb44 7376DEFUN ("set-safe-terminal-coding-system-internal", Fset_safe_terminal_coding_system_internal,
48b0f3ae 7377 Sset_safe_terminal_coding_system_internal, 1, 1, 0,
ddb67bdc 7378 doc: /* Internal use only. */)
48b0f3ae 7379 (coding_system)
c4825358
KH
7380 Lisp_Object coding_system;
7381{
b7826503 7382 CHECK_SYMBOL (coding_system);
c4825358
KH
7383 setup_coding_system (Fcheck_coding_system (coding_system),
7384 &safe_terminal_coding);
8ca3766a 7385 /* Character composition should be disabled. */
ec6d2bb8 7386 safe_terminal_coding.composing = COMPOSITION_DISABLED;
bd64290d 7387 /* Error notification should be suppressed. */
6239a668 7388 safe_terminal_coding.suppress_error = 1;
b73bfc1c
KH
7389 safe_terminal_coding.src_multibyte = 1;
7390 safe_terminal_coding.dst_multibyte = 0;
c4825358
KH
7391 return Qnil;
7392}
7393
002fdb44
DL
7394DEFUN ("terminal-coding-system", Fterminal_coding_system,
7395 Sterminal_coding_system, 0, 0, 0,
48b0f3ae
PJ
7396 doc: /* Return coding system specified for terminal output. */)
7397 ()
4ed46869
KH
7398{
7399 return terminal_coding.symbol;
7400}
7401
002fdb44 7402DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
48b0f3ae
PJ
7403 Sset_keyboard_coding_system_internal, 1, 1, 0,
7404 doc: /* Internal use only. */)
7405 (coding_system)
4ed46869
KH
7406 Lisp_Object coding_system;
7407{
b7826503 7408 CHECK_SYMBOL (coding_system);
4ed46869 7409 setup_coding_system (Fcheck_coding_system (coding_system), &keyboard_coding);
8ca3766a 7410 /* Character composition should be disabled. */
ec6d2bb8 7411 keyboard_coding.composing = COMPOSITION_DISABLED;
4ed46869
KH
7412 return Qnil;
7413}
7414
002fdb44
DL
7415DEFUN ("keyboard-coding-system", Fkeyboard_coding_system,
7416 Skeyboard_coding_system, 0, 0, 0,
48b0f3ae
PJ
7417 doc: /* Return coding system specified for decoding keyboard input. */)
7418 ()
4ed46869
KH
7419{
7420 return keyboard_coding.symbol;
7421}
7422
7423\f
a5d301df
KH
7424DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
7425 Sfind_operation_coding_system, 1, MANY, 0,
48b0f3ae
PJ
7426 doc: /* Choose a coding system for an operation based on the target name.
7427The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
7428DECODING-SYSTEM is the coding system to use for decoding
7429\(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
7430for encoding (in case OPERATION does encoding).
7431
7432The first argument OPERATION specifies an I/O primitive:
7433 For file I/O, `insert-file-contents' or `write-region'.
7434 For process I/O, `call-process', `call-process-region', or `start-process'.
7435 For network I/O, `open-network-stream'.
7436
7437The remaining arguments should be the same arguments that were passed
7438to the primitive. Depending on which primitive, one of those arguments
7439is selected as the TARGET. For example, if OPERATION does file I/O,
7440whichever argument specifies the file name is TARGET.
7441
7442TARGET has a meaning which depends on OPERATION:
7443 For file I/O, TARGET is a file name.
7444 For process I/O, TARGET is a process name.
7445 For network I/O, TARGET is a service name or a port number
7446
7447This function looks up what specified for TARGET in,
7448`file-coding-system-alist', `process-coding-system-alist',
7449or `network-coding-system-alist' depending on OPERATION.
7450They may specify a coding system, a cons of coding systems,
7451or a function symbol to call.
7452In the last case, we call the function with one argument,
7453which is a list of all the arguments given to this function.
7454
7455usage: (find-operation-coding-system OPERATION ARGUMENTS ...) */)
7456 (nargs, args)
4ed46869
KH
7457 int nargs;
7458 Lisp_Object *args;
7459{
7460 Lisp_Object operation, target_idx, target, val;
7461 register Lisp_Object chain;
7462
7463 if (nargs < 2)
7464 error ("Too few arguments");
7465 operation = args[0];
7466 if (!SYMBOLP (operation)
7467 || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
8ca3766a 7468 error ("Invalid first argument");
4ed46869
KH
7469 if (nargs < 1 + XINT (target_idx))
7470 error ("Too few arguments for operation: %s",
d5db4077 7471 SDATA (SYMBOL_NAME (operation)));
7f787cfd
KH
7472 /* For write-region, if the 6th argument (i.e. VISIT, the 5th
7473 argument to write-region) is string, it must be treated as a
7474 target file name. */
7475 if (EQ (operation, Qwrite_region)
7476 && nargs > 5
7477 && STRINGP (args[5]))
d90ed3b4 7478 target_idx = make_number (4);
4ed46869
KH
7479 target = args[XINT (target_idx) + 1];
7480 if (!(STRINGP (target)
7481 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
8ca3766a 7482 error ("Invalid argument %d", XINT (target_idx) + 1);
4ed46869 7483
2e34157c
RS
7484 chain = ((EQ (operation, Qinsert_file_contents)
7485 || EQ (operation, Qwrite_region))
02ba4723 7486 ? Vfile_coding_system_alist
2e34157c 7487 : (EQ (operation, Qopen_network_stream)
02ba4723
KH
7488 ? Vnetwork_coding_system_alist
7489 : Vprocess_coding_system_alist));
4ed46869
KH
7490 if (NILP (chain))
7491 return Qnil;
7492
03699b14 7493 for (; CONSP (chain); chain = XCDR (chain))
4ed46869 7494 {
f44d27ce 7495 Lisp_Object elt;
03699b14 7496 elt = XCAR (chain);
4ed46869
KH
7497
7498 if (CONSP (elt)
7499 && ((STRINGP (target)
03699b14
KR
7500 && STRINGP (XCAR (elt))
7501 && fast_string_match (XCAR (elt), target) >= 0)
7502 || (INTEGERP (target) && EQ (target, XCAR (elt)))))
02ba4723 7503 {
03699b14 7504 val = XCDR (elt);
b19fd4c5
KH
7505 /* Here, if VAL is both a valid coding system and a valid
7506 function symbol, we return VAL as a coding system. */
02ba4723
KH
7507 if (CONSP (val))
7508 return val;
7509 if (! SYMBOLP (val))
7510 return Qnil;
7511 if (! NILP (Fcoding_system_p (val)))
7512 return Fcons (val, val);
b19fd4c5
KH
7513 if (! NILP (Ffboundp (val)))
7514 {
7515 val = call1 (val, Flist (nargs, args));
7516 if (CONSP (val))
7517 return val;
7518 if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
7519 return Fcons (val, val);
7520 }
02ba4723
KH
7521 return Qnil;
7522 }
4ed46869
KH
7523 }
7524 return Qnil;
7525}
7526
1397dc18
KH
7527DEFUN ("update-coding-systems-internal", Fupdate_coding_systems_internal,
7528 Supdate_coding_systems_internal, 0, 0, 0,
48b0f3ae
PJ
7529 doc: /* Update internal database for ISO2022 and CCL based coding systems.
7530When values of any coding categories are changed, you must
7531call this function. */)
7532 ()
d46c5b12
KH
7533{
7534 int i;
7535
fa42c37f 7536 for (i = CODING_CATEGORY_IDX_EMACS_MULE; i < CODING_CATEGORY_IDX_MAX; i++)
d46c5b12 7537 {
1397dc18
KH
7538 Lisp_Object val;
7539
f5c1dd0d 7540 val = SYMBOL_VALUE (XVECTOR (Vcoding_category_table)->contents[i]);
1397dc18
KH
7541 if (!NILP (val))
7542 {
7543 if (! coding_system_table[i])
7544 coding_system_table[i] = ((struct coding_system *)
7545 xmalloc (sizeof (struct coding_system)));
7546 setup_coding_system (val, coding_system_table[i]);
7547 }
7548 else if (coding_system_table[i])
7549 {
7550 xfree (coding_system_table[i]);
7551 coding_system_table[i] = NULL;
7552 }
d46c5b12 7553 }
1397dc18 7554
d46c5b12
KH
7555 return Qnil;
7556}
7557
66cfb530
KH
7558DEFUN ("set-coding-priority-internal", Fset_coding_priority_internal,
7559 Sset_coding_priority_internal, 0, 0, 0,
48b0f3ae
PJ
7560 doc: /* Update internal database for the current value of `coding-category-list'.
7561This function is internal use only. */)
7562 ()
66cfb530
KH
7563{
7564 int i = 0, idx;
84d60297
RS
7565 Lisp_Object val;
7566
7567 val = Vcoding_category_list;
66cfb530
KH
7568
7569 while (CONSP (val) && i < CODING_CATEGORY_IDX_MAX)
7570 {
03699b14 7571 if (! SYMBOLP (XCAR (val)))
66cfb530 7572 break;
03699b14 7573 idx = XFASTINT (Fget (XCAR (val), Qcoding_category_index));
66cfb530
KH
7574 if (idx >= CODING_CATEGORY_IDX_MAX)
7575 break;
7576 coding_priorities[i++] = (1 << idx);
03699b14 7577 val = XCDR (val);
66cfb530
KH
7578 }
7579 /* If coding-category-list is valid and contains all coding
7580 categories, `i' should be CODING_CATEGORY_IDX_MAX now. If not,
fa42c37f 7581 the following code saves Emacs from crashing. */
66cfb530
KH
7582 while (i < CODING_CATEGORY_IDX_MAX)
7583 coding_priorities[i++] = CODING_CATEGORY_MASK_RAW_TEXT;
7584
7585 return Qnil;
7586}
7587
6b89e3aa
KH
7588DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
7589 Sdefine_coding_system_internal, 1, 1, 0,
7590 doc: /* Register CODING-SYSTEM as a base coding system.
7591This function is internal use only. */)
7592 (coding_system)
7593 Lisp_Object coding_system;
7594{
7595 Lisp_Object safe_chars, slot;
7596
7597 if (NILP (Fcheck_coding_system (coding_system)))
7598 Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
7599 safe_chars = coding_safe_chars (coding_system);
7600 if (! EQ (safe_chars, Qt) && ! CHAR_TABLE_P (safe_chars))
7601 error ("No valid safe-chars property for %s",
7602 SDATA (SYMBOL_NAME (coding_system)));
7603 if (EQ (safe_chars, Qt))
7604 {
7605 if (NILP (Fmemq (coding_system, XCAR (Vcoding_system_safe_chars))))
7606 XSETCAR (Vcoding_system_safe_chars,
7607 Fcons (coding_system, XCAR (Vcoding_system_safe_chars)));
7608 }
7609 else
7610 {
7611 slot = Fassq (coding_system, XCDR (Vcoding_system_safe_chars));
7612 if (NILP (slot))
7613 XSETCDR (Vcoding_system_safe_chars,
7614 nconc2 (XCDR (Vcoding_system_safe_chars),
7615 Fcons (Fcons (coding_system, safe_chars), Qnil)));
7616 else
7617 XSETCDR (slot, safe_chars);
7618 }
7619 return Qnil;
7620}
7621
4ed46869
KH
7622#endif /* emacs */
7623
7624\f
1397dc18 7625/*** 9. Post-amble ***/
4ed46869 7626
dfcf069d 7627void
4ed46869
KH
7628init_coding_once ()
7629{
7630 int i;
7631
93dec019 7632 /* Emacs' internal format specific initialize routine. */
4ed46869
KH
7633 for (i = 0; i <= 0x20; i++)
7634 emacs_code_class[i] = EMACS_control_code;
7635 emacs_code_class[0x0A] = EMACS_linefeed_code;
7636 emacs_code_class[0x0D] = EMACS_carriage_return_code;
7637 for (i = 0x21 ; i < 0x7F; i++)
7638 emacs_code_class[i] = EMACS_ascii_code;
7639 emacs_code_class[0x7F] = EMACS_control_code;
ec6d2bb8 7640 for (i = 0x80; i < 0xFF; i++)
4ed46869
KH
7641 emacs_code_class[i] = EMACS_invalid_code;
7642 emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
7643 emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
7644 emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
7645 emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;
7646
7647 /* ISO2022 specific initialize routine. */
7648 for (i = 0; i < 0x20; i++)
b73bfc1c 7649 iso_code_class[i] = ISO_control_0;
4ed46869
KH
7650 for (i = 0x21; i < 0x7F; i++)
7651 iso_code_class[i] = ISO_graphic_plane_0;
7652 for (i = 0x80; i < 0xA0; i++)
b73bfc1c 7653 iso_code_class[i] = ISO_control_1;
4ed46869
KH
7654 for (i = 0xA1; i < 0xFF; i++)
7655 iso_code_class[i] = ISO_graphic_plane_1;
7656 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
7657 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
7658 iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
7659 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
7660 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
7661 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
7662 iso_code_class[ISO_CODE_ESC] = ISO_escape;
7663 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
7664 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
7665 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
7666
e0e989f6
KH
7667 setup_coding_system (Qnil, &keyboard_coding);
7668 setup_coding_system (Qnil, &terminal_coding);
c4825358 7669 setup_coding_system (Qnil, &safe_terminal_coding);
6bc51348 7670 setup_coding_system (Qnil, &default_buffer_file_coding);
9ce27fde 7671
d46c5b12
KH
7672 bzero (coding_system_table, sizeof coding_system_table);
7673
66cfb530
KH
7674 bzero (ascii_skip_code, sizeof ascii_skip_code);
7675 for (i = 0; i < 128; i++)
7676 ascii_skip_code[i] = 1;
7677
9ce27fde
KH
7678#if defined (MSDOS) || defined (WINDOWSNT)
7679 system_eol_type = CODING_EOL_CRLF;
7680#else
7681 system_eol_type = CODING_EOL_LF;
7682#endif
b843d1ae
KH
7683
7684 inhibit_pre_post_conversion = 0;
e0e989f6
KH
7685}
7686
7687#ifdef emacs
7688
dfcf069d 7689void
e0e989f6
KH
7690syms_of_coding ()
7691{
2a47931b
KH
7692 staticpro (&Vcode_conversion_workbuf_name);
7693 Vcode_conversion_workbuf_name = build_string (" *code-conversion-work*");
7694
e0e989f6
KH
7695 Qtarget_idx = intern ("target-idx");
7696 staticpro (&Qtarget_idx);
7697
bb0115a2
RS
7698 Qcoding_system_history = intern ("coding-system-history");
7699 staticpro (&Qcoding_system_history);
7700 Fset (Qcoding_system_history, Qnil);
7701
9ce27fde 7702 /* Target FILENAME is the first argument. */
e0e989f6 7703 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
9ce27fde 7704 /* Target FILENAME is the third argument. */
e0e989f6
KH
7705 Fput (Qwrite_region, Qtarget_idx, make_number (2));
7706
7707 Qcall_process = intern ("call-process");
7708 staticpro (&Qcall_process);
9ce27fde 7709 /* Target PROGRAM is the first argument. */
e0e989f6
KH
7710 Fput (Qcall_process, Qtarget_idx, make_number (0));
7711
7712 Qcall_process_region = intern ("call-process-region");
7713 staticpro (&Qcall_process_region);
9ce27fde 7714 /* Target PROGRAM is the third argument. */
e0e989f6
KH
7715 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
7716
7717 Qstart_process = intern ("start-process");
7718 staticpro (&Qstart_process);
9ce27fde 7719 /* Target PROGRAM is the third argument. */
e0e989f6
KH
7720 Fput (Qstart_process, Qtarget_idx, make_number (2));
7721
7722 Qopen_network_stream = intern ("open-network-stream");
7723 staticpro (&Qopen_network_stream);
9ce27fde 7724 /* Target SERVICE is the fourth argument. */
e0e989f6
KH
7725 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
7726
4ed46869
KH
7727 Qcoding_system = intern ("coding-system");
7728 staticpro (&Qcoding_system);
7729
7730 Qeol_type = intern ("eol-type");
7731 staticpro (&Qeol_type);
7732
7733 Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
7734 staticpro (&Qbuffer_file_coding_system);
7735
7736 Qpost_read_conversion = intern ("post-read-conversion");
7737 staticpro (&Qpost_read_conversion);
7738
7739 Qpre_write_conversion = intern ("pre-write-conversion");
7740 staticpro (&Qpre_write_conversion);
7741
27901516
KH
7742 Qno_conversion = intern ("no-conversion");
7743 staticpro (&Qno_conversion);
7744
7745 Qundecided = intern ("undecided");
7746 staticpro (&Qundecided);
7747
4ed46869
KH
7748 Qcoding_system_p = intern ("coding-system-p");
7749 staticpro (&Qcoding_system_p);
7750
7751 Qcoding_system_error = intern ("coding-system-error");
7752 staticpro (&Qcoding_system_error);
7753
7754 Fput (Qcoding_system_error, Qerror_conditions,
7755 Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
7756 Fput (Qcoding_system_error, Qerror_message,
9ce27fde 7757 build_string ("Invalid coding system"));
4ed46869 7758
d46c5b12
KH
7759 Qcoding_category = intern ("coding-category");
7760 staticpro (&Qcoding_category);
4ed46869
KH
7761 Qcoding_category_index = intern ("coding-category-index");
7762 staticpro (&Qcoding_category_index);
7763
d46c5b12
KH
7764 Vcoding_category_table
7765 = Fmake_vector (make_number (CODING_CATEGORY_IDX_MAX), Qnil);
7766 staticpro (&Vcoding_category_table);
4ed46869
KH
7767 {
7768 int i;
7769 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
7770 {
d46c5b12
KH
7771 XVECTOR (Vcoding_category_table)->contents[i]
7772 = intern (coding_category_name[i]);
7773 Fput (XVECTOR (Vcoding_category_table)->contents[i],
7774 Qcoding_category_index, make_number (i));
4ed46869
KH
7775 }
7776 }
7777
6b89e3aa
KH
7778 Vcoding_system_safe_chars = Fcons (Qnil, Qnil);
7779 staticpro (&Vcoding_system_safe_chars);
7780
f967223b
KH
7781 Qtranslation_table = intern ("translation-table");
7782 staticpro (&Qtranslation_table);
b666620c 7783 Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
bdd9fb48 7784
f967223b
KH
7785 Qtranslation_table_id = intern ("translation-table-id");
7786 staticpro (&Qtranslation_table_id);
84fbb8a0 7787
f967223b
KH
7788 Qtranslation_table_for_decode = intern ("translation-table-for-decode");
7789 staticpro (&Qtranslation_table_for_decode);
a5d301df 7790
f967223b
KH
7791 Qtranslation_table_for_encode = intern ("translation-table-for-encode");
7792 staticpro (&Qtranslation_table_for_encode);
a5d301df 7793
05e6f5dc
KH
7794 Qsafe_chars = intern ("safe-chars");
7795 staticpro (&Qsafe_chars);
7796
7797 Qchar_coding_system = intern ("char-coding-system");
7798 staticpro (&Qchar_coding_system);
7799
7800 /* Intern this now in case it isn't already done.
7801 Setting this variable twice is harmless.
7802 But don't staticpro it here--that is done in alloc.c. */
7803 Qchar_table_extra_slots = intern ("char-table-extra-slots");
7804 Fput (Qsafe_chars, Qchar_table_extra_slots, make_number (0));
067a6a66 7805 Fput (Qchar_coding_system, Qchar_table_extra_slots, make_number (0));
70c22245 7806
1397dc18
KH
7807 Qvalid_codes = intern ("valid-codes");
7808 staticpro (&Qvalid_codes);
7809
9ce27fde
KH
7810 Qemacs_mule = intern ("emacs-mule");
7811 staticpro (&Qemacs_mule);
7812
d46c5b12
KH
7813 Qraw_text = intern ("raw-text");
7814 staticpro (&Qraw_text);
7815
ecf488bc
DL
7816 Qutf_8 = intern ("utf-8");
7817 staticpro (&Qutf_8);
7818
a362520d
KH
7819 Qcoding_system_define_form = intern ("coding-system-define-form");
7820 staticpro (&Qcoding_system_define_form);
7821
4ed46869
KH
7822 defsubr (&Scoding_system_p);
7823 defsubr (&Sread_coding_system);
7824 defsubr (&Sread_non_nil_coding_system);
7825 defsubr (&Scheck_coding_system);
7826 defsubr (&Sdetect_coding_region);
d46c5b12 7827 defsubr (&Sdetect_coding_string);
05e6f5dc 7828 defsubr (&Sfind_coding_systems_region_internal);
068a9dbd 7829 defsubr (&Sunencodable_char_position);
4ed46869
KH
7830 defsubr (&Sdecode_coding_region);
7831 defsubr (&Sencode_coding_region);
7832 defsubr (&Sdecode_coding_string);
7833 defsubr (&Sencode_coding_string);
7834 defsubr (&Sdecode_sjis_char);
7835 defsubr (&Sencode_sjis_char);
7836 defsubr (&Sdecode_big5_char);
7837 defsubr (&Sencode_big5_char);
1ba9e4ab 7838 defsubr (&Sset_terminal_coding_system_internal);
c4825358 7839 defsubr (&Sset_safe_terminal_coding_system_internal);
4ed46869 7840 defsubr (&Sterminal_coding_system);
1ba9e4ab 7841 defsubr (&Sset_keyboard_coding_system_internal);
4ed46869 7842 defsubr (&Skeyboard_coding_system);
a5d301df 7843 defsubr (&Sfind_operation_coding_system);
1397dc18 7844 defsubr (&Supdate_coding_systems_internal);
66cfb530 7845 defsubr (&Sset_coding_priority_internal);
6b89e3aa 7846 defsubr (&Sdefine_coding_system_internal);
4ed46869 7847
4608c386 7848 DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
48b0f3ae
PJ
7849 doc: /* List of coding systems.
7850
7851Do not alter the value of this variable manually. This variable should be
7852updated by the functions `make-coding-system' and
7853`define-coding-system-alias'. */);
4608c386
KH
7854 Vcoding_system_list = Qnil;
7855
7856 DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
48b0f3ae
PJ
7857 doc: /* Alist of coding system names.
7858Each element is one element list of coding system name.
7859This variable is given to `completing-read' as TABLE argument.
7860
7861Do not alter the value of this variable manually. This variable should be
7862updated by the functions `make-coding-system' and
7863`define-coding-system-alias'. */);
4608c386
KH
7864 Vcoding_system_alist = Qnil;
7865
4ed46869 7866 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
48b0f3ae
PJ
7867 doc: /* List of coding-categories (symbols) ordered by priority.
7868
7869On detecting a coding system, Emacs tries code detection algorithms
7870associated with each coding-category one by one in this order. When
7871one algorithm agrees with a byte sequence of source text, the coding
0ec31faf
KH
7872system bound to the corresponding coding-category is selected.
7873
42205607 7874Don't modify this variable directly, but use `set-coding-priority'. */);
4ed46869
KH
7875 {
7876 int i;
7877
7878 Vcoding_category_list = Qnil;
7879 for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
7880 Vcoding_category_list
d46c5b12
KH
7881 = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
7882 Vcoding_category_list);
4ed46869
KH
7883 }
7884
7885 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
48b0f3ae
PJ
7886 doc: /* Specify the coding system for read operations.
7887It is useful to bind this variable with `let', but do not set it globally.
7888If the value is a coding system, it is used for decoding on read operation.
7889If not, an appropriate element is used from one of the coding system alists:
7890There are three such tables, `file-coding-system-alist',
7891`process-coding-system-alist', and `network-coding-system-alist'. */);
4ed46869
KH
7892 Vcoding_system_for_read = Qnil;
7893
7894 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
48b0f3ae
PJ
7895 doc: /* Specify the coding system for write operations.
7896Programs bind this variable with `let', but you should not set it globally.
7897If the value is a coding system, it is used for encoding of output,
7898when writing it to a file and when sending it to a file or subprocess.
7899
7900If this does not specify a coding system, an appropriate element
7901is used from one of the coding system alists:
7902There are three such tables, `file-coding-system-alist',
7903`process-coding-system-alist', and `network-coding-system-alist'.
7904For output to files, if the above procedure does not specify a coding system,
7905the value of `buffer-file-coding-system' is used. */);
4ed46869
KH
7906 Vcoding_system_for_write = Qnil;
7907
7908 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
7c695ab9
DL
7909 doc: /* Coding system used in the latest file or process I/O.
7910Also set by `encode-coding-region', `decode-coding-region',
7911`encode-coding-string' and `decode-coding-string'. */);
4ed46869
KH
7912 Vlast_coding_system_used = Qnil;
7913
9ce27fde 7914 DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
48b0f3ae
PJ
7915 doc: /* *Non-nil means always inhibit code conversion of end-of-line format.
7916See info node `Coding Systems' and info node `Text and Binary' concerning
7917such conversion. */);
9ce27fde
KH
7918 inhibit_eol_conversion = 0;
7919
ed29121d 7920 DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
48b0f3ae
PJ
7921 doc: /* Non-nil means process buffer inherits coding system of process output.
7922Bind it to t if the process output is to be treated as if it were a file
7923read from some filesystem. */);
ed29121d
EZ
7924 inherit_process_coding_system = 0;
7925
02ba4723 7926 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
48b0f3ae
PJ
7927 doc: /* Alist to decide a coding system to use for a file I/O operation.
7928The format is ((PATTERN . VAL) ...),
7929where PATTERN is a regular expression matching a file name,
7930VAL is a coding system, a cons of coding systems, or a function symbol.
7931If VAL is a coding system, it is used for both decoding and encoding
7932the file contents.
7933If VAL is a cons of coding systems, the car part is used for decoding,
7934and the cdr part is used for encoding.
7935If VAL is a function symbol, the function must return a coding system
0192762c 7936or a cons of coding systems which are used as above. The function gets
ff955d90 7937the arguments with which `find-operation-coding-system' was called.
48b0f3ae
PJ
7938
7939See also the function `find-operation-coding-system'
7940and the variable `auto-coding-alist'. */);
02ba4723
KH
7941 Vfile_coding_system_alist = Qnil;
7942
7943 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
48b0f3ae
PJ
7944 doc: /* Alist to decide a coding system to use for a process I/O operation.
7945The format is ((PATTERN . VAL) ...),
7946where PATTERN is a regular expression matching a program name,
7947VAL is a coding system, a cons of coding systems, or a function symbol.
7948If VAL is a coding system, it is used for both decoding what received
7949from the program and encoding what sent to the program.
7950If VAL is a cons of coding systems, the car part is used for decoding,
7951and the cdr part is used for encoding.
7952If VAL is a function symbol, the function must return a coding system
7953or a cons of coding systems which are used as above.
7954
7955See also the function `find-operation-coding-system'. */);
02ba4723
KH
7956 Vprocess_coding_system_alist = Qnil;
7957
7958 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
48b0f3ae
PJ
7959 doc: /* Alist to decide a coding system to use for a network I/O operation.
7960The format is ((PATTERN . VAL) ...),
7961where PATTERN is a regular expression matching a network service name
7962or is a port number to connect to,
7963VAL is a coding system, a cons of coding systems, or a function symbol.
7964If VAL is a coding system, it is used for both decoding what received
7965from the network stream and encoding what sent to the network stream.
7966If VAL is a cons of coding systems, the car part is used for decoding,
7967and the cdr part is used for encoding.
7968If VAL is a function symbol, the function must return a coding system
7969or a cons of coding systems which are used as above.
7970
7971See also the function `find-operation-coding-system'. */);
02ba4723 7972 Vnetwork_coding_system_alist = Qnil;
4ed46869 7973
68c45bf0 7974 DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
75205970
RS
7975 doc: /* Coding system to use with system messages.
7976Also used for decoding keyboard input on X Window system. */);
68c45bf0
PE
7977 Vlocale_coding_system = Qnil;
7978
005f0d35 7979 /* The eol mnemonics are reset in startup.el system-dependently. */
7722baf9 7980 DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
48b0f3ae 7981 doc: /* *String displayed in mode line for UNIX-like (LF) end-of-line format. */);
7722baf9 7982 eol_mnemonic_unix = build_string (":");
4ed46869 7983
7722baf9 7984 DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
48b0f3ae 7985 doc: /* *String displayed in mode line for DOS-like (CRLF) end-of-line format. */);
7722baf9 7986 eol_mnemonic_dos = build_string ("\\");
4ed46869 7987
7722baf9 7988 DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
48b0f3ae 7989 doc: /* *String displayed in mode line for MAC-like (CR) end-of-line format. */);
7722baf9 7990 eol_mnemonic_mac = build_string ("/");
4ed46869 7991
7722baf9 7992 DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
48b0f3ae 7993 doc: /* *String displayed in mode line when end-of-line format is not yet determined. */);
7722baf9 7994 eol_mnemonic_undecided = build_string (":");
4ed46869 7995
84fbb8a0 7996 DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
48b0f3ae 7997 doc: /* *Non-nil enables character translation while encoding and decoding. */);
84fbb8a0 7998 Venable_character_translation = Qt;
bdd9fb48 7999
f967223b 8000 DEFVAR_LISP ("standard-translation-table-for-decode",
48b0f3ae
PJ
8001 &Vstandard_translation_table_for_decode,
8002 doc: /* Table for translating characters while decoding. */);
f967223b 8003 Vstandard_translation_table_for_decode = Qnil;
bdd9fb48 8004
f967223b 8005 DEFVAR_LISP ("standard-translation-table-for-encode",
48b0f3ae
PJ
8006 &Vstandard_translation_table_for_encode,
8007 doc: /* Table for translating characters while encoding. */);
f967223b 8008 Vstandard_translation_table_for_encode = Qnil;
4ed46869
KH
8009
8010 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
48b0f3ae
PJ
8011 doc: /* Alist of charsets vs revision numbers.
8012While encoding, if a charset (car part of an element) is found,
8013designate it with the escape sequence identifying revision (cdr part of the element). */);
4ed46869 8014 Vcharset_revision_alist = Qnil;
02ba4723
KH
8015
8016 DEFVAR_LISP ("default-process-coding-system",
8017 &Vdefault_process_coding_system,
48b0f3ae
PJ
8018 doc: /* Cons of coding systems used for process I/O by default.
8019The car part is used for decoding a process output,
8020the cdr part is used for encoding a text to be sent to a process. */);
02ba4723 8021 Vdefault_process_coding_system = Qnil;
c4825358 8022
3f003981 8023 DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
48b0f3ae
PJ
8024 doc: /* Table of extra Latin codes in the range 128..159 (inclusive).
8025This is a vector of length 256.
8026If Nth element is non-nil, the existence of code N in a file
8027\(or output of subprocess) doesn't prevent it to be detected as
8028a coding system of ISO 2022 variant which has a flag
8029`accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
8030or reading output of a subprocess.
8031Only 128th through 159th elements have a meaning. */);
3f003981 8032 Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
d46c5b12
KH
8033
8034 DEFVAR_LISP ("select-safe-coding-system-function",
8035 &Vselect_safe_coding_system_function,
48b0f3ae
PJ
8036 doc: /* Function to call to select safe coding system for encoding a text.
8037
8038If set, this function is called to force a user to select a proper
8039coding system which can encode the text in the case that a default
8040coding system used in each operation can't encode the text.
8041
8042The default value is `select-safe-coding-system' (which see). */);
d46c5b12
KH
8043 Vselect_safe_coding_system_function = Qnil;
8044
5d5bf4d8
KH
8045 DEFVAR_BOOL ("coding-system-require-warning",
8046 &coding_system_require_warning,
8047 doc: /* Internal use only.
6b89e3aa
KH
8048If non-nil, on writing a file, `select-safe-coding-system-function' is
8049called even if `coding-system-for-write' is non-nil. The command
8050`universal-coding-system-argument' binds this variable to t temporarily. */);
5d5bf4d8
KH
8051 coding_system_require_warning = 0;
8052
8053
22ab2303 8054 DEFVAR_BOOL ("inhibit-iso-escape-detection",
74383408 8055 &inhibit_iso_escape_detection,
48b0f3ae
PJ
8056 doc: /* If non-nil, Emacs ignores ISO2022's escape sequence on code detection.
8057
8058By default, on reading a file, Emacs tries to detect how the text is
8059encoded. This code detection is sensitive to escape sequences. If
8060the sequence is valid as ISO2022, the code is determined as one of
8061the ISO2022 encodings, and the file is decoded by the corresponding
8062coding system (e.g. `iso-2022-7bit').
8063
8064However, there may be a case that you want to read escape sequences in
8065a file as is. In such a case, you can set this variable to non-nil.
8066Then, as the code detection ignores any escape sequences, no file is
8067detected as encoded in some ISO2022 encoding. The result is that all
8068escape sequences become visible in a buffer.
8069
8070The default value is nil, and it is strongly recommended not to change
8071it. That is because many Emacs Lisp source files that contain
8072non-ASCII characters are encoded by the coding system `iso-2022-7bit'
8073in Emacs's distribution, and they won't be decoded correctly on
8074reading if you suppress escape sequence detection.
8075
8076The other way to read escape sequences in a file without decoding is
8077to explicitly specify some coding system that doesn't use ISO2022's
8078escape sequence (e.g. `latin-1') on reading by \\[universal-coding-system-argument]. */);
74383408 8079 inhibit_iso_escape_detection = 0;
002fdb44
DL
8080
8081 DEFVAR_LISP ("translation-table-for-input", &Vtranslation_table_for_input,
15c8f9d1
DL
8082 doc: /* Char table for translating self-inserting characters.
8083This is applied to the result of input methods, not their input. See also
8084`keyboard-translate-table'. */);
002fdb44 8085 Vtranslation_table_for_input = Qnil;
4ed46869
KH
8086}
8087
68c45bf0
PE
8088char *
8089emacs_strerror (error_number)
8090 int error_number;
8091{
8092 char *str;
8093
ca9c0567 8094 synchronize_system_messages_locale ();
68c45bf0
PE
8095 str = strerror (error_number);
8096
8097 if (! NILP (Vlocale_coding_system))
8098 {
8099 Lisp_Object dec = code_convert_string_norecord (build_string (str),
8100 Vlocale_coding_system,
8101 0);
d5db4077 8102 str = (char *) SDATA (dec);
68c45bf0
PE
8103 }
8104
8105 return str;
8106}
8107
4ed46869 8108#endif /* emacs */
c2f94ebc 8109
ab5796a9
MB
8110/* arch-tag: 3a3a2b01-5ff6-4071-9afe-f5b808d9229d
8111 (do not change this comment) */