*** empty log message ***
[bpt/emacs.git] / src / coding.c
CommitLineData
4ed46869 1/* Coding system handler (conversion, detection, and etc).
4a2f9c6a 2 Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
203cb916 3 Licensed to the Free Software Foundation.
4ed46869 4
369314dc
KH
5This file is part of GNU Emacs.
6
7GNU Emacs is free software; you can redistribute it and/or modify
8it under the terms of the GNU General Public License as published by
9the Free Software Foundation; either version 2, or (at your option)
10any later version.
4ed46869 11
369314dc
KH
12GNU Emacs is distributed in the hope that it will be useful,
13but WITHOUT ANY WARRANTY; without even the implied warranty of
14MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15GNU General Public License for more details.
4ed46869 16
369314dc
KH
17You should have received a copy of the GNU General Public License
18along with GNU Emacs; see the file COPYING. If not, write to
19the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20Boston, MA 02111-1307, USA. */
4ed46869
KH
21
22/*** TABLE OF CONTENTS ***
23
b73bfc1c 24 0. General comments
4ed46869 25 1. Preamble
0ef69138 26 2. Emacs' internal format (emacs-mule) handlers
4ed46869
KH
27 3. ISO2022 handlers
28 4. Shift-JIS and BIG5 handlers
1397dc18
KH
29 5. CCL handlers
30 6. End-of-line handlers
31 7. C library functions
32 8. Emacs Lisp library functions
33 9. Post-amble
4ed46869
KH
34
35*/
36
b73bfc1c
KH
37/*** 0. General comments ***/
38
39
4ed46869
KH
40/*** GENERAL NOTE on CODING SYSTEM ***
41
42 Coding system is an encoding mechanism of one or more character
43 sets. Here's a list of coding systems which Emacs can handle. When
44 we say "decode", it means converting some other coding system to
0ef69138
KH
45 Emacs' internal format (emacs-mule), and when we say "encode",
46 it means converting the coding system emacs-mule to some other
47 coding system.
4ed46869 48
0ef69138 49 0. Emacs' internal format (emacs-mule)
4ed46869
KH
50
51 Emacs itself holds a multi-lingual character in a buffer and a string
f4dee582 52 in a special format. Details are described in section 2.
4ed46869
KH
53
54 1. ISO2022
55
56 The most famous coding system for multiple character sets. X's
f4dee582
RS
57 Compound Text, various EUCs (Extended Unix Code), and coding
58 systems used in Internet communication such as ISO-2022-JP are
59 all variants of ISO2022. Details are described in section 3.
4ed46869
KH
60
61 2. SJIS (or Shift-JIS or MS-Kanji-Code)
62
63 A coding system to encode character sets: ASCII, JISX0201, and
64 JISX0208. Widely used for PC's in Japan. Details are described in
f4dee582 65 section 4.
4ed46869
KH
66
67 3. BIG5
68
69 A coding system to encode character sets: ASCII and Big5. Widely
70 used by Chinese (mainly in Taiwan and Hong Kong). Details are
f4dee582
RS
71 described in section 4. In this file, when we write "BIG5"
72 (all uppercase), we mean the coding system, and when we write
73 "Big5" (capitalized), we mean the character set.
4ed46869 74
27901516
KH
75 4. Raw text
76
4608c386
KH
77 A coding system for a text containing random 8-bit code. Emacs does
78 no code conversion on such a text except for end-of-line format.
27901516
KH
79
80 5. Other
4ed46869 81
f4dee582 82 If a user wants to read/write a text encoded in a coding system not
4ed46869
KH
83 listed above, he can supply a decoder and an encoder for it in CCL
84 (Code Conversion Language) programs. Emacs executes the CCL program
85 while reading/writing.
86
d46c5b12
KH
87 Emacs represents a coding system by a Lisp symbol that has a property
88 `coding-system'. But, before actually using the coding system, the
4ed46869 89 information about it is set in a structure of type `struct
f4dee582 90 coding_system' for rapid processing. See section 6 for more details.
4ed46869
KH
91
92*/
93
94/*** GENERAL NOTES on END-OF-LINE FORMAT ***
95
96 How end-of-line of a text is encoded depends on a system. For
97 instance, Unix's format is just one byte of `line-feed' code,
f4dee582 98 whereas DOS's format is two-byte sequence of `carriage-return' and
d46c5b12
KH
99 `line-feed' codes. MacOS's format is usually one byte of
100 `carriage-return'.
4ed46869 101
f4dee582
RS
102 Since text characters encoding and end-of-line encoding are
103 independent, any coding system described above can take
4ed46869 104 any format of end-of-line. So, Emacs has information of format of
f4dee582 105 end-of-line in each coding-system. See section 6 for more details.
4ed46869
KH
106
107*/
108
109/*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
110
111 These functions check if a text between SRC and SRC_END is encoded
112 in the coding system category XXX. Each returns an integer value in
113 which appropriate flag bits for the category XXX are set. The flag
114 bits are defined in macros CODING_CATEGORY_MASK_XXX. Below is the
115 template of these functions. */
116#if 0
117int
0ef69138 118detect_coding_emacs_mule (src, src_end)
4ed46869
KH
119 unsigned char *src, *src_end;
120{
121 ...
122}
123#endif
124
125/*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
126
b73bfc1c
KH
127 These functions decode SRC_BYTES length of unibyte text at SOURCE
128 encoded in CODING to Emacs' internal format. The resulting
129 multibyte text goes to a place pointed to by DESTINATION, the length
130 of which should not exceed DST_BYTES.
d46c5b12 131
b73bfc1c
KH
132 These functions set the information of original and decoded texts in
133 the members produced, produced_char, consumed, and consumed_char of
134 the structure *CODING. They also set the member result to one of
135 CODING_FINISH_XXX indicating how the decoding finished.
d46c5b12
KH
136
137 DST_BYTES zero means that source area and destination area are
138 overlapped, which means that we can produce a decoded text until it
139 reaches at the head of not-yet-decoded source text.
140
141 Below is a template of these functions. */
4ed46869 142#if 0
b73bfc1c 143static void
d46c5b12 144decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
4ed46869
KH
145 struct coding_system *coding;
146 unsigned char *source, *destination;
147 int src_bytes, dst_bytes;
4ed46869
KH
148{
149 ...
150}
151#endif
152
153/*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
154
0ef69138 155 These functions encode SRC_BYTES length text at SOURCE of Emacs'
b73bfc1c
KH
156 internal multibyte format to CODING. The resulting unibyte text
157 goes to a place pointed to by DESTINATION, the length of which
158 should not exceed DST_BYTES.
d46c5b12 159
b73bfc1c
KH
160 These functions set the information of original and encoded texts in
161 the members produced, produced_char, consumed, and consumed_char of
162 the structure *CODING. They also set the member result to one of
163 CODING_FINISH_XXX indicating how the encoding finished.
d46c5b12
KH
164
165 DST_BYTES zero means that source area and destination area are
b73bfc1c
KH
166 overlapped, which means that we can produce an encoded text until it
167 reaches at the head of not-yet-encoded source text.
d46c5b12
KH
168
169 Below is a template of these functions. */
4ed46869 170#if 0
b73bfc1c 171static void
d46c5b12 172encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
4ed46869
KH
173 struct coding_system *coding;
174 unsigned char *source, *destination;
175 int src_bytes, dst_bytes;
4ed46869
KH
176{
177 ...
178}
179#endif
180
181/*** COMMONLY USED MACROS ***/
182
b73bfc1c
KH
183/* The following two macros ONE_MORE_BYTE and TWO_MORE_BYTES safely
184 get one and two bytes from the source text respectively.
185 If there are not enough bytes in the source, they jump to
186 `label_end_of_loop'. The caller should set variables `coding',
187 `src' and `src_end' to appropriate pointer in advance. These
188 macros are called from decoding routines `decode_coding_XXX', thus
189 it is assumed that the source text is unibyte. */
4ed46869 190
b73bfc1c
KH
#define ONE_MORE_BYTE(c1)                                       \
  do {                                                          \
    if (src < src_end)                                          \
      /* A byte is available: fetch it and advance.  */         \
      (c1) = *src++;                                            \
    else                                                        \
      {                                                         \
        /* Source exhausted: record why and leave the loop.  */ \
        coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
        goto label_end_of_loop;                                 \
      }                                                         \
  } while (0)
200
b73bfc1c
KH
#define TWO_MORE_BYTES(c1, c2)                                  \
  do {                                                          \
    if (src + 1 < src_end)                                      \
      {                                                         \
        /* Both bytes are available: fetch them in order.  */   \
        (c1) = *src++;                                          \
        (c2) = *src++;                                          \
      }                                                         \
    else                                                        \
      {                                                         \
        /* Fewer than two bytes remain; consume nothing.  */    \
        coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
        goto label_end_of_loop;                                 \
      }                                                         \
  } while (0)
211
4ed46869 212
b73bfc1c
KH
213/* Set C to the next character at the source text pointed by `src'.
214 If there are not enough characters in the source, jump to
215 `label_end_of_loop'. The caller should set variables `coding',
216 `src', `src_end', and `translation_table' to appropriate pointers
217 in advance. This macro is used in encoding routines
218 `encode_coding_XXX', thus it assumes that the source text is in
219 multibyte form except for 8-bit characters. 8-bit characters are
220 in multibyte form if coding->src_multibyte is nonzero, else they
221 are represented by a single byte. */
4ed46869 222
b73bfc1c
KH
#define ONE_MORE_CHAR(c)                                                \
  do {                                                                  \
    int len = src_end - src;                                            \
    int bytes;                                                          \
                                                                        \
    if (len <= 0)                                                       \
      {                                                                 \
        /* Nothing left in the source.  */                              \
        coding->result = CODING_FINISH_INSUFFICIENT_SRC;                \
        goto label_end_of_loop;                                         \
      }                                                                 \
                                                                        \
    if (coding->src_multibyte                                           \
        || UNIBYTE_STR_AS_MULTIBYTE_P (src, len, bytes))                \
      {                                                                 \
        /* Read one character in multibyte form; BYTES receives its    \
           length.  */                                                  \
        c = STRING_CHAR_AND_LENGTH (src, len, bytes);                   \
      }                                                                 \
    else                                                                \
      {                                                                 \
        /* Otherwise take the single raw byte as the character.  */     \
        c = *src;                                                       \
        bytes = 1;                                                      \
      }                                                                 \
                                                                        \
    if (!NILP (translation_table))                                      \
      c = translate_char (translation_table, c, 0, 0, 0);               \
                                                                        \
    src += bytes;                                                       \
  } while (0)
241
4ed46869 242
b73bfc1c
KH
243/* Produce a multibyte form of character C to `dst'. Jump to
244 `label_end_of_loop' if there's not enough space at `dst'.
245
246 If we are now in the middle of composition sequence, the decoded
247 character may be ALTCHAR (for the current composition). In that
248 case, the character goes to coding->cmp_data->data instead of
249 `dst'.
250
251 This macro is used in decoding routines. */
252
#define EMIT_CHAR(c)                                                    \
  do {                                                                  \
    /* Step 1: write the character to `dst' unless we are collecting   \
       components for a composition method other than RELATIVE or      \
       WITH_RULE.  */                                                   \
    if (! COMPOSING_P (coding)                                          \
        || coding->composing == COMPOSITION_RELATIVE                    \
        || coding->composing == COMPOSITION_WITH_RULE)                  \
      {                                                                 \
        int bytes = CHAR_BYTES (c);                                     \
                                                                        \
        /* With DST_BYTES zero the buffers overlap, so the write limit \
           is the unread source position rather than `dst_end'.  */    \
        if ((dst + bytes) > (dst_bytes ? dst_end : src))                \
          {                                                             \
            coding->result = CODING_FINISH_INSUFFICIENT_DST;            \
            goto label_end_of_loop;                                     \
          }                                                             \
        dst += CHAR_STRING (c, dst);                                    \
        coding->produced_char++;                                        \
      }                                                                 \
                                                                        \
    /* Step 2: inside any composition except a relative one, also      \
       record the character as a composition component.  */            \
    if (COMPOSING_P (coding)                                            \
        && coding->composing != COMPOSITION_RELATIVE)                   \
      {                                                                 \
        CODING_ADD_COMPOSITION_COMPONENT (coding, c);                   \
        coding->composition_rule_follows                                \
          = (coding->composing != COMPOSITION_WITH_ALTCHARS);           \
      }                                                                 \
  } while (0)
277
4ed46869 278
b73bfc1c
KH
/* Store the single byte C at `dst' and advance `dst'.  If no room is
   left (the limit is `src' when DST_BYTES is zero, else `dst_end'),
   set coding->result and jump to `label_end_of_loop'.  */
#define EMIT_ONE_BYTE(c)                                        \
  do {                                                          \
    if (dst < (dst_bytes ? dst_end : src))                      \
      *dst++ = c;                                               \
    else                                                        \
      {                                                         \
        coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
        goto label_end_of_loop;                                 \
      }                                                         \
  } while (0)
288
/* Store the bytes C1 and C2, in that order, at `dst' and advance
   `dst' by two.  If fewer than two bytes of room are left, store
   nothing, set coding->result and jump to `label_end_of_loop'.  */
#define EMIT_TWO_BYTES(c1, c2)                                  \
  do {                                                          \
    if (dst + 2 > (dst_bytes ? dst_end : src))                  \
      {                                                         \
        coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
        goto label_end_of_loop;                                 \
      }                                                         \
    *dst++ = c1;                                                \
    *dst++ = c2;                                                \
  } while (0)
298
/* Copy the bytes in the half-open range [FROM, TO) to `dst',
   advancing both `dst' and FROM (so FROM must be a modifiable
   pointer).  If the whole range does not fit, copy nothing, set
   coding->result and jump to `label_end_of_loop'.  */
#define EMIT_BYTES(from, to)                                    \
  do {                                                          \
    if (dst + (to - from) > (dst_bytes ? dst_end : src))        \
      {                                                         \
        coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
        goto label_end_of_loop;                                 \
      }                                                         \
    for (; from < to; from++)                                   \
      *dst++ = *from;                                           \
  } while (0)
309
310\f
311/*** 1. Preamble ***/
312
68c45bf0
PE
313#ifdef emacs
314#include <config.h>
315#endif
316
4ed46869
KH
317#include <stdio.h>
318
319#ifdef emacs
320
4ed46869
KH
321#include "lisp.h"
322#include "buffer.h"
323#include "charset.h"
ec6d2bb8 324#include "composite.h"
4ed46869
KH
325#include "ccl.h"
326#include "coding.h"
327#include "window.h"
328
329#else /* not emacs */
330
331#include "mulelib.h"
332
333#endif /* not emacs */
334
335Lisp_Object Qcoding_system, Qeol_type;
336Lisp_Object Qbuffer_file_coding_system;
337Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
27901516 338Lisp_Object Qno_conversion, Qundecided;
bb0115a2 339Lisp_Object Qcoding_system_history;
70c22245 340Lisp_Object Qsafe_charsets;
1397dc18 341Lisp_Object Qvalid_codes;
4ed46869
KH
342
343extern Lisp_Object Qinsert_file_contents, Qwrite_region;
344Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
345Lisp_Object Qstart_process, Qopen_network_stream;
346Lisp_Object Qtarget_idx;
347
d46c5b12
KH
348Lisp_Object Vselect_safe_coding_system_function;
349
7722baf9
EZ
350/* Mnemonic string for each format of end-of-line. */
351Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
352/* Mnemonic string to indicate format of end-of-line is not yet
4ed46869 353 decided. */
7722baf9 354Lisp_Object eol_mnemonic_undecided;
4ed46869 355
9ce27fde
KH
356/* Format of end-of-line decided by system. This is CODING_EOL_LF on
357 Unix, CODING_EOL_CRLF on DOS/Windows, and CODING_EOL_CR on Mac. */
358int system_eol_type;
359
4ed46869
KH
360#ifdef emacs
361
4608c386
KH
362Lisp_Object Vcoding_system_list, Vcoding_system_alist;
363
364Lisp_Object Qcoding_system_p, Qcoding_system_error;
4ed46869 365
d46c5b12
KH
366/* Coding system emacs-mule and raw-text are for converting only
367 end-of-line format. */
368Lisp_Object Qemacs_mule, Qraw_text;
9ce27fde 369
4ed46869
KH
370/* Coding-systems are handed between Emacs Lisp programs and C internal
371 routines by the following three variables. */
372/* Coding-system for reading files and receiving data from process. */
373Lisp_Object Vcoding_system_for_read;
374/* Coding-system for writing files and sending data to process. */
375Lisp_Object Vcoding_system_for_write;
376/* Coding-system actually used in the latest I/O. */
377Lisp_Object Vlast_coding_system_used;
378
c4825358 379/* A vector of length 256 which contains information about special
94487c4e 380 Latin codes (especially for dealing with Microsoft codes). */
3f003981 381Lisp_Object Vlatin_extra_code_table;
c4825358 382
9ce27fde
KH
383/* Flag to inhibit code conversion of end-of-line format. */
384int inhibit_eol_conversion;
385
ed29121d
EZ
386/* Flag to make buffer-file-coding-system inherit from process-coding. */
387int inherit_process_coding_system;
388
c4825358 389/* Coding system to be used to encode text for terminal display. */
4ed46869
KH
390struct coding_system terminal_coding;
391
c4825358
KH
392/* Coding system to be used to encode text for terminal display when
393 terminal coding system is nil. */
394struct coding_system safe_terminal_coding;
395
396/* Coding system of what is sent from terminal keyboard. */
4ed46869
KH
397struct coding_system keyboard_coding;
398
6bc51348
KH
399/* Default coding system to be used to write a file. */
400struct coding_system default_buffer_file_coding;
401
02ba4723
KH
402Lisp_Object Vfile_coding_system_alist;
403Lisp_Object Vprocess_coding_system_alist;
404Lisp_Object Vnetwork_coding_system_alist;
4ed46869 405
68c45bf0
PE
406Lisp_Object Vlocale_coding_system;
407
4ed46869
KH
408#endif /* emacs */
409
d46c5b12 410Lisp_Object Qcoding_category, Qcoding_category_index;
4ed46869
KH
411
412/* List of symbols `coding-category-xxx' ordered by priority. */
413Lisp_Object Vcoding_category_list;
414
d46c5b12
KH
415/* Table of coding categories (Lisp symbols). */
416Lisp_Object Vcoding_category_table;
4ed46869
KH
417
418/* Table of names of symbol for each coding-category. */
419char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
0ef69138 420 "coding-category-emacs-mule",
4ed46869
KH
421 "coding-category-sjis",
422 "coding-category-iso-7",
d46c5b12 423 "coding-category-iso-7-tight",
4ed46869
KH
424 "coding-category-iso-8-1",
425 "coding-category-iso-8-2",
7717c392
KH
426 "coding-category-iso-7-else",
427 "coding-category-iso-8-else",
89fa8b36 428 "coding-category-ccl",
4ed46869 429 "coding-category-big5",
fa42c37f
KH
430 "coding-category-utf-8",
431 "coding-category-utf-16-be",
432 "coding-category-utf-16-le",
27901516 433 "coding-category-raw-text",
89fa8b36 434 "coding-category-binary"
4ed46869
KH
435};
436
66cfb530 437/* Table of pointers to coding systems corresponding to each coding
d46c5b12
KH
438 categories. */
439struct coding_system *coding_system_table[CODING_CATEGORY_IDX_MAX];
440
66cfb530
KH
441/* Table of coding category masks. Nth element is a mask for a coding
442 category of which priority is Nth. */
443static
444int coding_priorities[CODING_CATEGORY_IDX_MAX];
445
f967223b
KH
446/* Flag to tell if we look up translation table on character code
447 conversion. */
84fbb8a0 448Lisp_Object Venable_character_translation;
f967223b
KH
449/* Standard translation table to look up on decoding (reading). */
450Lisp_Object Vstandard_translation_table_for_decode;
451/* Standard translation table to look up on encoding (writing). */
452Lisp_Object Vstandard_translation_table_for_encode;
84fbb8a0 453
f967223b
KH
454Lisp_Object Qtranslation_table;
455Lisp_Object Qtranslation_table_id;
456Lisp_Object Qtranslation_table_for_decode;
457Lisp_Object Qtranslation_table_for_encode;
4ed46869
KH
458
459/* Alist of charsets vs revision number. */
460Lisp_Object Vcharset_revision_alist;
461
02ba4723
KH
462/* Default coding systems used for process I/O. */
463Lisp_Object Vdefault_process_coding_system;
464
b843d1ae
KH
465/* Global flag to tell that we can't call post-read-conversion and
466 pre-write-conversion functions. Usually the value is zero, but it
467 is set to 1 temporarily while such functions are running. This is
468 to avoid infinite recursive call. */
469static int inhibit_pre_post_conversion;
470
4ed46869 471\f
0ef69138 472/*** 2. Emacs internal format (emacs-mule) handlers ***/
4ed46869
KH
473
474/* Emacs' internal format for encoding multiple character sets is a
f4dee582 475 kind of multi-byte encoding, i.e. characters are encoded by
b73bfc1c
KH
476 variable-length sequences of one-byte codes.
477
478 ASCII characters and control characters (e.g. `tab', `newline') are
479 represented by one-byte sequences which are their ASCII codes, in
480 the range 0x00 through 0x7F.
481
482 8-bit characters of the range 0x80..0x9F are represented by
483 two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
484 code + 0x20).
485
486 8-bit characters of the range 0xA0..0xFF are represented by
487 one-byte sequences which are their 8-bit code.
488
489 The other characters are represented by a sequence of `base
490 leading-code', optional `extended leading-code', and one or two
491 `position-code's. The length of the sequence is determined by the
492 base leading-code. Leading-code takes the range 0x80 through 0x9F,
493 whereas extended leading-code and position-code take the range 0xA0
494 through 0xFF. See `charset.h' for more details about leading-code
495 and position-code.
f4dee582 496
4ed46869 497 --- CODE RANGE of Emacs' internal format ---
b73bfc1c
KH
498 character set range
499 ------------- -----
500 ascii 0x00..0x7F
501 eight-bit-control LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
502 eight-bit-graphic 0xA0..0xFF
503 ELSE 0x81..0x9F + [0xA0..0xFF]+
4ed46869
KH
504 ---------------------------------------------
505
506 */
507
508enum emacs_code_class_type emacs_code_class[256];
509
4ed46869
KH
510/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
511 Check if a text is encoded in Emacs' internal format. If it is,
d46c5b12 512 return CODING_CATEGORY_MASK_EMACS_MULE, else return 0. */
4ed46869
KH
513
514int
0ef69138 515detect_coding_emacs_mule (src, src_end)
b73bfc1c 516 unsigned char *src, *src_end;
4ed46869
KH
517{
518 unsigned char c;
519 int composing = 0;
b73bfc1c
KH
520 /* Dummy for ONE_MORE_BYTE. */
521 struct coding_system dummy_coding;
522 struct coding_system *coding = &dummy_coding;
4ed46869 523
b73bfc1c 524 while (1)
4ed46869 525 {
b73bfc1c 526 ONE_MORE_BYTE (c);
4ed46869
KH
527
528 if (composing)
529 {
530 if (c < 0xA0)
531 composing = 0;
b73bfc1c
KH
532 else if (c == 0xA0)
533 {
534 ONE_MORE_BYTE (c);
535 c &= 0x7F;
536 }
4ed46869
KH
537 else
538 c -= 0x20;
539 }
540
b73bfc1c 541 if (c < 0x20)
4ed46869 542 {
4ed46869
KH
543 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
544 return 0;
b73bfc1c
KH
545 }
546 else if (c >= 0x80 && c < 0xA0)
547 {
548 if (c == 0x80)
549 /* Old leading code for a composite character. */
550 composing = 1;
551 else
552 {
553 unsigned char *src_base = src - 1;
554 int bytes;
4ed46869 555
b73bfc1c
KH
556 if (!UNIBYTE_STR_AS_MULTIBYTE_P (src_base, src_end - src_base,
557 bytes))
558 return 0;
559 src = src_base + bytes;
560 }
561 }
562 }
563 label_end_of_loop:
564 return CODING_CATEGORY_MASK_EMACS_MULE;
565}
4ed46869 566
4ed46869 567
b73bfc1c 568/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
4ed46869 569
b73bfc1c
KH
570static void
571decode_coding_emacs_mule (coding, source, destination, src_bytes, dst_bytes)
572 struct coding_system *coding;
573 unsigned char *source, *destination;
574 int src_bytes, dst_bytes;
575{
576 unsigned char *src = source;
577 unsigned char *src_end = source + src_bytes;
578 unsigned char *dst = destination;
579 unsigned char *dst_end = destination + dst_bytes;
580 /* SRC_BASE remembers the start position in source in each loop.
581 The loop will be exited when there's not enough source code, or
582 when there's not enough destination area to produce a
583 character. */
584 unsigned char *src_base;
4ed46869 585
b73bfc1c
KH
586 coding->produced_char = 0;
587 while (src < src_end)
588 {
589 unsigned char tmp[MAX_MULTIBYTE_LENGTH], *p;
590 int bytes;
ec6d2bb8 591
b73bfc1c
KH
592 src_base = src;
593 if (UNIBYTE_STR_AS_MULTIBYTE_P (src, src_end - src, bytes))
594 {
595 p = src;
596 src += bytes;
597 }
598 else
599 {
600 bytes = CHAR_STRING (*src, tmp);
601 p = tmp;
602 src++;
603 }
604 if (dst + bytes >= (dst_bytes ? dst_end : src))
605 {
606 coding->result = CODING_FINISH_INSUFFICIENT_DST;
4ed46869
KH
607 break;
608 }
b73bfc1c
KH
609 while (bytes--) *dst++ = *p++;
610 coding->produced_char++;
4ed46869 611 }
b73bfc1c
KH
612 coding->consumed = coding->consumed_char = src_base - source;
613 coding->produced = dst - destination;
4ed46869
KH
614}
615
b73bfc1c
KH
/* Encoding to emacs-mule is delegated entirely to encode_eol, with
   the arguments passed through unchanged.  */
#define encode_coding_emacs_mule(coding, source, destination,   \
                                 src_bytes, dst_bytes)          \
  encode_eol (coding, source, destination, src_bytes, dst_bytes)
618
619
4ed46869
KH
620\f
621/*** 3. ISO2022 handlers ***/
622
623/* The following note describes the coding system ISO2022 briefly.
39787efd
KH
624 Since the intention of this note is to help understand the
625 functions in this file, some parts are NOT ACCURATE or OVERLY
626 SIMPLIFIED. For thorough understanding, please refer to the
4ed46869
KH
627 original document of ISO2022.
628
629 ISO2022 provides many mechanisms to encode several character sets
39787efd
KH
630 in 7-bit and 8-bit environments. For 7-bit environments, all text
631 is encoded using bytes less than 128. This may make the encoded
632 text a little bit longer, but the text passes more easily through
633 several gateways, some of which strip off MSB (Most Significant Bit).
b73bfc1c 634
39787efd 635 There are two kinds of character sets: control character set and
4ed46869
KH
636 graphic character set. The former contains control characters such
637 as `newline' and `escape' to provide control functions (control
39787efd
KH
638 functions are also provided by escape sequences). The latter
639 contains graphic characters such as 'A' and '-'. Emacs recognizes
4ed46869
KH
640 two control character sets and many graphic character sets.
641
642 Graphic character sets are classified into one of the following
39787efd
KH
643 four classes, according to the number of bytes (DIMENSION) and
644 number of characters in one dimension (CHARS) of the set:
645 - DIMENSION1_CHARS94
646 - DIMENSION1_CHARS96
647 - DIMENSION2_CHARS94
648 - DIMENSION2_CHARS96
649
650 In addition, each character set is assigned an identification tag,
651 unique for each set, called "final character" (denoted as <F>
652 hereafter). The <F> of each character set is decided by ECMA(*)
653 when it is registered in ISO. The code range of <F> is 0x30..0x7F
654 (0x30..0x3F are for private use only).
4ed46869
KH
655
656 Note (*): ECMA = European Computer Manufacturers Association
657
658 Here are examples of graphic character set [NAME(<F>)]:
659 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
660 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
661 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
662 o DIMENSION2_CHARS96 -- none for the moment
663
39787efd 664 A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
4ed46869
KH
665 C0 [0x00..0x1F] -- control character plane 0
666 GL [0x20..0x7F] -- graphic character plane 0
667 C1 [0x80..0x9F] -- control character plane 1
668 GR [0xA0..0xFF] -- graphic character plane 1
669
670 A control character set is directly designated and invoked to C0 or
39787efd
KH
671 C1 by an escape sequence. The most common case is that:
672 - ISO646's control character set is designated/invoked to C0, and
673 - ISO6429's control character set is designated/invoked to C1,
674 and usually these designations/invocations are omitted in encoded
675 text. In a 7-bit environment, only C0 can be used, and a control
676 character for C1 is encoded by an appropriate escape sequence to
677 fit into the environment. All control characters for C1 are
678 defined to have corresponding escape sequences.
4ed46869
KH
679
680 A graphic character set is at first designated to one of four
681 graphic registers (G0 through G3), then these graphic registers are
682 invoked to GL or GR. These designations and invocations can be
683 done independently. The most common case is that G0 is invoked to
39787efd
KH
684 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
685 these invocations and designations are omitted in encoded text.
686 In a 7-bit environment, only GL can be used.
4ed46869 687
39787efd
KH
688 When a graphic character set of CHARS94 is invoked to GL, codes
689 0x20 and 0x7F of the GL area work as control characters SPACE and
690 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
691 be used.
4ed46869
KH
692
693 There are two ways of invocation: locking-shift and single-shift.
694 With locking-shift, the invocation lasts until the next different
39787efd
KH
695 invocation, whereas with single-shift, the invocation affects the
696 following character only and doesn't affect the locking-shift
697 state. Invocations are done by the following control characters or
698 escape sequences:
4ed46869
KH
699
700 ----------------------------------------------------------------------
39787efd 701 abbrev function cntrl escape seq description
4ed46869 702 ----------------------------------------------------------------------
39787efd
KH
703 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
704 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
705 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
706 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
707 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
708 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
709 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
710 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
711 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
4ed46869 712 ----------------------------------------------------------------------
39787efd
KH
713 (*) These are not used by any known coding system.
714
715 Control characters for these functions are defined by macros
716 ISO_CODE_XXX in `coding.h'.
4ed46869 717
39787efd 718 Designations are done by the following escape sequences:
4ed46869
KH
719 ----------------------------------------------------------------------
720 escape sequence description
721 ----------------------------------------------------------------------
722 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
723 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
724 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
725 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
726 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
727 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
728 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
729 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
730 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
731 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
732 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
733 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
734 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
735 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
736 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
737 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
738 ----------------------------------------------------------------------
739
740 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
39787efd 741 of dimension 1, chars 94, and final character <F>, etc...
4ed46869
KH
742
743 Note (*): Although these designations are not allowed in ISO2022,
744 Emacs accepts them on decoding, and produces them on encoding
39787efd 745 CHARS96 character sets in a coding system which is characterized as
4ed46869
KH
746 7-bit environment, non-locking-shift, and non-single-shift.
747
748 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
39787efd 749 '(' can be omitted. We refer to this as "short-form" hereafter.
4ed46869
KH
750
751 Now you may notice that there are a lot of ways for encoding the
39787efd
KH
752 same multilingual text in ISO2022. Actually, there exist many
753 coding systems such as Compound Text (used in X11's inter client
754 communication), ISO-2022-JP (used in Japanese internet), ISO-2022-KR
755 (used in Korean internet), EUC (Extended UNIX Code, used in Asian
4ed46869
KH
756 localized platforms), and all of these are variants of ISO2022.
757
758 In addition to the above, Emacs handles two more kinds of escape
759 sequences: ISO6429's direction specification and Emacs' private
760 sequence for specifying character composition.
761
39787efd 762 ISO6429's direction specification takes the following form:
4ed46869
KH
763 o CSI ']' -- end of the current direction
764 o CSI '0' ']' -- end of the current direction
765 o CSI '1' ']' -- start of left-to-right text
766 o CSI '2' ']' -- start of right-to-left text
767 The control character CSI (0x9B: control sequence introducer) is
39787efd
KH
768 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
769
770 Character composition specification takes the following form:
ec6d2bb8
KH
771 o ESC '0' -- start relative composition
772 o ESC '1' -- end composition
773 o ESC '2' -- start rule-base composition (*)
774 o ESC '3' -- start relative composition with alternate chars (**)
775 o ESC '4' -- start rule-base composition with alternate chars (**)
b73bfc1c
KH
776 Since these are not standard escape sequences of any ISO standard,
777 the use of them for these meanings is restricted to Emacs only.
ec6d2bb8 778
b73bfc1c
KH
779 (*) This form is used only in Emacs 20.5 and the older versions,
780 but the newer versions can safely decode it.
781 (**) This form is used only in Emacs 21.1 and the newer versions,
782 and the older versions can't decode it.
ec6d2bb8 783
b73bfc1c
KH
784 Here's a list of example usages of these composition escape
785 sequences (categorized by `enum composition_method').
ec6d2bb8 786
b73bfc1c 787 COMPOSITION_RELATIVE:
ec6d2bb8 788 ESC 0 CHAR [ CHAR ] ESC 1
b73bfc1c 789 COMPOSITION_WITH_RULE:
ec6d2bb8 790 ESC 2 CHAR [ RULE CHAR ] ESC 1
b73bfc1c 791 COMPOSITION_WITH_ALTCHARS:
ec6d2bb8 792 ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
b73bfc1c 793 COMPOSITION_WITH_RULE_ALTCHARS:
ec6d2bb8 794 ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
4ed46869
KH
795
/* Table giving the ISO 2022 code class of each of the 256 byte values.
   decode_coding_iso2022 dispatches on iso_code_class[c1] for every
   byte it reads.  NOTE(review): the table is not initialized here;
   presumably it is filled in by initialization code elsewhere in
   this file -- confirm.  */
796enum iso_code_class_type iso_code_class[256];
797
f024b6aa
RS
/* Nonzero if the coding system registered at index IDX of
   coding_system_table exists and can handle CHARSET, i.e. CHARSET is
   flagged in its safe_charsets or the coding system has a requested
   designation for CHARSET.  */
#define CHARSET_OK(idx, charset)                                        \
  (!coding_system_table[idx]                                            \
   ? 0                                                                  \
   : (coding_system_table[idx]->safe_charsets[charset]                  \
      || (CODING_SPEC_ISO_REQUESTED_DESIGNATION                         \
          (coding_system_table[idx], charset)                           \
          != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)))
d46c5b12
KH
804
/* Nonzero if the coding system registered at index IDX of
   coding_system_table exists and has an initial designation (>= 0)
   for graphic register 1.  The table entry is tested first, in the
   same way CHARSET_OK does, so that an unset entry is never
   dereferenced.  */
#define SHIFT_OUT_OK(idx)                                               \
  (coding_system_table[idx]                                             \
   && (CODING_SPEC_ISO_INITIAL_DESIGNATION                              \
       (coding_system_table[idx], 1) >= 0))
807
4ed46869
KH
808/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
809 Check if a text is encoded in ISO2022. If it is, returns an
810 integer in which appropriate flag bits any of:
811 CODING_CATEGORY_MASK_ISO_7
d46c5b12 812 CODING_CATEGORY_MASK_ISO_7_TIGHT
4ed46869
KH
813 CODING_CATEGORY_MASK_ISO_8_1
814 CODING_CATEGORY_MASK_ISO_8_2
7717c392
KH
815 CODING_CATEGORY_MASK_ISO_7_ELSE
816 CODING_CATEGORY_MASK_ISO_8_ELSE
4ed46869
KH
817 are set. If a code which should never appear in ISO2022 is found,
818 returns 0. */
819
820int
821detect_coding_iso2022 (src, src_end)
822 unsigned char *src, *src_end;
823{
d46c5b12
KH
824 int mask = CODING_CATEGORY_MASK_ISO;
825 int mask_found = 0;
f46869e4 826 int reg[4], shift_out = 0, single_shifting = 0;
d46c5b12 827 int c, c1, i, charset;
b73bfc1c
KH
828 /* Dummy for ONE_MORE_BYTE. */
829 struct coding_system dummy_coding;
830 struct coding_system *coding = &dummy_coding;
3f003981 831
d46c5b12 832 reg[0] = CHARSET_ASCII, reg[1] = reg[2] = reg[3] = -1;
3f003981 833 while (mask && src < src_end)
4ed46869 834 {
b73bfc1c 835 ONE_MORE_BYTE (c);
4ed46869
KH
836 switch (c)
837 {
838 case ISO_CODE_ESC:
f46869e4 839 single_shifting = 0;
b73bfc1c 840 ONE_MORE_BYTE (c);
d46c5b12 841 if (c >= '(' && c <= '/')
4ed46869 842 {
bf9cdd4e 843 /* Designation sequence for a charset of dimension 1. */
b73bfc1c 844 ONE_MORE_BYTE (c1);
d46c5b12
KH
845 if (c1 < ' ' || c1 >= 0x80
846 || (charset = iso_charset_table[0][c >= ','][c1]) < 0)
847 /* Invalid designation sequence. Just ignore. */
848 break;
849 reg[(c - '(') % 4] = charset;
bf9cdd4e
KH
850 }
851 else if (c == '$')
852 {
853 /* Designation sequence for a charset of dimension 2. */
b73bfc1c 854 ONE_MORE_BYTE (c);
bf9cdd4e
KH
855 if (c >= '@' && c <= 'B')
856 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
d46c5b12 857 reg[0] = charset = iso_charset_table[1][0][c];
bf9cdd4e 858 else if (c >= '(' && c <= '/')
bcf26d6a 859 {
b73bfc1c 860 ONE_MORE_BYTE (c1);
d46c5b12
KH
861 if (c1 < ' ' || c1 >= 0x80
862 || (charset = iso_charset_table[1][c >= ','][c1]) < 0)
863 /* Invalid designation sequence. Just ignore. */
864 break;
865 reg[(c - '(') % 4] = charset;
bcf26d6a 866 }
bf9cdd4e 867 else
d46c5b12
KH
868 /* Invalid designation sequence. Just ignore. */
869 break;
870 }
ae9ff118 871 else if (c == 'N' || c == 'O')
d46c5b12 872 {
ae9ff118
KH
873 /* ESC <Fe> for SS2 or SS3. */
874 mask &= CODING_CATEGORY_MASK_ISO_7_ELSE;
d46c5b12 875 break;
4ed46869 876 }
ec6d2bb8
KH
877 else if (c >= '0' && c <= '4')
878 {
879 /* ESC <Fp> for start/end composition. */
880 mask_found |= CODING_CATEGORY_MASK_ISO;
881 break;
882 }
bf9cdd4e 883 else
d46c5b12
KH
884 /* Invalid escape sequence. Just ignore. */
885 break;
886
887 /* We found a valid designation sequence for CHARSET. */
888 mask &= ~CODING_CATEGORY_MASK_ISO_8BIT;
889 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7, charset))
890 mask_found |= CODING_CATEGORY_MASK_ISO_7;
891 else
892 mask &= ~CODING_CATEGORY_MASK_ISO_7;
893 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT, charset))
894 mask_found |= CODING_CATEGORY_MASK_ISO_7_TIGHT;
895 else
896 mask &= ~CODING_CATEGORY_MASK_ISO_7_TIGHT;
ae9ff118
KH
897 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_ELSE, charset))
898 mask_found |= CODING_CATEGORY_MASK_ISO_7_ELSE;
899 else
d46c5b12 900 mask &= ~CODING_CATEGORY_MASK_ISO_7_ELSE;
ae9ff118
KH
901 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_8_ELSE, charset))
902 mask_found |= CODING_CATEGORY_MASK_ISO_8_ELSE;
903 else
d46c5b12 904 mask &= ~CODING_CATEGORY_MASK_ISO_8_ELSE;
4ed46869
KH
905 break;
906
4ed46869 907 case ISO_CODE_SO:
f46869e4 908 single_shifting = 0;
d46c5b12
KH
909 if (shift_out == 0
910 && (reg[1] >= 0
911 || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_7_ELSE)
912 || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_8_ELSE)))
913 {
914 /* Locking shift out. */
915 mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
916 mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
917 }
e0e989f6
KH
918 break;
919
d46c5b12 920 case ISO_CODE_SI:
f46869e4 921 single_shifting = 0;
d46c5b12
KH
922 if (shift_out == 1)
923 {
924 /* Locking shift in. */
925 mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
926 mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
927 }
928 break;
929
4ed46869 930 case ISO_CODE_CSI:
f46869e4 931 single_shifting = 0;
4ed46869
KH
932 case ISO_CODE_SS2:
933 case ISO_CODE_SS3:
3f003981
KH
934 {
935 int newmask = CODING_CATEGORY_MASK_ISO_8_ELSE;
936
70c22245
KH
937 if (c != ISO_CODE_CSI)
938 {
d46c5b12
KH
939 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
940 & CODING_FLAG_ISO_SINGLE_SHIFT)
70c22245 941 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
d46c5b12
KH
942 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
943 & CODING_FLAG_ISO_SINGLE_SHIFT)
70c22245 944 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
f46869e4 945 single_shifting = 1;
70c22245 946 }
3f003981
KH
947 if (VECTORP (Vlatin_extra_code_table)
948 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
949 {
d46c5b12
KH
950 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
951 & CODING_FLAG_ISO_LATIN_EXTRA)
3f003981 952 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
d46c5b12
KH
953 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
954 & CODING_FLAG_ISO_LATIN_EXTRA)
3f003981
KH
955 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
956 }
957 mask &= newmask;
d46c5b12 958 mask_found |= newmask;
3f003981
KH
959 }
960 break;
4ed46869
KH
961
962 default:
963 if (c < 0x80)
f46869e4
KH
964 {
965 single_shifting = 0;
966 break;
967 }
4ed46869 968 else if (c < 0xA0)
c4825358 969 {
f46869e4 970 single_shifting = 0;
3f003981
KH
971 if (VECTORP (Vlatin_extra_code_table)
972 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
c4825358 973 {
3f003981
KH
974 int newmask = 0;
975
d46c5b12
KH
976 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
977 & CODING_FLAG_ISO_LATIN_EXTRA)
3f003981 978 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
d46c5b12
KH
979 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
980 & CODING_FLAG_ISO_LATIN_EXTRA)
3f003981
KH
981 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
982 mask &= newmask;
d46c5b12 983 mask_found |= newmask;
c4825358 984 }
3f003981
KH
985 else
986 return 0;
c4825358 987 }
4ed46869
KH
988 else
989 {
d46c5b12 990 mask &= ~(CODING_CATEGORY_MASK_ISO_7BIT
7717c392 991 | CODING_CATEGORY_MASK_ISO_7_ELSE);
d46c5b12 992 mask_found |= CODING_CATEGORY_MASK_ISO_8_1;
f46869e4
KH
993 /* Check the length of succeeding codes of the range
994 0xA0..0FF. If the byte length is odd, we exclude
995 CODING_CATEGORY_MASK_ISO_8_2. We can check this only
996 when we are not single shifting. */
b73bfc1c
KH
997 if (!single_shifting
998 && mask & CODING_CATEGORY_MASK_ISO_8_2)
f46869e4 999 {
e17de821 1000 int i = 1;
b73bfc1c
KH
1001 while (src < src_end)
1002 {
1003 ONE_MORE_BYTE (c);
1004 if (c < 0xA0)
1005 break;
1006 i++;
1007 }
1008
1009 if (i & 1 && src < src_end)
f46869e4
KH
1010 mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
1011 else
1012 mask_found |= CODING_CATEGORY_MASK_ISO_8_2;
1013 }
4ed46869
KH
1014 }
1015 break;
1016 }
1017 }
b73bfc1c 1018 label_end_of_loop:
d46c5b12 1019 return (mask & mask_found);
4ed46869
KH
1020}
1021
b73bfc1c
KH
/* Yield the character code of the character whose charset is CHARSET
   and whose 1st and 2nd position codes are C1 and C2.  When the
   variable `translation_table' (taken from the expanding scope) is
   non-nil, the character is obtained through translate_char with that
   table; otherwise it is built directly with MAKE_CHAR.  */

#define DECODE_ISO_CHARACTER(charset, c1, c2)                           \
  (!NILP (translation_table)                                            \
   ? translate_char (translation_table, -1, charset, c1, c2)            \
   : MAKE_CHAR (charset, c1, c2))
4ed46869
KH
1031
/* Set designation state into CODING: designate to graphic register
   REG the charset identified by DIMENSION, CHARS, and FINAL_CHAR.

   Control jumps to `label_invalid_code' of the expanding function when
   FINAL_CHAR is out of range, when no acceptable charset is found
   (REG is then remembered in last_invalid_designation_register), or
   when ASCII is designated to register 0 right after an invalid
   designation to register 0.

   REG and FINAL_CHAR are parenthesized at each use because callers
   pass compound expressions such as `c1 - 0x28'.  */
#define DECODE_DESIGNATION(reg, dimension, chars, final_char)              \
  do {                                                                     \
    int charset;                                                           \
                                                                           \
    if ((final_char) < '0' || (final_char) >= 128)                         \
      goto label_invalid_code;                                             \
    charset = ISO_CHARSET_TABLE (make_number (dimension),                  \
                                 make_number (chars),                      \
                                 make_number (final_char));                \
    if (charset >= 0                                                       \
        && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)        \
            == (reg)                                                       \
            || coding->safe_charsets[charset]))                            \
      {                                                                    \
        if (coding->spec.iso2022.last_invalid_designation_register == 0    \
            && (reg) == 0                                                  \
            && charset == CHARSET_ASCII)                                   \
          {                                                                \
            /* We should insert this designation sequence as is so         \
               that it is surely written back to a file.  */               \
            coding->spec.iso2022.last_invalid_designation_register = -1;   \
            goto label_invalid_code;                                       \
          }                                                                \
        coding->spec.iso2022.last_invalid_designation_register = -1;       \
        if ((coding->mode & CODING_MODE_DIRECTION)                         \
            && CHARSET_REVERSE_CHARSET (charset) >= 0)                     \
          charset = CHARSET_REVERSE_CHARSET (charset);                     \
        CODING_SPEC_ISO_DESIGNATION (coding, (reg)) = charset;             \
      }                                                                    \
    else                                                                   \
      {                                                                    \
        coding->spec.iso2022.last_invalid_designation_register = (reg);    \
        goto label_invalid_code;                                           \
      }                                                                    \
  } while (0)
1067
ec6d2bb8
KH
1068/* Allocate a memory block for storing information about compositions.
1069 The block is chained to the already allocated blocks. */
d46c5b12 1070
33fb63eb 1071void
ec6d2bb8 1072coding_allocate_composition_data (coding, char_offset)
d46c5b12 1073 struct coding_system *coding;
ec6d2bb8 1074 int char_offset;
d46c5b12 1075{
ec6d2bb8
KH
1076 struct composition_data *cmp_data
1077 = (struct composition_data *) xmalloc (sizeof *cmp_data);
1078
1079 cmp_data->char_offset = char_offset;
1080 cmp_data->used = 0;
1081 cmp_data->prev = coding->cmp_data;
1082 cmp_data->next = NULL;
1083 if (coding->cmp_data)
1084 coding->cmp_data->next = cmp_data;
1085 coding->cmp_data = cmp_data;
1086 coding->cmp_data_start = 0;
1087}
d46c5b12 1088
ec6d2bb8
KH
/* Open a new four-slot composition record in CODING's current
   composition data block.  Slot 0 is set to -1 (it is filled in when
   the composition ends), slot 1 receives START biased by the block's
   char_offset, and slot 3 receives METHOD.  Slot 2 is left for
   CODING_ADD_COMPOSITION_END.  The index of the record is remembered
   in coding->cmp_data_start.  */

#define CODING_ADD_COMPOSITION_START(coding, start, method)             \
  do {                                                                  \
    struct composition_data *cmp_data = coding->cmp_data;               \
    int *data = &cmp_data->data[cmp_data->used];                        \
                                                                        \
    coding->cmp_data_start = cmp_data->used;                            \
    data[0] = -1;                                                       \
    data[1] = cmp_data->char_offset + start;                            \
    data[3] = (int) method;                                             \
    cmp_data->used += 4;                                                \
  } while (0)
1101
/* Close the composition record that starts at coding->cmp_data_start:
   store END (biased by the block's char_offset) in slot 2 and the
   number of slots used since the record's start in slot 0.  */

#define CODING_ADD_COMPOSITION_END(coding, end)                         \
  do {                                                                  \
    struct composition_data *cmp_data = coding->cmp_data;               \
    int *data = &cmp_data->data[coding->cmp_data_start];                \
                                                                        \
    data[2] = cmp_data->char_offset + end;                              \
    data[0] = cmp_data->used - coding->cmp_data_start;                  \
  } while (0)
1111
/* Append one COMPONENT (an alternate character or a composition rule)
   to CODING's current composition data block, advancing its `used'
   count by one.  */

#define CODING_ADD_COMPOSITION_COMPONENT(coding, component)             \
  (coding->cmp_data->data[coding->cmp_data->used++] = (component))
1116
/* Handle a composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
   C1 is the byte that followed ESC.  Uses `coding' and `dst' of the
   expanding function and may jump to its `label_end_of_loop'.  */

#define DECODE_COMPOSITION_START(c1)                                       \
  do {                                                                     \
    if (coding->composing == COMPOSITION_DISABLED)                         \
      {                                                                    \
        /* Composition handling is disabled; copy the sequence as is.  */  \
        *dst++ = ISO_CODE_ESC;                                             \
        *dst++ = c1 & 0x7f;                                                \
        coding->produced_char += 2;                                        \
      }                                                                    \
    else if (COMPOSING_P (coding))                                         \
      {                                                                    \
        /* We are already handling a composition.  If the method is one    \
           of the following two, the codes following the current escape    \
           sequence are actual characters stored in a buffer.  */          \
        if (coding->composing == COMPOSITION_WITH_ALTCHARS                 \
            || coding->composing == COMPOSITION_WITH_RULE_ALTCHARS)        \
          {                                                                \
            coding->composing = COMPOSITION_RELATIVE;                      \
            coding->composition_rule_follows = 0;                          \
          }                                                                \
      }                                                                    \
    else if (coding->cmp_data                                              \
             && (coding->cmp_data->used                                    \
                 + COMPOSITION_DATA_MAX_BUNCH_LENGTH                       \
                 < COMPOSITION_DATA_SIZE))                                 \
      {                                                                    \
        /* This is surely the start of a composition, and                  \
           coding->cmp_data has enough space to store the information      \
           about it.  */                                                   \
        coding->composing = (c1 == '0' ? COMPOSITION_RELATIVE              \
                             : c1 == '2' ? COMPOSITION_WITH_RULE           \
                             : c1 == '3' ? COMPOSITION_WITH_ALTCHARS       \
                             : COMPOSITION_WITH_RULE_ALTCHARS);            \
        CODING_ADD_COMPOSITION_START (coding, coding->produced_char,       \
                                      coding->composing);                  \
        coding->composition_rule_follows = 0;                              \
      }                                                                    \
    else                                                                   \
      {                                                                    \
        /* Not enough space in coding->cmp_data.  Terminate the current    \
           decoding loop so that the caller can allocate one more memory   \
           block for coding->cmp_data and start the decoding loop again.   \
           We can't allocate memory here directly because it may cause     \
           buffer/string relocation.  */                                   \
        coding->result = CODING_FINISH_INSUFFICIENT_CMP;                   \
        goto label_end_of_loop;                                            \
      }                                                                    \
  } while (0)
1164
/* Handle the composition end sequence ESC 1.  C1 is the byte that
   followed ESC.

   When composition handling is disabled, or when no composition is
   currently in progress (a stray ESC 1), the two bytes are copied to
   the destination unchanged.  The second condition matters: without
   it a stray ESC 1 would reach CODING_ADD_COMPOSITION_END, which
   dereferences coding->cmp_data (possibly not allocated yet) and
   rewrites the record of the previously finished composition.

   Otherwise the ending position of the current composition is
   recorded and the composing state is reset to COMPOSITION_NO.  */

#define DECODE_COMPOSITION_END(c1)                                      \
  do {                                                                  \
    if (coding->composing == COMPOSITION_DISABLED                       \
        || ! COMPOSING_P (coding))                                      \
      {                                                                 \
        *dst++ = ISO_CODE_ESC;                                          \
        *dst++ = c1;                                                    \
        coding->produced_char += 2;                                     \
      }                                                                 \
    else                                                                \
      {                                                                 \
        CODING_ADD_COMPOSITION_END (coding, coding->produced_char);     \
        coding->composing = COMPOSITION_NO;                             \
      }                                                                 \
  } while (0)
1181
/* Decode a composition rule that starts with the byte C1 and append
   the encoded rule to coding->cmp_data.  C1 is modified (32 is
   subtracted from it).  In the two-byte format a second byte is read
   from SRC into the variable `c2' of the expanding scope.  A first
   byte outside both formats records the rule 0.  */

#define DECODE_COMPOSITION_RULE(c1)                                     \
  do {                                                                  \
    int rule;                                                           \
                                                                        \
    (c1) -= 32;                                                         \
    if (c1 >= 93)                                                       \
      rule = 0;                                                         \
    else if (c1 >= 81)          /* new format (after ver.21) */         \
      {                                                                 \
        ONE_MORE_BYTE (c2);                                             \
        rule = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32);              \
      }                                                                 \
    else                        /* old format (before ver.21) */        \
      {                                                                 \
        int gref = (c1) / 9;                                            \
        int nref = (c1) % 9;                                            \
                                                                        \
        if (gref == 4)                                                  \
          gref = 10;                                                    \
        if (nref == 4)                                                  \
          nref = 10;                                                    \
        rule = COMPOSITION_ENCODE_RULE (gref, nref);                    \
      }                                                                 \
    CODING_ADD_COMPOSITION_COMPONENT (coding, rule);                    \
    coding->composition_rule_follows = 0;                               \
  } while (0)
88993dfd 1206
d46c5b12 1207
4ed46869
KH
1208/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
1209
b73bfc1c 1210static void
d46c5b12 1211decode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
4ed46869
KH
1212 struct coding_system *coding;
1213 unsigned char *source, *destination;
1214 int src_bytes, dst_bytes;
4ed46869
KH
1215{
1216 unsigned char *src = source;
1217 unsigned char *src_end = source + src_bytes;
1218 unsigned char *dst = destination;
1219 unsigned char *dst_end = destination + dst_bytes;
4ed46869
KH
1220 /* Charsets invoked to graphic plane 0 and 1 respectively. */
1221 int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1222 int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
b73bfc1c
KH
1223 /* SRC_BASE remembers the start position in source in each loop.
1224 The loop will be exited when there's not enough source code
1225 (within macro ONE_MORE_BYTE), or when there's not enough
1226 destination area to produce a character (within macro
1227 EMIT_CHAR). */
1228 unsigned char *src_base;
1229 int c, charset;
1230 Lisp_Object translation_table;
bdd9fb48 1231
b73bfc1c
KH
1232 if (NILP (Venable_character_translation))
1233 translation_table = Qnil;
1234 else
1235 {
1236 translation_table = coding->translation_table_for_decode;
1237 if (NILP (translation_table))
1238 translation_table = Vstandard_translation_table_for_decode;
1239 }
4ed46869 1240
b73bfc1c
KH
1241 coding->result = CODING_FINISH_NORMAL;
1242
1243 while (1)
4ed46869 1244 {
b73bfc1c
KH
1245 int c1, c2;
1246
1247 src_base = src;
1248 ONE_MORE_BYTE (c1);
4ed46869 1249
ec6d2bb8 1250 /* We produce no character or one character. */
4ed46869
KH
1251 switch (iso_code_class [c1])
1252 {
1253 case ISO_0x20_or_0x7F:
ec6d2bb8
KH
1254 if (COMPOSING_P (coding) && coding->composition_rule_follows)
1255 {
1256 DECODE_COMPOSITION_RULE (c1);
b73bfc1c 1257 continue;
ec6d2bb8
KH
1258 }
1259 if (charset0 < 0 || CHARSET_CHARS (charset0) == 94)
4ed46869
KH
1260 {
1261 /* This is SPACE or DEL. */
b73bfc1c 1262 charset = CHARSET_ASCII;
4ed46869
KH
1263 break;
1264 }
1265 /* This is a graphic character, we fall down ... */
1266
1267 case ISO_graphic_plane_0:
ec6d2bb8 1268 if (COMPOSING_P (coding) && coding->composition_rule_follows)
b73bfc1c
KH
1269 {
1270 DECODE_COMPOSITION_RULE (c1);
1271 continue;
1272 }
1273 charset = charset0;
4ed46869
KH
1274 break;
1275
1276 case ISO_0xA0_or_0xFF:
d46c5b12
KH
1277 if (charset1 < 0 || CHARSET_CHARS (charset1) == 94
1278 || coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
fb88bf2d 1279 goto label_invalid_code;
4ed46869
KH
1280 /* This is a graphic character, we fall down ... */
1281
1282 case ISO_graphic_plane_1:
b73bfc1c 1283 if (charset1 < 0)
fb88bf2d 1284 goto label_invalid_code;
b73bfc1c 1285 charset = charset1;
4ed46869
KH
1286 break;
1287
b73bfc1c 1288 case ISO_control_0:
ec6d2bb8
KH
1289 if (COMPOSING_P (coding))
1290 DECODE_COMPOSITION_END ('1');
1291
4ed46869
KH
1292 /* All ISO2022 control characters in this class have the
1293 same representation in Emacs internal format. */
d46c5b12
KH
1294 if (c1 == '\n'
1295 && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1296 && (coding->eol_type == CODING_EOL_CR
1297 || coding->eol_type == CODING_EOL_CRLF))
1298 {
b73bfc1c
KH
1299 coding->result = CODING_FINISH_INCONSISTENT_EOL;
1300 goto label_end_of_loop;
d46c5b12 1301 }
b73bfc1c 1302 charset = CHARSET_ASCII;
4ed46869
KH
1303 break;
1304
b73bfc1c
KH
1305 case ISO_control_1:
1306 if (COMPOSING_P (coding))
1307 DECODE_COMPOSITION_END ('1');
1308 goto label_invalid_code;
1309
4ed46869 1310 case ISO_carriage_return:
ec6d2bb8
KH
1311 if (COMPOSING_P (coding))
1312 DECODE_COMPOSITION_END ('1');
1313
4ed46869 1314 if (coding->eol_type == CODING_EOL_CR)
b73bfc1c 1315 c1 = '\n';
4ed46869
KH
1316 else if (coding->eol_type == CODING_EOL_CRLF)
1317 {
1318 ONE_MORE_BYTE (c1);
b73bfc1c 1319 if (c1 != ISO_CODE_LF)
4ed46869 1320 {
d46c5b12
KH
1321 if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1322 {
b73bfc1c
KH
1323 coding->result = CODING_FINISH_INCONSISTENT_EOL;
1324 goto label_end_of_loop;
d46c5b12 1325 }
4ed46869 1326 src--;
b73bfc1c 1327 c1 = '\r';
4ed46869
KH
1328 }
1329 }
b73bfc1c 1330 charset = CHARSET_ASCII;
4ed46869
KH
1331 break;
1332
1333 case ISO_shift_out:
d46c5b12
KH
1334 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1335 || CODING_SPEC_ISO_DESIGNATION (coding, 1) < 0)
1336 goto label_invalid_code;
4ed46869
KH
1337 CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
1338 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
b73bfc1c 1339 continue;
4ed46869
KH
1340
1341 case ISO_shift_in:
d46c5b12
KH
1342 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
1343 goto label_invalid_code;
4ed46869
KH
1344 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
1345 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
b73bfc1c 1346 continue;
4ed46869
KH
1347
1348 case ISO_single_shift_2_7:
1349 case ISO_single_shift_2:
d46c5b12
KH
1350 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1351 goto label_invalid_code;
4ed46869
KH
1352 /* SS2 is handled as an escape sequence of ESC 'N' */
1353 c1 = 'N';
1354 goto label_escape_sequence;
1355
1356 case ISO_single_shift_3:
d46c5b12
KH
1357 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1358 goto label_invalid_code;
4ed46869
KH
1359 /* SS2 is handled as an escape sequence of ESC 'O' */
1360 c1 = 'O';
1361 goto label_escape_sequence;
1362
1363 case ISO_control_sequence_introducer:
1364 /* CSI is handled as an escape sequence of ESC '[' ... */
1365 c1 = '[';
1366 goto label_escape_sequence;
1367
1368 case ISO_escape:
1369 ONE_MORE_BYTE (c1);
1370 label_escape_sequence:
1371 /* Escape sequences handled by Emacs are invocation,
1372 designation, direction specification, and character
1373 composition specification. */
1374 switch (c1)
1375 {
1376 case '&': /* revision of following character set */
1377 ONE_MORE_BYTE (c1);
1378 if (!(c1 >= '@' && c1 <= '~'))
d46c5b12 1379 goto label_invalid_code;
4ed46869
KH
1380 ONE_MORE_BYTE (c1);
1381 if (c1 != ISO_CODE_ESC)
d46c5b12 1382 goto label_invalid_code;
4ed46869
KH
1383 ONE_MORE_BYTE (c1);
1384 goto label_escape_sequence;
1385
1386 case '$': /* designation of 2-byte character set */
d46c5b12
KH
1387 if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
1388 goto label_invalid_code;
4ed46869
KH
1389 ONE_MORE_BYTE (c1);
1390 if (c1 >= '@' && c1 <= 'B')
1391 { /* designation of JISX0208.1978, GB2312.1980,
88993dfd 1392 or JISX0208.1980 */
4ed46869
KH
1393 DECODE_DESIGNATION (0, 2, 94, c1);
1394 }
1395 else if (c1 >= 0x28 && c1 <= 0x2B)
1396 { /* designation of DIMENSION2_CHARS94 character set */
1397 ONE_MORE_BYTE (c2);
1398 DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
1399 }
1400 else if (c1 >= 0x2C && c1 <= 0x2F)
1401 { /* designation of DIMENSION2_CHARS96 character set */
1402 ONE_MORE_BYTE (c2);
1403 DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
1404 }
1405 else
d46c5b12 1406 goto label_invalid_code;
b73bfc1c
KH
1407 /* We must update these variables now. */
1408 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1409 charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1410 continue;
4ed46869
KH
1411
1412 case 'n': /* invocation of locking-shift-2 */
d46c5b12
KH
1413 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1414 || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
1415 goto label_invalid_code;
4ed46869 1416 CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
e0e989f6 1417 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
b73bfc1c 1418 continue;
4ed46869
KH
1419
1420 case 'o': /* invocation of locking-shift-3 */
d46c5b12
KH
1421 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1422 || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
1423 goto label_invalid_code;
4ed46869 1424 CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
e0e989f6 1425 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
b73bfc1c 1426 continue;
4ed46869
KH
1427
1428 case 'N': /* invocation of single-shift-2 */
d46c5b12
KH
1429 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1430 || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
1431 goto label_invalid_code;
4ed46869 1432 charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
b73bfc1c 1433 ONE_MORE_BYTE (c1);
4ed46869
KH
1434 break;
1435
1436 case 'O': /* invocation of single-shift-3 */
d46c5b12
KH
1437 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1438 || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
1439 goto label_invalid_code;
4ed46869 1440 charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
b73bfc1c 1441 ONE_MORE_BYTE (c1);
4ed46869
KH
1442 break;
1443
ec6d2bb8
KH
1444 case '0': case '2': case '3': case '4': /* start composition */
1445 DECODE_COMPOSITION_START (c1);
b73bfc1c 1446 continue;
4ed46869 1447
ec6d2bb8
KH
1448 case '1': /* end composition */
1449 DECODE_COMPOSITION_END (c1);
b73bfc1c 1450 continue;
4ed46869
KH
1451
1452 case '[': /* specification of direction */
d46c5b12
KH
1453 if (coding->flags & CODING_FLAG_ISO_NO_DIRECTION)
1454 goto label_invalid_code;
4ed46869 1455 /* For the moment, nested direction is not supported.
d46c5b12
KH
1456 So, `coding->mode & CODING_MODE_DIRECTION' zero means
1457 left-to-right, and nozero means right-to-left. */
4ed46869
KH
1458 ONE_MORE_BYTE (c1);
1459 switch (c1)
1460 {
1461 case ']': /* end of the current direction */
d46c5b12 1462 coding->mode &= ~CODING_MODE_DIRECTION;
4ed46869
KH
1463
1464 case '0': /* end of the current direction */
1465 case '1': /* start of left-to-right direction */
1466 ONE_MORE_BYTE (c1);
1467 if (c1 == ']')
d46c5b12 1468 coding->mode &= ~CODING_MODE_DIRECTION;
4ed46869 1469 else
d46c5b12 1470 goto label_invalid_code;
4ed46869
KH
1471 break;
1472
1473 case '2': /* start of right-to-left direction */
1474 ONE_MORE_BYTE (c1);
1475 if (c1 == ']')
d46c5b12 1476 coding->mode |= CODING_MODE_DIRECTION;
4ed46869 1477 else
d46c5b12 1478 goto label_invalid_code;
4ed46869
KH
1479 break;
1480
1481 default:
d46c5b12 1482 goto label_invalid_code;
4ed46869 1483 }
b73bfc1c 1484 continue;
4ed46869
KH
1485
1486 default:
d46c5b12
KH
1487 if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
1488 goto label_invalid_code;
4ed46869
KH
1489 if (c1 >= 0x28 && c1 <= 0x2B)
1490 { /* designation of DIMENSION1_CHARS94 character set */
1491 ONE_MORE_BYTE (c2);
1492 DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
1493 }
1494 else if (c1 >= 0x2C && c1 <= 0x2F)
1495 { /* designation of DIMENSION1_CHARS96 character set */
1496 ONE_MORE_BYTE (c2);
1497 DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
1498 }
1499 else
b73bfc1c
KH
1500 goto label_invalid_code;
1501 /* We must update these variables now. */
1502 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1503 charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1504 continue;
4ed46869 1505 }
b73bfc1c 1506 }
4ed46869 1507
b73bfc1c
KH
1508 /* Now we know CHARSET and 1st position code C1 of a character.
1509 Produce a multibyte sequence for that character while getting
1510 2nd position code C2 if necessary. */
1511 if (CHARSET_DIMENSION (charset) == 2)
1512 {
1513 ONE_MORE_BYTE (c2);
1514 if (c1 < 0x80 ? c2 < 0x20 || c2 >= 0x80 : c2 < 0xA0)
1515 /* C2 is not in a valid range. */
1516 goto label_invalid_code;
4ed46869 1517 }
b73bfc1c
KH
1518 c = DECODE_ISO_CHARACTER (charset, c1, c2);
1519 EMIT_CHAR (c);
4ed46869
KH
1520 continue;
1521
b73bfc1c
KH
1522 label_invalid_code:
1523 coding->errors++;
1524 if (COMPOSING_P (coding))
1525 DECODE_COMPOSITION_END ('1');
4ed46869 1526 src = src_base;
b73bfc1c
KH
1527 c = *src++;
1528 EMIT_CHAR (c);
4ed46869 1529 }
fb88bf2d 1530
b73bfc1c
KH
1531 label_end_of_loop:
1532 coding->consumed = coding->consumed_char = src_base - source;
d46c5b12 1533 coding->produced = dst - destination;
b73bfc1c 1534 return;
4ed46869
KH
1535}
1536
b73bfc1c 1537
f4dee582 1538/* ISO2022 encoding stuff. */
4ed46869
KH
1539
1540/*
f4dee582 1541 It is not enough to say just "ISO2022" on encoding, we have to
d46c5b12 1542 specify more details. In Emacs, each coding system of ISO2022
4ed46869
KH
1543 variant has the following specifications:
1544 1. Initial designation to G0 thru G3.
1545 2. Allows short-form designation?
1546 3. ASCII should be designated to G0 before control characters?
1547 4. ASCII should be designated to G0 at end of line?
1548 5. 7-bit environment or 8-bit environment?
1549 6. Use locking-shift?
1550 7. Use Single-shift?
1551 And the following two are only for Japanese:
1552 8. Use ASCII in place of JIS0201-1976-Roman?
1553 9. Use JISX0208-1983 in place of JISX0208-1978?
1554 These specifications are encoded in `coding->flags' as flag bits
1555 defined by macros CODING_FLAG_ISO_XXX. See `coding.h' for more
f4dee582 1556 details.
4ed46869
KH
1557*/
1558
/* Store at DST the escape sequence that designates CHARSET to graphic
   register REG, advance DST past it, and record the designation in
   CODING.  When CODING has a revision number below 255 for CHARSET,
   the sequence is preceded by ESC '&' and the revision byte.  If
   CHARSET is a 2-dimensional 94-character set designated to register
   0, its <final-char> is '@', 'A', or 'B', and CODING allows it, the
   short form (without intermediate character) is produced.  */

#define ENCODE_DESIGNATION(charset, reg, coding)                           \
  do {                                                                     \
    unsigned char final_char = CHARSET_ISO_FINAL_CHAR (charset);           \
    /* Intermediate characters indexed by register, chosen by the          \
       number of characters (94 or 96) in CHARSET.  */                     \
    char *intermediates = (CHARSET_CHARS (charset) == 94                   \
                           ? "()*+" : ",-./");                             \
    int revision = CODING_SPEC_ISO_REVISION_NUMBER (coding, charset);      \
                                                                           \
    if (revision < 255)                                                    \
      {                                                                    \
        *dst++ = ISO_CODE_ESC;                                             \
        *dst++ = '&';                                                      \
        *dst++ = '@' + revision;                                           \
      }                                                                    \
    *dst++ = ISO_CODE_ESC;                                                 \
    if (CHARSET_DIMENSION (charset) != 1)                                  \
      *dst++ = '$';                                                        \
    /* The intermediate character is omitted only in the short form.  */   \
    if (CHARSET_DIMENSION (charset) == 1                                   \
        || CHARSET_CHARS (charset) != 94                                   \
        || ! (coding->flags & CODING_FLAG_ISO_SHORT_FORM)                  \
        || reg != 0                                                        \
        || final_char < '@' || final_char > 'B')                           \
      *dst++ = (unsigned char) (intermediates[reg]);                       \
    *dst++ = final_char;                                                   \
    CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;                   \
  } while (0)
1601
/* ENCODE_SINGLE_SHIFT_2 and ENCODE_SINGLE_SHIFT_3 store at DST the
   code for the ISO2022 single-shift-2 and single-shift-3 functions
   respectively: the escape sequence ESC 'N' / ESC 'O' when `coding'
   has CODING_FLAG_ISO_SEVEN_BITS set, otherwise the single control
   character ISO_CODE_SS2 / ISO_CODE_SS3.  Both then mark `coding' as
   single shifting.  */

#define ENCODE_SINGLE_SHIFT_2                                           \
  do {                                                                  \
    if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS))                 \
      *dst++ = ISO_CODE_SS2;                                            \
    else                                                                \
      {                                                                 \
        *dst++ = ISO_CODE_ESC;                                          \
        *dst++ = 'N';                                                   \
      }                                                                 \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;                       \
  } while (0)

#define ENCODE_SINGLE_SHIFT_3                                           \
  do {                                                                  \
    if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS))                 \
      *dst++ = ISO_CODE_SS3;                                            \
    else                                                                \
      {                                                                 \
        *dst++ = ISO_CODE_ESC;                                          \
        *dst++ = 'O';                                                   \
      }                                                                 \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;                       \
  } while (0)
1624/* The following four macros produce codes (control character or
1625 escape sequence) for ISO2022 locking-shift functions (shift-in,
1626 shift-out, locking-shift-2, and locking-shift-3). */
1627
b73bfc1c
KH
1628#define ENCODE_SHIFT_IN \
1629 do { \
1630 *dst++ = ISO_CODE_SI; \
4ed46869
KH
1631 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0; \
1632 } while (0)
1633
b73bfc1c
KH
1634#define ENCODE_SHIFT_OUT \
1635 do { \
1636 *dst++ = ISO_CODE_SO; \
4ed46869
KH
1637 CODING_SPEC_ISO_INVOCATION (coding, 0) = 1; \
1638 } while (0)
1639
1640#define ENCODE_LOCKING_SHIFT_2 \
1641 do { \
1642 *dst++ = ISO_CODE_ESC, *dst++ = 'n'; \
1643 CODING_SPEC_ISO_INVOCATION (coding, 0) = 2; \
1644 } while (0)
1645
b73bfc1c
KH
1646#define ENCODE_LOCKING_SHIFT_3 \
1647 do { \
1648 *dst++ = ISO_CODE_ESC, *dst++ = 'o'; \
4ed46869
KH
1649 CODING_SPEC_ISO_INVOCATION (coding, 0) = 3; \
1650 } while (0)
1651
f4dee582
RS
1652/* Produce codes for a DIMENSION1 character whose character set is
1653 CHARSET and whose position-code is C1. Designation and invocation
4ed46869
KH
1654 sequences are also produced in advance if necessary. */
1655
6e85d753
KH
1656#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1) \
1657 do { \
1658 if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding)) \
1659 { \
1660 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
1661 *dst++ = c1 & 0x7F; \
1662 else \
1663 *dst++ = c1 | 0x80; \
1664 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0; \
1665 break; \
1666 } \
1667 else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0)) \
1668 { \
1669 *dst++ = c1 & 0x7F; \
1670 break; \
1671 } \
1672 else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1)) \
1673 { \
1674 *dst++ = c1 | 0x80; \
1675 break; \
1676 } \
1677 else if (coding->flags & CODING_FLAG_ISO_SAFE \
70c22245 1678 && !coding->safe_charsets[charset]) \
6e85d753
KH
1679 { \
1680 /* We should not encode this character, instead produce one or \
1681 two `?'s. */ \
1682 *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION; \
1683 if (CHARSET_WIDTH (charset) == 2) \
1684 *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION; \
1685 break; \
1686 } \
1687 else \
1688 /* Since CHARSET is not yet invoked to any graphic planes, we \
1689 must invoke it, or, at first, designate it to some graphic \
1690 register. Then repeat the loop to actually produce the \
1691 character. */ \
1692 dst = encode_invocation_designation (charset, coding, dst); \
4ed46869
KH
1693 } while (1)
1694
f4dee582
RS
/* Produce codes for a DIMENSION2 character whose character set is
   CHARSET and whose position-codes are C1 and C2.  Designation and
   invocation codes are also produced in advance if necessary.

   The expansion refers to the variables `coding' and `dst' of the
   enclosing function.  CHARSET, C1 and C2 are parenthesized wherever
   they take part in an operator expression, so arbitrary expressions
   may be passed; CHARSET may be evaluated several times, hence the
   arguments must be free of side effects.

   The body is a loop: every branch that emits the character leaves
   with `break'; the last branch only emits designation/invocation
   codes and then the tests are repeated.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
  do {                                                                  \
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
      {                                                                 \
        /* A single shift was just produced; it covers exactly this    \
           character, so clear the state afterwards.  */               \
        if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                 \
          *dst++ = (c1) & 0x7F, *dst++ = (c2) & 0x7F;                   \
        else                                                            \
          *dst++ = (c1) | 0x80, *dst++ = (c2) | 0x80;                   \
        CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
        break;                                                          \
      }                                                                 \
    else if ((charset) == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))    \
      {                                                                 \
        /* CHARSET is invoked to graphic plane 0.  */                   \
        *dst++ = (c1) & 0x7F, *dst++ = (c2) & 0x7F;                     \
        break;                                                          \
      }                                                                 \
    else if ((charset) == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))    \
      {                                                                 \
        /* CHARSET is invoked to graphic plane 1.  */                   \
        *dst++ = (c1) | 0x80, *dst++ = (c2) | 0x80;                     \
        break;                                                          \
      }                                                                 \
    else if (coding->flags & CODING_FLAG_ISO_SAFE                       \
             && !coding->safe_charsets[(charset)])                      \
      {                                                                 \
        /* We should not encode this character, instead produce one or \
           two `?'s.  */                                                \
        *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;                 \
        if (CHARSET_WIDTH (charset) == 2)                               \
          *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;               \
        break;                                                          \
      }                                                                 \
    else                                                                \
      /* Since CHARSET is not yet invoked to any graphic planes, we    \
         must invoke it, or, at first, designate it to some graphic    \
         register.  Then repeat the loop to actually produce the       \
         character.  */                                                 \
      dst = encode_invocation_designation ((charset), coding, dst);     \
  } while (1)
1737
6f551029
KH
/* Produce codes for the character of CHARSET whose position-codes are
   C1 and C2, dispatching on the dimension of CHARSET.

   If CHARSET is not a defined charset, C1 (and C2 when it is not
   negative) are written to DST as they are.  Otherwise the character
   is handed to ENCODE_ISO_CHARACTER_DIMENSION1 or
   ENCODE_ISO_CHARACTER_DIMENSION2, after substituting
   `charset_latin_jisx0201' for ASCII when CODING_FLAG_ISO_USE_ROMAN
   is set, or `charset_jisx0208_1978' for `charset_jisx0208' when
   CODING_FLAG_ISO_USE_OLDJIS is set.

   Refers to the variables `coding' and `dst' of the enclosing
   function.  */
#define ENCODE_ISO_CHARACTER(charset, c1, c2)                           \
  do {                                                                  \
    int alt_charset = charset;                                          \
                                                                        \
    if (CHARSET_DEFINED_P (charset))                                    \
      {                                                                 \
        if (CHARSET_DIMENSION (charset) == 1)                           \
          {                                                             \
            if (charset == CHARSET_ASCII)                               \
              {                                                         \
                if (coding->flags & CODING_FLAG_ISO_USE_ROMAN)          \
                  alt_charset = charset_latin_jisx0201;                 \
              }                                                         \
            ENCODE_ISO_CHARACTER_DIMENSION1 (alt_charset, c1);          \
          }                                                             \
        else                                                            \
          {                                                             \
            if (charset == charset_jisx0208)                            \
              {                                                         \
                if (coding->flags & CODING_FLAG_ISO_USE_OLDJIS)         \
                  alt_charset = charset_jisx0208_1978;                  \
              }                                                         \
            ENCODE_ISO_CHARACTER_DIMENSION2 (alt_charset, c1, c2);      \
          }                                                             \
      }                                                                 \
    else                                                                \
      {                                                                 \
        /* Unknown charset: pass the codes through untouched.  */       \
        *dst++ = c1;                                                    \
        if (c2 >= 0)                                                    \
          *dst++ = c2;                                                  \
      }                                                                 \
  } while (0)
bdd9fb48 1766
4ed46869
KH
1767/* Produce designation and invocation codes at a place pointed by DST
1768 to use CHARSET. The element `spec.iso2022' of *CODING is updated.
1769 Return new DST. */
1770
1771unsigned char *
1772encode_invocation_designation (charset, coding, dst)
1773 int charset;
1774 struct coding_system *coding;
1775 unsigned char *dst;
1776{
1777 int reg; /* graphic register number */
1778
1779 /* At first, check designations. */
1780 for (reg = 0; reg < 4; reg++)
1781 if (charset == CODING_SPEC_ISO_DESIGNATION (coding, reg))
1782 break;
1783
1784 if (reg >= 4)
1785 {
1786 /* CHARSET is not yet designated to any graphic registers. */
1787 /* At first check the requested designation. */
1788 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1ba9e4ab
KH
1789 if (reg == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
1790 /* Since CHARSET requests no special designation, designate it
1791 to graphic register 0. */
4ed46869
KH
1792 reg = 0;
1793
1794 ENCODE_DESIGNATION (charset, reg, coding);
1795 }
1796
1797 if (CODING_SPEC_ISO_INVOCATION (coding, 0) != reg
1798 && CODING_SPEC_ISO_INVOCATION (coding, 1) != reg)
1799 {
1800 /* Since the graphic register REG is not invoked to any graphic
1801 planes, invoke it to graphic plane 0. */
1802 switch (reg)
1803 {
1804 case 0: /* graphic register 0 */
1805 ENCODE_SHIFT_IN;
1806 break;
1807
1808 case 1: /* graphic register 1 */
1809 ENCODE_SHIFT_OUT;
1810 break;
1811
1812 case 2: /* graphic register 2 */
1813 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1814 ENCODE_SINGLE_SHIFT_2;
1815 else
1816 ENCODE_LOCKING_SHIFT_2;
1817 break;
1818
1819 case 3: /* graphic register 3 */
1820 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1821 ENCODE_SINGLE_SHIFT_3;
1822 else
1823 ENCODE_LOCKING_SHIFT_3;
1824 break;
1825 }
1826 }
b73bfc1c 1827
4ed46869
KH
1828 return dst;
1829}
1830
ec6d2bb8
KH
/* Produce 2-byte codes for encoded composition rule RULE.  RULE is
   split by COMPOSITION_DECODE_RULE into two references; the first
   output byte is 32 + 81 plus the first one, the second output byte
   is 32 plus the second one.  Refers to the variable `dst' of the
   enclosing function.  */

#define ENCODE_COMPOSITION_RULE(rule)                   \
  do {                                                  \
    int gref, nref;                                     \
                                                        \
    COMPOSITION_DECODE_RULE (rule, gref, nref);         \
    dst[0] = 32 + 81 + gref;                            \
    dst[1] = 32 + nref;                                 \
    dst += 2;                                           \
  } while (0)
1840
/* Produce codes for indicating the start of a composition sequence
   (ESC 0, ESC 3, or ESC 4).  DATA points to an array of integers
   which specify information about the composition.  See the comment
   in coding.h for the format of DATA.

   DATA[3] is stored as the composition method in CODING->composing.
   For methods other than COMPOSITION_RELATIVE the component index is
   positioned at CODING->cmp_data_start + 4 and the "rule follows"
   flag is cleared.  Refers to the variable `dst' of the enclosing
   function.  */

#define ENCODE_COMPOSITION_START(coding, data)                          \
  do {                                                                  \
    coding->composing = data[3];                                        \
    *dst++ = ISO_CODE_ESC;                                              \
    if (coding->composing == COMPOSITION_RELATIVE)                      \
      *dst++ = '0';                                                     \
    else                                                                \
      {                                                                 \
        if (coding->composing == COMPOSITION_WITH_ALTCHARS)             \
          *dst++ = '3';                                                 \
        else                                                            \
          *dst++ = '4';                                                 \
        coding->cmp_data_index = coding->cmp_data_start + 4;            \
        coding->composition_rule_follows = 0;                           \
      }                                                                 \
  } while (0)
1860
/* Produce codes for indicating the end of the current composition
   (ESC 1).  CODING->cmp_data_start is advanced by DATA[0] and the
   composing state is reset to COMPOSITION_NO.  When that exhausts the
   current composition data block and a next block exists, CODING
   switches to the next block.  Refers to the variable `dst' of the
   enclosing function.  */

#define ENCODE_COMPOSITION_END(coding, data)                            \
  do {                                                                  \
    dst[0] = ISO_CODE_ESC;                                              \
    dst[1] = '1';                                                       \
    dst += 2;                                                           \
    coding->cmp_data_start += data[0];                                  \
    coding->composing = COMPOSITION_NO;                                 \
    if (coding->cmp_data_start == coding->cmp_data->used)               \
      {                                                                 \
        if (coding->cmp_data->next)                                     \
          {                                                             \
            coding->cmp_data = coding->cmp_data->next;                  \
            coding->cmp_data_start = 0;                                 \
          }                                                             \
      }                                                                 \
  } while (0)
1876
/* Produce composition start sequence ESC 0.  Here, this sequence
   doesn't mean the start of a new composition but means that we have
   just produced components (alternate chars and composition rules) of
   the composition and the actual text follows in SRC.  The composing
   state of CODING becomes COMPOSITION_RELATIVE.  Refers to the
   variable `dst' of the enclosing function.  */

#define ENCODE_COMPOSITION_FAKE_START(coding)           \
  do {                                                  \
    dst[0] = ISO_CODE_ESC;                              \
    dst[1] = '0';                                       \
    dst += 2;                                           \
    coding->composing = COMPOSITION_RELATIVE;           \
  } while (0)
4ed46869
KH
1888
/* The following three macros produce codes for indicating direction
   of text.  All of them refer to the variables `coding' and `dst' of
   the enclosing function and are to be used as statements.  */

/* Produce a control sequence introducer: the two bytes ESC '[' when
   CODING_FLAG_ISO_SEVEN_BITS is set in CODING->flags, otherwise the
   single byte ISO_CODE_CSI.  The flag is tested with `&' because
   `flags' is a bit mask that may carry other flags as well.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER              \
  do {                                                  \
    if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)     \
      *dst++ = ISO_CODE_ESC, *dst++ = '[';              \
    else                                                \
      *dst++ = ISO_CODE_CSI;                            \
  } while (0)

/* Produce the control sequence CSI '2' ']'.  */
#define ENCODE_DIRECTION_R2L                            \
  do {                                                  \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;                 \
    *dst++ = '2', *dst++ = ']';                         \
  } while (0)

/* Produce the control sequence CSI '0' ']'.  */
#define ENCODE_DIRECTION_L2R                            \
  do {                                                  \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;                 \
    *dst++ = '0', *dst++ = ']';                         \
  } while (0)
4ed46869
KH
1904
/* Produce codes for designation and invocation to reset the graphic
   planes and registers to initial state.

   If graphic plane 0 does not currently invoke graphic register 0, a
   shift-in is produced.  Then every graphic register that has an
   initial designation (a non-negative value) differing from its
   current designation is designated again to the initial charset.
   Refers to the variables `coding' and `dst' of the enclosing
   function.  */
#define ENCODE_RESET_PLANE_AND_REGISTER                                     \
  do {                                                                      \
    int reg = 0;                                                            \
                                                                            \
    if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0)                        \
      ENCODE_SHIFT_IN;                                                      \
    while (reg < 4)                                                         \
      {                                                                     \
        if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg) >= 0)         \
          {                                                                 \
            if (CODING_SPEC_ISO_DESIGNATION (coding, reg)                   \
                != CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg))       \
              ENCODE_DESIGNATION                                            \
                (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg), reg,    \
                 coding);                                                   \
          }                                                                 \
        reg++;                                                              \
      }                                                                     \
  } while (0)
1919
bdd9fb48 1920/* Produce designation sequences of charsets in the line started from
b73bfc1c 1921 SRC to a place pointed by DST, and return updated DST.
bdd9fb48
KH
1922
1923 If the current block ends before any end-of-line, we may fail to
d46c5b12
KH
1924 find all the necessary designations. */
1925
b73bfc1c
KH
1926static unsigned char *
1927encode_designation_at_bol (coding, translation_table, src, src_end, dst)
e0e989f6 1928 struct coding_system *coding;
b73bfc1c
KH
1929 Lisp_Object translation_table;
1930 unsigned char *src, *src_end, *dst;
e0e989f6 1931{
bdd9fb48
KH
1932 int charset, c, found = 0, reg;
1933 /* Table of charsets to be designated to each graphic register. */
1934 int r[4];
bdd9fb48
KH
1935
1936 for (reg = 0; reg < 4; reg++)
1937 r[reg] = -1;
1938
b73bfc1c 1939 while (found < 4)
e0e989f6 1940 {
b73bfc1c
KH
1941 ONE_MORE_CHAR (c);
1942 if (c == '\n')
1943 break;
bdd9fb48 1944
b73bfc1c 1945 charset = CHAR_CHARSET (c);
e0e989f6 1946 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
d46c5b12 1947 if (reg != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION && r[reg] < 0)
bdd9fb48
KH
1948 {
1949 found++;
1950 r[reg] = charset;
1951 }
bdd9fb48
KH
1952 }
1953
b73bfc1c 1954 label_end_of_loop:
bdd9fb48
KH
1955 if (found)
1956 {
1957 for (reg = 0; reg < 4; reg++)
1958 if (r[reg] >= 0
1959 && CODING_SPEC_ISO_DESIGNATION (coding, reg) != r[reg])
1960 ENCODE_DESIGNATION (r[reg], reg, coding);
e0e989f6 1961 }
b73bfc1c
KH
1962
1963 return dst;
e0e989f6
KH
1964}
1965
4ed46869
KH
1966/* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
1967
b73bfc1c 1968static void
d46c5b12 1969encode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
4ed46869
KH
1970 struct coding_system *coding;
1971 unsigned char *source, *destination;
1972 int src_bytes, dst_bytes;
4ed46869
KH
1973{
1974 unsigned char *src = source;
1975 unsigned char *src_end = source + src_bytes;
1976 unsigned char *dst = destination;
1977 unsigned char *dst_end = destination + dst_bytes;
b73bfc1c 1978 /* Since the maximum bytes produced by each loop is 20, we subtract 19
4ed46869
KH
1979 from DST_END to assure overflow checking is necessary only at the
1980 head of loop. */
b73bfc1c
KH
1981 unsigned char *adjusted_dst_end = dst_end - 19;
1982 /* SRC_BASE remembers the start position in source in each loop.
1983 The loop will be exited when there's not enough source text to
1984 analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
1985 there's not enough destination area to produce encoded codes
1986 (within macro EMIT_BYTES). */
1987 unsigned char *src_base;
1988 int c;
1989 Lisp_Object translation_table;
bdd9fb48 1990
b73bfc1c
KH
1991 if (NILP (Venable_character_translation))
1992 translation_table = Qnil;
1993 else
1994 {
1995 translation_table = coding->translation_table_for_encode;
1996 if (NILP (translation_table))
1997 translation_table = Vstandard_translation_table_for_encode;
1998 }
4ed46869 1999
d46c5b12 2000 coding->consumed_char = 0;
b73bfc1c
KH
2001 coding->errors = 0;
2002 while (1)
4ed46869 2003 {
b73bfc1c
KH
2004 int charset, c1, c2;
2005
2006 src_base = src;
2007
2008 if (dst >= (dst_bytes ? adjusted_dst_end : (src - 19)))
2009 {
2010 coding->result = CODING_FINISH_INSUFFICIENT_DST;
2011 break;
2012 }
4ed46869 2013
e0e989f6
KH
2014 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL
2015 && CODING_SPEC_ISO_BOL (coding))
2016 {
bdd9fb48 2017 /* We have to produce designation sequences if any now. */
b73bfc1c
KH
2018 dst = encode_designation_at_bol (coding, translation_table,
2019 src, src_end, dst);
e0e989f6
KH
2020 CODING_SPEC_ISO_BOL (coding) = 0;
2021 }
2022
ec6d2bb8
KH
2023 /* Check composition start and end. */
2024 if (coding->composing != COMPOSITION_DISABLED
2025 && coding->cmp_data_start < coding->cmp_data->used)
4ed46869 2026 {
ec6d2bb8
KH
2027 struct composition_data *cmp_data = coding->cmp_data;
2028 int *data = cmp_data->data + coding->cmp_data_start;
2029 int this_pos = cmp_data->char_offset + coding->consumed_char;
2030
2031 if (coding->composing == COMPOSITION_RELATIVE)
4ed46869 2032 {
ec6d2bb8
KH
2033 if (this_pos == data[2])
2034 {
2035 ENCODE_COMPOSITION_END (coding, data);
2036 cmp_data = coding->cmp_data;
2037 data = cmp_data->data + coding->cmp_data_start;
2038 }
4ed46869 2039 }
ec6d2bb8 2040 else if (COMPOSING_P (coding))
4ed46869 2041 {
ec6d2bb8
KH
2042 /* COMPOSITION_WITH_ALTCHARS or COMPOSITION_WITH_RULE_ALTCHAR */
2043 if (coding->cmp_data_index == coding->cmp_data_start + data[0])
2044 /* We have consumed components of the composition.
2045 What follows in SRC is the compositions's base
2046 text. */
2047 ENCODE_COMPOSITION_FAKE_START (coding);
2048 else
4ed46869 2049 {
ec6d2bb8
KH
2050 int c = cmp_data->data[coding->cmp_data_index++];
2051 if (coding->composition_rule_follows)
2052 {
2053 ENCODE_COMPOSITION_RULE (c);
2054 coding->composition_rule_follows = 0;
2055 }
2056 else
2057 {
2058 SPLIT_CHAR (c, charset, c1, c2);
2059 ENCODE_ISO_CHARACTER (charset, c1, c2);
ec6d2bb8
KH
2060 if (coding->composing == COMPOSITION_WITH_RULE_ALTCHARS)
2061 coding->composition_rule_follows = 1;
2062 }
4ed46869
KH
2063 continue;
2064 }
ec6d2bb8
KH
2065 }
2066 if (!COMPOSING_P (coding))
2067 {
2068 if (this_pos == data[1])
4ed46869 2069 {
ec6d2bb8
KH
2070 ENCODE_COMPOSITION_START (coding, data);
2071 continue;
4ed46869 2072 }
4ed46869
KH
2073 }
2074 }
ec6d2bb8 2075
b73bfc1c 2076 ONE_MORE_CHAR (c);
4ed46869 2077
b73bfc1c
KH
2078 /* Now encode the character C. */
2079 if (c < 0x20 || c == 0x7F)
2080 {
2081 if (c == '\r')
19a8d9e0 2082 {
b73bfc1c
KH
2083 if (! (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
2084 {
2085 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
2086 ENCODE_RESET_PLANE_AND_REGISTER;
2087 *dst++ = c;
2088 continue;
2089 }
2090 /* fall down to treat '\r' as '\n' ... */
2091 c = '\n';
19a8d9e0 2092 }
b73bfc1c 2093 if (c == '\n')
19a8d9e0 2094 {
b73bfc1c
KH
2095 if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
2096 ENCODE_RESET_PLANE_AND_REGISTER;
2097 if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL)
2098 bcopy (coding->spec.iso2022.initial_designation,
2099 coding->spec.iso2022.current_designation,
2100 sizeof coding->spec.iso2022.initial_designation);
2101 if (coding->eol_type == CODING_EOL_LF
2102 || coding->eol_type == CODING_EOL_UNDECIDED)
2103 *dst++ = ISO_CODE_LF;
2104 else if (coding->eol_type == CODING_EOL_CRLF)
2105 *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
2106 else
2107 *dst++ = ISO_CODE_CR;
2108 CODING_SPEC_ISO_BOL (coding) = 1;
19a8d9e0 2109 }
b73bfc1c 2110 else
19a8d9e0 2111 {
b73bfc1c
KH
2112 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
2113 ENCODE_RESET_PLANE_AND_REGISTER;
2114 *dst++ = c;
19a8d9e0 2115 }
4ed46869 2116 }
b73bfc1c
KH
2117 else if (ASCII_BYTE_P (c))
2118 ENCODE_ISO_CHARACTER (CHARSET_ASCII, c, /* dummy */ c1);
2119 else if (SINGLE_BYTE_CHAR_P (c))
88993dfd 2120 {
b73bfc1c
KH
2121 *dst++ = c;
2122 coding->errors++;
88993dfd 2123 }
b73bfc1c
KH
2124 else
2125 {
2126 SPLIT_CHAR (c, charset, c1, c2);
2127 ENCODE_ISO_CHARACTER (charset, c1, c2);
2128 }
2129
2130 coding->consumed_char++;
84fbb8a0 2131 }
b73bfc1c
KH
2132
2133 label_end_of_loop:
2134 coding->consumed = src_base - source;
d46c5b12 2135 coding->produced = coding->produced_char = dst - destination;
4ed46869
KH
2136}
2137
2138\f
2139/*** 4. SJIS and BIG5 handlers ***/
2140
f4dee582 2141/* Although SJIS and BIG5 are not ISO's coding system, they are used
4ed46869
KH
2142 quite widely. So, for the moment, Emacs supports them in the bare
2143 C code. But, in the future, they may be supported only by CCL. */
2144
2145/* SJIS is a coding system encoding three character sets: ASCII, right
2146 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
2147 as is. A character of charset katakana-jisx0201 is encoded by
2148 "position-code + 0x80". A character of charset japanese-jisx0208
2149 is encoded in two bytes, with the two position-codes divided and
2150 shifted so that they fit in the ranges below.
2151
2152 --- CODE RANGE of SJIS ---
2153 (character set) (range)
2154 ASCII 0x00 .. 0x7F
2155 KATAKANA-JISX0201 0xA0 .. 0xDF
c28a9453 2156 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF
d14d03ac 2157 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
4ed46869
KH
2158 -------------------------------
2159
2160*/
2161
2162/* BIG5 is a coding system encoding two character sets: ASCII and
2163 Big5. An ASCII character is encoded as is. Big5 is a two-byte
2164 character set and is encoded in two-byte.
2165
2166 --- CODE RANGE of BIG5 ---
2167 (character set) (range)
2168 ASCII 0x00 .. 0x7F
2169 Big5 (1st byte) 0xA1 .. 0xFE
2170 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
2171 --------------------------
2172
2173 Since the number of characters in Big5 is larger than maximum
2174 characters in Emacs' charset (96x96), it can't be handled as one
2175 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
2176 and `charset-big5-2'. Both are DIMENSION2 and CHARS94. The former
2177 contains frequently used characters and the latter contains less
2178 frequently used characters. */
2179
2180/* Macros to decode or encode a character of Big5 in BIG5. B1 and B2
2181 are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
2182 C1 and C2 are the 1st and 2nd position-codes of Emacs' internal
2183 format. CHARSET is `charset_big5_1' or `charset_big5_2'. */
2184
/* Number of Big5 characters which have the same code in 1st byte.  */
#define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)

/* Convert the Big5 code B1, B2 (1st and 2nd byte in BIG5 coding
   system) into CHARSET (set to `charset_big5_1' when B1 is below
   0xC9, else to `charset_big5_2') and the position-codes C1 and C2.

   All arguments are parenthesized in the expansion, so arbitrary
   expressions may be passed for B1 and B2.  The inputs are read
   completely before C1 and C2 are assigned, so the same variables
   may be passed for B1/C1 and for B2/C2.  */
#define DECODE_BIG5(b1, b2, charset, c1, c2)                            \
  do {                                                                  \
    unsigned int temp                                                   \
      = (((b1) - 0xA1) * BIG5_SAME_ROW                                  \
         + (b2) - ((b2) < 0x7F ? 0x40 : 0x62));                         \
    if ((b1) < 0xC9)                                                    \
      (charset) = charset_big5_1;                                       \
    else                                                                \
      {                                                                 \
        (charset) = charset_big5_2;                                     \
        temp -= (0xC9 - 0xA1) * BIG5_SAME_ROW;                          \
      }                                                                 \
    (c1) = temp / (0xFF - 0xA1) + 0x21;                                 \
    (c2) = temp % (0xFF - 0xA1) + 0x21;                                 \
  } while (0)

/* Inverse of DECODE_BIG5: convert CHARSET (`charset_big5_1' or
   `charset_big5_2') and the position-codes C1 and C2 into the Big5
   code B1, B2.

   All arguments are parenthesized in the expansion, so arbitrary
   expressions may be passed for CHARSET, C1 and C2.  The inputs are
   read completely before B1 and B2 are assigned, so the same
   variables may be passed for C1/B1 and for C2/B2.  */
#define ENCODE_BIG5(charset, c1, c2, b1, b2)                            \
  do {                                                                  \
    unsigned int temp                                                   \
      = ((c1) - 0x21) * (0xFF - 0xA1) + ((c2) - 0x21);                  \
    if ((charset) == charset_big5_2)                                    \
      temp += BIG5_SAME_ROW * (0xC9 - 0xA1);                            \
    (b1) = temp / BIG5_SAME_ROW + 0xA1;                                 \
    (b2) = temp % BIG5_SAME_ROW;                                        \
    (b2) += (b2) < 0x3F ? 0x40 : 0x62;                                  \
  } while (0)
2212
2213/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2214 Check if a text is encoded in SJIS. If it is, return
2215 CODING_CATEGORY_MASK_SJIS, else return 0. */
2216
2217int
2218detect_coding_sjis (src, src_end)
2219 unsigned char *src, *src_end;
2220{
b73bfc1c
KH
2221 int c;
2222 /* Dummy for ONE_MORE_BYTE. */
2223 struct coding_system dummy_coding;
2224 struct coding_system *coding = &dummy_coding;
4ed46869 2225
b73bfc1c 2226 while (1)
4ed46869 2227 {
b73bfc1c 2228 ONE_MORE_BYTE (c);
4ed46869
KH
2229 if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
2230 {
b73bfc1c
KH
2231 ONE_MORE_BYTE (c);
2232 if (c < 0x40)
4ed46869
KH
2233 return 0;
2234 }
2235 }
b73bfc1c 2236 label_end_of_loop:
4ed46869
KH
2237 return CODING_CATEGORY_MASK_SJIS;
2238}
2239
2240/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2241 Check if a text is encoded in BIG5. If it is, return
2242 CODING_CATEGORY_MASK_BIG5, else return 0. */
2243
2244int
2245detect_coding_big5 (src, src_end)
2246 unsigned char *src, *src_end;
2247{
b73bfc1c
KH
2248 int c;
2249 /* Dummy for ONE_MORE_BYTE. */
2250 struct coding_system dummy_coding;
2251 struct coding_system *coding = &dummy_coding;
4ed46869 2252
b73bfc1c 2253 while (1)
4ed46869 2254 {
b73bfc1c 2255 ONE_MORE_BYTE (c);
4ed46869
KH
2256 if (c >= 0xA1)
2257 {
b73bfc1c 2258 ONE_MORE_BYTE (c);
4ed46869
KH
2259 if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
2260 return 0;
2261 }
2262 }
b73bfc1c 2263 label_end_of_loop:
4ed46869
KH
2264 return CODING_CATEGORY_MASK_BIG5;
2265}
2266
fa42c37f
KH
2267/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2268 Check if a text is encoded in UTF-8. If it is, return
2269 CODING_CATEGORY_MASK_UTF_8, else return 0. */
2270
2271#define UTF_8_1_OCTET_P(c) ((c) < 0x80)
2272#define UTF_8_EXTRA_OCTET_P(c) (((c) & 0xC0) == 0x80)
2273#define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
2274#define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
2275#define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
2276#define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
2277#define UTF_8_6_OCTET_LEADING_P(c) (((c) & 0xFE) == 0xFC)
2278
2279int
2280detect_coding_utf_8 (src, src_end)
2281 unsigned char *src, *src_end;
2282{
2283 unsigned char c;
2284 int seq_maybe_bytes;
b73bfc1c
KH
2285 /* Dummy for ONE_MORE_BYTE. */
2286 struct coding_system dummy_coding;
2287 struct coding_system *coding = &dummy_coding;
fa42c37f 2288
b73bfc1c 2289 while (1)
fa42c37f 2290 {
b73bfc1c 2291 ONE_MORE_BYTE (c);
fa42c37f
KH
2292 if (UTF_8_1_OCTET_P (c))
2293 continue;
2294 else if (UTF_8_2_OCTET_LEADING_P (c))
2295 seq_maybe_bytes = 1;
2296 else if (UTF_8_3_OCTET_LEADING_P (c))
2297 seq_maybe_bytes = 2;
2298 else if (UTF_8_4_OCTET_LEADING_P (c))
2299 seq_maybe_bytes = 3;
2300 else if (UTF_8_5_OCTET_LEADING_P (c))
2301 seq_maybe_bytes = 4;
2302 else if (UTF_8_6_OCTET_LEADING_P (c))
2303 seq_maybe_bytes = 5;
2304 else
2305 return 0;
2306
2307 do
2308 {
b73bfc1c 2309 ONE_MORE_BYTE (c);
fa42c37f
KH
2310 if (!UTF_8_EXTRA_OCTET_P (c))
2311 return 0;
2312 seq_maybe_bytes--;
2313 }
2314 while (seq_maybe_bytes > 0);
2315 }
2316
b73bfc1c 2317 label_end_of_loop:
fa42c37f
KH
2318 return CODING_CATEGORY_MASK_UTF_8;
2319}
2320
2321/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2322 Check if a text is encoded in UTF-16 Big Endian (endian == 1) or
2323 Little Endian (otherwise). If it is, return
2324 CODING_CATEGORY_MASK_UTF_16_BE or CODING_CATEGORY_MASK_UTF_16_LE,
2325 else return 0. */
2326
/* Nonzero if the 16-bit value VAL is 0xFFFE or 0xFFFF.  */
#define UTF_16_INVALID_P(val)   \
  (((val) == 0xFFFE)            \
   || ((val) == 0xFFFF))

/* Nonzero if VAL is a high surrogate, i.e. within 0xD800..0xDBFF.
   The top six bits select the 1024-value block, so they are all
   included in the mask.  */
#define UTF_16_HIGH_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xD800)

/* Nonzero if VAL is a low surrogate, i.e. within 0xDC00..0xDFFF.  */
#define UTF_16_LOW_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xDC00)
2336
2337int
2338detect_coding_utf_16 (src, src_end)
2339 unsigned char *src, *src_end;
2340{
b73bfc1c
KH
2341 unsigned char c1, c2;
2342 /* Dummy for TWO_MORE_BYTES. */
2343 struct coding_system dummy_coding;
2344 struct coding_system *coding = &dummy_coding;
fa42c37f 2345
b73bfc1c
KH
2346 TWO_MORE_BYTES (c1, c2);
2347
2348 if ((c1 == 0xFF) && (c2 == 0xFE))
fa42c37f 2349 return CODING_CATEGORY_MASK_UTF_16_LE;
b73bfc1c 2350 else if ((c1 == 0xFE) && (c2 == 0xFF))
fa42c37f
KH
2351 return CODING_CATEGORY_MASK_UTF_16_BE;
2352
b73bfc1c 2353 label_end_of_loop:
fa42c37f
KH
2354 return 0;
2355}
2356
4ed46869
KH
2357/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
2358 If SJIS_P is 1, decode SJIS text, else decode BIG5 test. */
2359
b73bfc1c 2360static void
4ed46869 2361decode_coding_sjis_big5 (coding, source, destination,
d46c5b12 2362 src_bytes, dst_bytes, sjis_p)
4ed46869
KH
2363 struct coding_system *coding;
2364 unsigned char *source, *destination;
2365 int src_bytes, dst_bytes;
4ed46869
KH
2366 int sjis_p;
2367{
2368 unsigned char *src = source;
2369 unsigned char *src_end = source + src_bytes;
2370 unsigned char *dst = destination;
2371 unsigned char *dst_end = destination + dst_bytes;
b73bfc1c
KH
2372 /* SRC_BASE remembers the start position in source in each loop.
2373 The loop will be exited when there's not enough source code
2374 (within macro ONE_MORE_BYTE), or when there's not enough
2375 destination area to produce a character (within macro
2376 EMIT_CHAR). */
2377 unsigned char *src_base;
2378 Lisp_Object translation_table;
a5d301df 2379
b73bfc1c
KH
2380 if (NILP (Venable_character_translation))
2381 translation_table = Qnil;
2382 else
2383 {
2384 translation_table = coding->translation_table_for_decode;
2385 if (NILP (translation_table))
2386 translation_table = Vstandard_translation_table_for_decode;
2387 }
4ed46869 2388
d46c5b12 2389 coding->produced_char = 0;
b73bfc1c 2390 while (1)
4ed46869 2391 {
b73bfc1c
KH
2392 int c, charset, c1, c2;
2393
2394 src_base = src;
2395 ONE_MORE_BYTE (c1);
2396
2397 if (c1 < 0x80)
4ed46869 2398 {
b73bfc1c
KH
2399 charset = CHARSET_ASCII;
2400 if (c1 < 0x20)
4ed46869 2401 {
b73bfc1c 2402 if (c1 == '\r')
d46c5b12 2403 {
b73bfc1c 2404 if (coding->eol_type == CODING_EOL_CRLF)
d46c5b12 2405 {
b73bfc1c
KH
2406 ONE_MORE_BYTE (c2);
2407 if (c2 == '\n')
2408 c1 = c2;
2409 else if (coding->mode
2410 & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2411 {
2412 coding->result = CODING_FINISH_INCONSISTENT_EOL;
2413 goto label_end_of_loop;
2414 }
2415 else
2416 /* To process C2 again, SRC is subtracted by 1. */
2417 src--;
d46c5b12 2418 }
b73bfc1c
KH
2419 else if (coding->eol_type == CODING_EOL_CR)
2420 c1 = '\n';
2421 }
2422 else if (c1 == '\n'
2423 && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2424 && (coding->eol_type == CODING_EOL_CR
2425 || coding->eol_type == CODING_EOL_CRLF))
2426 {
2427 coding->result = CODING_FINISH_INCONSISTENT_EOL;
2428 goto label_end_of_loop;
d46c5b12 2429 }
4ed46869 2430 }
4ed46869 2431 }
54f78171 2432 else
b73bfc1c 2433 {
4ed46869
KH
2434 if (sjis_p)
2435 {
b73bfc1c
KH
2436 if (c1 >= 0xF0)
2437 goto label_invalid_code;
2438 if (c1 < 0xA0 || c1 >= 0xE0)
fb88bf2d 2439 {
54f78171
KH
2440 /* SJIS -> JISX0208 */
2441 ONE_MORE_BYTE (c2);
b73bfc1c
KH
2442 if (c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
2443 goto label_invalid_code;
2444 DECODE_SJIS (c1, c2, c1, c2);
2445 charset = charset_jisx0208;
5e34de15 2446 }
fb88bf2d 2447 else
b73bfc1c
KH
2448 /* SJIS -> JISX0201-Kana */
2449 charset = charset_katakana_jisx0201;
4ed46869 2450 }
fb88bf2d 2451 else
fb88bf2d 2452 {
54f78171 2453 /* BIG5 -> Big5 */
b73bfc1c
KH
2454 if (c1 < 0xA1 || c1 > 0xFE)
2455 goto label_invalid_code;
2456 ONE_MORE_BYTE (c2);
2457 if (c2 < 0x40 || (c2 > 0x7E && c2 < 0xA1) || c2 > 0xFE)
2458 goto label_invalid_code;
2459 DECODE_BIG5 (c1, c2, charset, c1, c2);
4ed46869
KH
2460 }
2461 }
4ed46869 2462
b73bfc1c
KH
2463 c = DECODE_ISO_CHARACTER (charset, c1, c2);
2464 EMIT_CHAR (c);
fb88bf2d
KH
2465 continue;
2466
b73bfc1c
KH
2467 label_invalid_code:
2468 coding->errors++;
4ed46869 2469 src = src_base;
b73bfc1c
KH
2470 c = *src++;
2471 EMIT_CHAR (c);
fb88bf2d 2472 }
d46c5b12 2473
b73bfc1c
KH
2474 label_end_of_loop:
2475 coding->consumed = coding->consumed_char = src_base - source;
d46c5b12 2476 coding->produced = dst - destination;
b73bfc1c 2477 return;
4ed46869
KH
2478}
2479
2480/* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
b73bfc1c
KH
2481 This function can encode charsets `ascii', `katakana-jisx0201',
2482 `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'. We
2483 are sure that all these charsets are registered as official charset
4ed46869
KH
2484 (i.e. do not have extended leading-codes). Characters of other
2485 charsets are produced without any encoding. If SJIS_P is 1, encode
2486 SJIS text, else encode BIG5 text. */
2487
b73bfc1c 2488static void
4ed46869 2489encode_coding_sjis_big5 (coding, source, destination,
d46c5b12 2490 src_bytes, dst_bytes, sjis_p)
4ed46869
KH
2491 struct coding_system *coding;
2492 unsigned char *source, *destination;
2493 int src_bytes, dst_bytes;
4ed46869
KH
2494 int sjis_p;
2495{
2496 unsigned char *src = source;
2497 unsigned char *src_end = source + src_bytes;
2498 unsigned char *dst = destination;
2499 unsigned char *dst_end = destination + dst_bytes;
b73bfc1c
KH
2500 /* SRC_BASE remembers the start position in source in each loop.
2501 The loop will be exited when there's not enough source text to
2502 analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
2503 there's not enough destination area to produce encoded codes
2504 (within macro EMIT_BYTES). */
2505 unsigned char *src_base;
2506 Lisp_Object translation_table;
4ed46869 2507
b73bfc1c
KH
2508 if (NILP (Venable_character_translation))
2509 translation_table = Qnil;
2510 else
4ed46869 2511 {
b73bfc1c
KH
2512 translation_table = coding->translation_table_for_decode;
2513 if (NILP (translation_table))
2514 translation_table = Vstandard_translation_table_for_decode;
2515 }
a5d301df 2516
b73bfc1c
KH
2517 while (1)
2518 {
2519 int c, charset, c1, c2;
4ed46869 2520
b73bfc1c
KH
2521 src_base = src;
2522 ONE_MORE_CHAR (c);
2523
2524 /* Now encode the character C. */
2525 if (SINGLE_BYTE_CHAR_P (c))
2526 {
2527 switch (c)
4ed46869 2528 {
b73bfc1c
KH
2529 case '\r':
2530 if (!coding->mode & CODING_MODE_SELECTIVE_DISPLAY)
2531 {
2532 EMIT_ONE_BYTE (c);
2533 break;
2534 }
2535 c = '\n';
2536 case '\n':
2537 if (coding->eol_type == CODING_EOL_CRLF)
2538 {
2539 EMIT_TWO_BYTES ('\r', c);
2540 break;
2541 }
2542 else if (coding->eol_type == CODING_EOL_CR)
2543 c = '\r';
2544 default:
2545 EMIT_ONE_BYTE (c);
2546 }
2547 }
2548 else
2549 {
2550 SPLIT_CHAR (c, charset, c1, c2);
2551 if (sjis_p)
2552 {
2553 if (charset == charset_jisx0208
2554 || charset == charset_jisx0208_1978)
2555 {
2556 ENCODE_SJIS (c1, c2, c1, c2);
2557 EMIT_TWO_BYTES (c1, c2);
2558 }
2559 else if (charset == charset_latin_jisx0201)
2560 EMIT_ONE_BYTE (c1);
2561 else
2562 /* There's no way other than producing the internal
2563 codes as is. */
2564 EMIT_BYTES (src_base, src);
4ed46869 2565 }
4ed46869 2566 else
b73bfc1c
KH
2567 {
2568 if (charset == charset_big5_1 || charset == charset_big5_2)
2569 {
2570 ENCODE_BIG5 (charset, c1, c2, c1, c2);
2571 EMIT_TWO_BYTES (c1, c2);
2572 }
2573 else
2574 /* There's no way other than producing the internal
2575 codes as is. */
2576 EMIT_BYTES (src_base, src);
2577 }
4ed46869 2578 }
b73bfc1c 2579 coding->consumed_char++;
4ed46869
KH
2580 }
2581
b73bfc1c
KH
2582 label_end_of_loop:
2583 coding->consumed = src_base - source;
d46c5b12 2584 coding->produced = coding->produced_char = dst - destination;
4ed46869
KH
2585}
2586
2587\f
1397dc18
KH
2588/*** 5. CCL handlers ***/
2589
2590/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2591 Check if a text is encoded in a coding system of which
2592 encoder/decoder are written in CCL program. If it is, return
2593 CODING_CATEGORY_MASK_CCL, else return 0. */
2594
2595int
2596detect_coding_ccl (src, src_end)
2597 unsigned char *src, *src_end;
2598{
2599 unsigned char *valid;
b73bfc1c
KH
2600 int c;
2601 /* Dummy for ONE_MORE_BYTE. */
2602 struct coding_system dummy_coding;
2603 struct coding_system *coding = &dummy_coding;
1397dc18
KH
2604
2605 /* No coding system is assigned to coding-category-ccl. */
2606 if (!coding_system_table[CODING_CATEGORY_IDX_CCL])
2607 return 0;
2608
2609 valid = coding_system_table[CODING_CATEGORY_IDX_CCL]->spec.ccl.valid_codes;
b73bfc1c 2610 while (1)
1397dc18 2611 {
b73bfc1c
KH
2612 ONE_MORE_BYTE (c);
2613 if (! valid[c])
2614 return 0;
1397dc18 2615 }
b73bfc1c 2616 label_end_of_loop:
1397dc18
KH
2617 return CODING_CATEGORY_MASK_CCL;
2618}
2619
2620\f
2621/*** 6. End-of-line handlers ***/
4ed46869 2622
b73bfc1c 2623/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
4ed46869 2624
b73bfc1c 2625static void
d46c5b12 2626decode_eol (coding, source, destination, src_bytes, dst_bytes)
4ed46869
KH
2627 struct coding_system *coding;
2628 unsigned char *source, *destination;
2629 int src_bytes, dst_bytes;
4ed46869
KH
2630{
2631 unsigned char *src = source;
4ed46869 2632 unsigned char *dst = destination;
b73bfc1c
KH
2633 unsigned char *src_end = src + src_bytes;
2634 unsigned char *dst_end = dst + dst_bytes;
2635 Lisp_Object translation_table;
2636 /* SRC_BASE remembers the start position in source in each loop.
2637 The loop will be exited when there's not enough source code
2638 (within macro ONE_MORE_BYTE), or when there's not enough
2639 destination area to produce a character (within macro
2640 EMIT_CHAR). */
2641 unsigned char *src_base;
2642 int c;
2643
2644 translation_table = Qnil;
4ed46869
KH
2645 switch (coding->eol_type)
2646 {
2647 case CODING_EOL_CRLF:
b73bfc1c 2648 while (1)
d46c5b12 2649 {
b73bfc1c
KH
2650 src_base = src;
2651 ONE_MORE_BYTE (c);
2652 if (c == '\r')
fb88bf2d 2653 {
b73bfc1c
KH
2654 ONE_MORE_BYTE (c);
2655 if (c != '\n')
2656 {
2657 if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2658 {
2659 coding->result = CODING_FINISH_INCONSISTENT_EOL;
2660 goto label_end_of_loop;
2661 }
2662 src--;
2663 c = '\r';
2664 }
fb88bf2d 2665 }
b73bfc1c
KH
2666 else if (c == '\n'
2667 && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL))
d46c5b12 2668 {
b73bfc1c
KH
2669 coding->result = CODING_FINISH_INCONSISTENT_EOL;
2670 goto label_end_of_loop;
d46c5b12 2671 }
b73bfc1c 2672 EMIT_CHAR (c);
d46c5b12 2673 }
b73bfc1c
KH
2674 break;
2675
2676 case CODING_EOL_CR:
2677 while (1)
d46c5b12 2678 {
b73bfc1c
KH
2679 src_base = src;
2680 ONE_MORE_BYTE (c);
2681 if (c == '\n')
2682 {
2683 if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2684 {
2685 coding->result = CODING_FINISH_INCONSISTENT_EOL;
2686 goto label_end_of_loop;
2687 }
2688 }
2689 else if (c == '\r')
2690 c = '\n';
2691 EMIT_CHAR (c);
d46c5b12 2692 }
4ed46869
KH
2693 break;
2694
b73bfc1c
KH
2695 default: /* no need for EOL handling */
2696 while (1)
d46c5b12 2697 {
b73bfc1c
KH
2698 src_base = src;
2699 ONE_MORE_BYTE (c);
2700 EMIT_CHAR (c);
d46c5b12 2701 }
4ed46869
KH
2702 }
2703
b73bfc1c
KH
2704 label_end_of_loop:
2705 coding->consumed = coding->consumed_char = src_base - source;
2706 coding->produced = dst - destination;
2707 return;
4ed46869
KH
2708}
2709
2710/* See "GENERAL NOTES about `encode_coding_XXX ()' functions". Encode
b73bfc1c
KH
2711 format of end-of-line according to `coding->eol_type'. It also
2712 convert multibyte form 8-bit characers to unibyte if
2713 CODING->src_multibyte is nonzero. If `coding->mode &
2714 CODING_MODE_SELECTIVE_DISPLAY' is nonzero, code '\r' in source text
2715 also means end-of-line. */
4ed46869 2716
b73bfc1c 2717static void
d46c5b12 2718encode_eol (coding, source, destination, src_bytes, dst_bytes)
4ed46869
KH
2719 struct coding_system *coding;
2720 unsigned char *source, *destination;
2721 int src_bytes, dst_bytes;
4ed46869
KH
2722{
2723 unsigned char *src = source;
2724 unsigned char *dst = destination;
b73bfc1c
KH
2725 unsigned char *src_end = src + src_bytes;
2726 unsigned char *dst_end = dst + dst_bytes;
2727 Lisp_Object translation_table;
2728 /* SRC_BASE remembers the start position in source in each loop.
2729 The loop will be exited when there's not enough source text to
2730 analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
2731 there's not enough destination area to produce encoded codes
2732 (within macro EMIT_BYTES). */
2733 unsigned char *src_base;
2734 int c;
2735 int selective_display = coding->mode & CODING_MODE_SELECTIVE_DISPLAY;
2736
2737 translation_table = Qnil;
2738 if (coding->src_multibyte
2739 && *(src_end - 1) == LEADING_CODE_8_BIT_CONTROL)
2740 {
2741 src_end--;
2742 src_bytes--;
2743 coding->result = CODING_FINISH_INSUFFICIENT_SRC;
2744 }
fb88bf2d 2745
d46c5b12
KH
2746 if (coding->eol_type == CODING_EOL_CRLF)
2747 {
b73bfc1c 2748 while (src < src_end)
d46c5b12 2749 {
b73bfc1c 2750 src_base = src;
d46c5b12 2751 c = *src++;
b73bfc1c
KH
2752 if (c >= 0x20)
2753 EMIT_ONE_BYTE (c);
2754 else if (c == '\n' || (c == '\r' && selective_display))
2755 EMIT_TWO_BYTES ('\r', '\n');
d46c5b12 2756 else
b73bfc1c 2757 EMIT_ONE_BYTE (c);
d46c5b12 2758 }
ff2b1ea9 2759 src_base = src;
b73bfc1c 2760 label_end_of_loop:
005f0d35 2761 ;
d46c5b12
KH
2762 }
2763 else
4ed46869 2764 {
b73bfc1c 2765 if (src_bytes <= dst_bytes)
4ed46869 2766 {
b73bfc1c
KH
2767 safe_bcopy (src, dst, src_bytes);
2768 src_base = src_end;
2769 dst += src_bytes;
d46c5b12 2770 }
d46c5b12 2771 else
b73bfc1c
KH
2772 {
2773 if (coding->src_multibyte
2774 && *(src + dst_bytes - 1) == LEADING_CODE_8_BIT_CONTROL)
2775 dst_bytes--;
2776 safe_bcopy (src, dst, dst_bytes);
2777 src_base = src + dst_bytes;
2778 dst = destination + dst_bytes;
2779 coding->result = CODING_FINISH_INSUFFICIENT_DST;
2780 }
993824c9 2781 if (coding->eol_type == CODING_EOL_CR)
d46c5b12 2782 {
b73bfc1c
KH
2783 for (src = destination; src < dst; src++)
2784 if (*src == '\n') *src = '\r';
d46c5b12 2785 }
b73bfc1c 2786 else if (selective_display)
d46c5b12 2787 {
b73bfc1c
KH
2788 for (src = destination; src < dst; src++)
2789 if (*src == '\r') *src = '\n';
4ed46869 2790 }
4ed46869 2791 }
b73bfc1c
KH
2792 if (coding->src_multibyte)
2793 dst = destination + str_as_unibyte (destination, dst - destination);
4ed46869 2794
b73bfc1c
KH
2795 coding->consumed = src_base - source;
2796 coding->produced = dst - destination;
4ed46869
KH
2797}
2798
2799\f
1397dc18 2800/*** 7. C library functions ***/
4ed46869
KH
2801
2802/* In Emacs Lisp, coding system is represented by a Lisp symbol which
2803 has a property `coding-system'. The value of this property is a
2804 vector of length 5 (called as coding-vector). Among elements of
2805 this vector, the first (element[0]) and the fifth (element[4])
2806 carry important information for decoding/encoding. Before
2807 decoding/encoding, this information should be set in fields of a
2808 structure of type `coding_system'.
2809
2810 A value of property `coding-system' can be a symbol of another
2811 subsidiary coding-system. In that case, Emacs gets coding-vector
2812 from that symbol.
2813
2814 `element[0]' contains information to be set in `coding->type'. The
2815 value and its meaning is as follows:
2816
0ef69138
KH
2817 0 -- coding_type_emacs_mule
2818 1 -- coding_type_sjis
2819 2 -- coding_type_iso2022
2820 3 -- coding_type_big5
2821 4 -- coding_type_ccl encoder/decoder written in CCL
5 -- coding_type_raw_text
2822 nil -- coding_type_no_conversion
2823 t -- coding_type_undecided (automatic conversion on decoding,
2824 no-conversion on encoding)
4ed46869
KH
2825
2826 `element[4]' contains information to be set in `coding->flags' and
2827 `coding->spec'. The meaning varies by `coding->type'.
2828
2829 If `coding->type' is `coding_type_iso2022', element[4] is a vector
2830 of length 32 (of which the first 17 sub-elements are used now).
2831 Meanings of these sub-elements are:
2832
2833 sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
2834 If the value is an integer of valid charset, the charset is
2835 assumed to be designated to graphic register N initially.
2836
2837 If the value is minus, it is a minus value of charset which
2838 reserves graphic register N, which means that the charset is
2839 not designated initially but should be designated to graphic
2840 register N just before encoding a character in that charset.
2841
2842 If the value is nil, graphic register N is never used on
2843 encoding.
2844
2845 sub-element[N] where N is 4 through 16: to be set in `coding->flags'
2846 Each value takes t or nil. See the section ISO2022 of
2847 `coding.h' for more information.
2848
2849 If `coding->type' is `coding_type_big5', element[4] is t to denote
2850 BIG5-ETen or nil to denote BIG5-HKU.
2851
2852 If `coding->type' takes the other value, element[4] is ignored.
2853
2854 Emacs Lisp's coding system also carries information about format of
2855 end-of-line in a value of property `eol-type'. If the value is
2856 integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
2857 means CODING_EOL_CR. If it is not integer, it should be a vector
2858 of subsidiary coding systems of which property `eol-type' has one
2859 of above values.
2860
2861*/
2862
2863/* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
2864 and set it in CODING. If CODING_SYSTEM_SYMBOL is invalid, CODING
2865 is setup so that no conversion is necessary and return -1, else
2866 return 0. */
2867
2868int
e0e989f6
KH
2869setup_coding_system (coding_system, coding)
2870 Lisp_Object coding_system;
4ed46869
KH
2871 struct coding_system *coding;
2872{
d46c5b12 2873 Lisp_Object coding_spec, coding_type, eol_type, plist;
4608c386 2874 Lisp_Object val;
70c22245 2875 int i;
4ed46869 2876
d46c5b12 2877 /* Initialize some fields required for all kinds of coding systems. */
774324d6 2878 coding->symbol = coding_system;
d46c5b12
KH
2879 coding->common_flags = 0;
2880 coding->mode = 0;
2881 coding->heading_ascii = -1;
2882 coding->post_read_conversion = coding->pre_write_conversion = Qnil;
ec6d2bb8
KH
2883 coding->composing = COMPOSITION_DISABLED;
2884 coding->cmp_data = NULL;
1f5dbf34
KH
2885
2886 if (NILP (coding_system))
2887 goto label_invalid_coding_system;
2888
4608c386 2889 coding_spec = Fget (coding_system, Qcoding_system);
1f5dbf34 2890
4608c386
KH
2891 if (!VECTORP (coding_spec)
2892 || XVECTOR (coding_spec)->size != 5
2893 || !CONSP (XVECTOR (coding_spec)->contents[3]))
4ed46869 2894 goto label_invalid_coding_system;
4608c386 2895
d46c5b12
KH
2896 eol_type = inhibit_eol_conversion ? Qnil : Fget (coding_system, Qeol_type);
2897 if (VECTORP (eol_type))
2898 {
2899 coding->eol_type = CODING_EOL_UNDECIDED;
2900 coding->common_flags = CODING_REQUIRE_DETECTION_MASK;
2901 }
2902 else if (XFASTINT (eol_type) == 1)
2903 {
2904 coding->eol_type = CODING_EOL_CRLF;
2905 coding->common_flags
2906 = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
2907 }
2908 else if (XFASTINT (eol_type) == 2)
2909 {
2910 coding->eol_type = CODING_EOL_CR;
2911 coding->common_flags
2912 = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
2913 }
2914 else
2915 coding->eol_type = CODING_EOL_LF;
2916
2917 coding_type = XVECTOR (coding_spec)->contents[0];
2918 /* Try short cut. */
2919 if (SYMBOLP (coding_type))
2920 {
2921 if (EQ (coding_type, Qt))
2922 {
2923 coding->type = coding_type_undecided;
2924 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
2925 }
2926 else
2927 coding->type = coding_type_no_conversion;
2928 return 0;
2929 }
2930
d46c5b12
KH
2931 /* Get values of coding system properties:
2932 `post-read-conversion', `pre-write-conversion',
f967223b 2933 `translation-table-for-decode', `translation-table-for-encode'. */
4608c386 2934 plist = XVECTOR (coding_spec)->contents[3];
b843d1ae
KH
2935 /* Pre & post conversion functions should be disabled if
2936 inhibit_eol_conversion is nozero. This is the case that a code
2937 conversion function is called while those functions are running. */
2938 if (! inhibit_pre_post_conversion)
2939 {
2940 coding->post_read_conversion = Fplist_get (plist, Qpost_read_conversion);
2941 coding->pre_write_conversion = Fplist_get (plist, Qpre_write_conversion);
2942 }
f967223b 2943 val = Fplist_get (plist, Qtranslation_table_for_decode);
4608c386 2944 if (SYMBOLP (val))
f967223b
KH
2945 val = Fget (val, Qtranslation_table_for_decode);
2946 coding->translation_table_for_decode = CHAR_TABLE_P (val) ? val : Qnil;
2947 val = Fplist_get (plist, Qtranslation_table_for_encode);
4608c386 2948 if (SYMBOLP (val))
f967223b
KH
2949 val = Fget (val, Qtranslation_table_for_encode);
2950 coding->translation_table_for_encode = CHAR_TABLE_P (val) ? val : Qnil;
d46c5b12
KH
2951 val = Fplist_get (plist, Qcoding_category);
2952 if (!NILP (val))
2953 {
2954 val = Fget (val, Qcoding_category_index);
2955 if (INTEGERP (val))
2956 coding->category_idx = XINT (val);
2957 else
2958 goto label_invalid_coding_system;
2959 }
2960 else
2961 goto label_invalid_coding_system;
4608c386 2962
70c22245
KH
2963 val = Fplist_get (plist, Qsafe_charsets);
2964 if (EQ (val, Qt))
2965 {
2966 for (i = 0; i <= MAX_CHARSET; i++)
2967 coding->safe_charsets[i] = 1;
2968 }
2969 else
2970 {
2971 bzero (coding->safe_charsets, MAX_CHARSET + 1);
2972 while (CONSP (val))
2973 {
03699b14 2974 if ((i = get_charset_id (XCAR (val))) >= 0)
70c22245 2975 coding->safe_charsets[i] = 1;
03699b14 2976 val = XCDR (val);
70c22245
KH
2977 }
2978 }
2979
ec6d2bb8
KH
2980 /* If the coding system has non-nil `composition' property, enable
2981 composition handling. */
2982 val = Fplist_get (plist, Qcomposition);
2983 if (!NILP (val))
2984 coding->composing = COMPOSITION_NO;
2985
d46c5b12 2986 switch (XFASTINT (coding_type))
4ed46869
KH
2987 {
2988 case 0:
0ef69138 2989 coding->type = coding_type_emacs_mule;
c952af22
KH
2990 if (!NILP (coding->post_read_conversion))
2991 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
2992 if (!NILP (coding->pre_write_conversion))
2993 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
4ed46869
KH
2994 break;
2995
2996 case 1:
2997 coding->type = coding_type_sjis;
c952af22
KH
2998 coding->common_flags
2999 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
4ed46869
KH
3000 break;
3001
3002 case 2:
3003 coding->type = coding_type_iso2022;
c952af22
KH
3004 coding->common_flags
3005 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
4ed46869 3006 {
70c22245 3007 Lisp_Object val, temp;
4ed46869 3008 Lisp_Object *flags;
d46c5b12 3009 int i, charset, reg_bits = 0;
4ed46869 3010
4608c386 3011 val = XVECTOR (coding_spec)->contents[4];
f44d27ce 3012
4ed46869
KH
3013 if (!VECTORP (val) || XVECTOR (val)->size != 32)
3014 goto label_invalid_coding_system;
3015
3016 flags = XVECTOR (val)->contents;
3017 coding->flags
3018 = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
3019 | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
3020 | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
3021 | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
3022 | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
3023 | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
3024 | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
3025 | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
e0e989f6
KH
3026 | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION)
3027 | (NILP (flags[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL)
c4825358
KH
3028 | (NILP (flags[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL)
3029 | (NILP (flags[15]) ? 0 : CODING_FLAG_ISO_SAFE)
3f003981 3030 | (NILP (flags[16]) ? 0 : CODING_FLAG_ISO_LATIN_EXTRA)
c4825358 3031 );
4ed46869
KH
3032
3033 /* Invoke graphic register 0 to plane 0. */
3034 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
3035 /* Invoke graphic register 1 to plane 1 if we can use full 8-bit. */
3036 CODING_SPEC_ISO_INVOCATION (coding, 1)
3037 = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
3038 /* Not single shifting at first. */
6e85d753 3039 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;
e0e989f6 3040 /* Beginning of buffer should also be regarded as bol. */
6e85d753 3041 CODING_SPEC_ISO_BOL (coding) = 1;
4ed46869 3042
70c22245
KH
3043 for (charset = 0; charset <= MAX_CHARSET; charset++)
3044 CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = 255;
3045 val = Vcharset_revision_alist;
3046 while (CONSP (val))
3047 {
03699b14 3048 charset = get_charset_id (Fcar_safe (XCAR (val)));
70c22245 3049 if (charset >= 0
03699b14 3050 && (temp = Fcdr_safe (XCAR (val)), INTEGERP (temp))
70c22245
KH
3051 && (i = XINT (temp), (i >= 0 && (i + '@') < 128)))
3052 CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = i;
03699b14 3053 val = XCDR (val);
70c22245
KH
3054 }
3055
4ed46869
KH
3056 /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
3057 FLAGS[REG] can be one of below:
3058 integer CHARSET: CHARSET occupies register I,
3059 t: designate nothing to REG initially, but can be used
3060 by any charsets,
3061 list of integer, nil, or t: designate the first
3062 element (if integer) to REG initially, the remaining
3063 elements (if integer) is designated to REG on request,
d46c5b12 3064 if an element is t, REG can be used by any charsets,
4ed46869 3065 nil: REG is never used. */
467e7675 3066 for (charset = 0; charset <= MAX_CHARSET; charset++)
1ba9e4ab
KH
3067 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3068 = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION;
4ed46869
KH
3069 for (i = 0; i < 4; i++)
3070 {
3071 if (INTEGERP (flags[i])
e0e989f6
KH
3072 && (charset = XINT (flags[i]), CHARSET_VALID_P (charset))
3073 || (charset = get_charset_id (flags[i])) >= 0)
4ed46869
KH
3074 {
3075 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3076 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
3077 }
3078 else if (EQ (flags[i], Qt))
3079 {
3080 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
d46c5b12
KH
3081 reg_bits |= 1 << i;
3082 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
4ed46869
KH
3083 }
3084 else if (CONSP (flags[i]))
3085 {
84d60297
RS
3086 Lisp_Object tail;
3087 tail = flags[i];
4ed46869 3088
d46c5b12 3089 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
03699b14
KR
3090 if (INTEGERP (XCAR (tail))
3091 && (charset = XINT (XCAR (tail)),
e0e989f6 3092 CHARSET_VALID_P (charset))
03699b14 3093 || (charset = get_charset_id (XCAR (tail))) >= 0)
4ed46869
KH
3094 {
3095 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3096 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
3097 }
3098 else
3099 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
03699b14 3100 tail = XCDR (tail);
4ed46869
KH
3101 while (CONSP (tail))
3102 {
03699b14
KR
3103 if (INTEGERP (XCAR (tail))
3104 && (charset = XINT (XCAR (tail)),
e0e989f6 3105 CHARSET_VALID_P (charset))
03699b14 3106 || (charset = get_charset_id (XCAR (tail))) >= 0)
70c22245
KH
3107 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3108 = i;
03699b14 3109 else if (EQ (XCAR (tail), Qt))
d46c5b12 3110 reg_bits |= 1 << i;
03699b14 3111 tail = XCDR (tail);
4ed46869
KH
3112 }
3113 }
3114 else
3115 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3116
3117 CODING_SPEC_ISO_DESIGNATION (coding, i)
3118 = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
3119 }
3120
d46c5b12 3121 if (reg_bits && ! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
4ed46869
KH
3122 {
3123 /* REG 1 can be used only by locking shift in 7-bit env. */
3124 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
d46c5b12 3125 reg_bits &= ~2;
4ed46869
KH
3126 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
3127 /* Without any shifting, only REG 0 and 1 can be used. */
d46c5b12 3128 reg_bits &= 3;
4ed46869
KH
3129 }
3130
d46c5b12
KH
3131 if (reg_bits)
3132 for (charset = 0; charset <= MAX_CHARSET; charset++)
6e85d753 3133 {
d46c5b12
KH
3134 if (CHARSET_VALID_P (charset))
3135 {
3136 /* There exist some default graphic registers to be
3137 used CHARSET. */
3138
3139 /* We had better avoid designating a charset of
3140 CHARS96 to REG 0 as far as possible. */
3141 if (CHARSET_CHARS (charset) == 96)
3142 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3143 = (reg_bits & 2
3144 ? 1 : (reg_bits & 4 ? 2 : (reg_bits & 8 ? 3 : 0)));
3145 else
3146 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3147 = (reg_bits & 1
3148 ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
3149 }
6e85d753 3150 }
4ed46869 3151 }
c952af22 3152 coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
d46c5b12 3153 coding->spec.iso2022.last_invalid_designation_register = -1;
4ed46869
KH
3154 break;
3155
3156 case 3:
3157 coding->type = coding_type_big5;
c952af22
KH
3158 coding->common_flags
3159 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
4ed46869 3160 coding->flags
4608c386 3161 = (NILP (XVECTOR (coding_spec)->contents[4])
4ed46869
KH
3162 ? CODING_FLAG_BIG5_HKU
3163 : CODING_FLAG_BIG5_ETEN);
3164 break;
3165
3166 case 4:
3167 coding->type = coding_type_ccl;
c952af22
KH
3168 coding->common_flags
3169 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
4ed46869 3170 {
84d60297 3171 val = XVECTOR (coding_spec)->contents[4];
ef4ced28
KH
3172 if (! CONSP (val)
3173 || setup_ccl_program (&(coding->spec.ccl.decoder),
03699b14 3174 XCAR (val)) < 0
ef4ced28 3175 || setup_ccl_program (&(coding->spec.ccl.encoder),
03699b14 3176 XCDR (val)) < 0)
4ed46869 3177 goto label_invalid_coding_system;
1397dc18
KH
3178
3179 bzero (coding->spec.ccl.valid_codes, 256);
3180 val = Fplist_get (plist, Qvalid_codes);
3181 if (CONSP (val))
3182 {
3183 Lisp_Object this;
3184
03699b14 3185 for (; CONSP (val); val = XCDR (val))
1397dc18 3186 {
03699b14 3187 this = XCAR (val);
1397dc18
KH
3188 if (INTEGERP (this)
3189 && XINT (this) >= 0 && XINT (this) < 256)
3190 coding->spec.ccl.valid_codes[XINT (this)] = 1;
3191 else if (CONSP (this)
03699b14
KR
3192 && INTEGERP (XCAR (this))
3193 && INTEGERP (XCDR (this)))
1397dc18 3194 {
03699b14
KR
3195 int start = XINT (XCAR (this));
3196 int end = XINT (XCDR (this));
1397dc18
KH
3197
3198 if (start >= 0 && start <= end && end < 256)
e133c8fa 3199 while (start <= end)
1397dc18
KH
3200 coding->spec.ccl.valid_codes[start++] = 1;
3201 }
3202 }
3203 }
4ed46869 3204 }
c952af22 3205 coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
4ed46869
KH
3206 break;
3207
27901516
KH
3208 case 5:
3209 coding->type = coding_type_raw_text;
3210 break;
3211
4ed46869 3212 default:
d46c5b12 3213 goto label_invalid_coding_system;
4ed46869
KH
3214 }
3215 return 0;
3216
3217 label_invalid_coding_system:
3218 coding->type = coding_type_no_conversion;
d46c5b12 3219 coding->category_idx = CODING_CATEGORY_IDX_BINARY;
c952af22 3220 coding->common_flags = 0;
dec137e5 3221 coding->eol_type = CODING_EOL_LF;
d46c5b12 3222 coding->pre_write_conversion = coding->post_read_conversion = Qnil;
4ed46869
KH
3223 return -1;
3224}
3225
ec6d2bb8
KH
3226/* Free memory blocks allocated for storing composition information. */
3227
3228void
3229coding_free_composition_data (coding)
3230 struct coding_system *coding;
3231{
3232 struct composition_data *cmp_data = coding->cmp_data, *next;
3233
3234 if (!cmp_data)
3235 return;
3236 /* Memory blocks are chained. At first, rewind to the first, then,
3237 free blocks one by one. */
3238 while (cmp_data->prev)
3239 cmp_data = cmp_data->prev;
3240 while (cmp_data)
3241 {
3242 next = cmp_data->next;
3243 xfree (cmp_data);
3244 cmp_data = next;
3245 }
3246 coding->cmp_data = NULL;
3247}
3248
3249/* Set `char_offset' member of all memory blocks pointed by
3250 coding->cmp_data to POS. */
3251
3252void
3253coding_adjust_composition_offset (coding, pos)
3254 struct coding_system *coding;
3255 int pos;
3256{
3257 struct composition_data *cmp_data;
3258
3259 for (cmp_data = coding->cmp_data; cmp_data; cmp_data = cmp_data->next)
3260 cmp_data->char_offset = pos;
3261}
3262
54f78171
KH
3263/* Setup raw-text or one of its subsidiaries in the structure
3264 coding_system CODING according to the already setup value eol_type
3265 in CODING. CODING should be setup for some coding system in
3266 advance. */
3267
3268void
3269setup_raw_text_coding_system (coding)
3270 struct coding_system *coding;
3271{
3272 if (coding->type != coding_type_raw_text)
3273 {
3274 coding->symbol = Qraw_text;
3275 coding->type = coding_type_raw_text;
3276 if (coding->eol_type != CODING_EOL_UNDECIDED)
3277 {
84d60297
RS
3278 Lisp_Object subsidiaries;
3279 subsidiaries = Fget (Qraw_text, Qeol_type);
54f78171
KH
3280
3281 if (VECTORP (subsidiaries)
3282 && XVECTOR (subsidiaries)->size == 3)
3283 coding->symbol
3284 = XVECTOR (subsidiaries)->contents[coding->eol_type];
3285 }
716e0b0a 3286 setup_coding_system (coding->symbol, coding);
54f78171
KH
3287 }
3288 return;
3289}
3290
4ed46869
KH
3291/* Emacs has a mechanism to automatically detect a coding system if it
3292 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
3293 it's impossible to distinguish some coding systems accurately
3294 because they use the same range of codes. So, at first, coding
3295 systems are categorized as follows:
3296
0ef69138 3297 o coding-category-emacs-mule
4ed46869
KH
3298
3299 The category for a coding system which has the same code range
3300 as Emacs' internal format. Assigned the coding-system (Lisp
0ef69138 3301 symbol) `emacs-mule' by default.
4ed46869
KH
3302
3303 o coding-category-sjis
3304
3305 The category for a coding system which has the same code range
3306 as SJIS. Assigned the coding-system (Lisp
7717c392 3307 symbol) `japanese-shift-jis' by default.
4ed46869
KH
3308
3309 o coding-category-iso-7
3310
3311 The category for a coding system which has the same code range
7717c392 3312 as ISO2022 of 7-bit environment. This doesn't use any locking
d46c5b12
KH
3313 shift and single shift functions. This can encode/decode all
3314 charsets. Assigned the coding-system (Lisp symbol)
3315 `iso-2022-7bit' by default.
3316
3317 o coding-category-iso-7-tight
3318
3319 Same as coding-category-iso-7 except that this can
3320 encode/decode only the specified charsets.
4ed46869
KH
3321
3322 o coding-category-iso-8-1
3323
3324 The category for a coding system which has the same code range
3325 as ISO2022 of 8-bit environment and graphic plane 1 used only
7717c392
KH
3326 for DIMENSION1 charset. This doesn't use any locking shift
3327 and single shift functions. Assigned the coding-system (Lisp
3328 symbol) `iso-latin-1' by default.
4ed46869
KH
3329
3330 o coding-category-iso-8-2
3331
3332 The category for a coding system which has the same code range
3333 as ISO2022 of 8-bit environment and graphic plane 1 used only
7717c392
KH
3334 for DIMENSION2 charset. This doesn't use any locking shift
3335 and single shift functions. Assigned the coding-system (Lisp
3336 symbol) `japanese-iso-8bit' by default.
4ed46869 3337
7717c392 3338 o coding-category-iso-7-else
4ed46869
KH
3339
3340 The category for a coding system which has the same code range
7717c392
KH
3341 as ISO2022 of 7-bit environment but uses locking shift or
3342 single shift functions. Assigned the coding-system (Lisp
3343 symbol) `iso-2022-7bit-lock' by default.
3344
3345 o coding-category-iso-8-else
3346
3347 The category for a coding system which has the same code range
3348 as ISO2022 of 8-bit environment but uses locking shift or
3349 single shift functions. Assigned the coding-system (Lisp
3350 symbol) `iso-2022-8bit-ss2' by default.
4ed46869
KH
3351
3352 o coding-category-big5
3353
3354 The category for a coding system which has the same code range
3355 as BIG5. Assigned the coding-system (Lisp symbol)
e0e989f6 3356 `cn-big5' by default.
4ed46869 3357
fa42c37f
KH
3358 o coding-category-utf-8
3359
3360 The category for a coding system which has the same code range
3361 as UTF-8 (cf. RFC2279). Assigned the coding-system (Lisp
3362 symbol) `utf-8' by default.
3363
3364 o coding-category-utf-16-be
3365
3366 The category for a coding system in which a text has an
3367 Unicode signature (cf. Unicode Standard) in the order of BIG
3368 endian at the head. Assigned the coding-system (Lisp symbol)
3369 `utf-16-be' by default.
3370
3371 o coding-category-utf-16-le
3372
3373 The category for a coding system in which a text has an
3374 Unicode signature (cf. Unicode Standard) in the order of
3375 LITTLE endian at the head. Assigned the coding-system (Lisp
3376 symbol) `utf-16-le' by default.
3377
1397dc18
KH
3378 o coding-category-ccl
3379
3380 The category for a coding system of which encoder/decoder is
3381 written in CCL programs. The default value is nil, i.e., no
3382 coding system is assigned.
3383
4ed46869
KH
3384 o coding-category-binary
3385
3386 The category for a coding system not categorized in any of the
3387 above. Assigned the coding-system (Lisp symbol)
e0e989f6 3388 `no-conversion' by default.
4ed46869
KH
3389
3390 Each of them is a Lisp symbol and the value is an actual
3391 `coding-system's (this is also a Lisp symbol) assigned by a user.
3392 What Emacs does actually is to detect a category of coding system.
3393 Then, it uses a `coding-system' assigned to it. If Emacs can't
3394 decide only one possible category, it selects a category of the
3395 highest priority. Priorities of categories are also specified by a
3396 user in a Lisp variable `coding-category-list'.
3397
3398*/
3399
66cfb530
KH
3400static
3401int ascii_skip_code[256];
3402
d46c5b12 3403/* Detect how a text of length SRC_BYTES pointed by SOURCE is encoded.
4ed46869
KH
3404 If it detects possible coding systems, return an integer in which
3405 appropriate flag bits are set. Flag bits are defined by macros
fa42c37f
KH
3406 CODING_CATEGORY_MASK_XXX in `coding.h'. If PRIORITIES is non-NULL,
3407 it should point the table `coding_priorities'. In that case, only
3408 the flag bit for a coding system of the highest priority is set in
3409 the returned value.
4ed46869 3410
d46c5b12
KH
3411 How many ASCII characters are at the head is returned as *SKIP. */
3412
3413static int
3414detect_coding_mask (source, src_bytes, priorities, skip)
3415 unsigned char *source;
3416 int src_bytes, *priorities, *skip;
4ed46869
KH
3417{
3418 register unsigned char c;
d46c5b12 3419 unsigned char *src = source, *src_end = source + src_bytes;
fa42c37f
KH
3420 unsigned int mask, utf16_examined_p, iso2022_examined_p;
3421 int i, idx;
4ed46869
KH
3422
3423 /* At first, skip all ASCII characters and control characters except
3424 for three ISO2022 specific control characters. */
66cfb530
KH
3425 ascii_skip_code[ISO_CODE_SO] = 0;
3426 ascii_skip_code[ISO_CODE_SI] = 0;
3427 ascii_skip_code[ISO_CODE_ESC] = 0;
3428
bcf26d6a 3429 label_loop_detect_coding:
66cfb530 3430 while (src < src_end && ascii_skip_code[*src]) src++;
d46c5b12 3431 *skip = src - source;
4ed46869
KH
3432
3433 if (src >= src_end)
3434 /* We found nothing other than ASCII. There's nothing to do. */
d46c5b12 3435 return 0;
4ed46869 3436
8a8147d6 3437 c = *src;
4ed46869
KH
3438 /* The text seems to be encoded in some multilingual coding system.
3439 Now, try to find in which coding system the text is encoded. */
3440 if (c < 0x80)
bcf26d6a
KH
3441 {
3442 /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
3443 /* C is an ISO2022 specific control code of C0. */
3444 mask = detect_coding_iso2022 (src, src_end);
1b2af4b0 3445 if (mask == 0)
d46c5b12
KH
3446 {
3447 /* No valid ISO2022 code follows C. Try again. */
3448 src++;
66cfb530
KH
3449 if (c == ISO_CODE_ESC)
3450 ascii_skip_code[ISO_CODE_ESC] = 1;
3451 else
3452 ascii_skip_code[ISO_CODE_SO] = ascii_skip_code[ISO_CODE_SI] = 1;
d46c5b12
KH
3453 goto label_loop_detect_coding;
3454 }
3455 if (priorities)
fa42c37f
KH
3456 {
3457 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3458 {
3459 if (mask & priorities[i])
3460 return priorities[i];
3461 }
3462 return CODING_CATEGORY_MASK_RAW_TEXT;
3463 }
bcf26d6a 3464 }
d46c5b12 3465 else
c4825358 3466 {
d46c5b12 3467 int try;
4ed46869 3468
d46c5b12
KH
3469 if (c < 0xA0)
3470 {
3471 /* C is the first byte of SJIS character code,
fa42c37f
KH
3472 or a leading-code of Emacs' internal format (emacs-mule),
3473 or the first byte of UTF-16. */
3474 try = (CODING_CATEGORY_MASK_SJIS
3475 | CODING_CATEGORY_MASK_EMACS_MULE
3476 | CODING_CATEGORY_MASK_UTF_16_BE
3477 | CODING_CATEGORY_MASK_UTF_16_LE);
d46c5b12
KH
3478
3479 /* Or, if C is a special latin extra code,
3480 or is an ISO2022 specific control code of C1 (SS2 or SS3),
3481 or is an ISO2022 control-sequence-introducer (CSI),
3482 we should also consider the possibility of ISO2022 codings. */
3483 if ((VECTORP (Vlatin_extra_code_table)
3484 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
3485 || (c == ISO_CODE_SS2 || c == ISO_CODE_SS3)
3486 || (c == ISO_CODE_CSI
3487 && (src < src_end
3488 && (*src == ']'
3489 || ((*src == '0' || *src == '1' || *src == '2')
3490 && src + 1 < src_end
3491 && src[1] == ']')))))
3492 try |= (CODING_CATEGORY_MASK_ISO_8_ELSE
3493 | CODING_CATEGORY_MASK_ISO_8BIT);
3494 }
c4825358 3495 else
d46c5b12
KH
3496 /* C is a character of ISO2022 in graphic plane right,
3497 or a SJIS's 1-byte character code (i.e. JISX0201),
fa42c37f
KH
3498 or the first byte of BIG5's 2-byte code,
3499 or the first byte of UTF-8/16. */
d46c5b12
KH
3500 try = (CODING_CATEGORY_MASK_ISO_8_ELSE
3501 | CODING_CATEGORY_MASK_ISO_8BIT
3502 | CODING_CATEGORY_MASK_SJIS
fa42c37f
KH
3503 | CODING_CATEGORY_MASK_BIG5
3504 | CODING_CATEGORY_MASK_UTF_8
3505 | CODING_CATEGORY_MASK_UTF_16_BE
3506 | CODING_CATEGORY_MASK_UTF_16_LE);
d46c5b12 3507
1397dc18
KH
3508 /* Or, we may have to consider the possibility of CCL. */
3509 if (coding_system_table[CODING_CATEGORY_IDX_CCL]
3510 && (coding_system_table[CODING_CATEGORY_IDX_CCL]
3511 ->spec.ccl.valid_codes)[c])
3512 try |= CODING_CATEGORY_MASK_CCL;
3513
d46c5b12 3514 mask = 0;
fa42c37f 3515 utf16_examined_p = iso2022_examined_p = 0;
d46c5b12
KH
3516 if (priorities)
3517 {
3518 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3519 {
fa42c37f
KH
3520 if (!iso2022_examined_p
3521 && (priorities[i] & try & CODING_CATEGORY_MASK_ISO))
3522 {
3523 mask |= detect_coding_iso2022 (src, src_end);
3524 iso2022_examined_p = 1;
3525 }
5ab13dd0 3526 else if (priorities[i] & try & CODING_CATEGORY_MASK_SJIS)
fa42c37f
KH
3527 mask |= detect_coding_sjis (src, src_end);
3528 else if (priorities[i] & try & CODING_CATEGORY_MASK_UTF_8)
3529 mask |= detect_coding_utf_8 (src, src_end);
3530 else if (!utf16_examined_p
3531 && (priorities[i] & try &
3532 CODING_CATEGORY_MASK_UTF_16_BE_LE))
3533 {
3534 mask |= detect_coding_utf_16 (src, src_end);
3535 utf16_examined_p = 1;
3536 }
5ab13dd0 3537 else if (priorities[i] & try & CODING_CATEGORY_MASK_BIG5)
fa42c37f 3538 mask |= detect_coding_big5 (src, src_end);
5ab13dd0 3539 else if (priorities[i] & try & CODING_CATEGORY_MASK_EMACS_MULE)
fa42c37f 3540 mask |= detect_coding_emacs_mule (src, src_end);
89fa8b36 3541 else if (priorities[i] & try & CODING_CATEGORY_MASK_CCL)
fa42c37f 3542 mask |= detect_coding_ccl (src, src_end);
5ab13dd0 3543 else if (priorities[i] & CODING_CATEGORY_MASK_RAW_TEXT)
fa42c37f 3544 mask |= CODING_CATEGORY_MASK_RAW_TEXT;
5ab13dd0 3545 else if (priorities[i] & CODING_CATEGORY_MASK_BINARY)
fa42c37f
KH
3546 mask |= CODING_CATEGORY_MASK_BINARY;
3547 if (mask & priorities[i])
3548 return priorities[i];
d46c5b12
KH
3549 }
3550 return CODING_CATEGORY_MASK_RAW_TEXT;
3551 }
3552 if (try & CODING_CATEGORY_MASK_ISO)
3553 mask |= detect_coding_iso2022 (src, src_end);
3554 if (try & CODING_CATEGORY_MASK_SJIS)
3555 mask |= detect_coding_sjis (src, src_end);
3556 if (try & CODING_CATEGORY_MASK_BIG5)
3557 mask |= detect_coding_big5 (src, src_end);
fa42c37f
KH
3558 if (try & CODING_CATEGORY_MASK_UTF_8)
3559 mask |= detect_coding_utf_8 (src, src_end);
3560 if (try & CODING_CATEGORY_MASK_UTF_16_BE_LE)
3561 mask |= detect_coding_utf_16 (src, src_end);
d46c5b12 3562 if (try & CODING_CATEGORY_MASK_EMACS_MULE)
1397dc18
KH
3563 mask |= detect_coding_emacs_mule (src, src_end);
3564 if (try & CODING_CATEGORY_MASK_CCL)
3565 mask |= detect_coding_ccl (src, src_end);
c4825358 3566 }
5ab13dd0 3567 return (mask | CODING_CATEGORY_MASK_RAW_TEXT | CODING_CATEGORY_MASK_BINARY);
4ed46869
KH
3568}
3569
3570/* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
3571 The information of the detected coding system is set in CODING. */
3572
3573void
3574detect_coding (coding, src, src_bytes)
3575 struct coding_system *coding;
3576 unsigned char *src;
3577 int src_bytes;
3578{
d46c5b12
KH
3579 unsigned int idx;
3580 int skip, mask, i;
84d60297 3581 Lisp_Object val;
4ed46869 3582
84d60297 3583 val = Vcoding_category_list;
66cfb530 3584 mask = detect_coding_mask (src, src_bytes, coding_priorities, &skip);
d46c5b12 3585 coding->heading_ascii = skip;
4ed46869 3586
d46c5b12
KH
3587 if (!mask) return;
3588
3589 /* We found a single coding system of the highest priority in MASK. */
3590 idx = 0;
3591 while (mask && ! (mask & 1)) mask >>= 1, idx++;
3592 if (! mask)
3593 idx = CODING_CATEGORY_IDX_RAW_TEXT;
4ed46869 3594
d46c5b12
KH
3595 val = XSYMBOL (XVECTOR (Vcoding_category_table)->contents[idx])->value;
3596
3597 if (coding->eol_type != CODING_EOL_UNDECIDED)
27901516 3598 {
84d60297 3599 Lisp_Object tmp;
d46c5b12 3600
84d60297 3601 tmp = Fget (val, Qeol_type);
d46c5b12
KH
3602 if (VECTORP (tmp))
3603 val = XVECTOR (tmp)->contents[coding->eol_type];
4ed46869 3604 }
b73bfc1c
KH
3605
3606 /* Setup this new coding system while preserving some slots. */
3607 {
3608 int src_multibyte = coding->src_multibyte;
3609 int dst_multibyte = coding->dst_multibyte;
3610
3611 setup_coding_system (val, coding);
3612 coding->src_multibyte = src_multibyte;
3613 coding->dst_multibyte = dst_multibyte;
3614 coding->heading_ascii = skip;
3615 }
4ed46869
KH
3616}
3617
d46c5b12
KH
3618/* Detect how end-of-line of a text of length SRC_BYTES pointed by
3619 SOURCE is encoded. Return one of CODING_EOL_LF, CODING_EOL_CRLF,
3620 CODING_EOL_CR, and CODING_EOL_UNDECIDED.
3621
3622 How many non-eol characters are at the head is returned as *SKIP. */
4ed46869 3623
bc4bc72a
RS
3624#define MAX_EOL_CHECK_COUNT 3
3625
d46c5b12
KH
3626static int
3627detect_eol_type (source, src_bytes, skip)
3628 unsigned char *source;
3629 int src_bytes, *skip;
4ed46869 3630{
d46c5b12 3631 unsigned char *src = source, *src_end = src + src_bytes;
4ed46869 3632 unsigned char c;
bc4bc72a
RS
3633 int total = 0; /* How many end-of-lines are found so far. */
3634 int eol_type = CODING_EOL_UNDECIDED;
3635 int this_eol_type;
4ed46869 3636
d46c5b12
KH
3637 *skip = 0;
3638
bc4bc72a 3639 while (src < src_end && total < MAX_EOL_CHECK_COUNT)
4ed46869
KH
3640 {
3641 c = *src++;
bc4bc72a 3642 if (c == '\n' || c == '\r')
4ed46869 3643 {
d46c5b12
KH
3644 if (*skip == 0)
3645 *skip = src - 1 - source;
bc4bc72a
RS
3646 total++;
3647 if (c == '\n')
3648 this_eol_type = CODING_EOL_LF;
3649 else if (src >= src_end || *src != '\n')
3650 this_eol_type = CODING_EOL_CR;
4ed46869 3651 else
bc4bc72a
RS
3652 this_eol_type = CODING_EOL_CRLF, src++;
3653
3654 if (eol_type == CODING_EOL_UNDECIDED)
3655 /* This is the first end-of-line. */
3656 eol_type = this_eol_type;
3657 else if (eol_type != this_eol_type)
d46c5b12
KH
3658 {
3659 /* The found type is different from what found before. */
3660 eol_type = CODING_EOL_INCONSISTENT;
3661 break;
3662 }
4ed46869
KH
3663 }
3664 }
bc4bc72a 3665
d46c5b12
KH
3666 if (*skip == 0)
3667 *skip = src_end - source;
85a02ca4 3668 return eol_type;
4ed46869
KH
3669}
3670
fa42c37f
KH
3671/* Like detect_eol_type, but detect EOL type in 2-octet
3672 big-endian/little-endian format for coding systems utf-16-be and
3673 utf-16-le. */
3674
3675static int
3676detect_eol_type_in_2_octet_form (source, src_bytes, skip, big_endian_p)
3677 unsigned char *source;
3678 int src_bytes, *skip;
3679{
3680 unsigned char *src = source, *src_end = src + src_bytes;
3681 unsigned int c1, c2;
3682 int total = 0; /* How many end-of-lines are found so far. */
3683 int eol_type = CODING_EOL_UNDECIDED;
3684 int this_eol_type;
3685 int msb, lsb;
3686
3687 if (big_endian_p)
3688 msb = 0, lsb = 1;
3689 else
3690 msb = 1, lsb = 0;
3691
3692 *skip = 0;
3693
3694 while ((src + 1) < src_end && total < MAX_EOL_CHECK_COUNT)
3695 {
3696 c1 = (src[msb] << 8) | (src[lsb]);
3697 src += 2;
3698
3699 if (c1 == '\n' || c1 == '\r')
3700 {
3701 if (*skip == 0)
3702 *skip = src - 2 - source;
3703 total++;
3704 if (c1 == '\n')
3705 {
3706 this_eol_type = CODING_EOL_LF;
3707 }
3708 else
3709 {
3710 if ((src + 1) >= src_end)
3711 {
3712 this_eol_type = CODING_EOL_CR;
3713 }
3714 else
3715 {
3716 c2 = (src[msb] << 8) | (src[lsb]);
3717 if (c2 == '\n')
3718 this_eol_type = CODING_EOL_CRLF, src += 2;
3719 else
3720 this_eol_type = CODING_EOL_CR;
3721 }
3722 }
3723
3724 if (eol_type == CODING_EOL_UNDECIDED)
3725 /* This is the first end-of-line. */
3726 eol_type = this_eol_type;
3727 else if (eol_type != this_eol_type)
3728 {
3729 /* The found type is different from what found before. */
3730 eol_type = CODING_EOL_INCONSISTENT;
3731 break;
3732 }
3733 }
3734 }
3735
3736 if (*skip == 0)
3737 *skip = src_end - source;
3738 return eol_type;
3739}
3740
4ed46869
KH
3741/* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
3742 is encoded. If it detects an appropriate format of end-of-line, it
3743 sets the information in *CODING. */
3744
3745void
3746detect_eol (coding, src, src_bytes)
3747 struct coding_system *coding;
3748 unsigned char *src;
3749 int src_bytes;
3750{
4608c386 3751 Lisp_Object val;
d46c5b12 3752 int skip;
fa42c37f
KH
3753 int eol_type;
3754
3755 switch (coding->category_idx)
3756 {
3757 case CODING_CATEGORY_IDX_UTF_16_BE:
3758 eol_type = detect_eol_type_in_2_octet_form (src, src_bytes, &skip, 1);
3759 break;
3760 case CODING_CATEGORY_IDX_UTF_16_LE:
3761 eol_type = detect_eol_type_in_2_octet_form (src, src_bytes, &skip, 0);
3762 break;
3763 default:
3764 eol_type = detect_eol_type (src, src_bytes, &skip);
3765 break;
3766 }
d46c5b12
KH
3767
3768 if (coding->heading_ascii > skip)
3769 coding->heading_ascii = skip;
3770 else
3771 skip = coding->heading_ascii;
4ed46869 3772
0ef69138 3773 if (eol_type == CODING_EOL_UNDECIDED)
4ed46869 3774 return;
27901516
KH
3775 if (eol_type == CODING_EOL_INCONSISTENT)
3776 {
3777#if 0
3778 /* This code is suppressed until we find a better way to
992f23f2 3779 distinguish raw text file and binary file. */
27901516
KH
3780
3781 /* If we have already detected that the coding is raw-text, the
3782 coding should actually be no-conversion. */
3783 if (coding->type == coding_type_raw_text)
3784 {
3785 setup_coding_system (Qno_conversion, coding);
3786 return;
3787 }
3788 /* Else, let's decode only text code anyway. */
3789#endif /* 0 */
1b2af4b0 3790 eol_type = CODING_EOL_LF;
27901516
KH
3791 }
3792
4608c386 3793 val = Fget (coding->symbol, Qeol_type);
4ed46869 3794 if (VECTORP (val) && XVECTOR (val)->size == 3)
d46c5b12 3795 {
b73bfc1c
KH
3796 int src_multibyte = coding->src_multibyte;
3797 int dst_multibyte = coding->dst_multibyte;
3798
d46c5b12 3799 setup_coding_system (XVECTOR (val)->contents[eol_type], coding);
b73bfc1c
KH
3800 coding->src_multibyte = src_multibyte;
3801 coding->dst_multibyte = dst_multibyte;
d46c5b12
KH
3802 coding->heading_ascii = skip;
3803 }
3804}
3805
/* Extra bytes added to every conversion buffer size estimate.  */
#define CONVERSION_BUFFER_EXTRA_ROOM 256

/* Factor by which decoding with CODING may enlarge the text: 3 for
   ISO2022, the decoder's own buf_magnification for CCL, 2 for
   anything else.  CODING is a pointer to struct coding_system; it is
   evaluated more than once, so it must not have side effects.  */
#define DECODING_BUFFER_MAG(coding)                             \
  ((coding)->type == coding_type_iso2022                        \
   ? 3                                                          \
   : ((coding)->type == coding_type_ccl                         \
      ? (coding)->spec.ccl.decoder.buf_magnification            \
      : 2))
d46c5b12
KH
3814
3815/* Return maximum size (bytes) of a buffer enough for decoding
3816 SRC_BYTES of text encoded in CODING. */
3817
3818int
3819decoding_buffer_size (coding, src_bytes)
3820 struct coding_system *coding;
3821 int src_bytes;
3822{
3823 return (src_bytes * DECODING_BUFFER_MAG (coding)
3824 + CONVERSION_BUFFER_EXTRA_ROOM);
3825}
3826
3827/* Return maximum size (bytes) of a buffer enough for encoding
3828 SRC_BYTES of text to CODING. */
3829
3830int
3831encoding_buffer_size (coding, src_bytes)
3832 struct coding_system *coding;
3833 int src_bytes;
3834{
3835 int magnification;
3836
3837 if (coding->type == coding_type_ccl)
3838 magnification = coding->spec.ccl.encoder.buf_magnification;
b73bfc1c 3839 else if (CODING_REQUIRE_ENCODING (coding))
d46c5b12 3840 magnification = 3;
b73bfc1c
KH
3841 else
3842 magnification = 1;
d46c5b12
KH
3843
3844 return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
3845}
3846
#ifndef MINIMUM_CONVERSION_BUFFER_SIZE
#define MINIMUM_CONVERSION_BUFFER_SIZE 1024
#endif

/* The shared work buffer for code conversion and its current size
   in bytes.  */
char *conversion_buffer;
int conversion_buffer_size;

/* Return a pointer to a buffer of at least SIZE bytes to be used for
   encoding or decoding.  The shared buffer is reused when it is
   already large enough; otherwise it is replaced by a new one whose
   size is the current size doubled until it reaches SIZE.  The old
   contents are NOT preserved when the buffer is replaced.

   NOTE(review): the result of xmalloc is not checked here, so
   whether NULL can ever be returned depends on how xmalloc reports
   memory exhaustion -- confirm before relying on a NULL return.  */

char *
get_conversion_buffer (size)
     int size;
{
  if (size > conversion_buffer_size)
    {
      char *buf;
      int real_size;

      /* If the buffer has not been set up yet, its recorded size is
         0; doubling 0 would never reach SIZE, so start from the
         minimum size in that case.  */
      if (conversion_buffer_size > 0)
        real_size = conversion_buffer_size * 2;
      else
        real_size = MINIMUM_CONVERSION_BUFFER_SIZE;

      while (real_size < size) real_size *= 2;
      buf = (char *) xmalloc (real_size);
      xfree (conversion_buffer);
      conversion_buffer = buf;
      conversion_buffer_size = real_size;
    }
  return conversion_buffer;
}
3875
3876int
3877ccl_coding_driver (coding, source, destination, src_bytes, dst_bytes, encodep)
3878 struct coding_system *coding;
3879 unsigned char *source, *destination;
3880 int src_bytes, dst_bytes, encodep;
3881{
3882 struct ccl_program *ccl
3883 = encodep ? &coding->spec.ccl.encoder : &coding->spec.ccl.decoder;
3884 int result;
3885
ae9ff118 3886 ccl->last_block = coding->mode & CODING_MODE_LAST_BLOCK;
7b179c2d 3887
d46c5b12
KH
3888 coding->produced = ccl_driver (ccl, source, destination,
3889 src_bytes, dst_bytes, &(coding->consumed));
b73bfc1c
KH
3890 if (encodep)
3891 coding->produced_char = coding->produced;
3892 else
3893 {
3894 int bytes
3895 = dst_bytes ? dst_bytes : source + coding->consumed - destination;
3896 coding->produced = str_as_multibyte (destination, bytes,
3897 coding->produced,
3898 &(coding->produced_char));
3899 }
69f76525 3900
d46c5b12
KH
3901 switch (ccl->status)
3902 {
3903 case CCL_STAT_SUSPEND_BY_SRC:
3904 result = CODING_FINISH_INSUFFICIENT_SRC;
3905 break;
3906 case CCL_STAT_SUSPEND_BY_DST:
3907 result = CODING_FINISH_INSUFFICIENT_DST;
3908 break;
9864ebce
KH
3909 case CCL_STAT_QUIT:
3910 case CCL_STAT_INVALID_CMD:
3911 result = CODING_FINISH_INTERRUPT;
3912 break;
d46c5b12
KH
3913 default:
3914 result = CODING_FINISH_NORMAL;
3915 break;
3916 }
3917 return result;
4ed46869
KH
3918}
3919
3920/* See "GENERAL NOTES about `decode_coding_XXX ()' functions". Before
3921 decoding, it may detect coding system and format of end-of-line if
b73bfc1c
KH
3922 those are not yet decided. The source should be unibyte, the
3923 result is multibyte if CODING->dst_multibyte is nonzero, else
3924 unibyte. */
4ed46869
KH
3925
3926int
d46c5b12 3927decode_coding (coding, source, destination, src_bytes, dst_bytes)
4ed46869
KH
3928 struct coding_system *coding;
3929 unsigned char *source, *destination;
3930 int src_bytes, dst_bytes;
4ed46869 3931{
0ef69138 3932 if (coding->type == coding_type_undecided)
4ed46869
KH
3933 detect_coding (coding, source, src_bytes);
3934
0ef69138 3935 if (coding->eol_type == CODING_EOL_UNDECIDED)
4ed46869
KH
3936 detect_eol (coding, source, src_bytes);
3937
b73bfc1c
KH
3938 coding->produced = coding->produced_char = 0;
3939 coding->consumed = coding->consumed_char = 0;
3940 coding->errors = 0;
3941 coding->result = CODING_FINISH_NORMAL;
3942
4ed46869
KH
3943 switch (coding->type)
3944 {
4ed46869 3945 case coding_type_sjis:
b73bfc1c
KH
3946 decode_coding_sjis_big5 (coding, source, destination,
3947 src_bytes, dst_bytes, 1);
4ed46869
KH
3948 break;
3949
3950 case coding_type_iso2022:
b73bfc1c
KH
3951 decode_coding_iso2022 (coding, source, destination,
3952 src_bytes, dst_bytes);
4ed46869
KH
3953 break;
3954
3955 case coding_type_big5:
b73bfc1c
KH
3956 decode_coding_sjis_big5 (coding, source, destination,
3957 src_bytes, dst_bytes, 0);
3958 break;
3959
3960 case coding_type_emacs_mule:
3961 decode_coding_emacs_mule (coding, source, destination,
3962 src_bytes, dst_bytes);
4ed46869
KH
3963 break;
3964
3965 case coding_type_ccl:
b73bfc1c
KH
3966 ccl_coding_driver (coding, source, destination,
3967 src_bytes, dst_bytes, 0);
d46c5b12
KH
3968 break;
3969
b73bfc1c
KH
3970 default:
3971 decode_eol (coding, source, destination, src_bytes, dst_bytes);
3972 }
3973
3974 if (coding->result == CODING_FINISH_INSUFFICIENT_SRC
3975 && coding->consumed == src_bytes)
3976 coding->result = CODING_FINISH_NORMAL;
3977
3978 if (coding->mode & CODING_MODE_LAST_BLOCK
3979 && coding->result == CODING_FINISH_INSUFFICIENT_SRC)
3980 {
3981 unsigned char *src = source + coding->consumed;
3982 unsigned char *dst = destination + coding->produced;
3983
3984 src_bytes -= coding->consumed;
3985 coding->errors++;
3986 if (COMPOSING_P (coding))
3987 DECODE_COMPOSITION_END ('1');
3988 while (src_bytes--)
d46c5b12 3989 {
b73bfc1c
KH
3990 int c = *src++;
3991 dst += CHAR_STRING (c, dst);
3992 coding->produced_char++;
d46c5b12 3993 }
b73bfc1c
KH
3994 coding->consumed = coding->consumed_char = src - source;
3995 coding->produced = dst - destination;
4ed46869
KH
3996 }
3997
b73bfc1c
KH
3998 if (!coding->dst_multibyte)
3999 {
4000 coding->produced = str_as_unibyte (destination, coding->produced);
4001 coding->produced_char = coding->produced;
4002 }
4ed46869 4003
b73bfc1c
KH
4004 return coding->result;
4005}
52d41803 4006
b73bfc1c
KH
4007/* See "GENERAL NOTES about `encode_coding_XXX ()' functions". The
4008 multibyteness of the source is CODING->src_multibyte, the
4009 multibyteness of the result is always unibyte. */
4ed46869
KH
4010
4011int
d46c5b12 4012encode_coding (coding, source, destination, src_bytes, dst_bytes)
4ed46869
KH
4013 struct coding_system *coding;
4014 unsigned char *source, *destination;
4015 int src_bytes, dst_bytes;
4ed46869 4016{
b73bfc1c
KH
4017 coding->produced = coding->produced_char = 0;
4018 coding->consumed = coding->consumed_char = 0;
4019 coding->errors = 0;
4020 coding->result = CODING_FINISH_NORMAL;
4ed46869 4021
d46c5b12
KH
4022 switch (coding->type)
4023 {
4ed46869 4024 case coding_type_sjis:
b73bfc1c
KH
4025 encode_coding_sjis_big5 (coding, source, destination,
4026 src_bytes, dst_bytes, 1);
4ed46869
KH
4027 break;
4028
4029 case coding_type_iso2022:
b73bfc1c
KH
4030 encode_coding_iso2022 (coding, source, destination,
4031 src_bytes, dst_bytes);
4ed46869
KH
4032 break;
4033
4034 case coding_type_big5:
b73bfc1c
KH
4035 encode_coding_sjis_big5 (coding, source, destination,
4036 src_bytes, dst_bytes, 0);
4037 break;
4038
4039 case coding_type_emacs_mule:
4040 encode_coding_emacs_mule (coding, source, destination,
4041 src_bytes, dst_bytes);
4ed46869
KH
4042 break;
4043
4044 case coding_type_ccl:
b73bfc1c
KH
4045 ccl_coding_driver (coding, source, destination,
4046 src_bytes, dst_bytes, 1);
d46c5b12
KH
4047 break;
4048
b73bfc1c
KH
4049 default:
4050 encode_eol (coding, source, destination, src_bytes, dst_bytes);
4051 }
4052
4053 if (coding->result == CODING_FINISH_INSUFFICIENT_SRC
4054 && coding->consumed == src_bytes)
4055 coding->result = CODING_FINISH_NORMAL;
4056
4057 if (coding->mode & CODING_MODE_LAST_BLOCK)
4058 {
4059 unsigned char *src = source + coding->consumed;
4060 unsigned char *src_end = src + src_bytes;
4061 unsigned char *dst = destination + coding->produced;
4062
4063 if (coding->type == coding_type_iso2022)
4064 ENCODE_RESET_PLANE_AND_REGISTER;
4065 if (COMPOSING_P (coding))
4066 *dst++ = ISO_CODE_ESC, *dst++ = '1';
4067 if (coding->consumed < src_bytes)
d46c5b12 4068 {
b73bfc1c
KH
4069 int len = src_bytes - coding->consumed;
4070
4071 BCOPY_SHORT (source + coding->consumed, dst, len);
4072 if (coding->src_multibyte)
4073 len = str_as_unibyte (dst, len);
4074 dst += len;
4075 coding->consumed = src_bytes;
d46c5b12 4076 }
b73bfc1c 4077 coding->produced = coding->produced_char = dst - destination;
4ed46869
KH
4078 }
4079
b73bfc1c 4080 return coding->result;
4ed46869
KH
4081}
4082
fb88bf2d
KH
4083/* Scan text in the region between *BEG and *END (byte positions),
4084 skip characters which we don't have to decode by coding system
4085 CODING at the head and tail, then set *BEG and *END to the region
4086 of the text we actually have to convert. The caller should move
b73bfc1c
KH
4087 the gap out of the region in advance if the region is from a
4088 buffer.
4ed46869 4089
d46c5b12
KH
4090 If STR is not NULL, *BEG and *END are indices into STR. */
4091
4092static void
4093shrink_decoding_region (beg, end, coding, str)
4094 int *beg, *end;
4095 struct coding_system *coding;
4096 unsigned char *str;
4097{
fb88bf2d 4098 unsigned char *begp_orig, *begp, *endp_orig, *endp, c;
d46c5b12 4099 int eol_conversion;
88993dfd 4100 Lisp_Object translation_table;
d46c5b12
KH
4101
4102 if (coding->type == coding_type_ccl
4103 || coding->type == coding_type_undecided
b73bfc1c
KH
4104 || coding->eol_type != CODING_EOL_LF
4105 || !NILP (coding->post_read_conversion)
4106 || coding->composing != COMPOSITION_DISABLED)
d46c5b12
KH
4107 {
4108 /* We can't skip any data. */
4109 return;
4110 }
b73bfc1c
KH
4111 if (coding->type == coding_type_no_conversion
4112 || coding->type == coding_type_raw_text
4113 || coding->type == coding_type_emacs_mule)
d46c5b12 4114 {
fb88bf2d
KH
4115 /* We need no conversion, but don't have to skip any data here.
4116 Decoding routine handles them effectively anyway. */
d46c5b12
KH
4117 return;
4118 }
4119
88993dfd
KH
4120 translation_table = coding->translation_table_for_decode;
4121 if (NILP (translation_table) && !NILP (Venable_character_translation))
4122 translation_table = Vstandard_translation_table_for_decode;
4123 if (CHAR_TABLE_P (translation_table))
4124 {
4125 int i;
4126 for (i = 0; i < 128; i++)
4127 if (!NILP (CHAR_TABLE_REF (translation_table, i)))
4128 break;
4129 if (i < 128)
fa46990e 4130 /* Some ASCII character should be translated. We give up
88993dfd
KH
4131 shrinking. */
4132 return;
4133 }
4134
b73bfc1c 4135 if (coding->heading_ascii >= 0)
d46c5b12
KH
4136 /* Detection routine has already found how much we can skip at the
4137 head. */
4138 *beg += coding->heading_ascii;
4139
4140 if (str)
4141 {
4142 begp_orig = begp = str + *beg;
4143 endp_orig = endp = str + *end;
4144 }
4145 else
4146 {
fb88bf2d 4147 begp_orig = begp = BYTE_POS_ADDR (*beg);
d46c5b12
KH
4148 endp_orig = endp = begp + *end - *beg;
4149 }
4150
fa46990e
DL
4151 eol_conversion = (coding->eol_type == CODING_EOL_CR
4152 || coding->eol_type == CODING_EOL_CRLF);
4153
d46c5b12
KH
4154 switch (coding->type)
4155 {
d46c5b12
KH
4156 case coding_type_sjis:
4157 case coding_type_big5:
4158 /* We can skip all ASCII characters at the head. */
4159 if (coding->heading_ascii < 0)
4160 {
4161 if (eol_conversion)
de9d083c 4162 while (begp < endp && *begp < 0x80 && *begp != '\r') begp++;
d46c5b12
KH
4163 else
4164 while (begp < endp && *begp < 0x80) begp++;
4165 }
4166 /* We can skip all ASCII characters at the tail except for the
4167 second byte of SJIS or BIG5 code. */
4168 if (eol_conversion)
de9d083c 4169 while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\r') endp--;
d46c5b12
KH
4170 else
4171 while (begp < endp && endp[-1] < 0x80) endp--;
ee59c65f
RS
4172 /* Do not consider LF as ascii if preceded by CR, since that
4173 confuses eol decoding. */
4174 if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
4175 endp++;
d46c5b12
KH
4176 if (begp < endp && endp < endp_orig && endp[-1] >= 0x80)
4177 endp++;
4178 break;
4179
b73bfc1c 4180 case coding_type_iso2022:
622fece5
KH
4181 if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
4182 /* We can't skip any data. */
4183 break;
d46c5b12
KH
4184 if (coding->heading_ascii < 0)
4185 {
d46c5b12
KH
4186 /* We can skip all ASCII characters at the head except for a
4187 few control codes. */
4188 while (begp < endp && (c = *begp) < 0x80
4189 && c != ISO_CODE_CR && c != ISO_CODE_SO
4190 && c != ISO_CODE_SI && c != ISO_CODE_ESC
4191 && (!eol_conversion || c != ISO_CODE_LF))
4192 begp++;
4193 }
4194 switch (coding->category_idx)
4195 {
4196 case CODING_CATEGORY_IDX_ISO_8_1:
4197 case CODING_CATEGORY_IDX_ISO_8_2:
4198 /* We can skip all ASCII characters at the tail. */
4199 if (eol_conversion)
de9d083c 4200 while (begp < endp && (c = endp[-1]) < 0x80 && c != '\r') endp--;
d46c5b12
KH
4201 else
4202 while (begp < endp && endp[-1] < 0x80) endp--;
ee59c65f
RS
4203 /* Do not consider LF as ascii if preceded by CR, since that
4204 confuses eol decoding. */
4205 if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
4206 endp++;
d46c5b12
KH
4207 break;
4208
4209 case CODING_CATEGORY_IDX_ISO_7:
4210 case CODING_CATEGORY_IDX_ISO_7_TIGHT:
de79a6a5
KH
4211 {
4212 /* We can skip all charactes at the tail except for 8-bit
4213 codes and ESC and the following 2-byte at the tail. */
4214 unsigned char *eight_bit = NULL;
4215
4216 if (eol_conversion)
4217 while (begp < endp
4218 && (c = endp[-1]) != ISO_CODE_ESC && c != '\r')
4219 {
4220 if (!eight_bit && c & 0x80) eight_bit = endp;
4221 endp--;
4222 }
4223 else
4224 while (begp < endp
4225 && (c = endp[-1]) != ISO_CODE_ESC)
4226 {
4227 if (!eight_bit && c & 0x80) eight_bit = endp;
4228 endp--;
4229 }
4230 /* Do not consider LF as ascii if preceded by CR, since that
4231 confuses eol decoding. */
4232 if (begp < endp && endp < endp_orig
4233 && endp[-1] == '\r' && endp[0] == '\n')
4234 endp++;
4235 if (begp < endp && endp[-1] == ISO_CODE_ESC)
4236 {
4237 if (endp + 1 < endp_orig && end[0] == '(' && end[1] == 'B')
4238 /* This is an ASCII designation sequence. We can
4239 surely skip the tail. But, if we have
4240 encountered an 8-bit code, skip only the codes
4241 after that. */
4242 endp = eight_bit ? eight_bit : endp + 2;
4243 else
4244 /* Hmmm, we can't skip the tail. */
4245 endp = endp_orig;
4246 }
4247 else if (eight_bit)
4248 endp = eight_bit;
4249 }
d46c5b12 4250 }
b73bfc1c
KH
4251 break;
4252
4253 default:
4254 abort ();
d46c5b12
KH
4255 }
4256 *beg += begp - begp_orig;
4257 *end += endp - endp_orig;
4258 return;
4259}
4260
4261/* Like shrink_decoding_region but for encoding. */
4262
4263static void
4264shrink_encoding_region (beg, end, coding, str)
4265 int *beg, *end;
4266 struct coding_system *coding;
4267 unsigned char *str;
4268{
4269 unsigned char *begp_orig, *begp, *endp_orig, *endp;
4270 int eol_conversion;
88993dfd 4271 Lisp_Object translation_table;
d46c5b12 4272
b73bfc1c
KH
4273 if (coding->type == coding_type_ccl
4274 || coding->eol_type == CODING_EOL_CRLF
4275 || coding->eol_type == CODING_EOL_CR
4276 || coding->cmp_data && coding->cmp_data->used > 0)
d46c5b12 4277 {
b73bfc1c
KH
4278 /* We can't skip any data. */
4279 return;
4280 }
4281 if (coding->type == coding_type_no_conversion
4282 || coding->type == coding_type_raw_text
4283 || coding->type == coding_type_emacs_mule
4284 || coding->type == coding_type_undecided)
4285 {
4286 /* We need no conversion, but don't have to skip any data here.
4287 Encoding routine handles them effectively anyway. */
d46c5b12
KH
4288 return;
4289 }
4290
88993dfd
KH
4291 translation_table = coding->translation_table_for_encode;
4292 if (NILP (translation_table) && !NILP (Venable_character_translation))
4293 translation_table = Vstandard_translation_table_for_encode;
4294 if (CHAR_TABLE_P (translation_table))
4295 {
4296 int i;
4297 for (i = 0; i < 128; i++)
4298 if (!NILP (CHAR_TABLE_REF (translation_table, i)))
4299 break;
4300 if (i < 128)
4301 /* Some ASCII character should be tranlsated. We give up
4302 shrinking. */
4303 return;
4304 }
4305
d46c5b12
KH
4306 if (str)
4307 {
4308 begp_orig = begp = str + *beg;
4309 endp_orig = endp = str + *end;
4310 }
4311 else
4312 {
fb88bf2d 4313 begp_orig = begp = BYTE_POS_ADDR (*beg);
d46c5b12
KH
4314 endp_orig = endp = begp + *end - *beg;
4315 }
4316
4317 eol_conversion = (coding->eol_type == CODING_EOL_CR
4318 || coding->eol_type == CODING_EOL_CRLF);
4319
4320 /* Here, we don't have to check coding->pre_write_conversion because
4321 the caller is expected to have handled it already. */
4322 switch (coding->type)
4323 {
d46c5b12 4324 case coding_type_iso2022:
622fece5
KH
4325 if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
4326 /* We can't skip any data. */
4327 break;
d46c5b12
KH
4328 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL)
4329 {
4330 unsigned char *bol = begp;
4331 while (begp < endp && *begp < 0x80)
4332 {
4333 begp++;
4334 if (begp[-1] == '\n')
4335 bol = begp;
4336 }
4337 begp = bol;
4338 goto label_skip_tail;
4339 }
4340 /* fall down ... */
4341
b73bfc1c
KH
4342 case coding_type_sjis:
4343 case coding_type_big5:
d46c5b12
KH
4344 /* We can skip all ASCII characters at the head and tail. */
4345 if (eol_conversion)
4346 while (begp < endp && *begp < 0x80 && *begp != '\n') begp++;
4347 else
4348 while (begp < endp && *begp < 0x80) begp++;
4349 label_skip_tail:
4350 if (eol_conversion)
4351 while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\n') endp--;
4352 else
4353 while (begp < endp && *(endp - 1) < 0x80) endp--;
4354 break;
b73bfc1c
KH
4355
4356 default:
4357 abort ();
d46c5b12
KH
4358 }
4359
4360 *beg += begp - begp_orig;
4361 *end += endp - endp_orig;
4362 return;
4363}
4364
/* Shrinking the conversion region has some overhead of its own, so
   it is attempted only when the region is longer than this many
   bytes.  */
static int shrink_conversion_region_threshhold = 1024;

/* Shrink the region between *BEG and *END with
   shrink_encoding_region (if ENCODEP is nonzero) or
   shrink_decoding_region (otherwise), provided the region is longer
   than the threshold above.  BEG and END are pointers to int.  */
#define SHRINK_CONVERSION_REGION(beg, end, coding, str, encodep)        \
  do {                                                                  \
    if (*(end) - *(beg) > shrink_conversion_region_threshhold)          \
      {                                                                 \
        if (!(encodep))                                                 \
          shrink_decoding_region (beg, end, coding, str);               \
        else                                                            \
          shrink_encoding_region (beg, end, coding, str);               \
      }                                                                 \
  } while (0)
4378
b843d1ae
KH
4379static Lisp_Object
4380code_convert_region_unwind (dummy)
4381 Lisp_Object dummy;
4382{
4383 inhibit_pre_post_conversion = 0;
4384 return Qnil;
4385}
4386
ec6d2bb8
KH
4387/* Store information about all compositions in the range FROM and TO
4388 of OBJ in memory blocks pointed by CODING->cmp_data. OBJ is a
4389 buffer or a string, defaults to the current buffer. */
4390
4391void
4392coding_save_composition (coding, from, to, obj)
4393 struct coding_system *coding;
4394 int from, to;
4395 Lisp_Object obj;
4396{
4397 Lisp_Object prop;
4398 int start, end;
4399
91bee881
KH
4400 if (coding->composing == COMPOSITION_DISABLED)
4401 return;
4402 if (!coding->cmp_data)
4403 coding_allocate_composition_data (coding, from);
ec6d2bb8
KH
4404 if (!find_composition (from, to, &start, &end, &prop, obj)
4405 || end > to)
4406 return;
4407 if (start < from
4408 && (!find_composition (end, to, &start, &end, &prop, obj)
4409 || end > to))
4410 return;
4411 coding->composing = COMPOSITION_NO;
ec6d2bb8
KH
4412 do
4413 {
4414 if (COMPOSITION_VALID_P (start, end, prop))
4415 {
4416 enum composition_method method = COMPOSITION_METHOD (prop);
4417 if (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH
4418 >= COMPOSITION_DATA_SIZE)
4419 coding_allocate_composition_data (coding, from);
4420 /* For relative composition, we remember start and end
4421 positions, for the other compositions, we also remember
4422 components. */
4423 CODING_ADD_COMPOSITION_START (coding, start - from, method);
4424 if (method != COMPOSITION_RELATIVE)
4425 {
4426 /* We must store a*/
4427 Lisp_Object val, ch;
4428
4429 val = COMPOSITION_COMPONENTS (prop);
4430 if (CONSP (val))
4431 while (CONSP (val))
4432 {
4433 ch = XCAR (val), val = XCDR (val);
4434 CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (ch));
4435 }
4436 else if (VECTORP (val) || STRINGP (val))
4437 {
4438 int len = (VECTORP (val)
4439 ? XVECTOR (val)->size : XSTRING (val)->size);
4440 int i;
4441 for (i = 0; i < len; i++)
4442 {
4443 ch = (STRINGP (val)
4444 ? Faref (val, make_number (i))
4445 : XVECTOR (val)->contents[i]);
4446 CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (ch));
4447 }
4448 }
4449 else /* INTEGERP (val) */
4450 CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (val));
4451 }
4452 CODING_ADD_COMPOSITION_END (coding, end - from);
4453 }
4454 start = end;
4455 }
4456 while (start < to
4457 && find_composition (start, to, &start, &end, &prop, obj)
4458 && end <= to);
4459
4460 /* Make coding->cmp_data point to the first memory block. */
4461 while (coding->cmp_data->prev)
4462 coding->cmp_data = coding->cmp_data->prev;
4463 coding->cmp_data_start = 0;
4464}
4465
4466/* Reflect the saved information about compositions to OBJ.
4467 CODING->cmp_data points to a memory block for the informaiton. OBJ
4468 is a buffer or a string, defaults to the current buffer. */
4469
33fb63eb 4470void
ec6d2bb8
KH
4471coding_restore_composition (coding, obj)
4472 struct coding_system *coding;
4473 Lisp_Object obj;
4474{
4475 struct composition_data *cmp_data = coding->cmp_data;
4476
4477 if (!cmp_data)
4478 return;
4479
4480 while (cmp_data->prev)
4481 cmp_data = cmp_data->prev;
4482
4483 while (cmp_data)
4484 {
4485 int i;
4486
4487 for (i = 0; i < cmp_data->used; i += cmp_data->data[i])
4488 {
4489 int *data = cmp_data->data + i;
4490 enum composition_method method = (enum composition_method) data[3];
4491 Lisp_Object components;
4492
4493 if (method == COMPOSITION_RELATIVE)
4494 components = Qnil;
4495 else
4496 {
4497 int len = data[0] - 4, j;
4498 Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
4499
4500 for (j = 0; j < len; j++)
4501 args[j] = make_number (data[4 + j]);
4502 components = (method == COMPOSITION_WITH_ALTCHARS
4503 ? Fstring (len, args) : Fvector (len, args));
4504 }
4505 compose_text (data[1], data[2], components, Qnil, obj);
4506 }
4507 cmp_data = cmp_data->next;
4508 }
4509}
4510
d46c5b12 4511/* Decode (if ENCODEP is zero) or encode (if ENCODEP is nonzero) the
fb88bf2d
KH
4512 text from FROM to TO (byte positions are FROM_BYTE and TO_BYTE) by
4513 coding system CODING, and return the status code of code conversion
4514 (currently, this value has no meaning).
4515
4516 How many characters (and bytes) are converted to how many
4517 characters (and bytes) are recorded in members of the structure
4518 CODING.
d46c5b12 4519
6e44253b 4520 If REPLACE is nonzero, we do various things as if the original text
d46c5b12 4521 is deleted and a new text is inserted. See the comments in
b73bfc1c
KH
4522 replace_range (insdel.c) to know what we are doing.
4523
4524 If REPLACE is zero, it is assumed that the source text is unibyte.
4525 Otherwize, it is assumed that the source text is multibyte. */
4ed46869
KH
4526
4527int
6e44253b
KH
4528code_convert_region (from, from_byte, to, to_byte, coding, encodep, replace)
4529 int from, from_byte, to, to_byte, encodep, replace;
4ed46869 4530 struct coding_system *coding;
4ed46869 4531{
fb88bf2d
KH
4532 int len = to - from, len_byte = to_byte - from_byte;
4533 int require, inserted, inserted_byte;
4b39528c 4534 int head_skip, tail_skip, total_skip = 0;
84d60297 4535 Lisp_Object saved_coding_symbol;
fb88bf2d 4536 int first = 1;
fb88bf2d 4537 unsigned char *src, *dst;
84d60297 4538 Lisp_Object deletion;
e133c8fa 4539 int orig_point = PT, orig_len = len;
6abb9bd9 4540 int prev_Z;
b73bfc1c
KH
4541 int multibyte_p = !NILP (current_buffer->enable_multibyte_characters);
4542
4543 coding->src_multibyte = replace && multibyte_p;
4544 coding->dst_multibyte = multibyte_p;
84d60297
RS
4545
4546 deletion = Qnil;
4547 saved_coding_symbol = Qnil;
d46c5b12 4548
83fa074f 4549 if (from < PT && PT < to)
e133c8fa
KH
4550 {
4551 TEMP_SET_PT_BOTH (from, from_byte);
4552 orig_point = from;
4553 }
83fa074f 4554
6e44253b 4555 if (replace)
d46c5b12 4556 {
fb88bf2d
KH
4557 int saved_from = from;
4558
d46c5b12 4559 prepare_to_modify_buffer (from, to, &from);
fb88bf2d
KH
4560 if (saved_from != from)
4561 {
4562 to = from + len;
b73bfc1c 4563 from_byte = CHAR_TO_BYTE (from), to_byte = CHAR_TO_BYTE (to);
fb88bf2d
KH
4564 len_byte = to_byte - from_byte;
4565 }
d46c5b12 4566 }
d46c5b12
KH
4567
4568 if (! encodep && CODING_REQUIRE_DETECTION (coding))
4569 {
12410ef1 4570 /* We must detect encoding of text and eol format. */
d46c5b12
KH
4571
4572 if (from < GPT && to > GPT)
4573 move_gap_both (from, from_byte);
4574 if (coding->type == coding_type_undecided)
4575 {
fb88bf2d 4576 detect_coding (coding, BYTE_POS_ADDR (from_byte), len_byte);
d46c5b12 4577 if (coding->type == coding_type_undecided)
12410ef1
KH
4578 /* It seems that the text contains only ASCII, but we
4579 should not left it undecided because the deeper
4580 decoding routine (decode_coding) tries to detect the
4581 encodings again in vain. */
d46c5b12
KH
4582 coding->type = coding_type_emacs_mule;
4583 }
4584 if (coding->eol_type == CODING_EOL_UNDECIDED)
4585 {
4586 saved_coding_symbol = coding->symbol;
4587 detect_eol (coding, BYTE_POS_ADDR (from_byte), len_byte);
4588 if (coding->eol_type == CODING_EOL_UNDECIDED)
4589 coding->eol_type = CODING_EOL_LF;
4590 /* We had better recover the original eol format if we
4591 encounter an inconsitent eol format while decoding. */
4592 coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4593 }
4594 }
4595
d46c5b12
KH
4596 /* Now we convert the text. */
4597
4598 /* For encoding, we must process pre-write-conversion in advance. */
b73bfc1c
KH
4599 if (! inhibit_pre_post_conversion
4600 && encodep
d46c5b12
KH
4601 && SYMBOLP (coding->pre_write_conversion)
4602 && ! NILP (Ffboundp (coding->pre_write_conversion)))
4603 {
2b4f9037
KH
4604 /* The function in pre-write-conversion may put a new text in a
4605 new buffer. */
0007bdd0
KH
4606 struct buffer *prev = current_buffer;
4607 Lisp_Object new;
b843d1ae 4608 int count = specpdl_ptr - specpdl;
d46c5b12 4609
b843d1ae
KH
4610 record_unwind_protect (code_convert_region_unwind, Qnil);
4611 /* We should not call any more pre-write/post-read-conversion
4612 functions while this pre-write-conversion is running. */
4613 inhibit_pre_post_conversion = 1;
b39f748c
AS
4614 call2 (coding->pre_write_conversion,
4615 make_number (from), make_number (to));
b843d1ae
KH
4616 inhibit_pre_post_conversion = 0;
4617 /* Discard the unwind protect. */
4618 specpdl_ptr--;
4619
d46c5b12
KH
4620 if (current_buffer != prev)
4621 {
4622 len = ZV - BEGV;
0007bdd0 4623 new = Fcurrent_buffer ();
d46c5b12 4624 set_buffer_internal_1 (prev);
7dae4502 4625 del_range_2 (from, from_byte, to, to_byte, 0);
e133c8fa 4626 TEMP_SET_PT_BOTH (from, from_byte);
0007bdd0
KH
4627 insert_from_buffer (XBUFFER (new), 1, len, 0);
4628 Fkill_buffer (new);
e133c8fa
KH
4629 if (orig_point >= to)
4630 orig_point += len - orig_len;
4631 else if (orig_point > from)
4632 orig_point = from;
4633 orig_len = len;
d46c5b12 4634 to = from + len;
b73bfc1c
KH
4635 from_byte = CHAR_TO_BYTE (from);
4636 to_byte = CHAR_TO_BYTE (to);
d46c5b12 4637 len_byte = to_byte - from_byte;
e133c8fa 4638 TEMP_SET_PT_BOTH (from, from_byte);
d46c5b12
KH
4639 }
4640 }
4641
12410ef1
KH
4642 if (replace)
4643 deletion = make_buffer_string_both (from, from_byte, to, to_byte, 1);
4644
ec6d2bb8
KH
4645 if (coding->composing != COMPOSITION_DISABLED)
4646 {
4647 if (encodep)
4648 coding_save_composition (coding, from, to, Fcurrent_buffer ());
4649 else
4650 coding_allocate_composition_data (coding, from);
4651 }
fb88bf2d 4652
b73bfc1c
KH
4653 /* Try to skip the heading and tailing ASCIIs. */
4654 {
4655 int from_byte_orig = from_byte, to_byte_orig = to_byte;
4656
4657 if (from < GPT && GPT < to)
4658 move_gap_both (from, from_byte);
4659 SHRINK_CONVERSION_REGION (&from_byte, &to_byte, coding, NULL, encodep);
4660 if (from_byte == to_byte
4661 && (encodep || NILP (coding->post_read_conversion))
4662 && ! CODING_REQUIRE_FLUSHING (coding))
4663 {
4664 coding->produced = len_byte;
4665 coding->produced_char = len;
4666 if (!replace)
4667 /* We must record and adjust for this new text now. */
4668 adjust_after_insert (from, from_byte_orig, to, to_byte_orig, len);
4669 return 0;
4670 }
ec6d2bb8 4671
b73bfc1c
KH
4672 head_skip = from_byte - from_byte_orig;
4673 tail_skip = to_byte_orig - to_byte;
4674 total_skip = head_skip + tail_skip;
4675 from += head_skip;
4676 to -= tail_skip;
4677 len -= total_skip; len_byte -= total_skip;
4678 }
d46c5b12 4679
88993dfd 4680 /* The code conversion routine can not preserve text properties for
55d8d769
KH
4681 now. So, we must remove all text properties in the region.
4682 Here, we must suppress all modification hooks. */
88993dfd 4683 if (replace)
55d8d769
KH
4684 {
4685 int saved_inhibit_modification_hooks = inhibit_modification_hooks;
4686 inhibit_modification_hooks = 1;
4687 Fset_text_properties (make_number (from), make_number (to), Qnil, Qnil);
4688 inhibit_modification_hooks = saved_inhibit_modification_hooks;
4689 }
88993dfd 4690
fb88bf2d
KH
4691 /* For converion, we must put the gap before the text in addition to
4692 making the gap larger for efficient decoding. The required gap
4693 size starts from 2000 which is the magic number used in make_gap.
4694 But, after one batch of conversion, it will be incremented if we
4695 find that it is not enough . */
d46c5b12
KH
4696 require = 2000;
4697
4698 if (GAP_SIZE < require)
4699 make_gap (require - GAP_SIZE);
4700 move_gap_both (from, from_byte);
4701
d46c5b12 4702 inserted = inserted_byte = 0;
fb88bf2d
KH
4703
4704 GAP_SIZE += len_byte;
4705 ZV -= len;
4706 Z -= len;
4707 ZV_BYTE -= len_byte;
4708 Z_BYTE -= len_byte;
4709
d9f9a1bc
GM
4710 if (GPT - BEG < BEG_UNCHANGED)
4711 BEG_UNCHANGED = GPT - BEG;
4712 if (Z - GPT < END_UNCHANGED)
4713 END_UNCHANGED = Z - GPT;
f2558efd 4714
b73bfc1c
KH
4715 if (!encodep && coding->src_multibyte)
4716 {
4717 /* Decoding routines expects that the source text is unibyte.
4718 We must convert 8-bit characters of multibyte form to
4719 unibyte. */
4720 int len_byte_orig = len_byte;
4721 len_byte = str_as_unibyte (GAP_END_ADDR - len_byte, len_byte);
4722 if (len_byte < len_byte_orig)
4723 safe_bcopy (GAP_END_ADDR - len_byte_orig, GAP_END_ADDR - len_byte,
4724 len_byte);
4725 coding->src_multibyte = 0;
4726 }
4727
d46c5b12
KH
4728 for (;;)
4729 {
fb88bf2d 4730 int result;
d46c5b12 4731
ec6d2bb8 4732 /* The buffer memory is now:
b73bfc1c
KH
4733 +--------+converted-text+---------+-------original-text-------+---+
4734 |<-from->|<--inserted-->|---------|<--------len_byte--------->|---|
4735 |<---------------------- GAP ----------------------->| */
ec6d2bb8
KH
4736 src = GAP_END_ADDR - len_byte;
4737 dst = GPT_ADDR + inserted_byte;
4738
d46c5b12 4739 if (encodep)
fb88bf2d 4740 result = encode_coding (coding, src, dst, len_byte, 0);
d46c5b12 4741 else
fb88bf2d 4742 result = decode_coding (coding, src, dst, len_byte, 0);
ec6d2bb8
KH
4743
4744 /* The buffer memory is now:
b73bfc1c
KH
4745 +--------+-------converted-text----+--+------original-text----+---+
4746 |<-from->|<-inserted->|<-produced->|--|<-(len_byte-consumed)->|---|
4747 |<---------------------- GAP ----------------------->| */
ec6d2bb8 4748
d46c5b12
KH
4749 inserted += coding->produced_char;
4750 inserted_byte += coding->produced;
d46c5b12 4751 len_byte -= coding->consumed;
ec6d2bb8
KH
4752
4753 if (result == CODING_FINISH_INSUFFICIENT_CMP)
4754 {
4755 coding_allocate_composition_data (coding, from + inserted);
4756 continue;
4757 }
4758
fb88bf2d 4759 src += coding->consumed;
3636f7a3 4760 dst += coding->produced;
d46c5b12 4761
9864ebce
KH
4762 if (result == CODING_FINISH_NORMAL)
4763 {
4764 src += len_byte;
4765 break;
4766 }
d46c5b12
KH
4767 if (! encodep && result == CODING_FINISH_INCONSISTENT_EOL)
4768 {
fb88bf2d 4769 unsigned char *pend = dst, *p = pend - inserted_byte;
38edf7d4 4770 Lisp_Object eol_type;
d46c5b12
KH
4771
4772 /* Encode LFs back to the original eol format (CR or CRLF). */
4773 if (coding->eol_type == CODING_EOL_CR)
4774 {
4775 while (p < pend) if (*p++ == '\n') p[-1] = '\r';
4776 }
4777 else
4778 {
d46c5b12
KH
4779 int count = 0;
4780
fb88bf2d
KH
4781 while (p < pend) if (*p++ == '\n') count++;
4782 if (src - dst < count)
d46c5b12 4783 {
38edf7d4 4784 /* We don't have sufficient room for encoding LFs
fb88bf2d
KH
4785 back to CRLF. We must record converted and
4786 not-yet-converted text back to the buffer
4787 content, enlarge the gap, then record them out of
4788 the buffer contents again. */
4789 int add = len_byte + inserted_byte;
4790
4791 GAP_SIZE -= add;
4792 ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
4793 GPT += inserted_byte; GPT_BYTE += inserted_byte;
4794 make_gap (count - GAP_SIZE);
4795 GAP_SIZE += add;
4796 ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
4797 GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
4798 /* Don't forget to update SRC, DST, and PEND. */
4799 src = GAP_END_ADDR - len_byte;
4800 dst = GPT_ADDR + inserted_byte;
4801 pend = dst;
d46c5b12 4802 }
d46c5b12
KH
4803 inserted += count;
4804 inserted_byte += count;
fb88bf2d
KH
4805 coding->produced += count;
4806 p = dst = pend + count;
4807 while (count)
4808 {
4809 *--p = *--pend;
4810 if (*p == '\n') count--, *--p = '\r';
4811 }
d46c5b12
KH
4812 }
4813
4814 /* Suppress eol-format conversion in the further conversion. */
4815 coding->eol_type = CODING_EOL_LF;
4816
38edf7d4
KH
4817 /* Set the coding system symbol to that for Unix-like EOL. */
4818 eol_type = Fget (saved_coding_symbol, Qeol_type);
4819 if (VECTORP (eol_type)
4820 && XVECTOR (eol_type)->size == 3
4821 && SYMBOLP (XVECTOR (eol_type)->contents[CODING_EOL_LF]))
4822 coding->symbol = XVECTOR (eol_type)->contents[CODING_EOL_LF];
4823 else
4824 coding->symbol = saved_coding_symbol;
fb88bf2d
KH
4825
4826 continue;
d46c5b12
KH
4827 }
4828 if (len_byte <= 0)
944bd420
KH
4829 {
4830 if (coding->type != coding_type_ccl
4831 || coding->mode & CODING_MODE_LAST_BLOCK)
4832 break;
4833 coding->mode |= CODING_MODE_LAST_BLOCK;
4834 continue;
4835 }
d46c5b12
KH
4836 if (result == CODING_FINISH_INSUFFICIENT_SRC)
4837 {
4838 /* The source text ends in invalid codes. Let's just
4839 make them valid buffer contents, and finish conversion. */
fb88bf2d 4840 inserted += len_byte;
d46c5b12 4841 inserted_byte += len_byte;
fb88bf2d 4842 while (len_byte--)
ee59c65f 4843 *dst++ = *src++;
d46c5b12
KH
4844 break;
4845 }
9864ebce
KH
4846 if (result == CODING_FINISH_INTERRUPT)
4847 {
4848 /* The conversion procedure was interrupted by a user. */
9864ebce
KH
4849 break;
4850 }
4851 /* Now RESULT == CODING_FINISH_INSUFFICIENT_DST */
4852 if (coding->consumed < 1)
4853 {
4854 /* It's quite strange to require more memory without
4855 consuming any bytes. Perhaps CCL program bug. */
9864ebce
KH
4856 break;
4857 }
fb88bf2d
KH
4858 if (first)
4859 {
4860 /* We have just done the first batch of conversion which was
4861 stoped because of insufficient gap. Let's reconsider the
4862 required gap size (i.e. SRT - DST) now.
4863
4864 We have converted ORIG bytes (== coding->consumed) into
4865 NEW bytes (coding->produced). To convert the remaining
4866 LEN bytes, we may need REQUIRE bytes of gap, where:
4867 REQUIRE + LEN_BYTE = LEN_BYTE * (NEW / ORIG)
4868 REQUIRE = LEN_BYTE * (NEW - ORIG) / ORIG
4869 Here, we are sure that NEW >= ORIG. */
6e44253b
KH
4870 float ratio = coding->produced - coding->consumed;
4871 ratio /= coding->consumed;
4872 require = len_byte * ratio;
fb88bf2d
KH
4873 first = 0;
4874 }
4875 if ((src - dst) < (require + 2000))
4876 {
4877 /* See the comment above the previous call of make_gap. */
4878 int add = len_byte + inserted_byte;
4879
4880 GAP_SIZE -= add;
4881 ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
4882 GPT += inserted_byte; GPT_BYTE += inserted_byte;
4883 make_gap (require + 2000);
4884 GAP_SIZE += add;
4885 ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
4886 GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
fb88bf2d 4887 }
d46c5b12 4888 }
fb88bf2d
KH
4889 if (src - dst > 0) *dst = 0; /* Put an anchor. */
4890
b73bfc1c
KH
4891 if (encodep && coding->dst_multibyte)
4892 {
4893 /* The output is unibyte. We must convert 8-bit characters to
4894 multibyte form. */
4895 if (inserted_byte * 2 > GAP_SIZE)
4896 {
4897 GAP_SIZE -= inserted_byte;
4898 ZV += inserted_byte; Z += inserted_byte;
4899 ZV_BYTE += inserted_byte; Z_BYTE += inserted_byte;
4900 GPT += inserted_byte; GPT_BYTE += inserted_byte;
4901 make_gap (inserted_byte - GAP_SIZE);
4902 GAP_SIZE += inserted_byte;
4903 ZV -= inserted_byte; Z -= inserted_byte;
4904 ZV_BYTE -= inserted_byte; Z_BYTE -= inserted_byte;
4905 GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
4906 }
4907 inserted_byte = str_to_multibyte (GPT_ADDR, GAP_SIZE, inserted_byte);
4908 }
7553d0e1 4909
12410ef1
KH
4910 /* If we have shrinked the conversion area, adjust it now. */
4911 if (total_skip > 0)
4912 {
4913 if (tail_skip > 0)
4914 safe_bcopy (GAP_END_ADDR, GPT_ADDR + inserted_byte, tail_skip);
4915 inserted += total_skip; inserted_byte += total_skip;
4916 GAP_SIZE += total_skip;
4917 GPT -= head_skip; GPT_BYTE -= head_skip;
4918 ZV -= total_skip; ZV_BYTE -= total_skip;
4919 Z -= total_skip; Z_BYTE -= total_skip;
4920 from -= head_skip; from_byte -= head_skip;
4921 to += tail_skip; to_byte += tail_skip;
4922 }
4923
6abb9bd9 4924 prev_Z = Z;
12410ef1 4925 adjust_after_replace (from, from_byte, deletion, inserted, inserted_byte);
6abb9bd9 4926 inserted = Z - prev_Z;
4ed46869 4927
ec6d2bb8
KH
4928 if (!encodep && coding->cmp_data && coding->cmp_data->used)
4929 coding_restore_composition (coding, Fcurrent_buffer ());
4930 coding_free_composition_data (coding);
4931
b73bfc1c
KH
4932 if (! inhibit_pre_post_conversion
4933 && ! encodep && ! NILP (coding->post_read_conversion))
d46c5b12 4934 {
2b4f9037 4935 Lisp_Object val;
b843d1ae 4936 int count = specpdl_ptr - specpdl;
4ed46869 4937
e133c8fa
KH
4938 if (from != PT)
4939 TEMP_SET_PT_BOTH (from, from_byte);
6abb9bd9 4940 prev_Z = Z;
b843d1ae
KH
4941 record_unwind_protect (code_convert_region_unwind, Qnil);
4942 /* We should not call any more pre-write/post-read-conversion
4943 functions while this post-read-conversion is running. */
4944 inhibit_pre_post_conversion = 1;
2b4f9037 4945 val = call1 (coding->post_read_conversion, make_number (inserted));
b843d1ae
KH
4946 inhibit_pre_post_conversion = 0;
4947 /* Discard the unwind protect. */
4948 specpdl_ptr--;
6abb9bd9 4949 CHECK_NUMBER (val, 0);
944bd420 4950 inserted += Z - prev_Z;
e133c8fa
KH
4951 }
4952
4953 if (orig_point >= from)
4954 {
4955 if (orig_point >= from + orig_len)
4956 orig_point += inserted - orig_len;
4957 else
4958 orig_point = from;
4959 TEMP_SET_PT (orig_point);
d46c5b12 4960 }
4ed46869 4961
ec6d2bb8
KH
4962 if (replace)
4963 {
4964 signal_after_change (from, to - from, inserted);
e19539f1 4965 update_compositions (from, from + inserted, CHECK_BORDER);
ec6d2bb8 4966 }
2b4f9037 4967
fb88bf2d 4968 {
12410ef1
KH
4969 coding->consumed = to_byte - from_byte;
4970 coding->consumed_char = to - from;
4971 coding->produced = inserted_byte;
4972 coding->produced_char = inserted;
fb88bf2d 4973 }
7553d0e1 4974
fb88bf2d 4975 return 0;
d46c5b12
KH
4976}
4977
4978Lisp_Object
b73bfc1c
KH
4979run_pre_post_conversion_on_str (str, coding, encodep)
4980 Lisp_Object str;
4981 struct coding_system *coding;
4982 int encodep;
4983{
4984 int count = specpdl_ptr - specpdl;
4985 struct gcpro gcpro1;
4986 struct buffer *prev = current_buffer;
4987 int multibyte = STRING_MULTIBYTE (str);
4988
4989 record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
4990 record_unwind_protect (code_convert_region_unwind, Qnil);
4991 GCPRO1 (str);
4992 temp_output_buffer_setup (" *code-converting-work*");
4993 set_buffer_internal (XBUFFER (Vstandard_output));
4994 /* We must insert the contents of STR as is without
4995 unibyte<->multibyte conversion. For that, we adjust the
4996 multibyteness of the working buffer to that of STR. */
4997 Ferase_buffer ();
4998 current_buffer->enable_multibyte_characters = multibyte ? Qt : Qnil;
4999 insert_from_string (str, 0, 0,
5000 XSTRING (str)->size, STRING_BYTES (XSTRING (str)), 0);
5001 UNGCPRO;
5002 inhibit_pre_post_conversion = 1;
5003 if (encodep)
5004 call2 (coding->pre_write_conversion, make_number (BEG), make_number (Z));
5005 else
6bac5b12
KH
5006 {
5007 TEMP_SET_PT_BOTH (BEG, BEG_BYTE);
5008 call1 (coding->post_read_conversion, make_number (Z - BEG));
5009 }
b73bfc1c
KH
5010 inhibit_pre_post_conversion = 0;
5011 str = make_buffer_string (BEG, Z, 0);
5012 return unbind_to (count, str);
5013}
5014
5015Lisp_Object
5016decode_coding_string (str, coding, nocopy)
d46c5b12 5017 Lisp_Object str;
4ed46869 5018 struct coding_system *coding;
b73bfc1c 5019 int nocopy;
4ed46869 5020{
d46c5b12
KH
5021 int len;
5022 char *buf;
b73bfc1c 5023 int from, to, to_byte;
d46c5b12 5024 struct gcpro gcpro1;
84d60297 5025 Lisp_Object saved_coding_symbol;
d46c5b12 5026 int result;
4ed46869 5027
b73bfc1c
KH
5028 from = 0;
5029 to = XSTRING (str)->size;
5030 to_byte = STRING_BYTES (XSTRING (str));
4ed46869 5031
b73bfc1c
KH
5032 saved_coding_symbol = Qnil;
5033 if (CODING_REQUIRE_DETECTION (coding))
d46c5b12
KH
5034 {
5035 /* See the comments in code_convert_region. */
5036 if (coding->type == coding_type_undecided)
5037 {
5038 detect_coding (coding, XSTRING (str)->data, to_byte);
5039 if (coding->type == coding_type_undecided)
5040 coding->type = coding_type_emacs_mule;
5041 }
5042 if (coding->eol_type == CODING_EOL_UNDECIDED)
5043 {
5044 saved_coding_symbol = coding->symbol;
5045 detect_eol (coding, XSTRING (str)->data, to_byte);
5046 if (coding->eol_type == CODING_EOL_UNDECIDED)
5047 coding->eol_type = CODING_EOL_LF;
5048 /* We had better recover the original eol format if we
5049 encounter an inconsitent eol format while decoding. */
5050 coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
5051 }
5052 }
4ed46869 5053
b73bfc1c 5054 if (! CODING_REQUIRE_DECODING (coding))
ec6d2bb8 5055 {
b73bfc1c
KH
5056 if (!STRING_MULTIBYTE (str))
5057 {
5058 str = Fstring_as_multibyte (str);
5059 nocopy = 1;
5060 }
5061 return (nocopy ? str : Fcopy_sequence (str));
ec6d2bb8
KH
5062 }
5063
b73bfc1c 5064 if (STRING_MULTIBYTE (str))
d46c5b12 5065 {
b73bfc1c
KH
5066 /* Decoding routines expect the source text to be unibyte. */
5067 str = Fstring_as_unibyte (str);
5068 nocopy = 1;
5069 coding->src_multibyte = 0;
5070 }
5071 coding->dst_multibyte = 1;
ec6d2bb8 5072
b73bfc1c
KH
5073 if (coding->composing != COMPOSITION_DISABLED)
5074 coding_allocate_composition_data (coding, from);
ec6d2bb8 5075
b73bfc1c
KH
5076 /* Try to skip the heading and tailing ASCIIs. */
5077 {
5078 int from_orig = from;
4ed46869 5079
b73bfc1c
KH
5080 SHRINK_CONVERSION_REGION (&from, &to_byte, coding, XSTRING (str)->data,
5081 0);
5082 if (from == to_byte)
5083 return (nocopy ? str : Fcopy_sequence (str));
5084 }
5085
5086 len = decoding_buffer_size (coding, to_byte - from);
fc932ac6 5087 len += from + STRING_BYTES (XSTRING (str)) - to_byte;
d46c5b12
KH
5088 GCPRO1 (str);
5089 buf = get_conversion_buffer (len);
5090 UNGCPRO;
4ed46869 5091
d46c5b12
KH
5092 if (from > 0)
5093 bcopy (XSTRING (str)->data, buf, from);
b73bfc1c
KH
5094 result = decode_coding (coding, XSTRING (str)->data + from,
5095 buf + from, to_byte - from, len);
5096 if (result == CODING_FINISH_INCONSISTENT_EOL)
4ed46869 5097 {
ec6d2bb8 5098 /* We simply try to decode the whole string again but without
d46c5b12
KH
5099 eol-conversion this time. */
5100 coding->eol_type = CODING_EOL_LF;
5101 coding->symbol = saved_coding_symbol;
ec6d2bb8 5102 coding_free_composition_data (coding);
b73bfc1c 5103 return decode_coding_string (str, coding, nocopy);
4ed46869 5104 }
d46c5b12
KH
5105
5106 bcopy (XSTRING (str)->data + to_byte, buf + from + coding->produced,
fc932ac6 5107 STRING_BYTES (XSTRING (str)) - to_byte);
d46c5b12 5108
fc932ac6 5109 len = from + STRING_BYTES (XSTRING (str)) - to_byte;
b73bfc1c
KH
5110 str = make_multibyte_string (buf, len + coding->produced_char,
5111 len + coding->produced);
5112
5113 if (coding->cmp_data && coding->cmp_data->used)
5114 coding_restore_composition (coding, str);
5115 coding_free_composition_data (coding);
5116
5117 if (SYMBOLP (coding->post_read_conversion)
5118 && !NILP (Ffboundp (coding->post_read_conversion)))
6bac5b12 5119 str = run_pre_post_conversion_on_str (str, coding, 0);
b73bfc1c
KH
5120
5121 return str;
5122}
5123
5124Lisp_Object
5125encode_coding_string (str, coding, nocopy)
5126 Lisp_Object str;
5127 struct coding_system *coding;
5128 int nocopy;
5129{
5130 int len;
5131 char *buf;
5132 int from, to, to_byte;
5133 struct gcpro gcpro1;
5134 Lisp_Object saved_coding_symbol;
5135 int result;
5136
5137 if (SYMBOLP (coding->pre_write_conversion)
5138 && !NILP (Ffboundp (coding->pre_write_conversion)))
6bac5b12 5139 str = run_pre_post_conversion_on_str (str, coding, 1);
b73bfc1c
KH
5140
5141 from = 0;
5142 to = XSTRING (str)->size;
5143 to_byte = STRING_BYTES (XSTRING (str));
5144
5145 saved_coding_symbol = Qnil;
5146 if (! CODING_REQUIRE_ENCODING (coding))
826bfb8b 5147 {
b73bfc1c
KH
5148 if (STRING_MULTIBYTE (str))
5149 {
5150 str = Fstring_as_unibyte (str);
5151 nocopy = 1;
5152 }
5153 return (nocopy ? str : Fcopy_sequence (str));
826bfb8b
KH
5154 }
5155
b73bfc1c
KH
5156 /* Encoding routines determine the multibyteness of the source text
5157 by coding->src_multibyte. */
5158 coding->src_multibyte = STRING_MULTIBYTE (str);
5159 coding->dst_multibyte = 0;
5160
5161 if (coding->composing != COMPOSITION_DISABLED)
5162 coding_save_composition (coding, from, to, str);
ec6d2bb8 5163
b73bfc1c
KH
5164 /* Try to skip the heading and tailing ASCIIs. */
5165 {
5166 int from_orig = from;
5167
5168 SHRINK_CONVERSION_REGION (&from, &to_byte, coding, XSTRING (str)->data,
5169 1);
5170 if (from == to_byte)
5171 return (nocopy ? str : Fcopy_sequence (str));
5172 }
5173
5174 len = encoding_buffer_size (coding, to_byte - from);
5175 len += from + STRING_BYTES (XSTRING (str)) - to_byte;
5176 GCPRO1 (str);
5177 buf = get_conversion_buffer (len);
5178 UNGCPRO;
5179
5180 if (from > 0)
5181 bcopy (XSTRING (str)->data, buf, from);
5182 result = encode_coding (coding, XSTRING (str)->data + from,
5183 buf + from, to_byte - from, len);
5184 bcopy (XSTRING (str)->data + to_byte, buf + from + coding->produced,
5185 STRING_BYTES (XSTRING (str)) - to_byte);
5186
5187 len = from + STRING_BYTES (XSTRING (str)) - to_byte;
5188 str = make_unibyte_string (buf, len + coding->produced);
ec6d2bb8 5189 coding_free_composition_data (coding);
b73bfc1c 5190
d46c5b12 5191 return str;
4ed46869
KH
5192}
5193
5194\f
5195#ifdef emacs
1397dc18 5196/*** 8. Emacs Lisp library functions ***/
4ed46869 5197
4ed46869
KH
5198DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
5199 "Return t if OBJECT is nil or a coding-system.\n\
3a73fa5d
RS
5200See the documentation of `make-coding-system' for information\n\
5201about coding-system objects.")
4ed46869
KH
5202 (obj)
5203 Lisp_Object obj;
5204{
4608c386
KH
5205 if (NILP (obj))
5206 return Qt;
5207 if (!SYMBOLP (obj))
5208 return Qnil;
5209 /* Get coding-spec vector for OBJ. */
5210 obj = Fget (obj, Qcoding_system);
5211 return ((VECTORP (obj) && XVECTOR (obj)->size == 5)
5212 ? Qt : Qnil);
4ed46869
KH
5213}
5214
9d991de8
RS
5215DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
5216 Sread_non_nil_coding_system, 1, 1, 0,
e0e989f6 5217 "Read a coding system from the minibuffer, prompting with string PROMPT.")
4ed46869
KH
5218 (prompt)
5219 Lisp_Object prompt;
5220{
e0e989f6 5221 Lisp_Object val;
9d991de8
RS
5222 do
5223 {
4608c386
KH
5224 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
5225 Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
9d991de8
RS
5226 }
5227 while (XSTRING (val)->size == 0);
e0e989f6 5228 return (Fintern (val, Qnil));
4ed46869
KH
5229}
5230
9b787f3e
RS
5231DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
5232 "Read a coding system from the minibuffer, prompting with string PROMPT.\n\
5233If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.")
5234 (prompt, default_coding_system)
5235 Lisp_Object prompt, default_coding_system;
4ed46869 5236{
f44d27ce 5237 Lisp_Object val;
9b787f3e
RS
5238 if (SYMBOLP (default_coding_system))
5239 XSETSTRING (default_coding_system, XSYMBOL (default_coding_system)->name);
4608c386 5240 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
9b787f3e
RS
5241 Qt, Qnil, Qcoding_system_history,
5242 default_coding_system, Qnil);
e0e989f6 5243 return (XSTRING (val)->size == 0 ? Qnil : Fintern (val, Qnil));
4ed46869
KH
5244}
5245
5246DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
5247 1, 1, 0,
5248 "Check validity of CODING-SYSTEM.\n\
3a73fa5d
RS
5249If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.\n\
5250It is valid if it is a symbol with a non-nil `coding-system' property.\n\
4ed46869
KH
5251The value of property should be a vector of length 5.")
5252 (coding_system)
5253 Lisp_Object coding_system;
5254{
5255 CHECK_SYMBOL (coding_system, 0);
5256 if (!NILP (Fcoding_system_p (coding_system)))
5257 return coding_system;
5258 while (1)
02ba4723 5259 Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
4ed46869 5260}
3a73fa5d 5261\f
d46c5b12
KH
5262Lisp_Object
5263detect_coding_system (src, src_bytes, highest)
5264 unsigned char *src;
5265 int src_bytes, highest;
4ed46869
KH
5266{
5267 int coding_mask, eol_type;
d46c5b12
KH
5268 Lisp_Object val, tmp;
5269 int dummy;
4ed46869 5270
d46c5b12
KH
5271 coding_mask = detect_coding_mask (src, src_bytes, NULL, &dummy);
5272 eol_type = detect_eol_type (src, src_bytes, &dummy);
5273 if (eol_type == CODING_EOL_INCONSISTENT)
25b02698 5274 eol_type = CODING_EOL_UNDECIDED;
4ed46869 5275
d46c5b12 5276 if (!coding_mask)
4ed46869 5277 {
27901516 5278 val = Qundecided;
d46c5b12 5279 if (eol_type != CODING_EOL_UNDECIDED)
4ed46869 5280 {
f44d27ce
RS
5281 Lisp_Object val2;
5282 val2 = Fget (Qundecided, Qeol_type);
4ed46869
KH
5283 if (VECTORP (val2))
5284 val = XVECTOR (val2)->contents[eol_type];
5285 }
80e803b4 5286 return (highest ? val : Fcons (val, Qnil));
4ed46869 5287 }
4ed46869 5288
d46c5b12
KH
5289 /* At first, gather possible coding systems in VAL. */
5290 val = Qnil;
fa42c37f 5291 for (tmp = Vcoding_category_list; CONSP (tmp); tmp = XCDR (tmp))
4ed46869 5292 {
fa42c37f
KH
5293 Lisp_Object category_val, category_index;
5294
5295 category_index = Fget (XCAR (tmp), Qcoding_category_index);
5296 category_val = Fsymbol_value (XCAR (tmp));
5297 if (!NILP (category_val)
5298 && NATNUMP (category_index)
5299 && (coding_mask & (1 << XFASTINT (category_index))))
4ed46869 5300 {
fa42c37f 5301 val = Fcons (category_val, val);
d46c5b12
KH
5302 if (highest)
5303 break;
4ed46869
KH
5304 }
5305 }
d46c5b12
KH
5306 if (!highest)
5307 val = Fnreverse (val);
4ed46869 5308
65059037 5309 /* Then, replace the elements with subsidiary coding systems. */
fa42c37f 5310 for (tmp = val; CONSP (tmp); tmp = XCDR (tmp))
4ed46869 5311 {
65059037
RS
5312 if (eol_type != CODING_EOL_UNDECIDED
5313 && eol_type != CODING_EOL_INCONSISTENT)
4ed46869 5314 {
d46c5b12 5315 Lisp_Object eol;
03699b14 5316 eol = Fget (XCAR (tmp), Qeol_type);
d46c5b12 5317 if (VECTORP (eol))
03699b14 5318 XCAR (tmp) = XVECTOR (eol)->contents[eol_type];
4ed46869
KH
5319 }
5320 }
03699b14 5321 return (highest ? XCAR (val) : val);
d46c5b12 5322}
4ed46869 5323
d46c5b12
KH
5324DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
5325 2, 3, 0,
5326 "Detect coding system of the text in the region between START and END.\n\
5327Return a list of possible coding systems ordered by priority.\n\
5328\n\
80e803b4
KH
5329If only ASCII characters are found, it returns a list of single element\n\
5330`undecided' or its subsidiary coding system according to a detected\n\
5331end-of-line format.\n\
d46c5b12
KH
5332\n\
5333If optional argument HIGHEST is non-nil, return the coding system of\n\
5334highest priority.")
5335 (start, end, highest)
5336 Lisp_Object start, end, highest;
5337{
5338 int from, to;
5339 int from_byte, to_byte;
6289dd10 5340
d46c5b12
KH
5341 CHECK_NUMBER_COERCE_MARKER (start, 0);
5342 CHECK_NUMBER_COERCE_MARKER (end, 1);
4ed46869 5343
d46c5b12
KH
5344 validate_region (&start, &end);
5345 from = XINT (start), to = XINT (end);
5346 from_byte = CHAR_TO_BYTE (from);
5347 to_byte = CHAR_TO_BYTE (to);
6289dd10 5348
d46c5b12
KH
5349 if (from < GPT && to >= GPT)
5350 move_gap_both (to, to_byte);
4ed46869 5351
d46c5b12
KH
5352 return detect_coding_system (BYTE_POS_ADDR (from_byte),
5353 to_byte - from_byte,
5354 !NILP (highest));
5355}
6289dd10 5356
d46c5b12
KH
5357DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
5358 1, 2, 0,
5359 "Detect coding system of the text in STRING.\n\
5360Return a list of possible coding systems ordered by priority.\n\
5361\n\
80e803b4
KH
5362If only ASCII characters are found, it returns a list of single element\n\
5363`undecided' or its subsidiary coding system according to a detected\n\
5364end-of-line format.\n\
d46c5b12
KH
5365\n\
5366If optional argument HIGHEST is non-nil, return the coding system of\n\
5367highest priority.")
5368 (string, highest)
5369 Lisp_Object string, highest;
5370{
5371 CHECK_STRING (string, 0);
4ed46869 5372
d46c5b12 5373 return detect_coding_system (XSTRING (string)->data,
fc932ac6 5374 STRING_BYTES (XSTRING (string)),
d46c5b12 5375 !NILP (highest));
4ed46869
KH
5376}
5377
4031e2bf
KH
5378Lisp_Object
5379code_convert_region1 (start, end, coding_system, encodep)
d46c5b12 5380 Lisp_Object start, end, coding_system;
4031e2bf 5381 int encodep;
3a73fa5d
RS
5382{
5383 struct coding_system coding;
4031e2bf 5384 int from, to, len;
3a73fa5d 5385
d46c5b12
KH
5386 CHECK_NUMBER_COERCE_MARKER (start, 0);
5387 CHECK_NUMBER_COERCE_MARKER (end, 1);
3a73fa5d
RS
5388 CHECK_SYMBOL (coding_system, 2);
5389
d46c5b12
KH
5390 validate_region (&start, &end);
5391 from = XFASTINT (start);
5392 to = XFASTINT (end);
5393
3a73fa5d 5394 if (NILP (coding_system))
d46c5b12
KH
5395 return make_number (to - from);
5396
3a73fa5d 5397 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
d46c5b12 5398 error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data);
3a73fa5d 5399
d46c5b12 5400 coding.mode |= CODING_MODE_LAST_BLOCK;
b73bfc1c
KH
5401 coding.src_multibyte = coding.dst_multibyte
5402 = !NILP (current_buffer->enable_multibyte_characters);
fb88bf2d
KH
5403 code_convert_region (from, CHAR_TO_BYTE (from), to, CHAR_TO_BYTE (to),
5404 &coding, encodep, 1);
f072a3e8 5405 Vlast_coding_system_used = coding.symbol;
fb88bf2d 5406 return make_number (coding.produced_char);
4031e2bf
KH
5407}
5408
5409DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
5410 3, 3, "r\nzCoding system: ",
5411 "Decode the current region by specified coding system.\n\
5412When called from a program, takes three arguments:\n\
5413START, END, and CODING-SYSTEM. START and END are buffer positions.\n\
f072a3e8
RS
5414This function sets `last-coding-system-used' to the precise coding system\n\
5415used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
5416not fully specified.)\n\
5417It returns the length of the decoded text.")
4031e2bf
KH
5418 (start, end, coding_system)
5419 Lisp_Object start, end, coding_system;
5420{
5421 return code_convert_region1 (start, end, coding_system, 0);
3a73fa5d
RS
5422}
5423
5424DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
5425 3, 3, "r\nzCoding system: ",
d46c5b12 5426 "Encode the current region by specified coding system.\n\
3a73fa5d 5427When called from a program, takes three arguments:\n\
d46c5b12 5428START, END, and CODING-SYSTEM. START and END are buffer positions.\n\
f072a3e8
RS
5429This function sets `last-coding-system-used' to the precise coding system\n\
5430used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
5431not fully specified.)\n\
5432It returns the length of the encoded text.")
d46c5b12
KH
5433 (start, end, coding_system)
5434 Lisp_Object start, end, coding_system;
3a73fa5d 5435{
4031e2bf
KH
5436 return code_convert_region1 (start, end, coding_system, 1);
5437}
3a73fa5d 5438
4031e2bf
KH
5439Lisp_Object
5440code_convert_string1 (string, coding_system, nocopy, encodep)
5441 Lisp_Object string, coding_system, nocopy;
5442 int encodep;
5443{
5444 struct coding_system coding;
3a73fa5d 5445
4031e2bf
KH
5446 CHECK_STRING (string, 0);
5447 CHECK_SYMBOL (coding_system, 1);
4ed46869 5448
d46c5b12 5449 if (NILP (coding_system))
4031e2bf 5450 return (NILP (nocopy) ? Fcopy_sequence (string) : string);
4ed46869 5451
d46c5b12
KH
5452 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
5453 error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data);
5f1cd180 5454
d46c5b12 5455 coding.mode |= CODING_MODE_LAST_BLOCK;
b73bfc1c
KH
5456 string = (encodep
5457 ? encode_coding_string (string, &coding, !NILP (nocopy))
5458 : decode_coding_string (string, &coding, !NILP (nocopy)));
f072a3e8 5459 Vlast_coding_system_used = coding.symbol;
ec6d2bb8
KH
5460
5461 return string;
4ed46869
KH
5462}
5463
4ed46869 5464DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
e0e989f6
KH
5465 2, 3, 0,
5466 "Decode STRING which is encoded in CODING-SYSTEM, and return the result.\n\
fe487a71 5467Optional arg NOCOPY non-nil means it is ok to return STRING itself\n\
f072a3e8
RS
5468if the decoding operation is trivial.\n\
5469This function sets `last-coding-system-used' to the precise coding system\n\
5470used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
5471not fully specified.)")
e0e989f6
KH
5472 (string, coding_system, nocopy)
5473 Lisp_Object string, coding_system, nocopy;
4ed46869 5474{
f072a3e8 5475 return code_convert_string1 (string, coding_system, nocopy, 0);
4ed46869
KH
5476}
5477
5478DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
e0e989f6
KH
5479 2, 3, 0,
5480 "Encode STRING to CODING-SYSTEM, and return the result.\n\
fe487a71 5481Optional arg NOCOPY non-nil means it is ok to return STRING itself\n\
f072a3e8
RS
5482if the encoding operation is trivial.\n\
5483This function sets `last-coding-system-used' to the precise coding system\n\
5484used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
5485not fully specified.)")
e0e989f6
KH
5486 (string, coding_system, nocopy)
5487 Lisp_Object string, coding_system, nocopy;
4ed46869 5488{
f072a3e8 5489 return code_convert_string1 (string, coding_system, nocopy, 1);
4ed46869 5490}
4031e2bf 5491
ecec61c1 5492/* Encode or decode STRING according to CODING_SYSTEM.
ec6d2bb8
KH
5493 Do not set Vlast_coding_system_used.
5494
5495 This function is called only from macros DECODE_FILE and
5496 ENCODE_FILE, thus we ignore character composition. */
ecec61c1
KH
5497
5498Lisp_Object
5499code_convert_string_norecord (string, coding_system, encodep)
5500 Lisp_Object string, coding_system;
5501 int encodep;
5502{
5503 struct coding_system coding;
5504
5505 CHECK_STRING (string, 0);
5506 CHECK_SYMBOL (coding_system, 1);
5507
5508 if (NILP (coding_system))
5509 return string;
5510
5511 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
5512 error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data);
5513
ec6d2bb8 5514 coding.composing = COMPOSITION_DISABLED;
ecec61c1 5515 coding.mode |= CODING_MODE_LAST_BLOCK;
b73bfc1c
KH
5516 return (encodep
5517 ? encode_coding_string (string, &coding, 1)
5518 : decode_coding_string (string, &coding, 1));
ecec61c1 5519}
3a73fa5d 5520\f
4ed46869 5521DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
55ab7be3 5522 "Decode a Japanese character which has CODE in shift_jis encoding.\n\
4ed46869
KH
5523Return the corresponding character.")
5524 (code)
5525 Lisp_Object code;
5526{
5527 unsigned char c1, c2, s1, s2;
5528 Lisp_Object val;
5529
5530 CHECK_NUMBER (code, 0);
5531 s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
55ab7be3
KH
5532 if (s1 == 0)
5533 {
c28a9453
KH
5534 if (s2 < 0x80)
5535 XSETFASTINT (val, s2);
5536 else if (s2 >= 0xA0 || s2 <= 0xDF)
b73bfc1c 5537 XSETFASTINT (val, MAKE_CHAR (charset_katakana_jisx0201, s2, 0));
c28a9453 5538 else
9da8350f 5539 error ("Invalid Shift JIS code: %x", XFASTINT (code));
55ab7be3
KH
5540 }
5541 else
5542 {
5543 if ((s1 < 0x80 || s1 > 0x9F && s1 < 0xE0 || s1 > 0xEF)
5544 || (s2 < 0x40 || s2 == 0x7F || s2 > 0xFC))
9da8350f 5545 error ("Invalid Shift JIS code: %x", XFASTINT (code));
55ab7be3 5546 DECODE_SJIS (s1, s2, c1, c2);
b73bfc1c 5547 XSETFASTINT (val, MAKE_CHAR (charset_jisx0208, c1, c2));
55ab7be3 5548 }
4ed46869
KH
5549 return val;
5550}
5551
5552DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
55ab7be3
KH
5553 "Encode a Japanese character CHAR to shift_jis encoding.\n\
5554Return the corresponding code in SJIS.")
4ed46869
KH
5555 (ch)
5556 Lisp_Object ch;
5557{
bcf26d6a 5558 int charset, c1, c2, s1, s2;
4ed46869
KH
5559 Lisp_Object val;
5560
5561 CHECK_NUMBER (ch, 0);
5562 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
c28a9453
KH
5563 if (charset == CHARSET_ASCII)
5564 {
5565 val = ch;
5566 }
5567 else if (charset == charset_jisx0208
5568 && c1 > 0x20 && c1 < 0x7F && c2 > 0x20 && c2 < 0x7F)
4ed46869
KH
5569 {
5570 ENCODE_SJIS (c1, c2, s1, s2);
bcf26d6a 5571 XSETFASTINT (val, (s1 << 8) | s2);
4ed46869 5572 }
55ab7be3
KH
5573 else if (charset == charset_katakana_jisx0201
5574 && c1 > 0x20 && c2 < 0xE0)
5575 {
5576 XSETFASTINT (val, c1 | 0x80);
5577 }
4ed46869 5578 else
55ab7be3 5579 error ("Can't encode to shift_jis: %d", XFASTINT (ch));
4ed46869
KH
5580 return val;
5581}
5582
5583DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
c28a9453 5584 "Decode a Big5 character which has CODE in BIG5 coding system.\n\
4ed46869
KH
5585Return the corresponding character.")
5586 (code)
5587 Lisp_Object code;
5588{
5589 int charset;
5590 unsigned char b1, b2, c1, c2;
5591 Lisp_Object val;
5592
5593 CHECK_NUMBER (code, 0);
5594 b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF;
c28a9453
KH
5595 if (b1 == 0)
5596 {
5597 if (b2 >= 0x80)
9da8350f 5598 error ("Invalid BIG5 code: %x", XFASTINT (code));
c28a9453
KH
5599 val = code;
5600 }
5601 else
5602 {
5603 if ((b1 < 0xA1 || b1 > 0xFE)
5604 || (b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE))
9da8350f 5605 error ("Invalid BIG5 code: %x", XFASTINT (code));
c28a9453 5606 DECODE_BIG5 (b1, b2, charset, c1, c2);
b73bfc1c 5607 XSETFASTINT (val, MAKE_CHAR (charset, c1, c2));
c28a9453 5608 }
4ed46869
KH
5609 return val;
5610}
5611
5612DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
d46c5b12 5613 "Encode the Big5 character CHAR to BIG5 coding system.\n\
4ed46869
KH
5614Return the corresponding character code in Big5.")
5615 (ch)
5616 Lisp_Object ch;
5617{
bcf26d6a 5618 int charset, c1, c2, b1, b2;
4ed46869
KH
5619 Lisp_Object val;
5620
5621 CHECK_NUMBER (ch, 0);
5622 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
c28a9453
KH
5623 if (charset == CHARSET_ASCII)
5624 {
5625 val = ch;
5626 }
5627 else if ((charset == charset_big5_1
5628 && (XFASTINT (ch) >= 0x250a1 && XFASTINT (ch) <= 0x271ec))
5629 || (charset == charset_big5_2
5630 && XFASTINT (ch) >= 0x290a1 && XFASTINT (ch) <= 0x2bdb2))
4ed46869
KH
5631 {
5632 ENCODE_BIG5 (charset, c1, c2, b1, b2);
bcf26d6a 5633 XSETFASTINT (val, (b1 << 8) | b2);
4ed46869
KH
5634 }
5635 else
c28a9453 5636 error ("Can't encode to Big5: %d", XFASTINT (ch));
4ed46869
KH
5637 return val;
5638}
3a73fa5d 5639\f
1ba9e4ab
KH
5640DEFUN ("set-terminal-coding-system-internal",
5641 Fset_terminal_coding_system_internal,
5642 Sset_terminal_coding_system_internal, 1, 1, 0, "")
4ed46869
KH
5643 (coding_system)
5644 Lisp_Object coding_system;
5645{
5646 CHECK_SYMBOL (coding_system, 0);
5647 setup_coding_system (Fcheck_coding_system (coding_system), &terminal_coding);
70c22245 5648 /* We had better not send unsafe characters to terminal. */
6e85d753 5649 terminal_coding.flags |= CODING_FLAG_ISO_SAFE;
ec6d2bb8
KH
5650 /* Characer composition should be disabled. */
5651 terminal_coding.composing = COMPOSITION_DISABLED;
b73bfc1c
KH
5652 terminal_coding.src_multibyte = 1;
5653 terminal_coding.dst_multibyte = 0;
4ed46869
KH
5654 return Qnil;
5655}
5656
c4825358
KH
5657DEFUN ("set-safe-terminal-coding-system-internal",
5658 Fset_safe_terminal_coding_system_internal,
5659 Sset_safe_terminal_coding_system_internal, 1, 1, 0, "")
5660 (coding_system)
5661 Lisp_Object coding_system;
5662{
5663 CHECK_SYMBOL (coding_system, 0);
5664 setup_coding_system (Fcheck_coding_system (coding_system),
5665 &safe_terminal_coding);
ec6d2bb8
KH
5666 /* Characer composition should be disabled. */
5667 safe_terminal_coding.composing = COMPOSITION_DISABLED;
b73bfc1c
KH
5668 safe_terminal_coding.src_multibyte = 1;
5669 safe_terminal_coding.dst_multibyte = 0;
c4825358
KH
5670 return Qnil;
5671}
5672
4ed46869
KH
5673DEFUN ("terminal-coding-system",
5674 Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
3a73fa5d 5675 "Return coding system specified for terminal output.")
4ed46869
KH
5676 ()
5677{
5678 return terminal_coding.symbol;
5679}
5680
1ba9e4ab
KH
5681DEFUN ("set-keyboard-coding-system-internal",
5682 Fset_keyboard_coding_system_internal,
5683 Sset_keyboard_coding_system_internal, 1, 1, 0, "")
4ed46869
KH
5684 (coding_system)
5685 Lisp_Object coding_system;
5686{
5687 CHECK_SYMBOL (coding_system, 0);
5688 setup_coding_system (Fcheck_coding_system (coding_system), &keyboard_coding);
ec6d2bb8
KH
5689 /* Characer composition should be disabled. */
5690 keyboard_coding.composing = COMPOSITION_DISABLED;
4ed46869
KH
5691 return Qnil;
5692}
5693
5694DEFUN ("keyboard-coding-system",
5695 Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
3a73fa5d 5696 "Return coding system specified for decoding keyboard input.")
4ed46869
KH
5697 ()
5698{
5699 return keyboard_coding.symbol;
5700}
5701
5702\f
a5d301df
KH
5703DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
5704 Sfind_operation_coding_system, 1, MANY, 0,
5705 "Choose a coding system for an operation based on the target name.\n\
69f76525 5706The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).\n\
9ce27fde
KH
5707DECODING-SYSTEM is the coding system to use for decoding\n\
5708\(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system\n\
5709for encoding (in case OPERATION does encoding).\n\
ccdb79f5
RS
5710\n\
5711The first argument OPERATION specifies an I/O primitive:\n\
5712 For file I/O, `insert-file-contents' or `write-region'.\n\
5713 For process I/O, `call-process', `call-process-region', or `start-process'.\n\
5714 For network I/O, `open-network-stream'.\n\
5715\n\
5716The remaining arguments should be the same arguments that were passed\n\
5717to the primitive. Depending on which primitive, one of those arguments\n\
5718is selected as the TARGET. For example, if OPERATION does file I/O,\n\
5719whichever argument specifies the file name is TARGET.\n\
5720\n\
5721TARGET has a meaning which depends on OPERATION:\n\
4ed46869
KH
5722 For file I/O, TARGET is a file name.\n\
5723 For process I/O, TARGET is a process name.\n\
5724 For network I/O, TARGET is a service name or a port number\n\
5725\n\
02ba4723
KH
5726This function looks up what specified for TARGET in,\n\
5727`file-coding-system-alist', `process-coding-system-alist',\n\
5728or `network-coding-system-alist' depending on OPERATION.\n\
5729They may specify a coding system, a cons of coding systems,\n\
5730or a function symbol to call.\n\
5731In the last case, we call the function with one argument,\n\
9ce27fde 5732which is a list of all the arguments given to this function.")
4ed46869
KH
5733 (nargs, args)
5734 int nargs;
5735 Lisp_Object *args;
5736{
5737 Lisp_Object operation, target_idx, target, val;
5738 register Lisp_Object chain;
5739
5740 if (nargs < 2)
5741 error ("Too few arguments");
5742 operation = args[0];
5743 if (!SYMBOLP (operation)
5744 || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
5745 error ("Invalid first arguement");
5746 if (nargs < 1 + XINT (target_idx))
5747 error ("Too few arguments for operation: %s",
5748 XSYMBOL (operation)->name->data);
5749 target = args[XINT (target_idx) + 1];
5750 if (!(STRINGP (target)
5751 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
5752 error ("Invalid %dth argument", XINT (target_idx) + 1);
5753
2e34157c
RS
5754 chain = ((EQ (operation, Qinsert_file_contents)
5755 || EQ (operation, Qwrite_region))
02ba4723 5756 ? Vfile_coding_system_alist
2e34157c 5757 : (EQ (operation, Qopen_network_stream)
02ba4723
KH
5758 ? Vnetwork_coding_system_alist
5759 : Vprocess_coding_system_alist));
4ed46869
KH
5760 if (NILP (chain))
5761 return Qnil;
5762
03699b14 5763 for (; CONSP (chain); chain = XCDR (chain))
4ed46869 5764 {
f44d27ce 5765 Lisp_Object elt;
03699b14 5766 elt = XCAR (chain);
4ed46869
KH
5767
5768 if (CONSP (elt)
5769 && ((STRINGP (target)
03699b14
KR
5770 && STRINGP (XCAR (elt))
5771 && fast_string_match (XCAR (elt), target) >= 0)
5772 || (INTEGERP (target) && EQ (target, XCAR (elt)))))
02ba4723 5773 {
03699b14 5774 val = XCDR (elt);
b19fd4c5
KH
5775 /* Here, if VAL is both a valid coding system and a valid
5776 function symbol, we return VAL as a coding system. */
02ba4723
KH
5777 if (CONSP (val))
5778 return val;
5779 if (! SYMBOLP (val))
5780 return Qnil;
5781 if (! NILP (Fcoding_system_p (val)))
5782 return Fcons (val, val);
b19fd4c5
KH
5783 if (! NILP (Ffboundp (val)))
5784 {
5785 val = call1 (val, Flist (nargs, args));
5786 if (CONSP (val))
5787 return val;
5788 if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
5789 return Fcons (val, val);
5790 }
02ba4723
KH
5791 return Qnil;
5792 }
4ed46869
KH
5793 }
5794 return Qnil;
5795}
5796
1397dc18
KH
5797DEFUN ("update-coding-systems-internal", Fupdate_coding_systems_internal,
5798 Supdate_coding_systems_internal, 0, 0, 0,
5799 "Update internal database for ISO2022 and CCL based coding systems.\n\
fa42c37f
KH
5800When values of any coding categories are changed, you must\n\
5801call this function")
d46c5b12
KH
5802 ()
5803{
5804 int i;
5805
fa42c37f 5806 for (i = CODING_CATEGORY_IDX_EMACS_MULE; i < CODING_CATEGORY_IDX_MAX; i++)
d46c5b12 5807 {
1397dc18
KH
5808 Lisp_Object val;
5809
5810 val = XSYMBOL (XVECTOR (Vcoding_category_table)->contents[i])->value;
5811 if (!NILP (val))
5812 {
5813 if (! coding_system_table[i])
5814 coding_system_table[i] = ((struct coding_system *)
5815 xmalloc (sizeof (struct coding_system)));
5816 setup_coding_system (val, coding_system_table[i]);
5817 }
5818 else if (coding_system_table[i])
5819 {
5820 xfree (coding_system_table[i]);
5821 coding_system_table[i] = NULL;
5822 }
d46c5b12 5823 }
1397dc18 5824
d46c5b12
KH
5825 return Qnil;
5826}
5827
66cfb530
KH
5828DEFUN ("set-coding-priority-internal", Fset_coding_priority_internal,
5829 Sset_coding_priority_internal, 0, 0, 0,
5830 "Update internal database for the current value of `coding-category-list'.\n\
5831This function is internal use only.")
5832 ()
5833{
5834 int i = 0, idx;
84d60297
RS
5835 Lisp_Object val;
5836
5837 val = Vcoding_category_list;
66cfb530
KH
5838
5839 while (CONSP (val) && i < CODING_CATEGORY_IDX_MAX)
5840 {
03699b14 5841 if (! SYMBOLP (XCAR (val)))
66cfb530 5842 break;
03699b14 5843 idx = XFASTINT (Fget (XCAR (val), Qcoding_category_index));
66cfb530
KH
5844 if (idx >= CODING_CATEGORY_IDX_MAX)
5845 break;
5846 coding_priorities[i++] = (1 << idx);
03699b14 5847 val = XCDR (val);
66cfb530
KH
5848 }
5849 /* If coding-category-list is valid and contains all coding
5850 categories, `i' should be CODING_CATEGORY_IDX_MAX now. If not,
fa42c37f 5851 the following code saves Emacs from crashing. */
66cfb530
KH
5852 while (i < CODING_CATEGORY_IDX_MAX)
5853 coding_priorities[i++] = CODING_CATEGORY_MASK_RAW_TEXT;
5854
5855 return Qnil;
5856}
5857
4ed46869
KH
5858#endif /* emacs */
5859
5860\f
1397dc18 5861/*** 9. Post-amble ***/
4ed46869 5862
6d74c3aa
KH
5863void
5864init_coding ()
5865{
5866 conversion_buffer = (char *) xmalloc (MINIMUM_CONVERSION_BUFFER_SIZE);
5867}
5868
dfcf069d 5869void
4ed46869
KH
5870init_coding_once ()
5871{
5872 int i;
5873
0ef69138 5874 /* Emacs' internal format specific initialize routine. */
4ed46869
KH
5875 for (i = 0; i <= 0x20; i++)
5876 emacs_code_class[i] = EMACS_control_code;
5877 emacs_code_class[0x0A] = EMACS_linefeed_code;
5878 emacs_code_class[0x0D] = EMACS_carriage_return_code;
5879 for (i = 0x21 ; i < 0x7F; i++)
5880 emacs_code_class[i] = EMACS_ascii_code;
5881 emacs_code_class[0x7F] = EMACS_control_code;
ec6d2bb8 5882 for (i = 0x80; i < 0xFF; i++)
4ed46869
KH
5883 emacs_code_class[i] = EMACS_invalid_code;
5884 emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
5885 emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
5886 emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
5887 emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;
5888
5889 /* ISO2022 specific initialize routine. */
5890 for (i = 0; i < 0x20; i++)
b73bfc1c 5891 iso_code_class[i] = ISO_control_0;
4ed46869
KH
5892 for (i = 0x21; i < 0x7F; i++)
5893 iso_code_class[i] = ISO_graphic_plane_0;
5894 for (i = 0x80; i < 0xA0; i++)
b73bfc1c 5895 iso_code_class[i] = ISO_control_1;
4ed46869
KH
5896 for (i = 0xA1; i < 0xFF; i++)
5897 iso_code_class[i] = ISO_graphic_plane_1;
5898 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
5899 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
5900 iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
5901 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
5902 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
5903 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
5904 iso_code_class[ISO_CODE_ESC] = ISO_escape;
5905 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
5906 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
5907 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
5908
e0e989f6 5909 conversion_buffer_size = MINIMUM_CONVERSION_BUFFER_SIZE;
e0e989f6
KH
5910
5911 setup_coding_system (Qnil, &keyboard_coding);
5912 setup_coding_system (Qnil, &terminal_coding);
c4825358 5913 setup_coding_system (Qnil, &safe_terminal_coding);
6bc51348 5914 setup_coding_system (Qnil, &default_buffer_file_coding);
9ce27fde 5915
d46c5b12
KH
5916 bzero (coding_system_table, sizeof coding_system_table);
5917
66cfb530
KH
5918 bzero (ascii_skip_code, sizeof ascii_skip_code);
5919 for (i = 0; i < 128; i++)
5920 ascii_skip_code[i] = 1;
5921
9ce27fde
KH
5922#if defined (MSDOS) || defined (WINDOWSNT)
5923 system_eol_type = CODING_EOL_CRLF;
5924#else
5925 system_eol_type = CODING_EOL_LF;
5926#endif
b843d1ae
KH
5927
5928 inhibit_pre_post_conversion = 0;
e0e989f6
KH
5929}
5930
5931#ifdef emacs
5932
dfcf069d 5933void
e0e989f6
KH
5934syms_of_coding ()
5935{
5936 Qtarget_idx = intern ("target-idx");
5937 staticpro (&Qtarget_idx);
5938
bb0115a2
RS
5939 Qcoding_system_history = intern ("coding-system-history");
5940 staticpro (&Qcoding_system_history);
5941 Fset (Qcoding_system_history, Qnil);
5942
9ce27fde 5943 /* Target FILENAME is the first argument. */
e0e989f6 5944 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
9ce27fde 5945 /* Target FILENAME is the third argument. */
e0e989f6
KH
5946 Fput (Qwrite_region, Qtarget_idx, make_number (2));
5947
5948 Qcall_process = intern ("call-process");
5949 staticpro (&Qcall_process);
9ce27fde 5950 /* Target PROGRAM is the first argument. */
e0e989f6
KH
5951 Fput (Qcall_process, Qtarget_idx, make_number (0));
5952
5953 Qcall_process_region = intern ("call-process-region");
5954 staticpro (&Qcall_process_region);
9ce27fde 5955 /* Target PROGRAM is the third argument. */
e0e989f6
KH
5956 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
5957
5958 Qstart_process = intern ("start-process");
5959 staticpro (&Qstart_process);
9ce27fde 5960 /* Target PROGRAM is the third argument. */
e0e989f6
KH
5961 Fput (Qstart_process, Qtarget_idx, make_number (2));
5962
5963 Qopen_network_stream = intern ("open-network-stream");
5964 staticpro (&Qopen_network_stream);
9ce27fde 5965 /* Target SERVICE is the fourth argument. */
e0e989f6
KH
5966 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
5967
4ed46869
KH
5968 Qcoding_system = intern ("coding-system");
5969 staticpro (&Qcoding_system);
5970
5971 Qeol_type = intern ("eol-type");
5972 staticpro (&Qeol_type);
5973
5974 Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
5975 staticpro (&Qbuffer_file_coding_system);
5976
5977 Qpost_read_conversion = intern ("post-read-conversion");
5978 staticpro (&Qpost_read_conversion);
5979
5980 Qpre_write_conversion = intern ("pre-write-conversion");
5981 staticpro (&Qpre_write_conversion);
5982
27901516
KH
5983 Qno_conversion = intern ("no-conversion");
5984 staticpro (&Qno_conversion);
5985
5986 Qundecided = intern ("undecided");
5987 staticpro (&Qundecided);
5988
4ed46869
KH
5989 Qcoding_system_p = intern ("coding-system-p");
5990 staticpro (&Qcoding_system_p);
5991
5992 Qcoding_system_error = intern ("coding-system-error");
5993 staticpro (&Qcoding_system_error);
5994
5995 Fput (Qcoding_system_error, Qerror_conditions,
5996 Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
5997 Fput (Qcoding_system_error, Qerror_message,
9ce27fde 5998 build_string ("Invalid coding system"));
4ed46869 5999
d46c5b12
KH
6000 Qcoding_category = intern ("coding-category");
6001 staticpro (&Qcoding_category);
4ed46869
KH
6002 Qcoding_category_index = intern ("coding-category-index");
6003 staticpro (&Qcoding_category_index);
6004
d46c5b12
KH
6005 Vcoding_category_table
6006 = Fmake_vector (make_number (CODING_CATEGORY_IDX_MAX), Qnil);
6007 staticpro (&Vcoding_category_table);
4ed46869
KH
6008 {
6009 int i;
6010 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
6011 {
d46c5b12
KH
6012 XVECTOR (Vcoding_category_table)->contents[i]
6013 = intern (coding_category_name[i]);
6014 Fput (XVECTOR (Vcoding_category_table)->contents[i],
6015 Qcoding_category_index, make_number (i));
4ed46869
KH
6016 }
6017 }
6018
f967223b
KH
6019 Qtranslation_table = intern ("translation-table");
6020 staticpro (&Qtranslation_table);
1397dc18 6021 Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (1));
bdd9fb48 6022
f967223b
KH
6023 Qtranslation_table_id = intern ("translation-table-id");
6024 staticpro (&Qtranslation_table_id);
84fbb8a0 6025
f967223b
KH
6026 Qtranslation_table_for_decode = intern ("translation-table-for-decode");
6027 staticpro (&Qtranslation_table_for_decode);
a5d301df 6028
f967223b
KH
6029 Qtranslation_table_for_encode = intern ("translation-table-for-encode");
6030 staticpro (&Qtranslation_table_for_encode);
a5d301df 6031
70c22245
KH
6032 Qsafe_charsets = intern ("safe-charsets");
6033 staticpro (&Qsafe_charsets);
6034
1397dc18
KH
6035 Qvalid_codes = intern ("valid-codes");
6036 staticpro (&Qvalid_codes);
6037
9ce27fde
KH
6038 Qemacs_mule = intern ("emacs-mule");
6039 staticpro (&Qemacs_mule);
6040
d46c5b12
KH
6041 Qraw_text = intern ("raw-text");
6042 staticpro (&Qraw_text);
6043
4ed46869
KH
6044 defsubr (&Scoding_system_p);
6045 defsubr (&Sread_coding_system);
6046 defsubr (&Sread_non_nil_coding_system);
6047 defsubr (&Scheck_coding_system);
6048 defsubr (&Sdetect_coding_region);
d46c5b12 6049 defsubr (&Sdetect_coding_string);
4ed46869
KH
6050 defsubr (&Sdecode_coding_region);
6051 defsubr (&Sencode_coding_region);
6052 defsubr (&Sdecode_coding_string);
6053 defsubr (&Sencode_coding_string);
6054 defsubr (&Sdecode_sjis_char);
6055 defsubr (&Sencode_sjis_char);
6056 defsubr (&Sdecode_big5_char);
6057 defsubr (&Sencode_big5_char);
1ba9e4ab 6058 defsubr (&Sset_terminal_coding_system_internal);
c4825358 6059 defsubr (&Sset_safe_terminal_coding_system_internal);
4ed46869 6060 defsubr (&Sterminal_coding_system);
1ba9e4ab 6061 defsubr (&Sset_keyboard_coding_system_internal);
4ed46869 6062 defsubr (&Skeyboard_coding_system);
a5d301df 6063 defsubr (&Sfind_operation_coding_system);
1397dc18 6064 defsubr (&Supdate_coding_systems_internal);
66cfb530 6065 defsubr (&Sset_coding_priority_internal);
4ed46869 6066
4608c386
KH
6067 DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
6068 "List of coding systems.\n\
6069\n\
6070Do not alter the value of this variable manually. This variable should be\n\
6071updated by the functions `make-coding-system' and\n\
6072`define-coding-system-alias'.");
6073 Vcoding_system_list = Qnil;
6074
6075 DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
6076 "Alist of coding system names.\n\
6077Each element is one element list of coding system name.\n\
6078This variable is given to `completing-read' as TABLE argument.\n\
6079\n\
6080Do not alter the value of this variable manually. This variable should be\n\
6081updated by the functions `make-coding-system' and\n\
6082`define-coding-system-alias'.");
6083 Vcoding_system_alist = Qnil;
6084
4ed46869
KH
6085 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
6086 "List of coding-categories (symbols) ordered by priority.");
6087 {
6088 int i;
6089
6090 Vcoding_category_list = Qnil;
6091 for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
6092 Vcoding_category_list
d46c5b12
KH
6093 = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
6094 Vcoding_category_list);
4ed46869
KH
6095 }
6096
6097 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
10bff6f1 6098 "Specify the coding system for read operations.\n\
2ebb362d 6099It is useful to bind this variable with `let', but do not set it globally.\n\
4ed46869 6100If the value is a coding system, it is used for decoding on read operation.\n\
a67a9c66 6101If not, an appropriate element is used from one of the coding system alists:\n\
10bff6f1 6102There are three such tables, `file-coding-system-alist',\n\
a67a9c66 6103`process-coding-system-alist', and `network-coding-system-alist'.");
4ed46869
KH
6104 Vcoding_system_for_read = Qnil;
6105
6106 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
10bff6f1 6107 "Specify the coding system for write operations.\n\
928aedd8
RS
6108Programs bind this variable with `let', but you should not set it globally.\n\
6109If the value is a coding system, it is used for encoding of output,\n\
6110when writing it to a file and when sending it to a file or subprocess.\n\
6111\n\
6112If this does not specify a coding system, an appropriate element\n\
6113is used from one of the coding system alists:\n\
10bff6f1 6114There are three such tables, `file-coding-system-alist',\n\
928aedd8
RS
6115`process-coding-system-alist', and `network-coding-system-alist'.\n\
6116For output to files, if the above procedure does not specify a coding system,\n\
6117the value of `buffer-file-coding-system' is used.");
4ed46869
KH
6118 Vcoding_system_for_write = Qnil;
6119
6120 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
a67a9c66 6121 "Coding system used in the latest file or process I/O.");
4ed46869
KH
6122 Vlast_coding_system_used = Qnil;
6123
9ce27fde 6124 DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
f07f4a24 6125 "*Non-nil means always inhibit code conversion of end-of-line format.\n\
94c7a214
DL
6126See info node `Coding Systems' and info node `Text and Binary' concerning\n\
6127such conversion.");
9ce27fde
KH
6128 inhibit_eol_conversion = 0;
6129
ed29121d
EZ
6130 DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
6131 "Non-nil means process buffer inherits coding system of process output.\n\
6132Bind it to t if the process output is to be treated as if it were a file\n\
6133read from some filesystem.");
6134 inherit_process_coding_system = 0;
6135
02ba4723
KH
6136 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
6137 "Alist to decide a coding system to use for a file I/O operation.\n\
6138The format is ((PATTERN . VAL) ...),\n\
6139where PATTERN is a regular expression matching a file name,\n\
6140VAL is a coding system, a cons of coding systems, or a function symbol.\n\
6141If VAL is a coding system, it is used for both decoding and encoding\n\
6142the file contents.\n\
6143If VAL is a cons of coding systems, the car part is used for decoding,\n\
6144and the cdr part is used for encoding.\n\
6145If VAL is a function symbol, the function must return a coding system\n\
6146or a cons of coding systems which are used as above.\n\
e0e989f6 6147\n\
a85a871a 6148See also the function `find-operation-coding-system'\n\
eda284ac 6149and the variable `auto-coding-alist'.");
02ba4723
KH
6150 Vfile_coding_system_alist = Qnil;
6151
6152 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
6153 "Alist to decide a coding system to use for a process I/O operation.\n\
6154The format is ((PATTERN . VAL) ...),\n\
6155where PATTERN is a regular expression matching a program name,\n\
6156VAL is a coding system, a cons of coding systems, or a function symbol.\n\
6157If VAL is a coding system, it is used for both decoding what received\n\
6158from the program and encoding what sent to the program.\n\
6159If VAL is a cons of coding systems, the car part is used for decoding,\n\
6160and the cdr part is used for encoding.\n\
6161If VAL is a function symbol, the function must return a coding system\n\
6162or a cons of coding systems which are used as above.\n\
4ed46869 6163\n\
9ce27fde 6164See also the function `find-operation-coding-system'.");
02ba4723
KH
6165 Vprocess_coding_system_alist = Qnil;
6166
6167 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
6168 "Alist to decide a coding system to use for a network I/O operation.\n\
6169The format is ((PATTERN . VAL) ...),\n\
6170where PATTERN is a regular expression matching a network service name\n\
6171or is a port number to connect to,\n\
6172VAL is a coding system, a cons of coding systems, or a function symbol.\n\
6173If VAL is a coding system, it is used for both decoding what received\n\
6174from the network stream and encoding what sent to the network stream.\n\
6175If VAL is a cons of coding systems, the car part is used for decoding,\n\
6176and the cdr part is used for encoding.\n\
6177If VAL is a function symbol, the function must return a coding system\n\
6178or a cons of coding systems which are used as above.\n\
4ed46869 6179\n\
9ce27fde 6180See also the function `find-operation-coding-system'.");
02ba4723 6181 Vnetwork_coding_system_alist = Qnil;
4ed46869 6182
68c45bf0
PE
6183 DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
6184 "Coding system to use with system messages.");
6185 Vlocale_coding_system = Qnil;
6186
005f0d35 6187 /* The eol mnemonics are reset in startup.el system-dependently. */
7722baf9
EZ
6188 DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
6189 "*String displayed in mode line for UNIX-like (LF) end-of-line format.");
6190 eol_mnemonic_unix = build_string (":");
4ed46869 6191
7722baf9
EZ
6192 DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
6193 "*String displayed in mode line for DOS-like (CRLF) end-of-line format.");
6194 eol_mnemonic_dos = build_string ("\\");
4ed46869 6195
7722baf9
EZ
6196 DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
6197 "*String displayed in mode line for MAC-like (CR) end-of-line format.");
6198 eol_mnemonic_mac = build_string ("/");
4ed46869 6199
7722baf9
EZ
6200 DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
6201 "*String displayed in mode line when end-of-line format is not yet determined.");
6202 eol_mnemonic_undecided = build_string (":");
4ed46869 6203
84fbb8a0 6204 DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
f967223b 6205 "*Non-nil enables character translation while encoding and decoding.");
84fbb8a0 6206 Venable_character_translation = Qt;
bdd9fb48 6207
f967223b
KH
6208 DEFVAR_LISP ("standard-translation-table-for-decode",
6209 &Vstandard_translation_table_for_decode,
84fbb8a0 6210 "Table for translating characters while decoding.");
f967223b 6211 Vstandard_translation_table_for_decode = Qnil;
bdd9fb48 6212
f967223b
KH
6213 DEFVAR_LISP ("standard-translation-table-for-encode",
6214 &Vstandard_translation_table_for_encode,
84fbb8a0 6215 "Table for translating characters while encoding.");
f967223b 6216 Vstandard_translation_table_for_encode = Qnil;
4ed46869
KH
6217
6218 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
6219 "Alist of charsets vs revision numbers.\n\
6220While encoding, if a charset (car part of an element) is found,\n\
6221designate it with the escape sequence identifying revision (cdr part of the element).");
6222 Vcharset_revision_alist = Qnil;
02ba4723
KH
6223
6224 DEFVAR_LISP ("default-process-coding-system",
6225 &Vdefault_process_coding_system,
6226 "Cons of coding systems used for process I/O by default.\n\
6227The car part is used for decoding a process output,\n\
6228the cdr part is used for encoding a text to be sent to a process.");
6229 Vdefault_process_coding_system = Qnil;
c4825358 6230
3f003981
KH
6231 DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
6232 "Table of extra Latin codes in the range 128..159 (inclusive).\n\
c4825358
KH
6233This is a vector of length 256.\n\
6234If Nth element is non-nil, the existence of code N in a file\n\
bb0115a2 6235\(or output of subprocess) doesn't prevent it from being detected as\n\
3f003981
KH
6236a coding system of ISO 2022 variant which has a flag\n\
6237`accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file\n\
c4825358
KH
6238or reading output of a subprocess.\n\
6239Only 128th through 159th elements have a meaning.");
3f003981 6240 Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
d46c5b12
KH
6241
6242 DEFVAR_LISP ("select-safe-coding-system-function",
6243 &Vselect_safe_coding_system_function,
6244 "Function to call to select safe coding system for encoding a text.\n\
6245\n\
6246If set, this function is called to force a user to select a proper\n\
6247coding system which can encode the text in the case that a default\n\
6248coding system used in each operation can't encode the text.\n\
6249\n\
a85a871a 6250The default value is `select-safe-coding-system' (which see).");
d46c5b12
KH
6251 Vselect_safe_coding_system_function = Qnil;
6252
4ed46869
KH
6253}
6254
68c45bf0
PE
6255char *
6256emacs_strerror (error_number)
6257 int error_number;
6258{
6259 char *str;
6260
ca9c0567 6261 synchronize_system_messages_locale ();
68c45bf0
PE
6262 str = strerror (error_number);
6263
6264 if (! NILP (Vlocale_coding_system))
6265 {
6266 Lisp_Object dec = code_convert_string_norecord (build_string (str),
6267 Vlocale_coding_system,
6268 0);
6269 str = (char *) XSTRING (dec)->data;
6270 }
6271
6272 return str;
6273}
6274
4ed46869 6275#endif /* emacs */
c2f94ebc 6276