*** empty log message ***
[bpt/emacs.git] / src / coding.c
CommitLineData
4ed46869 1/* Coding system handler (conversion, detection, and etc).
4a2f9c6a 2 Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
203cb916 3 Licensed to the Free Software Foundation.
4ed46869 4
369314dc
KH
5This file is part of GNU Emacs.
6
7GNU Emacs is free software; you can redistribute it and/or modify
8it under the terms of the GNU General Public License as published by
9the Free Software Foundation; either version 2, or (at your option)
10any later version.
4ed46869 11
369314dc
KH
12GNU Emacs is distributed in the hope that it will be useful,
13but WITHOUT ANY WARRANTY; without even the implied warranty of
14MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15GNU General Public License for more details.
4ed46869 16
369314dc
KH
17You should have received a copy of the GNU General Public License
18along with GNU Emacs; see the file COPYING. If not, write to
19the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20Boston, MA 02111-1307, USA. */
4ed46869
KH
21
22/*** TABLE OF CONTENTS ***
23
b73bfc1c 24 0. General comments
4ed46869 25 1. Preamble
0ef69138 26 2. Emacs' internal format (emacs-mule) handlers
4ed46869
KH
27 3. ISO2022 handlers
28 4. Shift-JIS and BIG5 handlers
1397dc18
KH
29 5. CCL handlers
30 6. End-of-line handlers
31 7. C library functions
32 8. Emacs Lisp library functions
33 9. Post-amble
4ed46869
KH
34
35*/
36
b73bfc1c
KH
37/*** 0. General comments ***/
38
39
4ed46869
KH
40/*** GENERAL NOTE on CODING SYSTEM ***
41
42 Coding system is an encoding mechanism of one or more character
43 sets. Here's a list of coding systems which Emacs can handle. When
44 we say "decode", it means converting some other coding system to
0ef69138
KH
45 Emacs' internal format (emacs-mule), and when we say "encode",
46 it means converting the coding system emacs-mule to some other
47 coding system.
4ed46869 48
0ef69138 49 0. Emacs' internal format (emacs-mule)
4ed46869
KH
50
51 Emacs itself holds a multi-lingual character in a buffer and a string
f4dee582 52 in a special format. Details are described in section 2.
4ed46869
KH
53
54 1. ISO2022
55
56 The most famous coding system for multiple character sets. X's
f4dee582
RS
57 Compound Text, various EUCs (Extended Unix Code), and coding
58 systems used in Internet communication such as ISO-2022-JP are
59 all variants of ISO2022. Details are described in section 3.
4ed46869
KH
60
61 2. SJIS (or Shift-JIS or MS-Kanji-Code)
62
63 A coding system to encode character sets: ASCII, JISX0201, and
64 JISX0208. Widely used for PC's in Japan. Details are described in
f4dee582 65 section 4.
4ed46869
KH
66
67 3. BIG5
68
69 A coding system to encode character sets: ASCII and Big5. Widely
70 used by Chinese (mainly in Taiwan and Hong Kong). Details are
f4dee582
RS
71 described in section 4. In this file, when we write "BIG5"
72 (all uppercase), we mean the coding system, and when we write
73 "Big5" (capitalized), we mean the character set.
4ed46869 74
27901516
KH
75 4. Raw text
76
4608c386
KH
77 A coding system for a text containing random 8-bit code. Emacs does
78 no code conversion on such a text except for end-of-line format.
27901516
KH
79
80 5. Other
4ed46869 81
f4dee582 82 If a user wants to read/write a text encoded in a coding system not
4ed46869
KH
83 listed above, he can supply a decoder and an encoder for it in CCL
84 (Code Conversion Language) programs. Emacs executes the CCL program
85 while reading/writing.
86
d46c5b12
KH
87 Emacs represents a coding system by a Lisp symbol that has a property
88 `coding-system'. But, before actually using the coding system, the
4ed46869 89 information about it is set in a structure of type `struct
f4dee582 90 coding_system' for rapid processing. See section 6 for more details.
4ed46869
KH
91
92*/
93
94/*** GENERAL NOTES on END-OF-LINE FORMAT ***
95
96 How end-of-line of a text is encoded depends on a system. For
97 instance, Unix's format is just one byte of `line-feed' code,
f4dee582 98 whereas DOS's format is two-byte sequence of `carriage-return' and
d46c5b12
KH
99 `line-feed' codes. MacOS's format is usually one byte of
100 `carriage-return'.
4ed46869 101
f4dee582
RS
102 Since text characters encoding and end-of-line encoding are
103 independent, any coding system described above can take
4ed46869 104 any format of end-of-line. So, Emacs has information of format of
f4dee582 105 end-of-line in each coding-system. See section 6 for more details.
4ed46869
KH
106
107*/
108
109/*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
110
111 These functions check if a text between SRC and SRC_END is encoded
112 in the coding system category XXX. Each returns an integer value in
113 which appropriate flag bits for the category XXX are set. The flag
114 bits are defined in macros CODING_CATEGORY_MASK_XXX. Below is the
115 template of these functions. */
116#if 0
117int
0ef69138 118detect_coding_emacs_mule (src, src_end)
4ed46869
KH
119 unsigned char *src, *src_end;
120{
121 ...
122}
123#endif
124
125/*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
126
b73bfc1c
KH
127 These functions decode SRC_BYTES length of unibyte text at SOURCE
128 encoded in CODING to Emacs' internal format. The resulting
129 multibyte text goes to a place pointed to by DESTINATION, the length
130 of which should not exceed DST_BYTES.
d46c5b12 131
b73bfc1c
KH
132 These functions set the information of original and decoded texts in
133 the members produced, produced_char, consumed, and consumed_char of
134 the structure *CODING. They also set the member result to one of
135 CODING_FINISH_XXX indicating how the decoding finished.
d46c5b12
KH
136
137 DST_BYTES zero means that source area and destination area are
138 overlapped, which means that we can produce a decoded text until it
139 reaches at the head of not-yet-decoded source text.
140
141 Below is a template of these functions. */
4ed46869 142#if 0
b73bfc1c 143static void
d46c5b12 144decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
4ed46869
KH
145 struct coding_system *coding;
146 unsigned char *source, *destination;
147 int src_bytes, dst_bytes;
4ed46869
KH
148{
149 ...
150}
151#endif
152
153/*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
154
0ef69138 155 These functions encode SRC_BYTES length text at SOURCE of Emacs'
b73bfc1c
KH
156 internal multibyte format to CODING. The resulting unibyte text
157 goes to a place pointed to by DESTINATION, the length of which
158 should not exceed DST_BYTES.
d46c5b12 159
b73bfc1c
KH
160 These functions set the information of original and encoded texts in
161 the members produced, produced_char, consumed, and consumed_char of
162 the structure *CODING. They also set the member result to one of
163 CODING_FINISH_XXX indicating how the encoding finished.
d46c5b12
KH
164
165 DST_BYTES zero means that source area and destination area are
b73bfc1c
KH
166 overlapped, which means that we can produce an encoded text until it
167 reaches at the head of not-yet-encoded source text.
d46c5b12
KH
168
169 Below is a template of these functions. */
4ed46869 170#if 0
b73bfc1c 171static void
d46c5b12 172encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
4ed46869
KH
173 struct coding_system *coding;
174 unsigned char *source, *destination;
175 int src_bytes, dst_bytes;
4ed46869
KH
176{
177 ...
178}
179#endif
180
181/*** COMMONLY USED MACROS ***/
182
b73bfc1c
KH
183/* The following two macros ONE_MORE_BYTE and TWO_MORE_BYTES safely
184 get one and two bytes from the source text respectively.
185 If there are not enough bytes in the source, they jump to
186 `label_end_of_loop'. The caller should set variables `coding',
187 `src' and `src_end' to appropriate pointer in advance. These
188 macros are called from decoding routines `decode_coding_XXX', thus
189 it is assumed that the source text is unibyte. */
4ed46869 190
b73bfc1c
KH
/* Fetch the next source byte into C1 and advance `src' past it.
   When no byte is left (`src' has reached `src_end'), store
   CODING_FINISH_INSUFFICIENT_SRC in `coding->result' and jump to
   `label_end_of_loop' instead.  */
#define ONE_MORE_BYTE(c1)                                       \
  do {                                                          \
    if (src < src_end)                                          \
      c1 = *src++;                                              \
    else                                                        \
      {                                                         \
        coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
        goto label_end_of_loop;                                 \
      }                                                         \
  } while (0)
200
b73bfc1c
KH
/* Fetch the next two source bytes into C1 and C2 (in that order) and
   advance `src' past both.  When fewer than two bytes remain before
   `src_end', nothing is read: CODING_FINISH_INSUFFICIENT_SRC is
   stored in `coding->result' and control jumps to
   `label_end_of_loop'.  */
#define TWO_MORE_BYTES(c1, c2)                                  \
  do {                                                          \
    if (src_end - src < 2)                                      \
      {                                                         \
        coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
        goto label_end_of_loop;                                 \
      }                                                         \
    else                                                        \
      {                                                         \
        c1 = *src++;                                            \
        c2 = *src++;                                            \
      }                                                         \
  } while (0)
211
4ed46869 212
b73bfc1c
KH
213/* Set C to the next character at the source text pointed by `src'.
214 If there are not enough characters in the source, jump to
215 `label_end_of_loop'. The caller should set variables `coding',
216 `src', `src_end', and `translation_table' to appropriate pointers
217 in advance. This macro is used in encoding routines
218 `encode_coding_XXX', thus it assumes that the source text is in
219 multibyte form except for 8-bit characters. 8-bit characters are
220 in multibyte form if coding->src_multibyte is nonzero, else they
221 are represented by a single byte. */
4ed46869 222
b73bfc1c
KH
223#define ONE_MORE_CHAR(c) \
224 do { \
/* LEN is the number of source bytes not yet read. */ \
225 int len = src_end - src; \
226 int bytes; \
/* Nothing left: record the shortage and leave the caller's loop. */ \
227 if (len <= 0) \
228 { \
229 coding->result = CODING_FINISH_INSUFFICIENT_SRC; \
230 goto label_end_of_loop; \
231 } \
/* Read a whole multibyte character when the source is multibyte or \
   UNIBYTE_STR_AS_MULTIBYTE_P accepts the bytes at `src'; otherwise \
   take a single raw byte.  BYTES ends up as the length consumed. */ \
232 if (coding->src_multibyte \
233 || UNIBYTE_STR_AS_MULTIBYTE_P (src, len, bytes)) \
234 c = STRING_CHAR_AND_LENGTH (src, len, bytes); \
235 else \
236 c = *src, bytes = 1; \
/* Map C through the caller's translation table when one is given. */ \
237 if (!NILP (translation_table)) \
238 c = translate_char (translation_table, c, 0, 0, 0); \
/* Step past the bytes that made up this character. */ \
239 src += bytes; \
4ed46869
KH
240 } while (0)
241
4ed46869 242
b73bfc1c
KH
243/* Produce a multibyte form of character C to `dst'. Jump to
244 `label_end_of_loop' if there's not enough space at `dst', after
 setting coding->result to CODING_FINISH_INSUFFICIENT_DST. The
 space limit is `dst_end' when `dst_bytes' is nonzero, else `src'.
245
246 If we are now in the middle of a composition sequence, the decoded
247 character may be ALTCHAR (for the current composition). In that
248 case, the character goes to coding->cmp_data->data instead of
249 `dst'.
 Concretely: C is written to `dst' (and coding->produced_char is
 incremented) when no composition is in progress or the composition
 is COMPOSITION_RELATIVE or COMPOSITION_WITH_RULE. C is recorded
 with CODING_ADD_COMPOSITION_COMPONENT when a composition other than
 COMPOSITION_RELATIVE is in progress, so for COMPOSITION_WITH_RULE
 both actions happen. In the latter branch,
 coding->composition_rule_follows is set to nonzero unless the
 composition is COMPOSITION_WITH_ALTCHARS.
250
251 This macro is used in decoding routines. */
252
253#define EMIT_CHAR(c) \
4ed46869 254 do { \
b73bfc1c
KH
255 if (! COMPOSING_P (coding) \
256 || coding->composing == COMPOSITION_RELATIVE \
257 || coding->composing == COMPOSITION_WITH_RULE) \
258 { \
259 int bytes = CHAR_BYTES (c); \
260 if ((dst + bytes) > (dst_bytes ? dst_end : src)) \
261 { \
262 coding->result = CODING_FINISH_INSUFFICIENT_DST; \
263 goto label_end_of_loop; \
264 } \
265 dst += CHAR_STRING (c, dst); \
266 coding->produced_char++; \
267 } \
ec6d2bb8 268 \
b73bfc1c
KH
269 if (COMPOSING_P (coding) \
270 && coding->composing != COMPOSITION_RELATIVE) \
271 { \
272 CODING_ADD_COMPOSITION_COMPONENT (coding, c); \
273 coding->composition_rule_follows \
274 = coding->composing != COMPOSITION_WITH_ALTCHARS; \
275 } \
4ed46869
KH
276 } while (0)
277
4ed46869 278
b73bfc1c
KH
/* Store the byte C at `dst' and advance `dst' by one.  The space
   limit is `dst_end' when `dst_bytes' is nonzero, else `src'.  When
   `dst' has already reached that limit, nothing is stored:
   CODING_FINISH_INSUFFICIENT_DST goes to `coding->result' and
   control jumps to `label_end_of_loop'.  */
#define EMIT_ONE_BYTE(c)                                        \
  do {                                                          \
    if (dst < (dst_bytes ? dst_end : src))                      \
      *dst++ = c;                                               \
    else                                                        \
      {                                                         \
        coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
        goto label_end_of_loop;                                 \
      }                                                         \
  } while (0)
288
/* Store the bytes C1 and C2, in that order, at `dst' and advance
   `dst' by two.  The space limit is `dst_end' when `dst_bytes' is
   nonzero, else `src'.  When fewer than two bytes fit, nothing is
   stored: CODING_FINISH_INSUFFICIENT_DST goes to `coding->result'
   and control jumps to `label_end_of_loop'.  */
#define EMIT_TWO_BYTES(c1, c2)                                  \
  do {                                                          \
    if (dst + 2 <= (dst_bytes ? dst_end : src))                 \
      {                                                         \
        *dst++ = c1;                                            \
        *dst++ = c2;                                            \
      }                                                         \
    else                                                        \
      {                                                         \
        coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
        goto label_end_of_loop;                                 \
      }                                                         \
  } while (0)
298
/* Copy the bytes in the range [FROM, TO) to `dst', advancing both
   FROM and `dst'.  FROM must be a modifiable pointer variable; on
   success it ends up equal to TO.  The space limit is `dst_end' when
   `dst_bytes' is nonzero, else `src'.  When the whole range does not
   fit, nothing is copied: CODING_FINISH_INSUFFICIENT_DST goes to
   `coding->result' and control jumps to `label_end_of_loop'.

   Both arguments are parenthesized in the expansion so that an
   argument such as `cond ? p : q' is not regrouped by the
   surrounding `-' and `<' operators.  TO is still evaluated more
   than once, so it must not have side effects.  */
#define EMIT_BYTES(from, to)                                    \
  do {                                                          \
    if (dst + ((to) - (from)) > (dst_bytes ? dst_end : src))    \
      {                                                         \
        coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
        goto label_end_of_loop;                                 \
      }                                                         \
    while ((from) < (to))                                       \
      *dst++ = *(from)++;                                       \
  } while (0)
309
310\f
311/*** 1. Preamble ***/
312
68c45bf0
PE
313#ifdef emacs
314#include <config.h>
315#endif
316
4ed46869
KH
317#include <stdio.h>
318
319#ifdef emacs
320
4ed46869
KH
321#include "lisp.h"
322#include "buffer.h"
323#include "charset.h"
ec6d2bb8 324#include "composite.h"
4ed46869
KH
325#include "ccl.h"
326#include "coding.h"
327#include "window.h"
328
329#else /* not emacs */
330
331#include "mulelib.h"
332
333#endif /* not emacs */
334
335Lisp_Object Qcoding_system, Qeol_type;
336Lisp_Object Qbuffer_file_coding_system;
337Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
27901516 338Lisp_Object Qno_conversion, Qundecided;
bb0115a2 339Lisp_Object Qcoding_system_history;
70c22245 340Lisp_Object Qsafe_charsets;
1397dc18 341Lisp_Object Qvalid_codes;
4ed46869
KH
342
343extern Lisp_Object Qinsert_file_contents, Qwrite_region;
344Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
345Lisp_Object Qstart_process, Qopen_network_stream;
346Lisp_Object Qtarget_idx;
347
d46c5b12
KH
348Lisp_Object Vselect_safe_coding_system_function;
349
7722baf9
EZ
350/* Mnemonic string for each format of end-of-line. */
351Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
352/* Mnemonic string to indicate format of end-of-line is not yet
4ed46869 353 decided. */
7722baf9 354Lisp_Object eol_mnemonic_undecided;
4ed46869 355
9ce27fde
KH
356/* Format of end-of-line decided by system. This is CODING_EOL_LF on
357 Unix, CODING_EOL_CRLF on DOS/Windows, and CODING_EOL_CR on Mac. */
358int system_eol_type;
359
4ed46869
KH
360#ifdef emacs
361
4608c386
KH
362Lisp_Object Vcoding_system_list, Vcoding_system_alist;
363
364Lisp_Object Qcoding_system_p, Qcoding_system_error;
4ed46869 365
d46c5b12
KH
366/* Coding system emacs-mule and raw-text are for converting only
367 end-of-line format. */
368Lisp_Object Qemacs_mule, Qraw_text;
9ce27fde 369
4ed46869
KH
370/* Coding-systems are handed between Emacs Lisp programs and C internal
371 routines by the following three variables. */
372/* Coding-system for reading files and receiving data from process. */
373Lisp_Object Vcoding_system_for_read;
374/* Coding-system for writing files and sending data to process. */
375Lisp_Object Vcoding_system_for_write;
376/* Coding-system actually used in the latest I/O. */
377Lisp_Object Vlast_coding_system_used;
378
c4825358 379/* A vector of length 256 which contains information about special
94487c4e 380 Latin codes (especially for dealing with Microsoft codes). */
3f003981 381Lisp_Object Vlatin_extra_code_table;
c4825358 382
9ce27fde
KH
383/* Flag to inhibit code conversion of end-of-line format. */
384int inhibit_eol_conversion;
385
ed29121d
EZ
386/* Flag to make buffer-file-coding-system inherit from process-coding. */
387int inherit_process_coding_system;
388
c4825358 389/* Coding system to be used to encode text for terminal display. */
4ed46869
KH
390struct coding_system terminal_coding;
391
c4825358
KH
392/* Coding system to be used to encode text for terminal display when
393 terminal coding system is nil. */
394struct coding_system safe_terminal_coding;
395
396/* Coding system of what is sent from terminal keyboard. */
4ed46869
KH
397struct coding_system keyboard_coding;
398
6bc51348
KH
399/* Default coding system to be used to write a file. */
400struct coding_system default_buffer_file_coding;
401
02ba4723
KH
402Lisp_Object Vfile_coding_system_alist;
403Lisp_Object Vprocess_coding_system_alist;
404Lisp_Object Vnetwork_coding_system_alist;
4ed46869 405
68c45bf0
PE
406Lisp_Object Vlocale_coding_system;
407
4ed46869
KH
408#endif /* emacs */
409
d46c5b12 410Lisp_Object Qcoding_category, Qcoding_category_index;
4ed46869
KH
411
412/* List of symbols `coding-category-xxx' ordered by priority. */
413Lisp_Object Vcoding_category_list;
414
d46c5b12
KH
415/* Table of coding categories (Lisp symbols). */
416Lisp_Object Vcoding_category_table;
4ed46869
KH
417
418/* Table of names of symbol for each coding-category. */
419char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
0ef69138 420 "coding-category-emacs-mule",
4ed46869
KH
421 "coding-category-sjis",
422 "coding-category-iso-7",
d46c5b12 423 "coding-category-iso-7-tight",
4ed46869
KH
424 "coding-category-iso-8-1",
425 "coding-category-iso-8-2",
7717c392
KH
426 "coding-category-iso-7-else",
427 "coding-category-iso-8-else",
89fa8b36 428 "coding-category-ccl",
4ed46869 429 "coding-category-big5",
fa42c37f
KH
430 "coding-category-utf-8",
431 "coding-category-utf-16-be",
432 "coding-category-utf-16-le",
27901516 433 "coding-category-raw-text",
89fa8b36 434 "coding-category-binary"
4ed46869
KH
435};
436
66cfb530 437/* Table of pointers to coding systems corresponding to each coding
d46c5b12
KH
438 categories. */
439struct coding_system *coding_system_table[CODING_CATEGORY_IDX_MAX];
440
66cfb530
KH
441/* Table of coding category masks. Nth element is a mask for a coding
442 category of which priority is Nth. */
443static
444int coding_priorities[CODING_CATEGORY_IDX_MAX];
445
f967223b
KH
446/* Flag to tell if we look up translation table on character code
447 conversion. */
84fbb8a0 448Lisp_Object Venable_character_translation;
f967223b
KH
449/* Standard translation table to look up on decoding (reading). */
450Lisp_Object Vstandard_translation_table_for_decode;
451/* Standard translation table to look up on encoding (writing). */
452Lisp_Object Vstandard_translation_table_for_encode;
84fbb8a0 453
f967223b
KH
454Lisp_Object Qtranslation_table;
455Lisp_Object Qtranslation_table_id;
456Lisp_Object Qtranslation_table_for_decode;
457Lisp_Object Qtranslation_table_for_encode;
4ed46869
KH
458
459/* Alist of charsets vs revision number. */
460Lisp_Object Vcharset_revision_alist;
461
02ba4723
KH
462/* Default coding systems used for process I/O. */
463Lisp_Object Vdefault_process_coding_system;
464
b843d1ae
KH
465/* Global flag to tell that we can't call post-read-conversion and
466 pre-write-conversion functions. Usually the value is zero, but it
467 is set to 1 temporarily while such functions are running. This is
468 to avoid infinite recursive call. */
469static int inhibit_pre_post_conversion;
470
4ed46869 471\f
0ef69138 472/*** 2. Emacs internal format (emacs-mule) handlers ***/
4ed46869
KH
473
474/* Emacs' internal format for encoding multiple character sets is a
f4dee582 475 kind of multi-byte encoding, i.e. characters are encoded by
b73bfc1c
KH
476 variable-length sequences of one-byte codes.
477
478 ASCII characters and control characters (e.g. `tab', `newline') are
479 represented by one-byte sequences which are their ASCII codes, in
480 the range 0x00 through 0x7F.
481
482 8-bit characters of the range 0x80..0x9F are represented by
483 two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
484 code + 0x20).
485
486 8-bit characters of the range 0xA0..0xFF are represented by
487 one-byte sequences which are their 8-bit code.
488
489 The other characters are represented by a sequence of `base
490 leading-code', optional `extended leading-code', and one or two
491 `position-code's. The length of the sequence is determined by the
492 base leading-code. Leading-code takes the range 0x80 through 0x9F,
493 whereas extended leading-code and position-code take the range 0xA0
494 through 0xFF. See `charset.h' for more details about leading-code
495 and position-code.
f4dee582 496
4ed46869 497 --- CODE RANGE of Emacs' internal format ---
b73bfc1c
KH
498 character set range
499 ------------- -----
500 ascii 0x00..0x7F
501 eight-bit-control LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
502 eight-bit-graphic 0xA0..0xFF
503 ELSE 0x81..0x9F + [0xA0..0xFF]+
4ed46869
KH
504 ---------------------------------------------
505
506 */
507
508enum emacs_code_class_type emacs_code_class[256];
509
4ed46869
KH
510/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
511 Check if a text is encoded in Emacs' internal format. If it is,
d46c5b12 512 return CODING_CATEGORY_MASK_EMACS_MULE, else return 0. */
4ed46869
KH
513
514int
0ef69138 515detect_coding_emacs_mule (src, src_end)
b73bfc1c 516 unsigned char *src, *src_end;
4ed46869
KH
517{
518 unsigned char c;
519 int composing = 0;
b73bfc1c
KH
520 /* Dummy for ONE_MORE_BYTE. */
521 struct coding_system dummy_coding;
522 struct coding_system *coding = &dummy_coding;
4ed46869 523
b73bfc1c 524 while (1)
4ed46869 525 {
b73bfc1c 526 ONE_MORE_BYTE (c);
4ed46869
KH
527
528 if (composing)
529 {
530 if (c < 0xA0)
531 composing = 0;
b73bfc1c
KH
532 else if (c == 0xA0)
533 {
534 ONE_MORE_BYTE (c);
535 c &= 0x7F;
536 }
4ed46869
KH
537 else
538 c -= 0x20;
539 }
540
b73bfc1c 541 if (c < 0x20)
4ed46869 542 {
4ed46869
KH
543 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
544 return 0;
b73bfc1c
KH
545 }
546 else if (c >= 0x80 && c < 0xA0)
547 {
548 if (c == 0x80)
549 /* Old leading code for a composite character. */
550 composing = 1;
551 else
552 {
553 unsigned char *src_base = src - 1;
554 int bytes;
4ed46869 555
b73bfc1c
KH
556 if (!UNIBYTE_STR_AS_MULTIBYTE_P (src_base, src_end - src_base,
557 bytes))
558 return 0;
559 src = src_base + bytes;
560 }
561 }
562 }
563 label_end_of_loop:
564 return CODING_CATEGORY_MASK_EMACS_MULE;
565}
4ed46869 566
4ed46869 567
b73bfc1c 568/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
4ed46869 569
b73bfc1c
KH
570static void
571decode_coding_emacs_mule (coding, source, destination, src_bytes, dst_bytes)
572 struct coding_system *coding;
573 unsigned char *source, *destination;
574 int src_bytes, dst_bytes;
575{
576 unsigned char *src = source;
577 unsigned char *src_end = source + src_bytes;
578 unsigned char *dst = destination;
579 unsigned char *dst_end = destination + dst_bytes;
580 /* SRC_BASE remembers the start position in source in each loop.
581 The loop will be exited when there's not enough source code, or
582 when there's not enough destination area to produce a
583 character. */
584 unsigned char *src_base;
4ed46869 585
b73bfc1c
KH
586 coding->produced_char = 0;
587 while (src < src_end)
588 {
589 unsigned char tmp[MAX_MULTIBYTE_LENGTH], *p;
590 int bytes;
ec6d2bb8 591
b73bfc1c
KH
592 src_base = src;
593 if (UNIBYTE_STR_AS_MULTIBYTE_P (src, src_end - src, bytes))
594 {
595 p = src;
596 src += bytes;
597 }
598 else
599 {
600 bytes = CHAR_STRING (*src, tmp);
601 p = tmp;
602 src++;
603 }
604 if (dst + bytes >= (dst_bytes ? dst_end : src))
605 {
606 coding->result = CODING_FINISH_INSUFFICIENT_DST;
4ed46869
KH
607 break;
608 }
b73bfc1c
KH
609 while (bytes--) *dst++ = *p++;
610 coding->produced_char++;
4ed46869 611 }
b73bfc1c
KH
612 coding->consumed = coding->consumed_char = src_base - source;
613 coding->produced = dst - destination;
4ed46869
KH
614}
615
b73bfc1c
KH
/* Encoding to emacs-mule is handed to encode_eol with the same
   arguments, unchanged and in the same order.  */
616#define encode_coding_emacs_mule(coding, source, destination, src_bytes, dst_bytes) \
617 encode_eol (coding, source, destination, src_bytes, dst_bytes)
618
619
4ed46869
KH
620\f
621/*** 3. ISO2022 handlers ***/
622
623/* The following note describes the coding system ISO2022 briefly.
39787efd
KH
624 Since the intention of this note is to help understand the
625 functions in this file, some parts are NOT ACCURATE or OVERLY
626 SIMPLIFIED. For thorough understanding, please refer to the
4ed46869
KH
627 original document of ISO2022.
628
629 ISO2022 provides many mechanisms to encode several character sets
39787efd
KH
630 in 7-bit and 8-bit environments. For 7-bit environments, all text
631 is encoded using bytes less than 128. This may make the encoded
632 text a little bit longer, but the text passes more easily through
633 several gateways, some of which strip off MSB (Most Significant Bit).
b73bfc1c 634
39787efd 635 There are two kinds of character sets: control character set and
4ed46869
KH
636 graphic character set. The former contains control characters such
637 as `newline' and `escape' to provide control functions (control
39787efd
KH
638 functions are also provided by escape sequences). The latter
639 contains graphic characters such as 'A' and '-'. Emacs recognizes
4ed46869
KH
640 two control character sets and many graphic character sets.
641
642 Graphic character sets are classified into one of the following
39787efd
KH
643 four classes, according to the number of bytes (DIMENSION) and
644 number of characters in one dimension (CHARS) of the set:
645 - DIMENSION1_CHARS94
646 - DIMENSION1_CHARS96
647 - DIMENSION2_CHARS94
648 - DIMENSION2_CHARS96
649
650 In addition, each character set is assigned an identification tag,
651 unique for each set, called "final character" (denoted as <F>
652 hereafter). The <F> of each character set is decided by ECMA(*)
653 when it is registered in ISO. The code range of <F> is 0x30..0x7F
654 (0x30..0x3F are for private use only).
4ed46869
KH
655
656 Note (*): ECMA = European Computer Manufacturers Association
657
658 Here are examples of graphic character set [NAME(<F>)]:
659 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
660 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
661 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
662 o DIMENSION2_CHARS96 -- none for the moment
663
39787efd 664 A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
4ed46869
KH
665 C0 [0x00..0x1F] -- control character plane 0
666 GL [0x20..0x7F] -- graphic character plane 0
667 C1 [0x80..0x9F] -- control character plane 1
668 GR [0xA0..0xFF] -- graphic character plane 1
669
670 A control character set is directly designated and invoked to C0 or
39787efd
KH
671 C1 by an escape sequence. The most common case is that:
672 - ISO646's control character set is designated/invoked to C0, and
673 - ISO6429's control character set is designated/invoked to C1,
674 and usually these designations/invocations are omitted in encoded
675 text. In a 7-bit environment, only C0 can be used, and a control
676 character for C1 is encoded by an appropriate escape sequence to
677 fit into the environment. All control characters for C1 are
678 defined to have corresponding escape sequences.
4ed46869
KH
679
680 A graphic character set is at first designated to one of four
681 graphic registers (G0 through G3), then these graphic registers are
682 invoked to GL or GR. These designations and invocations can be
683 done independently. The most common case is that G0 is invoked to
39787efd
KH
684 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
685 these invocations and designations are omitted in encoded text.
686 In a 7-bit environment, only GL can be used.
4ed46869 687
39787efd
KH
688 When a graphic character set of CHARS94 is invoked to GL, codes
689 0x20 and 0x7F of the GL area work as control characters SPACE and
690 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
691 be used.
4ed46869
KH
692
693 There are two ways of invocation: locking-shift and single-shift.
694 With locking-shift, the invocation lasts until the next different
39787efd
KH
695 invocation, whereas with single-shift, the invocation affects the
696 following character only and doesn't affect the locking-shift
697 state. Invocations are done by the following control characters or
698 escape sequences:
4ed46869
KH
699
700 ----------------------------------------------------------------------
39787efd 701 abbrev function cntrl escape seq description
4ed46869 702 ----------------------------------------------------------------------
39787efd
KH
703 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
704 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
705 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
706 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
707 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
708 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
709 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
710 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
711 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
4ed46869 712 ----------------------------------------------------------------------
39787efd
KH
713 (*) These are not used by any known coding system.
714
715 Control characters for these functions are defined by macros
716 ISO_CODE_XXX in `coding.h'.
4ed46869 717
39787efd 718 Designations are done by the following escape sequences:
4ed46869
KH
719 ----------------------------------------------------------------------
720 escape sequence description
721 ----------------------------------------------------------------------
722 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
723 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
724 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
725 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
726 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
727 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
728 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
729 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
730 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
731 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
732 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
733 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
734 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
735 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
736 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
737 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
738 ----------------------------------------------------------------------
739
740 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
39787efd 741 of dimension 1, chars 94, and final character <F>, etc...
4ed46869
KH
742
743 Note (*): Although these designations are not allowed in ISO2022,
744 Emacs accepts them on decoding, and produces them on encoding
39787efd 745 CHARS96 character sets in a coding system which is characterized as
4ed46869
KH
746 7-bit environment, non-locking-shift, and non-single-shift.
747
748 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
39787efd 749 '(' can be omitted. We refer to this as "short-form" hereafter.
4ed46869
KH
750
751 Now you may notice that there are a lot of ways for encoding the
39787efd
KH
752 same multilingual text in ISO2022. Actually, there exist many
753 coding systems such as Compound Text (used in X11's inter client
754 communication), ISO-2022-JP (used in Japanese internet), ISO-2022-KR
755 (used in Korean internet), EUC (Extended UNIX Code, used in Asian
4ed46869
KH
756 localized platforms), and all of these are variants of ISO2022.
757
758 In addition to the above, Emacs handles two more kinds of escape
759 sequences: ISO6429's direction specification and Emacs' private
760 sequence for specifying character composition.
761
39787efd 762 ISO6429's direction specification takes the following form:
4ed46869
KH
763 o CSI ']' -- end of the current direction
764 o CSI '0' ']' -- end of the current direction
765 o CSI '1' ']' -- start of left-to-right text
766 o CSI '2' ']' -- start of right-to-left text
767 The control character CSI (0x9B: control sequence introducer) is
39787efd
KH
768 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
769
770 Character composition specification takes the following form:
ec6d2bb8
KH
771 o ESC '0' -- start relative composition
772 o ESC '1' -- end composition
773 o ESC '2' -- start rule-base composition (*)
774 o ESC '3' -- start relative composition with alternate chars (**)
775 o ESC '4' -- start rule-base composition with alternate chars (**)
b73bfc1c
KH
776 Since these are not standard escape sequences of any ISO standard,
777 the use of them with these meanings is restricted to Emacs only.
ec6d2bb8 778
b73bfc1c
KH
779 (*) This form is used only in Emacs 20.5 and the older versions,
780 but the newer versions can safely decode it.
781 (**) This form is used only in Emacs 21.1 and the newer versions,
782 and the older versions can't decode it.
ec6d2bb8 783
b73bfc1c
KH
784 Here's a list of example usages of these composition escape
785 sequences (categorized by `enum composition_method').
ec6d2bb8 786
b73bfc1c 787 COMPOSITION_RELATIVE:
ec6d2bb8 788 ESC 0 CHAR [ CHAR ] ESC 1
b73bfc1c 789 COMPOSITION_WITH_RULE:
ec6d2bb8 790 ESC 2 CHAR [ RULE CHAR ] ESC 1
b73bfc1c 791 COMPOSITION_WITH_ALTCHARS:
ec6d2bb8 792 ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
b73bfc1c 793 COMPOSITION_WITH_RULE_ALTCHARS:
ec6d2bb8 794 ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
4ed46869
KH
795
796enum iso_code_class_type iso_code_class[256];
797
f024b6aa
RS
798#define CHARSET_OK(idx, charset) \
799 (coding_system_table[idx] \
800 && (coding_system_table[idx]->safe_charsets[charset] \
801 || (CODING_SPEC_ISO_REQUESTED_DESIGNATION \
802 (coding_system_table[idx], charset) \
803 != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)))
d46c5b12
KH
804
805#define SHIFT_OUT_OK(idx) \
806 (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding_system_table[idx], 1) >= 0)
807
4ed46869
KH
808/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
809 Check if a text is encoded in ISO2022. If it is, returns an
810 integer in which appropriate flag bits any of:
811 CODING_CATEGORY_MASK_ISO_7
d46c5b12 812 CODING_CATEGORY_MASK_ISO_7_TIGHT
4ed46869
KH
813 CODING_CATEGORY_MASK_ISO_8_1
814 CODING_CATEGORY_MASK_ISO_8_2
7717c392
KH
815 CODING_CATEGORY_MASK_ISO_7_ELSE
816 CODING_CATEGORY_MASK_ISO_8_ELSE
4ed46869
KH
817 are set. If a code which should never appear in ISO2022 is found,
818 returns 0. */
819
820int
821detect_coding_iso2022 (src, src_end)
822 unsigned char *src, *src_end;
823{
d46c5b12
KH
824 int mask = CODING_CATEGORY_MASK_ISO;
825 int mask_found = 0;
f46869e4 826 int reg[4], shift_out = 0, single_shifting = 0;
d46c5b12 827 int c, c1, i, charset;
b73bfc1c
KH
828 /* Dummy for ONE_MORE_BYTE. */
829 struct coding_system dummy_coding;
830 struct coding_system *coding = &dummy_coding;
3f003981 831
d46c5b12 832 reg[0] = CHARSET_ASCII, reg[1] = reg[2] = reg[3] = -1;
3f003981 833 while (mask && src < src_end)
4ed46869 834 {
b73bfc1c 835 ONE_MORE_BYTE (c);
4ed46869
KH
836 switch (c)
837 {
838 case ISO_CODE_ESC:
f46869e4 839 single_shifting = 0;
b73bfc1c 840 ONE_MORE_BYTE (c);
d46c5b12 841 if (c >= '(' && c <= '/')
4ed46869 842 {
bf9cdd4e 843 /* Designation sequence for a charset of dimension 1. */
b73bfc1c 844 ONE_MORE_BYTE (c1);
d46c5b12
KH
845 if (c1 < ' ' || c1 >= 0x80
846 || (charset = iso_charset_table[0][c >= ','][c1]) < 0)
847 /* Invalid designation sequence. Just ignore. */
848 break;
849 reg[(c - '(') % 4] = charset;
bf9cdd4e
KH
850 }
851 else if (c == '$')
852 {
853 /* Designation sequence for a charset of dimension 2. */
b73bfc1c 854 ONE_MORE_BYTE (c);
bf9cdd4e
KH
855 if (c >= '@' && c <= 'B')
856 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
d46c5b12 857 reg[0] = charset = iso_charset_table[1][0][c];
bf9cdd4e 858 else if (c >= '(' && c <= '/')
bcf26d6a 859 {
b73bfc1c 860 ONE_MORE_BYTE (c1);
d46c5b12
KH
861 if (c1 < ' ' || c1 >= 0x80
862 || (charset = iso_charset_table[1][c >= ','][c1]) < 0)
863 /* Invalid designation sequence. Just ignore. */
864 break;
865 reg[(c - '(') % 4] = charset;
bcf26d6a 866 }
bf9cdd4e 867 else
d46c5b12
KH
868 /* Invalid designation sequence. Just ignore. */
869 break;
870 }
ae9ff118 871 else if (c == 'N' || c == 'O')
d46c5b12 872 {
ae9ff118
KH
873 /* ESC <Fe> for SS2 or SS3. */
874 mask &= CODING_CATEGORY_MASK_ISO_7_ELSE;
d46c5b12 875 break;
4ed46869 876 }
ec6d2bb8
KH
877 else if (c >= '0' && c <= '4')
878 {
879 /* ESC <Fp> for start/end composition. */
880 mask_found |= CODING_CATEGORY_MASK_ISO;
881 break;
882 }
bf9cdd4e 883 else
d46c5b12
KH
884 /* Invalid escape sequence. Just ignore. */
885 break;
886
887 /* We found a valid designation sequence for CHARSET. */
888 mask &= ~CODING_CATEGORY_MASK_ISO_8BIT;
889 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7, charset))
890 mask_found |= CODING_CATEGORY_MASK_ISO_7;
891 else
892 mask &= ~CODING_CATEGORY_MASK_ISO_7;
893 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT, charset))
894 mask_found |= CODING_CATEGORY_MASK_ISO_7_TIGHT;
895 else
896 mask &= ~CODING_CATEGORY_MASK_ISO_7_TIGHT;
ae9ff118
KH
897 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_ELSE, charset))
898 mask_found |= CODING_CATEGORY_MASK_ISO_7_ELSE;
899 else
d46c5b12 900 mask &= ~CODING_CATEGORY_MASK_ISO_7_ELSE;
ae9ff118
KH
901 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_8_ELSE, charset))
902 mask_found |= CODING_CATEGORY_MASK_ISO_8_ELSE;
903 else
d46c5b12 904 mask &= ~CODING_CATEGORY_MASK_ISO_8_ELSE;
4ed46869
KH
905 break;
906
4ed46869 907 case ISO_CODE_SO:
f46869e4 908 single_shifting = 0;
d46c5b12
KH
909 if (shift_out == 0
910 && (reg[1] >= 0
911 || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_7_ELSE)
912 || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_8_ELSE)))
913 {
914 /* Locking shift out. */
915 mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
916 mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
917 }
e0e989f6
KH
918 break;
919
d46c5b12 920 case ISO_CODE_SI:
f46869e4 921 single_shifting = 0;
d46c5b12
KH
922 if (shift_out == 1)
923 {
924 /* Locking shift in. */
925 mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
926 mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
927 }
928 break;
929
4ed46869 930 case ISO_CODE_CSI:
f46869e4 931 single_shifting = 0;
4ed46869
KH
932 case ISO_CODE_SS2:
933 case ISO_CODE_SS3:
3f003981
KH
934 {
935 int newmask = CODING_CATEGORY_MASK_ISO_8_ELSE;
936
70c22245
KH
937 if (c != ISO_CODE_CSI)
938 {
d46c5b12
KH
939 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
940 & CODING_FLAG_ISO_SINGLE_SHIFT)
70c22245 941 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
d46c5b12
KH
942 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
943 & CODING_FLAG_ISO_SINGLE_SHIFT)
70c22245 944 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
f46869e4 945 single_shifting = 1;
70c22245 946 }
3f003981
KH
947 if (VECTORP (Vlatin_extra_code_table)
948 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
949 {
d46c5b12
KH
950 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
951 & CODING_FLAG_ISO_LATIN_EXTRA)
3f003981 952 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
d46c5b12
KH
953 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
954 & CODING_FLAG_ISO_LATIN_EXTRA)
3f003981
KH
955 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
956 }
957 mask &= newmask;
d46c5b12 958 mask_found |= newmask;
3f003981
KH
959 }
960 break;
4ed46869
KH
961
962 default:
963 if (c < 0x80)
f46869e4
KH
964 {
965 single_shifting = 0;
966 break;
967 }
4ed46869 968 else if (c < 0xA0)
c4825358 969 {
f46869e4 970 single_shifting = 0;
3f003981
KH
971 if (VECTORP (Vlatin_extra_code_table)
972 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
c4825358 973 {
3f003981
KH
974 int newmask = 0;
975
d46c5b12
KH
976 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
977 & CODING_FLAG_ISO_LATIN_EXTRA)
3f003981 978 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
d46c5b12
KH
979 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
980 & CODING_FLAG_ISO_LATIN_EXTRA)
3f003981
KH
981 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
982 mask &= newmask;
d46c5b12 983 mask_found |= newmask;
c4825358 984 }
3f003981
KH
985 else
986 return 0;
c4825358 987 }
4ed46869
KH
988 else
989 {
7717c392 990 unsigned char *src_begin = src;
4ed46869 991
d46c5b12 992 mask &= ~(CODING_CATEGORY_MASK_ISO_7BIT
7717c392 993 | CODING_CATEGORY_MASK_ISO_7_ELSE);
d46c5b12 994 mask_found |= CODING_CATEGORY_MASK_ISO_8_1;
f46869e4
KH
995 /* Check the length of succeeding codes of the range
996 0xA0..0FF. If the byte length is odd, we exclude
997 CODING_CATEGORY_MASK_ISO_8_2. We can check this only
998 when we are not single shifting. */
b73bfc1c
KH
999 if (!single_shifting
1000 && mask & CODING_CATEGORY_MASK_ISO_8_2)
f46869e4 1001 {
b73bfc1c
KH
1002 int i = 0;
1003 while (src < src_end)
1004 {
1005 ONE_MORE_BYTE (c);
1006 if (c < 0xA0)
1007 break;
1008 i++;
1009 }
1010
1011 if (i & 1 && src < src_end)
f46869e4
KH
1012 mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
1013 else
1014 mask_found |= CODING_CATEGORY_MASK_ISO_8_2;
1015 }
4ed46869
KH
1016 }
1017 break;
1018 }
1019 }
b73bfc1c 1020 label_end_of_loop:
d46c5b12 1021 return (mask & mask_found);
4ed46869
KH
1022}
1023
b73bfc1c
KH
/* Build the character code for a character of CHARSET whose 1st
   position code is C1 and whose 2nd position code is C2.  When the
   variable `translation_table' is non-nil, the code is obtained
   through translate_char with that table; otherwise it is composed
   directly by MAKE_CHAR.  */

#define DECODE_ISO_CHARACTER(charset, c1, c2)				\
  (!NILP (translation_table)						\
   ? translate_char (translation_table, -1, charset, c1, c2)		\
   : MAKE_CHAR (charset, c1, c2))
4ed46869
KH
1033
/* Record in CODING that the charset identified by DIMENSION, CHARS and
   FINAL_CHAR is designated to graphic register REG.

   Jumps to label_invalid_code when FINAL_CHAR is outside '0'..127,
   when no such charset is known, or when the charset is neither
   requested for REG nor listed in coding->safe_charsets; in the last
   two cases REG is remembered in last_invalid_designation_register.
   An ASCII designation to register 0 that directly follows an invalid
   designation to register 0 is also sent to label_invalid_code so that
   the sequence is kept as is.  Expects `coding' and the label
   `label_invalid_code' in the expanding scope.  */
#define DECODE_DESIGNATION(reg, dimension, chars, final_char)		\
  do {									\
    int designated_;							\
									\
    if ((final_char) < '0' || (final_char) >= 128)			\
      goto label_invalid_code;						\
    designated_ = ISO_CHARSET_TABLE (make_number (dimension),		\
                                     make_number (chars),		\
                                     make_number (final_char));	\
    if (designated_ < 0							\
        || (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, designated_) \
            != (reg)							\
            && !coding->safe_charsets[designated_]))			\
      {									\
        coding->spec.iso2022.last_invalid_designation_register = (reg); \
        goto label_invalid_code;					\
      }									\
    {									\
      int previous_invalid_						\
        = coding->spec.iso2022.last_invalid_designation_register;	\
									\
      coding->spec.iso2022.last_invalid_designation_register = -1;	\
      if (previous_invalid_ == 0					\
          && (reg) == 0							\
          && designated_ == CHARSET_ASCII)				\
        /* We should insert this designation sequence as is so		\
           that it is surely written back to a file.  */		\
        goto label_invalid_code;					\
    }									\
    if ((coding->mode & CODING_MODE_DIRECTION)				\
        && CHARSET_REVERSE_CHARSET (designated_) >= 0)			\
      designated_ = CHARSET_REVERSE_CHARSET (designated_);		\
    CODING_SPEC_ISO_DESIGNATION (coding, reg) = designated_;		\
  } while (0)
1069
ec6d2bb8
KH
1070/* Allocate a memory block for storing information about compositions.
1071 The block is chained to the already allocated blocks. */
d46c5b12 1072
33fb63eb 1073void
ec6d2bb8 1074coding_allocate_composition_data (coding, char_offset)
d46c5b12 1075 struct coding_system *coding;
ec6d2bb8 1076 int char_offset;
d46c5b12 1077{
ec6d2bb8
KH
1078 struct composition_data *cmp_data
1079 = (struct composition_data *) xmalloc (sizeof *cmp_data);
1080
1081 cmp_data->char_offset = char_offset;
1082 cmp_data->used = 0;
1083 cmp_data->prev = coding->cmp_data;
1084 cmp_data->next = NULL;
1085 if (coding->cmp_data)
1086 coding->cmp_data->next = cmp_data;
1087 coding->cmp_data = cmp_data;
1088 coding->cmp_data_start = 0;
1089}
d46c5b12 1090
ec6d2bb8
KH
/* Open a record for one composition in CODING->cmp_data.  Slot 0 is
   set to -1 (CODING_ADD_COMPOSITION_END later overwrites it), slot 1
   receives the starting position START adjusted by the block's
   char_offset, slot 3 receives METHOD, and four slots are consumed.
   Slot 2 is left for CODING_ADD_COMPOSITION_END.  */

#define CODING_ADD_COMPOSITION_START(coding, start, method)		\
  do {									\
    struct composition_data *block_ = (coding)->cmp_data;		\
    int *record_ = block_->data + block_->used;				\
									\
    (coding)->cmp_data_start = block_->used;				\
    record_[0] = -1;							\
    record_[1] = block_->char_offset + (start);				\
    record_[3] = (int) (method);					\
    block_->used += 4;							\
  } while (0)

/* Close the record opened by CODING_ADD_COMPOSITION_START: store the
   number of slots used since the record began in slot 0, and the
   ending position END adjusted by the block's char_offset in
   slot 2.  */

#define CODING_ADD_COMPOSITION_END(coding, end)				\
  do {									\
    struct composition_data *block_ = (coding)->cmp_data;		\
    int *record_ = block_->data + (coding)->cmp_data_start;		\
									\
    record_[0] = block_->used - (coding)->cmp_data_start;		\
    record_[2] = block_->char_offset + (end);				\
  } while (0)

/* Append one COMPONENT (alternate character or composition rule) to
   the current block of CODING->cmp_data.  */

#define CODING_ADD_COMPOSITION_COMPONENT(coding, component)		\
  ((coding)->cmp_data->data[(coding)->cmp_data->used++] = (component))
1118
/* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4,
   where C1 is the byte following ESC.  Expects `coding', `dst' and
   the label `label_end_of_loop' in the expanding scope.  */

#define DECODE_COMPOSITION_START(c1)					   \
  do {									   \
    if (coding->composing == COMPOSITION_DISABLED)			   \
      {									   \
        /* Composition is disabled: copy the sequence through.  */	   \
        *dst++ = ISO_CODE_ESC;						   \
        *dst++ = (c1) & 0x7f;						   \
        coding->produced_char += 2;					   \
      }									   \
    else if (COMPOSING_P (coding))					   \
      {									   \
        /* We are already handling a composition.  If the method is	   \
           one of the following two, the codes following the current	   \
           escape sequence are actual characters stored in a buffer.  */  \
        if (coding->composing == COMPOSITION_WITH_ALTCHARS		   \
            || coding->composing == COMPOSITION_WITH_RULE_ALTCHARS)	   \
          {								   \
            coding->composing = COMPOSITION_RELATIVE;			   \
            coding->composition_rule_follows = 0;			   \
          }								   \
      }									   \
    else								   \
      {									   \
        /* This is surely the start of a composition.  We must be sure	   \
           that coding->cmp_data has enough space to store the		   \
           information about the composition.  If not, terminate the	   \
           current decoding loop, allocate one more memory block for	   \
           coding->cmp_data in the caller, then start the decoding	   \
           loop again.  We can't allocate memory here directly because	   \
           it may cause buffer/string relocation.  */			   \
        if (!coding->cmp_data						   \
            || (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH \
                >= COMPOSITION_DATA_SIZE))				   \
          {								   \
            coding->result = CODING_FINISH_INSUFFICIENT_CMP;		   \
            goto label_end_of_loop;					   \
          }								   \
        switch (c1)							   \
          {								   \
          case '0':							   \
            coding->composing = COMPOSITION_RELATIVE;			   \
            break;							   \
          case '2':							   \
            coding->composing = COMPOSITION_WITH_RULE;			   \
            break;							   \
          case '3':							   \
            coding->composing = COMPOSITION_WITH_ALTCHARS;		   \
            break;							   \
          default:							   \
            coding->composing = COMPOSITION_WITH_RULE_ALTCHARS;	   \
            break;							   \
          }								   \
        CODING_ADD_COMPOSITION_START (coding, coding->produced_char,	   \
                                      coding->composing);		   \
        coding->composition_rule_follows = 0;				   \
      }									   \
  } while (0)
1166
/* Handle composition end sequence ESC 1, where C1 is the byte
   following ESC.

   When composition is disabled, or when no composition is currently
   in progress (a stray ESC 1 in the input), the two bytes are copied
   to DST unchanged.  Closing a record in the latter case would
   dereference coding->cmp_data, which may be null, and overwrite the
   bookkeeping of an already finished composition.  Otherwise the
   current composition record is closed at the current produced_char
   position and the composing state is reset to COMPOSITION_NO.
   Expects `coding' and `dst' in the expanding scope.  */

#define DECODE_COMPOSITION_END(c1)					\
  do {									\
    if (coding->composing == COMPOSITION_DISABLED			\
        || ! COMPOSING_P (coding))					\
      {									\
        *dst++ = ISO_CODE_ESC;						\
        *dst++ = (c1);							\
        coding->produced_char += 2;					\
      }									\
    else								\
      {									\
        CODING_ADD_COMPOSITION_END (coding, coding->produced_char);	\
        coding->composing = COMPOSITION_NO;				\
      }									\
  } while (0)
1183
/* Decode a composition rule from the byte C1 (and maybe one more byte
   read from SRC into C2) and append the encoded rule to
   coding->cmp_data.  C1 is modified: 32 is subtracted from it.  Values
   below 81 use the one-byte form, values from 81 to 92 use the
   two-byte form, and anything else is stored as rule 0.  Expects
   `coding', `c2' and whatever ONE_MORE_BYTE needs in the expanding
   scope.  */

#define DECODE_COMPOSITION_RULE(c1)					\
  do {									\
    int encoded_rule_ = 0;						\
									\
    (c1) -= 32;								\
    if ((c1) < 81)		/* old format (before ver.21) */	\
      {									\
        int global_ref_ = (c1) / 9;					\
        int new_ref_ = (c1) % 9;					\
									\
        global_ref_ = (global_ref_ == 4 ? 10 : global_ref_);		\
        new_ref_ = (new_ref_ == 4 ? 10 : new_ref_);			\
        encoded_rule_ = COMPOSITION_ENCODE_RULE (global_ref_, new_ref_); \
      }									\
    else if ((c1) < 93)		/* new format (after ver.21) */		\
      {									\
        ONE_MORE_BYTE (c2);						\
        encoded_rule_ = COMPOSITION_ENCODE_RULE ((c1) - 81, c2 - 32);	\
      }									\
    CODING_ADD_COMPOSITION_COMPONENT (coding, encoded_rule_);		\
    coding->composition_rule_follows = 0;				\
  } while (0)
88993dfd 1208
d46c5b12 1209
4ed46869
KH
1210/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
1211
b73bfc1c 1212static void
d46c5b12 1213decode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
4ed46869
KH
1214 struct coding_system *coding;
1215 unsigned char *source, *destination;
1216 int src_bytes, dst_bytes;
4ed46869
KH
1217{
1218 unsigned char *src = source;
1219 unsigned char *src_end = source + src_bytes;
1220 unsigned char *dst = destination;
1221 unsigned char *dst_end = destination + dst_bytes;
4ed46869
KH
1222 /* Charsets invoked to graphic plane 0 and 1 respectively. */
1223 int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1224 int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
b73bfc1c
KH
1225 /* SRC_BASE remembers the start position in source in each loop.
1226 The loop will be exited when there's not enough source code
1227 (within macro ONE_MORE_BYTE), or when there's not enough
1228 destination area to produce a character (within macro
1229 EMIT_CHAR). */
1230 unsigned char *src_base;
1231 int c, charset;
1232 Lisp_Object translation_table;
bdd9fb48 1233
b73bfc1c
KH
1234 if (NILP (Venable_character_translation))
1235 translation_table = Qnil;
1236 else
1237 {
1238 translation_table = coding->translation_table_for_decode;
1239 if (NILP (translation_table))
1240 translation_table = Vstandard_translation_table_for_decode;
1241 }
4ed46869 1242
b73bfc1c
KH
1243 coding->result = CODING_FINISH_NORMAL;
1244
1245 while (1)
4ed46869 1246 {
b73bfc1c
KH
1247 int c1, c2;
1248
1249 src_base = src;
1250 ONE_MORE_BYTE (c1);
4ed46869 1251
ec6d2bb8 1252 /* We produce no character or one character. */
4ed46869
KH
1253 switch (iso_code_class [c1])
1254 {
1255 case ISO_0x20_or_0x7F:
ec6d2bb8
KH
1256 if (COMPOSING_P (coding) && coding->composition_rule_follows)
1257 {
1258 DECODE_COMPOSITION_RULE (c1);
b73bfc1c 1259 continue;
ec6d2bb8
KH
1260 }
1261 if (charset0 < 0 || CHARSET_CHARS (charset0) == 94)
4ed46869
KH
1262 {
1263 /* This is SPACE or DEL. */
b73bfc1c 1264 charset = CHARSET_ASCII;
4ed46869
KH
1265 break;
1266 }
1267 /* This is a graphic character, we fall down ... */
1268
1269 case ISO_graphic_plane_0:
ec6d2bb8 1270 if (COMPOSING_P (coding) && coding->composition_rule_follows)
b73bfc1c
KH
1271 {
1272 DECODE_COMPOSITION_RULE (c1);
1273 continue;
1274 }
1275 charset = charset0;
4ed46869
KH
1276 break;
1277
1278 case ISO_0xA0_or_0xFF:
d46c5b12
KH
1279 if (charset1 < 0 || CHARSET_CHARS (charset1) == 94
1280 || coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
fb88bf2d 1281 goto label_invalid_code;
4ed46869
KH
1282 /* This is a graphic character, we fall down ... */
1283
1284 case ISO_graphic_plane_1:
b73bfc1c 1285 if (charset1 < 0)
fb88bf2d 1286 goto label_invalid_code;
b73bfc1c 1287 charset = charset1;
4ed46869
KH
1288 break;
1289
b73bfc1c 1290 case ISO_control_0:
ec6d2bb8
KH
1291 if (COMPOSING_P (coding))
1292 DECODE_COMPOSITION_END ('1');
1293
4ed46869
KH
1294 /* All ISO2022 control characters in this class have the
1295 same representation in Emacs internal format. */
d46c5b12
KH
1296 if (c1 == '\n'
1297 && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1298 && (coding->eol_type == CODING_EOL_CR
1299 || coding->eol_type == CODING_EOL_CRLF))
1300 {
b73bfc1c
KH
1301 coding->result = CODING_FINISH_INCONSISTENT_EOL;
1302 goto label_end_of_loop;
d46c5b12 1303 }
b73bfc1c 1304 charset = CHARSET_ASCII;
4ed46869
KH
1305 break;
1306
b73bfc1c
KH
1307 case ISO_control_1:
1308 if (COMPOSING_P (coding))
1309 DECODE_COMPOSITION_END ('1');
1310 goto label_invalid_code;
1311
4ed46869 1312 case ISO_carriage_return:
ec6d2bb8
KH
1313 if (COMPOSING_P (coding))
1314 DECODE_COMPOSITION_END ('1');
1315
4ed46869 1316 if (coding->eol_type == CODING_EOL_CR)
b73bfc1c 1317 c1 = '\n';
4ed46869
KH
1318 else if (coding->eol_type == CODING_EOL_CRLF)
1319 {
1320 ONE_MORE_BYTE (c1);
b73bfc1c 1321 if (c1 != ISO_CODE_LF)
4ed46869 1322 {
d46c5b12
KH
1323 if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1324 {
b73bfc1c
KH
1325 coding->result = CODING_FINISH_INCONSISTENT_EOL;
1326 goto label_end_of_loop;
d46c5b12 1327 }
4ed46869 1328 src--;
b73bfc1c 1329 c1 = '\r';
4ed46869
KH
1330 }
1331 }
b73bfc1c 1332 charset = CHARSET_ASCII;
4ed46869
KH
1333 break;
1334
1335 case ISO_shift_out:
d46c5b12
KH
1336 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1337 || CODING_SPEC_ISO_DESIGNATION (coding, 1) < 0)
1338 goto label_invalid_code;
4ed46869
KH
1339 CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
1340 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
b73bfc1c 1341 continue;
4ed46869
KH
1342
1343 case ISO_shift_in:
d46c5b12
KH
1344 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
1345 goto label_invalid_code;
4ed46869
KH
1346 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
1347 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
b73bfc1c 1348 continue;
4ed46869
KH
1349
1350 case ISO_single_shift_2_7:
1351 case ISO_single_shift_2:
d46c5b12
KH
1352 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1353 goto label_invalid_code;
4ed46869
KH
1354 /* SS2 is handled as an escape sequence of ESC 'N' */
1355 c1 = 'N';
1356 goto label_escape_sequence;
1357
1358 case ISO_single_shift_3:
d46c5b12
KH
1359 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1360 goto label_invalid_code;
4ed46869
KH
1361 /* SS2 is handled as an escape sequence of ESC 'O' */
1362 c1 = 'O';
1363 goto label_escape_sequence;
1364
1365 case ISO_control_sequence_introducer:
1366 /* CSI is handled as an escape sequence of ESC '[' ... */
1367 c1 = '[';
1368 goto label_escape_sequence;
1369
1370 case ISO_escape:
1371 ONE_MORE_BYTE (c1);
1372 label_escape_sequence:
1373 /* Escape sequences handled by Emacs are invocation,
1374 designation, direction specification, and character
1375 composition specification. */
1376 switch (c1)
1377 {
1378 case '&': /* revision of following character set */
1379 ONE_MORE_BYTE (c1);
1380 if (!(c1 >= '@' && c1 <= '~'))
d46c5b12 1381 goto label_invalid_code;
4ed46869
KH
1382 ONE_MORE_BYTE (c1);
1383 if (c1 != ISO_CODE_ESC)
d46c5b12 1384 goto label_invalid_code;
4ed46869
KH
1385 ONE_MORE_BYTE (c1);
1386 goto label_escape_sequence;
1387
1388 case '$': /* designation of 2-byte character set */
d46c5b12
KH
1389 if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
1390 goto label_invalid_code;
4ed46869
KH
1391 ONE_MORE_BYTE (c1);
1392 if (c1 >= '@' && c1 <= 'B')
1393 { /* designation of JISX0208.1978, GB2312.1980,
88993dfd 1394 or JISX0208.1980 */
4ed46869
KH
1395 DECODE_DESIGNATION (0, 2, 94, c1);
1396 }
1397 else if (c1 >= 0x28 && c1 <= 0x2B)
1398 { /* designation of DIMENSION2_CHARS94 character set */
1399 ONE_MORE_BYTE (c2);
1400 DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
1401 }
1402 else if (c1 >= 0x2C && c1 <= 0x2F)
1403 { /* designation of DIMENSION2_CHARS96 character set */
1404 ONE_MORE_BYTE (c2);
1405 DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
1406 }
1407 else
d46c5b12 1408 goto label_invalid_code;
b73bfc1c
KH
1409 /* We must update these variables now. */
1410 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1411 charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1412 continue;
4ed46869
KH
1413
1414 case 'n': /* invocation of locking-shift-2 */
d46c5b12
KH
1415 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1416 || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
1417 goto label_invalid_code;
4ed46869 1418 CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
e0e989f6 1419 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
b73bfc1c 1420 continue;
4ed46869
KH
1421
1422 case 'o': /* invocation of locking-shift-3 */
d46c5b12
KH
1423 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1424 || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
1425 goto label_invalid_code;
4ed46869 1426 CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
e0e989f6 1427 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
b73bfc1c 1428 continue;
4ed46869
KH
1429
1430 case 'N': /* invocation of single-shift-2 */
d46c5b12
KH
1431 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1432 || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
1433 goto label_invalid_code;
4ed46869 1434 charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
b73bfc1c 1435 ONE_MORE_BYTE (c1);
4ed46869
KH
1436 break;
1437
1438 case 'O': /* invocation of single-shift-3 */
d46c5b12
KH
1439 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1440 || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
1441 goto label_invalid_code;
4ed46869 1442 charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
b73bfc1c 1443 ONE_MORE_BYTE (c1);
4ed46869
KH
1444 break;
1445
ec6d2bb8
KH
1446 case '0': case '2': case '3': case '4': /* start composition */
1447 DECODE_COMPOSITION_START (c1);
b73bfc1c 1448 continue;
4ed46869 1449
ec6d2bb8
KH
1450 case '1': /* end composition */
1451 DECODE_COMPOSITION_END (c1);
b73bfc1c 1452 continue;
4ed46869
KH
1453
1454 case '[': /* specification of direction */
d46c5b12
KH
1455 if (coding->flags & CODING_FLAG_ISO_NO_DIRECTION)
1456 goto label_invalid_code;
4ed46869 1457 /* For the moment, nested direction is not supported.
d46c5b12
KH
1458 So, `coding->mode & CODING_MODE_DIRECTION' zero means
1459 left-to-right, and nozero means right-to-left. */
4ed46869
KH
1460 ONE_MORE_BYTE (c1);
1461 switch (c1)
1462 {
1463 case ']': /* end of the current direction */
d46c5b12 1464 coding->mode &= ~CODING_MODE_DIRECTION;
4ed46869
KH
1465
1466 case '0': /* end of the current direction */
1467 case '1': /* start of left-to-right direction */
1468 ONE_MORE_BYTE (c1);
1469 if (c1 == ']')
d46c5b12 1470 coding->mode &= ~CODING_MODE_DIRECTION;
4ed46869 1471 else
d46c5b12 1472 goto label_invalid_code;
4ed46869
KH
1473 break;
1474
1475 case '2': /* start of right-to-left direction */
1476 ONE_MORE_BYTE (c1);
1477 if (c1 == ']')
d46c5b12 1478 coding->mode |= CODING_MODE_DIRECTION;
4ed46869 1479 else
d46c5b12 1480 goto label_invalid_code;
4ed46869
KH
1481 break;
1482
1483 default:
d46c5b12 1484 goto label_invalid_code;
4ed46869 1485 }
b73bfc1c 1486 continue;
4ed46869
KH
1487
1488 default:
d46c5b12
KH
1489 if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
1490 goto label_invalid_code;
4ed46869
KH
1491 if (c1 >= 0x28 && c1 <= 0x2B)
1492 { /* designation of DIMENSION1_CHARS94 character set */
1493 ONE_MORE_BYTE (c2);
1494 DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
1495 }
1496 else if (c1 >= 0x2C && c1 <= 0x2F)
1497 { /* designation of DIMENSION1_CHARS96 character set */
1498 ONE_MORE_BYTE (c2);
1499 DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
1500 }
1501 else
b73bfc1c
KH
1502 goto label_invalid_code;
1503 /* We must update these variables now. */
1504 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1505 charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1506 continue;
4ed46869 1507 }
b73bfc1c 1508 }
4ed46869 1509
b73bfc1c
KH
1510 /* Now we know CHARSET and 1st position code C1 of a character.
1511 Produce a multibyte sequence for that character while getting
1512 2nd position code C2 if necessary. */
1513 if (CHARSET_DIMENSION (charset) == 2)
1514 {
1515 ONE_MORE_BYTE (c2);
1516 if (c1 < 0x80 ? c2 < 0x20 || c2 >= 0x80 : c2 < 0xA0)
1517 /* C2 is not in a valid range. */
1518 goto label_invalid_code;
4ed46869 1519 }
b73bfc1c
KH
1520 c = DECODE_ISO_CHARACTER (charset, c1, c2);
1521 EMIT_CHAR (c);
4ed46869
KH
1522 continue;
1523
b73bfc1c
KH
1524 label_invalid_code:
1525 coding->errors++;
1526 if (COMPOSING_P (coding))
1527 DECODE_COMPOSITION_END ('1');
4ed46869 1528 src = src_base;
b73bfc1c
KH
1529 c = *src++;
1530 EMIT_CHAR (c);
4ed46869 1531 }
fb88bf2d 1532
b73bfc1c
KH
1533 label_end_of_loop:
1534 coding->consumed = coding->consumed_char = src_base - source;
d46c5b12 1535 coding->produced = dst - destination;
b73bfc1c 1536 return;
4ed46869
KH
1537}
1538
b73bfc1c 1539
f4dee582 1540/* ISO2022 encoding stuff. */
4ed46869
KH
1541
1542/*
f4dee582 1543 It is not enough to say just "ISO2022" on encoding, we have to
d46c5b12 1544 specify more details. In Emacs, each coding system of ISO2022
4ed46869
KH
1545 variant has the following specifications:
1546 1. Initial designation to G0 thru G3.
1547 2. Allows short-form designation?
1548 3. ASCII should be designated to G0 before control characters?
1549 4. ASCII should be designated to G0 at end of line?
1550 5. 7-bit environment or 8-bit environment?
1551 6. Use locking-shift?
1552 7. Use Single-shift?
1553 And the following two are only for Japanese:
1554 8. Use ASCII in place of JISX0201-1976-Roman?
1555 9. Use JISX0208-1983 in place of JISX0208-1978?
1556 These specifications are encoded in `coding->flags' as flag bits
1557 defined by macros CODING_FLAG_ISO_XXX. See `coding.h' for more
f4dee582 1558 details.
4ed46869
KH
1559*/
1560
/* Write at DST the escape sequence that designates CHARSET to graphic
   register REG, advance DST, and record the designation in CODING.
   When the revision number of CHARSET is below 255, the sequence
   ESC '&' ('@' + revision) is written first.  For a two-dimensional
   94-character set designated to register 0 whose <final-char> is
   '@', 'A', or 'B', the intermediate character is omitted (short
   form) if CODING has CODING_FLAG_ISO_SHORT_FORM set.  */

#define ENCODE_DESIGNATION(charset, reg, coding)			   \
  do {									   \
    unsigned char final_ = CHARSET_ISO_FINAL_CHAR (charset);		   \
    int revision_ = CODING_SPEC_ISO_REVISION_NUMBER(coding, charset);	   \
    int two_dimensional_ = (CHARSET_DIMENSION (charset) != 1);		   \
    int has_94_chars_ = (CHARSET_CHARS (charset) == 94);		   \
    int short_form_							   \
      = (two_dimensional_ && has_94_chars_				   \
         && (coding->flags & CODING_FLAG_ISO_SHORT_FORM)		   \
         && (reg) == 0							   \
         && final_ >= '@' && final_ <= 'B');				   \
									   \
    if (revision_ < 255)						   \
      {									   \
        *dst++ = ISO_CODE_ESC;						   \
        *dst++ = '&';							   \
        *dst++ = '@' + revision_;					   \
      }									   \
    *dst++ = ISO_CODE_ESC;						   \
    if (two_dimensional_)						   \
      *dst++ = '$';							   \
    if (!short_form_)							   \
      {									   \
        /* Intermediate characters, indexed by graphic register.  */	   \
        char *intermediates_ = has_94_chars_ ? "()*+" : ",-./";	   \
									   \
        *dst++ = (unsigned char) (intermediates_[reg]);		   \
      }									   \
    *dst++ = final_;							   \
    CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;		   \
  } while (0)
1603
/* The following two macros write at DST the code for an ISO2022
   single-shift function (single-shift-2 and single-shift-3): the
   two-byte escape sequence when CODING has CODING_FLAG_ISO_SEVEN_BITS
   set, the single control code otherwise.  Both mark CODING as being
   in the single-shifting state.  */

#define ENCODE_SINGLE_SHIFT_2				\
  do {							\
    if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)	\
      {							\
        *dst++ = ISO_CODE_ESC;				\
        *dst++ = 'N';					\
      }							\
    else						\
      {							\
        *dst++ = ISO_CODE_SS2;				\
      }							\
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;	\
  } while (0)

#define ENCODE_SINGLE_SHIFT_3				\
  do {							\
    if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)	\
      {							\
        *dst++ = ISO_CODE_ESC;				\
        *dst++ = 'O';					\
      }							\
    else						\
      {							\
        *dst++ = ISO_CODE_SS3;				\
      }							\
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;	\
  } while (0)
1625
/* The following four macros write at DST the code for an ISO2022
   locking-shift function (shift-in, shift-out, locking-shift-2, and
   locking-shift-3) and record in CODING which graphic register is now
   invoked to graphic plane 0.  */

#define ENCODE_SHIFT_IN					\
  do {							\
    *dst++ = ISO_CODE_SI;				\
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;		\
  } while (0)

#define ENCODE_SHIFT_OUT				\
  do {							\
    *dst++ = ISO_CODE_SO;				\
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;		\
  } while (0)

#define ENCODE_LOCKING_SHIFT_2				\
  do {							\
    *dst++ = ISO_CODE_ESC;				\
    *dst++ = 'n';					\
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;		\
  } while (0)

#define ENCODE_LOCKING_SHIFT_3				\
  do {							\
    *dst++ = ISO_CODE_ESC;				\
    *dst++ = 'o';					\
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;		\
  } while (0)
1653
f4dee582
RS
/* Write at DST the code for a DIMENSION1 character whose character
   set is CHARSET and whose position-code is C1.  If CHARSET is not
   currently invoked to either graphic plane, the required designation
   and invocation sequences are written first and the attempt is
   repeated.  Expects `coding' and `dst' in the expanding scope.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)			\
  do {									\
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))			\
      {									\
        /* The byte right after a single shift: its eighth bit		\
           follows the 7-bit/8-bit flag of CODING.  */			\
        *dst++ = ((coding->flags & CODING_FLAG_ISO_SEVEN_BITS)		\
                  ? ((c1) & 0x7F) : ((c1) | 0x80));			\
        CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;			\
        break;								\
      }									\
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))		\
      {									\
        *dst++ = (c1) & 0x7F;						\
        break;								\
      }									\
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))		\
      {									\
        *dst++ = (c1) | 0x80;						\
        break;								\
      }									\
    if (coding->flags & CODING_FLAG_ISO_SAFE				\
        && !coding->safe_charsets[charset])				\
      {									\
        /* We should not encode this character; produce the		\
           substitution code instead, once or twice depending on the	\
           width of CHARSET.  */					\
        *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;			\
        if (CHARSET_WIDTH (charset) == 2)				\
          *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;		\
        break;								\
      }									\
    /* Since CHARSET is not yet invoked to any graphic planes, we	\
       must invoke it, or, at first, designate it to some graphic	\
       register.  Then repeat the loop to actually produce the		\
       character.  */							\
    dst = encode_invocation_designation (charset, coding, dst);	\
  } while (1)
1696
f4dee582
RS
/* Write at DST the codes for a DIMENSION2 character whose character
   set is CHARSET and whose position-codes are C1 and C2.  If CHARSET
   is not currently invoked to either graphic plane, the required
   designation and invocation sequences are written first and the
   attempt is repeated.  Expects `coding' and `dst' in the expanding
   scope.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)		\
  do {									\
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))			\
      {									\
        /* The bytes right after a single shift: their eighth bit	\
           follows the 7-bit/8-bit flag of CODING.  */			\
        if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)			\
          {								\
            *dst++ = (c1) & 0x7F;					\
            *dst++ = (c2) & 0x7F;					\
          }								\
        else								\
          {								\
            *dst++ = (c1) | 0x80;					\
            *dst++ = (c2) | 0x80;					\
          }								\
        CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;			\
        break;								\
      }									\
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))		\
      {									\
        *dst++ = (c1) & 0x7F;						\
        *dst++ = (c2) & 0x7F;						\
        break;								\
      }									\
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))		\
      {									\
        *dst++ = (c1) | 0x80;						\
        *dst++ = (c2) | 0x80;						\
        break;								\
      }									\
    if (coding->flags & CODING_FLAG_ISO_SAFE				\
        && !coding->safe_charsets[charset])				\
      {									\
        /* We should not encode this character; produce the		\
           substitution code instead, once or twice depending on the	\
           width of CHARSET.  */					\
        *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;			\
        if (CHARSET_WIDTH (charset) == 2)				\
          *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;		\
        break;								\
      }									\
    /* Since CHARSET is not yet invoked to any graphic planes, we	\
       must invoke it, or, at first, designate it to some graphic	\
       register.  Then repeat the loop to actually produce the		\
       character.  */							\
    dst = encode_invocation_designation (charset, coding, dst);	\
  } while (1)
1739
6f551029
KH
/* Produce the ISO 2022 codes for a character of CHARSET whose
   position-codes are C1 and C2 (C2 is ignored for a DIMENSION1
   charset).  ASCII is replaced by latin-jisx0201 when
   CODING_FLAG_ISO_USE_ROMAN is set, and jisx0208 by jisx0208-1978
   when CODING_FLAG_ISO_USE_OLDJIS is set.  For a charset that is not
   defined, C1 (and C2 if it is not negative) are produced as is.
   Uses the variables `coding' and `dst' of the expansion site.  */

#define ENCODE_ISO_CHARACTER(charset, c1, c2) \
  do { \
    int iso_charset = charset; \
 \
    if (! CHARSET_DEFINED_P (charset)) \
      { \
        *dst++ = c1; \
        if (c2 >= 0) \
          *dst++ = c2; \
      } \
    else if (CHARSET_DIMENSION (charset) == 1) \
      { \
        if (charset == CHARSET_ASCII \
            && (coding->flags & CODING_FLAG_ISO_USE_ROMAN)) \
          iso_charset = charset_latin_jisx0201; \
        ENCODE_ISO_CHARACTER_DIMENSION1 (iso_charset, c1); \
      } \
    else \
      { \
        if (charset == charset_jisx0208 \
            && (coding->flags & CODING_FLAG_ISO_USE_OLDJIS)) \
          iso_charset = charset_jisx0208_1978; \
        ENCODE_ISO_CHARACTER_DIMENSION2 (iso_charset, c1, c2); \
      } \
  } while (0)
bdd9fb48 1768
4ed46869
KH
1769/* Produce designation and invocation codes at a place pointed by DST
1770 to use CHARSET. The element `spec.iso2022' of *CODING is updated.
1771 Return new DST. */
1772
1773unsigned char *
1774encode_invocation_designation (charset, coding, dst)
1775 int charset;
1776 struct coding_system *coding;
1777 unsigned char *dst;
1778{
1779 int reg; /* graphic register number */
1780
1781 /* At first, check designations. */
1782 for (reg = 0; reg < 4; reg++)
1783 if (charset == CODING_SPEC_ISO_DESIGNATION (coding, reg))
1784 break;
1785
1786 if (reg >= 4)
1787 {
1788 /* CHARSET is not yet designated to any graphic registers. */
1789 /* At first check the requested designation. */
1790 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1ba9e4ab
KH
1791 if (reg == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
1792 /* Since CHARSET requests no special designation, designate it
1793 to graphic register 0. */
4ed46869
KH
1794 reg = 0;
1795
1796 ENCODE_DESIGNATION (charset, reg, coding);
1797 }
1798
1799 if (CODING_SPEC_ISO_INVOCATION (coding, 0) != reg
1800 && CODING_SPEC_ISO_INVOCATION (coding, 1) != reg)
1801 {
1802 /* Since the graphic register REG is not invoked to any graphic
1803 planes, invoke it to graphic plane 0. */
1804 switch (reg)
1805 {
1806 case 0: /* graphic register 0 */
1807 ENCODE_SHIFT_IN;
1808 break;
1809
1810 case 1: /* graphic register 1 */
1811 ENCODE_SHIFT_OUT;
1812 break;
1813
1814 case 2: /* graphic register 2 */
1815 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1816 ENCODE_SINGLE_SHIFT_2;
1817 else
1818 ENCODE_LOCKING_SHIFT_2;
1819 break;
1820
1821 case 3: /* graphic register 3 */
1822 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1823 ENCODE_SINGLE_SHIFT_3;
1824 else
1825 ENCODE_LOCKING_SHIFT_3;
1826 break;
1827 }
1828 }
b73bfc1c 1829
4ed46869
KH
1830 return dst;
1831}
1832
ec6d2bb8
KH
/* Produce the 2-byte code for the encoded composition rule RULE:
   RULE is split by COMPOSITION_DECODE_RULE into GREF and NREF, which
   are written as the bytes 32 + 81 + GREF and 32 + NREF.  */

#define ENCODE_COMPOSITION_RULE(rule) \
  do { \
    int gref, nref; \
 \
    COMPOSITION_DECODE_RULE (rule, gref, nref); \
    *dst++ = 32 + 81 + gref; \
    *dst++ = 32 + nref; \
  } while (0)

/* Produce the codes that indicate the start of a composition
   sequence (ESC 0, ESC 3, or ESC 4).  DATA points to an array of
   integers describing the composition; DATA[3] is the composition
   method.  See the comment in coding.h for the format of DATA.  */

#define ENCODE_COMPOSITION_START(coding, data) \
  do { \
    coding->composing = data[3]; \
    *dst++ = ISO_CODE_ESC; \
    if (coding->composing == COMPOSITION_RELATIVE) \
      *dst++ = '0'; \
    else \
      { \
        if (coding->composing == COMPOSITION_WITH_ALTCHARS) \
          *dst++ = '3'; \
        else \
          *dst++ = '4'; \
        /* The components start at the fifth element of DATA.  */ \
        coding->cmp_data_index = coding->cmp_data_start + 4; \
        coding->composition_rule_follows = 0; \
      } \
  } while (0)

/* Produce the codes that indicate the end of the current composition
   (ESC 1), step over its DATA[0] elements, and move on to the next
   chunk of composition data when the current one is used up.  */

#define ENCODE_COMPOSITION_END(coding, data) \
  do { \
    *dst++ = ISO_CODE_ESC; \
    *dst++ = '1'; \
    coding->cmp_data_start += data[0]; \
    coding->composing = COMPOSITION_NO; \
    if (coding->cmp_data_start == coding->cmp_data->used) \
      if (coding->cmp_data->next) \
        { \
          coding->cmp_data = coding->cmp_data->next; \
          coding->cmp_data_start = 0; \
        } \
  } while (0)

/* Produce the composition start sequence ESC 0.  Here the sequence
   does not start a new composition; it means that the components
   (alternate chars and composition rules) of the composition have
   just been produced and that the actual text follows in SRC.  */

#define ENCODE_COMPOSITION_FAKE_START(coding) \
  do { \
    *dst++ = ISO_CODE_ESC; \
    *dst++ = '0'; \
    coding->composing = COMPOSITION_RELATIVE; \
  } while (0)
4ed46869
KH
1890
/* The following three macros produce codes for indicating direction
   of text.  They use the variables `coding' and `dst' of the
   expansion site, and each one expands to a single statement.  */

/* Produce a control sequence introducer: ESC '[' when the coding
   system is restricted to seven bits, the single byte ISO_CODE_CSI
   otherwise.  CODING_FLAG_ISO_SEVEN_BITS is one bit among several in
   `coding->flags', so it is tested with `&' rather than compared for
   equality.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER \
  do { \
    if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
      { \
        *dst++ = ISO_CODE_ESC; \
        *dst++ = '['; \
      } \
    else \
      *dst++ = ISO_CODE_CSI; \
  } while (0)

/* Produce the sequence `CSI 2 ]' (right-to-left direction).  */
#define ENCODE_DIRECTION_R2L \
  do { \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER; \
    *dst++ = '2'; \
    *dst++ = ']'; \
  } while (0)

/* Produce the sequence `CSI 0 ]' (left-to-right direction).  */
#define ENCODE_DIRECTION_L2R \
  do { \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER; \
    *dst++ = '0'; \
    *dst++ = ']'; \
  } while (0)
4ed46869
KH
1906
/* Produce the invocation and designation codes that bring the graphic
   planes and registers back to their initial state: shift in if
   graphic plane 0 does not hold register 0, then re-designate every
   register whose initial designation is non-negative and differs from
   its current designation.  Uses the variables `coding' and `dst' of
   the expansion site.  */

#define ENCODE_RESET_PLANE_AND_REGISTER \
  do { \
    int reg; \
 \
    if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0) \
      ENCODE_SHIFT_IN; \
    for (reg = 0; reg < 4; reg++) \
      { \
        if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg) >= 0) \
          { \
            if (CODING_SPEC_ISO_DESIGNATION (coding, reg) \
                != CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg)) \
              ENCODE_DESIGNATION \
                (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg), \
                 reg, coding); \
          } \
      } \
  } while (0)
1921
bdd9fb48 1922/* Produce designation sequences of charsets in the line started from
b73bfc1c 1923 SRC to a place pointed by DST, and return updated DST.
bdd9fb48
KH
1924
1925 If the current block ends before any end-of-line, we may fail to
d46c5b12
KH
1926 find all the necessary designations. */
1927
b73bfc1c
KH
1928static unsigned char *
1929encode_designation_at_bol (coding, translation_table, src, src_end, dst)
e0e989f6 1930 struct coding_system *coding;
b73bfc1c
KH
1931 Lisp_Object translation_table;
1932 unsigned char *src, *src_end, *dst;
e0e989f6 1933{
bdd9fb48
KH
1934 int charset, c, found = 0, reg;
1935 /* Table of charsets to be designated to each graphic register. */
1936 int r[4];
bdd9fb48
KH
1937
1938 for (reg = 0; reg < 4; reg++)
1939 r[reg] = -1;
1940
b73bfc1c 1941 while (found < 4)
e0e989f6 1942 {
b73bfc1c
KH
1943 ONE_MORE_CHAR (c);
1944 if (c == '\n')
1945 break;
bdd9fb48 1946
b73bfc1c 1947 charset = CHAR_CHARSET (c);
e0e989f6 1948 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
d46c5b12 1949 if (reg != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION && r[reg] < 0)
bdd9fb48
KH
1950 {
1951 found++;
1952 r[reg] = charset;
1953 }
bdd9fb48
KH
1954 }
1955
b73bfc1c 1956 label_end_of_loop:
bdd9fb48
KH
1957 if (found)
1958 {
1959 for (reg = 0; reg < 4; reg++)
1960 if (r[reg] >= 0
1961 && CODING_SPEC_ISO_DESIGNATION (coding, reg) != r[reg])
1962 ENCODE_DESIGNATION (r[reg], reg, coding);
e0e989f6 1963 }
b73bfc1c
KH
1964
1965 return dst;
e0e989f6
KH
1966}
1967
4ed46869
KH
1968/* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
1969
b73bfc1c 1970static void
d46c5b12 1971encode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
4ed46869
KH
1972 struct coding_system *coding;
1973 unsigned char *source, *destination;
1974 int src_bytes, dst_bytes;
4ed46869
KH
1975{
1976 unsigned char *src = source;
1977 unsigned char *src_end = source + src_bytes;
1978 unsigned char *dst = destination;
1979 unsigned char *dst_end = destination + dst_bytes;
b73bfc1c 1980 /* Since the maximum bytes produced by each loop is 20, we subtract 19
4ed46869
KH
1981 from DST_END to assure overflow checking is necessary only at the
1982 head of loop. */
b73bfc1c
KH
1983 unsigned char *adjusted_dst_end = dst_end - 19;
1984 /* SRC_BASE remembers the start position in source in each loop.
1985 The loop will be exited when there's not enough source text to
1986 analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
1987 there's not enough destination area to produce encoded codes
1988 (within macro EMIT_BYTES). */
1989 unsigned char *src_base;
1990 int c;
1991 Lisp_Object translation_table;
bdd9fb48 1992
b73bfc1c
KH
1993 if (NILP (Venable_character_translation))
1994 translation_table = Qnil;
1995 else
1996 {
1997 translation_table = coding->translation_table_for_encode;
1998 if (NILP (translation_table))
1999 translation_table = Vstandard_translation_table_for_encode;
2000 }
4ed46869 2001
d46c5b12 2002 coding->consumed_char = 0;
b73bfc1c
KH
2003 coding->errors = 0;
2004 while (1)
4ed46869 2005 {
b73bfc1c
KH
2006 int charset, c1, c2;
2007
2008 src_base = src;
2009
2010 if (dst >= (dst_bytes ? adjusted_dst_end : (src - 19)))
2011 {
2012 coding->result = CODING_FINISH_INSUFFICIENT_DST;
2013 break;
2014 }
4ed46869 2015
e0e989f6
KH
2016 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL
2017 && CODING_SPEC_ISO_BOL (coding))
2018 {
bdd9fb48 2019 /* We have to produce designation sequences if any now. */
b73bfc1c
KH
2020 dst = encode_designation_at_bol (coding, translation_table,
2021 src, src_end, dst);
e0e989f6
KH
2022 CODING_SPEC_ISO_BOL (coding) = 0;
2023 }
2024
ec6d2bb8
KH
2025 /* Check composition start and end. */
2026 if (coding->composing != COMPOSITION_DISABLED
2027 && coding->cmp_data_start < coding->cmp_data->used)
4ed46869 2028 {
ec6d2bb8
KH
2029 struct composition_data *cmp_data = coding->cmp_data;
2030 int *data = cmp_data->data + coding->cmp_data_start;
2031 int this_pos = cmp_data->char_offset + coding->consumed_char;
2032
2033 if (coding->composing == COMPOSITION_RELATIVE)
4ed46869 2034 {
ec6d2bb8
KH
2035 if (this_pos == data[2])
2036 {
2037 ENCODE_COMPOSITION_END (coding, data);
2038 cmp_data = coding->cmp_data;
2039 data = cmp_data->data + coding->cmp_data_start;
2040 }
4ed46869 2041 }
ec6d2bb8 2042 else if (COMPOSING_P (coding))
4ed46869 2043 {
ec6d2bb8
KH
2044 /* COMPOSITION_WITH_ALTCHARS or COMPOSITION_WITH_RULE_ALTCHAR */
2045 if (coding->cmp_data_index == coding->cmp_data_start + data[0])
2046 /* We have consumed components of the composition.
2047 What follows in SRC is the compositions's base
2048 text. */
2049 ENCODE_COMPOSITION_FAKE_START (coding);
2050 else
4ed46869 2051 {
ec6d2bb8
KH
2052 int c = cmp_data->data[coding->cmp_data_index++];
2053 if (coding->composition_rule_follows)
2054 {
2055 ENCODE_COMPOSITION_RULE (c);
2056 coding->composition_rule_follows = 0;
2057 }
2058 else
2059 {
2060 SPLIT_CHAR (c, charset, c1, c2);
2061 ENCODE_ISO_CHARACTER (charset, c1, c2);
ec6d2bb8
KH
2062 if (coding->composing == COMPOSITION_WITH_RULE_ALTCHARS)
2063 coding->composition_rule_follows = 1;
2064 }
4ed46869
KH
2065 continue;
2066 }
ec6d2bb8
KH
2067 }
2068 if (!COMPOSING_P (coding))
2069 {
2070 if (this_pos == data[1])
4ed46869 2071 {
ec6d2bb8
KH
2072 ENCODE_COMPOSITION_START (coding, data);
2073 continue;
4ed46869 2074 }
4ed46869
KH
2075 }
2076 }
ec6d2bb8 2077
b73bfc1c 2078 ONE_MORE_CHAR (c);
4ed46869 2079
b73bfc1c
KH
2080 /* Now encode the character C. */
2081 if (c < 0x20 || c == 0x7F)
2082 {
2083 if (c == '\r')
19a8d9e0 2084 {
b73bfc1c
KH
2085 if (! (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
2086 {
2087 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
2088 ENCODE_RESET_PLANE_AND_REGISTER;
2089 *dst++ = c;
2090 continue;
2091 }
2092 /* fall down to treat '\r' as '\n' ... */
2093 c = '\n';
19a8d9e0 2094 }
b73bfc1c 2095 if (c == '\n')
19a8d9e0 2096 {
b73bfc1c
KH
2097 if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
2098 ENCODE_RESET_PLANE_AND_REGISTER;
2099 if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL)
2100 bcopy (coding->spec.iso2022.initial_designation,
2101 coding->spec.iso2022.current_designation,
2102 sizeof coding->spec.iso2022.initial_designation);
2103 if (coding->eol_type == CODING_EOL_LF
2104 || coding->eol_type == CODING_EOL_UNDECIDED)
2105 *dst++ = ISO_CODE_LF;
2106 else if (coding->eol_type == CODING_EOL_CRLF)
2107 *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
2108 else
2109 *dst++ = ISO_CODE_CR;
2110 CODING_SPEC_ISO_BOL (coding) = 1;
19a8d9e0 2111 }
b73bfc1c 2112 else
19a8d9e0 2113 {
b73bfc1c
KH
2114 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
2115 ENCODE_RESET_PLANE_AND_REGISTER;
2116 *dst++ = c;
19a8d9e0 2117 }
4ed46869 2118 }
b73bfc1c
KH
2119 else if (ASCII_BYTE_P (c))
2120 ENCODE_ISO_CHARACTER (CHARSET_ASCII, c, /* dummy */ c1);
2121 else if (SINGLE_BYTE_CHAR_P (c))
88993dfd 2122 {
b73bfc1c
KH
2123 *dst++ = c;
2124 coding->errors++;
88993dfd 2125 }
b73bfc1c
KH
2126 else
2127 {
2128 SPLIT_CHAR (c, charset, c1, c2);
2129 ENCODE_ISO_CHARACTER (charset, c1, c2);
2130 }
2131
2132 coding->consumed_char++;
84fbb8a0 2133 }
b73bfc1c
KH
2134
2135 label_end_of_loop:
2136 coding->consumed = src_base - source;
d46c5b12 2137 coding->produced = coding->produced_char = dst - destination;
4ed46869
KH
2138}
2139
2140\f
2141/*** 4. SJIS and BIG5 handlers ***/
2142
f4dee582 2143/* Although SJIS and BIG5 are not ISO's coding system, they are used
4ed46869
KH
2144 quite widely. So, for the moment, Emacs supports them in the bare
2145 C code. But, in the future, they may be supported only by CCL. */
2146
2147/* SJIS is a coding system encoding three character sets: ASCII, right
2148 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
2149 as is. A character of charset katakana-jisx0201 is encoded by
2150 "position-code + 0x80". A character of charset japanese-jisx0208
2151 is encoded in 2-byte but two position-codes are divided and shifted
2152 so that it fit in the range below.
2153
2154 --- CODE RANGE of SJIS ---
2155 (character set) (range)
2156 ASCII 0x00 .. 0x7F
2157 KATAKANA-JISX0201 0xA0 .. 0xDF
c28a9453 2158 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF
d14d03ac 2159 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
4ed46869
KH
2160 -------------------------------
2161
2162*/
2163
2164/* BIG5 is a coding system encoding two character sets: ASCII and
2165 Big5. An ASCII character is encoded as is. Big5 is a two-byte
2166 character set and is encoded in two-byte.
2167
2168 --- CODE RANGE of BIG5 ---
2169 (character set) (range)
2170 ASCII 0x00 .. 0x7F
2171 Big5 (1st byte) 0xA1 .. 0xFE
2172 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
2173 --------------------------
2174
2175 Since the number of characters in Big5 is larger than maximum
2176 characters in Emacs' charset (96x96), it can't be handled as one
2177 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
2178 and `charset-big5-2'. Both are DIMENSION2 and CHARS94. The former
2179 contains frequently used characters and the latter contains less
2180 frequently used characters. */
2181
2182/* Macros to decode or encode a character of Big5 in BIG5. B1 and B2
2183 are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
2184 C1 and C2 are the 1st and 2nd position-codes of Emacs' internal
2185 format. CHARSET is `charset_big5_1' or `charset_big5_2'. */
2186
/* Number of Big5 characters which have the same code in 1st byte:
   the 2nd byte ranges over 0x40..0x7E and 0xA1..0xFE.  */
#define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)

/* Decode the Big5 code (B1, B2) into CHARSET (`charset_big5_1' or
   `charset_big5_2') and the position-codes C1 and C2.  Codes whose
   first byte is below 0xC9 go to `charset_big5_1', the others to
   `charset_big5_2'.

   The input arguments are parenthesized so that expressions may be
   passed.  The linear index is computed before any output is stored,
   so C1 and C2 may name the same variables as B1 and B2.  B1 and B2
   are evaluated more than once.  */
#define DECODE_BIG5(b1, b2, charset, c1, c2) \
  do { \
    unsigned int temp \
      = (((b1) - 0xA1) * BIG5_SAME_ROW \
         + (b2) - ((b2) < 0x7F ? 0x40 : 0x62)); \
    if ((b1) < 0xC9) \
      charset = charset_big5_1; \
    else \
      { \
        charset = charset_big5_2; \
        temp -= (0xC9 - 0xA1) * BIG5_SAME_ROW; \
      } \
    c1 = temp / (0xFF - 0xA1) + 0x21; \
    c2 = temp % (0xFF - 0xA1) + 0x21; \
  } while (0)

/* Encode the character of CHARSET (`charset_big5_1' or
   `charset_big5_2') with position-codes C1 and C2 into the Big5 code
   (B1, B2).  This is the inverse of DECODE_BIG5.

   The input arguments are parenthesized so that expressions may be
   passed.  The linear index is computed before any output is stored,
   so B1 and B2 may name the same variables as C1 and C2.  */
#define ENCODE_BIG5(charset, c1, c2, b1, b2) \
  do { \
    unsigned int temp = ((c1) - 0x21) * (0xFF - 0xA1) + ((c2) - 0x21); \
    if ((charset) == charset_big5_2) \
      temp += BIG5_SAME_ROW * (0xC9 - 0xA1); \
    b1 = temp / BIG5_SAME_ROW + 0xA1; \
    b2 = temp % BIG5_SAME_ROW; \
    /* Skip the gap 0x7F..0xA0 in the range of the 2nd byte.  */ \
    b2 += b2 < 0x3F ? 0x40 : 0x62; \
  } while (0)
2214
2215/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2216 Check if a text is encoded in SJIS. If it is, return
2217 CODING_CATEGORY_MASK_SJIS, else return 0. */
2218
2219int
2220detect_coding_sjis (src, src_end)
2221 unsigned char *src, *src_end;
2222{
b73bfc1c
KH
2223 int c;
2224 /* Dummy for ONE_MORE_BYTE. */
2225 struct coding_system dummy_coding;
2226 struct coding_system *coding = &dummy_coding;
4ed46869 2227
b73bfc1c 2228 while (1)
4ed46869 2229 {
b73bfc1c 2230 ONE_MORE_BYTE (c);
4ed46869
KH
2231 if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
2232 {
b73bfc1c
KH
2233 ONE_MORE_BYTE (c);
2234 if (c < 0x40)
4ed46869
KH
2235 return 0;
2236 }
2237 }
b73bfc1c 2238 label_end_of_loop:
4ed46869
KH
2239 return CODING_CATEGORY_MASK_SJIS;
2240}
2241
2242/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2243 Check if a text is encoded in BIG5. If it is, return
2244 CODING_CATEGORY_MASK_BIG5, else return 0. */
2245
2246int
2247detect_coding_big5 (src, src_end)
2248 unsigned char *src, *src_end;
2249{
b73bfc1c
KH
2250 int c;
2251 /* Dummy for ONE_MORE_BYTE. */
2252 struct coding_system dummy_coding;
2253 struct coding_system *coding = &dummy_coding;
4ed46869 2254
b73bfc1c 2255 while (1)
4ed46869 2256 {
b73bfc1c 2257 ONE_MORE_BYTE (c);
4ed46869
KH
2258 if (c >= 0xA1)
2259 {
b73bfc1c 2260 ONE_MORE_BYTE (c);
4ed46869
KH
2261 if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
2262 return 0;
2263 }
2264 }
b73bfc1c 2265 label_end_of_loop:
4ed46869
KH
2266 return CODING_CATEGORY_MASK_BIG5;
2267}
2268
fa42c37f
KH
2269/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2270 Check if a text is encoded in UTF-8. If it is, return
2271 CODING_CATEGORY_MASK_UTF_8, else return 0. */
2272
2273#define UTF_8_1_OCTET_P(c) ((c) < 0x80)
2274#define UTF_8_EXTRA_OCTET_P(c) (((c) & 0xC0) == 0x80)
2275#define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
2276#define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
2277#define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
2278#define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
2279#define UTF_8_6_OCTET_LEADING_P(c) (((c) & 0xFE) == 0xFC)
2280
2281int
2282detect_coding_utf_8 (src, src_end)
2283 unsigned char *src, *src_end;
2284{
2285 unsigned char c;
2286 int seq_maybe_bytes;
b73bfc1c
KH
2287 /* Dummy for ONE_MORE_BYTE. */
2288 struct coding_system dummy_coding;
2289 struct coding_system *coding = &dummy_coding;
fa42c37f 2290
b73bfc1c 2291 while (1)
fa42c37f 2292 {
b73bfc1c 2293 ONE_MORE_BYTE (c);
fa42c37f
KH
2294 if (UTF_8_1_OCTET_P (c))
2295 continue;
2296 else if (UTF_8_2_OCTET_LEADING_P (c))
2297 seq_maybe_bytes = 1;
2298 else if (UTF_8_3_OCTET_LEADING_P (c))
2299 seq_maybe_bytes = 2;
2300 else if (UTF_8_4_OCTET_LEADING_P (c))
2301 seq_maybe_bytes = 3;
2302 else if (UTF_8_5_OCTET_LEADING_P (c))
2303 seq_maybe_bytes = 4;
2304 else if (UTF_8_6_OCTET_LEADING_P (c))
2305 seq_maybe_bytes = 5;
2306 else
2307 return 0;
2308
2309 do
2310 {
b73bfc1c 2311 ONE_MORE_BYTE (c);
fa42c37f
KH
2312 if (!UTF_8_EXTRA_OCTET_P (c))
2313 return 0;
2314 seq_maybe_bytes--;
2315 }
2316 while (seq_maybe_bytes > 0);
2317 }
2318
b73bfc1c 2319 label_end_of_loop:
fa42c37f
KH
2320 return CODING_CATEGORY_MASK_UTF_8;
2321}
2322
2323/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2324 Check if a text is encoded in UTF-16 Big Endian (endian == 1) or
2325 Little Endian (otherwise). If it is, return
2326 CODING_CATEGORY_MASK_UTF_16_BE or CODING_CATEGORY_MASK_UTF_16_LE,
2327 else return 0. */
2328
/* Non-zero if the 16-bit value VAL is 0xFFFE or 0xFFFF.  */
#define UTF_16_INVALID_P(val) \
  (((val) == 0xFFFE) \
   || ((val) == 0xFFFF))

/* Non-zero if the 16-bit value VAL is a high surrogate, i.e. lies in
   0xD800..0xDBFF.  The top six bits identify the surrogate block, so
   the mask is 0xFC00; masking with 0xD800 would also accept values
   such as 0xF800 and 0xFFFF.  */
#define UTF_16_HIGH_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xD800)

/* Non-zero if the 16-bit value VAL is a low surrogate, i.e. lies in
   0xDC00..0xDFFF.  */
#define UTF_16_LOW_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xDC00)
2338
2339int
2340detect_coding_utf_16 (src, src_end)
2341 unsigned char *src, *src_end;
2342{
b73bfc1c
KH
2343 unsigned char c1, c2;
2344 /* Dummy for TWO_MORE_BYTES. */
2345 struct coding_system dummy_coding;
2346 struct coding_system *coding = &dummy_coding;
fa42c37f 2347
b73bfc1c
KH
2348 TWO_MORE_BYTES (c1, c2);
2349
2350 if ((c1 == 0xFF) && (c2 == 0xFE))
fa42c37f 2351 return CODING_CATEGORY_MASK_UTF_16_LE;
b73bfc1c 2352 else if ((c1 == 0xFE) && (c2 == 0xFF))
fa42c37f
KH
2353 return CODING_CATEGORY_MASK_UTF_16_BE;
2354
b73bfc1c 2355 label_end_of_loop:
fa42c37f
KH
2356 return 0;
2357}
2358
4ed46869
KH
2359/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
2360 If SJIS_P is 1, decode SJIS text, else decode BIG5 test. */
2361
b73bfc1c 2362static void
4ed46869 2363decode_coding_sjis_big5 (coding, source, destination,
d46c5b12 2364 src_bytes, dst_bytes, sjis_p)
4ed46869
KH
2365 struct coding_system *coding;
2366 unsigned char *source, *destination;
2367 int src_bytes, dst_bytes;
4ed46869
KH
2368 int sjis_p;
2369{
2370 unsigned char *src = source;
2371 unsigned char *src_end = source + src_bytes;
2372 unsigned char *dst = destination;
2373 unsigned char *dst_end = destination + dst_bytes;
b73bfc1c
KH
2374 /* SRC_BASE remembers the start position in source in each loop.
2375 The loop will be exited when there's not enough source code
2376 (within macro ONE_MORE_BYTE), or when there's not enough
2377 destination area to produce a character (within macro
2378 EMIT_CHAR). */
2379 unsigned char *src_base;
2380 Lisp_Object translation_table;
a5d301df 2381
b73bfc1c
KH
2382 if (NILP (Venable_character_translation))
2383 translation_table = Qnil;
2384 else
2385 {
2386 translation_table = coding->translation_table_for_decode;
2387 if (NILP (translation_table))
2388 translation_table = Vstandard_translation_table_for_decode;
2389 }
4ed46869 2390
d46c5b12 2391 coding->produced_char = 0;
b73bfc1c 2392 while (1)
4ed46869 2393 {
b73bfc1c
KH
2394 int c, charset, c1, c2;
2395
2396 src_base = src;
2397 ONE_MORE_BYTE (c1);
2398
2399 if (c1 < 0x80)
4ed46869 2400 {
b73bfc1c
KH
2401 charset = CHARSET_ASCII;
2402 if (c1 < 0x20)
4ed46869 2403 {
b73bfc1c 2404 if (c1 == '\r')
d46c5b12 2405 {
b73bfc1c 2406 if (coding->eol_type == CODING_EOL_CRLF)
d46c5b12 2407 {
b73bfc1c
KH
2408 ONE_MORE_BYTE (c2);
2409 if (c2 == '\n')
2410 c1 = c2;
2411 else if (coding->mode
2412 & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2413 {
2414 coding->result = CODING_FINISH_INCONSISTENT_EOL;
2415 goto label_end_of_loop;
2416 }
2417 else
2418 /* To process C2 again, SRC is subtracted by 1. */
2419 src--;
d46c5b12 2420 }
b73bfc1c
KH
2421 else if (coding->eol_type == CODING_EOL_CR)
2422 c1 = '\n';
2423 }
2424 else if (c1 == '\n'
2425 && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2426 && (coding->eol_type == CODING_EOL_CR
2427 || coding->eol_type == CODING_EOL_CRLF))
2428 {
2429 coding->result = CODING_FINISH_INCONSISTENT_EOL;
2430 goto label_end_of_loop;
d46c5b12 2431 }
4ed46869 2432 }
4ed46869 2433 }
54f78171 2434 else
b73bfc1c 2435 {
4ed46869
KH
2436 if (sjis_p)
2437 {
b73bfc1c
KH
2438 if (c1 >= 0xF0)
2439 goto label_invalid_code;
2440 if (c1 < 0xA0 || c1 >= 0xE0)
fb88bf2d 2441 {
54f78171
KH
2442 /* SJIS -> JISX0208 */
2443 ONE_MORE_BYTE (c2);
b73bfc1c
KH
2444 if (c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
2445 goto label_invalid_code;
2446 DECODE_SJIS (c1, c2, c1, c2);
2447 charset = charset_jisx0208;
5e34de15 2448 }
fb88bf2d 2449 else
b73bfc1c
KH
2450 /* SJIS -> JISX0201-Kana */
2451 charset = charset_katakana_jisx0201;
4ed46869 2452 }
fb88bf2d 2453 else
fb88bf2d 2454 {
54f78171 2455 /* BIG5 -> Big5 */
b73bfc1c
KH
2456 if (c1 < 0xA1 || c1 > 0xFE)
2457 goto label_invalid_code;
2458 ONE_MORE_BYTE (c2);
2459 if (c2 < 0x40 || (c2 > 0x7E && c2 < 0xA1) || c2 > 0xFE)
2460 goto label_invalid_code;
2461 DECODE_BIG5 (c1, c2, charset, c1, c2);
4ed46869
KH
2462 }
2463 }
4ed46869 2464
b73bfc1c
KH
2465 c = DECODE_ISO_CHARACTER (charset, c1, c2);
2466 EMIT_CHAR (c);
fb88bf2d
KH
2467 continue;
2468
b73bfc1c
KH
2469 label_invalid_code:
2470 coding->errors++;
4ed46869 2471 src = src_base;
b73bfc1c
KH
2472 c = *src++;
2473 EMIT_CHAR (c);
fb88bf2d 2474 }
d46c5b12 2475
b73bfc1c
KH
2476 label_end_of_loop:
2477 coding->consumed = coding->consumed_char = src_base - source;
d46c5b12 2478 coding->produced = dst - destination;
b73bfc1c 2479 return;
4ed46869
KH
2480}
2481
2482/* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
b73bfc1c
KH
2483 This function can encode charsets `ascii', `katakana-jisx0201',
2484 `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'. We
2485 are sure that all these charsets are registered as official charset
4ed46869
KH
2486 (i.e. do not have extended leading-codes). Characters of other
2487 charsets are produced without any encoding. If SJIS_P is 1, encode
2488 SJIS text, else encode BIG5 text. */
2489
b73bfc1c 2490static void
4ed46869 2491encode_coding_sjis_big5 (coding, source, destination,
d46c5b12 2492 src_bytes, dst_bytes, sjis_p)
4ed46869
KH
2493 struct coding_system *coding;
2494 unsigned char *source, *destination;
2495 int src_bytes, dst_bytes;
4ed46869
KH
2496 int sjis_p;
2497{
2498 unsigned char *src = source;
2499 unsigned char *src_end = source + src_bytes;
2500 unsigned char *dst = destination;
2501 unsigned char *dst_end = destination + dst_bytes;
b73bfc1c
KH
2502 /* SRC_BASE remembers the start position in source in each loop.
2503 The loop will be exited when there's not enough source text to
2504 analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
2505 there's not enough destination area to produce encoded codes
2506 (within macro EMIT_BYTES). */
2507 unsigned char *src_base;
2508 Lisp_Object translation_table;
4ed46869 2509
b73bfc1c
KH
2510 if (NILP (Venable_character_translation))
2511 translation_table = Qnil;
2512 else
4ed46869 2513 {
b73bfc1c
KH
2514 translation_table = coding->translation_table_for_decode;
2515 if (NILP (translation_table))
2516 translation_table = Vstandard_translation_table_for_decode;
2517 }
a5d301df 2518
b73bfc1c
KH
2519 while (1)
2520 {
2521 int c, charset, c1, c2;
4ed46869 2522
b73bfc1c
KH
2523 src_base = src;
2524 ONE_MORE_CHAR (c);
2525
2526 /* Now encode the character C. */
2527 if (SINGLE_BYTE_CHAR_P (c))
2528 {
2529 switch (c)
4ed46869 2530 {
b73bfc1c
KH
2531 case '\r':
2532 if (!coding->mode & CODING_MODE_SELECTIVE_DISPLAY)
2533 {
2534 EMIT_ONE_BYTE (c);
2535 break;
2536 }
2537 c = '\n';
2538 case '\n':
2539 if (coding->eol_type == CODING_EOL_CRLF)
2540 {
2541 EMIT_TWO_BYTES ('\r', c);
2542 break;
2543 }
2544 else if (coding->eol_type == CODING_EOL_CR)
2545 c = '\r';
2546 default:
2547 EMIT_ONE_BYTE (c);
2548 }
2549 }
2550 else
2551 {
2552 SPLIT_CHAR (c, charset, c1, c2);
2553 if (sjis_p)
2554 {
2555 if (charset == charset_jisx0208
2556 || charset == charset_jisx0208_1978)
2557 {
2558 ENCODE_SJIS (c1, c2, c1, c2);
2559 EMIT_TWO_BYTES (c1, c2);
2560 }
2561 else if (charset == charset_latin_jisx0201)
2562 EMIT_ONE_BYTE (c1);
2563 else
2564 /* There's no way other than producing the internal
2565 codes as is. */
2566 EMIT_BYTES (src_base, src);
4ed46869 2567 }
4ed46869 2568 else
b73bfc1c
KH
2569 {
2570 if (charset == charset_big5_1 || charset == charset_big5_2)
2571 {
2572 ENCODE_BIG5 (charset, c1, c2, c1, c2);
2573 EMIT_TWO_BYTES (c1, c2);
2574 }
2575 else
2576 /* There's no way other than producing the internal
2577 codes as is. */
2578 EMIT_BYTES (src_base, src);
2579 }
4ed46869 2580 }
b73bfc1c 2581 coding->consumed_char++;
4ed46869
KH
2582 }
2583
b73bfc1c
KH
2584 label_end_of_loop:
2585 coding->consumed = src_base - source;
d46c5b12 2586 coding->produced = coding->produced_char = dst - destination;
4ed46869
KH
2587}
2588
2589\f
1397dc18
KH
2590/*** 5. CCL handlers ***/
2591
2592/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2593 Check if a text is encoded in a coding system of which
2594 encoder/decoder are written in CCL program. If it is, return
2595 CODING_CATEGORY_MASK_CCL, else return 0. */
2596
2597int
2598detect_coding_ccl (src, src_end)
2599 unsigned char *src, *src_end;
2600{
2601 unsigned char *valid;
b73bfc1c
KH
2602 int c;
2603 /* Dummy for ONE_MORE_BYTE. */
2604 struct coding_system dummy_coding;
2605 struct coding_system *coding = &dummy_coding;
1397dc18
KH
2606
2607 /* No coding system is assigned to coding-category-ccl. */
2608 if (!coding_system_table[CODING_CATEGORY_IDX_CCL])
2609 return 0;
2610
2611 valid = coding_system_table[CODING_CATEGORY_IDX_CCL]->spec.ccl.valid_codes;
b73bfc1c 2612 while (1)
1397dc18 2613 {
b73bfc1c
KH
2614 ONE_MORE_BYTE (c);
2615 if (! valid[c])
2616 return 0;
1397dc18 2617 }
b73bfc1c 2618 label_end_of_loop:
1397dc18
KH
2619 return CODING_CATEGORY_MASK_CCL;
2620}
2621
2622\f
2623/*** 6. End-of-line handlers ***/
4ed46869 2624
/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".

   Decode the end-of-line format of the SRC_BYTES bytes at SOURCE
   according to `coding->eol_type', emitting each resulting character
   through the macro EMIT_CHAR:

     CODING_EOL_CRLF -- a '\r' followed by '\n' is emitted as a single
       '\n'; a '\r' followed by anything else is emitted as '\r' and
       the following byte is read again on the next iteration.
     CODING_EOL_CR -- '\r' is emitted as '\n'.
     any other value -- every byte is emitted unchanged.

   If `coding->mode' has CODING_MODE_INHIBIT_INCONSISTENT_EOL set, an
   end-of-line that does not match the expected format stops the
   conversion with `coding->result' set to
   CODING_FINISH_INCONSISTENT_EOL.

   On return, `coding->consumed' and `coding->consumed_char' are both
   set to the offset of SRC_BASE from SOURCE, and `coding->produced'
   to the offset of DST from DESTINATION.  */

static void
decode_eol (coding, source, destination, src_bytes, dst_bytes)
     struct coding_system *coding;
     unsigned char *source, *destination;
     int src_bytes, dst_bytes;
{
  unsigned char *src = source;
  unsigned char *dst = destination;
  unsigned char *src_end = src + src_bytes;
  unsigned char *dst_end = dst + dst_bytes;
  Lisp_Object translation_table;
  /* SRC_BASE remembers the start position in source in each loop.
     The loop will be exited when there's not enough source bytes
     (within macro ONE_MORE_BYTE), or when there's not enough
     destination area to produce a character (within macro
     EMIT_CHAR).  NOTE(review): both macros are defined outside this
     function and presumably refer to the local variables declared
     here by name -- do not rename the locals or restructure the loops
     without checking the macro definitions.  */
  unsigned char *src_base;
  int c;

  translation_table = Qnil;
  switch (coding->eol_type)
    {
    case CODING_EOL_CRLF:
      while (1)
        {
          src_base = src;
          ONE_MORE_BYTE (c);
          if (c == '\r')
            {
              ONE_MORE_BYTE (c);
              if (c != '\n')
                {
                  /* A lone '\r' in text that should use "\r\n".  */
                  if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
                    {
                      coding->result = CODING_FINISH_INCONSISTENT_EOL;
                      goto label_end_of_loop;
                    }
                  /* Push back the byte after '\r' so that it is read
                     again, and emit the '\r' itself.  */
                  src--;
                  c = '\r';
                }
            }
          else if (c == '\n'
                   && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL))
            {
              /* A lone '\n' in text that should use "\r\n".  */
              coding->result = CODING_FINISH_INCONSISTENT_EOL;
              goto label_end_of_loop;
            }
          EMIT_CHAR (c);
        }
      break;

    case CODING_EOL_CR:
      while (1)
        {
          src_base = src;
          ONE_MORE_BYTE (c);
          if (c == '\n')
            {
              /* A '\n' in text that should use '\r' as end-of-line.  */
              if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
                {
                  coding->result = CODING_FINISH_INCONSISTENT_EOL;
                  goto label_end_of_loop;
                }
            }
          else if (c == '\r')
            c = '\n';
          EMIT_CHAR (c);
        }
      break;

    default:                    /* no need for EOL handling */
      while (1)
        {
          src_base = src;
          ONE_MORE_BYTE (c);
          EMIT_CHAR (c);
        }
    }

 label_end_of_loop:
  /* SRC_BASE, not SRC, is reported: bytes read for a character that
     was not emitted are not counted as consumed.  */
  coding->consumed = coding->consumed_char = src_base - source;
  coding->produced = dst - destination;
  return;
}
2711
2712/* See "GENERAL NOTES about `encode_coding_XXX ()' functions". Encode
b73bfc1c
KH
2713 format of end-of-line according to `coding->eol_type'. It also
2714 convert multibyte form 8-bit characers to unibyte if
2715 CODING->src_multibyte is nonzero. If `coding->mode &
2716 CODING_MODE_SELECTIVE_DISPLAY' is nonzero, code '\r' in source text
2717 also means end-of-line. */
4ed46869 2718
b73bfc1c 2719static void
d46c5b12 2720encode_eol (coding, source, destination, src_bytes, dst_bytes)
4ed46869
KH
2721 struct coding_system *coding;
2722 unsigned char *source, *destination;
2723 int src_bytes, dst_bytes;
4ed46869
KH
2724{
2725 unsigned char *src = source;
2726 unsigned char *dst = destination;
b73bfc1c
KH
2727 unsigned char *src_end = src + src_bytes;
2728 unsigned char *dst_end = dst + dst_bytes;
2729 Lisp_Object translation_table;
2730 /* SRC_BASE remembers the start position in source in each loop.
2731 The loop will be exited when there's not enough source text to
2732 analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
2733 there's not enough destination area to produce encoded codes
2734 (within macro EMIT_BYTES). */
2735 unsigned char *src_base;
2736 int c;
2737 int selective_display = coding->mode & CODING_MODE_SELECTIVE_DISPLAY;
2738
2739 translation_table = Qnil;
2740 if (coding->src_multibyte
2741 && *(src_end - 1) == LEADING_CODE_8_BIT_CONTROL)
2742 {
2743 src_end--;
2744 src_bytes--;
2745 coding->result = CODING_FINISH_INSUFFICIENT_SRC;
2746 }
fb88bf2d 2747
d46c5b12
KH
2748 if (coding->eol_type == CODING_EOL_CRLF)
2749 {
b73bfc1c 2750 while (src < src_end)
d46c5b12 2751 {
b73bfc1c 2752 src_base = src;
d46c5b12 2753 c = *src++;
b73bfc1c
KH
2754 if (c >= 0x20)
2755 EMIT_ONE_BYTE (c);
2756 else if (c == '\n' || (c == '\r' && selective_display))
2757 EMIT_TWO_BYTES ('\r', '\n');
d46c5b12 2758 else
b73bfc1c 2759 EMIT_ONE_BYTE (c);
d46c5b12 2760 }
ff2b1ea9 2761 src_base = src;
b73bfc1c 2762 label_end_of_loop:
005f0d35 2763 ;
d46c5b12
KH
2764 }
2765 else
4ed46869 2766 {
b73bfc1c 2767 if (src_bytes <= dst_bytes)
4ed46869 2768 {
b73bfc1c
KH
2769 safe_bcopy (src, dst, src_bytes);
2770 src_base = src_end;
2771 dst += src_bytes;
d46c5b12 2772 }
d46c5b12 2773 else
b73bfc1c
KH
2774 {
2775 if (coding->src_multibyte
2776 && *(src + dst_bytes - 1) == LEADING_CODE_8_BIT_CONTROL)
2777 dst_bytes--;
2778 safe_bcopy (src, dst, dst_bytes);
2779 src_base = src + dst_bytes;
2780 dst = destination + dst_bytes;
2781 coding->result = CODING_FINISH_INSUFFICIENT_DST;
2782 }
993824c9 2783 if (coding->eol_type == CODING_EOL_CR)
d46c5b12 2784 {
b73bfc1c
KH
2785 for (src = destination; src < dst; src++)
2786 if (*src == '\n') *src = '\r';
d46c5b12 2787 }
b73bfc1c 2788 else if (selective_display)
d46c5b12 2789 {
b73bfc1c
KH
2790 for (src = destination; src < dst; src++)
2791 if (*src == '\r') *src = '\n';
4ed46869 2792 }
4ed46869 2793 }
b73bfc1c
KH
2794 if (coding->src_multibyte)
2795 dst = destination + str_as_unibyte (destination, dst - destination);
4ed46869 2796
b73bfc1c
KH
2797 coding->consumed = src_base - source;
2798 coding->produced = dst - destination;
4ed46869
KH
2799}
2800
2801\f
1397dc18 2802/*** 7. C library functions ***/
4ed46869
KH
2803
2804/* In Emacs Lisp, coding system is represented by a Lisp symbol which
2805 has a property `coding-system'. The value of this property is a
2806 vector of length 5 (called as coding-vector). Among elements of
2807 this vector, the first (element[0]) and the fifth (element[4])
2808 carry important information for decoding/encoding. Before
2809 decoding/encoding, this information should be set in fields of a
2810 structure of type `coding_system'.
2811
2812 A value of property `coding-system' can be a symbol of another
2813 subsidiary coding-system. In that case, Emacs gets coding-vector
2814 from that symbol.
2815
   `element[0]' contains information to be set in `coding->type'.  The
   values and their meanings are as follows:

   0 -- coding_type_emacs_mule
   1 -- coding_type_sjis
   2 -- coding_type_iso2022
   3 -- coding_type_big5
   4 -- coding_type_ccl encoder/decoder written in CCL
   5 -- coding_type_raw_text
   nil -- coding_type_no_conversion
   t -- coding_type_undecided (automatic conversion on decoding,
        no-conversion on encoding)
4ed46869
KH
2827
2828 `element[4]' contains information to be set in `coding->flags' and
2829 `coding->spec'. The meaning varies by `coding->type'.
2830
   If `coding->type' is `coding_type_iso2022', element[4] is a vector
   of length 32 (of which the first 17 sub-elements, indices 0 through
   16, are used now).  Meanings of these sub-elements are:
2834
2835 sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
2836 If the value is an integer of valid charset, the charset is
2837 assumed to be designated to graphic register N initially.
2838
2839 If the value is minus, it is a minus value of charset which
2840 reserves graphic register N, which means that the charset is
2841 not designated initially but should be designated to graphic
2842 register N just before encoding a character in that charset.
2843
2844 If the value is nil, graphic register N is never used on
2845 encoding.
2846
   sub-element[N] where N is 4 through 16: to be set in `coding->flags'
        Each value takes t or nil.  See the section ISO2022 of
        `coding.h' for more information.
2850
2851 If `coding->type' is `coding_type_big5', element[4] is t to denote
2852 BIG5-ETen or nil to denote BIG5-HKU.
2853
2854 If `coding->type' takes the other value, element[4] is ignored.
2855
2856 Emacs Lisp's coding system also carries information about format of
2857 end-of-line in a value of property `eol-type'. If the value is
2858 integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
2859 means CODING_EOL_CR. If it is not integer, it should be a vector
2860 of subsidiary coding systems of which property `eol-type' has one
2861 of above values.
2862
2863*/
2864
2865/* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
2866 and set it in CODING. If CODING_SYSTEM_SYMBOL is invalid, CODING
2867 is setup so that no conversion is necessary and return -1, else
2868 return 0. */
2869
2870int
e0e989f6
KH
2871setup_coding_system (coding_system, coding)
2872 Lisp_Object coding_system;
4ed46869
KH
2873 struct coding_system *coding;
2874{
d46c5b12 2875 Lisp_Object coding_spec, coding_type, eol_type, plist;
4608c386 2876 Lisp_Object val;
70c22245 2877 int i;
4ed46869 2878
d46c5b12 2879 /* Initialize some fields required for all kinds of coding systems. */
774324d6 2880 coding->symbol = coding_system;
d46c5b12
KH
2881 coding->common_flags = 0;
2882 coding->mode = 0;
2883 coding->heading_ascii = -1;
2884 coding->post_read_conversion = coding->pre_write_conversion = Qnil;
ec6d2bb8
KH
2885 coding->composing = COMPOSITION_DISABLED;
2886 coding->cmp_data = NULL;
1f5dbf34
KH
2887
2888 if (NILP (coding_system))
2889 goto label_invalid_coding_system;
2890
4608c386 2891 coding_spec = Fget (coding_system, Qcoding_system);
1f5dbf34 2892
4608c386
KH
2893 if (!VECTORP (coding_spec)
2894 || XVECTOR (coding_spec)->size != 5
2895 || !CONSP (XVECTOR (coding_spec)->contents[3]))
4ed46869 2896 goto label_invalid_coding_system;
4608c386 2897
d46c5b12
KH
2898 eol_type = inhibit_eol_conversion ? Qnil : Fget (coding_system, Qeol_type);
2899 if (VECTORP (eol_type))
2900 {
2901 coding->eol_type = CODING_EOL_UNDECIDED;
2902 coding->common_flags = CODING_REQUIRE_DETECTION_MASK;
2903 }
2904 else if (XFASTINT (eol_type) == 1)
2905 {
2906 coding->eol_type = CODING_EOL_CRLF;
2907 coding->common_flags
2908 = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
2909 }
2910 else if (XFASTINT (eol_type) == 2)
2911 {
2912 coding->eol_type = CODING_EOL_CR;
2913 coding->common_flags
2914 = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
2915 }
2916 else
2917 coding->eol_type = CODING_EOL_LF;
2918
2919 coding_type = XVECTOR (coding_spec)->contents[0];
2920 /* Try short cut. */
2921 if (SYMBOLP (coding_type))
2922 {
2923 if (EQ (coding_type, Qt))
2924 {
2925 coding->type = coding_type_undecided;
2926 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
2927 }
2928 else
2929 coding->type = coding_type_no_conversion;
2930 return 0;
2931 }
2932
d46c5b12
KH
2933 /* Get values of coding system properties:
2934 `post-read-conversion', `pre-write-conversion',
f967223b 2935 `translation-table-for-decode', `translation-table-for-encode'. */
4608c386 2936 plist = XVECTOR (coding_spec)->contents[3];
b843d1ae
KH
2937 /* Pre & post conversion functions should be disabled if
2938 inhibit_eol_conversion is nozero. This is the case that a code
2939 conversion function is called while those functions are running. */
2940 if (! inhibit_pre_post_conversion)
2941 {
2942 coding->post_read_conversion = Fplist_get (plist, Qpost_read_conversion);
2943 coding->pre_write_conversion = Fplist_get (plist, Qpre_write_conversion);
2944 }
f967223b 2945 val = Fplist_get (plist, Qtranslation_table_for_decode);
4608c386 2946 if (SYMBOLP (val))
f967223b
KH
2947 val = Fget (val, Qtranslation_table_for_decode);
2948 coding->translation_table_for_decode = CHAR_TABLE_P (val) ? val : Qnil;
2949 val = Fplist_get (plist, Qtranslation_table_for_encode);
4608c386 2950 if (SYMBOLP (val))
f967223b
KH
2951 val = Fget (val, Qtranslation_table_for_encode);
2952 coding->translation_table_for_encode = CHAR_TABLE_P (val) ? val : Qnil;
d46c5b12
KH
2953 val = Fplist_get (plist, Qcoding_category);
2954 if (!NILP (val))
2955 {
2956 val = Fget (val, Qcoding_category_index);
2957 if (INTEGERP (val))
2958 coding->category_idx = XINT (val);
2959 else
2960 goto label_invalid_coding_system;
2961 }
2962 else
2963 goto label_invalid_coding_system;
4608c386 2964
70c22245
KH
2965 val = Fplist_get (plist, Qsafe_charsets);
2966 if (EQ (val, Qt))
2967 {
2968 for (i = 0; i <= MAX_CHARSET; i++)
2969 coding->safe_charsets[i] = 1;
2970 }
2971 else
2972 {
2973 bzero (coding->safe_charsets, MAX_CHARSET + 1);
2974 while (CONSP (val))
2975 {
03699b14 2976 if ((i = get_charset_id (XCAR (val))) >= 0)
70c22245 2977 coding->safe_charsets[i] = 1;
03699b14 2978 val = XCDR (val);
70c22245
KH
2979 }
2980 }
2981
ec6d2bb8
KH
2982 /* If the coding system has non-nil `composition' property, enable
2983 composition handling. */
2984 val = Fplist_get (plist, Qcomposition);
2985 if (!NILP (val))
2986 coding->composing = COMPOSITION_NO;
2987
d46c5b12 2988 switch (XFASTINT (coding_type))
4ed46869
KH
2989 {
2990 case 0:
0ef69138 2991 coding->type = coding_type_emacs_mule;
c952af22
KH
2992 if (!NILP (coding->post_read_conversion))
2993 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
2994 if (!NILP (coding->pre_write_conversion))
2995 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
4ed46869
KH
2996 break;
2997
2998 case 1:
2999 coding->type = coding_type_sjis;
c952af22
KH
3000 coding->common_flags
3001 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
4ed46869
KH
3002 break;
3003
3004 case 2:
3005 coding->type = coding_type_iso2022;
c952af22
KH
3006 coding->common_flags
3007 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
4ed46869 3008 {
70c22245 3009 Lisp_Object val, temp;
4ed46869 3010 Lisp_Object *flags;
d46c5b12 3011 int i, charset, reg_bits = 0;
4ed46869 3012
4608c386 3013 val = XVECTOR (coding_spec)->contents[4];
f44d27ce 3014
4ed46869
KH
3015 if (!VECTORP (val) || XVECTOR (val)->size != 32)
3016 goto label_invalid_coding_system;
3017
3018 flags = XVECTOR (val)->contents;
3019 coding->flags
3020 = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
3021 | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
3022 | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
3023 | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
3024 | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
3025 | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
3026 | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
3027 | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
e0e989f6
KH
3028 | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION)
3029 | (NILP (flags[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL)
c4825358
KH
3030 | (NILP (flags[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL)
3031 | (NILP (flags[15]) ? 0 : CODING_FLAG_ISO_SAFE)
3f003981 3032 | (NILP (flags[16]) ? 0 : CODING_FLAG_ISO_LATIN_EXTRA)
c4825358 3033 );
4ed46869
KH
3034
3035 /* Invoke graphic register 0 to plane 0. */
3036 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
3037 /* Invoke graphic register 1 to plane 1 if we can use full 8-bit. */
3038 CODING_SPEC_ISO_INVOCATION (coding, 1)
3039 = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
3040 /* Not single shifting at first. */
6e85d753 3041 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;
e0e989f6 3042 /* Beginning of buffer should also be regarded as bol. */
6e85d753 3043 CODING_SPEC_ISO_BOL (coding) = 1;
4ed46869 3044
70c22245
KH
3045 for (charset = 0; charset <= MAX_CHARSET; charset++)
3046 CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = 255;
3047 val = Vcharset_revision_alist;
3048 while (CONSP (val))
3049 {
03699b14 3050 charset = get_charset_id (Fcar_safe (XCAR (val)));
70c22245 3051 if (charset >= 0
03699b14 3052 && (temp = Fcdr_safe (XCAR (val)), INTEGERP (temp))
70c22245
KH
3053 && (i = XINT (temp), (i >= 0 && (i + '@') < 128)))
3054 CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = i;
03699b14 3055 val = XCDR (val);
70c22245
KH
3056 }
3057
4ed46869
KH
3058 /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
3059 FLAGS[REG] can be one of below:
3060 integer CHARSET: CHARSET occupies register I,
3061 t: designate nothing to REG initially, but can be used
3062 by any charsets,
3063 list of integer, nil, or t: designate the first
3064 element (if integer) to REG initially, the remaining
3065 elements (if integer) is designated to REG on request,
d46c5b12 3066 if an element is t, REG can be used by any charsets,
4ed46869 3067 nil: REG is never used. */
467e7675 3068 for (charset = 0; charset <= MAX_CHARSET; charset++)
1ba9e4ab
KH
3069 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3070 = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION;
4ed46869
KH
3071 for (i = 0; i < 4; i++)
3072 {
3073 if (INTEGERP (flags[i])
e0e989f6
KH
3074 && (charset = XINT (flags[i]), CHARSET_VALID_P (charset))
3075 || (charset = get_charset_id (flags[i])) >= 0)
4ed46869
KH
3076 {
3077 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3078 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
3079 }
3080 else if (EQ (flags[i], Qt))
3081 {
3082 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
d46c5b12
KH
3083 reg_bits |= 1 << i;
3084 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
4ed46869
KH
3085 }
3086 else if (CONSP (flags[i]))
3087 {
84d60297
RS
3088 Lisp_Object tail;
3089 tail = flags[i];
4ed46869 3090
d46c5b12 3091 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
03699b14
KR
3092 if (INTEGERP (XCAR (tail))
3093 && (charset = XINT (XCAR (tail)),
e0e989f6 3094 CHARSET_VALID_P (charset))
03699b14 3095 || (charset = get_charset_id (XCAR (tail))) >= 0)
4ed46869
KH
3096 {
3097 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3098 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
3099 }
3100 else
3101 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
03699b14 3102 tail = XCDR (tail);
4ed46869
KH
3103 while (CONSP (tail))
3104 {
03699b14
KR
3105 if (INTEGERP (XCAR (tail))
3106 && (charset = XINT (XCAR (tail)),
e0e989f6 3107 CHARSET_VALID_P (charset))
03699b14 3108 || (charset = get_charset_id (XCAR (tail))) >= 0)
70c22245
KH
3109 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3110 = i;
03699b14 3111 else if (EQ (XCAR (tail), Qt))
d46c5b12 3112 reg_bits |= 1 << i;
03699b14 3113 tail = XCDR (tail);
4ed46869
KH
3114 }
3115 }
3116 else
3117 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3118
3119 CODING_SPEC_ISO_DESIGNATION (coding, i)
3120 = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
3121 }
3122
d46c5b12 3123 if (reg_bits && ! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
4ed46869
KH
3124 {
3125 /* REG 1 can be used only by locking shift in 7-bit env. */
3126 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
d46c5b12 3127 reg_bits &= ~2;
4ed46869
KH
3128 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
3129 /* Without any shifting, only REG 0 and 1 can be used. */
d46c5b12 3130 reg_bits &= 3;
4ed46869
KH
3131 }
3132
d46c5b12
KH
3133 if (reg_bits)
3134 for (charset = 0; charset <= MAX_CHARSET; charset++)
6e85d753 3135 {
d46c5b12
KH
3136 if (CHARSET_VALID_P (charset))
3137 {
3138 /* There exist some default graphic registers to be
3139 used CHARSET. */
3140
3141 /* We had better avoid designating a charset of
3142 CHARS96 to REG 0 as far as possible. */
3143 if (CHARSET_CHARS (charset) == 96)
3144 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3145 = (reg_bits & 2
3146 ? 1 : (reg_bits & 4 ? 2 : (reg_bits & 8 ? 3 : 0)));
3147 else
3148 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3149 = (reg_bits & 1
3150 ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
3151 }
6e85d753 3152 }
4ed46869 3153 }
c952af22 3154 coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
d46c5b12 3155 coding->spec.iso2022.last_invalid_designation_register = -1;
4ed46869
KH
3156 break;
3157
3158 case 3:
3159 coding->type = coding_type_big5;
c952af22
KH
3160 coding->common_flags
3161 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
4ed46869 3162 coding->flags
4608c386 3163 = (NILP (XVECTOR (coding_spec)->contents[4])
4ed46869
KH
3164 ? CODING_FLAG_BIG5_HKU
3165 : CODING_FLAG_BIG5_ETEN);
3166 break;
3167
3168 case 4:
3169 coding->type = coding_type_ccl;
c952af22
KH
3170 coding->common_flags
3171 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
4ed46869 3172 {
84d60297 3173 val = XVECTOR (coding_spec)->contents[4];
ef4ced28
KH
3174 if (! CONSP (val)
3175 || setup_ccl_program (&(coding->spec.ccl.decoder),
03699b14 3176 XCAR (val)) < 0
ef4ced28 3177 || setup_ccl_program (&(coding->spec.ccl.encoder),
03699b14 3178 XCDR (val)) < 0)
4ed46869 3179 goto label_invalid_coding_system;
1397dc18
KH
3180
3181 bzero (coding->spec.ccl.valid_codes, 256);
3182 val = Fplist_get (plist, Qvalid_codes);
3183 if (CONSP (val))
3184 {
3185 Lisp_Object this;
3186
03699b14 3187 for (; CONSP (val); val = XCDR (val))
1397dc18 3188 {
03699b14 3189 this = XCAR (val);
1397dc18
KH
3190 if (INTEGERP (this)
3191 && XINT (this) >= 0 && XINT (this) < 256)
3192 coding->spec.ccl.valid_codes[XINT (this)] = 1;
3193 else if (CONSP (this)
03699b14
KR
3194 && INTEGERP (XCAR (this))
3195 && INTEGERP (XCDR (this)))
1397dc18 3196 {
03699b14
KR
3197 int start = XINT (XCAR (this));
3198 int end = XINT (XCDR (this));
1397dc18
KH
3199
3200 if (start >= 0 && start <= end && end < 256)
e133c8fa 3201 while (start <= end)
1397dc18
KH
3202 coding->spec.ccl.valid_codes[start++] = 1;
3203 }
3204 }
3205 }
4ed46869 3206 }
c952af22 3207 coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
4ed46869
KH
3208 break;
3209
27901516
KH
3210 case 5:
3211 coding->type = coding_type_raw_text;
3212 break;
3213
4ed46869 3214 default:
d46c5b12 3215 goto label_invalid_coding_system;
4ed46869
KH
3216 }
3217 return 0;
3218
3219 label_invalid_coding_system:
3220 coding->type = coding_type_no_conversion;
d46c5b12 3221 coding->category_idx = CODING_CATEGORY_IDX_BINARY;
c952af22 3222 coding->common_flags = 0;
dec137e5 3223 coding->eol_type = CODING_EOL_LF;
d46c5b12 3224 coding->pre_write_conversion = coding->post_read_conversion = Qnil;
4ed46869
KH
3225 return -1;
3226}
3227
ec6d2bb8
KH
3228/* Free memory blocks allocated for storing composition information. */
3229
3230void
3231coding_free_composition_data (coding)
3232 struct coding_system *coding;
3233{
3234 struct composition_data *cmp_data = coding->cmp_data, *next;
3235
3236 if (!cmp_data)
3237 return;
3238 /* Memory blocks are chained. At first, rewind to the first, then,
3239 free blocks one by one. */
3240 while (cmp_data->prev)
3241 cmp_data = cmp_data->prev;
3242 while (cmp_data)
3243 {
3244 next = cmp_data->next;
3245 xfree (cmp_data);
3246 cmp_data = next;
3247 }
3248 coding->cmp_data = NULL;
3249}
3250
3251/* Set `char_offset' member of all memory blocks pointed by
3252 coding->cmp_data to POS. */
3253
3254void
3255coding_adjust_composition_offset (coding, pos)
3256 struct coding_system *coding;
3257 int pos;
3258{
3259 struct composition_data *cmp_data;
3260
3261 for (cmp_data = coding->cmp_data; cmp_data; cmp_data = cmp_data->next)
3262 cmp_data->char_offset = pos;
3263}
3264
54f78171
KH
3265/* Setup raw-text or one of its subsidiaries in the structure
3266 coding_system CODING according to the already setup value eol_type
3267 in CODING. CODING should be setup for some coding system in
3268 advance. */
3269
3270void
3271setup_raw_text_coding_system (coding)
3272 struct coding_system *coding;
3273{
3274 if (coding->type != coding_type_raw_text)
3275 {
3276 coding->symbol = Qraw_text;
3277 coding->type = coding_type_raw_text;
3278 if (coding->eol_type != CODING_EOL_UNDECIDED)
3279 {
84d60297
RS
3280 Lisp_Object subsidiaries;
3281 subsidiaries = Fget (Qraw_text, Qeol_type);
54f78171
KH
3282
3283 if (VECTORP (subsidiaries)
3284 && XVECTOR (subsidiaries)->size == 3)
3285 coding->symbol
3286 = XVECTOR (subsidiaries)->contents[coding->eol_type];
3287 }
716e0b0a 3288 setup_coding_system (coding->symbol, coding);
54f78171
KH
3289 }
3290 return;
3291}
3292
4ed46869
KH
/* Emacs has a mechanism to automatically detect a coding system if it
   is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
   it's impossible to distinguish some coding systems accurately
   because they use the same range of codes.  So, at first, coding
   systems are grouped into the following categories:
3298
0ef69138 3299 o coding-category-emacs-mule
4ed46869
KH
3300
3301 The category for a coding system which has the same code range
3302 as Emacs' internal format. Assigned the coding-system (Lisp
0ef69138 3303 symbol) `emacs-mule' by default.
4ed46869
KH
3304
3305 o coding-category-sjis
3306
3307 The category for a coding system which has the same code range
3308 as SJIS. Assigned the coding-system (Lisp
7717c392 3309 symbol) `japanese-shift-jis' by default.
4ed46869
KH
3310
3311 o coding-category-iso-7
3312
3313 The category for a coding system which has the same code range
7717c392 3314 as ISO2022 of 7-bit environment. This doesn't use any locking
d46c5b12
KH
3315 shift and single shift functions. This can encode/decode all
3316 charsets. Assigned the coding-system (Lisp symbol)
3317 `iso-2022-7bit' by default.
3318
3319 o coding-category-iso-7-tight
3320
3321 Same as coding-category-iso-7 except that this can
3322 encode/decode only the specified charsets.
4ed46869
KH
3323
3324 o coding-category-iso-8-1
3325
3326 The category for a coding system which has the same code range
3327 as ISO2022 of 8-bit environment and graphic plane 1 used only
7717c392
KH
3328 for DIMENSION1 charset. This doesn't use any locking shift
3329 and single shift functions. Assigned the coding-system (Lisp
3330 symbol) `iso-latin-1' by default.
4ed46869
KH
3331
3332 o coding-category-iso-8-2
3333
3334 The category for a coding system which has the same code range
3335 as ISO2022 of 8-bit environment and graphic plane 1 used only
7717c392
KH
3336 for DIMENSION2 charset. This doesn't use any locking shift
3337 and single shift functions. Assigned the coding-system (Lisp
3338 symbol) `japanese-iso-8bit' by default.
4ed46869 3339
   o coding-category-iso-7-else

        The category for a coding system which has the same code range
        as ISO2022 of 7-bit environment but uses locking shift or
        single shift functions.  Assigned the coding-system (Lisp
        symbol) `iso-2022-7bit-lock' by default.

   o coding-category-iso-8-else

        The category for a coding system which has the same code range
        as ISO2022 of 8-bit environment but uses locking shift or
        single shift functions.  Assigned the coding-system (Lisp
        symbol) `iso-2022-8bit-ss2' by default.
4ed46869
KH
3353
3354 o coding-category-big5
3355
3356 The category for a coding system which has the same code range
3357 as BIG5. Assigned the coding-system (Lisp symbol)
e0e989f6 3358 `cn-big5' by default.
4ed46869 3359
fa42c37f
KH
3360 o coding-category-utf-8
3361
3362 The category for a coding system which has the same code range
3363 as UTF-8 (cf. RFC2279). Assigned the coding-system (Lisp
3364 symbol) `utf-8' by default.
3365
3366 o coding-category-utf-16-be
3367
3368 The category for a coding system in which a text has an
3369 Unicode signature (cf. Unicode Standard) in the order of BIG
3370 endian at the head. Assigned the coding-system (Lisp symbol)
3371 `utf-16-be' by default.
3372
3373 o coding-category-utf-16-le
3374
3375 The category for a coding system in which a text has an
3376 Unicode signature (cf. Unicode Standard) in the order of
3377 LITTLE endian at the head. Assigned the coding-system (Lisp
3378 symbol) `utf-16-le' by default.
3379
1397dc18
KH
3380 o coding-category-ccl
3381
3382 The category for a coding system of which encoder/decoder is
3383 written in CCL programs. The default value is nil, i.e., no
3384 coding system is assigned.
3385
4ed46869
KH
3386 o coding-category-binary
3387
3388 The category for a coding system not categorized in any of the
3389 above. Assigned the coding-system (Lisp symbol)
e0e989f6 3390 `no-conversion' by default.
4ed46869
KH
3391
3392 Each of them is a Lisp symbol and the value is an actual
3393 `coding-system's (this is also a Lisp symbol) assigned by a user.
3394 What Emacs does actually is to detect a category of coding system.
3395 Then, it uses a `coding-system' assigned to it. If Emacs can't
3396 decide only one possible category, it selects a category of the
3397 highest priority. Priorities of categories are also specified by a
3398 user in a Lisp variable `coding-category-list'.
3399
3400*/
3401
66cfb530
KH
/* Table indexed by a byte value.  detect_coding_mask skips the
   leading bytes of a text whose entry here is nonzero.  It clears the
   entries for ISO_CODE_SO, ISO_CODE_SI and ISO_CODE_ESC on entry and
   sets them back to 1 once such a byte turned out not to start a
   valid ISO2022 sequence.  NOTE(review): the initialization of the
   other entries is not in this part of the file.  */
static
int ascii_skip_code[256];
3404
d46c5b12 3405/* Detect how a text of length SRC_BYTES pointed by SOURCE is encoded.
4ed46869
KH
3406 If it detects possible coding systems, return an integer in which
3407 appropriate flag bits are set. Flag bits are defined by macros
fa42c37f
KH
3408 CODING_CATEGORY_MASK_XXX in `coding.h'. If PRIORITIES is non-NULL,
3409 it should point the table `coding_priorities'. In that case, only
3410 the flag bit for a coding system of the highest priority is set in
3411 the returned value.
4ed46869 3412
d46c5b12
KH
3413 How many ASCII characters are at the head is returned as *SKIP. */
3414
3415static int
3416detect_coding_mask (source, src_bytes, priorities, skip)
3417 unsigned char *source;
3418 int src_bytes, *priorities, *skip;
4ed46869
KH
3419{
3420 register unsigned char c;
d46c5b12 3421 unsigned char *src = source, *src_end = source + src_bytes;
fa42c37f
KH
3422 unsigned int mask, utf16_examined_p, iso2022_examined_p;
3423 int i, idx;
4ed46869
KH
3424
3425 /* At first, skip all ASCII characters and control characters except
3426 for three ISO2022 specific control characters. */
66cfb530
KH
3427 ascii_skip_code[ISO_CODE_SO] = 0;
3428 ascii_skip_code[ISO_CODE_SI] = 0;
3429 ascii_skip_code[ISO_CODE_ESC] = 0;
3430
bcf26d6a 3431 label_loop_detect_coding:
66cfb530 3432 while (src < src_end && ascii_skip_code[*src]) src++;
d46c5b12 3433 *skip = src - source;
4ed46869
KH
3434
3435 if (src >= src_end)
3436 /* We found nothing other than ASCII. There's nothing to do. */
d46c5b12 3437 return 0;
4ed46869 3438
8a8147d6 3439 c = *src;
4ed46869
KH
3440 /* The text seems to be encoded in some multilingual coding system.
3441 Now, try to find in which coding system the text is encoded. */
3442 if (c < 0x80)
bcf26d6a
KH
3443 {
3444 /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
3445 /* C is an ISO2022 specific control code of C0. */
3446 mask = detect_coding_iso2022 (src, src_end);
1b2af4b0 3447 if (mask == 0)
d46c5b12
KH
3448 {
3449 /* No valid ISO2022 code follows C. Try again. */
3450 src++;
66cfb530
KH
3451 if (c == ISO_CODE_ESC)
3452 ascii_skip_code[ISO_CODE_ESC] = 1;
3453 else
3454 ascii_skip_code[ISO_CODE_SO] = ascii_skip_code[ISO_CODE_SI] = 1;
d46c5b12
KH
3455 goto label_loop_detect_coding;
3456 }
3457 if (priorities)
fa42c37f
KH
3458 {
3459 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3460 {
3461 if (mask & priorities[i])
3462 return priorities[i];
3463 }
3464 return CODING_CATEGORY_MASK_RAW_TEXT;
3465 }
bcf26d6a 3466 }
d46c5b12 3467 else
c4825358 3468 {
d46c5b12 3469 int try;
4ed46869 3470
d46c5b12
KH
3471 if (c < 0xA0)
3472 {
3473 /* C is the first byte of SJIS character code,
fa42c37f
KH
3474 or a leading-code of Emacs' internal format (emacs-mule),
3475 or the first byte of UTF-16. */
3476 try = (CODING_CATEGORY_MASK_SJIS
3477 | CODING_CATEGORY_MASK_EMACS_MULE
3478 | CODING_CATEGORY_MASK_UTF_16_BE
3479 | CODING_CATEGORY_MASK_UTF_16_LE);
d46c5b12
KH
3480
3481 /* Or, if C is a special latin extra code,
3482 or is an ISO2022 specific control code of C1 (SS2 or SS3),
3483 or is an ISO2022 control-sequence-introducer (CSI),
3484 we should also consider the possibility of ISO2022 codings. */
3485 if ((VECTORP (Vlatin_extra_code_table)
3486 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
3487 || (c == ISO_CODE_SS2 || c == ISO_CODE_SS3)
3488 || (c == ISO_CODE_CSI
3489 && (src < src_end
3490 && (*src == ']'
3491 || ((*src == '0' || *src == '1' || *src == '2')
3492 && src + 1 < src_end
3493 && src[1] == ']')))))
3494 try |= (CODING_CATEGORY_MASK_ISO_8_ELSE
3495 | CODING_CATEGORY_MASK_ISO_8BIT);
3496 }
c4825358 3497 else
d46c5b12
KH
3498 /* C is a character of ISO2022 in graphic plane right,
3499 or a SJIS's 1-byte character code (i.e. JISX0201),
fa42c37f
KH
3500 or the first byte of BIG5's 2-byte code,
3501 or the first byte of UTF-8/16. */
d46c5b12
KH
3502 try = (CODING_CATEGORY_MASK_ISO_8_ELSE
3503 | CODING_CATEGORY_MASK_ISO_8BIT
3504 | CODING_CATEGORY_MASK_SJIS
fa42c37f
KH
3505 | CODING_CATEGORY_MASK_BIG5
3506 | CODING_CATEGORY_MASK_UTF_8
3507 | CODING_CATEGORY_MASK_UTF_16_BE
3508 | CODING_CATEGORY_MASK_UTF_16_LE);
d46c5b12 3509
1397dc18
KH
3510 /* Or, we may have to consider the possibility of CCL. */
3511 if (coding_system_table[CODING_CATEGORY_IDX_CCL]
3512 && (coding_system_table[CODING_CATEGORY_IDX_CCL]
3513 ->spec.ccl.valid_codes)[c])
3514 try |= CODING_CATEGORY_MASK_CCL;
3515
d46c5b12 3516 mask = 0;
fa42c37f 3517 utf16_examined_p = iso2022_examined_p = 0;
d46c5b12
KH
3518 if (priorities)
3519 {
3520 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3521 {
fa42c37f
KH
3522 if (!iso2022_examined_p
3523 && (priorities[i] & try & CODING_CATEGORY_MASK_ISO))
3524 {
3525 mask |= detect_coding_iso2022 (src, src_end);
3526 iso2022_examined_p = 1;
3527 }
5ab13dd0 3528 else if (priorities[i] & try & CODING_CATEGORY_MASK_SJIS)
fa42c37f
KH
3529 mask |= detect_coding_sjis (src, src_end);
3530 else if (priorities[i] & try & CODING_CATEGORY_MASK_UTF_8)
3531 mask |= detect_coding_utf_8 (src, src_end);
3532 else if (!utf16_examined_p
3533 && (priorities[i] & try &
3534 CODING_CATEGORY_MASK_UTF_16_BE_LE))
3535 {
3536 mask |= detect_coding_utf_16 (src, src_end);
3537 utf16_examined_p = 1;
3538 }
5ab13dd0 3539 else if (priorities[i] & try & CODING_CATEGORY_MASK_BIG5)
fa42c37f 3540 mask |= detect_coding_big5 (src, src_end);
5ab13dd0 3541 else if (priorities[i] & try & CODING_CATEGORY_MASK_EMACS_MULE)
fa42c37f 3542 mask |= detect_coding_emacs_mule (src, src_end);
89fa8b36 3543 else if (priorities[i] & try & CODING_CATEGORY_MASK_CCL)
fa42c37f 3544 mask |= detect_coding_ccl (src, src_end);
5ab13dd0 3545 else if (priorities[i] & CODING_CATEGORY_MASK_RAW_TEXT)
fa42c37f 3546 mask |= CODING_CATEGORY_MASK_RAW_TEXT;
5ab13dd0 3547 else if (priorities[i] & CODING_CATEGORY_MASK_BINARY)
fa42c37f
KH
3548 mask |= CODING_CATEGORY_MASK_BINARY;
3549 if (mask & priorities[i])
3550 return priorities[i];
d46c5b12
KH
3551 }
3552 return CODING_CATEGORY_MASK_RAW_TEXT;
3553 }
3554 if (try & CODING_CATEGORY_MASK_ISO)
3555 mask |= detect_coding_iso2022 (src, src_end);
3556 if (try & CODING_CATEGORY_MASK_SJIS)
3557 mask |= detect_coding_sjis (src, src_end);
3558 if (try & CODING_CATEGORY_MASK_BIG5)
3559 mask |= detect_coding_big5 (src, src_end);
fa42c37f
KH
3560 if (try & CODING_CATEGORY_MASK_UTF_8)
3561 mask |= detect_coding_utf_8 (src, src_end);
3562 if (try & CODING_CATEGORY_MASK_UTF_16_BE_LE)
3563 mask |= detect_coding_utf_16 (src, src_end);
d46c5b12 3564 if (try & CODING_CATEGORY_MASK_EMACS_MULE)
1397dc18
KH
3565 mask |= detect_coding_emacs_mule (src, src_end);
3566 if (try & CODING_CATEGORY_MASK_CCL)
3567 mask |= detect_coding_ccl (src, src_end);
c4825358 3568 }
5ab13dd0 3569 return (mask | CODING_CATEGORY_MASK_RAW_TEXT | CODING_CATEGORY_MASK_BINARY);
4ed46869
KH
3570}
3571
3572/* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
3573 The information of the detected coding system is set in CODING. */
3574
3575void
3576detect_coding (coding, src, src_bytes)
3577 struct coding_system *coding;
3578 unsigned char *src;
3579 int src_bytes;
3580{
d46c5b12
KH
3581 unsigned int idx;
3582 int skip, mask, i;
84d60297 3583 Lisp_Object val;
4ed46869 3584
84d60297 3585 val = Vcoding_category_list;
66cfb530 3586 mask = detect_coding_mask (src, src_bytes, coding_priorities, &skip);
d46c5b12 3587 coding->heading_ascii = skip;
4ed46869 3588
d46c5b12
KH
3589 if (!mask) return;
3590
3591 /* We found a single coding system of the highest priority in MASK. */
3592 idx = 0;
3593 while (mask && ! (mask & 1)) mask >>= 1, idx++;
3594 if (! mask)
3595 idx = CODING_CATEGORY_IDX_RAW_TEXT;
4ed46869 3596
d46c5b12
KH
3597 val = XSYMBOL (XVECTOR (Vcoding_category_table)->contents[idx])->value;
3598
3599 if (coding->eol_type != CODING_EOL_UNDECIDED)
27901516 3600 {
84d60297 3601 Lisp_Object tmp;
d46c5b12 3602
84d60297 3603 tmp = Fget (val, Qeol_type);
d46c5b12
KH
3604 if (VECTORP (tmp))
3605 val = XVECTOR (tmp)->contents[coding->eol_type];
4ed46869 3606 }
b73bfc1c
KH
3607
3608 /* Setup this new coding system while preserving some slots. */
3609 {
3610 int src_multibyte = coding->src_multibyte;
3611 int dst_multibyte = coding->dst_multibyte;
3612
3613 setup_coding_system (val, coding);
3614 coding->src_multibyte = src_multibyte;
3615 coding->dst_multibyte = dst_multibyte;
3616 coding->heading_ascii = skip;
3617 }
4ed46869
KH
3618}
3619
d46c5b12
KH
3620/* Detect how end-of-line of a text of length SRC_BYTES pointed by
3621 SOURCE is encoded. Return one of CODING_EOL_LF, CODING_EOL_CRLF,
3622 CODING_EOL_CR, and CODING_EOL_UNDECIDED.
3623
3624 How many non-eol characters are at the head is returned as *SKIP. */
4ed46869 3625
bc4bc72a
RS
3626#define MAX_EOL_CHECK_COUNT 3
3627
d46c5b12
KH
3628static int
3629detect_eol_type (source, src_bytes, skip)
3630 unsigned char *source;
3631 int src_bytes, *skip;
4ed46869 3632{
d46c5b12 3633 unsigned char *src = source, *src_end = src + src_bytes;
4ed46869 3634 unsigned char c;
bc4bc72a
RS
3635 int total = 0; /* How many end-of-lines are found so far. */
3636 int eol_type = CODING_EOL_UNDECIDED;
3637 int this_eol_type;
4ed46869 3638
d46c5b12
KH
3639 *skip = 0;
3640
bc4bc72a 3641 while (src < src_end && total < MAX_EOL_CHECK_COUNT)
4ed46869
KH
3642 {
3643 c = *src++;
bc4bc72a 3644 if (c == '\n' || c == '\r')
4ed46869 3645 {
d46c5b12
KH
3646 if (*skip == 0)
3647 *skip = src - 1 - source;
bc4bc72a
RS
3648 total++;
3649 if (c == '\n')
3650 this_eol_type = CODING_EOL_LF;
3651 else if (src >= src_end || *src != '\n')
3652 this_eol_type = CODING_EOL_CR;
4ed46869 3653 else
bc4bc72a
RS
3654 this_eol_type = CODING_EOL_CRLF, src++;
3655
3656 if (eol_type == CODING_EOL_UNDECIDED)
3657 /* This is the first end-of-line. */
3658 eol_type = this_eol_type;
3659 else if (eol_type != this_eol_type)
d46c5b12
KH
3660 {
3661 /* The found type is different from what found before. */
3662 eol_type = CODING_EOL_INCONSISTENT;
3663 break;
3664 }
4ed46869
KH
3665 }
3666 }
bc4bc72a 3667
d46c5b12
KH
3668 if (*skip == 0)
3669 *skip = src_end - source;
85a02ca4 3670 return eol_type;
4ed46869
KH
3671}
3672
fa42c37f
KH
3673/* Like detect_eol_type, but detect EOL type in 2-octet
3674 big-endian/little-endian format for coding systems utf-16-be and
3675 utf-16-le. */
3676
3677static int
3678detect_eol_type_in_2_octet_form (source, src_bytes, skip, big_endian_p)
3679 unsigned char *source;
3680 int src_bytes, *skip;
3681{
3682 unsigned char *src = source, *src_end = src + src_bytes;
3683 unsigned int c1, c2;
3684 int total = 0; /* How many end-of-lines are found so far. */
3685 int eol_type = CODING_EOL_UNDECIDED;
3686 int this_eol_type;
3687 int msb, lsb;
3688
3689 if (big_endian_p)
3690 msb = 0, lsb = 1;
3691 else
3692 msb = 1, lsb = 0;
3693
3694 *skip = 0;
3695
3696 while ((src + 1) < src_end && total < MAX_EOL_CHECK_COUNT)
3697 {
3698 c1 = (src[msb] << 8) | (src[lsb]);
3699 src += 2;
3700
3701 if (c1 == '\n' || c1 == '\r')
3702 {
3703 if (*skip == 0)
3704 *skip = src - 2 - source;
3705 total++;
3706 if (c1 == '\n')
3707 {
3708 this_eol_type = CODING_EOL_LF;
3709 }
3710 else
3711 {
3712 if ((src + 1) >= src_end)
3713 {
3714 this_eol_type = CODING_EOL_CR;
3715 }
3716 else
3717 {
3718 c2 = (src[msb] << 8) | (src[lsb]);
3719 if (c2 == '\n')
3720 this_eol_type = CODING_EOL_CRLF, src += 2;
3721 else
3722 this_eol_type = CODING_EOL_CR;
3723 }
3724 }
3725
3726 if (eol_type == CODING_EOL_UNDECIDED)
3727 /* This is the first end-of-line. */
3728 eol_type = this_eol_type;
3729 else if (eol_type != this_eol_type)
3730 {
3731 /* The found type is different from what found before. */
3732 eol_type = CODING_EOL_INCONSISTENT;
3733 break;
3734 }
3735 }
3736 }
3737
3738 if (*skip == 0)
3739 *skip = src_end - source;
3740 return eol_type;
3741}
3742
4ed46869
KH
3743/* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
3744 is encoded. If it detects an appropriate format of end-of-line, it
3745 sets the information in *CODING. */
3746
3747void
3748detect_eol (coding, src, src_bytes)
3749 struct coding_system *coding;
3750 unsigned char *src;
3751 int src_bytes;
3752{
4608c386 3753 Lisp_Object val;
d46c5b12 3754 int skip;
fa42c37f
KH
3755 int eol_type;
3756
3757 switch (coding->category_idx)
3758 {
3759 case CODING_CATEGORY_IDX_UTF_16_BE:
3760 eol_type = detect_eol_type_in_2_octet_form (src, src_bytes, &skip, 1);
3761 break;
3762 case CODING_CATEGORY_IDX_UTF_16_LE:
3763 eol_type = detect_eol_type_in_2_octet_form (src, src_bytes, &skip, 0);
3764 break;
3765 default:
3766 eol_type = detect_eol_type (src, src_bytes, &skip);
3767 break;
3768 }
d46c5b12
KH
3769
3770 if (coding->heading_ascii > skip)
3771 coding->heading_ascii = skip;
3772 else
3773 skip = coding->heading_ascii;
4ed46869 3774
0ef69138 3775 if (eol_type == CODING_EOL_UNDECIDED)
4ed46869 3776 return;
27901516
KH
3777 if (eol_type == CODING_EOL_INCONSISTENT)
3778 {
3779#if 0
3780 /* This code is suppressed until we find a better way to
992f23f2 3781 distinguish raw text file and binary file. */
27901516
KH
3782
3783 /* If we have already detected that the coding is raw-text, the
3784 coding should actually be no-conversion. */
3785 if (coding->type == coding_type_raw_text)
3786 {
3787 setup_coding_system (Qno_conversion, coding);
3788 return;
3789 }
3790 /* Else, let's decode only text code anyway. */
3791#endif /* 0 */
1b2af4b0 3792 eol_type = CODING_EOL_LF;
27901516
KH
3793 }
3794
4608c386 3795 val = Fget (coding->symbol, Qeol_type);
4ed46869 3796 if (VECTORP (val) && XVECTOR (val)->size == 3)
d46c5b12 3797 {
b73bfc1c
KH
3798 int src_multibyte = coding->src_multibyte;
3799 int dst_multibyte = coding->dst_multibyte;
3800
d46c5b12 3801 setup_coding_system (XVECTOR (val)->contents[eol_type], coding);
b73bfc1c
KH
3802 coding->src_multibyte = src_multibyte;
3803 coding->dst_multibyte = dst_multibyte;
d46c5b12
KH
3804 coding->heading_ascii = skip;
3805 }
3806}
3807
/* Number of bytes added on top of the computed size of a conversion
   buffer.  */
#define CONVERSION_BUFFER_EXTRA_ROOM 256

/* Factor by which the source size is multiplied to size a decoding
   buffer for CODING (a pointer to struct coding_system): 3 for
   ISO2022, the CCL decoder's own buf_magnification for CCL, and 2
   for everything else.  CODING is parenthesized so that any pointer
   expression, such as `&coding', can be passed.  */
#define DECODING_BUFFER_MAG(coding)                             \
  ((coding)->type == coding_type_iso2022                        \
   ? 3                                                          \
   : ((coding)->type == coding_type_ccl                         \
      ? (coding)->spec.ccl.decoder.buf_magnification            \
      : 2))
d46c5b12
KH
3816
3817/* Return maximum size (bytes) of a buffer enough for decoding
3818 SRC_BYTES of text encoded in CODING. */
3819
3820int
3821decoding_buffer_size (coding, src_bytes)
3822 struct coding_system *coding;
3823 int src_bytes;
3824{
3825 return (src_bytes * DECODING_BUFFER_MAG (coding)
3826 + CONVERSION_BUFFER_EXTRA_ROOM);
3827}
3828
3829/* Return maximum size (bytes) of a buffer enough for encoding
3830 SRC_BYTES of text to CODING. */
3831
3832int
3833encoding_buffer_size (coding, src_bytes)
3834 struct coding_system *coding;
3835 int src_bytes;
3836{
3837 int magnification;
3838
3839 if (coding->type == coding_type_ccl)
3840 magnification = coding->spec.ccl.encoder.buf_magnification;
b73bfc1c 3841 else if (CODING_REQUIRE_ENCODING (coding))
d46c5b12 3842 magnification = 3;
b73bfc1c
KH
3843 else
3844 magnification = 1;
d46c5b12
KH
3845
3846 return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
3847}
3848
#ifndef MINIMUM_CONVERSION_BUFFER_SIZE
#define MINIMUM_CONVERSION_BUFFER_SIZE 1024
#endif

/* The shared conversion buffer and its current size in bytes.  */
char *conversion_buffer;
int conversion_buffer_size;

/* Return a pointer to a buffer of at least SIZE bytes to be used for
   encoding or decoding.

   The shared buffer is reused when it is already big enough.
   Otherwise a new one is allocated with xmalloc and the old one is
   released with xfree; the old contents are not copied.  The new
   size is the old size doubled until it reaches SIZE.

   This function makes no out-of-memory check of its own.
   NOTE(review): an earlier version of this comment promised a NULL
   return on memory exhaustion; whether that can happen depends on
   xmalloc -- confirm.  */

char *
get_conversion_buffer (size)
     int size;
{
  if (size > conversion_buffer_size)
    {
      char *buf;
      int real_size = conversion_buffer_size * 2;

      /* Doubling zero never reaches SIZE, so start from the minimum
         size when the buffer has not been given a size yet.  */
      if (real_size <= 0)
        real_size = MINIMUM_CONVERSION_BUFFER_SIZE;

      /* NOTE(review): REAL_SIZE is an int and can overflow here when
         SIZE is above half of the largest int.  */
      while (real_size < size) real_size *= 2;
      buf = (char *) xmalloc (real_size);
      xfree (conversion_buffer);
      conversion_buffer = buf;
      conversion_buffer_size = real_size;
    }
  return conversion_buffer;
}
3877
3878int
3879ccl_coding_driver (coding, source, destination, src_bytes, dst_bytes, encodep)
3880 struct coding_system *coding;
3881 unsigned char *source, *destination;
3882 int src_bytes, dst_bytes, encodep;
3883{
3884 struct ccl_program *ccl
3885 = encodep ? &coding->spec.ccl.encoder : &coding->spec.ccl.decoder;
3886 int result;
3887
ae9ff118 3888 ccl->last_block = coding->mode & CODING_MODE_LAST_BLOCK;
7b179c2d 3889
d46c5b12
KH
3890 coding->produced = ccl_driver (ccl, source, destination,
3891 src_bytes, dst_bytes, &(coding->consumed));
b73bfc1c
KH
3892 if (encodep)
3893 coding->produced_char = coding->produced;
3894 else
3895 {
3896 int bytes
3897 = dst_bytes ? dst_bytes : source + coding->consumed - destination;
3898 coding->produced = str_as_multibyte (destination, bytes,
3899 coding->produced,
3900 &(coding->produced_char));
3901 }
69f76525 3902
d46c5b12
KH
3903 switch (ccl->status)
3904 {
3905 case CCL_STAT_SUSPEND_BY_SRC:
3906 result = CODING_FINISH_INSUFFICIENT_SRC;
3907 break;
3908 case CCL_STAT_SUSPEND_BY_DST:
3909 result = CODING_FINISH_INSUFFICIENT_DST;
3910 break;
9864ebce
KH
3911 case CCL_STAT_QUIT:
3912 case CCL_STAT_INVALID_CMD:
3913 result = CODING_FINISH_INTERRUPT;
3914 break;
d46c5b12
KH
3915 default:
3916 result = CODING_FINISH_NORMAL;
3917 break;
3918 }
3919 return result;
4ed46869
KH
3920}
3921
3922/* See "GENERAL NOTES about `decode_coding_XXX ()' functions". Before
3923 decoding, it may detect coding system and format of end-of-line if
b73bfc1c
KH
3924 those are not yet decided. The source should be unibyte, the
3925 result is multibyte if CODING->dst_multibyte is nonzero, else
3926 unibyte. */
4ed46869
KH
3927
3928int
d46c5b12 3929decode_coding (coding, source, destination, src_bytes, dst_bytes)
4ed46869
KH
3930 struct coding_system *coding;
3931 unsigned char *source, *destination;
3932 int src_bytes, dst_bytes;
4ed46869 3933{
0ef69138 3934 if (coding->type == coding_type_undecided)
4ed46869
KH
3935 detect_coding (coding, source, src_bytes);
3936
0ef69138 3937 if (coding->eol_type == CODING_EOL_UNDECIDED)
4ed46869
KH
3938 detect_eol (coding, source, src_bytes);
3939
b73bfc1c
KH
3940 coding->produced = coding->produced_char = 0;
3941 coding->consumed = coding->consumed_char = 0;
3942 coding->errors = 0;
3943 coding->result = CODING_FINISH_NORMAL;
3944
4ed46869
KH
3945 switch (coding->type)
3946 {
4ed46869 3947 case coding_type_sjis:
b73bfc1c
KH
3948 decode_coding_sjis_big5 (coding, source, destination,
3949 src_bytes, dst_bytes, 1);
4ed46869
KH
3950 break;
3951
3952 case coding_type_iso2022:
b73bfc1c
KH
3953 decode_coding_iso2022 (coding, source, destination,
3954 src_bytes, dst_bytes);
4ed46869
KH
3955 break;
3956
3957 case coding_type_big5:
b73bfc1c
KH
3958 decode_coding_sjis_big5 (coding, source, destination,
3959 src_bytes, dst_bytes, 0);
3960 break;
3961
3962 case coding_type_emacs_mule:
3963 decode_coding_emacs_mule (coding, source, destination,
3964 src_bytes, dst_bytes);
4ed46869
KH
3965 break;
3966
3967 case coding_type_ccl:
b73bfc1c
KH
3968 ccl_coding_driver (coding, source, destination,
3969 src_bytes, dst_bytes, 0);
d46c5b12
KH
3970 break;
3971
b73bfc1c
KH
3972 default:
3973 decode_eol (coding, source, destination, src_bytes, dst_bytes);
3974 }
3975
3976 if (coding->result == CODING_FINISH_INSUFFICIENT_SRC
3977 && coding->consumed == src_bytes)
3978 coding->result = CODING_FINISH_NORMAL;
3979
3980 if (coding->mode & CODING_MODE_LAST_BLOCK
3981 && coding->result == CODING_FINISH_INSUFFICIENT_SRC)
3982 {
3983 unsigned char *src = source + coding->consumed;
3984 unsigned char *dst = destination + coding->produced;
3985
3986 src_bytes -= coding->consumed;
3987 coding->errors++;
3988 if (COMPOSING_P (coding))
3989 DECODE_COMPOSITION_END ('1');
3990 while (src_bytes--)
d46c5b12 3991 {
b73bfc1c
KH
3992 int c = *src++;
3993 dst += CHAR_STRING (c, dst);
3994 coding->produced_char++;
d46c5b12 3995 }
b73bfc1c
KH
3996 coding->consumed = coding->consumed_char = src - source;
3997 coding->produced = dst - destination;
4ed46869
KH
3998 }
3999
b73bfc1c
KH
4000 if (!coding->dst_multibyte)
4001 {
4002 coding->produced = str_as_unibyte (destination, coding->produced);
4003 coding->produced_char = coding->produced;
4004 }
4ed46869 4005
b73bfc1c
KH
4006 return coding->result;
4007}
52d41803 4008
b73bfc1c
KH
4009/* See "GENERAL NOTES about `encode_coding_XXX ()' functions". The
4010 multibyteness of the source is CODING->src_multibyte, the
4011 multibyteness of the result is always unibyte. */
4ed46869
KH
4012
4013int
d46c5b12 4014encode_coding (coding, source, destination, src_bytes, dst_bytes)
4ed46869
KH
4015 struct coding_system *coding;
4016 unsigned char *source, *destination;
4017 int src_bytes, dst_bytes;
4ed46869 4018{
b73bfc1c
KH
4019 coding->produced = coding->produced_char = 0;
4020 coding->consumed = coding->consumed_char = 0;
4021 coding->errors = 0;
4022 coding->result = CODING_FINISH_NORMAL;
4ed46869 4023
d46c5b12
KH
4024 switch (coding->type)
4025 {
4ed46869 4026 case coding_type_sjis:
b73bfc1c
KH
4027 encode_coding_sjis_big5 (coding, source, destination,
4028 src_bytes, dst_bytes, 1);
4ed46869
KH
4029 break;
4030
4031 case coding_type_iso2022:
b73bfc1c
KH
4032 encode_coding_iso2022 (coding, source, destination,
4033 src_bytes, dst_bytes);
4ed46869
KH
4034 break;
4035
4036 case coding_type_big5:
b73bfc1c
KH
4037 encode_coding_sjis_big5 (coding, source, destination,
4038 src_bytes, dst_bytes, 0);
4039 break;
4040
4041 case coding_type_emacs_mule:
4042 encode_coding_emacs_mule (coding, source, destination,
4043 src_bytes, dst_bytes);
4ed46869
KH
4044 break;
4045
4046 case coding_type_ccl:
b73bfc1c
KH
4047 ccl_coding_driver (coding, source, destination,
4048 src_bytes, dst_bytes, 1);
d46c5b12
KH
4049 break;
4050
b73bfc1c
KH
4051 default:
4052 encode_eol (coding, source, destination, src_bytes, dst_bytes);
4053 }
4054
4055 if (coding->result == CODING_FINISH_INSUFFICIENT_SRC
4056 && coding->consumed == src_bytes)
4057 coding->result = CODING_FINISH_NORMAL;
4058
4059 if (coding->mode & CODING_MODE_LAST_BLOCK)
4060 {
4061 unsigned char *src = source + coding->consumed;
4062 unsigned char *src_end = src + src_bytes;
4063 unsigned char *dst = destination + coding->produced;
4064
4065 if (coding->type == coding_type_iso2022)
4066 ENCODE_RESET_PLANE_AND_REGISTER;
4067 if (COMPOSING_P (coding))
4068 *dst++ = ISO_CODE_ESC, *dst++ = '1';
4069 if (coding->consumed < src_bytes)
d46c5b12 4070 {
b73bfc1c
KH
4071 int len = src_bytes - coding->consumed;
4072
4073 BCOPY_SHORT (source + coding->consumed, dst, len);
4074 if (coding->src_multibyte)
4075 len = str_as_unibyte (dst, len);
4076 dst += len;
4077 coding->consumed = src_bytes;
d46c5b12 4078 }
b73bfc1c 4079 coding->produced = coding->produced_char = dst - destination;
4ed46869
KH
4080 }
4081
b73bfc1c 4082 return coding->result;
4ed46869
KH
4083}
4084
fb88bf2d
KH
4085/* Scan text in the region between *BEG and *END (byte positions),
4086 skip characters which we don't have to decode by coding system
4087 CODING at the head and tail, then set *BEG and *END to the region
4088 of the text we actually have to convert. The caller should move
b73bfc1c
KH
4089 the gap out of the region in advance if the region is from a
4090 buffer.
4ed46869 4091
d46c5b12
KH
4092 If STR is not NULL, *BEG and *END are indices into STR. */
4093
4094static void
4095shrink_decoding_region (beg, end, coding, str)
4096 int *beg, *end;
4097 struct coding_system *coding;
4098 unsigned char *str;
4099{
fb88bf2d 4100 unsigned char *begp_orig, *begp, *endp_orig, *endp, c;
d46c5b12 4101 int eol_conversion;
88993dfd 4102 Lisp_Object translation_table;
d46c5b12
KH
4103
4104 if (coding->type == coding_type_ccl
4105 || coding->type == coding_type_undecided
b73bfc1c
KH
4106 || coding->eol_type != CODING_EOL_LF
4107 || !NILP (coding->post_read_conversion)
4108 || coding->composing != COMPOSITION_DISABLED)
d46c5b12
KH
4109 {
4110 /* We can't skip any data. */
4111 return;
4112 }
b73bfc1c
KH
4113 if (coding->type == coding_type_no_conversion
4114 || coding->type == coding_type_raw_text
4115 || coding->type == coding_type_emacs_mule)
d46c5b12 4116 {
fb88bf2d
KH
4117 /* We need no conversion, but don't have to skip any data here.
4118 Decoding routine handles them effectively anyway. */
d46c5b12
KH
4119 return;
4120 }
4121
88993dfd
KH
4122 translation_table = coding->translation_table_for_decode;
4123 if (NILP (translation_table) && !NILP (Venable_character_translation))
4124 translation_table = Vstandard_translation_table_for_decode;
4125 if (CHAR_TABLE_P (translation_table))
4126 {
4127 int i;
4128 for (i = 0; i < 128; i++)
4129 if (!NILP (CHAR_TABLE_REF (translation_table, i)))
4130 break;
4131 if (i < 128)
fa46990e 4132 /* Some ASCII character should be translated. We give up
88993dfd
KH
4133 shrinking. */
4134 return;
4135 }
4136
b73bfc1c 4137 if (coding->heading_ascii >= 0)
d46c5b12
KH
4138 /* Detection routine has already found how much we can skip at the
4139 head. */
4140 *beg += coding->heading_ascii;
4141
4142 if (str)
4143 {
4144 begp_orig = begp = str + *beg;
4145 endp_orig = endp = str + *end;
4146 }
4147 else
4148 {
fb88bf2d 4149 begp_orig = begp = BYTE_POS_ADDR (*beg);
d46c5b12
KH
4150 endp_orig = endp = begp + *end - *beg;
4151 }
4152
fa46990e
DL
4153 eol_conversion = (coding->eol_type == CODING_EOL_CR
4154 || coding->eol_type == CODING_EOL_CRLF);
4155
d46c5b12
KH
4156 switch (coding->type)
4157 {
d46c5b12
KH
4158 case coding_type_sjis:
4159 case coding_type_big5:
4160 /* We can skip all ASCII characters at the head. */
4161 if (coding->heading_ascii < 0)
4162 {
4163 if (eol_conversion)
de9d083c 4164 while (begp < endp && *begp < 0x80 && *begp != '\r') begp++;
d46c5b12
KH
4165 else
4166 while (begp < endp && *begp < 0x80) begp++;
4167 }
4168 /* We can skip all ASCII characters at the tail except for the
4169 second byte of SJIS or BIG5 code. */
4170 if (eol_conversion)
de9d083c 4171 while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\r') endp--;
d46c5b12
KH
4172 else
4173 while (begp < endp && endp[-1] < 0x80) endp--;
ee59c65f
RS
4174 /* Do not consider LF as ascii if preceded by CR, since that
4175 confuses eol decoding. */
4176 if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
4177 endp++;
d46c5b12
KH
4178 if (begp < endp && endp < endp_orig && endp[-1] >= 0x80)
4179 endp++;
4180 break;
4181
b73bfc1c 4182 case coding_type_iso2022:
622fece5
KH
4183 if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
4184 /* We can't skip any data. */
4185 break;
d46c5b12
KH
4186 if (coding->heading_ascii < 0)
4187 {
d46c5b12
KH
4188 /* We can skip all ASCII characters at the head except for a
4189 few control codes. */
4190 while (begp < endp && (c = *begp) < 0x80
4191 && c != ISO_CODE_CR && c != ISO_CODE_SO
4192 && c != ISO_CODE_SI && c != ISO_CODE_ESC
4193 && (!eol_conversion || c != ISO_CODE_LF))
4194 begp++;
4195 }
4196 switch (coding->category_idx)
4197 {
4198 case CODING_CATEGORY_IDX_ISO_8_1:
4199 case CODING_CATEGORY_IDX_ISO_8_2:
4200 /* We can skip all ASCII characters at the tail. */
4201 if (eol_conversion)
de9d083c 4202 while (begp < endp && (c = endp[-1]) < 0x80 && c != '\r') endp--;
d46c5b12
KH
4203 else
4204 while (begp < endp && endp[-1] < 0x80) endp--;
ee59c65f
RS
4205 /* Do not consider LF as ascii if preceded by CR, since that
4206 confuses eol decoding. */
4207 if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
4208 endp++;
d46c5b12
KH
4209 break;
4210
4211 case CODING_CATEGORY_IDX_ISO_7:
4212 case CODING_CATEGORY_IDX_ISO_7_TIGHT:
de79a6a5
KH
4213 {
4214 /* We can skip all charactes at the tail except for 8-bit
4215 codes and ESC and the following 2-byte at the tail. */
4216 unsigned char *eight_bit = NULL;
4217
4218 if (eol_conversion)
4219 while (begp < endp
4220 && (c = endp[-1]) != ISO_CODE_ESC && c != '\r')
4221 {
4222 if (!eight_bit && c & 0x80) eight_bit = endp;
4223 endp--;
4224 }
4225 else
4226 while (begp < endp
4227 && (c = endp[-1]) != ISO_CODE_ESC)
4228 {
4229 if (!eight_bit && c & 0x80) eight_bit = endp;
4230 endp--;
4231 }
4232 /* Do not consider LF as ascii if preceded by CR, since that
4233 confuses eol decoding. */
4234 if (begp < endp && endp < endp_orig
4235 && endp[-1] == '\r' && endp[0] == '\n')
4236 endp++;
4237 if (begp < endp && endp[-1] == ISO_CODE_ESC)
4238 {
4239 if (endp + 1 < endp_orig && end[0] == '(' && end[1] == 'B')
4240 /* This is an ASCII designation sequence. We can
4241 surely skip the tail. But, if we have
4242 encountered an 8-bit code, skip only the codes
4243 after that. */
4244 endp = eight_bit ? eight_bit : endp + 2;
4245 else
4246 /* Hmmm, we can't skip the tail. */
4247 endp = endp_orig;
4248 }
4249 else if (eight_bit)
4250 endp = eight_bit;
4251 }
d46c5b12 4252 }
b73bfc1c
KH
4253 break;
4254
4255 default:
4256 abort ();
d46c5b12
KH
4257 }
4258 *beg += begp - begp_orig;
4259 *end += endp - endp_orig;
4260 return;
4261}
4262
4263/* Like shrink_decoding_region but for encoding. */
4264
4265static void
4266shrink_encoding_region (beg, end, coding, str)
4267 int *beg, *end;
4268 struct coding_system *coding;
4269 unsigned char *str;
4270{
4271 unsigned char *begp_orig, *begp, *endp_orig, *endp;
4272 int eol_conversion;
88993dfd 4273 Lisp_Object translation_table;
d46c5b12 4274
b73bfc1c
KH
4275 if (coding->type == coding_type_ccl
4276 || coding->eol_type == CODING_EOL_CRLF
4277 || coding->eol_type == CODING_EOL_CR
4278 || coding->cmp_data && coding->cmp_data->used > 0)
d46c5b12 4279 {
b73bfc1c
KH
4280 /* We can't skip any data. */
4281 return;
4282 }
4283 if (coding->type == coding_type_no_conversion
4284 || coding->type == coding_type_raw_text
4285 || coding->type == coding_type_emacs_mule
4286 || coding->type == coding_type_undecided)
4287 {
4288 /* We need no conversion, but don't have to skip any data here.
4289 Encoding routine handles them effectively anyway. */
d46c5b12
KH
4290 return;
4291 }
4292
88993dfd
KH
4293 translation_table = coding->translation_table_for_encode;
4294 if (NILP (translation_table) && !NILP (Venable_character_translation))
4295 translation_table = Vstandard_translation_table_for_encode;
4296 if (CHAR_TABLE_P (translation_table))
4297 {
4298 int i;
4299 for (i = 0; i < 128; i++)
4300 if (!NILP (CHAR_TABLE_REF (translation_table, i)))
4301 break;
4302 if (i < 128)
4303 /* Some ASCII character should be tranlsated. We give up
4304 shrinking. */
4305 return;
4306 }
4307
d46c5b12
KH
4308 if (str)
4309 {
4310 begp_orig = begp = str + *beg;
4311 endp_orig = endp = str + *end;
4312 }
4313 else
4314 {
fb88bf2d 4315 begp_orig = begp = BYTE_POS_ADDR (*beg);
d46c5b12
KH
4316 endp_orig = endp = begp + *end - *beg;
4317 }
4318
4319 eol_conversion = (coding->eol_type == CODING_EOL_CR
4320 || coding->eol_type == CODING_EOL_CRLF);
4321
4322 /* Here, we don't have to check coding->pre_write_conversion because
4323 the caller is expected to have handled it already. */
4324 switch (coding->type)
4325 {
d46c5b12 4326 case coding_type_iso2022:
622fece5
KH
4327 if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
4328 /* We can't skip any data. */
4329 break;
d46c5b12
KH
4330 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL)
4331 {
4332 unsigned char *bol = begp;
4333 while (begp < endp && *begp < 0x80)
4334 {
4335 begp++;
4336 if (begp[-1] == '\n')
4337 bol = begp;
4338 }
4339 begp = bol;
4340 goto label_skip_tail;
4341 }
4342 /* fall down ... */
4343
b73bfc1c
KH
4344 case coding_type_sjis:
4345 case coding_type_big5:
d46c5b12
KH
4346 /* We can skip all ASCII characters at the head and tail. */
4347 if (eol_conversion)
4348 while (begp < endp && *begp < 0x80 && *begp != '\n') begp++;
4349 else
4350 while (begp < endp && *begp < 0x80) begp++;
4351 label_skip_tail:
4352 if (eol_conversion)
4353 while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\n') endp--;
4354 else
4355 while (begp < endp && *(endp - 1) < 0x80) endp--;
4356 break;
b73bfc1c
KH
4357
4358 default:
4359 abort ();
d46c5b12
KH
4360 }
4361
4362 *beg += begp - begp_orig;
4363 *end += endp - endp_orig;
4364 return;
4365}
4366
88993dfd
KH
/* Shrinking the conversion region has a cost of its own, so it is
   attempted only when the region is longer than this many bytes.  */
static int shrink_conversion_region_threshhold = 1024;

/* Shrink the region between *BEG and *END for coding system CODING,
   with shrink_encoding_region if ENCODEP is nonzero and with
   shrink_decoding_region otherwise.  STR is passed on to those
   functions.  Nothing is done when the region is not longer than
   shrink_conversion_region_threshhold.  */
#define SHRINK_CONVERSION_REGION(beg, end, coding, str, encodep)        \
  do {                                                                  \
    if (*(end) - *(beg) > shrink_conversion_region_threshhold)          \
      {                                                                 \
        if (encodep)                                                    \
          shrink_encoding_region (beg, end, coding, str);               \
        else                                                            \
          shrink_decoding_region (beg, end, coding, str);               \
      }                                                                 \
  } while (0)
4380
b843d1ae
KH
4381static Lisp_Object
4382code_convert_region_unwind (dummy)
4383 Lisp_Object dummy;
4384{
4385 inhibit_pre_post_conversion = 0;
4386 return Qnil;
4387}
4388
ec6d2bb8
KH
4389/* Store information about all compositions in the range FROM and TO
4390 of OBJ in memory blocks pointed by CODING->cmp_data. OBJ is a
4391 buffer or a string, defaults to the current buffer. */
4392
4393void
4394coding_save_composition (coding, from, to, obj)
4395 struct coding_system *coding;
4396 int from, to;
4397 Lisp_Object obj;
4398{
4399 Lisp_Object prop;
4400 int start, end;
4401
91bee881
KH
4402 if (coding->composing == COMPOSITION_DISABLED)
4403 return;
4404 if (!coding->cmp_data)
4405 coding_allocate_composition_data (coding, from);
ec6d2bb8
KH
4406 if (!find_composition (from, to, &start, &end, &prop, obj)
4407 || end > to)
4408 return;
4409 if (start < from
4410 && (!find_composition (end, to, &start, &end, &prop, obj)
4411 || end > to))
4412 return;
4413 coding->composing = COMPOSITION_NO;
ec6d2bb8
KH
4414 do
4415 {
4416 if (COMPOSITION_VALID_P (start, end, prop))
4417 {
4418 enum composition_method method = COMPOSITION_METHOD (prop);
4419 if (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH
4420 >= COMPOSITION_DATA_SIZE)
4421 coding_allocate_composition_data (coding, from);
4422 /* For relative composition, we remember start and end
4423 positions, for the other compositions, we also remember
4424 components. */
4425 CODING_ADD_COMPOSITION_START (coding, start - from, method);
4426 if (method != COMPOSITION_RELATIVE)
4427 {
4428 /* We must store a*/
4429 Lisp_Object val, ch;
4430
4431 val = COMPOSITION_COMPONENTS (prop);
4432 if (CONSP (val))
4433 while (CONSP (val))
4434 {
4435 ch = XCAR (val), val = XCDR (val);
4436 CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (ch));
4437 }
4438 else if (VECTORP (val) || STRINGP (val))
4439 {
4440 int len = (VECTORP (val)
4441 ? XVECTOR (val)->size : XSTRING (val)->size);
4442 int i;
4443 for (i = 0; i < len; i++)
4444 {
4445 ch = (STRINGP (val)
4446 ? Faref (val, make_number (i))
4447 : XVECTOR (val)->contents[i]);
4448 CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (ch));
4449 }
4450 }
4451 else /* INTEGERP (val) */
4452 CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (val));
4453 }
4454 CODING_ADD_COMPOSITION_END (coding, end - from);
4455 }
4456 start = end;
4457 }
4458 while (start < to
4459 && find_composition (start, to, &start, &end, &prop, obj)
4460 && end <= to);
4461
4462 /* Make coding->cmp_data point to the first memory block. */
4463 while (coding->cmp_data->prev)
4464 coding->cmp_data = coding->cmp_data->prev;
4465 coding->cmp_data_start = 0;
4466}
4467
4468/* Reflect the saved information about compositions to OBJ.
4469 CODING->cmp_data points to a memory block for the informaiton. OBJ
4470 is a buffer or a string, defaults to the current buffer. */
4471
33fb63eb 4472void
ec6d2bb8
KH
4473coding_restore_composition (coding, obj)
4474 struct coding_system *coding;
4475 Lisp_Object obj;
4476{
4477 struct composition_data *cmp_data = coding->cmp_data;
4478
4479 if (!cmp_data)
4480 return;
4481
4482 while (cmp_data->prev)
4483 cmp_data = cmp_data->prev;
4484
4485 while (cmp_data)
4486 {
4487 int i;
4488
4489 for (i = 0; i < cmp_data->used; i += cmp_data->data[i])
4490 {
4491 int *data = cmp_data->data + i;
4492 enum composition_method method = (enum composition_method) data[3];
4493 Lisp_Object components;
4494
4495 if (method == COMPOSITION_RELATIVE)
4496 components = Qnil;
4497 else
4498 {
4499 int len = data[0] - 4, j;
4500 Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
4501
4502 for (j = 0; j < len; j++)
4503 args[j] = make_number (data[4 + j]);
4504 components = (method == COMPOSITION_WITH_ALTCHARS
4505 ? Fstring (len, args) : Fvector (len, args));
4506 }
4507 compose_text (data[1], data[2], components, Qnil, obj);
4508 }
4509 cmp_data = cmp_data->next;
4510 }
4511}
4512
d46c5b12 4513/* Decode (if ENCODEP is zero) or encode (if ENCODEP is nonzero) the
fb88bf2d
KH
4514 text from FROM to TO (byte positions are FROM_BYTE and TO_BYTE) by
4515 coding system CODING, and return the status code of code conversion
4516 (currently, this value has no meaning).
4517
4518 How many characters (and bytes) are converted to how many
4519 characters (and bytes) are recorded in members of the structure
4520 CODING.
d46c5b12 4521
6e44253b 4522 If REPLACE is nonzero, we do various things as if the original text
d46c5b12 4523 is deleted and a new text is inserted. See the comments in
b73bfc1c
KH
4524 replace_range (insdel.c) to know what we are doing.
4525
4526 If REPLACE is zero, it is assumed that the source text is unibyte.
4527 Otherwize, it is assumed that the source text is multibyte. */
4ed46869
KH
4528
4529int
6e44253b
KH
4530code_convert_region (from, from_byte, to, to_byte, coding, encodep, replace)
4531 int from, from_byte, to, to_byte, encodep, replace;
4ed46869 4532 struct coding_system *coding;
4ed46869 4533{
fb88bf2d
KH
4534 int len = to - from, len_byte = to_byte - from_byte;
4535 int require, inserted, inserted_byte;
4b39528c 4536 int head_skip, tail_skip, total_skip = 0;
84d60297 4537 Lisp_Object saved_coding_symbol;
fb88bf2d 4538 int first = 1;
fb88bf2d 4539 unsigned char *src, *dst;
84d60297 4540 Lisp_Object deletion;
e133c8fa 4541 int orig_point = PT, orig_len = len;
6abb9bd9 4542 int prev_Z;
b73bfc1c
KH
4543 int multibyte_p = !NILP (current_buffer->enable_multibyte_characters);
4544
4545 coding->src_multibyte = replace && multibyte_p;
4546 coding->dst_multibyte = multibyte_p;
84d60297
RS
4547
4548 deletion = Qnil;
4549 saved_coding_symbol = Qnil;
d46c5b12 4550
83fa074f 4551 if (from < PT && PT < to)
e133c8fa
KH
4552 {
4553 TEMP_SET_PT_BOTH (from, from_byte);
4554 orig_point = from;
4555 }
83fa074f 4556
6e44253b 4557 if (replace)
d46c5b12 4558 {
fb88bf2d
KH
4559 int saved_from = from;
4560
d46c5b12 4561 prepare_to_modify_buffer (from, to, &from);
fb88bf2d
KH
4562 if (saved_from != from)
4563 {
4564 to = from + len;
b73bfc1c 4565 from_byte = CHAR_TO_BYTE (from), to_byte = CHAR_TO_BYTE (to);
fb88bf2d
KH
4566 len_byte = to_byte - from_byte;
4567 }
d46c5b12 4568 }
d46c5b12
KH
4569
4570 if (! encodep && CODING_REQUIRE_DETECTION (coding))
4571 {
12410ef1 4572 /* We must detect encoding of text and eol format. */
d46c5b12
KH
4573
4574 if (from < GPT && to > GPT)
4575 move_gap_both (from, from_byte);
4576 if (coding->type == coding_type_undecided)
4577 {
fb88bf2d 4578 detect_coding (coding, BYTE_POS_ADDR (from_byte), len_byte);
d46c5b12 4579 if (coding->type == coding_type_undecided)
12410ef1
KH
4580 /* It seems that the text contains only ASCII, but we
4581 should not left it undecided because the deeper
4582 decoding routine (decode_coding) tries to detect the
4583 encodings again in vain. */
d46c5b12
KH
4584 coding->type = coding_type_emacs_mule;
4585 }
4586 if (coding->eol_type == CODING_EOL_UNDECIDED)
4587 {
4588 saved_coding_symbol = coding->symbol;
4589 detect_eol (coding, BYTE_POS_ADDR (from_byte), len_byte);
4590 if (coding->eol_type == CODING_EOL_UNDECIDED)
4591 coding->eol_type = CODING_EOL_LF;
4592 /* We had better recover the original eol format if we
4593 encounter an inconsitent eol format while decoding. */
4594 coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4595 }
4596 }
4597
d46c5b12
KH
4598 /* Now we convert the text. */
4599
4600 /* For encoding, we must process pre-write-conversion in advance. */
b73bfc1c
KH
4601 if (! inhibit_pre_post_conversion
4602 && encodep
d46c5b12
KH
4603 && SYMBOLP (coding->pre_write_conversion)
4604 && ! NILP (Ffboundp (coding->pre_write_conversion)))
4605 {
2b4f9037
KH
4606 /* The function in pre-write-conversion may put a new text in a
4607 new buffer. */
0007bdd0
KH
4608 struct buffer *prev = current_buffer;
4609 Lisp_Object new;
b843d1ae 4610 int count = specpdl_ptr - specpdl;
d46c5b12 4611
b843d1ae
KH
4612 record_unwind_protect (code_convert_region_unwind, Qnil);
4613 /* We should not call any more pre-write/post-read-conversion
4614 functions while this pre-write-conversion is running. */
4615 inhibit_pre_post_conversion = 1;
b39f748c
AS
4616 call2 (coding->pre_write_conversion,
4617 make_number (from), make_number (to));
b843d1ae
KH
4618 inhibit_pre_post_conversion = 0;
4619 /* Discard the unwind protect. */
4620 specpdl_ptr--;
4621
d46c5b12
KH
4622 if (current_buffer != prev)
4623 {
4624 len = ZV - BEGV;
0007bdd0 4625 new = Fcurrent_buffer ();
d46c5b12 4626 set_buffer_internal_1 (prev);
7dae4502 4627 del_range_2 (from, from_byte, to, to_byte, 0);
e133c8fa 4628 TEMP_SET_PT_BOTH (from, from_byte);
0007bdd0
KH
4629 insert_from_buffer (XBUFFER (new), 1, len, 0);
4630 Fkill_buffer (new);
e133c8fa
KH
4631 if (orig_point >= to)
4632 orig_point += len - orig_len;
4633 else if (orig_point > from)
4634 orig_point = from;
4635 orig_len = len;
d46c5b12 4636 to = from + len;
b73bfc1c
KH
4637 from_byte = CHAR_TO_BYTE (from);
4638 to_byte = CHAR_TO_BYTE (to);
d46c5b12 4639 len_byte = to_byte - from_byte;
e133c8fa 4640 TEMP_SET_PT_BOTH (from, from_byte);
d46c5b12
KH
4641 }
4642 }
4643
12410ef1
KH
4644 if (replace)
4645 deletion = make_buffer_string_both (from, from_byte, to, to_byte, 1);
4646
ec6d2bb8
KH
4647 if (coding->composing != COMPOSITION_DISABLED)
4648 {
4649 if (encodep)
4650 coding_save_composition (coding, from, to, Fcurrent_buffer ());
4651 else
4652 coding_allocate_composition_data (coding, from);
4653 }
fb88bf2d 4654
b73bfc1c
KH
4655 /* Try to skip the heading and tailing ASCIIs. */
4656 {
4657 int from_byte_orig = from_byte, to_byte_orig = to_byte;
4658
4659 if (from < GPT && GPT < to)
4660 move_gap_both (from, from_byte);
4661 SHRINK_CONVERSION_REGION (&from_byte, &to_byte, coding, NULL, encodep);
4662 if (from_byte == to_byte
4663 && (encodep || NILP (coding->post_read_conversion))
4664 && ! CODING_REQUIRE_FLUSHING (coding))
4665 {
4666 coding->produced = len_byte;
4667 coding->produced_char = len;
4668 if (!replace)
4669 /* We must record and adjust for this new text now. */
4670 adjust_after_insert (from, from_byte_orig, to, to_byte_orig, len);
4671 return 0;
4672 }
ec6d2bb8 4673
b73bfc1c
KH
4674 head_skip = from_byte - from_byte_orig;
4675 tail_skip = to_byte_orig - to_byte;
4676 total_skip = head_skip + tail_skip;
4677 from += head_skip;
4678 to -= tail_skip;
4679 len -= total_skip; len_byte -= total_skip;
4680 }
d46c5b12 4681
88993dfd 4682 /* The code conversion routine can not preserve text properties for
55d8d769
KH
4683 now. So, we must remove all text properties in the region.
4684 Here, we must suppress all modification hooks. */
88993dfd 4685 if (replace)
55d8d769
KH
4686 {
4687 int saved_inhibit_modification_hooks = inhibit_modification_hooks;
4688 inhibit_modification_hooks = 1;
4689 Fset_text_properties (make_number (from), make_number (to), Qnil, Qnil);
4690 inhibit_modification_hooks = saved_inhibit_modification_hooks;
4691 }
88993dfd 4692
fb88bf2d
KH
4693 /* For converion, we must put the gap before the text in addition to
4694 making the gap larger for efficient decoding. The required gap
4695 size starts from 2000 which is the magic number used in make_gap.
4696 But, after one batch of conversion, it will be incremented if we
4697 find that it is not enough . */
d46c5b12
KH
4698 require = 2000;
4699
4700 if (GAP_SIZE < require)
4701 make_gap (require - GAP_SIZE);
4702 move_gap_both (from, from_byte);
4703
d46c5b12 4704 inserted = inserted_byte = 0;
fb88bf2d
KH
4705
4706 GAP_SIZE += len_byte;
4707 ZV -= len;
4708 Z -= len;
4709 ZV_BYTE -= len_byte;
4710 Z_BYTE -= len_byte;
4711
d9f9a1bc
GM
4712 if (GPT - BEG < BEG_UNCHANGED)
4713 BEG_UNCHANGED = GPT - BEG;
4714 if (Z - GPT < END_UNCHANGED)
4715 END_UNCHANGED = Z - GPT;
f2558efd 4716
b73bfc1c
KH
4717 if (!encodep && coding->src_multibyte)
4718 {
4719 /* Decoding routines expects that the source text is unibyte.
4720 We must convert 8-bit characters of multibyte form to
4721 unibyte. */
4722 int len_byte_orig = len_byte;
4723 len_byte = str_as_unibyte (GAP_END_ADDR - len_byte, len_byte);
4724 if (len_byte < len_byte_orig)
4725 safe_bcopy (GAP_END_ADDR - len_byte_orig, GAP_END_ADDR - len_byte,
4726 len_byte);
4727 coding->src_multibyte = 0;
4728 }
4729
d46c5b12
KH
4730 for (;;)
4731 {
fb88bf2d 4732 int result;
d46c5b12 4733
ec6d2bb8 4734 /* The buffer memory is now:
b73bfc1c
KH
4735 +--------+converted-text+---------+-------original-text-------+---+
4736 |<-from->|<--inserted-->|---------|<--------len_byte--------->|---|
4737 |<---------------------- GAP ----------------------->| */
ec6d2bb8
KH
4738 src = GAP_END_ADDR - len_byte;
4739 dst = GPT_ADDR + inserted_byte;
4740
d46c5b12 4741 if (encodep)
fb88bf2d 4742 result = encode_coding (coding, src, dst, len_byte, 0);
d46c5b12 4743 else
fb88bf2d 4744 result = decode_coding (coding, src, dst, len_byte, 0);
ec6d2bb8
KH
4745
4746 /* The buffer memory is now:
b73bfc1c
KH
4747 +--------+-------converted-text----+--+------original-text----+---+
4748 |<-from->|<-inserted->|<-produced->|--|<-(len_byte-consumed)->|---|
4749 |<---------------------- GAP ----------------------->| */
ec6d2bb8 4750
d46c5b12
KH
4751 inserted += coding->produced_char;
4752 inserted_byte += coding->produced;
d46c5b12 4753 len_byte -= coding->consumed;
ec6d2bb8
KH
4754
4755 if (result == CODING_FINISH_INSUFFICIENT_CMP)
4756 {
4757 coding_allocate_composition_data (coding, from + inserted);
4758 continue;
4759 }
4760
fb88bf2d 4761 src += coding->consumed;
3636f7a3 4762 dst += coding->produced;
d46c5b12 4763
9864ebce
KH
4764 if (result == CODING_FINISH_NORMAL)
4765 {
4766 src += len_byte;
4767 break;
4768 }
d46c5b12
KH
4769 if (! encodep && result == CODING_FINISH_INCONSISTENT_EOL)
4770 {
fb88bf2d 4771 unsigned char *pend = dst, *p = pend - inserted_byte;
38edf7d4 4772 Lisp_Object eol_type;
d46c5b12
KH
4773
4774 /* Encode LFs back to the original eol format (CR or CRLF). */
4775 if (coding->eol_type == CODING_EOL_CR)
4776 {
4777 while (p < pend) if (*p++ == '\n') p[-1] = '\r';
4778 }
4779 else
4780 {
d46c5b12
KH
4781 int count = 0;
4782
fb88bf2d
KH
4783 while (p < pend) if (*p++ == '\n') count++;
4784 if (src - dst < count)
d46c5b12 4785 {
38edf7d4 4786 /* We don't have sufficient room for encoding LFs
fb88bf2d
KH
4787 back to CRLF. We must record converted and
4788 not-yet-converted text back to the buffer
4789 content, enlarge the gap, then record them out of
4790 the buffer contents again. */
4791 int add = len_byte + inserted_byte;
4792
4793 GAP_SIZE -= add;
4794 ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
4795 GPT += inserted_byte; GPT_BYTE += inserted_byte;
4796 make_gap (count - GAP_SIZE);
4797 GAP_SIZE += add;
4798 ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
4799 GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
4800 /* Don't forget to update SRC, DST, and PEND. */
4801 src = GAP_END_ADDR - len_byte;
4802 dst = GPT_ADDR + inserted_byte;
4803 pend = dst;
d46c5b12 4804 }
d46c5b12
KH
4805 inserted += count;
4806 inserted_byte += count;
fb88bf2d
KH
4807 coding->produced += count;
4808 p = dst = pend + count;
4809 while (count)
4810 {
4811 *--p = *--pend;
4812 if (*p == '\n') count--, *--p = '\r';
4813 }
d46c5b12
KH
4814 }
4815
4816 /* Suppress eol-format conversion in the further conversion. */
4817 coding->eol_type = CODING_EOL_LF;
4818
38edf7d4
KH
4819 /* Set the coding system symbol to that for Unix-like EOL. */
4820 eol_type = Fget (saved_coding_symbol, Qeol_type);
4821 if (VECTORP (eol_type)
4822 && XVECTOR (eol_type)->size == 3
4823 && SYMBOLP (XVECTOR (eol_type)->contents[CODING_EOL_LF]))
4824 coding->symbol = XVECTOR (eol_type)->contents[CODING_EOL_LF];
4825 else
4826 coding->symbol = saved_coding_symbol;
fb88bf2d
KH
4827
4828 continue;
d46c5b12
KH
4829 }
4830 if (len_byte <= 0)
944bd420
KH
4831 {
4832 if (coding->type != coding_type_ccl
4833 || coding->mode & CODING_MODE_LAST_BLOCK)
4834 break;
4835 coding->mode |= CODING_MODE_LAST_BLOCK;
4836 continue;
4837 }
d46c5b12
KH
4838 if (result == CODING_FINISH_INSUFFICIENT_SRC)
4839 {
4840 /* The source text ends in invalid codes. Let's just
4841 make them valid buffer contents, and finish conversion. */
fb88bf2d 4842 inserted += len_byte;
d46c5b12 4843 inserted_byte += len_byte;
fb88bf2d 4844 while (len_byte--)
ee59c65f 4845 *dst++ = *src++;
d46c5b12
KH
4846 break;
4847 }
9864ebce
KH
4848 if (result == CODING_FINISH_INTERRUPT)
4849 {
4850 /* The conversion procedure was interrupted by a user. */
9864ebce
KH
4851 break;
4852 }
4853 /* Now RESULT == CODING_FINISH_INSUFFICIENT_DST */
4854 if (coding->consumed < 1)
4855 {
4856 /* It's quite strange to require more memory without
4857 consuming any bytes. Perhaps CCL program bug. */
9864ebce
KH
4858 break;
4859 }
fb88bf2d
KH
4860 if (first)
4861 {
4862 /* We have just done the first batch of conversion which was
4863 stoped because of insufficient gap. Let's reconsider the
4864 required gap size (i.e. SRT - DST) now.
4865
4866 We have converted ORIG bytes (== coding->consumed) into
4867 NEW bytes (coding->produced). To convert the remaining
4868 LEN bytes, we may need REQUIRE bytes of gap, where:
4869 REQUIRE + LEN_BYTE = LEN_BYTE * (NEW / ORIG)
4870 REQUIRE = LEN_BYTE * (NEW - ORIG) / ORIG
4871 Here, we are sure that NEW >= ORIG. */
6e44253b
KH
4872 float ratio = coding->produced - coding->consumed;
4873 ratio /= coding->consumed;
4874 require = len_byte * ratio;
fb88bf2d
KH
4875 first = 0;
4876 }
4877 if ((src - dst) < (require + 2000))
4878 {
4879 /* See the comment above the previous call of make_gap. */
4880 int add = len_byte + inserted_byte;
4881
4882 GAP_SIZE -= add;
4883 ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
4884 GPT += inserted_byte; GPT_BYTE += inserted_byte;
4885 make_gap (require + 2000);
4886 GAP_SIZE += add;
4887 ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
4888 GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
fb88bf2d 4889 }
d46c5b12 4890 }
fb88bf2d
KH
4891 if (src - dst > 0) *dst = 0; /* Put an anchor. */
4892
b73bfc1c
KH
4893 if (encodep && coding->dst_multibyte)
4894 {
4895 /* The output is unibyte. We must convert 8-bit characters to
4896 multibyte form. */
4897 if (inserted_byte * 2 > GAP_SIZE)
4898 {
4899 GAP_SIZE -= inserted_byte;
4900 ZV += inserted_byte; Z += inserted_byte;
4901 ZV_BYTE += inserted_byte; Z_BYTE += inserted_byte;
4902 GPT += inserted_byte; GPT_BYTE += inserted_byte;
4903 make_gap (inserted_byte - GAP_SIZE);
4904 GAP_SIZE += inserted_byte;
4905 ZV -= inserted_byte; Z -= inserted_byte;
4906 ZV_BYTE -= inserted_byte; Z_BYTE -= inserted_byte;
4907 GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
4908 }
4909 inserted_byte = str_to_multibyte (GPT_ADDR, GAP_SIZE, inserted_byte);
4910 }
7553d0e1 4911
12410ef1
KH
4912 /* If we have shrinked the conversion area, adjust it now. */
4913 if (total_skip > 0)
4914 {
4915 if (tail_skip > 0)
4916 safe_bcopy (GAP_END_ADDR, GPT_ADDR + inserted_byte, tail_skip);
4917 inserted += total_skip; inserted_byte += total_skip;
4918 GAP_SIZE += total_skip;
4919 GPT -= head_skip; GPT_BYTE -= head_skip;
4920 ZV -= total_skip; ZV_BYTE -= total_skip;
4921 Z -= total_skip; Z_BYTE -= total_skip;
4922 from -= head_skip; from_byte -= head_skip;
4923 to += tail_skip; to_byte += tail_skip;
4924 }
4925
6abb9bd9 4926 prev_Z = Z;
12410ef1 4927 adjust_after_replace (from, from_byte, deletion, inserted, inserted_byte);
6abb9bd9 4928 inserted = Z - prev_Z;
4ed46869 4929
ec6d2bb8
KH
4930 if (!encodep && coding->cmp_data && coding->cmp_data->used)
4931 coding_restore_composition (coding, Fcurrent_buffer ());
4932 coding_free_composition_data (coding);
4933
b73bfc1c
KH
4934 if (! inhibit_pre_post_conversion
4935 && ! encodep && ! NILP (coding->post_read_conversion))
d46c5b12 4936 {
2b4f9037 4937 Lisp_Object val;
b843d1ae 4938 int count = specpdl_ptr - specpdl;
4ed46869 4939
e133c8fa
KH
4940 if (from != PT)
4941 TEMP_SET_PT_BOTH (from, from_byte);
6abb9bd9 4942 prev_Z = Z;
b843d1ae
KH
4943 record_unwind_protect (code_convert_region_unwind, Qnil);
4944 /* We should not call any more pre-write/post-read-conversion
4945 functions while this post-read-conversion is running. */
4946 inhibit_pre_post_conversion = 1;
2b4f9037 4947 val = call1 (coding->post_read_conversion, make_number (inserted));
b843d1ae
KH
4948 inhibit_pre_post_conversion = 0;
4949 /* Discard the unwind protect. */
4950 specpdl_ptr--;
6abb9bd9 4951 CHECK_NUMBER (val, 0);
944bd420 4952 inserted += Z - prev_Z;
e133c8fa
KH
4953 }
4954
4955 if (orig_point >= from)
4956 {
4957 if (orig_point >= from + orig_len)
4958 orig_point += inserted - orig_len;
4959 else
4960 orig_point = from;
4961 TEMP_SET_PT (orig_point);
d46c5b12 4962 }
4ed46869 4963
ec6d2bb8
KH
4964 if (replace)
4965 {
4966 signal_after_change (from, to - from, inserted);
e19539f1 4967 update_compositions (from, from + inserted, CHECK_BORDER);
ec6d2bb8 4968 }
2b4f9037 4969
fb88bf2d 4970 {
12410ef1
KH
4971 coding->consumed = to_byte - from_byte;
4972 coding->consumed_char = to - from;
4973 coding->produced = inserted_byte;
4974 coding->produced_char = inserted;
fb88bf2d 4975 }
7553d0e1 4976
fb88bf2d 4977 return 0;
d46c5b12
KH
4978}
4979
4980Lisp_Object
b73bfc1c
KH
4981run_pre_post_conversion_on_str (str, coding, encodep)
4982 Lisp_Object str;
4983 struct coding_system *coding;
4984 int encodep;
4985{
4986 int count = specpdl_ptr - specpdl;
4987 struct gcpro gcpro1;
4988 struct buffer *prev = current_buffer;
4989 int multibyte = STRING_MULTIBYTE (str);
4990
4991 record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
4992 record_unwind_protect (code_convert_region_unwind, Qnil);
4993 GCPRO1 (str);
4994 temp_output_buffer_setup (" *code-converting-work*");
4995 set_buffer_internal (XBUFFER (Vstandard_output));
4996 /* We must insert the contents of STR as is without
4997 unibyte<->multibyte conversion. For that, we adjust the
4998 multibyteness of the working buffer to that of STR. */
4999 Ferase_buffer ();
5000 current_buffer->enable_multibyte_characters = multibyte ? Qt : Qnil;
5001 insert_from_string (str, 0, 0,
5002 XSTRING (str)->size, STRING_BYTES (XSTRING (str)), 0);
5003 UNGCPRO;
5004 inhibit_pre_post_conversion = 1;
5005 if (encodep)
5006 call2 (coding->pre_write_conversion, make_number (BEG), make_number (Z));
5007 else
6bac5b12
KH
5008 {
5009 TEMP_SET_PT_BOTH (BEG, BEG_BYTE);
5010 call1 (coding->post_read_conversion, make_number (Z - BEG));
5011 }
b73bfc1c
KH
5012 inhibit_pre_post_conversion = 0;
5013 str = make_buffer_string (BEG, Z, 0);
5014 return unbind_to (count, str);
5015}
5016
5017Lisp_Object
5018decode_coding_string (str, coding, nocopy)
d46c5b12 5019 Lisp_Object str;
4ed46869 5020 struct coding_system *coding;
b73bfc1c 5021 int nocopy;
4ed46869 5022{
d46c5b12
KH
5023 int len;
5024 char *buf;
b73bfc1c 5025 int from, to, to_byte;
d46c5b12 5026 struct gcpro gcpro1;
84d60297 5027 Lisp_Object saved_coding_symbol;
d46c5b12 5028 int result;
4ed46869 5029
b73bfc1c
KH
5030 from = 0;
5031 to = XSTRING (str)->size;
5032 to_byte = STRING_BYTES (XSTRING (str));
4ed46869 5033
b73bfc1c
KH
5034 saved_coding_symbol = Qnil;
5035 if (CODING_REQUIRE_DETECTION (coding))
d46c5b12
KH
5036 {
5037 /* See the comments in code_convert_region. */
5038 if (coding->type == coding_type_undecided)
5039 {
5040 detect_coding (coding, XSTRING (str)->data, to_byte);
5041 if (coding->type == coding_type_undecided)
5042 coding->type = coding_type_emacs_mule;
5043 }
5044 if (coding->eol_type == CODING_EOL_UNDECIDED)
5045 {
5046 saved_coding_symbol = coding->symbol;
5047 detect_eol (coding, XSTRING (str)->data, to_byte);
5048 if (coding->eol_type == CODING_EOL_UNDECIDED)
5049 coding->eol_type = CODING_EOL_LF;
5050 /* We had better recover the original eol format if we
5051 encounter an inconsitent eol format while decoding. */
5052 coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
5053 }
5054 }
4ed46869 5055
b73bfc1c 5056 if (! CODING_REQUIRE_DECODING (coding))
ec6d2bb8 5057 {
b73bfc1c
KH
5058 if (!STRING_MULTIBYTE (str))
5059 {
5060 str = Fstring_as_multibyte (str);
5061 nocopy = 1;
5062 }
5063 return (nocopy ? str : Fcopy_sequence (str));
ec6d2bb8
KH
5064 }
5065
b73bfc1c 5066 if (STRING_MULTIBYTE (str))
d46c5b12 5067 {
b73bfc1c
KH
5068 /* Decoding routines expect the source text to be unibyte. */
5069 str = Fstring_as_unibyte (str);
5070 nocopy = 1;
5071 coding->src_multibyte = 0;
5072 }
5073 coding->dst_multibyte = 1;
ec6d2bb8 5074
b73bfc1c
KH
5075 if (coding->composing != COMPOSITION_DISABLED)
5076 coding_allocate_composition_data (coding, from);
ec6d2bb8 5077
b73bfc1c
KH
5078 /* Try to skip the heading and tailing ASCIIs. */
5079 {
5080 int from_orig = from;
4ed46869 5081
b73bfc1c
KH
5082 SHRINK_CONVERSION_REGION (&from, &to_byte, coding, XSTRING (str)->data,
5083 0);
5084 if (from == to_byte)
5085 return (nocopy ? str : Fcopy_sequence (str));
5086 }
5087
5088 len = decoding_buffer_size (coding, to_byte - from);
fc932ac6 5089 len += from + STRING_BYTES (XSTRING (str)) - to_byte;
d46c5b12
KH
5090 GCPRO1 (str);
5091 buf = get_conversion_buffer (len);
5092 UNGCPRO;
4ed46869 5093
d46c5b12
KH
5094 if (from > 0)
5095 bcopy (XSTRING (str)->data, buf, from);
b73bfc1c
KH
5096 result = decode_coding (coding, XSTRING (str)->data + from,
5097 buf + from, to_byte - from, len);
5098 if (result == CODING_FINISH_INCONSISTENT_EOL)
4ed46869 5099 {
ec6d2bb8 5100 /* We simply try to decode the whole string again but without
d46c5b12
KH
5101 eol-conversion this time. */
5102 coding->eol_type = CODING_EOL_LF;
5103 coding->symbol = saved_coding_symbol;
ec6d2bb8 5104 coding_free_composition_data (coding);
b73bfc1c 5105 return decode_coding_string (str, coding, nocopy);
4ed46869 5106 }
d46c5b12
KH
5107
5108 bcopy (XSTRING (str)->data + to_byte, buf + from + coding->produced,
fc932ac6 5109 STRING_BYTES (XSTRING (str)) - to_byte);
d46c5b12 5110
fc932ac6 5111 len = from + STRING_BYTES (XSTRING (str)) - to_byte;
b73bfc1c
KH
5112 str = make_multibyte_string (buf, len + coding->produced_char,
5113 len + coding->produced);
5114
5115 if (coding->cmp_data && coding->cmp_data->used)
5116 coding_restore_composition (coding, str);
5117 coding_free_composition_data (coding);
5118
5119 if (SYMBOLP (coding->post_read_conversion)
5120 && !NILP (Ffboundp (coding->post_read_conversion)))
6bac5b12 5121 str = run_pre_post_conversion_on_str (str, coding, 0);
b73bfc1c
KH
5122
5123 return str;
5124}
5125
5126Lisp_Object
5127encode_coding_string (str, coding, nocopy)
5128 Lisp_Object str;
5129 struct coding_system *coding;
5130 int nocopy;
5131{
5132 int len;
5133 char *buf;
5134 int from, to, to_byte;
5135 struct gcpro gcpro1;
5136 Lisp_Object saved_coding_symbol;
5137 int result;
5138
5139 if (SYMBOLP (coding->pre_write_conversion)
5140 && !NILP (Ffboundp (coding->pre_write_conversion)))
6bac5b12 5141 str = run_pre_post_conversion_on_str (str, coding, 1);
b73bfc1c
KH
5142
5143 from = 0;
5144 to = XSTRING (str)->size;
5145 to_byte = STRING_BYTES (XSTRING (str));
5146
5147 saved_coding_symbol = Qnil;
5148 if (! CODING_REQUIRE_ENCODING (coding))
826bfb8b 5149 {
b73bfc1c
KH
5150 if (STRING_MULTIBYTE (str))
5151 {
5152 str = Fstring_as_unibyte (str);
5153 nocopy = 1;
5154 }
5155 return (nocopy ? str : Fcopy_sequence (str));
826bfb8b
KH
5156 }
5157
b73bfc1c
KH
5158 /* Encoding routines determine the multibyteness of the source text
5159 by coding->src_multibyte. */
5160 coding->src_multibyte = STRING_MULTIBYTE (str);
5161 coding->dst_multibyte = 0;
5162
5163 if (coding->composing != COMPOSITION_DISABLED)
5164 coding_save_composition (coding, from, to, str);
ec6d2bb8 5165
b73bfc1c
KH
5166 /* Try to skip the heading and tailing ASCIIs. */
5167 {
5168 int from_orig = from;
5169
5170 SHRINK_CONVERSION_REGION (&from, &to_byte, coding, XSTRING (str)->data,
5171 1);
5172 if (from == to_byte)
5173 return (nocopy ? str : Fcopy_sequence (str));
5174 }
5175
5176 len = encoding_buffer_size (coding, to_byte - from);
5177 len += from + STRING_BYTES (XSTRING (str)) - to_byte;
5178 GCPRO1 (str);
5179 buf = get_conversion_buffer (len);
5180 UNGCPRO;
5181
5182 if (from > 0)
5183 bcopy (XSTRING (str)->data, buf, from);
5184 result = encode_coding (coding, XSTRING (str)->data + from,
5185 buf + from, to_byte - from, len);
5186 bcopy (XSTRING (str)->data + to_byte, buf + from + coding->produced,
5187 STRING_BYTES (XSTRING (str)) - to_byte);
5188
5189 len = from + STRING_BYTES (XSTRING (str)) - to_byte;
5190 str = make_unibyte_string (buf, len + coding->produced);
ec6d2bb8 5191 coding_free_composition_data (coding);
b73bfc1c 5192
d46c5b12 5193 return str;
4ed46869
KH
5194}
5195
5196\f
5197#ifdef emacs
1397dc18 5198/*** 8. Emacs Lisp library functions ***/
4ed46869 5199
4ed46869
KH
5200DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
5201 "Return t if OBJECT is nil or a coding-system.\n\
3a73fa5d
RS
5202See the documentation of `make-coding-system' for information\n\
5203about coding-system objects.")
4ed46869
KH
5204 (obj)
5205 Lisp_Object obj;
5206{
4608c386
KH
5207 if (NILP (obj))
5208 return Qt;
5209 if (!SYMBOLP (obj))
5210 return Qnil;
5211 /* Get coding-spec vector for OBJ. */
5212 obj = Fget (obj, Qcoding_system);
5213 return ((VECTORP (obj) && XVECTOR (obj)->size == 5)
5214 ? Qt : Qnil);
4ed46869
KH
5215}
5216
9d991de8
RS
5217DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
5218 Sread_non_nil_coding_system, 1, 1, 0,
e0e989f6 5219 "Read a coding system from the minibuffer, prompting with string PROMPT.")
4ed46869
KH
5220 (prompt)
5221 Lisp_Object prompt;
5222{
e0e989f6 5223 Lisp_Object val;
9d991de8
RS
5224 do
5225 {
4608c386
KH
5226 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
5227 Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
9d991de8
RS
5228 }
5229 while (XSTRING (val)->size == 0);
e0e989f6 5230 return (Fintern (val, Qnil));
4ed46869
KH
5231}
5232
9b787f3e
RS
5233DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
5234 "Read a coding system from the minibuffer, prompting with string PROMPT.\n\
5235If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.")
5236 (prompt, default_coding_system)
5237 Lisp_Object prompt, default_coding_system;
4ed46869 5238{
f44d27ce 5239 Lisp_Object val;
9b787f3e
RS
5240 if (SYMBOLP (default_coding_system))
5241 XSETSTRING (default_coding_system, XSYMBOL (default_coding_system)->name);
4608c386 5242 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
9b787f3e
RS
5243 Qt, Qnil, Qcoding_system_history,
5244 default_coding_system, Qnil);
e0e989f6 5245 return (XSTRING (val)->size == 0 ? Qnil : Fintern (val, Qnil));
4ed46869
KH
5246}
5247
5248DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
5249 1, 1, 0,
5250 "Check validity of CODING-SYSTEM.\n\
3a73fa5d
RS
5251If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.\n\
5252It is valid if it is a symbol with a non-nil `coding-system' property.\n\
4ed46869
KH
5253The value of property should be a vector of length 5.")
5254 (coding_system)
5255 Lisp_Object coding_system;
5256{
5257 CHECK_SYMBOL (coding_system, 0);
5258 if (!NILP (Fcoding_system_p (coding_system)))
5259 return coding_system;
5260 while (1)
02ba4723 5261 Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
4ed46869 5262}
3a73fa5d 5263\f
d46c5b12
KH
5264Lisp_Object
5265detect_coding_system (src, src_bytes, highest)
5266 unsigned char *src;
5267 int src_bytes, highest;
4ed46869
KH
5268{
5269 int coding_mask, eol_type;
d46c5b12
KH
5270 Lisp_Object val, tmp;
5271 int dummy;
4ed46869 5272
d46c5b12
KH
5273 coding_mask = detect_coding_mask (src, src_bytes, NULL, &dummy);
5274 eol_type = detect_eol_type (src, src_bytes, &dummy);
5275 if (eol_type == CODING_EOL_INCONSISTENT)
25b02698 5276 eol_type = CODING_EOL_UNDECIDED;
4ed46869 5277
d46c5b12 5278 if (!coding_mask)
4ed46869 5279 {
27901516 5280 val = Qundecided;
d46c5b12 5281 if (eol_type != CODING_EOL_UNDECIDED)
4ed46869 5282 {
f44d27ce
RS
5283 Lisp_Object val2;
5284 val2 = Fget (Qundecided, Qeol_type);
4ed46869
KH
5285 if (VECTORP (val2))
5286 val = XVECTOR (val2)->contents[eol_type];
5287 }
80e803b4 5288 return (highest ? val : Fcons (val, Qnil));
4ed46869 5289 }
4ed46869 5290
d46c5b12
KH
5291 /* At first, gather possible coding systems in VAL. */
5292 val = Qnil;
fa42c37f 5293 for (tmp = Vcoding_category_list; CONSP (tmp); tmp = XCDR (tmp))
4ed46869 5294 {
fa42c37f
KH
5295 Lisp_Object category_val, category_index;
5296
5297 category_index = Fget (XCAR (tmp), Qcoding_category_index);
5298 category_val = Fsymbol_value (XCAR (tmp));
5299 if (!NILP (category_val)
5300 && NATNUMP (category_index)
5301 && (coding_mask & (1 << XFASTINT (category_index))))
4ed46869 5302 {
fa42c37f 5303 val = Fcons (category_val, val);
d46c5b12
KH
5304 if (highest)
5305 break;
4ed46869
KH
5306 }
5307 }
d46c5b12
KH
5308 if (!highest)
5309 val = Fnreverse (val);
4ed46869 5310
65059037 5311 /* Then, replace the elements with subsidiary coding systems. */
fa42c37f 5312 for (tmp = val; CONSP (tmp); tmp = XCDR (tmp))
4ed46869 5313 {
65059037
RS
5314 if (eol_type != CODING_EOL_UNDECIDED
5315 && eol_type != CODING_EOL_INCONSISTENT)
4ed46869 5316 {
d46c5b12 5317 Lisp_Object eol;
03699b14 5318 eol = Fget (XCAR (tmp), Qeol_type);
d46c5b12 5319 if (VECTORP (eol))
03699b14 5320 XCAR (tmp) = XVECTOR (eol)->contents[eol_type];
4ed46869
KH
5321 }
5322 }
03699b14 5323 return (highest ? XCAR (val) : val);
d46c5b12 5324}
4ed46869 5325
d46c5b12
KH
5326DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
5327 2, 3, 0,
5328 "Detect coding system of the text in the region between START and END.\n\
5329Return a list of possible coding systems ordered by priority.\n\
5330\n\
80e803b4
KH
5331If only ASCII characters are found, it returns a list of single element\n\
5332`undecided' or its subsidiary coding system according to a detected\n\
5333end-of-line format.\n\
d46c5b12
KH
5334\n\
5335If optional argument HIGHEST is non-nil, return the coding system of\n\
5336highest priority.")
5337 (start, end, highest)
5338 Lisp_Object start, end, highest;
5339{
5340 int from, to;
5341 int from_byte, to_byte;
6289dd10 5342
d46c5b12
KH
5343 CHECK_NUMBER_COERCE_MARKER (start, 0);
5344 CHECK_NUMBER_COERCE_MARKER (end, 1);
4ed46869 5345
d46c5b12
KH
5346 validate_region (&start, &end);
5347 from = XINT (start), to = XINT (end);
5348 from_byte = CHAR_TO_BYTE (from);
5349 to_byte = CHAR_TO_BYTE (to);
6289dd10 5350
d46c5b12
KH
5351 if (from < GPT && to >= GPT)
5352 move_gap_both (to, to_byte);
4ed46869 5353
d46c5b12
KH
5354 return detect_coding_system (BYTE_POS_ADDR (from_byte),
5355 to_byte - from_byte,
5356 !NILP (highest));
5357}
6289dd10 5358
d46c5b12
KH
5359DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
5360 1, 2, 0,
5361 "Detect coding system of the text in STRING.\n\
5362Return a list of possible coding systems ordered by priority.\n\
5363\n\
80e803b4
KH
5364If only ASCII characters are found, it returns a list of single element\n\
5365`undecided' or its subsidiary coding system according to a detected\n\
5366end-of-line format.\n\
d46c5b12
KH
5367\n\
5368If optional argument HIGHEST is non-nil, return the coding system of\n\
5369highest priority.")
5370 (string, highest)
5371 Lisp_Object string, highest;
5372{
5373 CHECK_STRING (string, 0);
4ed46869 5374
d46c5b12 5375 return detect_coding_system (XSTRING (string)->data,
fc932ac6 5376 STRING_BYTES (XSTRING (string)),
d46c5b12 5377 !NILP (highest));
4ed46869
KH
5378}
5379
4031e2bf
KH
5380Lisp_Object
5381code_convert_region1 (start, end, coding_system, encodep)
d46c5b12 5382 Lisp_Object start, end, coding_system;
4031e2bf 5383 int encodep;
3a73fa5d
RS
5384{
5385 struct coding_system coding;
4031e2bf 5386 int from, to, len;
3a73fa5d 5387
d46c5b12
KH
5388 CHECK_NUMBER_COERCE_MARKER (start, 0);
5389 CHECK_NUMBER_COERCE_MARKER (end, 1);
3a73fa5d
RS
5390 CHECK_SYMBOL (coding_system, 2);
5391
d46c5b12
KH
5392 validate_region (&start, &end);
5393 from = XFASTINT (start);
5394 to = XFASTINT (end);
5395
3a73fa5d 5396 if (NILP (coding_system))
d46c5b12
KH
5397 return make_number (to - from);
5398
3a73fa5d 5399 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
d46c5b12 5400 error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data);
3a73fa5d 5401
d46c5b12 5402 coding.mode |= CODING_MODE_LAST_BLOCK;
b73bfc1c
KH
5403 coding.src_multibyte = coding.dst_multibyte
5404 = !NILP (current_buffer->enable_multibyte_characters);
fb88bf2d
KH
5405 code_convert_region (from, CHAR_TO_BYTE (from), to, CHAR_TO_BYTE (to),
5406 &coding, encodep, 1);
f072a3e8 5407 Vlast_coding_system_used = coding.symbol;
fb88bf2d 5408 return make_number (coding.produced_char);
4031e2bf
KH
5409}
5410
5411DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
5412 3, 3, "r\nzCoding system: ",
5413 "Decode the current region by specified coding system.\n\
5414When called from a program, takes three arguments:\n\
5415START, END, and CODING-SYSTEM. START and END are buffer positions.\n\
f072a3e8
RS
5416This function sets `last-coding-system-used' to the precise coding system\n\
5417used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
5418not fully specified.)\n\
5419It returns the length of the decoded text.")
4031e2bf
KH
5420 (start, end, coding_system)
5421 Lisp_Object start, end, coding_system;
5422{
5423 return code_convert_region1 (start, end, coding_system, 0);
3a73fa5d
RS
5424}
5425
5426DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
5427 3, 3, "r\nzCoding system: ",
d46c5b12 5428 "Encode the current region by specified coding system.\n\
3a73fa5d 5429When called from a program, takes three arguments:\n\
d46c5b12 5430START, END, and CODING-SYSTEM. START and END are buffer positions.\n\
f072a3e8
RS
5431This function sets `last-coding-system-used' to the precise coding system\n\
5432used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
5433not fully specified.)\n\
5434It returns the length of the encoded text.")
d46c5b12
KH
5435 (start, end, coding_system)
5436 Lisp_Object start, end, coding_system;
3a73fa5d 5437{
4031e2bf
KH
5438 return code_convert_region1 (start, end, coding_system, 1);
5439}
3a73fa5d 5440
4031e2bf
KH
5441Lisp_Object
5442code_convert_string1 (string, coding_system, nocopy, encodep)
5443 Lisp_Object string, coding_system, nocopy;
5444 int encodep;
5445{
5446 struct coding_system coding;
3a73fa5d 5447
4031e2bf
KH
5448 CHECK_STRING (string, 0);
5449 CHECK_SYMBOL (coding_system, 1);
4ed46869 5450
d46c5b12 5451 if (NILP (coding_system))
4031e2bf 5452 return (NILP (nocopy) ? Fcopy_sequence (string) : string);
4ed46869 5453
d46c5b12
KH
5454 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
5455 error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data);
5f1cd180 5456
d46c5b12 5457 coding.mode |= CODING_MODE_LAST_BLOCK;
b73bfc1c
KH
5458 string = (encodep
5459 ? encode_coding_string (string, &coding, !NILP (nocopy))
5460 : decode_coding_string (string, &coding, !NILP (nocopy)));
f072a3e8 5461 Vlast_coding_system_used = coding.symbol;
ec6d2bb8
KH
5462
5463 return string;
4ed46869
KH
5464}
5465
4ed46869 5466DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
e0e989f6
KH
5467 2, 3, 0,
5468 "Decode STRING which is encoded in CODING-SYSTEM, and return the result.\n\
fe487a71 5469Optional arg NOCOPY non-nil means it is ok to return STRING itself\n\
f072a3e8
RS
5470if the decoding operation is trivial.\n\
5471This function sets `last-coding-system-used' to the precise coding system\n\
5472used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
5473not fully specified.)")
e0e989f6
KH
5474 (string, coding_system, nocopy)
5475 Lisp_Object string, coding_system, nocopy;
4ed46869 5476{
f072a3e8 5477 return code_convert_string1 (string, coding_system, nocopy, 0);
4ed46869
KH
5478}
5479
5480DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
e0e989f6
KH
5481 2, 3, 0,
5482 "Encode STRING to CODING-SYSTEM, and return the result.\n\
fe487a71 5483Optional arg NOCOPY non-nil means it is ok to return STRING itself\n\
f072a3e8
RS
5484if the encoding operation is trivial.\n\
5485This function sets `last-coding-system-used' to the precise coding system\n\
5486used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
5487not fully specified.)")
e0e989f6
KH
5488 (string, coding_system, nocopy)
5489 Lisp_Object string, coding_system, nocopy;
4ed46869 5490{
f072a3e8 5491 return code_convert_string1 (string, coding_system, nocopy, 1);
4ed46869 5492}
4031e2bf 5493
ecec61c1 5494/* Encode or decode STRING according to CODING_SYSTEM.
ec6d2bb8
KH
5495 Do not set Vlast_coding_system_used.
5496
5497 This function is called only from macros DECODE_FILE and
5498 ENCODE_FILE, thus we ignore character composition. */
ecec61c1
KH
5499
5500Lisp_Object
5501code_convert_string_norecord (string, coding_system, encodep)
5502 Lisp_Object string, coding_system;
5503 int encodep;
5504{
5505 struct coding_system coding;
5506
5507 CHECK_STRING (string, 0);
5508 CHECK_SYMBOL (coding_system, 1);
5509
5510 if (NILP (coding_system))
5511 return string;
5512
5513 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
5514 error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data);
5515
ec6d2bb8 5516 coding.composing = COMPOSITION_DISABLED;
ecec61c1 5517 coding.mode |= CODING_MODE_LAST_BLOCK;
b73bfc1c
KH
5518 return (encodep
5519 ? encode_coding_string (string, &coding, 1)
5520 : decode_coding_string (string, &coding, 1));
ecec61c1 5521}
3a73fa5d 5522\f
4ed46869 5523DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
55ab7be3 5524 "Decode a Japanese character which has CODE in shift_jis encoding.\n\
4ed46869
KH
5525Return the corresponding character.")
5526 (code)
5527 Lisp_Object code;
5528{
5529 unsigned char c1, c2, s1, s2;
5530 Lisp_Object val;
5531
5532 CHECK_NUMBER (code, 0);
5533 s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
55ab7be3
KH
5534 if (s1 == 0)
5535 {
c28a9453
KH
5536 if (s2 < 0x80)
5537 XSETFASTINT (val, s2);
5538 else if (s2 >= 0xA0 || s2 <= 0xDF)
b73bfc1c 5539 XSETFASTINT (val, MAKE_CHAR (charset_katakana_jisx0201, s2, 0));
c28a9453 5540 else
9da8350f 5541 error ("Invalid Shift JIS code: %x", XFASTINT (code));
55ab7be3
KH
5542 }
5543 else
5544 {
5545 if ((s1 < 0x80 || s1 > 0x9F && s1 < 0xE0 || s1 > 0xEF)
5546 || (s2 < 0x40 || s2 == 0x7F || s2 > 0xFC))
9da8350f 5547 error ("Invalid Shift JIS code: %x", XFASTINT (code));
55ab7be3 5548 DECODE_SJIS (s1, s2, c1, c2);
b73bfc1c 5549 XSETFASTINT (val, MAKE_CHAR (charset_jisx0208, c1, c2));
55ab7be3 5550 }
4ed46869
KH
5551 return val;
5552}
5553
5554DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
55ab7be3
KH
5555 "Encode a Japanese character CHAR to shift_jis encoding.\n\
5556Return the corresponding code in SJIS.")
4ed46869
KH
5557 (ch)
5558 Lisp_Object ch;
5559{
bcf26d6a 5560 int charset, c1, c2, s1, s2;
4ed46869
KH
5561 Lisp_Object val;
5562
5563 CHECK_NUMBER (ch, 0);
5564 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
c28a9453
KH
5565 if (charset == CHARSET_ASCII)
5566 {
5567 val = ch;
5568 }
5569 else if (charset == charset_jisx0208
5570 && c1 > 0x20 && c1 < 0x7F && c2 > 0x20 && c2 < 0x7F)
4ed46869
KH
5571 {
5572 ENCODE_SJIS (c1, c2, s1, s2);
bcf26d6a 5573 XSETFASTINT (val, (s1 << 8) | s2);
4ed46869 5574 }
55ab7be3
KH
5575 else if (charset == charset_katakana_jisx0201
5576 && c1 > 0x20 && c2 < 0xE0)
5577 {
5578 XSETFASTINT (val, c1 | 0x80);
5579 }
4ed46869 5580 else
55ab7be3 5581 error ("Can't encode to shift_jis: %d", XFASTINT (ch));
4ed46869
KH
5582 return val;
5583}
5584
5585DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
c28a9453 5586 "Decode a Big5 character which has CODE in BIG5 coding system.\n\
4ed46869
KH
5587Return the corresponding character.")
5588 (code)
5589 Lisp_Object code;
5590{
5591 int charset;
5592 unsigned char b1, b2, c1, c2;
5593 Lisp_Object val;
5594
5595 CHECK_NUMBER (code, 0);
5596 b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF;
c28a9453
KH
5597 if (b1 == 0)
5598 {
5599 if (b2 >= 0x80)
9da8350f 5600 error ("Invalid BIG5 code: %x", XFASTINT (code));
c28a9453
KH
5601 val = code;
5602 }
5603 else
5604 {
5605 if ((b1 < 0xA1 || b1 > 0xFE)
5606 || (b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE))
9da8350f 5607 error ("Invalid BIG5 code: %x", XFASTINT (code));
c28a9453 5608 DECODE_BIG5 (b1, b2, charset, c1, c2);
b73bfc1c 5609 XSETFASTINT (val, MAKE_CHAR (charset, c1, c2));
c28a9453 5610 }
4ed46869
KH
5611 return val;
5612}
5613
5614DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
d46c5b12 5615 "Encode the Big5 character CHAR to BIG5 coding system.\n\
4ed46869
KH
5616Return the corresponding character code in Big5.")
5617 (ch)
5618 Lisp_Object ch;
5619{
bcf26d6a 5620 int charset, c1, c2, b1, b2;
4ed46869
KH
5621 Lisp_Object val;
5622
5623 CHECK_NUMBER (ch, 0);
5624 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
c28a9453
KH
5625 if (charset == CHARSET_ASCII)
5626 {
5627 val = ch;
5628 }
5629 else if ((charset == charset_big5_1
5630 && (XFASTINT (ch) >= 0x250a1 && XFASTINT (ch) <= 0x271ec))
5631 || (charset == charset_big5_2
5632 && XFASTINT (ch) >= 0x290a1 && XFASTINT (ch) <= 0x2bdb2))
4ed46869
KH
5633 {
5634 ENCODE_BIG5 (charset, c1, c2, b1, b2);
bcf26d6a 5635 XSETFASTINT (val, (b1 << 8) | b2);
4ed46869
KH
5636 }
5637 else
c28a9453 5638 error ("Can't encode to Big5: %d", XFASTINT (ch));
4ed46869
KH
5639 return val;
5640}
3a73fa5d 5641\f
1ba9e4ab
KH
5642DEFUN ("set-terminal-coding-system-internal",
5643 Fset_terminal_coding_system_internal,
5644 Sset_terminal_coding_system_internal, 1, 1, 0, "")
4ed46869
KH
5645 (coding_system)
5646 Lisp_Object coding_system;
5647{
5648 CHECK_SYMBOL (coding_system, 0);
5649 setup_coding_system (Fcheck_coding_system (coding_system), &terminal_coding);
70c22245 5650 /* We had better not send unsafe characters to terminal. */
6e85d753 5651 terminal_coding.flags |= CODING_FLAG_ISO_SAFE;
ec6d2bb8
KH
5652 /* Characer composition should be disabled. */
5653 terminal_coding.composing = COMPOSITION_DISABLED;
b73bfc1c
KH
5654 terminal_coding.src_multibyte = 1;
5655 terminal_coding.dst_multibyte = 0;
4ed46869
KH
5656 return Qnil;
5657}
5658
c4825358
KH
5659DEFUN ("set-safe-terminal-coding-system-internal",
5660 Fset_safe_terminal_coding_system_internal,
5661 Sset_safe_terminal_coding_system_internal, 1, 1, 0, "")
5662 (coding_system)
5663 Lisp_Object coding_system;
5664{
5665 CHECK_SYMBOL (coding_system, 0);
5666 setup_coding_system (Fcheck_coding_system (coding_system),
5667 &safe_terminal_coding);
ec6d2bb8
KH
5668 /* Characer composition should be disabled. */
5669 safe_terminal_coding.composing = COMPOSITION_DISABLED;
b73bfc1c
KH
5670 safe_terminal_coding.src_multibyte = 1;
5671 safe_terminal_coding.dst_multibyte = 0;
c4825358
KH
5672 return Qnil;
5673}
5674
4ed46869
KH
5675DEFUN ("terminal-coding-system",
5676 Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
3a73fa5d 5677 "Return coding system specified for terminal output.")
4ed46869
KH
5678 ()
5679{
5680 return terminal_coding.symbol;
5681}
5682
1ba9e4ab
KH
5683DEFUN ("set-keyboard-coding-system-internal",
5684 Fset_keyboard_coding_system_internal,
5685 Sset_keyboard_coding_system_internal, 1, 1, 0, "")
4ed46869
KH
5686 (coding_system)
5687 Lisp_Object coding_system;
5688{
5689 CHECK_SYMBOL (coding_system, 0);
5690 setup_coding_system (Fcheck_coding_system (coding_system), &keyboard_coding);
ec6d2bb8
KH
5691 /* Characer composition should be disabled. */
5692 keyboard_coding.composing = COMPOSITION_DISABLED;
4ed46869
KH
5693 return Qnil;
5694}
5695
5696DEFUN ("keyboard-coding-system",
5697 Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
3a73fa5d 5698 "Return coding system specified for decoding keyboard input.")
4ed46869
KH
5699 ()
5700{
5701 return keyboard_coding.symbol;
5702}
5703
5704\f
a5d301df
KH
5705DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
5706 Sfind_operation_coding_system, 1, MANY, 0,
5707 "Choose a coding system for an operation based on the target name.\n\
69f76525 5708The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).\n\
9ce27fde
KH
5709DECODING-SYSTEM is the coding system to use for decoding\n\
5710\(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system\n\
5711for encoding (in case OPERATION does encoding).\n\
ccdb79f5
RS
5712\n\
5713The first argument OPERATION specifies an I/O primitive:\n\
5714 For file I/O, `insert-file-contents' or `write-region'.\n\
5715 For process I/O, `call-process', `call-process-region', or `start-process'.\n\
5716 For network I/O, `open-network-stream'.\n\
5717\n\
5718The remaining arguments should be the same arguments that were passed\n\
5719to the primitive. Depending on which primitive, one of those arguments\n\
5720is selected as the TARGET. For example, if OPERATION does file I/O,\n\
5721whichever argument specifies the file name is TARGET.\n\
5722\n\
5723TARGET has a meaning which depends on OPERATION:\n\
4ed46869
KH
5724 For file I/O, TARGET is a file name.\n\
5725 For process I/O, TARGET is a process name.\n\
5726 For network I/O, TARGET is a service name or a port number\n\
5727\n\
02ba4723
KH
5728This function looks up what specified for TARGET in,\n\
5729`file-coding-system-alist', `process-coding-system-alist',\n\
5730or `network-coding-system-alist' depending on OPERATION.\n\
5731They may specify a coding system, a cons of coding systems,\n\
5732or a function symbol to call.\n\
5733In the last case, we call the function with one argument,\n\
9ce27fde 5734which is a list of all the arguments given to this function.")
4ed46869
KH
5735 (nargs, args)
5736 int nargs;
5737 Lisp_Object *args;
5738{
5739 Lisp_Object operation, target_idx, target, val;
5740 register Lisp_Object chain;
5741
5742 if (nargs < 2)
5743 error ("Too few arguments");
5744 operation = args[0];
5745 if (!SYMBOLP (operation)
5746 || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
5747 error ("Invalid first arguement");
5748 if (nargs < 1 + XINT (target_idx))
5749 error ("Too few arguments for operation: %s",
5750 XSYMBOL (operation)->name->data);
5751 target = args[XINT (target_idx) + 1];
5752 if (!(STRINGP (target)
5753 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
5754 error ("Invalid %dth argument", XINT (target_idx) + 1);
5755
2e34157c
RS
5756 chain = ((EQ (operation, Qinsert_file_contents)
5757 || EQ (operation, Qwrite_region))
02ba4723 5758 ? Vfile_coding_system_alist
2e34157c 5759 : (EQ (operation, Qopen_network_stream)
02ba4723
KH
5760 ? Vnetwork_coding_system_alist
5761 : Vprocess_coding_system_alist));
4ed46869
KH
5762 if (NILP (chain))
5763 return Qnil;
5764
03699b14 5765 for (; CONSP (chain); chain = XCDR (chain))
4ed46869 5766 {
f44d27ce 5767 Lisp_Object elt;
03699b14 5768 elt = XCAR (chain);
4ed46869
KH
5769
5770 if (CONSP (elt)
5771 && ((STRINGP (target)
03699b14
KR
5772 && STRINGP (XCAR (elt))
5773 && fast_string_match (XCAR (elt), target) >= 0)
5774 || (INTEGERP (target) && EQ (target, XCAR (elt)))))
02ba4723 5775 {
03699b14 5776 val = XCDR (elt);
b19fd4c5
KH
5777 /* Here, if VAL is both a valid coding system and a valid
5778 function symbol, we return VAL as a coding system. */
02ba4723
KH
5779 if (CONSP (val))
5780 return val;
5781 if (! SYMBOLP (val))
5782 return Qnil;
5783 if (! NILP (Fcoding_system_p (val)))
5784 return Fcons (val, val);
b19fd4c5
KH
5785 if (! NILP (Ffboundp (val)))
5786 {
5787 val = call1 (val, Flist (nargs, args));
5788 if (CONSP (val))
5789 return val;
5790 if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
5791 return Fcons (val, val);
5792 }
02ba4723
KH
5793 return Qnil;
5794 }
4ed46869
KH
5795 }
5796 return Qnil;
5797}
5798
1397dc18
KH
5799DEFUN ("update-coding-systems-internal", Fupdate_coding_systems_internal,
5800 Supdate_coding_systems_internal, 0, 0, 0,
5801 "Update internal database for ISO2022 and CCL based coding systems.\n\
fa42c37f
KH
5802When values of any coding categories are changed, you must\n\
5803call this function")
d46c5b12
KH
5804 ()
5805{
5806 int i;
5807
fa42c37f 5808 for (i = CODING_CATEGORY_IDX_EMACS_MULE; i < CODING_CATEGORY_IDX_MAX; i++)
d46c5b12 5809 {
1397dc18
KH
5810 Lisp_Object val;
5811
5812 val = XSYMBOL (XVECTOR (Vcoding_category_table)->contents[i])->value;
5813 if (!NILP (val))
5814 {
5815 if (! coding_system_table[i])
5816 coding_system_table[i] = ((struct coding_system *)
5817 xmalloc (sizeof (struct coding_system)));
5818 setup_coding_system (val, coding_system_table[i]);
5819 }
5820 else if (coding_system_table[i])
5821 {
5822 xfree (coding_system_table[i]);
5823 coding_system_table[i] = NULL;
5824 }
d46c5b12 5825 }
1397dc18 5826
d46c5b12
KH
5827 return Qnil;
5828}
5829
66cfb530
KH
5830DEFUN ("set-coding-priority-internal", Fset_coding_priority_internal,
5831 Sset_coding_priority_internal, 0, 0, 0,
5832 "Update internal database for the current value of `coding-category-list'.\n\
5833This function is internal use only.")
5834 ()
5835{
5836 int i = 0, idx;
84d60297
RS
5837 Lisp_Object val;
5838
5839 val = Vcoding_category_list;
66cfb530
KH
5840
5841 while (CONSP (val) && i < CODING_CATEGORY_IDX_MAX)
5842 {
03699b14 5843 if (! SYMBOLP (XCAR (val)))
66cfb530 5844 break;
03699b14 5845 idx = XFASTINT (Fget (XCAR (val), Qcoding_category_index));
66cfb530
KH
5846 if (idx >= CODING_CATEGORY_IDX_MAX)
5847 break;
5848 coding_priorities[i++] = (1 << idx);
03699b14 5849 val = XCDR (val);
66cfb530
KH
5850 }
5851 /* If coding-category-list is valid and contains all coding
5852 categories, `i' should be CODING_CATEGORY_IDX_MAX now. If not,
fa42c37f 5853 the following code saves Emacs from crashing. */
66cfb530
KH
5854 while (i < CODING_CATEGORY_IDX_MAX)
5855 coding_priorities[i++] = CODING_CATEGORY_MASK_RAW_TEXT;
5856
5857 return Qnil;
5858}
5859
4ed46869
KH
5860#endif /* emacs */
5861
5862\f
1397dc18 5863/*** 9. Post-amble ***/
4ed46869 5864
6d74c3aa
KH
5865void
5866init_coding ()
5867{
5868 conversion_buffer = (char *) xmalloc (MINIMUM_CONVERSION_BUFFER_SIZE);
5869}
5870
dfcf069d 5871void
4ed46869
KH
5872init_coding_once ()
5873{
5874 int i;
5875
0ef69138 5876 /* Emacs' internal format specific initialize routine. */
4ed46869
KH
5877 for (i = 0; i <= 0x20; i++)
5878 emacs_code_class[i] = EMACS_control_code;
5879 emacs_code_class[0x0A] = EMACS_linefeed_code;
5880 emacs_code_class[0x0D] = EMACS_carriage_return_code;
5881 for (i = 0x21 ; i < 0x7F; i++)
5882 emacs_code_class[i] = EMACS_ascii_code;
5883 emacs_code_class[0x7F] = EMACS_control_code;
ec6d2bb8 5884 for (i = 0x80; i < 0xFF; i++)
4ed46869
KH
5885 emacs_code_class[i] = EMACS_invalid_code;
5886 emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
5887 emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
5888 emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
5889 emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;
5890
5891 /* ISO2022 specific initialize routine. */
5892 for (i = 0; i < 0x20; i++)
b73bfc1c 5893 iso_code_class[i] = ISO_control_0;
4ed46869
KH
5894 for (i = 0x21; i < 0x7F; i++)
5895 iso_code_class[i] = ISO_graphic_plane_0;
5896 for (i = 0x80; i < 0xA0; i++)
b73bfc1c 5897 iso_code_class[i] = ISO_control_1;
4ed46869
KH
5898 for (i = 0xA1; i < 0xFF; i++)
5899 iso_code_class[i] = ISO_graphic_plane_1;
5900 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
5901 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
5902 iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
5903 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
5904 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
5905 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
5906 iso_code_class[ISO_CODE_ESC] = ISO_escape;
5907 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
5908 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
5909 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
5910
e0e989f6 5911 conversion_buffer_size = MINIMUM_CONVERSION_BUFFER_SIZE;
e0e989f6
KH
5912
5913 setup_coding_system (Qnil, &keyboard_coding);
5914 setup_coding_system (Qnil, &terminal_coding);
c4825358 5915 setup_coding_system (Qnil, &safe_terminal_coding);
6bc51348 5916 setup_coding_system (Qnil, &default_buffer_file_coding);
9ce27fde 5917
d46c5b12
KH
5918 bzero (coding_system_table, sizeof coding_system_table);
5919
66cfb530
KH
5920 bzero (ascii_skip_code, sizeof ascii_skip_code);
5921 for (i = 0; i < 128; i++)
5922 ascii_skip_code[i] = 1;
5923
9ce27fde
KH
5924#if defined (MSDOS) || defined (WINDOWSNT)
5925 system_eol_type = CODING_EOL_CRLF;
5926#else
5927 system_eol_type = CODING_EOL_LF;
5928#endif
b843d1ae
KH
5929
5930 inhibit_pre_post_conversion = 0;
e0e989f6
KH
5931}
5932
5933#ifdef emacs
5934
dfcf069d 5935void
e0e989f6
KH
5936syms_of_coding ()
5937{
5938 Qtarget_idx = intern ("target-idx");
5939 staticpro (&Qtarget_idx);
5940
bb0115a2
RS
5941 Qcoding_system_history = intern ("coding-system-history");
5942 staticpro (&Qcoding_system_history);
5943 Fset (Qcoding_system_history, Qnil);
5944
9ce27fde 5945 /* Target FILENAME is the first argument. */
e0e989f6 5946 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
9ce27fde 5947 /* Target FILENAME is the third argument. */
e0e989f6
KH
5948 Fput (Qwrite_region, Qtarget_idx, make_number (2));
5949
5950 Qcall_process = intern ("call-process");
5951 staticpro (&Qcall_process);
9ce27fde 5952 /* Target PROGRAM is the first argument. */
e0e989f6
KH
5953 Fput (Qcall_process, Qtarget_idx, make_number (0));
5954
5955 Qcall_process_region = intern ("call-process-region");
5956 staticpro (&Qcall_process_region);
9ce27fde 5957 /* Target PROGRAM is the third argument. */
e0e989f6
KH
5958 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
5959
5960 Qstart_process = intern ("start-process");
5961 staticpro (&Qstart_process);
9ce27fde 5962 /* Target PROGRAM is the third argument. */
e0e989f6
KH
5963 Fput (Qstart_process, Qtarget_idx, make_number (2));
5964
5965 Qopen_network_stream = intern ("open-network-stream");
5966 staticpro (&Qopen_network_stream);
9ce27fde 5967 /* Target SERVICE is the fourth argument. */
e0e989f6
KH
5968 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
5969
4ed46869
KH
5970 Qcoding_system = intern ("coding-system");
5971 staticpro (&Qcoding_system);
5972
5973 Qeol_type = intern ("eol-type");
5974 staticpro (&Qeol_type);
5975
5976 Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
5977 staticpro (&Qbuffer_file_coding_system);
5978
5979 Qpost_read_conversion = intern ("post-read-conversion");
5980 staticpro (&Qpost_read_conversion);
5981
5982 Qpre_write_conversion = intern ("pre-write-conversion");
5983 staticpro (&Qpre_write_conversion);
5984
27901516
KH
5985 Qno_conversion = intern ("no-conversion");
5986 staticpro (&Qno_conversion);
5987
5988 Qundecided = intern ("undecided");
5989 staticpro (&Qundecided);
5990
4ed46869
KH
5991 Qcoding_system_p = intern ("coding-system-p");
5992 staticpro (&Qcoding_system_p);
5993
5994 Qcoding_system_error = intern ("coding-system-error");
5995 staticpro (&Qcoding_system_error);
5996
5997 Fput (Qcoding_system_error, Qerror_conditions,
5998 Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
5999 Fput (Qcoding_system_error, Qerror_message,
9ce27fde 6000 build_string ("Invalid coding system"));
4ed46869 6001
d46c5b12
KH
6002 Qcoding_category = intern ("coding-category");
6003 staticpro (&Qcoding_category);
4ed46869
KH
6004 Qcoding_category_index = intern ("coding-category-index");
6005 staticpro (&Qcoding_category_index);
6006
d46c5b12
KH
6007 Vcoding_category_table
6008 = Fmake_vector (make_number (CODING_CATEGORY_IDX_MAX), Qnil);
6009 staticpro (&Vcoding_category_table);
4ed46869
KH
6010 {
6011 int i;
6012 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
6013 {
d46c5b12
KH
6014 XVECTOR (Vcoding_category_table)->contents[i]
6015 = intern (coding_category_name[i]);
6016 Fput (XVECTOR (Vcoding_category_table)->contents[i],
6017 Qcoding_category_index, make_number (i));
4ed46869
KH
6018 }
6019 }
6020
f967223b
KH
6021 Qtranslation_table = intern ("translation-table");
6022 staticpro (&Qtranslation_table);
1397dc18 6023 Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (1));
bdd9fb48 6024
f967223b
KH
6025 Qtranslation_table_id = intern ("translation-table-id");
6026 staticpro (&Qtranslation_table_id);
84fbb8a0 6027
f967223b
KH
6028 Qtranslation_table_for_decode = intern ("translation-table-for-decode");
6029 staticpro (&Qtranslation_table_for_decode);
a5d301df 6030
f967223b
KH
6031 Qtranslation_table_for_encode = intern ("translation-table-for-encode");
6032 staticpro (&Qtranslation_table_for_encode);
a5d301df 6033
70c22245
KH
6034 Qsafe_charsets = intern ("safe-charsets");
6035 staticpro (&Qsafe_charsets);
6036
1397dc18
KH
6037 Qvalid_codes = intern ("valid-codes");
6038 staticpro (&Qvalid_codes);
6039
9ce27fde
KH
6040 Qemacs_mule = intern ("emacs-mule");
6041 staticpro (&Qemacs_mule);
6042
d46c5b12
KH
6043 Qraw_text = intern ("raw-text");
6044 staticpro (&Qraw_text);
6045
4ed46869
KH
6046 defsubr (&Scoding_system_p);
6047 defsubr (&Sread_coding_system);
6048 defsubr (&Sread_non_nil_coding_system);
6049 defsubr (&Scheck_coding_system);
6050 defsubr (&Sdetect_coding_region);
d46c5b12 6051 defsubr (&Sdetect_coding_string);
4ed46869
KH
6052 defsubr (&Sdecode_coding_region);
6053 defsubr (&Sencode_coding_region);
6054 defsubr (&Sdecode_coding_string);
6055 defsubr (&Sencode_coding_string);
6056 defsubr (&Sdecode_sjis_char);
6057 defsubr (&Sencode_sjis_char);
6058 defsubr (&Sdecode_big5_char);
6059 defsubr (&Sencode_big5_char);
1ba9e4ab 6060 defsubr (&Sset_terminal_coding_system_internal);
c4825358 6061 defsubr (&Sset_safe_terminal_coding_system_internal);
4ed46869 6062 defsubr (&Sterminal_coding_system);
1ba9e4ab 6063 defsubr (&Sset_keyboard_coding_system_internal);
4ed46869 6064 defsubr (&Skeyboard_coding_system);
a5d301df 6065 defsubr (&Sfind_operation_coding_system);
1397dc18 6066 defsubr (&Supdate_coding_systems_internal);
66cfb530 6067 defsubr (&Sset_coding_priority_internal);
4ed46869 6068
4608c386
KH
6069 DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
6070 "List of coding systems.\n\
6071\n\
6072Do not alter the value of this variable manually. This variable should be\n\
6073updated by the functions `make-coding-system' and\n\
6074`define-coding-system-alias'.");
6075 Vcoding_system_list = Qnil;
6076
6077 DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
6078 "Alist of coding system names.\n\
6079Each element is one element list of coding system name.\n\
6080This variable is given to `completing-read' as TABLE argument.\n\
6081\n\
6082Do not alter the value of this variable manually. This variable should be\n\
6083updated by the functions `make-coding-system' and\n\
6084`define-coding-system-alias'.");
6085 Vcoding_system_alist = Qnil;
6086
4ed46869
KH
6087 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
6088 "List of coding-categories (symbols) ordered by priority.");
6089 {
6090 int i;
6091
6092 Vcoding_category_list = Qnil;
6093 for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
6094 Vcoding_category_list
d46c5b12
KH
6095 = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
6096 Vcoding_category_list);
4ed46869
KH
6097 }
6098
6099 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
10bff6f1 6100 "Specify the coding system for read operations.\n\
2ebb362d 6101It is useful to bind this variable with `let', but do not set it globally.\n\
4ed46869 6102If the value is a coding system, it is used for decoding on read operation.\n\
a67a9c66 6103If not, an appropriate element is used from one of the coding system alists:\n\
10bff6f1 6104There are three such tables, `file-coding-system-alist',\n\
a67a9c66 6105`process-coding-system-alist', and `network-coding-system-alist'.");
4ed46869
KH
6106 Vcoding_system_for_read = Qnil;
6107
6108 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
10bff6f1 6109 "Specify the coding system for write operations.\n\
928aedd8
RS
6110Programs bind this variable with `let', but you should not set it globally.\n\
6111If the value is a coding system, it is used for encoding of output,\n\
6112when writing it to a file and when sending it to a file or subprocess.\n\
6113\n\
6114If this does not specify a coding system, an appropriate element\n\
6115is used from one of the coding system alists:\n\
10bff6f1 6116There are three such tables, `file-coding-system-alist',\n\
928aedd8
RS
6117`process-coding-system-alist', and `network-coding-system-alist'.\n\
6118For output to files, if the above procedure does not specify a coding system,\n\
6119the value of `buffer-file-coding-system' is used.");
4ed46869
KH
6120 Vcoding_system_for_write = Qnil;
6121
6122 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
a67a9c66 6123 "Coding system used in the latest file or process I/O.");
4ed46869
KH
6124 Vlast_coding_system_used = Qnil;
6125
9ce27fde 6126 DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
f07f4a24 6127 "*Non-nil means always inhibit code conversion of end-of-line format.\n\
94c7a214
DL
6128See info node `Coding Systems' and info node `Text and Binary' concerning\n\
6129such conversion.");
9ce27fde
KH
6130 inhibit_eol_conversion = 0;
6131
ed29121d
EZ
6132 DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
6133 "Non-nil means process buffer inherits coding system of process output.\n\
6134Bind it to t if the process output is to be treated as if it were a file\n\
6135read from some filesystem.");
6136 inherit_process_coding_system = 0;
6137
02ba4723
KH
6138 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
6139 "Alist to decide a coding system to use for a file I/O operation.\n\
6140The format is ((PATTERN . VAL) ...),\n\
6141where PATTERN is a regular expression matching a file name,\n\
6142VAL is a coding system, a cons of coding systems, or a function symbol.\n\
6143If VAL is a coding system, it is used for both decoding and encoding\n\
6144the file contents.\n\
6145If VAL is a cons of coding systems, the car part is used for decoding,\n\
6146and the cdr part is used for encoding.\n\
6147If VAL is a function symbol, the function must return a coding system\n\
6148or a cons of coding systems which are used as above.\n\
e0e989f6 6149\n\
a85a871a 6150See also the function `find-operation-coding-system'\n\
eda284ac 6151and the variable `auto-coding-alist'.");
02ba4723
KH
6152 Vfile_coding_system_alist = Qnil;
6153
6154 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
6155 "Alist to decide a coding system to use for a process I/O operation.\n\
6156The format is ((PATTERN . VAL) ...),\n\
6157where PATTERN is a regular expression matching a program name,\n\
6158VAL is a coding system, a cons of coding systems, or a function symbol.\n\
6159If VAL is a coding system, it is used for both decoding what received\n\
6160from the program and encoding what sent to the program.\n\
6161If VAL is a cons of coding systems, the car part is used for decoding,\n\
6162and the cdr part is used for encoding.\n\
6163If VAL is a function symbol, the function must return a coding system\n\
6164or a cons of coding systems which are used as above.\n\
4ed46869 6165\n\
9ce27fde 6166See also the function `find-operation-coding-system'.");
02ba4723
KH
6167 Vprocess_coding_system_alist = Qnil;
6168
6169 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
6170 "Alist to decide a coding system to use for a network I/O operation.\n\
6171The format is ((PATTERN . VAL) ...),\n\
6172where PATTERN is a regular expression matching a network service name\n\
6173or is a port number to connect to,\n\
6174VAL is a coding system, a cons of coding systems, or a function symbol.\n\
6175If VAL is a coding system, it is used for both decoding what received\n\
6176from the network stream and encoding what sent to the network stream.\n\
6177If VAL is a cons of coding systems, the car part is used for decoding,\n\
6178and the cdr part is used for encoding.\n\
6179If VAL is a function symbol, the function must return a coding system\n\
6180or a cons of coding systems which are used as above.\n\
4ed46869 6181\n\
9ce27fde 6182See also the function `find-operation-coding-system'.");
02ba4723 6183 Vnetwork_coding_system_alist = Qnil;
4ed46869 6184
68c45bf0
PE
6185 DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
6186 "Coding system to use with system messages.");
6187 Vlocale_coding_system = Qnil;
6188
005f0d35 6189 /* The eol mnemonics are reset in startup.el system-dependently. */
7722baf9
EZ
6190 DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
6191 "*String displayed in mode line for UNIX-like (LF) end-of-line format.");
6192 eol_mnemonic_unix = build_string (":");
4ed46869 6193
7722baf9
EZ
6194 DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
6195 "*String displayed in mode line for DOS-like (CRLF) end-of-line format.");
6196 eol_mnemonic_dos = build_string ("\\");
4ed46869 6197
7722baf9
EZ
6198 DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
6199 "*String displayed in mode line for MAC-like (CR) end-of-line format.");
6200 eol_mnemonic_mac = build_string ("/");
4ed46869 6201
7722baf9
EZ
6202 DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
6203 "*String displayed in mode line when end-of-line format is not yet determined.");
6204 eol_mnemonic_undecided = build_string (":");
4ed46869 6205
84fbb8a0 6206 DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
f967223b 6207 "*Non-nil enables character translation while encoding and decoding.");
84fbb8a0 6208 Venable_character_translation = Qt;
bdd9fb48 6209
f967223b
KH
6210 DEFVAR_LISP ("standard-translation-table-for-decode",
6211 &Vstandard_translation_table_for_decode,
84fbb8a0 6212 "Table for translating characters while decoding.");
f967223b 6213 Vstandard_translation_table_for_decode = Qnil;
bdd9fb48 6214
f967223b
KH
6215 DEFVAR_LISP ("standard-translation-table-for-encode",
6216 &Vstandard_translation_table_for_encode,
84fbb8a0 6217 "Table for translating characters while encoding.");
f967223b 6218 Vstandard_translation_table_for_encode = Qnil;
4ed46869
KH
6219
6220 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
6221 "Alist of charsets vs revision numbers.\n\
6222While encoding, if a charset (car part of an element) is found,\n\
6223designate it with the escape sequence identifying revision (cdr part of the element).");
6224 Vcharset_revision_alist = Qnil;
02ba4723
KH
6225
6226 DEFVAR_LISP ("default-process-coding-system",
6227 &Vdefault_process_coding_system,
6228 "Cons of coding systems used for process I/O by default.\n\
6229The car part is used for decoding a process output,\n\
6230the cdr part is used for encoding a text to be sent to a process.");
6231 Vdefault_process_coding_system = Qnil;
c4825358 6232
3f003981
KH
6233 DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
6234 "Table of extra Latin codes in the range 128..159 (inclusive).\n\
c4825358
KH
6235This is a vector of length 256.\n\
6236If Nth element is non-nil, the existence of code N in a file\n\
bb0115a2 6237\(or output of subprocess) doesn't prevent it from being detected as\n\
3f003981
KH
6238a coding system of ISO 2022 variant which has a flag\n\
6239`accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file\n\
c4825358
KH
6240or reading output of a subprocess.\n\
6241Only 128th through 159th elements have a meaning.");
3f003981 6242 Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
d46c5b12
KH
6243
6244 DEFVAR_LISP ("select-safe-coding-system-function",
6245 &Vselect_safe_coding_system_function,
6246 "Function to call to select safe coding system for encoding a text.\n\
6247\n\
6248If set, this function is called to force a user to select a proper\n\
6249coding system which can encode the text in the case that a default\n\
6250coding system used in each operation can't encode the text.\n\
6251\n\
a85a871a 6252The default value is `select-safe-coding-system' (which see).");
d46c5b12
KH
6253 Vselect_safe_coding_system_function = Qnil;
6254
4ed46869
KH
6255}
6256
68c45bf0
PE
6257char *
6258emacs_strerror (error_number)
6259 int error_number;
6260{
6261 char *str;
6262
ca9c0567 6263 synchronize_system_messages_locale ();
68c45bf0
PE
6264 str = strerror (error_number);
6265
6266 if (! NILP (Vlocale_coding_system))
6267 {
6268 Lisp_Object dec = code_convert_string_norecord (build_string (str),
6269 Vlocale_coding_system,
6270 0);
6271 str = (char *) XSTRING (dec)->data;
6272 }
6273
6274 return str;
6275}
6276
4ed46869 6277#endif /* emacs */
c2f94ebc 6278