(Fchars_in_region): Fix mixing of Lisp_Object and int.
[bpt/emacs.git] / src / coding.c
CommitLineData
4ed46869 1/* Coding system handler (conversion, detection, and etc).
4a2f9c6a 2 Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
203cb916 3 Licensed to the Free Software Foundation.
4ed46869 4
369314dc
KH
5This file is part of GNU Emacs.
6
7GNU Emacs is free software; you can redistribute it and/or modify
8it under the terms of the GNU General Public License as published by
9the Free Software Foundation; either version 2, or (at your option)
10any later version.
4ed46869 11
369314dc
KH
12GNU Emacs is distributed in the hope that it will be useful,
13but WITHOUT ANY WARRANTY; without even the implied warranty of
14MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15GNU General Public License for more details.
4ed46869 16
369314dc
KH
17You should have received a copy of the GNU General Public License
18along with GNU Emacs; see the file COPYING. If not, write to
19the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20Boston, MA 02111-1307, USA. */
4ed46869
KH
21
22/*** TABLE OF CONTENTS ***
23
24 1. Preamble
0ef69138 25 2. Emacs' internal format (emacs-mule) handlers
4ed46869
KH
26 3. ISO2022 handlers
27 4. Shift-JIS and BIG5 handlers
28 5. End-of-line handlers
29 6. C library functions
30 7. Emacs Lisp library functions
31 8. Post-amble
32
33*/
34
35/*** GENERAL NOTE on CODING SYSTEM ***
36
37 Coding system is an encoding mechanism of one or more character
38 sets. Here's a list of coding systems which Emacs can handle. When
39 we say "decode", it means converting some other coding system to
0ef69138
KH
40 Emacs' internal format (emacs-mule), and when we say "encode",
41 it means converting the coding system emacs-mule to some other
42 coding system.
4ed46869 43
0ef69138 44 0. Emacs' internal format (emacs-mule)
4ed46869
KH
45
46 Emacs itself holds a multi-lingual character in a buffer and a string
f4dee582 47 in a special format. Details are described in section 2.
4ed46869
KH
48
49 1. ISO2022
50
51 The most famous coding system for multiple character sets. X's
f4dee582
RS
52 Compound Text, various EUCs (Extended Unix Code), and coding
53 systems used in Internet communication such as ISO-2022-JP are
54 all variants of ISO2022. Details are described in section 3.
4ed46869
KH
55
56 2. SJIS (or Shift-JIS or MS-Kanji-Code)
57
58 A coding system to encode character sets: ASCII, JISX0201, and
59 JISX0208. Widely used for PC's in Japan. Details are described in
f4dee582 60 section 4.
4ed46869
KH
61
62 3. BIG5
63
64 A coding system to encode character sets: ASCII and Big5. Widely
65 used by Chinese (mainly in Taiwan and Hong Kong). Details are
f4dee582
RS
66 described in section 4. In this file, when we write "BIG5"
67 (all uppercase), we mean the coding system, and when we write
68 "Big5" (capitalized), we mean the character set.
4ed46869 69
27901516
KH
70 4. Raw text
71
4608c386
KH
72 A coding system for a text containing random 8-bit code. Emacs does
73 no code conversion on such a text except for end-of-line format.
27901516
KH
74
75 5. Other
4ed46869 76
f4dee582 77 If a user wants to read/write a text encoded in a coding system not
4ed46869
KH
78 listed above, he can supply a decoder and an encoder for it in CCL
79 (Code Conversion Language) programs. Emacs executes the CCL program
80 while reading/writing.
81
d46c5b12
KH
82 Emacs represents a coding system by a Lisp symbol that has a property
83 `coding-system'. But, before actually using the coding system, the
4ed46869 84 information about it is set in a structure of type `struct
f4dee582 85 coding_system' for rapid processing. See section 6 for more details.
4ed46869
KH
86
87*/
88
89/*** GENERAL NOTES on END-OF-LINE FORMAT ***
90
91 How end-of-line of a text is encoded depends on a system. For
92 instance, Unix's format is just one byte of `line-feed' code,
f4dee582 93 whereas DOS's format is two-byte sequence of `carriage-return' and
d46c5b12
KH
94 `line-feed' codes. MacOS's format is usually one byte of
95 `carriage-return'.
4ed46869 96
f4dee582
RS
97 Since text characters encoding and end-of-line encoding are
98 independent, any coding system described above can take
4ed46869 99 any format of end-of-line. So, Emacs has information of format of
f4dee582 100 end-of-line in each coding-system. See section 6 for more details.
4ed46869
KH
101
102*/
103
104/*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
105
106 These functions check if a text between SRC and SRC_END is encoded
107 in the coding system category XXX. Each returns an integer value in
108 which appropriate flag bits for the category XXX are set. The flag
109 bits are defined in macros CODING_CATEGORY_MASK_XXX. Below is the
110 template of these functions. */
111#if 0
112int
0ef69138 113detect_coding_emacs_mule (src, src_end)
4ed46869
KH
114 unsigned char *src, *src_end;
115{
116 ...
117}
118#endif
119
120/*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
121
122 These functions decode SRC_BYTES length text at SOURCE encoded in
0ef69138 123 CODING to Emacs' internal format (emacs-mule). The resulting text
d46c5b12
KH
124 goes to a place pointed to by DESTINATION, the length of which
125 should not exceed DST_BYTES. These functions set the information of
126 original and decoded texts in the members produced, produced_char,
127 consumed, and consumed_char of the structure *CODING.
128
129 The return value is an integer (CODING_FINISH_XXX) indicating how
130 the decoding finished.
131
132 DST_BYTES zero means that source area and destination area are
133 overlapped, which means that we can produce a decoded text until it
134 reaches at the head of not-yet-decoded source text.
135
136 Below is a template of these functions. */
4ed46869 137#if 0
d46c5b12 138decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
4ed46869
KH
139 struct coding_system *coding;
140 unsigned char *source, *destination;
141 int src_bytes, dst_bytes;
4ed46869
KH
142{
143 ...
144}
145#endif
146
147/*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
148
0ef69138
KH
149 These functions encode SRC_BYTES length text at SOURCE of Emacs'
150 internal format (emacs-mule) to CODING. The resulting text goes to
f4dee582 151 a place pointed to by DESTINATION, the length of which should not
d46c5b12
KH
152 exceed DST_BYTES. These functions set the information of
153 original and encoded texts in the members produced, produced_char,
154 consumed, and consumed_char of the structure *CODING.
155
156 The return value is an integer (CODING_FINISH_XXX) indicating how
157 the encoding finished.
158
159 DST_BYTES zero means that source area and destination area are
160 overlapped, which means that we can produce an encoded text until it
161 reaches the head of not-yet-encoded source text.
162
163 Below is a template of these functions. */
4ed46869 164#if 0
d46c5b12 165encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
4ed46869
KH
166 struct coding_system *coding;
167 unsigned char *source, *destination;
168 int src_bytes, dst_bytes;
4ed46869
KH
169{
170 ...
171}
172#endif
173
174/*** COMMONLY USED MACROS ***/
175
/* The following three macros ONE_MORE_BYTE, TWO_MORE_BYTES, and
   THREE_MORE_BYTES safely get one, two, and three bytes from the
   source text respectively.  If there are not enough bytes in the
   source, they jump to `label_end_of_loop' without consuming
   anything.  The caller should set variables `src' and `src_end' to
   appropriate areas in advance.

   The number of remaining bytes is computed as `src_end - src'
   instead of comparing `src + N' against `src_end': forming a
   pointer that lies more than one element past the end of the source
   area is undefined behavior in C even if it is never dereferenced.  */

#define ONE_MORE_BYTE(c1)                       \
  do {                                          \
    if (src < src_end)                          \
      c1 = *src++;                              \
    else                                        \
      goto label_end_of_loop;                   \
  } while (0)

#define TWO_MORE_BYTES(c1, c2)                  \
  do {                                          \
    if (src_end - src >= 2)                     \
      c1 = *src++, c2 = *src++;                 \
    else                                        \
      goto label_end_of_loop;                   \
  } while (0)

#define THREE_MORE_BYTES(c1, c2, c3)            \
  do {                                          \
    if (src_end - src >= 3)                     \
      c1 = *src++, c2 = *src++, c3 = *src++;    \
    else                                        \
      goto label_end_of_loop;                   \
  } while (0)
205
/* The macros DECODE_CHARACTER_ASCII, DECODE_CHARACTER_DIMENSION1, and
   DECODE_CHARACTER_DIMENSION2 below store the multi-byte form of one
   character at the place pointed to by `dst' and advance `dst' past
   it.  Before using them, the caller must make `dst' point to an
   appropriate area and `coding' point to the coding-system of the
   text being decoded.

   While composing (COMPOSING_P (coding->composing)), a character is
   written in its component form and coding->produced_char is left
   alone; otherwise coding->produced_char is incremented once per
   character.  */

/* Decode one ASCII character C.  As a composition component it is
   written as the two bytes 0xA0 and C | 0x80.  */

#define DECODE_CHARACTER_ASCII(c)                       \
  do {                                                  \
    if (COMPOSING_P (coding->composing))                \
      {                                                 \
        *dst++ = 0xA0;                                  \
        *dst++ = (c) | 0x80;                            \
      }                                                 \
    else                                                \
      {                                                 \
        *dst++ = (c);                                   \
        coding->produced_char++;                        \
      }                                                 \
  } while (0)

/* Decode one DIMENSION1 character whose charset is CHARSET and whose
   position-code is C.  The base leading-code is written first (with
   0x20 added while composing), then the extended leading-code if
   CHARSET has a nonzero one, then C with the high bit set.  */

#define DECODE_CHARACTER_DIMENSION1(charset, c)                         \
  do {                                                                  \
    unsigned char lc_base = CHARSET_LEADING_CODE_BASE (charset);        \
    unsigned char lc_ext;                                               \
    if (COMPOSING_P (coding->composing))                                \
      *dst++ = lc_base + 0x20;                                          \
    else                                                                \
      {                                                                 \
        *dst++ = lc_base;                                               \
        coding->produced_char++;                                        \
      }                                                                 \
    lc_ext = CHARSET_LEADING_CODE_EXT (charset);                        \
    if (lc_ext != 0)                                                    \
      *dst++ = lc_ext;                                                  \
    *dst++ = (c) | 0x80;                                                \
  } while (0)

/* Decode one DIMENSION2 character whose charset is CHARSET and whose
   position-codes are C1 and C2.  This is the DIMENSION1 form for C1
   followed by C2 with the high bit set.  */

#define DECODE_CHARACTER_DIMENSION2(charset, c1, c2)    \
  do {                                                  \
    DECODE_CHARACTER_DIMENSION1 (charset, c1);          \
    *dst++ = (c2) | 0x80;                               \
  } while (0)
252
253\f
254/*** 1. Preamble ***/
255
256#include <stdio.h>
257
258#ifdef emacs
259
260#include <config.h>
261#include "lisp.h"
262#include "buffer.h"
263#include "charset.h"
264#include "ccl.h"
265#include "coding.h"
266#include "window.h"
267
268#else /* not emacs */
269
270#include "mulelib.h"
271
272#endif /* not emacs */
273
274Lisp_Object Qcoding_system, Qeol_type;
275Lisp_Object Qbuffer_file_coding_system;
276Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
27901516 277Lisp_Object Qno_conversion, Qundecided;
bb0115a2 278Lisp_Object Qcoding_system_history;
70c22245 279Lisp_Object Qsafe_charsets;
4ed46869
KH
280
281extern Lisp_Object Qinsert_file_contents, Qwrite_region;
282Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
283Lisp_Object Qstart_process, Qopen_network_stream;
284Lisp_Object Qtarget_idx;
285
d46c5b12
KH
286Lisp_Object Vselect_safe_coding_system_function;
287
4ed46869
KH
288/* Mnemonic character of each format of end-of-line. */
289int eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
290/* Mnemonic character to indicate format of end-of-line is not yet
291 decided. */
292int eol_mnemonic_undecided;
293
9ce27fde
KH
294/* Format of end-of-line decided by system. This is CODING_EOL_LF on
295 Unix, CODING_EOL_CRLF on DOS/Windows, and CODING_EOL_CR on Mac. */
296int system_eol_type;
297
4ed46869
KH
298#ifdef emacs
299
4608c386
KH
300Lisp_Object Vcoding_system_list, Vcoding_system_alist;
301
302Lisp_Object Qcoding_system_p, Qcoding_system_error;
4ed46869 303
d46c5b12
KH
304/* Coding system emacs-mule and raw-text are for converting only
305 end-of-line format. */
306Lisp_Object Qemacs_mule, Qraw_text;
9ce27fde 307
4ed46869
KH
308/* Coding-systems are handed between Emacs Lisp programs and C internal
309 routines by the following three variables. */
310/* Coding-system for reading files and receiving data from process. */
311Lisp_Object Vcoding_system_for_read;
312/* Coding-system for writing files and sending data to process. */
313Lisp_Object Vcoding_system_for_write;
314/* Coding-system actually used in the latest I/O. */
315Lisp_Object Vlast_coding_system_used;
316
c4825358 317/* A vector of length 256 which contains information about special
3f003981
KH
318 Latin codes (especially for dealing with Microsoft codes). */
319Lisp_Object Vlatin_extra_code_table;
c4825358 320
9ce27fde
KH
321/* Flag to inhibit code conversion of end-of-line format. */
322int inhibit_eol_conversion;
323
c4825358 324/* Coding system to be used to encode text for terminal display. */
4ed46869
KH
325struct coding_system terminal_coding;
326
c4825358
KH
327/* Coding system to be used to encode text for terminal display when
328 terminal coding system is nil. */
329struct coding_system safe_terminal_coding;
330
331/* Coding system of what is sent from terminal keyboard. */
4ed46869
KH
332struct coding_system keyboard_coding;
333
02ba4723
KH
334Lisp_Object Vfile_coding_system_alist;
335Lisp_Object Vprocess_coding_system_alist;
336Lisp_Object Vnetwork_coding_system_alist;
4ed46869
KH
337
338#endif /* emacs */
339
d46c5b12 340Lisp_Object Qcoding_category, Qcoding_category_index;
4ed46869
KH
341
342/* List of symbols `coding-category-xxx' ordered by priority. */
343Lisp_Object Vcoding_category_list;
344
d46c5b12
KH
345/* Table of coding categories (Lisp symbols). */
346Lisp_Object Vcoding_category_table;
4ed46869
KH
347
348/* Table of names of symbol for each coding-category. */
349char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
0ef69138 350 "coding-category-emacs-mule",
4ed46869
KH
351 "coding-category-sjis",
352 "coding-category-iso-7",
d46c5b12 353 "coding-category-iso-7-tight",
4ed46869
KH
354 "coding-category-iso-8-1",
355 "coding-category-iso-8-2",
7717c392
KH
356 "coding-category-iso-7-else",
357 "coding-category-iso-8-else",
4ed46869 358 "coding-category-big5",
27901516 359 "coding-category-raw-text",
4ed46869
KH
360 "coding-category-binary"
361};
362
d46c5b12
KH
363/* Table pointers to coding systems corresponding to each coding
364 categories. */
365struct coding_system *coding_system_table[CODING_CATEGORY_IDX_MAX];
366
bdd9fb48
KH
367/* Flag to tell if we look up unification table on character code
368 conversion. */
369Lisp_Object Venable_character_unification;
a5d301df
KH
370/* Standard unification table to look up on decoding (reading). */
371Lisp_Object Vstandard_character_unification_table_for_decode;
372/* Standard unification table to look up on encoding (writing). */
373Lisp_Object Vstandard_character_unification_table_for_encode;
bdd9fb48
KH
374
375Lisp_Object Qcharacter_unification_table;
a5d301df
KH
376Lisp_Object Qcharacter_unification_table_for_decode;
377Lisp_Object Qcharacter_unification_table_for_encode;
4ed46869
KH
378
379/* Alist of charsets vs revision number. */
380Lisp_Object Vcharset_revision_alist;
381
02ba4723
KH
382/* Default coding systems used for process I/O. */
383Lisp_Object Vdefault_process_coding_system;
384
4ed46869 385\f
0ef69138 386/*** 2. Emacs internal format (emacs-mule) handlers ***/
4ed46869
KH
387
388/* Emacs' internal format for encoding multiple character sets is a
f4dee582
RS
389 kind of multi-byte encoding, i.e. characters are encoded by
390 variable-length sequences of one-byte codes. ASCII characters
391 and control characters (e.g. `tab', `newline') are represented by
392 one-byte sequences which are their ASCII codes, in the range 0x00
393 through 0x7F. The other characters are represented by a sequence
394 of `base leading-code', optional `extended leading-code', and one
395 or two `position-code's. The length of the sequence is determined
396 by the base leading-code. Leading-code takes the range 0x80
397 through 0x9F, whereas extended leading-code and position-code take
398 the range 0xA0 through 0xFF. See `charset.h' for more details
399 about leading-code and position-code.
400
401 There's one exception to this rule. Special leading-code
4ed46869
KH
402 `leading-code-composition' denotes that the following several
403 characters should be composed into one character. Leading-codes of
404 components (except for ASCII) are added 0x20. An ASCII character
405 component is represented by a 2-byte sequence of `0xA0' and
f4dee582
RS
406 `ASCII-code + 0x80'. See also the comments in `charset.h' for the
407 details of composite character. Hence, we can summarize the code
4ed46869
KH
408 range as follows:
409
410 --- CODE RANGE of Emacs' internal format ---
411 (character set) (range)
412 ASCII 0x00 .. 0x7F
413 ELSE (1st byte) 0x80 .. 0x9F
414 (rest bytes) 0xA0 .. 0xFF
415 ---------------------------------------------
416
417 */
418
419enum emacs_code_class_type emacs_code_class[256];
420
/* Consume one byte at SRC and go on to the next statement only if
   that byte is 0xA0 or greater.  If SRC has already reached SRC_END,
   jump to `label_end_of_switch' without consuming anything; if the
   byte is less than 0xA0, make the enclosing function return 0.  */
#define CHECK_CODE_RANGE_A0_FF                  \
  do {                                          \
    if (src >= src_end)                         \
      goto label_end_of_switch;                 \
    if (*src++ < 0xA0)                          \
      return 0;                                 \
  } while (0)
430
431/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
432 Check if a text is encoded in Emacs' internal format. If it is,
d46c5b12 433 return CODING_CATEGORY_MASK_EMACS_MULE, else return 0. */
4ed46869
KH
434
435int
0ef69138 436detect_coding_emacs_mule (src, src_end)
4ed46869
KH
437 unsigned char *src, *src_end;
438{
439 unsigned char c;
440 int composing = 0;
441
442 while (src < src_end)
443 {
444 c = *src++;
445
446 if (composing)
447 {
448 if (c < 0xA0)
449 composing = 0;
450 else
451 c -= 0x20;
452 }
453
454 switch (emacs_code_class[c])
455 {
456 case EMACS_ascii_code:
457 case EMACS_linefeed_code:
458 break;
459
460 case EMACS_control_code:
461 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
462 return 0;
463 break;
464
465 case EMACS_invalid_code:
466 return 0;
467
468 case EMACS_leading_code_composition: /* c == 0x80 */
469 if (composing)
470 CHECK_CODE_RANGE_A0_FF;
471 else
472 composing = 1;
473 break;
474
475 case EMACS_leading_code_4:
476 CHECK_CODE_RANGE_A0_FF;
477 /* fall down to check it two more times ... */
478
479 case EMACS_leading_code_3:
480 CHECK_CODE_RANGE_A0_FF;
481 /* fall down to check it one more time ... */
482
483 case EMACS_leading_code_2:
484 CHECK_CODE_RANGE_A0_FF;
485 break;
486
487 default:
488 label_end_of_switch:
489 break;
490 }
491 }
0ef69138 492 return CODING_CATEGORY_MASK_EMACS_MULE;
4ed46869
KH
493}
494
495\f
496/*** 3. ISO2022 handlers ***/
497
498/* The following note describes the coding system ISO2022 briefly.
f4dee582
RS
499 Since the intention of this note is to help in understanding of
500 the programs in this file, some parts are NOT ACCURATE or OVERLY
4ed46869
KH
501 SIMPLIFIED. For the thorough understanding, please refer to the
502 original document of ISO2022.
503
504 ISO2022 provides many mechanisms to encode several character sets
f4dee582 505 in 7-bit and 8-bit environment. If one chooses 7-bit environment,
4ed46869 506 all text is encoded by codes of less than 128. This may make the
f4dee582
RS
507 encoded text a little bit longer, but the text gets more stability
508 to pass through several gateways (some of them strip off the MSB).
4ed46869 509
f4dee582 510 There are two kinds of character set: control character set and
4ed46869
KH
511 graphic character set. The former contains control characters such
512 as `newline' and `escape' to provide control functions (control
f4dee582 513 functions are provided also by escape sequences). The latter
4ed46869
KH
514 contains graphic characters such as ' A' and '-'. Emacs recognizes
515 two control character sets and many graphic character sets.
516
517 Graphic character sets are classified into one of the following
518 four classes, DIMENSION1_CHARS94, DIMENSION1_CHARS96,
519 DIMENSION2_CHARS94, DIMENSION2_CHARS96 according to the number of
520 bytes (DIMENSION) and the number of characters in one dimension
521 (CHARS) of the set. In addition, each character set is assigned an
522 identification tag (called "final character" and denoted as <F>
523 here after) which is unique in each class. <F> of each character
524 set is decided by ECMA(*) when it is registered in ISO. Code range
525 of <F> is 0x30..0x7F (0x30..0x3F are for private use only).
526
527 Note (*): ECMA = European Computer Manufacturers Association
528
529 Here are examples of graphic character set [NAME(<F>)]:
530 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
531 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
532 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
533 o DIMENSION2_CHARS96 -- none for the moment
534
535 A code area (1byte=8bits) is divided into 4 areas, C0, GL, C1, and GR.
536 C0 [0x00..0x1F] -- control character plane 0
537 GL [0x20..0x7F] -- graphic character plane 0
538 C1 [0x80..0x9F] -- control character plane 1
539 GR [0xA0..0xFF] -- graphic character plane 1
540
541 A control character set is directly designated and invoked to C0 or
542 C1 by an escape sequence. The most common case is that ISO646's
543 control character set is designated/invoked to C0 and ISO6429's
544 control character set is designated/invoked to C1, and usually
545 these designations/invocations are omitted in a coded text. With
546 7-bit environment, only C0 can be used, and a control character for
547 C1 is encoded by an appropriate escape sequence to fit in the
548 environment. Corresponding escape sequences are defined for all
549 control characters for C1.
550
551 A graphic character set is at first designated to one of four
552 graphic registers (G0 through G3), then these graphic registers are
553 invoked to GL or GR. These designations and invocations can be
554 done independently. The most common case is that G0 is invoked to
555 GL, G1 is invoked to GR, and ASCII is designated to G0, and usually
556 these invocations and designations are omitted in a coded text.
557 With 7-bit environment, only GL can be used.
558
559 When a graphic character set of CHARS94 is invoked to GL, code 0x20
560 and 0x7F of GL area work as control characters SPACE and DEL
561 respectively, and code 0xA0 and 0xFF of GR area should not be used.
562
563 There are two ways of invocation: locking-shift and single-shift.
564 With locking-shift, the invocation lasts until the next different
565 invocation, whereas with single-shift, the invocation works only
566 for the following character and doesn't affect locking-shift.
567 Invocations are done by the following control characters or escape
568 sequences.
569
570 ----------------------------------------------------------------------
571 function control char escape sequence description
572 ----------------------------------------------------------------------
573 SI (shift-in) 0x0F none invoke G0 to GL
10bff6f1 574 SO (shift-out) 0x0E none invoke G1 to GL
4ed46869
KH
575 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
576 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
577 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 into GL
578 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 into GL
579 ----------------------------------------------------------------------
580 The first four are for locking-shift. Control characters for these
581 functions are defined by macros ISO_CODE_XXX in `coding.h'.
582
583 Designations are done by the following escape sequences.
584 ----------------------------------------------------------------------
585 escape sequence description
586 ----------------------------------------------------------------------
587 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
588 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
589 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
590 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
591 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
592 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
593 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
594 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
595 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
596 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
597 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
598 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
599 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
600 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
601 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
602 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
603 ----------------------------------------------------------------------
604
605 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
606 of dimension 1, chars 94, and final character <F>, and etc.
607
608 Note (*): Although these designations are not allowed in ISO2022,
609 Emacs accepts them on decoding, and produces them on encoding
610 CHARS96 character set in a coding system which is characterized as
611 7-bit environment, non-locking-shift, and non-single-shift.
612
613 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
614 '(' can be omitted. We call this as "short-form" here after.
615
616 Now you may notice that there are a lot of ways for encoding the
f4dee582 617 same multilingual text in ISO2022. Actually, there exists many
4ed46869
KH
618 coding systems such as Compound Text (used in X's inter client
619 communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
620 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
621 localized platforms), and all of these are variants of ISO2022.
622
623 In addition to the above, Emacs handles two more kinds of escape
624 sequences: ISO6429's direction specification and Emacs' private
625 sequence for specifying character composition.
626
627 ISO6429's direction specification takes the following format:
628 o CSI ']' -- end of the current direction
629 o CSI '0' ']' -- end of the current direction
630 o CSI '1' ']' -- start of left-to-right text
631 o CSI '2' ']' -- start of right-to-left text
632 The control character CSI (0x9B: control sequence introducer) is
633 abbreviated to the escape sequence ESC '[' in 7-bit environment.
634
635 Character composition specification takes the following format:
636 o ESC '0' -- start character composition
637 o ESC '1' -- end character composition
638 Since these are not standard escape sequences of any ISO, the use
639 of them for these meaning is restricted to Emacs only. */
640
641enum iso_code_class_type iso_code_class[256];
642
704c5781
KH
/* Nonzero if CHARSET is acceptable to the coding system stored in
   `coding_system_table' for the coding category index IDX: either
   that coding system's `safe_charsets' entry for CHARSET is nonzero,
   or its requested designation for CHARSET is something other than
   CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION.
   NOTE(review): coding_system_table[IDX] is dereferenced without a
   null check -- presumably every ISO category has an entry by the
   time detection runs; confirm against the initialization code.  */
#define CHARSET_OK(idx, charset)                                \
  (coding_system_table[(idx)]->safe_charsets[(charset)]         \
   || (CODING_SPEC_ISO_REQUESTED_DESIGNATION                    \
       (coding_system_table[(idx)], (charset))                  \
       != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))

/* Nonzero if the coding system stored for the coding category index
   IDX has a non-negative initial designation for register 1
   (presumably graphic register G1, the one SO invokes -- confirm
   against `coding.h').  */
#define SHIFT_OUT_OK(idx)                                       \
  (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding_system_table[(idx)], 1) >= 0)
651
4ed46869
KH
652/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
653 Check if a text is encoded in ISO2022. If it is, returns an
654 integer in which appropriate flag bits any of:
655 CODING_CATEGORY_MASK_ISO_7
d46c5b12 656 CODING_CATEGORY_MASK_ISO_7_TIGHT
4ed46869
KH
657 CODING_CATEGORY_MASK_ISO_8_1
658 CODING_CATEGORY_MASK_ISO_8_2
7717c392
KH
659 CODING_CATEGORY_MASK_ISO_7_ELSE
660 CODING_CATEGORY_MASK_ISO_8_ELSE
4ed46869
KH
661 are set. If a code which should never appear in ISO2022 is found,
662 returns 0. */
663
664int
665detect_coding_iso2022 (src, src_end)
666 unsigned char *src, *src_end;
667{
d46c5b12
KH
668 int mask = CODING_CATEGORY_MASK_ISO;
669 int mask_found = 0;
670 int reg[4], shift_out = 0;
671 int c, c1, i, charset;
3f003981 672
d46c5b12 673 reg[0] = CHARSET_ASCII, reg[1] = reg[2] = reg[3] = -1;
3f003981 674 while (mask && src < src_end)
4ed46869
KH
675 {
676 c = *src++;
677 switch (c)
678 {
679 case ISO_CODE_ESC:
e0e989f6 680 if (src >= src_end)
4ed46869
KH
681 break;
682 c = *src++;
d46c5b12 683 if (c >= '(' && c <= '/')
4ed46869 684 {
bf9cdd4e
KH
685 /* Designation sequence for a charset of dimension 1. */
686 if (src >= src_end)
687 break;
d46c5b12
KH
688 c1 = *src++;
689 if (c1 < ' ' || c1 >= 0x80
690 || (charset = iso_charset_table[0][c >= ','][c1]) < 0)
691 /* Invalid designation sequence. Just ignore. */
692 break;
693 reg[(c - '(') % 4] = charset;
bf9cdd4e
KH
694 }
695 else if (c == '$')
696 {
697 /* Designation sequence for a charset of dimension 2. */
698 if (src >= src_end)
699 break;
700 c = *src++;
701 if (c >= '@' && c <= 'B')
702 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
d46c5b12 703 reg[0] = charset = iso_charset_table[1][0][c];
bf9cdd4e 704 else if (c >= '(' && c <= '/')
bcf26d6a 705 {
bf9cdd4e
KH
706 if (src >= src_end)
707 break;
d46c5b12
KH
708 c1 = *src++;
709 if (c1 < ' ' || c1 >= 0x80
710 || (charset = iso_charset_table[1][c >= ','][c1]) < 0)
711 /* Invalid designation sequence. Just ignore. */
712 break;
713 reg[(c - '(') % 4] = charset;
bcf26d6a 714 }
bf9cdd4e 715 else
d46c5b12
KH
716 /* Invalid designation sequence. Just ignore. */
717 break;
718 }
719 else if (c == 'N' || c == 'n')
720 {
721 if (shift_out == 0
722 && (reg[1] >= 0
723 || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_7_ELSE)
724 || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_8_ELSE)))
725 {
726 /* Locking shift out. */
727 mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
728 mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
729 shift_out = 1;
730 }
731 break;
732 }
733 else if (c == 'O' || c == 'o')
734 {
735 if (shift_out == 1)
736 {
737 /* Locking shift in. */
738 mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
739 mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
740 shift_out = 0;
741 }
742 break;
4ed46869 743 }
bf9cdd4e 744 else if (c == '0' || c == '1' || c == '2')
d46c5b12
KH
745 /* Start/end composition. Just ignore. */
746 break;
bf9cdd4e 747 else
d46c5b12
KH
748 /* Invalid escape sequence. Just ignore. */
749 break;
750
751 /* We found a valid designation sequence for CHARSET. */
752 mask &= ~CODING_CATEGORY_MASK_ISO_8BIT;
753 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7, charset))
754 mask_found |= CODING_CATEGORY_MASK_ISO_7;
755 else
756 mask &= ~CODING_CATEGORY_MASK_ISO_7;
757 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT, charset))
758 mask_found |= CODING_CATEGORY_MASK_ISO_7_TIGHT;
759 else
760 mask &= ~CODING_CATEGORY_MASK_ISO_7_TIGHT;
761 if (! CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_ELSE, charset))
762 mask &= ~CODING_CATEGORY_MASK_ISO_7_ELSE;
763 if (! CHARSET_OK (CODING_CATEGORY_IDX_ISO_8_ELSE, charset))
764 mask &= ~CODING_CATEGORY_MASK_ISO_8_ELSE;
4ed46869
KH
765 break;
766
4ed46869 767 case ISO_CODE_SO:
d46c5b12
KH
768 if (shift_out == 0
769 && (reg[1] >= 0
770 || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_7_ELSE)
771 || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_8_ELSE)))
772 {
773 /* Locking shift out. */
774 mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
775 mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
776 }
e0e989f6
KH
777 break;
778
d46c5b12
KH
779 case ISO_CODE_SI:
780 if (shift_out == 1)
781 {
782 /* Locking shift in. */
783 mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
784 mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
785 }
786 break;
787
4ed46869
KH
788 case ISO_CODE_CSI:
789 case ISO_CODE_SS2:
790 case ISO_CODE_SS3:
3f003981
KH
791 {
792 int newmask = CODING_CATEGORY_MASK_ISO_8_ELSE;
793
70c22245
KH
794 if (c != ISO_CODE_CSI)
795 {
d46c5b12
KH
796 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
797 & CODING_FLAG_ISO_SINGLE_SHIFT)
70c22245 798 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
d46c5b12
KH
799 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
800 & CODING_FLAG_ISO_SINGLE_SHIFT)
70c22245
KH
801 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
802 }
3f003981
KH
803 if (VECTORP (Vlatin_extra_code_table)
804 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
805 {
d46c5b12
KH
806 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
807 & CODING_FLAG_ISO_LATIN_EXTRA)
3f003981 808 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
d46c5b12
KH
809 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
810 & CODING_FLAG_ISO_LATIN_EXTRA)
3f003981
KH
811 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
812 }
813 mask &= newmask;
d46c5b12 814 mask_found |= newmask;
3f003981
KH
815 }
816 break;
4ed46869
KH
817
818 default:
819 if (c < 0x80)
820 break;
821 else if (c < 0xA0)
c4825358 822 {
3f003981
KH
823 if (VECTORP (Vlatin_extra_code_table)
824 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
c4825358 825 {
3f003981
KH
826 int newmask = 0;
827
d46c5b12
KH
828 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
829 & CODING_FLAG_ISO_LATIN_EXTRA)
3f003981 830 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
d46c5b12
KH
831 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
832 & CODING_FLAG_ISO_LATIN_EXTRA)
3f003981
KH
833 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
834 mask &= newmask;
d46c5b12 835 mask_found |= newmask;
c4825358 836 }
3f003981
KH
837 else
838 return 0;
c4825358 839 }
4ed46869
KH
840 else
841 {
7717c392 842 unsigned char *src_begin = src;
4ed46869 843
d46c5b12 844 mask &= ~(CODING_CATEGORY_MASK_ISO_7BIT
7717c392 845 | CODING_CATEGORY_MASK_ISO_7_ELSE);
d46c5b12 846 mask_found |= CODING_CATEGORY_MASK_ISO_8_1;
e0e989f6 847 while (src < src_end && *src >= 0xA0)
7717c392
KH
848 src++;
849 if ((src - src_begin - 1) & 1 && src < src_end)
4ed46869 850 mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
d46c5b12
KH
851 else
852 mask_found |= CODING_CATEGORY_MASK_ISO_8_2;
4ed46869
KH
853 }
854 break;
855 }
856 }
857
d46c5b12 858 return (mask & mask_found);
4ed46869
KH
859}
860
/* Decode one character of charset CHARSET whose first position code
   is C1.  When CHARSET has dimension 2, the second position code is
   fetched from SRC into C2 (a variable of the expansion site).  A
   negative CHARSET means the text is ill formed; C1 is then handed to
   DECODE_CHARACTER_ASCII unchanged.  Uses `coding', `src', `dst',
   `c2' and `unification_table' from the expansion site.  */

#define DECODE_ISO_CHARACTER(charset, c1) \
  do { \
    int c_alt, charset_alt = (charset); \
    \
    if (COMPOSING_HEAD_P (coding->composing)) { \
      *dst++ = LEADING_CODE_COMPOSITION; \
      if (COMPOSING_WITH_RULE_P (coding->composing)) { \
        /* Marker telling that composition rules are embedded.  */ \
        *dst++ = 0xFF; \
      } \
      coding->composing += 2; \
    } \
    if ((charset) >= 0) { \
      if (CHARSET_DIMENSION (charset) == 2) { \
        ONE_MORE_BYTE (c2); \
        if (iso_code_class[(c2) & 0x7F] != ISO_0x20_or_0x7F \
            && iso_code_class[(c2) & 0x7F] != ISO_graphic_plane_0) { \
          /* Unusable second byte: give it back to the source and \
             decode with a space in its place.  */ \
          src--; \
          c2 = ' '; \
        } \
      } \
      if (!NILP (unification_table) \
          && ((c_alt = unify_char (unification_table, \
                                   -1, (charset), c1, c2)) >= 0)) \
        SPLIT_CHAR (c_alt, charset_alt, c1, c2); \
    } \
    if (charset_alt != CHARSET_ASCII && charset_alt >= 0) { \
      if (CHARSET_DIMENSION (charset_alt) == 1) { \
        DECODE_CHARACTER_DIMENSION1 (charset_alt, c1); \
      } else { \
        DECODE_CHARACTER_DIMENSION2 (charset_alt, c1, c2); \
      } \
    } else { \
      DECODE_CHARACTER_ASCII (c1); \
    } \
    if (COMPOSING_WITH_RULE_P (coding->composing)) { \
      /* The next code is a composition rule.  */ \
      coding->composing = COMPOSING_WITH_RULE_RULE; \
    } \
  } while (0)
905
/* Record in CODING that the charset identified by DIMENSION, CHARS
   and FINAL_CHAR is designated to graphic register REG.  The
   designation is accepted only when the charset is known and either
   requests REG or is listed in CODING's safe charsets; otherwise REG
   is remembered as the last invalid designation register and control
   jumps to `label_invalid_code' of the expansion site.  */
#define DECODE_DESIGNATION(reg, dimension, chars, final_char) \
  do { \
    int charset = ISO_CHARSET_TABLE (make_number (dimension), \
                                     make_number (chars), \
                                     make_number (final_char)); \
    int reinstate_ascii; \
    \
    if (charset < 0 \
        || (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) != reg \
            && ! coding->safe_charsets[charset])) { \
      coding->spec.iso2022.last_invalid_designation_register = reg; \
      goto label_invalid_code; \
    } \
    /* An ASCII designation to register 0 that follows an invalid \
       designation to register 0 is inserted as is, so that it is \
       surely written back to a file.  */ \
    reinstate_ascii \
      = (coding->spec.iso2022.last_invalid_designation_register == 0 \
         && reg == 0 \
         && charset == CHARSET_ASCII); \
    coding->spec.iso2022.last_invalid_designation_register = -1; \
    if (reinstate_ascii) \
      goto label_invalid_code; \
    if ((coding->mode & CODING_MODE_DIRECTION) \
        && CHARSET_REVERSE_CHARSET (charset) >= 0) \
      charset = CHARSET_REVERSE_CHARSET (charset); \
    CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset; \
  } while (0)
937
d46c5b12
KH
938/* Check if the current composing sequence contains only valid codes.
939 If the composing sequence doesn't end before SRC_END, return -1.
940 Else, if it contains only valid codes, return 0.
941 Else return the length of the composing sequence. */
942
943int check_composing_code (coding, src, src_end)
944 struct coding_system *coding;
945 unsigned char *src, *src_end;
946{
947 unsigned char *src_start = src;
948 int invalid_code_found = 0;
949 int charset, c, c1, dim;
950
951 while (src < src_end)
952 {
953 if (*src++ != ISO_CODE_ESC) continue;
954 if (src >= src_end) break;
955 if ((c = *src++) == '1') /* end of compsition */
956 return (invalid_code_found ? src - src_start : 0);
957 if (src + 2 >= src_end) break;
958 if (!coding->flags & CODING_FLAG_ISO_DESIGNATION)
959 invalid_code_found = 1;
960 else
961 {
962 dim = 0;
963 if (c == '$')
964 {
965 dim = 1;
966 c = (*src >= '@' && *src <= 'B') ? '(' : *src++;
967 }
968 if (c >= '(' && c <= '/')
969 {
970 c1 = *src++;
971 if ((c1 < ' ' || c1 >= 0x80)
972 || (charset = iso_charset_table[dim][c >= ','][c1]) < 0
704c5781 973 || ! coding->safe_charsets[charset]
d46c5b12
KH
974 || (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
975 == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
976 invalid_code_found = 1;
977 }
978 else
979 invalid_code_found = 1;
980 }
981 }
982 return ((coding->mode & CODING_MODE_LAST_BLOCK) ? src_end - src_start : -1);
983}
984
4ed46869
KH
985/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
986
987int
d46c5b12 988decode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
4ed46869
KH
989 struct coding_system *coding;
990 unsigned char *source, *destination;
991 int src_bytes, dst_bytes;
4ed46869
KH
992{
993 unsigned char *src = source;
994 unsigned char *src_end = source + src_bytes;
995 unsigned char *dst = destination;
996 unsigned char *dst_end = destination + dst_bytes;
997 /* Since the maximum bytes produced by each loop is 7, we subtract 6
998 from DST_END to assure that overflow checking is necessary only
999 at the head of loop. */
1000 unsigned char *adjusted_dst_end = dst_end - 6;
1001 int charset;
1002 /* Charsets invoked to graphic plane 0 and 1 respectively. */
1003 int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1004 int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
a5d301df 1005 Lisp_Object unification_table
d46c5b12
KH
1006 = coding->character_unification_table_for_decode;
1007 int result = CODING_FINISH_NORMAL;
bdd9fb48
KH
1008
1009 if (!NILP (Venable_character_unification) && NILP (unification_table))
a5d301df 1010 unification_table = Vstandard_character_unification_table_for_decode;
4ed46869 1011
d46c5b12 1012 coding->produced_char = 0;
fb88bf2d 1013 coding->fake_multibyte = 0;
d46c5b12
KH
1014 while (src < src_end && (dst_bytes
1015 ? (dst < adjusted_dst_end)
1016 : (dst < src - 6)))
4ed46869
KH
1017 {
1018 /* SRC_BASE remembers the start position in source in each loop.
1019 The loop will be exited when there's not enough source text
1020 to analyze long escape sequence or 2-byte code (within macros
1021 ONE_MORE_BYTE or TWO_MORE_BYTES). In that case, SRC is reset
1022 to SRC_BASE before exiting. */
1023 unsigned char *src_base = src;
bdd9fb48 1024 int c1 = *src++, c2;
4ed46869
KH
1025
1026 switch (iso_code_class [c1])
1027 {
1028 case ISO_0x20_or_0x7F:
1029 if (!coding->composing
1030 && (charset0 < 0 || CHARSET_CHARS (charset0) == 94))
1031 {
1032 /* This is SPACE or DEL. */
1033 *dst++ = c1;
d46c5b12 1034 coding->produced_char++;
4ed46869
KH
1035 break;
1036 }
1037 /* This is a graphic character, we fall down ... */
1038
1039 case ISO_graphic_plane_0:
1040 if (coding->composing == COMPOSING_WITH_RULE_RULE)
1041 {
1042 /* This is a composition rule. */
1043 *dst++ = c1 | 0x80;
1044 coding->composing = COMPOSING_WITH_RULE_TAIL;
1045 }
1046 else
1047 DECODE_ISO_CHARACTER (charset0, c1);
1048 break;
1049
1050 case ISO_0xA0_or_0xFF:
d46c5b12
KH
1051 if (charset1 < 0 || CHARSET_CHARS (charset1) == 94
1052 || coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
fb88bf2d 1053 goto label_invalid_code;
4ed46869
KH
1054 /* This is a graphic character, we fall down ... */
1055
1056 case ISO_graphic_plane_1:
d46c5b12 1057 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
fb88bf2d 1058 goto label_invalid_code;
d46c5b12
KH
1059 else
1060 DECODE_ISO_CHARACTER (charset1, c1);
4ed46869
KH
1061 break;
1062
1063 case ISO_control_code:
1064 /* All ISO2022 control characters in this class have the
1065 same representation in Emacs internal format. */
d46c5b12
KH
1066 if (c1 == '\n'
1067 && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1068 && (coding->eol_type == CODING_EOL_CR
1069 || coding->eol_type == CODING_EOL_CRLF))
1070 {
1071 result = CODING_FINISH_INCONSISTENT_EOL;
1072 goto label_end_of_loop_2;
1073 }
4ed46869 1074 *dst++ = c1;
d46c5b12 1075 coding->produced_char++;
4ed46869
KH
1076 break;
1077
1078 case ISO_carriage_return:
1079 if (coding->eol_type == CODING_EOL_CR)
d46c5b12 1080 *dst++ = '\n';
4ed46869
KH
1081 else if (coding->eol_type == CODING_EOL_CRLF)
1082 {
1083 ONE_MORE_BYTE (c1);
1084 if (c1 == ISO_CODE_LF)
1085 *dst++ = '\n';
1086 else
1087 {
d46c5b12
KH
1088 if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1089 {
1090 result = CODING_FINISH_INCONSISTENT_EOL;
1091 goto label_end_of_loop_2;
1092 }
4ed46869 1093 src--;
d46c5b12 1094 *dst++ = '\r';
4ed46869
KH
1095 }
1096 }
1097 else
d46c5b12
KH
1098 *dst++ = c1;
1099 coding->produced_char++;
4ed46869
KH
1100 break;
1101
1102 case ISO_shift_out:
d46c5b12
KH
1103 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1104 || CODING_SPEC_ISO_DESIGNATION (coding, 1) < 0)
1105 goto label_invalid_code;
4ed46869
KH
1106 CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
1107 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1108 break;
1109
1110 case ISO_shift_in:
d46c5b12
KH
1111 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
1112 goto label_invalid_code;
4ed46869
KH
1113 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
1114 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1115 break;
1116
1117 case ISO_single_shift_2_7:
1118 case ISO_single_shift_2:
d46c5b12
KH
1119 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1120 goto label_invalid_code;
4ed46869
KH
1121 /* SS2 is handled as an escape sequence of ESC 'N' */
1122 c1 = 'N';
1123 goto label_escape_sequence;
1124
1125 case ISO_single_shift_3:
d46c5b12
KH
1126 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1127 goto label_invalid_code;
4ed46869
KH
1128 /* SS2 is handled as an escape sequence of ESC 'O' */
1129 c1 = 'O';
1130 goto label_escape_sequence;
1131
1132 case ISO_control_sequence_introducer:
1133 /* CSI is handled as an escape sequence of ESC '[' ... */
1134 c1 = '[';
1135 goto label_escape_sequence;
1136
1137 case ISO_escape:
1138 ONE_MORE_BYTE (c1);
1139 label_escape_sequence:
1140 /* Escape sequences handled by Emacs are invocation,
1141 designation, direction specification, and character
1142 composition specification. */
1143 switch (c1)
1144 {
1145 case '&': /* revision of following character set */
1146 ONE_MORE_BYTE (c1);
1147 if (!(c1 >= '@' && c1 <= '~'))
d46c5b12 1148 goto label_invalid_code;
4ed46869
KH
1149 ONE_MORE_BYTE (c1);
1150 if (c1 != ISO_CODE_ESC)
d46c5b12 1151 goto label_invalid_code;
4ed46869
KH
1152 ONE_MORE_BYTE (c1);
1153 goto label_escape_sequence;
1154
1155 case '$': /* designation of 2-byte character set */
d46c5b12
KH
1156 if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
1157 goto label_invalid_code;
4ed46869
KH
1158 ONE_MORE_BYTE (c1);
1159 if (c1 >= '@' && c1 <= 'B')
1160 { /* designation of JISX0208.1978, GB2312.1980,
1161 or JISX0208.1980 */
1162 DECODE_DESIGNATION (0, 2, 94, c1);
1163 }
1164 else if (c1 >= 0x28 && c1 <= 0x2B)
1165 { /* designation of DIMENSION2_CHARS94 character set */
1166 ONE_MORE_BYTE (c2);
1167 DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
1168 }
1169 else if (c1 >= 0x2C && c1 <= 0x2F)
1170 { /* designation of DIMENSION2_CHARS96 character set */
1171 ONE_MORE_BYTE (c2);
1172 DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
1173 }
1174 else
d46c5b12 1175 goto label_invalid_code;
4ed46869
KH
1176 break;
1177
1178 case 'n': /* invocation of locking-shift-2 */
d46c5b12
KH
1179 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1180 || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
1181 goto label_invalid_code;
4ed46869 1182 CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
e0e989f6 1183 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
4ed46869
KH
1184 break;
1185
1186 case 'o': /* invocation of locking-shift-3 */
d46c5b12
KH
1187 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1188 || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
1189 goto label_invalid_code;
4ed46869 1190 CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
e0e989f6 1191 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
4ed46869
KH
1192 break;
1193
1194 case 'N': /* invocation of single-shift-2 */
d46c5b12
KH
1195 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1196 || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
1197 goto label_invalid_code;
4ed46869
KH
1198 ONE_MORE_BYTE (c1);
1199 charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
1200 DECODE_ISO_CHARACTER (charset, c1);
1201 break;
1202
1203 case 'O': /* invocation of single-shift-3 */
d46c5b12
KH
1204 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1205 || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
1206 goto label_invalid_code;
4ed46869
KH
1207 ONE_MORE_BYTE (c1);
1208 charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
1209 DECODE_ISO_CHARACTER (charset, c1);
1210 break;
1211
d46c5b12
KH
1212 case '0': case '2': /* start composing */
1213 /* Before processing composing, we must be sure that all
1214 characters being composed are supported by CODING.
1215 If not, we must give up composing and insert the
1216 bunch of codes for composing as is without decoding. */
1217 {
1218 int result1;
1219
1220 result1 = check_composing_code (coding, src, src_end);
1221 if (result1 == 0)
1222 coding->composing = (c1 == '0'
1223 ? COMPOSING_NO_RULE_HEAD
1224 : COMPOSING_WITH_RULE_HEAD);
1225 else if (result1 > 0)
1226 {
1227 if (result1 + 2 < (dst_bytes ? dst_end : src_base) - dst)
1228 {
1229 bcopy (src_base, dst, result1 + 2);
1230 src += result1;
1231 dst += result1 + 2;
1232 coding->produced_char += result1 + 2;
1233 }
1234 else
1235 {
1236 result = CODING_FINISH_INSUFFICIENT_DST;
1237 goto label_end_of_loop_2;
1238 }
1239 }
1240 else
1241 goto label_end_of_loop;
1242 }
4ed46869
KH
1243 break;
1244
1245 case '1': /* end composing */
1246 coding->composing = COMPOSING_NO;
d46c5b12 1247 coding->produced_char++;
4ed46869
KH
1248 break;
1249
1250 case '[': /* specification of direction */
d46c5b12
KH
1251 if (coding->flags & CODING_FLAG_ISO_NO_DIRECTION)
1252 goto label_invalid_code;
4ed46869 1253 /* For the moment, nested direction is not supported.
d46c5b12
KH
1254 So, `coding->mode & CODING_MODE_DIRECTION' zero means
1255 left-to-right, and nozero means right-to-left. */
4ed46869
KH
1256 ONE_MORE_BYTE (c1);
1257 switch (c1)
1258 {
1259 case ']': /* end of the current direction */
d46c5b12 1260 coding->mode &= ~CODING_MODE_DIRECTION;
4ed46869
KH
1261
1262 case '0': /* end of the current direction */
1263 case '1': /* start of left-to-right direction */
1264 ONE_MORE_BYTE (c1);
1265 if (c1 == ']')
d46c5b12 1266 coding->mode &= ~CODING_MODE_DIRECTION;
4ed46869 1267 else
d46c5b12 1268 goto label_invalid_code;
4ed46869
KH
1269 break;
1270
1271 case '2': /* start of right-to-left direction */
1272 ONE_MORE_BYTE (c1);
1273 if (c1 == ']')
d46c5b12 1274 coding->mode |= CODING_MODE_DIRECTION;
4ed46869 1275 else
d46c5b12 1276 goto label_invalid_code;
4ed46869
KH
1277 break;
1278
1279 default:
d46c5b12 1280 goto label_invalid_code;
4ed46869
KH
1281 }
1282 break;
1283
1284 default:
d46c5b12
KH
1285 if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
1286 goto label_invalid_code;
4ed46869
KH
1287 if (c1 >= 0x28 && c1 <= 0x2B)
1288 { /* designation of DIMENSION1_CHARS94 character set */
1289 ONE_MORE_BYTE (c2);
1290 DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
1291 }
1292 else if (c1 >= 0x2C && c1 <= 0x2F)
1293 { /* designation of DIMENSION1_CHARS96 character set */
1294 ONE_MORE_BYTE (c2);
1295 DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
1296 }
1297 else
1298 {
d46c5b12 1299 goto label_invalid_code;
4ed46869
KH
1300 }
1301 }
1302 /* We must update these variables now. */
1303 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1304 charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1305 break;
1306
d46c5b12 1307 label_invalid_code:
d46c5b12
KH
1308 while (src_base < src)
1309 *dst++ = *src_base++;
fb88bf2d 1310 coding->fake_multibyte = 1;
4ed46869
KH
1311 }
1312 continue;
1313
1314 label_end_of_loop:
d46c5b12
KH
1315 result = CODING_FINISH_INSUFFICIENT_SRC;
1316 label_end_of_loop_2:
4ed46869
KH
1317 src = src_base;
1318 break;
1319 }
1320
fb88bf2d 1321 if (src < src_end)
4ed46869 1322 {
fb88bf2d
KH
1323 if (result == CODING_FINISH_NORMAL)
1324 result = CODING_FINISH_INSUFFICIENT_DST;
1325 else if (result != CODING_FINISH_INCONSISTENT_EOL
1326 && coding->mode & CODING_MODE_LAST_BLOCK)
1327 {
1328 /* This is the last block of the text to be decoded. We had
1329 better just flush out all remaining codes in the text
1330 although they are not valid characters. */
1331 src_bytes = src_end - src;
1332 if (dst_bytes && (dst_end - dst < src_bytes))
1333 src_bytes = dst_end - dst;
1334 bcopy (src, dst, src_bytes);
1335 dst += src_bytes;
1336 src += src_bytes;
1337 coding->fake_multibyte = 1;
1338 }
4ed46869 1339 }
fb88bf2d 1340
d46c5b12
KH
1341 coding->consumed = coding->consumed_char = src - source;
1342 coding->produced = dst - destination;
1343 return result;
4ed46869
KH
1344}
1345
f4dee582 1346/* ISO2022 encoding stuff. */
4ed46869
KH
1347
1348/*
f4dee582 1349 It is not enough to say just "ISO2022" on encoding, we have to
d46c5b12 1350 specify more details. In Emacs, each coding system of ISO2022
4ed46869
KH
1351 variant has the following specifications:
1352 1. Initial designation to G0 thru G3.
1353 2. Allows short-form designation?
1354 3. ASCII should be designated to G0 before control characters?
1355 4. ASCII should be designated to G0 at end of line?
1356 5. 7-bit environment or 8-bit environment?
1357 6. Use locking-shift?
1358 7. Use Single-shift?
1359 And the following two are only for Japanese:
1360 8. Use ASCII in place of JIS0201-1976-Roman?
1361 9. Use JISX0208-1983 in place of JISX0208-1978?
1362 These specifications are encoded in `coding->flags' as flag bits
1363 defined by macros CODING_FLAG_ISO_XXX. See `coding.h' for more
f4dee582 1364 details.
4ed46869
KH
1365*/
1366
/* Write at DST the escape sequence that designates CHARSET to graphic
   register REG, and record the designation in CODING.  A revision
   sequence (ESC & <rev>) comes first when the charset's revision
   number is below 255.  For a two-dimensional 94-charset whose
   <final-char> is '@', 'A' or 'B', designated to register 0 of a
   coding system that allows it, the short form without intermediate
   character is produced.  */

#define ENCODE_DESIGNATION(charset, reg, coding) \
  do { \
    unsigned char final_char = CHARSET_ISO_FINAL_CHAR (charset); \
    char *intermediate_char_94 = "()*+"; \
    char *intermediate_char_96 = ",-./"; \
    int revision = CODING_SPEC_ISO_REVISION_NUMBER(coding, charset); \
    \
    if (revision < 255) { \
      *dst++ = ISO_CODE_ESC; \
      *dst++ = '&'; \
      *dst++ = '@' + revision; \
    } \
    *dst++ = ISO_CODE_ESC; \
    if (CHARSET_DIMENSION (charset) == 1) { \
      *dst++ = (unsigned char) (CHARSET_CHARS (charset) == 94 \
                                ? intermediate_char_94[reg] \
                                : intermediate_char_96[reg]); \
    } else { \
      *dst++ = '$'; \
      if (CHARSET_CHARS (charset) != 94) { \
        *dst++ = (unsigned char) (intermediate_char_96[reg]); \
      } else if (! ((coding->flags & CODING_FLAG_ISO_SHORT_FORM) \
                    && reg == 0 \
                    && final_char >= '@' && final_char <= 'B')) { \
        /* Short form not applicable: the intermediate character is \
           required.  */ \
        *dst++ = (unsigned char) (intermediate_char_94[reg]); \
      } \
    } \
    *dst++ = final_char; \
    CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset; \
  } while (0)
1408
/* The two macros below write at DST the code for an ISO2022
   single-shift function (single-shift-2 and single-shift-3): the
   escape sequence in a 7-bit coding system, the 8-bit control
   character otherwise.  Both mark CODING as single-shifting; the
   8-bit form also sets CODING->fake_multibyte.  */

#define ENCODE_SINGLE_SHIFT_2 \
  do { \
    if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) { \
      *dst++ = ISO_CODE_ESC; \
      *dst++ = 'N'; \
    } else { \
      *dst++ = ISO_CODE_SS2; \
      coding->fake_multibyte = 1; \
    } \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1; \
  } while (0)

#define ENCODE_SINGLE_SHIFT_3 \
  do { \
    if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) { \
      *dst++ = ISO_CODE_ESC; \
      *dst++ = 'O'; \
    } else { \
      *dst++ = ISO_CODE_SS3; \
      coding->fake_multibyte = 1; \
    } \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1; \
  } while (0)
1436
/* The four macros below write at DST the code for an ISO2022
   locking-shift function (shift-in, shift-out, locking-shift-2 and
   locking-shift-3) and record in CODING which graphic register is
   now invoked to graphic plane 0.  */

#define ENCODE_SHIFT_IN \
  do { \
    *dst++ = ISO_CODE_SI; \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 0; \
  } while (0)

#define ENCODE_SHIFT_OUT \
  do { \
    *dst++ = ISO_CODE_SO; \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 1; \
  } while (0)

#define ENCODE_LOCKING_SHIFT_2 \
  do { \
    *dst++ = ISO_CODE_ESC; \
    *dst++ = 'n'; \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 2; \
  } while (0)

#define ENCODE_LOCKING_SHIFT_3 \
  do { \
    *dst++ = ISO_CODE_ESC; \
    *dst++ = 'o'; \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 3; \
  } while (0)
1464
f4dee582
RS
/* Write at DST the code of a DIMENSION1 character whose charset is
   CHARSET and whose position code is C1.  The body loops: as long as
   CHARSET is not invoked to a graphic plane, the designation and
   invocation sequences are produced by encode_invocation_designation
   and the checks are repeated.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1) \
  do { \
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding)) { \
      /* A single shift is pending: it applies to this character \
         only.  */ \
      if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
        *dst++ = c1 & 0x7F; \
      else \
        *dst++ = c1 | 0x80; \
      CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0; \
      break; \
    } \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0)) { \
      *dst++ = c1 & 0x7F; \
      break; \
    } \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1)) { \
      *dst++ = c1 | 0x80; \
      break; \
    } \
    if (coding->flags & CODING_FLAG_ISO_SAFE \
        && !coding->safe_charsets[charset]) { \
      /* This character must not be encoded; emit one or two \
         substitution characters, according to its width.  */ \
      *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION; \
      if (CHARSET_WIDTH (charset) == 2) \
        *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION; \
      break; \
    } \
    /* CHARSET is not yet invoked to any graphic plane: invoke it \
       (designating it to some graphic register first if needed), \
       then go round again to produce the character itself.  */ \
    dst = encode_invocation_designation (charset, coding, dst); \
  } while (1)
1508
f4dee582
RS
/* Write at DST the codes of a DIMENSION2 character whose charset is
   CHARSET and whose position codes are C1 and C2.  The body loops: as
   long as CHARSET is not invoked to a graphic plane, the designation
   and invocation sequences are produced by
   encode_invocation_designation and the checks are repeated.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2) \
  do { \
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding)) { \
      /* A single shift is pending: it applies to this character \
         only.  */ \
      if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
        *dst++ = c1 & 0x7F, *dst++ = c2 & 0x7F; \
      else \
        *dst++ = c1 | 0x80, *dst++ = c2 | 0x80; \
      CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0; \
      break; \
    } \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0)) { \
      *dst++ = c1 & 0x7F, *dst++ = c2 & 0x7F; \
      break; \
    } \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1)) { \
      *dst++ = c1 | 0x80, *dst++ = c2 | 0x80; \
      break; \
    } \
    if (coding->flags & CODING_FLAG_ISO_SAFE \
        && !coding->safe_charsets[charset]) { \
      /* This character must not be encoded; emit one or two \
         substitution characters, according to its width.  */ \
      *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION; \
      if (CHARSET_WIDTH (charset) == 2) \
        *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION; \
      break; \
    } \
    /* CHARSET is not yet invoked to any graphic plane: invoke it \
       (designating it to some graphic register first if needed), \
       then go round again to produce the character itself.  */ \
    dst = encode_invocation_designation (charset, coding, dst); \
  } while (1)
1551
bdd9fb48
KH
/* Encode the character of CHARSET with position codes C1 and C2.
   When a unification table is in effect and maps the character, the
   mapped charset and codes are used instead.  ASCII may be replaced
   by JISX0201-Roman and JISX0208 by JISX0208-1978 when CODING's flags
   ask for it.  CODING->consumed_char is incremented unless a
   composite character is being processed.  */
#define ENCODE_ISO_CHARACTER(charset, c1, c2) \
  do { \
    int c_alt, charset_alt = charset; \
    \
    if (!NILP (unification_table) \
        && ((c_alt = unify_char (unification_table, -1, charset, c1, c2)) \
            >= 0)) \
      SPLIT_CHAR (c_alt, charset_alt, c1, c2); \
    if (CHARSET_DIMENSION (charset_alt) != 1) { \
      if (charset == charset_jisx0208 \
          && coding->flags & CODING_FLAG_ISO_USE_OLDJIS) \
        charset_alt = charset_jisx0208_1978; \
      ENCODE_ISO_CHARACTER_DIMENSION2 (charset_alt, c1, c2); \
    } else { \
      if (charset == CHARSET_ASCII \
          && coding->flags & CODING_FLAG_ISO_USE_ROMAN) \
        charset_alt = charset_latin_jisx0201; \
      ENCODE_ISO_CHARACTER_DIMENSION1 (charset_alt, c1); \
    } \
    if (! COMPOSING_P (coding->composing)) \
      coding->consumed_char++; \
  } while (0)
bdd9fb48 1578
4ed46869
KH
1579/* Produce designation and invocation codes at a place pointed by DST
1580 to use CHARSET. The element `spec.iso2022' of *CODING is updated.
1581 Return new DST. */
1582
1583unsigned char *
1584encode_invocation_designation (charset, coding, dst)
1585 int charset;
1586 struct coding_system *coding;
1587 unsigned char *dst;
1588{
1589 int reg; /* graphic register number */
1590
1591 /* At first, check designations. */
1592 for (reg = 0; reg < 4; reg++)
1593 if (charset == CODING_SPEC_ISO_DESIGNATION (coding, reg))
1594 break;
1595
1596 if (reg >= 4)
1597 {
1598 /* CHARSET is not yet designated to any graphic registers. */
1599 /* At first check the requested designation. */
1600 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1ba9e4ab
KH
1601 if (reg == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
1602 /* Since CHARSET requests no special designation, designate it
1603 to graphic register 0. */
4ed46869
KH
1604 reg = 0;
1605
1606 ENCODE_DESIGNATION (charset, reg, coding);
1607 }
1608
1609 if (CODING_SPEC_ISO_INVOCATION (coding, 0) != reg
1610 && CODING_SPEC_ISO_INVOCATION (coding, 1) != reg)
1611 {
1612 /* Since the graphic register REG is not invoked to any graphic
1613 planes, invoke it to graphic plane 0. */
1614 switch (reg)
1615 {
1616 case 0: /* graphic register 0 */
1617 ENCODE_SHIFT_IN;
1618 break;
1619
1620 case 1: /* graphic register 1 */
1621 ENCODE_SHIFT_OUT;
1622 break;
1623
1624 case 2: /* graphic register 2 */
1625 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1626 ENCODE_SINGLE_SHIFT_2;
1627 else
1628 ENCODE_LOCKING_SHIFT_2;
1629 break;
1630
1631 case 3: /* graphic register 3 */
1632 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1633 ENCODE_SINGLE_SHIFT_3;
1634 else
1635 ENCODE_LOCKING_SHIFT_3;
1636 break;
1637 }
1638 }
1639 return dst;
1640}
1641
/* The following three macros write at DST the escape sequences that
   start a composition without rules (ESC 0), start a composition with
   rules (ESC 2), and end a composition (ESC 1).  */
#define ENCODE_COMPOSITION_NO_RULE_START \
  (*dst++ = ISO_CODE_ESC, *dst++ = '0')
#define ENCODE_COMPOSITION_WITH_RULE_START \
  (*dst++ = ISO_CODE_ESC, *dst++ = '2')
#define ENCODE_COMPOSITION_END \
  (*dst++ = ISO_CODE_ESC, *dst++ = '1')
1646
/* The following three macros produce codes for indicating direction
   of text.  Each one expands to a single statement and writes at DST.

   ENCODE_CONTROL_SEQUENCE_INTRODUCER writes the introducer: ESC '['
   when CODING has the CODING_FLAG_ISO_SEVEN_BITS flag, the single
   byte ISO_CODE_CSI otherwise.  The flag is tested as a bit, the same
   way the single-shift macros test it, so that other flags being set
   does not select the 8-bit form.

   ENCODE_DIRECTION_R2L and ENCODE_DIRECTION_L2R write the introducer
   followed by "2]" and "0]" respectively.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER \
  do { \
    if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
      *dst++ = ISO_CODE_ESC, *dst++ = '['; \
    else \
      *dst++ = ISO_CODE_CSI; \
  } while (0)

#define ENCODE_DIRECTION_R2L \
  do { \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER; \
    *dst++ = '2'; \
    *dst++ = ']'; \
  } while (0)

#define ENCODE_DIRECTION_L2R \
  do { \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER; \
    *dst++ = '0'; \
    *dst++ = ']'; \
  } while (0)
1662
/* Write at DST the invocation and designation codes that bring the
   graphic planes and registers back to their initial state: shift-in
   when graphic plane 0 does not hold register 0, then a designation
   for every register whose initial charset (if it has one) differs
   from the current one.  */
#define ENCODE_RESET_PLANE_AND_REGISTER \
  do { \
    int reg; \
    \
    if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0) { \
      ENCODE_SHIFT_IN; \
    } \
    for (reg = 0; reg < 4; reg++) { \
      if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg) < 0) \
        continue; \
      if (CODING_SPEC_ISO_DESIGNATION (coding, reg) \
          == CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg)) \
        continue; \
      ENCODE_DESIGNATION \
        (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg), reg, coding); \
    } \
  } while (0)
1677
bdd9fb48 1678/* Produce designation sequences of charsets in the line started from
d46c5b12 1679 SRC to a place pointed by *DSTP, and update DSTP.
bdd9fb48
KH
1680
1681 If the current block ends before any end-of-line, we may fail to
d46c5b12
KH
1682 find all the necessary designations. */
1683
dfcf069d 1684void
bdd9fb48 1685encode_designation_at_bol (coding, table, src, src_end, dstp)
e0e989f6 1686 struct coding_system *coding;
bdd9fb48 1687 Lisp_Object table;
e0e989f6
KH
1688 unsigned char *src, *src_end, **dstp;
1689{
bdd9fb48
KH
1690 int charset, c, found = 0, reg;
1691 /* Table of charsets to be designated to each graphic register. */
1692 int r[4];
1693 unsigned char *dst = *dstp;
1694
1695 for (reg = 0; reg < 4; reg++)
1696 r[reg] = -1;
1697
1698 while (src < src_end && *src != '\n' && found < 4)
e0e989f6 1699 {
bdd9fb48
KH
1700 int bytes = BYTES_BY_CHAR_HEAD (*src);
1701
1702 if (NILP (table))
1703 charset = CHARSET_AT (src);
1704 else
e0e989f6 1705 {
35cb8686
RS
1706 int c_alt;
1707 unsigned char c1, c2;
bdd9fb48
KH
1708
1709 SPLIT_STRING(src, bytes, charset, c1, c2);
1710 if ((c_alt = unify_char (table, -1, charset, c1, c2)) >= 0)
1711 charset = CHAR_CHARSET (c_alt);
e0e989f6 1712 }
bdd9fb48 1713
e0e989f6 1714 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
d46c5b12 1715 if (reg != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION && r[reg] < 0)
bdd9fb48
KH
1716 {
1717 found++;
1718 r[reg] = charset;
1719 }
1720
1721 src += bytes;
1722 }
1723
1724 if (found)
1725 {
1726 for (reg = 0; reg < 4; reg++)
1727 if (r[reg] >= 0
1728 && CODING_SPEC_ISO_DESIGNATION (coding, reg) != r[reg])
1729 ENCODE_DESIGNATION (r[reg], reg, coding);
1730 *dstp = dst;
e0e989f6 1731 }
e0e989f6
KH
1732}
1733
4ed46869
KH
1734/* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
1735
1736int
d46c5b12 1737encode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
4ed46869
KH
1738 struct coding_system *coding;
1739 unsigned char *source, *destination;
1740 int src_bytes, dst_bytes;
4ed46869
KH
1741{
1742 unsigned char *src = source;
1743 unsigned char *src_end = source + src_bytes;
1744 unsigned char *dst = destination;
1745 unsigned char *dst_end = destination + dst_bytes;
e0e989f6 1746 /* Since the maximum bytes produced by each loop is 20, we subtract 19
4ed46869
KH
1747 from DST_END to assure overflow checking is necessary only at the
1748 head of loop. */
e0e989f6 1749 unsigned char *adjusted_dst_end = dst_end - 19;
a5d301df
KH
1750 Lisp_Object unification_table
1751 = coding->character_unification_table_for_encode;
d46c5b12 1752 int result = CODING_FINISH_NORMAL;
bdd9fb48
KH
1753
1754 if (!NILP (Venable_character_unification) && NILP (unification_table))
a5d301df 1755 unification_table = Vstandard_character_unification_table_for_encode;
4ed46869 1756
d46c5b12 1757 coding->consumed_char = 0;
fb88bf2d 1758 coding->fake_multibyte = 0;
d46c5b12
KH
1759 while (src < src_end && (dst_bytes
1760 ? (dst < adjusted_dst_end)
1761 : (dst < src - 19)))
4ed46869
KH
1762 {
1763 /* SRC_BASE remembers the start position in source in each loop.
1764 The loop will be exited when there's not enough source text
1765 to analyze multi-byte codes (within macros ONE_MORE_BYTE,
1766 TWO_MORE_BYTES, and THREE_MORE_BYTES). In that case, SRC is
1767 reset to SRC_BASE before exiting. */
1768 unsigned char *src_base = src;
bdd9fb48 1769 int charset, c1, c2, c3, c4;
4ed46869 1770
e0e989f6
KH
1771 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL
1772 && CODING_SPEC_ISO_BOL (coding))
1773 {
bdd9fb48
KH
1774 /* We have to produce designation sequences if any now. */
1775 encode_designation_at_bol (coding, unification_table,
1776 src, src_end, &dst);
e0e989f6
KH
1777 CODING_SPEC_ISO_BOL (coding) = 0;
1778 }
1779
1780 c1 = *src++;
4ed46869 1781 /* If we are seeing a component of a composite character, we are
d46c5b12
KH
1782 seeing a leading-code encoded irregularly for composition, or
1783 a composition rule if composing with rule. We must set C1 to
1784 a normal leading-code or an ASCII code. If we are not seeing
1785 a composite character, we must reset composition,
1786 designation, and invocation states. */
4ed46869
KH
1787 if (COMPOSING_P (coding->composing))
1788 {
1789 if (c1 < 0xA0)
1790 {
1791 /* We are not in a composite character any longer. */
1792 coding->composing = COMPOSING_NO;
d46c5b12 1793 ENCODE_RESET_PLANE_AND_REGISTER;
4ed46869
KH
1794 ENCODE_COMPOSITION_END;
1795 }
1796 else
1797 {
1798 if (coding->composing == COMPOSING_WITH_RULE_RULE)
1799 {
1800 *dst++ = c1 & 0x7F;
1801 coding->composing = COMPOSING_WITH_RULE_HEAD;
1802 continue;
1803 }
1804 else if (coding->composing == COMPOSING_WITH_RULE_HEAD)
1805 coding->composing = COMPOSING_WITH_RULE_RULE;
1806 if (c1 == 0xA0)
1807 {
1808 /* This is an ASCII component. */
1809 ONE_MORE_BYTE (c1);
1810 c1 &= 0x7F;
1811 }
1812 else
1813 /* This is a leading-code of non ASCII component. */
1814 c1 -= 0x20;
1815 }
1816 }
1817
1818 /* Now encode one character. C1 is a control character, an
1819 ASCII character, or a leading-code of multi-byte character. */
1820 switch (emacs_code_class[c1])
1821 {
1822 case EMACS_ascii_code:
bdd9fb48 1823 ENCODE_ISO_CHARACTER (CHARSET_ASCII, c1, /* dummy */ c2);
4ed46869
KH
1824 break;
1825
1826 case EMACS_control_code:
1827 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
e0e989f6 1828 ENCODE_RESET_PLANE_AND_REGISTER;
4ed46869 1829 *dst++ = c1;
d46c5b12 1830 coding->consumed_char++;
4ed46869
KH
1831 break;
1832
1833 case EMACS_carriage_return_code:
d46c5b12 1834 if (! (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
4ed46869
KH
1835 {
1836 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
e0e989f6 1837 ENCODE_RESET_PLANE_AND_REGISTER;
4ed46869 1838 *dst++ = c1;
d46c5b12 1839 coding->consumed_char++;
4ed46869
KH
1840 break;
1841 }
1842 /* fall down to treat '\r' as '\n' ... */
1843
1844 case EMACS_linefeed_code:
1845 if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
e0e989f6
KH
1846 ENCODE_RESET_PLANE_AND_REGISTER;
1847 if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL)
1848 bcopy (coding->spec.iso2022.initial_designation,
1849 coding->spec.iso2022.current_designation,
1850 sizeof coding->spec.iso2022.initial_designation);
4ed46869 1851 if (coding->eol_type == CODING_EOL_LF
0ef69138 1852 || coding->eol_type == CODING_EOL_UNDECIDED)
4ed46869
KH
1853 *dst++ = ISO_CODE_LF;
1854 else if (coding->eol_type == CODING_EOL_CRLF)
1855 *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
1856 else
1857 *dst++ = ISO_CODE_CR;
e0e989f6 1858 CODING_SPEC_ISO_BOL (coding) = 1;
d46c5b12 1859 coding->consumed_char++;
4ed46869
KH
1860 break;
1861
1862 case EMACS_leading_code_2:
1863 ONE_MORE_BYTE (c2);
19a8d9e0
KH
1864 if (c2 < 0xA0)
1865 {
1866 /* invalid sequence */
1867 *dst++ = c1;
1868 *dst++ = c2;
d46c5b12 1869 coding->consumed_char += 2;
19a8d9e0
KH
1870 }
1871 else
1872 ENCODE_ISO_CHARACTER (c1, c2, /* dummy */ c3);
4ed46869
KH
1873 break;
1874
1875 case EMACS_leading_code_3:
1876 TWO_MORE_BYTES (c2, c3);
19a8d9e0
KH
1877 if (c2 < 0xA0 || c3 < 0xA0)
1878 {
1879 /* invalid sequence */
1880 *dst++ = c1;
1881 *dst++ = c2;
1882 *dst++ = c3;
d46c5b12 1883 coding->consumed_char += 3;
19a8d9e0
KH
1884 }
1885 else if (c1 < LEADING_CODE_PRIVATE_11)
bdd9fb48 1886 ENCODE_ISO_CHARACTER (c1, c2, c3);
4ed46869 1887 else
bdd9fb48 1888 ENCODE_ISO_CHARACTER (c2, c3, /* dummy */ c4);
4ed46869
KH
1889 break;
1890
1891 case EMACS_leading_code_4:
1892 THREE_MORE_BYTES (c2, c3, c4);
19a8d9e0
KH
1893 if (c2 < 0xA0 || c3 < 0xA0 || c4 < 0xA0)
1894 {
1895 /* invalid sequence */
1896 *dst++ = c1;
1897 *dst++ = c2;
1898 *dst++ = c3;
1899 *dst++ = c4;
d46c5b12 1900 coding->consumed_char += 4;
19a8d9e0
KH
1901 }
1902 else
1903 ENCODE_ISO_CHARACTER (c2, c3, c4);
4ed46869
KH
1904 break;
1905
1906 case EMACS_leading_code_composition:
19a8d9e0
KH
1907 ONE_MORE_BYTE (c2);
1908 if (c2 < 0xA0)
1909 {
1910 /* invalid sequence */
1911 *dst++ = c1;
1912 *dst++ = c2;
d46c5b12 1913 coding->consumed_char += 2;
19a8d9e0
KH
1914 }
1915 else if (c2 == 0xFF)
4ed46869 1916 {
d46c5b12 1917 ENCODE_RESET_PLANE_AND_REGISTER;
4ed46869
KH
1918 coding->composing = COMPOSING_WITH_RULE_HEAD;
1919 ENCODE_COMPOSITION_WITH_RULE_START;
d46c5b12 1920 coding->consumed_char++;
4ed46869
KH
1921 }
1922 else
1923 {
d46c5b12 1924 ENCODE_RESET_PLANE_AND_REGISTER;
4ed46869
KH
1925 /* Rewind one byte because it is a character code of
1926 composition elements. */
1927 src--;
1928 coding->composing = COMPOSING_NO_RULE_HEAD;
1929 ENCODE_COMPOSITION_NO_RULE_START;
d46c5b12 1930 coding->consumed_char++;
4ed46869
KH
1931 }
1932 break;
1933
1934 case EMACS_invalid_code:
1935 *dst++ = c1;
d46c5b12 1936 coding->consumed_char++;
4ed46869
KH
1937 break;
1938 }
1939 continue;
1940 label_end_of_loop:
d46c5b12
KH
1941 result = CODING_FINISH_INSUFFICIENT_SRC;
1942 src = src_base;
4ed46869
KH
1943 break;
1944 }
1945
fb88bf2d
KH
1946 if (src < src_end)
1947 {
1948 if (result == CODING_FINISH_NORMAL)
1949 result = CODING_FINISH_INSUFFICIENT_DST;
1950 else
1951 /* If this is the last block of the text to be encoded, we
1952 must reset graphic planes and registers to the initial
1953 state, and flush out the carryover if any. */
1954 if (coding->mode & CODING_MODE_LAST_BLOCK)
1955 ENCODE_RESET_PLANE_AND_REGISTER;
1956 }
d46c5b12
KH
1957
1958 coding->consumed = src - source;
1959 coding->produced = coding->produced_char = dst - destination;
1960 return result;
4ed46869
KH
1961}
1962
1963\f
1964/*** 4. SJIS and BIG5 handlers ***/
1965
f4dee582 1966/* Although SJIS and BIG5 are not ISO's coding system, they are used
4ed46869
KH
1967 quite widely. So, for the moment, Emacs supports them in the bare
1968 C code. But, in the future, they may be supported only by CCL. */
1969
1970/* SJIS is a coding system encoding three character sets: ASCII, right
1971 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
1972 as is. A character of charset katakana-jisx0201 is encoded by
1973 "position-code + 0x80". A character of charset japanese-jisx0208
1974 is encoded in 2-byte but two position-codes are divided and shifted
1975 so that they fit in the range below.
1976
1977 --- CODE RANGE of SJIS ---
1978 (character set) (range)
1979 ASCII 0x00 .. 0x7F
1980 KATAKANA-JISX0201 0xA0 .. 0xDF
1981 JISX0208 (1st byte) 0x80 .. 0x9F and 0xE0 .. 0xFF
1982 (2nd byte) 0x40 .. 0xFF
1983 -------------------------------
1984
1985*/
1986
1987/* BIG5 is a coding system encoding two character sets: ASCII and
1988 Big5. An ASCII character is encoded as is. Big5 is a two-byte
1989 character set and is encoded in two-byte.
1990
1991 --- CODE RANGE of BIG5 ---
1992 (character set) (range)
1993 ASCII 0x00 .. 0x7F
1994 Big5 (1st byte) 0xA1 .. 0xFE
1995 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
1996 --------------------------
1997
1998 Since the number of characters in Big5 is larger than maximum
1999 characters in Emacs' charset (96x96), it can't be handled as one
2000 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
2001 and `charset-big5-2'. Both are DIMENSION2 and CHARS94. The former
2002 contains frequently used characters and the latter contains less
2003 frequently used characters. */
2004
2005/* Macros to decode or encode a character of Big5 in BIG5. B1 and B2
2006 are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
2007 C1 and C2 are the 1st and 2nd position-codes of Emacs' internal
2008 format. CHARSET is `charset_big5_1' or `charset_big5_2'. */
2009
/* Number of Big5 characters sharing one 1st byte: the 2nd-byte ranges
   0x40..0x7E and 0xA1..0xFE taken together.  */
#define BIG5_SAME_ROW ((0x7F - 0x40) + (0xFF - 0xA1))
2012
/* Decode the two-byte BIG5 code (B1, B2) into an Emacs charset and
   position-codes.  CHARSET is set to `charset_big5_1' when B1 is below
   0xC9 and to `charset_big5_2' otherwise; C1 and C2 receive the 1st
   and 2nd position-codes (each 0x21 plus an offset below 94).

   B1 and B2 are evaluated exactly once and may be arbitrary
   expressions; CHARSET, C1 and C2 must be assignable.  */
#define DECODE_BIG5(b1, b2, charset, c1, c2)                            \
  do {                                                                  \
    unsigned int big5_lead_ = (b1);                                     \
    unsigned int big5_trail_ = (b2);                                    \
    /* Linear index of the code within all of BIG5; the gap between    \
       the two 2nd-byte ranges is squeezed out.  */                     \
    unsigned int big5_index_                                            \
      = ((big5_lead_ - 0xA1) * BIG5_SAME_ROW                            \
         + big5_trail_ - (big5_trail_ < 0x7F ? 0x40 : 0x62));           \
    if (big5_lead_ < 0xC9)                                              \
      (charset) = charset_big5_1;                                       \
    else                                                                \
      {                                                                 \
        (charset) = charset_big5_2;                                     \
        /* Make the index relative to the start of the second set.  */  \
        big5_index_ -= (0xC9 - 0xA1) * BIG5_SAME_ROW;                   \
      }                                                                 \
    (c1) = big5_index_ / (0xFF - 0xA1) + 0x21;                          \
    (c2) = big5_index_ % (0xFF - 0xA1) + 0x21;                          \
  } while (0)
2027
/* Encode the Emacs character (CHARSET, C1, C2) as a two-byte BIG5
   code stored in B1 and B2.  C1 and C2 are position-codes with the
   high bit already cleared (0x21 based); CHARSET selects the second
   half of BIG5 when it equals `charset_big5_2'.

   CHARSET, C1 and C2 may be arbitrary expressions; B1 and B2 must be
   assignable.  */
#define ENCODE_BIG5(charset, c1, c2, b1, b2)                            \
  do {                                                                  \
    /* Linear index of the character within its charset.  */            \
    unsigned int big5_index_                                            \
      = ((c1) - 0x21) * (0xFF - 0xA1) + ((c2) - 0x21);                  \
    if ((charset) == charset_big5_2)                                    \
      big5_index_ += BIG5_SAME_ROW * (0xC9 - 0xA1);                     \
    (b1) = big5_index_ / BIG5_SAME_ROW + 0xA1;                          \
    big5_index_ %= BIG5_SAME_ROW;                                       \
    /* The first 0x3F slots of a row map to 0x40..0x7E, the rest to    \
       0xA1..0xFE.  */                                                  \
    (b2) = big5_index_ + (big5_index_ < 0x3F ? 0x40 : 0x62);            \
  } while (0)
2037
a5d301df
KH
/* Emit the decoded character (CHARSET, C1, C2) through the
   DECODE_CHARACTER_* macros.  If the enclosing function's
   `unification_table' is non-nil and `unify_char' returns a
   non-negative character for it, that character is split back into
   charset and codes (C1 and C2 are overwritten) and emitted instead.
   A negative or ASCII charset is emitted as ASCII.  */
#define DECODE_SJIS_BIG5_CHARACTER(charset, c1, c2)                     \
  do {                                                                  \
    int decoded_charset = (charset);                                    \
    if (!NILP (unification_table))                                      \
      {                                                                 \
        int unified = unify_char (unification_table,                    \
                                  -1, (charset), c1, c2);               \
        if (unified >= 0)                                               \
          SPLIT_CHAR (unified, decoded_charset, c1, c2);                \
      }                                                                 \
    if (decoded_charset < 0 || decoded_charset == CHARSET_ASCII)        \
      DECODE_CHARACTER_ASCII (c1);                                      \
    else if (CHARSET_DIMENSION (decoded_charset) == 1)                  \
      DECODE_CHARACTER_DIMENSION1 (decoded_charset, c1);                \
    else                                                                \
      DECODE_CHARACTER_DIMENSION2 (decoded_charset, c1, c2);            \
  } while (0)
2052
/* Encode the character (CHARSET, C1, C2) to *DST++ of the enclosing
   function, which must also provide `unification_table', `sjis_p' and
   `coding'.  The character is first mapped through `unify_char' when
   the table is non-nil.  Then:
     - ASCII is written as is;
     - with SJIS_P, katakana-jisx0201 is written as the single byte C1
       and jisx0208 is converted by ENCODE_SJIS;
     - without SJIS_P, big5-1 and big5-2 are converted by ENCODE_BIG5;
     - any other charset is written out raw (charset byte followed by
       the code bytes) and `coding->fake_multibyte' is set.
   `coding->consumed_char' is incremented once per call.

   The definition deliberately has no trailing semicolon, so that
   `ENCODE_SJIS_BIG5_CHARACTER (...);' is exactly one statement and
   can be used in an un-braced if/else.  */
#define ENCODE_SJIS_BIG5_CHARACTER(charset, c1, c2)                     \
  do {                                                                  \
    int c_alt, charset_alt;                                             \
    if (!NILP (unification_table)                                       \
        && ((c_alt = unify_char (unification_table, -1, charset, c1, c2)) \
            >= 0))                                                      \
      SPLIT_CHAR (c_alt, charset_alt, c1, c2);                          \
    else                                                                \
      charset_alt = charset;                                            \
    if (charset_alt == charset_ascii)                                   \
      *dst++ = c1;                                                      \
    else if (CHARSET_DIMENSION (charset_alt) == 1)                      \
      {                                                                 \
        if (sjis_p && charset_alt == charset_katakana_jisx0201)         \
          *dst++ = c1;                                                  \
        else                                                            \
          {                                                             \
            /* Not representable: pass the internal form through.  */   \
            *dst++ = charset_alt, *dst++ = c1;                          \
            coding->fake_multibyte = 1;                                 \
          }                                                             \
      }                                                                 \
    else                                                                \
      {                                                                 \
        c1 &= 0x7F, c2 &= 0x7F;                                         \
        if (sjis_p && charset_alt == charset_jisx0208)                  \
          {                                                             \
            unsigned char s1, s2;                                       \
                                                                        \
            ENCODE_SJIS (c1, c2, s1, s2);                               \
            *dst++ = s1, *dst++ = s2;                                   \
            coding->fake_multibyte = 1;                                 \
          }                                                             \
        else if (!sjis_p                                                \
                 && (charset_alt == charset_big5_1                      \
                     || charset_alt == charset_big5_2))                 \
          {                                                             \
            unsigned char b1, b2;                                       \
                                                                        \
            ENCODE_BIG5 (charset_alt, c1, c2, b1, b2);                  \
            *dst++ = b1, *dst++ = b2;                                   \
          }                                                             \
        else                                                            \
          {                                                             \
            /* Not representable: pass the internal form through.  */   \
            *dst++ = charset_alt, *dst++ = c1, *dst++ = c2;             \
            coding->fake_multibyte = 1;                                 \
          }                                                             \
      }                                                                 \
    coding->consumed_char++;                                            \
  } while (0)
2102
4ed46869
KH
2103/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2104 Check if a text is encoded in SJIS. If it is, return
2105 CODING_CATEGORY_MASK_SJIS, else return 0. */
2106
2107int
2108detect_coding_sjis (src, src_end)
2109 unsigned char *src, *src_end;
2110{
2111 unsigned char c;
2112
2113 while (src < src_end)
2114 {
2115 c = *src++;
4ed46869
KH
2116 if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
2117 {
2118 if (src < src_end && *src++ < 0x40)
2119 return 0;
2120 }
2121 }
2122 return CODING_CATEGORY_MASK_SJIS;
2123}
2124
2125/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2126 Check if a text is encoded in BIG5. If it is, return
2127 CODING_CATEGORY_MASK_BIG5, else return 0. */
2128
2129int
2130detect_coding_big5 (src, src_end)
2131 unsigned char *src, *src_end;
2132{
2133 unsigned char c;
2134
2135 while (src < src_end)
2136 {
2137 c = *src++;
4ed46869
KH
2138 if (c >= 0xA1)
2139 {
2140 if (src >= src_end)
2141 break;
2142 c = *src++;
2143 if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
2144 return 0;
2145 }
2146 }
2147 return CODING_CATEGORY_MASK_BIG5;
2148}
2149
2150/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
2151 If SJIS_P is 1, decode SJIS text, else decode BIG5 test. */
2152
2153int
2154decode_coding_sjis_big5 (coding, source, destination,
d46c5b12 2155 src_bytes, dst_bytes, sjis_p)
4ed46869
KH
2156 struct coding_system *coding;
2157 unsigned char *source, *destination;
2158 int src_bytes, dst_bytes;
4ed46869
KH
2159 int sjis_p;
2160{
2161 unsigned char *src = source;
2162 unsigned char *src_end = source + src_bytes;
2163 unsigned char *dst = destination;
2164 unsigned char *dst_end = destination + dst_bytes;
2165 /* Since the maximum bytes produced by each loop is 4, we subtract 3
2166 from DST_END to assure overflow checking is necessary only at the
2167 head of loop. */
2168 unsigned char *adjusted_dst_end = dst_end - 3;
a5d301df
KH
2169 Lisp_Object unification_table
2170 = coding->character_unification_table_for_decode;
d46c5b12 2171 int result = CODING_FINISH_NORMAL;
a5d301df
KH
2172
2173 if (!NILP (Venable_character_unification) && NILP (unification_table))
2174 unification_table = Vstandard_character_unification_table_for_decode;
4ed46869 2175
d46c5b12 2176 coding->produced_char = 0;
fb88bf2d 2177 coding->fake_multibyte = 0;
d46c5b12
KH
2178 while (src < src_end && (dst_bytes
2179 ? (dst < adjusted_dst_end)
2180 : (dst < src - 3)))
4ed46869
KH
2181 {
2182 /* SRC_BASE remembers the start position in source in each loop.
2183 The loop will be exited when there's not enough source text
2184 to analyze two-byte character (within macro ONE_MORE_BYTE).
2185 In that case, SRC is reset to SRC_BASE before exiting. */
2186 unsigned char *src_base = src;
2187 unsigned char c1 = *src++, c2, c3, c4;
2188
d46c5b12 2189 if (c1 < 0x20)
4ed46869 2190 {
d46c5b12 2191 if (c1 == '\r')
4ed46869 2192 {
d46c5b12
KH
2193 if (coding->eol_type == CODING_EOL_CRLF)
2194 {
2195 ONE_MORE_BYTE (c2);
2196 if (c2 == '\n')
2197 *dst++ = c2;
2198 else if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2199 {
2200 result = CODING_FINISH_INCONSISTENT_EOL;
2201 goto label_end_of_loop_2;
2202 }
2203 else
2204 /* To process C2 again, SRC is subtracted by 1. */
2205 *dst++ = c1, src--;
2206 }
2207 else if (coding->eol_type == CODING_EOL_CR)
2208 *dst++ = '\n';
4ed46869 2209 else
d46c5b12
KH
2210 *dst++ = c1;
2211 }
2212 else if (c1 == '\n'
2213 && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2214 && (coding->eol_type == CODING_EOL_CR
2215 || coding->eol_type == CODING_EOL_CRLF))
2216 {
2217 result = CODING_FINISH_INCONSISTENT_EOL;
2218 goto label_end_of_loop_2;
4ed46869
KH
2219 }
2220 else
2221 *dst++ = c1;
d46c5b12 2222 coding->produced_char++;
4ed46869 2223 }
a5d301df
KH
2224 else if (c1 < 0x80)
2225 DECODE_SJIS_BIG5_CHARACTER (charset_ascii, c1, /* dummy */ c2);
fb88bf2d 2226 else if (c1 < 0xA0)
4ed46869 2227 {
fb88bf2d 2228 /* SJIS -> JISX0208 */
4ed46869
KH
2229 if (sjis_p)
2230 {
2231 ONE_MORE_BYTE (c2);
fb88bf2d
KH
2232 if (c2 >= 0x40)
2233 {
2234 DECODE_SJIS (c1, c2, c3, c4);
2235 DECODE_SJIS_BIG5_CHARACTER (charset_jisx0208, c3, c4);
2236 }
2237 else
2238 goto label_invalid_code_2;
4ed46869 2239 }
fb88bf2d
KH
2240 else
2241 goto label_invalid_code_1;
2242 }
2243 else if (c1 < 0xE0)
2244 {
2245 /* SJIS -> JISX0201-Kana, BIG5 -> Big5 */
2246 if (sjis_p)
2247 DECODE_SJIS_BIG5_CHARACTER (charset_katakana_jisx0201, c1,
2248 /* dummy */ c2);
2249 else
4ed46869
KH
2250 {
2251 int charset;
2252
2253 ONE_MORE_BYTE (c2);
fb88bf2d
KH
2254 if ((c2 >= 0x40 && c2 <= 0x7E) || (c2 >= 0xA1 && c2 <= 0xFE))
2255 {
2256 DECODE_BIG5 (c1, c2, charset, c3, c4);
2257 DECODE_SJIS_BIG5_CHARACTER (charset, c3, c4);
2258 }
2259 else
2260 goto label_invalid_code_2;
d46c5b12 2261 }
4ed46869 2262 }
fb88bf2d 2263 else /* C1 >= 0xE0 */
4ed46869 2264 {
fb88bf2d 2265 /* SJIS -> JISX0208, BIG5 -> Big5 */
4ed46869 2266 if (sjis_p)
fb88bf2d
KH
2267 {
2268 ONE_MORE_BYTE (c2);
2269 if (c2 >= 0x40)
2270 {
2271 DECODE_SJIS (c1, c2, c3, c4);
2272 DECODE_SJIS_BIG5_CHARACTER (charset_jisx0208, c3, c4);
2273 }
2274 else
2275 goto label_invalid_code_2;
2276 }
4ed46869
KH
2277 else
2278 {
2279 int charset;
2280
2281 ONE_MORE_BYTE (c2);
fb88bf2d
KH
2282 if ((c2 >= 0x40 && c2 <= 0x7E) || (c2 >= 0xA1 && c2 <= 0xFE))
2283 {
2284 DECODE_BIG5 (c1, c2, charset, c3, c4);
2285 DECODE_SJIS_BIG5_CHARACTER (charset, c3, c4);
2286 }
2287 else
2288 goto label_invalid_code_2;
4ed46869
KH
2289 }
2290 }
2291 continue;
2292
fb88bf2d
KH
2293 label_invalid_code_1:
2294 *dst++ = c1;
2295 coding->produced_char++;
2296 coding->fake_multibyte = 1;
2297 continue;
2298
2299 label_invalid_code_2:
2300 *dst++ = c1; *dst++= c2;
2301 coding->produced_char += 2;
2302 coding->fake_multibyte = 1;
2303 continue;
2304
4ed46869 2305 label_end_of_loop:
d46c5b12
KH
2306 result = CODING_FINISH_INSUFFICIENT_SRC;
2307 label_end_of_loop_2:
4ed46869
KH
2308 src = src_base;
2309 break;
2310 }
2311
fb88bf2d
KH
2312 if (src < src_end)
2313 {
2314 if (result == CODING_FINISH_NORMAL)
2315 result = CODING_FINISH_INSUFFICIENT_DST;
2316 else if (result != CODING_FINISH_INCONSISTENT_EOL
2317 && coding->mode & CODING_MODE_LAST_BLOCK)
2318 {
2319 src_bytes = src_end - src;
2320 if (dst_bytes && (dst_end - dst < src_bytes))
2321 src_bytes = dst_end - dst;
2322 bcopy (dst, src, src_bytes);
2323 src += src_bytes;
2324 dst += src_bytes;
2325 coding->fake_multibyte = 1;
2326 }
2327 }
d46c5b12
KH
2328
2329 coding->consumed = coding->consumed_char = src - source;
2330 coding->produced = dst - destination;
2331 return result;
4ed46869
KH
2332}
2333
2334/* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
2335 This function can encode `charset_ascii', `charset_katakana_jisx0201',
2336 `charset_jisx0208', `charset_big5_1', and `charset_big5-2'. We are
2337 sure that all these charsets are registered as official charset
2338 (i.e. do not have extended leading-codes). Characters of other
2339 charsets are produced without any encoding. If SJIS_P is 1, encode
2340 SJIS text, else encode BIG5 text. */
2341
2342int
2343encode_coding_sjis_big5 (coding, source, destination,
d46c5b12 2344 src_bytes, dst_bytes, sjis_p)
4ed46869
KH
2345 struct coding_system *coding;
2346 unsigned char *source, *destination;
2347 int src_bytes, dst_bytes;
4ed46869
KH
2348 int sjis_p;
2349{
2350 unsigned char *src = source;
2351 unsigned char *src_end = source + src_bytes;
2352 unsigned char *dst = destination;
2353 unsigned char *dst_end = destination + dst_bytes;
2354 /* Since the maximum bytes produced by each loop is 2, we subtract 1
2355 from DST_END to assure overflow checking is necessary only at the
2356 head of loop. */
2357 unsigned char *adjusted_dst_end = dst_end - 1;
a5d301df
KH
2358 Lisp_Object unification_table
2359 = coding->character_unification_table_for_encode;
d46c5b12 2360 int result = CODING_FINISH_NORMAL;
a5d301df
KH
2361
2362 if (!NILP (Venable_character_unification) && NILP (unification_table))
2363 unification_table = Vstandard_character_unification_table_for_encode;
4ed46869 2364
d46c5b12 2365 coding->consumed_char = 0;
fb88bf2d 2366 coding->fake_multibyte = 0;
d46c5b12
KH
2367 while (src < src_end && (dst_bytes
2368 ? (dst < adjusted_dst_end)
2369 : (dst < src - 1)))
4ed46869
KH
2370 {
2371 /* SRC_BASE remembers the start position in source in each loop.
2372 The loop will be exited when there's not enough source text
2373 to analyze multi-byte codes (within macros ONE_MORE_BYTE and
2374 TWO_MORE_BYTES). In that case, SRC is reset to SRC_BASE
2375 before exiting. */
2376 unsigned char *src_base = src;
2377 unsigned char c1 = *src++, c2, c3, c4;
2378
2379 if (coding->composing)
2380 {
2381 if (c1 == 0xA0)
2382 {
2383 ONE_MORE_BYTE (c1);
2384 c1 &= 0x7F;
2385 }
2386 else if (c1 >= 0xA0)
2387 c1 -= 0x20;
2388 else
2389 coding->composing = 0;
2390 }
2391
2392 switch (emacs_code_class[c1])
2393 {
2394 case EMACS_ascii_code:
a5d301df
KH
2395 ENCODE_SJIS_BIG5_CHARACTER (charset_ascii, c1, /* dummy */ c2);
2396 break;
2397
4ed46869
KH
2398 case EMACS_control_code:
2399 *dst++ = c1;
d46c5b12 2400 coding->consumed_char++;
4ed46869
KH
2401 break;
2402
2403 case EMACS_carriage_return_code:
d46c5b12 2404 if (! (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
4ed46869
KH
2405 {
2406 *dst++ = c1;
d46c5b12 2407 coding->consumed_char++;
4ed46869
KH
2408 break;
2409 }
2410 /* fall down to treat '\r' as '\n' ... */
2411
2412 case EMACS_linefeed_code:
2413 if (coding->eol_type == CODING_EOL_LF
0ef69138 2414 || coding->eol_type == CODING_EOL_UNDECIDED)
4ed46869
KH
2415 *dst++ = '\n';
2416 else if (coding->eol_type == CODING_EOL_CRLF)
2417 *dst++ = '\r', *dst++ = '\n';
2418 else
2419 *dst++ = '\r';
d46c5b12 2420 coding->consumed_char++;
4ed46869
KH
2421 break;
2422
2423 case EMACS_leading_code_2:
2424 ONE_MORE_BYTE (c2);
a5d301df 2425 ENCODE_SJIS_BIG5_CHARACTER (c1, c2, /* dummy */ c3);
4ed46869
KH
2426 break;
2427
2428 case EMACS_leading_code_3:
2429 TWO_MORE_BYTES (c2, c3);
a5d301df 2430 ENCODE_SJIS_BIG5_CHARACTER (c1, c2, c3);
4ed46869
KH
2431 break;
2432
2433 case EMACS_leading_code_4:
2434 THREE_MORE_BYTES (c2, c3, c4);
a5d301df 2435 ENCODE_SJIS_BIG5_CHARACTER (c2, c3, c4);
4ed46869
KH
2436 break;
2437
2438 case EMACS_leading_code_composition:
2439 coding->composing = 1;
2440 break;
2441
2442 default: /* i.e. case EMACS_invalid_code: */
2443 *dst++ = c1;
d46c5b12 2444 coding->consumed_char++;
4ed46869
KH
2445 }
2446 continue;
2447
2448 label_end_of_loop:
d46c5b12
KH
2449 result = CODING_FINISH_INSUFFICIENT_SRC;
2450 src = src_base;
4ed46869
KH
2451 break;
2452 }
2453
d46c5b12
KH
2454 if (result == CODING_FINISH_NORMAL
2455 && src < src_end)
2456 result = CODING_FINISH_INSUFFICIENT_DST;
2457 coding->consumed = src - source;
2458 coding->produced = coding->produced_char = dst - destination;
2459 return result;
4ed46869
KH
2460}
2461
2462\f
2463/*** 5. End-of-line handlers ***/
2464
2465/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
2466 This function is called only when `coding->eol_type' is
2467 CODING_EOL_CRLF or CODING_EOL_CR. */
2468
dfcf069d 2469int
d46c5b12 2470decode_eol (coding, source, destination, src_bytes, dst_bytes)
4ed46869
KH
2471 struct coding_system *coding;
2472 unsigned char *source, *destination;
2473 int src_bytes, dst_bytes;
4ed46869
KH
2474{
2475 unsigned char *src = source;
2476 unsigned char *src_end = source + src_bytes;
2477 unsigned char *dst = destination;
2478 unsigned char *dst_end = destination + dst_bytes;
fb88bf2d 2479 unsigned char c;
d46c5b12
KH
2480 int result = CODING_FINISH_NORMAL;
2481
fb88bf2d
KH
2482 coding->fake_multibyte = 0;
2483
d46c5b12
KH
2484 if (src_bytes <= 0)
2485 return result;
4ed46869
KH
2486
2487 switch (coding->eol_type)
2488 {
2489 case CODING_EOL_CRLF:
2490 {
2491 /* Since the maximum bytes produced by each loop is 2, we
2492 subtract 1 from DST_END to assure overflow checking is
2493 necessary only at the head of loop. */
2494 unsigned char *adjusted_dst_end = dst_end - 1;
2495
d46c5b12
KH
2496 while (src < src_end && (dst_bytes
2497 ? (dst < adjusted_dst_end)
2498 : (dst < src - 1)))
4ed46869
KH
2499 {
2500 unsigned char *src_base = src;
fb88bf2d
KH
2501
2502 c = *src++;
4ed46869
KH
2503 if (c == '\r')
2504 {
2505 ONE_MORE_BYTE (c);
2506 if (c != '\n')
d46c5b12
KH
2507 {
2508 if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2509 {
2510 result = CODING_FINISH_INCONSISTENT_EOL;
2511 goto label_end_of_loop_2;
2512 }
2513 *dst++ = '\r';
fb88bf2d
KH
2514 if (BASE_LEADING_CODE_P (c))
2515 coding->fake_multibyte = 1;
d46c5b12 2516 }
bfd99048 2517 *dst++ = c;
4ed46869 2518 }
d46c5b12
KH
2519 else if (c == '\n'
2520 && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL))
2521 {
2522 result = CODING_FINISH_INCONSISTENT_EOL;
2523 goto label_end_of_loop_2;
2524 }
4ed46869 2525 else
fb88bf2d
KH
2526 {
2527 *dst++ = c;
2528 if (BASE_LEADING_CODE_P (c))
2529 coding->fake_multibyte = 1;
2530 }
4ed46869
KH
2531 continue;
2532
2533 label_end_of_loop:
d46c5b12
KH
2534 result = CODING_FINISH_INSUFFICIENT_SRC;
2535 label_end_of_loop_2:
4ed46869
KH
2536 src = src_base;
2537 break;
2538 }
d46c5b12
KH
2539 if (result == CODING_FINISH_NORMAL
2540 && src < src_end)
2541 result = CODING_FINISH_INSUFFICIENT_DST;
4ed46869 2542 }
d46c5b12 2543 break;
4ed46869
KH
2544
2545 case CODING_EOL_CR:
d46c5b12
KH
2546 if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2547 {
fb88bf2d
KH
2548 while (src < src_end)
2549 {
2550 if ((c = *src++) == '\n')
2551 break;
2552 if (BASE_LEADING_CODE_P (c))
2553 coding->fake_multibyte = 1;
2554 }
d46c5b12
KH
2555 if (*--src == '\n')
2556 {
2557 src_bytes = src - source;
2558 result = CODING_FINISH_INCONSISTENT_EOL;
2559 }
2560 }
2561 if (dst_bytes && src_bytes > dst_bytes)
2562 {
2563 result = CODING_FINISH_INSUFFICIENT_DST;
2564 src_bytes = dst_bytes;
2565 }
2566 if (dst_bytes)
2567 bcopy (source, destination, src_bytes);
2568 else
2569 safe_bcopy (source, destination, src_bytes);
2570 src = source + src_bytes;
2571 while (src_bytes--) if (*dst++ == '\r') dst[-1] = '\n';
4ed46869
KH
2572 break;
2573
2574 default: /* i.e. case: CODING_EOL_LF */
d46c5b12
KH
2575 if (dst_bytes && src_bytes > dst_bytes)
2576 {
2577 result = CODING_FINISH_INSUFFICIENT_DST;
2578 src_bytes = dst_bytes;
2579 }
2580 if (dst_bytes)
2581 bcopy (source, destination, src_bytes);
2582 else
2583 safe_bcopy (source, destination, src_bytes);
2584 src += src_bytes;
2585 dst += dst_bytes;
fb88bf2d 2586 coding->fake_multibyte = 1;
4ed46869
KH
2587 break;
2588 }
2589
d46c5b12
KH
2590 coding->consumed = coding->consumed_char = src - source;
2591 coding->produced = coding->produced_char = dst - destination;
2592 return result;
4ed46869
KH
2593}
2594
2595/* See "GENERAL NOTES about `encode_coding_XXX ()' functions". Encode
2596 format of end-of-line according to `coding->eol_type'. If
d46c5b12
KH
2597 `coding->mode & CODING_MODE_SELECTIVE_DISPLAY' is nonzero, code
2598 '\r' in source text also means end-of-line. */
4ed46869 2599
dfcf069d 2600int
d46c5b12 2601encode_eol (coding, source, destination, src_bytes, dst_bytes)
4ed46869
KH
2602 struct coding_system *coding;
2603 unsigned char *source, *destination;
2604 int src_bytes, dst_bytes;
4ed46869
KH
2605{
2606 unsigned char *src = source;
2607 unsigned char *dst = destination;
d46c5b12 2608 int result = CODING_FINISH_NORMAL;
4ed46869 2609
fb88bf2d
KH
2610 coding->fake_multibyte = 0;
2611
d46c5b12
KH
2612 if (coding->eol_type == CODING_EOL_CRLF)
2613 {
2614 unsigned char c;
2615 unsigned char *src_end = source + src_bytes;
2616 unsigned char *dst_end = destination + dst_bytes;
2617 /* Since the maximum bytes produced by each loop is 2, we
2618 subtract 1 from DST_END to assure overflow checking is
2619 necessary only at the head of loop. */
2620 unsigned char *adjusted_dst_end = dst_end - 1;
2621
2622 while (src < src_end && (dst_bytes
2623 ? (dst < adjusted_dst_end)
2624 : (dst < src - 1)))
2625 {
2626 c = *src++;
2627 if (c == '\n'
2628 || (c == '\r' && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY)))
2629 *dst++ = '\r', *dst++ = '\n';
2630 else
fb88bf2d
KH
2631 {
2632 *dst++ = c;
2633 if (BASE_LEADING_CODE_P (c))
2634 coding->fake_multibyte = 1;
2635 }
d46c5b12
KH
2636 }
2637 if (src < src_end)
2638 result = CODING_FINISH_INSUFFICIENT_DST;
2639 }
2640 else
4ed46869 2641 {
fb88bf2d
KH
2642 unsigned char c;
2643
d46c5b12 2644 if (dst_bytes && src_bytes > dst_bytes)
4ed46869 2645 {
d46c5b12
KH
2646 src_bytes = dst_bytes;
2647 result = CODING_FINISH_INSUFFICIENT_DST;
2648 }
2649 if (dst_bytes)
2650 bcopy (source, destination, src_bytes);
2651 else
fb88bf2d
KH
2652 {
2653 safe_bcopy (source, destination, src_bytes);
2654 dst_bytes = src_bytes;
2655 }
d46c5b12
KH
2656 if (coding->eol_type == CODING_EOL_CRLF)
2657 {
2658 while (src_bytes--)
fb88bf2d
KH
2659 {
2660 if ((c = *dst++) == '\n')
2661 dst[-1] = '\r';
2662 else if (BASE_LEADING_CODE_P (c))
2663 coding->fake_multibyte = 1;
2664 }
d46c5b12 2665 }
fb88bf2d 2666 else
d46c5b12 2667 {
fb88bf2d
KH
2668 if (coding->mode & CODING_MODE_SELECTIVE_DISPLAY)
2669 {
2670 while (src_bytes--)
2671 if (*dst++ == '\r') dst[-1] = '\n';
2672 }
2673 coding->fake_multibyte = 1;
4ed46869 2674 }
fb88bf2d
KH
2675 src = source + dst_bytes;
2676 dst = destination + dst_bytes;
4ed46869
KH
2677 }
2678
d46c5b12
KH
2679 coding->consumed = coding->consumed_char = src - source;
2680 coding->produced = coding->produced_char = dst - destination;
2681 return result;
4ed46869
KH
2682}
2683
2684\f
2685/*** 6. C library functions ***/
2686
2687/* In Emacs Lisp, coding system is represented by a Lisp symbol which
2688 has a property `coding-system'. The value of this property is a
2689 vector of length 5 (called a coding-vector). Among the elements of
2690 this vector, the first (element[0]) and the fifth (element[4])
2691 carry important information for decoding/encoding. Before
2692 decoding/encoding, this information should be set in fields of a
2693 structure of type `coding_system'.
2694
2695 A value of property `coding-system' can be a symbol of another
2696 subsidiary coding-system. In that case, Emacs gets coding-vector
2697 from that symbol.
2698
2699 `element[0]' contains information to be set in `coding->type'. The
2700 value and its meaning is as follows:
2701
0ef69138
KH
2702 0 -- coding_type_emacs_mule
2703 1 -- coding_type_sjis
2704 2 -- coding_type_iso2022
2705 3 -- coding_type_big5
2706 4 -- coding_type_ccl encoder/decoder written in CCL
2707 nil -- coding_type_no_conversion
2708 t -- coding_type_undecided (automatic conversion on decoding,
2709 no-conversion on encoding)
4ed46869
KH
2710
2711 `element[4]' contains information to be set in `coding->flags' and
2712 `coding->spec'. The meaning varies by `coding->type'.
2713
2714 If `coding->type' is `coding_type_iso2022', element[4] is a vector
2715 of length 32 (of which the first 13 sub-elements are used now).
2716 Meanings of these sub-elements are:
2717
2718 sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
2719 If the value is an integer of valid charset, the charset is
2720 assumed to be designated to graphic register N initially.
2721
2722 If the value is negative, it is the negated value of a charset which
2723 reserves graphic register N, which means that the charset is
2724 not designated initially but should be designated to graphic
2725 register N just before encoding a character in that charset.
2726
2727 If the value is nil, graphic register N is never used on
2728 encoding.
2729
2730 sub-element[N] where N is 4 through 11: to be set in `coding->flags'
2731 Each value takes t or nil. See the section ISO2022 of
2732 `coding.h' for more information.
2733
2734 If `coding->type' is `coding_type_big5', element[4] is t to denote
2735 BIG5-ETen or nil to denote BIG5-HKU.
2736
2737 If `coding->type' takes the other value, element[4] is ignored.
2738
2739 Emacs Lisp's coding system also carries information about format of
2740 end-of-line in a value of property `eol-type'. If the value is
2741 integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
2742 means CODING_EOL_CR. If it is not integer, it should be a vector
2743 of subsidiary coding systems of which property `eol-type' has one
2744 of above values.
2745
2746*/
2747
2748/* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
2749 and set it in CODING. If CODING_SYSTEM_SYMBOL is invalid, CODING
2750 is setup so that no conversion is necessary and return -1, else
2751 return 0. */
2752
2753int
e0e989f6
KH
2754setup_coding_system (coding_system, coding)
2755 Lisp_Object coding_system;
4ed46869
KH
2756 struct coding_system *coding;
2757{
d46c5b12 2758 Lisp_Object coding_spec, coding_type, eol_type, plist;
4608c386 2759 Lisp_Object val;
70c22245 2760 int i;
4ed46869 2761
d46c5b12 2762 /* Initialize some fields required for all kinds of coding systems. */
774324d6 2763 coding->symbol = coding_system;
d46c5b12
KH
2764 coding->common_flags = 0;
2765 coding->mode = 0;
2766 coding->heading_ascii = -1;
2767 coding->post_read_conversion = coding->pre_write_conversion = Qnil;
4608c386
KH
2768 coding_spec = Fget (coding_system, Qcoding_system);
2769 if (!VECTORP (coding_spec)
2770 || XVECTOR (coding_spec)->size != 5
2771 || !CONSP (XVECTOR (coding_spec)->contents[3]))
4ed46869 2772 goto label_invalid_coding_system;
4608c386 2773
d46c5b12
KH
2774 eol_type = inhibit_eol_conversion ? Qnil : Fget (coding_system, Qeol_type);
2775 if (VECTORP (eol_type))
2776 {
2777 coding->eol_type = CODING_EOL_UNDECIDED;
2778 coding->common_flags = CODING_REQUIRE_DETECTION_MASK;
2779 }
2780 else if (XFASTINT (eol_type) == 1)
2781 {
2782 coding->eol_type = CODING_EOL_CRLF;
2783 coding->common_flags
2784 = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
2785 }
2786 else if (XFASTINT (eol_type) == 2)
2787 {
2788 coding->eol_type = CODING_EOL_CR;
2789 coding->common_flags
2790 = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
2791 }
2792 else
2793 coding->eol_type = CODING_EOL_LF;
2794
2795 coding_type = XVECTOR (coding_spec)->contents[0];
2796 /* Try short cut. */
2797 if (SYMBOLP (coding_type))
2798 {
2799 if (EQ (coding_type, Qt))
2800 {
2801 coding->type = coding_type_undecided;
2802 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
2803 }
2804 else
2805 coding->type = coding_type_no_conversion;
2806 return 0;
2807 }
2808
2809 /* Initialize remaining fields. */
2810 coding->composing = 0;
2811 coding->character_unification_table_for_decode = Qnil;
2812 coding->character_unification_table_for_encode = Qnil;
2813
2814 /* Get values of coding system properties:
2815 `post-read-conversion', `pre-write-conversion',
2816 `character-unification-table-for-decode',
2817 `character-unification-table-for-encode'. */
4608c386
KH
2818 plist = XVECTOR (coding_spec)->contents[3];
2819 coding->post_read_conversion = Fplist_get (plist, Qpost_read_conversion);
2820 coding->pre_write_conversion = Fplist_get (plist, Qpre_write_conversion);
2821 val = Fplist_get (plist, Qcharacter_unification_table_for_decode);
2822 if (SYMBOLP (val))
2823 val = Fget (val, Qcharacter_unification_table_for_decode);
2824 coding->character_unification_table_for_decode
2825 = CHAR_TABLE_P (val) ? val : Qnil;
2826 val = Fplist_get (plist, Qcharacter_unification_table_for_encode);
2827 if (SYMBOLP (val))
2828 val = Fget (val, Qcharacter_unification_table_for_encode);
2829 coding->character_unification_table_for_encode
2830 = CHAR_TABLE_P (val) ? val : Qnil;
d46c5b12
KH
2831 val = Fplist_get (plist, Qcoding_category);
2832 if (!NILP (val))
2833 {
2834 val = Fget (val, Qcoding_category_index);
2835 if (INTEGERP (val))
2836 coding->category_idx = XINT (val);
2837 else
2838 goto label_invalid_coding_system;
2839 }
2840 else
2841 goto label_invalid_coding_system;
4608c386 2842
70c22245
KH
2843 val = Fplist_get (plist, Qsafe_charsets);
2844 if (EQ (val, Qt))
2845 {
2846 for (i = 0; i <= MAX_CHARSET; i++)
2847 coding->safe_charsets[i] = 1;
2848 }
2849 else
2850 {
2851 bzero (coding->safe_charsets, MAX_CHARSET + 1);
2852 while (CONSP (val))
2853 {
2854 if ((i = get_charset_id (XCONS (val)->car)) >= 0)
2855 coding->safe_charsets[i] = 1;
2856 val = XCONS (val)->cdr;
2857 }
2858 }
2859
d46c5b12 2860 switch (XFASTINT (coding_type))
4ed46869
KH
2861 {
2862 case 0:
0ef69138 2863 coding->type = coding_type_emacs_mule;
c952af22
KH
2864 if (!NILP (coding->post_read_conversion))
2865 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
2866 if (!NILP (coding->pre_write_conversion))
2867 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
4ed46869
KH
2868 break;
2869
2870 case 1:
2871 coding->type = coding_type_sjis;
c952af22
KH
2872 coding->common_flags
2873 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
4ed46869
KH
2874 break;
2875
2876 case 2:
2877 coding->type = coding_type_iso2022;
c952af22
KH
2878 coding->common_flags
2879 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
4ed46869 2880 {
70c22245 2881 Lisp_Object val, temp;
4ed46869 2882 Lisp_Object *flags;
d46c5b12 2883 int i, charset, reg_bits = 0;
4ed46869 2884
4608c386 2885 val = XVECTOR (coding_spec)->contents[4];
f44d27ce 2886
4ed46869
KH
2887 if (!VECTORP (val) || XVECTOR (val)->size != 32)
2888 goto label_invalid_coding_system;
2889
2890 flags = XVECTOR (val)->contents;
2891 coding->flags
2892 = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
2893 | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
2894 | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
2895 | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
2896 | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
2897 | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
2898 | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
2899 | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
e0e989f6
KH
2900 | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION)
2901 | (NILP (flags[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL)
c4825358
KH
2902 | (NILP (flags[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL)
2903 | (NILP (flags[15]) ? 0 : CODING_FLAG_ISO_SAFE)
3f003981 2904 | (NILP (flags[16]) ? 0 : CODING_FLAG_ISO_LATIN_EXTRA)
c4825358 2905 );
4ed46869
KH
2906
2907 /* Invoke graphic register 0 to plane 0. */
2908 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
2909 /* Invoke graphic register 1 to plane 1 if we can use full 8-bit. */
2910 CODING_SPEC_ISO_INVOCATION (coding, 1)
2911 = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
2912 /* Not single shifting at first. */
6e85d753 2913 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;
e0e989f6 2914 /* Beginning of buffer should also be regarded as bol. */
6e85d753 2915 CODING_SPEC_ISO_BOL (coding) = 1;
4ed46869 2916
70c22245
KH
2917 for (charset = 0; charset <= MAX_CHARSET; charset++)
2918 CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = 255;
2919 val = Vcharset_revision_alist;
2920 while (CONSP (val))
2921 {
2922 charset = get_charset_id (Fcar_safe (XCONS (val)->car));
2923 if (charset >= 0
2924 && (temp = Fcdr_safe (XCONS (val)->car), INTEGERP (temp))
2925 && (i = XINT (temp), (i >= 0 && (i + '@') < 128)))
2926 CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = i;
2927 val = XCONS (val)->cdr;
2928 }
2929
4ed46869
KH
2930 /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
2931 FLAGS[REG] can be one of below:
2932 integer CHARSET: CHARSET occupies register I,
2933 t: designate nothing to REG initially, but can be used
2934 by any charsets,
2935 list of integer, nil, or t: designate the first
2936 element (if integer) to REG initially, the remaining
2937 elements (if integer) is designated to REG on request,
d46c5b12 2938 if an element is t, REG can be used by any charsets,
4ed46869 2939 nil: REG is never used. */
467e7675 2940 for (charset = 0; charset <= MAX_CHARSET; charset++)
1ba9e4ab
KH
2941 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2942 = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION;
4ed46869
KH
2943 for (i = 0; i < 4; i++)
2944 {
2945 if (INTEGERP (flags[i])
e0e989f6
KH
2946 && (charset = XINT (flags[i]), CHARSET_VALID_P (charset))
2947 || (charset = get_charset_id (flags[i])) >= 0)
4ed46869
KH
2948 {
2949 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
2950 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
2951 }
2952 else if (EQ (flags[i], Qt))
2953 {
2954 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
d46c5b12
KH
2955 reg_bits |= 1 << i;
2956 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
4ed46869
KH
2957 }
2958 else if (CONSP (flags[i]))
2959 {
2960 Lisp_Object tail = flags[i];
2961
d46c5b12 2962 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
4ed46869
KH
2963 if (INTEGERP (XCONS (tail)->car)
2964 && (charset = XINT (XCONS (tail)->car),
e0e989f6
KH
2965 CHARSET_VALID_P (charset))
2966 || (charset = get_charset_id (XCONS (tail)->car)) >= 0)
4ed46869
KH
2967 {
2968 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
2969 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
2970 }
2971 else
2972 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2973 tail = XCONS (tail)->cdr;
2974 while (CONSP (tail))
2975 {
2976 if (INTEGERP (XCONS (tail)->car)
2977 && (charset = XINT (XCONS (tail)->car),
e0e989f6
KH
2978 CHARSET_VALID_P (charset))
2979 || (charset = get_charset_id (XCONS (tail)->car)) >= 0)
70c22245
KH
2980 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2981 = i;
4ed46869 2982 else if (EQ (XCONS (tail)->car, Qt))
d46c5b12 2983 reg_bits |= 1 << i;
4ed46869
KH
2984 tail = XCONS (tail)->cdr;
2985 }
2986 }
2987 else
2988 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2989
2990 CODING_SPEC_ISO_DESIGNATION (coding, i)
2991 = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
2992 }
2993
d46c5b12 2994 if (reg_bits && ! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
4ed46869
KH
2995 {
2996 /* REG 1 can be used only by locking shift in 7-bit env. */
2997 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
d46c5b12 2998 reg_bits &= ~2;
4ed46869
KH
2999 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
3000 /* Without any shifting, only REG 0 and 1 can be used. */
d46c5b12 3001 reg_bits &= 3;
4ed46869
KH
3002 }
3003
d46c5b12
KH
3004 if (reg_bits)
3005 for (charset = 0; charset <= MAX_CHARSET; charset++)
6e85d753 3006 {
d46c5b12
KH
3007 if (CHARSET_VALID_P (charset))
3008 {
3009 /* There exist some default graphic registers to be
3010 used CHARSET. */
3011
3012 /* We had better avoid designating a charset of
3013 CHARS96 to REG 0 as far as possible. */
3014 if (CHARSET_CHARS (charset) == 96)
3015 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3016 = (reg_bits & 2
3017 ? 1 : (reg_bits & 4 ? 2 : (reg_bits & 8 ? 3 : 0)));
3018 else
3019 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3020 = (reg_bits & 1
3021 ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
3022 }
6e85d753 3023 }
4ed46869 3024 }
c952af22 3025 coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
d46c5b12 3026 coding->spec.iso2022.last_invalid_designation_register = -1;
4ed46869
KH
3027 break;
3028
3029 case 3:
3030 coding->type = coding_type_big5;
c952af22
KH
3031 coding->common_flags
3032 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
4ed46869 3033 coding->flags
4608c386 3034 = (NILP (XVECTOR (coding_spec)->contents[4])
4ed46869
KH
3035 ? CODING_FLAG_BIG5_HKU
3036 : CODING_FLAG_BIG5_ETEN);
3037 break;
3038
3039 case 4:
3040 coding->type = coding_type_ccl;
c952af22
KH
3041 coding->common_flags
3042 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
4ed46869 3043 {
4608c386 3044 Lisp_Object val = XVECTOR (coding_spec)->contents[4];
4ed46869
KH
3045 if (CONSP (val)
3046 && VECTORP (XCONS (val)->car)
3047 && VECTORP (XCONS (val)->cdr))
3048 {
3049 setup_ccl_program (&(coding->spec.ccl.decoder), XCONS (val)->car);
3050 setup_ccl_program (&(coding->spec.ccl.encoder), XCONS (val)->cdr);
3051 }
3052 else
3053 goto label_invalid_coding_system;
3054 }
c952af22 3055 coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
4ed46869
KH
3056 break;
3057
27901516
KH
3058 case 5:
3059 coding->type = coding_type_raw_text;
3060 break;
3061
4ed46869 3062 default:
d46c5b12 3063 goto label_invalid_coding_system;
4ed46869
KH
3064 }
3065 return 0;
3066
3067 label_invalid_coding_system:
3068 coding->type = coding_type_no_conversion;
d46c5b12 3069 coding->category_idx = CODING_CATEGORY_IDX_BINARY;
c952af22 3070 coding->common_flags = 0;
dec137e5 3071 coding->eol_type = CODING_EOL_LF;
d46c5b12 3072 coding->pre_write_conversion = coding->post_read_conversion = Qnil;
4ed46869
KH
3073 return -1;
3074}
3075
3076/* Emacs has a mechanism to automatically detect a coding system if it
3077 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
3078 it's impossible to distinguish some coding systems accurately
3079 because they use the same range of codes. So, at first, coding
3080 systems are categorized into the following categories:
3081
0ef69138 3082 o coding-category-emacs-mule
4ed46869
KH
3083
3084 The category for a coding system which has the same code range
3085 as Emacs' internal format. Assigned the coding-system (Lisp
0ef69138 3086 symbol) `emacs-mule' by default.
4ed46869
KH
3087
3088 o coding-category-sjis
3089
3090 The category for a coding system which has the same code range
3091 as SJIS. Assigned the coding-system (Lisp
7717c392 3092 symbol) `japanese-shift-jis' by default.
4ed46869
KH
3093
3094 o coding-category-iso-7
3095
3096 The category for a coding system which has the same code range
7717c392 3097 as ISO2022 of 7-bit environment. This doesn't use any locking
d46c5b12
KH
3098 shift and single shift functions. This can encode/decode all
3099 charsets. Assigned the coding-system (Lisp symbol)
3100 `iso-2022-7bit' by default.
3101
3102 o coding-category-iso-7-tight
3103
3104 Same as coding-category-iso-7 except that this can
3105 encode/decode only the specified charsets.
4ed46869
KH
3106
3107 o coding-category-iso-8-1
3108
3109 The category for a coding system which has the same code range
3110 as ISO2022 of 8-bit environment and graphic plane 1 used only
7717c392
KH
3111 for DIMENSION1 charset. This doesn't use any locking shift
3112 and single shift functions. Assigned the coding-system (Lisp
3113 symbol) `iso-latin-1' by default.
4ed46869
KH
3114
3115 o coding-category-iso-8-2
3116
3117 The category for a coding system which has the same code range
3118 as ISO2022 of 8-bit environment and graphic plane 1 used only
7717c392
KH
3119 for DIMENSION2 charset. This doesn't use any locking shift
3120 and single shift functions. Assigned the coding-system (Lisp
3121 symbol) `japanese-iso-8bit' by default.
4ed46869 3122
7717c392 3123 o coding-category-iso-7-else
4ed46869
KH
3124
3125 The category for a coding system which has the same code range
7717c392
KH
3126 as ISO2022 of 7-bit environment but uses locking shift or
3127 single shift functions. Assigned the coding-system (Lisp
3128 symbol) `iso-2022-7bit-lock' by default.
3129
3130 o coding-category-iso-8-else
3131
3132 The category for a coding system which has the same code range
3133 as ISO2022 of 8-bit environment but uses locking shift or
3134 single shift functions. Assigned the coding-system (Lisp
3135 symbol) `iso-2022-8bit-ss2' by default.
4ed46869
KH
3136
3137 o coding-category-big5
3138
3139 The category for a coding system which has the same code range
3140 as BIG5. Assigned the coding-system (Lisp symbol)
e0e989f6 3141 `cn-big5' by default.
4ed46869
KH
3142
3143 o coding-category-binary
3144
3145 The category for a coding system not categorized in any of the
3146 above. Assigned the coding-system (Lisp symbol)
e0e989f6 3147 `no-conversion' by default.
4ed46869
KH
3148
3149 Each of them is a Lisp symbol and the value is an actual
3150 `coding-system's (this is also a Lisp symbol) assigned by a user.
3151 What Emacs does actually is to detect a category of coding system.
3152 Then, it uses a `coding-system' assigned to it. If Emacs can't
3153 decide only one possible category, it selects a category of the
3154 highest priority. Priorities of categories are also specified by a
3155 user in a Lisp variable `coding-category-list'.
3156
3157*/
3158
d46c5b12 3159/* Detect how a text of length SRC_BYTES pointed by SOURCE is encoded.
4ed46869
KH
3160 If it detects possible coding systems, return an integer in which
3161 appropriate flag bits are set. Flag bits are defined by macros
d46c5b12 3162 CODING_CATEGORY_MASK_XXX in `coding.h'.
4ed46869 3163
d46c5b12
KH
3164 How many ASCII characters are at the head is returned as *SKIP. */
3165
3166static int
3167detect_coding_mask (source, src_bytes, priorities, skip)
3168 unsigned char *source;
3169 int src_bytes, *priorities, *skip;
4ed46869
KH
3170{
3171 register unsigned char c;
d46c5b12
KH
3172 unsigned char *src = source, *src_end = source + src_bytes;
3173 unsigned int mask = (CODING_CATEGORY_MASK_ISO_7BIT
3174 | CODING_CATEGORY_MASK_ISO_SHIFT);
3175 int i;
4ed46869
KH
3176
3177 /* At first, skip all ASCII characters and control characters except
3178 for three ISO2022 specific control characters. */
bcf26d6a 3179 label_loop_detect_coding:
4ed46869
KH
3180 while (src < src_end)
3181 {
3182 c = *src;
3183 if (c >= 0x80
d46c5b12
KH
3184 || ((mask & CODING_CATEGORY_MASK_ISO_7BIT)
3185 && c == ISO_CODE_ESC)
3186 || ((mask & CODING_CATEGORY_MASK_ISO_SHIFT)
3187 && (c == ISO_CODE_SI || c == ISO_CODE_SO)))
4ed46869
KH
3188 break;
3189 src++;
3190 }
d46c5b12 3191 *skip = src - source;
4ed46869
KH
3192
3193 if (src >= src_end)
3194 /* We found nothing other than ASCII. There's nothing to do. */
d46c5b12 3195 return 0;
4ed46869
KH
3196
3197 /* The text seems to be encoded in some multilingual coding system.
3198 Now, try to find in which coding system the text is encoded. */
3199 if (c < 0x80)
bcf26d6a
KH
3200 {
3201 /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
3202 /* C is an ISO2022 specific control code of C0. */
3203 mask = detect_coding_iso2022 (src, src_end);
1b2af4b0 3204 if (mask == 0)
d46c5b12
KH
3205 {
3206 /* No valid ISO2022 code follows C. Try again. */
3207 src++;
3208 mask = (c != ISO_CODE_ESC
3209 ? CODING_CATEGORY_MASK_ISO_7BIT
3210 : CODING_CATEGORY_MASK_ISO_SHIFT);
3211 goto label_loop_detect_coding;
3212 }
3213 if (priorities)
3214 goto label_return_highest_only;
bcf26d6a 3215 }
d46c5b12 3216 else
c4825358 3217 {
d46c5b12 3218 int try;
4ed46869 3219
d46c5b12
KH
3220 if (c < 0xA0)
3221 {
3222 /* C is the first byte of SJIS character code,
3223 or a leading-code of Emacs' internal format (emacs-mule). */
3224 try = CODING_CATEGORY_MASK_SJIS | CODING_CATEGORY_MASK_EMACS_MULE;
3225
3226 /* Or, if C is a special latin extra code,
3227 or is an ISO2022 specific control code of C1 (SS2 or SS3),
3228 or is an ISO2022 control-sequence-introducer (CSI),
3229 we should also consider the possibility of ISO2022 codings. */
3230 if ((VECTORP (Vlatin_extra_code_table)
3231 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
3232 || (c == ISO_CODE_SS2 || c == ISO_CODE_SS3)
3233 || (c == ISO_CODE_CSI
3234 && (src < src_end
3235 && (*src == ']'
3236 || ((*src == '0' || *src == '1' || *src == '2')
3237 && src + 1 < src_end
3238 && src[1] == ']')))))
3239 try |= (CODING_CATEGORY_MASK_ISO_8_ELSE
3240 | CODING_CATEGORY_MASK_ISO_8BIT);
3241 }
c4825358 3242 else
d46c5b12
KH
3243 /* C is a character of ISO2022 in graphic plane right,
3244 or a SJIS's 1-byte character code (i.e. JISX0201),
3245 or the first byte of BIG5's 2-byte code. */
3246 try = (CODING_CATEGORY_MASK_ISO_8_ELSE
3247 | CODING_CATEGORY_MASK_ISO_8BIT
3248 | CODING_CATEGORY_MASK_SJIS
3249 | CODING_CATEGORY_MASK_BIG5);
3250
3251 mask = 0;
3252 if (priorities)
3253 {
3254 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3255 {
3256 priorities[i] &= try;
3257 if (priorities[i] & CODING_CATEGORY_MASK_ISO)
3258 mask = detect_coding_iso2022 (src, src_end);
3259 else if (priorities[i] & CODING_CATEGORY_MASK_SJIS)
3260 mask = detect_coding_sjis (src, src_end);
3261 else if (priorities[i] & CODING_CATEGORY_MASK_BIG5)
3262 mask = detect_coding_big5 (src, src_end);
3263 else if (priorities[i] & CODING_CATEGORY_MASK_EMACS_MULE)
3264 mask = detect_coding_emacs_mule (src, src_end);
3265 if (mask)
3266 goto label_return_highest_only;
3267 }
3268 return CODING_CATEGORY_MASK_RAW_TEXT;
3269 }
3270 if (try & CODING_CATEGORY_MASK_ISO)
3271 mask |= detect_coding_iso2022 (src, src_end);
3272 if (try & CODING_CATEGORY_MASK_SJIS)
3273 mask |= detect_coding_sjis (src, src_end);
3274 if (try & CODING_CATEGORY_MASK_BIG5)
3275 mask |= detect_coding_big5 (src, src_end);
3276 if (try & CODING_CATEGORY_MASK_EMACS_MULE)
3277 mask |= detect_coding_emacs_mule (src, src_end);
c4825358 3278 }
d46c5b12
KH
3279 return (mask | CODING_CATEGORY_MASK_RAW_TEXT);
3280
3281 label_return_highest_only:
3282 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3283 {
3284 if (mask & priorities[i])
3285 return priorities[i];
3286 }
3287 return CODING_CATEGORY_MASK_RAW_TEXT;
4ed46869
KH
3288}
3289
3290/* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
3291 The information of the detected coding system is set in CODING. */
3292
3293void
3294detect_coding (coding, src, src_bytes)
3295 struct coding_system *coding;
3296 unsigned char *src;
3297 int src_bytes;
3298{
d46c5b12
KH
3299 unsigned int idx;
3300 int skip, mask, i;
3301 int priorities[CODING_CATEGORY_IDX_MAX];
27901516 3302 Lisp_Object val = Vcoding_category_list;
4ed46869 3303
d46c5b12
KH
3304 i = 0;
3305 while (CONSP (val) && i < CODING_CATEGORY_IDX_MAX)
3306 {
3307 if (! SYMBOLP (XCONS (val)->car))
3308 break;
3309 idx = XFASTINT (Fget (XCONS (val)->car, Qcoding_category_index));
3310 if (idx >= CODING_CATEGORY_IDX_MAX)
3311 break;
3312 priorities[i++] = (1 << idx);
3313 val = XCONS (val)->cdr;
3314 }
3315 /* If coding-category-list is valid and contains all coding
3316 categories, `i' should be CODING_CATEGORY_IDX_MAX now. If not,
3317 the following code saves Emacs from craching. */
3318 while (i < CODING_CATEGORY_IDX_MAX)
3319 priorities[i++] = CODING_CATEGORY_MASK_RAW_TEXT;
4ed46869 3320
d46c5b12
KH
3321 mask = detect_coding_mask (src, src_bytes, priorities, &skip);
3322 coding->heading_ascii = skip;
4ed46869 3323
d46c5b12
KH
3324 if (!mask) return;
3325
3326 /* We found a single coding system of the highest priority in MASK. */
3327 idx = 0;
3328 while (mask && ! (mask & 1)) mask >>= 1, idx++;
3329 if (! mask)
3330 idx = CODING_CATEGORY_IDX_RAW_TEXT;
4ed46869 3331
d46c5b12
KH
3332 val = XSYMBOL (XVECTOR (Vcoding_category_table)->contents[idx])->value;
3333
3334 if (coding->eol_type != CODING_EOL_UNDECIDED)
27901516 3335 {
d46c5b12
KH
3336 Lisp_Object tmp = Fget (val, Qeol_type);
3337
3338 if (VECTORP (tmp))
3339 val = XVECTOR (tmp)->contents[coding->eol_type];
4ed46869 3340 }
d46c5b12
KH
3341 setup_coding_system (val, coding);
3342 /* Set this again because setup_coding_system reset this member. */
3343 coding->heading_ascii = skip;
4ed46869
KH
3344}
3345
d46c5b12
KH
3346/* Detect how end-of-line of a text of length SRC_BYTES pointed by
3347 SOURCE is encoded. Return one of CODING_EOL_LF, CODING_EOL_CRLF,
3348 CODING_EOL_CR, and CODING_EOL_UNDECIDED.
3349
3350 How many non-eol characters are at the head is returned as *SKIP. */
4ed46869 3351
bc4bc72a
RS
3352#define MAX_EOL_CHECK_COUNT 3
3353
d46c5b12
KH
3354static int
3355detect_eol_type (source, src_bytes, skip)
3356 unsigned char *source;
3357 int src_bytes, *skip;
4ed46869 3358{
d46c5b12 3359 unsigned char *src = source, *src_end = src + src_bytes;
4ed46869 3360 unsigned char c;
bc4bc72a
RS
3361 int total = 0; /* How many end-of-lines are found so far. */
3362 int eol_type = CODING_EOL_UNDECIDED;
3363 int this_eol_type;
4ed46869 3364
d46c5b12
KH
3365 *skip = 0;
3366
bc4bc72a 3367 while (src < src_end && total < MAX_EOL_CHECK_COUNT)
4ed46869
KH
3368 {
3369 c = *src++;
bc4bc72a 3370 if (c == '\n' || c == '\r')
4ed46869 3371 {
d46c5b12
KH
3372 if (*skip == 0)
3373 *skip = src - 1 - source;
bc4bc72a
RS
3374 total++;
3375 if (c == '\n')
3376 this_eol_type = CODING_EOL_LF;
3377 else if (src >= src_end || *src != '\n')
3378 this_eol_type = CODING_EOL_CR;
4ed46869 3379 else
bc4bc72a
RS
3380 this_eol_type = CODING_EOL_CRLF, src++;
3381
3382 if (eol_type == CODING_EOL_UNDECIDED)
3383 /* This is the first end-of-line. */
3384 eol_type = this_eol_type;
3385 else if (eol_type != this_eol_type)
d46c5b12
KH
3386 {
3387 /* The found type is different from what found before. */
3388 eol_type = CODING_EOL_INCONSISTENT;
3389 break;
3390 }
4ed46869
KH
3391 }
3392 }
bc4bc72a 3393
d46c5b12
KH
3394 if (*skip == 0)
3395 *skip = src_end - source;
85a02ca4 3396 return eol_type;
4ed46869
KH
3397}
3398
3399/* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
3400 is encoded. If it detects an appropriate format of end-of-line, it
3401 sets the information in *CODING. */
3402
3403void
3404detect_eol (coding, src, src_bytes)
3405 struct coding_system *coding;
3406 unsigned char *src;
3407 int src_bytes;
3408{
4608c386 3409 Lisp_Object val;
d46c5b12
KH
3410 int skip;
3411 int eol_type = detect_eol_type (src, src_bytes, &skip);
3412
3413 if (coding->heading_ascii > skip)
3414 coding->heading_ascii = skip;
3415 else
3416 skip = coding->heading_ascii;
4ed46869 3417
0ef69138 3418 if (eol_type == CODING_EOL_UNDECIDED)
4ed46869 3419 return;
27901516
KH
3420 if (eol_type == CODING_EOL_INCONSISTENT)
3421 {
3422#if 0
3423 /* This code is suppressed until we find a better way to
992f23f2 3424 distinguish raw text file and binary file. */
27901516
KH
3425
3426 /* If we have already detected that the coding is raw-text, the
3427 coding should actually be no-conversion. */
3428 if (coding->type == coding_type_raw_text)
3429 {
3430 setup_coding_system (Qno_conversion, coding);
3431 return;
3432 }
3433 /* Else, let's decode only text code anyway. */
3434#endif /* 0 */
1b2af4b0 3435 eol_type = CODING_EOL_LF;
27901516
KH
3436 }
3437
4608c386 3438 val = Fget (coding->symbol, Qeol_type);
4ed46869 3439 if (VECTORP (val) && XVECTOR (val)->size == 3)
d46c5b12
KH
3440 {
3441 setup_coding_system (XVECTOR (val)->contents[eol_type], coding);
3442 coding->heading_ascii = skip;
3443 }
3444}
3445
3446#define CONVERSION_BUFFER_EXTRA_ROOM 256
3447
3448#define DECODING_BUFFER_MAG(coding) \
3449 (coding->type == coding_type_iso2022 \
3450 ? 3 \
3451 : ((coding->type == coding_type_sjis || coding->type == coding_type_big5) \
3452 ? 2 \
3453 : (coding->type == coding_type_raw_text \
3454 ? 1 \
3455 : (coding->type == coding_type_ccl \
3456 ? coding->spec.ccl.decoder.buf_magnification \
3457 : 2))))
3458
3459/* Return maximum size (bytes) of a buffer enough for decoding
3460 SRC_BYTES of text encoded in CODING. */
3461
3462int
3463decoding_buffer_size (coding, src_bytes)
3464 struct coding_system *coding;
3465 int src_bytes;
3466{
3467 return (src_bytes * DECODING_BUFFER_MAG (coding)
3468 + CONVERSION_BUFFER_EXTRA_ROOM);
3469}
3470
3471/* Return maximum size (bytes) of a buffer enough for encoding
3472 SRC_BYTES of text to CODING. */
3473
3474int
3475encoding_buffer_size (coding, src_bytes)
3476 struct coding_system *coding;
3477 int src_bytes;
3478{
3479 int magnification;
3480
3481 if (coding->type == coding_type_ccl)
3482 magnification = coding->spec.ccl.encoder.buf_magnification;
3483 else
3484 magnification = 3;
3485
3486 return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
3487}
3488
#ifndef MINIMUM_CONVERSION_BUFFER_SIZE
#define MINIMUM_CONVERSION_BUFFER_SIZE 1024
#endif

/* The shared work buffer for encoding/decoding and its current size
   in bytes.  */
char *conversion_buffer;
int conversion_buffer_size;

/* Return a pointer to a buffer of at least SIZE bytes to be used for
   encoding or decoding.  The shared conversion buffer is enlarged
   when it is too small; in that case the previous buffer is freed
   and its contents are NOT preserved.

   NOTE(review): the result of xmalloc is used without a check, so
   this function presumably relies on xmalloc not returning NULL --
   confirm against its definition.  */

char *
get_conversion_buffer (size)
     int size;
{
  if (size > conversion_buffer_size)
    {
      char *buf;
      int real_size = conversion_buffer_size * 2;

      /* Start from a positive size; doubling a zero size would never
         reach SIZE and the loop below would not terminate.  */
      if (real_size < MINIMUM_CONVERSION_BUFFER_SIZE)
        real_size = MINIMUM_CONVERSION_BUFFER_SIZE;

      while (real_size < size) real_size *= 2;
      /* Allocate the new buffer before releasing the old one.  */
      buf = (char *) xmalloc (real_size);
      xfree (conversion_buffer);
      conversion_buffer = buf;
      conversion_buffer_size = real_size;
    }
  return conversion_buffer;
}
3517
3518int
3519ccl_coding_driver (coding, source, destination, src_bytes, dst_bytes, encodep)
3520 struct coding_system *coding;
3521 unsigned char *source, *destination;
3522 int src_bytes, dst_bytes, encodep;
3523{
3524 struct ccl_program *ccl
3525 = encodep ? &coding->spec.ccl.encoder : &coding->spec.ccl.decoder;
3526 int result;
3527
3528 coding->produced = ccl_driver (ccl, source, destination,
3529 src_bytes, dst_bytes, &(coding->consumed));
3530 if (encodep)
3531 {
3532 coding->produced_char = coding->produced;
3533 coding->consumed_char
3534 = multibyte_chars_in_text (source, coding->consumed);
3535 }
3536 else
3537 {
3538 coding->produced_char
3539 = multibyte_chars_in_text (destination, coding->produced);
3540 coding->consumed_char = coding->consumed;
3541 }
3542 switch (ccl->status)
3543 {
3544 case CCL_STAT_SUSPEND_BY_SRC:
3545 result = CODING_FINISH_INSUFFICIENT_SRC;
3546 break;
3547 case CCL_STAT_SUSPEND_BY_DST:
3548 result = CODING_FINISH_INSUFFICIENT_DST;
3549 break;
3550 default:
3551 result = CODING_FINISH_NORMAL;
3552 break;
3553 }
3554 return result;
4ed46869
KH
3555}
3556
3557/* See "GENERAL NOTES about `decode_coding_XXX ()' functions". Before
3558 decoding, it may detect coding system and format of end-of-line if
3559 those are not yet decided. */
3560
3561int
d46c5b12 3562decode_coding (coding, source, destination, src_bytes, dst_bytes)
4ed46869
KH
3563 struct coding_system *coding;
3564 unsigned char *source, *destination;
3565 int src_bytes, dst_bytes;
4ed46869 3566{
d46c5b12 3567 int result;
4ed46869
KH
3568
3569 if (src_bytes <= 0)
3570 {
d46c5b12
KH
3571 coding->produced = coding->produced_char = 0;
3572 coding->consumed = coding->consumed_char = 0;
fb88bf2d 3573 coding->fake_multibyte = 0;
d46c5b12 3574 return CODING_FINISH_NORMAL;
4ed46869
KH
3575 }
3576
0ef69138 3577 if (coding->type == coding_type_undecided)
4ed46869
KH
3578 detect_coding (coding, source, src_bytes);
3579
0ef69138 3580 if (coding->eol_type == CODING_EOL_UNDECIDED)
4ed46869
KH
3581 detect_eol (coding, source, src_bytes);
3582
4ed46869
KH
3583 switch (coding->type)
3584 {
0ef69138
KH
3585 case coding_type_emacs_mule:
3586 case coding_type_undecided:
27901516 3587 case coding_type_raw_text:
4ed46869 3588 if (coding->eol_type == CODING_EOL_LF
0ef69138 3589 || coding->eol_type == CODING_EOL_UNDECIDED)
4ed46869 3590 goto label_no_conversion;
d46c5b12 3591 result = decode_eol (coding, source, destination, src_bytes, dst_bytes);
4ed46869
KH
3592 break;
3593
3594 case coding_type_sjis:
d46c5b12
KH
3595 result = decode_coding_sjis_big5 (coding, source, destination,
3596 src_bytes, dst_bytes, 1);
4ed46869
KH
3597 break;
3598
3599 case coding_type_iso2022:
d46c5b12
KH
3600 result = decode_coding_iso2022 (coding, source, destination,
3601 src_bytes, dst_bytes);
4ed46869
KH
3602 break;
3603
3604 case coding_type_big5:
d46c5b12
KH
3605 result = decode_coding_sjis_big5 (coding, source, destination,
3606 src_bytes, dst_bytes, 0);
4ed46869
KH
3607 break;
3608
3609 case coding_type_ccl:
d46c5b12
KH
3610 result = ccl_coding_driver (coding, source, destination,
3611 src_bytes, dst_bytes, 0);
3612 break;
3613
3614 default: /* i.e. case coding_type_no_conversion: */
3615 label_no_conversion:
3616 if (dst_bytes && src_bytes > dst_bytes)
3617 {
3618 coding->produced = dst_bytes;
3619 result = CODING_FINISH_INSUFFICIENT_DST;
3620 }
3621 else
3622 {
3623 coding->produced = src_bytes;
3624 result = CODING_FINISH_NORMAL;
3625 }
3626 if (dst_bytes)
3627 bcopy (source, destination, coding->produced);
3628 else
3629 safe_bcopy (source, destination, coding->produced);
fb88bf2d 3630 coding->fake_multibyte = 1;
d46c5b12
KH
3631 coding->consumed
3632 = coding->consumed_char = coding->produced_char = coding->produced;
4ed46869
KH
3633 break;
3634 }
3635
d46c5b12 3636 return result;
4ed46869
KH
3637}
3638
3639/* See "GENERAL NOTES about `encode_coding_XXX ()' functions". */
3640
3641int
d46c5b12 3642encode_coding (coding, source, destination, src_bytes, dst_bytes)
4ed46869
KH
3643 struct coding_system *coding;
3644 unsigned char *source, *destination;
3645 int src_bytes, dst_bytes;
4ed46869 3646{
d46c5b12 3647 int result;
4ed46869 3648
d46c5b12 3649 if (src_bytes <= 0)
4ed46869 3650 {
d46c5b12
KH
3651 coding->produced = coding->produced_char = 0;
3652 coding->consumed = coding->consumed_char = 0;
fb88bf2d 3653 coding->fake_multibyte = 0;
d46c5b12
KH
3654 return CODING_FINISH_NORMAL;
3655 }
4ed46869 3656
d46c5b12
KH
3657 switch (coding->type)
3658 {
0ef69138
KH
3659 case coding_type_emacs_mule:
3660 case coding_type_undecided:
27901516 3661 case coding_type_raw_text:
4ed46869 3662 if (coding->eol_type == CODING_EOL_LF
0ef69138 3663 || coding->eol_type == CODING_EOL_UNDECIDED)
4ed46869 3664 goto label_no_conversion;
d46c5b12 3665 result = encode_eol (coding, source, destination, src_bytes, dst_bytes);
4ed46869
KH
3666 break;
3667
3668 case coding_type_sjis:
d46c5b12
KH
3669 result = encode_coding_sjis_big5 (coding, source, destination,
3670 src_bytes, dst_bytes, 1);
4ed46869
KH
3671 break;
3672
3673 case coding_type_iso2022:
d46c5b12
KH
3674 result = encode_coding_iso2022 (coding, source, destination,
3675 src_bytes, dst_bytes);
4ed46869
KH
3676 break;
3677
3678 case coding_type_big5:
d46c5b12
KH
3679 result = encode_coding_sjis_big5 (coding, source, destination,
3680 src_bytes, dst_bytes, 0);
4ed46869
KH
3681 break;
3682
3683 case coding_type_ccl:
d46c5b12
KH
3684 result = ccl_coding_driver (coding, source, destination,
3685 src_bytes, dst_bytes, 1);
3686 break;
3687
3688 default: /* i.e. case coding_type_no_conversion: */
3689 label_no_conversion:
3690 if (dst_bytes && src_bytes > dst_bytes)
3691 {
3692 coding->produced = dst_bytes;
3693 result = CODING_FINISH_INSUFFICIENT_DST;
3694 }
3695 else
3696 {
3697 coding->produced = src_bytes;
3698 result = CODING_FINISH_NORMAL;
3699 }
3700 if (dst_bytes)
3701 bcopy (source, destination, coding->produced);
3702 else
3703 safe_bcopy (source, destination, coding->produced);
3704 if (coding->mode & CODING_MODE_SELECTIVE_DISPLAY)
3705 {
3706 unsigned char *p = destination, *pend = p + coding->produced;
3707 while (p < pend)
3708 if (*p++ == '\015') p[-1] = '\n';
3709 }
fb88bf2d 3710 coding->fake_multibyte = 1;
d46c5b12
KH
3711 coding->consumed
3712 = coding->consumed_char = coding->produced_char = coding->produced;
4ed46869
KH
3713 break;
3714 }
3715
d46c5b12 3716 return result;
4ed46869
KH
3717}
3718
fb88bf2d
KH
3719/* Scan text in the region between *BEG and *END (byte positions),
3720 skip characters which we don't have to decode by coding system
3721 CODING at the head and tail, then set *BEG and *END to the region
3722 of the text we actually have to convert. The caller should move
3723 the gap out of the region in advance.
4ed46869 3724
d46c5b12
KH
3725 If STR is not NULL, *BEG and *END are indices into STR. */
3726
3727static void
3728shrink_decoding_region (beg, end, coding, str)
3729 int *beg, *end;
3730 struct coding_system *coding;
3731 unsigned char *str;
3732{
fb88bf2d 3733 unsigned char *begp_orig, *begp, *endp_orig, *endp, c;
d46c5b12
KH
3734 int eol_conversion;
3735
3736 if (coding->type == coding_type_ccl
3737 || coding->type == coding_type_undecided
3738 || !NILP (coding->post_read_conversion))
3739 {
3740 /* We can't skip any data. */
3741 return;
3742 }
3743 else if (coding->type == coding_type_no_conversion)
3744 {
fb88bf2d
KH
3745 /* We need no conversion, but don't have to skip any data here.
3746 Decoding routine handles them effectively anyway. */
d46c5b12
KH
3747 return;
3748 }
3749
3750 if (coding->heading_ascii >= 0)
3751 /* Detection routine has already found how much we can skip at the
3752 head. */
3753 *beg += coding->heading_ascii;
3754
3755 if (str)
3756 {
3757 begp_orig = begp = str + *beg;
3758 endp_orig = endp = str + *end;
3759 }
3760 else
3761 {
fb88bf2d 3762 begp_orig = begp = BYTE_POS_ADDR (*beg);
d46c5b12
KH
3763 endp_orig = endp = begp + *end - *beg;
3764 }
3765
3766 eol_conversion = (coding->eol_type != CODING_EOL_LF);
3767
3768 switch (coding->type)
3769 {
3770 case coding_type_emacs_mule:
3771 case coding_type_raw_text:
3772 if (eol_conversion)
3773 {
3774 if (coding->heading_ascii < 0)
fb88bf2d
KH
3775 while (begp < endp && *begp != '\r' && *begp < 0x80) begp++;
3776 while (begp < endp && *(endp - 1) != '\r' && *(endp - 1) < 0x80)
3777 endp--;
d46c5b12
KH
3778 }
3779 else
3780 begp = endp;
3781 break;
3782
3783 case coding_type_sjis:
3784 case coding_type_big5:
3785 /* We can skip all ASCII characters at the head. */
3786 if (coding->heading_ascii < 0)
3787 {
3788 if (eol_conversion)
de9d083c 3789 while (begp < endp && *begp < 0x80 && *begp != '\r') begp++;
d46c5b12
KH
3790 else
3791 while (begp < endp && *begp < 0x80) begp++;
3792 }
3793 /* We can skip all ASCII characters at the tail except for the
3794 second byte of SJIS or BIG5 code. */
3795 if (eol_conversion)
de9d083c 3796 while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\r') endp--;
d46c5b12
KH
3797 else
3798 while (begp < endp && endp[-1] < 0x80) endp--;
3799 if (begp < endp && endp < endp_orig && endp[-1] >= 0x80)
3800 endp++;
3801 break;
3802
3803 default: /* i.e. case coding_type_iso2022: */
3804 if (coding->heading_ascii < 0)
3805 {
d46c5b12
KH
3806 /* We can skip all ASCII characters at the head except for a
3807 few control codes. */
3808 while (begp < endp && (c = *begp) < 0x80
3809 && c != ISO_CODE_CR && c != ISO_CODE_SO
3810 && c != ISO_CODE_SI && c != ISO_CODE_ESC
3811 && (!eol_conversion || c != ISO_CODE_LF))
3812 begp++;
3813 }
3814 switch (coding->category_idx)
3815 {
3816 case CODING_CATEGORY_IDX_ISO_8_1:
3817 case CODING_CATEGORY_IDX_ISO_8_2:
3818 /* We can skip all ASCII characters at the tail. */
3819 if (eol_conversion)
de9d083c 3820 while (begp < endp && (c = endp[-1]) < 0x80 && c != '\r') endp--;
d46c5b12
KH
3821 else
3822 while (begp < endp && endp[-1] < 0x80) endp--;
3823 break;
3824
3825 case CODING_CATEGORY_IDX_ISO_7:
3826 case CODING_CATEGORY_IDX_ISO_7_TIGHT:
3827 /* We can skip all charactes at the tail except for ESC and
3828 the following 2-byte at the tail. */
3829 if (eol_conversion)
fb88bf2d 3830 while (begp < endp
de9d083c 3831 && (c = endp[-1]) < 0x80 && c != ISO_CODE_ESC && c != '\r')
d46c5b12
KH
3832 endp--;
3833 else
fb88bf2d
KH
3834 while (begp < endp
3835 && (c = endp[-1]) < 0x80 && c != ISO_CODE_ESC)
d46c5b12
KH
3836 endp--;
3837 if (begp < endp && endp[-1] == ISO_CODE_ESC)
3838 {
3839 if (endp + 1 < endp_orig && end[0] == '(' && end[1] == 'B')
3840 /* This is an ASCII designation sequence. We can
3841 surely skip the tail. */
3842 endp += 2;
3843 else
3844 /* Hmmm, we can't skip the tail. */
3845 endp = endp_orig;
3846 }
3847 }
3848 }
3849 *beg += begp - begp_orig;
3850 *end += endp - endp_orig;
3851 return;
3852}
3853
3854/* Like shrink_decoding_region but for encoding. */
3855
3856static void
3857shrink_encoding_region (beg, end, coding, str)
3858 int *beg, *end;
3859 struct coding_system *coding;
3860 unsigned char *str;
3861{
3862 unsigned char *begp_orig, *begp, *endp_orig, *endp;
3863 int eol_conversion;
3864
3865 if (coding->type == coding_type_ccl)
3866 /* We can't skip any data. */
3867 return;
3868 else if (coding->type == coding_type_no_conversion)
3869 {
3870 /* We need no conversion. */
3871 *beg = *end;
3872 return;
3873 }
3874
3875 if (str)
3876 {
3877 begp_orig = begp = str + *beg;
3878 endp_orig = endp = str + *end;
3879 }
3880 else
3881 {
fb88bf2d 3882 begp_orig = begp = BYTE_POS_ADDR (*beg);
d46c5b12
KH
3883 endp_orig = endp = begp + *end - *beg;
3884 }
3885
3886 eol_conversion = (coding->eol_type == CODING_EOL_CR
3887 || coding->eol_type == CODING_EOL_CRLF);
3888
3889 /* Here, we don't have to check coding->pre_write_conversion because
3890 the caller is expected to have handled it already. */
3891 switch (coding->type)
3892 {
3893 case coding_type_undecided:
3894 case coding_type_emacs_mule:
3895 case coding_type_raw_text:
3896 if (eol_conversion)
3897 {
3898 while (begp < endp && *begp != '\n') begp++;
3899 while (begp < endp && endp[-1] != '\n') endp--;
3900 }
3901 else
3902 begp = endp;
3903 break;
3904
3905 case coding_type_iso2022:
3906 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL)
3907 {
3908 unsigned char *bol = begp;
3909 while (begp < endp && *begp < 0x80)
3910 {
3911 begp++;
3912 if (begp[-1] == '\n')
3913 bol = begp;
3914 }
3915 begp = bol;
3916 goto label_skip_tail;
3917 }
3918 /* fall down ... */
3919
3920 default:
3921 /* We can skip all ASCII characters at the head and tail. */
3922 if (eol_conversion)
3923 while (begp < endp && *begp < 0x80 && *begp != '\n') begp++;
3924 else
3925 while (begp < endp && *begp < 0x80) begp++;
3926 label_skip_tail:
3927 if (eol_conversion)
3928 while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\n') endp--;
3929 else
3930 while (begp < endp && *(endp - 1) < 0x80) endp--;
3931 break;
3932 }
3933
3934 *beg += begp - begp_orig;
3935 *end += endp - endp_orig;
3936 return;
3937}
3938
3939/* Decode (if ENCODEP is zero) or encode (if ENCODEP is nonzero) the
fb88bf2d
KH
3940 text from FROM to TO (byte positions are FROM_BYTE and TO_BYTE) by
3941 coding system CODING, and return the status code of code conversion
3942 (currently, this value has no meaning).
3943
3944 How many characters (and bytes) are converted to how many
3945 characters (and bytes) are recorded in members of the structure
3946 CODING.
d46c5b12 3947
6e44253b 3948 If REPLACE is nonzero, we do various things as if the original text
d46c5b12 3949 is deleted and a new text is inserted. See the comments in
6e44253b 3950 replace_range (insdel.c) to know what we are doing. */
4ed46869
KH
3951
3952int
6e44253b
KH
3953code_convert_region (from, from_byte, to, to_byte, coding, encodep, replace)
3954 int from, from_byte, to, to_byte, encodep, replace;
4ed46869 3955 struct coding_system *coding;
4ed46869 3956{
fb88bf2d
KH
3957 int len = to - from, len_byte = to_byte - from_byte;
3958 int require, inserted, inserted_byte;
12410ef1 3959 int head_skip, tail_skip, total_skip;
d46c5b12 3960 Lisp_Object saved_coding_symbol = Qnil;
fb88bf2d
KH
3961 int multibyte = !NILP (current_buffer->enable_multibyte_characters);
3962 int first = 1;
3963 int fake_multibyte = 0;
3964 unsigned char *src, *dst;
12410ef1 3965 Lisp_Object deletion = Qnil;
d46c5b12 3966
6e44253b 3967 if (replace)
d46c5b12 3968 {
fb88bf2d
KH
3969 int saved_from = from;
3970
d46c5b12 3971 prepare_to_modify_buffer (from, to, &from);
fb88bf2d
KH
3972 if (saved_from != from)
3973 {
3974 to = from + len;
3975 if (multibyte)
3976 from_byte = CHAR_TO_BYTE (from), to_byte = CHAR_TO_BYTE (to);
3977 else
3978 from_byte = from, to_byte = to;
3979 len_byte = to_byte - from_byte;
3980 }
d46c5b12 3981 }
d46c5b12
KH
3982
3983 if (! encodep && CODING_REQUIRE_DETECTION (coding))
3984 {
12410ef1 3985 /* We must detect encoding of text and eol format. */
d46c5b12
KH
3986
3987 if (from < GPT && to > GPT)
3988 move_gap_both (from, from_byte);
3989 if (coding->type == coding_type_undecided)
3990 {
fb88bf2d 3991 detect_coding (coding, BYTE_POS_ADDR (from_byte), len_byte);
d46c5b12 3992 if (coding->type == coding_type_undecided)
12410ef1
KH
3993 /* It seems that the text contains only ASCII, but we
3994 should not left it undecided because the deeper
3995 decoding routine (decode_coding) tries to detect the
3996 encodings again in vain. */
d46c5b12
KH
3997 coding->type = coding_type_emacs_mule;
3998 }
3999 if (coding->eol_type == CODING_EOL_UNDECIDED)
4000 {
4001 saved_coding_symbol = coding->symbol;
4002 detect_eol (coding, BYTE_POS_ADDR (from_byte), len_byte);
4003 if (coding->eol_type == CODING_EOL_UNDECIDED)
4004 coding->eol_type = CODING_EOL_LF;
4005 /* We had better recover the original eol format if we
4006 encounter an inconsitent eol format while decoding. */
4007 coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4008 }
4009 }
4010
fb88bf2d
KH
4011 coding->consumed_char = len, coding->consumed = len_byte;
4012
d46c5b12
KH
4013 if (encodep
4014 ? ! CODING_REQUIRE_ENCODING (coding)
4015 : ! CODING_REQUIRE_DECODING (coding))
fb88bf2d
KH
4016 {
4017 coding->produced = len_byte;
12410ef1
KH
4018 if (multibyte
4019 && ! replace
4020 /* See the comment of the member heading_ascii in coding.h. */
4021 && coding->heading_ascii < len_byte)
fb88bf2d 4022 {
6e44253b
KH
4023 /* We still may have to combine byte at the head and the
4024 tail of the text in the region. */
12410ef1 4025 if (from < GPT && GPT < to)
6e44253b 4026 move_gap_both (to, to_byte);
12410ef1
KH
4027 len = multibyte_chars_in_text (BYTE_POS_ADDR (from_byte), len_byte);
4028 adjust_after_insert (from, from_byte, to, to_byte, len);
4029 coding->produced_char = len;
fb88bf2d
KH
4030 }
4031 else
68e3a8f1
AS
4032 {
4033 if (!replace)
4034 adjust_after_insert (from, from_byte, to, to_byte, len_byte);
4035 coding->produced_char = len_byte;
4036 }
fb88bf2d
KH
4037 return 0;
4038 }
d46c5b12
KH
4039
4040 /* Now we convert the text. */
4041
4042 /* For encoding, we must process pre-write-conversion in advance. */
4043 if (encodep
d46c5b12
KH
4044 && ! NILP (coding->pre_write_conversion)
4045 && SYMBOLP (coding->pre_write_conversion)
4046 && ! NILP (Ffboundp (coding->pre_write_conversion)))
4047 {
2b4f9037
KH
4048 /* The function in pre-write-conversion may put a new text in a
4049 new buffer. */
d46c5b12
KH
4050 struct buffer *prev = current_buffer, *new;
4051
4052 call2 (coding->pre_write_conversion, from, to);
4053 if (current_buffer != prev)
4054 {
4055 len = ZV - BEGV;
4056 new = current_buffer;
4057 set_buffer_internal_1 (prev);
ddbc19ff 4058 del_range_2 (from, from_byte, to, to_byte);
d46c5b12
KH
4059 insert_from_buffer (new, BEG, len, 0);
4060 to = from + len;
fb88bf2d 4061 to_byte = multibyte ? CHAR_TO_BYTE (to) : to;
d46c5b12
KH
4062 len_byte = to_byte - from_byte;
4063 }
4064 }
4065
12410ef1
KH
4066 if (replace)
4067 deletion = make_buffer_string_both (from, from_byte, to, to_byte, 1);
4068
d46c5b12 4069 /* Try to skip the heading and tailing ASCIIs. */
12410ef1
KH
4070 {
4071 int from_byte_orig = from_byte, to_byte_orig = to_byte;
4072
4073 if (from < GPT && GPT < to)
4074 move_gap_both (from, from_byte);
4075 if (encodep)
4076 shrink_encoding_region (&from_byte, &to_byte, coding, NULL);
4077 else
4078 shrink_decoding_region (&from_byte, &to_byte, coding, NULL);
4079 if (from_byte == to_byte)
4080 {
4081 coding->produced = len_byte;
4082 coding->produced_char = multibyte ? len : len_byte;
4083 if (!replace)
4084 /* We must record and adjust for this new text now. */
4085 adjust_after_insert (from, from_byte_orig, to, to_byte_orig, len);
4086 return 0;
4087 }
fb88bf2d 4088
12410ef1
KH
4089 head_skip = from_byte - from_byte_orig;
4090 tail_skip = to_byte_orig - to_byte;
4091 total_skip = head_skip + tail_skip;
4092 from += head_skip;
4093 to -= tail_skip;
4094 len -= total_skip; len_byte -= total_skip;
4095 }
d46c5b12 4096
fb88bf2d
KH
4097 /* For converion, we must put the gap before the text in addition to
4098 making the gap larger for efficient decoding. The required gap
4099 size starts from 2000 which is the magic number used in make_gap.
4100 But, after one batch of conversion, it will be incremented if we
4101 find that it is not enough . */
d46c5b12
KH
4102 require = 2000;
4103
4104 if (GAP_SIZE < require)
4105 make_gap (require - GAP_SIZE);
4106 move_gap_both (from, from_byte);
4107
d46c5b12
KH
4108 if (GPT - BEG < beg_unchanged)
4109 beg_unchanged = GPT - BEG;
4110 if (Z - GPT < end_unchanged)
4111 end_unchanged = Z - GPT;
4112
4113 inserted = inserted_byte = 0;
fb88bf2d
KH
4114 src = GAP_END_ADDR, dst = GPT_ADDR;
4115
4116 GAP_SIZE += len_byte;
4117 ZV -= len;
4118 Z -= len;
4119 ZV_BYTE -= len_byte;
4120 Z_BYTE -= len_byte;
4121
d46c5b12
KH
4122 for (;;)
4123 {
fb88bf2d 4124 int result;
d46c5b12
KH
4125
4126 /* The buffer memory is changed from:
fb88bf2d
KH
4127 +--------+converted-text+---------+-------original-text------+---+
4128 |<-from->|<--inserted-->|---------|<-----------len---------->|---|
4129 |<------------------- GAP_SIZE -------------------->| */
d46c5b12 4130 if (encodep)
fb88bf2d 4131 result = encode_coding (coding, src, dst, len_byte, 0);
d46c5b12 4132 else
fb88bf2d 4133 result = decode_coding (coding, src, dst, len_byte, 0);
d46c5b12
KH
4134 /* to:
4135 +--------+-------converted-text--------+--+---original-text--+---+
fb88bf2d
KH
4136 |<-from->|<--inserted-->|<--produced-->|--|<-(len-consumed)->|---|
4137 |<------------------- GAP_SIZE -------------------->| */
4138 if (coding->fake_multibyte)
4139 fake_multibyte = 1;
d46c5b12 4140
fb88bf2d
KH
4141 if (!encodep && !multibyte)
4142 coding->produced_char = coding->produced;
d46c5b12
KH
4143 inserted += coding->produced_char;
4144 inserted_byte += coding->produced;
d46c5b12 4145 len_byte -= coding->consumed;
fb88bf2d
KH
4146 src += coding->consumed;
4147 dst += inserted_byte;
d46c5b12
KH
4148
4149 if (! encodep && result == CODING_FINISH_INCONSISTENT_EOL)
4150 {
fb88bf2d 4151 unsigned char *pend = dst, *p = pend - inserted_byte;
d46c5b12
KH
4152
4153 /* Encode LFs back to the original eol format (CR or CRLF). */
4154 if (coding->eol_type == CODING_EOL_CR)
4155 {
4156 while (p < pend) if (*p++ == '\n') p[-1] = '\r';
4157 }
4158 else
4159 {
d46c5b12
KH
4160 int count = 0;
4161
fb88bf2d
KH
4162 while (p < pend) if (*p++ == '\n') count++;
4163 if (src - dst < count)
d46c5b12 4164 {
fb88bf2d
KH
4165 /* We don't have sufficient room for putting LFs
4166 back to CRLF. We must record converted and
4167 not-yet-converted text back to the buffer
4168 content, enlarge the gap, then record them out of
4169 the buffer contents again. */
4170 int add = len_byte + inserted_byte;
4171
4172 GAP_SIZE -= add;
4173 ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
4174 GPT += inserted_byte; GPT_BYTE += inserted_byte;
4175 make_gap (count - GAP_SIZE);
4176 GAP_SIZE += add;
4177 ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
4178 GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
4179 /* Don't forget to update SRC, DST, and PEND. */
4180 src = GAP_END_ADDR - len_byte;
4181 dst = GPT_ADDR + inserted_byte;
4182 pend = dst;
d46c5b12 4183 }
d46c5b12
KH
4184 inserted += count;
4185 inserted_byte += count;
fb88bf2d
KH
4186 coding->produced += count;
4187 p = dst = pend + count;
4188 while (count)
4189 {
4190 *--p = *--pend;
4191 if (*p == '\n') count--, *--p = '\r';
4192 }
d46c5b12
KH
4193 }
4194
4195 /* Suppress eol-format conversion in the further conversion. */
4196 coding->eol_type = CODING_EOL_LF;
4197
4198 /* Restore the original symbol. */
4199 coding->symbol = saved_coding_symbol;
fb88bf2d
KH
4200
4201 continue;
d46c5b12
KH
4202 }
4203 if (len_byte <= 0)
4204 break;
4205 if (result == CODING_FINISH_INSUFFICIENT_SRC)
4206 {
4207 /* The source text ends in invalid codes. Let's just
4208 make them valid buffer contents, and finish conversion. */
fb88bf2d 4209 inserted += len_byte;
d46c5b12 4210 inserted_byte += len_byte;
fb88bf2d
KH
4211 while (len_byte--)
4212 *src++ = *dst++;
4213 fake_multibyte = 1;
d46c5b12
KH
4214 break;
4215 }
fb88bf2d
KH
4216 if (first)
4217 {
4218 /* We have just done the first batch of conversion which was
4219 stoped because of insufficient gap. Let's reconsider the
4220 required gap size (i.e. SRT - DST) now.
4221
4222 We have converted ORIG bytes (== coding->consumed) into
4223 NEW bytes (coding->produced). To convert the remaining
4224 LEN bytes, we may need REQUIRE bytes of gap, where:
4225 REQUIRE + LEN_BYTE = LEN_BYTE * (NEW / ORIG)
4226 REQUIRE = LEN_BYTE * (NEW - ORIG) / ORIG
4227 Here, we are sure that NEW >= ORIG. */
6e44253b
KH
4228 float ratio = coding->produced - coding->consumed;
4229 ratio /= coding->consumed;
4230 require = len_byte * ratio;
fb88bf2d
KH
4231 first = 0;
4232 }
4233 if ((src - dst) < (require + 2000))
4234 {
4235 /* See the comment above the previous call of make_gap. */
4236 int add = len_byte + inserted_byte;
4237
4238 GAP_SIZE -= add;
4239 ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
4240 GPT += inserted_byte; GPT_BYTE += inserted_byte;
4241 make_gap (require + 2000);
4242 GAP_SIZE += add;
4243 ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
4244 GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
4245 /* Don't forget to update SRC, DST. */
4246 src = GAP_END_ADDR - len_byte;
4247 dst = GPT_ADDR + inserted_byte;
4248 }
d46c5b12 4249 }
fb88bf2d
KH
4250 if (src - dst > 0) *dst = 0; /* Put an anchor. */
4251
2b4f9037 4252 if (multibyte
12410ef1
KH
4253 && (fake_multibyte
4254 || !encodep && (to - from) != (to_byte - from_byte)))
2b4f9037 4255 inserted = multibyte_chars_in_text (GPT_ADDR, inserted_byte);
7553d0e1 4256
12410ef1
KH
4257 /* If we have shrinked the conversion area, adjust it now. */
4258 if (total_skip > 0)
4259 {
4260 if (tail_skip > 0)
4261 safe_bcopy (GAP_END_ADDR, GPT_ADDR + inserted_byte, tail_skip);
4262 inserted += total_skip; inserted_byte += total_skip;
4263 GAP_SIZE += total_skip;
4264 GPT -= head_skip; GPT_BYTE -= head_skip;
4265 ZV -= total_skip; ZV_BYTE -= total_skip;
4266 Z -= total_skip; Z_BYTE -= total_skip;
4267 from -= head_skip; from_byte -= head_skip;
4268 to += tail_skip; to_byte += tail_skip;
4269 }
4270
4271 adjust_after_replace (from, from_byte, deletion, inserted, inserted_byte);
4ed46869 4272
2b4f9037 4273 if (! encodep && ! NILP (coding->post_read_conversion))
d46c5b12 4274 {
2b4f9037
KH
4275 Lisp_Object val;
4276 int orig_inserted = inserted, pos = PT;
4ed46869 4277
2b4f9037
KH
4278 if (from != pos)
4279 temp_set_point_both (current_buffer, from, from_byte);
4280 val = call1 (coding->post_read_conversion, make_number (inserted));
4281 if (! NILP (val))
d46c5b12 4282 {
2b4f9037
KH
4283 CHECK_NUMBER (val, 0);
4284 inserted = XFASTINT (val);
d46c5b12 4285 }
2b4f9037
KH
4286 if (pos >= from + orig_inserted)
4287 temp_set_point (current_buffer, pos + (inserted - orig_inserted));
d46c5b12 4288 }
4ed46869 4289
2b4f9037
KH
4290 signal_after_change (from, to - from, inserted);
4291
fb88bf2d 4292 {
12410ef1
KH
4293 coding->consumed = to_byte - from_byte;
4294 coding->consumed_char = to - from;
4295 coding->produced = inserted_byte;
4296 coding->produced_char = inserted;
fb88bf2d 4297 }
7553d0e1 4298
fb88bf2d 4299 return 0;
d46c5b12
KH
4300}
4301
4302Lisp_Object
4303code_convert_string (str, coding, encodep, nocopy)
4304 Lisp_Object str;
4ed46869 4305 struct coding_system *coding;
d46c5b12 4306 int encodep, nocopy;
4ed46869 4307{
d46c5b12
KH
4308 int len;
4309 char *buf;
fc932ac6
RS
4310 int from = 0, to = XSTRING (str)->size;
4311 int to_byte = STRING_BYTES (XSTRING (str));
d46c5b12
KH
4312 struct gcpro gcpro1;
4313 Lisp_Object saved_coding_symbol = Qnil;
4314 int result;
4ed46869 4315
d46c5b12
KH
4316 if (encodep && !NILP (coding->pre_write_conversion)
4317 || !encodep && !NILP (coding->post_read_conversion))
4318 {
4319 /* Since we have to call Lisp functions which assume target text
4320 is in a buffer, after setting a temporary buffer, call
4321 code_convert_region. */
4322 int count = specpdl_ptr - specpdl;
4323 struct buffer *prev = current_buffer;
4324
4325 record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
4326 temp_output_buffer_setup (" *code-converting-work*");
4327 set_buffer_internal (XBUFFER (Vstandard_output));
4328 if (encodep)
4329 insert_from_string (str, 0, 0, to, to_byte, 0);
4330 else
4331 {
4332 /* We must insert the contents of STR as is without
4333 unibyte<->multibyte conversion. */
4334 current_buffer->enable_multibyte_characters = Qnil;
4335 insert_from_string (str, 0, 0, to_byte, to_byte, 0);
4336 current_buffer->enable_multibyte_characters = Qt;
4337 }
fb88bf2d 4338 code_convert_region (BEGV, BEGV_BYTE, ZV, ZV_BYTE, coding, encodep, 1);
d46c5b12
KH
4339 if (encodep)
4340 /* We must return the buffer contents as unibyte string. */
4341 current_buffer->enable_multibyte_characters = Qnil;
4342 str = make_buffer_string (BEGV, ZV, 0);
4343 set_buffer_internal (prev);
4344 return unbind_to (count, str);
4345 }
4ed46869 4346
d46c5b12
KH
4347 if (! encodep && CODING_REQUIRE_DETECTION (coding))
4348 {
4349 /* See the comments in code_convert_region. */
4350 if (coding->type == coding_type_undecided)
4351 {
4352 detect_coding (coding, XSTRING (str)->data, to_byte);
4353 if (coding->type == coding_type_undecided)
4354 coding->type = coding_type_emacs_mule;
4355 }
4356 if (coding->eol_type == CODING_EOL_UNDECIDED)
4357 {
4358 saved_coding_symbol = coding->symbol;
4359 detect_eol (coding, XSTRING (str)->data, to_byte);
4360 if (coding->eol_type == CODING_EOL_UNDECIDED)
4361 coding->eol_type = CODING_EOL_LF;
4362 /* We had better recover the original eol format if we
4363 encounter an inconsitent eol format while decoding. */
4364 coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4365 }
4366 }
4ed46869 4367
d46c5b12
KH
4368 if (encodep
4369 ? ! CODING_REQUIRE_ENCODING (coding)
4370 : ! CODING_REQUIRE_DECODING (coding))
4371 from = to_byte;
4372 else
4373 {
4374 /* Try to skip the heading and tailing ASCIIs. */
4375 if (encodep)
4376 shrink_encoding_region (&from, &to_byte, coding, XSTRING (str)->data);
4377 else
4378 shrink_decoding_region (&from, &to_byte, coding, XSTRING (str)->data);
4379 }
4380 if (from == to_byte)
4381 return (nocopy ? str : Fcopy_sequence (str));
4ed46869 4382
d46c5b12
KH
4383 if (encodep)
4384 len = encoding_buffer_size (coding, to_byte - from);
4385 else
4386 len = decoding_buffer_size (coding, to_byte - from);
fc932ac6 4387 len += from + STRING_BYTES (XSTRING (str)) - to_byte;
d46c5b12
KH
4388 GCPRO1 (str);
4389 buf = get_conversion_buffer (len);
4390 UNGCPRO;
4ed46869 4391
d46c5b12
KH
4392 if (from > 0)
4393 bcopy (XSTRING (str)->data, buf, from);
4394 result = (encodep
4395 ? encode_coding (coding, XSTRING (str)->data + from,
4396 buf + from, to_byte - from, len)
4397 : decode_coding (coding, XSTRING (str)->data + from,
f30cc612 4398 buf + from, to_byte - from, len));
d46c5b12 4399 if (! encodep && result == CODING_FINISH_INCONSISTENT_EOL)
4ed46869 4400 {
d46c5b12
KH
4401 /* We simple try to decode the whole string again but without
4402 eol-conversion this time. */
4403 coding->eol_type = CODING_EOL_LF;
4404 coding->symbol = saved_coding_symbol;
4405 return code_convert_string (str, coding, encodep, nocopy);
4ed46869 4406 }
d46c5b12
KH
4407
4408 bcopy (XSTRING (str)->data + to_byte, buf + from + coding->produced,
fc932ac6 4409 STRING_BYTES (XSTRING (str)) - to_byte);
d46c5b12 4410
fc932ac6 4411 len = from + STRING_BYTES (XSTRING (str)) - to_byte;
d46c5b12
KH
4412 if (encodep)
4413 str = make_unibyte_string (buf, len + coding->produced);
4414 else
bbdf84bd
RS
4415 str = make_string_from_bytes (buf, len + coding->produced_char,
4416 len + coding->produced);
d46c5b12 4417 return str;
4ed46869
KH
4418}
4419
4420\f
4421#ifdef emacs
4422/*** 7. Emacs Lisp library functions ***/
4423
4ed46869
KH
4424DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
4425 "Return t if OBJECT is nil or a coding-system.\n\
3a73fa5d
RS
4426See the documentation of `make-coding-system' for information\n\
4427about coding-system objects.")
4ed46869
KH
4428 (obj)
4429 Lisp_Object obj;
4430{
4608c386
KH
4431 if (NILP (obj))
4432 return Qt;
4433 if (!SYMBOLP (obj))
4434 return Qnil;
4435 /* Get coding-spec vector for OBJ. */
4436 obj = Fget (obj, Qcoding_system);
4437 return ((VECTORP (obj) && XVECTOR (obj)->size == 5)
4438 ? Qt : Qnil);
4ed46869
KH
4439}
4440
9d991de8
RS
4441DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
4442 Sread_non_nil_coding_system, 1, 1, 0,
e0e989f6 4443 "Read a coding system from the minibuffer, prompting with string PROMPT.")
4ed46869
KH
4444 (prompt)
4445 Lisp_Object prompt;
4446{
e0e989f6 4447 Lisp_Object val;
9d991de8
RS
4448 do
4449 {
4608c386
KH
4450 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
4451 Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
9d991de8
RS
4452 }
4453 while (XSTRING (val)->size == 0);
e0e989f6 4454 return (Fintern (val, Qnil));
4ed46869
KH
4455}
4456
9b787f3e
RS
4457DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
4458 "Read a coding system from the minibuffer, prompting with string PROMPT.\n\
4459If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.")
4460 (prompt, default_coding_system)
4461 Lisp_Object prompt, default_coding_system;
4ed46869 4462{
f44d27ce 4463 Lisp_Object val;
9b787f3e
RS
4464 if (SYMBOLP (default_coding_system))
4465 XSETSTRING (default_coding_system, XSYMBOL (default_coding_system)->name);
4608c386 4466 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
9b787f3e
RS
4467 Qt, Qnil, Qcoding_system_history,
4468 default_coding_system, Qnil);
e0e989f6 4469 return (XSTRING (val)->size == 0 ? Qnil : Fintern (val, Qnil));
4ed46869
KH
4470}
4471
4472DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
4473 1, 1, 0,
4474 "Check validity of CODING-SYSTEM.\n\
3a73fa5d
RS
4475If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.\n\
4476It is valid if it is a symbol with a non-nil `coding-system' property.\n\
4ed46869
KH
4477The value of property should be a vector of length 5.")
4478 (coding_system)
4479 Lisp_Object coding_system;
4480{
4481 CHECK_SYMBOL (coding_system, 0);
4482 if (!NILP (Fcoding_system_p (coding_system)))
4483 return coding_system;
4484 while (1)
02ba4723 4485 Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
4ed46869 4486}
3a73fa5d 4487\f
d46c5b12
KH
4488Lisp_Object
4489detect_coding_system (src, src_bytes, highest)
4490 unsigned char *src;
4491 int src_bytes, highest;
4ed46869
KH
4492{
4493 int coding_mask, eol_type;
d46c5b12
KH
4494 Lisp_Object val, tmp;
4495 int dummy;
4ed46869 4496
d46c5b12
KH
4497 coding_mask = detect_coding_mask (src, src_bytes, NULL, &dummy);
4498 eol_type = detect_eol_type (src, src_bytes, &dummy);
4499 if (eol_type == CODING_EOL_INCONSISTENT)
4500 eol_type == CODING_EOL_UNDECIDED;
4ed46869 4501
d46c5b12 4502 if (!coding_mask)
4ed46869 4503 {
27901516 4504 val = Qundecided;
d46c5b12 4505 if (eol_type != CODING_EOL_UNDECIDED)
4ed46869 4506 {
f44d27ce
RS
4507 Lisp_Object val2;
4508 val2 = Fget (Qundecided, Qeol_type);
4ed46869
KH
4509 if (VECTORP (val2))
4510 val = XVECTOR (val2)->contents[eol_type];
4511 }
d46c5b12 4512 return val;
4ed46869 4513 }
4ed46869 4514
d46c5b12
KH
4515 /* At first, gather possible coding systems in VAL. */
4516 val = Qnil;
4517 for (tmp = Vcoding_category_list; !NILP (tmp); tmp = XCONS (tmp)->cdr)
4ed46869 4518 {
d46c5b12
KH
4519 int idx
4520 = XFASTINT (Fget (XCONS (tmp)->car, Qcoding_category_index));
4521 if (coding_mask & (1 << idx))
4ed46869 4522 {
d46c5b12
KH
4523 val = Fcons (Fsymbol_value (XCONS (tmp)->car), val);
4524 if (highest)
4525 break;
4ed46869
KH
4526 }
4527 }
d46c5b12
KH
4528 if (!highest)
4529 val = Fnreverse (val);
4ed46869 4530
d46c5b12
KH
4531 /* Then, substitute the elements by subsidiary coding systems. */
4532 for (tmp = val; !NILP (tmp); tmp = XCONS (tmp)->cdr)
4ed46869 4533 {
d46c5b12 4534 if (eol_type != CODING_EOL_UNDECIDED)
4ed46869 4535 {
d46c5b12
KH
4536 Lisp_Object eol;
4537 eol = Fget (XCONS (tmp)->car, Qeol_type);
4538 if (VECTORP (eol))
4539 XCONS (tmp)->car = XVECTOR (eol)->contents[eol_type];
4ed46869
KH
4540 }
4541 }
d46c5b12
KH
4542 return (highest ? XCONS (val)->car : val);
4543}
4ed46869 4544
d46c5b12
KH
4545DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
4546 2, 3, 0,
4547 "Detect coding system of the text in the region between START and END.\n\
4548Return a list of possible coding systems ordered by priority.\n\
4549\n\
4550If only ASCII characters are found, it returns `undecided'\n\
4551or its subsidiary coding system according to a detected end-of-line format.\n\
4552\n\
4553If optional argument HIGHEST is non-nil, return the coding system of\n\
4554highest priority.")
4555 (start, end, highest)
4556 Lisp_Object start, end, highest;
4557{
4558 int from, to;
4559 int from_byte, to_byte;
6289dd10 4560
d46c5b12
KH
4561 CHECK_NUMBER_COERCE_MARKER (start, 0);
4562 CHECK_NUMBER_COERCE_MARKER (end, 1);
4ed46869 4563
d46c5b12
KH
4564 validate_region (&start, &end);
4565 from = XINT (start), to = XINT (end);
4566 from_byte = CHAR_TO_BYTE (from);
4567 to_byte = CHAR_TO_BYTE (to);
6289dd10 4568
d46c5b12
KH
4569 if (from < GPT && to >= GPT)
4570 move_gap_both (to, to_byte);
4ed46869 4571
d46c5b12
KH
4572 return detect_coding_system (BYTE_POS_ADDR (from_byte),
4573 to_byte - from_byte,
4574 !NILP (highest));
4575}
6289dd10 4576
d46c5b12
KH
4577DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
4578 1, 2, 0,
4579 "Detect coding system of the text in STRING.\n\
4580Return a list of possible coding systems ordered by priority.\n\
4581\n\
4582If only ASCII characters are found, it returns `undecided'\n\
4583or its subsidiary coding system according to a detected end-of-line format.\n\
4584\n\
4585If optional argument HIGHEST is non-nil, return the coding system of\n\
4586highest priority.")
4587 (string, highest)
4588 Lisp_Object string, highest;
4589{
4590 CHECK_STRING (string, 0);
4ed46869 4591
d46c5b12 4592 return detect_coding_system (XSTRING (string)->data,
fc932ac6 4593 STRING_BYTES (XSTRING (string)),
d46c5b12 4594 !NILP (highest));
4ed46869
KH
4595}
4596
4031e2bf
KH
4597Lisp_Object
4598code_convert_region1 (start, end, coding_system, encodep)
d46c5b12 4599 Lisp_Object start, end, coding_system;
4031e2bf 4600 int encodep;
3a73fa5d
RS
4601{
4602 struct coding_system coding;
4031e2bf 4603 int from, to, len;
3a73fa5d 4604
d46c5b12
KH
4605 CHECK_NUMBER_COERCE_MARKER (start, 0);
4606 CHECK_NUMBER_COERCE_MARKER (end, 1);
3a73fa5d
RS
4607 CHECK_SYMBOL (coding_system, 2);
4608
d46c5b12
KH
4609 validate_region (&start, &end);
4610 from = XFASTINT (start);
4611 to = XFASTINT (end);
4612
3a73fa5d 4613 if (NILP (coding_system))
d46c5b12
KH
4614 return make_number (to - from);
4615
3a73fa5d 4616 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
d46c5b12 4617 error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data);
3a73fa5d 4618
d46c5b12 4619 coding.mode |= CODING_MODE_LAST_BLOCK;
fb88bf2d
KH
4620 code_convert_region (from, CHAR_TO_BYTE (from), to, CHAR_TO_BYTE (to),
4621 &coding, encodep, 1);
4622 return make_number (coding.produced_char);
4031e2bf
KH
4623}
4624
4625DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
4626 3, 3, "r\nzCoding system: ",
4627 "Decode the current region by specified coding system.\n\
4628When called from a program, takes three arguments:\n\
4629START, END, and CODING-SYSTEM. START and END are buffer positions.\n\
4630Return length of decoded text.")
4631 (start, end, coding_system)
4632 Lisp_Object start, end, coding_system;
4633{
4634 return code_convert_region1 (start, end, coding_system, 0);
3a73fa5d
RS
4635}
4636
4637DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
4638 3, 3, "r\nzCoding system: ",
d46c5b12 4639 "Encode the current region by specified coding system.\n\
3a73fa5d 4640When called from a program, takes three arguments:\n\
d46c5b12 4641START, END, and CODING-SYSTEM. START and END are buffer positions.\n\
3a73fa5d 4642Return length of encoded text.")
d46c5b12
KH
4643 (start, end, coding_system)
4644 Lisp_Object start, end, coding_system;
3a73fa5d 4645{
4031e2bf
KH
4646 return code_convert_region1 (start, end, coding_system, 1);
4647}
3a73fa5d 4648
4031e2bf
KH
4649Lisp_Object
4650code_convert_string1 (string, coding_system, nocopy, encodep)
4651 Lisp_Object string, coding_system, nocopy;
4652 int encodep;
4653{
4654 struct coding_system coding;
3a73fa5d 4655
4031e2bf
KH
4656 CHECK_STRING (string, 0);
4657 CHECK_SYMBOL (coding_system, 1);
4ed46869 4658
d46c5b12 4659 if (NILP (coding_system))
4031e2bf 4660 return (NILP (nocopy) ? Fcopy_sequence (string) : string);
4ed46869 4661
d46c5b12
KH
4662 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
4663 error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data);
5f1cd180 4664
d46c5b12 4665 coding.mode |= CODING_MODE_LAST_BLOCK;
4031e2bf 4666 return code_convert_string (string, &coding, encodep, !NILP (nocopy));
4ed46869
KH
4667}
4668
4ed46869 4669DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
e0e989f6
KH
4670 2, 3, 0,
4671 "Decode STRING which is encoded in CODING-SYSTEM, and return the result.\n\
fe487a71
RS
4672Optional arg NOCOPY non-nil means it is ok to return STRING itself\n\
4673if the decoding operation is trivial.")
e0e989f6
KH
4674 (string, coding_system, nocopy)
4675 Lisp_Object string, coding_system, nocopy;
4ed46869 4676{
4031e2bf 4677 return code_convert_string1(string, coding_system, nocopy, 0);
4ed46869
KH
4678}
4679
4680DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
e0e989f6
KH
4681 2, 3, 0,
4682 "Encode STRING to CODING-SYSTEM, and return the result.\n\
fe487a71
RS
4683Optional arg NOCOPY non-nil means it is ok to return STRING itself\n\
4684if the encoding operation is trivial.")
e0e989f6
KH
4685 (string, coding_system, nocopy)
4686 Lisp_Object string, coding_system, nocopy;
4ed46869 4687{
4031e2bf 4688 return code_convert_string1(string, coding_system, nocopy, 1);
4ed46869 4689}
4031e2bf 4690
3a73fa5d 4691\f
4ed46869 4692DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
e0e989f6 4693 "Decode a JISX0208 character of shift-jis encoding.\n\
4ed46869
KH
4694CODE is the character code in SJIS.\n\
4695Return the corresponding character.")
4696 (code)
4697 Lisp_Object code;
4698{
4699 unsigned char c1, c2, s1, s2;
4700 Lisp_Object val;
4701
4702 CHECK_NUMBER (code, 0);
4703 s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
4704 DECODE_SJIS (s1, s2, c1, c2);
4705 XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset_jisx0208, c1, c2));
4706 return val;
4707}
4708
4709DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
d46c5b12 4710 "Encode a JISX0208 character CHAR to SJIS coding system.\n\
4ed46869
KH
4711Return the corresponding character code in SJIS.")
4712 (ch)
4713 Lisp_Object ch;
4714{
bcf26d6a 4715 int charset, c1, c2, s1, s2;
4ed46869
KH
4716 Lisp_Object val;
4717
4718 CHECK_NUMBER (ch, 0);
4719 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
4720 if (charset == charset_jisx0208)
4721 {
4722 ENCODE_SJIS (c1, c2, s1, s2);
bcf26d6a 4723 XSETFASTINT (val, (s1 << 8) | s2);
4ed46869
KH
4724 }
4725 else
4726 XSETFASTINT (val, 0);
4727 return val;
4728}
4729
4730DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
d46c5b12 4731 "Decode a Big5 character CODE of BIG5 coding system.\n\
4ed46869
KH
4732CODE is the character code in BIG5.\n\
4733Return the corresponding character.")
4734 (code)
4735 Lisp_Object code;
4736{
4737 int charset;
4738 unsigned char b1, b2, c1, c2;
4739 Lisp_Object val;
4740
4741 CHECK_NUMBER (code, 0);
4742 b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF;
4743 DECODE_BIG5 (b1, b2, charset, c1, c2);
4744 XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset, c1, c2));
4745 return val;
4746}
4747
4748DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
d46c5b12 4749 "Encode the Big5 character CHAR to BIG5 coding system.\n\
4ed46869
KH
4750Return the corresponding character code in Big5.")
4751 (ch)
4752 Lisp_Object ch;
4753{
bcf26d6a 4754 int charset, c1, c2, b1, b2;
4ed46869
KH
4755 Lisp_Object val;
4756
4757 CHECK_NUMBER (ch, 0);
4758 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
4759 if (charset == charset_big5_1 || charset == charset_big5_2)
4760 {
4761 ENCODE_BIG5 (charset, c1, c2, b1, b2);
bcf26d6a 4762 XSETFASTINT (val, (b1 << 8) | b2);
4ed46869
KH
4763 }
4764 else
4765 XSETFASTINT (val, 0);
4766 return val;
4767}
3a73fa5d 4768\f
1ba9e4ab
KH
4769DEFUN ("set-terminal-coding-system-internal",
4770 Fset_terminal_coding_system_internal,
4771 Sset_terminal_coding_system_internal, 1, 1, 0, "")
4ed46869
KH
4772 (coding_system)
4773 Lisp_Object coding_system;
4774{
4775 CHECK_SYMBOL (coding_system, 0);
4776 setup_coding_system (Fcheck_coding_system (coding_system), &terminal_coding);
70c22245 4777 /* We had better not send unsafe characters to terminal. */
6e85d753
KH
4778 terminal_coding.flags |= CODING_FLAG_ISO_SAFE;
4779
4ed46869
KH
4780 return Qnil;
4781}
4782
c4825358
KH
4783DEFUN ("set-safe-terminal-coding-system-internal",
4784 Fset_safe_terminal_coding_system_internal,
4785 Sset_safe_terminal_coding_system_internal, 1, 1, 0, "")
4786 (coding_system)
4787 Lisp_Object coding_system;
4788{
4789 CHECK_SYMBOL (coding_system, 0);
4790 setup_coding_system (Fcheck_coding_system (coding_system),
4791 &safe_terminal_coding);
4792 return Qnil;
4793}
4794
4ed46869
KH
4795DEFUN ("terminal-coding-system",
4796 Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
3a73fa5d 4797 "Return coding system specified for terminal output.")
4ed46869
KH
4798 ()
4799{
4800 return terminal_coding.symbol;
4801}
4802
1ba9e4ab
KH
4803DEFUN ("set-keyboard-coding-system-internal",
4804 Fset_keyboard_coding_system_internal,
4805 Sset_keyboard_coding_system_internal, 1, 1, 0, "")
4ed46869
KH
4806 (coding_system)
4807 Lisp_Object coding_system;
4808{
4809 CHECK_SYMBOL (coding_system, 0);
4810 setup_coding_system (Fcheck_coding_system (coding_system), &keyboard_coding);
4811 return Qnil;
4812}
4813
4814DEFUN ("keyboard-coding-system",
4815 Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
3a73fa5d 4816 "Return coding system specified for decoding keyboard input.")
4ed46869
KH
4817 ()
4818{
4819 return keyboard_coding.symbol;
4820}
4821
4822\f
a5d301df
KH
4823DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
4824 Sfind_operation_coding_system, 1, MANY, 0,
4825 "Choose a coding system for an operation based on the target name.\n\
9ce27fde
KH
4826The value names a pair of coding systems: (DECODING-SYSTEM ENCODING-SYSTEM).\n\
4827DECODING-SYSTEM is the coding system to use for decoding\n\
4828\(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system\n\
4829for encoding (in case OPERATION does encoding).\n\
ccdb79f5
RS
4830\n\
4831The first argument OPERATION specifies an I/O primitive:\n\
4832 For file I/O, `insert-file-contents' or `write-region'.\n\
4833 For process I/O, `call-process', `call-process-region', or `start-process'.\n\
4834 For network I/O, `open-network-stream'.\n\
4835\n\
4836The remaining arguments should be the same arguments that were passed\n\
4837to the primitive. Depending on which primitive, one of those arguments\n\
4838is selected as the TARGET. For example, if OPERATION does file I/O,\n\
4839whichever argument specifies the file name is TARGET.\n\
4840\n\
4841TARGET has a meaning which depends on OPERATION:\n\
4ed46869
KH
4842 For file I/O, TARGET is a file name.\n\
4843 For process I/O, TARGET is a process name.\n\
4844 For network I/O, TARGET is a service name or a port number\n\
4845\n\
02ba4723
KH
4846This function looks up what specified for TARGET in,\n\
4847`file-coding-system-alist', `process-coding-system-alist',\n\
4848or `network-coding-system-alist' depending on OPERATION.\n\
4849They may specify a coding system, a cons of coding systems,\n\
4850or a function symbol to call.\n\
4851In the last case, we call the function with one argument,\n\
9ce27fde 4852which is a list of all the arguments given to this function.")
4ed46869
KH
4853 (nargs, args)
4854 int nargs;
4855 Lisp_Object *args;
4856{
4857 Lisp_Object operation, target_idx, target, val;
4858 register Lisp_Object chain;
4859
4860 if (nargs < 2)
4861 error ("Too few arguments");
4862 operation = args[0];
4863 if (!SYMBOLP (operation)
4864 || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
4865 error ("Invalid first arguement");
4866 if (nargs < 1 + XINT (target_idx))
4867 error ("Too few arguments for operation: %s",
4868 XSYMBOL (operation)->name->data);
4869 target = args[XINT (target_idx) + 1];
4870 if (!(STRINGP (target)
4871 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
4872 error ("Invalid %dth argument", XINT (target_idx) + 1);
4873
2e34157c
RS
4874 chain = ((EQ (operation, Qinsert_file_contents)
4875 || EQ (operation, Qwrite_region))
02ba4723 4876 ? Vfile_coding_system_alist
2e34157c 4877 : (EQ (operation, Qopen_network_stream)
02ba4723
KH
4878 ? Vnetwork_coding_system_alist
4879 : Vprocess_coding_system_alist));
4ed46869
KH
4880 if (NILP (chain))
4881 return Qnil;
4882
02ba4723 4883 for (; CONSP (chain); chain = XCONS (chain)->cdr)
4ed46869 4884 {
f44d27ce
RS
4885 Lisp_Object elt;
4886 elt = XCONS (chain)->car;
4ed46869
KH
4887
4888 if (CONSP (elt)
4889 && ((STRINGP (target)
4890 && STRINGP (XCONS (elt)->car)
4891 && fast_string_match (XCONS (elt)->car, target) >= 0)
4892 || (INTEGERP (target) && EQ (target, XCONS (elt)->car))))
02ba4723
KH
4893 {
4894 val = XCONS (elt)->cdr;
b19fd4c5
KH
4895 /* Here, if VAL is both a valid coding system and a valid
4896 function symbol, we return VAL as a coding system. */
02ba4723
KH
4897 if (CONSP (val))
4898 return val;
4899 if (! SYMBOLP (val))
4900 return Qnil;
4901 if (! NILP (Fcoding_system_p (val)))
4902 return Fcons (val, val);
b19fd4c5
KH
4903 if (! NILP (Ffboundp (val)))
4904 {
4905 val = call1 (val, Flist (nargs, args));
4906 if (CONSP (val))
4907 return val;
4908 if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
4909 return Fcons (val, val);
4910 }
02ba4723
KH
4911 return Qnil;
4912 }
4ed46869
KH
4913 }
4914 return Qnil;
4915}
4916
d46c5b12
KH
4917DEFUN ("update-iso-coding-systems", Fupdate_iso_coding_systems,
4918 Supdate_iso_coding_systems, 0, 0, 0,
4919 "Update internal database for ISO2022 based coding systems.\n\
4920When values of the following coding categories are changed, you must\n\
4921call this function:\n\
4922 coding-category-iso-7, coding-category-iso-7-tight,\n\
4923 coding-category-iso-8-1, coding-category-iso-8-2,\n\
4924 coding-category-iso-7-else, coding-category-iso-8-else")
4925 ()
4926{
4927 int i;
4928
4929 for (i = CODING_CATEGORY_IDX_ISO_7; i <= CODING_CATEGORY_IDX_ISO_8_ELSE;
4930 i++)
4931 {
4932 if (! coding_system_table[i])
4933 coding_system_table[i]
4934 = (struct coding_system *) xmalloc (sizeof (struct coding_system));
4935 setup_coding_system
4936 (XSYMBOL (XVECTOR (Vcoding_category_table)->contents[i])->value,
4937 coding_system_table[i]);
4938 }
4939 return Qnil;
4940}
4941
4ed46869
KH
4942#endif /* emacs */
4943
4944\f
4945/*** 8. Post-amble ***/
4946
dfcf069d 4947void
4ed46869
KH
4948init_coding_once ()
4949{
4950 int i;
4951
0ef69138 4952 /* Emacs' internal format specific initialize routine. */
4ed46869
KH
4953 for (i = 0; i <= 0x20; i++)
4954 emacs_code_class[i] = EMACS_control_code;
4955 emacs_code_class[0x0A] = EMACS_linefeed_code;
4956 emacs_code_class[0x0D] = EMACS_carriage_return_code;
4957 for (i = 0x21 ; i < 0x7F; i++)
4958 emacs_code_class[i] = EMACS_ascii_code;
4959 emacs_code_class[0x7F] = EMACS_control_code;
4960 emacs_code_class[0x80] = EMACS_leading_code_composition;
4961 for (i = 0x81; i < 0xFF; i++)
4962 emacs_code_class[i] = EMACS_invalid_code;
4963 emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
4964 emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
4965 emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
4966 emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;
4967
4968 /* ISO2022 specific initialize routine. */
4969 for (i = 0; i < 0x20; i++)
4970 iso_code_class[i] = ISO_control_code;
4971 for (i = 0x21; i < 0x7F; i++)
4972 iso_code_class[i] = ISO_graphic_plane_0;
4973 for (i = 0x80; i < 0xA0; i++)
4974 iso_code_class[i] = ISO_control_code;
4975 for (i = 0xA1; i < 0xFF; i++)
4976 iso_code_class[i] = ISO_graphic_plane_1;
4977 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
4978 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
4979 iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
4980 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
4981 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
4982 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
4983 iso_code_class[ISO_CODE_ESC] = ISO_escape;
4984 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
4985 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
4986 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
4987
e0e989f6
KH
4988 conversion_buffer_size = MINIMUM_CONVERSION_BUFFER_SIZE;
4989 conversion_buffer = (char *) xmalloc (MINIMUM_CONVERSION_BUFFER_SIZE);
4990
4991 setup_coding_system (Qnil, &keyboard_coding);
4992 setup_coding_system (Qnil, &terminal_coding);
c4825358 4993 setup_coding_system (Qnil, &safe_terminal_coding);
9ce27fde 4994
d46c5b12
KH
4995 bzero (coding_system_table, sizeof coding_system_table);
4996
9ce27fde
KH
4997#if defined (MSDOS) || defined (WINDOWSNT)
4998 system_eol_type = CODING_EOL_CRLF;
4999#else
5000 system_eol_type = CODING_EOL_LF;
5001#endif
e0e989f6
KH
5002}
5003
5004#ifdef emacs
5005
dfcf069d 5006void
e0e989f6
KH
5007syms_of_coding ()
5008{
5009 Qtarget_idx = intern ("target-idx");
5010 staticpro (&Qtarget_idx);
5011
bb0115a2
RS
5012 Qcoding_system_history = intern ("coding-system-history");
5013 staticpro (&Qcoding_system_history);
5014 Fset (Qcoding_system_history, Qnil);
5015
9ce27fde 5016 /* Target FILENAME is the first argument. */
e0e989f6 5017 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
9ce27fde 5018 /* Target FILENAME is the third argument. */
e0e989f6
KH
5019 Fput (Qwrite_region, Qtarget_idx, make_number (2));
5020
5021 Qcall_process = intern ("call-process");
5022 staticpro (&Qcall_process);
9ce27fde 5023 /* Target PROGRAM is the first argument. */
e0e989f6
KH
5024 Fput (Qcall_process, Qtarget_idx, make_number (0));
5025
5026 Qcall_process_region = intern ("call-process-region");
5027 staticpro (&Qcall_process_region);
9ce27fde 5028 /* Target PROGRAM is the third argument. */
e0e989f6
KH
5029 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
5030
5031 Qstart_process = intern ("start-process");
5032 staticpro (&Qstart_process);
9ce27fde 5033 /* Target PROGRAM is the third argument. */
e0e989f6
KH
5034 Fput (Qstart_process, Qtarget_idx, make_number (2));
5035
5036 Qopen_network_stream = intern ("open-network-stream");
5037 staticpro (&Qopen_network_stream);
9ce27fde 5038 /* Target SERVICE is the fourth argument. */
e0e989f6
KH
5039 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
5040
4ed46869
KH
5041 Qcoding_system = intern ("coding-system");
5042 staticpro (&Qcoding_system);
5043
5044 Qeol_type = intern ("eol-type");
5045 staticpro (&Qeol_type);
5046
5047 Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
5048 staticpro (&Qbuffer_file_coding_system);
5049
5050 Qpost_read_conversion = intern ("post-read-conversion");
5051 staticpro (&Qpost_read_conversion);
5052
5053 Qpre_write_conversion = intern ("pre-write-conversion");
5054 staticpro (&Qpre_write_conversion);
5055
27901516
KH
5056 Qno_conversion = intern ("no-conversion");
5057 staticpro (&Qno_conversion);
5058
5059 Qundecided = intern ("undecided");
5060 staticpro (&Qundecided);
5061
4ed46869
KH
5062 Qcoding_system_p = intern ("coding-system-p");
5063 staticpro (&Qcoding_system_p);
5064
5065 Qcoding_system_error = intern ("coding-system-error");
5066 staticpro (&Qcoding_system_error);
5067
5068 Fput (Qcoding_system_error, Qerror_conditions,
5069 Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
5070 Fput (Qcoding_system_error, Qerror_message,
9ce27fde 5071 build_string ("Invalid coding system"));
4ed46869 5072
d46c5b12
KH
5073 Qcoding_category = intern ("coding-category");
5074 staticpro (&Qcoding_category);
4ed46869
KH
5075 Qcoding_category_index = intern ("coding-category-index");
5076 staticpro (&Qcoding_category_index);
5077
d46c5b12
KH
5078 Vcoding_category_table
5079 = Fmake_vector (make_number (CODING_CATEGORY_IDX_MAX), Qnil);
5080 staticpro (&Vcoding_category_table);
4ed46869
KH
5081 {
5082 int i;
5083 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
5084 {
d46c5b12
KH
5085 XVECTOR (Vcoding_category_table)->contents[i]
5086 = intern (coding_category_name[i]);
5087 Fput (XVECTOR (Vcoding_category_table)->contents[i],
5088 Qcoding_category_index, make_number (i));
4ed46869
KH
5089 }
5090 }
5091
bdd9fb48
KH
5092 Qcharacter_unification_table = intern ("character-unification-table");
5093 staticpro (&Qcharacter_unification_table);
5094 Fput (Qcharacter_unification_table, Qchar_table_extra_slots,
5095 make_number (0));
5096
a5d301df
KH
5097 Qcharacter_unification_table_for_decode
5098 = intern ("character-unification-table-for-decode");
5099 staticpro (&Qcharacter_unification_table_for_decode);
5100
5101 Qcharacter_unification_table_for_encode
5102 = intern ("character-unification-table-for-encode");
5103 staticpro (&Qcharacter_unification_table_for_encode);
5104
70c22245
KH
5105 Qsafe_charsets = intern ("safe-charsets");
5106 staticpro (&Qsafe_charsets);
5107
9ce27fde
KH
5108 Qemacs_mule = intern ("emacs-mule");
5109 staticpro (&Qemacs_mule);
5110
d46c5b12
KH
5111 Qraw_text = intern ("raw-text");
5112 staticpro (&Qraw_text);
5113
4ed46869
KH
5114 defsubr (&Scoding_system_p);
5115 defsubr (&Sread_coding_system);
5116 defsubr (&Sread_non_nil_coding_system);
5117 defsubr (&Scheck_coding_system);
5118 defsubr (&Sdetect_coding_region);
d46c5b12 5119 defsubr (&Sdetect_coding_string);
4ed46869
KH
5120 defsubr (&Sdecode_coding_region);
5121 defsubr (&Sencode_coding_region);
5122 defsubr (&Sdecode_coding_string);
5123 defsubr (&Sencode_coding_string);
5124 defsubr (&Sdecode_sjis_char);
5125 defsubr (&Sencode_sjis_char);
5126 defsubr (&Sdecode_big5_char);
5127 defsubr (&Sencode_big5_char);
1ba9e4ab 5128 defsubr (&Sset_terminal_coding_system_internal);
c4825358 5129 defsubr (&Sset_safe_terminal_coding_system_internal);
4ed46869 5130 defsubr (&Sterminal_coding_system);
1ba9e4ab 5131 defsubr (&Sset_keyboard_coding_system_internal);
4ed46869 5132 defsubr (&Skeyboard_coding_system);
a5d301df 5133 defsubr (&Sfind_operation_coding_system);
d46c5b12 5134 defsubr (&Supdate_iso_coding_systems);
4ed46869 5135
4608c386
KH
5136 DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
5137 "List of coding systems.\n\
5138\n\
5139Do not alter the value of this variable manually. This variable should be\n\
5140updated by the functions `make-coding-system' and\n\
5141`define-coding-system-alias'.");
5142 Vcoding_system_list = Qnil;
5143
5144 DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
5145 "Alist of coding system names.\n\
5146Each element is one element list of coding system name.\n\
5147This variable is given to `completing-read' as TABLE argument.\n\
5148\n\
5149Do not alter the value of this variable manually. This variable should be\n\
5150updated by the functions `make-coding-system' and\n\
5151`define-coding-system-alias'.");
5152 Vcoding_system_alist = Qnil;
5153
4ed46869
KH
5154 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
5155 "List of coding-categories (symbols) ordered by priority.");
5156 {
5157 int i;
5158
5159 Vcoding_category_list = Qnil;
5160 for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
5161 Vcoding_category_list
d46c5b12
KH
5162 = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
5163 Vcoding_category_list);
4ed46869
KH
5164 }
5165
5166 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
10bff6f1 5167 "Specify the coding system for read operations.\n\
2ebb362d 5168It is useful to bind this variable with `let', but do not set it globally.\n\
4ed46869 5169If the value is a coding system, it is used for decoding on read operation.\n\
a67a9c66 5170If not, an appropriate element is used from one of the coding system alists:\n\
10bff6f1 5171There are three such tables, `file-coding-system-alist',\n\
a67a9c66 5172`process-coding-system-alist', and `network-coding-system-alist'.");
4ed46869
KH
5173 Vcoding_system_for_read = Qnil;
5174
5175 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
10bff6f1 5176 "Specify the coding system for write operations.\n\
2ebb362d 5177It is useful to bind this variable with `let', but do not set it globally.\n\
4ed46869 5178If the value is a coding system, it is used for encoding on write operation.\n\
a67a9c66 5179If not, an appropriate element is used from one of the coding system alists:\n\
10bff6f1 5180There are three such tables, `file-coding-system-alist',\n\
a67a9c66 5181`process-coding-system-alist', and `network-coding-system-alist'.");
4ed46869
KH
5182 Vcoding_system_for_write = Qnil;
5183
5184 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
a67a9c66 5185 "Coding system used in the latest file or process I/O.");
4ed46869
KH
5186 Vlast_coding_system_used = Qnil;
5187
9ce27fde
KH
5188 DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
5189 "*Non-nil inhibit code conversion of end-of-line format in any cases.");
5190 inhibit_eol_conversion = 0;
5191
02ba4723
KH
5192 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
5193 "Alist to decide a coding system to use for a file I/O operation.\n\
5194The format is ((PATTERN . VAL) ...),\n\
5195where PATTERN is a regular expression matching a file name,\n\
5196VAL is a coding system, a cons of coding systems, or a function symbol.\n\
5197If VAL is a coding system, it is used for both decoding and encoding\n\
5198the file contents.\n\
5199If VAL is a cons of coding systems, the car part is used for decoding,\n\
5200and the cdr part is used for encoding.\n\
5201If VAL is a function symbol, the function must return a coding system\n\
5202or a cons of coding systems which are used as above.\n\
e0e989f6 5203\n\
9ce27fde 5204See also the function `find-operation-coding-system'.");
02ba4723
KH
5205 Vfile_coding_system_alist = Qnil;
5206
5207 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
5208 "Alist to decide a coding system to use for a process I/O operation.\n\
5209The format is ((PATTERN . VAL) ...),\n\
5210where PATTERN is a regular expression matching a program name,\n\
5211VAL is a coding system, a cons of coding systems, or a function symbol.\n\
5212If VAL is a coding system, it is used for both decoding what received\n\
5213from the program and encoding what sent to the program.\n\
5214If VAL is a cons of coding systems, the car part is used for decoding,\n\
5215and the cdr part is used for encoding.\n\
5216If VAL is a function symbol, the function must return a coding system\n\
5217or a cons of coding systems which are used as above.\n\
4ed46869 5218\n\
9ce27fde 5219See also the function `find-operation-coding-system'.");
02ba4723
KH
5220 Vprocess_coding_system_alist = Qnil;
5221
5222 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
5223 "Alist to decide a coding system to use for a network I/O operation.\n\
5224The format is ((PATTERN . VAL) ...),\n\
5225where PATTERN is a regular expression matching a network service name\n\
5226or is a port number to connect to,\n\
5227VAL is a coding system, a cons of coding systems, or a function symbol.\n\
5228If VAL is a coding system, it is used for both decoding what received\n\
5229from the network stream and encoding what sent to the network stream.\n\
5230If VAL is a cons of coding systems, the car part is used for decoding,\n\
5231and the cdr part is used for encoding.\n\
5232If VAL is a function symbol, the function must return a coding system\n\
5233or a cons of coding systems which are used as above.\n\
4ed46869 5234\n\
9ce27fde 5235See also the function `find-operation-coding-system'.");
02ba4723 5236 Vnetwork_coding_system_alist = Qnil;
4ed46869
KH
5237
5238 DEFVAR_INT ("eol-mnemonic-unix", &eol_mnemonic_unix,
5239 "Mnemonic character indicating UNIX-like end-of-line format (i.e. LF) .");
458822a0 5240 eol_mnemonic_unix = ':';
4ed46869
KH
5241
5242 DEFVAR_INT ("eol-mnemonic-dos", &eol_mnemonic_dos,
5243 "Mnemonic character indicating DOS-like end-of-line format (i.e. CRLF).");
458822a0 5244 eol_mnemonic_dos = '\\';
4ed46869
KH
5245
5246 DEFVAR_INT ("eol-mnemonic-mac", &eol_mnemonic_mac,
5247 "Mnemonic character indicating MAC-like end-of-line format (i.e. CR).");
458822a0 5248 eol_mnemonic_mac = '/';
4ed46869
KH
5249
5250 DEFVAR_INT ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
5251 "Mnemonic character indicating end-of-line format is not yet decided.");
458822a0 5252 eol_mnemonic_undecided = ':';
4ed46869 5253
bdd9fb48
KH
5254 DEFVAR_LISP ("enable-character-unification", &Venable_character_unification,
5255 "Non-nil means ISO 2022 encoder/decoder do character unification.");
5256 Venable_character_unification = Qt;
5257
a5d301df
KH
5258 DEFVAR_LISP ("standard-character-unification-table-for-decode",
5259 &Vstandard_character_unification_table_for_decode,
bdd9fb48 5260 "Table for unifying characters when reading.");
a5d301df 5261 Vstandard_character_unification_table_for_decode = Qnil;
bdd9fb48 5262
a5d301df
KH
5263 DEFVAR_LISP ("standard-character-unification-table-for-encode",
5264 &Vstandard_character_unification_table_for_encode,
bdd9fb48 5265 "Table for unifying characters when writing.");
a5d301df 5266 Vstandard_character_unification_table_for_encode = Qnil;
4ed46869
KH
5267
5268 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
5269 "Alist of charsets vs revision numbers.\n\
5270While encoding, if a charset (car part of an element) is found,\n\
5271designate it with the escape sequence identifing revision (cdr part of the element).");
5272 Vcharset_revision_alist = Qnil;
02ba4723
KH
5273
5274 DEFVAR_LISP ("default-process-coding-system",
5275 &Vdefault_process_coding_system,
5276 "Cons of coding systems used for process I/O by default.\n\
5277The car part is used for decoding a process output,\n\
5278the cdr part is used for encoding a text to be sent to a process.");
5279 Vdefault_process_coding_system = Qnil;
c4825358 5280
3f003981
KH
5281 DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
5282 "Table of extra Latin codes in the range 128..159 (inclusive).\n\
c4825358
KH
5283This is a vector of length 256.\n\
5284If Nth element is non-nil, the existence of code N in a file\n\
bb0115a2 5285\(or output of subprocess) doesn't prevent it to be detected as\n\
3f003981
KH
5286a coding system of ISO 2022 variant which has a flag\n\
5287`accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file\n\
c4825358
KH
5288or reading output of a subprocess.\n\
5289Only 128th through 159th elements has a meaning.");
3f003981 5290 Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
d46c5b12
KH
5291
5292 DEFVAR_LISP ("select-safe-coding-system-function",
5293 &Vselect_safe_coding_system_function,
5294 "Function to call to select safe coding system for encoding a text.\n\
5295\n\
5296If set, this function is called to force a user to select a proper\n\
5297coding system which can encode the text in the case that a default\n\
5298coding system used in each operation can't encode the text.\n\
5299\n\
5300The default value is `select-safe-codign-system' (which see).");
5301 Vselect_safe_coding_system_function = Qnil;
5302
4ed46869
KH
5303}
5304
5305#endif /* emacs */