Add comments on coding-category-utf-8,
[bpt/emacs.git] / src / coding.c
CommitLineData
4ed46869 1/* Coding system handler (conversion, detection, and etc).
4a2f9c6a 2 Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
203cb916 3 Licensed to the Free Software Foundation.
4ed46869 4
369314dc
KH
5This file is part of GNU Emacs.
6
7GNU Emacs is free software; you can redistribute it and/or modify
8it under the terms of the GNU General Public License as published by
9the Free Software Foundation; either version 2, or (at your option)
10any later version.
4ed46869 11
369314dc
KH
12GNU Emacs is distributed in the hope that it will be useful,
13but WITHOUT ANY WARRANTY; without even the implied warranty of
14MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15GNU General Public License for more details.
4ed46869 16
369314dc
KH
17You should have received a copy of the GNU General Public License
18along with GNU Emacs; see the file COPYING. If not, write to
19the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20Boston, MA 02111-1307, USA. */
4ed46869
KH
21
22/*** TABLE OF CONTENTS ***
23
24 1. Preamble
0ef69138 25 2. Emacs' internal format (emacs-mule) handlers
4ed46869
KH
26 3. ISO2022 handlers
27 4. Shift-JIS and BIG5 handlers
1397dc18
KH
28 5. CCL handlers
29 6. End-of-line handlers
30 7. C library functions
31 8. Emacs Lisp library functions
32 9. Post-amble
4ed46869
KH
33
34*/
35
36/*** GENERAL NOTE on CODING SYSTEM ***
37
38 Coding system is an encoding mechanism of one or more character
39 sets. Here's a list of coding systems which Emacs can handle. When
40 we say "decode", it means converting some other coding system to
0ef69138
KH
41 Emacs' internal format (emacs-mule), and when we say "encode",
42 it means converting the coding system emacs-mule to some other
43 coding system.
4ed46869 44
0ef69138 45 0. Emacs' internal format (emacs-mule)
4ed46869
KH
46
47 Emacs itself holds a multi-lingual character in a buffer and a string
f4dee582 48 in a special format. Details are described in section 2.
4ed46869
KH
49
50 1. ISO2022
51
52 The most famous coding system for multiple character sets. X's
f4dee582
RS
53 Compound Text, various EUCs (Extended Unix Code), and coding
54 systems used in Internet communication such as ISO-2022-JP are
55 all variants of ISO2022. Details are described in section 3.
4ed46869
KH
56
57 2. SJIS (or Shift-JIS or MS-Kanji-Code)
58
59 A coding system to encode character sets: ASCII, JISX0201, and
60 JISX0208. Widely used for PC's in Japan. Details are described in
f4dee582 61 section 4.
4ed46869
KH
62
63 3. BIG5
64
65 A coding system to encode character sets: ASCII and Big5. Widely
66 used by Chinese (mainly in Taiwan and Hong Kong). Details are
f4dee582
RS
67 described in section 4. In this file, when we write "BIG5"
68 (all uppercase), we mean the coding system, and when we write
69 "Big5" (capitalized), we mean the character set.
4ed46869 70
27901516
KH
71 4. Raw text
72
4608c386
KH
73 A coding system for a text containing random 8-bit code. Emacs does
74 no code conversion on such a text except for end-of-line format.
27901516
KH
75
76 5. Other
4ed46869 77
f4dee582 78 If a user wants to read/write a text encoded in a coding system not
4ed46869
KH
79 listed above, he can supply a decoder and an encoder for it in CCL
80 (Code Conversion Language) programs. Emacs executes the CCL program
81 while reading/writing.
82
d46c5b12
KH
83 Emacs represents a coding system by a Lisp symbol that has a property
84 `coding-system'. But, before actually using the coding system, the
4ed46869 85 information about it is set in a structure of type `struct
f4dee582 86 coding_system' for rapid processing. See section 6 for more details.
4ed46869
KH
87
88*/
89
90/*** GENERAL NOTES on END-OF-LINE FORMAT ***
91
92 How end-of-line of a text is encoded depends on a system. For
93 instance, Unix's format is just one byte of `line-feed' code,
f4dee582 94 whereas DOS's format is two-byte sequence of `carriage-return' and
d46c5b12
KH
95 `line-feed' codes. MacOS's format is usually one byte of
96 `carriage-return'.
4ed46869 97
f4dee582
RS
98 Since text characters encoding and end-of-line encoding are
99 independent, any coding system described above can take
4ed46869 100 any format of end-of-line. So, Emacs has information of format of
f4dee582 101 end-of-line in each coding-system. See section 6 for more details.
4ed46869
KH
102
103*/
104
105/*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
106
107 These functions check if a text between SRC and SRC_END is encoded
108 in the coding system category XXX. Each returns an integer value in
109 which appropriate flag bits for the category XXX is set. The flag
110 bits are defined in macros CODING_CATEGORY_MASK_XXX. Below is the
111 template of these functions. */
112#if 0
113int
0ef69138 114detect_coding_emacs_mule (src, src_end)
4ed46869
KH
115 unsigned char *src, *src_end;
116{
117 ...
118}
119#endif
120
121/*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
122
123 These functions decode SRC_BYTES length text at SOURCE encoded in
0ef69138 124 CODING to Emacs' internal format (emacs-mule). The resulting text
d46c5b12
KH
125 goes to a place pointed to by DESTINATION, the length of which
126 should not exceed DST_BYTES. These functions set the information of
127 original and decoded texts in the members produced, produced_char,
128 consumed, and consumed_char of the structure *CODING.
129
130 The return value is an integer (CODING_FINISH_XXX) indicating how
131 the decoding finished.
132
133 DST_BYTES zero means that source area and destination area are
134 overlapped, which means that we can produce a decoded text until it
135 reaches at the head of not-yet-decoded source text.
136
137 Below is a template of these functions. */
4ed46869 138#if 0
d46c5b12 139decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
4ed46869
KH
140 struct coding_system *coding;
141 unsigned char *source, *destination;
142 int src_bytes, dst_bytes;
4ed46869
KH
143{
144 ...
145}
146#endif
147
148/*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
149
0ef69138
KH
150 These functions encode SRC_BYTES length text at SOURCE of Emacs'
151 internal format (emacs-mule) to CODING. The resulting text goes to
f4dee582 152 a place pointed to by DESTINATION, the length of which should not
d46c5b12
KH
153 exceed DST_BYTES. These functions set the information of
154 original and encoded texts in the members produced, produced_char,
155 consumed, and consumed_char of the structure *CODING.
156
157 The return value is an integer (CODING_FINISH_XXX) indicating how
158 the encoding finished.
159
160 DST_BYTES zero means that source area and destination area are
161 overlapped, which means that we can produce an encoded text until it
162 reaches the head of the not-yet-encoded source text.
163
164 Below is a template of these functions. */
4ed46869 165#if 0
d46c5b12 166encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
4ed46869
KH
167 struct coding_system *coding;
168 unsigned char *source, *destination;
169 int src_bytes, dst_bytes;
4ed46869
KH
170{
171 ...
172}
173#endif
174
175/*** COMMONLY USED MACROS ***/
176
/* The following three macros ONE_MORE_BYTE, TWO_MORE_BYTES, and
   THREE_MORE_BYTES safely get one, two, and three bytes from the
   source text respectively.  If there are not enough bytes in the
   source, they jump to `label_end_of_loop' without consuming any
   byte.  The caller should set variables `src' and `src_end' to
   appropriate areas in advance.

   TWO_MORE_BYTES and THREE_MORE_BYTES test the remaining length as
   `src_end - src' instead of comparing `src + N' with `src_end', so
   that no pointer beyond `src_end' is ever formed (forming such a
   pointer is undefined behavior when `src_end' is the end of the
   underlying buffer).  */

#define ONE_MORE_BYTE(c1)                       \
  do {                                          \
    if (src < src_end)                          \
      (c1) = *src++;                            \
    else                                        \
      goto label_end_of_loop;                   \
  } while (0)

#define TWO_MORE_BYTES(c1, c2)                  \
  do {                                          \
    if (src_end - src >= 2)                     \
      (c1) = *src++, (c2) = *src++;             \
    else                                        \
      goto label_end_of_loop;                   \
  } while (0)

#define THREE_MORE_BYTES(c1, c2, c3)                    \
  do {                                                  \
    if (src_end - src >= 3)                             \
      (c1) = *src++, (c2) = *src++, (c3) = *src++;      \
    else                                                \
      goto label_end_of_loop;                           \
  } while (0)
206
/* The following three macros DECODE_CHARACTER_ASCII,
   DECODE_CHARACTER_DIMENSION1, and DECODE_CHARACTER_DIMENSION2 put
   the multi-byte form of a character of each class at the place
   pointed by `dst' and advance `dst' past it.  Each of them counts
   exactly one produced character in `coding->produced_char'.  The
   caller should set the variable `dst' to point to an appropriate
   area and the variable `coding' to point to the coding-system of
   the currently decoding text in advance.  */

/* Decode one ASCII character C.  Only the low 7 bits of C are
   stored.  */

#define DECODE_CHARACTER_ASCII(c)               \
  do {                                          \
    *dst++ = (c) & 0x7F;                        \
    coding->produced_char++;                    \
  } while (0)

/* Decode one DIMENSION1 character whose charset is CHARSET and whose
   position-code is C.  The base leading-code is always emitted; the
   extended leading-code is emitted only when it is nonzero.  The
   position-code is stored with its most significant bit set.  */

#define DECODE_CHARACTER_DIMENSION1(charset, c)                         \
  do {                                                                  \
    unsigned char leading_code_ = CHARSET_LEADING_CODE_BASE (charset); \
                                                                        \
    *dst++ = leading_code_;                                             \
    leading_code_ = CHARSET_LEADING_CODE_EXT (charset);                 \
    if (leading_code_ > 0)                                              \
      *dst++ = leading_code_;                                           \
    *dst++ = (c) | 0x80;                                                \
    coding->produced_char++;                                            \
  } while (0)

/* Decode one DIMENSION2 character whose charset is CHARSET and whose
   position-codes are C1 and C2.  The leading-code(s) and C1 are
   emitted by DECODE_CHARACTER_DIMENSION1, which also counts the
   character; only C2 is appended here.  */

#define DECODE_CHARACTER_DIMENSION2(charset, c1, c2)    \
  do {                                                  \
    DECODE_CHARACTER_DIMENSION1 (charset, c1);          \
    *dst++ = (c2) | 0x80;                               \
  } while (0)
244
245\f
246/*** 1. Preamble ***/
247
68c45bf0
PE
248#ifdef emacs
249#include <config.h>
250#endif
251
4ed46869
KH
252#include <stdio.h>
253
254#ifdef emacs
255
4ed46869
KH
256#include "lisp.h"
257#include "buffer.h"
258#include "charset.h"
ec6d2bb8 259#include "composite.h"
4ed46869
KH
260#include "ccl.h"
261#include "coding.h"
262#include "window.h"
263
264#else /* not emacs */
265
266#include "mulelib.h"
267
268#endif /* not emacs */
269
270Lisp_Object Qcoding_system, Qeol_type;
271Lisp_Object Qbuffer_file_coding_system;
272Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
27901516 273Lisp_Object Qno_conversion, Qundecided;
bb0115a2 274Lisp_Object Qcoding_system_history;
70c22245 275Lisp_Object Qsafe_charsets;
1397dc18 276Lisp_Object Qvalid_codes;
4ed46869
KH
277
278extern Lisp_Object Qinsert_file_contents, Qwrite_region;
279Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
280Lisp_Object Qstart_process, Qopen_network_stream;
281Lisp_Object Qtarget_idx;
282
d46c5b12
KH
283Lisp_Object Vselect_safe_coding_system_function;
284
7722baf9
EZ
285/* Mnemonic string for each format of end-of-line. */
286Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
287/* Mnemonic string to indicate format of end-of-line is not yet
4ed46869 288 decided. */
7722baf9 289Lisp_Object eol_mnemonic_undecided;
4ed46869 290
9ce27fde
KH
291/* Format of end-of-line decided by system. This is CODING_EOL_LF on
292 Unix, CODING_EOL_CRLF on DOS/Windows, and CODING_EOL_CR on Mac. */
293int system_eol_type;
294
4ed46869
KH
295#ifdef emacs
296
4608c386
KH
297Lisp_Object Vcoding_system_list, Vcoding_system_alist;
298
299Lisp_Object Qcoding_system_p, Qcoding_system_error;
4ed46869 300
d46c5b12
KH
301/* Coding system emacs-mule and raw-text are for converting only
302 end-of-line format. */
303Lisp_Object Qemacs_mule, Qraw_text;
9ce27fde 304
4ed46869
KH
305/* Coding-systems are handed between Emacs Lisp programs and C internal
306 routines by the following three variables. */
307/* Coding-system for reading files and receiving data from process. */
308Lisp_Object Vcoding_system_for_read;
309/* Coding-system for writing files and sending data to process. */
310Lisp_Object Vcoding_system_for_write;
311/* Coding-system actually used in the latest I/O. */
312Lisp_Object Vlast_coding_system_used;
313
c4825358 314/* A vector of length 256 which contains information about special
94487c4e 315 Latin codes (especially for dealing with Microsoft codes). */
3f003981 316Lisp_Object Vlatin_extra_code_table;
c4825358 317
9ce27fde
KH
318/* Flag to inhibit code conversion of end-of-line format. */
319int inhibit_eol_conversion;
320
ed29121d
EZ
321/* Flag to make buffer-file-coding-system inherit from process-coding. */
322int inherit_process_coding_system;
323
c4825358 324/* Coding system to be used to encode text for terminal display. */
4ed46869
KH
325struct coding_system terminal_coding;
326
c4825358
KH
327/* Coding system to be used to encode text for terminal display when
328 terminal coding system is nil. */
329struct coding_system safe_terminal_coding;
330
331/* Coding system of what is sent from terminal keyboard. */
4ed46869
KH
332struct coding_system keyboard_coding;
333
6bc51348
KH
334/* Default coding system to be used to write a file. */
335struct coding_system default_buffer_file_coding;
336
02ba4723
KH
337Lisp_Object Vfile_coding_system_alist;
338Lisp_Object Vprocess_coding_system_alist;
339Lisp_Object Vnetwork_coding_system_alist;
4ed46869 340
68c45bf0
PE
341Lisp_Object Vlocale_coding_system;
342
4ed46869
KH
343#endif /* emacs */
344
d46c5b12 345Lisp_Object Qcoding_category, Qcoding_category_index;
4ed46869
KH
346
347/* List of symbols `coding-category-xxx' ordered by priority. */
348Lisp_Object Vcoding_category_list;
349
d46c5b12
KH
350/* Table of coding categories (Lisp symbols). */
351Lisp_Object Vcoding_category_table;
4ed46869
KH
352
353/* Table of names of symbol for each coding-category. */
354char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
0ef69138 355 "coding-category-emacs-mule",
4ed46869
KH
356 "coding-category-sjis",
357 "coding-category-iso-7",
d46c5b12 358 "coding-category-iso-7-tight",
4ed46869
KH
359 "coding-category-iso-8-1",
360 "coding-category-iso-8-2",
7717c392
KH
361 "coding-category-iso-7-else",
362 "coding-category-iso-8-else",
89fa8b36 363 "coding-category-ccl",
4ed46869 364 "coding-category-big5",
fa42c37f
KH
365 "coding-category-utf-8",
366 "coding-category-utf-16-be",
367 "coding-category-utf-16-le",
27901516 368 "coding-category-raw-text",
89fa8b36 369 "coding-category-binary"
4ed46869
KH
370};
371
66cfb530 372/* Table of pointers to coding systems corresponding to each coding
d46c5b12
KH
373 categories. */
374struct coding_system *coding_system_table[CODING_CATEGORY_IDX_MAX];
375
66cfb530
KH
376/* Table of coding category masks. Nth element is a mask for a coding
377 category of which priority is Nth. */
378static
379int coding_priorities[CODING_CATEGORY_IDX_MAX];
380
f967223b
KH
381/* Flag to tell if we look up translation table on character code
382 conversion. */
84fbb8a0 383Lisp_Object Venable_character_translation;
f967223b
KH
384/* Standard translation table to look up on decoding (reading). */
385Lisp_Object Vstandard_translation_table_for_decode;
386/* Standard translation table to look up on encoding (writing). */
387Lisp_Object Vstandard_translation_table_for_encode;
84fbb8a0 388
f967223b
KH
389Lisp_Object Qtranslation_table;
390Lisp_Object Qtranslation_table_id;
391Lisp_Object Qtranslation_table_for_decode;
392Lisp_Object Qtranslation_table_for_encode;
4ed46869
KH
393
394/* Alist of charsets vs revision number. */
395Lisp_Object Vcharset_revision_alist;
396
02ba4723
KH
397/* Default coding systems used for process I/O. */
398Lisp_Object Vdefault_process_coding_system;
399
b843d1ae
KH
400/* Global flag to tell that we can't call post-read-conversion and
401 pre-write-conversion functions. Usually the value is zero, but it
402 is set to 1 temporarily while such functions are running. This is
403 to avoid infinite recursive call. */
404static int inhibit_pre_post_conversion;
405
4ed46869 406\f
0ef69138 407/*** 2. Emacs internal format (emacs-mule) handlers ***/
4ed46869
KH
408
409/* Emacs' internal format for encoding multiple character sets is a
f4dee582
RS
410 kind of multi-byte encoding, i.e. characters are encoded by
411 variable-length sequences of one-byte codes. ASCII characters
412 and control characters (e.g. `tab', `newline') are represented by
413 one-byte sequences which are their ASCII codes, in the range 0x00
414 through 0x7F. The other characters are represented by a sequence
415 of `base leading-code', optional `extended leading-code', and one
416 or two `position-code's. The length of the sequence is determined
417 by the base leading-code. Leading-code takes the range 0x80
418 through 0x9F, whereas extended leading-code and position-code take
419 the range 0xA0 through 0xFF. See `charset.h' for more details
420 about leading-code and position-code.
421
4ed46869
KH
422 --- CODE RANGE of Emacs' internal format ---
423 (character set) (range)
424 ASCII 0x00 .. 0x7F
ec6d2bb8 425 ELSE (1st byte) 0x81 .. 0x9F
4ed46869
KH
426 (rest bytes) 0xA0 .. 0xFF
427 ---------------------------------------------
428
429 */
430
431enum emacs_code_class_type emacs_code_class[256];
432
/* Go to the next statement only if *SRC is accessible and the code is
   0xA0 or greater; SRC is advanced past that code.  If SRC has
   already reached SRC_END, jump to `label_end_of_switch'.  If the
   code is less than 0xA0, make the enclosing function return 0.  */
#define CHECK_CODE_RANGE_A0_FF  \
  do {                          \
    if (src >= src_end)         \
      goto label_end_of_switch; \
    else if (*src++ < 0xA0)     \
      return 0;                 \
  } while (0)
442
443/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
444 Check if a text is encoded in Emacs' internal format. If it is,
d46c5b12 445 return CODING_CATEGORY_MASK_EMACS_MULE, else return 0. */
4ed46869
KH
446
447int
0ef69138 448detect_coding_emacs_mule (src, src_end)
4ed46869
KH
449 unsigned char *src, *src_end;
450{
451 unsigned char c;
452 int composing = 0;
453
454 while (src < src_end)
455 {
456 c = *src++;
457
458 if (composing)
459 {
460 if (c < 0xA0)
461 composing = 0;
462 else
463 c -= 0x20;
464 }
465
466 switch (emacs_code_class[c])
467 {
468 case EMACS_ascii_code:
469 case EMACS_linefeed_code:
470 break;
471
472 case EMACS_control_code:
473 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
474 return 0;
475 break;
476
477 case EMACS_invalid_code:
478 return 0;
479
4ed46869
KH
480 case EMACS_leading_code_4:
481 CHECK_CODE_RANGE_A0_FF;
482 /* fall down to check it two more times ... */
483
484 case EMACS_leading_code_3:
485 CHECK_CODE_RANGE_A0_FF;
486 /* fall down to check it one more time ... */
487
488 case EMACS_leading_code_2:
489 CHECK_CODE_RANGE_A0_FF;
490 break;
491
ec6d2bb8
KH
492 case 0x80: /* Old leading code for a composite character. */
493 if (composing)
494 CHECK_CODE_RANGE_A0_FF;
495 else
496 composing = 1;
497 break;
498
4ed46869
KH
499 default:
500 label_end_of_switch:
501 break;
502 }
503 }
0ef69138 504 return CODING_CATEGORY_MASK_EMACS_MULE;
4ed46869
KH
505}
506
507\f
508/*** 3. ISO2022 handlers ***/
509
510/* The following note describes the coding system ISO2022 briefly.
39787efd
KH
511 Since the intention of this note is to help understand the
512 functions in this file, some parts are NOT ACCURATE or OVERLY
513 SIMPLIFIED. For thorough understanding, please refer to the
4ed46869
KH
514 original document of ISO2022.
515
516 ISO2022 provides many mechanisms to encode several character sets
39787efd
KH
517 in 7-bit and 8-bit environments. For 7-bite environments, all text
518 is encoded using bytes less than 128. This may make the encoded
519 text a little bit longer, but the text passes more easily through
520 several gateways, some of which strip off MSB (Most Significant Bit).
521
522 There are two kinds of character sets: control character set and
4ed46869
KH
523 graphic character set. The former contains control characters such
524 as `newline' and `escape' to provide control functions (control
39787efd
KH
525 functions are also provided by escape sequences). The latter
526 contains graphic characters such as 'A' and '-'. Emacs recognizes
4ed46869
KH
527 two control character sets and many graphic character sets.
528
529 Graphic character sets are classified into one of the following
39787efd
KH
530 four classes, according to the number of bytes (DIMENSION) and
531 number of characters in one dimension (CHARS) of the set:
532 - DIMENSION1_CHARS94
533 - DIMENSION1_CHARS96
534 - DIMENSION2_CHARS94
535 - DIMENSION2_CHARS96
536
537 In addition, each character set is assigned an identification tag,
538 unique for each set, called "final character" (denoted as <F>
539 hereafter). The <F> of each character set is decided by ECMA(*)
540 when it is registered in ISO. The code range of <F> is 0x30..0x7F
541 (0x30..0x3F are for private use only).
4ed46869
KH
542
543 Note (*): ECMA = European Computer Manufacturers Association
544
545 Here are examples of graphic character set [NAME(<F>)]:
546 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
547 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
548 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
549 o DIMENSION2_CHARS96 -- none for the moment
550
39787efd 551 A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
4ed46869
KH
552 C0 [0x00..0x1F] -- control character plane 0
553 GL [0x20..0x7F] -- graphic character plane 0
554 C1 [0x80..0x9F] -- control character plane 1
555 GR [0xA0..0xFF] -- graphic character plane 1
556
557 A control character set is directly designated and invoked to C0 or
39787efd
KH
558 C1 by an escape sequence. The most common case is that:
559 - ISO646's control character set is designated/invoked to C0, and
560 - ISO6429's control character set is designated/invoked to C1,
561 and usually these designations/invocations are omitted in encoded
562 text. In a 7-bit environment, only C0 can be used, and a control
563 character for C1 is encoded by an appropriate escape sequence to
564 fit into the environment. All control characters for C1 are
565 defined to have corresponding escape sequences.
4ed46869
KH
566
567 A graphic character set is at first designated to one of four
568 graphic registers (G0 through G3), then these graphic registers are
569 invoked to GL or GR. These designations and invocations can be
570 done independently. The most common case is that G0 is invoked to
39787efd
KH
571 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
572 these invocations and designations are omitted in encoded text.
573 In a 7-bit environment, only GL can be used.
4ed46869 574
39787efd
KH
575 When a graphic character set of CHARS94 is invoked to GL, codes
576 0x20 and 0x7F of the GL area work as control characters SPACE and
577 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
578 be used.
4ed46869
KH
579
580 There are two ways of invocation: locking-shift and single-shift.
581 With locking-shift, the invocation lasts until the next different
39787efd
KH
582 invocation, whereas with single-shift, the invocation affects the
583 following character only and doesn't affect the locking-shift
584 state. Invocations are done by the following control characters or
585 escape sequences:
4ed46869
KH
586
587 ----------------------------------------------------------------------
39787efd 588 abbrev function cntrl escape seq description
4ed46869 589 ----------------------------------------------------------------------
39787efd
KH
590 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
591 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
592 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
593 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
594 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
595 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
596 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
597 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
598 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
4ed46869 599 ----------------------------------------------------------------------
39787efd
KH
600 (*) These are not used by any known coding system.
601
602 Control characters for these functions are defined by macros
603 ISO_CODE_XXX in `coding.h'.
4ed46869 604
39787efd 605 Designations are done by the following escape sequences:
4ed46869
KH
606 ----------------------------------------------------------------------
607 escape sequence description
608 ----------------------------------------------------------------------
609 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
610 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
611 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
612 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
613 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
614 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
615 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
616 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
617 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
618 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
619 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
620 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
621 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
622 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
623 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
624 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
625 ----------------------------------------------------------------------
626
627 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
39787efd 628 of dimension 1, chars 94, and final character <F>, etc...
4ed46869
KH
629
630 Note (*): Although these designations are not allowed in ISO2022,
631 Emacs accepts them on decoding, and produces them on encoding
39787efd 632 CHARS96 character sets in a coding system which is characterized as
4ed46869
KH
633 7-bit environment, non-locking-shift, and non-single-shift.
634
635 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
39787efd 636 '(' can be omitted. We refer to this as "short-form" hereafter.
4ed46869
KH
637
638 Now you may notice that there are a lot of ways for encoding the
39787efd
KH
639 same multilingual text in ISO2022. Actually, there exist many
640 coding systems such as Compound Text (used in X11's inter client
641 communication), ISO-2022-JP (used in Japanese internet), ISO-2022-KR
642 (used in Korean internet), EUC (Extended UNIX Code, used in Asian
4ed46869
KH
643 localized platforms), and all of these are variants of ISO2022.
644
645 In addition to the above, Emacs handles two more kinds of escape
646 sequences: ISO6429's direction specification and Emacs' private
647 sequence for specifying character composition.
648
39787efd 649 ISO6429's direction specification takes the following form:
4ed46869
KH
650 o CSI ']' -- end of the current direction
651 o CSI '0' ']' -- end of the current direction
652 o CSI '1' ']' -- start of left-to-right text
653 o CSI '2' ']' -- start of right-to-left text
654 The control character CSI (0x9B: control sequence introducer) is
39787efd
KH
655 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
656
657 Character composition specification takes the following form:
ec6d2bb8
KH
658 o ESC '0' -- start relative composition
659 o ESC '1' -- end composition
660 o ESC '2' -- start rule-base composition (*)
661 o ESC '3' -- start relative composition with alternate chars (**)
662 o ESC '4' -- start rule-base composition with alternate chars (**)
39787efd 663 Since these are not standard escape sequences of any ISO standard,
ec6d2bb8
KH
664 the use of them for these meanings is restricted to Emacs only.
665
666 (*) This form is used only in Emacs 20.5 and the older versions,
667 but the newer versions can safely decode it.
668 (**) This form is used only in Emacs 21.1 and the newer versions,
669 and the older versions can't decode it.
670
671 Here's a list of example usages of these composition escape
672 sequences (categorized by `enum composition_method').
673
674 COMPOSITION_RELATIVE:
675 ESC 0 CHAR [ CHAR ] ESC 1
676 COMPOSITION_WITH_RULE:
677 ESC 2 CHAR [ RULE CHAR ] ESC 1
678 COMPOSITION_WITH_ALTCHARS:
679 ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
680 COMPOSITION_WITH_RULE_ALTCHARS:
681 ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
4ed46869
KH
682
683enum iso_code_class_type iso_code_class[256];
684
f024b6aa
RS
/* Nonzero if a coding system is registered for the category index
   IDX and that coding system either lists CHARSET in its
   `safe_charsets' or has a requested designation for CHARSET.  */
#define CHARSET_OK(idx, charset)                                \
  (coding_system_table[idx]                                     \
   && (coding_system_table[idx]->safe_charsets[charset]         \
       || (CODING_SPEC_ISO_REQUESTED_DESIGNATION                \
            (coding_system_table[idx], charset)                 \
           != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)))

/* Nonzero if a coding system is registered for the category index
   IDX and its initial designation for graphic register 1 is not
   negative.  The table entry is tested first, in the same way as in
   CHARSET_OK, so that an unregistered category yields 0 instead of
   handing a null pointer to CODING_SPEC_ISO_INITIAL_DESIGNATION.  */
#define SHIFT_OUT_OK(idx)                                       \
  (coding_system_table[idx]                                     \
   && (CODING_SPEC_ISO_INITIAL_DESIGNATION                      \
        (coding_system_table[idx], 1) >= 0))
694
4ed46869
KH
695/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
696 Check if a text is encoded in ISO2022. If it is, returns an
697 integer in which appropriate flag bits any of:
698 CODING_CATEGORY_MASK_ISO_7
d46c5b12 699 CODING_CATEGORY_MASK_ISO_7_TIGHT
4ed46869
KH
700 CODING_CATEGORY_MASK_ISO_8_1
701 CODING_CATEGORY_MASK_ISO_8_2
7717c392
KH
702 CODING_CATEGORY_MASK_ISO_7_ELSE
703 CODING_CATEGORY_MASK_ISO_8_ELSE
4ed46869
KH
704 are set. If a code which should never appear in ISO2022 is found,
705 returns 0. */
706
707int
708detect_coding_iso2022 (src, src_end)
709 unsigned char *src, *src_end;
710{
d46c5b12
KH
711 int mask = CODING_CATEGORY_MASK_ISO;
712 int mask_found = 0;
f46869e4 713 int reg[4], shift_out = 0, single_shifting = 0;
d46c5b12 714 int c, c1, i, charset;
3f003981 715
d46c5b12 716 reg[0] = CHARSET_ASCII, reg[1] = reg[2] = reg[3] = -1;
3f003981 717 while (mask && src < src_end)
4ed46869
KH
718 {
719 c = *src++;
720 switch (c)
721 {
722 case ISO_CODE_ESC:
f46869e4 723 single_shifting = 0;
e0e989f6 724 if (src >= src_end)
4ed46869
KH
725 break;
726 c = *src++;
d46c5b12 727 if (c >= '(' && c <= '/')
4ed46869 728 {
bf9cdd4e
KH
729 /* Designation sequence for a charset of dimension 1. */
730 if (src >= src_end)
731 break;
d46c5b12
KH
732 c1 = *src++;
733 if (c1 < ' ' || c1 >= 0x80
734 || (charset = iso_charset_table[0][c >= ','][c1]) < 0)
735 /* Invalid designation sequence. Just ignore. */
736 break;
737 reg[(c - '(') % 4] = charset;
bf9cdd4e
KH
738 }
739 else if (c == '$')
740 {
741 /* Designation sequence for a charset of dimension 2. */
742 if (src >= src_end)
743 break;
744 c = *src++;
745 if (c >= '@' && c <= 'B')
746 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
d46c5b12 747 reg[0] = charset = iso_charset_table[1][0][c];
bf9cdd4e 748 else if (c >= '(' && c <= '/')
bcf26d6a 749 {
bf9cdd4e
KH
750 if (src >= src_end)
751 break;
d46c5b12
KH
752 c1 = *src++;
753 if (c1 < ' ' || c1 >= 0x80
754 || (charset = iso_charset_table[1][c >= ','][c1]) < 0)
755 /* Invalid designation sequence. Just ignore. */
756 break;
757 reg[(c - '(') % 4] = charset;
bcf26d6a 758 }
bf9cdd4e 759 else
d46c5b12
KH
760 /* Invalid designation sequence. Just ignore. */
761 break;
762 }
ae9ff118 763 else if (c == 'N' || c == 'O')
d46c5b12 764 {
ae9ff118
KH
765 /* ESC <Fe> for SS2 or SS3. */
766 mask &= CODING_CATEGORY_MASK_ISO_7_ELSE;
d46c5b12 767 break;
4ed46869 768 }
ec6d2bb8
KH
769 else if (c >= '0' && c <= '4')
770 {
771 /* ESC <Fp> for start/end composition. */
772 mask_found |= CODING_CATEGORY_MASK_ISO;
773 break;
774 }
bf9cdd4e 775 else
d46c5b12
KH
776 /* Invalid escape sequence. Just ignore. */
777 break;
778
779 /* We found a valid designation sequence for CHARSET. */
780 mask &= ~CODING_CATEGORY_MASK_ISO_8BIT;
781 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7, charset))
782 mask_found |= CODING_CATEGORY_MASK_ISO_7;
783 else
784 mask &= ~CODING_CATEGORY_MASK_ISO_7;
785 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT, charset))
786 mask_found |= CODING_CATEGORY_MASK_ISO_7_TIGHT;
787 else
788 mask &= ~CODING_CATEGORY_MASK_ISO_7_TIGHT;
ae9ff118
KH
789 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_ELSE, charset))
790 mask_found |= CODING_CATEGORY_MASK_ISO_7_ELSE;
791 else
d46c5b12 792 mask &= ~CODING_CATEGORY_MASK_ISO_7_ELSE;
ae9ff118
KH
793 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_8_ELSE, charset))
794 mask_found |= CODING_CATEGORY_MASK_ISO_8_ELSE;
795 else
d46c5b12 796 mask &= ~CODING_CATEGORY_MASK_ISO_8_ELSE;
4ed46869
KH
797 break;
798
4ed46869 799 case ISO_CODE_SO:
f46869e4 800 single_shifting = 0;
d46c5b12
KH
801 if (shift_out == 0
802 && (reg[1] >= 0
803 || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_7_ELSE)
804 || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_8_ELSE)))
805 {
806 /* Locking shift out. */
807 mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
808 mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
809 }
e0e989f6
KH
810 break;
811
d46c5b12 812 case ISO_CODE_SI:
f46869e4 813 single_shifting = 0;
d46c5b12
KH
814 if (shift_out == 1)
815 {
816 /* Locking shift in. */
817 mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
818 mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
819 }
820 break;
821
4ed46869 822 case ISO_CODE_CSI:
f46869e4 823 single_shifting = 0;
4ed46869
KH
824 case ISO_CODE_SS2:
825 case ISO_CODE_SS3:
3f003981
KH
826 {
827 int newmask = CODING_CATEGORY_MASK_ISO_8_ELSE;
828
70c22245
KH
829 if (c != ISO_CODE_CSI)
830 {
d46c5b12
KH
831 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
832 & CODING_FLAG_ISO_SINGLE_SHIFT)
70c22245 833 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
d46c5b12
KH
834 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
835 & CODING_FLAG_ISO_SINGLE_SHIFT)
70c22245 836 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
f46869e4 837 single_shifting = 1;
70c22245 838 }
3f003981
KH
839 if (VECTORP (Vlatin_extra_code_table)
840 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
841 {
d46c5b12
KH
842 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
843 & CODING_FLAG_ISO_LATIN_EXTRA)
3f003981 844 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
d46c5b12
KH
845 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
846 & CODING_FLAG_ISO_LATIN_EXTRA)
3f003981
KH
847 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
848 }
849 mask &= newmask;
d46c5b12 850 mask_found |= newmask;
3f003981
KH
851 }
852 break;
4ed46869
KH
853
854 default:
855 if (c < 0x80)
f46869e4
KH
856 {
857 single_shifting = 0;
858 break;
859 }
4ed46869 860 else if (c < 0xA0)
c4825358 861 {
f46869e4 862 single_shifting = 0;
3f003981
KH
863 if (VECTORP (Vlatin_extra_code_table)
864 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
c4825358 865 {
3f003981
KH
866 int newmask = 0;
867
d46c5b12
KH
868 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
869 & CODING_FLAG_ISO_LATIN_EXTRA)
3f003981 870 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
d46c5b12
KH
871 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
872 & CODING_FLAG_ISO_LATIN_EXTRA)
3f003981
KH
873 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
874 mask &= newmask;
d46c5b12 875 mask_found |= newmask;
c4825358 876 }
3f003981
KH
877 else
878 return 0;
c4825358 879 }
4ed46869
KH
880 else
881 {
7717c392 882 unsigned char *src_begin = src;
4ed46869 883
d46c5b12 884 mask &= ~(CODING_CATEGORY_MASK_ISO_7BIT
7717c392 885 | CODING_CATEGORY_MASK_ISO_7_ELSE);
d46c5b12 886 mask_found |= CODING_CATEGORY_MASK_ISO_8_1;
f46869e4
KH
887 /* Check the length of succeeding codes of the range
888 0xA0..0FF. If the byte length is odd, we exclude
889 CODING_CATEGORY_MASK_ISO_8_2. We can check this only
890 when we are not single shifting. */
891 if (!single_shifting)
892 {
893 while (src < src_end && *src >= 0xA0)
894 src++;
895 if ((src - src_begin - 1) & 1 && src < src_end)
896 mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
897 else
898 mask_found |= CODING_CATEGORY_MASK_ISO_8_2;
899 }
4ed46869
KH
900 }
901 break;
902 }
903 }
904
d46c5b12 905 return (mask & mask_found);
4ed46869
KH
906}
907
/* Decode one character whose first position code is C1 and whose
   charset is CHARSET.

   When CHARSET is a dimension-2 charset, the second position code is
   read from SRC into C2 (ONE_MORE_BYTE leaves the loop if the source
   is exhausted).  If that byte is not a graphic/SPACE/DEL code, it is
   pushed back and C1 is decoded as ASCII instead.  A negative CHARSET
   means the text is ill formed; C1 is then emitted as is.

   The character is translated through TRANSLATION_TABLE when that
   table is non-nil and yields a translation.

   While a composition is in progress and its method is not
   COMPOSITION_RELATIVE, the decoded character is also recorded as a
   composition component in coding->cmp_data.

   This macro relies on `coding', `src', `c2' and `translation_table'
   being in scope at the point of use.  */

#define DECODE_ISO_CHARACTER(charset, c1)				\
  do {									\
    int this_charset = (charset);					\
    int translated = -1;						\
									\
    if (this_charset >= 0)						\
      {									\
	if (CHARSET_DIMENSION (this_charset) == 2)			\
	  {								\
	    int c2_class;						\
									\
	    ONE_MORE_BYTE (c2);						\
	    c2_class = iso_code_class[(c2) & 0x7F];			\
	    if (c2_class != ISO_0x20_or_0x7F				\
		&& c2_class != ISO_graphic_plane_0)			\
	      {								\
		/* Not a valid second byte: give it back and treat	\
		   C1 as an ASCII character.  */			\
		src--;							\
		this_charset = CHARSET_ASCII;				\
	      }								\
	  }								\
	if (!NILP (translation_table)					\
	    && ((translated = translate_char (translation_table,	\
					      -1, this_charset,		\
					      c1, c2)) >= 0))		\
	  SPLIT_CHAR (translated, this_charset, c1, c2);		\
      }									\
									\
    /* Emit the character itself unless the composition method	\
       stores only alternate characters.  */				\
    if (! COMPOSING_P (coding)						\
	|| coding->composing == COMPOSITION_RELATIVE			\
	|| coding->composing == COMPOSITION_WITH_RULE)			\
      {									\
	if (this_charset == CHARSET_ASCII || this_charset < 0)		\
	  DECODE_CHARACTER_ASCII (c1);					\
	else if (CHARSET_DIMENSION (this_charset) == 1)			\
	  DECODE_CHARACTER_DIMENSION1 (this_charset, c1);		\
	else								\
	  DECODE_CHARACTER_DIMENSION2 (this_charset, c1, c2);		\
      }									\
									\
    /* Record the character as a composition component.  */		\
    if (COMPOSING_P (coding)						\
	&& coding->composing != COMPOSITION_RELATIVE)			\
      {									\
	if (translated < 0)						\
	  translated = MAKE_CHAR (this_charset, c1, c2);		\
	CODING_ADD_COMPOSITION_COMPONENT (coding, translated);		\
	coding->composition_rule_follows				\
	  = (coding->composing != COMPOSITION_WITH_ALTCHARS);		\
      }									\
  } while (0)
959
/* Set designation state into CODING.

   Look up the charset identified by DIMENSION, CHARS and FINAL_CHAR
   and designate it to graphic register REG.  The designation is
   accepted only if the charset exists and either requests register
   REG or is listed in coding->safe_charsets; otherwise REG is
   remembered in last_invalid_designation_register and control jumps
   to label_invalid_code.

   REG and FINAL_CHAR are evaluated more than once, so they must be
   free of side effects.  The macro relies on `coding' and on the
   label `label_invalid_code' being in scope at the point of use.  */

#define DECODE_DESIGNATION(reg, dimension, chars, final_char)		   \
  do {									   \
    int charset;							   \
									   \
    if ((final_char) < '0' || (final_char) >= 128)			   \
      goto label_invalid_code;						   \
    charset = ISO_CHARSET_TABLE (make_number (dimension),		   \
				 make_number (chars),			   \
				 make_number (final_char));		   \
    if (charset >= 0							   \
	&& (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)	   \
	    == (reg)							   \
	    || coding->safe_charsets[charset]))				   \
      {									   \
	if (coding->spec.iso2022.last_invalid_designation_register == 0	   \
	    && (reg) == 0						   \
	    && charset == CHARSET_ASCII)				   \
	  {								   \
	    /* We should insert this designation sequence as is so	   \
	       that it is surely written back to a file.  */		   \
	    coding->spec.iso2022.last_invalid_designation_register = -1;   \
	    goto label_invalid_code;					   \
	  }								   \
	coding->spec.iso2022.last_invalid_designation_register = -1;	   \
	/* In right-to-left mode, use the reverse charset if there	   \
	   is one.  */							   \
	if ((coding->mode & CODING_MODE_DIRECTION)			   \
	    && CHARSET_REVERSE_CHARSET (charset) >= 0)			   \
	  charset = CHARSET_REVERSE_CHARSET (charset);			   \
	CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;		   \
      }									   \
    else								   \
      {									   \
	coding->spec.iso2022.last_invalid_designation_register = (reg);	   \
	goto label_invalid_code;					   \
      }									   \
  } while (0)
995
ec6d2bb8
KH
996/* Allocate a memory block for storing information about compositions.
997 The block is chained to the already allocated blocks. */
d46c5b12 998
ec6d2bb8
KH
999static void
1000coding_allocate_composition_data (coding, char_offset)
d46c5b12 1001 struct coding_system *coding;
ec6d2bb8 1002 int char_offset;
d46c5b12 1003{
ec6d2bb8
KH
1004 struct composition_data *cmp_data
1005 = (struct composition_data *) xmalloc (sizeof *cmp_data);
1006
1007 cmp_data->char_offset = char_offset;
1008 cmp_data->used = 0;
1009 cmp_data->prev = coding->cmp_data;
1010 cmp_data->next = NULL;
1011 if (coding->cmp_data)
1012 coding->cmp_data->next = cmp_data;
1013 coding->cmp_data = cmp_data;
1014 coding->cmp_data_start = 0;
1015}
d46c5b12 1016
ec6d2bb8
KH
/* Record the starting position START and METHOD of one composition.

   A four-slot header is appended to CODING->cmp_data->data:
     slot 0: -1 for now (filled in when the composition ends),
     slot 1: cmp_data->char_offset + START,
     slot 2: left untouched here,
     slot 3: METHOD.
   CODING->cmp_data_start is set to the index of slot 0.

   CODING is evaluated twice; every argument is parenthesized so that
   arbitrary expressions may be passed.  */

#define CODING_ADD_COMPOSITION_START(coding, start, method)		\
  do {									\
    struct composition_data *cmp_data = (coding)->cmp_data;		\
    int *data = cmp_data->data + cmp_data->used;			\
    (coding)->cmp_data_start = cmp_data->used;				\
    data[0] = -1;							\
    data[1] = cmp_data->char_offset + (start);				\
    data[3] = (int) (method);						\
    cmp_data->used += 4;						\
  } while (0)
1029
/* Record the ending position END of the current composition.

   The header that starts at CODING->cmp_data_start is completed:
   slot 0 receives the number of slots used by this composition and
   slot 2 receives cmp_data->char_offset + END.

   CODING is evaluated more than once; every argument is
   parenthesized so that arbitrary expressions may be passed.  */

#define CODING_ADD_COMPOSITION_END(coding, end)				\
  do {									\
    struct composition_data *cmp_data = (coding)->cmp_data;		\
    int *data = cmp_data->data + (coding)->cmp_data_start;		\
    data[0] = cmp_data->used - (coding)->cmp_data_start;		\
    data[2] = cmp_data->char_offset + (end);				\
  } while (0)
1039
/* Record one COMPONENT (alternate character or composition rule).

   COMPONENT is stored in the next free slot of
   CODING->cmp_data->data and the `used' count is incremented.  The
   value of the expression is the stored component.  CODING is
   evaluated twice, so it must be free of side effects.  */

#define CODING_ADD_COMPOSITION_COMPONENT(coding, component)		\
  ((coding)->cmp_data->data[(coding)->cmp_data->used++] = (component))
1044
/* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
   C1 is the byte following ESC.

   - If composition is disabled, the two bytes of the escape sequence
     are copied to DST.
   - If no composition is in progress, a new one is started with the
     method selected by C1, provided coding->cmp_data has room;
     otherwise RESULT is set to CODING_FINISH_INSUFFICIENT_CMP and
     control jumps to label_end_of_loop_2.
   - If a composition is already in progress with one of the ALTCHARS
     methods, the method is switched to COMPOSITION_RELATIVE.

   The macro relies on `coding', `dst', `result' and the label
   `label_end_of_loop_2' being in scope.  C1 is evaluated several
   times, so it must be free of side effects.  */

#define DECODE_COMPOSITION_START(c1)					   \
  do {									   \
    if (coding->composing == COMPOSITION_DISABLED)			   \
      {									   \
	*dst++ = ISO_CODE_ESC;						   \
	*dst++ = (c1) & 0x7f;						   \
	coding->produced_char += 2;					   \
      }									   \
    else if (!COMPOSING_P (coding))					   \
      {									   \
	/* This is surely the start of a composition.  We must be sure	   \
	   that coding->cmp_data has enough space to store the		   \
	   information about the composition.  If not, terminate the	   \
	   current decoding loop, allocate one more memory block for	   \
	   coding->cmp_data in the caller, then start the decoding	   \
	   loop again.  We can't allocate memory here directly because	   \
	   it may cause buffer/string relocation.  */			   \
	if (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH	   \
	    >= COMPOSITION_DATA_SIZE)					   \
	  {								   \
	    result = CODING_FINISH_INSUFFICIENT_CMP;			   \
	    goto label_end_of_loop_2;					   \
	  }								   \
	coding->composing = ((c1) == '0' ? COMPOSITION_RELATIVE		   \
			     : (c1) == '2' ? COMPOSITION_WITH_RULE	   \
			     : (c1) == '3' ? COMPOSITION_WITH_ALTCHARS	   \
			     : COMPOSITION_WITH_RULE_ALTCHARS);		   \
	CODING_ADD_COMPOSITION_START (coding, coding->produced_char,	   \
				      coding->composing);		   \
	coding->composition_rule_follows = 0;				   \
      }									   \
    else								   \
      {									   \
	/* We are already handling a composition.  If the method is	   \
	   the following two, the codes following the current escape	   \
	   sequence are actual characters stored in a buffer.  */	   \
	if (coding->composing == COMPOSITION_WITH_ALTCHARS		   \
	    || coding->composing == COMPOSITION_WITH_RULE_ALTCHARS)	   \
	  {								   \
	    coding->composing = COMPOSITION_RELATIVE;			   \
	    coding->composition_rule_follows = 0;			   \
	  }								   \
      }									   \
  } while (0)
1091
/* Handle composition end sequence ESC 1.  C1 is the byte following
   ESC.

   If composition is disabled, the two bytes of the escape sequence
   are copied to DST.  Otherwise the end position of the current
   composition is recorded and coding->composing is reset to
   COMPOSITION_NO.

   The macro relies on `coding' and `dst' being in scope.  */

#define DECODE_COMPOSITION_END(c1)					\
  do {									\
    if (coding->composing == COMPOSITION_DISABLED)			\
      {									\
	*dst++ = ISO_CODE_ESC;						\
	*dst++ = (c1);							\
	coding->produced_char += 2;					\
      }									\
    else								\
      {									\
	CODING_ADD_COMPOSITION_END (coding, coding->produced_char);	\
	coding->composing = COMPOSITION_NO;				\
      }									\
  } while (0)
1108
/* Decode a composition rule from the byte C1 (and maybe one more byte
   from SRC) and store one encoded composition rule in
   coding->cmp_data.

   C1 must be a modifiable lvalue: 32 is subtracted from it in place.
   After that adjustment, values below 81 are the old one-byte format,
   values 81..92 are the new two-byte format (the second byte is read
   into C2), and any other value records the rule 0.

   The macro relies on `coding', `src' and `c2' being in scope.  */

#define DECODE_COMPOSITION_RULE(c1)					\
  do {									\
    int rule = 0;							\
    (c1) -= 32;								\
    if ((c1) < 81)		/* old format (before ver.21) */	\
      {									\
	int gref = (c1) / 9;						\
	int nref = (c1) % 9;						\
	if (gref == 4) gref = 10;					\
	if (nref == 4) nref = 10;					\
	rule = COMPOSITION_ENCODE_RULE (gref, nref);			\
      }									\
    else if ((c1) < 93)		/* new format (after ver.21) */		\
      {									\
	ONE_MORE_BYTE (c2);						\
	rule = COMPOSITION_ENCODE_RULE ((c1) - 81, (c2) - 32);		\
      }									\
    CODING_ADD_COMPOSITION_COMPONENT (coding, rule);			\
    coding->composition_rule_follows = 0;				\
  } while (0)
88993dfd 1133
d46c5b12 1134
4ed46869
KH
1135/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
1136
1137int
d46c5b12 1138decode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
4ed46869
KH
1139 struct coding_system *coding;
1140 unsigned char *source, *destination;
1141 int src_bytes, dst_bytes;
4ed46869
KH
1142{
1143 unsigned char *src = source;
1144 unsigned char *src_end = source + src_bytes;
1145 unsigned char *dst = destination;
1146 unsigned char *dst_end = destination + dst_bytes;
1147 /* Since the maximum bytes produced by each loop is 7, we subtract 6
1148 from DST_END to assure that overflow checking is necessary only
1149 at the head of loop. */
1150 unsigned char *adjusted_dst_end = dst_end - 6;
1151 int charset;
1152 /* Charsets invoked to graphic plane 0 and 1 respectively. */
1153 int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1154 int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
84fbb8a0 1155 Lisp_Object translation_table
f967223b 1156 = coding->translation_table_for_decode;
d46c5b12 1157 int result = CODING_FINISH_NORMAL;
bdd9fb48 1158
84fbb8a0 1159 if (!NILP (Venable_character_translation) && NILP (translation_table))
f967223b 1160 translation_table = Vstandard_translation_table_for_decode;
4ed46869 1161
d46c5b12 1162 coding->produced_char = 0;
fb88bf2d 1163 coding->fake_multibyte = 0;
d46c5b12
KH
1164 while (src < src_end && (dst_bytes
1165 ? (dst < adjusted_dst_end)
1166 : (dst < src - 6)))
4ed46869
KH
1167 {
1168 /* SRC_BASE remembers the start position in source in each loop.
1169 The loop will be exited when there's not enough source text
1170 to analyze long escape sequence or 2-byte code (within macros
1171 ONE_MORE_BYTE or TWO_MORE_BYTES). In that case, SRC is reset
1172 to SRC_BASE before exiting. */
1173 unsigned char *src_base = src;
bdd9fb48 1174 int c1 = *src++, c2;
4ed46869 1175
ec6d2bb8 1176 /* We produce no character or one character. */
4ed46869
KH
1177 switch (iso_code_class [c1])
1178 {
1179 case ISO_0x20_or_0x7F:
ec6d2bb8
KH
1180 if (COMPOSING_P (coding) && coding->composition_rule_follows)
1181 {
1182 DECODE_COMPOSITION_RULE (c1);
1183 break;
1184 }
1185 if (charset0 < 0 || CHARSET_CHARS (charset0) == 94)
4ed46869
KH
1186 {
1187 /* This is SPACE or DEL. */
1188 *dst++ = c1;
d46c5b12 1189 coding->produced_char++;
4ed46869
KH
1190 break;
1191 }
1192 /* This is a graphic character, we fall down ... */
1193
1194 case ISO_graphic_plane_0:
ec6d2bb8
KH
1195 if (COMPOSING_P (coding) && coding->composition_rule_follows)
1196 DECODE_COMPOSITION_RULE (c1);
4ed46869
KH
1197 else
1198 DECODE_ISO_CHARACTER (charset0, c1);
1199 break;
1200
1201 case ISO_0xA0_or_0xFF:
d46c5b12
KH
1202 if (charset1 < 0 || CHARSET_CHARS (charset1) == 94
1203 || coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
fb88bf2d 1204 goto label_invalid_code;
4ed46869
KH
1205 /* This is a graphic character, we fall down ... */
1206
1207 case ISO_graphic_plane_1:
d46c5b12 1208 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
fb88bf2d 1209 goto label_invalid_code;
ec6d2bb8 1210 DECODE_ISO_CHARACTER (charset1, c1);
4ed46869
KH
1211 break;
1212
1213 case ISO_control_code:
ec6d2bb8
KH
1214 if (COMPOSING_P (coding))
1215 DECODE_COMPOSITION_END ('1');
1216
4ed46869
KH
1217 /* All ISO2022 control characters in this class have the
1218 same representation in Emacs internal format. */
d46c5b12
KH
1219 if (c1 == '\n'
1220 && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1221 && (coding->eol_type == CODING_EOL_CR
1222 || coding->eol_type == CODING_EOL_CRLF))
1223 {
1224 result = CODING_FINISH_INCONSISTENT_EOL;
1225 goto label_end_of_loop_2;
1226 }
4ed46869 1227 *dst++ = c1;
d46c5b12 1228 coding->produced_char++;
4ed46869
KH
1229 break;
1230
1231 case ISO_carriage_return:
ec6d2bb8
KH
1232 if (COMPOSING_P (coding))
1233 DECODE_COMPOSITION_END ('1');
1234
4ed46869 1235 if (coding->eol_type == CODING_EOL_CR)
d46c5b12 1236 *dst++ = '\n';
4ed46869
KH
1237 else if (coding->eol_type == CODING_EOL_CRLF)
1238 {
1239 ONE_MORE_BYTE (c1);
1240 if (c1 == ISO_CODE_LF)
1241 *dst++ = '\n';
1242 else
1243 {
d46c5b12
KH
1244 if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1245 {
1246 result = CODING_FINISH_INCONSISTENT_EOL;
1247 goto label_end_of_loop_2;
1248 }
4ed46869 1249 src--;
d46c5b12 1250 *dst++ = '\r';
4ed46869
KH
1251 }
1252 }
1253 else
d46c5b12
KH
1254 *dst++ = c1;
1255 coding->produced_char++;
4ed46869
KH
1256 break;
1257
1258 case ISO_shift_out:
d46c5b12
KH
1259 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1260 || CODING_SPEC_ISO_DESIGNATION (coding, 1) < 0)
1261 goto label_invalid_code;
4ed46869
KH
1262 CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
1263 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1264 break;
1265
1266 case ISO_shift_in:
d46c5b12
KH
1267 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
1268 goto label_invalid_code;
4ed46869
KH
1269 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
1270 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1271 break;
1272
1273 case ISO_single_shift_2_7:
1274 case ISO_single_shift_2:
d46c5b12
KH
1275 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1276 goto label_invalid_code;
4ed46869
KH
1277 /* SS2 is handled as an escape sequence of ESC 'N' */
1278 c1 = 'N';
1279 goto label_escape_sequence;
1280
1281 case ISO_single_shift_3:
d46c5b12
KH
1282 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1283 goto label_invalid_code;
4ed46869
KH
1284 /* SS2 is handled as an escape sequence of ESC 'O' */
1285 c1 = 'O';
1286 goto label_escape_sequence;
1287
1288 case ISO_control_sequence_introducer:
1289 /* CSI is handled as an escape sequence of ESC '[' ... */
1290 c1 = '[';
1291 goto label_escape_sequence;
1292
1293 case ISO_escape:
1294 ONE_MORE_BYTE (c1);
1295 label_escape_sequence:
1296 /* Escape sequences handled by Emacs are invocation,
1297 designation, direction specification, and character
1298 composition specification. */
1299 switch (c1)
1300 {
1301 case '&': /* revision of following character set */
1302 ONE_MORE_BYTE (c1);
1303 if (!(c1 >= '@' && c1 <= '~'))
d46c5b12 1304 goto label_invalid_code;
4ed46869
KH
1305 ONE_MORE_BYTE (c1);
1306 if (c1 != ISO_CODE_ESC)
d46c5b12 1307 goto label_invalid_code;
4ed46869
KH
1308 ONE_MORE_BYTE (c1);
1309 goto label_escape_sequence;
1310
1311 case '$': /* designation of 2-byte character set */
d46c5b12
KH
1312 if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
1313 goto label_invalid_code;
4ed46869
KH
1314 ONE_MORE_BYTE (c1);
1315 if (c1 >= '@' && c1 <= 'B')
1316 { /* designation of JISX0208.1978, GB2312.1980,
88993dfd 1317 or JISX0208.1980 */
4ed46869
KH
1318 DECODE_DESIGNATION (0, 2, 94, c1);
1319 }
1320 else if (c1 >= 0x28 && c1 <= 0x2B)
1321 { /* designation of DIMENSION2_CHARS94 character set */
1322 ONE_MORE_BYTE (c2);
1323 DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
1324 }
1325 else if (c1 >= 0x2C && c1 <= 0x2F)
1326 { /* designation of DIMENSION2_CHARS96 character set */
1327 ONE_MORE_BYTE (c2);
1328 DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
1329 }
1330 else
d46c5b12 1331 goto label_invalid_code;
4ed46869
KH
1332 break;
1333
1334 case 'n': /* invocation of locking-shift-2 */
d46c5b12
KH
1335 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1336 || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
1337 goto label_invalid_code;
4ed46869 1338 CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
e0e989f6 1339 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
4ed46869
KH
1340 break;
1341
1342 case 'o': /* invocation of locking-shift-3 */
d46c5b12
KH
1343 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1344 || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
1345 goto label_invalid_code;
4ed46869 1346 CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
e0e989f6 1347 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
4ed46869
KH
1348 break;
1349
1350 case 'N': /* invocation of single-shift-2 */
d46c5b12
KH
1351 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1352 || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
1353 goto label_invalid_code;
4ed46869
KH
1354 ONE_MORE_BYTE (c1);
1355 charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
1356 DECODE_ISO_CHARACTER (charset, c1);
1357 break;
1358
1359 case 'O': /* invocation of single-shift-3 */
d46c5b12
KH
1360 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1361 || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
1362 goto label_invalid_code;
4ed46869
KH
1363 ONE_MORE_BYTE (c1);
1364 charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
1365 DECODE_ISO_CHARACTER (charset, c1);
1366 break;
1367
ec6d2bb8
KH
1368 case '0': case '2': case '3': case '4': /* start composition */
1369 DECODE_COMPOSITION_START (c1);
4ed46869
KH
1370 break;
1371
ec6d2bb8
KH
1372 case '1': /* end composition */
1373 DECODE_COMPOSITION_END (c1);
4ed46869
KH
1374 break;
1375
1376 case '[': /* specification of direction */
d46c5b12
KH
1377 if (coding->flags & CODING_FLAG_ISO_NO_DIRECTION)
1378 goto label_invalid_code;
4ed46869 1379 /* For the moment, nested direction is not supported.
d46c5b12
KH
1380 So, `coding->mode & CODING_MODE_DIRECTION' zero means
1381 left-to-right, and nozero means right-to-left. */
4ed46869
KH
1382 ONE_MORE_BYTE (c1);
1383 switch (c1)
1384 {
1385 case ']': /* end of the current direction */
d46c5b12 1386 coding->mode &= ~CODING_MODE_DIRECTION;
4ed46869
KH
1387
1388 case '0': /* end of the current direction */
1389 case '1': /* start of left-to-right direction */
1390 ONE_MORE_BYTE (c1);
1391 if (c1 == ']')
d46c5b12 1392 coding->mode &= ~CODING_MODE_DIRECTION;
4ed46869 1393 else
d46c5b12 1394 goto label_invalid_code;
4ed46869
KH
1395 break;
1396
1397 case '2': /* start of right-to-left direction */
1398 ONE_MORE_BYTE (c1);
1399 if (c1 == ']')
d46c5b12 1400 coding->mode |= CODING_MODE_DIRECTION;
4ed46869 1401 else
d46c5b12 1402 goto label_invalid_code;
4ed46869
KH
1403 break;
1404
1405 default:
d46c5b12 1406 goto label_invalid_code;
4ed46869
KH
1407 }
1408 break;
1409
1410 default:
d46c5b12
KH
1411 if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
1412 goto label_invalid_code;
4ed46869
KH
1413 if (c1 >= 0x28 && c1 <= 0x2B)
1414 { /* designation of DIMENSION1_CHARS94 character set */
1415 ONE_MORE_BYTE (c2);
1416 DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
1417 }
1418 else if (c1 >= 0x2C && c1 <= 0x2F)
1419 { /* designation of DIMENSION1_CHARS96 character set */
1420 ONE_MORE_BYTE (c2);
1421 DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
1422 }
1423 else
1424 {
d46c5b12 1425 goto label_invalid_code;
4ed46869
KH
1426 }
1427 }
1428 /* We must update these variables now. */
1429 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1430 charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1431 break;
1432
d46c5b12 1433 label_invalid_code:
ec6d2bb8
KH
1434 if (COMPOSING_P (coding))
1435 DECODE_COMPOSITION_END ('1');
1436 coding->produced_char += src - src_base;
d46c5b12 1437 while (src_base < src)
ec6d2bb8 1438 *dst++ = (*src_base++) & 0x7F;
4ed46869
KH
1439 }
1440 continue;
1441
1442 label_end_of_loop:
d46c5b12
KH
1443 result = CODING_FINISH_INSUFFICIENT_SRC;
1444 label_end_of_loop_2:
4ed46869
KH
1445 src = src_base;
1446 break;
1447 }
1448
fb88bf2d 1449 if (src < src_end)
4ed46869 1450 {
fb88bf2d
KH
1451 if (result == CODING_FINISH_NORMAL)
1452 result = CODING_FINISH_INSUFFICIENT_DST;
1453 else if (result != CODING_FINISH_INCONSISTENT_EOL
1454 && coding->mode & CODING_MODE_LAST_BLOCK)
1455 {
1456 /* This is the last block of the text to be decoded. We had
1457 better just flush out all remaining codes in the text
1458 although they are not valid characters. */
ec6d2bb8
KH
1459 if (COMPOSING_P (coding))
1460 DECODE_COMPOSITION_END ('1');
fb88bf2d 1461 src_bytes = src_end - src;
ec6d2bb8
KH
1462 if (dst_bytes && (dst_end - dst < src_end - src))
1463 src_end = src + (dst_end - dst);
1464 coding->produced_char += src_end - src;
1465 while (src < src_end)
1466 *dst++ = (*src++) & 0x7F;
fb88bf2d 1467 }
4ed46869 1468 }
fb88bf2d 1469
d46c5b12
KH
1470 coding->consumed = coding->consumed_char = src - source;
1471 coding->produced = dst - destination;
1472 return result;
4ed46869
KH
1473}
1474
f4dee582 1475/* ISO2022 encoding stuff. */
4ed46869
KH
1476
1477/*
f4dee582 1478 It is not enough to say just "ISO2022" on encoding, we have to
d46c5b12 1479 specify more details. In Emacs, each coding system of ISO2022
4ed46869
KH
1480 variant has the following specifications:
1481 1. Initial designation to G0 thru G3.
1482 2. Allows short-form designation?
1483 3. ASCII should be designated to G0 before control characters?
1484 4. ASCII should be designated to G0 at end of line?
1485 5. 7-bit environment or 8-bit environment?
1486 6. Use locking-shift?
1487 7. Use Single-shift?
1488 And the following two are only for Japanese:
1489 8. Use ASCII in place of JIS0201-1976-Roman?
1490 9. Use JISX0208-1983 in place of JISX0208-1978?
1491 These specifications are encoded in `coding->flags' as flag bits
1492 defined by macros CODING_FLAG_ISO_XXX. See `coding.h' for more
f4dee582 1493 details.
4ed46869
KH
1494*/
1495
/* Produce codes (escape sequence) for designating CHARSET to graphic
   register REG.  If <final-char> of CHARSET is '@', 'A', or 'B' and
   the coding system CODING allows, produce designation sequence of
   short-form.

   When the revision number recorded for CHARSET in CODING is below
   255, the sequence is preceded by ESC '&' and '@' + revision.
   Finally CHARSET is recorded as the designation of register REG in
   CODING.

   The bytes are written through `dst', which must be in scope.
   CHARSET, REG and CODING are evaluated several times, so they must
   be free of side effects.  */

#define ENCODE_DESIGNATION(charset, reg, coding)			\
  do {									\
    unsigned char final_char = CHARSET_ISO_FINAL_CHAR (charset);	\
    /* Intermediate characters indexed by graphic register.  */	\
    const char *intermediate_char_94 = "()*+";				\
    const char *intermediate_char_96 = ",-./";				\
    int revision = CODING_SPEC_ISO_REVISION_NUMBER(coding, charset);	\
    if (revision < 255)							\
      {									\
	*dst++ = ISO_CODE_ESC;						\
	*dst++ = '&';							\
	*dst++ = '@' + revision;					\
      }									\
    *dst++ = ISO_CODE_ESC;						\
    if (CHARSET_DIMENSION (charset) == 1)				\
      {									\
	if (CHARSET_CHARS (charset) == 94)				\
	  *dst++ = (unsigned char) (intermediate_char_94[(reg)]);	\
	else								\
	  *dst++ = (unsigned char) (intermediate_char_96[(reg)]);	\
      }									\
    else								\
      {									\
	*dst++ = '$';							\
	if (CHARSET_CHARS (charset) == 94)				\
	  {								\
	    /* The intermediate character is omitted only in the	\
	       short form.  */						\
	    if (! ((coding)->flags & CODING_FLAG_ISO_SHORT_FORM)	\
		|| (reg) != 0						\
		|| final_char < '@' || final_char > 'B')		\
	      *dst++ = (unsigned char) (intermediate_char_94[(reg)]);	\
	  }								\
	else								\
	  *dst++ = (unsigned char) (intermediate_char_96[(reg)]);	\
      }									\
    *dst++ = final_char;						\
    CODING_SPEC_ISO_DESIGNATION (coding, reg) = (charset);		\
  } while (0)
1537
/* The following two macros produce codes (control character or escape
   sequence) for ISO2022 single-shift functions (single-shift-2 and
   single-shift-3).  */

/* Emit single-shift-2: ESC 'N' when CODING has the seven-bits flag,
   otherwise the single code ISO_CODE_SS2 (in which case
   coding->fake_multibyte is set).  Either way, mark CODING as being
   in the single-shifting state.  Uses `coding' and `dst' from the
   enclosing scope.  */

#define ENCODE_SINGLE_SHIFT_2					\
  do {								\
    if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS))		\
      {								\
	*dst++ = ISO_CODE_SS2;					\
	coding->fake_multibyte = 1;				\
      }								\
    else							\
      {								\
	*dst++ = ISO_CODE_ESC;					\
	*dst++ = 'N';						\
      }								\
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;		\
  } while (0)
1553
fb88bf2d
KH
/* Emit single-shift-3: ESC 'O' when CODING has the seven-bits flag,
   otherwise the single code ISO_CODE_SS3 (in which case
   coding->fake_multibyte is set).  Either way, mark CODING as being
   in the single-shifting state.  Uses `coding' and `dst' from the
   enclosing scope.  */

#define ENCODE_SINGLE_SHIFT_3					\
  do {								\
    if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS))		\
      {								\
	*dst++ = ISO_CODE_SS3;					\
	coding->fake_multibyte = 1;				\
      }								\
    else							\
      {								\
	*dst++ = ISO_CODE_ESC;					\
	*dst++ = 'O';						\
      }								\
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;		\
  } while (0)
1565
/* The following four macros produce codes (control character or
   escape sequence) for ISO2022 locking-shift functions (shift-in,
   shift-out, locking-shift-2, and locking-shift-3).  Each one also
   records in CODING which graphic register is now invoked to graphic
   plane 0.  They use `coding' and `dst' from the enclosing scope.  */

/* Shift-in: invoke graphic register 0.  */
#define ENCODE_SHIFT_IN					\
  do {							\
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;		\
    *dst++ = ISO_CODE_SI;				\
  } while (0)

/* Shift-out: invoke graphic register 1.  */
#define ENCODE_SHIFT_OUT				\
  do {							\
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;		\
    *dst++ = ISO_CODE_SO;				\
  } while (0)

/* Locking-shift-2 (ESC 'n'): invoke graphic register 2.  */
#define ENCODE_LOCKING_SHIFT_2				\
  do {							\
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;		\
    *dst++ = ISO_CODE_ESC;				\
    *dst++ = 'n';					\
  } while (0)

/* Locking-shift-3 (ESC 'o'): invoke graphic register 3.  */
#define ENCODE_LOCKING_SHIFT_3				\
  do {							\
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;		\
    *dst++ = ISO_CODE_ESC;				\
    *dst++ = 'o';					\
  } while (0)
1593
f4dee582
RS
/* Produce codes for a DIMENSION1 character whose character set is
   CHARSET and whose position-code is C1.  Designation and invocation
   sequences are also produced in advance if necessary.

   The body is a loop: if CHARSET is not yet invoked to a graphic
   plane, encode_invocation_designation is called and the tests are
   repeated.  The macro uses `coding' and `dst' from the enclosing
   scope.  CHARSET and C1 are evaluated several times, so they must
   be free of side effects.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)			\
  do {									\
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))			\
      {									\
	/* A single shift is pending; it applies to this character	\
	   only.  */							\
	if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)			\
	  *dst++ = (c1) & 0x7F;						\
	else								\
	  *dst++ = (c1) | 0x80;						\
	CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;			\
	break;								\
      }									\
    else if ((charset) == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))	\
      {									\
	*dst++ = (c1) & 0x7F;						\
	break;								\
      }									\
    else if ((charset) == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))	\
      {									\
	*dst++ = (c1) | 0x80;						\
	break;								\
      }									\
    else if ((coding->flags & CODING_FLAG_ISO_SAFE)			\
	     && !coding->safe_charsets[(charset)])			\
      {									\
	/* We should not encode this character, instead produce one or	\
	   two `?'s.  */						\
	*dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;			\
	if (CHARSET_WIDTH (charset) == 2)				\
	  *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;		\
	break;								\
      }									\
    else								\
      /* Since CHARSET is not yet invoked to any graphic planes, we	\
	 must invoke it, or, at first, designate it to some graphic	\
	 register.  Then repeat the loop to actually produce the	\
	 character.  */							\
      dst = encode_invocation_designation (charset, coding, dst);	\
  } while (1)
1637
f4dee582
RS
/* Produce codes for a DIMENSION2 character whose character set is
   CHARSET and whose position-codes are C1 and C2.  Designation and
   invocation codes are also produced in advance if necessary.

   The body is a loop: if CHARSET is not yet invoked to a graphic
   plane, encode_invocation_designation is called and the tests are
   repeated.  The macro uses `coding' and `dst' from the enclosing
   scope.  CHARSET, C1 and C2 are evaluated several times, so they
   must be free of side effects.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)		\
  do {									\
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))			\
      {									\
	/* A single shift is pending; it applies to this character	\
	   only.  */							\
	if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)			\
	  *dst++ = (c1) & 0x7F, *dst++ = (c2) & 0x7F;			\
	else								\
	  *dst++ = (c1) | 0x80, *dst++ = (c2) | 0x80;			\
	CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;			\
	break;								\
      }									\
    else if ((charset) == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))	\
      {									\
	*dst++ = (c1) & 0x7F, *dst++ = (c2) & 0x7F;			\
	break;								\
      }									\
    else if ((charset) == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))	\
      {									\
	*dst++ = (c1) | 0x80, *dst++ = (c2) | 0x80;			\
	break;								\
      }									\
    else if ((coding->flags & CODING_FLAG_ISO_SAFE)			\
	     && !coding->safe_charsets[(charset)])			\
      {									\
	/* We should not encode this character, instead produce one or	\
	   two `?'s.  */						\
	*dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;			\
	if (CHARSET_WIDTH (charset) == 2)				\
	  *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;		\
	break;								\
      }									\
    else								\
      /* Since CHARSET is not yet invoked to any graphic planes, we	\
	 must invoke it, or, at first, designate it to some graphic	\
	 register.  Then repeat the loop to actually produce the	\
	 character.  */							\
      dst = encode_invocation_designation (charset, coding, dst);	\
  } while (1)
1680
6f551029
KH
/* Produce codes for one character whose charset is CHARSET and whose
   position codes are C1 and C2, then count it in
   coding->consumed_char.

   The character is first translated through TRANSLATION_TABLE when
   that table is non-nil and yields a translation; in that case C1 and
   C2 are overwritten with the translated position codes, so they must
   be modifiable lvalues.  If the resulting charset is not defined,
   CHARSET, C1 and (when nonzero) C2 are emitted as raw bytes, with
   the eighth bit cleared when CODING has the seven-bits flag.

   The macro uses `coding', `dst' and `translation_table' from the
   enclosing scope.  CHARSET is evaluated several times, so it must be
   free of side effects.  */

#define ENCODE_ISO_CHARACTER(charset, c1, c2)				\
  do {									\
    int c_alt, charset_alt;						\
									\
    if (!NILP (translation_table)					\
	&& ((c_alt = translate_char (translation_table, -1,		\
				     charset, c1, c2))			\
	    >= 0))							\
      SPLIT_CHAR (c_alt, charset_alt, c1, c2);				\
    else								\
      charset_alt = (charset);						\
    if (CHARSET_DEFINED_P (charset_alt))				\
      {									\
	if (CHARSET_DIMENSION (charset_alt) == 1)			\
	  {								\
	    if ((charset) == CHARSET_ASCII				\
		&& (coding->flags & CODING_FLAG_ISO_USE_ROMAN))		\
	      charset_alt = charset_latin_jisx0201;			\
	    ENCODE_ISO_CHARACTER_DIMENSION1 (charset_alt, c1);		\
	  }								\
	else								\
	  {								\
	    if ((charset) == charset_jisx0208				\
		&& (coding->flags & CODING_FLAG_ISO_USE_OLDJIS))	\
	      charset_alt = charset_jisx0208_1978;			\
	    ENCODE_ISO_CHARACTER_DIMENSION2 (charset_alt, c1, c2);	\
	  }								\
      }									\
    else								\
      {									\
	/* Undefined charset: pass the bytes through as they are.  */	\
	if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)			\
	  {								\
	    *dst++ = (charset) & 0x7f;					\
	    *dst++ = (c1) & 0x7f;					\
	    if (c2)							\
	      *dst++ = (c2) & 0x7f;					\
	  }								\
	else								\
	  {								\
	    *dst++ = (charset);						\
	    *dst++ = (c1);						\
	    if (c2)							\
	      *dst++ = (c2);						\
	  }								\
      }									\
    coding->consumed_char++;						\
  } while (0)
bdd9fb48 1728
4ed46869
KH
1729/* Produce designation and invocation codes at a place pointed by DST
1730 to use CHARSET. The element `spec.iso2022' of *CODING is updated.
1731 Return new DST. */
1732
1733unsigned char *
1734encode_invocation_designation (charset, coding, dst)
1735 int charset;
1736 struct coding_system *coding;
1737 unsigned char *dst;
1738{
1739 int reg; /* graphic register number */
1740
1741 /* At first, check designations. */
1742 for (reg = 0; reg < 4; reg++)
1743 if (charset == CODING_SPEC_ISO_DESIGNATION (coding, reg))
1744 break;
1745
1746 if (reg >= 4)
1747 {
1748 /* CHARSET is not yet designated to any graphic registers. */
1749 /* At first check the requested designation. */
1750 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1ba9e4ab
KH
1751 if (reg == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
1752 /* Since CHARSET requests no special designation, designate it
1753 to graphic register 0. */
4ed46869
KH
1754 reg = 0;
1755
1756 ENCODE_DESIGNATION (charset, reg, coding);
1757 }
1758
1759 if (CODING_SPEC_ISO_INVOCATION (coding, 0) != reg
1760 && CODING_SPEC_ISO_INVOCATION (coding, 1) != reg)
1761 {
1762 /* Since the graphic register REG is not invoked to any graphic
1763 planes, invoke it to graphic plane 0. */
1764 switch (reg)
1765 {
1766 case 0: /* graphic register 0 */
1767 ENCODE_SHIFT_IN;
1768 break;
1769
1770 case 1: /* graphic register 1 */
1771 ENCODE_SHIFT_OUT;
1772 break;
1773
1774 case 2: /* graphic register 2 */
1775 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1776 ENCODE_SINGLE_SHIFT_2;
1777 else
1778 ENCODE_LOCKING_SHIFT_2;
1779 break;
1780
1781 case 3: /* graphic register 3 */
1782 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1783 ENCODE_SINGLE_SHIFT_3;
1784 else
1785 ENCODE_LOCKING_SHIFT_3;
1786 break;
1787 }
1788 }
1789 return dst;
1790}
1791
ec6d2bb8
KH
/* Write the two bytes that encode composition rule RULE at DST and
   advance DST past them.  RULE is split into its two reference
   points by COMPOSITION_DECODE_RULE.  */

#define ENCODE_COMPOSITION_RULE(rule)			\
  do {							\
    int gref, nref;					\
    COMPOSITION_DECODE_RULE (rule, gref, nref);		\
    dst[0] = 32 + 81 + gref;				\
    dst[1] = 32 + nref;					\
    dst += 2;						\
  } while (0)
1801
/* Emit the escape sequence that opens a composition: ESC 0 for a
   relative composition, ESC 3 for COMPOSITION_WITH_ALTCHARS, and
   ESC 4 for any other method.  DATA is an array of integers
   describing the composition (its format is documented in coding.h);
   DATA[3] is the composition method and is stored in
   `coding->composing'.  For the non-relative methods the component
   index is set to `cmp_data_start + 4' and the "rule follows" flag is
   cleared.  */

#define ENCODE_COMPOSITION_START(coding, data)				\
  do {									\
    /* Read the method back through the struct member so that the	\
       comparisons below see exactly what was stored.  */		\
    int cmp_method = (coding->composing = data[3]);			\
    *dst++ = ISO_CODE_ESC;						\
    if (cmp_method != COMPOSITION_RELATIVE)				\
      {									\
	if (cmp_method == COMPOSITION_WITH_ALTCHARS)			\
	  *dst++ = '3';							\
	else								\
	  *dst++ = '4';							\
	coding->cmp_data_index = coding->cmp_data_start + 4;		\
	coding->composition_rule_follows = 0;				\
      }									\
    else								\
      *dst++ = '0';							\
  } while (0)
1821
/* Emit ESC 1 to close the current composition, mark CODING as no
   longer composing, and step `cmp_data_start' past this composition's
   record (DATA[0] integers long).  When that exhausts the current
   composition-data chunk and another chunk is chained after it, move
   on to the start of the next chunk.  */

#define ENCODE_COMPOSITION_END(coding, data)				\
  do {									\
    dst[0] = ISO_CODE_ESC;						\
    dst[1] = '1';							\
    dst += 2;								\
    coding->composing = COMPOSITION_NO;					\
    coding->cmp_data_start += data[0];					\
    if (coding->cmp_data->next						\
	&& coding->cmp_data_start == coding->cmp_data->used)		\
      {									\
	coding->cmp_data = coding->cmp_data->next;			\
	coding->cmp_data_start = 0;					\
      }									\
  } while (0)
1837
/* Emit ESC 0 and switch CODING to COMPOSITION_RELATIVE.  The sequence
   does not open a new composition here: it marks the point where the
   components (alternate chars and composition rules) already written
   end and the composition's actual text, still in SRC, begins.  */

#define ENCODE_COMPOSITION_FAKE_START(coding)	\
  do {						\
    coding->composing = COMPOSITION_RELATIVE;	\
    dst[0] = ISO_CODE_ESC;			\
    dst[1] = '0';				\
    dst += 2;					\
  } while (0)
4ed46869
KH
1849
/* The following three macros produce codes for indicating direction
   of text.  They write through `dst' and read `coding->flags', both
   of which must be in scope at the point of use.

   ENCODE_CONTROL_SEQUENCE_INTRODUCER emits the two-byte form ESC [
   when CODING_FLAG_ISO_SEVEN_BITS is set and the single byte
   ISO_CODE_CSI otherwise.  The flag is tested with `&', as everywhere
   else in this file: comparing the whole flag word with `==' selected
   the 8-bit CSI whenever any other flag happened to be set too.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER		\
  do {							\
    if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)	\
      *dst++ = ISO_CODE_ESC, *dst++ = '[';		\
    else						\
      *dst++ = ISO_CODE_CSI;				\
  } while (0)

/* Each direction macro is a complete `do { } while (0)' statement.
   Appending `, *dst++ = ...' to the introducer macro, itself a
   do-while statement, does not form valid C.  */
#define ENCODE_DIRECTION_R2L			\
  do {						\
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;		\
    *dst++ = '2', *dst++ = ']';			\
  } while (0)

#define ENCODE_DIRECTION_L2R			\
  do {						\
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;		\
    *dst++ = '0', *dst++ = ']';			\
  } while (0)
1865
/* Emit whatever invocation and designation codes are needed to bring
   the graphic planes and registers back to their initial state:
   shift in if graphic plane 0 does not currently invoke register 0,
   then re-designate every register that has an initial designation
   (a non-negative value) different from its current one.  */
#define ENCODE_RESET_PLANE_AND_REGISTER					   \
  do {									   \
    int reg = 0;							   \
    if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0)			   \
      ENCODE_SHIFT_IN;							   \
    while (reg < 4)							   \
      {									   \
	if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg) >= 0	   \
	    && (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg)	   \
		!= CODING_SPEC_ISO_DESIGNATION (coding, reg)))		   \
	  {								   \
	    ENCODE_DESIGNATION						   \
	      (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg),	   \
	       reg, coding);						   \
	  }								   \
	reg++;								   \
      }									   \
  } while (0)
1880
bdd9fb48 1881/* Produce designation sequences of charsets in the line started from
d46c5b12 1882 SRC to a place pointed by *DSTP, and update DSTP.
bdd9fb48
KH
1883
1884 If the current block ends before any end-of-line, we may fail to
d46c5b12
KH
1885 find all the necessary designations. */
1886
dfcf069d 1887void
bdd9fb48 1888encode_designation_at_bol (coding, table, src, src_end, dstp)
e0e989f6 1889 struct coding_system *coding;
bdd9fb48 1890 Lisp_Object table;
e0e989f6
KH
1891 unsigned char *src, *src_end, **dstp;
1892{
bdd9fb48
KH
1893 int charset, c, found = 0, reg;
1894 /* Table of charsets to be designated to each graphic register. */
1895 int r[4];
1896 unsigned char *dst = *dstp;
1897
1898 for (reg = 0; reg < 4; reg++)
1899 r[reg] = -1;
1900
1901 while (src < src_end && *src != '\n' && found < 4)
e0e989f6 1902 {
bdd9fb48
KH
1903 int bytes = BYTES_BY_CHAR_HEAD (*src);
1904
1905 if (NILP (table))
1906 charset = CHARSET_AT (src);
1907 else
e0e989f6 1908 {
35cb8686
RS
1909 int c_alt;
1910 unsigned char c1, c2;
bdd9fb48
KH
1911
1912 SPLIT_STRING(src, bytes, charset, c1, c2);
84fbb8a0 1913 if ((c_alt = translate_char (table, -1, charset, c1, c2)) >= 0)
bdd9fb48 1914 charset = CHAR_CHARSET (c_alt);
e0e989f6 1915 }
bdd9fb48 1916
e0e989f6 1917 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
d46c5b12 1918 if (reg != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION && r[reg] < 0)
bdd9fb48
KH
1919 {
1920 found++;
1921 r[reg] = charset;
1922 }
1923
1924 src += bytes;
1925 }
1926
1927 if (found)
1928 {
1929 for (reg = 0; reg < 4; reg++)
1930 if (r[reg] >= 0
1931 && CODING_SPEC_ISO_DESIGNATION (coding, reg) != r[reg])
1932 ENCODE_DESIGNATION (r[reg], reg, coding);
1933 *dstp = dst;
e0e989f6 1934 }
e0e989f6
KH
1935}
1936
4ed46869
KH
1937/* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
1938
1939int
d46c5b12 1940encode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
4ed46869
KH
1941 struct coding_system *coding;
1942 unsigned char *source, *destination;
1943 int src_bytes, dst_bytes;
4ed46869
KH
1944{
1945 unsigned char *src = source;
1946 unsigned char *src_end = source + src_bytes;
1947 unsigned char *dst = destination;
1948 unsigned char *dst_end = destination + dst_bytes;
ec6d2bb8 1949 /* Since the maximum bytes produced by each loop is 14, we subtract 13
4ed46869
KH
1950 from DST_END to assure overflow checking is necessary only at the
1951 head of loop. */
ec6d2bb8 1952 unsigned char *adjusted_dst_end = dst_end - 13;
84fbb8a0 1953 Lisp_Object translation_table
f967223b 1954 = coding->translation_table_for_encode;
d46c5b12 1955 int result = CODING_FINISH_NORMAL;
bdd9fb48 1956
84fbb8a0 1957 if (!NILP (Venable_character_translation) && NILP (translation_table))
f967223b 1958 translation_table = Vstandard_translation_table_for_encode;
4ed46869 1959
d46c5b12 1960 coding->consumed_char = 0;
fb88bf2d 1961 coding->fake_multibyte = 0;
d46c5b12
KH
1962 while (src < src_end && (dst_bytes
1963 ? (dst < adjusted_dst_end)
ec6d2bb8 1964 : (dst < src - 13)))
4ed46869
KH
1965 {
1966 /* SRC_BASE remembers the start position in source in each loop.
1967 The loop will be exited when there's not enough source text
1968 to analyze multi-byte codes (within macros ONE_MORE_BYTE,
1969 TWO_MORE_BYTES, and THREE_MORE_BYTES). In that case, SRC is
1970 reset to SRC_BASE before exiting. */
1971 unsigned char *src_base = src;
bdd9fb48 1972 int charset, c1, c2, c3, c4;
4ed46869 1973
e0e989f6
KH
1974 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL
1975 && CODING_SPEC_ISO_BOL (coding))
1976 {
bdd9fb48 1977 /* We have to produce designation sequences if any now. */
84fbb8a0 1978 encode_designation_at_bol (coding, translation_table,
bdd9fb48 1979 src, src_end, &dst);
e0e989f6
KH
1980 CODING_SPEC_ISO_BOL (coding) = 0;
1981 }
1982
ec6d2bb8
KH
1983 /* Check composition start and end. */
1984 if (coding->composing != COMPOSITION_DISABLED
1985 && coding->cmp_data_start < coding->cmp_data->used)
4ed46869 1986 {
ec6d2bb8
KH
1987 struct composition_data *cmp_data = coding->cmp_data;
1988 int *data = cmp_data->data + coding->cmp_data_start;
1989 int this_pos = cmp_data->char_offset + coding->consumed_char;
1990
1991 if (coding->composing == COMPOSITION_RELATIVE)
4ed46869 1992 {
ec6d2bb8
KH
1993 if (this_pos == data[2])
1994 {
1995 ENCODE_COMPOSITION_END (coding, data);
1996 cmp_data = coding->cmp_data;
1997 data = cmp_data->data + coding->cmp_data_start;
1998 }
4ed46869 1999 }
ec6d2bb8 2000 else if (COMPOSING_P (coding))
4ed46869 2001 {
ec6d2bb8
KH
2002 /* COMPOSITION_WITH_ALTCHARS or COMPOSITION_WITH_RULE_ALTCHAR */
2003 if (coding->cmp_data_index == coding->cmp_data_start + data[0])
2004 /* We have consumed components of the composition.
2005 What follows in SRC is the compositions's base
2006 text. */
2007 ENCODE_COMPOSITION_FAKE_START (coding);
2008 else
4ed46869 2009 {
ec6d2bb8
KH
2010 int c = cmp_data->data[coding->cmp_data_index++];
2011 if (coding->composition_rule_follows)
2012 {
2013 ENCODE_COMPOSITION_RULE (c);
2014 coding->composition_rule_follows = 0;
2015 }
2016 else
2017 {
2018 SPLIT_CHAR (c, charset, c1, c2);
2019 ENCODE_ISO_CHARACTER (charset, c1, c2);
2020 /* But, we didn't consume a character in SRC. */
2021 coding->consumed_char--;
2022 if (coding->composing == COMPOSITION_WITH_RULE_ALTCHARS)
2023 coding->composition_rule_follows = 1;
2024 }
4ed46869
KH
2025 continue;
2026 }
ec6d2bb8
KH
2027 }
2028 if (!COMPOSING_P (coding))
2029 {
2030 if (this_pos == data[1])
4ed46869 2031 {
ec6d2bb8
KH
2032 ENCODE_COMPOSITION_START (coding, data);
2033 continue;
4ed46869 2034 }
4ed46869
KH
2035 }
2036 }
ec6d2bb8
KH
2037
2038 c1 = *src++;
4ed46869
KH
2039 /* Now encode one character. C1 is a control character, an
2040 ASCII character, or a leading-code of multi-byte character. */
2041 switch (emacs_code_class[c1])
2042 {
2043 case EMACS_ascii_code:
8dbb769e 2044 c2 = 0;
bdd9fb48 2045 ENCODE_ISO_CHARACTER (CHARSET_ASCII, c1, /* dummy */ c2);
4ed46869
KH
2046 break;
2047
2048 case EMACS_control_code:
2049 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
e0e989f6 2050 ENCODE_RESET_PLANE_AND_REGISTER;
4ed46869 2051 *dst++ = c1;
d46c5b12 2052 coding->consumed_char++;
4ed46869
KH
2053 break;
2054
2055 case EMACS_carriage_return_code:
d46c5b12 2056 if (! (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
4ed46869
KH
2057 {
2058 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
e0e989f6 2059 ENCODE_RESET_PLANE_AND_REGISTER;
4ed46869 2060 *dst++ = c1;
d46c5b12 2061 coding->consumed_char++;
4ed46869
KH
2062 break;
2063 }
2064 /* fall down to treat '\r' as '\n' ... */
2065
2066 case EMACS_linefeed_code:
2067 if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
e0e989f6
KH
2068 ENCODE_RESET_PLANE_AND_REGISTER;
2069 if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL)
2070 bcopy (coding->spec.iso2022.initial_designation,
2071 coding->spec.iso2022.current_designation,
2072 sizeof coding->spec.iso2022.initial_designation);
4ed46869 2073 if (coding->eol_type == CODING_EOL_LF
0ef69138 2074 || coding->eol_type == CODING_EOL_UNDECIDED)
4ed46869
KH
2075 *dst++ = ISO_CODE_LF;
2076 else if (coding->eol_type == CODING_EOL_CRLF)
2077 *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
2078 else
2079 *dst++ = ISO_CODE_CR;
e0e989f6 2080 CODING_SPEC_ISO_BOL (coding) = 1;
d46c5b12 2081 coding->consumed_char++;
4ed46869
KH
2082 break;
2083
2084 case EMACS_leading_code_2:
2085 ONE_MORE_BYTE (c2);
8dbb769e 2086 c3 = 0;
19a8d9e0
KH
2087 if (c2 < 0xA0)
2088 {
2089 /* invalid sequence */
2090 *dst++ = c1;
38cf95df
RS
2091 src--;
2092 coding->consumed_char++;
19a8d9e0
KH
2093 }
2094 else
2095 ENCODE_ISO_CHARACTER (c1, c2, /* dummy */ c3);
4ed46869
KH
2096 break;
2097
2098 case EMACS_leading_code_3:
2099 TWO_MORE_BYTES (c2, c3);
8dbb769e 2100 c4 = 0;
19a8d9e0
KH
2101 if (c2 < 0xA0 || c3 < 0xA0)
2102 {
2103 /* invalid sequence */
2104 *dst++ = c1;
38cf95df
RS
2105 src -= 2;
2106 coding->consumed_char++;
19a8d9e0
KH
2107 }
2108 else if (c1 < LEADING_CODE_PRIVATE_11)
bdd9fb48 2109 ENCODE_ISO_CHARACTER (c1, c2, c3);
4ed46869 2110 else
bdd9fb48 2111 ENCODE_ISO_CHARACTER (c2, c3, /* dummy */ c4);
4ed46869
KH
2112 break;
2113
2114 case EMACS_leading_code_4:
2115 THREE_MORE_BYTES (c2, c3, c4);
19a8d9e0
KH
2116 if (c2 < 0xA0 || c3 < 0xA0 || c4 < 0xA0)
2117 {
2118 /* invalid sequence */
2119 *dst++ = c1;
38cf95df
RS
2120 src -= 3;
2121 coding->consumed_char++;
19a8d9e0
KH
2122 }
2123 else
2124 ENCODE_ISO_CHARACTER (c2, c3, c4);
4ed46869
KH
2125 break;
2126
4ed46869 2127 case EMACS_invalid_code:
3efbce95
KH
2128 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
2129 ENCODE_RESET_PLANE_AND_REGISTER;
4ed46869 2130 *dst++ = c1;
d46c5b12 2131 coding->consumed_char++;
4ed46869
KH
2132 break;
2133 }
2134 continue;
2135 label_end_of_loop:
d46c5b12
KH
2136 result = CODING_FINISH_INSUFFICIENT_SRC;
2137 src = src_base;
4ed46869
KH
2138 break;
2139 }
2140
49cb52b4
KH
2141 if (src < src_end && result == CODING_FINISH_NORMAL)
2142 result = CODING_FINISH_INSUFFICIENT_DST;
2143
2144 /* If this is the last block of the text to be encoded, we must
2145 reset graphic planes and registers to the initial state, and
2146 flush out the carryover if any. */
2147 if (coding->mode & CODING_MODE_LAST_BLOCK)
84fbb8a0
KH
2148 {
2149 ENCODE_RESET_PLANE_AND_REGISTER;
ec6d2bb8
KH
2150 if (COMPOSING_P (coding))
2151 *dst++ = ISO_CODE_ESC, *dst++ = '1';
88993dfd
KH
2152 if (result == CODING_FINISH_INSUFFICIENT_SRC)
2153 {
2154 while (src < src_end && dst < dst_end)
2155 *dst++ = *src++;
2156 }
84fbb8a0 2157 }
d46c5b12
KH
2158 coding->consumed = src - source;
2159 coding->produced = coding->produced_char = dst - destination;
2160 return result;
4ed46869
KH
2161}
2162
2163\f
2164/*** 4. SJIS and BIG5 handlers ***/
2165
f4dee582 2166/* Although SJIS and BIG5 are not ISO coding systems, they are used
4ed46869
KH
2167 quite widely. So, for the moment, Emacs supports them in the bare
2168 C code. But, in the future, they may be supported only by CCL. */
2169
2170/* SJIS is a coding system encoding three character sets: ASCII, right
2171 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
2172 as is. A character of charset katakana-jisx0201 is encoded by
2173 "position-code + 0x80". A character of charset japanese-jisx0208
2174 is encoded in two bytes, but the two position-codes are divided and
2175 shifted so that they fit in the ranges below.
2176
2177 --- CODE RANGE of SJIS ---
2178 (character set) (range)
2179 ASCII 0x00 .. 0x7F
2180 KATAKANA-JISX0201 0xA0 .. 0xDF
c28a9453 2181 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF
d14d03ac 2182 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
4ed46869
KH
2183 -------------------------------
2184
2185*/
2186
2187/* BIG5 is a coding system encoding two character sets: ASCII and
2188 Big5. An ASCII character is encoded as is. Big5 is a two-byte
2189 character set and is encoded in two-byte.
2190
2191 --- CODE RANGE of BIG5 ---
2192 (character set) (range)
2193 ASCII 0x00 .. 0x7F
2194 Big5 (1st byte) 0xA1 .. 0xFE
2195 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
2196 --------------------------
2197
2198 Since the number of characters in Big5 is larger than maximum
2199 characters in Emacs' charset (96x96), it can't be handled as one
2200 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
2201 and `charset-big5-2'. Both are DIMENSION2 and CHARS94. The former
2202 contains frequently used characters and the latter contains less
2203 frequently used characters. */
2204
/* Macros to decode or encode a character of Big5 in BIG5.  B1 and B2
   are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
   C1 and C2 are the 1st and 2nd position-codes of Emacs' internal
   format.  CHARSET is `charset_big5_1' or `charset_big5_2'.

   Input arguments are evaluated exactly once and every argument is
   parenthesized, so arbitrary expressions may be passed for them.
   Output arguments must be lvalues.  */

/* Number of Big5 characters which have the same code in 1st byte.  */
#define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)

/* Convert the BIG5 byte pair (B1, B2) to CHARSET and the position
   codes C1 and C2.  Rows whose first byte is below 0xC9 belong to
   `charset_big5_1', the rest to `charset_big5_2'.  */
#define DECODE_BIG5(b1, b2, charset, c1, c2)				\
  do {									\
    unsigned int big5_hi = (b1), big5_lo = (b2);			\
    /* Linear index of the character within the whole Big5 table.  */	\
    unsigned int temp							\
      = (big5_hi - 0xA1) * BIG5_SAME_ROW				\
	+ big5_lo - (big5_lo < 0x7F ? 0x40 : 0x62);			\
    if (big5_hi < 0xC9)							\
      (charset) = charset_big5_1;					\
    else								\
      {									\
	(charset) = charset_big5_2;					\
	temp -= (0xC9 - 0xA1) * BIG5_SAME_ROW;				\
      }									\
    (c1) = temp / (0xFF - 0xA1) + 0x21;					\
    (c2) = temp % (0xFF - 0xA1) + 0x21;					\
  } while (0)

/* Inverse of DECODE_BIG5: convert CHARSET and the position codes C1
   and C2 to the BIG5 byte pair (B1, B2).  */
#define ENCODE_BIG5(charset, c1, c2, b1, b2)				\
  do {									\
    unsigned int temp							\
      = ((c1) - 0x21) * (0xFF - 0xA1) + ((c2) - 0x21);			\
    unsigned int big5_lo;						\
    if ((charset) == charset_big5_2)					\
      temp += BIG5_SAME_ROW * (0xC9 - 0xA1);				\
    (b1) = temp / BIG5_SAME_ROW + 0xA1;					\
    big5_lo = temp % BIG5_SAME_ROW;					\
    /* The second byte skips the gap between 0x7E and 0xA1.  */	\
    (b2) = big5_lo + (big5_lo < 0x3F ? 0x40 : 0x62);			\
  } while (0)
2237
a5d301df
KH
/* Produce at DST the internal representation of the character
   (CHARSET, C1, C2).  When TRANSLATION_TABLE is non-nil the character
   is first passed through translate_char, and a non-negative result
   replaces the charset and position codes.  A negative charset is
   treated like ASCII.  Relies on `translation_table' being in scope
   at the point of use.  */
#define DECODE_SJIS_BIG5_CHARACTER(charset, c1, c2)			\
  do {									\
    int decoded_charset = (charset);					\
    if (!NILP (translation_table))					\
      {									\
	int translated = translate_char (translation_table,		\
					 -1, (charset), c1, c2);	\
	if (translated >= 0)						\
	  SPLIT_CHAR (translated, decoded_charset, c1, c2);		\
      }									\
    if (decoded_charset < 0 || decoded_charset == CHARSET_ASCII)	\
      DECODE_CHARACTER_ASCII (c1);					\
    else if (CHARSET_DIMENSION (decoded_charset) != 1)			\
      DECODE_CHARACTER_DIMENSION2 (decoded_charset, c1, c2);		\
    else								\
      DECODE_CHARACTER_DIMENSION1 (decoded_charset, c1);		\
  } while (0)
2252
84fbb8a0
KH
/* Encode the character (CHARSET, C1, C2) at DST as SJIS (when
   `sjis_p' is non-zero) or BIG5 text, after passing it through
   TRANSLATION_TABLE when that is non-nil.  Characters of charsets the
   target coding system cannot express are written out as the charset
   byte followed by the position codes.  `coding->fake_multibyte' is
   set on those paths and on the SJIS two-byte path.
   `coding->consumed_char' is always incremented.  Relies on `coding',
   `dst', `sjis_p' and `translation_table' being in scope at the point
   of use.  */
#define ENCODE_SJIS_BIG5_CHARACTER(charset, c1, c2)			\
  do {									\
    int out_charset = charset;						\
    if (!NILP (translation_table))					\
      {									\
	int translated = translate_char (translation_table, -1,		\
					 charset, c1, c2);		\
	if (translated >= 0)						\
	  SPLIT_CHAR (translated, out_charset, c1, c2);			\
      }									\
    if (out_charset == charset_ascii)					\
      *dst++ = c1;							\
    else if (CHARSET_DIMENSION (out_charset) != 1)			\
      {									\
	c1 &= 0x7F;							\
	c2 &= 0x7F;							\
	if (sjis_p && (out_charset == charset_jisx0208			\
		       || out_charset == charset_jisx0208_1978))	\
	  {								\
	    unsigned char s1, s2;					\
									\
	    ENCODE_SJIS (c1, c2, s1, s2);				\
	    *dst++ = s1;						\
	    *dst++ = s2;						\
	    coding->fake_multibyte = 1;					\
	  }								\
	else if (!sjis_p						\
		 && (out_charset == charset_big5_1			\
		     || out_charset == charset_big5_2))			\
	  {								\
	    unsigned char b1, b2;					\
									\
	    ENCODE_BIG5 (out_charset, c1, c2, b1, b2);			\
	    *dst++ = b1;						\
	    *dst++ = b2;						\
	  }								\
	else								\
	  {								\
	    *dst++ = out_charset;					\
	    *dst++ = c1;						\
	    *dst++ = c2;						\
	    coding->fake_multibyte = 1;					\
	  }								\
      }									\
    else if (sjis_p && out_charset == charset_katakana_jisx0201)	\
      *dst++ = c1;							\
    else if (sjis_p && out_charset == charset_latin_jisx0201)		\
      *dst++ = c1 & 0x7F;						\
    else								\
      {								\
	*dst++ = out_charset;						\
	*dst++ = c1;							\
	coding->fake_multibyte = 1;					\
      }									\
    coding->consumed_char++;						\
  } while (0)
a5d301df 2306
4ed46869
KH
2307/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2308 Check if a text is encoded in SJIS. If it is, return
2309 CODING_CATEGORY_MASK_SJIS, else return 0. */
2310
2311int
2312detect_coding_sjis (src, src_end)
2313 unsigned char *src, *src_end;
2314{
2315 unsigned char c;
2316
2317 while (src < src_end)
2318 {
2319 c = *src++;
4ed46869
KH
2320 if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
2321 {
2322 if (src < src_end && *src++ < 0x40)
2323 return 0;
2324 }
2325 }
2326 return CODING_CATEGORY_MASK_SJIS;
2327}
2328
2329/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2330 Check if a text is encoded in BIG5. If it is, return
2331 CODING_CATEGORY_MASK_BIG5, else return 0. */
2332
2333int
2334detect_coding_big5 (src, src_end)
2335 unsigned char *src, *src_end;
2336{
2337 unsigned char c;
2338
2339 while (src < src_end)
2340 {
2341 c = *src++;
4ed46869
KH
2342 if (c >= 0xA1)
2343 {
2344 if (src >= src_end)
2345 break;
2346 c = *src++;
2347 if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
2348 return 0;
2349 }
2350 }
2351 return CODING_CATEGORY_MASK_BIG5;
2352}
2353
fa42c37f
KH
2354/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2355 Check if a text is encoded in UTF-8. If it is, return
2356 CODING_CATEGORY_MASK_UTF_8, else return 0. */
2357
2358#define UTF_8_1_OCTET_P(c) ((c) < 0x80)
2359#define UTF_8_EXTRA_OCTET_P(c) (((c) & 0xC0) == 0x80)
2360#define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
2361#define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
2362#define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
2363#define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
2364#define UTF_8_6_OCTET_LEADING_P(c) (((c) & 0xFE) == 0xFC)
2365
2366int
2367detect_coding_utf_8 (src, src_end)
2368 unsigned char *src, *src_end;
2369{
2370 unsigned char c;
2371 int seq_maybe_bytes;
2372
2373 while (src < src_end)
2374 {
2375 c = *src++;
2376 if (UTF_8_1_OCTET_P (c))
2377 continue;
2378 else if (UTF_8_2_OCTET_LEADING_P (c))
2379 seq_maybe_bytes = 1;
2380 else if (UTF_8_3_OCTET_LEADING_P (c))
2381 seq_maybe_bytes = 2;
2382 else if (UTF_8_4_OCTET_LEADING_P (c))
2383 seq_maybe_bytes = 3;
2384 else if (UTF_8_5_OCTET_LEADING_P (c))
2385 seq_maybe_bytes = 4;
2386 else if (UTF_8_6_OCTET_LEADING_P (c))
2387 seq_maybe_bytes = 5;
2388 else
2389 return 0;
2390
2391 do
2392 {
2393 if (src >= src_end)
2394 return CODING_CATEGORY_MASK_UTF_8;
2395
2396 c = *src++;
2397 if (!UTF_8_EXTRA_OCTET_P (c))
2398 return 0;
2399 seq_maybe_bytes--;
2400 }
2401 while (seq_maybe_bytes > 0);
2402 }
2403
2404 return CODING_CATEGORY_MASK_UTF_8;
2405}
2406
2407/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2408 Check if a text is encoded in UTF-16 Big Endian (endian == 1) or
2409 Little Endian (otherwise). If it is, return
2410 CODING_CATEGORY_MASK_UTF_16_BE or CODING_CATEGORY_MASK_UTF_16_LE,
2411 else return 0. */
2412
/* Non-zero if the 16-bit value VAL is 0xFFFE or 0xFFFF.  */
#define UTF_16_INVALID_P(val)	\
  (((val) == 0xFFFE)		\
   || ((val) == 0xFFFF))

/* Non-zero if VAL is a high (leading) surrogate, 0xD800..0xDBFF.
   The top six bits identify the range, hence the 0xFC00 mask;
   masking with 0xD800 also matched low surrogates and many ordinary
   values such as 0xF900.  */
#define UTF_16_HIGH_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xD800)

/* Non-zero if VAL is a low (trailing) surrogate, 0xDC00..0xDFFF.  */
#define UTF_16_LOW_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xDC00)
2422
2423int
2424detect_coding_utf_16 (src, src_end)
2425 unsigned char *src, *src_end;
2426{
2427 if ((src + 1) >= src_end) return 0;
2428
2429 if ((src[0] == 0xFF) && (src[1] == 0xFE))
2430 return CODING_CATEGORY_MASK_UTF_16_LE;
2431 else if ((src[0] == 0xFE) && (src[1] == 0xFF))
2432 return CODING_CATEGORY_MASK_UTF_16_BE;
2433
2434 return 0;
2435}
2436
4ed46869
KH
2437/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
2438 If SJIS_P is 1, decode SJIS text, else decode BIG5 test. */
2439
2440int
2441decode_coding_sjis_big5 (coding, source, destination,
d46c5b12 2442 src_bytes, dst_bytes, sjis_p)
4ed46869
KH
2443 struct coding_system *coding;
2444 unsigned char *source, *destination;
2445 int src_bytes, dst_bytes;
4ed46869
KH
2446 int sjis_p;
2447{
2448 unsigned char *src = source;
2449 unsigned char *src_end = source + src_bytes;
2450 unsigned char *dst = destination;
2451 unsigned char *dst_end = destination + dst_bytes;
2452 /* Since the maximum bytes produced by each loop is 4, we subtract 3
2453 from DST_END to assure overflow checking is necessary only at the
2454 head of loop. */
2455 unsigned char *adjusted_dst_end = dst_end - 3;
84fbb8a0 2456 Lisp_Object translation_table
f967223b 2457 = coding->translation_table_for_decode;
d46c5b12 2458 int result = CODING_FINISH_NORMAL;
a5d301df 2459
84fbb8a0 2460 if (!NILP (Venable_character_translation) && NILP (translation_table))
f967223b 2461 translation_table = Vstandard_translation_table_for_decode;
4ed46869 2462
d46c5b12 2463 coding->produced_char = 0;
fb88bf2d 2464 coding->fake_multibyte = 0;
d46c5b12
KH
2465 while (src < src_end && (dst_bytes
2466 ? (dst < adjusted_dst_end)
2467 : (dst < src - 3)))
4ed46869
KH
2468 {
2469 /* SRC_BASE remembers the start position in source in each loop.
2470 The loop will be exited when there's not enough source text
2471 to analyze two-byte character (within macro ONE_MORE_BYTE).
2472 In that case, SRC is reset to SRC_BASE before exiting. */
2473 unsigned char *src_base = src;
2474 unsigned char c1 = *src++, c2, c3, c4;
2475
d46c5b12 2476 if (c1 < 0x20)
4ed46869 2477 {
d46c5b12 2478 if (c1 == '\r')
4ed46869 2479 {
d46c5b12
KH
2480 if (coding->eol_type == CODING_EOL_CRLF)
2481 {
2482 ONE_MORE_BYTE (c2);
2483 if (c2 == '\n')
2484 *dst++ = c2;
2485 else if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2486 {
2487 result = CODING_FINISH_INCONSISTENT_EOL;
2488 goto label_end_of_loop_2;
2489 }
2490 else
2491 /* To process C2 again, SRC is subtracted by 1. */
2492 *dst++ = c1, src--;
2493 }
2494 else if (coding->eol_type == CODING_EOL_CR)
2495 *dst++ = '\n';
4ed46869 2496 else
d46c5b12
KH
2497 *dst++ = c1;
2498 }
2499 else if (c1 == '\n'
2500 && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2501 && (coding->eol_type == CODING_EOL_CR
2502 || coding->eol_type == CODING_EOL_CRLF))
2503 {
2504 result = CODING_FINISH_INCONSISTENT_EOL;
2505 goto label_end_of_loop_2;
4ed46869
KH
2506 }
2507 else
2508 *dst++ = c1;
d46c5b12 2509 coding->produced_char++;
4ed46869 2510 }
a5d301df 2511 else if (c1 < 0x80)
5e34de15
KH
2512 {
2513 c2 = 0; /* avoid warning */
2514 DECODE_SJIS_BIG5_CHARACTER (charset_ascii, c1, /* dummy */ c2);
2515 }
54f78171 2516 else
4ed46869 2517 {
4ed46869
KH
2518 if (sjis_p)
2519 {
54f78171 2520 if (c1 < 0xA0 || (c1 >= 0xE0 && c1 < 0xF0))
fb88bf2d 2521 {
54f78171
KH
2522 /* SJIS -> JISX0208 */
2523 ONE_MORE_BYTE (c2);
d14d03ac 2524 if (c2 >= 0x40 && c2 != 0x7F && c2 <= 0xFC)
54f78171
KH
2525 {
2526 DECODE_SJIS (c1, c2, c3, c4);
2527 DECODE_SJIS_BIG5_CHARACTER (charset_jisx0208, c3, c4);
2528 }
2529 else
2530 goto label_invalid_code_2;
fb88bf2d 2531 }
54f78171
KH
2532 else if (c1 < 0xE0)
2533 /* SJIS -> JISX0201-Kana */
5e34de15
KH
2534 {
2535 c2 = 0; /* avoid warning */
2536 DECODE_SJIS_BIG5_CHARACTER (charset_katakana_jisx0201, c1,
2537 /* dummy */ c2);
2538 }
fb88bf2d 2539 else
54f78171 2540 goto label_invalid_code_1;
4ed46869 2541 }
fb88bf2d 2542 else
fb88bf2d 2543 {
54f78171
KH
2544 /* BIG5 -> Big5 */
2545 if (c1 >= 0xA1 && c1 <= 0xFE)
fb88bf2d 2546 {
54f78171
KH
2547 ONE_MORE_BYTE (c2);
2548 if ((c2 >= 0x40 && c2 <= 0x7E) || (c2 >= 0xA1 && c2 <= 0xFE))
2549 {
2550 int charset;
4ed46869 2551
54f78171
KH
2552 DECODE_BIG5 (c1, c2, charset, c3, c4);
2553 DECODE_SJIS_BIG5_CHARACTER (charset, c3, c4);
2554 }
2555 else
2556 goto label_invalid_code_2;
fb88bf2d
KH
2557 }
2558 else
54f78171 2559 goto label_invalid_code_1;
4ed46869
KH
2560 }
2561 }
2562 continue;
2563
fb88bf2d
KH
2564 label_invalid_code_1:
2565 *dst++ = c1;
2566 coding->produced_char++;
2567 coding->fake_multibyte = 1;
2568 continue;
2569
2570 label_invalid_code_2:
2571 *dst++ = c1; *dst++= c2;
2572 coding->produced_char += 2;
2573 coding->fake_multibyte = 1;
2574 continue;
2575
4ed46869 2576 label_end_of_loop:
d46c5b12
KH
2577 result = CODING_FINISH_INSUFFICIENT_SRC;
2578 label_end_of_loop_2:
4ed46869
KH
2579 src = src_base;
2580 break;
2581 }
2582
fb88bf2d
KH
2583 if (src < src_end)
2584 {
2585 if (result == CODING_FINISH_NORMAL)
2586 result = CODING_FINISH_INSUFFICIENT_DST;
2587 else if (result != CODING_FINISH_INCONSISTENT_EOL
2588 && coding->mode & CODING_MODE_LAST_BLOCK)
2589 {
2590 src_bytes = src_end - src;
2591 if (dst_bytes && (dst_end - dst < src_bytes))
2592 src_bytes = dst_end - dst;
2593 bcopy (dst, src, src_bytes);
2594 src += src_bytes;
2595 dst += src_bytes;
2596 coding->fake_multibyte = 1;
2597 }
2598 }
d46c5b12
KH
2599
2600 coding->consumed = coding->consumed_char = src - source;
2601 coding->produced = dst - destination;
2602 return result;
4ed46869
KH
2603}
2604
2605/* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
2606 This function can encode `charset_ascii', `charset_katakana_jisx0201',
2607 `charset_jisx0208', `charset_big5_1', and `charset_big5-2'. We are
2608 sure that all these charsets are registered as official charset
2609 (i.e. do not have extended leading-codes). Characters of other
2610 charsets are produced without any encoding. If SJIS_P is 1, encode
2611 SJIS text, else encode BIG5 text. */
2612
2613int
2614encode_coding_sjis_big5 (coding, source, destination,
d46c5b12 2615 src_bytes, dst_bytes, sjis_p)
4ed46869
KH
2616 struct coding_system *coding;
2617 unsigned char *source, *destination;
2618 int src_bytes, dst_bytes;
4ed46869
KH
2619 int sjis_p;
2620{
2621 unsigned char *src = source;
2622 unsigned char *src_end = source + src_bytes;
2623 unsigned char *dst = destination;
2624 unsigned char *dst_end = destination + dst_bytes;
2625 /* Since the maximum bytes produced by each loop is 2, we subtract 1
2626 from DST_END to assure overflow checking is necessary only at the
2627 head of loop. */
2628 unsigned char *adjusted_dst_end = dst_end - 1;
84fbb8a0 2629 Lisp_Object translation_table
f967223b 2630 = coding->translation_table_for_encode;
d46c5b12 2631 int result = CODING_FINISH_NORMAL;
a5d301df 2632
84fbb8a0 2633 if (!NILP (Venable_character_translation) && NILP (translation_table))
f967223b 2634 translation_table = Vstandard_translation_table_for_encode;
4ed46869 2635
d46c5b12 2636 coding->consumed_char = 0;
fb88bf2d 2637 coding->fake_multibyte = 0;
d46c5b12
KH
2638 while (src < src_end && (dst_bytes
2639 ? (dst < adjusted_dst_end)
2640 : (dst < src - 1)))
4ed46869
KH
2641 {
2642 /* SRC_BASE remembers the start position in source in each loop.
2643 The loop will be exited when there's not enough source text
2644 to analyze multi-byte codes (within macros ONE_MORE_BYTE and
2645 TWO_MORE_BYTES). In that case, SRC is reset to SRC_BASE
2646 before exiting. */
2647 unsigned char *src_base = src;
2648 unsigned char c1 = *src++, c2, c3, c4;
2649
4ed46869
KH
2650 switch (emacs_code_class[c1])
2651 {
2652 case EMACS_ascii_code:
a5d301df
KH
2653 ENCODE_SJIS_BIG5_CHARACTER (charset_ascii, c1, /* dummy */ c2);
2654 break;
2655
4ed46869
KH
2656 case EMACS_control_code:
2657 *dst++ = c1;
d46c5b12 2658 coding->consumed_char++;
4ed46869
KH
2659 break;
2660
2661 case EMACS_carriage_return_code:
d46c5b12 2662 if (! (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
4ed46869
KH
2663 {
2664 *dst++ = c1;
d46c5b12 2665 coding->consumed_char++;
4ed46869
KH
2666 break;
2667 }
2668 /* fall down to treat '\r' as '\n' ... */
2669
2670 case EMACS_linefeed_code:
2671 if (coding->eol_type == CODING_EOL_LF
0ef69138 2672 || coding->eol_type == CODING_EOL_UNDECIDED)
4ed46869
KH
2673 *dst++ = '\n';
2674 else if (coding->eol_type == CODING_EOL_CRLF)
2675 *dst++ = '\r', *dst++ = '\n';
2676 else
2677 *dst++ = '\r';
d46c5b12 2678 coding->consumed_char++;
4ed46869
KH
2679 break;
2680
2681 case EMACS_leading_code_2:
2682 ONE_MORE_BYTE (c2);
a5d301df 2683 ENCODE_SJIS_BIG5_CHARACTER (c1, c2, /* dummy */ c3);
4ed46869
KH
2684 break;
2685
2686 case EMACS_leading_code_3:
2687 TWO_MORE_BYTES (c2, c3);
a5d301df 2688 ENCODE_SJIS_BIG5_CHARACTER (c1, c2, c3);
4ed46869
KH
2689 break;
2690
2691 case EMACS_leading_code_4:
2692 THREE_MORE_BYTES (c2, c3, c4);
a5d301df 2693 ENCODE_SJIS_BIG5_CHARACTER (c2, c3, c4);
4ed46869
KH
2694 break;
2695
4ed46869
KH
2696 default: /* i.e. case EMACS_invalid_code: */
2697 *dst++ = c1;
d46c5b12 2698 coding->consumed_char++;
4ed46869
KH
2699 }
2700 continue;
2701
2702 label_end_of_loop:
d46c5b12
KH
2703 result = CODING_FINISH_INSUFFICIENT_SRC;
2704 src = src_base;
4ed46869
KH
2705 break;
2706 }
2707
d46c5b12
KH
2708 if (result == CODING_FINISH_NORMAL
2709 && src < src_end)
2710 result = CODING_FINISH_INSUFFICIENT_DST;
2711 coding->consumed = src - source;
2712 coding->produced = coding->produced_char = dst - destination;
2713 return result;
4ed46869
KH
2714}
2715
2716\f
1397dc18
KH
2717/*** 5. CCL handlers ***/
2718
2719/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2720 Check if a text is encoded in a coding system of which
2721 encoder/decoder are written in CCL program. If it is, return
2722 CODING_CATEGORY_MASK_CCL, else return 0. */
2723
2724int
2725detect_coding_ccl (src, src_end)
2726 unsigned char *src, *src_end;
2727{
2728 unsigned char *valid;
2729
2730 /* No coding system is assigned to coding-category-ccl. */
2731 if (!coding_system_table[CODING_CATEGORY_IDX_CCL])
2732 return 0;
2733
2734 valid = coding_system_table[CODING_CATEGORY_IDX_CCL]->spec.ccl.valid_codes;
2735 while (src < src_end)
2736 {
2737 if (! valid[*src]) return 0;
2738 src++;
2739 }
2740 return CODING_CATEGORY_MASK_CCL;
2741}
2742
2743\f
2744/*** 6. End-of-line handlers ***/
4ed46869
KH
2745
2746/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
2747 This function is called only when `coding->eol_type' is
2748 CODING_EOL_CRLF or CODING_EOL_CR. */
2749
dfcf069d 2750int
d46c5b12 2751decode_eol (coding, source, destination, src_bytes, dst_bytes)
4ed46869
KH
2752 struct coding_system *coding;
2753 unsigned char *source, *destination;
2754 int src_bytes, dst_bytes;
4ed46869
KH
2755{
2756 unsigned char *src = source;
2757 unsigned char *src_end = source + src_bytes;
2758 unsigned char *dst = destination;
2759 unsigned char *dst_end = destination + dst_bytes;
fb88bf2d 2760 unsigned char c;
d46c5b12
KH
2761 int result = CODING_FINISH_NORMAL;
2762
fb88bf2d
KH
2763 coding->fake_multibyte = 0;
2764
d46c5b12 2765 if (src_bytes <= 0)
716e0b0a
AI
2766 {
2767 coding->produced = coding->produced_char = 0;
2768 coding->consumed = coding->consumed_char = 0;
2769 return result;
2770 }
4ed46869
KH
2771
2772 switch (coding->eol_type)
2773 {
2774 case CODING_EOL_CRLF:
2775 {
2776 /* Since the maximum bytes produced by each loop is 2, we
2777 subtract 1 from DST_END to assure overflow checking is
2778 necessary only at the head of loop. */
2779 unsigned char *adjusted_dst_end = dst_end - 1;
2780
d46c5b12
KH
2781 while (src < src_end && (dst_bytes
2782 ? (dst < adjusted_dst_end)
2783 : (dst < src - 1)))
4ed46869
KH
2784 {
2785 unsigned char *src_base = src;
fb88bf2d
KH
2786
2787 c = *src++;
4ed46869
KH
2788 if (c == '\r')
2789 {
2790 ONE_MORE_BYTE (c);
fdfcf19d
KH
2791 if (c == '\n')
2792 *dst++ = c;
2793 else
d46c5b12
KH
2794 {
2795 if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2796 {
2797 result = CODING_FINISH_INCONSISTENT_EOL;
2798 goto label_end_of_loop_2;
2799 }
fdfcf19d 2800 src--;
d46c5b12 2801 *dst++ = '\r';
fb88bf2d
KH
2802 if (BASE_LEADING_CODE_P (c))
2803 coding->fake_multibyte = 1;
d46c5b12 2804 }
4ed46869 2805 }
d46c5b12
KH
2806 else if (c == '\n'
2807 && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL))
2808 {
2809 result = CODING_FINISH_INCONSISTENT_EOL;
2810 goto label_end_of_loop_2;
2811 }
4ed46869 2812 else
fb88bf2d
KH
2813 {
2814 *dst++ = c;
2815 if (BASE_LEADING_CODE_P (c))
2816 coding->fake_multibyte = 1;
2817 }
4ed46869
KH
2818 continue;
2819
2820 label_end_of_loop:
d46c5b12
KH
2821 result = CODING_FINISH_INSUFFICIENT_SRC;
2822 label_end_of_loop_2:
4ed46869
KH
2823 src = src_base;
2824 break;
2825 }
fdfcf19d
KH
2826 if (src < src_end)
2827 {
2828 if (result == CODING_FINISH_NORMAL)
2829 result = CODING_FINISH_INSUFFICIENT_DST;
2830 else if (result != CODING_FINISH_INCONSISTENT_EOL
2831 && coding->mode & CODING_MODE_LAST_BLOCK)
2832 {
2833 /* This is the last block of the text to be decoded.
2834 We flush out all remaining codes. */
2835 src_bytes = src_end - src;
2836 if (dst_bytes && (dst_end - dst < src_bytes))
2837 src_bytes = dst_end - dst;
2838 bcopy (src, dst, src_bytes);
2839 dst += src_bytes;
2840 src += src_bytes;
2841 }
2842 }
4ed46869 2843 }
d46c5b12 2844 break;
4ed46869
KH
2845
2846 case CODING_EOL_CR:
d46c5b12
KH
2847 if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2848 {
fb88bf2d
KH
2849 while (src < src_end)
2850 {
2851 if ((c = *src++) == '\n')
2852 break;
2853 if (BASE_LEADING_CODE_P (c))
2854 coding->fake_multibyte = 1;
2855 }
d46c5b12
KH
2856 if (*--src == '\n')
2857 {
2858 src_bytes = src - source;
2859 result = CODING_FINISH_INCONSISTENT_EOL;
2860 }
2861 }
2862 if (dst_bytes && src_bytes > dst_bytes)
2863 {
2864 result = CODING_FINISH_INSUFFICIENT_DST;
2865 src_bytes = dst_bytes;
2866 }
2867 if (dst_bytes)
2868 bcopy (source, destination, src_bytes);
2869 else
2870 safe_bcopy (source, destination, src_bytes);
2871 src = source + src_bytes;
2872 while (src_bytes--) if (*dst++ == '\r') dst[-1] = '\n';
4ed46869
KH
2873 break;
2874
2875 default: /* i.e. case: CODING_EOL_LF */
d46c5b12
KH
2876 if (dst_bytes && src_bytes > dst_bytes)
2877 {
2878 result = CODING_FINISH_INSUFFICIENT_DST;
2879 src_bytes = dst_bytes;
2880 }
2881 if (dst_bytes)
2882 bcopy (source, destination, src_bytes);
2883 else
2884 safe_bcopy (source, destination, src_bytes);
2885 src += src_bytes;
993824c9 2886 dst += src_bytes;
fb88bf2d 2887 coding->fake_multibyte = 1;
4ed46869
KH
2888 break;
2889 }
2890
d46c5b12
KH
2891 coding->consumed = coding->consumed_char = src - source;
2892 coding->produced = coding->produced_char = dst - destination;
2893 return result;
4ed46869
KH
2894}
2895
2896/* See "GENERAL NOTES about `encode_coding_XXX ()' functions". Encode
2897 format of end-of-line according to `coding->eol_type'. If
d46c5b12
KH
2898 `coding->mode & CODING_MODE_SELECTIVE_DISPLAY' is nonzero, code
2899 '\r' in source text also means end-of-line. */
4ed46869 2900
dfcf069d 2901int
d46c5b12 2902encode_eol (coding, source, destination, src_bytes, dst_bytes)
4ed46869
KH
2903 struct coding_system *coding;
2904 unsigned char *source, *destination;
2905 int src_bytes, dst_bytes;
4ed46869
KH
2906{
2907 unsigned char *src = source;
2908 unsigned char *dst = destination;
d46c5b12 2909 int result = CODING_FINISH_NORMAL;
4ed46869 2910
fb88bf2d
KH
2911 coding->fake_multibyte = 0;
2912
d46c5b12
KH
2913 if (coding->eol_type == CODING_EOL_CRLF)
2914 {
2915 unsigned char c;
2916 unsigned char *src_end = source + src_bytes;
2917 unsigned char *dst_end = destination + dst_bytes;
2918 /* Since the maximum bytes produced by each loop is 2, we
2919 subtract 1 from DST_END to assure overflow checking is
2920 necessary only at the head of loop. */
2921 unsigned char *adjusted_dst_end = dst_end - 1;
2922
2923 while (src < src_end && (dst_bytes
2924 ? (dst < adjusted_dst_end)
2925 : (dst < src - 1)))
2926 {
2927 c = *src++;
2928 if (c == '\n'
2929 || (c == '\r' && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY)))
2930 *dst++ = '\r', *dst++ = '\n';
2931 else
fb88bf2d
KH
2932 {
2933 *dst++ = c;
2934 if (BASE_LEADING_CODE_P (c))
2935 coding->fake_multibyte = 1;
2936 }
d46c5b12
KH
2937 }
2938 if (src < src_end)
2939 result = CODING_FINISH_INSUFFICIENT_DST;
2940 }
2941 else
4ed46869 2942 {
fb88bf2d
KH
2943 unsigned char c;
2944
d46c5b12 2945 if (dst_bytes && src_bytes > dst_bytes)
4ed46869 2946 {
d46c5b12
KH
2947 src_bytes = dst_bytes;
2948 result = CODING_FINISH_INSUFFICIENT_DST;
2949 }
2950 if (dst_bytes)
2951 bcopy (source, destination, src_bytes);
2952 else
993824c9
RS
2953 safe_bcopy (source, destination, src_bytes);
2954 dst_bytes = src_bytes;
2955 if (coding->eol_type == CODING_EOL_CR)
d46c5b12
KH
2956 {
2957 while (src_bytes--)
fb88bf2d
KH
2958 {
2959 if ((c = *dst++) == '\n')
2960 dst[-1] = '\r';
2961 else if (BASE_LEADING_CODE_P (c))
993824c9 2962 coding->fake_multibyte = 1;
fb88bf2d 2963 }
d46c5b12 2964 }
fb88bf2d 2965 else
d46c5b12 2966 {
fb88bf2d
KH
2967 if (coding->mode & CODING_MODE_SELECTIVE_DISPLAY)
2968 {
2969 while (src_bytes--)
2970 if (*dst++ == '\r') dst[-1] = '\n';
2971 }
2972 coding->fake_multibyte = 1;
4ed46869 2973 }
fb88bf2d
KH
2974 src = source + dst_bytes;
2975 dst = destination + dst_bytes;
4ed46869
KH
2976 }
2977
d46c5b12
KH
2978 coding->consumed = coding->consumed_char = src - source;
2979 coding->produced = coding->produced_char = dst - destination;
2980 return result;
4ed46869
KH
2981}
2982
2983\f
1397dc18 2984/*** 7. C library functions ***/
4ed46869
KH
2985
2986/* In Emacs Lisp, coding system is represented by a Lisp symbol which
2987 has a property `coding-system'. The value of this property is a
2988 vector of length 5 (called as coding-vector). Among elements of
2989 this vector, the first (element[0]) and the fifth (element[4])
2990 carry important information for decoding/encoding. Before
2991 decoding/encoding, this information should be set in fields of a
2992 structure of type `coding_system'.
2993
2994 A value of property `coding-system' can be a symbol of another
2995 subsidiary coding-system. In that case, Emacs gets coding-vector
2996 from that symbol.
2997
2998 `element[0]' contains information to be set in `coding->type'. The
2999 value and its meaning is as follows:
3000
0ef69138
KH
3001 0 -- coding_type_emacs_mule
3002 1 -- coding_type_sjis
3003 2 -- coding_type_iso2022
3004 3 -- coding_type_big5
3005 4 -- coding_type_ccl encoder/decoder written in CCL
 5 -- coding_type_raw_text
3006 nil -- coding_type_no_conversion
3007 t -- coding_type_undecided (automatic conversion on decoding,
3008 no-conversion on encoding)
4ed46869
KH
3009
3010 `element[4]' contains information to be set in `coding->flags' and
3011 `coding->spec'. The meaning varies by `coding->type'.
3012
3013 If `coding->type' is `coding_type_iso2022', element[4] is a vector
3014 of length 32 (of which the first 17 sub-elements are used now).
3015 Meanings of these sub-elements are:
3016
3017 sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
3018 If the value is an integer of valid charset, the charset is
3019 assumed to be designated to graphic register N initially.
3020
3021 If the value is minus, it is a minus value of charset which
3022 reserves graphic register N, which means that the charset is
3023 not designated initially but should be designated to graphic
3024 register N just before encoding a character in that charset.
3025
3026 If the value is nil, graphic register N is never used on
3027 encoding.
3028
3029 sub-element[N] where N is 4 through 16: to be set in `coding->flags'
3030 Each value takes t or nil. See the section ISO2022 of
3031 `coding.h' for more information.
3032
3033 If `coding->type' is `coding_type_big5', element[4] is t to denote
3034 BIG5-ETen or nil to denote BIG5-HKU.
3035
3036 If `coding->type' takes the other value, element[4] is ignored.
3037
3038 Emacs Lisp's coding system also carries information about format of
3039 end-of-line in a value of property `eol-type'. If the value is
3040 integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
3041 means CODING_EOL_CR. If it is not integer, it should be a vector
3042 of subsidiary coding systems of which property `eol-type' has one
3043 of above values.
3044
3045*/
3046
3047/* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
3048 and set it in CODING. If CODING_SYSTEM_SYMBOL is invalid, CODING
3049 is setup so that no conversion is necessary and return -1, else
3050 return 0. */
3051
3052int
e0e989f6
KH
3053setup_coding_system (coding_system, coding)
3054 Lisp_Object coding_system;
4ed46869
KH
3055 struct coding_system *coding;
3056{
d46c5b12 3057 Lisp_Object coding_spec, coding_type, eol_type, plist;
4608c386 3058 Lisp_Object val;
70c22245 3059 int i;
4ed46869 3060
d46c5b12 3061 /* Initialize some fields required for all kinds of coding systems. */
774324d6 3062 coding->symbol = coding_system;
d46c5b12
KH
3063 coding->common_flags = 0;
3064 coding->mode = 0;
3065 coding->heading_ascii = -1;
3066 coding->post_read_conversion = coding->pre_write_conversion = Qnil;
ec6d2bb8
KH
3067 coding->composing = COMPOSITION_DISABLED;
3068 coding->cmp_data = NULL;
1f5dbf34
KH
3069
3070 if (NILP (coding_system))
3071 goto label_invalid_coding_system;
3072
4608c386 3073 coding_spec = Fget (coding_system, Qcoding_system);
1f5dbf34 3074
4608c386
KH
3075 if (!VECTORP (coding_spec)
3076 || XVECTOR (coding_spec)->size != 5
3077 || !CONSP (XVECTOR (coding_spec)->contents[3]))
4ed46869 3078 goto label_invalid_coding_system;
4608c386 3079
d46c5b12
KH
3080 eol_type = inhibit_eol_conversion ? Qnil : Fget (coding_system, Qeol_type);
3081 if (VECTORP (eol_type))
3082 {
3083 coding->eol_type = CODING_EOL_UNDECIDED;
3084 coding->common_flags = CODING_REQUIRE_DETECTION_MASK;
3085 }
3086 else if (XFASTINT (eol_type) == 1)
3087 {
3088 coding->eol_type = CODING_EOL_CRLF;
3089 coding->common_flags
3090 = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3091 }
3092 else if (XFASTINT (eol_type) == 2)
3093 {
3094 coding->eol_type = CODING_EOL_CR;
3095 coding->common_flags
3096 = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3097 }
3098 else
3099 coding->eol_type = CODING_EOL_LF;
3100
3101 coding_type = XVECTOR (coding_spec)->contents[0];
3102 /* Try short cut. */
3103 if (SYMBOLP (coding_type))
3104 {
3105 if (EQ (coding_type, Qt))
3106 {
3107 coding->type = coding_type_undecided;
3108 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
3109 }
3110 else
3111 coding->type = coding_type_no_conversion;
3112 return 0;
3113 }
3114
d46c5b12
KH
3115 /* Get values of coding system properties:
3116 `post-read-conversion', `pre-write-conversion',
f967223b 3117 `translation-table-for-decode', `translation-table-for-encode'. */
4608c386 3118 plist = XVECTOR (coding_spec)->contents[3];
b843d1ae
KH
3119 /* Pre & post conversion functions should be disabled if
3120 inhibit_eol_conversion is nozero. This is the case that a code
3121 conversion function is called while those functions are running. */
3122 if (! inhibit_pre_post_conversion)
3123 {
3124 coding->post_read_conversion = Fplist_get (plist, Qpost_read_conversion);
3125 coding->pre_write_conversion = Fplist_get (plist, Qpre_write_conversion);
3126 }
f967223b 3127 val = Fplist_get (plist, Qtranslation_table_for_decode);
4608c386 3128 if (SYMBOLP (val))
f967223b
KH
3129 val = Fget (val, Qtranslation_table_for_decode);
3130 coding->translation_table_for_decode = CHAR_TABLE_P (val) ? val : Qnil;
3131 val = Fplist_get (plist, Qtranslation_table_for_encode);
4608c386 3132 if (SYMBOLP (val))
f967223b
KH
3133 val = Fget (val, Qtranslation_table_for_encode);
3134 coding->translation_table_for_encode = CHAR_TABLE_P (val) ? val : Qnil;
d46c5b12
KH
3135 val = Fplist_get (plist, Qcoding_category);
3136 if (!NILP (val))
3137 {
3138 val = Fget (val, Qcoding_category_index);
3139 if (INTEGERP (val))
3140 coding->category_idx = XINT (val);
3141 else
3142 goto label_invalid_coding_system;
3143 }
3144 else
3145 goto label_invalid_coding_system;
4608c386 3146
70c22245
KH
3147 val = Fplist_get (plist, Qsafe_charsets);
3148 if (EQ (val, Qt))
3149 {
3150 for (i = 0; i <= MAX_CHARSET; i++)
3151 coding->safe_charsets[i] = 1;
3152 }
3153 else
3154 {
3155 bzero (coding->safe_charsets, MAX_CHARSET + 1);
3156 while (CONSP (val))
3157 {
03699b14 3158 if ((i = get_charset_id (XCAR (val))) >= 0)
70c22245 3159 coding->safe_charsets[i] = 1;
03699b14 3160 val = XCDR (val);
70c22245
KH
3161 }
3162 }
3163
ec6d2bb8
KH
3164 /* If the coding system has non-nil `composition' property, enable
3165 composition handling. */
3166 val = Fplist_get (plist, Qcomposition);
3167 if (!NILP (val))
3168 coding->composing = COMPOSITION_NO;
3169
d46c5b12 3170 switch (XFASTINT (coding_type))
4ed46869
KH
3171 {
3172 case 0:
0ef69138 3173 coding->type = coding_type_emacs_mule;
c952af22
KH
3174 if (!NILP (coding->post_read_conversion))
3175 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
3176 if (!NILP (coding->pre_write_conversion))
3177 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
4ed46869
KH
3178 break;
3179
3180 case 1:
3181 coding->type = coding_type_sjis;
c952af22
KH
3182 coding->common_flags
3183 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
4ed46869
KH
3184 break;
3185
3186 case 2:
3187 coding->type = coding_type_iso2022;
c952af22
KH
3188 coding->common_flags
3189 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
4ed46869 3190 {
70c22245 3191 Lisp_Object val, temp;
4ed46869 3192 Lisp_Object *flags;
d46c5b12 3193 int i, charset, reg_bits = 0;
4ed46869 3194
4608c386 3195 val = XVECTOR (coding_spec)->contents[4];
f44d27ce 3196
4ed46869
KH
3197 if (!VECTORP (val) || XVECTOR (val)->size != 32)
3198 goto label_invalid_coding_system;
3199
3200 flags = XVECTOR (val)->contents;
3201 coding->flags
3202 = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
3203 | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
3204 | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
3205 | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
3206 | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
3207 | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
3208 | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
3209 | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
e0e989f6
KH
3210 | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION)
3211 | (NILP (flags[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL)
c4825358
KH
3212 | (NILP (flags[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL)
3213 | (NILP (flags[15]) ? 0 : CODING_FLAG_ISO_SAFE)
3f003981 3214 | (NILP (flags[16]) ? 0 : CODING_FLAG_ISO_LATIN_EXTRA)
c4825358 3215 );
4ed46869
KH
3216
3217 /* Invoke graphic register 0 to plane 0. */
3218 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
3219 /* Invoke graphic register 1 to plane 1 if we can use full 8-bit. */
3220 CODING_SPEC_ISO_INVOCATION (coding, 1)
3221 = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
3222 /* Not single shifting at first. */
6e85d753 3223 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;
e0e989f6 3224 /* Beginning of buffer should also be regarded as bol. */
6e85d753 3225 CODING_SPEC_ISO_BOL (coding) = 1;
4ed46869 3226
70c22245
KH
3227 for (charset = 0; charset <= MAX_CHARSET; charset++)
3228 CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = 255;
3229 val = Vcharset_revision_alist;
3230 while (CONSP (val))
3231 {
03699b14 3232 charset = get_charset_id (Fcar_safe (XCAR (val)));
70c22245 3233 if (charset >= 0
03699b14 3234 && (temp = Fcdr_safe (XCAR (val)), INTEGERP (temp))
70c22245
KH
3235 && (i = XINT (temp), (i >= 0 && (i + '@') < 128)))
3236 CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = i;
03699b14 3237 val = XCDR (val);
70c22245
KH
3238 }
3239
4ed46869
KH
3240 /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
3241 FLAGS[REG] can be one of below:
3242 integer CHARSET: CHARSET occupies register I,
3243 t: designate nothing to REG initially, but can be used
3244 by any charsets,
3245 list of integer, nil, or t: designate the first
3246 element (if integer) to REG initially, the remaining
3247 elements (if integer) is designated to REG on request,
d46c5b12 3248 if an element is t, REG can be used by any charsets,
4ed46869 3249 nil: REG is never used. */
467e7675 3250 for (charset = 0; charset <= MAX_CHARSET; charset++)
1ba9e4ab
KH
3251 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3252 = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION;
4ed46869
KH
3253 for (i = 0; i < 4; i++)
3254 {
3255 if (INTEGERP (flags[i])
e0e989f6
KH
3256 && (charset = XINT (flags[i]), CHARSET_VALID_P (charset))
3257 || (charset = get_charset_id (flags[i])) >= 0)
4ed46869
KH
3258 {
3259 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3260 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
3261 }
3262 else if (EQ (flags[i], Qt))
3263 {
3264 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
d46c5b12
KH
3265 reg_bits |= 1 << i;
3266 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
4ed46869
KH
3267 }
3268 else if (CONSP (flags[i]))
3269 {
84d60297
RS
3270 Lisp_Object tail;
3271 tail = flags[i];
4ed46869 3272
d46c5b12 3273 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
03699b14
KR
3274 if (INTEGERP (XCAR (tail))
3275 && (charset = XINT (XCAR (tail)),
e0e989f6 3276 CHARSET_VALID_P (charset))
03699b14 3277 || (charset = get_charset_id (XCAR (tail))) >= 0)
4ed46869
KH
3278 {
3279 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3280 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
3281 }
3282 else
3283 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
03699b14 3284 tail = XCDR (tail);
4ed46869
KH
3285 while (CONSP (tail))
3286 {
03699b14
KR
3287 if (INTEGERP (XCAR (tail))
3288 && (charset = XINT (XCAR (tail)),
e0e989f6 3289 CHARSET_VALID_P (charset))
03699b14 3290 || (charset = get_charset_id (XCAR (tail))) >= 0)
70c22245
KH
3291 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3292 = i;
03699b14 3293 else if (EQ (XCAR (tail), Qt))
d46c5b12 3294 reg_bits |= 1 << i;
03699b14 3295 tail = XCDR (tail);
4ed46869
KH
3296 }
3297 }
3298 else
3299 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3300
3301 CODING_SPEC_ISO_DESIGNATION (coding, i)
3302 = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
3303 }
3304
d46c5b12 3305 if (reg_bits && ! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
4ed46869
KH
3306 {
3307 /* REG 1 can be used only by locking shift in 7-bit env. */
3308 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
d46c5b12 3309 reg_bits &= ~2;
4ed46869
KH
3310 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
3311 /* Without any shifting, only REG 0 and 1 can be used. */
d46c5b12 3312 reg_bits &= 3;
4ed46869
KH
3313 }
3314
d46c5b12
KH
3315 if (reg_bits)
3316 for (charset = 0; charset <= MAX_CHARSET; charset++)
6e85d753 3317 {
d46c5b12
KH
3318 if (CHARSET_VALID_P (charset))
3319 {
3320 /* There exist some default graphic registers to be
3321 used CHARSET. */
3322
3323 /* We had better avoid designating a charset of
3324 CHARS96 to REG 0 as far as possible. */
3325 if (CHARSET_CHARS (charset) == 96)
3326 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3327 = (reg_bits & 2
3328 ? 1 : (reg_bits & 4 ? 2 : (reg_bits & 8 ? 3 : 0)));
3329 else
3330 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3331 = (reg_bits & 1
3332 ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
3333 }
6e85d753 3334 }
4ed46869 3335 }
c952af22 3336 coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
d46c5b12 3337 coding->spec.iso2022.last_invalid_designation_register = -1;
4ed46869
KH
3338 break;
3339
3340 case 3:
3341 coding->type = coding_type_big5;
c952af22
KH
3342 coding->common_flags
3343 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
4ed46869 3344 coding->flags
4608c386 3345 = (NILP (XVECTOR (coding_spec)->contents[4])
4ed46869
KH
3346 ? CODING_FLAG_BIG5_HKU
3347 : CODING_FLAG_BIG5_ETEN);
3348 break;
3349
3350 case 4:
3351 coding->type = coding_type_ccl;
c952af22
KH
3352 coding->common_flags
3353 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
4ed46869 3354 {
84d60297 3355 val = XVECTOR (coding_spec)->contents[4];
ef4ced28
KH
3356 if (! CONSP (val)
3357 || setup_ccl_program (&(coding->spec.ccl.decoder),
03699b14 3358 XCAR (val)) < 0
ef4ced28 3359 || setup_ccl_program (&(coding->spec.ccl.encoder),
03699b14 3360 XCDR (val)) < 0)
4ed46869 3361 goto label_invalid_coding_system;
1397dc18
KH
3362
3363 bzero (coding->spec.ccl.valid_codes, 256);
3364 val = Fplist_get (plist, Qvalid_codes);
3365 if (CONSP (val))
3366 {
3367 Lisp_Object this;
3368
03699b14 3369 for (; CONSP (val); val = XCDR (val))
1397dc18 3370 {
03699b14 3371 this = XCAR (val);
1397dc18
KH
3372 if (INTEGERP (this)
3373 && XINT (this) >= 0 && XINT (this) < 256)
3374 coding->spec.ccl.valid_codes[XINT (this)] = 1;
3375 else if (CONSP (this)
03699b14
KR
3376 && INTEGERP (XCAR (this))
3377 && INTEGERP (XCDR (this)))
1397dc18 3378 {
03699b14
KR
3379 int start = XINT (XCAR (this));
3380 int end = XINT (XCDR (this));
1397dc18
KH
3381
3382 if (start >= 0 && start <= end && end < 256)
e133c8fa 3383 while (start <= end)
1397dc18
KH
3384 coding->spec.ccl.valid_codes[start++] = 1;
3385 }
3386 }
3387 }
4ed46869 3388 }
c952af22 3389 coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
4ed46869
KH
3390 break;
3391
27901516
KH
3392 case 5:
3393 coding->type = coding_type_raw_text;
3394 break;
3395
4ed46869 3396 default:
d46c5b12 3397 goto label_invalid_coding_system;
4ed46869
KH
3398 }
3399 return 0;
3400
3401 label_invalid_coding_system:
3402 coding->type = coding_type_no_conversion;
d46c5b12 3403 coding->category_idx = CODING_CATEGORY_IDX_BINARY;
c952af22 3404 coding->common_flags = 0;
dec137e5 3405 coding->eol_type = CODING_EOL_LF;
d46c5b12 3406 coding->pre_write_conversion = coding->post_read_conversion = Qnil;
4ed46869
KH
3407 return -1;
3408}
3409
ec6d2bb8
KH
3410/* Free memory blocks allocated for storing composition information. */
3411
3412void
3413coding_free_composition_data (coding)
3414 struct coding_system *coding;
3415{
3416 struct composition_data *cmp_data = coding->cmp_data, *next;
3417
3418 if (!cmp_data)
3419 return;
3420 /* Memory blocks are chained. At first, rewind to the first, then,
3421 free blocks one by one. */
3422 while (cmp_data->prev)
3423 cmp_data = cmp_data->prev;
3424 while (cmp_data)
3425 {
3426 next = cmp_data->next;
3427 xfree (cmp_data);
3428 cmp_data = next;
3429 }
3430 coding->cmp_data = NULL;
3431}
3432
3433/* Set `char_offset' member of all memory blocks pointed by
3434 coding->cmp_data to POS. */
3435
3436void
3437coding_adjust_composition_offset (coding, pos)
3438 struct coding_system *coding;
3439 int pos;
3440{
3441 struct composition_data *cmp_data;
3442
3443 for (cmp_data = coding->cmp_data; cmp_data; cmp_data = cmp_data->next)
3444 cmp_data->char_offset = pos;
3445}
3446
54f78171
KH
3447/* Setup raw-text or one of its subsidiaries in the structure
3448 coding_system CODING according to the already setup value eol_type
3449 in CODING. CODING should be setup for some coding system in
3450 advance. */
3451
3452void
3453setup_raw_text_coding_system (coding)
3454 struct coding_system *coding;
3455{
3456 if (coding->type != coding_type_raw_text)
3457 {
3458 coding->symbol = Qraw_text;
3459 coding->type = coding_type_raw_text;
3460 if (coding->eol_type != CODING_EOL_UNDECIDED)
3461 {
84d60297
RS
3462 Lisp_Object subsidiaries;
3463 subsidiaries = Fget (Qraw_text, Qeol_type);
54f78171
KH
3464
3465 if (VECTORP (subsidiaries)
3466 && XVECTOR (subsidiaries)->size == 3)
3467 coding->symbol
3468 = XVECTOR (subsidiaries)->contents[coding->eol_type];
3469 }
716e0b0a 3470 setup_coding_system (coding->symbol, coding);
54f78171
KH
3471 }
3472 return;
3473}
3474
4ed46869
KH
3475/* Emacs has a mechanism to automatically detect a coding system if it
3476 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
3477 it's impossible to distinguish some coding systems accurately
3478 because they use the same range of codes. So, at first, coding
3479 systems are categorized into the following categories:
3480
0ef69138 3481 o coding-category-emacs-mule
4ed46869
KH
3482
3483 The category for a coding system which has the same code range
3484 as Emacs' internal format. Assigned the coding-system (Lisp
0ef69138 3485 symbol) `emacs-mule' by default.
4ed46869
KH
3486
3487 o coding-category-sjis
3488
3489 The category for a coding system which has the same code range
3490 as SJIS. Assigned the coding-system (Lisp
7717c392 3491 symbol) `japanese-shift-jis' by default.
4ed46869
KH
3492
3493 o coding-category-iso-7
3494
3495 The category for a coding system which has the same code range
7717c392 3496 as ISO2022 of 7-bit environment. This doesn't use any locking
d46c5b12
KH
3497 shift and single shift functions. This can encode/decode all
3498 charsets. Assigned the coding-system (Lisp symbol)
3499 `iso-2022-7bit' by default.
3500
3501 o coding-category-iso-7-tight
3502
3503 Same as coding-category-iso-7 except that this can
3504 encode/decode only the specified charsets.
4ed46869
KH
3505
3506 o coding-category-iso-8-1
3507
3508 The category for a coding system which has the same code range
3509 as ISO2022 of 8-bit environment and graphic plane 1 used only
7717c392
KH
3510 for DIMENSION1 charset. This doesn't use any locking shift
3511 and single shift functions. Assigned the coding-system (Lisp
3512 symbol) `iso-latin-1' by default.
4ed46869
KH
3513
3514 o coding-category-iso-8-2
3515
3516 The category for a coding system which has the same code range
3517 as ISO2022 of 8-bit environment and graphic plane 1 used only
7717c392
KH
3518 for DIMENSION2 charset. This doesn't use any locking shift
3519 and single shift functions. Assigned the coding-system (Lisp
3520 symbol) `japanese-iso-8bit' by default.
4ed46869 3521
7717c392 3522 o coding-category-iso-7-else
4ed46869
KH
3523
3524 The category for a coding system which has the same code range
7717c392
KH
3525 as ISO2022 of 7-bit environemnt but uses locking shift or
3526 single shift functions. Assigned the coding-system (Lisp
3527 symbol) `iso-2022-7bit-lock' by default.
3528
3529 o coding-category-iso-8-else
3530
3531 The category for a coding system which has the same code range
3532 as ISO2022 of 8-bit environemnt but uses locking shift or
3533 single shift functions. Assigned the coding-system (Lisp
3534 symbol) `iso-2022-8bit-ss2' by default.
4ed46869
KH
3535
3536 o coding-category-big5
3537
3538 The category for a coding system which has the same code range
3539 as BIG5. Assigned the coding-system (Lisp symbol)
e0e989f6 3540 `cn-big5' by default.
4ed46869 3541
fa42c37f
KH
3542 o coding-category-utf-8
3543
3544 The category for a coding system which has the same code range
3545 as UTF-8 (cf. RFC2279). Assigned the coding-system (Lisp
3546 symbol) `utf-8' by default.
3547
3548 o coding-category-utf-16-be
3549
3550 The category for a coding system in which a text has an
3551 Unicode signature (cf. Unicode Standard) in the order of BIG
3552 endian at the head. Assigned the coding-system (Lisp symbol)
3553 `utf-16-be' by default.
3554
3555 o coding-category-utf-16-le
3556
3557 The category for a coding system in which a text has an
3558 Unicode signature (cf. Unicode Standard) in the order of
3559 LITTLE endian at the head. Assigned the coding-system (Lisp
3560 symbol) `utf-16-le' by default.
3561
1397dc18
KH
3562 o coding-category-ccl
3563
3564 The category for a coding system of which encoder/decoder is
3565 written in CCL programs. The default value is nil, i.e., no
3566 coding system is assigned.
3567
4ed46869
KH
3568 o coding-category-binary
3569
3570 The category for a coding system not categorized in any of the
3571 above. Assigned the coding-system (Lisp symbol)
e0e989f6 3572 `no-conversion' by default.
4ed46869
KH
3573
3574 Each of them is a Lisp symbol and the value is an actual
3575 `coding-system's (this is also a Lisp symbol) assigned by a user.
3576 What Emacs does actually is to detect a category of coding system.
3577 Then, it uses a `coding-system' assigned to it. If Emacs can't
3578 decide only one possible category, it selects a category of the
3579 highest priority. Priorities of categories are also specified by a
3580 user in a Lisp variable `coding-category-list'.
3581
3582*/
3583
66cfb530
KH
/* Table indexed by byte value and consulted by detect_coding_mask: a
   byte whose entry is nonzero is skipped while scanning over the
   leading ASCII and control characters of a text.  */
static int ascii_skip_code[256];
3586
d46c5b12 3587/* Detect how a text of length SRC_BYTES pointed by SOURCE is encoded.
4ed46869
KH
3588 If it detects possible coding systems, return an integer in which
3589 appropriate flag bits are set. Flag bits are defined by macros
fa42c37f
KH
3590 CODING_CATEGORY_MASK_XXX in `coding.h'. If PRIORITIES is non-NULL,
3591 it should point the table `coding_priorities'. In that case, only
3592 the flag bit for a coding system of the highest priority is set in
3593 the returned value.
4ed46869 3594
d46c5b12
KH
3595 How many ASCII characters are at the head is returned as *SKIP. */
3596
3597static int
3598detect_coding_mask (source, src_bytes, priorities, skip)
3599 unsigned char *source;
3600 int src_bytes, *priorities, *skip;
4ed46869
KH
3601{
3602 register unsigned char c;
d46c5b12 3603 unsigned char *src = source, *src_end = source + src_bytes;
fa42c37f
KH
3604 unsigned int mask, utf16_examined_p, iso2022_examined_p;
3605 int i, idx;
4ed46869
KH
3606
3607 /* At first, skip all ASCII characters and control characters except
3608 for three ISO2022 specific control characters. */
66cfb530
KH
3609 ascii_skip_code[ISO_CODE_SO] = 0;
3610 ascii_skip_code[ISO_CODE_SI] = 0;
3611 ascii_skip_code[ISO_CODE_ESC] = 0;
3612
bcf26d6a 3613 label_loop_detect_coding:
66cfb530 3614 while (src < src_end && ascii_skip_code[*src]) src++;
d46c5b12 3615 *skip = src - source;
4ed46869
KH
3616
3617 if (src >= src_end)
3618 /* We found nothing other than ASCII. There's nothing to do. */
d46c5b12 3619 return 0;
4ed46869 3620
8a8147d6 3621 c = *src;
4ed46869
KH
3622 /* The text seems to be encoded in some multilingual coding system.
3623 Now, try to find in which coding system the text is encoded. */
3624 if (c < 0x80)
bcf26d6a
KH
3625 {
3626 /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
3627 /* C is an ISO2022 specific control code of C0. */
3628 mask = detect_coding_iso2022 (src, src_end);
1b2af4b0 3629 if (mask == 0)
d46c5b12
KH
3630 {
3631 /* No valid ISO2022 code follows C. Try again. */
3632 src++;
66cfb530
KH
3633 if (c == ISO_CODE_ESC)
3634 ascii_skip_code[ISO_CODE_ESC] = 1;
3635 else
3636 ascii_skip_code[ISO_CODE_SO] = ascii_skip_code[ISO_CODE_SI] = 1;
d46c5b12
KH
3637 goto label_loop_detect_coding;
3638 }
3639 if (priorities)
fa42c37f
KH
3640 {
3641 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3642 {
3643 if (mask & priorities[i])
3644 return priorities[i];
3645 }
3646 return CODING_CATEGORY_MASK_RAW_TEXT;
3647 }
bcf26d6a 3648 }
d46c5b12 3649 else
c4825358 3650 {
d46c5b12 3651 int try;
4ed46869 3652
d46c5b12
KH
3653 if (c < 0xA0)
3654 {
3655 /* C is the first byte of SJIS character code,
fa42c37f
KH
3656 or a leading-code of Emacs' internal format (emacs-mule),
3657 or the first byte of UTF-16. */
3658 try = (CODING_CATEGORY_MASK_SJIS
3659 | CODING_CATEGORY_MASK_EMACS_MULE
3660 | CODING_CATEGORY_MASK_UTF_16_BE
3661 | CODING_CATEGORY_MASK_UTF_16_LE);
d46c5b12
KH
3662
3663 /* Or, if C is a special latin extra code,
3664 or is an ISO2022 specific control code of C1 (SS2 or SS3),
3665 or is an ISO2022 control-sequence-introducer (CSI),
3666 we should also consider the possibility of ISO2022 codings. */
3667 if ((VECTORP (Vlatin_extra_code_table)
3668 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
3669 || (c == ISO_CODE_SS2 || c == ISO_CODE_SS3)
3670 || (c == ISO_CODE_CSI
3671 && (src < src_end
3672 && (*src == ']'
3673 || ((*src == '0' || *src == '1' || *src == '2')
3674 && src + 1 < src_end
3675 && src[1] == ']')))))
3676 try |= (CODING_CATEGORY_MASK_ISO_8_ELSE
3677 | CODING_CATEGORY_MASK_ISO_8BIT);
3678 }
c4825358 3679 else
d46c5b12
KH
3680 /* C is a character of ISO2022 in graphic plane right,
3681 or a SJIS's 1-byte character code (i.e. JISX0201),
fa42c37f
KH
3682 or the first byte of BIG5's 2-byte code,
3683 or the first byte of UTF-8/16. */
d46c5b12
KH
3684 try = (CODING_CATEGORY_MASK_ISO_8_ELSE
3685 | CODING_CATEGORY_MASK_ISO_8BIT
3686 | CODING_CATEGORY_MASK_SJIS
fa42c37f
KH
3687 | CODING_CATEGORY_MASK_BIG5
3688 | CODING_CATEGORY_MASK_UTF_8
3689 | CODING_CATEGORY_MASK_UTF_16_BE
3690 | CODING_CATEGORY_MASK_UTF_16_LE);
d46c5b12 3691
1397dc18
KH
3692 /* Or, we may have to consider the possibility of CCL. */
3693 if (coding_system_table[CODING_CATEGORY_IDX_CCL]
3694 && (coding_system_table[CODING_CATEGORY_IDX_CCL]
3695 ->spec.ccl.valid_codes)[c])
3696 try |= CODING_CATEGORY_MASK_CCL;
3697
d46c5b12 3698 mask = 0;
fa42c37f 3699 utf16_examined_p = iso2022_examined_p = 0;
d46c5b12
KH
3700 if (priorities)
3701 {
3702 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3703 {
fa42c37f
KH
3704 if (!iso2022_examined_p
3705 && (priorities[i] & try & CODING_CATEGORY_MASK_ISO))
3706 {
3707 mask |= detect_coding_iso2022 (src, src_end);
3708 iso2022_examined_p = 1;
3709 }
5ab13dd0 3710 else if (priorities[i] & try & CODING_CATEGORY_MASK_SJIS)
fa42c37f
KH
3711 mask |= detect_coding_sjis (src, src_end);
3712 else if (priorities[i] & try & CODING_CATEGORY_MASK_UTF_8)
3713 mask |= detect_coding_utf_8 (src, src_end);
3714 else if (!utf16_examined_p
3715 && (priorities[i] & try &
3716 CODING_CATEGORY_MASK_UTF_16_BE_LE))
3717 {
3718 mask |= detect_coding_utf_16 (src, src_end);
3719 utf16_examined_p = 1;
3720 }
5ab13dd0 3721 else if (priorities[i] & try & CODING_CATEGORY_MASK_BIG5)
fa42c37f 3722 mask |= detect_coding_big5 (src, src_end);
5ab13dd0 3723 else if (priorities[i] & try & CODING_CATEGORY_MASK_EMACS_MULE)
fa42c37f 3724 mask |= detect_coding_emacs_mule (src, src_end);
89fa8b36 3725 else if (priorities[i] & try & CODING_CATEGORY_MASK_CCL)
fa42c37f 3726 mask |= detect_coding_ccl (src, src_end);
5ab13dd0 3727 else if (priorities[i] & CODING_CATEGORY_MASK_RAW_TEXT)
fa42c37f 3728 mask |= CODING_CATEGORY_MASK_RAW_TEXT;
5ab13dd0 3729 else if (priorities[i] & CODING_CATEGORY_MASK_BINARY)
fa42c37f
KH
3730 mask |= CODING_CATEGORY_MASK_BINARY;
3731 if (mask & priorities[i])
3732 return priorities[i];
d46c5b12
KH
3733 }
3734 return CODING_CATEGORY_MASK_RAW_TEXT;
3735 }
3736 if (try & CODING_CATEGORY_MASK_ISO)
3737 mask |= detect_coding_iso2022 (src, src_end);
3738 if (try & CODING_CATEGORY_MASK_SJIS)
3739 mask |= detect_coding_sjis (src, src_end);
3740 if (try & CODING_CATEGORY_MASK_BIG5)
3741 mask |= detect_coding_big5 (src, src_end);
fa42c37f
KH
3742 if (try & CODING_CATEGORY_MASK_UTF_8)
3743 mask |= detect_coding_utf_8 (src, src_end);
3744 if (try & CODING_CATEGORY_MASK_UTF_16_BE_LE)
3745 mask |= detect_coding_utf_16 (src, src_end);
d46c5b12 3746 if (try & CODING_CATEGORY_MASK_EMACS_MULE)
1397dc18
KH
3747 mask |= detect_coding_emacs_mule (src, src_end);
3748 if (try & CODING_CATEGORY_MASK_CCL)
3749 mask |= detect_coding_ccl (src, src_end);
c4825358 3750 }
5ab13dd0 3751 return (mask | CODING_CATEGORY_MASK_RAW_TEXT | CODING_CATEGORY_MASK_BINARY);
4ed46869
KH
3752}
3753
3754/* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
3755 The information of the detected coding system is set in CODING. */
3756
3757void
3758detect_coding (coding, src, src_bytes)
3759 struct coding_system *coding;
3760 unsigned char *src;
3761 int src_bytes;
3762{
d46c5b12
KH
3763 unsigned int idx;
3764 int skip, mask, i;
84d60297 3765 Lisp_Object val;
4ed46869 3766
84d60297 3767 val = Vcoding_category_list;
66cfb530 3768 mask = detect_coding_mask (src, src_bytes, coding_priorities, &skip);
d46c5b12 3769 coding->heading_ascii = skip;
4ed46869 3770
d46c5b12
KH
3771 if (!mask) return;
3772
3773 /* We found a single coding system of the highest priority in MASK. */
3774 idx = 0;
3775 while (mask && ! (mask & 1)) mask >>= 1, idx++;
3776 if (! mask)
3777 idx = CODING_CATEGORY_IDX_RAW_TEXT;
4ed46869 3778
d46c5b12
KH
3779 val = XSYMBOL (XVECTOR (Vcoding_category_table)->contents[idx])->value;
3780
3781 if (coding->eol_type != CODING_EOL_UNDECIDED)
27901516 3782 {
84d60297 3783 Lisp_Object tmp;
d46c5b12 3784
84d60297 3785 tmp = Fget (val, Qeol_type);
d46c5b12
KH
3786 if (VECTORP (tmp))
3787 val = XVECTOR (tmp)->contents[coding->eol_type];
4ed46869 3788 }
d46c5b12
KH
3789 setup_coding_system (val, coding);
3790 /* Set this again because setup_coding_system reset this member. */
3791 coding->heading_ascii = skip;
4ed46869
KH
3792}
3793
d46c5b12
KH
3794/* Detect how end-of-line of a text of length SRC_BYTES pointed by
3795 SOURCE is encoded. Return one of CODING_EOL_LF, CODING_EOL_CRLF,
3796 CODING_EOL_CR, and CODING_EOL_UNDECIDED.
3797
3798 How many non-eol characters are at the head is returned as *SKIP. */
4ed46869 3799
bc4bc72a
RS
3800#define MAX_EOL_CHECK_COUNT 3
3801
d46c5b12
KH
3802static int
3803detect_eol_type (source, src_bytes, skip)
3804 unsigned char *source;
3805 int src_bytes, *skip;
4ed46869 3806{
d46c5b12 3807 unsigned char *src = source, *src_end = src + src_bytes;
4ed46869 3808 unsigned char c;
bc4bc72a
RS
3809 int total = 0; /* How many end-of-lines are found so far. */
3810 int eol_type = CODING_EOL_UNDECIDED;
3811 int this_eol_type;
4ed46869 3812
d46c5b12
KH
3813 *skip = 0;
3814
bc4bc72a 3815 while (src < src_end && total < MAX_EOL_CHECK_COUNT)
4ed46869
KH
3816 {
3817 c = *src++;
bc4bc72a 3818 if (c == '\n' || c == '\r')
4ed46869 3819 {
d46c5b12
KH
3820 if (*skip == 0)
3821 *skip = src - 1 - source;
bc4bc72a
RS
3822 total++;
3823 if (c == '\n')
3824 this_eol_type = CODING_EOL_LF;
3825 else if (src >= src_end || *src != '\n')
3826 this_eol_type = CODING_EOL_CR;
4ed46869 3827 else
bc4bc72a
RS
3828 this_eol_type = CODING_EOL_CRLF, src++;
3829
3830 if (eol_type == CODING_EOL_UNDECIDED)
3831 /* This is the first end-of-line. */
3832 eol_type = this_eol_type;
3833 else if (eol_type != this_eol_type)
d46c5b12
KH
3834 {
3835 /* The found type is different from what found before. */
3836 eol_type = CODING_EOL_INCONSISTENT;
3837 break;
3838 }
4ed46869
KH
3839 }
3840 }
bc4bc72a 3841
d46c5b12
KH
3842 if (*skip == 0)
3843 *skip = src_end - source;
85a02ca4 3844 return eol_type;
4ed46869
KH
3845}
3846
fa42c37f
KH
3847/* Like detect_eol_type, but detect EOL type in 2-octet
3848 big-endian/little-endian format for coding systems utf-16-be and
3849 utf-16-le. */
3850
3851static int
3852detect_eol_type_in_2_octet_form (source, src_bytes, skip, big_endian_p)
3853 unsigned char *source;
3854 int src_bytes, *skip;
3855{
3856 unsigned char *src = source, *src_end = src + src_bytes;
3857 unsigned int c1, c2;
3858 int total = 0; /* How many end-of-lines are found so far. */
3859 int eol_type = CODING_EOL_UNDECIDED;
3860 int this_eol_type;
3861 int msb, lsb;
3862
3863 if (big_endian_p)
3864 msb = 0, lsb = 1;
3865 else
3866 msb = 1, lsb = 0;
3867
3868 *skip = 0;
3869
3870 while ((src + 1) < src_end && total < MAX_EOL_CHECK_COUNT)
3871 {
3872 c1 = (src[msb] << 8) | (src[lsb]);
3873 src += 2;
3874
3875 if (c1 == '\n' || c1 == '\r')
3876 {
3877 if (*skip == 0)
3878 *skip = src - 2 - source;
3879 total++;
3880 if (c1 == '\n')
3881 {
3882 this_eol_type = CODING_EOL_LF;
3883 }
3884 else
3885 {
3886 if ((src + 1) >= src_end)
3887 {
3888 this_eol_type = CODING_EOL_CR;
3889 }
3890 else
3891 {
3892 c2 = (src[msb] << 8) | (src[lsb]);
3893 if (c2 == '\n')
3894 this_eol_type = CODING_EOL_CRLF, src += 2;
3895 else
3896 this_eol_type = CODING_EOL_CR;
3897 }
3898 }
3899
3900 if (eol_type == CODING_EOL_UNDECIDED)
3901 /* This is the first end-of-line. */
3902 eol_type = this_eol_type;
3903 else if (eol_type != this_eol_type)
3904 {
3905 /* The found type is different from what found before. */
3906 eol_type = CODING_EOL_INCONSISTENT;
3907 break;
3908 }
3909 }
3910 }
3911
3912 if (*skip == 0)
3913 *skip = src_end - source;
3914 return eol_type;
3915}
3916
4ed46869
KH
3917/* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
3918 is encoded. If it detects an appropriate format of end-of-line, it
3919 sets the information in *CODING. */
3920
3921void
3922detect_eol (coding, src, src_bytes)
3923 struct coding_system *coding;
3924 unsigned char *src;
3925 int src_bytes;
3926{
4608c386 3927 Lisp_Object val;
d46c5b12 3928 int skip;
fa42c37f
KH
3929 int eol_type;
3930
3931 switch (coding->category_idx)
3932 {
3933 case CODING_CATEGORY_IDX_UTF_16_BE:
3934 eol_type = detect_eol_type_in_2_octet_form (src, src_bytes, &skip, 1);
3935 break;
3936 case CODING_CATEGORY_IDX_UTF_16_LE:
3937 eol_type = detect_eol_type_in_2_octet_form (src, src_bytes, &skip, 0);
3938 break;
3939 default:
3940 eol_type = detect_eol_type (src, src_bytes, &skip);
3941 break;
3942 }
d46c5b12
KH
3943
3944 if (coding->heading_ascii > skip)
3945 coding->heading_ascii = skip;
3946 else
3947 skip = coding->heading_ascii;
4ed46869 3948
0ef69138 3949 if (eol_type == CODING_EOL_UNDECIDED)
4ed46869 3950 return;
27901516
KH
3951 if (eol_type == CODING_EOL_INCONSISTENT)
3952 {
3953#if 0
3954 /* This code is suppressed until we find a better way to
992f23f2 3955 distinguish raw text file and binary file. */
27901516
KH
3956
3957 /* If we have already detected that the coding is raw-text, the
3958 coding should actually be no-conversion. */
3959 if (coding->type == coding_type_raw_text)
3960 {
3961 setup_coding_system (Qno_conversion, coding);
3962 return;
3963 }
3964 /* Else, let's decode only text code anyway. */
3965#endif /* 0 */
1b2af4b0 3966 eol_type = CODING_EOL_LF;
27901516
KH
3967 }
3968
4608c386 3969 val = Fget (coding->symbol, Qeol_type);
4ed46869 3970 if (VECTORP (val) && XVECTOR (val)->size == 3)
d46c5b12
KH
3971 {
3972 setup_coding_system (XVECTOR (val)->contents[eol_type], coding);
3973 coding->heading_ascii = skip;
3974 }
3975}
3976
/* Extra room (bytes) added to every conversion buffer size computed
   from a magnification factor.  */
#define CONVERSION_BUFFER_EXTRA_ROOM 256

/* Return by how many times decoding by CODING (a pointer to struct
   coding_system) may magnify a text.  CODING is parenthesized so that
   any pointer expression can be given as the argument; note that it
   is evaluated more than once.  */
#define DECODING_BUFFER_MAG(coding)                                     \
  ((coding)->type == coding_type_iso2022                                \
   ? 3                                                                  \
   : (((coding)->type == coding_type_sjis                               \
       || (coding)->type == coding_type_big5)                           \
      ? 2                                                               \
      : ((coding)->type == coding_type_raw_text                         \
         ? 1                                                            \
         : ((coding)->type == coding_type_ccl                           \
            ? (coding)->spec.ccl.decoder.buf_magnification              \
            : 2))))
3989
3990/* Return maximum size (bytes) of a buffer enough for decoding
3991 SRC_BYTES of text encoded in CODING. */
3992
3993int
3994decoding_buffer_size (coding, src_bytes)
3995 struct coding_system *coding;
3996 int src_bytes;
3997{
3998 return (src_bytes * DECODING_BUFFER_MAG (coding)
3999 + CONVERSION_BUFFER_EXTRA_ROOM);
4000}
4001
4002/* Return maximum size (bytes) of a buffer enough for encoding
4003 SRC_BYTES of text to CODING. */
4004
4005int
4006encoding_buffer_size (coding, src_bytes)
4007 struct coding_system *coding;
4008 int src_bytes;
4009{
4010 int magnification;
4011
4012 if (coding->type == coding_type_ccl)
4013 magnification = coding->spec.ccl.encoder.buf_magnification;
4014 else
4015 magnification = 3;
4016
4017 return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
4018}
4019
#ifndef MINIMUM_CONVERSION_BUFFER_SIZE
#define MINIMUM_CONVERSION_BUFFER_SIZE 1024
#endif

/* The buffer shared by encoding and decoding routines, and its
   current size in bytes.  */
char *conversion_buffer;
int conversion_buffer_size;

/* Return a pointer to a buffer of at least SIZE bytes to be used for
   encoding or decoding.  The shared buffer is replaced by a larger
   one when it is too small; the old contents are not preserved.

   NOTE(review): the previous comment promised a NULL return when
   memory runs out, but there is no such path here; the result of
   xmalloc is used unchecked, so xmalloc presumably does not return on
   failure -- confirm.  */

char *
get_conversion_buffer (size)
     int size;
{
  if (size > conversion_buffer_size)
    {
      char *buf;
      int real_size = conversion_buffer_size * 2;

      /* Doubling a size of zero never makes progress, so start from
         the minimum size when the buffer has not been set up yet.  */
      if (real_size < MINIMUM_CONVERSION_BUFFER_SIZE)
        real_size = MINIMUM_CONVERSION_BUFFER_SIZE;
      while (real_size < size)
        real_size *= 2;
      buf = (char *) xmalloc (real_size);
      xfree (conversion_buffer);
      conversion_buffer = buf;
      conversion_buffer_size = real_size;
    }
  return conversion_buffer;
}
4048
4049int
4050ccl_coding_driver (coding, source, destination, src_bytes, dst_bytes, encodep)
4051 struct coding_system *coding;
4052 unsigned char *source, *destination;
4053 int src_bytes, dst_bytes, encodep;
4054{
4055 struct ccl_program *ccl
4056 = encodep ? &coding->spec.ccl.encoder : &coding->spec.ccl.decoder;
4057 int result;
4058
ae9ff118 4059 ccl->last_block = coding->mode & CODING_MODE_LAST_BLOCK;
7b179c2d 4060
d46c5b12
KH
4061 coding->produced = ccl_driver (ccl, source, destination,
4062 src_bytes, dst_bytes, &(coding->consumed));
69f76525 4063 coding->produced_char
48942766
KH
4064 = (encodep
4065 ? coding->produced
4066 : multibyte_chars_in_text (destination, coding->produced));
69f76525
KH
4067 coding->consumed_char
4068 = multibyte_chars_in_text (source, coding->consumed);
4069
d46c5b12
KH
4070 switch (ccl->status)
4071 {
4072 case CCL_STAT_SUSPEND_BY_SRC:
4073 result = CODING_FINISH_INSUFFICIENT_SRC;
4074 break;
4075 case CCL_STAT_SUSPEND_BY_DST:
4076 result = CODING_FINISH_INSUFFICIENT_DST;
4077 break;
9864ebce
KH
4078 case CCL_STAT_QUIT:
4079 case CCL_STAT_INVALID_CMD:
4080 result = CODING_FINISH_INTERRUPT;
4081 break;
d46c5b12
KH
4082 default:
4083 result = CODING_FINISH_NORMAL;
4084 break;
4085 }
4086 return result;
4ed46869
KH
4087}
4088
4089/* See "GENERAL NOTES about `decode_coding_XXX ()' functions". Before
4090 decoding, it may detect coding system and format of end-of-line if
52d41803
KH
4091 those are not yet decided.
4092
4093 This function does not make full use of DESTINATION buffer. For
4094 instance, if coding->type is coding_type_iso2022, it uses only
4095 (DST_BYTES - 7) bytes of DESTINATION buffer. In the case that
4096 DST_BYTES is decided by the function decoding_buffer_size, it
4097 contains extra 256 bytes (defined by CONVERSION_BUFFER_EXTRA_ROOM).
4098 So, this function can decode the full SOURCE. But, in the other
4099 case, if you want to avoid carry over, you must supply at least 7
4100 bytes more area in DESTINATION buffer than expected maximum bytes
4101 that will be produced by this function. */
4ed46869
KH
4102
4103int
d46c5b12 4104decode_coding (coding, source, destination, src_bytes, dst_bytes)
4ed46869
KH
4105 struct coding_system *coding;
4106 unsigned char *source, *destination;
4107 int src_bytes, dst_bytes;
4ed46869 4108{
d46c5b12 4109 int result;
4ed46869 4110
d4e57bcd 4111 if (src_bytes <= 0
944bd420 4112 && coding->type != coding_type_ccl
d4e57bcd
KH
4113 && ! (coding->mode & CODING_MODE_LAST_BLOCK
4114 && CODING_REQUIRE_FLUSHING (coding)))
4ed46869 4115 {
d46c5b12
KH
4116 coding->produced = coding->produced_char = 0;
4117 coding->consumed = coding->consumed_char = 0;
fb88bf2d 4118 coding->fake_multibyte = 0;
d46c5b12 4119 return CODING_FINISH_NORMAL;
4ed46869
KH
4120 }
4121
0ef69138 4122 if (coding->type == coding_type_undecided)
4ed46869
KH
4123 detect_coding (coding, source, src_bytes);
4124
0ef69138 4125 if (coding->eol_type == CODING_EOL_UNDECIDED)
4ed46869
KH
4126 detect_eol (coding, source, src_bytes);
4127
4ed46869
KH
4128 switch (coding->type)
4129 {
0ef69138
KH
4130 case coding_type_emacs_mule:
4131 case coding_type_undecided:
27901516 4132 case coding_type_raw_text:
4ed46869 4133 if (coding->eol_type == CODING_EOL_LF
0ef69138 4134 || coding->eol_type == CODING_EOL_UNDECIDED)
4ed46869 4135 goto label_no_conversion;
d46c5b12 4136 result = decode_eol (coding, source, destination, src_bytes, dst_bytes);
4ed46869
KH
4137 break;
4138
4139 case coding_type_sjis:
d46c5b12
KH
4140 result = decode_coding_sjis_big5 (coding, source, destination,
4141 src_bytes, dst_bytes, 1);
4ed46869
KH
4142 break;
4143
4144 case coding_type_iso2022:
d46c5b12
KH
4145 result = decode_coding_iso2022 (coding, source, destination,
4146 src_bytes, dst_bytes);
4ed46869
KH
4147 break;
4148
4149 case coding_type_big5:
d46c5b12
KH
4150 result = decode_coding_sjis_big5 (coding, source, destination,
4151 src_bytes, dst_bytes, 0);
4ed46869
KH
4152 break;
4153
4154 case coding_type_ccl:
d46c5b12
KH
4155 result = ccl_coding_driver (coding, source, destination,
4156 src_bytes, dst_bytes, 0);
4157 break;
4158
4159 default: /* i.e. case coding_type_no_conversion: */
4160 label_no_conversion:
4161 if (dst_bytes && src_bytes > dst_bytes)
4162 {
4163 coding->produced = dst_bytes;
4164 result = CODING_FINISH_INSUFFICIENT_DST;
4165 }
4166 else
4167 {
4168 coding->produced = src_bytes;
4169 result = CODING_FINISH_NORMAL;
4170 }
4171 if (dst_bytes)
4172 bcopy (source, destination, coding->produced);
4173 else
4174 safe_bcopy (source, destination, coding->produced);
fb88bf2d 4175 coding->fake_multibyte = 1;
d46c5b12
KH
4176 coding->consumed
4177 = coding->consumed_char = coding->produced_char = coding->produced;
4ed46869
KH
4178 break;
4179 }
4180
d46c5b12 4181 return result;
4ed46869
KH
4182}
4183
52d41803
KH
4184/* See "GENERAL NOTES about `encode_coding_XXX ()' functions".
4185
4186 This function does not make full use of DESTINATION buffer. For
4187 instance, if coding->type is coding_type_iso2022, it uses only
4188 (DST_BYTES - 20) bytes of DESTINATION buffer. In the case that
4189 DST_BYTES is decided by the function encoding_buffer_size, it
4190 contains extra 256 bytes (defined by CONVERSION_BUFFER_EXTRA_ROOM).
4191 So, this function can encode the full SOURCE. But, in the other
4192 case, if you want to avoid carry over, you must supply at least 20
4193 bytes more area in DESTINATION buffer than expected maximum bytes
4194 that will be produced by this function. */
4ed46869
KH
4195
4196int
d46c5b12 4197encode_coding (coding, source, destination, src_bytes, dst_bytes)
4ed46869
KH
4198 struct coding_system *coding;
4199 unsigned char *source, *destination;
4200 int src_bytes, dst_bytes;
4ed46869 4201{
d46c5b12 4202 int result;
4ed46869 4203
d4e57bcd
KH
4204 if (src_bytes <= 0
4205 && ! (coding->mode & CODING_MODE_LAST_BLOCK
4206 && CODING_REQUIRE_FLUSHING (coding)))
4ed46869 4207 {
d46c5b12
KH
4208 coding->produced = coding->produced_char = 0;
4209 coding->consumed = coding->consumed_char = 0;
fb88bf2d 4210 coding->fake_multibyte = 0;
d46c5b12
KH
4211 return CODING_FINISH_NORMAL;
4212 }
4ed46869 4213
d46c5b12
KH
4214 switch (coding->type)
4215 {
0ef69138
KH
4216 case coding_type_emacs_mule:
4217 case coding_type_undecided:
27901516 4218 case coding_type_raw_text:
4ed46869 4219 if (coding->eol_type == CODING_EOL_LF
0ef69138 4220 || coding->eol_type == CODING_EOL_UNDECIDED)
4ed46869 4221 goto label_no_conversion;
d46c5b12 4222 result = encode_eol (coding, source, destination, src_bytes, dst_bytes);
4ed46869
KH
4223 break;
4224
4225 case coding_type_sjis:
d46c5b12
KH
4226 result = encode_coding_sjis_big5 (coding, source, destination,
4227 src_bytes, dst_bytes, 1);
4ed46869
KH
4228 break;
4229
4230 case coding_type_iso2022:
d46c5b12
KH
4231 result = encode_coding_iso2022 (coding, source, destination,
4232 src_bytes, dst_bytes);
4ed46869
KH
4233 break;
4234
4235 case coding_type_big5:
d46c5b12
KH
4236 result = encode_coding_sjis_big5 (coding, source, destination,
4237 src_bytes, dst_bytes, 0);
4ed46869
KH
4238 break;
4239
4240 case coding_type_ccl:
d46c5b12
KH
4241 result = ccl_coding_driver (coding, source, destination,
4242 src_bytes, dst_bytes, 1);
4243 break;
4244
4245 default: /* i.e. case coding_type_no_conversion: */
4246 label_no_conversion:
4247 if (dst_bytes && src_bytes > dst_bytes)
4248 {
4249 coding->produced = dst_bytes;
4250 result = CODING_FINISH_INSUFFICIENT_DST;
4251 }
4252 else
4253 {
4254 coding->produced = src_bytes;
4255 result = CODING_FINISH_NORMAL;
4256 }
4257 if (dst_bytes)
4258 bcopy (source, destination, coding->produced);
4259 else
4260 safe_bcopy (source, destination, coding->produced);
4261 if (coding->mode & CODING_MODE_SELECTIVE_DISPLAY)
4262 {
4263 unsigned char *p = destination, *pend = p + coding->produced;
4264 while (p < pend)
4265 if (*p++ == '\015') p[-1] = '\n';
4266 }
fb88bf2d 4267 coding->fake_multibyte = 1;
d46c5b12
KH
4268 coding->consumed
4269 = coding->consumed_char = coding->produced_char = coding->produced;
4ed46869
KH
4270 break;
4271 }
4272
d46c5b12 4273 return result;
4ed46869
KH
4274}
4275
fb88bf2d
KH
4276/* Scan text in the region between *BEG and *END (byte positions),
4277 skip characters which we don't have to decode by coding system
4278 CODING at the head and tail, then set *BEG and *END to the region
4279 of the text we actually have to convert. The caller should move
4280 the gap out of the region in advance.
4ed46869 4281
d46c5b12
KH
4282 If STR is not NULL, *BEG and *END are indices into STR. */
4283
4284static void
4285shrink_decoding_region (beg, end, coding, str)
4286 int *beg, *end;
4287 struct coding_system *coding;
4288 unsigned char *str;
4289{
fb88bf2d 4290 unsigned char *begp_orig, *begp, *endp_orig, *endp, c;
d46c5b12 4291 int eol_conversion;
88993dfd 4292 Lisp_Object translation_table;
d46c5b12
KH
4293
4294 if (coding->type == coding_type_ccl
4295 || coding->type == coding_type_undecided
4296 || !NILP (coding->post_read_conversion))
4297 {
4298 /* We can't skip any data. */
4299 return;
4300 }
4301 else if (coding->type == coding_type_no_conversion)
4302 {
fb88bf2d
KH
4303 /* We need no conversion, but don't have to skip any data here.
4304 Decoding routine handles them effectively anyway. */
d46c5b12
KH
4305 return;
4306 }
4307
88993dfd
KH
4308 translation_table = coding->translation_table_for_decode;
4309 if (NILP (translation_table) && !NILP (Venable_character_translation))
4310 translation_table = Vstandard_translation_table_for_decode;
4311 if (CHAR_TABLE_P (translation_table))
4312 {
4313 int i;
4314 for (i = 0; i < 128; i++)
4315 if (!NILP (CHAR_TABLE_REF (translation_table, i)))
4316 break;
4317 if (i < 128)
4318 /* Some ASCII character should be tranlsated. We give up
4319 shrinking. */
4320 return;
4321 }
4322
aa60dea6
KH
4323 eol_conversion = (coding->eol_type != CODING_EOL_LF);
4324
4325 if ((! eol_conversion) && (coding->heading_ascii >= 0))
d46c5b12
KH
4326 /* Detection routine has already found how much we can skip at the
4327 head. */
4328 *beg += coding->heading_ascii;
4329
4330 if (str)
4331 {
4332 begp_orig = begp = str + *beg;
4333 endp_orig = endp = str + *end;
4334 }
4335 else
4336 {
fb88bf2d 4337 begp_orig = begp = BYTE_POS_ADDR (*beg);
d46c5b12
KH
4338 endp_orig = endp = begp + *end - *beg;
4339 }
4340
d46c5b12
KH
4341 switch (coding->type)
4342 {
4343 case coding_type_emacs_mule:
4344 case coding_type_raw_text:
4345 if (eol_conversion)
4346 {
4347 if (coding->heading_ascii < 0)
fb88bf2d 4348 while (begp < endp && *begp != '\r' && *begp < 0x80) begp++;
ee59c65f 4349 while (begp < endp && endp[-1] != '\r' && endp[-1] < 0x80)
fb88bf2d 4350 endp--;
ee59c65f
RS
4351 /* Do not consider LF as ascii if preceded by CR, since that
4352 confuses eol decoding. */
4353 if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
4354 endp++;
d46c5b12
KH
4355 }
4356 else
4357 begp = endp;
4358 break;
4359
4360 case coding_type_sjis:
4361 case coding_type_big5:
4362 /* We can skip all ASCII characters at the head. */
4363 if (coding->heading_ascii < 0)
4364 {
4365 if (eol_conversion)
de9d083c 4366 while (begp < endp && *begp < 0x80 && *begp != '\r') begp++;
d46c5b12
KH
4367 else
4368 while (begp < endp && *begp < 0x80) begp++;
4369 }
4370 /* We can skip all ASCII characters at the tail except for the
4371 second byte of SJIS or BIG5 code. */
4372 if (eol_conversion)
de9d083c 4373 while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\r') endp--;
d46c5b12
KH
4374 else
4375 while (begp < endp && endp[-1] < 0x80) endp--;
ee59c65f
RS
4376 /* Do not consider LF as ascii if preceded by CR, since that
4377 confuses eol decoding. */
4378 if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
4379 endp++;
d46c5b12
KH
4380 if (begp < endp && endp < endp_orig && endp[-1] >= 0x80)
4381 endp++;
4382 break;
4383
4384 default: /* i.e. case coding_type_iso2022: */
622fece5
KH
4385 if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
4386 /* We can't skip any data. */
4387 break;
d46c5b12
KH
4388 if (coding->heading_ascii < 0)
4389 {
d46c5b12
KH
4390 /* We can skip all ASCII characters at the head except for a
4391 few control codes. */
4392 while (begp < endp && (c = *begp) < 0x80
4393 && c != ISO_CODE_CR && c != ISO_CODE_SO
4394 && c != ISO_CODE_SI && c != ISO_CODE_ESC
4395 && (!eol_conversion || c != ISO_CODE_LF))
4396 begp++;
4397 }
4398 switch (coding->category_idx)
4399 {
4400 case CODING_CATEGORY_IDX_ISO_8_1:
4401 case CODING_CATEGORY_IDX_ISO_8_2:
4402 /* We can skip all ASCII characters at the tail. */
4403 if (eol_conversion)
de9d083c 4404 while (begp < endp && (c = endp[-1]) < 0x80 && c != '\r') endp--;
d46c5b12
KH
4405 else
4406 while (begp < endp && endp[-1] < 0x80) endp--;
ee59c65f
RS
4407 /* Do not consider LF as ascii if preceded by CR, since that
4408 confuses eol decoding. */
4409 if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
4410 endp++;
d46c5b12
KH
4411 break;
4412
4413 case CODING_CATEGORY_IDX_ISO_7:
4414 case CODING_CATEGORY_IDX_ISO_7_TIGHT:
de79a6a5
KH
4415 {
4416 /* We can skip all charactes at the tail except for 8-bit
4417 codes and ESC and the following 2-byte at the tail. */
4418 unsigned char *eight_bit = NULL;
4419
4420 if (eol_conversion)
4421 while (begp < endp
4422 && (c = endp[-1]) != ISO_CODE_ESC && c != '\r')
4423 {
4424 if (!eight_bit && c & 0x80) eight_bit = endp;
4425 endp--;
4426 }
4427 else
4428 while (begp < endp
4429 && (c = endp[-1]) != ISO_CODE_ESC)
4430 {
4431 if (!eight_bit && c & 0x80) eight_bit = endp;
4432 endp--;
4433 }
4434 /* Do not consider LF as ascii if preceded by CR, since that
4435 confuses eol decoding. */
4436 if (begp < endp && endp < endp_orig
4437 && endp[-1] == '\r' && endp[0] == '\n')
4438 endp++;
4439 if (begp < endp && endp[-1] == ISO_CODE_ESC)
4440 {
4441 if (endp + 1 < endp_orig && end[0] == '(' && end[1] == 'B')
4442 /* This is an ASCII designation sequence. We can
4443 surely skip the tail. But, if we have
4444 encountered an 8-bit code, skip only the codes
4445 after that. */
4446 endp = eight_bit ? eight_bit : endp + 2;
4447 else
4448 /* Hmmm, we can't skip the tail. */
4449 endp = endp_orig;
4450 }
4451 else if (eight_bit)
4452 endp = eight_bit;
4453 }
d46c5b12
KH
4454 }
4455 }
4456 *beg += begp - begp_orig;
4457 *end += endp - endp_orig;
4458 return;
4459}
4460
4461/* Like shrink_decoding_region but for encoding. */
4462
4463static void
4464shrink_encoding_region (beg, end, coding, str)
4465 int *beg, *end;
4466 struct coding_system *coding;
4467 unsigned char *str;
4468{
4469 unsigned char *begp_orig, *begp, *endp_orig, *endp;
4470 int eol_conversion;
88993dfd 4471 Lisp_Object translation_table;
d46c5b12
KH
4472
4473 if (coding->type == coding_type_ccl)
4474 /* We can't skip any data. */
4475 return;
4476 else if (coding->type == coding_type_no_conversion)
4477 {
4478 /* We need no conversion. */
4479 *beg = *end;
4480 return;
4481 }
4482
88993dfd
KH
4483 translation_table = coding->translation_table_for_encode;
4484 if (NILP (translation_table) && !NILP (Venable_character_translation))
4485 translation_table = Vstandard_translation_table_for_encode;
4486 if (CHAR_TABLE_P (translation_table))
4487 {
4488 int i;
4489 for (i = 0; i < 128; i++)
4490 if (!NILP (CHAR_TABLE_REF (translation_table, i)))
4491 break;
4492 if (i < 128)
4493 /* Some ASCII character should be tranlsated. We give up
4494 shrinking. */
4495 return;
4496 }
4497
d46c5b12
KH
4498 if (str)
4499 {
4500 begp_orig = begp = str + *beg;
4501 endp_orig = endp = str + *end;
4502 }
4503 else
4504 {
fb88bf2d 4505 begp_orig = begp = BYTE_POS_ADDR (*beg);
d46c5b12
KH
4506 endp_orig = endp = begp + *end - *beg;
4507 }
4508
4509 eol_conversion = (coding->eol_type == CODING_EOL_CR
4510 || coding->eol_type == CODING_EOL_CRLF);
4511
4512 /* Here, we don't have to check coding->pre_write_conversion because
4513 the caller is expected to have handled it already. */
4514 switch (coding->type)
4515 {
4516 case coding_type_undecided:
4517 case coding_type_emacs_mule:
4518 case coding_type_raw_text:
4519 if (eol_conversion)
4520 {
4521 while (begp < endp && *begp != '\n') begp++;
4522 while (begp < endp && endp[-1] != '\n') endp--;
4523 }
4524 else
4525 begp = endp;
4526 break;
4527
4528 case coding_type_iso2022:
622fece5
KH
4529 if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
4530 /* We can't skip any data. */
4531 break;
d46c5b12
KH
4532 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL)
4533 {
4534 unsigned char *bol = begp;
4535 while (begp < endp && *begp < 0x80)
4536 {
4537 begp++;
4538 if (begp[-1] == '\n')
4539 bol = begp;
4540 }
4541 begp = bol;
4542 goto label_skip_tail;
4543 }
4544 /* fall down ... */
4545
4546 default:
4547 /* We can skip all ASCII characters at the head and tail. */
4548 if (eol_conversion)
4549 while (begp < endp && *begp < 0x80 && *begp != '\n') begp++;
4550 else
4551 while (begp < endp && *begp < 0x80) begp++;
4552 label_skip_tail:
4553 if (eol_conversion)
4554 while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\n') endp--;
4555 else
4556 while (begp < endp && *(endp - 1) < 0x80) endp--;
4557 break;
4558 }
4559
4560 *beg += begp - begp_orig;
4561 *end += endp - endp_orig;
4562 return;
4563}
4564
88993dfd
KH
/* Shrinking the conversion region has some overhead of its own, so it
   is attempted only when the region is longer than this value.  */
static int shrink_conversion_region_threshhold = 1024;

/* Narrow the region [*BEG, *END) of text to be converted by CODING.
   BEG and END are pointers to the region bounds and are handed
   unchanged to shrink_encoding_region (when ENCODEP is nonzero) or
   shrink_decoding_region (otherwise), together with CODING and STR.
   Nothing is done for regions not longer than
   shrink_conversion_region_threshhold.  */
#define SHRINK_CONVERSION_REGION(beg, end, coding, str, encodep)         \
  do {                                                                    \
    if (*(end) - *(beg) <= shrink_conversion_region_threshhold)          \
      break;                                                              \
    if (encodep)                                                          \
      shrink_encoding_region (beg, end, coding, str);                    \
    else                                                                  \
      shrink_decoding_region (beg, end, coding, str);                    \
  } while (0)
4578
b843d1ae
KH
4579static Lisp_Object
4580code_convert_region_unwind (dummy)
4581 Lisp_Object dummy;
4582{
4583 inhibit_pre_post_conversion = 0;
4584 return Qnil;
4585}
4586
ec6d2bb8
KH
4587/* Store information about all compositions in the range FROM and TO
4588 of OBJ in memory blocks pointed by CODING->cmp_data. OBJ is a
4589 buffer or a string, defaults to the current buffer. */
4590
4591void
4592coding_save_composition (coding, from, to, obj)
4593 struct coding_system *coding;
4594 int from, to;
4595 Lisp_Object obj;
4596{
4597 Lisp_Object prop;
4598 int start, end;
4599
91bee881
KH
4600 if (coding->composing == COMPOSITION_DISABLED)
4601 return;
4602 if (!coding->cmp_data)
4603 coding_allocate_composition_data (coding, from);
ec6d2bb8
KH
4604 if (!find_composition (from, to, &start, &end, &prop, obj)
4605 || end > to)
4606 return;
4607 if (start < from
4608 && (!find_composition (end, to, &start, &end, &prop, obj)
4609 || end > to))
4610 return;
4611 coding->composing = COMPOSITION_NO;
ec6d2bb8
KH
4612 do
4613 {
4614 if (COMPOSITION_VALID_P (start, end, prop))
4615 {
4616 enum composition_method method = COMPOSITION_METHOD (prop);
4617 if (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH
4618 >= COMPOSITION_DATA_SIZE)
4619 coding_allocate_composition_data (coding, from);
4620 /* For relative composition, we remember start and end
4621 positions, for the other compositions, we also remember
4622 components. */
4623 CODING_ADD_COMPOSITION_START (coding, start - from, method);
4624 if (method != COMPOSITION_RELATIVE)
4625 {
4626 /* We must store a*/
4627 Lisp_Object val, ch;
4628
4629 val = COMPOSITION_COMPONENTS (prop);
4630 if (CONSP (val))
4631 while (CONSP (val))
4632 {
4633 ch = XCAR (val), val = XCDR (val);
4634 CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (ch));
4635 }
4636 else if (VECTORP (val) || STRINGP (val))
4637 {
4638 int len = (VECTORP (val)
4639 ? XVECTOR (val)->size : XSTRING (val)->size);
4640 int i;
4641 for (i = 0; i < len; i++)
4642 {
4643 ch = (STRINGP (val)
4644 ? Faref (val, make_number (i))
4645 : XVECTOR (val)->contents[i]);
4646 CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (ch));
4647 }
4648 }
4649 else /* INTEGERP (val) */
4650 CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (val));
4651 }
4652 CODING_ADD_COMPOSITION_END (coding, end - from);
4653 }
4654 start = end;
4655 }
4656 while (start < to
4657 && find_composition (start, to, &start, &end, &prop, obj)
4658 && end <= to);
4659
4660 /* Make coding->cmp_data point to the first memory block. */
4661 while (coding->cmp_data->prev)
4662 coding->cmp_data = coding->cmp_data->prev;
4663 coding->cmp_data_start = 0;
4664}
4665
4666/* Reflect the saved information about compositions to OBJ.
4667 CODING->cmp_data points to a memory block for the informaiton. OBJ
4668 is a buffer or a string, defaults to the current buffer. */
4669
4670static void
4671coding_restore_composition (coding, obj)
4672 struct coding_system *coding;
4673 Lisp_Object obj;
4674{
4675 struct composition_data *cmp_data = coding->cmp_data;
4676
4677 if (!cmp_data)
4678 return;
4679
4680 while (cmp_data->prev)
4681 cmp_data = cmp_data->prev;
4682
4683 while (cmp_data)
4684 {
4685 int i;
4686
4687 for (i = 0; i < cmp_data->used; i += cmp_data->data[i])
4688 {
4689 int *data = cmp_data->data + i;
4690 enum composition_method method = (enum composition_method) data[3];
4691 Lisp_Object components;
4692
4693 if (method == COMPOSITION_RELATIVE)
4694 components = Qnil;
4695 else
4696 {
4697 int len = data[0] - 4, j;
4698 Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
4699
4700 for (j = 0; j < len; j++)
4701 args[j] = make_number (data[4 + j]);
4702 components = (method == COMPOSITION_WITH_ALTCHARS
4703 ? Fstring (len, args) : Fvector (len, args));
4704 }
4705 compose_text (data[1], data[2], components, Qnil, obj);
4706 }
4707 cmp_data = cmp_data->next;
4708 }
4709}
4710
d46c5b12 4711/* Decode (if ENCODEP is zero) or encode (if ENCODEP is nonzero) the
fb88bf2d
KH
4712 text from FROM to TO (byte positions are FROM_BYTE and TO_BYTE) by
4713 coding system CODING, and return the status code of code conversion
4714 (currently, this value has no meaning).
4715
4716 How many characters (and bytes) are converted to how many
4717 characters (and bytes) are recorded in members of the structure
4718 CODING.
d46c5b12 4719
6e44253b 4720 If REPLACE is nonzero, we do various things as if the original text
d46c5b12 4721 is deleted and a new text is inserted. See the comments in
6e44253b 4722 replace_range (insdel.c) to know what we are doing. */
4ed46869
KH
4723
4724int
6e44253b
KH
4725code_convert_region (from, from_byte, to, to_byte, coding, encodep, replace)
4726 int from, from_byte, to, to_byte, encodep, replace;
4ed46869 4727 struct coding_system *coding;
4ed46869 4728{
fb88bf2d
KH
4729 int len = to - from, len_byte = to_byte - from_byte;
4730 int require, inserted, inserted_byte;
4b39528c 4731 int head_skip, tail_skip, total_skip = 0;
84d60297 4732 Lisp_Object saved_coding_symbol;
fb88bf2d
KH
4733 int multibyte = !NILP (current_buffer->enable_multibyte_characters);
4734 int first = 1;
4735 int fake_multibyte = 0;
4736 unsigned char *src, *dst;
84d60297 4737 Lisp_Object deletion;
e133c8fa 4738 int orig_point = PT, orig_len = len;
6abb9bd9 4739 int prev_Z;
84d60297
RS
4740
4741 deletion = Qnil;
4742 saved_coding_symbol = Qnil;
d46c5b12 4743
83fa074f 4744 if (from < PT && PT < to)
e133c8fa
KH
4745 {
4746 TEMP_SET_PT_BOTH (from, from_byte);
4747 orig_point = from;
4748 }
83fa074f 4749
6e44253b 4750 if (replace)
d46c5b12 4751 {
fb88bf2d
KH
4752 int saved_from = from;
4753
d46c5b12 4754 prepare_to_modify_buffer (from, to, &from);
fb88bf2d
KH
4755 if (saved_from != from)
4756 {
4757 to = from + len;
4758 if (multibyte)
4759 from_byte = CHAR_TO_BYTE (from), to_byte = CHAR_TO_BYTE (to);
4760 else
4761 from_byte = from, to_byte = to;
4762 len_byte = to_byte - from_byte;
4763 }
d46c5b12 4764 }
d46c5b12
KH
4765
4766 if (! encodep && CODING_REQUIRE_DETECTION (coding))
4767 {
12410ef1 4768 /* We must detect encoding of text and eol format. */
d46c5b12
KH
4769
4770 if (from < GPT && to > GPT)
4771 move_gap_both (from, from_byte);
4772 if (coding->type == coding_type_undecided)
4773 {
fb88bf2d 4774 detect_coding (coding, BYTE_POS_ADDR (from_byte), len_byte);
d46c5b12 4775 if (coding->type == coding_type_undecided)
12410ef1
KH
4776 /* It seems that the text contains only ASCII, but we
4777 should not left it undecided because the deeper
4778 decoding routine (decode_coding) tries to detect the
4779 encodings again in vain. */
d46c5b12
KH
4780 coding->type = coding_type_emacs_mule;
4781 }
4782 if (coding->eol_type == CODING_EOL_UNDECIDED)
4783 {
4784 saved_coding_symbol = coding->symbol;
4785 detect_eol (coding, BYTE_POS_ADDR (from_byte), len_byte);
4786 if (coding->eol_type == CODING_EOL_UNDECIDED)
4787 coding->eol_type = CODING_EOL_LF;
4788 /* We had better recover the original eol format if we
4789 encounter an inconsitent eol format while decoding. */
4790 coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4791 }
4792 }
4793
4794 if (encodep
4795 ? ! CODING_REQUIRE_ENCODING (coding)
4796 : ! CODING_REQUIRE_DECODING (coding))
fb88bf2d 4797 {
ec6d2bb8
KH
4798 coding->consumed_char = len;
4799 coding->consumed = len_byte;
fb88bf2d 4800 coding->produced = len_byte;
12410ef1
KH
4801 if (multibyte
4802 && ! replace
4803 /* See the comment of the member heading_ascii in coding.h. */
4804 && coding->heading_ascii < len_byte)
fb88bf2d 4805 {
6e44253b
KH
4806 /* We still may have to combine byte at the head and the
4807 tail of the text in the region. */
12410ef1 4808 if (from < GPT && GPT < to)
6e44253b 4809 move_gap_both (to, to_byte);
12410ef1
KH
4810 len = multibyte_chars_in_text (BYTE_POS_ADDR (from_byte), len_byte);
4811 adjust_after_insert (from, from_byte, to, to_byte, len);
4812 coding->produced_char = len;
fb88bf2d
KH
4813 }
4814 else
68e3a8f1
AS
4815 {
4816 if (!replace)
4817 adjust_after_insert (from, from_byte, to, to_byte, len_byte);
4818 coding->produced_char = len_byte;
4819 }
fb88bf2d
KH
4820 return 0;
4821 }
d46c5b12
KH
4822
4823 /* Now we convert the text. */
4824
4825 /* For encoding, we must process pre-write-conversion in advance. */
4826 if (encodep
d46c5b12
KH
4827 && ! NILP (coding->pre_write_conversion)
4828 && SYMBOLP (coding->pre_write_conversion)
4829 && ! NILP (Ffboundp (coding->pre_write_conversion)))
4830 {
2b4f9037
KH
4831 /* The function in pre-write-conversion may put a new text in a
4832 new buffer. */
0007bdd0
KH
4833 struct buffer *prev = current_buffer;
4834 Lisp_Object new;
b843d1ae 4835 int count = specpdl_ptr - specpdl;
d46c5b12 4836
b843d1ae
KH
4837 record_unwind_protect (code_convert_region_unwind, Qnil);
4838 /* We should not call any more pre-write/post-read-conversion
4839 functions while this pre-write-conversion is running. */
4840 inhibit_pre_post_conversion = 1;
b39f748c
AS
4841 call2 (coding->pre_write_conversion,
4842 make_number (from), make_number (to));
b843d1ae
KH
4843 inhibit_pre_post_conversion = 0;
4844 /* Discard the unwind protect. */
4845 specpdl_ptr--;
4846
d46c5b12
KH
4847 if (current_buffer != prev)
4848 {
4849 len = ZV - BEGV;
0007bdd0 4850 new = Fcurrent_buffer ();
d46c5b12 4851 set_buffer_internal_1 (prev);
7dae4502 4852 del_range_2 (from, from_byte, to, to_byte, 0);
e133c8fa 4853 TEMP_SET_PT_BOTH (from, from_byte);
0007bdd0
KH
4854 insert_from_buffer (XBUFFER (new), 1, len, 0);
4855 Fkill_buffer (new);
e133c8fa
KH
4856 if (orig_point >= to)
4857 orig_point += len - orig_len;
4858 else if (orig_point > from)
4859 orig_point = from;
4860 orig_len = len;
d46c5b12 4861 to = from + len;
e133c8fa 4862 from_byte = multibyte ? CHAR_TO_BYTE (from) : from_byte;
fb88bf2d 4863 to_byte = multibyte ? CHAR_TO_BYTE (to) : to;
d46c5b12 4864 len_byte = to_byte - from_byte;
e133c8fa 4865 TEMP_SET_PT_BOTH (from, from_byte);
d46c5b12
KH
4866 }
4867 }
4868
12410ef1
KH
4869 if (replace)
4870 deletion = make_buffer_string_both (from, from_byte, to, to_byte, 1);
4871
ec6d2bb8
KH
4872 if (coding->composing != COMPOSITION_DISABLED)
4873 {
4874 if (encodep)
4875 coding_save_composition (coding, from, to, Fcurrent_buffer ());
4876 else
4877 coding_allocate_composition_data (coding, from);
4878 }
fb88bf2d 4879
ec6d2bb8
KH
4880 /* For conversion by CCL program and for encoding with composition
4881 handling, we can't skip any character because we may convert or
4882 compose even ASCII characters. */
4883 if (coding->type != coding_type_ccl
4884 && (!encodep || coding->cmp_data == NULL))
4885 {
4886 /* Try to skip the heading and tailing ASCIIs. */
4887 int from_byte_orig = from_byte, to_byte_orig = to_byte;
4888
4889 if (from < GPT && GPT < to)
4890 move_gap_both (from, from_byte);
4891 SHRINK_CONVERSION_REGION (&from_byte, &to_byte, coding, NULL, encodep);
4892 if (from_byte == to_byte
4893 && (encodep || NILP (coding->post_read_conversion))
4894 && ! CODING_REQUIRE_FLUSHING (coding))
4895 {
4896 coding->produced = len_byte;
4897 coding->produced_char = multibyte ? len : len_byte;
4898 if (!replace)
4899 /* We must record and adjust for this new text now. */
4900 adjust_after_insert (from, from_byte_orig, to, to_byte_orig, len);
4901 return 0;
4902 }
4903
4904 head_skip = from_byte - from_byte_orig;
4905 tail_skip = to_byte_orig - to_byte;
4906 total_skip = head_skip + tail_skip;
4907 from += head_skip;
4908 to -= tail_skip;
4909 len -= total_skip; len_byte -= total_skip;
4910
4911 if (coding->cmp_data)
4912 coding->cmp_data->char_offset = from;
4913 }
d46c5b12 4914
88993dfd 4915 /* The code conversion routine can not preserve text properties for
55d8d769
KH
4916 now. So, we must remove all text properties in the region.
4917 Here, we must suppress all modification hooks. */
88993dfd 4918 if (replace)
55d8d769
KH
4919 {
4920 int saved_inhibit_modification_hooks = inhibit_modification_hooks;
4921 inhibit_modification_hooks = 1;
4922 Fset_text_properties (make_number (from), make_number (to), Qnil, Qnil);
4923 inhibit_modification_hooks = saved_inhibit_modification_hooks;
4924 }
88993dfd 4925
fb88bf2d
KH
4926 /* For converion, we must put the gap before the text in addition to
4927 making the gap larger for efficient decoding. The required gap
4928 size starts from 2000 which is the magic number used in make_gap.
4929 But, after one batch of conversion, it will be incremented if we
4930 find that it is not enough . */
d46c5b12
KH
4931 require = 2000;
4932
4933 if (GAP_SIZE < require)
4934 make_gap (require - GAP_SIZE);
4935 move_gap_both (from, from_byte);
4936
d46c5b12 4937 inserted = inserted_byte = 0;
fb88bf2d
KH
4938
4939 GAP_SIZE += len_byte;
4940 ZV -= len;
4941 Z -= len;
4942 ZV_BYTE -= len_byte;
4943 Z_BYTE -= len_byte;
4944
d9f9a1bc
GM
4945 if (GPT - BEG < BEG_UNCHANGED)
4946 BEG_UNCHANGED = GPT - BEG;
4947 if (Z - GPT < END_UNCHANGED)
4948 END_UNCHANGED = Z - GPT;
f2558efd 4949
d46c5b12
KH
4950 for (;;)
4951 {
fb88bf2d 4952 int result;
d46c5b12 4953
ec6d2bb8 4954 /* The buffer memory is now:
fb88bf2d
KH
4955 +--------+converted-text+---------+-------original-text------+---+
4956 |<-from->|<--inserted-->|---------|<-----------len---------->|---|
4957 |<------------------- GAP_SIZE -------------------->| */
ec6d2bb8
KH
4958 src = GAP_END_ADDR - len_byte;
4959 dst = GPT_ADDR + inserted_byte;
4960
d46c5b12 4961 if (encodep)
fb88bf2d 4962 result = encode_coding (coding, src, dst, len_byte, 0);
d46c5b12 4963 else
fb88bf2d 4964 result = decode_coding (coding, src, dst, len_byte, 0);
ec6d2bb8
KH
4965
4966 /* The buffer memory is now:
d46c5b12 4967 +--------+-------converted-text--------+--+---original-text--+---+
fb88bf2d
KH
4968 |<-from->|<--inserted-->|<--produced-->|--|<-(len-consumed)->|---|
4969 |<------------------- GAP_SIZE -------------------->| */
ec6d2bb8 4970
fb88bf2d
KH
4971 if (coding->fake_multibyte)
4972 fake_multibyte = 1;
d46c5b12 4973
fb88bf2d
KH
4974 if (!encodep && !multibyte)
4975 coding->produced_char = coding->produced;
d46c5b12
KH
4976 inserted += coding->produced_char;
4977 inserted_byte += coding->produced;
d46c5b12 4978 len_byte -= coding->consumed;
ec6d2bb8
KH
4979
4980 if (result == CODING_FINISH_INSUFFICIENT_CMP)
4981 {
4982 coding_allocate_composition_data (coding, from + inserted);
4983 continue;
4984 }
4985
fb88bf2d 4986 src += coding->consumed;
3636f7a3 4987 dst += coding->produced;
d46c5b12 4988
9864ebce
KH
4989 if (result == CODING_FINISH_NORMAL)
4990 {
4991 src += len_byte;
4992 break;
4993 }
d46c5b12
KH
4994 if (! encodep && result == CODING_FINISH_INCONSISTENT_EOL)
4995 {
fb88bf2d 4996 unsigned char *pend = dst, *p = pend - inserted_byte;
38edf7d4 4997 Lisp_Object eol_type;
d46c5b12
KH
4998
4999 /* Encode LFs back to the original eol format (CR or CRLF). */
5000 if (coding->eol_type == CODING_EOL_CR)
5001 {
5002 while (p < pend) if (*p++ == '\n') p[-1] = '\r';
5003 }
5004 else
5005 {
d46c5b12
KH
5006 int count = 0;
5007
fb88bf2d
KH
5008 while (p < pend) if (*p++ == '\n') count++;
5009 if (src - dst < count)
d46c5b12 5010 {
38edf7d4 5011 /* We don't have sufficient room for encoding LFs
fb88bf2d
KH
5012 back to CRLF. We must record converted and
5013 not-yet-converted text back to the buffer
5014 content, enlarge the gap, then record them out of
5015 the buffer contents again. */
5016 int add = len_byte + inserted_byte;
5017
5018 GAP_SIZE -= add;
5019 ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
5020 GPT += inserted_byte; GPT_BYTE += inserted_byte;
5021 make_gap (count - GAP_SIZE);
5022 GAP_SIZE += add;
5023 ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
5024 GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5025 /* Don't forget to update SRC, DST, and PEND. */
5026 src = GAP_END_ADDR - len_byte;
5027 dst = GPT_ADDR + inserted_byte;
5028 pend = dst;
d46c5b12 5029 }
d46c5b12
KH
5030 inserted += count;
5031 inserted_byte += count;
fb88bf2d
KH
5032 coding->produced += count;
5033 p = dst = pend + count;
5034 while (count)
5035 {
5036 *--p = *--pend;
5037 if (*p == '\n') count--, *--p = '\r';
5038 }
d46c5b12
KH
5039 }
5040
5041 /* Suppress eol-format conversion in the further conversion. */
5042 coding->eol_type = CODING_EOL_LF;
5043
38edf7d4
KH
5044 /* Set the coding system symbol to that for Unix-like EOL. */
5045 eol_type = Fget (saved_coding_symbol, Qeol_type);
5046 if (VECTORP (eol_type)
5047 && XVECTOR (eol_type)->size == 3
5048 && SYMBOLP (XVECTOR (eol_type)->contents[CODING_EOL_LF]))
5049 coding->symbol = XVECTOR (eol_type)->contents[CODING_EOL_LF];
5050 else
5051 coding->symbol = saved_coding_symbol;
fb88bf2d
KH
5052
5053 continue;
d46c5b12
KH
5054 }
5055 if (len_byte <= 0)
944bd420
KH
5056 {
5057 if (coding->type != coding_type_ccl
5058 || coding->mode & CODING_MODE_LAST_BLOCK)
5059 break;
5060 coding->mode |= CODING_MODE_LAST_BLOCK;
5061 continue;
5062 }
d46c5b12
KH
5063 if (result == CODING_FINISH_INSUFFICIENT_SRC)
5064 {
5065 /* The source text ends in invalid codes. Let's just
5066 make them valid buffer contents, and finish conversion. */
fb88bf2d 5067 inserted += len_byte;
d46c5b12 5068 inserted_byte += len_byte;
fb88bf2d 5069 while (len_byte--)
ee59c65f 5070 *dst++ = *src++;
fb88bf2d 5071 fake_multibyte = 1;
d46c5b12
KH
5072 break;
5073 }
9864ebce
KH
5074 if (result == CODING_FINISH_INTERRUPT)
5075 {
5076 /* The conversion procedure was interrupted by a user. */
5077 fake_multibyte = 1;
5078 break;
5079 }
5080 /* Now RESULT == CODING_FINISH_INSUFFICIENT_DST */
5081 if (coding->consumed < 1)
5082 {
5083 /* It's quite strange to require more memory without
5084 consuming any bytes. Perhaps CCL program bug. */
5085 fake_multibyte = 1;
5086 break;
5087 }
fb88bf2d
KH
5088 if (first)
5089 {
5090 /* We have just done the first batch of conversion which was
5091 stoped because of insufficient gap. Let's reconsider the
5092 required gap size (i.e. SRT - DST) now.
5093
5094 We have converted ORIG bytes (== coding->consumed) into
5095 NEW bytes (coding->produced). To convert the remaining
5096 LEN bytes, we may need REQUIRE bytes of gap, where:
5097 REQUIRE + LEN_BYTE = LEN_BYTE * (NEW / ORIG)
5098 REQUIRE = LEN_BYTE * (NEW - ORIG) / ORIG
5099 Here, we are sure that NEW >= ORIG. */
6e44253b
KH
5100 float ratio = coding->produced - coding->consumed;
5101 ratio /= coding->consumed;
5102 require = len_byte * ratio;
fb88bf2d
KH
5103 first = 0;
5104 }
5105 if ((src - dst) < (require + 2000))
5106 {
5107 /* See the comment above the previous call of make_gap. */
5108 int add = len_byte + inserted_byte;
5109
5110 GAP_SIZE -= add;
5111 ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
5112 GPT += inserted_byte; GPT_BYTE += inserted_byte;
5113 make_gap (require + 2000);
5114 GAP_SIZE += add;
5115 ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
5116 GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
fb88bf2d 5117 }
d46c5b12 5118 }
fb88bf2d
KH
5119 if (src - dst > 0) *dst = 0; /* Put an anchor. */
5120
2b4f9037 5121 if (multibyte
88993dfd
KH
5122 && (encodep
5123 || fake_multibyte
5124 || (to - from) != (to_byte - from_byte)))
2b4f9037 5125 inserted = multibyte_chars_in_text (GPT_ADDR, inserted_byte);
7553d0e1 5126
12410ef1
KH
5127 /* If we have shrinked the conversion area, adjust it now. */
5128 if (total_skip > 0)
5129 {
5130 if (tail_skip > 0)
5131 safe_bcopy (GAP_END_ADDR, GPT_ADDR + inserted_byte, tail_skip);
5132 inserted += total_skip; inserted_byte += total_skip;
5133 GAP_SIZE += total_skip;
5134 GPT -= head_skip; GPT_BYTE -= head_skip;
5135 ZV -= total_skip; ZV_BYTE -= total_skip;
5136 Z -= total_skip; Z_BYTE -= total_skip;
5137 from -= head_skip; from_byte -= head_skip;
5138 to += tail_skip; to_byte += tail_skip;
5139 }
5140
6abb9bd9 5141 prev_Z = Z;
12410ef1 5142 adjust_after_replace (from, from_byte, deletion, inserted, inserted_byte);
6abb9bd9 5143 inserted = Z - prev_Z;
4ed46869 5144
ec6d2bb8
KH
5145 if (!encodep && coding->cmp_data && coding->cmp_data->used)
5146 coding_restore_composition (coding, Fcurrent_buffer ());
5147 coding_free_composition_data (coding);
5148
2b4f9037 5149 if (! encodep && ! NILP (coding->post_read_conversion))
d46c5b12 5150 {
2b4f9037 5151 Lisp_Object val;
b843d1ae 5152 int count = specpdl_ptr - specpdl;
4ed46869 5153
e133c8fa
KH
5154 if (from != PT)
5155 TEMP_SET_PT_BOTH (from, from_byte);
6abb9bd9 5156 prev_Z = Z;
b843d1ae
KH
5157 record_unwind_protect (code_convert_region_unwind, Qnil);
5158 /* We should not call any more pre-write/post-read-conversion
5159 functions while this post-read-conversion is running. */
5160 inhibit_pre_post_conversion = 1;
2b4f9037 5161 val = call1 (coding->post_read_conversion, make_number (inserted));
b843d1ae
KH
5162 inhibit_pre_post_conversion = 0;
5163 /* Discard the unwind protect. */
5164 specpdl_ptr--;
6abb9bd9 5165 CHECK_NUMBER (val, 0);
944bd420 5166 inserted += Z - prev_Z;
e133c8fa
KH
5167 }
5168
5169 if (orig_point >= from)
5170 {
5171 if (orig_point >= from + orig_len)
5172 orig_point += inserted - orig_len;
5173 else
5174 orig_point = from;
5175 TEMP_SET_PT (orig_point);
d46c5b12 5176 }
4ed46869 5177
ec6d2bb8
KH
5178 if (replace)
5179 {
5180 signal_after_change (from, to - from, inserted);
e19539f1 5181 update_compositions (from, from + inserted, CHECK_BORDER);
ec6d2bb8 5182 }
2b4f9037 5183
fb88bf2d 5184 {
12410ef1
KH
5185 coding->consumed = to_byte - from_byte;
5186 coding->consumed_char = to - from;
5187 coding->produced = inserted_byte;
5188 coding->produced_char = inserted;
fb88bf2d 5189 }
7553d0e1 5190
fb88bf2d 5191 return 0;
d46c5b12
KH
5192}
5193
5194Lisp_Object
5195code_convert_string (str, coding, encodep, nocopy)
5196 Lisp_Object str;
4ed46869 5197 struct coding_system *coding;
d46c5b12 5198 int encodep, nocopy;
4ed46869 5199{
d46c5b12
KH
5200 int len;
5201 char *buf;
fc932ac6
RS
5202 int from = 0, to = XSTRING (str)->size;
5203 int to_byte = STRING_BYTES (XSTRING (str));
d46c5b12 5204 struct gcpro gcpro1;
84d60297 5205 Lisp_Object saved_coding_symbol;
d46c5b12 5206 int result;
4ed46869 5207
84d60297 5208 saved_coding_symbol = Qnil;
b843d1ae
KH
5209 if ((encodep && !NILP (coding->pre_write_conversion)
5210 || !encodep && !NILP (coding->post_read_conversion)))
d46c5b12
KH
5211 {
5212 /* Since we have to call Lisp functions which assume target text
b843d1ae
KH
5213 is in a buffer, after setting a temporary buffer, call
5214 code_convert_region. */
d46c5b12
KH
5215 int count = specpdl_ptr - specpdl;
5216 struct buffer *prev = current_buffer;
b843d1ae 5217 int multibyte = STRING_MULTIBYTE (str);
e133c8fa 5218
d46c5b12 5219 record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
b843d1ae
KH
5220 record_unwind_protect (code_convert_region_unwind, Qnil);
5221 inhibit_pre_post_conversion = 1;
5222 GCPRO1 (str);
d46c5b12
KH
5223 temp_output_buffer_setup (" *code-converting-work*");
5224 set_buffer_internal (XBUFFER (Vstandard_output));
b843d1ae
KH
5225 /* We must insert the contents of STR as is without
5226 unibyte<->multibyte conversion. For that, we adjust the
5227 multibyteness of the working buffer to that of STR. */
5228 Ferase_buffer (); /* for safety */
5229 current_buffer->enable_multibyte_characters = multibyte ? Qt : Qnil;
5230 insert_from_string (str, 0, 0, to, to_byte, 0);
5231 UNGCPRO;
fb88bf2d 5232 code_convert_region (BEGV, BEGV_BYTE, ZV, ZV_BYTE, coding, encodep, 1);
b843d1ae
KH
5233 /* Make a unibyte string if we are encoding, otherwise make a
5234 multibyte string. */
5235 Fset_buffer_multibyte (encodep ? Qnil : Qt);
d46c5b12 5236 str = make_buffer_string (BEGV, ZV, 0);
d46c5b12
KH
5237 return unbind_to (count, str);
5238 }
4ed46869 5239
d46c5b12
KH
5240 if (! encodep && CODING_REQUIRE_DETECTION (coding))
5241 {
5242 /* See the comments in code_convert_region. */
5243 if (coding->type == coding_type_undecided)
5244 {
5245 detect_coding (coding, XSTRING (str)->data, to_byte);
5246 if (coding->type == coding_type_undecided)
5247 coding->type = coding_type_emacs_mule;
5248 }
5249 if (coding->eol_type == CODING_EOL_UNDECIDED)
5250 {
5251 saved_coding_symbol = coding->symbol;
5252 detect_eol (coding, XSTRING (str)->data, to_byte);
5253 if (coding->eol_type == CODING_EOL_UNDECIDED)
5254 coding->eol_type = CODING_EOL_LF;
5255 /* We had better recover the original eol format if we
5256 encounter an inconsitent eol format while decoding. */
5257 coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
5258 }
5259 }
4ed46869 5260
d46c5b12
KH
5261 if (encodep
5262 ? ! CODING_REQUIRE_ENCODING (coding)
5263 : ! CODING_REQUIRE_DECODING (coding))
ec6d2bb8
KH
5264 return (nocopy ? str : Fcopy_sequence (str));
5265
5266 if (coding->composing != COMPOSITION_DISABLED)
5267 {
5268 if (encodep)
5269 coding_save_composition (coding, from, to, str);
5270 else
5271 coding_allocate_composition_data (coding, from);
5272 }
5273
5274 /* For conversion by CCL program and for encoding with composition
5275 handling, we can't skip any character because we may convert or
5276 compose even ASCII characters. */
5277 if (coding->type != coding_type_ccl
5278 && (!encodep || coding->cmp_data == NULL))
d46c5b12
KH
5279 {
5280 /* Try to skip the heading and tailing ASCIIs. */
ec6d2bb8
KH
5281 int from_orig = from;
5282
88993dfd
KH
5283 SHRINK_CONVERSION_REGION (&from, &to_byte, coding, XSTRING (str)->data,
5284 encodep);
ec6d2bb8
KH
5285 if (from == to_byte)
5286 return (nocopy ? str : Fcopy_sequence (str));
5287
5288 if (coding->cmp_data)
5289 coding->cmp_data->char_offset = from;
d46c5b12 5290 }
4ed46869 5291
d46c5b12
KH
5292 if (encodep)
5293 len = encoding_buffer_size (coding, to_byte - from);
5294 else
5295 len = decoding_buffer_size (coding, to_byte - from);
fc932ac6 5296 len += from + STRING_BYTES (XSTRING (str)) - to_byte;
d46c5b12
KH
5297 GCPRO1 (str);
5298 buf = get_conversion_buffer (len);
5299 UNGCPRO;
4ed46869 5300
d46c5b12
KH
5301 if (from > 0)
5302 bcopy (XSTRING (str)->data, buf, from);
5303 result = (encodep
5304 ? encode_coding (coding, XSTRING (str)->data + from,
5305 buf + from, to_byte - from, len)
5306 : decode_coding (coding, XSTRING (str)->data + from,
f30cc612 5307 buf + from, to_byte - from, len));
d46c5b12 5308 if (! encodep && result == CODING_FINISH_INCONSISTENT_EOL)
4ed46869 5309 {
ec6d2bb8 5310 /* We simply try to decode the whole string again but without
d46c5b12
KH
5311 eol-conversion this time. */
5312 coding->eol_type = CODING_EOL_LF;
5313 coding->symbol = saved_coding_symbol;
ec6d2bb8 5314 coding_free_composition_data (coding);
d46c5b12 5315 return code_convert_string (str, coding, encodep, nocopy);
4ed46869 5316 }
d46c5b12
KH
5317
5318 bcopy (XSTRING (str)->data + to_byte, buf + from + coding->produced,
fc932ac6 5319 STRING_BYTES (XSTRING (str)) - to_byte);
d46c5b12 5320
fc932ac6 5321 len = from + STRING_BYTES (XSTRING (str)) - to_byte;
d46c5b12
KH
5322 if (encodep)
5323 str = make_unibyte_string (buf, len + coding->produced);
5324 else
826bfb8b
KH
5325 {
5326 int chars= (coding->fake_multibyte
5327 ? multibyte_chars_in_text (buf + from, coding->produced)
5328 : coding->produced_char);
5329 str = make_multibyte_string (buf, len + chars, len + coding->produced);
5330 }
5331
ec6d2bb8
KH
5332 if (!encodep && coding->cmp_data && coding->cmp_data->used)
5333 coding_restore_composition (coding, str);
5334
5335 coding_free_composition_data (coding);
d46c5b12 5336 return str;
4ed46869
KH
5337}
5338
5339\f
5340#ifdef emacs
1397dc18 5341/*** 8. Emacs Lisp library functions ***/
4ed46869 5342
4ed46869
KH
5343DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
5344 "Return t if OBJECT is nil or a coding-system.\n\
3a73fa5d
RS
5345See the documentation of `make-coding-system' for information\n\
5346about coding-system objects.")
4ed46869
KH
5347 (obj)
5348 Lisp_Object obj;
5349{
4608c386
KH
5350 if (NILP (obj))
5351 return Qt;
5352 if (!SYMBOLP (obj))
5353 return Qnil;
5354 /* Get coding-spec vector for OBJ. */
5355 obj = Fget (obj, Qcoding_system);
5356 return ((VECTORP (obj) && XVECTOR (obj)->size == 5)
5357 ? Qt : Qnil);
4ed46869
KH
5358}
5359
9d991de8
RS
5360DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
5361 Sread_non_nil_coding_system, 1, 1, 0,
e0e989f6 5362 "Read a coding system from the minibuffer, prompting with string PROMPT.")
4ed46869
KH
5363 (prompt)
5364 Lisp_Object prompt;
5365{
e0e989f6 5366 Lisp_Object val;
9d991de8
RS
5367 do
5368 {
4608c386
KH
5369 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
5370 Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
9d991de8
RS
5371 }
5372 while (XSTRING (val)->size == 0);
e0e989f6 5373 return (Fintern (val, Qnil));
4ed46869
KH
5374}
5375
9b787f3e
RS
5376DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
5377 "Read a coding system from the minibuffer, prompting with string PROMPT.\n\
5378If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.")
5379 (prompt, default_coding_system)
5380 Lisp_Object prompt, default_coding_system;
4ed46869 5381{
f44d27ce 5382 Lisp_Object val;
9b787f3e
RS
5383 if (SYMBOLP (default_coding_system))
5384 XSETSTRING (default_coding_system, XSYMBOL (default_coding_system)->name);
4608c386 5385 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
9b787f3e
RS
5386 Qt, Qnil, Qcoding_system_history,
5387 default_coding_system, Qnil);
e0e989f6 5388 return (XSTRING (val)->size == 0 ? Qnil : Fintern (val, Qnil));
4ed46869
KH
5389}
5390
5391DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
5392 1, 1, 0,
5393 "Check validity of CODING-SYSTEM.\n\
3a73fa5d
RS
5394If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.\n\
5395It is valid if it is a symbol with a non-nil `coding-system' property.\n\
4ed46869
KH
5396The value of property should be a vector of length 5.")
5397 (coding_system)
5398 Lisp_Object coding_system;
5399{
5400 CHECK_SYMBOL (coding_system, 0);
5401 if (!NILP (Fcoding_system_p (coding_system)))
5402 return coding_system;
5403 while (1)
02ba4723 5404 Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
4ed46869 5405}
3a73fa5d 5406\f
d46c5b12
KH
5407Lisp_Object
5408detect_coding_system (src, src_bytes, highest)
5409 unsigned char *src;
5410 int src_bytes, highest;
4ed46869
KH
5411{
5412 int coding_mask, eol_type;
d46c5b12
KH
5413 Lisp_Object val, tmp;
5414 int dummy;
4ed46869 5415
d46c5b12
KH
5416 coding_mask = detect_coding_mask (src, src_bytes, NULL, &dummy);
5417 eol_type = detect_eol_type (src, src_bytes, &dummy);
5418 if (eol_type == CODING_EOL_INCONSISTENT)
25b02698 5419 eol_type = CODING_EOL_UNDECIDED;
4ed46869 5420
d46c5b12 5421 if (!coding_mask)
4ed46869 5422 {
27901516 5423 val = Qundecided;
d46c5b12 5424 if (eol_type != CODING_EOL_UNDECIDED)
4ed46869 5425 {
f44d27ce
RS
5426 Lisp_Object val2;
5427 val2 = Fget (Qundecided, Qeol_type);
4ed46869
KH
5428 if (VECTORP (val2))
5429 val = XVECTOR (val2)->contents[eol_type];
5430 }
80e803b4 5431 return (highest ? val : Fcons (val, Qnil));
4ed46869 5432 }
4ed46869 5433
d46c5b12
KH
5434 /* At first, gather possible coding systems in VAL. */
5435 val = Qnil;
fa42c37f 5436 for (tmp = Vcoding_category_list; CONSP (tmp); tmp = XCDR (tmp))
4ed46869 5437 {
fa42c37f
KH
5438 Lisp_Object category_val, category_index;
5439
5440 category_index = Fget (XCAR (tmp), Qcoding_category_index);
5441 category_val = Fsymbol_value (XCAR (tmp));
5442 if (!NILP (category_val)
5443 && NATNUMP (category_index)
5444 && (coding_mask & (1 << XFASTINT (category_index))))
4ed46869 5445 {
fa42c37f 5446 val = Fcons (category_val, val);
d46c5b12
KH
5447 if (highest)
5448 break;
4ed46869
KH
5449 }
5450 }
d46c5b12
KH
5451 if (!highest)
5452 val = Fnreverse (val);
4ed46869 5453
65059037 5454 /* Then, replace the elements with subsidiary coding systems. */
fa42c37f 5455 for (tmp = val; CONSP (tmp); tmp = XCDR (tmp))
4ed46869 5456 {
65059037
RS
5457 if (eol_type != CODING_EOL_UNDECIDED
5458 && eol_type != CODING_EOL_INCONSISTENT)
4ed46869 5459 {
d46c5b12 5460 Lisp_Object eol;
03699b14 5461 eol = Fget (XCAR (tmp), Qeol_type);
d46c5b12 5462 if (VECTORP (eol))
03699b14 5463 XCAR (tmp) = XVECTOR (eol)->contents[eol_type];
4ed46869
KH
5464 }
5465 }
03699b14 5466 return (highest ? XCAR (val) : val);
d46c5b12 5467}
4ed46869 5468
d46c5b12
KH
5469DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
5470 2, 3, 0,
5471 "Detect coding system of the text in the region between START and END.\n\
5472Return a list of possible coding systems ordered by priority.\n\
5473\n\
80e803b4
KH
5474If only ASCII characters are found, it returns a list of single element\n\
5475`undecided' or its subsidiary coding system according to a detected\n\
5476end-of-line format.\n\
d46c5b12
KH
5477\n\
5478If optional argument HIGHEST is non-nil, return the coding system of\n\
5479highest priority.")
5480 (start, end, highest)
5481 Lisp_Object start, end, highest;
5482{
5483 int from, to;
5484 int from_byte, to_byte;
6289dd10 5485
d46c5b12
KH
5486 CHECK_NUMBER_COERCE_MARKER (start, 0);
5487 CHECK_NUMBER_COERCE_MARKER (end, 1);
4ed46869 5488
d46c5b12
KH
5489 validate_region (&start, &end);
5490 from = XINT (start), to = XINT (end);
5491 from_byte = CHAR_TO_BYTE (from);
5492 to_byte = CHAR_TO_BYTE (to);
6289dd10 5493
d46c5b12
KH
5494 if (from < GPT && to >= GPT)
5495 move_gap_both (to, to_byte);
4ed46869 5496
d46c5b12
KH
5497 return detect_coding_system (BYTE_POS_ADDR (from_byte),
5498 to_byte - from_byte,
5499 !NILP (highest));
5500}
6289dd10 5501
d46c5b12
KH
5502DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
5503 1, 2, 0,
5504 "Detect coding system of the text in STRING.\n\
5505Return a list of possible coding systems ordered by priority.\n\
5506\n\
80e803b4
KH
5507If only ASCII characters are found, it returns a list of single element\n\
5508`undecided' or its subsidiary coding system according to a detected\n\
5509end-of-line format.\n\
d46c5b12
KH
5510\n\
5511If optional argument HIGHEST is non-nil, return the coding system of\n\
5512highest priority.")
5513 (string, highest)
5514 Lisp_Object string, highest;
5515{
5516 CHECK_STRING (string, 0);
4ed46869 5517
d46c5b12 5518 return detect_coding_system (XSTRING (string)->data,
fc932ac6 5519 STRING_BYTES (XSTRING (string)),
d46c5b12 5520 !NILP (highest));
4ed46869
KH
5521}
5522
4031e2bf
KH
5523Lisp_Object
5524code_convert_region1 (start, end, coding_system, encodep)
d46c5b12 5525 Lisp_Object start, end, coding_system;
4031e2bf 5526 int encodep;
3a73fa5d
RS
5527{
5528 struct coding_system coding;
4031e2bf 5529 int from, to, len;
3a73fa5d 5530
d46c5b12
KH
5531 CHECK_NUMBER_COERCE_MARKER (start, 0);
5532 CHECK_NUMBER_COERCE_MARKER (end, 1);
3a73fa5d
RS
5533 CHECK_SYMBOL (coding_system, 2);
5534
d46c5b12
KH
5535 validate_region (&start, &end);
5536 from = XFASTINT (start);
5537 to = XFASTINT (end);
5538
3a73fa5d 5539 if (NILP (coding_system))
d46c5b12
KH
5540 return make_number (to - from);
5541
3a73fa5d 5542 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
d46c5b12 5543 error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data);
3a73fa5d 5544
d46c5b12 5545 coding.mode |= CODING_MODE_LAST_BLOCK;
fb88bf2d
KH
5546 code_convert_region (from, CHAR_TO_BYTE (from), to, CHAR_TO_BYTE (to),
5547 &coding, encodep, 1);
f072a3e8 5548 Vlast_coding_system_used = coding.symbol;
fb88bf2d 5549 return make_number (coding.produced_char);
4031e2bf
KH
5550}
5551
5552DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
5553 3, 3, "r\nzCoding system: ",
5554 "Decode the current region by specified coding system.\n\
5555When called from a program, takes three arguments:\n\
5556START, END, and CODING-SYSTEM. START and END are buffer positions.\n\
f072a3e8
RS
5557This function sets `last-coding-system-used' to the precise coding system\n\
5558used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
5559not fully specified.)\n\
5560It returns the length of the decoded text.")
4031e2bf
KH
5561 (start, end, coding_system)
5562 Lisp_Object start, end, coding_system;
5563{
5564 return code_convert_region1 (start, end, coding_system, 0);
3a73fa5d
RS
5565}
5566
5567DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
5568 3, 3, "r\nzCoding system: ",
d46c5b12 5569 "Encode the current region by specified coding system.\n\
3a73fa5d 5570When called from a program, takes three arguments:\n\
d46c5b12 5571START, END, and CODING-SYSTEM. START and END are buffer positions.\n\
f072a3e8
RS
5572This function sets `last-coding-system-used' to the precise coding system\n\
5573used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
5574not fully specified.)\n\
5575It returns the length of the encoded text.")
d46c5b12
KH
5576 (start, end, coding_system)
5577 Lisp_Object start, end, coding_system;
3a73fa5d 5578{
4031e2bf
KH
5579 return code_convert_region1 (start, end, coding_system, 1);
5580}
3a73fa5d 5581
4031e2bf
KH
5582Lisp_Object
5583code_convert_string1 (string, coding_system, nocopy, encodep)
5584 Lisp_Object string, coding_system, nocopy;
5585 int encodep;
5586{
5587 struct coding_system coding;
3a73fa5d 5588
4031e2bf
KH
5589 CHECK_STRING (string, 0);
5590 CHECK_SYMBOL (coding_system, 1);
4ed46869 5591
d46c5b12 5592 if (NILP (coding_system))
4031e2bf 5593 return (NILP (nocopy) ? Fcopy_sequence (string) : string);
4ed46869 5594
d46c5b12
KH
5595 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
5596 error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data);
5f1cd180 5597
d46c5b12 5598 coding.mode |= CODING_MODE_LAST_BLOCK;
ec6d2bb8 5599 string = code_convert_string (string, &coding, encodep, !NILP (nocopy));
f072a3e8 5600 Vlast_coding_system_used = coding.symbol;
ec6d2bb8
KH
5601
5602 return string;
4ed46869
KH
5603}
5604
4ed46869 5605DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
e0e989f6
KH
5606 2, 3, 0,
5607 "Decode STRING which is encoded in CODING-SYSTEM, and return the result.\n\
fe487a71 5608Optional arg NOCOPY non-nil means it is ok to return STRING itself\n\
f072a3e8
RS
5609if the decoding operation is trivial.\n\
5610This function sets `last-coding-system-used' to the precise coding system\n\
5611used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
5612not fully specified.)")
e0e989f6
KH
5613 (string, coding_system, nocopy)
5614 Lisp_Object string, coding_system, nocopy;
4ed46869 5615{
f072a3e8 5616 return code_convert_string1 (string, coding_system, nocopy, 0);
4ed46869
KH
5617}
5618
5619DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
e0e989f6
KH
5620 2, 3, 0,
5621 "Encode STRING to CODING-SYSTEM, and return the result.\n\
fe487a71 5622Optional arg NOCOPY non-nil means it is ok to return STRING itself\n\
f072a3e8
RS
5623if the encoding operation is trivial.\n\
5624This function sets `last-coding-system-used' to the precise coding system\n\
5625used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
5626not fully specified.)")
e0e989f6
KH
5627 (string, coding_system, nocopy)
5628 Lisp_Object string, coding_system, nocopy;
4ed46869 5629{
f072a3e8 5630 return code_convert_string1 (string, coding_system, nocopy, 1);
4ed46869 5631}
4031e2bf 5632
ecec61c1 5633/* Encode or decode STRING according to CODING_SYSTEM.
ec6d2bb8
KH
5634 Do not set Vlast_coding_system_used.
5635
5636 This function is called only from macros DECODE_FILE and
5637 ENCODE_FILE, thus we ignore character composition. */
ecec61c1
KH
5638
5639Lisp_Object
5640code_convert_string_norecord (string, coding_system, encodep)
5641 Lisp_Object string, coding_system;
5642 int encodep;
5643{
5644 struct coding_system coding;
5645
5646 CHECK_STRING (string, 0);
5647 CHECK_SYMBOL (coding_system, 1);
5648
5649 if (NILP (coding_system))
5650 return string;
5651
5652 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
5653 error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data);
5654
ec6d2bb8 5655 coding.composing = COMPOSITION_DISABLED;
ecec61c1
KH
5656 coding.mode |= CODING_MODE_LAST_BLOCK;
5657 return code_convert_string (string, &coding, encodep, Qt);
5658}
3a73fa5d 5659\f
4ed46869 5660DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
55ab7be3 5661 "Decode a Japanese character which has CODE in shift_jis encoding.\n\
4ed46869
KH
5662Return the corresponding character.")
5663 (code)
5664 Lisp_Object code;
5665{
5666 unsigned char c1, c2, s1, s2;
5667 Lisp_Object val;
5668
5669 CHECK_NUMBER (code, 0);
5670 s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
55ab7be3
KH
5671 if (s1 == 0)
5672 {
c28a9453
KH
5673 if (s2 < 0x80)
5674 XSETFASTINT (val, s2);
5675 else if (s2 >= 0xA0 || s2 <= 0xDF)
5676 XSETFASTINT (val,
5677 MAKE_NON_ASCII_CHAR (charset_katakana_jisx0201, s2, 0));
5678 else
9da8350f 5679 error ("Invalid Shift JIS code: %x", XFASTINT (code));
55ab7be3
KH
5680 }
5681 else
5682 {
5683 if ((s1 < 0x80 || s1 > 0x9F && s1 < 0xE0 || s1 > 0xEF)
5684 || (s2 < 0x40 || s2 == 0x7F || s2 > 0xFC))
9da8350f 5685 error ("Invalid Shift JIS code: %x", XFASTINT (code));
55ab7be3
KH
5686 DECODE_SJIS (s1, s2, c1, c2);
5687 XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset_jisx0208, c1, c2));
5688 }
4ed46869
KH
5689 return val;
5690}
5691
5692DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
55ab7be3
KH
5693 "Encode a Japanese character CHAR to shift_jis encoding.\n\
5694Return the corresponding code in SJIS.")
4ed46869
KH
5695 (ch)
5696 Lisp_Object ch;
5697{
bcf26d6a 5698 int charset, c1, c2, s1, s2;
4ed46869
KH
5699 Lisp_Object val;
5700
5701 CHECK_NUMBER (ch, 0);
5702 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
c28a9453
KH
5703 if (charset == CHARSET_ASCII)
5704 {
5705 val = ch;
5706 }
5707 else if (charset == charset_jisx0208
5708 && c1 > 0x20 && c1 < 0x7F && c2 > 0x20 && c2 < 0x7F)
4ed46869
KH
5709 {
5710 ENCODE_SJIS (c1, c2, s1, s2);
bcf26d6a 5711 XSETFASTINT (val, (s1 << 8) | s2);
4ed46869 5712 }
55ab7be3
KH
5713 else if (charset == charset_katakana_jisx0201
5714 && c1 > 0x20 && c2 < 0xE0)
5715 {
5716 XSETFASTINT (val, c1 | 0x80);
5717 }
4ed46869 5718 else
55ab7be3 5719 error ("Can't encode to shift_jis: %d", XFASTINT (ch));
4ed46869
KH
5720 return val;
5721}
5722
5723DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
c28a9453 5724 "Decode a Big5 character which has CODE in BIG5 coding system.\n\
4ed46869
KH
5725Return the corresponding character.")
5726 (code)
5727 Lisp_Object code;
5728{
5729 int charset;
5730 unsigned char b1, b2, c1, c2;
5731 Lisp_Object val;
5732
5733 CHECK_NUMBER (code, 0);
5734 b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF;
c28a9453
KH
5735 if (b1 == 0)
5736 {
5737 if (b2 >= 0x80)
9da8350f 5738 error ("Invalid BIG5 code: %x", XFASTINT (code));
c28a9453
KH
5739 val = code;
5740 }
5741 else
5742 {
5743 if ((b1 < 0xA1 || b1 > 0xFE)
5744 || (b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE))
9da8350f 5745 error ("Invalid BIG5 code: %x", XFASTINT (code));
c28a9453
KH
5746 DECODE_BIG5 (b1, b2, charset, c1, c2);
5747 XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset, c1, c2));
5748 }
4ed46869
KH
5749 return val;
5750}
5751
5752DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
d46c5b12 5753 "Encode the Big5 character CHAR to BIG5 coding system.\n\
4ed46869
KH
5754Return the corresponding character code in Big5.")
5755 (ch)
5756 Lisp_Object ch;
5757{
bcf26d6a 5758 int charset, c1, c2, b1, b2;
4ed46869
KH
5759 Lisp_Object val;
5760
5761 CHECK_NUMBER (ch, 0);
5762 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
c28a9453
KH
5763 if (charset == CHARSET_ASCII)
5764 {
5765 val = ch;
5766 }
5767 else if ((charset == charset_big5_1
5768 && (XFASTINT (ch) >= 0x250a1 && XFASTINT (ch) <= 0x271ec))
5769 || (charset == charset_big5_2
5770 && XFASTINT (ch) >= 0x290a1 && XFASTINT (ch) <= 0x2bdb2))
4ed46869
KH
5771 {
5772 ENCODE_BIG5 (charset, c1, c2, b1, b2);
bcf26d6a 5773 XSETFASTINT (val, (b1 << 8) | b2);
4ed46869
KH
5774 }
5775 else
c28a9453 5776 error ("Can't encode to Big5: %d", XFASTINT (ch));
4ed46869
KH
5777 return val;
5778}
3a73fa5d 5779\f
1ba9e4ab
KH
5780DEFUN ("set-terminal-coding-system-internal",
5781 Fset_terminal_coding_system_internal,
5782 Sset_terminal_coding_system_internal, 1, 1, 0, "")
4ed46869
KH
5783 (coding_system)
5784 Lisp_Object coding_system;
5785{
5786 CHECK_SYMBOL (coding_system, 0);
5787 setup_coding_system (Fcheck_coding_system (coding_system), &terminal_coding);
70c22245 5788 /* We had better not send unsafe characters to terminal. */
6e85d753 5789 terminal_coding.flags |= CODING_FLAG_ISO_SAFE;
ec6d2bb8
KH
5790 /* Characer composition should be disabled. */
5791 terminal_coding.composing = COMPOSITION_DISABLED;
4ed46869
KH
5792 return Qnil;
5793}
5794
c4825358
KH
5795DEFUN ("set-safe-terminal-coding-system-internal",
5796 Fset_safe_terminal_coding_system_internal,
5797 Sset_safe_terminal_coding_system_internal, 1, 1, 0, "")
5798 (coding_system)
5799 Lisp_Object coding_system;
5800{
5801 CHECK_SYMBOL (coding_system, 0);
5802 setup_coding_system (Fcheck_coding_system (coding_system),
5803 &safe_terminal_coding);
ec6d2bb8
KH
5804 /* Characer composition should be disabled. */
5805 safe_terminal_coding.composing = COMPOSITION_DISABLED;
c4825358
KH
5806 return Qnil;
5807}
5808
4ed46869
KH
5809DEFUN ("terminal-coding-system",
5810 Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
3a73fa5d 5811 "Return coding system specified for terminal output.")
4ed46869
KH
5812 ()
5813{
5814 return terminal_coding.symbol;
5815}
5816
1ba9e4ab
KH
5817DEFUN ("set-keyboard-coding-system-internal",
5818 Fset_keyboard_coding_system_internal,
5819 Sset_keyboard_coding_system_internal, 1, 1, 0, "")
4ed46869
KH
5820 (coding_system)
5821 Lisp_Object coding_system;
5822{
5823 CHECK_SYMBOL (coding_system, 0);
5824 setup_coding_system (Fcheck_coding_system (coding_system), &keyboard_coding);
ec6d2bb8
KH
5825 /* Characer composition should be disabled. */
5826 keyboard_coding.composing = COMPOSITION_DISABLED;
4ed46869
KH
5827 return Qnil;
5828}
5829
5830DEFUN ("keyboard-coding-system",
5831 Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
3a73fa5d 5832 "Return coding system specified for decoding keyboard input.")
4ed46869
KH
5833 ()
5834{
5835 return keyboard_coding.symbol;
5836}
5837
5838\f
a5d301df
KH
5839DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
5840 Sfind_operation_coding_system, 1, MANY, 0,
5841 "Choose a coding system for an operation based on the target name.\n\
69f76525 5842The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).\n\
9ce27fde
KH
5843DECODING-SYSTEM is the coding system to use for decoding\n\
5844\(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system\n\
5845for encoding (in case OPERATION does encoding).\n\
ccdb79f5
RS
5846\n\
5847The first argument OPERATION specifies an I/O primitive:\n\
5848 For file I/O, `insert-file-contents' or `write-region'.\n\
5849 For process I/O, `call-process', `call-process-region', or `start-process'.\n\
5850 For network I/O, `open-network-stream'.\n\
5851\n\
5852The remaining arguments should be the same arguments that were passed\n\
5853to the primitive. Depending on which primitive, one of those arguments\n\
5854is selected as the TARGET. For example, if OPERATION does file I/O,\n\
5855whichever argument specifies the file name is TARGET.\n\
5856\n\
5857TARGET has a meaning which depends on OPERATION:\n\
4ed46869
KH
5858 For file I/O, TARGET is a file name.\n\
5859 For process I/O, TARGET is a process name.\n\
5860 For network I/O, TARGET is a service name or a port number\n\
5861\n\
02ba4723
KH
5862This function looks up what specified for TARGET in,\n\
5863`file-coding-system-alist', `process-coding-system-alist',\n\
5864or `network-coding-system-alist' depending on OPERATION.\n\
5865They may specify a coding system, a cons of coding systems,\n\
5866or a function symbol to call.\n\
5867In the last case, we call the function with one argument,\n\
9ce27fde 5868which is a list of all the arguments given to this function.")
4ed46869
KH
5869 (nargs, args)
5870 int nargs;
5871 Lisp_Object *args;
5872{
5873 Lisp_Object operation, target_idx, target, val;
5874 register Lisp_Object chain;
5875
5876 if (nargs < 2)
5877 error ("Too few arguments");
5878 operation = args[0];
5879 if (!SYMBOLP (operation)
5880 || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
5881 error ("Invalid first arguement");
5882 if (nargs < 1 + XINT (target_idx))
5883 error ("Too few arguments for operation: %s",
5884 XSYMBOL (operation)->name->data);
5885 target = args[XINT (target_idx) + 1];
5886 if (!(STRINGP (target)
5887 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
5888 error ("Invalid %dth argument", XINT (target_idx) + 1);
5889
2e34157c
RS
5890 chain = ((EQ (operation, Qinsert_file_contents)
5891 || EQ (operation, Qwrite_region))
02ba4723 5892 ? Vfile_coding_system_alist
2e34157c 5893 : (EQ (operation, Qopen_network_stream)
02ba4723
KH
5894 ? Vnetwork_coding_system_alist
5895 : Vprocess_coding_system_alist));
4ed46869
KH
5896 if (NILP (chain))
5897 return Qnil;
5898
03699b14 5899 for (; CONSP (chain); chain = XCDR (chain))
4ed46869 5900 {
f44d27ce 5901 Lisp_Object elt;
03699b14 5902 elt = XCAR (chain);
4ed46869
KH
5903
5904 if (CONSP (elt)
5905 && ((STRINGP (target)
03699b14
KR
5906 && STRINGP (XCAR (elt))
5907 && fast_string_match (XCAR (elt), target) >= 0)
5908 || (INTEGERP (target) && EQ (target, XCAR (elt)))))
02ba4723 5909 {
03699b14 5910 val = XCDR (elt);
b19fd4c5
KH
5911 /* Here, if VAL is both a valid coding system and a valid
5912 function symbol, we return VAL as a coding system. */
02ba4723
KH
5913 if (CONSP (val))
5914 return val;
5915 if (! SYMBOLP (val))
5916 return Qnil;
5917 if (! NILP (Fcoding_system_p (val)))
5918 return Fcons (val, val);
b19fd4c5
KH
5919 if (! NILP (Ffboundp (val)))
5920 {
5921 val = call1 (val, Flist (nargs, args));
5922 if (CONSP (val))
5923 return val;
5924 if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
5925 return Fcons (val, val);
5926 }
02ba4723
KH
5927 return Qnil;
5928 }
4ed46869
KH
5929 }
5930 return Qnil;
5931}
5932
1397dc18
KH
5933DEFUN ("update-coding-systems-internal", Fupdate_coding_systems_internal,
5934 Supdate_coding_systems_internal, 0, 0, 0,
5935 "Update internal database for ISO2022 and CCL based coding systems.\n\
fa42c37f
KH
5936When values of any coding categories are changed, you must\n\
5937call this function")
d46c5b12
KH
5938 ()
5939{
5940 int i;
5941
fa42c37f 5942 for (i = CODING_CATEGORY_IDX_EMACS_MULE; i < CODING_CATEGORY_IDX_MAX; i++)
d46c5b12 5943 {
1397dc18
KH
5944 Lisp_Object val;
5945
5946 val = XSYMBOL (XVECTOR (Vcoding_category_table)->contents[i])->value;
5947 if (!NILP (val))
5948 {
5949 if (! coding_system_table[i])
5950 coding_system_table[i] = ((struct coding_system *)
5951 xmalloc (sizeof (struct coding_system)));
5952 setup_coding_system (val, coding_system_table[i]);
5953 }
5954 else if (coding_system_table[i])
5955 {
5956 xfree (coding_system_table[i]);
5957 coding_system_table[i] = NULL;
5958 }
d46c5b12 5959 }
1397dc18 5960
d46c5b12
KH
5961 return Qnil;
5962}
5963
66cfb530
KH
5964DEFUN ("set-coding-priority-internal", Fset_coding_priority_internal,
5965 Sset_coding_priority_internal, 0, 0, 0,
5966 "Update internal database for the current value of `coding-category-list'.\n\
5967This function is internal use only.")
5968 ()
5969{
5970 int i = 0, idx;
84d60297
RS
5971 Lisp_Object val;
5972
5973 val = Vcoding_category_list;
66cfb530
KH
5974
5975 while (CONSP (val) && i < CODING_CATEGORY_IDX_MAX)
5976 {
03699b14 5977 if (! SYMBOLP (XCAR (val)))
66cfb530 5978 break;
03699b14 5979 idx = XFASTINT (Fget (XCAR (val), Qcoding_category_index));
66cfb530
KH
5980 if (idx >= CODING_CATEGORY_IDX_MAX)
5981 break;
5982 coding_priorities[i++] = (1 << idx);
03699b14 5983 val = XCDR (val);
66cfb530
KH
5984 }
5985 /* If coding-category-list is valid and contains all coding
5986 categories, `i' should be CODING_CATEGORY_IDX_MAX now. If not,
fa42c37f 5987 the following code saves Emacs from crashing. */
66cfb530
KH
5988 while (i < CODING_CATEGORY_IDX_MAX)
5989 coding_priorities[i++] = CODING_CATEGORY_MASK_RAW_TEXT;
5990
5991 return Qnil;
5992}
5993
4ed46869
KH
5994#endif /* emacs */
5995
5996\f
1397dc18 5997/*** 9. Post-amble ***/
4ed46869 5998
6d74c3aa
KH
5999void
6000init_coding ()
6001{
6002 conversion_buffer = (char *) xmalloc (MINIMUM_CONVERSION_BUFFER_SIZE);
6003}
6004
dfcf069d 6005void
4ed46869
KH
6006init_coding_once ()
6007{
6008 int i;
6009
0ef69138 6010 /* Emacs' internal format specific initialize routine. */
4ed46869
KH
6011 for (i = 0; i <= 0x20; i++)
6012 emacs_code_class[i] = EMACS_control_code;
6013 emacs_code_class[0x0A] = EMACS_linefeed_code;
6014 emacs_code_class[0x0D] = EMACS_carriage_return_code;
6015 for (i = 0x21 ; i < 0x7F; i++)
6016 emacs_code_class[i] = EMACS_ascii_code;
6017 emacs_code_class[0x7F] = EMACS_control_code;
ec6d2bb8 6018 for (i = 0x80; i < 0xFF; i++)
4ed46869
KH
6019 emacs_code_class[i] = EMACS_invalid_code;
6020 emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
6021 emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
6022 emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
6023 emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;
6024
6025 /* ISO2022 specific initialize routine. */
6026 for (i = 0; i < 0x20; i++)
6027 iso_code_class[i] = ISO_control_code;
6028 for (i = 0x21; i < 0x7F; i++)
6029 iso_code_class[i] = ISO_graphic_plane_0;
6030 for (i = 0x80; i < 0xA0; i++)
6031 iso_code_class[i] = ISO_control_code;
6032 for (i = 0xA1; i < 0xFF; i++)
6033 iso_code_class[i] = ISO_graphic_plane_1;
6034 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
6035 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
6036 iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
6037 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
6038 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
6039 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
6040 iso_code_class[ISO_CODE_ESC] = ISO_escape;
6041 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
6042 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
6043 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
6044
e0e989f6 6045 conversion_buffer_size = MINIMUM_CONVERSION_BUFFER_SIZE;
e0e989f6
KH
6046
6047 setup_coding_system (Qnil, &keyboard_coding);
6048 setup_coding_system (Qnil, &terminal_coding);
c4825358 6049 setup_coding_system (Qnil, &safe_terminal_coding);
6bc51348 6050 setup_coding_system (Qnil, &default_buffer_file_coding);
9ce27fde 6051
d46c5b12
KH
6052 bzero (coding_system_table, sizeof coding_system_table);
6053
66cfb530
KH
6054 bzero (ascii_skip_code, sizeof ascii_skip_code);
6055 for (i = 0; i < 128; i++)
6056 ascii_skip_code[i] = 1;
6057
9ce27fde
KH
6058#if defined (MSDOS) || defined (WINDOWSNT)
6059 system_eol_type = CODING_EOL_CRLF;
6060#else
6061 system_eol_type = CODING_EOL_LF;
6062#endif
b843d1ae
KH
6063
6064 inhibit_pre_post_conversion = 0;
e0e989f6
KH
6065}
6066
6067#ifdef emacs
6068
dfcf069d 6069void
e0e989f6
KH
6070syms_of_coding ()
6071{
6072 Qtarget_idx = intern ("target-idx");
6073 staticpro (&Qtarget_idx);
6074
bb0115a2
RS
6075 Qcoding_system_history = intern ("coding-system-history");
6076 staticpro (&Qcoding_system_history);
6077 Fset (Qcoding_system_history, Qnil);
6078
9ce27fde 6079 /* Target FILENAME is the first argument. */
e0e989f6 6080 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
9ce27fde 6081 /* Target FILENAME is the third argument. */
e0e989f6
KH
6082 Fput (Qwrite_region, Qtarget_idx, make_number (2));
6083
6084 Qcall_process = intern ("call-process");
6085 staticpro (&Qcall_process);
9ce27fde 6086 /* Target PROGRAM is the first argument. */
e0e989f6
KH
6087 Fput (Qcall_process, Qtarget_idx, make_number (0));
6088
6089 Qcall_process_region = intern ("call-process-region");
6090 staticpro (&Qcall_process_region);
9ce27fde 6091 /* Target PROGRAM is the third argument. */
e0e989f6
KH
6092 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
6093
6094 Qstart_process = intern ("start-process");
6095 staticpro (&Qstart_process);
9ce27fde 6096 /* Target PROGRAM is the third argument. */
e0e989f6
KH
6097 Fput (Qstart_process, Qtarget_idx, make_number (2));
6098
6099 Qopen_network_stream = intern ("open-network-stream");
6100 staticpro (&Qopen_network_stream);
9ce27fde 6101 /* Target SERVICE is the fourth argument. */
e0e989f6
KH
6102 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
6103
4ed46869
KH
6104 Qcoding_system = intern ("coding-system");
6105 staticpro (&Qcoding_system);
6106
6107 Qeol_type = intern ("eol-type");
6108 staticpro (&Qeol_type);
6109
6110 Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
6111 staticpro (&Qbuffer_file_coding_system);
6112
6113 Qpost_read_conversion = intern ("post-read-conversion");
6114 staticpro (&Qpost_read_conversion);
6115
6116 Qpre_write_conversion = intern ("pre-write-conversion");
6117 staticpro (&Qpre_write_conversion);
6118
27901516
KH
6119 Qno_conversion = intern ("no-conversion");
6120 staticpro (&Qno_conversion);
6121
6122 Qundecided = intern ("undecided");
6123 staticpro (&Qundecided);
6124
4ed46869
KH
6125 Qcoding_system_p = intern ("coding-system-p");
6126 staticpro (&Qcoding_system_p);
6127
6128 Qcoding_system_error = intern ("coding-system-error");
6129 staticpro (&Qcoding_system_error);
6130
6131 Fput (Qcoding_system_error, Qerror_conditions,
6132 Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
6133 Fput (Qcoding_system_error, Qerror_message,
9ce27fde 6134 build_string ("Invalid coding system"));
4ed46869 6135
d46c5b12
KH
6136 Qcoding_category = intern ("coding-category");
6137 staticpro (&Qcoding_category);
4ed46869
KH
6138 Qcoding_category_index = intern ("coding-category-index");
6139 staticpro (&Qcoding_category_index);
6140
d46c5b12
KH
6141 Vcoding_category_table
6142 = Fmake_vector (make_number (CODING_CATEGORY_IDX_MAX), Qnil);
6143 staticpro (&Vcoding_category_table);
4ed46869
KH
6144 {
6145 int i;
6146 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
6147 {
d46c5b12
KH
6148 XVECTOR (Vcoding_category_table)->contents[i]
6149 = intern (coding_category_name[i]);
6150 Fput (XVECTOR (Vcoding_category_table)->contents[i],
6151 Qcoding_category_index, make_number (i));
4ed46869
KH
6152 }
6153 }
6154
f967223b
KH
6155 Qtranslation_table = intern ("translation-table");
6156 staticpro (&Qtranslation_table);
1397dc18 6157 Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (1));
bdd9fb48 6158
f967223b
KH
6159 Qtranslation_table_id = intern ("translation-table-id");
6160 staticpro (&Qtranslation_table_id);
84fbb8a0 6161
f967223b
KH
6162 Qtranslation_table_for_decode = intern ("translation-table-for-decode");
6163 staticpro (&Qtranslation_table_for_decode);
a5d301df 6164
f967223b
KH
6165 Qtranslation_table_for_encode = intern ("translation-table-for-encode");
6166 staticpro (&Qtranslation_table_for_encode);
a5d301df 6167
70c22245
KH
6168 Qsafe_charsets = intern ("safe-charsets");
6169 staticpro (&Qsafe_charsets);
6170
1397dc18
KH
6171 Qvalid_codes = intern ("valid-codes");
6172 staticpro (&Qvalid_codes);
6173
9ce27fde
KH
6174 Qemacs_mule = intern ("emacs-mule");
6175 staticpro (&Qemacs_mule);
6176
d46c5b12
KH
6177 Qraw_text = intern ("raw-text");
6178 staticpro (&Qraw_text);
6179
4ed46869
KH
6180 defsubr (&Scoding_system_p);
6181 defsubr (&Sread_coding_system);
6182 defsubr (&Sread_non_nil_coding_system);
6183 defsubr (&Scheck_coding_system);
6184 defsubr (&Sdetect_coding_region);
d46c5b12 6185 defsubr (&Sdetect_coding_string);
4ed46869
KH
6186 defsubr (&Sdecode_coding_region);
6187 defsubr (&Sencode_coding_region);
6188 defsubr (&Sdecode_coding_string);
6189 defsubr (&Sencode_coding_string);
6190 defsubr (&Sdecode_sjis_char);
6191 defsubr (&Sencode_sjis_char);
6192 defsubr (&Sdecode_big5_char);
6193 defsubr (&Sencode_big5_char);
1ba9e4ab 6194 defsubr (&Sset_terminal_coding_system_internal);
c4825358 6195 defsubr (&Sset_safe_terminal_coding_system_internal);
4ed46869 6196 defsubr (&Sterminal_coding_system);
1ba9e4ab 6197 defsubr (&Sset_keyboard_coding_system_internal);
4ed46869 6198 defsubr (&Skeyboard_coding_system);
a5d301df 6199 defsubr (&Sfind_operation_coding_system);
1397dc18 6200 defsubr (&Supdate_coding_systems_internal);
66cfb530 6201 defsubr (&Sset_coding_priority_internal);
4ed46869 6202
4608c386
KH
6203 DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
6204 "List of coding systems.\n\
6205\n\
6206Do not alter the value of this variable manually. This variable should be\n\
6207updated by the functions `make-coding-system' and\n\
6208`define-coding-system-alias'.");
6209 Vcoding_system_list = Qnil;
6210
6211 DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
6212 "Alist of coding system names.\n\
6213Each element is one element list of coding system name.\n\
6214This variable is given to `completing-read' as TABLE argument.\n\
6215\n\
6216Do not alter the value of this variable manually. This variable should be\n\
6217updated by the functions `make-coding-system' and\n\
6218`define-coding-system-alias'.");
6219 Vcoding_system_alist = Qnil;
6220
4ed46869
KH
6221 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
6222 "List of coding-categories (symbols) ordered by priority.");
6223 {
6224 int i;
6225
6226 Vcoding_category_list = Qnil;
6227 for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
6228 Vcoding_category_list
d46c5b12
KH
6229 = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
6230 Vcoding_category_list);
4ed46869
KH
6231 }
6232
6233 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
10bff6f1 6234 "Specify the coding system for read operations.\n\
2ebb362d 6235It is useful to bind this variable with `let', but do not set it globally.\n\
4ed46869 6236If the value is a coding system, it is used for decoding on read operation.\n\
a67a9c66 6237If not, an appropriate element is used from one of the coding system alists:\n\
10bff6f1 6238There are three such tables, `file-coding-system-alist',\n\
a67a9c66 6239`process-coding-system-alist', and `network-coding-system-alist'.");
4ed46869
KH
6240 Vcoding_system_for_read = Qnil;
6241
6242 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
10bff6f1 6243 "Specify the coding system for write operations.\n\
928aedd8
RS
6244Programs bind this variable with `let', but you should not set it globally.\n\
6245If the value is a coding system, it is used for encoding of output,\n\
6246when writing it to a file and when sending it to a file or subprocess.\n\
6247\n\
6248If this does not specify a coding system, an appropriate element\n\
6249is used from one of the coding system alists:\n\
10bff6f1 6250There are three such tables, `file-coding-system-alist',\n\
928aedd8
RS
6251`process-coding-system-alist', and `network-coding-system-alist'.\n\
6252For output to files, if the above procedure does not specify a coding system,\n\
6253the value of `buffer-file-coding-system' is used.");
4ed46869
KH
6254 Vcoding_system_for_write = Qnil;
6255
6256 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
a67a9c66 6257 "Coding system used in the latest file or process I/O.");
4ed46869
KH
6258 Vlast_coding_system_used = Qnil;
6259
9ce27fde 6260 DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
f07f4a24 6261 "*Non-nil means always inhibit code conversion of end-of-line format.\n\
94c7a214
DL
6262See info node `Coding Systems' and info node `Text and Binary' concerning\n\
6263such conversion.");
9ce27fde
KH
6264 inhibit_eol_conversion = 0;
6265
ed29121d
EZ
6266 DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
6267 "Non-nil means process buffer inherits coding system of process output.\n\
6268Bind it to t if the process output is to be treated as if it were a file\n\
6269read from some filesystem.");
6270 inherit_process_coding_system = 0;
6271
02ba4723
KH
6272 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
6273 "Alist to decide a coding system to use for a file I/O operation.\n\
6274The format is ((PATTERN . VAL) ...),\n\
6275where PATTERN is a regular expression matching a file name,\n\
6276VAL is a coding system, a cons of coding systems, or a function symbol.\n\
6277If VAL is a coding system, it is used for both decoding and encoding\n\
6278the file contents.\n\
6279If VAL is a cons of coding systems, the car part is used for decoding,\n\
6280and the cdr part is used for encoding.\n\
6281If VAL is a function symbol, the function must return a coding system\n\
6282or a cons of coding systems which are used as above.\n\
e0e989f6 6283\n\
a85a871a 6284See also the function `find-operation-coding-system'\n\
eda284ac 6285and the variable `auto-coding-alist'.");
02ba4723
KH
6286 Vfile_coding_system_alist = Qnil;
6287
6288 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
6289 "Alist to decide a coding system to use for a process I/O operation.\n\
6290The format is ((PATTERN . VAL) ...),\n\
6291where PATTERN is a regular expression matching a program name,\n\
6292VAL is a coding system, a cons of coding systems, or a function symbol.\n\
6293If VAL is a coding system, it is used for both decoding what received\n\
6294from the program and encoding what sent to the program.\n\
6295If VAL is a cons of coding systems, the car part is used for decoding,\n\
6296and the cdr part is used for encoding.\n\
6297If VAL is a function symbol, the function must return a coding system\n\
6298or a cons of coding systems which are used as above.\n\
4ed46869 6299\n\
9ce27fde 6300See also the function `find-operation-coding-system'.");
02ba4723
KH
6301 Vprocess_coding_system_alist = Qnil;
6302
6303 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
6304 "Alist to decide a coding system to use for a network I/O operation.\n\
6305The format is ((PATTERN . VAL) ...),\n\
6306where PATTERN is a regular expression matching a network service name\n\
6307or is a port number to connect to,\n\
6308VAL is a coding system, a cons of coding systems, or a function symbol.\n\
6309If VAL is a coding system, it is used for both decoding what received\n\
6310from the network stream and encoding what sent to the network stream.\n\
6311If VAL is a cons of coding systems, the car part is used for decoding,\n\
6312and the cdr part is used for encoding.\n\
6313If VAL is a function symbol, the function must return a coding system\n\
6314or a cons of coding systems which are used as above.\n\
4ed46869 6315\n\
9ce27fde 6316See also the function `find-operation-coding-system'.");
02ba4723 6317 Vnetwork_coding_system_alist = Qnil;
4ed46869 6318
68c45bf0
PE
6319 DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
6320 "Coding system to use with system messages.");
6321 Vlocale_coding_system = Qnil;
6322
7722baf9
EZ
6323 DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
6324 "*String displayed in mode line for UNIX-like (LF) end-of-line format.");
6325 eol_mnemonic_unix = build_string (":");
4ed46869 6326
7722baf9
EZ
6327 DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
6328 "*String displayed in mode line for DOS-like (CRLF) end-of-line format.");
6329 eol_mnemonic_dos = build_string ("\\");
4ed46869 6330
7722baf9
EZ
6331 DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
6332 "*String displayed in mode line for MAC-like (CR) end-of-line format.");
6333 eol_mnemonic_mac = build_string ("/");
4ed46869 6334
7722baf9
EZ
6335 DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
6336 "*String displayed in mode line when end-of-line format is not yet determined.");
6337 eol_mnemonic_undecided = build_string (":");
4ed46869 6338
84fbb8a0 6339 DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
f967223b 6340 "*Non-nil enables character translation while encoding and decoding.");
84fbb8a0 6341 Venable_character_translation = Qt;
bdd9fb48 6342
f967223b
KH
6343 DEFVAR_LISP ("standard-translation-table-for-decode",
6344 &Vstandard_translation_table_for_decode,
84fbb8a0 6345 "Table for translating characters while decoding.");
f967223b 6346 Vstandard_translation_table_for_decode = Qnil;
bdd9fb48 6347
f967223b
KH
6348 DEFVAR_LISP ("standard-translation-table-for-encode",
6349 &Vstandard_translation_table_for_encode,
84fbb8a0 6350 "Table for translationg characters while encoding.");
f967223b 6351 Vstandard_translation_table_for_encode = Qnil;
4ed46869
KH
6352
6353 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
6354 "Alist of charsets vs revision numbers.\n\
6355While encoding, if a charset (car part of an element) is found,\n\
6356designate it with the escape sequence identifing revision (cdr part of the element).");
6357 Vcharset_revision_alist = Qnil;
02ba4723
KH
6358
6359 DEFVAR_LISP ("default-process-coding-system",
6360 &Vdefault_process_coding_system,
6361 "Cons of coding systems used for process I/O by default.\n\
6362The car part is used for decoding a process output,\n\
6363the cdr part is used for encoding a text to be sent to a process.");
6364 Vdefault_process_coding_system = Qnil;
c4825358 6365
3f003981
KH
6366 DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
6367 "Table of extra Latin codes in the range 128..159 (inclusive).\n\
c4825358
KH
6368This is a vector of length 256.\n\
6369If Nth element is non-nil, the existence of code N in a file\n\
bb0115a2 6370\(or output of subprocess) doesn't prevent it to be detected as\n\
3f003981
KH
6371a coding system of ISO 2022 variant which has a flag\n\
6372`accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file\n\
c4825358
KH
6373or reading output of a subprocess.\n\
6374Only 128th through 159th elements has a meaning.");
3f003981 6375 Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
d46c5b12
KH
6376
6377 DEFVAR_LISP ("select-safe-coding-system-function",
6378 &Vselect_safe_coding_system_function,
6379 "Function to call to select safe coding system for encoding a text.\n\
6380\n\
6381If set, this function is called to force a user to select a proper\n\
6382coding system which can encode the text in the case that a default\n\
6383coding system used in each operation can't encode the text.\n\
6384\n\
a85a871a 6385The default value is `select-safe-coding-system' (which see).");
d46c5b12
KH
6386 Vselect_safe_coding_system_function = Qnil;
6387
4ed46869
KH
6388}
6389
68c45bf0
PE
6390char *
6391emacs_strerror (error_number)
6392 int error_number;
6393{
6394 char *str;
6395
ca9c0567 6396 synchronize_system_messages_locale ();
68c45bf0
PE
6397 str = strerror (error_number);
6398
6399 if (! NILP (Vlocale_coding_system))
6400 {
6401 Lisp_Object dec = code_convert_string_norecord (build_string (str),
6402 Vlocale_coding_system,
6403 0);
6404 str = (char *) XSTRING (dec)->data;
6405 }
6406
6407 return str;
6408}
6409
4ed46869 6410#endif /* emacs */