(describe_vector): Adjusted for the change of CHAR_TABLE_ORDINARY_SLOTS.
[bpt/emacs.git] / src / coding.c
CommitLineData
4ed46869
KH
1/* Coding system handler (conversion, detection, and etc).
2 Ver.1.0.
4ed46869
KH
3 Copyright (C) 1995 Free Software Foundation, Inc.
4 Copyright (C) 1995 Electrotechnical Laboratory, JAPAN.
5
369314dc
KH
6This file is part of GNU Emacs.
7
8GNU Emacs is free software; you can redistribute it and/or modify
9it under the terms of the GNU General Public License as published by
10the Free Software Foundation; either version 2, or (at your option)
11any later version.
4ed46869 12
369314dc
KH
13GNU Emacs is distributed in the hope that it will be useful,
14but WITHOUT ANY WARRANTY; without even the implied warranty of
15MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16GNU General Public License for more details.
4ed46869 17
369314dc
KH
18You should have received a copy of the GNU General Public License
19along with GNU Emacs; see the file COPYING. If not, write to
20the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
21Boston, MA 02111-1307, USA. */
4ed46869
KH
22
23/*** TABLE OF CONTENTS ***
24
25 1. Preamble
26 2. Emacs' internal format handlers
27 3. ISO2022 handlers
28 4. Shift-JIS and BIG5 handlers
29 5. End-of-line handlers
30 6. C library functions
31 7. Emacs Lisp library functions
32 8. Post-amble
33
34*/
35
36/*** GENERAL NOTE on CODING SYSTEM ***
37
38 Coding system is an encoding mechanism of one or more character
39 sets. Here's a list of coding systems which Emacs can handle. When
40 we say "decode", it means converting some other coding system to
41 Emacs' internal format, and when we say "encode", it means
42 converting Emacs' internal format to some other coding system.
43
44 0. Emacs' internal format
45
46 Emacs itself holds a multi-lingual character in a buffer and a string
47 in a special format. Details are described in the section 2.
48
49 1. ISO2022
50
51 The most famous coding system for multiple character sets. X's
52 Compound Text, various EUCs (Extended Unix Code), and such coding
53 systems used in Internet communication as ISO-2022-JP are all
54 variants of ISO2022. Details are described in the section 3.
55
56 2. SJIS (or Shift-JIS or MS-Kanji-Code)
57
58 A coding system to encode character sets: ASCII, JISX0201, and
59 JISX0208. Widely used for PC's in Japan. Details are described in
60 the section 4.
61
62 3. BIG5
63
64 A coding system to encode character sets: ASCII and Big5. Widely
65 used by Chinese (mainly in Taiwan and Hong Kong). Details are
66 described in the section 4. In this file, when written as "BIG5"
67 (all uppercase), it means the coding system, and when written as
68 "Big5" (capitalized), it means the character set.
69
70 4. Else
71
72 If a user want to read/write a text encoded in a coding system not
73 listed above, he can supply a decoder and an encoder for it in CCL
74 (Code Conversion Language) programs. Emacs executes the CCL program
75 while reading/writing.
76
77 Emacs represent a coding-system by a Lisp symbol that has a property
78 `coding-system'. But, before actually using the coding-system, the
79 information about it is set in a structure of type `struct
80 coding_system' for rapid processing. See the section 6 for more
81 detail.
82
83*/
84
85/*** GENERAL NOTES on END-OF-LINE FORMAT ***
86
87 How end-of-line of a text is encoded depends on a system. For
88 instance, Unix's format is just one byte of `line-feed' code,
89 whereas DOS's format is two bytes sequence of `carriage-return' and
90 `line-feed' codes. MacOS's format is one byte of `carriage-return'.
91
92 Since how characters in a text is encoded and how end-of-line is
93 encoded is independent, any coding system described above can take
94 any format of end-of-line. So, Emacs has information of format of
95 end-of-line in each coding-system. See the section 6 for more
96 detail.
97
98*/
99
100/*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
101
102 These functions check if a text between SRC and SRC_END is encoded
103 in the coding system category XXX. Each returns an integer value in
104 which appropriate flag bits for the category XXX is set. The flag
105 bits are defined in macros CODING_CATEGORY_MASK_XXX. Below is the
106 template of these functions. */
107#if 0
108int
109detect_coding_internal (src, src_end)
110 unsigned char *src, *src_end;
111{
112 ...
113}
114#endif
115
116/*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
117
118 These functions decode SRC_BYTES length text at SOURCE encoded in
119 CODING to Emacs' internal format. The resulting text goes to a
120 place pointed by DESTINATION, the length of which should not exceed
121 DST_BYTES. The bytes actually processed is returned as *CONSUMED.
122 The return value is the length of the decoded text. Below is a
123 template of these functions. */
124#if 0
125decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes, consumed)
126 struct coding_system *coding;
127 unsigned char *source, *destination;
128 int src_bytes, dst_bytes;
129 int *consumed;
130{
131 ...
132}
133#endif
134
135/*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
136
137 These functions encode SRC_BYTES length text at SOURCE of Emacs
138 internal format to CODING. The resulting text goes to a place
139 pointed by DESTINATION, the length of which should not exceed
140 DST_BYTES. The bytes actually processed is returned as *CONSUMED.
141 The return value is the length of the encoded text. Below is a
142 template of these functions. */
143#if 0
144encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes, consumed)
145 struct coding_system *coding;
146 unsigned char *source, *destination;
147 int src_bytes, dst_bytes;
148 int *consumed;
149{
150 ...
151}
152#endif
153
154/*** COMMONLY USED MACROS ***/
155
156/* The following three macros ONE_MORE_BYTE, TWO_MORE_BYTES, and
157 THREE_MORE_BYTES safely get one, two, and three bytes from the
158 source text respectively. If there are not enough bytes in the
159 source, they jump to `label_end_of_loop'. The caller should set
160 variables `src' and `src_end' to appropriate areas in advance. */
161
162#define ONE_MORE_BYTE(c1) \
163 do { \
164 if (src < src_end) \
165 c1 = *src++; \
166 else \
167 goto label_end_of_loop; \
168 } while (0)
169
170#define TWO_MORE_BYTES(c1, c2) \
171 do { \
172 if (src + 1 < src_end) \
173 c1 = *src++, c2 = *src++; \
174 else \
175 goto label_end_of_loop; \
176 } while (0)
177
178#define THREE_MORE_BYTES(c1, c2, c3) \
179 do { \
180 if (src + 2 < src_end) \
181 c1 = *src++, c2 = *src++, c3 = *src++; \
182 else \
183 goto label_end_of_loop; \
184 } while (0)
185
186/* The following three macros DECODE_CHARACTER_ASCII,
187 DECODE_CHARACTER_DIMENSION1, and DECODE_CHARACTER_DIMENSION2 put
188 the multi-byte form of a character of each class at the place
189 pointed by `dst'. The caller should set the variable `dst' to
190 point to an appropriate area and the variable `coding' to point to
191 the coding-system of the currently decoding text in advance. */
192
193/* Decode one ASCII character C. */
194
195#define DECODE_CHARACTER_ASCII(c) \
196 do { \
197 if (COMPOSING_P (coding->composing)) \
198 *dst++ = 0xA0, *dst++ = (c) | 0x80; \
199 else \
200 *dst++ = (c); \
201 } while (0)
202
203/* Decode one DIMENSION1 character of which charset is CHARSET and
204 position-code is C. */
205
206#define DECODE_CHARACTER_DIMENSION1(charset, c) \
207 do { \
208 unsigned char leading_code = CHARSET_LEADING_CODE_BASE (charset); \
209 if (COMPOSING_P (coding->composing)) \
210 *dst++ = leading_code + 0x20; \
211 else \
212 *dst++ = leading_code; \
213 if (leading_code = CHARSET_LEADING_CODE_EXT (charset)) \
214 *dst++ = leading_code; \
215 *dst++ = (c) | 0x80; \
216 } while (0)
217
218/* Decode one DIMENSION2 character of which charset is CHARSET and
219 position-codes are C1 and C2. */
220
221#define DECODE_CHARACTER_DIMENSION2(charset, c1, c2) \
222 do { \
223 DECODE_CHARACTER_DIMENSION1 (charset, c1); \
224 *dst++ = (c2) | 0x80; \
225 } while (0)
226
227\f
228/*** 1. Preamble ***/
229
230#include <stdio.h>
231
232#ifdef emacs
233
234#include <config.h>
235#include "lisp.h"
236#include "buffer.h"
237#include "charset.h"
238#include "ccl.h"
239#include "coding.h"
240#include "window.h"
241
242#else /* not emacs */
243
244#include "mulelib.h"
245
246#endif /* not emacs */
247
248Lisp_Object Qcoding_system, Qeol_type;
249Lisp_Object Qbuffer_file_coding_system;
250Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
251
252extern Lisp_Object Qinsert_file_contents, Qwrite_region;
253Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
254Lisp_Object Qstart_process, Qopen_network_stream;
255Lisp_Object Qtarget_idx;
256
257/* Mnemonic character of each format of end-of-line. */
258int eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
259/* Mnemonic character to indicate format of end-of-line is not yet
260 decided. */
261int eol_mnemonic_undecided;
262
263#ifdef emacs
264
265Lisp_Object Qcoding_system_vector, Qcoding_system_p, Qcoding_system_error;
266
267/* Coding-systems are handed between Emacs Lisp programs and C internal
268 routines by the following three variables. */
269/* Coding-system for reading files and receiving data from process. */
270Lisp_Object Vcoding_system_for_read;
271/* Coding-system for writing files and sending data to process. */
272Lisp_Object Vcoding_system_for_write;
273/* Coding-system actually used in the latest I/O. */
274Lisp_Object Vlast_coding_system_used;
275
276/* Coding-system of what terminal accept for displaying. */
277struct coding_system terminal_coding;
278
279/* Coding-system of what is sent from terminal keyboard. */
280struct coding_system keyboard_coding;
281
282Lisp_Object Vcoding_system_alist;
283
284#endif /* emacs */
285
286Lisp_Object Qcoding_category_index;
287
288/* List of symbols `coding-category-xxx' ordered by priority. */
289Lisp_Object Vcoding_category_list;
290
291/* Table of coding-systems currently assigned to each coding-category. */
292Lisp_Object coding_category_table[CODING_CATEGORY_IDX_MAX];
293
294/* Table of names of symbol for each coding-category. */
295char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
296 "coding-category-internal",
297 "coding-category-sjis",
298 "coding-category-iso-7",
299 "coding-category-iso-8-1",
300 "coding-category-iso-8-2",
301 "coding-category-iso-else",
302 "coding-category-big5",
303 "coding-category-binary"
304};
305
306/* Alist of charsets vs the alternate charsets. */
307Lisp_Object Valternate_charset_table;
308
309/* Alist of charsets vs revision number. */
310Lisp_Object Vcharset_revision_alist;
311
312\f
313/*** 2. Emacs internal format handlers ***/
314
315/* Emacs' internal format for encoding multiple character sets is a
316 kind of multi-byte encoding, i.e. encoding a character by a sequence
317 of one-byte codes of variable length. ASCII characters and control
318 characters (e.g. `tab', `newline') are represented by one-byte as
319 is. It takes the range 0x00 through 0x7F. The other characters
320 are represented by a sequence of `base leading-code', optional
321 `extended leading-code', and one or two `position-code's. Length
322 of the sequence is decided by the base leading-code. Leading-code
323 takes the range 0x80 through 0x9F, whereas extended leading-code
324 and position-code take the range 0xA0 through 0xFF. See the
325 document of `charset.h' for more detail about leading-code and
326 position-code.
327
328 There's one exception in this rule. Special leading-code
329 `leading-code-composition' denotes that the following several
330 characters should be composed into one character. Leading-codes of
331 components (except for ASCII) are added 0x20. An ASCII character
332 component is represented by a 2-byte sequence of `0xA0' and
333 `ASCII-code + 0x80'. See also the document in `charset.h' for the
334 detail of composite character. Hence, we can summarize the code
335 range as follows:
336
337 --- CODE RANGE of Emacs' internal format ---
338 (character set) (range)
339 ASCII 0x00 .. 0x7F
340 ELSE (1st byte) 0x80 .. 0x9F
341 (rest bytes) 0xA0 .. 0xFF
342 ---------------------------------------------
343
344 */
345
346enum emacs_code_class_type emacs_code_class[256];
347
348/* Go to the next statement only if *SRC is accessible and the code is
349 greater than 0xA0. */
350#define CHECK_CODE_RANGE_A0_FF \
351 do { \
352 if (src >= src_end) \
353 goto label_end_of_switch; \
354 else if (*src++ < 0xA0) \
355 return 0; \
356 } while (0)
357
358/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
359 Check if a text is encoded in Emacs' internal format. If it is,
360 return CODING_CATEGORY_MASK_INTERNAL, else return 0. */
361
362int
363detect_coding_internal (src, src_end)
364 unsigned char *src, *src_end;
365{
366 unsigned char c;
367 int composing = 0;
368
369 while (src < src_end)
370 {
371 c = *src++;
372
373 if (composing)
374 {
375 if (c < 0xA0)
376 composing = 0;
377 else
378 c -= 0x20;
379 }
380
381 switch (emacs_code_class[c])
382 {
383 case EMACS_ascii_code:
384 case EMACS_linefeed_code:
385 break;
386
387 case EMACS_control_code:
388 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
389 return 0;
390 break;
391
392 case EMACS_invalid_code:
393 return 0;
394
395 case EMACS_leading_code_composition: /* c == 0x80 */
396 if (composing)
397 CHECK_CODE_RANGE_A0_FF;
398 else
399 composing = 1;
400 break;
401
402 case EMACS_leading_code_4:
403 CHECK_CODE_RANGE_A0_FF;
404 /* fall down to check it two more times ... */
405
406 case EMACS_leading_code_3:
407 CHECK_CODE_RANGE_A0_FF;
408 /* fall down to check it one more time ... */
409
410 case EMACS_leading_code_2:
411 CHECK_CODE_RANGE_A0_FF;
412 break;
413
414 default:
415 label_end_of_switch:
416 break;
417 }
418 }
419 return CODING_CATEGORY_MASK_INTERNAL;
420}
421
422\f
423/*** 3. ISO2022 handlers ***/
424
425/* The following note describes the coding system ISO2022 briefly.
426 Since the intension of this note is to help understanding of the
427 programs in this file, some parts are NOT ACCURATE or OVERLY
428 SIMPLIFIED. For the thorough understanding, please refer to the
429 original document of ISO2022.
430
431 ISO2022 provides many mechanisms to encode several character sets
432 in 7-bit and 8-bit environment. If one choose 7-bite environment,
433 all text is encoded by codes of less than 128. This may make the
434 encoded text a little bit longer, but the text get more stability
435 to pass through several gateways (some of them split MSB off).
436
437 There are two kind of character set: control character set and
438 graphic character set. The former contains control characters such
439 as `newline' and `escape' to provide control functions (control
440 functions are provided also by escape sequence). The latter
441 contains graphic characters such as ' A' and '-'. Emacs recognizes
442 two control character sets and many graphic character sets.
443
444 Graphic character sets are classified into one of the following
445 four classes, DIMENSION1_CHARS94, DIMENSION1_CHARS96,
446 DIMENSION2_CHARS94, DIMENSION2_CHARS96 according to the number of
447 bytes (DIMENSION) and the number of characters in one dimension
448 (CHARS) of the set. In addition, each character set is assigned an
449 identification tag (called "final character" and denoted as <F>
450 here after) which is unique in each class. <F> of each character
451 set is decided by ECMA(*) when it is registered in ISO. Code range
452 of <F> is 0x30..0x7F (0x30..0x3F are for private use only).
453
454 Note (*): ECMA = European Computer Manufacturers Association
455
456 Here are examples of graphic character set [NAME(<F>)]:
457 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
458 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
459 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
460 o DIMENSION2_CHARS96 -- none for the moment
461
462 A code area (1byte=8bits) is divided into 4 areas, C0, GL, C1, and GR.
463 C0 [0x00..0x1F] -- control character plane 0
464 GL [0x20..0x7F] -- graphic character plane 0
465 C1 [0x80..0x9F] -- control character plane 1
466 GR [0xA0..0xFF] -- graphic character plane 1
467
468 A control character set is directly designated and invoked to C0 or
469 C1 by an escape sequence. The most common case is that ISO646's
470 control character set is designated/invoked to C0 and ISO6429's
471 control character set is designated/invoked to C1, and usually
472 these designations/invocations are omitted in a coded text. With
473 7-bit environment, only C0 can be used, and a control character for
474 C1 is encoded by an appropriate escape sequence to fit in the
475 environment. All control characters for C1 are defined the
476 corresponding escape sequences.
477
478 A graphic character set is at first designated to one of four
479 graphic registers (G0 through G3), then these graphic registers are
480 invoked to GL or GR. These designations and invocations can be
481 done independently. The most common case is that G0 is invoked to
482 GL, G1 is invoked to GR, and ASCII is designated to G0, and usually
483 these invocations and designations are omitted in a coded text.
484 With 7-bit environment, only GL can be used.
485
486 When a graphic character set of CHARS94 is invoked to GL, code 0x20
487 and 0x7F of GL area work as control characters SPACE and DEL
488 respectively, and code 0xA0 and 0xFF of GR area should not be used.
489
490 There are two ways of invocation: locking-shift and single-shift.
491 With locking-shift, the invocation lasts until the next different
492 invocation, whereas with single-shift, the invocation works only
493 for the following character and doesn't affect locking-shift.
494 Invocations are done by the following control characters or escape
495 sequences.
496
497 ----------------------------------------------------------------------
498 function control char escape sequence description
499 ----------------------------------------------------------------------
500 SI (shift-in) 0x0F none invoke G0 to GL
501 SI (shift-out) 0x0E none invoke G1 to GL
502 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
503 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
504 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 into GL
505 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 into GL
506 ----------------------------------------------------------------------
507 The first four are for locking-shift. Control characters for these
508 functions are defined by macros ISO_CODE_XXX in `coding.h'.
509
510 Designations are done by the following escape sequences.
511 ----------------------------------------------------------------------
512 escape sequence description
513 ----------------------------------------------------------------------
514 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
515 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
516 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
517 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
518 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
519 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
520 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
521 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
522 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
523 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
524 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
525 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
526 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
527 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
528 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
529 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
530 ----------------------------------------------------------------------
531
532 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
533 of dimension 1, chars 94, and final character <F>, and etc.
534
535 Note (*): Although these designations are not allowed in ISO2022,
536 Emacs accepts them on decoding, and produces them on encoding
537 CHARS96 character set in a coding system which is characterized as
538 7-bit environment, non-locking-shift, and non-single-shift.
539
540 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
541 '(' can be omitted. We call this as "short-form" here after.
542
543 Now you may notice that there are a lot of ways for encoding the
544 same multilingual text in ISO2022. Actually, there exist many
545 coding systems such as Compound Text (used in X's inter client
546 communication, ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
547 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
548 localized platforms), and all of these are variants of ISO2022.
549
550 In addition to the above, Emacs handles two more kinds of escape
551 sequences: ISO6429's direction specification and Emacs' private
552 sequence for specifying character composition.
553
554 ISO6429's direction specification takes the following format:
555 o CSI ']' -- end of the current direction
556 o CSI '0' ']' -- end of the current direction
557 o CSI '1' ']' -- start of left-to-right text
558 o CSI '2' ']' -- start of right-to-left text
559 The control character CSI (0x9B: control sequence introducer) is
560 abbreviated to the escape sequence ESC '[' in 7-bit environment.
561
562 Character composition specification takes the following format:
563 o ESC '0' -- start character composition
564 o ESC '1' -- end character composition
565 Since these are not standard escape sequences of any ISO, the use
566 of them for these meaning is restricted to Emacs only. */
567
568enum iso_code_class_type iso_code_class[256];
569
570/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
571 Check if a text is encoded in ISO2022. If it is, returns an
572 integer in which appropriate flag bits any of:
573 CODING_CATEGORY_MASK_ISO_7
574 CODING_CATEGORY_MASK_ISO_8_1
575 CODING_CATEGORY_MASK_ISO_8_2
576 CODING_CATEGORY_MASK_ISO_ELSE
577 are set. If a code which should never appear in ISO2022 is found,
578 returns 0. */
579
580int
581detect_coding_iso2022 (src, src_end)
582 unsigned char *src, *src_end;
583{
e0e989f6 584 unsigned char c, g1 = 0;
4ed46869
KH
585 int mask = (CODING_CATEGORY_MASK_ISO_7
586 | CODING_CATEGORY_MASK_ISO_8_1
587 | CODING_CATEGORY_MASK_ISO_8_2);
e0e989f6
KH
588 /* We may look ahead at most 4 bytes. */
589 unsigned char *adjusted_src_end = src_end - 4;
4ed46869
KH
590 int i;
591
e0e989f6 592 while (src < src_end)
4ed46869
KH
593 {
594 c = *src++;
595 switch (c)
596 {
597 case ISO_CODE_ESC:
e0e989f6 598 if (src >= src_end)
4ed46869
KH
599 break;
600 c = *src++;
e0e989f6
KH
601 if (src + 2 >= src_end
602 && ((c >= '(' && c <= '/')
603 || c == '$' && ((*src >= '(' && *src <= '/')
604 || (*src >= '@' && *src <= 'B'))))
4ed46869 605 {
e0e989f6
KH
606 /* Valid designation sequence. */
607 if (c == ')' || (c == '$' && *src == ')'))
608 g1 = 1;
609 src++;
610 break;
4ed46869 611 }
4ed46869
KH
612 else if (c == 'N' || c == 'O' || c == 'n' || c == 'o')
613 return CODING_CATEGORY_MASK_ISO_ELSE;
614 break;
615
4ed46869 616 case ISO_CODE_SO:
e0e989f6
KH
617 if (g1)
618 return CODING_CATEGORY_MASK_ISO_ELSE;
619 break;
620
4ed46869
KH
621 case ISO_CODE_CSI:
622 case ISO_CODE_SS2:
623 case ISO_CODE_SS3:
624 mask &= ~CODING_CATEGORY_MASK_ISO_7;
625 break;
626
627 default:
628 if (c < 0x80)
629 break;
630 else if (c < 0xA0)
631 return 0;
632 else
633 {
634 int count = 1;
635
636 mask &= ~CODING_CATEGORY_MASK_ISO_7;
e0e989f6 637 while (src < src_end && *src >= 0xA0)
4ed46869 638 count++, src++;
e0e989f6 639 if (count & 1 && src < src_end)
4ed46869
KH
640 mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
641 }
642 break;
643 }
644 }
645
646 return mask;
647}
648
649/* Decode a character of which charset is CHARSET and the 1st position
650 code is C1. If dimension of CHARSET 2, the 2nd position code is
651 fetched from SRC and set to C2. If CHARSET is negative, it means
652 that we are decoding ill formed text, and what we can do is just to
653 read C1 as is. */
654
655#define DECODE_ISO_CHARACTER(charset, c1) \
656 do { \
657 if ((charset) >= 0 && CHARSET_DIMENSION (charset) == 2) \
658 ONE_MORE_BYTE (c2); \
659 if (COMPOSING_HEAD_P (coding->composing)) \
660 { \
661 *dst++ = LEADING_CODE_COMPOSITION; \
662 if (COMPOSING_WITH_RULE_P (coding->composing)) \
663 /* To tell composition rules are embeded. */ \
664 *dst++ = 0xFF; \
665 coding->composing += 2; \
666 } \
667 if ((charset) < 0) \
668 *dst++ = c1; \
669 else if ((charset) == CHARSET_ASCII) \
670 DECODE_CHARACTER_ASCII (c1); \
671 else if (CHARSET_DIMENSION (charset) == 1) \
672 DECODE_CHARACTER_DIMENSION1 (charset, c1); \
673 else \
674 DECODE_CHARACTER_DIMENSION2 (charset, c1, c2); \
675 if (COMPOSING_WITH_RULE_P (coding->composing)) \
676 /* To tell a composition rule follows. */ \
677 coding->composing = COMPOSING_WITH_RULE_RULE; \
678 } while (0)
679
680/* Set designation state into CODING. */
681#define DECODE_DESIGNATION(reg, dimension, chars, final_char) \
682 do { \
683 int charset = ISO_CHARSET_TABLE (dimension, chars, final_char); \
684 Lisp_Object temp \
685 = Fassq (CHARSET_SYMBOL (charset), Valternate_charset_table); \
686 if (! NILP (temp)) \
687 charset = get_charset_id (XCONS (temp)->cdr); \
688 if (charset >= 0) \
689 { \
690 if (coding->direction == 1 \
691 && CHARSET_REVERSE_CHARSET (charset) >= 0) \
692 charset = CHARSET_REVERSE_CHARSET (charset); \
693 CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset; \
694 } \
695 } while (0)
696
697/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
698
699int
700decode_coding_iso2022 (coding, source, destination,
701 src_bytes, dst_bytes, consumed)
702 struct coding_system *coding;
703 unsigned char *source, *destination;
704 int src_bytes, dst_bytes;
705 int *consumed;
706{
707 unsigned char *src = source;
708 unsigned char *src_end = source + src_bytes;
709 unsigned char *dst = destination;
710 unsigned char *dst_end = destination + dst_bytes;
711 /* Since the maximum bytes produced by each loop is 7, we subtract 6
712 from DST_END to assure that overflow checking is necessary only
713 at the head of loop. */
714 unsigned char *adjusted_dst_end = dst_end - 6;
715 int charset;
716 /* Charsets invoked to graphic plane 0 and 1 respectively. */
717 int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
718 int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
719
720 while (src < src_end && dst < adjusted_dst_end)
721 {
722 /* SRC_BASE remembers the start position in source in each loop.
723 The loop will be exited when there's not enough source text
724 to analyze long escape sequence or 2-byte code (within macros
725 ONE_MORE_BYTE or TWO_MORE_BYTES). In that case, SRC is reset
726 to SRC_BASE before exiting. */
727 unsigned char *src_base = src;
728 unsigned char c1 = *src++, c2, cmprule;
729
730 switch (iso_code_class [c1])
731 {
732 case ISO_0x20_or_0x7F:
733 if (!coding->composing
734 && (charset0 < 0 || CHARSET_CHARS (charset0) == 94))
735 {
736 /* This is SPACE or DEL. */
737 *dst++ = c1;
738 break;
739 }
740 /* This is a graphic character, we fall down ... */
741
742 case ISO_graphic_plane_0:
743 if (coding->composing == COMPOSING_WITH_RULE_RULE)
744 {
745 /* This is a composition rule. */
746 *dst++ = c1 | 0x80;
747 coding->composing = COMPOSING_WITH_RULE_TAIL;
748 }
749 else
750 DECODE_ISO_CHARACTER (charset0, c1);
751 break;
752
753 case ISO_0xA0_or_0xFF:
754 if (charset1 < 0 || CHARSET_CHARS (charset1) == 94)
755 {
756 /* Invalid code. */
757 *dst++ = c1;
758 break;
759 }
760 /* This is a graphic character, we fall down ... */
761
762 case ISO_graphic_plane_1:
763 DECODE_ISO_CHARACTER (charset1, c1);
764 break;
765
766 case ISO_control_code:
767 /* All ISO2022 control characters in this class have the
768 same representation in Emacs internal format. */
769 *dst++ = c1;
770 break;
771
772 case ISO_carriage_return:
773 if (coding->eol_type == CODING_EOL_CR)
774 {
775 *dst++ = '\n';
776 }
777 else if (coding->eol_type == CODING_EOL_CRLF)
778 {
779 ONE_MORE_BYTE (c1);
780 if (c1 == ISO_CODE_LF)
781 *dst++ = '\n';
782 else
783 {
784 src--;
785 *dst++ = c1;
786 }
787 }
788 else
789 {
790 *dst++ = c1;
791 }
792 break;
793
794 case ISO_shift_out:
e0e989f6
KH
795 if (CODING_SPEC_ISO_DESIGNATION (coding, 1) < 0)
796 goto label_invalid_escape_sequence;
4ed46869
KH
797 CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
798 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
799 break;
800
801 case ISO_shift_in:
802 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
803 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
804 break;
805
806 case ISO_single_shift_2_7:
807 case ISO_single_shift_2:
808 /* SS2 is handled as an escape sequence of ESC 'N' */
809 c1 = 'N';
810 goto label_escape_sequence;
811
812 case ISO_single_shift_3:
813 /* SS2 is handled as an escape sequence of ESC 'O' */
814 c1 = 'O';
815 goto label_escape_sequence;
816
817 case ISO_control_sequence_introducer:
818 /* CSI is handled as an escape sequence of ESC '[' ... */
819 c1 = '[';
820 goto label_escape_sequence;
821
822 case ISO_escape:
823 ONE_MORE_BYTE (c1);
824 label_escape_sequence:
825 /* Escape sequences handled by Emacs are invocation,
826 designation, direction specification, and character
827 composition specification. */
828 switch (c1)
829 {
830 case '&': /* revision of following character set */
831 ONE_MORE_BYTE (c1);
832 if (!(c1 >= '@' && c1 <= '~'))
e0e989f6 833 goto label_invalid_escape_sequence;
4ed46869
KH
834 ONE_MORE_BYTE (c1);
835 if (c1 != ISO_CODE_ESC)
e0e989f6 836 goto label_invalid_escape_sequence;
4ed46869
KH
837 ONE_MORE_BYTE (c1);
838 goto label_escape_sequence;
839
840 case '$': /* designation of 2-byte character set */
841 ONE_MORE_BYTE (c1);
842 if (c1 >= '@' && c1 <= 'B')
843 { /* designation of JISX0208.1978, GB2312.1980,
844 or JISX0208.1980 */
845 DECODE_DESIGNATION (0, 2, 94, c1);
846 }
847 else if (c1 >= 0x28 && c1 <= 0x2B)
848 { /* designation of DIMENSION2_CHARS94 character set */
849 ONE_MORE_BYTE (c2);
850 DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
851 }
852 else if (c1 >= 0x2C && c1 <= 0x2F)
853 { /* designation of DIMENSION2_CHARS96 character set */
854 ONE_MORE_BYTE (c2);
855 DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
856 }
857 else
e0e989f6 858 goto label_invalid_escape_sequence;
4ed46869
KH
859 break;
860
861 case 'n': /* invocation of locking-shift-2 */
e0e989f6
KH
862 if (CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
863 goto label_invalid_escape_sequence;
4ed46869 864 CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
e0e989f6 865 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
4ed46869
KH
866 break;
867
868 case 'o': /* invocation of locking-shift-3 */
e0e989f6
KH
869 if (CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
870 goto label_invalid_escape_sequence;
4ed46869 871 CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
e0e989f6 872 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
4ed46869
KH
873 break;
874
875 case 'N': /* invocation of single-shift-2 */
e0e989f6
KH
876 if (CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
877 goto label_invalid_escape_sequence;
4ed46869
KH
878 ONE_MORE_BYTE (c1);
879 charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
880 DECODE_ISO_CHARACTER (charset, c1);
881 break;
882
883 case 'O': /* invocation of single-shift-3 */
e0e989f6
KH
884 if (CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
885 goto label_invalid_escape_sequence;
4ed46869
KH
886 ONE_MORE_BYTE (c1);
887 charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
888 DECODE_ISO_CHARACTER (charset, c1);
889 break;
890
891 case '0': /* start composing without embeded rules */
892 coding->composing = COMPOSING_NO_RULE_HEAD;
893 break;
894
895 case '1': /* end composing */
896 coding->composing = COMPOSING_NO;
897 break;
898
899 case '2': /* start composing with embeded rules */
900 coding->composing = COMPOSING_WITH_RULE_HEAD;
901 break;
902
903 case '[': /* specification of direction */
904 /* For the moment, nested direction is not supported.
905 So, the value of `coding->direction' is 0 or 1: 0
906 means left-to-right, 1 means right-to-left. */
907 ONE_MORE_BYTE (c1);
908 switch (c1)
909 {
910 case ']': /* end of the current direction */
911 coding->direction = 0;
912
913 case '0': /* end of the current direction */
914 case '1': /* start of left-to-right direction */
915 ONE_MORE_BYTE (c1);
916 if (c1 == ']')
917 coding->direction = 0;
918 else
919 goto label_invalid_escape_sequence;
920 break;
921
922 case '2': /* start of right-to-left direction */
923 ONE_MORE_BYTE (c1);
924 if (c1 == ']')
925 coding->direction= 1;
926 else
927 goto label_invalid_escape_sequence;
928 break;
929
930 default:
931 goto label_invalid_escape_sequence;
932 }
933 break;
934
935 default:
936 if (c1 >= 0x28 && c1 <= 0x2B)
937 { /* designation of DIMENSION1_CHARS94 character set */
938 ONE_MORE_BYTE (c2);
939 DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
940 }
941 else if (c1 >= 0x2C && c1 <= 0x2F)
942 { /* designation of DIMENSION1_CHARS96 character set */
943 ONE_MORE_BYTE (c2);
944 DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
945 }
946 else
947 {
948 goto label_invalid_escape_sequence;
949 }
950 }
951 /* We must update these variables now. */
952 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
953 charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
954 break;
955
956 label_invalid_escape_sequence:
957 {
958 int length = src - src_base;
959
960 bcopy (src_base, dst, length);
961 dst += length;
962 }
963 }
964 continue;
965
966 label_end_of_loop:
967 coding->carryover_size = src - src_base;
968 bcopy (src_base, coding->carryover, coding->carryover_size);
969 src = src_base;
970 break;
971 }
972
973 /* If this is the last block of the text to be decoded, we had
974 better just flush out all remaining codes in the text although
975 they are not valid characters. */
976 if (coding->last_block)
977 {
978 bcopy (src, dst, src_end - src);
979 dst += (src_end - src);
980 src = src_end;
981 }
982 *consumed = src - source;
983 return dst - destination;
984}
985
986/* ISO2022 encoding staffs. */
987
988/*
989 It is not enough to say just "ISO2022" on encoding, but we have to
990 specify more details. In Emacs, each coding-system of ISO2022
991 variant has the following specifications:
992 1. Initial designation to G0 thru G3.
993 2. Allows short-form designation?
994 3. ASCII should be designated to G0 before control characters?
995 4. ASCII should be designated to G0 at end of line?
996 5. 7-bit environment or 8-bit environment?
997 6. Use locking-shift?
998 7. Use Single-shift?
999 And the following two are only for Japanese:
1000 8. Use ASCII in place of JIS0201-1976-Roman?
1001 9. Use JISX0208-1983 in place of JISX0208-1978?
1002 These specifications are encoded in `coding->flags' as flag bits
1003 defined by macros CODING_FLAG_ISO_XXX. See `coding.h' for more
1004 detail.
1005*/
1006
1007/* Produce codes (escape sequence) for designating CHARSET to graphic
1008 register REG. If <final-char> of CHARSET is '@', 'A', or 'B' and
1009 the coding system CODING allows, produce designation sequence of
1010 short-form. */
1011
1012#define ENCODE_DESIGNATION(charset, reg, coding) \
1013 do { \
1014 unsigned char final_char = CHARSET_ISO_FINAL_CHAR (charset); \
1015 char *intermediate_char_94 = "()*+"; \
1016 char *intermediate_char_96 = ",-./"; \
1017 Lisp_Object temp \
1018 = Fassq (make_number (charset), Vcharset_revision_alist); \
1019 if (! NILP (temp)) \
1020 { \
1021 *dst++ = ISO_CODE_ESC; \
1022 *dst++ = '&'; \
1023 *dst++ = XINT (XCONS (temp)->cdr) + '@'; \
1024 } \
1025 *dst++ = ISO_CODE_ESC; \
1026 if (CHARSET_DIMENSION (charset) == 1) \
1027 { \
1028 if (CHARSET_CHARS (charset) == 94) \
1029 *dst++ = (unsigned char) (intermediate_char_94[reg]); \
1030 else \
1031 *dst++ = (unsigned char) (intermediate_char_96[reg]); \
1032 } \
1033 else \
1034 { \
1035 *dst++ = '$'; \
1036 if (CHARSET_CHARS (charset) == 94) \
1037 { \
1038 if (! (coding->flags & CODING_FLAG_ISO_SHORT_FORM) \
1039 || reg != 0 \
1040 || final_char < '@' || final_char > 'B') \
1041 *dst++ = (unsigned char) (intermediate_char_94[reg]); \
1042 } \
1043 else \
1044 *dst++ = (unsigned char) (intermediate_char_96[reg]); \
1045 } \
1046 *dst++ = final_char; \
1047 CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset; \
1048 } while (0)
1049
1050/* The following two macros produce codes (control character or escape
1051 sequence) for ISO2022 single-shift functions (single-shift-2 and
1052 single-shift-3). */
1053
1054#define ENCODE_SINGLE_SHIFT_2 \
1055 do { \
1056 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
1057 *dst++ = ISO_CODE_ESC, *dst++ = 'N'; \
1058 else \
1059 *dst++ = ISO_CODE_SS2; \
1060 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1; \
1061 } while (0)
1062
1063#define ENCODE_SINGLE_SHIFT_3 \
1064 do { \
1065 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
1066 *dst++ = ISO_CODE_ESC, *dst++ = 'O'; \
1067 else \
1068 *dst++ = ISO_CODE_SS3; \
1069 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1; \
1070 } while (0)
1071
1072/* The following four macros produce codes (control character or
1073 escape sequence) for ISO2022 locking-shift functions (shift-in,
1074 shift-out, locking-shift-2, and locking-shift-3). */
1075
1076#define ENCODE_SHIFT_IN \
1077 do { \
1078 *dst++ = ISO_CODE_SI; \
1079 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0; \
1080 } while (0)
1081
1082#define ENCODE_SHIFT_OUT \
1083 do { \
1084 *dst++ = ISO_CODE_SO; \
1085 CODING_SPEC_ISO_INVOCATION (coding, 0) = 1; \
1086 } while (0)
1087
1088#define ENCODE_LOCKING_SHIFT_2 \
1089 do { \
1090 *dst++ = ISO_CODE_ESC, *dst++ = 'n'; \
1091 CODING_SPEC_ISO_INVOCATION (coding, 0) = 2; \
1092 } while (0)
1093
1094#define ENCODE_LOCKING_SHIFT_3 \
1095 do { \
1096 *dst++ = ISO_CODE_ESC, *dst++ = 'o'; \
1097 CODING_SPEC_ISO_INVOCATION (coding, 0) = 3; \
1098 } while (0)
1099
1100/* Produce codes for a DIMENSION1 character of which character set is
1101 CHARSET and position-code is C1. Designation and invocation
1102 sequences are also produced in advance if necessary. */
1103
1104
1105#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1) \
1106 do { \
1107 if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding)) \
1108 { \
1109 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
1110 *dst++ = c1 & 0x7F; \
1111 else \
1112 *dst++ = c1 | 0x80; \
1113 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0; \
1114 break; \
1115 } \
1116 else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0)) \
1117 { \
1118 *dst++ = c1 & 0x7F; \
1119 break; \
1120 } \
1121 else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1)) \
1122 { \
1123 *dst++ = c1 | 0x80; \
1124 break; \
1125 } \
1126 else \
1127 /* Since CHARSET is not yet invoked to any graphic planes, we \
1128 must invoke it, or, at first, designate it to some graphic \
1129 register. Then repeat the loop to actually produce the \
1130 character. */ \
1131 dst = encode_invocation_designation (charset, coding, dst); \
1132 } while (1)
1133
1134/* Produce codes for a DIMENSION2 character of which character set is
1135 CHARSET and position-codes are C1 and C2. Designation and
1136 invocation codes are also produced in advance if necessary. */
1137
1138#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2) \
1139 do { \
1140 if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding)) \
1141 { \
1142 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
1143 *dst++ = c1 & 0x7F, *dst++ = c2 & 0x7F; \
1144 else \
1145 *dst++ = c1 | 0x80, *dst++ = c2 | 0x80; \
1146 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0; \
1147 break; \
1148 } \
1149 else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0)) \
1150 { \
1151 *dst++ = c1 & 0x7F, *dst++= c2 & 0x7F; \
1152 break; \
1153 } \
1154 else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1)) \
1155 { \
1156 *dst++ = c1 | 0x80, *dst++= c2 | 0x80; \
1157 break; \
1158 } \
1159 else \
1160 /* Since CHARSET is not yet invoked to any graphic planes, we \
1161 must invoke it, or, at first, designate it to some graphic \
1162 register. Then repeat the loop to actually produce the \
1163 character. */ \
1164 dst = encode_invocation_designation (charset, coding, dst); \
1165 } while (1)
1166
1167/* Produce designation and invocation codes at a place pointed by DST
1168 to use CHARSET. The element `spec.iso2022' of *CODING is updated.
1169 Return new DST. */
1170
1171unsigned char *
1172encode_invocation_designation (charset, coding, dst)
1173 int charset;
1174 struct coding_system *coding;
1175 unsigned char *dst;
1176{
1177 int reg; /* graphic register number */
1178
1179 /* At first, check designations. */
1180 for (reg = 0; reg < 4; reg++)
1181 if (charset == CODING_SPEC_ISO_DESIGNATION (coding, reg))
1182 break;
1183
1184 if (reg >= 4)
1185 {
1186 /* CHARSET is not yet designated to any graphic registers. */
1187 /* At first check the requested designation. */
1188 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1189 if (reg < 0)
1190 /* Since CHARSET requests no special designation, designate to
1191 graphic register 0. */
1192 reg = 0;
1193
1194 ENCODE_DESIGNATION (charset, reg, coding);
1195 }
1196
1197 if (CODING_SPEC_ISO_INVOCATION (coding, 0) != reg
1198 && CODING_SPEC_ISO_INVOCATION (coding, 1) != reg)
1199 {
1200 /* Since the graphic register REG is not invoked to any graphic
1201 planes, invoke it to graphic plane 0. */
1202 switch (reg)
1203 {
1204 case 0: /* graphic register 0 */
1205 ENCODE_SHIFT_IN;
1206 break;
1207
1208 case 1: /* graphic register 1 */
1209 ENCODE_SHIFT_OUT;
1210 break;
1211
1212 case 2: /* graphic register 2 */
1213 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1214 ENCODE_SINGLE_SHIFT_2;
1215 else
1216 ENCODE_LOCKING_SHIFT_2;
1217 break;
1218
1219 case 3: /* graphic register 3 */
1220 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1221 ENCODE_SINGLE_SHIFT_3;
1222 else
1223 ENCODE_LOCKING_SHIFT_3;
1224 break;
1225 }
1226 }
1227 return dst;
1228}
1229
1230/* The following two macros produce codes for indicating composition. */
1231#define ENCODE_COMPOSITION_NO_RULE_START *dst++ = ISO_CODE_ESC, *dst++ = '0'
1232#define ENCODE_COMPOSITION_WITH_RULE_START *dst++ = ISO_CODE_ESC, *dst++ = '2'
1233#define ENCODE_COMPOSITION_END *dst++ = ISO_CODE_ESC, *dst++ = '1'
1234
1235/* The following three macros produce codes for indicating direction
1236 of text. */
1237#define ENCODE_CONTROL_SEQUENCE_INTRODUCER \
1238 do { \
1239 if (coding->flags == CODING_FLAG_ISO_SEVEN_BITS) \
1240 *dst++ = ISO_CODE_ESC, *dst++ = '['; \
1241 else \
1242 *dst++ = ISO_CODE_CSI; \
1243 } while (0)
1244
1245#define ENCODE_DIRECTION_R2L \
1246 ENCODE_CONTROL_SEQUENCE_INTRODUCER, *dst++ = '2', *dst++ = ']'
1247
1248#define ENCODE_DIRECTION_L2R \
1249 ENCODE_CONTROL_SEQUENCE_INTRODUCER, *dst++ = '0', *dst++ = ']'
1250
1251/* Produce codes for designation and invocation to reset the graphic
1252 planes and registers to initial state. */
e0e989f6
KH
1253#define ENCODE_RESET_PLANE_AND_REGISTER \
1254 do { \
1255 int reg; \
1256 if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0) \
1257 ENCODE_SHIFT_IN; \
1258 for (reg = 0; reg < 4; reg++) \
1259 if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg) >= 0 \
1260 && (CODING_SPEC_ISO_DESIGNATION (coding, reg) \
1261 != CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg))) \
1262 ENCODE_DESIGNATION \
1263 (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg), reg, coding); \
4ed46869
KH
1264 } while (0)
1265
e0e989f6
KH
1266int
1267encode_designation_at_bol (coding, src, src_end, dstp)
1268 struct coding_system *coding;
1269 unsigned char *src, *src_end, **dstp;
1270{
1271 int charset, reg, r[4];
1272 unsigned char *dst = *dstp, c;
1273 for (reg = 0; reg < 4; reg++) r[reg] = -1;
1274 while (src < src_end && (c = *src++) != '\n')
1275 {
1276 switch (emacs_code_class[c])
1277 {
1278 case EMACS_ascii_code:
1279 charset = CHARSET_ASCII;
1280 break;
1281 case EMACS_leading_code_2:
1282 if (++src >= src_end) continue;
1283 charset = c;
1284 break;
1285 case EMACS_leading_code_3:
1286 if ((src += 2) >= src_end) continue;
1287 charset = (c < LEADING_CODE_PRIVATE_11 ? c : *(src - 2));
1288 break;
1289 case EMACS_leading_code_4:
1290 if ((src += 3) >= src_end) continue;
1291 charset = *(src - 3);
1292 break;
1293 default:
1294 continue;
1295 }
1296 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1297 if (r[reg] < 0
1298 && CODING_SPEC_ISO_DESIGNATION (coding, reg) != charset)
1299 r[reg] = charset;
1300 }
1301 if (c != '\n' && !coding->last_block)
1302 return -1;
1303 for (reg = 0; reg < 4; reg++)
1304 if (r[reg] >= 0)
1305 ENCODE_DESIGNATION (r[reg], reg, coding);
1306 *dstp = dst;
1307 return 0;
1308}
1309
4ed46869
KH
1310/* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
1311
1312int
1313encode_coding_iso2022 (coding, source, destination,
1314 src_bytes, dst_bytes, consumed)
1315 struct coding_system *coding;
1316 unsigned char *source, *destination;
1317 int src_bytes, dst_bytes;
1318 int *consumed;
1319{
1320 unsigned char *src = source;
1321 unsigned char *src_end = source + src_bytes;
1322 unsigned char *dst = destination;
1323 unsigned char *dst_end = destination + dst_bytes;
e0e989f6 1324 /* Since the maximum bytes produced by each loop is 20, we subtract 19
4ed46869
KH
1325 from DST_END to assure overflow checking is necessary only at the
1326 head of loop. */
e0e989f6 1327 unsigned char *adjusted_dst_end = dst_end - 19;
4ed46869
KH
1328
1329 while (src < src_end && dst < adjusted_dst_end)
1330 {
1331 /* SRC_BASE remembers the start position in source in each loop.
1332 The loop will be exited when there's not enough source text
1333 to analyze multi-byte codes (within macros ONE_MORE_BYTE,
1334 TWO_MORE_BYTES, and THREE_MORE_BYTES). In that case, SRC is
1335 reset to SRC_BASE before exiting. */
1336 unsigned char *src_base = src;
e0e989f6 1337 unsigned char c1, c2, c3, c4;
4ed46869
KH
1338 int charset;
1339
e0e989f6
KH
1340 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL
1341 && CODING_SPEC_ISO_BOL (coding))
1342 {
1343 /* We have to produce destination sequences now. */
1344 if (encode_designation_at_bol (coding, src, src_end, &dst) < 0)
1345 /* We can't find end of line in the current block. Let's
1346 repeat encoding starting from the current position
1347 pointed by SRC. */
1348 break;
1349 CODING_SPEC_ISO_BOL (coding) = 0;
1350 }
1351
1352 c1 = *src++;
4ed46869
KH
1353 /* If we are seeing a component of a composite character, we are
1354 seeing a leading-code specially encoded for composition, or a
1355 composition rule if composing with rule. We must set C1
1356 to a normal leading-code or an ASCII code. If we are not at
1357 a composed character, we must reset the composition state. */
1358 if (COMPOSING_P (coding->composing))
1359 {
1360 if (c1 < 0xA0)
1361 {
1362 /* We are not in a composite character any longer. */
1363 coding->composing = COMPOSING_NO;
1364 ENCODE_COMPOSITION_END;
1365 }
1366 else
1367 {
1368 if (coding->composing == COMPOSING_WITH_RULE_RULE)
1369 {
1370 *dst++ = c1 & 0x7F;
1371 coding->composing = COMPOSING_WITH_RULE_HEAD;
1372 continue;
1373 }
1374 else if (coding->composing == COMPOSING_WITH_RULE_HEAD)
1375 coding->composing = COMPOSING_WITH_RULE_RULE;
1376 if (c1 == 0xA0)
1377 {
1378 /* This is an ASCII component. */
1379 ONE_MORE_BYTE (c1);
1380 c1 &= 0x7F;
1381 }
1382 else
1383 /* This is a leading-code of non ASCII component. */
1384 c1 -= 0x20;
1385 }
1386 }
1387
1388 /* Now encode one character. C1 is a control character, an
1389 ASCII character, or a leading-code of multi-byte character. */
1390 switch (emacs_code_class[c1])
1391 {
1392 case EMACS_ascii_code:
1393 ENCODE_ISO_CHARACTER_DIMENSION1 (CHARSET_ASCII, c1);
1394 break;
1395
1396 case EMACS_control_code:
1397 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
e0e989f6 1398 ENCODE_RESET_PLANE_AND_REGISTER;
4ed46869
KH
1399 *dst++ = c1;
1400 break;
1401
1402 case EMACS_carriage_return_code:
1403 if (!coding->selective)
1404 {
1405 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
e0e989f6 1406 ENCODE_RESET_PLANE_AND_REGISTER;
4ed46869
KH
1407 *dst++ = c1;
1408 break;
1409 }
1410 /* fall down to treat '\r' as '\n' ... */
1411
1412 case EMACS_linefeed_code:
1413 if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
e0e989f6
KH
1414 ENCODE_RESET_PLANE_AND_REGISTER;
1415 if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL)
1416 bcopy (coding->spec.iso2022.initial_designation,
1417 coding->spec.iso2022.current_designation,
1418 sizeof coding->spec.iso2022.initial_designation);
4ed46869
KH
1419 if (coding->eol_type == CODING_EOL_LF
1420 || coding->eol_type == CODING_EOL_AUTOMATIC)
1421 *dst++ = ISO_CODE_LF;
1422 else if (coding->eol_type == CODING_EOL_CRLF)
1423 *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
1424 else
1425 *dst++ = ISO_CODE_CR;
e0e989f6 1426 CODING_SPEC_ISO_BOL (coding) = 1;
4ed46869
KH
1427 break;
1428
1429 case EMACS_leading_code_2:
1430 ONE_MORE_BYTE (c2);
1431 ENCODE_ISO_CHARACTER_DIMENSION1 (c1, c2);
1432 break;
1433
1434 case EMACS_leading_code_3:
1435 TWO_MORE_BYTES (c2, c3);
1436 if (c1 < LEADING_CODE_PRIVATE_11)
1437 ENCODE_ISO_CHARACTER_DIMENSION2 (c1, c2, c3);
1438 else
1439 ENCODE_ISO_CHARACTER_DIMENSION1 (c2, c3);
1440 break;
1441
1442 case EMACS_leading_code_4:
1443 THREE_MORE_BYTES (c2, c3, c4);
1444 ENCODE_ISO_CHARACTER_DIMENSION2 (c2, c3, c4);
1445 break;
1446
1447 case EMACS_leading_code_composition:
1448 ONE_MORE_BYTE (c1);
1449 if (c1 == 0xFF)
1450 {
1451 coding->composing = COMPOSING_WITH_RULE_HEAD;
1452 ENCODE_COMPOSITION_WITH_RULE_START;
1453 }
1454 else
1455 {
1456 /* Rewind one byte because it is a character code of
1457 composition elements. */
1458 src--;
1459 coding->composing = COMPOSING_NO_RULE_HEAD;
1460 ENCODE_COMPOSITION_NO_RULE_START;
1461 }
1462 break;
1463
1464 case EMACS_invalid_code:
1465 *dst++ = c1;
1466 break;
1467 }
1468 continue;
1469 label_end_of_loop:
1470 coding->carryover_size = src - src_base;
1471 bcopy (src_base, coding->carryover, coding->carryover_size);
1472 src = src_base;
1473 break;
1474 }
1475
1476 /* If this is the last block of the text to be encoded, we must
1477 reset the state of graphic planes and registers to initial one.
1478 In addition, we had better just flush out all remaining codes in
1479 the text although they are not valid characters. */
1480 if (coding->last_block)
1481 {
e0e989f6 1482 ENCODE_RESET_PLANE_AND_REGISTER;
4ed46869
KH
1483 bcopy(src, dst, src_end - src);
1484 dst += (src_end - src);
1485 src = src_end;
1486 }
1487 *consumed = src - source;
1488 return dst - destination;
1489}
1490
1491\f
1492/*** 4. SJIS and BIG5 handlers ***/
1493
1494/* Although SJIS and BIG5 are not ISO's coding system, They are used
1495 quite widely. So, for the moment, Emacs supports them in the bare
1496 C code. But, in the future, they may be supported only by CCL. */
1497
1498/* SJIS is a coding system encoding three character sets: ASCII, right
1499 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
1500 as is. A character of charset katakana-jisx0201 is encoded by
1501 "position-code + 0x80". A character of charset japanese-jisx0208
1502 is encoded in 2-byte but two position-codes are divided and shifted
1503 so that it fit in the range below.
1504
1505 --- CODE RANGE of SJIS ---
1506 (character set) (range)
1507 ASCII 0x00 .. 0x7F
1508 KATAKANA-JISX0201 0xA0 .. 0xDF
1509 JISX0208 (1st byte) 0x80 .. 0x9F and 0xE0 .. 0xFF
1510 (2nd byte) 0x40 .. 0xFF
1511 -------------------------------
1512
1513*/
1514
1515/* BIG5 is a coding system encoding two character sets: ASCII and
1516 Big5. An ASCII character is encoded as is. Big5 is a two-byte
1517 character set and is encoded in two-byte.
1518
1519 --- CODE RANGE of BIG5 ---
1520 (character set) (range)
1521 ASCII 0x00 .. 0x7F
1522 Big5 (1st byte) 0xA1 .. 0xFE
1523 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
1524 --------------------------
1525
1526 Since the number of characters in Big5 is larger than maximum
1527 characters in Emacs' charset (96x96), it can't be handled as one
1528 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
1529 and `charset-big5-2'. Both are DIMENSION2 and CHARS94. The former
1530 contains frequently used characters and the latter contains less
1531 frequently used characters. */
1532
1533/* Macros to decode or encode a character of Big5 in BIG5. B1 and B2
1534 are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
1535 C1 and C2 are the 1st and 2nd position-codes of of Emacs' internal
1536 format. CHARSET is `charset_big5_1' or `charset_big5_2'. */
1537
1538/* Number of Big5 characters which have the same code in 1st byte. */
1539#define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
1540
1541#define DECODE_BIG5(b1, b2, charset, c1, c2) \
1542 do { \
1543 unsigned int temp \
1544 = (b1 - 0xA1) * BIG5_SAME_ROW + b2 - (b2 < 0x7F ? 0x40 : 0x62); \
1545 if (b1 < 0xC9) \
1546 charset = charset_big5_1; \
1547 else \
1548 { \
1549 charset = charset_big5_2; \
1550 temp -= (0xC9 - 0xA1) * BIG5_SAME_ROW; \
1551 } \
1552 c1 = temp / (0xFF - 0xA1) + 0x21; \
1553 c2 = temp % (0xFF - 0xA1) + 0x21; \
1554 } while (0)
1555
1556#define ENCODE_BIG5(charset, c1, c2, b1, b2) \
1557 do { \
1558 unsigned int temp = (c1 - 0x21) * (0xFF - 0xA1) + (c2 - 0x21); \
1559 if (charset == charset_big5_2) \
1560 temp += BIG5_SAME_ROW * (0xC9 - 0xA1); \
1561 b1 = temp / BIG5_SAME_ROW + 0xA1; \
1562 b2 = temp % BIG5_SAME_ROW; \
1563 b2 += b2 < 0x3F ? 0x40 : 0x62; \
1564 } while (0)
1565
1566/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1567 Check if a text is encoded in SJIS. If it is, return
1568 CODING_CATEGORY_MASK_SJIS, else return 0. */
1569
1570int
1571detect_coding_sjis (src, src_end)
1572 unsigned char *src, *src_end;
1573{
1574 unsigned char c;
1575
1576 while (src < src_end)
1577 {
1578 c = *src++;
1579 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
1580 return 0;
1581 if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
1582 {
1583 if (src < src_end && *src++ < 0x40)
1584 return 0;
1585 }
1586 }
1587 return CODING_CATEGORY_MASK_SJIS;
1588}
1589
1590/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1591 Check if a text is encoded in BIG5. If it is, return
1592 CODING_CATEGORY_MASK_BIG5, else return 0. */
1593
1594int
1595detect_coding_big5 (src, src_end)
1596 unsigned char *src, *src_end;
1597{
1598 unsigned char c;
1599
1600 while (src < src_end)
1601 {
1602 c = *src++;
1603 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
1604 return 0;
1605 if (c >= 0xA1)
1606 {
1607 if (src >= src_end)
1608 break;
1609 c = *src++;
1610 if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
1611 return 0;
1612 }
1613 }
1614 return CODING_CATEGORY_MASK_BIG5;
1615}
1616
1617/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
1618 If SJIS_P is 1, decode SJIS text, else decode BIG5 test. */
1619
1620int
1621decode_coding_sjis_big5 (coding, source, destination,
1622 src_bytes, dst_bytes, consumed, sjis_p)
1623 struct coding_system *coding;
1624 unsigned char *source, *destination;
1625 int src_bytes, dst_bytes;
1626 int *consumed;
1627 int sjis_p;
1628{
1629 unsigned char *src = source;
1630 unsigned char *src_end = source + src_bytes;
1631 unsigned char *dst = destination;
1632 unsigned char *dst_end = destination + dst_bytes;
1633 /* Since the maximum bytes produced by each loop is 4, we subtract 3
1634 from DST_END to assure overflow checking is necessary only at the
1635 head of loop. */
1636 unsigned char *adjusted_dst_end = dst_end - 3;
1637
1638 while (src < src_end && dst < adjusted_dst_end)
1639 {
1640 /* SRC_BASE remembers the start position in source in each loop.
1641 The loop will be exited when there's not enough source text
1642 to analyze two-byte character (within macro ONE_MORE_BYTE).
1643 In that case, SRC is reset to SRC_BASE before exiting. */
1644 unsigned char *src_base = src;
1645 unsigned char c1 = *src++, c2, c3, c4;
1646
1647 if (c1 == '\r')
1648 {
1649 if (coding->eol_type == CODING_EOL_CRLF)
1650 {
1651 ONE_MORE_BYTE (c2);
1652 if (c2 == '\n')
1653 *dst++ = c2;
1654 else
1655 /* To process C2 again, SRC is subtracted by 1. */
1656 *dst++ = c1, src--;
1657 }
1658 else
1659 *dst++ = c1;
1660 }
1661 else if (c1 < 0x80)
1662 *dst++ = c1;
1663 else if (c1 < 0xA0 || c1 >= 0xE0)
1664 {
1665 /* SJIS -> JISX0208, BIG5 -> Big5 (only if 0xE0 <= c1 < 0xFF) */
1666 if (sjis_p)
1667 {
1668 ONE_MORE_BYTE (c2);
1669 DECODE_SJIS (c1, c2, c3, c4);
1670 DECODE_CHARACTER_DIMENSION2 (charset_jisx0208, c3, c4);
1671 }
1672 else if (c1 >= 0xE0 && c1 < 0xFF)
1673 {
1674 int charset;
1675
1676 ONE_MORE_BYTE (c2);
1677 DECODE_BIG5 (c1, c2, charset, c3, c4);
1678 DECODE_CHARACTER_DIMENSION2 (charset, c3, c4);
1679 }
1680 else /* Invalid code */
1681 *dst++ = c1;
1682 }
1683 else
1684 {
1685 /* SJIS -> JISX0201-Kana, BIG5 -> Big5 */
1686 if (sjis_p)
1687 DECODE_CHARACTER_DIMENSION1 (charset_katakana_jisx0201, c1);
1688 else
1689 {
1690 int charset;
1691
1692 ONE_MORE_BYTE (c2);
1693 DECODE_BIG5 (c1, c2, charset, c3, c4);
1694 DECODE_CHARACTER_DIMENSION2 (charset, c3, c4);
1695 }
1696 }
1697 continue;
1698
1699 label_end_of_loop:
1700 coding->carryover_size = src - src_base;
1701 bcopy (src_base, coding->carryover, coding->carryover_size);
1702 src = src_base;
1703 break;
1704 }
1705
1706 *consumed = src - source;
1707 return dst - destination;
1708}
1709
1710/* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
1711 This function can encode `charset_ascii', `charset_katakana_jisx0201',
1712 `charset_jisx0208', `charset_big5_1', and `charset_big5-2'. We are
1713 sure that all these charsets are registered as official charset
1714 (i.e. do not have extended leading-codes). Characters of other
1715 charsets are produced without any encoding. If SJIS_P is 1, encode
1716 SJIS text, else encode BIG5 text. */
1717
1718int
1719encode_coding_sjis_big5 (coding, source, destination,
1720 src_bytes, dst_bytes, consumed, sjis_p)
1721 struct coding_system *coding;
1722 unsigned char *source, *destination;
1723 int src_bytes, dst_bytes;
1724 int *consumed;
1725 int sjis_p;
1726{
1727 unsigned char *src = source;
1728 unsigned char *src_end = source + src_bytes;
1729 unsigned char *dst = destination;
1730 unsigned char *dst_end = destination + dst_bytes;
1731 /* Since the maximum bytes produced by each loop is 2, we subtract 1
1732 from DST_END to assure overflow checking is necessary only at the
1733 head of loop. */
1734 unsigned char *adjusted_dst_end = dst_end - 1;
1735
1736 while (src < src_end && dst < adjusted_dst_end)
1737 {
1738 /* SRC_BASE remembers the start position in source in each loop.
1739 The loop will be exited when there's not enough source text
1740 to analyze multi-byte codes (within macros ONE_MORE_BYTE and
1741 TWO_MORE_BYTES). In that case, SRC is reset to SRC_BASE
1742 before exiting. */
1743 unsigned char *src_base = src;
1744 unsigned char c1 = *src++, c2, c3, c4;
1745
1746 if (coding->composing)
1747 {
1748 if (c1 == 0xA0)
1749 {
1750 ONE_MORE_BYTE (c1);
1751 c1 &= 0x7F;
1752 }
1753 else if (c1 >= 0xA0)
1754 c1 -= 0x20;
1755 else
1756 coding->composing = 0;
1757 }
1758
1759 switch (emacs_code_class[c1])
1760 {
1761 case EMACS_ascii_code:
1762 case EMACS_control_code:
1763 *dst++ = c1;
1764 break;
1765
1766 case EMACS_carriage_return_code:
1767 if (!coding->selective)
1768 {
1769 *dst++ = c1;
1770 break;
1771 }
1772 /* fall down to treat '\r' as '\n' ... */
1773
1774 case EMACS_linefeed_code:
1775 if (coding->eol_type == CODING_EOL_LF
1776 || coding->eol_type == CODING_EOL_AUTOMATIC)
1777 *dst++ = '\n';
1778 else if (coding->eol_type == CODING_EOL_CRLF)
1779 *dst++ = '\r', *dst++ = '\n';
1780 else
1781 *dst++ = '\r';
1782 break;
1783
1784 case EMACS_leading_code_2:
1785 ONE_MORE_BYTE (c2);
1786 if (sjis_p && c1 == charset_katakana_jisx0201)
1787 *dst++ = c2;
1788 else
1789 *dst++ = c1, *dst++ = c2;
1790 break;
1791
1792 case EMACS_leading_code_3:
1793 TWO_MORE_BYTES (c2, c3);
1794 c2 &= 0x7F, c3 &= 0x7F;
1795 if (sjis_p && c1 == charset_jisx0208)
1796 {
1797 unsigned char s1, s2;
1798
1799 ENCODE_SJIS (c2, c3, s1, s2);
1800 *dst++ = s1, *dst++ = s2;
1801 }
1802 else if (!sjis_p && (c1 == charset_big5_1 || c1 == charset_big5_2))
1803 {
1804 unsigned char b1, b2;
1805
1806 ENCODE_BIG5 (c1, c2, c3, b1, b2);
1807 *dst++ = b1, *dst++ = b2;
1808 }
1809 else
1810 *dst++ = c1, *dst++ = c2, *dst++ = c3;
1811 break;
1812
1813 case EMACS_leading_code_4:
1814 THREE_MORE_BYTES (c2, c3, c4);
1815 *dst++ = c1, *dst++ = c2, *dst++ = c3, *dst++ = c4;
1816 break;
1817
1818 case EMACS_leading_code_composition:
1819 coding->composing = 1;
1820 break;
1821
1822 default: /* i.e. case EMACS_invalid_code: */
1823 *dst++ = c1;
1824 }
1825 continue;
1826
1827 label_end_of_loop:
1828 coding->carryover_size = src - src_base;
1829 bcopy (src_base, coding->carryover, coding->carryover_size);
1830 src = src_base;
1831 break;
1832 }
1833
1834 *consumed = src - source;
1835 return dst - destination;
1836}
1837
1838\f
1839/*** 5. End-of-line handlers ***/
1840
1841/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
1842 This function is called only when `coding->eol_type' is
1843 CODING_EOL_CRLF or CODING_EOL_CR. */
1844
1845decode_eol (coding, source, destination, src_bytes, dst_bytes, consumed)
1846 struct coding_system *coding;
1847 unsigned char *source, *destination;
1848 int src_bytes, dst_bytes;
1849 int *consumed;
1850{
1851 unsigned char *src = source;
1852 unsigned char *src_end = source + src_bytes;
1853 unsigned char *dst = destination;
1854 unsigned char *dst_end = destination + dst_bytes;
1855 int produced;
1856
1857 switch (coding->eol_type)
1858 {
1859 case CODING_EOL_CRLF:
1860 {
1861 /* Since the maximum bytes produced by each loop is 2, we
1862 subtract 1 from DST_END to assure overflow checking is
1863 necessary only at the head of loop. */
1864 unsigned char *adjusted_dst_end = dst_end - 1;
1865
1866 while (src < src_end && dst < adjusted_dst_end)
1867 {
1868 unsigned char *src_base = src;
1869 unsigned char c = *src++;
1870 if (c == '\r')
1871 {
1872 ONE_MORE_BYTE (c);
1873 if (c != '\n')
1874 *dst++ = '\r';
bfd99048 1875 *dst++ = c;
4ed46869
KH
1876 }
1877 else
1878 *dst++ = c;
1879 continue;
1880
1881 label_end_of_loop:
1882 coding->carryover_size = src - src_base;
1883 bcopy (src_base, coding->carryover, coding->carryover_size);
1884 src = src_base;
1885 break;
1886 }
1887 *consumed = src - source;
1888 produced = dst - destination;
1889 break;
1890 }
1891
1892 case CODING_EOL_CR:
1893 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
1894 bcopy (source, destination, produced);
1895 dst_end = destination + produced;
1896 while (dst < dst_end)
1897 if (*dst++ == '\r') dst[-1] = '\n';
1898 *consumed = produced;
1899 break;
1900
1901 default: /* i.e. case: CODING_EOL_LF */
1902 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
1903 bcopy (source, destination, produced);
1904 *consumed = produced;
1905 break;
1906 }
1907
1908 return produced;
1909}
1910
1911/* See "GENERAL NOTES about `encode_coding_XXX ()' functions". Encode
1912 format of end-of-line according to `coding->eol_type'. If
1913 `coding->selective' is 1, code '\r' in source text also means
1914 end-of-line. */
1915
1916encode_eol (coding, source, destination, src_bytes, dst_bytes, consumed)
1917 struct coding_system *coding;
1918 unsigned char *source, *destination;
1919 int src_bytes, dst_bytes;
1920 int *consumed;
1921{
1922 unsigned char *src = source;
1923 unsigned char *dst = destination;
1924 int produced;
1925
1926 if (src_bytes <= 0)
1927 return 0;
1928
1929 switch (coding->eol_type)
1930 {
1931 case CODING_EOL_LF:
1932 case CODING_EOL_AUTOMATIC:
1933 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
1934 bcopy (source, destination, produced);
1935 if (coding->selective)
1936 {
1937 int i = produced;
1938 while (i--)
1939 if (*dst++ == '\r') dst[-1] = '\n';
1940 }
1941 *consumed = produced;
1942
1943 case CODING_EOL_CRLF:
1944 {
1945 unsigned char c;
1946 unsigned char *src_end = source + src_bytes;
1947 unsigned char *dst_end = destination + dst_bytes;
1948 /* Since the maximum bytes produced by each loop is 2, we
1949 subtract 1 from DST_END to assure overflow checking is
1950 necessary only at the head of loop. */
1951 unsigned char *adjusted_dst_end = dst_end - 1;
1952
1953 while (src < src_end && dst < adjusted_dst_end)
1954 {
1955 c = *src++;
1956 if (c == '\n' || (c == '\r' && coding->selective))
1957 *dst++ = '\r', *dst++ = '\n';
1958 else
1959 *dst++ = c;
1960 }
1961 produced = dst - destination;
1962 *consumed = src - source;
1963 break;
1964 }
1965
1966 default: /* i.e. case CODING_EOL_CR: */
1967 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
1968 bcopy (source, destination, produced);
1969 {
1970 int i = produced;
1971 while (i--)
1972 if (*dst++ == '\n') dst[-1] = '\r';
1973 }
1974 *consumed = produced;
1975 }
1976
1977 return produced;
1978}
1979
1980\f
1981/*** 6. C library functions ***/
1982
1983/* In Emacs Lisp, coding system is represented by a Lisp symbol which
1984 has a property `coding-system'. The value of this property is a
1985 vector of length 5 (called as coding-vector). Among elements of
1986 this vector, the first (element[0]) and the fifth (element[4])
1987 carry important information for decoding/encoding. Before
1988 decoding/encoding, this information should be set in fields of a
1989 structure of type `coding_system'.
1990
1991 A value of property `coding-system' can be a symbol of another
1992 subsidiary coding-system. In that case, Emacs gets coding-vector
1993 from that symbol.
1994
1995 `element[0]' contains information to be set in `coding->type'. The
1996 value and its meaning is as follows:
1997
1998 0 -- coding_system_internal
1999 1 -- coding_system_sjis
2000 2 -- coding_system_iso2022
2001 3 -- coding_system_big5
2002 4 -- coding_system_ccl
2003 nil -- coding_system_no_conversion
2004 t -- coding_system_automatic
2005
2006 `element[4]' contains information to be set in `coding->flags' and
2007 `coding->spec'. The meaning varies by `coding->type'.
2008
2009 If `coding->type' is `coding_type_iso2022', element[4] is a vector
2010 of length 32 (of which the first 13 sub-elements are used now).
2011 Meanings of these sub-elements are:
2012
2013 sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
2014 If the value is an integer of valid charset, the charset is
2015 assumed to be designated to graphic register N initially.
2016
2017 If the value is minus, it is a minus value of charset which
2018 reserves graphic register N, which means that the charset is
2019 not designated initially but should be designated to graphic
2020 register N just before encoding a character in that charset.
2021
2022 If the value is nil, graphic register N is never used on
2023 encoding.
2024
2025 sub-element[N] where N is 4 through 11: to be set in `coding->flags'
2026 Each value takes t or nil. See the section ISO2022 of
2027 `coding.h' for more information.
2028
2029 If `coding->type' is `coding_type_big5', element[4] is t to denote
2030 BIG5-ETen or nil to denote BIG5-HKU.
2031
2032 If `coding->type' takes the other value, element[4] is ignored.
2033
2034 Emacs Lisp's coding system also carries information about format of
2035 end-of-line in a value of property `eol-type'. If the value is
2036 integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
2037 means CODING_EOL_CR. If it is not integer, it should be a vector
2038 of subsidiary coding systems of which property `eol-type' has one
2039 of above values.
2040
2041*/
2042
2043/* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
2044 and set it in CODING. If CODING_SYSTEM_SYMBOL is invalid, CODING
2045 is setup so that no conversion is necessary and return -1, else
2046 return 0. */
2047
2048int
e0e989f6
KH
2049setup_coding_system (coding_system, coding)
2050 Lisp_Object coding_system;
4ed46869
KH
2051 struct coding_system *coding;
2052{
4ed46869
KH
2053 Lisp_Object type, eol_type;
2054
2055 /* At first, set several fields default values. */
2056 coding->require_flushing = 0;
2057 coding->last_block = 0;
2058 coding->selective = 0;
2059 coding->composing = 0;
2060 coding->direction = 0;
2061 coding->carryover_size = 0;
4ed46869
KH
2062 coding->post_read_conversion = coding->pre_write_conversion = Qnil;
2063
e0e989f6
KH
2064 Vlast_coding_system_used = coding->symbol = coding_system;
2065 eol_type = Qnil;
2066 /* Get value of property `coding-system' until we get a vector.
2067 While doing that, also get values of properties
2068 `post-read-conversion', `pre-write-conversion', and `eol-type'. */
2069 while (!NILP (coding_system) && SYMBOLP (coding_system))
4ed46869 2070 {
4ed46869 2071 if (NILP (coding->post_read_conversion))
e0e989f6 2072 coding->post_read_conversion = Fget (coding_system,
4ed46869 2073 Qpost_read_conversion);
e0e989f6
KH
2074 if (NILP (coding->pre_write_conversion))
2075 coding->pre_write_conversion = Fget (coding_system,
4ed46869 2076 Qpre_write_conversion);
e0e989f6
KH
2077 if (NILP (eol_type))
2078 eol_type = Fget (coding_system, Qeol_type);
2079 coding_system = Fget (coding_system, Qcoding_system);
4ed46869 2080 }
e0e989f6
KH
2081 if (!VECTORP (coding_system)
2082 || XVECTOR (coding_system)->size != 5)
4ed46869
KH
2083 goto label_invalid_coding_system;
2084
4ed46869
KH
2085 if (VECTORP (eol_type))
2086 coding->eol_type = CODING_EOL_AUTOMATIC;
2087 else if (XFASTINT (eol_type) == 1)
2088 coding->eol_type = CODING_EOL_CRLF;
2089 else if (XFASTINT (eol_type) == 2)
2090 coding->eol_type = CODING_EOL_CR;
2091 else
2092 coding->eol_type = CODING_EOL_LF;
2093
e0e989f6 2094 type = XVECTOR (coding_system)->contents[0];
4ed46869
KH
2095 switch (XFASTINT (type))
2096 {
2097 case 0:
2098 coding->type = coding_type_internal;
2099 break;
2100
2101 case 1:
2102 coding->type = coding_type_sjis;
2103 break;
2104
2105 case 2:
2106 coding->type = coding_type_iso2022;
2107 {
e0e989f6 2108 Lisp_Object val = XVECTOR (coding_system)->contents[4];
4ed46869
KH
2109 Lisp_Object *flags;
2110 int i, charset, default_reg_bits = 0;
2111
2112 if (!VECTORP (val) || XVECTOR (val)->size != 32)
2113 goto label_invalid_coding_system;
2114
2115 flags = XVECTOR (val)->contents;
2116 coding->flags
2117 = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
2118 | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
2119 | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
2120 | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
2121 | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
2122 | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
2123 | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
2124 | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
e0e989f6
KH
2125 | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION)
2126 | (NILP (flags[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL)
2127 | (NILP (flags[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL));
4ed46869
KH
2128
2129 /* Invoke graphic register 0 to plane 0. */
2130 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
2131 /* Invoke graphic register 1 to plane 1 if we can use full 8-bit. */
2132 CODING_SPEC_ISO_INVOCATION (coding, 1)
2133 = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
2134 /* Not single shifting at first. */
2135 CODING_SPEC_ISO_SINGLE_SHIFTING(coding) = 0;
e0e989f6
KH
2136 /* Beginning of buffer should also be regarded as bol. */
2137 CODING_SPEC_ISO_BOL(coding) = 1;
4ed46869
KH
2138
2139 /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
2140 FLAGS[REG] can be one of below:
2141 integer CHARSET: CHARSET occupies register I,
2142 t: designate nothing to REG initially, but can be used
2143 by any charsets,
2144 list of integer, nil, or t: designate the first
2145 element (if integer) to REG initially, the remaining
2146 elements (if integer) is designated to REG on request,
2147 if an element is t, REG can be used by any charset,
2148 nil: REG is never used. */
2149 for (charset = 0; charset < MAX_CHARSET; charset++)
2150 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = -1;
2151 for (i = 0; i < 4; i++)
2152 {
2153 if (INTEGERP (flags[i])
e0e989f6
KH
2154 && (charset = XINT (flags[i]), CHARSET_VALID_P (charset))
2155 || (charset = get_charset_id (flags[i])) >= 0)
4ed46869
KH
2156 {
2157 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
2158 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
2159 }
2160 else if (EQ (flags[i], Qt))
2161 {
2162 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2163 default_reg_bits |= 1 << i;
2164 }
2165 else if (CONSP (flags[i]))
2166 {
2167 Lisp_Object tail = flags[i];
2168
2169 if (INTEGERP (XCONS (tail)->car)
2170 && (charset = XINT (XCONS (tail)->car),
e0e989f6
KH
2171 CHARSET_VALID_P (charset))
2172 || (charset = get_charset_id (XCONS (tail)->car)) >= 0)
4ed46869
KH
2173 {
2174 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
2175 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
2176 }
2177 else
2178 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2179 tail = XCONS (tail)->cdr;
2180 while (CONSP (tail))
2181 {
2182 if (INTEGERP (XCONS (tail)->car)
2183 && (charset = XINT (XCONS (tail)->car),
e0e989f6
KH
2184 CHARSET_VALID_P (charset))
2185 || (charset = get_charset_id (XCONS (tail)->car)) >= 0)
4ed46869
KH
2186 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2187 = i;
2188 else if (EQ (XCONS (tail)->car, Qt))
2189 default_reg_bits |= 1 << i;
2190 tail = XCONS (tail)->cdr;
2191 }
2192 }
2193 else
2194 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2195
2196 CODING_SPEC_ISO_DESIGNATION (coding, i)
2197 = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
2198 }
2199
2200 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
2201 {
2202 /* REG 1 can be used only by locking shift in 7-bit env. */
2203 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
2204 default_reg_bits &= ~2;
2205 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
2206 /* Without any shifting, only REG 0 and 1 can be used. */
2207 default_reg_bits &= 3;
2208 }
2209
2210 for (charset = 0; charset < MAX_CHARSET; charset++)
2211 if (CHARSET_VALID_P (charset)
2212 && CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) < 0)
2213 {
2214 /* We have not yet decided where to designate CHARSET. */
2215 int reg_bits = default_reg_bits;
2216
2217 if (CHARSET_CHARS (charset) == 96)
2218 /* A charset of CHARS96 can't be designated to REG 0. */
2219 reg_bits &= ~1;
2220
2221 if (reg_bits)
2222 /* There exist some default graphic register. */
2223 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2224 = (reg_bits & 1
2225 ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
2226 else
2227 /* We anyway have to designate CHARSET to somewhere. */
2228 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2229 = (CHARSET_CHARS (charset) == 94
2230 ? 0
2231 : ((coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT
2232 || ! coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
2233 ? 1
2234 : (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT
2235 ? 2 : 0)));
2236 }
2237 }
2238 coding->require_flushing = 1;
2239 break;
2240
2241 case 3:
2242 coding->type = coding_type_big5;
2243 coding->flags
e0e989f6 2244 = (NILP (XVECTOR (coding_system)->contents[4])
4ed46869
KH
2245 ? CODING_FLAG_BIG5_HKU
2246 : CODING_FLAG_BIG5_ETEN);
2247 break;
2248
2249 case 4:
2250 coding->type = coding_type_ccl;
2251 {
e0e989f6 2252 Lisp_Object val = XVECTOR (coding_system)->contents[4];
4ed46869
KH
2253 if (CONSP (val)
2254 && VECTORP (XCONS (val)->car)
2255 && VECTORP (XCONS (val)->cdr))
2256 {
2257 setup_ccl_program (&(coding->spec.ccl.decoder), XCONS (val)->car);
2258 setup_ccl_program (&(coding->spec.ccl.encoder), XCONS (val)->cdr);
2259 }
2260 else
2261 goto label_invalid_coding_system;
2262 }
2263 coding->require_flushing = 1;
2264 break;
2265
2266 default:
2267 if (EQ (type, Qt))
2268 coding->type = coding_type_automatic;
2269 else
2270 coding->type = coding_type_no_conversion;
2271 break;
2272 }
2273 return 0;
2274
2275 label_invalid_coding_system:
2276 coding->type = coding_type_no_conversion;
e0e989f6
KH
2277 coding->symbol = coding->pre_write_conversion = coding->post_read_conversion
2278 = Qnil;
4ed46869
KH
2279 return -1;
2280}
2281
2282/* Emacs has a mechanism to automatically detect a coding system if it
2283 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
2284 it's impossible to distinguish some coding systems accurately
2285 because they use the same range of codes. So, at first, coding
2286 systems are categorized into 7, those are:
2287
2288 o coding-category-internal
2289
2290 The category for a coding system which has the same code range
2291 as Emacs' internal format. Assigned the coding-system (Lisp
e0e989f6 2292 symbol) `internal' by default.
4ed46869
KH
2293
2294 o coding-category-sjis
2295
2296 The category for a coding system which has the same code range
2297 as SJIS. Assigned the coding-system (Lisp
e0e989f6 2298 symbol) `shift-jis' by default.
4ed46869
KH
2299
2300 o coding-category-iso-7
2301
2302 The category for a coding system which has the same code range
2303 as ISO2022 of 7-bit environment. Assigned the coding-system
e0e989f6 2304 (Lisp symbol) `iso-2022-7' by default.
4ed46869
KH
2305
2306 o coding-category-iso-8-1
2307
2308 The category for a coding system which has the same code range
2309 as ISO2022 of 8-bit environment and graphic plane 1 used only
2310 for DIMENSION1 charset. Assigned the coding-system (Lisp
e0e989f6 2311 symbol) `iso-8859-1' by default.
4ed46869
KH
2312
2313 o coding-category-iso-8-2
2314
2315 The category for a coding system which has the same code range
2316 as ISO2022 of 8-bit environment and graphic plane 1 used only
2317 for DIMENSION2 charset. Assigned the coding-system (Lisp
e0e989f6 2318 symbol) `euc-japan' by default.
4ed46869
KH
2319
2320 o coding-category-iso-else
2321
2322 The category for a coding system which has the same code range
2323 as ISO2022 but not belongs to any of the above three
2324 categories. Assigned the coding-system (Lisp symbol)
e0e989f6 2325 `iso-2022-ss2-7' by default.
4ed46869
KH
2326
2327 o coding-category-big5
2328
2329 The category for a coding system which has the same code range
2330 as BIG5. Assigned the coding-system (Lisp symbol)
e0e989f6 2331 `cn-big5' by default.
4ed46869
KH
2332
2333 o coding-category-binary
2334
2335 The category for a coding system not categorized in any of the
2336 above. Assigned the coding-system (Lisp symbol)
e0e989f6 2337 `no-conversion' by default.
4ed46869
KH
2338
2339 Each of them is a Lisp symbol and the value is an actual
2340 `coding-system's (this is also a Lisp symbol) assigned by a user.
2341 What Emacs does actually is to detect a category of coding system.
2342 Then, it uses a `coding-system' assigned to it. If Emacs can't
2343 decide only one possible category, it selects a category of the
2344 highest priority. Priorities of categories are also specified by a
2345 user in a Lisp variable `coding-category-list'.
2346
2347*/
2348
2349/* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
2350 If it detects possible coding systems, return an integer in which
2351 appropriate flag bits are set. Flag bits are defined by macros
2352 CODING_CATEGORY_MASK_XXX in `coding.h'. */
2353
2354int
2355detect_coding_mask (src, src_bytes)
2356 unsigned char *src;
2357 int src_bytes;
2358{
2359 register unsigned char c;
2360 unsigned char *src_end = src + src_bytes;
2361 int mask;
2362
2363 /* At first, skip all ASCII characters and control characters except
2364 for three ISO2022 specific control characters. */
2365 while (src < src_end)
2366 {
2367 c = *src;
2368 if (c >= 0x80
2369 || (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
2370 break;
2371 src++;
2372 }
2373
2374 if (src >= src_end)
2375 /* We found nothing other than ASCII. There's nothing to do. */
2376 return CODING_CATEGORY_MASK_ANY;
2377
2378 /* The text seems to be encoded in some multilingual coding system.
2379 Now, try to find in which coding system the text is encoded. */
2380 if (c < 0x80)
2381 /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
2382 /* C is an ISO2022 specific control code of C0. */
2383 mask = detect_coding_iso2022 (src, src_end);
2384
2385 else if (c == ISO_CODE_SS2 || c == ISO_CODE_SS3 || c == ISO_CODE_CSI)
2386 /* C is an ISO2022 specific control code of C1,
2387 or the first byte of SJIS's 2-byte character code,
2388 or a leading code of Emacs. */
2389 mask = (detect_coding_iso2022 (src, src_end)
2390 | detect_coding_sjis (src, src_end)
2391 | detect_coding_internal (src, src_end));
2392
2393 else if (c < 0xA0)
2394 /* C is the first byte of SJIS character code,
2395 or a leading-code of Emacs. */
2396 mask = (detect_coding_sjis (src, src_end)
2397 | detect_coding_internal (src, src_end));
2398
2399 else
2400 /* C is a character of ISO2022 in graphic plane right,
2401 or a SJIS's 1-byte character code (i.e. JISX0201),
2402 or the first byte of BIG5's 2-byte code. */
2403 mask = (detect_coding_iso2022 (src, src_end)
2404 | detect_coding_sjis (src, src_end)
2405 | detect_coding_big5 (src, src_end));
2406
2407 return mask;
2408}
2409
2410/* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
2411 The information of the detected coding system is set in CODING. */
2412
2413void
2414detect_coding (coding, src, src_bytes)
2415 struct coding_system *coding;
2416 unsigned char *src;
2417 int src_bytes;
2418{
2419 int mask = detect_coding_mask (src, src_bytes);
2420 int idx;
2421
2422 if (mask == CODING_CATEGORY_MASK_ANY)
2423 /* We found nothing other than ASCII. There's nothing to do. */
2424 return;
2425
2426 if (!mask)
2427 /* The source text seems to be encoded in unknown coding system.
2428 Emacs regards the category of such a kind of coding system as
2429 `coding-category-binary'. We assume that a user has assigned
2430 an appropriate coding system for a `coding-category-binary'. */
2431 idx = CODING_CATEGORY_IDX_BINARY;
2432 else
2433 {
2434 /* We found some plausible coding systems. Let's use a coding
2435 system of the highest priority. */
2436 Lisp_Object val = Vcoding_category_list;
2437
2438 if (CONSP (val))
2439 while (!NILP (val))
2440 {
2441 idx = XFASTINT (Fget (XCONS (val)->car, Qcoding_category_index));
2442 if ((idx < CODING_CATEGORY_IDX_MAX) && (mask & (1 << idx)))
2443 break;
2444 val = XCONS (val)->cdr;
2445 }
2446 else
2447 val = Qnil;
2448
2449 if (NILP (val))
2450 {
2451 /* For unknown reason, `Vcoding_category_list' contains none
2452 of found categories. Let's use any of them. */
2453 for (idx = 0; idx < CODING_CATEGORY_IDX_MAX; idx++)
2454 if (mask & (1 << idx))
2455 break;
2456 }
2457 }
2458 setup_coding_system (XSYMBOL (coding_category_table[idx])->value, coding);
2459}
2460
2461/* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
2462 is encoded. Return one of CODING_EOL_LF, CODING_EOL_CRLF,
2463 CODING_EOL_CR, and CODING_EOL_AUTOMATIC. */
2464
2465int
2466detect_eol_type (src, src_bytes)
2467 unsigned char *src;
2468 int src_bytes;
2469{
2470 unsigned char *src_end = src + src_bytes;
2471 unsigned char c;
2472
2473 while (src < src_end)
2474 {
2475 c = *src++;
2476 if (c == '\n')
2477 return CODING_EOL_LF;
2478 else if (c == '\r')
2479 {
2480 if (src < src_end && *src == '\n')
2481 return CODING_EOL_CRLF;
2482 else
2483 return CODING_EOL_CR;
2484 }
2485 }
2486 return CODING_EOL_AUTOMATIC;
2487}
2488
2489/* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
2490 is encoded. If it detects an appropriate format of end-of-line, it
2491 sets the information in *CODING. */
2492
2493void
2494detect_eol (coding, src, src_bytes)
2495 struct coding_system *coding;
2496 unsigned char *src;
2497 int src_bytes;
2498{
2499 Lisp_Object val;
2500 int eol_type = detect_eol_type (src, src_bytes);
2501
2502 if (eol_type == CODING_EOL_AUTOMATIC)
2503 /* We found no end-of-line in the source text. */
2504 return;
2505
2506 val = Fget (coding->symbol, Qeol_type);
2507 if (VECTORP (val) && XVECTOR (val)->size == 3)
2508 setup_coding_system (XVECTOR (val)->contents[eol_type], coding);
2509}
2510
2511/* See "GENERAL NOTES about `decode_coding_XXX ()' functions". Before
2512 decoding, it may detect coding system and format of end-of-line if
2513 those are not yet decided. */
2514
2515int
2516decode_coding (coding, source, destination, src_bytes, dst_bytes, consumed)
2517 struct coding_system *coding;
2518 unsigned char *source, *destination;
2519 int src_bytes, dst_bytes;
2520 int *consumed;
2521{
2522 int produced;
2523
2524 if (src_bytes <= 0)
2525 {
2526 *consumed = 0;
2527 return 0;
2528 }
2529
2530 if (coding->type == coding_type_automatic)
2531 detect_coding (coding, source, src_bytes);
2532
2533 if (coding->eol_type == CODING_EOL_AUTOMATIC)
2534 detect_eol (coding, source, src_bytes);
2535
2536 coding->carryover_size = 0;
2537 switch (coding->type)
2538 {
2539 case coding_type_no_conversion:
2540 label_no_conversion:
2541 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2542 bcopy (source, destination, produced);
2543 *consumed = produced;
2544 break;
2545
2546 case coding_type_internal:
2547 case coding_type_automatic:
2548 if (coding->eol_type == CODING_EOL_LF
2549 || coding->eol_type == CODING_EOL_AUTOMATIC)
2550 goto label_no_conversion;
2551 produced = decode_eol (coding, source, destination,
2552 src_bytes, dst_bytes, consumed);
2553 break;
2554
2555 case coding_type_sjis:
2556 produced = decode_coding_sjis_big5 (coding, source, destination,
2557 src_bytes, dst_bytes, consumed,
2558 1);
2559 break;
2560
2561 case coding_type_iso2022:
2562 produced = decode_coding_iso2022 (coding, source, destination,
2563 src_bytes, dst_bytes, consumed);
2564 break;
2565
2566 case coding_type_big5:
2567 produced = decode_coding_sjis_big5 (coding, source, destination,
2568 src_bytes, dst_bytes, consumed,
2569 0);
2570 break;
2571
2572 case coding_type_ccl:
2573 produced = ccl_driver (&coding->spec.ccl.decoder, source, destination,
2574 src_bytes, dst_bytes, consumed);
2575 break;
2576 }
2577
2578 return produced;
2579}
2580
2581/* See "GENERAL NOTES about `encode_coding_XXX ()' functions". */
2582
2583int
2584encode_coding (coding, source, destination, src_bytes, dst_bytes, consumed)
2585 struct coding_system *coding;
2586 unsigned char *source, *destination;
2587 int src_bytes, dst_bytes;
2588 int *consumed;
2589{
2590 int produced;
2591
2592 coding->carryover_size = 0;
2593 switch (coding->type)
2594 {
2595 case coding_type_no_conversion:
2596 label_no_conversion:
2597 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2598 if (produced > 0)
2599 {
2600 bcopy (source, destination, produced);
2601 if (coding->selective)
2602 {
2603 unsigned char *p = destination, *pend = destination + produced;
2604 while (p < pend)
e0e989f6 2605 if (*p++ == '\015') p[-1] = '\n';
4ed46869
KH
2606 }
2607 }
2608 *consumed = produced;
2609 break;
2610
2611 case coding_type_internal:
2612 case coding_type_automatic:
2613 if (coding->eol_type == CODING_EOL_LF
2614 || coding->eol_type == CODING_EOL_AUTOMATIC)
2615 goto label_no_conversion;
2616 produced = encode_eol (coding, source, destination,
2617 src_bytes, dst_bytes, consumed);
2618 break;
2619
2620 case coding_type_sjis:
2621 produced = encode_coding_sjis_big5 (coding, source, destination,
2622 src_bytes, dst_bytes, consumed,
2623 1);
2624 break;
2625
2626 case coding_type_iso2022:
2627 produced = encode_coding_iso2022 (coding, source, destination,
2628 src_bytes, dst_bytes, consumed);
2629 break;
2630
2631 case coding_type_big5:
2632 produced = encode_coding_sjis_big5 (coding, source, destination,
2633 src_bytes, dst_bytes, consumed,
2634 0);
2635 break;
2636
2637 case coding_type_ccl:
2638 produced = ccl_driver (&coding->spec.ccl.encoder, source, destination,
2639 src_bytes, dst_bytes, consumed);
2640 break;
2641 }
2642
2643 return produced;
2644}
2645
2646#define CONVERSION_BUFFER_EXTRA_ROOM 256
2647
2648/* Return maximum size (bytes) of a buffer enough for decoding
2649 SRC_BYTES of text encoded in CODING. */
2650
2651int
2652decoding_buffer_size (coding, src_bytes)
2653 struct coding_system *coding;
2654 int src_bytes;
2655{
2656 int magnification;
2657
2658 if (coding->type == coding_type_iso2022)
2659 magnification = 3;
2660 else if (coding->type == coding_type_ccl)
2661 magnification = coding->spec.ccl.decoder.buf_magnification;
2662 else
2663 magnification = 2;
2664
2665 return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
2666}
2667
2668/* Return maximum size (bytes) of a buffer enough for encoding
2669 SRC_BYTES of text to CODING. */
2670
2671int
2672encoding_buffer_size (coding, src_bytes)
2673 struct coding_system *coding;
2674 int src_bytes;
2675{
2676 int magnification;
2677
2678 if (coding->type == coding_type_ccl)
2679 magnification = coding->spec.ccl.encoder.buf_magnification;
2680 else
2681 magnification = 3;
2682
2683 return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
2684}
2685
2686#ifndef MINIMUM_CONVERSION_BUFFER_SIZE
2687#define MINIMUM_CONVERSION_BUFFER_SIZE 1024
2688#endif
2689
2690char *conversion_buffer;
2691int conversion_buffer_size;
2692
2693/* Return a pointer to a SIZE bytes of buffer to be used for encoding
2694 or decoding. Sufficient memory is allocated automatically. If we
2695 run out of memory, return NULL. */
2696
2697char *
2698get_conversion_buffer (size)
2699 int size;
2700{
2701 if (size > conversion_buffer_size)
2702 {
2703 char *buf;
2704 int real_size = conversion_buffer_size * 2;
2705
2706 while (real_size < size) real_size *= 2;
2707 buf = (char *) xmalloc (real_size);
2708 xfree (conversion_buffer);
2709 conversion_buffer = buf;
2710 conversion_buffer_size = real_size;
2711 }
2712 return conversion_buffer;
2713}
2714
2715\f
2716#ifdef emacs
2717/*** 7. Emacs Lisp library functions ***/
2718
2719DEFUN ("coding-system-vector", Fcoding_system_vector, Scoding_system_vector,
2720 1, 1, 0,
2721 "Return coding-vector of CODING-SYSTEM.\n\
2722If CODING-SYSTEM is not a valid coding-system, return nil.")
2723 (obj)
2724 Lisp_Object obj;
2725{
2726 while (SYMBOLP (obj) && !NILP (obj))
2727 obj = Fget (obj, Qcoding_system);
2728 return ((NILP (obj) || !VECTORP (obj) || XVECTOR (obj)->size != 5)
2729 ? Qnil : obj);
2730}
2731
2732DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
2733 "Return t if OBJECT is nil or a coding-system.\n\
2734See document of make-coding-system for coding-system object.")
2735 (obj)
2736 Lisp_Object obj;
2737{
2738 return ((NILP (obj) || !NILP (Fcoding_system_vector (obj))) ? Qt : Qnil);
2739}
2740
2741DEFUN ("read-non-nil-coding-system",
2742 Fread_non_nil_coding_system, Sread_non_nil_coding_system, 1, 1, 0,
e0e989f6 2743 "Read a coding system from the minibuffer, prompting with string PROMPT.")
4ed46869
KH
2744 (prompt)
2745 Lisp_Object prompt;
2746{
e0e989f6
KH
2747 Lisp_Object val;
2748 do {
2749 val = Fcompleting_read (prompt, Vobarray, Qcoding_system_vector,
2750 Qt, Qnil, Qnil);
2751 } while (XSTRING (val)->size == 0);
2752 return (Fintern (val, Qnil));
4ed46869
KH
2753}
2754
2755DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 1, 0,
e0e989f6 2756 "Read a coding system or nil from the minibuffer, prompting with string PROMPT.")
4ed46869
KH
2757 (prompt)
2758 Lisp_Object prompt;
2759{
e0e989f6
KH
2760 Lisp_Object val = Fcompleting_read (prompt, Vobarray, Qcoding_system_p,
2761 Qt, Qnil, Qnil);
2762 return (XSTRING (val)->size == 0 ? Qnil : Fintern (val, Qnil));
4ed46869
KH
2763}
2764
2765DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
2766 1, 1, 0,
2767 "Check validity of CODING-SYSTEM.\n\
2768If valid, return CODING-SYSTEM, else `coding-system-error' is signaled.\n\
2769CODING-SYSTEM is valid if it is a symbol and has \"coding-system\" property.\n\
2770The value of property should be a vector of length 5.")
2771 (coding_system)
2772 Lisp_Object coding_system;
2773{
2774 CHECK_SYMBOL (coding_system, 0);
2775 if (!NILP (Fcoding_system_p (coding_system)))
2776 return coding_system;
2777 while (1)
2778 Fsignal (Qcoding_system_error, coding_system);
2779}
2780
2781DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
2782 2, 2, 0,
2783 "Detect coding-system of the text in the region between START and END.\n\
2784Return a list of possible coding-systems ordered by priority.\n\
e0e989f6 2785If only ASCII characters are found, it returns `automatic-conversion'\n\
4ed46869
KH
2786 or its subsidiary coding-system according to a detected end-of-line format.")
2787 (b, e)
2788 Lisp_Object b, e;
2789{
2790 int coding_mask, eol_type;
2791 Lisp_Object val;
2792 int beg, end;
2793
2794 validate_region (&b, &e);
2795 beg = XINT (b), end = XINT (e);
2796 if (beg < GPT && end >= GPT) move_gap (end);
2797
2798 coding_mask = detect_coding_mask (POS_ADDR (beg), end - beg);
2799 eol_type = detect_eol_type (POS_ADDR (beg), end - beg);
2800
2801 if (coding_mask == CODING_CATEGORY_MASK_ANY)
2802 {
e0e989f6 2803 val = intern ("automatic-conversion");
4ed46869
KH
2804 if (eol_type != CODING_EOL_AUTOMATIC)
2805 {
2806 Lisp_Object val2 = Fget (val, Qeol_type);
2807 if (VECTORP (val2))
2808 val = XVECTOR (val2)->contents[eol_type];
2809 }
2810 }
2811 else
2812 {
2813 Lisp_Object val2;
2814
2815 /* At first, gather possible coding-systems in VAL in a reverse
2816 order. */
2817 val = Qnil;
2818 for (val2 = Vcoding_category_list;
2819 !NILP (val2);
2820 val2 = XCONS (val2)->cdr)
2821 {
2822 int idx
2823 = XFASTINT (Fget (XCONS (val2)->car, Qcoding_category_index));
2824 if (coding_mask & (1 << idx))
2825 val = Fcons (Fsymbol_value (XCONS (val2)->car), val);
2826 }
2827
2828 /* Then, change the order of the list, while getting subsidiary
2829 coding-systems. */
2830 val2 = val;
2831 val = Qnil;
2832 for (; !NILP (val2); val2 = XCONS (val2)->cdr)
2833 {
2834 if (eol_type == CODING_EOL_AUTOMATIC)
2835 val = Fcons (XCONS (val2)->car, val);
2836 else
2837 {
2838 Lisp_Object val3 = Fget (XCONS (val2)->car, Qeol_type);
2839 if (VECTORP (val3))
2840 val = Fcons (XVECTOR (val3)->contents[eol_type], val);
2841 else
2842 val = Fcons (XCONS (val2)->car, val);
2843 }
2844 }
2845 }
2846
2847 return val;
2848}
2849
2850/* Scan text in the region between *BEGP and *ENDP, skip characters
2851 which we never have to encode to (iff ENCODEP is 1) or decode from
2852 coding system CODING at the head and tail, then set BEGP and ENDP
2853 to the addresses of start and end of the text we actually convert. */
2854
2855void
2856shrink_conversion_area (begp, endp, coding, encodep)
2857 unsigned char **begp, **endp;
2858 struct coding_system *coding;
2859 int encodep;
2860{
2861 register unsigned char *beg_addr = *begp, *end_addr = *endp;
2862
2863 if (coding->eol_type != CODING_EOL_LF
2864 && coding->eol_type != CODING_EOL_AUTOMATIC)
2865 /* Since we anyway have to convert end-of-line format, it is not
2866 worth skipping at most 100 bytes or so. */
2867 return;
2868
2869 if (encodep) /* for encoding */
2870 {
2871 switch (coding->type)
2872 {
2873 case coding_type_no_conversion:
2874 case coding_type_internal:
2875 case coding_type_automatic:
2876 /* We need no conversion. */
2877 *begp = *endp;
2878 return;
2879 case coding_type_ccl:
2880 /* We can't skip any data. */
2881 return;
e0e989f6
KH
2882 case coding_type_iso2022:
2883 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL)
2884 {
2885 unsigned char *bol = beg_addr;
2886 while (beg_addr < end_addr && *beg_addr < 0x80)
2887 {
2888 beg_addr++;
2889 if (*(beg_addr - 1) == '\n')
2890 bol = beg_addr;
2891 }
2892 beg_addr = bol;
2893 goto label_skip_tail;
2894 }
2895 /* fall down ... */
4ed46869
KH
2896 default:
2897 /* We can skip all ASCII characters at the head and tail. */
2898 while (beg_addr < end_addr && *beg_addr < 0x80) beg_addr++;
e0e989f6 2899 label_skip_tail:
4ed46869
KH
2900 while (beg_addr < end_addr && *(end_addr - 1) < 0x80) end_addr--;
2901 break;
2902 }
2903 }
2904 else /* for decoding */
2905 {
2906 switch (coding->type)
2907 {
2908 case coding_type_no_conversion:
2909 /* We need no conversion. */
2910 *begp = *endp;
2911 return;
2912 case coding_type_internal:
2913 if (coding->eol_type == CODING_EOL_LF)
2914 {
2915 /* We need no conversion. */
2916 *begp = *endp;
2917 return;
2918 }
2919 /* We can skip all but carriage-return. */
2920 while (beg_addr < end_addr && *beg_addr != '\r') beg_addr++;
2921 while (beg_addr < end_addr && *(end_addr - 1) != '\r') end_addr--;
2922 break;
2923 case coding_type_sjis:
2924 case coding_type_big5:
2925 /* We can skip all ASCII characters at the head. */
2926 while (beg_addr < end_addr && *beg_addr < 0x80) beg_addr++;
2927 /* We can skip all ASCII characters at the tail except for
2928 the second byte of SJIS or BIG5 code. */
2929 while (beg_addr < end_addr && *(end_addr - 1) < 0x80) end_addr--;
2930 if (end_addr != *endp)
2931 end_addr++;
2932 break;
2933 case coding_type_ccl:
2934 /* We can't skip any data. */
2935 return;
2936 default: /* i.e. case coding_type_iso2022: */
2937 {
2938 unsigned char c;
2939
2940 /* We can skip all ASCII characters except for a few
2941 control codes at the head. */
2942 while (beg_addr < end_addr && (c = *beg_addr) < 0x80
2943 && c != ISO_CODE_CR && c != ISO_CODE_SO
2944 && c != ISO_CODE_SI && c != ISO_CODE_ESC)
2945 beg_addr++;
2946 }
2947 break;
2948 }
2949 }
2950 *begp = beg_addr;
2951 *endp = end_addr;
2952 return;
2953}
2954
2955/* Encode to (iff ENCODEP is 1) or decode form coding system CODING a
2956 text between B and E. B and E are buffer position. */
2957
2958Lisp_Object
2959code_convert_region (b, e, coding, encodep)
2960 Lisp_Object b, e;
2961 struct coding_system *coding;
2962 int encodep;
2963{
2964 int beg, end, len, consumed, produced;
2965 char *buf;
2966 unsigned char *begp, *endp;
2967 int pos = PT;
2968
2969 validate_region (&b, &e);
2970 beg = XINT (b), end = XINT (e);
2971 if (beg < GPT && end >= GPT)
2972 move_gap (end);
2973
2974 if (encodep && !NILP (coding->pre_write_conversion))
2975 {
2976 /* We must call a pre-conversion function which may put a new
2977 text to be converted in a new buffer. */
2978 struct buffer *old = current_buffer, *new;
2979
2980 TEMP_SET_PT (beg);
2981 call2 (coding->pre_write_conversion, b, e);
2982 if (old != current_buffer)
2983 {
2984 /* Replace the original text by the text just generated. */
2985 len = ZV - BEGV;
2986 new = current_buffer;
2987 set_buffer_internal (old);
2988 del_range (beg, end);
2989 insert_from_buffer (new, 1, len, 0);
2990 end = beg + len;
2991 }
2992 }
2993
2994 /* We may be able to shrink the conversion region. */
2995 begp = POS_ADDR (beg); endp = begp + (end - beg);
2996 shrink_conversion_area (&begp, &endp, coding, encodep);
2997
2998 if (begp == endp)
2999 /* We need no conversion. */
3000 len = end - beg;
3001 else
3002 {
3003 beg += begp - POS_ADDR (beg);
3004 end = beg + (endp - begp);
3005
3006 if (encodep)
3007 len = encoding_buffer_size (coding, end - beg);
3008 else
3009 len = decoding_buffer_size (coding, end - beg);
3010 buf = get_conversion_buffer (len);
3011
3012 coding->last_block = 1;
3013 produced = (encodep
3014 ? encode_coding (coding, POS_ADDR (beg), buf, end - beg, len,
3015 &consumed)
3016 : decode_coding (coding, POS_ADDR (beg), buf, end - beg, len,
3017 &consumed));
3018
3019 len = produced + (beg - XINT (b)) + (XINT (e) - end);
3020
3021 TEMP_SET_PT (beg);
3022 insert (buf, produced);
3023 del_range (PT, PT + end - beg);
3024 if (pos >= end)
3025 pos = PT + (pos - end);
3026 else if (pos > beg)
3027 pos = beg;
3028 TEMP_SET_PT (pos);
3029 }
3030
3031 if (!encodep && !NILP (coding->post_read_conversion))
3032 {
3033 /* We must call a post-conversion function which may alter
3034 the text just converted. */
3035 Lisp_Object insval;
3036
3037 beg = XINT (b);
3038 TEMP_SET_PT (beg);
3039 insval = call1 (coding->post_read_conversion, make_number (len));
3040 CHECK_NUMBER (insval, 0);
3041 len = XINT (insval);
3042 }
3043
3044 return make_number (len);
3045}
3046
3047Lisp_Object
e0e989f6
KH
3048code_convert_string (str, coding, encodep, nocopy)
3049 Lisp_Object str, nocopy;
4ed46869
KH
3050 struct coding_system *coding;
3051 int encodep;
3052{
3053 int len, consumed, produced;
3054 char *buf;
3055 unsigned char *begp, *endp;
3056 int head_skip, tail_skip;
3057 struct gcpro gcpro1;
3058
3059 if (encodep && !NILP (coding->pre_write_conversion)
3060 || !encodep && !NILP (coding->post_read_conversion))
3061 {
3062 /* Since we have to call Lisp functions which assume target text
3063 is in a buffer, after setting a temporary buffer, call
3064 code_convert_region. */
3065 int count = specpdl_ptr - specpdl;
3066 int len = XSTRING (str)->size;
3067 Lisp_Object result;
3068 struct buffer *old = current_buffer;
3069
3070 record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
3071 temp_output_buffer_setup (" *code-converting-work*");
3072 set_buffer_internal (XBUFFER (Vstandard_output));
3073 insert_from_string (str, 0, len, 0);
3074 code_convert_region (make_number (BEGV), make_number (ZV),
3075 coding, encodep);
3076 result = make_buffer_string (BEGV, ZV, 0);
3077 set_buffer_internal (old);
3078 return unbind_to (count, result);
3079 }
3080
3081 /* We may be able to shrink the conversion region. */
3082 begp = XSTRING (str)->data;
3083 endp = begp + XSTRING (str)->size;
3084 shrink_conversion_area (&begp, &endp, coding, encodep);
3085
3086 if (begp == endp)
3087 /* We need no conversion. */
e0e989f6 3088 return (NILP (nocopy) ? Fcopy_sequence (str) : str);
4ed46869
KH
3089
3090 head_skip = begp - XSTRING (str)->data;
3091 tail_skip = XSTRING (str)->size - head_skip - (endp - begp);
3092
3093 GCPRO1 (str);
3094
3095 if (encodep)
3096 len = encoding_buffer_size (coding, endp - begp);
3097 else
3098 len = decoding_buffer_size (coding, endp - begp);
3099 buf = get_conversion_buffer (len + head_skip + tail_skip);
3100
3101 bcopy (XSTRING (str)->data, buf, head_skip);
3102 coding->last_block = 1;
3103 produced = (encodep
3104 ? encode_coding (coding, XSTRING (str)->data + head_skip,
3105 buf + head_skip, endp - begp, len, &consumed)
3106 : decode_coding (coding, XSTRING (str)->data + head_skip,
3107 buf + head_skip, endp - begp, len, &consumed));
3108 bcopy (XSTRING (str)->data + head_skip + (endp - begp),
3109 buf + head_skip + produced,
3110 tail_skip);
3111
3112 UNGCPRO;
3113
3114 return make_string (buf, head_skip + produced + tail_skip);
3115}
3116
3117DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
e0e989f6
KH
3118 3, 3, "r\nzCoding system: ",
3119 "Decode current region by specified coding system.\n\
3120When called from a program, takes three arguments:\n\
3121START, END, and CODING-SYSTEM. START END are buffer positions.\n\
4ed46869
KH
3122Return length of decoded text.")
3123 (b, e, coding_system)
3124 Lisp_Object b, e, coding_system;
3125{
3126 struct coding_system coding;
3127
3128 CHECK_NUMBER_COERCE_MARKER (b, 0);
3129 CHECK_NUMBER_COERCE_MARKER (e, 1);
3130 CHECK_SYMBOL (coding_system, 2);
3131
e0e989f6
KH
3132 if (NILP (coding_system))
3133 return make_number (XFASTINT (e) - XFASTINT (b));
4ed46869
KH
3134 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3135 error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3136
3137 return code_convert_region (b, e, &coding, 0);
3138}
3139
3140DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
e0e989f6
KH
3141 3, 3, "r\nzCoding system: ",
3142 "Encode current region by specified coding system.\n\
3143When called from a program, takes three arguments:\n\
3144START, END, and CODING-SYSTEM. START END are buffer positions.\n\
4ed46869
KH
3145Return length of encoded text.")
3146 (b, e, coding_system)
3147 Lisp_Object b, e, coding_system;
3148{
3149 struct coding_system coding;
3150
3151 CHECK_NUMBER_COERCE_MARKER (b, 0);
3152 CHECK_NUMBER_COERCE_MARKER (e, 1);
3153 CHECK_SYMBOL (coding_system, 2);
3154
e0e989f6
KH
3155 if (NILP (coding_system))
3156 return make_number (XFASTINT (e) - XFASTINT (b));
4ed46869
KH
3157 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3158 error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3159
3160 return code_convert_region (b, e, &coding, 1);
3161}
3162
3163DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
e0e989f6
KH
3164 2, 3, 0,
3165 "Decode STRING which is encoded in CODING-SYSTEM, and return the result.\n\
3166Optional arg NOCOPY non-nil means return STRING itself if there's no need\n\
3167of decoding.")
3168 (string, coding_system, nocopy)
3169 Lisp_Object string, coding_system, nocopy;
4ed46869
KH
3170{
3171 struct coding_system coding;
3172
3173 CHECK_STRING (string, 0);
3174 CHECK_SYMBOL (coding_system, 1);
3175
e0e989f6
KH
3176 if (NILP (coding_system))
3177 return (NILP (nocopy) ? Fcopy_sequence (string) : string);
4ed46869
KH
3178 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3179 error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3180
e0e989f6 3181 return code_convert_string (string, &coding, 0, nocopy);
4ed46869
KH
3182}
3183
3184DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
e0e989f6
KH
3185 2, 3, 0,
3186 "Encode STRING to CODING-SYSTEM, and return the result.\n\
3187Optional arg NOCOPY non-nil means return STRING itself if there's no need\n\
3188of encoding.")
3189 (string, coding_system, nocopy)
3190 Lisp_Object string, coding_system, nocopy;
4ed46869
KH
3191{
3192 struct coding_system coding;
3193
3194 CHECK_STRING (string, 0);
3195 CHECK_SYMBOL (coding_system, 1);
3196
e0e989f6
KH
3197 if (NILP (coding_system))
3198 return (NILP (nocopy) ? Fcopy_sequence (string) : string);
4ed46869
KH
3199 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3200 error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3201
e0e989f6 3202 return code_convert_string (string, &coding, 1, nocopy);
4ed46869
KH
3203}
3204
3205DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
e0e989f6 3206 "Decode a JISX0208 character of shift-jis encoding.\n\
4ed46869
KH
3207CODE is the character code in SJIS.\n\
3208Return the corresponding character.")
3209 (code)
3210 Lisp_Object code;
3211{
3212 unsigned char c1, c2, s1, s2;
3213 Lisp_Object val;
3214
3215 CHECK_NUMBER (code, 0);
3216 s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
3217 DECODE_SJIS (s1, s2, c1, c2);
3218 XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset_jisx0208, c1, c2));
3219 return val;
3220}
3221
3222DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
3223 "Encode a JISX0208 character CHAR to SJIS coding-system.\n\
3224Return the corresponding character code in SJIS.")
3225 (ch)
3226 Lisp_Object ch;
3227{
3228 int charset;
3229 unsigned char c1, c2, s1, s2;
3230 Lisp_Object val;
3231
3232 CHECK_NUMBER (ch, 0);
3233 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
3234 if (charset == charset_jisx0208)
3235 {
3236 ENCODE_SJIS (c1, c2, s1, s2);
3237 XSETFASTINT (val, ((int)s1 << 8) | s2);
3238 }
3239 else
3240 XSETFASTINT (val, 0);
3241 return val;
3242}
3243
3244DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
3245 "Decode a Big5 character CODE of BIG5 coding-system.\n\
3246CODE is the character code in BIG5.\n\
3247Return the corresponding character.")
3248 (code)
3249 Lisp_Object code;
3250{
3251 int charset;
3252 unsigned char b1, b2, c1, c2;
3253 Lisp_Object val;
3254
3255 CHECK_NUMBER (code, 0);
3256 b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF;
3257 DECODE_BIG5 (b1, b2, charset, c1, c2);
3258 XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset, c1, c2));
3259 return val;
3260}
3261
3262DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
3263 "Encode the Big5 character CHAR to BIG5 coding-system.\n\
3264Return the corresponding character code in Big5.")
3265 (ch)
3266 Lisp_Object ch;
3267{
3268 int charset;
3269 unsigned char c1, c2, b1, b2;
3270 Lisp_Object val;
3271
3272 CHECK_NUMBER (ch, 0);
3273 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
3274 if (charset == charset_big5_1 || charset == charset_big5_2)
3275 {
3276 ENCODE_BIG5 (charset, c1, c2, b1, b2);
3277 XSETFASTINT (val, ((int)b1 << 8) | b2);
3278 }
3279 else
3280 XSETFASTINT (val, 0);
3281 return val;
3282}
3283
3284DEFUN ("set-terminal-coding-system",
3285 Fset_terminal_coding_system, Sset_terminal_coding_system, 1, 1,
3286 "zCoding-system for terminal display: ",
3287 "Set coding-system of your terminal to CODING-SYSTEM.\n\
3288All outputs to terminal are encoded to this coding-system.")
3289 (coding_system)
3290 Lisp_Object coding_system;
3291{
3292 CHECK_SYMBOL (coding_system, 0);
3293 setup_coding_system (Fcheck_coding_system (coding_system), &terminal_coding);
3294 update_mode_lines++;
3295 if (!NILP (Finteractive_p ()))
3296 Fredraw_display ();
3297 return Qnil;
3298}
3299
3300DEFUN ("terminal-coding-system",
3301 Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
3302 "Return coding-system of your terminal.")
3303 ()
3304{
3305 return terminal_coding.symbol;
3306}
3307
3308DEFUN ("set-keyboard-coding-system",
3309 Fset_keyboard_coding_system, Sset_keyboard_coding_system, 1, 1,
3310 "zCoding-system for keyboard input: ",
3311 "Set coding-system of what is sent from terminal keyboard to CODING-SYSTEM.\n\
3312All inputs from terminal are decoded from this coding-system.")
3313 (coding_system)
3314 Lisp_Object coding_system;
3315{
3316 CHECK_SYMBOL (coding_system, 0);
3317 setup_coding_system (Fcheck_coding_system (coding_system), &keyboard_coding);
3318 return Qnil;
3319}
3320
3321DEFUN ("keyboard-coding-system",
3322 Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
3323 "Return coding-system of what is sent from terminal keyboard.")
3324 ()
3325{
3326 return keyboard_coding.symbol;
3327}
3328
3329\f
3330DEFUN ("find-coding-system", Ffind_coding_system, Sfind_coding_system,
3331 1, MANY, 0,
3332 "Return a cons of coding systems for I/O primitive OPERATION.\n\
3333Remaining arguments are for OPERATION.\n\
3334OPERATION is one of the following Emacs I/O primitives:\n\
3335 For file I/O, insert-file-contents or write-region.\n\
3336 For process I/O, call-process, call-process-region, or start-process.\n\
3337 For network I/O, open-network-stream.\n\
3338For each OPERATION, TARGET is selected from the arguments as below:\n\
3339 For file I/O, TARGET is a file name.\n\
3340 For process I/O, TARGET is a process name.\n\
3341 For network I/O, TARGET is a service name or a port number\n\
3342\n\
3343The return value is a cons of coding systems for decoding and encoding\n\
3344registered in nested alist `coding-system-alist' (which see) at a slot\n\
e0e989f6 3345corresponding to OPERATION and TARGET.\n\
4ed46869
KH
3346If a function symbol is at the slot, return a result of the function call.\n\
3347The function is called with one argument, a list of all the arguments.")
3348 (nargs, args)
3349 int nargs;
3350 Lisp_Object *args;
3351{
3352 Lisp_Object operation, target_idx, target, val;
3353 register Lisp_Object chain;
3354
3355 if (nargs < 2)
3356 error ("Too few arguments");
3357 operation = args[0];
3358 if (!SYMBOLP (operation)
3359 || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
3360 error ("Invalid first arguement");
3361 if (nargs < 1 + XINT (target_idx))
3362 error ("Too few arguments for operation: %s",
3363 XSYMBOL (operation)->name->data);
3364 target = args[XINT (target_idx) + 1];
3365 if (!(STRINGP (target)
3366 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
3367 error ("Invalid %dth argument", XINT (target_idx) + 1);
3368
3369 chain = Fassq (operation, Vcoding_system_alist);
3370 if (NILP (chain))
3371 return Qnil;
3372
3373 for (chain = XCONS (chain)->cdr; CONSP (chain); chain = XCONS (chain)->cdr)
3374 {
3375 Lisp_Object elt = XCONS (chain)->car;
3376
3377 if (CONSP (elt)
3378 && ((STRINGP (target)
3379 && STRINGP (XCONS (elt)->car)
3380 && fast_string_match (XCONS (elt)->car, target) >= 0)
3381 || (INTEGERP (target) && EQ (target, XCONS (elt)->car))))
3382 return (CONSP (val = XCONS (elt)->cdr)
3383 ? val
3384 : ((SYMBOLP (val) && Fboundp (val)
3385 ? call2 (val, Flist (nargs, args))
3386 : Qnil)));
3387 }
3388 return Qnil;
3389}
3390
3391#endif /* emacs */
3392
3393\f
3394/*** 8. Post-amble ***/
3395
3396init_coding_once ()
3397{
3398 int i;
3399
3400 /* Emacs internal format specific initialize routine. */
3401 for (i = 0; i <= 0x20; i++)
3402 emacs_code_class[i] = EMACS_control_code;
3403 emacs_code_class[0x0A] = EMACS_linefeed_code;
3404 emacs_code_class[0x0D] = EMACS_carriage_return_code;
3405 for (i = 0x21 ; i < 0x7F; i++)
3406 emacs_code_class[i] = EMACS_ascii_code;
3407 emacs_code_class[0x7F] = EMACS_control_code;
3408 emacs_code_class[0x80] = EMACS_leading_code_composition;
3409 for (i = 0x81; i < 0xFF; i++)
3410 emacs_code_class[i] = EMACS_invalid_code;
3411 emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
3412 emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
3413 emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
3414 emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;
3415
3416 /* ISO2022 specific initialize routine. */
3417 for (i = 0; i < 0x20; i++)
3418 iso_code_class[i] = ISO_control_code;
3419 for (i = 0x21; i < 0x7F; i++)
3420 iso_code_class[i] = ISO_graphic_plane_0;
3421 for (i = 0x80; i < 0xA0; i++)
3422 iso_code_class[i] = ISO_control_code;
3423 for (i = 0xA1; i < 0xFF; i++)
3424 iso_code_class[i] = ISO_graphic_plane_1;
3425 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
3426 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
3427 iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
3428 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
3429 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
3430 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
3431 iso_code_class[ISO_CODE_ESC] = ISO_escape;
3432 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
3433 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
3434 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
3435
e0e989f6
KH
3436 conversion_buffer_size = MINIMUM_CONVERSION_BUFFER_SIZE;
3437 conversion_buffer = (char *) xmalloc (MINIMUM_CONVERSION_BUFFER_SIZE);
3438
3439 setup_coding_system (Qnil, &keyboard_coding);
3440 setup_coding_system (Qnil, &terminal_coding);
3441}
3442
3443#ifdef emacs
3444
3445syms_of_coding ()
3446{
3447 Qtarget_idx = intern ("target-idx");
3448 staticpro (&Qtarget_idx);
3449
3450 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
3451 Fput (Qwrite_region, Qtarget_idx, make_number (2));
3452
3453 Qcall_process = intern ("call-process");
3454 staticpro (&Qcall_process);
3455 Fput (Qcall_process, Qtarget_idx, make_number (0));
3456
3457 Qcall_process_region = intern ("call-process-region");
3458 staticpro (&Qcall_process_region);
3459 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
3460
3461 Qstart_process = intern ("start-process");
3462 staticpro (&Qstart_process);
3463 Fput (Qstart_process, Qtarget_idx, make_number (2));
3464
3465 Qopen_network_stream = intern ("open-network-stream");
3466 staticpro (&Qopen_network_stream);
3467 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
3468
4ed46869
KH
3469 Qcoding_system = intern ("coding-system");
3470 staticpro (&Qcoding_system);
3471
3472 Qeol_type = intern ("eol-type");
3473 staticpro (&Qeol_type);
3474
3475 Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
3476 staticpro (&Qbuffer_file_coding_system);
3477
3478 Qpost_read_conversion = intern ("post-read-conversion");
3479 staticpro (&Qpost_read_conversion);
3480
3481 Qpre_write_conversion = intern ("pre-write-conversion");
3482 staticpro (&Qpre_write_conversion);
3483
3484 Qcoding_system_vector = intern ("coding-system-vector");
3485 staticpro (&Qcoding_system_vector);
3486
3487 Qcoding_system_p = intern ("coding-system-p");
3488 staticpro (&Qcoding_system_p);
3489
3490 Qcoding_system_error = intern ("coding-system-error");
3491 staticpro (&Qcoding_system_error);
3492
3493 Fput (Qcoding_system_error, Qerror_conditions,
3494 Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
3495 Fput (Qcoding_system_error, Qerror_message,
3496 build_string ("Coding-system error"));
3497
3498 Qcoding_category_index = intern ("coding-category-index");
3499 staticpro (&Qcoding_category_index);
3500
3501 {
3502 int i;
3503 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3504 {
3505 coding_category_table[i] = intern (coding_category_name[i]);
3506 staticpro (&coding_category_table[i]);
3507 Fput (coding_category_table[i], Qcoding_category_index,
3508 make_number (i));
3509 }
3510 }
3511
4ed46869
KH
3512 defsubr (&Scoding_system_vector);
3513 defsubr (&Scoding_system_p);
3514 defsubr (&Sread_coding_system);
3515 defsubr (&Sread_non_nil_coding_system);
3516 defsubr (&Scheck_coding_system);
3517 defsubr (&Sdetect_coding_region);
3518 defsubr (&Sdecode_coding_region);
3519 defsubr (&Sencode_coding_region);
3520 defsubr (&Sdecode_coding_string);
3521 defsubr (&Sencode_coding_string);
3522 defsubr (&Sdecode_sjis_char);
3523 defsubr (&Sencode_sjis_char);
3524 defsubr (&Sdecode_big5_char);
3525 defsubr (&Sencode_big5_char);
3526 defsubr (&Sset_terminal_coding_system);
3527 defsubr (&Sterminal_coding_system);
3528 defsubr (&Sset_keyboard_coding_system);
3529 defsubr (&Skeyboard_coding_system);
3530 defsubr (&Sfind_coding_system);
3531
3532 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
3533 "List of coding-categories (symbols) ordered by priority.");
3534 {
3535 int i;
3536
3537 Vcoding_category_list = Qnil;
3538 for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
3539 Vcoding_category_list
3540 = Fcons (coding_category_table[i], Vcoding_category_list);
3541 }
3542
3543 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
3544 "A variable of internal use only.\n\
3545If the value is a coding system, it is used for decoding on read operation.\n\
3546If not, an appropriate element in `coding-system-alist' (which see) is used.");
3547 Vcoding_system_for_read = Qnil;
3548
3549 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
3550 "A variable of internal use only.\n\
3551If the value is a coding system, it is used for encoding on write operation.\n\
3552If not, an appropriate element in `coding-system-alist' (which see) is used.");
3553 Vcoding_system_for_write = Qnil;
3554
3555 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
3556 "Coding-system used in the latest file or process I/O.");
3557 Vlast_coding_system_used = Qnil;
3558
3559 DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
3560 "Nested alist to decide a coding system for a specific I/O operation.\n\
3561The format is ((OPERATION . ((REGEXP . CODING-SYSTEMS) ...)) ...).\n\
e0e989f6 3562\n\
4ed46869
KH
3563OPERATION is one of the following Emacs I/O primitives:\n\
3564 For file I/O, insert-file-contents and write-region.\n\
3565 For process I/O, call-process, call-process-region, and start-process.\n\
3566 For network I/O, open-network-stream.\n\
3567In addition, for process I/O, `process-argument' can be specified for\n\
3568encoding arguments of the process.\n\
3569\n\
3570REGEXP is a regular expression matching a target of OPERATION, where\n\
3571target is a file name for file I/O operations, a process name for\n\
3572process I/O operations, or a service name for network I/O\n\
3573operations. REGEXP might be a port number for network I/O operation.\n\
3574\n\
3575CODING-SYSTEMS is a cons of coding systems to encode and decode\n\
3576character code on OPERATION, or a function symbol returning the cons.\n\
3577See the documentation of `find-coding-system' for more detail.");
3578 Vcoding_system_alist = Qnil;
3579
3580 DEFVAR_INT ("eol-mnemonic-unix", &eol_mnemonic_unix,
3581 "Mnemonic character indicating UNIX-like end-of-line format (i.e. LF) .");
3582 eol_mnemonic_unix = '.';
3583
3584 DEFVAR_INT ("eol-mnemonic-dos", &eol_mnemonic_dos,
3585 "Mnemonic character indicating DOS-like end-of-line format (i.e. CRLF).");
3586 eol_mnemonic_dos = ':';
3587
3588 DEFVAR_INT ("eol-mnemonic-mac", &eol_mnemonic_mac,
3589 "Mnemonic character indicating MAC-like end-of-line format (i.e. CR).");
3590 eol_mnemonic_mac = '\'';
3591
3592 DEFVAR_INT ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
3593 "Mnemonic character indicating end-of-line format is not yet decided.");
3594 eol_mnemonic_undecided = '-';
3595
3596 DEFVAR_LISP ("alternate-charset-table", &Valternate_charset_table,
3597 "Alist of charsets vs the alternate charsets.\n\
3598While decoding, if a charset (car part of an element) is found,\n\
3599decode it as the alternate charset (cdr part of the element).");
3600 Valternate_charset_table = Qnil;
3601
3602 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
3603 "Alist of charsets vs revision numbers.\n\
3604While encoding, if a charset (car part of an element) is found,\n\
3605designate it with the escape sequence identifing revision (cdr part of the element).");
3606 Vcharset_revision_alist = Qnil;
3607}
3608
3609#endif /* emacs */