(Fread_coding_system, Fread_non_nil_coding_system):
[bpt/emacs.git] / src / coding.c
1 /* Coding system handler (conversion, detection, and etc).
2 Ver.1.0.
3 Copyright (C) 1995 Free Software Foundation, Inc.
4 Copyright (C) 1995 Electrotechnical Laboratory, JAPAN.
5
6 This file is part of GNU Emacs.
7
8 GNU Emacs is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 2, or (at your option)
11 any later version.
12
13 GNU Emacs is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GNU Emacs; see the file COPYING. If not, write to
20 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
21 Boston, MA 02111-1307, USA. */
22
23 /*** TABLE OF CONTENTS ***
24
25 1. Preamble
26 2. Emacs' internal format handlers
27 3. ISO2022 handlers
28 4. Shift-JIS and BIG5 handlers
29 5. End-of-line handlers
30 6. C library functions
31 7. Emacs Lisp library functions
32 8. Post-amble
33
34 */
35
36 /*** GENERAL NOTE on CODING SYSTEM ***
37
38 Coding system is an encoding mechanism of one or more character
39 sets. Here's a list of coding systems which Emacs can handle. When
40 we say "decode", it means converting some other coding system to
41 Emacs' internal format, and when we say "encode", it means
42 converting Emacs' internal format to some other coding system.
43
44 0. Emacs' internal format
45
46 Emacs itself holds a multi-lingual character in a buffer and a string
47 in a special format. Details are described in the section 2.
48
49 1. ISO2022
50
51 The most famous coding system for multiple character sets. X's
52 Compound Text, various EUCs (Extended Unix Code), and such coding
53 systems used in Internet communication as ISO-2022-JP are all
54 variants of ISO2022. Details are described in the section 3.
55
56 2. SJIS (or Shift-JIS or MS-Kanji-Code)
57
58 A coding system to encode character sets: ASCII, JISX0201, and
59 JISX0208. Widely used for PC's in Japan. Details are described in
60 the section 4.
61
62 3. BIG5
63
64 A coding system to encode character sets: ASCII and Big5. Widely
65 used by Chinese (mainly in Taiwan and Hong Kong). Details are
66 described in the section 4. In this file, when written as "BIG5"
67 (all uppercase), it means the coding system, and when written as
68 "Big5" (capitalized), it means the character set.
69
70 4. Else
71
72 If a user want to read/write a text encoded in a coding system not
73 listed above, he can supply a decoder and an encoder for it in CCL
74 (Code Conversion Language) programs. Emacs executes the CCL program
75 while reading/writing.
76
77 Emacs represent a coding-system by a Lisp symbol that has a property
78 `coding-system'. But, before actually using the coding-system, the
79 information about it is set in a structure of type `struct
80 coding_system' for rapid processing. See the section 6 for more
81 detail.
82
83 */
84
85 /*** GENERAL NOTES on END-OF-LINE FORMAT ***
86
87 How end-of-line of a text is encoded depends on a system. For
88 instance, Unix's format is just one byte of `line-feed' code,
89 whereas DOS's format is two bytes sequence of `carriage-return' and
90 `line-feed' codes. MacOS's format is one byte of `carriage-return'.
91
92 Since how characters in a text is encoded and how end-of-line is
93 encoded is independent, any coding system described above can take
94 any format of end-of-line. So, Emacs has information of format of
95 end-of-line in each coding-system. See the section 6 for more
96 detail.
97
98 */
99
100 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
101
102 These functions check if a text between SRC and SRC_END is encoded
103 in the coding system category XXX. Each returns an integer value in
104 which appropriate flag bits for the category XXX is set. The flag
105 bits are defined in macros CODING_CATEGORY_MASK_XXX. Below is the
106 template of these functions. */
107 #if 0
108 int
109 detect_coding_internal (src, src_end)
110 unsigned char *src, *src_end;
111 {
112 ...
113 }
114 #endif
115
116 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
117
118 These functions decode SRC_BYTES length text at SOURCE encoded in
119 CODING to Emacs' internal format. The resulting text goes to a
120 place pointed by DESTINATION, the length of which should not exceed
121 DST_BYTES. The bytes actually processed is returned as *CONSUMED.
122 The return value is the length of the decoded text. Below is a
123 template of these functions. */
124 #if 0
125 decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes, consumed)
126 struct coding_system *coding;
127 unsigned char *source, *destination;
128 int src_bytes, dst_bytes;
129 int *consumed;
130 {
131 ...
132 }
133 #endif
134
135 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
136
137 These functions encode SRC_BYTES length text at SOURCE of Emacs
138 internal format to CODING. The resulting text goes to a place
139 pointed by DESTINATION, the length of which should not exceed
140 DST_BYTES. The bytes actually processed is returned as *CONSUMED.
141 The return value is the length of the encoded text. Below is a
142 template of these functions. */
143 #if 0
144 encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes, consumed)
145 struct coding_system *coding;
146 unsigned char *source, *destination;
147 int src_bytes, dst_bytes;
148 int *consumed;
149 {
150 ...
151 }
152 #endif
153
154 /*** COMMONLY USED MACROS ***/
155
156 /* The following three macros ONE_MORE_BYTE, TWO_MORE_BYTES, and
157 THREE_MORE_BYTES safely get one, two, and three bytes from the
158 source text respectively. If there are not enough bytes in the
159 source, they jump to `label_end_of_loop'. The caller should set
160 variables `src' and `src_end' to appropriate areas in advance. */
161
162 #define ONE_MORE_BYTE(c1) \
163 do { \
164 if (src < src_end) \
165 c1 = *src++; \
166 else \
167 goto label_end_of_loop; \
168 } while (0)
169
170 #define TWO_MORE_BYTES(c1, c2) \
171 do { \
172 if (src + 1 < src_end) \
173 c1 = *src++, c2 = *src++; \
174 else \
175 goto label_end_of_loop; \
176 } while (0)
177
178 #define THREE_MORE_BYTES(c1, c2, c3) \
179 do { \
180 if (src + 2 < src_end) \
181 c1 = *src++, c2 = *src++, c3 = *src++; \
182 else \
183 goto label_end_of_loop; \
184 } while (0)
185
186 /* The following three macros DECODE_CHARACTER_ASCII,
187 DECODE_CHARACTER_DIMENSION1, and DECODE_CHARACTER_DIMENSION2 put
188 the multi-byte form of a character of each class at the place
189 pointed by `dst'. The caller should set the variable `dst' to
190 point to an appropriate area and the variable `coding' to point to
191 the coding-system of the currently decoding text in advance. */
192
193 /* Decode one ASCII character C. */
194
195 #define DECODE_CHARACTER_ASCII(c) \
196 do { \
197 if (COMPOSING_P (coding->composing)) \
198 *dst++ = 0xA0, *dst++ = (c) | 0x80; \
199 else \
200 *dst++ = (c); \
201 } while (0)
202
203 /* Decode one DIMENSION1 character of which charset is CHARSET and
204 position-code is C. */
205
206 #define DECODE_CHARACTER_DIMENSION1(charset, c) \
207 do { \
208 unsigned char leading_code = CHARSET_LEADING_CODE_BASE (charset); \
209 if (COMPOSING_P (coding->composing)) \
210 *dst++ = leading_code + 0x20; \
211 else \
212 *dst++ = leading_code; \
213 if (leading_code = CHARSET_LEADING_CODE_EXT (charset)) \
214 *dst++ = leading_code; \
215 *dst++ = (c) | 0x80; \
216 } while (0)
217
218 /* Decode one DIMENSION2 character of which charset is CHARSET and
219 position-codes are C1 and C2. */
220
221 #define DECODE_CHARACTER_DIMENSION2(charset, c1, c2) \
222 do { \
223 DECODE_CHARACTER_DIMENSION1 (charset, c1); \
224 *dst++ = (c2) | 0x80; \
225 } while (0)
226
227 \f
228 /*** 1. Preamble ***/
229
230 #include <stdio.h>
231
232 #ifdef emacs
233
234 #include <config.h>
235 #include "lisp.h"
236 #include "buffer.h"
237 #include "charset.h"
238 #include "ccl.h"
239 #include "coding.h"
240 #include "window.h"
241
242 #else /* not emacs */
243
244 #include "mulelib.h"
245
246 #endif /* not emacs */
247
248 Lisp_Object Qcoding_system, Qeol_type;
249 Lisp_Object Qbuffer_file_coding_system;
250 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
251
252 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
253 Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
254 Lisp_Object Qstart_process, Qopen_network_stream;
255 Lisp_Object Qtarget_idx;
256
257 /* Mnemonic character of each format of end-of-line. */
258 int eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
259 /* Mnemonic character to indicate format of end-of-line is not yet
260 decided. */
261 int eol_mnemonic_undecided;
262
263 #ifdef emacs
264
265 Lisp_Object Qcoding_system_vector, Qcoding_system_p, Qcoding_system_error;
266
267 /* Coding-systems are handed between Emacs Lisp programs and C internal
268 routines by the following three variables. */
269 /* Coding-system for reading files and receiving data from process. */
270 Lisp_Object Vcoding_system_for_read;
271 /* Coding-system for writing files and sending data to process. */
272 Lisp_Object Vcoding_system_for_write;
273 /* Coding-system actually used in the latest I/O. */
274 Lisp_Object Vlast_coding_system_used;
275
276 /* Coding-system of what terminal accept for displaying. */
277 struct coding_system terminal_coding;
278
279 /* Coding-system of what is sent from terminal keyboard. */
280 struct coding_system keyboard_coding;
281
282 Lisp_Object Vcoding_system_alist;
283
284 #endif /* emacs */
285
286 Lisp_Object Qcoding_category_index;
287
288 /* List of symbols `coding-category-xxx' ordered by priority. */
289 Lisp_Object Vcoding_category_list;
290
291 /* Table of coding-systems currently assigned to each coding-category. */
292 Lisp_Object coding_category_table[CODING_CATEGORY_IDX_MAX];
293
294 /* Table of names of symbol for each coding-category. */
295 char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
296 "coding-category-internal",
297 "coding-category-sjis",
298 "coding-category-iso-7",
299 "coding-category-iso-8-1",
300 "coding-category-iso-8-2",
301 "coding-category-iso-else",
302 "coding-category-big5",
303 "coding-category-binary"
304 };
305
306 /* Alist of charsets vs the alternate charsets. */
307 Lisp_Object Valternate_charset_table;
308
309 /* Alist of charsets vs revision number. */
310 Lisp_Object Vcharset_revision_alist;
311
312 \f
313 /*** 2. Emacs internal format handlers ***/
314
315 /* Emacs' internal format for encoding multiple character sets is a
316 kind of multi-byte encoding, i.e. encoding a character by a sequence
317 of one-byte codes of variable length. ASCII characters and control
318 characters (e.g. `tab', `newline') are represented by one-byte as
319 is. It takes the range 0x00 through 0x7F. The other characters
320 are represented by a sequence of `base leading-code', optional
321 `extended leading-code', and one or two `position-code's. Length
322 of the sequence is decided by the base leading-code. Leading-code
323 takes the range 0x80 through 0x9F, whereas extended leading-code
324 and position-code take the range 0xA0 through 0xFF. See the
325 document of `charset.h' for more detail about leading-code and
326 position-code.
327
328 There's one exception in this rule. Special leading-code
329 `leading-code-composition' denotes that the following several
330 characters should be composed into one character. Leading-codes of
331 components (except for ASCII) are added 0x20. An ASCII character
332 component is represented by a 2-byte sequence of `0xA0' and
333 `ASCII-code + 0x80'. See also the document in `charset.h' for the
334 detail of composite character. Hence, we can summarize the code
335 range as follows:
336
337 --- CODE RANGE of Emacs' internal format ---
338 (character set) (range)
339 ASCII 0x00 .. 0x7F
340 ELSE (1st byte) 0x80 .. 0x9F
341 (rest bytes) 0xA0 .. 0xFF
342 ---------------------------------------------
343
344 */
345
346 enum emacs_code_class_type emacs_code_class[256];
347
348 /* Go to the next statement only if *SRC is accessible and the code is
349 greater than 0xA0. */
350 #define CHECK_CODE_RANGE_A0_FF \
351 do { \
352 if (src >= src_end) \
353 goto label_end_of_switch; \
354 else if (*src++ < 0xA0) \
355 return 0; \
356 } while (0)
357
358 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
359 Check if a text is encoded in Emacs' internal format. If it is,
360 return CODING_CATEGORY_MASK_INTERNAL, else return 0. */
361
362 int
363 detect_coding_internal (src, src_end)
364 unsigned char *src, *src_end;
365 {
366 unsigned char c;
367 int composing = 0;
368
369 while (src < src_end)
370 {
371 c = *src++;
372
373 if (composing)
374 {
375 if (c < 0xA0)
376 composing = 0;
377 else
378 c -= 0x20;
379 }
380
381 switch (emacs_code_class[c])
382 {
383 case EMACS_ascii_code:
384 case EMACS_linefeed_code:
385 break;
386
387 case EMACS_control_code:
388 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
389 return 0;
390 break;
391
392 case EMACS_invalid_code:
393 return 0;
394
395 case EMACS_leading_code_composition: /* c == 0x80 */
396 if (composing)
397 CHECK_CODE_RANGE_A0_FF;
398 else
399 composing = 1;
400 break;
401
402 case EMACS_leading_code_4:
403 CHECK_CODE_RANGE_A0_FF;
404 /* fall down to check it two more times ... */
405
406 case EMACS_leading_code_3:
407 CHECK_CODE_RANGE_A0_FF;
408 /* fall down to check it one more time ... */
409
410 case EMACS_leading_code_2:
411 CHECK_CODE_RANGE_A0_FF;
412 break;
413
414 default:
415 label_end_of_switch:
416 break;
417 }
418 }
419 return CODING_CATEGORY_MASK_INTERNAL;
420 }
421
422 \f
423 /*** 3. ISO2022 handlers ***/
424
425 /* The following note describes the coding system ISO2022 briefly.
426 Since the intension of this note is to help understanding of the
427 programs in this file, some parts are NOT ACCURATE or OVERLY
428 SIMPLIFIED. For the thorough understanding, please refer to the
429 original document of ISO2022.
430
431 ISO2022 provides many mechanisms to encode several character sets
432 in 7-bit and 8-bit environment. If one choose 7-bite environment,
433 all text is encoded by codes of less than 128. This may make the
434 encoded text a little bit longer, but the text get more stability
435 to pass through several gateways (some of them split MSB off).
436
437 There are two kind of character set: control character set and
438 graphic character set. The former contains control characters such
439 as `newline' and `escape' to provide control functions (control
440 functions are provided also by escape sequence). The latter
441 contains graphic characters such as ' A' and '-'. Emacs recognizes
442 two control character sets and many graphic character sets.
443
444 Graphic character sets are classified into one of the following
445 four classes, DIMENSION1_CHARS94, DIMENSION1_CHARS96,
446 DIMENSION2_CHARS94, DIMENSION2_CHARS96 according to the number of
447 bytes (DIMENSION) and the number of characters in one dimension
448 (CHARS) of the set. In addition, each character set is assigned an
449 identification tag (called "final character" and denoted as <F>
450 here after) which is unique in each class. <F> of each character
451 set is decided by ECMA(*) when it is registered in ISO. Code range
452 of <F> is 0x30..0x7F (0x30..0x3F are for private use only).
453
454 Note (*): ECMA = European Computer Manufacturers Association
455
456 Here are examples of graphic character set [NAME(<F>)]:
457 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
458 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
459 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
460 o DIMENSION2_CHARS96 -- none for the moment
461
462 A code area (1byte=8bits) is divided into 4 areas, C0, GL, C1, and GR.
463 C0 [0x00..0x1F] -- control character plane 0
464 GL [0x20..0x7F] -- graphic character plane 0
465 C1 [0x80..0x9F] -- control character plane 1
466 GR [0xA0..0xFF] -- graphic character plane 1
467
468 A control character set is directly designated and invoked to C0 or
469 C1 by an escape sequence. The most common case is that ISO646's
470 control character set is designated/invoked to C0 and ISO6429's
471 control character set is designated/invoked to C1, and usually
472 these designations/invocations are omitted in a coded text. With
473 7-bit environment, only C0 can be used, and a control character for
474 C1 is encoded by an appropriate escape sequence to fit in the
475 environment. All control characters for C1 are defined the
476 corresponding escape sequences.
477
478 A graphic character set is at first designated to one of four
479 graphic registers (G0 through G3), then these graphic registers are
480 invoked to GL or GR. These designations and invocations can be
481 done independently. The most common case is that G0 is invoked to
482 GL, G1 is invoked to GR, and ASCII is designated to G0, and usually
483 these invocations and designations are omitted in a coded text.
484 With 7-bit environment, only GL can be used.
485
486 When a graphic character set of CHARS94 is invoked to GL, code 0x20
487 and 0x7F of GL area work as control characters SPACE and DEL
488 respectively, and code 0xA0 and 0xFF of GR area should not be used.
489
490 There are two ways of invocation: locking-shift and single-shift.
491 With locking-shift, the invocation lasts until the next different
492 invocation, whereas with single-shift, the invocation works only
493 for the following character and doesn't affect locking-shift.
494 Invocations are done by the following control characters or escape
495 sequences.
496
497 ----------------------------------------------------------------------
498 function control char escape sequence description
499 ----------------------------------------------------------------------
500 SI (shift-in) 0x0F none invoke G0 to GL
501 SI (shift-out) 0x0E none invoke G1 to GL
502 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
503 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
504 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 into GL
505 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 into GL
506 ----------------------------------------------------------------------
507 The first four are for locking-shift. Control characters for these
508 functions are defined by macros ISO_CODE_XXX in `coding.h'.
509
510 Designations are done by the following escape sequences.
511 ----------------------------------------------------------------------
512 escape sequence description
513 ----------------------------------------------------------------------
514 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
515 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
516 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
517 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
518 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
519 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
520 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
521 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
522 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
523 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
524 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
525 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
526 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
527 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
528 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
529 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
530 ----------------------------------------------------------------------
531
532 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
533 of dimension 1, chars 94, and final character <F>, and etc.
534
535 Note (*): Although these designations are not allowed in ISO2022,
536 Emacs accepts them on decoding, and produces them on encoding
537 CHARS96 character set in a coding system which is characterized as
538 7-bit environment, non-locking-shift, and non-single-shift.
539
540 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
541 '(' can be omitted. We call this as "short-form" here after.
542
543 Now you may notice that there are a lot of ways for encoding the
544 same multilingual text in ISO2022. Actually, there exist many
545 coding systems such as Compound Text (used in X's inter client
546 communication, ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
547 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
548 localized platforms), and all of these are variants of ISO2022.
549
550 In addition to the above, Emacs handles two more kinds of escape
551 sequences: ISO6429's direction specification and Emacs' private
552 sequence for specifying character composition.
553
554 ISO6429's direction specification takes the following format:
555 o CSI ']' -- end of the current direction
556 o CSI '0' ']' -- end of the current direction
557 o CSI '1' ']' -- start of left-to-right text
558 o CSI '2' ']' -- start of right-to-left text
559 The control character CSI (0x9B: control sequence introducer) is
560 abbreviated to the escape sequence ESC '[' in 7-bit environment.
561
562 Character composition specification takes the following format:
563 o ESC '0' -- start character composition
564 o ESC '1' -- end character composition
565 Since these are not standard escape sequences of any ISO, the use
566 of them for these meaning is restricted to Emacs only. */
567
568 enum iso_code_class_type iso_code_class[256];
569
570 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
571 Check if a text is encoded in ISO2022. If it is, returns an
572 integer in which appropriate flag bits any of:
573 CODING_CATEGORY_MASK_ISO_7
574 CODING_CATEGORY_MASK_ISO_8_1
575 CODING_CATEGORY_MASK_ISO_8_2
576 CODING_CATEGORY_MASK_ISO_ELSE
577 are set. If a code which should never appear in ISO2022 is found,
578 returns 0. */
579
580 int
581 detect_coding_iso2022 (src, src_end)
582 unsigned char *src, *src_end;
583 {
584 int mask = CODING_CATEGORY_MASK_ANY;
585 int g1 = 0; /* 1 iff designating to G1. */
586 int c, i;
587
588 while (src < src_end)
589 {
590 c = *src++;
591 switch (c)
592 {
593 case ISO_CODE_ESC:
594 if (src >= src_end)
595 break;
596 c = *src++;
597 if (src < src_end
598 && ((c >= '(' && c <= '/')
599 || c == '$' && ((*src >= '(' && *src <= '/')
600 || (*src >= '@' && *src <= 'B'))))
601 {
602 /* Valid designation sequence. */
603 mask &= (CODING_CATEGORY_MASK_ISO_7
604 | CODING_CATEGORY_MASK_ISO_8_1
605 | CODING_CATEGORY_MASK_ISO_8_2
606 | CODING_CATEGORY_MASK_ISO_ELSE);
607 if (c == ')' || (c == '$' && *src == ')'))
608 {
609 g1 = 1;
610 mask &= ~CODING_CATEGORY_MASK_ISO_7;
611 }
612 src++;
613 break;
614 }
615 else if (c == 'N' || c == 'O' || c == 'n' || c == 'o')
616 return CODING_CATEGORY_MASK_ISO_ELSE;
617 break;
618
619 case ISO_CODE_SO:
620 if (g1)
621 return CODING_CATEGORY_MASK_ISO_ELSE;
622 break;
623
624 case ISO_CODE_CSI:
625 case ISO_CODE_SS2:
626 case ISO_CODE_SS3:
627 mask &= ~CODING_CATEGORY_MASK_ISO_7;
628 break;
629
630 default:
631 if (c < 0x80)
632 break;
633 else if (c < 0xA0)
634 return 0;
635 else
636 {
637 int count = 1;
638
639 mask &= ~CODING_CATEGORY_MASK_ISO_7;
640 while (src < src_end && *src >= 0xA0)
641 count++, src++;
642 if (count & 1 && src < src_end)
643 mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
644 }
645 break;
646 }
647 }
648
649 return mask;
650 }
651
652 /* Decode a character of which charset is CHARSET and the 1st position
653 code is C1. If dimension of CHARSET 2, the 2nd position code is
654 fetched from SRC and set to C2. If CHARSET is negative, it means
655 that we are decoding ill formed text, and what we can do is just to
656 read C1 as is. */
657
658 #define DECODE_ISO_CHARACTER(charset, c1) \
659 do { \
660 if ((charset) >= 0 && CHARSET_DIMENSION (charset) == 2) \
661 ONE_MORE_BYTE (c2); \
662 if (COMPOSING_HEAD_P (coding->composing)) \
663 { \
664 *dst++ = LEADING_CODE_COMPOSITION; \
665 if (COMPOSING_WITH_RULE_P (coding->composing)) \
666 /* To tell composition rules are embeded. */ \
667 *dst++ = 0xFF; \
668 coding->composing += 2; \
669 } \
670 if ((charset) < 0) \
671 *dst++ = c1; \
672 else if ((charset) == CHARSET_ASCII) \
673 DECODE_CHARACTER_ASCII (c1); \
674 else if (CHARSET_DIMENSION (charset) == 1) \
675 DECODE_CHARACTER_DIMENSION1 (charset, c1); \
676 else \
677 DECODE_CHARACTER_DIMENSION2 (charset, c1, c2); \
678 if (COMPOSING_WITH_RULE_P (coding->composing)) \
679 /* To tell a composition rule follows. */ \
680 coding->composing = COMPOSING_WITH_RULE_RULE; \
681 } while (0)
682
683 /* Set designation state into CODING. */
684 #define DECODE_DESIGNATION(reg, dimension, chars, final_char) \
685 do { \
686 int charset = ISO_CHARSET_TABLE (dimension, chars, final_char); \
687 Lisp_Object temp \
688 = Fassq (CHARSET_SYMBOL (charset), Valternate_charset_table); \
689 if (! NILP (temp)) \
690 charset = get_charset_id (XCONS (temp)->cdr); \
691 if (charset >= 0) \
692 { \
693 if (coding->direction == 1 \
694 && CHARSET_REVERSE_CHARSET (charset) >= 0) \
695 charset = CHARSET_REVERSE_CHARSET (charset); \
696 CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset; \
697 } \
698 } while (0)
699
700 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
701
702 int
703 decode_coding_iso2022 (coding, source, destination,
704 src_bytes, dst_bytes, consumed)
705 struct coding_system *coding;
706 unsigned char *source, *destination;
707 int src_bytes, dst_bytes;
708 int *consumed;
709 {
710 unsigned char *src = source;
711 unsigned char *src_end = source + src_bytes;
712 unsigned char *dst = destination;
713 unsigned char *dst_end = destination + dst_bytes;
714 /* Since the maximum bytes produced by each loop is 7, we subtract 6
715 from DST_END to assure that overflow checking is necessary only
716 at the head of loop. */
717 unsigned char *adjusted_dst_end = dst_end - 6;
718 int charset;
719 /* Charsets invoked to graphic plane 0 and 1 respectively. */
720 int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
721 int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
722
723 while (src < src_end && dst < adjusted_dst_end)
724 {
725 /* SRC_BASE remembers the start position in source in each loop.
726 The loop will be exited when there's not enough source text
727 to analyze long escape sequence or 2-byte code (within macros
728 ONE_MORE_BYTE or TWO_MORE_BYTES). In that case, SRC is reset
729 to SRC_BASE before exiting. */
730 unsigned char *src_base = src;
731 unsigned char c1 = *src++, c2, cmprule;
732
733 switch (iso_code_class [c1])
734 {
735 case ISO_0x20_or_0x7F:
736 if (!coding->composing
737 && (charset0 < 0 || CHARSET_CHARS (charset0) == 94))
738 {
739 /* This is SPACE or DEL. */
740 *dst++ = c1;
741 break;
742 }
743 /* This is a graphic character, we fall down ... */
744
745 case ISO_graphic_plane_0:
746 if (coding->composing == COMPOSING_WITH_RULE_RULE)
747 {
748 /* This is a composition rule. */
749 *dst++ = c1 | 0x80;
750 coding->composing = COMPOSING_WITH_RULE_TAIL;
751 }
752 else
753 DECODE_ISO_CHARACTER (charset0, c1);
754 break;
755
756 case ISO_0xA0_or_0xFF:
757 if (charset1 < 0 || CHARSET_CHARS (charset1) == 94)
758 {
759 /* Invalid code. */
760 *dst++ = c1;
761 break;
762 }
763 /* This is a graphic character, we fall down ... */
764
765 case ISO_graphic_plane_1:
766 DECODE_ISO_CHARACTER (charset1, c1);
767 break;
768
769 case ISO_control_code:
770 /* All ISO2022 control characters in this class have the
771 same representation in Emacs internal format. */
772 *dst++ = c1;
773 break;
774
775 case ISO_carriage_return:
776 if (coding->eol_type == CODING_EOL_CR)
777 {
778 *dst++ = '\n';
779 }
780 else if (coding->eol_type == CODING_EOL_CRLF)
781 {
782 ONE_MORE_BYTE (c1);
783 if (c1 == ISO_CODE_LF)
784 *dst++ = '\n';
785 else
786 {
787 src--;
788 *dst++ = c1;
789 }
790 }
791 else
792 {
793 *dst++ = c1;
794 }
795 break;
796
797 case ISO_shift_out:
798 if (CODING_SPEC_ISO_DESIGNATION (coding, 1) < 0)
799 goto label_invalid_escape_sequence;
800 CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
801 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
802 break;
803
804 case ISO_shift_in:
805 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
806 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
807 break;
808
809 case ISO_single_shift_2_7:
810 case ISO_single_shift_2:
811 /* SS2 is handled as an escape sequence of ESC 'N' */
812 c1 = 'N';
813 goto label_escape_sequence;
814
815 case ISO_single_shift_3:
816 /* SS2 is handled as an escape sequence of ESC 'O' */
817 c1 = 'O';
818 goto label_escape_sequence;
819
820 case ISO_control_sequence_introducer:
821 /* CSI is handled as an escape sequence of ESC '[' ... */
822 c1 = '[';
823 goto label_escape_sequence;
824
825 case ISO_escape:
826 ONE_MORE_BYTE (c1);
827 label_escape_sequence:
828 /* Escape sequences handled by Emacs are invocation,
829 designation, direction specification, and character
830 composition specification. */
831 switch (c1)
832 {
833 case '&': /* revision of following character set */
834 ONE_MORE_BYTE (c1);
835 if (!(c1 >= '@' && c1 <= '~'))
836 goto label_invalid_escape_sequence;
837 ONE_MORE_BYTE (c1);
838 if (c1 != ISO_CODE_ESC)
839 goto label_invalid_escape_sequence;
840 ONE_MORE_BYTE (c1);
841 goto label_escape_sequence;
842
843 case '$': /* designation of 2-byte character set */
844 ONE_MORE_BYTE (c1);
845 if (c1 >= '@' && c1 <= 'B')
846 { /* designation of JISX0208.1978, GB2312.1980,
847 or JISX0208.1980 */
848 DECODE_DESIGNATION (0, 2, 94, c1);
849 }
850 else if (c1 >= 0x28 && c1 <= 0x2B)
851 { /* designation of DIMENSION2_CHARS94 character set */
852 ONE_MORE_BYTE (c2);
853 DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
854 }
855 else if (c1 >= 0x2C && c1 <= 0x2F)
856 { /* designation of DIMENSION2_CHARS96 character set */
857 ONE_MORE_BYTE (c2);
858 DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
859 }
860 else
861 goto label_invalid_escape_sequence;
862 break;
863
864 case 'n': /* invocation of locking-shift-2 */
865 if (CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
866 goto label_invalid_escape_sequence;
867 CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
868 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
869 break;
870
871 case 'o': /* invocation of locking-shift-3 */
872 if (CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
873 goto label_invalid_escape_sequence;
874 CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
875 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
876 break;
877
878 case 'N': /* invocation of single-shift-2 */
879 if (CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
880 goto label_invalid_escape_sequence;
881 ONE_MORE_BYTE (c1);
882 charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
883 DECODE_ISO_CHARACTER (charset, c1);
884 break;
885
886 case 'O': /* invocation of single-shift-3 */
887 if (CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
888 goto label_invalid_escape_sequence;
889 ONE_MORE_BYTE (c1);
890 charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
891 DECODE_ISO_CHARACTER (charset, c1);
892 break;
893
894 case '0': /* start composing without embeded rules */
895 coding->composing = COMPOSING_NO_RULE_HEAD;
896 break;
897
898 case '1': /* end composing */
899 coding->composing = COMPOSING_NO;
900 break;
901
902 case '2': /* start composing with embeded rules */
903 coding->composing = COMPOSING_WITH_RULE_HEAD;
904 break;
905
906 case '[': /* specification of direction */
907 /* For the moment, nested direction is not supported.
908 So, the value of `coding->direction' is 0 or 1: 0
909 means left-to-right, 1 means right-to-left. */
910 ONE_MORE_BYTE (c1);
911 switch (c1)
912 {
913 case ']': /* end of the current direction */
914 coding->direction = 0;
915
916 case '0': /* end of the current direction */
917 case '1': /* start of left-to-right direction */
918 ONE_MORE_BYTE (c1);
919 if (c1 == ']')
920 coding->direction = 0;
921 else
922 goto label_invalid_escape_sequence;
923 break;
924
925 case '2': /* start of right-to-left direction */
926 ONE_MORE_BYTE (c1);
927 if (c1 == ']')
928 coding->direction= 1;
929 else
930 goto label_invalid_escape_sequence;
931 break;
932
933 default:
934 goto label_invalid_escape_sequence;
935 }
936 break;
937
938 default:
939 if (c1 >= 0x28 && c1 <= 0x2B)
940 { /* designation of DIMENSION1_CHARS94 character set */
941 ONE_MORE_BYTE (c2);
942 DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
943 }
944 else if (c1 >= 0x2C && c1 <= 0x2F)
945 { /* designation of DIMENSION1_CHARS96 character set */
946 ONE_MORE_BYTE (c2);
947 DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
948 }
949 else
950 {
951 goto label_invalid_escape_sequence;
952 }
953 }
954 /* We must update these variables now. */
955 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
956 charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
957 break;
958
959 label_invalid_escape_sequence:
960 {
961 int length = src - src_base;
962
963 bcopy (src_base, dst, length);
964 dst += length;
965 }
966 }
967 continue;
968
969 label_end_of_loop:
970 coding->carryover_size = src - src_base;
971 bcopy (src_base, coding->carryover, coding->carryover_size);
972 src = src_base;
973 break;
974 }
975
976 /* If this is the last block of the text to be decoded, we had
977 better just flush out all remaining codes in the text although
978 they are not valid characters. */
979 if (coding->last_block)
980 {
981 bcopy (src, dst, src_end - src);
982 dst += (src_end - src);
983 src = src_end;
984 }
985 *consumed = src - source;
986 return dst - destination;
987 }
988
989 /* ISO2022 encoding staffs. */
990
991 /*
992 It is not enough to say just "ISO2022" on encoding, but we have to
993 specify more details. In Emacs, each coding-system of ISO2022
994 variant has the following specifications:
995 1. Initial designation to G0 thru G3.
996 2. Allows short-form designation?
997 3. ASCII should be designated to G0 before control characters?
998 4. ASCII should be designated to G0 at end of line?
999 5. 7-bit environment or 8-bit environment?
1000 6. Use locking-shift?
1001 7. Use Single-shift?
1002 And the following two are only for Japanese:
1003 8. Use ASCII in place of JIS0201-1976-Roman?
1004 9. Use JISX0208-1983 in place of JISX0208-1978?
1005 These specifications are encoded in `coding->flags' as flag bits
1006 defined by macros CODING_FLAG_ISO_XXX. See `coding.h' for more
1007 detail.
1008 */
1009
1010 /* Produce codes (escape sequence) for designating CHARSET to graphic
1011 register REG. If <final-char> of CHARSET is '@', 'A', or 'B' and
1012 the coding system CODING allows, produce designation sequence of
1013 short-form. */
1014
1015 #define ENCODE_DESIGNATION(charset, reg, coding) \
1016 do { \
1017 unsigned char final_char = CHARSET_ISO_FINAL_CHAR (charset); \
1018 char *intermediate_char_94 = "()*+"; \
1019 char *intermediate_char_96 = ",-./"; \
1020 Lisp_Object temp \
1021 = Fassq (make_number (charset), Vcharset_revision_alist); \
1022 if (! NILP (temp)) \
1023 { \
1024 *dst++ = ISO_CODE_ESC; \
1025 *dst++ = '&'; \
1026 *dst++ = XINT (XCONS (temp)->cdr) + '@'; \
1027 } \
1028 *dst++ = ISO_CODE_ESC; \
1029 if (CHARSET_DIMENSION (charset) == 1) \
1030 { \
1031 if (CHARSET_CHARS (charset) == 94) \
1032 *dst++ = (unsigned char) (intermediate_char_94[reg]); \
1033 else \
1034 *dst++ = (unsigned char) (intermediate_char_96[reg]); \
1035 } \
1036 else \
1037 { \
1038 *dst++ = '$'; \
1039 if (CHARSET_CHARS (charset) == 94) \
1040 { \
1041 if (! (coding->flags & CODING_FLAG_ISO_SHORT_FORM) \
1042 || reg != 0 \
1043 || final_char < '@' || final_char > 'B') \
1044 *dst++ = (unsigned char) (intermediate_char_94[reg]); \
1045 } \
1046 else \
1047 *dst++ = (unsigned char) (intermediate_char_96[reg]); \
1048 } \
1049 *dst++ = final_char; \
1050 CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset; \
1051 } while (0)
1052
1053 /* The following two macros produce codes (control character or escape
1054 sequence) for ISO2022 single-shift functions (single-shift-2 and
1055 single-shift-3). */
1056
1057 #define ENCODE_SINGLE_SHIFT_2 \
1058 do { \
1059 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
1060 *dst++ = ISO_CODE_ESC, *dst++ = 'N'; \
1061 else \
1062 *dst++ = ISO_CODE_SS2; \
1063 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1; \
1064 } while (0)
1065
1066 #define ENCODE_SINGLE_SHIFT_3 \
1067 do { \
1068 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
1069 *dst++ = ISO_CODE_ESC, *dst++ = 'O'; \
1070 else \
1071 *dst++ = ISO_CODE_SS3; \
1072 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1; \
1073 } while (0)
1074
1075 /* The following four macros produce codes (control character or
1076 escape sequence) for ISO2022 locking-shift functions (shift-in,
1077 shift-out, locking-shift-2, and locking-shift-3). */
1078
1079 #define ENCODE_SHIFT_IN \
1080 do { \
1081 *dst++ = ISO_CODE_SI; \
1082 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0; \
1083 } while (0)
1084
1085 #define ENCODE_SHIFT_OUT \
1086 do { \
1087 *dst++ = ISO_CODE_SO; \
1088 CODING_SPEC_ISO_INVOCATION (coding, 0) = 1; \
1089 } while (0)
1090
1091 #define ENCODE_LOCKING_SHIFT_2 \
1092 do { \
1093 *dst++ = ISO_CODE_ESC, *dst++ = 'n'; \
1094 CODING_SPEC_ISO_INVOCATION (coding, 0) = 2; \
1095 } while (0)
1096
1097 #define ENCODE_LOCKING_SHIFT_3 \
1098 do { \
1099 *dst++ = ISO_CODE_ESC, *dst++ = 'o'; \
1100 CODING_SPEC_ISO_INVOCATION (coding, 0) = 3; \
1101 } while (0)
1102
1103 /* Produce codes for a DIMENSION1 character of which character set is
1104 CHARSET and position-code is C1. Designation and invocation
1105 sequences are also produced in advance if necessary. */
1106
1107
1108 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1) \
1109 do { \
1110 if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding)) \
1111 { \
1112 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
1113 *dst++ = c1 & 0x7F; \
1114 else \
1115 *dst++ = c1 | 0x80; \
1116 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0; \
1117 break; \
1118 } \
1119 else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0)) \
1120 { \
1121 *dst++ = c1 & 0x7F; \
1122 break; \
1123 } \
1124 else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1)) \
1125 { \
1126 *dst++ = c1 | 0x80; \
1127 break; \
1128 } \
1129 else \
1130 /* Since CHARSET is not yet invoked to any graphic planes, we \
1131 must invoke it, or, at first, designate it to some graphic \
1132 register. Then repeat the loop to actually produce the \
1133 character. */ \
1134 dst = encode_invocation_designation (charset, coding, dst); \
1135 } while (1)
1136
1137 /* Produce codes for a DIMENSION2 character of which character set is
1138 CHARSET and position-codes are C1 and C2. Designation and
1139 invocation codes are also produced in advance if necessary. */
1140
1141 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2) \
1142 do { \
1143 if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding)) \
1144 { \
1145 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
1146 *dst++ = c1 & 0x7F, *dst++ = c2 & 0x7F; \
1147 else \
1148 *dst++ = c1 | 0x80, *dst++ = c2 | 0x80; \
1149 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0; \
1150 break; \
1151 } \
1152 else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0)) \
1153 { \
1154 *dst++ = c1 & 0x7F, *dst++= c2 & 0x7F; \
1155 break; \
1156 } \
1157 else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1)) \
1158 { \
1159 *dst++ = c1 | 0x80, *dst++= c2 | 0x80; \
1160 break; \
1161 } \
1162 else \
1163 /* Since CHARSET is not yet invoked to any graphic planes, we \
1164 must invoke it, or, at first, designate it to some graphic \
1165 register. Then repeat the loop to actually produce the \
1166 character. */ \
1167 dst = encode_invocation_designation (charset, coding, dst); \
1168 } while (1)
1169
1170 /* Produce designation and invocation codes at a place pointed by DST
1171 to use CHARSET. The element `spec.iso2022' of *CODING is updated.
1172 Return new DST. */
1173
1174 unsigned char *
1175 encode_invocation_designation (charset, coding, dst)
1176 int charset;
1177 struct coding_system *coding;
1178 unsigned char *dst;
1179 {
1180 int reg; /* graphic register number */
1181
1182 /* At first, check designations. */
1183 for (reg = 0; reg < 4; reg++)
1184 if (charset == CODING_SPEC_ISO_DESIGNATION (coding, reg))
1185 break;
1186
1187 if (reg >= 4)
1188 {
1189 /* CHARSET is not yet designated to any graphic registers. */
1190 /* At first check the requested designation. */
1191 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1192 if (reg < 0)
1193 /* Since CHARSET requests no special designation, designate to
1194 graphic register 0. */
1195 reg = 0;
1196
1197 ENCODE_DESIGNATION (charset, reg, coding);
1198 }
1199
1200 if (CODING_SPEC_ISO_INVOCATION (coding, 0) != reg
1201 && CODING_SPEC_ISO_INVOCATION (coding, 1) != reg)
1202 {
1203 /* Since the graphic register REG is not invoked to any graphic
1204 planes, invoke it to graphic plane 0. */
1205 switch (reg)
1206 {
1207 case 0: /* graphic register 0 */
1208 ENCODE_SHIFT_IN;
1209 break;
1210
1211 case 1: /* graphic register 1 */
1212 ENCODE_SHIFT_OUT;
1213 break;
1214
1215 case 2: /* graphic register 2 */
1216 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1217 ENCODE_SINGLE_SHIFT_2;
1218 else
1219 ENCODE_LOCKING_SHIFT_2;
1220 break;
1221
1222 case 3: /* graphic register 3 */
1223 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1224 ENCODE_SINGLE_SHIFT_3;
1225 else
1226 ENCODE_LOCKING_SHIFT_3;
1227 break;
1228 }
1229 }
1230 return dst;
1231 }
1232
1233 /* The following two macros produce codes for indicating composition. */
1234 #define ENCODE_COMPOSITION_NO_RULE_START *dst++ = ISO_CODE_ESC, *dst++ = '0'
1235 #define ENCODE_COMPOSITION_WITH_RULE_START *dst++ = ISO_CODE_ESC, *dst++ = '2'
1236 #define ENCODE_COMPOSITION_END *dst++ = ISO_CODE_ESC, *dst++ = '1'
1237
1238 /* The following three macros produce codes for indicating direction
1239 of text. */
1240 #define ENCODE_CONTROL_SEQUENCE_INTRODUCER \
1241 do { \
1242 if (coding->flags == CODING_FLAG_ISO_SEVEN_BITS) \
1243 *dst++ = ISO_CODE_ESC, *dst++ = '['; \
1244 else \
1245 *dst++ = ISO_CODE_CSI; \
1246 } while (0)
1247
1248 #define ENCODE_DIRECTION_R2L \
1249 ENCODE_CONTROL_SEQUENCE_INTRODUCER, *dst++ = '2', *dst++ = ']'
1250
1251 #define ENCODE_DIRECTION_L2R \
1252 ENCODE_CONTROL_SEQUENCE_INTRODUCER, *dst++ = '0', *dst++ = ']'
1253
1254 /* Produce codes for designation and invocation to reset the graphic
1255 planes and registers to initial state. */
1256 #define ENCODE_RESET_PLANE_AND_REGISTER \
1257 do { \
1258 int reg; \
1259 if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0) \
1260 ENCODE_SHIFT_IN; \
1261 for (reg = 0; reg < 4; reg++) \
1262 if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg) >= 0 \
1263 && (CODING_SPEC_ISO_DESIGNATION (coding, reg) \
1264 != CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg))) \
1265 ENCODE_DESIGNATION \
1266 (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg), reg, coding); \
1267 } while (0)
1268
1269 int
1270 encode_designation_at_bol (coding, src, src_end, dstp)
1271 struct coding_system *coding;
1272 unsigned char *src, *src_end, **dstp;
1273 {
1274 int charset, reg, r[4];
1275 unsigned char *dst = *dstp, c;
1276 for (reg = 0; reg < 4; reg++) r[reg] = -1;
1277 while (src < src_end && (c = *src++) != '\n')
1278 {
1279 switch (emacs_code_class[c])
1280 {
1281 case EMACS_ascii_code:
1282 charset = CHARSET_ASCII;
1283 break;
1284 case EMACS_leading_code_2:
1285 if (++src >= src_end) continue;
1286 charset = c;
1287 break;
1288 case EMACS_leading_code_3:
1289 if ((src += 2) >= src_end) continue;
1290 charset = (c < LEADING_CODE_PRIVATE_11 ? c : *(src - 2));
1291 break;
1292 case EMACS_leading_code_4:
1293 if ((src += 3) >= src_end) continue;
1294 charset = *(src - 3);
1295 break;
1296 default:
1297 continue;
1298 }
1299 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1300 if (r[reg] < 0
1301 && CODING_SPEC_ISO_DESIGNATION (coding, reg) != charset)
1302 r[reg] = charset;
1303 }
1304 if (c != '\n' && !coding->last_block)
1305 return -1;
1306 for (reg = 0; reg < 4; reg++)
1307 if (r[reg] >= 0)
1308 ENCODE_DESIGNATION (r[reg], reg, coding);
1309 *dstp = dst;
1310 return 0;
1311 }
1312
1313 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
1314
1315 int
1316 encode_coding_iso2022 (coding, source, destination,
1317 src_bytes, dst_bytes, consumed)
1318 struct coding_system *coding;
1319 unsigned char *source, *destination;
1320 int src_bytes, dst_bytes;
1321 int *consumed;
1322 {
1323 unsigned char *src = source;
1324 unsigned char *src_end = source + src_bytes;
1325 unsigned char *dst = destination;
1326 unsigned char *dst_end = destination + dst_bytes;
1327 /* Since the maximum bytes produced by each loop is 20, we subtract 19
1328 from DST_END to assure overflow checking is necessary only at the
1329 head of loop. */
1330 unsigned char *adjusted_dst_end = dst_end - 19;
1331
1332 while (src < src_end && dst < adjusted_dst_end)
1333 {
1334 /* SRC_BASE remembers the start position in source in each loop.
1335 The loop will be exited when there's not enough source text
1336 to analyze multi-byte codes (within macros ONE_MORE_BYTE,
1337 TWO_MORE_BYTES, and THREE_MORE_BYTES). In that case, SRC is
1338 reset to SRC_BASE before exiting. */
1339 unsigned char *src_base = src;
1340 unsigned char c1, c2, c3, c4;
1341 int charset;
1342
1343 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL
1344 && CODING_SPEC_ISO_BOL (coding))
1345 {
1346 /* We have to produce destination sequences now. */
1347 if (encode_designation_at_bol (coding, src, src_end, &dst) < 0)
1348 /* We can't find end of line in the current block. Let's
1349 repeat encoding starting from the current position
1350 pointed by SRC. */
1351 break;
1352 CODING_SPEC_ISO_BOL (coding) = 0;
1353 }
1354
1355 c1 = *src++;
1356 /* If we are seeing a component of a composite character, we are
1357 seeing a leading-code specially encoded for composition, or a
1358 composition rule if composing with rule. We must set C1
1359 to a normal leading-code or an ASCII code. If we are not at
1360 a composed character, we must reset the composition state. */
1361 if (COMPOSING_P (coding->composing))
1362 {
1363 if (c1 < 0xA0)
1364 {
1365 /* We are not in a composite character any longer. */
1366 coding->composing = COMPOSING_NO;
1367 ENCODE_COMPOSITION_END;
1368 }
1369 else
1370 {
1371 if (coding->composing == COMPOSING_WITH_RULE_RULE)
1372 {
1373 *dst++ = c1 & 0x7F;
1374 coding->composing = COMPOSING_WITH_RULE_HEAD;
1375 continue;
1376 }
1377 else if (coding->composing == COMPOSING_WITH_RULE_HEAD)
1378 coding->composing = COMPOSING_WITH_RULE_RULE;
1379 if (c1 == 0xA0)
1380 {
1381 /* This is an ASCII component. */
1382 ONE_MORE_BYTE (c1);
1383 c1 &= 0x7F;
1384 }
1385 else
1386 /* This is a leading-code of non ASCII component. */
1387 c1 -= 0x20;
1388 }
1389 }
1390
1391 /* Now encode one character. C1 is a control character, an
1392 ASCII character, or a leading-code of multi-byte character. */
1393 switch (emacs_code_class[c1])
1394 {
1395 case EMACS_ascii_code:
1396 ENCODE_ISO_CHARACTER_DIMENSION1 (CHARSET_ASCII, c1);
1397 break;
1398
1399 case EMACS_control_code:
1400 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
1401 ENCODE_RESET_PLANE_AND_REGISTER;
1402 *dst++ = c1;
1403 break;
1404
1405 case EMACS_carriage_return_code:
1406 if (!coding->selective)
1407 {
1408 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
1409 ENCODE_RESET_PLANE_AND_REGISTER;
1410 *dst++ = c1;
1411 break;
1412 }
1413 /* fall down to treat '\r' as '\n' ... */
1414
1415 case EMACS_linefeed_code:
1416 if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
1417 ENCODE_RESET_PLANE_AND_REGISTER;
1418 if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL)
1419 bcopy (coding->spec.iso2022.initial_designation,
1420 coding->spec.iso2022.current_designation,
1421 sizeof coding->spec.iso2022.initial_designation);
1422 if (coding->eol_type == CODING_EOL_LF
1423 || coding->eol_type == CODING_EOL_AUTOMATIC)
1424 *dst++ = ISO_CODE_LF;
1425 else if (coding->eol_type == CODING_EOL_CRLF)
1426 *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
1427 else
1428 *dst++ = ISO_CODE_CR;
1429 CODING_SPEC_ISO_BOL (coding) = 1;
1430 break;
1431
1432 case EMACS_leading_code_2:
1433 ONE_MORE_BYTE (c2);
1434 ENCODE_ISO_CHARACTER_DIMENSION1 (c1, c2);
1435 break;
1436
1437 case EMACS_leading_code_3:
1438 TWO_MORE_BYTES (c2, c3);
1439 if (c1 < LEADING_CODE_PRIVATE_11)
1440 ENCODE_ISO_CHARACTER_DIMENSION2 (c1, c2, c3);
1441 else
1442 ENCODE_ISO_CHARACTER_DIMENSION1 (c2, c3);
1443 break;
1444
1445 case EMACS_leading_code_4:
1446 THREE_MORE_BYTES (c2, c3, c4);
1447 ENCODE_ISO_CHARACTER_DIMENSION2 (c2, c3, c4);
1448 break;
1449
1450 case EMACS_leading_code_composition:
1451 ONE_MORE_BYTE (c1);
1452 if (c1 == 0xFF)
1453 {
1454 coding->composing = COMPOSING_WITH_RULE_HEAD;
1455 ENCODE_COMPOSITION_WITH_RULE_START;
1456 }
1457 else
1458 {
1459 /* Rewind one byte because it is a character code of
1460 composition elements. */
1461 src--;
1462 coding->composing = COMPOSING_NO_RULE_HEAD;
1463 ENCODE_COMPOSITION_NO_RULE_START;
1464 }
1465 break;
1466
1467 case EMACS_invalid_code:
1468 *dst++ = c1;
1469 break;
1470 }
1471 continue;
1472 label_end_of_loop:
1473 coding->carryover_size = src - src_base;
1474 bcopy (src_base, coding->carryover, coding->carryover_size);
1475 src = src_base;
1476 break;
1477 }
1478
1479 /* If this is the last block of the text to be encoded, we must
1480 reset the state of graphic planes and registers to initial one.
1481 In addition, we had better just flush out all remaining codes in
1482 the text although they are not valid characters. */
1483 if (coding->last_block)
1484 {
1485 ENCODE_RESET_PLANE_AND_REGISTER;
1486 bcopy(src, dst, src_end - src);
1487 dst += (src_end - src);
1488 src = src_end;
1489 }
1490 *consumed = src - source;
1491 return dst - destination;
1492 }
1493
1494 \f
1495 /*** 4. SJIS and BIG5 handlers ***/
1496
1497 /* Although SJIS and BIG5 are not ISO's coding system, They are used
1498 quite widely. So, for the moment, Emacs supports them in the bare
1499 C code. But, in the future, they may be supported only by CCL. */
1500
1501 /* SJIS is a coding system encoding three character sets: ASCII, right
1502 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
1503 as is. A character of charset katakana-jisx0201 is encoded by
1504 "position-code + 0x80". A character of charset japanese-jisx0208
1505 is encoded in 2-byte but two position-codes are divided and shifted
1506 so that it fit in the range below.
1507
1508 --- CODE RANGE of SJIS ---
1509 (character set) (range)
1510 ASCII 0x00 .. 0x7F
1511 KATAKANA-JISX0201 0xA0 .. 0xDF
1512 JISX0208 (1st byte) 0x80 .. 0x9F and 0xE0 .. 0xFF
1513 (2nd byte) 0x40 .. 0xFF
1514 -------------------------------
1515
1516 */
1517
1518 /* BIG5 is a coding system encoding two character sets: ASCII and
1519 Big5. An ASCII character is encoded as is. Big5 is a two-byte
1520 character set and is encoded in two-byte.
1521
1522 --- CODE RANGE of BIG5 ---
1523 (character set) (range)
1524 ASCII 0x00 .. 0x7F
1525 Big5 (1st byte) 0xA1 .. 0xFE
1526 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
1527 --------------------------
1528
1529 Since the number of characters in Big5 is larger than maximum
1530 characters in Emacs' charset (96x96), it can't be handled as one
1531 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
1532 and `charset-big5-2'. Both are DIMENSION2 and CHARS94. The former
1533 contains frequently used characters and the latter contains less
1534 frequently used characters. */
1535
1536 /* Macros to decode or encode a character of Big5 in BIG5. B1 and B2
1537 are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
1538 C1 and C2 are the 1st and 2nd position-codes of of Emacs' internal
1539 format. CHARSET is `charset_big5_1' or `charset_big5_2'. */
1540
1541 /* Number of Big5 characters which have the same code in 1st byte. */
1542 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
1543
1544 #define DECODE_BIG5(b1, b2, charset, c1, c2) \
1545 do { \
1546 unsigned int temp \
1547 = (b1 - 0xA1) * BIG5_SAME_ROW + b2 - (b2 < 0x7F ? 0x40 : 0x62); \
1548 if (b1 < 0xC9) \
1549 charset = charset_big5_1; \
1550 else \
1551 { \
1552 charset = charset_big5_2; \
1553 temp -= (0xC9 - 0xA1) * BIG5_SAME_ROW; \
1554 } \
1555 c1 = temp / (0xFF - 0xA1) + 0x21; \
1556 c2 = temp % (0xFF - 0xA1) + 0x21; \
1557 } while (0)
1558
1559 #define ENCODE_BIG5(charset, c1, c2, b1, b2) \
1560 do { \
1561 unsigned int temp = (c1 - 0x21) * (0xFF - 0xA1) + (c2 - 0x21); \
1562 if (charset == charset_big5_2) \
1563 temp += BIG5_SAME_ROW * (0xC9 - 0xA1); \
1564 b1 = temp / BIG5_SAME_ROW + 0xA1; \
1565 b2 = temp % BIG5_SAME_ROW; \
1566 b2 += b2 < 0x3F ? 0x40 : 0x62; \
1567 } while (0)
1568
1569 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1570 Check if a text is encoded in SJIS. If it is, return
1571 CODING_CATEGORY_MASK_SJIS, else return 0. */
1572
1573 int
1574 detect_coding_sjis (src, src_end)
1575 unsigned char *src, *src_end;
1576 {
1577 unsigned char c;
1578
1579 while (src < src_end)
1580 {
1581 c = *src++;
1582 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
1583 return 0;
1584 if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
1585 {
1586 if (src < src_end && *src++ < 0x40)
1587 return 0;
1588 }
1589 }
1590 return CODING_CATEGORY_MASK_SJIS;
1591 }
1592
1593 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1594 Check if a text is encoded in BIG5. If it is, return
1595 CODING_CATEGORY_MASK_BIG5, else return 0. */
1596
1597 int
1598 detect_coding_big5 (src, src_end)
1599 unsigned char *src, *src_end;
1600 {
1601 unsigned char c;
1602
1603 while (src < src_end)
1604 {
1605 c = *src++;
1606 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
1607 return 0;
1608 if (c >= 0xA1)
1609 {
1610 if (src >= src_end)
1611 break;
1612 c = *src++;
1613 if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
1614 return 0;
1615 }
1616 }
1617 return CODING_CATEGORY_MASK_BIG5;
1618 }
1619
1620 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
1621 If SJIS_P is 1, decode SJIS text, else decode BIG5 test. */
1622
1623 int
1624 decode_coding_sjis_big5 (coding, source, destination,
1625 src_bytes, dst_bytes, consumed, sjis_p)
1626 struct coding_system *coding;
1627 unsigned char *source, *destination;
1628 int src_bytes, dst_bytes;
1629 int *consumed;
1630 int sjis_p;
1631 {
1632 unsigned char *src = source;
1633 unsigned char *src_end = source + src_bytes;
1634 unsigned char *dst = destination;
1635 unsigned char *dst_end = destination + dst_bytes;
1636 /* Since the maximum bytes produced by each loop is 4, we subtract 3
1637 from DST_END to assure overflow checking is necessary only at the
1638 head of loop. */
1639 unsigned char *adjusted_dst_end = dst_end - 3;
1640
1641 while (src < src_end && dst < adjusted_dst_end)
1642 {
1643 /* SRC_BASE remembers the start position in source in each loop.
1644 The loop will be exited when there's not enough source text
1645 to analyze two-byte character (within macro ONE_MORE_BYTE).
1646 In that case, SRC is reset to SRC_BASE before exiting. */
1647 unsigned char *src_base = src;
1648 unsigned char c1 = *src++, c2, c3, c4;
1649
1650 if (c1 == '\r')
1651 {
1652 if (coding->eol_type == CODING_EOL_CRLF)
1653 {
1654 ONE_MORE_BYTE (c2);
1655 if (c2 == '\n')
1656 *dst++ = c2;
1657 else
1658 /* To process C2 again, SRC is subtracted by 1. */
1659 *dst++ = c1, src--;
1660 }
1661 else
1662 *dst++ = c1;
1663 }
1664 else if (c1 < 0x80)
1665 *dst++ = c1;
1666 else if (c1 < 0xA0 || c1 >= 0xE0)
1667 {
1668 /* SJIS -> JISX0208, BIG5 -> Big5 (only if 0xE0 <= c1 < 0xFF) */
1669 if (sjis_p)
1670 {
1671 ONE_MORE_BYTE (c2);
1672 DECODE_SJIS (c1, c2, c3, c4);
1673 DECODE_CHARACTER_DIMENSION2 (charset_jisx0208, c3, c4);
1674 }
1675 else if (c1 >= 0xE0 && c1 < 0xFF)
1676 {
1677 int charset;
1678
1679 ONE_MORE_BYTE (c2);
1680 DECODE_BIG5 (c1, c2, charset, c3, c4);
1681 DECODE_CHARACTER_DIMENSION2 (charset, c3, c4);
1682 }
1683 else /* Invalid code */
1684 *dst++ = c1;
1685 }
1686 else
1687 {
1688 /* SJIS -> JISX0201-Kana, BIG5 -> Big5 */
1689 if (sjis_p)
1690 DECODE_CHARACTER_DIMENSION1 (charset_katakana_jisx0201, c1);
1691 else
1692 {
1693 int charset;
1694
1695 ONE_MORE_BYTE (c2);
1696 DECODE_BIG5 (c1, c2, charset, c3, c4);
1697 DECODE_CHARACTER_DIMENSION2 (charset, c3, c4);
1698 }
1699 }
1700 continue;
1701
1702 label_end_of_loop:
1703 coding->carryover_size = src - src_base;
1704 bcopy (src_base, coding->carryover, coding->carryover_size);
1705 src = src_base;
1706 break;
1707 }
1708
1709 *consumed = src - source;
1710 return dst - destination;
1711 }
1712
1713 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
1714 This function can encode `charset_ascii', `charset_katakana_jisx0201',
1715 `charset_jisx0208', `charset_big5_1', and `charset_big5-2'. We are
1716 sure that all these charsets are registered as official charset
1717 (i.e. do not have extended leading-codes). Characters of other
1718 charsets are produced without any encoding. If SJIS_P is 1, encode
1719 SJIS text, else encode BIG5 text. */
1720
1721 int
1722 encode_coding_sjis_big5 (coding, source, destination,
1723 src_bytes, dst_bytes, consumed, sjis_p)
1724 struct coding_system *coding;
1725 unsigned char *source, *destination;
1726 int src_bytes, dst_bytes;
1727 int *consumed;
1728 int sjis_p;
1729 {
1730 unsigned char *src = source;
1731 unsigned char *src_end = source + src_bytes;
1732 unsigned char *dst = destination;
1733 unsigned char *dst_end = destination + dst_bytes;
1734 /* Since the maximum bytes produced by each loop is 2, we subtract 1
1735 from DST_END to assure overflow checking is necessary only at the
1736 head of loop. */
1737 unsigned char *adjusted_dst_end = dst_end - 1;
1738
1739 while (src < src_end && dst < adjusted_dst_end)
1740 {
1741 /* SRC_BASE remembers the start position in source in each loop.
1742 The loop will be exited when there's not enough source text
1743 to analyze multi-byte codes (within macros ONE_MORE_BYTE and
1744 TWO_MORE_BYTES). In that case, SRC is reset to SRC_BASE
1745 before exiting. */
1746 unsigned char *src_base = src;
1747 unsigned char c1 = *src++, c2, c3, c4;
1748
1749 if (coding->composing)
1750 {
1751 if (c1 == 0xA0)
1752 {
1753 ONE_MORE_BYTE (c1);
1754 c1 &= 0x7F;
1755 }
1756 else if (c1 >= 0xA0)
1757 c1 -= 0x20;
1758 else
1759 coding->composing = 0;
1760 }
1761
1762 switch (emacs_code_class[c1])
1763 {
1764 case EMACS_ascii_code:
1765 case EMACS_control_code:
1766 *dst++ = c1;
1767 break;
1768
1769 case EMACS_carriage_return_code:
1770 if (!coding->selective)
1771 {
1772 *dst++ = c1;
1773 break;
1774 }
1775 /* fall down to treat '\r' as '\n' ... */
1776
1777 case EMACS_linefeed_code:
1778 if (coding->eol_type == CODING_EOL_LF
1779 || coding->eol_type == CODING_EOL_AUTOMATIC)
1780 *dst++ = '\n';
1781 else if (coding->eol_type == CODING_EOL_CRLF)
1782 *dst++ = '\r', *dst++ = '\n';
1783 else
1784 *dst++ = '\r';
1785 break;
1786
1787 case EMACS_leading_code_2:
1788 ONE_MORE_BYTE (c2);
1789 if (sjis_p && c1 == charset_katakana_jisx0201)
1790 *dst++ = c2;
1791 else
1792 *dst++ = c1, *dst++ = c2;
1793 break;
1794
1795 case EMACS_leading_code_3:
1796 TWO_MORE_BYTES (c2, c3);
1797 c2 &= 0x7F, c3 &= 0x7F;
1798 if (sjis_p && c1 == charset_jisx0208)
1799 {
1800 unsigned char s1, s2;
1801
1802 ENCODE_SJIS (c2, c3, s1, s2);
1803 *dst++ = s1, *dst++ = s2;
1804 }
1805 else if (!sjis_p && (c1 == charset_big5_1 || c1 == charset_big5_2))
1806 {
1807 unsigned char b1, b2;
1808
1809 ENCODE_BIG5 (c1, c2, c3, b1, b2);
1810 *dst++ = b1, *dst++ = b2;
1811 }
1812 else
1813 *dst++ = c1, *dst++ = c2, *dst++ = c3;
1814 break;
1815
1816 case EMACS_leading_code_4:
1817 THREE_MORE_BYTES (c2, c3, c4);
1818 *dst++ = c1, *dst++ = c2, *dst++ = c3, *dst++ = c4;
1819 break;
1820
1821 case EMACS_leading_code_composition:
1822 coding->composing = 1;
1823 break;
1824
1825 default: /* i.e. case EMACS_invalid_code: */
1826 *dst++ = c1;
1827 }
1828 continue;
1829
1830 label_end_of_loop:
1831 coding->carryover_size = src - src_base;
1832 bcopy (src_base, coding->carryover, coding->carryover_size);
1833 src = src_base;
1834 break;
1835 }
1836
1837 *consumed = src - source;
1838 return dst - destination;
1839 }
1840
1841 \f
1842 /*** 5. End-of-line handlers ***/
1843
1844 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
1845 This function is called only when `coding->eol_type' is
1846 CODING_EOL_CRLF or CODING_EOL_CR. */
1847
1848 decode_eol (coding, source, destination, src_bytes, dst_bytes, consumed)
1849 struct coding_system *coding;
1850 unsigned char *source, *destination;
1851 int src_bytes, dst_bytes;
1852 int *consumed;
1853 {
1854 unsigned char *src = source;
1855 unsigned char *src_end = source + src_bytes;
1856 unsigned char *dst = destination;
1857 unsigned char *dst_end = destination + dst_bytes;
1858 int produced;
1859
1860 switch (coding->eol_type)
1861 {
1862 case CODING_EOL_CRLF:
1863 {
1864 /* Since the maximum bytes produced by each loop is 2, we
1865 subtract 1 from DST_END to assure overflow checking is
1866 necessary only at the head of loop. */
1867 unsigned char *adjusted_dst_end = dst_end - 1;
1868
1869 while (src < src_end && dst < adjusted_dst_end)
1870 {
1871 unsigned char *src_base = src;
1872 unsigned char c = *src++;
1873 if (c == '\r')
1874 {
1875 ONE_MORE_BYTE (c);
1876 if (c != '\n')
1877 *dst++ = '\r';
1878 *dst++ = c;
1879 }
1880 else
1881 *dst++ = c;
1882 continue;
1883
1884 label_end_of_loop:
1885 coding->carryover_size = src - src_base;
1886 bcopy (src_base, coding->carryover, coding->carryover_size);
1887 src = src_base;
1888 break;
1889 }
1890 *consumed = src - source;
1891 produced = dst - destination;
1892 break;
1893 }
1894
1895 case CODING_EOL_CR:
1896 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
1897 bcopy (source, destination, produced);
1898 dst_end = destination + produced;
1899 while (dst < dst_end)
1900 if (*dst++ == '\r') dst[-1] = '\n';
1901 *consumed = produced;
1902 break;
1903
1904 default: /* i.e. case: CODING_EOL_LF */
1905 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
1906 bcopy (source, destination, produced);
1907 *consumed = produced;
1908 break;
1909 }
1910
1911 return produced;
1912 }
1913
1914 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions". Encode
1915 format of end-of-line according to `coding->eol_type'. If
1916 `coding->selective' is 1, code '\r' in source text also means
1917 end-of-line. */
1918
1919 encode_eol (coding, source, destination, src_bytes, dst_bytes, consumed)
1920 struct coding_system *coding;
1921 unsigned char *source, *destination;
1922 int src_bytes, dst_bytes;
1923 int *consumed;
1924 {
1925 unsigned char *src = source;
1926 unsigned char *dst = destination;
1927 int produced;
1928
1929 if (src_bytes <= 0)
1930 return 0;
1931
1932 switch (coding->eol_type)
1933 {
1934 case CODING_EOL_LF:
1935 case CODING_EOL_AUTOMATIC:
1936 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
1937 bcopy (source, destination, produced);
1938 if (coding->selective)
1939 {
1940 int i = produced;
1941 while (i--)
1942 if (*dst++ == '\r') dst[-1] = '\n';
1943 }
1944 *consumed = produced;
1945
1946 case CODING_EOL_CRLF:
1947 {
1948 unsigned char c;
1949 unsigned char *src_end = source + src_bytes;
1950 unsigned char *dst_end = destination + dst_bytes;
1951 /* Since the maximum bytes produced by each loop is 2, we
1952 subtract 1 from DST_END to assure overflow checking is
1953 necessary only at the head of loop. */
1954 unsigned char *adjusted_dst_end = dst_end - 1;
1955
1956 while (src < src_end && dst < adjusted_dst_end)
1957 {
1958 c = *src++;
1959 if (c == '\n' || (c == '\r' && coding->selective))
1960 *dst++ = '\r', *dst++ = '\n';
1961 else
1962 *dst++ = c;
1963 }
1964 produced = dst - destination;
1965 *consumed = src - source;
1966 break;
1967 }
1968
1969 default: /* i.e. case CODING_EOL_CR: */
1970 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
1971 bcopy (source, destination, produced);
1972 {
1973 int i = produced;
1974 while (i--)
1975 if (*dst++ == '\n') dst[-1] = '\r';
1976 }
1977 *consumed = produced;
1978 }
1979
1980 return produced;
1981 }
1982
1983 \f
1984 /*** 6. C library functions ***/
1985
1986 /* In Emacs Lisp, coding system is represented by a Lisp symbol which
1987 has a property `coding-system'. The value of this property is a
1988 vector of length 5 (called as coding-vector). Among elements of
1989 this vector, the first (element[0]) and the fifth (element[4])
1990 carry important information for decoding/encoding. Before
1991 decoding/encoding, this information should be set in fields of a
1992 structure of type `coding_system'.
1993
1994 A value of property `coding-system' can be a symbol of another
1995 subsidiary coding-system. In that case, Emacs gets coding-vector
1996 from that symbol.
1997
1998 `element[0]' contains information to be set in `coding->type'. The
1999 value and its meaning is as follows:
2000
2001 0 -- coding_system_internal
2002 1 -- coding_system_sjis
2003 2 -- coding_system_iso2022
2004 3 -- coding_system_big5
2005 4 -- coding_system_ccl
2006 nil -- coding_system_no_conversion
2007 t -- coding_system_automatic
2008
2009 `element[4]' contains information to be set in `coding->flags' and
2010 `coding->spec'. The meaning varies by `coding->type'.
2011
2012 If `coding->type' is `coding_type_iso2022', element[4] is a vector
2013 of length 32 (of which the first 13 sub-elements are used now).
2014 Meanings of these sub-elements are:
2015
2016 sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
2017 If the value is an integer of valid charset, the charset is
2018 assumed to be designated to graphic register N initially.
2019
2020 If the value is minus, it is a minus value of charset which
2021 reserves graphic register N, which means that the charset is
2022 not designated initially but should be designated to graphic
2023 register N just before encoding a character in that charset.
2024
2025 If the value is nil, graphic register N is never used on
2026 encoding.
2027
2028 sub-element[N] where N is 4 through 11: to be set in `coding->flags'
2029 Each value takes t or nil. See the section ISO2022 of
2030 `coding.h' for more information.
2031
2032 If `coding->type' is `coding_type_big5', element[4] is t to denote
2033 BIG5-ETen or nil to denote BIG5-HKU.
2034
2035 If `coding->type' takes the other value, element[4] is ignored.
2036
2037 Emacs Lisp's coding system also carries information about format of
2038 end-of-line in a value of property `eol-type'. If the value is
2039 integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
2040 means CODING_EOL_CR. If it is not integer, it should be a vector
2041 of subsidiary coding systems of which property `eol-type' has one
2042 of above values.
2043
2044 */
2045
2046 /* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
2047 and set it in CODING. If CODING_SYSTEM_SYMBOL is invalid, CODING
2048 is setup so that no conversion is necessary and return -1, else
2049 return 0. */
2050
2051 int
2052 setup_coding_system (coding_system, coding)
2053 Lisp_Object coding_system;
2054 struct coding_system *coding;
2055 {
2056 Lisp_Object type, eol_type;
2057
2058 /* At first, set several fields default values. */
2059 coding->require_flushing = 0;
2060 coding->last_block = 0;
2061 coding->selective = 0;
2062 coding->composing = 0;
2063 coding->direction = 0;
2064 coding->carryover_size = 0;
2065 coding->post_read_conversion = coding->pre_write_conversion = Qnil;
2066
2067 Vlast_coding_system_used = coding->symbol = coding_system;
2068 eol_type = Qnil;
2069 /* Get value of property `coding-system' until we get a vector.
2070 While doing that, also get values of properties
2071 `post-read-conversion', `pre-write-conversion', and `eol-type'. */
2072 while (!NILP (coding_system) && SYMBOLP (coding_system))
2073 {
2074 if (NILP (coding->post_read_conversion))
2075 coding->post_read_conversion = Fget (coding_system,
2076 Qpost_read_conversion);
2077 if (NILP (coding->pre_write_conversion))
2078 coding->pre_write_conversion = Fget (coding_system,
2079 Qpre_write_conversion);
2080 if (NILP (eol_type))
2081 eol_type = Fget (coding_system, Qeol_type);
2082 coding_system = Fget (coding_system, Qcoding_system);
2083 }
2084 if (!VECTORP (coding_system)
2085 || XVECTOR (coding_system)->size != 5)
2086 goto label_invalid_coding_system;
2087
2088 if (VECTORP (eol_type))
2089 coding->eol_type = CODING_EOL_AUTOMATIC;
2090 else if (XFASTINT (eol_type) == 1)
2091 coding->eol_type = CODING_EOL_CRLF;
2092 else if (XFASTINT (eol_type) == 2)
2093 coding->eol_type = CODING_EOL_CR;
2094 else
2095 coding->eol_type = CODING_EOL_LF;
2096
2097 type = XVECTOR (coding_system)->contents[0];
2098 switch (XFASTINT (type))
2099 {
2100 case 0:
2101 coding->type = coding_type_internal;
2102 break;
2103
2104 case 1:
2105 coding->type = coding_type_sjis;
2106 break;
2107
2108 case 2:
2109 coding->type = coding_type_iso2022;
2110 {
2111 Lisp_Object val = XVECTOR (coding_system)->contents[4];
2112 Lisp_Object *flags;
2113 int i, charset, default_reg_bits = 0;
2114
2115 if (!VECTORP (val) || XVECTOR (val)->size != 32)
2116 goto label_invalid_coding_system;
2117
2118 flags = XVECTOR (val)->contents;
2119 coding->flags
2120 = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
2121 | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
2122 | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
2123 | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
2124 | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
2125 | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
2126 | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
2127 | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
2128 | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION)
2129 | (NILP (flags[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL)
2130 | (NILP (flags[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL));
2131
2132 /* Invoke graphic register 0 to plane 0. */
2133 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
2134 /* Invoke graphic register 1 to plane 1 if we can use full 8-bit. */
2135 CODING_SPEC_ISO_INVOCATION (coding, 1)
2136 = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
2137 /* Not single shifting at first. */
2138 CODING_SPEC_ISO_SINGLE_SHIFTING(coding) = 0;
2139 /* Beginning of buffer should also be regarded as bol. */
2140 CODING_SPEC_ISO_BOL(coding) = 1;
2141
2142 /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
2143 FLAGS[REG] can be one of below:
2144 integer CHARSET: CHARSET occupies register I,
2145 t: designate nothing to REG initially, but can be used
2146 by any charsets,
2147 list of integer, nil, or t: designate the first
2148 element (if integer) to REG initially, the remaining
2149 elements (if integer) is designated to REG on request,
2150 if an element is t, REG can be used by any charset,
2151 nil: REG is never used. */
2152 for (charset = 0; charset <= MAX_CHARSET; charset++)
2153 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = -1;
2154 for (i = 0; i < 4; i++)
2155 {
2156 if (INTEGERP (flags[i])
2157 && (charset = XINT (flags[i]), CHARSET_VALID_P (charset))
2158 || (charset = get_charset_id (flags[i])) >= 0)
2159 {
2160 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
2161 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
2162 }
2163 else if (EQ (flags[i], Qt))
2164 {
2165 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2166 default_reg_bits |= 1 << i;
2167 }
2168 else if (CONSP (flags[i]))
2169 {
2170 Lisp_Object tail = flags[i];
2171
2172 if (INTEGERP (XCONS (tail)->car)
2173 && (charset = XINT (XCONS (tail)->car),
2174 CHARSET_VALID_P (charset))
2175 || (charset = get_charset_id (XCONS (tail)->car)) >= 0)
2176 {
2177 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
2178 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
2179 }
2180 else
2181 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2182 tail = XCONS (tail)->cdr;
2183 while (CONSP (tail))
2184 {
2185 if (INTEGERP (XCONS (tail)->car)
2186 && (charset = XINT (XCONS (tail)->car),
2187 CHARSET_VALID_P (charset))
2188 || (charset = get_charset_id (XCONS (tail)->car)) >= 0)
2189 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2190 = i;
2191 else if (EQ (XCONS (tail)->car, Qt))
2192 default_reg_bits |= 1 << i;
2193 tail = XCONS (tail)->cdr;
2194 }
2195 }
2196 else
2197 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2198
2199 CODING_SPEC_ISO_DESIGNATION (coding, i)
2200 = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
2201 }
2202
2203 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
2204 {
2205 /* REG 1 can be used only by locking shift in 7-bit env. */
2206 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
2207 default_reg_bits &= ~2;
2208 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
2209 /* Without any shifting, only REG 0 and 1 can be used. */
2210 default_reg_bits &= 3;
2211 }
2212
2213 for (charset = 0; charset <= MAX_CHARSET; charset++)
2214 if (CHARSET_VALID_P (charset)
2215 && CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) < 0)
2216 {
2217 /* We have not yet decided where to designate CHARSET. */
2218 int reg_bits = default_reg_bits;
2219
2220 if (CHARSET_CHARS (charset) == 96)
2221 /* A charset of CHARS96 can't be designated to REG 0. */
2222 reg_bits &= ~1;
2223
2224 if (reg_bits)
2225 /* There exist some default graphic register. */
2226 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2227 = (reg_bits & 1
2228 ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
2229 else
2230 /* We anyway have to designate CHARSET to somewhere. */
2231 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2232 = (CHARSET_CHARS (charset) == 94
2233 ? 0
2234 : ((coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT
2235 || ! coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
2236 ? 1
2237 : (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT
2238 ? 2 : 0)));
2239 }
2240 }
2241 coding->require_flushing = 1;
2242 break;
2243
2244 case 3:
2245 coding->type = coding_type_big5;
2246 coding->flags
2247 = (NILP (XVECTOR (coding_system)->contents[4])
2248 ? CODING_FLAG_BIG5_HKU
2249 : CODING_FLAG_BIG5_ETEN);
2250 break;
2251
2252 case 4:
2253 coding->type = coding_type_ccl;
2254 {
2255 Lisp_Object val = XVECTOR (coding_system)->contents[4];
2256 if (CONSP (val)
2257 && VECTORP (XCONS (val)->car)
2258 && VECTORP (XCONS (val)->cdr))
2259 {
2260 setup_ccl_program (&(coding->spec.ccl.decoder), XCONS (val)->car);
2261 setup_ccl_program (&(coding->spec.ccl.encoder), XCONS (val)->cdr);
2262 }
2263 else
2264 goto label_invalid_coding_system;
2265 }
2266 coding->require_flushing = 1;
2267 break;
2268
2269 default:
2270 if (EQ (type, Qt))
2271 coding->type = coding_type_automatic;
2272 else
2273 coding->type = coding_type_no_conversion;
2274 break;
2275 }
2276 return 0;
2277
2278 label_invalid_coding_system:
2279 coding->type = coding_type_no_conversion;
2280 coding->eol_type = CODING_EOL_LF;
2281 coding->symbol = coding->pre_write_conversion = coding->post_read_conversion
2282 = Qnil;
2283 return -1;
2284 }
2285
2286 /* Emacs has a mechanism to automatically detect a coding system if it
2287 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
2288 it's impossible to distinguish some coding systems accurately
2289 because they use the same range of codes. So, at first, coding
2290 systems are categorized into 7, those are:
2291
2292 o coding-category-internal
2293
2294 The category for a coding system which has the same code range
2295 as Emacs' internal format. Assigned the coding-system (Lisp
2296 symbol) `internal' by default.
2297
2298 o coding-category-sjis
2299
2300 The category for a coding system which has the same code range
2301 as SJIS. Assigned the coding-system (Lisp
2302 symbol) `shift-jis' by default.
2303
2304 o coding-category-iso-7
2305
2306 The category for a coding system which has the same code range
2307 as ISO2022 of 7-bit environment. Assigned the coding-system
2308 (Lisp symbol) `iso-2022-7' by default.
2309
2310 o coding-category-iso-8-1
2311
2312 The category for a coding system which has the same code range
2313 as ISO2022 of 8-bit environment and graphic plane 1 used only
2314 for DIMENSION1 charset. Assigned the coding-system (Lisp
2315 symbol) `iso-8859-1' by default.
2316
2317 o coding-category-iso-8-2
2318
2319 The category for a coding system which has the same code range
2320 as ISO2022 of 8-bit environment and graphic plane 1 used only
2321 for DIMENSION2 charset. Assigned the coding-system (Lisp
2322 symbol) `euc-japan' by default.
2323
2324 o coding-category-iso-else
2325
2326 The category for a coding system which has the same code range
2327 as ISO2022 but not belongs to any of the above three
2328 categories. Assigned the coding-system (Lisp symbol)
2329 `iso-2022-ss2-7' by default.
2330
2331 o coding-category-big5
2332
2333 The category for a coding system which has the same code range
2334 as BIG5. Assigned the coding-system (Lisp symbol)
2335 `cn-big5' by default.
2336
2337 o coding-category-binary
2338
2339 The category for a coding system not categorized in any of the
2340 above. Assigned the coding-system (Lisp symbol)
2341 `no-conversion' by default.
2342
2343 Each of them is a Lisp symbol and the value is an actual
2344 `coding-system's (this is also a Lisp symbol) assigned by a user.
2345 What Emacs does actually is to detect a category of coding system.
2346 Then, it uses a `coding-system' assigned to it. If Emacs can't
2347 decide only one possible category, it selects a category of the
2348 highest priority. Priorities of categories are also specified by a
2349 user in a Lisp variable `coding-category-list'.
2350
2351 */
2352
2353 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
2354 If it detects possible coding systems, return an integer in which
2355 appropriate flag bits are set. Flag bits are defined by macros
2356 CODING_CATEGORY_MASK_XXX in `coding.h'. */
2357
2358 int
2359 detect_coding_mask (src, src_bytes)
2360 unsigned char *src;
2361 int src_bytes;
2362 {
2363 register unsigned char c;
2364 unsigned char *src_end = src + src_bytes;
2365 int mask;
2366
2367 /* At first, skip all ASCII characters and control characters except
2368 for three ISO2022 specific control characters. */
2369 label_loop_detect_coding:
2370 while (src < src_end)
2371 {
2372 c = *src;
2373 if (c >= 0x80
2374 || (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
2375 break;
2376 src++;
2377 }
2378
2379 if (src >= src_end)
2380 /* We found nothing other than ASCII. There's nothing to do. */
2381 return CODING_CATEGORY_MASK_ANY;
2382
2383 /* The text seems to be encoded in some multilingual coding system.
2384 Now, try to find in which coding system the text is encoded. */
2385 if (c < 0x80)
2386 {
2387 /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
2388 /* C is an ISO2022 specific control code of C0. */
2389 mask = detect_coding_iso2022 (src, src_end);
2390 src++;
2391 if (mask == CODING_CATEGORY_MASK_ANY)
2392 /* No valid ISO2022 code follows C. Try again. */
2393 goto label_loop_detect_coding;
2394 }
2395 else if (c == ISO_CODE_SS2 || c == ISO_CODE_SS3 || c == ISO_CODE_CSI)
2396 /* C is an ISO2022 specific control code of C1,
2397 or the first byte of SJIS's 2-byte character code,
2398 or a leading code of Emacs. */
2399 mask = (detect_coding_iso2022 (src, src_end)
2400 | detect_coding_sjis (src, src_end)
2401 | detect_coding_internal (src, src_end));
2402
2403 else if (c < 0xA0)
2404 /* C is the first byte of SJIS character code,
2405 or a leading-code of Emacs. */
2406 mask = (detect_coding_sjis (src, src_end)
2407 | detect_coding_internal (src, src_end));
2408
2409 else
2410 /* C is a character of ISO2022 in graphic plane right,
2411 or a SJIS's 1-byte character code (i.e. JISX0201),
2412 or the first byte of BIG5's 2-byte code. */
2413 mask = (detect_coding_iso2022 (src, src_end)
2414 | detect_coding_sjis (src, src_end)
2415 | detect_coding_big5 (src, src_end));
2416
2417 return mask;
2418 }
2419
2420 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
2421 The information of the detected coding system is set in CODING. */
2422
2423 void
2424 detect_coding (coding, src, src_bytes)
2425 struct coding_system *coding;
2426 unsigned char *src;
2427 int src_bytes;
2428 {
2429 int mask = detect_coding_mask (src, src_bytes);
2430 int idx;
2431
2432 if (mask == CODING_CATEGORY_MASK_ANY)
2433 /* We found nothing other than ASCII. There's nothing to do. */
2434 return;
2435
2436 if (!mask)
2437 /* The source text seems to be encoded in unknown coding system.
2438 Emacs regards the category of such a kind of coding system as
2439 `coding-category-binary'. We assume that a user has assigned
2440 an appropriate coding system for a `coding-category-binary'. */
2441 idx = CODING_CATEGORY_IDX_BINARY;
2442 else
2443 {
2444 /* We found some plausible coding systems. Let's use a coding
2445 system of the highest priority. */
2446 Lisp_Object val = Vcoding_category_list;
2447
2448 if (CONSP (val))
2449 while (!NILP (val))
2450 {
2451 idx = XFASTINT (Fget (XCONS (val)->car, Qcoding_category_index));
2452 if ((idx < CODING_CATEGORY_IDX_MAX) && (mask & (1 << idx)))
2453 break;
2454 val = XCONS (val)->cdr;
2455 }
2456 else
2457 val = Qnil;
2458
2459 if (NILP (val))
2460 {
2461 /* For unknown reason, `Vcoding_category_list' contains none
2462 of found categories. Let's use any of them. */
2463 for (idx = 0; idx < CODING_CATEGORY_IDX_MAX; idx++)
2464 if (mask & (1 << idx))
2465 break;
2466 }
2467 }
2468 setup_coding_system (XSYMBOL (coding_category_table[idx])->value, coding);
2469 }
2470
2471 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
2472 is encoded. Return one of CODING_EOL_LF, CODING_EOL_CRLF,
2473 CODING_EOL_CR, and CODING_EOL_AUTOMATIC. */
2474
2475 int
2476 detect_eol_type (src, src_bytes)
2477 unsigned char *src;
2478 int src_bytes;
2479 {
2480 unsigned char *src_end = src + src_bytes;
2481 unsigned char c;
2482
2483 while (src < src_end)
2484 {
2485 c = *src++;
2486 if (c == '\n')
2487 return CODING_EOL_LF;
2488 else if (c == '\r')
2489 {
2490 if (src < src_end && *src == '\n')
2491 return CODING_EOL_CRLF;
2492 else
2493 return CODING_EOL_CR;
2494 }
2495 }
2496 return CODING_EOL_AUTOMATIC;
2497 }
2498
2499 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
2500 is encoded. If it detects an appropriate format of end-of-line, it
2501 sets the information in *CODING. */
2502
2503 void
2504 detect_eol (coding, src, src_bytes)
2505 struct coding_system *coding;
2506 unsigned char *src;
2507 int src_bytes;
2508 {
2509 Lisp_Object val;
2510 int eol_type = detect_eol_type (src, src_bytes);
2511
2512 if (eol_type == CODING_EOL_AUTOMATIC)
2513 /* We found no end-of-line in the source text. */
2514 return;
2515
2516 val = Fget (coding->symbol, Qeol_type);
2517 if (VECTORP (val) && XVECTOR (val)->size == 3)
2518 setup_coding_system (XVECTOR (val)->contents[eol_type], coding);
2519 }
2520
2521 /* See "GENERAL NOTES about `decode_coding_XXX ()' functions". Before
2522 decoding, it may detect coding system and format of end-of-line if
2523 those are not yet decided. */
2524
2525 int
2526 decode_coding (coding, source, destination, src_bytes, dst_bytes, consumed)
2527 struct coding_system *coding;
2528 unsigned char *source, *destination;
2529 int src_bytes, dst_bytes;
2530 int *consumed;
2531 {
2532 int produced;
2533
2534 if (src_bytes <= 0)
2535 {
2536 *consumed = 0;
2537 return 0;
2538 }
2539
2540 if (coding->type == coding_type_automatic)
2541 detect_coding (coding, source, src_bytes);
2542
2543 if (coding->eol_type == CODING_EOL_AUTOMATIC)
2544 detect_eol (coding, source, src_bytes);
2545
2546 coding->carryover_size = 0;
2547 switch (coding->type)
2548 {
2549 case coding_type_no_conversion:
2550 label_no_conversion:
2551 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2552 bcopy (source, destination, produced);
2553 *consumed = produced;
2554 break;
2555
2556 case coding_type_internal:
2557 case coding_type_automatic:
2558 if (coding->eol_type == CODING_EOL_LF
2559 || coding->eol_type == CODING_EOL_AUTOMATIC)
2560 goto label_no_conversion;
2561 produced = decode_eol (coding, source, destination,
2562 src_bytes, dst_bytes, consumed);
2563 break;
2564
2565 case coding_type_sjis:
2566 produced = decode_coding_sjis_big5 (coding, source, destination,
2567 src_bytes, dst_bytes, consumed,
2568 1);
2569 break;
2570
2571 case coding_type_iso2022:
2572 produced = decode_coding_iso2022 (coding, source, destination,
2573 src_bytes, dst_bytes, consumed);
2574 break;
2575
2576 case coding_type_big5:
2577 produced = decode_coding_sjis_big5 (coding, source, destination,
2578 src_bytes, dst_bytes, consumed,
2579 0);
2580 break;
2581
2582 case coding_type_ccl:
2583 produced = ccl_driver (&coding->spec.ccl.decoder, source, destination,
2584 src_bytes, dst_bytes, consumed);
2585 break;
2586 }
2587
2588 return produced;
2589 }
2590
2591 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions". */
2592
2593 int
2594 encode_coding (coding, source, destination, src_bytes, dst_bytes, consumed)
2595 struct coding_system *coding;
2596 unsigned char *source, *destination;
2597 int src_bytes, dst_bytes;
2598 int *consumed;
2599 {
2600 int produced;
2601
2602 coding->carryover_size = 0;
2603 switch (coding->type)
2604 {
2605 case coding_type_no_conversion:
2606 label_no_conversion:
2607 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2608 if (produced > 0)
2609 {
2610 bcopy (source, destination, produced);
2611 if (coding->selective)
2612 {
2613 unsigned char *p = destination, *pend = destination + produced;
2614 while (p < pend)
2615 if (*p++ == '\015') p[-1] = '\n';
2616 }
2617 }
2618 *consumed = produced;
2619 break;
2620
2621 case coding_type_internal:
2622 case coding_type_automatic:
2623 if (coding->eol_type == CODING_EOL_LF
2624 || coding->eol_type == CODING_EOL_AUTOMATIC)
2625 goto label_no_conversion;
2626 produced = encode_eol (coding, source, destination,
2627 src_bytes, dst_bytes, consumed);
2628 break;
2629
2630 case coding_type_sjis:
2631 produced = encode_coding_sjis_big5 (coding, source, destination,
2632 src_bytes, dst_bytes, consumed,
2633 1);
2634 break;
2635
2636 case coding_type_iso2022:
2637 produced = encode_coding_iso2022 (coding, source, destination,
2638 src_bytes, dst_bytes, consumed);
2639 break;
2640
2641 case coding_type_big5:
2642 produced = encode_coding_sjis_big5 (coding, source, destination,
2643 src_bytes, dst_bytes, consumed,
2644 0);
2645 break;
2646
2647 case coding_type_ccl:
2648 produced = ccl_driver (&coding->spec.ccl.encoder, source, destination,
2649 src_bytes, dst_bytes, consumed);
2650 break;
2651 }
2652
2653 return produced;
2654 }
2655
2656 #define CONVERSION_BUFFER_EXTRA_ROOM 256
2657
2658 /* Return maximum size (bytes) of a buffer enough for decoding
2659 SRC_BYTES of text encoded in CODING. */
2660
2661 int
2662 decoding_buffer_size (coding, src_bytes)
2663 struct coding_system *coding;
2664 int src_bytes;
2665 {
2666 int magnification;
2667
2668 if (coding->type == coding_type_iso2022)
2669 magnification = 3;
2670 else if (coding->type == coding_type_ccl)
2671 magnification = coding->spec.ccl.decoder.buf_magnification;
2672 else
2673 magnification = 2;
2674
2675 return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
2676 }
2677
2678 /* Return maximum size (bytes) of a buffer enough for encoding
2679 SRC_BYTES of text to CODING. */
2680
2681 int
2682 encoding_buffer_size (coding, src_bytes)
2683 struct coding_system *coding;
2684 int src_bytes;
2685 {
2686 int magnification;
2687
2688 if (coding->type == coding_type_ccl)
2689 magnification = coding->spec.ccl.encoder.buf_magnification;
2690 else
2691 magnification = 3;
2692
2693 return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
2694 }
2695
2696 #ifndef MINIMUM_CONVERSION_BUFFER_SIZE
2697 #define MINIMUM_CONVERSION_BUFFER_SIZE 1024
2698 #endif
2699
2700 char *conversion_buffer;
2701 int conversion_buffer_size;
2702
2703 /* Return a pointer to a SIZE bytes of buffer to be used for encoding
2704 or decoding. Sufficient memory is allocated automatically. If we
2705 run out of memory, return NULL. */
2706
2707 char *
2708 get_conversion_buffer (size)
2709 int size;
2710 {
2711 if (size > conversion_buffer_size)
2712 {
2713 char *buf;
2714 int real_size = conversion_buffer_size * 2;
2715
2716 while (real_size < size) real_size *= 2;
2717 buf = (char *) xmalloc (real_size);
2718 xfree (conversion_buffer);
2719 conversion_buffer = buf;
2720 conversion_buffer_size = real_size;
2721 }
2722 return conversion_buffer;
2723 }
2724
2725 \f
2726 #ifdef emacs
2727 /*** 7. Emacs Lisp library functions ***/
2728
2729 DEFUN ("coding-system-vector", Fcoding_system_vector, Scoding_system_vector,
2730 1, 1, 0,
2731 "Return coding-vector of CODING-SYSTEM.\n\
2732 If CODING-SYSTEM is not a valid coding-system, return nil.")
2733 (obj)
2734 Lisp_Object obj;
2735 {
2736 while (SYMBOLP (obj) && !NILP (obj))
2737 obj = Fget (obj, Qcoding_system);
2738 return ((NILP (obj) || !VECTORP (obj) || XVECTOR (obj)->size != 5)
2739 ? Qnil : obj);
2740 }
2741
2742 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
2743 "Return t if OBJECT is nil or a coding-system.\n\
2744 See document of make-coding-system for coding-system object.")
2745 (obj)
2746 Lisp_Object obj;
2747 {
2748 return ((NILP (obj) || !NILP (Fcoding_system_vector (obj))) ? Qt : Qnil);
2749 }
2750
2751 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
2752 Sread_non_nil_coding_system, 1, 1, 0,
2753 "Read a coding system from the minibuffer, prompting with string PROMPT.")
2754 (prompt)
2755 Lisp_Object prompt;
2756 {
2757 Lisp_Object val;
2758 do
2759 {
2760 val = Fcompleting_read (prompt, Vobarray, Qcoding_system_vector,
2761 Qt, Qnil, Qnil, Qnil);
2762 }
2763 while (XSTRING (val)->size == 0);
2764 return (Fintern (val, Qnil));
2765 }
2766
2767 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 1, 0,
2768 "Read a coding system or nil from the minibuffer, prompting with string PROMPT.")
2769 (prompt)
2770 Lisp_Object prompt;
2771 {
2772 Lisp_Object val = Fcompleting_read (prompt, Vobarray, Qcoding_system_p,
2773 Qt, Qnil, Qnil, Qnil);
2774 return (XSTRING (val)->size == 0 ? Qnil : Fintern (val, Qnil));
2775 }
2776
2777 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
2778 1, 1, 0,
2779 "Check validity of CODING-SYSTEM.\n\
2780 If valid, return CODING-SYSTEM, else `coding-system-error' is signaled.\n\
2781 CODING-SYSTEM is valid if it is a symbol and has \"coding-system\" property.\n\
2782 The value of property should be a vector of length 5.")
2783 (coding_system)
2784 Lisp_Object coding_system;
2785 {
2786 CHECK_SYMBOL (coding_system, 0);
2787 if (!NILP (Fcoding_system_p (coding_system)))
2788 return coding_system;
2789 while (1)
2790 Fsignal (Qcoding_system_error, coding_system);
2791 }
2792
2793 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
2794 2, 2, 0,
2795 "Detect coding-system of the text in the region between START and END.\n\
2796 Return a list of possible coding-systems ordered by priority.\n\
2797 If only ASCII characters are found, it returns `automatic-conversion'\n\
2798 or its subsidiary coding-system according to a detected end-of-line format.")
2799 (b, e)
2800 Lisp_Object b, e;
2801 {
2802 int coding_mask, eol_type;
2803 Lisp_Object val;
2804 int beg, end;
2805
2806 validate_region (&b, &e);
2807 beg = XINT (b), end = XINT (e);
2808 if (beg < GPT && end >= GPT) move_gap (end);
2809
2810 coding_mask = detect_coding_mask (POS_ADDR (beg), end - beg);
2811 eol_type = detect_eol_type (POS_ADDR (beg), end - beg);
2812
2813 if (coding_mask == CODING_CATEGORY_MASK_ANY)
2814 {
2815 val = intern ("automatic-conversion");
2816 if (eol_type != CODING_EOL_AUTOMATIC)
2817 {
2818 Lisp_Object val2 = Fget (val, Qeol_type);
2819 if (VECTORP (val2))
2820 val = XVECTOR (val2)->contents[eol_type];
2821 }
2822 }
2823 else
2824 {
2825 Lisp_Object val2;
2826
2827 /* At first, gather possible coding-systems in VAL in a reverse
2828 order. */
2829 val = Qnil;
2830 for (val2 = Vcoding_category_list;
2831 !NILP (val2);
2832 val2 = XCONS (val2)->cdr)
2833 {
2834 int idx
2835 = XFASTINT (Fget (XCONS (val2)->car, Qcoding_category_index));
2836 if (coding_mask & (1 << idx))
2837 val = Fcons (Fsymbol_value (XCONS (val2)->car), val);
2838 }
2839
2840 /* Then, change the order of the list, while getting subsidiary
2841 coding-systems. */
2842 val2 = val;
2843 val = Qnil;
2844 for (; !NILP (val2); val2 = XCONS (val2)->cdr)
2845 {
2846 if (eol_type == CODING_EOL_AUTOMATIC)
2847 val = Fcons (XCONS (val2)->car, val);
2848 else
2849 {
2850 Lisp_Object val3 = Fget (XCONS (val2)->car, Qeol_type);
2851 if (VECTORP (val3))
2852 val = Fcons (XVECTOR (val3)->contents[eol_type], val);
2853 else
2854 val = Fcons (XCONS (val2)->car, val);
2855 }
2856 }
2857 }
2858
2859 return val;
2860 }
2861
2862 /* Scan text in the region between *BEGP and *ENDP, skip characters
2863 which we never have to encode to (iff ENCODEP is 1) or decode from
2864 coding system CODING at the head and tail, then set BEGP and ENDP
2865 to the addresses of start and end of the text we actually convert. */
2866
2867 void
2868 shrink_conversion_area (begp, endp, coding, encodep)
2869 unsigned char **begp, **endp;
2870 struct coding_system *coding;
2871 int encodep;
2872 {
2873 register unsigned char *beg_addr = *begp, *end_addr = *endp;
2874
2875 if (coding->eol_type != CODING_EOL_LF
2876 && coding->eol_type != CODING_EOL_AUTOMATIC)
2877 /* Since we anyway have to convert end-of-line format, it is not
2878 worth skipping at most 100 bytes or so. */
2879 return;
2880
2881 if (encodep) /* for encoding */
2882 {
2883 switch (coding->type)
2884 {
2885 case coding_type_no_conversion:
2886 case coding_type_internal:
2887 case coding_type_automatic:
2888 /* We need no conversion. */
2889 *begp = *endp;
2890 return;
2891 case coding_type_ccl:
2892 /* We can't skip any data. */
2893 return;
2894 case coding_type_iso2022:
2895 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL)
2896 {
2897 unsigned char *bol = beg_addr;
2898 while (beg_addr < end_addr && *beg_addr < 0x80)
2899 {
2900 beg_addr++;
2901 if (*(beg_addr - 1) == '\n')
2902 bol = beg_addr;
2903 }
2904 beg_addr = bol;
2905 goto label_skip_tail;
2906 }
2907 /* fall down ... */
2908 default:
2909 /* We can skip all ASCII characters at the head and tail. */
2910 while (beg_addr < end_addr && *beg_addr < 0x80) beg_addr++;
2911 label_skip_tail:
2912 while (beg_addr < end_addr && *(end_addr - 1) < 0x80) end_addr--;
2913 break;
2914 }
2915 }
2916 else /* for decoding */
2917 {
2918 switch (coding->type)
2919 {
2920 case coding_type_no_conversion:
2921 /* We need no conversion. */
2922 *begp = *endp;
2923 return;
2924 case coding_type_internal:
2925 if (coding->eol_type == CODING_EOL_LF)
2926 {
2927 /* We need no conversion. */
2928 *begp = *endp;
2929 return;
2930 }
2931 /* We can skip all but carriage-return. */
2932 while (beg_addr < end_addr && *beg_addr != '\r') beg_addr++;
2933 while (beg_addr < end_addr && *(end_addr - 1) != '\r') end_addr--;
2934 break;
2935 case coding_type_sjis:
2936 case coding_type_big5:
2937 /* We can skip all ASCII characters at the head. */
2938 while (beg_addr < end_addr && *beg_addr < 0x80) beg_addr++;
2939 /* We can skip all ASCII characters at the tail except for
2940 the second byte of SJIS or BIG5 code. */
2941 while (beg_addr < end_addr && *(end_addr - 1) < 0x80) end_addr--;
2942 if (end_addr != *endp)
2943 end_addr++;
2944 break;
2945 case coding_type_ccl:
2946 /* We can't skip any data. */
2947 return;
2948 default: /* i.e. case coding_type_iso2022: */
2949 {
2950 unsigned char c;
2951
2952 /* We can skip all ASCII characters except for a few
2953 control codes at the head. */
2954 while (beg_addr < end_addr && (c = *beg_addr) < 0x80
2955 && c != ISO_CODE_CR && c != ISO_CODE_SO
2956 && c != ISO_CODE_SI && c != ISO_CODE_ESC)
2957 beg_addr++;
2958 }
2959 break;
2960 }
2961 }
2962 *begp = beg_addr;
2963 *endp = end_addr;
2964 return;
2965 }
2966
2967 /* Encode to (iff ENCODEP is 1) or decode form coding system CODING a
2968 text between B and E. B and E are buffer position. */
2969
2970 Lisp_Object
2971 code_convert_region (b, e, coding, encodep)
2972 Lisp_Object b, e;
2973 struct coding_system *coding;
2974 int encodep;
2975 {
2976 int beg, end, len, consumed, produced;
2977 char *buf;
2978 unsigned char *begp, *endp;
2979 int pos = PT;
2980
2981 validate_region (&b, &e);
2982 beg = XINT (b), end = XINT (e);
2983 if (beg < GPT && end >= GPT)
2984 move_gap (end);
2985
2986 if (encodep && !NILP (coding->pre_write_conversion))
2987 {
2988 /* We must call a pre-conversion function which may put a new
2989 text to be converted in a new buffer. */
2990 struct buffer *old = current_buffer, *new;
2991
2992 TEMP_SET_PT (beg);
2993 call2 (coding->pre_write_conversion, b, e);
2994 if (old != current_buffer)
2995 {
2996 /* Replace the original text by the text just generated. */
2997 len = ZV - BEGV;
2998 new = current_buffer;
2999 set_buffer_internal (old);
3000 del_range (beg, end);
3001 insert_from_buffer (new, 1, len, 0);
3002 end = beg + len;
3003 }
3004 }
3005
3006 /* We may be able to shrink the conversion region. */
3007 begp = POS_ADDR (beg); endp = begp + (end - beg);
3008 shrink_conversion_area (&begp, &endp, coding, encodep);
3009
3010 if (begp == endp)
3011 /* We need no conversion. */
3012 len = end - beg;
3013 else
3014 {
3015 beg += begp - POS_ADDR (beg);
3016 end = beg + (endp - begp);
3017
3018 if (encodep)
3019 len = encoding_buffer_size (coding, end - beg);
3020 else
3021 len = decoding_buffer_size (coding, end - beg);
3022 buf = get_conversion_buffer (len);
3023
3024 coding->last_block = 1;
3025 produced = (encodep
3026 ? encode_coding (coding, POS_ADDR (beg), buf, end - beg, len,
3027 &consumed)
3028 : decode_coding (coding, POS_ADDR (beg), buf, end - beg, len,
3029 &consumed));
3030
3031 len = produced + (beg - XINT (b)) + (XINT (e) - end);
3032
3033 TEMP_SET_PT (beg);
3034 insert (buf, produced);
3035 del_range (PT, PT + end - beg);
3036 if (pos >= end)
3037 pos = PT + (pos - end);
3038 else if (pos > beg)
3039 pos = beg;
3040 TEMP_SET_PT (pos);
3041 }
3042
3043 if (!encodep && !NILP (coding->post_read_conversion))
3044 {
3045 /* We must call a post-conversion function which may alter
3046 the text just converted. */
3047 Lisp_Object insval;
3048
3049 beg = XINT (b);
3050 TEMP_SET_PT (beg);
3051 insval = call1 (coding->post_read_conversion, make_number (len));
3052 CHECK_NUMBER (insval, 0);
3053 len = XINT (insval);
3054 }
3055
3056 return make_number (len);
3057 }
3058
3059 Lisp_Object
3060 code_convert_string (str, coding, encodep, nocopy)
3061 Lisp_Object str, nocopy;
3062 struct coding_system *coding;
3063 int encodep;
3064 {
3065 int len, consumed, produced;
3066 char *buf;
3067 unsigned char *begp, *endp;
3068 int head_skip, tail_skip;
3069 struct gcpro gcpro1;
3070
3071 if (encodep && !NILP (coding->pre_write_conversion)
3072 || !encodep && !NILP (coding->post_read_conversion))
3073 {
3074 /* Since we have to call Lisp functions which assume target text
3075 is in a buffer, after setting a temporary buffer, call
3076 code_convert_region. */
3077 int count = specpdl_ptr - specpdl;
3078 int len = XSTRING (str)->size;
3079 Lisp_Object result;
3080 struct buffer *old = current_buffer;
3081
3082 record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
3083 temp_output_buffer_setup (" *code-converting-work*");
3084 set_buffer_internal (XBUFFER (Vstandard_output));
3085 insert_from_string (str, 0, len, 0);
3086 code_convert_region (make_number (BEGV), make_number (ZV),
3087 coding, encodep);
3088 result = make_buffer_string (BEGV, ZV, 0);
3089 set_buffer_internal (old);
3090 return unbind_to (count, result);
3091 }
3092
3093 /* We may be able to shrink the conversion region. */
3094 begp = XSTRING (str)->data;
3095 endp = begp + XSTRING (str)->size;
3096 shrink_conversion_area (&begp, &endp, coding, encodep);
3097
3098 if (begp == endp)
3099 /* We need no conversion. */
3100 return (NILP (nocopy) ? Fcopy_sequence (str) : str);
3101
3102 head_skip = begp - XSTRING (str)->data;
3103 tail_skip = XSTRING (str)->size - head_skip - (endp - begp);
3104
3105 GCPRO1 (str);
3106
3107 if (encodep)
3108 len = encoding_buffer_size (coding, endp - begp);
3109 else
3110 len = decoding_buffer_size (coding, endp - begp);
3111 buf = get_conversion_buffer (len + head_skip + tail_skip);
3112
3113 bcopy (XSTRING (str)->data, buf, head_skip);
3114 coding->last_block = 1;
3115 produced = (encodep
3116 ? encode_coding (coding, XSTRING (str)->data + head_skip,
3117 buf + head_skip, endp - begp, len, &consumed)
3118 : decode_coding (coding, XSTRING (str)->data + head_skip,
3119 buf + head_skip, endp - begp, len, &consumed));
3120 bcopy (XSTRING (str)->data + head_skip + (endp - begp),
3121 buf + head_skip + produced,
3122 tail_skip);
3123
3124 UNGCPRO;
3125
3126 return make_string (buf, head_skip + produced + tail_skip);
3127 }
3128
3129 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
3130 3, 3, "r\nzCoding system: ",
3131 "Decode current region by specified coding system.\n\
3132 When called from a program, takes three arguments:\n\
3133 START, END, and CODING-SYSTEM. START END are buffer positions.\n\
3134 Return length of decoded text.")
3135 (b, e, coding_system)
3136 Lisp_Object b, e, coding_system;
3137 {
3138 struct coding_system coding;
3139
3140 CHECK_NUMBER_COERCE_MARKER (b, 0);
3141 CHECK_NUMBER_COERCE_MARKER (e, 1);
3142 CHECK_SYMBOL (coding_system, 2);
3143
3144 if (NILP (coding_system))
3145 return make_number (XFASTINT (e) - XFASTINT (b));
3146 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3147 error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3148
3149 return code_convert_region (b, e, &coding, 0);
3150 }
3151
3152 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
3153 3, 3, "r\nzCoding system: ",
3154 "Encode current region by specified coding system.\n\
3155 When called from a program, takes three arguments:\n\
3156 START, END, and CODING-SYSTEM. START END are buffer positions.\n\
3157 Return length of encoded text.")
3158 (b, e, coding_system)
3159 Lisp_Object b, e, coding_system;
3160 {
3161 struct coding_system coding;
3162
3163 CHECK_NUMBER_COERCE_MARKER (b, 0);
3164 CHECK_NUMBER_COERCE_MARKER (e, 1);
3165 CHECK_SYMBOL (coding_system, 2);
3166
3167 if (NILP (coding_system))
3168 return make_number (XFASTINT (e) - XFASTINT (b));
3169 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3170 error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3171
3172 return code_convert_region (b, e, &coding, 1);
3173 }
3174
3175 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
3176 2, 3, 0,
3177 "Decode STRING which is encoded in CODING-SYSTEM, and return the result.\n\
3178 Optional arg NOCOPY non-nil means return STRING itself if there's no need\n\
3179 of decoding.")
3180 (string, coding_system, nocopy)
3181 Lisp_Object string, coding_system, nocopy;
3182 {
3183 struct coding_system coding;
3184
3185 CHECK_STRING (string, 0);
3186 CHECK_SYMBOL (coding_system, 1);
3187
3188 if (NILP (coding_system))
3189 return (NILP (nocopy) ? Fcopy_sequence (string) : string);
3190 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3191 error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3192
3193 return code_convert_string (string, &coding, 0, nocopy);
3194 }
3195
3196 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
3197 2, 3, 0,
3198 "Encode STRING to CODING-SYSTEM, and return the result.\n\
3199 Optional arg NOCOPY non-nil means return STRING itself if there's no need\n\
3200 of encoding.")
3201 (string, coding_system, nocopy)
3202 Lisp_Object string, coding_system, nocopy;
3203 {
3204 struct coding_system coding;
3205
3206 CHECK_STRING (string, 0);
3207 CHECK_SYMBOL (coding_system, 1);
3208
3209 if (NILP (coding_system))
3210 return (NILP (nocopy) ? Fcopy_sequence (string) : string);
3211 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3212 error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3213
3214 return code_convert_string (string, &coding, 1, nocopy);
3215 }
3216
3217 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
3218 "Decode a JISX0208 character of shift-jis encoding.\n\
3219 CODE is the character code in SJIS.\n\
3220 Return the corresponding character.")
3221 (code)
3222 Lisp_Object code;
3223 {
3224 unsigned char c1, c2, s1, s2;
3225 Lisp_Object val;
3226
3227 CHECK_NUMBER (code, 0);
3228 s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
3229 DECODE_SJIS (s1, s2, c1, c2);
3230 XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset_jisx0208, c1, c2));
3231 return val;
3232 }
3233
3234 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
3235 "Encode a JISX0208 character CHAR to SJIS coding-system.\n\
3236 Return the corresponding character code in SJIS.")
3237 (ch)
3238 Lisp_Object ch;
3239 {
3240 int charset, c1, c2, s1, s2;
3241 Lisp_Object val;
3242
3243 CHECK_NUMBER (ch, 0);
3244 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
3245 if (charset == charset_jisx0208)
3246 {
3247 ENCODE_SJIS (c1, c2, s1, s2);
3248 XSETFASTINT (val, (s1 << 8) | s2);
3249 }
3250 else
3251 XSETFASTINT (val, 0);
3252 return val;
3253 }
3254
3255 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
3256 "Decode a Big5 character CODE of BIG5 coding-system.\n\
3257 CODE is the character code in BIG5.\n\
3258 Return the corresponding character.")
3259 (code)
3260 Lisp_Object code;
3261 {
3262 int charset;
3263 unsigned char b1, b2, c1, c2;
3264 Lisp_Object val;
3265
3266 CHECK_NUMBER (code, 0);
3267 b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF;
3268 DECODE_BIG5 (b1, b2, charset, c1, c2);
3269 XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset, c1, c2));
3270 return val;
3271 }
3272
3273 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
3274 "Encode the Big5 character CHAR to BIG5 coding-system.\n\
3275 Return the corresponding character code in Big5.")
3276 (ch)
3277 Lisp_Object ch;
3278 {
3279 int charset, c1, c2, b1, b2;
3280 Lisp_Object val;
3281
3282 CHECK_NUMBER (ch, 0);
3283 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
3284 if (charset == charset_big5_1 || charset == charset_big5_2)
3285 {
3286 ENCODE_BIG5 (charset, c1, c2, b1, b2);
3287 XSETFASTINT (val, (b1 << 8) | b2);
3288 }
3289 else
3290 XSETFASTINT (val, 0);
3291 return val;
3292 }
3293
3294 DEFUN ("set-terminal-coding-system",
3295 Fset_terminal_coding_system, Sset_terminal_coding_system, 1, 1,
3296 "zCoding-system for terminal display: ",
3297 "Set coding-system of your terminal to CODING-SYSTEM.\n\
3298 All outputs to terminal are encoded to this coding-system.")
3299 (coding_system)
3300 Lisp_Object coding_system;
3301 {
3302 CHECK_SYMBOL (coding_system, 0);
3303 setup_coding_system (Fcheck_coding_system (coding_system), &terminal_coding);
3304 update_mode_lines++;
3305 if (!NILP (Finteractive_p ()))
3306 Fredraw_display ();
3307 return Qnil;
3308 }
3309
3310 DEFUN ("terminal-coding-system",
3311 Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
3312 "Return coding-system of your terminal.")
3313 ()
3314 {
3315 return terminal_coding.symbol;
3316 }
3317
3318 DEFUN ("set-keyboard-coding-system",
3319 Fset_keyboard_coding_system, Sset_keyboard_coding_system, 1, 1,
3320 "zCoding-system for keyboard input: ",
3321 "Set coding-system of what is sent from terminal keyboard to CODING-SYSTEM.\n\
3322 All inputs from terminal are decoded from this coding-system.")
3323 (coding_system)
3324 Lisp_Object coding_system;
3325 {
3326 CHECK_SYMBOL (coding_system, 0);
3327 setup_coding_system (Fcheck_coding_system (coding_system), &keyboard_coding);
3328 return Qnil;
3329 }
3330
3331 DEFUN ("keyboard-coding-system",
3332 Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
3333 "Return coding-system of what is sent from terminal keyboard.")
3334 ()
3335 {
3336 return keyboard_coding.symbol;
3337 }
3338
3339 \f
3340 DEFUN ("find-coding-system", Ffind_coding_system, Sfind_coding_system,
3341 1, MANY, 0,
3342 "Choose a coding system for a file operation based on file name.\n\
3343 The value names a pair of coding systems: (ENCODING-SYSTEM DECODING-SYSTEM).\n\
3344 ENCODING-SYSTEM is the coding system to use for encoding\n\
3345 \(in case OPERATION does encoding), and DECODING-SYSTEM is the coding system\n\
3346 for decoding (in case OPERATION does decoding).\n\
3347 \n\
3348 The first argument OPERATION specifies an I/O primitive:\n\
3349 For file I/O, `insert-file-contents' or `write-region'.\n\
3350 For process I/O, `call-process', `call-process-region', or `start-process'.\n\
3351 For network I/O, `open-network-stream'.\n\
3352 \n\
3353 The remaining arguments should be the same arguments that were passed\n\
3354 to the primitive. Depending on which primitive, one of those arguments\n\
3355 is selected as the TARGET. For example, if OPERATION does file I/O,\n\
3356 whichever argument specifies the file name is TARGET.\n\
3357 \n\
3358 TARGET has a meaning which depends on OPERATION:\n\
3359 For file I/O, TARGET is a file name.\n\
3360 For process I/O, TARGET is a process name.\n\
3361 For network I/O, TARGET is a service name or a port number\n\
3362 \n\
3363 This function looks up what `coding-system-alist' specifies for\n\
3364 OPERATION and TARGET. It may specify a cons cell which represents\n\
3365 a particular coding system or it may have a function to call.\n\
3366 In the latter case, we call the function with one argument,\n\
3367 which is a list of all the arguments given to `find-coding-system'.")
3368 (nargs, args)
3369 int nargs;
3370 Lisp_Object *args;
3371 {
3372 Lisp_Object operation, target_idx, target, val;
3373 register Lisp_Object chain;
3374
3375 if (nargs < 2)
3376 error ("Too few arguments");
3377 operation = args[0];
3378 if (!SYMBOLP (operation)
3379 || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
3380 error ("Invalid first arguement");
3381 if (nargs < 1 + XINT (target_idx))
3382 error ("Too few arguments for operation: %s",
3383 XSYMBOL (operation)->name->data);
3384 target = args[XINT (target_idx) + 1];
3385 if (!(STRINGP (target)
3386 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
3387 error ("Invalid %dth argument", XINT (target_idx) + 1);
3388
3389 chain = Fassq (operation, Vcoding_system_alist);
3390 if (NILP (chain))
3391 return Qnil;
3392
3393 for (chain = XCONS (chain)->cdr; CONSP (chain); chain = XCONS (chain)->cdr)
3394 {
3395 Lisp_Object elt = XCONS (chain)->car;
3396
3397 if (CONSP (elt)
3398 && ((STRINGP (target)
3399 && STRINGP (XCONS (elt)->car)
3400 && fast_string_match (XCONS (elt)->car, target) >= 0)
3401 || (INTEGERP (target) && EQ (target, XCONS (elt)->car))))
3402 return (val = XCONS (elt)->cdr, CONSP (val)
3403 ? val
3404 : ((SYMBOLP (val) && !NILP (Fboundp (val))
3405 ? call2 (val, Flist (nargs, args))
3406 : Qnil)));
3407 }
3408 return Qnil;
3409 }
3410
3411 #endif /* emacs */
3412
3413 \f
3414 /*** 8. Post-amble ***/
3415
3416 init_coding_once ()
3417 {
3418 int i;
3419
3420 /* Emacs internal format specific initialize routine. */
3421 for (i = 0; i <= 0x20; i++)
3422 emacs_code_class[i] = EMACS_control_code;
3423 emacs_code_class[0x0A] = EMACS_linefeed_code;
3424 emacs_code_class[0x0D] = EMACS_carriage_return_code;
3425 for (i = 0x21 ; i < 0x7F; i++)
3426 emacs_code_class[i] = EMACS_ascii_code;
3427 emacs_code_class[0x7F] = EMACS_control_code;
3428 emacs_code_class[0x80] = EMACS_leading_code_composition;
3429 for (i = 0x81; i < 0xFF; i++)
3430 emacs_code_class[i] = EMACS_invalid_code;
3431 emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
3432 emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
3433 emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
3434 emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;
3435
3436 /* ISO2022 specific initialize routine. */
3437 for (i = 0; i < 0x20; i++)
3438 iso_code_class[i] = ISO_control_code;
3439 for (i = 0x21; i < 0x7F; i++)
3440 iso_code_class[i] = ISO_graphic_plane_0;
3441 for (i = 0x80; i < 0xA0; i++)
3442 iso_code_class[i] = ISO_control_code;
3443 for (i = 0xA1; i < 0xFF; i++)
3444 iso_code_class[i] = ISO_graphic_plane_1;
3445 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
3446 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
3447 iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
3448 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
3449 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
3450 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
3451 iso_code_class[ISO_CODE_ESC] = ISO_escape;
3452 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
3453 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
3454 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
3455
3456 conversion_buffer_size = MINIMUM_CONVERSION_BUFFER_SIZE;
3457 conversion_buffer = (char *) xmalloc (MINIMUM_CONVERSION_BUFFER_SIZE);
3458
3459 setup_coding_system (Qnil, &keyboard_coding);
3460 setup_coding_system (Qnil, &terminal_coding);
3461 }
3462
3463 #ifdef emacs
3464
3465 syms_of_coding ()
3466 {
3467 Qtarget_idx = intern ("target-idx");
3468 staticpro (&Qtarget_idx);
3469
3470 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
3471 Fput (Qwrite_region, Qtarget_idx, make_number (2));
3472
3473 Qcall_process = intern ("call-process");
3474 staticpro (&Qcall_process);
3475 Fput (Qcall_process, Qtarget_idx, make_number (0));
3476
3477 Qcall_process_region = intern ("call-process-region");
3478 staticpro (&Qcall_process_region);
3479 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
3480
3481 Qstart_process = intern ("start-process");
3482 staticpro (&Qstart_process);
3483 Fput (Qstart_process, Qtarget_idx, make_number (2));
3484
3485 Qopen_network_stream = intern ("open-network-stream");
3486 staticpro (&Qopen_network_stream);
3487 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
3488
3489 Qcoding_system = intern ("coding-system");
3490 staticpro (&Qcoding_system);
3491
3492 Qeol_type = intern ("eol-type");
3493 staticpro (&Qeol_type);
3494
3495 Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
3496 staticpro (&Qbuffer_file_coding_system);
3497
3498 Qpost_read_conversion = intern ("post-read-conversion");
3499 staticpro (&Qpost_read_conversion);
3500
3501 Qpre_write_conversion = intern ("pre-write-conversion");
3502 staticpro (&Qpre_write_conversion);
3503
3504 Qcoding_system_vector = intern ("coding-system-vector");
3505 staticpro (&Qcoding_system_vector);
3506
3507 Qcoding_system_p = intern ("coding-system-p");
3508 staticpro (&Qcoding_system_p);
3509
3510 Qcoding_system_error = intern ("coding-system-error");
3511 staticpro (&Qcoding_system_error);
3512
3513 Fput (Qcoding_system_error, Qerror_conditions,
3514 Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
3515 Fput (Qcoding_system_error, Qerror_message,
3516 build_string ("Coding-system error"));
3517
3518 Qcoding_category_index = intern ("coding-category-index");
3519 staticpro (&Qcoding_category_index);
3520
3521 {
3522 int i;
3523 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3524 {
3525 coding_category_table[i] = intern (coding_category_name[i]);
3526 staticpro (&coding_category_table[i]);
3527 Fput (coding_category_table[i], Qcoding_category_index,
3528 make_number (i));
3529 }
3530 }
3531
3532 defsubr (&Scoding_system_vector);
3533 defsubr (&Scoding_system_p);
3534 defsubr (&Sread_coding_system);
3535 defsubr (&Sread_non_nil_coding_system);
3536 defsubr (&Scheck_coding_system);
3537 defsubr (&Sdetect_coding_region);
3538 defsubr (&Sdecode_coding_region);
3539 defsubr (&Sencode_coding_region);
3540 defsubr (&Sdecode_coding_string);
3541 defsubr (&Sencode_coding_string);
3542 defsubr (&Sdecode_sjis_char);
3543 defsubr (&Sencode_sjis_char);
3544 defsubr (&Sdecode_big5_char);
3545 defsubr (&Sencode_big5_char);
3546 defsubr (&Sset_terminal_coding_system);
3547 defsubr (&Sterminal_coding_system);
3548 defsubr (&Sset_keyboard_coding_system);
3549 defsubr (&Skeyboard_coding_system);
3550 defsubr (&Sfind_coding_system);
3551
3552 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
3553 "List of coding-categories (symbols) ordered by priority.");
3554 {
3555 int i;
3556
3557 Vcoding_category_list = Qnil;
3558 for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
3559 Vcoding_category_list
3560 = Fcons (coding_category_table[i], Vcoding_category_list);
3561 }
3562
3563 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
3564 "A variable of internal use only.\n\
3565 If the value is a coding system, it is used for decoding on read operation.\n\
3566 If not, an appropriate element in `coding-system-alist' (which see) is used.");
3567 Vcoding_system_for_read = Qnil;
3568
3569 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
3570 "A variable of internal use only.\n\
3571 If the value is a coding system, it is used for encoding on write operation.\n\
3572 If not, an appropriate element in `coding-system-alist' (which see) is used.");
3573 Vcoding_system_for_write = Qnil;
3574
3575 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
3576 "Coding-system used in the latest file or process I/O.");
3577 Vlast_coding_system_used = Qnil;
3578
3579 DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
3580 "Nested alist to decide a coding system for a specific I/O operation.\n\
3581 The format is ((OPERATION . ((REGEXP . CODING-SYSTEMS) ...)) ...).\n\
3582 \n\
3583 OPERATION is one of the following Emacs I/O primitives:\n\
3584 For file I/O, insert-file-contents and write-region.\n\
3585 For process I/O, call-process, call-process-region, and start-process.\n\
3586 For network I/O, open-network-stream.\n\
3587 In addition, for process I/O, `process-argument' can be specified for\n\
3588 encoding arguments of the process.\n\
3589 \n\
3590 REGEXP is a regular expression matching a target of OPERATION, where\n\
3591 target is a file name for file I/O operations, a process name for\n\
3592 process I/O operations, or a service name for network I/O\n\
3593 operations. REGEXP might be a port number for network I/O operation.\n\
3594 \n\
3595 CODING-SYSTEMS is a cons of coding systems to encode and decode\n\
3596 character code on OPERATION, or a function symbol returning the cons.\n\
3597 See the documentation of `find-coding-system' for more detail.");
3598 Vcoding_system_alist = Qnil;
3599
3600 DEFVAR_INT ("eol-mnemonic-unix", &eol_mnemonic_unix,
3601 "Mnemonic character indicating UNIX-like end-of-line format (i.e. LF) .");
3602 eol_mnemonic_unix = '.';
3603
3604 DEFVAR_INT ("eol-mnemonic-dos", &eol_mnemonic_dos,
3605 "Mnemonic character indicating DOS-like end-of-line format (i.e. CRLF).");
3606 eol_mnemonic_dos = ':';
3607
3608 DEFVAR_INT ("eol-mnemonic-mac", &eol_mnemonic_mac,
3609 "Mnemonic character indicating MAC-like end-of-line format (i.e. CR).");
3610 eol_mnemonic_mac = '\'';
3611
3612 DEFVAR_INT ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
3613 "Mnemonic character indicating end-of-line format is not yet decided.");
3614 eol_mnemonic_undecided = '-';
3615
3616 DEFVAR_LISP ("alternate-charset-table", &Valternate_charset_table,
3617 "Alist of charsets vs the alternate charsets.\n\
3618 While decoding, if a charset (car part of an element) is found,\n\
3619 decode it as the alternate charset (cdr part of the element).");
3620 Valternate_charset_table = Qnil;
3621
3622 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
3623 "Alist of charsets vs revision numbers.\n\
3624 While encoding, if a charset (car part of an element) is found,\n\
3625 designate it with the escape sequence identifing revision (cdr part of the element).");
3626 Vcharset_revision_alist = Qnil;
3627 }
3628
3629 #endif /* emacs */