Provide XXX-util instead of
[bpt/emacs.git] / src / coding.c
CommitLineData
4ed46869 1/* Coding system handler (conversion, detection, and etc).
203cb916
RS
2 Copyright (C) 1995, 1997 Electrotechnical Laboratory, JAPAN.
3 Licensed to the Free Software Foundation.
4ed46869 4
369314dc
KH
5This file is part of GNU Emacs.
6
7GNU Emacs is free software; you can redistribute it and/or modify
8it under the terms of the GNU General Public License as published by
9the Free Software Foundation; either version 2, or (at your option)
10any later version.
4ed46869 11
369314dc
KH
12GNU Emacs is distributed in the hope that it will be useful,
13but WITHOUT ANY WARRANTY; without even the implied warranty of
14MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15GNU General Public License for more details.
4ed46869 16
369314dc
KH
17You should have received a copy of the GNU General Public License
18along with GNU Emacs; see the file COPYING. If not, write to
19the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20Boston, MA 02111-1307, USA. */
4ed46869
KH
21
22/*** TABLE OF CONTENTS ***
23
24 1. Preamble
0ef69138 25 2. Emacs' internal format (emacs-mule) handlers
4ed46869
KH
26 3. ISO2022 handlers
27 4. Shift-JIS and BIG5 handlers
28 5. End-of-line handlers
29 6. C library functions
30 7. Emacs Lisp library functions
31 8. Post-amble
32
33*/
34
35/*** GENERAL NOTE on CODING SYSTEM ***
36
37 Coding system is an encoding mechanism of one or more character
38 sets. Here's a list of coding systems which Emacs can handle. When
39 we say "decode", it means converting some other coding system to
0ef69138
KH
40 Emacs' internal format (emacs-internal), and when we say "encode",
41 it means converting the coding system emacs-mule to some other
42 coding system.
4ed46869 43
0ef69138 44 0. Emacs' internal format (emacs-mule)
4ed46869
KH
45
46 Emacs itself holds a multi-lingual character in a buffer and a string
47 in a special format. Details are described in the section 2.
48
49 1. ISO2022
50
51 The most famous coding system for multiple character sets. X's
52 Compound Text, various EUCs (Extended Unix Code), and such coding
53 systems used in Internet communication as ISO-2022-JP are all
54 variants of ISO2022. Details are described in the section 3.
55
56 2. SJIS (or Shift-JIS or MS-Kanji-Code)
57
58 A coding system to encode character sets: ASCII, JISX0201, and
59 JISX0208. Widely used for PC's in Japan. Details are described in
60 the section 4.
61
62 3. BIG5
63
64 A coding system to encode character sets: ASCII and Big5. Widely
65 used by Chinese (mainly in Taiwan and Hong Kong). Details are
66 described in the section 4. In this file, when written as "BIG5"
67 (all uppercase), it means the coding system, and when written as
68 "Big5" (capitalized), it means the character set.
69
70 4. Else
71
72 If a user want to read/write a text encoded in a coding system not
73 listed above, he can supply a decoder and an encoder for it in CCL
74 (Code Conversion Language) programs. Emacs executes the CCL program
75 while reading/writing.
76
77 Emacs represent a coding-system by a Lisp symbol that has a property
78 `coding-system'. But, before actually using the coding-system, the
79 information about it is set in a structure of type `struct
80 coding_system' for rapid processing. See the section 6 for more
81 detail.
82
83*/
84
85/*** GENERAL NOTES on END-OF-LINE FORMAT ***
86
87 How end-of-line of a text is encoded depends on a system. For
88 instance, Unix's format is just one byte of `line-feed' code,
89 whereas DOS's format is two bytes sequence of `carriage-return' and
90 `line-feed' codes. MacOS's format is one byte of `carriage-return'.
91
92 Since how characters in a text is encoded and how end-of-line is
93 encoded is independent, any coding system described above can take
94 any format of end-of-line. So, Emacs has information of format of
95 end-of-line in each coding-system. See the section 6 for more
96 detail.
97
98*/
99
100/*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
101
102 These functions check if a text between SRC and SRC_END is encoded
103 in the coding system category XXX. Each returns an integer value in
104 which appropriate flag bits for the category XXX is set. The flag
105 bits are defined in macros CODING_CATEGORY_MASK_XXX. Below is the
106 template of these functions. */
107#if 0
108int
0ef69138 109detect_coding_emacs_mule (src, src_end)
4ed46869
KH
110 unsigned char *src, *src_end;
111{
112 ...
113}
114#endif
115
116/*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
117
118 These functions decode SRC_BYTES length text at SOURCE encoded in
0ef69138
KH
119 CODING to Emacs' internal format (emacs-mule). The resulting text
120 goes to a place pointed by DESTINATION, the length of which should
121 not exceed DST_BYTES. The bytes actually processed is returned as
122 *CONSUMED. The return value is the length of the decoded text.
123 Below is a template of these functions. */
4ed46869
KH
124#if 0
125decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes, consumed)
126 struct coding_system *coding;
127 unsigned char *source, *destination;
128 int src_bytes, dst_bytes;
129 int *consumed;
130{
131 ...
132}
133#endif
134
135/*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
136
0ef69138
KH
137 These functions encode SRC_BYTES length text at SOURCE of Emacs'
138 internal format (emacs-mule) to CODING. The resulting text goes to
139 a place pointed by DESTINATION, the length of which should not
140 exceed DST_BYTES. The bytes actually processed is returned as
141 *CONSUMED. The return value is the length of the encoded text.
142 Below is a template of these functions. */
4ed46869
KH
143#if 0
144encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes, consumed)
145 struct coding_system *coding;
146 unsigned char *source, *destination;
147 int src_bytes, dst_bytes;
148 int *consumed;
149{
150 ...
151}
152#endif
153
154/*** COMMONLY USED MACROS ***/
155
156/* The following three macros ONE_MORE_BYTE, TWO_MORE_BYTES, and
157 THREE_MORE_BYTES safely get one, two, and three bytes from the
158 source text respectively. If there are not enough bytes in the
159 source, they jump to `label_end_of_loop'. The caller should set
160 variables `src' and `src_end' to appropriate areas in advance. */
161
162#define ONE_MORE_BYTE(c1) \
163 do { \
164 if (src < src_end) \
165 c1 = *src++; \
166 else \
167 goto label_end_of_loop; \
168 } while (0)
169
170#define TWO_MORE_BYTES(c1, c2) \
171 do { \
172 if (src + 1 < src_end) \
173 c1 = *src++, c2 = *src++; \
174 else \
175 goto label_end_of_loop; \
176 } while (0)
177
178#define THREE_MORE_BYTES(c1, c2, c3) \
179 do { \
180 if (src + 2 < src_end) \
181 c1 = *src++, c2 = *src++, c3 = *src++; \
182 else \
183 goto label_end_of_loop; \
184 } while (0)
185
186/* The following three macros DECODE_CHARACTER_ASCII,
187 DECODE_CHARACTER_DIMENSION1, and DECODE_CHARACTER_DIMENSION2 put
188 the multi-byte form of a character of each class at the place
189 pointed by `dst'. The caller should set the variable `dst' to
190 point to an appropriate area and the variable `coding' to point to
191 the coding-system of the currently decoding text in advance. */
192
193/* Decode one ASCII character C. */
194
195#define DECODE_CHARACTER_ASCII(c) \
196 do { \
197 if (COMPOSING_P (coding->composing)) \
198 *dst++ = 0xA0, *dst++ = (c) | 0x80; \
199 else \
200 *dst++ = (c); \
201 } while (0)
202
203/* Decode one DIMENSION1 character of which charset is CHARSET and
204 position-code is C. */
205
206#define DECODE_CHARACTER_DIMENSION1(charset, c) \
207 do { \
208 unsigned char leading_code = CHARSET_LEADING_CODE_BASE (charset); \
209 if (COMPOSING_P (coding->composing)) \
210 *dst++ = leading_code + 0x20; \
211 else \
212 *dst++ = leading_code; \
213 if (leading_code = CHARSET_LEADING_CODE_EXT (charset)) \
214 *dst++ = leading_code; \
215 *dst++ = (c) | 0x80; \
216 } while (0)
217
218/* Decode one DIMENSION2 character of which charset is CHARSET and
219 position-codes are C1 and C2. */
220
221#define DECODE_CHARACTER_DIMENSION2(charset, c1, c2) \
222 do { \
223 DECODE_CHARACTER_DIMENSION1 (charset, c1); \
224 *dst++ = (c2) | 0x80; \
225 } while (0)
226
227\f
228/*** 1. Preamble ***/
229
230#include <stdio.h>
231
232#ifdef emacs
233
234#include <config.h>
235#include "lisp.h"
236#include "buffer.h"
237#include "charset.h"
238#include "ccl.h"
239#include "coding.h"
240#include "window.h"
241
242#else /* not emacs */
243
244#include "mulelib.h"
245
246#endif /* not emacs */
247
248Lisp_Object Qcoding_system, Qeol_type;
249Lisp_Object Qbuffer_file_coding_system;
250Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
251
252extern Lisp_Object Qinsert_file_contents, Qwrite_region;
253Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
254Lisp_Object Qstart_process, Qopen_network_stream;
255Lisp_Object Qtarget_idx;
256
257/* Mnemonic character of each format of end-of-line. */
258int eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
259/* Mnemonic character to indicate format of end-of-line is not yet
260 decided. */
261int eol_mnemonic_undecided;
262
263#ifdef emacs
264
02ba4723 265Lisp_Object Qcoding_system_spec, Qcoding_system_p, Qcoding_system_error;
4ed46869
KH
266
267/* Coding-systems are handed between Emacs Lisp programs and C internal
268 routines by the following three variables. */
269/* Coding-system for reading files and receiving data from process. */
270Lisp_Object Vcoding_system_for_read;
271/* Coding-system for writing files and sending data to process. */
272Lisp_Object Vcoding_system_for_write;
273/* Coding-system actually used in the latest I/O. */
274Lisp_Object Vlast_coding_system_used;
275
276/* Coding-system of what terminal accept for displaying. */
277struct coding_system terminal_coding;
278
279/* Coding-system of what is sent from terminal keyboard. */
280struct coding_system keyboard_coding;
281
02ba4723
KH
282Lisp_Object Vfile_coding_system_alist;
283Lisp_Object Vprocess_coding_system_alist;
284Lisp_Object Vnetwork_coding_system_alist;
4ed46869
KH
285
286#endif /* emacs */
287
288Lisp_Object Qcoding_category_index;
289
290/* List of symbols `coding-category-xxx' ordered by priority. */
291Lisp_Object Vcoding_category_list;
292
293/* Table of coding-systems currently assigned to each coding-category. */
294Lisp_Object coding_category_table[CODING_CATEGORY_IDX_MAX];
295
296/* Table of names of symbol for each coding-category. */
297char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
0ef69138 298 "coding-category-emacs-mule",
4ed46869
KH
299 "coding-category-sjis",
300 "coding-category-iso-7",
301 "coding-category-iso-8-1",
302 "coding-category-iso-8-2",
303 "coding-category-iso-else",
304 "coding-category-big5",
305 "coding-category-binary"
306};
307
bdd9fb48
KH
308/* Flag to tell if we look up unification table on character code
309 conversion. */
310Lisp_Object Venable_character_unification;
311/* Standard unification table to look up on reading (decoding). */
312Lisp_Object Vstandard_character_unification_table_for_read;
313/* Standard unification table to look up on writing (encoding). */
314Lisp_Object Vstandard_character_unification_table_for_write;
315
316Lisp_Object Qcharacter_unification_table;
4ed46869
KH
317
318/* Alist of charsets vs revision number. */
319Lisp_Object Vcharset_revision_alist;
320
02ba4723
KH
321/* Default coding systems used for process I/O. */
322Lisp_Object Vdefault_process_coding_system;
323
4ed46869 324\f
0ef69138 325/*** 2. Emacs internal format (emacs-mule) handlers ***/
4ed46869
KH
326
327/* Emacs' internal format for encoding multiple character sets is a
328 kind of multi-byte encoding, i.e. encoding a character by a sequence
329 of one-byte codes of variable length. ASCII characters and control
330 characters (e.g. `tab', `newline') are represented by one-byte as
331 is. It takes the range 0x00 through 0x7F. The other characters
332 are represented by a sequence of `base leading-code', optional
333 `extended leading-code', and one or two `position-code's. Length
334 of the sequence is decided by the base leading-code. Leading-code
335 takes the range 0x80 through 0x9F, whereas extended leading-code
336 and position-code take the range 0xA0 through 0xFF. See the
337 document of `charset.h' for more detail about leading-code and
338 position-code.
339
340 There's one exception in this rule. Special leading-code
341 `leading-code-composition' denotes that the following several
342 characters should be composed into one character. Leading-codes of
343 components (except for ASCII) are added 0x20. An ASCII character
344 component is represented by a 2-byte sequence of `0xA0' and
345 `ASCII-code + 0x80'. See also the document in `charset.h' for the
346 detail of composite character. Hence, we can summarize the code
347 range as follows:
348
349 --- CODE RANGE of Emacs' internal format ---
350 (character set) (range)
351 ASCII 0x00 .. 0x7F
352 ELSE (1st byte) 0x80 .. 0x9F
353 (rest bytes) 0xA0 .. 0xFF
354 ---------------------------------------------
355
356 */
357
358enum emacs_code_class_type emacs_code_class[256];
359
360/* Go to the next statement only if *SRC is accessible and the code is
361 greater than 0xA0. */
362#define CHECK_CODE_RANGE_A0_FF \
363 do { \
364 if (src >= src_end) \
365 goto label_end_of_switch; \
366 else if (*src++ < 0xA0) \
367 return 0; \
368 } while (0)
369
370/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
371 Check if a text is encoded in Emacs' internal format. If it is,
0ef69138 372 return CODING_CATEGORY_MASK_EMASC_MULE, else return 0. */
4ed46869
KH
373
374int
0ef69138 375detect_coding_emacs_mule (src, src_end)
4ed46869
KH
376 unsigned char *src, *src_end;
377{
378 unsigned char c;
379 int composing = 0;
380
381 while (src < src_end)
382 {
383 c = *src++;
384
385 if (composing)
386 {
387 if (c < 0xA0)
388 composing = 0;
389 else
390 c -= 0x20;
391 }
392
393 switch (emacs_code_class[c])
394 {
395 case EMACS_ascii_code:
396 case EMACS_linefeed_code:
397 break;
398
399 case EMACS_control_code:
400 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
401 return 0;
402 break;
403
404 case EMACS_invalid_code:
405 return 0;
406
407 case EMACS_leading_code_composition: /* c == 0x80 */
408 if (composing)
409 CHECK_CODE_RANGE_A0_FF;
410 else
411 composing = 1;
412 break;
413
414 case EMACS_leading_code_4:
415 CHECK_CODE_RANGE_A0_FF;
416 /* fall down to check it two more times ... */
417
418 case EMACS_leading_code_3:
419 CHECK_CODE_RANGE_A0_FF;
420 /* fall down to check it one more time ... */
421
422 case EMACS_leading_code_2:
423 CHECK_CODE_RANGE_A0_FF;
424 break;
425
426 default:
427 label_end_of_switch:
428 break;
429 }
430 }
0ef69138 431 return CODING_CATEGORY_MASK_EMACS_MULE;
4ed46869
KH
432}
433
434\f
435/*** 3. ISO2022 handlers ***/
436
437/* The following note describes the coding system ISO2022 briefly.
438 Since the intension of this note is to help understanding of the
439 programs in this file, some parts are NOT ACCURATE or OVERLY
440 SIMPLIFIED. For the thorough understanding, please refer to the
441 original document of ISO2022.
442
443 ISO2022 provides many mechanisms to encode several character sets
444 in 7-bit and 8-bit environment. If one choose 7-bite environment,
445 all text is encoded by codes of less than 128. This may make the
446 encoded text a little bit longer, but the text get more stability
447 to pass through several gateways (some of them split MSB off).
448
449 There are two kind of character set: control character set and
450 graphic character set. The former contains control characters such
451 as `newline' and `escape' to provide control functions (control
452 functions are provided also by escape sequence). The latter
453 contains graphic characters such as ' A' and '-'. Emacs recognizes
454 two control character sets and many graphic character sets.
455
456 Graphic character sets are classified into one of the following
457 four classes, DIMENSION1_CHARS94, DIMENSION1_CHARS96,
458 DIMENSION2_CHARS94, DIMENSION2_CHARS96 according to the number of
459 bytes (DIMENSION) and the number of characters in one dimension
460 (CHARS) of the set. In addition, each character set is assigned an
461 identification tag (called "final character" and denoted as <F>
462 here after) which is unique in each class. <F> of each character
463 set is decided by ECMA(*) when it is registered in ISO. Code range
464 of <F> is 0x30..0x7F (0x30..0x3F are for private use only).
465
466 Note (*): ECMA = European Computer Manufacturers Association
467
468 Here are examples of graphic character set [NAME(<F>)]:
469 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
470 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
471 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
472 o DIMENSION2_CHARS96 -- none for the moment
473
474 A code area (1byte=8bits) is divided into 4 areas, C0, GL, C1, and GR.
475 C0 [0x00..0x1F] -- control character plane 0
476 GL [0x20..0x7F] -- graphic character plane 0
477 C1 [0x80..0x9F] -- control character plane 1
478 GR [0xA0..0xFF] -- graphic character plane 1
479
480 A control character set is directly designated and invoked to C0 or
481 C1 by an escape sequence. The most common case is that ISO646's
482 control character set is designated/invoked to C0 and ISO6429's
483 control character set is designated/invoked to C1, and usually
484 these designations/invocations are omitted in a coded text. With
485 7-bit environment, only C0 can be used, and a control character for
486 C1 is encoded by an appropriate escape sequence to fit in the
487 environment. All control characters for C1 are defined the
488 corresponding escape sequences.
489
490 A graphic character set is at first designated to one of four
491 graphic registers (G0 through G3), then these graphic registers are
492 invoked to GL or GR. These designations and invocations can be
493 done independently. The most common case is that G0 is invoked to
494 GL, G1 is invoked to GR, and ASCII is designated to G0, and usually
495 these invocations and designations are omitted in a coded text.
496 With 7-bit environment, only GL can be used.
497
498 When a graphic character set of CHARS94 is invoked to GL, code 0x20
499 and 0x7F of GL area work as control characters SPACE and DEL
500 respectively, and code 0xA0 and 0xFF of GR area should not be used.
501
502 There are two ways of invocation: locking-shift and single-shift.
503 With locking-shift, the invocation lasts until the next different
504 invocation, whereas with single-shift, the invocation works only
505 for the following character and doesn't affect locking-shift.
506 Invocations are done by the following control characters or escape
507 sequences.
508
509 ----------------------------------------------------------------------
510 function control char escape sequence description
511 ----------------------------------------------------------------------
512 SI (shift-in) 0x0F none invoke G0 to GL
513 SI (shift-out) 0x0E none invoke G1 to GL
514 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
515 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
516 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 into GL
517 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 into GL
518 ----------------------------------------------------------------------
519 The first four are for locking-shift. Control characters for these
520 functions are defined by macros ISO_CODE_XXX in `coding.h'.
521
522 Designations are done by the following escape sequences.
523 ----------------------------------------------------------------------
524 escape sequence description
525 ----------------------------------------------------------------------
526 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
527 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
528 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
529 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
530 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
531 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
532 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
533 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
534 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
535 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
536 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
537 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
538 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
539 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
540 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
541 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
542 ----------------------------------------------------------------------
543
544 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
545 of dimension 1, chars 94, and final character <F>, and etc.
546
547 Note (*): Although these designations are not allowed in ISO2022,
548 Emacs accepts them on decoding, and produces them on encoding
549 CHARS96 character set in a coding system which is characterized as
550 7-bit environment, non-locking-shift, and non-single-shift.
551
552 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
553 '(' can be omitted. We call this as "short-form" here after.
554
555 Now you may notice that there are a lot of ways for encoding the
556 same multilingual text in ISO2022. Actually, there exist many
557 coding systems such as Compound Text (used in X's inter client
558 communication, ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
559 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
560 localized platforms), and all of these are variants of ISO2022.
561
562 In addition to the above, Emacs handles two more kinds of escape
563 sequences: ISO6429's direction specification and Emacs' private
564 sequence for specifying character composition.
565
566 ISO6429's direction specification takes the following format:
567 o CSI ']' -- end of the current direction
568 o CSI '0' ']' -- end of the current direction
569 o CSI '1' ']' -- start of left-to-right text
570 o CSI '2' ']' -- start of right-to-left text
571 The control character CSI (0x9B: control sequence introducer) is
572 abbreviated to the escape sequence ESC '[' in 7-bit environment.
573
574 Character composition specification takes the following format:
575 o ESC '0' -- start character composition
576 o ESC '1' -- end character composition
577 Since these are not standard escape sequences of any ISO, the use
578 of them for these meaning is restricted to Emacs only. */
579
580enum iso_code_class_type iso_code_class[256];
581
582/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
583 Check if a text is encoded in ISO2022. If it is, returns an
584 integer in which appropriate flag bits any of:
585 CODING_CATEGORY_MASK_ISO_7
586 CODING_CATEGORY_MASK_ISO_8_1
587 CODING_CATEGORY_MASK_ISO_8_2
588 CODING_CATEGORY_MASK_ISO_ELSE
589 are set. If a code which should never appear in ISO2022 is found,
590 returns 0. */
591
592int
593detect_coding_iso2022 (src, src_end)
594 unsigned char *src, *src_end;
595{
bcf26d6a
KH
596 int mask = CODING_CATEGORY_MASK_ANY;
597 int g1 = 0; /* 1 iff designating to G1. */
598 int c, i;
4ed46869 599
e0e989f6 600 while (src < src_end)
4ed46869
KH
601 {
602 c = *src++;
603 switch (c)
604 {
605 case ISO_CODE_ESC:
e0e989f6 606 if (src >= src_end)
4ed46869
KH
607 break;
608 c = *src++;
bcf26d6a 609 if (src < src_end
e0e989f6
KH
610 && ((c >= '(' && c <= '/')
611 || c == '$' && ((*src >= '(' && *src <= '/')
612 || (*src >= '@' && *src <= 'B'))))
4ed46869 613 {
e0e989f6 614 /* Valid designation sequence. */
bcf26d6a
KH
615 mask &= (CODING_CATEGORY_MASK_ISO_7
616 | CODING_CATEGORY_MASK_ISO_8_1
617 | CODING_CATEGORY_MASK_ISO_8_2
618 | CODING_CATEGORY_MASK_ISO_ELSE);
e0e989f6 619 if (c == ')' || (c == '$' && *src == ')'))
bcf26d6a
KH
620 {
621 g1 = 1;
622 mask &= ~CODING_CATEGORY_MASK_ISO_7;
623 }
e0e989f6
KH
624 src++;
625 break;
4ed46869 626 }
4ed46869
KH
627 else if (c == 'N' || c == 'O' || c == 'n' || c == 'o')
628 return CODING_CATEGORY_MASK_ISO_ELSE;
629 break;
630
4ed46869 631 case ISO_CODE_SO:
e0e989f6
KH
632 if (g1)
633 return CODING_CATEGORY_MASK_ISO_ELSE;
634 break;
635
4ed46869
KH
636 case ISO_CODE_CSI:
637 case ISO_CODE_SS2:
638 case ISO_CODE_SS3:
639 mask &= ~CODING_CATEGORY_MASK_ISO_7;
640 break;
641
642 default:
643 if (c < 0x80)
644 break;
645 else if (c < 0xA0)
646 return 0;
647 else
648 {
649 int count = 1;
650
651 mask &= ~CODING_CATEGORY_MASK_ISO_7;
e0e989f6 652 while (src < src_end && *src >= 0xA0)
4ed46869 653 count++, src++;
e0e989f6 654 if (count & 1 && src < src_end)
4ed46869
KH
655 mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
656 }
657 break;
658 }
659 }
660
661 return mask;
662}
663
664/* Decode a character of which charset is CHARSET and the 1st position
bdd9fb48 665 code is C1. If dimension of CHARSET is 2, the 2nd position code is
4ed46869
KH
666 fetched from SRC and set to C2. If CHARSET is negative, it means
667 that we are decoding ill formed text, and what we can do is just to
668 read C1 as is. */
669
bdd9fb48
KH
670#define DECODE_ISO_CHARACTER(charset, c1) \
671 do { \
672 int c_alt, charset_alt = (charset); \
673 if (COMPOSING_HEAD_P (coding->composing)) \
674 { \
675 *dst++ = LEADING_CODE_COMPOSITION; \
676 if (COMPOSING_WITH_RULE_P (coding->composing)) \
677 /* To tell composition rules are embeded. */ \
678 *dst++ = 0xFF; \
679 coding->composing += 2; \
680 } \
681 if ((charset) >= 0) \
682 { \
683 if (CHARSET_DIMENSION (charset) == 2) \
684 ONE_MORE_BYTE (c2); \
685 if (!NILP (unification_table) \
686 && ((c_alt = unify_char (unification_table, \
687 -1, (charset), c1, c2)) >= 0)) \
688 SPLIT_CHAR (c_alt, charset_alt, c1, c2); \
689 } \
690 if (charset_alt == CHARSET_ASCII || charset_alt < 0) \
691 DECODE_CHARACTER_ASCII (c1); \
692 else if (CHARSET_DIMENSION (charset_alt) == 1) \
693 DECODE_CHARACTER_DIMENSION1 (charset_alt, c1); \
694 else \
695 DECODE_CHARACTER_DIMENSION2 (charset_alt, c1, c2); \
696 if (COMPOSING_WITH_RULE_P (coding->composing)) \
697 /* To tell a composition rule follows. */ \
698 coding->composing = COMPOSING_WITH_RULE_RULE; \
4ed46869
KH
699 } while (0)
700
701/* Set designation state into CODING. */
702#define DECODE_DESIGNATION(reg, dimension, chars, final_char) \
703 do { \
704 int charset = ISO_CHARSET_TABLE (dimension, chars, final_char); \
4ed46869
KH
705 if (charset >= 0) \
706 { \
707 if (coding->direction == 1 \
708 && CHARSET_REVERSE_CHARSET (charset) >= 0) \
709 charset = CHARSET_REVERSE_CHARSET (charset); \
710 CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset; \
711 } \
712 } while (0)
713
714/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
715
716int
717decode_coding_iso2022 (coding, source, destination,
718 src_bytes, dst_bytes, consumed)
719 struct coding_system *coding;
720 unsigned char *source, *destination;
721 int src_bytes, dst_bytes;
722 int *consumed;
723{
724 unsigned char *src = source;
725 unsigned char *src_end = source + src_bytes;
726 unsigned char *dst = destination;
727 unsigned char *dst_end = destination + dst_bytes;
728 /* Since the maximum bytes produced by each loop is 7, we subtract 6
729 from DST_END to assure that overflow checking is necessary only
730 at the head of loop. */
731 unsigned char *adjusted_dst_end = dst_end - 6;
732 int charset;
733 /* Charsets invoked to graphic plane 0 and 1 respectively. */
734 int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
735 int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
bdd9fb48
KH
736 Lisp_Object unification_table = coding->character_unification_table;
737
738 if (!NILP (Venable_character_unification) && NILP (unification_table))
739 unification_table = Vstandard_character_unification_table_for_read;
4ed46869
KH
740
741 while (src < src_end && dst < adjusted_dst_end)
742 {
743 /* SRC_BASE remembers the start position in source in each loop.
744 The loop will be exited when there's not enough source text
745 to analyze long escape sequence or 2-byte code (within macros
746 ONE_MORE_BYTE or TWO_MORE_BYTES). In that case, SRC is reset
747 to SRC_BASE before exiting. */
748 unsigned char *src_base = src;
bdd9fb48 749 int c1 = *src++, c2;
4ed46869
KH
750
751 switch (iso_code_class [c1])
752 {
753 case ISO_0x20_or_0x7F:
754 if (!coding->composing
755 && (charset0 < 0 || CHARSET_CHARS (charset0) == 94))
756 {
757 /* This is SPACE or DEL. */
758 *dst++ = c1;
759 break;
760 }
761 /* This is a graphic character, we fall down ... */
762
763 case ISO_graphic_plane_0:
764 if (coding->composing == COMPOSING_WITH_RULE_RULE)
765 {
766 /* This is a composition rule. */
767 *dst++ = c1 | 0x80;
768 coding->composing = COMPOSING_WITH_RULE_TAIL;
769 }
770 else
771 DECODE_ISO_CHARACTER (charset0, c1);
772 break;
773
774 case ISO_0xA0_or_0xFF:
775 if (charset1 < 0 || CHARSET_CHARS (charset1) == 94)
776 {
777 /* Invalid code. */
778 *dst++ = c1;
779 break;
780 }
781 /* This is a graphic character, we fall down ... */
782
783 case ISO_graphic_plane_1:
784 DECODE_ISO_CHARACTER (charset1, c1);
785 break;
786
787 case ISO_control_code:
788 /* All ISO2022 control characters in this class have the
789 same representation in Emacs internal format. */
790 *dst++ = c1;
791 break;
792
793 case ISO_carriage_return:
794 if (coding->eol_type == CODING_EOL_CR)
795 {
796 *dst++ = '\n';
797 }
798 else if (coding->eol_type == CODING_EOL_CRLF)
799 {
800 ONE_MORE_BYTE (c1);
801 if (c1 == ISO_CODE_LF)
802 *dst++ = '\n';
803 else
804 {
805 src--;
806 *dst++ = c1;
807 }
808 }
809 else
810 {
811 *dst++ = c1;
812 }
813 break;
814
815 case ISO_shift_out:
e0e989f6
KH
816 if (CODING_SPEC_ISO_DESIGNATION (coding, 1) < 0)
817 goto label_invalid_escape_sequence;
4ed46869
KH
818 CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
819 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
820 break;
821
822 case ISO_shift_in:
823 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
824 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
825 break;
826
827 case ISO_single_shift_2_7:
828 case ISO_single_shift_2:
829 /* SS2 is handled as an escape sequence of ESC 'N' */
830 c1 = 'N';
831 goto label_escape_sequence;
832
833 case ISO_single_shift_3:
834 /* SS2 is handled as an escape sequence of ESC 'O' */
835 c1 = 'O';
836 goto label_escape_sequence;
837
838 case ISO_control_sequence_introducer:
839 /* CSI is handled as an escape sequence of ESC '[' ... */
840 c1 = '[';
841 goto label_escape_sequence;
842
843 case ISO_escape:
844 ONE_MORE_BYTE (c1);
845 label_escape_sequence:
846 /* Escape sequences handled by Emacs are invocation,
847 designation, direction specification, and character
848 composition specification. */
849 switch (c1)
850 {
851 case '&': /* revision of following character set */
852 ONE_MORE_BYTE (c1);
853 if (!(c1 >= '@' && c1 <= '~'))
e0e989f6 854 goto label_invalid_escape_sequence;
4ed46869
KH
855 ONE_MORE_BYTE (c1);
856 if (c1 != ISO_CODE_ESC)
e0e989f6 857 goto label_invalid_escape_sequence;
4ed46869
KH
858 ONE_MORE_BYTE (c1);
859 goto label_escape_sequence;
860
861 case '$': /* designation of 2-byte character set */
862 ONE_MORE_BYTE (c1);
863 if (c1 >= '@' && c1 <= 'B')
864 { /* designation of JISX0208.1978, GB2312.1980,
865 or JISX0208.1980 */
866 DECODE_DESIGNATION (0, 2, 94, c1);
867 }
868 else if (c1 >= 0x28 && c1 <= 0x2B)
869 { /* designation of DIMENSION2_CHARS94 character set */
870 ONE_MORE_BYTE (c2);
871 DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
872 }
873 else if (c1 >= 0x2C && c1 <= 0x2F)
874 { /* designation of DIMENSION2_CHARS96 character set */
875 ONE_MORE_BYTE (c2);
876 DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
877 }
878 else
e0e989f6 879 goto label_invalid_escape_sequence;
4ed46869
KH
880 break;
881
882 case 'n': /* invocation of locking-shift-2 */
e0e989f6
KH
883 if (CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
884 goto label_invalid_escape_sequence;
4ed46869 885 CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
e0e989f6 886 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
4ed46869
KH
887 break;
888
889 case 'o': /* invocation of locking-shift-3 */
e0e989f6
KH
890 if (CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
891 goto label_invalid_escape_sequence;
4ed46869 892 CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
e0e989f6 893 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
4ed46869
KH
894 break;
895
896 case 'N': /* invocation of single-shift-2 */
e0e989f6
KH
897 if (CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
898 goto label_invalid_escape_sequence;
4ed46869
KH
899 ONE_MORE_BYTE (c1);
900 charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
901 DECODE_ISO_CHARACTER (charset, c1);
902 break;
903
904 case 'O': /* invocation of single-shift-3 */
e0e989f6
KH
905 if (CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
906 goto label_invalid_escape_sequence;
4ed46869
KH
907 ONE_MORE_BYTE (c1);
908 charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
909 DECODE_ISO_CHARACTER (charset, c1);
910 break;
911
912 case '0': /* start composing without embeded rules */
913 coding->composing = COMPOSING_NO_RULE_HEAD;
914 break;
915
916 case '1': /* end composing */
917 coding->composing = COMPOSING_NO;
918 break;
919
920 case '2': /* start composing with embeded rules */
921 coding->composing = COMPOSING_WITH_RULE_HEAD;
922 break;
923
924 case '[': /* specification of direction */
925 /* For the moment, nested direction is not supported.
926 So, the value of `coding->direction' is 0 or 1: 0
927 means left-to-right, 1 means right-to-left. */
928 ONE_MORE_BYTE (c1);
929 switch (c1)
930 {
931 case ']': /* end of the current direction */
932 coding->direction = 0;
933
934 case '0': /* end of the current direction */
935 case '1': /* start of left-to-right direction */
936 ONE_MORE_BYTE (c1);
937 if (c1 == ']')
938 coding->direction = 0;
939 else
940 goto label_invalid_escape_sequence;
941 break;
942
943 case '2': /* start of right-to-left direction */
944 ONE_MORE_BYTE (c1);
945 if (c1 == ']')
946 coding->direction= 1;
947 else
948 goto label_invalid_escape_sequence;
949 break;
950
951 default:
952 goto label_invalid_escape_sequence;
953 }
954 break;
955
956 default:
957 if (c1 >= 0x28 && c1 <= 0x2B)
958 { /* designation of DIMENSION1_CHARS94 character set */
959 ONE_MORE_BYTE (c2);
960 DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
961 }
962 else if (c1 >= 0x2C && c1 <= 0x2F)
963 { /* designation of DIMENSION1_CHARS96 character set */
964 ONE_MORE_BYTE (c2);
965 DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
966 }
967 else
968 {
969 goto label_invalid_escape_sequence;
970 }
971 }
972 /* We must update these variables now. */
973 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
974 charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
975 break;
976
977 label_invalid_escape_sequence:
978 {
979 int length = src - src_base;
980
981 bcopy (src_base, dst, length);
982 dst += length;
983 }
984 }
985 continue;
986
987 label_end_of_loop:
988 coding->carryover_size = src - src_base;
989 bcopy (src_base, coding->carryover, coding->carryover_size);
990 src = src_base;
991 break;
992 }
993
994 /* If this is the last block of the text to be decoded, we had
995 better just flush out all remaining codes in the text although
996 they are not valid characters. */
997 if (coding->last_block)
998 {
999 bcopy (src, dst, src_end - src);
1000 dst += (src_end - src);
1001 src = src_end;
1002 }
1003 *consumed = src - source;
1004 return dst - destination;
1005}
1006
1007/* ISO2022 encoding staffs. */
1008
1009/*
1010 It is not enough to say just "ISO2022" on encoding, but we have to
1011 specify more details. In Emacs, each coding-system of ISO2022
1012 variant has the following specifications:
1013 1. Initial designation to G0 thru G3.
1014 2. Allows short-form designation?
1015 3. ASCII should be designated to G0 before control characters?
1016 4. ASCII should be designated to G0 at end of line?
1017 5. 7-bit environment or 8-bit environment?
1018 6. Use locking-shift?
1019 7. Use Single-shift?
1020 And the following two are only for Japanese:
1021 8. Use ASCII in place of JIS0201-1976-Roman?
1022 9. Use JISX0208-1983 in place of JISX0208-1978?
1023 These specifications are encoded in `coding->flags' as flag bits
1024 defined by macros CODING_FLAG_ISO_XXX. See `coding.h' for more
1025 detail.
1026*/
1027
1028/* Produce codes (escape sequence) for designating CHARSET to graphic
1029 register REG. If <final-char> of CHARSET is '@', 'A', or 'B' and
1030 the coding system CODING allows, produce designation sequence of
1031 short-form. */
1032
1033#define ENCODE_DESIGNATION(charset, reg, coding) \
1034 do { \
1035 unsigned char final_char = CHARSET_ISO_FINAL_CHAR (charset); \
1036 char *intermediate_char_94 = "()*+"; \
1037 char *intermediate_char_96 = ",-./"; \
1038 Lisp_Object temp \
1039 = Fassq (make_number (charset), Vcharset_revision_alist); \
1040 if (! NILP (temp)) \
1041 { \
1042 *dst++ = ISO_CODE_ESC; \
1043 *dst++ = '&'; \
1044 *dst++ = XINT (XCONS (temp)->cdr) + '@'; \
1045 } \
1046 *dst++ = ISO_CODE_ESC; \
1047 if (CHARSET_DIMENSION (charset) == 1) \
1048 { \
1049 if (CHARSET_CHARS (charset) == 94) \
1050 *dst++ = (unsigned char) (intermediate_char_94[reg]); \
1051 else \
1052 *dst++ = (unsigned char) (intermediate_char_96[reg]); \
1053 } \
1054 else \
1055 { \
1056 *dst++ = '$'; \
1057 if (CHARSET_CHARS (charset) == 94) \
1058 { \
1059 if (! (coding->flags & CODING_FLAG_ISO_SHORT_FORM) \
1060 || reg != 0 \
1061 || final_char < '@' || final_char > 'B') \
1062 *dst++ = (unsigned char) (intermediate_char_94[reg]); \
1063 } \
1064 else \
1065 *dst++ = (unsigned char) (intermediate_char_96[reg]); \
1066 } \
1067 *dst++ = final_char; \
1068 CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset; \
1069 } while (0)
1070
1071/* The following two macros produce codes (control character or escape
1072 sequence) for ISO2022 single-shift functions (single-shift-2 and
1073 single-shift-3). */
1074
1075#define ENCODE_SINGLE_SHIFT_2 \
1076 do { \
1077 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
1078 *dst++ = ISO_CODE_ESC, *dst++ = 'N'; \
1079 else \
1080 *dst++ = ISO_CODE_SS2; \
1081 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1; \
1082 } while (0)
1083
1084#define ENCODE_SINGLE_SHIFT_3 \
1085 do { \
1086 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
1087 *dst++ = ISO_CODE_ESC, *dst++ = 'O'; \
1088 else \
1089 *dst++ = ISO_CODE_SS3; \
1090 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1; \
1091 } while (0)
1092
1093/* The following four macros produce codes (control character or
1094 escape sequence) for ISO2022 locking-shift functions (shift-in,
1095 shift-out, locking-shift-2, and locking-shift-3). */
1096
1097#define ENCODE_SHIFT_IN \
1098 do { \
1099 *dst++ = ISO_CODE_SI; \
1100 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0; \
1101 } while (0)
1102
1103#define ENCODE_SHIFT_OUT \
1104 do { \
1105 *dst++ = ISO_CODE_SO; \
1106 CODING_SPEC_ISO_INVOCATION (coding, 0) = 1; \
1107 } while (0)
1108
1109#define ENCODE_LOCKING_SHIFT_2 \
1110 do { \
1111 *dst++ = ISO_CODE_ESC, *dst++ = 'n'; \
1112 CODING_SPEC_ISO_INVOCATION (coding, 0) = 2; \
1113 } while (0)
1114
1115#define ENCODE_LOCKING_SHIFT_3 \
1116 do { \
1117 *dst++ = ISO_CODE_ESC, *dst++ = 'o'; \
1118 CODING_SPEC_ISO_INVOCATION (coding, 0) = 3; \
1119 } while (0)
1120
1121/* Produce codes for a DIMENSION1 character of which character set is
1122 CHARSET and position-code is C1. Designation and invocation
1123 sequences are also produced in advance if necessary. */
1124
1125
1126#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1) \
1127 do { \
1128 if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding)) \
1129 { \
1130 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
1131 *dst++ = c1 & 0x7F; \
1132 else \
1133 *dst++ = c1 | 0x80; \
1134 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0; \
1135 break; \
1136 } \
1137 else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0)) \
1138 { \
1139 *dst++ = c1 & 0x7F; \
1140 break; \
1141 } \
1142 else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1)) \
1143 { \
1144 *dst++ = c1 | 0x80; \
1145 break; \
1146 } \
1147 else \
1148 /* Since CHARSET is not yet invoked to any graphic planes, we \
1149 must invoke it, or, at first, designate it to some graphic \
1150 register. Then repeat the loop to actually produce the \
1151 character. */ \
1152 dst = encode_invocation_designation (charset, coding, dst); \
1153 } while (1)
1154
1155/* Produce codes for a DIMENSION2 character of which character set is
1156 CHARSET and position-codes are C1 and C2. Designation and
1157 invocation codes are also produced in advance if necessary. */
1158
1159#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2) \
1160 do { \
1161 if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding)) \
1162 { \
1163 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
1164 *dst++ = c1 & 0x7F, *dst++ = c2 & 0x7F; \
1165 else \
1166 *dst++ = c1 | 0x80, *dst++ = c2 | 0x80; \
1167 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0; \
1168 break; \
1169 } \
1170 else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0)) \
1171 { \
1172 *dst++ = c1 & 0x7F, *dst++= c2 & 0x7F; \
1173 break; \
1174 } \
1175 else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1)) \
1176 { \
1177 *dst++ = c1 | 0x80, *dst++= c2 | 0x80; \
1178 break; \
1179 } \
1180 else \
1181 /* Since CHARSET is not yet invoked to any graphic planes, we \
1182 must invoke it, or, at first, designate it to some graphic \
1183 register. Then repeat the loop to actually produce the \
1184 character. */ \
1185 dst = encode_invocation_designation (charset, coding, dst); \
1186 } while (1)
1187
bdd9fb48
KH
1188#define ENCODE_ISO_CHARACTER(charset, c1, c2) \
1189 do { \
1190 int c_alt, charset_alt; \
1191 if (!NILP (unification_table) \
1192 && ((c_alt = unify_char (unification_table, -1, charset, c1, c2)) \
1193 < 0)) \
1194 SPLIT_CHAR (c_alt, charset_alt, c1, c2); \
1195 else \
1196 charset_alt = charset; \
1197 if (CHARSET_DIMENSION (charset_alt) == 1) \
1198 ENCODE_ISO_CHARACTER_DIMENSION1 (charset_alt, c1); \
1199 else \
1200 ENCODE_ISO_CHARACTER_DIMENSION2 (charset_alt, c1, c2); \
1201 } while (0)
1202
4ed46869
KH
1203/* Produce designation and invocation codes at a place pointed by DST
1204 to use CHARSET. The element `spec.iso2022' of *CODING is updated.
1205 Return new DST. */
1206
1207unsigned char *
1208encode_invocation_designation (charset, coding, dst)
1209 int charset;
1210 struct coding_system *coding;
1211 unsigned char *dst;
1212{
1213 int reg; /* graphic register number */
1214
1215 /* At first, check designations. */
1216 for (reg = 0; reg < 4; reg++)
1217 if (charset == CODING_SPEC_ISO_DESIGNATION (coding, reg))
1218 break;
1219
1220 if (reg >= 4)
1221 {
1222 /* CHARSET is not yet designated to any graphic registers. */
1223 /* At first check the requested designation. */
1224 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1ba9e4ab
KH
1225 if (reg == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
1226 /* Since CHARSET requests no special designation, designate it
1227 to graphic register 0. */
4ed46869
KH
1228 reg = 0;
1229
1230 ENCODE_DESIGNATION (charset, reg, coding);
1231 }
1232
1233 if (CODING_SPEC_ISO_INVOCATION (coding, 0) != reg
1234 && CODING_SPEC_ISO_INVOCATION (coding, 1) != reg)
1235 {
1236 /* Since the graphic register REG is not invoked to any graphic
1237 planes, invoke it to graphic plane 0. */
1238 switch (reg)
1239 {
1240 case 0: /* graphic register 0 */
1241 ENCODE_SHIFT_IN;
1242 break;
1243
1244 case 1: /* graphic register 1 */
1245 ENCODE_SHIFT_OUT;
1246 break;
1247
1248 case 2: /* graphic register 2 */
1249 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1250 ENCODE_SINGLE_SHIFT_2;
1251 else
1252 ENCODE_LOCKING_SHIFT_2;
1253 break;
1254
1255 case 3: /* graphic register 3 */
1256 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1257 ENCODE_SINGLE_SHIFT_3;
1258 else
1259 ENCODE_LOCKING_SHIFT_3;
1260 break;
1261 }
1262 }
1263 return dst;
1264}
1265
1266/* The following two macros produce codes for indicating composition. */
1267#define ENCODE_COMPOSITION_NO_RULE_START *dst++ = ISO_CODE_ESC, *dst++ = '0'
1268#define ENCODE_COMPOSITION_WITH_RULE_START *dst++ = ISO_CODE_ESC, *dst++ = '2'
1269#define ENCODE_COMPOSITION_END *dst++ = ISO_CODE_ESC, *dst++ = '1'
1270
1271/* The following three macros produce codes for indicating direction
1272 of text. */
1273#define ENCODE_CONTROL_SEQUENCE_INTRODUCER \
1274 do { \
1275 if (coding->flags == CODING_FLAG_ISO_SEVEN_BITS) \
1276 *dst++ = ISO_CODE_ESC, *dst++ = '['; \
1277 else \
1278 *dst++ = ISO_CODE_CSI; \
1279 } while (0)
1280
1281#define ENCODE_DIRECTION_R2L \
1282 ENCODE_CONTROL_SEQUENCE_INTRODUCER, *dst++ = '2', *dst++ = ']'
1283
1284#define ENCODE_DIRECTION_L2R \
1285 ENCODE_CONTROL_SEQUENCE_INTRODUCER, *dst++ = '0', *dst++ = ']'
1286
1287/* Produce codes for designation and invocation to reset the graphic
1288 planes and registers to initial state. */
e0e989f6
KH
1289#define ENCODE_RESET_PLANE_AND_REGISTER \
1290 do { \
1291 int reg; \
1292 if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0) \
1293 ENCODE_SHIFT_IN; \
1294 for (reg = 0; reg < 4; reg++) \
1295 if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg) >= 0 \
1296 && (CODING_SPEC_ISO_DESIGNATION (coding, reg) \
1297 != CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg))) \
1298 ENCODE_DESIGNATION \
1299 (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg), reg, coding); \
4ed46869
KH
1300 } while (0)
1301
bdd9fb48
KH
1302/* Produce designation sequences of charsets in the line started from
1303 *SRC to a place pointed by DSTP.
1304
1305 If the current block ends before any end-of-line, we may fail to
1306 find all the necessary *designations. */
1307encode_designation_at_bol (coding, table, src, src_end, dstp)
e0e989f6 1308 struct coding_system *coding;
bdd9fb48 1309 Lisp_Object table;
e0e989f6
KH
1310 unsigned char *src, *src_end, **dstp;
1311{
bdd9fb48
KH
1312 int charset, c, found = 0, reg;
1313 /* Table of charsets to be designated to each graphic register. */
1314 int r[4];
1315 unsigned char *dst = *dstp;
1316
1317 for (reg = 0; reg < 4; reg++)
1318 r[reg] = -1;
1319
1320 while (src < src_end && *src != '\n' && found < 4)
e0e989f6 1321 {
bdd9fb48
KH
1322 int bytes = BYTES_BY_CHAR_HEAD (*src);
1323
1324 if (NILP (table))
1325 charset = CHARSET_AT (src);
1326 else
e0e989f6 1327 {
bdd9fb48
KH
1328 int c_alt, c1, c2;
1329
1330 SPLIT_STRING(src, bytes, charset, c1, c2);
1331 if ((c_alt = unify_char (table, -1, charset, c1, c2)) >= 0)
1332 charset = CHAR_CHARSET (c_alt);
e0e989f6 1333 }
bdd9fb48 1334
e0e989f6 1335 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1ba9e4ab 1336 if (r[reg] == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
bdd9fb48
KH
1337 {
1338 found++;
1339 r[reg] = charset;
1340 }
1341
1342 src += bytes;
1343 }
1344
1345 if (found)
1346 {
1347 for (reg = 0; reg < 4; reg++)
1348 if (r[reg] >= 0
1349 && CODING_SPEC_ISO_DESIGNATION (coding, reg) != r[reg])
1350 ENCODE_DESIGNATION (r[reg], reg, coding);
1351 *dstp = dst;
e0e989f6 1352 }
e0e989f6
KH
1353}
1354
4ed46869
KH
1355/* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
1356
1357int
1358encode_coding_iso2022 (coding, source, destination,
1359 src_bytes, dst_bytes, consumed)
1360 struct coding_system *coding;
1361 unsigned char *source, *destination;
1362 int src_bytes, dst_bytes;
1363 int *consumed;
1364{
1365 unsigned char *src = source;
1366 unsigned char *src_end = source + src_bytes;
1367 unsigned char *dst = destination;
1368 unsigned char *dst_end = destination + dst_bytes;
e0e989f6 1369 /* Since the maximum bytes produced by each loop is 20, we subtract 19
4ed46869
KH
1370 from DST_END to assure overflow checking is necessary only at the
1371 head of loop. */
e0e989f6 1372 unsigned char *adjusted_dst_end = dst_end - 19;
bdd9fb48
KH
1373 Lisp_Object unification_table = coding->character_unification_table;
1374
1375 if (!NILP (Venable_character_unification) && NILP (unification_table))
1376 unification_table = Vstandard_character_unification_table_for_write;
4ed46869
KH
1377
1378 while (src < src_end && dst < adjusted_dst_end)
1379 {
1380 /* SRC_BASE remembers the start position in source in each loop.
1381 The loop will be exited when there's not enough source text
1382 to analyze multi-byte codes (within macros ONE_MORE_BYTE,
1383 TWO_MORE_BYTES, and THREE_MORE_BYTES). In that case, SRC is
1384 reset to SRC_BASE before exiting. */
1385 unsigned char *src_base = src;
bdd9fb48 1386 int charset, c1, c2, c3, c4;
4ed46869 1387
e0e989f6
KH
1388 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL
1389 && CODING_SPEC_ISO_BOL (coding))
1390 {
bdd9fb48
KH
1391 /* We have to produce designation sequences if any now. */
1392 encode_designation_at_bol (coding, unification_table,
1393 src, src_end, &dst);
e0e989f6
KH
1394 CODING_SPEC_ISO_BOL (coding) = 0;
1395 }
1396
1397 c1 = *src++;
4ed46869
KH
1398 /* If we are seeing a component of a composite character, we are
1399 seeing a leading-code specially encoded for composition, or a
1400 composition rule if composing with rule. We must set C1
1401 to a normal leading-code or an ASCII code. If we are not at
1402 a composed character, we must reset the composition state. */
1403 if (COMPOSING_P (coding->composing))
1404 {
1405 if (c1 < 0xA0)
1406 {
1407 /* We are not in a composite character any longer. */
1408 coding->composing = COMPOSING_NO;
1409 ENCODE_COMPOSITION_END;
1410 }
1411 else
1412 {
1413 if (coding->composing == COMPOSING_WITH_RULE_RULE)
1414 {
1415 *dst++ = c1 & 0x7F;
1416 coding->composing = COMPOSING_WITH_RULE_HEAD;
1417 continue;
1418 }
1419 else if (coding->composing == COMPOSING_WITH_RULE_HEAD)
1420 coding->composing = COMPOSING_WITH_RULE_RULE;
1421 if (c1 == 0xA0)
1422 {
1423 /* This is an ASCII component. */
1424 ONE_MORE_BYTE (c1);
1425 c1 &= 0x7F;
1426 }
1427 else
1428 /* This is a leading-code of non ASCII component. */
1429 c1 -= 0x20;
1430 }
1431 }
1432
1433 /* Now encode one character. C1 is a control character, an
1434 ASCII character, or a leading-code of multi-byte character. */
1435 switch (emacs_code_class[c1])
1436 {
1437 case EMACS_ascii_code:
bdd9fb48 1438 ENCODE_ISO_CHARACTER (CHARSET_ASCII, c1, /* dummy */ c2);
4ed46869
KH
1439 break;
1440
1441 case EMACS_control_code:
1442 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
e0e989f6 1443 ENCODE_RESET_PLANE_AND_REGISTER;
4ed46869
KH
1444 *dst++ = c1;
1445 break;
1446
1447 case EMACS_carriage_return_code:
1448 if (!coding->selective)
1449 {
1450 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
e0e989f6 1451 ENCODE_RESET_PLANE_AND_REGISTER;
4ed46869
KH
1452 *dst++ = c1;
1453 break;
1454 }
1455 /* fall down to treat '\r' as '\n' ... */
1456
1457 case EMACS_linefeed_code:
1458 if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
e0e989f6
KH
1459 ENCODE_RESET_PLANE_AND_REGISTER;
1460 if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL)
1461 bcopy (coding->spec.iso2022.initial_designation,
1462 coding->spec.iso2022.current_designation,
1463 sizeof coding->spec.iso2022.initial_designation);
4ed46869 1464 if (coding->eol_type == CODING_EOL_LF
0ef69138 1465 || coding->eol_type == CODING_EOL_UNDECIDED)
4ed46869
KH
1466 *dst++ = ISO_CODE_LF;
1467 else if (coding->eol_type == CODING_EOL_CRLF)
1468 *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
1469 else
1470 *dst++ = ISO_CODE_CR;
e0e989f6 1471 CODING_SPEC_ISO_BOL (coding) = 1;
4ed46869
KH
1472 break;
1473
1474 case EMACS_leading_code_2:
1475 ONE_MORE_BYTE (c2);
bdd9fb48 1476 ENCODE_ISO_CHARACTER (c1, c2, /* dummy */ c3);
4ed46869
KH
1477 break;
1478
1479 case EMACS_leading_code_3:
1480 TWO_MORE_BYTES (c2, c3);
1481 if (c1 < LEADING_CODE_PRIVATE_11)
bdd9fb48 1482 ENCODE_ISO_CHARACTER (c1, c2, c3);
4ed46869 1483 else
bdd9fb48 1484 ENCODE_ISO_CHARACTER (c2, c3, /* dummy */ c4);
4ed46869
KH
1485 break;
1486
1487 case EMACS_leading_code_4:
1488 THREE_MORE_BYTES (c2, c3, c4);
bdd9fb48 1489 ENCODE_ISO_CHARACTER (c2, c3, c4);
4ed46869
KH
1490 break;
1491
1492 case EMACS_leading_code_composition:
1493 ONE_MORE_BYTE (c1);
1494 if (c1 == 0xFF)
1495 {
1496 coding->composing = COMPOSING_WITH_RULE_HEAD;
1497 ENCODE_COMPOSITION_WITH_RULE_START;
1498 }
1499 else
1500 {
1501 /* Rewind one byte because it is a character code of
1502 composition elements. */
1503 src--;
1504 coding->composing = COMPOSING_NO_RULE_HEAD;
1505 ENCODE_COMPOSITION_NO_RULE_START;
1506 }
1507 break;
1508
1509 case EMACS_invalid_code:
1510 *dst++ = c1;
1511 break;
1512 }
1513 continue;
1514 label_end_of_loop:
1515 coding->carryover_size = src - src_base;
1516 bcopy (src_base, coding->carryover, coding->carryover_size);
4ed46869
KH
1517 break;
1518 }
1519
1520 /* If this is the last block of the text to be encoded, we must
bdd9fb48
KH
1521 reset graphic planes and registers to the initial state. */
1522 if (src >= src_end && coding->last_block)
4ed46869 1523 {
e0e989f6 1524 ENCODE_RESET_PLANE_AND_REGISTER;
bdd9fb48
KH
1525 if (coding->carryover_size > 0
1526 && coding->carryover_size < (dst_end - dst))
1527 {
1528 bcopy (coding->carryover, dst, coding->carryover_size);
1529 dst += coding->carryover_size;
1530 coding->carryover_size = 0;
1531 }
4ed46869
KH
1532 }
1533 *consumed = src - source;
1534 return dst - destination;
1535}
1536
1537\f
1538/*** 4. SJIS and BIG5 handlers ***/
1539
1540/* Although SJIS and BIG5 are not ISO's coding system, They are used
1541 quite widely. So, for the moment, Emacs supports them in the bare
1542 C code. But, in the future, they may be supported only by CCL. */
1543
1544/* SJIS is a coding system encoding three character sets: ASCII, right
1545 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
1546 as is. A character of charset katakana-jisx0201 is encoded by
1547 "position-code + 0x80". A character of charset japanese-jisx0208
1548 is encoded in 2-byte but two position-codes are divided and shifted
1549 so that it fit in the range below.
1550
1551 --- CODE RANGE of SJIS ---
1552 (character set) (range)
1553 ASCII 0x00 .. 0x7F
1554 KATAKANA-JISX0201 0xA0 .. 0xDF
1555 JISX0208 (1st byte) 0x80 .. 0x9F and 0xE0 .. 0xFF
1556 (2nd byte) 0x40 .. 0xFF
1557 -------------------------------
1558
1559*/
1560
1561/* BIG5 is a coding system encoding two character sets: ASCII and
1562 Big5. An ASCII character is encoded as is. Big5 is a two-byte
1563 character set and is encoded in two-byte.
1564
1565 --- CODE RANGE of BIG5 ---
1566 (character set) (range)
1567 ASCII 0x00 .. 0x7F
1568 Big5 (1st byte) 0xA1 .. 0xFE
1569 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
1570 --------------------------
1571
1572 Since the number of characters in Big5 is larger than maximum
1573 characters in Emacs' charset (96x96), it can't be handled as one
1574 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
1575 and `charset-big5-2'. Both are DIMENSION2 and CHARS94. The former
1576 contains frequently used characters and the latter contains less
1577 frequently used characters. */
1578
1579/* Macros to decode or encode a character of Big5 in BIG5. B1 and B2
1580 are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
1581 C1 and C2 are the 1st and 2nd position-codes of of Emacs' internal
1582 format. CHARSET is `charset_big5_1' or `charset_big5_2'. */
1583
1584/* Number of Big5 characters which have the same code in 1st byte. */
1585#define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
1586
1587#define DECODE_BIG5(b1, b2, charset, c1, c2) \
1588 do { \
1589 unsigned int temp \
1590 = (b1 - 0xA1) * BIG5_SAME_ROW + b2 - (b2 < 0x7F ? 0x40 : 0x62); \
1591 if (b1 < 0xC9) \
1592 charset = charset_big5_1; \
1593 else \
1594 { \
1595 charset = charset_big5_2; \
1596 temp -= (0xC9 - 0xA1) * BIG5_SAME_ROW; \
1597 } \
1598 c1 = temp / (0xFF - 0xA1) + 0x21; \
1599 c2 = temp % (0xFF - 0xA1) + 0x21; \
1600 } while (0)
1601
1602#define ENCODE_BIG5(charset, c1, c2, b1, b2) \
1603 do { \
1604 unsigned int temp = (c1 - 0x21) * (0xFF - 0xA1) + (c2 - 0x21); \
1605 if (charset == charset_big5_2) \
1606 temp += BIG5_SAME_ROW * (0xC9 - 0xA1); \
1607 b1 = temp / BIG5_SAME_ROW + 0xA1; \
1608 b2 = temp % BIG5_SAME_ROW; \
1609 b2 += b2 < 0x3F ? 0x40 : 0x62; \
1610 } while (0)
1611
1612/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1613 Check if a text is encoded in SJIS. If it is, return
1614 CODING_CATEGORY_MASK_SJIS, else return 0. */
1615
1616int
1617detect_coding_sjis (src, src_end)
1618 unsigned char *src, *src_end;
1619{
1620 unsigned char c;
1621
1622 while (src < src_end)
1623 {
1624 c = *src++;
1625 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
1626 return 0;
1627 if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
1628 {
1629 if (src < src_end && *src++ < 0x40)
1630 return 0;
1631 }
1632 }
1633 return CODING_CATEGORY_MASK_SJIS;
1634}
1635
1636/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1637 Check if a text is encoded in BIG5. If it is, return
1638 CODING_CATEGORY_MASK_BIG5, else return 0. */
1639
1640int
1641detect_coding_big5 (src, src_end)
1642 unsigned char *src, *src_end;
1643{
1644 unsigned char c;
1645
1646 while (src < src_end)
1647 {
1648 c = *src++;
1649 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
1650 return 0;
1651 if (c >= 0xA1)
1652 {
1653 if (src >= src_end)
1654 break;
1655 c = *src++;
1656 if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
1657 return 0;
1658 }
1659 }
1660 return CODING_CATEGORY_MASK_BIG5;
1661}
1662
1663/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
1664 If SJIS_P is 1, decode SJIS text, else decode BIG5 test. */
1665
1666int
1667decode_coding_sjis_big5 (coding, source, destination,
1668 src_bytes, dst_bytes, consumed, sjis_p)
1669 struct coding_system *coding;
1670 unsigned char *source, *destination;
1671 int src_bytes, dst_bytes;
1672 int *consumed;
1673 int sjis_p;
1674{
1675 unsigned char *src = source;
1676 unsigned char *src_end = source + src_bytes;
1677 unsigned char *dst = destination;
1678 unsigned char *dst_end = destination + dst_bytes;
1679 /* Since the maximum bytes produced by each loop is 4, we subtract 3
1680 from DST_END to assure overflow checking is necessary only at the
1681 head of loop. */
1682 unsigned char *adjusted_dst_end = dst_end - 3;
1683
1684 while (src < src_end && dst < adjusted_dst_end)
1685 {
1686 /* SRC_BASE remembers the start position in source in each loop.
1687 The loop will be exited when there's not enough source text
1688 to analyze two-byte character (within macro ONE_MORE_BYTE).
1689 In that case, SRC is reset to SRC_BASE before exiting. */
1690 unsigned char *src_base = src;
1691 unsigned char c1 = *src++, c2, c3, c4;
1692
1693 if (c1 == '\r')
1694 {
1695 if (coding->eol_type == CODING_EOL_CRLF)
1696 {
1697 ONE_MORE_BYTE (c2);
1698 if (c2 == '\n')
1699 *dst++ = c2;
1700 else
1701 /* To process C2 again, SRC is subtracted by 1. */
1702 *dst++ = c1, src--;
1703 }
1704 else
1705 *dst++ = c1;
1706 }
1707 else if (c1 < 0x80)
1708 *dst++ = c1;
1709 else if (c1 < 0xA0 || c1 >= 0xE0)
1710 {
1711 /* SJIS -> JISX0208, BIG5 -> Big5 (only if 0xE0 <= c1 < 0xFF) */
1712 if (sjis_p)
1713 {
1714 ONE_MORE_BYTE (c2);
1715 DECODE_SJIS (c1, c2, c3, c4);
1716 DECODE_CHARACTER_DIMENSION2 (charset_jisx0208, c3, c4);
1717 }
1718 else if (c1 >= 0xE0 && c1 < 0xFF)
1719 {
1720 int charset;
1721
1722 ONE_MORE_BYTE (c2);
1723 DECODE_BIG5 (c1, c2, charset, c3, c4);
1724 DECODE_CHARACTER_DIMENSION2 (charset, c3, c4);
1725 }
1726 else /* Invalid code */
1727 *dst++ = c1;
1728 }
1729 else
1730 {
1731 /* SJIS -> JISX0201-Kana, BIG5 -> Big5 */
1732 if (sjis_p)
1733 DECODE_CHARACTER_DIMENSION1 (charset_katakana_jisx0201, c1);
1734 else
1735 {
1736 int charset;
1737
1738 ONE_MORE_BYTE (c2);
1739 DECODE_BIG5 (c1, c2, charset, c3, c4);
1740 DECODE_CHARACTER_DIMENSION2 (charset, c3, c4);
1741 }
1742 }
1743 continue;
1744
1745 label_end_of_loop:
1746 coding->carryover_size = src - src_base;
1747 bcopy (src_base, coding->carryover, coding->carryover_size);
1748 src = src_base;
1749 break;
1750 }
1751
1752 *consumed = src - source;
1753 return dst - destination;
1754}
1755
1756/* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
1757 This function can encode `charset_ascii', `charset_katakana_jisx0201',
1758 `charset_jisx0208', `charset_big5_1', and `charset_big5-2'. We are
1759 sure that all these charsets are registered as official charset
1760 (i.e. do not have extended leading-codes). Characters of other
1761 charsets are produced without any encoding. If SJIS_P is 1, encode
1762 SJIS text, else encode BIG5 text. */
1763
1764int
1765encode_coding_sjis_big5 (coding, source, destination,
1766 src_bytes, dst_bytes, consumed, sjis_p)
1767 struct coding_system *coding;
1768 unsigned char *source, *destination;
1769 int src_bytes, dst_bytes;
1770 int *consumed;
1771 int sjis_p;
1772{
1773 unsigned char *src = source;
1774 unsigned char *src_end = source + src_bytes;
1775 unsigned char *dst = destination;
1776 unsigned char *dst_end = destination + dst_bytes;
1777 /* Since the maximum bytes produced by each loop is 2, we subtract 1
1778 from DST_END to assure overflow checking is necessary only at the
1779 head of loop. */
1780 unsigned char *adjusted_dst_end = dst_end - 1;
1781
1782 while (src < src_end && dst < adjusted_dst_end)
1783 {
1784 /* SRC_BASE remembers the start position in source in each loop.
1785 The loop will be exited when there's not enough source text
1786 to analyze multi-byte codes (within macros ONE_MORE_BYTE and
1787 TWO_MORE_BYTES). In that case, SRC is reset to SRC_BASE
1788 before exiting. */
1789 unsigned char *src_base = src;
1790 unsigned char c1 = *src++, c2, c3, c4;
1791
1792 if (coding->composing)
1793 {
1794 if (c1 == 0xA0)
1795 {
1796 ONE_MORE_BYTE (c1);
1797 c1 &= 0x7F;
1798 }
1799 else if (c1 >= 0xA0)
1800 c1 -= 0x20;
1801 else
1802 coding->composing = 0;
1803 }
1804
1805 switch (emacs_code_class[c1])
1806 {
1807 case EMACS_ascii_code:
1808 case EMACS_control_code:
1809 *dst++ = c1;
1810 break;
1811
1812 case EMACS_carriage_return_code:
1813 if (!coding->selective)
1814 {
1815 *dst++ = c1;
1816 break;
1817 }
1818 /* fall down to treat '\r' as '\n' ... */
1819
1820 case EMACS_linefeed_code:
1821 if (coding->eol_type == CODING_EOL_LF
0ef69138 1822 || coding->eol_type == CODING_EOL_UNDECIDED)
4ed46869
KH
1823 *dst++ = '\n';
1824 else if (coding->eol_type == CODING_EOL_CRLF)
1825 *dst++ = '\r', *dst++ = '\n';
1826 else
1827 *dst++ = '\r';
1828 break;
1829
1830 case EMACS_leading_code_2:
1831 ONE_MORE_BYTE (c2);
1832 if (sjis_p && c1 == charset_katakana_jisx0201)
1833 *dst++ = c2;
1834 else
1835 *dst++ = c1, *dst++ = c2;
1836 break;
1837
1838 case EMACS_leading_code_3:
1839 TWO_MORE_BYTES (c2, c3);
1840 c2 &= 0x7F, c3 &= 0x7F;
1841 if (sjis_p && c1 == charset_jisx0208)
1842 {
1843 unsigned char s1, s2;
1844
1845 ENCODE_SJIS (c2, c3, s1, s2);
1846 *dst++ = s1, *dst++ = s2;
1847 }
1848 else if (!sjis_p && (c1 == charset_big5_1 || c1 == charset_big5_2))
1849 {
1850 unsigned char b1, b2;
1851
1852 ENCODE_BIG5 (c1, c2, c3, b1, b2);
1853 *dst++ = b1, *dst++ = b2;
1854 }
1855 else
1856 *dst++ = c1, *dst++ = c2, *dst++ = c3;
1857 break;
1858
1859 case EMACS_leading_code_4:
1860 THREE_MORE_BYTES (c2, c3, c4);
1861 *dst++ = c1, *dst++ = c2, *dst++ = c3, *dst++ = c4;
1862 break;
1863
1864 case EMACS_leading_code_composition:
1865 coding->composing = 1;
1866 break;
1867
1868 default: /* i.e. case EMACS_invalid_code: */
1869 *dst++ = c1;
1870 }
1871 continue;
1872
1873 label_end_of_loop:
1874 coding->carryover_size = src - src_base;
1875 bcopy (src_base, coding->carryover, coding->carryover_size);
1876 src = src_base;
1877 break;
1878 }
1879
1880 *consumed = src - source;
1881 return dst - destination;
1882}
1883
1884\f
1885/*** 5. End-of-line handlers ***/
1886
1887/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
1888 This function is called only when `coding->eol_type' is
1889 CODING_EOL_CRLF or CODING_EOL_CR. */
1890
1891decode_eol (coding, source, destination, src_bytes, dst_bytes, consumed)
1892 struct coding_system *coding;
1893 unsigned char *source, *destination;
1894 int src_bytes, dst_bytes;
1895 int *consumed;
1896{
1897 unsigned char *src = source;
1898 unsigned char *src_end = source + src_bytes;
1899 unsigned char *dst = destination;
1900 unsigned char *dst_end = destination + dst_bytes;
1901 int produced;
1902
1903 switch (coding->eol_type)
1904 {
1905 case CODING_EOL_CRLF:
1906 {
1907 /* Since the maximum bytes produced by each loop is 2, we
1908 subtract 1 from DST_END to assure overflow checking is
1909 necessary only at the head of loop. */
1910 unsigned char *adjusted_dst_end = dst_end - 1;
1911
1912 while (src < src_end && dst < adjusted_dst_end)
1913 {
1914 unsigned char *src_base = src;
1915 unsigned char c = *src++;
1916 if (c == '\r')
1917 {
1918 ONE_MORE_BYTE (c);
1919 if (c != '\n')
1920 *dst++ = '\r';
bfd99048 1921 *dst++ = c;
4ed46869
KH
1922 }
1923 else
1924 *dst++ = c;
1925 continue;
1926
1927 label_end_of_loop:
1928 coding->carryover_size = src - src_base;
1929 bcopy (src_base, coding->carryover, coding->carryover_size);
1930 src = src_base;
1931 break;
1932 }
1933 *consumed = src - source;
1934 produced = dst - destination;
1935 break;
1936 }
1937
1938 case CODING_EOL_CR:
1939 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
1940 bcopy (source, destination, produced);
1941 dst_end = destination + produced;
1942 while (dst < dst_end)
1943 if (*dst++ == '\r') dst[-1] = '\n';
1944 *consumed = produced;
1945 break;
1946
1947 default: /* i.e. case: CODING_EOL_LF */
1948 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
1949 bcopy (source, destination, produced);
1950 *consumed = produced;
1951 break;
1952 }
1953
1954 return produced;
1955}
1956
1957/* See "GENERAL NOTES about `encode_coding_XXX ()' functions". Encode
1958 format of end-of-line according to `coding->eol_type'. If
1959 `coding->selective' is 1, code '\r' in source text also means
1960 end-of-line. */
1961
1962encode_eol (coding, source, destination, src_bytes, dst_bytes, consumed)
1963 struct coding_system *coding;
1964 unsigned char *source, *destination;
1965 int src_bytes, dst_bytes;
1966 int *consumed;
1967{
1968 unsigned char *src = source;
1969 unsigned char *dst = destination;
1970 int produced;
1971
1972 if (src_bytes <= 0)
1973 return 0;
1974
1975 switch (coding->eol_type)
1976 {
1977 case CODING_EOL_LF:
0ef69138 1978 case CODING_EOL_UNDECIDED:
4ed46869
KH
1979 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
1980 bcopy (source, destination, produced);
1981 if (coding->selective)
1982 {
1983 int i = produced;
1984 while (i--)
1985 if (*dst++ == '\r') dst[-1] = '\n';
1986 }
1987 *consumed = produced;
1988
1989 case CODING_EOL_CRLF:
1990 {
1991 unsigned char c;
1992 unsigned char *src_end = source + src_bytes;
1993 unsigned char *dst_end = destination + dst_bytes;
1994 /* Since the maximum bytes produced by each loop is 2, we
1995 subtract 1 from DST_END to assure overflow checking is
1996 necessary only at the head of loop. */
1997 unsigned char *adjusted_dst_end = dst_end - 1;
1998
1999 while (src < src_end && dst < adjusted_dst_end)
2000 {
2001 c = *src++;
2002 if (c == '\n' || (c == '\r' && coding->selective))
2003 *dst++ = '\r', *dst++ = '\n';
2004 else
2005 *dst++ = c;
2006 }
2007 produced = dst - destination;
2008 *consumed = src - source;
2009 break;
2010 }
2011
2012 default: /* i.e. case CODING_EOL_CR: */
2013 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2014 bcopy (source, destination, produced);
2015 {
2016 int i = produced;
2017 while (i--)
2018 if (*dst++ == '\n') dst[-1] = '\r';
2019 }
2020 *consumed = produced;
2021 }
2022
2023 return produced;
2024}
2025
2026\f
2027/*** 6. C library functions ***/
2028
2029/* In Emacs Lisp, coding system is represented by a Lisp symbol which
2030 has a property `coding-system'. The value of this property is a
2031 vector of length 5 (called as coding-vector). Among elements of
2032 this vector, the first (element[0]) and the fifth (element[4])
2033 carry important information for decoding/encoding. Before
2034 decoding/encoding, this information should be set in fields of a
2035 structure of type `coding_system'.
2036
2037 A value of property `coding-system' can be a symbol of another
2038 subsidiary coding-system. In that case, Emacs gets coding-vector
2039 from that symbol.
2040
2041 `element[0]' contains information to be set in `coding->type'. The
2042 value and its meaning is as follows:
2043
0ef69138
KH
2044 0 -- coding_type_emacs_mule
2045 1 -- coding_type_sjis
2046 2 -- coding_type_iso2022
2047 3 -- coding_type_big5
2048 4 -- coding_type_ccl encoder/decoder written in CCL
2049 nil -- coding_type_no_conversion
2050 t -- coding_type_undecided (automatic conversion on decoding,
2051 no-conversion on encoding)
4ed46869
KH
2052
2053 `element[4]' contains information to be set in `coding->flags' and
2054 `coding->spec'. The meaning varies by `coding->type'.
2055
2056 If `coding->type' is `coding_type_iso2022', element[4] is a vector
2057 of length 32 (of which the first 13 sub-elements are used now).
2058 Meanings of these sub-elements are:
2059
2060 sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
2061 If the value is an integer of valid charset, the charset is
2062 assumed to be designated to graphic register N initially.
2063
2064 If the value is minus, it is a minus value of charset which
2065 reserves graphic register N, which means that the charset is
2066 not designated initially but should be designated to graphic
2067 register N just before encoding a character in that charset.
2068
2069 If the value is nil, graphic register N is never used on
2070 encoding.
2071
2072 sub-element[N] where N is 4 through 11: to be set in `coding->flags'
2073 Each value takes t or nil. See the section ISO2022 of
2074 `coding.h' for more information.
2075
2076 If `coding->type' is `coding_type_big5', element[4] is t to denote
2077 BIG5-ETen or nil to denote BIG5-HKU.
2078
2079 If `coding->type' takes the other value, element[4] is ignored.
2080
2081 Emacs Lisp's coding system also carries information about format of
2082 end-of-line in a value of property `eol-type'. If the value is
2083 integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
2084 means CODING_EOL_CR. If it is not integer, it should be a vector
2085 of subsidiary coding systems of which property `eol-type' has one
2086 of above values.
2087
2088*/
2089
2090/* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
2091 and set it in CODING. If CODING_SYSTEM_SYMBOL is invalid, CODING
2092 is setup so that no conversion is necessary and return -1, else
2093 return 0. */
2094
2095int
e0e989f6
KH
2096setup_coding_system (coding_system, coding)
2097 Lisp_Object coding_system;
4ed46869
KH
2098 struct coding_system *coding;
2099{
4ed46869
KH
2100 Lisp_Object type, eol_type;
2101
2102 /* At first, set several fields default values. */
2103 coding->require_flushing = 0;
2104 coding->last_block = 0;
2105 coding->selective = 0;
2106 coding->composing = 0;
2107 coding->direction = 0;
2108 coding->carryover_size = 0;
4ed46869 2109 coding->post_read_conversion = coding->pre_write_conversion = Qnil;
bdd9fb48
KH
2110 /* We have not yet implemented a way to specify unification table in
2111 a coding system. */
2112 coding->character_unification_table = Qnil;
4ed46869 2113
e0e989f6
KH
2114 Vlast_coding_system_used = coding->symbol = coding_system;
2115 eol_type = Qnil;
2116 /* Get value of property `coding-system' until we get a vector.
2117 While doing that, also get values of properties
2118 `post-read-conversion', `pre-write-conversion', and `eol-type'. */
2119 while (!NILP (coding_system) && SYMBOLP (coding_system))
4ed46869 2120 {
4ed46869 2121 if (NILP (coding->post_read_conversion))
e0e989f6 2122 coding->post_read_conversion = Fget (coding_system,
4ed46869 2123 Qpost_read_conversion);
e0e989f6
KH
2124 if (NILP (coding->pre_write_conversion))
2125 coding->pre_write_conversion = Fget (coding_system,
4ed46869 2126 Qpre_write_conversion);
e0e989f6
KH
2127 if (NILP (eol_type))
2128 eol_type = Fget (coding_system, Qeol_type);
2129 coding_system = Fget (coding_system, Qcoding_system);
4ed46869 2130 }
e0e989f6
KH
2131 if (!VECTORP (coding_system)
2132 || XVECTOR (coding_system)->size != 5)
4ed46869
KH
2133 goto label_invalid_coding_system;
2134
4ed46869 2135 if (VECTORP (eol_type))
0ef69138 2136 coding->eol_type = CODING_EOL_UNDECIDED;
4ed46869
KH
2137 else if (XFASTINT (eol_type) == 1)
2138 coding->eol_type = CODING_EOL_CRLF;
2139 else if (XFASTINT (eol_type) == 2)
2140 coding->eol_type = CODING_EOL_CR;
2141 else
2142 coding->eol_type = CODING_EOL_LF;
2143
e0e989f6 2144 type = XVECTOR (coding_system)->contents[0];
4ed46869
KH
2145 switch (XFASTINT (type))
2146 {
2147 case 0:
0ef69138 2148 coding->type = coding_type_emacs_mule;
4ed46869
KH
2149 break;
2150
2151 case 1:
2152 coding->type = coding_type_sjis;
2153 break;
2154
2155 case 2:
2156 coding->type = coding_type_iso2022;
2157 {
e0e989f6 2158 Lisp_Object val = XVECTOR (coding_system)->contents[4];
4ed46869
KH
2159 Lisp_Object *flags;
2160 int i, charset, default_reg_bits = 0;
2161
2162 if (!VECTORP (val) || XVECTOR (val)->size != 32)
2163 goto label_invalid_coding_system;
2164
2165 flags = XVECTOR (val)->contents;
2166 coding->flags
2167 = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
2168 | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
2169 | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
2170 | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
2171 | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
2172 | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
2173 | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
2174 | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
e0e989f6
KH
2175 | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION)
2176 | (NILP (flags[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL)
2177 | (NILP (flags[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL));
4ed46869
KH
2178
2179 /* Invoke graphic register 0 to plane 0. */
2180 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
2181 /* Invoke graphic register 1 to plane 1 if we can use full 8-bit. */
2182 CODING_SPEC_ISO_INVOCATION (coding, 1)
2183 = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
2184 /* Not single shifting at first. */
2185 CODING_SPEC_ISO_SINGLE_SHIFTING(coding) = 0;
e0e989f6
KH
2186 /* Beginning of buffer should also be regarded as bol. */
2187 CODING_SPEC_ISO_BOL(coding) = 1;
4ed46869
KH
2188
2189 /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
2190 FLAGS[REG] can be one of below:
2191 integer CHARSET: CHARSET occupies register I,
2192 t: designate nothing to REG initially, but can be used
2193 by any charsets,
2194 list of integer, nil, or t: designate the first
2195 element (if integer) to REG initially, the remaining
2196 elements (if integer) is designated to REG on request,
2197 if an element is t, REG can be used by any charset,
2198 nil: REG is never used. */
467e7675 2199 for (charset = 0; charset <= MAX_CHARSET; charset++)
1ba9e4ab
KH
2200 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2201 = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION;
4ed46869
KH
2202 for (i = 0; i < 4; i++)
2203 {
2204 if (INTEGERP (flags[i])
e0e989f6
KH
2205 && (charset = XINT (flags[i]), CHARSET_VALID_P (charset))
2206 || (charset = get_charset_id (flags[i])) >= 0)
4ed46869
KH
2207 {
2208 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
2209 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
2210 }
2211 else if (EQ (flags[i], Qt))
2212 {
2213 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2214 default_reg_bits |= 1 << i;
2215 }
2216 else if (CONSP (flags[i]))
2217 {
2218 Lisp_Object tail = flags[i];
2219
2220 if (INTEGERP (XCONS (tail)->car)
2221 && (charset = XINT (XCONS (tail)->car),
e0e989f6
KH
2222 CHARSET_VALID_P (charset))
2223 || (charset = get_charset_id (XCONS (tail)->car)) >= 0)
4ed46869
KH
2224 {
2225 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
2226 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
2227 }
2228 else
2229 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2230 tail = XCONS (tail)->cdr;
2231 while (CONSP (tail))
2232 {
2233 if (INTEGERP (XCONS (tail)->car)
2234 && (charset = XINT (XCONS (tail)->car),
e0e989f6
KH
2235 CHARSET_VALID_P (charset))
2236 || (charset = get_charset_id (XCONS (tail)->car)) >= 0)
4ed46869
KH
2237 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2238 = i;
2239 else if (EQ (XCONS (tail)->car, Qt))
2240 default_reg_bits |= 1 << i;
2241 tail = XCONS (tail)->cdr;
2242 }
2243 }
2244 else
2245 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2246
2247 CODING_SPEC_ISO_DESIGNATION (coding, i)
2248 = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
2249 }
2250
2251 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
2252 {
2253 /* REG 1 can be used only by locking shift in 7-bit env. */
2254 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
2255 default_reg_bits &= ~2;
2256 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
2257 /* Without any shifting, only REG 0 and 1 can be used. */
2258 default_reg_bits &= 3;
2259 }
2260
467e7675 2261 for (charset = 0; charset <= MAX_CHARSET; charset++)
4ed46869 2262 if (CHARSET_VALID_P (charset)
1ba9e4ab
KH
2263 && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2264 == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
4ed46869
KH
2265 {
2266 /* We have not yet decided where to designate CHARSET. */
2267 int reg_bits = default_reg_bits;
2268
2269 if (CHARSET_CHARS (charset) == 96)
2270 /* A charset of CHARS96 can't be designated to REG 0. */
2271 reg_bits &= ~1;
2272
2273 if (reg_bits)
2274 /* There exist some default graphic register. */
2275 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2276 = (reg_bits & 1
2277 ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
2278 else
2279 /* We anyway have to designate CHARSET to somewhere. */
2280 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2281 = (CHARSET_CHARS (charset) == 94
2282 ? 0
2283 : ((coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT
2284 || ! coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
2285 ? 1
2286 : (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT
2287 ? 2 : 0)));
2288 }
2289 }
2290 coding->require_flushing = 1;
2291 break;
2292
2293 case 3:
2294 coding->type = coding_type_big5;
2295 coding->flags
e0e989f6 2296 = (NILP (XVECTOR (coding_system)->contents[4])
4ed46869
KH
2297 ? CODING_FLAG_BIG5_HKU
2298 : CODING_FLAG_BIG5_ETEN);
2299 break;
2300
2301 case 4:
2302 coding->type = coding_type_ccl;
2303 {
e0e989f6 2304 Lisp_Object val = XVECTOR (coding_system)->contents[4];
4ed46869
KH
2305 if (CONSP (val)
2306 && VECTORP (XCONS (val)->car)
2307 && VECTORP (XCONS (val)->cdr))
2308 {
2309 setup_ccl_program (&(coding->spec.ccl.decoder), XCONS (val)->car);
2310 setup_ccl_program (&(coding->spec.ccl.encoder), XCONS (val)->cdr);
2311 }
2312 else
2313 goto label_invalid_coding_system;
2314 }
2315 coding->require_flushing = 1;
2316 break;
2317
2318 default:
2319 if (EQ (type, Qt))
0ef69138 2320 coding->type = coding_type_undecided;
4ed46869
KH
2321 else
2322 coding->type = coding_type_no_conversion;
2323 break;
2324 }
2325 return 0;
2326
2327 label_invalid_coding_system:
2328 coding->type = coding_type_no_conversion;
dec137e5 2329 coding->eol_type = CODING_EOL_LF;
e0e989f6
KH
2330 coding->symbol = coding->pre_write_conversion = coding->post_read_conversion
2331 = Qnil;
4ed46869
KH
2332 return -1;
2333}
2334
2335/* Emacs has a mechanism to automatically detect a coding system if it
2336 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
2337 it's impossible to distinguish some coding systems accurately
2338 because they use the same range of codes. So, at first, coding
2339 systems are categorized into 7, those are:
2340
0ef69138 2341 o coding-category-emacs-mule
4ed46869
KH
2342
2343 The category for a coding system which has the same code range
2344 as Emacs' internal format. Assigned the coding-system (Lisp
0ef69138 2345 symbol) `emacs-mule' by default.
4ed46869
KH
2346
2347 o coding-category-sjis
2348
2349 The category for a coding system which has the same code range
2350 as SJIS. Assigned the coding-system (Lisp
e0e989f6 2351 symbol) `shift-jis' by default.
4ed46869
KH
2352
2353 o coding-category-iso-7
2354
2355 The category for a coding system which has the same code range
2356 as ISO2022 of 7-bit environment. Assigned the coding-system
e0e989f6 2357 (Lisp symbol) `iso-2022-7' by default.
4ed46869
KH
2358
2359 o coding-category-iso-8-1
2360
2361 The category for a coding system which has the same code range
2362 as ISO2022 of 8-bit environment and graphic plane 1 used only
2363 for DIMENSION1 charset. Assigned the coding-system (Lisp
e0e989f6 2364 symbol) `iso-8859-1' by default.
4ed46869
KH
2365
2366 o coding-category-iso-8-2
2367
2368 The category for a coding system which has the same code range
2369 as ISO2022 of 8-bit environment and graphic plane 1 used only
2370 for DIMENSION2 charset. Assigned the coding-system (Lisp
e0e989f6 2371 symbol) `euc-japan' by default.
4ed46869
KH
2372
2373 o coding-category-iso-else
2374
2375 The category for a coding system which has the same code range
2376 as ISO2022 but not belongs to any of the above three
2377 categories. Assigned the coding-system (Lisp symbol)
e0e989f6 2378 `iso-2022-ss2-7' by default.
4ed46869
KH
2379
2380 o coding-category-big5
2381
2382 The category for a coding system which has the same code range
2383 as BIG5. Assigned the coding-system (Lisp symbol)
e0e989f6 2384 `cn-big5' by default.
4ed46869
KH
2385
2386 o coding-category-binary
2387
2388 The category for a coding system not categorized in any of the
2389 above. Assigned the coding-system (Lisp symbol)
e0e989f6 2390 `no-conversion' by default.
4ed46869
KH
2391
2392 Each of them is a Lisp symbol and the value is an actual
2393 `coding-system's (this is also a Lisp symbol) assigned by a user.
2394 What Emacs does actually is to detect a category of coding system.
2395 Then, it uses a `coding-system' assigned to it. If Emacs can't
2396 decide only one possible category, it selects a category of the
2397 highest priority. Priorities of categories are also specified by a
2398 user in a Lisp variable `coding-category-list'.
2399
2400*/
2401
2402/* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
2403 If it detects possible coding systems, return an integer in which
2404 appropriate flag bits are set. Flag bits are defined by macros
2405 CODING_CATEGORY_MASK_XXX in `coding.h'. */
2406
2407int
2408detect_coding_mask (src, src_bytes)
2409 unsigned char *src;
2410 int src_bytes;
2411{
2412 register unsigned char c;
2413 unsigned char *src_end = src + src_bytes;
2414 int mask;
2415
2416 /* At first, skip all ASCII characters and control characters except
2417 for three ISO2022 specific control characters. */
bcf26d6a 2418 label_loop_detect_coding:
4ed46869
KH
2419 while (src < src_end)
2420 {
2421 c = *src;
2422 if (c >= 0x80
2423 || (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
2424 break;
2425 src++;
2426 }
2427
2428 if (src >= src_end)
2429 /* We found nothing other than ASCII. There's nothing to do. */
2430 return CODING_CATEGORY_MASK_ANY;
2431
2432 /* The text seems to be encoded in some multilingual coding system.
2433 Now, try to find in which coding system the text is encoded. */
2434 if (c < 0x80)
bcf26d6a
KH
2435 {
2436 /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
2437 /* C is an ISO2022 specific control code of C0. */
2438 mask = detect_coding_iso2022 (src, src_end);
2439 src++;
2440 if (mask == CODING_CATEGORY_MASK_ANY)
2441 /* No valid ISO2022 code follows C. Try again. */
2442 goto label_loop_detect_coding;
2443 }
4ed46869
KH
2444 else if (c == ISO_CODE_SS2 || c == ISO_CODE_SS3 || c == ISO_CODE_CSI)
2445 /* C is an ISO2022 specific control code of C1,
2446 or the first byte of SJIS's 2-byte character code,
2447 or a leading code of Emacs. */
2448 mask = (detect_coding_iso2022 (src, src_end)
2449 | detect_coding_sjis (src, src_end)
0ef69138 2450 | detect_coding_emacs_mule (src, src_end));
4ed46869
KH
2451
2452 else if (c < 0xA0)
2453 /* C is the first byte of SJIS character code,
2454 or a leading-code of Emacs. */
2455 mask = (detect_coding_sjis (src, src_end)
0ef69138 2456 | detect_coding_emacs_mule (src, src_end));
4ed46869
KH
2457
2458 else
2459 /* C is a character of ISO2022 in graphic plane right,
2460 or a SJIS's 1-byte character code (i.e. JISX0201),
2461 or the first byte of BIG5's 2-byte code. */
2462 mask = (detect_coding_iso2022 (src, src_end)
2463 | detect_coding_sjis (src, src_end)
2464 | detect_coding_big5 (src, src_end));
2465
2466 return mask;
2467}
2468
2469/* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
2470 The information of the detected coding system is set in CODING. */
2471
2472void
2473detect_coding (coding, src, src_bytes)
2474 struct coding_system *coding;
2475 unsigned char *src;
2476 int src_bytes;
2477{
2478 int mask = detect_coding_mask (src, src_bytes);
2479 int idx;
2480
2481 if (mask == CODING_CATEGORY_MASK_ANY)
2482 /* We found nothing other than ASCII. There's nothing to do. */
2483 return;
2484
2485 if (!mask)
2486 /* The source text seems to be encoded in unknown coding system.
2487 Emacs regards the category of such a kind of coding system as
2488 `coding-category-binary'. We assume that a user has assigned
2489 an appropriate coding system for a `coding-category-binary'. */
2490 idx = CODING_CATEGORY_IDX_BINARY;
2491 else
2492 {
2493 /* We found some plausible coding systems. Let's use a coding
2494 system of the highest priority. */
2495 Lisp_Object val = Vcoding_category_list;
2496
2497 if (CONSP (val))
2498 while (!NILP (val))
2499 {
2500 idx = XFASTINT (Fget (XCONS (val)->car, Qcoding_category_index));
2501 if ((idx < CODING_CATEGORY_IDX_MAX) && (mask & (1 << idx)))
2502 break;
2503 val = XCONS (val)->cdr;
2504 }
2505 else
2506 val = Qnil;
2507
2508 if (NILP (val))
2509 {
2510 /* For unknown reason, `Vcoding_category_list' contains none
2511 of found categories. Let's use any of them. */
2512 for (idx = 0; idx < CODING_CATEGORY_IDX_MAX; idx++)
2513 if (mask & (1 << idx))
2514 break;
2515 }
2516 }
2517 setup_coding_system (XSYMBOL (coding_category_table[idx])->value, coding);
2518}
2519
2520/* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
2521 is encoded. Return one of CODING_EOL_LF, CODING_EOL_CRLF,
0ef69138 2522 CODING_EOL_CR, and CODING_EOL_UNDECIDED. */
4ed46869
KH
2523
2524int
2525detect_eol_type (src, src_bytes)
2526 unsigned char *src;
2527 int src_bytes;
2528{
2529 unsigned char *src_end = src + src_bytes;
2530 unsigned char c;
2531
2532 while (src < src_end)
2533 {
2534 c = *src++;
2535 if (c == '\n')
2536 return CODING_EOL_LF;
2537 else if (c == '\r')
2538 {
2539 if (src < src_end && *src == '\n')
2540 return CODING_EOL_CRLF;
2541 else
2542 return CODING_EOL_CR;
2543 }
2544 }
0ef69138 2545 return CODING_EOL_UNDECIDED;
4ed46869
KH
2546}
2547
2548/* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
2549 is encoded. If it detects an appropriate format of end-of-line, it
2550 sets the information in *CODING. */
2551
2552void
2553detect_eol (coding, src, src_bytes)
2554 struct coding_system *coding;
2555 unsigned char *src;
2556 int src_bytes;
2557{
2558 Lisp_Object val;
2559 int eol_type = detect_eol_type (src, src_bytes);
2560
0ef69138 2561 if (eol_type == CODING_EOL_UNDECIDED)
4ed46869
KH
2562 /* We found no end-of-line in the source text. */
2563 return;
2564
2565 val = Fget (coding->symbol, Qeol_type);
2566 if (VECTORP (val) && XVECTOR (val)->size == 3)
2567 setup_coding_system (XVECTOR (val)->contents[eol_type], coding);
2568}
2569
2570/* See "GENERAL NOTES about `decode_coding_XXX ()' functions". Before
2571 decoding, it may detect coding system and format of end-of-line if
2572 those are not yet decided. */
2573
2574int
2575decode_coding (coding, source, destination, src_bytes, dst_bytes, consumed)
2576 struct coding_system *coding;
2577 unsigned char *source, *destination;
2578 int src_bytes, dst_bytes;
2579 int *consumed;
2580{
2581 int produced;
2582
2583 if (src_bytes <= 0)
2584 {
2585 *consumed = 0;
2586 return 0;
2587 }
2588
0ef69138 2589 if (coding->type == coding_type_undecided)
4ed46869
KH
2590 detect_coding (coding, source, src_bytes);
2591
0ef69138 2592 if (coding->eol_type == CODING_EOL_UNDECIDED)
4ed46869
KH
2593 detect_eol (coding, source, src_bytes);
2594
2595 coding->carryover_size = 0;
2596 switch (coding->type)
2597 {
2598 case coding_type_no_conversion:
2599 label_no_conversion:
2600 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2601 bcopy (source, destination, produced);
2602 *consumed = produced;
2603 break;
2604
0ef69138
KH
2605 case coding_type_emacs_mule:
2606 case coding_type_undecided:
4ed46869 2607 if (coding->eol_type == CODING_EOL_LF
0ef69138 2608 || coding->eol_type == CODING_EOL_UNDECIDED)
4ed46869
KH
2609 goto label_no_conversion;
2610 produced = decode_eol (coding, source, destination,
2611 src_bytes, dst_bytes, consumed);
2612 break;
2613
2614 case coding_type_sjis:
2615 produced = decode_coding_sjis_big5 (coding, source, destination,
2616 src_bytes, dst_bytes, consumed,
2617 1);
2618 break;
2619
2620 case coding_type_iso2022:
2621 produced = decode_coding_iso2022 (coding, source, destination,
2622 src_bytes, dst_bytes, consumed);
2623 break;
2624
2625 case coding_type_big5:
2626 produced = decode_coding_sjis_big5 (coding, source, destination,
2627 src_bytes, dst_bytes, consumed,
2628 0);
2629 break;
2630
2631 case coding_type_ccl:
2632 produced = ccl_driver (&coding->spec.ccl.decoder, source, destination,
2633 src_bytes, dst_bytes, consumed);
2634 break;
2635 }
2636
2637 return produced;
2638}
2639
2640/* See "GENERAL NOTES about `encode_coding_XXX ()' functions". */
2641
2642int
2643encode_coding (coding, source, destination, src_bytes, dst_bytes, consumed)
2644 struct coding_system *coding;
2645 unsigned char *source, *destination;
2646 int src_bytes, dst_bytes;
2647 int *consumed;
2648{
2649 int produced;
2650
2651 coding->carryover_size = 0;
2652 switch (coding->type)
2653 {
2654 case coding_type_no_conversion:
2655 label_no_conversion:
2656 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2657 if (produced > 0)
2658 {
2659 bcopy (source, destination, produced);
2660 if (coding->selective)
2661 {
2662 unsigned char *p = destination, *pend = destination + produced;
2663 while (p < pend)
e0e989f6 2664 if (*p++ == '\015') p[-1] = '\n';
4ed46869
KH
2665 }
2666 }
2667 *consumed = produced;
2668 break;
2669
0ef69138
KH
2670 case coding_type_emacs_mule:
2671 case coding_type_undecided:
4ed46869 2672 if (coding->eol_type == CODING_EOL_LF
0ef69138 2673 || coding->eol_type == CODING_EOL_UNDECIDED)
4ed46869
KH
2674 goto label_no_conversion;
2675 produced = encode_eol (coding, source, destination,
2676 src_bytes, dst_bytes, consumed);
2677 break;
2678
2679 case coding_type_sjis:
2680 produced = encode_coding_sjis_big5 (coding, source, destination,
2681 src_bytes, dst_bytes, consumed,
2682 1);
2683 break;
2684
2685 case coding_type_iso2022:
2686 produced = encode_coding_iso2022 (coding, source, destination,
2687 src_bytes, dst_bytes, consumed);
2688 break;
2689
2690 case coding_type_big5:
2691 produced = encode_coding_sjis_big5 (coding, source, destination,
2692 src_bytes, dst_bytes, consumed,
2693 0);
2694 break;
2695
2696 case coding_type_ccl:
2697 produced = ccl_driver (&coding->spec.ccl.encoder, source, destination,
2698 src_bytes, dst_bytes, consumed);
2699 break;
2700 }
2701
2702 return produced;
2703}
2704
2705#define CONVERSION_BUFFER_EXTRA_ROOM 256
2706
2707/* Return maximum size (bytes) of a buffer enough for decoding
2708 SRC_BYTES of text encoded in CODING. */
2709
2710int
2711decoding_buffer_size (coding, src_bytes)
2712 struct coding_system *coding;
2713 int src_bytes;
2714{
2715 int magnification;
2716
2717 if (coding->type == coding_type_iso2022)
2718 magnification = 3;
2719 else if (coding->type == coding_type_ccl)
2720 magnification = coding->spec.ccl.decoder.buf_magnification;
2721 else
2722 magnification = 2;
2723
2724 return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
2725}
2726
2727/* Return maximum size (bytes) of a buffer enough for encoding
2728 SRC_BYTES of text to CODING. */
2729
2730int
2731encoding_buffer_size (coding, src_bytes)
2732 struct coding_system *coding;
2733 int src_bytes;
2734{
2735 int magnification;
2736
2737 if (coding->type == coding_type_ccl)
2738 magnification = coding->spec.ccl.encoder.buf_magnification;
2739 else
2740 magnification = 3;
2741
2742 return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
2743}
2744
2745#ifndef MINIMUM_CONVERSION_BUFFER_SIZE
2746#define MINIMUM_CONVERSION_BUFFER_SIZE 1024
2747#endif
2748
2749char *conversion_buffer;
2750int conversion_buffer_size;
2751
2752/* Return a pointer to a SIZE bytes of buffer to be used for encoding
2753 or decoding. Sufficient memory is allocated automatically. If we
2754 run out of memory, return NULL. */
2755
2756char *
2757get_conversion_buffer (size)
2758 int size;
2759{
2760 if (size > conversion_buffer_size)
2761 {
2762 char *buf;
2763 int real_size = conversion_buffer_size * 2;
2764
2765 while (real_size < size) real_size *= 2;
2766 buf = (char *) xmalloc (real_size);
2767 xfree (conversion_buffer);
2768 conversion_buffer = buf;
2769 conversion_buffer_size = real_size;
2770 }
2771 return conversion_buffer;
2772}
2773
2774\f
2775#ifdef emacs
2776/*** 7. Emacs Lisp library functions ***/
2777
02ba4723 2778DEFUN ("coding-system-spec", Fcoding_system_spec, Scoding_system_spec,
4ed46869 2779 1, 1, 0,
02ba4723 2780 "Return coding-spec of CODING-SYSTEM.\n\
4ed46869
KH
2781If CODING-SYSTEM is not a valid coding-system, return nil.")
2782 (obj)
2783 Lisp_Object obj;
2784{
2785 while (SYMBOLP (obj) && !NILP (obj))
2786 obj = Fget (obj, Qcoding_system);
2787 return ((NILP (obj) || !VECTORP (obj) || XVECTOR (obj)->size != 5)
2788 ? Qnil : obj);
2789}
2790
2791DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
2792 "Return t if OBJECT is nil or a coding-system.\n\
2793See document of make-coding-system for coding-system object.")
2794 (obj)
2795 Lisp_Object obj;
2796{
02ba4723 2797 return ((NILP (obj) || !NILP (Fcoding_system_spec (obj))) ? Qt : Qnil);
4ed46869
KH
2798}
2799
9d991de8
RS
2800DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
2801 Sread_non_nil_coding_system, 1, 1, 0,
e0e989f6 2802 "Read a coding system from the minibuffer, prompting with string PROMPT.")
4ed46869
KH
2803 (prompt)
2804 Lisp_Object prompt;
2805{
e0e989f6 2806 Lisp_Object val;
9d991de8
RS
2807 do
2808 {
02ba4723 2809 val = Fcompleting_read (prompt, Vobarray, Qcoding_system_spec,
9d991de8
RS
2810 Qt, Qnil, Qnil, Qnil);
2811 }
2812 while (XSTRING (val)->size == 0);
e0e989f6 2813 return (Fintern (val, Qnil));
4ed46869
KH
2814}
2815
2816DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 1, 0,
e0e989f6 2817 "Read a coding system or nil from the minibuffer, prompting with string PROMPT.")
4ed46869
KH
2818 (prompt)
2819 Lisp_Object prompt;
2820{
e0e989f6 2821 Lisp_Object val = Fcompleting_read (prompt, Vobarray, Qcoding_system_p,
9d991de8 2822 Qt, Qnil, Qnil, Qnil);
e0e989f6 2823 return (XSTRING (val)->size == 0 ? Qnil : Fintern (val, Qnil));
4ed46869
KH
2824}
2825
2826DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
2827 1, 1, 0,
2828 "Check validity of CODING-SYSTEM.\n\
2829If valid, return CODING-SYSTEM, else `coding-system-error' is signaled.\n\
2830CODING-SYSTEM is valid if it is a symbol and has \"coding-system\" property.\n\
2831The value of property should be a vector of length 5.")
2832 (coding_system)
2833 Lisp_Object coding_system;
2834{
2835 CHECK_SYMBOL (coding_system, 0);
2836 if (!NILP (Fcoding_system_p (coding_system)))
2837 return coding_system;
2838 while (1)
02ba4723 2839 Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
4ed46869
KH
2840}
2841
2842DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
2843 2, 2, 0,
2844 "Detect coding-system of the text in the region between START and END.\n\
2845Return a list of possible coding-systems ordered by priority.\n\
0ef69138 2846If only ASCII characters are found, it returns `undecided'\n\
4ed46869
KH
2847 or its subsidiary coding-system according to a detected end-of-line format.")
2848 (b, e)
2849 Lisp_Object b, e;
2850{
2851 int coding_mask, eol_type;
2852 Lisp_Object val;
2853 int beg, end;
2854
2855 validate_region (&b, &e);
2856 beg = XINT (b), end = XINT (e);
2857 if (beg < GPT && end >= GPT) move_gap (end);
2858
2859 coding_mask = detect_coding_mask (POS_ADDR (beg), end - beg);
2860 eol_type = detect_eol_type (POS_ADDR (beg), end - beg);
2861
2862 if (coding_mask == CODING_CATEGORY_MASK_ANY)
2863 {
0ef69138
KH
2864 val = intern ("undecided");
2865 if (eol_type != CODING_EOL_UNDECIDED)
4ed46869
KH
2866 {
2867 Lisp_Object val2 = Fget (val, Qeol_type);
2868 if (VECTORP (val2))
2869 val = XVECTOR (val2)->contents[eol_type];
2870 }
2871 }
2872 else
2873 {
2874 Lisp_Object val2;
2875
2876 /* At first, gather possible coding-systems in VAL in a reverse
2877 order. */
2878 val = Qnil;
2879 for (val2 = Vcoding_category_list;
2880 !NILP (val2);
2881 val2 = XCONS (val2)->cdr)
2882 {
2883 int idx
2884 = XFASTINT (Fget (XCONS (val2)->car, Qcoding_category_index));
2885 if (coding_mask & (1 << idx))
2886 val = Fcons (Fsymbol_value (XCONS (val2)->car), val);
2887 }
2888
2889 /* Then, change the order of the list, while getting subsidiary
2890 coding-systems. */
2891 val2 = val;
2892 val = Qnil;
2893 for (; !NILP (val2); val2 = XCONS (val2)->cdr)
2894 {
0ef69138 2895 if (eol_type == CODING_EOL_UNDECIDED)
4ed46869
KH
2896 val = Fcons (XCONS (val2)->car, val);
2897 else
2898 {
2899 Lisp_Object val3 = Fget (XCONS (val2)->car, Qeol_type);
2900 if (VECTORP (val3))
2901 val = Fcons (XVECTOR (val3)->contents[eol_type], val);
2902 else
2903 val = Fcons (XCONS (val2)->car, val);
2904 }
2905 }
2906 }
2907
2908 return val;
2909}
2910
2911/* Scan text in the region between *BEGP and *ENDP, skip characters
2912 which we never have to encode to (iff ENCODEP is 1) or decode from
2913 coding system CODING at the head and tail, then set BEGP and ENDP
2914 to the addresses of start and end of the text we actually convert. */
2915
2916void
2917shrink_conversion_area (begp, endp, coding, encodep)
2918 unsigned char **begp, **endp;
2919 struct coding_system *coding;
2920 int encodep;
2921{
2922 register unsigned char *beg_addr = *begp, *end_addr = *endp;
2923
2924 if (coding->eol_type != CODING_EOL_LF
0ef69138 2925 && coding->eol_type != CODING_EOL_UNDECIDED)
4ed46869
KH
2926 /* Since we anyway have to convert end-of-line format, it is not
2927 worth skipping at most 100 bytes or so. */
2928 return;
2929
2930 if (encodep) /* for encoding */
2931 {
2932 switch (coding->type)
2933 {
2934 case coding_type_no_conversion:
0ef69138
KH
2935 case coding_type_emacs_mule:
2936 case coding_type_undecided:
4ed46869
KH
2937 /* We need no conversion. */
2938 *begp = *endp;
2939 return;
2940 case coding_type_ccl:
2941 /* We can't skip any data. */
2942 return;
e0e989f6
KH
2943 case coding_type_iso2022:
2944 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL)
2945 {
2946 unsigned char *bol = beg_addr;
2947 while (beg_addr < end_addr && *beg_addr < 0x80)
2948 {
2949 beg_addr++;
2950 if (*(beg_addr - 1) == '\n')
2951 bol = beg_addr;
2952 }
2953 beg_addr = bol;
2954 goto label_skip_tail;
2955 }
2956 /* fall down ... */
4ed46869
KH
2957 default:
2958 /* We can skip all ASCII characters at the head and tail. */
2959 while (beg_addr < end_addr && *beg_addr < 0x80) beg_addr++;
e0e989f6 2960 label_skip_tail:
4ed46869
KH
2961 while (beg_addr < end_addr && *(end_addr - 1) < 0x80) end_addr--;
2962 break;
2963 }
2964 }
2965 else /* for decoding */
2966 {
2967 switch (coding->type)
2968 {
2969 case coding_type_no_conversion:
2970 /* We need no conversion. */
2971 *begp = *endp;
2972 return;
0ef69138 2973 case coding_type_emacs_mule:
4ed46869
KH
2974 if (coding->eol_type == CODING_EOL_LF)
2975 {
2976 /* We need no conversion. */
2977 *begp = *endp;
2978 return;
2979 }
2980 /* We can skip all but carriage-return. */
2981 while (beg_addr < end_addr && *beg_addr != '\r') beg_addr++;
2982 while (beg_addr < end_addr && *(end_addr - 1) != '\r') end_addr--;
2983 break;
2984 case coding_type_sjis:
2985 case coding_type_big5:
2986 /* We can skip all ASCII characters at the head. */
2987 while (beg_addr < end_addr && *beg_addr < 0x80) beg_addr++;
2988 /* We can skip all ASCII characters at the tail except for
2989 the second byte of SJIS or BIG5 code. */
2990 while (beg_addr < end_addr && *(end_addr - 1) < 0x80) end_addr--;
2991 if (end_addr != *endp)
2992 end_addr++;
2993 break;
2994 case coding_type_ccl:
2995 /* We can't skip any data. */
2996 return;
2997 default: /* i.e. case coding_type_iso2022: */
2998 {
2999 unsigned char c;
3000
3001 /* We can skip all ASCII characters except for a few
3002 control codes at the head. */
3003 while (beg_addr < end_addr && (c = *beg_addr) < 0x80
3004 && c != ISO_CODE_CR && c != ISO_CODE_SO
3005 && c != ISO_CODE_SI && c != ISO_CODE_ESC)
3006 beg_addr++;
3007 }
3008 break;
3009 }
3010 }
3011 *begp = beg_addr;
3012 *endp = end_addr;
3013 return;
3014}
3015
3016/* Encode to (iff ENCODEP is 1) or decode form coding system CODING a
3017 text between B and E. B and E are buffer position. */
3018
3019Lisp_Object
3020code_convert_region (b, e, coding, encodep)
3021 Lisp_Object b, e;
3022 struct coding_system *coding;
3023 int encodep;
3024{
3025 int beg, end, len, consumed, produced;
3026 char *buf;
3027 unsigned char *begp, *endp;
3028 int pos = PT;
3029
3030 validate_region (&b, &e);
3031 beg = XINT (b), end = XINT (e);
3032 if (beg < GPT && end >= GPT)
3033 move_gap (end);
3034
3035 if (encodep && !NILP (coding->pre_write_conversion))
3036 {
3037 /* We must call a pre-conversion function which may put a new
3038 text to be converted in a new buffer. */
3039 struct buffer *old = current_buffer, *new;
3040
3041 TEMP_SET_PT (beg);
3042 call2 (coding->pre_write_conversion, b, e);
3043 if (old != current_buffer)
3044 {
3045 /* Replace the original text by the text just generated. */
3046 len = ZV - BEGV;
3047 new = current_buffer;
3048 set_buffer_internal (old);
3049 del_range (beg, end);
3050 insert_from_buffer (new, 1, len, 0);
3051 end = beg + len;
3052 }
3053 }
3054
3055 /* We may be able to shrink the conversion region. */
3056 begp = POS_ADDR (beg); endp = begp + (end - beg);
3057 shrink_conversion_area (&begp, &endp, coding, encodep);
3058
3059 if (begp == endp)
3060 /* We need no conversion. */
3061 len = end - beg;
3062 else
3063 {
3064 beg += begp - POS_ADDR (beg);
3065 end = beg + (endp - begp);
3066
3067 if (encodep)
3068 len = encoding_buffer_size (coding, end - beg);
3069 else
3070 len = decoding_buffer_size (coding, end - beg);
3071 buf = get_conversion_buffer (len);
3072
3073 coding->last_block = 1;
3074 produced = (encodep
3075 ? encode_coding (coding, POS_ADDR (beg), buf, end - beg, len,
3076 &consumed)
3077 : decode_coding (coding, POS_ADDR (beg), buf, end - beg, len,
3078 &consumed));
3079
3080 len = produced + (beg - XINT (b)) + (XINT (e) - end);
3081
3082 TEMP_SET_PT (beg);
3083 insert (buf, produced);
3084 del_range (PT, PT + end - beg);
3085 if (pos >= end)
3086 pos = PT + (pos - end);
3087 else if (pos > beg)
3088 pos = beg;
3089 TEMP_SET_PT (pos);
3090 }
3091
3092 if (!encodep && !NILP (coding->post_read_conversion))
3093 {
3094 /* We must call a post-conversion function which may alter
3095 the text just converted. */
3096 Lisp_Object insval;
3097
3098 beg = XINT (b);
3099 TEMP_SET_PT (beg);
3100 insval = call1 (coding->post_read_conversion, make_number (len));
3101 CHECK_NUMBER (insval, 0);
3102 len = XINT (insval);
3103 }
3104
3105 return make_number (len);
3106}
3107
3108Lisp_Object
e0e989f6
KH
3109code_convert_string (str, coding, encodep, nocopy)
3110 Lisp_Object str, nocopy;
4ed46869
KH
3111 struct coding_system *coding;
3112 int encodep;
3113{
3114 int len, consumed, produced;
3115 char *buf;
3116 unsigned char *begp, *endp;
3117 int head_skip, tail_skip;
3118 struct gcpro gcpro1;
3119
3120 if (encodep && !NILP (coding->pre_write_conversion)
3121 || !encodep && !NILP (coding->post_read_conversion))
3122 {
3123 /* Since we have to call Lisp functions which assume target text
3124 is in a buffer, after setting a temporary buffer, call
3125 code_convert_region. */
3126 int count = specpdl_ptr - specpdl;
3127 int len = XSTRING (str)->size;
3128 Lisp_Object result;
3129 struct buffer *old = current_buffer;
3130
3131 record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
3132 temp_output_buffer_setup (" *code-converting-work*");
3133 set_buffer_internal (XBUFFER (Vstandard_output));
3134 insert_from_string (str, 0, len, 0);
3135 code_convert_region (make_number (BEGV), make_number (ZV),
3136 coding, encodep);
3137 result = make_buffer_string (BEGV, ZV, 0);
3138 set_buffer_internal (old);
3139 return unbind_to (count, result);
3140 }
3141
3142 /* We may be able to shrink the conversion region. */
3143 begp = XSTRING (str)->data;
3144 endp = begp + XSTRING (str)->size;
3145 shrink_conversion_area (&begp, &endp, coding, encodep);
3146
3147 if (begp == endp)
3148 /* We need no conversion. */
e0e989f6 3149 return (NILP (nocopy) ? Fcopy_sequence (str) : str);
4ed46869
KH
3150
3151 head_skip = begp - XSTRING (str)->data;
3152 tail_skip = XSTRING (str)->size - head_skip - (endp - begp);
3153
3154 GCPRO1 (str);
3155
3156 if (encodep)
3157 len = encoding_buffer_size (coding, endp - begp);
3158 else
3159 len = decoding_buffer_size (coding, endp - begp);
3160 buf = get_conversion_buffer (len + head_skip + tail_skip);
3161
3162 bcopy (XSTRING (str)->data, buf, head_skip);
3163 coding->last_block = 1;
3164 produced = (encodep
3165 ? encode_coding (coding, XSTRING (str)->data + head_skip,
3166 buf + head_skip, endp - begp, len, &consumed)
3167 : decode_coding (coding, XSTRING (str)->data + head_skip,
3168 buf + head_skip, endp - begp, len, &consumed));
3169 bcopy (XSTRING (str)->data + head_skip + (endp - begp),
3170 buf + head_skip + produced,
3171 tail_skip);
3172
3173 UNGCPRO;
3174
3175 return make_string (buf, head_skip + produced + tail_skip);
3176}
3177
3178DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
e0e989f6
KH
3179 3, 3, "r\nzCoding system: ",
3180 "Decode current region by specified coding system.\n\
3181When called from a program, takes three arguments:\n\
3182START, END, and CODING-SYSTEM. START END are buffer positions.\n\
4ed46869
KH
3183Return length of decoded text.")
3184 (b, e, coding_system)
3185 Lisp_Object b, e, coding_system;
3186{
3187 struct coding_system coding;
3188
3189 CHECK_NUMBER_COERCE_MARKER (b, 0);
3190 CHECK_NUMBER_COERCE_MARKER (e, 1);
3191 CHECK_SYMBOL (coding_system, 2);
3192
e0e989f6
KH
3193 if (NILP (coding_system))
3194 return make_number (XFASTINT (e) - XFASTINT (b));
4ed46869
KH
3195 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3196 error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3197
3198 return code_convert_region (b, e, &coding, 0);
3199}
3200
3201DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
e0e989f6
KH
3202 3, 3, "r\nzCoding system: ",
3203 "Encode current region by specified coding system.\n\
3204When called from a program, takes three arguments:\n\
3205START, END, and CODING-SYSTEM. START END are buffer positions.\n\
4ed46869
KH
3206Return length of encoded text.")
3207 (b, e, coding_system)
3208 Lisp_Object b, e, coding_system;
3209{
3210 struct coding_system coding;
3211
3212 CHECK_NUMBER_COERCE_MARKER (b, 0);
3213 CHECK_NUMBER_COERCE_MARKER (e, 1);
3214 CHECK_SYMBOL (coding_system, 2);
3215
e0e989f6
KH
3216 if (NILP (coding_system))
3217 return make_number (XFASTINT (e) - XFASTINT (b));
4ed46869
KH
3218 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3219 error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3220
3221 return code_convert_region (b, e, &coding, 1);
3222}
3223
3224DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
e0e989f6
KH
3225 2, 3, 0,
3226 "Decode STRING which is encoded in CODING-SYSTEM, and return the result.\n\
3227Optional arg NOCOPY non-nil means return STRING itself if there's no need\n\
3228of decoding.")
3229 (string, coding_system, nocopy)
3230 Lisp_Object string, coding_system, nocopy;
4ed46869
KH
3231{
3232 struct coding_system coding;
3233
3234 CHECK_STRING (string, 0);
3235 CHECK_SYMBOL (coding_system, 1);
3236
e0e989f6
KH
3237 if (NILP (coding_system))
3238 return (NILP (nocopy) ? Fcopy_sequence (string) : string);
4ed46869
KH
3239 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3240 error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3241
e0e989f6 3242 return code_convert_string (string, &coding, 0, nocopy);
4ed46869
KH
3243}
3244
3245DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
e0e989f6
KH
3246 2, 3, 0,
3247 "Encode STRING to CODING-SYSTEM, and return the result.\n\
3248Optional arg NOCOPY non-nil means return STRING itself if there's no need\n\
3249of encoding.")
3250 (string, coding_system, nocopy)
3251 Lisp_Object string, coding_system, nocopy;
4ed46869
KH
3252{
3253 struct coding_system coding;
3254
3255 CHECK_STRING (string, 0);
3256 CHECK_SYMBOL (coding_system, 1);
3257
e0e989f6
KH
3258 if (NILP (coding_system))
3259 return (NILP (nocopy) ? Fcopy_sequence (string) : string);
4ed46869
KH
3260 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3261 error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3262
e0e989f6 3263 return code_convert_string (string, &coding, 1, nocopy);
4ed46869
KH
3264}
3265
3266DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
e0e989f6 3267 "Decode a JISX0208 character of shift-jis encoding.\n\
4ed46869
KH
3268CODE is the character code in SJIS.\n\
3269Return the corresponding character.")
3270 (code)
3271 Lisp_Object code;
3272{
3273 unsigned char c1, c2, s1, s2;
3274 Lisp_Object val;
3275
3276 CHECK_NUMBER (code, 0);
3277 s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
3278 DECODE_SJIS (s1, s2, c1, c2);
3279 XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset_jisx0208, c1, c2));
3280 return val;
3281}
3282
3283DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
3284 "Encode a JISX0208 character CHAR to SJIS coding-system.\n\
3285Return the corresponding character code in SJIS.")
3286 (ch)
3287 Lisp_Object ch;
3288{
bcf26d6a 3289 int charset, c1, c2, s1, s2;
4ed46869
KH
3290 Lisp_Object val;
3291
3292 CHECK_NUMBER (ch, 0);
3293 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
3294 if (charset == charset_jisx0208)
3295 {
3296 ENCODE_SJIS (c1, c2, s1, s2);
bcf26d6a 3297 XSETFASTINT (val, (s1 << 8) | s2);
4ed46869
KH
3298 }
3299 else
3300 XSETFASTINT (val, 0);
3301 return val;
3302}
3303
3304DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
3305 "Decode a Big5 character CODE of BIG5 coding-system.\n\
3306CODE is the character code in BIG5.\n\
3307Return the corresponding character.")
3308 (code)
3309 Lisp_Object code;
3310{
3311 int charset;
3312 unsigned char b1, b2, c1, c2;
3313 Lisp_Object val;
3314
3315 CHECK_NUMBER (code, 0);
3316 b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF;
3317 DECODE_BIG5 (b1, b2, charset, c1, c2);
3318 XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset, c1, c2));
3319 return val;
3320}
3321
3322DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
3323 "Encode the Big5 character CHAR to BIG5 coding-system.\n\
3324Return the corresponding character code in Big5.")
3325 (ch)
3326 Lisp_Object ch;
3327{
bcf26d6a 3328 int charset, c1, c2, b1, b2;
4ed46869
KH
3329 Lisp_Object val;
3330
3331 CHECK_NUMBER (ch, 0);
3332 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
3333 if (charset == charset_big5_1 || charset == charset_big5_2)
3334 {
3335 ENCODE_BIG5 (charset, c1, c2, b1, b2);
bcf26d6a 3336 XSETFASTINT (val, (b1 << 8) | b2);
4ed46869
KH
3337 }
3338 else
3339 XSETFASTINT (val, 0);
3340 return val;
3341}
3342
1ba9e4ab
KH
3343DEFUN ("set-terminal-coding-system-internal",
3344 Fset_terminal_coding_system_internal,
3345 Sset_terminal_coding_system_internal, 1, 1, 0, "")
4ed46869
KH
3346 (coding_system)
3347 Lisp_Object coding_system;
3348{
3349 CHECK_SYMBOL (coding_system, 0);
3350 setup_coding_system (Fcheck_coding_system (coding_system), &terminal_coding);
4ed46869
KH
3351 return Qnil;
3352}
3353
3354DEFUN ("terminal-coding-system",
3355 Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
3356 "Return coding-system of your terminal.")
3357 ()
3358{
3359 return terminal_coding.symbol;
3360}
3361
1ba9e4ab
KH
3362DEFUN ("set-keyboard-coding-system-internal",
3363 Fset_keyboard_coding_system_internal,
3364 Sset_keyboard_coding_system_internal, 1, 1, 0, "")
4ed46869
KH
3365 (coding_system)
3366 Lisp_Object coding_system;
3367{
3368 CHECK_SYMBOL (coding_system, 0);
3369 setup_coding_system (Fcheck_coding_system (coding_system), &keyboard_coding);
3370 return Qnil;
3371}
3372
3373DEFUN ("keyboard-coding-system",
3374 Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
3375 "Return coding-system of what is sent from terminal keyboard.")
3376 ()
3377{
3378 return keyboard_coding.symbol;
3379}
3380
3381\f
3382DEFUN ("find-coding-system", Ffind_coding_system, Sfind_coding_system,
3383 1, MANY, 0,
ccdb79f5
RS
3384 "Choose a coding system for a file operation based on file name.\n\
3385The value names a pair of coding systems: (ENCODING-SYSTEM DECODING-SYSTEM).\n\
3386ENCODING-SYSTEM is the coding system to use for encoding\n\
3387\(in case OPERATION does encoding), and DECODING-SYSTEM is the coding system\n\
3388for decoding (in case OPERATION does decoding).\n\
3389\n\
3390The first argument OPERATION specifies an I/O primitive:\n\
3391 For file I/O, `insert-file-contents' or `write-region'.\n\
3392 For process I/O, `call-process', `call-process-region', or `start-process'.\n\
3393 For network I/O, `open-network-stream'.\n\
3394\n\
3395The remaining arguments should be the same arguments that were passed\n\
3396to the primitive. Depending on which primitive, one of those arguments\n\
3397is selected as the TARGET. For example, if OPERATION does file I/O,\n\
3398whichever argument specifies the file name is TARGET.\n\
3399\n\
3400TARGET has a meaning which depends on OPERATION:\n\
4ed46869
KH
3401 For file I/O, TARGET is a file name.\n\
3402 For process I/O, TARGET is a process name.\n\
3403 For network I/O, TARGET is a service name or a port number\n\
3404\n\
02ba4723
KH
3405This function looks up what specified for TARGET in,\n\
3406`file-coding-system-alist', `process-coding-system-alist',\n\
3407or `network-coding-system-alist' depending on OPERATION.\n\
3408They may specify a coding system, a cons of coding systems,\n\
3409or a function symbol to call.\n\
3410In the last case, we call the function with one argument,\n\
ccdb79f5 3411which is a list of all the arguments given to `find-coding-system'.")
4ed46869
KH
3412 (nargs, args)
3413 int nargs;
3414 Lisp_Object *args;
3415{
3416 Lisp_Object operation, target_idx, target, val;
3417 register Lisp_Object chain;
3418
3419 if (nargs < 2)
3420 error ("Too few arguments");
3421 operation = args[0];
3422 if (!SYMBOLP (operation)
3423 || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
3424 error ("Invalid first arguement");
3425 if (nargs < 1 + XINT (target_idx))
3426 error ("Too few arguments for operation: %s",
3427 XSYMBOL (operation)->name->data);
3428 target = args[XINT (target_idx) + 1];
3429 if (!(STRINGP (target)
3430 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
3431 error ("Invalid %dth argument", XINT (target_idx) + 1);
3432
02ba4723
KH
3433 chain = (operation == Qinsert_file_contents || operation == Qwrite_region
3434 ? Vfile_coding_system_alist
3435 : (operation == Qopen_network_stream
3436 ? Vnetwork_coding_system_alist
3437 : Vprocess_coding_system_alist));
4ed46869
KH
3438 if (NILP (chain))
3439 return Qnil;
3440
02ba4723 3441 for (; CONSP (chain); chain = XCONS (chain)->cdr)
4ed46869
KH
3442 {
3443 Lisp_Object elt = XCONS (chain)->car;
3444
3445 if (CONSP (elt)
3446 && ((STRINGP (target)
3447 && STRINGP (XCONS (elt)->car)
3448 && fast_string_match (XCONS (elt)->car, target) >= 0)
3449 || (INTEGERP (target) && EQ (target, XCONS (elt)->car))))
02ba4723
KH
3450 {
3451 val = XCONS (elt)->cdr;
3452 if (CONSP (val))
3453 return val;
3454 if (! SYMBOLP (val))
3455 return Qnil;
3456 if (! NILP (Fcoding_system_p (val)))
3457 return Fcons (val, val);
3458 if (!NILP (Fboundp (val)))
3459 return call2 (val, Flist (nargs, args));
3460 return Qnil;
3461 }
4ed46869
KH
3462 }
3463 return Qnil;
3464}
3465
3466#endif /* emacs */
3467
3468\f
3469/*** 8. Post-amble ***/
3470
3471init_coding_once ()
3472{
3473 int i;
3474
0ef69138 3475 /* Emacs' internal format specific initialize routine. */
4ed46869
KH
3476 for (i = 0; i <= 0x20; i++)
3477 emacs_code_class[i] = EMACS_control_code;
3478 emacs_code_class[0x0A] = EMACS_linefeed_code;
3479 emacs_code_class[0x0D] = EMACS_carriage_return_code;
3480 for (i = 0x21 ; i < 0x7F; i++)
3481 emacs_code_class[i] = EMACS_ascii_code;
3482 emacs_code_class[0x7F] = EMACS_control_code;
3483 emacs_code_class[0x80] = EMACS_leading_code_composition;
3484 for (i = 0x81; i < 0xFF; i++)
3485 emacs_code_class[i] = EMACS_invalid_code;
3486 emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
3487 emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
3488 emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
3489 emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;
3490
3491 /* ISO2022 specific initialize routine. */
3492 for (i = 0; i < 0x20; i++)
3493 iso_code_class[i] = ISO_control_code;
3494 for (i = 0x21; i < 0x7F; i++)
3495 iso_code_class[i] = ISO_graphic_plane_0;
3496 for (i = 0x80; i < 0xA0; i++)
3497 iso_code_class[i] = ISO_control_code;
3498 for (i = 0xA1; i < 0xFF; i++)
3499 iso_code_class[i] = ISO_graphic_plane_1;
3500 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
3501 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
3502 iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
3503 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
3504 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
3505 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
3506 iso_code_class[ISO_CODE_ESC] = ISO_escape;
3507 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
3508 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
3509 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
3510
e0e989f6
KH
3511 conversion_buffer_size = MINIMUM_CONVERSION_BUFFER_SIZE;
3512 conversion_buffer = (char *) xmalloc (MINIMUM_CONVERSION_BUFFER_SIZE);
3513
3514 setup_coding_system (Qnil, &keyboard_coding);
3515 setup_coding_system (Qnil, &terminal_coding);
3516}
3517
3518#ifdef emacs
3519
3520syms_of_coding ()
3521{
3522 Qtarget_idx = intern ("target-idx");
3523 staticpro (&Qtarget_idx);
3524
3525 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
3526 Fput (Qwrite_region, Qtarget_idx, make_number (2));
3527
3528 Qcall_process = intern ("call-process");
3529 staticpro (&Qcall_process);
3530 Fput (Qcall_process, Qtarget_idx, make_number (0));
3531
3532 Qcall_process_region = intern ("call-process-region");
3533 staticpro (&Qcall_process_region);
3534 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
3535
3536 Qstart_process = intern ("start-process");
3537 staticpro (&Qstart_process);
3538 Fput (Qstart_process, Qtarget_idx, make_number (2));
3539
3540 Qopen_network_stream = intern ("open-network-stream");
3541 staticpro (&Qopen_network_stream);
3542 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
3543
4ed46869
KH
3544 Qcoding_system = intern ("coding-system");
3545 staticpro (&Qcoding_system);
3546
3547 Qeol_type = intern ("eol-type");
3548 staticpro (&Qeol_type);
3549
3550 Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
3551 staticpro (&Qbuffer_file_coding_system);
3552
3553 Qpost_read_conversion = intern ("post-read-conversion");
3554 staticpro (&Qpost_read_conversion);
3555
3556 Qpre_write_conversion = intern ("pre-write-conversion");
3557 staticpro (&Qpre_write_conversion);
3558
02ba4723
KH
3559 Qcoding_system_spec = intern ("coding-system-spec");
3560 staticpro (&Qcoding_system_spec);
4ed46869
KH
3561
3562 Qcoding_system_p = intern ("coding-system-p");
3563 staticpro (&Qcoding_system_p);
3564
3565 Qcoding_system_error = intern ("coding-system-error");
3566 staticpro (&Qcoding_system_error);
3567
3568 Fput (Qcoding_system_error, Qerror_conditions,
3569 Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
3570 Fput (Qcoding_system_error, Qerror_message,
3571 build_string ("Coding-system error"));
3572
3573 Qcoding_category_index = intern ("coding-category-index");
3574 staticpro (&Qcoding_category_index);
3575
3576 {
3577 int i;
3578 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3579 {
3580 coding_category_table[i] = intern (coding_category_name[i]);
3581 staticpro (&coding_category_table[i]);
3582 Fput (coding_category_table[i], Qcoding_category_index,
3583 make_number (i));
3584 }
3585 }
3586
bdd9fb48
KH
3587 Qcharacter_unification_table = intern ("character-unification-table");
3588 staticpro (&Qcharacter_unification_table);
3589 Fput (Qcharacter_unification_table, Qchar_table_extra_slots,
3590 make_number (0));
3591
02ba4723 3592 defsubr (&Scoding_system_spec);
4ed46869
KH
3593 defsubr (&Scoding_system_p);
3594 defsubr (&Sread_coding_system);
3595 defsubr (&Sread_non_nil_coding_system);
3596 defsubr (&Scheck_coding_system);
3597 defsubr (&Sdetect_coding_region);
3598 defsubr (&Sdecode_coding_region);
3599 defsubr (&Sencode_coding_region);
3600 defsubr (&Sdecode_coding_string);
3601 defsubr (&Sencode_coding_string);
3602 defsubr (&Sdecode_sjis_char);
3603 defsubr (&Sencode_sjis_char);
3604 defsubr (&Sdecode_big5_char);
3605 defsubr (&Sencode_big5_char);
1ba9e4ab 3606 defsubr (&Sset_terminal_coding_system_internal);
4ed46869 3607 defsubr (&Sterminal_coding_system);
1ba9e4ab 3608 defsubr (&Sset_keyboard_coding_system_internal);
4ed46869
KH
3609 defsubr (&Skeyboard_coding_system);
3610 defsubr (&Sfind_coding_system);
3611
3612 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
3613 "List of coding-categories (symbols) ordered by priority.");
3614 {
3615 int i;
3616
3617 Vcoding_category_list = Qnil;
3618 for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
3619 Vcoding_category_list
3620 = Fcons (coding_category_table[i], Vcoding_category_list);
3621 }
3622
3623 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
3624 "A variable of internal use only.\n\
3625If the value is a coding system, it is used for decoding on read operation.\n\
3626If not, an appropriate element in `coding-system-alist' (which see) is used.");
3627 Vcoding_system_for_read = Qnil;
3628
3629 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
3630 "A variable of internal use only.\n\
3631If the value is a coding system, it is used for encoding on write operation.\n\
3632If not, an appropriate element in `coding-system-alist' (which see) is used.");
3633 Vcoding_system_for_write = Qnil;
3634
3635 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
3636 "Coding-system used in the latest file or process I/O.");
3637 Vlast_coding_system_used = Qnil;
3638
02ba4723
KH
3639 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
3640 "Alist to decide a coding system to use for a file I/O operation.\n\
3641The format is ((PATTERN . VAL) ...),\n\
3642where PATTERN is a regular expression matching a file name,\n\
3643VAL is a coding system, a cons of coding systems, or a function symbol.\n\
3644If VAL is a coding system, it is used for both decoding and encoding\n\
3645the file contents.\n\
3646If VAL is a cons of coding systems, the car part is used for decoding,\n\
3647and the cdr part is used for encoding.\n\
3648If VAL is a function symbol, the function must return a coding system\n\
3649or a cons of coding systems which are used as above.\n\
e0e989f6 3650\n\
02ba4723
KH
3651See also the function `find-coding-system'.");
3652 Vfile_coding_system_alist = Qnil;
3653
3654 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
3655 "Alist to decide a coding system to use for a process I/O operation.\n\
3656The format is ((PATTERN . VAL) ...),\n\
3657where PATTERN is a regular expression matching a program name,\n\
3658VAL is a coding system, a cons of coding systems, or a function symbol.\n\
3659If VAL is a coding system, it is used for both decoding what received\n\
3660from the program and encoding what sent to the program.\n\
3661If VAL is a cons of coding systems, the car part is used for decoding,\n\
3662and the cdr part is used for encoding.\n\
3663If VAL is a function symbol, the function must return a coding system\n\
3664or a cons of coding systems which are used as above.\n\
4ed46869 3665\n\
02ba4723
KH
3666See also the function `find-coding-system'.");
3667 Vprocess_coding_system_alist = Qnil;
3668
3669 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
3670 "Alist to decide a coding system to use for a network I/O operation.\n\
3671The format is ((PATTERN . VAL) ...),\n\
3672where PATTERN is a regular expression matching a network service name\n\
3673or is a port number to connect to,\n\
3674VAL is a coding system, a cons of coding systems, or a function symbol.\n\
3675If VAL is a coding system, it is used for both decoding what received\n\
3676from the network stream and encoding what sent to the network stream.\n\
3677If VAL is a cons of coding systems, the car part is used for decoding,\n\
3678and the cdr part is used for encoding.\n\
3679If VAL is a function symbol, the function must return a coding system\n\
3680or a cons of coding systems which are used as above.\n\
4ed46869 3681\n\
02ba4723
KH
3682See also the function `find-coding-system'.");
3683 Vnetwork_coding_system_alist = Qnil;
4ed46869
KH
3684
3685 DEFVAR_INT ("eol-mnemonic-unix", &eol_mnemonic_unix,
3686 "Mnemonic character indicating UNIX-like end-of-line format (i.e. LF) .");
3687 eol_mnemonic_unix = '.';
3688
3689 DEFVAR_INT ("eol-mnemonic-dos", &eol_mnemonic_dos,
3690 "Mnemonic character indicating DOS-like end-of-line format (i.e. CRLF).");
3691 eol_mnemonic_dos = ':';
3692
3693 DEFVAR_INT ("eol-mnemonic-mac", &eol_mnemonic_mac,
3694 "Mnemonic character indicating MAC-like end-of-line format (i.e. CR).");
3695 eol_mnemonic_mac = '\'';
3696
3697 DEFVAR_INT ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
3698 "Mnemonic character indicating end-of-line format is not yet decided.");
3699 eol_mnemonic_undecided = '-';
3700
bdd9fb48
KH
3701 DEFVAR_LISP ("enable-character-unification", &Venable_character_unification,
3702 "Non-nil means ISO 2022 encoder/decoder do character unification.");
3703 Venable_character_unification = Qt;
3704
3705 DEFVAR_LISP ("standard-character-unification-table-for-read",
3706 &Vstandard_character_unification_table_for_read,
3707 "Table for unifying characters when reading.");
3708 Vstandard_character_unification_table_for_read = Qnil;
3709
3710 DEFVAR_LISP ("standard-character-unification-table-for-write",
3711 &Vstandard_character_unification_table_for_write,
3712 "Table for unifying characters when writing.");
3713 Vstandard_character_unification_table_for_write = Qnil;
4ed46869
KH
3714
3715 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
3716 "Alist of charsets vs revision numbers.\n\
3717While encoding, if a charset (car part of an element) is found,\n\
3718designate it with the escape sequence identifing revision (cdr part of the element).");
3719 Vcharset_revision_alist = Qnil;
02ba4723
KH
3720
3721 DEFVAR_LISP ("default-process-coding-system",
3722 &Vdefault_process_coding_system,
3723 "Cons of coding systems used for process I/O by default.\n\
3724The car part is used for decoding a process output,\n\
3725the cdr part is used for encoding a text to be sent to a process.");
3726 Vdefault_process_coding_system = Qnil;
4ed46869
KH
3727}
3728
3729#endif /* emacs */