(setup_coding_system, Ffind_operation_coding_system)
[bpt/emacs.git] / src / coding.c
1 /* Coding system handler (conversion, detection, and etc).
2 Copyright (C) 1995, 1997 Electrotechnical Laboratory, JAPAN.
3 Licensed to the Free Software Foundation.
4
5 This file is part of GNU Emacs.
6
7 GNU Emacs is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 2, or (at your option)
10 any later version.
11
12 GNU Emacs is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GNU Emacs; see the file COPYING. If not, write to
19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20 Boston, MA 02111-1307, USA. */
21
22 /*** TABLE OF CONTENTS ***
23
24 1. Preamble
25 2. Emacs' internal format (emacs-mule) handlers
26 3. ISO2022 handlers
27 4. Shift-JIS and BIG5 handlers
28 5. End-of-line handlers
29 6. C library functions
30 7. Emacs Lisp library functions
31 8. Post-amble
32
33 */
34
35 /*** GENERAL NOTE on CODING SYSTEM ***
36
37 Coding system is an encoding mechanism of one or more character
38 sets. Here's a list of coding systems which Emacs can handle. When
39 we say "decode", it means converting some other coding system to
40 Emacs' internal format (emacs-internal), and when we say "encode",
41 it means converting the coding system emacs-mule to some other
42 coding system.
43
44 0. Emacs' internal format (emacs-mule)
45
46 Emacs itself holds a multi-lingual character in a buffer and a string
47 in a special format. Details are described in section 2.
48
49 1. ISO2022
50
51 The most famous coding system for multiple character sets. X's
52 Compound Text, various EUCs (Extended Unix Code), and coding
53 systems used in Internet communication such as ISO-2022-JP are
54 all variants of ISO2022. Details are described in section 3.
55
56 2. SJIS (or Shift-JIS or MS-Kanji-Code)
57
58 A coding system to encode character sets: ASCII, JISX0201, and
59 JISX0208. Widely used for PC's in Japan. Details are described in
60 section 4.
61
62 3. BIG5
63
64 A coding system to encode character sets: ASCII and Big5. Widely
65 used by Chinese (mainly in Taiwan and Hong Kong). Details are
66 described in section 4. In this file, when we write "BIG5"
67 (all uppercase), we mean the coding system, and when we write
68 "Big5" (capitalized), we mean the character set.
69
70 4. Raw text
71
72 A coding system to for a text containing random 8-bit code. Emacs
73 does no code conversion on such a text except for end-of-line
74 format.
75
76 5. Other
77
78 If a user wants to read/write a text encoded in a coding system not
79 listed above, he can supply a decoder and an encoder for it in CCL
80 (Code Conversion Language) programs. Emacs executes the CCL program
81 while reading/writing.
82
83 Emacs represents a coding-system by a Lisp symbol that has a property
84 `coding-system'. But, before actually using the coding-system, the
85 information about it is set in a structure of type `struct
86 coding_system' for rapid processing. See section 6 for more details.
87
88 */
89
90 /*** GENERAL NOTES on END-OF-LINE FORMAT ***
91
92 How end-of-line of a text is encoded depends on a system. For
93 instance, Unix's format is just one byte of `line-feed' code,
94 whereas DOS's format is two-byte sequence of `carriage-return' and
95 `line-feed' codes. MacOS's format is one byte of `carriage-return'.
96
97 Since text characters encoding and end-of-line encoding are
98 independent, any coding system described above can take
99 any format of end-of-line. So, Emacs has information of format of
100 end-of-line in each coding-system. See section 6 for more details.
101
102 */
103
104 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
105
106 These functions check if a text between SRC and SRC_END is encoded
107 in the coding system category XXX. Each returns an integer value in
108 which appropriate flag bits for the category XXX is set. The flag
109 bits are defined in macros CODING_CATEGORY_MASK_XXX. Below is the
110 template of these functions. */
111 #if 0
112 int
113 detect_coding_emacs_mule (src, src_end)
114 unsigned char *src, *src_end;
115 {
116 ...
117 }
118 #endif
119
120 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
121
122 These functions decode SRC_BYTES length text at SOURCE encoded in
123 CODING to Emacs' internal format (emacs-mule). The resulting text
124 goes to a place pointed to by DESTINATION, the length of which should
125 not exceed DST_BYTES. The number of bytes actually processed is
126 returned as *CONSUMED. The return value is the length of the decoded
127 text. Below is a template of these functions. */
128 #if 0
129 decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes, consumed)
130 struct coding_system *coding;
131 unsigned char *source, *destination;
132 int src_bytes, dst_bytes;
133 int *consumed;
134 {
135 ...
136 }
137 #endif
138
139 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
140
141 These functions encode SRC_BYTES length text at SOURCE of Emacs'
142 internal format (emacs-mule) to CODING. The resulting text goes to
143 a place pointed to by DESTINATION, the length of which should not
144 exceed DST_BYTES. The number of bytes actually processed is
145 returned as *CONSUMED. The return value is the length of the
146 encoded text. Below is a template of these functions. */
147 #if 0
148 encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes, consumed)
149 struct coding_system *coding;
150 unsigned char *source, *destination;
151 int src_bytes, dst_bytes;
152 int *consumed;
153 {
154 ...
155 }
156 #endif
157
158 /*** COMMONLY USED MACROS ***/
159
160 /* The following three macros ONE_MORE_BYTE, TWO_MORE_BYTES, and
161 THREE_MORE_BYTES safely get one, two, and three bytes from the
162 source text respectively. If there are not enough bytes in the
163 source, they jump to `label_end_of_loop'. The caller should set
164 variables `src' and `src_end' to appropriate areas in advance. */
165
166 #define ONE_MORE_BYTE(c1) \
167 do { \
168 if (src < src_end) \
169 c1 = *src++; \
170 else \
171 goto label_end_of_loop; \
172 } while (0)
173
174 #define TWO_MORE_BYTES(c1, c2) \
175 do { \
176 if (src + 1 < src_end) \
177 c1 = *src++, c2 = *src++; \
178 else \
179 goto label_end_of_loop; \
180 } while (0)
181
182 #define THREE_MORE_BYTES(c1, c2, c3) \
183 do { \
184 if (src + 2 < src_end) \
185 c1 = *src++, c2 = *src++, c3 = *src++; \
186 else \
187 goto label_end_of_loop; \
188 } while (0)
189
190 /* The following three macros DECODE_CHARACTER_ASCII,
191 DECODE_CHARACTER_DIMENSION1, and DECODE_CHARACTER_DIMENSION2 put
192 the multi-byte form of a character of each class at the place
193 pointed by `dst'. The caller should set the variable `dst' to
194 point to an appropriate area and the variable `coding' to point to
195 the coding-system of the currently decoding text in advance. */
196
197 /* Decode one ASCII character C. */
198
199 #define DECODE_CHARACTER_ASCII(c) \
200 do { \
201 if (COMPOSING_P (coding->composing)) \
202 *dst++ = 0xA0, *dst++ = (c) | 0x80; \
203 else \
204 *dst++ = (c); \
205 } while (0)
206
207 /* Decode one DIMENSION1 character whose charset is CHARSET and whose
208 position-code is C. */
209
210 #define DECODE_CHARACTER_DIMENSION1(charset, c) \
211 do { \
212 unsigned char leading_code = CHARSET_LEADING_CODE_BASE (charset); \
213 if (COMPOSING_P (coding->composing)) \
214 *dst++ = leading_code + 0x20; \
215 else \
216 *dst++ = leading_code; \
217 if (leading_code = CHARSET_LEADING_CODE_EXT (charset)) \
218 *dst++ = leading_code; \
219 *dst++ = (c) | 0x80; \
220 } while (0)
221
222 /* Decode one DIMENSION2 character whose charset is CHARSET and whose
223 position-codes are C1 and C2. */
224
225 #define DECODE_CHARACTER_DIMENSION2(charset, c1, c2) \
226 do { \
227 DECODE_CHARACTER_DIMENSION1 (charset, c1); \
228 *dst++ = (c2) | 0x80; \
229 } while (0)
230
231 \f
232 /*** 1. Preamble ***/
233
234 #include <stdio.h>
235
236 #ifdef emacs
237
238 #include <config.h>
239 #include "lisp.h"
240 #include "buffer.h"
241 #include "charset.h"
242 #include "ccl.h"
243 #include "coding.h"
244 #include "window.h"
245
246 #else /* not emacs */
247
248 #include "mulelib.h"
249
250 #endif /* not emacs */
251
252 Lisp_Object Qcoding_system, Qeol_type;
253 Lisp_Object Qbuffer_file_coding_system;
254 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
255 Lisp_Object Qno_conversion, Qundecided;
256
257 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
258 Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
259 Lisp_Object Qstart_process, Qopen_network_stream;
260 Lisp_Object Qtarget_idx;
261
262 /* Mnemonic character of each format of end-of-line. */
263 int eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
264 /* Mnemonic character to indicate format of end-of-line is not yet
265 decided. */
266 int eol_mnemonic_undecided;
267
268 /* Format of end-of-line decided by system. This is CODING_EOL_LF on
269 Unix, CODING_EOL_CRLF on DOS/Windows, and CODING_EOL_CR on Mac. */
270 int system_eol_type;
271
272 #ifdef emacs
273
274 Lisp_Object Qcoding_system_spec, Qcoding_system_p, Qcoding_system_error;
275
276 /* Coding system emacs-mule is for converting only end-of-line format. */
277 Lisp_Object Qemacs_mule;
278
279 /* Coding-systems are handed between Emacs Lisp programs and C internal
280 routines by the following three variables. */
281 /* Coding-system for reading files and receiving data from process. */
282 Lisp_Object Vcoding_system_for_read;
283 /* Coding-system for writing files and sending data to process. */
284 Lisp_Object Vcoding_system_for_write;
285 /* Coding-system actually used in the latest I/O. */
286 Lisp_Object Vlast_coding_system_used;
287
288 /* A vector of length 256 which contains information about special
289 Latin codes (espepcially for dealing with Microsoft code). */
290 Lisp_Object Vlatin_extra_code_table;
291
292 /* Flag to inhibit code conversion of end-of-line format. */
293 int inhibit_eol_conversion;
294
295 /* Coding system to be used to encode text for terminal display. */
296 struct coding_system terminal_coding;
297
298 /* Coding system to be used to encode text for terminal display when
299 terminal coding system is nil. */
300 struct coding_system safe_terminal_coding;
301
302 /* Coding system of what is sent from terminal keyboard. */
303 struct coding_system keyboard_coding;
304
305 Lisp_Object Vfile_coding_system_alist;
306 Lisp_Object Vprocess_coding_system_alist;
307 Lisp_Object Vnetwork_coding_system_alist;
308
309 #endif /* emacs */
310
311 Lisp_Object Qcoding_category_index;
312
313 /* List of symbols `coding-category-xxx' ordered by priority. */
314 Lisp_Object Vcoding_category_list;
315
316 /* Table of coding-systems currently assigned to each coding-category. */
317 Lisp_Object coding_category_table[CODING_CATEGORY_IDX_MAX];
318
319 /* Table of names of symbol for each coding-category. */
320 char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
321 "coding-category-emacs-mule",
322 "coding-category-sjis",
323 "coding-category-iso-7",
324 "coding-category-iso-8-1",
325 "coding-category-iso-8-2",
326 "coding-category-iso-7-else",
327 "coding-category-iso-8-else",
328 "coding-category-big5",
329 "coding-category-raw-text",
330 "coding-category-binary"
331 };
332
333 /* Flag to tell if we look up unification table on character code
334 conversion. */
335 Lisp_Object Venable_character_unification;
336 /* Standard unification table to look up on decoding (reading). */
337 Lisp_Object Vstandard_character_unification_table_for_decode;
338 /* Standard unification table to look up on encoding (writing). */
339 Lisp_Object Vstandard_character_unification_table_for_encode;
340
341 Lisp_Object Qcharacter_unification_table;
342 Lisp_Object Qcharacter_unification_table_for_decode;
343 Lisp_Object Qcharacter_unification_table_for_encode;
344
345 /* Alist of charsets vs revision number. */
346 Lisp_Object Vcharset_revision_alist;
347
348 /* Default coding systems used for process I/O. */
349 Lisp_Object Vdefault_process_coding_system;
350
351 \f
352 /*** 2. Emacs internal format (emacs-mule) handlers ***/
353
354 /* Emacs' internal format for encoding multiple character sets is a
355 kind of multi-byte encoding, i.e. characters are encoded by
356 variable-length sequences of one-byte codes. ASCII characters
357 and control characters (e.g. `tab', `newline') are represented by
358 one-byte sequences which are their ASCII codes, in the range 0x00
359 through 0x7F. The other characters are represented by a sequence
360 of `base leading-code', optional `extended leading-code', and one
361 or two `position-code's. The length of the sequence is determined
362 by the base leading-code. Leading-code takes the range 0x80
363 through 0x9F, whereas extended leading-code and position-code take
364 the range 0xA0 through 0xFF. See `charset.h' for more details
365 about leading-code and position-code.
366
367 There's one exception to this rule. Special leading-code
368 `leading-code-composition' denotes that the following several
369 characters should be composed into one character. Leading-codes of
370 components (except for ASCII) are added 0x20. An ASCII character
371 component is represented by a 2-byte sequence of `0xA0' and
372 `ASCII-code + 0x80'. See also the comments in `charset.h' for the
373 details of composite character. Hence, we can summarize the code
374 range as follows:
375
376 --- CODE RANGE of Emacs' internal format ---
377 (character set) (range)
378 ASCII 0x00 .. 0x7F
379 ELSE (1st byte) 0x80 .. 0x9F
380 (rest bytes) 0xA0 .. 0xFF
381 ---------------------------------------------
382
383 */
384
385 enum emacs_code_class_type emacs_code_class[256];
386
387 /* Go to the next statement only if *SRC is accessible and the code is
388 greater than 0xA0. */
389 #define CHECK_CODE_RANGE_A0_FF \
390 do { \
391 if (src >= src_end) \
392 goto label_end_of_switch; \
393 else if (*src++ < 0xA0) \
394 return 0; \
395 } while (0)
396
397 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
398 Check if a text is encoded in Emacs' internal format. If it is,
399 return CODING_CATEGORY_MASK_EMASC_MULE, else return 0. */
400
401 int
402 detect_coding_emacs_mule (src, src_end)
403 unsigned char *src, *src_end;
404 {
405 unsigned char c;
406 int composing = 0;
407
408 while (src < src_end)
409 {
410 c = *src++;
411
412 if (composing)
413 {
414 if (c < 0xA0)
415 composing = 0;
416 else
417 c -= 0x20;
418 }
419
420 switch (emacs_code_class[c])
421 {
422 case EMACS_ascii_code:
423 case EMACS_linefeed_code:
424 break;
425
426 case EMACS_control_code:
427 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
428 return 0;
429 break;
430
431 case EMACS_invalid_code:
432 return 0;
433
434 case EMACS_leading_code_composition: /* c == 0x80 */
435 if (composing)
436 CHECK_CODE_RANGE_A0_FF;
437 else
438 composing = 1;
439 break;
440
441 case EMACS_leading_code_4:
442 CHECK_CODE_RANGE_A0_FF;
443 /* fall down to check it two more times ... */
444
445 case EMACS_leading_code_3:
446 CHECK_CODE_RANGE_A0_FF;
447 /* fall down to check it one more time ... */
448
449 case EMACS_leading_code_2:
450 CHECK_CODE_RANGE_A0_FF;
451 break;
452
453 default:
454 label_end_of_switch:
455 break;
456 }
457 }
458 return CODING_CATEGORY_MASK_EMACS_MULE;
459 }
460
461 \f
462 /*** 3. ISO2022 handlers ***/
463
464 /* The following note describes the coding system ISO2022 briefly.
465 Since the intention of this note is to help in understanding of
466 the programs in this file, some parts are NOT ACCURATE or OVERLY
467 SIMPLIFIED. For the thorough understanding, please refer to the
468 original document of ISO2022.
469
470 ISO2022 provides many mechanisms to encode several character sets
471 in 7-bit and 8-bit environment. If one chooses 7-bite environment,
472 all text is encoded by codes of less than 128. This may make the
473 encoded text a little bit longer, but the text gets more stability
474 to pass through several gateways (some of them strip off the MSB).
475
476 There are two kinds of character set: control character set and
477 graphic character set. The former contains control characters such
478 as `newline' and `escape' to provide control functions (control
479 functions are provided also by escape sequences). The latter
480 contains graphic characters such as ' A' and '-'. Emacs recognizes
481 two control character sets and many graphic character sets.
482
483 Graphic character sets are classified into one of the following
484 four classes, DIMENSION1_CHARS94, DIMENSION1_CHARS96,
485 DIMENSION2_CHARS94, DIMENSION2_CHARS96 according to the number of
486 bytes (DIMENSION) and the number of characters in one dimension
487 (CHARS) of the set. In addition, each character set is assigned an
488 identification tag (called "final character" and denoted as <F>
489 here after) which is unique in each class. <F> of each character
490 set is decided by ECMA(*) when it is registered in ISO. Code range
491 of <F> is 0x30..0x7F (0x30..0x3F are for private use only).
492
493 Note (*): ECMA = European Computer Manufacturers Association
494
495 Here are examples of graphic character set [NAME(<F>)]:
496 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
497 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
498 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
499 o DIMENSION2_CHARS96 -- none for the moment
500
501 A code area (1byte=8bits) is divided into 4 areas, C0, GL, C1, and GR.
502 C0 [0x00..0x1F] -- control character plane 0
503 GL [0x20..0x7F] -- graphic character plane 0
504 C1 [0x80..0x9F] -- control character plane 1
505 GR [0xA0..0xFF] -- graphic character plane 1
506
507 A control character set is directly designated and invoked to C0 or
508 C1 by an escape sequence. The most common case is that ISO646's
509 control character set is designated/invoked to C0 and ISO6429's
510 control character set is designated/invoked to C1, and usually
511 these designations/invocations are omitted in a coded text. With
512 7-bit environment, only C0 can be used, and a control character for
513 C1 is encoded by an appropriate escape sequence to fit in the
514 environment. All control characters for C1 are defined the
515 corresponding escape sequences.
516
517 A graphic character set is at first designated to one of four
518 graphic registers (G0 through G3), then these graphic registers are
519 invoked to GL or GR. These designations and invocations can be
520 done independently. The most common case is that G0 is invoked to
521 GL, G1 is invoked to GR, and ASCII is designated to G0, and usually
522 these invocations and designations are omitted in a coded text.
523 With 7-bit environment, only GL can be used.
524
525 When a graphic character set of CHARS94 is invoked to GL, code 0x20
526 and 0x7F of GL area work as control characters SPACE and DEL
527 respectively, and code 0xA0 and 0xFF of GR area should not be used.
528
529 There are two ways of invocation: locking-shift and single-shift.
530 With locking-shift, the invocation lasts until the next different
531 invocation, whereas with single-shift, the invocation works only
532 for the following character and doesn't affect locking-shift.
533 Invocations are done by the following control characters or escape
534 sequences.
535
536 ----------------------------------------------------------------------
537 function control char escape sequence description
538 ----------------------------------------------------------------------
539 SI (shift-in) 0x0F none invoke G0 to GL
540 SO (shift-out) 0x0E none invoke G1 to GL
541 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
542 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
543 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 into GL
544 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 into GL
545 ----------------------------------------------------------------------
546 The first four are for locking-shift. Control characters for these
547 functions are defined by macros ISO_CODE_XXX in `coding.h'.
548
549 Designations are done by the following escape sequences.
550 ----------------------------------------------------------------------
551 escape sequence description
552 ----------------------------------------------------------------------
553 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
554 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
555 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
556 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
557 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
558 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
559 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
560 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
561 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
562 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
563 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
564 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
565 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
566 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
567 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
568 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
569 ----------------------------------------------------------------------
570
571 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
572 of dimension 1, chars 94, and final character <F>, and etc.
573
574 Note (*): Although these designations are not allowed in ISO2022,
575 Emacs accepts them on decoding, and produces them on encoding
576 CHARS96 character set in a coding system which is characterized as
577 7-bit environment, non-locking-shift, and non-single-shift.
578
579 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
580 '(' can be omitted. We call this as "short-form" here after.
581
582 Now you may notice that there are a lot of ways for encoding the
583 same multilingual text in ISO2022. Actually, there exists many
584 coding systems such as Compound Text (used in X's inter client
585 communication, ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
586 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
587 localized platforms), and all of these are variants of ISO2022.
588
589 In addition to the above, Emacs handles two more kinds of escape
590 sequences: ISO6429's direction specification and Emacs' private
591 sequence for specifying character composition.
592
593 ISO6429's direction specification takes the following format:
594 o CSI ']' -- end of the current direction
595 o CSI '0' ']' -- end of the current direction
596 o CSI '1' ']' -- start of left-to-right text
597 o CSI '2' ']' -- start of right-to-left text
598 The control character CSI (0x9B: control sequence introducer) is
599 abbreviated to the escape sequence ESC '[' in 7-bit environment.
600
601 Character composition specification takes the following format:
602 o ESC '0' -- start character composition
603 o ESC '1' -- end character composition
604 Since these are not standard escape sequences of any ISO, the use
605 of them for these meaning is restricted to Emacs only. */
606
607 enum iso_code_class_type iso_code_class[256];
608
609 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
610 Check if a text is encoded in ISO2022. If it is, returns an
611 integer in which appropriate flag bits any of:
612 CODING_CATEGORY_MASK_ISO_7
613 CODING_CATEGORY_MASK_ISO_8_1
614 CODING_CATEGORY_MASK_ISO_8_2
615 CODING_CATEGORY_MASK_ISO_7_ELSE
616 CODING_CATEGORY_MASK_ISO_8_ELSE
617 are set. If a code which should never appear in ISO2022 is found,
618 returns 0. */
619
620 int
621 detect_coding_iso2022 (src, src_end)
622 unsigned char *src, *src_end;
623 {
624 int mask = (CODING_CATEGORY_MASK_ISO_7
625 | CODING_CATEGORY_MASK_ISO_8_1
626 | CODING_CATEGORY_MASK_ISO_8_2
627 | CODING_CATEGORY_MASK_ISO_7_ELSE
628 | CODING_CATEGORY_MASK_ISO_8_ELSE
629 );
630 int g1 = 0; /* 1 iff designating to G1. */
631 int c, i;
632 struct coding_system coding_iso_8_1, coding_iso_8_2;
633
634 /* Coding systems of these categories may accept latin extra codes. */
635 setup_coding_system
636 (XSYMBOL (coding_category_table[CODING_CATEGORY_IDX_ISO_8_1])->value,
637 &coding_iso_8_1);
638 setup_coding_system
639 (XSYMBOL (coding_category_table[CODING_CATEGORY_IDX_ISO_8_2])->value,
640 &coding_iso_8_2);
641
642 while (mask && src < src_end)
643 {
644 c = *src++;
645 switch (c)
646 {
647 case ISO_CODE_ESC:
648 if (src >= src_end)
649 break;
650 c = *src++;
651 if ((c >= '(' && c <= '/'))
652 {
653 /* Designation sequence for a charset of dimension 1. */
654 if (src >= src_end)
655 break;
656 c = *src++;
657 if (c < ' ' || c >= 0x80)
658 /* Invalid designation sequence. */
659 return 0;
660 }
661 else if (c == '$')
662 {
663 /* Designation sequence for a charset of dimension 2. */
664 if (src >= src_end)
665 break;
666 c = *src++;
667 if (c >= '@' && c <= 'B')
668 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
669 ;
670 else if (c >= '(' && c <= '/')
671 {
672 if (src >= src_end)
673 break;
674 c = *src++;
675 if (c < ' ' || c >= 0x80)
676 /* Invalid designation sequence. */
677 return 0;
678 }
679 else
680 /* Invalid designation sequence. */
681 return 0;
682 }
683 else if (c == 'N' || c == 'O' || c == 'n' || c == 'o')
684 /* Locking shift. */
685 mask &= (CODING_CATEGORY_MASK_ISO_7_ELSE
686 | CODING_CATEGORY_MASK_ISO_8_ELSE);
687 else if (c == '0' || c == '1' || c == '2')
688 /* Start/end composition. */
689 ;
690 else
691 /* Invalid escape sequence. */
692 return 0;
693 break;
694
695 case ISO_CODE_SO:
696 mask &= (CODING_CATEGORY_MASK_ISO_7_ELSE
697 | CODING_CATEGORY_MASK_ISO_8_ELSE);
698 break;
699
700 case ISO_CODE_CSI:
701 case ISO_CODE_SS2:
702 case ISO_CODE_SS3:
703 {
704 int newmask = CODING_CATEGORY_MASK_ISO_8_ELSE;
705
706 if (VECTORP (Vlatin_extra_code_table)
707 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
708 {
709 if (coding_iso_8_1.flags & CODING_FLAG_ISO_LATIN_EXTRA)
710 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
711 if (coding_iso_8_2.flags & CODING_FLAG_ISO_LATIN_EXTRA)
712 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
713 }
714 mask &= newmask;
715 }
716 break;
717
718 default:
719 if (c < 0x80)
720 break;
721 else if (c < 0xA0)
722 {
723 if (VECTORP (Vlatin_extra_code_table)
724 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
725 {
726 int newmask = 0;
727
728 if (coding_iso_8_1.flags & CODING_FLAG_ISO_LATIN_EXTRA)
729 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
730 if (coding_iso_8_2.flags & CODING_FLAG_ISO_LATIN_EXTRA)
731 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
732 mask &= newmask;
733 }
734 else
735 return 0;
736 }
737 else
738 {
739 unsigned char *src_begin = src;
740
741 mask &= ~(CODING_CATEGORY_MASK_ISO_7
742 | CODING_CATEGORY_MASK_ISO_7_ELSE);
743 while (src < src_end && *src >= 0xA0)
744 src++;
745 if ((src - src_begin - 1) & 1 && src < src_end)
746 mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
747 }
748 break;
749 }
750 }
751
752 return mask;
753 }
754
755 /* Decode a character of which charset is CHARSET and the 1st position
756 code is C1. If dimension of CHARSET is 2, the 2nd position code is
757 fetched from SRC and set to C2. If CHARSET is negative, it means
758 that we are decoding ill formed text, and what we can do is just to
759 read C1 as is. */
760
761 #define DECODE_ISO_CHARACTER(charset, c1) \
762 do { \
763 int c_alt, charset_alt = (charset); \
764 if (COMPOSING_HEAD_P (coding->composing)) \
765 { \
766 *dst++ = LEADING_CODE_COMPOSITION; \
767 if (COMPOSING_WITH_RULE_P (coding->composing)) \
768 /* To tell composition rules are embeded. */ \
769 *dst++ = 0xFF; \
770 coding->composing += 2; \
771 } \
772 if ((charset) >= 0) \
773 { \
774 if (CHARSET_DIMENSION (charset) == 2) \
775 ONE_MORE_BYTE (c2); \
776 if (!NILP (unification_table) \
777 && ((c_alt = unify_char (unification_table, \
778 -1, (charset), c1, c2)) >= 0)) \
779 SPLIT_CHAR (c_alt, charset_alt, c1, c2); \
780 } \
781 if (charset_alt == CHARSET_ASCII || charset_alt < 0) \
782 DECODE_CHARACTER_ASCII (c1); \
783 else if (CHARSET_DIMENSION (charset_alt) == 1) \
784 DECODE_CHARACTER_DIMENSION1 (charset_alt, c1); \
785 else \
786 DECODE_CHARACTER_DIMENSION2 (charset_alt, c1, c2); \
787 if (COMPOSING_WITH_RULE_P (coding->composing)) \
788 /* To tell a composition rule follows. */ \
789 coding->composing = COMPOSING_WITH_RULE_RULE; \
790 } while (0)
791
792 /* Set designation state into CODING. */
793 #define DECODE_DESIGNATION(reg, dimension, chars, final_char) \
794 do { \
795 int charset = ISO_CHARSET_TABLE (make_number (dimension), \
796 make_number (chars), \
797 make_number (final_char)); \
798 if (charset >= 0) \
799 { \
800 if (coding->direction == 1 \
801 && CHARSET_REVERSE_CHARSET (charset) >= 0) \
802 charset = CHARSET_REVERSE_CHARSET (charset); \
803 CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset; \
804 } \
805 } while (0)
806
807 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
808
809 int
810 decode_coding_iso2022 (coding, source, destination,
811 src_bytes, dst_bytes, consumed)
812 struct coding_system *coding;
813 unsigned char *source, *destination;
814 int src_bytes, dst_bytes;
815 int *consumed;
816 {
817 unsigned char *src = source;
818 unsigned char *src_end = source + src_bytes;
819 unsigned char *dst = destination;
820 unsigned char *dst_end = destination + dst_bytes;
821 /* Since the maximum bytes produced by each loop is 7, we subtract 6
822 from DST_END to assure that overflow checking is necessary only
823 at the head of loop. */
824 unsigned char *adjusted_dst_end = dst_end - 6;
825 int charset;
826 /* Charsets invoked to graphic plane 0 and 1 respectively. */
827 int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
828 int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
829 Lisp_Object unification_table
830 = coding->character_unification_table_for_decode;
831
832 if (!NILP (Venable_character_unification) && NILP (unification_table))
833 unification_table = Vstandard_character_unification_table_for_decode;
834
835 while (src < src_end && dst < adjusted_dst_end)
836 {
837 /* SRC_BASE remembers the start position in source in each loop.
838 The loop will be exited when there's not enough source text
839 to analyze long escape sequence or 2-byte code (within macros
840 ONE_MORE_BYTE or TWO_MORE_BYTES). In that case, SRC is reset
841 to SRC_BASE before exiting. */
842 unsigned char *src_base = src;
843 int c1 = *src++, c2;
844
845 switch (iso_code_class [c1])
846 {
847 case ISO_0x20_or_0x7F:
848 if (!coding->composing
849 && (charset0 < 0 || CHARSET_CHARS (charset0) == 94))
850 {
851 /* This is SPACE or DEL. */
852 *dst++ = c1;
853 break;
854 }
855 /* This is a graphic character, we fall down ... */
856
857 case ISO_graphic_plane_0:
858 if (coding->composing == COMPOSING_WITH_RULE_RULE)
859 {
860 /* This is a composition rule. */
861 *dst++ = c1 | 0x80;
862 coding->composing = COMPOSING_WITH_RULE_TAIL;
863 }
864 else
865 DECODE_ISO_CHARACTER (charset0, c1);
866 break;
867
868 case ISO_0xA0_or_0xFF:
869 if (charset1 < 0 || CHARSET_CHARS (charset1) == 94)
870 {
871 /* Invalid code. */
872 *dst++ = c1;
873 break;
874 }
875 /* This is a graphic character, we fall down ... */
876
877 case ISO_graphic_plane_1:
878 DECODE_ISO_CHARACTER (charset1, c1);
879 break;
880
881 case ISO_control_code:
882 /* All ISO2022 control characters in this class have the
883 same representation in Emacs internal format. */
884 *dst++ = c1;
885 break;
886
887 case ISO_carriage_return:
888 if (coding->eol_type == CODING_EOL_CR)
889 {
890 *dst++ = '\n';
891 }
892 else if (coding->eol_type == CODING_EOL_CRLF)
893 {
894 ONE_MORE_BYTE (c1);
895 if (c1 == ISO_CODE_LF)
896 *dst++ = '\n';
897 else
898 {
899 src--;
900 *dst++ = c1;
901 }
902 }
903 else
904 {
905 *dst++ = c1;
906 }
907 break;
908
909 case ISO_shift_out:
910 if (CODING_SPEC_ISO_DESIGNATION (coding, 1) < 0)
911 goto label_invalid_escape_sequence;
912 CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
913 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
914 break;
915
916 case ISO_shift_in:
917 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
918 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
919 break;
920
921 case ISO_single_shift_2_7:
922 case ISO_single_shift_2:
923 /* SS2 is handled as an escape sequence of ESC 'N' */
924 c1 = 'N';
925 goto label_escape_sequence;
926
927 case ISO_single_shift_3:
928 /* SS2 is handled as an escape sequence of ESC 'O' */
929 c1 = 'O';
930 goto label_escape_sequence;
931
932 case ISO_control_sequence_introducer:
933 /* CSI is handled as an escape sequence of ESC '[' ... */
934 c1 = '[';
935 goto label_escape_sequence;
936
937 case ISO_escape:
938 ONE_MORE_BYTE (c1);
939 label_escape_sequence:
940 /* Escape sequences handled by Emacs are invocation,
941 designation, direction specification, and character
942 composition specification. */
943 switch (c1)
944 {
945 case '&': /* revision of following character set */
946 ONE_MORE_BYTE (c1);
947 if (!(c1 >= '@' && c1 <= '~'))
948 goto label_invalid_escape_sequence;
949 ONE_MORE_BYTE (c1);
950 if (c1 != ISO_CODE_ESC)
951 goto label_invalid_escape_sequence;
952 ONE_MORE_BYTE (c1);
953 goto label_escape_sequence;
954
955 case '$': /* designation of 2-byte character set */
956 ONE_MORE_BYTE (c1);
957 if (c1 >= '@' && c1 <= 'B')
958 { /* designation of JISX0208.1978, GB2312.1980,
959 or JISX0208.1980 */
960 DECODE_DESIGNATION (0, 2, 94, c1);
961 }
962 else if (c1 >= 0x28 && c1 <= 0x2B)
963 { /* designation of DIMENSION2_CHARS94 character set */
964 ONE_MORE_BYTE (c2);
965 DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
966 }
967 else if (c1 >= 0x2C && c1 <= 0x2F)
968 { /* designation of DIMENSION2_CHARS96 character set */
969 ONE_MORE_BYTE (c2);
970 DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
971 }
972 else
973 goto label_invalid_escape_sequence;
974 break;
975
976 case 'n': /* invocation of locking-shift-2 */
977 if (CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
978 goto label_invalid_escape_sequence;
979 CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
980 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
981 break;
982
983 case 'o': /* invocation of locking-shift-3 */
984 if (CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
985 goto label_invalid_escape_sequence;
986 CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
987 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
988 break;
989
990 case 'N': /* invocation of single-shift-2 */
991 if (CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
992 goto label_invalid_escape_sequence;
993 ONE_MORE_BYTE (c1);
994 charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
995 DECODE_ISO_CHARACTER (charset, c1);
996 break;
997
998 case 'O': /* invocation of single-shift-3 */
999 if (CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
1000 goto label_invalid_escape_sequence;
1001 ONE_MORE_BYTE (c1);
1002 charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
1003 DECODE_ISO_CHARACTER (charset, c1);
1004 break;
1005
1006 case '0': /* start composing without embeded rules */
1007 coding->composing = COMPOSING_NO_RULE_HEAD;
1008 break;
1009
1010 case '1': /* end composing */
1011 coding->composing = COMPOSING_NO;
1012 break;
1013
1014 case '2': /* start composing with embeded rules */
1015 coding->composing = COMPOSING_WITH_RULE_HEAD;
1016 break;
1017
1018 case '[': /* specification of direction */
1019 /* For the moment, nested direction is not supported.
1020 So, the value of `coding->direction' is 0 or 1: 0
1021 means left-to-right, 1 means right-to-left. */
1022 ONE_MORE_BYTE (c1);
1023 switch (c1)
1024 {
1025 case ']': /* end of the current direction */
1026 coding->direction = 0;
1027
1028 case '0': /* end of the current direction */
1029 case '1': /* start of left-to-right direction */
1030 ONE_MORE_BYTE (c1);
1031 if (c1 == ']')
1032 coding->direction = 0;
1033 else
1034 goto label_invalid_escape_sequence;
1035 break;
1036
1037 case '2': /* start of right-to-left direction */
1038 ONE_MORE_BYTE (c1);
1039 if (c1 == ']')
1040 coding->direction= 1;
1041 else
1042 goto label_invalid_escape_sequence;
1043 break;
1044
1045 default:
1046 goto label_invalid_escape_sequence;
1047 }
1048 break;
1049
1050 default:
1051 if (c1 >= 0x28 && c1 <= 0x2B)
1052 { /* designation of DIMENSION1_CHARS94 character set */
1053 ONE_MORE_BYTE (c2);
1054 DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
1055 }
1056 else if (c1 >= 0x2C && c1 <= 0x2F)
1057 { /* designation of DIMENSION1_CHARS96 character set */
1058 ONE_MORE_BYTE (c2);
1059 DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
1060 }
1061 else
1062 {
1063 goto label_invalid_escape_sequence;
1064 }
1065 }
1066 /* We must update these variables now. */
1067 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1068 charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1069 break;
1070
1071 label_invalid_escape_sequence:
1072 {
1073 int length = src - src_base;
1074
1075 bcopy (src_base, dst, length);
1076 dst += length;
1077 }
1078 }
1079 continue;
1080
1081 label_end_of_loop:
1082 coding->carryover_size = src - src_base;
1083 bcopy (src_base, coding->carryover, coding->carryover_size);
1084 src = src_base;
1085 break;
1086 }
1087
1088 /* If this is the last block of the text to be decoded, we had
1089 better just flush out all remaining codes in the text although
1090 they are not valid characters. */
1091 if (coding->last_block)
1092 {
1093 bcopy (src, dst, src_end - src);
1094 dst += (src_end - src);
1095 src = src_end;
1096 }
1097 *consumed = src - source;
1098 return dst - destination;
1099 }
1100
1101 /* ISO2022 encoding stuff. */
1102
1103 /*
1104 It is not enough to say just "ISO2022" on encoding, we have to
1105 specify more details. In Emacs, each coding-system of ISO2022
1106 variant has the following specifications:
1107 1. Initial designation to G0 thru G3.
1108 2. Allows short-form designation?
1109 3. ASCII should be designated to G0 before control characters?
1110 4. ASCII should be designated to G0 at end of line?
1111 5. 7-bit environment or 8-bit environment?
1112 6. Use locking-shift?
1113 7. Use Single-shift?
1114 And the following two are only for Japanese:
1115 8. Use ASCII in place of JIS0201-1976-Roman?
1116 9. Use JISX0208-1983 in place of JISX0208-1978?
1117 These specifications are encoded in `coding->flags' as flag bits
1118 defined by macros CODING_FLAG_ISO_XXX. See `coding.h' for more
1119 details.
1120 */
1121
1122 /* Produce codes (escape sequence) for designating CHARSET to graphic
1123 register REG. If <final-char> of CHARSET is '@', 'A', or 'B' and
1124 the coding system CODING allows, produce designation sequence of
1125 short-form. */
1126
1127 #define ENCODE_DESIGNATION(charset, reg, coding) \
1128 do { \
1129 unsigned char final_char = CHARSET_ISO_FINAL_CHAR (charset); \
1130 char *intermediate_char_94 = "()*+"; \
1131 char *intermediate_char_96 = ",-./"; \
1132 Lisp_Object temp \
1133 = Fassq (make_number (charset), Vcharset_revision_alist); \
1134 if (! NILP (temp)) \
1135 { \
1136 *dst++ = ISO_CODE_ESC; \
1137 *dst++ = '&'; \
1138 *dst++ = XINT (XCONS (temp)->cdr) + '@'; \
1139 } \
1140 *dst++ = ISO_CODE_ESC; \
1141 if (CHARSET_DIMENSION (charset) == 1) \
1142 { \
1143 if (CHARSET_CHARS (charset) == 94) \
1144 *dst++ = (unsigned char) (intermediate_char_94[reg]); \
1145 else \
1146 *dst++ = (unsigned char) (intermediate_char_96[reg]); \
1147 } \
1148 else \
1149 { \
1150 *dst++ = '$'; \
1151 if (CHARSET_CHARS (charset) == 94) \
1152 { \
1153 if (! (coding->flags & CODING_FLAG_ISO_SHORT_FORM) \
1154 || reg != 0 \
1155 || final_char < '@' || final_char > 'B') \
1156 *dst++ = (unsigned char) (intermediate_char_94[reg]); \
1157 } \
1158 else \
1159 *dst++ = (unsigned char) (intermediate_char_96[reg]); \
1160 } \
1161 *dst++ = final_char; \
1162 CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset; \
1163 } while (0)
1164
1165 /* The following two macros produce codes (control character or escape
1166 sequence) for ISO2022 single-shift functions (single-shift-2 and
1167 single-shift-3). */
1168
1169 #define ENCODE_SINGLE_SHIFT_2 \
1170 do { \
1171 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
1172 *dst++ = ISO_CODE_ESC, *dst++ = 'N'; \
1173 else \
1174 *dst++ = ISO_CODE_SS2; \
1175 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1; \
1176 } while (0)
1177
1178 #define ENCODE_SINGLE_SHIFT_3 \
1179 do { \
1180 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
1181 *dst++ = ISO_CODE_ESC, *dst++ = 'O'; \
1182 else \
1183 *dst++ = ISO_CODE_SS3; \
1184 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1; \
1185 } while (0)
1186
1187 /* The following four macros produce codes (control character or
1188 escape sequence) for ISO2022 locking-shift functions (shift-in,
1189 shift-out, locking-shift-2, and locking-shift-3). */
1190
1191 #define ENCODE_SHIFT_IN \
1192 do { \
1193 *dst++ = ISO_CODE_SI; \
1194 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0; \
1195 } while (0)
1196
1197 #define ENCODE_SHIFT_OUT \
1198 do { \
1199 *dst++ = ISO_CODE_SO; \
1200 CODING_SPEC_ISO_INVOCATION (coding, 0) = 1; \
1201 } while (0)
1202
1203 #define ENCODE_LOCKING_SHIFT_2 \
1204 do { \
1205 *dst++ = ISO_CODE_ESC, *dst++ = 'n'; \
1206 CODING_SPEC_ISO_INVOCATION (coding, 0) = 2; \
1207 } while (0)
1208
1209 #define ENCODE_LOCKING_SHIFT_3 \
1210 do { \
1211 *dst++ = ISO_CODE_ESC, *dst++ = 'o'; \
1212 CODING_SPEC_ISO_INVOCATION (coding, 0) = 3; \
1213 } while (0)
1214
1215 /* Produce codes for a DIMENSION1 character whose character set is
1216 CHARSET and whose position-code is C1. Designation and invocation
1217 sequences are also produced in advance if necessary. */
1218
1219
1220 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1) \
1221 do { \
1222 if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding)) \
1223 { \
1224 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
1225 *dst++ = c1 & 0x7F; \
1226 else \
1227 *dst++ = c1 | 0x80; \
1228 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0; \
1229 break; \
1230 } \
1231 else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0)) \
1232 { \
1233 *dst++ = c1 & 0x7F; \
1234 break; \
1235 } \
1236 else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1)) \
1237 { \
1238 *dst++ = c1 | 0x80; \
1239 break; \
1240 } \
1241 else if (coding->flags & CODING_FLAG_ISO_SAFE \
1242 && !CODING_SPEC_ISO_EXPECTED_CHARSETS (coding)[charset]) \
1243 { \
1244 /* We should not encode this character, instead produce one or \
1245 two `?'s. */ \
1246 *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION; \
1247 if (CHARSET_WIDTH (charset) == 2) \
1248 *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION; \
1249 break; \
1250 } \
1251 else \
1252 /* Since CHARSET is not yet invoked to any graphic planes, we \
1253 must invoke it, or, at first, designate it to some graphic \
1254 register. Then repeat the loop to actually produce the \
1255 character. */ \
1256 dst = encode_invocation_designation (charset, coding, dst); \
1257 } while (1)
1258
1259 /* Produce codes for a DIMENSION2 character whose character set is
1260 CHARSET and whose position-codes are C1 and C2. Designation and
1261 invocation codes are also produced in advance if necessary. */
1262
1263 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2) \
1264 do { \
1265 if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding)) \
1266 { \
1267 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
1268 *dst++ = c1 & 0x7F, *dst++ = c2 & 0x7F; \
1269 else \
1270 *dst++ = c1 | 0x80, *dst++ = c2 | 0x80; \
1271 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0; \
1272 break; \
1273 } \
1274 else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0)) \
1275 { \
1276 *dst++ = c1 & 0x7F, *dst++= c2 & 0x7F; \
1277 break; \
1278 } \
1279 else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1)) \
1280 { \
1281 *dst++ = c1 | 0x80, *dst++= c2 | 0x80; \
1282 break; \
1283 } \
1284 else if (coding->flags & CODING_FLAG_ISO_SAFE \
1285 && !CODING_SPEC_ISO_EXPECTED_CHARSETS (coding)[charset]) \
1286 { \
1287 /* We should not encode this character, instead produce one or \
1288 two `?'s. */ \
1289 *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION; \
1290 if (CHARSET_WIDTH (charset) == 2) \
1291 *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION; \
1292 break; \
1293 } \
1294 else \
1295 /* Since CHARSET is not yet invoked to any graphic planes, we \
1296 must invoke it, or, at first, designate it to some graphic \
1297 register. Then repeat the loop to actually produce the \
1298 character. */ \
1299 dst = encode_invocation_designation (charset, coding, dst); \
1300 } while (1)
1301
1302 #define ENCODE_ISO_CHARACTER(charset, c1, c2) \
1303 do { \
1304 int c_alt, charset_alt; \
1305 if (!NILP (unification_table) \
1306 && ((c_alt = unify_char (unification_table, -1, charset, c1, c2)) \
1307 >= 0)) \
1308 SPLIT_CHAR (c_alt, charset_alt, c1, c2); \
1309 else \
1310 charset_alt = charset; \
1311 if (CHARSET_DIMENSION (charset_alt) == 1) \
1312 ENCODE_ISO_CHARACTER_DIMENSION1 (charset_alt, c1); \
1313 else \
1314 ENCODE_ISO_CHARACTER_DIMENSION2 (charset_alt, c1, c2); \
1315 } while (0)
1316
1317 /* Produce designation and invocation codes at a place pointed by DST
1318 to use CHARSET. The element `spec.iso2022' of *CODING is updated.
1319 Return new DST. */
1320
1321 unsigned char *
1322 encode_invocation_designation (charset, coding, dst)
1323 int charset;
1324 struct coding_system *coding;
1325 unsigned char *dst;
1326 {
1327 int reg; /* graphic register number */
1328
1329 /* At first, check designations. */
1330 for (reg = 0; reg < 4; reg++)
1331 if (charset == CODING_SPEC_ISO_DESIGNATION (coding, reg))
1332 break;
1333
1334 if (reg >= 4)
1335 {
1336 /* CHARSET is not yet designated to any graphic registers. */
1337 /* At first check the requested designation. */
1338 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1339 if (reg == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
1340 /* Since CHARSET requests no special designation, designate it
1341 to graphic register 0. */
1342 reg = 0;
1343
1344 ENCODE_DESIGNATION (charset, reg, coding);
1345 }
1346
1347 if (CODING_SPEC_ISO_INVOCATION (coding, 0) != reg
1348 && CODING_SPEC_ISO_INVOCATION (coding, 1) != reg)
1349 {
1350 /* Since the graphic register REG is not invoked to any graphic
1351 planes, invoke it to graphic plane 0. */
1352 switch (reg)
1353 {
1354 case 0: /* graphic register 0 */
1355 ENCODE_SHIFT_IN;
1356 break;
1357
1358 case 1: /* graphic register 1 */
1359 ENCODE_SHIFT_OUT;
1360 break;
1361
1362 case 2: /* graphic register 2 */
1363 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1364 ENCODE_SINGLE_SHIFT_2;
1365 else
1366 ENCODE_LOCKING_SHIFT_2;
1367 break;
1368
1369 case 3: /* graphic register 3 */
1370 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1371 ENCODE_SINGLE_SHIFT_3;
1372 else
1373 ENCODE_LOCKING_SHIFT_3;
1374 break;
1375 }
1376 }
1377 return dst;
1378 }
1379
1380 /* The following two macros produce codes for indicating composition. */
1381 #define ENCODE_COMPOSITION_NO_RULE_START *dst++ = ISO_CODE_ESC, *dst++ = '0'
1382 #define ENCODE_COMPOSITION_WITH_RULE_START *dst++ = ISO_CODE_ESC, *dst++ = '2'
1383 #define ENCODE_COMPOSITION_END *dst++ = ISO_CODE_ESC, *dst++ = '1'
1384
1385 /* The following three macros produce codes for indicating direction
1386 of text. */
1387 #define ENCODE_CONTROL_SEQUENCE_INTRODUCER \
1388 do { \
1389 if (coding->flags == CODING_FLAG_ISO_SEVEN_BITS) \
1390 *dst++ = ISO_CODE_ESC, *dst++ = '['; \
1391 else \
1392 *dst++ = ISO_CODE_CSI; \
1393 } while (0)
1394
1395 #define ENCODE_DIRECTION_R2L \
1396 ENCODE_CONTROL_SEQUENCE_INTRODUCER, *dst++ = '2', *dst++ = ']'
1397
1398 #define ENCODE_DIRECTION_L2R \
1399 ENCODE_CONTROL_SEQUENCE_INTRODUCER, *dst++ = '0', *dst++ = ']'
1400
1401 /* Produce codes for designation and invocation to reset the graphic
1402 planes and registers to initial state. */
1403 #define ENCODE_RESET_PLANE_AND_REGISTER \
1404 do { \
1405 int reg; \
1406 if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0) \
1407 ENCODE_SHIFT_IN; \
1408 for (reg = 0; reg < 4; reg++) \
1409 if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg) >= 0 \
1410 && (CODING_SPEC_ISO_DESIGNATION (coding, reg) \
1411 != CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg))) \
1412 ENCODE_DESIGNATION \
1413 (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg), reg, coding); \
1414 } while (0)
1415
1416 /* Produce designation sequences of charsets in the line started from
1417 *SRC to a place pointed by DSTP.
1418
1419 If the current block ends before any end-of-line, we may fail to
1420 find all the necessary *designations. */
1421 encode_designation_at_bol (coding, table, src, src_end, dstp)
1422 struct coding_system *coding;
1423 Lisp_Object table;
1424 unsigned char *src, *src_end, **dstp;
1425 {
1426 int charset, c, found = 0, reg;
1427 /* Table of charsets to be designated to each graphic register. */
1428 int r[4];
1429 unsigned char *dst = *dstp;
1430
1431 for (reg = 0; reg < 4; reg++)
1432 r[reg] = -1;
1433
1434 while (src < src_end && *src != '\n' && found < 4)
1435 {
1436 int bytes = BYTES_BY_CHAR_HEAD (*src);
1437
1438 if (NILP (table))
1439 charset = CHARSET_AT (src);
1440 else
1441 {
1442 int c_alt, c1, c2;
1443
1444 SPLIT_STRING(src, bytes, charset, c1, c2);
1445 if ((c_alt = unify_char (table, -1, charset, c1, c2)) >= 0)
1446 charset = CHAR_CHARSET (c_alt);
1447 }
1448
1449 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1450 if (r[reg] == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
1451 {
1452 found++;
1453 r[reg] = charset;
1454 }
1455
1456 src += bytes;
1457 }
1458
1459 if (found)
1460 {
1461 for (reg = 0; reg < 4; reg++)
1462 if (r[reg] >= 0
1463 && CODING_SPEC_ISO_DESIGNATION (coding, reg) != r[reg])
1464 ENCODE_DESIGNATION (r[reg], reg, coding);
1465 *dstp = dst;
1466 }
1467 }
1468
1469 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
1470
1471 int
1472 encode_coding_iso2022 (coding, source, destination,
1473 src_bytes, dst_bytes, consumed)
1474 struct coding_system *coding;
1475 unsigned char *source, *destination;
1476 int src_bytes, dst_bytes;
1477 int *consumed;
1478 {
1479 unsigned char *src = source;
1480 unsigned char *src_end = source + src_bytes;
1481 unsigned char *dst = destination;
1482 unsigned char *dst_end = destination + dst_bytes;
1483 /* Since the maximum bytes produced by each loop is 20, we subtract 19
1484 from DST_END to assure overflow checking is necessary only at the
1485 head of loop. */
1486 unsigned char *adjusted_dst_end = dst_end - 19;
1487 Lisp_Object unification_table
1488 = coding->character_unification_table_for_encode;
1489
1490 if (!NILP (Venable_character_unification) && NILP (unification_table))
1491 unification_table = Vstandard_character_unification_table_for_encode;
1492
1493 while (src < src_end && dst < adjusted_dst_end)
1494 {
1495 /* SRC_BASE remembers the start position in source in each loop.
1496 The loop will be exited when there's not enough source text
1497 to analyze multi-byte codes (within macros ONE_MORE_BYTE,
1498 TWO_MORE_BYTES, and THREE_MORE_BYTES). In that case, SRC is
1499 reset to SRC_BASE before exiting. */
1500 unsigned char *src_base = src;
1501 int charset, c1, c2, c3, c4;
1502
1503 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL
1504 && CODING_SPEC_ISO_BOL (coding))
1505 {
1506 /* We have to produce designation sequences if any now. */
1507 encode_designation_at_bol (coding, unification_table,
1508 src, src_end, &dst);
1509 CODING_SPEC_ISO_BOL (coding) = 0;
1510 }
1511
1512 c1 = *src++;
1513 /* If we are seeing a component of a composite character, we are
1514 seeing a leading-code specially encoded for composition, or a
1515 composition rule if composing with rule. We must set C1
1516 to a normal leading-code or an ASCII code. If we are not at
1517 a composed character, we must reset the composition state. */
1518 if (COMPOSING_P (coding->composing))
1519 {
1520 if (c1 < 0xA0)
1521 {
1522 /* We are not in a composite character any longer. */
1523 coding->composing = COMPOSING_NO;
1524 ENCODE_COMPOSITION_END;
1525 }
1526 else
1527 {
1528 if (coding->composing == COMPOSING_WITH_RULE_RULE)
1529 {
1530 *dst++ = c1 & 0x7F;
1531 coding->composing = COMPOSING_WITH_RULE_HEAD;
1532 continue;
1533 }
1534 else if (coding->composing == COMPOSING_WITH_RULE_HEAD)
1535 coding->composing = COMPOSING_WITH_RULE_RULE;
1536 if (c1 == 0xA0)
1537 {
1538 /* This is an ASCII component. */
1539 ONE_MORE_BYTE (c1);
1540 c1 &= 0x7F;
1541 }
1542 else
1543 /* This is a leading-code of non ASCII component. */
1544 c1 -= 0x20;
1545 }
1546 }
1547
1548 /* Now encode one character. C1 is a control character, an
1549 ASCII character, or a leading-code of multi-byte character. */
1550 switch (emacs_code_class[c1])
1551 {
1552 case EMACS_ascii_code:
1553 ENCODE_ISO_CHARACTER (CHARSET_ASCII, c1, /* dummy */ c2);
1554 break;
1555
1556 case EMACS_control_code:
1557 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
1558 ENCODE_RESET_PLANE_AND_REGISTER;
1559 *dst++ = c1;
1560 break;
1561
1562 case EMACS_carriage_return_code:
1563 if (!coding->selective)
1564 {
1565 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
1566 ENCODE_RESET_PLANE_AND_REGISTER;
1567 *dst++ = c1;
1568 break;
1569 }
1570 /* fall down to treat '\r' as '\n' ... */
1571
1572 case EMACS_linefeed_code:
1573 if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
1574 ENCODE_RESET_PLANE_AND_REGISTER;
1575 if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL)
1576 bcopy (coding->spec.iso2022.initial_designation,
1577 coding->spec.iso2022.current_designation,
1578 sizeof coding->spec.iso2022.initial_designation);
1579 if (coding->eol_type == CODING_EOL_LF
1580 || coding->eol_type == CODING_EOL_UNDECIDED)
1581 *dst++ = ISO_CODE_LF;
1582 else if (coding->eol_type == CODING_EOL_CRLF)
1583 *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
1584 else
1585 *dst++ = ISO_CODE_CR;
1586 CODING_SPEC_ISO_BOL (coding) = 1;
1587 break;
1588
1589 case EMACS_leading_code_2:
1590 ONE_MORE_BYTE (c2);
1591 if (c2 < 0xA0)
1592 {
1593 /* invalid sequence */
1594 *dst++ = c1;
1595 *dst++ = c2;
1596 }
1597 else
1598 ENCODE_ISO_CHARACTER (c1, c2, /* dummy */ c3);
1599 break;
1600
1601 case EMACS_leading_code_3:
1602 TWO_MORE_BYTES (c2, c3);
1603 if (c2 < 0xA0 || c3 < 0xA0)
1604 {
1605 /* invalid sequence */
1606 *dst++ = c1;
1607 *dst++ = c2;
1608 *dst++ = c3;
1609 }
1610 else if (c1 < LEADING_CODE_PRIVATE_11)
1611 ENCODE_ISO_CHARACTER (c1, c2, c3);
1612 else
1613 ENCODE_ISO_CHARACTER (c2, c3, /* dummy */ c4);
1614 break;
1615
1616 case EMACS_leading_code_4:
1617 THREE_MORE_BYTES (c2, c3, c4);
1618 if (c2 < 0xA0 || c3 < 0xA0 || c4 < 0xA0)
1619 {
1620 /* invalid sequence */
1621 *dst++ = c1;
1622 *dst++ = c2;
1623 *dst++ = c3;
1624 *dst++ = c4;
1625 }
1626 else
1627 ENCODE_ISO_CHARACTER (c2, c3, c4);
1628 break;
1629
1630 case EMACS_leading_code_composition:
1631 ONE_MORE_BYTE (c2);
1632 if (c2 < 0xA0)
1633 {
1634 /* invalid sequence */
1635 *dst++ = c1;
1636 *dst++ = c2;
1637 }
1638 else if (c2 == 0xFF)
1639 {
1640 coding->composing = COMPOSING_WITH_RULE_HEAD;
1641 ENCODE_COMPOSITION_WITH_RULE_START;
1642 }
1643 else
1644 {
1645 /* Rewind one byte because it is a character code of
1646 composition elements. */
1647 src--;
1648 coding->composing = COMPOSING_NO_RULE_HEAD;
1649 ENCODE_COMPOSITION_NO_RULE_START;
1650 }
1651 break;
1652
1653 case EMACS_invalid_code:
1654 *dst++ = c1;
1655 break;
1656 }
1657 continue;
1658 label_end_of_loop:
1659 /* We reach here because the source date ends not at character
1660 boundary. */
1661 coding->carryover_size = src_end - src_base;
1662 bcopy (src_base, coding->carryover, coding->carryover_size);
1663 src = src_end;
1664 break;
1665 }
1666
1667 /* If this is the last block of the text to be encoded, we must
1668 reset graphic planes and registers to the initial state. */
1669 if (src >= src_end && coding->last_block)
1670 {
1671 ENCODE_RESET_PLANE_AND_REGISTER;
1672 if (coding->carryover_size > 0
1673 && coding->carryover_size < (dst_end - dst))
1674 {
1675 bcopy (coding->carryover, dst, coding->carryover_size);
1676 dst += coding->carryover_size;
1677 coding->carryover_size = 0;
1678 }
1679 }
1680 *consumed = src - source;
1681 return dst - destination;
1682 }
1683
1684 \f
1685 /*** 4. SJIS and BIG5 handlers ***/
1686
1687 /* Although SJIS and BIG5 are not ISO's coding system, they are used
1688 quite widely. So, for the moment, Emacs supports them in the bare
1689 C code. But, in the future, they may be supported only by CCL. */
1690
1691 /* SJIS is a coding system encoding three character sets: ASCII, right
1692 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
1693 as is. A character of charset katakana-jisx0201 is encoded by
1694 "position-code + 0x80". A character of charset japanese-jisx0208
1695 is encoded in 2-byte but two position-codes are divided and shifted
1696 so that it fit in the range below.
1697
1698 --- CODE RANGE of SJIS ---
1699 (character set) (range)
1700 ASCII 0x00 .. 0x7F
1701 KATAKANA-JISX0201 0xA0 .. 0xDF
1702 JISX0208 (1st byte) 0x80 .. 0x9F and 0xE0 .. 0xFF
1703 (2nd byte) 0x40 .. 0xFF
1704 -------------------------------
1705
1706 */
1707
1708 /* BIG5 is a coding system encoding two character sets: ASCII and
1709 Big5. An ASCII character is encoded as is. Big5 is a two-byte
1710 character set and is encoded in two-byte.
1711
1712 --- CODE RANGE of BIG5 ---
1713 (character set) (range)
1714 ASCII 0x00 .. 0x7F
1715 Big5 (1st byte) 0xA1 .. 0xFE
1716 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
1717 --------------------------
1718
1719 Since the number of characters in Big5 is larger than maximum
1720 characters in Emacs' charset (96x96), it can't be handled as one
1721 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
1722 and `charset-big5-2'. Both are DIMENSION2 and CHARS94. The former
1723 contains frequently used characters and the latter contains less
1724 frequently used characters. */
1725
1726 /* Macros to decode or encode a character of Big5 in BIG5. B1 and B2
1727 are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
1728 C1 and C2 are the 1st and 2nd position-codes of of Emacs' internal
1729 format. CHARSET is `charset_big5_1' or `charset_big5_2'. */
1730
1731 /* Number of Big5 characters which have the same code in 1st byte. */
1732 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
1733
1734 #define DECODE_BIG5(b1, b2, charset, c1, c2) \
1735 do { \
1736 unsigned int temp \
1737 = (b1 - 0xA1) * BIG5_SAME_ROW + b2 - (b2 < 0x7F ? 0x40 : 0x62); \
1738 if (b1 < 0xC9) \
1739 charset = charset_big5_1; \
1740 else \
1741 { \
1742 charset = charset_big5_2; \
1743 temp -= (0xC9 - 0xA1) * BIG5_SAME_ROW; \
1744 } \
1745 c1 = temp / (0xFF - 0xA1) + 0x21; \
1746 c2 = temp % (0xFF - 0xA1) + 0x21; \
1747 } while (0)
1748
1749 #define ENCODE_BIG5(charset, c1, c2, b1, b2) \
1750 do { \
1751 unsigned int temp = (c1 - 0x21) * (0xFF - 0xA1) + (c2 - 0x21); \
1752 if (charset == charset_big5_2) \
1753 temp += BIG5_SAME_ROW * (0xC9 - 0xA1); \
1754 b1 = temp / BIG5_SAME_ROW + 0xA1; \
1755 b2 = temp % BIG5_SAME_ROW; \
1756 b2 += b2 < 0x3F ? 0x40 : 0x62; \
1757 } while (0)
1758
1759 #define DECODE_SJIS_BIG5_CHARACTER(charset, c1, c2) \
1760 do { \
1761 int c_alt, charset_alt = (charset); \
1762 if (!NILP (unification_table) \
1763 && ((c_alt = unify_char (unification_table, \
1764 -1, (charset), c1, c2)) >= 0)) \
1765 SPLIT_CHAR (c_alt, charset_alt, c1, c2); \
1766 if (charset_alt == CHARSET_ASCII || charset_alt < 0) \
1767 DECODE_CHARACTER_ASCII (c1); \
1768 else if (CHARSET_DIMENSION (charset_alt) == 1) \
1769 DECODE_CHARACTER_DIMENSION1 (charset_alt, c1); \
1770 else \
1771 DECODE_CHARACTER_DIMENSION2 (charset_alt, c1, c2); \
1772 } while (0)
1773
1774 #define ENCODE_SJIS_BIG5_CHARACTER(charset, c1, c2) \
1775 do { \
1776 int c_alt, charset_alt; \
1777 if (!NILP (unification_table) \
1778 && ((c_alt = unify_char (unification_table, -1, charset, c1, c2)) \
1779 >= 0)) \
1780 SPLIT_CHAR (c_alt, charset_alt, c1, c2); \
1781 else \
1782 charset_alt = charset; \
1783 if (charset_alt == charset_ascii) \
1784 *dst++ = c1; \
1785 else if (CHARSET_DIMENSION (charset_alt) == 1) \
1786 { \
1787 if (sjis_p && charset_alt == charset_katakana_jisx0201) \
1788 *dst++ = c1; \
1789 else \
1790 *dst++ = charset_alt, *dst++ = c1; \
1791 } \
1792 else \
1793 { \
1794 c1 &= 0x7F, c2 &= 0x7F; \
1795 if (sjis_p && charset_alt == charset_jisx0208) \
1796 { \
1797 unsigned char s1, s2; \
1798 \
1799 ENCODE_SJIS (c1, c2, s1, s2); \
1800 *dst++ = s1, *dst++ = s2; \
1801 } \
1802 else if (!sjis_p \
1803 && (charset_alt == charset_big5_1 \
1804 || charset_alt == charset_big5_2)) \
1805 { \
1806 unsigned char b1, b2; \
1807 \
1808 ENCODE_BIG5 (charset_alt, c1, c2, b1, b2); \
1809 *dst++ = b1, *dst++ = b2; \
1810 } \
1811 else \
1812 *dst++ = charset_alt, *dst++ = c1, *dst++ = c2; \
1813 } \
1814 } while (0);
1815
1816 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1817 Check if a text is encoded in SJIS. If it is, return
1818 CODING_CATEGORY_MASK_SJIS, else return 0. */
1819
1820 int
1821 detect_coding_sjis (src, src_end)
1822 unsigned char *src, *src_end;
1823 {
1824 unsigned char c;
1825
1826 while (src < src_end)
1827 {
1828 c = *src++;
1829 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
1830 return 0;
1831 if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
1832 {
1833 if (src < src_end && *src++ < 0x40)
1834 return 0;
1835 }
1836 }
1837 return CODING_CATEGORY_MASK_SJIS;
1838 }
1839
1840 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1841 Check if a text is encoded in BIG5. If it is, return
1842 CODING_CATEGORY_MASK_BIG5, else return 0. */
1843
1844 int
1845 detect_coding_big5 (src, src_end)
1846 unsigned char *src, *src_end;
1847 {
1848 unsigned char c;
1849
1850 while (src < src_end)
1851 {
1852 c = *src++;
1853 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
1854 return 0;
1855 if (c >= 0xA1)
1856 {
1857 if (src >= src_end)
1858 break;
1859 c = *src++;
1860 if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
1861 return 0;
1862 }
1863 }
1864 return CODING_CATEGORY_MASK_BIG5;
1865 }
1866
1867 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
1868 If SJIS_P is 1, decode SJIS text, else decode BIG5 test. */
1869
1870 int
1871 decode_coding_sjis_big5 (coding, source, destination,
1872 src_bytes, dst_bytes, consumed, sjis_p)
1873 struct coding_system *coding;
1874 unsigned char *source, *destination;
1875 int src_bytes, dst_bytes;
1876 int *consumed;
1877 int sjis_p;
1878 {
1879 unsigned char *src = source;
1880 unsigned char *src_end = source + src_bytes;
1881 unsigned char *dst = destination;
1882 unsigned char *dst_end = destination + dst_bytes;
1883 /* Since the maximum bytes produced by each loop is 4, we subtract 3
1884 from DST_END to assure overflow checking is necessary only at the
1885 head of loop. */
1886 unsigned char *adjusted_dst_end = dst_end - 3;
1887 Lisp_Object unification_table
1888 = coding->character_unification_table_for_decode;
1889
1890 if (!NILP (Venable_character_unification) && NILP (unification_table))
1891 unification_table = Vstandard_character_unification_table_for_decode;
1892
1893 while (src < src_end && dst < adjusted_dst_end)
1894 {
1895 /* SRC_BASE remembers the start position in source in each loop.
1896 The loop will be exited when there's not enough source text
1897 to analyze two-byte character (within macro ONE_MORE_BYTE).
1898 In that case, SRC is reset to SRC_BASE before exiting. */
1899 unsigned char *src_base = src;
1900 unsigned char c1 = *src++, c2, c3, c4;
1901
1902 if (c1 == '\r')
1903 {
1904 if (coding->eol_type == CODING_EOL_CRLF)
1905 {
1906 ONE_MORE_BYTE (c2);
1907 if (c2 == '\n')
1908 *dst++ = c2;
1909 else
1910 /* To process C2 again, SRC is subtracted by 1. */
1911 *dst++ = c1, src--;
1912 }
1913 else
1914 *dst++ = c1;
1915 }
1916 else if (c1 < 0x20)
1917 *dst++ = c1;
1918 else if (c1 < 0x80)
1919 DECODE_SJIS_BIG5_CHARACTER (charset_ascii, c1, /* dummy */ c2);
1920 else if (c1 < 0xA0 || c1 >= 0xE0)
1921 {
1922 /* SJIS -> JISX0208, BIG5 -> Big5 (only if 0xE0 <= c1 < 0xFF) */
1923 if (sjis_p)
1924 {
1925 ONE_MORE_BYTE (c2);
1926 DECODE_SJIS (c1, c2, c3, c4);
1927 DECODE_SJIS_BIG5_CHARACTER (charset_jisx0208, c3, c4);
1928 }
1929 else if (c1 >= 0xE0 && c1 < 0xFF)
1930 {
1931 int charset;
1932
1933 ONE_MORE_BYTE (c2);
1934 DECODE_BIG5 (c1, c2, charset, c3, c4);
1935 DECODE_SJIS_BIG5_CHARACTER (charset, c3, c4);
1936 }
1937 else /* Invalid code */
1938 *dst++ = c1;
1939 }
1940 else
1941 {
1942 /* SJIS -> JISX0201-Kana, BIG5 -> Big5 */
1943 if (sjis_p)
1944 DECODE_SJIS_BIG5_CHARACTER (charset_katakana_jisx0201, c1, /* dummy */ c2);
1945 else
1946 {
1947 int charset;
1948
1949 ONE_MORE_BYTE (c2);
1950 DECODE_BIG5 (c1, c2, charset, c3, c4);
1951 DECODE_SJIS_BIG5_CHARACTER (charset, c3, c4);
1952 }
1953 }
1954 continue;
1955
1956 label_end_of_loop:
1957 coding->carryover_size = src - src_base;
1958 bcopy (src_base, coding->carryover, coding->carryover_size);
1959 src = src_base;
1960 break;
1961 }
1962
1963 *consumed = src - source;
1964 return dst - destination;
1965 }
1966
1967 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
1968 This function can encode `charset_ascii', `charset_katakana_jisx0201',
1969 `charset_jisx0208', `charset_big5_1', and `charset_big5-2'. We are
1970 sure that all these charsets are registered as official charset
1971 (i.e. do not have extended leading-codes). Characters of other
1972 charsets are produced without any encoding. If SJIS_P is 1, encode
1973 SJIS text, else encode BIG5 text. */
1974
1975 int
1976 encode_coding_sjis_big5 (coding, source, destination,
1977 src_bytes, dst_bytes, consumed, sjis_p)
1978 struct coding_system *coding;
1979 unsigned char *source, *destination;
1980 int src_bytes, dst_bytes;
1981 int *consumed;
1982 int sjis_p;
1983 {
1984 unsigned char *src = source;
1985 unsigned char *src_end = source + src_bytes;
1986 unsigned char *dst = destination;
1987 unsigned char *dst_end = destination + dst_bytes;
1988 /* Since the maximum bytes produced by each loop is 2, we subtract 1
1989 from DST_END to assure overflow checking is necessary only at the
1990 head of loop. */
1991 unsigned char *adjusted_dst_end = dst_end - 1;
1992 Lisp_Object unification_table
1993 = coding->character_unification_table_for_encode;
1994
1995 if (!NILP (Venable_character_unification) && NILP (unification_table))
1996 unification_table = Vstandard_character_unification_table_for_encode;
1997
1998 while (src < src_end && dst < adjusted_dst_end)
1999 {
2000 /* SRC_BASE remembers the start position in source in each loop.
2001 The loop will be exited when there's not enough source text
2002 to analyze multi-byte codes (within macros ONE_MORE_BYTE and
2003 TWO_MORE_BYTES). In that case, SRC is reset to SRC_BASE
2004 before exiting. */
2005 unsigned char *src_base = src;
2006 unsigned char c1 = *src++, c2, c3, c4;
2007
2008 if (coding->composing)
2009 {
2010 if (c1 == 0xA0)
2011 {
2012 ONE_MORE_BYTE (c1);
2013 c1 &= 0x7F;
2014 }
2015 else if (c1 >= 0xA0)
2016 c1 -= 0x20;
2017 else
2018 coding->composing = 0;
2019 }
2020
2021 switch (emacs_code_class[c1])
2022 {
2023 case EMACS_ascii_code:
2024 ENCODE_SJIS_BIG5_CHARACTER (charset_ascii, c1, /* dummy */ c2);
2025 break;
2026
2027 case EMACS_control_code:
2028 *dst++ = c1;
2029 break;
2030
2031 case EMACS_carriage_return_code:
2032 if (!coding->selective)
2033 {
2034 *dst++ = c1;
2035 break;
2036 }
2037 /* fall down to treat '\r' as '\n' ... */
2038
2039 case EMACS_linefeed_code:
2040 if (coding->eol_type == CODING_EOL_LF
2041 || coding->eol_type == CODING_EOL_UNDECIDED)
2042 *dst++ = '\n';
2043 else if (coding->eol_type == CODING_EOL_CRLF)
2044 *dst++ = '\r', *dst++ = '\n';
2045 else
2046 *dst++ = '\r';
2047 break;
2048
2049 case EMACS_leading_code_2:
2050 ONE_MORE_BYTE (c2);
2051 ENCODE_SJIS_BIG5_CHARACTER (c1, c2, /* dummy */ c3);
2052 break;
2053
2054 case EMACS_leading_code_3:
2055 TWO_MORE_BYTES (c2, c3);
2056 ENCODE_SJIS_BIG5_CHARACTER (c1, c2, c3);
2057 break;
2058
2059 case EMACS_leading_code_4:
2060 THREE_MORE_BYTES (c2, c3, c4);
2061 ENCODE_SJIS_BIG5_CHARACTER (c2, c3, c4);
2062 break;
2063
2064 case EMACS_leading_code_composition:
2065 coding->composing = 1;
2066 break;
2067
2068 default: /* i.e. case EMACS_invalid_code: */
2069 *dst++ = c1;
2070 }
2071 continue;
2072
2073 label_end_of_loop:
2074 coding->carryover_size = src_end - src_base;
2075 bcopy (src_base, coding->carryover, coding->carryover_size);
2076 src = src_end;
2077 break;
2078 }
2079
2080 *consumed = src - source;
2081 return dst - destination;
2082 }
2083
2084 \f
2085 /*** 5. End-of-line handlers ***/
2086
2087 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
2088 This function is called only when `coding->eol_type' is
2089 CODING_EOL_CRLF or CODING_EOL_CR. */
2090
2091 decode_eol (coding, source, destination, src_bytes, dst_bytes, consumed)
2092 struct coding_system *coding;
2093 unsigned char *source, *destination;
2094 int src_bytes, dst_bytes;
2095 int *consumed;
2096 {
2097 unsigned char *src = source;
2098 unsigned char *src_end = source + src_bytes;
2099 unsigned char *dst = destination;
2100 unsigned char *dst_end = destination + dst_bytes;
2101 int produced;
2102
2103 switch (coding->eol_type)
2104 {
2105 case CODING_EOL_CRLF:
2106 {
2107 /* Since the maximum bytes produced by each loop is 2, we
2108 subtract 1 from DST_END to assure overflow checking is
2109 necessary only at the head of loop. */
2110 unsigned char *adjusted_dst_end = dst_end - 1;
2111
2112 while (src < src_end && dst < adjusted_dst_end)
2113 {
2114 unsigned char *src_base = src;
2115 unsigned char c = *src++;
2116 if (c == '\r')
2117 {
2118 ONE_MORE_BYTE (c);
2119 if (c != '\n')
2120 *dst++ = '\r';
2121 *dst++ = c;
2122 }
2123 else
2124 *dst++ = c;
2125 continue;
2126
2127 label_end_of_loop:
2128 coding->carryover_size = src - src_base;
2129 bcopy (src_base, coding->carryover, coding->carryover_size);
2130 src = src_base;
2131 break;
2132 }
2133 *consumed = src - source;
2134 produced = dst - destination;
2135 break;
2136 }
2137
2138 case CODING_EOL_CR:
2139 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2140 bcopy (source, destination, produced);
2141 dst_end = destination + produced;
2142 while (dst < dst_end)
2143 if (*dst++ == '\r') dst[-1] = '\n';
2144 *consumed = produced;
2145 break;
2146
2147 default: /* i.e. case: CODING_EOL_LF */
2148 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2149 bcopy (source, destination, produced);
2150 *consumed = produced;
2151 break;
2152 }
2153
2154 return produced;
2155 }
2156
2157 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions". Encode
2158 format of end-of-line according to `coding->eol_type'. If
2159 `coding->selective' is 1, code '\r' in source text also means
2160 end-of-line. */
2161
2162 encode_eol (coding, source, destination, src_bytes, dst_bytes, consumed)
2163 struct coding_system *coding;
2164 unsigned char *source, *destination;
2165 int src_bytes, dst_bytes;
2166 int *consumed;
2167 {
2168 unsigned char *src = source;
2169 unsigned char *dst = destination;
2170 int produced;
2171
2172 if (src_bytes <= 0)
2173 return 0;
2174
2175 switch (coding->eol_type)
2176 {
2177 case CODING_EOL_LF:
2178 case CODING_EOL_UNDECIDED:
2179 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2180 bcopy (source, destination, produced);
2181 if (coding->selective)
2182 {
2183 int i = produced;
2184 while (i--)
2185 if (*dst++ == '\r') dst[-1] = '\n';
2186 }
2187 *consumed = produced;
2188
2189 case CODING_EOL_CRLF:
2190 {
2191 unsigned char c;
2192 unsigned char *src_end = source + src_bytes;
2193 unsigned char *dst_end = destination + dst_bytes;
2194 /* Since the maximum bytes produced by each loop is 2, we
2195 subtract 1 from DST_END to assure overflow checking is
2196 necessary only at the head of loop. */
2197 unsigned char *adjusted_dst_end = dst_end - 1;
2198
2199 while (src < src_end && dst < adjusted_dst_end)
2200 {
2201 c = *src++;
2202 if (c == '\n' || (c == '\r' && coding->selective))
2203 *dst++ = '\r', *dst++ = '\n';
2204 else
2205 *dst++ = c;
2206 }
2207 produced = dst - destination;
2208 *consumed = src - source;
2209 break;
2210 }
2211
2212 default: /* i.e. case CODING_EOL_CR: */
2213 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2214 bcopy (source, destination, produced);
2215 {
2216 int i = produced;
2217 while (i--)
2218 if (*dst++ == '\n') dst[-1] = '\r';
2219 }
2220 *consumed = produced;
2221 }
2222
2223 return produced;
2224 }
2225
2226 \f
2227 /*** 6. C library functions ***/
2228
2229 /* In Emacs Lisp, coding system is represented by a Lisp symbol which
2230 has a property `coding-system'. The value of this property is a
2231 vector of length 5 (called as coding-vector). Among elements of
2232 this vector, the first (element[0]) and the fifth (element[4])
2233 carry important information for decoding/encoding. Before
2234 decoding/encoding, this information should be set in fields of a
2235 structure of type `coding_system'.
2236
2237 A value of property `coding-system' can be a symbol of another
2238 subsidiary coding-system. In that case, Emacs gets coding-vector
2239 from that symbol.
2240
2241 `element[0]' contains information to be set in `coding->type'. The
2242 value and its meaning is as follows:
2243
2244 0 -- coding_type_emacs_mule
2245 1 -- coding_type_sjis
2246 2 -- coding_type_iso2022
2247 3 -- coding_type_big5
2248 4 -- coding_type_ccl encoder/decoder written in CCL
2249 nil -- coding_type_no_conversion
2250 t -- coding_type_undecided (automatic conversion on decoding,
2251 no-conversion on encoding)
2252
2253 `element[4]' contains information to be set in `coding->flags' and
2254 `coding->spec'. The meaning varies by `coding->type'.
2255
2256 If `coding->type' is `coding_type_iso2022', element[4] is a vector
2257 of length 32 (of which the first 13 sub-elements are used now).
2258 Meanings of these sub-elements are:
2259
2260 sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
2261 If the value is an integer of valid charset, the charset is
2262 assumed to be designated to graphic register N initially.
2263
2264 If the value is minus, it is a minus value of charset which
2265 reserves graphic register N, which means that the charset is
2266 not designated initially but should be designated to graphic
2267 register N just before encoding a character in that charset.
2268
2269 If the value is nil, graphic register N is never used on
2270 encoding.
2271
2272 sub-element[N] where N is 4 through 11: to be set in `coding->flags'
2273 Each value takes t or nil. See the section ISO2022 of
2274 `coding.h' for more information.
2275
2276 If `coding->type' is `coding_type_big5', element[4] is t to denote
2277 BIG5-ETen or nil to denote BIG5-HKU.
2278
2279 If `coding->type' takes the other value, element[4] is ignored.
2280
2281 Emacs Lisp's coding system also carries information about format of
2282 end-of-line in a value of property `eol-type'. If the value is
2283 integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
2284 means CODING_EOL_CR. If it is not integer, it should be a vector
2285 of subsidiary coding systems of which property `eol-type' has one
2286 of above values.
2287
2288 */
2289
2290 /* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
2291 and set it in CODING. If CODING_SYSTEM_SYMBOL is invalid, CODING
2292 is setup so that no conversion is necessary and return -1, else
2293 return 0. */
2294
2295 int
2296 setup_coding_system (coding_system, coding)
2297 Lisp_Object coding_system;
2298 struct coding_system *coding;
2299 {
2300 Lisp_Object type, eol_type;
2301
2302 /* At first, set several fields to default values. */
2303 coding->require_flushing = 0;
2304 coding->last_block = 0;
2305 coding->selective = 0;
2306 coding->composing = 0;
2307 coding->direction = 0;
2308 coding->carryover_size = 0;
2309 coding->post_read_conversion = coding->pre_write_conversion = Qnil;
2310 coding->character_unification_table_for_decode = Qnil;
2311 coding->character_unification_table_for_encode = Qnil;
2312
2313 Vlast_coding_system_used = coding->symbol = coding_system;
2314 eol_type = Qnil;
2315 /* Get value of property `coding-system' until we get a vector.
2316 While doing that, also get values of properties
2317 `post-read-conversion', `pre-write-conversion',
2318 `character-unification-table-for-decode',
2319 `character-unification-table-for-encode' and `eol-type'. */
2320 while (!NILP (coding_system) && SYMBOLP (coding_system))
2321 {
2322 if (NILP (coding->post_read_conversion))
2323 coding->post_read_conversion = Fget (coding_system,
2324 Qpost_read_conversion);
2325 if (NILP (coding->pre_write_conversion))
2326 coding->pre_write_conversion = Fget (coding_system,
2327 Qpre_write_conversion);
2328 if (!inhibit_eol_conversion && NILP (eol_type))
2329 eol_type = Fget (coding_system, Qeol_type);
2330
2331 if (NILP (coding->character_unification_table_for_decode))
2332 coding->character_unification_table_for_decode
2333 = Fget (coding_system, Qcharacter_unification_table_for_decode);
2334
2335 if (NILP (coding->character_unification_table_for_encode))
2336 coding->character_unification_table_for_encode
2337 = Fget (coding_system, Qcharacter_unification_table_for_encode);
2338
2339 coding_system = Fget (coding_system, Qcoding_system);
2340 }
2341
2342 while (!NILP (coding->character_unification_table_for_decode)
2343 && SYMBOLP (coding->character_unification_table_for_decode))
2344 coding->character_unification_table_for_decode
2345 = Fget (coding->character_unification_table_for_decode,
2346 Qcharacter_unification_table_for_decode);
2347 if (!NILP (coding->character_unification_table_for_decode)
2348 && !CHAR_TABLE_P (coding->character_unification_table_for_decode))
2349 coding->character_unification_table_for_decode = Qnil;
2350
2351 while (!NILP (coding->character_unification_table_for_encode)
2352 && SYMBOLP (coding->character_unification_table_for_encode))
2353 coding->character_unification_table_for_encode
2354 = Fget (coding->character_unification_table_for_encode,
2355 Qcharacter_unification_table_for_encode);
2356 if (!NILP (coding->character_unification_table_for_encode)
2357 && !CHAR_TABLE_P (coding->character_unification_table_for_encode))
2358 coding->character_unification_table_for_encode = Qnil;
2359
2360 if (!VECTORP (coding_system)
2361 || XVECTOR (coding_system)->size != 5)
2362 goto label_invalid_coding_system;
2363
2364 if (VECTORP (eol_type))
2365 coding->eol_type = CODING_EOL_UNDECIDED;
2366 else if (XFASTINT (eol_type) == 1)
2367 coding->eol_type = CODING_EOL_CRLF;
2368 else if (XFASTINT (eol_type) == 2)
2369 coding->eol_type = CODING_EOL_CR;
2370 else
2371 coding->eol_type = CODING_EOL_LF;
2372
2373 type = XVECTOR (coding_system)->contents[0];
2374 switch (XFASTINT (type))
2375 {
2376 case 0:
2377 coding->type = coding_type_emacs_mule;
2378 break;
2379
2380 case 1:
2381 coding->type = coding_type_sjis;
2382 break;
2383
2384 case 2:
2385 coding->type = coding_type_iso2022;
2386 {
2387 Lisp_Object val;
2388 Lisp_Object *flags;
2389 int i, charset, default_reg_bits = 0;
2390
2391 val = XVECTOR (coding_system)->contents[4];
2392
2393 if (!VECTORP (val) || XVECTOR (val)->size != 32)
2394 goto label_invalid_coding_system;
2395
2396 flags = XVECTOR (val)->contents;
2397 coding->flags
2398 = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
2399 | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
2400 | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
2401 | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
2402 | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
2403 | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
2404 | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
2405 | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
2406 | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION)
2407 | (NILP (flags[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL)
2408 | (NILP (flags[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL)
2409 | (NILP (flags[15]) ? 0 : CODING_FLAG_ISO_SAFE)
2410 | (NILP (flags[16]) ? 0 : CODING_FLAG_ISO_LATIN_EXTRA)
2411 );
2412
2413 /* Invoke graphic register 0 to plane 0. */
2414 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
2415 /* Invoke graphic register 1 to plane 1 if we can use full 8-bit. */
2416 CODING_SPEC_ISO_INVOCATION (coding, 1)
2417 = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
2418 /* Not single shifting at first. */
2419 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;
2420 /* Beginning of buffer should also be regarded as bol. */
2421 CODING_SPEC_ISO_BOL (coding) = 1;
2422
2423 /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
2424 FLAGS[REG] can be one of below:
2425 integer CHARSET: CHARSET occupies register I,
2426 t: designate nothing to REG initially, but can be used
2427 by any charsets,
2428 list of integer, nil, or t: designate the first
2429 element (if integer) to REG initially, the remaining
2430 elements (if integer) is designated to REG on request,
2431 if an element is t, REG can be used by any charset,
2432 nil: REG is never used. */
2433 for (charset = 0; charset <= MAX_CHARSET; charset++)
2434 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2435 = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION;
2436 bzero (CODING_SPEC_ISO_EXPECTED_CHARSETS (coding), MAX_CHARSET + 1);
2437 for (i = 0; i < 4; i++)
2438 {
2439 if (INTEGERP (flags[i])
2440 && (charset = XINT (flags[i]), CHARSET_VALID_P (charset))
2441 || (charset = get_charset_id (flags[i])) >= 0)
2442 {
2443 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
2444 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
2445 CODING_SPEC_ISO_EXPECTED_CHARSETS (coding)[charset] = 1;
2446 }
2447 else if (EQ (flags[i], Qt))
2448 {
2449 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2450 default_reg_bits |= 1 << i;
2451 }
2452 else if (CONSP (flags[i]))
2453 {
2454 Lisp_Object tail = flags[i];
2455
2456 if (INTEGERP (XCONS (tail)->car)
2457 && (charset = XINT (XCONS (tail)->car),
2458 CHARSET_VALID_P (charset))
2459 || (charset = get_charset_id (XCONS (tail)->car)) >= 0)
2460 {
2461 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
2462 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
2463 CODING_SPEC_ISO_EXPECTED_CHARSETS (coding)[charset] = 1;
2464 }
2465 else
2466 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2467 tail = XCONS (tail)->cdr;
2468 while (CONSP (tail))
2469 {
2470 if (INTEGERP (XCONS (tail)->car)
2471 && (charset = XINT (XCONS (tail)->car),
2472 CHARSET_VALID_P (charset))
2473 || (charset = get_charset_id (XCONS (tail)->car)) >= 0)
2474 {
2475 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2476 = i;
2477 CODING_SPEC_ISO_EXPECTED_CHARSETS (coding)[charset]
2478 = 1;
2479 }
2480 else if (EQ (XCONS (tail)->car, Qt))
2481 default_reg_bits |= 1 << i;
2482 tail = XCONS (tail)->cdr;
2483 }
2484 }
2485 else
2486 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2487
2488 CODING_SPEC_ISO_DESIGNATION (coding, i)
2489 = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
2490 }
2491
2492 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
2493 {
2494 /* REG 1 can be used only by locking shift in 7-bit env. */
2495 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
2496 default_reg_bits &= ~2;
2497 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
2498 /* Without any shifting, only REG 0 and 1 can be used. */
2499 default_reg_bits &= 3;
2500 }
2501
2502 for (charset = 0; charset <= MAX_CHARSET; charset++)
2503 if (CHARSET_VALID_P (charset)
2504 && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2505 == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
2506 {
2507 /* We have not yet decided where to designate CHARSET. */
2508 int reg_bits = default_reg_bits;
2509
2510 if (CHARSET_CHARS (charset) == 96)
2511 /* A charset of CHARS96 can't be designated to REG 0. */
2512 reg_bits &= ~1;
2513
2514 if (reg_bits)
2515 /* There exist some default graphic register. */
2516 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2517 = (reg_bits & 1
2518 ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
2519 else
2520 /* We anyway have to designate CHARSET to somewhere. */
2521 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2522 = (CHARSET_CHARS (charset) == 94
2523 ? 0
2524 : ((coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT
2525 || ! coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
2526 ? 1
2527 : (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT
2528 ? 2 : 0)));
2529 }
2530 }
2531 coding->require_flushing = 1;
2532 break;
2533
2534 case 3:
2535 coding->type = coding_type_big5;
2536 coding->flags
2537 = (NILP (XVECTOR (coding_system)->contents[4])
2538 ? CODING_FLAG_BIG5_HKU
2539 : CODING_FLAG_BIG5_ETEN);
2540 break;
2541
2542 case 4:
2543 coding->type = coding_type_ccl;
2544 {
2545 Lisp_Object val = XVECTOR (coding_system)->contents[4];
2546 if (CONSP (val)
2547 && VECTORP (XCONS (val)->car)
2548 && VECTORP (XCONS (val)->cdr))
2549 {
2550 setup_ccl_program (&(coding->spec.ccl.decoder), XCONS (val)->car);
2551 setup_ccl_program (&(coding->spec.ccl.encoder), XCONS (val)->cdr);
2552 }
2553 else
2554 goto label_invalid_coding_system;
2555 }
2556 coding->require_flushing = 1;
2557 break;
2558
2559 case 5:
2560 coding->type = coding_type_raw_text;
2561 break;
2562
2563 default:
2564 if (EQ (type, Qt))
2565 coding->type = coding_type_undecided;
2566 else
2567 coding->type = coding_type_no_conversion;
2568 break;
2569 }
2570 return 0;
2571
2572 label_invalid_coding_system:
2573 coding->type = coding_type_no_conversion;
2574 coding->eol_type = CODING_EOL_LF;
2575 coding->symbol = coding->pre_write_conversion = coding->post_read_conversion
2576 = Qnil;
2577 return -1;
2578 }
2579
2580 /* Emacs has a mechanism to automatically detect a coding system if it
2581 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
2582 it's impossible to distinguish some coding systems accurately
2583 because they use the same range of codes. So, at first, coding
2584 systems are categorized into 7, those are:
2585
2586 o coding-category-emacs-mule
2587
2588 The category for a coding system which has the same code range
2589 as Emacs' internal format. Assigned the coding-system (Lisp
2590 symbol) `emacs-mule' by default.
2591
2592 o coding-category-sjis
2593
2594 The category for a coding system which has the same code range
2595 as SJIS. Assigned the coding-system (Lisp
2596 symbol) `japanese-shift-jis' by default.
2597
2598 o coding-category-iso-7
2599
2600 The category for a coding system which has the same code range
2601 as ISO2022 of 7-bit environment. This doesn't use any locking
2602 shift and single shift functions. Assigned the coding-system
2603 (Lisp symbol) `iso-2022-7bit' by default.
2604
2605 o coding-category-iso-8-1
2606
2607 The category for a coding system which has the same code range
2608 as ISO2022 of 8-bit environment and graphic plane 1 used only
2609 for DIMENSION1 charset. This doesn't use any locking shift
2610 and single shift functions. Assigned the coding-system (Lisp
2611 symbol) `iso-latin-1' by default.
2612
2613 o coding-category-iso-8-2
2614
2615 The category for a coding system which has the same code range
2616 as ISO2022 of 8-bit environment and graphic plane 1 used only
2617 for DIMENSION2 charset. This doesn't use any locking shift
2618 and single shift functions. Assigned the coding-system (Lisp
2619 symbol) `japanese-iso-8bit' by default.
2620
2621 o coding-category-iso-7-else
2622
2623 The category for a coding system which has the same code range
2624 as ISO2022 of 7-bit environemnt but uses locking shift or
2625 single shift functions. Assigned the coding-system (Lisp
2626 symbol) `iso-2022-7bit-lock' by default.
2627
2628 o coding-category-iso-8-else
2629
2630 The category for a coding system which has the same code range
2631 as ISO2022 of 8-bit environemnt but uses locking shift or
2632 single shift functions. Assigned the coding-system (Lisp
2633 symbol) `iso-2022-8bit-ss2' by default.
2634
2635 o coding-category-big5
2636
2637 The category for a coding system which has the same code range
2638 as BIG5. Assigned the coding-system (Lisp symbol)
2639 `cn-big5' by default.
2640
2641 o coding-category-binary
2642
2643 The category for a coding system not categorized in any of the
2644 above. Assigned the coding-system (Lisp symbol)
2645 `no-conversion' by default.
2646
2647 Each of them is a Lisp symbol and the value is an actual
2648 `coding-system's (this is also a Lisp symbol) assigned by a user.
2649 What Emacs does actually is to detect a category of coding system.
2650 Then, it uses a `coding-system' assigned to it. If Emacs can't
2651 decide only one possible category, it selects a category of the
2652 highest priority. Priorities of categories are also specified by a
2653 user in a Lisp variable `coding-category-list'.
2654
2655 */
2656
2657 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
2658 If it detects possible coding systems, return an integer in which
2659 appropriate flag bits are set. Flag bits are defined by macros
2660 CODING_CATEGORY_MASK_XXX in `coding.h'. */
2661
2662 int
2663 detect_coding_mask (src, src_bytes)
2664 unsigned char *src;
2665 int src_bytes;
2666 {
2667 register unsigned char c;
2668 unsigned char *src_end = src + src_bytes;
2669 int mask;
2670
2671 /* At first, skip all ASCII characters and control characters except
2672 for three ISO2022 specific control characters. */
2673 label_loop_detect_coding:
2674 while (src < src_end)
2675 {
2676 c = *src;
2677 if (c >= 0x80
2678 || (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
2679 break;
2680 src++;
2681 }
2682
2683 if (src >= src_end)
2684 /* We found nothing other than ASCII. There's nothing to do. */
2685 return CODING_CATEGORY_MASK_ANY;
2686
2687 /* The text seems to be encoded in some multilingual coding system.
2688 Now, try to find in which coding system the text is encoded. */
2689 if (c < 0x80)
2690 {
2691 /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
2692 /* C is an ISO2022 specific control code of C0. */
2693 mask = detect_coding_iso2022 (src, src_end);
2694 src++;
2695 if (mask == 0)
2696 /* No valid ISO2022 code follows C. Try again. */
2697 goto label_loop_detect_coding;
2698 mask |= CODING_CATEGORY_MASK_RAW_TEXT;
2699 }
2700 else if (c < 0xA0)
2701 {
2702 /* If C is a special latin extra code,
2703 or is an ISO2022 specific control code of C1 (SS2 or SS3),
2704 or is an ISO2022 control-sequence-introducer (CSI),
2705 we should also consider the possibility of ISO2022 codings. */
2706 if ((VECTORP (Vlatin_extra_code_table)
2707 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
2708 || (c == ISO_CODE_SS2 || c == ISO_CODE_SS3)
2709 || (c == ISO_CODE_CSI
2710 && (src < src_end
2711 && (*src == ']'
2712 || (src + 1 < src_end
2713 && src[1] == ']'
2714 && (*src == '0' || *src == '1' || *src == '2'))))))
2715 mask = (detect_coding_iso2022 (src, src_end)
2716 | detect_coding_sjis (src, src_end)
2717 | detect_coding_emacs_mule (src, src_end)
2718 | CODING_CATEGORY_MASK_RAW_TEXT);
2719
2720 else
2721 /* C is the first byte of SJIS character code,
2722 or a leading-code of Emacs' internal format (emacs-mule). */
2723 mask = (detect_coding_sjis (src, src_end)
2724 | detect_coding_emacs_mule (src, src_end)
2725 | CODING_CATEGORY_MASK_RAW_TEXT);
2726 }
2727 else
2728 /* C is a character of ISO2022 in graphic plane right,
2729 or a SJIS's 1-byte character code (i.e. JISX0201),
2730 or the first byte of BIG5's 2-byte code. */
2731 mask = (detect_coding_iso2022 (src, src_end)
2732 | detect_coding_sjis (src, src_end)
2733 | detect_coding_big5 (src, src_end)
2734 | CODING_CATEGORY_MASK_RAW_TEXT);
2735
2736 return mask;
2737 }
2738
2739 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
2740 The information of the detected coding system is set in CODING. */
2741
2742 void
2743 detect_coding (coding, src, src_bytes)
2744 struct coding_system *coding;
2745 unsigned char *src;
2746 int src_bytes;
2747 {
2748 int mask = detect_coding_mask (src, src_bytes);
2749 int idx;
2750 Lisp_Object val = Vcoding_category_list;
2751
2752 if (mask == CODING_CATEGORY_MASK_ANY)
2753 /* We found nothing other than ASCII. There's nothing to do. */
2754 return;
2755
2756 /* We found some plausible coding systems. Let's use a coding
2757 system of the highest priority. */
2758
2759 if (CONSP (val))
2760 while (!NILP (val))
2761 {
2762 idx = XFASTINT (Fget (XCONS (val)->car, Qcoding_category_index));
2763 if ((idx < CODING_CATEGORY_IDX_MAX) && (mask & (1 << idx)))
2764 break;
2765 val = XCONS (val)->cdr;
2766 }
2767 else
2768 val = Qnil;
2769
2770 if (NILP (val))
2771 {
2772 /* For unknown reason, `Vcoding_category_list' contains none of
2773 found categories. Let's use any of them. */
2774 for (idx = 0; idx < CODING_CATEGORY_IDX_MAX; idx++)
2775 if (mask & (1 << idx))
2776 break;
2777 }
2778 setup_coding_system (XSYMBOL (coding_category_table[idx])->value, coding);
2779 }
2780
2781 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
2782 is encoded. Return one of CODING_EOL_LF, CODING_EOL_CRLF,
2783 CODING_EOL_CR, and CODING_EOL_UNDECIDED. */
2784
2785 #define MAX_EOL_CHECK_COUNT 3
2786
2787 int
2788 detect_eol_type (src, src_bytes)
2789 unsigned char *src;
2790 int src_bytes;
2791 {
2792 unsigned char *src_end = src + src_bytes;
2793 unsigned char c;
2794 int total = 0; /* How many end-of-lines are found so far. */
2795 int eol_type = CODING_EOL_UNDECIDED;
2796 int this_eol_type;
2797
2798 while (src < src_end && total < MAX_EOL_CHECK_COUNT)
2799 {
2800 c = *src++;
2801 if (c == '\n' || c == '\r')
2802 {
2803 total++;
2804 if (c == '\n')
2805 this_eol_type = CODING_EOL_LF;
2806 else if (src >= src_end || *src != '\n')
2807 this_eol_type = CODING_EOL_CR;
2808 else
2809 this_eol_type = CODING_EOL_CRLF, src++;
2810
2811 if (eol_type == CODING_EOL_UNDECIDED)
2812 /* This is the first end-of-line. */
2813 eol_type = this_eol_type;
2814 else if (eol_type != this_eol_type)
2815 /* The found type is different from what found before.
2816 Let's notice the caller about this inconsistency. */
2817 return CODING_EOL_INCONSISTENT;
2818 }
2819 }
2820
2821 return eol_type;
2822 }
2823
2824 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
2825 is encoded. If it detects an appropriate format of end-of-line, it
2826 sets the information in *CODING. */
2827
2828 void
2829 detect_eol (coding, src, src_bytes)
2830 struct coding_system *coding;
2831 unsigned char *src;
2832 int src_bytes;
2833 {
2834 Lisp_Object val, coding_system;
2835 int eol_type = detect_eol_type (src, src_bytes);
2836
2837 if (eol_type == CODING_EOL_UNDECIDED)
2838 /* We found no end-of-line in the source text. */
2839 return;
2840
2841 if (eol_type == CODING_EOL_INCONSISTENT)
2842 {
2843 #if 0
2844 /* This code is suppressed until we find a better way to
2845 distinguish raw text file and binary file. */
2846
2847 /* If we have already detected that the coding is raw-text, the
2848 coding should actually be no-conversion. */
2849 if (coding->type == coding_type_raw_text)
2850 {
2851 setup_coding_system (Qno_conversion, coding);
2852 return;
2853 }
2854 /* Else, let's decode only text code anyway. */
2855 #endif /* 0 */
2856 eol_type = CODING_EOL_LF;
2857 }
2858
2859 coding_system = coding->symbol;
2860 while (!NILP (coding_system)
2861 && NILP (val = Fget (coding_system, Qeol_type)))
2862 coding_system = Fget (coding_system, Qcoding_system);
2863 if (VECTORP (val) && XVECTOR (val)->size == 3)
2864 setup_coding_system (XVECTOR (val)->contents[eol_type], coding);
2865 }
2866
2867 /* See "GENERAL NOTES about `decode_coding_XXX ()' functions". Before
2868 decoding, it may detect coding system and format of end-of-line if
2869 those are not yet decided. */
2870
2871 int
2872 decode_coding (coding, source, destination, src_bytes, dst_bytes, consumed)
2873 struct coding_system *coding;
2874 unsigned char *source, *destination;
2875 int src_bytes, dst_bytes;
2876 int *consumed;
2877 {
2878 int produced;
2879
2880 if (src_bytes <= 0)
2881 {
2882 *consumed = 0;
2883 return 0;
2884 }
2885
2886 if (coding->type == coding_type_undecided)
2887 detect_coding (coding, source, src_bytes);
2888
2889 if (coding->eol_type == CODING_EOL_UNDECIDED)
2890 detect_eol (coding, source, src_bytes);
2891
2892 coding->carryover_size = 0;
2893 switch (coding->type)
2894 {
2895 case coding_type_no_conversion:
2896 label_no_conversion:
2897 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2898 bcopy (source, destination, produced);
2899 *consumed = produced;
2900 break;
2901
2902 case coding_type_emacs_mule:
2903 case coding_type_undecided:
2904 case coding_type_raw_text:
2905 if (coding->eol_type == CODING_EOL_LF
2906 || coding->eol_type == CODING_EOL_UNDECIDED)
2907 goto label_no_conversion;
2908 produced = decode_eol (coding, source, destination,
2909 src_bytes, dst_bytes, consumed);
2910 break;
2911
2912 case coding_type_sjis:
2913 produced = decode_coding_sjis_big5 (coding, source, destination,
2914 src_bytes, dst_bytes, consumed,
2915 1);
2916 break;
2917
2918 case coding_type_iso2022:
2919 produced = decode_coding_iso2022 (coding, source, destination,
2920 src_bytes, dst_bytes, consumed);
2921 break;
2922
2923 case coding_type_big5:
2924 produced = decode_coding_sjis_big5 (coding, source, destination,
2925 src_bytes, dst_bytes, consumed,
2926 0);
2927 break;
2928
2929 case coding_type_ccl:
2930 produced = ccl_driver (&coding->spec.ccl.decoder, source, destination,
2931 src_bytes, dst_bytes, consumed);
2932 break;
2933 }
2934
2935 return produced;
2936 }
2937
2938 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions". */
2939
2940 int
2941 encode_coding (coding, source, destination, src_bytes, dst_bytes, consumed)
2942 struct coding_system *coding;
2943 unsigned char *source, *destination;
2944 int src_bytes, dst_bytes;
2945 int *consumed;
2946 {
2947 int produced;
2948
2949 switch (coding->type)
2950 {
2951 case coding_type_no_conversion:
2952 label_no_conversion:
2953 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2954 if (produced > 0)
2955 {
2956 bcopy (source, destination, produced);
2957 if (coding->selective)
2958 {
2959 unsigned char *p = destination, *pend = destination + produced;
2960 while (p < pend)
2961 if (*p++ == '\015') p[-1] = '\n';
2962 }
2963 }
2964 *consumed = produced;
2965 break;
2966
2967 case coding_type_emacs_mule:
2968 case coding_type_undecided:
2969 case coding_type_raw_text:
2970 if (coding->eol_type == CODING_EOL_LF
2971 || coding->eol_type == CODING_EOL_UNDECIDED)
2972 goto label_no_conversion;
2973 produced = encode_eol (coding, source, destination,
2974 src_bytes, dst_bytes, consumed);
2975 break;
2976
2977 case coding_type_sjis:
2978 produced = encode_coding_sjis_big5 (coding, source, destination,
2979 src_bytes, dst_bytes, consumed,
2980 1);
2981 break;
2982
2983 case coding_type_iso2022:
2984 produced = encode_coding_iso2022 (coding, source, destination,
2985 src_bytes, dst_bytes, consumed);
2986 break;
2987
2988 case coding_type_big5:
2989 produced = encode_coding_sjis_big5 (coding, source, destination,
2990 src_bytes, dst_bytes, consumed,
2991 0);
2992 break;
2993
2994 case coding_type_ccl:
2995 produced = ccl_driver (&coding->spec.ccl.encoder, source, destination,
2996 src_bytes, dst_bytes, consumed);
2997 break;
2998 }
2999
3000 return produced;
3001 }
3002
3003 #define CONVERSION_BUFFER_EXTRA_ROOM 256
3004
3005 /* Return maximum size (bytes) of a buffer enough for decoding
3006 SRC_BYTES of text encoded in CODING. */
3007
3008 int
3009 decoding_buffer_size (coding, src_bytes)
3010 struct coding_system *coding;
3011 int src_bytes;
3012 {
3013 int magnification;
3014
3015 if (coding->type == coding_type_iso2022)
3016 magnification = 3;
3017 else if (coding->type == coding_type_ccl)
3018 magnification = coding->spec.ccl.decoder.buf_magnification;
3019 else
3020 magnification = 2;
3021
3022 return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
3023 }
3024
3025 /* Return maximum size (bytes) of a buffer enough for encoding
3026 SRC_BYTES of text to CODING. */
3027
3028 int
3029 encoding_buffer_size (coding, src_bytes)
3030 struct coding_system *coding;
3031 int src_bytes;
3032 {
3033 int magnification;
3034
3035 if (coding->type == coding_type_ccl)
3036 magnification = coding->spec.ccl.encoder.buf_magnification;
3037 else
3038 magnification = 3;
3039
3040 return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
3041 }
3042
3043 #ifndef MINIMUM_CONVERSION_BUFFER_SIZE
3044 #define MINIMUM_CONVERSION_BUFFER_SIZE 1024
3045 #endif
3046
3047 char *conversion_buffer;
3048 int conversion_buffer_size;
3049
3050 /* Return a pointer to a SIZE bytes of buffer to be used for encoding
3051 or decoding. Sufficient memory is allocated automatically. If we
3052 run out of memory, return NULL. */
3053
3054 char *
3055 get_conversion_buffer (size)
3056 int size;
3057 {
3058 if (size > conversion_buffer_size)
3059 {
3060 char *buf;
3061 int real_size = conversion_buffer_size * 2;
3062
3063 while (real_size < size) real_size *= 2;
3064 buf = (char *) xmalloc (real_size);
3065 xfree (conversion_buffer);
3066 conversion_buffer = buf;
3067 conversion_buffer_size = real_size;
3068 }
3069 return conversion_buffer;
3070 }
3071
3072 \f
3073 #ifdef emacs
3074 /*** 7. Emacs Lisp library functions ***/
3075
3076 DEFUN ("coding-system-spec", Fcoding_system_spec, Scoding_system_spec,
3077 1, 1, 0,
3078 "Return coding-spec of CODING-SYSTEM.\n\
3079 If CODING-SYSTEM is not a valid coding-system, return nil.")
3080 (obj)
3081 Lisp_Object obj;
3082 {
3083 while (SYMBOLP (obj) && !NILP (obj))
3084 obj = Fget (obj, Qcoding_system);
3085 return ((NILP (obj) || !VECTORP (obj) || XVECTOR (obj)->size != 5)
3086 ? Qnil : obj);
3087 }
3088
3089 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
3090 "Return t if OBJECT is nil or a coding-system.\n\
3091 See document of make-coding-system for coding-system object.")
3092 (obj)
3093 Lisp_Object obj;
3094 {
3095 return ((NILP (obj) || !NILP (Fcoding_system_spec (obj))) ? Qt : Qnil);
3096 }
3097
3098 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
3099 Sread_non_nil_coding_system, 1, 1, 0,
3100 "Read a coding system from the minibuffer, prompting with string PROMPT.")
3101 (prompt)
3102 Lisp_Object prompt;
3103 {
3104 Lisp_Object val;
3105 do
3106 {
3107 val = Fcompleting_read (prompt, Vobarray, Qcoding_system_spec,
3108 Qt, Qnil, Qnil, Qnil, Qnil);
3109 }
3110 while (XSTRING (val)->size == 0);
3111 return (Fintern (val, Qnil));
3112 }
3113
3114 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 1, 0,
3115 "Read a coding system or nil from the minibuffer, prompting with string PROMPT.")
3116 (prompt)
3117 Lisp_Object prompt;
3118 {
3119 Lisp_Object val;
3120 val = Fcompleting_read (prompt, Vobarray, Qcoding_system_p,
3121 Qt, Qnil, Qnil, Qnil, Qnil);
3122 return (XSTRING (val)->size == 0 ? Qnil : Fintern (val, Qnil));
3123 }
3124
3125 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
3126 1, 1, 0,
3127 "Check validity of CODING-SYSTEM.\n\
3128 If valid, return CODING-SYSTEM, else `coding-system-error' is signaled.\n\
3129 CODING-SYSTEM is valid if it is a symbol and has \"coding-system\" property.\n\
3130 The value of property should be a vector of length 5.")
3131 (coding_system)
3132 Lisp_Object coding_system;
3133 {
3134 CHECK_SYMBOL (coding_system, 0);
3135 if (!NILP (Fcoding_system_p (coding_system)))
3136 return coding_system;
3137 while (1)
3138 Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
3139 }
3140
3141 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
3142 2, 2, 0,
3143 "Detect coding system of the text in the region between START and END.\n\
3144 Return a list of possible coding systems ordered by priority.\n\
3145 If only ASCII characters are found, it returns `undecided'\n\
3146 or its subsidiary coding system according to a detected end-of-line format.")
3147 (b, e)
3148 Lisp_Object b, e;
3149 {
3150 int coding_mask, eol_type;
3151 Lisp_Object val;
3152 int beg, end;
3153
3154 validate_region (&b, &e);
3155 beg = XINT (b), end = XINT (e);
3156 if (beg < GPT && end >= GPT) move_gap (end);
3157
3158 coding_mask = detect_coding_mask (POS_ADDR (beg), end - beg);
3159 eol_type = detect_eol_type (POS_ADDR (beg), end - beg);
3160
3161 if (coding_mask == CODING_CATEGORY_MASK_ANY)
3162 {
3163 val = Qundecided;
3164 if (eol_type != CODING_EOL_UNDECIDED
3165 && eol_type != CODING_EOL_INCONSISTENT)
3166 {
3167 Lisp_Object val2;
3168 val2 = Fget (Qundecided, Qeol_type);
3169 if (VECTORP (val2))
3170 val = XVECTOR (val2)->contents[eol_type];
3171 }
3172 }
3173 else
3174 {
3175 Lisp_Object val2;
3176
3177 /* At first, gather possible coding-systems in VAL in a reverse
3178 order. */
3179 val = Qnil;
3180 for (val2 = Vcoding_category_list;
3181 !NILP (val2);
3182 val2 = XCONS (val2)->cdr)
3183 {
3184 int idx
3185 = XFASTINT (Fget (XCONS (val2)->car, Qcoding_category_index));
3186 if (coding_mask & (1 << idx))
3187 {
3188 #if 0
3189 /* This code is suppressed until we find a better way to
3190 distinguish raw text file and binary file. */
3191
3192 if (idx == CODING_CATEGORY_IDX_RAW_TEXT
3193 && eol_type == CODING_EOL_INCONSISTENT)
3194 val = Fcons (Qno_conversion, val);
3195 else
3196 #endif /* 0 */
3197 val = Fcons (Fsymbol_value (XCONS (val2)->car), val);
3198 }
3199 }
3200
3201 /* Then, change the order of the list, while getting subsidiary
3202 coding-systems. */
3203 val2 = val;
3204 val = Qnil;
3205 if (eol_type == CODING_EOL_INCONSISTENT)
3206 eol_type == CODING_EOL_UNDECIDED;
3207 for (; !NILP (val2); val2 = XCONS (val2)->cdr)
3208 {
3209 if (eol_type == CODING_EOL_UNDECIDED)
3210 val = Fcons (XCONS (val2)->car, val);
3211 else
3212 {
3213 Lisp_Object val3;
3214 val3 = Fget (XCONS (val2)->car, Qeol_type);
3215 if (VECTORP (val3))
3216 val = Fcons (XVECTOR (val3)->contents[eol_type], val);
3217 else
3218 val = Fcons (XCONS (val2)->car, val);
3219 }
3220 }
3221 }
3222
3223 return val;
3224 }
3225
3226 /* Scan text in the region between *BEGP and *ENDP, skip characters
3227 which we never have to encode to (iff ENCODEP is 1) or decode from
3228 coding system CODING at the head and tail, then set BEGP and ENDP
3229 to the addresses of start and end of the text we actually convert. */
3230
3231 void
3232 shrink_conversion_area (begp, endp, coding, encodep)
3233 unsigned char **begp, **endp;
3234 struct coding_system *coding;
3235 int encodep;
3236 {
3237 register unsigned char *beg_addr = *begp, *end_addr = *endp;
3238
3239 if (coding->eol_type != CODING_EOL_LF
3240 && coding->eol_type != CODING_EOL_UNDECIDED)
3241 /* Since we anyway have to convert end-of-line format, it is not
3242 worth skipping at most 100 bytes or so. */
3243 return;
3244
3245 if (encodep) /* for encoding */
3246 {
3247 switch (coding->type)
3248 {
3249 case coding_type_no_conversion:
3250 case coding_type_emacs_mule:
3251 case coding_type_undecided:
3252 case coding_type_raw_text:
3253 /* We need no conversion. */
3254 *begp = *endp;
3255 return;
3256 case coding_type_ccl:
3257 /* We can't skip any data. */
3258 return;
3259 case coding_type_iso2022:
3260 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL)
3261 {
3262 unsigned char *bol = beg_addr;
3263 while (beg_addr < end_addr && *beg_addr < 0x80)
3264 {
3265 beg_addr++;
3266 if (*(beg_addr - 1) == '\n')
3267 bol = beg_addr;
3268 }
3269 beg_addr = bol;
3270 goto label_skip_tail;
3271 }
3272 /* fall down ... */
3273 default:
3274 /* We can skip all ASCII characters at the head and tail. */
3275 while (beg_addr < end_addr && *beg_addr < 0x80) beg_addr++;
3276 label_skip_tail:
3277 while (beg_addr < end_addr && *(end_addr - 1) < 0x80) end_addr--;
3278 break;
3279 }
3280 }
3281 else /* for decoding */
3282 {
3283 switch (coding->type)
3284 {
3285 case coding_type_no_conversion:
3286 /* We need no conversion. */
3287 *begp = *endp;
3288 return;
3289 case coding_type_emacs_mule:
3290 case coding_type_raw_text:
3291 if (coding->eol_type == CODING_EOL_LF)
3292 {
3293 /* We need no conversion. */
3294 *begp = *endp;
3295 return;
3296 }
3297 /* We can skip all but carriage-return. */
3298 while (beg_addr < end_addr && *beg_addr != '\r') beg_addr++;
3299 while (beg_addr < end_addr && *(end_addr - 1) != '\r') end_addr--;
3300 break;
3301 case coding_type_sjis:
3302 case coding_type_big5:
3303 /* We can skip all ASCII characters at the head. */
3304 while (beg_addr < end_addr && *beg_addr < 0x80) beg_addr++;
3305 /* We can skip all ASCII characters at the tail except for
3306 the second byte of SJIS or BIG5 code. */
3307 while (beg_addr < end_addr && *(end_addr - 1) < 0x80) end_addr--;
3308 if (end_addr != *endp)
3309 end_addr++;
3310 break;
3311 case coding_type_ccl:
3312 /* We can't skip any data. */
3313 return;
3314 default: /* i.e. case coding_type_iso2022: */
3315 {
3316 unsigned char c;
3317
3318 /* We can skip all ASCII characters except for a few
3319 control codes at the head. */
3320 while (beg_addr < end_addr && (c = *beg_addr) < 0x80
3321 && c != ISO_CODE_CR && c != ISO_CODE_SO
3322 && c != ISO_CODE_SI && c != ISO_CODE_ESC)
3323 beg_addr++;
3324 }
3325 break;
3326 }
3327 }
3328 *begp = beg_addr;
3329 *endp = end_addr;
3330 return;
3331 }
3332
3333 /* Encode to (iff ENCODEP is 1) or decode form coding system CODING a
3334 text between B and E. B and E are buffer position. */
3335
3336 Lisp_Object
3337 code_convert_region (b, e, coding, encodep)
3338 Lisp_Object b, e;
3339 struct coding_system *coding;
3340 int encodep;
3341 {
3342 int beg, end, len, consumed, produced;
3343 char *buf;
3344 unsigned char *begp, *endp;
3345 int pos = PT;
3346
3347 validate_region (&b, &e);
3348 beg = XINT (b), end = XINT (e);
3349 if (beg < GPT && end >= GPT)
3350 move_gap (end);
3351
3352 if (encodep && !NILP (coding->pre_write_conversion))
3353 {
3354 /* We must call a pre-conversion function which may put a new
3355 text to be converted in a new buffer. */
3356 struct buffer *old = current_buffer, *new;
3357
3358 TEMP_SET_PT (beg);
3359 call2 (coding->pre_write_conversion, b, e);
3360 if (old != current_buffer)
3361 {
3362 /* Replace the original text by the text just generated. */
3363 len = ZV - BEGV;
3364 new = current_buffer;
3365 set_buffer_internal (old);
3366 del_range (beg, end);
3367 insert_from_buffer (new, 1, len, 0);
3368 end = beg + len;
3369 }
3370 }
3371
3372 /* We may be able to shrink the conversion region. */
3373 begp = POS_ADDR (beg); endp = begp + (end - beg);
3374 shrink_conversion_area (&begp, &endp, coding, encodep);
3375
3376 if (begp == endp)
3377 /* We need no conversion. */
3378 len = end - beg;
3379 else
3380 {
3381 beg += begp - POS_ADDR (beg);
3382 end = beg + (endp - begp);
3383
3384 if (encodep)
3385 len = encoding_buffer_size (coding, end - beg);
3386 else
3387 len = decoding_buffer_size (coding, end - beg);
3388 buf = get_conversion_buffer (len);
3389
3390 coding->last_block = 1;
3391 produced = (encodep
3392 ? encode_coding (coding, POS_ADDR (beg), buf, end - beg, len,
3393 &consumed)
3394 : decode_coding (coding, POS_ADDR (beg), buf, end - beg, len,
3395 &consumed));
3396
3397 len = produced + (beg - XINT (b)) + (XINT (e) - end);
3398
3399 TEMP_SET_PT (beg);
3400 insert (buf, produced);
3401 del_range (PT, PT + end - beg);
3402 if (pos >= end)
3403 pos = PT + (pos - end);
3404 else if (pos > beg)
3405 pos = beg;
3406 TEMP_SET_PT (pos);
3407 }
3408
3409 if (!encodep && !NILP (coding->post_read_conversion))
3410 {
3411 /* We must call a post-conversion function which may alter
3412 the text just converted. */
3413 Lisp_Object insval;
3414
3415 beg = XINT (b);
3416 TEMP_SET_PT (beg);
3417 insval = call1 (coding->post_read_conversion, make_number (len));
3418 CHECK_NUMBER (insval, 0);
3419 len = XINT (insval);
3420 }
3421
3422 return make_number (len);
3423 }
3424
3425 Lisp_Object
3426 code_convert_string (str, coding, encodep, nocopy)
3427 Lisp_Object str, nocopy;
3428 struct coding_system *coding;
3429 int encodep;
3430 {
3431 int len, consumed, produced;
3432 char *buf;
3433 unsigned char *begp, *endp;
3434 int head_skip, tail_skip;
3435 struct gcpro gcpro1;
3436
3437 if (encodep && !NILP (coding->pre_write_conversion)
3438 || !encodep && !NILP (coding->post_read_conversion))
3439 {
3440 /* Since we have to call Lisp functions which assume target text
3441 is in a buffer, after setting a temporary buffer, call
3442 code_convert_region. */
3443 int count = specpdl_ptr - specpdl;
3444 int len = XSTRING (str)->size;
3445 Lisp_Object result;
3446 struct buffer *old = current_buffer;
3447
3448 record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
3449 temp_output_buffer_setup (" *code-converting-work*");
3450 set_buffer_internal (XBUFFER (Vstandard_output));
3451 insert_from_string (str, 0, len, 0);
3452 code_convert_region (make_number (BEGV), make_number (ZV),
3453 coding, encodep);
3454 result = make_buffer_string (BEGV, ZV, 0);
3455 set_buffer_internal (old);
3456 return unbind_to (count, result);
3457 }
3458
3459 /* We may be able to shrink the conversion region. */
3460 begp = XSTRING (str)->data;
3461 endp = begp + XSTRING (str)->size;
3462 shrink_conversion_area (&begp, &endp, coding, encodep);
3463
3464 if (begp == endp)
3465 /* We need no conversion. */
3466 return (NILP (nocopy) ? Fcopy_sequence (str) : str);
3467
3468 head_skip = begp - XSTRING (str)->data;
3469 tail_skip = XSTRING (str)->size - head_skip - (endp - begp);
3470
3471 GCPRO1 (str);
3472
3473 if (encodep)
3474 len = encoding_buffer_size (coding, endp - begp);
3475 else
3476 len = decoding_buffer_size (coding, endp - begp);
3477 buf = get_conversion_buffer (len + head_skip + tail_skip);
3478
3479 bcopy (XSTRING (str)->data, buf, head_skip);
3480 coding->last_block = 1;
3481 produced = (encodep
3482 ? encode_coding (coding, XSTRING (str)->data + head_skip,
3483 buf + head_skip, endp - begp, len, &consumed)
3484 : decode_coding (coding, XSTRING (str)->data + head_skip,
3485 buf + head_skip, endp - begp, len, &consumed));
3486 bcopy (XSTRING (str)->data + head_skip + (endp - begp),
3487 buf + head_skip + produced,
3488 tail_skip);
3489
3490 UNGCPRO;
3491
3492 return make_string (buf, head_skip + produced + tail_skip);
3493 }
3494
3495 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
3496 3, 3, "r\nzCoding system: ",
3497 "Decode current region by specified coding system.\n\
3498 When called from a program, takes three arguments:\n\
3499 START, END, and CODING-SYSTEM. START END are buffer positions.\n\
3500 Return length of decoded text.")
3501 (b, e, coding_system)
3502 Lisp_Object b, e, coding_system;
3503 {
3504 struct coding_system coding;
3505
3506 CHECK_NUMBER_COERCE_MARKER (b, 0);
3507 CHECK_NUMBER_COERCE_MARKER (e, 1);
3508 CHECK_SYMBOL (coding_system, 2);
3509
3510 if (NILP (coding_system))
3511 return make_number (XFASTINT (e) - XFASTINT (b));
3512 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3513 error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3514
3515 return code_convert_region (b, e, &coding, 0);
3516 }
3517
3518 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
3519 3, 3, "r\nzCoding system: ",
3520 "Encode current region by specified coding system.\n\
3521 When called from a program, takes three arguments:\n\
3522 START, END, and CODING-SYSTEM. START END are buffer positions.\n\
3523 Return length of encoded text.")
3524 (b, e, coding_system)
3525 Lisp_Object b, e, coding_system;
3526 {
3527 struct coding_system coding;
3528
3529 CHECK_NUMBER_COERCE_MARKER (b, 0);
3530 CHECK_NUMBER_COERCE_MARKER (e, 1);
3531 CHECK_SYMBOL (coding_system, 2);
3532
3533 if (NILP (coding_system))
3534 return make_number (XFASTINT (e) - XFASTINT (b));
3535 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3536 error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3537
3538 return code_convert_region (b, e, &coding, 1);
3539 }
3540
3541 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
3542 2, 3, 0,
3543 "Decode STRING which is encoded in CODING-SYSTEM, and return the result.\n\
3544 Optional arg NOCOPY non-nil means return STRING itself if there's no need\n\
3545 of decoding.")
3546 (string, coding_system, nocopy)
3547 Lisp_Object string, coding_system, nocopy;
3548 {
3549 struct coding_system coding;
3550
3551 CHECK_STRING (string, 0);
3552 CHECK_SYMBOL (coding_system, 1);
3553
3554 if (NILP (coding_system))
3555 return (NILP (nocopy) ? Fcopy_sequence (string) : string);
3556 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3557 error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3558
3559 return code_convert_string (string, &coding, 0, nocopy);
3560 }
3561
3562 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
3563 2, 3, 0,
3564 "Encode STRING to CODING-SYSTEM, and return the result.\n\
3565 Optional arg NOCOPY non-nil means return STRING itself if there's no need\n\
3566 of encoding.")
3567 (string, coding_system, nocopy)
3568 Lisp_Object string, coding_system, nocopy;
3569 {
3570 struct coding_system coding;
3571
3572 CHECK_STRING (string, 0);
3573 CHECK_SYMBOL (coding_system, 1);
3574
3575 if (NILP (coding_system))
3576 return (NILP (nocopy) ? Fcopy_sequence (string) : string);
3577 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3578 error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3579
3580 return code_convert_string (string, &coding, 1, nocopy);
3581 }
3582
3583 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
3584 "Decode a JISX0208 character of shift-jis encoding.\n\
3585 CODE is the character code in SJIS.\n\
3586 Return the corresponding character.")
3587 (code)
3588 Lisp_Object code;
3589 {
3590 unsigned char c1, c2, s1, s2;
3591 Lisp_Object val;
3592
3593 CHECK_NUMBER (code, 0);
3594 s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
3595 DECODE_SJIS (s1, s2, c1, c2);
3596 XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset_jisx0208, c1, c2));
3597 return val;
3598 }
3599
3600 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
3601 "Encode a JISX0208 character CHAR to SJIS coding-system.\n\
3602 Return the corresponding character code in SJIS.")
3603 (ch)
3604 Lisp_Object ch;
3605 {
3606 int charset, c1, c2, s1, s2;
3607 Lisp_Object val;
3608
3609 CHECK_NUMBER (ch, 0);
3610 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
3611 if (charset == charset_jisx0208)
3612 {
3613 ENCODE_SJIS (c1, c2, s1, s2);
3614 XSETFASTINT (val, (s1 << 8) | s2);
3615 }
3616 else
3617 XSETFASTINT (val, 0);
3618 return val;
3619 }
3620
3621 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
3622 "Decode a Big5 character CODE of BIG5 coding-system.\n\
3623 CODE is the character code in BIG5.\n\
3624 Return the corresponding character.")
3625 (code)
3626 Lisp_Object code;
3627 {
3628 int charset;
3629 unsigned char b1, b2, c1, c2;
3630 Lisp_Object val;
3631
3632 CHECK_NUMBER (code, 0);
3633 b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF;
3634 DECODE_BIG5 (b1, b2, charset, c1, c2);
3635 XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset, c1, c2));
3636 return val;
3637 }
3638
3639 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
3640 "Encode the Big5 character CHAR to BIG5 coding-system.\n\
3641 Return the corresponding character code in Big5.")
3642 (ch)
3643 Lisp_Object ch;
3644 {
3645 int charset, c1, c2, b1, b2;
3646 Lisp_Object val;
3647
3648 CHECK_NUMBER (ch, 0);
3649 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
3650 if (charset == charset_big5_1 || charset == charset_big5_2)
3651 {
3652 ENCODE_BIG5 (charset, c1, c2, b1, b2);
3653 XSETFASTINT (val, (b1 << 8) | b2);
3654 }
3655 else
3656 XSETFASTINT (val, 0);
3657 return val;
3658 }
3659
3660 DEFUN ("set-terminal-coding-system-internal",
3661 Fset_terminal_coding_system_internal,
3662 Sset_terminal_coding_system_internal, 1, 1, 0, "")
3663 (coding_system)
3664 Lisp_Object coding_system;
3665 {
3666 CHECK_SYMBOL (coding_system, 0);
3667 setup_coding_system (Fcheck_coding_system (coding_system), &terminal_coding);
3668 /* We had better not send unexpected characters to terminal. */
3669 terminal_coding.flags |= CODING_FLAG_ISO_SAFE;
3670
3671 return Qnil;
3672 }
3673
3674 DEFUN ("set-safe-terminal-coding-system-internal",
3675 Fset_safe_terminal_coding_system_internal,
3676 Sset_safe_terminal_coding_system_internal, 1, 1, 0, "")
3677 (coding_system)
3678 Lisp_Object coding_system;
3679 {
3680 CHECK_SYMBOL (coding_system, 0);
3681 setup_coding_system (Fcheck_coding_system (coding_system),
3682 &safe_terminal_coding);
3683 return Qnil;
3684 }
3685
3686 DEFUN ("terminal-coding-system",
3687 Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
3688 "Return coding-system of your terminal.")
3689 ()
3690 {
3691 return terminal_coding.symbol;
3692 }
3693
3694 DEFUN ("set-keyboard-coding-system-internal",
3695 Fset_keyboard_coding_system_internal,
3696 Sset_keyboard_coding_system_internal, 1, 1, 0, "")
3697 (coding_system)
3698 Lisp_Object coding_system;
3699 {
3700 CHECK_SYMBOL (coding_system, 0);
3701 setup_coding_system (Fcheck_coding_system (coding_system), &keyboard_coding);
3702 return Qnil;
3703 }
3704
3705 DEFUN ("keyboard-coding-system",
3706 Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
3707 "Return coding-system of what is sent from terminal keyboard.")
3708 ()
3709 {
3710 return keyboard_coding.symbol;
3711 }
3712
3713 \f
3714 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
3715 Sfind_operation_coding_system, 1, MANY, 0,
3716 "Choose a coding system for an operation based on the target name.\n\
3717 The value names a pair of coding systems: (DECODING-SYSTEM ENCODING-SYSTEM).\n\
3718 DECODING-SYSTEM is the coding system to use for decoding\n\
3719 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system\n\
3720 for encoding (in case OPERATION does encoding).\n\
3721 \n\
3722 The first argument OPERATION specifies an I/O primitive:\n\
3723 For file I/O, `insert-file-contents' or `write-region'.\n\
3724 For process I/O, `call-process', `call-process-region', or `start-process'.\n\
3725 For network I/O, `open-network-stream'.\n\
3726 \n\
3727 The remaining arguments should be the same arguments that were passed\n\
3728 to the primitive. Depending on which primitive, one of those arguments\n\
3729 is selected as the TARGET. For example, if OPERATION does file I/O,\n\
3730 whichever argument specifies the file name is TARGET.\n\
3731 \n\
3732 TARGET has a meaning which depends on OPERATION:\n\
3733 For file I/O, TARGET is a file name.\n\
3734 For process I/O, TARGET is a process name.\n\
3735 For network I/O, TARGET is a service name or a port number\n\
3736 \n\
3737 This function looks up what specified for TARGET in,\n\
3738 `file-coding-system-alist', `process-coding-system-alist',\n\
3739 or `network-coding-system-alist' depending on OPERATION.\n\
3740 They may specify a coding system, a cons of coding systems,\n\
3741 or a function symbol to call.\n\
3742 In the last case, we call the function with one argument,\n\
3743 which is a list of all the arguments given to this function.")
3744 (nargs, args)
3745 int nargs;
3746 Lisp_Object *args;
3747 {
3748 Lisp_Object operation, target_idx, target, val;
3749 register Lisp_Object chain;
3750
3751 if (nargs < 2)
3752 error ("Too few arguments");
3753 operation = args[0];
3754 if (!SYMBOLP (operation)
3755 || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
3756 error ("Invalid first arguement");
3757 if (nargs < 1 + XINT (target_idx))
3758 error ("Too few arguments for operation: %s",
3759 XSYMBOL (operation)->name->data);
3760 target = args[XINT (target_idx) + 1];
3761 if (!(STRINGP (target)
3762 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
3763 error ("Invalid %dth argument", XINT (target_idx) + 1);
3764
3765 chain = ((EQ (operation, Qinsert_file_contents)
3766 || EQ (operation, Qwrite_region))
3767 ? Vfile_coding_system_alist
3768 : (EQ (operation, Qopen_network_stream)
3769 ? Vnetwork_coding_system_alist
3770 : Vprocess_coding_system_alist));
3771 if (NILP (chain))
3772 return Qnil;
3773
3774 for (; CONSP (chain); chain = XCONS (chain)->cdr)
3775 {
3776 Lisp_Object elt;
3777 elt = XCONS (chain)->car;
3778
3779 if (CONSP (elt)
3780 && ((STRINGP (target)
3781 && STRINGP (XCONS (elt)->car)
3782 && fast_string_match (XCONS (elt)->car, target) >= 0)
3783 || (INTEGERP (target) && EQ (target, XCONS (elt)->car))))
3784 {
3785 val = XCONS (elt)->cdr;
3786 if (CONSP (val))
3787 return val;
3788 if (! SYMBOLP (val))
3789 return Qnil;
3790 if (! NILP (Fcoding_system_p (val)))
3791 return Fcons (val, val);
3792 if (!NILP (Ffboundp (val)))
3793 return call1 (val, Flist (nargs, args));
3794 return Qnil;
3795 }
3796 }
3797 return Qnil;
3798 }
3799
3800 #endif /* emacs */
3801
3802 \f
3803 /*** 8. Post-amble ***/
3804
3805 init_coding_once ()
3806 {
3807 int i;
3808
3809 /* Emacs' internal format specific initialize routine. */
3810 for (i = 0; i <= 0x20; i++)
3811 emacs_code_class[i] = EMACS_control_code;
3812 emacs_code_class[0x0A] = EMACS_linefeed_code;
3813 emacs_code_class[0x0D] = EMACS_carriage_return_code;
3814 for (i = 0x21 ; i < 0x7F; i++)
3815 emacs_code_class[i] = EMACS_ascii_code;
3816 emacs_code_class[0x7F] = EMACS_control_code;
3817 emacs_code_class[0x80] = EMACS_leading_code_composition;
3818 for (i = 0x81; i < 0xFF; i++)
3819 emacs_code_class[i] = EMACS_invalid_code;
3820 emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
3821 emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
3822 emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
3823 emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;
3824
3825 /* ISO2022 specific initialize routine. */
3826 for (i = 0; i < 0x20; i++)
3827 iso_code_class[i] = ISO_control_code;
3828 for (i = 0x21; i < 0x7F; i++)
3829 iso_code_class[i] = ISO_graphic_plane_0;
3830 for (i = 0x80; i < 0xA0; i++)
3831 iso_code_class[i] = ISO_control_code;
3832 for (i = 0xA1; i < 0xFF; i++)
3833 iso_code_class[i] = ISO_graphic_plane_1;
3834 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
3835 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
3836 iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
3837 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
3838 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
3839 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
3840 iso_code_class[ISO_CODE_ESC] = ISO_escape;
3841 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
3842 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
3843 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
3844
3845 conversion_buffer_size = MINIMUM_CONVERSION_BUFFER_SIZE;
3846 conversion_buffer = (char *) xmalloc (MINIMUM_CONVERSION_BUFFER_SIZE);
3847
3848 setup_coding_system (Qnil, &keyboard_coding);
3849 setup_coding_system (Qnil, &terminal_coding);
3850 setup_coding_system (Qnil, &safe_terminal_coding);
3851
3852 #if defined (MSDOS) || defined (WINDOWSNT)
3853 system_eol_type = CODING_EOL_CRLF;
3854 #else
3855 system_eol_type = CODING_EOL_LF;
3856 #endif
3857 }
3858
3859 #ifdef emacs
3860
3861 syms_of_coding ()
3862 {
3863 Qtarget_idx = intern ("target-idx");
3864 staticpro (&Qtarget_idx);
3865
3866 /* Target FILENAME is the first argument. */
3867 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
3868 /* Target FILENAME is the third argument. */
3869 Fput (Qwrite_region, Qtarget_idx, make_number (2));
3870
3871 Qcall_process = intern ("call-process");
3872 staticpro (&Qcall_process);
3873 /* Target PROGRAM is the first argument. */
3874 Fput (Qcall_process, Qtarget_idx, make_number (0));
3875
3876 Qcall_process_region = intern ("call-process-region");
3877 staticpro (&Qcall_process_region);
3878 /* Target PROGRAM is the third argument. */
3879 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
3880
3881 Qstart_process = intern ("start-process");
3882 staticpro (&Qstart_process);
3883 /* Target PROGRAM is the third argument. */
3884 Fput (Qstart_process, Qtarget_idx, make_number (2));
3885
3886 Qopen_network_stream = intern ("open-network-stream");
3887 staticpro (&Qopen_network_stream);
3888 /* Target SERVICE is the fourth argument. */
3889 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
3890
3891 Qcoding_system = intern ("coding-system");
3892 staticpro (&Qcoding_system);
3893
3894 Qeol_type = intern ("eol-type");
3895 staticpro (&Qeol_type);
3896
3897 Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
3898 staticpro (&Qbuffer_file_coding_system);
3899
3900 Qpost_read_conversion = intern ("post-read-conversion");
3901 staticpro (&Qpost_read_conversion);
3902
3903 Qpre_write_conversion = intern ("pre-write-conversion");
3904 staticpro (&Qpre_write_conversion);
3905
3906 Qno_conversion = intern ("no-conversion");
3907 staticpro (&Qno_conversion);
3908
3909 Qundecided = intern ("undecided");
3910 staticpro (&Qundecided);
3911
3912 Qcoding_system_spec = intern ("coding-system-spec");
3913 staticpro (&Qcoding_system_spec);
3914
3915 Qcoding_system_p = intern ("coding-system-p");
3916 staticpro (&Qcoding_system_p);
3917
3918 Qcoding_system_error = intern ("coding-system-error");
3919 staticpro (&Qcoding_system_error);
3920
3921 Fput (Qcoding_system_error, Qerror_conditions,
3922 Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
3923 Fput (Qcoding_system_error, Qerror_message,
3924 build_string ("Invalid coding system"));
3925
3926 Qcoding_category_index = intern ("coding-category-index");
3927 staticpro (&Qcoding_category_index);
3928
3929 {
3930 int i;
3931 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3932 {
3933 coding_category_table[i] = intern (coding_category_name[i]);
3934 staticpro (&coding_category_table[i]);
3935 Fput (coding_category_table[i], Qcoding_category_index,
3936 make_number (i));
3937 }
3938 }
3939
3940 Qcharacter_unification_table = intern ("character-unification-table");
3941 staticpro (&Qcharacter_unification_table);
3942 Fput (Qcharacter_unification_table, Qchar_table_extra_slots,
3943 make_number (0));
3944
3945 Qcharacter_unification_table_for_decode
3946 = intern ("character-unification-table-for-decode");
3947 staticpro (&Qcharacter_unification_table_for_decode);
3948
3949 Qcharacter_unification_table_for_encode
3950 = intern ("character-unification-table-for-encode");
3951 staticpro (&Qcharacter_unification_table_for_encode);
3952
3953 Qemacs_mule = intern ("emacs-mule");
3954 staticpro (&Qemacs_mule);
3955
3956 defsubr (&Scoding_system_spec);
3957 defsubr (&Scoding_system_p);
3958 defsubr (&Sread_coding_system);
3959 defsubr (&Sread_non_nil_coding_system);
3960 defsubr (&Scheck_coding_system);
3961 defsubr (&Sdetect_coding_region);
3962 defsubr (&Sdecode_coding_region);
3963 defsubr (&Sencode_coding_region);
3964 defsubr (&Sdecode_coding_string);
3965 defsubr (&Sencode_coding_string);
3966 defsubr (&Sdecode_sjis_char);
3967 defsubr (&Sencode_sjis_char);
3968 defsubr (&Sdecode_big5_char);
3969 defsubr (&Sencode_big5_char);
3970 defsubr (&Sset_terminal_coding_system_internal);
3971 defsubr (&Sset_safe_terminal_coding_system_internal);
3972 defsubr (&Sterminal_coding_system);
3973 defsubr (&Sset_keyboard_coding_system_internal);
3974 defsubr (&Skeyboard_coding_system);
3975 defsubr (&Sfind_operation_coding_system);
3976
3977 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
3978 "List of coding-categories (symbols) ordered by priority.");
3979 {
3980 int i;
3981
3982 Vcoding_category_list = Qnil;
3983 for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
3984 Vcoding_category_list
3985 = Fcons (coding_category_table[i], Vcoding_category_list);
3986 }
3987
3988 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
3989 "Specify the coding system for read operations.\n\
3990 It is useful to bind this variable with `let', but do not set it globally.\n\
3991 If the value is a coding system, it is used for decoding on read operation.\n\
3992 If not, an appropriate element is used from one of the coding system alists:\n\
3993 There are three such tables, `file-coding-system-alist',\n\
3994 `process-coding-system-alist', and `network-coding-system-alist'.");
3995 Vcoding_system_for_read = Qnil;
3996
3997 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
3998 "Specify the coding system for write operations.\n\
3999 It is useful to bind this variable with `let', but do not set it globally.\n\
4000 If the value is a coding system, it is used for encoding on write operation.\n\
4001 If not, an appropriate element is used from one of the coding system alists:\n\
4002 There are three such tables, `file-coding-system-alist',\n\
4003 `process-coding-system-alist', and `network-coding-system-alist'.");
4004 Vcoding_system_for_write = Qnil;
4005
4006 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
4007 "Coding system used in the latest file or process I/O.");
4008 Vlast_coding_system_used = Qnil;
4009
4010 DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
4011 "*Non-nil inhibit code conversion of end-of-line format in any cases.");
4012 inhibit_eol_conversion = 0;
4013
4014 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
4015 "Alist to decide a coding system to use for a file I/O operation.\n\
4016 The format is ((PATTERN . VAL) ...),\n\
4017 where PATTERN is a regular expression matching a file name,\n\
4018 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
4019 If VAL is a coding system, it is used for both decoding and encoding\n\
4020 the file contents.\n\
4021 If VAL is a cons of coding systems, the car part is used for decoding,\n\
4022 and the cdr part is used for encoding.\n\
4023 If VAL is a function symbol, the function must return a coding system\n\
4024 or a cons of coding systems which are used as above.\n\
4025 \n\
4026 See also the function `find-operation-coding-system'.");
4027 Vfile_coding_system_alist = Qnil;
4028
4029 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
4030 "Alist to decide a coding system to use for a process I/O operation.\n\
4031 The format is ((PATTERN . VAL) ...),\n\
4032 where PATTERN is a regular expression matching a program name,\n\
4033 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
4034 If VAL is a coding system, it is used for both decoding what received\n\
4035 from the program and encoding what sent to the program.\n\
4036 If VAL is a cons of coding systems, the car part is used for decoding,\n\
4037 and the cdr part is used for encoding.\n\
4038 If VAL is a function symbol, the function must return a coding system\n\
4039 or a cons of coding systems which are used as above.\n\
4040 \n\
4041 See also the function `find-operation-coding-system'.");
4042 Vprocess_coding_system_alist = Qnil;
4043
4044 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
4045 "Alist to decide a coding system to use for a network I/O operation.\n\
4046 The format is ((PATTERN . VAL) ...),\n\
4047 where PATTERN is a regular expression matching a network service name\n\
4048 or is a port number to connect to,\n\
4049 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
4050 If VAL is a coding system, it is used for both decoding what received\n\
4051 from the network stream and encoding what sent to the network stream.\n\
4052 If VAL is a cons of coding systems, the car part is used for decoding,\n\
4053 and the cdr part is used for encoding.\n\
4054 If VAL is a function symbol, the function must return a coding system\n\
4055 or a cons of coding systems which are used as above.\n\
4056 \n\
4057 See also the function `find-operation-coding-system'.");
4058 Vnetwork_coding_system_alist = Qnil;
4059
4060 DEFVAR_INT ("eol-mnemonic-unix", &eol_mnemonic_unix,
4061 "Mnemonic character indicating UNIX-like end-of-line format (i.e. LF) .");
4062 eol_mnemonic_unix = ':';
4063
4064 DEFVAR_INT ("eol-mnemonic-dos", &eol_mnemonic_dos,
4065 "Mnemonic character indicating DOS-like end-of-line format (i.e. CRLF).");
4066 eol_mnemonic_dos = '\\';
4067
4068 DEFVAR_INT ("eol-mnemonic-mac", &eol_mnemonic_mac,
4069 "Mnemonic character indicating MAC-like end-of-line format (i.e. CR).");
4070 eol_mnemonic_mac = '/';
4071
4072 DEFVAR_INT ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
4073 "Mnemonic character indicating end-of-line format is not yet decided.");
4074 eol_mnemonic_undecided = ':';
4075
4076 DEFVAR_LISP ("enable-character-unification", &Venable_character_unification,
4077 "Non-nil means ISO 2022 encoder/decoder do character unification.");
4078 Venable_character_unification = Qt;
4079
4080 DEFVAR_LISP ("standard-character-unification-table-for-decode",
4081 &Vstandard_character_unification_table_for_decode,
4082 "Table for unifying characters when reading.");
4083 Vstandard_character_unification_table_for_decode = Qnil;
4084
4085 DEFVAR_LISP ("standard-character-unification-table-for-encode",
4086 &Vstandard_character_unification_table_for_encode,
4087 "Table for unifying characters when writing.");
4088 Vstandard_character_unification_table_for_encode = Qnil;
4089
4090 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
4091 "Alist of charsets vs revision numbers.\n\
4092 While encoding, if a charset (car part of an element) is found,\n\
4093 designate it with the escape sequence identifing revision (cdr part of the element).");
4094 Vcharset_revision_alist = Qnil;
4095
4096 DEFVAR_LISP ("default-process-coding-system",
4097 &Vdefault_process_coding_system,
4098 "Cons of coding systems used for process I/O by default.\n\
4099 The car part is used for decoding a process output,\n\
4100 the cdr part is used for encoding a text to be sent to a process.");
4101 Vdefault_process_coding_system = Qnil;
4102
4103 DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
4104 "Table of extra Latin codes in the range 128..159 (inclusive).\n\
4105 This is a vector of length 256.\n\
4106 If Nth element is non-nil, the existence of code N in a file\n\
4107 (or output of subprocess) doesn't prevent it to be detected as\n\
4108 a coding system of ISO 2022 variant which has a flag\n\
4109 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file\n\
4110 or reading output of a subprocess.\n\
4111 Only 128th through 159th elements has a meaning.");
4112 Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
4113 }
4114
4115 #endif /* emacs */