[! HAVE_LIBXMU] (LIBXMU): Define as empty.
[bpt/emacs.git] / src / coding.c
1 /* Coding system handler (conversion, detection, and etc).
2 Copyright (C) 1995, 1997 Electrotechnical Laboratory, JAPAN.
3 Licensed to the Free Software Foundation.
4
5 This file is part of GNU Emacs.
6
7 GNU Emacs is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 2, or (at your option)
10 any later version.
11
12 GNU Emacs is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GNU Emacs; see the file COPYING. If not, write to
19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20 Boston, MA 02111-1307, USA. */
21
22 /*** TABLE OF CONTENTS ***
23
24 1. Preamble
25 2. Emacs' internal format (emacs-mule) handlers
26 3. ISO2022 handlers
27 4. Shift-JIS and BIG5 handlers
28 5. End-of-line handlers
29 6. C library functions
30 7. Emacs Lisp library functions
31 8. Post-amble
32
33 */
34
35 /*** GENERAL NOTE on CODING SYSTEM ***
36
37 Coding system is an encoding mechanism of one or more character
38 sets. Here's a list of coding systems which Emacs can handle. When
39 we say "decode", it means converting some other coding system to
40 Emacs' internal format (emacs-internal), and when we say "encode",
41 it means converting the coding system emacs-mule to some other
42 coding system.
43
44 0. Emacs' internal format (emacs-mule)
45
46 Emacs itself holds a multi-lingual character in a buffer and a string
47 in a special format. Details are described in section 2.
48
49 1. ISO2022
50
51 The most famous coding system for multiple character sets. X's
52 Compound Text, various EUCs (Extended Unix Code), and coding
53 systems used in Internet communication such as ISO-2022-JP are
54 all variants of ISO2022. Details are described in section 3.
55
56 2. SJIS (or Shift-JIS or MS-Kanji-Code)
57
58 A coding system to encode character sets: ASCII, JISX0201, and
59 JISX0208. Widely used for PC's in Japan. Details are described in
60 section 4.
61
62 3. BIG5
63
64 A coding system to encode character sets: ASCII and Big5. Widely
65 used by Chinese (mainly in Taiwan and Hong Kong). Details are
66 described in section 4. In this file, when we write "BIG5"
67 (all uppercase), we mean the coding system, and when we write
68 "Big5" (capitalized), we mean the character set.
69
70 4. Raw text
71
72 A coding system to for a text containing random 8-bit code. Emacs
73 does no code conversion on such a text except for end-of-line
74 format.
75
76 5. Other
77
78 If a user wants to read/write a text encoded in a coding system not
79 listed above, he can supply a decoder and an encoder for it in CCL
80 (Code Conversion Language) programs. Emacs executes the CCL program
81 while reading/writing.
82
83 Emacs represents a coding-system by a Lisp symbol that has a property
84 `coding-system'. But, before actually using the coding-system, the
85 information about it is set in a structure of type `struct
86 coding_system' for rapid processing. See section 6 for more details.
87
88 */
89
90 /*** GENERAL NOTES on END-OF-LINE FORMAT ***
91
92 How end-of-line of a text is encoded depends on a system. For
93 instance, Unix's format is just one byte of `line-feed' code,
94 whereas DOS's format is two-byte sequence of `carriage-return' and
95 `line-feed' codes. MacOS's format is one byte of `carriage-return'.
96
97 Since text characters encoding and end-of-line encoding are
98 independent, any coding system described above can take
99 any format of end-of-line. So, Emacs has information of format of
100 end-of-line in each coding-system. See section 6 for more details.
101
102 */
103
104 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
105
106 These functions check if a text between SRC and SRC_END is encoded
107 in the coding system category XXX. Each returns an integer value in
108 which appropriate flag bits for the category XXX is set. The flag
109 bits are defined in macros CODING_CATEGORY_MASK_XXX. Below is the
110 template of these functions. */
111 #if 0
112 int
113 detect_coding_emacs_mule (src, src_end)
114 unsigned char *src, *src_end;
115 {
116 ...
117 }
118 #endif
119
120 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
121
122 These functions decode SRC_BYTES length text at SOURCE encoded in
123 CODING to Emacs' internal format (emacs-mule). The resulting text
124 goes to a place pointed to by DESTINATION, the length of which should
125 not exceed DST_BYTES. The number of bytes actually processed is
126 returned as *CONSUMED. The return value is the length of the decoded
127 text. Below is a template of these functions. */
128 #if 0
129 decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes, consumed)
130 struct coding_system *coding;
131 unsigned char *source, *destination;
132 int src_bytes, dst_bytes;
133 int *consumed;
134 {
135 ...
136 }
137 #endif
138
139 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
140
141 These functions encode SRC_BYTES length text at SOURCE of Emacs'
142 internal format (emacs-mule) to CODING. The resulting text goes to
143 a place pointed to by DESTINATION, the length of which should not
144 exceed DST_BYTES. The number of bytes actually processed is
145 returned as *CONSUMED. The return value is the length of the
146 encoded text. Below is a template of these functions. */
147 #if 0
148 encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes, consumed)
149 struct coding_system *coding;
150 unsigned char *source, *destination;
151 int src_bytes, dst_bytes;
152 int *consumed;
153 {
154 ...
155 }
156 #endif
157
158 /*** COMMONLY USED MACROS ***/
159
160 /* The following three macros ONE_MORE_BYTE, TWO_MORE_BYTES, and
161 THREE_MORE_BYTES safely get one, two, and three bytes from the
162 source text respectively. If there are not enough bytes in the
163 source, they jump to `label_end_of_loop'. The caller should set
164 variables `src' and `src_end' to appropriate areas in advance. */
165
166 #define ONE_MORE_BYTE(c1) \
167 do { \
168 if (src < src_end) \
169 c1 = *src++; \
170 else \
171 goto label_end_of_loop; \
172 } while (0)
173
174 #define TWO_MORE_BYTES(c1, c2) \
175 do { \
176 if (src + 1 < src_end) \
177 c1 = *src++, c2 = *src++; \
178 else \
179 goto label_end_of_loop; \
180 } while (0)
181
182 #define THREE_MORE_BYTES(c1, c2, c3) \
183 do { \
184 if (src + 2 < src_end) \
185 c1 = *src++, c2 = *src++, c3 = *src++; \
186 else \
187 goto label_end_of_loop; \
188 } while (0)
189
190 /* The following three macros DECODE_CHARACTER_ASCII,
191 DECODE_CHARACTER_DIMENSION1, and DECODE_CHARACTER_DIMENSION2 put
192 the multi-byte form of a character of each class at the place
193 pointed by `dst'. The caller should set the variable `dst' to
194 point to an appropriate area and the variable `coding' to point to
195 the coding-system of the currently decoding text in advance. */
196
197 /* Decode one ASCII character C. */
198
199 #define DECODE_CHARACTER_ASCII(c) \
200 do { \
201 if (COMPOSING_P (coding->composing)) \
202 *dst++ = 0xA0, *dst++ = (c) | 0x80; \
203 else \
204 *dst++ = (c); \
205 } while (0)
206
207 /* Decode one DIMENSION1 character whose charset is CHARSET and whose
208 position-code is C. */
209
210 #define DECODE_CHARACTER_DIMENSION1(charset, c) \
211 do { \
212 unsigned char leading_code = CHARSET_LEADING_CODE_BASE (charset); \
213 if (COMPOSING_P (coding->composing)) \
214 *dst++ = leading_code + 0x20; \
215 else \
216 *dst++ = leading_code; \
217 if (leading_code = CHARSET_LEADING_CODE_EXT (charset)) \
218 *dst++ = leading_code; \
219 *dst++ = (c) | 0x80; \
220 } while (0)
221
222 /* Decode one DIMENSION2 character whose charset is CHARSET and whose
223 position-codes are C1 and C2. */
224
225 #define DECODE_CHARACTER_DIMENSION2(charset, c1, c2) \
226 do { \
227 DECODE_CHARACTER_DIMENSION1 (charset, c1); \
228 *dst++ = (c2) | 0x80; \
229 } while (0)
230
231 \f
232 /*** 1. Preamble ***/
233
234 #include <stdio.h>
235
236 #ifdef emacs
237
238 #include <config.h>
239 #include "lisp.h"
240 #include "buffer.h"
241 #include "charset.h"
242 #include "ccl.h"
243 #include "coding.h"
244 #include "window.h"
245
246 #else /* not emacs */
247
248 #include "mulelib.h"
249
250 #endif /* not emacs */
251
252 Lisp_Object Qcoding_system, Qeol_type;
253 Lisp_Object Qbuffer_file_coding_system;
254 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
255 Lisp_Object Qno_conversion, Qundecided;
256 Lisp_Object Qcoding_system_history;
257
258 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
259 Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
260 Lisp_Object Qstart_process, Qopen_network_stream;
261 Lisp_Object Qtarget_idx;
262
263 /* Mnemonic character of each format of end-of-line. */
264 int eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
265 /* Mnemonic character to indicate format of end-of-line is not yet
266 decided. */
267 int eol_mnemonic_undecided;
268
269 /* Format of end-of-line decided by system. This is CODING_EOL_LF on
270 Unix, CODING_EOL_CRLF on DOS/Windows, and CODING_EOL_CR on Mac. */
271 int system_eol_type;
272
273 #ifdef emacs
274
275 Lisp_Object Qcoding_system_spec, Qcoding_system_p, Qcoding_system_error;
276
277 /* Coding system emacs-mule is for converting only end-of-line format. */
278 Lisp_Object Qemacs_mule;
279
280 /* Coding-systems are handed between Emacs Lisp programs and C internal
281 routines by the following three variables. */
282 /* Coding-system for reading files and receiving data from process. */
283 Lisp_Object Vcoding_system_for_read;
284 /* Coding-system for writing files and sending data to process. */
285 Lisp_Object Vcoding_system_for_write;
286 /* Coding-system actually used in the latest I/O. */
287 Lisp_Object Vlast_coding_system_used;
288
289 /* A vector of length 256 which contains information about special
290 Latin codes (espepcially for dealing with Microsoft code). */
291 Lisp_Object Vlatin_extra_code_table;
292
293 /* Flag to inhibit code conversion of end-of-line format. */
294 int inhibit_eol_conversion;
295
296 /* Coding system to be used to encode text for terminal display. */
297 struct coding_system terminal_coding;
298
299 /* Coding system to be used to encode text for terminal display when
300 terminal coding system is nil. */
301 struct coding_system safe_terminal_coding;
302
303 /* Coding system of what is sent from terminal keyboard. */
304 struct coding_system keyboard_coding;
305
306 Lisp_Object Vfile_coding_system_alist;
307 Lisp_Object Vprocess_coding_system_alist;
308 Lisp_Object Vnetwork_coding_system_alist;
309
310 #endif /* emacs */
311
312 Lisp_Object Qcoding_category_index;
313
314 /* List of symbols `coding-category-xxx' ordered by priority. */
315 Lisp_Object Vcoding_category_list;
316
317 /* Table of coding-systems currently assigned to each coding-category. */
318 Lisp_Object coding_category_table[CODING_CATEGORY_IDX_MAX];
319
320 /* Table of names of symbol for each coding-category. */
321 char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
322 "coding-category-emacs-mule",
323 "coding-category-sjis",
324 "coding-category-iso-7",
325 "coding-category-iso-8-1",
326 "coding-category-iso-8-2",
327 "coding-category-iso-7-else",
328 "coding-category-iso-8-else",
329 "coding-category-big5",
330 "coding-category-raw-text",
331 "coding-category-binary"
332 };
333
334 /* Flag to tell if we look up unification table on character code
335 conversion. */
336 Lisp_Object Venable_character_unification;
337 /* Standard unification table to look up on decoding (reading). */
338 Lisp_Object Vstandard_character_unification_table_for_decode;
339 /* Standard unification table to look up on encoding (writing). */
340 Lisp_Object Vstandard_character_unification_table_for_encode;
341
342 Lisp_Object Qcharacter_unification_table;
343 Lisp_Object Qcharacter_unification_table_for_decode;
344 Lisp_Object Qcharacter_unification_table_for_encode;
345
346 /* Alist of charsets vs revision number. */
347 Lisp_Object Vcharset_revision_alist;
348
349 /* Default coding systems used for process I/O. */
350 Lisp_Object Vdefault_process_coding_system;
351
352 \f
353 /*** 2. Emacs internal format (emacs-mule) handlers ***/
354
355 /* Emacs' internal format for encoding multiple character sets is a
356 kind of multi-byte encoding, i.e. characters are encoded by
357 variable-length sequences of one-byte codes. ASCII characters
358 and control characters (e.g. `tab', `newline') are represented by
359 one-byte sequences which are their ASCII codes, in the range 0x00
360 through 0x7F. The other characters are represented by a sequence
361 of `base leading-code', optional `extended leading-code', and one
362 or two `position-code's. The length of the sequence is determined
363 by the base leading-code. Leading-code takes the range 0x80
364 through 0x9F, whereas extended leading-code and position-code take
365 the range 0xA0 through 0xFF. See `charset.h' for more details
366 about leading-code and position-code.
367
368 There's one exception to this rule. Special leading-code
369 `leading-code-composition' denotes that the following several
370 characters should be composed into one character. Leading-codes of
371 components (except for ASCII) are added 0x20. An ASCII character
372 component is represented by a 2-byte sequence of `0xA0' and
373 `ASCII-code + 0x80'. See also the comments in `charset.h' for the
374 details of composite character. Hence, we can summarize the code
375 range as follows:
376
377 --- CODE RANGE of Emacs' internal format ---
378 (character set) (range)
379 ASCII 0x00 .. 0x7F
380 ELSE (1st byte) 0x80 .. 0x9F
381 (rest bytes) 0xA0 .. 0xFF
382 ---------------------------------------------
383
384 */
385
386 enum emacs_code_class_type emacs_code_class[256];
387
388 /* Go to the next statement only if *SRC is accessible and the code is
389 greater than 0xA0. */
390 #define CHECK_CODE_RANGE_A0_FF \
391 do { \
392 if (src >= src_end) \
393 goto label_end_of_switch; \
394 else if (*src++ < 0xA0) \
395 return 0; \
396 } while (0)
397
398 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
399 Check if a text is encoded in Emacs' internal format. If it is,
400 return CODING_CATEGORY_MASK_EMASC_MULE, else return 0. */
401
402 int
403 detect_coding_emacs_mule (src, src_end)
404 unsigned char *src, *src_end;
405 {
406 unsigned char c;
407 int composing = 0;
408
409 while (src < src_end)
410 {
411 c = *src++;
412
413 if (composing)
414 {
415 if (c < 0xA0)
416 composing = 0;
417 else
418 c -= 0x20;
419 }
420
421 switch (emacs_code_class[c])
422 {
423 case EMACS_ascii_code:
424 case EMACS_linefeed_code:
425 break;
426
427 case EMACS_control_code:
428 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
429 return 0;
430 break;
431
432 case EMACS_invalid_code:
433 return 0;
434
435 case EMACS_leading_code_composition: /* c == 0x80 */
436 if (composing)
437 CHECK_CODE_RANGE_A0_FF;
438 else
439 composing = 1;
440 break;
441
442 case EMACS_leading_code_4:
443 CHECK_CODE_RANGE_A0_FF;
444 /* fall down to check it two more times ... */
445
446 case EMACS_leading_code_3:
447 CHECK_CODE_RANGE_A0_FF;
448 /* fall down to check it one more time ... */
449
450 case EMACS_leading_code_2:
451 CHECK_CODE_RANGE_A0_FF;
452 break;
453
454 default:
455 label_end_of_switch:
456 break;
457 }
458 }
459 return CODING_CATEGORY_MASK_EMACS_MULE;
460 }
461
462 \f
463 /*** 3. ISO2022 handlers ***/
464
465 /* The following note describes the coding system ISO2022 briefly.
466 Since the intention of this note is to help in understanding of
467 the programs in this file, some parts are NOT ACCURATE or OVERLY
468 SIMPLIFIED. For the thorough understanding, please refer to the
469 original document of ISO2022.
470
471 ISO2022 provides many mechanisms to encode several character sets
472 in 7-bit and 8-bit environment. If one chooses 7-bite environment,
473 all text is encoded by codes of less than 128. This may make the
474 encoded text a little bit longer, but the text gets more stability
475 to pass through several gateways (some of them strip off the MSB).
476
477 There are two kinds of character set: control character set and
478 graphic character set. The former contains control characters such
479 as `newline' and `escape' to provide control functions (control
480 functions are provided also by escape sequences). The latter
481 contains graphic characters such as ' A' and '-'. Emacs recognizes
482 two control character sets and many graphic character sets.
483
484 Graphic character sets are classified into one of the following
485 four classes, DIMENSION1_CHARS94, DIMENSION1_CHARS96,
486 DIMENSION2_CHARS94, DIMENSION2_CHARS96 according to the number of
487 bytes (DIMENSION) and the number of characters in one dimension
488 (CHARS) of the set. In addition, each character set is assigned an
489 identification tag (called "final character" and denoted as <F>
490 here after) which is unique in each class. <F> of each character
491 set is decided by ECMA(*) when it is registered in ISO. Code range
492 of <F> is 0x30..0x7F (0x30..0x3F are for private use only).
493
494 Note (*): ECMA = European Computer Manufacturers Association
495
496 Here are examples of graphic character set [NAME(<F>)]:
497 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
498 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
499 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
500 o DIMENSION2_CHARS96 -- none for the moment
501
502 A code area (1byte=8bits) is divided into 4 areas, C0, GL, C1, and GR.
503 C0 [0x00..0x1F] -- control character plane 0
504 GL [0x20..0x7F] -- graphic character plane 0
505 C1 [0x80..0x9F] -- control character plane 1
506 GR [0xA0..0xFF] -- graphic character plane 1
507
508 A control character set is directly designated and invoked to C0 or
509 C1 by an escape sequence. The most common case is that ISO646's
510 control character set is designated/invoked to C0 and ISO6429's
511 control character set is designated/invoked to C1, and usually
512 these designations/invocations are omitted in a coded text. With
513 7-bit environment, only C0 can be used, and a control character for
514 C1 is encoded by an appropriate escape sequence to fit in the
515 environment. All control characters for C1 are defined the
516 corresponding escape sequences.
517
518 A graphic character set is at first designated to one of four
519 graphic registers (G0 through G3), then these graphic registers are
520 invoked to GL or GR. These designations and invocations can be
521 done independently. The most common case is that G0 is invoked to
522 GL, G1 is invoked to GR, and ASCII is designated to G0, and usually
523 these invocations and designations are omitted in a coded text.
524 With 7-bit environment, only GL can be used.
525
526 When a graphic character set of CHARS94 is invoked to GL, code 0x20
527 and 0x7F of GL area work as control characters SPACE and DEL
528 respectively, and code 0xA0 and 0xFF of GR area should not be used.
529
530 There are two ways of invocation: locking-shift and single-shift.
531 With locking-shift, the invocation lasts until the next different
532 invocation, whereas with single-shift, the invocation works only
533 for the following character and doesn't affect locking-shift.
534 Invocations are done by the following control characters or escape
535 sequences.
536
537 ----------------------------------------------------------------------
538 function control char escape sequence description
539 ----------------------------------------------------------------------
540 SI (shift-in) 0x0F none invoke G0 to GL
541 SO (shift-out) 0x0E none invoke G1 to GL
542 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
543 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
544 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 into GL
545 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 into GL
546 ----------------------------------------------------------------------
547 The first four are for locking-shift. Control characters for these
548 functions are defined by macros ISO_CODE_XXX in `coding.h'.
549
550 Designations are done by the following escape sequences.
551 ----------------------------------------------------------------------
552 escape sequence description
553 ----------------------------------------------------------------------
554 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
555 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
556 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
557 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
558 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
559 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
560 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
561 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
562 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
563 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
564 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
565 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
566 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
567 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
568 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
569 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
570 ----------------------------------------------------------------------
571
572 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
573 of dimension 1, chars 94, and final character <F>, and etc.
574
575 Note (*): Although these designations are not allowed in ISO2022,
576 Emacs accepts them on decoding, and produces them on encoding
577 CHARS96 character set in a coding system which is characterized as
578 7-bit environment, non-locking-shift, and non-single-shift.
579
580 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
581 '(' can be omitted. We call this as "short-form" here after.
582
583 Now you may notice that there are a lot of ways for encoding the
584 same multilingual text in ISO2022. Actually, there exists many
585 coding systems such as Compound Text (used in X's inter client
586 communication, ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
587 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
588 localized platforms), and all of these are variants of ISO2022.
589
590 In addition to the above, Emacs handles two more kinds of escape
591 sequences: ISO6429's direction specification and Emacs' private
592 sequence for specifying character composition.
593
594 ISO6429's direction specification takes the following format:
595 o CSI ']' -- end of the current direction
596 o CSI '0' ']' -- end of the current direction
597 o CSI '1' ']' -- start of left-to-right text
598 o CSI '2' ']' -- start of right-to-left text
599 The control character CSI (0x9B: control sequence introducer) is
600 abbreviated to the escape sequence ESC '[' in 7-bit environment.
601
602 Character composition specification takes the following format:
603 o ESC '0' -- start character composition
604 o ESC '1' -- end character composition
605 Since these are not standard escape sequences of any ISO, the use
606 of them for these meaning is restricted to Emacs only. */
607
608 enum iso_code_class_type iso_code_class[256];
609
610 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
611 Check if a text is encoded in ISO2022. If it is, returns an
612 integer in which appropriate flag bits any of:
613 CODING_CATEGORY_MASK_ISO_7
614 CODING_CATEGORY_MASK_ISO_8_1
615 CODING_CATEGORY_MASK_ISO_8_2
616 CODING_CATEGORY_MASK_ISO_7_ELSE
617 CODING_CATEGORY_MASK_ISO_8_ELSE
618 are set. If a code which should never appear in ISO2022 is found,
619 returns 0. */
620
621 int
622 detect_coding_iso2022 (src, src_end)
623 unsigned char *src, *src_end;
624 {
625 int mask = (CODING_CATEGORY_MASK_ISO_7
626 | CODING_CATEGORY_MASK_ISO_8_1
627 | CODING_CATEGORY_MASK_ISO_8_2
628 | CODING_CATEGORY_MASK_ISO_7_ELSE
629 | CODING_CATEGORY_MASK_ISO_8_ELSE
630 );
631 int g1 = 0; /* 1 iff designating to G1. */
632 int c, i;
633 struct coding_system coding_iso_8_1, coding_iso_8_2;
634
635 /* Coding systems of these categories may accept latin extra codes. */
636 setup_coding_system
637 (XSYMBOL (coding_category_table[CODING_CATEGORY_IDX_ISO_8_1])->value,
638 &coding_iso_8_1);
639 setup_coding_system
640 (XSYMBOL (coding_category_table[CODING_CATEGORY_IDX_ISO_8_2])->value,
641 &coding_iso_8_2);
642
643 while (mask && src < src_end)
644 {
645 c = *src++;
646 switch (c)
647 {
648 case ISO_CODE_ESC:
649 if (src >= src_end)
650 break;
651 c = *src++;
652 if ((c >= '(' && c <= '/'))
653 {
654 /* Designation sequence for a charset of dimension 1. */
655 if (src >= src_end)
656 break;
657 c = *src++;
658 if (c < ' ' || c >= 0x80)
659 /* Invalid designation sequence. */
660 return 0;
661 }
662 else if (c == '$')
663 {
664 /* Designation sequence for a charset of dimension 2. */
665 if (src >= src_end)
666 break;
667 c = *src++;
668 if (c >= '@' && c <= 'B')
669 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
670 ;
671 else if (c >= '(' && c <= '/')
672 {
673 if (src >= src_end)
674 break;
675 c = *src++;
676 if (c < ' ' || c >= 0x80)
677 /* Invalid designation sequence. */
678 return 0;
679 }
680 else
681 /* Invalid designation sequence. */
682 return 0;
683 }
684 else if (c == 'N' || c == 'O' || c == 'n' || c == 'o')
685 /* Locking shift. */
686 mask &= (CODING_CATEGORY_MASK_ISO_7_ELSE
687 | CODING_CATEGORY_MASK_ISO_8_ELSE);
688 else if (c == '0' || c == '1' || c == '2')
689 /* Start/end composition. */
690 ;
691 else
692 /* Invalid escape sequence. */
693 return 0;
694 break;
695
696 case ISO_CODE_SO:
697 mask &= (CODING_CATEGORY_MASK_ISO_7_ELSE
698 | CODING_CATEGORY_MASK_ISO_8_ELSE);
699 break;
700
701 case ISO_CODE_CSI:
702 case ISO_CODE_SS2:
703 case ISO_CODE_SS3:
704 {
705 int newmask = CODING_CATEGORY_MASK_ISO_8_ELSE;
706
707 if (VECTORP (Vlatin_extra_code_table)
708 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
709 {
710 if (coding_iso_8_1.flags & CODING_FLAG_ISO_LATIN_EXTRA)
711 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
712 if (coding_iso_8_2.flags & CODING_FLAG_ISO_LATIN_EXTRA)
713 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
714 }
715 mask &= newmask;
716 }
717 break;
718
719 default:
720 if (c < 0x80)
721 break;
722 else if (c < 0xA0)
723 {
724 if (VECTORP (Vlatin_extra_code_table)
725 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
726 {
727 int newmask = 0;
728
729 if (coding_iso_8_1.flags & CODING_FLAG_ISO_LATIN_EXTRA)
730 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
731 if (coding_iso_8_2.flags & CODING_FLAG_ISO_LATIN_EXTRA)
732 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
733 mask &= newmask;
734 }
735 else
736 return 0;
737 }
738 else
739 {
740 unsigned char *src_begin = src;
741
742 mask &= ~(CODING_CATEGORY_MASK_ISO_7
743 | CODING_CATEGORY_MASK_ISO_7_ELSE);
744 while (src < src_end && *src >= 0xA0)
745 src++;
746 if ((src - src_begin - 1) & 1 && src < src_end)
747 mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
748 }
749 break;
750 }
751 }
752
753 return mask;
754 }
755
756 /* Decode a character of which charset is CHARSET and the 1st position
757 code is C1. If dimension of CHARSET is 2, the 2nd position code is
758 fetched from SRC and set to C2. If CHARSET is negative, it means
759 that we are decoding ill formed text, and what we can do is just to
760 read C1 as is. */
761
762 #define DECODE_ISO_CHARACTER(charset, c1) \
763 do { \
764 int c_alt, charset_alt = (charset); \
765 if (COMPOSING_HEAD_P (coding->composing)) \
766 { \
767 *dst++ = LEADING_CODE_COMPOSITION; \
768 if (COMPOSING_WITH_RULE_P (coding->composing)) \
769 /* To tell composition rules are embeded. */ \
770 *dst++ = 0xFF; \
771 coding->composing += 2; \
772 } \
773 if ((charset) >= 0) \
774 { \
775 if (CHARSET_DIMENSION (charset) == 2) \
776 ONE_MORE_BYTE (c2); \
777 if (!NILP (unification_table) \
778 && ((c_alt = unify_char (unification_table, \
779 -1, (charset), c1, c2)) >= 0)) \
780 SPLIT_CHAR (c_alt, charset_alt, c1, c2); \
781 } \
782 if (charset_alt == CHARSET_ASCII || charset_alt < 0) \
783 DECODE_CHARACTER_ASCII (c1); \
784 else if (CHARSET_DIMENSION (charset_alt) == 1) \
785 DECODE_CHARACTER_DIMENSION1 (charset_alt, c1); \
786 else \
787 DECODE_CHARACTER_DIMENSION2 (charset_alt, c1, c2); \
788 if (COMPOSING_WITH_RULE_P (coding->composing)) \
789 /* To tell a composition rule follows. */ \
790 coding->composing = COMPOSING_WITH_RULE_RULE; \
791 } while (0)
792
793 /* Set designation state into CODING. */
794 #define DECODE_DESIGNATION(reg, dimension, chars, final_char) \
795 do { \
796 int charset = ISO_CHARSET_TABLE (make_number (dimension), \
797 make_number (chars), \
798 make_number (final_char)); \
799 if (charset >= 0) \
800 { \
801 if (coding->direction == 1 \
802 && CHARSET_REVERSE_CHARSET (charset) >= 0) \
803 charset = CHARSET_REVERSE_CHARSET (charset); \
804 CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset; \
805 } \
806 } while (0)
807
808 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
809
810 int
811 decode_coding_iso2022 (coding, source, destination,
812 src_bytes, dst_bytes, consumed)
813 struct coding_system *coding;
814 unsigned char *source, *destination;
815 int src_bytes, dst_bytes;
816 int *consumed;
817 {
818 unsigned char *src = source;
819 unsigned char *src_end = source + src_bytes;
820 unsigned char *dst = destination;
821 unsigned char *dst_end = destination + dst_bytes;
822 /* Since the maximum bytes produced by each loop is 7, we subtract 6
823 from DST_END to assure that overflow checking is necessary only
824 at the head of loop. */
825 unsigned char *adjusted_dst_end = dst_end - 6;
826 int charset;
827 /* Charsets invoked to graphic plane 0 and 1 respectively. */
828 int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
829 int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
830 Lisp_Object unification_table
831 = coding->character_unification_table_for_decode;
832
833 if (!NILP (Venable_character_unification) && NILP (unification_table))
834 unification_table = Vstandard_character_unification_table_for_decode;
835
836 while (src < src_end && dst < adjusted_dst_end)
837 {
838 /* SRC_BASE remembers the start position in source in each loop.
839 The loop will be exited when there's not enough source text
840 to analyze long escape sequence or 2-byte code (within macros
841 ONE_MORE_BYTE or TWO_MORE_BYTES). In that case, SRC is reset
842 to SRC_BASE before exiting. */
843 unsigned char *src_base = src;
844 int c1 = *src++, c2;
845
846 switch (iso_code_class [c1])
847 {
848 case ISO_0x20_or_0x7F:
849 if (!coding->composing
850 && (charset0 < 0 || CHARSET_CHARS (charset0) == 94))
851 {
852 /* This is SPACE or DEL. */
853 *dst++ = c1;
854 break;
855 }
856 /* This is a graphic character, we fall down ... */
857
858 case ISO_graphic_plane_0:
859 if (coding->composing == COMPOSING_WITH_RULE_RULE)
860 {
861 /* This is a composition rule. */
862 *dst++ = c1 | 0x80;
863 coding->composing = COMPOSING_WITH_RULE_TAIL;
864 }
865 else
866 DECODE_ISO_CHARACTER (charset0, c1);
867 break;
868
869 case ISO_0xA0_or_0xFF:
870 if (charset1 < 0 || CHARSET_CHARS (charset1) == 94)
871 {
872 /* Invalid code. */
873 *dst++ = c1;
874 break;
875 }
876 /* This is a graphic character, we fall down ... */
877
878 case ISO_graphic_plane_1:
879 DECODE_ISO_CHARACTER (charset1, c1);
880 break;
881
882 case ISO_control_code:
883 /* All ISO2022 control characters in this class have the
884 same representation in Emacs internal format. */
885 *dst++ = c1;
886 break;
887
888 case ISO_carriage_return:
889 if (coding->eol_type == CODING_EOL_CR)
890 {
891 *dst++ = '\n';
892 }
893 else if (coding->eol_type == CODING_EOL_CRLF)
894 {
895 ONE_MORE_BYTE (c1);
896 if (c1 == ISO_CODE_LF)
897 *dst++ = '\n';
898 else
899 {
900 src--;
901 *dst++ = c1;
902 }
903 }
904 else
905 {
906 *dst++ = c1;
907 }
908 break;
909
910 case ISO_shift_out:
911 if (CODING_SPEC_ISO_DESIGNATION (coding, 1) < 0)
912 goto label_invalid_escape_sequence;
913 CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
914 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
915 break;
916
917 case ISO_shift_in:
918 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
919 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
920 break;
921
922 case ISO_single_shift_2_7:
923 case ISO_single_shift_2:
924 /* SS2 is handled as an escape sequence of ESC 'N' */
925 c1 = 'N';
926 goto label_escape_sequence;
927
928 case ISO_single_shift_3:
929 /* SS2 is handled as an escape sequence of ESC 'O' */
930 c1 = 'O';
931 goto label_escape_sequence;
932
933 case ISO_control_sequence_introducer:
934 /* CSI is handled as an escape sequence of ESC '[' ... */
935 c1 = '[';
936 goto label_escape_sequence;
937
938 case ISO_escape:
939 ONE_MORE_BYTE (c1);
940 label_escape_sequence:
941 /* Escape sequences handled by Emacs are invocation,
942 designation, direction specification, and character
943 composition specification. */
944 switch (c1)
945 {
946 case '&': /* revision of following character set */
947 ONE_MORE_BYTE (c1);
948 if (!(c1 >= '@' && c1 <= '~'))
949 goto label_invalid_escape_sequence;
950 ONE_MORE_BYTE (c1);
951 if (c1 != ISO_CODE_ESC)
952 goto label_invalid_escape_sequence;
953 ONE_MORE_BYTE (c1);
954 goto label_escape_sequence;
955
956 case '$': /* designation of 2-byte character set */
957 ONE_MORE_BYTE (c1);
958 if (c1 >= '@' && c1 <= 'B')
959 { /* designation of JISX0208.1978, GB2312.1980,
960 or JISX0208.1980 */
961 DECODE_DESIGNATION (0, 2, 94, c1);
962 }
963 else if (c1 >= 0x28 && c1 <= 0x2B)
964 { /* designation of DIMENSION2_CHARS94 character set */
965 ONE_MORE_BYTE (c2);
966 DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
967 }
968 else if (c1 >= 0x2C && c1 <= 0x2F)
969 { /* designation of DIMENSION2_CHARS96 character set */
970 ONE_MORE_BYTE (c2);
971 DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
972 }
973 else
974 goto label_invalid_escape_sequence;
975 break;
976
977 case 'n': /* invocation of locking-shift-2 */
978 if (CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
979 goto label_invalid_escape_sequence;
980 CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
981 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
982 break;
983
984 case 'o': /* invocation of locking-shift-3 */
985 if (CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
986 goto label_invalid_escape_sequence;
987 CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
988 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
989 break;
990
991 case 'N': /* invocation of single-shift-2 */
992 if (CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
993 goto label_invalid_escape_sequence;
994 ONE_MORE_BYTE (c1);
995 charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
996 DECODE_ISO_CHARACTER (charset, c1);
997 break;
998
999 case 'O': /* invocation of single-shift-3 */
1000 if (CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
1001 goto label_invalid_escape_sequence;
1002 ONE_MORE_BYTE (c1);
1003 charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
1004 DECODE_ISO_CHARACTER (charset, c1);
1005 break;
1006
1007 case '0': /* start composing without embeded rules */
1008 coding->composing = COMPOSING_NO_RULE_HEAD;
1009 break;
1010
1011 case '1': /* end composing */
1012 coding->composing = COMPOSING_NO;
1013 break;
1014
1015 case '2': /* start composing with embeded rules */
1016 coding->composing = COMPOSING_WITH_RULE_HEAD;
1017 break;
1018
1019 case '[': /* specification of direction */
1020 /* For the moment, nested direction is not supported.
1021 So, the value of `coding->direction' is 0 or 1: 0
1022 means left-to-right, 1 means right-to-left. */
1023 ONE_MORE_BYTE (c1);
1024 switch (c1)
1025 {
1026 case ']': /* end of the current direction */
1027 coding->direction = 0;
1028
1029 case '0': /* end of the current direction */
1030 case '1': /* start of left-to-right direction */
1031 ONE_MORE_BYTE (c1);
1032 if (c1 == ']')
1033 coding->direction = 0;
1034 else
1035 goto label_invalid_escape_sequence;
1036 break;
1037
1038 case '2': /* start of right-to-left direction */
1039 ONE_MORE_BYTE (c1);
1040 if (c1 == ']')
1041 coding->direction= 1;
1042 else
1043 goto label_invalid_escape_sequence;
1044 break;
1045
1046 default:
1047 goto label_invalid_escape_sequence;
1048 }
1049 break;
1050
1051 default:
1052 if (c1 >= 0x28 && c1 <= 0x2B)
1053 { /* designation of DIMENSION1_CHARS94 character set */
1054 ONE_MORE_BYTE (c2);
1055 DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
1056 }
1057 else if (c1 >= 0x2C && c1 <= 0x2F)
1058 { /* designation of DIMENSION1_CHARS96 character set */
1059 ONE_MORE_BYTE (c2);
1060 DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
1061 }
1062 else
1063 {
1064 goto label_invalid_escape_sequence;
1065 }
1066 }
1067 /* We must update these variables now. */
1068 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1069 charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1070 break;
1071
1072 label_invalid_escape_sequence:
1073 {
1074 int length = src - src_base;
1075
1076 bcopy (src_base, dst, length);
1077 dst += length;
1078 }
1079 }
1080 continue;
1081
1082 label_end_of_loop:
1083 coding->carryover_size = src - src_base;
1084 bcopy (src_base, coding->carryover, coding->carryover_size);
1085 src = src_base;
1086 break;
1087 }
1088
1089 /* If this is the last block of the text to be decoded, we had
1090 better just flush out all remaining codes in the text although
1091 they are not valid characters. */
1092 if (coding->last_block)
1093 {
1094 bcopy (src, dst, src_end - src);
1095 dst += (src_end - src);
1096 src = src_end;
1097 }
1098 *consumed = src - source;
1099 return dst - destination;
1100 }
1101
1102 /* ISO2022 encoding stuff. */
1103
1104 /*
1105 It is not enough to say just "ISO2022" on encoding, we have to
1106 specify more details. In Emacs, each coding-system of ISO2022
1107 variant has the following specifications:
1108 1. Initial designation to G0 thru G3.
1109 2. Allows short-form designation?
1110 3. ASCII should be designated to G0 before control characters?
1111 4. ASCII should be designated to G0 at end of line?
1112 5. 7-bit environment or 8-bit environment?
1113 6. Use locking-shift?
1114 7. Use Single-shift?
1115 And the following two are only for Japanese:
1116 8. Use ASCII in place of JIS0201-1976-Roman?
1117 9. Use JISX0208-1983 in place of JISX0208-1978?
1118 These specifications are encoded in `coding->flags' as flag bits
1119 defined by macros CODING_FLAG_ISO_XXX. See `coding.h' for more
1120 details.
1121 */
1122
1123 /* Produce codes (escape sequence) for designating CHARSET to graphic
1124 register REG. If <final-char> of CHARSET is '@', 'A', or 'B' and
1125 the coding system CODING allows, produce designation sequence of
1126 short-form. */
1127
1128 #define ENCODE_DESIGNATION(charset, reg, coding) \
1129 do { \
1130 unsigned char final_char = CHARSET_ISO_FINAL_CHAR (charset); \
1131 char *intermediate_char_94 = "()*+"; \
1132 char *intermediate_char_96 = ",-./"; \
1133 Lisp_Object temp \
1134 = Fassq (make_number (charset), Vcharset_revision_alist); \
1135 if (! NILP (temp)) \
1136 { \
1137 *dst++ = ISO_CODE_ESC; \
1138 *dst++ = '&'; \
1139 *dst++ = XINT (XCONS (temp)->cdr) + '@'; \
1140 } \
1141 *dst++ = ISO_CODE_ESC; \
1142 if (CHARSET_DIMENSION (charset) == 1) \
1143 { \
1144 if (CHARSET_CHARS (charset) == 94) \
1145 *dst++ = (unsigned char) (intermediate_char_94[reg]); \
1146 else \
1147 *dst++ = (unsigned char) (intermediate_char_96[reg]); \
1148 } \
1149 else \
1150 { \
1151 *dst++ = '$'; \
1152 if (CHARSET_CHARS (charset) == 94) \
1153 { \
1154 if (! (coding->flags & CODING_FLAG_ISO_SHORT_FORM) \
1155 || reg != 0 \
1156 || final_char < '@' || final_char > 'B') \
1157 *dst++ = (unsigned char) (intermediate_char_94[reg]); \
1158 } \
1159 else \
1160 *dst++ = (unsigned char) (intermediate_char_96[reg]); \
1161 } \
1162 *dst++ = final_char; \
1163 CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset; \
1164 } while (0)
1165
1166 /* The following two macros produce codes (control character or escape
1167 sequence) for ISO2022 single-shift functions (single-shift-2 and
1168 single-shift-3). */
1169
1170 #define ENCODE_SINGLE_SHIFT_2 \
1171 do { \
1172 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
1173 *dst++ = ISO_CODE_ESC, *dst++ = 'N'; \
1174 else \
1175 *dst++ = ISO_CODE_SS2; \
1176 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1; \
1177 } while (0)
1178
1179 #define ENCODE_SINGLE_SHIFT_3 \
1180 do { \
1181 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
1182 *dst++ = ISO_CODE_ESC, *dst++ = 'O'; \
1183 else \
1184 *dst++ = ISO_CODE_SS3; \
1185 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1; \
1186 } while (0)
1187
1188 /* The following four macros produce codes (control character or
1189 escape sequence) for ISO2022 locking-shift functions (shift-in,
1190 shift-out, locking-shift-2, and locking-shift-3). */
1191
1192 #define ENCODE_SHIFT_IN \
1193 do { \
1194 *dst++ = ISO_CODE_SI; \
1195 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0; \
1196 } while (0)
1197
1198 #define ENCODE_SHIFT_OUT \
1199 do { \
1200 *dst++ = ISO_CODE_SO; \
1201 CODING_SPEC_ISO_INVOCATION (coding, 0) = 1; \
1202 } while (0)
1203
1204 #define ENCODE_LOCKING_SHIFT_2 \
1205 do { \
1206 *dst++ = ISO_CODE_ESC, *dst++ = 'n'; \
1207 CODING_SPEC_ISO_INVOCATION (coding, 0) = 2; \
1208 } while (0)
1209
1210 #define ENCODE_LOCKING_SHIFT_3 \
1211 do { \
1212 *dst++ = ISO_CODE_ESC, *dst++ = 'o'; \
1213 CODING_SPEC_ISO_INVOCATION (coding, 0) = 3; \
1214 } while (0)
1215
1216 /* Produce codes for a DIMENSION1 character whose character set is
1217 CHARSET and whose position-code is C1. Designation and invocation
1218 sequences are also produced in advance if necessary. */
1219
1220
1221 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1) \
1222 do { \
1223 if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding)) \
1224 { \
1225 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
1226 *dst++ = c1 & 0x7F; \
1227 else \
1228 *dst++ = c1 | 0x80; \
1229 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0; \
1230 break; \
1231 } \
1232 else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0)) \
1233 { \
1234 *dst++ = c1 & 0x7F; \
1235 break; \
1236 } \
1237 else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1)) \
1238 { \
1239 *dst++ = c1 | 0x80; \
1240 break; \
1241 } \
1242 else if (coding->flags & CODING_FLAG_ISO_SAFE \
1243 && !CODING_SPEC_ISO_EXPECTED_CHARSETS (coding)[charset]) \
1244 { \
1245 /* We should not encode this character, instead produce one or \
1246 two `?'s. */ \
1247 *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION; \
1248 if (CHARSET_WIDTH (charset) == 2) \
1249 *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION; \
1250 break; \
1251 } \
1252 else \
1253 /* Since CHARSET is not yet invoked to any graphic planes, we \
1254 must invoke it, or, at first, designate it to some graphic \
1255 register. Then repeat the loop to actually produce the \
1256 character. */ \
1257 dst = encode_invocation_designation (charset, coding, dst); \
1258 } while (1)
1259
1260 /* Produce codes for a DIMENSION2 character whose character set is
1261 CHARSET and whose position-codes are C1 and C2. Designation and
1262 invocation codes are also produced in advance if necessary. */
1263
1264 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2) \
1265 do { \
1266 if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding)) \
1267 { \
1268 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
1269 *dst++ = c1 & 0x7F, *dst++ = c2 & 0x7F; \
1270 else \
1271 *dst++ = c1 | 0x80, *dst++ = c2 | 0x80; \
1272 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0; \
1273 break; \
1274 } \
1275 else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0)) \
1276 { \
1277 *dst++ = c1 & 0x7F, *dst++= c2 & 0x7F; \
1278 break; \
1279 } \
1280 else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1)) \
1281 { \
1282 *dst++ = c1 | 0x80, *dst++= c2 | 0x80; \
1283 break; \
1284 } \
1285 else if (coding->flags & CODING_FLAG_ISO_SAFE \
1286 && !CODING_SPEC_ISO_EXPECTED_CHARSETS (coding)[charset]) \
1287 { \
1288 /* We should not encode this character, instead produce one or \
1289 two `?'s. */ \
1290 *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION; \
1291 if (CHARSET_WIDTH (charset) == 2) \
1292 *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION; \
1293 break; \
1294 } \
1295 else \
1296 /* Since CHARSET is not yet invoked to any graphic planes, we \
1297 must invoke it, or, at first, designate it to some graphic \
1298 register. Then repeat the loop to actually produce the \
1299 character. */ \
1300 dst = encode_invocation_designation (charset, coding, dst); \
1301 } while (1)
1302
1303 #define ENCODE_ISO_CHARACTER(charset, c1, c2) \
1304 do { \
1305 int c_alt, charset_alt; \
1306 if (!NILP (unification_table) \
1307 && ((c_alt = unify_char (unification_table, -1, charset, c1, c2)) \
1308 >= 0)) \
1309 SPLIT_CHAR (c_alt, charset_alt, c1, c2); \
1310 else \
1311 charset_alt = charset; \
1312 if (CHARSET_DIMENSION (charset_alt) == 1) \
1313 ENCODE_ISO_CHARACTER_DIMENSION1 (charset_alt, c1); \
1314 else \
1315 ENCODE_ISO_CHARACTER_DIMENSION2 (charset_alt, c1, c2); \
1316 } while (0)
1317
1318 /* Produce designation and invocation codes at a place pointed by DST
1319 to use CHARSET. The element `spec.iso2022' of *CODING is updated.
1320 Return new DST. */
1321
1322 unsigned char *
1323 encode_invocation_designation (charset, coding, dst)
1324 int charset;
1325 struct coding_system *coding;
1326 unsigned char *dst;
1327 {
1328 int reg; /* graphic register number */
1329
1330 /* At first, check designations. */
1331 for (reg = 0; reg < 4; reg++)
1332 if (charset == CODING_SPEC_ISO_DESIGNATION (coding, reg))
1333 break;
1334
1335 if (reg >= 4)
1336 {
1337 /* CHARSET is not yet designated to any graphic registers. */
1338 /* At first check the requested designation. */
1339 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1340 if (reg == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
1341 /* Since CHARSET requests no special designation, designate it
1342 to graphic register 0. */
1343 reg = 0;
1344
1345 ENCODE_DESIGNATION (charset, reg, coding);
1346 }
1347
1348 if (CODING_SPEC_ISO_INVOCATION (coding, 0) != reg
1349 && CODING_SPEC_ISO_INVOCATION (coding, 1) != reg)
1350 {
1351 /* Since the graphic register REG is not invoked to any graphic
1352 planes, invoke it to graphic plane 0. */
1353 switch (reg)
1354 {
1355 case 0: /* graphic register 0 */
1356 ENCODE_SHIFT_IN;
1357 break;
1358
1359 case 1: /* graphic register 1 */
1360 ENCODE_SHIFT_OUT;
1361 break;
1362
1363 case 2: /* graphic register 2 */
1364 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1365 ENCODE_SINGLE_SHIFT_2;
1366 else
1367 ENCODE_LOCKING_SHIFT_2;
1368 break;
1369
1370 case 3: /* graphic register 3 */
1371 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1372 ENCODE_SINGLE_SHIFT_3;
1373 else
1374 ENCODE_LOCKING_SHIFT_3;
1375 break;
1376 }
1377 }
1378 return dst;
1379 }
1380
1381 /* The following two macros produce codes for indicating composition. */
1382 #define ENCODE_COMPOSITION_NO_RULE_START *dst++ = ISO_CODE_ESC, *dst++ = '0'
1383 #define ENCODE_COMPOSITION_WITH_RULE_START *dst++ = ISO_CODE_ESC, *dst++ = '2'
1384 #define ENCODE_COMPOSITION_END *dst++ = ISO_CODE_ESC, *dst++ = '1'
1385
1386 /* The following three macros produce codes for indicating direction
1387 of text. */
1388 #define ENCODE_CONTROL_SEQUENCE_INTRODUCER \
1389 do { \
1390 if (coding->flags == CODING_FLAG_ISO_SEVEN_BITS) \
1391 *dst++ = ISO_CODE_ESC, *dst++ = '['; \
1392 else \
1393 *dst++ = ISO_CODE_CSI; \
1394 } while (0)
1395
1396 #define ENCODE_DIRECTION_R2L \
1397 ENCODE_CONTROL_SEQUENCE_INTRODUCER, *dst++ = '2', *dst++ = ']'
1398
1399 #define ENCODE_DIRECTION_L2R \
1400 ENCODE_CONTROL_SEQUENCE_INTRODUCER, *dst++ = '0', *dst++ = ']'
1401
1402 /* Produce codes for designation and invocation to reset the graphic
1403 planes and registers to initial state. */
1404 #define ENCODE_RESET_PLANE_AND_REGISTER \
1405 do { \
1406 int reg; \
1407 if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0) \
1408 ENCODE_SHIFT_IN; \
1409 for (reg = 0; reg < 4; reg++) \
1410 if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg) >= 0 \
1411 && (CODING_SPEC_ISO_DESIGNATION (coding, reg) \
1412 != CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg))) \
1413 ENCODE_DESIGNATION \
1414 (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg), reg, coding); \
1415 } while (0)
1416
1417 /* Produce designation sequences of charsets in the line started from
1418 *SRC to a place pointed by DSTP.
1419
1420 If the current block ends before any end-of-line, we may fail to
1421 find all the necessary *designations. */
1422 encode_designation_at_bol (coding, table, src, src_end, dstp)
1423 struct coding_system *coding;
1424 Lisp_Object table;
1425 unsigned char *src, *src_end, **dstp;
1426 {
1427 int charset, c, found = 0, reg;
1428 /* Table of charsets to be designated to each graphic register. */
1429 int r[4];
1430 unsigned char *dst = *dstp;
1431
1432 for (reg = 0; reg < 4; reg++)
1433 r[reg] = -1;
1434
1435 while (src < src_end && *src != '\n' && found < 4)
1436 {
1437 int bytes = BYTES_BY_CHAR_HEAD (*src);
1438
1439 if (NILP (table))
1440 charset = CHARSET_AT (src);
1441 else
1442 {
1443 int c_alt;
1444 unsigned char c1, c2;
1445
1446 SPLIT_STRING(src, bytes, charset, c1, c2);
1447 if ((c_alt = unify_char (table, -1, charset, c1, c2)) >= 0)
1448 charset = CHAR_CHARSET (c_alt);
1449 }
1450
1451 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1452 if (r[reg] == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
1453 {
1454 found++;
1455 r[reg] = charset;
1456 }
1457
1458 src += bytes;
1459 }
1460
1461 if (found)
1462 {
1463 for (reg = 0; reg < 4; reg++)
1464 if (r[reg] >= 0
1465 && CODING_SPEC_ISO_DESIGNATION (coding, reg) != r[reg])
1466 ENCODE_DESIGNATION (r[reg], reg, coding);
1467 *dstp = dst;
1468 }
1469 }
1470
1471 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
1472
1473 int
1474 encode_coding_iso2022 (coding, source, destination,
1475 src_bytes, dst_bytes, consumed)
1476 struct coding_system *coding;
1477 unsigned char *source, *destination;
1478 int src_bytes, dst_bytes;
1479 int *consumed;
1480 {
1481 unsigned char *src = source;
1482 unsigned char *src_end = source + src_bytes;
1483 unsigned char *dst = destination;
1484 unsigned char *dst_end = destination + dst_bytes;
1485 /* Since the maximum bytes produced by each loop is 20, we subtract 19
1486 from DST_END to assure overflow checking is necessary only at the
1487 head of loop. */
1488 unsigned char *adjusted_dst_end = dst_end - 19;
1489 Lisp_Object unification_table
1490 = coding->character_unification_table_for_encode;
1491
1492 if (!NILP (Venable_character_unification) && NILP (unification_table))
1493 unification_table = Vstandard_character_unification_table_for_encode;
1494
1495 while (src < src_end && dst < adjusted_dst_end)
1496 {
1497 /* SRC_BASE remembers the start position in source in each loop.
1498 The loop will be exited when there's not enough source text
1499 to analyze multi-byte codes (within macros ONE_MORE_BYTE,
1500 TWO_MORE_BYTES, and THREE_MORE_BYTES). In that case, SRC is
1501 reset to SRC_BASE before exiting. */
1502 unsigned char *src_base = src;
1503 int charset, c1, c2, c3, c4;
1504
1505 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL
1506 && CODING_SPEC_ISO_BOL (coding))
1507 {
1508 /* We have to produce designation sequences if any now. */
1509 encode_designation_at_bol (coding, unification_table,
1510 src, src_end, &dst);
1511 CODING_SPEC_ISO_BOL (coding) = 0;
1512 }
1513
1514 c1 = *src++;
1515 /* If we are seeing a component of a composite character, we are
1516 seeing a leading-code specially encoded for composition, or a
1517 composition rule if composing with rule. We must set C1
1518 to a normal leading-code or an ASCII code. If we are not at
1519 a composed character, we must reset the composition state. */
1520 if (COMPOSING_P (coding->composing))
1521 {
1522 if (c1 < 0xA0)
1523 {
1524 /* We are not in a composite character any longer. */
1525 coding->composing = COMPOSING_NO;
1526 ENCODE_COMPOSITION_END;
1527 }
1528 else
1529 {
1530 if (coding->composing == COMPOSING_WITH_RULE_RULE)
1531 {
1532 *dst++ = c1 & 0x7F;
1533 coding->composing = COMPOSING_WITH_RULE_HEAD;
1534 continue;
1535 }
1536 else if (coding->composing == COMPOSING_WITH_RULE_HEAD)
1537 coding->composing = COMPOSING_WITH_RULE_RULE;
1538 if (c1 == 0xA0)
1539 {
1540 /* This is an ASCII component. */
1541 ONE_MORE_BYTE (c1);
1542 c1 &= 0x7F;
1543 }
1544 else
1545 /* This is a leading-code of non ASCII component. */
1546 c1 -= 0x20;
1547 }
1548 }
1549
1550 /* Now encode one character. C1 is a control character, an
1551 ASCII character, or a leading-code of multi-byte character. */
1552 switch (emacs_code_class[c1])
1553 {
1554 case EMACS_ascii_code:
1555 ENCODE_ISO_CHARACTER (CHARSET_ASCII, c1, /* dummy */ c2);
1556 break;
1557
1558 case EMACS_control_code:
1559 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
1560 ENCODE_RESET_PLANE_AND_REGISTER;
1561 *dst++ = c1;
1562 break;
1563
1564 case EMACS_carriage_return_code:
1565 if (!coding->selective)
1566 {
1567 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
1568 ENCODE_RESET_PLANE_AND_REGISTER;
1569 *dst++ = c1;
1570 break;
1571 }
1572 /* fall down to treat '\r' as '\n' ... */
1573
1574 case EMACS_linefeed_code:
1575 if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
1576 ENCODE_RESET_PLANE_AND_REGISTER;
1577 if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL)
1578 bcopy (coding->spec.iso2022.initial_designation,
1579 coding->spec.iso2022.current_designation,
1580 sizeof coding->spec.iso2022.initial_designation);
1581 if (coding->eol_type == CODING_EOL_LF
1582 || coding->eol_type == CODING_EOL_UNDECIDED)
1583 *dst++ = ISO_CODE_LF;
1584 else if (coding->eol_type == CODING_EOL_CRLF)
1585 *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
1586 else
1587 *dst++ = ISO_CODE_CR;
1588 CODING_SPEC_ISO_BOL (coding) = 1;
1589 break;
1590
1591 case EMACS_leading_code_2:
1592 ONE_MORE_BYTE (c2);
1593 if (c2 < 0xA0)
1594 {
1595 /* invalid sequence */
1596 *dst++ = c1;
1597 *dst++ = c2;
1598 }
1599 else
1600 ENCODE_ISO_CHARACTER (c1, c2, /* dummy */ c3);
1601 break;
1602
1603 case EMACS_leading_code_3:
1604 TWO_MORE_BYTES (c2, c3);
1605 if (c2 < 0xA0 || c3 < 0xA0)
1606 {
1607 /* invalid sequence */
1608 *dst++ = c1;
1609 *dst++ = c2;
1610 *dst++ = c3;
1611 }
1612 else if (c1 < LEADING_CODE_PRIVATE_11)
1613 ENCODE_ISO_CHARACTER (c1, c2, c3);
1614 else
1615 ENCODE_ISO_CHARACTER (c2, c3, /* dummy */ c4);
1616 break;
1617
1618 case EMACS_leading_code_4:
1619 THREE_MORE_BYTES (c2, c3, c4);
1620 if (c2 < 0xA0 || c3 < 0xA0 || c4 < 0xA0)
1621 {
1622 /* invalid sequence */
1623 *dst++ = c1;
1624 *dst++ = c2;
1625 *dst++ = c3;
1626 *dst++ = c4;
1627 }
1628 else
1629 ENCODE_ISO_CHARACTER (c2, c3, c4);
1630 break;
1631
1632 case EMACS_leading_code_composition:
1633 ONE_MORE_BYTE (c2);
1634 if (c2 < 0xA0)
1635 {
1636 /* invalid sequence */
1637 *dst++ = c1;
1638 *dst++ = c2;
1639 }
1640 else if (c2 == 0xFF)
1641 {
1642 coding->composing = COMPOSING_WITH_RULE_HEAD;
1643 ENCODE_COMPOSITION_WITH_RULE_START;
1644 }
1645 else
1646 {
1647 /* Rewind one byte because it is a character code of
1648 composition elements. */
1649 src--;
1650 coding->composing = COMPOSING_NO_RULE_HEAD;
1651 ENCODE_COMPOSITION_NO_RULE_START;
1652 }
1653 break;
1654
1655 case EMACS_invalid_code:
1656 *dst++ = c1;
1657 break;
1658 }
1659 continue;
1660 label_end_of_loop:
1661 /* We reach here because the source date ends not at character
1662 boundary. */
1663 coding->carryover_size = src_end - src_base;
1664 bcopy (src_base, coding->carryover, coding->carryover_size);
1665 src = src_end;
1666 break;
1667 }
1668
1669 /* If this is the last block of the text to be encoded, we must
1670 reset graphic planes and registers to the initial state. */
1671 if (src >= src_end && coding->last_block)
1672 {
1673 ENCODE_RESET_PLANE_AND_REGISTER;
1674 if (coding->carryover_size > 0
1675 && coding->carryover_size < (dst_end - dst))
1676 {
1677 bcopy (coding->carryover, dst, coding->carryover_size);
1678 dst += coding->carryover_size;
1679 coding->carryover_size = 0;
1680 }
1681 }
1682 *consumed = src - source;
1683 return dst - destination;
1684 }
1685
1686 \f
1687 /*** 4. SJIS and BIG5 handlers ***/
1688
1689 /* Although SJIS and BIG5 are not ISO's coding system, they are used
1690 quite widely. So, for the moment, Emacs supports them in the bare
1691 C code. But, in the future, they may be supported only by CCL. */
1692
1693 /* SJIS is a coding system encoding three character sets: ASCII, right
1694 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
1695 as is. A character of charset katakana-jisx0201 is encoded by
1696 "position-code + 0x80". A character of charset japanese-jisx0208
1697 is encoded in 2-byte but two position-codes are divided and shifted
1698 so that it fit in the range below.
1699
1700 --- CODE RANGE of SJIS ---
1701 (character set) (range)
1702 ASCII 0x00 .. 0x7F
1703 KATAKANA-JISX0201 0xA0 .. 0xDF
1704 JISX0208 (1st byte) 0x80 .. 0x9F and 0xE0 .. 0xFF
1705 (2nd byte) 0x40 .. 0xFF
1706 -------------------------------
1707
1708 */
1709
1710 /* BIG5 is a coding system encoding two character sets: ASCII and
1711 Big5. An ASCII character is encoded as is. Big5 is a two-byte
1712 character set and is encoded in two-byte.
1713
1714 --- CODE RANGE of BIG5 ---
1715 (character set) (range)
1716 ASCII 0x00 .. 0x7F
1717 Big5 (1st byte) 0xA1 .. 0xFE
1718 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
1719 --------------------------
1720
1721 Since the number of characters in Big5 is larger than maximum
1722 characters in Emacs' charset (96x96), it can't be handled as one
1723 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
1724 and `charset-big5-2'. Both are DIMENSION2 and CHARS94. The former
1725 contains frequently used characters and the latter contains less
1726 frequently used characters. */
1727
1728 /* Macros to decode or encode a character of Big5 in BIG5. B1 and B2
1729 are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
1730 C1 and C2 are the 1st and 2nd position-codes of of Emacs' internal
1731 format. CHARSET is `charset_big5_1' or `charset_big5_2'. */
1732
1733 /* Number of Big5 characters which have the same code in 1st byte. */
1734 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
1735
1736 #define DECODE_BIG5(b1, b2, charset, c1, c2) \
1737 do { \
1738 unsigned int temp \
1739 = (b1 - 0xA1) * BIG5_SAME_ROW + b2 - (b2 < 0x7F ? 0x40 : 0x62); \
1740 if (b1 < 0xC9) \
1741 charset = charset_big5_1; \
1742 else \
1743 { \
1744 charset = charset_big5_2; \
1745 temp -= (0xC9 - 0xA1) * BIG5_SAME_ROW; \
1746 } \
1747 c1 = temp / (0xFF - 0xA1) + 0x21; \
1748 c2 = temp % (0xFF - 0xA1) + 0x21; \
1749 } while (0)
1750
1751 #define ENCODE_BIG5(charset, c1, c2, b1, b2) \
1752 do { \
1753 unsigned int temp = (c1 - 0x21) * (0xFF - 0xA1) + (c2 - 0x21); \
1754 if (charset == charset_big5_2) \
1755 temp += BIG5_SAME_ROW * (0xC9 - 0xA1); \
1756 b1 = temp / BIG5_SAME_ROW + 0xA1; \
1757 b2 = temp % BIG5_SAME_ROW; \
1758 b2 += b2 < 0x3F ? 0x40 : 0x62; \
1759 } while (0)
1760
1761 #define DECODE_SJIS_BIG5_CHARACTER(charset, c1, c2) \
1762 do { \
1763 int c_alt, charset_alt = (charset); \
1764 if (!NILP (unification_table) \
1765 && ((c_alt = unify_char (unification_table, \
1766 -1, (charset), c1, c2)) >= 0)) \
1767 SPLIT_CHAR (c_alt, charset_alt, c1, c2); \
1768 if (charset_alt == CHARSET_ASCII || charset_alt < 0) \
1769 DECODE_CHARACTER_ASCII (c1); \
1770 else if (CHARSET_DIMENSION (charset_alt) == 1) \
1771 DECODE_CHARACTER_DIMENSION1 (charset_alt, c1); \
1772 else \
1773 DECODE_CHARACTER_DIMENSION2 (charset_alt, c1, c2); \
1774 } while (0)
1775
1776 #define ENCODE_SJIS_BIG5_CHARACTER(charset, c1, c2) \
1777 do { \
1778 int c_alt, charset_alt; \
1779 if (!NILP (unification_table) \
1780 && ((c_alt = unify_char (unification_table, -1, charset, c1, c2)) \
1781 >= 0)) \
1782 SPLIT_CHAR (c_alt, charset_alt, c1, c2); \
1783 else \
1784 charset_alt = charset; \
1785 if (charset_alt == charset_ascii) \
1786 *dst++ = c1; \
1787 else if (CHARSET_DIMENSION (charset_alt) == 1) \
1788 { \
1789 if (sjis_p && charset_alt == charset_katakana_jisx0201) \
1790 *dst++ = c1; \
1791 else \
1792 *dst++ = charset_alt, *dst++ = c1; \
1793 } \
1794 else \
1795 { \
1796 c1 &= 0x7F, c2 &= 0x7F; \
1797 if (sjis_p && charset_alt == charset_jisx0208) \
1798 { \
1799 unsigned char s1, s2; \
1800 \
1801 ENCODE_SJIS (c1, c2, s1, s2); \
1802 *dst++ = s1, *dst++ = s2; \
1803 } \
1804 else if (!sjis_p \
1805 && (charset_alt == charset_big5_1 \
1806 || charset_alt == charset_big5_2)) \
1807 { \
1808 unsigned char b1, b2; \
1809 \
1810 ENCODE_BIG5 (charset_alt, c1, c2, b1, b2); \
1811 *dst++ = b1, *dst++ = b2; \
1812 } \
1813 else \
1814 *dst++ = charset_alt, *dst++ = c1, *dst++ = c2; \
1815 } \
1816 } while (0);
1817
1818 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1819 Check if a text is encoded in SJIS. If it is, return
1820 CODING_CATEGORY_MASK_SJIS, else return 0. */
1821
1822 int
1823 detect_coding_sjis (src, src_end)
1824 unsigned char *src, *src_end;
1825 {
1826 unsigned char c;
1827
1828 while (src < src_end)
1829 {
1830 c = *src++;
1831 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
1832 return 0;
1833 if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
1834 {
1835 if (src < src_end && *src++ < 0x40)
1836 return 0;
1837 }
1838 }
1839 return CODING_CATEGORY_MASK_SJIS;
1840 }
1841
1842 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1843 Check if a text is encoded in BIG5. If it is, return
1844 CODING_CATEGORY_MASK_BIG5, else return 0. */
1845
1846 int
1847 detect_coding_big5 (src, src_end)
1848 unsigned char *src, *src_end;
1849 {
1850 unsigned char c;
1851
1852 while (src < src_end)
1853 {
1854 c = *src++;
1855 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
1856 return 0;
1857 if (c >= 0xA1)
1858 {
1859 if (src >= src_end)
1860 break;
1861 c = *src++;
1862 if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
1863 return 0;
1864 }
1865 }
1866 return CODING_CATEGORY_MASK_BIG5;
1867 }
1868
1869 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
1870 If SJIS_P is 1, decode SJIS text, else decode BIG5 test. */
1871
1872 int
1873 decode_coding_sjis_big5 (coding, source, destination,
1874 src_bytes, dst_bytes, consumed, sjis_p)
1875 struct coding_system *coding;
1876 unsigned char *source, *destination;
1877 int src_bytes, dst_bytes;
1878 int *consumed;
1879 int sjis_p;
1880 {
1881 unsigned char *src = source;
1882 unsigned char *src_end = source + src_bytes;
1883 unsigned char *dst = destination;
1884 unsigned char *dst_end = destination + dst_bytes;
1885 /* Since the maximum bytes produced by each loop is 4, we subtract 3
1886 from DST_END to assure overflow checking is necessary only at the
1887 head of loop. */
1888 unsigned char *adjusted_dst_end = dst_end - 3;
1889 Lisp_Object unification_table
1890 = coding->character_unification_table_for_decode;
1891
1892 if (!NILP (Venable_character_unification) && NILP (unification_table))
1893 unification_table = Vstandard_character_unification_table_for_decode;
1894
1895 while (src < src_end && dst < adjusted_dst_end)
1896 {
1897 /* SRC_BASE remembers the start position in source in each loop.
1898 The loop will be exited when there's not enough source text
1899 to analyze two-byte character (within macro ONE_MORE_BYTE).
1900 In that case, SRC is reset to SRC_BASE before exiting. */
1901 unsigned char *src_base = src;
1902 unsigned char c1 = *src++, c2, c3, c4;
1903
1904 if (c1 == '\r')
1905 {
1906 if (coding->eol_type == CODING_EOL_CRLF)
1907 {
1908 ONE_MORE_BYTE (c2);
1909 if (c2 == '\n')
1910 *dst++ = c2;
1911 else
1912 /* To process C2 again, SRC is subtracted by 1. */
1913 *dst++ = c1, src--;
1914 }
1915 else
1916 *dst++ = c1;
1917 }
1918 else if (c1 < 0x20)
1919 *dst++ = c1;
1920 else if (c1 < 0x80)
1921 DECODE_SJIS_BIG5_CHARACTER (charset_ascii, c1, /* dummy */ c2);
1922 else if (c1 < 0xA0 || c1 >= 0xE0)
1923 {
1924 /* SJIS -> JISX0208, BIG5 -> Big5 (only if 0xE0 <= c1 < 0xFF) */
1925 if (sjis_p)
1926 {
1927 ONE_MORE_BYTE (c2);
1928 DECODE_SJIS (c1, c2, c3, c4);
1929 DECODE_SJIS_BIG5_CHARACTER (charset_jisx0208, c3, c4);
1930 }
1931 else if (c1 >= 0xE0 && c1 < 0xFF)
1932 {
1933 int charset;
1934
1935 ONE_MORE_BYTE (c2);
1936 DECODE_BIG5 (c1, c2, charset, c3, c4);
1937 DECODE_SJIS_BIG5_CHARACTER (charset, c3, c4);
1938 }
1939 else /* Invalid code */
1940 *dst++ = c1;
1941 }
1942 else
1943 {
1944 /* SJIS -> JISX0201-Kana, BIG5 -> Big5 */
1945 if (sjis_p)
1946 DECODE_SJIS_BIG5_CHARACTER (charset_katakana_jisx0201, c1, /* dummy */ c2);
1947 else
1948 {
1949 int charset;
1950
1951 ONE_MORE_BYTE (c2);
1952 DECODE_BIG5 (c1, c2, charset, c3, c4);
1953 DECODE_SJIS_BIG5_CHARACTER (charset, c3, c4);
1954 }
1955 }
1956 continue;
1957
1958 label_end_of_loop:
1959 coding->carryover_size = src - src_base;
1960 bcopy (src_base, coding->carryover, coding->carryover_size);
1961 src = src_base;
1962 break;
1963 }
1964
1965 *consumed = src - source;
1966 return dst - destination;
1967 }
1968
1969 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
1970 This function can encode `charset_ascii', `charset_katakana_jisx0201',
1971 `charset_jisx0208', `charset_big5_1', and `charset_big5-2'. We are
1972 sure that all these charsets are registered as official charset
1973 (i.e. do not have extended leading-codes). Characters of other
1974 charsets are produced without any encoding. If SJIS_P is 1, encode
1975 SJIS text, else encode BIG5 text. */
1976
1977 int
1978 encode_coding_sjis_big5 (coding, source, destination,
1979 src_bytes, dst_bytes, consumed, sjis_p)
1980 struct coding_system *coding;
1981 unsigned char *source, *destination;
1982 int src_bytes, dst_bytes;
1983 int *consumed;
1984 int sjis_p;
1985 {
1986 unsigned char *src = source;
1987 unsigned char *src_end = source + src_bytes;
1988 unsigned char *dst = destination;
1989 unsigned char *dst_end = destination + dst_bytes;
1990 /* Since the maximum bytes produced by each loop is 2, we subtract 1
1991 from DST_END to assure overflow checking is necessary only at the
1992 head of loop. */
1993 unsigned char *adjusted_dst_end = dst_end - 1;
1994 Lisp_Object unification_table
1995 = coding->character_unification_table_for_encode;
1996
1997 if (!NILP (Venable_character_unification) && NILP (unification_table))
1998 unification_table = Vstandard_character_unification_table_for_encode;
1999
2000 while (src < src_end && dst < adjusted_dst_end)
2001 {
2002 /* SRC_BASE remembers the start position in source in each loop.
2003 The loop will be exited when there's not enough source text
2004 to analyze multi-byte codes (within macros ONE_MORE_BYTE and
2005 TWO_MORE_BYTES). In that case, SRC is reset to SRC_BASE
2006 before exiting. */
2007 unsigned char *src_base = src;
2008 unsigned char c1 = *src++, c2, c3, c4;
2009
2010 if (coding->composing)
2011 {
2012 if (c1 == 0xA0)
2013 {
2014 ONE_MORE_BYTE (c1);
2015 c1 &= 0x7F;
2016 }
2017 else if (c1 >= 0xA0)
2018 c1 -= 0x20;
2019 else
2020 coding->composing = 0;
2021 }
2022
2023 switch (emacs_code_class[c1])
2024 {
2025 case EMACS_ascii_code:
2026 ENCODE_SJIS_BIG5_CHARACTER (charset_ascii, c1, /* dummy */ c2);
2027 break;
2028
2029 case EMACS_control_code:
2030 *dst++ = c1;
2031 break;
2032
2033 case EMACS_carriage_return_code:
2034 if (!coding->selective)
2035 {
2036 *dst++ = c1;
2037 break;
2038 }
2039 /* fall down to treat '\r' as '\n' ... */
2040
2041 case EMACS_linefeed_code:
2042 if (coding->eol_type == CODING_EOL_LF
2043 || coding->eol_type == CODING_EOL_UNDECIDED)
2044 *dst++ = '\n';
2045 else if (coding->eol_type == CODING_EOL_CRLF)
2046 *dst++ = '\r', *dst++ = '\n';
2047 else
2048 *dst++ = '\r';
2049 break;
2050
2051 case EMACS_leading_code_2:
2052 ONE_MORE_BYTE (c2);
2053 ENCODE_SJIS_BIG5_CHARACTER (c1, c2, /* dummy */ c3);
2054 break;
2055
2056 case EMACS_leading_code_3:
2057 TWO_MORE_BYTES (c2, c3);
2058 ENCODE_SJIS_BIG5_CHARACTER (c1, c2, c3);
2059 break;
2060
2061 case EMACS_leading_code_4:
2062 THREE_MORE_BYTES (c2, c3, c4);
2063 ENCODE_SJIS_BIG5_CHARACTER (c2, c3, c4);
2064 break;
2065
2066 case EMACS_leading_code_composition:
2067 coding->composing = 1;
2068 break;
2069
2070 default: /* i.e. case EMACS_invalid_code: */
2071 *dst++ = c1;
2072 }
2073 continue;
2074
2075 label_end_of_loop:
2076 coding->carryover_size = src_end - src_base;
2077 bcopy (src_base, coding->carryover, coding->carryover_size);
2078 src = src_end;
2079 break;
2080 }
2081
2082 *consumed = src - source;
2083 return dst - destination;
2084 }
2085
2086 \f
2087 /*** 5. End-of-line handlers ***/
2088
2089 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
2090 This function is called only when `coding->eol_type' is
2091 CODING_EOL_CRLF or CODING_EOL_CR. */
2092
2093 decode_eol (coding, source, destination, src_bytes, dst_bytes, consumed)
2094 struct coding_system *coding;
2095 unsigned char *source, *destination;
2096 int src_bytes, dst_bytes;
2097 int *consumed;
2098 {
2099 unsigned char *src = source;
2100 unsigned char *src_end = source + src_bytes;
2101 unsigned char *dst = destination;
2102 unsigned char *dst_end = destination + dst_bytes;
2103 int produced;
2104
2105 switch (coding->eol_type)
2106 {
2107 case CODING_EOL_CRLF:
2108 {
2109 /* Since the maximum bytes produced by each loop is 2, we
2110 subtract 1 from DST_END to assure overflow checking is
2111 necessary only at the head of loop. */
2112 unsigned char *adjusted_dst_end = dst_end - 1;
2113
2114 while (src < src_end && dst < adjusted_dst_end)
2115 {
2116 unsigned char *src_base = src;
2117 unsigned char c = *src++;
2118 if (c == '\r')
2119 {
2120 ONE_MORE_BYTE (c);
2121 if (c != '\n')
2122 *dst++ = '\r';
2123 *dst++ = c;
2124 }
2125 else
2126 *dst++ = c;
2127 continue;
2128
2129 label_end_of_loop:
2130 coding->carryover_size = src - src_base;
2131 bcopy (src_base, coding->carryover, coding->carryover_size);
2132 src = src_base;
2133 break;
2134 }
2135 *consumed = src - source;
2136 produced = dst - destination;
2137 break;
2138 }
2139
2140 case CODING_EOL_CR:
2141 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2142 bcopy (source, destination, produced);
2143 dst_end = destination + produced;
2144 while (dst < dst_end)
2145 if (*dst++ == '\r') dst[-1] = '\n';
2146 *consumed = produced;
2147 break;
2148
2149 default: /* i.e. case: CODING_EOL_LF */
2150 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2151 bcopy (source, destination, produced);
2152 *consumed = produced;
2153 break;
2154 }
2155
2156 return produced;
2157 }
2158
2159 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions". Encode
2160 format of end-of-line according to `coding->eol_type'. If
2161 `coding->selective' is 1, code '\r' in source text also means
2162 end-of-line. */
2163
2164 encode_eol (coding, source, destination, src_bytes, dst_bytes, consumed)
2165 struct coding_system *coding;
2166 unsigned char *source, *destination;
2167 int src_bytes, dst_bytes;
2168 int *consumed;
2169 {
2170 unsigned char *src = source;
2171 unsigned char *dst = destination;
2172 int produced;
2173
2174 if (src_bytes <= 0)
2175 return 0;
2176
2177 switch (coding->eol_type)
2178 {
2179 case CODING_EOL_LF:
2180 case CODING_EOL_UNDECIDED:
2181 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2182 bcopy (source, destination, produced);
2183 if (coding->selective)
2184 {
2185 int i = produced;
2186 while (i--)
2187 if (*dst++ == '\r') dst[-1] = '\n';
2188 }
2189 *consumed = produced;
2190
2191 case CODING_EOL_CRLF:
2192 {
2193 unsigned char c;
2194 unsigned char *src_end = source + src_bytes;
2195 unsigned char *dst_end = destination + dst_bytes;
2196 /* Since the maximum bytes produced by each loop is 2, we
2197 subtract 1 from DST_END to assure overflow checking is
2198 necessary only at the head of loop. */
2199 unsigned char *adjusted_dst_end = dst_end - 1;
2200
2201 while (src < src_end && dst < adjusted_dst_end)
2202 {
2203 c = *src++;
2204 if (c == '\n' || (c == '\r' && coding->selective))
2205 *dst++ = '\r', *dst++ = '\n';
2206 else
2207 *dst++ = c;
2208 }
2209 produced = dst - destination;
2210 *consumed = src - source;
2211 break;
2212 }
2213
2214 default: /* i.e. case CODING_EOL_CR: */
2215 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2216 bcopy (source, destination, produced);
2217 {
2218 int i = produced;
2219 while (i--)
2220 if (*dst++ == '\n') dst[-1] = '\r';
2221 }
2222 *consumed = produced;
2223 }
2224
2225 return produced;
2226 }
2227
2228 \f
2229 /*** 6. C library functions ***/
2230
2231 /* In Emacs Lisp, coding system is represented by a Lisp symbol which
2232 has a property `coding-system'. The value of this property is a
2233 vector of length 5 (called as coding-vector). Among elements of
2234 this vector, the first (element[0]) and the fifth (element[4])
2235 carry important information for decoding/encoding. Before
2236 decoding/encoding, this information should be set in fields of a
2237 structure of type `coding_system'.
2238
2239 A value of property `coding-system' can be a symbol of another
2240 subsidiary coding-system. In that case, Emacs gets coding-vector
2241 from that symbol.
2242
2243 `element[0]' contains information to be set in `coding->type'. The
2244 value and its meaning is as follows:
2245
2246 0 -- coding_type_emacs_mule
2247 1 -- coding_type_sjis
2248 2 -- coding_type_iso2022
2249 3 -- coding_type_big5
2250 4 -- coding_type_ccl encoder/decoder written in CCL
2251 nil -- coding_type_no_conversion
2252 t -- coding_type_undecided (automatic conversion on decoding,
2253 no-conversion on encoding)
2254
2255 `element[4]' contains information to be set in `coding->flags' and
2256 `coding->spec'. The meaning varies by `coding->type'.
2257
2258 If `coding->type' is `coding_type_iso2022', element[4] is a vector
2259 of length 32 (of which the first 13 sub-elements are used now).
2260 Meanings of these sub-elements are:
2261
2262 sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
2263 If the value is an integer of valid charset, the charset is
2264 assumed to be designated to graphic register N initially.
2265
2266 If the value is minus, it is a minus value of charset which
2267 reserves graphic register N, which means that the charset is
2268 not designated initially but should be designated to graphic
2269 register N just before encoding a character in that charset.
2270
2271 If the value is nil, graphic register N is never used on
2272 encoding.
2273
2274 sub-element[N] where N is 4 through 11: to be set in `coding->flags'
2275 Each value takes t or nil. See the section ISO2022 of
2276 `coding.h' for more information.
2277
2278 If `coding->type' is `coding_type_big5', element[4] is t to denote
2279 BIG5-ETen or nil to denote BIG5-HKU.
2280
2281 If `coding->type' takes the other value, element[4] is ignored.
2282
2283 Emacs Lisp's coding system also carries information about format of
2284 end-of-line in a value of property `eol-type'. If the value is
2285 integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
2286 means CODING_EOL_CR. If it is not integer, it should be a vector
2287 of subsidiary coding systems of which property `eol-type' has one
2288 of above values.
2289
2290 */
2291
2292 /* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
2293 and set it in CODING. If CODING_SYSTEM_SYMBOL is invalid, CODING
2294 is setup so that no conversion is necessary and return -1, else
2295 return 0. */
2296
2297 int
2298 setup_coding_system (coding_system, coding)
2299 Lisp_Object coding_system;
2300 struct coding_system *coding;
2301 {
2302 Lisp_Object type, eol_type;
2303
2304 /* At first, set several fields to default values. */
2305 coding->require_flushing = 0;
2306 coding->last_block = 0;
2307 coding->selective = 0;
2308 coding->composing = 0;
2309 coding->direction = 0;
2310 coding->carryover_size = 0;
2311 coding->post_read_conversion = coding->pre_write_conversion = Qnil;
2312 coding->character_unification_table_for_decode = Qnil;
2313 coding->character_unification_table_for_encode = Qnil;
2314
2315 Vlast_coding_system_used = coding->symbol = coding_system;
2316 eol_type = Qnil;
2317 /* Get value of property `coding-system' until we get a vector.
2318 While doing that, also get values of properties
2319 `post-read-conversion', `pre-write-conversion',
2320 `character-unification-table-for-decode',
2321 `character-unification-table-for-encode' and `eol-type'. */
2322 while (!NILP (coding_system) && SYMBOLP (coding_system))
2323 {
2324 if (NILP (coding->post_read_conversion))
2325 coding->post_read_conversion = Fget (coding_system,
2326 Qpost_read_conversion);
2327 if (NILP (coding->pre_write_conversion))
2328 coding->pre_write_conversion = Fget (coding_system,
2329 Qpre_write_conversion);
2330 if (!inhibit_eol_conversion && NILP (eol_type))
2331 eol_type = Fget (coding_system, Qeol_type);
2332
2333 if (NILP (coding->character_unification_table_for_decode))
2334 coding->character_unification_table_for_decode
2335 = Fget (coding_system, Qcharacter_unification_table_for_decode);
2336
2337 if (NILP (coding->character_unification_table_for_encode))
2338 coding->character_unification_table_for_encode
2339 = Fget (coding_system, Qcharacter_unification_table_for_encode);
2340
2341 coding_system = Fget (coding_system, Qcoding_system);
2342 }
2343
2344 while (!NILP (coding->character_unification_table_for_decode)
2345 && SYMBOLP (coding->character_unification_table_for_decode))
2346 coding->character_unification_table_for_decode
2347 = Fget (coding->character_unification_table_for_decode,
2348 Qcharacter_unification_table_for_decode);
2349 if (!NILP (coding->character_unification_table_for_decode)
2350 && !CHAR_TABLE_P (coding->character_unification_table_for_decode))
2351 coding->character_unification_table_for_decode = Qnil;
2352
2353 while (!NILP (coding->character_unification_table_for_encode)
2354 && SYMBOLP (coding->character_unification_table_for_encode))
2355 coding->character_unification_table_for_encode
2356 = Fget (coding->character_unification_table_for_encode,
2357 Qcharacter_unification_table_for_encode);
2358 if (!NILP (coding->character_unification_table_for_encode)
2359 && !CHAR_TABLE_P (coding->character_unification_table_for_encode))
2360 coding->character_unification_table_for_encode = Qnil;
2361
2362 if (!VECTORP (coding_system)
2363 || XVECTOR (coding_system)->size != 5)
2364 goto label_invalid_coding_system;
2365
2366 if (VECTORP (eol_type))
2367 coding->eol_type = CODING_EOL_UNDECIDED;
2368 else if (XFASTINT (eol_type) == 1)
2369 coding->eol_type = CODING_EOL_CRLF;
2370 else if (XFASTINT (eol_type) == 2)
2371 coding->eol_type = CODING_EOL_CR;
2372 else
2373 coding->eol_type = CODING_EOL_LF;
2374
2375 type = XVECTOR (coding_system)->contents[0];
2376 switch (XFASTINT (type))
2377 {
2378 case 0:
2379 coding->type = coding_type_emacs_mule;
2380 break;
2381
2382 case 1:
2383 coding->type = coding_type_sjis;
2384 break;
2385
2386 case 2:
2387 coding->type = coding_type_iso2022;
2388 {
2389 Lisp_Object val;
2390 Lisp_Object *flags;
2391 int i, charset, default_reg_bits = 0;
2392
2393 val = XVECTOR (coding_system)->contents[4];
2394
2395 if (!VECTORP (val) || XVECTOR (val)->size != 32)
2396 goto label_invalid_coding_system;
2397
2398 flags = XVECTOR (val)->contents;
2399 coding->flags
2400 = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
2401 | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
2402 | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
2403 | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
2404 | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
2405 | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
2406 | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
2407 | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
2408 | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION)
2409 | (NILP (flags[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL)
2410 | (NILP (flags[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL)
2411 | (NILP (flags[15]) ? 0 : CODING_FLAG_ISO_SAFE)
2412 | (NILP (flags[16]) ? 0 : CODING_FLAG_ISO_LATIN_EXTRA)
2413 );
2414
2415 /* Invoke graphic register 0 to plane 0. */
2416 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
2417 /* Invoke graphic register 1 to plane 1 if we can use full 8-bit. */
2418 CODING_SPEC_ISO_INVOCATION (coding, 1)
2419 = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
2420 /* Not single shifting at first. */
2421 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;
2422 /* Beginning of buffer should also be regarded as bol. */
2423 CODING_SPEC_ISO_BOL (coding) = 1;
2424
2425 /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
2426 FLAGS[REG] can be one of below:
2427 integer CHARSET: CHARSET occupies register I,
2428 t: designate nothing to REG initially, but can be used
2429 by any charsets,
2430 list of integer, nil, or t: designate the first
2431 element (if integer) to REG initially, the remaining
2432 elements (if integer) is designated to REG on request,
2433 if an element is t, REG can be used by any charset,
2434 nil: REG is never used. */
2435 for (charset = 0; charset <= MAX_CHARSET; charset++)
2436 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2437 = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION;
2438 bzero (CODING_SPEC_ISO_EXPECTED_CHARSETS (coding), MAX_CHARSET + 1);
2439 for (i = 0; i < 4; i++)
2440 {
2441 if (INTEGERP (flags[i])
2442 && (charset = XINT (flags[i]), CHARSET_VALID_P (charset))
2443 || (charset = get_charset_id (flags[i])) >= 0)
2444 {
2445 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
2446 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
2447 CODING_SPEC_ISO_EXPECTED_CHARSETS (coding)[charset] = 1;
2448 }
2449 else if (EQ (flags[i], Qt))
2450 {
2451 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2452 default_reg_bits |= 1 << i;
2453 }
2454 else if (CONSP (flags[i]))
2455 {
2456 Lisp_Object tail = flags[i];
2457
2458 if (INTEGERP (XCONS (tail)->car)
2459 && (charset = XINT (XCONS (tail)->car),
2460 CHARSET_VALID_P (charset))
2461 || (charset = get_charset_id (XCONS (tail)->car)) >= 0)
2462 {
2463 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
2464 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
2465 CODING_SPEC_ISO_EXPECTED_CHARSETS (coding)[charset] = 1;
2466 }
2467 else
2468 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2469 tail = XCONS (tail)->cdr;
2470 while (CONSP (tail))
2471 {
2472 if (INTEGERP (XCONS (tail)->car)
2473 && (charset = XINT (XCONS (tail)->car),
2474 CHARSET_VALID_P (charset))
2475 || (charset = get_charset_id (XCONS (tail)->car)) >= 0)
2476 {
2477 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2478 = i;
2479 CODING_SPEC_ISO_EXPECTED_CHARSETS (coding)[charset]
2480 = 1;
2481 }
2482 else if (EQ (XCONS (tail)->car, Qt))
2483 default_reg_bits |= 1 << i;
2484 tail = XCONS (tail)->cdr;
2485 }
2486 }
2487 else
2488 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2489
2490 CODING_SPEC_ISO_DESIGNATION (coding, i)
2491 = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
2492 }
2493
2494 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
2495 {
2496 /* REG 1 can be used only by locking shift in 7-bit env. */
2497 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
2498 default_reg_bits &= ~2;
2499 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
2500 /* Without any shifting, only REG 0 and 1 can be used. */
2501 default_reg_bits &= 3;
2502 }
2503
2504 for (charset = 0; charset <= MAX_CHARSET; charset++)
2505 if (CHARSET_VALID_P (charset)
2506 && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2507 == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
2508 {
2509 /* We have not yet decided where to designate CHARSET. */
2510 int reg_bits = default_reg_bits;
2511
2512 if (CHARSET_CHARS (charset) == 96)
2513 /* A charset of CHARS96 can't be designated to REG 0. */
2514 reg_bits &= ~1;
2515
2516 if (reg_bits)
2517 /* There exist some default graphic register. */
2518 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2519 = (reg_bits & 1
2520 ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
2521 else
2522 /* We anyway have to designate CHARSET to somewhere. */
2523 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2524 = (CHARSET_CHARS (charset) == 94
2525 ? 0
2526 : ((coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT
2527 || ! coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
2528 ? 1
2529 : (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT
2530 ? 2 : 0)));
2531 }
2532 }
2533 coding->require_flushing = 1;
2534 break;
2535
2536 case 3:
2537 coding->type = coding_type_big5;
2538 coding->flags
2539 = (NILP (XVECTOR (coding_system)->contents[4])
2540 ? CODING_FLAG_BIG5_HKU
2541 : CODING_FLAG_BIG5_ETEN);
2542 break;
2543
2544 case 4:
2545 coding->type = coding_type_ccl;
2546 {
2547 Lisp_Object val = XVECTOR (coding_system)->contents[4];
2548 if (CONSP (val)
2549 && VECTORP (XCONS (val)->car)
2550 && VECTORP (XCONS (val)->cdr))
2551 {
2552 setup_ccl_program (&(coding->spec.ccl.decoder), XCONS (val)->car);
2553 setup_ccl_program (&(coding->spec.ccl.encoder), XCONS (val)->cdr);
2554 }
2555 else
2556 goto label_invalid_coding_system;
2557 }
2558 coding->require_flushing = 1;
2559 break;
2560
2561 case 5:
2562 coding->type = coding_type_raw_text;
2563 break;
2564
2565 default:
2566 if (EQ (type, Qt))
2567 coding->type = coding_type_undecided;
2568 else
2569 coding->type = coding_type_no_conversion;
2570 break;
2571 }
2572 return 0;
2573
2574 label_invalid_coding_system:
2575 coding->type = coding_type_no_conversion;
2576 coding->eol_type = CODING_EOL_LF;
2577 coding->symbol = coding->pre_write_conversion = coding->post_read_conversion
2578 = Qnil;
2579 return -1;
2580 }
2581
2582 /* Emacs has a mechanism to automatically detect a coding system if it
2583 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
2584 it's impossible to distinguish some coding systems accurately
2585 because they use the same range of codes. So, at first, coding
2586 systems are categorized into 7, those are:
2587
2588 o coding-category-emacs-mule
2589
2590 The category for a coding system which has the same code range
2591 as Emacs' internal format. Assigned the coding-system (Lisp
2592 symbol) `emacs-mule' by default.
2593
2594 o coding-category-sjis
2595
2596 The category for a coding system which has the same code range
2597 as SJIS. Assigned the coding-system (Lisp
2598 symbol) `japanese-shift-jis' by default.
2599
2600 o coding-category-iso-7
2601
2602 The category for a coding system which has the same code range
2603 as ISO2022 of 7-bit environment. This doesn't use any locking
2604 shift and single shift functions. Assigned the coding-system
2605 (Lisp symbol) `iso-2022-7bit' by default.
2606
2607 o coding-category-iso-8-1
2608
2609 The category for a coding system which has the same code range
2610 as ISO2022 of 8-bit environment and graphic plane 1 used only
2611 for DIMENSION1 charset. This doesn't use any locking shift
2612 and single shift functions. Assigned the coding-system (Lisp
2613 symbol) `iso-latin-1' by default.
2614
2615 o coding-category-iso-8-2
2616
2617 The category for a coding system which has the same code range
2618 as ISO2022 of 8-bit environment and graphic plane 1 used only
2619 for DIMENSION2 charset. This doesn't use any locking shift
2620 and single shift functions. Assigned the coding-system (Lisp
2621 symbol) `japanese-iso-8bit' by default.
2622
2623 o coding-category-iso-7-else
2624
2625 The category for a coding system which has the same code range
2626 as ISO2022 of 7-bit environemnt but uses locking shift or
2627 single shift functions. Assigned the coding-system (Lisp
2628 symbol) `iso-2022-7bit-lock' by default.
2629
2630 o coding-category-iso-8-else
2631
2632 The category for a coding system which has the same code range
2633 as ISO2022 of 8-bit environemnt but uses locking shift or
2634 single shift functions. Assigned the coding-system (Lisp
2635 symbol) `iso-2022-8bit-ss2' by default.
2636
2637 o coding-category-big5
2638
2639 The category for a coding system which has the same code range
2640 as BIG5. Assigned the coding-system (Lisp symbol)
2641 `cn-big5' by default.
2642
2643 o coding-category-binary
2644
2645 The category for a coding system not categorized in any of the
2646 above. Assigned the coding-system (Lisp symbol)
2647 `no-conversion' by default.
2648
2649 Each of them is a Lisp symbol and the value is an actual
2650 `coding-system's (this is also a Lisp symbol) assigned by a user.
2651 What Emacs does actually is to detect a category of coding system.
2652 Then, it uses a `coding-system' assigned to it. If Emacs can't
2653 decide only one possible category, it selects a category of the
2654 highest priority. Priorities of categories are also specified by a
2655 user in a Lisp variable `coding-category-list'.
2656
2657 */
2658
2659 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
2660 If it detects possible coding systems, return an integer in which
2661 appropriate flag bits are set. Flag bits are defined by macros
2662 CODING_CATEGORY_MASK_XXX in `coding.h'. */
2663
2664 int
2665 detect_coding_mask (src, src_bytes)
2666 unsigned char *src;
2667 int src_bytes;
2668 {
2669 register unsigned char c;
2670 unsigned char *src_end = src + src_bytes;
2671 int mask;
2672
2673 /* At first, skip all ASCII characters and control characters except
2674 for three ISO2022 specific control characters. */
2675 label_loop_detect_coding:
2676 while (src < src_end)
2677 {
2678 c = *src;
2679 if (c >= 0x80
2680 || (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
2681 break;
2682 src++;
2683 }
2684
2685 if (src >= src_end)
2686 /* We found nothing other than ASCII. There's nothing to do. */
2687 return CODING_CATEGORY_MASK_ANY;
2688
2689 /* The text seems to be encoded in some multilingual coding system.
2690 Now, try to find in which coding system the text is encoded. */
2691 if (c < 0x80)
2692 {
2693 /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
2694 /* C is an ISO2022 specific control code of C0. */
2695 mask = detect_coding_iso2022 (src, src_end);
2696 src++;
2697 if (mask == 0)
2698 /* No valid ISO2022 code follows C. Try again. */
2699 goto label_loop_detect_coding;
2700 mask |= CODING_CATEGORY_MASK_RAW_TEXT;
2701 }
2702 else if (c < 0xA0)
2703 {
2704 /* If C is a special latin extra code,
2705 or is an ISO2022 specific control code of C1 (SS2 or SS3),
2706 or is an ISO2022 control-sequence-introducer (CSI),
2707 we should also consider the possibility of ISO2022 codings. */
2708 if ((VECTORP (Vlatin_extra_code_table)
2709 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
2710 || (c == ISO_CODE_SS2 || c == ISO_CODE_SS3)
2711 || (c == ISO_CODE_CSI
2712 && (src < src_end
2713 && (*src == ']'
2714 || (src + 1 < src_end
2715 && src[1] == ']'
2716 && (*src == '0' || *src == '1' || *src == '2'))))))
2717 mask = (detect_coding_iso2022 (src, src_end)
2718 | detect_coding_sjis (src, src_end)
2719 | detect_coding_emacs_mule (src, src_end)
2720 | CODING_CATEGORY_MASK_RAW_TEXT);
2721
2722 else
2723 /* C is the first byte of SJIS character code,
2724 or a leading-code of Emacs' internal format (emacs-mule). */
2725 mask = (detect_coding_sjis (src, src_end)
2726 | detect_coding_emacs_mule (src, src_end)
2727 | CODING_CATEGORY_MASK_RAW_TEXT);
2728 }
2729 else
2730 /* C is a character of ISO2022 in graphic plane right,
2731 or a SJIS's 1-byte character code (i.e. JISX0201),
2732 or the first byte of BIG5's 2-byte code. */
2733 mask = (detect_coding_iso2022 (src, src_end)
2734 | detect_coding_sjis (src, src_end)
2735 | detect_coding_big5 (src, src_end)
2736 | CODING_CATEGORY_MASK_RAW_TEXT);
2737
2738 return mask;
2739 }
2740
2741 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
2742 The information of the detected coding system is set in CODING. */
2743
2744 void
2745 detect_coding (coding, src, src_bytes)
2746 struct coding_system *coding;
2747 unsigned char *src;
2748 int src_bytes;
2749 {
2750 int mask = detect_coding_mask (src, src_bytes);
2751 int idx;
2752 Lisp_Object val = Vcoding_category_list;
2753
2754 if (mask == CODING_CATEGORY_MASK_ANY)
2755 /* We found nothing other than ASCII. There's nothing to do. */
2756 return;
2757
2758 /* We found some plausible coding systems. Let's use a coding
2759 system of the highest priority. */
2760
2761 if (CONSP (val))
2762 while (!NILP (val))
2763 {
2764 idx = XFASTINT (Fget (XCONS (val)->car, Qcoding_category_index));
2765 if ((idx < CODING_CATEGORY_IDX_MAX) && (mask & (1 << idx)))
2766 break;
2767 val = XCONS (val)->cdr;
2768 }
2769 else
2770 val = Qnil;
2771
2772 if (NILP (val))
2773 {
2774 /* For unknown reason, `Vcoding_category_list' contains none of
2775 found categories. Let's use any of them. */
2776 for (idx = 0; idx < CODING_CATEGORY_IDX_MAX; idx++)
2777 if (mask & (1 << idx))
2778 break;
2779 }
2780 setup_coding_system (XSYMBOL (coding_category_table[idx])->value, coding);
2781 }
2782
2783 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
2784 is encoded. Return one of CODING_EOL_LF, CODING_EOL_CRLF,
2785 CODING_EOL_CR, and CODING_EOL_UNDECIDED. */
2786
2787 #define MAX_EOL_CHECK_COUNT 3
2788
2789 int
2790 detect_eol_type (src, src_bytes)
2791 unsigned char *src;
2792 int src_bytes;
2793 {
2794 unsigned char *src_end = src + src_bytes;
2795 unsigned char c;
2796 int total = 0; /* How many end-of-lines are found so far. */
2797 int eol_type = CODING_EOL_UNDECIDED;
2798 int this_eol_type;
2799
2800 while (src < src_end && total < MAX_EOL_CHECK_COUNT)
2801 {
2802 c = *src++;
2803 if (c == '\n' || c == '\r')
2804 {
2805 total++;
2806 if (c == '\n')
2807 this_eol_type = CODING_EOL_LF;
2808 else if (src >= src_end || *src != '\n')
2809 this_eol_type = CODING_EOL_CR;
2810 else
2811 this_eol_type = CODING_EOL_CRLF, src++;
2812
2813 if (eol_type == CODING_EOL_UNDECIDED)
2814 /* This is the first end-of-line. */
2815 eol_type = this_eol_type;
2816 else if (eol_type != this_eol_type)
2817 /* The found type is different from what found before.
2818 Let's notice the caller about this inconsistency. */
2819 return CODING_EOL_INCONSISTENT;
2820 }
2821 }
2822
2823 return eol_type;
2824 }
2825
2826 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
2827 is encoded. If it detects an appropriate format of end-of-line, it
2828 sets the information in *CODING. */
2829
2830 void
2831 detect_eol (coding, src, src_bytes)
2832 struct coding_system *coding;
2833 unsigned char *src;
2834 int src_bytes;
2835 {
2836 Lisp_Object val, coding_system;
2837 int eol_type = detect_eol_type (src, src_bytes);
2838
2839 if (eol_type == CODING_EOL_UNDECIDED)
2840 /* We found no end-of-line in the source text. */
2841 return;
2842
2843 if (eol_type == CODING_EOL_INCONSISTENT)
2844 {
2845 #if 0
2846 /* This code is suppressed until we find a better way to
2847 distinguish raw text file and binary file. */
2848
2849 /* If we have already detected that the coding is raw-text, the
2850 coding should actually be no-conversion. */
2851 if (coding->type == coding_type_raw_text)
2852 {
2853 setup_coding_system (Qno_conversion, coding);
2854 return;
2855 }
2856 /* Else, let's decode only text code anyway. */
2857 #endif /* 0 */
2858 eol_type = CODING_EOL_LF;
2859 }
2860
2861 coding_system = coding->symbol;
2862 while (!NILP (coding_system)
2863 && NILP (val = Fget (coding_system, Qeol_type)))
2864 coding_system = Fget (coding_system, Qcoding_system);
2865 if (VECTORP (val) && XVECTOR (val)->size == 3)
2866 setup_coding_system (XVECTOR (val)->contents[eol_type], coding);
2867 }
2868
2869 /* See "GENERAL NOTES about `decode_coding_XXX ()' functions". Before
2870 decoding, it may detect coding system and format of end-of-line if
2871 those are not yet decided. */
2872
2873 int
2874 decode_coding (coding, source, destination, src_bytes, dst_bytes, consumed)
2875 struct coding_system *coding;
2876 unsigned char *source, *destination;
2877 int src_bytes, dst_bytes;
2878 int *consumed;
2879 {
2880 int produced;
2881
2882 if (src_bytes <= 0)
2883 {
2884 *consumed = 0;
2885 return 0;
2886 }
2887
2888 if (coding->type == coding_type_undecided)
2889 detect_coding (coding, source, src_bytes);
2890
2891 if (coding->eol_type == CODING_EOL_UNDECIDED)
2892 detect_eol (coding, source, src_bytes);
2893
2894 coding->carryover_size = 0;
2895 switch (coding->type)
2896 {
2897 case coding_type_no_conversion:
2898 label_no_conversion:
2899 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2900 bcopy (source, destination, produced);
2901 *consumed = produced;
2902 break;
2903
2904 case coding_type_emacs_mule:
2905 case coding_type_undecided:
2906 case coding_type_raw_text:
2907 if (coding->eol_type == CODING_EOL_LF
2908 || coding->eol_type == CODING_EOL_UNDECIDED)
2909 goto label_no_conversion;
2910 produced = decode_eol (coding, source, destination,
2911 src_bytes, dst_bytes, consumed);
2912 break;
2913
2914 case coding_type_sjis:
2915 produced = decode_coding_sjis_big5 (coding, source, destination,
2916 src_bytes, dst_bytes, consumed,
2917 1);
2918 break;
2919
2920 case coding_type_iso2022:
2921 produced = decode_coding_iso2022 (coding, source, destination,
2922 src_bytes, dst_bytes, consumed);
2923 break;
2924
2925 case coding_type_big5:
2926 produced = decode_coding_sjis_big5 (coding, source, destination,
2927 src_bytes, dst_bytes, consumed,
2928 0);
2929 break;
2930
2931 case coding_type_ccl:
2932 produced = ccl_driver (&coding->spec.ccl.decoder, source, destination,
2933 src_bytes, dst_bytes, consumed);
2934 break;
2935 }
2936
2937 return produced;
2938 }
2939
2940 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions". */
2941
2942 int
2943 encode_coding (coding, source, destination, src_bytes, dst_bytes, consumed)
2944 struct coding_system *coding;
2945 unsigned char *source, *destination;
2946 int src_bytes, dst_bytes;
2947 int *consumed;
2948 {
2949 int produced;
2950
2951 switch (coding->type)
2952 {
2953 case coding_type_no_conversion:
2954 label_no_conversion:
2955 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2956 if (produced > 0)
2957 {
2958 bcopy (source, destination, produced);
2959 if (coding->selective)
2960 {
2961 unsigned char *p = destination, *pend = destination + produced;
2962 while (p < pend)
2963 if (*p++ == '\015') p[-1] = '\n';
2964 }
2965 }
2966 *consumed = produced;
2967 break;
2968
2969 case coding_type_emacs_mule:
2970 case coding_type_undecided:
2971 case coding_type_raw_text:
2972 if (coding->eol_type == CODING_EOL_LF
2973 || coding->eol_type == CODING_EOL_UNDECIDED)
2974 goto label_no_conversion;
2975 produced = encode_eol (coding, source, destination,
2976 src_bytes, dst_bytes, consumed);
2977 break;
2978
2979 case coding_type_sjis:
2980 produced = encode_coding_sjis_big5 (coding, source, destination,
2981 src_bytes, dst_bytes, consumed,
2982 1);
2983 break;
2984
2985 case coding_type_iso2022:
2986 produced = encode_coding_iso2022 (coding, source, destination,
2987 src_bytes, dst_bytes, consumed);
2988 break;
2989
2990 case coding_type_big5:
2991 produced = encode_coding_sjis_big5 (coding, source, destination,
2992 src_bytes, dst_bytes, consumed,
2993 0);
2994 break;
2995
2996 case coding_type_ccl:
2997 produced = ccl_driver (&coding->spec.ccl.encoder, source, destination,
2998 src_bytes, dst_bytes, consumed);
2999 break;
3000 }
3001
3002 return produced;
3003 }
3004
3005 #define CONVERSION_BUFFER_EXTRA_ROOM 256
3006
3007 /* Return maximum size (bytes) of a buffer enough for decoding
3008 SRC_BYTES of text encoded in CODING. */
3009
3010 int
3011 decoding_buffer_size (coding, src_bytes)
3012 struct coding_system *coding;
3013 int src_bytes;
3014 {
3015 int magnification;
3016
3017 if (coding->type == coding_type_iso2022)
3018 magnification = 3;
3019 else if (coding->type == coding_type_ccl)
3020 magnification = coding->spec.ccl.decoder.buf_magnification;
3021 else
3022 magnification = 2;
3023
3024 return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
3025 }
3026
3027 /* Return maximum size (bytes) of a buffer enough for encoding
3028 SRC_BYTES of text to CODING. */
3029
3030 int
3031 encoding_buffer_size (coding, src_bytes)
3032 struct coding_system *coding;
3033 int src_bytes;
3034 {
3035 int magnification;
3036
3037 if (coding->type == coding_type_ccl)
3038 magnification = coding->spec.ccl.encoder.buf_magnification;
3039 else
3040 magnification = 3;
3041
3042 return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
3043 }
3044
3045 #ifndef MINIMUM_CONVERSION_BUFFER_SIZE
3046 #define MINIMUM_CONVERSION_BUFFER_SIZE 1024
3047 #endif
3048
3049 char *conversion_buffer;
3050 int conversion_buffer_size;
3051
3052 /* Return a pointer to a SIZE bytes of buffer to be used for encoding
3053 or decoding. Sufficient memory is allocated automatically. If we
3054 run out of memory, return NULL. */
3055
3056 char *
3057 get_conversion_buffer (size)
3058 int size;
3059 {
3060 if (size > conversion_buffer_size)
3061 {
3062 char *buf;
3063 int real_size = conversion_buffer_size * 2;
3064
3065 while (real_size < size) real_size *= 2;
3066 buf = (char *) xmalloc (real_size);
3067 xfree (conversion_buffer);
3068 conversion_buffer = buf;
3069 conversion_buffer_size = real_size;
3070 }
3071 return conversion_buffer;
3072 }
3073
3074 \f
3075 #ifdef emacs
3076 /*** 7. Emacs Lisp library functions ***/
3077
3078 DEFUN ("coding-system-spec", Fcoding_system_spec, Scoding_system_spec,
3079 1, 1, 0,
3080 "Return coding-spec of CODING-SYSTEM.\n\
3081 If CODING-SYSTEM is not a valid coding-system, return nil.")
3082 (obj)
3083 Lisp_Object obj;
3084 {
3085 while (SYMBOLP (obj) && !NILP (obj))
3086 obj = Fget (obj, Qcoding_system);
3087 return ((NILP (obj) || !VECTORP (obj) || XVECTOR (obj)->size != 5)
3088 ? Qnil : obj);
3089 }
3090
3091 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
3092 "Return t if OBJECT is nil or a coding-system.\n\
3093 See document of make-coding-system for coding-system object.")
3094 (obj)
3095 Lisp_Object obj;
3096 {
3097 return ((NILP (obj) || !NILP (Fcoding_system_spec (obj))) ? Qt : Qnil);
3098 }
3099
3100 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
3101 Sread_non_nil_coding_system, 1, 1, 0,
3102 "Read a coding system from the minibuffer, prompting with string PROMPT.")
3103 (prompt)
3104 Lisp_Object prompt;
3105 {
3106 Lisp_Object val;
3107 do
3108 {
3109 val = Fcompleting_read (prompt, Vobarray, Qcoding_system_spec,
3110 Qt, Qnil, Qnil, Qnil, Qnil);
3111 }
3112 while (XSTRING (val)->size == 0);
3113 return (Fintern (val, Qnil));
3114 }
3115
3116 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
3117 "Read a coding system from the minibuffer, prompting with string PROMPT.\n\
3118 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.")
3119 (prompt, default_coding_system)
3120 Lisp_Object prompt, default_coding_system;
3121 {
3122 Lisp_Object val;
3123 if (SYMBOLP (default_coding_system))
3124 XSETSTRING (default_coding_system, XSYMBOL (default_coding_system)->name);
3125 val = Fcompleting_read (prompt, Vobarray, Qcoding_system_p,
3126 Qt, Qnil, Qcoding_system_history,
3127 default_coding_system, Qnil);
3128 return (XSTRING (val)->size == 0 ? Qnil : Fintern (val, Qnil));
3129 }
3130
3131 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
3132 1, 1, 0,
3133 "Check validity of CODING-SYSTEM.\n\
3134 If valid, return CODING-SYSTEM, else `coding-system-error' is signaled.\n\
3135 CODING-SYSTEM is valid if it is a symbol and has \"coding-system\" property.\n\
3136 The value of property should be a vector of length 5.")
3137 (coding_system)
3138 Lisp_Object coding_system;
3139 {
3140 CHECK_SYMBOL (coding_system, 0);
3141 if (!NILP (Fcoding_system_p (coding_system)))
3142 return coding_system;
3143 while (1)
3144 Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
3145 }
3146
3147 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
3148 2, 2, 0,
3149 "Detect coding system of the text in the region between START and END.\n\
3150 Return a list of possible coding systems ordered by priority.\n\
3151 If only ASCII characters are found, it returns `undecided'\n\
3152 or its subsidiary coding system according to a detected end-of-line format.")
3153 (b, e)
3154 Lisp_Object b, e;
3155 {
3156 int coding_mask, eol_type;
3157 Lisp_Object val;
3158 int beg, end;
3159
3160 validate_region (&b, &e);
3161 beg = XINT (b), end = XINT (e);
3162 if (beg < GPT && end >= GPT) move_gap (end);
3163
3164 coding_mask = detect_coding_mask (POS_ADDR (beg), end - beg);
3165 eol_type = detect_eol_type (POS_ADDR (beg), end - beg);
3166
3167 if (coding_mask == CODING_CATEGORY_MASK_ANY)
3168 {
3169 val = Qundecided;
3170 if (eol_type != CODING_EOL_UNDECIDED
3171 && eol_type != CODING_EOL_INCONSISTENT)
3172 {
3173 Lisp_Object val2;
3174 val2 = Fget (Qundecided, Qeol_type);
3175 if (VECTORP (val2))
3176 val = XVECTOR (val2)->contents[eol_type];
3177 }
3178 }
3179 else
3180 {
3181 Lisp_Object val2;
3182
3183 /* At first, gather possible coding-systems in VAL in a reverse
3184 order. */
3185 val = Qnil;
3186 for (val2 = Vcoding_category_list;
3187 !NILP (val2);
3188 val2 = XCONS (val2)->cdr)
3189 {
3190 int idx
3191 = XFASTINT (Fget (XCONS (val2)->car, Qcoding_category_index));
3192 if (coding_mask & (1 << idx))
3193 {
3194 #if 0
3195 /* This code is suppressed until we find a better way to
3196 distinguish raw text file and binary file. */
3197
3198 if (idx == CODING_CATEGORY_IDX_RAW_TEXT
3199 && eol_type == CODING_EOL_INCONSISTENT)
3200 val = Fcons (Qno_conversion, val);
3201 else
3202 #endif /* 0 */
3203 val = Fcons (Fsymbol_value (XCONS (val2)->car), val);
3204 }
3205 }
3206
3207 /* Then, change the order of the list, while getting subsidiary
3208 coding-systems. */
3209 val2 = val;
3210 val = Qnil;
3211 if (eol_type == CODING_EOL_INCONSISTENT)
3212 eol_type == CODING_EOL_UNDECIDED;
3213 for (; !NILP (val2); val2 = XCONS (val2)->cdr)
3214 {
3215 if (eol_type == CODING_EOL_UNDECIDED)
3216 val = Fcons (XCONS (val2)->car, val);
3217 else
3218 {
3219 Lisp_Object val3;
3220 val3 = Fget (XCONS (val2)->car, Qeol_type);
3221 if (VECTORP (val3))
3222 val = Fcons (XVECTOR (val3)->contents[eol_type], val);
3223 else
3224 val = Fcons (XCONS (val2)->car, val);
3225 }
3226 }
3227 }
3228
3229 return val;
3230 }
3231
3232 /* Scan text in the region between *BEGP and *ENDP, skip characters
3233 which we never have to encode to (iff ENCODEP is 1) or decode from
3234 coding system CODING at the head and tail, then set BEGP and ENDP
3235 to the addresses of start and end of the text we actually convert. */
3236
3237 void
3238 shrink_conversion_area (begp, endp, coding, encodep)
3239 unsigned char **begp, **endp;
3240 struct coding_system *coding;
3241 int encodep;
3242 {
3243 register unsigned char *beg_addr = *begp, *end_addr = *endp;
3244
3245 if (coding->eol_type != CODING_EOL_LF
3246 && coding->eol_type != CODING_EOL_UNDECIDED)
3247 /* Since we anyway have to convert end-of-line format, it is not
3248 worth skipping at most 100 bytes or so. */
3249 return;
3250
3251 if (encodep) /* for encoding */
3252 {
3253 switch (coding->type)
3254 {
3255 case coding_type_no_conversion:
3256 case coding_type_emacs_mule:
3257 case coding_type_undecided:
3258 case coding_type_raw_text:
3259 /* We need no conversion. */
3260 *begp = *endp;
3261 return;
3262 case coding_type_ccl:
3263 /* We can't skip any data. */
3264 return;
3265 case coding_type_iso2022:
3266 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL)
3267 {
3268 unsigned char *bol = beg_addr;
3269 while (beg_addr < end_addr && *beg_addr < 0x80)
3270 {
3271 beg_addr++;
3272 if (*(beg_addr - 1) == '\n')
3273 bol = beg_addr;
3274 }
3275 beg_addr = bol;
3276 goto label_skip_tail;
3277 }
3278 /* fall down ... */
3279 default:
3280 /* We can skip all ASCII characters at the head and tail. */
3281 while (beg_addr < end_addr && *beg_addr < 0x80) beg_addr++;
3282 label_skip_tail:
3283 while (beg_addr < end_addr && *(end_addr - 1) < 0x80) end_addr--;
3284 break;
3285 }
3286 }
3287 else /* for decoding */
3288 {
3289 switch (coding->type)
3290 {
3291 case coding_type_no_conversion:
3292 /* We need no conversion. */
3293 *begp = *endp;
3294 return;
3295 case coding_type_emacs_mule:
3296 case coding_type_raw_text:
3297 if (coding->eol_type == CODING_EOL_LF)
3298 {
3299 /* We need no conversion. */
3300 *begp = *endp;
3301 return;
3302 }
3303 /* We can skip all but carriage-return. */
3304 while (beg_addr < end_addr && *beg_addr != '\r') beg_addr++;
3305 while (beg_addr < end_addr && *(end_addr - 1) != '\r') end_addr--;
3306 break;
3307 case coding_type_sjis:
3308 case coding_type_big5:
3309 /* We can skip all ASCII characters at the head. */
3310 while (beg_addr < end_addr && *beg_addr < 0x80) beg_addr++;
3311 /* We can skip all ASCII characters at the tail except for
3312 the second byte of SJIS or BIG5 code. */
3313 while (beg_addr < end_addr && *(end_addr - 1) < 0x80) end_addr--;
3314 if (end_addr != *endp)
3315 end_addr++;
3316 break;
3317 case coding_type_ccl:
3318 /* We can't skip any data. */
3319 return;
3320 default: /* i.e. case coding_type_iso2022: */
3321 {
3322 unsigned char c;
3323
3324 /* We can skip all ASCII characters except for a few
3325 control codes at the head. */
3326 while (beg_addr < end_addr && (c = *beg_addr) < 0x80
3327 && c != ISO_CODE_CR && c != ISO_CODE_SO
3328 && c != ISO_CODE_SI && c != ISO_CODE_ESC)
3329 beg_addr++;
3330 }
3331 break;
3332 }
3333 }
3334 *begp = beg_addr;
3335 *endp = end_addr;
3336 return;
3337 }
3338
3339 /* Encode to (iff ENCODEP is 1) or decode form coding system CODING a
3340 text between B and E. B and E are buffer position. */
3341
3342 Lisp_Object
3343 code_convert_region (b, e, coding, encodep)
3344 Lisp_Object b, e;
3345 struct coding_system *coding;
3346 int encodep;
3347 {
3348 int beg, end, len, consumed, produced;
3349 char *buf;
3350 unsigned char *begp, *endp;
3351 int pos = PT;
3352
3353 validate_region (&b, &e);
3354 beg = XINT (b), end = XINT (e);
3355 if (beg < GPT && end >= GPT)
3356 move_gap (end);
3357
3358 if (encodep && !NILP (coding->pre_write_conversion))
3359 {
3360 /* We must call a pre-conversion function which may put a new
3361 text to be converted in a new buffer. */
3362 struct buffer *old = current_buffer, *new;
3363
3364 TEMP_SET_PT (beg);
3365 call2 (coding->pre_write_conversion, b, e);
3366 if (old != current_buffer)
3367 {
3368 /* Replace the original text by the text just generated. */
3369 len = ZV - BEGV;
3370 new = current_buffer;
3371 set_buffer_internal (old);
3372 del_range (beg, end);
3373 insert_from_buffer (new, 1, len, 0);
3374 end = beg + len;
3375 }
3376 }
3377
3378 /* We may be able to shrink the conversion region. */
3379 begp = POS_ADDR (beg); endp = begp + (end - beg);
3380 shrink_conversion_area (&begp, &endp, coding, encodep);
3381
3382 if (begp == endp)
3383 /* We need no conversion. */
3384 len = end - beg;
3385 else
3386 {
3387 beg += begp - POS_ADDR (beg);
3388 end = beg + (endp - begp);
3389
3390 if (encodep)
3391 len = encoding_buffer_size (coding, end - beg);
3392 else
3393 len = decoding_buffer_size (coding, end - beg);
3394 buf = get_conversion_buffer (len);
3395
3396 coding->last_block = 1;
3397 produced = (encodep
3398 ? encode_coding (coding, POS_ADDR (beg), buf, end - beg, len,
3399 &consumed)
3400 : decode_coding (coding, POS_ADDR (beg), buf, end - beg, len,
3401 &consumed));
3402
3403 len = produced + (beg - XINT (b)) + (XINT (e) - end);
3404
3405 TEMP_SET_PT (beg);
3406 insert (buf, produced);
3407 del_range (PT, PT + end - beg);
3408 if (pos >= end)
3409 pos = PT + (pos - end);
3410 else if (pos > beg)
3411 pos = beg;
3412 TEMP_SET_PT (pos);
3413 }
3414
3415 if (!encodep && !NILP (coding->post_read_conversion))
3416 {
3417 /* We must call a post-conversion function which may alter
3418 the text just converted. */
3419 Lisp_Object insval;
3420
3421 beg = XINT (b);
3422 TEMP_SET_PT (beg);
3423 insval = call1 (coding->post_read_conversion, make_number (len));
3424 CHECK_NUMBER (insval, 0);
3425 len = XINT (insval);
3426 }
3427
3428 return make_number (len);
3429 }
3430
3431 Lisp_Object
3432 code_convert_string (str, coding, encodep, nocopy)
3433 Lisp_Object str, nocopy;
3434 struct coding_system *coding;
3435 int encodep;
3436 {
3437 int len, consumed, produced;
3438 char *buf;
3439 unsigned char *begp, *endp;
3440 int head_skip, tail_skip;
3441 struct gcpro gcpro1;
3442
3443 if (encodep && !NILP (coding->pre_write_conversion)
3444 || !encodep && !NILP (coding->post_read_conversion))
3445 {
3446 /* Since we have to call Lisp functions which assume target text
3447 is in a buffer, after setting a temporary buffer, call
3448 code_convert_region. */
3449 int count = specpdl_ptr - specpdl;
3450 int len = XSTRING (str)->size;
3451 Lisp_Object result;
3452 struct buffer *old = current_buffer;
3453
3454 record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
3455 temp_output_buffer_setup (" *code-converting-work*");
3456 set_buffer_internal (XBUFFER (Vstandard_output));
3457 insert_from_string (str, 0, len, 0);
3458 code_convert_region (make_number (BEGV), make_number (ZV),
3459 coding, encodep);
3460 result = make_buffer_string (BEGV, ZV, 0);
3461 set_buffer_internal (old);
3462 return unbind_to (count, result);
3463 }
3464
3465 /* We may be able to shrink the conversion region. */
3466 begp = XSTRING (str)->data;
3467 endp = begp + XSTRING (str)->size;
3468 shrink_conversion_area (&begp, &endp, coding, encodep);
3469
3470 if (begp == endp)
3471 /* We need no conversion. */
3472 return (NILP (nocopy) ? Fcopy_sequence (str) : str);
3473
3474 head_skip = begp - XSTRING (str)->data;
3475 tail_skip = XSTRING (str)->size - head_skip - (endp - begp);
3476
3477 GCPRO1 (str);
3478
3479 if (encodep)
3480 len = encoding_buffer_size (coding, endp - begp);
3481 else
3482 len = decoding_buffer_size (coding, endp - begp);
3483 buf = get_conversion_buffer (len + head_skip + tail_skip);
3484
3485 bcopy (XSTRING (str)->data, buf, head_skip);
3486 coding->last_block = 1;
3487 produced = (encodep
3488 ? encode_coding (coding, XSTRING (str)->data + head_skip,
3489 buf + head_skip, endp - begp, len, &consumed)
3490 : decode_coding (coding, XSTRING (str)->data + head_skip,
3491 buf + head_skip, endp - begp, len, &consumed));
3492 bcopy (XSTRING (str)->data + head_skip + (endp - begp),
3493 buf + head_skip + produced,
3494 tail_skip);
3495
3496 UNGCPRO;
3497
3498 return make_string (buf, head_skip + produced + tail_skip);
3499 }
3500
3501 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
3502 3, 3, "r\nzCoding system: ",
3503 "Decode current region by specified coding system.\n\
3504 When called from a program, takes three arguments:\n\
3505 START, END, and CODING-SYSTEM. START END are buffer positions.\n\
3506 Return length of decoded text.")
3507 (b, e, coding_system)
3508 Lisp_Object b, e, coding_system;
3509 {
3510 struct coding_system coding;
3511
3512 CHECK_NUMBER_COERCE_MARKER (b, 0);
3513 CHECK_NUMBER_COERCE_MARKER (e, 1);
3514 CHECK_SYMBOL (coding_system, 2);
3515
3516 if (NILP (coding_system))
3517 return make_number (XFASTINT (e) - XFASTINT (b));
3518 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3519 error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3520
3521 return code_convert_region (b, e, &coding, 0);
3522 }
3523
3524 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
3525 3, 3, "r\nzCoding system: ",
3526 "Encode current region by specified coding system.\n\
3527 When called from a program, takes three arguments:\n\
3528 START, END, and CODING-SYSTEM. START END are buffer positions.\n\
3529 Return length of encoded text.")
3530 (b, e, coding_system)
3531 Lisp_Object b, e, coding_system;
3532 {
3533 struct coding_system coding;
3534
3535 CHECK_NUMBER_COERCE_MARKER (b, 0);
3536 CHECK_NUMBER_COERCE_MARKER (e, 1);
3537 CHECK_SYMBOL (coding_system, 2);
3538
3539 if (NILP (coding_system))
3540 return make_number (XFASTINT (e) - XFASTINT (b));
3541 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3542 error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3543
3544 return code_convert_region (b, e, &coding, 1);
3545 }
3546
3547 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
3548 2, 3, 0,
3549 "Decode STRING which is encoded in CODING-SYSTEM, and return the result.\n\
3550 Optional arg NOCOPY non-nil means it is ok to return STRING itself\n\
3551 if the decoding operation is trivial.")
3552 (string, coding_system, nocopy)
3553 Lisp_Object string, coding_system, nocopy;
3554 {
3555 struct coding_system coding;
3556
3557 CHECK_STRING (string, 0);
3558 CHECK_SYMBOL (coding_system, 1);
3559
3560 if (NILP (coding_system))
3561 return (NILP (nocopy) ? Fcopy_sequence (string) : string);
3562 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3563 error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3564
3565 return code_convert_string (string, &coding, 0, nocopy);
3566 }
3567
3568 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
3569 2, 3, 0,
3570 "Encode STRING to CODING-SYSTEM, and return the result.\n\
3571 Optional arg NOCOPY non-nil means it is ok to return STRING itself\n\
3572 if the encoding operation is trivial.")
3573 (string, coding_system, nocopy)
3574 Lisp_Object string, coding_system, nocopy;
3575 {
3576 struct coding_system coding;
3577
3578 CHECK_STRING (string, 0);
3579 CHECK_SYMBOL (coding_system, 1);
3580
3581 if (NILP (coding_system))
3582 return (NILP (nocopy) ? Fcopy_sequence (string) : string);
3583 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3584 error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3585
3586 return code_convert_string (string, &coding, 1, nocopy);
3587 }
3588
3589 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
3590 "Decode a JISX0208 character of shift-jis encoding.\n\
3591 CODE is the character code in SJIS.\n\
3592 Return the corresponding character.")
3593 (code)
3594 Lisp_Object code;
3595 {
3596 unsigned char c1, c2, s1, s2;
3597 Lisp_Object val;
3598
3599 CHECK_NUMBER (code, 0);
3600 s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
3601 DECODE_SJIS (s1, s2, c1, c2);
3602 XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset_jisx0208, c1, c2));
3603 return val;
3604 }
3605
3606 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
3607 "Encode a JISX0208 character CHAR to SJIS coding-system.\n\
3608 Return the corresponding character code in SJIS.")
3609 (ch)
3610 Lisp_Object ch;
3611 {
3612 int charset, c1, c2, s1, s2;
3613 Lisp_Object val;
3614
3615 CHECK_NUMBER (ch, 0);
3616 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
3617 if (charset == charset_jisx0208)
3618 {
3619 ENCODE_SJIS (c1, c2, s1, s2);
3620 XSETFASTINT (val, (s1 << 8) | s2);
3621 }
3622 else
3623 XSETFASTINT (val, 0);
3624 return val;
3625 }
3626
3627 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
3628 "Decode a Big5 character CODE of BIG5 coding-system.\n\
3629 CODE is the character code in BIG5.\n\
3630 Return the corresponding character.")
3631 (code)
3632 Lisp_Object code;
3633 {
3634 int charset;
3635 unsigned char b1, b2, c1, c2;
3636 Lisp_Object val;
3637
3638 CHECK_NUMBER (code, 0);
3639 b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF;
3640 DECODE_BIG5 (b1, b2, charset, c1, c2);
3641 XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset, c1, c2));
3642 return val;
3643 }
3644
3645 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
3646 "Encode the Big5 character CHAR to BIG5 coding-system.\n\
3647 Return the corresponding character code in Big5.")
3648 (ch)
3649 Lisp_Object ch;
3650 {
3651 int charset, c1, c2, b1, b2;
3652 Lisp_Object val;
3653
3654 CHECK_NUMBER (ch, 0);
3655 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
3656 if (charset == charset_big5_1 || charset == charset_big5_2)
3657 {
3658 ENCODE_BIG5 (charset, c1, c2, b1, b2);
3659 XSETFASTINT (val, (b1 << 8) | b2);
3660 }
3661 else
3662 XSETFASTINT (val, 0);
3663 return val;
3664 }
3665
3666 DEFUN ("set-terminal-coding-system-internal",
3667 Fset_terminal_coding_system_internal,
3668 Sset_terminal_coding_system_internal, 1, 1, 0, "")
3669 (coding_system)
3670 Lisp_Object coding_system;
3671 {
3672 CHECK_SYMBOL (coding_system, 0);
3673 setup_coding_system (Fcheck_coding_system (coding_system), &terminal_coding);
3674 /* We had better not send unexpected characters to terminal. */
3675 terminal_coding.flags |= CODING_FLAG_ISO_SAFE;
3676
3677 return Qnil;
3678 }
3679
3680 DEFUN ("set-safe-terminal-coding-system-internal",
3681 Fset_safe_terminal_coding_system_internal,
3682 Sset_safe_terminal_coding_system_internal, 1, 1, 0, "")
3683 (coding_system)
3684 Lisp_Object coding_system;
3685 {
3686 CHECK_SYMBOL (coding_system, 0);
3687 setup_coding_system (Fcheck_coding_system (coding_system),
3688 &safe_terminal_coding);
3689 return Qnil;
3690 }
3691
3692 DEFUN ("terminal-coding-system",
3693 Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
3694 "Return coding-system of your terminal.")
3695 ()
3696 {
3697 return terminal_coding.symbol;
3698 }
3699
3700 DEFUN ("set-keyboard-coding-system-internal",
3701 Fset_keyboard_coding_system_internal,
3702 Sset_keyboard_coding_system_internal, 1, 1, 0, "")
3703 (coding_system)
3704 Lisp_Object coding_system;
3705 {
3706 CHECK_SYMBOL (coding_system, 0);
3707 setup_coding_system (Fcheck_coding_system (coding_system), &keyboard_coding);
3708 return Qnil;
3709 }
3710
3711 DEFUN ("keyboard-coding-system",
3712 Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
3713 "Return coding-system of what is sent from terminal keyboard.")
3714 ()
3715 {
3716 return keyboard_coding.symbol;
3717 }
3718
3719 \f
3720 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
3721 Sfind_operation_coding_system, 1, MANY, 0,
3722 "Choose a coding system for an operation based on the target name.\n\
3723 The value names a pair of coding systems: (DECODING-SYSTEM ENCODING-SYSTEM).\n\
3724 DECODING-SYSTEM is the coding system to use for decoding\n\
3725 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system\n\
3726 for encoding (in case OPERATION does encoding).\n\
3727 \n\
3728 The first argument OPERATION specifies an I/O primitive:\n\
3729 For file I/O, `insert-file-contents' or `write-region'.\n\
3730 For process I/O, `call-process', `call-process-region', or `start-process'.\n\
3731 For network I/O, `open-network-stream'.\n\
3732 \n\
3733 The remaining arguments should be the same arguments that were passed\n\
3734 to the primitive. Depending on which primitive, one of those arguments\n\
3735 is selected as the TARGET. For example, if OPERATION does file I/O,\n\
3736 whichever argument specifies the file name is TARGET.\n\
3737 \n\
3738 TARGET has a meaning which depends on OPERATION:\n\
3739 For file I/O, TARGET is a file name.\n\
3740 For process I/O, TARGET is a process name.\n\
3741 For network I/O, TARGET is a service name or a port number\n\
3742 \n\
3743 This function looks up what specified for TARGET in,\n\
3744 `file-coding-system-alist', `process-coding-system-alist',\n\
3745 or `network-coding-system-alist' depending on OPERATION.\n\
3746 They may specify a coding system, a cons of coding systems,\n\
3747 or a function symbol to call.\n\
3748 In the last case, we call the function with one argument,\n\
3749 which is a list of all the arguments given to this function.")
3750 (nargs, args)
3751 int nargs;
3752 Lisp_Object *args;
3753 {
3754 Lisp_Object operation, target_idx, target, val;
3755 register Lisp_Object chain;
3756
3757 if (nargs < 2)
3758 error ("Too few arguments");
3759 operation = args[0];
3760 if (!SYMBOLP (operation)
3761 || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
3762 error ("Invalid first arguement");
3763 if (nargs < 1 + XINT (target_idx))
3764 error ("Too few arguments for operation: %s",
3765 XSYMBOL (operation)->name->data);
3766 target = args[XINT (target_idx) + 1];
3767 if (!(STRINGP (target)
3768 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
3769 error ("Invalid %dth argument", XINT (target_idx) + 1);
3770
3771 chain = ((EQ (operation, Qinsert_file_contents)
3772 || EQ (operation, Qwrite_region))
3773 ? Vfile_coding_system_alist
3774 : (EQ (operation, Qopen_network_stream)
3775 ? Vnetwork_coding_system_alist
3776 : Vprocess_coding_system_alist));
3777 if (NILP (chain))
3778 return Qnil;
3779
3780 for (; CONSP (chain); chain = XCONS (chain)->cdr)
3781 {
3782 Lisp_Object elt;
3783 elt = XCONS (chain)->car;
3784
3785 if (CONSP (elt)
3786 && ((STRINGP (target)
3787 && STRINGP (XCONS (elt)->car)
3788 && fast_string_match (XCONS (elt)->car, target) >= 0)
3789 || (INTEGERP (target) && EQ (target, XCONS (elt)->car))))
3790 {
3791 val = XCONS (elt)->cdr;
3792 /* Here, if VAL is both a valid coding system and a valid
3793 function symbol, we return VAL as a coding system. */
3794 if (CONSP (val))
3795 return val;
3796 if (! SYMBOLP (val))
3797 return Qnil;
3798 if (! NILP (Fcoding_system_p (val)))
3799 return Fcons (val, val);
3800 if (! NILP (Ffboundp (val)))
3801 {
3802 val = call1 (val, Flist (nargs, args));
3803 if (CONSP (val))
3804 return val;
3805 if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
3806 return Fcons (val, val);
3807 }
3808 return Qnil;
3809 }
3810 }
3811 return Qnil;
3812 }
3813
3814 #endif /* emacs */
3815
3816 \f
3817 /*** 8. Post-amble ***/
3818
3819 init_coding_once ()
3820 {
3821 int i;
3822
3823 /* Emacs' internal format specific initialize routine. */
3824 for (i = 0; i <= 0x20; i++)
3825 emacs_code_class[i] = EMACS_control_code;
3826 emacs_code_class[0x0A] = EMACS_linefeed_code;
3827 emacs_code_class[0x0D] = EMACS_carriage_return_code;
3828 for (i = 0x21 ; i < 0x7F; i++)
3829 emacs_code_class[i] = EMACS_ascii_code;
3830 emacs_code_class[0x7F] = EMACS_control_code;
3831 emacs_code_class[0x80] = EMACS_leading_code_composition;
3832 for (i = 0x81; i < 0xFF; i++)
3833 emacs_code_class[i] = EMACS_invalid_code;
3834 emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
3835 emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
3836 emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
3837 emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;
3838
3839 /* ISO2022 specific initialize routine. */
3840 for (i = 0; i < 0x20; i++)
3841 iso_code_class[i] = ISO_control_code;
3842 for (i = 0x21; i < 0x7F; i++)
3843 iso_code_class[i] = ISO_graphic_plane_0;
3844 for (i = 0x80; i < 0xA0; i++)
3845 iso_code_class[i] = ISO_control_code;
3846 for (i = 0xA1; i < 0xFF; i++)
3847 iso_code_class[i] = ISO_graphic_plane_1;
3848 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
3849 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
3850 iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
3851 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
3852 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
3853 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
3854 iso_code_class[ISO_CODE_ESC] = ISO_escape;
3855 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
3856 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
3857 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
3858
3859 conversion_buffer_size = MINIMUM_CONVERSION_BUFFER_SIZE;
3860 conversion_buffer = (char *) xmalloc (MINIMUM_CONVERSION_BUFFER_SIZE);
3861
3862 setup_coding_system (Qnil, &keyboard_coding);
3863 setup_coding_system (Qnil, &terminal_coding);
3864 setup_coding_system (Qnil, &safe_terminal_coding);
3865
3866 #if defined (MSDOS) || defined (WINDOWSNT)
3867 system_eol_type = CODING_EOL_CRLF;
3868 #else
3869 system_eol_type = CODING_EOL_LF;
3870 #endif
3871 }
3872
3873 #ifdef emacs
3874
3875 syms_of_coding ()
3876 {
3877 Qtarget_idx = intern ("target-idx");
3878 staticpro (&Qtarget_idx);
3879
3880 Qcoding_system_history = intern ("coding-system-history");
3881 staticpro (&Qcoding_system_history);
3882 Fset (Qcoding_system_history, Qnil);
3883
3884 /* Target FILENAME is the first argument. */
3885 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
3886 /* Target FILENAME is the third argument. */
3887 Fput (Qwrite_region, Qtarget_idx, make_number (2));
3888
3889 Qcall_process = intern ("call-process");
3890 staticpro (&Qcall_process);
3891 /* Target PROGRAM is the first argument. */
3892 Fput (Qcall_process, Qtarget_idx, make_number (0));
3893
3894 Qcall_process_region = intern ("call-process-region");
3895 staticpro (&Qcall_process_region);
3896 /* Target PROGRAM is the third argument. */
3897 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
3898
3899 Qstart_process = intern ("start-process");
3900 staticpro (&Qstart_process);
3901 /* Target PROGRAM is the third argument. */
3902 Fput (Qstart_process, Qtarget_idx, make_number (2));
3903
3904 Qopen_network_stream = intern ("open-network-stream");
3905 staticpro (&Qopen_network_stream);
3906 /* Target SERVICE is the fourth argument. */
3907 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
3908
3909 Qcoding_system = intern ("coding-system");
3910 staticpro (&Qcoding_system);
3911
3912 Qeol_type = intern ("eol-type");
3913 staticpro (&Qeol_type);
3914
3915 Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
3916 staticpro (&Qbuffer_file_coding_system);
3917
3918 Qpost_read_conversion = intern ("post-read-conversion");
3919 staticpro (&Qpost_read_conversion);
3920
3921 Qpre_write_conversion = intern ("pre-write-conversion");
3922 staticpro (&Qpre_write_conversion);
3923
3924 Qno_conversion = intern ("no-conversion");
3925 staticpro (&Qno_conversion);
3926
3927 Qundecided = intern ("undecided");
3928 staticpro (&Qundecided);
3929
3930 Qcoding_system_spec = intern ("coding-system-spec");
3931 staticpro (&Qcoding_system_spec);
3932
3933 Qcoding_system_p = intern ("coding-system-p");
3934 staticpro (&Qcoding_system_p);
3935
3936 Qcoding_system_error = intern ("coding-system-error");
3937 staticpro (&Qcoding_system_error);
3938
3939 Fput (Qcoding_system_error, Qerror_conditions,
3940 Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
3941 Fput (Qcoding_system_error, Qerror_message,
3942 build_string ("Invalid coding system"));
3943
3944 Qcoding_category_index = intern ("coding-category-index");
3945 staticpro (&Qcoding_category_index);
3946
3947 {
3948 int i;
3949 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3950 {
3951 coding_category_table[i] = intern (coding_category_name[i]);
3952 staticpro (&coding_category_table[i]);
3953 Fput (coding_category_table[i], Qcoding_category_index,
3954 make_number (i));
3955 }
3956 }
3957
3958 Qcharacter_unification_table = intern ("character-unification-table");
3959 staticpro (&Qcharacter_unification_table);
3960 Fput (Qcharacter_unification_table, Qchar_table_extra_slots,
3961 make_number (0));
3962
3963 Qcharacter_unification_table_for_decode
3964 = intern ("character-unification-table-for-decode");
3965 staticpro (&Qcharacter_unification_table_for_decode);
3966
3967 Qcharacter_unification_table_for_encode
3968 = intern ("character-unification-table-for-encode");
3969 staticpro (&Qcharacter_unification_table_for_encode);
3970
3971 Qemacs_mule = intern ("emacs-mule");
3972 staticpro (&Qemacs_mule);
3973
3974 defsubr (&Scoding_system_spec);
3975 defsubr (&Scoding_system_p);
3976 defsubr (&Sread_coding_system);
3977 defsubr (&Sread_non_nil_coding_system);
3978 defsubr (&Scheck_coding_system);
3979 defsubr (&Sdetect_coding_region);
3980 defsubr (&Sdecode_coding_region);
3981 defsubr (&Sencode_coding_region);
3982 defsubr (&Sdecode_coding_string);
3983 defsubr (&Sencode_coding_string);
3984 defsubr (&Sdecode_sjis_char);
3985 defsubr (&Sencode_sjis_char);
3986 defsubr (&Sdecode_big5_char);
3987 defsubr (&Sencode_big5_char);
3988 defsubr (&Sset_terminal_coding_system_internal);
3989 defsubr (&Sset_safe_terminal_coding_system_internal);
3990 defsubr (&Sterminal_coding_system);
3991 defsubr (&Sset_keyboard_coding_system_internal);
3992 defsubr (&Skeyboard_coding_system);
3993 defsubr (&Sfind_operation_coding_system);
3994
3995 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
3996 "List of coding-categories (symbols) ordered by priority.");
3997 {
3998 int i;
3999
4000 Vcoding_category_list = Qnil;
4001 for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
4002 Vcoding_category_list
4003 = Fcons (coding_category_table[i], Vcoding_category_list);
4004 }
4005
4006 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
4007 "Specify the coding system for read operations.\n\
4008 It is useful to bind this variable with `let', but do not set it globally.\n\
4009 If the value is a coding system, it is used for decoding on read operation.\n\
4010 If not, an appropriate element is used from one of the coding system alists:\n\
4011 There are three such tables, `file-coding-system-alist',\n\
4012 `process-coding-system-alist', and `network-coding-system-alist'.");
4013 Vcoding_system_for_read = Qnil;
4014
4015 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
4016 "Specify the coding system for write operations.\n\
4017 It is useful to bind this variable with `let', but do not set it globally.\n\
4018 If the value is a coding system, it is used for encoding on write operation.\n\
4019 If not, an appropriate element is used from one of the coding system alists:\n\
4020 There are three such tables, `file-coding-system-alist',\n\
4021 `process-coding-system-alist', and `network-coding-system-alist'.");
4022 Vcoding_system_for_write = Qnil;
4023
4024 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
4025 "Coding system used in the latest file or process I/O.");
4026 Vlast_coding_system_used = Qnil;
4027
4028 DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
4029 "*Non-nil inhibit code conversion of end-of-line format in any cases.");
4030 inhibit_eol_conversion = 0;
4031
4032 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
4033 "Alist to decide a coding system to use for a file I/O operation.\n\
4034 The format is ((PATTERN . VAL) ...),\n\
4035 where PATTERN is a regular expression matching a file name,\n\
4036 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
4037 If VAL is a coding system, it is used for both decoding and encoding\n\
4038 the file contents.\n\
4039 If VAL is a cons of coding systems, the car part is used for decoding,\n\
4040 and the cdr part is used for encoding.\n\
4041 If VAL is a function symbol, the function must return a coding system\n\
4042 or a cons of coding systems which are used as above.\n\
4043 \n\
4044 See also the function `find-operation-coding-system'.");
4045 Vfile_coding_system_alist = Qnil;
4046
4047 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
4048 "Alist to decide a coding system to use for a process I/O operation.\n\
4049 The format is ((PATTERN . VAL) ...),\n\
4050 where PATTERN is a regular expression matching a program name,\n\
4051 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
4052 If VAL is a coding system, it is used for both decoding what received\n\
4053 from the program and encoding what sent to the program.\n\
4054 If VAL is a cons of coding systems, the car part is used for decoding,\n\
4055 and the cdr part is used for encoding.\n\
4056 If VAL is a function symbol, the function must return a coding system\n\
4057 or a cons of coding systems which are used as above.\n\
4058 \n\
4059 See also the function `find-operation-coding-system'.");
4060 Vprocess_coding_system_alist = Qnil;
4061
4062 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
4063 "Alist to decide a coding system to use for a network I/O operation.\n\
4064 The format is ((PATTERN . VAL) ...),\n\
4065 where PATTERN is a regular expression matching a network service name\n\
4066 or is a port number to connect to,\n\
4067 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
4068 If VAL is a coding system, it is used for both decoding what received\n\
4069 from the network stream and encoding what sent to the network stream.\n\
4070 If VAL is a cons of coding systems, the car part is used for decoding,\n\
4071 and the cdr part is used for encoding.\n\
4072 If VAL is a function symbol, the function must return a coding system\n\
4073 or a cons of coding systems which are used as above.\n\
4074 \n\
4075 See also the function `find-operation-coding-system'.");
4076 Vnetwork_coding_system_alist = Qnil;
4077
4078 DEFVAR_INT ("eol-mnemonic-unix", &eol_mnemonic_unix,
4079 "Mnemonic character indicating UNIX-like end-of-line format (i.e. LF) .");
4080 eol_mnemonic_unix = ':';
4081
4082 DEFVAR_INT ("eol-mnemonic-dos", &eol_mnemonic_dos,
4083 "Mnemonic character indicating DOS-like end-of-line format (i.e. CRLF).");
4084 eol_mnemonic_dos = '\\';
4085
4086 DEFVAR_INT ("eol-mnemonic-mac", &eol_mnemonic_mac,
4087 "Mnemonic character indicating MAC-like end-of-line format (i.e. CR).");
4088 eol_mnemonic_mac = '/';
4089
4090 DEFVAR_INT ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
4091 "Mnemonic character indicating end-of-line format is not yet decided.");
4092 eol_mnemonic_undecided = ':';
4093
4094 DEFVAR_LISP ("enable-character-unification", &Venable_character_unification,
4095 "Non-nil means ISO 2022 encoder/decoder do character unification.");
4096 Venable_character_unification = Qt;
4097
4098 DEFVAR_LISP ("standard-character-unification-table-for-decode",
4099 &Vstandard_character_unification_table_for_decode,
4100 "Table for unifying characters when reading.");
4101 Vstandard_character_unification_table_for_decode = Qnil;
4102
4103 DEFVAR_LISP ("standard-character-unification-table-for-encode",
4104 &Vstandard_character_unification_table_for_encode,
4105 "Table for unifying characters when writing.");
4106 Vstandard_character_unification_table_for_encode = Qnil;
4107
4108 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
4109 "Alist of charsets vs revision numbers.\n\
4110 While encoding, if a charset (car part of an element) is found,\n\
4111 designate it with the escape sequence identifing revision (cdr part of the element).");
4112 Vcharset_revision_alist = Qnil;
4113
4114 DEFVAR_LISP ("default-process-coding-system",
4115 &Vdefault_process_coding_system,
4116 "Cons of coding systems used for process I/O by default.\n\
4117 The car part is used for decoding a process output,\n\
4118 the cdr part is used for encoding a text to be sent to a process.");
4119 Vdefault_process_coding_system = Qnil;
4120
4121 DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
4122 "Table of extra Latin codes in the range 128..159 (inclusive).\n\
4123 This is a vector of length 256.\n\
4124 If Nth element is non-nil, the existence of code N in a file\n\
4125 \(or output of subprocess) doesn't prevent it to be detected as\n\
4126 a coding system of ISO 2022 variant which has a flag\n\
4127 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file\n\
4128 or reading output of a subprocess.\n\
4129 Only 128th through 159th elements has a meaning.");
4130 Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
4131 }
4132
4133 #endif /* emacs */