Maintainer change. Doc fixes.
[bpt/emacs.git] / src / coding.c
1 /* Coding system handler (conversion, detection, and etc).
2 Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
3 Licensed to the Free Software Foundation.
4
5 This file is part of GNU Emacs.
6
7 GNU Emacs is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 2, or (at your option)
10 any later version.
11
12 GNU Emacs is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GNU Emacs; see the file COPYING. If not, write to
19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20 Boston, MA 02111-1307, USA. */
21
22 /*** TABLE OF CONTENTS ***
23
24 0. General comments
25 1. Preamble
26 2. Emacs' internal format (emacs-mule) handlers
27 3. ISO2022 handlers
28 4. Shift-JIS and BIG5 handlers
29 5. CCL handlers
30 6. End-of-line handlers
31 7. C library functions
32 8. Emacs Lisp library functions
33 9. Post-amble
34
35 */
36
37 /*** 0. General comments ***/
38
39
40 /*** GENERAL NOTE on CODING SYSTEM ***
41
42 Coding system is an encoding mechanism of one or more character
43 sets. Here's a list of coding systems which Emacs can handle. When
44 we say "decode", it means converting some other coding system to
45 Emacs' internal format (emacs-mule), and when we say "encode",
46 it means converting the coding system emacs-mule to some other
47 coding system.
48
49 0. Emacs' internal format (emacs-mule)
50
51 Emacs itself holds a multi-lingual character in a buffer and a string
52 in a special format. Details are described in section 2.
53
54 1. ISO2022
55
56 The most famous coding system for multiple character sets. X's
57 Compound Text, various EUCs (Extended Unix Code), and coding
58 systems used in Internet communication such as ISO-2022-JP are
59 all variants of ISO2022. Details are described in section 3.
60
61 2. SJIS (or Shift-JIS or MS-Kanji-Code)
62
63 A coding system to encode character sets: ASCII, JISX0201, and
64 JISX0208. Widely used for PC's in Japan. Details are described in
65 section 4.
66
67 3. BIG5
68
69 A coding system to encode character sets: ASCII and Big5. Widely
70 used by Chinese (mainly in Taiwan and Hong Kong). Details are
71 described in section 4. In this file, when we write "BIG5"
72 (all uppercase), we mean the coding system, and when we write
73 "Big5" (capitalized), we mean the character set.
74
75 4. Raw text
76
77 A coding system for a text containing random 8-bit code. Emacs does
78 no code conversion on such a text except for end-of-line format.
79
80 5. Other
81
82 If a user wants to read/write a text encoded in a coding system not
83 listed above, he can supply a decoder and an encoder for it in CCL
84 (Code Conversion Language) programs. Emacs executes the CCL program
85 while reading/writing.
86
87 Emacs represents a coding system by a Lisp symbol that has a property
88 `coding-system'. But, before actually using the coding system, the
89 information about it is set in a structure of type `struct
90 coding_system' for rapid processing. See section 6 for more details.
91
92 */
93
94 /*** GENERAL NOTES on END-OF-LINE FORMAT ***
95
96 How end-of-line of a text is encoded depends on a system. For
97 instance, Unix's format is just one byte of `line-feed' code,
98 whereas DOS's format is two-byte sequence of `carriage-return' and
99 `line-feed' codes. MacOS's format is usually one byte of
100 `carriage-return'.
101
102 Since text characters encoding and end-of-line encoding are
103 independent, any coding system described above can take
104 any format of end-of-line. So, Emacs has information of format of
105 end-of-line in each coding-system. See section 6 for more details.
106
107 */
108
109 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
110
111 These functions check if a text between SRC and SRC_END is encoded
112 in the coding system category XXX. Each returns an integer value in
113 which appropriate flag bits for the category XXX is set. The flag
114 bits are defined in macros CODING_CATEGORY_MASK_XXX. Below is the
115 template of these functions. */
116 #if 0
117 int
118 detect_coding_emacs_mule (src, src_end)
119 unsigned char *src, *src_end;
120 {
121 ...
122 }
123 #endif
124
125 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
126
127 These functions decode SRC_BYTES length of unibyte text at SOURCE
128 encoded in CODING to Emacs' internal format. The resulting
129 multibyte text goes to a place pointed to by DESTINATION, the length
130 of which should not exceed DST_BYTES.
131
132 These functions set the information of original and decoded texts in
133 the members produced, produced_char, consumed, and consumed_char of
134 the structure *CODING. They also set the member result to one of
135 CODING_FINISH_XXX indicating how the decoding finished.
136
137 DST_BYTES zero means that source area and destination area are
138 overlapped, which means that we can produce a decoded text until it
139 reaches at the head of not-yet-decoded source text.
140
141 Below is a template of these functions. */
142 #if 0
143 static void
144 decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
145 struct coding_system *coding;
146 unsigned char *source, *destination;
147 int src_bytes, dst_bytes;
148 {
149 ...
150 }
151 #endif
152
153 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
154
155 These functions encode SRC_BYTES length text at SOURCE of Emacs'
156 internal multibyte format to CODING. The resulting unibyte text
157 goes to a place pointed to by DESTINATION, the length of which
158 should not exceed DST_BYTES.
159
160 These functions set the information of original and encoded texts in
161 the members produced, produced_char, consumed, and consumed_char of
162 the structure *CODING. They also set the member result to one of
163 CODING_FINISH_XXX indicating how the encoding finished.
164
165 DST_BYTES zero means that source area and destination area are
166 overlapped, which means that we can produce an encoded text until it
167 reaches at the head of not-yet-encoded source text.
168
169 Below is a template of these functions. */
170 #if 0
171 static void
172 encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
173 struct coding_system *coding;
174 unsigned char *source, *destination;
175 int src_bytes, dst_bytes;
176 {
177 ...
178 }
179 #endif
180
181 /*** COMMONLY USED MACROS ***/
182
183 /* The following two macros ONE_MORE_BYTE and TWO_MORE_BYTES safely
184 get one and two bytes from the source text respectively.
185 If there are not enough bytes in the source, they jump to
186 `label_end_of_loop'. The caller should set variables `coding',
187 `src' and `src_end' to appropriate pointer in advance. These
188 macros are called from decoding routines `decode_coding_XXX', thus
189 it is assumed that the source text is unibyte. */
190
/* Fetch the next source byte into C1 and advance `src'.  When `src'
   has already reached `src_end', record
   CODING_FINISH_INSUFFICIENT_SRC in `coding->result' and jump to
   `label_end_of_loop' instead.  */

#define ONE_MORE_BYTE(c1)					\
  do {								\
    if (src < src_end)						\
      c1 = *src++;						\
    else							\
      {								\
	coding->result = CODING_FINISH_INSUFFICIENT_SRC;	\
	goto label_end_of_loop;					\
      }								\
  } while (0)
200
/* Fetch the next two source bytes into C1 and C2 (in that order) and
   advance `src' past both.  When fewer than two bytes remain before
   `src_end', nothing is fetched: CODING_FINISH_INSUFFICIENT_SRC is
   recorded in `coding->result' and control jumps to
   `label_end_of_loop'.  */

#define TWO_MORE_BYTES(c1, c2)					\
  do {								\
    if (src + 1 < src_end)					\
      {								\
	c1 = *src++;						\
	c2 = *src++;						\
      }								\
    else							\
      {								\
	coding->result = CODING_FINISH_INSUFFICIENT_SRC;	\
	goto label_end_of_loop;					\
      }								\
  } while (0)
211
212
213 /* Set C to the next character at the source text pointed by `src'.
214 If there are not enough characters in the source, jump to
215 `label_end_of_loop'. The caller should set variables `coding'
216 `src', `src_end', and `translation_table' to appropriate pointers
217 in advance. This macro is used in encoding routines
218 `encode_coding_XXX', thus it assumes that the source text is in
219 multibyte form except for 8-bit characters. 8-bit characters are
220 in multibyte form if coding->src_multibyte is nonzero, else they
221 are represented by a single byte. */
222
/* Read one character from `src' into C, pass it through
   `translation_table' when that table is non-nil, and advance `src'
   by the number of bytes the character occupied.  With no bytes left
   before `src_end', set `coding->result' to
   CODING_FINISH_INSUFFICIENT_SRC and jump to `label_end_of_loop'.

   A byte is taken as a raw single-byte character only when
   `coding->src_multibyte' is zero and UNIBYTE_STR_AS_MULTIBYTE_P
   rejects the bytes at `src'; otherwise the character is read with
   STRING_CHAR_AND_LENGTH.  */

#define ONE_MORE_CHAR(c)					\
  do {								\
    int len = src_end - src;					\
    int bytes;							\
								\
    if (len <= 0)						\
      {								\
	coding->result = CODING_FINISH_INSUFFICIENT_SRC;	\
	goto label_end_of_loop;					\
      }								\
    if (! coding->src_multibyte					\
	&& ! UNIBYTE_STR_AS_MULTIBYTE_P (src, len, bytes))	\
      {								\
	c = *src;						\
	bytes = 1;						\
      }								\
    else							\
      c = STRING_CHAR_AND_LENGTH (src, len, bytes);		\
    if (!NILP (translation_table))				\
      c = translate_char (translation_table, c, 0, 0, 0);	\
    src += bytes;						\
  } while (0)
241
242
243 /* Produce a multibyte form of character C to `dst'. Jump to
244 `label_end_of_loop' if there's not enough space at `dst'.
245
246 If we are now in the middle of composition sequence, the decoded
247 character may be ALTCHAR (for the current composition). In that
248 case, the character goes to coding->cmp_data->data instead of
249 `dst'.
250
251 This macro is used in decoding routines. */
252
/* Output character C while decoding.

   Step 1: unless a composition other than COMPOSITION_RELATIVE or
   COMPOSITION_WITH_RULE is in progress, write the multibyte form of
   C at `dst' and count it in `coding->produced_char'.  The write
   limit is `dst_end' when `dst_bytes' is nonzero, else `src'; if C
   does not fit, set CODING_FINISH_INSUFFICIENT_DST and jump to
   `label_end_of_loop'.

   Step 2: if a composition other than COMPOSITION_RELATIVE is in
   progress, also record C as a composition component and note
   whether a rule is expected next (it is not for
   COMPOSITION_WITH_ALTCHARS).  */

#define EMIT_CHAR(c)							\
  do {									\
    if (! COMPOSING_P (coding)						\
	|| coding->composing == COMPOSITION_RELATIVE			\
	|| coding->composing == COMPOSITION_WITH_RULE)			\
      {									\
	int bytes = CHAR_BYTES (c);					\
									\
	if ((dst + bytes) > (dst_bytes ? dst_end : src))		\
	  {								\
	    coding->result = CODING_FINISH_INSUFFICIENT_DST;		\
	    goto label_end_of_loop;					\
	  }								\
	dst += CHAR_STRING (c, dst);					\
	coding->produced_char++;					\
      }									\
									\
    if (COMPOSING_P (coding)						\
	&& coding->composing != COMPOSITION_RELATIVE)			\
      {									\
	CODING_ADD_COMPOSITION_COMPONENT (coding, c);			\
	if (coding->composing == COMPOSITION_WITH_ALTCHARS)		\
	  coding->composition_rule_follows = 0;				\
	else								\
	  coding->composition_rule_follows = 1;				\
      }									\
  } while (0)
277
278
/* Store the single byte C at `dst' and advance `dst'.  The write
   limit is `dst_end' when `dst_bytes' is nonzero, else `src'.  If
   `dst' has already reached the limit, set
   CODING_FINISH_INSUFFICIENT_DST and jump to `label_end_of_loop'.  */

#define EMIT_ONE_BYTE(c)					\
  do {								\
    if (dst < (dst_bytes ? dst_end : src))			\
      *dst++ = c;						\
    else							\
      {								\
	coding->result = CODING_FINISH_INSUFFICIENT_DST;	\
	goto label_end_of_loop;					\
      }								\
  } while (0)
288
/* Store the bytes C1 then C2 at `dst' and advance `dst' by two.  The
   write limit is `dst_end' when `dst_bytes' is nonzero, else `src'.
   If two bytes do not fit, nothing is stored:
   CODING_FINISH_INSUFFICIENT_DST is set and control jumps to
   `label_end_of_loop'.  */

#define EMIT_TWO_BYTES(c1, c2)					\
  do {								\
    if (dst + 2 > (dst_bytes ? dst_end : src))			\
      {								\
	coding->result = CODING_FINISH_INSUFFICIENT_DST;	\
	goto label_end_of_loop;					\
      }								\
    *dst++ = c1;						\
    *dst++ = c2;						\
  } while (0)
298
/* Copy the bytes in [FROM, TO) to `dst', advancing both FROM and
   `dst'; FROM must therefore be a modifiable pointer, and it ends up
   equal to TO.  The write limit is `dst_end' when `dst_bytes' is
   nonzero, else `src'.  If the whole range does not fit, nothing is
   copied: CODING_FINISH_INSUFFICIENT_DST is set and control jumps to
   `label_end_of_loop'.  */

#define EMIT_BYTES(from, to)					\
  do {								\
    if (dst + (to - from) > (dst_bytes ? dst_end : src))	\
      {								\
	coding->result = CODING_FINISH_INSUFFICIENT_DST;	\
	goto label_end_of_loop;					\
      }								\
    for (; from < to; from++)					\
      *dst++ = *from;						\
  } while (0)
309
310 \f
311 /*** 1. Preamble ***/
312
313 #ifdef emacs
314 #include <config.h>
315 #endif
316
317 #include <stdio.h>
318
319 #ifdef emacs
320
321 #include "lisp.h"
322 #include "buffer.h"
323 #include "charset.h"
324 #include "composite.h"
325 #include "ccl.h"
326 #include "coding.h"
327 #include "window.h"
328
329 #else /* not emacs */
330
331 #include "mulelib.h"
332
333 #endif /* not emacs */
334
335 Lisp_Object Qcoding_system, Qeol_type;
336 Lisp_Object Qbuffer_file_coding_system;
337 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
338 Lisp_Object Qno_conversion, Qundecided;
339 Lisp_Object Qcoding_system_history;
340 Lisp_Object Qsafe_charsets;
341 Lisp_Object Qvalid_codes;
342
343 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
344 Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
345 Lisp_Object Qstart_process, Qopen_network_stream;
346 Lisp_Object Qtarget_idx;
347
348 Lisp_Object Vselect_safe_coding_system_function;
349
350 /* Mnemonic string for each format of end-of-line. */
351 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
352 /* Mnemonic string to indicate format of end-of-line is not yet
353 decided. */
354 Lisp_Object eol_mnemonic_undecided;
355
356 /* Format of end-of-line decided by system. This is CODING_EOL_LF on
357 Unix, CODING_EOL_CRLF on DOS/Windows, and CODING_EOL_CR on Mac. */
358 int system_eol_type;
359
#ifdef emacs

Lisp_Object Vcoding_system_list;
Lisp_Object Vcoding_system_alist;

Lisp_Object Qcoding_system_p;
Lisp_Object Qcoding_system_error;

/* The coding systems emacs-mule and raw-text convert only the
   end-of-line format.  */
Lisp_Object Qemacs_mule;
Lisp_Object Qraw_text;

/* Coding systems are passed between Emacs Lisp programs and the C
   internal routines through the following three variables.  */

/* Coding system for reading files and receiving data from a
   process.  */
Lisp_Object Vcoding_system_for_read;

/* Coding system for writing files and sending data to a process.  */
Lisp_Object Vcoding_system_for_write;

/* Coding system actually used in the latest I/O.  */
Lisp_Object Vlast_coding_system_used;

/* A vector of length 256 holding information about special Latin
   codes (especially for dealing with Microsoft codes).  */
Lisp_Object Vlatin_extra_code_table;

/* Flag to inhibit code conversion of end-of-line format.  */
int inhibit_eol_conversion;

/* Flag to make buffer-file-coding-system inherit from
   process-coding.  */
int inherit_process_coding_system;

/* Coding system used to encode text for terminal display.  */
struct coding_system terminal_coding;

/* Coding system used to encode text for terminal display when the
   terminal coding system is nil.  */
struct coding_system safe_terminal_coding;

/* Coding system of what is sent from the terminal keyboard.  */
struct coding_system keyboard_coding;

/* Default coding system used to write a file.  */
struct coding_system default_buffer_file_coding;

Lisp_Object Vfile_coding_system_alist;
Lisp_Object Vprocess_coding_system_alist;
Lisp_Object Vnetwork_coding_system_alist;

Lisp_Object Vlocale_coding_system;

#endif /* emacs */
409
410 Lisp_Object Qcoding_category, Qcoding_category_index;
411
412 /* List of symbols `coding-category-xxx' ordered by priority. */
413 Lisp_Object Vcoding_category_list;
414
415 /* Table of coding categories (Lisp symbols). */
416 Lisp_Object Vcoding_category_table;
417
418 /* Table of names of symbol for each coding-category. */
419 char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
420 "coding-category-emacs-mule",
421 "coding-category-sjis",
422 "coding-category-iso-7",
423 "coding-category-iso-7-tight",
424 "coding-category-iso-8-1",
425 "coding-category-iso-8-2",
426 "coding-category-iso-7-else",
427 "coding-category-iso-8-else",
428 "coding-category-ccl",
429 "coding-category-big5",
430 "coding-category-utf-8",
431 "coding-category-utf-16-be",
432 "coding-category-utf-16-le",
433 "coding-category-raw-text",
434 "coding-category-binary"
435 };
436
437 /* Table of pointers to coding systems corresponding to each coding
438 categories. */
439 struct coding_system *coding_system_table[CODING_CATEGORY_IDX_MAX];
440
441 /* Table of coding category masks. Nth element is a mask for a coding
442 cateogry of which priority is Nth. */
443 static
444 int coding_priorities[CODING_CATEGORY_IDX_MAX];
445
446 /* Flag to tell if we look up translation table on character code
447 conversion. */
448 Lisp_Object Venable_character_translation;
449 /* Standard translation table to look up on decoding (reading). */
450 Lisp_Object Vstandard_translation_table_for_decode;
451 /* Standard translation table to look up on encoding (writing). */
452 Lisp_Object Vstandard_translation_table_for_encode;
453
454 Lisp_Object Qtranslation_table;
455 Lisp_Object Qtranslation_table_id;
456 Lisp_Object Qtranslation_table_for_decode;
457 Lisp_Object Qtranslation_table_for_encode;
458
459 /* Alist of charsets vs revision number. */
460 Lisp_Object Vcharset_revision_alist;
461
462 /* Default coding systems used for process I/O. */
463 Lisp_Object Vdefault_process_coding_system;
464
465 /* Global flag to tell that we can't call post-read-conversion and
466 pre-write-conversion functions. Usually the value is zero, but it
467 is set to 1 temporarily while such functions are running. This is
468 to avoid infinite recursive call. */
469 static int inhibit_pre_post_conversion;
470
471 \f
472 /*** 2. Emacs internal format (emacs-mule) handlers ***/
473
474 /* Emacs' internal format for encoding multiple character sets is a
475 kind of multi-byte encoding, i.e. characters are encoded by
476 variable-length sequences of one-byte codes.
477
478 ASCII characters and control characters (e.g. `tab', `newline') are
479 represented by one-byte sequences which are their ASCII codes, in
480 the range 0x00 through 0x7F.
481
482 8-bit characters of the range 0x80..0x9F are represented by
483 two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
484 code + 0x20).
485
486 8-bit characters of the range 0xA0..0xFF are represented by
487 one-byte sequences which are their 8-bit code.
488
489 The other characters are represented by a sequence of `base
490 leading-code', optional `extended leading-code', and one or two
491 `position-code's. The length of the sequence is determined by the
492 base leading-code. Leading-code takes the range 0x80 through 0x9F,
493 whereas extended leading-code and position-code take the range 0xA0
494 through 0xFF. See `charset.h' for more details about leading-code
495 and position-code.
496
497 --- CODE RANGE of Emacs' internal format ---
498 character set range
499 ------------- -----
500 ascii 0x00..0x7F
501 eight-bit-control LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
502 eight-bit-graphic 0xA0..0xFF
503 ELSE 0x81..0x9F + [0xA0..0xFF]+
504 ---------------------------------------------
505
506 */
507
508 enum emacs_code_class_type emacs_code_class[256];
509
510 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
511 Check if a text is encoded in Emacs' internal format. If it is,
512 return CODING_CATEGORY_MASK_EMACS_MULE, else return 0. */
513
514 int
515 detect_coding_emacs_mule (src, src_end)
516 unsigned char *src, *src_end;
517 {
518 unsigned char c;
519 int composing = 0;
520 /* Dummy for ONE_MORE_BYTE. */
521 struct coding_system dummy_coding;
522 struct coding_system *coding = &dummy_coding;
523
524 while (1)
525 {
526 ONE_MORE_BYTE (c);
527
528 if (composing)
529 {
530 if (c < 0xA0)
531 composing = 0;
532 else if (c == 0xA0)
533 {
534 ONE_MORE_BYTE (c);
535 c &= 0x7F;
536 }
537 else
538 c -= 0x20;
539 }
540
541 if (c < 0x20)
542 {
543 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
544 return 0;
545 }
546 else if (c >= 0x80 && c < 0xA0)
547 {
548 if (c == 0x80)
549 /* Old leading code for a composite character. */
550 composing = 1;
551 else
552 {
553 unsigned char *src_base = src - 1;
554 int bytes;
555
556 if (!UNIBYTE_STR_AS_MULTIBYTE_P (src_base, src_end - src_base,
557 bytes))
558 return 0;
559 src = src_base + bytes;
560 }
561 }
562 }
563 label_end_of_loop:
564 return CODING_CATEGORY_MASK_EMACS_MULE;
565 }
566
567
568 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
569
570 static void
571 decode_coding_emacs_mule (coding, source, destination, src_bytes, dst_bytes)
572 struct coding_system *coding;
573 unsigned char *source, *destination;
574 int src_bytes, dst_bytes;
575 {
576 unsigned char *src = source;
577 unsigned char *src_end = source + src_bytes;
578 unsigned char *dst = destination;
579 unsigned char *dst_end = destination + dst_bytes;
580 /* SRC_BASE remembers the start position in source in each loop.
581 The loop will be exited when there's not enough source code, or
582 when there's not enough destination area to produce a
583 character. */
584 unsigned char *src_base;
585
586 coding->produced_char = 0;
587 while (src < src_end)
588 {
589 unsigned char tmp[MAX_MULTIBYTE_LENGTH], *p;
590 int bytes;
591
592 src_base = src;
593 if (UNIBYTE_STR_AS_MULTIBYTE_P (src, src_end - src, bytes))
594 {
595 p = src;
596 src += bytes;
597 }
598 else
599 {
600 bytes = CHAR_STRING (*src, tmp);
601 p = tmp;
602 src++;
603 }
604 if (dst + bytes >= (dst_bytes ? dst_end : src))
605 {
606 coding->result = CODING_FINISH_INSUFFICIENT_DST;
607 break;
608 }
609 while (bytes--) *dst++ = *p++;
610 coding->produced_char++;
611 }
612 coding->consumed = coding->consumed_char = src_base - source;
613 coding->produced = dst - destination;
614 }
615
/* Encoding to Emacs' internal format is handed directly to
   `encode_eol' with the same arguments.  */
#define encode_coding_emacs_mule(cs, from, to, nsrc, ndst)	\
  encode_eol (cs, from, to, nsrc, ndst)
618
619
620 \f
621 /*** 3. ISO2022 handlers ***/
622
623 /* The following note describes the coding system ISO2022 briefly.
624 Since the intention of this note is to help understand the
625 functions in this file, some parts are NOT ACCURATE or OVERLY
626 SIMPLIFIED. For thorough understanding, please refer to the
627 original document of ISO2022.
628
629 ISO2022 provides many mechanisms to encode several character sets
630 in 7-bit and 8-bit environments. For 7-bit environments, all text
631 is encoded using bytes less than 128. This may make the encoded
632 text a little bit longer, but the text passes more easily through
633 several gateways, some of which strip off MSB (Most Significant Bit).
634
635 There are two kinds of character sets: control character set and
636 graphic character set. The former contains control characters such
637 as `newline' and `escape' to provide control functions (control
638 functions are also provided by escape sequences). The latter
639 contains graphic characters such as 'A' and '-'. Emacs recognizes
640 two control character sets and many graphic character sets.
641
642 Graphic character sets are classified into one of the following
643 four classes, according to the number of bytes (DIMENSION) and
644 number of characters in one dimension (CHARS) of the set:
645 - DIMENSION1_CHARS94
646 - DIMENSION1_CHARS96
647 - DIMENSION2_CHARS94
648 - DIMENSION2_CHARS96
649
650 In addition, each character set is assigned an identification tag,
651 unique for each set, called "final character" (denoted as <F>
652 hereafter). The <F> of each character set is decided by ECMA(*)
653 when it is registered in ISO. The code range of <F> is 0x30..0x7F
654 (0x30..0x3F are for private use only).
655
656 Note (*): ECMA = European Computer Manufacturers Association
657
658 Here are examples of graphic character set [NAME(<F>)]:
659 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
660 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
661 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
662 o DIMENSION2_CHARS96 -- none for the moment
663
664 A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
665 C0 [0x00..0x1F] -- control character plane 0
666 GL [0x20..0x7F] -- graphic character plane 0
667 C1 [0x80..0x9F] -- control character plane 1
668 GR [0xA0..0xFF] -- graphic character plane 1
669
670 A control character set is directly designated and invoked to C0 or
671 C1 by an escape sequence. The most common case is that:
672 - ISO646's control character set is designated/invoked to C0, and
673 - ISO6429's control character set is designated/invoked to C1,
674 and usually these designations/invocations are omitted in encoded
675 text. In a 7-bit environment, only C0 can be used, and a control
676 character for C1 is encoded by an appropriate escape sequence to
677 fit into the environment. All control characters for C1 are
678 defined to have corresponding escape sequences.
679
680 A graphic character set is at first designated to one of four
681 graphic registers (G0 through G3), then these graphic registers are
682 invoked to GL or GR. These designations and invocations can be
683 done independently. The most common case is that G0 is invoked to
684 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
685 these invocations and designations are omitted in encoded text.
686 In a 7-bit environment, only GL can be used.
687
688 When a graphic character set of CHARS94 is invoked to GL, codes
689 0x20 and 0x7F of the GL area work as control characters SPACE and
690 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
691 be used.
692
693 There are two ways of invocation: locking-shift and single-shift.
694 With locking-shift, the invocation lasts until the next different
695 invocation, whereas with single-shift, the invocation affects the
696 following character only and doesn't affect the locking-shift
697 state. Invocations are done by the following control characters or
698 escape sequences:
699
700 ----------------------------------------------------------------------
701 abbrev function cntrl escape seq description
702 ----------------------------------------------------------------------
703 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
704 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
705 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
706 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
707 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
708 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
709 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
710 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
711 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
712 ----------------------------------------------------------------------
713 (*) These are not used by any known coding system.
714
715 Control characters for these functions are defined by macros
716 ISO_CODE_XXX in `coding.h'.
717
718 Designations are done by the following escape sequences:
719 ----------------------------------------------------------------------
720 escape sequence description
721 ----------------------------------------------------------------------
722 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
723 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
724 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
725 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
726 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
727 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
728 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
729 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
730 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
731 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
732 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
733 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
734 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
735 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
736 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
737 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
738 ----------------------------------------------------------------------
739
740 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
741 of dimension 1, chars 94, and final character <F>, etc...
742
743 Note (*): Although these designations are not allowed in ISO2022,
744 Emacs accepts them on decoding, and produces them on encoding
745 CHARS96 character sets in a coding system which is characterized as
746 7-bit environment, non-locking-shift, and non-single-shift.
747
748 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
749 '(' can be omitted. We refer to this as "short-form" hereafter.
750
751 Now you may notice that there are a lot of ways for encoding the
752 same multilingual text in ISO2022. Actually, there exist many
753 coding systems such as Compound Text (used in X11's inter client
754 communication), ISO-2022-JP (used in Japanese internet), ISO-2022-KR
755 (used in Korean internet), EUC (Extended UNIX Code, used in Asian
756 localized platforms), and all of these are variants of ISO2022.
757
758 In addition to the above, Emacs handles two more kinds of escape
759 sequences: ISO6429's direction specification and Emacs' private
760 sequence for specifying character composition.
761
762 ISO6429's direction specification takes the following form:
763 o CSI ']' -- end of the current direction
764 o CSI '0' ']' -- end of the current direction
765 o CSI '1' ']' -- start of left-to-right text
766 o CSI '2' ']' -- start of right-to-left text
767 The control character CSI (0x9B: control sequence introducer) is
768 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
769
770 Character composition specification takes the following form:
771 o ESC '0' -- start relative composition
772 o ESC '1' -- end composition
773 o ESC '2' -- start rule-base composition (*)
774 o ESC '3' -- start relative composition with alternate chars (**)
775 o ESC '4' -- start rule-base composition with alternate chars (**)
776 Since these are not standard escape sequences of any ISO standard,
777 the use of them for these meaning is restricted to Emacs only.
778
779 (*) This form is used only in Emacs 20.5 and the older versions,
780 but the newer versions can safely decode it.
781 (**) This form is used only in Emacs 21.1 and the newer versions,
782 and the older versions can't decode it.
783
784 Here's a list of example usages of these composition escape
785 sequences (categorized by `enum composition_method').
786
787 COMPOSITION_RELATIVE:
788 ESC 0 CHAR [ CHAR ] ESC 1
789 COMPOSITION_WITH_RULE:
790 ESC 2 CHAR [ RULE CHAR ] ESC 1
791 COMPOSITION_WITH_ALTCHARS:
792 ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
793 COMPOSITION_WITH_RULE_ALTCHARS:
794 ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
795
796 enum iso_code_class_type iso_code_class[256];
797
798 #define CHARSET_OK(idx, charset) \
799 (coding_system_table[idx] \
800 && (coding_system_table[idx]->safe_charsets[charset] \
801 || (CODING_SPEC_ISO_REQUESTED_DESIGNATION \
802 (coding_system_table[idx], charset) \
803 != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)))
804
805 #define SHIFT_OUT_OK(idx) \
806 (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding_system_table[idx], 1) >= 0)
807
808 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
809 Check if a text is encoded in ISO2022. If it is, returns an
810 integer in which appropriate flag bits any of:
811 CODING_CATEGORY_MASK_ISO_7
812 CODING_CATEGORY_MASK_ISO_7_TIGHT
813 CODING_CATEGORY_MASK_ISO_8_1
814 CODING_CATEGORY_MASK_ISO_8_2
815 CODING_CATEGORY_MASK_ISO_7_ELSE
816 CODING_CATEGORY_MASK_ISO_8_ELSE
817 are set. If a code which should never appear in ISO2022 is found,
818 returns 0. */
819
820 int
821 detect_coding_iso2022 (src, src_end)
822 unsigned char *src, *src_end;
823 {
824 int mask = CODING_CATEGORY_MASK_ISO;
825 int mask_found = 0;
826 int reg[4], shift_out = 0, single_shifting = 0;
827 int c, c1, i, charset;
828 /* Dummy for ONE_MORE_BYTE. */
829 struct coding_system dummy_coding;
830 struct coding_system *coding = &dummy_coding;
831
832 reg[0] = CHARSET_ASCII, reg[1] = reg[2] = reg[3] = -1;
833 while (mask && src < src_end)
834 {
835 ONE_MORE_BYTE (c);
836 switch (c)
837 {
838 case ISO_CODE_ESC:
839 single_shifting = 0;
840 ONE_MORE_BYTE (c);
841 if (c >= '(' && c <= '/')
842 {
843 /* Designation sequence for a charset of dimension 1. */
844 ONE_MORE_BYTE (c1);
845 if (c1 < ' ' || c1 >= 0x80
846 || (charset = iso_charset_table[0][c >= ','][c1]) < 0)
847 /* Invalid designation sequence. Just ignore. */
848 break;
849 reg[(c - '(') % 4] = charset;
850 }
851 else if (c == '$')
852 {
853 /* Designation sequence for a charset of dimension 2. */
854 ONE_MORE_BYTE (c);
855 if (c >= '@' && c <= 'B')
856 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
857 reg[0] = charset = iso_charset_table[1][0][c];
858 else if (c >= '(' && c <= '/')
859 {
860 ONE_MORE_BYTE (c1);
861 if (c1 < ' ' || c1 >= 0x80
862 || (charset = iso_charset_table[1][c >= ','][c1]) < 0)
863 /* Invalid designation sequence. Just ignore. */
864 break;
865 reg[(c - '(') % 4] = charset;
866 }
867 else
868 /* Invalid designation sequence. Just ignore. */
869 break;
870 }
871 else if (c == 'N' || c == 'O')
872 {
873 /* ESC <Fe> for SS2 or SS3. */
874 mask &= CODING_CATEGORY_MASK_ISO_7_ELSE;
875 break;
876 }
877 else if (c >= '0' && c <= '4')
878 {
879 /* ESC <Fp> for start/end composition. */
880 mask_found |= CODING_CATEGORY_MASK_ISO;
881 break;
882 }
883 else
884 /* Invalid escape sequence. Just ignore. */
885 break;
886
887 /* We found a valid designation sequence for CHARSET. */
888 mask &= ~CODING_CATEGORY_MASK_ISO_8BIT;
889 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7, charset))
890 mask_found |= CODING_CATEGORY_MASK_ISO_7;
891 else
892 mask &= ~CODING_CATEGORY_MASK_ISO_7;
893 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT, charset))
894 mask_found |= CODING_CATEGORY_MASK_ISO_7_TIGHT;
895 else
896 mask &= ~CODING_CATEGORY_MASK_ISO_7_TIGHT;
897 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_ELSE, charset))
898 mask_found |= CODING_CATEGORY_MASK_ISO_7_ELSE;
899 else
900 mask &= ~CODING_CATEGORY_MASK_ISO_7_ELSE;
901 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_8_ELSE, charset))
902 mask_found |= CODING_CATEGORY_MASK_ISO_8_ELSE;
903 else
904 mask &= ~CODING_CATEGORY_MASK_ISO_8_ELSE;
905 break;
906
907 case ISO_CODE_SO:
908 single_shifting = 0;
909 if (shift_out == 0
910 && (reg[1] >= 0
911 || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_7_ELSE)
912 || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_8_ELSE)))
913 {
914 /* Locking shift out. */
915 mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
916 mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
917 }
918 break;
919
920 case ISO_CODE_SI:
921 single_shifting = 0;
922 if (shift_out == 1)
923 {
924 /* Locking shift in. */
925 mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
926 mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
927 }
928 break;
929
930 case ISO_CODE_CSI:
931 single_shifting = 0;
932 case ISO_CODE_SS2:
933 case ISO_CODE_SS3:
934 {
935 int newmask = CODING_CATEGORY_MASK_ISO_8_ELSE;
936
937 if (c != ISO_CODE_CSI)
938 {
939 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
940 & CODING_FLAG_ISO_SINGLE_SHIFT)
941 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
942 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
943 & CODING_FLAG_ISO_SINGLE_SHIFT)
944 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
945 single_shifting = 1;
946 }
947 if (VECTORP (Vlatin_extra_code_table)
948 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
949 {
950 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
951 & CODING_FLAG_ISO_LATIN_EXTRA)
952 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
953 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
954 & CODING_FLAG_ISO_LATIN_EXTRA)
955 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
956 }
957 mask &= newmask;
958 mask_found |= newmask;
959 }
960 break;
961
962 default:
963 if (c < 0x80)
964 {
965 single_shifting = 0;
966 break;
967 }
968 else if (c < 0xA0)
969 {
970 single_shifting = 0;
971 if (VECTORP (Vlatin_extra_code_table)
972 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
973 {
974 int newmask = 0;
975
976 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
977 & CODING_FLAG_ISO_LATIN_EXTRA)
978 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
979 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
980 & CODING_FLAG_ISO_LATIN_EXTRA)
981 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
982 mask &= newmask;
983 mask_found |= newmask;
984 }
985 else
986 return 0;
987 }
988 else
989 {
990 unsigned char *src_begin = src;
991
992 mask &= ~(CODING_CATEGORY_MASK_ISO_7BIT
993 | CODING_CATEGORY_MASK_ISO_7_ELSE);
994 mask_found |= CODING_CATEGORY_MASK_ISO_8_1;
995 /* Check the length of succeeding codes of the range
996 0xA0..0FF. If the byte length is odd, we exclude
997 CODING_CATEGORY_MASK_ISO_8_2. We can check this only
998 when we are not single shifting. */
999 if (!single_shifting
1000 && mask & CODING_CATEGORY_MASK_ISO_8_2)
1001 {
1002 int i = 0;
1003 while (src < src_end)
1004 {
1005 ONE_MORE_BYTE (c);
1006 if (c < 0xA0)
1007 break;
1008 i++;
1009 }
1010
1011 if (i & 1 && src < src_end)
1012 mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
1013 else
1014 mask_found |= CODING_CATEGORY_MASK_ISO_8_2;
1015 }
1016 }
1017 break;
1018 }
1019 }
1020 label_end_of_loop:
1021 return (mask & mask_found);
1022 }
1023
/* Build the character code for the character whose charset is
   CHARSET, whose 1st position code is C1 and whose 2nd position code
   is C2.  If the variable `translation_table' is non-nil, the
   character is obtained through translate_char with that table;
   otherwise it is composed directly by MAKE_CHAR.  */

#define DECODE_ISO_CHARACTER(charset, c1, c2)                   \
  (!NILP (translation_table)                                    \
   ? translate_char (translation_table, -1, charset, c1, c2)    \
   : MAKE_CHAR (charset, c1, c2))
1033
/* Set designation state into CODING: the charset identified by
   DIMENSION, CHARS and FINAL_CHAR becomes the one designated to
   graphic register REG.

   Control jumps to label_invalid_code when FINAL_CHAR is outside
   '0'..127, when the charset is unknown or is neither requested for
   REG nor safe for CODING, and when a designation of ASCII to
   register 0 follows an invalid designation to register 0.  */
#define DECODE_DESIGNATION(reg, dimension, chars, final_char)              \
  do {                                                                     \
    int charset;                                                           \
                                                                           \
    if (final_char < '0' || final_char >= 128)                             \
      goto label_invalid_code;                                             \
    charset = ISO_CHARSET_TABLE (make_number (dimension),                  \
                                 make_number (chars),                      \
                                 make_number (final_char));                \
    /* Reject first, remembering which register was involved.  */         \
    if (charset < 0                                                        \
        || (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) != reg \
            && !coding->safe_charsets[charset]))                           \
      {                                                                    \
        coding->spec.iso2022.last_invalid_designation_register = reg;      \
        goto label_invalid_code;                                           \
      }                                                                    \
    if (coding->spec.iso2022.last_invalid_designation_register == 0        \
        && reg == 0                                                        \
        && charset == CHARSET_ASCII)                                       \
      {                                                                    \
        /* We should insert this designation sequence as is so            \
           that it is surely written back to a file.  */                  \
        coding->spec.iso2022.last_invalid_designation_register = -1;       \
        goto label_invalid_code;                                           \
      }                                                                    \
    coding->spec.iso2022.last_invalid_designation_register = -1;           \
    /* When the direction flag of CODING->mode is set and CHARSET has     \
       a reverse charset, designate the reverse one instead.  */          \
    if ((coding->mode & CODING_MODE_DIRECTION)                             \
        && CHARSET_REVERSE_CHARSET (charset) >= 0)                         \
      charset = CHARSET_REVERSE_CHARSET (charset);                         \
    CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;                   \
  } while (0)
1069
1070 /* Allocate a memory block for storing information about compositions.
1071 The block is chained to the already allocated blocks. */
1072
1073 static void
1074 coding_allocate_composition_data (coding, char_offset)
1075 struct coding_system *coding;
1076 int char_offset;
1077 {
1078 struct composition_data *cmp_data
1079 = (struct composition_data *) xmalloc (sizeof *cmp_data);
1080
1081 cmp_data->char_offset = char_offset;
1082 cmp_data->used = 0;
1083 cmp_data->prev = coding->cmp_data;
1084 cmp_data->next = NULL;
1085 if (coding->cmp_data)
1086 coding->cmp_data->next = cmp_data;
1087 coding->cmp_data = cmp_data;
1088 coding->cmp_data_start = 0;
1089 }
1090
/* Open a record for one composition in the current block of CODING:
   remember where the record begins in CODING->cmp_data_start, mark
   its length slot with -1 until the composition is closed, and store
   the starting position START (made absolute with the block's
   char_offset) and METHOD.  Four slots are reserved; the slot for
   the ending position is filled in by CODING_ADD_COMPOSITION_END.  */

#define CODING_ADD_COMPOSITION_START(coding, start, method)             \
  do {                                                                  \
    struct composition_data *cmp_blk_ = coding->cmp_data;               \
    int cmp_base_ = cmp_blk_->used;                                     \
                                                                        \
    coding->cmp_data_start = cmp_base_;                                 \
    cmp_blk_->data[cmp_base_] = -1;                                     \
    cmp_blk_->data[cmp_base_ + 1] = cmp_blk_->char_offset + start;      \
    cmp_blk_->data[cmp_base_ + 3] = (int) method;                       \
    cmp_blk_->used = cmp_base_ + 4;                                     \
  } while (0)
1103
/* Close the composition record that begins at CODING->cmp_data_start:
   store the number of slots the record occupies in its first slot
   and the ending position END (made absolute with the block's
   char_offset) in its third slot.  */

#define CODING_ADD_COMPOSITION_END(coding, end)                         \
  do {                                                                  \
    struct composition_data *cmp_blk_ = coding->cmp_data;               \
    int cmp_base_ = coding->cmp_data_start;                             \
                                                                        \
    cmp_blk_->data[cmp_base_] = cmp_blk_->used - cmp_base_;             \
    cmp_blk_->data[cmp_base_ + 2] = cmp_blk_->char_offset + end;        \
  } while (0)
1113
/* Append one COMPONENT (alternate character or composition rule) to
   the current block of CODING and advance its `used' count.  The
   macro is an expression whose value is the stored component.  */

#define CODING_ADD_COMPOSITION_COMPONENT(coding, component)             \
  (*(coding->cmp_data->data + coding->cmp_data->used++) = component)
1118
/* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
   C1 is the byte that followed ESC.  */

#define DECODE_COMPOSITION_START(c1)                                       \
  do {                                                                     \
    if (coding->composing == COMPOSITION_DISABLED)                         \
      {                                                                    \
        /* Compositions are disabled: copy the sequence to DST.  */       \
        *dst++ = ISO_CODE_ESC;                                             \
        *dst++ = c1 & 0x7f;                                                \
        coding->produced_char += 2;                                        \
      }                                                                    \
    else if (COMPOSING_P (coding))                                         \
      {                                                                    \
        /* We are already handling a composition.  If the method is       \
           one of the following two, the codes following the current      \
           escape sequence are actual characters stored in a buffer.  */  \
        if (coding->composing == COMPOSITION_WITH_ALTCHARS                 \
            || coding->composing == COMPOSITION_WITH_RULE_ALTCHARS)        \
          {                                                                \
            coding->composing = COMPOSITION_RELATIVE;                      \
            coding->composition_rule_follows = 0;                          \
          }                                                                \
      }                                                                    \
    else                                                                   \
      {                                                                    \
        /* This is surely the start of a composition.  We must be sure    \
           that coding->cmp_data has enough space to store the            \
           information about the composition.  If not, terminate the      \
           current decoding loop, allocate one more memory block for      \
           coding->cmp_data in the caller, then start the decoding        \
           loop again.  We can't allocate memory here directly because    \
           it may cause buffer/string relocation.  */                     \
        if (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH     \
            >= COMPOSITION_DATA_SIZE)                                      \
          {                                                                \
            coding->result = CODING_FINISH_INSUFFICIENT_CMP;               \
            goto label_end_of_loop;                                        \
          }                                                                \
        coding->composing = (c1 == '0' ? COMPOSITION_RELATIVE              \
                             : c1 == '2' ? COMPOSITION_WITH_RULE           \
                             : c1 == '3' ? COMPOSITION_WITH_ALTCHARS       \
                             : COMPOSITION_WITH_RULE_ALTCHARS);            \
        CODING_ADD_COMPOSITION_START (coding, coding->produced_char,       \
                                      coding->composing);                  \
        coding->composition_rule_follows = 0;                              \
      }                                                                    \
  } while (0)
1165
/* Handle composition end sequence ESC 1.  C1 is the byte that
   followed ESC.  Unless compositions are disabled, the current
   composition record is closed at the current character position and
   CODING leaves the composing state; when they are disabled the two
   bytes are copied to DST instead.  */

#define DECODE_COMPOSITION_END(c1)                                      \
  do {                                                                  \
    if (coding->composing != COMPOSITION_DISABLED)                      \
      {                                                                 \
        CODING_ADD_COMPOSITION_END (coding, coding->produced_char);     \
        coding->composing = COMPOSITION_NO;                             \
      }                                                                 \
    else                                                                \
      {                                                                 \
        *dst++ = ISO_CODE_ESC;                                          \
        *dst++ = c1;                                                    \
        coding->produced_char += 2;                                     \
      }                                                                 \
  } while (0)
1182
/* Decode a composition rule from the byte C1 (and maybe one more byte
   read from SRC into the caller's variable C2) and store one encoded
   composition rule in coding->cmp_data.  C1 is modified: 32 is
   subtracted from it.  A value that matches neither format stores
   rule 0.  */

#define DECODE_COMPOSITION_RULE(c1)                                     \
  do {                                                                  \
    int rule = 0;                                                       \
    (c1) -= 32;                                                         \
    if (c1 >= 81)                                                       \
      {                                                                 \
        if (c1 < 93)    /* new format (after ver.21) */                 \
          {                                                             \
            ONE_MORE_BYTE (c2);                                         \
            rule = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32);          \
          }                                                             \
      }                                                                 \
    else                /* old format (before ver.21) */                \
      {                                                                 \
        int gref = (c1) / 9;                                            \
        int nref = (c1) % 9;                                            \
        gref = gref == 4 ? 10 : gref;                                   \
        nref = nref == 4 ? 10 : nref;                                   \
        rule = COMPOSITION_ENCODE_RULE (gref, nref);                    \
      }                                                                 \
    CODING_ADD_COMPOSITION_COMPONENT (coding, rule);                    \
    coding->composition_rule_follows = 0;                               \
  } while (0)
1207
1208
1209 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
1210
1211 static void
1212 decode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
1213 struct coding_system *coding;
1214 unsigned char *source, *destination;
1215 int src_bytes, dst_bytes;
1216 {
1217 unsigned char *src = source;
1218 unsigned char *src_end = source + src_bytes;
1219 unsigned char *dst = destination;
1220 unsigned char *dst_end = destination + dst_bytes;
1221 /* Charsets invoked to graphic plane 0 and 1 respectively. */
1222 int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1223 int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1224 /* SRC_BASE remembers the start position in source in each loop.
1225 The loop will be exited when there's not enough source code
1226 (within macro ONE_MORE_BYTE), or when there's not enough
1227 destination area to produce a character (within macro
1228 EMIT_CHAR). */
1229 unsigned char *src_base;
1230 int c, charset;
1231 Lisp_Object translation_table;
1232
1233 if (NILP (Venable_character_translation))
1234 translation_table = Qnil;
1235 else
1236 {
1237 translation_table = coding->translation_table_for_decode;
1238 if (NILP (translation_table))
1239 translation_table = Vstandard_translation_table_for_decode;
1240 }
1241
1242 coding->result = CODING_FINISH_NORMAL;
1243
1244 while (1)
1245 {
1246 int c1, c2;
1247
1248 src_base = src;
1249 ONE_MORE_BYTE (c1);
1250
1251 /* We produce no character or one character. */
1252 switch (iso_code_class [c1])
1253 {
1254 case ISO_0x20_or_0x7F:
1255 if (COMPOSING_P (coding) && coding->composition_rule_follows)
1256 {
1257 DECODE_COMPOSITION_RULE (c1);
1258 continue;
1259 }
1260 if (charset0 < 0 || CHARSET_CHARS (charset0) == 94)
1261 {
1262 /* This is SPACE or DEL. */
1263 charset = CHARSET_ASCII;
1264 break;
1265 }
1266 /* This is a graphic character, we fall down ... */
1267
1268 case ISO_graphic_plane_0:
1269 if (COMPOSING_P (coding) && coding->composition_rule_follows)
1270 {
1271 DECODE_COMPOSITION_RULE (c1);
1272 continue;
1273 }
1274 charset = charset0;
1275 break;
1276
1277 case ISO_0xA0_or_0xFF:
1278 if (charset1 < 0 || CHARSET_CHARS (charset1) == 94
1279 || coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
1280 goto label_invalid_code;
1281 /* This is a graphic character, we fall down ... */
1282
1283 case ISO_graphic_plane_1:
1284 if (charset1 < 0)
1285 goto label_invalid_code;
1286 charset = charset1;
1287 break;
1288
1289 case ISO_control_0:
1290 if (COMPOSING_P (coding))
1291 DECODE_COMPOSITION_END ('1');
1292
1293 /* All ISO2022 control characters in this class have the
1294 same representation in Emacs internal format. */
1295 if (c1 == '\n'
1296 && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1297 && (coding->eol_type == CODING_EOL_CR
1298 || coding->eol_type == CODING_EOL_CRLF))
1299 {
1300 coding->result = CODING_FINISH_INCONSISTENT_EOL;
1301 goto label_end_of_loop;
1302 }
1303 charset = CHARSET_ASCII;
1304 break;
1305
1306 case ISO_control_1:
1307 if (COMPOSING_P (coding))
1308 DECODE_COMPOSITION_END ('1');
1309 goto label_invalid_code;
1310
1311 case ISO_carriage_return:
1312 if (COMPOSING_P (coding))
1313 DECODE_COMPOSITION_END ('1');
1314
1315 if (coding->eol_type == CODING_EOL_CR)
1316 c1 = '\n';
1317 else if (coding->eol_type == CODING_EOL_CRLF)
1318 {
1319 ONE_MORE_BYTE (c1);
1320 if (c1 != ISO_CODE_LF)
1321 {
1322 if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1323 {
1324 coding->result = CODING_FINISH_INCONSISTENT_EOL;
1325 goto label_end_of_loop;
1326 }
1327 src--;
1328 c1 = '\r';
1329 }
1330 }
1331 charset = CHARSET_ASCII;
1332 break;
1333
1334 case ISO_shift_out:
1335 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1336 || CODING_SPEC_ISO_DESIGNATION (coding, 1) < 0)
1337 goto label_invalid_code;
1338 CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
1339 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1340 continue;
1341
1342 case ISO_shift_in:
1343 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
1344 goto label_invalid_code;
1345 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
1346 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1347 continue;
1348
1349 case ISO_single_shift_2_7:
1350 case ISO_single_shift_2:
1351 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1352 goto label_invalid_code;
1353 /* SS2 is handled as an escape sequence of ESC 'N' */
1354 c1 = 'N';
1355 goto label_escape_sequence;
1356
1357 case ISO_single_shift_3:
1358 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1359 goto label_invalid_code;
1360 /* SS2 is handled as an escape sequence of ESC 'O' */
1361 c1 = 'O';
1362 goto label_escape_sequence;
1363
1364 case ISO_control_sequence_introducer:
1365 /* CSI is handled as an escape sequence of ESC '[' ... */
1366 c1 = '[';
1367 goto label_escape_sequence;
1368
1369 case ISO_escape:
1370 ONE_MORE_BYTE (c1);
1371 label_escape_sequence:
1372 /* Escape sequences handled by Emacs are invocation,
1373 designation, direction specification, and character
1374 composition specification. */
1375 switch (c1)
1376 {
1377 case '&': /* revision of following character set */
1378 ONE_MORE_BYTE (c1);
1379 if (!(c1 >= '@' && c1 <= '~'))
1380 goto label_invalid_code;
1381 ONE_MORE_BYTE (c1);
1382 if (c1 != ISO_CODE_ESC)
1383 goto label_invalid_code;
1384 ONE_MORE_BYTE (c1);
1385 goto label_escape_sequence;
1386
1387 case '$': /* designation of 2-byte character set */
1388 if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
1389 goto label_invalid_code;
1390 ONE_MORE_BYTE (c1);
1391 if (c1 >= '@' && c1 <= 'B')
1392 { /* designation of JISX0208.1978, GB2312.1980,
1393 or JISX0208.1980 */
1394 DECODE_DESIGNATION (0, 2, 94, c1);
1395 }
1396 else if (c1 >= 0x28 && c1 <= 0x2B)
1397 { /* designation of DIMENSION2_CHARS94 character set */
1398 ONE_MORE_BYTE (c2);
1399 DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
1400 }
1401 else if (c1 >= 0x2C && c1 <= 0x2F)
1402 { /* designation of DIMENSION2_CHARS96 character set */
1403 ONE_MORE_BYTE (c2);
1404 DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
1405 }
1406 else
1407 goto label_invalid_code;
1408 /* We must update these variables now. */
1409 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1410 charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1411 continue;
1412
1413 case 'n': /* invocation of locking-shift-2 */
1414 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1415 || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
1416 goto label_invalid_code;
1417 CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
1418 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1419 continue;
1420
1421 case 'o': /* invocation of locking-shift-3 */
1422 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1423 || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
1424 goto label_invalid_code;
1425 CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
1426 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1427 continue;
1428
1429 case 'N': /* invocation of single-shift-2 */
1430 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1431 || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
1432 goto label_invalid_code;
1433 charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
1434 ONE_MORE_BYTE (c1);
1435 break;
1436
1437 case 'O': /* invocation of single-shift-3 */
1438 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1439 || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
1440 goto label_invalid_code;
1441 charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
1442 ONE_MORE_BYTE (c1);
1443 break;
1444
1445 case '0': case '2': case '3': case '4': /* start composition */
1446 DECODE_COMPOSITION_START (c1);
1447 continue;
1448
1449 case '1': /* end composition */
1450 DECODE_COMPOSITION_END (c1);
1451 continue;
1452
1453 case '[': /* specification of direction */
1454 if (coding->flags & CODING_FLAG_ISO_NO_DIRECTION)
1455 goto label_invalid_code;
1456 /* For the moment, nested direction is not supported.
1457 So, `coding->mode & CODING_MODE_DIRECTION' zero means
1458 left-to-right, and nozero means right-to-left. */
1459 ONE_MORE_BYTE (c1);
1460 switch (c1)
1461 {
1462 case ']': /* end of the current direction */
1463 coding->mode &= ~CODING_MODE_DIRECTION;
1464
1465 case '0': /* end of the current direction */
1466 case '1': /* start of left-to-right direction */
1467 ONE_MORE_BYTE (c1);
1468 if (c1 == ']')
1469 coding->mode &= ~CODING_MODE_DIRECTION;
1470 else
1471 goto label_invalid_code;
1472 break;
1473
1474 case '2': /* start of right-to-left direction */
1475 ONE_MORE_BYTE (c1);
1476 if (c1 == ']')
1477 coding->mode |= CODING_MODE_DIRECTION;
1478 else
1479 goto label_invalid_code;
1480 break;
1481
1482 default:
1483 goto label_invalid_code;
1484 }
1485 continue;
1486
1487 default:
1488 if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
1489 goto label_invalid_code;
1490 if (c1 >= 0x28 && c1 <= 0x2B)
1491 { /* designation of DIMENSION1_CHARS94 character set */
1492 ONE_MORE_BYTE (c2);
1493 DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
1494 }
1495 else if (c1 >= 0x2C && c1 <= 0x2F)
1496 { /* designation of DIMENSION1_CHARS96 character set */
1497 ONE_MORE_BYTE (c2);
1498 DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
1499 }
1500 else
1501 goto label_invalid_code;
1502 /* We must update these variables now. */
1503 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1504 charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1505 continue;
1506 }
1507 }
1508
1509 /* Now we know CHARSET and 1st position code C1 of a character.
1510 Produce a multibyte sequence for that character while getting
1511 2nd position code C2 if necessary. */
1512 if (CHARSET_DIMENSION (charset) == 2)
1513 {
1514 ONE_MORE_BYTE (c2);
1515 if (c1 < 0x80 ? c2 < 0x20 || c2 >= 0x80 : c2 < 0xA0)
1516 /* C2 is not in a valid range. */
1517 goto label_invalid_code;
1518 }
1519 c = DECODE_ISO_CHARACTER (charset, c1, c2);
1520 EMIT_CHAR (c);
1521 continue;
1522
1523 label_invalid_code:
1524 coding->errors++;
1525 if (COMPOSING_P (coding))
1526 DECODE_COMPOSITION_END ('1');
1527 src = src_base;
1528 c = *src++;
1529 EMIT_CHAR (c);
1530 }
1531
1532 label_end_of_loop:
1533 coding->consumed = coding->consumed_char = src_base - source;
1534 coding->produced = dst - destination;
1535 return;
1536 }
1537
1538
1539 /* ISO2022 encoding stuff. */
1540
1541 /*
1542 It is not enough to say just "ISO2022" on encoding, we have to
1543 specify more details. In Emacs, each coding system of ISO2022
1544 variant has the following specifications:
1545 1. Initial designation to G0 thru G3.
1546 2. Allows short-form designation?
1547 3. ASCII should be designated to G0 before control characters?
1548 4. ASCII should be designated to G0 at end of line?
1549 5. 7-bit environment or 8-bit environment?
1550 6. Use locking-shift?
1551 7. Use Single-shift?
1552 And the following two are only for Japanese:
1553 8. Use ASCII in place of JIS0201-1976-Roman?
1554 9. Use JISX0208-1983 in place of JISX0208-1978?
1555 These specifications are encoded in `coding->flags' as flag bits
1556 defined by macros CODING_FLAG_ISO_XXX. See `coding.h' for more
1557 details.
1558 */
1559
/* Produce codes (escape sequence) for designating CHARSET to graphic
   register REG at DST, and increment DST.  If the revision number
   recorded in CODING for CHARSET is less than 255, the sequence is
   preceded by ESC & <'@' + revision>.  If <final-char> of CHARSET is
   '@', 'A', or 'B' and the coding system CODING allows, produce
   designation sequence of short-form.  Finally record CHARSET as the
   charset designated to REG in CODING.  */

#define ENCODE_DESIGNATION(charset, reg, coding)                           \
  do {                                                                     \
    unsigned char final_char = CHARSET_ISO_FINAL_CHAR (charset);           \
    int revision = CODING_SPEC_ISO_REVISION_NUMBER (coding, charset);      \
    /* Intermediate characters for registers 0..3, for 94-character       \
       and 96-character sets respectively.  */                            \
    char *intermediate_chars                                               \
      = CHARSET_CHARS (charset) == 94 ? "()*+" : ",-./";                   \
    int multibyte = CHARSET_DIMENSION (charset) != 1;                      \
    /* The short form omits the intermediate character.  */               \
    int short_form = (multibyte                                            \
                      && CHARSET_CHARS (charset) == 94                     \
                      && (coding->flags & CODING_FLAG_ISO_SHORT_FORM)      \
                      && reg == 0                                          \
                      && final_char >= '@' && final_char <= 'B');          \
                                                                           \
    if (revision < 255)                                                    \
      {                                                                    \
        *dst++ = ISO_CODE_ESC;                                             \
        *dst++ = '&';                                                      \
        *dst++ = '@' + revision;                                           \
      }                                                                    \
    *dst++ = ISO_CODE_ESC;                                                 \
    if (multibyte)                                                         \
      *dst++ = '$';                                                        \
    if (!short_form)                                                       \
      *dst++ = (unsigned char) (intermediate_chars[reg]);                  \
    *dst++ = final_char;                                                   \
    CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;                   \
  } while (0)
1602
/* The following two macros produce codes (control character or escape
   sequence) for ISO2022 single-shift functions (single-shift-2 and
   single-shift-3).  The escape sequence form is used when CODING has
   the seven-bits flag.  Both macros mark CODING as single shifting.  */

#define ENCODE_SINGLE_SHIFT_2                                   \
  do {                                                          \
    if (!(coding->flags & CODING_FLAG_ISO_SEVEN_BITS))          \
      *dst++ = ISO_CODE_SS2;                                    \
    else                                                        \
      {                                                         \
        *dst++ = ISO_CODE_ESC;                                  \
        *dst++ = 'N';                                           \
      }                                                         \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;               \
  } while (0)

#define ENCODE_SINGLE_SHIFT_3                                   \
  do {                                                          \
    if (!(coding->flags & CODING_FLAG_ISO_SEVEN_BITS))          \
      *dst++ = ISO_CODE_SS3;                                    \
    else                                                        \
      {                                                         \
        *dst++ = ISO_CODE_ESC;                                  \
        *dst++ = 'O';                                           \
      }                                                         \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;               \
  } while (0)
1624
/* The following four macros produce codes (control character or
   escape sequence) for ISO2022 locking-shift functions (shift-in,
   shift-out, locking-shift-2, and locking-shift-3).  Each one also
   records in CODING which graphic register (0, 1, 2 or 3) is now
   invoked to graphic plane 0.  */

#define ENCODE_SHIFT_IN                                 \
  do {                                                  \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;         \
    *dst++ = ISO_CODE_SI;                               \
  } while (0)

#define ENCODE_SHIFT_OUT                                \
  do {                                                  \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;         \
    *dst++ = ISO_CODE_SO;                               \
  } while (0)

#define ENCODE_LOCKING_SHIFT_2                          \
  do {                                                  \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;         \
    *dst++ = ISO_CODE_ESC;                              \
    *dst++ = 'n';                                       \
  } while (0)

#define ENCODE_LOCKING_SHIFT_3                          \
  do {                                                  \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;         \
    *dst++ = ISO_CODE_ESC;                              \
    *dst++ = 'o';                                       \
  } while (0)
1652
/* Produce codes for a DIMENSION1 character whose character set is
   CHARSET and whose position-code is C1.  Designation and invocation
   sequences are also produced in advance if necessary: the body is a
   loop that is left as soon as something has been written for the
   character, and repeated after each call of
   encode_invocation_designation.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                       \
  do {                                                                     \
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                          \
      {                                                                    \
        /* A single-shift code precedes this character.  */               \
        *dst++ = ((coding->flags & CODING_FLAG_ISO_SEVEN_BITS)             \
                  ? (c1 & 0x7F) : (c1 | 0x80));                            \
        CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                      \
        break;                                                             \
      }                                                                    \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))              \
      {                                                                    \
        *dst++ = c1 & 0x7F;                                                \
        break;                                                             \
      }                                                                    \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))              \
      {                                                                    \
        *dst++ = c1 | 0x80;                                                \
        break;                                                             \
      }                                                                    \
    if ((coding->flags & CODING_FLAG_ISO_SAFE)                             \
        && !coding->safe_charsets[charset])                                \
      {                                                                    \
        /* We should not encode this character, instead produce one or    \
           two `?'s.  */                                                  \
        *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;                    \
        if (CHARSET_WIDTH (charset) == 2)                                  \
          *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;                  \
        break;                                                             \
      }                                                                    \
    /* Since CHARSET is not yet invoked to any graphic planes, we         \
       must invoke it, or, at first, designate it to some graphic         \
       register.  Then repeat the loop to actually produce the            \
       character.  */                                                     \
    dst = encode_invocation_designation (charset, coding, dst);            \
  } while (1)
1695
/* Produce codes for a DIMENSION2 character whose character set is
   CHARSET and whose position-codes are C1 and C2.  Designation and
   invocation codes are also produced in advance if necessary: the
   body is a loop that is left as soon as something has been written
   for the character, and repeated after each call of
   encode_invocation_designation.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                   \
  do {                                                                     \
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                          \
      {                                                                    \
        /* A single-shift code precedes this character.  */               \
        if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                    \
          {                                                                \
            *dst++ = c1 & 0x7F;                                            \
            *dst++ = c2 & 0x7F;                                            \
          }                                                                \
        else                                                               \
          {                                                                \
            *dst++ = c1 | 0x80;                                            \
            *dst++ = c2 | 0x80;                                            \
          }                                                                \
        CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                      \
        break;                                                             \
      }                                                                    \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))              \
      {                                                                    \
        *dst++ = c1 & 0x7F;                                                \
        *dst++ = c2 & 0x7F;                                                \
        break;                                                             \
      }                                                                    \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))              \
      {                                                                    \
        *dst++ = c1 | 0x80;                                                \
        *dst++ = c2 | 0x80;                                                \
        break;                                                             \
      }                                                                    \
    if ((coding->flags & CODING_FLAG_ISO_SAFE)                             \
        && !coding->safe_charsets[charset])                                \
      {                                                                    \
        /* We should not encode this character, instead produce one or    \
           two `?'s.  */                                                  \
        *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;                    \
        if (CHARSET_WIDTH (charset) == 2)                                  \
          *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;                  \
        break;                                                             \
      }                                                                    \
    /* Since CHARSET is not yet invoked to any graphic planes, we         \
       must invoke it, or, at first, designate it to some graphic         \
       register.  Then repeat the loop to actually produce the            \
       character.  */                                                     \
    dst = encode_invocation_designation (charset, coding, dst);            \
  } while (1)
1738
/* Produce codes at DST for the character of CHARSET whose position
   codes are C1 and C2.  For a charset that is not defined, C1 and
   (when it is not negative) C2 are written as they are.  Otherwise
   the character goes through the DIMENSION1 or DIMENSION2 macro,
   after replacing ASCII by charset_latin_jisx0201 or charset_jisx0208
   by charset_jisx0208_1978 when the corresponding flag of CODING
   asks for it.  */
#define ENCODE_ISO_CHARACTER(charset, c1, c2)                           \
  do {                                                                  \
    int alt_charset = charset;                                          \
                                                                        \
    if (!CHARSET_DEFINED_P (charset))                                   \
      {                                                                 \
        *dst++ = c1;                                                    \
        if (c2 >= 0)                                                    \
          *dst++ = c2;                                                  \
      }                                                                 \
    else if (CHARSET_DIMENSION (charset) == 1)                          \
      {                                                                 \
        if (charset == CHARSET_ASCII                                    \
            && coding->flags & CODING_FLAG_ISO_USE_ROMAN)               \
          alt_charset = charset_latin_jisx0201;                         \
        ENCODE_ISO_CHARACTER_DIMENSION1 (alt_charset, c1);              \
      }                                                                 \
    else                                                                \
      {                                                                 \
        if (charset == charset_jisx0208                                 \
            && coding->flags & CODING_FLAG_ISO_USE_OLDJIS)              \
          alt_charset = charset_jisx0208_1978;                          \
        ENCODE_ISO_CHARACTER_DIMENSION2 (alt_charset, c1, c2);          \
      }                                                                 \
  } while (0)
1767
1768 /* Produce designation and invocation codes at a place pointed by DST
1769 to use CHARSET. The element `spec.iso2022' of *CODING is updated.
1770 Return new DST. */
1771
1772 unsigned char *
1773 encode_invocation_designation (charset, coding, dst)
1774 int charset;
1775 struct coding_system *coding;
1776 unsigned char *dst;
1777 {
1778 int reg; /* graphic register number */
1779
1780 /* At first, check designations. */
1781 for (reg = 0; reg < 4; reg++)
1782 if (charset == CODING_SPEC_ISO_DESIGNATION (coding, reg))
1783 break;
1784
1785 if (reg >= 4)
1786 {
1787 /* CHARSET is not yet designated to any graphic registers. */
1788 /* At first check the requested designation. */
1789 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1790 if (reg == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
1791 /* Since CHARSET requests no special designation, designate it
1792 to graphic register 0. */
1793 reg = 0;
1794
1795 ENCODE_DESIGNATION (charset, reg, coding);
1796 }
1797
1798 if (CODING_SPEC_ISO_INVOCATION (coding, 0) != reg
1799 && CODING_SPEC_ISO_INVOCATION (coding, 1) != reg)
1800 {
1801 /* Since the graphic register REG is not invoked to any graphic
1802 planes, invoke it to graphic plane 0. */
1803 switch (reg)
1804 {
1805 case 0: /* graphic register 0 */
1806 ENCODE_SHIFT_IN;
1807 break;
1808
1809 case 1: /* graphic register 1 */
1810 ENCODE_SHIFT_OUT;
1811 break;
1812
1813 case 2: /* graphic register 2 */
1814 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1815 ENCODE_SINGLE_SHIFT_2;
1816 else
1817 ENCODE_LOCKING_SHIFT_2;
1818 break;
1819
1820 case 3: /* graphic register 3 */
1821 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1822 ENCODE_SINGLE_SHIFT_3;
1823 else
1824 ENCODE_LOCKING_SHIFT_3;
1825 break;
1826 }
1827 }
1828
1829 return dst;
1830 }
1831
/* Produce 2-byte codes for encoded composition rule RULE: the first
   byte is 32 + 81 plus the global reference point of RULE, the
   second is 32 plus its new reference point.  */

#define ENCODE_COMPOSITION_RULE(rule)           \
  do {                                          \
    int gref, nref;                             \
    COMPOSITION_DECODE_RULE (rule, gref, nref); \
    dst[0] = 32 + 81 + gref;                    \
    dst[1] = 32 + nref;                         \
    dst += 2;                                   \
  } while (0)
1841
/* Produce codes for indicating the start of a composition sequence
   (ESC 0, ESC 3, or ESC 4).  DATA points to an array of integers
   which specify information about the composition.  See the comment
   in coding.h for the format of DATA.

   The method is taken from DATA[3].  For a method other than
   COMPOSITION_RELATIVE, CODING->cmp_data_index is set just past the
   four-slot header of the record.  */

#define ENCODE_COMPOSITION_START(coding, data)                          \
  do {                                                                  \
    coding->composing = data[3];                                        \
    *dst++ = ISO_CODE_ESC;                                              \
    if (coding->composing != COMPOSITION_RELATIVE)                      \
      {                                                                 \
        if (coding->composing == COMPOSITION_WITH_ALTCHARS)             \
          *dst++ = '3';                                                 \
        else                                                            \
          *dst++ = '4';                                                 \
        coding->cmp_data_index = coding->cmp_data_start + 4;            \
        coding->composition_rule_follows = 0;                           \
      }                                                                 \
    else                                                                \
      *dst++ = '0';                                                     \
  } while (0)
1861
/* Produce codes (ESC 1) for indicating the end of the current
   composition, step CODING->cmp_data_start over the record whose
   length is DATA[0], and move on to the next composition data block
   when the current one is used up and a next one exists.  */

#define ENCODE_COMPOSITION_END(coding, data)                    \
  do {                                                          \
    *dst++ = ISO_CODE_ESC;                                      \
    *dst++ = '1';                                               \
    coding->cmp_data_start += data[0];                          \
    coding->composing = COMPOSITION_NO;                         \
    if (coding->cmp_data_start == coding->cmp_data->used)       \
      {                                                         \
        if (coding->cmp_data->next)                             \
          {                                                     \
            coding->cmp_data = coding->cmp_data->next;          \
            coding->cmp_data_start = 0;                         \
          }                                                     \
      }                                                         \
  } while (0)
1877
/* Produce composition start sequence ESC 0.  Here, this sequence
   doesn't mean the start of a new composition but means that we have
   just produced components (alternate chars and composition rules) of
   the composition and the actual text follows in SRC.  CODING is
   switched to COMPOSITION_RELATIVE.  */

#define ENCODE_COMPOSITION_FAKE_START(coding)   \
  do {                                          \
    coding->composing = COMPOSITION_RELATIVE;   \
    dst[0] = ISO_CODE_ESC;                      \
    dst[1] = '0';                               \
    dst += 2;                                   \
  } while (0)
1889
/* The following three macros produce codes for indicating direction
   of text.  All of them write through DST and read CODING, both
   taken from the expansion site, and all of them are statements
   (use them as `ENCODE_DIRECTION_R2L;').  */

/* Emit a control sequence introducer: the two bytes ESC '[' when
   CODING has CODING_FLAG_ISO_SEVEN_BITS set (tested as a bit, like
   every other CODING->flags test in this file), otherwise the single
   byte ISO_CODE_CSI.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER                      \
  do {                                                          \
    if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)             \
      {                                                         \
        *dst++ = ISO_CODE_ESC;                                  \
        *dst++ = '[';                                           \
      }                                                         \
    else                                                        \
      *dst++ = ISO_CODE_CSI;                                    \
  } while (0)

/* Emit the introducer followed by "2]" (right-to-left).  */
#define ENCODE_DIRECTION_R2L                                    \
  do {                                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;                         \
    *dst++ = '2';                                               \
    *dst++ = ']';                                               \
  } while (0)

/* Emit the introducer followed by "0]" (left-to-right).  */
#define ENCODE_DIRECTION_L2R                                    \
  do {                                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;                         \
    *dst++ = '0';                                               \
    *dst++ = ']';                                               \
  } while (0)
1905
/* Produce the codes for invocation and designation that bring the
   graphic planes and registers back to their initial state.

   If something other than graphic register 0 is currently invoked
   (CODING_SPEC_ISO_INVOCATION (coding, 0) is nonzero), ENCODE_SHIFT_IN
   is expanded first.  Then, for each of the four graphic registers
   whose initial designation is non-negative and differs from the
   current designation, the initial charset is designated again with
   ENCODE_DESIGNATION.  CODING is taken from the expansion site.  */
#define ENCODE_RESET_PLANE_AND_REGISTER                                 \
  do {                                                                  \
    int reg;                                                            \
    if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0)                    \
      ENCODE_SHIFT_IN;                                                  \
    for (reg = 0; reg < 4; reg++)                                       \
      {                                                                 \
        int initial_charset                                             \
          = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg);          \
        if (initial_charset < 0)                                        \
          continue;                                                     \
        if (CODING_SPEC_ISO_DESIGNATION (coding, reg) == initial_charset) \
          continue;                                                     \
        ENCODE_DESIGNATION (initial_charset, reg, coding);              \
      }                                                                 \
  } while (0)
1920
1921 /* Produce designation sequences of charsets in the line started from
1922 SRC to a place pointed by DST, and return updated DST.
1923
1924 If the current block ends before any end-of-line, we may fail to
1925 find all the necessary designations. */
1926
1927 static unsigned char *
1928 encode_designation_at_bol (coding, translation_table, src, src_end, dst)
1929 struct coding_system *coding;
1930 Lisp_Object translation_table;
1931 unsigned char *src, *src_end, *dst;
1932 {
1933 int charset, c, found = 0, reg;
1934 /* Table of charsets to be designated to each graphic register. */
1935 int r[4];
1936
1937 for (reg = 0; reg < 4; reg++)
1938 r[reg] = -1;
1939
1940 while (found < 4)
1941 {
1942 ONE_MORE_CHAR (c);
1943 if (c == '\n')
1944 break;
1945
1946 charset = CHAR_CHARSET (c);
1947 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1948 if (reg != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION && r[reg] < 0)
1949 {
1950 found++;
1951 r[reg] = charset;
1952 }
1953 }
1954
1955 label_end_of_loop:
1956 if (found)
1957 {
1958 for (reg = 0; reg < 4; reg++)
1959 if (r[reg] >= 0
1960 && CODING_SPEC_ISO_DESIGNATION (coding, reg) != r[reg])
1961 ENCODE_DESIGNATION (r[reg], reg, coding);
1962 }
1963
1964 return dst;
1965 }
1966
1967 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
1968
1969 static void
1970 encode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
1971 struct coding_system *coding;
1972 unsigned char *source, *destination;
1973 int src_bytes, dst_bytes;
1974 {
1975 unsigned char *src = source;
1976 unsigned char *src_end = source + src_bytes;
1977 unsigned char *dst = destination;
1978 unsigned char *dst_end = destination + dst_bytes;
1979 /* Since the maximum bytes produced by each loop is 20, we subtract 19
1980 from DST_END to assure overflow checking is necessary only at the
1981 head of loop. */
1982 unsigned char *adjusted_dst_end = dst_end - 19;
1983 /* SRC_BASE remembers the start position in source in each loop.
1984 The loop will be exited when there's not enough source text to
1985 analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
1986 there's not enough destination area to produce encoded codes
1987 (within macro EMIT_BYTES). */
1988 unsigned char *src_base;
1989 int c;
1990 Lisp_Object translation_table;
1991
1992 if (NILP (Venable_character_translation))
1993 translation_table = Qnil;
1994 else
1995 {
1996 translation_table = coding->translation_table_for_encode;
1997 if (NILP (translation_table))
1998 translation_table = Vstandard_translation_table_for_encode;
1999 }
2000
2001 coding->consumed_char = 0;
2002 coding->errors = 0;
2003 while (1)
2004 {
2005 int charset, c1, c2;
2006
2007 src_base = src;
2008
2009 if (dst >= (dst_bytes ? adjusted_dst_end : (src - 19)))
2010 {
2011 coding->result = CODING_FINISH_INSUFFICIENT_DST;
2012 break;
2013 }
2014
2015 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL
2016 && CODING_SPEC_ISO_BOL (coding))
2017 {
2018 /* We have to produce designation sequences if any now. */
2019 dst = encode_designation_at_bol (coding, translation_table,
2020 src, src_end, dst);
2021 CODING_SPEC_ISO_BOL (coding) = 0;
2022 }
2023
2024 /* Check composition start and end. */
2025 if (coding->composing != COMPOSITION_DISABLED
2026 && coding->cmp_data_start < coding->cmp_data->used)
2027 {
2028 struct composition_data *cmp_data = coding->cmp_data;
2029 int *data = cmp_data->data + coding->cmp_data_start;
2030 int this_pos = cmp_data->char_offset + coding->consumed_char;
2031
2032 if (coding->composing == COMPOSITION_RELATIVE)
2033 {
2034 if (this_pos == data[2])
2035 {
2036 ENCODE_COMPOSITION_END (coding, data);
2037 cmp_data = coding->cmp_data;
2038 data = cmp_data->data + coding->cmp_data_start;
2039 }
2040 }
2041 else if (COMPOSING_P (coding))
2042 {
2043 /* COMPOSITION_WITH_ALTCHARS or COMPOSITION_WITH_RULE_ALTCHAR */
2044 if (coding->cmp_data_index == coding->cmp_data_start + data[0])
2045 /* We have consumed components of the composition.
2046 What follows in SRC is the compositions's base
2047 text. */
2048 ENCODE_COMPOSITION_FAKE_START (coding);
2049 else
2050 {
2051 int c = cmp_data->data[coding->cmp_data_index++];
2052 if (coding->composition_rule_follows)
2053 {
2054 ENCODE_COMPOSITION_RULE (c);
2055 coding->composition_rule_follows = 0;
2056 }
2057 else
2058 {
2059 SPLIT_CHAR (c, charset, c1, c2);
2060 ENCODE_ISO_CHARACTER (charset, c1, c2);
2061 if (coding->composing == COMPOSITION_WITH_RULE_ALTCHARS)
2062 coding->composition_rule_follows = 1;
2063 }
2064 continue;
2065 }
2066 }
2067 if (!COMPOSING_P (coding))
2068 {
2069 if (this_pos == data[1])
2070 {
2071 ENCODE_COMPOSITION_START (coding, data);
2072 continue;
2073 }
2074 }
2075 }
2076
2077 ONE_MORE_CHAR (c);
2078
2079 /* Now encode the character C. */
2080 if (c < 0x20 || c == 0x7F)
2081 {
2082 if (c == '\r')
2083 {
2084 if (! (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
2085 {
2086 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
2087 ENCODE_RESET_PLANE_AND_REGISTER;
2088 *dst++ = c;
2089 continue;
2090 }
2091 /* fall down to treat '\r' as '\n' ... */
2092 c = '\n';
2093 }
2094 if (c == '\n')
2095 {
2096 if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
2097 ENCODE_RESET_PLANE_AND_REGISTER;
2098 if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL)
2099 bcopy (coding->spec.iso2022.initial_designation,
2100 coding->spec.iso2022.current_designation,
2101 sizeof coding->spec.iso2022.initial_designation);
2102 if (coding->eol_type == CODING_EOL_LF
2103 || coding->eol_type == CODING_EOL_UNDECIDED)
2104 *dst++ = ISO_CODE_LF;
2105 else if (coding->eol_type == CODING_EOL_CRLF)
2106 *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
2107 else
2108 *dst++ = ISO_CODE_CR;
2109 CODING_SPEC_ISO_BOL (coding) = 1;
2110 }
2111 else
2112 {
2113 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
2114 ENCODE_RESET_PLANE_AND_REGISTER;
2115 *dst++ = c;
2116 }
2117 }
2118 else if (ASCII_BYTE_P (c))
2119 ENCODE_ISO_CHARACTER (CHARSET_ASCII, c, /* dummy */ c1);
2120 else if (SINGLE_BYTE_CHAR_P (c))
2121 {
2122 *dst++ = c;
2123 coding->errors++;
2124 }
2125 else
2126 {
2127 SPLIT_CHAR (c, charset, c1, c2);
2128 ENCODE_ISO_CHARACTER (charset, c1, c2);
2129 }
2130
2131 coding->consumed_char++;
2132 }
2133
2134 label_end_of_loop:
2135 coding->consumed = src_base - source;
2136 coding->produced = coding->produced_char = dst - destination;
2137 }
2138
2139 \f
2140 /*** 4. SJIS and BIG5 handlers ***/
2141
/* Although SJIS and BIG5 are not ISO coding systems, they are used
   quite widely.  So, for the moment, Emacs supports them directly in
   C code.  But, in the future, they may be supported only by CCL.  */
2145
/* SJIS is a coding system encoding three character sets: ASCII, right
   half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
   as is.  A character of charset katakana-jisx0201 is encoded by
   "position-code + 0x80".  A character of charset japanese-jisx0208
   is encoded in 2 bytes, but the two position-codes are divided and
   shifted so that they fit in the ranges below.

   --- CODE RANGE of SJIS ---
   (character set)      (range)
   ASCII                0x00 .. 0x7F
   KATAKANA-JISX0201    0xA0 .. 0xDF
   JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
            (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
   -------------------------------

*/
2162
/* BIG5 is a coding system encoding two character sets: ASCII and
   Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
   character set and is encoded in two bytes.

   --- CODE RANGE of BIG5 ---
   (character set)      (range)
   ASCII                0x00 .. 0x7F
   Big5 (1st byte)      0xA1 .. 0xFE
        (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
   --------------------------

   Since the number of characters in Big5 is larger than the maximum
   number of characters in an Emacs charset (96x96), it can't be
   handled as one charset.  So, in Emacs, Big5 is divided into two:
   `charset-big5-1' and `charset-big5-2'.  Both are DIMENSION2 and
   CHARS94.  The former contains frequently used characters and the
   latter contains less frequently used characters.  */
2180
/* Macros to decode or encode a character of Big5 in BIG5.  B1 and B2
   are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
   C1 and C2 are the 1st and 2nd position-codes of Emacs' internal
   format.  CHARSET is `charset_big5_1' or `charset_big5_2'.

   Both macros linearize the two-byte code into a single index and
   split it again with the row length of the other format.  The input
   arguments are all read before any output argument is assigned, so
   the same variables may be passed as inputs and outputs.  Every
   argument is parenthesized in the expansion, so arbitrary
   expressions may be passed; B2 of DECODE_BIG5 is evaluated twice,
   so it must be free of side effects.  */

/* Number of Big5 characters which have the same code in 1st byte.  */
#define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)

/* Set CHARSET, C1 and C2 for the Big5 code whose bytes are B1, B2.
   First bytes below 0xC9 belong to `charset_big5_1', the rest to
   `charset_big5_2'.  */
#define DECODE_BIG5(b1, b2, charset, c1, c2)                            \
  do {                                                                  \
    unsigned int temp                                                   \
      = (((b1) - 0xA1) * BIG5_SAME_ROW                                  \
         + (b2) - ((b2) < 0x7F ? 0x40 : 0x62));                         \
    if ((b1) < 0xC9)                                                    \
      (charset) = charset_big5_1;                                       \
    else                                                                \
      {                                                                 \
        (charset) = charset_big5_2;                                     \
        temp -= (0xC9 - 0xA1) * BIG5_SAME_ROW;                          \
      }                                                                 \
    (c1) = temp / (0xFF - 0xA1) + 0x21;                                 \
    (c2) = temp % (0xFF - 0xA1) + 0x21;                                 \
  } while (0)

/* Set B1 and B2 to the BIG5 bytes of the character of CHARSET whose
   position-codes are C1, C2.  */
#define ENCODE_BIG5(charset, c1, c2, b1, b2)                            \
  do {                                                                  \
    unsigned int temp                                                   \
      = ((c1) - 0x21) * (0xFF - 0xA1) + ((c2) - 0x21);                  \
    if ((charset) == charset_big5_2)                                    \
      temp += BIG5_SAME_ROW * (0xC9 - 0xA1);                            \
    (b1) = temp / BIG5_SAME_ROW + 0xA1;                                 \
    (b2) = temp % BIG5_SAME_ROW;                                        \
    (b2) += (b2) < 0x3F ? 0x40 : 0x62;                                  \
  } while (0)
2213
2214 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2215 Check if a text is encoded in SJIS. If it is, return
2216 CODING_CATEGORY_MASK_SJIS, else return 0. */
2217
2218 int
2219 detect_coding_sjis (src, src_end)
2220 unsigned char *src, *src_end;
2221 {
2222 int c;
2223 /* Dummy for ONE_MORE_BYTE. */
2224 struct coding_system dummy_coding;
2225 struct coding_system *coding = &dummy_coding;
2226
2227 while (1)
2228 {
2229 ONE_MORE_BYTE (c);
2230 if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
2231 {
2232 ONE_MORE_BYTE (c);
2233 if (c < 0x40)
2234 return 0;
2235 }
2236 }
2237 label_end_of_loop:
2238 return CODING_CATEGORY_MASK_SJIS;
2239 }
2240
2241 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2242 Check if a text is encoded in BIG5. If it is, return
2243 CODING_CATEGORY_MASK_BIG5, else return 0. */
2244
2245 int
2246 detect_coding_big5 (src, src_end)
2247 unsigned char *src, *src_end;
2248 {
2249 int c;
2250 /* Dummy for ONE_MORE_BYTE. */
2251 struct coding_system dummy_coding;
2252 struct coding_system *coding = &dummy_coding;
2253
2254 while (1)
2255 {
2256 ONE_MORE_BYTE (c);
2257 if (c >= 0xA1)
2258 {
2259 ONE_MORE_BYTE (c);
2260 if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
2261 return 0;
2262 }
2263 }
2264 label_end_of_loop:
2265 return CODING_CATEGORY_MASK_BIG5;
2266 }
2267
/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
   Check if a text is encoded in UTF-8.  If it is, return
   CODING_CATEGORY_MASK_UTF_8, else return 0.  */

/* Predicates classifying the octet C by its high-order bits: a
   one-octet character, a trailing ("extra") octet, or the leading
   octet of a 2- to 6-octet sequence.  */

/* Nonzero if the bits of C selected by MASK are exactly PREFIX.  */
#define UTF_8_PREFIX_P(c, mask, prefix) (((c) & (mask)) == (prefix))

#define UTF_8_1_OCTET_P(c)         ((c) < 0x80)
#define UTF_8_EXTRA_OCTET_P(c)     UTF_8_PREFIX_P (c, 0xC0, 0x80)
#define UTF_8_2_OCTET_LEADING_P(c) UTF_8_PREFIX_P (c, 0xE0, 0xC0)
#define UTF_8_3_OCTET_LEADING_P(c) UTF_8_PREFIX_P (c, 0xF0, 0xE0)
#define UTF_8_4_OCTET_LEADING_P(c) UTF_8_PREFIX_P (c, 0xF8, 0xF0)
#define UTF_8_5_OCTET_LEADING_P(c) UTF_8_PREFIX_P (c, 0xFC, 0xF8)
#define UTF_8_6_OCTET_LEADING_P(c) UTF_8_PREFIX_P (c, 0xFE, 0xFC)
2279
2280 int
2281 detect_coding_utf_8 (src, src_end)
2282 unsigned char *src, *src_end;
2283 {
2284 unsigned char c;
2285 int seq_maybe_bytes;
2286 /* Dummy for ONE_MORE_BYTE. */
2287 struct coding_system dummy_coding;
2288 struct coding_system *coding = &dummy_coding;
2289
2290 while (1)
2291 {
2292 ONE_MORE_BYTE (c);
2293 if (UTF_8_1_OCTET_P (c))
2294 continue;
2295 else if (UTF_8_2_OCTET_LEADING_P (c))
2296 seq_maybe_bytes = 1;
2297 else if (UTF_8_3_OCTET_LEADING_P (c))
2298 seq_maybe_bytes = 2;
2299 else if (UTF_8_4_OCTET_LEADING_P (c))
2300 seq_maybe_bytes = 3;
2301 else if (UTF_8_5_OCTET_LEADING_P (c))
2302 seq_maybe_bytes = 4;
2303 else if (UTF_8_6_OCTET_LEADING_P (c))
2304 seq_maybe_bytes = 5;
2305 else
2306 return 0;
2307
2308 do
2309 {
2310 ONE_MORE_BYTE (c);
2311 if (!UTF_8_EXTRA_OCTET_P (c))
2312 return 0;
2313 seq_maybe_bytes--;
2314 }
2315 while (seq_maybe_bytes > 0);
2316 }
2317
2318 label_end_of_loop:
2319 return CODING_CATEGORY_MASK_UTF_8;
2320 }
2321
/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
   Check if a text is encoded in UTF-16 by looking for a byte order
   mark in its first two bytes.  If they are 0xFE 0xFF (Big Endian),
   return CODING_CATEGORY_MASK_UTF_16_BE; if they are 0xFF 0xFE
   (Little Endian), return CODING_CATEGORY_MASK_UTF_16_LE; else
   return 0.  */
2327
/* Nonzero if the 16-bit code unit VAL is 0xFFFE or 0xFFFF.  */
#define UTF_16_INVALID_P(val)   \
  (((val) == 0xFFFE)            \
   || ((val) == 0xFFFF))

/* Nonzero if the 16-bit code unit VAL is a high surrogate, i.e. lies
   in 0xD800 .. 0xDBFF.  The top six bits identify the range, so the
   mask has to be 0xFC00; masking with 0xD800 itself would also
   accept low surrogates and values such as 0xFFFF.  */
#define UTF_16_HIGH_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xD800)

/* Nonzero if the 16-bit code unit VAL is a low surrogate, i.e. lies
   in 0xDC00 .. 0xDFFF.  */
#define UTF_16_LOW_SURROGATE_P(val)  \
  (((val) & 0xFC00) == 0xDC00)
2337
2338 int
2339 detect_coding_utf_16 (src, src_end)
2340 unsigned char *src, *src_end;
2341 {
2342 unsigned char c1, c2;
2343 /* Dummy for TWO_MORE_BYTES. */
2344 struct coding_system dummy_coding;
2345 struct coding_system *coding = &dummy_coding;
2346
2347 TWO_MORE_BYTES (c1, c2);
2348
2349 if ((c1 == 0xFF) && (c2 == 0xFE))
2350 return CODING_CATEGORY_MASK_UTF_16_LE;
2351 else if ((c1 == 0xFE) && (c2 == 0xFF))
2352 return CODING_CATEGORY_MASK_UTF_16_BE;
2353
2354 label_end_of_loop:
2355 return 0;
2356 }
2357
2358 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
2359 If SJIS_P is 1, decode SJIS text, else decode BIG5 test. */
2360
2361 static void
2362 decode_coding_sjis_big5 (coding, source, destination,
2363 src_bytes, dst_bytes, sjis_p)
2364 struct coding_system *coding;
2365 unsigned char *source, *destination;
2366 int src_bytes, dst_bytes;
2367 int sjis_p;
2368 {
2369 unsigned char *src = source;
2370 unsigned char *src_end = source + src_bytes;
2371 unsigned char *dst = destination;
2372 unsigned char *dst_end = destination + dst_bytes;
2373 /* SRC_BASE remembers the start position in source in each loop.
2374 The loop will be exited when there's not enough source code
2375 (within macro ONE_MORE_BYTE), or when there's not enough
2376 destination area to produce a character (within macro
2377 EMIT_CHAR). */
2378 unsigned char *src_base;
2379 Lisp_Object translation_table;
2380
2381 if (NILP (Venable_character_translation))
2382 translation_table = Qnil;
2383 else
2384 {
2385 translation_table = coding->translation_table_for_decode;
2386 if (NILP (translation_table))
2387 translation_table = Vstandard_translation_table_for_decode;
2388 }
2389
2390 coding->produced_char = 0;
2391 while (1)
2392 {
2393 int c, charset, c1, c2;
2394
2395 src_base = src;
2396 ONE_MORE_BYTE (c1);
2397
2398 if (c1 < 0x80)
2399 {
2400 charset = CHARSET_ASCII;
2401 if (c1 < 0x20)
2402 {
2403 if (c1 == '\r')
2404 {
2405 if (coding->eol_type == CODING_EOL_CRLF)
2406 {
2407 ONE_MORE_BYTE (c2);
2408 if (c2 == '\n')
2409 c1 = c2;
2410 else if (coding->mode
2411 & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2412 {
2413 coding->result = CODING_FINISH_INCONSISTENT_EOL;
2414 goto label_end_of_loop;
2415 }
2416 else
2417 /* To process C2 again, SRC is subtracted by 1. */
2418 src--;
2419 }
2420 else if (coding->eol_type == CODING_EOL_CR)
2421 c1 = '\n';
2422 }
2423 else if (c1 == '\n'
2424 && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2425 && (coding->eol_type == CODING_EOL_CR
2426 || coding->eol_type == CODING_EOL_CRLF))
2427 {
2428 coding->result = CODING_FINISH_INCONSISTENT_EOL;
2429 goto label_end_of_loop;
2430 }
2431 }
2432 }
2433 else
2434 {
2435 if (sjis_p)
2436 {
2437 if (c1 >= 0xF0)
2438 goto label_invalid_code;
2439 if (c1 < 0xA0 || c1 >= 0xE0)
2440 {
2441 /* SJIS -> JISX0208 */
2442 ONE_MORE_BYTE (c2);
2443 if (c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
2444 goto label_invalid_code;
2445 DECODE_SJIS (c1, c2, c1, c2);
2446 charset = charset_jisx0208;
2447 }
2448 else
2449 /* SJIS -> JISX0201-Kana */
2450 charset = charset_katakana_jisx0201;
2451 }
2452 else
2453 {
2454 /* BIG5 -> Big5 */
2455 if (c1 < 0xA1 || c1 > 0xFE)
2456 goto label_invalid_code;
2457 ONE_MORE_BYTE (c2);
2458 if (c2 < 0x40 || (c2 > 0x7E && c2 < 0xA1) || c2 > 0xFE)
2459 goto label_invalid_code;
2460 DECODE_BIG5 (c1, c2, charset, c1, c2);
2461 }
2462 }
2463
2464 c = DECODE_ISO_CHARACTER (charset, c1, c2);
2465 EMIT_CHAR (c);
2466 continue;
2467
2468 label_invalid_code:
2469 coding->errors++;
2470 src = src_base;
2471 c = *src++;
2472 EMIT_CHAR (c);
2473 }
2474
2475 label_end_of_loop:
2476 coding->consumed = coding->consumed_char = src_base - source;
2477 coding->produced = dst - destination;
2478 return;
2479 }
2480
2481 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
2482 This function can encode charsets `ascii', `katakana-jisx0201',
2483 `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'. We
2484 are sure that all these charsets are registered as official charset
2485 (i.e. do not have extended leading-codes). Characters of other
2486 charsets are produced without any encoding. If SJIS_P is 1, encode
2487 SJIS text, else encode BIG5 text. */
2488
2489 static void
2490 encode_coding_sjis_big5 (coding, source, destination,
2491 src_bytes, dst_bytes, sjis_p)
2492 struct coding_system *coding;
2493 unsigned char *source, *destination;
2494 int src_bytes, dst_bytes;
2495 int sjis_p;
2496 {
2497 unsigned char *src = source;
2498 unsigned char *src_end = source + src_bytes;
2499 unsigned char *dst = destination;
2500 unsigned char *dst_end = destination + dst_bytes;
2501 /* SRC_BASE remembers the start position in source in each loop.
2502 The loop will be exited when there's not enough source text to
2503 analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
2504 there's not enough destination area to produce encoded codes
2505 (within macro EMIT_BYTES). */
2506 unsigned char *src_base;
2507 Lisp_Object translation_table;
2508
2509 if (NILP (Venable_character_translation))
2510 translation_table = Qnil;
2511 else
2512 {
2513 translation_table = coding->translation_table_for_decode;
2514 if (NILP (translation_table))
2515 translation_table = Vstandard_translation_table_for_decode;
2516 }
2517
2518 while (1)
2519 {
2520 int c, charset, c1, c2;
2521
2522 src_base = src;
2523 ONE_MORE_CHAR (c);
2524
2525 /* Now encode the character C. */
2526 if (SINGLE_BYTE_CHAR_P (c))
2527 {
2528 switch (c)
2529 {
2530 case '\r':
2531 if (!coding->mode & CODING_MODE_SELECTIVE_DISPLAY)
2532 {
2533 EMIT_ONE_BYTE (c);
2534 break;
2535 }
2536 c = '\n';
2537 case '\n':
2538 if (coding->eol_type == CODING_EOL_CRLF)
2539 {
2540 EMIT_TWO_BYTES ('\r', c);
2541 break;
2542 }
2543 else if (coding->eol_type == CODING_EOL_CR)
2544 c = '\r';
2545 default:
2546 EMIT_ONE_BYTE (c);
2547 }
2548 }
2549 else
2550 {
2551 SPLIT_CHAR (c, charset, c1, c2);
2552 if (sjis_p)
2553 {
2554 if (charset == charset_jisx0208
2555 || charset == charset_jisx0208_1978)
2556 {
2557 ENCODE_SJIS (c1, c2, c1, c2);
2558 EMIT_TWO_BYTES (c1, c2);
2559 }
2560 else if (charset == charset_latin_jisx0201)
2561 EMIT_ONE_BYTE (c1);
2562 else
2563 /* There's no way other than producing the internal
2564 codes as is. */
2565 EMIT_BYTES (src_base, src);
2566 }
2567 else
2568 {
2569 if (charset == charset_big5_1 || charset == charset_big5_2)
2570 {
2571 ENCODE_BIG5 (charset, c1, c2, c1, c2);
2572 EMIT_TWO_BYTES (c1, c2);
2573 }
2574 else
2575 /* There's no way other than producing the internal
2576 codes as is. */
2577 EMIT_BYTES (src_base, src);
2578 }
2579 }
2580 coding->consumed_char++;
2581 }
2582
2583 label_end_of_loop:
2584 coding->consumed = src_base - source;
2585 coding->produced = coding->produced_char = dst - destination;
2586 }
2587
2588 \f
2589 /*** 5. CCL handlers ***/
2590
2591 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2592 Check if a text is encoded in a coding system of which
2593 encoder/decoder are written in CCL program. If it is, return
2594 CODING_CATEGORY_MASK_CCL, else return 0. */
2595
2596 int
2597 detect_coding_ccl (src, src_end)
2598 unsigned char *src, *src_end;
2599 {
2600 unsigned char *valid;
2601 int c;
2602 /* Dummy for ONE_MORE_BYTE. */
2603 struct coding_system dummy_coding;
2604 struct coding_system *coding = &dummy_coding;
2605
2606 /* No coding system is assigned to coding-category-ccl. */
2607 if (!coding_system_table[CODING_CATEGORY_IDX_CCL])
2608 return 0;
2609
2610 valid = coding_system_table[CODING_CATEGORY_IDX_CCL]->spec.ccl.valid_codes;
2611 while (1)
2612 {
2613 ONE_MORE_BYTE (c);
2614 if (! valid[c])
2615 return 0;
2616 }
2617 label_end_of_loop:
2618 return CODING_CATEGORY_MASK_CCL;
2619 }
2620
2621 \f
2622 /*** 6. End-of-line handlers ***/
2623
2624 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
2625
2626 static void
2627 decode_eol (coding, source, destination, src_bytes, dst_bytes)
2628 struct coding_system *coding;
2629 unsigned char *source, *destination;
2630 int src_bytes, dst_bytes;
2631 {
2632 unsigned char *src = source;
2633 unsigned char *dst = destination;
2634 unsigned char *src_end = src + src_bytes;
2635 unsigned char *dst_end = dst + dst_bytes;
2636 Lisp_Object translation_table;
2637 /* SRC_BASE remembers the start position in source in each loop.
2638 The loop will be exited when there's not enough source code
2639 (within macro ONE_MORE_BYTE), or when there's not enough
2640 destination area to produce a character (within macro
2641 EMIT_CHAR). */
2642 unsigned char *src_base;
2643 int c;
2644
2645 translation_table = Qnil;
2646 switch (coding->eol_type)
2647 {
2648 case CODING_EOL_CRLF:
2649 while (1)
2650 {
2651 src_base = src;
2652 ONE_MORE_BYTE (c);
2653 if (c == '\r')
2654 {
2655 ONE_MORE_BYTE (c);
2656 if (c != '\n')
2657 {
2658 if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2659 {
2660 coding->result = CODING_FINISH_INCONSISTENT_EOL;
2661 goto label_end_of_loop;
2662 }
2663 src--;
2664 c = '\r';
2665 }
2666 }
2667 else if (c == '\n'
2668 && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL))
2669 {
2670 coding->result = CODING_FINISH_INCONSISTENT_EOL;
2671 goto label_end_of_loop;
2672 }
2673 EMIT_CHAR (c);
2674 }
2675 break;
2676
2677 case CODING_EOL_CR:
2678 while (1)
2679 {
2680 src_base = src;
2681 ONE_MORE_BYTE (c);
2682 if (c == '\n')
2683 {
2684 if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2685 {
2686 coding->result = CODING_FINISH_INCONSISTENT_EOL;
2687 goto label_end_of_loop;
2688 }
2689 }
2690 else if (c == '\r')
2691 c = '\n';
2692 EMIT_CHAR (c);
2693 }
2694 break;
2695
2696 default: /* no need for EOL handling */
2697 while (1)
2698 {
2699 src_base = src;
2700 ONE_MORE_BYTE (c);
2701 EMIT_CHAR (c);
2702 }
2703 }
2704
2705 label_end_of_loop:
2706 coding->consumed = coding->consumed_char = src_base - source;
2707 coding->produced = dst - destination;
2708 return;
2709 }
2710
2711 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions". Encode
2712 format of end-of-line according to `coding->eol_type'. It also
2713 convert multibyte form 8-bit characers to unibyte if
2714 CODING->src_multibyte is nonzero. If `coding->mode &
2715 CODING_MODE_SELECTIVE_DISPLAY' is nonzero, code '\r' in source text
2716 also means end-of-line. */
2717
2718 static void
2719 encode_eol (coding, source, destination, src_bytes, dst_bytes)
2720 struct coding_system *coding;
2721 unsigned char *source, *destination;
2722 int src_bytes, dst_bytes;
2723 {
2724 unsigned char *src = source;
2725 unsigned char *dst = destination;
2726 unsigned char *src_end = src + src_bytes;
2727 unsigned char *dst_end = dst + dst_bytes;
2728 Lisp_Object translation_table;
2729 /* SRC_BASE remembers the start position in source in each loop.
2730 The loop will be exited when there's not enough source text to
2731 analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
2732 there's not enough destination area to produce encoded codes
2733 (within macro EMIT_BYTES). */
2734 unsigned char *src_base;
2735 int c;
2736 int selective_display = coding->mode & CODING_MODE_SELECTIVE_DISPLAY;
2737
2738 translation_table = Qnil;
2739 if (coding->src_multibyte
2740 && *(src_end - 1) == LEADING_CODE_8_BIT_CONTROL)
2741 {
2742 src_end--;
2743 src_bytes--;
2744 coding->result = CODING_FINISH_INSUFFICIENT_SRC;
2745 }
2746
2747 if (coding->eol_type == CODING_EOL_CRLF)
2748 {
2749 while (src < src_end)
2750 {
2751 src_base = src;
2752 c = *src++;
2753 if (c >= 0x20)
2754 EMIT_ONE_BYTE (c);
2755 else if (c == '\n' || (c == '\r' && selective_display))
2756 EMIT_TWO_BYTES ('\r', '\n');
2757 else
2758 EMIT_ONE_BYTE (c);
2759 }
2760 src_base = src;
2761 label_end_of_loop:
2762 ;
2763 }
2764 else
2765 {
2766 if (src_bytes <= dst_bytes)
2767 {
2768 safe_bcopy (src, dst, src_bytes);
2769 src_base = src_end;
2770 dst += src_bytes;
2771 }
2772 else
2773 {
2774 if (coding->src_multibyte
2775 && *(src + dst_bytes - 1) == LEADING_CODE_8_BIT_CONTROL)
2776 dst_bytes--;
2777 safe_bcopy (src, dst, dst_bytes);
2778 src_base = src + dst_bytes;
2779 dst = destination + dst_bytes;
2780 coding->result = CODING_FINISH_INSUFFICIENT_DST;
2781 }
2782 if (coding->eol_type == CODING_EOL_CR)
2783 {
2784 for (src = destination; src < dst; src++)
2785 if (*src == '\n') *src = '\r';
2786 }
2787 else if (selective_display)
2788 {
2789 for (src = destination; src < dst; src++)
2790 if (*src == '\r') *src = '\n';
2791 }
2792 }
2793 if (coding->src_multibyte)
2794 dst = destination + str_as_unibyte (destination, dst - destination);
2795
2796 coding->consumed = src_base - source;
2797 coding->produced = dst - destination;
2798 }
2799
2800 \f
2801 /*** 7. C library functions ***/
2802
2803 /* In Emacs Lisp, coding system is represented by a Lisp symbol which
2804 has a property `coding-system'. The value of this property is a
2805 vector of length 5 (called as coding-vector). Among elements of
2806 this vector, the first (element[0]) and the fifth (element[4])
2807 carry important information for decoding/encoding. Before
2808 decoding/encoding, this information should be set in fields of a
2809 structure of type `coding_system'.
2810
2811 A value of property `coding-system' can be a symbol of another
2812 subsidiary coding-system. In that case, Emacs gets coding-vector
2813 from that symbol.
2814
2815 `element[0]' contains information to be set in `coding->type'. The
2816 value and its meaning is as follows:
2817
2818 0 -- coding_type_emacs_mule
2819 1 -- coding_type_sjis
2820 2 -- coding_type_iso2022
2821 3 -- coding_type_big5
2822 4 -- coding_type_ccl encoder/decoder written in CCL
2823 nil -- coding_type_no_conversion
2824 t -- coding_type_undecided (automatic conversion on decoding,
2825 no-conversion on encoding)
2826
2827 `element[4]' contains information to be set in `coding->flags' and
2828 `coding->spec'. The meaning varies by `coding->type'.
2829
2830 If `coding->type' is `coding_type_iso2022', element[4] is a vector
2831 of length 32 (of which the first 13 sub-elements are used now).
2832 Meanings of these sub-elements are:
2833
2834 sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
2835 If the value is an integer of valid charset, the charset is
2836 assumed to be designated to graphic register N initially.
2837
	If the value is negative, its absolute value is a charset which
	reserves graphic register N: the charset is not designated
	initially but should be designated to graphic register N just
	before encoding a character in that charset.
2842
2843 If the value is nil, graphic register N is never used on
2844 encoding.
2845
2846 sub-element[N] where N is 4 through 11: to be set in `coding->flags'
2847 Each value takes t or nil. See the section ISO2022 of
2848 `coding.h' for more information.
2849
2850 If `coding->type' is `coding_type_big5', element[4] is t to denote
2851 BIG5-ETen or nil to denote BIG5-HKU.
2852
2853 If `coding->type' takes the other value, element[4] is ignored.
2854
2855 Emacs Lisp's coding system also carries information about format of
2856 end-of-line in a value of property `eol-type'. If the value is
2857 integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
2858 means CODING_EOL_CR. If it is not integer, it should be a vector
2859 of subsidiary coding systems of which property `eol-type' has one
2860 of above values.
2861
2862 */
2863
2864 /* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
2865 and set it in CODING. If CODING_SYSTEM_SYMBOL is invalid, CODING
2866 is setup so that no conversion is necessary and return -1, else
2867 return 0. */
2868
2869 int
2870 setup_coding_system (coding_system, coding)
2871 Lisp_Object coding_system;
2872 struct coding_system *coding;
2873 {
2874 Lisp_Object coding_spec, coding_type, eol_type, plist;
2875 Lisp_Object val;
2876 int i;
2877
2878 /* Initialize some fields required for all kinds of coding systems. */
2879 coding->symbol = coding_system;
2880 coding->common_flags = 0;
2881 coding->mode = 0;
2882 coding->heading_ascii = -1;
2883 coding->post_read_conversion = coding->pre_write_conversion = Qnil;
2884 coding->composing = COMPOSITION_DISABLED;
2885 coding->cmp_data = NULL;
2886
2887 if (NILP (coding_system))
2888 goto label_invalid_coding_system;
2889
2890 coding_spec = Fget (coding_system, Qcoding_system);
2891
2892 if (!VECTORP (coding_spec)
2893 || XVECTOR (coding_spec)->size != 5
2894 || !CONSP (XVECTOR (coding_spec)->contents[3]))
2895 goto label_invalid_coding_system;
2896
2897 eol_type = inhibit_eol_conversion ? Qnil : Fget (coding_system, Qeol_type);
2898 if (VECTORP (eol_type))
2899 {
2900 coding->eol_type = CODING_EOL_UNDECIDED;
2901 coding->common_flags = CODING_REQUIRE_DETECTION_MASK;
2902 }
2903 else if (XFASTINT (eol_type) == 1)
2904 {
2905 coding->eol_type = CODING_EOL_CRLF;
2906 coding->common_flags
2907 = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
2908 }
2909 else if (XFASTINT (eol_type) == 2)
2910 {
2911 coding->eol_type = CODING_EOL_CR;
2912 coding->common_flags
2913 = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
2914 }
2915 else
2916 coding->eol_type = CODING_EOL_LF;
2917
2918 coding_type = XVECTOR (coding_spec)->contents[0];
2919 /* Try short cut. */
2920 if (SYMBOLP (coding_type))
2921 {
2922 if (EQ (coding_type, Qt))
2923 {
2924 coding->type = coding_type_undecided;
2925 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
2926 }
2927 else
2928 coding->type = coding_type_no_conversion;
2929 return 0;
2930 }
2931
2932 /* Get values of coding system properties:
2933 `post-read-conversion', `pre-write-conversion',
2934 `translation-table-for-decode', `translation-table-for-encode'. */
2935 plist = XVECTOR (coding_spec)->contents[3];
2936 /* Pre & post conversion functions should be disabled if
2937 inhibit_pre_post_conversion is nonzero. This is the case that a code
2938 conversion function is called while those functions are running. */
2939 if (! inhibit_pre_post_conversion)
2940 {
2941 coding->post_read_conversion = Fplist_get (plist, Qpost_read_conversion);
2942 coding->pre_write_conversion = Fplist_get (plist, Qpre_write_conversion);
2943 }
2944 val = Fplist_get (plist, Qtranslation_table_for_decode);
2945 if (SYMBOLP (val))
2946 val = Fget (val, Qtranslation_table_for_decode);
2947 coding->translation_table_for_decode = CHAR_TABLE_P (val) ? val : Qnil;
2948 val = Fplist_get (plist, Qtranslation_table_for_encode);
2949 if (SYMBOLP (val))
2950 val = Fget (val, Qtranslation_table_for_encode);
2951 coding->translation_table_for_encode = CHAR_TABLE_P (val) ? val : Qnil;
2952 val = Fplist_get (plist, Qcoding_category);
2953 if (!NILP (val))
2954 {
2955 val = Fget (val, Qcoding_category_index);
2956 if (INTEGERP (val))
2957 coding->category_idx = XINT (val);
2958 else
2959 goto label_invalid_coding_system;
2960 }
2961 else
2962 goto label_invalid_coding_system;
2963
2964 val = Fplist_get (plist, Qsafe_charsets);
2965 if (EQ (val, Qt))
2966 {
2967 for (i = 0; i <= MAX_CHARSET; i++)
2968 coding->safe_charsets[i] = 1;
2969 }
2970 else
2971 {
2972 bzero (coding->safe_charsets, MAX_CHARSET + 1);
2973 while (CONSP (val))
2974 {
2975 if ((i = get_charset_id (XCAR (val))) >= 0)
2976 coding->safe_charsets[i] = 1;
2977 val = XCDR (val);
2978 }
2979 }
2980
2981 /* If the coding system has non-nil `composition' property, enable
2982 composition handling. */
2983 val = Fplist_get (plist, Qcomposition);
2984 if (!NILP (val))
2985 coding->composing = COMPOSITION_NO;
2986
2987 switch (XFASTINT (coding_type))
2988 {
2989 case 0:
2990 coding->type = coding_type_emacs_mule;
2991 if (!NILP (coding->post_read_conversion))
2992 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
2993 if (!NILP (coding->pre_write_conversion))
2994 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
2995 break;
2996
2997 case 1:
2998 coding->type = coding_type_sjis;
2999 coding->common_flags
3000 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3001 break;
3002
3003 case 2:
3004 coding->type = coding_type_iso2022;
3005 coding->common_flags
3006 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3007 {
3008 Lisp_Object val, temp;
3009 Lisp_Object *flags;
3010 int i, charset, reg_bits = 0;
3011
3012 val = XVECTOR (coding_spec)->contents[4];
3013
3014 if (!VECTORP (val) || XVECTOR (val)->size != 32)
3015 goto label_invalid_coding_system;
3016
3017 flags = XVECTOR (val)->contents;
3018 coding->flags
3019 = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
3020 | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
3021 | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
3022 | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
3023 | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
3024 | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
3025 | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
3026 | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
3027 | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION)
3028 | (NILP (flags[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL)
3029 | (NILP (flags[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL)
3030 | (NILP (flags[15]) ? 0 : CODING_FLAG_ISO_SAFE)
3031 | (NILP (flags[16]) ? 0 : CODING_FLAG_ISO_LATIN_EXTRA)
3032 );
3033
3034 /* Invoke graphic register 0 to plane 0. */
3035 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
3036 /* Invoke graphic register 1 to plane 1 if we can use full 8-bit. */
3037 CODING_SPEC_ISO_INVOCATION (coding, 1)
3038 = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
3039 /* Not single shifting at first. */
3040 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;
3041 /* Beginning of buffer should also be regarded as bol. */
3042 CODING_SPEC_ISO_BOL (coding) = 1;
3043
3044 for (charset = 0; charset <= MAX_CHARSET; charset++)
3045 CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = 255;
3046 val = Vcharset_revision_alist;
3047 while (CONSP (val))
3048 {
3049 charset = get_charset_id (Fcar_safe (XCAR (val)));
3050 if (charset >= 0
3051 && (temp = Fcdr_safe (XCAR (val)), INTEGERP (temp))
3052 && (i = XINT (temp), (i >= 0 && (i + '@') < 128)))
3053 CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = i;
3054 val = XCDR (val);
3055 }
3056
3057 /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
3058 FLAGS[REG] can be one of below:
3059 integer CHARSET: CHARSET occupies register I,
3060 t: designate nothing to REG initially, but can be used
3061 by any charsets,
3062 list of integer, nil, or t: designate the first
3063 element (if integer) to REG initially, the remaining
3064 elements (if integer) is designated to REG on request,
3065 if an element is t, REG can be used by any charsets,
3066 nil: REG is never used. */
3067 for (charset = 0; charset <= MAX_CHARSET; charset++)
3068 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3069 = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION;
3070 for (i = 0; i < 4; i++)
3071 {
3072 if (INTEGERP (flags[i])
3073 && (charset = XINT (flags[i]), CHARSET_VALID_P (charset))
3074 || (charset = get_charset_id (flags[i])) >= 0)
3075 {
3076 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3077 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
3078 }
3079 else if (EQ (flags[i], Qt))
3080 {
3081 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3082 reg_bits |= 1 << i;
3083 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
3084 }
3085 else if (CONSP (flags[i]))
3086 {
3087 Lisp_Object tail;
3088 tail = flags[i];
3089
3090 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
3091 if (INTEGERP (XCAR (tail))
3092 && (charset = XINT (XCAR (tail)),
3093 CHARSET_VALID_P (charset))
3094 || (charset = get_charset_id (XCAR (tail))) >= 0)
3095 {
3096 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3097 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
3098 }
3099 else
3100 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3101 tail = XCDR (tail);
3102 while (CONSP (tail))
3103 {
3104 if (INTEGERP (XCAR (tail))
3105 && (charset = XINT (XCAR (tail)),
3106 CHARSET_VALID_P (charset))
3107 || (charset = get_charset_id (XCAR (tail))) >= 0)
3108 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3109 = i;
3110 else if (EQ (XCAR (tail), Qt))
3111 reg_bits |= 1 << i;
3112 tail = XCDR (tail);
3113 }
3114 }
3115 else
3116 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3117
3118 CODING_SPEC_ISO_DESIGNATION (coding, i)
3119 = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
3120 }
3121
3122 if (reg_bits && ! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
3123 {
3124 /* REG 1 can be used only by locking shift in 7-bit env. */
3125 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
3126 reg_bits &= ~2;
3127 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
3128 /* Without any shifting, only REG 0 and 1 can be used. */
3129 reg_bits &= 3;
3130 }
3131
3132 if (reg_bits)
3133 for (charset = 0; charset <= MAX_CHARSET; charset++)
3134 {
3135 if (CHARSET_VALID_P (charset))
3136 {
3137 /* There exist some default graphic registers to be
3138 used CHARSET. */
3139
3140 /* We had better avoid designating a charset of
3141 CHARS96 to REG 0 as far as possible. */
3142 if (CHARSET_CHARS (charset) == 96)
3143 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3144 = (reg_bits & 2
3145 ? 1 : (reg_bits & 4 ? 2 : (reg_bits & 8 ? 3 : 0)));
3146 else
3147 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3148 = (reg_bits & 1
3149 ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
3150 }
3151 }
3152 }
3153 coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
3154 coding->spec.iso2022.last_invalid_designation_register = -1;
3155 break;
3156
3157 case 3:
3158 coding->type = coding_type_big5;
3159 coding->common_flags
3160 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3161 coding->flags
3162 = (NILP (XVECTOR (coding_spec)->contents[4])
3163 ? CODING_FLAG_BIG5_HKU
3164 : CODING_FLAG_BIG5_ETEN);
3165 break;
3166
3167 case 4:
3168 coding->type = coding_type_ccl;
3169 coding->common_flags
3170 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3171 {
3172 val = XVECTOR (coding_spec)->contents[4];
3173 if (! CONSP (val)
3174 || setup_ccl_program (&(coding->spec.ccl.decoder),
3175 XCAR (val)) < 0
3176 || setup_ccl_program (&(coding->spec.ccl.encoder),
3177 XCDR (val)) < 0)
3178 goto label_invalid_coding_system;
3179
3180 bzero (coding->spec.ccl.valid_codes, 256);
3181 val = Fplist_get (plist, Qvalid_codes);
3182 if (CONSP (val))
3183 {
3184 Lisp_Object this;
3185
3186 for (; CONSP (val); val = XCDR (val))
3187 {
3188 this = XCAR (val);
3189 if (INTEGERP (this)
3190 && XINT (this) >= 0 && XINT (this) < 256)
3191 coding->spec.ccl.valid_codes[XINT (this)] = 1;
3192 else if (CONSP (this)
3193 && INTEGERP (XCAR (this))
3194 && INTEGERP (XCDR (this)))
3195 {
3196 int start = XINT (XCAR (this));
3197 int end = XINT (XCDR (this));
3198
3199 if (start >= 0 && start <= end && end < 256)
3200 while (start <= end)
3201 coding->spec.ccl.valid_codes[start++] = 1;
3202 }
3203 }
3204 }
3205 }
3206 coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
3207 break;
3208
3209 case 5:
3210 coding->type = coding_type_raw_text;
3211 break;
3212
3213 default:
3214 goto label_invalid_coding_system;
3215 }
3216 return 0;
3217
3218 label_invalid_coding_system:
3219 coding->type = coding_type_no_conversion;
3220 coding->category_idx = CODING_CATEGORY_IDX_BINARY;
3221 coding->common_flags = 0;
3222 coding->eol_type = CODING_EOL_LF;
3223 coding->pre_write_conversion = coding->post_read_conversion = Qnil;
3224 return -1;
3225 }
3226
3227 /* Free memory blocks allocated for storing composition information. */
3228
3229 void
3230 coding_free_composition_data (coding)
3231 struct coding_system *coding;
3232 {
3233 struct composition_data *cmp_data = coding->cmp_data, *next;
3234
3235 if (!cmp_data)
3236 return;
3237 /* Memory blocks are chained. At first, rewind to the first, then,
3238 free blocks one by one. */
3239 while (cmp_data->prev)
3240 cmp_data = cmp_data->prev;
3241 while (cmp_data)
3242 {
3243 next = cmp_data->next;
3244 xfree (cmp_data);
3245 cmp_data = next;
3246 }
3247 coding->cmp_data = NULL;
3248 }
3249
3250 /* Set `char_offset' member of all memory blocks pointed by
3251 coding->cmp_data to POS. */
3252
3253 void
3254 coding_adjust_composition_offset (coding, pos)
3255 struct coding_system *coding;
3256 int pos;
3257 {
3258 struct composition_data *cmp_data;
3259
3260 for (cmp_data = coding->cmp_data; cmp_data; cmp_data = cmp_data->next)
3261 cmp_data->char_offset = pos;
3262 }
3263
3264 /* Setup raw-text or one of its subsidiaries in the structure
3265 coding_system CODING according to the already setup value eol_type
3266 in CODING. CODING should be setup for some coding system in
3267 advance. */
3268
3269 void
3270 setup_raw_text_coding_system (coding)
3271 struct coding_system *coding;
3272 {
3273 if (coding->type != coding_type_raw_text)
3274 {
3275 coding->symbol = Qraw_text;
3276 coding->type = coding_type_raw_text;
3277 if (coding->eol_type != CODING_EOL_UNDECIDED)
3278 {
3279 Lisp_Object subsidiaries;
3280 subsidiaries = Fget (Qraw_text, Qeol_type);
3281
3282 if (VECTORP (subsidiaries)
3283 && XVECTOR (subsidiaries)->size == 3)
3284 coding->symbol
3285 = XVECTOR (subsidiaries)->contents[coding->eol_type];
3286 }
3287 setup_coding_system (coding->symbol, coding);
3288 }
3289 return;
3290 }
3291
3292 /* Emacs has a mechanism to automatically detect a coding system if it
3293 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
3294 it's impossible to distinguish some coding systems accurately
3295 because they use the same range of codes. So, at first, coding
3296 systems are grouped into the following categories:
3297
3298 o coding-category-emacs-mule
3299
3300 The category for a coding system which has the same code range
3301 as Emacs' internal format. Assigned the coding-system (Lisp
3302 symbol) `emacs-mule' by default.
3303
3304 o coding-category-sjis
3305
3306 The category for a coding system which has the same code range
3307 as SJIS. Assigned the coding-system (Lisp
3308 symbol) `japanese-shift-jis' by default.
3309
3310 o coding-category-iso-7
3311
3312 The category for a coding system which has the same code range
3313 as ISO2022 of 7-bit environment. This doesn't use any locking
3314 shift and single shift functions. This can encode/decode all
3315 charsets. Assigned the coding-system (Lisp symbol)
3316 `iso-2022-7bit' by default.
3317
3318 o coding-category-iso-7-tight
3319
3320 Same as coding-category-iso-7 except that this can
3321 encode/decode only the specified charsets.
3322
3323 o coding-category-iso-8-1
3324
3325 The category for a coding system which has the same code range
3326 as ISO2022 of 8-bit environment and graphic plane 1 used only
3327 for DIMENSION1 charset. This doesn't use any locking shift
3328 and single shift functions. Assigned the coding-system (Lisp
3329 symbol) `iso-latin-1' by default.
3330
3331 o coding-category-iso-8-2
3332
3333 The category for a coding system which has the same code range
3334 as ISO2022 of 8-bit environment and graphic plane 1 used only
3335 for DIMENSION2 charset. This doesn't use any locking shift
3336 and single shift functions. Assigned the coding-system (Lisp
3337 symbol) `japanese-iso-8bit' by default.
3338
3339 o coding-category-iso-7-else
3340
3341 The category for a coding system which has the same code range
3342 as ISO2022 of 7-bit environment but uses locking shift or
3343 single shift functions. Assigned the coding-system (Lisp
3344 symbol) `iso-2022-7bit-lock' by default.
3345
3346 o coding-category-iso-8-else
3347
3348 The category for a coding system which has the same code range
3349 as ISO2022 of 8-bit environment but uses locking shift or
3350 single shift functions. Assigned the coding-system (Lisp
3351 symbol) `iso-2022-8bit-ss2' by default.
3352
3353 o coding-category-big5
3354
3355 The category for a coding system which has the same code range
3356 as BIG5. Assigned the coding-system (Lisp symbol)
3357 `cn-big5' by default.
3358
3359 o coding-category-utf-8
3360
3361 The category for a coding system which has the same code range
3362 as UTF-8 (cf. RFC2279). Assigned the coding-system (Lisp
3363 symbol) `utf-8' by default.
3364
3365 o coding-category-utf-16-be
3366
3367 The category for a coding system in which a text has an
3368 Unicode signature (cf. Unicode Standard) in the order of BIG
3369 endian at the head. Assigned the coding-system (Lisp symbol)
3370 `utf-16-be' by default.
3371
3372 o coding-category-utf-16-le
3373
3374 The category for a coding system in which a text has an
3375 Unicode signature (cf. Unicode Standard) in the order of
3376 LITTLE endian at the head. Assigned the coding-system (Lisp
3377 symbol) `utf-16-le' by default.
3378
3379 o coding-category-ccl
3380
3381 The category for a coding system of which encoder/decoder is
3382 written in CCL programs. The default value is nil, i.e., no
3383 coding system is assigned.
3384
3385 o coding-category-binary
3386
3387 The category for a coding system not categorized in any of the
3388 above. Assigned the coding-system (Lisp symbol)
3389 `no-conversion' by default.
3390
3391 Each of them is a Lisp symbol whose value is an actual
3392 `coding-system' (which is also a Lisp symbol) assigned by a user.
3393 What Emacs does actually is to detect a category of coding system.
3394 Then, it uses a `coding-system' assigned to it. If Emacs can't
3395 decide only one possible category, it selects a category of the
3396 highest priority. Priorities of categories are also specified by a
3397 user in a Lisp variable `coding-category-list'.
3398
3399 */
3400
3401 static
3402 int ascii_skip_code[256];
3403
3404 /* Detect how a text of length SRC_BYTES pointed by SOURCE is encoded.
3405 If it detects possible coding systems, return an integer in which
3406 appropriate flag bits are set. Flag bits are defined by macros
3407 CODING_CATEGORY_MASK_XXX in `coding.h'. If PRIORITIES is non-NULL,
3408 it should point the table `coding_priorities'. In that case, only
3409 the flag bit for a coding system of the highest priority is set in
3410 the returned value.
3411
3412 How many ASCII characters are at the head is returned as *SKIP. */
3413
3414 static int
3415 detect_coding_mask (source, src_bytes, priorities, skip)
3416 unsigned char *source;
3417 int src_bytes, *priorities, *skip;
3418 {
3419 register unsigned char c;
3420 unsigned char *src = source, *src_end = source + src_bytes;
3421 unsigned int mask, utf16_examined_p, iso2022_examined_p;
3422 int i, idx;
3423
3424 /* At first, skip all ASCII characters and control characters except
3425 for three ISO2022 specific control characters. */
3426 ascii_skip_code[ISO_CODE_SO] = 0;
3427 ascii_skip_code[ISO_CODE_SI] = 0;
3428 ascii_skip_code[ISO_CODE_ESC] = 0;
3429
3430 label_loop_detect_coding:
3431 while (src < src_end && ascii_skip_code[*src]) src++;
3432 *skip = src - source;
3433
3434 if (src >= src_end)
3435 /* We found nothing other than ASCII. There's nothing to do. */
3436 return 0;
3437
3438 c = *src;
3439 /* The text seems to be encoded in some multilingual coding system.
3440 Now, try to find in which coding system the text is encoded. */
3441 if (c < 0x80)
3442 {
3443 /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
3444 /* C is an ISO2022 specific control code of C0. */
3445 mask = detect_coding_iso2022 (src, src_end);
3446 if (mask == 0)
3447 {
3448 /* No valid ISO2022 code follows C. Try again. */
3449 src++;
3450 if (c == ISO_CODE_ESC)
3451 ascii_skip_code[ISO_CODE_ESC] = 1;
3452 else
3453 ascii_skip_code[ISO_CODE_SO] = ascii_skip_code[ISO_CODE_SI] = 1;
3454 goto label_loop_detect_coding;
3455 }
3456 if (priorities)
3457 {
3458 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3459 {
3460 if (mask & priorities[i])
3461 return priorities[i];
3462 }
3463 return CODING_CATEGORY_MASK_RAW_TEXT;
3464 }
3465 }
3466 else
3467 {
3468 int try;
3469
3470 if (c < 0xA0)
3471 {
3472 /* C is the first byte of SJIS character code,
3473 or a leading-code of Emacs' internal format (emacs-mule),
3474 or the first byte of UTF-16. */
3475 try = (CODING_CATEGORY_MASK_SJIS
3476 | CODING_CATEGORY_MASK_EMACS_MULE
3477 | CODING_CATEGORY_MASK_UTF_16_BE
3478 | CODING_CATEGORY_MASK_UTF_16_LE);
3479
3480 /* Or, if C is a special latin extra code,
3481 or is an ISO2022 specific control code of C1 (SS2 or SS3),
3482 or is an ISO2022 control-sequence-introducer (CSI),
3483 we should also consider the possibility of ISO2022 codings. */
3484 if ((VECTORP (Vlatin_extra_code_table)
3485 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
3486 || (c == ISO_CODE_SS2 || c == ISO_CODE_SS3)
3487 || (c == ISO_CODE_CSI
3488 && (src < src_end
3489 && (*src == ']'
3490 || ((*src == '0' || *src == '1' || *src == '2')
3491 && src + 1 < src_end
3492 && src[1] == ']')))))
3493 try |= (CODING_CATEGORY_MASK_ISO_8_ELSE
3494 | CODING_CATEGORY_MASK_ISO_8BIT);
3495 }
3496 else
3497 /* C is a character of ISO2022 in graphic plane right,
3498 or a SJIS's 1-byte character code (i.e. JISX0201),
3499 or the first byte of BIG5's 2-byte code,
3500 or the first byte of UTF-8/16. */
3501 try = (CODING_CATEGORY_MASK_ISO_8_ELSE
3502 | CODING_CATEGORY_MASK_ISO_8BIT
3503 | CODING_CATEGORY_MASK_SJIS
3504 | CODING_CATEGORY_MASK_BIG5
3505 | CODING_CATEGORY_MASK_UTF_8
3506 | CODING_CATEGORY_MASK_UTF_16_BE
3507 | CODING_CATEGORY_MASK_UTF_16_LE);
3508
3509 /* Or, we may have to consider the possibility of CCL. */
3510 if (coding_system_table[CODING_CATEGORY_IDX_CCL]
3511 && (coding_system_table[CODING_CATEGORY_IDX_CCL]
3512 ->spec.ccl.valid_codes)[c])
3513 try |= CODING_CATEGORY_MASK_CCL;
3514
3515 mask = 0;
3516 utf16_examined_p = iso2022_examined_p = 0;
3517 if (priorities)
3518 {
3519 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3520 {
3521 if (!iso2022_examined_p
3522 && (priorities[i] & try & CODING_CATEGORY_MASK_ISO))
3523 {
3524 mask |= detect_coding_iso2022 (src, src_end);
3525 iso2022_examined_p = 1;
3526 }
3527 else if (priorities[i] & try & CODING_CATEGORY_MASK_SJIS)
3528 mask |= detect_coding_sjis (src, src_end);
3529 else if (priorities[i] & try & CODING_CATEGORY_MASK_UTF_8)
3530 mask |= detect_coding_utf_8 (src, src_end);
3531 else if (!utf16_examined_p
3532 && (priorities[i] & try &
3533 CODING_CATEGORY_MASK_UTF_16_BE_LE))
3534 {
3535 mask |= detect_coding_utf_16 (src, src_end);
3536 utf16_examined_p = 1;
3537 }
3538 else if (priorities[i] & try & CODING_CATEGORY_MASK_BIG5)
3539 mask |= detect_coding_big5 (src, src_end);
3540 else if (priorities[i] & try & CODING_CATEGORY_MASK_EMACS_MULE)
3541 mask |= detect_coding_emacs_mule (src, src_end);
3542 else if (priorities[i] & try & CODING_CATEGORY_MASK_CCL)
3543 mask |= detect_coding_ccl (src, src_end);
3544 else if (priorities[i] & CODING_CATEGORY_MASK_RAW_TEXT)
3545 mask |= CODING_CATEGORY_MASK_RAW_TEXT;
3546 else if (priorities[i] & CODING_CATEGORY_MASK_BINARY)
3547 mask |= CODING_CATEGORY_MASK_BINARY;
3548 if (mask & priorities[i])
3549 return priorities[i];
3550 }
3551 return CODING_CATEGORY_MASK_RAW_TEXT;
3552 }
3553 if (try & CODING_CATEGORY_MASK_ISO)
3554 mask |= detect_coding_iso2022 (src, src_end);
3555 if (try & CODING_CATEGORY_MASK_SJIS)
3556 mask |= detect_coding_sjis (src, src_end);
3557 if (try & CODING_CATEGORY_MASK_BIG5)
3558 mask |= detect_coding_big5 (src, src_end);
3559 if (try & CODING_CATEGORY_MASK_UTF_8)
3560 mask |= detect_coding_utf_8 (src, src_end);
3561 if (try & CODING_CATEGORY_MASK_UTF_16_BE_LE)
3562 mask |= detect_coding_utf_16 (src, src_end);
3563 if (try & CODING_CATEGORY_MASK_EMACS_MULE)
3564 mask |= detect_coding_emacs_mule (src, src_end);
3565 if (try & CODING_CATEGORY_MASK_CCL)
3566 mask |= detect_coding_ccl (src, src_end);
3567 }
3568 return (mask | CODING_CATEGORY_MASK_RAW_TEXT | CODING_CATEGORY_MASK_BINARY);
3569 }
3570
3571 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
3572 The information of the detected coding system is set in CODING. */
3573
3574 void
3575 detect_coding (coding, src, src_bytes)
3576 struct coding_system *coding;
3577 unsigned char *src;
3578 int src_bytes;
3579 {
3580 unsigned int idx;
3581 int skip, mask, i;
3582 Lisp_Object val;
3583
3584 val = Vcoding_category_list;
3585 mask = detect_coding_mask (src, src_bytes, coding_priorities, &skip);
3586 coding->heading_ascii = skip;
3587
3588 if (!mask) return;
3589
3590 /* We found a single coding system of the highest priority in MASK. */
3591 idx = 0;
3592 while (mask && ! (mask & 1)) mask >>= 1, idx++;
3593 if (! mask)
3594 idx = CODING_CATEGORY_IDX_RAW_TEXT;
3595
3596 val = XSYMBOL (XVECTOR (Vcoding_category_table)->contents[idx])->value;
3597
3598 if (coding->eol_type != CODING_EOL_UNDECIDED)
3599 {
3600 Lisp_Object tmp;
3601
3602 tmp = Fget (val, Qeol_type);
3603 if (VECTORP (tmp))
3604 val = XVECTOR (tmp)->contents[coding->eol_type];
3605 }
3606
3607 /* Setup this new coding system while preserving some slots. */
3608 {
3609 int src_multibyte = coding->src_multibyte;
3610 int dst_multibyte = coding->dst_multibyte;
3611
3612 setup_coding_system (val, coding);
3613 coding->src_multibyte = src_multibyte;
3614 coding->dst_multibyte = dst_multibyte;
3615 coding->heading_ascii = skip;
3616 }
3617 }
3618
3619 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
3620 SOURCE is encoded. Return one of CODING_EOL_LF, CODING_EOL_CRLF,
3621 CODING_EOL_CR, and CODING_EOL_UNDECIDED.
3622
3623 How many non-eol characters are at the head is returned as *SKIP. */
3624
3625 #define MAX_EOL_CHECK_COUNT 3
3626
3627 static int
3628 detect_eol_type (source, src_bytes, skip)
3629 unsigned char *source;
3630 int src_bytes, *skip;
3631 {
3632 unsigned char *src = source, *src_end = src + src_bytes;
3633 unsigned char c;
3634 int total = 0; /* How many end-of-lines are found so far. */
3635 int eol_type = CODING_EOL_UNDECIDED;
3636 int this_eol_type;
3637
3638 *skip = 0;
3639
3640 while (src < src_end && total < MAX_EOL_CHECK_COUNT)
3641 {
3642 c = *src++;
3643 if (c == '\n' || c == '\r')
3644 {
3645 if (*skip == 0)
3646 *skip = src - 1 - source;
3647 total++;
3648 if (c == '\n')
3649 this_eol_type = CODING_EOL_LF;
3650 else if (src >= src_end || *src != '\n')
3651 this_eol_type = CODING_EOL_CR;
3652 else
3653 this_eol_type = CODING_EOL_CRLF, src++;
3654
3655 if (eol_type == CODING_EOL_UNDECIDED)
3656 /* This is the first end-of-line. */
3657 eol_type = this_eol_type;
3658 else if (eol_type != this_eol_type)
3659 {
3660 /* The found type is different from what found before. */
3661 eol_type = CODING_EOL_INCONSISTENT;
3662 break;
3663 }
3664 }
3665 }
3666
3667 if (*skip == 0)
3668 *skip = src_end - source;
3669 return eol_type;
3670 }
3671
3672 /* Like detect_eol_type, but detect EOL type in 2-octet
3673 big-endian/little-endian format for coding systems utf-16-be and
3674 utf-16-le. */
3675
3676 static int
3677 detect_eol_type_in_2_octet_form (source, src_bytes, skip, big_endian_p)
3678 unsigned char *source;
3679 int src_bytes, *skip;
3680 {
3681 unsigned char *src = source, *src_end = src + src_bytes;
3682 unsigned int c1, c2;
3683 int total = 0; /* How many end-of-lines are found so far. */
3684 int eol_type = CODING_EOL_UNDECIDED;
3685 int this_eol_type;
3686 int msb, lsb;
3687
3688 if (big_endian_p)
3689 msb = 0, lsb = 1;
3690 else
3691 msb = 1, lsb = 0;
3692
3693 *skip = 0;
3694
3695 while ((src + 1) < src_end && total < MAX_EOL_CHECK_COUNT)
3696 {
3697 c1 = (src[msb] << 8) | (src[lsb]);
3698 src += 2;
3699
3700 if (c1 == '\n' || c1 == '\r')
3701 {
3702 if (*skip == 0)
3703 *skip = src - 2 - source;
3704 total++;
3705 if (c1 == '\n')
3706 {
3707 this_eol_type = CODING_EOL_LF;
3708 }
3709 else
3710 {
3711 if ((src + 1) >= src_end)
3712 {
3713 this_eol_type = CODING_EOL_CR;
3714 }
3715 else
3716 {
3717 c2 = (src[msb] << 8) | (src[lsb]);
3718 if (c2 == '\n')
3719 this_eol_type = CODING_EOL_CRLF, src += 2;
3720 else
3721 this_eol_type = CODING_EOL_CR;
3722 }
3723 }
3724
3725 if (eol_type == CODING_EOL_UNDECIDED)
3726 /* This is the first end-of-line. */
3727 eol_type = this_eol_type;
3728 else if (eol_type != this_eol_type)
3729 {
3730 /* The found type is different from what found before. */
3731 eol_type = CODING_EOL_INCONSISTENT;
3732 break;
3733 }
3734 }
3735 }
3736
3737 if (*skip == 0)
3738 *skip = src_end - source;
3739 return eol_type;
3740 }
3741
3742 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
3743 is encoded. If it detects an appropriate format of end-of-line, it
3744 sets the information in *CODING. */
3745
3746 void
3747 detect_eol (coding, src, src_bytes)
3748 struct coding_system *coding;
3749 unsigned char *src;
3750 int src_bytes;
3751 {
3752 Lisp_Object val;
3753 int skip;
3754 int eol_type;
3755
3756 switch (coding->category_idx)
3757 {
3758 case CODING_CATEGORY_IDX_UTF_16_BE:
3759 eol_type = detect_eol_type_in_2_octet_form (src, src_bytes, &skip, 1);
3760 break;
3761 case CODING_CATEGORY_IDX_UTF_16_LE:
3762 eol_type = detect_eol_type_in_2_octet_form (src, src_bytes, &skip, 0);
3763 break;
3764 default:
3765 eol_type = detect_eol_type (src, src_bytes, &skip);
3766 break;
3767 }
3768
3769 if (coding->heading_ascii > skip)
3770 coding->heading_ascii = skip;
3771 else
3772 skip = coding->heading_ascii;
3773
3774 if (eol_type == CODING_EOL_UNDECIDED)
3775 return;
3776 if (eol_type == CODING_EOL_INCONSISTENT)
3777 {
3778 #if 0
3779 /* This code is suppressed until we find a better way to
3780 distinguish raw text file and binary file. */
3781
3782 /* If we have already detected that the coding is raw-text, the
3783 coding should actually be no-conversion. */
3784 if (coding->type == coding_type_raw_text)
3785 {
3786 setup_coding_system (Qno_conversion, coding);
3787 return;
3788 }
3789 /* Else, let's decode only text code anyway. */
3790 #endif /* 0 */
3791 eol_type = CODING_EOL_LF;
3792 }
3793
3794 val = Fget (coding->symbol, Qeol_type);
3795 if (VECTORP (val) && XVECTOR (val)->size == 3)
3796 {
3797 int src_multibyte = coding->src_multibyte;
3798 int dst_multibyte = coding->dst_multibyte;
3799
3800 setup_coding_system (XVECTOR (val)->contents[eol_type], coding);
3801 coding->src_multibyte = src_multibyte;
3802 coding->dst_multibyte = dst_multibyte;
3803 coding->heading_ascii = skip;
3804 }
3805 }
3806
/* Number of bytes added on top of the computed size of a conversion
   buffer.  */
#define CONVERSION_BUFFER_EXTRA_ROOM 256

/* Factor by which the size of a text is multiplied to get the size of
   a buffer for decoding it with CODING (a pointer to struct
   coding_system): 3 for ISO2022, the buf_magnification of the decoder
   program for CCL, and 2 for everything else.  CODING is evaluated
   more than once, so it must not have side effects.  */
#define DECODING_BUFFER_MAG(coding)                             \
  ((coding)->type == coding_type_iso2022                        \
   ? 3                                                          \
   : ((coding)->type == coding_type_ccl                         \
      ? (coding)->spec.ccl.decoder.buf_magnification            \
      : 2))
3815
3816 /* Return maximum size (bytes) of a buffer enough for decoding
3817 SRC_BYTES of text encoded in CODING. */
3818
3819 int
3820 decoding_buffer_size (coding, src_bytes)
3821 struct coding_system *coding;
3822 int src_bytes;
3823 {
3824 return (src_bytes * DECODING_BUFFER_MAG (coding)
3825 + CONVERSION_BUFFER_EXTRA_ROOM);
3826 }
3827
3828 /* Return maximum size (bytes) of a buffer enough for encoding
3829 SRC_BYTES of text to CODING. */
3830
3831 int
3832 encoding_buffer_size (coding, src_bytes)
3833 struct coding_system *coding;
3834 int src_bytes;
3835 {
3836 int magnification;
3837
3838 if (coding->type == coding_type_ccl)
3839 magnification = coding->spec.ccl.encoder.buf_magnification;
3840 else if (CODING_REQUIRE_ENCODING (coding))
3841 magnification = 3;
3842 else
3843 magnification = 1;
3844
3845 return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
3846 }
3847
#ifndef MINIMUM_CONVERSION_BUFFER_SIZE
#define MINIMUM_CONVERSION_BUFFER_SIZE 1024
#endif

/* The shared conversion buffer and its current size in bytes.  */
char *conversion_buffer;
int conversion_buffer_size;

/* Return a pointer to a buffer of at least SIZE bytes to be used for
   encoding or decoding.  The shared buffer `conversion_buffer' is
   returned as is when it is already large enough.  Otherwise it is
   replaced by a new one whose size is the current size doubled until
   it reaches SIZE; the contents of the old buffer are not copied.

   NOTE(review): the result of xmalloc is not checked here, so this
   function relies on xmalloc never returning NULL -- confirm against
   its definition.  */

char *
get_conversion_buffer (size)
     int size;
{
  if (size > conversion_buffer_size)
    {
      char *buf;
      int real_size = conversion_buffer_size * 2;

      /* Doubling a size of zero never makes progress, so start from
         the minimum size when no buffer has been set up yet.  */
      if (real_size <= 0)
        real_size = MINIMUM_CONVERSION_BUFFER_SIZE;

      while (real_size < size) real_size *= 2;
      buf = (char *) xmalloc (real_size);
      xfree (conversion_buffer);
      conversion_buffer = buf;
      conversion_buffer_size = real_size;
    }
  return conversion_buffer;
}
3876
3877 int
3878 ccl_coding_driver (coding, source, destination, src_bytes, dst_bytes, encodep)
3879 struct coding_system *coding;
3880 unsigned char *source, *destination;
3881 int src_bytes, dst_bytes, encodep;
3882 {
3883 struct ccl_program *ccl
3884 = encodep ? &coding->spec.ccl.encoder : &coding->spec.ccl.decoder;
3885 int result;
3886
3887 ccl->last_block = coding->mode & CODING_MODE_LAST_BLOCK;
3888
3889 coding->produced = ccl_driver (ccl, source, destination,
3890 src_bytes, dst_bytes, &(coding->consumed));
3891 if (encodep)
3892 coding->produced_char = coding->produced;
3893 else
3894 {
3895 int bytes
3896 = dst_bytes ? dst_bytes : source + coding->consumed - destination;
3897 coding->produced = str_as_multibyte (destination, bytes,
3898 coding->produced,
3899 &(coding->produced_char));
3900 }
3901
3902 switch (ccl->status)
3903 {
3904 case CCL_STAT_SUSPEND_BY_SRC:
3905 result = CODING_FINISH_INSUFFICIENT_SRC;
3906 break;
3907 case CCL_STAT_SUSPEND_BY_DST:
3908 result = CODING_FINISH_INSUFFICIENT_DST;
3909 break;
3910 case CCL_STAT_QUIT:
3911 case CCL_STAT_INVALID_CMD:
3912 result = CODING_FINISH_INTERRUPT;
3913 break;
3914 default:
3915 result = CODING_FINISH_NORMAL;
3916 break;
3917 }
3918 return result;
3919 }
3920
3921 /* See "GENERAL NOTES about `decode_coding_XXX ()' functions". Before
3922 decoding, it may detect coding system and format of end-of-line if
3923 those are not yet decided. The source should be unibyte, the
3924 result is multibyte if CODING->dst_multibyte is nonzero, else
3925 unibyte. */
3926
3927 int
3928 decode_coding (coding, source, destination, src_bytes, dst_bytes)
3929 struct coding_system *coding;
3930 unsigned char *source, *destination;
3931 int src_bytes, dst_bytes;
3932 {
3933 if (coding->type == coding_type_undecided)
3934 detect_coding (coding, source, src_bytes);
3935
3936 if (coding->eol_type == CODING_EOL_UNDECIDED)
3937 detect_eol (coding, source, src_bytes);
3938
3939 coding->produced = coding->produced_char = 0;
3940 coding->consumed = coding->consumed_char = 0;
3941 coding->errors = 0;
3942 coding->result = CODING_FINISH_NORMAL;
3943
3944 switch (coding->type)
3945 {
3946 case coding_type_sjis:
3947 decode_coding_sjis_big5 (coding, source, destination,
3948 src_bytes, dst_bytes, 1);
3949 break;
3950
3951 case coding_type_iso2022:
3952 decode_coding_iso2022 (coding, source, destination,
3953 src_bytes, dst_bytes);
3954 break;
3955
3956 case coding_type_big5:
3957 decode_coding_sjis_big5 (coding, source, destination,
3958 src_bytes, dst_bytes, 0);
3959 break;
3960
3961 case coding_type_emacs_mule:
3962 decode_coding_emacs_mule (coding, source, destination,
3963 src_bytes, dst_bytes);
3964 break;
3965
3966 case coding_type_ccl:
3967 ccl_coding_driver (coding, source, destination,
3968 src_bytes, dst_bytes, 0);
3969 break;
3970
3971 default:
3972 decode_eol (coding, source, destination, src_bytes, dst_bytes);
3973 }
3974
3975 if (coding->result == CODING_FINISH_INSUFFICIENT_SRC
3976 && coding->consumed == src_bytes)
3977 coding->result = CODING_FINISH_NORMAL;
3978
3979 if (coding->mode & CODING_MODE_LAST_BLOCK
3980 && coding->result == CODING_FINISH_INSUFFICIENT_SRC)
3981 {
3982 unsigned char *src = source + coding->consumed;
3983 unsigned char *dst = destination + coding->produced;
3984
3985 src_bytes -= coding->consumed;
3986 coding->errors++;
3987 if (COMPOSING_P (coding))
3988 DECODE_COMPOSITION_END ('1');
3989 while (src_bytes--)
3990 {
3991 int c = *src++;
3992 dst += CHAR_STRING (c, dst);
3993 coding->produced_char++;
3994 }
3995 coding->consumed = coding->consumed_char = src - source;
3996 coding->produced = dst - destination;
3997 }
3998
3999 if (!coding->dst_multibyte)
4000 {
4001 coding->produced = str_as_unibyte (destination, coding->produced);
4002 coding->produced_char = coding->produced;
4003 }
4004
4005 return coding->result;
4006 }
4007
4008 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions". The
4009 multibyteness of the source is CODING->src_multibyte, the
4010 multibyteness of the result is always unibyte. */
4011
4012 int
4013 encode_coding (coding, source, destination, src_bytes, dst_bytes)
4014 struct coding_system *coding;
4015 unsigned char *source, *destination;
4016 int src_bytes, dst_bytes;
4017 {
4018 coding->produced = coding->produced_char = 0;
4019 coding->consumed = coding->consumed_char = 0;
4020 coding->errors = 0;
4021 coding->result = CODING_FINISH_NORMAL;
4022
4023 switch (coding->type)
4024 {
4025 case coding_type_sjis:
4026 encode_coding_sjis_big5 (coding, source, destination,
4027 src_bytes, dst_bytes, 1);
4028 break;
4029
4030 case coding_type_iso2022:
4031 encode_coding_iso2022 (coding, source, destination,
4032 src_bytes, dst_bytes);
4033 break;
4034
4035 case coding_type_big5:
4036 encode_coding_sjis_big5 (coding, source, destination,
4037 src_bytes, dst_bytes, 0);
4038 break;
4039
4040 case coding_type_emacs_mule:
4041 encode_coding_emacs_mule (coding, source, destination,
4042 src_bytes, dst_bytes);
4043 break;
4044
4045 case coding_type_ccl:
4046 ccl_coding_driver (coding, source, destination,
4047 src_bytes, dst_bytes, 1);
4048 break;
4049
4050 default:
4051 encode_eol (coding, source, destination, src_bytes, dst_bytes);
4052 }
4053
4054 if (coding->result == CODING_FINISH_INSUFFICIENT_SRC
4055 && coding->consumed == src_bytes)
4056 coding->result = CODING_FINISH_NORMAL;
4057
4058 if (coding->mode & CODING_MODE_LAST_BLOCK)
4059 {
4060 unsigned char *src = source + coding->consumed;
4061 unsigned char *src_end = src + src_bytes;
4062 unsigned char *dst = destination + coding->produced;
4063
4064 if (coding->type == coding_type_iso2022)
4065 ENCODE_RESET_PLANE_AND_REGISTER;
4066 if (COMPOSING_P (coding))
4067 *dst++ = ISO_CODE_ESC, *dst++ = '1';
4068 if (coding->consumed < src_bytes)
4069 {
4070 int len = src_bytes - coding->consumed;
4071
4072 BCOPY_SHORT (source + coding->consumed, dst, len);
4073 if (coding->src_multibyte)
4074 len = str_as_unibyte (dst, len);
4075 dst += len;
4076 coding->consumed = src_bytes;
4077 }
4078 coding->produced = coding->produced_char = dst - destination;
4079 }
4080
4081 return coding->result;
4082 }
4083
4084 /* Scan text in the region between *BEG and *END (byte positions),
4085 skip characters which we don't have to decode by coding system
4086 CODING at the head and tail, then set *BEG and *END to the region
4087 of the text we actually have to convert. The caller should move
4088 the gap out of the region in advance if the region is from a
4089 buffer.
4090
4091 If STR is not NULL, *BEG and *END are indices into STR. */
4092
4093 static void
4094 shrink_decoding_region (beg, end, coding, str)
4095 int *beg, *end;
4096 struct coding_system *coding;
4097 unsigned char *str;
4098 {
4099 unsigned char *begp_orig, *begp, *endp_orig, *endp, c;
4100 int eol_conversion;
4101 Lisp_Object translation_table;
4102
4103 if (coding->type == coding_type_ccl
4104 || coding->type == coding_type_undecided
4105 || coding->eol_type != CODING_EOL_LF
4106 || !NILP (coding->post_read_conversion)
4107 || coding->composing != COMPOSITION_DISABLED)
4108 {
4109 /* We can't skip any data. */
4110 return;
4111 }
4112 if (coding->type == coding_type_no_conversion
4113 || coding->type == coding_type_raw_text
4114 || coding->type == coding_type_emacs_mule)
4115 {
4116 /* We need no conversion, but don't have to skip any data here.
4117 Decoding routine handles them effectively anyway. */
4118 return;
4119 }
4120
4121 translation_table = coding->translation_table_for_decode;
4122 if (NILP (translation_table) && !NILP (Venable_character_translation))
4123 translation_table = Vstandard_translation_table_for_decode;
4124 if (CHAR_TABLE_P (translation_table))
4125 {
4126 int i;
4127 for (i = 0; i < 128; i++)
4128 if (!NILP (CHAR_TABLE_REF (translation_table, i)))
4129 break;
4130 if (i < 128)
4131 /* Some ASCII character should be tranlsated. We give up
4132 shrinking. */
4133 return;
4134 }
4135
4136 if (coding->heading_ascii >= 0)
4137 /* Detection routine has already found how much we can skip at the
4138 head. */
4139 *beg += coding->heading_ascii;
4140
4141 if (str)
4142 {
4143 begp_orig = begp = str + *beg;
4144 endp_orig = endp = str + *end;
4145 }
4146 else
4147 {
4148 begp_orig = begp = BYTE_POS_ADDR (*beg);
4149 endp_orig = endp = begp + *end - *beg;
4150 }
4151
4152 switch (coding->type)
4153 {
4154 case coding_type_sjis:
4155 case coding_type_big5:
4156 /* We can skip all ASCII characters at the head. */
4157 if (coding->heading_ascii < 0)
4158 {
4159 if (eol_conversion)
4160 while (begp < endp && *begp < 0x80 && *begp != '\r') begp++;
4161 else
4162 while (begp < endp && *begp < 0x80) begp++;
4163 }
4164 /* We can skip all ASCII characters at the tail except for the
4165 second byte of SJIS or BIG5 code. */
4166 if (eol_conversion)
4167 while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\r') endp--;
4168 else
4169 while (begp < endp && endp[-1] < 0x80) endp--;
4170 /* Do not consider LF as ascii if preceded by CR, since that
4171 confuses eol decoding. */
4172 if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
4173 endp++;
4174 if (begp < endp && endp < endp_orig && endp[-1] >= 0x80)
4175 endp++;
4176 break;
4177
4178 case coding_type_iso2022:
4179 if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
4180 /* We can't skip any data. */
4181 break;
4182 if (coding->heading_ascii < 0)
4183 {
4184 /* We can skip all ASCII characters at the head except for a
4185 few control codes. */
4186 while (begp < endp && (c = *begp) < 0x80
4187 && c != ISO_CODE_CR && c != ISO_CODE_SO
4188 && c != ISO_CODE_SI && c != ISO_CODE_ESC
4189 && (!eol_conversion || c != ISO_CODE_LF))
4190 begp++;
4191 }
4192 switch (coding->category_idx)
4193 {
4194 case CODING_CATEGORY_IDX_ISO_8_1:
4195 case CODING_CATEGORY_IDX_ISO_8_2:
4196 /* We can skip all ASCII characters at the tail. */
4197 if (eol_conversion)
4198 while (begp < endp && (c = endp[-1]) < 0x80 && c != '\r') endp--;
4199 else
4200 while (begp < endp && endp[-1] < 0x80) endp--;
4201 /* Do not consider LF as ascii if preceded by CR, since that
4202 confuses eol decoding. */
4203 if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
4204 endp++;
4205 break;
4206
4207 case CODING_CATEGORY_IDX_ISO_7:
4208 case CODING_CATEGORY_IDX_ISO_7_TIGHT:
4209 {
4210 /* We can skip all charactes at the tail except for 8-bit
4211 codes and ESC and the following 2-byte at the tail. */
4212 unsigned char *eight_bit = NULL;
4213
4214 if (eol_conversion)
4215 while (begp < endp
4216 && (c = endp[-1]) != ISO_CODE_ESC && c != '\r')
4217 {
4218 if (!eight_bit && c & 0x80) eight_bit = endp;
4219 endp--;
4220 }
4221 else
4222 while (begp < endp
4223 && (c = endp[-1]) != ISO_CODE_ESC)
4224 {
4225 if (!eight_bit && c & 0x80) eight_bit = endp;
4226 endp--;
4227 }
4228 /* Do not consider LF as ascii if preceded by CR, since that
4229 confuses eol decoding. */
4230 if (begp < endp && endp < endp_orig
4231 && endp[-1] == '\r' && endp[0] == '\n')
4232 endp++;
4233 if (begp < endp && endp[-1] == ISO_CODE_ESC)
4234 {
4235 if (endp + 1 < endp_orig && end[0] == '(' && end[1] == 'B')
4236 /* This is an ASCII designation sequence. We can
4237 surely skip the tail. But, if we have
4238 encountered an 8-bit code, skip only the codes
4239 after that. */
4240 endp = eight_bit ? eight_bit : endp + 2;
4241 else
4242 /* Hmmm, we can't skip the tail. */
4243 endp = endp_orig;
4244 }
4245 else if (eight_bit)
4246 endp = eight_bit;
4247 }
4248 }
4249 break;
4250
4251 default:
4252 abort ();
4253 }
4254 *beg += begp - begp_orig;
4255 *end += endp - endp_orig;
4256 return;
4257 }
4258
4259 /* Like shrink_decoding_region but for encoding. */
4260
4261 static void
4262 shrink_encoding_region (beg, end, coding, str)
4263 int *beg, *end;
4264 struct coding_system *coding;
4265 unsigned char *str;
4266 {
4267 unsigned char *begp_orig, *begp, *endp_orig, *endp;
4268 int eol_conversion;
4269 Lisp_Object translation_table;
4270
4271 if (coding->type == coding_type_ccl
4272 || coding->eol_type == CODING_EOL_CRLF
4273 || coding->eol_type == CODING_EOL_CR
4274 || coding->cmp_data && coding->cmp_data->used > 0)
4275 {
4276 /* We can't skip any data. */
4277 return;
4278 }
4279 if (coding->type == coding_type_no_conversion
4280 || coding->type == coding_type_raw_text
4281 || coding->type == coding_type_emacs_mule
4282 || coding->type == coding_type_undecided)
4283 {
4284 /* We need no conversion, but don't have to skip any data here.
4285 Encoding routine handles them effectively anyway. */
4286 return;
4287 }
4288
4289 translation_table = coding->translation_table_for_encode;
4290 if (NILP (translation_table) && !NILP (Venable_character_translation))
4291 translation_table = Vstandard_translation_table_for_encode;
4292 if (CHAR_TABLE_P (translation_table))
4293 {
4294 int i;
4295 for (i = 0; i < 128; i++)
4296 if (!NILP (CHAR_TABLE_REF (translation_table, i)))
4297 break;
4298 if (i < 128)
4299 /* Some ASCII character should be tranlsated. We give up
4300 shrinking. */
4301 return;
4302 }
4303
4304 if (str)
4305 {
4306 begp_orig = begp = str + *beg;
4307 endp_orig = endp = str + *end;
4308 }
4309 else
4310 {
4311 begp_orig = begp = BYTE_POS_ADDR (*beg);
4312 endp_orig = endp = begp + *end - *beg;
4313 }
4314
4315 eol_conversion = (coding->eol_type == CODING_EOL_CR
4316 || coding->eol_type == CODING_EOL_CRLF);
4317
4318 /* Here, we don't have to check coding->pre_write_conversion because
4319 the caller is expected to have handled it already. */
4320 switch (coding->type)
4321 {
4322 case coding_type_iso2022:
4323 if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
4324 /* We can't skip any data. */
4325 break;
4326 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL)
4327 {
4328 unsigned char *bol = begp;
4329 while (begp < endp && *begp < 0x80)
4330 {
4331 begp++;
4332 if (begp[-1] == '\n')
4333 bol = begp;
4334 }
4335 begp = bol;
4336 goto label_skip_tail;
4337 }
4338 /* fall down ... */
4339
4340 case coding_type_sjis:
4341 case coding_type_big5:
4342 /* We can skip all ASCII characters at the head and tail. */
4343 if (eol_conversion)
4344 while (begp < endp && *begp < 0x80 && *begp != '\n') begp++;
4345 else
4346 while (begp < endp && *begp < 0x80) begp++;
4347 label_skip_tail:
4348 if (eol_conversion)
4349 while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\n') endp--;
4350 else
4351 while (begp < endp && *(endp - 1) < 0x80) endp--;
4352 break;
4353
4354 default:
4355 abort ();
4356 }
4357
4358 *beg += begp - begp_orig;
4359 *end += endp - endp_orig;
4360 return;
4361 }
4362
/* Shrinking the conversion region has some overhead of its own, so
   it is attempted only when the region is longer than this many
   bytes.  */
static int shrink_conversion_region_threshhold = 1024;

/* Shrink the region *BEG .. *END via shrink_encoding_region (if
   ENCODEP is nonzero) or shrink_decoding_region (otherwise), but only
   when the region is longer than the threshold above.  */
#define SHRINK_CONVERSION_REGION(beg, end, coding, str, encodep)        \
  do {                                                                  \
    if (*(end) - *(beg) <= shrink_conversion_region_threshhold)         \
      break;                                                            \
    if (encodep)                                                        \
      shrink_encoding_region (beg, end, coding, str);                   \
    else                                                                \
      shrink_decoding_region (beg, end, coding, str);                   \
  } while (0)
4376
4377 static Lisp_Object
4378 code_convert_region_unwind (dummy)
4379 Lisp_Object dummy;
4380 {
4381 inhibit_pre_post_conversion = 0;
4382 return Qnil;
4383 }
4384
4385 /* Store information about all compositions in the range FROM and TO
4386 of OBJ in memory blocks pointed by CODING->cmp_data. OBJ is a
4387 buffer or a string, defaults to the current buffer. */
4388
4389 void
4390 coding_save_composition (coding, from, to, obj)
4391 struct coding_system *coding;
4392 int from, to;
4393 Lisp_Object obj;
4394 {
4395 Lisp_Object prop;
4396 int start, end;
4397
4398 if (coding->composing == COMPOSITION_DISABLED)
4399 return;
4400 if (!coding->cmp_data)
4401 coding_allocate_composition_data (coding, from);
4402 if (!find_composition (from, to, &start, &end, &prop, obj)
4403 || end > to)
4404 return;
4405 if (start < from
4406 && (!find_composition (end, to, &start, &end, &prop, obj)
4407 || end > to))
4408 return;
4409 coding->composing = COMPOSITION_NO;
4410 do
4411 {
4412 if (COMPOSITION_VALID_P (start, end, prop))
4413 {
4414 enum composition_method method = COMPOSITION_METHOD (prop);
4415 if (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH
4416 >= COMPOSITION_DATA_SIZE)
4417 coding_allocate_composition_data (coding, from);
4418 /* For relative composition, we remember start and end
4419 positions, for the other compositions, we also remember
4420 components. */
4421 CODING_ADD_COMPOSITION_START (coding, start - from, method);
4422 if (method != COMPOSITION_RELATIVE)
4423 {
4424 /* We must store a*/
4425 Lisp_Object val, ch;
4426
4427 val = COMPOSITION_COMPONENTS (prop);
4428 if (CONSP (val))
4429 while (CONSP (val))
4430 {
4431 ch = XCAR (val), val = XCDR (val);
4432 CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (ch));
4433 }
4434 else if (VECTORP (val) || STRINGP (val))
4435 {
4436 int len = (VECTORP (val)
4437 ? XVECTOR (val)->size : XSTRING (val)->size);
4438 int i;
4439 for (i = 0; i < len; i++)
4440 {
4441 ch = (STRINGP (val)
4442 ? Faref (val, make_number (i))
4443 : XVECTOR (val)->contents[i]);
4444 CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (ch));
4445 }
4446 }
4447 else /* INTEGERP (val) */
4448 CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (val));
4449 }
4450 CODING_ADD_COMPOSITION_END (coding, end - from);
4451 }
4452 start = end;
4453 }
4454 while (start < to
4455 && find_composition (start, to, &start, &end, &prop, obj)
4456 && end <= to);
4457
4458 /* Make coding->cmp_data point to the first memory block. */
4459 while (coding->cmp_data->prev)
4460 coding->cmp_data = coding->cmp_data->prev;
4461 coding->cmp_data_start = 0;
4462 }
4463
4464 /* Reflect the saved information about compositions to OBJ.
4465 CODING->cmp_data points to a memory block for the informaiton. OBJ
4466 is a buffer or a string, defaults to the current buffer. */
4467
4468 static void
4469 coding_restore_composition (coding, obj)
4470 struct coding_system *coding;
4471 Lisp_Object obj;
4472 {
4473 struct composition_data *cmp_data = coding->cmp_data;
4474
4475 if (!cmp_data)
4476 return;
4477
4478 while (cmp_data->prev)
4479 cmp_data = cmp_data->prev;
4480
4481 while (cmp_data)
4482 {
4483 int i;
4484
4485 for (i = 0; i < cmp_data->used; i += cmp_data->data[i])
4486 {
4487 int *data = cmp_data->data + i;
4488 enum composition_method method = (enum composition_method) data[3];
4489 Lisp_Object components;
4490
4491 if (method == COMPOSITION_RELATIVE)
4492 components = Qnil;
4493 else
4494 {
4495 int len = data[0] - 4, j;
4496 Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
4497
4498 for (j = 0; j < len; j++)
4499 args[j] = make_number (data[4 + j]);
4500 components = (method == COMPOSITION_WITH_ALTCHARS
4501 ? Fstring (len, args) : Fvector (len, args));
4502 }
4503 compose_text (data[1], data[2], components, Qnil, obj);
4504 }
4505 cmp_data = cmp_data->next;
4506 }
4507 }
4508
4509 /* Decode (if ENCODEP is zero) or encode (if ENCODEP is nonzero) the
4510 text from FROM to TO (byte positions are FROM_BYTE and TO_BYTE) by
4511 coding system CODING, and return the status code of code conversion
4512 (currently, this value has no meaning).
4513
4514 How many characters (and bytes) are converted to how many
4515 characters (and bytes) are recorded in members of the structure
4516 CODING.
4517
4518 If REPLACE is nonzero, we do various things as if the original text
4519 is deleted and a new text is inserted. See the comments in
4520 replace_range (insdel.c) to know what we are doing.
4521
4522 If REPLACE is zero, it is assumed that the source text is unibyte.
4523 Otherwize, it is assumed that the source text is multibyte. */
4524
4525 int
4526 code_convert_region (from, from_byte, to, to_byte, coding, encodep, replace)
4527 int from, from_byte, to, to_byte, encodep, replace;
4528 struct coding_system *coding;
4529 {
4530 int len = to - from, len_byte = to_byte - from_byte;
4531 int require, inserted, inserted_byte;
4532 int head_skip, tail_skip, total_skip = 0;
4533 Lisp_Object saved_coding_symbol;
4534 int first = 1;
4535 unsigned char *src, *dst;
4536 Lisp_Object deletion;
4537 int orig_point = PT, orig_len = len;
4538 int prev_Z;
4539 int multibyte_p = !NILP (current_buffer->enable_multibyte_characters);
4540
4541 coding->src_multibyte = replace && multibyte_p;
4542 coding->dst_multibyte = multibyte_p;
4543
4544 deletion = Qnil;
4545 saved_coding_symbol = Qnil;
4546
4547 if (from < PT && PT < to)
4548 {
4549 TEMP_SET_PT_BOTH (from, from_byte);
4550 orig_point = from;
4551 }
4552
4553 if (replace)
4554 {
4555 int saved_from = from;
4556
4557 prepare_to_modify_buffer (from, to, &from);
4558 if (saved_from != from)
4559 {
4560 to = from + len;
4561 from_byte = CHAR_TO_BYTE (from), to_byte = CHAR_TO_BYTE (to);
4562 len_byte = to_byte - from_byte;
4563 }
4564 }
4565
4566 if (! encodep && CODING_REQUIRE_DETECTION (coding))
4567 {
4568 /* We must detect encoding of text and eol format. */
4569
4570 if (from < GPT && to > GPT)
4571 move_gap_both (from, from_byte);
4572 if (coding->type == coding_type_undecided)
4573 {
4574 detect_coding (coding, BYTE_POS_ADDR (from_byte), len_byte);
4575 if (coding->type == coding_type_undecided)
4576 /* It seems that the text contains only ASCII, but we
4577 should not left it undecided because the deeper
4578 decoding routine (decode_coding) tries to detect the
4579 encodings again in vain. */
4580 coding->type = coding_type_emacs_mule;
4581 }
4582 if (coding->eol_type == CODING_EOL_UNDECIDED)
4583 {
4584 saved_coding_symbol = coding->symbol;
4585 detect_eol (coding, BYTE_POS_ADDR (from_byte), len_byte);
4586 if (coding->eol_type == CODING_EOL_UNDECIDED)
4587 coding->eol_type = CODING_EOL_LF;
4588 /* We had better recover the original eol format if we
4589 encounter an inconsitent eol format while decoding. */
4590 coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4591 }
4592 }
4593
4594 /* Now we convert the text. */
4595
4596 /* For encoding, we must process pre-write-conversion in advance. */
4597 if (! inhibit_pre_post_conversion
4598 && encodep
4599 && SYMBOLP (coding->pre_write_conversion)
4600 && ! NILP (Ffboundp (coding->pre_write_conversion)))
4601 {
4602 /* The function in pre-write-conversion may put a new text in a
4603 new buffer. */
4604 struct buffer *prev = current_buffer;
4605 Lisp_Object new;
4606 int count = specpdl_ptr - specpdl;
4607
4608 record_unwind_protect (code_convert_region_unwind, Qnil);
4609 /* We should not call any more pre-write/post-read-conversion
4610 functions while this pre-write-conversion is running. */
4611 inhibit_pre_post_conversion = 1;
4612 call2 (coding->pre_write_conversion,
4613 make_number (from), make_number (to));
4614 inhibit_pre_post_conversion = 0;
4615 /* Discard the unwind protect. */
4616 specpdl_ptr--;
4617
4618 if (current_buffer != prev)
4619 {
4620 len = ZV - BEGV;
4621 new = Fcurrent_buffer ();
4622 set_buffer_internal_1 (prev);
4623 del_range_2 (from, from_byte, to, to_byte, 0);
4624 TEMP_SET_PT_BOTH (from, from_byte);
4625 insert_from_buffer (XBUFFER (new), 1, len, 0);
4626 Fkill_buffer (new);
4627 if (orig_point >= to)
4628 orig_point += len - orig_len;
4629 else if (orig_point > from)
4630 orig_point = from;
4631 orig_len = len;
4632 to = from + len;
4633 from_byte = CHAR_TO_BYTE (from);
4634 to_byte = CHAR_TO_BYTE (to);
4635 len_byte = to_byte - from_byte;
4636 TEMP_SET_PT_BOTH (from, from_byte);
4637 }
4638 }
4639
4640 if (replace)
4641 deletion = make_buffer_string_both (from, from_byte, to, to_byte, 1);
4642
4643 if (coding->composing != COMPOSITION_DISABLED)
4644 {
4645 if (encodep)
4646 coding_save_composition (coding, from, to, Fcurrent_buffer ());
4647 else
4648 coding_allocate_composition_data (coding, from);
4649 }
4650
4651 /* Try to skip the heading and tailing ASCIIs. */
4652 {
4653 int from_byte_orig = from_byte, to_byte_orig = to_byte;
4654
4655 if (from < GPT && GPT < to)
4656 move_gap_both (from, from_byte);
4657 SHRINK_CONVERSION_REGION (&from_byte, &to_byte, coding, NULL, encodep);
4658 if (from_byte == to_byte
4659 && (encodep || NILP (coding->post_read_conversion))
4660 && ! CODING_REQUIRE_FLUSHING (coding))
4661 {
4662 coding->produced = len_byte;
4663 coding->produced_char = len;
4664 if (!replace)
4665 /* We must record and adjust for this new text now. */
4666 adjust_after_insert (from, from_byte_orig, to, to_byte_orig, len);
4667 return 0;
4668 }
4669
4670 head_skip = from_byte - from_byte_orig;
4671 tail_skip = to_byte_orig - to_byte;
4672 total_skip = head_skip + tail_skip;
4673 from += head_skip;
4674 to -= tail_skip;
4675 len -= total_skip; len_byte -= total_skip;
4676 }
4677
4678 /* The code conversion routine can not preserve text properties for
4679 now. So, we must remove all text properties in the region.
4680 Here, we must suppress all modification hooks. */
4681 if (replace)
4682 {
4683 int saved_inhibit_modification_hooks = inhibit_modification_hooks;
4684 inhibit_modification_hooks = 1;
4685 Fset_text_properties (make_number (from), make_number (to), Qnil, Qnil);
4686 inhibit_modification_hooks = saved_inhibit_modification_hooks;
4687 }
4688
4689 /* For converion, we must put the gap before the text in addition to
4690 making the gap larger for efficient decoding. The required gap
4691 size starts from 2000 which is the magic number used in make_gap.
4692 But, after one batch of conversion, it will be incremented if we
4693 find that it is not enough . */
4694 require = 2000;
4695
4696 if (GAP_SIZE < require)
4697 make_gap (require - GAP_SIZE);
4698 move_gap_both (from, from_byte);
4699
4700 inserted = inserted_byte = 0;
4701
4702 GAP_SIZE += len_byte;
4703 ZV -= len;
4704 Z -= len;
4705 ZV_BYTE -= len_byte;
4706 Z_BYTE -= len_byte;
4707
4708 if (GPT - BEG < BEG_UNCHANGED)
4709 BEG_UNCHANGED = GPT - BEG;
4710 if (Z - GPT < END_UNCHANGED)
4711 END_UNCHANGED = Z - GPT;
4712
4713 if (!encodep && coding->src_multibyte)
4714 {
4715 /* Decoding routines expects that the source text is unibyte.
4716 We must convert 8-bit characters of multibyte form to
4717 unibyte. */
4718 int len_byte_orig = len_byte;
4719 len_byte = str_as_unibyte (GAP_END_ADDR - len_byte, len_byte);
4720 if (len_byte < len_byte_orig)
4721 safe_bcopy (GAP_END_ADDR - len_byte_orig, GAP_END_ADDR - len_byte,
4722 len_byte);
4723 coding->src_multibyte = 0;
4724 }
4725
4726 for (;;)
4727 {
4728 int result;
4729
4730 /* The buffer memory is now:
4731 +--------+converted-text+---------+-------original-text-------+---+
4732 |<-from->|<--inserted-->|---------|<--------len_byte--------->|---|
4733 |<---------------------- GAP ----------------------->| */
4734 src = GAP_END_ADDR - len_byte;
4735 dst = GPT_ADDR + inserted_byte;
4736
4737 if (encodep)
4738 result = encode_coding (coding, src, dst, len_byte, 0);
4739 else
4740 result = decode_coding (coding, src, dst, len_byte, 0);
4741
4742 /* The buffer memory is now:
4743 +--------+-------converted-text----+--+------original-text----+---+
4744 |<-from->|<-inserted->|<-produced->|--|<-(len_byte-consumed)->|---|
4745 |<---------------------- GAP ----------------------->| */
4746
4747 inserted += coding->produced_char;
4748 inserted_byte += coding->produced;
4749 len_byte -= coding->consumed;
4750
4751 if (result == CODING_FINISH_INSUFFICIENT_CMP)
4752 {
4753 coding_allocate_composition_data (coding, from + inserted);
4754 continue;
4755 }
4756
4757 src += coding->consumed;
4758 dst += coding->produced;
4759
4760 if (result == CODING_FINISH_NORMAL)
4761 {
4762 src += len_byte;
4763 break;
4764 }
4765 if (! encodep && result == CODING_FINISH_INCONSISTENT_EOL)
4766 {
4767 unsigned char *pend = dst, *p = pend - inserted_byte;
4768 Lisp_Object eol_type;
4769
4770 /* Encode LFs back to the original eol format (CR or CRLF). */
4771 if (coding->eol_type == CODING_EOL_CR)
4772 {
4773 while (p < pend) if (*p++ == '\n') p[-1] = '\r';
4774 }
4775 else
4776 {
4777 int count = 0;
4778
4779 while (p < pend) if (*p++ == '\n') count++;
4780 if (src - dst < count)
4781 {
4782 /* We don't have sufficient room for encoding LFs
4783 back to CRLF. We must record converted and
4784 not-yet-converted text back to the buffer
4785 content, enlarge the gap, then record them out of
4786 the buffer contents again. */
4787 int add = len_byte + inserted_byte;
4788
4789 GAP_SIZE -= add;
4790 ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
4791 GPT += inserted_byte; GPT_BYTE += inserted_byte;
4792 make_gap (count - GAP_SIZE);
4793 GAP_SIZE += add;
4794 ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
4795 GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
4796 /* Don't forget to update SRC, DST, and PEND. */
4797 src = GAP_END_ADDR - len_byte;
4798 dst = GPT_ADDR + inserted_byte;
4799 pend = dst;
4800 }
4801 inserted += count;
4802 inserted_byte += count;
4803 coding->produced += count;
4804 p = dst = pend + count;
4805 while (count)
4806 {
4807 *--p = *--pend;
4808 if (*p == '\n') count--, *--p = '\r';
4809 }
4810 }
4811
4812 /* Suppress eol-format conversion in the further conversion. */
4813 coding->eol_type = CODING_EOL_LF;
4814
4815 /* Set the coding system symbol to that for Unix-like EOL. */
4816 eol_type = Fget (saved_coding_symbol, Qeol_type);
4817 if (VECTORP (eol_type)
4818 && XVECTOR (eol_type)->size == 3
4819 && SYMBOLP (XVECTOR (eol_type)->contents[CODING_EOL_LF]))
4820 coding->symbol = XVECTOR (eol_type)->contents[CODING_EOL_LF];
4821 else
4822 coding->symbol = saved_coding_symbol;
4823
4824 continue;
4825 }
4826 if (len_byte <= 0)
4827 {
4828 if (coding->type != coding_type_ccl
4829 || coding->mode & CODING_MODE_LAST_BLOCK)
4830 break;
4831 coding->mode |= CODING_MODE_LAST_BLOCK;
4832 continue;
4833 }
4834 if (result == CODING_FINISH_INSUFFICIENT_SRC)
4835 {
4836 /* The source text ends in invalid codes. Let's just
4837 make them valid buffer contents, and finish conversion. */
4838 inserted += len_byte;
4839 inserted_byte += len_byte;
4840 while (len_byte--)
4841 *dst++ = *src++;
4842 break;
4843 }
4844 if (result == CODING_FINISH_INTERRUPT)
4845 {
4846 /* The conversion procedure was interrupted by a user. */
4847 break;
4848 }
4849 /* Now RESULT == CODING_FINISH_INSUFFICIENT_DST */
4850 if (coding->consumed < 1)
4851 {
4852 /* It's quite strange to require more memory without
4853 consuming any bytes. Perhaps CCL program bug. */
4854 break;
4855 }
4856 if (first)
4857 {
4858 /* We have just done the first batch of conversion which was
4859 stoped because of insufficient gap. Let's reconsider the
4860 required gap size (i.e. SRT - DST) now.
4861
4862 We have converted ORIG bytes (== coding->consumed) into
4863 NEW bytes (coding->produced). To convert the remaining
4864 LEN bytes, we may need REQUIRE bytes of gap, where:
4865 REQUIRE + LEN_BYTE = LEN_BYTE * (NEW / ORIG)
4866 REQUIRE = LEN_BYTE * (NEW - ORIG) / ORIG
4867 Here, we are sure that NEW >= ORIG. */
4868 float ratio = coding->produced - coding->consumed;
4869 ratio /= coding->consumed;
4870 require = len_byte * ratio;
4871 first = 0;
4872 }
4873 if ((src - dst) < (require + 2000))
4874 {
4875 /* See the comment above the previous call of make_gap. */
4876 int add = len_byte + inserted_byte;
4877
4878 GAP_SIZE -= add;
4879 ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
4880 GPT += inserted_byte; GPT_BYTE += inserted_byte;
4881 make_gap (require + 2000);
4882 GAP_SIZE += add;
4883 ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
4884 GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
4885 }
4886 }
4887 if (src - dst > 0) *dst = 0; /* Put an anchor. */
4888
4889 if (encodep && coding->dst_multibyte)
4890 {
4891 /* The output is unibyte. We must convert 8-bit characters to
4892 multibyte form. */
4893 if (inserted_byte * 2 > GAP_SIZE)
4894 {
4895 GAP_SIZE -= inserted_byte;
4896 ZV += inserted_byte; Z += inserted_byte;
4897 ZV_BYTE += inserted_byte; Z_BYTE += inserted_byte;
4898 GPT += inserted_byte; GPT_BYTE += inserted_byte;
4899 make_gap (inserted_byte - GAP_SIZE);
4900 GAP_SIZE += inserted_byte;
4901 ZV -= inserted_byte; Z -= inserted_byte;
4902 ZV_BYTE -= inserted_byte; Z_BYTE -= inserted_byte;
4903 GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
4904 }
4905 inserted_byte = str_to_multibyte (GPT_ADDR, GAP_SIZE, inserted_byte);
4906 }
4907
4908 /* If we have shrinked the conversion area, adjust it now. */
4909 if (total_skip > 0)
4910 {
4911 if (tail_skip > 0)
4912 safe_bcopy (GAP_END_ADDR, GPT_ADDR + inserted_byte, tail_skip);
4913 inserted += total_skip; inserted_byte += total_skip;
4914 GAP_SIZE += total_skip;
4915 GPT -= head_skip; GPT_BYTE -= head_skip;
4916 ZV -= total_skip; ZV_BYTE -= total_skip;
4917 Z -= total_skip; Z_BYTE -= total_skip;
4918 from -= head_skip; from_byte -= head_skip;
4919 to += tail_skip; to_byte += tail_skip;
4920 }
4921
4922 prev_Z = Z;
4923 adjust_after_replace (from, from_byte, deletion, inserted, inserted_byte);
4924 inserted = Z - prev_Z;
4925
4926 if (!encodep && coding->cmp_data && coding->cmp_data->used)
4927 coding_restore_composition (coding, Fcurrent_buffer ());
4928 coding_free_composition_data (coding);
4929
4930 if (! inhibit_pre_post_conversion
4931 && ! encodep && ! NILP (coding->post_read_conversion))
4932 {
4933 Lisp_Object val;
4934 int count = specpdl_ptr - specpdl;
4935
4936 if (from != PT)
4937 TEMP_SET_PT_BOTH (from, from_byte);
4938 prev_Z = Z;
4939 record_unwind_protect (code_convert_region_unwind, Qnil);
4940 /* We should not call any more pre-write/post-read-conversion
4941 functions while this post-read-conversion is running. */
4942 inhibit_pre_post_conversion = 1;
4943 val = call1 (coding->post_read_conversion, make_number (inserted));
4944 inhibit_pre_post_conversion = 0;
4945 /* Discard the unwind protect. */
4946 specpdl_ptr--;
4947 CHECK_NUMBER (val, 0);
4948 inserted += Z - prev_Z;
4949 }
4950
4951 if (orig_point >= from)
4952 {
4953 if (orig_point >= from + orig_len)
4954 orig_point += inserted - orig_len;
4955 else
4956 orig_point = from;
4957 TEMP_SET_PT (orig_point);
4958 }
4959
4960 if (replace)
4961 {
4962 signal_after_change (from, to - from, inserted);
4963 update_compositions (from, from + inserted, CHECK_BORDER);
4964 }
4965
4966 {
4967 coding->consumed = to_byte - from_byte;
4968 coding->consumed_char = to - from;
4969 coding->produced = inserted_byte;
4970 coding->produced_char = inserted;
4971 }
4972
4973 return 0;
4974 }
4975
4976 Lisp_Object
4977 run_pre_post_conversion_on_str (str, coding, encodep)
4978 Lisp_Object str;
4979 struct coding_system *coding;
4980 int encodep;
4981 {
4982 int count = specpdl_ptr - specpdl;
4983 struct gcpro gcpro1;
4984 struct buffer *prev = current_buffer;
4985 int multibyte = STRING_MULTIBYTE (str);
4986
4987 record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
4988 record_unwind_protect (code_convert_region_unwind, Qnil);
4989 GCPRO1 (str);
4990 temp_output_buffer_setup (" *code-converting-work*");
4991 set_buffer_internal (XBUFFER (Vstandard_output));
4992 /* We must insert the contents of STR as is without
4993 unibyte<->multibyte conversion. For that, we adjust the
4994 multibyteness of the working buffer to that of STR. */
4995 Ferase_buffer ();
4996 current_buffer->enable_multibyte_characters = multibyte ? Qt : Qnil;
4997 insert_from_string (str, 0, 0,
4998 XSTRING (str)->size, STRING_BYTES (XSTRING (str)), 0);
4999 UNGCPRO;
5000 inhibit_pre_post_conversion = 1;
5001 if (encodep)
5002 call2 (coding->pre_write_conversion, make_number (BEG), make_number (Z));
5003 else
5004 {
5005 TEMP_SET_PT_BOTH (BEG, BEG_BYTE);
5006 call1 (coding->post_read_conversion, make_number (Z - BEG));
5007 }
5008 inhibit_pre_post_conversion = 0;
5009 str = make_buffer_string (BEG, Z, 0);
5010 return unbind_to (count, str);
5011 }
5012
5013 Lisp_Object
5014 decode_coding_string (str, coding, nocopy)
5015 Lisp_Object str;
5016 struct coding_system *coding;
5017 int nocopy;
5018 {
5019 int len;
5020 char *buf;
5021 int from, to, to_byte;
5022 struct gcpro gcpro1;
5023 Lisp_Object saved_coding_symbol;
5024 int result;
5025
5026 from = 0;
5027 to = XSTRING (str)->size;
5028 to_byte = STRING_BYTES (XSTRING (str));
5029
5030 saved_coding_symbol = Qnil;
5031 if (CODING_REQUIRE_DETECTION (coding))
5032 {
5033 /* See the comments in code_convert_region. */
5034 if (coding->type == coding_type_undecided)
5035 {
5036 detect_coding (coding, XSTRING (str)->data, to_byte);
5037 if (coding->type == coding_type_undecided)
5038 coding->type = coding_type_emacs_mule;
5039 }
5040 if (coding->eol_type == CODING_EOL_UNDECIDED)
5041 {
5042 saved_coding_symbol = coding->symbol;
5043 detect_eol (coding, XSTRING (str)->data, to_byte);
5044 if (coding->eol_type == CODING_EOL_UNDECIDED)
5045 coding->eol_type = CODING_EOL_LF;
5046 /* We had better recover the original eol format if we
5047 encounter an inconsitent eol format while decoding. */
5048 coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
5049 }
5050 }
5051
5052 if (! CODING_REQUIRE_DECODING (coding))
5053 {
5054 if (!STRING_MULTIBYTE (str))
5055 {
5056 str = Fstring_as_multibyte (str);
5057 nocopy = 1;
5058 }
5059 return (nocopy ? str : Fcopy_sequence (str));
5060 }
5061
5062 if (STRING_MULTIBYTE (str))
5063 {
5064 /* Decoding routines expect the source text to be unibyte. */
5065 str = Fstring_as_unibyte (str);
5066 nocopy = 1;
5067 coding->src_multibyte = 0;
5068 }
5069 coding->dst_multibyte = 1;
5070
5071 if (coding->composing != COMPOSITION_DISABLED)
5072 coding_allocate_composition_data (coding, from);
5073
5074 /* Try to skip the heading and tailing ASCIIs. */
5075 {
5076 int from_orig = from;
5077
5078 SHRINK_CONVERSION_REGION (&from, &to_byte, coding, XSTRING (str)->data,
5079 0);
5080 if (from == to_byte)
5081 return (nocopy ? str : Fcopy_sequence (str));
5082 }
5083
5084 len = decoding_buffer_size (coding, to_byte - from);
5085 len += from + STRING_BYTES (XSTRING (str)) - to_byte;
5086 GCPRO1 (str);
5087 buf = get_conversion_buffer (len);
5088 UNGCPRO;
5089
5090 if (from > 0)
5091 bcopy (XSTRING (str)->data, buf, from);
5092 result = decode_coding (coding, XSTRING (str)->data + from,
5093 buf + from, to_byte - from, len);
5094 if (result == CODING_FINISH_INCONSISTENT_EOL)
5095 {
5096 /* We simply try to decode the whole string again but without
5097 eol-conversion this time. */
5098 coding->eol_type = CODING_EOL_LF;
5099 coding->symbol = saved_coding_symbol;
5100 coding_free_composition_data (coding);
5101 return decode_coding_string (str, coding, nocopy);
5102 }
5103
5104 bcopy (XSTRING (str)->data + to_byte, buf + from + coding->produced,
5105 STRING_BYTES (XSTRING (str)) - to_byte);
5106
5107 len = from + STRING_BYTES (XSTRING (str)) - to_byte;
5108 str = make_multibyte_string (buf, len + coding->produced_char,
5109 len + coding->produced);
5110
5111 if (coding->cmp_data && coding->cmp_data->used)
5112 coding_restore_composition (coding, str);
5113 coding_free_composition_data (coding);
5114
5115 if (SYMBOLP (coding->post_read_conversion)
5116 && !NILP (Ffboundp (coding->post_read_conversion)))
5117 str = run_pre_post_conversion_on_str (str, coding, 0);
5118
5119 return str;
5120 }
5121
5122 Lisp_Object
5123 encode_coding_string (str, coding, nocopy)
5124 Lisp_Object str;
5125 struct coding_system *coding;
5126 int nocopy;
5127 {
5128 int len;
5129 char *buf;
5130 int from, to, to_byte;
5131 struct gcpro gcpro1;
5132 Lisp_Object saved_coding_symbol;
5133 int result;
5134
5135 if (SYMBOLP (coding->pre_write_conversion)
5136 && !NILP (Ffboundp (coding->pre_write_conversion)))
5137 str = run_pre_post_conversion_on_str (str, coding, 1);
5138
5139 from = 0;
5140 to = XSTRING (str)->size;
5141 to_byte = STRING_BYTES (XSTRING (str));
5142
5143 saved_coding_symbol = Qnil;
5144 if (! CODING_REQUIRE_ENCODING (coding))
5145 {
5146 if (STRING_MULTIBYTE (str))
5147 {
5148 str = Fstring_as_unibyte (str);
5149 nocopy = 1;
5150 }
5151 return (nocopy ? str : Fcopy_sequence (str));
5152 }
5153
5154 /* Encoding routines determine the multibyteness of the source text
5155 by coding->src_multibyte. */
5156 coding->src_multibyte = STRING_MULTIBYTE (str);
5157 coding->dst_multibyte = 0;
5158
5159 if (coding->composing != COMPOSITION_DISABLED)
5160 coding_save_composition (coding, from, to, str);
5161
5162 /* Try to skip the heading and tailing ASCIIs. */
5163 {
5164 int from_orig = from;
5165
5166 SHRINK_CONVERSION_REGION (&from, &to_byte, coding, XSTRING (str)->data,
5167 1);
5168 if (from == to_byte)
5169 return (nocopy ? str : Fcopy_sequence (str));
5170 }
5171
5172 len = encoding_buffer_size (coding, to_byte - from);
5173 len += from + STRING_BYTES (XSTRING (str)) - to_byte;
5174 GCPRO1 (str);
5175 buf = get_conversion_buffer (len);
5176 UNGCPRO;
5177
5178 if (from > 0)
5179 bcopy (XSTRING (str)->data, buf, from);
5180 result = encode_coding (coding, XSTRING (str)->data + from,
5181 buf + from, to_byte - from, len);
5182 bcopy (XSTRING (str)->data + to_byte, buf + from + coding->produced,
5183 STRING_BYTES (XSTRING (str)) - to_byte);
5184
5185 len = from + STRING_BYTES (XSTRING (str)) - to_byte;
5186 str = make_unibyte_string (buf, len + coding->produced);
5187 coding_free_composition_data (coding);
5188
5189 return str;
5190 }
5191
5192 \f
5193 #ifdef emacs
5194 /*** 8. Emacs Lisp library functions ***/
5195
5196 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
5197 "Return t if OBJECT is nil or a coding-system.\n\
5198 See the documentation of `make-coding-system' for information\n\
5199 about coding-system objects.")
5200 (obj)
5201 Lisp_Object obj;
5202 {
5203 if (NILP (obj))
5204 return Qt;
5205 if (!SYMBOLP (obj))
5206 return Qnil;
5207 /* Get coding-spec vector for OBJ. */
5208 obj = Fget (obj, Qcoding_system);
5209 return ((VECTORP (obj) && XVECTOR (obj)->size == 5)
5210 ? Qt : Qnil);
5211 }
5212
5213 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
5214 Sread_non_nil_coding_system, 1, 1, 0,
5215 "Read a coding system from the minibuffer, prompting with string PROMPT.")
5216 (prompt)
5217 Lisp_Object prompt;
5218 {
5219 Lisp_Object val;
5220 do
5221 {
5222 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
5223 Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
5224 }
5225 while (XSTRING (val)->size == 0);
5226 return (Fintern (val, Qnil));
5227 }
5228
5229 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
5230 "Read a coding system from the minibuffer, prompting with string PROMPT.\n\
5231 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.")
5232 (prompt, default_coding_system)
5233 Lisp_Object prompt, default_coding_system;
5234 {
5235 Lisp_Object val;
5236 if (SYMBOLP (default_coding_system))
5237 XSETSTRING (default_coding_system, XSYMBOL (default_coding_system)->name);
5238 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
5239 Qt, Qnil, Qcoding_system_history,
5240 default_coding_system, Qnil);
5241 return (XSTRING (val)->size == 0 ? Qnil : Fintern (val, Qnil));
5242 }
5243
5244 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
5245 1, 1, 0,
5246 "Check validity of CODING-SYSTEM.\n\
5247 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.\n\
5248 It is valid if it is a symbol with a non-nil `coding-system' property.\n\
5249 The value of property should be a vector of length 5.")
5250 (coding_system)
5251 Lisp_Object coding_system;
5252 {
5253 CHECK_SYMBOL (coding_system, 0);
5254 if (!NILP (Fcoding_system_p (coding_system)))
5255 return coding_system;
5256 while (1)
5257 Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
5258 }
5259 \f
5260 Lisp_Object
5261 detect_coding_system (src, src_bytes, highest)
5262 unsigned char *src;
5263 int src_bytes, highest;
5264 {
5265 int coding_mask, eol_type;
5266 Lisp_Object val, tmp;
5267 int dummy;
5268
5269 coding_mask = detect_coding_mask (src, src_bytes, NULL, &dummy);
5270 eol_type = detect_eol_type (src, src_bytes, &dummy);
5271 if (eol_type == CODING_EOL_INCONSISTENT)
5272 eol_type = CODING_EOL_UNDECIDED;
5273
5274 if (!coding_mask)
5275 {
5276 val = Qundecided;
5277 if (eol_type != CODING_EOL_UNDECIDED)
5278 {
5279 Lisp_Object val2;
5280 val2 = Fget (Qundecided, Qeol_type);
5281 if (VECTORP (val2))
5282 val = XVECTOR (val2)->contents[eol_type];
5283 }
5284 return (highest ? val : Fcons (val, Qnil));
5285 }
5286
5287 /* At first, gather possible coding systems in VAL. */
5288 val = Qnil;
5289 for (tmp = Vcoding_category_list; CONSP (tmp); tmp = XCDR (tmp))
5290 {
5291 Lisp_Object category_val, category_index;
5292
5293 category_index = Fget (XCAR (tmp), Qcoding_category_index);
5294 category_val = Fsymbol_value (XCAR (tmp));
5295 if (!NILP (category_val)
5296 && NATNUMP (category_index)
5297 && (coding_mask & (1 << XFASTINT (category_index))))
5298 {
5299 val = Fcons (category_val, val);
5300 if (highest)
5301 break;
5302 }
5303 }
5304 if (!highest)
5305 val = Fnreverse (val);
5306
5307 /* Then, replace the elements with subsidiary coding systems. */
5308 for (tmp = val; CONSP (tmp); tmp = XCDR (tmp))
5309 {
5310 if (eol_type != CODING_EOL_UNDECIDED
5311 && eol_type != CODING_EOL_INCONSISTENT)
5312 {
5313 Lisp_Object eol;
5314 eol = Fget (XCAR (tmp), Qeol_type);
5315 if (VECTORP (eol))
5316 XCAR (tmp) = XVECTOR (eol)->contents[eol_type];
5317 }
5318 }
5319 return (highest ? XCAR (val) : val);
5320 }
5321
5322 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
5323 2, 3, 0,
5324 "Detect coding system of the text in the region between START and END.\n\
5325 Return a list of possible coding systems ordered by priority.\n\
5326 \n\
5327 If only ASCII characters are found, it returns a list of single element\n\
5328 `undecided' or its subsidiary coding system according to a detected\n\
5329 end-of-line format.\n\
5330 \n\
5331 If optional argument HIGHEST is non-nil, return the coding system of\n\
5332 highest priority.")
5333 (start, end, highest)
5334 Lisp_Object start, end, highest;
5335 {
5336 int from, to;
5337 int from_byte, to_byte;
5338
5339 CHECK_NUMBER_COERCE_MARKER (start, 0);
5340 CHECK_NUMBER_COERCE_MARKER (end, 1);
5341
5342 validate_region (&start, &end);
5343 from = XINT (start), to = XINT (end);
5344 from_byte = CHAR_TO_BYTE (from);
5345 to_byte = CHAR_TO_BYTE (to);
5346
5347 if (from < GPT && to >= GPT)
5348 move_gap_both (to, to_byte);
5349
5350 return detect_coding_system (BYTE_POS_ADDR (from_byte),
5351 to_byte - from_byte,
5352 !NILP (highest));
5353 }
5354
5355 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
5356 1, 2, 0,
5357 "Detect coding system of the text in STRING.\n\
5358 Return a list of possible coding systems ordered by priority.\n\
5359 \n\
5360 If only ASCII characters are found, it returns a list of single element\n\
5361 `undecided' or its subsidiary coding system according to a detected\n\
5362 end-of-line format.\n\
5363 \n\
5364 If optional argument HIGHEST is non-nil, return the coding system of\n\
5365 highest priority.")
5366 (string, highest)
5367 Lisp_Object string, highest;
5368 {
5369 CHECK_STRING (string, 0);
5370
5371 return detect_coding_system (XSTRING (string)->data,
5372 STRING_BYTES (XSTRING (string)),
5373 !NILP (highest));
5374 }
5375
5376 Lisp_Object
5377 code_convert_region1 (start, end, coding_system, encodep)
5378 Lisp_Object start, end, coding_system;
5379 int encodep;
5380 {
5381 struct coding_system coding;
5382 int from, to, len;
5383
5384 CHECK_NUMBER_COERCE_MARKER (start, 0);
5385 CHECK_NUMBER_COERCE_MARKER (end, 1);
5386 CHECK_SYMBOL (coding_system, 2);
5387
5388 validate_region (&start, &end);
5389 from = XFASTINT (start);
5390 to = XFASTINT (end);
5391
5392 if (NILP (coding_system))
5393 return make_number (to - from);
5394
5395 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
5396 error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data);
5397
5398 coding.mode |= CODING_MODE_LAST_BLOCK;
5399 coding.src_multibyte = coding.dst_multibyte
5400 = !NILP (current_buffer->enable_multibyte_characters);
5401 code_convert_region (from, CHAR_TO_BYTE (from), to, CHAR_TO_BYTE (to),
5402 &coding, encodep, 1);
5403 Vlast_coding_system_used = coding.symbol;
5404 return make_number (coding.produced_char);
5405 }
5406
5407 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
5408 3, 3, "r\nzCoding system: ",
5409 "Decode the current region by specified coding system.\n\
5410 When called from a program, takes three arguments:\n\
5411 START, END, and CODING-SYSTEM. START and END are buffer positions.\n\
5412 This function sets `last-coding-system-used' to the precise coding system\n\
5413 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
5414 not fully specified.)\n\
5415 It returns the length of the decoded text.")
5416 (start, end, coding_system)
5417 Lisp_Object start, end, coding_system;
5418 {
5419 return code_convert_region1 (start, end, coding_system, 0);
5420 }
5421
5422 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
5423 3, 3, "r\nzCoding system: ",
5424 "Encode the current region by specified coding system.\n\
5425 When called from a program, takes three arguments:\n\
5426 START, END, and CODING-SYSTEM. START and END are buffer positions.\n\
5427 This function sets `last-coding-system-used' to the precise coding system\n\
5428 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
5429 not fully specified.)\n\
5430 It returns the length of the encoded text.")
5431 (start, end, coding_system)
5432 Lisp_Object start, end, coding_system;
5433 {
5434 return code_convert_region1 (start, end, coding_system, 1);
5435 }
5436
5437 Lisp_Object
5438 code_convert_string1 (string, coding_system, nocopy, encodep)
5439 Lisp_Object string, coding_system, nocopy;
5440 int encodep;
5441 {
5442 struct coding_system coding;
5443
5444 CHECK_STRING (string, 0);
5445 CHECK_SYMBOL (coding_system, 1);
5446
5447 if (NILP (coding_system))
5448 return (NILP (nocopy) ? Fcopy_sequence (string) : string);
5449
5450 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
5451 error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data);
5452
5453 coding.mode |= CODING_MODE_LAST_BLOCK;
5454 string = (encodep
5455 ? encode_coding_string (string, &coding, !NILP (nocopy))
5456 : decode_coding_string (string, &coding, !NILP (nocopy)));
5457 Vlast_coding_system_used = coding.symbol;
5458
5459 return string;
5460 }
5461
5462 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
5463 2, 3, 0,
5464 "Decode STRING which is encoded in CODING-SYSTEM, and return the result.\n\
5465 Optional arg NOCOPY non-nil means it is ok to return STRING itself\n\
5466 if the decoding operation is trivial.\n\
5467 This function sets `last-coding-system-used' to the precise coding system\n\
5468 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
5469 not fully specified.)")
5470 (string, coding_system, nocopy)
5471 Lisp_Object string, coding_system, nocopy;
5472 {
5473 return code_convert_string1 (string, coding_system, nocopy, 0);
5474 }
5475
5476 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
5477 2, 3, 0,
5478 "Encode STRING to CODING-SYSTEM, and return the result.\n\
5479 Optional arg NOCOPY non-nil means it is ok to return STRING itself\n\
5480 if the encoding operation is trivial.\n\
5481 This function sets `last-coding-system-used' to the precise coding system\n\
5482 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
5483 not fully specified.)")
5484 (string, coding_system, nocopy)
5485 Lisp_Object string, coding_system, nocopy;
5486 {
5487 return code_convert_string1 (string, coding_system, nocopy, 1);
5488 }
5489
5490 /* Encode or decode STRING according to CODING_SYSTEM.
5491 Do not set Vlast_coding_system_used.
5492
5493 This function is called only from macros DECODE_FILE and
5494 ENCODE_FILE, thus we ignore character composition. */
5495
5496 Lisp_Object
5497 code_convert_string_norecord (string, coding_system, encodep)
5498 Lisp_Object string, coding_system;
5499 int encodep;
5500 {
5501 struct coding_system coding;
5502
5503 CHECK_STRING (string, 0);
5504 CHECK_SYMBOL (coding_system, 1);
5505
5506 if (NILP (coding_system))
5507 return string;
5508
5509 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
5510 error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data);
5511
5512 coding.composing = COMPOSITION_DISABLED;
5513 coding.mode |= CODING_MODE_LAST_BLOCK;
5514 return (encodep
5515 ? encode_coding_string (string, &coding, 1)
5516 : decode_coding_string (string, &coding, 1));
5517 }
5518 \f
5519 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
5520 "Decode a Japanese character which has CODE in shift_jis encoding.\n\
5521 Return the corresponding character.")
5522 (code)
5523 Lisp_Object code;
5524 {
5525 unsigned char c1, c2, s1, s2;
5526 Lisp_Object val;
5527
5528 CHECK_NUMBER (code, 0);
5529 s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
5530 if (s1 == 0)
5531 {
5532 if (s2 < 0x80)
5533 XSETFASTINT (val, s2);
5534 else if (s2 >= 0xA0 || s2 <= 0xDF)
5535 XSETFASTINT (val, MAKE_CHAR (charset_katakana_jisx0201, s2, 0));
5536 else
5537 error ("Invalid Shift JIS code: %x", XFASTINT (code));
5538 }
5539 else
5540 {
5541 if ((s1 < 0x80 || s1 > 0x9F && s1 < 0xE0 || s1 > 0xEF)
5542 || (s2 < 0x40 || s2 == 0x7F || s2 > 0xFC))
5543 error ("Invalid Shift JIS code: %x", XFASTINT (code));
5544 DECODE_SJIS (s1, s2, c1, c2);
5545 XSETFASTINT (val, MAKE_CHAR (charset_jisx0208, c1, c2));
5546 }
5547 return val;
5548 }
5549
5550 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
5551 "Encode a Japanese character CHAR to shift_jis encoding.\n\
5552 Return the corresponding code in SJIS.")
5553 (ch)
5554 Lisp_Object ch;
5555 {
5556 int charset, c1, c2, s1, s2;
5557 Lisp_Object val;
5558
5559 CHECK_NUMBER (ch, 0);
5560 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
5561 if (charset == CHARSET_ASCII)
5562 {
5563 val = ch;
5564 }
5565 else if (charset == charset_jisx0208
5566 && c1 > 0x20 && c1 < 0x7F && c2 > 0x20 && c2 < 0x7F)
5567 {
5568 ENCODE_SJIS (c1, c2, s1, s2);
5569 XSETFASTINT (val, (s1 << 8) | s2);
5570 }
5571 else if (charset == charset_katakana_jisx0201
5572 && c1 > 0x20 && c2 < 0xE0)
5573 {
5574 XSETFASTINT (val, c1 | 0x80);
5575 }
5576 else
5577 error ("Can't encode to shift_jis: %d", XFASTINT (ch));
5578 return val;
5579 }
5580
5581 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
5582 "Decode a Big5 character which has CODE in BIG5 coding system.\n\
5583 Return the corresponding character.")
5584 (code)
5585 Lisp_Object code;
5586 {
5587 int charset;
5588 unsigned char b1, b2, c1, c2;
5589 Lisp_Object val;
5590
5591 CHECK_NUMBER (code, 0);
5592 b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF;
5593 if (b1 == 0)
5594 {
5595 if (b2 >= 0x80)
5596 error ("Invalid BIG5 code: %x", XFASTINT (code));
5597 val = code;
5598 }
5599 else
5600 {
5601 if ((b1 < 0xA1 || b1 > 0xFE)
5602 || (b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE))
5603 error ("Invalid BIG5 code: %x", XFASTINT (code));
5604 DECODE_BIG5 (b1, b2, charset, c1, c2);
5605 XSETFASTINT (val, MAKE_CHAR (charset, c1, c2));
5606 }
5607 return val;
5608 }
5609
5610 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
5611 "Encode the Big5 character CHAR to BIG5 coding system.\n\
5612 Return the corresponding character code in Big5.")
5613 (ch)
5614 Lisp_Object ch;
5615 {
5616 int charset, c1, c2, b1, b2;
5617 Lisp_Object val;
5618
5619 CHECK_NUMBER (ch, 0);
5620 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
5621 if (charset == CHARSET_ASCII)
5622 {
5623 val = ch;
5624 }
5625 else if ((charset == charset_big5_1
5626 && (XFASTINT (ch) >= 0x250a1 && XFASTINT (ch) <= 0x271ec))
5627 || (charset == charset_big5_2
5628 && XFASTINT (ch) >= 0x290a1 && XFASTINT (ch) <= 0x2bdb2))
5629 {
5630 ENCODE_BIG5 (charset, c1, c2, b1, b2);
5631 XSETFASTINT (val, (b1 << 8) | b2);
5632 }
5633 else
5634 error ("Can't encode to Big5: %d", XFASTINT (ch));
5635 return val;
5636 }
5637 \f
5638 DEFUN ("set-terminal-coding-system-internal",
5639 Fset_terminal_coding_system_internal,
5640 Sset_terminal_coding_system_internal, 1, 1, 0, "")
5641 (coding_system)
5642 Lisp_Object coding_system;
5643 {
5644 CHECK_SYMBOL (coding_system, 0);
5645 setup_coding_system (Fcheck_coding_system (coding_system), &terminal_coding);
5646 /* We had better not send unsafe characters to terminal. */
5647 terminal_coding.flags |= CODING_FLAG_ISO_SAFE;
5648 /* Characer composition should be disabled. */
5649 terminal_coding.composing = COMPOSITION_DISABLED;
5650 terminal_coding.src_multibyte = 1;
5651 terminal_coding.dst_multibyte = 0;
5652 return Qnil;
5653 }
5654
5655 DEFUN ("set-safe-terminal-coding-system-internal",
5656 Fset_safe_terminal_coding_system_internal,
5657 Sset_safe_terminal_coding_system_internal, 1, 1, 0, "")
5658 (coding_system)
5659 Lisp_Object coding_system;
5660 {
5661 CHECK_SYMBOL (coding_system, 0);
5662 setup_coding_system (Fcheck_coding_system (coding_system),
5663 &safe_terminal_coding);
5664 /* Characer composition should be disabled. */
5665 safe_terminal_coding.composing = COMPOSITION_DISABLED;
5666 safe_terminal_coding.src_multibyte = 1;
5667 safe_terminal_coding.dst_multibyte = 0;
5668 return Qnil;
5669 }
5670
5671 DEFUN ("terminal-coding-system",
5672 Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
5673 "Return coding system specified for terminal output.")
5674 ()
5675 {
5676 return terminal_coding.symbol;
5677 }
5678
5679 DEFUN ("set-keyboard-coding-system-internal",
5680 Fset_keyboard_coding_system_internal,
5681 Sset_keyboard_coding_system_internal, 1, 1, 0, "")
5682 (coding_system)
5683 Lisp_Object coding_system;
5684 {
5685 CHECK_SYMBOL (coding_system, 0);
5686 setup_coding_system (Fcheck_coding_system (coding_system), &keyboard_coding);
5687 /* Characer composition should be disabled. */
5688 keyboard_coding.composing = COMPOSITION_DISABLED;
5689 return Qnil;
5690 }
5691
5692 DEFUN ("keyboard-coding-system",
5693 Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
5694 "Return coding system specified for decoding keyboard input.")
5695 ()
5696 {
5697 return keyboard_coding.symbol;
5698 }
5699
5700 \f
5701 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
5702 Sfind_operation_coding_system, 1, MANY, 0,
5703 "Choose a coding system for an operation based on the target name.\n\
5704 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).\n\
5705 DECODING-SYSTEM is the coding system to use for decoding\n\
5706 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system\n\
5707 for encoding (in case OPERATION does encoding).\n\
5708 \n\
5709 The first argument OPERATION specifies an I/O primitive:\n\
5710 For file I/O, `insert-file-contents' or `write-region'.\n\
5711 For process I/O, `call-process', `call-process-region', or `start-process'.\n\
5712 For network I/O, `open-network-stream'.\n\
5713 \n\
5714 The remaining arguments should be the same arguments that were passed\n\
5715 to the primitive. Depending on which primitive, one of those arguments\n\
5716 is selected as the TARGET. For example, if OPERATION does file I/O,\n\
5717 whichever argument specifies the file name is TARGET.\n\
5718 \n\
5719 TARGET has a meaning which depends on OPERATION:\n\
5720 For file I/O, TARGET is a file name.\n\
5721 For process I/O, TARGET is a process name.\n\
5722 For network I/O, TARGET is a service name or a port number\n\
5723 \n\
5724 This function looks up what specified for TARGET in,\n\
5725 `file-coding-system-alist', `process-coding-system-alist',\n\
5726 or `network-coding-system-alist' depending on OPERATION.\n\
5727 They may specify a coding system, a cons of coding systems,\n\
5728 or a function symbol to call.\n\
5729 In the last case, we call the function with one argument,\n\
5730 which is a list of all the arguments given to this function.")
5731 (nargs, args)
5732 int nargs;
5733 Lisp_Object *args;
5734 {
5735 Lisp_Object operation, target_idx, target, val;
5736 register Lisp_Object chain;
5737
5738 if (nargs < 2)
5739 error ("Too few arguments");
5740 operation = args[0];
5741 if (!SYMBOLP (operation)
5742 || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
5743 error ("Invalid first arguement");
5744 if (nargs < 1 + XINT (target_idx))
5745 error ("Too few arguments for operation: %s",
5746 XSYMBOL (operation)->name->data);
5747 target = args[XINT (target_idx) + 1];
5748 if (!(STRINGP (target)
5749 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
5750 error ("Invalid %dth argument", XINT (target_idx) + 1);
5751
5752 chain = ((EQ (operation, Qinsert_file_contents)
5753 || EQ (operation, Qwrite_region))
5754 ? Vfile_coding_system_alist
5755 : (EQ (operation, Qopen_network_stream)
5756 ? Vnetwork_coding_system_alist
5757 : Vprocess_coding_system_alist));
5758 if (NILP (chain))
5759 return Qnil;
5760
5761 for (; CONSP (chain); chain = XCDR (chain))
5762 {
5763 Lisp_Object elt;
5764 elt = XCAR (chain);
5765
5766 if (CONSP (elt)
5767 && ((STRINGP (target)
5768 && STRINGP (XCAR (elt))
5769 && fast_string_match (XCAR (elt), target) >= 0)
5770 || (INTEGERP (target) && EQ (target, XCAR (elt)))))
5771 {
5772 val = XCDR (elt);
5773 /* Here, if VAL is both a valid coding system and a valid
5774 function symbol, we return VAL as a coding system. */
5775 if (CONSP (val))
5776 return val;
5777 if (! SYMBOLP (val))
5778 return Qnil;
5779 if (! NILP (Fcoding_system_p (val)))
5780 return Fcons (val, val);
5781 if (! NILP (Ffboundp (val)))
5782 {
5783 val = call1 (val, Flist (nargs, args));
5784 if (CONSP (val))
5785 return val;
5786 if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
5787 return Fcons (val, val);
5788 }
5789 return Qnil;
5790 }
5791 }
5792 return Qnil;
5793 }
5794
5795 DEFUN ("update-coding-systems-internal", Fupdate_coding_systems_internal,
5796 Supdate_coding_systems_internal, 0, 0, 0,
5797 "Update internal database for ISO2022 and CCL based coding systems.\n\
5798 When values of any coding categories are changed, you must\n\
5799 call this function")
5800 ()
5801 {
5802 int i;
5803
5804 for (i = CODING_CATEGORY_IDX_EMACS_MULE; i < CODING_CATEGORY_IDX_MAX; i++)
5805 {
5806 Lisp_Object val;
5807
5808 val = XSYMBOL (XVECTOR (Vcoding_category_table)->contents[i])->value;
5809 if (!NILP (val))
5810 {
5811 if (! coding_system_table[i])
5812 coding_system_table[i] = ((struct coding_system *)
5813 xmalloc (sizeof (struct coding_system)));
5814 setup_coding_system (val, coding_system_table[i]);
5815 }
5816 else if (coding_system_table[i])
5817 {
5818 xfree (coding_system_table[i]);
5819 coding_system_table[i] = NULL;
5820 }
5821 }
5822
5823 return Qnil;
5824 }
5825
5826 DEFUN ("set-coding-priority-internal", Fset_coding_priority_internal,
5827 Sset_coding_priority_internal, 0, 0, 0,
5828 "Update internal database for the current value of `coding-category-list'.\n\
5829 This function is internal use only.")
5830 ()
5831 {
5832 int i = 0, idx;
5833 Lisp_Object val;
5834
5835 val = Vcoding_category_list;
5836
5837 while (CONSP (val) && i < CODING_CATEGORY_IDX_MAX)
5838 {
5839 if (! SYMBOLP (XCAR (val)))
5840 break;
5841 idx = XFASTINT (Fget (XCAR (val), Qcoding_category_index));
5842 if (idx >= CODING_CATEGORY_IDX_MAX)
5843 break;
5844 coding_priorities[i++] = (1 << idx);
5845 val = XCDR (val);
5846 }
5847 /* If coding-category-list is valid and contains all coding
5848 categories, `i' should be CODING_CATEGORY_IDX_MAX now. If not,
5849 the following code saves Emacs from crashing. */
5850 while (i < CODING_CATEGORY_IDX_MAX)
5851 coding_priorities[i++] = CODING_CATEGORY_MASK_RAW_TEXT;
5852
5853 return Qnil;
5854 }
5855
5856 #endif /* emacs */
5857
5858 \f
5859 /*** 9. Post-amble ***/
5860
5861 void
5862 init_coding ()
5863 {
5864 conversion_buffer = (char *) xmalloc (MINIMUM_CONVERSION_BUFFER_SIZE);
5865 }
5866
5867 void
5868 init_coding_once ()
5869 {
5870 int i;
5871
5872 /* Emacs' internal format specific initialize routine. */
5873 for (i = 0; i <= 0x20; i++)
5874 emacs_code_class[i] = EMACS_control_code;
5875 emacs_code_class[0x0A] = EMACS_linefeed_code;
5876 emacs_code_class[0x0D] = EMACS_carriage_return_code;
5877 for (i = 0x21 ; i < 0x7F; i++)
5878 emacs_code_class[i] = EMACS_ascii_code;
5879 emacs_code_class[0x7F] = EMACS_control_code;
5880 for (i = 0x80; i < 0xFF; i++)
5881 emacs_code_class[i] = EMACS_invalid_code;
5882 emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
5883 emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
5884 emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
5885 emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;
5886
5887 /* ISO2022 specific initialize routine. */
5888 for (i = 0; i < 0x20; i++)
5889 iso_code_class[i] = ISO_control_0;
5890 for (i = 0x21; i < 0x7F; i++)
5891 iso_code_class[i] = ISO_graphic_plane_0;
5892 for (i = 0x80; i < 0xA0; i++)
5893 iso_code_class[i] = ISO_control_1;
5894 for (i = 0xA1; i < 0xFF; i++)
5895 iso_code_class[i] = ISO_graphic_plane_1;
5896 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
5897 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
5898 iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
5899 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
5900 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
5901 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
5902 iso_code_class[ISO_CODE_ESC] = ISO_escape;
5903 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
5904 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
5905 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
5906
5907 conversion_buffer_size = MINIMUM_CONVERSION_BUFFER_SIZE;
5908
5909 setup_coding_system (Qnil, &keyboard_coding);
5910 setup_coding_system (Qnil, &terminal_coding);
5911 setup_coding_system (Qnil, &safe_terminal_coding);
5912 setup_coding_system (Qnil, &default_buffer_file_coding);
5913
5914 bzero (coding_system_table, sizeof coding_system_table);
5915
5916 bzero (ascii_skip_code, sizeof ascii_skip_code);
5917 for (i = 0; i < 128; i++)
5918 ascii_skip_code[i] = 1;
5919
5920 #if defined (MSDOS) || defined (WINDOWSNT)
5921 system_eol_type = CODING_EOL_CRLF;
5922 #else
5923 system_eol_type = CODING_EOL_LF;
5924 #endif
5925
5926 inhibit_pre_post_conversion = 0;
5927 }
5928
5929 #ifdef emacs
5930
5931 void
5932 syms_of_coding ()
5933 {
5934 Qtarget_idx = intern ("target-idx");
5935 staticpro (&Qtarget_idx);
5936
5937 Qcoding_system_history = intern ("coding-system-history");
5938 staticpro (&Qcoding_system_history);
5939 Fset (Qcoding_system_history, Qnil);
5940
5941 /* Target FILENAME is the first argument. */
5942 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
5943 /* Target FILENAME is the third argument. */
5944 Fput (Qwrite_region, Qtarget_idx, make_number (2));
5945
5946 Qcall_process = intern ("call-process");
5947 staticpro (&Qcall_process);
5948 /* Target PROGRAM is the first argument. */
5949 Fput (Qcall_process, Qtarget_idx, make_number (0));
5950
5951 Qcall_process_region = intern ("call-process-region");
5952 staticpro (&Qcall_process_region);
5953 /* Target PROGRAM is the third argument. */
5954 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
5955
5956 Qstart_process = intern ("start-process");
5957 staticpro (&Qstart_process);
5958 /* Target PROGRAM is the third argument. */
5959 Fput (Qstart_process, Qtarget_idx, make_number (2));
5960
5961 Qopen_network_stream = intern ("open-network-stream");
5962 staticpro (&Qopen_network_stream);
5963 /* Target SERVICE is the fourth argument. */
5964 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
5965
5966 Qcoding_system = intern ("coding-system");
5967 staticpro (&Qcoding_system);
5968
5969 Qeol_type = intern ("eol-type");
5970 staticpro (&Qeol_type);
5971
5972 Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
5973 staticpro (&Qbuffer_file_coding_system);
5974
5975 Qpost_read_conversion = intern ("post-read-conversion");
5976 staticpro (&Qpost_read_conversion);
5977
5978 Qpre_write_conversion = intern ("pre-write-conversion");
5979 staticpro (&Qpre_write_conversion);
5980
5981 Qno_conversion = intern ("no-conversion");
5982 staticpro (&Qno_conversion);
5983
5984 Qundecided = intern ("undecided");
5985 staticpro (&Qundecided);
5986
5987 Qcoding_system_p = intern ("coding-system-p");
5988 staticpro (&Qcoding_system_p);
5989
5990 Qcoding_system_error = intern ("coding-system-error");
5991 staticpro (&Qcoding_system_error);
5992
5993 Fput (Qcoding_system_error, Qerror_conditions,
5994 Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
5995 Fput (Qcoding_system_error, Qerror_message,
5996 build_string ("Invalid coding system"));
5997
5998 Qcoding_category = intern ("coding-category");
5999 staticpro (&Qcoding_category);
6000 Qcoding_category_index = intern ("coding-category-index");
6001 staticpro (&Qcoding_category_index);
6002
6003 Vcoding_category_table
6004 = Fmake_vector (make_number (CODING_CATEGORY_IDX_MAX), Qnil);
6005 staticpro (&Vcoding_category_table);
6006 {
6007 int i;
6008 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
6009 {
6010 XVECTOR (Vcoding_category_table)->contents[i]
6011 = intern (coding_category_name[i]);
6012 Fput (XVECTOR (Vcoding_category_table)->contents[i],
6013 Qcoding_category_index, make_number (i));
6014 }
6015 }
6016
6017 Qtranslation_table = intern ("translation-table");
6018 staticpro (&Qtranslation_table);
6019 Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (1));
6020
6021 Qtranslation_table_id = intern ("translation-table-id");
6022 staticpro (&Qtranslation_table_id);
6023
6024 Qtranslation_table_for_decode = intern ("translation-table-for-decode");
6025 staticpro (&Qtranslation_table_for_decode);
6026
6027 Qtranslation_table_for_encode = intern ("translation-table-for-encode");
6028 staticpro (&Qtranslation_table_for_encode);
6029
6030 Qsafe_charsets = intern ("safe-charsets");
6031 staticpro (&Qsafe_charsets);
6032
6033 Qvalid_codes = intern ("valid-codes");
6034 staticpro (&Qvalid_codes);
6035
6036 Qemacs_mule = intern ("emacs-mule");
6037 staticpro (&Qemacs_mule);
6038
6039 Qraw_text = intern ("raw-text");
6040 staticpro (&Qraw_text);
6041
6042 defsubr (&Scoding_system_p);
6043 defsubr (&Sread_coding_system);
6044 defsubr (&Sread_non_nil_coding_system);
6045 defsubr (&Scheck_coding_system);
6046 defsubr (&Sdetect_coding_region);
6047 defsubr (&Sdetect_coding_string);
6048 defsubr (&Sdecode_coding_region);
6049 defsubr (&Sencode_coding_region);
6050 defsubr (&Sdecode_coding_string);
6051 defsubr (&Sencode_coding_string);
6052 defsubr (&Sdecode_sjis_char);
6053 defsubr (&Sencode_sjis_char);
6054 defsubr (&Sdecode_big5_char);
6055 defsubr (&Sencode_big5_char);
6056 defsubr (&Sset_terminal_coding_system_internal);
6057 defsubr (&Sset_safe_terminal_coding_system_internal);
6058 defsubr (&Sterminal_coding_system);
6059 defsubr (&Sset_keyboard_coding_system_internal);
6060 defsubr (&Skeyboard_coding_system);
6061 defsubr (&Sfind_operation_coding_system);
6062 defsubr (&Supdate_coding_systems_internal);
6063 defsubr (&Sset_coding_priority_internal);
6064
6065 DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
6066 "List of coding systems.\n\
6067 \n\
6068 Do not alter the value of this variable manually. This variable should be\n\
6069 updated by the functions `make-coding-system' and\n\
6070 `define-coding-system-alias'.");
6071 Vcoding_system_list = Qnil;
6072
6073 DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
6074 "Alist of coding system names.\n\
6075 Each element is one element list of coding system name.\n\
6076 This variable is given to `completing-read' as TABLE argument.\n\
6077 \n\
6078 Do not alter the value of this variable manually. This variable should be\n\
6079 updated by the functions `make-coding-system' and\n\
6080 `define-coding-system-alias'.");
6081 Vcoding_system_alist = Qnil;
6082
6083 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
6084 "List of coding-categories (symbols) ordered by priority.");
6085 {
6086 int i;
6087
6088 Vcoding_category_list = Qnil;
6089 for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
6090 Vcoding_category_list
6091 = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
6092 Vcoding_category_list);
6093 }
6094
6095 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
6096 "Specify the coding system for read operations.\n\
6097 It is useful to bind this variable with `let', but do not set it globally.\n\
6098 If the value is a coding system, it is used for decoding on read operation.\n\
6099 If not, an appropriate element is used from one of the coding system alists:\n\
6100 There are three such tables, `file-coding-system-alist',\n\
6101 `process-coding-system-alist', and `network-coding-system-alist'.");
6102 Vcoding_system_for_read = Qnil;
6103
6104 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
6105 "Specify the coding system for write operations.\n\
6106 Programs bind this variable with `let', but you should not set it globally.\n\
6107 If the value is a coding system, it is used for encoding of output,\n\
6108 when writing it to a file and when sending it to a file or subprocess.\n\
6109 \n\
6110 If this does not specify a coding system, an appropriate element\n\
6111 is used from one of the coding system alists:\n\
6112 There are three such tables, `file-coding-system-alist',\n\
6113 `process-coding-system-alist', and `network-coding-system-alist'.\n\
6114 For output to files, if the above procedure does not specify a coding system,\n\
6115 the value of `buffer-file-coding-system' is used.");
6116 Vcoding_system_for_write = Qnil;
6117
6118 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
6119 "Coding system used in the latest file or process I/O.");
6120 Vlast_coding_system_used = Qnil;
6121
6122 DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
6123 "*Non-nil means always inhibit code conversion of end-of-line format.\n\
6124 See info node `Coding Systems' and info node `Text and Binary' concerning\n\
6125 such conversion.");
6126 inhibit_eol_conversion = 0;
6127
6128 DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
6129 "Non-nil means process buffer inherits coding system of process output.\n\
6130 Bind it to t if the process output is to be treated as if it were a file\n\
6131 read from some filesystem.");
6132 inherit_process_coding_system = 0;
6133
6134 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
6135 "Alist to decide a coding system to use for a file I/O operation.\n\
6136 The format is ((PATTERN . VAL) ...),\n\
6137 where PATTERN is a regular expression matching a file name,\n\
6138 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
6139 If VAL is a coding system, it is used for both decoding and encoding\n\
6140 the file contents.\n\
6141 If VAL is a cons of coding systems, the car part is used for decoding,\n\
6142 and the cdr part is used for encoding.\n\
6143 If VAL is a function symbol, the function must return a coding system\n\
6144 or a cons of coding systems which are used as above.\n\
6145 \n\
6146 See also the function `find-operation-coding-system'\n\
6147 and the variable `auto-coding-alist'.");
6148 Vfile_coding_system_alist = Qnil;
6149
6150 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
6151 "Alist to decide a coding system to use for a process I/O operation.\n\
6152 The format is ((PATTERN . VAL) ...),\n\
6153 where PATTERN is a regular expression matching a program name,\n\
6154 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
6155 If VAL is a coding system, it is used for both decoding what received\n\
6156 from the program and encoding what sent to the program.\n\
6157 If VAL is a cons of coding systems, the car part is used for decoding,\n\
6158 and the cdr part is used for encoding.\n\
6159 If VAL is a function symbol, the function must return a coding system\n\
6160 or a cons of coding systems which are used as above.\n\
6161 \n\
6162 See also the function `find-operation-coding-system'.");
6163 Vprocess_coding_system_alist = Qnil;
6164
6165 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
6166 "Alist to decide a coding system to use for a network I/O operation.\n\
6167 The format is ((PATTERN . VAL) ...),\n\
6168 where PATTERN is a regular expression matching a network service name\n\
6169 or is a port number to connect to,\n\
6170 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
6171 If VAL is a coding system, it is used for both decoding what received\n\
6172 from the network stream and encoding what sent to the network stream.\n\
6173 If VAL is a cons of coding systems, the car part is used for decoding,\n\
6174 and the cdr part is used for encoding.\n\
6175 If VAL is a function symbol, the function must return a coding system\n\
6176 or a cons of coding systems which are used as above.\n\
6177 \n\
6178 See also the function `find-operation-coding-system'.");
6179 Vnetwork_coding_system_alist = Qnil;
6180
6181 DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
6182 "Coding system to use with system messages.");
6183 Vlocale_coding_system = Qnil;
6184
6185 /* The eol mnemonics are reset in startup.el system-dependently. */
6186 DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
6187 "*String displayed in mode line for UNIX-like (LF) end-of-line format.");
6188 eol_mnemonic_unix = build_string (":");
6189
6190 DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
6191 "*String displayed in mode line for DOS-like (CRLF) end-of-line format.");
6192 eol_mnemonic_dos = build_string ("\\");
6193
6194 DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
6195 "*String displayed in mode line for MAC-like (CR) end-of-line format.");
6196 eol_mnemonic_mac = build_string ("/");
6197
6198 DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
6199 "*String displayed in mode line when end-of-line format is not yet determined.");
6200 eol_mnemonic_undecided = build_string (":");
6201
6202 DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
6203 "*Non-nil enables character translation while encoding and decoding.");
6204 Venable_character_translation = Qt;
6205
6206 DEFVAR_LISP ("standard-translation-table-for-decode",
6207 &Vstandard_translation_table_for_decode,
6208 "Table for translating characters while decoding.");
6209 Vstandard_translation_table_for_decode = Qnil;
6210
6211 DEFVAR_LISP ("standard-translation-table-for-encode",
6212 &Vstandard_translation_table_for_encode,
6213 "Table for translationg characters while encoding.");
6214 Vstandard_translation_table_for_encode = Qnil;
6215
6216 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
6217 "Alist of charsets vs revision numbers.\n\
6218 While encoding, if a charset (car part of an element) is found,\n\
6219 designate it with the escape sequence identifing revision (cdr part of the element).");
6220 Vcharset_revision_alist = Qnil;
6221
6222 DEFVAR_LISP ("default-process-coding-system",
6223 &Vdefault_process_coding_system,
6224 "Cons of coding systems used for process I/O by default.\n\
6225 The car part is used for decoding a process output,\n\
6226 the cdr part is used for encoding a text to be sent to a process.");
6227 Vdefault_process_coding_system = Qnil;
6228
6229 DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
6230 "Table of extra Latin codes in the range 128..159 (inclusive).\n\
6231 This is a vector of length 256.\n\
6232 If Nth element is non-nil, the existence of code N in a file\n\
6233 \(or output of subprocess) doesn't prevent it to be detected as\n\
6234 a coding system of ISO 2022 variant which has a flag\n\
6235 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file\n\
6236 or reading output of a subprocess.\n\
6237 Only 128th through 159th elements has a meaning.");
6238 Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
6239
6240 DEFVAR_LISP ("select-safe-coding-system-function",
6241 &Vselect_safe_coding_system_function,
6242 "Function to call to select safe coding system for encoding a text.\n\
6243 \n\
6244 If set, this function is called to force a user to select a proper\n\
6245 coding system which can encode the text in the case that a default\n\
6246 coding system used in each operation can't encode the text.\n\
6247 \n\
6248 The default value is `select-safe-coding-system' (which see).");
6249 Vselect_safe_coding_system_function = Qnil;
6250
6251 }
6252
6253 char *
6254 emacs_strerror (error_number)
6255 int error_number;
6256 {
6257 char *str;
6258
6259 synchronize_system_messages_locale ();
6260 str = strerror (error_number);
6261
6262 if (! NILP (Vlocale_coding_system))
6263 {
6264 Lisp_Object dec = code_convert_string_norecord (build_string (str),
6265 Vlocale_coding_system,
6266 0);
6267 str = (char *) XSTRING (dec)->data;
6268 }
6269
6270 return str;
6271 }
6272
6273 #endif /* emacs */
6274