(DECODE_COMPOSITION_START): If coding->cmp_data is not
[bpt/emacs.git] / src / coding.c
1 /* Coding system handler (conversion, detection, and etc).
2 Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
3 Licensed to the Free Software Foundation.
4
5 This file is part of GNU Emacs.
6
7 GNU Emacs is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 2, or (at your option)
10 any later version.
11
12 GNU Emacs is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GNU Emacs; see the file COPYING. If not, write to
19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20 Boston, MA 02111-1307, USA. */
21
22 /*** TABLE OF CONTENTS ***
23
24 0. General comments
25 1. Preamble
26 2. Emacs' internal format (emacs-mule) handlers
27 3. ISO2022 handlers
28 4. Shift-JIS and BIG5 handlers
29 5. CCL handlers
30 6. End-of-line handlers
31 7. C library functions
32 8. Emacs Lisp library functions
33 9. Post-amble
34
35 */
36
37 /*** 0. General comments ***/
38
39
40 /*** GENERAL NOTE on CODING SYSTEM ***
41
42 Coding system is an encoding mechanism of one or more character
43 sets. Here's a list of coding systems which Emacs can handle. When
44 we say "decode", it means converting some other coding system to
45 Emacs' internal format (emacs-mule), and when we say "encode",
46 it means converting the coding system emacs-mule to some other
47 coding system.
48
49 0. Emacs' internal format (emacs-mule)
50
51 Emacs itself holds a multi-lingual character in a buffer and a string
52 in a special format. Details are described in section 2.
53
54 1. ISO2022
55
56 The most famous coding system for multiple character sets. X's
57 Compound Text, various EUCs (Extended Unix Code), and coding
58 systems used in Internet communication such as ISO-2022-JP are
59 all variants of ISO2022. Details are described in section 3.
60
61 2. SJIS (or Shift-JIS or MS-Kanji-Code)
62
63 A coding system to encode character sets: ASCII, JISX0201, and
64 JISX0208. Widely used for PC's in Japan. Details are described in
65 section 4.
66
67 3. BIG5
68
69 A coding system to encode character sets: ASCII and Big5. Widely
70 used by Chinese (mainly in Taiwan and Hong Kong). Details are
71 described in section 4. In this file, when we write "BIG5"
72 (all uppercase), we mean the coding system, and when we write
73 "Big5" (capitalized), we mean the character set.
74
75 4. Raw text
76
77 A coding system for a text containing random 8-bit code. Emacs does
78 no code conversion on such a text except for end-of-line format.
79
80 5. Other
81
82 If a user wants to read/write a text encoded in a coding system not
83 listed above, he can supply a decoder and an encoder for it in CCL
84 (Code Conversion Language) programs. Emacs executes the CCL program
85 while reading/writing.
86
87 Emacs represents a coding system by a Lisp symbol that has a property
88 `coding-system'. But, before actually using the coding system, the
89 information about it is set in a structure of type `struct
90 coding_system' for rapid processing. See section 6 for more details.
91
92 */
93
94 /*** GENERAL NOTES on END-OF-LINE FORMAT ***
95
96 How end-of-line of a text is encoded depends on a system. For
97 instance, Unix's format is just one byte of `line-feed' code,
98 whereas DOS's format is two-byte sequence of `carriage-return' and
99 `line-feed' codes. MacOS's format is usually one byte of
100 `carriage-return'.
101
102 Since text characters encoding and end-of-line encoding are
103 independent, any coding system described above can take
104 any format of end-of-line. So, Emacs has information of format of
105 end-of-line in each coding-system. See section 6 for more details.
106
107 */
108
109 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
110
111 These functions check if a text between SRC and SRC_END is encoded
112 in the coding system category XXX. Each returns an integer value in
113 which appropriate flag bits for the category XXX is set. The flag
114 bits are defined in macros CODING_CATEGORY_MASK_XXX. Below is the
115 template of these functions. */
116 #if 0
117 int
118 detect_coding_emacs_mule (src, src_end)
119 unsigned char *src, *src_end;
120 {
121 ...
122 }
123 #endif
124
125 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
126
127 These functions decode SRC_BYTES length of unibyte text at SOURCE
128 encoded in CODING to Emacs' internal format. The resulting
129 multibyte text goes to a place pointed to by DESTINATION, the length
130 of which should not exceed DST_BYTES.
131
132 These functions set the information of original and decoded texts in
133 the members produced, produced_char, consumed, and consumed_char of
134 the structure *CODING. They also set the member result to one of
135 CODING_FINISH_XXX indicating how the decoding finished.
136
137 DST_BYTES zero means that source area and destination area are
138 overlapped, which means that we can produce a decoded text until it
139 reaches at the head of not-yet-decoded source text.
140
141 Below is a template of these functions. */
142 #if 0
143 static void
144 decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
145 struct coding_system *coding;
146 unsigned char *source, *destination;
147 int src_bytes, dst_bytes;
148 {
149 ...
150 }
151 #endif
152
153 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
154
155 These functions encode SRC_BYTES length text at SOURCE of Emacs'
156 internal multibyte format to CODING. The resulting unibyte text
157 goes to a place pointed to by DESTINATION, the length of which
158 should not exceed DST_BYTES.
159
160 These functions set the information of original and encoded texts in
161 the members produced, produced_char, consumed, and consumed_char of
162 the structure *CODING. They also set the member result to one of
163 CODING_FINISH_XXX indicating how the encoding finished.
164
165 DST_BYTES zero means that source area and destination area are
166 overlapped, which means that we can produce an encoded text until it
167 reaches at the head of not-yet-encoded source text.
168
169 Below is a template of these functions. */
170 #if 0
171 static void
172 encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
173 struct coding_system *coding;
174 unsigned char *source, *destination;
175 int src_bytes, dst_bytes;
176 {
177 ...
178 }
179 #endif
180
181 /*** COMMONLY USED MACROS ***/
182
183 /* The following two macros ONE_MORE_BYTE and TWO_MORE_BYTES safely
184 get one and two bytes from the source text respectively.
185 If there are not enough bytes in the source, they jump to
186 `label_end_of_loop'. The caller should set variables `coding',
187 `src' and `src_end' to appropriate pointer in advance. These
188 macros are called from decoding routines `decode_coding_XXX', thus
189 it is assumed that the source text is unibyte. */
190
/* Store the byte at `src' into C1 and advance `src' past it.  If
   `src' has already reached `src_end', set `coding->result' to
   CODING_FINISH_INSUFFICIENT_SRC and jump to `label_end_of_loop'
   instead.  */
#define ONE_MORE_BYTE(c1) \
  do { \
    if (src < src_end) \
      c1 = *src++; \
    else \
      { \
        coding->result = CODING_FINISH_INSUFFICIENT_SRC; \
        goto label_end_of_loop; \
      } \
  } while (0)
200
/* Store the next two bytes at `src' into C1 and C2 (in that order)
   and advance `src' past both.  If fewer than two bytes remain before
   `src_end', nothing is read: `coding->result' is set to
   CODING_FINISH_INSUFFICIENT_SRC and control jumps to
   `label_end_of_loop'.  */
#define TWO_MORE_BYTES(c1, c2) \
  do { \
    if (src + 1 < src_end) \
      { \
        c1 = *src++; \
        c2 = *src++; \
      } \
    else \
      { \
        coding->result = CODING_FINISH_INSUFFICIENT_SRC; \
        goto label_end_of_loop; \
      } \
  } while (0)
211
212
213 /* Set C to the next character at the source text pointed by `src'.
214 If there are not enough characters in the source, jump to
215 `label_end_of_loop'. The caller should set variables `coding'
216 `src', `src_end', and `translation_table' to appropriate pointers
217 in advance. This macro is used in encoding routines
218 `encode_coding_XXX', thus it assumes that the source text is in
219 multibyte form except for 8-bit characters. 8-bit characters are
220 in multibyte form if coding->src_multibyte is nonzero, else they
221 are represented by a single byte. */
222
/* Read one character from `src' into C and advance `src' by the
   number of bytes it occupied.

   With no bytes left before `src_end', set `coding->result' to
   CODING_FINISH_INSUFFICIENT_SRC and jump to `label_end_of_loop'.
   A byte is taken verbatim (length 1) only when
   `coding->src_multibyte' is zero and the bytes at `src' do not form
   a multibyte sequence; otherwise the character is parsed with
   STRING_CHAR_AND_LENGTH.  The result is then passed through
   `translation_table' unless that table is nil.  */
#define ONE_MORE_CHAR(c) \
  do { \
    int len = src_end - src; \
    int bytes; \
    if (len <= 0) \
      { \
        coding->result = CODING_FINISH_INSUFFICIENT_SRC; \
        goto label_end_of_loop; \
      } \
    if (! coding->src_multibyte \
        && ! UNIBYTE_STR_AS_MULTIBYTE_P (src, len, bytes)) \
      { \
        c = *src; \
        bytes = 1; \
      } \
    else \
      c = STRING_CHAR_AND_LENGTH (src, len, bytes); \
    if (! NILP (translation_table)) \
      c = translate_char (translation_table, c, 0, 0, 0); \
    src += bytes; \
  } while (0)
241
242
243 /* Produce a multibyte form of character C to `dst'. Jump to
244 `label_end_of_loop' if there's not enough space at `dst'.
245
246 If we are now in the middle of composition sequence, the decoded
247 character may be ALTCHAR (for the current composition). In that
248 case, the character goes to coding->cmp_data->data instead of
249 `dst'.
250
251 This macro is used in decoding routines. */
252
/* Emit character C while decoding.

   C is written to `dst' in multibyte form (and
   `coding->produced_char' incremented) when no composition is in
   progress, or when the current composition is COMPOSITION_RELATIVE
   or COMPOSITION_WITH_RULE.  If the bytes would run past the output
   limit (`dst_end', or `src' when `dst_bytes' is zero),
   `coding->result' is set to CODING_FINISH_INSUFFICIENT_DST and
   control jumps to `label_end_of_loop'.

   Independently, when a composition other than COMPOSITION_RELATIVE
   is in progress, C is recorded as a composition component and
   `coding->composition_rule_follows' is set to 1 unless the method is
   COMPOSITION_WITH_ALTCHARS, in which case it is set to 0.  */
#define EMIT_CHAR(c) \
  do { \
    if (! (COMPOSING_P (coding) \
           && coding->composing != COMPOSITION_RELATIVE \
           && coding->composing != COMPOSITION_WITH_RULE)) \
      { \
        int bytes = CHAR_BYTES (c); \
        if ((dst_bytes ? dst_end : src) < (dst + bytes)) \
          { \
            coding->result = CODING_FINISH_INSUFFICIENT_DST; \
            goto label_end_of_loop; \
          } \
        dst += CHAR_STRING (c, dst); \
        coding->produced_char++; \
      } \
 \
    if (COMPOSING_P (coding) \
        && coding->composing != COMPOSITION_RELATIVE) \
      { \
        CODING_ADD_COMPOSITION_COMPONENT (coding, c); \
        coding->composition_rule_follows \
          = (coding->composing == COMPOSITION_WITH_ALTCHARS ? 0 : 1); \
      } \
  } while (0)
277
278
/* Append the single byte C at `dst'.  The output limit is `dst_end',
   or `src' when `dst_bytes' is zero; if `dst' has reached it, set
   `coding->result' to CODING_FINISH_INSUFFICIENT_DST and jump to
   `label_end_of_loop' without writing.  */
#define EMIT_ONE_BYTE(c) \
  do { \
    if (dst < (dst_bytes ? dst_end : src)) \
      *dst++ = c; \
    else \
      { \
        coding->result = CODING_FINISH_INSUFFICIENT_DST; \
        goto label_end_of_loop; \
      } \
  } while (0)
288
/* Append the bytes C1 then C2 at `dst'.  If two bytes do not fit
   below the output limit (`dst_end', or `src' when `dst_bytes' is
   zero), write nothing, set `coding->result' to
   CODING_FINISH_INSUFFICIENT_DST and jump to `label_end_of_loop'.  */
#define EMIT_TWO_BYTES(c1, c2) \
  do { \
    if (dst + 2 <= (dst_bytes ? dst_end : src)) \
      { \
        *dst++ = c1; \
        *dst++ = c2; \
      } \
    else \
      { \
        coding->result = CODING_FINISH_INSUFFICIENT_DST; \
        goto label_end_of_loop; \
      } \
  } while (0)
298
/* Copy the bytes in the range [FROM, TO) to `dst'.  FROM must be a
   modifiable pointer: it is advanced up to TO as the bytes are
   copied.  If the whole range does not fit below the output limit
   (`dst_end', or `src' when `dst_bytes' is zero), copy nothing, set
   `coding->result' to CODING_FINISH_INSUFFICIENT_DST and jump to
   `label_end_of_loop'.  */
#define EMIT_BYTES(from, to) \
  do { \
    if (dst + (to - from) > (dst_bytes ? dst_end : src)) \
      { \
        coding->result = CODING_FINISH_INSUFFICIENT_DST; \
        goto label_end_of_loop; \
      } \
    for (; from < to; from++) \
      *dst++ = *from; \
  } while (0)
309
310 \f
311 /*** 1. Preamble ***/
312
313 #ifdef emacs
314 #include <config.h>
315 #endif
316
317 #include <stdio.h>
318
319 #ifdef emacs
320
321 #include "lisp.h"
322 #include "buffer.h"
323 #include "charset.h"
324 #include "composite.h"
325 #include "ccl.h"
326 #include "coding.h"
327 #include "window.h"
328
329 #else /* not emacs */
330
331 #include "mulelib.h"
332
333 #endif /* not emacs */
334
335 Lisp_Object Qcoding_system, Qeol_type;
336 Lisp_Object Qbuffer_file_coding_system;
337 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
338 Lisp_Object Qno_conversion, Qundecided;
339 Lisp_Object Qcoding_system_history;
340 Lisp_Object Qsafe_charsets;
341 Lisp_Object Qvalid_codes;
342
343 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
344 Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
345 Lisp_Object Qstart_process, Qopen_network_stream;
346 Lisp_Object Qtarget_idx;
347
348 Lisp_Object Vselect_safe_coding_system_function;
349
350 /* Mnemonic string for each format of end-of-line. */
351 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
352 /* Mnemonic string to indicate format of end-of-line is not yet
353 decided. */
354 Lisp_Object eol_mnemonic_undecided;
355
356 /* Format of end-of-line decided by system. This is CODING_EOL_LF on
357 Unix, CODING_EOL_CRLF on DOS/Windows, and CODING_EOL_CR on Mac. */
358 int system_eol_type;
359
360 #ifdef emacs
361
362 Lisp_Object Vcoding_system_list, Vcoding_system_alist;
363
364 Lisp_Object Qcoding_system_p, Qcoding_system_error;
365
366 /* Coding system emacs-mule and raw-text are for converting only
367 end-of-line format. */
368 Lisp_Object Qemacs_mule, Qraw_text;
369
370 /* Coding-systems are handed between Emacs Lisp programs and C internal
371 routines by the following three variables. */
372 /* Coding-system for reading files and receiving data from process. */
373 Lisp_Object Vcoding_system_for_read;
374 /* Coding-system for writing files and sending data to process. */
375 Lisp_Object Vcoding_system_for_write;
376 /* Coding-system actually used in the latest I/O. */
377 Lisp_Object Vlast_coding_system_used;
378
379 /* A vector of length 256 which contains information about special
380 Latin codes (especially for dealing with Microsoft codes). */
381 Lisp_Object Vlatin_extra_code_table;
382
383 /* Flag to inhibit code conversion of end-of-line format. */
384 int inhibit_eol_conversion;
385
386 /* Flag to make buffer-file-coding-system inherit from process-coding. */
387 int inherit_process_coding_system;
388
389 /* Coding system to be used to encode text for terminal display. */
390 struct coding_system terminal_coding;
391
392 /* Coding system to be used to encode text for terminal display when
393 terminal coding system is nil. */
394 struct coding_system safe_terminal_coding;
395
396 /* Coding system of what is sent from terminal keyboard. */
397 struct coding_system keyboard_coding;
398
399 /* Default coding system to be used to write a file. */
400 struct coding_system default_buffer_file_coding;
401
402 Lisp_Object Vfile_coding_system_alist;
403 Lisp_Object Vprocess_coding_system_alist;
404 Lisp_Object Vnetwork_coding_system_alist;
405
406 Lisp_Object Vlocale_coding_system;
407
408 #endif /* emacs */
409
410 Lisp_Object Qcoding_category, Qcoding_category_index;
411
412 /* List of symbols `coding-category-xxx' ordered by priority. */
413 Lisp_Object Vcoding_category_list;
414
415 /* Table of coding categories (Lisp symbols). */
416 Lisp_Object Vcoding_category_table;
417
418 /* Table of names of symbol for each coding-category. */
419 char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
420 "coding-category-emacs-mule",
421 "coding-category-sjis",
422 "coding-category-iso-7",
423 "coding-category-iso-7-tight",
424 "coding-category-iso-8-1",
425 "coding-category-iso-8-2",
426 "coding-category-iso-7-else",
427 "coding-category-iso-8-else",
428 "coding-category-ccl",
429 "coding-category-big5",
430 "coding-category-utf-8",
431 "coding-category-utf-16-be",
432 "coding-category-utf-16-le",
433 "coding-category-raw-text",
434 "coding-category-binary"
435 };
436
437 /* Table of pointers to coding systems corresponding to each coding
438 categories. */
439 struct coding_system *coding_system_table[CODING_CATEGORY_IDX_MAX];
440
441 /* Table of coding category masks. Nth element is a mask for a coding
442 category of which priority is Nth. */
443 static
444 int coding_priorities[CODING_CATEGORY_IDX_MAX];
445
446 /* Flag to tell if we look up translation table on character code
447 conversion. */
448 Lisp_Object Venable_character_translation;
449 /* Standard translation table to look up on decoding (reading). */
450 Lisp_Object Vstandard_translation_table_for_decode;
451 /* Standard translation table to look up on encoding (writing). */
452 Lisp_Object Vstandard_translation_table_for_encode;
453
454 Lisp_Object Qtranslation_table;
455 Lisp_Object Qtranslation_table_id;
456 Lisp_Object Qtranslation_table_for_decode;
457 Lisp_Object Qtranslation_table_for_encode;
458
459 /* Alist of charsets vs revision number. */
460 Lisp_Object Vcharset_revision_alist;
461
462 /* Default coding systems used for process I/O. */
463 Lisp_Object Vdefault_process_coding_system;
464
465 /* Global flag to tell that we can't call post-read-conversion and
466 pre-write-conversion functions. Usually the value is zero, but it
467 is set to 1 temporarily while such functions are running. This is
468 to avoid infinite recursive call. */
469 static int inhibit_pre_post_conversion;
470
471 \f
472 /*** 2. Emacs internal format (emacs-mule) handlers ***/
473
474 /* Emacs' internal format for encoding multiple character sets is a
475 kind of multi-byte encoding, i.e. characters are encoded by
476 variable-length sequences of one-byte codes.
477
478 ASCII characters and control characters (e.g. `tab', `newline') are
479 represented by one-byte sequences which are their ASCII codes, in
480 the range 0x00 through 0x7F.
481
482 8-bit characters of the range 0x80..0x9F are represented by
483 two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
484 code + 0x20).
485
486 8-bit characters of the range 0xA0..0xFF are represented by
487 one-byte sequences which are their 8-bit code.
488
489 The other characters are represented by a sequence of `base
490 leading-code', optional `extended leading-code', and one or two
491 `position-code's. The length of the sequence is determined by the
492 base leading-code. Leading-code takes the range 0x80 through 0x9F,
493 whereas extended leading-code and position-code take the range 0xA0
494 through 0xFF. See `charset.h' for more details about leading-code
495 and position-code.
496
497 --- CODE RANGE of Emacs' internal format ---
498 character set range
499 ------------- -----
500 ascii 0x00..0x7F
501 eight-bit-control LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
502 eight-bit-graphic 0xA0..0xFF
503 ELSE 0x81..0x9F + [0xA0..0xFF]+
504 ---------------------------------------------
505
506 */
507
508 enum emacs_code_class_type emacs_code_class[256];
509
510 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
511 Check if a text is encoded in Emacs' internal format. If it is,
512 return CODING_CATEGORY_MASK_EMACS_MULE, else return 0. */
513
514 int
515 detect_coding_emacs_mule (src, src_end)
516 unsigned char *src, *src_end;
517 {
518 unsigned char c;
519 int composing = 0;
520 /* Dummy for ONE_MORE_BYTE. */
521 struct coding_system dummy_coding;
522 struct coding_system *coding = &dummy_coding;
523
524 while (1)
525 {
526 ONE_MORE_BYTE (c);
527
528 if (composing)
529 {
530 if (c < 0xA0)
531 composing = 0;
532 else if (c == 0xA0)
533 {
534 ONE_MORE_BYTE (c);
535 c &= 0x7F;
536 }
537 else
538 c -= 0x20;
539 }
540
541 if (c < 0x20)
542 {
543 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
544 return 0;
545 }
546 else if (c >= 0x80 && c < 0xA0)
547 {
548 if (c == 0x80)
549 /* Old leading code for a composite character. */
550 composing = 1;
551 else
552 {
553 unsigned char *src_base = src - 1;
554 int bytes;
555
556 if (!UNIBYTE_STR_AS_MULTIBYTE_P (src_base, src_end - src_base,
557 bytes))
558 return 0;
559 src = src_base + bytes;
560 }
561 }
562 }
563 label_end_of_loop:
564 return CODING_CATEGORY_MASK_EMACS_MULE;
565 }
566
567
568 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
569
570 static void
571 decode_coding_emacs_mule (coding, source, destination, src_bytes, dst_bytes)
572 struct coding_system *coding;
573 unsigned char *source, *destination;
574 int src_bytes, dst_bytes;
575 {
576 unsigned char *src = source;
577 unsigned char *src_end = source + src_bytes;
578 unsigned char *dst = destination;
579 unsigned char *dst_end = destination + dst_bytes;
580 /* SRC_BASE remembers the start position in source in each loop.
581 The loop will be exited when there's not enough source code, or
582 when there's not enough destination area to produce a
583 character. */
584 unsigned char *src_base;
585
586 coding->produced_char = 0;
587 while (src < src_end)
588 {
589 unsigned char tmp[MAX_MULTIBYTE_LENGTH], *p;
590 int bytes;
591
592 src_base = src;
593 if (UNIBYTE_STR_AS_MULTIBYTE_P (src, src_end - src, bytes))
594 {
595 p = src;
596 src += bytes;
597 }
598 else
599 {
600 bytes = CHAR_STRING (*src, tmp);
601 p = tmp;
602 src++;
603 }
604 if (dst + bytes >= (dst_bytes ? dst_end : src))
605 {
606 coding->result = CODING_FINISH_INSUFFICIENT_DST;
607 break;
608 }
609 while (bytes--) *dst++ = *p++;
610 coding->produced_char++;
611 }
612 coding->consumed = coding->consumed_char = src_base - source;
613 coding->produced = dst - destination;
614 }
615
/* Encoding to emacs-mule is delegated entirely to encode_eol; the
   arguments are passed through unchanged.  */
#define encode_coding_emacs_mule(coding, source, destination, \
                                 src_bytes, dst_bytes) \
  encode_eol (coding, source, destination, src_bytes, dst_bytes)
618
619
620 \f
621 /*** 3. ISO2022 handlers ***/
622
623 /* The following note describes the coding system ISO2022 briefly.
624 Since the intention of this note is to help understand the
625 functions in this file, some parts are NOT ACCURATE or OVERLY
626 SIMPLIFIED. For thorough understanding, please refer to the
627 original document of ISO2022.
628
629 ISO2022 provides many mechanisms to encode several character sets
630 in 7-bit and 8-bit environments. For 7-bit environments, all text
631 is encoded using bytes less than 128. This may make the encoded
632 text a little bit longer, but the text passes more easily through
633 several gateways, some of which strip off MSB (Most Significant Bit).
634
635 There are two kinds of character sets: control character set and
636 graphic character set. The former contains control characters such
637 as `newline' and `escape' to provide control functions (control
638 functions are also provided by escape sequences). The latter
639 contains graphic characters such as 'A' and '-'. Emacs recognizes
640 two control character sets and many graphic character sets.
641
642 Graphic character sets are classified into one of the following
643 four classes, according to the number of bytes (DIMENSION) and
644 number of characters in one dimension (CHARS) of the set:
645 - DIMENSION1_CHARS94
646 - DIMENSION1_CHARS96
647 - DIMENSION2_CHARS94
648 - DIMENSION2_CHARS96
649
650 In addition, each character set is assigned an identification tag,
651 unique for each set, called "final character" (denoted as <F>
652 hereafter). The <F> of each character set is decided by ECMA(*)
653 when it is registered in ISO. The code range of <F> is 0x30..0x7F
654 (0x30..0x3F are for private use only).
655
656 Note (*): ECMA = European Computer Manufacturers Association
657
658 Here are examples of graphic character set [NAME(<F>)]:
659 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
660 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
661 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
662 o DIMENSION2_CHARS96 -- none for the moment
663
664 A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
665 C0 [0x00..0x1F] -- control character plane 0
666 GL [0x20..0x7F] -- graphic character plane 0
667 C1 [0x80..0x9F] -- control character plane 1
668 GR [0xA0..0xFF] -- graphic character plane 1
669
670 A control character set is directly designated and invoked to C0 or
671 C1 by an escape sequence. The most common case is that:
672 - ISO646's control character set is designated/invoked to C0, and
673 - ISO6429's control character set is designated/invoked to C1,
674 and usually these designations/invocations are omitted in encoded
675 text. In a 7-bit environment, only C0 can be used, and a control
676 character for C1 is encoded by an appropriate escape sequence to
677 fit into the environment. All control characters for C1 are
678 defined to have corresponding escape sequences.
679
680 A graphic character set is at first designated to one of four
681 graphic registers (G0 through G3), then these graphic registers are
682 invoked to GL or GR. These designations and invocations can be
683 done independently. The most common case is that G0 is invoked to
684 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
685 these invocations and designations are omitted in encoded text.
686 In a 7-bit environment, only GL can be used.
687
688 When a graphic character set of CHARS94 is invoked to GL, codes
689 0x20 and 0x7F of the GL area work as control characters SPACE and
690 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
691 be used.
692
693 There are two ways of invocation: locking-shift and single-shift.
694 With locking-shift, the invocation lasts until the next different
695 invocation, whereas with single-shift, the invocation affects the
696 following character only and doesn't affect the locking-shift
697 state. Invocations are done by the following control characters or
698 escape sequences:
699
700 ----------------------------------------------------------------------
701 abbrev function cntrl escape seq description
702 ----------------------------------------------------------------------
703 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
704 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
705 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
706 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
707 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
708 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
709 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
710 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
711 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
712 ----------------------------------------------------------------------
713 (*) These are not used by any known coding system.
714
715 Control characters for these functions are defined by macros
716 ISO_CODE_XXX in `coding.h'.
717
718 Designations are done by the following escape sequences:
719 ----------------------------------------------------------------------
720 escape sequence description
721 ----------------------------------------------------------------------
722 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
723 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
724 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
725 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
726 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
727 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
728 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
729 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
730 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
731 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
732 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
733 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
734 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
735 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
736 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
737 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
738 ----------------------------------------------------------------------
739
740 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
741 of dimension 1, chars 94, and final character <F>, etc...
742
743 Note (*): Although these designations are not allowed in ISO2022,
744 Emacs accepts them on decoding, and produces them on encoding
745 CHARS96 character sets in a coding system which is characterized as
746 7-bit environment, non-locking-shift, and non-single-shift.
747
748 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
749 '(' can be omitted. We refer to this as "short-form" hereafter.
750
751 Now you may notice that there are a lot of ways for encoding the
752 same multilingual text in ISO2022. Actually, there exist many
753 coding systems such as Compound Text (used in X11's inter client
754 communication), ISO-2022-JP (used in Japanese internet), ISO-2022-KR
755 (used in Korean internet), EUC (Extended UNIX Code, used in Asian
756 localized platforms), and all of these are variants of ISO2022.
757
758 In addition to the above, Emacs handles two more kinds of escape
759 sequences: ISO6429's direction specification and Emacs' private
760 sequence for specifying character composition.
761
762 ISO6429's direction specification takes the following form:
763 o CSI ']' -- end of the current direction
764 o CSI '0' ']' -- end of the current direction
765 o CSI '1' ']' -- start of left-to-right text
766 o CSI '2' ']' -- start of right-to-left text
767 The control character CSI (0x9B: control sequence introducer) is
768 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
769
770 Character composition specification takes the following form:
771 o ESC '0' -- start relative composition
772 o ESC '1' -- end composition
773 o ESC '2' -- start rule-base composition (*)
774 o ESC '3' -- start relative composition with alternate chars (**)
775 o ESC '4' -- start rule-base composition with alternate chars (**)
776 Since these are not standard escape sequences of any ISO standard,
777 the use of them for these meaning is restricted to Emacs only.
778
779 (*) This form is used only in Emacs 20.5 and the older versions,
780 but the newer versions can safely decode it.
781 (**) This form is used only in Emacs 21.1 and the newer versions,
782 and the older versions can't decode it.
783
784 Here's a list of example usages of these composition escape
785 sequences (categorized by `enum composition_method').
786
787 COMPOSITION_RELATIVE:
788 ESC 0 CHAR [ CHAR ] ESC 1
789 COMPOSITION_WITH_RULE:
790 ESC 2 CHAR [ RULE CHAR ] ESC 1
791 COMPOSITION_WITH_ALTCHARS:
792 ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
793 COMPOSITION_WITH_RULE_ALTCHARS:
794 ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
795
796 enum iso_code_class_type iso_code_class[256];
797
/* Nonzero if the coding system registered for category index IDX
   exists and accepts CHARSET: either CHARSET is flagged in its
   `safe_charsets' table, or it has a requested designation for
   CHARSET.  */
#define CHARSET_OK(idx, charset) \
  (coding_system_table[idx] != 0 \
   && (coding_system_table[idx]->safe_charsets[charset] != 0 \
       || (CODING_SPEC_ISO_REQUESTED_DESIGNATION \
           (coding_system_table[idx], charset) \
           != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)))
804
/* Nonzero if the coding system registered for category index IDX
   exists and has an initial designation (a value >= 0) for graphic
   register 1.  The table entry is checked first, as CHARSET_OK does,
   so that an empty slot yields 0 instead of being passed to the
   accessor.
   NOTE(review): assumes CODING_SPEC_ISO_INITIAL_DESIGNATION
   dereferences its first argument -- confirm in coding.h.  */
#define SHIFT_OUT_OK(idx) \
  (coding_system_table[idx] \
   && (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding_system_table[idx], 1) \
       >= 0))
807
808 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
809 Check if a text is encoded in ISO2022. If it is, returns an
810 integer in which appropriate flag bits any of:
811 CODING_CATEGORY_MASK_ISO_7
812 CODING_CATEGORY_MASK_ISO_7_TIGHT
813 CODING_CATEGORY_MASK_ISO_8_1
814 CODING_CATEGORY_MASK_ISO_8_2
815 CODING_CATEGORY_MASK_ISO_7_ELSE
816 CODING_CATEGORY_MASK_ISO_8_ELSE
817 are set. If a code which should never appear in ISO2022 is found,
818 returns 0. */
819
820 int
821 detect_coding_iso2022 (src, src_end)
822 unsigned char *src, *src_end;
823 {
824 int mask = CODING_CATEGORY_MASK_ISO;
825 int mask_found = 0;
826 int reg[4], shift_out = 0, single_shifting = 0;
827 int c, c1, i, charset;
828 /* Dummy for ONE_MORE_BYTE. */
829 struct coding_system dummy_coding;
830 struct coding_system *coding = &dummy_coding;
831
832 reg[0] = CHARSET_ASCII, reg[1] = reg[2] = reg[3] = -1;
833 while (mask && src < src_end)
834 {
835 ONE_MORE_BYTE (c);
836 switch (c)
837 {
838 case ISO_CODE_ESC:
839 single_shifting = 0;
840 ONE_MORE_BYTE (c);
841 if (c >= '(' && c <= '/')
842 {
843 /* Designation sequence for a charset of dimension 1. */
844 ONE_MORE_BYTE (c1);
845 if (c1 < ' ' || c1 >= 0x80
846 || (charset = iso_charset_table[0][c >= ','][c1]) < 0)
847 /* Invalid designation sequence. Just ignore. */
848 break;
849 reg[(c - '(') % 4] = charset;
850 }
851 else if (c == '$')
852 {
853 /* Designation sequence for a charset of dimension 2. */
854 ONE_MORE_BYTE (c);
855 if (c >= '@' && c <= 'B')
856 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
857 reg[0] = charset = iso_charset_table[1][0][c];
858 else if (c >= '(' && c <= '/')
859 {
860 ONE_MORE_BYTE (c1);
861 if (c1 < ' ' || c1 >= 0x80
862 || (charset = iso_charset_table[1][c >= ','][c1]) < 0)
863 /* Invalid designation sequence. Just ignore. */
864 break;
865 reg[(c - '(') % 4] = charset;
866 }
867 else
868 /* Invalid designation sequence. Just ignore. */
869 break;
870 }
871 else if (c == 'N' || c == 'O')
872 {
873 /* ESC <Fe> for SS2 or SS3. */
874 mask &= CODING_CATEGORY_MASK_ISO_7_ELSE;
875 break;
876 }
877 else if (c >= '0' && c <= '4')
878 {
879 /* ESC <Fp> for start/end composition. */
880 mask_found |= CODING_CATEGORY_MASK_ISO;
881 break;
882 }
883 else
884 /* Invalid escape sequence. Just ignore. */
885 break;
886
887 /* We found a valid designation sequence for CHARSET. */
888 mask &= ~CODING_CATEGORY_MASK_ISO_8BIT;
889 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7, charset))
890 mask_found |= CODING_CATEGORY_MASK_ISO_7;
891 else
892 mask &= ~CODING_CATEGORY_MASK_ISO_7;
893 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT, charset))
894 mask_found |= CODING_CATEGORY_MASK_ISO_7_TIGHT;
895 else
896 mask &= ~CODING_CATEGORY_MASK_ISO_7_TIGHT;
897 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_ELSE, charset))
898 mask_found |= CODING_CATEGORY_MASK_ISO_7_ELSE;
899 else
900 mask &= ~CODING_CATEGORY_MASK_ISO_7_ELSE;
901 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_8_ELSE, charset))
902 mask_found |= CODING_CATEGORY_MASK_ISO_8_ELSE;
903 else
904 mask &= ~CODING_CATEGORY_MASK_ISO_8_ELSE;
905 break;
906
907 case ISO_CODE_SO:
908 single_shifting = 0;
909 if (shift_out == 0
910 && (reg[1] >= 0
911 || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_7_ELSE)
912 || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_8_ELSE)))
913 {
914 /* Locking shift out. */
915 mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
916 mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
917 }
918 break;
919
920 case ISO_CODE_SI:
921 single_shifting = 0;
922 if (shift_out == 1)
923 {
924 /* Locking shift in. */
925 mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
926 mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
927 }
928 break;
929
930 case ISO_CODE_CSI:
931 single_shifting = 0;
932 case ISO_CODE_SS2:
933 case ISO_CODE_SS3:
934 {
935 int newmask = CODING_CATEGORY_MASK_ISO_8_ELSE;
936
937 if (c != ISO_CODE_CSI)
938 {
939 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
940 & CODING_FLAG_ISO_SINGLE_SHIFT)
941 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
942 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
943 & CODING_FLAG_ISO_SINGLE_SHIFT)
944 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
945 single_shifting = 1;
946 }
947 if (VECTORP (Vlatin_extra_code_table)
948 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
949 {
950 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
951 & CODING_FLAG_ISO_LATIN_EXTRA)
952 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
953 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
954 & CODING_FLAG_ISO_LATIN_EXTRA)
955 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
956 }
957 mask &= newmask;
958 mask_found |= newmask;
959 }
960 break;
961
962 default:
963 if (c < 0x80)
964 {
965 single_shifting = 0;
966 break;
967 }
968 else if (c < 0xA0)
969 {
970 single_shifting = 0;
971 if (VECTORP (Vlatin_extra_code_table)
972 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
973 {
974 int newmask = 0;
975
976 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
977 & CODING_FLAG_ISO_LATIN_EXTRA)
978 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
979 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
980 & CODING_FLAG_ISO_LATIN_EXTRA)
981 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
982 mask &= newmask;
983 mask_found |= newmask;
984 }
985 else
986 return 0;
987 }
988 else
989 {
990 unsigned char *src_begin = src;
991
992 mask &= ~(CODING_CATEGORY_MASK_ISO_7BIT
993 | CODING_CATEGORY_MASK_ISO_7_ELSE);
994 mask_found |= CODING_CATEGORY_MASK_ISO_8_1;
995 /* Check the length of succeeding codes of the range
996 0xA0..0FF. If the byte length is odd, we exclude
997 CODING_CATEGORY_MASK_ISO_8_2. We can check this only
998 when we are not single shifting. */
999 if (!single_shifting
1000 && mask & CODING_CATEGORY_MASK_ISO_8_2)
1001 {
1002 int i = 0;
1003 while (src < src_end)
1004 {
1005 ONE_MORE_BYTE (c);
1006 if (c < 0xA0)
1007 break;
1008 i++;
1009 }
1010
1011 if (i & 1 && src < src_end)
1012 mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
1013 else
1014 mask_found |= CODING_CATEGORY_MASK_ISO_8_2;
1015 }
1016 }
1017 break;
1018 }
1019 }
1020 label_end_of_loop:
1021 return (mask & mask_found);
1022 }
1023
/* Build and return the character code for the character of charset
   CHARSET whose 1st position code is C1 and 2nd position code is C2.
   When the variable `translation_table' is non-nil the code comes
   from translate_char using that table; otherwise it is built
   directly with MAKE_CHAR.  */

#define DECODE_ISO_CHARACTER(charset, c1, c2)                           \
  (!NILP (translation_table)                                            \
   ? translate_char (translation_table, -1, charset, c1, c2)            \
   : MAKE_CHAR (charset, c1, c2))
1033
/* Record in CODING that the charset identified by DIMENSION, CHARS
   and FINAL_CHAR is designated to graphic register REG.

   Control jumps to label_invalid_code (which the expanding function
   must provide) when FINAL_CHAR is out of range, when no acceptable
   charset matches, or when ASCII is designated to register 0 right
   after an invalid designation to register 0.  */
#define DECODE_DESIGNATION(reg, dimension, chars, final_char)           \
  do {                                                                  \
    int cs;                                                             \
                                                                        \
    if ((final_char) < '0' || (final_char) >= 128)                      \
      goto label_invalid_code;                                          \
    cs = ISO_CHARSET_TABLE (make_number (dimension),                    \
                            make_number (chars),                        \
                            make_number (final_char));                  \
    /* Reject an unknown charset, and one that is neither requested    \
       for REG nor listed as safe.  Remember the register.  */          \
    if (cs < 0                                                          \
        || (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, cs) != (reg) \
            && !coding->safe_charsets[cs]))                             \
      {                                                                 \
        coding->spec.iso2022.last_invalid_designation_register = (reg); \
        goto label_invalid_code;                                        \
      }                                                                 \
    if (coding->spec.iso2022.last_invalid_designation_register == 0     \
        && (reg) == 0                                                   \
        && cs == CHARSET_ASCII)                                         \
      {                                                                 \
        /* We should insert this designation sequence as is so         \
           that it is surely written back to a file.  */                \
        coding->spec.iso2022.last_invalid_designation_register = -1;    \
        goto label_invalid_code;                                        \
      }                                                                 \
    coding->spec.iso2022.last_invalid_designation_register = -1;        \
    /* In the non-default direction, prefer the reverse charset.  */    \
    if ((coding->mode & CODING_MODE_DIRECTION)                          \
        && CHARSET_REVERSE_CHARSET (cs) >= 0)                           \
      cs = CHARSET_REVERSE_CHARSET (cs);                                \
    CODING_SPEC_ISO_DESIGNATION (coding, reg) = cs;                     \
  } while (0)
1069
1070 /* Allocate a memory block for storing information about compositions.
1071 The block is chained to the already allocated blocks. */
1072
1073 void
1074 coding_allocate_composition_data (coding, char_offset)
1075 struct coding_system *coding;
1076 int char_offset;
1077 {
1078 struct composition_data *cmp_data
1079 = (struct composition_data *) xmalloc (sizeof *cmp_data);
1080
1081 cmp_data->char_offset = char_offset;
1082 cmp_data->used = 0;
1083 cmp_data->prev = coding->cmp_data;
1084 cmp_data->next = NULL;
1085 if (coding->cmp_data)
1086 coding->cmp_data->next = cmp_data;
1087 coding->cmp_data = cmp_data;
1088 coding->cmp_data_start = 0;
1089 }
1090
/* Open a new composition record in the current block of CODING,
   storing the starting position START (relative to the block's
   char_offset) and the composition METHOD.  Four slots are reserved;
   slot 0 is set to -1 and slot 2 is left untouched here.  */

#define CODING_ADD_COMPOSITION_START(coding, start, method)             \
  do {                                                                  \
    struct composition_data *blk = (coding)->cmp_data;                  \
    int *slot = &blk->data[blk->used];                                  \
                                                                        \
    (coding)->cmp_data_start = blk->used;                               \
    slot[0] = -1;                                                       \
    slot[1] = blk->char_offset + (start);                               \
    slot[3] = (int) (method);                                           \
    blk->used += 4;                                                     \
  } while (0)
1103
/* Close the composition record that starts at
   CODING->cmp_data_start: store the number of slots it occupies in
   slot 0 and the ending position END (relative to the block's
   char_offset) in slot 2.  */

#define CODING_ADD_COMPOSITION_END(coding, end)                         \
  do {                                                                  \
    struct composition_data *blk = (coding)->cmp_data;                  \
    int first = (coding)->cmp_data_start;                               \
                                                                        \
    blk->data[first + 2] = blk->char_offset + (end);                    \
    blk->data[first] = blk->used - first;                               \
  } while (0)
1113
/* Append one COMPONENT (alternate character or composition rule) to
   the current block of CODING and advance its `used' count.  The
   value of the expression is COMPONENT.  */

#define CODING_ADD_COMPOSITION_COMPONENT(coding, component)             \
  ((coding)->cmp_data->data[(coding)->cmp_data->used++] = (component))
1118
/* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
   C1 is the byte following ESC.  The expanding function must provide
   `coding', `dst' and the label label_end_of_loop.  */

#define DECODE_COMPOSITION_START(c1)                                    \
  do {                                                                  \
    if (coding->composing == COMPOSITION_DISABLED)                      \
      {                                                                 \
        /* Composition is disabled: copy the sequence through.  */      \
        dst[0] = ISO_CODE_ESC;                                          \
        dst[1] = c1 & 0x7f;                                             \
        dst += 2;                                                       \
        coding->produced_char += 2;                                     \
      }                                                                 \
    else if (!COMPOSING_P (coding))                                     \
      {                                                                 \
        /* This is surely the start of a composition.  We must be sure \
           that coding->cmp_data has enough space to store the         \
           information about the composition.  If not, terminate the   \
           current decoding loop, allocate one more memory block for   \
           coding->cmp_data in the caller, then start the decoding     \
           loop again.  We can't allocate memory here directly because \
           it may cause buffer/string relocation.  */                   \
        if (!coding->cmp_data                                           \
            || (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH \
                >= COMPOSITION_DATA_SIZE))                              \
          {                                                             \
            coding->result = CODING_FINISH_INSUFFICIENT_CMP;            \
            goto label_end_of_loop;                                     \
          }                                                             \
        if (c1 == '0')                                                  \
          coding->composing = COMPOSITION_RELATIVE;                     \
        else if (c1 == '2')                                             \
          coding->composing = COMPOSITION_WITH_RULE;                    \
        else if (c1 == '3')                                             \
          coding->composing = COMPOSITION_WITH_ALTCHARS;                \
        else                                                            \
          coding->composing = COMPOSITION_WITH_RULE_ALTCHARS;           \
        CODING_ADD_COMPOSITION_START (coding, coding->produced_char,    \
                                      coding->composing);               \
        coding->composition_rule_follows = 0;                           \
      }                                                                 \
    else if (coding->composing == COMPOSITION_WITH_ALTCHARS             \
             || coding->composing == COMPOSITION_WITH_RULE_ALTCHARS)    \
      {                                                                 \
        /* We are already handling a composition.  For these two       \
           methods, the codes following the current escape sequence    \
           are actual characters stored in a buffer.  */                \
        coding->composing = COMPOSITION_RELATIVE;                       \
        coding->composition_rule_follows = 0;                           \
      }                                                                 \
  } while (0)
1166
/* Handle composition end sequence ESC 1.  C1 is the byte following
   ESC.  When composition is disabled the two bytes are copied to DST
   as is; otherwise the current composition record is closed at
   coding->produced_char and composing is switched off.  */

#define DECODE_COMPOSITION_END(c1)                                      \
  do {                                                                  \
    if (coding->composing != COMPOSITION_DISABLED)                      \
      {                                                                 \
        CODING_ADD_COMPOSITION_END (coding, coding->produced_char);     \
        coding->composing = COMPOSITION_NO;                             \
      }                                                                 \
    else                                                                \
      {                                                                 \
        dst[0] = ISO_CODE_ESC;                                          \
        dst[1] = c1;                                                    \
        dst += 2;                                                       \
        coding->produced_char += 2;                                     \
      }                                                                 \
  } while (0)
1183
/* Decode a composition rule from the byte C1 (and, for the new
   format, one more byte read from SRC into the variable `c2' of the
   expanding function) and append the encoded rule to
   coding->cmp_data.  C1 is modified: 32 is subtracted from it.  A
   value of C1 outside both formats stores rule 0.  */

#define DECODE_COMPOSITION_RULE(c1)                                     \
  do {                                                                  \
    int encoded = 0;                                                    \
                                                                        \
    (c1) -= 32;                                                         \
    if (c1 >= 81)                                                       \
      {                                                                 \
        if (c1 < 93)            /* new format (after ver.21) */         \
          {                                                             \
            ONE_MORE_BYTE (c2);                                         \
            encoded = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32);       \
          }                                                             \
      }                                                                 \
    else                        /* old format (before ver.21) */        \
      {                                                                 \
        int global_ref = (c1) / 9;                                      \
        int new_ref = (c1) % 9;                                         \
                                                                        \
        if (global_ref == 4)                                            \
          global_ref = 10;                                              \
        if (new_ref == 4)                                               \
          new_ref = 10;                                                 \
        encoded = COMPOSITION_ENCODE_RULE (global_ref, new_ref);        \
      }                                                                 \
    CODING_ADD_COMPOSITION_COMPONENT (coding, encoded);                 \
    coding->composition_rule_follows = 0;                               \
  } while (0)
1208
1209
1210 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
1211
1212 static void
1213 decode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
1214 struct coding_system *coding;
1215 unsigned char *source, *destination;
1216 int src_bytes, dst_bytes;
1217 {
1218 unsigned char *src = source;
1219 unsigned char *src_end = source + src_bytes;
1220 unsigned char *dst = destination;
1221 unsigned char *dst_end = destination + dst_bytes;
1222 /* Charsets invoked to graphic plane 0 and 1 respectively. */
1223 int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1224 int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1225 /* SRC_BASE remembers the start position in source in each loop.
1226 The loop will be exited when there's not enough source code
1227 (within macro ONE_MORE_BYTE), or when there's not enough
1228 destination area to produce a character (within macro
1229 EMIT_CHAR). */
1230 unsigned char *src_base;
1231 int c, charset;
1232 Lisp_Object translation_table;
1233
1234 if (NILP (Venable_character_translation))
1235 translation_table = Qnil;
1236 else
1237 {
1238 translation_table = coding->translation_table_for_decode;
1239 if (NILP (translation_table))
1240 translation_table = Vstandard_translation_table_for_decode;
1241 }
1242
1243 coding->result = CODING_FINISH_NORMAL;
1244
1245 while (1)
1246 {
1247 int c1, c2;
1248
1249 src_base = src;
1250 ONE_MORE_BYTE (c1);
1251
1252 /* We produce no character or one character. */
1253 switch (iso_code_class [c1])
1254 {
1255 case ISO_0x20_or_0x7F:
1256 if (COMPOSING_P (coding) && coding->composition_rule_follows)
1257 {
1258 DECODE_COMPOSITION_RULE (c1);
1259 continue;
1260 }
1261 if (charset0 < 0 || CHARSET_CHARS (charset0) == 94)
1262 {
1263 /* This is SPACE or DEL. */
1264 charset = CHARSET_ASCII;
1265 break;
1266 }
1267 /* This is a graphic character, we fall down ... */
1268
1269 case ISO_graphic_plane_0:
1270 if (COMPOSING_P (coding) && coding->composition_rule_follows)
1271 {
1272 DECODE_COMPOSITION_RULE (c1);
1273 continue;
1274 }
1275 charset = charset0;
1276 break;
1277
1278 case ISO_0xA0_or_0xFF:
1279 if (charset1 < 0 || CHARSET_CHARS (charset1) == 94
1280 || coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
1281 goto label_invalid_code;
1282 /* This is a graphic character, we fall down ... */
1283
1284 case ISO_graphic_plane_1:
1285 if (charset1 < 0)
1286 goto label_invalid_code;
1287 charset = charset1;
1288 break;
1289
1290 case ISO_control_0:
1291 if (COMPOSING_P (coding))
1292 DECODE_COMPOSITION_END ('1');
1293
1294 /* All ISO2022 control characters in this class have the
1295 same representation in Emacs internal format. */
1296 if (c1 == '\n'
1297 && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1298 && (coding->eol_type == CODING_EOL_CR
1299 || coding->eol_type == CODING_EOL_CRLF))
1300 {
1301 coding->result = CODING_FINISH_INCONSISTENT_EOL;
1302 goto label_end_of_loop;
1303 }
1304 charset = CHARSET_ASCII;
1305 break;
1306
1307 case ISO_control_1:
1308 if (COMPOSING_P (coding))
1309 DECODE_COMPOSITION_END ('1');
1310 goto label_invalid_code;
1311
1312 case ISO_carriage_return:
1313 if (COMPOSING_P (coding))
1314 DECODE_COMPOSITION_END ('1');
1315
1316 if (coding->eol_type == CODING_EOL_CR)
1317 c1 = '\n';
1318 else if (coding->eol_type == CODING_EOL_CRLF)
1319 {
1320 ONE_MORE_BYTE (c1);
1321 if (c1 != ISO_CODE_LF)
1322 {
1323 if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1324 {
1325 coding->result = CODING_FINISH_INCONSISTENT_EOL;
1326 goto label_end_of_loop;
1327 }
1328 src--;
1329 c1 = '\r';
1330 }
1331 }
1332 charset = CHARSET_ASCII;
1333 break;
1334
1335 case ISO_shift_out:
1336 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1337 || CODING_SPEC_ISO_DESIGNATION (coding, 1) < 0)
1338 goto label_invalid_code;
1339 CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
1340 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1341 continue;
1342
1343 case ISO_shift_in:
1344 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
1345 goto label_invalid_code;
1346 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
1347 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1348 continue;
1349
1350 case ISO_single_shift_2_7:
1351 case ISO_single_shift_2:
1352 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1353 goto label_invalid_code;
1354 /* SS2 is handled as an escape sequence of ESC 'N' */
1355 c1 = 'N';
1356 goto label_escape_sequence;
1357
1358 case ISO_single_shift_3:
1359 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1360 goto label_invalid_code;
1361 /* SS2 is handled as an escape sequence of ESC 'O' */
1362 c1 = 'O';
1363 goto label_escape_sequence;
1364
1365 case ISO_control_sequence_introducer:
1366 /* CSI is handled as an escape sequence of ESC '[' ... */
1367 c1 = '[';
1368 goto label_escape_sequence;
1369
1370 case ISO_escape:
1371 ONE_MORE_BYTE (c1);
1372 label_escape_sequence:
1373 /* Escape sequences handled by Emacs are invocation,
1374 designation, direction specification, and character
1375 composition specification. */
1376 switch (c1)
1377 {
1378 case '&': /* revision of following character set */
1379 ONE_MORE_BYTE (c1);
1380 if (!(c1 >= '@' && c1 <= '~'))
1381 goto label_invalid_code;
1382 ONE_MORE_BYTE (c1);
1383 if (c1 != ISO_CODE_ESC)
1384 goto label_invalid_code;
1385 ONE_MORE_BYTE (c1);
1386 goto label_escape_sequence;
1387
1388 case '$': /* designation of 2-byte character set */
1389 if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
1390 goto label_invalid_code;
1391 ONE_MORE_BYTE (c1);
1392 if (c1 >= '@' && c1 <= 'B')
1393 { /* designation of JISX0208.1978, GB2312.1980,
1394 or JISX0208.1980 */
1395 DECODE_DESIGNATION (0, 2, 94, c1);
1396 }
1397 else if (c1 >= 0x28 && c1 <= 0x2B)
1398 { /* designation of DIMENSION2_CHARS94 character set */
1399 ONE_MORE_BYTE (c2);
1400 DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
1401 }
1402 else if (c1 >= 0x2C && c1 <= 0x2F)
1403 { /* designation of DIMENSION2_CHARS96 character set */
1404 ONE_MORE_BYTE (c2);
1405 DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
1406 }
1407 else
1408 goto label_invalid_code;
1409 /* We must update these variables now. */
1410 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1411 charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1412 continue;
1413
1414 case 'n': /* invocation of locking-shift-2 */
1415 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1416 || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
1417 goto label_invalid_code;
1418 CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
1419 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1420 continue;
1421
1422 case 'o': /* invocation of locking-shift-3 */
1423 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1424 || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
1425 goto label_invalid_code;
1426 CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
1427 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1428 continue;
1429
1430 case 'N': /* invocation of single-shift-2 */
1431 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1432 || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
1433 goto label_invalid_code;
1434 charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
1435 ONE_MORE_BYTE (c1);
1436 break;
1437
1438 case 'O': /* invocation of single-shift-3 */
1439 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1440 || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
1441 goto label_invalid_code;
1442 charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
1443 ONE_MORE_BYTE (c1);
1444 break;
1445
1446 case '0': case '2': case '3': case '4': /* start composition */
1447 DECODE_COMPOSITION_START (c1);
1448 continue;
1449
1450 case '1': /* end composition */
1451 DECODE_COMPOSITION_END (c1);
1452 continue;
1453
1454 case '[': /* specification of direction */
1455 if (coding->flags & CODING_FLAG_ISO_NO_DIRECTION)
1456 goto label_invalid_code;
1457 /* For the moment, nested direction is not supported.
1458 So, `coding->mode & CODING_MODE_DIRECTION' zero means
1459 left-to-right, and nozero means right-to-left. */
1460 ONE_MORE_BYTE (c1);
1461 switch (c1)
1462 {
1463 case ']': /* end of the current direction */
1464 coding->mode &= ~CODING_MODE_DIRECTION;
1465
1466 case '0': /* end of the current direction */
1467 case '1': /* start of left-to-right direction */
1468 ONE_MORE_BYTE (c1);
1469 if (c1 == ']')
1470 coding->mode &= ~CODING_MODE_DIRECTION;
1471 else
1472 goto label_invalid_code;
1473 break;
1474
1475 case '2': /* start of right-to-left direction */
1476 ONE_MORE_BYTE (c1);
1477 if (c1 == ']')
1478 coding->mode |= CODING_MODE_DIRECTION;
1479 else
1480 goto label_invalid_code;
1481 break;
1482
1483 default:
1484 goto label_invalid_code;
1485 }
1486 continue;
1487
1488 default:
1489 if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
1490 goto label_invalid_code;
1491 if (c1 >= 0x28 && c1 <= 0x2B)
1492 { /* designation of DIMENSION1_CHARS94 character set */
1493 ONE_MORE_BYTE (c2);
1494 DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
1495 }
1496 else if (c1 >= 0x2C && c1 <= 0x2F)
1497 { /* designation of DIMENSION1_CHARS96 character set */
1498 ONE_MORE_BYTE (c2);
1499 DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
1500 }
1501 else
1502 goto label_invalid_code;
1503 /* We must update these variables now. */
1504 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1505 charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1506 continue;
1507 }
1508 }
1509
1510 /* Now we know CHARSET and 1st position code C1 of a character.
1511 Produce a multibyte sequence for that character while getting
1512 2nd position code C2 if necessary. */
1513 if (CHARSET_DIMENSION (charset) == 2)
1514 {
1515 ONE_MORE_BYTE (c2);
1516 if (c1 < 0x80 ? c2 < 0x20 || c2 >= 0x80 : c2 < 0xA0)
1517 /* C2 is not in a valid range. */
1518 goto label_invalid_code;
1519 }
1520 c = DECODE_ISO_CHARACTER (charset, c1, c2);
1521 EMIT_CHAR (c);
1522 continue;
1523
1524 label_invalid_code:
1525 coding->errors++;
1526 if (COMPOSING_P (coding))
1527 DECODE_COMPOSITION_END ('1');
1528 src = src_base;
1529 c = *src++;
1530 EMIT_CHAR (c);
1531 }
1532
1533 label_end_of_loop:
1534 coding->consumed = coding->consumed_char = src_base - source;
1535 coding->produced = dst - destination;
1536 return;
1537 }
1538
1539
1540 /* ISO2022 encoding stuff. */
1541
1542 /*
1543 It is not enough to say just "ISO2022" on encoding, we have to
1544 specify more details. In Emacs, each coding system of ISO2022
1545 variant has the following specifications:
1546 1. Initial designation to G0 thru G3.
1547 2. Allows short-form designation?
1548 3. ASCII should be designated to G0 before control characters?
1549 4. ASCII should be designated to G0 at end of line?
1550 5. 7-bit environment or 8-bit environment?
1551 6. Use locking-shift?
1552 7. Use Single-shift?
1553 And the following two are only for Japanese:
1554 8. Use JISX0201-Roman in place of ASCII?
1555 9. Use JISX0208-1978 in place of JISX0208-1983?
1556 These specifications are encoded in `coding->flags' as flag bits
1557 defined by macros CODING_FLAG_ISO_XXX. See `coding.h' for more
1558 details.
1559 */
1560
/* Write at DST the escape sequence that designates CHARSET to graphic
   register REG, advance DST, and record the designation in CODING.
   A revision number below 255 is announced first with ESC & <rev>.
   For a 2-dimensional 94-charset designated to register 0 whose
   <final-char> is '@', 'A', or 'B', the intermediate character is
   omitted (short form) when CODING allows it.  */

#define ENCODE_DESIGNATION(charset, reg, coding)                        \
  do {                                                                  \
    unsigned char final = CHARSET_ISO_FINAL_CHAR (charset);             \
    int rev = CODING_SPEC_ISO_REVISION_NUMBER (coding, charset);        \
    /* Intermediate characters, indexed by register, for the 94- or    \
       96-character variety of CHARSET.  */                             \
    char *intermediates                                                 \
      = CHARSET_CHARS (charset) == 94 ? "()*+" : ",-./";                \
                                                                        \
    if (rev < 255)                                                      \
      {                                                                 \
        *dst++ = ISO_CODE_ESC;                                          \
        *dst++ = '&';                                                   \
        *dst++ = '@' + rev;                                             \
      }                                                                 \
    *dst++ = ISO_CODE_ESC;                                              \
    if (CHARSET_DIMENSION (charset) != 1)                               \
      *dst++ = '$';                                                     \
    /* Everything but the short form carries an intermediate.  */       \
    if (CHARSET_DIMENSION (charset) == 1                                \
        || CHARSET_CHARS (charset) != 94                                \
        || ! (coding->flags & CODING_FLAG_ISO_SHORT_FORM)               \
        || reg != 0                                                     \
        || final < '@' || final > 'B')                                  \
      *dst++ = (unsigned char) (intermediates[reg]);                    \
    *dst++ = final;                                                     \
    CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;                \
  } while (0)
1603
/* The following two macros produce codes (control character or escape
   sequence) for ISO2022 single-shift functions (single-shift-2 and
   single-shift-3).  Each also marks CODING as single shifting.  */

/* Single-shift-2: ESC N when CODING is 7-bit, else the SS2 byte.  */
#define ENCODE_SINGLE_SHIFT_2                                           \
  do {                                                                  \
    if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS))                 \
      *dst++ = ISO_CODE_SS2;                                            \
    else                                                                \
      {                                                                 \
        *dst++ = ISO_CODE_ESC;                                          \
        *dst++ = 'N';                                                   \
      }                                                                 \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;                       \
  } while (0)
1616
/* Single-shift-3: ESC O when CODING is 7-bit, else the SS3 byte.
   CODING is marked as single shifting.  */
#define ENCODE_SINGLE_SHIFT_3                                           \
  do {                                                                  \
    if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS))                 \
      *dst++ = ISO_CODE_SS3;                                            \
    else                                                                \
      {                                                                 \
        *dst++ = ISO_CODE_ESC;                                          \
        *dst++ = 'O';                                                   \
      }                                                                 \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;                       \
  } while (0)
1625
/* The following four macros produce codes (control character or
   escape sequence) for ISO2022 locking-shift functions (shift-in,
   shift-out, locking-shift-2, and locking-shift-3).  */

/* Shift-in: record register 0 as invoked to plane 0 and emit SI.  */
#define ENCODE_SHIFT_IN                                                 \
  do {                                                                  \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;                         \
    *dst++ = ISO_CODE_SI;                                               \
  } while (0)
1635
/* Shift-out: record register 1 as invoked to plane 0 and emit SO.  */
#define ENCODE_SHIFT_OUT                                                \
  do {                                                                  \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;                         \
    *dst++ = ISO_CODE_SO;                                               \
  } while (0)
1641
/* Locking-shift-2: record register 2 as invoked to plane 0 and emit
   ESC n.  */
#define ENCODE_LOCKING_SHIFT_2                                          \
  do {                                                                  \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;                         \
    *dst++ = ISO_CODE_ESC;                                              \
    *dst++ = 'n';                                                       \
  } while (0)
1647
/* Locking-shift-3: record register 3 as invoked to plane 0 and emit
   ESC o.  */
#define ENCODE_LOCKING_SHIFT_3                                          \
  do {                                                                  \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;                         \
    *dst++ = ISO_CODE_ESC;                                              \
    *dst++ = 'o';                                                       \
  } while (0)
1653
/* Write at DST the code for a DIMENSION1 character whose character
   set is CHARSET and whose position-code is C1.  If CHARSET is not
   currently invoked to a graphic plane, the necessary designation and
   invocation sequences are produced first and the attempt is
   repeated.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
  do {                                                                  \
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
      {                                                                 \
        /* A single shift is pending; it applies to this character.  */ \
        *dst++ = ((coding->flags & CODING_FLAG_ISO_SEVEN_BITS)          \
                  ? (c1 & 0x7F) : (c1 | 0x80));                         \
        CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
        break;                                                          \
      }                                                                 \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))           \
      {                                                                 \
        *dst++ = c1 & 0x7F;                                             \
        break;                                                          \
      }                                                                 \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))           \
      {                                                                 \
        *dst++ = c1 | 0x80;                                             \
        break;                                                          \
      }                                                                 \
    if (coding->flags & CODING_FLAG_ISO_SAFE                            \
        && !coding->safe_charsets[charset])                             \
      {                                                                 \
        /* We should not encode this character; produce one or two     \
           substitution bytes instead, according to its width.  */      \
        *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;                 \
        if (CHARSET_WIDTH (charset) == 2)                               \
          *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;               \
        break;                                                          \
      }                                                                 \
    /* Since CHARSET is not yet invoked to any graphic planes, we      \
       must invoke it, or, at first, designate it to some graphic      \
       register.  Then repeat the loop to actually produce the         \
       character.  */                                                   \
    dst = encode_invocation_designation (charset, coding, dst);         \
  } while (1)
1696
/* Write at DST the codes for a DIMENSION2 character whose character
   set is CHARSET and whose position-codes are C1 and C2.  If CHARSET
   is not currently invoked to a graphic plane, the necessary
   designation and invocation sequences are produced first and the
   attempt is repeated.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
  do {                                                                  \
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
      {                                                                 \
        /* A single shift is pending; it applies to this character.  */ \
        if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                 \
          {                                                             \
            *dst++ = c1 & 0x7F;                                         \
            *dst++ = c2 & 0x7F;                                         \
          }                                                             \
        else                                                            \
          {                                                             \
            *dst++ = c1 | 0x80;                                         \
            *dst++ = c2 | 0x80;                                         \
          }                                                             \
        CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
        break;                                                          \
      }                                                                 \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))           \
      {                                                                 \
        *dst++ = c1 & 0x7F;                                             \
        *dst++ = c2 & 0x7F;                                             \
        break;                                                          \
      }                                                                 \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))           \
      {                                                                 \
        *dst++ = c1 | 0x80;                                             \
        *dst++ = c2 | 0x80;                                             \
        break;                                                          \
      }                                                                 \
    if (coding->flags & CODING_FLAG_ISO_SAFE                            \
        && !coding->safe_charsets[charset])                             \
      {                                                                 \
        /* We should not encode this character; produce one or two     \
           substitution bytes instead, according to its width.  */      \
        *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;                 \
        if (CHARSET_WIDTH (charset) == 2)                               \
          *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;               \
        break;                                                          \
      }                                                                 \
    /* Since CHARSET is not yet invoked to any graphic planes, we      \
       must invoke it, or, at first, designate it to some graphic      \
       register.  Then repeat the loop to actually produce the         \
       character.  */                                                   \
    dst = encode_invocation_designation (charset, coding, dst);         \
  } while (1)
1739
/* Write at DST the ISO2022 codes for the character of CHARSET whose
   position codes are C1 and C2.  An undefined CHARSET is passed
   through: C1 is written as is, followed by C2 unless it is negative.
   Otherwise the character is encoded by dimension, after substituting
   JISX0201-Roman for ASCII when CODING_FLAG_ISO_USE_ROMAN is set, or
   JISX0208-1978 for JISX0208 when CODING_FLAG_ISO_USE_OLDJIS is
   set.  */

#define ENCODE_ISO_CHARACTER(charset, c1, c2)                           \
  do {                                                                  \
    if (!CHARSET_DEFINED_P (charset))                                   \
      {                                                                 \
        *dst++ = c1;                                                    \
        if (c2 >= 0)                                                    \
          *dst++ = c2;                                                  \
      }                                                                 \
    else if (CHARSET_DIMENSION (charset) == 1)                          \
      {                                                                 \
        int actual_charset                                              \
          = ((charset == CHARSET_ASCII                                  \
              && (coding->flags & CODING_FLAG_ISO_USE_ROMAN))           \
             ? charset_latin_jisx0201 : charset);                       \
                                                                        \
        ENCODE_ISO_CHARACTER_DIMENSION1 (actual_charset, c1);           \
      }                                                                 \
    else                                                                \
      {                                                                 \
        int actual_charset                                              \
          = ((charset == charset_jisx0208                               \
              && (coding->flags & CODING_FLAG_ISO_USE_OLDJIS))          \
             ? charset_jisx0208_1978 : charset);                        \
                                                                        \
        ENCODE_ISO_CHARACTER_DIMENSION2 (actual_charset, c1, c2);       \
      }                                                                 \
  } while (0)
1768
1769 /* Produce designation and invocation codes at a place pointed by DST
1770 to use CHARSET. The element `spec.iso2022' of *CODING is updated.
1771 Return new DST. */
1772
1773 unsigned char *
1774 encode_invocation_designation (charset, coding, dst)
1775 int charset;
1776 struct coding_system *coding;
1777 unsigned char *dst;
1778 {
1779 int reg; /* graphic register number */
1780
1781 /* At first, check designations. */
1782 for (reg = 0; reg < 4; reg++)
1783 if (charset == CODING_SPEC_ISO_DESIGNATION (coding, reg))
1784 break;
1785
1786 if (reg >= 4)
1787 {
1788 /* CHARSET is not yet designated to any graphic registers. */
1789 /* At first check the requested designation. */
1790 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1791 if (reg == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
1792 /* Since CHARSET requests no special designation, designate it
1793 to graphic register 0. */
1794 reg = 0;
1795
1796 ENCODE_DESIGNATION (charset, reg, coding);
1797 }
1798
1799 if (CODING_SPEC_ISO_INVOCATION (coding, 0) != reg
1800 && CODING_SPEC_ISO_INVOCATION (coding, 1) != reg)
1801 {
1802 /* Since the graphic register REG is not invoked to any graphic
1803 planes, invoke it to graphic plane 0. */
1804 switch (reg)
1805 {
1806 case 0: /* graphic register 0 */
1807 ENCODE_SHIFT_IN;
1808 break;
1809
1810 case 1: /* graphic register 1 */
1811 ENCODE_SHIFT_OUT;
1812 break;
1813
1814 case 2: /* graphic register 2 */
1815 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1816 ENCODE_SINGLE_SHIFT_2;
1817 else
1818 ENCODE_LOCKING_SHIFT_2;
1819 break;
1820
1821 case 3: /* graphic register 3 */
1822 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1823 ENCODE_SINGLE_SHIFT_3;
1824 else
1825 ENCODE_LOCKING_SHIFT_3;
1826 break;
1827 }
1828 }
1829
1830 return dst;
1831 }
1832
/* Write at DST the 2-byte code for the encoded composition rule RULE:
   the first byte is 32 + 81 plus the first reference point, the
   second is 32 plus the second reference point.  */

#define ENCODE_COMPOSITION_RULE(rule)                                   \
  do {                                                                  \
    int global_ref, new_ref;                                            \
                                                                        \
    COMPOSITION_DECODE_RULE (rule, global_ref, new_ref);                \
    dst[0] = 32 + 81 + global_ref;                                      \
    dst[1] = 32 + new_ref;                                              \
    dst += 2;                                                           \
  } while (0)
1842
/* Write at DST the codes that indicate the start of a composition
   sequence (ESC 0, ESC 3, or ESC 4).  DATA points to an array of
   integers describing the composition; DATA[3] is the method and
   becomes CODING->composing.  See the comment in coding.h for the
   format of DATA.  */

#define ENCODE_COMPOSITION_START(coding, data)                          \
  do {                                                                  \
    coding->composing = data[3];                                        \
    *dst++ = ISO_CODE_ESC;                                              \
    if (coding->composing != COMPOSITION_RELATIVE)                      \
      {                                                                 \
        if (coding->composing == COMPOSITION_WITH_ALTCHARS)             \
          *dst++ = '3';                                                 \
        else                                                            \
          *dst++ = '4';                                                 \
        /* Components start right after the 4-slot header.  */          \
        coding->cmp_data_index = coding->cmp_data_start + 4;            \
        coding->composition_rule_follows = 0;                           \
      }                                                                 \
    else                                                                \
      *dst++ = '0';                                                     \
  } while (0)
1862
/* Write at DST the codes (ESC 1) that indicate the end of the current
   composition, step CODING->cmp_data_start past the record described
   by DATA (DATA[0] is its length), and move on to the next
   composition data block when the current one is used up.  */

#define ENCODE_COMPOSITION_END(coding, data)                            \
  do {                                                                  \
    dst[0] = ISO_CODE_ESC;                                              \
    dst[1] = '1';                                                       \
    dst += 2;                                                           \
    coding->composing = COMPOSITION_NO;                                 \
    coding->cmp_data_start += data[0];                                  \
    if (coding->cmp_data->next                                          \
        && coding->cmp_data_start == coding->cmp_data->used)            \
      {                                                                 \
        coding->cmp_data_start = 0;                                     \
        coding->cmp_data = coding->cmp_data->next;                      \
      }                                                                 \
  } while (0)
1878
/* Write at DST the composition start sequence ESC 0 and switch
   CODING to relative composition.  Here the sequence doesn't mean the
   start of a new composition; it means that the components (alternate
   chars and composition rules) of the composition have just been
   produced and the actual text follows in SRC.  */

#define ENCODE_COMPOSITION_FAKE_START(coding)                           \
  do {                                                                  \
    coding->composing = COMPOSITION_RELATIVE;                           \
    dst[0] = ISO_CODE_ESC;                                              \
    dst[1] = '0';                                                       \
    dst += 2;                                                           \
  } while (0)
1890
/* The following three macros produce codes for indicating direction
   of text.  They expect `coding' (pointer to the coding system) and
   `dst' (output byte pointer) in the expansion scope, and each one
   expands to a single statement.

   ENCODE_CONTROL_SEQUENCE_INTRODUCER emits the two-byte form ESC [
   when the seven-bit flag is set in CODING->flags, and the single
   byte ISO_CODE_CSI otherwise.  The flag is tested with `&' because
   CODING->flags is a bit mask that may carry other flags as well.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER \
  do { \
    if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
      { \
        *dst++ = ISO_CODE_ESC; \
        *dst++ = '['; \
      } \
    else \
      *dst++ = ISO_CODE_CSI; \
  } while (0)

/* Emit the control sequence "CSI 2 ]" (right-to-left).  */
#define ENCODE_DIRECTION_R2L \
  do { \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER; \
    *dst++ = '2'; \
    *dst++ = ']'; \
  } while (0)

/* Emit the control sequence "CSI 0 ]" (left-to-right).  */
#define ENCODE_DIRECTION_L2R \
  do { \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER; \
    *dst++ = '0'; \
    *dst++ = ']'; \
  } while (0)
1906
1907 /* Produce codes for designation and invocation to reset the graphic
1908 planes and registers to initial state. */
1909 #define ENCODE_RESET_PLANE_AND_REGISTER \
1910 do { \
1911 int reg; \
1912 if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0) \
1913 ENCODE_SHIFT_IN; \
1914 for (reg = 0; reg < 4; reg++) \
1915 if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg) >= 0 \
1916 && (CODING_SPEC_ISO_DESIGNATION (coding, reg) \
1917 != CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg))) \
1918 ENCODE_DESIGNATION \
1919 (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg), reg, coding); \
1920 } while (0)
1921
1922 /* Produce designation sequences of charsets in the line started from
1923 SRC to a place pointed by DST, and return updated DST.
1924
1925 If the current block ends before any end-of-line, we may fail to
1926 find all the necessary designations. */
1927
1928 static unsigned char *
1929 encode_designation_at_bol (coding, translation_table, src, src_end, dst)
1930 struct coding_system *coding;
1931 Lisp_Object translation_table;
1932 unsigned char *src, *src_end, *dst;
1933 {
1934 int charset, c, found = 0, reg;
1935 /* Table of charsets to be designated to each graphic register. */
1936 int r[4];
1937
1938 for (reg = 0; reg < 4; reg++)
1939 r[reg] = -1;
1940
1941 while (found < 4)
1942 {
1943 ONE_MORE_CHAR (c);
1944 if (c == '\n')
1945 break;
1946
1947 charset = CHAR_CHARSET (c);
1948 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1949 if (reg != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION && r[reg] < 0)
1950 {
1951 found++;
1952 r[reg] = charset;
1953 }
1954 }
1955
1956 label_end_of_loop:
1957 if (found)
1958 {
1959 for (reg = 0; reg < 4; reg++)
1960 if (r[reg] >= 0
1961 && CODING_SPEC_ISO_DESIGNATION (coding, reg) != r[reg])
1962 ENCODE_DESIGNATION (r[reg], reg, coding);
1963 }
1964
1965 return dst;
1966 }
1967
1968 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
1969
1970 static void
1971 encode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
1972 struct coding_system *coding;
1973 unsigned char *source, *destination;
1974 int src_bytes, dst_bytes;
1975 {
1976 unsigned char *src = source;
1977 unsigned char *src_end = source + src_bytes;
1978 unsigned char *dst = destination;
1979 unsigned char *dst_end = destination + dst_bytes;
1980 /* Since the maximum bytes produced by each loop is 20, we subtract 19
1981 from DST_END to assure overflow checking is necessary only at the
1982 head of loop. */
1983 unsigned char *adjusted_dst_end = dst_end - 19;
1984 /* SRC_BASE remembers the start position in source in each loop.
1985 The loop will be exited when there's not enough source text to
1986 analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
1987 there's not enough destination area to produce encoded codes
1988 (within macro EMIT_BYTES). */
1989 unsigned char *src_base;
1990 int c;
1991 Lisp_Object translation_table;
1992
1993 if (NILP (Venable_character_translation))
1994 translation_table = Qnil;
1995 else
1996 {
1997 translation_table = coding->translation_table_for_encode;
1998 if (NILP (translation_table))
1999 translation_table = Vstandard_translation_table_for_encode;
2000 }
2001
2002 coding->consumed_char = 0;
2003 coding->errors = 0;
2004 while (1)
2005 {
2006 int charset, c1, c2;
2007
2008 src_base = src;
2009
2010 if (dst >= (dst_bytes ? adjusted_dst_end : (src - 19)))
2011 {
2012 coding->result = CODING_FINISH_INSUFFICIENT_DST;
2013 break;
2014 }
2015
2016 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL
2017 && CODING_SPEC_ISO_BOL (coding))
2018 {
2019 /* We have to produce designation sequences if any now. */
2020 dst = encode_designation_at_bol (coding, translation_table,
2021 src, src_end, dst);
2022 CODING_SPEC_ISO_BOL (coding) = 0;
2023 }
2024
2025 /* Check composition start and end. */
2026 if (coding->composing != COMPOSITION_DISABLED
2027 && coding->cmp_data_start < coding->cmp_data->used)
2028 {
2029 struct composition_data *cmp_data = coding->cmp_data;
2030 int *data = cmp_data->data + coding->cmp_data_start;
2031 int this_pos = cmp_data->char_offset + coding->consumed_char;
2032
2033 if (coding->composing == COMPOSITION_RELATIVE)
2034 {
2035 if (this_pos == data[2])
2036 {
2037 ENCODE_COMPOSITION_END (coding, data);
2038 cmp_data = coding->cmp_data;
2039 data = cmp_data->data + coding->cmp_data_start;
2040 }
2041 }
2042 else if (COMPOSING_P (coding))
2043 {
2044 /* COMPOSITION_WITH_ALTCHARS or COMPOSITION_WITH_RULE_ALTCHAR */
2045 if (coding->cmp_data_index == coding->cmp_data_start + data[0])
2046 /* We have consumed components of the composition.
2047 What follows in SRC is the compositions's base
2048 text. */
2049 ENCODE_COMPOSITION_FAKE_START (coding);
2050 else
2051 {
2052 int c = cmp_data->data[coding->cmp_data_index++];
2053 if (coding->composition_rule_follows)
2054 {
2055 ENCODE_COMPOSITION_RULE (c);
2056 coding->composition_rule_follows = 0;
2057 }
2058 else
2059 {
2060 SPLIT_CHAR (c, charset, c1, c2);
2061 ENCODE_ISO_CHARACTER (charset, c1, c2);
2062 if (coding->composing == COMPOSITION_WITH_RULE_ALTCHARS)
2063 coding->composition_rule_follows = 1;
2064 }
2065 continue;
2066 }
2067 }
2068 if (!COMPOSING_P (coding))
2069 {
2070 if (this_pos == data[1])
2071 {
2072 ENCODE_COMPOSITION_START (coding, data);
2073 continue;
2074 }
2075 }
2076 }
2077
2078 ONE_MORE_CHAR (c);
2079
2080 /* Now encode the character C. */
2081 if (c < 0x20 || c == 0x7F)
2082 {
2083 if (c == '\r')
2084 {
2085 if (! (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
2086 {
2087 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
2088 ENCODE_RESET_PLANE_AND_REGISTER;
2089 *dst++ = c;
2090 continue;
2091 }
2092 /* fall down to treat '\r' as '\n' ... */
2093 c = '\n';
2094 }
2095 if (c == '\n')
2096 {
2097 if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
2098 ENCODE_RESET_PLANE_AND_REGISTER;
2099 if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL)
2100 bcopy (coding->spec.iso2022.initial_designation,
2101 coding->spec.iso2022.current_designation,
2102 sizeof coding->spec.iso2022.initial_designation);
2103 if (coding->eol_type == CODING_EOL_LF
2104 || coding->eol_type == CODING_EOL_UNDECIDED)
2105 *dst++ = ISO_CODE_LF;
2106 else if (coding->eol_type == CODING_EOL_CRLF)
2107 *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
2108 else
2109 *dst++ = ISO_CODE_CR;
2110 CODING_SPEC_ISO_BOL (coding) = 1;
2111 }
2112 else
2113 {
2114 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
2115 ENCODE_RESET_PLANE_AND_REGISTER;
2116 *dst++ = c;
2117 }
2118 }
2119 else if (ASCII_BYTE_P (c))
2120 ENCODE_ISO_CHARACTER (CHARSET_ASCII, c, /* dummy */ c1);
2121 else if (SINGLE_BYTE_CHAR_P (c))
2122 {
2123 *dst++ = c;
2124 coding->errors++;
2125 }
2126 else
2127 {
2128 SPLIT_CHAR (c, charset, c1, c2);
2129 ENCODE_ISO_CHARACTER (charset, c1, c2);
2130 }
2131
2132 coding->consumed_char++;
2133 }
2134
2135 label_end_of_loop:
2136 coding->consumed = src_base - source;
2137 coding->produced = coding->produced_char = dst - destination;
2138 }
2139
2140 \f
2141 /*** 4. SJIS and BIG5 handlers ***/
2142
2143 /* Although SJIS and BIG5 are not ISO's coding system, they are used
2144 quite widely. So, for the moment, Emacs supports them in the bare
2145 C code. But, in the future, they may be supported only by CCL. */
2146
/* SJIS is a coding system encoding three character sets: ASCII, right
   half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
   as is.  A character of charset katakana-jisx0201 is encoded as
   "position-code + 0x80".  A character of charset japanese-jisx0208
   is encoded in two bytes, with its two position-codes divided and
   shifted so that they fit in the ranges below.
2153
2154 --- CODE RANGE of SJIS ---
2155 (character set) (range)
2156 ASCII 0x00 .. 0x7F
2157 KATAKANA-JISX0201 0xA0 .. 0xDF
2158 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF
2159 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
2160 -------------------------------
2161
2162 */
2163
2164 /* BIG5 is a coding system encoding two character sets: ASCII and
2165 Big5. An ASCII character is encoded as is. Big5 is a two-byte
2166 character set and is encoded in two-byte.
2167
2168 --- CODE RANGE of BIG5 ---
2169 (character set) (range)
2170 ASCII 0x00 .. 0x7F
2171 Big5 (1st byte) 0xA1 .. 0xFE
2172 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
2173 --------------------------
2174
2175 Since the number of characters in Big5 is larger than maximum
2176 characters in Emacs' charset (96x96), it can't be handled as one
2177 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
2178 and `charset-big5-2'. Both are DIMENSION2 and CHARS94. The former
2179 contains frequently used characters and the latter contains less
2180 frequently used characters. */
2181
/* Macros to decode or encode a character of Big5 in BIG5.  B1 and B2
   are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
   C1 and C2 are the 1st and 2nd position-codes of Emacs' internal
   format.  CHARSET is `charset_big5_1' or `charset_big5_2'.  */
2186
/* Number of Big5 characters which have the same code in 1st byte:
   94 trail bytes in 0xA1..0xFE plus 63 in 0x40..0x7E.  */
#define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)

/* Convert the BIG5 byte pair B1, B2 into CHARSET and the position
   codes C1, C2.  The pair is first turned into a linear index (trail
   bytes 0x40..0x7E map to 0..62 and 0xA1..0xFE to 63..156 within a
   row).  Lead bytes below 0xC9 belong to charset_big5_1; the others
   belong to charset_big5_2 and are re-based to start at zero.  The
   index is then split into two codes of a 94x94 charset.  C1 and C2
   may name the same variables as B1 and B2.  */
#define DECODE_BIG5(b1, b2, charset, c1, c2) \
  do { \
    unsigned int temp \
      = (b1 - 0xA1) * BIG5_SAME_ROW + b2 - (b2 >= 0x7F ? 0x62 : 0x40); \
    if (b1 >= 0xC9) \
      { \
        temp -= (0xC9 - 0xA1) * BIG5_SAME_ROW; \
        charset = charset_big5_2; \
      } \
    else \
      charset = charset_big5_1; \
    c1 = temp / (0xFF - 0xA1) + 0x21; \
    c2 = temp % (0xFF - 0xA1) + 0x21; \
  } while (0)

/* Inverse of DECODE_BIG5: convert CHARSET and the position codes C1,
   C2 into the BIG5 byte pair B1, B2.  B1 and B2 may name the same
   variables as C1 and C2.  */
#define ENCODE_BIG5(charset, c1, c2, b1, b2) \
  do { \
    unsigned int temp = (c1 - 0x21) * (0xFF - 0xA1) + (c2 - 0x21); \
    if (charset == charset_big5_2) \
      temp += BIG5_SAME_ROW * (0xC9 - 0xA1); \
    b1 = temp / BIG5_SAME_ROW + 0xA1; \
    b2 = temp % BIG5_SAME_ROW; \
    b2 += (b2 >= 0x3F ? 0x62 : 0x40); \
  } while (0)
2214
2215 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2216 Check if a text is encoded in SJIS. If it is, return
2217 CODING_CATEGORY_MASK_SJIS, else return 0. */
2218
2219 int
2220 detect_coding_sjis (src, src_end)
2221 unsigned char *src, *src_end;
2222 {
2223 int c;
2224 /* Dummy for ONE_MORE_BYTE. */
2225 struct coding_system dummy_coding;
2226 struct coding_system *coding = &dummy_coding;
2227
2228 while (1)
2229 {
2230 ONE_MORE_BYTE (c);
2231 if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
2232 {
2233 ONE_MORE_BYTE (c);
2234 if (c < 0x40)
2235 return 0;
2236 }
2237 }
2238 label_end_of_loop:
2239 return CODING_CATEGORY_MASK_SJIS;
2240 }
2241
2242 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2243 Check if a text is encoded in BIG5. If it is, return
2244 CODING_CATEGORY_MASK_BIG5, else return 0. */
2245
2246 int
2247 detect_coding_big5 (src, src_end)
2248 unsigned char *src, *src_end;
2249 {
2250 int c;
2251 /* Dummy for ONE_MORE_BYTE. */
2252 struct coding_system dummy_coding;
2253 struct coding_system *coding = &dummy_coding;
2254
2255 while (1)
2256 {
2257 ONE_MORE_BYTE (c);
2258 if (c >= 0xA1)
2259 {
2260 ONE_MORE_BYTE (c);
2261 if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
2262 return 0;
2263 }
2264 }
2265 label_end_of_loop:
2266 return CODING_CATEGORY_MASK_BIG5;
2267 }
2268
2269 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2270 Check if a text is encoded in UTF-8. If it is, return
2271 CODING_CATEGORY_MASK_UTF_8, else return 0. */
2272
/* Predicates classifying one octet C of a UTF-8 sequence by its
   leading bit pattern.  */

/* 0xxxxxxx: a complete one-octet character.  */
#define UTF_8_1_OCTET_P(c)         ((c) < 0x80)
/* 10xxxxxx: a trailing octet of a multi-octet sequence.  */
#define UTF_8_EXTRA_OCTET_P(c)     (((c) & 0xC0) == 0x80)
/* 110xxxxx: leading octet of a two-octet sequence.  */
#define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
/* 1110xxxx: leading octet of a three-octet sequence.  */
#define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
/* 11110xxx: leading octet of a four-octet sequence.  */
#define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
/* 111110xx: leading octet of a five-octet sequence.  */
#define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
/* 1111110x: leading octet of a six-octet sequence.  */
#define UTF_8_6_OCTET_LEADING_P(c) (((c) & 0xFE) == 0xFC)
2280
2281 int
2282 detect_coding_utf_8 (src, src_end)
2283 unsigned char *src, *src_end;
2284 {
2285 unsigned char c;
2286 int seq_maybe_bytes;
2287 /* Dummy for ONE_MORE_BYTE. */
2288 struct coding_system dummy_coding;
2289 struct coding_system *coding = &dummy_coding;
2290
2291 while (1)
2292 {
2293 ONE_MORE_BYTE (c);
2294 if (UTF_8_1_OCTET_P (c))
2295 continue;
2296 else if (UTF_8_2_OCTET_LEADING_P (c))
2297 seq_maybe_bytes = 1;
2298 else if (UTF_8_3_OCTET_LEADING_P (c))
2299 seq_maybe_bytes = 2;
2300 else if (UTF_8_4_OCTET_LEADING_P (c))
2301 seq_maybe_bytes = 3;
2302 else if (UTF_8_5_OCTET_LEADING_P (c))
2303 seq_maybe_bytes = 4;
2304 else if (UTF_8_6_OCTET_LEADING_P (c))
2305 seq_maybe_bytes = 5;
2306 else
2307 return 0;
2308
2309 do
2310 {
2311 ONE_MORE_BYTE (c);
2312 if (!UTF_8_EXTRA_OCTET_P (c))
2313 return 0;
2314 seq_maybe_bytes--;
2315 }
2316 while (seq_maybe_bytes > 0);
2317 }
2318
2319 label_end_of_loop:
2320 return CODING_CATEGORY_MASK_UTF_8;
2321 }
2322
/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
   Check if a text starts with a UTF-16 byte order mark.  If the
   first two bytes are 0xFE 0xFF (big endian), return
   CODING_CATEGORY_MASK_UTF_16_BE; if they are 0xFF 0xFE (little
   endian), return CODING_CATEGORY_MASK_UTF_16_LE; else return 0.  */
2328
/* Nonzero if the 16-bit code unit VAL is one of the two values that
   are never valid characters.  */
#define UTF_16_INVALID_P(val) \
  (((val) == 0xFFFE) \
   || ((val) == 0xFFFF))

/* Nonzero if the 16-bit code unit VAL is a high (leading) surrogate,
   i.e. in the range 0xD800..0xDBFF.  The top six bits select the
   surrogate block, so all of them must be compared; masking with
   0xD800 alone would also accept low surrogates and values such as
   0xFFFF.  */
#define UTF_16_HIGH_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xD800)

/* Nonzero if the 16-bit code unit VAL is a low (trailing) surrogate,
   i.e. in the range 0xDC00..0xDFFF.  */
#define UTF_16_LOW_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xDC00)
2338
2339 int
2340 detect_coding_utf_16 (src, src_end)
2341 unsigned char *src, *src_end;
2342 {
2343 unsigned char c1, c2;
2344 /* Dummy for TWO_MORE_BYTES. */
2345 struct coding_system dummy_coding;
2346 struct coding_system *coding = &dummy_coding;
2347
2348 TWO_MORE_BYTES (c1, c2);
2349
2350 if ((c1 == 0xFF) && (c2 == 0xFE))
2351 return CODING_CATEGORY_MASK_UTF_16_LE;
2352 else if ((c1 == 0xFE) && (c2 == 0xFF))
2353 return CODING_CATEGORY_MASK_UTF_16_BE;
2354
2355 label_end_of_loop:
2356 return 0;
2357 }
2358
2359 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
2360 If SJIS_P is 1, decode SJIS text, else decode BIG5 test. */
2361
2362 static void
2363 decode_coding_sjis_big5 (coding, source, destination,
2364 src_bytes, dst_bytes, sjis_p)
2365 struct coding_system *coding;
2366 unsigned char *source, *destination;
2367 int src_bytes, dst_bytes;
2368 int sjis_p;
2369 {
2370 unsigned char *src = source;
2371 unsigned char *src_end = source + src_bytes;
2372 unsigned char *dst = destination;
2373 unsigned char *dst_end = destination + dst_bytes;
2374 /* SRC_BASE remembers the start position in source in each loop.
2375 The loop will be exited when there's not enough source code
2376 (within macro ONE_MORE_BYTE), or when there's not enough
2377 destination area to produce a character (within macro
2378 EMIT_CHAR). */
2379 unsigned char *src_base;
2380 Lisp_Object translation_table;
2381
2382 if (NILP (Venable_character_translation))
2383 translation_table = Qnil;
2384 else
2385 {
2386 translation_table = coding->translation_table_for_decode;
2387 if (NILP (translation_table))
2388 translation_table = Vstandard_translation_table_for_decode;
2389 }
2390
2391 coding->produced_char = 0;
2392 while (1)
2393 {
2394 int c, charset, c1, c2;
2395
2396 src_base = src;
2397 ONE_MORE_BYTE (c1);
2398
2399 if (c1 < 0x80)
2400 {
2401 charset = CHARSET_ASCII;
2402 if (c1 < 0x20)
2403 {
2404 if (c1 == '\r')
2405 {
2406 if (coding->eol_type == CODING_EOL_CRLF)
2407 {
2408 ONE_MORE_BYTE (c2);
2409 if (c2 == '\n')
2410 c1 = c2;
2411 else if (coding->mode
2412 & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2413 {
2414 coding->result = CODING_FINISH_INCONSISTENT_EOL;
2415 goto label_end_of_loop;
2416 }
2417 else
2418 /* To process C2 again, SRC is subtracted by 1. */
2419 src--;
2420 }
2421 else if (coding->eol_type == CODING_EOL_CR)
2422 c1 = '\n';
2423 }
2424 else if (c1 == '\n'
2425 && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2426 && (coding->eol_type == CODING_EOL_CR
2427 || coding->eol_type == CODING_EOL_CRLF))
2428 {
2429 coding->result = CODING_FINISH_INCONSISTENT_EOL;
2430 goto label_end_of_loop;
2431 }
2432 }
2433 }
2434 else
2435 {
2436 if (sjis_p)
2437 {
2438 if (c1 >= 0xF0)
2439 goto label_invalid_code;
2440 if (c1 < 0xA0 || c1 >= 0xE0)
2441 {
2442 /* SJIS -> JISX0208 */
2443 ONE_MORE_BYTE (c2);
2444 if (c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
2445 goto label_invalid_code;
2446 DECODE_SJIS (c1, c2, c1, c2);
2447 charset = charset_jisx0208;
2448 }
2449 else
2450 /* SJIS -> JISX0201-Kana */
2451 charset = charset_katakana_jisx0201;
2452 }
2453 else
2454 {
2455 /* BIG5 -> Big5 */
2456 if (c1 < 0xA1 || c1 > 0xFE)
2457 goto label_invalid_code;
2458 ONE_MORE_BYTE (c2);
2459 if (c2 < 0x40 || (c2 > 0x7E && c2 < 0xA1) || c2 > 0xFE)
2460 goto label_invalid_code;
2461 DECODE_BIG5 (c1, c2, charset, c1, c2);
2462 }
2463 }
2464
2465 c = DECODE_ISO_CHARACTER (charset, c1, c2);
2466 EMIT_CHAR (c);
2467 continue;
2468
2469 label_invalid_code:
2470 coding->errors++;
2471 src = src_base;
2472 c = *src++;
2473 EMIT_CHAR (c);
2474 }
2475
2476 label_end_of_loop:
2477 coding->consumed = coding->consumed_char = src_base - source;
2478 coding->produced = dst - destination;
2479 return;
2480 }
2481
2482 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
2483 This function can encode charsets `ascii', `katakana-jisx0201',
2484 `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'. We
2485 are sure that all these charsets are registered as official charset
2486 (i.e. do not have extended leading-codes). Characters of other
2487 charsets are produced without any encoding. If SJIS_P is 1, encode
2488 SJIS text, else encode BIG5 text. */
2489
2490 static void
2491 encode_coding_sjis_big5 (coding, source, destination,
2492 src_bytes, dst_bytes, sjis_p)
2493 struct coding_system *coding;
2494 unsigned char *source, *destination;
2495 int src_bytes, dst_bytes;
2496 int sjis_p;
2497 {
2498 unsigned char *src = source;
2499 unsigned char *src_end = source + src_bytes;
2500 unsigned char *dst = destination;
2501 unsigned char *dst_end = destination + dst_bytes;
2502 /* SRC_BASE remembers the start position in source in each loop.
2503 The loop will be exited when there's not enough source text to
2504 analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
2505 there's not enough destination area to produce encoded codes
2506 (within macro EMIT_BYTES). */
2507 unsigned char *src_base;
2508 Lisp_Object translation_table;
2509
2510 if (NILP (Venable_character_translation))
2511 translation_table = Qnil;
2512 else
2513 {
2514 translation_table = coding->translation_table_for_decode;
2515 if (NILP (translation_table))
2516 translation_table = Vstandard_translation_table_for_decode;
2517 }
2518
2519 while (1)
2520 {
2521 int c, charset, c1, c2;
2522
2523 src_base = src;
2524 ONE_MORE_CHAR (c);
2525
2526 /* Now encode the character C. */
2527 if (SINGLE_BYTE_CHAR_P (c))
2528 {
2529 switch (c)
2530 {
2531 case '\r':
2532 if (!coding->mode & CODING_MODE_SELECTIVE_DISPLAY)
2533 {
2534 EMIT_ONE_BYTE (c);
2535 break;
2536 }
2537 c = '\n';
2538 case '\n':
2539 if (coding->eol_type == CODING_EOL_CRLF)
2540 {
2541 EMIT_TWO_BYTES ('\r', c);
2542 break;
2543 }
2544 else if (coding->eol_type == CODING_EOL_CR)
2545 c = '\r';
2546 default:
2547 EMIT_ONE_BYTE (c);
2548 }
2549 }
2550 else
2551 {
2552 SPLIT_CHAR (c, charset, c1, c2);
2553 if (sjis_p)
2554 {
2555 if (charset == charset_jisx0208
2556 || charset == charset_jisx0208_1978)
2557 {
2558 ENCODE_SJIS (c1, c2, c1, c2);
2559 EMIT_TWO_BYTES (c1, c2);
2560 }
2561 else if (charset == charset_latin_jisx0201)
2562 EMIT_ONE_BYTE (c1);
2563 else
2564 /* There's no way other than producing the internal
2565 codes as is. */
2566 EMIT_BYTES (src_base, src);
2567 }
2568 else
2569 {
2570 if (charset == charset_big5_1 || charset == charset_big5_2)
2571 {
2572 ENCODE_BIG5 (charset, c1, c2, c1, c2);
2573 EMIT_TWO_BYTES (c1, c2);
2574 }
2575 else
2576 /* There's no way other than producing the internal
2577 codes as is. */
2578 EMIT_BYTES (src_base, src);
2579 }
2580 }
2581 coding->consumed_char++;
2582 }
2583
2584 label_end_of_loop:
2585 coding->consumed = src_base - source;
2586 coding->produced = coding->produced_char = dst - destination;
2587 }
2588
2589 \f
2590 /*** 5. CCL handlers ***/
2591
2592 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2593 Check if a text is encoded in a coding system of which
2594 encoder/decoder are written in CCL program. If it is, return
2595 CODING_CATEGORY_MASK_CCL, else return 0. */
2596
2597 int
2598 detect_coding_ccl (src, src_end)
2599 unsigned char *src, *src_end;
2600 {
2601 unsigned char *valid;
2602 int c;
2603 /* Dummy for ONE_MORE_BYTE. */
2604 struct coding_system dummy_coding;
2605 struct coding_system *coding = &dummy_coding;
2606
2607 /* No coding system is assigned to coding-category-ccl. */
2608 if (!coding_system_table[CODING_CATEGORY_IDX_CCL])
2609 return 0;
2610
2611 valid = coding_system_table[CODING_CATEGORY_IDX_CCL]->spec.ccl.valid_codes;
2612 while (1)
2613 {
2614 ONE_MORE_BYTE (c);
2615 if (! valid[c])
2616 return 0;
2617 }
2618 label_end_of_loop:
2619 return CODING_CATEGORY_MASK_CCL;
2620 }
2621
2622 \f
2623 /*** 6. End-of-line handlers ***/
2624
2625 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
2626
2627 static void
2628 decode_eol (coding, source, destination, src_bytes, dst_bytes)
2629 struct coding_system *coding;
2630 unsigned char *source, *destination;
2631 int src_bytes, dst_bytes;
2632 {
2633 unsigned char *src = source;
2634 unsigned char *dst = destination;
2635 unsigned char *src_end = src + src_bytes;
2636 unsigned char *dst_end = dst + dst_bytes;
2637 Lisp_Object translation_table;
2638 /* SRC_BASE remembers the start position in source in each loop.
2639 The loop will be exited when there's not enough source code
2640 (within macro ONE_MORE_BYTE), or when there's not enough
2641 destination area to produce a character (within macro
2642 EMIT_CHAR). */
2643 unsigned char *src_base;
2644 int c;
2645
2646 translation_table = Qnil;
2647 switch (coding->eol_type)
2648 {
2649 case CODING_EOL_CRLF:
2650 while (1)
2651 {
2652 src_base = src;
2653 ONE_MORE_BYTE (c);
2654 if (c == '\r')
2655 {
2656 ONE_MORE_BYTE (c);
2657 if (c != '\n')
2658 {
2659 if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2660 {
2661 coding->result = CODING_FINISH_INCONSISTENT_EOL;
2662 goto label_end_of_loop;
2663 }
2664 src--;
2665 c = '\r';
2666 }
2667 }
2668 else if (c == '\n'
2669 && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL))
2670 {
2671 coding->result = CODING_FINISH_INCONSISTENT_EOL;
2672 goto label_end_of_loop;
2673 }
2674 EMIT_CHAR (c);
2675 }
2676 break;
2677
2678 case CODING_EOL_CR:
2679 while (1)
2680 {
2681 src_base = src;
2682 ONE_MORE_BYTE (c);
2683 if (c == '\n')
2684 {
2685 if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2686 {
2687 coding->result = CODING_FINISH_INCONSISTENT_EOL;
2688 goto label_end_of_loop;
2689 }
2690 }
2691 else if (c == '\r')
2692 c = '\n';
2693 EMIT_CHAR (c);
2694 }
2695 break;
2696
2697 default: /* no need for EOL handling */
2698 while (1)
2699 {
2700 src_base = src;
2701 ONE_MORE_BYTE (c);
2702 EMIT_CHAR (c);
2703 }
2704 }
2705
2706 label_end_of_loop:
2707 coding->consumed = coding->consumed_char = src_base - source;
2708 coding->produced = dst - destination;
2709 return;
2710 }
2711
2712 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions". Encode
2713 format of end-of-line according to `coding->eol_type'. It also
2714 convert multibyte form 8-bit characers to unibyte if
2715 CODING->src_multibyte is nonzero. If `coding->mode &
2716 CODING_MODE_SELECTIVE_DISPLAY' is nonzero, code '\r' in source text
2717 also means end-of-line. */
2718
2719 static void
2720 encode_eol (coding, source, destination, src_bytes, dst_bytes)
2721 struct coding_system *coding;
2722 unsigned char *source, *destination;
2723 int src_bytes, dst_bytes;
2724 {
2725 unsigned char *src = source;
2726 unsigned char *dst = destination;
2727 unsigned char *src_end = src + src_bytes;
2728 unsigned char *dst_end = dst + dst_bytes;
2729 Lisp_Object translation_table;
2730 /* SRC_BASE remembers the start position in source in each loop.
2731 The loop will be exited when there's not enough source text to
2732 analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
2733 there's not enough destination area to produce encoded codes
2734 (within macro EMIT_BYTES). */
2735 unsigned char *src_base;
2736 int c;
2737 int selective_display = coding->mode & CODING_MODE_SELECTIVE_DISPLAY;
2738
2739 translation_table = Qnil;
2740 if (coding->src_multibyte
2741 && *(src_end - 1) == LEADING_CODE_8_BIT_CONTROL)
2742 {
2743 src_end--;
2744 src_bytes--;
2745 coding->result = CODING_FINISH_INSUFFICIENT_SRC;
2746 }
2747
2748 if (coding->eol_type == CODING_EOL_CRLF)
2749 {
2750 while (src < src_end)
2751 {
2752 src_base = src;
2753 c = *src++;
2754 if (c >= 0x20)
2755 EMIT_ONE_BYTE (c);
2756 else if (c == '\n' || (c == '\r' && selective_display))
2757 EMIT_TWO_BYTES ('\r', '\n');
2758 else
2759 EMIT_ONE_BYTE (c);
2760 }
2761 src_base = src;
2762 label_end_of_loop:
2763 ;
2764 }
2765 else
2766 {
2767 if (src_bytes <= dst_bytes)
2768 {
2769 safe_bcopy (src, dst, src_bytes);
2770 src_base = src_end;
2771 dst += src_bytes;
2772 }
2773 else
2774 {
2775 if (coding->src_multibyte
2776 && *(src + dst_bytes - 1) == LEADING_CODE_8_BIT_CONTROL)
2777 dst_bytes--;
2778 safe_bcopy (src, dst, dst_bytes);
2779 src_base = src + dst_bytes;
2780 dst = destination + dst_bytes;
2781 coding->result = CODING_FINISH_INSUFFICIENT_DST;
2782 }
2783 if (coding->eol_type == CODING_EOL_CR)
2784 {
2785 for (src = destination; src < dst; src++)
2786 if (*src == '\n') *src = '\r';
2787 }
2788 else if (selective_display)
2789 {
2790 for (src = destination; src < dst; src++)
2791 if (*src == '\r') *src = '\n';
2792 }
2793 }
2794 if (coding->src_multibyte)
2795 dst = destination + str_as_unibyte (destination, dst - destination);
2796
2797 coding->consumed = src_base - source;
2798 coding->produced = dst - destination;
2799 }
2800
2801 \f
2802 /*** 7. C library functions ***/
2803
2804 /* In Emacs Lisp, coding system is represented by a Lisp symbol which
2805 has a property `coding-system'. The value of this property is a
2806 vector of length 5 (called as coding-vector). Among elements of
2807 this vector, the first (element[0]) and the fifth (element[4])
2808 carry important information for decoding/encoding. Before
2809 decoding/encoding, this information should be set in fields of a
2810 structure of type `coding_system'.
2811
2812 A value of property `coding-system' can be a symbol of another
2813 subsidiary coding-system. In that case, Emacs gets coding-vector
2814 from that symbol.
2815
2816 `element[0]' contains information to be set in `coding->type'. The
2817 value and its meaning is as follows:
2818
2819 0 -- coding_type_emacs_mule
2820 1 -- coding_type_sjis
2821 2 -- coding_type_iso2022
2822 3 -- coding_type_big5
2823 4 -- coding_type_ccl encoder/decoder written in CCL
2824 nil -- coding_type_no_conversion
2825 t -- coding_type_undecided (automatic conversion on decoding,
2826 no-conversion on encoding)
2827
2828 `element[4]' contains information to be set in `coding->flags' and
2829 `coding->spec'. The meaning varies by `coding->type'.
2830
2831 If `coding->type' is `coding_type_iso2022', element[4] is a vector
2832 of length 32 (of which the first 13 sub-elements are used now).
2833 Meanings of these sub-elements are:
2834
2835 sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
2836 If the value is an integer of valid charset, the charset is
2837 assumed to be designated to graphic register N initially.
2838
2839 If the value is minus, it is a minus value of charset which
2840 reserves graphic register N, which means that the charset is
2841 not designated initially but should be designated to graphic
2842 register N just before encoding a character in that charset.
2843
2844 If the value is nil, graphic register N is never used on
2845 encoding.
2846
2847 sub-element[N] where N is 4 through 11: to be set in `coding->flags'
2848 Each value takes t or nil. See the section ISO2022 of
2849 `coding.h' for more information.
2850
2851 If `coding->type' is `coding_type_big5', element[4] is t to denote
2852 BIG5-ETen or nil to denote BIG5-HKU.
2853
2854 If `coding->type' takes the other value, element[4] is ignored.
2855
2856 Emacs Lisp's coding system also carries information about format of
2857 end-of-line in a value of property `eol-type'. If the value is
2858 integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
2859 means CODING_EOL_CR. If it is not integer, it should be a vector
2860 of subsidiary coding systems of which property `eol-type' has one
2861 of above values.
2862
2863 */
2864
2865 /* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
2866 and set it in CODING. If CODING_SYSTEM_SYMBOL is invalid, CODING
2867 is setup so that no conversion is necessary and return -1, else
2868 return 0. */
2869
2870 int
2871 setup_coding_system (coding_system, coding)
2872 Lisp_Object coding_system;
2873 struct coding_system *coding;
2874 {
2875 Lisp_Object coding_spec, coding_type, eol_type, plist;
2876 Lisp_Object val;
2877 int i;
2878
2879 /* Initialize some fields required for all kinds of coding systems. */
2880 coding->symbol = coding_system;
2881 coding->common_flags = 0;
2882 coding->mode = 0;
2883 coding->heading_ascii = -1;
2884 coding->post_read_conversion = coding->pre_write_conversion = Qnil;
2885 coding->composing = COMPOSITION_DISABLED;
2886 coding->cmp_data = NULL;
2887
2888 if (NILP (coding_system))
2889 goto label_invalid_coding_system;
2890
2891 coding_spec = Fget (coding_system, Qcoding_system);
2892
2893 if (!VECTORP (coding_spec)
2894 || XVECTOR (coding_spec)->size != 5
2895 || !CONSP (XVECTOR (coding_spec)->contents[3]))
2896 goto label_invalid_coding_system;
2897
2898 eol_type = inhibit_eol_conversion ? Qnil : Fget (coding_system, Qeol_type);
2899 if (VECTORP (eol_type))
2900 {
2901 coding->eol_type = CODING_EOL_UNDECIDED;
2902 coding->common_flags = CODING_REQUIRE_DETECTION_MASK;
2903 }
2904 else if (XFASTINT (eol_type) == 1)
2905 {
2906 coding->eol_type = CODING_EOL_CRLF;
2907 coding->common_flags
2908 = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
2909 }
2910 else if (XFASTINT (eol_type) == 2)
2911 {
2912 coding->eol_type = CODING_EOL_CR;
2913 coding->common_flags
2914 = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
2915 }
2916 else
2917 coding->eol_type = CODING_EOL_LF;
2918
2919 coding_type = XVECTOR (coding_spec)->contents[0];
2920 /* Try short cut. */
2921 if (SYMBOLP (coding_type))
2922 {
2923 if (EQ (coding_type, Qt))
2924 {
2925 coding->type = coding_type_undecided;
2926 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
2927 }
2928 else
2929 coding->type = coding_type_no_conversion;
2930 return 0;
2931 }
2932
2933 /* Get values of coding system properties:
2934 `post-read-conversion', `pre-write-conversion',
2935 `translation-table-for-decode', `translation-table-for-encode'. */
2936 plist = XVECTOR (coding_spec)->contents[3];
2937 /* Pre & post conversion functions should be disabled if
2938 inhibit_pre_post_conversion is nonzero. This is the case that a code
2939 conversion function is called while those functions are running. */
2940 if (! inhibit_pre_post_conversion)
2941 {
2942 coding->post_read_conversion = Fplist_get (plist, Qpost_read_conversion);
2943 coding->pre_write_conversion = Fplist_get (plist, Qpre_write_conversion);
2944 }
2945 val = Fplist_get (plist, Qtranslation_table_for_decode);
2946 if (SYMBOLP (val))
2947 val = Fget (val, Qtranslation_table_for_decode);
2948 coding->translation_table_for_decode = CHAR_TABLE_P (val) ? val : Qnil;
2949 val = Fplist_get (plist, Qtranslation_table_for_encode);
2950 if (SYMBOLP (val))
2951 val = Fget (val, Qtranslation_table_for_encode);
2952 coding->translation_table_for_encode = CHAR_TABLE_P (val) ? val : Qnil;
2953 val = Fplist_get (plist, Qcoding_category);
2954 if (!NILP (val))
2955 {
2956 val = Fget (val, Qcoding_category_index);
2957 if (INTEGERP (val))
2958 coding->category_idx = XINT (val);
2959 else
2960 goto label_invalid_coding_system;
2961 }
2962 else
2963 goto label_invalid_coding_system;
2964
2965 val = Fplist_get (plist, Qsafe_charsets);
2966 if (EQ (val, Qt))
2967 {
2968 for (i = 0; i <= MAX_CHARSET; i++)
2969 coding->safe_charsets[i] = 1;
2970 }
2971 else
2972 {
2973 bzero (coding->safe_charsets, MAX_CHARSET + 1);
2974 while (CONSP (val))
2975 {
2976 if ((i = get_charset_id (XCAR (val))) >= 0)
2977 coding->safe_charsets[i] = 1;
2978 val = XCDR (val);
2979 }
2980 }
2981
2982 /* If the coding system has non-nil `composition' property, enable
2983 composition handling. */
2984 val = Fplist_get (plist, Qcomposition);
2985 if (!NILP (val))
2986 coding->composing = COMPOSITION_NO;
2987
2988 switch (XFASTINT (coding_type))
2989 {
2990 case 0:
2991 coding->type = coding_type_emacs_mule;
2992 if (!NILP (coding->post_read_conversion))
2993 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
2994 if (!NILP (coding->pre_write_conversion))
2995 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
2996 break;
2997
2998 case 1:
2999 coding->type = coding_type_sjis;
3000 coding->common_flags
3001 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3002 break;
3003
3004 case 2:
3005 coding->type = coding_type_iso2022;
3006 coding->common_flags
3007 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3008 {
3009 Lisp_Object val, temp;
3010 Lisp_Object *flags;
3011 int i, charset, reg_bits = 0;
3012
3013 val = XVECTOR (coding_spec)->contents[4];
3014
3015 if (!VECTORP (val) || XVECTOR (val)->size != 32)
3016 goto label_invalid_coding_system;
3017
3018 flags = XVECTOR (val)->contents;
3019 coding->flags
3020 = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
3021 | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
3022 | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
3023 | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
3024 | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
3025 | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
3026 | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
3027 | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
3028 | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION)
3029 | (NILP (flags[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL)
3030 | (NILP (flags[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL)
3031 | (NILP (flags[15]) ? 0 : CODING_FLAG_ISO_SAFE)
3032 | (NILP (flags[16]) ? 0 : CODING_FLAG_ISO_LATIN_EXTRA)
3033 );
3034
3035 /* Invoke graphic register 0 to plane 0. */
3036 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
3037 /* Invoke graphic register 1 to plane 1 if we can use full 8-bit. */
3038 CODING_SPEC_ISO_INVOCATION (coding, 1)
3039 = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
3040 /* Not single shifting at first. */
3041 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;
3042 /* Beginning of buffer should also be regarded as bol. */
3043 CODING_SPEC_ISO_BOL (coding) = 1;
3044
3045 for (charset = 0; charset <= MAX_CHARSET; charset++)
3046 CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = 255;
3047 val = Vcharset_revision_alist;
3048 while (CONSP (val))
3049 {
3050 charset = get_charset_id (Fcar_safe (XCAR (val)));
3051 if (charset >= 0
3052 && (temp = Fcdr_safe (XCAR (val)), INTEGERP (temp))
3053 && (i = XINT (temp), (i >= 0 && (i + '@') < 128)))
3054 CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = i;
3055 val = XCDR (val);
3056 }
3057
3058 /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
3059 FLAGS[REG] can be one of below:
3060 integer CHARSET: CHARSET occupies register I,
3061 t: designate nothing to REG initially, but can be used
3062 by any charsets,
3063 list of integer, nil, or t: designate the first
3064 element (if integer) to REG initially, the remaining
3065 elements (if integer) is designated to REG on request,
3066 if an element is t, REG can be used by any charsets,
3067 nil: REG is never used. */
3068 for (charset = 0; charset <= MAX_CHARSET; charset++)
3069 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3070 = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION;
3071 for (i = 0; i < 4; i++)
3072 {
3073 if (INTEGERP (flags[i])
3074 && (charset = XINT (flags[i]), CHARSET_VALID_P (charset))
3075 || (charset = get_charset_id (flags[i])) >= 0)
3076 {
3077 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3078 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
3079 }
3080 else if (EQ (flags[i], Qt))
3081 {
3082 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3083 reg_bits |= 1 << i;
3084 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
3085 }
3086 else if (CONSP (flags[i]))
3087 {
3088 Lisp_Object tail;
3089 tail = flags[i];
3090
3091 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
3092 if (INTEGERP (XCAR (tail))
3093 && (charset = XINT (XCAR (tail)),
3094 CHARSET_VALID_P (charset))
3095 || (charset = get_charset_id (XCAR (tail))) >= 0)
3096 {
3097 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3098 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
3099 }
3100 else
3101 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3102 tail = XCDR (tail);
3103 while (CONSP (tail))
3104 {
3105 if (INTEGERP (XCAR (tail))
3106 && (charset = XINT (XCAR (tail)),
3107 CHARSET_VALID_P (charset))
3108 || (charset = get_charset_id (XCAR (tail))) >= 0)
3109 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3110 = i;
3111 else if (EQ (XCAR (tail), Qt))
3112 reg_bits |= 1 << i;
3113 tail = XCDR (tail);
3114 }
3115 }
3116 else
3117 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3118
3119 CODING_SPEC_ISO_DESIGNATION (coding, i)
3120 = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
3121 }
3122
3123 if (reg_bits && ! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
3124 {
3125 /* REG 1 can be used only by locking shift in 7-bit env. */
3126 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
3127 reg_bits &= ~2;
3128 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
3129 /* Without any shifting, only REG 0 and 1 can be used. */
3130 reg_bits &= 3;
3131 }
3132
3133 if (reg_bits)
3134 for (charset = 0; charset <= MAX_CHARSET; charset++)
3135 {
3136 if (CHARSET_VALID_P (charset))
3137 {
3138 /* There exist some default graphic registers to be
3139 used CHARSET. */
3140
3141 /* We had better avoid designating a charset of
3142 CHARS96 to REG 0 as far as possible. */
3143 if (CHARSET_CHARS (charset) == 96)
3144 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3145 = (reg_bits & 2
3146 ? 1 : (reg_bits & 4 ? 2 : (reg_bits & 8 ? 3 : 0)));
3147 else
3148 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3149 = (reg_bits & 1
3150 ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
3151 }
3152 }
3153 }
3154 coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
3155 coding->spec.iso2022.last_invalid_designation_register = -1;
3156 break;
3157
3158 case 3:
3159 coding->type = coding_type_big5;
3160 coding->common_flags
3161 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3162 coding->flags
3163 = (NILP (XVECTOR (coding_spec)->contents[4])
3164 ? CODING_FLAG_BIG5_HKU
3165 : CODING_FLAG_BIG5_ETEN);
3166 break;
3167
3168 case 4:
3169 coding->type = coding_type_ccl;
3170 coding->common_flags
3171 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3172 {
3173 val = XVECTOR (coding_spec)->contents[4];
3174 if (! CONSP (val)
3175 || setup_ccl_program (&(coding->spec.ccl.decoder),
3176 XCAR (val)) < 0
3177 || setup_ccl_program (&(coding->spec.ccl.encoder),
3178 XCDR (val)) < 0)
3179 goto label_invalid_coding_system;
3180
3181 bzero (coding->spec.ccl.valid_codes, 256);
3182 val = Fplist_get (plist, Qvalid_codes);
3183 if (CONSP (val))
3184 {
3185 Lisp_Object this;
3186
3187 for (; CONSP (val); val = XCDR (val))
3188 {
3189 this = XCAR (val);
3190 if (INTEGERP (this)
3191 && XINT (this) >= 0 && XINT (this) < 256)
3192 coding->spec.ccl.valid_codes[XINT (this)] = 1;
3193 else if (CONSP (this)
3194 && INTEGERP (XCAR (this))
3195 && INTEGERP (XCDR (this)))
3196 {
3197 int start = XINT (XCAR (this));
3198 int end = XINT (XCDR (this));
3199
3200 if (start >= 0 && start <= end && end < 256)
3201 while (start <= end)
3202 coding->spec.ccl.valid_codes[start++] = 1;
3203 }
3204 }
3205 }
3206 }
3207 coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
3208 break;
3209
3210 case 5:
3211 coding->type = coding_type_raw_text;
3212 break;
3213
3214 default:
3215 goto label_invalid_coding_system;
3216 }
3217 return 0;
3218
3219 label_invalid_coding_system:
3220 coding->type = coding_type_no_conversion;
3221 coding->category_idx = CODING_CATEGORY_IDX_BINARY;
3222 coding->common_flags = 0;
3223 coding->eol_type = CODING_EOL_LF;
3224 coding->pre_write_conversion = coding->post_read_conversion = Qnil;
3225 return -1;
3226 }
3227
3228 /* Free memory blocks allocated for storing composition information. */
3229
3230 void
3231 coding_free_composition_data (coding)
3232 struct coding_system *coding;
3233 {
3234 struct composition_data *cmp_data = coding->cmp_data, *next;
3235
3236 if (!cmp_data)
3237 return;
3238 /* Memory blocks are chained. At first, rewind to the first, then,
3239 free blocks one by one. */
3240 while (cmp_data->prev)
3241 cmp_data = cmp_data->prev;
3242 while (cmp_data)
3243 {
3244 next = cmp_data->next;
3245 xfree (cmp_data);
3246 cmp_data = next;
3247 }
3248 coding->cmp_data = NULL;
3249 }
3250
3251 /* Set `char_offset' member of all memory blocks pointed by
3252 coding->cmp_data to POS. */
3253
3254 void
3255 coding_adjust_composition_offset (coding, pos)
3256 struct coding_system *coding;
3257 int pos;
3258 {
3259 struct composition_data *cmp_data;
3260
3261 for (cmp_data = coding->cmp_data; cmp_data; cmp_data = cmp_data->next)
3262 cmp_data->char_offset = pos;
3263 }
3264
3265 /* Setup raw-text or one of its subsidiaries in the structure
3266 coding_system CODING according to the already setup value eol_type
3267 in CODING. CODING should be setup for some coding system in
3268 advance. */
3269
3270 void
3271 setup_raw_text_coding_system (coding)
3272 struct coding_system *coding;
3273 {
3274 if (coding->type != coding_type_raw_text)
3275 {
3276 coding->symbol = Qraw_text;
3277 coding->type = coding_type_raw_text;
3278 if (coding->eol_type != CODING_EOL_UNDECIDED)
3279 {
3280 Lisp_Object subsidiaries;
3281 subsidiaries = Fget (Qraw_text, Qeol_type);
3282
3283 if (VECTORP (subsidiaries)
3284 && XVECTOR (subsidiaries)->size == 3)
3285 coding->symbol
3286 = XVECTOR (subsidiaries)->contents[coding->eol_type];
3287 }
3288 setup_coding_system (coding->symbol, coding);
3289 }
3290 return;
3291 }
3292
3293 /* Emacs has a mechanism to automatically detect a coding system if it
3294 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
3295 it's impossible to distinguish some coding systems accurately
3296 because they use the same range of codes. So, at first, coding
3297 systems are grouped into the following categories:
3298
3299 o coding-category-emacs-mule
3300
3301 The category for a coding system which has the same code range
3302 as Emacs' internal format. Assigned the coding-system (Lisp
3303 symbol) `emacs-mule' by default.
3304
3305 o coding-category-sjis
3306
3307 The category for a coding system which has the same code range
3308 as SJIS. Assigned the coding-system (Lisp
3309 symbol) `japanese-shift-jis' by default.
3310
3311 o coding-category-iso-7
3312
3313 The category for a coding system which has the same code range
3314 as ISO2022 of 7-bit environment. This doesn't use any locking
3315 shift and single shift functions. This can encode/decode all
3316 charsets. Assigned the coding-system (Lisp symbol)
3317 `iso-2022-7bit' by default.
3318
3319 o coding-category-iso-7-tight
3320
3321 Same as coding-category-iso-7 except that this can
3322 encode/decode only the specified charsets.
3323
3324 o coding-category-iso-8-1
3325
3326 The category for a coding system which has the same code range
3327 as ISO2022 of 8-bit environment and graphic plane 1 used only
3328 for DIMENSION1 charset. This doesn't use any locking shift
3329 and single shift functions. Assigned the coding-system (Lisp
3330 symbol) `iso-latin-1' by default.
3331
3332 o coding-category-iso-8-2
3333
3334 The category for a coding system which has the same code range
3335 as ISO2022 of 8-bit environment and graphic plane 1 used only
3336 for DIMENSION2 charset. This doesn't use any locking shift
3337 and single shift functions. Assigned the coding-system (Lisp
3338 symbol) `japanese-iso-8bit' by default.
3339
3340 o coding-category-iso-7-else
3341
3342 The category for a coding system which has the same code range
3343 as ISO2022 of 7-bit environment but uses locking shift or
3344 single shift functions. Assigned the coding-system (Lisp
3345 symbol) `iso-2022-7bit-lock' by default.
3346
3347 o coding-category-iso-8-else
3348
3349 The category for a coding system which has the same code range
3350 as ISO2022 of 8-bit environment but uses locking shift or
3351 single shift functions. Assigned the coding-system (Lisp
3352 symbol) `iso-2022-8bit-ss2' by default.
3353
3354 o coding-category-big5
3355
3356 The category for a coding system which has the same code range
3357 as BIG5. Assigned the coding-system (Lisp symbol)
3358 `cn-big5' by default.
3359
3360 o coding-category-utf-8
3361
3362 The category for a coding system which has the same code range
3363 as UTF-8 (cf. RFC2279). Assigned the coding-system (Lisp
3364 symbol) `utf-8' by default.
3365
3366 o coding-category-utf-16-be
3367
3368 The category for a coding system in which a text has an
3369 Unicode signature (cf. Unicode Standard) in the order of BIG
3370 endian at the head. Assigned the coding-system (Lisp symbol)
3371 `utf-16-be' by default.
3372
3373 o coding-category-utf-16-le
3374
3375 The category for a coding system in which a text has an
3376 Unicode signature (cf. Unicode Standard) in the order of
3377 LITTLE endian at the head. Assigned the coding-system (Lisp
3378 symbol) `utf-16-le' by default.
3379
3380 o coding-category-ccl
3381
3382 The category for a coding system of which encoder/decoder is
3383 written in CCL programs. The default value is nil, i.e., no
3384 coding system is assigned.
3385
3386 o coding-category-binary
3387
3388 The category for a coding system not categorized in any of the
3389 above. Assigned the coding-system (Lisp symbol)
3390 `no-conversion' by default.
3391
3392 Each of them is a Lisp symbol and the value is an actual
3393 `coding-system's (this is also a Lisp symbol) assigned by a user.
3394 What Emacs does actually is to detect a category of coding system.
3395 Then, it uses a `coding-system' assigned to it. If Emacs can't
3396 decide only one possible category, it selects a category of the
3397 highest priority. Priorities of categories are also specified by a
3398 user in a Lisp variable `coding-category-list'.
3399
3400 */
3401
3402 static
3403 int ascii_skip_code[256];
3404
3405 /* Detect how a text of length SRC_BYTES pointed by SOURCE is encoded.
3406 If it detects possible coding systems, return an integer in which
3407 appropriate flag bits are set. Flag bits are defined by macros
3408 CODING_CATEGORY_MASK_XXX in `coding.h'. If PRIORITIES is non-NULL,
3409 it should point the table `coding_priorities'. In that case, only
3410 the flag bit for a coding system of the highest priority is set in
3411 the returned value.
3412
3413 How many ASCII characters are at the head is returned as *SKIP. */
3414
3415 static int
3416 detect_coding_mask (source, src_bytes, priorities, skip)
3417 unsigned char *source;
3418 int src_bytes, *priorities, *skip;
3419 {
3420 register unsigned char c;
3421 unsigned char *src = source, *src_end = source + src_bytes;
3422 unsigned int mask, utf16_examined_p, iso2022_examined_p;
3423 int i, idx;
3424
3425 /* At first, skip all ASCII characters and control characters except
3426 for three ISO2022 specific control characters. */
3427 ascii_skip_code[ISO_CODE_SO] = 0;
3428 ascii_skip_code[ISO_CODE_SI] = 0;
3429 ascii_skip_code[ISO_CODE_ESC] = 0;
3430
3431 label_loop_detect_coding:
3432 while (src < src_end && ascii_skip_code[*src]) src++;
3433 *skip = src - source;
3434
3435 if (src >= src_end)
3436 /* We found nothing other than ASCII. There's nothing to do. */
3437 return 0;
3438
3439 c = *src;
3440 /* The text seems to be encoded in some multilingual coding system.
3441 Now, try to find in which coding system the text is encoded. */
3442 if (c < 0x80)
3443 {
3444 /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
3445 /* C is an ISO2022 specific control code of C0. */
3446 mask = detect_coding_iso2022 (src, src_end);
3447 if (mask == 0)
3448 {
3449 /* No valid ISO2022 code follows C. Try again. */
3450 src++;
3451 if (c == ISO_CODE_ESC)
3452 ascii_skip_code[ISO_CODE_ESC] = 1;
3453 else
3454 ascii_skip_code[ISO_CODE_SO] = ascii_skip_code[ISO_CODE_SI] = 1;
3455 goto label_loop_detect_coding;
3456 }
3457 if (priorities)
3458 {
3459 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3460 {
3461 if (mask & priorities[i])
3462 return priorities[i];
3463 }
3464 return CODING_CATEGORY_MASK_RAW_TEXT;
3465 }
3466 }
3467 else
3468 {
3469 int try;
3470
3471 if (c < 0xA0)
3472 {
3473 /* C is the first byte of SJIS character code,
3474 or a leading-code of Emacs' internal format (emacs-mule),
3475 or the first byte of UTF-16. */
3476 try = (CODING_CATEGORY_MASK_SJIS
3477 | CODING_CATEGORY_MASK_EMACS_MULE
3478 | CODING_CATEGORY_MASK_UTF_16_BE
3479 | CODING_CATEGORY_MASK_UTF_16_LE);
3480
3481 /* Or, if C is a special latin extra code,
3482 or is an ISO2022 specific control code of C1 (SS2 or SS3),
3483 or is an ISO2022 control-sequence-introducer (CSI),
3484 we should also consider the possibility of ISO2022 codings. */
3485 if ((VECTORP (Vlatin_extra_code_table)
3486 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
3487 || (c == ISO_CODE_SS2 || c == ISO_CODE_SS3)
3488 || (c == ISO_CODE_CSI
3489 && (src < src_end
3490 && (*src == ']'
3491 || ((*src == '0' || *src == '1' || *src == '2')
3492 && src + 1 < src_end
3493 && src[1] == ']')))))
3494 try |= (CODING_CATEGORY_MASK_ISO_8_ELSE
3495 | CODING_CATEGORY_MASK_ISO_8BIT);
3496 }
3497 else
3498 /* C is a character of ISO2022 in graphic plane right,
3499 or a SJIS's 1-byte character code (i.e. JISX0201),
3500 or the first byte of BIG5's 2-byte code,
3501 or the first byte of UTF-8/16. */
3502 try = (CODING_CATEGORY_MASK_ISO_8_ELSE
3503 | CODING_CATEGORY_MASK_ISO_8BIT
3504 | CODING_CATEGORY_MASK_SJIS
3505 | CODING_CATEGORY_MASK_BIG5
3506 | CODING_CATEGORY_MASK_UTF_8
3507 | CODING_CATEGORY_MASK_UTF_16_BE
3508 | CODING_CATEGORY_MASK_UTF_16_LE);
3509
3510 /* Or, we may have to consider the possibility of CCL. */
3511 if (coding_system_table[CODING_CATEGORY_IDX_CCL]
3512 && (coding_system_table[CODING_CATEGORY_IDX_CCL]
3513 ->spec.ccl.valid_codes)[c])
3514 try |= CODING_CATEGORY_MASK_CCL;
3515
3516 mask = 0;
3517 utf16_examined_p = iso2022_examined_p = 0;
3518 if (priorities)
3519 {
3520 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3521 {
3522 if (!iso2022_examined_p
3523 && (priorities[i] & try & CODING_CATEGORY_MASK_ISO))
3524 {
3525 mask |= detect_coding_iso2022 (src, src_end);
3526 iso2022_examined_p = 1;
3527 }
3528 else if (priorities[i] & try & CODING_CATEGORY_MASK_SJIS)
3529 mask |= detect_coding_sjis (src, src_end);
3530 else if (priorities[i] & try & CODING_CATEGORY_MASK_UTF_8)
3531 mask |= detect_coding_utf_8 (src, src_end);
3532 else if (!utf16_examined_p
3533 && (priorities[i] & try &
3534 CODING_CATEGORY_MASK_UTF_16_BE_LE))
3535 {
3536 mask |= detect_coding_utf_16 (src, src_end);
3537 utf16_examined_p = 1;
3538 }
3539 else if (priorities[i] & try & CODING_CATEGORY_MASK_BIG5)
3540 mask |= detect_coding_big5 (src, src_end);
3541 else if (priorities[i] & try & CODING_CATEGORY_MASK_EMACS_MULE)
3542 mask |= detect_coding_emacs_mule (src, src_end);
3543 else if (priorities[i] & try & CODING_CATEGORY_MASK_CCL)
3544 mask |= detect_coding_ccl (src, src_end);
3545 else if (priorities[i] & CODING_CATEGORY_MASK_RAW_TEXT)
3546 mask |= CODING_CATEGORY_MASK_RAW_TEXT;
3547 else if (priorities[i] & CODING_CATEGORY_MASK_BINARY)
3548 mask |= CODING_CATEGORY_MASK_BINARY;
3549 if (mask & priorities[i])
3550 return priorities[i];
3551 }
3552 return CODING_CATEGORY_MASK_RAW_TEXT;
3553 }
3554 if (try & CODING_CATEGORY_MASK_ISO)
3555 mask |= detect_coding_iso2022 (src, src_end);
3556 if (try & CODING_CATEGORY_MASK_SJIS)
3557 mask |= detect_coding_sjis (src, src_end);
3558 if (try & CODING_CATEGORY_MASK_BIG5)
3559 mask |= detect_coding_big5 (src, src_end);
3560 if (try & CODING_CATEGORY_MASK_UTF_8)
3561 mask |= detect_coding_utf_8 (src, src_end);
3562 if (try & CODING_CATEGORY_MASK_UTF_16_BE_LE)
3563 mask |= detect_coding_utf_16 (src, src_end);
3564 if (try & CODING_CATEGORY_MASK_EMACS_MULE)
3565 mask |= detect_coding_emacs_mule (src, src_end);
3566 if (try & CODING_CATEGORY_MASK_CCL)
3567 mask |= detect_coding_ccl (src, src_end);
3568 }
3569 return (mask | CODING_CATEGORY_MASK_RAW_TEXT | CODING_CATEGORY_MASK_BINARY);
3570 }
3571
3572 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
3573 The information of the detected coding system is set in CODING. */
3574
3575 void
3576 detect_coding (coding, src, src_bytes)
3577 struct coding_system *coding;
3578 unsigned char *src;
3579 int src_bytes;
3580 {
3581 unsigned int idx;
3582 int skip, mask, i;
3583 Lisp_Object val;
3584
3585 val = Vcoding_category_list;
3586 mask = detect_coding_mask (src, src_bytes, coding_priorities, &skip);
3587 coding->heading_ascii = skip;
3588
3589 if (!mask) return;
3590
3591 /* We found a single coding system of the highest priority in MASK. */
3592 idx = 0;
3593 while (mask && ! (mask & 1)) mask >>= 1, idx++;
3594 if (! mask)
3595 idx = CODING_CATEGORY_IDX_RAW_TEXT;
3596
3597 val = XSYMBOL (XVECTOR (Vcoding_category_table)->contents[idx])->value;
3598
3599 if (coding->eol_type != CODING_EOL_UNDECIDED)
3600 {
3601 Lisp_Object tmp;
3602
3603 tmp = Fget (val, Qeol_type);
3604 if (VECTORP (tmp))
3605 val = XVECTOR (tmp)->contents[coding->eol_type];
3606 }
3607
3608 /* Setup this new coding system while preserving some slots. */
3609 {
3610 int src_multibyte = coding->src_multibyte;
3611 int dst_multibyte = coding->dst_multibyte;
3612
3613 setup_coding_system (val, coding);
3614 coding->src_multibyte = src_multibyte;
3615 coding->dst_multibyte = dst_multibyte;
3616 coding->heading_ascii = skip;
3617 }
3618 }
3619
3620 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
3621 SOURCE is encoded. Return one of CODING_EOL_LF, CODING_EOL_CRLF,
3622 CODING_EOL_CR, and CODING_EOL_UNDECIDED.
3623
3624 How many non-eol characters are at the head is returned as *SKIP. */
3625
3626 #define MAX_EOL_CHECK_COUNT 3
3627
3628 static int
3629 detect_eol_type (source, src_bytes, skip)
3630 unsigned char *source;
3631 int src_bytes, *skip;
3632 {
3633 unsigned char *src = source, *src_end = src + src_bytes;
3634 unsigned char c;
3635 int total = 0; /* How many end-of-lines are found so far. */
3636 int eol_type = CODING_EOL_UNDECIDED;
3637 int this_eol_type;
3638
3639 *skip = 0;
3640
3641 while (src < src_end && total < MAX_EOL_CHECK_COUNT)
3642 {
3643 c = *src++;
3644 if (c == '\n' || c == '\r')
3645 {
3646 if (*skip == 0)
3647 *skip = src - 1 - source;
3648 total++;
3649 if (c == '\n')
3650 this_eol_type = CODING_EOL_LF;
3651 else if (src >= src_end || *src != '\n')
3652 this_eol_type = CODING_EOL_CR;
3653 else
3654 this_eol_type = CODING_EOL_CRLF, src++;
3655
3656 if (eol_type == CODING_EOL_UNDECIDED)
3657 /* This is the first end-of-line. */
3658 eol_type = this_eol_type;
3659 else if (eol_type != this_eol_type)
3660 {
3661 /* The found type is different from what found before. */
3662 eol_type = CODING_EOL_INCONSISTENT;
3663 break;
3664 }
3665 }
3666 }
3667
3668 if (*skip == 0)
3669 *skip = src_end - source;
3670 return eol_type;
3671 }
3672
3673 /* Like detect_eol_type, but detect EOL type in 2-octet
3674 big-endian/little-endian format for coding systems utf-16-be and
3675 utf-16-le. */
3676
3677 static int
3678 detect_eol_type_in_2_octet_form (source, src_bytes, skip, big_endian_p)
3679 unsigned char *source;
3680 int src_bytes, *skip;
3681 {
3682 unsigned char *src = source, *src_end = src + src_bytes;
3683 unsigned int c1, c2;
3684 int total = 0; /* How many end-of-lines are found so far. */
3685 int eol_type = CODING_EOL_UNDECIDED;
3686 int this_eol_type;
3687 int msb, lsb;
3688
3689 if (big_endian_p)
3690 msb = 0, lsb = 1;
3691 else
3692 msb = 1, lsb = 0;
3693
3694 *skip = 0;
3695
3696 while ((src + 1) < src_end && total < MAX_EOL_CHECK_COUNT)
3697 {
3698 c1 = (src[msb] << 8) | (src[lsb]);
3699 src += 2;
3700
3701 if (c1 == '\n' || c1 == '\r')
3702 {
3703 if (*skip == 0)
3704 *skip = src - 2 - source;
3705 total++;
3706 if (c1 == '\n')
3707 {
3708 this_eol_type = CODING_EOL_LF;
3709 }
3710 else
3711 {
3712 if ((src + 1) >= src_end)
3713 {
3714 this_eol_type = CODING_EOL_CR;
3715 }
3716 else
3717 {
3718 c2 = (src[msb] << 8) | (src[lsb]);
3719 if (c2 == '\n')
3720 this_eol_type = CODING_EOL_CRLF, src += 2;
3721 else
3722 this_eol_type = CODING_EOL_CR;
3723 }
3724 }
3725
3726 if (eol_type == CODING_EOL_UNDECIDED)
3727 /* This is the first end-of-line. */
3728 eol_type = this_eol_type;
3729 else if (eol_type != this_eol_type)
3730 {
3731 /* The found type is different from what found before. */
3732 eol_type = CODING_EOL_INCONSISTENT;
3733 break;
3734 }
3735 }
3736 }
3737
3738 if (*skip == 0)
3739 *skip = src_end - source;
3740 return eol_type;
3741 }
3742
3743 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
3744 is encoded. If it detects an appropriate format of end-of-line, it
3745 sets the information in *CODING. */
3746
3747 void
3748 detect_eol (coding, src, src_bytes)
3749 struct coding_system *coding;
3750 unsigned char *src;
3751 int src_bytes;
3752 {
3753 Lisp_Object val;
3754 int skip;
3755 int eol_type;
3756
3757 switch (coding->category_idx)
3758 {
3759 case CODING_CATEGORY_IDX_UTF_16_BE:
3760 eol_type = detect_eol_type_in_2_octet_form (src, src_bytes, &skip, 1);
3761 break;
3762 case CODING_CATEGORY_IDX_UTF_16_LE:
3763 eol_type = detect_eol_type_in_2_octet_form (src, src_bytes, &skip, 0);
3764 break;
3765 default:
3766 eol_type = detect_eol_type (src, src_bytes, &skip);
3767 break;
3768 }
3769
3770 if (coding->heading_ascii > skip)
3771 coding->heading_ascii = skip;
3772 else
3773 skip = coding->heading_ascii;
3774
3775 if (eol_type == CODING_EOL_UNDECIDED)
3776 return;
3777 if (eol_type == CODING_EOL_INCONSISTENT)
3778 {
3779 #if 0
3780 /* This code is suppressed until we find a better way to
3781 distinguish raw text file and binary file. */
3782
3783 /* If we have already detected that the coding is raw-text, the
3784 coding should actually be no-conversion. */
3785 if (coding->type == coding_type_raw_text)
3786 {
3787 setup_coding_system (Qno_conversion, coding);
3788 return;
3789 }
3790 /* Else, let's decode only text code anyway. */
3791 #endif /* 0 */
3792 eol_type = CODING_EOL_LF;
3793 }
3794
3795 val = Fget (coding->symbol, Qeol_type);
3796 if (VECTORP (val) && XVECTOR (val)->size == 3)
3797 {
3798 int src_multibyte = coding->src_multibyte;
3799 int dst_multibyte = coding->dst_multibyte;
3800
3801 setup_coding_system (XVECTOR (val)->contents[eol_type], coding);
3802 coding->src_multibyte = src_multibyte;
3803 coding->dst_multibyte = dst_multibyte;
3804 coding->heading_ascii = skip;
3805 }
3806 }
3807
/* Extra bytes added on top of the estimated size of a conversion
   buffer by decoding_buffer_size and encoding_buffer_size.  */
#define CONVERSION_BUFFER_EXTRA_ROOM 256

/* Factor by which decoded text may grow relative to its source for
   CODING (a pointer to struct coding_system): 3 for ISO2022, the CCL
   decoder's own buf_magnification for CCL, and 2 otherwise.  CODING
   is parenthesized so that any pointer expression may be passed; it
   is evaluated more than once.  */
#define DECODING_BUFFER_MAG(coding)                             \
  ((coding)->type == coding_type_iso2022                        \
   ? 3                                                          \
   : ((coding)->type == coding_type_ccl                         \
      ? (coding)->spec.ccl.decoder.buf_magnification            \
      : 2))
3816
3817 /* Return maximum size (bytes) of a buffer enough for decoding
3818 SRC_BYTES of text encoded in CODING. */
3819
3820 int
3821 decoding_buffer_size (coding, src_bytes)
3822 struct coding_system *coding;
3823 int src_bytes;
3824 {
3825 return (src_bytes * DECODING_BUFFER_MAG (coding)
3826 + CONVERSION_BUFFER_EXTRA_ROOM);
3827 }
3828
3829 /* Return maximum size (bytes) of a buffer enough for encoding
3830 SRC_BYTES of text to CODING. */
3831
3832 int
3833 encoding_buffer_size (coding, src_bytes)
3834 struct coding_system *coding;
3835 int src_bytes;
3836 {
3837 int magnification;
3838
3839 if (coding->type == coding_type_ccl)
3840 magnification = coding->spec.ccl.encoder.buf_magnification;
3841 else if (CODING_REQUIRE_ENCODING (coding))
3842 magnification = 3;
3843 else
3844 magnification = 1;
3845
3846 return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
3847 }
3848
/* Smallest capacity, in bytes, ever chosen when the conversion buffer
   has to be grown from an empty state.  May be overridden at build
   time.  */
#ifndef MINIMUM_CONVERSION_BUFFER_SIZE
#define MINIMUM_CONVERSION_BUFFER_SIZE 1024
#endif

/* Scratch buffer handed out by get_conversion_buffer, and its current
   capacity in bytes.  */
char *conversion_buffer;
int conversion_buffer_size;

/* Return a pointer to a SIZE bytes of buffer to be used for encoding
   or decoding.  Sufficient memory is allocated automatically: when
   SIZE exceeds the current capacity, the capacity is doubled until it
   is large enough, a new block is allocated and the old one is freed
   (its contents are not copied).

   NOTE(review): the result of xmalloc is not checked here, so this
   function never returns NULL by itself; presumably xmalloc does not
   return on failure -- confirm.  */

char *
get_conversion_buffer (size)
     int size;
{
  if (size > conversion_buffer_size)
    {
      char *buf;
      int real_size = conversion_buffer_size * 2;

      /* Doubling 0 stays 0 forever, so start from the minimum when
         no capacity has been recorded yet.  */
      if (real_size <= 0)
        real_size = MINIMUM_CONVERSION_BUFFER_SIZE;
      while (real_size < size) real_size *= 2;
      buf = (char *) xmalloc (real_size);
      xfree (conversion_buffer);
      conversion_buffer = buf;
      conversion_buffer_size = real_size;
    }
  return conversion_buffer;
}
3877
3878 int
3879 ccl_coding_driver (coding, source, destination, src_bytes, dst_bytes, encodep)
3880 struct coding_system *coding;
3881 unsigned char *source, *destination;
3882 int src_bytes, dst_bytes, encodep;
3883 {
3884 struct ccl_program *ccl
3885 = encodep ? &coding->spec.ccl.encoder : &coding->spec.ccl.decoder;
3886 int result;
3887
3888 ccl->last_block = coding->mode & CODING_MODE_LAST_BLOCK;
3889
3890 coding->produced = ccl_driver (ccl, source, destination,
3891 src_bytes, dst_bytes, &(coding->consumed));
3892 if (encodep)
3893 coding->produced_char = coding->produced;
3894 else
3895 {
3896 int bytes
3897 = dst_bytes ? dst_bytes : source + coding->consumed - destination;
3898 coding->produced = str_as_multibyte (destination, bytes,
3899 coding->produced,
3900 &(coding->produced_char));
3901 }
3902
3903 switch (ccl->status)
3904 {
3905 case CCL_STAT_SUSPEND_BY_SRC:
3906 result = CODING_FINISH_INSUFFICIENT_SRC;
3907 break;
3908 case CCL_STAT_SUSPEND_BY_DST:
3909 result = CODING_FINISH_INSUFFICIENT_DST;
3910 break;
3911 case CCL_STAT_QUIT:
3912 case CCL_STAT_INVALID_CMD:
3913 result = CODING_FINISH_INTERRUPT;
3914 break;
3915 default:
3916 result = CODING_FINISH_NORMAL;
3917 break;
3918 }
3919 return result;
3920 }
3921
3922 /* See "GENERAL NOTES about `decode_coding_XXX ()' functions". Before
3923 decoding, it may detect coding system and format of end-of-line if
3924 those are not yet decided. The source should be unibyte, the
3925 result is multibyte if CODING->dst_multibyte is nonzero, else
3926 unibyte. */
3927
3928 int
3929 decode_coding (coding, source, destination, src_bytes, dst_bytes)
3930 struct coding_system *coding;
3931 unsigned char *source, *destination;
3932 int src_bytes, dst_bytes;
3933 {
3934 if (coding->type == coding_type_undecided)
3935 detect_coding (coding, source, src_bytes);
3936
3937 if (coding->eol_type == CODING_EOL_UNDECIDED)
3938 detect_eol (coding, source, src_bytes);
3939
3940 coding->produced = coding->produced_char = 0;
3941 coding->consumed = coding->consumed_char = 0;
3942 coding->errors = 0;
3943 coding->result = CODING_FINISH_NORMAL;
3944
3945 switch (coding->type)
3946 {
3947 case coding_type_sjis:
3948 decode_coding_sjis_big5 (coding, source, destination,
3949 src_bytes, dst_bytes, 1);
3950 break;
3951
3952 case coding_type_iso2022:
3953 decode_coding_iso2022 (coding, source, destination,
3954 src_bytes, dst_bytes);
3955 break;
3956
3957 case coding_type_big5:
3958 decode_coding_sjis_big5 (coding, source, destination,
3959 src_bytes, dst_bytes, 0);
3960 break;
3961
3962 case coding_type_emacs_mule:
3963 decode_coding_emacs_mule (coding, source, destination,
3964 src_bytes, dst_bytes);
3965 break;
3966
3967 case coding_type_ccl:
3968 ccl_coding_driver (coding, source, destination,
3969 src_bytes, dst_bytes, 0);
3970 break;
3971
3972 default:
3973 decode_eol (coding, source, destination, src_bytes, dst_bytes);
3974 }
3975
3976 if (coding->result == CODING_FINISH_INSUFFICIENT_SRC
3977 && coding->consumed == src_bytes)
3978 coding->result = CODING_FINISH_NORMAL;
3979
3980 if (coding->mode & CODING_MODE_LAST_BLOCK
3981 && coding->result == CODING_FINISH_INSUFFICIENT_SRC)
3982 {
3983 unsigned char *src = source + coding->consumed;
3984 unsigned char *dst = destination + coding->produced;
3985
3986 src_bytes -= coding->consumed;
3987 coding->errors++;
3988 if (COMPOSING_P (coding))
3989 DECODE_COMPOSITION_END ('1');
3990 while (src_bytes--)
3991 {
3992 int c = *src++;
3993 dst += CHAR_STRING (c, dst);
3994 coding->produced_char++;
3995 }
3996 coding->consumed = coding->consumed_char = src - source;
3997 coding->produced = dst - destination;
3998 }
3999
4000 if (!coding->dst_multibyte)
4001 {
4002 coding->produced = str_as_unibyte (destination, coding->produced);
4003 coding->produced_char = coding->produced;
4004 }
4005
4006 return coding->result;
4007 }
4008
4009 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions". The
4010 multibyteness of the source is CODING->src_multibyte, the
4011 multibyteness of the result is always unibyte. */
4012
4013 int
4014 encode_coding (coding, source, destination, src_bytes, dst_bytes)
4015 struct coding_system *coding;
4016 unsigned char *source, *destination;
4017 int src_bytes, dst_bytes;
4018 {
4019 coding->produced = coding->produced_char = 0;
4020 coding->consumed = coding->consumed_char = 0;
4021 coding->errors = 0;
4022 coding->result = CODING_FINISH_NORMAL;
4023
4024 switch (coding->type)
4025 {
4026 case coding_type_sjis:
4027 encode_coding_sjis_big5 (coding, source, destination,
4028 src_bytes, dst_bytes, 1);
4029 break;
4030
4031 case coding_type_iso2022:
4032 encode_coding_iso2022 (coding, source, destination,
4033 src_bytes, dst_bytes);
4034 break;
4035
4036 case coding_type_big5:
4037 encode_coding_sjis_big5 (coding, source, destination,
4038 src_bytes, dst_bytes, 0);
4039 break;
4040
4041 case coding_type_emacs_mule:
4042 encode_coding_emacs_mule (coding, source, destination,
4043 src_bytes, dst_bytes);
4044 break;
4045
4046 case coding_type_ccl:
4047 ccl_coding_driver (coding, source, destination,
4048 src_bytes, dst_bytes, 1);
4049 break;
4050
4051 default:
4052 encode_eol (coding, source, destination, src_bytes, dst_bytes);
4053 }
4054
4055 if (coding->result == CODING_FINISH_INSUFFICIENT_SRC
4056 && coding->consumed == src_bytes)
4057 coding->result = CODING_FINISH_NORMAL;
4058
4059 if (coding->mode & CODING_MODE_LAST_BLOCK)
4060 {
4061 unsigned char *src = source + coding->consumed;
4062 unsigned char *src_end = src + src_bytes;
4063 unsigned char *dst = destination + coding->produced;
4064
4065 if (coding->type == coding_type_iso2022)
4066 ENCODE_RESET_PLANE_AND_REGISTER;
4067 if (COMPOSING_P (coding))
4068 *dst++ = ISO_CODE_ESC, *dst++ = '1';
4069 if (coding->consumed < src_bytes)
4070 {
4071 int len = src_bytes - coding->consumed;
4072
4073 BCOPY_SHORT (source + coding->consumed, dst, len);
4074 if (coding->src_multibyte)
4075 len = str_as_unibyte (dst, len);
4076 dst += len;
4077 coding->consumed = src_bytes;
4078 }
4079 coding->produced = coding->produced_char = dst - destination;
4080 }
4081
4082 return coding->result;
4083 }
4084
4085 /* Scan text in the region between *BEG and *END (byte positions),
4086 skip characters which we don't have to decode by coding system
4087 CODING at the head and tail, then set *BEG and *END to the region
4088 of the text we actually have to convert. The caller should move
4089 the gap out of the region in advance if the region is from a
4090 buffer.
4091
4092 If STR is not NULL, *BEG and *END are indices into STR. */
4093
4094 static void
4095 shrink_decoding_region (beg, end, coding, str)
4096 int *beg, *end;
4097 struct coding_system *coding;
4098 unsigned char *str;
4099 {
4100 unsigned char *begp_orig, *begp, *endp_orig, *endp, c;
4101 int eol_conversion;
4102 Lisp_Object translation_table;
4103
4104 if (coding->type == coding_type_ccl
4105 || coding->type == coding_type_undecided
4106 || coding->eol_type != CODING_EOL_LF
4107 || !NILP (coding->post_read_conversion)
4108 || coding->composing != COMPOSITION_DISABLED)
4109 {
4110 /* We can't skip any data. */
4111 return;
4112 }
4113 if (coding->type == coding_type_no_conversion
4114 || coding->type == coding_type_raw_text
4115 || coding->type == coding_type_emacs_mule)
4116 {
4117 /* We need no conversion, but don't have to skip any data here.
4118 Decoding routine handles them effectively anyway. */
4119 return;
4120 }
4121
4122 translation_table = coding->translation_table_for_decode;
4123 if (NILP (translation_table) && !NILP (Venable_character_translation))
4124 translation_table = Vstandard_translation_table_for_decode;
4125 if (CHAR_TABLE_P (translation_table))
4126 {
4127 int i;
4128 for (i = 0; i < 128; i++)
4129 if (!NILP (CHAR_TABLE_REF (translation_table, i)))
4130 break;
4131 if (i < 128)
4132 /* Some ASCII character should be translated. We give up
4133 shrinking. */
4134 return;
4135 }
4136
4137 if (coding->heading_ascii >= 0)
4138 /* Detection routine has already found how much we can skip at the
4139 head. */
4140 *beg += coding->heading_ascii;
4141
4142 if (str)
4143 {
4144 begp_orig = begp = str + *beg;
4145 endp_orig = endp = str + *end;
4146 }
4147 else
4148 {
4149 begp_orig = begp = BYTE_POS_ADDR (*beg);
4150 endp_orig = endp = begp + *end - *beg;
4151 }
4152
4153 eol_conversion = (coding->eol_type == CODING_EOL_CR
4154 || coding->eol_type == CODING_EOL_CRLF);
4155
4156 switch (coding->type)
4157 {
4158 case coding_type_sjis:
4159 case coding_type_big5:
4160 /* We can skip all ASCII characters at the head. */
4161 if (coding->heading_ascii < 0)
4162 {
4163 if (eol_conversion)
4164 while (begp < endp && *begp < 0x80 && *begp != '\r') begp++;
4165 else
4166 while (begp < endp && *begp < 0x80) begp++;
4167 }
4168 /* We can skip all ASCII characters at the tail except for the
4169 second byte of SJIS or BIG5 code. */
4170 if (eol_conversion)
4171 while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\r') endp--;
4172 else
4173 while (begp < endp && endp[-1] < 0x80) endp--;
4174 /* Do not consider LF as ascii if preceded by CR, since that
4175 confuses eol decoding. */
4176 if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
4177 endp++;
4178 if (begp < endp && endp < endp_orig && endp[-1] >= 0x80)
4179 endp++;
4180 break;
4181
4182 case coding_type_iso2022:
4183 if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
4184 /* We can't skip any data. */
4185 break;
4186 if (coding->heading_ascii < 0)
4187 {
4188 /* We can skip all ASCII characters at the head except for a
4189 few control codes. */
4190 while (begp < endp && (c = *begp) < 0x80
4191 && c != ISO_CODE_CR && c != ISO_CODE_SO
4192 && c != ISO_CODE_SI && c != ISO_CODE_ESC
4193 && (!eol_conversion || c != ISO_CODE_LF))
4194 begp++;
4195 }
4196 switch (coding->category_idx)
4197 {
4198 case CODING_CATEGORY_IDX_ISO_8_1:
4199 case CODING_CATEGORY_IDX_ISO_8_2:
4200 /* We can skip all ASCII characters at the tail. */
4201 if (eol_conversion)
4202 while (begp < endp && (c = endp[-1]) < 0x80 && c != '\r') endp--;
4203 else
4204 while (begp < endp && endp[-1] < 0x80) endp--;
4205 /* Do not consider LF as ascii if preceded by CR, since that
4206 confuses eol decoding. */
4207 if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
4208 endp++;
4209 break;
4210
4211 case CODING_CATEGORY_IDX_ISO_7:
4212 case CODING_CATEGORY_IDX_ISO_7_TIGHT:
4213 {
4214 /* We can skip all charactes at the tail except for 8-bit
4215 codes and ESC and the following 2-byte at the tail. */
4216 unsigned char *eight_bit = NULL;
4217
4218 if (eol_conversion)
4219 while (begp < endp
4220 && (c = endp[-1]) != ISO_CODE_ESC && c != '\r')
4221 {
4222 if (!eight_bit && c & 0x80) eight_bit = endp;
4223 endp--;
4224 }
4225 else
4226 while (begp < endp
4227 && (c = endp[-1]) != ISO_CODE_ESC)
4228 {
4229 if (!eight_bit && c & 0x80) eight_bit = endp;
4230 endp--;
4231 }
4232 /* Do not consider LF as ascii if preceded by CR, since that
4233 confuses eol decoding. */
4234 if (begp < endp && endp < endp_orig
4235 && endp[-1] == '\r' && endp[0] == '\n')
4236 endp++;
4237 if (begp < endp && endp[-1] == ISO_CODE_ESC)
4238 {
4239 if (endp + 1 < endp_orig && end[0] == '(' && end[1] == 'B')
4240 /* This is an ASCII designation sequence. We can
4241 surely skip the tail. But, if we have
4242 encountered an 8-bit code, skip only the codes
4243 after that. */
4244 endp = eight_bit ? eight_bit : endp + 2;
4245 else
4246 /* Hmmm, we can't skip the tail. */
4247 endp = endp_orig;
4248 }
4249 else if (eight_bit)
4250 endp = eight_bit;
4251 }
4252 }
4253 break;
4254
4255 default:
4256 abort ();
4257 }
4258 *beg += begp - begp_orig;
4259 *end += endp - endp_orig;
4260 return;
4261 }
4262
4263 /* Like shrink_decoding_region but for encoding. */
4264
4265 static void
4266 shrink_encoding_region (beg, end, coding, str)
4267 int *beg, *end;
4268 struct coding_system *coding;
4269 unsigned char *str;
4270 {
4271 unsigned char *begp_orig, *begp, *endp_orig, *endp;
4272 int eol_conversion;
4273 Lisp_Object translation_table;
4274
4275 if (coding->type == coding_type_ccl
4276 || coding->eol_type == CODING_EOL_CRLF
4277 || coding->eol_type == CODING_EOL_CR
4278 || coding->cmp_data && coding->cmp_data->used > 0)
4279 {
4280 /* We can't skip any data. */
4281 return;
4282 }
4283 if (coding->type == coding_type_no_conversion
4284 || coding->type == coding_type_raw_text
4285 || coding->type == coding_type_emacs_mule
4286 || coding->type == coding_type_undecided)
4287 {
4288 /* We need no conversion, but don't have to skip any data here.
4289 Encoding routine handles them effectively anyway. */
4290 return;
4291 }
4292
4293 translation_table = coding->translation_table_for_encode;
4294 if (NILP (translation_table) && !NILP (Venable_character_translation))
4295 translation_table = Vstandard_translation_table_for_encode;
4296 if (CHAR_TABLE_P (translation_table))
4297 {
4298 int i;
4299 for (i = 0; i < 128; i++)
4300 if (!NILP (CHAR_TABLE_REF (translation_table, i)))
4301 break;
4302 if (i < 128)
4303 /* Some ASCII character should be tranlsated. We give up
4304 shrinking. */
4305 return;
4306 }
4307
4308 if (str)
4309 {
4310 begp_orig = begp = str + *beg;
4311 endp_orig = endp = str + *end;
4312 }
4313 else
4314 {
4315 begp_orig = begp = BYTE_POS_ADDR (*beg);
4316 endp_orig = endp = begp + *end - *beg;
4317 }
4318
4319 eol_conversion = (coding->eol_type == CODING_EOL_CR
4320 || coding->eol_type == CODING_EOL_CRLF);
4321
4322 /* Here, we don't have to check coding->pre_write_conversion because
4323 the caller is expected to have handled it already. */
4324 switch (coding->type)
4325 {
4326 case coding_type_iso2022:
4327 if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
4328 /* We can't skip any data. */
4329 break;
4330 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL)
4331 {
4332 unsigned char *bol = begp;
4333 while (begp < endp && *begp < 0x80)
4334 {
4335 begp++;
4336 if (begp[-1] == '\n')
4337 bol = begp;
4338 }
4339 begp = bol;
4340 goto label_skip_tail;
4341 }
4342 /* fall down ... */
4343
4344 case coding_type_sjis:
4345 case coding_type_big5:
4346 /* We can skip all ASCII characters at the head and tail. */
4347 if (eol_conversion)
4348 while (begp < endp && *begp < 0x80 && *begp != '\n') begp++;
4349 else
4350 while (begp < endp && *begp < 0x80) begp++;
4351 label_skip_tail:
4352 if (eol_conversion)
4353 while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\n') endp--;
4354 else
4355 while (begp < endp && *(endp - 1) < 0x80) endp--;
4356 break;
4357
4358 default:
4359 abort ();
4360 }
4361
4362 *beg += begp - begp_orig;
4363 *end += endp - endp_orig;
4364 return;
4365 }
4366
/* Shrinking a conversion region has some overhead of its own, so
   regions whose byte length does not exceed this value are converted
   as they are.  */
static int shrink_conversion_region_threshhold = 1024;

/* Shrink the region *BEG .. *END if it is longer than the threshold
   above, with shrink_encoding_region if ENCODEP is nonzero and with
   shrink_decoding_region otherwise.  BEG and END are pointers to
   the positions.  */
#define SHRINK_CONVERSION_REGION(beg, end, coding, str, encodep)	\
  do {									\
    if (*(end) - *(beg) <= shrink_conversion_region_threshhold)	\
      break;								\
    if (encodep)							\
      shrink_encoding_region (beg, end, coding, str);			\
    else								\
      shrink_decoding_region (beg, end, coding, str);			\
  } while (0)
4380
4381 static Lisp_Object
4382 code_convert_region_unwind (dummy)
4383 Lisp_Object dummy;
4384 {
4385 inhibit_pre_post_conversion = 0;
4386 return Qnil;
4387 }
4388
4389 /* Store information about all compositions in the range FROM and TO
4390 of OBJ in memory blocks pointed by CODING->cmp_data. OBJ is a
4391 buffer or a string, defaults to the current buffer. */
4392
4393 void
4394 coding_save_composition (coding, from, to, obj)
4395 struct coding_system *coding;
4396 int from, to;
4397 Lisp_Object obj;
4398 {
4399 Lisp_Object prop;
4400 int start, end;
4401
4402 if (coding->composing == COMPOSITION_DISABLED)
4403 return;
4404 if (!coding->cmp_data)
4405 coding_allocate_composition_data (coding, from);
4406 if (!find_composition (from, to, &start, &end, &prop, obj)
4407 || end > to)
4408 return;
4409 if (start < from
4410 && (!find_composition (end, to, &start, &end, &prop, obj)
4411 || end > to))
4412 return;
4413 coding->composing = COMPOSITION_NO;
4414 do
4415 {
4416 if (COMPOSITION_VALID_P (start, end, prop))
4417 {
4418 enum composition_method method = COMPOSITION_METHOD (prop);
4419 if (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH
4420 >= COMPOSITION_DATA_SIZE)
4421 coding_allocate_composition_data (coding, from);
4422 /* For relative composition, we remember start and end
4423 positions, for the other compositions, we also remember
4424 components. */
4425 CODING_ADD_COMPOSITION_START (coding, start - from, method);
4426 if (method != COMPOSITION_RELATIVE)
4427 {
4428 /* We must store a*/
4429 Lisp_Object val, ch;
4430
4431 val = COMPOSITION_COMPONENTS (prop);
4432 if (CONSP (val))
4433 while (CONSP (val))
4434 {
4435 ch = XCAR (val), val = XCDR (val);
4436 CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (ch));
4437 }
4438 else if (VECTORP (val) || STRINGP (val))
4439 {
4440 int len = (VECTORP (val)
4441 ? XVECTOR (val)->size : XSTRING (val)->size);
4442 int i;
4443 for (i = 0; i < len; i++)
4444 {
4445 ch = (STRINGP (val)
4446 ? Faref (val, make_number (i))
4447 : XVECTOR (val)->contents[i]);
4448 CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (ch));
4449 }
4450 }
4451 else /* INTEGERP (val) */
4452 CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (val));
4453 }
4454 CODING_ADD_COMPOSITION_END (coding, end - from);
4455 }
4456 start = end;
4457 }
4458 while (start < to
4459 && find_composition (start, to, &start, &end, &prop, obj)
4460 && end <= to);
4461
4462 /* Make coding->cmp_data point to the first memory block. */
4463 while (coding->cmp_data->prev)
4464 coding->cmp_data = coding->cmp_data->prev;
4465 coding->cmp_data_start = 0;
4466 }
4467
4468 /* Reflect the saved information about compositions to OBJ.
4469 CODING->cmp_data points to a memory block for the informaiton. OBJ
4470 is a buffer or a string, defaults to the current buffer. */
4471
4472 void
4473 coding_restore_composition (coding, obj)
4474 struct coding_system *coding;
4475 Lisp_Object obj;
4476 {
4477 struct composition_data *cmp_data = coding->cmp_data;
4478
4479 if (!cmp_data)
4480 return;
4481
4482 while (cmp_data->prev)
4483 cmp_data = cmp_data->prev;
4484
4485 while (cmp_data)
4486 {
4487 int i;
4488
4489 for (i = 0; i < cmp_data->used; i += cmp_data->data[i])
4490 {
4491 int *data = cmp_data->data + i;
4492 enum composition_method method = (enum composition_method) data[3];
4493 Lisp_Object components;
4494
4495 if (method == COMPOSITION_RELATIVE)
4496 components = Qnil;
4497 else
4498 {
4499 int len = data[0] - 4, j;
4500 Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
4501
4502 for (j = 0; j < len; j++)
4503 args[j] = make_number (data[4 + j]);
4504 components = (method == COMPOSITION_WITH_ALTCHARS
4505 ? Fstring (len, args) : Fvector (len, args));
4506 }
4507 compose_text (data[1], data[2], components, Qnil, obj);
4508 }
4509 cmp_data = cmp_data->next;
4510 }
4511 }
4512
4513 /* Decode (if ENCODEP is zero) or encode (if ENCODEP is nonzero) the
4514 text from FROM to TO (byte positions are FROM_BYTE and TO_BYTE) by
4515 coding system CODING, and return the status code of code conversion
4516 (currently, this value has no meaning).
4517
4518 How many characters (and bytes) are converted to how many
4519 characters (and bytes) are recorded in members of the structure
4520 CODING.
4521
4522 If REPLACE is nonzero, we do various things as if the original text
4523 is deleted and a new text is inserted. See the comments in
4524 replace_range (insdel.c) to know what we are doing.
4525
4526 If REPLACE is zero, it is assumed that the source text is unibyte.
4527 Otherwize, it is assumed that the source text is multibyte. */
4528
4529 int
4530 code_convert_region (from, from_byte, to, to_byte, coding, encodep, replace)
4531 int from, from_byte, to, to_byte, encodep, replace;
4532 struct coding_system *coding;
4533 {
4534 int len = to - from, len_byte = to_byte - from_byte;
4535 int require, inserted, inserted_byte;
4536 int head_skip, tail_skip, total_skip = 0;
4537 Lisp_Object saved_coding_symbol;
4538 int first = 1;
4539 unsigned char *src, *dst;
4540 Lisp_Object deletion;
4541 int orig_point = PT, orig_len = len;
4542 int prev_Z;
4543 int multibyte_p = !NILP (current_buffer->enable_multibyte_characters);
4544
4545 coding->src_multibyte = replace && multibyte_p;
4546 coding->dst_multibyte = multibyte_p;
4547
4548 deletion = Qnil;
4549 saved_coding_symbol = Qnil;
4550
4551 if (from < PT && PT < to)
4552 {
4553 TEMP_SET_PT_BOTH (from, from_byte);
4554 orig_point = from;
4555 }
4556
4557 if (replace)
4558 {
4559 int saved_from = from;
4560
4561 prepare_to_modify_buffer (from, to, &from);
4562 if (saved_from != from)
4563 {
4564 to = from + len;
4565 from_byte = CHAR_TO_BYTE (from), to_byte = CHAR_TO_BYTE (to);
4566 len_byte = to_byte - from_byte;
4567 }
4568 }
4569
4570 if (! encodep && CODING_REQUIRE_DETECTION (coding))
4571 {
4572 /* We must detect encoding of text and eol format. */
4573
4574 if (from < GPT && to > GPT)
4575 move_gap_both (from, from_byte);
4576 if (coding->type == coding_type_undecided)
4577 {
4578 detect_coding (coding, BYTE_POS_ADDR (from_byte), len_byte);
4579 if (coding->type == coding_type_undecided)
4580 /* It seems that the text contains only ASCII, but we
4581 should not left it undecided because the deeper
4582 decoding routine (decode_coding) tries to detect the
4583 encodings again in vain. */
4584 coding->type = coding_type_emacs_mule;
4585 }
4586 if (coding->eol_type == CODING_EOL_UNDECIDED)
4587 {
4588 saved_coding_symbol = coding->symbol;
4589 detect_eol (coding, BYTE_POS_ADDR (from_byte), len_byte);
4590 if (coding->eol_type == CODING_EOL_UNDECIDED)
4591 coding->eol_type = CODING_EOL_LF;
4592 /* We had better recover the original eol format if we
4593 encounter an inconsitent eol format while decoding. */
4594 coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4595 }
4596 }
4597
4598 /* Now we convert the text. */
4599
4600 /* For encoding, we must process pre-write-conversion in advance. */
4601 if (! inhibit_pre_post_conversion
4602 && encodep
4603 && SYMBOLP (coding->pre_write_conversion)
4604 && ! NILP (Ffboundp (coding->pre_write_conversion)))
4605 {
4606 /* The function in pre-write-conversion may put a new text in a
4607 new buffer. */
4608 struct buffer *prev = current_buffer;
4609 Lisp_Object new;
4610 int count = specpdl_ptr - specpdl;
4611
4612 record_unwind_protect (code_convert_region_unwind, Qnil);
4613 /* We should not call any more pre-write/post-read-conversion
4614 functions while this pre-write-conversion is running. */
4615 inhibit_pre_post_conversion = 1;
4616 call2 (coding->pre_write_conversion,
4617 make_number (from), make_number (to));
4618 inhibit_pre_post_conversion = 0;
4619 /* Discard the unwind protect. */
4620 specpdl_ptr--;
4621
4622 if (current_buffer != prev)
4623 {
4624 len = ZV - BEGV;
4625 new = Fcurrent_buffer ();
4626 set_buffer_internal_1 (prev);
4627 del_range_2 (from, from_byte, to, to_byte, 0);
4628 TEMP_SET_PT_BOTH (from, from_byte);
4629 insert_from_buffer (XBUFFER (new), 1, len, 0);
4630 Fkill_buffer (new);
4631 if (orig_point >= to)
4632 orig_point += len - orig_len;
4633 else if (orig_point > from)
4634 orig_point = from;
4635 orig_len = len;
4636 to = from + len;
4637 from_byte = CHAR_TO_BYTE (from);
4638 to_byte = CHAR_TO_BYTE (to);
4639 len_byte = to_byte - from_byte;
4640 TEMP_SET_PT_BOTH (from, from_byte);
4641 }
4642 }
4643
4644 if (replace)
4645 deletion = make_buffer_string_both (from, from_byte, to, to_byte, 1);
4646
4647 if (coding->composing != COMPOSITION_DISABLED)
4648 {
4649 if (encodep)
4650 coding_save_composition (coding, from, to, Fcurrent_buffer ());
4651 else
4652 coding_allocate_composition_data (coding, from);
4653 }
4654
4655 /* Try to skip the heading and tailing ASCIIs. */
4656 {
4657 int from_byte_orig = from_byte, to_byte_orig = to_byte;
4658
4659 if (from < GPT && GPT < to)
4660 move_gap_both (from, from_byte);
4661 SHRINK_CONVERSION_REGION (&from_byte, &to_byte, coding, NULL, encodep);
4662 if (from_byte == to_byte
4663 && (encodep || NILP (coding->post_read_conversion))
4664 && ! CODING_REQUIRE_FLUSHING (coding))
4665 {
4666 coding->produced = len_byte;
4667 coding->produced_char = len;
4668 if (!replace)
4669 /* We must record and adjust for this new text now. */
4670 adjust_after_insert (from, from_byte_orig, to, to_byte_orig, len);
4671 return 0;
4672 }
4673
4674 head_skip = from_byte - from_byte_orig;
4675 tail_skip = to_byte_orig - to_byte;
4676 total_skip = head_skip + tail_skip;
4677 from += head_skip;
4678 to -= tail_skip;
4679 len -= total_skip; len_byte -= total_skip;
4680 }
4681
4682 /* The code conversion routine can not preserve text properties for
4683 now. So, we must remove all text properties in the region.
4684 Here, we must suppress all modification hooks. */
4685 if (replace)
4686 {
4687 int saved_inhibit_modification_hooks = inhibit_modification_hooks;
4688 inhibit_modification_hooks = 1;
4689 Fset_text_properties (make_number (from), make_number (to), Qnil, Qnil);
4690 inhibit_modification_hooks = saved_inhibit_modification_hooks;
4691 }
4692
4693 /* For converion, we must put the gap before the text in addition to
4694 making the gap larger for efficient decoding. The required gap
4695 size starts from 2000 which is the magic number used in make_gap.
4696 But, after one batch of conversion, it will be incremented if we
4697 find that it is not enough . */
4698 require = 2000;
4699
4700 if (GAP_SIZE < require)
4701 make_gap (require - GAP_SIZE);
4702 move_gap_both (from, from_byte);
4703
4704 inserted = inserted_byte = 0;
4705
4706 GAP_SIZE += len_byte;
4707 ZV -= len;
4708 Z -= len;
4709 ZV_BYTE -= len_byte;
4710 Z_BYTE -= len_byte;
4711
4712 if (GPT - BEG < BEG_UNCHANGED)
4713 BEG_UNCHANGED = GPT - BEG;
4714 if (Z - GPT < END_UNCHANGED)
4715 END_UNCHANGED = Z - GPT;
4716
4717 if (!encodep && coding->src_multibyte)
4718 {
4719 /* Decoding routines expects that the source text is unibyte.
4720 We must convert 8-bit characters of multibyte form to
4721 unibyte. */
4722 int len_byte_orig = len_byte;
4723 len_byte = str_as_unibyte (GAP_END_ADDR - len_byte, len_byte);
4724 if (len_byte < len_byte_orig)
4725 safe_bcopy (GAP_END_ADDR - len_byte_orig, GAP_END_ADDR - len_byte,
4726 len_byte);
4727 coding->src_multibyte = 0;
4728 }
4729
4730 for (;;)
4731 {
4732 int result;
4733
4734 /* The buffer memory is now:
4735 +--------+converted-text+---------+-------original-text-------+---+
4736 |<-from->|<--inserted-->|---------|<--------len_byte--------->|---|
4737 |<---------------------- GAP ----------------------->| */
4738 src = GAP_END_ADDR - len_byte;
4739 dst = GPT_ADDR + inserted_byte;
4740
4741 if (encodep)
4742 result = encode_coding (coding, src, dst, len_byte, 0);
4743 else
4744 result = decode_coding (coding, src, dst, len_byte, 0);
4745
4746 /* The buffer memory is now:
4747 +--------+-------converted-text----+--+------original-text----+---+
4748 |<-from->|<-inserted->|<-produced->|--|<-(len_byte-consumed)->|---|
4749 |<---------------------- GAP ----------------------->| */
4750
4751 inserted += coding->produced_char;
4752 inserted_byte += coding->produced;
4753 len_byte -= coding->consumed;
4754
4755 if (result == CODING_FINISH_INSUFFICIENT_CMP)
4756 {
4757 coding_allocate_composition_data (coding, from + inserted);
4758 continue;
4759 }
4760
4761 src += coding->consumed;
4762 dst += coding->produced;
4763
4764 if (result == CODING_FINISH_NORMAL)
4765 {
4766 src += len_byte;
4767 break;
4768 }
4769 if (! encodep && result == CODING_FINISH_INCONSISTENT_EOL)
4770 {
4771 unsigned char *pend = dst, *p = pend - inserted_byte;
4772 Lisp_Object eol_type;
4773
4774 /* Encode LFs back to the original eol format (CR or CRLF). */
4775 if (coding->eol_type == CODING_EOL_CR)
4776 {
4777 while (p < pend) if (*p++ == '\n') p[-1] = '\r';
4778 }
4779 else
4780 {
4781 int count = 0;
4782
4783 while (p < pend) if (*p++ == '\n') count++;
4784 if (src - dst < count)
4785 {
4786 /* We don't have sufficient room for encoding LFs
4787 back to CRLF. We must record converted and
4788 not-yet-converted text back to the buffer
4789 content, enlarge the gap, then record them out of
4790 the buffer contents again. */
4791 int add = len_byte + inserted_byte;
4792
4793 GAP_SIZE -= add;
4794 ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
4795 GPT += inserted_byte; GPT_BYTE += inserted_byte;
4796 make_gap (count - GAP_SIZE);
4797 GAP_SIZE += add;
4798 ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
4799 GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
4800 /* Don't forget to update SRC, DST, and PEND. */
4801 src = GAP_END_ADDR - len_byte;
4802 dst = GPT_ADDR + inserted_byte;
4803 pend = dst;
4804 }
4805 inserted += count;
4806 inserted_byte += count;
4807 coding->produced += count;
4808 p = dst = pend + count;
4809 while (count)
4810 {
4811 *--p = *--pend;
4812 if (*p == '\n') count--, *--p = '\r';
4813 }
4814 }
4815
4816 /* Suppress eol-format conversion in the further conversion. */
4817 coding->eol_type = CODING_EOL_LF;
4818
4819 /* Set the coding system symbol to that for Unix-like EOL. */
4820 eol_type = Fget (saved_coding_symbol, Qeol_type);
4821 if (VECTORP (eol_type)
4822 && XVECTOR (eol_type)->size == 3
4823 && SYMBOLP (XVECTOR (eol_type)->contents[CODING_EOL_LF]))
4824 coding->symbol = XVECTOR (eol_type)->contents[CODING_EOL_LF];
4825 else
4826 coding->symbol = saved_coding_symbol;
4827
4828 continue;
4829 }
4830 if (len_byte <= 0)
4831 {
4832 if (coding->type != coding_type_ccl
4833 || coding->mode & CODING_MODE_LAST_BLOCK)
4834 break;
4835 coding->mode |= CODING_MODE_LAST_BLOCK;
4836 continue;
4837 }
4838 if (result == CODING_FINISH_INSUFFICIENT_SRC)
4839 {
4840 /* The source text ends in invalid codes. Let's just
4841 make them valid buffer contents, and finish conversion. */
4842 inserted += len_byte;
4843 inserted_byte += len_byte;
4844 while (len_byte--)
4845 *dst++ = *src++;
4846 break;
4847 }
4848 if (result == CODING_FINISH_INTERRUPT)
4849 {
4850 /* The conversion procedure was interrupted by a user. */
4851 break;
4852 }
4853 /* Now RESULT == CODING_FINISH_INSUFFICIENT_DST */
4854 if (coding->consumed < 1)
4855 {
4856 /* It's quite strange to require more memory without
4857 consuming any bytes. Perhaps CCL program bug. */
4858 break;
4859 }
4860 if (first)
4861 {
4862 /* We have just done the first batch of conversion which was
4863 stoped because of insufficient gap. Let's reconsider the
4864 required gap size (i.e. SRT - DST) now.
4865
4866 We have converted ORIG bytes (== coding->consumed) into
4867 NEW bytes (coding->produced). To convert the remaining
4868 LEN bytes, we may need REQUIRE bytes of gap, where:
4869 REQUIRE + LEN_BYTE = LEN_BYTE * (NEW / ORIG)
4870 REQUIRE = LEN_BYTE * (NEW - ORIG) / ORIG
4871 Here, we are sure that NEW >= ORIG. */
4872 float ratio = coding->produced - coding->consumed;
4873 ratio /= coding->consumed;
4874 require = len_byte * ratio;
4875 first = 0;
4876 }
4877 if ((src - dst) < (require + 2000))
4878 {
4879 /* See the comment above the previous call of make_gap. */
4880 int add = len_byte + inserted_byte;
4881
4882 GAP_SIZE -= add;
4883 ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
4884 GPT += inserted_byte; GPT_BYTE += inserted_byte;
4885 make_gap (require + 2000);
4886 GAP_SIZE += add;
4887 ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
4888 GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
4889 }
4890 }
4891 if (src - dst > 0) *dst = 0; /* Put an anchor. */
4892
4893 if (encodep && coding->dst_multibyte)
4894 {
4895 /* The output is unibyte. We must convert 8-bit characters to
4896 multibyte form. */
4897 if (inserted_byte * 2 > GAP_SIZE)
4898 {
4899 GAP_SIZE -= inserted_byte;
4900 ZV += inserted_byte; Z += inserted_byte;
4901 ZV_BYTE += inserted_byte; Z_BYTE += inserted_byte;
4902 GPT += inserted_byte; GPT_BYTE += inserted_byte;
4903 make_gap (inserted_byte - GAP_SIZE);
4904 GAP_SIZE += inserted_byte;
4905 ZV -= inserted_byte; Z -= inserted_byte;
4906 ZV_BYTE -= inserted_byte; Z_BYTE -= inserted_byte;
4907 GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
4908 }
4909 inserted_byte = str_to_multibyte (GPT_ADDR, GAP_SIZE, inserted_byte);
4910 }
4911
4912 /* If we have shrinked the conversion area, adjust it now. */
4913 if (total_skip > 0)
4914 {
4915 if (tail_skip > 0)
4916 safe_bcopy (GAP_END_ADDR, GPT_ADDR + inserted_byte, tail_skip);
4917 inserted += total_skip; inserted_byte += total_skip;
4918 GAP_SIZE += total_skip;
4919 GPT -= head_skip; GPT_BYTE -= head_skip;
4920 ZV -= total_skip; ZV_BYTE -= total_skip;
4921 Z -= total_skip; Z_BYTE -= total_skip;
4922 from -= head_skip; from_byte -= head_skip;
4923 to += tail_skip; to_byte += tail_skip;
4924 }
4925
4926 prev_Z = Z;
4927 adjust_after_replace (from, from_byte, deletion, inserted, inserted_byte);
4928 inserted = Z - prev_Z;
4929
4930 if (!encodep && coding->cmp_data && coding->cmp_data->used)
4931 coding_restore_composition (coding, Fcurrent_buffer ());
4932 coding_free_composition_data (coding);
4933
4934 if (! inhibit_pre_post_conversion
4935 && ! encodep && ! NILP (coding->post_read_conversion))
4936 {
4937 Lisp_Object val;
4938 int count = specpdl_ptr - specpdl;
4939
4940 if (from != PT)
4941 TEMP_SET_PT_BOTH (from, from_byte);
4942 prev_Z = Z;
4943 record_unwind_protect (code_convert_region_unwind, Qnil);
4944 /* We should not call any more pre-write/post-read-conversion
4945 functions while this post-read-conversion is running. */
4946 inhibit_pre_post_conversion = 1;
4947 val = call1 (coding->post_read_conversion, make_number (inserted));
4948 inhibit_pre_post_conversion = 0;
4949 /* Discard the unwind protect. */
4950 specpdl_ptr--;
4951 CHECK_NUMBER (val, 0);
4952 inserted += Z - prev_Z;
4953 }
4954
4955 if (orig_point >= from)
4956 {
4957 if (orig_point >= from + orig_len)
4958 orig_point += inserted - orig_len;
4959 else
4960 orig_point = from;
4961 TEMP_SET_PT (orig_point);
4962 }
4963
4964 if (replace)
4965 {
4966 signal_after_change (from, to - from, inserted);
4967 update_compositions (from, from + inserted, CHECK_BORDER);
4968 }
4969
4970 {
4971 coding->consumed = to_byte - from_byte;
4972 coding->consumed_char = to - from;
4973 coding->produced = inserted_byte;
4974 coding->produced_char = inserted;
4975 }
4976
4977 return 0;
4978 }
4979
/* Run a coding system's Lisp conversion hook over the text of STR and
   return the text that results, as a new string.

   STR is copied into the work buffer " *code-converting-work*".  If
   ENCODEP is nonzero, CODING->pre_write_conversion is called with the
   two arguments BEG and Z; otherwise point is moved to BEG and
   CODING->post_read_conversion is called with the single argument
   Z - BEG.  Whatever the work buffer holds between BEG and Z after
   the call is returned.

   The caller is expected to have checked that the relevant hook is
   callable; no such check is made here.  */
4980 Lisp_Object
4981 run_pre_post_conversion_on_str (str, coding, encodep)
4982 Lisp_Object str;
4983 struct coding_system *coding;
4984 int encodep;
4985 {
/* Depth of the specpdl stack on entry; unbind_to below unwinds back to
   this depth.  */
4986 int count = specpdl_ptr - specpdl;
4987 struct gcpro gcpro1;
/* NOTE(review): PREV is initialized but never read in this function.  */
4988 struct buffer *prev = current_buffer;
4989 int multibyte = STRING_MULTIBYTE (str);
4990
/* Arrange for the current buffer to be reselected on exit.  The second
   entry presumably resets inhibit_pre_post_conversion when the hook
   exits non-locally -- confirm against code_convert_region_unwind.  */
4991 record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
4992 record_unwind_protect (code_convert_region_unwind, Qnil);
4993 GCPRO1 (str);
4994 temp_output_buffer_setup (" *code-converting-work*");
4995 set_buffer_internal (XBUFFER (Vstandard_output));
4996 /* We must insert the contents of STR as is without
4997 unibyte<->multibyte conversion. For that, we adjust the
4998 multibyteness of the working buffer to that of STR. */
4999 Ferase_buffer ();
5000 current_buffer->enable_multibyte_characters = multibyte ? Qt : Qnil;
5001 insert_from_string (str, 0, 0,
5002 XSTRING (str)->size, STRING_BYTES (XSTRING (str)), 0);
5003 UNGCPRO;
/* Set the flag that other conversion code tests before running
   pre-write/post-read hooks, so the hook called here cannot trigger
   nested hook calls.  */
5004 inhibit_pre_post_conversion = 1;
5005 if (encodep)
5006 call2 (coding->pre_write_conversion, make_number (BEG), make_number (Z));
5007 else
5008 {
5009 TEMP_SET_PT_BOTH (BEG, BEG_BYTE);
5010 call1 (coding->post_read_conversion, make_number (Z - BEG));
5011 }
5012 inhibit_pre_post_conversion = 0;
/* Collect the whole work buffer as the result before unwinding.  */
5013 str = make_buffer_string (BEG, Z, 0);
5014 return unbind_to (count, str);
5015 }
5016
5017 Lisp_Object
5018 decode_coding_string (str, coding, nocopy)
5019 Lisp_Object str;
5020 struct coding_system *coding;
5021 int nocopy;
5022 {
5023 int len;
5024 char *buf;
5025 int from, to, to_byte;
5026 struct gcpro gcpro1;
5027 Lisp_Object saved_coding_symbol;
5028 int result;
5029
5030 from = 0;
5031 to = XSTRING (str)->size;
5032 to_byte = STRING_BYTES (XSTRING (str));
5033
5034 saved_coding_symbol = Qnil;
5035 if (CODING_REQUIRE_DETECTION (coding))
5036 {
5037 /* See the comments in code_convert_region. */
5038 if (coding->type == coding_type_undecided)
5039 {
5040 detect_coding (coding, XSTRING (str)->data, to_byte);
5041 if (coding->type == coding_type_undecided)
5042 coding->type = coding_type_emacs_mule;
5043 }
5044 if (coding->eol_type == CODING_EOL_UNDECIDED)
5045 {
5046 saved_coding_symbol = coding->symbol;
5047 detect_eol (coding, XSTRING (str)->data, to_byte);
5048 if (coding->eol_type == CODING_EOL_UNDECIDED)
5049 coding->eol_type = CODING_EOL_LF;
5050 /* We had better recover the original eol format if we
5051 encounter an inconsitent eol format while decoding. */
5052 coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
5053 }
5054 }
5055
5056 if (! CODING_REQUIRE_DECODING (coding))
5057 {
5058 if (!STRING_MULTIBYTE (str))
5059 {
5060 str = Fstring_as_multibyte (str);
5061 nocopy = 1;
5062 }
5063 return (nocopy ? str : Fcopy_sequence (str));
5064 }
5065
5066 if (STRING_MULTIBYTE (str))
5067 {
5068 /* Decoding routines expect the source text to be unibyte. */
5069 str = Fstring_as_unibyte (str);
5070 nocopy = 1;
5071 coding->src_multibyte = 0;
5072 }
5073 coding->dst_multibyte = 1;
5074
5075 if (coding->composing != COMPOSITION_DISABLED)
5076 coding_allocate_composition_data (coding, from);
5077
5078 /* Try to skip the heading and tailing ASCIIs. */
5079 {
5080 int from_orig = from;
5081
5082 SHRINK_CONVERSION_REGION (&from, &to_byte, coding, XSTRING (str)->data,
5083 0);
5084 if (from == to_byte)
5085 return (nocopy ? str : Fcopy_sequence (str));
5086 }
5087
5088 len = decoding_buffer_size (coding, to_byte - from);
5089 len += from + STRING_BYTES (XSTRING (str)) - to_byte;
5090 GCPRO1 (str);
5091 buf = get_conversion_buffer (len);
5092 UNGCPRO;
5093
5094 if (from > 0)
5095 bcopy (XSTRING (str)->data, buf, from);
5096 result = decode_coding (coding, XSTRING (str)->data + from,
5097 buf + from, to_byte - from, len);
5098 if (result == CODING_FINISH_INCONSISTENT_EOL)
5099 {
5100 /* We simply try to decode the whole string again but without
5101 eol-conversion this time. */
5102 coding->eol_type = CODING_EOL_LF;
5103 coding->symbol = saved_coding_symbol;
5104 coding_free_composition_data (coding);
5105 return decode_coding_string (str, coding, nocopy);
5106 }
5107
5108 bcopy (XSTRING (str)->data + to_byte, buf + from + coding->produced,
5109 STRING_BYTES (XSTRING (str)) - to_byte);
5110
5111 len = from + STRING_BYTES (XSTRING (str)) - to_byte;
5112 str = make_multibyte_string (buf, len + coding->produced_char,
5113 len + coding->produced);
5114
5115 if (coding->cmp_data && coding->cmp_data->used)
5116 coding_restore_composition (coding, str);
5117 coding_free_composition_data (coding);
5118
5119 if (SYMBOLP (coding->post_read_conversion)
5120 && !NILP (Ffboundp (coding->post_read_conversion)))
5121 str = run_pre_post_conversion_on_str (str, coding, 0);
5122
5123 return str;
5124 }
5125
5126 Lisp_Object
5127 encode_coding_string (str, coding, nocopy)
5128 Lisp_Object str;
5129 struct coding_system *coding;
5130 int nocopy;
5131 {
5132 int len;
5133 char *buf;
5134 int from, to, to_byte;
5135 struct gcpro gcpro1;
5136 Lisp_Object saved_coding_symbol;
5137 int result;
5138
5139 if (SYMBOLP (coding->pre_write_conversion)
5140 && !NILP (Ffboundp (coding->pre_write_conversion)))
5141 str = run_pre_post_conversion_on_str (str, coding, 1);
5142
5143 from = 0;
5144 to = XSTRING (str)->size;
5145 to_byte = STRING_BYTES (XSTRING (str));
5146
5147 saved_coding_symbol = Qnil;
5148 if (! CODING_REQUIRE_ENCODING (coding))
5149 {
5150 if (STRING_MULTIBYTE (str))
5151 {
5152 str = Fstring_as_unibyte (str);
5153 nocopy = 1;
5154 }
5155 return (nocopy ? str : Fcopy_sequence (str));
5156 }
5157
5158 /* Encoding routines determine the multibyteness of the source text
5159 by coding->src_multibyte. */
5160 coding->src_multibyte = STRING_MULTIBYTE (str);
5161 coding->dst_multibyte = 0;
5162
5163 if (coding->composing != COMPOSITION_DISABLED)
5164 coding_save_composition (coding, from, to, str);
5165
5166 /* Try to skip the heading and tailing ASCIIs. */
5167 {
5168 int from_orig = from;
5169
5170 SHRINK_CONVERSION_REGION (&from, &to_byte, coding, XSTRING (str)->data,
5171 1);
5172 if (from == to_byte)
5173 return (nocopy ? str : Fcopy_sequence (str));
5174 }
5175
5176 len = encoding_buffer_size (coding, to_byte - from);
5177 len += from + STRING_BYTES (XSTRING (str)) - to_byte;
5178 GCPRO1 (str);
5179 buf = get_conversion_buffer (len);
5180 UNGCPRO;
5181
5182 if (from > 0)
5183 bcopy (XSTRING (str)->data, buf, from);
5184 result = encode_coding (coding, XSTRING (str)->data + from,
5185 buf + from, to_byte - from, len);
5186 bcopy (XSTRING (str)->data + to_byte, buf + from + coding->produced,
5187 STRING_BYTES (XSTRING (str)) - to_byte);
5188
5189 len = from + STRING_BYTES (XSTRING (str)) - to_byte;
5190 str = make_unibyte_string (buf, len + coding->produced);
5191 coding_free_composition_data (coding);
5192
5193 return str;
5194 }
5195
5196 \f
5197 #ifdef emacs
5198 /*** 8. Emacs Lisp library functions ***/
5199
5200 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
5201 "Return t if OBJECT is nil or a coding-system.\n\
5202 See the documentation of `make-coding-system' for information\n\
5203 about coding-system objects.")
5204 (obj)
5205 Lisp_Object obj;
5206 {
5207 if (NILP (obj))
5208 return Qt;
5209 if (!SYMBOLP (obj))
5210 return Qnil;
5211 /* Get coding-spec vector for OBJ. */
5212 obj = Fget (obj, Qcoding_system);
5213 return ((VECTORP (obj) && XVECTOR (obj)->size == 5)
5214 ? Qt : Qnil);
5215 }
5216
5217 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
5218 Sread_non_nil_coding_system, 1, 1, 0,
5219 "Read a coding system from the minibuffer, prompting with string PROMPT.")
5220 (prompt)
5221 Lisp_Object prompt;
5222 {
5223 Lisp_Object val;
5224 do
5225 {
5226 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
5227 Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
5228 }
5229 while (XSTRING (val)->size == 0);
5230 return (Fintern (val, Qnil));
5231 }
5232
5233 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
5234 "Read a coding system from the minibuffer, prompting with string PROMPT.\n\
5235 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.")
5236 (prompt, default_coding_system)
5237 Lisp_Object prompt, default_coding_system;
5238 {
5239 Lisp_Object val;
5240 if (SYMBOLP (default_coding_system))
5241 XSETSTRING (default_coding_system, XSYMBOL (default_coding_system)->name);
5242 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
5243 Qt, Qnil, Qcoding_system_history,
5244 default_coding_system, Qnil);
5245 return (XSTRING (val)->size == 0 ? Qnil : Fintern (val, Qnil));
5246 }
5247
5248 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
5249 1, 1, 0,
5250 "Check validity of CODING-SYSTEM.\n\
5251 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.\n\
5252 It is valid if it is a symbol with a non-nil `coding-system' property.\n\
5253 The value of property should be a vector of length 5.")
5254 (coding_system)
5255 Lisp_Object coding_system;
5256 {
5257 CHECK_SYMBOL (coding_system, 0);
5258 if (!NILP (Fcoding_system_p (coding_system)))
5259 return coding_system;
5260 while (1)
5261 Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
5262 }
5263 \f
5264 Lisp_Object
5265 detect_coding_system (src, src_bytes, highest)
5266 unsigned char *src;
5267 int src_bytes, highest;
5268 {
5269 int coding_mask, eol_type;
5270 Lisp_Object val, tmp;
5271 int dummy;
5272
5273 coding_mask = detect_coding_mask (src, src_bytes, NULL, &dummy);
5274 eol_type = detect_eol_type (src, src_bytes, &dummy);
5275 if (eol_type == CODING_EOL_INCONSISTENT)
5276 eol_type = CODING_EOL_UNDECIDED;
5277
5278 if (!coding_mask)
5279 {
5280 val = Qundecided;
5281 if (eol_type != CODING_EOL_UNDECIDED)
5282 {
5283 Lisp_Object val2;
5284 val2 = Fget (Qundecided, Qeol_type);
5285 if (VECTORP (val2))
5286 val = XVECTOR (val2)->contents[eol_type];
5287 }
5288 return (highest ? val : Fcons (val, Qnil));
5289 }
5290
5291 /* At first, gather possible coding systems in VAL. */
5292 val = Qnil;
5293 for (tmp = Vcoding_category_list; CONSP (tmp); tmp = XCDR (tmp))
5294 {
5295 Lisp_Object category_val, category_index;
5296
5297 category_index = Fget (XCAR (tmp), Qcoding_category_index);
5298 category_val = Fsymbol_value (XCAR (tmp));
5299 if (!NILP (category_val)
5300 && NATNUMP (category_index)
5301 && (coding_mask & (1 << XFASTINT (category_index))))
5302 {
5303 val = Fcons (category_val, val);
5304 if (highest)
5305 break;
5306 }
5307 }
5308 if (!highest)
5309 val = Fnreverse (val);
5310
5311 /* Then, replace the elements with subsidiary coding systems. */
5312 for (tmp = val; CONSP (tmp); tmp = XCDR (tmp))
5313 {
5314 if (eol_type != CODING_EOL_UNDECIDED
5315 && eol_type != CODING_EOL_INCONSISTENT)
5316 {
5317 Lisp_Object eol;
5318 eol = Fget (XCAR (tmp), Qeol_type);
5319 if (VECTORP (eol))
5320 XCAR (tmp) = XVECTOR (eol)->contents[eol_type];
5321 }
5322 }
5323 return (highest ? XCAR (val) : val);
5324 }
5325
5326 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
5327 2, 3, 0,
5328 "Detect coding system of the text in the region between START and END.\n\
5329 Return a list of possible coding systems ordered by priority.\n\
5330 \n\
5331 If only ASCII characters are found, it returns a list of single element\n\
5332 `undecided' or its subsidiary coding system according to a detected\n\
5333 end-of-line format.\n\
5334 \n\
5335 If optional argument HIGHEST is non-nil, return the coding system of\n\
5336 highest priority.")
5337 (start, end, highest)
5338 Lisp_Object start, end, highest;
5339 {
5340 int from, to;
5341 int from_byte, to_byte;
5342
5343 CHECK_NUMBER_COERCE_MARKER (start, 0);
5344 CHECK_NUMBER_COERCE_MARKER (end, 1);
5345
5346 validate_region (&start, &end);
5347 from = XINT (start), to = XINT (end);
5348 from_byte = CHAR_TO_BYTE (from);
5349 to_byte = CHAR_TO_BYTE (to);
5350
5351 if (from < GPT && to >= GPT)
5352 move_gap_both (to, to_byte);
5353
5354 return detect_coding_system (BYTE_POS_ADDR (from_byte),
5355 to_byte - from_byte,
5356 !NILP (highest));
5357 }
5358
5359 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
5360 1, 2, 0,
5361 "Detect coding system of the text in STRING.\n\
5362 Return a list of possible coding systems ordered by priority.\n\
5363 \n\
5364 If only ASCII characters are found, it returns a list of single element\n\
5365 `undecided' or its subsidiary coding system according to a detected\n\
5366 end-of-line format.\n\
5367 \n\
5368 If optional argument HIGHEST is non-nil, return the coding system of\n\
5369 highest priority.")
5370 (string, highest)
5371 Lisp_Object string, highest;
5372 {
5373 CHECK_STRING (string, 0);
5374
5375 return detect_coding_system (XSTRING (string)->data,
5376 STRING_BYTES (XSTRING (string)),
5377 !NILP (highest));
5378 }
5379
5380 Lisp_Object
5381 code_convert_region1 (start, end, coding_system, encodep)
5382 Lisp_Object start, end, coding_system;
5383 int encodep;
5384 {
5385 struct coding_system coding;
5386 int from, to, len;
5387
5388 CHECK_NUMBER_COERCE_MARKER (start, 0);
5389 CHECK_NUMBER_COERCE_MARKER (end, 1);
5390 CHECK_SYMBOL (coding_system, 2);
5391
5392 validate_region (&start, &end);
5393 from = XFASTINT (start);
5394 to = XFASTINT (end);
5395
5396 if (NILP (coding_system))
5397 return make_number (to - from);
5398
5399 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
5400 error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data);
5401
5402 coding.mode |= CODING_MODE_LAST_BLOCK;
5403 coding.src_multibyte = coding.dst_multibyte
5404 = !NILP (current_buffer->enable_multibyte_characters);
5405 code_convert_region (from, CHAR_TO_BYTE (from), to, CHAR_TO_BYTE (to),
5406 &coding, encodep, 1);
5407 Vlast_coding_system_used = coding.symbol;
5408 return make_number (coding.produced_char);
5409 }
5410
5411 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
5412 3, 3, "r\nzCoding system: ",
5413 "Decode the current region by specified coding system.\n\
5414 When called from a program, takes three arguments:\n\
5415 START, END, and CODING-SYSTEM. START and END are buffer positions.\n\
5416 This function sets `last-coding-system-used' to the precise coding system\n\
5417 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
5418 not fully specified.)\n\
5419 It returns the length of the decoded text.")
5420 (start, end, coding_system)
5421 Lisp_Object start, end, coding_system;
5422 {
5423 return code_convert_region1 (start, end, coding_system, 0);
5424 }
5425
5426 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
5427 3, 3, "r\nzCoding system: ",
5428 "Encode the current region by specified coding system.\n\
5429 When called from a program, takes three arguments:\n\
5430 START, END, and CODING-SYSTEM. START and END are buffer positions.\n\
5431 This function sets `last-coding-system-used' to the precise coding system\n\
5432 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
5433 not fully specified.)\n\
5434 It returns the length of the encoded text.")
5435 (start, end, coding_system)
5436 Lisp_Object start, end, coding_system;
5437 {
5438 return code_convert_region1 (start, end, coding_system, 1);
5439 }
5440
5441 Lisp_Object
5442 code_convert_string1 (string, coding_system, nocopy, encodep)
5443 Lisp_Object string, coding_system, nocopy;
5444 int encodep;
5445 {
5446 struct coding_system coding;
5447
5448 CHECK_STRING (string, 0);
5449 CHECK_SYMBOL (coding_system, 1);
5450
5451 if (NILP (coding_system))
5452 return (NILP (nocopy) ? Fcopy_sequence (string) : string);
5453
5454 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
5455 error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data);
5456
5457 coding.mode |= CODING_MODE_LAST_BLOCK;
5458 string = (encodep
5459 ? encode_coding_string (string, &coding, !NILP (nocopy))
5460 : decode_coding_string (string, &coding, !NILP (nocopy)));
5461 Vlast_coding_system_used = coding.symbol;
5462
5463 return string;
5464 }
5465
5466 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
5467 2, 3, 0,
5468 "Decode STRING which is encoded in CODING-SYSTEM, and return the result.\n\
5469 Optional arg NOCOPY non-nil means it is ok to return STRING itself\n\
5470 if the decoding operation is trivial.\n\
5471 This function sets `last-coding-system-used' to the precise coding system\n\
5472 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
5473 not fully specified.)")
5474 (string, coding_system, nocopy)
5475 Lisp_Object string, coding_system, nocopy;
5476 {
5477 return code_convert_string1 (string, coding_system, nocopy, 0);
5478 }
5479
5480 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
5481 2, 3, 0,
5482 "Encode STRING to CODING-SYSTEM, and return the result.\n\
5483 Optional arg NOCOPY non-nil means it is ok to return STRING itself\n\
5484 if the encoding operation is trivial.\n\
5485 This function sets `last-coding-system-used' to the precise coding system\n\
5486 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
5487 not fully specified.)")
5488 (string, coding_system, nocopy)
5489 Lisp_Object string, coding_system, nocopy;
5490 {
5491 return code_convert_string1 (string, coding_system, nocopy, 1);
5492 }
5493
5494 /* Encode or decode STRING according to CODING_SYSTEM.
5495 Do not set Vlast_coding_system_used.
5496
5497 This function is called only from macros DECODE_FILE and
5498 ENCODE_FILE, thus we ignore character composition. */
5499
5500 Lisp_Object
5501 code_convert_string_norecord (string, coding_system, encodep)
5502 Lisp_Object string, coding_system;
5503 int encodep;
5504 {
5505 struct coding_system coding;
5506
5507 CHECK_STRING (string, 0);
5508 CHECK_SYMBOL (coding_system, 1);
5509
5510 if (NILP (coding_system))
5511 return string;
5512
5513 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
5514 error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data);
5515
5516 coding.composing = COMPOSITION_DISABLED;
5517 coding.mode |= CODING_MODE_LAST_BLOCK;
5518 return (encodep
5519 ? encode_coding_string (string, &coding, 1)
5520 : decode_coding_string (string, &coding, 1));
5521 }
5522 \f
5523 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
5524 "Decode a Japanese character which has CODE in shift_jis encoding.\n\
5525 Return the corresponding character.")
5526 (code)
5527 Lisp_Object code;
5528 {
5529 unsigned char c1, c2, s1, s2;
5530 Lisp_Object val;
5531
5532 CHECK_NUMBER (code, 0);
5533 s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
5534 if (s1 == 0)
5535 {
5536 if (s2 < 0x80)
5537 XSETFASTINT (val, s2);
5538 else if (s2 >= 0xA0 || s2 <= 0xDF)
5539 XSETFASTINT (val, MAKE_CHAR (charset_katakana_jisx0201, s2, 0));
5540 else
5541 error ("Invalid Shift JIS code: %x", XFASTINT (code));
5542 }
5543 else
5544 {
5545 if ((s1 < 0x80 || s1 > 0x9F && s1 < 0xE0 || s1 > 0xEF)
5546 || (s2 < 0x40 || s2 == 0x7F || s2 > 0xFC))
5547 error ("Invalid Shift JIS code: %x", XFASTINT (code));
5548 DECODE_SJIS (s1, s2, c1, c2);
5549 XSETFASTINT (val, MAKE_CHAR (charset_jisx0208, c1, c2));
5550 }
5551 return val;
5552 }
5553
5554 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
5555 "Encode a Japanese character CHAR to shift_jis encoding.\n\
5556 Return the corresponding code in SJIS.")
5557 (ch)
5558 Lisp_Object ch;
5559 {
5560 int charset, c1, c2, s1, s2;
5561 Lisp_Object val;
5562
5563 CHECK_NUMBER (ch, 0);
5564 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
5565 if (charset == CHARSET_ASCII)
5566 {
5567 val = ch;
5568 }
5569 else if (charset == charset_jisx0208
5570 && c1 > 0x20 && c1 < 0x7F && c2 > 0x20 && c2 < 0x7F)
5571 {
5572 ENCODE_SJIS (c1, c2, s1, s2);
5573 XSETFASTINT (val, (s1 << 8) | s2);
5574 }
5575 else if (charset == charset_katakana_jisx0201
5576 && c1 > 0x20 && c2 < 0xE0)
5577 {
5578 XSETFASTINT (val, c1 | 0x80);
5579 }
5580 else
5581 error ("Can't encode to shift_jis: %d", XFASTINT (ch));
5582 return val;
5583 }
5584
5585 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
5586 "Decode a Big5 character which has CODE in BIG5 coding system.\n\
5587 Return the corresponding character.")
5588 (code)
5589 Lisp_Object code;
5590 {
5591 int charset;
5592 unsigned char b1, b2, c1, c2;
5593 Lisp_Object val;
5594
5595 CHECK_NUMBER (code, 0);
5596 b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF;
5597 if (b1 == 0)
5598 {
5599 if (b2 >= 0x80)
5600 error ("Invalid BIG5 code: %x", XFASTINT (code));
5601 val = code;
5602 }
5603 else
5604 {
5605 if ((b1 < 0xA1 || b1 > 0xFE)
5606 || (b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE))
5607 error ("Invalid BIG5 code: %x", XFASTINT (code));
5608 DECODE_BIG5 (b1, b2, charset, c1, c2);
5609 XSETFASTINT (val, MAKE_CHAR (charset, c1, c2));
5610 }
5611 return val;
5612 }
5613
5614 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
5615 "Encode the Big5 character CHAR to BIG5 coding system.\n\
5616 Return the corresponding character code in Big5.")
5617 (ch)
5618 Lisp_Object ch;
5619 {
5620 int charset, c1, c2, b1, b2;
5621 Lisp_Object val;
5622
5623 CHECK_NUMBER (ch, 0);
5624 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
5625 if (charset == CHARSET_ASCII)
5626 {
5627 val = ch;
5628 }
5629 else if ((charset == charset_big5_1
5630 && (XFASTINT (ch) >= 0x250a1 && XFASTINT (ch) <= 0x271ec))
5631 || (charset == charset_big5_2
5632 && XFASTINT (ch) >= 0x290a1 && XFASTINT (ch) <= 0x2bdb2))
5633 {
5634 ENCODE_BIG5 (charset, c1, c2, b1, b2);
5635 XSETFASTINT (val, (b1 << 8) | b2);
5636 }
5637 else
5638 error ("Can't encode to Big5: %d", XFASTINT (ch));
5639 return val;
5640 }
5641 \f
5642 DEFUN ("set-terminal-coding-system-internal",
5643 Fset_terminal_coding_system_internal,
5644 Sset_terminal_coding_system_internal, 1, 1, 0, "")
5645 (coding_system)
5646 Lisp_Object coding_system;
5647 {
5648 CHECK_SYMBOL (coding_system, 0);
5649 setup_coding_system (Fcheck_coding_system (coding_system), &terminal_coding);
5650 /* We had better not send unsafe characters to terminal. */
5651 terminal_coding.flags |= CODING_FLAG_ISO_SAFE;
5652 /* Characer composition should be disabled. */
5653 terminal_coding.composing = COMPOSITION_DISABLED;
5654 terminal_coding.src_multibyte = 1;
5655 terminal_coding.dst_multibyte = 0;
5656 return Qnil;
5657 }
5658
5659 DEFUN ("set-safe-terminal-coding-system-internal",
5660 Fset_safe_terminal_coding_system_internal,
5661 Sset_safe_terminal_coding_system_internal, 1, 1, 0, "")
5662 (coding_system)
5663 Lisp_Object coding_system;
5664 {
5665 CHECK_SYMBOL (coding_system, 0);
5666 setup_coding_system (Fcheck_coding_system (coding_system),
5667 &safe_terminal_coding);
5668 /* Characer composition should be disabled. */
5669 safe_terminal_coding.composing = COMPOSITION_DISABLED;
5670 safe_terminal_coding.src_multibyte = 1;
5671 safe_terminal_coding.dst_multibyte = 0;
5672 return Qnil;
5673 }
5674
5675 DEFUN ("terminal-coding-system",
5676 Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
5677 "Return coding system specified for terminal output.")
5678 ()
5679 {
5680 return terminal_coding.symbol;
5681 }
5682
5683 DEFUN ("set-keyboard-coding-system-internal",
5684 Fset_keyboard_coding_system_internal,
5685 Sset_keyboard_coding_system_internal, 1, 1, 0, "")
5686 (coding_system)
5687 Lisp_Object coding_system;
5688 {
5689 CHECK_SYMBOL (coding_system, 0);
5690 setup_coding_system (Fcheck_coding_system (coding_system), &keyboard_coding);
5691 /* Characer composition should be disabled. */
5692 keyboard_coding.composing = COMPOSITION_DISABLED;
5693 return Qnil;
5694 }
5695
5696 DEFUN ("keyboard-coding-system",
5697 Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
5698 "Return coding system specified for decoding keyboard input.")
5699 ()
5700 {
5701 return keyboard_coding.symbol;
5702 }
5703
5704 \f
5705 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
5706 Sfind_operation_coding_system, 1, MANY, 0,
5707 "Choose a coding system for an operation based on the target name.\n\
5708 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).\n\
5709 DECODING-SYSTEM is the coding system to use for decoding\n\
5710 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system\n\
5711 for encoding (in case OPERATION does encoding).\n\
5712 \n\
5713 The first argument OPERATION specifies an I/O primitive:\n\
5714 For file I/O, `insert-file-contents' or `write-region'.\n\
5715 For process I/O, `call-process', `call-process-region', or `start-process'.\n\
5716 For network I/O, `open-network-stream'.\n\
5717 \n\
5718 The remaining arguments should be the same arguments that were passed\n\
5719 to the primitive. Depending on which primitive, one of those arguments\n\
5720 is selected as the TARGET. For example, if OPERATION does file I/O,\n\
5721 whichever argument specifies the file name is TARGET.\n\
5722 \n\
5723 TARGET has a meaning which depends on OPERATION:\n\
5724 For file I/O, TARGET is a file name.\n\
5725 For process I/O, TARGET is a process name.\n\
5726 For network I/O, TARGET is a service name or a port number\n\
5727 \n\
5728 This function looks up what specified for TARGET in,\n\
5729 `file-coding-system-alist', `process-coding-system-alist',\n\
5730 or `network-coding-system-alist' depending on OPERATION.\n\
5731 They may specify a coding system, a cons of coding systems,\n\
5732 or a function symbol to call.\n\
5733 In the last case, we call the function with one argument,\n\
5734 which is a list of all the arguments given to this function.")
5735 (nargs, args)
5736 int nargs;
5737 Lisp_Object *args;
5738 {
5739 Lisp_Object operation, target_idx, target, val;
5740 register Lisp_Object chain;
5741
5742 if (nargs < 2)
5743 error ("Too few arguments");
5744 operation = args[0];
5745 if (!SYMBOLP (operation)
5746 || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
5747 error ("Invalid first arguement");
5748 if (nargs < 1 + XINT (target_idx))
5749 error ("Too few arguments for operation: %s",
5750 XSYMBOL (operation)->name->data);
5751 target = args[XINT (target_idx) + 1];
5752 if (!(STRINGP (target)
5753 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
5754 error ("Invalid %dth argument", XINT (target_idx) + 1);
5755
5756 chain = ((EQ (operation, Qinsert_file_contents)
5757 || EQ (operation, Qwrite_region))
5758 ? Vfile_coding_system_alist
5759 : (EQ (operation, Qopen_network_stream)
5760 ? Vnetwork_coding_system_alist
5761 : Vprocess_coding_system_alist));
5762 if (NILP (chain))
5763 return Qnil;
5764
5765 for (; CONSP (chain); chain = XCDR (chain))
5766 {
5767 Lisp_Object elt;
5768 elt = XCAR (chain);
5769
5770 if (CONSP (elt)
5771 && ((STRINGP (target)
5772 && STRINGP (XCAR (elt))
5773 && fast_string_match (XCAR (elt), target) >= 0)
5774 || (INTEGERP (target) && EQ (target, XCAR (elt)))))
5775 {
5776 val = XCDR (elt);
5777 /* Here, if VAL is both a valid coding system and a valid
5778 function symbol, we return VAL as a coding system. */
5779 if (CONSP (val))
5780 return val;
5781 if (! SYMBOLP (val))
5782 return Qnil;
5783 if (! NILP (Fcoding_system_p (val)))
5784 return Fcons (val, val);
5785 if (! NILP (Ffboundp (val)))
5786 {
5787 val = call1 (val, Flist (nargs, args));
5788 if (CONSP (val))
5789 return val;
5790 if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
5791 return Fcons (val, val);
5792 }
5793 return Qnil;
5794 }
5795 }
5796 return Qnil;
5797 }
5798
5799 DEFUN ("update-coding-systems-internal", Fupdate_coding_systems_internal,
5800 Supdate_coding_systems_internal, 0, 0, 0,
5801 "Update internal database for ISO2022 and CCL based coding systems.\n\
5802 When values of any coding categories are changed, you must\n\
5803 call this function")
5804 ()
5805 {
5806 int i;
5807
5808 for (i = CODING_CATEGORY_IDX_EMACS_MULE; i < CODING_CATEGORY_IDX_MAX; i++)
5809 {
5810 Lisp_Object val;
5811
5812 val = XSYMBOL (XVECTOR (Vcoding_category_table)->contents[i])->value;
5813 if (!NILP (val))
5814 {
5815 if (! coding_system_table[i])
5816 coding_system_table[i] = ((struct coding_system *)
5817 xmalloc (sizeof (struct coding_system)));
5818 setup_coding_system (val, coding_system_table[i]);
5819 }
5820 else if (coding_system_table[i])
5821 {
5822 xfree (coding_system_table[i]);
5823 coding_system_table[i] = NULL;
5824 }
5825 }
5826
5827 return Qnil;
5828 }
5829
5830 DEFUN ("set-coding-priority-internal", Fset_coding_priority_internal,
5831 Sset_coding_priority_internal, 0, 0, 0,
5832 "Update internal database for the current value of `coding-category-list'.\n\
5833 This function is internal use only.")
5834 ()
5835 {
5836 int i = 0, idx;
5837 Lisp_Object val;
5838
5839 val = Vcoding_category_list;
5840
5841 while (CONSP (val) && i < CODING_CATEGORY_IDX_MAX)
5842 {
5843 if (! SYMBOLP (XCAR (val)))
5844 break;
5845 idx = XFASTINT (Fget (XCAR (val), Qcoding_category_index));
5846 if (idx >= CODING_CATEGORY_IDX_MAX)
5847 break;
5848 coding_priorities[i++] = (1 << idx);
5849 val = XCDR (val);
5850 }
5851 /* If coding-category-list is valid and contains all coding
5852 categories, `i' should be CODING_CATEGORY_IDX_MAX now. If not,
5853 the following code saves Emacs from crashing. */
5854 while (i < CODING_CATEGORY_IDX_MAX)
5855 coding_priorities[i++] = CODING_CATEGORY_MASK_RAW_TEXT;
5856
5857 return Qnil;
5858 }
5859
5860 #endif /* emacs */
5861
5862 \f
5863 /*** 9. Post-amble ***/
5864
5865 void
5866 init_coding ()
5867 {
5868 conversion_buffer = (char *) xmalloc (MINIMUM_CONVERSION_BUFFER_SIZE);
5869 }
5870
5871 void
5872 init_coding_once ()
5873 {
5874 int i;
5875
5876 /* Emacs' internal format specific initialize routine. */
5877 for (i = 0; i <= 0x20; i++)
5878 emacs_code_class[i] = EMACS_control_code;
5879 emacs_code_class[0x0A] = EMACS_linefeed_code;
5880 emacs_code_class[0x0D] = EMACS_carriage_return_code;
5881 for (i = 0x21 ; i < 0x7F; i++)
5882 emacs_code_class[i] = EMACS_ascii_code;
5883 emacs_code_class[0x7F] = EMACS_control_code;
5884 for (i = 0x80; i < 0xFF; i++)
5885 emacs_code_class[i] = EMACS_invalid_code;
5886 emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
5887 emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
5888 emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
5889 emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;
5890
5891 /* ISO2022 specific initialize routine. */
5892 for (i = 0; i < 0x20; i++)
5893 iso_code_class[i] = ISO_control_0;
5894 for (i = 0x21; i < 0x7F; i++)
5895 iso_code_class[i] = ISO_graphic_plane_0;
5896 for (i = 0x80; i < 0xA0; i++)
5897 iso_code_class[i] = ISO_control_1;
5898 for (i = 0xA1; i < 0xFF; i++)
5899 iso_code_class[i] = ISO_graphic_plane_1;
5900 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
5901 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
5902 iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
5903 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
5904 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
5905 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
5906 iso_code_class[ISO_CODE_ESC] = ISO_escape;
5907 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
5908 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
5909 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
5910
5911 conversion_buffer_size = MINIMUM_CONVERSION_BUFFER_SIZE;
5912
5913 setup_coding_system (Qnil, &keyboard_coding);
5914 setup_coding_system (Qnil, &terminal_coding);
5915 setup_coding_system (Qnil, &safe_terminal_coding);
5916 setup_coding_system (Qnil, &default_buffer_file_coding);
5917
5918 bzero (coding_system_table, sizeof coding_system_table);
5919
5920 bzero (ascii_skip_code, sizeof ascii_skip_code);
5921 for (i = 0; i < 128; i++)
5922 ascii_skip_code[i] = 1;
5923
5924 #if defined (MSDOS) || defined (WINDOWSNT)
5925 system_eol_type = CODING_EOL_CRLF;
5926 #else
5927 system_eol_type = CODING_EOL_LF;
5928 #endif
5929
5930 inhibit_pre_post_conversion = 0;
5931 }
5932
5933 #ifdef emacs
5934
5935 void
5936 syms_of_coding ()
5937 {
5938 Qtarget_idx = intern ("target-idx");
5939 staticpro (&Qtarget_idx);
5940
5941 Qcoding_system_history = intern ("coding-system-history");
5942 staticpro (&Qcoding_system_history);
5943 Fset (Qcoding_system_history, Qnil);
5944
5945 /* Target FILENAME is the first argument. */
5946 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
5947 /* Target FILENAME is the third argument. */
5948 Fput (Qwrite_region, Qtarget_idx, make_number (2));
5949
5950 Qcall_process = intern ("call-process");
5951 staticpro (&Qcall_process);
5952 /* Target PROGRAM is the first argument. */
5953 Fput (Qcall_process, Qtarget_idx, make_number (0));
5954
5955 Qcall_process_region = intern ("call-process-region");
5956 staticpro (&Qcall_process_region);
5957 /* Target PROGRAM is the third argument. */
5958 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
5959
5960 Qstart_process = intern ("start-process");
5961 staticpro (&Qstart_process);
5962 /* Target PROGRAM is the third argument. */
5963 Fput (Qstart_process, Qtarget_idx, make_number (2));
5964
5965 Qopen_network_stream = intern ("open-network-stream");
5966 staticpro (&Qopen_network_stream);
5967 /* Target SERVICE is the fourth argument. */
5968 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
5969
5970 Qcoding_system = intern ("coding-system");
5971 staticpro (&Qcoding_system);
5972
5973 Qeol_type = intern ("eol-type");
5974 staticpro (&Qeol_type);
5975
5976 Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
5977 staticpro (&Qbuffer_file_coding_system);
5978
5979 Qpost_read_conversion = intern ("post-read-conversion");
5980 staticpro (&Qpost_read_conversion);
5981
5982 Qpre_write_conversion = intern ("pre-write-conversion");
5983 staticpro (&Qpre_write_conversion);
5984
5985 Qno_conversion = intern ("no-conversion");
5986 staticpro (&Qno_conversion);
5987
5988 Qundecided = intern ("undecided");
5989 staticpro (&Qundecided);
5990
5991 Qcoding_system_p = intern ("coding-system-p");
5992 staticpro (&Qcoding_system_p);
5993
5994 Qcoding_system_error = intern ("coding-system-error");
5995 staticpro (&Qcoding_system_error);
5996
5997 Fput (Qcoding_system_error, Qerror_conditions,
5998 Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
5999 Fput (Qcoding_system_error, Qerror_message,
6000 build_string ("Invalid coding system"));
6001
6002 Qcoding_category = intern ("coding-category");
6003 staticpro (&Qcoding_category);
6004 Qcoding_category_index = intern ("coding-category-index");
6005 staticpro (&Qcoding_category_index);
6006
6007 Vcoding_category_table
6008 = Fmake_vector (make_number (CODING_CATEGORY_IDX_MAX), Qnil);
6009 staticpro (&Vcoding_category_table);
6010 {
6011 int i;
6012 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
6013 {
6014 XVECTOR (Vcoding_category_table)->contents[i]
6015 = intern (coding_category_name[i]);
6016 Fput (XVECTOR (Vcoding_category_table)->contents[i],
6017 Qcoding_category_index, make_number (i));
6018 }
6019 }
6020
6021 Qtranslation_table = intern ("translation-table");
6022 staticpro (&Qtranslation_table);
6023 Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (1));
6024
6025 Qtranslation_table_id = intern ("translation-table-id");
6026 staticpro (&Qtranslation_table_id);
6027
6028 Qtranslation_table_for_decode = intern ("translation-table-for-decode");
6029 staticpro (&Qtranslation_table_for_decode);
6030
6031 Qtranslation_table_for_encode = intern ("translation-table-for-encode");
6032 staticpro (&Qtranslation_table_for_encode);
6033
6034 Qsafe_charsets = intern ("safe-charsets");
6035 staticpro (&Qsafe_charsets);
6036
6037 Qvalid_codes = intern ("valid-codes");
6038 staticpro (&Qvalid_codes);
6039
6040 Qemacs_mule = intern ("emacs-mule");
6041 staticpro (&Qemacs_mule);
6042
6043 Qraw_text = intern ("raw-text");
6044 staticpro (&Qraw_text);
6045
6046 defsubr (&Scoding_system_p);
6047 defsubr (&Sread_coding_system);
6048 defsubr (&Sread_non_nil_coding_system);
6049 defsubr (&Scheck_coding_system);
6050 defsubr (&Sdetect_coding_region);
6051 defsubr (&Sdetect_coding_string);
6052 defsubr (&Sdecode_coding_region);
6053 defsubr (&Sencode_coding_region);
6054 defsubr (&Sdecode_coding_string);
6055 defsubr (&Sencode_coding_string);
6056 defsubr (&Sdecode_sjis_char);
6057 defsubr (&Sencode_sjis_char);
6058 defsubr (&Sdecode_big5_char);
6059 defsubr (&Sencode_big5_char);
6060 defsubr (&Sset_terminal_coding_system_internal);
6061 defsubr (&Sset_safe_terminal_coding_system_internal);
6062 defsubr (&Sterminal_coding_system);
6063 defsubr (&Sset_keyboard_coding_system_internal);
6064 defsubr (&Skeyboard_coding_system);
6065 defsubr (&Sfind_operation_coding_system);
6066 defsubr (&Supdate_coding_systems_internal);
6067 defsubr (&Sset_coding_priority_internal);
6068
6069 DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
6070 "List of coding systems.\n\
6071 \n\
6072 Do not alter the value of this variable manually. This variable should be\n\
6073 updated by the functions `make-coding-system' and\n\
6074 `define-coding-system-alias'.");
6075 Vcoding_system_list = Qnil;
6076
6077 DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
6078 "Alist of coding system names.\n\
6079 Each element is one element list of coding system name.\n\
6080 This variable is given to `completing-read' as TABLE argument.\n\
6081 \n\
6082 Do not alter the value of this variable manually. This variable should be\n\
6083 updated by the functions `make-coding-system' and\n\
6084 `define-coding-system-alias'.");
6085 Vcoding_system_alist = Qnil;
6086
6087 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
6088 "List of coding-categories (symbols) ordered by priority.");
6089 {
6090 int i;
6091
6092 Vcoding_category_list = Qnil;
6093 for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
6094 Vcoding_category_list
6095 = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
6096 Vcoding_category_list);
6097 }
6098
6099 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
6100 "Specify the coding system for read operations.\n\
6101 It is useful to bind this variable with `let', but do not set it globally.\n\
6102 If the value is a coding system, it is used for decoding on read operation.\n\
6103 If not, an appropriate element is used from one of the coding system alists:\n\
6104 There are three such tables, `file-coding-system-alist',\n\
6105 `process-coding-system-alist', and `network-coding-system-alist'.");
6106 Vcoding_system_for_read = Qnil;
6107
6108 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
6109 "Specify the coding system for write operations.\n\
6110 Programs bind this variable with `let', but you should not set it globally.\n\
6111 If the value is a coding system, it is used for encoding of output,\n\
6112 when writing it to a file and when sending it to a file or subprocess.\n\
6113 \n\
6114 If this does not specify a coding system, an appropriate element\n\
6115 is used from one of the coding system alists:\n\
6116 There are three such tables, `file-coding-system-alist',\n\
6117 `process-coding-system-alist', and `network-coding-system-alist'.\n\
6118 For output to files, if the above procedure does not specify a coding system,\n\
6119 the value of `buffer-file-coding-system' is used.");
6120 Vcoding_system_for_write = Qnil;
6121
6122 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
6123 "Coding system used in the latest file or process I/O.");
6124 Vlast_coding_system_used = Qnil;
6125
6126 DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
6127 "*Non-nil means always inhibit code conversion of end-of-line format.\n\
6128 See info node `Coding Systems' and info node `Text and Binary' concerning\n\
6129 such conversion.");
6130 inhibit_eol_conversion = 0;
6131
6132 DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
6133 "Non-nil means process buffer inherits coding system of process output.\n\
6134 Bind it to t if the process output is to be treated as if it were a file\n\
6135 read from some filesystem.");
6136 inherit_process_coding_system = 0;
6137
6138 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
6139 "Alist to decide a coding system to use for a file I/O operation.\n\
6140 The format is ((PATTERN . VAL) ...),\n\
6141 where PATTERN is a regular expression matching a file name,\n\
6142 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
6143 If VAL is a coding system, it is used for both decoding and encoding\n\
6144 the file contents.\n\
6145 If VAL is a cons of coding systems, the car part is used for decoding,\n\
6146 and the cdr part is used for encoding.\n\
6147 If VAL is a function symbol, the function must return a coding system\n\
6148 or a cons of coding systems which are used as above.\n\
6149 \n\
6150 See also the function `find-operation-coding-system'\n\
6151 and the variable `auto-coding-alist'.");
6152 Vfile_coding_system_alist = Qnil;
6153
6154 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
6155 "Alist to decide a coding system to use for a process I/O operation.\n\
6156 The format is ((PATTERN . VAL) ...),\n\
6157 where PATTERN is a regular expression matching a program name,\n\
6158 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
6159 If VAL is a coding system, it is used for both decoding what received\n\
6160 from the program and encoding what sent to the program.\n\
6161 If VAL is a cons of coding systems, the car part is used for decoding,\n\
6162 and the cdr part is used for encoding.\n\
6163 If VAL is a function symbol, the function must return a coding system\n\
6164 or a cons of coding systems which are used as above.\n\
6165 \n\
6166 See also the function `find-operation-coding-system'.");
6167 Vprocess_coding_system_alist = Qnil;
6168
6169 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
6170 "Alist to decide a coding system to use for a network I/O operation.\n\
6171 The format is ((PATTERN . VAL) ...),\n\
6172 where PATTERN is a regular expression matching a network service name\n\
6173 or is a port number to connect to,\n\
6174 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
6175 If VAL is a coding system, it is used for both decoding what received\n\
6176 from the network stream and encoding what sent to the network stream.\n\
6177 If VAL is a cons of coding systems, the car part is used for decoding,\n\
6178 and the cdr part is used for encoding.\n\
6179 If VAL is a function symbol, the function must return a coding system\n\
6180 or a cons of coding systems which are used as above.\n\
6181 \n\
6182 See also the function `find-operation-coding-system'.");
6183 Vnetwork_coding_system_alist = Qnil;
6184
6185 DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
6186 "Coding system to use with system messages.");
6187 Vlocale_coding_system = Qnil;
6188
6189 /* The eol mnemonics are reset in startup.el system-dependently. */
6190 DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
6191 "*String displayed in mode line for UNIX-like (LF) end-of-line format.");
6192 eol_mnemonic_unix = build_string (":");
6193
6194 DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
6195 "*String displayed in mode line for DOS-like (CRLF) end-of-line format.");
6196 eol_mnemonic_dos = build_string ("\\");
6197
6198 DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
6199 "*String displayed in mode line for MAC-like (CR) end-of-line format.");
6200 eol_mnemonic_mac = build_string ("/");
6201
6202 DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
6203 "*String displayed in mode line when end-of-line format is not yet determined.");
6204 eol_mnemonic_undecided = build_string (":");
6205
6206 DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
6207 "*Non-nil enables character translation while encoding and decoding.");
6208 Venable_character_translation = Qt;
6209
6210 DEFVAR_LISP ("standard-translation-table-for-decode",
6211 &Vstandard_translation_table_for_decode,
6212 "Table for translating characters while decoding.");
6213 Vstandard_translation_table_for_decode = Qnil;
6214
6215 DEFVAR_LISP ("standard-translation-table-for-encode",
6216 &Vstandard_translation_table_for_encode,
6217 "Table for translationg characters while encoding.");
6218 Vstandard_translation_table_for_encode = Qnil;
6219
6220 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
6221 "Alist of charsets vs revision numbers.\n\
6222 While encoding, if a charset (car part of an element) is found,\n\
6223 designate it with the escape sequence identifing revision (cdr part of the element).");
6224 Vcharset_revision_alist = Qnil;
6225
6226 DEFVAR_LISP ("default-process-coding-system",
6227 &Vdefault_process_coding_system,
6228 "Cons of coding systems used for process I/O by default.\n\
6229 The car part is used for decoding a process output,\n\
6230 the cdr part is used for encoding a text to be sent to a process.");
6231 Vdefault_process_coding_system = Qnil;
6232
6233 DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
6234 "Table of extra Latin codes in the range 128..159 (inclusive).\n\
6235 This is a vector of length 256.\n\
6236 If Nth element is non-nil, the existence of code N in a file\n\
6237 \(or output of subprocess) doesn't prevent it to be detected as\n\
6238 a coding system of ISO 2022 variant which has a flag\n\
6239 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file\n\
6240 or reading output of a subprocess.\n\
6241 Only 128th through 159th elements has a meaning.");
6242 Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
6243
6244 DEFVAR_LISP ("select-safe-coding-system-function",
6245 &Vselect_safe_coding_system_function,
6246 "Function to call to select safe coding system for encoding a text.\n\
6247 \n\
6248 If set, this function is called to force a user to select a proper\n\
6249 coding system which can encode the text in the case that a default\n\
6250 coding system used in each operation can't encode the text.\n\
6251 \n\
6252 The default value is `select-safe-coding-system' (which see).");
6253 Vselect_safe_coding_system_function = Qnil;
6254
6255 }
6256
6257 char *
6258 emacs_strerror (error_number)
6259 int error_number;
6260 {
6261 char *str;
6262
6263 synchronize_system_messages_locale ();
6264 str = strerror (error_number);
6265
6266 if (! NILP (Vlocale_coding_system))
6267 {
6268 Lisp_Object dec = code_convert_string_norecord (build_string (str),
6269 Vlocale_coding_system,
6270 0);
6271 str = (char *) XSTRING (dec)->data;
6272 }
6273
6274 return str;
6275 }
6276
6277 #endif /* emacs */
6278