(ccl_coding_driver): Initialize ccl->multibyte.
[bpt/emacs.git] / src / coding.c
1 /* Coding system handler (conversion, detection, etc.).
2 Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
3 Licensed to the Free Software Foundation.
4
5 This file is part of GNU Emacs.
6
7 GNU Emacs is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 2, or (at your option)
10 any later version.
11
12 GNU Emacs is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GNU Emacs; see the file COPYING. If not, write to
19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20 Boston, MA 02111-1307, USA. */
21
22 /*** TABLE OF CONTENTS ***
23
24 0. General comments
25 1. Preamble
26 2. Emacs' internal format (emacs-mule) handlers
27 3. ISO2022 handlers
28 4. Shift-JIS and BIG5 handlers
29 5. CCL handlers
30 6. End-of-line handlers
31 7. C library functions
32 8. Emacs Lisp library functions
33 9. Post-amble
34
35 */
36
37 /*** 0. General comments ***/
38
39
40 /*** GENERAL NOTE on CODING SYSTEM ***
41
42 Coding system is an encoding mechanism of one or more character
43 sets. Here's a list of coding systems which Emacs can handle. When
44 we say "decode", it means converting some other coding system to
45 Emacs' internal format (emacs-mule), and when we say "encode",
46 it means converting the coding system emacs-mule to some other
47 coding system.
48
49 0. Emacs' internal format (emacs-mule)
50
51 Emacs itself holds a multi-lingual character in a buffer and a string
52 in a special format. Details are described in section 2.
53
54 1. ISO2022
55
56 The most famous coding system for multiple character sets. X's
57 Compound Text, various EUCs (Extended Unix Code), and coding
58 systems used in Internet communication such as ISO-2022-JP are
59 all variants of ISO2022. Details are described in section 3.
60
61 2. SJIS (or Shift-JIS or MS-Kanji-Code)
62
63 A coding system to encode character sets: ASCII, JISX0201, and
64 JISX0208. Widely used for PC's in Japan. Details are described in
65 section 4.
66
67 3. BIG5
68
69 A coding system to encode character sets: ASCII and Big5. Widely
70 used by Chinese (mainly in Taiwan and Hong Kong). Details are
71 described in section 4. In this file, when we write "BIG5"
72 (all uppercase), we mean the coding system, and when we write
73 "Big5" (capitalized), we mean the character set.
74
75 4. Raw text
76
77 A coding system for a text containing random 8-bit code. Emacs does
78 no code conversion on such a text except for end-of-line format.
79
80 5. Other
81
82 If a user wants to read/write a text encoded in a coding system not
83 listed above, he can supply a decoder and an encoder for it in CCL
84 (Code Conversion Language) programs. Emacs executes the CCL program
85 while reading/writing.
86
87 Emacs represents a coding system by a Lisp symbol that has a property
88 `coding-system'. But, before actually using the coding system, the
89 information about it is set in a structure of type `struct
90 coding_system' for rapid processing. See section 6 for more details.
91
92 */
93
94 /*** GENERAL NOTES on END-OF-LINE FORMAT ***
95
96 How end-of-line of a text is encoded depends on a system. For
97 instance, Unix's format is just one byte of `line-feed' code,
98 whereas DOS's format is two-byte sequence of `carriage-return' and
99 `line-feed' codes. MacOS's format is usually one byte of
100 `carriage-return'.
101
102 Since text characters encoding and end-of-line encoding are
103 independent, any coding system described above can take
104 any format of end-of-line. So, Emacs has information of format of
105 end-of-line in each coding-system. See section 6 for more details.
106
107 */
108
109 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
110
111 These functions check if a text between SRC and SRC_END is encoded
112 in the coding system category XXX. Each returns an integer value in
113 which appropriate flag bits for the category XXX are set. The flag
114 bits are defined in macros CODING_CATEGORY_MASK_XXX. Below is the
115 template of these functions. */
116 #if 0
117 int
118 detect_coding_emacs_mule (src, src_end)
119 unsigned char *src, *src_end;
120 {
121 ...
122 }
123 #endif
124
125 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
126
127 These functions decode SRC_BYTES length of unibyte text at SOURCE
128 encoded in CODING to Emacs' internal format. The resulting
129 multibyte text goes to a place pointed to by DESTINATION, the length
130 of which should not exceed DST_BYTES.
131
132 These functions set the information of original and decoded texts in
133 the members produced, produced_char, consumed, and consumed_char of
134 the structure *CODING. They also set the member result to one of
135 CODING_FINISH_XXX indicating how the decoding finished.
136
137 DST_BYTES zero means that source area and destination area are
138 overlapped, which means that we can produce a decoded text until it
139 reaches at the head of not-yet-decoded source text.
140
141 Below is a template of these functions. */
142 #if 0
143 static void
144 decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
145 struct coding_system *coding;
146 unsigned char *source, *destination;
147 int src_bytes, dst_bytes;
148 {
149 ...
150 }
151 #endif
152
153 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
154
155 These functions encode SRC_BYTES length text at SOURCE of Emacs'
156 internal multibyte format to CODING. The resulting unibyte text
157 goes to a place pointed to by DESTINATION, the length of which
158 should not exceed DST_BYTES.
159
160 These functions set the information of original and encoded texts in
161 the members produced, produced_char, consumed, and consumed_char of
162 the structure *CODING. They also set the member result to one of
163 CODING_FINISH_XXX indicating how the encoding finished.
164
165 DST_BYTES zero means that source area and destination area are
166 overlapped, which means that we can produce an encoded text until it
167 reaches at the head of not-yet-encoded source text.
168
169 Below is a template of these functions. */
170 #if 0
171 static void
172 encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
173 struct coding_system *coding;
174 unsigned char *source, *destination;
175 int src_bytes, dst_bytes;
176 {
177 ...
178 }
179 #endif
180
181 /*** COMMONLY USED MACROS ***/
182
183 /* The following two macros ONE_MORE_BYTE and TWO_MORE_BYTES safely
184 get one and two bytes from the source text respectively.
185 If there are not enough bytes in the source, they jump to
186 `label_end_of_loop'. The caller should set variables `coding',
187 `src' and `src_end' to appropriate pointer in advance. These
188 macros are called from decoding routines `decode_coding_XXX', thus
189 it is assumed that the source text is unibyte. */
190
/* Fetch the next source byte into C1 and advance `src'.  When `src'
   has already reached `src_end', record
   CODING_FINISH_INSUFFICIENT_SRC in coding->result and jump to
   `label_end_of_loop' instead.  */
#define ONE_MORE_BYTE(c1)                                       \
  do {                                                          \
    if (src < src_end)                                          \
      c1 = *src++;                                              \
    else                                                        \
      {                                                         \
        coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
        goto label_end_of_loop;                                 \
      }                                                         \
  } while (0)
200
/* Fetch the next two source bytes into C1 and C2 (in that order) and
   advance `src' past both.  Nothing is consumed unless both bytes are
   available; otherwise record CODING_FINISH_INSUFFICIENT_SRC in
   coding->result and jump to `label_end_of_loop'.  */
#define TWO_MORE_BYTES(c1, c2)                                  \
  do {                                                          \
    if (src_end - src < 2)                                      \
      {                                                         \
        coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
        goto label_end_of_loop;                                 \
      }                                                         \
    c1 = *src++;                                                \
    c2 = *src++;                                                \
  } while (0)
211
212
213 /* Set C to the next character at the source text pointed by `src'.
214 If there are not enough characters in the source, jump to
215 `label_end_of_loop'. The caller should set variables `coding'
216 `src', `src_end', and `translation_table' to appropriate pointers
217 in advance. This macro is used in encoding routines
218 `encode_coding_XXX', thus it assumes that the source text is in
219 multibyte form except for 8-bit characters. 8-bit characters are
220 in multibyte form if coding->src_multibyte is nonzero, else they
221 are represented by a single byte. */
222
/* Read one character from `src' into C and advance `src' past it.
   With no bytes left, record CODING_FINISH_INSUFFICIENT_SRC in
   coding->result and jump to `label_end_of_loop'.  The character is
   passed through `translation_table' when that table is non-nil.  */
#define ONE_MORE_CHAR(c)                                        \
  do {                                                          \
    int len = src_end - src;                                    \
    int bytes;                                                  \
    if (len <= 0)                                               \
      {                                                         \
        coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
        goto label_end_of_loop;                                 \
      }                                                         \
    if (! coding->src_multibyte                                 \
        && ! UNIBYTE_STR_AS_MULTIBYTE_P (src, len, bytes))      \
      {                                                         \
        /* A lone byte that is taken as a character by itself.  */ \
        c = *src;                                               \
        bytes = 1;                                              \
      }                                                         \
    else                                                        \
      c = STRING_CHAR_AND_LENGTH (src, len, bytes);             \
    if (!NILP (translation_table))                              \
      c = translate_char (translation_table, c, 0, 0, 0);       \
    src += bytes;                                               \
  } while (0)
241
242
243 /* Produce a multibyte form of character C to `dst'. Jump to
244 `label_end_of_loop' if there's not enough space at `dst'.
245
246 If we are now in the middle of composition sequence, the decoded
247 character may be ALTCHAR (for the current composition). In that
248 case, the character goes to coding->cmp_data->data instead of
249 `dst'.
250
251 This macro is used in decoding routines. */
252
/* Emit character C while decoding.

   Step 1: unless a composition other than COMPOSITION_RELATIVE or
   COMPOSITION_WITH_RULE is in progress, write the multibyte form of C
   at `dst' and count it in coding->produced_char.  If the form does
   not fit below the output limit (`dst_end', or `src' when DST_BYTES
   is zero), record CODING_FINISH_INSUFFICIENT_DST and jump to
   `label_end_of_loop'.

   Step 2: if a composition other than COMPOSITION_RELATIVE is in
   progress, register C through CODING_ADD_COMPOSITION_COMPONENT and
   set coding->composition_rule_follows to 1, except for
   COMPOSITION_WITH_ALTCHARS where it is set to 0.  */
#define EMIT_CHAR(c)                                                    \
  do {                                                                  \
    if (! (COMPOSING_P (coding)                                         \
           && coding->composing != COMPOSITION_RELATIVE                 \
           && coding->composing != COMPOSITION_WITH_RULE))              \
      {                                                                 \
        int bytes = CHAR_BYTES (c);                                     \
        if ((dst_bytes ? dst_end : src) < (dst + bytes))                \
          {                                                             \
            coding->result = CODING_FINISH_INSUFFICIENT_DST;            \
            goto label_end_of_loop;                                     \
          }                                                             \
        dst += CHAR_STRING (c, dst);                                    \
        coding->produced_char++;                                        \
      }                                                                 \
                                                                        \
    if (COMPOSING_P (coding)                                            \
        && coding->composing != COMPOSITION_RELATIVE)                   \
      {                                                                 \
        CODING_ADD_COMPOSITION_COMPONENT (coding, c);                   \
        coding->composition_rule_follows                                \
          = (coding->composing == COMPOSITION_WITH_ALTCHARS ? 0 : 1);   \
      }                                                                 \
  } while (0)
277
278
/* Store the single byte C at `dst' and advance `dst'.  The output
   limit is `dst_end', or `src' when DST_BYTES is zero; if `dst' has
   reached it, record CODING_FINISH_INSUFFICIENT_DST in coding->result
   and jump to `label_end_of_loop'.  */
#define EMIT_ONE_BYTE(c)                                        \
  do {                                                          \
    if (dst < (dst_bytes ? dst_end : src))                      \
      *dst++ = c;                                               \
    else                                                        \
      {                                                         \
        coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
        goto label_end_of_loop;                                 \
      }                                                         \
  } while (0)
288
/* Store bytes C1 then C2 at `dst' and advance `dst' by two.  Nothing
   is written unless both bytes fit below the output limit (`dst_end',
   or `src' when DST_BYTES is zero); otherwise record
   CODING_FINISH_INSUFFICIENT_DST in coding->result and jump to
   `label_end_of_loop'.  */
#define EMIT_TWO_BYTES(c1, c2)                                  \
  do {                                                          \
    if ((dst_bytes ? dst_end : src) < dst + 2)                  \
      {                                                         \
        coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
        goto label_end_of_loop;                                 \
      }                                                         \
    *dst++ = c1;                                                \
    *dst++ = c2;                                                \
  } while (0)
298
/* Copy the bytes in the range FROM (inclusive) to TO (exclusive) to
   `dst', advancing both `dst' and FROM.  FROM must therefore be a
   modifiable pointer.  Nothing is copied unless the whole range fits
   below the output limit (`dst_end', or `src' when DST_BYTES is
   zero); otherwise record CODING_FINISH_INSUFFICIENT_DST in
   coding->result and jump to `label_end_of_loop'.  */
#define EMIT_BYTES(from, to)                                    \
  do {                                                          \
    if ((dst_bytes ? dst_end : src) < dst + (to - from))        \
      {                                                         \
        coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
        goto label_end_of_loop;                                 \
      }                                                         \
    while (from < to)                                           \
      {                                                         \
        *dst++ = *from++;                                       \
      }                                                         \
  } while (0)
309
310 \f
311 /*** 1. Preamble ***/
312
313 #ifdef emacs
314 #include <config.h>
315 #endif
316
317 #include <stdio.h>
318
319 #ifdef emacs
320
321 #include "lisp.h"
322 #include "buffer.h"
323 #include "charset.h"
324 #include "composite.h"
325 #include "ccl.h"
326 #include "coding.h"
327 #include "window.h"
328
329 #else /* not emacs */
330
331 #include "mulelib.h"
332
333 #endif /* not emacs */
334
335 Lisp_Object Qcoding_system, Qeol_type;
336 Lisp_Object Qbuffer_file_coding_system;
337 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
338 Lisp_Object Qno_conversion, Qundecided;
339 Lisp_Object Qcoding_system_history;
340 Lisp_Object Qsafe_chars;
341 Lisp_Object Qvalid_codes;
342
343 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
344 Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
345 Lisp_Object Qstart_process, Qopen_network_stream;
346 Lisp_Object Qtarget_idx;
347
348 Lisp_Object Vselect_safe_coding_system_function;
349
350 /* Mnemonic string for each format of end-of-line. */
351 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
352 /* Mnemonic string to indicate format of end-of-line is not yet
353 decided. */
354 Lisp_Object eol_mnemonic_undecided;
355
356 /* Format of end-of-line decided by system. This is CODING_EOL_LF on
357 Unix, CODING_EOL_CRLF on DOS/Windows, and CODING_EOL_CR on Mac. */
358 int system_eol_type;
359
360 #ifdef emacs
361
362 Lisp_Object Vcoding_system_list, Vcoding_system_alist;
363
364 Lisp_Object Qcoding_system_p, Qcoding_system_error;
365
366 /* Coding system emacs-mule and raw-text are for converting only
367 end-of-line format. */
368 Lisp_Object Qemacs_mule, Qraw_text;
369
370 /* Coding-systems are handed between Emacs Lisp programs and C internal
371 routines by the following three variables. */
372 /* Coding-system for reading files and receiving data from process. */
373 Lisp_Object Vcoding_system_for_read;
374 /* Coding-system for writing files and sending data to process. */
375 Lisp_Object Vcoding_system_for_write;
376 /* Coding-system actually used in the latest I/O. */
377 Lisp_Object Vlast_coding_system_used;
378
379 /* A vector of length 256 which contains information about special
380 Latin codes (especially for dealing with Microsoft codes). */
381 Lisp_Object Vlatin_extra_code_table;
382
383 /* Flag to inhibit code conversion of end-of-line format. */
384 int inhibit_eol_conversion;
385
386 /* Flag to inhibit ISO2022 escape sequence detection. */
387 int inhibit_iso_escape_detection;
388
389 /* Flag to make buffer-file-coding-system inherit from process-coding. */
390 int inherit_process_coding_system;
391
392 /* Coding system to be used to encode text for terminal display. */
393 struct coding_system terminal_coding;
394
395 /* Coding system to be used to encode text for terminal display when
396 terminal coding system is nil. */
397 struct coding_system safe_terminal_coding;
398
399 /* Coding system of what is sent from terminal keyboard. */
400 struct coding_system keyboard_coding;
401
402 /* Default coding system to be used to write a file. */
403 struct coding_system default_buffer_file_coding;
404
405 Lisp_Object Vfile_coding_system_alist;
406 Lisp_Object Vprocess_coding_system_alist;
407 Lisp_Object Vnetwork_coding_system_alist;
408
409 Lisp_Object Vlocale_coding_system;
410
411 #endif /* emacs */
412
413 Lisp_Object Qcoding_category, Qcoding_category_index;
414
415 /* List of symbols `coding-category-xxx' ordered by priority. */
416 Lisp_Object Vcoding_category_list;
417
418 /* Table of coding categories (Lisp symbols). */
419 Lisp_Object Vcoding_category_table;
420
421 /* Table of names of symbol for each coding-category. */
422 char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
423 "coding-category-emacs-mule",
424 "coding-category-sjis",
425 "coding-category-iso-7",
426 "coding-category-iso-7-tight",
427 "coding-category-iso-8-1",
428 "coding-category-iso-8-2",
429 "coding-category-iso-7-else",
430 "coding-category-iso-8-else",
431 "coding-category-ccl",
432 "coding-category-big5",
433 "coding-category-utf-8",
434 "coding-category-utf-16-be",
435 "coding-category-utf-16-le",
436 "coding-category-raw-text",
437 "coding-category-binary"
438 };
439
440 /* Table of pointers to coding systems corresponding to each coding
441 categories. */
442 struct coding_system *coding_system_table[CODING_CATEGORY_IDX_MAX];
443
444 /* Table of coding category masks. Nth element is a mask for a coding
445 category whose priority is Nth. */
446 static
447 int coding_priorities[CODING_CATEGORY_IDX_MAX];
448
449 /* Flag to tell if we look up translation table on character code
450 conversion. */
451 Lisp_Object Venable_character_translation;
452 /* Standard translation table to look up on decoding (reading). */
453 Lisp_Object Vstandard_translation_table_for_decode;
454 /* Standard translation table to look up on encoding (writing). */
455 Lisp_Object Vstandard_translation_table_for_encode;
456
457 Lisp_Object Qtranslation_table;
458 Lisp_Object Qtranslation_table_id;
459 Lisp_Object Qtranslation_table_for_decode;
460 Lisp_Object Qtranslation_table_for_encode;
461
462 /* Alist of charsets vs revision number. */
463 Lisp_Object Vcharset_revision_alist;
464
465 /* Default coding systems used for process I/O. */
466 Lisp_Object Vdefault_process_coding_system;
467
468 /* Global flag to tell that we can't call post-read-conversion and
469 pre-write-conversion functions. Usually the value is zero, but it
470 is set to 1 temporarily while such functions are running. This is
471 to avoid infinite recursive call. */
472 static int inhibit_pre_post_conversion;
473
474 /* Char-table containing safe coding systems of each character. */
475 Lisp_Object Vchar_coding_system_table;
476 Lisp_Object Qchar_coding_system;
477
478 /* Return `safe-chars' property of coding system CODING. Don't check
479 validity of CODING. */
480
481 Lisp_Object
482 coding_safe_chars (coding)
483 struct coding_system *coding;
484 {
485 Lisp_Object coding_spec, plist, safe_chars;
486
487 coding_spec = Fget (coding->symbol, Qcoding_system);
488 plist = XVECTOR (coding_spec)->contents[3];
489 safe_chars = Fplist_get (XVECTOR (coding_spec)->contents[3], Qsafe_chars);
490 return (CHAR_TABLE_P (safe_chars) ? safe_chars : Qt);
491 }
492
/* Nonzero if character C is acceptable according to SAFE_CHARS:
   either SAFE_CHARS is Qt, or its entry for C is non-nil.  */
#define CODING_SAFE_CHAR_P(safe_chars, c)       \
  (EQ (safe_chars, Qt)                          \
   ? 1                                          \
   : !NILP (CHAR_TABLE_REF (safe_chars, c)))
495
496 \f
497 /*** 2. Emacs internal format (emacs-mule) handlers ***/
498
499 /* Emacs' internal format for encoding multiple character sets is a
500 kind of multi-byte encoding, i.e. characters are encoded by
501 variable-length sequences of one-byte codes.
502
503 ASCII characters and control characters (e.g. `tab', `newline') are
504 represented by one-byte sequences which are their ASCII codes, in
505 the range 0x00 through 0x7F.
506
507 8-bit characters of the range 0x80..0x9F are represented by
508 two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
509 code + 0x20).
510
511 8-bit characters of the range 0xA0..0xFF are represented by
512 one-byte sequences which are their 8-bit code.
513
514 The other characters are represented by a sequence of `base
515 leading-code', optional `extended leading-code', and one or two
516 `position-code's. The length of the sequence is determined by the
517 base leading-code. Leading-code takes the range 0x80 through 0x9F,
518 whereas extended leading-code and position-code take the range 0xA0
519 through 0xFF. See `charset.h' for more details about leading-code
520 and position-code.
521
522 --- CODE RANGE of Emacs' internal format ---
523 character set range
524 ------------- -----
525 ascii 0x00..0x7F
526 eight-bit-control LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
527 eight-bit-graphic 0xA0..0xBF
528 ELSE 0x81..0x9F + [0xA0..0xFF]+
529 ---------------------------------------------
530
531 */
532
533 enum emacs_code_class_type emacs_code_class[256];
534
535 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
536 Check if a text is encoded in Emacs' internal format. If it is,
537 return CODING_CATEGORY_MASK_EMACS_MULE, else return 0. */
538
539 int
540 detect_coding_emacs_mule (src, src_end)
541 unsigned char *src, *src_end;
542 {
543 unsigned char c;
544 int composing = 0;
545 /* Dummy for ONE_MORE_BYTE. */
546 struct coding_system dummy_coding;
547 struct coding_system *coding = &dummy_coding;
548
549 while (1)
550 {
551 ONE_MORE_BYTE (c);
552
553 if (composing)
554 {
555 if (c < 0xA0)
556 composing = 0;
557 else if (c == 0xA0)
558 {
559 ONE_MORE_BYTE (c);
560 c &= 0x7F;
561 }
562 else
563 c -= 0x20;
564 }
565
566 if (c < 0x20)
567 {
568 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
569 return 0;
570 }
571 else if (c >= 0x80 && c < 0xA0)
572 {
573 if (c == 0x80)
574 /* Old leading code for a composite character. */
575 composing = 1;
576 else
577 {
578 unsigned char *src_base = src - 1;
579 int bytes;
580
581 if (!UNIBYTE_STR_AS_MULTIBYTE_P (src_base, src_end - src_base,
582 bytes))
583 return 0;
584 src = src_base + bytes;
585 }
586 }
587 }
588 label_end_of_loop:
589 return CODING_CATEGORY_MASK_EMACS_MULE;
590 }
591
592
593 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
594
595 static void
596 decode_coding_emacs_mule (coding, source, destination, src_bytes, dst_bytes)
597 struct coding_system *coding;
598 unsigned char *source, *destination;
599 int src_bytes, dst_bytes;
600 {
601 unsigned char *src = source;
602 unsigned char *src_end = source + src_bytes;
603 unsigned char *dst = destination;
604 unsigned char *dst_end = destination + dst_bytes;
605 /* SRC_BASE remembers the start position in source in each loop.
606 The loop will be exited when there's not enough source code, or
607 when there's not enough destination area to produce a
608 character. */
609 unsigned char *src_base;
610
611 coding->produced_char = 0;
612 while ((src_base = src) < src_end)
613 {
614 unsigned char tmp[MAX_MULTIBYTE_LENGTH], *p;
615 int bytes;
616
617 if (UNIBYTE_STR_AS_MULTIBYTE_P (src, src_end - src, bytes))
618 {
619 p = src;
620 src += bytes;
621 }
622 else
623 {
624 bytes = CHAR_STRING (*src, tmp);
625 p = tmp;
626 src++;
627 }
628 if (dst + bytes >= (dst_bytes ? dst_end : src))
629 {
630 coding->result = CODING_FINISH_INSUFFICIENT_DST;
631 break;
632 }
633 while (bytes--) *dst++ = *p++;
634 coding->produced_char++;
635 }
636 coding->consumed = coding->consumed_char = src_base - source;
637 coding->produced = dst - destination;
638 }
639
/* Encoding to emacs-mule is delegated entirely to encode_eol, with
   the arguments passed through unchanged.  */
#define encode_coding_emacs_mule(coding, src, dst, src_len, dst_len)    \
  encode_eol (coding, src, dst, src_len, dst_len)
642
643
644 \f
645 /*** 3. ISO2022 handlers ***/
646
647 /* The following note describes the coding system ISO2022 briefly.
648 Since the intention of this note is to help understand the
649 functions in this file, some parts are NOT ACCURATE or OVERLY
650 SIMPLIFIED. For thorough understanding, please refer to the
651 original document of ISO2022.
652
653 ISO2022 provides many mechanisms to encode several character sets
654 in 7-bit and 8-bit environments. For 7-bit environments, all text
655 is encoded using bytes less than 128. This may make the encoded
656 text a little bit longer, but the text passes more easily through
657 several gateways, some of which strip off MSB (Most Significant Bit).
658
659 There are two kinds of character sets: control character set and
660 graphic character set. The former contains control characters such
661 as `newline' and `escape' to provide control functions (control
662 functions are also provided by escape sequences). The latter
663 contains graphic characters such as 'A' and '-'. Emacs recognizes
664 two control character sets and many graphic character sets.
665
666 Graphic character sets are classified into one of the following
667 four classes, according to the number of bytes (DIMENSION) and
668 number of characters in one dimension (CHARS) of the set:
669 - DIMENSION1_CHARS94
670 - DIMENSION1_CHARS96
671 - DIMENSION2_CHARS94
672 - DIMENSION2_CHARS96
673
674 In addition, each character set is assigned an identification tag,
675 unique for each set, called "final character" (denoted as <F>
676 hereafter). The <F> of each character set is decided by ECMA(*)
677 when it is registered in ISO. The code range of <F> is 0x30..0x7F
678 (0x30..0x3F are for private use only).
679
680 Note (*): ECMA = European Computer Manufacturers Association
681
682 Here are examples of graphic character set [NAME(<F>)]:
683 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
684 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
685 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
686 o DIMENSION2_CHARS96 -- none for the moment
687
688 A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
689 C0 [0x00..0x1F] -- control character plane 0
690 GL [0x20..0x7F] -- graphic character plane 0
691 C1 [0x80..0x9F] -- control character plane 1
692 GR [0xA0..0xFF] -- graphic character plane 1
693
694 A control character set is directly designated and invoked to C0 or
695 C1 by an escape sequence. The most common case is that:
696 - ISO646's control character set is designated/invoked to C0, and
697 - ISO6429's control character set is designated/invoked to C1,
698 and usually these designations/invocations are omitted in encoded
699 text. In a 7-bit environment, only C0 can be used, and a control
700 character for C1 is encoded by an appropriate escape sequence to
701 fit into the environment. All control characters for C1 are
702 defined to have corresponding escape sequences.
703
704 A graphic character set is at first designated to one of four
705 graphic registers (G0 through G3), then these graphic registers are
706 invoked to GL or GR. These designations and invocations can be
707 done independently. The most common case is that G0 is invoked to
708 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
709 these invocations and designations are omitted in encoded text.
710 In a 7-bit environment, only GL can be used.
711
712 When a graphic character set of CHARS94 is invoked to GL, codes
713 0x20 and 0x7F of the GL area work as control characters SPACE and
714 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
715 be used.
716
717 There are two ways of invocation: locking-shift and single-shift.
718 With locking-shift, the invocation lasts until the next different
719 invocation, whereas with single-shift, the invocation affects the
720 following character only and doesn't affect the locking-shift
721 state. Invocations are done by the following control characters or
722 escape sequences:
723
724 ----------------------------------------------------------------------
725 abbrev function cntrl escape seq description
726 ----------------------------------------------------------------------
727 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
728 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
729 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
730 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
731 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
732 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
733 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
734 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
735 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
736 ----------------------------------------------------------------------
737 (*) These are not used by any known coding system.
738
739 Control characters for these functions are defined by macros
740 ISO_CODE_XXX in `coding.h'.
741
742 Designations are done by the following escape sequences:
743 ----------------------------------------------------------------------
744 escape sequence description
745 ----------------------------------------------------------------------
746 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
747 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
748 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
749 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
750 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
751 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
752 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
753 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
754 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
755 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
756 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
757 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
758 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
759 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
760 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
761 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
762 ----------------------------------------------------------------------
763
764 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
765 of dimension 1, chars 94, and final character <F>, etc...
766
767 Note (*): Although these designations are not allowed in ISO2022,
768 Emacs accepts them on decoding, and produces them on encoding
769 CHARS96 character sets in a coding system which is characterized as
770 7-bit environment, non-locking-shift, and non-single-shift.
771
772 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
773 '(' can be omitted. We refer to this as "short-form" hereafter.
774
775 Now you may notice that there are a lot of ways for encoding the
776 same multilingual text in ISO2022. Actually, there exist many
777 coding systems such as Compound Text (used in X11's inter-client
778 communication), ISO-2022-JP (used in Japanese internet), ISO-2022-KR
779 (used in Korean internet), EUC (Extended UNIX Code, used in Asian
780 localized platforms), and all of these are variants of ISO2022.
781
782 In addition to the above, Emacs handles two more kinds of escape
783 sequences: ISO6429's direction specification and Emacs' private
784 sequence for specifying character composition.
785
786 ISO6429's direction specification takes the following form:
787 o CSI ']' -- end of the current direction
788 o CSI '0' ']' -- end of the current direction
789 o CSI '1' ']' -- start of left-to-right text
790 o CSI '2' ']' -- start of right-to-left text
791 The control character CSI (0x9B: control sequence introducer) is
792 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
793
794 Character composition specification takes the following form:
795 o ESC '0' -- start relative composition
796 o ESC '1' -- end composition
797 o ESC '2' -- start rule-base composition (*)
798 o ESC '3' -- start relative composition with alternate chars (**)
799 o ESC '4' -- start rule-base composition with alternate chars (**)
800 Since these are not standard escape sequences of any ISO standard,
801 the use of them for these meaning is restricted to Emacs only.
802
803 (*) This form is used only in Emacs 20.5 and the older versions,
804 but the newer versions can safely decode it.
805 (**) This form is used only in Emacs 21.1 and the newer versions,
806 and the older versions can't decode it.
807
808 Here's a list of example usages of these composition escape
809 sequences (categorized by `enum composition_method').
810
811 COMPOSITION_RELATIVE:
812 ESC 0 CHAR [ CHAR ] ESC 1
813 COMPOSITION_WITH_RULE:
814 ESC 2 CHAR [ RULE CHAR ] ESC 1
815 COMPOSITION_WITH_ALTCHARS:
816 ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
817 COMPOSITION_WITH_RULE_ALTCHARS:
818 ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
819
820 enum iso_code_class_type iso_code_class[256];
821
/* Nonzero if the coding system registered for category index IDX
   exists, accepts character C of CHARSET, and has a requested
   designation for CHARSET.  CHARSET_ASCII is accepted without
   consulting the safe-chars table.

   For any other charset this macro assigns to a variable named
   `safe_chars', which must exist in the caller's scope.  */
#define CHARSET_OK(idx, charset, c)                                     \
  (coding_system_table[idx]                                             \
   && (charset == CHARSET_ASCII                                         \
       || (safe_chars = coding_safe_chars (coding_system_table[idx]),   \
           CODING_SAFE_CHAR_P (safe_chars, c)))                         \
   && ! (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding_system_table[idx], \
                                                charset)                \
         == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
830
/* Nonzero if the coding system registered for category index IDX has
   a non-negative initial designation for graphic register 1.  */
#define SHIFT_OUT_OK(idx)                                               \
  (! (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding_system_table[idx], 1) < 0))
833
834 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
835 Check if a text is encoded in ISO2022. If it is, returns an
836 integer in which appropriate flag bits any of:
837 CODING_CATEGORY_MASK_ISO_7
838 CODING_CATEGORY_MASK_ISO_7_TIGHT
839 CODING_CATEGORY_MASK_ISO_8_1
840 CODING_CATEGORY_MASK_ISO_8_2
841 CODING_CATEGORY_MASK_ISO_7_ELSE
842 CODING_CATEGORY_MASK_ISO_8_ELSE
843 are set. If a code which should never appear in ISO2022 is found,
844 returns 0. */
845
846 int
847 detect_coding_iso2022 (src, src_end)
848 unsigned char *src, *src_end;
849 {
850 int mask = CODING_CATEGORY_MASK_ISO;
851 int mask_found = 0;
852 int reg[4], shift_out = 0, single_shifting = 0;
853 int c, c1, i, charset;
854 /* Dummy for ONE_MORE_BYTE. */
855 struct coding_system dummy_coding;
856 struct coding_system *coding = &dummy_coding;
857 Lisp_Object safe_chars;
858
859 reg[0] = CHARSET_ASCII, reg[1] = reg[2] = reg[3] = -1;
860 while (mask && src < src_end)
861 {
862 ONE_MORE_BYTE (c);
863 switch (c)
864 {
865 case ISO_CODE_ESC:
866 if (inhibit_iso_escape_detection)
867 break;
868 single_shifting = 0;
869 ONE_MORE_BYTE (c);
870 if (c >= '(' && c <= '/')
871 {
872 /* Designation sequence for a charset of dimension 1. */
873 ONE_MORE_BYTE (c1);
874 if (c1 < ' ' || c1 >= 0x80
875 || (charset = iso_charset_table[0][c >= ','][c1]) < 0)
876 /* Invalid designation sequence. Just ignore. */
877 break;
878 reg[(c - '(') % 4] = charset;
879 }
880 else if (c == '$')
881 {
882 /* Designation sequence for a charset of dimension 2. */
883 ONE_MORE_BYTE (c);
884 if (c >= '@' && c <= 'B')
885 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
886 reg[0] = charset = iso_charset_table[1][0][c];
887 else if (c >= '(' && c <= '/')
888 {
889 ONE_MORE_BYTE (c1);
890 if (c1 < ' ' || c1 >= 0x80
891 || (charset = iso_charset_table[1][c >= ','][c1]) < 0)
892 /* Invalid designation sequence. Just ignore. */
893 break;
894 reg[(c - '(') % 4] = charset;
895 }
896 else
897 /* Invalid designation sequence. Just ignore. */
898 break;
899 }
900 else if (c == 'N' || c == 'O')
901 {
902 /* ESC <Fe> for SS2 or SS3. */
903 mask &= CODING_CATEGORY_MASK_ISO_7_ELSE;
904 break;
905 }
906 else if (c >= '0' && c <= '4')
907 {
908 /* ESC <Fp> for start/end composition. */
909 mask_found |= CODING_CATEGORY_MASK_ISO;
910 break;
911 }
912 else
913 /* Invalid escape sequence. Just ignore. */
914 break;
915
916 /* We found a valid designation sequence for CHARSET. */
917 mask &= ~CODING_CATEGORY_MASK_ISO_8BIT;
918 c = MAKE_CHAR (charset, 0, 0);
919 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7, charset, c))
920 mask_found |= CODING_CATEGORY_MASK_ISO_7;
921 else
922 mask &= ~CODING_CATEGORY_MASK_ISO_7;
923 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT, charset, c))
924 mask_found |= CODING_CATEGORY_MASK_ISO_7_TIGHT;
925 else
926 mask &= ~CODING_CATEGORY_MASK_ISO_7_TIGHT;
927 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_ELSE, charset, c))
928 mask_found |= CODING_CATEGORY_MASK_ISO_7_ELSE;
929 else
930 mask &= ~CODING_CATEGORY_MASK_ISO_7_ELSE;
931 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_8_ELSE, charset, c))
932 mask_found |= CODING_CATEGORY_MASK_ISO_8_ELSE;
933 else
934 mask &= ~CODING_CATEGORY_MASK_ISO_8_ELSE;
935 break;
936
937 case ISO_CODE_SO:
938 if (inhibit_iso_escape_detection)
939 break;
940 single_shifting = 0;
941 if (shift_out == 0
942 && (reg[1] >= 0
943 || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_7_ELSE)
944 || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_8_ELSE)))
945 {
946 /* Locking shift out. */
947 mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
948 mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
949 }
950 break;
951
952 case ISO_CODE_SI:
953 if (inhibit_iso_escape_detection)
954 break;
955 single_shifting = 0;
956 if (shift_out == 1)
957 {
958 /* Locking shift in. */
959 mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
960 mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
961 }
962 break;
963
964 case ISO_CODE_CSI:
965 single_shifting = 0;
966 case ISO_CODE_SS2:
967 case ISO_CODE_SS3:
968 {
969 int newmask = CODING_CATEGORY_MASK_ISO_8_ELSE;
970
971 if (inhibit_iso_escape_detection)
972 break;
973 if (c != ISO_CODE_CSI)
974 {
975 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
976 & CODING_FLAG_ISO_SINGLE_SHIFT)
977 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
978 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
979 & CODING_FLAG_ISO_SINGLE_SHIFT)
980 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
981 single_shifting = 1;
982 }
983 if (VECTORP (Vlatin_extra_code_table)
984 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
985 {
986 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
987 & CODING_FLAG_ISO_LATIN_EXTRA)
988 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
989 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
990 & CODING_FLAG_ISO_LATIN_EXTRA)
991 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
992 }
993 mask &= newmask;
994 mask_found |= newmask;
995 }
996 break;
997
998 default:
999 if (c < 0x80)
1000 {
1001 single_shifting = 0;
1002 break;
1003 }
1004 else if (c < 0xA0)
1005 {
1006 single_shifting = 0;
1007 if (VECTORP (Vlatin_extra_code_table)
1008 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
1009 {
1010 int newmask = 0;
1011
1012 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1013 & CODING_FLAG_ISO_LATIN_EXTRA)
1014 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
1015 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1016 & CODING_FLAG_ISO_LATIN_EXTRA)
1017 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
1018 mask &= newmask;
1019 mask_found |= newmask;
1020 }
1021 else
1022 return 0;
1023 }
1024 else
1025 {
1026 mask &= ~(CODING_CATEGORY_MASK_ISO_7BIT
1027 | CODING_CATEGORY_MASK_ISO_7_ELSE);
1028 mask_found |= CODING_CATEGORY_MASK_ISO_8_1;
1029 /* Check the length of succeeding codes of the range
1030 0xA0..0FF. If the byte length is odd, we exclude
1031 CODING_CATEGORY_MASK_ISO_8_2. We can check this only
1032 when we are not single shifting. */
1033 if (!single_shifting
1034 && mask & CODING_CATEGORY_MASK_ISO_8_2)
1035 {
1036 int i = 1;
1037 while (src < src_end)
1038 {
1039 ONE_MORE_BYTE (c);
1040 if (c < 0xA0)
1041 break;
1042 i++;
1043 }
1044
1045 if (i & 1 && src < src_end)
1046 mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
1047 else
1048 mask_found |= CODING_CATEGORY_MASK_ISO_8_2;
1049 }
1050 }
1051 break;
1052 }
1053 }
1054 label_end_of_loop:
1055 return (mask & mask_found);
1056 }
1057
/* Return the character code of the character whose charset is
   CHARSET, whose 1st position code is C1, and whose 2nd position
   code is C2.  If the variable `translation_table' of the calling
   scope is non-nil, return the code translated through that table
   instead.  */

#define DECODE_ISO_CHARACTER(charset, c1, c2)                        \
  (!NILP (translation_table)                                         \
   ? translate_char (translation_table, -1, charset, c1, c2)         \
   : MAKE_CHAR (charset, c1, c2))
1067
/* Record in CODING (a variable of the calling scope) that the charset
   identified by DIMENSION, CHARS and FINAL_CHAR is designated to
   graphic register REG.

   Control jumps to `label_invalid_code' of the calling scope when
   FINAL_CHAR is out of range, when no acceptable charset is found
   (REG is then remembered as the last invalid designation register),
   or when ASCII is designated to register 0 right after an invalid
   designation to register 0, in which case the sequence is to be
   inserted as is so that it is surely written back to a file.  */
#define DECODE_DESIGNATION(reg, dimension, chars, final_char)               \
  do {                                                                      \
    int charset, c, keep_as_is;                                             \
                                                                            \
    if ((final_char) < '0' || (final_char) >= 128)                          \
      goto label_invalid_code;                                              \
    charset = ISO_CHARSET_TABLE (make_number (dimension),                   \
                                 make_number (chars),                       \
                                 make_number (final_char));                 \
    c = MAKE_CHAR (charset, 0, 0);                                          \
    if (charset < 0                                                         \
        || !(CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)        \
               == (reg)                                                     \
             || CODING_SAFE_CHAR_P (safe_chars, c)))                        \
      {                                                                     \
        /* Unknown charset, or one this coding system may not use.  */      \
        coding->spec.iso2022.last_invalid_designation_register = (reg);     \
        goto label_invalid_code;                                            \
      }                                                                     \
    keep_as_is                                                              \
      = (coding->spec.iso2022.last_invalid_designation_register == 0        \
         && (reg) == 0                                                      \
         && charset == CHARSET_ASCII);                                      \
    coding->spec.iso2022.last_invalid_designation_register = -1;            \
    if (keep_as_is)                                                         \
      goto label_invalid_code;                                              \
    if ((coding->mode & CODING_MODE_DIRECTION)                              \
        && CHARSET_REVERSE_CHARSET (charset) >= 0)                          \
      charset = CHARSET_REVERSE_CHARSET (charset);                          \
    CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;                    \
  } while (0)
1104
1105 /* Allocate a memory block for storing information about compositions.
1106 The block is chained to the already allocated blocks. */
1107
1108 void
1109 coding_allocate_composition_data (coding, char_offset)
1110 struct coding_system *coding;
1111 int char_offset;
1112 {
1113 struct composition_data *cmp_data
1114 = (struct composition_data *) xmalloc (sizeof *cmp_data);
1115
1116 cmp_data->char_offset = char_offset;
1117 cmp_data->used = 0;
1118 cmp_data->prev = coding->cmp_data;
1119 cmp_data->next = NULL;
1120 if (coding->cmp_data)
1121 coding->cmp_data->next = cmp_data;
1122 coding->cmp_data = cmp_data;
1123 coding->cmp_data_start = 0;
1124 }
1125
/* Open a new composition record in the current block of CODING:
   remember where it begins, store -1 in slot 0 as a placeholder for
   the record length, the starting position START (offset by the
   block's char_offset) in slot 1, and METHOD in slot 3.  Slot 2 is
   left for CODING_ADD_COMPOSITION_END to fill in.  */

#define CODING_ADD_COMPOSITION_START(coding, start, method)          \
  do {                                                               \
    struct composition_data *blk_ = (coding)->cmp_data;              \
    int *rec_ = &blk_->data[blk_->used];                             \
    (coding)->cmp_data_start = blk_->used;                           \
    rec_[0] = -1;                                                    \
    rec_[1] = blk_->char_offset + (start);                           \
    rec_[3] = (int) (method);                                        \
    blk_->used += 4;                                                 \
  } while (0)
1138
/* Close the current composition record of CODING: store the record
   length in slot 0 and the ending position END (offset by the block's
   char_offset) in slot 2.  */

#define CODING_ADD_COMPOSITION_END(coding, end)                      \
  do {                                                               \
    struct composition_data *blk_ = (coding)->cmp_data;              \
    int first_ = (coding)->cmp_data_start;                           \
    blk_->data[first_] = blk_->used - first_;                        \
    blk_->data[first_ + 2] = blk_->char_offset + (end);              \
  } while (0)
1148
/* Append one COMPONENT (alternate character or composition rule) to
   the current block of CODING.  The value of the expression is
   COMPONENT as stored.  */

#define CODING_ADD_COMPOSITION_COMPONENT(coding, component)          \
  ((coding)->cmp_data->data[(coding)->cmp_data->used++] = (component))
1153
/* Handle a composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4;
   C1 is the byte following ESC.  Uses the variables `coding' and
   `dst' and the label `label_end_of_loop' of the calling scope.  */

#define DECODE_COMPOSITION_START(c1)                                       \
  do {                                                                     \
    if (coding->composing == COMPOSITION_DISABLED)                         \
      {                                                                    \
        /* Composition handling is off: copy the sequence through.  */     \
        *dst++ = ISO_CODE_ESC;                                             \
        *dst++ = (c1) & 0x7f;                                              \
        coding->produced_char += 2;                                        \
      }                                                                    \
    else if (!COMPOSING_P (coding))                                        \
      {                                                                    \
        /* This is surely the start of a composition.  We must be sure     \
           that coding->cmp_data has enough space to store the             \
           information about the composition.  If not, terminate the       \
           current decoding loop, allocate one more memory block for       \
           coding->cmp_data in the caller, then start the decoding         \
           loop again.  We can't allocate memory here directly because     \
           it may cause buffer/string relocation.  */                      \
        if (!coding->cmp_data                                              \
            || (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH \
                >= COMPOSITION_DATA_SIZE))                                 \
          {                                                                \
            coding->result = CODING_FINISH_INSUFFICIENT_CMP;               \
            goto label_end_of_loop;                                        \
          }                                                                \
        if ((c1) == '0')                                                   \
          coding->composing = COMPOSITION_RELATIVE;                        \
        else if ((c1) == '2')                                              \
          coding->composing = COMPOSITION_WITH_RULE;                       \
        else if ((c1) == '3')                                              \
          coding->composing = COMPOSITION_WITH_ALTCHARS;                   \
        else                                                               \
          coding->composing = COMPOSITION_WITH_RULE_ALTCHARS;              \
        CODING_ADD_COMPOSITION_START (coding, coding->produced_char,       \
                                      coding->composing);                  \
        coding->composition_rule_follows = 0;                              \
      }                                                                    \
    else if (coding->composing == COMPOSITION_WITH_ALTCHARS                \
             || coding->composing == COMPOSITION_WITH_RULE_ALTCHARS)       \
      {                                                                    \
        /* We are already handling a composition.  With these two          \
           methods, the codes following the current escape sequence        \
           are the actual characters stored in a buffer.  */               \
        coding->composing = COMPOSITION_RELATIVE;                          \
        coding->composition_rule_follows = 0;                              \
      }                                                                    \
  } while (0)
1201
/* Handle the composition end sequence ESC 1; C1 is the byte following
   ESC.  Uses the variables `coding' and `dst' of the calling scope.  */

#define DECODE_COMPOSITION_END(c1)                                   \
  do {                                                               \
    if (coding->composing != COMPOSITION_DISABLED)                   \
      {                                                              \
        /* Close the record opened by the start sequence.  */        \
        CODING_ADD_COMPOSITION_END (coding, coding->produced_char);  \
        coding->composing = COMPOSITION_NO;                          \
      }                                                              \
    else                                                             \
      {                                                              \
        /* Composition handling is off: copy the sequence through.  */ \
        *dst++ = ISO_CODE_ESC;                                       \
        *dst++ = c1;                                                 \
        coding->produced_char += 2;                                  \
      }                                                              \
  } while (0)
1218
/* Decode a composition rule from the byte C1 (and maybe one more byte
   read from SRC into the variable `c2' of the calling scope) and
   store one encoded composition rule in coding->cmp_data.  C1 must be
   an lvalue; it is decremented by 32 as a side effect.  A value
   outside both known formats is recorded as rule 0.  */

#define DECODE_COMPOSITION_RULE(c1)                                  \
  do {                                                               \
    int encoded_ = 0;                                                \
    (c1) -= 32;                                                      \
    if ((c1) >= 81)                                                  \
      {                                                              \
        if ((c1) < 93)          /* new format (after ver.21) */      \
          {                                                          \
            ONE_MORE_BYTE (c2);                                      \
            encoded_ = COMPOSITION_ENCODE_RULE ((c1) - 81, c2 - 32); \
          }                                                          \
      }                                                              \
    else                        /* old format (before ver.21) */     \
      {                                                              \
        int gref = (c1) / 9;                                         \
        int nref = (c1) % 9;                                         \
        if (gref == 4)                                               \
          gref = 10;                                                 \
        if (nref == 4)                                               \
          nref = 10;                                                 \
        encoded_ = COMPOSITION_ENCODE_RULE (gref, nref);             \
      }                                                              \
    CODING_ADD_COMPOSITION_COMPONENT (coding, encoded_);             \
    coding->composition_rule_follows = 0;                            \
  } while (0)
1243
1244
1245 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
1246
1247 static void
1248 decode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
1249 struct coding_system *coding;
1250 unsigned char *source, *destination;
1251 int src_bytes, dst_bytes;
1252 {
1253 unsigned char *src = source;
1254 unsigned char *src_end = source + src_bytes;
1255 unsigned char *dst = destination;
1256 unsigned char *dst_end = destination + dst_bytes;
1257 /* Charsets invoked to graphic plane 0 and 1 respectively. */
1258 int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1259 int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1260 /* SRC_BASE remembers the start position in source in each loop.
1261 The loop will be exited when there's not enough source code
1262 (within macro ONE_MORE_BYTE), or when there's not enough
1263 destination area to produce a character (within macro
1264 EMIT_CHAR). */
1265 unsigned char *src_base;
1266 int c, charset;
1267 Lisp_Object translation_table;
1268 Lisp_Object safe_chars;
1269
1270 safe_chars = coding_safe_chars (coding);
1271
1272 if (NILP (Venable_character_translation))
1273 translation_table = Qnil;
1274 else
1275 {
1276 translation_table = coding->translation_table_for_decode;
1277 if (NILP (translation_table))
1278 translation_table = Vstandard_translation_table_for_decode;
1279 }
1280
1281 coding->result = CODING_FINISH_NORMAL;
1282
1283 while (1)
1284 {
1285 int c1, c2;
1286
1287 src_base = src;
1288 ONE_MORE_BYTE (c1);
1289
1290 /* We produce no character or one character. */
1291 switch (iso_code_class [c1])
1292 {
1293 case ISO_0x20_or_0x7F:
1294 if (COMPOSING_P (coding) && coding->composition_rule_follows)
1295 {
1296 DECODE_COMPOSITION_RULE (c1);
1297 continue;
1298 }
1299 if (charset0 < 0 || CHARSET_CHARS (charset0) == 94)
1300 {
1301 /* This is SPACE or DEL. */
1302 charset = CHARSET_ASCII;
1303 break;
1304 }
1305 /* This is a graphic character, we fall down ... */
1306
1307 case ISO_graphic_plane_0:
1308 if (COMPOSING_P (coding) && coding->composition_rule_follows)
1309 {
1310 DECODE_COMPOSITION_RULE (c1);
1311 continue;
1312 }
1313 charset = charset0;
1314 break;
1315
1316 case ISO_0xA0_or_0xFF:
1317 if (charset1 < 0 || CHARSET_CHARS (charset1) == 94
1318 || coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
1319 goto label_invalid_code;
1320 /* This is a graphic character, we fall down ... */
1321
1322 case ISO_graphic_plane_1:
1323 if (charset1 < 0)
1324 goto label_invalid_code;
1325 charset = charset1;
1326 break;
1327
1328 case ISO_control_0:
1329 if (COMPOSING_P (coding))
1330 DECODE_COMPOSITION_END ('1');
1331
1332 /* All ISO2022 control characters in this class have the
1333 same representation in Emacs internal format. */
1334 if (c1 == '\n'
1335 && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1336 && (coding->eol_type == CODING_EOL_CR
1337 || coding->eol_type == CODING_EOL_CRLF))
1338 {
1339 coding->result = CODING_FINISH_INCONSISTENT_EOL;
1340 goto label_end_of_loop;
1341 }
1342 charset = CHARSET_ASCII;
1343 break;
1344
1345 case ISO_control_1:
1346 if (COMPOSING_P (coding))
1347 DECODE_COMPOSITION_END ('1');
1348 goto label_invalid_code;
1349
1350 case ISO_carriage_return:
1351 if (COMPOSING_P (coding))
1352 DECODE_COMPOSITION_END ('1');
1353
1354 if (coding->eol_type == CODING_EOL_CR)
1355 c1 = '\n';
1356 else if (coding->eol_type == CODING_EOL_CRLF)
1357 {
1358 ONE_MORE_BYTE (c1);
1359 if (c1 != ISO_CODE_LF)
1360 {
1361 if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1362 {
1363 coding->result = CODING_FINISH_INCONSISTENT_EOL;
1364 goto label_end_of_loop;
1365 }
1366 src--;
1367 c1 = '\r';
1368 }
1369 }
1370 charset = CHARSET_ASCII;
1371 break;
1372
1373 case ISO_shift_out:
1374 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1375 || CODING_SPEC_ISO_DESIGNATION (coding, 1) < 0)
1376 goto label_invalid_code;
1377 CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
1378 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1379 continue;
1380
1381 case ISO_shift_in:
1382 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
1383 goto label_invalid_code;
1384 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
1385 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1386 continue;
1387
1388 case ISO_single_shift_2_7:
1389 case ISO_single_shift_2:
1390 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1391 goto label_invalid_code;
1392 /* SS2 is handled as an escape sequence of ESC 'N' */
1393 c1 = 'N';
1394 goto label_escape_sequence;
1395
1396 case ISO_single_shift_3:
1397 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1398 goto label_invalid_code;
1399 /* SS2 is handled as an escape sequence of ESC 'O' */
1400 c1 = 'O';
1401 goto label_escape_sequence;
1402
1403 case ISO_control_sequence_introducer:
1404 /* CSI is handled as an escape sequence of ESC '[' ... */
1405 c1 = '[';
1406 goto label_escape_sequence;
1407
1408 case ISO_escape:
1409 ONE_MORE_BYTE (c1);
1410 label_escape_sequence:
1411 /* Escape sequences handled by Emacs are invocation,
1412 designation, direction specification, and character
1413 composition specification. */
1414 switch (c1)
1415 {
1416 case '&': /* revision of following character set */
1417 ONE_MORE_BYTE (c1);
1418 if (!(c1 >= '@' && c1 <= '~'))
1419 goto label_invalid_code;
1420 ONE_MORE_BYTE (c1);
1421 if (c1 != ISO_CODE_ESC)
1422 goto label_invalid_code;
1423 ONE_MORE_BYTE (c1);
1424 goto label_escape_sequence;
1425
1426 case '$': /* designation of 2-byte character set */
1427 if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
1428 goto label_invalid_code;
1429 ONE_MORE_BYTE (c1);
1430 if (c1 >= '@' && c1 <= 'B')
1431 { /* designation of JISX0208.1978, GB2312.1980,
1432 or JISX0208.1980 */
1433 DECODE_DESIGNATION (0, 2, 94, c1);
1434 }
1435 else if (c1 >= 0x28 && c1 <= 0x2B)
1436 { /* designation of DIMENSION2_CHARS94 character set */
1437 ONE_MORE_BYTE (c2);
1438 DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
1439 }
1440 else if (c1 >= 0x2C && c1 <= 0x2F)
1441 { /* designation of DIMENSION2_CHARS96 character set */
1442 ONE_MORE_BYTE (c2);
1443 DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
1444 }
1445 else
1446 goto label_invalid_code;
1447 /* We must update these variables now. */
1448 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1449 charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1450 continue;
1451
1452 case 'n': /* invocation of locking-shift-2 */
1453 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1454 || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
1455 goto label_invalid_code;
1456 CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
1457 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1458 continue;
1459
1460 case 'o': /* invocation of locking-shift-3 */
1461 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1462 || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
1463 goto label_invalid_code;
1464 CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
1465 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1466 continue;
1467
1468 case 'N': /* invocation of single-shift-2 */
1469 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1470 || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
1471 goto label_invalid_code;
1472 charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
1473 ONE_MORE_BYTE (c1);
1474 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
1475 goto label_invalid_code;
1476 break;
1477
1478 case 'O': /* invocation of single-shift-3 */
1479 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1480 || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
1481 goto label_invalid_code;
1482 charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
1483 ONE_MORE_BYTE (c1);
1484 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
1485 goto label_invalid_code;
1486 break;
1487
1488 case '0': case '2': case '3': case '4': /* start composition */
1489 DECODE_COMPOSITION_START (c1);
1490 continue;
1491
1492 case '1': /* end composition */
1493 DECODE_COMPOSITION_END (c1);
1494 continue;
1495
1496 case '[': /* specification of direction */
1497 if (coding->flags & CODING_FLAG_ISO_NO_DIRECTION)
1498 goto label_invalid_code;
1499 /* For the moment, nested direction is not supported.
1500 So, `coding->mode & CODING_MODE_DIRECTION' zero means
1501 left-to-right, and nozero means right-to-left. */
1502 ONE_MORE_BYTE (c1);
1503 switch (c1)
1504 {
1505 case ']': /* end of the current direction */
1506 coding->mode &= ~CODING_MODE_DIRECTION;
1507
1508 case '0': /* end of the current direction */
1509 case '1': /* start of left-to-right direction */
1510 ONE_MORE_BYTE (c1);
1511 if (c1 == ']')
1512 coding->mode &= ~CODING_MODE_DIRECTION;
1513 else
1514 goto label_invalid_code;
1515 break;
1516
1517 case '2': /* start of right-to-left direction */
1518 ONE_MORE_BYTE (c1);
1519 if (c1 == ']')
1520 coding->mode |= CODING_MODE_DIRECTION;
1521 else
1522 goto label_invalid_code;
1523 break;
1524
1525 default:
1526 goto label_invalid_code;
1527 }
1528 continue;
1529
1530 default:
1531 if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
1532 goto label_invalid_code;
1533 if (c1 >= 0x28 && c1 <= 0x2B)
1534 { /* designation of DIMENSION1_CHARS94 character set */
1535 ONE_MORE_BYTE (c2);
1536 DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
1537 }
1538 else if (c1 >= 0x2C && c1 <= 0x2F)
1539 { /* designation of DIMENSION1_CHARS96 character set */
1540 ONE_MORE_BYTE (c2);
1541 DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
1542 }
1543 else
1544 goto label_invalid_code;
1545 /* We must update these variables now. */
1546 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1547 charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1548 continue;
1549 }
1550 }
1551
1552 /* Now we know CHARSET and 1st position code C1 of a character.
1553 Produce a multibyte sequence for that character while getting
1554 2nd position code C2 if necessary. */
1555 if (CHARSET_DIMENSION (charset) == 2)
1556 {
1557 ONE_MORE_BYTE (c2);
1558 if (c1 < 0x80 ? c2 < 0x20 || c2 >= 0x80 : c2 < 0xA0)
1559 /* C2 is not in a valid range. */
1560 goto label_invalid_code;
1561 }
1562 c = DECODE_ISO_CHARACTER (charset, c1, c2);
1563 EMIT_CHAR (c);
1564 continue;
1565
1566 label_invalid_code:
1567 coding->errors++;
1568 if (COMPOSING_P (coding))
1569 DECODE_COMPOSITION_END ('1');
1570 src = src_base;
1571 c = *src++;
1572 EMIT_CHAR (c);
1573 }
1574
1575 label_end_of_loop:
1576 coding->consumed = coding->consumed_char = src_base - source;
1577 coding->produced = dst - destination;
1578 return;
1579 }
1580
1581
1582 /* ISO2022 encoding stuff. */
1583
1584 /*
1585 It is not enough to say just "ISO2022" on encoding, we have to
1586 specify more details. In Emacs, each coding system of ISO2022
1587 variant has the following specifications:
1588 1. Initial designation to G0 thru G3.
1589 2. Allows short-form designation?
1590 3. ASCII should be designated to G0 before control characters?
1591 4. ASCII should be designated to G0 at end of line?
1592 5. 7-bit environment or 8-bit environment?
1593 6. Use locking-shift?
1594 7. Use Single-shift?
1595 And the following two are only for Japanese:
1596 8. Use ASCII in place of JIS0201-1976-Roman?
1597 9. Use JISX0208-1983 in place of JISX0208-1978?
1598 These specifications are encoded in `coding->flags' as flag bits
1599 defined by macros CODING_FLAG_ISO_XXX. See `coding.h' for more
1600 details.
1601 */
1602
/* Produce codes (escape sequence) for designating CHARSET to graphic
   register REG at DST, and increment DST.  If <final-char> of CHARSET
   is '@', 'A', or 'B', CHARSET is a 94^2 set, REG is 0, and the coding
   system CODING allows it, produce the short-form designation
   sequence (no intermediate character).  A revision number below 255
   is announced first with ESC & <'@' + revision>.  The designation is
   also recorded in CODING.  */

#define ENCODE_DESIGNATION(charset, reg, coding)                            \
  do {                                                                      \
    unsigned char final_char = CHARSET_ISO_FINAL_CHAR (charset);            \
    int revision = CODING_SPEC_ISO_REVISION_NUMBER (coding, charset);       \
    int multibyte_p = CHARSET_DIMENSION (charset) != 1;                     \
    int chars94_p = CHARSET_CHARS (charset) == 94;                          \
    /* Intermediate characters for registers 0..3.  */                      \
    char *intermediates = chars94_p ? "()*+" : ",-./";                      \
                                                                            \
    if (revision < 255)                                                     \
      {                                                                     \
        *dst++ = ISO_CODE_ESC;                                              \
        *dst++ = '&';                                                       \
        *dst++ = '@' + revision;                                            \
      }                                                                     \
    *dst++ = ISO_CODE_ESC;                                                  \
    if (multibyte_p)                                                        \
      *dst++ = '$';                                                         \
    /* The intermediate character is dropped only in short form.  */        \
    if (! (multibyte_p                                                      \
           && chars94_p                                                     \
           && ((coding)->flags & CODING_FLAG_ISO_SHORT_FORM)                \
           && (reg) == 0                                                    \
           && final_char >= '@' && final_char <= 'B'))                      \
      *dst++ = (unsigned char) (intermediates[reg]);                        \
    *dst++ = final_char;                                                    \
    CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;                    \
  } while (0)
1645
/* The following two macros produce codes (control character or escape
   sequence) for ISO2022 single-shift functions (single-shift-2 and
   single-shift-3) at DST, and mark CODING as single-shifting.  With
   CODING_FLAG_ISO_SEVEN_BITS set, the two-byte escape sequence is
   used; otherwise the single control code.  */

#define ENCODE_SINGLE_SHIFT_2                                 \
  do {                                                        \
    if (!(coding->flags & CODING_FLAG_ISO_SEVEN_BITS))        \
      *dst++ = ISO_CODE_SS2;                                  \
    else                                                      \
      {                                                       \
        *dst++ = ISO_CODE_ESC;                                \
        *dst++ = 'N';                                         \
      }                                                       \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;             \
  } while (0)

#define ENCODE_SINGLE_SHIFT_3                                 \
  do {                                                        \
    if (!(coding->flags & CODING_FLAG_ISO_SEVEN_BITS))        \
      *dst++ = ISO_CODE_SS3;                                  \
    else                                                      \
      {                                                       \
        *dst++ = ISO_CODE_ESC;                                \
        *dst++ = 'O';                                         \
      }                                                       \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;             \
  } while (0)
1667
/* The following four macros produce codes (control character or
   escape sequence) for ISO2022 locking-shift functions (shift-in,
   shift-out, locking-shift-2, and locking-shift-3) at DST, and record
   in CODING which graphic register (0, 1, 2, or 3 respectively) is
   now invoked to graphic plane 0.  */

#define ENCODE_SHIFT_IN                                       \
  do {                                                        \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;               \
    *dst++ = ISO_CODE_SI;                                     \
  } while (0)

#define ENCODE_SHIFT_OUT                                      \
  do {                                                        \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;               \
    *dst++ = ISO_CODE_SO;                                     \
  } while (0)

#define ENCODE_LOCKING_SHIFT_2                                \
  do {                                                        \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;               \
    *dst++ = ISO_CODE_ESC;                                    \
    *dst++ = 'n';                                             \
  } while (0)

#define ENCODE_LOCKING_SHIFT_3                                \
  do {                                                        \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;               \
    *dst++ = ISO_CODE_ESC;                                    \
    *dst++ = 'o';                                             \
  } while (0)
1695
/* Produce codes for a DIMENSION1 character whose character set is
   CHARSET and whose position-code is C1.  Designation and invocation
   sequences are also produced in advance if necessary.  Uses the
   variables `coding' and `dst' of the calling scope.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                       \
  do {                                                                     \
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                          \
      {                                                                    \
        /* A single shift is pending: it covers just this character.  */   \
        *dst++ = ((coding->flags & CODING_FLAG_ISO_SEVEN_BITS)             \
                  ? ((c1) & 0x7F)                                          \
                  : ((c1) | 0x80));                                        \
        CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                      \
        break;                                                             \
      }                                                                    \
    if ((charset) == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))            \
      {                                                                    \
        *dst++ = (c1) & 0x7F;                                              \
        break;                                                             \
      }                                                                    \
    if ((charset) == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))            \
      {                                                                    \
        *dst++ = (c1) | 0x80;                                              \
        break;                                                             \
      }                                                                    \
    /* Since CHARSET is not yet invoked to any graphic planes, we          \
       must invoke it, or, at first, designate it to some graphic          \
       register.  Then repeat the loop to actually produce the             \
       character.  */                                                      \
    dst = encode_invocation_designation (charset, coding, dst);            \
  } while (1)
1728
/* Produce codes for a DIMENSION2 character whose character set is
   CHARSET and whose position-codes are C1 and C2.  Designation and
   invocation codes are also produced in advance if necessary.  Uses
   the variables `coding' and `dst' of the calling scope.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                   \
  do {                                                                     \
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                          \
      {                                                                    \
        /* A single shift is pending: it covers just this character.  */   \
        if (!(coding->flags & CODING_FLAG_ISO_SEVEN_BITS))                 \
          {                                                                \
            *dst++ = (c1) | 0x80;                                          \
            *dst++ = (c2) | 0x80;                                          \
          }                                                                \
        else                                                               \
          {                                                                \
            *dst++ = (c1) & 0x7F;                                          \
            *dst++ = (c2) & 0x7F;                                          \
          }                                                                \
        CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                      \
        break;                                                             \
      }                                                                    \
    if ((charset) == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))            \
      {                                                                    \
        *dst++ = (c1) & 0x7F;                                              \
        *dst++ = (c2) & 0x7F;                                              \
        break;                                                             \
      }                                                                    \
    if ((charset) == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))            \
      {                                                                    \
        *dst++ = (c1) | 0x80;                                              \
        *dst++ = (c2) | 0x80;                                              \
        break;                                                             \
      }                                                                    \
    /* Since CHARSET is not yet invoked to any graphic planes, we          \
       must invoke it, or, at first, designate it to some graphic          \
       register.  Then repeat the loop to actually produce the             \
       character.  */                                                      \
    dst = encode_invocation_designation (charset, coding, dst);            \
  } while (1)
1761
/* Produce codes for character C at DST.  C is split into its charset
   and position codes.  For a defined charset the character is encoded
   through ENCODE_ISO_CHARACTER_DIMENSION1 or
   ENCODE_ISO_CHARACTER_DIMENSION2, after replacing ASCII by
   JISX0201-Roman when CODING_FLAG_ISO_USE_ROMAN is set, or JISX0208
   by JISX0208-1978 when CODING_FLAG_ISO_USE_OLDJIS is set.  For an
   undefined charset the position codes are written as they are.  */

#define ENCODE_ISO_CHARACTER(c)                                            \
  do {                                                                     \
    int charset, c1, c2;                                                   \
                                                                           \
    SPLIT_CHAR (c, charset, c1, c2);                                       \
    if (!CHARSET_DEFINED_P (charset))                                      \
      {                                                                    \
        *dst++ = c1;                                                       \
        if (c2 >= 0)                                                       \
          *dst++ = c2;                                                     \
      }                                                                    \
    else if (CHARSET_DIMENSION (charset) == 1)                             \
      {                                                                    \
        if ((charset == CHARSET_ASCII)                                     \
            && (coding->flags & CODING_FLAG_ISO_USE_ROMAN))                \
          charset = charset_latin_jisx0201;                                \
        ENCODE_ISO_CHARACTER_DIMENSION1 (charset, c1);                     \
      }                                                                    \
    else                                                                   \
      {                                                                    \
        if ((charset == charset_jisx0208)                                  \
            && (coding->flags & CODING_FLAG_ISO_USE_OLDJIS))               \
          charset = charset_jisx0208_1978;                                 \
        ENCODE_ISO_CHARACTER_DIMENSION2 (charset, c1, c2);                 \
      }                                                                    \
  } while (0)
1791
1792
/* Instead of encoding character C, produce one or two `?'s: the
   substitution character CODING_INHIBIT_CHARACTER_SUBSTITUTION is
   encoded once, and a second time when the charset of C is wider
   than one column.  */

#define ENCODE_UNSAFE_CHARACTER(c)                                         \
  do {                                                                     \
    int unsafe_width_ = CHARSET_WIDTH (CHAR_CHARSET (c));                  \
                                                                           \
    ENCODE_ISO_CHARACTER (CODING_INHIBIT_CHARACTER_SUBSTITUTION);          \
    if (unsafe_width_ > 1)                                                 \
      ENCODE_ISO_CHARACTER (CODING_INHIBIT_CHARACTER_SUBSTITUTION);        \
  } while (0)
1801
1802
1803 /* Produce designation and invocation codes at a place pointed by DST
1804 to use CHARSET. The element `spec.iso2022' of *CODING is updated.
1805 Return new DST. */
1806
1807 unsigned char *
1808 encode_invocation_designation (charset, coding, dst)
1809 int charset;
1810 struct coding_system *coding;
1811 unsigned char *dst;
1812 {
1813 int reg; /* graphic register number */
1814
1815 /* At first, check designations. */
1816 for (reg = 0; reg < 4; reg++)
1817 if (charset == CODING_SPEC_ISO_DESIGNATION (coding, reg))
1818 break;
1819
1820 if (reg >= 4)
1821 {
1822 /* CHARSET is not yet designated to any graphic registers. */
1823 /* At first check the requested designation. */
1824 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1825 if (reg == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
1826 /* Since CHARSET requests no special designation, designate it
1827 to graphic register 0. */
1828 reg = 0;
1829
1830 ENCODE_DESIGNATION (charset, reg, coding);
1831 }
1832
1833 if (CODING_SPEC_ISO_INVOCATION (coding, 0) != reg
1834 && CODING_SPEC_ISO_INVOCATION (coding, 1) != reg)
1835 {
1836 /* Since the graphic register REG is not invoked to any graphic
1837 planes, invoke it to graphic plane 0. */
1838 switch (reg)
1839 {
1840 case 0: /* graphic register 0 */
1841 ENCODE_SHIFT_IN;
1842 break;
1843
1844 case 1: /* graphic register 1 */
1845 ENCODE_SHIFT_OUT;
1846 break;
1847
1848 case 2: /* graphic register 2 */
1849 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1850 ENCODE_SINGLE_SHIFT_2;
1851 else
1852 ENCODE_LOCKING_SHIFT_2;
1853 break;
1854
1855 case 3: /* graphic register 3 */
1856 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1857 ENCODE_SINGLE_SHIFT_3;
1858 else
1859 ENCODE_LOCKING_SHIFT_3;
1860 break;
1861 }
1862 }
1863
1864 return dst;
1865 }
1866
/* Produce 2-byte codes at DST for the encoded composition rule RULE:
   the global reference point biased by 32 + 81, then the new
   reference point biased by 32.  */

#define ENCODE_COMPOSITION_RULE(rule)                         \
  do {                                                        \
    int global_ref_, new_ref_;                                \
    COMPOSITION_DECODE_RULE (rule, global_ref_, new_ref_);    \
    *dst++ = (32 + 81) + global_ref_;                         \
    *dst++ = 32 + new_ref_;                                   \
  } while (0)
1876
/* Produce codes for indicating the start of a composition sequence
   (ESC 0, ESC 3, or ESC 4).  DATA points to an array of integers
   which specify information about the composition; DATA[3] is the
   composition method.  See the comment in coding.h for the format of
   DATA.  For methods other than COMPOSITION_RELATIVE, CODING is also
   set up to walk the components stored after the 4-integer header.  */

#define ENCODE_COMPOSITION_START(coding, data)                             \
  do {                                                                     \
    (coding)->composing = (data)[3];                                       \
    *dst++ = ISO_CODE_ESC;                                                 \
    if ((coding)->composing != COMPOSITION_RELATIVE)                       \
      {                                                                    \
        if ((coding)->composing == COMPOSITION_WITH_ALTCHARS)              \
          *dst++ = '3';                                                    \
        else                                                               \
          *dst++ = '4';                                                    \
        (coding)->cmp_data_index = (coding)->cmp_data_start + 4;           \
        (coding)->composition_rule_follows = 0;                            \
      }                                                                    \
    else                                                                   \
      *dst++ = '0';                                                        \
  } while (0)
1896
/* Emit ESC 1, the sequence that closes the current composition, at
   DST (which is advanced).  CODING->cmp_data_start is moved forward by
   DATA[0] and CODING->composing becomes COMPOSITION_NO.  When that
   leaves cmp_data_start equal to CODING->cmp_data->used and a further
   composition data block is chained through `next', CODING switches
   to that block and restarts at offset 0.  */

#define ENCODE_COMPOSITION_END(coding, data)                            \
  do {                                                                  \
    dst[0] = ISO_CODE_ESC;                                              \
    dst[1] = '1';                                                       \
    dst += 2;                                                           \
    (coding)->cmp_data_start = (coding)->cmp_data_start + (data)[0];    \
    (coding)->composing = COMPOSITION_NO;                               \
    if ((coding)->cmp_data->next                                        \
        && (coding)->cmp_data_start == (coding)->cmp_data->used)        \
      {                                                                 \
        (coding)->cmp_data_start = 0;                                   \
        (coding)->cmp_data = (coding)->cmp_data->next;                  \
      }                                                                 \
  } while (0)
1912
/* Emit ESC 0 at DST (which is advanced) and mark CODING as being in
   a COMPOSITION_RELATIVE composition.  Despite the sequence used,
   this does not open a new composition: it signals that the
   components (alternate chars and composition rules) have just been
   produced and that the actual text of the composition follows in
   SRC.  */

#define ENCODE_COMPOSITION_FAKE_START(coding)           \
  do {                                                  \
    (coding)->composing = COMPOSITION_RELATIVE;         \
    dst[0] = ISO_CODE_ESC;                              \
    dst[1] = '0';                                       \
    dst += 2;                                           \
  } while (0)
1924
/* The following three macros produce codes for indicating direction
   of text.  Each one is a single statement that stores bytes at DST
   and advances it; a `struct coding_system *' named CODING must be in
   scope where they are used.  */

/* Produce a control sequence introducer: the two bytes ESC [ when
   CODING_FLAG_ISO_SEVEN_BITS is set in CODING->flags, the single byte
   CSI otherwise.  The flag is tested with `&' like every other
   CODING_FLAG_ISO_* test in this file; comparing the whole flag word
   with `==' would only match when no other flag is set.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER              \
  do {                                                  \
    if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)     \
      *dst++ = ISO_CODE_ESC, *dst++ = '[';              \
    else                                                \
      *dst++ = ISO_CODE_CSI;                            \
  } while (0)

/* Produce the sequence "<introducer> 2 ]" indicating right-to-left
   text.  */
#define ENCODE_DIRECTION_R2L                    \
  do {                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
    *dst++ = '2';                               \
    *dst++ = ']';                               \
  } while (0)

/* Produce the sequence "<introducer> 0 ]" indicating left-to-right
   text.  */
#define ENCODE_DIRECTION_L2R                    \
  do {                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
    *dst++ = '0';                               \
    *dst++ = ']';                               \
  } while (0)
1940
/* Bring the graphic planes and registers of CODING back to their
   initial state, producing whatever invocation and designation codes
   that takes.  If something other than register 0 is invoked to
   graphic plane 0, a shift-in is produced.  Then every graphic
   register whose initial designation is non-negative and differs from
   its current designation gets its initial charset designated
   again.  */
#define ENCODE_RESET_PLANE_AND_REGISTER                                     \
  do {                                                                      \
    int reg = 0;                                                            \
    if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0)                        \
      ENCODE_SHIFT_IN;                                                      \
    while (reg < 4)                                                         \
      {                                                                     \
        if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg) >= 0          \
            && (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg)           \
                != CODING_SPEC_ISO_DESIGNATION (coding, reg)))              \
          ENCODE_DESIGNATION                                                \
            (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg), reg, coding); \
        reg++;                                                              \
      }                                                                     \
  } while (0)
1955
1956 /* Produce designation sequences of charsets in the line started from
1957 SRC to a place pointed by DST, and return updated DST.
1958
1959 If the current block ends before any end-of-line, we may fail to
1960 find all the necessary designations. */
1961
1962 static unsigned char *
1963 encode_designation_at_bol (coding, translation_table, src, src_end, dst)
1964 struct coding_system *coding;
1965 Lisp_Object translation_table;
1966 unsigned char *src, *src_end, *dst;
1967 {
1968 int charset, c, found = 0, reg;
1969 /* Table of charsets to be designated to each graphic register. */
1970 int r[4];
1971
1972 for (reg = 0; reg < 4; reg++)
1973 r[reg] = -1;
1974
1975 while (found < 4)
1976 {
1977 ONE_MORE_CHAR (c);
1978 if (c == '\n')
1979 break;
1980
1981 charset = CHAR_CHARSET (c);
1982 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1983 if (reg != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION && r[reg] < 0)
1984 {
1985 found++;
1986 r[reg] = charset;
1987 }
1988 }
1989
1990 label_end_of_loop:
1991 if (found)
1992 {
1993 for (reg = 0; reg < 4; reg++)
1994 if (r[reg] >= 0
1995 && CODING_SPEC_ISO_DESIGNATION (coding, reg) != r[reg])
1996 ENCODE_DESIGNATION (r[reg], reg, coding);
1997 }
1998
1999 return dst;
2000 }
2001
/* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".

   Encode the SRC_BYTES bytes of text at SOURCE with the ISO2022
   coding system described by CODING and store the result at
   DESTINATION, whose size is DST_BYTES.

   On return CODING->consumed holds the number of source bytes used up
   to the start of the last, unfinished loop iteration,
   CODING->consumed_char the number of characters counted, and
   CODING->produced and CODING->produced_char the number of bytes
   stored.  CODING->errors counts the non-ASCII single byte characters
   that were copied through unencoded.  CODING->result is set to
   CODING_FINISH_INSUFFICIENT_DST when the destination fills up.  */

static void
encode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
     struct coding_system *coding;
     unsigned char *source, *destination;
     int src_bytes, dst_bytes;
{
  unsigned char *src = source;
  unsigned char *src_end = source + src_bytes;
  unsigned char *dst = destination;
  unsigned char *dst_end = destination + dst_bytes;
  /* Since the maximum bytes produced by each loop is 20, we subtract 19
     from DST_END to assure overflow checking is necessary only at the
     head of loop.  */
  unsigned char *adjusted_dst_end = dst_end - 19;
  /* SRC_BASE remembers the start position in source in each loop.
     The loop will be exited when there's not enough source text to
     analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
     there's not enough destination area to produce encoded codes
     (within macro EMIT_BYTES).  */
  unsigned char *src_base;
  int c;
  Lisp_Object translation_table;
  Lisp_Object safe_chars;

  safe_chars = coding_safe_chars (coding);

  /* Pick the translation table: none when character translation is
     disabled, else the one attached to CODING, else the standard one
     for encoding.  */
  if (NILP (Venable_character_translation))
    translation_table = Qnil;
  else
    {
      translation_table = coding->translation_table_for_encode;
      if (NILP (translation_table))
        translation_table = Vstandard_translation_table_for_encode;
    }

  coding->consumed_char = 0;
  coding->errors = 0;
  while (1)
    {
      src_base = src;

      /* Stop while there is still room for the bytes one iteration may
         produce.  When DST_BYTES is zero the limit is taken relative
         to SRC instead of DST_END.
         NOTE(review): presumably that covers encoding in place, with
         the destination trailing the source in the same buffer --
         confirm against the callers.  */
      if (dst >= (dst_bytes ? adjusted_dst_end : (src - 19)))
        {
          coding->result = CODING_FINISH_INSUFFICIENT_DST;
          break;
        }

      if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL
          && CODING_SPEC_ISO_BOL (coding))
        {
          /* We have to produce designation sequences if any now.  */
          dst = encode_designation_at_bol (coding, translation_table,
                                           src, src_end, dst);
          CODING_SPEC_ISO_BOL (coding) = 0;
        }

      /* Check composition start and end.  */
      if (coding->composing != COMPOSITION_DISABLED
          && coding->cmp_data_start < coding->cmp_data->used)
        {
          struct composition_data *cmp_data = coding->cmp_data;
          int *data = cmp_data->data + coding->cmp_data_start;
          /* Character position of the next source character, in the
             same units as the positions stored in DATA[1] and
             DATA[2].  */
          int this_pos = cmp_data->char_offset + coding->consumed_char;

          if (coding->composing == COMPOSITION_RELATIVE)
            {
              if (this_pos == data[2])
                {
                  ENCODE_COMPOSITION_END (coding, data);
                  /* The macro may have advanced to the next
                     composition or composition data block, so reload
                     the local views of it.  */
                  cmp_data = coding->cmp_data;
                  data = cmp_data->data + coding->cmp_data_start;
                }
            }
          else if (COMPOSING_P (coding))
            {
              /* COMPOSITION_WITH_ALTCHARS or COMPOSITION_WITH_RULE_ALTCHAR  */
              if (coding->cmp_data_index == coding->cmp_data_start + data[0])
                /* We have consumed components of the composition.
                   What follows in SRC is the compositions's base
                   text.  */
                ENCODE_COMPOSITION_FAKE_START (coding);
              else
                {
                  /* Produce the next component from the composition
                     data rather than from SRC.  This C shadows the
                     function-level C.  */
                  int c = cmp_data->data[coding->cmp_data_index++];
                  if (coding->composition_rule_follows)
                    {
                      ENCODE_COMPOSITION_RULE (c);
                      coding->composition_rule_follows = 0;
                    }
                  else
                    {
                      if (coding->flags & CODING_FLAG_ISO_SAFE
                          && ! CODING_SAFE_CHAR_P (safe_chars, c))
                        ENCODE_UNSAFE_CHARACTER (c);
                      else
                        ENCODE_ISO_CHARACTER (c);
                      if (coding->composing == COMPOSITION_WITH_RULE_ALTCHARS)
                        coding->composition_rule_follows = 1;
                    }
                  /* No source character was read, so skip the
                     consumed_char update at the bottom of the loop.  */
                  continue;
                }
            }
          if (!COMPOSING_P (coding))
            {
              if (this_pos == data[1])
                {
                  ENCODE_COMPOSITION_START (coding, data);
                  /* Likewise nothing was read from SRC.  */
                  continue;
                }
            }
        }

      ONE_MORE_CHAR (c);

      /* Now encode the character C.  */
      if (c < 0x20 || c == 0x7F)
        {
          if (c == '\r')
            {
              if (! (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
                {
                  if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
                    ENCODE_RESET_PLANE_AND_REGISTER;
                  *dst++ = c;
                  /* NOTE(review): this `continue' bypasses the
                     coding->consumed_char++ at the bottom of the loop
                     although a character was read from SRC -- confirm
                     that this is intended.  */
                  continue;
                }
              /* fall down to treat '\r' as '\n' ...  */
              c = '\n';
            }
          if (c == '\n')
            {
              if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
                ENCODE_RESET_PLANE_AND_REGISTER;
              if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL)
                bcopy (coding->spec.iso2022.initial_designation,
                       coding->spec.iso2022.current_designation,
                       sizeof coding->spec.iso2022.initial_designation);
              /* Produce the end-of-line bytes selected by
                 CODING->eol_type.  */
              if (coding->eol_type == CODING_EOL_LF
                  || coding->eol_type == CODING_EOL_UNDECIDED)
                *dst++ = ISO_CODE_LF;
              else if (coding->eol_type == CODING_EOL_CRLF)
                *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
              else
                *dst++ = ISO_CODE_CR;
              CODING_SPEC_ISO_BOL (coding) = 1;
            }
          else
            {
              if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
                ENCODE_RESET_PLANE_AND_REGISTER;
              *dst++ = c;
            }
        }
      else if (ASCII_BYTE_P (c))
        ENCODE_ISO_CHARACTER (c);
      else if (SINGLE_BYTE_CHAR_P (c))
        {
          /* A non-ASCII single byte character is copied as is and
             counted as an error.  */
          *dst++ = c;
          coding->errors++;
        }
      else if (coding->flags & CODING_FLAG_ISO_SAFE
               && ! CODING_SAFE_CHAR_P (safe_chars, c))
        ENCODE_UNSAFE_CHARACTER (c);
      else
        ENCODE_ISO_CHARACTER (c);

      coding->consumed_char++;
    }

 label_end_of_loop:
  /* SRC_BASE, not SRC, is reported: an iteration that was abandoned
     half way has not consumed its character.  */
  coding->consumed = src_base - source;
  coding->produced = coding->produced_char = dst - destination;
}
2177
2178 \f
2179 /*** 4. SJIS and BIG5 handlers ***/
2180
2181 /* Although SJIS and BIG5 are not ISO's coding system, they are used
2182 quite widely. So, for the moment, Emacs supports them in the bare
2183 C code. But, in the future, they may be supported only by CCL. */
2184
/* SJIS is a coding system encoding three character sets: ASCII, right
   half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
   as is.  A character of charset katakana-jisx0201 is encoded by
   "position-code + 0x80".  A character of charset japanese-jisx0208
   is encoded in two bytes, but the two position-codes are divided and
   shifted so that they fit in the ranges below.
2191
2192 --- CODE RANGE of SJIS ---
2193 (character set) (range)
2194 ASCII 0x00 .. 0x7F
2195 KATAKANA-JISX0201 0xA0 .. 0xDF
2196 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF
2197 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
2198 -------------------------------
2199
2200 */
2201
2202 /* BIG5 is a coding system encoding two character sets: ASCII and
2203 Big5. An ASCII character is encoded as is. Big5 is a two-byte
2204 character set and is encoded in two-byte.
2205
2206 --- CODE RANGE of BIG5 ---
2207 (character set) (range)
2208 ASCII 0x00 .. 0x7F
2209 Big5 (1st byte) 0xA1 .. 0xFE
2210 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
2211 --------------------------
2212
2213 Since the number of characters in Big5 is larger than maximum
2214 characters in Emacs' charset (96x96), it can't be handled as one
2215 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
2216 and `charset-big5-2'. Both are DIMENSION2 and CHARS94. The former
2217 contains frequently used characters and the latter contains less
2218 frequently used characters. */
2219
/* Macros to decode or encode a character of Big5 in BIG5.  B1 and B2
   are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
   C1 and C2 are the 1st and 2nd position-codes of Emacs' internal
   format.  CHARSET is `charset_big5_1' or `charset_big5_2'.

   Both macros go through a linear index: a Big5 code is numbered row
   by row with BIG5_SAME_ROW codes per first byte, and the index is
   then split into 94x94 internal position codes (or the reverse).
   The inputs are read into that index before any output is stored,
   so the same variables may be passed for (B1, B2) and (C1, C2).  */

/* Number of Big5 characters which have the same code in 1st byte.  */
#define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)

#define DECODE_BIG5(b1, b2, charset, c1, c2)                            \
  do {                                                                  \
    unsigned int big5_linear = ((b1) - 0xA1) * BIG5_SAME_ROW;           \
    big5_linear += (b2) - ((b2) < 0x7F ? 0x40 : 0x62);                  \
    if ((b1) >= 0xC9)                                                   \
      {                                                                 \
        /* Second level: renumber from the first code of row 0xC9.  */  \
        big5_linear -= (0xC9 - 0xA1) * BIG5_SAME_ROW;                   \
        (charset) = charset_big5_2;                                     \
      }                                                                 \
    else                                                                \
      (charset) = charset_big5_1;                                       \
    (c1) = big5_linear / (0xFF - 0xA1) + 0x21;                          \
    (c2) = big5_linear % (0xFF - 0xA1) + 0x21;                          \
  } while (0)

#define ENCODE_BIG5(charset, c1, c2, b1, b2)                            \
  do {                                                                  \
    unsigned int big5_linear                                            \
      = ((c1) - 0x21) * (0xFF - 0xA1) + ((c2) - 0x21);                  \
    unsigned int big5_column;                                           \
    if ((charset) == charset_big5_2)                                    \
      big5_linear += BIG5_SAME_ROW * (0xC9 - 0xA1);                     \
    big5_column = big5_linear % BIG5_SAME_ROW;                          \
    (b1) = big5_linear / BIG5_SAME_ROW + 0xA1;                          \
    /* Columns 0..0x3E map to 0x40..0x7E, the rest to 0xA1..0xFE.  */   \
    (b2) = big5_column + (big5_column < 0x3F ? 0x40 : 0x62);            \
  } while (0)
2252
2253 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2254 Check if a text is encoded in SJIS. If it is, return
2255 CODING_CATEGORY_MASK_SJIS, else return 0. */
2256
2257 int
2258 detect_coding_sjis (src, src_end)
2259 unsigned char *src, *src_end;
2260 {
2261 int c;
2262 /* Dummy for ONE_MORE_BYTE. */
2263 struct coding_system dummy_coding;
2264 struct coding_system *coding = &dummy_coding;
2265
2266 while (1)
2267 {
2268 ONE_MORE_BYTE (c);
2269 if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
2270 {
2271 ONE_MORE_BYTE (c);
2272 if (c < 0x40)
2273 return 0;
2274 }
2275 }
2276 label_end_of_loop:
2277 return CODING_CATEGORY_MASK_SJIS;
2278 }
2279
2280 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2281 Check if a text is encoded in BIG5. If it is, return
2282 CODING_CATEGORY_MASK_BIG5, else return 0. */
2283
2284 int
2285 detect_coding_big5 (src, src_end)
2286 unsigned char *src, *src_end;
2287 {
2288 int c;
2289 /* Dummy for ONE_MORE_BYTE. */
2290 struct coding_system dummy_coding;
2291 struct coding_system *coding = &dummy_coding;
2292
2293 while (1)
2294 {
2295 ONE_MORE_BYTE (c);
2296 if (c >= 0xA1)
2297 {
2298 ONE_MORE_BYTE (c);
2299 if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
2300 return 0;
2301 }
2302 }
2303 label_end_of_loop:
2304 return CODING_CATEGORY_MASK_BIG5;
2305 }
2306
2307 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2308 Check if a text is encoded in UTF-8. If it is, return
2309 CODING_CATEGORY_MASK_UTF_8, else return 0. */
2310
2311 #define UTF_8_1_OCTET_P(c) ((c) < 0x80)
2312 #define UTF_8_EXTRA_OCTET_P(c) (((c) & 0xC0) == 0x80)
2313 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
2314 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
2315 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
2316 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
2317 #define UTF_8_6_OCTET_LEADING_P(c) (((c) & 0xFE) == 0xFC)
2318
2319 int
2320 detect_coding_utf_8 (src, src_end)
2321 unsigned char *src, *src_end;
2322 {
2323 unsigned char c;
2324 int seq_maybe_bytes;
2325 /* Dummy for ONE_MORE_BYTE. */
2326 struct coding_system dummy_coding;
2327 struct coding_system *coding = &dummy_coding;
2328
2329 while (1)
2330 {
2331 ONE_MORE_BYTE (c);
2332 if (UTF_8_1_OCTET_P (c))
2333 continue;
2334 else if (UTF_8_2_OCTET_LEADING_P (c))
2335 seq_maybe_bytes = 1;
2336 else if (UTF_8_3_OCTET_LEADING_P (c))
2337 seq_maybe_bytes = 2;
2338 else if (UTF_8_4_OCTET_LEADING_P (c))
2339 seq_maybe_bytes = 3;
2340 else if (UTF_8_5_OCTET_LEADING_P (c))
2341 seq_maybe_bytes = 4;
2342 else if (UTF_8_6_OCTET_LEADING_P (c))
2343 seq_maybe_bytes = 5;
2344 else
2345 return 0;
2346
2347 do
2348 {
2349 ONE_MORE_BYTE (c);
2350 if (!UTF_8_EXTRA_OCTET_P (c))
2351 return 0;
2352 seq_maybe_bytes--;
2353 }
2354 while (seq_maybe_bytes > 0);
2355 }
2356
2357 label_end_of_loop:
2358 return CODING_CATEGORY_MASK_UTF_8;
2359 }
2360
/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
   Check if a text is encoded in UTF-16 by looking at its first two
   bytes.  If they are 0xFF 0xFE, return
   CODING_CATEGORY_MASK_UTF_16_LE (Little Endian); if they are
   0xFE 0xFF, return CODING_CATEGORY_MASK_UTF_16_BE (Big Endian);
   else return 0.  */
2366
/* Nonzero if the 16-bit value VAL is 0xFFFE or 0xFFFF.  */
#define UTF_16_INVALID_P(val)   \
  (((val) == 0xFFFE)            \
   || ((val) == 0xFFFF))

/* Nonzero if the 16-bit value VAL is a high surrogate, i.e. lies in
   0xD800..0xDBFF.  All six top bits have to be compared: testing only
   the bits of 0xD800 would also accept low surrogates and values such
   as 0xF800 or 0xFFFF.  */
#define UTF_16_HIGH_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xD800)

/* Nonzero if the 16-bit value VAL is a low surrogate, i.e. lies in
   0xDC00..0xDFFF.  */
#define UTF_16_LOW_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xDC00)
2376
2377 int
2378 detect_coding_utf_16 (src, src_end)
2379 unsigned char *src, *src_end;
2380 {
2381 unsigned char c1, c2;
2382 /* Dummy for TWO_MORE_BYTES. */
2383 struct coding_system dummy_coding;
2384 struct coding_system *coding = &dummy_coding;
2385
2386 TWO_MORE_BYTES (c1, c2);
2387
2388 if ((c1 == 0xFF) && (c2 == 0xFE))
2389 return CODING_CATEGORY_MASK_UTF_16_LE;
2390 else if ((c1 == 0xFE) && (c2 == 0xFF))
2391 return CODING_CATEGORY_MASK_UTF_16_BE;
2392
2393 label_end_of_loop:
2394 return 0;
2395 }
2396
/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
   If SJIS_P is 1, decode SJIS text, else decode BIG5 text.

   The SRC_BYTES bytes at SOURCE are decoded into DESTINATION, whose
   size is DST_BYTES.  A byte sequence that is not valid in the
   selected coding system is not dropped: CODING->errors is
   incremented and the first byte of the sequence is emitted as a
   character by itself, after which decoding resumes at the following
   byte.  End-of-line sequences are converted according to
   CODING->eol_type.

   On return CODING->consumed and CODING->consumed_char both hold the
   number of source bytes used up to the start of the last, unfinished
   loop iteration, and CODING->produced the number of bytes stored.
   CODING->result is set to CODING_FINISH_INCONSISTENT_EOL when an
   end-of-line sequence contradicts CODING->eol_type while
   CODING_MODE_INHIBIT_INCONSISTENT_EOL is set in CODING->mode.  */

static void
decode_coding_sjis_big5 (coding, source, destination,
                         src_bytes, dst_bytes, sjis_p)
     struct coding_system *coding;
     unsigned char *source, *destination;
     int src_bytes, dst_bytes;
     int sjis_p;
{
  unsigned char *src = source;
  unsigned char *src_end = source + src_bytes;
  unsigned char *dst = destination;
  unsigned char *dst_end = destination + dst_bytes;
  /* SRC_BASE remembers the start position in source in each loop.
     The loop will be exited when there's not enough source code
     (within macro ONE_MORE_BYTE), or when there's not enough
     destination area to produce a character (within macro
     EMIT_CHAR).  */
  unsigned char *src_base;
  Lisp_Object translation_table;

  /* Pick the translation table: none when character translation is
     disabled, else the one attached to CODING, else the standard one
     for decoding.  */
  if (NILP (Venable_character_translation))
    translation_table = Qnil;
  else
    {
      translation_table = coding->translation_table_for_decode;
      if (NILP (translation_table))
        translation_table = Vstandard_translation_table_for_decode;
    }

  coding->produced_char = 0;
  while (1)
    {
      int c, charset, c1, c2;

      src_base = src;
      ONE_MORE_BYTE (c1);

      if (c1 < 0x80)
        {
          /* ASCII, including the end-of-line handling.  */
          charset = CHARSET_ASCII;
          if (c1 < 0x20)
            {
              if (c1 == '\r')
                {
                  if (coding->eol_type == CODING_EOL_CRLF)
                    {
                      ONE_MORE_BYTE (c2);
                      if (c2 == '\n')
                        /* CR LF becomes a single newline.  */
                        c1 = c2;
                      else if (coding->mode
                               & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
                        {
                          coding->result = CODING_FINISH_INCONSISTENT_EOL;
                          goto label_end_of_loop;
                        }
                      else
                        /* To process C2 again, SRC is subtracted by 1.  */
                        src--;
                    }
                  else if (coding->eol_type == CODING_EOL_CR)
                    c1 = '\n';
                }
              else if (c1 == '\n'
                       && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
                       && (coding->eol_type == CODING_EOL_CR
                           || coding->eol_type == CODING_EOL_CRLF))
                {
                  /* A bare LF where CR or CR LF is the expected
                     end-of-line.  */
                  coding->result = CODING_FINISH_INCONSISTENT_EOL;
                  goto label_end_of_loop;
                }
            }
        }
      else
        {
          if (sjis_p)
            {
              if (c1 >= 0xF0)
                goto label_invalid_code;
              if (c1 < 0xA0 || c1 >= 0xE0)
                {
                  /* SJIS -> JISX0208 */
                  ONE_MORE_BYTE (c2);
                  if (c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
                    goto label_invalid_code;
                  DECODE_SJIS (c1, c2, c1, c2);
                  charset = charset_jisx0208;
                }
              else
                /* SJIS -> JISX0201-Kana */
                charset = charset_katakana_jisx0201;
            }
          else
            {
              /* BIG5 -> Big5 */
              if (c1 < 0xA1 || c1 > 0xFE)
                goto label_invalid_code;
              ONE_MORE_BYTE (c2);
              if (c2 < 0x40 || (c2 > 0x7E && c2 < 0xA1) || c2 > 0xFE)
                goto label_invalid_code;
              /* The macro also selects CHARSET.  */
              DECODE_BIG5 (c1, c2, charset, c1, c2);
            }
        }

      /* NOTE(review): C2 is not assigned on the one-byte paths (ASCII
         and JISX0201-Kana); presumably DECODE_ISO_CHARACTER ignores it
         for those charsets -- confirm.  */
      c = DECODE_ISO_CHARACTER (charset, c1, c2);
      EMIT_CHAR (c);
      continue;

    label_invalid_code:
      /* Rewind to the start of the offending sequence and pass its
         first byte through unchanged.  */
      coding->errors++;
      src = src_base;
      c = *src++;
      EMIT_CHAR (c);
    }

 label_end_of_loop:
  /* SRC_BASE, not SRC, is reported: an iteration that was abandoned
     half way has not consumed its bytes.  */
  coding->consumed = coding->consumed_char = src_base - source;
  coding->produced = dst - destination;
  return;
}
2519
2520 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
2521 This function can encode charsets `ascii', `katakana-jisx0201',
2522 `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'. We
2523 are sure that all these charsets are registered as official charset
2524 (i.e. do not have extended leading-codes). Characters of other
2525 charsets are produced without any encoding. If SJIS_P is 1, encode
2526 SJIS text, else encode BIG5 text. */
2527
2528 static void
2529 encode_coding_sjis_big5 (coding, source, destination,
2530 src_bytes, dst_bytes, sjis_p)
2531 struct coding_system *coding;
2532 unsigned char *source, *destination;
2533 int src_bytes, dst_bytes;
2534 int sjis_p;
2535 {
2536 unsigned char *src = source;
2537 unsigned char *src_end = source + src_bytes;
2538 unsigned char *dst = destination;
2539 unsigned char *dst_end = destination + dst_bytes;
2540 /* SRC_BASE remembers the start position in source in each loop.
2541 The loop will be exited when there's not enough source text to
2542 analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
2543 there's not enough destination area to produce encoded codes
2544 (within macro EMIT_BYTES). */
2545 unsigned char *src_base;
2546 Lisp_Object translation_table;
2547
2548 if (NILP (Venable_character_translation))
2549 translation_table = Qnil;
2550 else
2551 {
2552 translation_table = coding->translation_table_for_decode;
2553 if (NILP (translation_table))
2554 translation_table = Vstandard_translation_table_for_decode;
2555 }
2556
2557 while (1)
2558 {
2559 int c, charset, c1, c2;
2560
2561 src_base = src;
2562 ONE_MORE_CHAR (c);
2563
2564 /* Now encode the character C. */
2565 if (SINGLE_BYTE_CHAR_P (c))
2566 {
2567 switch (c)
2568 {
2569 case '\r':
2570 if (!coding->mode & CODING_MODE_SELECTIVE_DISPLAY)
2571 {
2572 EMIT_ONE_BYTE (c);
2573 break;
2574 }
2575 c = '\n';
2576 case '\n':
2577 if (coding->eol_type == CODING_EOL_CRLF)
2578 {
2579 EMIT_TWO_BYTES ('\r', c);
2580 break;
2581 }
2582 else if (coding->eol_type == CODING_EOL_CR)
2583 c = '\r';
2584 default:
2585 EMIT_ONE_BYTE (c);
2586 }
2587 }
2588 else
2589 {
2590 SPLIT_CHAR (c, charset, c1, c2);
2591 if (sjis_p)
2592 {
2593 if (charset == charset_jisx0208
2594 || charset == charset_jisx0208_1978)
2595 {
2596 ENCODE_SJIS (c1, c2, c1, c2);
2597 EMIT_TWO_BYTES (c1, c2);
2598 }
2599 else if (charset == charset_latin_jisx0201)
2600 EMIT_ONE_BYTE (c1);
2601 else
2602 /* There's no way other than producing the internal
2603 codes as is. */
2604 EMIT_BYTES (src_base, src);
2605 }
2606 else
2607 {
2608 if (charset == charset_big5_1 || charset == charset_big5_2)
2609 {
2610 ENCODE_BIG5 (charset, c1, c2, c1, c2);
2611 EMIT_TWO_BYTES (c1, c2);
2612 }
2613 else
2614 /* There's no way other than producing the internal
2615 codes as is. */
2616 EMIT_BYTES (src_base, src);
2617 }
2618 }
2619 coding->consumed_char++;
2620 }
2621
2622 label_end_of_loop:
2623 coding->consumed = src_base - source;
2624 coding->produced = coding->produced_char = dst - destination;
2625 }
2626
2627 \f
2628 /*** 5. CCL handlers ***/
2629
2630 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2631 Check if a text is encoded in a coding system of which
2632 encoder/decoder are written in CCL program. If it is, return
2633 CODING_CATEGORY_MASK_CCL, else return 0. */
2634
2635 int
2636 detect_coding_ccl (src, src_end)
2637 unsigned char *src, *src_end;
2638 {
2639 unsigned char *valid;
2640 int c;
2641 /* Dummy for ONE_MORE_BYTE. */
2642 struct coding_system dummy_coding;
2643 struct coding_system *coding = &dummy_coding;
2644
2645 /* No coding system is assigned to coding-category-ccl. */
2646 if (!coding_system_table[CODING_CATEGORY_IDX_CCL])
2647 return 0;
2648
2649 valid = coding_system_table[CODING_CATEGORY_IDX_CCL]->spec.ccl.valid_codes;
2650 while (1)
2651 {
2652 ONE_MORE_BYTE (c);
2653 if (! valid[c])
2654 return 0;
2655 }
2656 label_end_of_loop:
2657 return CODING_CATEGORY_MASK_CCL;
2658 }
2659
2660 \f
2661 /*** 6. End-of-line handlers ***/
2662
2663 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
2664
2665 static void
2666 decode_eol (coding, source, destination, src_bytes, dst_bytes)
2667 struct coding_system *coding;
2668 unsigned char *source, *destination;
2669 int src_bytes, dst_bytes;
2670 {
2671 unsigned char *src = source;
2672 unsigned char *dst = destination;
2673 unsigned char *src_end = src + src_bytes;
2674 unsigned char *dst_end = dst + dst_bytes;
2675 Lisp_Object translation_table;
2676 /* SRC_BASE remembers the start position in source in each loop.
2677 The loop will be exited when there's not enough source code
2678 (within macro ONE_MORE_BYTE), or when there's not enough
2679 destination area to produce a character (within macro
2680 EMIT_CHAR). */
2681 unsigned char *src_base;
2682 int c;
2683
2684 translation_table = Qnil;
2685 switch (coding->eol_type)
2686 {
2687 case CODING_EOL_CRLF:
2688 while (1)
2689 {
2690 src_base = src;
2691 ONE_MORE_BYTE (c);
2692 if (c == '\r')
2693 {
2694 ONE_MORE_BYTE (c);
2695 if (c != '\n')
2696 {
2697 if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2698 {
2699 coding->result = CODING_FINISH_INCONSISTENT_EOL;
2700 goto label_end_of_loop;
2701 }
2702 src--;
2703 c = '\r';
2704 }
2705 }
2706 else if (c == '\n'
2707 && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL))
2708 {
2709 coding->result = CODING_FINISH_INCONSISTENT_EOL;
2710 goto label_end_of_loop;
2711 }
2712 EMIT_CHAR (c);
2713 }
2714 break;
2715
2716 case CODING_EOL_CR:
2717 while (1)
2718 {
2719 src_base = src;
2720 ONE_MORE_BYTE (c);
2721 if (c == '\n')
2722 {
2723 if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2724 {
2725 coding->result = CODING_FINISH_INCONSISTENT_EOL;
2726 goto label_end_of_loop;
2727 }
2728 }
2729 else if (c == '\r')
2730 c = '\n';
2731 EMIT_CHAR (c);
2732 }
2733 break;
2734
2735 default: /* no need for EOL handling */
2736 while (1)
2737 {
2738 src_base = src;
2739 ONE_MORE_BYTE (c);
2740 EMIT_CHAR (c);
2741 }
2742 }
2743
2744 label_end_of_loop:
2745 coding->consumed = coding->consumed_char = src_base - source;
2746 coding->produced = dst - destination;
2747 return;
2748 }
2749
2750 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions". Encode
2751 format of end-of-line according to `coding->eol_type'. It also
2752 convert multibyte form 8-bit characers to unibyte if
2753 CODING->src_multibyte is nonzero. If `coding->mode &
2754 CODING_MODE_SELECTIVE_DISPLAY' is nonzero, code '\r' in source text
2755 also means end-of-line. */
2756
2757 static void
2758 encode_eol (coding, source, destination, src_bytes, dst_bytes)
2759 struct coding_system *coding;
2760 unsigned char *source, *destination;
2761 int src_bytes, dst_bytes;
2762 {
2763 unsigned char *src = source;
2764 unsigned char *dst = destination;
2765 unsigned char *src_end = src + src_bytes;
2766 unsigned char *dst_end = dst + dst_bytes;
2767 Lisp_Object translation_table;
2768 /* SRC_BASE remembers the start position in source in each loop.
2769 The loop will be exited when there's not enough source text to
2770 analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
2771 there's not enough destination area to produce encoded codes
2772 (within macro EMIT_BYTES). */
2773 unsigned char *src_base;
2774 int c;
2775 int selective_display = coding->mode & CODING_MODE_SELECTIVE_DISPLAY;
2776
2777 translation_table = Qnil;
2778 if (coding->src_multibyte
2779 && *(src_end - 1) == LEADING_CODE_8_BIT_CONTROL)
2780 {
2781 src_end--;
2782 src_bytes--;
2783 coding->result = CODING_FINISH_INSUFFICIENT_SRC;
2784 }
2785
2786 if (coding->eol_type == CODING_EOL_CRLF)
2787 {
2788 while (src < src_end)
2789 {
2790 src_base = src;
2791 c = *src++;
2792 if (c >= 0x20)
2793 EMIT_ONE_BYTE (c);
2794 else if (c == '\n' || (c == '\r' && selective_display))
2795 EMIT_TWO_BYTES ('\r', '\n');
2796 else
2797 EMIT_ONE_BYTE (c);
2798 }
2799 src_base = src;
2800 label_end_of_loop:
2801 ;
2802 }
2803 else
2804 {
2805 if (src_bytes <= dst_bytes)
2806 {
2807 safe_bcopy (src, dst, src_bytes);
2808 src_base = src_end;
2809 dst += src_bytes;
2810 }
2811 else
2812 {
2813 if (coding->src_multibyte
2814 && *(src + dst_bytes - 1) == LEADING_CODE_8_BIT_CONTROL)
2815 dst_bytes--;
2816 safe_bcopy (src, dst, dst_bytes);
2817 src_base = src + dst_bytes;
2818 dst = destination + dst_bytes;
2819 coding->result = CODING_FINISH_INSUFFICIENT_DST;
2820 }
2821 if (coding->eol_type == CODING_EOL_CR)
2822 {
2823 for (src = destination; src < dst; src++)
2824 if (*src == '\n') *src = '\r';
2825 }
2826 else if (selective_display)
2827 {
2828 for (src = destination; src < dst; src++)
2829 if (*src == '\r') *src = '\n';
2830 }
2831 }
2832 if (coding->src_multibyte)
2833 dst = destination + str_as_unibyte (destination, dst - destination);
2834
2835 coding->consumed = src_base - source;
2836 coding->produced = dst - destination;
2837 }
2838
2839 \f
2840 /*** 7. C library functions ***/
2841
2842 /* In Emacs Lisp, coding system is represented by a Lisp symbol which
2843 has a property `coding-system'. The value of this property is a
2844 vector of length 5 (called as coding-vector). Among elements of
2845 this vector, the first (element[0]) and the fifth (element[4])
2846 carry important information for decoding/encoding. Before
2847 decoding/encoding, this information should be set in fields of a
2848 structure of type `coding_system'.
2849
2850 A value of property `coding-system' can be a symbol of another
2851 subsidiary coding-system. In that case, Emacs gets coding-vector
2852 from that symbol.
2853
2854 `element[0]' contains information to be set in `coding->type'. The
2855 value and its meaning is as follows:
2856
2857 0 -- coding_type_emacs_mule
2858 1 -- coding_type_sjis
2859 2 -- coding_type_iso2022
2860 3 -- coding_type_big5
2861 4 -- coding_type_ccl encoder/decoder written in CCL
2862 nil -- coding_type_no_conversion
2863 t -- coding_type_undecided (automatic conversion on decoding,
2864 no-conversion on encoding)
2865
2866 `element[4]' contains information to be set in `coding->flags' and
2867 `coding->spec'. The meaning varies by `coding->type'.
2868
2869 If `coding->type' is `coding_type_iso2022', element[4] is a vector
2870 of length 32 (of which the first 13 sub-elements are used now).
2871 Meanings of these sub-elements are:
2872
2873 sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
2874 If the value is an integer of valid charset, the charset is
2875 assumed to be designated to graphic register N initially.
2876
2877 If the value is minus, it is a minus value of charset which
2878 reserves graphic register N, which means that the charset is
2879 not designated initially but should be designated to graphic
2880 register N just before encoding a character in that charset.
2881
2882 If the value is nil, graphic register N is never used on
2883 encoding.
2884
2885 sub-element[N] where N is 4 through 11: to be set in `coding->flags'
2886 Each value takes t or nil. See the section ISO2022 of
2887 `coding.h' for more information.
2888
2889 If `coding->type' is `coding_type_big5', element[4] is t to denote
2890 BIG5-ETen or nil to denote BIG5-HKU.
2891
2892 If `coding->type' takes the other value, element[4] is ignored.
2893
2894 Emacs Lisp's coding system also carries information about format of
2895 end-of-line in a value of property `eol-type'. If the value is
2896 integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
2897 means CODING_EOL_CR. If it is not integer, it should be a vector
2898 of subsidiary coding systems of which property `eol-type' has one
2899 of above values.
2900
2901 */
2902
2903 /* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
2904 and set it in CODING. If CODING_SYSTEM_SYMBOL is invalid, CODING
2905 is setup so that no conversion is necessary and return -1, else
2906 return 0. */
2907
2908 int
2909 setup_coding_system (coding_system, coding)
2910 Lisp_Object coding_system;
2911 struct coding_system *coding;
2912 {
2913 Lisp_Object coding_spec, coding_type, eol_type, plist;
2914 Lisp_Object val;
2915 int i;
2916
2917 /* Initialize some fields required for all kinds of coding systems. */
2918 coding->symbol = coding_system;
2919 coding->common_flags = 0;
2920 coding->mode = 0;
2921 coding->heading_ascii = -1;
2922 coding->post_read_conversion = coding->pre_write_conversion = Qnil;
2923 coding->composing = COMPOSITION_DISABLED;
2924 coding->cmp_data = NULL;
2925
2926 if (NILP (coding_system))
2927 goto label_invalid_coding_system;
2928
2929 coding_spec = Fget (coding_system, Qcoding_system);
2930
2931 if (!VECTORP (coding_spec)
2932 || XVECTOR (coding_spec)->size != 5
2933 || !CONSP (XVECTOR (coding_spec)->contents[3]))
2934 goto label_invalid_coding_system;
2935
2936 eol_type = inhibit_eol_conversion ? Qnil : Fget (coding_system, Qeol_type);
2937 if (VECTORP (eol_type))
2938 {
2939 coding->eol_type = CODING_EOL_UNDECIDED;
2940 coding->common_flags = CODING_REQUIRE_DETECTION_MASK;
2941 }
2942 else if (XFASTINT (eol_type) == 1)
2943 {
2944 coding->eol_type = CODING_EOL_CRLF;
2945 coding->common_flags
2946 = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
2947 }
2948 else if (XFASTINT (eol_type) == 2)
2949 {
2950 coding->eol_type = CODING_EOL_CR;
2951 coding->common_flags
2952 = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
2953 }
2954 else
2955 coding->eol_type = CODING_EOL_LF;
2956
2957 coding_type = XVECTOR (coding_spec)->contents[0];
2958 /* Try short cut. */
2959 if (SYMBOLP (coding_type))
2960 {
2961 if (EQ (coding_type, Qt))
2962 {
2963 coding->type = coding_type_undecided;
2964 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
2965 }
2966 else
2967 coding->type = coding_type_no_conversion;
2968 return 0;
2969 }
2970
2971 /* Get values of coding system properties:
2972 `post-read-conversion', `pre-write-conversion',
2973 `translation-table-for-decode', `translation-table-for-encode'. */
2974 plist = XVECTOR (coding_spec)->contents[3];
2975 /* Pre & post conversion functions should be disabled if
2976 inhibit_eol_conversion is nozero. This is the case that a code
2977 conversion function is called while those functions are running. */
2978 if (! inhibit_pre_post_conversion)
2979 {
2980 coding->post_read_conversion = Fplist_get (plist, Qpost_read_conversion);
2981 coding->pre_write_conversion = Fplist_get (plist, Qpre_write_conversion);
2982 }
2983 val = Fplist_get (plist, Qtranslation_table_for_decode);
2984 if (SYMBOLP (val))
2985 val = Fget (val, Qtranslation_table_for_decode);
2986 coding->translation_table_for_decode = CHAR_TABLE_P (val) ? val : Qnil;
2987 val = Fplist_get (plist, Qtranslation_table_for_encode);
2988 if (SYMBOLP (val))
2989 val = Fget (val, Qtranslation_table_for_encode);
2990 coding->translation_table_for_encode = CHAR_TABLE_P (val) ? val : Qnil;
2991 val = Fplist_get (plist, Qcoding_category);
2992 if (!NILP (val))
2993 {
2994 val = Fget (val, Qcoding_category_index);
2995 if (INTEGERP (val))
2996 coding->category_idx = XINT (val);
2997 else
2998 goto label_invalid_coding_system;
2999 }
3000 else
3001 goto label_invalid_coding_system;
3002
3003 /* If the coding system has non-nil `composition' property, enable
3004 composition handling. */
3005 val = Fplist_get (plist, Qcomposition);
3006 if (!NILP (val))
3007 coding->composing = COMPOSITION_NO;
3008
3009 switch (XFASTINT (coding_type))
3010 {
3011 case 0:
3012 coding->type = coding_type_emacs_mule;
3013 if (!NILP (coding->post_read_conversion))
3014 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
3015 if (!NILP (coding->pre_write_conversion))
3016 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
3017 break;
3018
3019 case 1:
3020 coding->type = coding_type_sjis;
3021 coding->common_flags
3022 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3023 break;
3024
3025 case 2:
3026 coding->type = coding_type_iso2022;
3027 coding->common_flags
3028 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3029 {
3030 Lisp_Object val, temp;
3031 Lisp_Object *flags;
3032 int i, charset, reg_bits = 0;
3033
3034 val = XVECTOR (coding_spec)->contents[4];
3035
3036 if (!VECTORP (val) || XVECTOR (val)->size != 32)
3037 goto label_invalid_coding_system;
3038
3039 flags = XVECTOR (val)->contents;
3040 coding->flags
3041 = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
3042 | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
3043 | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
3044 | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
3045 | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
3046 | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
3047 | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
3048 | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
3049 | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION)
3050 | (NILP (flags[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL)
3051 | (NILP (flags[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL)
3052 | (NILP (flags[15]) ? 0 : CODING_FLAG_ISO_SAFE)
3053 | (NILP (flags[16]) ? 0 : CODING_FLAG_ISO_LATIN_EXTRA)
3054 );
3055
3056 /* Invoke graphic register 0 to plane 0. */
3057 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
3058 /* Invoke graphic register 1 to plane 1 if we can use full 8-bit. */
3059 CODING_SPEC_ISO_INVOCATION (coding, 1)
3060 = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
3061 /* Not single shifting at first. */
3062 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;
3063 /* Beginning of buffer should also be regarded as bol. */
3064 CODING_SPEC_ISO_BOL (coding) = 1;
3065
3066 for (charset = 0; charset <= MAX_CHARSET; charset++)
3067 CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = 255;
3068 val = Vcharset_revision_alist;
3069 while (CONSP (val))
3070 {
3071 charset = get_charset_id (Fcar_safe (XCAR (val)));
3072 if (charset >= 0
3073 && (temp = Fcdr_safe (XCAR (val)), INTEGERP (temp))
3074 && (i = XINT (temp), (i >= 0 && (i + '@') < 128)))
3075 CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = i;
3076 val = XCDR (val);
3077 }
3078
3079 /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
3080 FLAGS[REG] can be one of below:
3081 integer CHARSET: CHARSET occupies register I,
3082 t: designate nothing to REG initially, but can be used
3083 by any charsets,
3084 list of integer, nil, or t: designate the first
3085 element (if integer) to REG initially, the remaining
3086 elements (if integer) is designated to REG on request,
3087 if an element is t, REG can be used by any charsets,
3088 nil: REG is never used. */
3089 for (charset = 0; charset <= MAX_CHARSET; charset++)
3090 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3091 = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION;
3092 for (i = 0; i < 4; i++)
3093 {
3094 if (INTEGERP (flags[i])
3095 && (charset = XINT (flags[i]), CHARSET_VALID_P (charset))
3096 || (charset = get_charset_id (flags[i])) >= 0)
3097 {
3098 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3099 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
3100 }
3101 else if (EQ (flags[i], Qt))
3102 {
3103 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3104 reg_bits |= 1 << i;
3105 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
3106 }
3107 else if (CONSP (flags[i]))
3108 {
3109 Lisp_Object tail;
3110 tail = flags[i];
3111
3112 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
3113 if (INTEGERP (XCAR (tail))
3114 && (charset = XINT (XCAR (tail)),
3115 CHARSET_VALID_P (charset))
3116 || (charset = get_charset_id (XCAR (tail))) >= 0)
3117 {
3118 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3119 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
3120 }
3121 else
3122 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3123 tail = XCDR (tail);
3124 while (CONSP (tail))
3125 {
3126 if (INTEGERP (XCAR (tail))
3127 && (charset = XINT (XCAR (tail)),
3128 CHARSET_VALID_P (charset))
3129 || (charset = get_charset_id (XCAR (tail))) >= 0)
3130 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3131 = i;
3132 else if (EQ (XCAR (tail), Qt))
3133 reg_bits |= 1 << i;
3134 tail = XCDR (tail);
3135 }
3136 }
3137 else
3138 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3139
3140 CODING_SPEC_ISO_DESIGNATION (coding, i)
3141 = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
3142 }
3143
3144 if (reg_bits && ! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
3145 {
3146 /* REG 1 can be used only by locking shift in 7-bit env. */
3147 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
3148 reg_bits &= ~2;
3149 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
3150 /* Without any shifting, only REG 0 and 1 can be used. */
3151 reg_bits &= 3;
3152 }
3153
3154 if (reg_bits)
3155 for (charset = 0; charset <= MAX_CHARSET; charset++)
3156 {
3157 if (CHARSET_VALID_P (charset)
3158 && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3159 == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
3160 {
3161 /* There exist some default graphic registers to be
3162 used by CHARSET. */
3163
3164 /* We had better avoid designating a charset of
3165 CHARS96 to REG 0 as far as possible. */
3166 if (CHARSET_CHARS (charset) == 96)
3167 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3168 = (reg_bits & 2
3169 ? 1 : (reg_bits & 4 ? 2 : (reg_bits & 8 ? 3 : 0)));
3170 else
3171 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3172 = (reg_bits & 1
3173 ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
3174 }
3175 }
3176 }
3177 coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
3178 coding->spec.iso2022.last_invalid_designation_register = -1;
3179 break;
3180
3181 case 3:
3182 coding->type = coding_type_big5;
3183 coding->common_flags
3184 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3185 coding->flags
3186 = (NILP (XVECTOR (coding_spec)->contents[4])
3187 ? CODING_FLAG_BIG5_HKU
3188 : CODING_FLAG_BIG5_ETEN);
3189 break;
3190
3191 case 4:
3192 coding->type = coding_type_ccl;
3193 coding->common_flags
3194 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3195 {
3196 val = XVECTOR (coding_spec)->contents[4];
3197 if (! CONSP (val)
3198 || setup_ccl_program (&(coding->spec.ccl.decoder),
3199 XCAR (val)) < 0
3200 || setup_ccl_program (&(coding->spec.ccl.encoder),
3201 XCDR (val)) < 0)
3202 goto label_invalid_coding_system;
3203
3204 bzero (coding->spec.ccl.valid_codes, 256);
3205 val = Fplist_get (plist, Qvalid_codes);
3206 if (CONSP (val))
3207 {
3208 Lisp_Object this;
3209
3210 for (; CONSP (val); val = XCDR (val))
3211 {
3212 this = XCAR (val);
3213 if (INTEGERP (this)
3214 && XINT (this) >= 0 && XINT (this) < 256)
3215 coding->spec.ccl.valid_codes[XINT (this)] = 1;
3216 else if (CONSP (this)
3217 && INTEGERP (XCAR (this))
3218 && INTEGERP (XCDR (this)))
3219 {
3220 int start = XINT (XCAR (this));
3221 int end = XINT (XCDR (this));
3222
3223 if (start >= 0 && start <= end && end < 256)
3224 while (start <= end)
3225 coding->spec.ccl.valid_codes[start++] = 1;
3226 }
3227 }
3228 }
3229 }
3230 coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
3231 coding->spec.ccl.cr_carryover = 0;
3232 break;
3233
3234 case 5:
3235 coding->type = coding_type_raw_text;
3236 break;
3237
3238 default:
3239 goto label_invalid_coding_system;
3240 }
3241 return 0;
3242
3243 label_invalid_coding_system:
3244 coding->type = coding_type_no_conversion;
3245 coding->category_idx = CODING_CATEGORY_IDX_BINARY;
3246 coding->common_flags = 0;
3247 coding->eol_type = CODING_EOL_LF;
3248 coding->pre_write_conversion = coding->post_read_conversion = Qnil;
3249 return -1;
3250 }
3251
3252 /* Free memory blocks allocated for storing composition information. */
3253
3254 void
3255 coding_free_composition_data (coding)
3256 struct coding_system *coding;
3257 {
3258 struct composition_data *cmp_data = coding->cmp_data, *next;
3259
3260 if (!cmp_data)
3261 return;
3262 /* Memory blocks are chained. At first, rewind to the first, then,
3263 free blocks one by one. */
3264 while (cmp_data->prev)
3265 cmp_data = cmp_data->prev;
3266 while (cmp_data)
3267 {
3268 next = cmp_data->next;
3269 xfree (cmp_data);
3270 cmp_data = next;
3271 }
3272 coding->cmp_data = NULL;
3273 }
3274
3275 /* Set `char_offset' member of all memory blocks pointed by
3276 coding->cmp_data to POS. */
3277
3278 void
3279 coding_adjust_composition_offset (coding, pos)
3280 struct coding_system *coding;
3281 int pos;
3282 {
3283 struct composition_data *cmp_data;
3284
3285 for (cmp_data = coding->cmp_data; cmp_data; cmp_data = cmp_data->next)
3286 cmp_data->char_offset = pos;
3287 }
3288
3289 /* Setup raw-text or one of its subsidiaries in the structure
3290 coding_system CODING according to the already setup value eol_type
3291 in CODING. CODING should be setup for some coding system in
3292 advance. */
3293
3294 void
3295 setup_raw_text_coding_system (coding)
3296 struct coding_system *coding;
3297 {
3298 if (coding->type != coding_type_raw_text)
3299 {
3300 coding->symbol = Qraw_text;
3301 coding->type = coding_type_raw_text;
3302 if (coding->eol_type != CODING_EOL_UNDECIDED)
3303 {
3304 Lisp_Object subsidiaries;
3305 subsidiaries = Fget (Qraw_text, Qeol_type);
3306
3307 if (VECTORP (subsidiaries)
3308 && XVECTOR (subsidiaries)->size == 3)
3309 coding->symbol
3310 = XVECTOR (subsidiaries)->contents[coding->eol_type];
3311 }
3312 setup_coding_system (coding->symbol, coding);
3313 }
3314 return;
3315 }
3316
/* Emacs has a mechanism to automatically detect a coding system if it
   is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But
   it's impossible to distinguish some coding systems accurately
   because they use the same range of codes.  So, at first, coding
   systems are classified into the following categories:
3322
3323 o coding-category-emacs-mule
3324
3325 The category for a coding system which has the same code range
3326 as Emacs' internal format. Assigned the coding-system (Lisp
3327 symbol) `emacs-mule' by default.
3328
3329 o coding-category-sjis
3330
3331 The category for a coding system which has the same code range
3332 as SJIS. Assigned the coding-system (Lisp
3333 symbol) `japanese-shift-jis' by default.
3334
3335 o coding-category-iso-7
3336
3337 The category for a coding system which has the same code range
3338 as ISO2022 of 7-bit environment. This doesn't use any locking
3339 shift and single shift functions. This can encode/decode all
3340 charsets. Assigned the coding-system (Lisp symbol)
3341 `iso-2022-7bit' by default.
3342
3343 o coding-category-iso-7-tight
3344
3345 Same as coding-category-iso-7 except that this can
3346 encode/decode only the specified charsets.
3347
3348 o coding-category-iso-8-1
3349
3350 The category for a coding system which has the same code range
3351 as ISO2022 of 8-bit environment and graphic plane 1 used only
3352 for DIMENSION1 charset. This doesn't use any locking shift
3353 and single shift functions. Assigned the coding-system (Lisp
3354 symbol) `iso-latin-1' by default.
3355
3356 o coding-category-iso-8-2
3357
3358 The category for a coding system which has the same code range
3359 as ISO2022 of 8-bit environment and graphic plane 1 used only
3360 for DIMENSION2 charset. This doesn't use any locking shift
3361 and single shift functions. Assigned the coding-system (Lisp
3362 symbol) `japanese-iso-8bit' by default.
3363
   o coding-category-iso-7-else

        The category for a coding system which has the same code range
        as ISO2022 of 7-bit environment but uses locking shift or
        single shift functions.  Assigned the coding-system (Lisp
        symbol) `iso-2022-7bit-lock' by default.
3370
   o coding-category-iso-8-else

        The category for a coding system which has the same code range
        as ISO2022 of 8-bit environment but uses locking shift or
        single shift functions.  Assigned the coding-system (Lisp
        symbol) `iso-2022-8bit-ss2' by default.
3377
3378 o coding-category-big5
3379
3380 The category for a coding system which has the same code range
3381 as BIG5. Assigned the coding-system (Lisp symbol)
3382 `cn-big5' by default.
3383
3384 o coding-category-utf-8
3385
3386 The category for a coding system which has the same code range
3387 as UTF-8 (cf. RFC2279). Assigned the coding-system (Lisp
3388 symbol) `utf-8' by default.
3389
3390 o coding-category-utf-16-be
3391
3392 The category for a coding system in which a text has an
3393 Unicode signature (cf. Unicode Standard) in the order of BIG
3394 endian at the head. Assigned the coding-system (Lisp symbol)
3395 `utf-16-be' by default.
3396
3397 o coding-category-utf-16-le
3398
3399 The category for a coding system in which a text has an
3400 Unicode signature (cf. Unicode Standard) in the order of
3401 LITTLE endian at the head. Assigned the coding-system (Lisp
3402 symbol) `utf-16-le' by default.
3403
3404 o coding-category-ccl
3405
3406 The category for a coding system of which encoder/decoder is
3407 written in CCL programs. The default value is nil, i.e., no
3408 coding system is assigned.
3409
3410 o coding-category-binary
3411
3412 The category for a coding system not categorized in any of the
3413 above. Assigned the coding-system (Lisp symbol)
3414 `no-conversion' by default.
3415
   Each category is a Lisp symbol whose value is the actual
   `coding-system' (also a Lisp symbol) assigned to it by the user.
   What Emacs actually does is detect the category of a coding system
   and then use the `coding-system' assigned to that category.  If
   Emacs cannot narrow the candidates down to a single category, it
   selects the category of the highest priority.  The priorities of
   the categories are also specified by the user in the Lisp variable
   `coding-category-list'.
3423
3424 */
3425
3426 static
3427 int ascii_skip_code[256];
3428
3429 /* Detect how a text of length SRC_BYTES pointed by SOURCE is encoded.
3430 If it detects possible coding systems, return an integer in which
3431 appropriate flag bits are set. Flag bits are defined by macros
3432 CODING_CATEGORY_MASK_XXX in `coding.h'. If PRIORITIES is non-NULL,
3433 it should point the table `coding_priorities'. In that case, only
3434 the flag bit for a coding system of the highest priority is set in
3435 the returned value.
3436
3437 How many ASCII characters are at the head is returned as *SKIP. */
3438
3439 static int
3440 detect_coding_mask (source, src_bytes, priorities, skip)
3441 unsigned char *source;
3442 int src_bytes, *priorities, *skip;
3443 {
3444 register unsigned char c;
3445 unsigned char *src = source, *src_end = source + src_bytes;
3446 unsigned int mask, utf16_examined_p, iso2022_examined_p;
3447 int i, idx;
3448
3449 /* At first, skip all ASCII characters and control characters except
3450 for three ISO2022 specific control characters. */
3451 ascii_skip_code[ISO_CODE_SO] = 0;
3452 ascii_skip_code[ISO_CODE_SI] = 0;
3453 ascii_skip_code[ISO_CODE_ESC] = 0;
3454
3455 label_loop_detect_coding:
3456 while (src < src_end && ascii_skip_code[*src]) src++;
3457 *skip = src - source;
3458
3459 if (src >= src_end)
3460 /* We found nothing other than ASCII. There's nothing to do. */
3461 return 0;
3462
3463 c = *src;
3464 /* The text seems to be encoded in some multilingual coding system.
3465 Now, try to find in which coding system the text is encoded. */
3466 if (c < 0x80)
3467 {
3468 /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
3469 /* C is an ISO2022 specific control code of C0. */
3470 mask = detect_coding_iso2022 (src, src_end);
3471 if (mask == 0)
3472 {
3473 /* No valid ISO2022 code follows C. Try again. */
3474 src++;
3475 if (c == ISO_CODE_ESC)
3476 ascii_skip_code[ISO_CODE_ESC] = 1;
3477 else
3478 ascii_skip_code[ISO_CODE_SO] = ascii_skip_code[ISO_CODE_SI] = 1;
3479 goto label_loop_detect_coding;
3480 }
3481 if (priorities)
3482 {
3483 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3484 {
3485 if (mask & priorities[i])
3486 return priorities[i];
3487 }
3488 return CODING_CATEGORY_MASK_RAW_TEXT;
3489 }
3490 }
3491 else
3492 {
3493 int try;
3494
3495 if (c < 0xA0)
3496 {
3497 /* C is the first byte of SJIS character code,
3498 or a leading-code of Emacs' internal format (emacs-mule),
3499 or the first byte of UTF-16. */
3500 try = (CODING_CATEGORY_MASK_SJIS
3501 | CODING_CATEGORY_MASK_EMACS_MULE
3502 | CODING_CATEGORY_MASK_UTF_16_BE
3503 | CODING_CATEGORY_MASK_UTF_16_LE);
3504
3505 /* Or, if C is a special latin extra code,
3506 or is an ISO2022 specific control code of C1 (SS2 or SS3),
3507 or is an ISO2022 control-sequence-introducer (CSI),
3508 we should also consider the possibility of ISO2022 codings. */
3509 if ((VECTORP (Vlatin_extra_code_table)
3510 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
3511 || (c == ISO_CODE_SS2 || c == ISO_CODE_SS3)
3512 || (c == ISO_CODE_CSI
3513 && (src < src_end
3514 && (*src == ']'
3515 || ((*src == '0' || *src == '1' || *src == '2')
3516 && src + 1 < src_end
3517 && src[1] == ']')))))
3518 try |= (CODING_CATEGORY_MASK_ISO_8_ELSE
3519 | CODING_CATEGORY_MASK_ISO_8BIT);
3520 }
3521 else
3522 /* C is a character of ISO2022 in graphic plane right,
3523 or a SJIS's 1-byte character code (i.e. JISX0201),
3524 or the first byte of BIG5's 2-byte code,
3525 or the first byte of UTF-8/16. */
3526 try = (CODING_CATEGORY_MASK_ISO_8_ELSE
3527 | CODING_CATEGORY_MASK_ISO_8BIT
3528 | CODING_CATEGORY_MASK_SJIS
3529 | CODING_CATEGORY_MASK_BIG5
3530 | CODING_CATEGORY_MASK_UTF_8
3531 | CODING_CATEGORY_MASK_UTF_16_BE
3532 | CODING_CATEGORY_MASK_UTF_16_LE);
3533
3534 /* Or, we may have to consider the possibility of CCL. */
3535 if (coding_system_table[CODING_CATEGORY_IDX_CCL]
3536 && (coding_system_table[CODING_CATEGORY_IDX_CCL]
3537 ->spec.ccl.valid_codes)[c])
3538 try |= CODING_CATEGORY_MASK_CCL;
3539
3540 mask = 0;
3541 utf16_examined_p = iso2022_examined_p = 0;
3542 if (priorities)
3543 {
3544 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3545 {
3546 if (!iso2022_examined_p
3547 && (priorities[i] & try & CODING_CATEGORY_MASK_ISO))
3548 {
3549 mask |= detect_coding_iso2022 (src, src_end);
3550 iso2022_examined_p = 1;
3551 }
3552 else if (priorities[i] & try & CODING_CATEGORY_MASK_SJIS)
3553 mask |= detect_coding_sjis (src, src_end);
3554 else if (priorities[i] & try & CODING_CATEGORY_MASK_UTF_8)
3555 mask |= detect_coding_utf_8 (src, src_end);
3556 else if (!utf16_examined_p
3557 && (priorities[i] & try &
3558 CODING_CATEGORY_MASK_UTF_16_BE_LE))
3559 {
3560 mask |= detect_coding_utf_16 (src, src_end);
3561 utf16_examined_p = 1;
3562 }
3563 else if (priorities[i] & try & CODING_CATEGORY_MASK_BIG5)
3564 mask |= detect_coding_big5 (src, src_end);
3565 else if (priorities[i] & try & CODING_CATEGORY_MASK_EMACS_MULE)
3566 mask |= detect_coding_emacs_mule (src, src_end);
3567 else if (priorities[i] & try & CODING_CATEGORY_MASK_CCL)
3568 mask |= detect_coding_ccl (src, src_end);
3569 else if (priorities[i] & CODING_CATEGORY_MASK_RAW_TEXT)
3570 mask |= CODING_CATEGORY_MASK_RAW_TEXT;
3571 else if (priorities[i] & CODING_CATEGORY_MASK_BINARY)
3572 mask |= CODING_CATEGORY_MASK_BINARY;
3573 if (mask & priorities[i])
3574 return priorities[i];
3575 }
3576 return CODING_CATEGORY_MASK_RAW_TEXT;
3577 }
3578 if (try & CODING_CATEGORY_MASK_ISO)
3579 mask |= detect_coding_iso2022 (src, src_end);
3580 if (try & CODING_CATEGORY_MASK_SJIS)
3581 mask |= detect_coding_sjis (src, src_end);
3582 if (try & CODING_CATEGORY_MASK_BIG5)
3583 mask |= detect_coding_big5 (src, src_end);
3584 if (try & CODING_CATEGORY_MASK_UTF_8)
3585 mask |= detect_coding_utf_8 (src, src_end);
3586 if (try & CODING_CATEGORY_MASK_UTF_16_BE_LE)
3587 mask |= detect_coding_utf_16 (src, src_end);
3588 if (try & CODING_CATEGORY_MASK_EMACS_MULE)
3589 mask |= detect_coding_emacs_mule (src, src_end);
3590 if (try & CODING_CATEGORY_MASK_CCL)
3591 mask |= detect_coding_ccl (src, src_end);
3592 }
3593 return (mask | CODING_CATEGORY_MASK_RAW_TEXT | CODING_CATEGORY_MASK_BINARY);
3594 }
3595
3596 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
3597 The information of the detected coding system is set in CODING. */
3598
3599 void
3600 detect_coding (coding, src, src_bytes)
3601 struct coding_system *coding;
3602 unsigned char *src;
3603 int src_bytes;
3604 {
3605 unsigned int idx;
3606 int skip, mask, i;
3607 Lisp_Object val;
3608
3609 val = Vcoding_category_list;
3610 mask = detect_coding_mask (src, src_bytes, coding_priorities, &skip);
3611 coding->heading_ascii = skip;
3612
3613 if (!mask) return;
3614
3615 /* We found a single coding system of the highest priority in MASK. */
3616 idx = 0;
3617 while (mask && ! (mask & 1)) mask >>= 1, idx++;
3618 if (! mask)
3619 idx = CODING_CATEGORY_IDX_RAW_TEXT;
3620
3621 val = XSYMBOL (XVECTOR (Vcoding_category_table)->contents[idx])->value;
3622
3623 if (coding->eol_type != CODING_EOL_UNDECIDED)
3624 {
3625 Lisp_Object tmp;
3626
3627 tmp = Fget (val, Qeol_type);
3628 if (VECTORP (tmp))
3629 val = XVECTOR (tmp)->contents[coding->eol_type];
3630 }
3631
3632 /* Setup this new coding system while preserving some slots. */
3633 {
3634 int src_multibyte = coding->src_multibyte;
3635 int dst_multibyte = coding->dst_multibyte;
3636
3637 setup_coding_system (val, coding);
3638 coding->src_multibyte = src_multibyte;
3639 coding->dst_multibyte = dst_multibyte;
3640 coding->heading_ascii = skip;
3641 }
3642 }
3643
3644 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
3645 SOURCE is encoded. Return one of CODING_EOL_LF, CODING_EOL_CRLF,
3646 CODING_EOL_CR, and CODING_EOL_UNDECIDED.
3647
3648 How many non-eol characters are at the head is returned as *SKIP. */
3649
3650 #define MAX_EOL_CHECK_COUNT 3
3651
3652 static int
3653 detect_eol_type (source, src_bytes, skip)
3654 unsigned char *source;
3655 int src_bytes, *skip;
3656 {
3657 unsigned char *src = source, *src_end = src + src_bytes;
3658 unsigned char c;
3659 int total = 0; /* How many end-of-lines are found so far. */
3660 int eol_type = CODING_EOL_UNDECIDED;
3661 int this_eol_type;
3662
3663 *skip = 0;
3664
3665 while (src < src_end && total < MAX_EOL_CHECK_COUNT)
3666 {
3667 c = *src++;
3668 if (c == '\n' || c == '\r')
3669 {
3670 if (*skip == 0)
3671 *skip = src - 1 - source;
3672 total++;
3673 if (c == '\n')
3674 this_eol_type = CODING_EOL_LF;
3675 else if (src >= src_end || *src != '\n')
3676 this_eol_type = CODING_EOL_CR;
3677 else
3678 this_eol_type = CODING_EOL_CRLF, src++;
3679
3680 if (eol_type == CODING_EOL_UNDECIDED)
3681 /* This is the first end-of-line. */
3682 eol_type = this_eol_type;
3683 else if (eol_type != this_eol_type)
3684 {
3685 /* The found type is different from what found before. */
3686 eol_type = CODING_EOL_INCONSISTENT;
3687 break;
3688 }
3689 }
3690 }
3691
3692 if (*skip == 0)
3693 *skip = src_end - source;
3694 return eol_type;
3695 }
3696
3697 /* Like detect_eol_type, but detect EOL type in 2-octet
3698 big-endian/little-endian format for coding systems utf-16-be and
3699 utf-16-le. */
3700
3701 static int
3702 detect_eol_type_in_2_octet_form (source, src_bytes, skip, big_endian_p)
3703 unsigned char *source;
3704 int src_bytes, *skip;
3705 {
3706 unsigned char *src = source, *src_end = src + src_bytes;
3707 unsigned int c1, c2;
3708 int total = 0; /* How many end-of-lines are found so far. */
3709 int eol_type = CODING_EOL_UNDECIDED;
3710 int this_eol_type;
3711 int msb, lsb;
3712
3713 if (big_endian_p)
3714 msb = 0, lsb = 1;
3715 else
3716 msb = 1, lsb = 0;
3717
3718 *skip = 0;
3719
3720 while ((src + 1) < src_end && total < MAX_EOL_CHECK_COUNT)
3721 {
3722 c1 = (src[msb] << 8) | (src[lsb]);
3723 src += 2;
3724
3725 if (c1 == '\n' || c1 == '\r')
3726 {
3727 if (*skip == 0)
3728 *skip = src - 2 - source;
3729 total++;
3730 if (c1 == '\n')
3731 {
3732 this_eol_type = CODING_EOL_LF;
3733 }
3734 else
3735 {
3736 if ((src + 1) >= src_end)
3737 {
3738 this_eol_type = CODING_EOL_CR;
3739 }
3740 else
3741 {
3742 c2 = (src[msb] << 8) | (src[lsb]);
3743 if (c2 == '\n')
3744 this_eol_type = CODING_EOL_CRLF, src += 2;
3745 else
3746 this_eol_type = CODING_EOL_CR;
3747 }
3748 }
3749
3750 if (eol_type == CODING_EOL_UNDECIDED)
3751 /* This is the first end-of-line. */
3752 eol_type = this_eol_type;
3753 else if (eol_type != this_eol_type)
3754 {
3755 /* The found type is different from what found before. */
3756 eol_type = CODING_EOL_INCONSISTENT;
3757 break;
3758 }
3759 }
3760 }
3761
3762 if (*skip == 0)
3763 *skip = src_end - source;
3764 return eol_type;
3765 }
3766
3767 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
3768 is encoded. If it detects an appropriate format of end-of-line, it
3769 sets the information in *CODING. */
3770
3771 void
3772 detect_eol (coding, src, src_bytes)
3773 struct coding_system *coding;
3774 unsigned char *src;
3775 int src_bytes;
3776 {
3777 Lisp_Object val;
3778 int skip;
3779 int eol_type;
3780
3781 switch (coding->category_idx)
3782 {
3783 case CODING_CATEGORY_IDX_UTF_16_BE:
3784 eol_type = detect_eol_type_in_2_octet_form (src, src_bytes, &skip, 1);
3785 break;
3786 case CODING_CATEGORY_IDX_UTF_16_LE:
3787 eol_type = detect_eol_type_in_2_octet_form (src, src_bytes, &skip, 0);
3788 break;
3789 default:
3790 eol_type = detect_eol_type (src, src_bytes, &skip);
3791 break;
3792 }
3793
3794 if (coding->heading_ascii > skip)
3795 coding->heading_ascii = skip;
3796 else
3797 skip = coding->heading_ascii;
3798
3799 if (eol_type == CODING_EOL_UNDECIDED)
3800 return;
3801 if (eol_type == CODING_EOL_INCONSISTENT)
3802 {
3803 #if 0
3804 /* This code is suppressed until we find a better way to
3805 distinguish raw text file and binary file. */
3806
3807 /* If we have already detected that the coding is raw-text, the
3808 coding should actually be no-conversion. */
3809 if (coding->type == coding_type_raw_text)
3810 {
3811 setup_coding_system (Qno_conversion, coding);
3812 return;
3813 }
3814 /* Else, let's decode only text code anyway. */
3815 #endif /* 0 */
3816 eol_type = CODING_EOL_LF;
3817 }
3818
3819 val = Fget (coding->symbol, Qeol_type);
3820 if (VECTORP (val) && XVECTOR (val)->size == 3)
3821 {
3822 int src_multibyte = coding->src_multibyte;
3823 int dst_multibyte = coding->dst_multibyte;
3824
3825 setup_coding_system (XVECTOR (val)->contents[eol_type], coding);
3826 coding->src_multibyte = src_multibyte;
3827 coding->dst_multibyte = dst_multibyte;
3828 coding->heading_ascii = skip;
3829 }
3830 }
3831
3832 #define CONVERSION_BUFFER_EXTRA_ROOM 256
3833
3834 #define DECODING_BUFFER_MAG(coding) \
3835 (coding->type == coding_type_iso2022 \
3836 ? 3 \
3837 : (coding->type == coding_type_ccl \
3838 ? coding->spec.ccl.decoder.buf_magnification \
3839 : 2))
3840
3841 /* Return maximum size (bytes) of a buffer enough for decoding
3842 SRC_BYTES of text encoded in CODING. */
3843
3844 int
3845 decoding_buffer_size (coding, src_bytes)
3846 struct coding_system *coding;
3847 int src_bytes;
3848 {
3849 return (src_bytes * DECODING_BUFFER_MAG (coding)
3850 + CONVERSION_BUFFER_EXTRA_ROOM);
3851 }
3852
3853 /* Return maximum size (bytes) of a buffer enough for encoding
3854 SRC_BYTES of text to CODING. */
3855
3856 int
3857 encoding_buffer_size (coding, src_bytes)
3858 struct coding_system *coding;
3859 int src_bytes;
3860 {
3861 int magnification;
3862
3863 if (coding->type == coding_type_ccl)
3864 magnification = coding->spec.ccl.encoder.buf_magnification;
3865 else if (CODING_REQUIRE_ENCODING (coding))
3866 magnification = 3;
3867 else
3868 magnification = 1;
3869
3870 return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
3871 }
3872
#ifndef MINIMUM_CONVERSION_BUFFER_SIZE
#define MINIMUM_CONVERSION_BUFFER_SIZE 1024
#endif

/* Working buffer shared by the conversion routines, and its size in
   bytes.  Both are managed by get_conversion_buffer.  */
char *conversion_buffer;
int conversion_buffer_size;

/* Return a pointer to a buffer of at least SIZE bytes to be used for
   encoding or decoding.  The shared buffer `conversion_buffer' is
   reused when it is large enough.  Otherwise it is replaced by a
   larger one, whose size is found by doubling the current size
   (starting from no less than MINIMUM_CONVERSION_BUFFER_SIZE) until
   it reaches SIZE.  The previous contents are not preserved when the
   buffer is replaced.

   Memory comes from xmalloc, so this function never returns a fresh
   NULL pointer itself; NOTE(review): running out of memory is
   presumably reported by xmalloc -- confirm against its definition.  */

char *
get_conversion_buffer (size)
     int size;
{
  if (size > conversion_buffer_size)
    {
      char *buf;
      int real_size = conversion_buffer_size * 2;

      /* Doubling a size of 0 would never reach SIZE, so make sure we
         start from a positive value.  */
      if (real_size < MINIMUM_CONVERSION_BUFFER_SIZE)
        real_size = MINIMUM_CONVERSION_BUFFER_SIZE;
      while (real_size < size) real_size *= 2;
      buf = (char *) xmalloc (real_size);
      xfree (conversion_buffer);
      conversion_buffer = buf;
      conversion_buffer_size = real_size;
    }
  return conversion_buffer;
}
3901
3902 int
3903 ccl_coding_driver (coding, source, destination, src_bytes, dst_bytes, encodep)
3904 struct coding_system *coding;
3905 unsigned char *source, *destination;
3906 int src_bytes, dst_bytes, encodep;
3907 {
3908 struct ccl_program *ccl
3909 = encodep ? &coding->spec.ccl.encoder : &coding->spec.ccl.decoder;
3910 int result;
3911
3912 ccl->last_block = coding->mode & CODING_MODE_LAST_BLOCK;
3913 if (encodep)
3914 ccl->eol_type = coding->eol_type;
3915 ccl->multibyte = coding->src_multibyte;
3916 coding->produced = ccl_driver (ccl, source, destination,
3917 src_bytes, dst_bytes, &(coding->consumed));
3918 if (encodep)
3919 coding->produced_char = coding->produced;
3920 else
3921 {
3922 int bytes
3923 = dst_bytes ? dst_bytes : source + coding->consumed - destination;
3924 coding->produced = str_as_multibyte (destination, bytes,
3925 coding->produced,
3926 &(coding->produced_char));
3927 }
3928
3929 switch (ccl->status)
3930 {
3931 case CCL_STAT_SUSPEND_BY_SRC:
3932 result = CODING_FINISH_INSUFFICIENT_SRC;
3933 break;
3934 case CCL_STAT_SUSPEND_BY_DST:
3935 result = CODING_FINISH_INSUFFICIENT_DST;
3936 break;
3937 case CCL_STAT_QUIT:
3938 case CCL_STAT_INVALID_CMD:
3939 result = CODING_FINISH_INTERRUPT;
3940 break;
3941 default:
3942 result = CODING_FINISH_NORMAL;
3943 break;
3944 }
3945 return result;
3946 }
3947
3948 /* Decode EOL format of the text at PTR of BYTES length destructively
3949 according to CODING->eol_type. This is called after the CCL
3950 program produced a decoded text at PTR. If we do CRLF->LF
3951 conversion, update CODING->produced and CODING->produced_char. */
3952
3953 static void
3954 decode_eol_post_ccl (coding, ptr, bytes)
3955 struct coding_system *coding;
3956 unsigned char *ptr;
3957 int bytes;
3958 {
3959 Lisp_Object val, saved_coding_symbol;
3960 unsigned char *pend = ptr + bytes;
3961 int dummy;
3962
3963 /* Remember the current coding system symbol. We set it back when
3964 an inconsistent EOL is found so that `last-coding-system-used' is
3965 set to the coding system that doesn't specify EOL conversion. */
3966 saved_coding_symbol = coding->symbol;
3967
3968 coding->spec.ccl.cr_carryover = 0;
3969 if (coding->eol_type == CODING_EOL_UNDECIDED)
3970 {
3971 /* Here, to avoid the call of setup_coding_system, we directly
3972 call detect_eol_type. */
3973 coding->eol_type = detect_eol_type (ptr, bytes, &dummy);
3974 if (coding->eol_type == CODING_EOL_INCONSISTENT)
3975 coding->eol_type = CODING_EOL_LF;
3976 if (coding->eol_type != CODING_EOL_UNDECIDED)
3977 {
3978 val = Fget (coding->symbol, Qeol_type);
3979 if (VECTORP (val) && XVECTOR (val)->size == 3)
3980 coding->symbol = XVECTOR (val)->contents[coding->eol_type];
3981 }
3982 coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
3983 }
3984
3985 if (coding->eol_type == CODING_EOL_LF
3986 || coding->eol_type == CODING_EOL_UNDECIDED)
3987 {
3988 /* We have nothing to do. */
3989 ptr = pend;
3990 }
3991 else if (coding->eol_type == CODING_EOL_CRLF)
3992 {
3993 unsigned char *pstart = ptr, *p = ptr;
3994
3995 if (! (coding->mode & CODING_MODE_LAST_BLOCK)
3996 && *(pend - 1) == '\r')
3997 {
3998 /* If the last character is CR, we can't handle it here
3999 because LF will be in the not-yet-decoded source text.
4000 Recorded that the CR is not yet processed. */
4001 coding->spec.ccl.cr_carryover = 1;
4002 coding->produced--;
4003 coding->produced_char--;
4004 pend--;
4005 }
4006 while (ptr < pend)
4007 {
4008 if (*ptr == '\r')
4009 {
4010 if (ptr + 1 < pend && *(ptr + 1) == '\n')
4011 {
4012 *p++ = '\n';
4013 ptr += 2;
4014 }
4015 else
4016 {
4017 if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4018 goto undo_eol_conversion;
4019 *p++ = *ptr++;
4020 }
4021 }
4022 else if (*ptr == '\n'
4023 && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4024 goto undo_eol_conversion;
4025 else
4026 *p++ = *ptr++;
4027 continue;
4028
4029 undo_eol_conversion:
4030 /* We have faced with inconsistent EOL format at PTR.
4031 Convert all LFs before PTR back to CRLFs. */
4032 for (p--, ptr--; p >= pstart; p--)
4033 {
4034 if (*p == '\n')
4035 *ptr-- = '\n', *ptr-- = '\r';
4036 else
4037 *ptr-- = *p;
4038 }
4039 /* If carryover is recorded, cancel it because we don't
4040 convert CRLF anymore. */
4041 if (coding->spec.ccl.cr_carryover)
4042 {
4043 coding->spec.ccl.cr_carryover = 0;
4044 coding->produced++;
4045 coding->produced_char++;
4046 pend++;
4047 }
4048 p = ptr = pend;
4049 coding->eol_type = CODING_EOL_LF;
4050 coding->symbol = saved_coding_symbol;
4051 }
4052 if (p < pend)
4053 {
4054 /* As each two-byte sequence CRLF was converted to LF, (PEND
4055 - P) is the number of deleted characters. */
4056 coding->produced -= pend - p;
4057 coding->produced_char -= pend - p;
4058 }
4059 }
4060 else /* i.e. coding->eol_type == CODING_EOL_CR */
4061 {
4062 unsigned char *p = ptr;
4063
4064 for (; ptr < pend; ptr++)
4065 {
4066 if (*ptr == '\r')
4067 *ptr = '\n';
4068 else if (*ptr == '\n'
4069 && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4070 {
4071 for (; p < ptr; p++)
4072 {
4073 if (*p == '\n')
4074 *p = '\r';
4075 }
4076 ptr = pend;
4077 coding->eol_type = CODING_EOL_LF;
4078 coding->symbol = saved_coding_symbol;
4079 }
4080 }
4081 }
4082 }
4083
4084 /* See "GENERAL NOTES about `decode_coding_XXX ()' functions". Before
4085 decoding, it may detect coding system and format of end-of-line if
4086 those are not yet decided. The source should be unibyte, the
4087 result is multibyte if CODING->dst_multibyte is nonzero, else
4088 unibyte. */
4089
4090 int
4091 decode_coding (coding, source, destination, src_bytes, dst_bytes)
4092 struct coding_system *coding;
4093 unsigned char *source, *destination;
4094 int src_bytes, dst_bytes;
4095 {
4096 if (coding->type == coding_type_undecided)
4097 detect_coding (coding, source, src_bytes);
4098
4099 if (coding->eol_type == CODING_EOL_UNDECIDED
4100 && coding->type != coding_type_ccl)
4101 detect_eol (coding, source, src_bytes);
4102
4103 coding->produced = coding->produced_char = 0;
4104 coding->consumed = coding->consumed_char = 0;
4105 coding->errors = 0;
4106 coding->result = CODING_FINISH_NORMAL;
4107
4108 switch (coding->type)
4109 {
4110 case coding_type_sjis:
4111 decode_coding_sjis_big5 (coding, source, destination,
4112 src_bytes, dst_bytes, 1);
4113 break;
4114
4115 case coding_type_iso2022:
4116 decode_coding_iso2022 (coding, source, destination,
4117 src_bytes, dst_bytes);
4118 break;
4119
4120 case coding_type_big5:
4121 decode_coding_sjis_big5 (coding, source, destination,
4122 src_bytes, dst_bytes, 0);
4123 break;
4124
4125 case coding_type_emacs_mule:
4126 decode_coding_emacs_mule (coding, source, destination,
4127 src_bytes, dst_bytes);
4128 break;
4129
4130 case coding_type_ccl:
4131 if (coding->spec.ccl.cr_carryover)
4132 {
4133 /* Set the CR which is not processed by the previous call of
4134 decode_eol_post_ccl in DESTINATION. */
4135 *destination = '\r';
4136 coding->produced++;
4137 coding->produced_char++;
4138 dst_bytes--;
4139 }
4140 ccl_coding_driver (coding, source,
4141 destination + coding->spec.ccl.cr_carryover,
4142 src_bytes, dst_bytes, 0);
4143 if (coding->eol_type != CODING_EOL_LF)
4144 decode_eol_post_ccl (coding, destination, coding->produced);
4145 break;
4146
4147 default:
4148 decode_eol (coding, source, destination, src_bytes, dst_bytes);
4149 }
4150
4151 if (coding->result == CODING_FINISH_INSUFFICIENT_SRC
4152 && coding->consumed == src_bytes)
4153 coding->result = CODING_FINISH_NORMAL;
4154
4155 if (coding->mode & CODING_MODE_LAST_BLOCK
4156 && coding->result == CODING_FINISH_INSUFFICIENT_SRC)
4157 {
4158 unsigned char *src = source + coding->consumed;
4159 unsigned char *dst = destination + coding->produced;
4160
4161 src_bytes -= coding->consumed;
4162 coding->errors++;
4163 if (COMPOSING_P (coding))
4164 DECODE_COMPOSITION_END ('1');
4165 while (src_bytes--)
4166 {
4167 int c = *src++;
4168 dst += CHAR_STRING (c, dst);
4169 coding->produced_char++;
4170 }
4171 coding->consumed = coding->consumed_char = src - source;
4172 coding->produced = dst - destination;
4173 }
4174
4175 if (!coding->dst_multibyte)
4176 {
4177 coding->produced = str_as_unibyte (destination, coding->produced);
4178 coding->produced_char = coding->produced;
4179 }
4180
4181 return coding->result;
4182 }
4183
4184 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions". The
4185 multibyteness of the source is CODING->src_multibyte, the
4186 multibyteness of the result is always unibyte. */
4187
4188 int
4189 encode_coding (coding, source, destination, src_bytes, dst_bytes)
4190 struct coding_system *coding;
4191 unsigned char *source, *destination;
4192 int src_bytes, dst_bytes;
4193 {
4194 coding->produced = coding->produced_char = 0;
4195 coding->consumed = coding->consumed_char = 0;
4196 coding->errors = 0;
4197 coding->result = CODING_FINISH_NORMAL;
4198
4199 switch (coding->type)
4200 {
4201 case coding_type_sjis:
4202 encode_coding_sjis_big5 (coding, source, destination,
4203 src_bytes, dst_bytes, 1);
4204 break;
4205
4206 case coding_type_iso2022:
4207 encode_coding_iso2022 (coding, source, destination,
4208 src_bytes, dst_bytes);
4209 break;
4210
4211 case coding_type_big5:
4212 encode_coding_sjis_big5 (coding, source, destination,
4213 src_bytes, dst_bytes, 0);
4214 break;
4215
4216 case coding_type_emacs_mule:
4217 encode_coding_emacs_mule (coding, source, destination,
4218 src_bytes, dst_bytes);
4219 break;
4220
4221 case coding_type_ccl:
4222 ccl_coding_driver (coding, source, destination,
4223 src_bytes, dst_bytes, 1);
4224 break;
4225
4226 default:
4227 encode_eol (coding, source, destination, src_bytes, dst_bytes);
4228 }
4229
4230 if (coding->result == CODING_FINISH_INSUFFICIENT_SRC
4231 && coding->consumed == src_bytes)
4232 coding->result = CODING_FINISH_NORMAL;
4233
4234 if (coding->mode & CODING_MODE_LAST_BLOCK)
4235 {
4236 unsigned char *src = source + coding->consumed;
4237 unsigned char *src_end = src + src_bytes;
4238 unsigned char *dst = destination + coding->produced;
4239
4240 if (coding->type == coding_type_iso2022)
4241 ENCODE_RESET_PLANE_AND_REGISTER;
4242 if (COMPOSING_P (coding))
4243 *dst++ = ISO_CODE_ESC, *dst++ = '1';
4244 if (coding->consumed < src_bytes)
4245 {
4246 int len = src_bytes - coding->consumed;
4247
4248 BCOPY_SHORT (source + coding->consumed, dst, len);
4249 if (coding->src_multibyte)
4250 len = str_as_unibyte (dst, len);
4251 dst += len;
4252 coding->consumed = src_bytes;
4253 }
4254 coding->produced = coding->produced_char = dst - destination;
4255 }
4256
4257 return coding->result;
4258 }
4259
4260 /* Scan text in the region between *BEG and *END (byte positions),
4261 skip characters which we don't have to decode by coding system
4262 CODING at the head and tail, then set *BEG and *END to the region
4263 of the text we actually have to convert. The caller should move
4264 the gap out of the region in advance if the region is from a
4265 buffer.
4266
4267 If STR is not NULL, *BEG and *END are indices into STR. */
4268
4269 static void
4270 shrink_decoding_region (beg, end, coding, str)
4271 int *beg, *end;
4272 struct coding_system *coding;
4273 unsigned char *str;
4274 {
4275 unsigned char *begp_orig, *begp, *endp_orig, *endp, c;
4276 int eol_conversion;
4277 Lisp_Object translation_table;
4278
4279 if (coding->type == coding_type_ccl
4280 || coding->type == coding_type_undecided
4281 || coding->eol_type != CODING_EOL_LF
4282 || !NILP (coding->post_read_conversion)
4283 || coding->composing != COMPOSITION_DISABLED)
4284 {
4285 /* We can't skip any data. */
4286 return;
4287 }
4288 if (coding->type == coding_type_no_conversion
4289 || coding->type == coding_type_raw_text
4290 || coding->type == coding_type_emacs_mule)
4291 {
4292 /* We need no conversion, but don't have to skip any data here.
4293 Decoding routine handles them effectively anyway. */
4294 return;
4295 }
4296
4297 translation_table = coding->translation_table_for_decode;
4298 if (NILP (translation_table) && !NILP (Venable_character_translation))
4299 translation_table = Vstandard_translation_table_for_decode;
4300 if (CHAR_TABLE_P (translation_table))
4301 {
4302 int i;
4303 for (i = 0; i < 128; i++)
4304 if (!NILP (CHAR_TABLE_REF (translation_table, i)))
4305 break;
4306 if (i < 128)
4307 /* Some ASCII character should be translated. We give up
4308 shrinking. */
4309 return;
4310 }
4311
4312 if (coding->heading_ascii >= 0)
4313 /* Detection routine has already found how much we can skip at the
4314 head. */
4315 *beg += coding->heading_ascii;
4316
4317 if (str)
4318 {
4319 begp_orig = begp = str + *beg;
4320 endp_orig = endp = str + *end;
4321 }
4322 else
4323 {
4324 begp_orig = begp = BYTE_POS_ADDR (*beg);
4325 endp_orig = endp = begp + *end - *beg;
4326 }
4327
4328 eol_conversion = (coding->eol_type == CODING_EOL_CR
4329 || coding->eol_type == CODING_EOL_CRLF);
4330
4331 switch (coding->type)
4332 {
4333 case coding_type_sjis:
4334 case coding_type_big5:
4335 /* We can skip all ASCII characters at the head. */
4336 if (coding->heading_ascii < 0)
4337 {
4338 if (eol_conversion)
4339 while (begp < endp && *begp < 0x80 && *begp != '\r') begp++;
4340 else
4341 while (begp < endp && *begp < 0x80) begp++;
4342 }
4343 /* We can skip all ASCII characters at the tail except for the
4344 second byte of SJIS or BIG5 code. */
4345 if (eol_conversion)
4346 while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\r') endp--;
4347 else
4348 while (begp < endp && endp[-1] < 0x80) endp--;
4349 /* Do not consider LF as ascii if preceded by CR, since that
4350 confuses eol decoding. */
4351 if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
4352 endp++;
4353 if (begp < endp && endp < endp_orig && endp[-1] >= 0x80)
4354 endp++;
4355 break;
4356
4357 case coding_type_iso2022:
4358 if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
4359 /* We can't skip any data. */
4360 break;
4361 if (coding->heading_ascii < 0)
4362 {
4363 /* We can skip all ASCII characters at the head except for a
4364 few control codes. */
4365 while (begp < endp && (c = *begp) < 0x80
4366 && c != ISO_CODE_CR && c != ISO_CODE_SO
4367 && c != ISO_CODE_SI && c != ISO_CODE_ESC
4368 && (!eol_conversion || c != ISO_CODE_LF))
4369 begp++;
4370 }
4371 switch (coding->category_idx)
4372 {
4373 case CODING_CATEGORY_IDX_ISO_8_1:
4374 case CODING_CATEGORY_IDX_ISO_8_2:
4375 /* We can skip all ASCII characters at the tail. */
4376 if (eol_conversion)
4377 while (begp < endp && (c = endp[-1]) < 0x80 && c != '\r') endp--;
4378 else
4379 while (begp < endp && endp[-1] < 0x80) endp--;
4380 /* Do not consider LF as ascii if preceded by CR, since that
4381 confuses eol decoding. */
4382 if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
4383 endp++;
4384 break;
4385
4386 case CODING_CATEGORY_IDX_ISO_7:
4387 case CODING_CATEGORY_IDX_ISO_7_TIGHT:
4388 {
4389 /* We can skip all charactes at the tail except for 8-bit
4390 codes and ESC and the following 2-byte at the tail. */
4391 unsigned char *eight_bit = NULL;
4392
4393 if (eol_conversion)
4394 while (begp < endp
4395 && (c = endp[-1]) != ISO_CODE_ESC && c != '\r')
4396 {
4397 if (!eight_bit && c & 0x80) eight_bit = endp;
4398 endp--;
4399 }
4400 else
4401 while (begp < endp
4402 && (c = endp[-1]) != ISO_CODE_ESC)
4403 {
4404 if (!eight_bit && c & 0x80) eight_bit = endp;
4405 endp--;
4406 }
4407 /* Do not consider LF as ascii if preceded by CR, since that
4408 confuses eol decoding. */
4409 if (begp < endp && endp < endp_orig
4410 && endp[-1] == '\r' && endp[0] == '\n')
4411 endp++;
4412 if (begp < endp && endp[-1] == ISO_CODE_ESC)
4413 {
4414 if (endp + 1 < endp_orig && end[0] == '(' && end[1] == 'B')
4415 /* This is an ASCII designation sequence. We can
4416 surely skip the tail. But, if we have
4417 encountered an 8-bit code, skip only the codes
4418 after that. */
4419 endp = eight_bit ? eight_bit : endp + 2;
4420 else
4421 /* Hmmm, we can't skip the tail. */
4422 endp = endp_orig;
4423 }
4424 else if (eight_bit)
4425 endp = eight_bit;
4426 }
4427 }
4428 break;
4429
4430 default:
4431 abort ();
4432 }
4433 *beg += begp - begp_orig;
4434 *end += endp - endp_orig;
4435 return;
4436 }
4437
4438 /* Like shrink_decoding_region but for encoding. */
4439
4440 static void
4441 shrink_encoding_region (beg, end, coding, str)
4442 int *beg, *end;
4443 struct coding_system *coding;
4444 unsigned char *str;
4445 {
4446 unsigned char *begp_orig, *begp, *endp_orig, *endp;
4447 int eol_conversion;
4448 Lisp_Object translation_table;
4449
4450 if (coding->type == coding_type_ccl
4451 || coding->eol_type == CODING_EOL_CRLF
4452 || coding->eol_type == CODING_EOL_CR
4453 || coding->cmp_data && coding->cmp_data->used > 0)
4454 {
4455 /* We can't skip any data. */
4456 return;
4457 }
4458 if (coding->type == coding_type_no_conversion
4459 || coding->type == coding_type_raw_text
4460 || coding->type == coding_type_emacs_mule
4461 || coding->type == coding_type_undecided)
4462 {
4463 /* We need no conversion, but don't have to skip any data here.
4464 Encoding routine handles them effectively anyway. */
4465 return;
4466 }
4467
4468 translation_table = coding->translation_table_for_encode;
4469 if (NILP (translation_table) && !NILP (Venable_character_translation))
4470 translation_table = Vstandard_translation_table_for_encode;
4471 if (CHAR_TABLE_P (translation_table))
4472 {
4473 int i;
4474 for (i = 0; i < 128; i++)
4475 if (!NILP (CHAR_TABLE_REF (translation_table, i)))
4476 break;
4477 if (i < 128)
4478 /* Some ASCII character should be tranlsated. We give up
4479 shrinking. */
4480 return;
4481 }
4482
4483 if (str)
4484 {
4485 begp_orig = begp = str + *beg;
4486 endp_orig = endp = str + *end;
4487 }
4488 else
4489 {
4490 begp_orig = begp = BYTE_POS_ADDR (*beg);
4491 endp_orig = endp = begp + *end - *beg;
4492 }
4493
4494 eol_conversion = (coding->eol_type == CODING_EOL_CR
4495 || coding->eol_type == CODING_EOL_CRLF);
4496
4497 /* Here, we don't have to check coding->pre_write_conversion because
4498 the caller is expected to have handled it already. */
4499 switch (coding->type)
4500 {
4501 case coding_type_iso2022:
4502 if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
4503 /* We can't skip any data. */
4504 break;
4505 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL)
4506 {
4507 unsigned char *bol = begp;
4508 while (begp < endp && *begp < 0x80)
4509 {
4510 begp++;
4511 if (begp[-1] == '\n')
4512 bol = begp;
4513 }
4514 begp = bol;
4515 goto label_skip_tail;
4516 }
4517 /* fall down ... */
4518
4519 case coding_type_sjis:
4520 case coding_type_big5:
4521 /* We can skip all ASCII characters at the head and tail. */
4522 if (eol_conversion)
4523 while (begp < endp && *begp < 0x80 && *begp != '\n') begp++;
4524 else
4525 while (begp < endp && *begp < 0x80) begp++;
4526 label_skip_tail:
4527 if (eol_conversion)
4528 while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\n') endp--;
4529 else
4530 while (begp < endp && *(endp - 1) < 0x80) endp--;
4531 break;
4532
4533 default:
4534 abort ();
4535 }
4536
4537 *beg += begp - begp_orig;
4538 *end += endp - endp_orig;
4539 return;
4540 }
4541
/* Shrinking a conversion region has some overhead of its own, so it
   is attempted only for regions longer than this many bytes.  */
static int shrink_conversion_region_threshhold = 1024;

/* Shrink the region *BEG .. *END with shrink_encoding_region (if
   ENCODEP is nonzero) or shrink_decoding_region (otherwise), provided
   the region is longer than the threshold above.  BEG and END are
   pointers to the positions.  */
#define SHRINK_CONVERSION_REGION(beg, end, coding, str, encodep)        \
  do                                                                    \
    {                                                                   \
      if (*(end) - *(beg) > shrink_conversion_region_threshhold)        \
        {                                                               \
          if (encodep)                                                  \
            shrink_encoding_region (beg, end, coding, str);             \
          else                                                          \
            shrink_decoding_region (beg, end, coding, str);             \
        }                                                               \
    }                                                                   \
  while (0)
4555
4556 static Lisp_Object
4557 code_convert_region_unwind (dummy)
4558 Lisp_Object dummy;
4559 {
4560 inhibit_pre_post_conversion = 0;
4561 return Qnil;
4562 }
4563
4564 /* Store information about all compositions in the range FROM and TO
4565 of OBJ in memory blocks pointed by CODING->cmp_data. OBJ is a
4566 buffer or a string, defaults to the current buffer. */
4567
4568 void
4569 coding_save_composition (coding, from, to, obj)
4570 struct coding_system *coding;
4571 int from, to;
4572 Lisp_Object obj;
4573 {
4574 Lisp_Object prop;
4575 int start, end;
4576
4577 if (coding->composing == COMPOSITION_DISABLED)
4578 return;
4579 if (!coding->cmp_data)
4580 coding_allocate_composition_data (coding, from);
4581 if (!find_composition (from, to, &start, &end, &prop, obj)
4582 || end > to)
4583 return;
4584 if (start < from
4585 && (!find_composition (end, to, &start, &end, &prop, obj)
4586 || end > to))
4587 return;
4588 coding->composing = COMPOSITION_NO;
4589 do
4590 {
4591 if (COMPOSITION_VALID_P (start, end, prop))
4592 {
4593 enum composition_method method = COMPOSITION_METHOD (prop);
4594 if (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH
4595 >= COMPOSITION_DATA_SIZE)
4596 coding_allocate_composition_data (coding, from);
4597 /* For relative composition, we remember start and end
4598 positions, for the other compositions, we also remember
4599 components. */
4600 CODING_ADD_COMPOSITION_START (coding, start - from, method);
4601 if (method != COMPOSITION_RELATIVE)
4602 {
4603 /* We must store a*/
4604 Lisp_Object val, ch;
4605
4606 val = COMPOSITION_COMPONENTS (prop);
4607 if (CONSP (val))
4608 while (CONSP (val))
4609 {
4610 ch = XCAR (val), val = XCDR (val);
4611 CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (ch));
4612 }
4613 else if (VECTORP (val) || STRINGP (val))
4614 {
4615 int len = (VECTORP (val)
4616 ? XVECTOR (val)->size : XSTRING (val)->size);
4617 int i;
4618 for (i = 0; i < len; i++)
4619 {
4620 ch = (STRINGP (val)
4621 ? Faref (val, make_number (i))
4622 : XVECTOR (val)->contents[i]);
4623 CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (ch));
4624 }
4625 }
4626 else /* INTEGERP (val) */
4627 CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (val));
4628 }
4629 CODING_ADD_COMPOSITION_END (coding, end - from);
4630 }
4631 start = end;
4632 }
4633 while (start < to
4634 && find_composition (start, to, &start, &end, &prop, obj)
4635 && end <= to);
4636
4637 /* Make coding->cmp_data point to the first memory block. */
4638 while (coding->cmp_data->prev)
4639 coding->cmp_data = coding->cmp_data->prev;
4640 coding->cmp_data_start = 0;
4641 }
4642
4643 /* Reflect the saved information about compositions to OBJ.
4644 CODING->cmp_data points to a memory block for the informaiton. OBJ
4645 is a buffer or a string, defaults to the current buffer. */
4646
4647 void
4648 coding_restore_composition (coding, obj)
4649 struct coding_system *coding;
4650 Lisp_Object obj;
4651 {
4652 struct composition_data *cmp_data = coding->cmp_data;
4653
4654 if (!cmp_data)
4655 return;
4656
4657 while (cmp_data->prev)
4658 cmp_data = cmp_data->prev;
4659
4660 while (cmp_data)
4661 {
4662 int i;
4663
4664 for (i = 0; i < cmp_data->used && cmp_data->data[i] > 0;
4665 i += cmp_data->data[i])
4666 {
4667 int *data = cmp_data->data + i;
4668 enum composition_method method = (enum composition_method) data[3];
4669 Lisp_Object components;
4670
4671 if (method == COMPOSITION_RELATIVE)
4672 components = Qnil;
4673 else
4674 {
4675 int len = data[0] - 4, j;
4676 Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
4677
4678 for (j = 0; j < len; j++)
4679 args[j] = make_number (data[4 + j]);
4680 components = (method == COMPOSITION_WITH_ALTCHARS
4681 ? Fstring (len, args) : Fvector (len, args));
4682 }
4683 compose_text (data[1], data[2], components, Qnil, obj);
4684 }
4685 cmp_data = cmp_data->next;
4686 }
4687 }
4688
4689 /* Decode (if ENCODEP is zero) or encode (if ENCODEP is nonzero) the
4690 text from FROM to TO (byte positions are FROM_BYTE and TO_BYTE) by
4691 coding system CODING, and return the status code of code conversion
4692 (currently, this value has no meaning).
4693
4694 How many characters (and bytes) are converted to how many
4695 characters (and bytes) are recorded in members of the structure
4696 CODING.
4697
4698 If REPLACE is nonzero, we do various things as if the original text
4699 is deleted and a new text is inserted. See the comments in
4700 replace_range (insdel.c) to know what we are doing.
4701
4702 If REPLACE is zero, it is assumed that the source text is unibyte.
4703 Otherwise, it is assumed that the source text is multibyte. */
4704
4705 int
4706 code_convert_region (from, from_byte, to, to_byte, coding, encodep, replace)
4707 int from, from_byte, to, to_byte, encodep, replace;
4708 struct coding_system *coding;
4709 {
4710 int len = to - from, len_byte = to_byte - from_byte;
4711 int require, inserted, inserted_byte;
4712 int head_skip, tail_skip, total_skip = 0;
4713 Lisp_Object saved_coding_symbol;
4714 int first = 1;
4715 unsigned char *src, *dst;
4716 Lisp_Object deletion;
4717 int orig_point = PT, orig_len = len;
4718 int prev_Z;
4719 int multibyte_p = !NILP (current_buffer->enable_multibyte_characters);
4720
4721 coding->src_multibyte = replace && multibyte_p;
4722 coding->dst_multibyte = multibyte_p;
4723
4724 deletion = Qnil;
4725 saved_coding_symbol = Qnil;
4726
4727 if (from < PT && PT < to)
4728 {
4729 TEMP_SET_PT_BOTH (from, from_byte);
4730 orig_point = from;
4731 }
4732
4733 if (replace)
4734 {
4735 int saved_from = from;
4736 int saved_inhibit_modification_hooks;
4737
4738 prepare_to_modify_buffer (from, to, &from);
4739 if (saved_from != from)
4740 {
4741 to = from + len;
4742 from_byte = CHAR_TO_BYTE (from), to_byte = CHAR_TO_BYTE (to);
4743 len_byte = to_byte - from_byte;
4744 }
4745
4746 /* The code conversion routine can not preserve text properties
4747 for now. So, we must remove all text properties in the
4748 region. Here, we must suppress all modification hooks. */
4749 saved_inhibit_modification_hooks = inhibit_modification_hooks;
4750 inhibit_modification_hooks = 1;
4751 Fset_text_properties (make_number (from), make_number (to), Qnil, Qnil);
4752 inhibit_modification_hooks = saved_inhibit_modification_hooks;
4753 }
4754
4755 if (! encodep && CODING_REQUIRE_DETECTION (coding))
4756 {
4757 /* We must detect encoding of text and eol format. */
4758
4759 if (from < GPT && to > GPT)
4760 move_gap_both (from, from_byte);
4761 if (coding->type == coding_type_undecided)
4762 {
4763 detect_coding (coding, BYTE_POS_ADDR (from_byte), len_byte);
4764 if (coding->type == coding_type_undecided)
4765 /* It seems that the text contains only ASCII, but we
4766 should not left it undecided because the deeper
4767 decoding routine (decode_coding) tries to detect the
4768 encodings again in vain. */
4769 coding->type = coding_type_emacs_mule;
4770 }
4771 if (coding->eol_type == CODING_EOL_UNDECIDED
4772 && coding->type != coding_type_ccl)
4773 {
4774 saved_coding_symbol = coding->symbol;
4775 detect_eol (coding, BYTE_POS_ADDR (from_byte), len_byte);
4776 if (coding->eol_type == CODING_EOL_UNDECIDED)
4777 coding->eol_type = CODING_EOL_LF;
4778 /* We had better recover the original eol format if we
4779 encounter an inconsitent eol format while decoding. */
4780 coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4781 }
4782 }
4783
4784 /* Now we convert the text. */
4785
4786 /* For encoding, we must process pre-write-conversion in advance. */
4787 if (! inhibit_pre_post_conversion
4788 && encodep
4789 && SYMBOLP (coding->pre_write_conversion)
4790 && ! NILP (Ffboundp (coding->pre_write_conversion)))
4791 {
4792 /* The function in pre-write-conversion may put a new text in a
4793 new buffer. */
4794 struct buffer *prev = current_buffer;
4795 Lisp_Object new;
4796 int count = specpdl_ptr - specpdl;
4797
4798 record_unwind_protect (code_convert_region_unwind, Qnil);
4799 /* We should not call any more pre-write/post-read-conversion
4800 functions while this pre-write-conversion is running. */
4801 inhibit_pre_post_conversion = 1;
4802 call2 (coding->pre_write_conversion,
4803 make_number (from), make_number (to));
4804 inhibit_pre_post_conversion = 0;
4805 /* Discard the unwind protect. */
4806 specpdl_ptr--;
4807
4808 if (current_buffer != prev)
4809 {
4810 len = ZV - BEGV;
4811 new = Fcurrent_buffer ();
4812 set_buffer_internal_1 (prev);
4813 del_range_2 (from, from_byte, to, to_byte, 0);
4814 TEMP_SET_PT_BOTH (from, from_byte);
4815 insert_from_buffer (XBUFFER (new), 1, len, 0);
4816 Fkill_buffer (new);
4817 if (orig_point >= to)
4818 orig_point += len - orig_len;
4819 else if (orig_point > from)
4820 orig_point = from;
4821 orig_len = len;
4822 to = from + len;
4823 from_byte = CHAR_TO_BYTE (from);
4824 to_byte = CHAR_TO_BYTE (to);
4825 len_byte = to_byte - from_byte;
4826 TEMP_SET_PT_BOTH (from, from_byte);
4827 }
4828 }
4829
4830 if (replace)
4831 deletion = make_buffer_string_both (from, from_byte, to, to_byte, 1);
4832
4833 if (coding->composing != COMPOSITION_DISABLED)
4834 {
4835 if (encodep)
4836 coding_save_composition (coding, from, to, Fcurrent_buffer ());
4837 else
4838 coding_allocate_composition_data (coding, from);
4839 }
4840
4841 /* Try to skip the heading and tailing ASCIIs. */
4842 if (coding->type != coding_type_ccl)
4843 {
4844 int from_byte_orig = from_byte, to_byte_orig = to_byte;
4845
4846 if (from < GPT && GPT < to)
4847 move_gap_both (from, from_byte);
4848 SHRINK_CONVERSION_REGION (&from_byte, &to_byte, coding, NULL, encodep);
4849 if (from_byte == to_byte
4850 && (encodep || NILP (coding->post_read_conversion))
4851 && ! CODING_REQUIRE_FLUSHING (coding))
4852 {
4853 coding->produced = len_byte;
4854 coding->produced_char = len;
4855 if (!replace)
4856 /* We must record and adjust for this new text now. */
4857 adjust_after_insert (from, from_byte_orig, to, to_byte_orig, len);
4858 return 0;
4859 }
4860
4861 head_skip = from_byte - from_byte_orig;
4862 tail_skip = to_byte_orig - to_byte;
4863 total_skip = head_skip + tail_skip;
4864 from += head_skip;
4865 to -= tail_skip;
4866 len -= total_skip; len_byte -= total_skip;
4867 }
4868
4869 /* For converion, we must put the gap before the text in addition to
4870 making the gap larger for efficient decoding. The required gap
4871 size starts from 2000 which is the magic number used in make_gap.
4872 But, after one batch of conversion, it will be incremented if we
4873 find that it is not enough . */
4874 require = 2000;
4875
4876 if (GAP_SIZE < require)
4877 make_gap (require - GAP_SIZE);
4878 move_gap_both (from, from_byte);
4879
4880 inserted = inserted_byte = 0;
4881
4882 GAP_SIZE += len_byte;
4883 ZV -= len;
4884 Z -= len;
4885 ZV_BYTE -= len_byte;
4886 Z_BYTE -= len_byte;
4887
4888 if (GPT - BEG < BEG_UNCHANGED)
4889 BEG_UNCHANGED = GPT - BEG;
4890 if (Z - GPT < END_UNCHANGED)
4891 END_UNCHANGED = Z - GPT;
4892
4893 if (!encodep && coding->src_multibyte)
4894 {
4895 /* Decoding routines expects that the source text is unibyte.
4896 We must convert 8-bit characters of multibyte form to
4897 unibyte. */
4898 int len_byte_orig = len_byte;
4899 len_byte = str_as_unibyte (GAP_END_ADDR - len_byte, len_byte);
4900 if (len_byte < len_byte_orig)
4901 safe_bcopy (GAP_END_ADDR - len_byte_orig, GAP_END_ADDR - len_byte,
4902 len_byte);
4903 coding->src_multibyte = 0;
4904 }
4905
4906 for (;;)
4907 {
4908 int result;
4909
4910 /* The buffer memory is now:
4911 +--------+converted-text+---------+-------original-text-------+---+
4912 |<-from->|<--inserted-->|---------|<--------len_byte--------->|---|
4913 |<---------------------- GAP ----------------------->| */
4914 src = GAP_END_ADDR - len_byte;
4915 dst = GPT_ADDR + inserted_byte;
4916
4917 if (encodep)
4918 result = encode_coding (coding, src, dst, len_byte, 0);
4919 else
4920 result = decode_coding (coding, src, dst, len_byte, 0);
4921
4922 /* The buffer memory is now:
4923 +--------+-------converted-text----+--+------original-text----+---+
4924 |<-from->|<-inserted->|<-produced->|--|<-(len_byte-consumed)->|---|
4925 |<---------------------- GAP ----------------------->| */
4926
4927 inserted += coding->produced_char;
4928 inserted_byte += coding->produced;
4929 len_byte -= coding->consumed;
4930
4931 if (result == CODING_FINISH_INSUFFICIENT_CMP)
4932 {
4933 coding_allocate_composition_data (coding, from + inserted);
4934 continue;
4935 }
4936
4937 src += coding->consumed;
4938 dst += coding->produced;
4939
4940 if (result == CODING_FINISH_NORMAL)
4941 {
4942 src += len_byte;
4943 break;
4944 }
4945 if (! encodep && result == CODING_FINISH_INCONSISTENT_EOL)
4946 {
4947 unsigned char *pend = dst, *p = pend - inserted_byte;
4948 Lisp_Object eol_type;
4949
4950 /* Encode LFs back to the original eol format (CR or CRLF). */
4951 if (coding->eol_type == CODING_EOL_CR)
4952 {
4953 while (p < pend) if (*p++ == '\n') p[-1] = '\r';
4954 }
4955 else
4956 {
4957 int count = 0;
4958
4959 while (p < pend) if (*p++ == '\n') count++;
4960 if (src - dst < count)
4961 {
4962 /* We don't have sufficient room for encoding LFs
4963 back to CRLF. We must record converted and
4964 not-yet-converted text back to the buffer
4965 content, enlarge the gap, then record them out of
4966 the buffer contents again. */
4967 int add = len_byte + inserted_byte;
4968
4969 GAP_SIZE -= add;
4970 ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
4971 GPT += inserted_byte; GPT_BYTE += inserted_byte;
4972 make_gap (count - GAP_SIZE);
4973 GAP_SIZE += add;
4974 ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
4975 GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
4976 /* Don't forget to update SRC, DST, and PEND. */
4977 src = GAP_END_ADDR - len_byte;
4978 dst = GPT_ADDR + inserted_byte;
4979 pend = dst;
4980 }
4981 inserted += count;
4982 inserted_byte += count;
4983 coding->produced += count;
4984 p = dst = pend + count;
4985 while (count)
4986 {
4987 *--p = *--pend;
4988 if (*p == '\n') count--, *--p = '\r';
4989 }
4990 }
4991
4992 /* Suppress eol-format conversion in the further conversion. */
4993 coding->eol_type = CODING_EOL_LF;
4994
4995 /* Set the coding system symbol to that for Unix-like EOL. */
4996 eol_type = Fget (saved_coding_symbol, Qeol_type);
4997 if (VECTORP (eol_type)
4998 && XVECTOR (eol_type)->size == 3
4999 && SYMBOLP (XVECTOR (eol_type)->contents[CODING_EOL_LF]))
5000 coding->symbol = XVECTOR (eol_type)->contents[CODING_EOL_LF];
5001 else
5002 coding->symbol = saved_coding_symbol;
5003
5004 continue;
5005 }
5006 if (len_byte <= 0)
5007 {
5008 if (coding->type != coding_type_ccl
5009 || coding->mode & CODING_MODE_LAST_BLOCK)
5010 break;
5011 coding->mode |= CODING_MODE_LAST_BLOCK;
5012 continue;
5013 }
5014 if (result == CODING_FINISH_INSUFFICIENT_SRC)
5015 {
5016 /* The source text ends in invalid codes. Let's just
5017 make them valid buffer contents, and finish conversion. */
5018 inserted += len_byte;
5019 inserted_byte += len_byte;
5020 while (len_byte--)
5021 *dst++ = *src++;
5022 break;
5023 }
5024 if (result == CODING_FINISH_INTERRUPT)
5025 {
5026 /* The conversion procedure was interrupted by a user. */
5027 break;
5028 }
5029 /* Now RESULT == CODING_FINISH_INSUFFICIENT_DST */
5030 if (coding->consumed < 1)
5031 {
5032 /* It's quite strange to require more memory without
5033 consuming any bytes. Perhaps CCL program bug. */
5034 break;
5035 }
5036 if (first)
5037 {
5038 /* We have just done the first batch of conversion which was
5039 stoped because of insufficient gap. Let's reconsider the
5040 required gap size (i.e. SRT - DST) now.
5041
5042 We have converted ORIG bytes (== coding->consumed) into
5043 NEW bytes (coding->produced). To convert the remaining
5044 LEN bytes, we may need REQUIRE bytes of gap, where:
5045 REQUIRE + LEN_BYTE = LEN_BYTE * (NEW / ORIG)
5046 REQUIRE = LEN_BYTE * (NEW - ORIG) / ORIG
5047 Here, we are sure that NEW >= ORIG. */
5048 float ratio = coding->produced - coding->consumed;
5049 ratio /= coding->consumed;
5050 require = len_byte * ratio;
5051 first = 0;
5052 }
5053 if ((src - dst) < (require + 2000))
5054 {
5055 /* See the comment above the previous call of make_gap. */
5056 int add = len_byte + inserted_byte;
5057
5058 GAP_SIZE -= add;
5059 ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
5060 GPT += inserted_byte; GPT_BYTE += inserted_byte;
5061 make_gap (require + 2000);
5062 GAP_SIZE += add;
5063 ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
5064 GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5065 }
5066 }
5067 if (src - dst > 0) *dst = 0; /* Put an anchor. */
5068
5069 if (encodep && coding->dst_multibyte)
5070 {
5071 /* The output is unibyte. We must convert 8-bit characters to
5072 multibyte form. */
5073 if (inserted_byte * 2 > GAP_SIZE)
5074 {
5075 GAP_SIZE -= inserted_byte;
5076 ZV += inserted_byte; Z += inserted_byte;
5077 ZV_BYTE += inserted_byte; Z_BYTE += inserted_byte;
5078 GPT += inserted_byte; GPT_BYTE += inserted_byte;
5079 make_gap (inserted_byte - GAP_SIZE);
5080 GAP_SIZE += inserted_byte;
5081 ZV -= inserted_byte; Z -= inserted_byte;
5082 ZV_BYTE -= inserted_byte; Z_BYTE -= inserted_byte;
5083 GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5084 }
5085 inserted_byte = str_to_multibyte (GPT_ADDR, GAP_SIZE, inserted_byte);
5086 }
5087
5088 /* If we have shrinked the conversion area, adjust it now. */
5089 if (total_skip > 0)
5090 {
5091 if (tail_skip > 0)
5092 safe_bcopy (GAP_END_ADDR, GPT_ADDR + inserted_byte, tail_skip);
5093 inserted += total_skip; inserted_byte += total_skip;
5094 GAP_SIZE += total_skip;
5095 GPT -= head_skip; GPT_BYTE -= head_skip;
5096 ZV -= total_skip; ZV_BYTE -= total_skip;
5097 Z -= total_skip; Z_BYTE -= total_skip;
5098 from -= head_skip; from_byte -= head_skip;
5099 to += tail_skip; to_byte += tail_skip;
5100 }
5101
5102 prev_Z = Z;
5103 adjust_after_replace (from, from_byte, deletion, inserted, inserted_byte);
5104 inserted = Z - prev_Z;
5105
5106 if (!encodep && coding->cmp_data && coding->cmp_data->used)
5107 coding_restore_composition (coding, Fcurrent_buffer ());
5108 coding_free_composition_data (coding);
5109
5110 if (! inhibit_pre_post_conversion
5111 && ! encodep && ! NILP (coding->post_read_conversion))
5112 {
5113 Lisp_Object val;
5114 int count = specpdl_ptr - specpdl;
5115
5116 if (from != PT)
5117 TEMP_SET_PT_BOTH (from, from_byte);
5118 prev_Z = Z;
5119 record_unwind_protect (code_convert_region_unwind, Qnil);
5120 /* We should not call any more pre-write/post-read-conversion
5121 functions while this post-read-conversion is running. */
5122 inhibit_pre_post_conversion = 1;
5123 val = call1 (coding->post_read_conversion, make_number (inserted));
5124 inhibit_pre_post_conversion = 0;
5125 /* Discard the unwind protect. */
5126 specpdl_ptr--;
5127 CHECK_NUMBER (val, 0);
5128 inserted += Z - prev_Z;
5129 }
5130
5131 if (orig_point >= from)
5132 {
5133 if (orig_point >= from + orig_len)
5134 orig_point += inserted - orig_len;
5135 else
5136 orig_point = from;
5137 TEMP_SET_PT (orig_point);
5138 }
5139
5140 if (replace)
5141 {
5142 signal_after_change (from, to - from, inserted);
5143 update_compositions (from, from + inserted, CHECK_BORDER);
5144 }
5145
5146 {
5147 coding->consumed = to_byte - from_byte;
5148 coding->consumed_char = to - from;
5149 coding->produced = inserted_byte;
5150 coding->produced_char = inserted;
5151 }
5152
5153 return 0;
5154 }
5155
5156 Lisp_Object
5157 run_pre_post_conversion_on_str (str, coding, encodep)
5158 Lisp_Object str;
5159 struct coding_system *coding;
5160 int encodep;
5161 {
5162 int count = specpdl_ptr - specpdl;
5163 struct gcpro gcpro1;
5164 struct buffer *prev = current_buffer;
5165 int multibyte = STRING_MULTIBYTE (str);
5166
5167 record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
5168 record_unwind_protect (code_convert_region_unwind, Qnil);
5169 GCPRO1 (str);
5170 temp_output_buffer_setup (" *code-converting-work*");
5171 set_buffer_internal (XBUFFER (Vstandard_output));
5172 /* We must insert the contents of STR as is without
5173 unibyte<->multibyte conversion. For that, we adjust the
5174 multibyteness of the working buffer to that of STR. */
5175 Ferase_buffer ();
5176 current_buffer->enable_multibyte_characters = multibyte ? Qt : Qnil;
5177 insert_from_string (str, 0, 0,
5178 XSTRING (str)->size, STRING_BYTES (XSTRING (str)), 0);
5179 UNGCPRO;
5180 inhibit_pre_post_conversion = 1;
5181 if (encodep)
5182 call2 (coding->pre_write_conversion, make_number (BEG), make_number (Z));
5183 else
5184 {
5185 TEMP_SET_PT_BOTH (BEG, BEG_BYTE);
5186 call1 (coding->post_read_conversion, make_number (Z - BEG));
5187 }
5188 inhibit_pre_post_conversion = 0;
5189 str = make_buffer_string (BEG, Z, 1);
5190 return unbind_to (count, str);
5191 }
5192
5193 Lisp_Object
5194 decode_coding_string (str, coding, nocopy)
5195 Lisp_Object str;
5196 struct coding_system *coding;
5197 int nocopy;
5198 {
5199 int len;
5200 char *buf;
5201 int from, to, to_byte;
5202 struct gcpro gcpro1;
5203 Lisp_Object saved_coding_symbol;
5204 int result;
5205 int require_decoding;
5206
5207 from = 0;
5208 to = XSTRING (str)->size;
5209 to_byte = STRING_BYTES (XSTRING (str));
5210
5211 saved_coding_symbol = Qnil;
5212 if (CODING_REQUIRE_DETECTION (coding))
5213 {
5214 /* See the comments in code_convert_region. */
5215 if (coding->type == coding_type_undecided)
5216 {
5217 detect_coding (coding, XSTRING (str)->data, to_byte);
5218 if (coding->type == coding_type_undecided)
5219 coding->type = coding_type_emacs_mule;
5220 }
5221 if (coding->eol_type == CODING_EOL_UNDECIDED
5222 && coding->type != coding_type_ccl)
5223 {
5224 saved_coding_symbol = coding->symbol;
5225 detect_eol (coding, XSTRING (str)->data, to_byte);
5226 if (coding->eol_type == CODING_EOL_UNDECIDED)
5227 coding->eol_type = CODING_EOL_LF;
5228 /* We had better recover the original eol format if we
5229 encounter an inconsitent eol format while decoding. */
5230 coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
5231 }
5232 }
5233
5234 require_decoding = CODING_REQUIRE_DECODING (coding);
5235
5236 if (STRING_MULTIBYTE (str))
5237 {
5238 /* Decoding routines expect the source text to be unibyte. */
5239 str = Fstring_as_unibyte (str);
5240 to_byte = STRING_BYTES (XSTRING (str));
5241 nocopy = 1;
5242 }
5243 coding->src_multibyte = 0;
5244 coding->dst_multibyte = (coding->type != coding_type_no_conversion
5245 && coding->type != coding_type_raw_text);
5246
5247 /* Try to skip the heading and tailing ASCIIs. */
5248 if (require_decoding && coding->type != coding_type_ccl)
5249 {
5250 int from_orig = from;
5251
5252 SHRINK_CONVERSION_REGION (&from, &to_byte, coding, XSTRING (str)->data,
5253 0);
5254 if (from == to_byte)
5255 require_decoding = 0;
5256 }
5257
5258 if (!require_decoding)
5259 {
5260 coding->consumed = STRING_BYTES (XSTRING (str));
5261 coding->consumed_char = XSTRING (str)->size;
5262 if (coding->dst_multibyte)
5263 {
5264 str = Fstring_as_multibyte (str);
5265 nocopy = 1;
5266 }
5267 coding->produced = STRING_BYTES (XSTRING (str));
5268 coding->produced_char = XSTRING (str)->size;
5269 return (nocopy ? str : Fcopy_sequence (str));
5270 }
5271
5272 if (coding->composing != COMPOSITION_DISABLED)
5273 coding_allocate_composition_data (coding, from);
5274
5275 len = decoding_buffer_size (coding, to_byte - from);
5276 len += from + STRING_BYTES (XSTRING (str)) - to_byte;
5277 GCPRO1 (str);
5278 buf = get_conversion_buffer (len);
5279 UNGCPRO;
5280
5281 if (from > 0)
5282 bcopy (XSTRING (str)->data, buf, from);
5283 result = decode_coding (coding, XSTRING (str)->data + from,
5284 buf + from, to_byte - from, len);
5285 if (result == CODING_FINISH_INCONSISTENT_EOL)
5286 {
5287 /* We simply try to decode the whole string again but without
5288 eol-conversion this time. */
5289 coding->eol_type = CODING_EOL_LF;
5290 coding->symbol = saved_coding_symbol;
5291 coding_free_composition_data (coding);
5292 return decode_coding_string (str, coding, nocopy);
5293 }
5294
5295 bcopy (XSTRING (str)->data + to_byte, buf + from + coding->produced,
5296 STRING_BYTES (XSTRING (str)) - to_byte);
5297
5298 len = from + STRING_BYTES (XSTRING (str)) - to_byte;
5299 if (coding->dst_multibyte)
5300 str = make_multibyte_string (buf, len + coding->produced_char,
5301 len + coding->produced);
5302 else
5303 str = make_unibyte_string (buf, len + coding->produced);
5304
5305 if (coding->cmp_data && coding->cmp_data->used)
5306 coding_restore_composition (coding, str);
5307 coding_free_composition_data (coding);
5308
5309 if (SYMBOLP (coding->post_read_conversion)
5310 && !NILP (Ffboundp (coding->post_read_conversion)))
5311 str = run_pre_post_conversion_on_str (str, coding, 0);
5312
5313 return str;
5314 }
5315
5316 Lisp_Object
5317 encode_coding_string (str, coding, nocopy)
5318 Lisp_Object str;
5319 struct coding_system *coding;
5320 int nocopy;
5321 {
5322 int len;
5323 char *buf;
5324 int from, to, to_byte;
5325 struct gcpro gcpro1;
5326 Lisp_Object saved_coding_symbol;
5327 int result;
5328
5329 if (SYMBOLP (coding->pre_write_conversion)
5330 && !NILP (Ffboundp (coding->pre_write_conversion)))
5331 str = run_pre_post_conversion_on_str (str, coding, 1);
5332
5333 from = 0;
5334 to = XSTRING (str)->size;
5335 to_byte = STRING_BYTES (XSTRING (str));
5336
5337 saved_coding_symbol = Qnil;
5338 if (! CODING_REQUIRE_ENCODING (coding))
5339 {
5340 if (STRING_MULTIBYTE (str))
5341 {
5342 str = Fstring_as_unibyte (str);
5343 nocopy = 1;
5344 }
5345 return (nocopy ? str : Fcopy_sequence (str));
5346 }
5347
5348 /* Encoding routines determine the multibyteness of the source text
5349 by coding->src_multibyte. */
5350 coding->src_multibyte = STRING_MULTIBYTE (str);
5351 coding->dst_multibyte = 0;
5352
5353 if (coding->composing != COMPOSITION_DISABLED)
5354 coding_save_composition (coding, from, to, str);
5355
5356 /* Try to skip the heading and tailing ASCIIs. */
5357 if (coding->type != coding_type_ccl)
5358 {
5359 int from_orig = from;
5360
5361 SHRINK_CONVERSION_REGION (&from, &to_byte, coding, XSTRING (str)->data,
5362 1);
5363 if (from == to_byte)
5364 return (nocopy ? str : Fcopy_sequence (str));
5365 }
5366
5367 len = encoding_buffer_size (coding, to_byte - from);
5368 len += from + STRING_BYTES (XSTRING (str)) - to_byte;
5369 GCPRO1 (str);
5370 buf = get_conversion_buffer (len);
5371 UNGCPRO;
5372
5373 if (from > 0)
5374 bcopy (XSTRING (str)->data, buf, from);
5375 result = encode_coding (coding, XSTRING (str)->data + from,
5376 buf + from, to_byte - from, len);
5377 bcopy (XSTRING (str)->data + to_byte, buf + from + coding->produced,
5378 STRING_BYTES (XSTRING (str)) - to_byte);
5379
5380 len = from + STRING_BYTES (XSTRING (str)) - to_byte;
5381 str = make_unibyte_string (buf, len + coding->produced);
5382 coding_free_composition_data (coding);
5383
5384 return str;
5385 }
5386
5387 \f
5388 #ifdef emacs
5389 /*** 8. Emacs Lisp library functions ***/
5390
5391 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
5392 "Return t if OBJECT is nil or a coding-system.\n\
5393 See the documentation of `make-coding-system' for information\n\
5394 about coding-system objects.")
5395 (obj)
5396 Lisp_Object obj;
5397 {
5398 if (NILP (obj))
5399 return Qt;
5400 if (!SYMBOLP (obj))
5401 return Qnil;
5402 /* Get coding-spec vector for OBJ. */
5403 obj = Fget (obj, Qcoding_system);
5404 return ((VECTORP (obj) && XVECTOR (obj)->size == 5)
5405 ? Qt : Qnil);
5406 }
5407
5408 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
5409 Sread_non_nil_coding_system, 1, 1, 0,
5410 "Read a coding system from the minibuffer, prompting with string PROMPT.")
5411 (prompt)
5412 Lisp_Object prompt;
5413 {
5414 Lisp_Object val;
5415 do
5416 {
5417 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
5418 Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
5419 }
5420 while (XSTRING (val)->size == 0);
5421 return (Fintern (val, Qnil));
5422 }
5423
5424 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
5425 "Read a coding system from the minibuffer, prompting with string PROMPT.\n\
5426 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.")
5427 (prompt, default_coding_system)
5428 Lisp_Object prompt, default_coding_system;
5429 {
5430 Lisp_Object val;
5431 if (SYMBOLP (default_coding_system))
5432 XSETSTRING (default_coding_system, XSYMBOL (default_coding_system)->name);
5433 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
5434 Qt, Qnil, Qcoding_system_history,
5435 default_coding_system, Qnil);
5436 return (XSTRING (val)->size == 0 ? Qnil : Fintern (val, Qnil));
5437 }
5438
5439 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
5440 1, 1, 0,
5441 "Check validity of CODING-SYSTEM.\n\
5442 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.\n\
5443 It is valid if it is a symbol with a non-nil `coding-system' property.\n\
5444 The value of property should be a vector of length 5.")
5445 (coding_system)
5446 Lisp_Object coding_system;
5447 {
5448 CHECK_SYMBOL (coding_system, 0);
5449 if (!NILP (Fcoding_system_p (coding_system)))
5450 return coding_system;
5451 while (1)
5452 Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
5453 }
5454 \f
5455 Lisp_Object
5456 detect_coding_system (src, src_bytes, highest)
5457 unsigned char *src;
5458 int src_bytes, highest;
5459 {
5460 int coding_mask, eol_type;
5461 Lisp_Object val, tmp;
5462 int dummy;
5463
5464 coding_mask = detect_coding_mask (src, src_bytes, NULL, &dummy);
5465 eol_type = detect_eol_type (src, src_bytes, &dummy);
5466 if (eol_type == CODING_EOL_INCONSISTENT)
5467 eol_type = CODING_EOL_UNDECIDED;
5468
5469 if (!coding_mask)
5470 {
5471 val = Qundecided;
5472 if (eol_type != CODING_EOL_UNDECIDED)
5473 {
5474 Lisp_Object val2;
5475 val2 = Fget (Qundecided, Qeol_type);
5476 if (VECTORP (val2))
5477 val = XVECTOR (val2)->contents[eol_type];
5478 }
5479 return (highest ? val : Fcons (val, Qnil));
5480 }
5481
5482 /* At first, gather possible coding systems in VAL. */
5483 val = Qnil;
5484 for (tmp = Vcoding_category_list; CONSP (tmp); tmp = XCDR (tmp))
5485 {
5486 Lisp_Object category_val, category_index;
5487
5488 category_index = Fget (XCAR (tmp), Qcoding_category_index);
5489 category_val = Fsymbol_value (XCAR (tmp));
5490 if (!NILP (category_val)
5491 && NATNUMP (category_index)
5492 && (coding_mask & (1 << XFASTINT (category_index))))
5493 {
5494 val = Fcons (category_val, val);
5495 if (highest)
5496 break;
5497 }
5498 }
5499 if (!highest)
5500 val = Fnreverse (val);
5501
5502 /* Then, replace the elements with subsidiary coding systems. */
5503 for (tmp = val; CONSP (tmp); tmp = XCDR (tmp))
5504 {
5505 if (eol_type != CODING_EOL_UNDECIDED
5506 && eol_type != CODING_EOL_INCONSISTENT)
5507 {
5508 Lisp_Object eol;
5509 eol = Fget (XCAR (tmp), Qeol_type);
5510 if (VECTORP (eol))
5511 XCAR (tmp) = XVECTOR (eol)->contents[eol_type];
5512 }
5513 }
5514 return (highest ? XCAR (val) : val);
5515 }
5516
5517 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
5518 2, 3, 0,
5519 "Detect coding system of the text in the region between START and END.\n\
5520 Return a list of possible coding systems ordered by priority.\n\
5521 \n\
5522 If only ASCII characters are found, it returns a list of single element\n\
5523 `undecided' or its subsidiary coding system according to a detected\n\
5524 end-of-line format.\n\
5525 \n\
5526 If optional argument HIGHEST is non-nil, return the coding system of\n\
5527 highest priority.")
5528 (start, end, highest)
5529 Lisp_Object start, end, highest;
5530 {
5531 int from, to;
5532 int from_byte, to_byte;
5533
5534 CHECK_NUMBER_COERCE_MARKER (start, 0);
5535 CHECK_NUMBER_COERCE_MARKER (end, 1);
5536
5537 validate_region (&start, &end);
5538 from = XINT (start), to = XINT (end);
5539 from_byte = CHAR_TO_BYTE (from);
5540 to_byte = CHAR_TO_BYTE (to);
5541
5542 if (from < GPT && to >= GPT)
5543 move_gap_both (to, to_byte);
5544
5545 return detect_coding_system (BYTE_POS_ADDR (from_byte),
5546 to_byte - from_byte,
5547 !NILP (highest));
5548 }
5549
5550 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
5551 1, 2, 0,
5552 "Detect coding system of the text in STRING.\n\
5553 Return a list of possible coding systems ordered by priority.\n\
5554 \n\
5555 If only ASCII characters are found, it returns a list of single element\n\
5556 `undecided' or its subsidiary coding system according to a detected\n\
5557 end-of-line format.\n\
5558 \n\
5559 If optional argument HIGHEST is non-nil, return the coding system of\n\
5560 highest priority.")
5561 (string, highest)
5562 Lisp_Object string, highest;
5563 {
5564 CHECK_STRING (string, 0);
5565
5566 return detect_coding_system (XSTRING (string)->data,
5567 STRING_BYTES (XSTRING (string)),
5568 !NILP (highest));
5569 }
5570
5571 /* Return an intersection of lists L1 and L2. */
5572
5573 static Lisp_Object
5574 intersection (l1, l2)
5575 Lisp_Object l1, l2;
5576 {
5577 Lisp_Object val;
5578
5579 for (val = Qnil; CONSP (l1); l1 = XCDR (l1))
5580 {
5581 if (!NILP (Fmemq (XCAR (l1), l2)))
5582 val = Fcons (XCAR (l1), val);
5583 }
5584 return val;
5585 }
5586
5587
5588 /* Subroutine for Fsafe_coding_systems_region_internal.
5589
5590 Return a list of coding systems that safely encode the multibyte
5591 text between P and PEND. SAFE_CODINGS, if non-nil, is a list of
5592 possible coding systems. If it is nil, it means that we have not
5593 yet found any coding systems.
5594
5595 WORK_TABLE is a copy of the char-table Vchar_coding_system_table. An
5596 element of WORK_TABLE is set to t once the element is looked up.
5597
5598 If a non-ASCII single byte char is found, set
5599 *single_byte_char_found to 1. */
5600
5601 static Lisp_Object
5602 find_safe_codings (p, pend, safe_codings, work_table, single_byte_char_found)
5603 unsigned char *p, *pend;
5604 Lisp_Object safe_codings, work_table;
5605 int *single_byte_char_found;
5606 {
5607 int c, len, idx;
5608 Lisp_Object val;
5609
5610 while (p < pend)
5611 {
5612 c = STRING_CHAR_AND_LENGTH (p, pend - p, len);
5613 p += len;
5614 if (ASCII_BYTE_P (c))
5615 /* We can ignore ASCII characters here. */
5616 continue;
5617 if (SINGLE_BYTE_CHAR_P (c))
5618 *single_byte_char_found = 1;
5619 if (NILP (safe_codings))
5620 continue;
5621 /* Check the safe coding systems for C. */
5622 val = char_table_ref_and_index (work_table, c, &idx);
5623 if (EQ (val, Qt))
5624 /* This element was already checked. Ignore it. */
5625 continue;
5626 /* Remember that we checked this element. */
5627 CHAR_TABLE_SET (work_table, make_number (idx), Qt);
5628
5629 /* If there are some safe coding systems for C and we have
5630 already found the other set of coding systems for the
5631 different characters, get the intersection of them. */
5632 if (!EQ (safe_codings, Qt) && !NILP (val))
5633 val = intersection (safe_codings, val);
5634 safe_codings = val;
5635 }
5636 return safe_codings;
5637 }
5638
5639
5640 /* Return a list of coding systems that safely encode the text between
5641 START and END. If the text contains only ASCII or is unibyte,
5642 return t. */
5643
5644 DEFUN ("find-coding-systems-region-internal",
5645 Ffind_coding_systems_region_internal,
5646 Sfind_coding_systems_region_internal, 2, 2, 0,
5647 "Internal use only.")
5648 (start, end)
5649 Lisp_Object start, end;
5650 {
5651 Lisp_Object work_table, safe_codings;
5652 int non_ascii_p = 0;
5653 int single_byte_char_found = 0;
5654 unsigned char *p1, *p1end, *p2, *p2end, *p;
5655 Lisp_Object args[2];
5656
5657 if (STRINGP (start))
5658 {
5659 if (!STRING_MULTIBYTE (start))
5660 return Qt;
5661 p1 = XSTRING (start)->data, p1end = p1 + STRING_BYTES (XSTRING (start));
5662 p2 = p2end = p1end;
5663 if (XSTRING (start)->size != STRING_BYTES (XSTRING (start)))
5664 non_ascii_p = 1;
5665 }
5666 else
5667 {
5668 int from, to, stop;
5669
5670 CHECK_NUMBER_COERCE_MARKER (start, 0);
5671 CHECK_NUMBER_COERCE_MARKER (end, 1);
5672 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
5673 args_out_of_range (start, end);
5674 if (NILP (current_buffer->enable_multibyte_characters))
5675 return Qt;
5676 from = CHAR_TO_BYTE (XINT (start));
5677 to = CHAR_TO_BYTE (XINT (end));
5678 stop = from < GPT_BYTE && GPT_BYTE < to ? GPT_BYTE : to;
5679 p1 = BYTE_POS_ADDR (from), p1end = p1 + (stop - from);
5680 if (stop == to)
5681 p2 = p2end = p1end;
5682 else
5683 p2 = BYTE_POS_ADDR (stop), p2end = p2 + (to - stop);
5684 if (XINT (end) - XINT (start) != to - from)
5685 non_ascii_p = 1;
5686 }
5687
5688 if (!non_ascii_p)
5689 {
5690 /* We are sure that the text contains no multibyte character.
5691 Check if it contains eight-bit-graphic. */
5692 p = p1;
5693 for (p = p1; p < p1end && ASCII_BYTE_P (*p); p++);
5694 if (p == p1end)
5695 {
5696 for (p = p2; p < p2end && ASCII_BYTE_P (*p); p++);
5697 if (p == p2end)
5698 return Qt;
5699 }
5700 }
5701
5702 /* The text contains non-ASCII characters. */
5703 work_table = Fcopy_sequence (Vchar_coding_system_table);
5704 safe_codings = find_safe_codings (p1, p1end, Qt, work_table,
5705 &single_byte_char_found);
5706 if (p2 < p2end)
5707 safe_codings = find_safe_codings (p2, p2end, safe_codings, work_table,
5708 &single_byte_char_found);
5709
5710 if (!single_byte_char_found)
5711 {
5712 /* Append generic coding systems. */
5713 Lisp_Object args[2];
5714 args[0] = safe_codings;
5715 args[1] = Fchar_table_extra_slot (Vchar_coding_system_table,
5716 make_number (0));
5717 safe_codings = Fappend (2, args);
5718 }
5719 else
5720 safe_codings = Fcons (Qraw_text, Fcons (Qemacs_mule, safe_codings));
5721 return safe_codings;
5722 }
5723
5724
5725 Lisp_Object
5726 code_convert_region1 (start, end, coding_system, encodep)
5727 Lisp_Object start, end, coding_system;
5728 int encodep;
5729 {
5730 struct coding_system coding;
5731 int from, to, len;
5732
5733 CHECK_NUMBER_COERCE_MARKER (start, 0);
5734 CHECK_NUMBER_COERCE_MARKER (end, 1);
5735 CHECK_SYMBOL (coding_system, 2);
5736
5737 validate_region (&start, &end);
5738 from = XFASTINT (start);
5739 to = XFASTINT (end);
5740
5741 if (NILP (coding_system))
5742 return make_number (to - from);
5743
5744 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
5745 error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data);
5746
5747 coding.mode |= CODING_MODE_LAST_BLOCK;
5748 coding.src_multibyte = coding.dst_multibyte
5749 = !NILP (current_buffer->enable_multibyte_characters);
5750 code_convert_region (from, CHAR_TO_BYTE (from), to, CHAR_TO_BYTE (to),
5751 &coding, encodep, 1);
5752 Vlast_coding_system_used = coding.symbol;
5753 return make_number (coding.produced_char);
5754 }
5755
5756 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
5757 3, 3, "r\nzCoding system: ",
5758 "Decode the current region by specified coding system.\n\
5759 When called from a program, takes three arguments:\n\
5760 START, END, and CODING-SYSTEM. START and END are buffer positions.\n\
5761 This function sets `last-coding-system-used' to the precise coding system\n\
5762 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
5763 not fully specified.)\n\
5764 It returns the length of the decoded text.")
5765 (start, end, coding_system)
5766 Lisp_Object start, end, coding_system;
5767 {
5768 return code_convert_region1 (start, end, coding_system, 0);
5769 }
5770
5771 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
5772 3, 3, "r\nzCoding system: ",
5773 "Encode the current region by specified coding system.\n\
5774 When called from a program, takes three arguments:\n\
5775 START, END, and CODING-SYSTEM. START and END are buffer positions.\n\
5776 This function sets `last-coding-system-used' to the precise coding system\n\
5777 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
5778 not fully specified.)\n\
5779 It returns the length of the encoded text.")
5780 (start, end, coding_system)
5781 Lisp_Object start, end, coding_system;
5782 {
5783 return code_convert_region1 (start, end, coding_system, 1);
5784 }
5785
5786 Lisp_Object
5787 code_convert_string1 (string, coding_system, nocopy, encodep)
5788 Lisp_Object string, coding_system, nocopy;
5789 int encodep;
5790 {
5791 struct coding_system coding;
5792
5793 CHECK_STRING (string, 0);
5794 CHECK_SYMBOL (coding_system, 1);
5795
5796 if (NILP (coding_system))
5797 return (NILP (nocopy) ? Fcopy_sequence (string) : string);
5798
5799 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
5800 error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data);
5801
5802 coding.mode |= CODING_MODE_LAST_BLOCK;
5803 string = (encodep
5804 ? encode_coding_string (string, &coding, !NILP (nocopy))
5805 : decode_coding_string (string, &coding, !NILP (nocopy)));
5806 Vlast_coding_system_used = coding.symbol;
5807
5808 return string;
5809 }
5810
5811 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
5812 2, 3, 0,
5813 "Decode STRING which is encoded in CODING-SYSTEM, and return the result.\n\
5814 Optional arg NOCOPY non-nil means it is ok to return STRING itself\n\
5815 if the decoding operation is trivial.\n\
5816 This function sets `last-coding-system-used' to the precise coding system\n\
5817 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
5818 not fully specified.)")
5819 (string, coding_system, nocopy)
5820 Lisp_Object string, coding_system, nocopy;
5821 {
5822 return code_convert_string1 (string, coding_system, nocopy, 0);
5823 }
5824
5825 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
5826 2, 3, 0,
5827 "Encode STRING to CODING-SYSTEM, and return the result.\n\
5828 Optional arg NOCOPY non-nil means it is ok to return STRING itself\n\
5829 if the encoding operation is trivial.\n\
5830 This function sets `last-coding-system-used' to the precise coding system\n\
5831 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
5832 not fully specified.)")
5833 (string, coding_system, nocopy)
5834 Lisp_Object string, coding_system, nocopy;
5835 {
5836 return code_convert_string1 (string, coding_system, nocopy, 1);
5837 }
5838
5839 /* Encode or decode STRING according to CODING_SYSTEM.
5840 Do not set Vlast_coding_system_used.
5841
5842 This function is called only from macros DECODE_FILE and
5843 ENCODE_FILE, thus we ignore character composition. */
5844
5845 Lisp_Object
5846 code_convert_string_norecord (string, coding_system, encodep)
5847 Lisp_Object string, coding_system;
5848 int encodep;
5849 {
5850 struct coding_system coding;
5851
5852 CHECK_STRING (string, 0);
5853 CHECK_SYMBOL (coding_system, 1);
5854
5855 if (NILP (coding_system))
5856 return string;
5857
5858 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
5859 error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data);
5860
5861 coding.composing = COMPOSITION_DISABLED;
5862 coding.mode |= CODING_MODE_LAST_BLOCK;
5863 return (encodep
5864 ? encode_coding_string (string, &coding, 1)
5865 : decode_coding_string (string, &coding, 1));
5866 }
5867 \f
5868 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
5869 "Decode a Japanese character which has CODE in shift_jis encoding.\n\
5870 Return the corresponding character.")
5871 (code)
5872 Lisp_Object code;
5873 {
5874 unsigned char c1, c2, s1, s2;
5875 Lisp_Object val;
5876
5877 CHECK_NUMBER (code, 0);
5878 s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
5879 if (s1 == 0)
5880 {
5881 if (s2 < 0x80)
5882 XSETFASTINT (val, s2);
5883 else if (s2 >= 0xA0 || s2 <= 0xDF)
5884 XSETFASTINT (val, MAKE_CHAR (charset_katakana_jisx0201, s2, 0));
5885 else
5886 error ("Invalid Shift JIS code: %x", XFASTINT (code));
5887 }
5888 else
5889 {
5890 if ((s1 < 0x80 || s1 > 0x9F && s1 < 0xE0 || s1 > 0xEF)
5891 || (s2 < 0x40 || s2 == 0x7F || s2 > 0xFC))
5892 error ("Invalid Shift JIS code: %x", XFASTINT (code));
5893 DECODE_SJIS (s1, s2, c1, c2);
5894 XSETFASTINT (val, MAKE_CHAR (charset_jisx0208, c1, c2));
5895 }
5896 return val;
5897 }
5898
5899 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
5900 "Encode a Japanese character CHAR to shift_jis encoding.\n\
5901 Return the corresponding code in SJIS.")
5902 (ch)
5903 Lisp_Object ch;
5904 {
5905 int charset, c1, c2, s1, s2;
5906 Lisp_Object val;
5907
5908 CHECK_NUMBER (ch, 0);
5909 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
5910 if (charset == CHARSET_ASCII)
5911 {
5912 val = ch;
5913 }
5914 else if (charset == charset_jisx0208
5915 && c1 > 0x20 && c1 < 0x7F && c2 > 0x20 && c2 < 0x7F)
5916 {
5917 ENCODE_SJIS (c1, c2, s1, s2);
5918 XSETFASTINT (val, (s1 << 8) | s2);
5919 }
5920 else if (charset == charset_katakana_jisx0201
5921 && c1 > 0x20 && c2 < 0xE0)
5922 {
5923 XSETFASTINT (val, c1 | 0x80);
5924 }
5925 else
5926 error ("Can't encode to shift_jis: %d", XFASTINT (ch));
5927 return val;
5928 }
5929
5930 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
5931 "Decode a Big5 character which has CODE in BIG5 coding system.\n\
5932 Return the corresponding character.")
5933 (code)
5934 Lisp_Object code;
5935 {
5936 int charset;
5937 unsigned char b1, b2, c1, c2;
5938 Lisp_Object val;
5939
5940 CHECK_NUMBER (code, 0);
5941 b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF;
5942 if (b1 == 0)
5943 {
5944 if (b2 >= 0x80)
5945 error ("Invalid BIG5 code: %x", XFASTINT (code));
5946 val = code;
5947 }
5948 else
5949 {
5950 if ((b1 < 0xA1 || b1 > 0xFE)
5951 || (b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE))
5952 error ("Invalid BIG5 code: %x", XFASTINT (code));
5953 DECODE_BIG5 (b1, b2, charset, c1, c2);
5954 XSETFASTINT (val, MAKE_CHAR (charset, c1, c2));
5955 }
5956 return val;
5957 }
5958
5959 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
5960 "Encode the Big5 character CHAR to BIG5 coding system.\n\
5961 Return the corresponding character code in Big5.")
5962 (ch)
5963 Lisp_Object ch;
5964 {
5965 int charset, c1, c2, b1, b2;
5966 Lisp_Object val;
5967
5968 CHECK_NUMBER (ch, 0);
5969 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
5970 if (charset == CHARSET_ASCII)
5971 {
5972 val = ch;
5973 }
5974 else if ((charset == charset_big5_1
5975 && (XFASTINT (ch) >= 0x250a1 && XFASTINT (ch) <= 0x271ec))
5976 || (charset == charset_big5_2
5977 && XFASTINT (ch) >= 0x290a1 && XFASTINT (ch) <= 0x2bdb2))
5978 {
5979 ENCODE_BIG5 (charset, c1, c2, b1, b2);
5980 XSETFASTINT (val, (b1 << 8) | b2);
5981 }
5982 else
5983 error ("Can't encode to Big5: %d", XFASTINT (ch));
5984 return val;
5985 }
5986 \f
5987 DEFUN ("set-terminal-coding-system-internal",
5988 Fset_terminal_coding_system_internal,
5989 Sset_terminal_coding_system_internal, 1, 1, 0, "")
5990 (coding_system)
5991 Lisp_Object coding_system;
5992 {
5993 CHECK_SYMBOL (coding_system, 0);
5994 setup_coding_system (Fcheck_coding_system (coding_system), &terminal_coding);
5995 /* We had better not send unsafe characters to terminal. */
5996 terminal_coding.flags |= CODING_FLAG_ISO_SAFE;
5997 /* Characer composition should be disabled. */
5998 terminal_coding.composing = COMPOSITION_DISABLED;
5999 terminal_coding.src_multibyte = 1;
6000 terminal_coding.dst_multibyte = 0;
6001 return Qnil;
6002 }
6003
6004 DEFUN ("set-safe-terminal-coding-system-internal",
6005 Fset_safe_terminal_coding_system_internal,
6006 Sset_safe_terminal_coding_system_internal, 1, 1, 0, "")
6007 (coding_system)
6008 Lisp_Object coding_system;
6009 {
6010 CHECK_SYMBOL (coding_system, 0);
6011 setup_coding_system (Fcheck_coding_system (coding_system),
6012 &safe_terminal_coding);
6013 /* Characer composition should be disabled. */
6014 safe_terminal_coding.composing = COMPOSITION_DISABLED;
6015 safe_terminal_coding.src_multibyte = 1;
6016 safe_terminal_coding.dst_multibyte = 0;
6017 return Qnil;
6018 }
6019
6020 DEFUN ("terminal-coding-system",
6021 Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
6022 "Return coding system specified for terminal output.")
6023 ()
6024 {
6025 return terminal_coding.symbol;
6026 }
6027
6028 DEFUN ("set-keyboard-coding-system-internal",
6029 Fset_keyboard_coding_system_internal,
6030 Sset_keyboard_coding_system_internal, 1, 1, 0, "")
6031 (coding_system)
6032 Lisp_Object coding_system;
6033 {
6034 CHECK_SYMBOL (coding_system, 0);
6035 setup_coding_system (Fcheck_coding_system (coding_system), &keyboard_coding);
6036 /* Characer composition should be disabled. */
6037 keyboard_coding.composing = COMPOSITION_DISABLED;
6038 return Qnil;
6039 }
6040
6041 DEFUN ("keyboard-coding-system",
6042 Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
6043 "Return coding system specified for decoding keyboard input.")
6044 ()
6045 {
6046 return keyboard_coding.symbol;
6047 }
6048
6049 \f
6050 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
6051 Sfind_operation_coding_system, 1, MANY, 0,
6052 "Choose a coding system for an operation based on the target name.\n\
6053 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).\n\
6054 DECODING-SYSTEM is the coding system to use for decoding\n\
6055 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system\n\
6056 for encoding (in case OPERATION does encoding).\n\
6057 \n\
6058 The first argument OPERATION specifies an I/O primitive:\n\
6059 For file I/O, `insert-file-contents' or `write-region'.\n\
6060 For process I/O, `call-process', `call-process-region', or `start-process'.\n\
6061 For network I/O, `open-network-stream'.\n\
6062 \n\
6063 The remaining arguments should be the same arguments that were passed\n\
6064 to the primitive. Depending on which primitive, one of those arguments\n\
6065 is selected as the TARGET. For example, if OPERATION does file I/O,\n\
6066 whichever argument specifies the file name is TARGET.\n\
6067 \n\
6068 TARGET has a meaning which depends on OPERATION:\n\
6069 For file I/O, TARGET is a file name.\n\
6070 For process I/O, TARGET is a process name.\n\
6071 For network I/O, TARGET is a service name or a port number\n\
6072 \n\
6073 This function looks up what specified for TARGET in,\n\
6074 `file-coding-system-alist', `process-coding-system-alist',\n\
6075 or `network-coding-system-alist' depending on OPERATION.\n\
6076 They may specify a coding system, a cons of coding systems,\n\
6077 or a function symbol to call.\n\
6078 In the last case, we call the function with one argument,\n\
6079 which is a list of all the arguments given to this function.")
6080 (nargs, args)
6081 int nargs;
6082 Lisp_Object *args;
6083 {
6084 Lisp_Object operation, target_idx, target, val;
6085 register Lisp_Object chain;
6086
6087 if (nargs < 2)
6088 error ("Too few arguments");
6089 operation = args[0];
6090 if (!SYMBOLP (operation)
6091 || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
6092 error ("Invalid first arguement");
6093 if (nargs < 1 + XINT (target_idx))
6094 error ("Too few arguments for operation: %s",
6095 XSYMBOL (operation)->name->data);
6096 target = args[XINT (target_idx) + 1];
6097 if (!(STRINGP (target)
6098 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
6099 error ("Invalid %dth argument", XINT (target_idx) + 1);
6100
6101 chain = ((EQ (operation, Qinsert_file_contents)
6102 || EQ (operation, Qwrite_region))
6103 ? Vfile_coding_system_alist
6104 : (EQ (operation, Qopen_network_stream)
6105 ? Vnetwork_coding_system_alist
6106 : Vprocess_coding_system_alist));
6107 if (NILP (chain))
6108 return Qnil;
6109
6110 for (; CONSP (chain); chain = XCDR (chain))
6111 {
6112 Lisp_Object elt;
6113 elt = XCAR (chain);
6114
6115 if (CONSP (elt)
6116 && ((STRINGP (target)
6117 && STRINGP (XCAR (elt))
6118 && fast_string_match (XCAR (elt), target) >= 0)
6119 || (INTEGERP (target) && EQ (target, XCAR (elt)))))
6120 {
6121 val = XCDR (elt);
6122 /* Here, if VAL is both a valid coding system and a valid
6123 function symbol, we return VAL as a coding system. */
6124 if (CONSP (val))
6125 return val;
6126 if (! SYMBOLP (val))
6127 return Qnil;
6128 if (! NILP (Fcoding_system_p (val)))
6129 return Fcons (val, val);
6130 if (! NILP (Ffboundp (val)))
6131 {
6132 val = call1 (val, Flist (nargs, args));
6133 if (CONSP (val))
6134 return val;
6135 if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
6136 return Fcons (val, val);
6137 }
6138 return Qnil;
6139 }
6140 }
6141 return Qnil;
6142 }
6143
6144 DEFUN ("update-coding-systems-internal", Fupdate_coding_systems_internal,
6145 Supdate_coding_systems_internal, 0, 0, 0,
6146 "Update internal database for ISO2022 and CCL based coding systems.\n\
6147 When values of any coding categories are changed, you must\n\
6148 call this function")
6149 ()
6150 {
6151 int i;
6152
6153 for (i = CODING_CATEGORY_IDX_EMACS_MULE; i < CODING_CATEGORY_IDX_MAX; i++)
6154 {
6155 Lisp_Object val;
6156
6157 val = XSYMBOL (XVECTOR (Vcoding_category_table)->contents[i])->value;
6158 if (!NILP (val))
6159 {
6160 if (! coding_system_table[i])
6161 coding_system_table[i] = ((struct coding_system *)
6162 xmalloc (sizeof (struct coding_system)));
6163 setup_coding_system (val, coding_system_table[i]);
6164 }
6165 else if (coding_system_table[i])
6166 {
6167 xfree (coding_system_table[i]);
6168 coding_system_table[i] = NULL;
6169 }
6170 }
6171
6172 return Qnil;
6173 }
6174
6175 DEFUN ("set-coding-priority-internal", Fset_coding_priority_internal,
6176 Sset_coding_priority_internal, 0, 0, 0,
6177 "Update internal database for the current value of `coding-category-list'.\n\
6178 This function is internal use only.")
6179 ()
6180 {
6181 int i = 0, idx;
6182 Lisp_Object val;
6183
6184 val = Vcoding_category_list;
6185
6186 while (CONSP (val) && i < CODING_CATEGORY_IDX_MAX)
6187 {
6188 if (! SYMBOLP (XCAR (val)))
6189 break;
6190 idx = XFASTINT (Fget (XCAR (val), Qcoding_category_index));
6191 if (idx >= CODING_CATEGORY_IDX_MAX)
6192 break;
6193 coding_priorities[i++] = (1 << idx);
6194 val = XCDR (val);
6195 }
6196 /* If coding-category-list is valid and contains all coding
6197 categories, `i' should be CODING_CATEGORY_IDX_MAX now. If not,
6198 the following code saves Emacs from crashing. */
6199 while (i < CODING_CATEGORY_IDX_MAX)
6200 coding_priorities[i++] = CODING_CATEGORY_MASK_RAW_TEXT;
6201
6202 return Qnil;
6203 }
6204
6205 #endif /* emacs */
6206
6207 \f
6208 /*** 9. Post-amble ***/
6209
6210 void
6211 init_coding ()
6212 {
6213 conversion_buffer = (char *) xmalloc (MINIMUM_CONVERSION_BUFFER_SIZE);
6214 }
6215
6216 void
6217 init_coding_once ()
6218 {
6219 int i;
6220
6221 /* Emacs' internal format specific initialize routine. */
6222 for (i = 0; i <= 0x20; i++)
6223 emacs_code_class[i] = EMACS_control_code;
6224 emacs_code_class[0x0A] = EMACS_linefeed_code;
6225 emacs_code_class[0x0D] = EMACS_carriage_return_code;
6226 for (i = 0x21 ; i < 0x7F; i++)
6227 emacs_code_class[i] = EMACS_ascii_code;
6228 emacs_code_class[0x7F] = EMACS_control_code;
6229 for (i = 0x80; i < 0xFF; i++)
6230 emacs_code_class[i] = EMACS_invalid_code;
6231 emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
6232 emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
6233 emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
6234 emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;
6235
6236 /* ISO2022 specific initialize routine. */
6237 for (i = 0; i < 0x20; i++)
6238 iso_code_class[i] = ISO_control_0;
6239 for (i = 0x21; i < 0x7F; i++)
6240 iso_code_class[i] = ISO_graphic_plane_0;
6241 for (i = 0x80; i < 0xA0; i++)
6242 iso_code_class[i] = ISO_control_1;
6243 for (i = 0xA1; i < 0xFF; i++)
6244 iso_code_class[i] = ISO_graphic_plane_1;
6245 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
6246 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
6247 iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
6248 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
6249 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
6250 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
6251 iso_code_class[ISO_CODE_ESC] = ISO_escape;
6252 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
6253 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
6254 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
6255
6256 conversion_buffer_size = MINIMUM_CONVERSION_BUFFER_SIZE;
6257
6258 setup_coding_system (Qnil, &keyboard_coding);
6259 setup_coding_system (Qnil, &terminal_coding);
6260 setup_coding_system (Qnil, &safe_terminal_coding);
6261 setup_coding_system (Qnil, &default_buffer_file_coding);
6262
6263 bzero (coding_system_table, sizeof coding_system_table);
6264
6265 bzero (ascii_skip_code, sizeof ascii_skip_code);
6266 for (i = 0; i < 128; i++)
6267 ascii_skip_code[i] = 1;
6268
6269 #if defined (MSDOS) || defined (WINDOWSNT)
6270 system_eol_type = CODING_EOL_CRLF;
6271 #else
6272 system_eol_type = CODING_EOL_LF;
6273 #endif
6274
6275 inhibit_pre_post_conversion = 0;
6276 }
6277
6278 #ifdef emacs
6279
6280 void
6281 syms_of_coding ()
6282 {
6283 Qtarget_idx = intern ("target-idx");
6284 staticpro (&Qtarget_idx);
6285
6286 Qcoding_system_history = intern ("coding-system-history");
6287 staticpro (&Qcoding_system_history);
6288 Fset (Qcoding_system_history, Qnil);
6289
6290 /* Target FILENAME is the first argument. */
6291 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
6292 /* Target FILENAME is the third argument. */
6293 Fput (Qwrite_region, Qtarget_idx, make_number (2));
6294
6295 Qcall_process = intern ("call-process");
6296 staticpro (&Qcall_process);
6297 /* Target PROGRAM is the first argument. */
6298 Fput (Qcall_process, Qtarget_idx, make_number (0));
6299
6300 Qcall_process_region = intern ("call-process-region");
6301 staticpro (&Qcall_process_region);
6302 /* Target PROGRAM is the third argument. */
6303 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
6304
6305 Qstart_process = intern ("start-process");
6306 staticpro (&Qstart_process);
6307 /* Target PROGRAM is the third argument. */
6308 Fput (Qstart_process, Qtarget_idx, make_number (2));
6309
6310 Qopen_network_stream = intern ("open-network-stream");
6311 staticpro (&Qopen_network_stream);
6312 /* Target SERVICE is the fourth argument. */
6313 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
6314
6315 Qcoding_system = intern ("coding-system");
6316 staticpro (&Qcoding_system);
6317
6318 Qeol_type = intern ("eol-type");
6319 staticpro (&Qeol_type);
6320
6321 Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
6322 staticpro (&Qbuffer_file_coding_system);
6323
6324 Qpost_read_conversion = intern ("post-read-conversion");
6325 staticpro (&Qpost_read_conversion);
6326
6327 Qpre_write_conversion = intern ("pre-write-conversion");
6328 staticpro (&Qpre_write_conversion);
6329
6330 Qno_conversion = intern ("no-conversion");
6331 staticpro (&Qno_conversion);
6332
6333 Qundecided = intern ("undecided");
6334 staticpro (&Qundecided);
6335
6336 Qcoding_system_p = intern ("coding-system-p");
6337 staticpro (&Qcoding_system_p);
6338
6339 Qcoding_system_error = intern ("coding-system-error");
6340 staticpro (&Qcoding_system_error);
6341
6342 Fput (Qcoding_system_error, Qerror_conditions,
6343 Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
6344 Fput (Qcoding_system_error, Qerror_message,
6345 build_string ("Invalid coding system"));
6346
6347 Qcoding_category = intern ("coding-category");
6348 staticpro (&Qcoding_category);
6349 Qcoding_category_index = intern ("coding-category-index");
6350 staticpro (&Qcoding_category_index);
6351
6352 Vcoding_category_table
6353 = Fmake_vector (make_number (CODING_CATEGORY_IDX_MAX), Qnil);
6354 staticpro (&Vcoding_category_table);
6355 {
6356 int i;
6357 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
6358 {
6359 XVECTOR (Vcoding_category_table)->contents[i]
6360 = intern (coding_category_name[i]);
6361 Fput (XVECTOR (Vcoding_category_table)->contents[i],
6362 Qcoding_category_index, make_number (i));
6363 }
6364 }
6365
6366 Qtranslation_table = intern ("translation-table");
6367 staticpro (&Qtranslation_table);
6368 Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (1));
6369
6370 Qtranslation_table_id = intern ("translation-table-id");
6371 staticpro (&Qtranslation_table_id);
6372
6373 Qtranslation_table_for_decode = intern ("translation-table-for-decode");
6374 staticpro (&Qtranslation_table_for_decode);
6375
6376 Qtranslation_table_for_encode = intern ("translation-table-for-encode");
6377 staticpro (&Qtranslation_table_for_encode);
6378
6379 Qsafe_chars = intern ("safe-chars");
6380 staticpro (&Qsafe_chars);
6381
6382 Qchar_coding_system = intern ("char-coding-system");
6383 staticpro (&Qchar_coding_system);
6384
6385 /* Intern this now in case it isn't already done.
6386 Setting this variable twice is harmless.
6387 But don't staticpro it here--that is done in alloc.c. */
6388 Qchar_table_extra_slots = intern ("char-table-extra-slots");
6389 Fput (Qsafe_chars, Qchar_table_extra_slots, make_number (0));
6390 Fput (Qchar_coding_system, Qchar_table_extra_slots, make_number (1));
6391
6392 Qvalid_codes = intern ("valid-codes");
6393 staticpro (&Qvalid_codes);
6394
6395 Qemacs_mule = intern ("emacs-mule");
6396 staticpro (&Qemacs_mule);
6397
6398 Qraw_text = intern ("raw-text");
6399 staticpro (&Qraw_text);
6400
6401 defsubr (&Scoding_system_p);
6402 defsubr (&Sread_coding_system);
6403 defsubr (&Sread_non_nil_coding_system);
6404 defsubr (&Scheck_coding_system);
6405 defsubr (&Sdetect_coding_region);
6406 defsubr (&Sdetect_coding_string);
6407 defsubr (&Sfind_coding_systems_region_internal);
6408 defsubr (&Sdecode_coding_region);
6409 defsubr (&Sencode_coding_region);
6410 defsubr (&Sdecode_coding_string);
6411 defsubr (&Sencode_coding_string);
6412 defsubr (&Sdecode_sjis_char);
6413 defsubr (&Sencode_sjis_char);
6414 defsubr (&Sdecode_big5_char);
6415 defsubr (&Sencode_big5_char);
6416 defsubr (&Sset_terminal_coding_system_internal);
6417 defsubr (&Sset_safe_terminal_coding_system_internal);
6418 defsubr (&Sterminal_coding_system);
6419 defsubr (&Sset_keyboard_coding_system_internal);
6420 defsubr (&Skeyboard_coding_system);
6421 defsubr (&Sfind_operation_coding_system);
6422 defsubr (&Supdate_coding_systems_internal);
6423 defsubr (&Sset_coding_priority_internal);
6424
6425 DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
6426 "List of coding systems.\n\
6427 \n\
6428 Do not alter the value of this variable manually. This variable should be\n\
6429 updated by the functions `make-coding-system' and\n\
6430 `define-coding-system-alias'.");
6431 Vcoding_system_list = Qnil;
6432
6433 DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
6434 "Alist of coding system names.\n\
6435 Each element is one element list of coding system name.\n\
6436 This variable is given to `completing-read' as TABLE argument.\n\
6437 \n\
6438 Do not alter the value of this variable manually. This variable should be\n\
6439 updated by the functions `make-coding-system' and\n\
6440 `define-coding-system-alias'.");
6441 Vcoding_system_alist = Qnil;
6442
6443 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
6444 "List of coding-categories (symbols) ordered by priority.");
6445 {
6446 int i;
6447
6448 Vcoding_category_list = Qnil;
6449 for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
6450 Vcoding_category_list
6451 = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
6452 Vcoding_category_list);
6453 }
6454
6455 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
6456 "Specify the coding system for read operations.\n\
6457 It is useful to bind this variable with `let', but do not set it globally.\n\
6458 If the value is a coding system, it is used for decoding on read operation.\n\
6459 If not, an appropriate element is used from one of the coding system alists:\n\
6460 There are three such tables, `file-coding-system-alist',\n\
6461 `process-coding-system-alist', and `network-coding-system-alist'.");
6462 Vcoding_system_for_read = Qnil;
6463
6464 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
6465 "Specify the coding system for write operations.\n\
6466 Programs bind this variable with `let', but you should not set it globally.\n\
6467 If the value is a coding system, it is used for encoding of output,\n\
6468 when writing it to a file and when sending it to a file or subprocess.\n\
6469 \n\
6470 If this does not specify a coding system, an appropriate element\n\
6471 is used from one of the coding system alists:\n\
6472 There are three such tables, `file-coding-system-alist',\n\
6473 `process-coding-system-alist', and `network-coding-system-alist'.\n\
6474 For output to files, if the above procedure does not specify a coding system,\n\
6475 the value of `buffer-file-coding-system' is used.");
6476 Vcoding_system_for_write = Qnil;
6477
6478 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
6479 "Coding system used in the latest file or process I/O.");
6480 Vlast_coding_system_used = Qnil;
6481
6482 DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
6483 "*Non-nil means always inhibit code conversion of end-of-line format.\n\
6484 See info node `Coding Systems' and info node `Text and Binary' concerning\n\
6485 such conversion.");
6486 inhibit_eol_conversion = 0;
6487
6488 DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
6489 "Non-nil means process buffer inherits coding system of process output.\n\
6490 Bind it to t if the process output is to be treated as if it were a file\n\
6491 read from some filesystem.");
6492 inherit_process_coding_system = 0;
6493
6494 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
6495 "Alist to decide a coding system to use for a file I/O operation.\n\
6496 The format is ((PATTERN . VAL) ...),\n\
6497 where PATTERN is a regular expression matching a file name,\n\
6498 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
6499 If VAL is a coding system, it is used for both decoding and encoding\n\
6500 the file contents.\n\
6501 If VAL is a cons of coding systems, the car part is used for decoding,\n\
6502 and the cdr part is used for encoding.\n\
6503 If VAL is a function symbol, the function must return a coding system\n\
6504 or a cons of coding systems which are used as above.\n\
6505 \n\
6506 See also the function `find-operation-coding-system'\n\
6507 and the variable `auto-coding-alist'.");
6508 Vfile_coding_system_alist = Qnil;
6509
6510 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
6511 "Alist to decide a coding system to use for a process I/O operation.\n\
6512 The format is ((PATTERN . VAL) ...),\n\
6513 where PATTERN is a regular expression matching a program name,\n\
6514 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
6515 If VAL is a coding system, it is used for both decoding what received\n\
6516 from the program and encoding what sent to the program.\n\
6517 If VAL is a cons of coding systems, the car part is used for decoding,\n\
6518 and the cdr part is used for encoding.\n\
6519 If VAL is a function symbol, the function must return a coding system\n\
6520 or a cons of coding systems which are used as above.\n\
6521 \n\
6522 See also the function `find-operation-coding-system'.");
6523 Vprocess_coding_system_alist = Qnil;
6524
6525 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
6526 "Alist to decide a coding system to use for a network I/O operation.\n\
6527 The format is ((PATTERN . VAL) ...),\n\
6528 where PATTERN is a regular expression matching a network service name\n\
6529 or is a port number to connect to,\n\
6530 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
6531 If VAL is a coding system, it is used for both decoding what received\n\
6532 from the network stream and encoding what sent to the network stream.\n\
6533 If VAL is a cons of coding systems, the car part is used for decoding,\n\
6534 and the cdr part is used for encoding.\n\
6535 If VAL is a function symbol, the function must return a coding system\n\
6536 or a cons of coding systems which are used as above.\n\
6537 \n\
6538 See also the function `find-operation-coding-system'.");
6539 Vnetwork_coding_system_alist = Qnil;
6540
6541 DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
6542 "Coding system to use with system messages.");
6543 Vlocale_coding_system = Qnil;
6544
6545 /* The eol mnemonics are reset in startup.el system-dependently. */
6546 DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
6547 "*String displayed in mode line for UNIX-like (LF) end-of-line format.");
6548 eol_mnemonic_unix = build_string (":");
6549
6550 DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
6551 "*String displayed in mode line for DOS-like (CRLF) end-of-line format.");
6552 eol_mnemonic_dos = build_string ("\\");
6553
6554 DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
6555 "*String displayed in mode line for MAC-like (CR) end-of-line format.");
6556 eol_mnemonic_mac = build_string ("/");
6557
6558 DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
6559 "*String displayed in mode line when end-of-line format is not yet determined.");
6560 eol_mnemonic_undecided = build_string (":");
6561
6562 DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
6563 "*Non-nil enables character translation while encoding and decoding.");
6564 Venable_character_translation = Qt;
6565
6566 DEFVAR_LISP ("standard-translation-table-for-decode",
6567 &Vstandard_translation_table_for_decode,
6568 "Table for translating characters while decoding.");
6569 Vstandard_translation_table_for_decode = Qnil;
6570
6571 DEFVAR_LISP ("standard-translation-table-for-encode",
6572 &Vstandard_translation_table_for_encode,
6573 "Table for translationg characters while encoding.");
6574 Vstandard_translation_table_for_encode = Qnil;
6575
6576 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
6577 "Alist of charsets vs revision numbers.\n\
6578 While encoding, if a charset (car part of an element) is found,\n\
6579 designate it with the escape sequence identifing revision (cdr part of the element).");
6580 Vcharset_revision_alist = Qnil;
6581
6582 DEFVAR_LISP ("default-process-coding-system",
6583 &Vdefault_process_coding_system,
6584 "Cons of coding systems used for process I/O by default.\n\
6585 The car part is used for decoding a process output,\n\
6586 the cdr part is used for encoding a text to be sent to a process.");
6587 Vdefault_process_coding_system = Qnil;
6588
6589 DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
6590 "Table of extra Latin codes in the range 128..159 (inclusive).\n\
6591 This is a vector of length 256.\n\
6592 If Nth element is non-nil, the existence of code N in a file\n\
6593 \(or output of subprocess) doesn't prevent it to be detected as\n\
6594 a coding system of ISO 2022 variant which has a flag\n\
6595 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file\n\
6596 or reading output of a subprocess.\n\
6597 Only 128th through 159th elements has a meaning.");
6598 Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
6599
6600 DEFVAR_LISP ("select-safe-coding-system-function",
6601 &Vselect_safe_coding_system_function,
6602 "Function to call to select safe coding system for encoding a text.\n\
6603 \n\
6604 If set, this function is called to force a user to select a proper\n\
6605 coding system which can encode the text in the case that a default\n\
6606 coding system used in each operation can't encode the text.\n\
6607 \n\
6608 The default value is `select-safe-coding-system' (which see).");
6609 Vselect_safe_coding_system_function = Qnil;
6610
6611 DEFVAR_LISP ("char-coding-system-table", &Vchar_coding_system_table,
6612 "Char-table containing safe coding systems of each characters.\n\
6613 Each element doesn't include such generic coding systems that can\n\
6614 encode any characters. They are in the first extra slot.");
6615 Vchar_coding_system_table = Fmake_char_table (Qchar_coding_system, Qnil);
6616
6617 DEFVAR_BOOL ("inhibit-iso-escape-detection",
6618 &inhibit_iso_escape_detection,
6619 "If non-nil, Emacs ignores ISO2022's escape sequence on code detection.\n\
6620 \n\
6621 By default, on reading a file, Emacs tries to detect how the text is\n\
6622 encoded. This code detection is sensitive to escape sequences. If\n\
6623 the sequence is valid as ISO2022, the code is determined as one of\n\
6624 the ISO2022 encodings, and the file is decoded by the corresponding\n\
6625 coding system (e.g. `iso-2022-7bit').\n\
6626 \n\
6627 However, there may be a case that you want to read escape sequences in\n\
6628 a file as is. In such a case, you can set this variable to non-nil.\n\
6629 Then, as the code detection ignores any escape sequences, no file is\n\
6630 detected as encoded in some ISO2022 encoding. The result is that all\n\
6631 escape sequences become visible in a buffer.\n\
6632 \n\
6633 The default value is nil, and it is strongly recommended not to change\n\
6634 it. That is because many Emacs Lisp source files that contain\n\
6635 non-ASCII characters are encoded by the coding system `iso-2022-7bit'\n\
6636 in Emacs's distribution, and they won't be decoded correctly on\n\
6637 reading if you suppress escape sequence detection.\n\
6638 \n\
6639 The other way to read escape sequences in a file without decoding is\n\
6640 to explicitly specify some coding system that doesn't use ISO2022's\n\
6641 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument].");
6642 inhibit_iso_escape_detection = 0;
6643 }
6644
6645 char *
6646 emacs_strerror (error_number)
6647 int error_number;
6648 {
6649 char *str;
6650
6651 synchronize_system_messages_locale ();
6652 str = strerror (error_number);
6653
6654 if (! NILP (Vlocale_coding_system))
6655 {
6656 Lisp_Object dec = code_convert_string_norecord (build_string (str),
6657 Vlocale_coding_system,
6658 0);
6659 str = (char *) XSTRING (dec)->data;
6660 }
6661
6662 return str;
6663 }
6664
6665 #endif /* emacs */
6666