(xim_initialize): Redo 6/24 change.
[bpt/emacs.git] / src / coding.c
1 /* Coding system handler (conversion, detection, and etc).
2 Copyright (C) 1995,97,1998,2002,2003 Electrotechnical Laboratory, JAPAN.
3 Licensed to the Free Software Foundation.
4 Copyright (C) 2001,2002,2003 Free Software Foundation, Inc.
5
6 This file is part of GNU Emacs.
7
8 GNU Emacs is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 2, or (at your option)
11 any later version.
12
13 GNU Emacs is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GNU Emacs; see the file COPYING. If not, write to
20 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
21 Boston, MA 02111-1307, USA. */
22
23 /*** TABLE OF CONTENTS ***
24
25 0. General comments
26 1. Preamble
27 2. Emacs' internal format (emacs-mule) handlers
28 3. ISO2022 handlers
29 4. Shift-JIS and BIG5 handlers
30 5. CCL handlers
31 6. End-of-line handlers
32 7. C library functions
33 8. Emacs Lisp library functions
34 9. Post-amble
35
36 */
37
38 /*** 0. General comments ***/
39
40
41 /*** GENERAL NOTE on CODING SYSTEMS ***
42
43 A coding system is an encoding mechanism for one or more character
44 sets. Here's a list of coding systems which Emacs can handle. When
45 we say "decode", it means converting some other coding system to
46 Emacs' internal format (emacs-mule), and when we say "encode",
47 it means converting the coding system emacs-mule to some other
48 coding system.
49
50 0. Emacs' internal format (emacs-mule)
51
52 Emacs itself holds a multi-lingual character in buffers and strings
53 in a special format. Details are described in section 2.
54
55 1. ISO2022
56
57 The most famous coding system for multiple character sets. X's
58 Compound Text, various EUCs (Extended Unix Code), and coding
59 systems used in Internet communication such as ISO-2022-JP are
60 all variants of ISO2022. Details are described in section 3.
61
62 2. SJIS (or Shift-JIS or MS-Kanji-Code)
63
64 A coding system to encode character sets: ASCII, JISX0201, and
65 JISX0208. Widely used for PC's in Japan. Details are described in
66 section 4.
67
68 3. BIG5
69
70 A coding system to encode the character sets ASCII and Big5. Widely
71 used for Chinese (mainly in Taiwan and Hong Kong). Details are
72 described in section 4. In this file, when we write "BIG5"
73 (all uppercase), we mean the coding system, and when we write
74 "Big5" (capitalized), we mean the character set.
75
76 4. Raw text
77
78 A coding system for text containing random 8-bit code. Emacs does
79 no code conversion on such text except for end-of-line format.
80
81 5. Other
82
83 If a user wants to read/write text encoded in a coding system not
84 listed above, he can supply a decoder and an encoder for it as CCL
85 (Code Conversion Language) programs. Emacs executes the CCL program
86 while reading/writing.
87
88 Emacs represents a coding system by a Lisp symbol that has a property
89 `coding-system'. But, before actually using the coding system, the
90 information about it is set in a structure of type `struct
91 coding_system' for rapid processing. See section 6 for more details.
92
93 */
94
95 /*** GENERAL NOTES on END-OF-LINE FORMAT ***
96
97 How end-of-line of text is encoded depends on the operating system.
98 For instance, Unix's format is just one byte of `line-feed' code,
99 whereas DOS's format is two-byte sequence of `carriage-return' and
100 `line-feed' codes. MacOS's format is usually one byte of
101 `carriage-return'.
102
103 Since text character encoding and end-of-line encoding are
104 independent, any coding system described above can have any
105 end-of-line format. So Emacs has information about end-of-line
106 format in each coding-system. See section 6 for more details.
107
108 */
109
110 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
111
112 These functions check if a text between SRC and SRC_END is encoded
113 in the coding system category XXX. Each returns an integer value in
114 which appropriate flag bits for the category XXX are set. The flag
115 bits are defined in macros CODING_CATEGORY_MASK_XXX. Below is the
116 template for these functions. If MULTIBYTEP is nonzero, 8-bit codes
117 of the range 0x80..0x9F are in multibyte form. */
118 #if 0
119 int
120 detect_coding_emacs_mule (src, src_end, multibytep)
121 unsigned char *src, *src_end;
122 int multibytep;
123 {
124 ...
125 }
126 #endif
127
128 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
129
130 These functions decode SRC_BYTES length of unibyte text at SOURCE
131 encoded in CODING to Emacs' internal format. The resulting
132 multibyte text goes to a place pointed to by DESTINATION, the length
133 of which should not exceed DST_BYTES.
134
135 These functions set the information about original and decoded texts
136 in the members `produced', `produced_char', `consumed', and
137 `consumed_char' of the structure *CODING. They also set the member
138 `result' to one of CODING_FINISH_XXX indicating how the decoding
139 finished.
140
141 DST_BYTES zero means that the source area and destination area are
142 overlapped, which means that we can produce a decoded text until it
143 reaches the head of the not-yet-decoded source text.
144
145 Below is a template for these functions. */
146 #if 0
147 static void
148 decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
149 struct coding_system *coding;
150 unsigned char *source, *destination;
151 int src_bytes, dst_bytes;
152 {
153 ...
154 }
155 #endif
156
157 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
158
159 These functions encode SRC_BYTES length text at SOURCE from Emacs'
160 internal multibyte format to CODING. The resulting unibyte text
161 goes to a place pointed to by DESTINATION, the length of which
162 should not exceed DST_BYTES.
163
164 These functions set the information about original and encoded texts
165 in the members `produced', `produced_char', `consumed', and
166 `consumed_char' of the structure *CODING. They also set the member
167 `result' to one of CODING_FINISH_XXX indicating how the encoding
168 finished.
169
170 DST_BYTES zero means that the source area and destination area are
171 overlapped, which means that we can produce encoded text until it
172 reaches the head of the not-yet-encoded source text.
173
174 Below is a template for these functions. */
175 #if 0
176 static void
177 encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
178 struct coding_system *coding;
179 unsigned char *source, *destination;
180 int src_bytes, dst_bytes;
181 {
182 ...
183 }
184 #endif
185
186 /*** COMMONLY USED MACROS ***/
187
188 /* The following two macros ONE_MORE_BYTE and TWO_MORE_BYTES safely
189 get one and two bytes from the source text respectively.
190 If there are not enough bytes in the source, they set
   coding->result to CODING_FINISH_INSUFFICIENT_SRC and jump to
191 `label_end_of_loop'. The caller should set variables `coding',
192 `src' and `src_end' to appropriate pointers in advance. These
193 macros are called from decoding routines `decode_coding_XXX', thus
194 it is assumed that the source text is unibyte.

   ONE_MORE_BYTE stores the byte at `src' in C1 and advances `src' by
   one.  TWO_MORE_BYTES checks that two bytes are available before
   consuming either of them, then stores them in C1 and C2 and
   advances `src' by two.  */
195
196 #define ONE_MORE_BYTE(c1) \
197 do { \
198 if (src >= src_end) \
199 { \
200 coding->result = CODING_FINISH_INSUFFICIENT_SRC; \
201 goto label_end_of_loop; \
202 } \
203 c1 = *src++; \
204 } while (0)
205
206 #define TWO_MORE_BYTES(c1, c2) \
207 do { \
208 if (src + 1 >= src_end) \
209 { \
210 coding->result = CODING_FINISH_INSUFFICIENT_SRC; \
211 goto label_end_of_loop; \
212 } \
213 c1 = *src++; \
214 c2 = *src++; \
215 } while (0)
216
217
/* Like ONE_MORE_BYTE, but 8-bit bytes of data at SRC are in multibyte
   form if MULTIBYTEP is nonzero.  In that case a byte equal to
   LEADING_CODE_8_BIT_CONTROL is followed by (8-bit code + 0x20); the
   pair is consumed and C1 is set to the original 8-bit code.

   If the source is exhausted, either before the first byte or between
   the leading code and its following byte, set coding->result to
   CODING_FINISH_INSUFFICIENT_SRC and jump to `label_end_of_loop'.
   The caller should set variables `coding', `src' and `src_end' in
   advance.  */

#define ONE_MORE_BYTE_CHECK_MULTIBYTE(c1, multibytep)		\
  do {								\
    if (src >= src_end)						\
      {								\
	coding->result = CODING_FINISH_INSUFFICIENT_SRC;	\
	goto label_end_of_loop;					\
      }								\
    c1 = *src++;						\
    if ((multibytep) && c1 == LEADING_CODE_8_BIT_CONTROL)	\
      {								\
	/* Never read the trailing byte past SRC_END.  */	\
	if (src >= src_end)					\
	  {							\
	    coding->result = CODING_FINISH_INSUFFICIENT_SRC;	\
	    goto label_end_of_loop;				\
	  }							\
	c1 = *src++ - 0x20;					\
      }								\
  } while (0)
232
233 /* Set C to the next character at the source text pointed to by `src'.
234 If there are not enough characters in the source, set coding->result
   to CODING_FINISH_INSUFFICIENT_SRC and jump to
235 `label_end_of_loop'. The caller should set variables `coding',
236 `src', `src_end', and `translation_table' to appropriate values
237 in advance. This macro is used in encoding routines
238 `encode_coding_XXX', thus it assumes that the source text is in
239 multibyte form except for 8-bit characters. 8-bit characters are
240 in multibyte form if coding->src_multibyte is nonzero, else they
241 are represented by a single byte.

   If `translation_table' is not nil, C is passed through
   translate_char before `src' is advanced past the character.  The
   macro declares its own block-local `len' and `bytes', so C must not
   be an expression that refers to variables with those names.  */
242
243 #define ONE_MORE_CHAR(c) \
244 do { \
245 int len = src_end - src; \
246 int bytes; \
247 if (len <= 0) \
248 { \
249 coding->result = CODING_FINISH_INSUFFICIENT_SRC; \
250 goto label_end_of_loop; \
251 } \
252 if (coding->src_multibyte \
253 || UNIBYTE_STR_AS_MULTIBYTE_P (src, len, bytes)) \
254 c = STRING_CHAR_AND_LENGTH (src, len, bytes); \
255 else \
256 c = *src, bytes = 1; \
257 if (!NILP (translation_table)) \
258 c = translate_char (translation_table, c, -1, 0, 0); \
259 src += bytes; \
260 } while (0)
261
262
263 /* Produce a multibyte form of character C to `dst'. Jump to
264 `label_end_of_loop' (after setting coding->result to
   CODING_FINISH_INSUFFICIENT_DST) if there's not enough space at `dst'.
   The space limit is `dst_end' when `dst_bytes' is nonzero, and `src'
   otherwise.
265
266 If we are now in the middle of a composition sequence, the decoded
267 character may be ALTCHAR (for the current composition). In that
268 case, the character goes to coding->cmp_data->data instead of
269 `dst'.

   More precisely: C is written to `dst' and coding->produced_char is
   incremented unless a composition other than COMPOSITION_RELATIVE or
   COMPOSITION_WITH_RULE is in progress; C is recorded as a composition
   component whenever a composition other than COMPOSITION_RELATIVE is
   in progress.
270
271 This macro is used in decoding routines. */
272
273 #define EMIT_CHAR(c) \
274 do { \
275 if (! COMPOSING_P (coding) \
276 || coding->composing == COMPOSITION_RELATIVE \
277 || coding->composing == COMPOSITION_WITH_RULE) \
278 { \
279 int bytes = CHAR_BYTES (c); \
280 if ((dst + bytes) > (dst_bytes ? dst_end : src)) \
281 { \
282 coding->result = CODING_FINISH_INSUFFICIENT_DST; \
283 goto label_end_of_loop; \
284 } \
285 dst += CHAR_STRING (c, dst); \
286 coding->produced_char++; \
287 } \
288 \
289 if (COMPOSING_P (coding) \
290 && coding->composing != COMPOSITION_RELATIVE) \
291 { \
292 CODING_ADD_COMPOSITION_COMPONENT (coding, c); \
293 coding->composition_rule_follows \
294 = coding->composing != COMPOSITION_WITH_ALTCHARS; \
295 } \
296 } while (0)
297
298
/* The following three macros store raw bytes at `dst' and advance it.
   Each one first checks the available space against `dst_end' when
   `dst_bytes' is nonzero, or against `src' otherwise; if the bytes do
   not fit, it sets coding->result to CODING_FINISH_INSUFFICIENT_DST
   and jumps to `label_end_of_loop' without storing anything.  The
   caller should set variables `coding', `src', `dst', `dst_end' and
   `dst_bytes' in advance.  */

/* Store the single byte C.  */
299 #define EMIT_ONE_BYTE(c) \
300 do { \
301 if (dst >= (dst_bytes ? dst_end : src)) \
302 { \
303 coding->result = CODING_FINISH_INSUFFICIENT_DST; \
304 goto label_end_of_loop; \
305 } \
306 *dst++ = c; \
307 } while (0)
308
/* Store the two bytes C1 and C2, in that order.  */
309 #define EMIT_TWO_BYTES(c1, c2) \
310 do { \
311 if (dst + 2 > (dst_bytes ? dst_end : src)) \
312 { \
313 coding->result = CODING_FINISH_INSUFFICIENT_DST; \
314 goto label_end_of_loop; \
315 } \
316 *dst++ = c1, *dst++ = c2; \
317 } while (0)
318
/* Copy the bytes in the range FROM (inclusive) to TO (exclusive).
   FROM is incremented by the copy loop, so it must be a modifiable
   pointer variable; after a successful copy it equals TO.  */
319 #define EMIT_BYTES(from, to) \
320 do { \
321 if (dst + (to - from) > (dst_bytes ? dst_end : src)) \
322 { \
323 coding->result = CODING_FINISH_INSUFFICIENT_DST; \
324 goto label_end_of_loop; \
325 } \
326 while (from < to) \
327 *dst++ = *from++; \
328 } while (0)
329
330 \f
331 /*** 1. Preamble ***/
332
333 #ifdef emacs
334 #include <config.h>
335 #endif
336
337 #include <stdio.h>
338
339 #ifdef emacs
340
341 #include "lisp.h"
342 #include "buffer.h"
343 #include "charset.h"
344 #include "composite.h"
345 #include "ccl.h"
346 #include "coding.h"
347 #include "window.h"
348 #include "intervals.h"
349
350 #else /* not emacs */
351
352 #include "mulelib.h"
353
354 #endif /* not emacs */
355
/* Lisp objects used as property and plist keys.  Within this part of
   the file, coding_safe_chars fetches the Qcoding_system property of
   a coding system symbol and looks up Qsafe_chars in the plist kept
   in that spec.  The other objects below are only declared here;
   presumably they are symbols initialized elsewhere -- TODO confirm
   against the initialization code.  */
356 Lisp_Object Qcoding_system, Qeol_type;
357 Lisp_Object Qbuffer_file_coding_system;
358 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
359 Lisp_Object Qno_conversion, Qundecided;
360 Lisp_Object Qcoding_system_history;
361 Lisp_Object Qsafe_chars;
362 Lisp_Object Qvalid_codes;
363
/* Qinsert_file_contents and Qwrite_region are defined in another
   file; the rest are defined here.  */
364 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
365 Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
366 Lisp_Object Qstart_process, Qopen_network_stream;
367 Lisp_Object Qtarget_idx;
368
369 Lisp_Object Vselect_safe_coding_system_function;
370
/* NOTE(review): not documented here and not used in the visible part
   of the file; confirm its meaning where it is set.  */
371 int coding_system_require_warning;
372
373 /* Mnemonic string for each format of end-of-line. */
374 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
375 /* Mnemonic string to indicate format of end-of-line is not yet
376 decided. */
377 Lisp_Object eol_mnemonic_undecided;
378
379 /* Format of end-of-line decided by system. This is CODING_EOL_LF on
380 Unix, CODING_EOL_CRLF on DOS/Windows, and CODING_EOL_CR on Mac. */
381 int system_eol_type;
382
383 #ifdef emacs
384
385 /* Information about which coding system is safe for which chars.
386 The value has the form (GENERIC-LIST . NON-GENERIC-ALIST).
387
388 GENERIC-LIST is a list of generic coding systems which can encode
389 any characters.
390
391 NON-GENERIC-ALIST is an alist of non generic coding systems vs the
392 corresponding char table that contains safe chars. */
393 Lisp_Object Vcoding_system_safe_chars;
394
/* NOTE(review): the next two groups carry no description in this
   file section; presumably the list/alist of known coding systems and
   the predicate/error symbols -- confirm where they are initialized.  */
395 Lisp_Object Vcoding_system_list, Vcoding_system_alist;
396
397 Lisp_Object Qcoding_system_p, Qcoding_system_error;
398
399 /* Coding system emacs-mule and raw-text are for converting only
400 end-of-line format. */
401 Lisp_Object Qemacs_mule, Qraw_text;
402
403 Lisp_Object Qutf_8;
404
405 /* Coding-systems are handed between Emacs Lisp programs and C internal
406 routines by the following three variables. */
407 /* Coding-system for reading files and receiving data from process. */
408 Lisp_Object Vcoding_system_for_read;
409 /* Coding-system for writing files and sending data to process. */
410 Lisp_Object Vcoding_system_for_write;
411 /* Coding-system actually used in the latest I/O. */
412 Lisp_Object Vlast_coding_system_used;
413
414 /* A vector of length 256 which contains information about special
415 Latin codes (especially for dealing with Microsoft codes). */
416 Lisp_Object Vlatin_extra_code_table;
417
418 /* Flag to inhibit code conversion of end-of-line format. */
419 int inhibit_eol_conversion;
420
421 /* Flag to inhibit ISO2022 escape sequence detection. */
422 int inhibit_iso_escape_detection;
423
424 /* Flag to make buffer-file-coding-system inherit from process-coding. */
425 int inherit_process_coding_system;
426
427 /* Coding system to be used to encode text for terminal display. */
428 struct coding_system terminal_coding;
429
430 /* Coding system to be used to encode text for terminal display when
431 terminal coding system is nil. */
432 struct coding_system safe_terminal_coding;
433
434 /* Coding system of what is sent from terminal keyboard. */
435 struct coding_system keyboard_coding;
436
437 /* Default coding system to be used to write a file. */
438 struct coding_system default_buffer_file_coding;
439
/* NOTE(review): undocumented here; judging by the names only, these
   are alists selecting coding systems for file, process and network
   I/O -- confirm against their Lisp-level documentation.  */
440 Lisp_Object Vfile_coding_system_alist;
441 Lisp_Object Vprocess_coding_system_alist;
442 Lisp_Object Vnetwork_coding_system_alist;
443
444 Lisp_Object Vlocale_coding_system;
445
446 #endif /* emacs */
447
448 Lisp_Object Qcoding_category, Qcoding_category_index;
449
450 /* List of symbols `coding-category-xxx' ordered by priority. */
451 Lisp_Object Vcoding_category_list;
452
453 /* Table of coding categories (Lisp symbols). */
454 Lisp_Object Vcoding_category_table;
455
456 /* Table of names of symbol for each coding-category.  The table is
   declared with CODING_CATEGORY_IDX_MAX elements and lists fifteen
   names; presumably the order matches the CODING_CATEGORY_IDX_XXX
   constants -- TODO confirm against their definition. */
457 char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
458 "coding-category-emacs-mule",
459 "coding-category-sjis",
460 "coding-category-iso-7",
461 "coding-category-iso-7-tight",
462 "coding-category-iso-8-1",
463 "coding-category-iso-8-2",
464 "coding-category-iso-7-else",
465 "coding-category-iso-8-else",
466 "coding-category-ccl",
467 "coding-category-big5",
468 "coding-category-utf-8",
469 "coding-category-utf-16-be",
470 "coding-category-utf-16-le",
471 "coding-category-raw-text",
472 "coding-category-binary"
473 };
474
475 /* Table of pointers to coding systems corresponding to each coding
476 categories. */
477 struct coding_system *coding_system_table[CODING_CATEGORY_IDX_MAX];
478
479 /* Table of coding category masks. Nth element is a mask for a coding
480 category of which priority is Nth. */
481 static
482 int coding_priorities[CODING_CATEGORY_IDX_MAX];
483
484 /* Flag to tell if we look up translation table on character code
485 conversion. */
486 Lisp_Object Venable_character_translation;
487 /* Standard translation table to look up on decoding (reading). */
488 Lisp_Object Vstandard_translation_table_for_decode;
489 /* Standard translation table to look up on encoding (writing). */
490 Lisp_Object Vstandard_translation_table_for_encode;
491
/* NOTE(review): undocumented here; presumably symbols naming
   translation-table properties -- confirm where they are initialized.  */
492 Lisp_Object Qtranslation_table;
493 Lisp_Object Qtranslation_table_id;
494 Lisp_Object Qtranslation_table_for_decode;
495 Lisp_Object Qtranslation_table_for_encode;
496
497 /* Alist of charsets vs revision number. */
498 Lisp_Object Vcharset_revision_alist;
499
500 /* Default coding systems used for process I/O. */
501 Lisp_Object Vdefault_process_coding_system;
502
503 /* Char table for translating Quail and self-inserting input. */
504 Lisp_Object Vtranslation_table_for_input;
505
506 /* Global flag to tell that we can't call post-read-conversion and
507 pre-write-conversion functions. Usually the value is zero, but it
508 is set to 1 temporarily while such functions are running. This is
509 to avoid infinite recursive call. */
510 static int inhibit_pre_post_conversion;
511
512 Lisp_Object Qchar_coding_system;
513
514 /* Return `safe-chars' property of CODING_SYSTEM (symbol). Don't check
515 its validity. */
516
517 Lisp_Object
518 coding_safe_chars (coding_system)
519 Lisp_Object coding_system;
520 {
521 Lisp_Object coding_spec, plist, safe_chars;
522
523 coding_spec = Fget (coding_system, Qcoding_system);
524 plist = XVECTOR (coding_spec)->contents[3];
525 safe_chars = Fplist_get (XVECTOR (coding_spec)->contents[3], Qsafe_chars);
526 return (CHAR_TABLE_P (safe_chars) ? safe_chars : Qt);
527 }
528
/* Nonzero if character C is safe according to SAFE_CHARS, which is
   either Qt (every character is safe) or a char table whose entry
   for C is non-nil when C is safe.  SAFE_CHARS is evaluated twice
   when it is not Qt.  */
529 #define CODING_SAFE_CHAR_P(safe_chars, c) \
530 (EQ (safe_chars, Qt) || !NILP (CHAR_TABLE_REF (safe_chars, c)))
531
532 \f
533 /*** 2. Emacs internal format (emacs-mule) handlers ***/
534
535 /* Emacs' internal format for representation of multiple character
536 sets is a kind of multi-byte encoding, i.e. characters are
537 represented by variable-length sequences of one-byte codes.
538
539 ASCII characters and control characters (e.g. `tab', `newline') are
540 represented by one-byte sequences which are their ASCII codes, in
541 the range 0x00 through 0x7F.
542
543 8-bit characters of the range 0x80..0x9F are represented by
544 two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
545 code + 0x20).
546
547 8-bit characters of the range 0xA0..0xFF are represented by
548 one-byte sequences which are their 8-bit code.
549
550 The other characters are represented by a sequence of `base
551 leading-code', optional `extended leading-code', and one or two
552 `position-code's. The length of the sequence is determined by the
553 base leading-code. Leading-code takes the range 0x81 through 0x9D,
554 whereas extended leading-code and position-code take the range 0xA0
555 through 0xFF. See `charset.h' for more details about leading-code
556 and position-code.
557
558 --- CODE RANGE of Emacs' internal format ---
559 character set range
560 ------------- -----
561 ascii 0x00..0x7F
562 eight-bit-control LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
563 eight-bit-graphic 0xA0..0xFF
564 ELSE 0x81..0x9D + [0xA0..0xFF]+
565 ---------------------------------------------
566
567 As this is the internal character representation, the format is
568 usually not used externally (i.e. in a file or in a data sent to a
569 process). But, it is possible to have a text externally in this
570 format (i.e. by encoding by the coding system `emacs-mule').
571
572 In that case, a sequence of one-byte codes has a slightly different
573 form.
574
575 Firstly, all characters in eight-bit-control are represented by
576 one-byte sequences which are their 8-bit code.
577
578 Next, character composition data are represented by the byte
579 sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
580 where,
581 METHOD is 0xF0 plus one of composition method (enum
582 composition_method),
583
584 BYTES is 0xA0 plus the byte length of these composition data,
585
586 CHARS is 0xA0 plus the number of characters composed by these
587 data,
588
589 COMPONENTs are characters of multibyte form or composition
590 rules encoded by two-byte of ASCII codes.
591
592 In addition, for backward compatibility, the following formats are
593 also recognized as composition data on decoding.
594
595 0x80 MSEQ ...
596 0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
597
598 Here,
599 MSEQ is a multibyte form but in these special format:
600 ASCII: 0xA0 ASCII_CODE+0x80,
601 other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
602 RULE is a one byte code of the range 0xA0..0xF0 that
603 represents a composition rule.
604 */
605
/* Table of 256 code classes; presumably indexed by byte value and
   filled in by initialization code elsewhere -- TODO confirm.  */
606 enum emacs_code_class_type emacs_code_class[256];
607
608 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
609 Check if a text is encoded in Emacs' internal format. If it is,
610 return CODING_CATEGORY_MASK_EMACS_MULE, else return 0. */
611
612 static int
613 detect_coding_emacs_mule (src, src_end, multibytep)
614 unsigned char *src, *src_end;
615 int multibytep;
616 {
617 unsigned char c;
618 int composing = 0;
619 /* Dummy for ONE_MORE_BYTE. */
620 struct coding_system dummy_coding;
621 struct coding_system *coding = &dummy_coding;
622
623 while (1)
624 {
625 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
626
627 if (composing)
628 {
629 if (c < 0xA0)
630 composing = 0;
631 else if (c == 0xA0)
632 {
633 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
634 c &= 0x7F;
635 }
636 else
637 c -= 0x20;
638 }
639
640 if (c < 0x20)
641 {
642 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
643 return 0;
644 }
645 else if (c >= 0x80 && c < 0xA0)
646 {
647 if (c == 0x80)
648 /* Old leading code for a composite character. */
649 composing = 1;
650 else
651 {
652 unsigned char *src_base = src - 1;
653 int bytes;
654
655 if (!UNIBYTE_STR_AS_MULTIBYTE_P (src_base, src_end - src_base,
656 bytes))
657 return 0;
658 src = src_base + bytes;
659 }
660 }
661 }
662 label_end_of_loop:
663 return CODING_CATEGORY_MASK_EMACS_MULE;
664 }
665
666
667 /* Record the starting position START and METHOD of one composition.
   A four-element header is reserved at the current end of
   CODING->cmp_data->data and its index is remembered in
   CODING->cmp_data_start.  Element 0 is set to -1 until
   CODING_ADD_COMPOSITION_END stores the real length there, element 1
   is START offset by cmp_data->char_offset, element 2 is left for the
   end position, and element 3 is METHOD. */
668
669 #define CODING_ADD_COMPOSITION_START(coding, start, method) \
670 do { \
671 struct composition_data *cmp_data = coding->cmp_data; \
672 int *data = cmp_data->data + cmp_data->used; \
673 coding->cmp_data_start = cmp_data->used; \
674 data[0] = -1; \
675 data[1] = cmp_data->char_offset + start; \
676 data[3] = (int) method; \
677 cmp_data->used += 4; \
678 } while (0)
679
680 /* Record the ending position END of the current composition.  This
   fills in the header started by CODING_ADD_COMPOSITION_START:
   element 0 becomes the number of data elements used by the
   composition (header included) and element 2 becomes END offset by
   cmp_data->char_offset. */
681
682 #define CODING_ADD_COMPOSITION_END(coding, end) \
683 do { \
684 struct composition_data *cmp_data = coding->cmp_data; \
685 int *data = cmp_data->data + coding->cmp_data_start; \
686 data[0] = cmp_data->used - coding->cmp_data_start; \
687 data[2] = cmp_data->char_offset + end; \
688 } while (0)
689
690 /* Record one COMPONENT (alternate character or composition rule).
   When the current composition reaches
   COMPOSITION_DATA_MAX_BUNCH_LENGTH elements, it is closed at
   CODING->produced_char and CODING->composing is reset to
   COMPOSITION_NO. */
691
692 #define CODING_ADD_COMPOSITION_COMPONENT(coding, component) \
693 do { \
694 coding->cmp_data->data[coding->cmp_data->used++] = component; \
695 if (coding->cmp_data->used - coding->cmp_data_start \
696 == COMPOSITION_DATA_MAX_BUNCH_LENGTH) \
697 { \
698 CODING_ADD_COMPOSITION_END (coding, coding->produced_char); \
699 coding->composing = COMPOSITION_NO; \
700 } \
701 } while (0)
702
703
704 /* Get one byte from the data pointed to by SRC and increment SRC. If
705 SRC is not less than SRC_END, return -1 without incrementing SRC.
   Unlike ONE_MORE_BYTE, this is an expression and never jumps. */
706
707 #define SAFE_ONE_MORE_BYTE() (src >= src_end ? -1 : *src++)
708
709
/* Decode a character represented as a component of composition
   sequence of Emacs 20 style at SRC.  Set C to that character, store
   its multibyte form sequence at P, and set P to the end of that
   sequence.  If no valid character is found, set C to -1.

   A component is either `0xA0 ASCII_CODE+0x80' for an ASCII character
   or `LEADING_CODE+0x20 FOLLOWING-BYTE ...' for any other character.
   C must be an int variable (it receives -1), and P a modifiable
   pointer to unsigned char.  The caller should set variables `src',
   `src_end' and `coding' in advance.  */

#define DECODE_EMACS_MULE_COMPOSITION_CHAR(c, p)		\
  do {								\
    int bytes;							\
								\
    c = SAFE_ONE_MORE_BYTE ();					\
    if (c < 0)							\
      break;							\
    if (CHAR_HEAD_P (c))					\
      c = -1;							\
    else if (c == 0xA0)						\
      {								\
	/* ASCII component: the next byte is ASCII_CODE + 0x80, \
	   so remove 0x80 (not 0xA0) to recover the code.  */	\
	c = SAFE_ONE_MORE_BYTE ();				\
	if (c < 0xA0)						\
	  c = -1;						\
	else							\
	  {							\
	    c -= 0x80;						\
	    *p++ = c;						\
	  }							\
      }								\
    else if (BASE_LEADING_CODE_P (c - 0x20))			\
      {								\
	unsigned char *p0 = p;					\
								\
	/* Undo the +0x20 applied to the leading code, then	\
	   copy the following bytes unchanged.  */		\
	c -= 0x20;						\
	*p++ = c;						\
	bytes = BYTES_BY_CHAR_HEAD (c);				\
	while (--bytes)						\
	  {							\
	    c = SAFE_ONE_MORE_BYTE ();				\
	    if (c < 0)						\
	      break;						\
	    *p++ = c;						\
	  }							\
	if (UNIBYTE_STR_AS_MULTIBYTE_P (p0, p - p0, bytes)	\
	    || (coding->flags /* We are recovering a file.  */	\
		&& p0[0] == LEADING_CODE_8_BIT_CONTROL		\
		&& ! CHAR_HEAD_P (p0[1])))			\
	  c = STRING_CHAR (p0, bytes);				\
	else							\
	  c = -1;						\
      }								\
    else							\
      c = -1;							\
  } while (0)
760
761
762 /* Decode a composition rule represented as a component of composition
763 sequence of Emacs 20 style at SRC. Set C to the rule. If no
764 valid rule is found, set C to -1.

   The rule is one byte whose value minus 0xA0 must be in the range
   0..80; it is split into GREF (quotient by 9) and NREF (remainder)
   and re-encoded with COMPOSITION_ENCODE_RULE.  An exhausted source
   also yields -1.  The caller must provide int variables `gref' and
   `nref', which this macro overwrites, as well as `src' and
   `src_end'. */
765
766 #define DECODE_EMACS_MULE_COMPOSITION_RULE(c) \
767 do { \
768 c = SAFE_ONE_MORE_BYTE (); \
769 c -= 0xA0; \
770 if (c < 0 || c >= 81) \
771 c = -1; \
772 else \
773 { \
774 gref = c / 9, nref = c % 9; \
775 c = COMPOSITION_ENCODE_RULE (gref, nref); \
776 } \
777 } while (0)
778
779
780 /* Decode composition sequence encoded by `emacs-mule' at the source
781 pointed by SRC. SRC_END is the end of source. Store information
782 of the composition in CODING->cmp_data.
783
784 For backward compatibility, decode also a composition sequence of
785 Emacs 20 style. In that case, the composition sequence contains
786 characters that should be extracted into a buffer or string. Store
787 those characters at *DESTINATION in multibyte form.
788
789 If we encounter an invalid byte sequence, return 0.
790 If we encounter an insufficient source or destination, or
791 insufficient space in CODING->cmp_data, return 1.
792 Otherwise, return consumed bytes in the source.
793
794 */
795 static INLINE int
796 decode_composition_emacs_mule (coding, src, src_end,
797 destination, dst_end, dst_bytes)
798 struct coding_system *coding;
799 unsigned char *src, *src_end, **destination, *dst_end;
800 int dst_bytes;
801 {
802 unsigned char *dst = *destination;
803 int method, data_len, nchars;
804 unsigned char *src_base = src++;
805 /* Store components of composition. */
806 int component[COMPOSITION_DATA_MAX_BUNCH_LENGTH];
807 int ncomponent;
808 /* Store multibyte form of characters to be composed. This is for
809 Emacs 20 style composition sequence. */
810 unsigned char buf[MAX_COMPOSITION_COMPONENTS * MAX_MULTIBYTE_LENGTH];
811 unsigned char *bufp = buf;
812 int c, i, gref, nref;
813
814 if (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH
815 >= COMPOSITION_DATA_SIZE)
816 {
817 coding->result = CODING_FINISH_INSUFFICIENT_CMP;
818 return -1;
819 }
820
821 ONE_MORE_BYTE (c);
822 if (c - 0xF0 >= COMPOSITION_RELATIVE
823 && c - 0xF0 <= COMPOSITION_WITH_RULE_ALTCHARS)
824 {
825 int with_rule;
826
827 method = c - 0xF0;
828 with_rule = (method == COMPOSITION_WITH_RULE
829 || method == COMPOSITION_WITH_RULE_ALTCHARS);
830 ONE_MORE_BYTE (c);
831 data_len = c - 0xA0;
832 if (data_len < 4
833 || src_base + data_len > src_end)
834 return 0;
835 ONE_MORE_BYTE (c);
836 nchars = c - 0xA0;
837 if (c < 1)
838 return 0;
839 for (ncomponent = 0; src < src_base + data_len; ncomponent++)
840 {
841 /* If it is longer than this, it can't be valid. */
842 if (ncomponent >= COMPOSITION_DATA_MAX_BUNCH_LENGTH)
843 return 0;
844
845 if (ncomponent % 2 && with_rule)
846 {
847 ONE_MORE_BYTE (gref);
848 gref -= 32;
849 ONE_MORE_BYTE (nref);
850 nref -= 32;
851 c = COMPOSITION_ENCODE_RULE (gref, nref);
852 }
853 else
854 {
855 int bytes;
856 if (UNIBYTE_STR_AS_MULTIBYTE_P (src, src_end - src, bytes)
857 || (coding->flags /* We are recovering a file. */
858 && src[0] == LEADING_CODE_8_BIT_CONTROL
859 && ! CHAR_HEAD_P (src[1])))
860 c = STRING_CHAR (src, bytes);
861 else
862 c = *src, bytes = 1;
863 src += bytes;
864 }
865 component[ncomponent] = c;
866 }
867 }
868 else
869 {
870 /* This may be an old Emacs 20 style format. See the comment at
871 the section 2 of this file. */
872 while (src < src_end && !CHAR_HEAD_P (*src)) src++;
873 if (src == src_end
874 && !(coding->mode & CODING_MODE_LAST_BLOCK))
875 goto label_end_of_loop;
876
877 src_end = src;
878 src = src_base + 1;
879 if (c < 0xC0)
880 {
881 method = COMPOSITION_RELATIVE;
882 for (ncomponent = 0; ncomponent < MAX_COMPOSITION_COMPONENTS;)
883 {
884 DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp);
885 if (c < 0)
886 break;
887 component[ncomponent++] = c;
888 }
889 if (ncomponent < 2)
890 return 0;
891 nchars = ncomponent;
892 }
893 else if (c == 0xFF)
894 {
895 method = COMPOSITION_WITH_RULE;
896 src++;
897 DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp);
898 if (c < 0)
899 return 0;
900 component[0] = c;
901 for (ncomponent = 1;
902 ncomponent < MAX_COMPOSITION_COMPONENTS * 2 - 1;)
903 {
904 DECODE_EMACS_MULE_COMPOSITION_RULE (c);
905 if (c < 0)
906 break;
907 component[ncomponent++] = c;
908 DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp);
909 if (c < 0)
910 break;
911 component[ncomponent++] = c;
912 }
913 if (ncomponent < 3)
914 return 0;
915 nchars = (ncomponent + 1) / 2;
916 }
917 else
918 return 0;
919 }
920
921 if (buf == bufp || dst + (bufp - buf) <= (dst_bytes ? dst_end : src))
922 {
923 CODING_ADD_COMPOSITION_START (coding, coding->produced_char, method);
924 for (i = 0; i < ncomponent; i++)
925 CODING_ADD_COMPOSITION_COMPONENT (coding, component[i]);
926 CODING_ADD_COMPOSITION_END (coding, coding->produced_char + nchars);
927 if (buf < bufp)
928 {
929 unsigned char *p = buf;
930 EMIT_BYTES (p, bufp);
931 *destination += bufp - buf;
932 coding->produced_char += nchars;
933 }
934 return (src - src_base);
935 }
936 label_end_of_loop:
937 return -1;
938 }
939
940 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
   Decode SRC_BYTES bytes at SOURCE as emacs-mule text into
   DESTINATION, converting end-of-line according to CODING->eol_type
   and handing bytes that start with 0x80 to
   decode_composition_emacs_mule when CODING->cmp_data is set.  On
   return CODING->consumed, CODING->consumed_char, CODING->produced
   and CODING->produced_char are updated. */
941
942 static void
943 decode_coding_emacs_mule (coding, source, destination, src_bytes, dst_bytes)
944 struct coding_system *coding;
945 unsigned char *source, *destination;
946 int src_bytes, dst_bytes;
947 {
948 unsigned char *src = source;
949 unsigned char *src_end = source + src_bytes;
950 unsigned char *dst = destination;
951 unsigned char *dst_end = destination + dst_bytes;
952 /* SRC_BASE remembers the start position in source in each loop.
953 The loop will be exited when there's not enough source code, or
954 when there's not enough destination area to produce a
955 character. */
956 unsigned char *src_base;
957
958 coding->produced_char = 0;
959 while ((src_base = src) < src_end)
960 {
961 unsigned char tmp[MAX_MULTIBYTE_LENGTH], *p;
962 int bytes;
963
/* NOTE(review): the two end-of-line branches below store one byte at
   DST without the destination-space check applied to other
   characters further down.  */
964 if (*src == '\r')
965 {
966 int c = *src++;
967
968 if (coding->eol_type == CODING_EOL_CR)
969 c = '\n';
970 else if (coding->eol_type == CODING_EOL_CRLF)
971 {
/* If the source ends right after the CR, ONE_MORE_BYTE jumps out
   with SRC_BASE still at the CR, so the CR is not counted as
   consumed.  */
972 ONE_MORE_BYTE (c);
973 if (c != '\n')
974 {
/* A lone CR: put the following byte back and emit the CR itself.  */
975 src--;
976 c = '\r';
977 }
978 }
979 *dst++ = c;
980 coding->produced_char++;
981 continue;
982 }
983 else if (*src == '\n')
984 {
/* A bare LF in text whose end-of-line format is CR or CRLF stops
   the decoding when inconsistent EOLs are not tolerated.  */
985 if ((coding->eol_type == CODING_EOL_CR
986 || coding->eol_type == CODING_EOL_CRLF)
987 && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
988 {
989 coding->result = CODING_FINISH_INCONSISTENT_EOL;
990 goto label_end_of_loop;
991 }
992 *dst++ = *src++;
993 coding->produced_char++;
994 continue;
995 }
996 else if (*src == 0x80 && coding->cmp_data)
997 {
998 /* Start of composition data. */
999 int consumed = decode_composition_emacs_mule (coding, src, src_end,
1000 &dst, dst_end,
1001 dst_bytes);
/* Negative: stop here.  Positive: that many source bytes were
   handled.  Zero: not a valid composition, so fall through and
   convert the 0x80 byte by itself with CHAR_STRING.  */
1002 if (consumed < 0)
1003 goto label_end_of_loop;
1004 else if (consumed > 0)
1005 {
1006 src += consumed;
1007 continue;
1008 }
1009 bytes = CHAR_STRING (*src, tmp);
1010 p = tmp;
1011 src++;
1012 }
1013 else if (UNIBYTE_STR_AS_MULTIBYTE_P (src, src_end - src, bytes)
1014 || (coding->flags /* We are recovering a file. */
1015 && src[0] == LEADING_CODE_8_BIT_CONTROL
1016 && ! CHAR_HEAD_P (src[1])))
1017 {
/* A valid multibyte sequence is copied as is.  */
1018 p = src;
1019 src += bytes;
1020 }
1021 else
1022 {
/* Anything else is converted one byte at a time with CHAR_STRING.  */
1023 bytes = CHAR_STRING (*src, tmp);
1024 p = tmp;
1025 src++;
1026 }
/* The limit is DST_END, or the already advanced SRC when DST_BYTES
   is zero.  Because the test is `>=', decoding also stops when
   exactly BYTES bytes of room remain.  */
1027 if (dst + bytes >= (dst_bytes ? dst_end : src))
1028 {
1029 coding->result = CODING_FINISH_INSUFFICIENT_DST;
1030 break;
1031 }
1032 while (bytes--) *dst++ = *p++;
1033 coding->produced_char++;
1034 }
1035 label_end_of_loop:
/* Count consumption from SRC_BASE so that a character whose decoding
   was abandoned part-way is not reported as consumed.  */
1036 coding->consumed = coding->consumed_char = src_base - source;
1037 coding->produced = dst - destination;
1038 }
1039
1040
1041 /* Encode composition data stored at DATA into a special byte sequence
1042 starting with 0x80. Update CODING->cmp_data_start and maybe
1043 CODING->cmp_data for the next call.

DATA[0] is the length of the record, DATA[3] the composition method,
and DATA[2] - DATA[1] is emitted as the COMPOSED-CHARS field.  The
components start at DATA[4].

The emitted sequence is:
  0x80, 0xF0 + METHOD, 0xA0 + COMPONENTS-BYTES, 0xA0 + COMPOSED-CHARS,
  followed by the components.
COMPONENTS-BYTES is the total length of the sequence built in BUF,
the four header bytes included.  For the two rule-based methods each
component after the first is preceded by two bytes, 0x20 + GREF and
0x20 + NREF.

The expansion uses the variables `dst', `dst_end', `dst_bytes' and
`src' and the label `label_end_of_loop' of the enclosing function.
If the destination is too small it sets CODING->result to
CODING_FINISH_INSUFFICIENT_DST and jumps to that label without
writing anything. */
1044
1045 #define ENCODE_COMPOSITION_EMACS_MULE(coding, data) \
1046 do { \
1047 unsigned char buf[1024], *p0 = buf, *p; \
1048 int len = data[0]; \
1049 int i; \
1050 \
1051 buf[0] = 0x80; \
1052 buf[1] = 0xF0 + data[3]; /* METHOD */ \
1053 buf[3] = 0xA0 + (data[2] - data[1]); /* COMPOSED-CHARS */ \
1054 p = buf + 4; \
1055 if (data[3] == COMPOSITION_WITH_RULE \
1056 || data[3] == COMPOSITION_WITH_RULE_ALTCHARS) \
1057 { \
1058 p += CHAR_STRING (data[4], p); \
1059 for (i = 5; i < len; i += 2) \
1060 { \
1061 int gref, nref; \
1062 COMPOSITION_DECODE_RULE (data[i], gref, nref); \
1063 *p++ = 0x20 + gref; \
1064 *p++ = 0x20 + nref; \
1065 p += CHAR_STRING (data[i + 1], p); \
1066 } \
1067 } \
1068 else \
1069 { \
1070 for (i = 4; i < len; i++) \
1071 p += CHAR_STRING (data[i], p); \
1072 } \
1073 buf[2] = 0xA0 + (p - buf); /* COMPONENTS-BYTES */ \
1074 \
1075 if (dst + (p - buf) + 4 > (dst_bytes ? dst_end : src)) \
1076 { \
1077 coding->result = CODING_FINISH_INSUFFICIENT_DST; \
1078 goto label_end_of_loop; \
1079 } \
1080 while (p0 < p) \
1081 *dst++ = *p0++; \
1082 coding->cmp_data_start += data[0]; \
1083 if (coding->cmp_data_start == coding->cmp_data->used \
1084 && coding->cmp_data->next) \
1085 { \
1086 coding->cmp_data = coding->cmp_data->next; \
1087 coding->cmp_data_start = 0; \
1088 } \
1089 } while (0)
1090
1091
/* Forward declaration: encode_coding_emacs_mule below hands the whole
   job to encode_eol when there is no composition data. */
1092 static void encode_eol P_ ((struct coding_system *, const unsigned char *,
1093 unsigned char *, int, int));
1094
/* Encode SRC_BYTES bytes at SOURCE into DESTINATION in emacs-mule
   format, inserting a composition header (see
   ENCODE_COMPOSITION_EMACS_MULE) in front of each composition
   recorded in CODING->cmp_data.

   A newline is emitted as CR LF or CR according to CODING->eol_type.
   When CODING->flags is nonzero, a non-ASCII single-byte character is
   emitted in the form produced by CHAR_STRING.  Every other
   multibyte character is copied from the source unchanged.

   On return, CODING->consumed is the number of source bytes fully
   processed, and CODING->produced and CODING->produced_char are both
   set to the number of bytes written. */
1095 static void
1096 encode_coding_emacs_mule (coding, source, destination, src_bytes, dst_bytes)
1097 struct coding_system *coding;
1098 unsigned char *source, *destination;
1099 int src_bytes, dst_bytes;
1100 {
1101 unsigned char *src = source;
1102 unsigned char *src_end = source + src_bytes;
1103 unsigned char *dst = destination;
1104 unsigned char *dst_end = destination + dst_bytes;
1105 unsigned char *src_base;
1106 int c;
1107 int char_offset;
1108 int *data;
1109
1110 Lisp_Object translation_table;
1111
/* NOTE(review): translation_table is not used directly below;
   presumably ONE_MORE_CHAR (defined elsewhere) reads it -- confirm. */
1112 translation_table = Qnil;
1113
1114 /* Optimization for the case that there's no composition. */
1115 if (!coding->cmp_data || coding->cmp_data->used == 0)
1116 {
1117 encode_eol (coding, source, destination, src_bytes, dst_bytes);
1118 return;
1119 }
1120
/* DATA points at the next composition record not yet encoded. */
1121 char_offset = coding->cmp_data->char_offset;
1122 data = coding->cmp_data->data + coding->cmp_data_start;
/* This loop has no explicit exit.  ENCODE_COMPOSITION_EMACS_MULE
   jumps to label_end_of_loop when the destination is full;
   ONE_MORE_CHAR and the EMIT_* macros (defined elsewhere) presumably
   do the same when source or destination runs out -- confirm. */
1123 while (1)
1124 {
1125 src_base = src;
1126
1127 /* If SRC starts a composition, encode the information about the
1128 composition in advance. */
1129 if (coding->cmp_data_start < coding->cmp_data->used
1130 && char_offset + coding->consumed_char == data[1])
1131 {
1132 ENCODE_COMPOSITION_EMACS_MULE (coding, data);
/* The macro may have moved on to the next block of composition data,
   so reload both values. */
1133 char_offset = coding->cmp_data->char_offset;
1134 data = coding->cmp_data->data + coding->cmp_data_start;
1135 }
1136
1137 ONE_MORE_CHAR (c);
1138 if (c == '\n' && (coding->eol_type == CODING_EOL_CRLF
1139 || coding->eol_type == CODING_EOL_CR))
1140 {
1141 if (coding->eol_type == CODING_EOL_CRLF)
1142 EMIT_TWO_BYTES ('\r', c);
1143 else
1144 EMIT_ONE_BYTE ('\r');
1145 }
1146 else if (SINGLE_BYTE_CHAR_P (c))
1147 {
1148 if (coding->flags && ! ASCII_BYTE_P (c))
1149 {
1150 /* As we are auto saving, retain the multibyte form for
1151 8-bit chars. */
1152 unsigned char buf[MAX_MULTIBYTE_LENGTH];
1153 int bytes = CHAR_STRING (c, buf);
1154
1155 if (bytes == 1)
1156 EMIT_ONE_BYTE (buf[0]);
1157 else
1158 EMIT_TWO_BYTES (buf[0], buf[1]);
1159 }
1160 else
1161 EMIT_ONE_BYTE (c);
1162 }
1163 else
/* Copy the bytes between SRC_BASE and SRC as they are. */
1164 EMIT_BYTES (src_base, src);
1165 coding->consumed_char++;
1166 }
1167 label_end_of_loop:
/* SRC_BASE marks the start of the character being processed when the
   loop was left, so that character is not counted as consumed. */
1168 coding->consumed = src_base - source;
1169 coding->produced = coding->produced_char = dst - destination;
1170 return;
1171 }
1172
1173 \f
1174 /*** 3. ISO2022 handlers ***/
1175
1176 /* The following note describes the coding system ISO2022 briefly.
1177 Since the intention of this note is to help understand the
1178 functions in this file, some parts are NOT ACCURATE or are OVERLY
1179 SIMPLIFIED. For thorough understanding, please refer to the
1180 original document of ISO2022. This is equivalent to the standard
1181 ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
1182
1183 ISO2022 provides many mechanisms to encode several character sets
1184 in 7-bit and 8-bit environments. For 7-bit environments, all text
1185 is encoded using bytes less than 128. This may make the encoded
1186 text a little bit longer, but the text passes more easily through
1187 several types of gateway, some of which strip off the MSB (Most
1188 Significant Bit).
1189
1190 There are two kinds of character sets: control character sets and
1191 graphic character sets. The former contain control characters such
1192 as `newline' and `escape' to provide control functions (control
1193 functions are also provided by escape sequences). The latter
1194 contain graphic characters such as 'A' and '-'. Emacs recognizes
1195 two control character sets and many graphic character sets.
1196
1197 Graphic character sets are classified into one of the following
1198 four classes, according to the number of bytes (DIMENSION) and
1199 number of characters in one dimension (CHARS) of the set:
1200 - DIMENSION1_CHARS94
1201 - DIMENSION1_CHARS96
1202 - DIMENSION2_CHARS94
1203 - DIMENSION2_CHARS96
1204
1205 In addition, each character set is assigned an identification tag,
1206 unique for each set, called the "final character" (denoted as <F>
1207 hereafter). The <F> of each character set is decided by ECMA(*)
1208 when it is registered in ISO. The code range of <F> is 0x30..0x7F
1209 (0x30..0x3F are for private use only).
1210
1211 Note (*): ECMA = European Computer Manufacturers Association
1212
1213 Here are examples of graphic character sets [NAME(<F>)]:
1214 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
1215 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
1216 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
1217 o DIMENSION2_CHARS96 -- none for the moment
1218
1219 A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
1220 C0 [0x00..0x1F] -- control character plane 0
1221 GL [0x20..0x7F] -- graphic character plane 0
1222 C1 [0x80..0x9F] -- control character plane 1
1223 GR [0xA0..0xFF] -- graphic character plane 1
1224
1225 A control character set is directly designated and invoked to C0 or
1226 C1 by an escape sequence. The most common case is that:
1227 - ISO646's control character set is designated/invoked to C0, and
1228 - ISO6429's control character set is designated/invoked to C1,
1229 and usually these designations/invocations are omitted in encoded
1230 text. In a 7-bit environment, only C0 can be used, and a control
1231 character for C1 is encoded by an appropriate escape sequence to
1232 fit into the environment. All control characters for C1 are
1233 defined to have corresponding escape sequences.
1234
1235 A graphic character set is at first designated to one of four
1236 graphic registers (G0 through G3), then these graphic registers are
1237 invoked to GL or GR. These designations and invocations can be
1238 done independently. The most common case is that G0 is invoked to
1239 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
1240 these invocations and designations are omitted in encoded text.
1241 In a 7-bit environment, only GL can be used.
1242
1243 When a graphic character set of CHARS94 is invoked to GL, codes
1244 0x20 and 0x7F of the GL area work as control characters SPACE and
1245 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
1246 be used.
1247
1248 There are two ways of invocation: locking-shift and single-shift.
1249 With locking-shift, the invocation lasts until the next different
1250 invocation, whereas with single-shift, the invocation affects the
1251 following character only and doesn't affect the locking-shift
1252 state. Invocations are done by the following control characters or
1253 escape sequences:
1254
1255 ----------------------------------------------------------------------
1256 abbrev function cntrl escape seq description
1257 ----------------------------------------------------------------------
1258 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
1259 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
1260 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
1261 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
1262 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
1263 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
1264 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
1265 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
1266 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
1267 ----------------------------------------------------------------------
1268 (*) These are not used by any known coding system.
1269
1270 Control characters for these functions are defined by macros
1271 ISO_CODE_XXX in `coding.h'.
1272
1273 Designations are done by the following escape sequences:
1274 ----------------------------------------------------------------------
1275 escape sequence description
1276 ----------------------------------------------------------------------
1277 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
1278 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
1279 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
1280 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
1281 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
1282 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
1283 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
1284 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
1285 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
1286 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
1287 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
1288 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
1289 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
1290 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
1291 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
1292 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
1293 ----------------------------------------------------------------------
1294
1295 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
1296 of dimension 1, chars 94, and final character <F>, etc...
1297
1298 Note (*): Although these designations are not allowed in ISO2022,
1299 Emacs accepts them on decoding, and produces them on encoding
1300 CHARS96 character sets in a coding system which is characterized as
1301 7-bit environment, non-locking-shift, and non-single-shift.
1302
1303 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
1304 '(' can be omitted. We refer to this as "short-form" hereafter.
1305
1306 Now you may notice that there are a lot of ways of encoding the
1307 same multilingual text in ISO2022. Actually, there exist many
1308 coding systems such as Compound Text (used in X11's inter-client
1309 communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
1310 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
1311 localized platforms), and all of these are variants of ISO2022.
1312
1313 In addition to the above, Emacs handles two more kinds of escape
1314 sequences: ISO6429's direction specification and Emacs' private
1315 sequence for specifying character composition.
1316
1317 ISO6429's direction specification takes the following form:
1318 o CSI ']' -- end of the current direction
1319 o CSI '0' ']' -- end of the current direction
1320 o CSI '1' ']' -- start of left-to-right text
1321 o CSI '2' ']' -- start of right-to-left text
1322 The control character CSI (0x9B: control sequence introducer) is
1323 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
1324
1325 Character composition specification takes the following form:
1326 o ESC '0' -- start relative composition
1327 o ESC '1' -- end composition
1328 o ESC '2' -- start rule-base composition (*)
1329 o ESC '3' -- start relative composition with alternate chars (**)
1330 o ESC '4' -- start rule-base composition with alternate chars (**)
1331 Since these are not standard escape sequences of any ISO standard,
1332 the use of them with these meanings is restricted to Emacs only.
1333
1334 (*) This form is used only in Emacs 20.5 and older versions,
1335 but the newer versions can safely decode it.
1336 (**) This form is used only in Emacs 21.1 and newer versions,
1337 and the older versions can't decode it.
1338
1339 Here's a list of example usages of these composition escape
1340 sequences (categorized by `enum composition_method').
1341
1342 COMPOSITION_RELATIVE:
1343 ESC 0 CHAR [ CHAR ] ESC 1
1344 COMPOSITION_WITH_RULE:
1345 ESC 2 CHAR [ RULE CHAR ] ESC 1
1346 COMPOSITION_WITH_ALTCHARS:
1347 ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
1348 COMPOSITION_WITH_RULE_ALTCHARS:
1349 ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
1350
/* Table with one entry per byte value (256 entries).
   decode_coding_iso2022 indexes it with the byte just read and
   switches on the resulting class. */
1351 enum iso_code_class_type iso_code_class[256];
1352
/* Nonzero if the coding system registered at coding_system_table[IDX]
   exists, CHARSET is either CHARSET_ASCII or the character C is among
   that coding system's safe characters, and the coding system has a
   requested designation for CHARSET.

   As a side effect, when CHARSET is not CHARSET_ASCII, the expansion
   assigns to the variable `safe_chars' of the enclosing function. */
1353 #define CHARSET_OK(idx, charset, c) \
1354 (coding_system_table[idx] \
1355 && (charset == CHARSET_ASCII \
1356 || (safe_chars = coding_safe_chars (coding_system_table[idx]->symbol), \
1357 CODING_SAFE_CHAR_P (safe_chars, c))) \
1358 && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding_system_table[idx], \
1359 charset) \
1360 != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
1361
/* Nonzero if the coding system at coding_system_table[IDX] has an
   initial designation (a value >= 0) for graphic register 1.
   NOTE(review): coding_system_table[IDX] is passed on without a null
   check here, unlike in CHARSET_OK -- confirm callers guarantee it. */
1362 #define SHIFT_OUT_OK(idx) \
1363 (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding_system_table[idx], 1) >= 0)
1364
/* Nonzero unless the coding system at coding_system_table[IDX] has
   composition handling set to COMPOSITION_DISABLED.
   NOTE(review): dereferences coding_system_table[IDX] without a null
   check -- confirm callers guarantee the entry exists. */
1365 #define COMPOSITION_OK(idx) \
1366 (coding_system_table[idx]->composing != COMPOSITION_DISABLED)
1367
1368 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1369 Check if a text is encoded in ISO2022. If it is, return an
1370 integer in which the appropriate flag bits, any of:
1371 CODING_CATEGORY_MASK_ISO_7
1372 CODING_CATEGORY_MASK_ISO_7_TIGHT
1373 CODING_CATEGORY_MASK_ISO_8_1
1374 CODING_CATEGORY_MASK_ISO_8_2
1375 CODING_CATEGORY_MASK_ISO_7_ELSE
1376 CODING_CATEGORY_MASK_ISO_8_ELSE
1377 are set. If a code which should never appear in ISO2022 is found,
1378 return 0.

The text to check is the bytes from SRC up to (not including)
SRC_END.  MULTIBYTEP is passed on to ONE_MORE_BYTE_CHECK_MULTIBYTE
for every byte read.

When `inhibit_iso_escape_detection' is nonzero, ESC, SO, SI, CSI, SS2
and SS3 are skipped without affecting the result. */
1379
1380 static int
1381 detect_coding_iso2022 (src, src_end, multibytep)
1382 unsigned char *src, *src_end;
1383 int multibytep;
1384 {
/* MASK holds the categories not yet ruled out, MASK_FOUND those for
   which positive evidence has been seen.  The return value is their
   intersection. */
1385 int mask = CODING_CATEGORY_MASK_ISO;
1386 int mask_found = 0;
/* REG[i] is the charset most recently designated to graphic register
   i, or -1.
   NOTE(review): shift_out is initialized here and tested in the SO
   and SI cases below, but nothing in this function assigns to it, so
   the `shift_out == 1' branch looks unreachable -- confirm. */
1387 int reg[4], shift_out = 0, single_shifting = 0;
1388 int c, c1, charset;
1389 /* Dummy for ONE_MORE_BYTE. */
1390 struct coding_system dummy_coding;
1391 struct coding_system *coding = &dummy_coding;
1392 Lisp_Object safe_chars;
1393
1394 reg[0] = CHARSET_ASCII, reg[1] = reg[2] = reg[3] = -1;
1395 while (mask && src < src_end)
1396 {
1397 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
/* Re-entered from the 0xA0..0xFF run scan below, which reads one byte
   past the run and leaves it in C. */
1398 retry:
1399 switch (c)
1400 {
1401 case ISO_CODE_ESC:
1402 if (inhibit_iso_escape_detection)
1403 break;
1404 single_shifting = 0;
1405 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
1406 if (c >= '(' && c <= '/')
1407 {
1408 /* Designation sequence for a charset of dimension 1. */
1409 ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep);
1410 if (c1 < ' ' || c1 >= 0x80
1411 || (charset = iso_charset_table[0][c >= ','][c1]) < 0)
1412 /* Invalid designation sequence. Just ignore. */
1413 break;
1414 reg[(c - '(') % 4] = charset;
1415 }
1416 else if (c == '$')
1417 {
1418 /* Designation sequence for a charset of dimension 2. */
1419 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
1420 if (c >= '@' && c <= 'B')
1421 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
1422 reg[0] = charset = iso_charset_table[1][0][c];
1423 else if (c >= '(' && c <= '/')
1424 {
1425 ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep);
1426 if (c1 < ' ' || c1 >= 0x80
1427 || (charset = iso_charset_table[1][c >= ','][c1]) < 0)
1428 /* Invalid designation sequence. Just ignore. */
1429 break;
1430 reg[(c - '(') % 4] = charset;
1431 }
1432 else
1433 /* Invalid designation sequence. Just ignore. */
1434 break;
1435 }
1436 else if (c == 'N' || c == 'O')
1437 {
1438 /* ESC <Fe> for SS2 or SS3. */
1439 mask &= CODING_CATEGORY_MASK_ISO_7_ELSE;
1440 break;
1441 }
1442 else if (c >= '0' && c <= '4')
1443 {
1444 /* ESC <Fp> for start/end composition. */
1445 if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_7))
1446 mask_found |= CODING_CATEGORY_MASK_ISO_7;
1447 else
1448 mask &= ~CODING_CATEGORY_MASK_ISO_7;
1449 if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT))
1450 mask_found |= CODING_CATEGORY_MASK_ISO_7_TIGHT;
1451 else
1452 mask &= ~CODING_CATEGORY_MASK_ISO_7_TIGHT;
1453 if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_8_1))
1454 mask_found |= CODING_CATEGORY_MASK_ISO_8_1;
1455 else
1456 mask &= ~CODING_CATEGORY_MASK_ISO_8_1;
1457 if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_8_2))
1458 mask_found |= CODING_CATEGORY_MASK_ISO_8_2;
1459 else
1460 mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
1461 if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_7_ELSE))
1462 mask_found |= CODING_CATEGORY_MASK_ISO_7_ELSE;
1463 else
1464 mask &= ~CODING_CATEGORY_MASK_ISO_7_ELSE;
1465 if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_8_ELSE))
1466 mask_found |= CODING_CATEGORY_MASK_ISO_8_ELSE;
1467 else
1468 mask &= ~CODING_CATEGORY_MASK_ISO_8_ELSE;
1469 break;
1470 }
1471 else
1472 /* Invalid escape sequence. Just ignore. */
1473 break;
1474
1475 /* We found a valid designation sequence for CHARSET. */
1476 mask &= ~CODING_CATEGORY_MASK_ISO_8BIT;
1477 c = MAKE_CHAR (charset, 0, 0);
1478 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7, charset, c))
1479 mask_found |= CODING_CATEGORY_MASK_ISO_7;
1480 else
1481 mask &= ~CODING_CATEGORY_MASK_ISO_7;
1482 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT, charset, c))
1483 mask_found |= CODING_CATEGORY_MASK_ISO_7_TIGHT;
1484 else
1485 mask &= ~CODING_CATEGORY_MASK_ISO_7_TIGHT;
1486 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_ELSE, charset, c))
1487 mask_found |= CODING_CATEGORY_MASK_ISO_7_ELSE;
1488 else
1489 mask &= ~CODING_CATEGORY_MASK_ISO_7_ELSE;
1490 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_8_ELSE, charset, c))
1491 mask_found |= CODING_CATEGORY_MASK_ISO_8_ELSE;
1492 else
1493 mask &= ~CODING_CATEGORY_MASK_ISO_8_ELSE;
1494 break;
1495
1496 case ISO_CODE_SO:
1497 if (inhibit_iso_escape_detection)
1498 break;
1499 single_shifting = 0;
1500 if (shift_out == 0
1501 && (reg[1] >= 0
1502 || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_7_ELSE)
1503 || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_8_ELSE)))
1504 {
1505 /* Locking shift out. */
1506 mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
1507 mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
1508 }
1509 break;
1510
1511 case ISO_CODE_SI:
1512 if (inhibit_iso_escape_detection)
1513 break;
1514 single_shifting = 0;
1515 if (shift_out == 1)
1516 {
1517 /* Locking shift in. */
1518 mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
1519 mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
1520 }
1521 break;
1522
1523 case ISO_CODE_CSI:
1524 single_shifting = 0;
/* Fall through: CSI shares the handling below with SS2 and SS3. */
1525 case ISO_CODE_SS2:
1526 case ISO_CODE_SS3:
1527 {
1528 int newmask = CODING_CATEGORY_MASK_ISO_8_ELSE;
1529
1530 if (inhibit_iso_escape_detection)
1531 break;
1532 if (c != ISO_CODE_CSI)
1533 {
1534 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1535 & CODING_FLAG_ISO_SINGLE_SHIFT)
1536 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
1537 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1538 & CODING_FLAG_ISO_SINGLE_SHIFT)
1539 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
1540 single_shifting = 1;
1541 }
1542 if (VECTORP (Vlatin_extra_code_table)
1543 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
1544 {
1545 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1546 & CODING_FLAG_ISO_LATIN_EXTRA)
1547 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
1548 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1549 & CODING_FLAG_ISO_LATIN_EXTRA)
1550 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
1551 }
1552 mask &= newmask;
1553 mask_found |= newmask;
1554 }
1555 break;
1556
1557 default:
1558 if (c < 0x80)
1559 {
1560 single_shifting = 0;
1561 break;
1562 }
1563 else if (c < 0xA0)
1564 {
/* A byte in 0x80..0x9F not handled by a case above.  It is acceptable
   only if Vlatin_extra_code_table has an entry for it; otherwise the
   text is rejected outright. */
1565 single_shifting = 0;
1566 if (VECTORP (Vlatin_extra_code_table)
1567 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
1568 {
1569 int newmask = 0;
1570
1571 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1572 & CODING_FLAG_ISO_LATIN_EXTRA)
1573 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
1574 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1575 & CODING_FLAG_ISO_LATIN_EXTRA)
1576 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
1577 mask &= newmask;
1578 mask_found |= newmask;
1579 }
1580 else
1581 return 0;
1582 }
1583 else
1584 {
1585 mask &= ~(CODING_CATEGORY_MASK_ISO_7BIT
1586 | CODING_CATEGORY_MASK_ISO_7_ELSE);
1587 mask_found |= CODING_CATEGORY_MASK_ISO_8_1;
1588 /* Check the length of succeeding codes of the range
1589 0xA0..0xFF. If the byte length is odd, we exclude
1590 CODING_CATEGORY_MASK_ISO_8_2. We can check this only
1591 when we are not single shifting. */
1592 if (!single_shifting
1593 && mask & CODING_CATEGORY_MASK_ISO_8_2)
1594 {
/* I counts the bytes of the run, starting at 1 for the byte that
   brought us here. */
1595 int i = 1;
1596
1597 c = -1;
1598 while (src < src_end)
1599 {
1600 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
1601 if (c < 0xA0)
1602 break;
1603 i++;
1604 }
1605
/* An odd run is held against ISO_8_2 only when the source was not
   exhausted. */
1606 if (i & 1 && src < src_end)
1607 mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
1608 else
1609 mask_found |= CODING_CATEGORY_MASK_ISO_8_2;
1610 if (c >= 0)
1611 /* This means that we have read one extra byte. */
1612 goto retry;
1613 }
1614 }
1615 break;
1616 }
1617 }
1618 label_end_of_loop:
1619 return (mask & mask_found);
1620 }
1621
1622 /* Decode a character of which charset is CHARSET, the 1st position
1623 code is C1, the 2nd position code is C2, and return the decoded
1624 character code. If the variable `translation_table' is non-nil,
1625 return the translated code instead.

The expansion reads the variable `translation_table' of the enclosing
function. */
1626
1627 #define DECODE_ISO_CHARACTER(charset, c1, c2) \
1628 (NILP (translation_table) \
1629 ? MAKE_CHAR (charset, c1, c2) \
1630 : translate_char (translation_table, -1, charset, c1, c2))
1631
1632 /* Set designation state into CODING.

Look up the charset identified by DIMENSION, CHARS and FINAL_CHAR and
designate it to graphic register REG of `coding'.

The designation is accepted if the charset exists and either `coding'
requests that charset for REG or the charset's character is among
`safe_chars'.  In every other case, including FINAL_CHAR outside
'0'..127, control jumps to `label_invalid_code'.  A rejected
designation is also recorded in
coding->spec.iso2022.last_invalid_designation_register.

An ASCII designation to register 0 that follows an invalid
designation to register 0 is deliberately sent to
`label_invalid_code' as well (see the comment in the body).

When CODING_MODE_DIRECTION is set in coding->mode and the charset has
a reverse charset, the reverse charset is designated instead.

The expansion uses the variables `coding' and `safe_chars' and the
label `label_invalid_code' of the enclosing function. */
1633 #define DECODE_DESIGNATION(reg, dimension, chars, final_char) \
1634 do { \
1635 int charset, c; \
1636 \
1637 if (final_char < '0' || final_char >= 128) \
1638 goto label_invalid_code; \
1639 charset = ISO_CHARSET_TABLE (make_number (dimension), \
1640 make_number (chars), \
1641 make_number (final_char)); \
1642 c = MAKE_CHAR (charset, 0, 0); \
1643 if (charset >= 0 \
1644 && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) == reg \
1645 || CODING_SAFE_CHAR_P (safe_chars, c))) \
1646 { \
1647 if (coding->spec.iso2022.last_invalid_designation_register == 0 \
1648 && reg == 0 \
1649 && charset == CHARSET_ASCII) \
1650 { \
1651 /* We should insert this designation sequence as is so \
1652 that it is surely written back to a file. */ \
1653 coding->spec.iso2022.last_invalid_designation_register = -1; \
1654 goto label_invalid_code; \
1655 } \
1656 coding->spec.iso2022.last_invalid_designation_register = -1; \
1657 if ((coding->mode & CODING_MODE_DIRECTION) \
1658 && CHARSET_REVERSE_CHARSET (charset) >= 0) \
1659 charset = CHARSET_REVERSE_CHARSET (charset); \
1660 CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset; \
1661 } \
1662 else \
1663 { \
1664 coding->spec.iso2022.last_invalid_designation_register = reg; \
1665 goto label_invalid_code; \
1666 } \
1667 } while (0)
1668
1669 /* Allocate a memory block for storing information about compositions.
1670 The block is chained to the already allocated blocks. */
1671
1672 void
1673 coding_allocate_composition_data (coding, char_offset)
1674 struct coding_system *coding;
1675 int char_offset;
1676 {
1677 struct composition_data *cmp_data
1678 = (struct composition_data *) xmalloc (sizeof *cmp_data);
1679
1680 cmp_data->char_offset = char_offset;
1681 cmp_data->used = 0;
1682 cmp_data->prev = coding->cmp_data;
1683 cmp_data->next = NULL;
1684 if (coding->cmp_data)
1685 coding->cmp_data->next = cmp_data;
1686 coding->cmp_data = cmp_data;
1687 coding->cmp_data_start = 0;
1688 }
1689
1690 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
1691 ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
1692 ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
1693 ESC 3 : altchar composition : ESC 3 ALT ... ESC 0 CHAR ... ESC 1
1694 ESC 4 : alt&rule composition : ESC 4 ALT RULE .. ALT ESC 0 CHAR ... ESC 1

C1 is the character that followed ESC.

If composition is disabled for `coding', the two bytes ESC and
C1 & 0x7f are written to `dst' as they are.  If no composition is in
progress, a new one is started with the method selected by C1.  If
one is already in progress with one of the two ALTCHARS methods, the
method is switched to COMPOSITION_RELATIVE.

The expansion uses the variables `coding' and `dst' and the label
`label_end_of_loop' of the enclosing function.
1695 */
1696
1697 #define DECODE_COMPOSITION_START(c1) \
1698 do { \
1699 if (coding->composing == COMPOSITION_DISABLED) \
1700 { \
1701 *dst++ = ISO_CODE_ESC; \
1702 *dst++ = c1 & 0x7f; \
1703 coding->produced_char += 2; \
1704 } \
1705 else if (!COMPOSING_P (coding)) \
1706 { \
1707 /* This is surely the start of a composition. We must be sure \
1708 that coding->cmp_data has enough space to store the \
1709 information about the composition. If not, terminate the \
1710 current decoding loop, allocate one more memory block for \
1711 coding->cmp_data in the caller, then start the decoding \
1712 loop again. We can't allocate memory here directly because \
1713 it may cause buffer/string relocation. */ \
1714 if (!coding->cmp_data \
1715 || (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH \
1716 >= COMPOSITION_DATA_SIZE)) \
1717 { \
1718 coding->result = CODING_FINISH_INSUFFICIENT_CMP; \
1719 goto label_end_of_loop; \
1720 } \
1721 coding->composing = (c1 == '0' ? COMPOSITION_RELATIVE \
1722 : c1 == '2' ? COMPOSITION_WITH_RULE \
1723 : c1 == '3' ? COMPOSITION_WITH_ALTCHARS \
1724 : COMPOSITION_WITH_RULE_ALTCHARS); \
1725 CODING_ADD_COMPOSITION_START (coding, coding->produced_char, \
1726 coding->composing); \
1727 coding->composition_rule_follows = 0; \
1728 } \
1729 else \
1730 { \
1731 /* We are already handling a composition. If the method is \
1732 the following two, the codes following the current escape \
1733 sequence are actual characters stored in a buffer. */ \
1734 if (coding->composing == COMPOSITION_WITH_ALTCHARS \
1735 || coding->composing == COMPOSITION_WITH_RULE_ALTCHARS) \
1736 { \
1737 coding->composing = COMPOSITION_RELATIVE; \
1738 coding->composition_rule_follows = 0; \
1739 } \
1740 } \
1741 } while (0)
1742
1743 /* Handle composition end sequence ESC 1.

If no composition is in progress, the two bytes ESC and C1 are
written to `dst' as they are.  Otherwise the end of the composition
is recorded at the current value of coding->produced_char and
coding->composing is reset to COMPOSITION_NO.

The expansion uses the variables `coding' and `dst' of the enclosing
function. */
1744
1745 #define DECODE_COMPOSITION_END(c1) \
1746 do { \
1747 if (! COMPOSING_P (coding)) \
1748 { \
1749 *dst++ = ISO_CODE_ESC; \
1750 *dst++ = c1; \
1751 coding->produced_char += 2; \
1752 } \
1753 else \
1754 { \
1755 CODING_ADD_COMPOSITION_END (coding, coding->produced_char); \
1756 coding->composing = COMPOSITION_NO; \
1757 } \
1758 } while (0)
1759
1760 /* Decode a composition rule from the byte C1 (and maybe one more byte
1761 from SRC) and store one encoded composition rule in
1762 coding->cmp_data.

C1 must be an lvalue: 32 is subtracted from it in place.  A result
below 81 is decoded as the old one-byte format, where the value is
GREF * 9 + NREF and a reference value of 4 is mapped to 10.  A result
from 81 to 92 is the new format: one more byte is read into the
variable `c2' of the enclosing function, and the rule is built from
C1 - 81 and C2 - 32.  For any other value the rule stored is 0.

coding->composition_rule_follows is cleared afterwards.

Besides `c2', the expansion uses the variable `coding' of the
enclosing function and whatever ONE_MORE_BYTE requires. */
1763
1764 #define DECODE_COMPOSITION_RULE(c1) \
1765 do { \
1766 int rule = 0; \
1767 (c1) -= 32; \
1768 if (c1 < 81) /* old format (before ver.21) */ \
1769 { \
1770 int gref = (c1) / 9; \
1771 int nref = (c1) % 9; \
1772 if (gref == 4) gref = 10; \
1773 if (nref == 4) nref = 10; \
1774 rule = COMPOSITION_ENCODE_RULE (gref, nref); \
1775 } \
1776 else if (c1 < 93) /* new format (after ver.21) */ \
1777 { \
1778 ONE_MORE_BYTE (c2); \
1779 rule = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32); \
1780 } \
1781 CODING_ADD_COMPOSITION_COMPONENT (coding, rule); \
1782 coding->composition_rule_follows = 0; \
1783 } while (0)
1784
1785
1786 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
1787
1788 static void
1789 decode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
1790 struct coding_system *coding;
1791 unsigned char *source, *destination;
1792 int src_bytes, dst_bytes;
1793 {
1794 unsigned char *src = source;
1795 unsigned char *src_end = source + src_bytes;
1796 unsigned char *dst = destination;
1797 unsigned char *dst_end = destination + dst_bytes;
1798 /* Charsets invoked to graphic plane 0 and 1 respectively. */
1799 int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1800 int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1801 /* SRC_BASE remembers the start position in source in each loop.
1802 The loop will be exited when there's not enough source code
1803 (within macro ONE_MORE_BYTE), or when there's not enough
1804 destination area to produce a character (within macro
1805 EMIT_CHAR). */
1806 unsigned char *src_base;
1807 int c, charset;
1808 Lisp_Object translation_table;
1809 Lisp_Object safe_chars;
1810
1811 safe_chars = coding_safe_chars (coding->symbol);
1812
1813 if (NILP (Venable_character_translation))
1814 translation_table = Qnil;
1815 else
1816 {
1817 translation_table = coding->translation_table_for_decode;
1818 if (NILP (translation_table))
1819 translation_table = Vstandard_translation_table_for_decode;
1820 }
1821
1822 coding->result = CODING_FINISH_NORMAL;
1823
1824 while (1)
1825 {
1826 int c1, c2;
1827
1828 src_base = src;
1829 ONE_MORE_BYTE (c1);
1830
1831 /* We produce no character or one character. */
1832 switch (iso_code_class [c1])
1833 {
1834 case ISO_0x20_or_0x7F:
1835 if (COMPOSING_P (coding) && coding->composition_rule_follows)
1836 {
1837 DECODE_COMPOSITION_RULE (c1);
1838 continue;
1839 }
1840 if (charset0 < 0 || CHARSET_CHARS (charset0) == 94)
1841 {
1842 /* This is SPACE or DEL. */
1843 charset = CHARSET_ASCII;
1844 break;
1845 }
1846 /* This is a graphic character, we fall down ... */
1847
1848 case ISO_graphic_plane_0:
1849 if (COMPOSING_P (coding) && coding->composition_rule_follows)
1850 {
1851 DECODE_COMPOSITION_RULE (c1);
1852 continue;
1853 }
1854 charset = charset0;
1855 break;
1856
1857 case ISO_0xA0_or_0xFF:
1858 if (charset1 < 0 || CHARSET_CHARS (charset1) == 94
1859 || coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
1860 goto label_invalid_code;
1861 /* This is a graphic character, we fall down ... */
1862
1863 case ISO_graphic_plane_1:
1864 if (charset1 < 0)
1865 goto label_invalid_code;
1866 charset = charset1;
1867 break;
1868
1869 case ISO_control_0:
1870 if (COMPOSING_P (coding))
1871 DECODE_COMPOSITION_END ('1');
1872
1873 /* All ISO2022 control characters in this class have the
1874 same representation in Emacs internal format. */
1875 if (c1 == '\n'
1876 && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1877 && (coding->eol_type == CODING_EOL_CR
1878 || coding->eol_type == CODING_EOL_CRLF))
1879 {
1880 coding->result = CODING_FINISH_INCONSISTENT_EOL;
1881 goto label_end_of_loop;
1882 }
1883 charset = CHARSET_ASCII;
1884 break;
1885
1886 case ISO_control_1:
1887 if (COMPOSING_P (coding))
1888 DECODE_COMPOSITION_END ('1');
1889 goto label_invalid_code;
1890
1891 case ISO_carriage_return:
1892 if (COMPOSING_P (coding))
1893 DECODE_COMPOSITION_END ('1');
1894
1895 if (coding->eol_type == CODING_EOL_CR)
1896 c1 = '\n';
1897 else if (coding->eol_type == CODING_EOL_CRLF)
1898 {
1899 ONE_MORE_BYTE (c1);
1900 if (c1 != ISO_CODE_LF)
1901 {
1902 src--;
1903 c1 = '\r';
1904 }
1905 }
1906 charset = CHARSET_ASCII;
1907 break;
1908
1909 case ISO_shift_out:
1910 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1911 || CODING_SPEC_ISO_DESIGNATION (coding, 1) < 0)
1912 goto label_invalid_code;
1913 CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
1914 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1915 continue;
1916
1917 case ISO_shift_in:
1918 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
1919 goto label_invalid_code;
1920 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
1921 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1922 continue;
1923
1924 case ISO_single_shift_2_7:
1925 case ISO_single_shift_2:
1926 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1927 goto label_invalid_code;
1928 /* SS2 is handled as an escape sequence of ESC 'N' */
1929 c1 = 'N';
1930 goto label_escape_sequence;
1931
1932 case ISO_single_shift_3:
1933 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1934 goto label_invalid_code;
1935 /* SS3 is handled as an escape sequence of ESC 'O' */
1936 c1 = 'O';
1937 goto label_escape_sequence;
1938
1939 case ISO_control_sequence_introducer:
1940 /* CSI is handled as an escape sequence of ESC '[' ... */
1941 c1 = '[';
1942 goto label_escape_sequence;
1943
1944 case ISO_escape:
1945 ONE_MORE_BYTE (c1);
1946 label_escape_sequence:
1947 /* Escape sequences handled by Emacs are invocation,
1948 designation, direction specification, and character
1949 composition specification. */
1950 switch (c1)
1951 {
1952 case '&': /* revision of following character set */
1953 ONE_MORE_BYTE (c1);
1954 if (!(c1 >= '@' && c1 <= '~'))
1955 goto label_invalid_code;
1956 ONE_MORE_BYTE (c1);
1957 if (c1 != ISO_CODE_ESC)
1958 goto label_invalid_code;
1959 ONE_MORE_BYTE (c1);
1960 goto label_escape_sequence;
1961
1962 case '$': /* designation of 2-byte character set */
1963 if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
1964 goto label_invalid_code;
1965 ONE_MORE_BYTE (c1);
1966 if (c1 >= '@' && c1 <= 'B')
1967 { /* designation of JISX0208.1978, GB2312.1980,
1968 or JISX0208.1980 */
1969 DECODE_DESIGNATION (0, 2, 94, c1);
1970 }
1971 else if (c1 >= 0x28 && c1 <= 0x2B)
1972 { /* designation of DIMENSION2_CHARS94 character set */
1973 ONE_MORE_BYTE (c2);
1974 DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
1975 }
1976 else if (c1 >= 0x2C && c1 <= 0x2F)
1977 { /* designation of DIMENSION2_CHARS96 character set */
1978 ONE_MORE_BYTE (c2);
1979 DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
1980 }
1981 else
1982 goto label_invalid_code;
1983 /* We must update these variables now. */
1984 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1985 charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1986 continue;
1987
1988 case 'n': /* invocation of locking-shift-2 */
1989 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1990 || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
1991 goto label_invalid_code;
1992 CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
1993 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1994 continue;
1995
1996 case 'o': /* invocation of locking-shift-3 */
1997 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1998 || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
1999 goto label_invalid_code;
2000 CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
2001 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
2002 continue;
2003
2004 case 'N': /* invocation of single-shift-2 */
2005 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2006 || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
2007 goto label_invalid_code;
2008 charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
2009 ONE_MORE_BYTE (c1);
2010 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
2011 goto label_invalid_code;
2012 break;
2013
2014 case 'O': /* invocation of single-shift-3 */
2015 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2016 || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
2017 goto label_invalid_code;
2018 charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
2019 ONE_MORE_BYTE (c1);
2020 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
2021 goto label_invalid_code;
2022 break;
2023
2024 case '0': case '2': case '3': case '4': /* start composition */
2025 DECODE_COMPOSITION_START (c1);
2026 continue;
2027
2028 case '1': /* end composition */
2029 DECODE_COMPOSITION_END (c1);
2030 continue;
2031
2032 case '[': /* specification of direction */
2033 if (coding->flags & CODING_FLAG_ISO_NO_DIRECTION)
2034 goto label_invalid_code;
2035 /* For the moment, nested direction is not supported.
2036 So, `coding->mode & CODING_MODE_DIRECTION' zero means
2037 left-to-right, and nonzero means right-to-left. */
2038 ONE_MORE_BYTE (c1);
2039 switch (c1)
2040 {
2041 case ']': /* end of the current direction */
2042 coding->mode &= ~CODING_MODE_DIRECTION;
2043
2044 case '0': /* end of the current direction */
2045 case '1': /* start of left-to-right direction */
2046 ONE_MORE_BYTE (c1);
2047 if (c1 == ']')
2048 coding->mode &= ~CODING_MODE_DIRECTION;
2049 else
2050 goto label_invalid_code;
2051 break;
2052
2053 case '2': /* start of right-to-left direction */
2054 ONE_MORE_BYTE (c1);
2055 if (c1 == ']')
2056 coding->mode |= CODING_MODE_DIRECTION;
2057 else
2058 goto label_invalid_code;
2059 break;
2060
2061 default:
2062 goto label_invalid_code;
2063 }
2064 continue;
2065
2066 case '%':
2067 if (COMPOSING_P (coding))
2068 DECODE_COMPOSITION_END ('1');
2069 ONE_MORE_BYTE (c1);
2070 if (c1 == '/')
2071 {
2072 /* CTEXT extended segment:
2073 ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
2074 We keep these bytes as is for the moment.
2075 They may be decoded by post-read-conversion. */
2076 int dim, M, L;
2077 int size, required;
2078 int produced_chars;
2079
2080 ONE_MORE_BYTE (dim);
2081 ONE_MORE_BYTE (M);
2082 ONE_MORE_BYTE (L);
2083 size = ((M - 128) * 128) + (L - 128);
2084 required = 8 + size * 2;
2085 if (dst + required > (dst_bytes ? dst_end : src))
2086 goto label_end_of_loop;
2087 *dst++ = ISO_CODE_ESC;
2088 *dst++ = '%';
2089 *dst++ = '/';
2090 *dst++ = dim;
2091 produced_chars = 4;
2092 dst += CHAR_STRING (M, dst), produced_chars++;
2093 dst += CHAR_STRING (L, dst), produced_chars++;
2094 while (size-- > 0)
2095 {
2096 ONE_MORE_BYTE (c1);
2097 dst += CHAR_STRING (c1, dst), produced_chars++;
2098 }
2099 coding->produced_char += produced_chars;
2100 }
2101 else if (c1 == 'G')
2102 {
2103 unsigned char *d = dst;
2104 int produced_chars;
2105
2106 /* XFree86 extension for embedding UTF-8 in CTEXT:
2107 ESC % G --UTF-8-BYTES-- ESC % @
2108 We keep these bytes as is for the moment.
2109 They may be decoded by post-read-conversion. */
2110 if (d + 6 > (dst_bytes ? dst_end : src))
2111 goto label_end_of_loop;
2112 *d++ = ISO_CODE_ESC;
2113 *d++ = '%';
2114 *d++ = 'G';
2115 produced_chars = 3;
2116 while (d + 1 < (dst_bytes ? dst_end : src))
2117 {
2118 ONE_MORE_BYTE (c1);
2119 if (c1 == ISO_CODE_ESC
2120 && src + 1 < src_end
2121 && src[0] == '%'
2122 && src[1] == '@')
2123 break;
2124 d += CHAR_STRING (c1, d), produced_chars++;
2125 }
2126 if (d + 3 > (dst_bytes ? dst_end : src))
2127 goto label_end_of_loop;
2128 *d++ = ISO_CODE_ESC;
2129 *d++ = '%';
2130 *d++ = '@';
2131 dst = d;
2132 coding->produced_char += produced_chars + 3;
2133 }
2134 else
2135 goto label_invalid_code;
2136 continue;
2137
2138 default:
2139 if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
2140 goto label_invalid_code;
2141 if (c1 >= 0x28 && c1 <= 0x2B)
2142 { /* designation of DIMENSION1_CHARS94 character set */
2143 ONE_MORE_BYTE (c2);
2144 DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
2145 }
2146 else if (c1 >= 0x2C && c1 <= 0x2F)
2147 { /* designation of DIMENSION1_CHARS96 character set */
2148 ONE_MORE_BYTE (c2);
2149 DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
2150 }
2151 else
2152 goto label_invalid_code;
2153 /* We must update these variables now. */
2154 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
2155 charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
2156 continue;
2157 }
2158 }
2159
2160 /* Now we know CHARSET and 1st position code C1 of a character.
2161 Produce a multibyte sequence for that character while getting
2162 2nd position code C2 if necessary. */
2163 if (CHARSET_DIMENSION (charset) == 2)
2164 {
2165 ONE_MORE_BYTE (c2);
2166 if (c1 < 0x80 ? c2 < 0x20 || c2 >= 0x80 : c2 < 0xA0)
2167 /* C2 is not in a valid range. */
2168 goto label_invalid_code;
2169 }
2170 c = DECODE_ISO_CHARACTER (charset, c1, c2);
2171 EMIT_CHAR (c);
2172 continue;
2173
2174 label_invalid_code:
2175 coding->errors++;
2176 if (COMPOSING_P (coding))
2177 DECODE_COMPOSITION_END ('1');
2178 src = src_base;
2179 c = *src++;
2180 EMIT_CHAR (c);
2181 }
2182
2183 label_end_of_loop:
2184 coding->consumed = coding->consumed_char = src_base - source;
2185 coding->produced = dst - destination;
2186 return;
2187 }
2188
2189
2190 /* ISO2022 encoding stuff. */
2191
2192 /*
2193 It is not enough to say just "ISO2022" on encoding, we have to
2194 specify more details. In Emacs, each ISO2022 coding system
2195 variant has the following specifications:
2196 1. Initial designation to G0 through G3.
2197 2. Allows short-form designation?
2198 3. ASCII should be designated to G0 before control characters?
2199 4. ASCII should be designated to G0 at end of line?
2200 5. 7-bit environment or 8-bit environment?
2201 6. Use locking-shift?
2202 7. Use Single-shift?
2203 And the following two are only for Japanese:
2204 8. Use ASCII in place of JIS0201-1976-Roman?
2205 9. Use JISX0208-1983 in place of JISX0208-1978?
2206 These specifications are encoded in `coding->flags' as flag bits
2207 defined by macros CODING_FLAG_ISO_XXX. See `coding.h' for more
2208 details.
2209 */
2210
2211 /* Produce codes (escape sequence) for designating CHARSET to graphic
2212 register REG at DST, and increment DST. If <final-char> of CHARSET is
2213 '@', 'A', or 'B' and the coding system CODING allows, produce
2214 designation sequence of short-form. */
2215
#define ENCODE_DESIGNATION(charset, reg, coding)			\
  do {									\
    unsigned char final_char = CHARSET_ISO_FINAL_CHAR (charset);	\
    int revision = CODING_SPEC_ISO_REVISION_NUMBER (coding, charset);	\
    int two_byte = CHARSET_DIMENSION (charset) != 1;			\
    int chars94 = CHARSET_CHARS (charset) == 94;			\
    /* Intermediate byte table, indexed by the graphic register:	\
       "()*+" for 94-char sets, ",-./" for the others.  */		\
    char *intermediate_char = chars94 ? "()*+" : ",-./";		\
									\
    /* A revision below 255 is announced first as			\
       ESC '&' <'@' + revision>.  */					\
    if (revision < 255)							\
      {									\
	*dst++ = ISO_CODE_ESC;						\
	*dst++ = '&';							\
	*dst++ = '@' + revision;					\
      }									\
									\
    *dst++ = ISO_CODE_ESC;						\
    if (two_byte)							\
      *dst++ = '$';							\
									\
    /* The intermediate byte is dropped (short form) only for a	\
       2-byte 94-char set going to register 0 whose final char is	\
       '@', 'A' or 'B', and only when CODING allows short form.  */	\
    if (! (two_byte && chars94						\
	   && (coding->flags & CODING_FLAG_ISO_SHORT_FORM)		\
	   && reg == 0							\
	   && final_char >= '@' && final_char <= 'B'))			\
      *dst++ = (unsigned char) (intermediate_char[reg]);		\
									\
    *dst++ = final_char;						\
    /* Remember what REG now holds.  */					\
    CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;		\
  } while (0)
2253
2254 /* The following two macros produce codes (control character or escape
2255 sequence) for ISO2022 single-shift functions (single-shift-2 and
2256 single-shift-3). */
2257
#define ENCODE_SINGLE_SHIFT_2						\
  do {									\
    /* 7-bit form is ESC 'N'; otherwise the single SS2 byte.  */	\
    if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)			\
      {									\
	*dst++ = ISO_CODE_ESC;						\
	*dst++ = 'N';							\
      }									\
    else								\
      {									\
	*dst++ = ISO_CODE_SS2;						\
      }									\
    /* Flag that the next character is single-shifted.  */		\
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;			\
  } while (0)

#define ENCODE_SINGLE_SHIFT_3						\
  do {									\
    /* 7-bit form is ESC 'O'; otherwise the single SS3 byte.  */	\
    if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)			\
      {									\
	*dst++ = ISO_CODE_ESC;						\
	*dst++ = 'O';							\
      }									\
    else								\
      {									\
	*dst++ = ISO_CODE_SS3;						\
      }									\
    /* Flag that the next character is single-shifted.  */		\
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;			\
  } while (0)
2275
2276 /* The following four macros produce codes (control character or
2277 escape sequence) for ISO2022 locking-shift functions (shift-in,
2278 shift-out, locking-shift-2, and locking-shift-3). */
2279
/* SI: invoke graphic register 0 to graphic plane 0.  */
#define ENCODE_SHIFT_IN					\
  do {							\
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;		\
    *dst++ = ISO_CODE_SI;				\
  } while (0)

/* SO: invoke graphic register 1 to graphic plane 0.  */
#define ENCODE_SHIFT_OUT				\
  do {							\
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;		\
    *dst++ = ISO_CODE_SO;				\
  } while (0)

/* ESC 'n': invoke graphic register 2 to graphic plane 0.  */
#define ENCODE_LOCKING_SHIFT_2				\
  do {							\
    *dst++ = ISO_CODE_ESC;				\
    *dst++ = 'n';					\
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;		\
  } while (0)

/* ESC 'o': invoke graphic register 3 to graphic plane 0.  */
#define ENCODE_LOCKING_SHIFT_3				\
  do {							\
    *dst++ = ISO_CODE_ESC;				\
    *dst++ = 'o';					\
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;		\
  } while (0)
2303
2304 /* Produce codes for a DIMENSION1 character whose character set is
2305 CHARSET and whose position-code is C1. Designation and invocation
2306 sequences are also produced in advance if necessary. */
2307
#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)			\
  do {									\
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))			\
      {									\
	/* A single shift is pending: emit the code in 7-bit or	\
	   8-bit form and clear the pending shift.  */			\
	*dst++ = ((coding->flags & CODING_FLAG_ISO_SEVEN_BITS)		\
		  ? (c1 & 0x7F) : (c1 | 0x80));				\
	CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;			\
	break;								\
      }									\
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))		\
      {									\
	/* CHARSET is on graphic plane 0: high bit cleared.  */		\
	*dst++ = c1 & 0x7F;						\
	break;								\
      }									\
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))		\
      {									\
	/* CHARSET is on graphic plane 1: high bit set.  */		\
	*dst++ = c1 | 0x80;						\
	break;								\
      }									\
    /* CHARSET is on neither plane.  Produce the designation and/or	\
       invocation codes, then go round again to emit the		\
       character itself.  */						\
    dst = encode_invocation_designation (charset, coding, dst);	\
  } while (1)
2336
2337 /* Produce codes for a DIMENSION2 character whose character set is
2338 CHARSET and whose position-codes are C1 and C2. Designation and
2339 invocation codes are also produced in advance if necessary. */
2340
#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)		\
  do {									\
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))			\
      {									\
	/* A single shift is pending: emit both codes in 7-bit or	\
	   8-bit form and clear the pending shift.  */			\
	if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)			\
	  {								\
	    *dst++ = c1 & 0x7F;						\
	    *dst++ = c2 & 0x7F;						\
	  }								\
	else								\
	  {								\
	    *dst++ = c1 | 0x80;						\
	    *dst++ = c2 | 0x80;						\
	  }								\
	CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;			\
	break;								\
      }									\
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))		\
      {									\
	/* CHARSET is on graphic plane 0: high bits cleared.  */	\
	*dst++ = c1 & 0x7F;						\
	*dst++ = c2 & 0x7F;						\
	break;								\
      }									\
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))		\
      {									\
	/* CHARSET is on graphic plane 1: high bits set.  */		\
	*dst++ = c1 | 0x80;						\
	*dst++ = c2 | 0x80;						\
	break;								\
      }									\
    /* CHARSET is on neither plane.  Produce the designation and/or	\
       invocation codes, then go round again to emit the		\
       character itself.  */						\
    dst = encode_invocation_designation (charset, coding, dst);	\
  } while (1)
2369
/* Encode character C at DST.  C is split into a charset and one or
   two position codes.  A defined charset is emitted through the
   DIMENSION1/DIMENSION2 macros, after the optional Japanese
   substitutions selected by CODING's flags.  For an undefined
   charset the split codes are written out as they are.  */

#define ENCODE_ISO_CHARACTER(c)						\
  do {									\
    int charset, c1, c2;						\
									\
    SPLIT_CHAR (c, charset, c1, c2);					\
    if (! CHARSET_DEFINED_P (charset))					\
      {									\
	*dst++ = c1;							\
	if (c2 >= 0)							\
	  *dst++ = c2;							\
      }									\
    else if (CHARSET_DIMENSION (charset) == 1)				\
      {									\
	/* CODING_FLAG_ISO_USE_ROMAN: encode ASCII as		\
	   latin-jisx0201.  */						\
	if (charset == CHARSET_ASCII					\
	    && coding->flags & CODING_FLAG_ISO_USE_ROMAN)		\
	  charset = charset_latin_jisx0201;				\
	ENCODE_ISO_CHARACTER_DIMENSION1 (charset, c1);			\
      }									\
    else								\
      {									\
	/* CODING_FLAG_ISO_USE_OLDJIS: encode jisx0208 as		\
	   jisx0208-1978.  */						\
	if (charset == charset_jisx0208					\
	    && coding->flags & CODING_FLAG_ISO_USE_OLDJIS)		\
	  charset = charset_jisx0208_1978;				\
	ENCODE_ISO_CHARACTER_DIMENSION2 (charset, c1, c2);		\
      }									\
  } while (0)

2399
2400
2401 /* Instead of encoding character C, produce one or two `?'s. */
2402
#define ENCODE_UNSAFE_CHARACTER(c)					\
  do {									\
    /* Two replacement characters when the charset of C has a width	\
       greater than 1, otherwise one.  */				\
    int unsafe_repeat = CHARSET_WIDTH (CHAR_CHARSET (c)) > 1 ? 2 : 1;	\
									\
    while (unsafe_repeat-- > 0)						\
      ENCODE_ISO_CHARACTER (CODING_REPLACEMENT_CHARACTER);		\
  } while (0)
2409
2410
2411 /* Produce designation and invocation codes at a place pointed by DST
2412 to use CHARSET. The element `spec.iso2022' of *CODING is updated.
2413 Return new DST. */
2414
2415 unsigned char *
2416 encode_invocation_designation (charset, coding, dst)
2417 int charset;
2418 struct coding_system *coding;
2419 unsigned char *dst;
2420 {
2421 int reg; /* graphic register number */
2422
2423 /* At first, check designations. */
2424 for (reg = 0; reg < 4; reg++)
2425 if (charset == CODING_SPEC_ISO_DESIGNATION (coding, reg))
2426 break;
2427
2428 if (reg >= 4)
2429 {
2430 /* CHARSET is not yet designated to any graphic registers. */
2431 /* At first check the requested designation. */
2432 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
2433 if (reg == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
2434 /* Since CHARSET requests no special designation, designate it
2435 to graphic register 0. */
2436 reg = 0;
2437
2438 ENCODE_DESIGNATION (charset, reg, coding);
2439 }
2440
2441 if (CODING_SPEC_ISO_INVOCATION (coding, 0) != reg
2442 && CODING_SPEC_ISO_INVOCATION (coding, 1) != reg)
2443 {
2444 /* Since the graphic register REG is not invoked to any graphic
2445 planes, invoke it to graphic plane 0. */
2446 switch (reg)
2447 {
2448 case 0: /* graphic register 0 */
2449 ENCODE_SHIFT_IN;
2450 break;
2451
2452 case 1: /* graphic register 1 */
2453 ENCODE_SHIFT_OUT;
2454 break;
2455
2456 case 2: /* graphic register 2 */
2457 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2458 ENCODE_SINGLE_SHIFT_2;
2459 else
2460 ENCODE_LOCKING_SHIFT_2;
2461 break;
2462
2463 case 3: /* graphic register 3 */
2464 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2465 ENCODE_SINGLE_SHIFT_3;
2466 else
2467 ENCODE_LOCKING_SHIFT_3;
2468 break;
2469 }
2470 }
2471
2472 return dst;
2473 }
2474
2475 /* Produce 2-byte codes for encoded composition rule RULE. */
2476
#define ENCODE_COMPOSITION_RULE(rule)				\
  do {								\
    int gref, nref;						\
								\
    COMPOSITION_DECODE_RULE (rule, gref, nref);			\
    /* First byte carries GREF, second byte carries NREF.  */	\
    dst[0] = 32 + 81 + gref;					\
    dst[1] = 32 + nref;						\
    dst += 2;							\
  } while (0)
2484
2485 /* Produce codes for indicating the start of a composition sequence
2486 (ESC 0, ESC 3, or ESC 4). DATA points to an array of integers
2487 which specify information about the composition. See the comment
2488 in coding.h for the format of DATA. */
2489
#define ENCODE_COMPOSITION_START(coding, data)				\
  do {									\
    /* DATA[3] is stored as the current composition method.  */	\
    coding->composing = data[3];					\
    *dst++ = ISO_CODE_ESC;						\
    if (coding->composing != COMPOSITION_RELATIVE)			\
      {									\
	/* ESC 3 for COMPOSITION_WITH_ALTCHARS, ESC 4 for anything	\
	   else.  Components start 4 entries into this record and	\
	   the first one is not a rule.  */				\
	if (coding->composing == COMPOSITION_WITH_ALTCHARS)		\
	  *dst++ = '3';							\
	else								\
	  *dst++ = '4';							\
	coding->cmp_data_index = coding->cmp_data_start + 4;		\
	coding->composition_rule_follows = 0;				\
      }									\
    else								\
      *dst++ = '0';							\
  } while (0)
2504
2505 /* Produce codes for indicating the end of the current composition. */
2506
#define ENCODE_COMPOSITION_END(coding, data)				\
  do {									\
    *dst++ = ISO_CODE_ESC;						\
    *dst++ = '1';							\
    coding->composing = COMPOSITION_NO;					\
    /* DATA[0] is added to the start index, stepping past this		\
       composition's record.  */					\
    coding->cmp_data_start += data[0];					\
    /* When the current block is used up and another one is chained	\
       after it, continue from the head of the next block.  */		\
    if (coding->cmp_data->next						\
	&& coding->cmp_data_start == coding->cmp_data->used)		\
      {									\
	coding->cmp_data = coding->cmp_data->next;			\
	coding->cmp_data_start = 0;					\
      }									\
  } while (0)
2520
2521 /* Produce composition start sequence ESC 0. Here, this sequence
2522 doesn't mean the start of a new composition but means that we have
2523 just produced components (alternate chars and composition rules) of
2524 the composition and the actual text follows in SRC. */
2525
#define ENCODE_COMPOSITION_FAKE_START(coding)			\
  do {								\
    /* From here on behave as in a relative composition.  */	\
    coding->composing = COMPOSITION_RELATIVE;			\
    dst[0] = ISO_CODE_ESC;					\
    dst[1] = '0';						\
    dst += 2;							\
  } while (0)
2532
2533 /* The following three macros produce codes for indicating direction
2534 of text. */
/* Produce CSI at DST: ESC '[' when CODING has the 7-bit flag set,
   otherwise the single byte ISO_CODE_CSI.  The flag is tested with
   `&', as the single-shift macros do; comparing the whole flag word
   with `==' only matched when no other flag was set.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER		\
  do {							\
    if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)	\
      *dst++ = ISO_CODE_ESC, *dst++ = '[';		\
    else						\
      *dst++ = ISO_CODE_CSI;				\
  } while (0)

/* Produce CSI '2' ']', which the decoder treats as the start of
   right-to-left direction.  Written as a self-contained statement so
   that it expands to valid C.  */
#define ENCODE_DIRECTION_R2L			\
  do {						\
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;		\
    *dst++ = '2';				\
    *dst++ = ']';				\
  } while (0)

/* Produce CSI '0' ']', which the decoder treats as the end of the
   current direction, i.e. back to left-to-right.  */
#define ENCODE_DIRECTION_L2R			\
  do {						\
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;		\
    *dst++ = '0';				\
    *dst++ = ']';				\
  } while (0)
2548
2549 /* Produce codes for designation and invocation to reset the graphic
2550 planes and registers to initial state. */
#define ENCODE_RESET_PLANE_AND_REGISTER					\
  do {									\
    int reg;								\
									\
    /* Bring graphic plane 0 back to register 0 if needed.  */		\
    if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0)			\
      ENCODE_SHIFT_IN;							\
									\
    /* Re-designate every register that has an initial designation	\
       and currently holds something else.  */				\
    for (reg = 0; reg < 4; reg++)					\
      {									\
	int initial = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg); \
									\
	if (initial < 0							\
	    || CODING_SPEC_ISO_DESIGNATION (coding, reg) == initial)	\
	  continue;							\
	ENCODE_DESIGNATION (initial, reg, coding);			\
      }									\
  } while (0)
2563
2564 /* Produce designation sequences of charsets in the line started from
2565 SRC to a place pointed by DST, and return updated DST.
2566
2567 If the current block ends before any end-of-line, we may fail to
2568 find all the necessary designations. */
2569
2570 static unsigned char *
2571 encode_designation_at_bol (coding, translation_table, src, src_end, dst)
2572 struct coding_system *coding;
2573 Lisp_Object translation_table;
2574 unsigned char *src, *src_end, *dst;
2575 {
2576 int charset, c, found = 0, reg;
2577 /* Table of charsets to be designated to each graphic register. */
2578 int r[4];
2579
2580 for (reg = 0; reg < 4; reg++)
2581 r[reg] = -1;
2582
2583 while (found < 4)
2584 {
2585 ONE_MORE_CHAR (c);
2586 if (c == '\n')
2587 break;
2588
2589 charset = CHAR_CHARSET (c);
2590 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
2591 if (reg != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION && r[reg] < 0)
2592 {
2593 found++;
2594 r[reg] = charset;
2595 }
2596 }
2597
2598 label_end_of_loop:
2599 if (found)
2600 {
2601 for (reg = 0; reg < 4; reg++)
2602 if (r[reg] >= 0
2603 && CODING_SPEC_ISO_DESIGNATION (coding, reg) != r[reg])
2604 ENCODE_DESIGNATION (r[reg], reg, coding);
2605 }
2606
2607 return dst;
2608 }
2609
2610 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
2611
2612 static void
2613 encode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
2614 struct coding_system *coding;
2615 unsigned char *source, *destination;
2616 int src_bytes, dst_bytes;
2617 {
2618 unsigned char *src = source;
2619 unsigned char *src_end = source + src_bytes;
2620 unsigned char *dst = destination;
2621 unsigned char *dst_end = destination + dst_bytes;
2622 /* Since the maximum bytes produced by each loop is 20, we subtract 19
2623 from DST_END to assure overflow checking is necessary only at the
2624 head of loop. */
2625 unsigned char *adjusted_dst_end = dst_end - 19;
2626 /* SRC_BASE remembers the start position in source in each loop.
2627 The loop will be exited when there's not enough source text to
2628 analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
2629 there's not enough destination area to produce encoded codes
2630 (within macro EMIT_BYTES). */
2631 unsigned char *src_base;
2632 int c;
2633 Lisp_Object translation_table;
2634 Lisp_Object safe_chars;
2635
2636 if (coding->flags & CODING_FLAG_ISO_SAFE)
2637 coding->mode |= CODING_MODE_INHIBIT_UNENCODABLE_CHAR;
2638
2639 safe_chars = coding_safe_chars (coding->symbol);
2640
2641 if (NILP (Venable_character_translation))
2642 translation_table = Qnil;
2643 else
2644 {
2645 translation_table = coding->translation_table_for_encode;
2646 if (NILP (translation_table))
2647 translation_table = Vstandard_translation_table_for_encode;
2648 }
2649
2650 coding->consumed_char = 0;
2651 coding->errors = 0;
2652 while (1)
2653 {
2654 src_base = src;
2655
2656 if (dst >= (dst_bytes ? adjusted_dst_end : (src - 19)))
2657 {
2658 coding->result = CODING_FINISH_INSUFFICIENT_DST;
2659 break;
2660 }
2661
2662 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL
2663 && CODING_SPEC_ISO_BOL (coding))
2664 {
2665 /* We have to produce designation sequences if any now. */
2666 dst = encode_designation_at_bol (coding, translation_table,
2667 src, src_end, dst);
2668 CODING_SPEC_ISO_BOL (coding) = 0;
2669 }
2670
2671 /* Check composition start and end. */
2672 if (coding->composing != COMPOSITION_DISABLED
2673 && coding->cmp_data_start < coding->cmp_data->used)
2674 {
2675 struct composition_data *cmp_data = coding->cmp_data;
2676 int *data = cmp_data->data + coding->cmp_data_start;
2677 int this_pos = cmp_data->char_offset + coding->consumed_char;
2678
2679 if (coding->composing == COMPOSITION_RELATIVE)
2680 {
2681 if (this_pos == data[2])
2682 {
2683 ENCODE_COMPOSITION_END (coding, data);
2684 cmp_data = coding->cmp_data;
2685 data = cmp_data->data + coding->cmp_data_start;
2686 }
2687 }
2688 else if (COMPOSING_P (coding))
2689 {
2690 /* COMPOSITION_WITH_ALTCHARS or COMPOSITION_WITH_RULE_ALTCHAR */
2691 if (coding->cmp_data_index == coding->cmp_data_start + data[0])
2692 /* We have consumed components of the composition.
2693 What follows in SRC is the composition's base
2694 text. */
2695 ENCODE_COMPOSITION_FAKE_START (coding);
2696 else
2697 {
2698 int c = cmp_data->data[coding->cmp_data_index++];
2699 if (coding->composition_rule_follows)
2700 {
2701 ENCODE_COMPOSITION_RULE (c);
2702 coding->composition_rule_follows = 0;
2703 }
2704 else
2705 {
2706 if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR
2707 && ! CODING_SAFE_CHAR_P (safe_chars, c))
2708 ENCODE_UNSAFE_CHARACTER (c);
2709 else
2710 ENCODE_ISO_CHARACTER (c);
2711 if (coding->composing == COMPOSITION_WITH_RULE_ALTCHARS)
2712 coding->composition_rule_follows = 1;
2713 }
2714 continue;
2715 }
2716 }
2717 if (!COMPOSING_P (coding))
2718 {
2719 if (this_pos == data[1])
2720 {
2721 ENCODE_COMPOSITION_START (coding, data);
2722 continue;
2723 }
2724 }
2725 }
2726
2727 ONE_MORE_CHAR (c);
2728
2729 /* Now encode the character C. */
2730 if (c < 0x20 || c == 0x7F)
2731 {
2732 if (c == '\r')
2733 {
2734 if (! (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
2735 {
2736 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
2737 ENCODE_RESET_PLANE_AND_REGISTER;
2738 *dst++ = c;
2739 continue;
2740 }
2741 /* fall down to treat '\r' as '\n' ... */
2742 c = '\n';
2743 }
2744 if (c == '\n')
2745 {
2746 if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
2747 ENCODE_RESET_PLANE_AND_REGISTER;
2748 if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL)
2749 bcopy (coding->spec.iso2022.initial_designation,
2750 coding->spec.iso2022.current_designation,
2751 sizeof coding->spec.iso2022.initial_designation);
2752 if (coding->eol_type == CODING_EOL_LF
2753 || coding->eol_type == CODING_EOL_UNDECIDED)
2754 *dst++ = ISO_CODE_LF;
2755 else if (coding->eol_type == CODING_EOL_CRLF)
2756 *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
2757 else
2758 *dst++ = ISO_CODE_CR;
2759 CODING_SPEC_ISO_BOL (coding) = 1;
2760 }
2761 else
2762 {
2763 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
2764 ENCODE_RESET_PLANE_AND_REGISTER;
2765 *dst++ = c;
2766 }
2767 }
2768 else if (ASCII_BYTE_P (c))
2769 ENCODE_ISO_CHARACTER (c);
2770 else if (SINGLE_BYTE_CHAR_P (c))
2771 {
2772 *dst++ = c;
2773 coding->errors++;
2774 }
2775 else if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR
2776 && ! CODING_SAFE_CHAR_P (safe_chars, c))
2777 ENCODE_UNSAFE_CHARACTER (c);
2778 else
2779 ENCODE_ISO_CHARACTER (c);
2780
2781 coding->consumed_char++;
2782 }
2783
2784 label_end_of_loop:
2785 coding->consumed = src_base - source;
2786 coding->produced = coding->produced_char = dst - destination;
2787 }
2788
2789 \f
2790 /*** 4. SJIS and BIG5 handlers ***/
2791
2792 /* Although SJIS and BIG5 are not ISO coding systems, they are used
2793 quite widely. So, for the moment, Emacs supports them in the bare
2794 C code. But, in the future, they may be supported only by CCL. */
2795
2796 /* SJIS is a coding system encoding three character sets: ASCII, right
2797 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
2798 as is. A character of charset katakana-jisx0201 is encoded by
2799 "position-code + 0x80". A character of charset japanese-jisx0208
2800 is encoded in 2-byte but two position-codes are divided and shifted
2801 so that it fits in the range below.
2802
2803 --- CODE RANGE of SJIS ---
2804 (character set) (range)
2805 ASCII 0x00 .. 0x7F
2806 KATAKANA-JISX0201 0xA1 .. 0xDF
2807 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF
2808 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
2809 -------------------------------
2810
2811 */
2812
2813 /* BIG5 is a coding system encoding two character sets: ASCII and
2814 Big5. An ASCII character is encoded as is. Big5 is a two-byte
2815 character set and is encoded in two bytes.
2816
2817 --- CODE RANGE of BIG5 ---
2818 (character set) (range)
2819 ASCII 0x00 .. 0x7F
2820 Big5 (1st byte) 0xA1 .. 0xFE
2821 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
2822 --------------------------
2823
2824 Since the number of characters in Big5 is larger than maximum
2825 characters in Emacs' charset (96x96), it can't be handled as one
2826 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
2827 and `charset-big5-2'. Both are DIMENSION2 and CHARS94. The former
2828 contains frequently used characters and the latter contains less
2829 frequently used characters. */
2830
2831 /* Macros to decode or encode a character of Big5 in BIG5. B1 and B2
2832 are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
2833 C1 and C2 are the 1st and 2nd position-codes of Emacs' internal
2834 format. CHARSET is `charset_big5_1' or `charset_big5_2'. */
2835
2836 /* Number of Big5 characters which have the same code in 1st byte. */
/* Trail bytes 0xA1..0xFE plus trail bytes 0x40..0x7E.  */
#define BIG5_SAME_ROW ((0xFF - 0xA1) + (0x7F - 0x40))
2838
/* Convert the BIG5 byte pair B1, B2 into CHARSET (charset_big5_1 or
   charset_big5_2) and the position codes C1, C2.  Lead bytes below
   0xC9 belong to charset_big5_1, the rest to charset_big5_2.

   Every argument is parenthesized so that expression arguments
   expand correctly.  B1 and B2 are fully read before C1 and C2 are
   written, so the same variables may be passed for both.  */
#define DECODE_BIG5(b1, b2, charset, c1, c2)				\
  do {									\
    unsigned int temp							\
      = (((b1) - 0xA1) * BIG5_SAME_ROW					\
	 + (b2) - ((b2) < 0x7F ? 0x40 : 0x62));				\
    if ((b1) < 0xC9)							\
      (charset) = charset_big5_1;					\
    else								\
      {									\
	(charset) = charset_big5_2;					\
	temp -= (0xC9 - 0xA1) * BIG5_SAME_ROW;				\
      }									\
    (c1) = temp / (0xFF - 0xA1) + 0x21;					\
    (c2) = temp % (0xFF - 0xA1) + 0x21;					\
  } while (0)
2853
/* Convert CHARSET (charset_big5_1 or charset_big5_2) and the
   position codes C1, C2 into the BIG5 byte pair B1, B2.  This is the
   inverse of DECODE_BIG5.

   Every argument is parenthesized so that expression arguments
   expand correctly.  C1 and C2 are fully read before B1 and B2 are
   written, so the same variables may be passed for both.  */
#define ENCODE_BIG5(charset, c1, c2, b1, b2)				\
  do {									\
    unsigned int temp = ((c1) - 0x21) * (0xFF - 0xA1) + ((c2) - 0x21);	\
    if ((charset) == charset_big5_2)					\
      temp += BIG5_SAME_ROW * (0xC9 - 0xA1);				\
    (b1) = temp / BIG5_SAME_ROW + 0xA1;					\
    (b2) = temp % BIG5_SAME_ROW;					\
    /* The first 0x3F trail indices map to 0x40..0x7E, the		\
       remaining ones to 0xA1..0xFE.  */				\
    (b2) += (b2) < 0x3F ? 0x40 : 0x62;					\
  } while (0)
2863
2864 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2865 Check if a text is encoded in SJIS. If it is, return
2866 CODING_CATEGORY_MASK_SJIS, else return 0. */
2867
2868 static int
2869 detect_coding_sjis (src, src_end, multibytep)
2870 unsigned char *src, *src_end;
2871 int multibytep;
2872 {
2873 int c;
2874 /* Dummy for ONE_MORE_BYTE. */
2875 struct coding_system dummy_coding;
2876 struct coding_system *coding = &dummy_coding;
2877
2878 while (1)
2879 {
2880 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2881 if (c < 0x80)
2882 continue;
2883 if (c == 0x80 || c == 0xA0 || c > 0xEF)
2884 return 0;
2885 if (c <= 0x9F || c >= 0xE0)
2886 {
2887 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2888 if (c < 0x40 || c == 0x7F || c > 0xFC)
2889 return 0;
2890 }
2891 }
2892 label_end_of_loop:
2893 return CODING_CATEGORY_MASK_SJIS;
2894 }
2895
2896 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2897 Check if a text is encoded in BIG5. If it is, return
2898 CODING_CATEGORY_MASK_BIG5, else return 0. */
2899
2900 static int
2901 detect_coding_big5 (src, src_end, multibytep)
2902 unsigned char *src, *src_end;
2903 int multibytep;
2904 {
2905 int c;
2906 /* Dummy for ONE_MORE_BYTE. */
2907 struct coding_system dummy_coding;
2908 struct coding_system *coding = &dummy_coding;
2909
2910 while (1)
2911 {
2912 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2913 if (c < 0x80)
2914 continue;
2915 if (c < 0xA1 || c > 0xFE)
2916 return 0;
2917 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2918 if (c < 0x40 || (c > 0x7F && c < 0xA1) || c > 0xFE)
2919 return 0;
2920 }
2921 label_end_of_loop:
2922 return CODING_CATEGORY_MASK_BIG5;
2923 }
2924
2925 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2926 Check if a text is encoded in UTF-8. If it is, return
2927 CODING_CATEGORY_MASK_UTF_8, else return 0. */
2928
2929 #define UTF_8_1_OCTET_P(c) ((c) < 0x80)
2930 #define UTF_8_EXTRA_OCTET_P(c) (((c) & 0xC0) == 0x80)
2931 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
2932 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
2933 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
2934 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
2935 #define UTF_8_6_OCTET_LEADING_P(c) (((c) & 0xFE) == 0xFC)
2936
2937 static int
2938 detect_coding_utf_8 (src, src_end, multibytep)
2939 unsigned char *src, *src_end;
2940 int multibytep;
2941 {
2942 unsigned char c;
2943 int seq_maybe_bytes;
2944 /* Dummy for ONE_MORE_BYTE. */
2945 struct coding_system dummy_coding;
2946 struct coding_system *coding = &dummy_coding;
2947
2948 while (1)
2949 {
2950 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2951 if (UTF_8_1_OCTET_P (c))
2952 continue;
2953 else if (UTF_8_2_OCTET_LEADING_P (c))
2954 seq_maybe_bytes = 1;
2955 else if (UTF_8_3_OCTET_LEADING_P (c))
2956 seq_maybe_bytes = 2;
2957 else if (UTF_8_4_OCTET_LEADING_P (c))
2958 seq_maybe_bytes = 3;
2959 else if (UTF_8_5_OCTET_LEADING_P (c))
2960 seq_maybe_bytes = 4;
2961 else if (UTF_8_6_OCTET_LEADING_P (c))
2962 seq_maybe_bytes = 5;
2963 else
2964 return 0;
2965
2966 do
2967 {
2968 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2969 if (!UTF_8_EXTRA_OCTET_P (c))
2970 return 0;
2971 seq_maybe_bytes--;
2972 }
2973 while (seq_maybe_bytes > 0);
2974 }
2975
2976 label_end_of_loop:
2977 return CODING_CATEGORY_MASK_UTF_8;
2978 }
2979
/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
   Check if a text is encoded in UTF-16 by looking at its first two
   bytes.  If they are the byte order mark in big endian order (0xFE
   0xFF), return CODING_CATEGORY_MASK_UTF_16_BE; if they are the mark
   in little endian order (0xFF 0xFE), return
   CODING_CATEGORY_MASK_UTF_16_LE; else return 0.  */
2985
/* Predicates on a 16-bit UTF-16 code unit VAL.  */

/* Non-zero if VAL is one of the two code units that are never valid
   characters (0xFFFE and 0xFFFF).  */
#define UTF_16_INVALID_P(val)   \
  (((val) == 0xFFFE)            \
   || ((val) == 0xFFFF))

/* Non-zero if VAL is a high (leading) surrogate, 0xD800..0xDBFF.
   The top six bits select the surrogate block, hence the mask
   0xFC00; masking with 0xD800 would also match 0xDC00..0xDFFF and
   0xF800..0xFFFF.  */
#define UTF_16_HIGH_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xD800)

/* Non-zero if VAL is a low (trailing) surrogate, 0xDC00..0xDFFF.  */
#define UTF_16_LOW_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xDC00)
2995
2996 static int
2997 detect_coding_utf_16 (src, src_end, multibytep)
2998 unsigned char *src, *src_end;
2999 int multibytep;
3000 {
3001 unsigned char c1, c2;
3002 /* Dummy for ONE_MORE_BYTE_CHECK_MULTIBYTE. */
3003 struct coding_system dummy_coding;
3004 struct coding_system *coding = &dummy_coding;
3005
3006 ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep);
3007 ONE_MORE_BYTE_CHECK_MULTIBYTE (c2, multibytep);
3008
3009 if ((c1 == 0xFF) && (c2 == 0xFE))
3010 return CODING_CATEGORY_MASK_UTF_16_LE;
3011 else if ((c1 == 0xFE) && (c2 == 0xFF))
3012 return CODING_CATEGORY_MASK_UTF_16_BE;
3013
3014 label_end_of_loop:
3015 return 0;
3016 }
3017
3018 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
3019 If SJIS_P is 1, decode SJIS text, else decode BIG5 test. */
3020
3021 static void
3022 decode_coding_sjis_big5 (coding, source, destination,
3023 src_bytes, dst_bytes, sjis_p)
3024 struct coding_system *coding;
3025 unsigned char *source, *destination;
3026 int src_bytes, dst_bytes;
3027 int sjis_p;
3028 {
3029 unsigned char *src = source;
3030 unsigned char *src_end = source + src_bytes;
3031 unsigned char *dst = destination;
3032 unsigned char *dst_end = destination + dst_bytes;
3033 /* SRC_BASE remembers the start position in source in each loop.
3034 The loop will be exited when there's not enough source code
3035 (within macro ONE_MORE_BYTE), or when there's not enough
3036 destination area to produce a character (within macro
3037 EMIT_CHAR). */
3038 unsigned char *src_base;
3039 Lisp_Object translation_table;
3040
3041 if (NILP (Venable_character_translation))
3042 translation_table = Qnil;
3043 else
3044 {
3045 translation_table = coding->translation_table_for_decode;
3046 if (NILP (translation_table))
3047 translation_table = Vstandard_translation_table_for_decode;
3048 }
3049
3050 coding->produced_char = 0;
3051 while (1)
3052 {
3053 int c, charset, c1, c2;
3054
3055 src_base = src;
3056 ONE_MORE_BYTE (c1);
3057
3058 if (c1 < 0x80)
3059 {
3060 charset = CHARSET_ASCII;
3061 if (c1 < 0x20)
3062 {
3063 if (c1 == '\r')
3064 {
3065 if (coding->eol_type == CODING_EOL_CRLF)
3066 {
3067 ONE_MORE_BYTE (c2);
3068 if (c2 == '\n')
3069 c1 = c2;
3070 else
3071 /* To process C2 again, SRC is subtracted by 1. */
3072 src--;
3073 }
3074 else if (coding->eol_type == CODING_EOL_CR)
3075 c1 = '\n';
3076 }
3077 else if (c1 == '\n'
3078 && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
3079 && (coding->eol_type == CODING_EOL_CR
3080 || coding->eol_type == CODING_EOL_CRLF))
3081 {
3082 coding->result = CODING_FINISH_INCONSISTENT_EOL;
3083 goto label_end_of_loop;
3084 }
3085 }
3086 }
3087 else
3088 {
3089 if (sjis_p)
3090 {
3091 if (c1 == 0x80 || c1 == 0xA0 || c1 > 0xEF)
3092 goto label_invalid_code;
3093 if (c1 <= 0x9F || c1 >= 0xE0)
3094 {
3095 /* SJIS -> JISX0208 */
3096 ONE_MORE_BYTE (c2);
3097 if (c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
3098 goto label_invalid_code;
3099 DECODE_SJIS (c1, c2, c1, c2);
3100 charset = charset_jisx0208;
3101 }
3102 else
3103 /* SJIS -> JISX0201-Kana */
3104 charset = charset_katakana_jisx0201;
3105 }
3106 else
3107 {
3108 /* BIG5 -> Big5 */
3109 if (c1 < 0xA0 || c1 > 0xFE)
3110 goto label_invalid_code;
3111 ONE_MORE_BYTE (c2);
3112 if (c2 < 0x40 || (c2 > 0x7E && c2 < 0xA1) || c2 > 0xFE)
3113 goto label_invalid_code;
3114 DECODE_BIG5 (c1, c2, charset, c1, c2);
3115 }
3116 }
3117
3118 c = DECODE_ISO_CHARACTER (charset, c1, c2);
3119 EMIT_CHAR (c);
3120 continue;
3121
3122 label_invalid_code:
3123 coding->errors++;
3124 src = src_base;
3125 c = *src++;
3126 EMIT_CHAR (c);
3127 }
3128
3129 label_end_of_loop:
3130 coding->consumed = coding->consumed_char = src_base - source;
3131 coding->produced = dst - destination;
3132 return;
3133 }
3134
3135 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
3136 This function can encode charsets `ascii', `katakana-jisx0201',
3137 `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'. We
3138 are sure that all these charsets are registered as official charset
3139 (i.e. do not have extended leading-codes). Characters of other
3140 charsets are produced without any encoding. If SJIS_P is 1, encode
3141 SJIS text, else encode BIG5 text. */
3142
3143 static void
3144 encode_coding_sjis_big5 (coding, source, destination,
3145 src_bytes, dst_bytes, sjis_p)
3146 struct coding_system *coding;
3147 unsigned char *source, *destination;
3148 int src_bytes, dst_bytes;
3149 int sjis_p;
3150 {
3151 unsigned char *src = source;
3152 unsigned char *src_end = source + src_bytes;
3153 unsigned char *dst = destination;
3154 unsigned char *dst_end = destination + dst_bytes;
3155 /* SRC_BASE remembers the start position in source in each loop.
3156 The loop will be exited when there's not enough source text to
3157 analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
3158 there's not enough destination area to produce encoded codes
3159 (within macro EMIT_BYTES). */
3160 unsigned char *src_base;
3161 Lisp_Object translation_table;
3162
3163 if (NILP (Venable_character_translation))
3164 translation_table = Qnil;
3165 else
3166 {
3167 translation_table = coding->translation_table_for_encode;
3168 if (NILP (translation_table))
3169 translation_table = Vstandard_translation_table_for_encode;
3170 }
3171
3172 while (1)
3173 {
3174 int c, charset, c1, c2;
3175
3176 src_base = src;
3177 ONE_MORE_CHAR (c);
3178
3179 /* Now encode the character C. */
3180 if (SINGLE_BYTE_CHAR_P (c))
3181 {
3182 switch (c)
3183 {
3184 case '\r':
3185 if (!(coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
3186 {
3187 EMIT_ONE_BYTE (c);
3188 break;
3189 }
3190 c = '\n';
3191 case '\n':
3192 if (coding->eol_type == CODING_EOL_CRLF)
3193 {
3194 EMIT_TWO_BYTES ('\r', c);
3195 break;
3196 }
3197 else if (coding->eol_type == CODING_EOL_CR)
3198 c = '\r';
3199 default:
3200 EMIT_ONE_BYTE (c);
3201 }
3202 }
3203 else
3204 {
3205 SPLIT_CHAR (c, charset, c1, c2);
3206 if (sjis_p)
3207 {
3208 if (charset == charset_jisx0208
3209 || charset == charset_jisx0208_1978)
3210 {
3211 ENCODE_SJIS (c1, c2, c1, c2);
3212 EMIT_TWO_BYTES (c1, c2);
3213 }
3214 else if (charset == charset_katakana_jisx0201)
3215 EMIT_ONE_BYTE (c1 | 0x80);
3216 else if (charset == charset_latin_jisx0201)
3217 EMIT_ONE_BYTE (c1);
3218 else if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR)
3219 {
3220 EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER);
3221 if (CHARSET_WIDTH (charset) > 1)
3222 EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER);
3223 }
3224 else
3225 /* There's no way other than producing the internal
3226 codes as is. */
3227 EMIT_BYTES (src_base, src);
3228 }
3229 else
3230 {
3231 if (charset == charset_big5_1 || charset == charset_big5_2)
3232 {
3233 ENCODE_BIG5 (charset, c1, c2, c1, c2);
3234 EMIT_TWO_BYTES (c1, c2);
3235 }
3236 else if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR)
3237 {
3238 EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER);
3239 if (CHARSET_WIDTH (charset) > 1)
3240 EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER);
3241 }
3242 else
3243 /* There's no way other than producing the internal
3244 codes as is. */
3245 EMIT_BYTES (src_base, src);
3246 }
3247 }
3248 coding->consumed_char++;
3249 }
3250
3251 label_end_of_loop:
3252 coding->consumed = src_base - source;
3253 coding->produced = coding->produced_char = dst - destination;
3254 }
3255
3256 \f
3257 /*** 5. CCL handlers ***/
3258
3259 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3260 Check if a text is encoded in a coding system of which
3261 encoder/decoder are written in CCL program. If it is, return
3262 CODING_CATEGORY_MASK_CCL, else return 0. */
3263
3264 static int
3265 detect_coding_ccl (src, src_end, multibytep)
3266 unsigned char *src, *src_end;
3267 int multibytep;
3268 {
3269 unsigned char *valid;
3270 int c;
3271 /* Dummy for ONE_MORE_BYTE. */
3272 struct coding_system dummy_coding;
3273 struct coding_system *coding = &dummy_coding;
3274
3275 /* No coding system is assigned to coding-category-ccl. */
3276 if (!coding_system_table[CODING_CATEGORY_IDX_CCL])
3277 return 0;
3278
3279 valid = coding_system_table[CODING_CATEGORY_IDX_CCL]->spec.ccl.valid_codes;
3280 while (1)
3281 {
3282 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
3283 if (! valid[c])
3284 return 0;
3285 }
3286 label_end_of_loop:
3287 return CODING_CATEGORY_MASK_CCL;
3288 }
3289
3290 \f
3291 /*** 6. End-of-line handlers ***/
3292
3293 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
3294
3295 static void
3296 decode_eol (coding, source, destination, src_bytes, dst_bytes)
3297 struct coding_system *coding;
3298 unsigned char *source, *destination;
3299 int src_bytes, dst_bytes;
3300 {
3301 unsigned char *src = source;
3302 unsigned char *dst = destination;
3303 unsigned char *src_end = src + src_bytes;
3304 unsigned char *dst_end = dst + dst_bytes;
3305 Lisp_Object translation_table;
3306 /* SRC_BASE remembers the start position in source in each loop.
3307 The loop will be exited when there's not enough source code
3308 (within macro ONE_MORE_BYTE), or when there's not enough
3309 destination area to produce a character (within macro
3310 EMIT_CHAR). */
3311 unsigned char *src_base;
3312 int c;
3313
3314 translation_table = Qnil;
3315 switch (coding->eol_type)
3316 {
3317 case CODING_EOL_CRLF:
3318 while (1)
3319 {
3320 src_base = src;
3321 ONE_MORE_BYTE (c);
3322 if (c == '\r')
3323 {
3324 ONE_MORE_BYTE (c);
3325 if (c != '\n')
3326 {
3327 src--;
3328 c = '\r';
3329 }
3330 }
3331 else if (c == '\n'
3332 && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL))
3333 {
3334 coding->result = CODING_FINISH_INCONSISTENT_EOL;
3335 goto label_end_of_loop;
3336 }
3337 EMIT_CHAR (c);
3338 }
3339 break;
3340
3341 case CODING_EOL_CR:
3342 while (1)
3343 {
3344 src_base = src;
3345 ONE_MORE_BYTE (c);
3346 if (c == '\n')
3347 {
3348 if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
3349 {
3350 coding->result = CODING_FINISH_INCONSISTENT_EOL;
3351 goto label_end_of_loop;
3352 }
3353 }
3354 else if (c == '\r')
3355 c = '\n';
3356 EMIT_CHAR (c);
3357 }
3358 break;
3359
3360 default: /* no need for EOL handling */
3361 while (1)
3362 {
3363 src_base = src;
3364 ONE_MORE_BYTE (c);
3365 EMIT_CHAR (c);
3366 }
3367 }
3368
3369 label_end_of_loop:
3370 coding->consumed = coding->consumed_char = src_base - source;
3371 coding->produced = dst - destination;
3372 return;
3373 }
3374
3375 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions". Encode
3376 format of end-of-line according to `coding->eol_type'. It also
3377 convert multibyte form 8-bit characters to unibyte if
3378 CODING->src_multibyte is nonzero. If `coding->mode &
3379 CODING_MODE_SELECTIVE_DISPLAY' is nonzero, code '\r' in source text
3380 also means end-of-line. */
3381
3382 static void
3383 encode_eol (coding, source, destination, src_bytes, dst_bytes)
3384 struct coding_system *coding;
3385 const unsigned char *source;
3386 unsigned char *destination;
3387 int src_bytes, dst_bytes;
3388 {
3389 const unsigned char *src = source;
3390 unsigned char *dst = destination;
3391 const unsigned char *src_end = src + src_bytes;
3392 unsigned char *dst_end = dst + dst_bytes;
3393 Lisp_Object translation_table;
3394 /* SRC_BASE remembers the start position in source in each loop.
3395 The loop will be exited when there's not enough source text to
3396 analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
3397 there's not enough destination area to produce encoded codes
3398 (within macro EMIT_BYTES). */
3399 const unsigned char *src_base;
3400 unsigned char *tmp;
3401 int c;
3402 int selective_display = coding->mode & CODING_MODE_SELECTIVE_DISPLAY;
3403
3404 translation_table = Qnil;
3405 if (coding->src_multibyte
3406 && *(src_end - 1) == LEADING_CODE_8_BIT_CONTROL)
3407 {
3408 src_end--;
3409 src_bytes--;
3410 coding->result = CODING_FINISH_INSUFFICIENT_SRC;
3411 }
3412
3413 if (coding->eol_type == CODING_EOL_CRLF)
3414 {
3415 while (src < src_end)
3416 {
3417 src_base = src;
3418 c = *src++;
3419 if (c >= 0x20)
3420 EMIT_ONE_BYTE (c);
3421 else if (c == '\n' || (c == '\r' && selective_display))
3422 EMIT_TWO_BYTES ('\r', '\n');
3423 else
3424 EMIT_ONE_BYTE (c);
3425 }
3426 src_base = src;
3427 label_end_of_loop:
3428 ;
3429 }
3430 else
3431 {
3432 if (!dst_bytes || src_bytes <= dst_bytes)
3433 {
3434 safe_bcopy (src, dst, src_bytes);
3435 src_base = src_end;
3436 dst += src_bytes;
3437 }
3438 else
3439 {
3440 if (coding->src_multibyte
3441 && *(src + dst_bytes - 1) == LEADING_CODE_8_BIT_CONTROL)
3442 dst_bytes--;
3443 safe_bcopy (src, dst, dst_bytes);
3444 src_base = src + dst_bytes;
3445 dst = destination + dst_bytes;
3446 coding->result = CODING_FINISH_INSUFFICIENT_DST;
3447 }
3448 if (coding->eol_type == CODING_EOL_CR)
3449 {
3450 for (tmp = destination; tmp < dst; tmp++)
3451 if (*tmp == '\n') *tmp = '\r';
3452 }
3453 else if (selective_display)
3454 {
3455 for (tmp = destination; tmp < dst; tmp++)
3456 if (*tmp == '\r') *tmp = '\n';
3457 }
3458 }
3459 if (coding->src_multibyte)
3460 dst = destination + str_as_unibyte (destination, dst - destination);
3461
3462 coding->consumed = src_base - source;
3463 coding->produced = dst - destination;
3464 coding->produced_char = coding->produced;
3465 }
3466
3467 \f
3468 /*** 7. C library functions ***/
3469
3470 /* In Emacs Lisp, a coding system is represented by a Lisp symbol which
3471 has a property `coding-system'. The value of this property is a
3472 vector of length 5 (called the coding-vector). Among elements of
3473 this vector, the first (element[0]) and the fifth (element[4])
3474 carry important information for decoding/encoding. Before
3475 decoding/encoding, this information should be set in fields of a
3476 structure of type `coding_system'.
3477
3478 The value of the property `coding-system' can be a symbol of another
3479 subsidiary coding-system. In that case, Emacs gets coding-vector
3480 from that symbol.
3481
3482 `element[0]' contains information to be set in `coding->type'. The
3483 value and its meaning is as follows:
3484
3485 0 -- coding_type_emacs_mule
3486 1 -- coding_type_sjis
3487 2 -- coding_type_iso2022
3488 3 -- coding_type_big5
   4 -- coding_type_ccl encoder/decoder written in CCL
   5 -- coding_type_raw_text
3490 nil -- coding_type_no_conversion
3491 t -- coding_type_undecided (automatic conversion on decoding,
3492 no-conversion on encoding)
3493
3494 `element[4]' contains information to be set in `coding->flags' and
3495 `coding->spec'. The meaning varies by `coding->type'.
3496
   If `coding->type' is `coding_type_iso2022', element[4] is a vector
   of length 32 (of which the first 17 sub-elements are used now).
   Meanings of these sub-elements are:
3500
3501 sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
3502 If the value is an integer of valid charset, the charset is
3503 assumed to be designated to graphic register N initially.
3504
3505 If the value is minus, it is a minus value of charset which
3506 reserves graphic register N, which means that the charset is
3507 not designated initially but should be designated to graphic
3508 register N just before encoding a character in that charset.
3509
3510 If the value is nil, graphic register N is never used on
3511 encoding.
3512
   sub-element[N] where N is 4 through 16: to be set in `coding->flags'
     Each value takes t or nil.  See the section ISO2022 of
     `coding.h' for more information.
3516
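   If `coding->type' is `coding_type_big5', element[4] is non-nil to
   denote BIG5-ETen or nil to denote BIG5-HKU.

   If `coding->type' is `coding_type_ccl', element[4] is a cons whose
   car is the CCL program for decoding and whose cdr is the CCL
   program for encoding.

   If `coding->type' takes the other value, element[4] is ignored.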
3521
3522 Emacs Lisp's coding systems also carry information about format of
3523 end-of-line in a value of property `eol-type'. If the value is
3524 integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
3525 means CODING_EOL_CR. If it is not integer, it should be a vector
3526 of subsidiary coding systems of which property `eol-type' has one
3527 of the above values.
3528
3529 */
3530
3531 /* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
3532 and set it in CODING. If CODING_SYSTEM_SYMBOL is invalid, CODING
3533 is setup so that no conversion is necessary and return -1, else
3534 return 0. */
3535
3536 int
3537 setup_coding_system (coding_system, coding)
3538 Lisp_Object coding_system;
3539 struct coding_system *coding;
3540 {
3541 Lisp_Object coding_spec, coding_type, eol_type, plist;
3542 Lisp_Object val;
3543
3544 /* At first, zero clear all members. */
3545 bzero (coding, sizeof (struct coding_system));
3546
3547 /* Initialize some fields required for all kinds of coding systems. */
3548 coding->symbol = coding_system;
3549 coding->heading_ascii = -1;
3550 coding->post_read_conversion = coding->pre_write_conversion = Qnil;
3551 coding->composing = COMPOSITION_DISABLED;
3552 coding->cmp_data = NULL;
3553
3554 if (NILP (coding_system))
3555 goto label_invalid_coding_system;
3556
3557 coding_spec = Fget (coding_system, Qcoding_system);
3558
3559 if (!VECTORP (coding_spec)
3560 || XVECTOR (coding_spec)->size != 5
3561 || !CONSP (XVECTOR (coding_spec)->contents[3]))
3562 goto label_invalid_coding_system;
3563
3564 eol_type = inhibit_eol_conversion ? Qnil : Fget (coding_system, Qeol_type);
3565 if (VECTORP (eol_type))
3566 {
3567 coding->eol_type = CODING_EOL_UNDECIDED;
3568 coding->common_flags = CODING_REQUIRE_DETECTION_MASK;
3569 }
3570 else if (XFASTINT (eol_type) == 1)
3571 {
3572 coding->eol_type = CODING_EOL_CRLF;
3573 coding->common_flags
3574 = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3575 }
3576 else if (XFASTINT (eol_type) == 2)
3577 {
3578 coding->eol_type = CODING_EOL_CR;
3579 coding->common_flags
3580 = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3581 }
3582 else
3583 coding->eol_type = CODING_EOL_LF;
3584
3585 coding_type = XVECTOR (coding_spec)->contents[0];
3586 /* Try short cut. */
3587 if (SYMBOLP (coding_type))
3588 {
3589 if (EQ (coding_type, Qt))
3590 {
3591 coding->type = coding_type_undecided;
3592 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
3593 }
3594 else
3595 coding->type = coding_type_no_conversion;
3596 /* Initialize this member. Any thing other than
3597 CODING_CATEGORY_IDX_UTF_16_BE and
3598 CODING_CATEGORY_IDX_UTF_16_LE are ok because they have
3599 special treatment in detect_eol. */
3600 coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE;
3601
3602 return 0;
3603 }
3604
3605 /* Get values of coding system properties:
3606 `post-read-conversion', `pre-write-conversion',
3607 `translation-table-for-decode', `translation-table-for-encode'. */
3608 plist = XVECTOR (coding_spec)->contents[3];
3609 /* Pre & post conversion functions should be disabled if
3610 inhibit_eol_conversion is nonzero. This is the case that a code
3611 conversion function is called while those functions are running. */
3612 if (! inhibit_pre_post_conversion)
3613 {
3614 coding->post_read_conversion = Fplist_get (plist, Qpost_read_conversion);
3615 coding->pre_write_conversion = Fplist_get (plist, Qpre_write_conversion);
3616 }
3617 val = Fplist_get (plist, Qtranslation_table_for_decode);
3618 if (SYMBOLP (val))
3619 val = Fget (val, Qtranslation_table_for_decode);
3620 coding->translation_table_for_decode = CHAR_TABLE_P (val) ? val : Qnil;
3621 val = Fplist_get (plist, Qtranslation_table_for_encode);
3622 if (SYMBOLP (val))
3623 val = Fget (val, Qtranslation_table_for_encode);
3624 coding->translation_table_for_encode = CHAR_TABLE_P (val) ? val : Qnil;
3625 val = Fplist_get (plist, Qcoding_category);
3626 if (!NILP (val))
3627 {
3628 val = Fget (val, Qcoding_category_index);
3629 if (INTEGERP (val))
3630 coding->category_idx = XINT (val);
3631 else
3632 goto label_invalid_coding_system;
3633 }
3634 else
3635 goto label_invalid_coding_system;
3636
3637 /* If the coding system has non-nil `composition' property, enable
3638 composition handling. */
3639 val = Fplist_get (plist, Qcomposition);
3640 if (!NILP (val))
3641 coding->composing = COMPOSITION_NO;
3642
3643 switch (XFASTINT (coding_type))
3644 {
3645 case 0:
3646 coding->type = coding_type_emacs_mule;
3647 coding->common_flags
3648 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3649 if (!NILP (coding->post_read_conversion))
3650 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
3651 if (!NILP (coding->pre_write_conversion))
3652 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
3653 break;
3654
3655 case 1:
3656 coding->type = coding_type_sjis;
3657 coding->common_flags
3658 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3659 break;
3660
3661 case 2:
3662 coding->type = coding_type_iso2022;
3663 coding->common_flags
3664 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3665 {
3666 Lisp_Object val, temp;
3667 Lisp_Object *flags;
3668 int i, charset, reg_bits = 0;
3669
3670 val = XVECTOR (coding_spec)->contents[4];
3671
3672 if (!VECTORP (val) || XVECTOR (val)->size != 32)
3673 goto label_invalid_coding_system;
3674
3675 flags = XVECTOR (val)->contents;
3676 coding->flags
3677 = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
3678 | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
3679 | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
3680 | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
3681 | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
3682 | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
3683 | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
3684 | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
3685 | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION)
3686 | (NILP (flags[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL)
3687 | (NILP (flags[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL)
3688 | (NILP (flags[15]) ? 0 : CODING_FLAG_ISO_SAFE)
3689 | (NILP (flags[16]) ? 0 : CODING_FLAG_ISO_LATIN_EXTRA)
3690 );
3691
3692 /* Invoke graphic register 0 to plane 0. */
3693 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
3694 /* Invoke graphic register 1 to plane 1 if we can use full 8-bit. */
3695 CODING_SPEC_ISO_INVOCATION (coding, 1)
3696 = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
3697 /* Not single shifting at first. */
3698 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;
3699 /* Beginning of buffer should also be regarded as bol. */
3700 CODING_SPEC_ISO_BOL (coding) = 1;
3701
3702 for (charset = 0; charset <= MAX_CHARSET; charset++)
3703 CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = 255;
3704 val = Vcharset_revision_alist;
3705 while (CONSP (val))
3706 {
3707 charset = get_charset_id (Fcar_safe (XCAR (val)));
3708 if (charset >= 0
3709 && (temp = Fcdr_safe (XCAR (val)), INTEGERP (temp))
3710 && (i = XINT (temp), (i >= 0 && (i + '@') < 128)))
3711 CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = i;
3712 val = XCDR (val);
3713 }
3714
3715 /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
3716 FLAGS[REG] can be one of below:
3717 integer CHARSET: CHARSET occupies register I,
3718 t: designate nothing to REG initially, but can be used
3719 by any charsets,
3720 list of integer, nil, or t: designate the first
3721 element (if integer) to REG initially, the remaining
3722 elements (if integer) is designated to REG on request,
3723 if an element is t, REG can be used by any charsets,
3724 nil: REG is never used. */
3725 for (charset = 0; charset <= MAX_CHARSET; charset++)
3726 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3727 = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION;
3728 for (i = 0; i < 4; i++)
3729 {
3730 if ((INTEGERP (flags[i])
3731 && (charset = XINT (flags[i]), CHARSET_VALID_P (charset)))
3732 || (charset = get_charset_id (flags[i])) >= 0)
3733 {
3734 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3735 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
3736 }
3737 else if (EQ (flags[i], Qt))
3738 {
3739 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3740 reg_bits |= 1 << i;
3741 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
3742 }
3743 else if (CONSP (flags[i]))
3744 {
3745 Lisp_Object tail;
3746 tail = flags[i];
3747
3748 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
3749 if ((INTEGERP (XCAR (tail))
3750 && (charset = XINT (XCAR (tail)),
3751 CHARSET_VALID_P (charset)))
3752 || (charset = get_charset_id (XCAR (tail))) >= 0)
3753 {
3754 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3755 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
3756 }
3757 else
3758 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3759 tail = XCDR (tail);
3760 while (CONSP (tail))
3761 {
3762 if ((INTEGERP (XCAR (tail))
3763 && (charset = XINT (XCAR (tail)),
3764 CHARSET_VALID_P (charset)))
3765 || (charset = get_charset_id (XCAR (tail))) >= 0)
3766 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3767 = i;
3768 else if (EQ (XCAR (tail), Qt))
3769 reg_bits |= 1 << i;
3770 tail = XCDR (tail);
3771 }
3772 }
3773 else
3774 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3775
3776 CODING_SPEC_ISO_DESIGNATION (coding, i)
3777 = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
3778 }
3779
3780 if (reg_bits && ! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
3781 {
3782 /* REG 1 can be used only by locking shift in 7-bit env. */
3783 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
3784 reg_bits &= ~2;
3785 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
3786 /* Without any shifting, only REG 0 and 1 can be used. */
3787 reg_bits &= 3;
3788 }
3789
3790 if (reg_bits)
3791 for (charset = 0; charset <= MAX_CHARSET; charset++)
3792 {
3793 if (CHARSET_DEFINED_P (charset)
3794 && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3795 == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
3796 {
3797 /* There exist some default graphic registers to be
3798 used by CHARSET. */
3799
3800 /* We had better avoid designating a charset of
3801 CHARS96 to REG 0 as far as possible. */
3802 if (CHARSET_CHARS (charset) == 96)
3803 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3804 = (reg_bits & 2
3805 ? 1 : (reg_bits & 4 ? 2 : (reg_bits & 8 ? 3 : 0)));
3806 else
3807 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3808 = (reg_bits & 1
3809 ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
3810 }
3811 }
3812 }
3813 coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
3814 coding->spec.iso2022.last_invalid_designation_register = -1;
3815 break;
3816
3817 case 3:
3818 coding->type = coding_type_big5;
3819 coding->common_flags
3820 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3821 coding->flags
3822 = (NILP (XVECTOR (coding_spec)->contents[4])
3823 ? CODING_FLAG_BIG5_HKU
3824 : CODING_FLAG_BIG5_ETEN);
3825 break;
3826
3827 case 4:
3828 coding->type = coding_type_ccl;
3829 coding->common_flags
3830 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3831 {
3832 val = XVECTOR (coding_spec)->contents[4];
3833 if (! CONSP (val)
3834 || setup_ccl_program (&(coding->spec.ccl.decoder),
3835 XCAR (val)) < 0
3836 || setup_ccl_program (&(coding->spec.ccl.encoder),
3837 XCDR (val)) < 0)
3838 goto label_invalid_coding_system;
3839
3840 bzero (coding->spec.ccl.valid_codes, 256);
3841 val = Fplist_get (plist, Qvalid_codes);
3842 if (CONSP (val))
3843 {
3844 Lisp_Object this;
3845
3846 for (; CONSP (val); val = XCDR (val))
3847 {
3848 this = XCAR (val);
3849 if (INTEGERP (this)
3850 && XINT (this) >= 0 && XINT (this) < 256)
3851 coding->spec.ccl.valid_codes[XINT (this)] = 1;
3852 else if (CONSP (this)
3853 && INTEGERP (XCAR (this))
3854 && INTEGERP (XCDR (this)))
3855 {
3856 int start = XINT (XCAR (this));
3857 int end = XINT (XCDR (this));
3858
3859 if (start >= 0 && start <= end && end < 256)
3860 while (start <= end)
3861 coding->spec.ccl.valid_codes[start++] = 1;
3862 }
3863 }
3864 }
3865 }
3866 coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
3867 coding->spec.ccl.cr_carryover = 0;
3868 coding->spec.ccl.eight_bit_carryover[0] = 0;
3869 break;
3870
3871 case 5:
3872 coding->type = coding_type_raw_text;
3873 break;
3874
3875 default:
3876 goto label_invalid_coding_system;
3877 }
3878 return 0;
3879
3880 label_invalid_coding_system:
3881 coding->type = coding_type_no_conversion;
3882 coding->category_idx = CODING_CATEGORY_IDX_BINARY;
3883 coding->common_flags = 0;
3884 coding->eol_type = CODING_EOL_LF;
3885 coding->pre_write_conversion = coding->post_read_conversion = Qnil;
3886 return -1;
3887 }
3888
3889 /* Free memory blocks allocated for storing composition information. */
3890
3891 void
3892 coding_free_composition_data (coding)
3893 struct coding_system *coding;
3894 {
3895 struct composition_data *cmp_data = coding->cmp_data, *next;
3896
3897 if (!cmp_data)
3898 return;
3899 /* Memory blocks are chained. At first, rewind to the first, then,
3900 free blocks one by one. */
3901 while (cmp_data->prev)
3902 cmp_data = cmp_data->prev;
3903 while (cmp_data)
3904 {
3905 next = cmp_data->next;
3906 xfree (cmp_data);
3907 cmp_data = next;
3908 }
3909 coding->cmp_data = NULL;
3910 }
3911
3912 /* Set `char_offset' member of all memory blocks pointed by
3913 coding->cmp_data to POS. */
3914
3915 void
3916 coding_adjust_composition_offset (coding, pos)
3917 struct coding_system *coding;
3918 int pos;
3919 {
3920 struct composition_data *cmp_data;
3921
3922 for (cmp_data = coding->cmp_data; cmp_data; cmp_data = cmp_data->next)
3923 cmp_data->char_offset = pos;
3924 }
3925
3926 /* Setup raw-text or one of its subsidiaries in the structure
3927 coding_system CODING according to the already setup value eol_type
3928 in CODING. CODING should be setup for some coding system in
3929 advance. */
3930
3931 void
3932 setup_raw_text_coding_system (coding)
3933 struct coding_system *coding;
3934 {
3935 if (coding->type != coding_type_raw_text)
3936 {
3937 coding->symbol = Qraw_text;
3938 coding->type = coding_type_raw_text;
3939 if (coding->eol_type != CODING_EOL_UNDECIDED)
3940 {
3941 Lisp_Object subsidiaries;
3942 subsidiaries = Fget (Qraw_text, Qeol_type);
3943
3944 if (VECTORP (subsidiaries)
3945 && XVECTOR (subsidiaries)->size == 3)
3946 coding->symbol
3947 = XVECTOR (subsidiaries)->contents[coding->eol_type];
3948 }
3949 setup_coding_system (coding->symbol, coding);
3950 }
3951 return;
3952 }
3953
/* Emacs has a mechanism to automatically detect a coding system if it
   is one of Emacs' internal format, ISO2022, SJIS, BIG5, UTF-8,
   UTF-16, or a coding system implemented by CCL programs.  But it's
   impossible to distinguish some coding systems accurately because
   they use the same range of codes.  So, at first, coding systems
   are grouped into the following categories:
3959
3960 o coding-category-emacs-mule
3961
3962 The category for a coding system which has the same code range
3963 as Emacs' internal format. Assigned the coding-system (Lisp
3964 symbol) `emacs-mule' by default.
3965
3966 o coding-category-sjis
3967
3968 The category for a coding system which has the same code range
3969 as SJIS. Assigned the coding-system (Lisp
3970 symbol) `japanese-shift-jis' by default.
3971
3972 o coding-category-iso-7
3973
3974 The category for a coding system which has the same code range
3975 as ISO2022 of 7-bit environment. This doesn't use any locking
3976 shift and single shift functions. This can encode/decode all
3977 charsets. Assigned the coding-system (Lisp symbol)
3978 `iso-2022-7bit' by default.
3979
3980 o coding-category-iso-7-tight
3981
3982 Same as coding-category-iso-7 except that this can
3983 encode/decode only the specified charsets.
3984
3985 o coding-category-iso-8-1
3986
3987 The category for a coding system which has the same code range
3988 as ISO2022 of 8-bit environment and graphic plane 1 used only
3989 for DIMENSION1 charset. This doesn't use any locking shift
3990 and single shift functions. Assigned the coding-system (Lisp
3991 symbol) `iso-latin-1' by default.
3992
3993 o coding-category-iso-8-2
3994
3995 The category for a coding system which has the same code range
3996 as ISO2022 of 8-bit environment and graphic plane 1 used only
3997 for DIMENSION2 charset. This doesn't use any locking shift
3998 and single shift functions. Assigned the coding-system (Lisp
3999 symbol) `japanese-iso-8bit' by default.
4000
4001 o coding-category-iso-7-else
4002
4003 The category for a coding system which has the same code range
4004 as ISO2022 of 7-bit environment but uses locking shift or
4005 single shift functions. Assigned the coding-system (Lisp
4006 symbol) `iso-2022-7bit-lock' by default.
4007
4008 o coding-category-iso-8-else
4009
4010 The category for a coding system which has the same code range
4011 as ISO2022 of 8-bit environment but uses locking shift or
4012 single shift functions. Assigned the coding-system (Lisp
4013 symbol) `iso-2022-8bit-ss2' by default.
4014
4015 o coding-category-big5
4016
4017 The category for a coding system which has the same code range
4018 as BIG5. Assigned the coding-system (Lisp symbol)
4019 `cn-big5' by default.
4020
4021 o coding-category-utf-8
4022
4023 The category for a coding system which has the same code range
4024 as UTF-8 (cf. RFC2279). Assigned the coding-system (Lisp
4025 symbol) `utf-8' by default.
4026
4027 o coding-category-utf-16-be
4028
4029 The category for a coding system in which a text has an
4030 Unicode signature (cf. Unicode Standard) in the order of BIG
4031 endian at the head. Assigned the coding-system (Lisp symbol)
4032 `utf-16-be' by default.
4033
4034 o coding-category-utf-16-le
4035
4036 The category for a coding system in which a text has an
4037 Unicode signature (cf. Unicode Standard) in the order of
4038 LITTLE endian at the head. Assigned the coding-system (Lisp
4039 symbol) `utf-16-le' by default.
4040
4041 o coding-category-ccl
4042
4043 The category for a coding system of which encoder/decoder is
4044 written in CCL programs. The default value is nil, i.e., no
4045 coding system is assigned.
4046
4047 o coding-category-binary
4048
4049 The category for a coding system not categorized in any of the
4050 above. Assigned the coding-system (Lisp symbol)
4051 `no-conversion' by default.
4052
4053 Each of them is a Lisp symbol and the value is an actual
4054 `coding-system' (this is also a Lisp symbol) assigned by a user.
4055 What Emacs does actually is to detect a category of coding system.
4056 Then, it uses a `coding-system' assigned to it. If Emacs can't
4057 decide a single possible category, it selects a category of the
4058 highest priority. Priorities of categories are also specified by a
4059 user in a Lisp variable `coding-category-list'.
4060
4061 */
4062
4063 static
4064 int ascii_skip_code[256];
4065
4066 /* Detect how a text of length SRC_BYTES pointed by SOURCE is encoded.
4067 If it detects possible coding systems, return an integer in which
4068 appropriate flag bits are set. Flag bits are defined by macros
4069 CODING_CATEGORY_MASK_XXX in `coding.h'. If PRIORITIES is non-NULL,
4070 it should point the table `coding_priorities'. In that case, only
4071 the flag bit for a coding system of the highest priority is set in
4072 the returned value. If MULTIBYTEP is nonzero, 8-bit codes of the
4073 range 0x80..0x9F are in multibyte form.
4074
4075 How many ASCII characters are at the head is returned as *SKIP. */
4076
4077 static int
4078 detect_coding_mask (source, src_bytes, priorities, skip, multibytep)
4079 unsigned char *source;
4080 int src_bytes, *priorities, *skip;
4081 int multibytep;
4082 {
4083 register unsigned char c;
4084 unsigned char *src = source, *src_end = source + src_bytes;
4085 unsigned int mask, utf16_examined_p, iso2022_examined_p;
4086 int i;
4087
4088 /* At first, skip all ASCII characters and control characters except
4089 for three ISO2022 specific control characters. */
4090 ascii_skip_code[ISO_CODE_SO] = 0;
4091 ascii_skip_code[ISO_CODE_SI] = 0;
4092 ascii_skip_code[ISO_CODE_ESC] = 0;
4093
4094 label_loop_detect_coding:
4095 while (src < src_end && ascii_skip_code[*src]) src++;
4096 *skip = src - source;
4097
4098 if (src >= src_end)
4099 /* We found nothing other than ASCII. There's nothing to do. */
4100 return 0;
4101
4102 c = *src;
4103 /* The text seems to be encoded in some multilingual coding system.
4104 Now, try to find in which coding system the text is encoded. */
4105 if (c < 0x80)
4106 {
4107 /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
4108 /* C is an ISO2022 specific control code of C0. */
4109 mask = detect_coding_iso2022 (src, src_end, multibytep);
4110 if (mask == 0)
4111 {
4112 /* No valid ISO2022 code follows C. Try again. */
4113 src++;
4114 if (c == ISO_CODE_ESC)
4115 ascii_skip_code[ISO_CODE_ESC] = 1;
4116 else
4117 ascii_skip_code[ISO_CODE_SO] = ascii_skip_code[ISO_CODE_SI] = 1;
4118 goto label_loop_detect_coding;
4119 }
4120 if (priorities)
4121 {
4122 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
4123 {
4124 if (mask & priorities[i])
4125 return priorities[i];
4126 }
4127 return CODING_CATEGORY_MASK_RAW_TEXT;
4128 }
4129 }
4130 else
4131 {
4132 int try;
4133
4134 if (multibytep && c == LEADING_CODE_8_BIT_CONTROL)
4135 c = src[1] - 0x20;
4136
4137 if (c < 0xA0)
4138 {
4139 /* C is the first byte of SJIS character code,
4140 or a leading-code of Emacs' internal format (emacs-mule),
4141 or the first byte of UTF-16. */
4142 try = (CODING_CATEGORY_MASK_SJIS
4143 | CODING_CATEGORY_MASK_EMACS_MULE
4144 | CODING_CATEGORY_MASK_UTF_16_BE
4145 | CODING_CATEGORY_MASK_UTF_16_LE);
4146
4147 /* Or, if C is a special latin extra code,
4148 or is an ISO2022 specific control code of C1 (SS2 or SS3),
4149 or is an ISO2022 control-sequence-introducer (CSI),
4150 we should also consider the possibility of ISO2022 codings. */
4151 if ((VECTORP (Vlatin_extra_code_table)
4152 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
4153 || (c == ISO_CODE_SS2 || c == ISO_CODE_SS3)
4154 || (c == ISO_CODE_CSI
4155 && (src < src_end
4156 && (*src == ']'
4157 || ((*src == '0' || *src == '1' || *src == '2')
4158 && src + 1 < src_end
4159 && src[1] == ']')))))
4160 try |= (CODING_CATEGORY_MASK_ISO_8_ELSE
4161 | CODING_CATEGORY_MASK_ISO_8BIT);
4162 }
4163 else
4164 /* C is a character of ISO2022 in graphic plane right,
4165 or a SJIS's 1-byte character code (i.e. JISX0201),
4166 or the first byte of BIG5's 2-byte code,
4167 or the first byte of UTF-8/16. */
4168 try = (CODING_CATEGORY_MASK_ISO_8_ELSE
4169 | CODING_CATEGORY_MASK_ISO_8BIT
4170 | CODING_CATEGORY_MASK_SJIS
4171 | CODING_CATEGORY_MASK_BIG5
4172 | CODING_CATEGORY_MASK_UTF_8
4173 | CODING_CATEGORY_MASK_UTF_16_BE
4174 | CODING_CATEGORY_MASK_UTF_16_LE);
4175
4176 /* Or, we may have to consider the possibility of CCL. */
4177 if (coding_system_table[CODING_CATEGORY_IDX_CCL]
4178 && (coding_system_table[CODING_CATEGORY_IDX_CCL]
4179 ->spec.ccl.valid_codes)[c])
4180 try |= CODING_CATEGORY_MASK_CCL;
4181
4182 mask = 0;
4183 utf16_examined_p = iso2022_examined_p = 0;
4184 if (priorities)
4185 {
4186 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
4187 {
4188 if (!iso2022_examined_p
4189 && (priorities[i] & try & CODING_CATEGORY_MASK_ISO))
4190 {
4191 mask |= detect_coding_iso2022 (src, src_end, multibytep);
4192 iso2022_examined_p = 1;
4193 }
4194 else if (priorities[i] & try & CODING_CATEGORY_MASK_SJIS)
4195 mask |= detect_coding_sjis (src, src_end, multibytep);
4196 else if (priorities[i] & try & CODING_CATEGORY_MASK_UTF_8)
4197 mask |= detect_coding_utf_8 (src, src_end, multibytep);
4198 else if (!utf16_examined_p
4199 && (priorities[i] & try &
4200 CODING_CATEGORY_MASK_UTF_16_BE_LE))
4201 {
4202 mask |= detect_coding_utf_16 (src, src_end, multibytep);
4203 utf16_examined_p = 1;
4204 }
4205 else if (priorities[i] & try & CODING_CATEGORY_MASK_BIG5)
4206 mask |= detect_coding_big5 (src, src_end, multibytep);
4207 else if (priorities[i] & try & CODING_CATEGORY_MASK_EMACS_MULE)
4208 mask |= detect_coding_emacs_mule (src, src_end, multibytep);
4209 else if (priorities[i] & try & CODING_CATEGORY_MASK_CCL)
4210 mask |= detect_coding_ccl (src, src_end, multibytep);
4211 else if (priorities[i] & CODING_CATEGORY_MASK_RAW_TEXT)
4212 mask |= CODING_CATEGORY_MASK_RAW_TEXT;
4213 else if (priorities[i] & CODING_CATEGORY_MASK_BINARY)
4214 mask |= CODING_CATEGORY_MASK_BINARY;
4215 if (mask & priorities[i])
4216 return priorities[i];
4217 }
4218 return CODING_CATEGORY_MASK_RAW_TEXT;
4219 }
4220 if (try & CODING_CATEGORY_MASK_ISO)
4221 mask |= detect_coding_iso2022 (src, src_end, multibytep);
4222 if (try & CODING_CATEGORY_MASK_SJIS)
4223 mask |= detect_coding_sjis (src, src_end, multibytep);
4224 if (try & CODING_CATEGORY_MASK_BIG5)
4225 mask |= detect_coding_big5 (src, src_end, multibytep);
4226 if (try & CODING_CATEGORY_MASK_UTF_8)
4227 mask |= detect_coding_utf_8 (src, src_end, multibytep);
4228 if (try & CODING_CATEGORY_MASK_UTF_16_BE_LE)
4229 mask |= detect_coding_utf_16 (src, src_end, multibytep);
4230 if (try & CODING_CATEGORY_MASK_EMACS_MULE)
4231 mask |= detect_coding_emacs_mule (src, src_end, multibytep);
4232 if (try & CODING_CATEGORY_MASK_CCL)
4233 mask |= detect_coding_ccl (src, src_end, multibytep);
4234 }
4235 return (mask | CODING_CATEGORY_MASK_RAW_TEXT | CODING_CATEGORY_MASK_BINARY);
4236 }
4237
4238 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
4239 The information of the detected coding system is set in CODING. */
4240
4241 void
4242 detect_coding (coding, src, src_bytes)
4243 struct coding_system *coding;
4244 const unsigned char *src;
4245 int src_bytes;
4246 {
4247 unsigned int idx;
4248 int skip, mask;
4249 Lisp_Object val;
4250
4251 val = Vcoding_category_list;
4252 mask = detect_coding_mask (src, src_bytes, coding_priorities, &skip,
4253 coding->src_multibyte);
4254 coding->heading_ascii = skip;
4255
4256 if (!mask) return;
4257
4258 /* We found a single coding system of the highest priority in MASK. */
4259 idx = 0;
4260 while (mask && ! (mask & 1)) mask >>= 1, idx++;
4261 if (! mask)
4262 idx = CODING_CATEGORY_IDX_RAW_TEXT;
4263
4264 val = SYMBOL_VALUE (XVECTOR (Vcoding_category_table)->contents[idx]);
4265
4266 if (coding->eol_type != CODING_EOL_UNDECIDED)
4267 {
4268 Lisp_Object tmp;
4269
4270 tmp = Fget (val, Qeol_type);
4271 if (VECTORP (tmp))
4272 val = XVECTOR (tmp)->contents[coding->eol_type];
4273 }
4274
4275 /* Setup this new coding system while preserving some slots. */
4276 {
4277 int src_multibyte = coding->src_multibyte;
4278 int dst_multibyte = coding->dst_multibyte;
4279
4280 setup_coding_system (val, coding);
4281 coding->src_multibyte = src_multibyte;
4282 coding->dst_multibyte = dst_multibyte;
4283 coding->heading_ascii = skip;
4284 }
4285 }
4286
4287 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
4288 SOURCE is encoded. Return one of CODING_EOL_LF, CODING_EOL_CRLF,
4289 CODING_EOL_CR, and CODING_EOL_UNDECIDED.
4290
4291 How many non-eol characters are at the head is returned as *SKIP. */
4292
4293 #define MAX_EOL_CHECK_COUNT 3
4294
4295 static int
4296 detect_eol_type (source, src_bytes, skip)
4297 unsigned char *source;
4298 int src_bytes, *skip;
4299 {
4300 unsigned char *src = source, *src_end = src + src_bytes;
4301 unsigned char c;
4302 int total = 0; /* How many end-of-lines are found so far. */
4303 int eol_type = CODING_EOL_UNDECIDED;
4304 int this_eol_type;
4305
4306 *skip = 0;
4307
4308 while (src < src_end && total < MAX_EOL_CHECK_COUNT)
4309 {
4310 c = *src++;
4311 if (c == '\n' || c == '\r')
4312 {
4313 if (*skip == 0)
4314 *skip = src - 1 - source;
4315 total++;
4316 if (c == '\n')
4317 this_eol_type = CODING_EOL_LF;
4318 else if (src >= src_end || *src != '\n')
4319 this_eol_type = CODING_EOL_CR;
4320 else
4321 this_eol_type = CODING_EOL_CRLF, src++;
4322
4323 if (eol_type == CODING_EOL_UNDECIDED)
4324 /* This is the first end-of-line. */
4325 eol_type = this_eol_type;
4326 else if (eol_type != this_eol_type)
4327 {
4328 /* The found type is different from what found before. */
4329 eol_type = CODING_EOL_INCONSISTENT;
4330 break;
4331 }
4332 }
4333 }
4334
4335 if (*skip == 0)
4336 *skip = src_end - source;
4337 return eol_type;
4338 }
4339
4340 /* Like detect_eol_type, but detect EOL type in 2-octet
4341 big-endian/little-endian format for coding systems utf-16-be and
4342 utf-16-le. */
4343
4344 static int
4345 detect_eol_type_in_2_octet_form (source, src_bytes, skip, big_endian_p)
4346 unsigned char *source;
4347 int src_bytes, *skip, big_endian_p;
4348 {
4349 unsigned char *src = source, *src_end = src + src_bytes;
4350 unsigned int c1, c2;
4351 int total = 0; /* How many end-of-lines are found so far. */
4352 int eol_type = CODING_EOL_UNDECIDED;
4353 int this_eol_type;
4354 int msb, lsb;
4355
4356 if (big_endian_p)
4357 msb = 0, lsb = 1;
4358 else
4359 msb = 1, lsb = 0;
4360
4361 *skip = 0;
4362
4363 while ((src + 1) < src_end && total < MAX_EOL_CHECK_COUNT)
4364 {
4365 c1 = (src[msb] << 8) | (src[lsb]);
4366 src += 2;
4367
4368 if (c1 == '\n' || c1 == '\r')
4369 {
4370 if (*skip == 0)
4371 *skip = src - 2 - source;
4372 total++;
4373 if (c1 == '\n')
4374 {
4375 this_eol_type = CODING_EOL_LF;
4376 }
4377 else
4378 {
4379 if ((src + 1) >= src_end)
4380 {
4381 this_eol_type = CODING_EOL_CR;
4382 }
4383 else
4384 {
4385 c2 = (src[msb] << 8) | (src[lsb]);
4386 if (c2 == '\n')
4387 this_eol_type = CODING_EOL_CRLF, src += 2;
4388 else
4389 this_eol_type = CODING_EOL_CR;
4390 }
4391 }
4392
4393 if (eol_type == CODING_EOL_UNDECIDED)
4394 /* This is the first end-of-line. */
4395 eol_type = this_eol_type;
4396 else if (eol_type != this_eol_type)
4397 {
4398 /* The found type is different from what found before. */
4399 eol_type = CODING_EOL_INCONSISTENT;
4400 break;
4401 }
4402 }
4403 }
4404
4405 if (*skip == 0)
4406 *skip = src_end - source;
4407 return eol_type;
4408 }
4409
4410 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
4411 is encoded. If it detects an appropriate format of end-of-line, it
4412 sets the information in *CODING. */
4413
4414 void
4415 detect_eol (coding, src, src_bytes)
4416 struct coding_system *coding;
4417 const unsigned char *src;
4418 int src_bytes;
4419 {
4420 Lisp_Object val;
4421 int skip;
4422 int eol_type;
4423
4424 switch (coding->category_idx)
4425 {
4426 case CODING_CATEGORY_IDX_UTF_16_BE:
4427 eol_type = detect_eol_type_in_2_octet_form (src, src_bytes, &skip, 1);
4428 break;
4429 case CODING_CATEGORY_IDX_UTF_16_LE:
4430 eol_type = detect_eol_type_in_2_octet_form (src, src_bytes, &skip, 0);
4431 break;
4432 default:
4433 eol_type = detect_eol_type (src, src_bytes, &skip);
4434 break;
4435 }
4436
4437 if (coding->heading_ascii > skip)
4438 coding->heading_ascii = skip;
4439 else
4440 skip = coding->heading_ascii;
4441
4442 if (eol_type == CODING_EOL_UNDECIDED)
4443 return;
4444 if (eol_type == CODING_EOL_INCONSISTENT)
4445 {
4446 #if 0
4447 /* This code is suppressed until we find a better way to
4448 distinguish raw text file and binary file. */
4449
4450 /* If we have already detected that the coding is raw-text, the
4451 coding should actually be no-conversion. */
4452 if (coding->type == coding_type_raw_text)
4453 {
4454 setup_coding_system (Qno_conversion, coding);
4455 return;
4456 }
4457 /* Else, let's decode only text code anyway. */
4458 #endif /* 0 */
4459 eol_type = CODING_EOL_LF;
4460 }
4461
4462 val = Fget (coding->symbol, Qeol_type);
4463 if (VECTORP (val) && XVECTOR (val)->size == 3)
4464 {
4465 int src_multibyte = coding->src_multibyte;
4466 int dst_multibyte = coding->dst_multibyte;
4467 struct composition_data *cmp_data = coding->cmp_data;
4468
4469 setup_coding_system (XVECTOR (val)->contents[eol_type], coding);
4470 coding->src_multibyte = src_multibyte;
4471 coding->dst_multibyte = dst_multibyte;
4472 coding->heading_ascii = skip;
4473 coding->cmp_data = cmp_data;
4474 }
4475 }
4476
/* Number of bytes added on top of the computed size of a conversion
   buffer.  */
#define CONVERSION_BUFFER_EXTRA_ROOM 256

/* Factor by which the source size is multiplied to get the size of a
   decoding buffer for CODING (a pointer to struct coding_system):
   3 for ISO2022, the decoder's own buf_magnification for CCL, and 2
   otherwise.  CODING is evaluated more than once.  */
#define DECODING_BUFFER_MAG(coding)                              \
  ((coding)->type == coding_type_iso2022                         \
   ? 3                                                           \
   : ((coding)->type == coding_type_ccl                          \
      ? (coding)->spec.ccl.decoder.buf_magnification             \
      : 2))
4485
4486 /* Return maximum size (bytes) of a buffer enough for decoding
4487 SRC_BYTES of text encoded in CODING. */
4488
4489 int
4490 decoding_buffer_size (coding, src_bytes)
4491 struct coding_system *coding;
4492 int src_bytes;
4493 {
4494 return (src_bytes * DECODING_BUFFER_MAG (coding)
4495 + CONVERSION_BUFFER_EXTRA_ROOM);
4496 }
4497
4498 /* Return maximum size (bytes) of a buffer enough for encoding
4499 SRC_BYTES of text to CODING. */
4500
4501 int
4502 encoding_buffer_size (coding, src_bytes)
4503 struct coding_system *coding;
4504 int src_bytes;
4505 {
4506 int magnification;
4507
4508 if (coding->type == coding_type_ccl)
4509 {
4510 magnification = coding->spec.ccl.encoder.buf_magnification;
4511 if (coding->eol_type == CODING_EOL_CRLF)
4512 magnification *= 2;
4513 }
4514 else if (CODING_REQUIRE_ENCODING (coding))
4515 magnification = 3;
4516 else
4517 magnification = 1;
4518
4519 return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
4520 }
4521
/* Scratch storage used while converting text between coding
   systems.  */
struct conversion_buffer
{
  /* Number of bytes available at DATA.  */
  int size;

  /* 1 if DATA was obtained from alloca, 0 if it is heap memory that
     has to be freed.  */
  int on_stack;

  /* Start of the storage.  */
  unsigned char *data;
};
4529
/* Don't use alloca for allocating memory space larger than this, lest
   we overflow the stack.  */
#define MAX_ALLOCA (16*1024)

/* Allocate LEN bytes of memory for BUF (struct conversion_buffer).
   Requests smaller than MAX_ALLOCA are served by alloca, so the
   memory then belongs to the stack frame of the function that uses
   this macro; larger requests come from xmalloc.  BUF and LEN are
   evaluated more than once.  */
#define allocate_conversion_buffer(buf, len)                    \
  do {                                                          \
    if ((len) < MAX_ALLOCA)                                     \
      {                                                         \
        (buf).data = (unsigned char *) alloca (len);            \
        (buf).on_stack = 1;                                     \
      }                                                         \
    else                                                        \
      {                                                         \
        (buf).data = (unsigned char *) xmalloc (len);           \
        (buf).on_stack = 0;                                     \
      }                                                         \
    (buf).size = (len);                                         \
  } while (0)
4549
4550 /* Double the allocated memory for *BUF. */
4551 static void
4552 extend_conversion_buffer (buf)
4553 struct conversion_buffer *buf;
4554 {
4555 if (buf->on_stack)
4556 {
4557 unsigned char *save = buf->data;
4558 buf->data = (unsigned char *) xmalloc (buf->size * 2);
4559 bcopy (save, buf->data, buf->size);
4560 buf->on_stack = 0;
4561 }
4562 else
4563 {
4564 buf->data = (unsigned char *) xrealloc (buf->data, buf->size * 2);
4565 }
4566 buf->size *= 2;
4567 }
4568
4569 /* Free the allocated memory for BUF if it is not on stack. */
4570 static void
4571 free_conversion_buffer (buf)
4572 struct conversion_buffer *buf;
4573 {
4574 if (!buf->on_stack)
4575 xfree (buf->data);
4576 }
4577
/* Run the CCL encoder (ENCODEP nonzero) or the CCL decoder (ENCODEP
   zero) of CODING on the SRC_BYTES bytes at SOURCE and store the
   output at DESTINATION.  DST_BYTES is handed on to ccl_driver as
   the size of the destination.

   CODING->consumed is filled in through ccl_driver;
   CODING->produced, CODING->produced_char and CODING->result are set
   here.  Return CODING->result.

   On decoding, trailing 8-bit bytes that may be the beginning of a
   multibyte sequence are kept in
   CODING->spec.ccl.eight_bit_carryover and put at the head of
   DESTINATION by the next call.  */

int
ccl_coding_driver (coding, source, destination, src_bytes, dst_bytes, encodep)
     struct coding_system *coding;
     unsigned char *source, *destination;
     int src_bytes, dst_bytes, encodep;
{
  struct ccl_program *ccl
    = encodep ? &coding->spec.ccl.encoder : &coding->spec.ccl.decoder;
  unsigned char *dst = destination;

  ccl->suppress_error = coding->suppress_error;
  ccl->last_block = coding->mode & CODING_MODE_LAST_BLOCK;
  if (encodep)
    {
      /* On encoding, EOL format is converted within ccl_driver.  For
         that, setup proper information in the structure CCL.  */
      ccl->eol_type = coding->eol_type;
      if (ccl->eol_type ==CODING_EOL_UNDECIDED)
        ccl->eol_type = CODING_EOL_LF;
      ccl->cr_consumed = coding->spec.ccl.cr_carryover;
      ccl->eight_bit_control = coding->dst_multibyte;
    }
  else
    ccl->eight_bit_control = 1;
  ccl->multibyte = coding->src_multibyte;
  if (coding->spec.ccl.eight_bit_carryover[0] != 0)
    {
      /* Move carryover bytes to DESTINATION.  The carryover is a
         NUL-terminated byte string.  */
      unsigned char *p = coding->spec.ccl.eight_bit_carryover;
      while (*p)
        *dst++ = *p++;
      coding->spec.ccl.eight_bit_carryover[0] = 0;
      /* A zero DST_BYTES is left untouched; see the computation of
         BYTES below, which treats zero specially.  */
      if (dst_bytes)
        dst_bytes -= dst - destination;
    }

  /* The bytes produced are those written by ccl_driver plus the
     carryover bytes copied above.  */
  coding->produced = (ccl_driver (ccl, source, dst, src_bytes, dst_bytes,
                                  &(coding->consumed))
                      + dst - destination);

  if (encodep)
    {
      coding->produced_char = coding->produced;
      coding->spec.ccl.cr_carryover = ccl->cr_consumed;
    }
  else if (!ccl->eight_bit_control)
    {
      /* The produced bytes forms a valid multibyte sequence.  */
      coding->produced_char
        = multibyte_chars_in_text (destination, coding->produced);
      coding->spec.ccl.eight_bit_carryover[0] = 0;
    }
  else
    {
      /* On decoding, the destination should always multibyte.  But,
         CCL program might have been generated an invalid multibyte
         sequence.  Here we make such a sequence valid as
         multibyte.  */
      /* NOTE(review): with DST_BYTES zero the room available is
         computed from SOURCE, which presumably means the destination
         area extends up to the source text -- confirm with callers.  */
      int bytes
        = dst_bytes ? dst_bytes : source + coding->consumed - destination;

      if ((coding->consumed < src_bytes
           || !ccl->last_block)
          && coding->produced >= 1
          && destination[coding->produced - 1] >= 0x80)
        {
          /* We should not convert the tailing 8-bit codes to
             multibyte form even if they doesn't form a valid
             multibyte sequence.  They may form a valid sequence in
             the next call.  */
          int carryover = 0;

          /* Look back over at most three trailing bytes for one in
             the range 0x80..0x9F and, when found, carry over
             everything from that byte to the end.  */
          if (destination[coding->produced - 1] < 0xA0)
            carryover = 1;
          else if (coding->produced >= 2)
            {
              if (destination[coding->produced - 2] >= 0x80)
                {
                  if (destination[coding->produced - 2] < 0xA0)
                    carryover = 2;
                  else if (coding->produced >= 3
                           && destination[coding->produced - 3] >= 0x80
                           && destination[coding->produced - 3] < 0xA0)
                    carryover = 3;
                }
            }
          if (carryover > 0)
            {
              BCOPY_SHORT (destination + coding->produced - carryover,
                           coding->spec.ccl.eight_bit_carryover,
                           carryover);
              coding->spec.ccl.eight_bit_carryover[carryover] = 0;
              coding->produced -= carryover;
            }
        }
      coding->produced = str_as_multibyte (destination, bytes,
                                           coding->produced,
                                           &(coding->produced_char));
    }

  /* Translate the final status of the CCL program into a
     CODING_FINISH_XXX result code.  */
  switch (ccl->status)
    {
    case CCL_STAT_SUSPEND_BY_SRC:
      coding->result = CODING_FINISH_INSUFFICIENT_SRC;
      break;
    case CCL_STAT_SUSPEND_BY_DST:
      coding->result = CODING_FINISH_INSUFFICIENT_DST;
      break;
    case CCL_STAT_QUIT:
    case CCL_STAT_INVALID_CMD:
      coding->result = CODING_FINISH_INTERRUPT;
      break;
    default:
      coding->result = CODING_FINISH_NORMAL;
      break;
    }
  return coding->result;
}
4696
4697 /* Decode EOL format of the text at PTR of BYTES length destructively
4698 according to CODING->eol_type. This is called after the CCL
4699 program produced a decoded text at PTR. If we do CRLF->LF
4700 conversion, update CODING->produced and CODING->produced_char. */
4701
4702 static void
4703 decode_eol_post_ccl (coding, ptr, bytes)
4704 struct coding_system *coding;
4705 unsigned char *ptr;
4706 int bytes;
4707 {
4708 Lisp_Object val, saved_coding_symbol;
4709 unsigned char *pend = ptr + bytes;
4710 int dummy;
4711
4712 /* Remember the current coding system symbol. We set it back when
4713 an inconsistent EOL is found so that `last-coding-system-used' is
4714 set to the coding system that doesn't specify EOL conversion. */
4715 saved_coding_symbol = coding->symbol;
4716
4717 coding->spec.ccl.cr_carryover = 0;
4718 if (coding->eol_type == CODING_EOL_UNDECIDED)
4719 {
4720 /* Here, to avoid the call of setup_coding_system, we directly
4721 call detect_eol_type. */
4722 coding->eol_type = detect_eol_type (ptr, bytes, &dummy);
4723 if (coding->eol_type == CODING_EOL_INCONSISTENT)
4724 coding->eol_type = CODING_EOL_LF;
4725 if (coding->eol_type != CODING_EOL_UNDECIDED)
4726 {
4727 val = Fget (coding->symbol, Qeol_type);
4728 if (VECTORP (val) && XVECTOR (val)->size == 3)
4729 coding->symbol = XVECTOR (val)->contents[coding->eol_type];
4730 }
4731 coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4732 }
4733
4734 if (coding->eol_type == CODING_EOL_LF
4735 || coding->eol_type == CODING_EOL_UNDECIDED)
4736 {
4737 /* We have nothing to do. */
4738 ptr = pend;
4739 }
4740 else if (coding->eol_type == CODING_EOL_CRLF)
4741 {
4742 unsigned char *pstart = ptr, *p = ptr;
4743
4744 if (! (coding->mode & CODING_MODE_LAST_BLOCK)
4745 && *(pend - 1) == '\r')
4746 {
4747 /* If the last character is CR, we can't handle it here
4748 because LF will be in the not-yet-decoded source text.
4749 Record that the CR is not yet processed. */
4750 coding->spec.ccl.cr_carryover = 1;
4751 coding->produced--;
4752 coding->produced_char--;
4753 pend--;
4754 }
4755 while (ptr < pend)
4756 {
4757 if (*ptr == '\r')
4758 {
4759 if (ptr + 1 < pend && *(ptr + 1) == '\n')
4760 {
4761 *p++ = '\n';
4762 ptr += 2;
4763 }
4764 else
4765 {
4766 if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4767 goto undo_eol_conversion;
4768 *p++ = *ptr++;
4769 }
4770 }
4771 else if (*ptr == '\n'
4772 && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4773 goto undo_eol_conversion;
4774 else
4775 *p++ = *ptr++;
4776 continue;
4777
4778 undo_eol_conversion:
4779 /* We have faced with inconsistent EOL format at PTR.
4780 Convert all LFs before PTR back to CRLFs. */
4781 for (p--, ptr--; p >= pstart; p--)
4782 {
4783 if (*p == '\n')
4784 *ptr-- = '\n', *ptr-- = '\r';
4785 else
4786 *ptr-- = *p;
4787 }
4788 /* If carryover is recorded, cancel it because we don't
4789 convert CRLF anymore. */
4790 if (coding->spec.ccl.cr_carryover)
4791 {
4792 coding->spec.ccl.cr_carryover = 0;
4793 coding->produced++;
4794 coding->produced_char++;
4795 pend++;
4796 }
4797 p = ptr = pend;
4798 coding->eol_type = CODING_EOL_LF;
4799 coding->symbol = saved_coding_symbol;
4800 }
4801 if (p < pend)
4802 {
4803 /* As each two-byte sequence CRLF was converted to LF, (PEND
4804 - P) is the number of deleted characters. */
4805 coding->produced -= pend - p;
4806 coding->produced_char -= pend - p;
4807 }
4808 }
4809 else /* i.e. coding->eol_type == CODING_EOL_CR */
4810 {
4811 unsigned char *p = ptr;
4812
4813 for (; ptr < pend; ptr++)
4814 {
4815 if (*ptr == '\r')
4816 *ptr = '\n';
4817 else if (*ptr == '\n'
4818 && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4819 {
4820 for (; p < ptr; p++)
4821 {
4822 if (*p == '\n')
4823 *p = '\r';
4824 }
4825 ptr = pend;
4826 coding->eol_type = CODING_EOL_LF;
4827 coding->symbol = saved_coding_symbol;
4828 }
4829 }
4830 }
4831 }
4832
/* See "GENERAL NOTES about `decode_coding_XXX ()' functions".  Before
   decoding, it may detect coding system and format of end-of-line if
   those are not yet decided.  The source should be unibyte, the
   result is multibyte if CODING->dst_multibyte is nonzero, else
   unibyte.

   Decode the SRC_BYTES bytes at SOURCE into DESTINATION, dispatching
   on CODING->type.  The counters produced, produced_char, consumed,
   consumed_char and errors of CODING are reset first.  Return
   CODING->result.  */

int
decode_coding (coding, source, destination, src_bytes, dst_bytes)
     struct coding_system *coding;
     const unsigned char *source;
     unsigned char *destination;
     int src_bytes, dst_bytes;
{
  /* Number of bytes put at the head of DESTINATION before the CCL
     decoder runs (a CR carried over from the previous call).  */
  int extra = 0;

  if (coding->type == coding_type_undecided)
    detect_coding (coding, source, src_bytes);

  /* For CCL the end-of-line format is handled after decoding, by
     decode_eol_post_ccl.  */
  if (coding->eol_type == CODING_EOL_UNDECIDED
      && coding->type != coding_type_ccl)
    {
      detect_eol (coding, source, src_bytes);
      /* We had better recover the original eol format if we
         encounter an inconsistent eol format while decoding.  */
      coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
    }

  coding->produced = coding->produced_char = 0;
  coding->consumed = coding->consumed_char = 0;
  coding->errors = 0;
  coding->result = CODING_FINISH_NORMAL;

  switch (coding->type)
    {
    case coding_type_sjis:
      decode_coding_sjis_big5 (coding, source, destination,
                               src_bytes, dst_bytes, 1);
      break;

    case coding_type_iso2022:
      decode_coding_iso2022 (coding, source, destination,
                             src_bytes, dst_bytes);
      break;

    case coding_type_big5:
      decode_coding_sjis_big5 (coding, source, destination,
                               src_bytes, dst_bytes, 0);
      break;

    case coding_type_emacs_mule:
      decode_coding_emacs_mule (coding, source, destination,
                                src_bytes, dst_bytes);
      break;

    case coding_type_ccl:
      if (coding->spec.ccl.cr_carryover)
        {
          /* Put the CR which was not processed by the previous call
             of decode_eol_post_ccl in DESTINATION.  It will be
             decoded together with the following LF by the call to
             decode_eol_post_ccl below.  */
          *destination = '\r';
          coding->produced++;
          coding->produced_char++;
          dst_bytes--;
          extra = coding->spec.ccl.cr_carryover;
        }
      ccl_coding_driver (coding, source, destination + extra,
                         src_bytes, dst_bytes, 0);
      if (coding->eol_type != CODING_EOL_LF)
        {
          /* ccl_coding_driver has set the counters anew, so the
             carried-over CR is accounted for here.  */
          coding->produced += extra;
          coding->produced_char += extra;
          decode_eol_post_ccl (coding, destination, coding->produced);
        }
      break;

    default:
      decode_eol (coding, source, destination, src_bytes, dst_bytes);
    }

  /* In the last block, running out of source after everything was
     consumed is a normal end.  */
  if (coding->result == CODING_FINISH_INSUFFICIENT_SRC
      && coding->mode & CODING_MODE_LAST_BLOCK
      && coding->consumed == src_bytes)
    coding->result = CODING_FINISH_NORMAL;

  if (coding->mode & CODING_MODE_LAST_BLOCK
      && coding->result == CODING_FINISH_INSUFFICIENT_SRC)
    {
      /* The last block ends with bytes the decoder could not use.
         Count that as an error and copy each remaining byte to the
         destination as a character of its own.  */
      const unsigned char *src = source + coding->consumed;
      unsigned char *dst = destination + coding->produced;

      src_bytes -= coding->consumed;
      coding->errors++;
      /* NOTE(review): DECODE_COMPOSITION_END is a macro defined
         outside this function; it presumably closes the composition
         in progress and may refer to the locals above -- confirm.  */
      if (COMPOSING_P (coding))
        DECODE_COMPOSITION_END ('1');
      while (src_bytes--)
        {
          int c = *src++;
          dst += CHAR_STRING (c, dst);
          coding->produced_char++;
        }
      coding->consumed = coding->consumed_char = src - source;
      coding->produced = dst - destination;
      coding->result = CODING_FINISH_NORMAL;
    }

  if (!coding->dst_multibyte)
    {
      /* The caller wants unibyte text: convert in place; one byte is
         then one character.  */
      coding->produced = str_as_unibyte (destination, coding->produced);
      coding->produced_char = coding->produced;
    }

  return coding->result;
}
4948
4949 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions". The
4950 multibyteness of the source is CODING->src_multibyte, the
4951 multibyteness of the result is always unibyte. */
4952
4953 int
4954 encode_coding (coding, source, destination, src_bytes, dst_bytes)
4955 struct coding_system *coding;
4956 const unsigned char *source;
4957 unsigned char *destination;
4958 int src_bytes, dst_bytes;
4959 {
4960 coding->produced = coding->produced_char = 0;
4961 coding->consumed = coding->consumed_char = 0;
4962 coding->errors = 0;
4963 coding->result = CODING_FINISH_NORMAL;
4964
4965 switch (coding->type)
4966 {
4967 case coding_type_sjis:
4968 encode_coding_sjis_big5 (coding, source, destination,
4969 src_bytes, dst_bytes, 1);
4970 break;
4971
4972 case coding_type_iso2022:
4973 encode_coding_iso2022 (coding, source, destination,
4974 src_bytes, dst_bytes);
4975 break;
4976
4977 case coding_type_big5:
4978 encode_coding_sjis_big5 (coding, source, destination,
4979 src_bytes, dst_bytes, 0);
4980 break;
4981
4982 case coding_type_emacs_mule:
4983 encode_coding_emacs_mule (coding, source, destination,
4984 src_bytes, dst_bytes);
4985 break;
4986
4987 case coding_type_ccl:
4988 ccl_coding_driver (coding, source, destination,
4989 src_bytes, dst_bytes, 1);
4990 break;
4991
4992 default:
4993 encode_eol (coding, source, destination, src_bytes, dst_bytes);
4994 }
4995
4996 if (coding->mode & CODING_MODE_LAST_BLOCK
4997 && coding->result == CODING_FINISH_INSUFFICIENT_SRC)
4998 {
4999 const unsigned char *src = source + coding->consumed;
5000 unsigned char *dst = destination + coding->produced;
5001
5002 if (coding->type == coding_type_iso2022)
5003 ENCODE_RESET_PLANE_AND_REGISTER;
5004 if (COMPOSING_P (coding))
5005 *dst++ = ISO_CODE_ESC, *dst++ = '1';
5006 if (coding->consumed < src_bytes)
5007 {
5008 int len = src_bytes - coding->consumed;
5009
5010 BCOPY_SHORT (src, dst, len);
5011 if (coding->src_multibyte)
5012 len = str_as_unibyte (dst, len);
5013 dst += len;
5014 coding->consumed = src_bytes;
5015 }
5016 coding->produced = coding->produced_char = dst - destination;
5017 coding->result = CODING_FINISH_NORMAL;
5018 }
5019
5020 if (coding->result == CODING_FINISH_INSUFFICIENT_SRC
5021 && coding->consumed == src_bytes)
5022 coding->result = CODING_FINISH_NORMAL;
5023
5024 return coding->result;
5025 }
5026
5027 /* Scan text in the region between *BEG and *END (byte positions),
5028 skip characters which we don't have to decode by coding system
5029 CODING at the head and tail, then set *BEG and *END to the region
5030 of the text we actually have to convert. The caller should move
5031 the gap out of the region in advance if the region is from a
5032 buffer.
5033
5034 If STR is not NULL, *BEG and *END are indices into STR. */
5035
5036 static void
5037 shrink_decoding_region (beg, end, coding, str)
5038 int *beg, *end;
5039 struct coding_system *coding;
5040 unsigned char *str;
5041 {
5042 unsigned char *begp_orig, *begp, *endp_orig, *endp, c;
5043 int eol_conversion;
5044 Lisp_Object translation_table;
5045
5046 if (coding->type == coding_type_ccl
5047 || coding->type == coding_type_undecided
5048 || coding->eol_type != CODING_EOL_LF
5049 || !NILP (coding->post_read_conversion)
5050 || coding->composing != COMPOSITION_DISABLED)
5051 {
5052 /* We can't skip any data. */
5053 return;
5054 }
5055 if (coding->type == coding_type_no_conversion
5056 || coding->type == coding_type_raw_text
5057 || coding->type == coding_type_emacs_mule)
5058 {
5059 /* We need no conversion, but don't have to skip any data here.
5060 Decoding routine handles them effectively anyway. */
5061 return;
5062 }
5063
5064 translation_table = coding->translation_table_for_decode;
5065 if (NILP (translation_table) && !NILP (Venable_character_translation))
5066 translation_table = Vstandard_translation_table_for_decode;
5067 if (CHAR_TABLE_P (translation_table))
5068 {
5069 int i;
5070 for (i = 0; i < 128; i++)
5071 if (!NILP (CHAR_TABLE_REF (translation_table, i)))
5072 break;
5073 if (i < 128)
5074 /* Some ASCII character should be translated. We give up
5075 shrinking. */
5076 return;
5077 }
5078
5079 if (coding->heading_ascii >= 0)
5080 /* Detection routine has already found how much we can skip at the
5081 head. */
5082 *beg += coding->heading_ascii;
5083
5084 if (str)
5085 {
5086 begp_orig = begp = str + *beg;
5087 endp_orig = endp = str + *end;
5088 }
5089 else
5090 {
5091 begp_orig = begp = BYTE_POS_ADDR (*beg);
5092 endp_orig = endp = begp + *end - *beg;
5093 }
5094
5095 eol_conversion = (coding->eol_type == CODING_EOL_CR
5096 || coding->eol_type == CODING_EOL_CRLF);
5097
5098 switch (coding->type)
5099 {
5100 case coding_type_sjis:
5101 case coding_type_big5:
5102 /* We can skip all ASCII characters at the head. */
5103 if (coding->heading_ascii < 0)
5104 {
5105 if (eol_conversion)
5106 while (begp < endp && *begp < 0x80 && *begp != '\r') begp++;
5107 else
5108 while (begp < endp && *begp < 0x80) begp++;
5109 }
5110 /* We can skip all ASCII characters at the tail except for the
5111 second byte of SJIS or BIG5 code. */
5112 if (eol_conversion)
5113 while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\r') endp--;
5114 else
5115 while (begp < endp && endp[-1] < 0x80) endp--;
5116 /* Do not consider LF as ascii if preceded by CR, since that
5117 confuses eol decoding. */
5118 if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
5119 endp++;
5120 if (begp < endp && endp < endp_orig && endp[-1] >= 0x80)
5121 endp++;
5122 break;
5123
5124 case coding_type_iso2022:
5125 if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
5126 /* We can't skip any data. */
5127 break;
5128 if (coding->heading_ascii < 0)
5129 {
5130 /* We can skip all ASCII characters at the head except for a
5131 few control codes. */
5132 while (begp < endp && (c = *begp) < 0x80
5133 && c != ISO_CODE_CR && c != ISO_CODE_SO
5134 && c != ISO_CODE_SI && c != ISO_CODE_ESC
5135 && (!eol_conversion || c != ISO_CODE_LF))
5136 begp++;
5137 }
5138 switch (coding->category_idx)
5139 {
5140 case CODING_CATEGORY_IDX_ISO_8_1:
5141 case CODING_CATEGORY_IDX_ISO_8_2:
5142 /* We can skip all ASCII characters at the tail. */
5143 if (eol_conversion)
5144 while (begp < endp && (c = endp[-1]) < 0x80 && c != '\r') endp--;
5145 else
5146 while (begp < endp && endp[-1] < 0x80) endp--;
5147 /* Do not consider LF as ascii if preceded by CR, since that
5148 confuses eol decoding. */
5149 if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
5150 endp++;
5151 break;
5152
5153 case CODING_CATEGORY_IDX_ISO_7:
5154 case CODING_CATEGORY_IDX_ISO_7_TIGHT:
5155 {
5156 /* We can skip all characters at the tail except for 8-bit
5157 codes and ESC and the following 2-byte at the tail. */
5158 unsigned char *eight_bit = NULL;
5159
5160 if (eol_conversion)
5161 while (begp < endp
5162 && (c = endp[-1]) != ISO_CODE_ESC && c != '\r')
5163 {
5164 if (!eight_bit && c & 0x80) eight_bit = endp;
5165 endp--;
5166 }
5167 else
5168 while (begp < endp
5169 && (c = endp[-1]) != ISO_CODE_ESC)
5170 {
5171 if (!eight_bit && c & 0x80) eight_bit = endp;
5172 endp--;
5173 }
5174 /* Do not consider LF as ascii if preceded by CR, since that
5175 confuses eol decoding. */
5176 if (begp < endp && endp < endp_orig
5177 && endp[-1] == '\r' && endp[0] == '\n')
5178 endp++;
5179 if (begp < endp && endp[-1] == ISO_CODE_ESC)
5180 {
5181 if (endp + 1 < endp_orig && end[0] == '(' && end[1] == 'B')
5182 /* This is an ASCII designation sequence. We can
5183 surely skip the tail. But, if we have
5184 encountered an 8-bit code, skip only the codes
5185 after that. */
5186 endp = eight_bit ? eight_bit : endp + 2;
5187 else
5188 /* Hmmm, we can't skip the tail. */
5189 endp = endp_orig;
5190 }
5191 else if (eight_bit)
5192 endp = eight_bit;
5193 }
5194 }
5195 break;
5196
5197 default:
5198 abort ();
5199 }
5200 *beg += begp - begp_orig;
5201 *end += endp - endp_orig;
5202 return;
5203 }
5204
5205 /* Like shrink_decoding_region but for encoding. */
5206
5207 static void
5208 shrink_encoding_region (beg, end, coding, str)
5209 int *beg, *end;
5210 struct coding_system *coding;
5211 unsigned char *str;
5212 {
5213 unsigned char *begp_orig, *begp, *endp_orig, *endp;
5214 int eol_conversion;
5215 Lisp_Object translation_table;
5216
5217 if (coding->type == coding_type_ccl
5218 || coding->eol_type == CODING_EOL_CRLF
5219 || coding->eol_type == CODING_EOL_CR
5220 || (coding->cmp_data && coding->cmp_data->used > 0))
5221 {
5222 /* We can't skip any data. */
5223 return;
5224 }
5225 if (coding->type == coding_type_no_conversion
5226 || coding->type == coding_type_raw_text
5227 || coding->type == coding_type_emacs_mule
5228 || coding->type == coding_type_undecided)
5229 {
5230 /* We need no conversion, but don't have to skip any data here.
5231 Encoding routine handles them effectively anyway. */
5232 return;
5233 }
5234
5235 translation_table = coding->translation_table_for_encode;
5236 if (NILP (translation_table) && !NILP (Venable_character_translation))
5237 translation_table = Vstandard_translation_table_for_encode;
5238 if (CHAR_TABLE_P (translation_table))
5239 {
5240 int i;
5241 for (i = 0; i < 128; i++)
5242 if (!NILP (CHAR_TABLE_REF (translation_table, i)))
5243 break;
5244 if (i < 128)
5245 /* Some ASCII character should be translated. We give up
5246 shrinking. */
5247 return;
5248 }
5249
5250 if (str)
5251 {
5252 begp_orig = begp = str + *beg;
5253 endp_orig = endp = str + *end;
5254 }
5255 else
5256 {
5257 begp_orig = begp = BYTE_POS_ADDR (*beg);
5258 endp_orig = endp = begp + *end - *beg;
5259 }
5260
5261 eol_conversion = (coding->eol_type == CODING_EOL_CR
5262 || coding->eol_type == CODING_EOL_CRLF);
5263
5264 /* Here, we don't have to check coding->pre_write_conversion because
5265 the caller is expected to have handled it already. */
5266 switch (coding->type)
5267 {
5268 case coding_type_iso2022:
5269 if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
5270 /* We can't skip any data. */
5271 break;
5272 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL)
5273 {
5274 unsigned char *bol = begp;
5275 while (begp < endp && *begp < 0x80)
5276 {
5277 begp++;
5278 if (begp[-1] == '\n')
5279 bol = begp;
5280 }
5281 begp = bol;
5282 goto label_skip_tail;
5283 }
5284 /* fall down ... */
5285
5286 case coding_type_sjis:
5287 case coding_type_big5:
5288 /* We can skip all ASCII characters at the head and tail. */
5289 if (eol_conversion)
5290 while (begp < endp && *begp < 0x80 && *begp != '\n') begp++;
5291 else
5292 while (begp < endp && *begp < 0x80) begp++;
5293 label_skip_tail:
5294 if (eol_conversion)
5295 while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\n') endp--;
5296 else
5297 while (begp < endp && *(endp - 1) < 0x80) endp--;
5298 break;
5299
5300 default:
5301 abort ();
5302 }
5303
5304 *beg += begp - begp_orig;
5305 *end += endp - endp_orig;
5306 return;
5307 }
5308
/* Shrinking the conversion region has a cost of its own, so it is
   attempted only for regions longer than this value.  */
static int shrink_conversion_region_threshhold = 1024;

/* Shrink the region delimited by the pointers BEG and END for CODING
   (and STR), with shrink_encoding_region when ENCODEP is nonzero and
   with shrink_decoding_region otherwise.  Nothing happens unless
   *END - *BEG exceeds shrink_conversion_region_threshhold.  */
#define SHRINK_CONVERSION_REGION(beg, end, coding, str, encodep)        \
  do {                                                                  \
    if (*(end) - *(beg) > shrink_conversion_region_threshhold)          \
      {                                                                 \
        if (encodep)                                                    \
          shrink_encoding_region (beg, end, coding, str);               \
        else                                                            \
          shrink_decoding_region (beg, end, coding, str);               \
      }                                                                 \
  } while (0)
5322
5323 static Lisp_Object
5324 code_convert_region_unwind (arg)
5325 Lisp_Object arg;
5326 {
5327 inhibit_pre_post_conversion = 0;
5328 Vlast_coding_system_used = arg;
5329 return Qnil;
5330 }
5331
5332 /* Store information about all compositions in the range FROM and TO
5333 of OBJ in memory blocks pointed by CODING->cmp_data. OBJ is a
5334 buffer or a string, defaults to the current buffer. */
5335
5336 void
5337 coding_save_composition (coding, from, to, obj)
5338 struct coding_system *coding;
5339 int from, to;
5340 Lisp_Object obj;
5341 {
5342 Lisp_Object prop;
5343 int start, end;
5344
5345 if (coding->composing == COMPOSITION_DISABLED)
5346 return;
5347 if (!coding->cmp_data)
5348 coding_allocate_composition_data (coding, from);
5349 if (!find_composition (from, to, &start, &end, &prop, obj)
5350 || end > to)
5351 return;
5352 if (start < from
5353 && (!find_composition (end, to, &start, &end, &prop, obj)
5354 || end > to))
5355 return;
5356 coding->composing = COMPOSITION_NO;
5357 do
5358 {
5359 if (COMPOSITION_VALID_P (start, end, prop))
5360 {
5361 enum composition_method method = COMPOSITION_METHOD (prop);
5362 if (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH
5363 >= COMPOSITION_DATA_SIZE)
5364 coding_allocate_composition_data (coding, from);
5365 /* For relative composition, we remember start and end
5366 positions, for the other compositions, we also remember
5367 components. */
5368 CODING_ADD_COMPOSITION_START (coding, start - from, method);
5369 if (method != COMPOSITION_RELATIVE)
5370 {
5371 /* We must store a*/
5372 Lisp_Object val, ch;
5373
5374 val = COMPOSITION_COMPONENTS (prop);
5375 if (CONSP (val))
5376 while (CONSP (val))
5377 {
5378 ch = XCAR (val), val = XCDR (val);
5379 CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (ch));
5380 }
5381 else if (VECTORP (val) || STRINGP (val))
5382 {
5383 int len = (VECTORP (val)
5384 ? XVECTOR (val)->size : SCHARS (val));
5385 int i;
5386 for (i = 0; i < len; i++)
5387 {
5388 ch = (STRINGP (val)
5389 ? Faref (val, make_number (i))
5390 : XVECTOR (val)->contents[i]);
5391 CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (ch));
5392 }
5393 }
5394 else /* INTEGERP (val) */
5395 CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (val));
5396 }
5397 CODING_ADD_COMPOSITION_END (coding, end - from);
5398 }
5399 start = end;
5400 }
5401 while (start < to
5402 && find_composition (start, to, &start, &end, &prop, obj)
5403 && end <= to);
5404
5405 /* Make coding->cmp_data point to the first memory block. */
5406 while (coding->cmp_data->prev)
5407 coding->cmp_data = coding->cmp_data->prev;
5408 coding->cmp_data_start = 0;
5409 }
5410
5411 /* Reflect the saved information about compositions to OBJ.
5412 CODING->cmp_data points to a memory block for the information. OBJ
5413 is a buffer or a string, defaults to the current buffer. */
5414
5415 void
5416 coding_restore_composition (coding, obj)
5417 struct coding_system *coding;
5418 Lisp_Object obj;
5419 {
5420 struct composition_data *cmp_data = coding->cmp_data;
5421
5422 if (!cmp_data)
5423 return;
5424
5425 while (cmp_data->prev)
5426 cmp_data = cmp_data->prev;
5427
5428 while (cmp_data)
5429 {
5430 int i;
5431
5432 for (i = 0; i < cmp_data->used && cmp_data->data[i] > 0;
5433 i += cmp_data->data[i])
5434 {
5435 int *data = cmp_data->data + i;
5436 enum composition_method method = (enum composition_method) data[3];
5437 Lisp_Object components;
5438
5439 if (method == COMPOSITION_RELATIVE)
5440 components = Qnil;
5441 else
5442 {
5443 int len = data[0] - 4, j;
5444 Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
5445
5446 if (method == COMPOSITION_WITH_RULE_ALTCHARS
5447 && len % 2 == 0)
5448 len --;
5449 for (j = 0; j < len; j++)
5450 args[j] = make_number (data[4 + j]);
5451 components = (method == COMPOSITION_WITH_ALTCHARS
5452 ? Fstring (len, args) : Fvector (len, args));
5453 }
5454 compose_text (data[1], data[2], components, Qnil, obj);
5455 }
5456 cmp_data = cmp_data->next;
5457 }
5458 }
5459
5460 /* Decode (if ENCODEP is zero) or encode (if ENCODEP is nonzero) the
5461 text from FROM to TO (byte positions are FROM_BYTE and TO_BYTE) by
5462 coding system CODING, and return the status code of code conversion
5463 (currently, this value has no meaning).
5464
5465 How many characters (and bytes) are converted to how many
5466 characters (and bytes) are recorded in members of the structure
5467 CODING.
5468
5469 If REPLACE is nonzero, we do various things as if the original text
5470 is deleted and a new text is inserted. See the comments in
5471 replace_range (insdel.c) to know what we are doing.
5472
5473 If REPLACE is zero, it is assumed that the source text is unibyte.
5474 Otherwise, it is assumed that the source text is multibyte. */
5475
5476 int
5477 code_convert_region (from, from_byte, to, to_byte, coding, encodep, replace)
5478 int from, from_byte, to, to_byte, encodep, replace;
5479 struct coding_system *coding;
5480 {
5481 int len = to - from, len_byte = to_byte - from_byte;
5482 int nchars_del = 0, nbytes_del = 0;
5483 int require, inserted, inserted_byte;
5484 int head_skip, tail_skip, total_skip = 0;
5485 Lisp_Object saved_coding_symbol;
5486 int first = 1;
5487 unsigned char *src, *dst;
5488 Lisp_Object deletion;
5489 int orig_point = PT, orig_len = len;
5490 int prev_Z;
5491 int multibyte_p = !NILP (current_buffer->enable_multibyte_characters);
5492
5493 deletion = Qnil;
5494 saved_coding_symbol = coding->symbol;
5495
5496 if (from < PT && PT < to)
5497 {
5498 TEMP_SET_PT_BOTH (from, from_byte);
5499 orig_point = from;
5500 }
5501
5502 if (replace)
5503 {
5504 int saved_from = from;
5505 int saved_inhibit_modification_hooks;
5506
5507 prepare_to_modify_buffer (from, to, &from);
5508 if (saved_from != from)
5509 {
5510 to = from + len;
5511 from_byte = CHAR_TO_BYTE (from), to_byte = CHAR_TO_BYTE (to);
5512 len_byte = to_byte - from_byte;
5513 }
5514
5515 /* The code conversion routine can not preserve text properties
5516 for now. So, we must remove all text properties in the
5517 region. Here, we must suppress all modification hooks. */
5518 saved_inhibit_modification_hooks = inhibit_modification_hooks;
5519 inhibit_modification_hooks = 1;
5520 Fset_text_properties (make_number (from), make_number (to), Qnil, Qnil);
5521 inhibit_modification_hooks = saved_inhibit_modification_hooks;
5522 }
5523
5524 if (! encodep && CODING_REQUIRE_DETECTION (coding))
5525 {
5526 /* We must detect encoding of text and eol format. */
5527
5528 if (from < GPT && to > GPT)
5529 move_gap_both (from, from_byte);
5530 if (coding->type == coding_type_undecided)
5531 {
5532 detect_coding (coding, BYTE_POS_ADDR (from_byte), len_byte);
5533 if (coding->type == coding_type_undecided)
5534 {
5535 /* It seems that the text contains only ASCII, but we
5536 should not leave it undecided because the deeper
5537 decoding routine (decode_coding) tries to detect the
5538 encodings again in vain. */
5539 coding->type = coding_type_emacs_mule;
5540 coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE;
5541 /* As emacs-mule decoder will handle composition, we
5542 need this setting to allocate coding->cmp_data
5543 later. */
5544 coding->composing = COMPOSITION_NO;
5545 }
5546 }
5547 if (coding->eol_type == CODING_EOL_UNDECIDED
5548 && coding->type != coding_type_ccl)
5549 {
5550 detect_eol (coding, BYTE_POS_ADDR (from_byte), len_byte);
5551 if (coding->eol_type == CODING_EOL_UNDECIDED)
5552 coding->eol_type = CODING_EOL_LF;
5553 /* We had better recover the original eol format if we
5554 encounter an inconsistent eol format while decoding. */
5555 coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
5556 }
5557 }
5558
5559 /* Now we convert the text. */
5560
5561 /* For encoding, we must process pre-write-conversion in advance. */
5562 if (! inhibit_pre_post_conversion
5563 && encodep
5564 && SYMBOLP (coding->pre_write_conversion)
5565 && ! NILP (Ffboundp (coding->pre_write_conversion)))
5566 {
5567 /* The function in pre-write-conversion may put a new text in a
5568 new buffer. */
5569 struct buffer *prev = current_buffer;
5570 Lisp_Object new;
5571
5572 record_unwind_protect (code_convert_region_unwind,
5573 Vlast_coding_system_used);
5574 /* We should not call any more pre-write/post-read-conversion
5575 functions while this pre-write-conversion is running. */
5576 inhibit_pre_post_conversion = 1;
5577 call2 (coding->pre_write_conversion,
5578 make_number (from), make_number (to));
5579 inhibit_pre_post_conversion = 0;
5580 /* Discard the unwind protect. */
5581 specpdl_ptr--;
5582
5583 if (current_buffer != prev)
5584 {
5585 len = ZV - BEGV;
5586 new = Fcurrent_buffer ();
5587 set_buffer_internal_1 (prev);
5588 del_range_2 (from, from_byte, to, to_byte, 0);
5589 TEMP_SET_PT_BOTH (from, from_byte);
5590 insert_from_buffer (XBUFFER (new), 1, len, 0);
5591 Fkill_buffer (new);
5592 if (orig_point >= to)
5593 orig_point += len - orig_len;
5594 else if (orig_point > from)
5595 orig_point = from;
5596 orig_len = len;
5597 to = from + len;
5598 from_byte = CHAR_TO_BYTE (from);
5599 to_byte = CHAR_TO_BYTE (to);
5600 len_byte = to_byte - from_byte;
5601 TEMP_SET_PT_BOTH (from, from_byte);
5602 }
5603 }
5604
5605 if (replace)
5606 {
5607 if (! EQ (current_buffer->undo_list, Qt))
5608 deletion = make_buffer_string_both (from, from_byte, to, to_byte, 1);
5609 else
5610 {
5611 nchars_del = to - from;
5612 nbytes_del = to_byte - from_byte;
5613 }
5614 }
5615
5616 if (coding->composing != COMPOSITION_DISABLED)
5617 {
5618 if (encodep)
5619 coding_save_composition (coding, from, to, Fcurrent_buffer ());
5620 else
5621 coding_allocate_composition_data (coding, from);
5622 }
5623
5624 /* Try to skip the heading and tailing ASCIIs. */
5625 if (coding->type != coding_type_ccl)
5626 {
5627 int from_byte_orig = from_byte, to_byte_orig = to_byte;
5628
5629 if (from < GPT && GPT < to)
5630 move_gap_both (from, from_byte);
5631 SHRINK_CONVERSION_REGION (&from_byte, &to_byte, coding, NULL, encodep);
5632 if (from_byte == to_byte
5633 && (encodep || NILP (coding->post_read_conversion))
5634 && ! CODING_REQUIRE_FLUSHING (coding))
5635 {
5636 coding->produced = len_byte;
5637 coding->produced_char = len;
5638 if (!replace)
5639 /* We must record and adjust for this new text now. */
5640 adjust_after_insert (from, from_byte_orig, to, to_byte_orig, len);
5641 return 0;
5642 }
5643
5644 head_skip = from_byte - from_byte_orig;
5645 tail_skip = to_byte_orig - to_byte;
5646 total_skip = head_skip + tail_skip;
5647 from += head_skip;
5648 to -= tail_skip;
5649 len -= total_skip; len_byte -= total_skip;
5650 }
5651
5652 /* For conversion, we must put the gap before the text in addition to
5653 making the gap larger for efficient decoding. The required gap
5654 size starts from 2000 which is the magic number used in make_gap.
5655 But, after one batch of conversion, it will be incremented if we
5656 find that it is not enough . */
5657 require = 2000;
5658
5659 if (GAP_SIZE < require)
5660 make_gap (require - GAP_SIZE);
5661 move_gap_both (from, from_byte);
5662
5663 inserted = inserted_byte = 0;
5664
5665 GAP_SIZE += len_byte;
5666 ZV -= len;
5667 Z -= len;
5668 ZV_BYTE -= len_byte;
5669 Z_BYTE -= len_byte;
5670
5671 if (GPT - BEG < BEG_UNCHANGED)
5672 BEG_UNCHANGED = GPT - BEG;
5673 if (Z - GPT < END_UNCHANGED)
5674 END_UNCHANGED = Z - GPT;
5675
5676 if (!encodep && coding->src_multibyte)
5677 {
5678 /* Decoding routines expects that the source text is unibyte.
5679 We must convert 8-bit characters of multibyte form to
5680 unibyte. */
5681 int len_byte_orig = len_byte;
5682 len_byte = str_as_unibyte (GAP_END_ADDR - len_byte, len_byte);
5683 if (len_byte < len_byte_orig)
5684 safe_bcopy (GAP_END_ADDR - len_byte_orig, GAP_END_ADDR - len_byte,
5685 len_byte);
5686 coding->src_multibyte = 0;
5687 }
5688
5689 for (;;)
5690 {
5691 int result;
5692
5693 /* The buffer memory is now:
5694 +--------+converted-text+---------+-------original-text-------+---+
5695 |<-from->|<--inserted-->|---------|<--------len_byte--------->|---|
5696 |<---------------------- GAP ----------------------->| */
5697 src = GAP_END_ADDR - len_byte;
5698 dst = GPT_ADDR + inserted_byte;
5699
5700 if (encodep)
5701 result = encode_coding (coding, src, dst, len_byte, 0);
5702 else
5703 {
5704 if (coding->composing != COMPOSITION_DISABLED)
5705 coding->cmp_data->char_offset = from + inserted;
5706 result = decode_coding (coding, src, dst, len_byte, 0);
5707 }
5708
5709 /* The buffer memory is now:
5710 +--------+-------converted-text----+--+------original-text----+---+
5711 |<-from->|<-inserted->|<-produced->|--|<-(len_byte-consumed)->|---|
5712 |<---------------------- GAP ----------------------->| */
5713
5714 inserted += coding->produced_char;
5715 inserted_byte += coding->produced;
5716 len_byte -= coding->consumed;
5717
5718 if (result == CODING_FINISH_INSUFFICIENT_CMP)
5719 {
5720 coding_allocate_composition_data (coding, from + inserted);
5721 continue;
5722 }
5723
5724 src += coding->consumed;
5725 dst += coding->produced;
5726
5727 if (result == CODING_FINISH_NORMAL)
5728 {
5729 src += len_byte;
5730 break;
5731 }
5732 if (! encodep && result == CODING_FINISH_INCONSISTENT_EOL)
5733 {
5734 unsigned char *pend = dst, *p = pend - inserted_byte;
5735 Lisp_Object eol_type;
5736
5737 /* Encode LFs back to the original eol format (CR or CRLF). */
5738 if (coding->eol_type == CODING_EOL_CR)
5739 {
5740 while (p < pend) if (*p++ == '\n') p[-1] = '\r';
5741 }
5742 else
5743 {
5744 int count = 0;
5745
5746 while (p < pend) if (*p++ == '\n') count++;
5747 if (src - dst < count)
5748 {
5749 /* We don't have sufficient room for encoding LFs
5750 back to CRLF. We must record converted and
5751 not-yet-converted text back to the buffer
5752 content, enlarge the gap, then record them out of
5753 the buffer contents again. */
5754 int add = len_byte + inserted_byte;
5755
5756 GAP_SIZE -= add;
5757 ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
5758 GPT += inserted_byte; GPT_BYTE += inserted_byte;
5759 make_gap (count - GAP_SIZE);
5760 GAP_SIZE += add;
5761 ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
5762 GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5763 /* Don't forget to update SRC, DST, and PEND. */
5764 src = GAP_END_ADDR - len_byte;
5765 dst = GPT_ADDR + inserted_byte;
5766 pend = dst;
5767 }
5768 inserted += count;
5769 inserted_byte += count;
5770 coding->produced += count;
5771 p = dst = pend + count;
5772 while (count)
5773 {
5774 *--p = *--pend;
5775 if (*p == '\n') count--, *--p = '\r';
5776 }
5777 }
5778
5779 /* Suppress eol-format conversion in the further conversion. */
5780 coding->eol_type = CODING_EOL_LF;
5781
5782 /* Set the coding system symbol to that for Unix-like EOL. */
5783 eol_type = Fget (saved_coding_symbol, Qeol_type);
5784 if (VECTORP (eol_type)
5785 && XVECTOR (eol_type)->size == 3
5786 && SYMBOLP (XVECTOR (eol_type)->contents[CODING_EOL_LF]))
5787 coding->symbol = XVECTOR (eol_type)->contents[CODING_EOL_LF];
5788 else
5789 coding->symbol = saved_coding_symbol;
5790
5791 continue;
5792 }
5793 if (len_byte <= 0)
5794 {
5795 if (coding->type != coding_type_ccl
5796 || coding->mode & CODING_MODE_LAST_BLOCK)
5797 break;
5798 coding->mode |= CODING_MODE_LAST_BLOCK;
5799 continue;
5800 }
5801 if (result == CODING_FINISH_INSUFFICIENT_SRC)
5802 {
5803 /* The source text ends in invalid codes. Let's just
5804 make them valid buffer contents, and finish conversion. */
5805 if (multibyte_p)
5806 {
5807 unsigned char *start = dst;
5808
5809 inserted += len_byte;
5810 while (len_byte--)
5811 {
5812 int c = *src++;
5813 dst += CHAR_STRING (c, dst);
5814 }
5815
5816 inserted_byte += dst - start;
5817 }
5818 else
5819 {
5820 inserted += len_byte;
5821 inserted_byte += len_byte;
5822 while (len_byte--)
5823 *dst++ = *src++;
5824 }
5825 break;
5826 }
5827 if (result == CODING_FINISH_INTERRUPT)
5828 {
5829 /* The conversion procedure was interrupted by a user. */
5830 break;
5831 }
5832 /* Now RESULT == CODING_FINISH_INSUFFICIENT_DST */
5833 if (coding->consumed < 1)
5834 {
5835 /* It's quite strange to require more memory without
5836 consuming any bytes. Perhaps CCL program bug. */
5837 break;
5838 }
5839 if (first)
5840 {
5841 /* We have just done the first batch of conversion which was
5842 stopped because of insufficient gap. Let's reconsider the
5843 required gap size (i.e. SRT - DST) now.
5844
5845 We have converted ORIG bytes (== coding->consumed) into
5846 NEW bytes (coding->produced). To convert the remaining
5847 LEN bytes, we may need REQUIRE bytes of gap, where:
5848 REQUIRE + LEN_BYTE = LEN_BYTE * (NEW / ORIG)
5849 REQUIRE = LEN_BYTE * (NEW - ORIG) / ORIG
5850 Here, we are sure that NEW >= ORIG. */
5851 float ratio;
5852
5853 if (coding->produced <= coding->consumed)
5854 {
5855 /* This happens because of CCL-based coding system with
5856 eol-type CRLF. */
5857 require = 0;
5858 }
5859 else
5860 {
5861 ratio = (coding->produced - coding->consumed) / coding->consumed;
5862 require = len_byte * ratio;
5863 }
5864 first = 0;
5865 }
5866 if ((src - dst) < (require + 2000))
5867 {
5868 /* See the comment above the previous call of make_gap. */
5869 int add = len_byte + inserted_byte;
5870
5871 GAP_SIZE -= add;
5872 ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
5873 GPT += inserted_byte; GPT_BYTE += inserted_byte;
5874 make_gap (require + 2000);
5875 GAP_SIZE += add;
5876 ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
5877 GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5878 }
5879 }
5880 if (src - dst > 0) *dst = 0; /* Put an anchor. */
5881
5882 if (encodep && coding->dst_multibyte)
5883 {
5884 /* The output is unibyte. We must convert 8-bit characters to
5885 multibyte form. */
5886 if (inserted_byte * 2 > GAP_SIZE)
5887 {
5888 GAP_SIZE -= inserted_byte;
5889 ZV += inserted_byte; Z += inserted_byte;
5890 ZV_BYTE += inserted_byte; Z_BYTE += inserted_byte;
5891 GPT += inserted_byte; GPT_BYTE += inserted_byte;
5892 make_gap (inserted_byte - GAP_SIZE);
5893 GAP_SIZE += inserted_byte;
5894 ZV -= inserted_byte; Z -= inserted_byte;
5895 ZV_BYTE -= inserted_byte; Z_BYTE -= inserted_byte;
5896 GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5897 }
5898 inserted_byte = str_to_multibyte (GPT_ADDR, GAP_SIZE, inserted_byte);
5899 }
5900
5901 /* If we shrank the conversion area, adjust it now. */
5902 if (total_skip > 0)
5903 {
5904 if (tail_skip > 0)
5905 safe_bcopy (GAP_END_ADDR, GPT_ADDR + inserted_byte, tail_skip);
5906 inserted += total_skip; inserted_byte += total_skip;
5907 GAP_SIZE += total_skip;
5908 GPT -= head_skip; GPT_BYTE -= head_skip;
5909 ZV -= total_skip; ZV_BYTE -= total_skip;
5910 Z -= total_skip; Z_BYTE -= total_skip;
5911 from -= head_skip; from_byte -= head_skip;
5912 to += tail_skip; to_byte += tail_skip;
5913 }
5914
5915 prev_Z = Z;
5916 if (! EQ (current_buffer->undo_list, Qt))
5917 adjust_after_replace (from, from_byte, deletion, inserted, inserted_byte);
5918 else
5919 adjust_after_replace_noundo (from, from_byte, nchars_del, nbytes_del,
5920 inserted, inserted_byte);
5921 inserted = Z - prev_Z;
5922
5923 if (!encodep && coding->cmp_data && coding->cmp_data->used)
5924 coding_restore_composition (coding, Fcurrent_buffer ());
5925 coding_free_composition_data (coding);
5926
5927 if (! inhibit_pre_post_conversion
5928 && ! encodep && ! NILP (coding->post_read_conversion))
5929 {
5930 Lisp_Object val;
5931 Lisp_Object saved_coding_system;
5932
5933 if (from != PT)
5934 TEMP_SET_PT_BOTH (from, from_byte);
5935 prev_Z = Z;
5936 record_unwind_protect (code_convert_region_unwind,
5937 Vlast_coding_system_used);
5938 saved_coding_system = Vlast_coding_system_used;
5939 Vlast_coding_system_used = coding->symbol;
5940 /* We should not call any more pre-write/post-read-conversion
5941 functions while this post-read-conversion is running. */
5942 inhibit_pre_post_conversion = 1;
5943 val = call1 (coding->post_read_conversion, make_number (inserted));
5944 inhibit_pre_post_conversion = 0;
5945 coding->symbol = Vlast_coding_system_used;
5946 Vlast_coding_system_used = saved_coding_system;
5947 /* Discard the unwind protect. */
5948 specpdl_ptr--;
5949 CHECK_NUMBER (val);
5950 inserted += Z - prev_Z;
5951 }
5952
5953 if (orig_point >= from)
5954 {
5955 if (orig_point >= from + orig_len)
5956 orig_point += inserted - orig_len;
5957 else
5958 orig_point = from;
5959 TEMP_SET_PT (orig_point);
5960 }
5961
5962 if (replace)
5963 {
5964 signal_after_change (from, to - from, inserted);
5965 update_compositions (from, from + inserted, CHECK_BORDER);
5966 }
5967
5968 {
5969 coding->consumed = to_byte - from_byte;
5970 coding->consumed_char = to - from;
5971 coding->produced = inserted_byte;
5972 coding->produced_char = inserted;
5973 }
5974
5975 return 0;
5976 }
5977
5978 Lisp_Object
5979 run_pre_post_conversion_on_str (str, coding, encodep)
5980 Lisp_Object str;
5981 struct coding_system *coding;
5982 int encodep;
5983 {
5984 int count = SPECPDL_INDEX ();
5985 struct gcpro gcpro1, gcpro2;
5986 int multibyte = STRING_MULTIBYTE (str);
5987 Lisp_Object buffer;
5988 struct buffer *buf;
5989 Lisp_Object old_deactivate_mark;
5990
5991 record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
5992 record_unwind_protect (code_convert_region_unwind,
5993 Vlast_coding_system_used);
5994 /* It is not crucial to specbind this. */
5995 old_deactivate_mark = Vdeactivate_mark;
5996 GCPRO2 (str, old_deactivate_mark);
5997
5998 buffer = Fget_buffer_create (build_string (" *code-converting-work*"));
5999 buf = XBUFFER (buffer);
6000
6001 buf->directory = current_buffer->directory;
6002 buf->read_only = Qnil;
6003 buf->filename = Qnil;
6004 buf->undo_list = Qt;
6005 buf->overlays_before = NULL;
6006 buf->overlays_after = NULL;
6007
6008 set_buffer_internal (buf);
6009 /* We must insert the contents of STR as is without
6010 unibyte<->multibyte conversion. For that, we adjust the
6011 multibyteness of the working buffer to that of STR. */
6012 Ferase_buffer ();
6013 buf->enable_multibyte_characters = multibyte ? Qt : Qnil;
6014
6015 insert_from_string (str, 0, 0,
6016 SCHARS (str), SBYTES (str), 0);
6017 UNGCPRO;
6018 inhibit_pre_post_conversion = 1;
6019 if (encodep)
6020 call2 (coding->pre_write_conversion, make_number (BEG), make_number (Z));
6021 else
6022 {
6023 Vlast_coding_system_used = coding->symbol;
6024 TEMP_SET_PT_BOTH (BEG, BEG_BYTE);
6025 call1 (coding->post_read_conversion, make_number (Z - BEG));
6026 coding->symbol = Vlast_coding_system_used;
6027 }
6028 inhibit_pre_post_conversion = 0;
6029 Vdeactivate_mark = old_deactivate_mark;
6030 str = make_buffer_string (BEG, Z, 1);
6031 return unbind_to (count, str);
6032 }
6033
/* Decode the text of the string STR according to CODING and return
   the decoded string.

   If NOCOPY is nonzero, STR itself may be returned when no decoding
   has to be done; otherwise a copy is returned in that case.

   On return, the consumed, consumed_char, produced and produced_char
   members of CODING hold the totals accumulated over the calls to
   decode_coding (or the sizes of STR when nothing was decoded).
   CODING's symbol, type and eol_type members may be changed by the
   detection step and by the recovery from an inconsistent
   end-of-line format.

   If the post_read_conversion member of CODING is a symbol with a
   function definition, it is run on the decoded text through
   run_pre_post_conversion_on_str.  */

6034 Lisp_Object
6035 decode_coding_string (str, coding, nocopy)
6036 Lisp_Object str;
6037 struct coding_system *coding;
6038 int nocopy;
6039 {
6040 int len;
6041 struct conversion_buffer buf;
6042 int from, to_byte;
6043 Lisp_Object saved_coding_symbol;
6044 int result;
6045 int require_decoding;
6046 int shrinked_bytes = 0;
6047 Lisp_Object newstr;
6048 int consumed, consumed_char, produced, produced_char;
6049
/* FROM and TO_BYTE are byte offsets into STR delimiting the part
   that still has to be decoded.  */
6050 from = 0;
6051 to_byte = SBYTES (str);
6052
6053 saved_coding_symbol = coding->symbol;
6054 coding->src_multibyte = STRING_MULTIBYTE (str);
6055 coding->dst_multibyte = 1;
6056 if (CODING_REQUIRE_DETECTION (coding))
6057 {
6058 /* See the comments in code_convert_region. */
6059 if (coding->type == coding_type_undecided)
6060 {
6061 detect_coding (coding, SDATA (str), to_byte);
6062 if (coding->type == coding_type_undecided)
6063 {
6064 coding->type = coding_type_emacs_mule;
6065 coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE;
6066 /* As emacs-mule decoder will handle composition, we
6067 need this setting to allocate coding->cmp_data
6068 later. */
6069 coding->composing = COMPOSITION_NO;
6070 }
6071 }
6072 if (coding->eol_type == CODING_EOL_UNDECIDED
6073 && coding->type != coding_type_ccl)
6074 {
/* Remember the symbol as it is after the detection above; it is
   consulted again if the EOL format turns out to be inconsistent.  */
6075 saved_coding_symbol = coding->symbol;
6076 detect_eol (coding, SDATA (str), to_byte);
6077 if (coding->eol_type == CODING_EOL_UNDECIDED)
6078 coding->eol_type = CODING_EOL_LF;
6079 /* We had better recover the original eol format if we
6080 encounter an inconsistent eol format while decoding. */
6081 coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
6082 }
6083 }
6084
6085 if (coding->type == coding_type_no_conversion
6086 || coding->type == coding_type_raw_text)
6087 coding->dst_multibyte = 0;
6088
6089 require_decoding = CODING_REQUIRE_DECODING (coding);
6090
6091 if (STRING_MULTIBYTE (str))
6092 {
6093 /* Decoding routines expect the source text to be unibyte. */
6094 str = Fstring_as_unibyte (str);
6095 to_byte = SBYTES (str);
/* STR now refers to the string returned by Fstring_as_unibyte, so
   the function does not make a further copy of it.  */
6096 nocopy = 1;
6097 coding->src_multibyte = 0;
6098 }
6099
6100 /* Try to skip the heading and tailing ASCIIs. */
6101 if (require_decoding && coding->type != coding_type_ccl)
6102 {
6103 SHRINK_CONVERSION_REGION (&from, &to_byte, coding, SDATA (str),
6104 0);
6105 if (from == to_byte)
6106 require_decoding = 0;
/* Number of bytes skipped at the head (FROM) plus at the tail.  */
6107 shrinked_bytes = from + (SBYTES (str) - to_byte);
6108 }
6109
/* Fast path: nothing to decode and no post-read-conversion function
   to run, so the text of STR is the result.  */
6110 if (!require_decoding
6111 && !(SYMBOLP (coding->post_read_conversion)
6112 && !NILP (Ffboundp (coding->post_read_conversion))))
6113 {
6114 coding->consumed = SBYTES (str);
6115 coding->consumed_char = SCHARS (str);
6116 if (coding->dst_multibyte)
6117 {
6118 str = Fstring_as_multibyte (str);
6119 nocopy = 1;
6120 }
6121 coding->produced = SBYTES (str);
6122 coding->produced_char = SCHARS (str);
6123 return (nocopy ? str : Fcopy_sequence (str));
6124 }
6125
6126 if (coding->composing != COMPOSITION_DISABLED)
6127 coding_allocate_composition_data (coding, from);
6128 len = decoding_buffer_size (coding, to_byte - from);
6129 allocate_conversion_buffer (buf, len);
6130
/* Call decode_coding repeatedly, each time resuming after the source
   bytes already consumed and appending to what was already produced,
   until it finishes normally or makes no more progress on the
   remaining source.  */
6131 consumed = consumed_char = produced = produced_char = 0;
6132 while (1)
6133 {
6134 result = decode_coding (coding, SDATA (str) + from + consumed,
6135 buf.data + produced, to_byte - from - consumed,
6136 buf.size - produced);
6137 consumed += coding->consumed;
6138 consumed_char += coding->consumed_char;
6139 produced += coding->produced;
6140 produced_char += coding->produced_char;
6141 if (result == CODING_FINISH_NORMAL
6142 || (result == CODING_FINISH_INSUFFICIENT_SRC
6143 && coding->consumed == 0))
6144 break;
6145 if (result == CODING_FINISH_INSUFFICIENT_CMP)
6146 coding_allocate_composition_data (coding, from + produced_char);
6147 else if (result == CODING_FINISH_INSUFFICIENT_DST)
6148 extend_conversion_buffer (&buf);
6149 else if (result == CODING_FINISH_INCONSISTENT_EOL)
6150 {
6151 Lisp_Object eol_type;
6152
6153 /* Recover the original EOL format. */
6154 if (coding->eol_type == CODING_EOL_CR)
6155 {
6156 unsigned char *p;
6157 for (p = buf.data; p < buf.data + produced; p++)
6158 if (*p == '\n') *p = '\r';
6159 }
6160 else if (coding->eol_type == CODING_EOL_CRLF)
6161 {
6162 int num_eol = 0;
6163 unsigned char *p0, *p1;
6164 for (p0 = buf.data, p1 = p0 + produced; p0 < p1; p0++)
6165 if (*p0 == '\n') num_eol++;
6166 if (produced + num_eol >= buf.size)
6167 extend_conversion_buffer (&buf);
/* Copy the produced bytes backward from the end so that the text
   can be expanded in place, putting a '\r' in front of each '\n'.  */
6168 for (p0 = buf.data + produced, p1 = p0 + num_eol; p0 > buf.data;)
6169 {
6170 *--p1 = *--p0;
6171 if (*p0 == '\n') *--p1 = '\r';
6172 }
6173 produced += num_eol;
6174 produced_char += num_eol;
6175 }
6176 /* Suppress eol-format conversion in the further conversion. */
6177 coding->eol_type = CODING_EOL_LF;
6178
6179 /* Set the coding system symbol to that for Unix-like EOL. */
6180 eol_type = Fget (saved_coding_symbol, Qeol_type);
6181 if (VECTORP (eol_type)
6182 && XVECTOR (eol_type)->size == 3
6183 && SYMBOLP (XVECTOR (eol_type)->contents[CODING_EOL_LF]))
6184 coding->symbol = XVECTOR (eol_type)->contents[CODING_EOL_LF];
6185 else
6186 coding->symbol = saved_coding_symbol;
6187
6188
6189 }
6190 }
6191
6192 coding->consumed = consumed;
6193 coding->consumed_char = consumed_char;
6194 coding->produced = produced;
6195 coding->produced_char = produced_char;
6196
/* Build the result: the skipped head of STR, then the decoded bytes,
   then the skipped tail of STR.  */
6197 if (coding->dst_multibyte)
6198 newstr = make_uninit_multibyte_string (produced_char + shrinked_bytes,
6199 produced + shrinked_bytes);
6200 else
6201 newstr = make_uninit_string (produced + shrinked_bytes);
6202 if (from > 0)
6203 STRING_COPYIN (newstr, 0, SDATA (str), from);
6204 STRING_COPYIN (newstr, from, buf.data, produced);
6205 if (shrinked_bytes > from)
6206 STRING_COPYIN (newstr, from + produced,
6207 SDATA (str) + to_byte,
6208 shrinked_bytes - from);
6209 free_conversion_buffer (&buf);
6210
6211 if (coding->cmp_data && coding->cmp_data->used)
6212 coding_restore_composition (coding, newstr);
6213 coding_free_composition_data (coding);
6214
/* Let the post-read-conversion function, if one is defined, process
   the decoded text.  */
6215 if (SYMBOLP (coding->post_read_conversion)
6216 && !NILP (Ffboundp (coding->post_read_conversion)))
6217 newstr = run_pre_post_conversion_on_str (newstr, coding, 0);
6218
6219 return newstr;
6220 }
6221
6222 Lisp_Object
6223 encode_coding_string (str, coding, nocopy)
6224 Lisp_Object str;
6225 struct coding_system *coding;
6226 int nocopy;
6227 {
6228 int len;
6229 struct conversion_buffer buf;
6230 int from, to, to_byte;
6231 int result;
6232 int shrinked_bytes = 0;
6233 Lisp_Object newstr;
6234 int consumed, consumed_char, produced, produced_char;
6235
6236 if (SYMBOLP (coding->pre_write_conversion)
6237 && !NILP (Ffboundp (coding->pre_write_conversion)))
6238 str = run_pre_post_conversion_on_str (str, coding, 1);
6239
6240 from = 0;
6241 to = SCHARS (str);
6242 to_byte = SBYTES (str);
6243
6244 /* Encoding routines determine the multibyteness of the source text
6245 by coding->src_multibyte. */
6246 coding->src_multibyte = STRING_MULTIBYTE (str);
6247 coding->dst_multibyte = 0;
6248 if (! CODING_REQUIRE_ENCODING (coding))
6249 {
6250 coding->consumed = SBYTES (str);
6251 coding->consumed_char = SCHARS (str);
6252 if (STRING_MULTIBYTE (str))
6253 {
6254 str = Fstring_as_unibyte (str);
6255 nocopy = 1;
6256 }
6257 coding->produced = SBYTES (str);
6258 coding->produced_char = SCHARS (str);
6259 return (nocopy ? str : Fcopy_sequence (str));
6260 }
6261
6262 if (coding->composing != COMPOSITION_DISABLED)
6263 coding_save_composition (coding, from, to, str);
6264
6265 /* Try to skip the heading and tailing ASCIIs. */
6266 if (coding->type != coding_type_ccl)
6267 {
6268 SHRINK_CONVERSION_REGION (&from, &to_byte, coding, SDATA (str),
6269 1);
6270 if (from == to_byte)
6271 return (nocopy ? str : Fcopy_sequence (str));
6272 shrinked_bytes = from + (SBYTES (str) - to_byte);
6273 }
6274
6275 len = encoding_buffer_size (coding, to_byte - from);
6276 allocate_conversion_buffer (buf, len);
6277
6278 consumed = consumed_char = produced = produced_char = 0;
6279 while (1)
6280 {
6281 result = encode_coding (coding, SDATA (str) + from + consumed,
6282 buf.data + produced, to_byte - from - consumed,
6283 buf.size - produced);
6284 consumed += coding->consumed;
6285 consumed_char += coding->consumed_char;
6286 produced += coding->produced;
6287 produced_char += coding->produced_char;
6288 if (result == CODING_FINISH_NORMAL
6289 || (result == CODING_FINISH_INSUFFICIENT_SRC
6290 && coding->consumed == 0))
6291 break;
6292 /* Now result should be CODING_FINISH_INSUFFICIENT_DST. */
6293 extend_conversion_buffer (&buf);
6294 }
6295
6296 coding->consumed = consumed;
6297 coding->consumed_char = consumed_char;
6298 coding->produced = produced;
6299 coding->produced_char = produced_char;
6300
6301 newstr = make_uninit_string (produced + shrinked_bytes);
6302 if (from > 0)
6303 STRING_COPYIN (newstr, 0, SDATA (str), from);
6304 STRING_COPYIN (newstr, from, buf.data, produced);
6305 if (shrinked_bytes > from)
6306 STRING_COPYIN (newstr, from + produced,
6307 SDATA (str) + to_byte,
6308 shrinked_bytes - from);
6309
6310 free_conversion_buffer (&buf);
6311 coding_free_composition_data (coding);
6312
6313 return newstr;
6314 }
6315
6316 \f
6317 #ifdef emacs
6318 /*** 8. Emacs Lisp library functions ***/
6319
6320 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
6321 doc: /* Return t if OBJECT is nil or a coding-system.
6322 See the documentation of `make-coding-system' for information
6323 about coding-system objects. */)
6324 (obj)
6325 Lisp_Object obj;
6326 {
6327 if (NILP (obj))
6328 return Qt;
6329 if (!SYMBOLP (obj))
6330 return Qnil;
6331 /* Get coding-spec vector for OBJ. */
6332 obj = Fget (obj, Qcoding_system);
6333 return ((VECTORP (obj) && XVECTOR (obj)->size == 5)
6334 ? Qt : Qnil);
6335 }
6336
6337 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
6338 Sread_non_nil_coding_system, 1, 1, 0,
6339 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT. */)
6340 (prompt)
6341 Lisp_Object prompt;
6342 {
6343 Lisp_Object val;
6344 do
6345 {
6346 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6347 Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
6348 }
6349 while (SCHARS (val) == 0);
6350 return (Fintern (val, Qnil));
6351 }
6352
6353 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
6354 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
6355 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM. */)
6356 (prompt, default_coding_system)
6357 Lisp_Object prompt, default_coding_system;
6358 {
6359 Lisp_Object val;
6360 if (SYMBOLP (default_coding_system))
6361 default_coding_system = SYMBOL_NAME (default_coding_system);
6362 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6363 Qt, Qnil, Qcoding_system_history,
6364 default_coding_system, Qnil);
6365 return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
6366 }
6367
6368 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
6369 1, 1, 0,
6370 doc: /* Check validity of CODING-SYSTEM.
6371 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
6372 It is valid if it is a symbol with a non-nil `coding-system' property.
6373 The value of property should be a vector of length 5. */)
6374 (coding_system)
6375 Lisp_Object coding_system;
6376 {
6377 CHECK_SYMBOL (coding_system);
6378 if (!NILP (Fcoding_system_p (coding_system)))
6379 return coding_system;
6380 while (1)
6381 Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
6382 }
6383 \f
6384 Lisp_Object
6385 detect_coding_system (src, src_bytes, highest, multibytep)
6386 const unsigned char *src;
6387 int src_bytes, highest;
6388 int multibytep;
6389 {
6390 int coding_mask, eol_type;
6391 Lisp_Object val, tmp;
6392 int dummy;
6393
6394 coding_mask = detect_coding_mask (src, src_bytes, NULL, &dummy, multibytep);
6395 eol_type = detect_eol_type (src, src_bytes, &dummy);
6396 if (eol_type == CODING_EOL_INCONSISTENT)
6397 eol_type = CODING_EOL_UNDECIDED;
6398
6399 if (!coding_mask)
6400 {
6401 val = Qundecided;
6402 if (eol_type != CODING_EOL_UNDECIDED)
6403 {
6404 Lisp_Object val2;
6405 val2 = Fget (Qundecided, Qeol_type);
6406 if (VECTORP (val2))
6407 val = XVECTOR (val2)->contents[eol_type];
6408 }
6409 return (highest ? val : Fcons (val, Qnil));
6410 }
6411
6412 /* At first, gather possible coding systems in VAL. */
6413 val = Qnil;
6414 for (tmp = Vcoding_category_list; CONSP (tmp); tmp = XCDR (tmp))
6415 {
6416 Lisp_Object category_val, category_index;
6417
6418 category_index = Fget (XCAR (tmp), Qcoding_category_index);
6419 category_val = Fsymbol_value (XCAR (tmp));
6420 if (!NILP (category_val)
6421 && NATNUMP (category_index)
6422 && (coding_mask & (1 << XFASTINT (category_index))))
6423 {
6424 val = Fcons (category_val, val);
6425 if (highest)
6426 break;
6427 }
6428 }
6429 if (!highest)
6430 val = Fnreverse (val);
6431
6432 /* Then, replace the elements with subsidiary coding systems. */
6433 for (tmp = val; CONSP (tmp); tmp = XCDR (tmp))
6434 {
6435 if (eol_type != CODING_EOL_UNDECIDED
6436 && eol_type != CODING_EOL_INCONSISTENT)
6437 {
6438 Lisp_Object eol;
6439 eol = Fget (XCAR (tmp), Qeol_type);
6440 if (VECTORP (eol))
6441 XSETCAR (tmp, XVECTOR (eol)->contents[eol_type]);
6442 }
6443 }
6444 return (highest ? XCAR (val) : val);
6445 }
6446
6447 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
6448 2, 3, 0,
6449 doc: /* Detect how the byte sequence in the region is encoded.
6450 Return a list of possible coding systems used on decoding a byte
6451 sequence containing the bytes in the region between START and END when
6452 the coding system `undecided' is specified. The list is ordered by
6453 priority decided in the current language environment.
6454
6455 If only ASCII characters are found, it returns a list of single element
6456 `undecided' or its subsidiary coding system according to a detected
6457 end-of-line format.
6458
6459 If optional argument HIGHEST is non-nil, return the coding system of
6460 highest priority. */)
6461 (start, end, highest)
6462 Lisp_Object start, end, highest;
6463 {
6464 int from, to;
6465 int from_byte, to_byte;
6466 int include_anchor_byte = 0;
6467
6468 CHECK_NUMBER_COERCE_MARKER (start);
6469 CHECK_NUMBER_COERCE_MARKER (end);
6470
6471 validate_region (&start, &end);
6472 from = XINT (start), to = XINT (end);
6473 from_byte = CHAR_TO_BYTE (from);
6474 to_byte = CHAR_TO_BYTE (to);
6475
6476 if (from < GPT && to >= GPT)
6477 move_gap_both (to, to_byte);
6478 /* If we an anchor byte `\0' follows the region, we include it in
6479 the detecting source. Then code detectors can handle the tailing
6480 byte sequence more accurately.
6481
6482 Fix me: This is not a perfect solution. It is better that we
6483 add one more argument, say LAST_BLOCK, to all detect_coding_XXX.
6484 */
6485 if (to == Z || (to == GPT && GAP_SIZE > 0))
6486 include_anchor_byte = 1;
6487 return detect_coding_system (BYTE_POS_ADDR (from_byte),
6488 to_byte - from_byte + include_anchor_byte,
6489 !NILP (highest),
6490 !NILP (current_buffer
6491 ->enable_multibyte_characters));
6492 }
6493
6494 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
6495 1, 2, 0,
6496 doc: /* Detect how the byte sequence in STRING is encoded.
6497 Return a list of possible coding systems used on decoding a byte
6498 sequence containing the bytes in STRING when the coding system
6499 `undecided' is specified. The list is ordered by priority decided in
6500 the current language environment.
6501
6502 If only ASCII characters are found, it returns a list of single element
6503 `undecided' or its subsidiary coding system according to a detected
6504 end-of-line format.
6505
6506 If optional argument HIGHEST is non-nil, return the coding system of
6507 highest priority. */)
6508 (string, highest)
6509 Lisp_Object string, highest;
6510 {
6511 CHECK_STRING (string);
6512
6513 return detect_coding_system (SDATA (string),
6514 /* "+ 1" is to include the anchor byte
6515 `\0'. With this, code detectors can
6516 handle the tailing bytes more
6517 accurately. */
6518 SBYTES (string) + 1,
6519 !NILP (highest),
6520 STRING_MULTIBYTE (string));
6521 }
6522
6523 /* Subroutine for Fsafe_coding_systems_region_internal.
6524
6525 Return a list of coding systems that safely encode the multibyte
6526 text between P and PEND. SAFE_CODINGS, if non-nil, is an alist of
6527 possible coding systems. If it is nil, it means that we have not
6528 yet found any coding systems.
6529
6530 WORK_TABLE is a copy of the char-table Vchar_coding_system_table. An
6531 element of WORK_TABLE is set to t once the element is looked up.
6532
6533 If a non-ASCII single byte char is found, set
6534 *single_byte_char_found to 1. */
6535
6536 static Lisp_Object
6537 find_safe_codings (p, pend, safe_codings, work_table, single_byte_char_found)
6538 unsigned char *p, *pend;
6539 Lisp_Object safe_codings, work_table;
6540 int *single_byte_char_found;
6541 {
6542 int c, len;
6543 Lisp_Object val, ch;
6544 Lisp_Object prev, tail;
6545
6546 while (p < pend)
6547 {
6548 c = STRING_CHAR_AND_LENGTH (p, pend - p, len);
6549 p += len;
6550 if (ASCII_BYTE_P (c))
6551 /* We can ignore ASCII characters here. */
6552 continue;
6553 if (SINGLE_BYTE_CHAR_P (c))
6554 *single_byte_char_found = 1;
6555 if (NILP (safe_codings))
6556 /* Already all coding systems are excluded. But, we can't
6557 terminate the loop here because non-ASCII single-byte char
6558 must be found. */
6559 continue;
6560 /* Check the safe coding systems for C. */
6561 ch = make_number (c);
6562 val = Faref (work_table, ch);
6563 if (EQ (val, Qt))
6564 /* This element was already checked. Ignore it. */
6565 continue;
6566 /* Remember that we checked this element. */
6567 Faset (work_table, ch, Qt);
6568
6569 for (prev = tail = safe_codings; CONSP (tail); tail = XCDR (tail))
6570 {
6571 Lisp_Object elt, translation_table, hash_table, accept_latin_extra;
6572 int encodable;
6573
6574 elt = XCAR (tail);
6575 if (CONSP (XCDR (elt)))
6576 {
6577 /* This entry has this format now:
6578 ( CODING SAFE-CHARS TRANSLATION-TABLE HASH-TABLE
6579 ACCEPT-LATIN-EXTRA ) */
6580 val = XCDR (elt);
6581 encodable = ! NILP (Faref (XCAR (val), ch));
6582 if (! encodable)
6583 {
6584 val = XCDR (val);
6585 translation_table = XCAR (val);
6586 hash_table = XCAR (XCDR (val));
6587 accept_latin_extra = XCAR (XCDR (XCDR (val)));
6588 }
6589 }
6590 else
6591 {
6592 /* This entry has this format now: ( CODING . SAFE-CHARS) */
6593 encodable = ! NILP (Faref (XCDR (elt), ch));
6594 if (! encodable)
6595 {
6596 /* Transform the format to:
6597 ( CODING SAFE-CHARS TRANSLATION-TABLE HASH-TABLE
6598 ACCEPT-LATIN-EXTRA ) */
6599 val = Fget (XCAR (elt), Qcoding_system);
6600 translation_table
6601 = Fplist_get (AREF (val, 3),
6602 Qtranslation_table_for_encode);
6603 if (SYMBOLP (translation_table))
6604 translation_table = Fget (translation_table,
6605 Qtranslation_table);
6606 hash_table
6607 = (CHAR_TABLE_P (translation_table)
6608 ? XCHAR_TABLE (translation_table)->extras[1]
6609 : Qnil);
6610 accept_latin_extra
6611 = ((EQ (AREF (val, 0), make_number (2))
6612 && VECTORP (AREF (val, 4)))
6613 ? AREF (AREF (val, 4), 16)
6614 : Qnil);
6615 XSETCAR (tail, list5 (XCAR (elt), XCDR (elt),
6616 translation_table, hash_table,
6617 accept_latin_extra));
6618 }
6619 }
6620
6621 if (! encodable
6622 && ((CHAR_TABLE_P (translation_table)
6623 && ! NILP (Faref (translation_table, ch)))
6624 || (HASH_TABLE_P (hash_table)
6625 && ! NILP (Fgethash (ch, hash_table, Qnil)))
6626 || (SINGLE_BYTE_CHAR_P (c)
6627 && ! NILP (accept_latin_extra)
6628 && VECTORP (Vlatin_extra_code_table)
6629 && ! NILP (AREF (Vlatin_extra_code_table, c)))))
6630 encodable = 1;
6631 if (encodable)
6632 prev = tail;
6633 else
6634 {
6635 /* Exclude this coding system from SAFE_CODINGS. */
6636 if (EQ (tail, safe_codings))
6637 safe_codings = XCDR (safe_codings);
6638 else
6639 XSETCDR (prev, XCDR (tail));
6640 }
6641 }
6642 }
6643 return safe_codings;
6644 }
6645
6646 DEFUN ("find-coding-systems-region-internal",
6647 Ffind_coding_systems_region_internal,
6648 Sfind_coding_systems_region_internal, 2, 2, 0,
6649 doc: /* Internal use only. */)
6650 (start, end)
6651 Lisp_Object start, end;
6652 {
6653 Lisp_Object work_table, safe_codings;
6654 int non_ascii_p = 0;
6655 int single_byte_char_found = 0;
6656 const unsigned char *p1, *p1end, *p2, *p2end, *p;
6657
6658 if (STRINGP (start))
6659 {
6660 if (!STRING_MULTIBYTE (start))
6661 return Qt;
6662 p1 = SDATA (start), p1end = p1 + SBYTES (start);
6663 p2 = p2end = p1end;
6664 if (SCHARS (start) != SBYTES (start))
6665 non_ascii_p = 1;
6666 }
6667 else
6668 {
6669 int from, to, stop;
6670
6671 CHECK_NUMBER_COERCE_MARKER (start);
6672 CHECK_NUMBER_COERCE_MARKER (end);
6673 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
6674 args_out_of_range (start, end);
6675 if (NILP (current_buffer->enable_multibyte_characters))
6676 return Qt;
6677 from = CHAR_TO_BYTE (XINT (start));
6678 to = CHAR_TO_BYTE (XINT (end));
6679 stop = from < GPT_BYTE && GPT_BYTE < to ? GPT_BYTE : to;
6680 p1 = BYTE_POS_ADDR (from), p1end = p1 + (stop - from);
6681 if (stop == to)
6682 p2 = p2end = p1end;
6683 else
6684 p2 = BYTE_POS_ADDR (stop), p2end = p2 + (to - stop);
6685 if (XINT (end) - XINT (start) != to - from)
6686 non_ascii_p = 1;
6687 }
6688
6689 if (!non_ascii_p)
6690 {
6691 /* We are sure that the text contains no multibyte character.
6692 Check if it contains eight-bit-graphic. */
6693 p = p1;
6694 for (p = p1; p < p1end && ASCII_BYTE_P (*p); p++);
6695 if (p == p1end)
6696 {
6697 for (p = p2; p < p2end && ASCII_BYTE_P (*p); p++);
6698 if (p == p2end)
6699 return Qt;
6700 }
6701 }
6702
6703 /* The text contains non-ASCII characters. */
6704
6705 work_table = Fmake_char_table (Qchar_coding_system, Qnil);
6706 safe_codings = Fcopy_sequence (XCDR (Vcoding_system_safe_chars));
6707
6708 safe_codings = find_safe_codings (p1, p1end, safe_codings, work_table,
6709 &single_byte_char_found);
6710 if (p2 < p2end)
6711 safe_codings = find_safe_codings (p2, p2end, safe_codings, work_table,
6712 &single_byte_char_found);
6713 if (EQ (safe_codings, XCDR (Vcoding_system_safe_chars)))
6714 safe_codings = Qt;
6715 else
6716 {
6717 /* Turn safe_codings to a list of coding systems... */
6718 Lisp_Object val;
6719
6720 if (single_byte_char_found)
6721 /* ... and append these for eight-bit chars. */
6722 val = Fcons (Qraw_text,
6723 Fcons (Qemacs_mule, Fcons (Qno_conversion, Qnil)));
6724 else
6725 /* ... and append generic coding systems. */
6726 val = Fcopy_sequence (XCAR (Vcoding_system_safe_chars));
6727
6728 for (; CONSP (safe_codings); safe_codings = XCDR (safe_codings))
6729 val = Fcons (XCAR (XCAR (safe_codings)), val);
6730 safe_codings = val;
6731 }
6732
6733 return safe_codings;
6734 }
6735
6736
6737 /* Search from position POS for such characters that are unencodable
6738 accoding to SAFE_CHARS, and return a list of their positions. P
6739 points where in the memory the character at POS exists. Limit the
6740 search at PEND or when Nth unencodable characters are found.
6741
6742 If SAFE_CHARS is a char table, an element for an unencodable
6743 character is nil.
6744
6745 If SAFE_CHARS is nil, all non-ASCII characters are unencodable.
6746
6747 Otherwise, SAFE_CHARS is t, and only eight-bit-contrl and
6748 eight-bit-graphic characters are unencodable. */
6749
6750 static Lisp_Object
6751 unencodable_char_position (safe_chars, pos, p, pend, n)
6752 Lisp_Object safe_chars;
6753 int pos;
6754 unsigned char *p, *pend;
6755 int n;
6756 {
6757 Lisp_Object pos_list;
6758
6759 pos_list = Qnil;
6760 while (p < pend)
6761 {
6762 int len;
6763 int c = STRING_CHAR_AND_LENGTH (p, MAX_MULTIBYTE_LENGTH, len);
6764
6765 if (c >= 128
6766 && (CHAR_TABLE_P (safe_chars)
6767 ? NILP (CHAR_TABLE_REF (safe_chars, c))
6768 : (NILP (safe_chars) || c < 256)))
6769 {
6770 pos_list = Fcons (make_number (pos), pos_list);
6771 if (--n <= 0)
6772 break;
6773 }
6774 pos++;
6775 p += len;
6776 }
6777 return Fnreverse (pos_list);
6778 }
6779
6780
6781 DEFUN ("unencodable-char-position", Funencodable_char_position,
6782 Sunencodable_char_position, 3, 5, 0,
6783 doc: /*
6784 Return position of first un-encodable character in a region.
6785 START and END specfiy the region and CODING-SYSTEM specifies the
6786 encoding to check. Return nil if CODING-SYSTEM does encode the region.
6787
6788 If optional 4th argument COUNT is non-nil, it specifies at most how
6789 many un-encodable characters to search. In this case, the value is a
6790 list of positions.
6791
6792 If optional 5th argument STRING is non-nil, it is a string to search
6793 for un-encodable characters. In that case, START and END are indexes
6794 to the string. */)
6795 (start, end, coding_system, count, string)
6796 Lisp_Object start, end, coding_system, count, string;
6797 {
6798 int n;
6799 Lisp_Object safe_chars;
6800 struct coding_system coding;
6801 Lisp_Object positions;
6802 int from, to;
6803 unsigned char *p, *pend;
6804
6805 if (NILP (string))
6806 {
6807 validate_region (&start, &end);
6808 from = XINT (start);
6809 to = XINT (end);
6810 if (NILP (current_buffer->enable_multibyte_characters))
6811 return Qnil;
6812 p = CHAR_POS_ADDR (from);
6813 if (to == GPT)
6814 pend = GPT_ADDR;
6815 else
6816 pend = CHAR_POS_ADDR (to);
6817 }
6818 else
6819 {
6820 CHECK_STRING (string);
6821 CHECK_NATNUM (start);
6822 CHECK_NATNUM (end);
6823 from = XINT (start);
6824 to = XINT (end);
6825 if (from > to
6826 || to > SCHARS (string))
6827 args_out_of_range_3 (string, start, end);
6828 if (! STRING_MULTIBYTE (string))
6829 return Qnil;
6830 p = SDATA (string) + string_char_to_byte (string, from);
6831 pend = SDATA (string) + string_char_to_byte (string, to);
6832 }
6833
6834 setup_coding_system (Fcheck_coding_system (coding_system), &coding);
6835
6836 if (NILP (count))
6837 n = 1;
6838 else
6839 {
6840 CHECK_NATNUM (count);
6841 n = XINT (count);
6842 }
6843
6844 if (coding.type == coding_type_no_conversion
6845 || coding.type == coding_type_raw_text)
6846 return Qnil;
6847
6848 if (coding.type == coding_type_undecided)
6849 safe_chars = Qnil;
6850 else
6851 safe_chars = coding_safe_chars (coding_system);
6852
6853 if (STRINGP (string)
6854 || from >= GPT || to <= GPT)
6855 positions = unencodable_char_position (safe_chars, from, p, pend, n);
6856 else
6857 {
6858 Lisp_Object args[2];
6859
6860 args[0] = unencodable_char_position (safe_chars, from, p, GPT_ADDR, n);
6861 n -= XINT (Flength (args[0]));
6862 if (n <= 0)
6863 positions = args[0];
6864 else
6865 {
6866 args[1] = unencodable_char_position (safe_chars, GPT, GAP_END_ADDR,
6867 pend, n);
6868 positions = Fappend (2, args);
6869 }
6870 }
6871
6872 return (NILP (count) ? Fcar (positions) : positions);
6873 }
6874
6875
6876 Lisp_Object
6877 code_convert_region1 (start, end, coding_system, encodep)
6878 Lisp_Object start, end, coding_system;
6879 int encodep;
6880 {
6881 struct coding_system coding;
6882 int from, to;
6883
6884 CHECK_NUMBER_COERCE_MARKER (start);
6885 CHECK_NUMBER_COERCE_MARKER (end);
6886 CHECK_SYMBOL (coding_system);
6887
6888 validate_region (&start, &end);
6889 from = XFASTINT (start);
6890 to = XFASTINT (end);
6891
6892 if (NILP (coding_system))
6893 return make_number (to - from);
6894
6895 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
6896 error ("Invalid coding system: %s", SDATA (SYMBOL_NAME (coding_system)));
6897
6898 coding.mode |= CODING_MODE_LAST_BLOCK;
6899 coding.src_multibyte = coding.dst_multibyte
6900 = !NILP (current_buffer->enable_multibyte_characters);
6901 code_convert_region (from, CHAR_TO_BYTE (from), to, CHAR_TO_BYTE (to),
6902 &coding, encodep, 1);
6903 Vlast_coding_system_used = coding.symbol;
6904 return make_number (coding.produced_char);
6905 }
6906
6907 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
6908 3, 3, "r\nzCoding system: ",
6909 doc: /* Decode the current region from the specified coding system.
6910 When called from a program, takes three arguments:
6911 START, END, and CODING-SYSTEM. START and END are buffer positions.
6912 This function sets `last-coding-system-used' to the precise coding system
6913 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
6914 not fully specified.)
6915 It returns the length of the decoded text. */)
6916 (start, end, coding_system)
6917 Lisp_Object start, end, coding_system;
6918 {
6919 return code_convert_region1 (start, end, coding_system, 0);
6920 }
6921
6922 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
6923 3, 3, "r\nzCoding system: ",
6924 doc: /* Encode the current region into the specified coding system.
6925 When called from a program, takes three arguments:
6926 START, END, and CODING-SYSTEM. START and END are buffer positions.
6927 This function sets `last-coding-system-used' to the precise coding system
6928 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
6929 not fully specified.)
6930 It returns the length of the encoded text. */)
6931 (start, end, coding_system)
6932 Lisp_Object start, end, coding_system;
6933 {
6934 return code_convert_region1 (start, end, coding_system, 1);
6935 }
6936
6937 Lisp_Object
6938 code_convert_string1 (string, coding_system, nocopy, encodep)
6939 Lisp_Object string, coding_system, nocopy;
6940 int encodep;
6941 {
6942 struct coding_system coding;
6943
6944 CHECK_STRING (string);
6945 CHECK_SYMBOL (coding_system);
6946
6947 if (NILP (coding_system))
6948 return (NILP (nocopy) ? Fcopy_sequence (string) : string);
6949
6950 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
6951 error ("Invalid coding system: %s", SDATA (SYMBOL_NAME (coding_system)));
6952
6953 coding.mode |= CODING_MODE_LAST_BLOCK;
6954 string = (encodep
6955 ? encode_coding_string (string, &coding, !NILP (nocopy))
6956 : decode_coding_string (string, &coding, !NILP (nocopy)));
6957 Vlast_coding_system_used = coding.symbol;
6958
6959 return string;
6960 }
6961
6962 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
6963 2, 3, 0,
6964 doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
6965 Optional arg NOCOPY non-nil means it is OK to return STRING itself
6966 if the decoding operation is trivial.
6967 This function sets `last-coding-system-used' to the precise coding system
6968 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
6969 not fully specified.) */)
6970 (string, coding_system, nocopy)
6971 Lisp_Object string, coding_system, nocopy;
6972 {
6973 return code_convert_string1 (string, coding_system, nocopy, 0);
6974 }
6975
6976 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
6977 2, 3, 0,
6978 doc: /* Encode STRING to CODING-SYSTEM, and return the result.
6979 Optional arg NOCOPY non-nil means it is OK to return STRING itself
6980 if the encoding operation is trivial.
6981 This function sets `last-coding-system-used' to the precise coding system
6982 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
6983 not fully specified.) */)
6984 (string, coding_system, nocopy)
6985 Lisp_Object string, coding_system, nocopy;
6986 {
6987 return code_convert_string1 (string, coding_system, nocopy, 1);
6988 }
6989
6990 /* Encode or decode STRING according to CODING_SYSTEM.
6991 Do not set Vlast_coding_system_used.
6992
6993 This function is called only from macros DECODE_FILE and
6994 ENCODE_FILE, thus we ignore character composition. */
6995
6996 Lisp_Object
6997 code_convert_string_norecord (string, coding_system, encodep)
6998 Lisp_Object string, coding_system;
6999 int encodep;
7000 {
7001 struct coding_system coding;
7002
7003 CHECK_STRING (string);
7004 CHECK_SYMBOL (coding_system);
7005
7006 if (NILP (coding_system))
7007 return string;
7008
7009 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
7010 error ("Invalid coding system: %s", SDATA (SYMBOL_NAME (coding_system)));
7011
7012 coding.composing = COMPOSITION_DISABLED;
7013 coding.mode |= CODING_MODE_LAST_BLOCK;
7014 return (encodep
7015 ? encode_coding_string (string, &coding, 1)
7016 : decode_coding_string (string, &coding, 1));
7017 }
7018 \f
7019 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
7020 doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
7021 Return the corresponding character. */)
7022 (code)
7023 Lisp_Object code;
7024 {
7025 unsigned char c1, c2, s1, s2;
7026 Lisp_Object val;
7027
7028 CHECK_NUMBER (code);
7029 s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
7030 if (s1 == 0)
7031 {
7032 if (s2 < 0x80)
7033 XSETFASTINT (val, s2);
7034 else if (s2 >= 0xA0 || s2 <= 0xDF)
7035 XSETFASTINT (val, MAKE_CHAR (charset_katakana_jisx0201, s2, 0));
7036 else
7037 error ("Invalid Shift JIS code: %x", XFASTINT (code));
7038 }
7039 else
7040 {
7041 if ((s1 < 0x80 || (s1 > 0x9F && s1 < 0xE0) || s1 > 0xEF)
7042 || (s2 < 0x40 || s2 == 0x7F || s2 > 0xFC))
7043 error ("Invalid Shift JIS code: %x", XFASTINT (code));
7044 DECODE_SJIS (s1, s2, c1, c2);
7045 XSETFASTINT (val, MAKE_CHAR (charset_jisx0208, c1, c2));
7046 }
7047 return val;
7048 }
7049
7050 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
7051 doc: /* Encode a Japanese character CHAR to shift_jis encoding.
7052 Return the corresponding code in SJIS. */)
7053 (ch)
7054 Lisp_Object ch;
7055 {
7056 int charset, c1, c2, s1, s2;
7057 Lisp_Object val;
7058
7059 CHECK_NUMBER (ch);
7060 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
7061 if (charset == CHARSET_ASCII)
7062 {
7063 val = ch;
7064 }
7065 else if (charset == charset_jisx0208
7066 && c1 > 0x20 && c1 < 0x7F && c2 > 0x20 && c2 < 0x7F)
7067 {
7068 ENCODE_SJIS (c1, c2, s1, s2);
7069 XSETFASTINT (val, (s1 << 8) | s2);
7070 }
7071 else if (charset == charset_katakana_jisx0201
7072 && c1 > 0x20 && c2 < 0xE0)
7073 {
7074 XSETFASTINT (val, c1 | 0x80);
7075 }
7076 else
7077 error ("Can't encode to shift_jis: %d", XFASTINT (ch));
7078 return val;
7079 }
7080
7081 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
7082 doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
7083 Return the corresponding character. */)
7084 (code)
7085 Lisp_Object code;
7086 {
7087 int charset;
7088 unsigned char b1, b2, c1, c2;
7089 Lisp_Object val;
7090
7091 CHECK_NUMBER (code);
7092 b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF;
7093 if (b1 == 0)
7094 {
7095 if (b2 >= 0x80)
7096 error ("Invalid BIG5 code: %x", XFASTINT (code));
7097 val = code;
7098 }
7099 else
7100 {
7101 if ((b1 < 0xA1 || b1 > 0xFE)
7102 || (b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE))
7103 error ("Invalid BIG5 code: %x", XFASTINT (code));
7104 DECODE_BIG5 (b1, b2, charset, c1, c2);
7105 XSETFASTINT (val, MAKE_CHAR (charset, c1, c2));
7106 }
7107 return val;
7108 }
7109
7110 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
7111 doc: /* Encode the Big5 character CHAR to BIG5 coding system.
7112 Return the corresponding character code in Big5. */)
7113 (ch)
7114 Lisp_Object ch;
7115 {
7116 int charset, c1, c2, b1, b2;
7117 Lisp_Object val;
7118
7119 CHECK_NUMBER (ch);
7120 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
7121 if (charset == CHARSET_ASCII)
7122 {
7123 val = ch;
7124 }
7125 else if ((charset == charset_big5_1
7126 && (XFASTINT (ch) >= 0x250a1 && XFASTINT (ch) <= 0x271ec))
7127 || (charset == charset_big5_2
7128 && XFASTINT (ch) >= 0x290a1 && XFASTINT (ch) <= 0x2bdb2))
7129 {
7130 ENCODE_BIG5 (charset, c1, c2, b1, b2);
7131 XSETFASTINT (val, (b1 << 8) | b2);
7132 }
7133 else
7134 error ("Can't encode to Big5: %d", XFASTINT (ch));
7135 return val;
7136 }
7137 \f
7138 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
7139 Sset_terminal_coding_system_internal, 1, 1, 0,
7140 doc: /* Internal use only. */)
7141 (coding_system)
7142 Lisp_Object coding_system;
7143 {
7144 CHECK_SYMBOL (coding_system);
7145 setup_coding_system (Fcheck_coding_system (coding_system), &terminal_coding);
7146 /* We had better not send unsafe characters to terminal. */
7147 terminal_coding.mode |= CODING_MODE_INHIBIT_UNENCODABLE_CHAR;
7148 /* Character composition should be disabled. */
7149 terminal_coding.composing = COMPOSITION_DISABLED;
7150 /* Error notification should be suppressed. */
7151 terminal_coding.suppress_error = 1;
7152 terminal_coding.src_multibyte = 1;
7153 terminal_coding.dst_multibyte = 0;
7154 return Qnil;
7155 }
7156
7157 DEFUN ("set-safe-terminal-coding-system-internal", Fset_safe_terminal_coding_system_internal,
7158 Sset_safe_terminal_coding_system_internal, 1, 1, 0,
7159 doc: /* Internal use only. */)
7160 (coding_system)
7161 Lisp_Object coding_system;
7162 {
7163 CHECK_SYMBOL (coding_system);
7164 setup_coding_system (Fcheck_coding_system (coding_system),
7165 &safe_terminal_coding);
7166 /* Character composition should be disabled. */
7167 safe_terminal_coding.composing = COMPOSITION_DISABLED;
7168 /* Error notification should be suppressed. */
7169 terminal_coding.suppress_error = 1;
7170 safe_terminal_coding.src_multibyte = 1;
7171 safe_terminal_coding.dst_multibyte = 0;
7172 return Qnil;
7173 }
7174
7175 DEFUN ("terminal-coding-system", Fterminal_coding_system,
7176 Sterminal_coding_system, 0, 0, 0,
7177 doc: /* Return coding system specified for terminal output. */)
7178 ()
7179 {
7180 return terminal_coding.symbol;
7181 }
7182
7183 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
7184 Sset_keyboard_coding_system_internal, 1, 1, 0,
7185 doc: /* Internal use only. */)
7186 (coding_system)
7187 Lisp_Object coding_system;
7188 {
7189 CHECK_SYMBOL (coding_system);
7190 setup_coding_system (Fcheck_coding_system (coding_system), &keyboard_coding);
7191 /* Character composition should be disabled. */
7192 keyboard_coding.composing = COMPOSITION_DISABLED;
7193 return Qnil;
7194 }
7195
7196 DEFUN ("keyboard-coding-system", Fkeyboard_coding_system,
7197 Skeyboard_coding_system, 0, 0, 0,
7198 doc: /* Return coding system specified for decoding keyboard input. */)
7199 ()
7200 {
7201 return keyboard_coding.symbol;
7202 }
7203
7204 \f
7205 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
7206 Sfind_operation_coding_system, 1, MANY, 0,
7207 doc: /* Choose a coding system for an operation based on the target name.
7208 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
7209 DECODING-SYSTEM is the coding system to use for decoding
7210 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
7211 for encoding (in case OPERATION does encoding).
7212
7213 The first argument OPERATION specifies an I/O primitive:
7214 For file I/O, `insert-file-contents' or `write-region'.
7215 For process I/O, `call-process', `call-process-region', or `start-process'.
7216 For network I/O, `open-network-stream'.
7217
7218 The remaining arguments should be the same arguments that were passed
7219 to the primitive. Depending on which primitive, one of those arguments
7220 is selected as the TARGET. For example, if OPERATION does file I/O,
7221 whichever argument specifies the file name is TARGET.
7222
7223 TARGET has a meaning which depends on OPERATION:
7224 For file I/O, TARGET is a file name.
7225 For process I/O, TARGET is a process name.
7226 For network I/O, TARGET is a service name or a port number
7227
7228 This function looks up what specified for TARGET in,
7229 `file-coding-system-alist', `process-coding-system-alist',
7230 or `network-coding-system-alist' depending on OPERATION.
7231 They may specify a coding system, a cons of coding systems,
7232 or a function symbol to call.
7233 In the last case, we call the function with one argument,
7234 which is a list of all the arguments given to this function.
7235
7236 usage: (find-operation-coding-system OPERATION ARGUMENTS ...) */)
7237 (nargs, args)
7238 int nargs;
7239 Lisp_Object *args;
7240 {
7241 Lisp_Object operation, target_idx, target, val;
7242 register Lisp_Object chain;
7243
7244 if (nargs < 2)
7245 error ("Too few arguments");
7246 operation = args[0];
7247 if (!SYMBOLP (operation)
7248 || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
7249 error ("Invalid first argument");
7250 if (nargs < 1 + XINT (target_idx))
7251 error ("Too few arguments for operation: %s",
7252 SDATA (SYMBOL_NAME (operation)));
7253 /* For write-region, if the 6th argument (i.e. VISIT, the 5th
7254 argument to write-region) is string, it must be treated as a
7255 target file name. */
7256 if (EQ (operation, Qwrite_region)
7257 && nargs > 5
7258 && STRINGP (args[5]))
7259 target_idx = make_number (4);
7260 target = args[XINT (target_idx) + 1];
7261 if (!(STRINGP (target)
7262 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
7263 error ("Invalid argument %d", XINT (target_idx) + 1);
7264
7265 chain = ((EQ (operation, Qinsert_file_contents)
7266 || EQ (operation, Qwrite_region))
7267 ? Vfile_coding_system_alist
7268 : (EQ (operation, Qopen_network_stream)
7269 ? Vnetwork_coding_system_alist
7270 : Vprocess_coding_system_alist));
7271 if (NILP (chain))
7272 return Qnil;
7273
7274 for (; CONSP (chain); chain = XCDR (chain))
7275 {
7276 Lisp_Object elt;
7277 elt = XCAR (chain);
7278
7279 if (CONSP (elt)
7280 && ((STRINGP (target)
7281 && STRINGP (XCAR (elt))
7282 && fast_string_match (XCAR (elt), target) >= 0)
7283 || (INTEGERP (target) && EQ (target, XCAR (elt)))))
7284 {
7285 val = XCDR (elt);
7286 /* Here, if VAL is both a valid coding system and a valid
7287 function symbol, we return VAL as a coding system. */
7288 if (CONSP (val))
7289 return val;
7290 if (! SYMBOLP (val))
7291 return Qnil;
7292 if (! NILP (Fcoding_system_p (val)))
7293 return Fcons (val, val);
7294 if (! NILP (Ffboundp (val)))
7295 {
7296 val = call1 (val, Flist (nargs, args));
7297 if (CONSP (val))
7298 return val;
7299 if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
7300 return Fcons (val, val);
7301 }
7302 return Qnil;
7303 }
7304 }
7305 return Qnil;
7306 }
7307
7308 DEFUN ("update-coding-systems-internal", Fupdate_coding_systems_internal,
7309 Supdate_coding_systems_internal, 0, 0, 0,
7310 doc: /* Update internal database for ISO2022 and CCL based coding systems.
7311 When values of any coding categories are changed, you must
7312 call this function. */)
7313 ()
7314 {
7315 int i;
7316
7317 for (i = CODING_CATEGORY_IDX_EMACS_MULE; i < CODING_CATEGORY_IDX_MAX; i++)
7318 {
7319 Lisp_Object val;
7320
7321 val = SYMBOL_VALUE (XVECTOR (Vcoding_category_table)->contents[i]);
7322 if (!NILP (val))
7323 {
7324 if (! coding_system_table[i])
7325 coding_system_table[i] = ((struct coding_system *)
7326 xmalloc (sizeof (struct coding_system)));
7327 setup_coding_system (val, coding_system_table[i]);
7328 }
7329 else if (coding_system_table[i])
7330 {
7331 xfree (coding_system_table[i]);
7332 coding_system_table[i] = NULL;
7333 }
7334 }
7335
7336 return Qnil;
7337 }
7338
7339 DEFUN ("set-coding-priority-internal", Fset_coding_priority_internal,
7340 Sset_coding_priority_internal, 0, 0, 0,
7341 doc: /* Update internal database for the current value of `coding-category-list'.
7342 This function is internal use only. */)
7343 ()
7344 {
7345 int i = 0, idx;
7346 Lisp_Object val;
7347
7348 val = Vcoding_category_list;
7349
7350 while (CONSP (val) && i < CODING_CATEGORY_IDX_MAX)
7351 {
7352 if (! SYMBOLP (XCAR (val)))
7353 break;
7354 idx = XFASTINT (Fget (XCAR (val), Qcoding_category_index));
7355 if (idx >= CODING_CATEGORY_IDX_MAX)
7356 break;
7357 coding_priorities[i++] = (1 << idx);
7358 val = XCDR (val);
7359 }
7360 /* If coding-category-list is valid and contains all coding
7361 categories, `i' should be CODING_CATEGORY_IDX_MAX now. If not,
7362 the following code saves Emacs from crashing. */
7363 while (i < CODING_CATEGORY_IDX_MAX)
7364 coding_priorities[i++] = CODING_CATEGORY_MASK_RAW_TEXT;
7365
7366 return Qnil;
7367 }
7368
7369 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
7370 Sdefine_coding_system_internal, 1, 1, 0,
7371 doc: /* Register CODING-SYSTEM as a base coding system.
7372 This function is internal use only. */)
7373 (coding_system)
7374 Lisp_Object coding_system;
7375 {
7376 Lisp_Object safe_chars, slot;
7377
7378 if (NILP (Fcheck_coding_system (coding_system)))
7379 Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
7380 safe_chars = coding_safe_chars (coding_system);
7381 if (! EQ (safe_chars, Qt) && ! CHAR_TABLE_P (safe_chars))
7382 error ("No valid safe-chars property for %s",
7383 SDATA (SYMBOL_NAME (coding_system)));
7384 if (EQ (safe_chars, Qt))
7385 {
7386 if (NILP (Fmemq (coding_system, XCAR (Vcoding_system_safe_chars))))
7387 XSETCAR (Vcoding_system_safe_chars,
7388 Fcons (coding_system, XCAR (Vcoding_system_safe_chars)));
7389 }
7390 else
7391 {
7392 slot = Fassq (coding_system, XCDR (Vcoding_system_safe_chars));
7393 if (NILP (slot))
7394 XSETCDR (Vcoding_system_safe_chars,
7395 nconc2 (XCDR (Vcoding_system_safe_chars),
7396 Fcons (Fcons (coding_system, safe_chars), Qnil)));
7397 else
7398 XSETCDR (slot, safe_chars);
7399 }
7400 return Qnil;
7401 }
7402
7403 #endif /* emacs */
7404
7405 \f
7406 /*** 9. Post-amble ***/
7407
7408 void
7409 init_coding_once ()
7410 {
7411 int i;
7412
7413 /* Emacs' internal format specific initialize routine. */
7414 for (i = 0; i <= 0x20; i++)
7415 emacs_code_class[i] = EMACS_control_code;
7416 emacs_code_class[0x0A] = EMACS_linefeed_code;
7417 emacs_code_class[0x0D] = EMACS_carriage_return_code;
7418 for (i = 0x21 ; i < 0x7F; i++)
7419 emacs_code_class[i] = EMACS_ascii_code;
7420 emacs_code_class[0x7F] = EMACS_control_code;
7421 for (i = 0x80; i < 0xFF; i++)
7422 emacs_code_class[i] = EMACS_invalid_code;
7423 emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
7424 emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
7425 emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
7426 emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;
7427
7428 /* ISO2022 specific initialize routine. */
7429 for (i = 0; i < 0x20; i++)
7430 iso_code_class[i] = ISO_control_0;
7431 for (i = 0x21; i < 0x7F; i++)
7432 iso_code_class[i] = ISO_graphic_plane_0;
7433 for (i = 0x80; i < 0xA0; i++)
7434 iso_code_class[i] = ISO_control_1;
7435 for (i = 0xA1; i < 0xFF; i++)
7436 iso_code_class[i] = ISO_graphic_plane_1;
7437 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
7438 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
7439 iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
7440 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
7441 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
7442 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
7443 iso_code_class[ISO_CODE_ESC] = ISO_escape;
7444 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
7445 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
7446 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
7447
7448 setup_coding_system (Qnil, &keyboard_coding);
7449 setup_coding_system (Qnil, &terminal_coding);
7450 setup_coding_system (Qnil, &safe_terminal_coding);
7451 setup_coding_system (Qnil, &default_buffer_file_coding);
7452
7453 bzero (coding_system_table, sizeof coding_system_table);
7454
7455 bzero (ascii_skip_code, sizeof ascii_skip_code);
7456 for (i = 0; i < 128; i++)
7457 ascii_skip_code[i] = 1;
7458
7459 #if defined (MSDOS) || defined (WINDOWSNT)
7460 system_eol_type = CODING_EOL_CRLF;
7461 #else
7462 system_eol_type = CODING_EOL_LF;
7463 #endif
7464
7465 inhibit_pre_post_conversion = 0;
7466 }
7467
7468 #ifdef emacs
7469
7470 void
7471 syms_of_coding ()
7472 {
7473 Qtarget_idx = intern ("target-idx");
7474 staticpro (&Qtarget_idx);
7475
7476 Qcoding_system_history = intern ("coding-system-history");
7477 staticpro (&Qcoding_system_history);
7478 Fset (Qcoding_system_history, Qnil);
7479
7480 /* Target FILENAME is the first argument. */
7481 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
7482 /* Target FILENAME is the third argument. */
7483 Fput (Qwrite_region, Qtarget_idx, make_number (2));
7484
7485 Qcall_process = intern ("call-process");
7486 staticpro (&Qcall_process);
7487 /* Target PROGRAM is the first argument. */
7488 Fput (Qcall_process, Qtarget_idx, make_number (0));
7489
7490 Qcall_process_region = intern ("call-process-region");
7491 staticpro (&Qcall_process_region);
7492 /* Target PROGRAM is the third argument. */
7493 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
7494
7495 Qstart_process = intern ("start-process");
7496 staticpro (&Qstart_process);
7497 /* Target PROGRAM is the third argument. */
7498 Fput (Qstart_process, Qtarget_idx, make_number (2));
7499
7500 Qopen_network_stream = intern ("open-network-stream");
7501 staticpro (&Qopen_network_stream);
7502 /* Target SERVICE is the fourth argument. */
7503 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
7504
7505 Qcoding_system = intern ("coding-system");
7506 staticpro (&Qcoding_system);
7507
7508 Qeol_type = intern ("eol-type");
7509 staticpro (&Qeol_type);
7510
7511 Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
7512 staticpro (&Qbuffer_file_coding_system);
7513
7514 Qpost_read_conversion = intern ("post-read-conversion");
7515 staticpro (&Qpost_read_conversion);
7516
7517 Qpre_write_conversion = intern ("pre-write-conversion");
7518 staticpro (&Qpre_write_conversion);
7519
7520 Qno_conversion = intern ("no-conversion");
7521 staticpro (&Qno_conversion);
7522
7523 Qundecided = intern ("undecided");
7524 staticpro (&Qundecided);
7525
7526 Qcoding_system_p = intern ("coding-system-p");
7527 staticpro (&Qcoding_system_p);
7528
7529 Qcoding_system_error = intern ("coding-system-error");
7530 staticpro (&Qcoding_system_error);
7531
7532 Fput (Qcoding_system_error, Qerror_conditions,
7533 Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
7534 Fput (Qcoding_system_error, Qerror_message,
7535 build_string ("Invalid coding system"));
7536
7537 Qcoding_category = intern ("coding-category");
7538 staticpro (&Qcoding_category);
7539 Qcoding_category_index = intern ("coding-category-index");
7540 staticpro (&Qcoding_category_index);
7541
7542 Vcoding_category_table
7543 = Fmake_vector (make_number (CODING_CATEGORY_IDX_MAX), Qnil);
7544 staticpro (&Vcoding_category_table);
7545 {
7546 int i;
7547 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
7548 {
7549 XVECTOR (Vcoding_category_table)->contents[i]
7550 = intern (coding_category_name[i]);
7551 Fput (XVECTOR (Vcoding_category_table)->contents[i],
7552 Qcoding_category_index, make_number (i));
7553 }
7554 }
7555
7556 Vcoding_system_safe_chars = Fcons (Qnil, Qnil);
7557 staticpro (&Vcoding_system_safe_chars);
7558
7559 Qtranslation_table = intern ("translation-table");
7560 staticpro (&Qtranslation_table);
7561 Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
7562
7563 Qtranslation_table_id = intern ("translation-table-id");
7564 staticpro (&Qtranslation_table_id);
7565
7566 Qtranslation_table_for_decode = intern ("translation-table-for-decode");
7567 staticpro (&Qtranslation_table_for_decode);
7568
7569 Qtranslation_table_for_encode = intern ("translation-table-for-encode");
7570 staticpro (&Qtranslation_table_for_encode);
7571
7572 Qsafe_chars = intern ("safe-chars");
7573 staticpro (&Qsafe_chars);
7574
7575 Qchar_coding_system = intern ("char-coding-system");
7576 staticpro (&Qchar_coding_system);
7577
7578 /* Intern this now in case it isn't already done.
7579 Setting this variable twice is harmless.
7580 But don't staticpro it here--that is done in alloc.c. */
7581 Qchar_table_extra_slots = intern ("char-table-extra-slots");
7582 Fput (Qsafe_chars, Qchar_table_extra_slots, make_number (0));
7583 Fput (Qchar_coding_system, Qchar_table_extra_slots, make_number (0));
7584
7585 Qvalid_codes = intern ("valid-codes");
7586 staticpro (&Qvalid_codes);
7587
7588 Qemacs_mule = intern ("emacs-mule");
7589 staticpro (&Qemacs_mule);
7590
7591 Qraw_text = intern ("raw-text");
7592 staticpro (&Qraw_text);
7593
7594 Qutf_8 = intern ("utf-8");
7595 staticpro (&Qutf_8);
7596
7597 defsubr (&Scoding_system_p);
7598 defsubr (&Sread_coding_system);
7599 defsubr (&Sread_non_nil_coding_system);
7600 defsubr (&Scheck_coding_system);
7601 defsubr (&Sdetect_coding_region);
7602 defsubr (&Sdetect_coding_string);
7603 defsubr (&Sfind_coding_systems_region_internal);
7604 defsubr (&Sunencodable_char_position);
7605 defsubr (&Sdecode_coding_region);
7606 defsubr (&Sencode_coding_region);
7607 defsubr (&Sdecode_coding_string);
7608 defsubr (&Sencode_coding_string);
7609 defsubr (&Sdecode_sjis_char);
7610 defsubr (&Sencode_sjis_char);
7611 defsubr (&Sdecode_big5_char);
7612 defsubr (&Sencode_big5_char);
7613 defsubr (&Sset_terminal_coding_system_internal);
7614 defsubr (&Sset_safe_terminal_coding_system_internal);
7615 defsubr (&Sterminal_coding_system);
7616 defsubr (&Sset_keyboard_coding_system_internal);
7617 defsubr (&Skeyboard_coding_system);
7618 defsubr (&Sfind_operation_coding_system);
7619 defsubr (&Supdate_coding_systems_internal);
7620 defsubr (&Sset_coding_priority_internal);
7621 defsubr (&Sdefine_coding_system_internal);
7622
7623 DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
7624 doc: /* List of coding systems.
7625
7626 Do not alter the value of this variable manually. This variable should be
7627 updated by the functions `make-coding-system' and
7628 `define-coding-system-alias'. */);
7629 Vcoding_system_list = Qnil;
7630
7631 DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
7632 doc: /* Alist of coding system names.
7633 Each element is one element list of coding system name.
7634 This variable is given to `completing-read' as TABLE argument.
7635
7636 Do not alter the value of this variable manually. This variable should be
7637 updated by the functions `make-coding-system' and
7638 `define-coding-system-alias'. */);
7639 Vcoding_system_alist = Qnil;
7640
7641 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
7642 doc: /* List of coding-categories (symbols) ordered by priority.
7643
7644 On detecting a coding system, Emacs tries code detection algorithms
7645 associated with each coding-category one by one in this order. When
7646 one algorithm agrees with a byte sequence of source text, the coding
7647 system bound to the corresponding coding-category is selected. */);
7648 {
7649 int i;
7650
7651 Vcoding_category_list = Qnil;
7652 for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
7653 Vcoding_category_list
7654 = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
7655 Vcoding_category_list);
7656 }
7657
7658 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
7659 doc: /* Specify the coding system for read operations.
7660 It is useful to bind this variable with `let', but do not set it globally.
7661 If the value is a coding system, it is used for decoding on read operation.
7662 If not, an appropriate element is used from one of the coding system alists:
7663 There are three such tables, `file-coding-system-alist',
7664 `process-coding-system-alist', and `network-coding-system-alist'. */);
7665 Vcoding_system_for_read = Qnil;
7666
7667 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
7668 doc: /* Specify the coding system for write operations.
7669 Programs bind this variable with `let', but you should not set it globally.
7670 If the value is a coding system, it is used for encoding of output,
7671 when writing it to a file and when sending it to a file or subprocess.
7672
7673 If this does not specify a coding system, an appropriate element
7674 is used from one of the coding system alists:
7675 There are three such tables, `file-coding-system-alist',
7676 `process-coding-system-alist', and `network-coding-system-alist'.
7677 For output to files, if the above procedure does not specify a coding system,
7678 the value of `buffer-file-coding-system' is used. */);
7679 Vcoding_system_for_write = Qnil;
7680
7681 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
7682 doc: /* Coding system used in the latest file or process I/O.
7683 Also set by `encode-coding-region', `decode-coding-region',
7684 `encode-coding-string' and `decode-coding-string'. */);
7685 Vlast_coding_system_used = Qnil;
7686
7687 DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
7688 doc: /* *Non-nil means always inhibit code conversion of end-of-line format.
7689 See info node `Coding Systems' and info node `Text and Binary' concerning
7690 such conversion. */);
7691 inhibit_eol_conversion = 0;
7692
7693 DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
7694 doc: /* Non-nil means process buffer inherits coding system of process output.
7695 Bind it to t if the process output is to be treated as if it were a file
7696 read from some filesystem. */);
7697 inherit_process_coding_system = 0;
7698
7699 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
7700 doc: /* Alist to decide a coding system to use for a file I/O operation.
7701 The format is ((PATTERN . VAL) ...),
7702 where PATTERN is a regular expression matching a file name,
7703 VAL is a coding system, a cons of coding systems, or a function symbol.
7704 If VAL is a coding system, it is used for both decoding and encoding
7705 the file contents.
7706 If VAL is a cons of coding systems, the car part is used for decoding,
7707 and the cdr part is used for encoding.
7708 If VAL is a function symbol, the function must return a coding system
7709 or a cons of coding systems which are used as above. The function gets
7710 the arguments with which `find-operation-coding-system' was called.
7711
7712 See also the function `find-operation-coding-system'
7713 and the variable `auto-coding-alist'. */);
7714 Vfile_coding_system_alist = Qnil;
7715
7716 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
7717 doc: /* Alist to decide a coding system to use for a process I/O operation.
7718 The format is ((PATTERN . VAL) ...),
7719 where PATTERN is a regular expression matching a program name,
7720 VAL is a coding system, a cons of coding systems, or a function symbol.
7721 If VAL is a coding system, it is used for both decoding what received
7722 from the program and encoding what sent to the program.
7723 If VAL is a cons of coding systems, the car part is used for decoding,
7724 and the cdr part is used for encoding.
7725 If VAL is a function symbol, the function must return a coding system
7726 or a cons of coding systems which are used as above.
7727
7728 See also the function `find-operation-coding-system'. */);
7729 Vprocess_coding_system_alist = Qnil;
7730
7731 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
7732 doc: /* Alist to decide a coding system to use for a network I/O operation.
7733 The format is ((PATTERN . VAL) ...),
7734 where PATTERN is a regular expression matching a network service name
7735 or is a port number to connect to,
7736 VAL is a coding system, a cons of coding systems, or a function symbol.
7737 If VAL is a coding system, it is used for both decoding what received
7738 from the network stream and encoding what sent to the network stream.
7739 If VAL is a cons of coding systems, the car part is used for decoding,
7740 and the cdr part is used for encoding.
7741 If VAL is a function symbol, the function must return a coding system
7742 or a cons of coding systems which are used as above.
7743
7744 See also the function `find-operation-coding-system'. */);
7745 Vnetwork_coding_system_alist = Qnil;
7746
7747 DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
7748 doc: /* Coding system to use with system messages.
7749 Also used for decoding keyboard input on X Window system. */);
7750 Vlocale_coding_system = Qnil;
7751
7752 /* The eol mnemonics are reset in startup.el system-dependently. */
7753 DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
7754 doc: /* *String displayed in mode line for UNIX-like (LF) end-of-line format. */);
7755 eol_mnemonic_unix = build_string (":");
7756
7757 DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
7758 doc: /* *String displayed in mode line for DOS-like (CRLF) end-of-line format. */);
7759 eol_mnemonic_dos = build_string ("\\");
7760
7761 DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
7762 doc: /* *String displayed in mode line for MAC-like (CR) end-of-line format. */);
7763 eol_mnemonic_mac = build_string ("/");
7764
7765 DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
7766 doc: /* *String displayed in mode line when end-of-line format is not yet determined. */);
7767 eol_mnemonic_undecided = build_string (":");
7768
7769 DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
7770 doc: /* *Non-nil enables character translation while encoding and decoding. */);
7771 Venable_character_translation = Qt;
7772
7773 DEFVAR_LISP ("standard-translation-table-for-decode",
7774 &Vstandard_translation_table_for_decode,
7775 doc: /* Table for translating characters while decoding. */);
7776 Vstandard_translation_table_for_decode = Qnil;
7777
7778 DEFVAR_LISP ("standard-translation-table-for-encode",
7779 &Vstandard_translation_table_for_encode,
7780 doc: /* Table for translating characters while encoding. */);
7781 Vstandard_translation_table_for_encode = Qnil;
7782
7783 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
7784 doc: /* Alist of charsets vs revision numbers.
7785 While encoding, if a charset (car part of an element) is found,
7786 designate it with the escape sequence identifying revision (cdr part of the element). */);
7787 Vcharset_revision_alist = Qnil;
7788
7789 DEFVAR_LISP ("default-process-coding-system",
7790 &Vdefault_process_coding_system,
7791 doc: /* Cons of coding systems used for process I/O by default.
7792 The car part is used for decoding a process output,
7793 the cdr part is used for encoding a text to be sent to a process. */);
7794 Vdefault_process_coding_system = Qnil;
7795
7796 DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
7797 doc: /* Table of extra Latin codes in the range 128..159 (inclusive).
7798 This is a vector of length 256.
7799 If Nth element is non-nil, the existence of code N in a file
\(or output of subprocess) doesn't prevent it from being detected as
7801 a coding system of ISO 2022 variant which has a flag
7802 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
7803 or reading output of a subprocess.
Only the 128th through 159th elements have a meaning. */);
7805 Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
7806
7807 DEFVAR_LISP ("select-safe-coding-system-function",
7808 &Vselect_safe_coding_system_function,
7809 doc: /* Function to call to select safe coding system for encoding a text.
7810
7811 If set, this function is called to force a user to select a proper
7812 coding system which can encode the text in the case that a default
7813 coding system used in each operation can't encode the text.
7814
7815 The default value is `select-safe-coding-system' (which see). */);
7816 Vselect_safe_coding_system_function = Qnil;
7817
7818 DEFVAR_BOOL ("coding-system-require-warning",
7819 &coding_system_require_warning,
7820 doc: /* Internal use only.
7821 If non-nil, on writing a file, `select-safe-coding-system-function' is
7822 called even if `coding-system-for-write' is non-nil. The command
7823 `universal-coding-system-argument' binds this variable to t temporarily. */);
7824 coding_system_require_warning = 0;
7825
7826
7827 DEFVAR_BOOL ("inhibit-iso-escape-detection",
7828 &inhibit_iso_escape_detection,
7829 doc: /* If non-nil, Emacs ignores ISO2022's escape sequence on code detection.
7830
7831 By default, on reading a file, Emacs tries to detect how the text is
7832 encoded. This code detection is sensitive to escape sequences. If
7833 the sequence is valid as ISO2022, the code is determined as one of
7834 the ISO2022 encodings, and the file is decoded by the corresponding
7835 coding system (e.g. `iso-2022-7bit').
7836
7837 However, there may be a case that you want to read escape sequences in
7838 a file as is. In such a case, you can set this variable to non-nil.
7839 Then, as the code detection ignores any escape sequences, no file is
7840 detected as encoded in some ISO2022 encoding. The result is that all
7841 escape sequences become visible in a buffer.
7842
7843 The default value is nil, and it is strongly recommended not to change
7844 it. That is because many Emacs Lisp source files that contain
7845 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
7846 in Emacs's distribution, and they won't be decoded correctly on
7847 reading if you suppress escape sequence detection.
7848
7849 The other way to read escape sequences in a file without decoding is
7850 to explicitly specify some coding system that doesn't use ISO2022's
escape sequence (e.g. `latin-1') on reading by \\[universal-coding-system-argument]. */);
7852 inhibit_iso_escape_detection = 0;
7853
7854 DEFVAR_LISP ("translation-table-for-input", &Vtranslation_table_for_input,
7855 doc: /* Char table for translating self-inserting characters.
7856 This is applied to the result of input methods, not their input. See also
7857 `keyboard-translate-table'. */);
7858 Vtranslation_table_for_input = Qnil;
7859 }
7860
7861 char *
7862 emacs_strerror (error_number)
7863 int error_number;
7864 {
7865 char *str;
7866
7867 synchronize_system_messages_locale ();
7868 str = strerror (error_number);
7869
7870 if (! NILP (Vlocale_coding_system))
7871 {
7872 Lisp_Object dec = code_convert_string_norecord (build_string (str),
7873 Vlocale_coding_system,
7874 0);
7875 str = (char *) SDATA (dec);
7876 }
7877
7878 return str;
7879 }
7880
7881 #endif /* emacs */
7882