Add 2008 to copyright years.
[bpt/emacs.git] / src / coding.c
1 /* Coding system handler (conversion, detection, and etc).
2 Copyright (C) 2001, 2002, 2003, 2004, 2005,
3 2006, 2007, 2008 Free Software Foundation, Inc.
4 Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
5 2005, 2006, 2007, 2008
6 National Institute of Advanced Industrial Science and Technology (AIST)
7 Registration Number H14PRO021
8
9 This file is part of GNU Emacs.
10
11 GNU Emacs is free software; you can redistribute it and/or modify
12 it under the terms of the GNU General Public License as published by
13 the Free Software Foundation; either version 3, or (at your option)
14 any later version.
15
16 GNU Emacs is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 GNU General Public License for more details.
20
21 You should have received a copy of the GNU General Public License
22 along with GNU Emacs; see the file COPYING. If not, write to
23 the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
24 Boston, MA 02110-1301, USA. */
25
26 /*** TABLE OF CONTENTS ***
27
28 0. General comments
29 1. Preamble
30 2. Emacs' internal format (emacs-mule) handlers
31 3. ISO2022 handlers
32 4. Shift-JIS and BIG5 handlers
33 5. CCL handlers
34 6. End-of-line handlers
35 7. C library functions
36 8. Emacs Lisp library functions
37 9. Post-amble
38
39 */
40
41 /*** 0. General comments ***/
42
43
44 /*** GENERAL NOTE on CODING SYSTEMS ***
45
46 A coding system is an encoding mechanism for one or more character
47 sets. Here's a list of coding systems which Emacs can handle. When
48 we say "decode", it means converting some other coding system to
49 Emacs' internal format (emacs-mule), and when we say "encode",
50 it means converting the coding system emacs-mule to some other
51 coding system.
52
53 0. Emacs' internal format (emacs-mule)
54
55 Emacs itself holds a multi-lingual character in buffers and strings
56 in a special format. Details are described in section 2.
57
58 1. ISO2022
59
60 The most famous coding system for multiple character sets. X's
61 Compound Text, various EUCs (Extended Unix Code), and coding
62 systems used in Internet communication such as ISO-2022-JP are
63 all variants of ISO2022. Details are described in section 3.
64
65 2. SJIS (or Shift-JIS or MS-Kanji-Code)
66
67 A coding system to encode character sets: ASCII, JISX0201, and
68 JISX0208. Widely used for PC's in Japan. Details are described in
69 section 4.
70
71 3. BIG5
72
73 A coding system to encode the character sets ASCII and Big5. Widely
74 used for Chinese (mainly in Taiwan and Hong Kong). Details are
75 described in section 4. In this file, when we write "BIG5"
76 (all uppercase), we mean the coding system, and when we write
77 "Big5" (capitalized), we mean the character set.
78
79 4. Raw text
80
81 A coding system for text containing random 8-bit code. Emacs does
82 no code conversion on such text except for end-of-line format.
83
84 5. Other
85
86 If a user wants to read/write text encoded in a coding system not
87 listed above, he can supply a decoder and an encoder for it as CCL
88 (Code Conversion Language) programs. Emacs executes the CCL program
89 while reading/writing.
90
91 Emacs represents a coding system by a Lisp symbol that has a property
92 `coding-system'. But, before actually using the coding system, the
93 information about it is set in a structure of type `struct
94 coding_system' for rapid processing. See section 6 for more details.
95
96 */
97
98 /*** GENERAL NOTES on END-OF-LINE FORMAT ***
99
100 How end-of-line of text is encoded depends on the operating system.
101 For instance, Unix's format is just one byte of `line-feed' code,
102 whereas DOS's format is two-byte sequence of `carriage-return' and
103 `line-feed' codes. MacOS's format is usually one byte of
104 `carriage-return'.
105
106 Since text character encoding and end-of-line encoding are
107 independent, any coding system described above can have any
108 end-of-line format. So Emacs has information about end-of-line
109 format in each coding-system. See section 6 for more details.
110
111 */
112
113 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
114
115 These functions check if a text between SRC and SRC_END is encoded
116 in the coding system category XXX. Each returns an integer value in
117 which appropriate flag bits for the category XXX are set. The flag
118 bits are defined in macros CODING_CATEGORY_MASK_XXX. Below is the
119 template for these functions. If MULTIBYTEP is nonzero, 8-bit codes
120 of the range 0x80..0x9F are in multibyte form. */
121 #if 0
122 int
123 detect_coding_emacs_mule (src, src_end, multibytep)
124 unsigned char *src, *src_end;
125 int multibytep;
126 {
127 ...
128 }
129 #endif
130
131 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
132
133 These functions decode SRC_BYTES length of unibyte text at SOURCE
134 encoded in CODING to Emacs' internal format. The resulting
135 multibyte text goes to a place pointed to by DESTINATION, the length
136 of which should not exceed DST_BYTES.
137
138 These functions set the information about original and decoded texts
139 in the members `produced', `produced_char', `consumed', and
140 `consumed_char' of the structure *CODING. They also set the member
141 `result' to one of CODING_FINISH_XXX indicating how the decoding
142 finished.
143
144 DST_BYTES zero means that the source area and destination area are
145 overlapped, which means that we can produce a decoded text until it
146 reaches the head of the not-yet-decoded source text.
147
148 Below is a template for these functions. */
149 #if 0
150 static void
151 decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
152 struct coding_system *coding;
153 const unsigned char *source;
154 unsigned char *destination;
155 int src_bytes, dst_bytes;
156 {
157 ...
158 }
159 #endif
160
161 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
162
163 These functions encode SRC_BYTES length text at SOURCE from Emacs'
164 internal multibyte format to CODING. The resulting unibyte text
165 goes to a place pointed to by DESTINATION, the length of which
166 should not exceed DST_BYTES.
167
168 These functions set the information about original and encoded texts
169 in the members `produced', `produced_char', `consumed', and
170 `consumed_char' of the structure *CODING. They also set the member
171 `result' to one of CODING_FINISH_XXX indicating how the encoding
172 finished.
173
174 DST_BYTES zero means that the source area and destination area are
175 overlapped, which means that we can produce encoded text until it
   reaches the head of the not-yet-encoded source text.
177
178 Below is a template for these functions. */
179 #if 0
180 static void
181 encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
182 struct coding_system *coding;
183 unsigned char *source, *destination;
184 int src_bytes, dst_bytes;
185 {
186 ...
187 }
188 #endif
189
190 /*** COMMONLY USED MACROS ***/
191
/* The following two macros ONE_MORE_BYTE and TWO_MORE_BYTES safely
   get one and two bytes from the source text respectively.
194 If there are not enough bytes in the source, they jump to
195 `label_end_of_loop'. The caller should set variables `coding',
196 `src' and `src_end' to appropriate pointer in advance. These
197 macros are called from decoding routines `decode_coding_XXX', thus
198 it is assumed that the source text is unibyte. */
199
/* Fetch the next source byte into C1 and advance `src'.  When `src'
   has already reached `src_end', store
   CODING_FINISH_INSUFFICIENT_SRC in `coding->result' and jump to
   `label_end_of_loop' instead.  */
#define ONE_MORE_BYTE(c1)                                       \
  do {                                                          \
    if (src < src_end)                                          \
      c1 = *src++;                                              \
    else                                                        \
      {                                                         \
        coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
        goto label_end_of_loop;                                 \
      }                                                         \
  } while (0)
209
/* Fetch the next two source bytes into C1 and C2 (in that order) and
   advance `src' past both.  When fewer than two bytes remain before
   `src_end', consume nothing, store CODING_FINISH_INSUFFICIENT_SRC in
   `coding->result' and jump to `label_end_of_loop'.  */
#define TWO_MORE_BYTES(c1, c2)                                  \
  do {                                                          \
    if (src + 1 < src_end)                                      \
      {                                                         \
        c1 = src[0];                                            \
        c2 = src[1];                                            \
        src += 2;                                               \
      }                                                         \
    else                                                        \
      {                                                         \
        coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
        goto label_end_of_loop;                                 \
      }                                                         \
  } while (0)
220
221
222 /* Like ONE_MORE_BYTE, but 8-bit bytes of data at SRC are in multibyte
223 form if MULTIBYTEP is nonzero. In addition, if SRC is not less
224 than SRC_END, return with RET. */
225
/* Fetch one source byte into C1.  When `src' has reached `src_end',
   store CODING_FINISH_INSUFFICIENT_SRC in `coding->result' and make
   the enclosing function return RET.  When MULTIBYTEP is nonzero and
   the fetched byte is LEADING_CODE_8_BIT_CONTROL, the following byte
   is consumed as well and C1 becomes that byte minus 0x20.  */
#define ONE_MORE_BYTE_CHECK_MULTIBYTE(c1, multibytep, ret)      \
  do {                                                          \
    if (! (src < src_end))                                      \
      {                                                         \
        coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
        return ret;                                             \
      }                                                         \
    c1 = *src++;                                                \
    if (multibytep && c1 == LEADING_CODE_8_BIT_CONTROL)         \
      {                                                         \
        c1 = *src - 0x20;                                       \
        src++;                                                  \
      }                                                         \
  } while (0)
237
238 /* Set C to the next character at the source text pointed by `src'.
239 If there are not enough characters in the source, jump to
240 `label_end_of_loop'. The caller should set variables `coding'
241 `src', `src_end', and `translation_table' to appropriate pointers
242 in advance. This macro is used in encoding routines
243 `encode_coding_XXX', thus it assumes that the source text is in
244 multibyte form except for 8-bit characters. 8-bit characters are
245 in multibyte form if coding->src_multibyte is nonzero, else they
246 are represented by a single byte. */
247
/* Fetch the next character of the source into C and advance `src'
   past it.  With no bytes left before `src_end', store
   CODING_FINISH_INSUFFICIENT_SRC in `coding->result' and jump to
   `label_end_of_loop'.  A single raw byte is taken only when
   `coding->src_multibyte' is zero and the bytes at `src' do not form
   a multibyte sequence.  The character is passed through
   `translation_table' unless that table is nil.  */
#define ONE_MORE_CHAR(c)                                        \
  do {                                                          \
    int len = src_end - src;                                    \
    int bytes;                                                  \
    if (len <= 0)                                               \
      {                                                         \
        coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
        goto label_end_of_loop;                                 \
      }                                                         \
    if (! coding->src_multibyte                                 \
        && ! UNIBYTE_STR_AS_MULTIBYTE_P (src, len, bytes))      \
      {                                                         \
        c = *src;                                               \
        bytes = 1;                                              \
      }                                                         \
    else                                                        \
      c = STRING_CHAR_AND_LENGTH (src, len, bytes);             \
    if (! NILP (translation_table))                             \
      c = translate_char (translation_table, c, -1, 0, 0);      \
    src += bytes;                                               \
  } while (0)
266
267
268 /* Produce a multibyte form of character C to `dst'. Jump to
269 `label_end_of_loop' if there's not enough space at `dst'.
270
271 If we are now in the middle of a composition sequence, the decoded
272 character may be ALTCHAR (for the current composition). In that
273 case, the character goes to coding->cmp_data->data instead of
274 `dst'.
275
276 This macro is used in decoding routines. */
277
/* Decoder output of character C.

   The multibyte form of C is written at `dst' unless a composition
   with a method other than COMPOSITION_RELATIVE or
   COMPOSITION_WITH_RULE is in progress; if it does not fit before the
   output limit, CODING_FINISH_INSUFFICIENT_DST is stored in
   `coding->result' and control jumps to `label_end_of_loop'.

   While composing with any method other than COMPOSITION_RELATIVE, C
   is also recorded as a composition component, and
   `coding->composition_rule_follows' is refreshed afterwards.  */
#define EMIT_CHAR(c)                                                    \
  do {                                                                  \
    if (! (COMPOSING_P (coding)                                         \
           && coding->composing != COMPOSITION_RELATIVE                 \
           && coding->composing != COMPOSITION_WITH_RULE))              \
      {                                                                 \
        int bytes = CHAR_BYTES (c);                                     \
        if ((dst_bytes ? dst_end : src) < (dst + bytes))                \
          {                                                             \
            coding->result = CODING_FINISH_INSUFFICIENT_DST;            \
            goto label_end_of_loop;                                     \
          }                                                             \
        dst += CHAR_STRING (c, dst);                                    \
        coding->produced_char++;                                        \
      }                                                                 \
                                                                        \
    if (COMPOSING_P (coding))                                           \
      {                                                                 \
        if (coding->composing != COMPOSITION_RELATIVE)                  \
          {                                                             \
            CODING_ADD_COMPOSITION_COMPONENT (coding, c);               \
            coding->composition_rule_follows                            \
              = (coding->composing != COMPOSITION_WITH_ALTCHARS);       \
          }                                                             \
      }                                                                 \
  } while (0)
302
303
/* Store byte C at `dst' and advance `dst'.  The output limit is
   `dst_end' when `dst_bytes' is nonzero and `src' otherwise; when
   `dst' has reached it, store CODING_FINISH_INSUFFICIENT_DST in
   `coding->result' and jump to `label_end_of_loop'.  */
#define EMIT_ONE_BYTE(c)                                        \
  do {                                                          \
    if (dst < (dst_bytes ? dst_end : src))                      \
      *dst++ = c;                                               \
    else                                                        \
      {                                                         \
        coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
        goto label_end_of_loop;                                 \
      }                                                         \
  } while (0)
313
/* Store bytes C1 and C2, in that order, at `dst' and advance `dst'
   by two.  When fewer than two bytes fit before the output limit
   (`dst_end' if `dst_bytes' is nonzero, else `src'), write nothing,
   store CODING_FINISH_INSUFFICIENT_DST in `coding->result' and jump
   to `label_end_of_loop'.  */
#define EMIT_TWO_BYTES(c1, c2)                                  \
  do {                                                          \
    if (dst + 2 <= (dst_bytes ? dst_end : src))                 \
      {                                                         \
        *dst++ = c1;                                            \
        *dst++ = c2;                                            \
      }                                                         \
    else                                                        \
      {                                                         \
        coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
        goto label_end_of_loop;                                 \
      }                                                         \
  } while (0)
323
/* Copy the bytes in [FROM, TO) to `dst', advancing both FROM and
   `dst'.  FROM must be a modifiable pointer.  When the whole range
   does not fit before the output limit (`dst_end' if `dst_bytes' is
   nonzero, else `src'), copy nothing, store
   CODING_FINISH_INSUFFICIENT_DST in `coding->result' and jump to
   `label_end_of_loop'.  */
#define EMIT_BYTES(from, to)                                    \
  do {                                                          \
    if ((dst_bytes ? dst_end : src) < dst + (to - from))        \
      {                                                         \
        coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
        goto label_end_of_loop;                                 \
      }                                                         \
    for (; from < to; from++, dst++)                            \
      *dst = *from;                                             \
  } while (0)
334
335 \f
336 /*** 1. Preamble ***/
337
338 #ifdef emacs
339 #include <config.h>
340 #endif
341
342 #include <stdio.h>
343
344 #ifdef emacs
345
346 #include "lisp.h"
347 #include "buffer.h"
348 #include "charset.h"
349 #include "composite.h"
350 #include "ccl.h"
351 #include "coding.h"
352 #include "window.h"
353 #include "intervals.h"
354
355 #else /* not emacs */
356
357 #include "mulelib.h"
358
359 #endif /* not emacs */
360
361 Lisp_Object Qcoding_system, Qeol_type;
362 Lisp_Object Qbuffer_file_coding_system;
363 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
364 Lisp_Object Qno_conversion, Qundecided;
365 Lisp_Object Qcoding_system_history;
366 Lisp_Object Qsafe_chars;
367 Lisp_Object Qvalid_codes;
368 Lisp_Object Qascii_incompatible;
369
370 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
371 Lisp_Object Qcall_process, Qcall_process_region;
372 Lisp_Object Qstart_process, Qopen_network_stream;
373 Lisp_Object Qtarget_idx;
374
375 extern Lisp_Object Qcompletion_ignore_case;
376
377 /* If a symbol has this property, evaluate the value to define the
378 symbol as a coding system. */
379 Lisp_Object Qcoding_system_define_form;
380
381 Lisp_Object Vselect_safe_coding_system_function;
382
383 int coding_system_require_warning;
384
385 /* Mnemonic string for each format of end-of-line. */
386 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
387 /* Mnemonic string to indicate format of end-of-line is not yet
388 decided. */
389 Lisp_Object eol_mnemonic_undecided;
390
391 /* Format of end-of-line decided by system. This is CODING_EOL_LF on
392 Unix, CODING_EOL_CRLF on DOS/Windows, and CODING_EOL_CR on Mac.
393 This has an effect only for external encoding (i.e. for output to
394 file and process), not for in-buffer or Lisp string encoding. */
395 int system_eol_type;
396
#ifdef emacs

/* Which coding systems are safe for which characters.  The value is
   a cons (GENERIC-LIST . NON-GENERIC-ALIST), where

     GENERIC-LIST is a list of generic coding systems, i.e. ones that
     can encode any character, and

     NON-GENERIC-ALIST is an alist mapping each non-generic coding
     system to the char table holding its safe characters.  */
Lisp_Object Vcoding_system_safe_chars;

Lisp_Object Vcoding_system_list;
Lisp_Object Vcoding_system_alist;

Lisp_Object Qcoding_system_p;
Lisp_Object Qcoding_system_error;

/* The coding systems emacs-mule and raw-text convert nothing but the
   end-of-line format.  */
Lisp_Object Qemacs_mule;
Lisp_Object Qraw_text;

Lisp_Object Qutf_8;

/* The next three variables carry coding systems between Emacs Lisp
   programs and the C internals.  */

/* Coding system for reading files and receiving data from a
   process.  */
Lisp_Object Vcoding_system_for_read;

/* Coding system for writing files and sending data to a process.  */
Lisp_Object Vcoding_system_for_write;

/* Coding system actually used by the most recent I/O.  */
Lisp_Object Vlast_coding_system_used;

/* Vector of length 256 describing special Latin codes (notably the
   Microsoft ones).  */
Lisp_Object Vlatin_extra_code_table;

/* Nonzero inhibits conversion of the end-of-line format.  */
int inhibit_eol_conversion;

/* Nonzero inhibits detection of ISO2022 escape sequences.  */
int inhibit_iso_escape_detection;

/* Nonzero makes buffer-file-coding-system inherit from the process
   coding system.  */
int inherit_process_coding_system;

/* Coding system used to encode text for terminal display.  */
struct coding_system terminal_coding;

/* Coding system used to encode text for terminal display when the
   terminal coding system is nil.  */
struct coding_system safe_terminal_coding;

/* Coding system of the input arriving from the terminal keyboard.  */
struct coding_system keyboard_coding;

/* Default coding system for writing a file.  */
struct coding_system default_buffer_file_coding;

Lisp_Object Vfile_coding_system_alist;
Lisp_Object Vprocess_coding_system_alist;
Lisp_Object Vnetwork_coding_system_alist;

Lisp_Object Vlocale_coding_system;

#endif /* emacs */
461
462 Lisp_Object Qcoding_category, Qcoding_category_index;
463
464 /* List of symbols `coding-category-xxx' ordered by priority. */
465 Lisp_Object Vcoding_category_list;
466
467 /* Table of coding categories (Lisp symbols). */
468 Lisp_Object Vcoding_category_table;
469
470 /* Table of names of symbol for each coding-category. */
471 char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
472 "coding-category-emacs-mule",
473 "coding-category-sjis",
474 "coding-category-iso-7",
475 "coding-category-iso-7-tight",
476 "coding-category-iso-8-1",
477 "coding-category-iso-8-2",
478 "coding-category-iso-7-else",
479 "coding-category-iso-8-else",
480 "coding-category-ccl",
481 "coding-category-big5",
482 "coding-category-utf-8",
483 "coding-category-utf-16-be",
484 "coding-category-utf-16-le",
485 "coding-category-raw-text",
486 "coding-category-binary"
487 };
488
489 /* Table of pointers to coding systems corresponding to each coding
490 categories. */
491 struct coding_system *coding_system_table[CODING_CATEGORY_IDX_MAX];
492
493 /* Table of coding category masks. Nth element is a mask for a coding
494 category of which priority is Nth. */
495 static
496 int coding_priorities[CODING_CATEGORY_IDX_MAX];
497
498 /* Flag to tell if we look up translation table on character code
499 conversion. */
500 Lisp_Object Venable_character_translation;
501 /* Standard translation table to look up on decoding (reading). */
502 Lisp_Object Vstandard_translation_table_for_decode;
503 /* Standard translation table to look up on encoding (writing). */
504 Lisp_Object Vstandard_translation_table_for_encode;
505
506 Lisp_Object Qtranslation_table;
507 Lisp_Object Qtranslation_table_id;
508 Lisp_Object Qtranslation_table_for_decode;
509 Lisp_Object Qtranslation_table_for_encode;
510
511 /* Alist of charsets vs revision number. */
512 Lisp_Object Vcharset_revision_alist;
513
514 /* Default coding systems used for process I/O. */
515 Lisp_Object Vdefault_process_coding_system;
516
517 /* Char table for translating Quail and self-inserting input. */
518 Lisp_Object Vtranslation_table_for_input;
519
520 /* Global flag to tell that we can't call post-read-conversion and
521 pre-write-conversion functions. Usually the value is zero, but it
522 is set to 1 temporarily while such functions are running. This is
523 to avoid infinite recursive call. */
524 static int inhibit_pre_post_conversion;
525
526 Lisp_Object Qchar_coding_system;
527
528 /* Return `safe-chars' property of CODING_SYSTEM (symbol). Don't check
529 its validity. */
530
531 Lisp_Object
532 coding_safe_chars (coding_system)
533 Lisp_Object coding_system;
534 {
535 Lisp_Object coding_spec, plist, safe_chars;
536
537 coding_spec = Fget (coding_system, Qcoding_system);
538 plist = XVECTOR (coding_spec)->contents[3];
539 safe_chars = Fplist_get (XVECTOR (coding_spec)->contents[3], Qsafe_chars);
540 return (CHAR_TABLE_P (safe_chars) ? safe_chars : Qt);
541 }
542
/* Nonzero if character C is safe according to SAFE_CHARS: either
   SAFE_CHARS is Qt, or its entry for C is non-nil.  */
#define CODING_SAFE_CHAR_P(safe_chars, c)               \
  (EQ (safe_chars, Qt)                                  \
   ? 1                                                  \
   : ! NILP (CHAR_TABLE_REF (safe_chars, c)))
545
546 \f
547 /*** 2. Emacs internal format (emacs-mule) handlers ***/
548
549 /* Emacs' internal format for representation of multiple character
550 sets is a kind of multi-byte encoding, i.e. characters are
551 represented by variable-length sequences of one-byte codes.
552
553 ASCII characters and control characters (e.g. `tab', `newline') are
554 represented by one-byte sequences which are their ASCII codes, in
555 the range 0x00 through 0x7F.
556
557 8-bit characters of the range 0x80..0x9F are represented by
558 two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
559 code + 0x20).
560
561 8-bit characters of the range 0xA0..0xFF are represented by
562 one-byte sequences which are their 8-bit code.
563
564 The other characters are represented by a sequence of `base
565 leading-code', optional `extended leading-code', and one or two
566 `position-code's. The length of the sequence is determined by the
567 base leading-code. Leading-code takes the range 0x81 through 0x9D,
568 whereas extended leading-code and position-code take the range 0xA0
569 through 0xFF. See `charset.h' for more details about leading-code
570 and position-code.
571
572 --- CODE RANGE of Emacs' internal format ---
573 character set range
574 ------------- -----
575 ascii 0x00..0x7F
576 eight-bit-control LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
   eight-bit-graphic	0xA0..0xFF
578 ELSE 0x81..0x9D + [0xA0..0xFF]+
579 ---------------------------------------------
580
581 As this is the internal character representation, the format is
582 usually not used externally (i.e. in a file or in a data sent to a
583 process). But, it is possible to have a text externally in this
584 format (i.e. by encoding by the coding system `emacs-mule').
585
586 In that case, a sequence of one-byte codes has a slightly different
587 form.
588
589 Firstly, all characters in eight-bit-control are represented by
590 one-byte sequences which are their 8-bit code.
591
592 Next, character composition data are represented by the byte
593 sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
594 where,
595 METHOD is 0xF0 plus one of composition method (enum
596 composition_method),
597
598 BYTES is 0xA0 plus the byte length of these composition data,
599
600 CHARS is 0xA0 plus the number of characters composed by these
601 data,
602
603 COMPONENTs are characters of multibyte form or composition
604 rules encoded by two-byte of ASCII codes.
605
606 In addition, for backward compatibility, the following formats are
607 also recognized as composition data on decoding.
608
609 0x80 MSEQ ...
610 0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
611
612 Here,
613 MSEQ is a multibyte form but in these special format:
614 ASCII: 0xA0 ASCII_CODE+0x80,
615 other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
616 RULE is a one byte code of the range 0xA0..0xF0 that
617 represents a composition rule.
618 */
619
/* Table with one entry per possible byte value (256 entries).
   NOTE(review): presumably the class of each byte in the emacs-mule
   format; it is not initialized in this part of the file -- confirm
   against the initialization code.  */
enum emacs_code_class_type emacs_code_class[256];
621
622 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
623 Check if a text is encoded in Emacs' internal format. If it is,
624 return CODING_CATEGORY_MASK_EMACS_MULE, else return 0. */
625
626 static int
627 detect_coding_emacs_mule (src, src_end, multibytep)
628 unsigned char *src, *src_end;
629 int multibytep;
630 {
631 unsigned char c;
632 int composing = 0;
633 /* Dummy for ONE_MORE_BYTE. */
634 struct coding_system dummy_coding;
635 struct coding_system *coding = &dummy_coding;
636
637 while (1)
638 {
639 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep,
640 CODING_CATEGORY_MASK_EMACS_MULE);
641 if (composing)
642 {
643 if (c < 0xA0)
644 composing = 0;
645 else if (c == 0xA0)
646 {
647 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, 0);
648 c &= 0x7F;
649 }
650 else
651 c -= 0x20;
652 }
653
654 if (c < 0x20)
655 {
656 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
657 return 0;
658 }
659 else if (c >= 0x80 && c < 0xA0)
660 {
661 if (c == 0x80)
662 /* Old leading code for a composite character. */
663 composing = 1;
664 else
665 {
666 unsigned char *src_base = src - 1;
667 int bytes;
668
669 if (!UNIBYTE_STR_AS_MULTIBYTE_P (src_base, src_end - src_base,
670 bytes))
671 return 0;
672 src = src_base + bytes;
673 }
674 }
675 }
676 }
677
678
679 /* Record the starting position START and METHOD of one composition. */
680
/* Open a four-slot composition record at the current end of
   CODING->cmp_data and remember its index in CODING->cmp_data_start.
   Slot 0 is set to -1, slot 1 to the char offset plus START, slot 3
   to METHOD; slot 2 is left untouched here.  */
#define CODING_ADD_COMPOSITION_START(coding, start, method)             \
  do {                                                                  \
    struct composition_data *cmp_data = coding->cmp_data;               \
    int *data = &cmp_data->data[cmp_data->used];                        \
    coding->cmp_data_start = cmp_data->used;                            \
    *data = -1;                                                         \
    *(data + 1) = cmp_data->char_offset + start;                        \
    *(data + 3) = (int) method;                                         \
    cmp_data->used = cmp_data->used + 4;                                \
  } while (0)
691
692 /* Record the ending position END of the current composition. */
693
/* Close the composition record that starts at
   CODING->cmp_data_start: slot 0 receives the number of slots used
   since the record was opened, slot 2 the char offset plus END.  */
#define CODING_ADD_COMPOSITION_END(coding, end)                         \
  do {                                                                  \
    struct composition_data *cmp_data = coding->cmp_data;               \
    int *data = &cmp_data->data[coding->cmp_data_start];                \
    *data = cmp_data->used - coding->cmp_data_start;                    \
    *(data + 2) = cmp_data->char_offset + end;                          \
  } while (0)
701
702 /* Record one COMPONENT (alternate character or composition rule). */
703
/* Append COMPONENT to CODING->cmp_data.  Once the current record has
   grown to COMPOSITION_DATA_MAX_BUNCH_LENGTH slots, close it at
   CODING->produced_char and set CODING->composing to
   COMPOSITION_NO.  */
#define CODING_ADD_COMPOSITION_COMPONENT(coding, component)             \
  do {                                                                  \
    coding->cmp_data->data[coding->cmp_data->used] = component;         \
    coding->cmp_data->used++;                                           \
    if (COMPOSITION_DATA_MAX_BUNCH_LENGTH                               \
        == coding->cmp_data->used - coding->cmp_data_start)             \
      {                                                                 \
        CODING_ADD_COMPOSITION_END (coding, coding->produced_char);     \
        coding->composing = COMPOSITION_NO;                             \
      }                                                                 \
  } while (0)
714
715
/* Get one byte from the data pointed to by SRC and increment SRC.  If
   SRC is not less than SRC_END, return -1 without incrementing SRC.  */
718
#define SAFE_ONE_MORE_BYTE() (src < src_end ? *src++ : -1)
720
721
722 /* Decode a character represented as a component of composition
723 sequence of Emacs 20 style at SRC. Set C to that character, store
724 its multibyte form sequence at P, and set P to the end of that
725 sequence. If no valid character is found, set C to -1. */
726
/* Read one component character of an Emacs 20 style composition from
   `src'.  C receives the character, or a negative value when the
   source is exhausted or the bytes are not a valid component.  The
   multibyte form of an accepted character is appended through P,
   which is advanced.  Two encodings are accepted: 0xA0 followed by a
   byte that is at least 0xA0 (stored with 0x80 subtracted), and a
   leading code shifted up by 0x20 followed by its trailing bytes.  */
#define DECODE_EMACS_MULE_COMPOSITION_CHAR(c, p)                        \
  do {                                                                  \
    int bytes;                                                          \
                                                                        \
    c = SAFE_ONE_MORE_BYTE ();                                          \
    if (c < 0)                                                          \
      break;                                                            \
    if (CHAR_HEAD_P (c))                                                \
      c = -1;                                                           \
    else if (c == 0xA0)                                                 \
      {                                                                 \
        c = SAFE_ONE_MORE_BYTE ();                                      \
        if (c >= 0xA0)                                                  \
          {                                                             \
            c -= 0x80;                                                  \
            *p++ = c;                                                   \
          }                                                             \
        else                                                            \
          c = -1;                                                       \
      }                                                                 \
    else if (BASE_LEADING_CODE_P (c - 0x20))                            \
      {                                                                 \
        unsigned char *p0 = p;                                          \
                                                                        \
        c -= 0x20;                                                      \
        *p++ = c;                                                       \
        bytes = BYTES_BY_CHAR_HEAD (c);                                 \
        for (bytes--; bytes; bytes--)                                   \
          {                                                             \
            c = SAFE_ONE_MORE_BYTE ();                                  \
            if (c < 0)                                                  \
              break;                                                    \
            *p++ = c;                                                   \
          }                                                             \
        if (! UNIBYTE_STR_AS_MULTIBYTE_P (p0, p - p0, bytes)            \
            && ! (coding->flags /* We are recovering a file.  */        \
                  && p0[0] == LEADING_CODE_8_BIT_CONTROL                \
                  && ! CHAR_HEAD_P (p0[1])))                            \
          c = -1;                                                       \
        else                                                            \
          c = STRING_CHAR (p0, bytes);                                  \
      }                                                                 \
    else                                                                \
      c = -1;                                                           \
  } while (0)
772
773
/* Decode a composition rule represented as a component of a
   composition sequence of Emacs 20 style at SRC.  Set C to the rule.
   If no valid rule is found, set C to -1.  */
777
/* Read one rule byte from `src'.  A byte in 0xA0..0xF0 is split into
   `gref' (quotient by 9 of its offset from 0xA0) and `nref' (the
   remainder), and C becomes COMPOSITION_ENCODE_RULE (gref, nref).
   Anything else, including exhausted source, sets C to -1.  */
#define DECODE_EMACS_MULE_COMPOSITION_RULE(c)           \
  do {                                                  \
    c = SAFE_ONE_MORE_BYTE () - 0xA0;                   \
    if (c >= 0 && c < 81)                               \
      {                                                 \
        gref = c / 9;                                   \
        nref = c % 9;                                   \
        c = COMPOSITION_ENCODE_RULE (gref, nref);       \
      }                                                 \
    else                                                \
      c = -1;                                           \
  } while (0)
790
791
792 /* Decode composition sequence encoded by `emacs-mule' at the source
793 pointed by SRC. SRC_END is the end of source. Store information
794 of the composition in CODING->cmp_data.
795
796 For backward compatibility, decode also a composition sequence of
797 Emacs 20 style. In that case, the composition sequence contains
798 characters that should be extracted into a buffer or string. Store
799 those characters at *DESTINATION in multibyte form.
800
801 If we encounter an invalid byte sequence, return 0.
802 If we encounter an insufficient source or destination, or
803 insufficient space in CODING->cmp_data, return 1.
804 Otherwise, return consumed bytes in the source.
805
806 */
807 static INLINE int
808 decode_composition_emacs_mule (coding, src, src_end,
809 destination, dst_end, dst_bytes)
810 struct coding_system *coding;
811 const unsigned char *src, *src_end;
812 unsigned char **destination, *dst_end;
813 int dst_bytes;
814 {
815 unsigned char *dst = *destination;
816 int method, data_len, nchars;
817 const unsigned char *src_base = src++;
818 /* Store components of composition. */
819 int component[COMPOSITION_DATA_MAX_BUNCH_LENGTH];
820 int ncomponent;
821 /* Store multibyte form of characters to be composed. This is for
822 Emacs 20 style composition sequence. */
823 unsigned char buf[MAX_COMPOSITION_COMPONENTS * MAX_MULTIBYTE_LENGTH];
824 unsigned char *bufp = buf;
825 int c, i, gref, nref;
826
827 if (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH
828 >= COMPOSITION_DATA_SIZE)
829 {
830 coding->result = CODING_FINISH_INSUFFICIENT_CMP;
831 return -1;
832 }
833
834 ONE_MORE_BYTE (c);
835 if (c - 0xF0 >= COMPOSITION_RELATIVE
836 && c - 0xF0 <= COMPOSITION_WITH_RULE_ALTCHARS)
837 {
838 int with_rule;
839
840 method = c - 0xF0;
841 with_rule = (method == COMPOSITION_WITH_RULE
842 || method == COMPOSITION_WITH_RULE_ALTCHARS);
843 ONE_MORE_BYTE (c);
844 data_len = c - 0xA0;
845 if (data_len < 4
846 || src_base + data_len > src_end)
847 return 0;
848 ONE_MORE_BYTE (c);
849 nchars = c - 0xA0;
850 if (c < 1)
851 return 0;
852 for (ncomponent = 0; src < src_base + data_len; ncomponent++)
853 {
854 /* If it is longer than this, it can't be valid. */
855 if (ncomponent >= COMPOSITION_DATA_MAX_BUNCH_LENGTH)
856 return 0;
857
858 if (ncomponent % 2 && with_rule)
859 {
860 ONE_MORE_BYTE (gref);
861 gref -= 32;
862 ONE_MORE_BYTE (nref);
863 nref -= 32;
864 c = COMPOSITION_ENCODE_RULE (gref, nref);
865 }
866 else
867 {
868 int bytes;
869 if (UNIBYTE_STR_AS_MULTIBYTE_P (src, src_end - src, bytes)
870 || (coding->flags /* We are recovering a file. */
871 && src[0] == LEADING_CODE_8_BIT_CONTROL
872 && ! CHAR_HEAD_P (src[1])))
873 c = STRING_CHAR (src, bytes);
874 else
875 c = *src, bytes = 1;
876 src += bytes;
877 }
878 component[ncomponent] = c;
879 }
880 }
881 else if (c >= 0x80)
882 {
883 /* This may be an old Emacs 20 style format. See the comment at
884 the section 2 of this file. */
885 while (src < src_end && !CHAR_HEAD_P (*src)) src++;
886 if (src == src_end
887 && !(coding->mode & CODING_MODE_LAST_BLOCK))
888 goto label_end_of_loop;
889
890 src_end = src;
891 src = src_base + 1;
892 if (c < 0xC0)
893 {
894 method = COMPOSITION_RELATIVE;
895 for (ncomponent = 0; ncomponent < MAX_COMPOSITION_COMPONENTS;)
896 {
897 DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp);
898 if (c < 0)
899 break;
900 component[ncomponent++] = c;
901 }
902 if (ncomponent < 2)
903 return 0;
904 nchars = ncomponent;
905 }
906 else if (c == 0xFF)
907 {
908 method = COMPOSITION_WITH_RULE;
909 src++;
910 DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp);
911 if (c < 0)
912 return 0;
913 component[0] = c;
914 for (ncomponent = 1;
915 ncomponent < MAX_COMPOSITION_COMPONENTS * 2 - 1;)
916 {
917 DECODE_EMACS_MULE_COMPOSITION_RULE (c);
918 if (c < 0)
919 break;
920 component[ncomponent++] = c;
921 DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp);
922 if (c < 0)
923 break;
924 component[ncomponent++] = c;
925 }
926 if (ncomponent < 3)
927 return 0;
928 nchars = (ncomponent + 1) / 2;
929 }
930 else
931 return 0;
932 }
933 else
934 return 0;
935
936 if (buf == bufp || dst + (bufp - buf) <= (dst_bytes ? dst_end : src))
937 {
938 CODING_ADD_COMPOSITION_START (coding, coding->produced_char, method);
939 for (i = 0; i < ncomponent; i++)
940 CODING_ADD_COMPOSITION_COMPONENT (coding, component[i]);
941 CODING_ADD_COMPOSITION_END (coding, coding->produced_char + nchars);
942 if (buf < bufp)
943 {
944 unsigned char *p = buf;
945 EMIT_BYTES (p, bufp);
946 *destination += bufp - buf;
947 coding->produced_char += nchars;
948 }
949 return (src - src_base);
950 }
951 label_end_of_loop:
952 return -1;
953 }
954
955 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
956
957 static void
958 decode_coding_emacs_mule (coding, source, destination, src_bytes, dst_bytes)
959 struct coding_system *coding;
960 const unsigned char *source;
961 unsigned char *destination;
962 int src_bytes, dst_bytes;
963 {
964 const unsigned char *src = source;
965 const unsigned char *src_end = source + src_bytes;
966 unsigned char *dst = destination;
967 unsigned char *dst_end = destination + dst_bytes;
968 /* SRC_BASE remembers the start position in source in each loop.
969 The loop will be exited when there's not enough source code, or
970 when there's not enough destination area to produce a
971 character. */
972 const unsigned char *src_base;
973
974 coding->produced_char = 0;
975 while ((src_base = src) < src_end)
976 {
977 unsigned char tmp[MAX_MULTIBYTE_LENGTH];
978 const unsigned char *p;
979 int bytes;
980
981 if (*src == '\r')
982 {
983 int c = *src++;
984
985 if (coding->eol_type == CODING_EOL_CR)
986 c = '\n';
987 else if (coding->eol_type == CODING_EOL_CRLF)
988 {
989 ONE_MORE_BYTE (c);
990 if (c != '\n')
991 {
992 src--;
993 c = '\r';
994 }
995 }
996 *dst++ = c;
997 coding->produced_char++;
998 continue;
999 }
1000 else if (*src == '\n')
1001 {
1002 if ((coding->eol_type == CODING_EOL_CR
1003 || coding->eol_type == CODING_EOL_CRLF)
1004 && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1005 {
1006 coding->result = CODING_FINISH_INCONSISTENT_EOL;
1007 goto label_end_of_loop;
1008 }
1009 *dst++ = *src++;
1010 coding->produced_char++;
1011 continue;
1012 }
1013 else if (*src == 0x80 && coding->cmp_data)
1014 {
1015 /* Start of composition data. */
1016 int consumed = decode_composition_emacs_mule (coding, src, src_end,
1017 &dst, dst_end,
1018 dst_bytes);
1019 if (consumed < 0)
1020 goto label_end_of_loop;
1021 else if (consumed > 0)
1022 {
1023 src += consumed;
1024 continue;
1025 }
1026 bytes = CHAR_STRING (*src, tmp);
1027 p = tmp;
1028 src++;
1029 }
1030 else if (UNIBYTE_STR_AS_MULTIBYTE_P (src, src_end - src, bytes)
1031 || (coding->flags /* We are recovering a file. */
1032 && src[0] == LEADING_CODE_8_BIT_CONTROL
1033 && ! CHAR_HEAD_P (src[1])))
1034 {
1035 p = src;
1036 src += bytes;
1037 }
1038 else
1039 {
1040 int i, c;
1041
1042 bytes = BYTES_BY_CHAR_HEAD (*src);
1043 src++;
1044 for (i = 1; i < bytes; i++)
1045 {
1046 ONE_MORE_BYTE (c);
1047 if (CHAR_HEAD_P (c))
1048 break;
1049 }
1050 if (i < bytes)
1051 {
1052 bytes = CHAR_STRING (*src_base, tmp);
1053 p = tmp;
1054 src = src_base + 1;
1055 }
1056 else
1057 {
1058 p = src_base;
1059 }
1060 }
1061 if (dst + bytes >= (dst_bytes ? dst_end : src))
1062 {
1063 coding->result = CODING_FINISH_INSUFFICIENT_DST;
1064 break;
1065 }
1066 while (bytes--) *dst++ = *p++;
1067 coding->produced_char++;
1068 }
1069 label_end_of_loop:
1070 coding->consumed = coding->consumed_char = src_base - source;
1071 coding->produced = dst - destination;
1072 }
1073
1074
/* Encode composition data stored at DATA into a special byte sequence
   starting by 0x80.  Update CODING->cmp_data_start and maybe
   CODING->cmp_data for the next call.

   As used below, DATA[0] is the length of the record, DATA[2] -
   DATA[1] is the number of composed characters, DATA[3] is the
   composition method, and DATA[4] onwards are the components (for the
   two WITH_RULE methods, characters alternate with encoded rules).

   The macro also uses the variables `dst', `dst_end', `dst_bytes' and
   `src' and the label `label_end_of_loop' of the expanding function.
   When the destination is too small it sets CODING->result to
   CODING_FINISH_INSUFFICIENT_DST and jumps to that label without
   writing anything.  */

#define ENCODE_COMPOSITION_EMACS_MULE(coding, data)                      \
  do {                                                                   \
    unsigned char buf[1024], *p0 = buf, *p;                              \
    int len = (data)[0];                                                 \
    int i;                                                               \
                                                                         \
    buf[0] = 0x80;                                                       \
    buf[1] = 0xF0 + (data)[3];  /* METHOD */                             \
    buf[3] = 0xA0 + ((data)[2] - (data)[1]); /* COMPOSED-CHARS */        \
    p = buf + 4;                                                         \
    if ((data)[3] == COMPOSITION_WITH_RULE                               \
        || (data)[3] == COMPOSITION_WITH_RULE_ALTCHARS)                  \
      {                                                                  \
        p += CHAR_STRING ((data)[4], p);                                 \
        for (i = 5; i < len; i += 2)                                     \
          {                                                              \
            int gref, nref;                                              \
            COMPOSITION_DECODE_RULE ((data)[i], gref, nref);             \
            *p++ = 0x20 + gref;                                          \
            *p++ = 0x20 + nref;                                          \
            p += CHAR_STRING ((data)[i + 1], p);                         \
          }                                                              \
      }                                                                  \
    else                                                                 \
      {                                                                  \
        for (i = 4; i < len; i++)                                        \
          p += CHAR_STRING ((data)[i], p);                               \
      }                                                                  \
    /* Filled in last because it depends on the final value of P.  */    \
    buf[2] = 0xA0 + (p - buf);  /* COMPONENTS-BYTES */                   \
                                                                         \
    if (dst + (p - buf) + 4 > (dst_bytes ? dst_end : src))               \
      {                                                                  \
        (coding)->result = CODING_FINISH_INSUFFICIENT_DST;               \
        goto label_end_of_loop;                                          \
      }                                                                  \
    while (p0 < p)                                                       \
      *dst++ = *p0++;                                                    \
    (coding)->cmp_data_start += (data)[0];                               \
    /* Move on to the next block once this one is used up.  */           \
    if ((coding)->cmp_data_start == (coding)->cmp_data->used             \
        && (coding)->cmp_data->next)                                     \
      {                                                                  \
        (coding)->cmp_data = (coding)->cmp_data->next;                   \
        (coding)->cmp_data_start = 0;                                    \
      }                                                                  \
  } while (0)
1124
1125
1126 static void encode_eol P_ ((struct coding_system *, const unsigned char *,
1127 unsigned char *, int, int));
1128
1129 static void
1130 encode_coding_emacs_mule (coding, source, destination, src_bytes, dst_bytes)
1131 struct coding_system *coding;
1132 const unsigned char *source;
1133 unsigned char *destination;
1134 int src_bytes, dst_bytes;
1135 {
1136 const unsigned char *src = source;
1137 const unsigned char *src_end = source + src_bytes;
1138 unsigned char *dst = destination;
1139 unsigned char *dst_end = destination + dst_bytes;
1140 const unsigned char *src_base;
1141 int c;
1142 int char_offset;
1143 int *data;
1144
1145 Lisp_Object translation_table;
1146
1147 translation_table = Qnil;
1148
1149 /* Optimization for the case that there's no composition. */
1150 if (!coding->cmp_data || coding->cmp_data->used == 0)
1151 {
1152 encode_eol (coding, source, destination, src_bytes, dst_bytes);
1153 return;
1154 }
1155
1156 char_offset = coding->cmp_data->char_offset;
1157 data = coding->cmp_data->data + coding->cmp_data_start;
1158 while (1)
1159 {
1160 src_base = src;
1161
1162 /* If SRC starts a composition, encode the information about the
1163 composition in advance. */
1164 if (coding->cmp_data_start < coding->cmp_data->used
1165 && char_offset + coding->consumed_char == data[1])
1166 {
1167 ENCODE_COMPOSITION_EMACS_MULE (coding, data);
1168 char_offset = coding->cmp_data->char_offset;
1169 data = coding->cmp_data->data + coding->cmp_data_start;
1170 }
1171
1172 ONE_MORE_CHAR (c);
1173 if (c == '\n' && (coding->eol_type == CODING_EOL_CRLF
1174 || coding->eol_type == CODING_EOL_CR))
1175 {
1176 if (coding->eol_type == CODING_EOL_CRLF)
1177 EMIT_TWO_BYTES ('\r', c);
1178 else
1179 EMIT_ONE_BYTE ('\r');
1180 }
1181 else if (SINGLE_BYTE_CHAR_P (c))
1182 {
1183 if (coding->flags && ! ASCII_BYTE_P (c))
1184 {
1185 /* As we are auto saving, retain the multibyte form for
1186 8-bit chars. */
1187 unsigned char buf[MAX_MULTIBYTE_LENGTH];
1188 int bytes = CHAR_STRING (c, buf);
1189
1190 if (bytes == 1)
1191 EMIT_ONE_BYTE (buf[0]);
1192 else
1193 EMIT_TWO_BYTES (buf[0], buf[1]);
1194 }
1195 else
1196 EMIT_ONE_BYTE (c);
1197 }
1198 else
1199 EMIT_BYTES (src_base, src);
1200 coding->consumed_char++;
1201 }
1202 label_end_of_loop:
1203 coding->consumed = src_base - source;
1204 coding->produced = coding->produced_char = dst - destination;
1205 return;
1206 }
1207
1208 \f
1209 /*** 3. ISO2022 handlers ***/
1210
1211 /* The following note describes the coding system ISO2022 briefly.
1212 Since the intention of this note is to help understand the
1213 functions in this file, some parts are NOT ACCURATE or are OVERLY
1214 SIMPLIFIED. For thorough understanding, please refer to the
1215 original document of ISO2022. This is equivalent to the standard
1216 ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
1217
1218 ISO2022 provides many mechanisms to encode several character sets
1219 in 7-bit and 8-bit environments. For 7-bit environments, all text
1220 is encoded using bytes less than 128. This may make the encoded
1221 text a little bit longer, but the text passes more easily through
1222 several types of gateway, some of which strip off the MSB (Most
1223 Significant Bit).
1224
1225 There are two kinds of character sets: control character sets and
1226 graphic character sets. The former contain control characters such
1227 as `newline' and `escape' to provide control functions (control
1228 functions are also provided by escape sequences). The latter
1229 contain graphic characters such as 'A' and '-'. Emacs recognizes
1230 two control character sets and many graphic character sets.
1231
1232 Graphic character sets are classified into one of the following
1233 four classes, according to the number of bytes (DIMENSION) and
1234 number of characters in one dimension (CHARS) of the set:
1235 - DIMENSION1_CHARS94
1236 - DIMENSION1_CHARS96
1237 - DIMENSION2_CHARS94
1238 - DIMENSION2_CHARS96
1239
1240 In addition, each character set is assigned an identification tag,
1241 unique for each set, called the "final character" (denoted as <F>
1242 hereafter). The <F> of each character set is decided by ECMA(*)
1243 when it is registered in ISO. The code range of <F> is 0x30..0x7F
1244 (0x30..0x3F are for private use only).
1245
1246 Note (*): ECMA = European Computer Manufacturers Association
1247
1248 Here are examples of graphic character sets [NAME(<F>)]:
1249 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
1250 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
1251 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
1252 o DIMENSION2_CHARS96 -- none for the moment
1253
1254 A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
1255 C0 [0x00..0x1F] -- control character plane 0
1256 GL [0x20..0x7F] -- graphic character plane 0
1257 C1 [0x80..0x9F] -- control character plane 1
1258 GR [0xA0..0xFF] -- graphic character plane 1
1259
1260 A control character set is directly designated and invoked to C0 or
1261 C1 by an escape sequence. The most common case is that:
1262 - ISO646's control character set is designated/invoked to C0, and
1263 - ISO6429's control character set is designated/invoked to C1,
1264 and usually these designations/invocations are omitted in encoded
1265 text. In a 7-bit environment, only C0 can be used, and a control
1266 character for C1 is encoded by an appropriate escape sequence to
1267 fit into the environment. All control characters for C1 are
1268 defined to have corresponding escape sequences.
1269
1270 A graphic character set is at first designated to one of four
1271 graphic registers (G0 through G3), then these graphic registers are
1272 invoked to GL or GR. These designations and invocations can be
1273 done independently. The most common case is that G0 is invoked to
1274 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
1275 these invocations and designations are omitted in encoded text.
1276 In a 7-bit environment, only GL can be used.
1277
1278 When a graphic character set of CHARS94 is invoked to GL, codes
1279 0x20 and 0x7F of the GL area work as control characters SPACE and
1280 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
1281 be used.
1282
1283 There are two ways of invocation: locking-shift and single-shift.
1284 With locking-shift, the invocation lasts until the next different
1285 invocation, whereas with single-shift, the invocation affects the
1286 following character only and doesn't affect the locking-shift
1287 state. Invocations are done by the following control characters or
1288 escape sequences:
1289
1290 ----------------------------------------------------------------------
1291 abbrev function cntrl escape seq description
1292 ----------------------------------------------------------------------
1293 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
1294 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
1295 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
1296 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
1297 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
1298 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
1299 LS3R (locking-shift-3 right) none ESC '|' invoke G3 into GR (*)
1300 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
1301 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
1302 ----------------------------------------------------------------------
1303 (*) These are not used by any known coding system.
1304
1305 Control characters for these functions are defined by macros
1306 ISO_CODE_XXX in `coding.h'.
1307
1308 Designations are done by the following escape sequences:
1309 ----------------------------------------------------------------------
1310 escape sequence description
1311 ----------------------------------------------------------------------
1312 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
1313 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
1314 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
1315 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
1316 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
1317 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
1318 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
1319 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
1320 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
1321 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
1322 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
1323 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
1324 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
1325 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
1326 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
1327 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
1328 ----------------------------------------------------------------------
1329
1330 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
1331 of dimension 1, chars 94, and final character <F>, etc...
1332
1333 Note (*): Although these designations are not allowed in ISO2022,
1334 Emacs accepts them on decoding, and produces them on encoding
1335 CHARS96 character sets in a coding system which is characterized as
1336 7-bit environment, non-locking-shift, and non-single-shift.
1337
1338 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
1339 '(' can be omitted. We refer to this as "short-form" hereafter.
1340
1341 Now you may notice that there are a lot of ways of encoding the
1342 same multilingual text in ISO2022. Actually, there exist many
1343 coding systems such as Compound Text (used in X11's inter-client
1344 communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
1345 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
1346 localized platforms), and all of these are variants of ISO2022.
1347
1348 In addition to the above, Emacs handles two more kinds of escape
1349 sequences: ISO6429's direction specification and Emacs' private
1350 sequence for specifying character composition.
1351
1352 ISO6429's direction specification takes the following form:
1353 o CSI ']' -- end of the current direction
1354 o CSI '0' ']' -- end of the current direction
1355 o CSI '1' ']' -- start of left-to-right text
1356 o CSI '2' ']' -- start of right-to-left text
1357 The control character CSI (0x9B: control sequence introducer) is
1358 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
1359
1360 Character composition specification takes the following form:
1361 o ESC '0' -- start relative composition
1362 o ESC '1' -- end composition
1363 o ESC '2' -- start rule-base composition (*)
1364 o ESC '3' -- start relative composition with alternate chars (**)
1365 o ESC '4' -- start rule-base composition with alternate chars (**)
1366 Since these are not standard escape sequences of any ISO standard,
1367 the use of them with these meanings is restricted to Emacs only.
1368
1369 (*) This form is used only in Emacs 20.5 and older versions,
1370 but the newer versions can safely decode it.
1371 (**) This form is used only in Emacs 21.1 and newer versions,
1372 and the older versions can't decode it.
1373
1374 Here's a list of example usages of these composition escape
1375 sequences (categorized by `enum composition_method').
1376
1377 COMPOSITION_RELATIVE:
1378 ESC 0 CHAR [ CHAR ] ESC 1
1379 COMPOSITION_WITH_RULE:
1380 ESC 2 CHAR [ RULE CHAR ] ESC 1
1381 COMPOSITION_WITH_ALTCHARS:
1382 ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
1383 COMPOSITION_WITH_RULE_ALTCHARS:
1384 ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
1385
/* Table with one entry for each of the 256 possible byte values.
   decode_coding_iso2022 dispatches on iso_code_class[C1] for each
   byte C1 it reads.  */
1386 enum iso_code_class_type iso_code_class[256];
1387
/* The three predicates below test the coding system registered in
   coding_system_table[IDX].  An entry of coding_system_table may be a
   null pointer; every predicate is then false instead of
   dereferencing it.  */

/* Non-zero if the coding system at IDX can handle character C of
   CHARSET: C must be ASCII or a safe character of that coding system,
   and some designation must be requested for CHARSET.  Assigns to the
   variable `safe_chars' of the expanding function.  */
#define CHARSET_OK(idx, charset, c)                                     \
  (coding_system_table[idx]                                             \
   && ((charset) == CHARSET_ASCII                                       \
       || (safe_chars = coding_safe_chars (coding_system_table[idx]->symbol), \
           CODING_SAFE_CHAR_P (safe_chars, c)))                         \
   && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding_system_table[idx], \
                                              charset)                  \
       != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))

/* Non-zero if the coding system at IDX has an initial designation for
   graphic register 1.  */
#define SHIFT_OUT_OK(idx)                                               \
  (coding_system_table[idx]                                             \
   && (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding_system_table[idx], 1) \
       >= 0))

/* Non-zero if composition is not disabled for the coding system at
   IDX.  */
#define COMPOSITION_OK(idx)                                             \
  (coding_system_table[idx]                                             \
   && coding_system_table[idx]->composing != COMPOSITION_DISABLED)
1402
1403 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1404 Check if a text is encoded in ISO2022. If it is, return an
1405 integer in which appropriate flag bits any of:
1406 CODING_CATEGORY_MASK_ISO_7
1407 CODING_CATEGORY_MASK_ISO_7_TIGHT
1408 CODING_CATEGORY_MASK_ISO_8_1
1409 CODING_CATEGORY_MASK_ISO_8_2
1410 CODING_CATEGORY_MASK_ISO_7_ELSE
1411 CODING_CATEGORY_MASK_ISO_8_ELSE
1412 are set. If a code which should never appear in ISO2022 is found,
1413 returns 0. */
1414
1415 static int
1416 detect_coding_iso2022 (src, src_end, multibytep)
1417 unsigned char *src, *src_end;
1418 int multibytep;
1419 {
1420 int mask = CODING_CATEGORY_MASK_ISO;
1421 int mask_found = 0;
1422 int reg[4], shift_out = 0, single_shifting = 0;
1423 int c, c1, charset;
1424 /* Dummy for ONE_MORE_BYTE. */
1425 struct coding_system dummy_coding;
1426 struct coding_system *coding = &dummy_coding;
1427 Lisp_Object safe_chars;
1428
1429 reg[0] = CHARSET_ASCII, reg[1] = reg[2] = reg[3] = -1;
1430 while (mask)
1431 {
1432 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, mask & mask_found);
1433 retry:
1434 switch (c)
1435 {
1436 case ISO_CODE_ESC:
1437 if (inhibit_iso_escape_detection)
1438 break;
1439 single_shifting = 0;
1440 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, mask & mask_found);
1441 if (c >= '(' && c <= '/')
1442 {
1443 /* Designation sequence for a charset of dimension 1. */
1444 ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep, mask & mask_found);
1445 if (c1 < ' ' || c1 >= 0x80
1446 || (charset = iso_charset_table[0][c >= ','][c1]) < 0)
1447 /* Invalid designation sequence. Just ignore. */
1448 break;
1449 reg[(c - '(') % 4] = charset;
1450 }
1451 else if (c == '$')
1452 {
1453 /* Designation sequence for a charset of dimension 2. */
1454 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, mask & mask_found);
1455 if (c >= '@' && c <= 'B')
1456 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
1457 reg[0] = charset = iso_charset_table[1][0][c];
1458 else if (c >= '(' && c <= '/')
1459 {
1460 ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep,
1461 mask & mask_found);
1462 if (c1 < ' ' || c1 >= 0x80
1463 || (charset = iso_charset_table[1][c >= ','][c1]) < 0)
1464 /* Invalid designation sequence. Just ignore. */
1465 break;
1466 reg[(c - '(') % 4] = charset;
1467 }
1468 else
1469 /* Invalid designation sequence. Just ignore. */
1470 break;
1471 }
1472 else if (c == 'N' || c == 'O')
1473 {
1474 /* ESC <Fe> for SS2 or SS3. */
1475 mask &= CODING_CATEGORY_MASK_ISO_7_ELSE;
1476 break;
1477 }
1478 else if (c >= '0' && c <= '4')
1479 {
1480 /* ESC <Fp> for start/end composition. */
1481 if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_7))
1482 mask_found |= CODING_CATEGORY_MASK_ISO_7;
1483 else
1484 mask &= ~CODING_CATEGORY_MASK_ISO_7;
1485 if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT))
1486 mask_found |= CODING_CATEGORY_MASK_ISO_7_TIGHT;
1487 else
1488 mask &= ~CODING_CATEGORY_MASK_ISO_7_TIGHT;
1489 if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_8_1))
1490 mask_found |= CODING_CATEGORY_MASK_ISO_8_1;
1491 else
1492 mask &= ~CODING_CATEGORY_MASK_ISO_8_1;
1493 if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_8_2))
1494 mask_found |= CODING_CATEGORY_MASK_ISO_8_2;
1495 else
1496 mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
1497 if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_7_ELSE))
1498 mask_found |= CODING_CATEGORY_MASK_ISO_7_ELSE;
1499 else
1500 mask &= ~CODING_CATEGORY_MASK_ISO_7_ELSE;
1501 if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_8_ELSE))
1502 mask_found |= CODING_CATEGORY_MASK_ISO_8_ELSE;
1503 else
1504 mask &= ~CODING_CATEGORY_MASK_ISO_8_ELSE;
1505 break;
1506 }
1507 else
1508 /* Invalid escape sequence. Just ignore. */
1509 break;
1510
1511 /* We found a valid designation sequence for CHARSET. */
1512 mask &= ~CODING_CATEGORY_MASK_ISO_8BIT;
1513 c = MAKE_CHAR (charset, 0, 0);
1514 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7, charset, c))
1515 mask_found |= CODING_CATEGORY_MASK_ISO_7;
1516 else
1517 mask &= ~CODING_CATEGORY_MASK_ISO_7;
1518 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT, charset, c))
1519 mask_found |= CODING_CATEGORY_MASK_ISO_7_TIGHT;
1520 else
1521 mask &= ~CODING_CATEGORY_MASK_ISO_7_TIGHT;
1522 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_ELSE, charset, c))
1523 mask_found |= CODING_CATEGORY_MASK_ISO_7_ELSE;
1524 else
1525 mask &= ~CODING_CATEGORY_MASK_ISO_7_ELSE;
1526 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_8_ELSE, charset, c))
1527 mask_found |= CODING_CATEGORY_MASK_ISO_8_ELSE;
1528 else
1529 mask &= ~CODING_CATEGORY_MASK_ISO_8_ELSE;
1530 break;
1531
1532 case ISO_CODE_SO:
1533 if (inhibit_iso_escape_detection)
1534 break;
1535 single_shifting = 0;
1536 if (shift_out == 0
1537 && (reg[1] >= 0
1538 || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_7_ELSE)
1539 || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_8_ELSE)))
1540 {
1541 /* Locking shift out. */
1542 mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
1543 mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
1544 }
1545 break;
1546
1547 case ISO_CODE_SI:
1548 if (inhibit_iso_escape_detection)
1549 break;
1550 single_shifting = 0;
1551 if (shift_out == 1)
1552 {
1553 /* Locking shift in. */
1554 mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
1555 mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
1556 }
1557 break;
1558
1559 case ISO_CODE_CSI:
1560 single_shifting = 0;
1561 case ISO_CODE_SS2:
1562 case ISO_CODE_SS3:
1563 {
1564 int newmask = CODING_CATEGORY_MASK_ISO_8_ELSE;
1565
1566 if (inhibit_iso_escape_detection)
1567 break;
1568 if (c != ISO_CODE_CSI)
1569 {
1570 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1571 & CODING_FLAG_ISO_SINGLE_SHIFT)
1572 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
1573 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1574 & CODING_FLAG_ISO_SINGLE_SHIFT)
1575 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
1576 single_shifting = 1;
1577 }
1578 if (VECTORP (Vlatin_extra_code_table)
1579 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
1580 {
1581 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1582 & CODING_FLAG_ISO_LATIN_EXTRA)
1583 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
1584 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1585 & CODING_FLAG_ISO_LATIN_EXTRA)
1586 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
1587 }
1588 mask &= newmask;
1589 mask_found |= newmask;
1590 }
1591 break;
1592
1593 default:
1594 if (c < 0x80)
1595 {
1596 single_shifting = 0;
1597 break;
1598 }
1599 else if (c < 0xA0)
1600 {
1601 single_shifting = 0;
1602 if (VECTORP (Vlatin_extra_code_table)
1603 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
1604 {
1605 int newmask = 0;
1606
1607 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1608 & CODING_FLAG_ISO_LATIN_EXTRA)
1609 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
1610 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1611 & CODING_FLAG_ISO_LATIN_EXTRA)
1612 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
1613 mask &= newmask;
1614 mask_found |= newmask;
1615 }
1616 else
1617 return 0;
1618 }
1619 else
1620 {
1621 mask &= ~(CODING_CATEGORY_MASK_ISO_7BIT
1622 | CODING_CATEGORY_MASK_ISO_7_ELSE);
1623 mask_found |= CODING_CATEGORY_MASK_ISO_8_1;
1624 /* Check the length of succeeding codes of the range
1625 0xA0..0FF. If the byte length is odd, we exclude
1626 CODING_CATEGORY_MASK_ISO_8_2. We can check this only
1627 when we are not single shifting. */
1628 if (!single_shifting
1629 && mask & CODING_CATEGORY_MASK_ISO_8_2)
1630 {
1631 int i = 1;
1632
1633 c = -1;
1634 while (src < src_end)
1635 {
1636 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep,
1637 mask & mask_found);
1638 if (c < 0xA0)
1639 break;
1640 i++;
1641 }
1642
1643 if (i & 1 && src < src_end)
1644 mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
1645 else
1646 mask_found |= CODING_CATEGORY_MASK_ISO_8_2;
1647 if (c >= 0)
1648 /* This means that we have read one extra byte. */
1649 goto retry;
1650 }
1651 }
1652 break;
1653 }
1654 }
1655 return (mask & mask_found);
1656 }
1657
/* Build the character of charset CHARSET whose 1st position code is
   C1 and whose 2nd position code is C2, and yield its character code.
   When the variable `translation_table' of the expanding function is
   non-nil, the character is obtained through translate_char with that
   table instead.  */

#define DECODE_ISO_CHARACTER(charset, c1, c2)                           \
  (! NILP (translation_table)                                           \
   ? translate_char (translation_table, -1, charset, c1, c2)            \
   : MAKE_CHAR (charset, c1, c2))
1667
/* Set designation state into CODING: designate to graphic register
   REG the charset identified by DIMENSION, CHARS and FINAL_CHAR.

   The designation is accepted when the charset exists and either REG
   is the register requested for it by CODING or its characters are
   safe according to the variable `safe_chars'.  In every other case,
   and also for a FINAL_CHAR outside '0'..127, control jumps to the
   label `label_invalid_code' of the expanding function.  The
   variable `coding' of the expanding function is used as well.  */
#define DECODE_DESIGNATION(reg, dimension, chars, final_char)              \
  do {                                                                     \
    int charset, c;                                                        \
                                                                           \
    if ((final_char) < '0' || (final_char) >= 128)                         \
      goto label_invalid_code;                                             \
    charset = ISO_CHARSET_TABLE (make_number (dimension),                  \
                                 make_number (chars),                      \
                                 make_number (final_char));                \
    c = MAKE_CHAR (charset, 0, 0);                                         \
    if (charset >= 0                                                       \
        && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) == (reg) \
            || CODING_SAFE_CHAR_P (safe_chars, c)))                        \
      {                                                                    \
        if (coding->spec.iso2022.last_invalid_designation_register == 0    \
            && (reg) == 0                                                  \
            && charset == CHARSET_ASCII)                                   \
          {                                                                \
            /* We should insert this designation sequence as is so         \
               that it is surely written back to a file.  */               \
            coding->spec.iso2022.last_invalid_designation_register = -1;   \
            goto label_invalid_code;                                       \
          }                                                                \
        coding->spec.iso2022.last_invalid_designation_register = -1;       \
        if ((coding->mode & CODING_MODE_DIRECTION)                         \
            && CHARSET_REVERSE_CHARSET (charset) >= 0)                     \
          charset = CHARSET_REVERSE_CHARSET (charset);                     \
        CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;               \
      }                                                                    \
    else                                                                   \
      {                                                                    \
        /* Remember which register the rejected designation was for.  */   \
        coding->spec.iso2022.last_invalid_designation_register = (reg);    \
        goto label_invalid_code;                                           \
      }                                                                    \
  } while (0)
1704
1705 /* Allocate a memory block for storing information about compositions.
1706 The block is chained to the already allocated blocks. */
1707
1708 void
1709 coding_allocate_composition_data (coding, char_offset)
1710 struct coding_system *coding;
1711 int char_offset;
1712 {
1713 struct composition_data *cmp_data
1714 = (struct composition_data *) xmalloc (sizeof *cmp_data);
1715
1716 cmp_data->char_offset = char_offset;
1717 cmp_data->used = 0;
1718 cmp_data->prev = coding->cmp_data;
1719 cmp_data->next = NULL;
1720 if (coding->cmp_data)
1721 coding->cmp_data->next = cmp_data;
1722 coding->cmp_data = cmp_data;
1723 coding->cmp_data_start = 0;
1724 coding->composing = COMPOSITION_NO;
1725 }
1726
/* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
   ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
   ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
   ESC 3 : altchar composition :  ESC 3 ALT ... ESC 0 CHAR ... ESC 1
   ESC 4 : alt&rule composition : ESC 4 ALT RULE .. ALT ESC 0 CHAR ... ESC 1

   C1 is the character following ESC.  The macro uses the variables
   `coding' and `dst' and the label `label_end_of_loop' of the
   expanding function.  When composition is disabled the escape
   sequence is copied to `dst' as two characters.  */

#define DECODE_COMPOSITION_START(c1)                                       \
  do {                                                                     \
    if (coding->composing == COMPOSITION_DISABLED)                         \
      {                                                                    \
        *dst++ = ISO_CODE_ESC;                                             \
        *dst++ = (c1) & 0x7f;                                              \
        coding->produced_char += 2;                                        \
      }                                                                    \
    else if (!COMPOSING_P (coding))                                        \
      {                                                                    \
        /* This is surely the start of a composition.  We must be sure     \
           that coding->cmp_data has enough space to store the             \
           information about the composition.  If not, terminate the       \
           current decoding loop, allocate one more memory block for       \
           coding->cmp_data in the caller, then start the decoding         \
           loop again.  We can't allocate memory here directly because     \
           it may cause buffer/string relocation.  */                      \
        if (!coding->cmp_data                                              \
            || (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH \
                >= COMPOSITION_DATA_SIZE))                                 \
          {                                                                \
            coding->result = CODING_FINISH_INSUFFICIENT_CMP;               \
            goto label_end_of_loop;                                        \
          }                                                                \
        coding->composing = ((c1) == '0' ? COMPOSITION_RELATIVE            \
                             : (c1) == '2' ? COMPOSITION_WITH_RULE         \
                             : (c1) == '3' ? COMPOSITION_WITH_ALTCHARS     \
                             : COMPOSITION_WITH_RULE_ALTCHARS);            \
        CODING_ADD_COMPOSITION_START (coding, coding->produced_char,       \
                                      coding->composing);                  \
        coding->composition_rule_follows = 0;                              \
      }                                                                    \
    else                                                                   \
      {                                                                    \
        /* We are already handling a composition.  If the method is        \
           the following two, the codes following the current escape       \
           sequence are actual characters stored in a buffer.  */          \
        if (coding->composing == COMPOSITION_WITH_ALTCHARS                 \
            || coding->composing == COMPOSITION_WITH_RULE_ALTCHARS)        \
          {                                                                \
            coding->composing = COMPOSITION_RELATIVE;                      \
            coding->composition_rule_follows = 0;                          \
          }                                                                \
      }                                                                    \
  } while (0)
1779
/* Handle composition end sequence ESC 1.  While a composition is in
   progress, record its end at the current CODING->produced_char and
   leave composing mode.  Otherwise copy the sequence (ESC followed by
   C1) to `dst' as two characters.  Uses the variables `coding' and
   `dst' of the expanding function.  */

#define DECODE_COMPOSITION_END(c1)                                      \
  do {                                                                  \
    if (COMPOSING_P (coding))                                           \
      {                                                                 \
        CODING_ADD_COMPOSITION_END (coding, coding->produced_char);     \
        coding->composing = COMPOSITION_NO;                             \
      }                                                                 \
    else                                                                \
      {                                                                 \
        dst[0] = ISO_CODE_ESC;                                          \
        dst[1] = c1;                                                    \
        dst += 2;                                                       \
        coding->produced_char += 2;                                     \
      }                                                                 \
  } while (0)
1796
/* Decode a composition rule from the byte C1 (and maybe one more byte
   from SRC) and store one encoded composition rule in
   coding->cmp_data.

   C1 must be a modifiable lvalue: 32 is subtracted from it in place.
   For the new format the second byte is read with ONE_MORE_BYTE into
   the variable `c2' of the expanding function.  If C1 - 32 is 93 or
   more, neither format applies and the rule stored is 0.  */

#define DECODE_COMPOSITION_RULE(c1)                                     \
  do {                                                                  \
    int rule = 0;                                                       \
    (c1) -= 32;                                                         \
    if ((c1) < 81)              /* old format (before ver.21) */        \
      {                                                                 \
        int gref = (c1) / 9;                                            \
        int nref = (c1) % 9;                                            \
        if (gref == 4) gref = 10;                                       \
        if (nref == 4) nref = 10;                                       \
        rule = COMPOSITION_ENCODE_RULE (gref, nref);                    \
      }                                                                 \
    else if ((c1) < 93)         /* new format (after ver.21) */         \
      {                                                                 \
        ONE_MORE_BYTE (c2);                                             \
        rule = COMPOSITION_ENCODE_RULE ((c1) - 81, c2 - 32);            \
      }                                                                 \
    CODING_ADD_COMPOSITION_COMPONENT (coding, rule);                    \
    coding->composition_rule_follows = 0;                               \
  } while (0)
1821
1822
1823 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
1824
1825 static void
1826 decode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
1827 struct coding_system *coding;
1828 const unsigned char *source;
1829 unsigned char *destination;
1830 int src_bytes, dst_bytes;
1831 {
1832 const unsigned char *src = source;
1833 const unsigned char *src_end = source + src_bytes;
1834 unsigned char *dst = destination;
1835 unsigned char *dst_end = destination + dst_bytes;
1836 /* Charsets invoked to graphic plane 0 and 1 respectively. */
1837 int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1838 int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1839 /* SRC_BASE remembers the start position in source in each loop.
1840 The loop will be exited when there's not enough source code
1841 (within macro ONE_MORE_BYTE), or when there's not enough
1842 destination area to produce a character (within macro
1843 EMIT_CHAR). */
1844 const unsigned char *src_base;
1845 int c, charset;
1846 Lisp_Object translation_table;
1847 Lisp_Object safe_chars;
1848
1849 safe_chars = coding_safe_chars (coding->symbol);
1850
1851 if (NILP (Venable_character_translation))
1852 translation_table = Qnil;
1853 else
1854 {
1855 translation_table = coding->translation_table_for_decode;
1856 if (NILP (translation_table))
1857 translation_table = Vstandard_translation_table_for_decode;
1858 }
1859
1860 coding->result = CODING_FINISH_NORMAL;
1861
1862 while (1)
1863 {
1864 int c1, c2 = 0;
1865
1866 src_base = src;
1867 ONE_MORE_BYTE (c1);
1868
1869 /* We produce no character or one character. */
1870 switch (iso_code_class [c1])
1871 {
1872 case ISO_0x20_or_0x7F:
1873 if (COMPOSING_P (coding) && coding->composition_rule_follows)
1874 {
1875 DECODE_COMPOSITION_RULE (c1);
1876 continue;
1877 }
1878 if (charset0 < 0 || CHARSET_CHARS (charset0) == 94)
1879 {
1880 /* This is SPACE or DEL. */
1881 charset = CHARSET_ASCII;
1882 break;
1883 }
1884 /* This is a graphic character, we fall down ... */
1885
1886 case ISO_graphic_plane_0:
1887 if (COMPOSING_P (coding) && coding->composition_rule_follows)
1888 {
1889 DECODE_COMPOSITION_RULE (c1);
1890 continue;
1891 }
1892 charset = charset0;
1893 break;
1894
1895 case ISO_0xA0_or_0xFF:
1896 if (charset1 < 0 || CHARSET_CHARS (charset1) == 94
1897 || coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
1898 goto label_invalid_code;
1899 /* This is a graphic character, we fall down ... */
1900
1901 case ISO_graphic_plane_1:
1902 if (charset1 < 0)
1903 goto label_invalid_code;
1904 charset = charset1;
1905 break;
1906
1907 case ISO_control_0:
1908 if (COMPOSING_P (coding))
1909 DECODE_COMPOSITION_END ('1');
1910
1911 /* All ISO2022 control characters in this class have the
1912 same representation in Emacs internal format. */
1913 if (c1 == '\n'
1914 && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1915 && (coding->eol_type == CODING_EOL_CR
1916 || coding->eol_type == CODING_EOL_CRLF))
1917 {
1918 coding->result = CODING_FINISH_INCONSISTENT_EOL;
1919 goto label_end_of_loop;
1920 }
1921 charset = CHARSET_ASCII;
1922 break;
1923
1924 case ISO_control_1:
1925 if (COMPOSING_P (coding))
1926 DECODE_COMPOSITION_END ('1');
1927 goto label_invalid_code;
1928
1929 case ISO_carriage_return:
1930 if (COMPOSING_P (coding))
1931 DECODE_COMPOSITION_END ('1');
1932
1933 if (coding->eol_type == CODING_EOL_CR)
1934 c1 = '\n';
1935 else if (coding->eol_type == CODING_EOL_CRLF)
1936 {
1937 ONE_MORE_BYTE (c1);
1938 if (c1 != ISO_CODE_LF)
1939 {
1940 src--;
1941 c1 = '\r';
1942 }
1943 }
1944 charset = CHARSET_ASCII;
1945 break;
1946
1947 case ISO_shift_out:
1948 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1949 || CODING_SPEC_ISO_DESIGNATION (coding, 1) < 0)
1950 goto label_invalid_code;
1951 CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
1952 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1953 continue;
1954
1955 case ISO_shift_in:
1956 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
1957 goto label_invalid_code;
1958 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
1959 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1960 continue;
1961
1962 case ISO_single_shift_2_7:
1963 case ISO_single_shift_2:
1964 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1965 goto label_invalid_code;
1966 /* SS2 is handled as an escape sequence of ESC 'N' */
1967 c1 = 'N';
1968 goto label_escape_sequence;
1969
1970 case ISO_single_shift_3:
1971 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1972 goto label_invalid_code;
1973 /* SS2 is handled as an escape sequence of ESC 'O' */
1974 c1 = 'O';
1975 goto label_escape_sequence;
1976
1977 case ISO_control_sequence_introducer:
1978 /* CSI is handled as an escape sequence of ESC '[' ... */
1979 c1 = '[';
1980 goto label_escape_sequence;
1981
1982 case ISO_escape:
1983 ONE_MORE_BYTE (c1);
1984 label_escape_sequence:
1985 /* Escape sequences handled by Emacs are invocation,
1986 designation, direction specification, and character
1987 composition specification. */
1988 switch (c1)
1989 {
1990 case '&': /* revision of following character set */
1991 ONE_MORE_BYTE (c1);
1992 if (!(c1 >= '@' && c1 <= '~'))
1993 goto label_invalid_code;
1994 ONE_MORE_BYTE (c1);
1995 if (c1 != ISO_CODE_ESC)
1996 goto label_invalid_code;
1997 ONE_MORE_BYTE (c1);
1998 goto label_escape_sequence;
1999
2000 case '$': /* designation of 2-byte character set */
2001 if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
2002 goto label_invalid_code;
2003 ONE_MORE_BYTE (c1);
2004 if (c1 >= '@' && c1 <= 'B')
2005 { /* designation of JISX0208.1978, GB2312.1980,
2006 or JISX0208.1980 */
2007 DECODE_DESIGNATION (0, 2, 94, c1);
2008 }
2009 else if (c1 >= 0x28 && c1 <= 0x2B)
2010 { /* designation of DIMENSION2_CHARS94 character set */
2011 ONE_MORE_BYTE (c2);
2012 DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
2013 }
2014 else if (c1 >= 0x2C && c1 <= 0x2F)
2015 { /* designation of DIMENSION2_CHARS96 character set */
2016 ONE_MORE_BYTE (c2);
2017 DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
2018 }
2019 else
2020 goto label_invalid_code;
2021 /* We must update these variables now. */
2022 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
2023 charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
2024 continue;
2025
2026 case 'n': /* invocation of locking-shift-2 */
2027 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
2028 || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
2029 goto label_invalid_code;
2030 CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
2031 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
2032 continue;
2033
2034 case 'o': /* invocation of locking-shift-3 */
2035 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
2036 || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
2037 goto label_invalid_code;
2038 CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
2039 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
2040 continue;
2041
2042 case 'N': /* invocation of single-shift-2 */
2043 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2044 || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
2045 goto label_invalid_code;
2046 charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
2047 ONE_MORE_BYTE (c1);
2048 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
2049 goto label_invalid_code;
2050 break;
2051
2052 case 'O': /* invocation of single-shift-3 */
2053 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2054 || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
2055 goto label_invalid_code;
2056 charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
2057 ONE_MORE_BYTE (c1);
2058 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
2059 goto label_invalid_code;
2060 break;
2061
2062 case '0': case '2': case '3': case '4': /* start composition */
2063 DECODE_COMPOSITION_START (c1);
2064 continue;
2065
2066 case '1': /* end composition */
2067 DECODE_COMPOSITION_END (c1);
2068 continue;
2069
2070 case '[': /* specification of direction */
2071 if (coding->flags & CODING_FLAG_ISO_NO_DIRECTION)
2072 goto label_invalid_code;
2073 /* For the moment, nested direction is not supported.
2074 So, `coding->mode & CODING_MODE_DIRECTION' zero means
2075 left-to-right, and nonzero means right-to-left. */
2076 ONE_MORE_BYTE (c1);
2077 switch (c1)
2078 {
2079 case ']': /* end of the current direction */
2080 coding->mode &= ~CODING_MODE_DIRECTION;
2081
2082 case '0': /* end of the current direction */
2083 case '1': /* start of left-to-right direction */
2084 ONE_MORE_BYTE (c1);
2085 if (c1 == ']')
2086 coding->mode &= ~CODING_MODE_DIRECTION;
2087 else
2088 goto label_invalid_code;
2089 break;
2090
2091 case '2': /* start of right-to-left direction */
2092 ONE_MORE_BYTE (c1);
2093 if (c1 == ']')
2094 coding->mode |= CODING_MODE_DIRECTION;
2095 else
2096 goto label_invalid_code;
2097 break;
2098
2099 default:
2100 goto label_invalid_code;
2101 }
2102 continue;
2103
2104 case '%':
2105 if (COMPOSING_P (coding))
2106 DECODE_COMPOSITION_END ('1');
2107 ONE_MORE_BYTE (c1);
2108 if (c1 == '/')
2109 {
2110 /* CTEXT extended segment:
2111 ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
2112 We keep these bytes as is for the moment.
2113 They may be decoded by post-read-conversion. */
2114 int dim, M, L;
2115 int size, required;
2116 int produced_chars;
2117
2118 ONE_MORE_BYTE (dim);
2119 ONE_MORE_BYTE (M);
2120 ONE_MORE_BYTE (L);
2121 size = ((M - 128) * 128) + (L - 128);
2122 required = 8 + size * 2;
2123 if (dst + required > (dst_bytes ? dst_end : src))
2124 goto label_end_of_loop;
2125 *dst++ = ISO_CODE_ESC;
2126 *dst++ = '%';
2127 *dst++ = '/';
2128 *dst++ = dim;
2129 produced_chars = 4;
2130 dst += CHAR_STRING (M, dst), produced_chars++;
2131 dst += CHAR_STRING (L, dst), produced_chars++;
2132 while (size-- > 0)
2133 {
2134 ONE_MORE_BYTE (c1);
2135 dst += CHAR_STRING (c1, dst), produced_chars++;
2136 }
2137 coding->produced_char += produced_chars;
2138 }
2139 else if (c1 == 'G')
2140 {
2141 unsigned char *d = dst;
2142 int produced_chars;
2143
2144 /* XFree86 extension for embedding UTF-8 in CTEXT:
2145 ESC % G --UTF-8-BYTES-- ESC % @
2146 We keep these bytes as is for the moment.
2147 They may be decoded by post-read-conversion. */
2148 if (d + 6 > (dst_bytes ? dst_end : src))
2149 goto label_end_of_loop;
2150 *d++ = ISO_CODE_ESC;
2151 *d++ = '%';
2152 *d++ = 'G';
2153 produced_chars = 3;
2154 while (d + 1 < (dst_bytes ? dst_end : src))
2155 {
2156 ONE_MORE_BYTE (c1);
2157 if (c1 == ISO_CODE_ESC
2158 && src + 1 < src_end
2159 && src[0] == '%'
2160 && src[1] == '@')
2161 {
2162 src += 2;
2163 break;
2164 }
2165 d += CHAR_STRING (c1, d), produced_chars++;
2166 }
2167 if (d + 3 > (dst_bytes ? dst_end : src))
2168 goto label_end_of_loop;
2169 *d++ = ISO_CODE_ESC;
2170 *d++ = '%';
2171 *d++ = '@';
2172 dst = d;
2173 coding->produced_char += produced_chars + 3;
2174 }
2175 else
2176 goto label_invalid_code;
2177 continue;
2178
2179 default:
2180 if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
2181 goto label_invalid_code;
2182 if (c1 >= 0x28 && c1 <= 0x2B)
2183 { /* designation of DIMENSION1_CHARS94 character set */
2184 ONE_MORE_BYTE (c2);
2185 DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
2186 }
2187 else if (c1 >= 0x2C && c1 <= 0x2F)
2188 { /* designation of DIMENSION1_CHARS96 character set */
2189 ONE_MORE_BYTE (c2);
2190 DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
2191 }
2192 else
2193 goto label_invalid_code;
2194 /* We must update these variables now. */
2195 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
2196 charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
2197 continue;
2198 }
2199 }
2200
2201 /* Now we know CHARSET and 1st position code C1 of a character.
2202 Produce a multibyte sequence for that character while getting
2203 2nd position code C2 if necessary. */
2204 if (CHARSET_DIMENSION (charset) == 2)
2205 {
2206 ONE_MORE_BYTE (c2);
2207 if (c1 < 0x80 ? c2 < 0x20 || c2 >= 0x80 : c2 < 0xA0)
2208 /* C2 is not in a valid range. */
2209 goto label_invalid_code;
2210 }
2211 c = DECODE_ISO_CHARACTER (charset, c1, c2);
2212 EMIT_CHAR (c);
2213 continue;
2214
2215 label_invalid_code:
2216 coding->errors++;
2217 if (COMPOSING_P (coding))
2218 DECODE_COMPOSITION_END ('1');
2219 src = src_base;
2220 c = *src++;
2221 if (! NILP (translation_table))
2222 c = translate_char (translation_table, c, 0, 0, 0);
2223 EMIT_CHAR (c);
2224 }
2225
2226 label_end_of_loop:
2227 coding->consumed = coding->consumed_char = src_base - source;
2228 coding->produced = dst - destination;
2229 return;
2230 }
2231
2232
2233 /* ISO2022 encoding stuff. */
2234
2235 /*
2236 It is not enough to say just "ISO2022" on encoding, we have to
2237 specify more details. In Emacs, each ISO2022 coding system
2238 variant has the following specifications:
2239 1. Initial designation to G0 through G3.
2240 2. Allows short-form designation?
2241 3. ASCII should be designated to G0 before control characters?
2242 4. ASCII should be designated to G0 at end of line?
2243 5. 7-bit environment or 8-bit environment?
2244 6. Use locking-shift?
2245 7. Use Single-shift?
2246 And the following two are only for Japanese:
2247 8. Use ASCII in place of JIS0201-1976-Roman?
2248 9. Use JISX0208-1983 in place of JISX0208-1978?
2249 These specifications are encoded in `coding->flags' as flag bits
2250 defined by macros CODING_FLAG_ISO_XXX. See `coding.h' for more
2251 details.
2252 */
2253
/* Emit at DST the escape sequence that designates CHARSET to graphic
   register REG, advance DST past it, and record the designation in
   CODING.  When CODING carries a revision number below 255 for
   CHARSET, the sequence ESC '&' <'@' + revision> is emitted first.
   For a two-byte 94-char set designated to register 0 whose
   <final-char> is '@', 'A', or 'B', the intermediate byte is left out
   (short form) if CODING has CODING_FLAG_ISO_SHORT_FORM set.

   The macro reads and writes the variable `dst' of the expansion
   site, and evaluates its arguments more than once.  */

#define ENCODE_DESIGNATION(charset, reg, coding)                        \
  do {                                                                  \
    unsigned char final_char = CHARSET_ISO_FINAL_CHAR (charset);        \
    char *intermediate_char_94 = "()*+";                                \
    char *intermediate_char_96 = ",-./";                                \
    int revision = CODING_SPEC_ISO_REVISION_NUMBER(coding, charset);    \
    int designation_2byte = CHARSET_DIMENSION (charset) != 1;           \
    int designation_94 = CHARSET_CHARS (charset) == 94;                 \
                                                                        \
    if (revision < 255)                                                 \
      {                                                                 \
        *dst++ = ISO_CODE_ESC;                                          \
        *dst++ = '&';                                                   \
        *dst++ = '@' + revision;                                        \
      }                                                                 \
    *dst++ = ISO_CODE_ESC;                                              \
    /* Two-byte sets are announced by '$'.  */                          \
    if (designation_2byte)                                              \
      *dst++ = '$';                                                     \
    /* Intermediate byte selecting the register; only the two-byte     \
       94-char short form goes without one.  */                         \
    if (! designation_94)                                               \
      *dst++ = (unsigned char) (intermediate_char_96[reg]);             \
    else if (! designation_2byte                                        \
             || ! (coding->flags & CODING_FLAG_ISO_SHORT_FORM)          \
             || reg != 0                                                \
             || final_char < '@' || final_char > 'B')                   \
      *dst++ = (unsigned char) (intermediate_char_94[reg]);             \
    *dst++ = final_char;                                                \
    CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;                \
  } while (0)
2296
/* ISO2022 single-shift functions (single-shift-2 and single-shift-3).
   Each macro below emits one single shift at `dst': the two-byte
   escape sequence (ESC 'N' or ESC 'O') when CODING_FLAG_ISO_SEVEN_BITS
   is set in `coding->flags', the one-byte control code otherwise.
   Both then mark `coding' as being in the single-shifting state.  */

#define ENCODE_SINGLE_SHIFT_2                                   \
  do {                                                          \
    if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS))         \
      *dst++ = ISO_CODE_SS2;                                    \
    else                                                        \
      {                                                         \
        *dst++ = ISO_CODE_ESC;                                  \
        *dst++ = 'N';                                           \
      }                                                         \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;               \
  } while (0)

#define ENCODE_SINGLE_SHIFT_3                                   \
  do {                                                          \
    if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS))         \
      *dst++ = ISO_CODE_SS3;                                    \
    else                                                        \
      {                                                         \
        *dst++ = ISO_CODE_ESC;                                  \
        *dst++ = 'O';                                           \
      }                                                         \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;               \
  } while (0)
2318
/* ISO2022 locking-shift functions.  Each of the four macros below
   emits one locking shift at `dst' (shift-in and shift-out as single
   control codes, locking-shift-2 and locking-shift-3 as ESC 'n' and
   ESC 'o') and records in `coding' which graphic register is now
   invoked to graphic plane 0.  */

#define ENCODE_SHIFT_IN                                 \
  do {                                                  \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;         \
    *dst++ = ISO_CODE_SI;                               \
  } while (0)

#define ENCODE_SHIFT_OUT                                \
  do {                                                  \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;         \
    *dst++ = ISO_CODE_SO;                               \
  } while (0)

#define ENCODE_LOCKING_SHIFT_2                          \
  do {                                                  \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;         \
    *dst++ = ISO_CODE_ESC;                              \
    *dst++ = 'n';                                       \
  } while (0)

#define ENCODE_LOCKING_SHIFT_3                          \
  do {                                                  \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;         \
    *dst++ = ISO_CODE_ESC;                              \
    *dst++ = 'o';                                       \
  } while (0)
2346
/* Emit at `dst' the one byte encoding position-code C1 of the
   DIMENSION1 character set CHARSET.

   If a single shift is pending, the byte is emitted with the 8th bit
   cleared (7-bit environment) or set, and the pending state is
   cancelled.  Otherwise the byte goes out through whichever graphic
   plane CHARSET is currently invoked to (plane 0: 8th bit cleared,
   plane 1: 8th bit set).  If CHARSET is invoked to neither plane,
   encode_invocation_designation is called to produce the necessary
   designation/invocation codes and the checks are repeated.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
  do {                                                                  \
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
      {                                                                 \
        CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
        if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                 \
          *dst++ = c1 & 0x7F;                                           \
        else                                                            \
          *dst++ = c1 | 0x80;                                           \
        break;                                                          \
      }                                                                 \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))           \
      {                                                                 \
        *dst++ = c1 & 0x7F;                                             \
        break;                                                          \
      }                                                                 \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))           \
      {                                                                 \
        *dst++ = c1 | 0x80;                                             \
        break;                                                          \
      }                                                                 \
    /* CHARSET is not reachable through either plane yet: designate    \
       and/or invoke it, then go round again to emit the byte.  */      \
    dst = encode_invocation_designation (charset, coding, dst);         \
  } while (1)
2379
/* Emit at `dst' the two bytes encoding position-codes C1 and C2 of
   the DIMENSION2 character set CHARSET.

   If a single shift is pending, both bytes are emitted with the 8th
   bit cleared (7-bit environment) or set, and the pending state is
   cancelled.  Otherwise the bytes go out through whichever graphic
   plane CHARSET is currently invoked to (plane 0: 8th bit cleared,
   plane 1: 8th bit set).  If CHARSET is invoked to neither plane,
   encode_invocation_designation is called to produce the necessary
   designation/invocation codes and the checks are repeated.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
  do {                                                                  \
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
      {                                                                 \
        CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
        if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                 \
          {                                                             \
            *dst++ = c1 & 0x7F;                                         \
            *dst++ = c2 & 0x7F;                                         \
          }                                                             \
        else                                                            \
          {                                                             \
            *dst++ = c1 | 0x80;                                         \
            *dst++ = c2 | 0x80;                                         \
          }                                                             \
        break;                                                          \
      }                                                                 \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))           \
      {                                                                 \
        *dst++ = c1 & 0x7F;                                             \
        *dst++ = c2 & 0x7F;                                             \
        break;                                                          \
      }                                                                 \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))           \
      {                                                                 \
        *dst++ = c1 | 0x80;                                             \
        *dst++ = c2 | 0x80;                                             \
        break;                                                          \
      }                                                                 \
    /* CHARSET is not reachable through either plane yet: designate    \
       and/or invoke it, then go round again to emit the bytes.  */     \
    dst = encode_invocation_designation (charset, coding, dst);         \
  } while (1)
2412
/* Emit at `dst' the ISO2022 encoding of character C.

   C is split into its charset and position codes.  If the charset is
   not defined, the position codes are written out unchanged (the
   second one only when it is non-negative).  Otherwise the character
   is handed to the DIMENSION1 or DIMENSION2 emitter, after these
   substitutions requested by `coding->flags':
     CODING_FLAG_ISO_USE_ROMAN:  ASCII -> charset_latin_jisx0201
     CODING_FLAG_ISO_USE_OLDJIS: charset_jisx0208 -> charset_jisx0208_1978  */

#define ENCODE_ISO_CHARACTER(c)                                         \
  do {                                                                  \
    int charset, c1, c2;                                                \
                                                                        \
    SPLIT_CHAR (c, charset, c1, c2);                                    \
    if (! CHARSET_DEFINED_P (charset))                                  \
      {                                                                 \
        *dst++ = c1;                                                    \
        if (c2 >= 0)                                                    \
          *dst++ = c2;                                                  \
      }                                                                 \
    else if (CHARSET_DIMENSION (charset) != 1)                          \
      {                                                                 \
        if (charset == charset_jisx0208                                 \
            && coding->flags & CODING_FLAG_ISO_USE_OLDJIS)              \
          charset = charset_jisx0208_1978;                              \
        ENCODE_ISO_CHARACTER_DIMENSION2 (charset, c1, c2);              \
      }                                                                 \
    else                                                                \
      {                                                                 \
        if (charset == CHARSET_ASCII                                    \
            && coding->flags & CODING_FLAG_ISO_USE_ROMAN)               \
          charset = charset_latin_jisx0201;                             \
        ENCODE_ISO_CHARACTER_DIMENSION1 (charset, c1);                  \
      }                                                                 \
  } while (0)
2442
2443
/* Substitute for encoding character C: emit
   CODING_REPLACEMENT_CHARACTER once, or twice when the charset of C
   has a width greater than 1.  */

#define ENCODE_UNSAFE_CHARACTER(c)                                      \
  do {                                                                  \
    int unsafe_char_is_wide = CHARSET_WIDTH (CHAR_CHARSET (c)) > 1;     \
                                                                        \
    ENCODE_ISO_CHARACTER (CODING_REPLACEMENT_CHARACTER);                \
    if (unsafe_char_is_wide)                                            \
      ENCODE_ISO_CHARACTER (CODING_REPLACEMENT_CHARACTER);              \
  } while (0)
2452
2453
2454 /* Produce designation and invocation codes at a place pointed by DST
2455 to use CHARSET. The element `spec.iso2022' of *CODING is updated.
2456 Return new DST. */
2457
2458 unsigned char *
2459 encode_invocation_designation (charset, coding, dst)
2460 int charset;
2461 struct coding_system *coding;
2462 unsigned char *dst;
2463 {
2464 int reg; /* graphic register number */
2465
2466 /* At first, check designations. */
2467 for (reg = 0; reg < 4; reg++)
2468 if (charset == CODING_SPEC_ISO_DESIGNATION (coding, reg))
2469 break;
2470
2471 if (reg >= 4)
2472 {
2473 /* CHARSET is not yet designated to any graphic registers. */
2474 /* At first check the requested designation. */
2475 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
2476 if (reg == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
2477 /* Since CHARSET requests no special designation, designate it
2478 to graphic register 0. */
2479 reg = 0;
2480
2481 ENCODE_DESIGNATION (charset, reg, coding);
2482 }
2483
2484 if (CODING_SPEC_ISO_INVOCATION (coding, 0) != reg
2485 && CODING_SPEC_ISO_INVOCATION (coding, 1) != reg)
2486 {
2487 /* Since the graphic register REG is not invoked to any graphic
2488 planes, invoke it to graphic plane 0. */
2489 switch (reg)
2490 {
2491 case 0: /* graphic register 0 */
2492 ENCODE_SHIFT_IN;
2493 break;
2494
2495 case 1: /* graphic register 1 */
2496 ENCODE_SHIFT_OUT;
2497 break;
2498
2499 case 2: /* graphic register 2 */
2500 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2501 ENCODE_SINGLE_SHIFT_2;
2502 else
2503 ENCODE_LOCKING_SHIFT_2;
2504 break;
2505
2506 case 3: /* graphic register 3 */
2507 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2508 ENCODE_SINGLE_SHIFT_3;
2509 else
2510 ENCODE_LOCKING_SHIFT_3;
2511 break;
2512 }
2513 }
2514
2515 return dst;
2516 }
2517
/* Emit at `dst' the two bytes standing for the encoded composition
   rule RULE: the rule is decoded into its two reference points, which
   are written as 32 + 81 + <first> and 32 + <second>.  */

#define ENCODE_COMPOSITION_RULE(rule)                   \
  do {                                                  \
    int gref, nref;                                     \
                                                        \
    COMPOSITION_DECODE_RULE (rule, gref, nref);         \
    dst[0] = 32 + 81 + gref;                            \
    dst[1] = 32 + nref;                                 \
    dst += 2;                                           \
  } while (0)
2527
/* Emit at `dst' the escape sequence opening a composition and set up
   CODING for it.  DATA points to the integers describing the
   composition (see coding.h for the layout); DATA[3] is stored as the
   composition method.  ESC '0' is produced for COMPOSITION_RELATIVE,
   ESC '3' for COMPOSITION_WITH_ALTCHARS, and ESC '4' for any other
   method.  In the latter two cases the component index is set to the
   fifth element of the composition's data and the "rule follows" flag
   is cleared.  */

#define ENCODE_COMPOSITION_START(coding, data)                          \
  do {                                                                  \
    coding->composing = data[3];                                        \
    *dst++ = ISO_CODE_ESC;                                              \
    if (coding->composing != COMPOSITION_RELATIVE)                      \
      {                                                                 \
        coding->composition_rule_follows = 0;                           \
        coding->cmp_data_index = coding->cmp_data_start + 4;            \
        if (coding->composing == COMPOSITION_WITH_ALTCHARS)             \
          *dst++ = '3';                                                 \
        else                                                            \
          *dst++ = '4';                                                 \
      }                                                                 \
    else                                                                \
      *dst++ = '0';                                                     \
  } while (0)
2547
/* Emit ESC '1' at `dst' to close the current composition, mark CODING
   as no longer composing, and step CODING's composition-data cursor
   past this composition (DATA[0] elements).  When the cursor reaches
   the end of the current composition-data block and another block is
   chained after it, continue at the start of that next block.  */

#define ENCODE_COMPOSITION_END(coding, data)                            \
  do {                                                                  \
    *dst++ = ISO_CODE_ESC;                                              \
    *dst++ = '1';                                                       \
    coding->composing = COMPOSITION_NO;                                 \
    coding->cmp_data_start += data[0];                                  \
    if (coding->cmp_data->next                                          \
        && coding->cmp_data_start == coding->cmp_data->used)            \
      {                                                                 \
        coding->cmp_data_start = 0;                                     \
        coding->cmp_data = coding->cmp_data->next;                      \
      }                                                                 \
  } while (0)
2563
/* Emit the composition start sequence ESC '0' at `dst' and switch
   CODING to COMPOSITION_RELATIVE.  This does not open a new
   composition: it is used once the components (alternate chars and
   composition rules) of the current composition have been produced,
   to announce that the actual text follows in SRC.  */

#define ENCODE_COMPOSITION_FAKE_START(coding)           \
  do {                                                  \
    coding->composing = COMPOSITION_RELATIVE;           \
    dst[0] = ISO_CODE_ESC;                              \
    dst[1] = '0';                                       \
    dst += 2;                                           \
  } while (0)
2575
/* The following three macros produce codes for indicating direction
   of text.  All of them write at `dst' and consult `coding->flags' of
   the expansion site, and each expands to a single statement.  */

/* Emit the control sequence introducer: ESC '[' when
   CODING_FLAG_ISO_SEVEN_BITS is set in `coding->flags' (tested as a
   bit, since other flag bits are normally set as well), the one-byte
   CSI control code otherwise.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER              \
  do {                                                  \
    if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)     \
      *dst++ = ISO_CODE_ESC, *dst++ = '[';              \
    else                                                \
      *dst++ = ISO_CODE_CSI;                            \
  } while (0)

/* Emit "CSI 2 ]": start of right-to-left text.  */
#define ENCODE_DIRECTION_R2L                    \
  do {                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
    *dst++ = '2';                               \
    *dst++ = ']';                               \
  } while (0)

/* Emit "CSI 0 ]": end of the current direction (back to
   left-to-right).  */
#define ENCODE_DIRECTION_L2R                    \
  do {                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
    *dst++ = '0';                               \
    *dst++ = ']';                               \
  } while (0)
2591
/* Emit at `dst' the codes that bring the graphic planes and registers
   of `coding' back to their initial state: a shift-in if graphic
   plane 0 does not currently invoke register 0, then, for each of the
   four registers that has an initial designation differing from its
   current one, the sequence designating the initial charset again.  */
#define ENCODE_RESET_PLANE_AND_REGISTER                                     \
  do {                                                                      \
    int reg = 0;                                                            \
                                                                            \
    if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0)                        \
      ENCODE_SHIFT_IN;                                                      \
    while (reg < 4)                                                         \
      {                                                                     \
        if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg) >= 0)         \
          {                                                                 \
            if (CODING_SPEC_ISO_DESIGNATION (coding, reg)                   \
                != CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg))       \
              ENCODE_DESIGNATION                                            \
                (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg), reg, coding); \
          }                                                                 \
        reg++;                                                              \
      }                                                                     \
  } while (0)
2606
2607 /* Produce designation sequences of charsets in the line started from
2608 SRC to a place pointed by DST, and return updated DST.
2609
2610 If the current block ends before any end-of-line, we may fail to
2611 find all the necessary designations. */
2612
2613 static unsigned char *
2614 encode_designation_at_bol (coding, translation_table, src, src_end, dst)
2615 struct coding_system *coding;
2616 Lisp_Object translation_table;
2617 const unsigned char *src, *src_end;
2618 unsigned char *dst;
2619 {
2620 int charset, c, found = 0, reg;
2621 /* Table of charsets to be designated to each graphic register. */
2622 int r[4];
2623
2624 for (reg = 0; reg < 4; reg++)
2625 r[reg] = -1;
2626
2627 while (found < 4)
2628 {
2629 ONE_MORE_CHAR (c);
2630 if (c == '\n')
2631 break;
2632
2633 charset = CHAR_CHARSET (c);
2634 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
2635 if (reg != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION && r[reg] < 0)
2636 {
2637 found++;
2638 r[reg] = charset;
2639 }
2640 }
2641
2642 label_end_of_loop:
2643 if (found)
2644 {
2645 for (reg = 0; reg < 4; reg++)
2646 if (r[reg] >= 0
2647 && CODING_SPEC_ISO_DESIGNATION (coding, reg) != r[reg])
2648 ENCODE_DESIGNATION (r[reg], reg, coding);
2649 }
2650
2651 return dst;
2652 }
2653
/* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".

   Encode the SRC_BYTES bytes of text at SOURCE into an ISO2022 byte
   stream at DESTINATION according to CODING.  On return,
   CODING->consumed is the number of source bytes used up to the start
   of the last unfinished loop iteration, CODING->consumed_char counts
   the characters encoded, CODING->produced and CODING->produced_char
   are both the number of bytes written, and CODING->errors counts the
   non-ASCII single-byte characters that were copied through as is.
   CODING->result is set to CODING_FINISH_INSUFFICIENT_DST when the
   loop stops for lack of destination room.  */

static void
encode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
     struct coding_system *coding;
     const unsigned char *source;
     unsigned char *destination;
     int src_bytes, dst_bytes;
{
  const unsigned char *src = source;
  const unsigned char *src_end = source + src_bytes;
  unsigned char *dst = destination;
  unsigned char *dst_end = destination + dst_bytes;
  /* Since the maximum number of bytes produced by one loop iteration
     is 20, we subtract 19 from DST_END so that overflow needs to be
     checked only at the head of the loop.  */
  unsigned char *adjusted_dst_end = dst_end - 19;
  /* SRC_BASE remembers the start position in source in each loop.
     The loop will be exited when there's not enough source text to
     analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
     there's not enough destination area to produce encoded codes
     (within macro EMIT_BYTES).  */
  const unsigned char *src_base;
  int c;
  Lisp_Object translation_table;
  Lisp_Object safe_chars;

  /* A coding system flagged as "safe" never emits unencodable
     characters; they are replaced instead (see below).  */
  if (coding->flags & CODING_FLAG_ISO_SAFE)
    coding->mode |= CODING_MODE_INHIBIT_UNENCODABLE_CHAR;

  safe_chars = coding_safe_chars (coding->symbol);

  /* Pick the translation table: none when character translation is
     disabled, else the coding system's own table, falling back to the
     standard one.  */
  if (NILP (Venable_character_translation))
    translation_table = Qnil;
  else
    {
      translation_table = coding->translation_table_for_encode;
      if (NILP (translation_table))
        translation_table = Vstandard_translation_table_for_encode;
    }

  coding->consumed_char = 0;
  coding->errors = 0;
  while (1)
    {
      src_base = src;

      /* With DST_BYTES zero the limit is taken relative to SRC rather
         than to DST_END -- presumably the case where source and
         destination share one buffer; TODO confirm against callers.  */
      if (dst >= (dst_bytes ? adjusted_dst_end : (src - 19)))
        {
          coding->result = CODING_FINISH_INSUFFICIENT_DST;
          break;
        }

      if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL
          && CODING_SPEC_ISO_BOL (coding))
        {
          /* We have to produce designation sequences if any now.  */
          dst = encode_designation_at_bol (coding, translation_table,
                                           src, src_end, dst);
          CODING_SPEC_ISO_BOL (coding) = 0;
        }

      /* Check composition start and end.  */
      if (coding->composing != COMPOSITION_DISABLED
          && coding->cmp_data_start < coding->cmp_data->used)
        {
          struct composition_data *cmp_data = coding->cmp_data;
          /* DATA points to the description of the composition at the
             cursor; THIS_POS is the position of the next character to
             encode, in the same coordinates as DATA[1] and DATA[2].  */
          int *data = cmp_data->data + coding->cmp_data_start;
          int this_pos = cmp_data->char_offset + coding->consumed_char;

          if (coding->composing == COMPOSITION_RELATIVE)
            {
              if (this_pos == data[2])
                {
                  /* End of the current composition.  The macro may
                     move the cursor (even to the next data block), so
                     reload CMP_DATA and DATA afterwards.  */
                  ENCODE_COMPOSITION_END (coding, data);
                  cmp_data = coding->cmp_data;
                  data = cmp_data->data + coding->cmp_data_start;
                }
            }
          else if (COMPOSING_P (coding))
            {
              /* COMPOSITION_WITH_ALTCHARS or COMPOSITION_WITH_RULE_ALTCHARS  */
              if (coding->cmp_data_index == coding->cmp_data_start + data[0])
                /* We have consumed components of the composition.
                   What follows in SRC is the composition's base
                   text.  */
                ENCODE_COMPOSITION_FAKE_START (coding);
              else
                {
                  /* Emit the next component, taken from the
                     composition data rather than from SRC.  This C
                     shadows the function-level C.  */
                  int c = cmp_data->data[coding->cmp_data_index++];
                  if (coding->composition_rule_follows)
                    {
                      ENCODE_COMPOSITION_RULE (c);
                      coding->composition_rule_follows = 0;
                    }
                  else
                    {
                      if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR
                          && ! CODING_SAFE_CHAR_P (safe_chars, c))
                        ENCODE_UNSAFE_CHARACTER (c);
                      else
                        ENCODE_ISO_CHARACTER (c);
                      /* With rules, components alternate between a
                         character and a rule.  */
                      if (coding->composing == COMPOSITION_WITH_RULE_ALTCHARS)
                        coding->composition_rule_follows = 1;
                    }
                  /* No source character was consumed.  */
                  continue;
                }
            }
          if (!COMPOSING_P (coding))
            {
              if (this_pos == data[1])
                {
                  /* A composition starts at this position.  */
                  ENCODE_COMPOSITION_START (coding, data);
                  continue;
                }
            }
        }

      ONE_MORE_CHAR (c);

      /* Now encode the character C.  */
      if (c < 0x20 || c == 0x7F)
        {
          if (c == '\r')
            {
              if (! (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
                {
                  if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
                    ENCODE_RESET_PLANE_AND_REGISTER;
                  *dst++ = c;
                  /* NOTE(review): this `continue' bypasses the
                     `coding->consumed_char++' at the bottom of the
                     loop, unlike the other control characters --
                     confirm that this is intended.  */
                  continue;
                }
              /* fall down to treat '\r' as '\n' ...  */
              c = '\n';
            }
          if (c == '\n')
            {
              if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
                ENCODE_RESET_PLANE_AND_REGISTER;
              /* Forget the current designations without emitting
                 anything: only the recorded state is reset.  */
              if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL)
                bcopy (coding->spec.iso2022.initial_designation,
                       coding->spec.iso2022.current_designation,
                       sizeof coding->spec.iso2022.initial_designation);
              /* Write the end-of-line in the requested format; an
                 undecided format is written as LF.  */
              if (coding->eol_type == CODING_EOL_LF
                  || coding->eol_type == CODING_EOL_UNDECIDED)
                *dst++ = ISO_CODE_LF;
              else if (coding->eol_type == CODING_EOL_CRLF)
                *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
              else
                *dst++ = ISO_CODE_CR;
              CODING_SPEC_ISO_BOL (coding) = 1;
            }
          else
            {
              if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
                ENCODE_RESET_PLANE_AND_REGISTER;
              *dst++ = c;
            }
        }
      else if (ASCII_BYTE_P (c))
        ENCODE_ISO_CHARACTER (c);
      else if (SINGLE_BYTE_CHAR_P (c))
        {
          /* A non-ASCII single-byte character is copied as is and
             counted as an error.  */
          *dst++ = c;
          coding->errors++;
        }
      else if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR
               && ! CODING_SAFE_CHAR_P (safe_chars, c))
        ENCODE_UNSAFE_CHARACTER (c);
      else
        ENCODE_ISO_CHARACTER (c);

      coding->consumed_char++;
    }

 label_end_of_loop:
  /* Report progress up to the start of the last, unfinished
     iteration.  */
  coding->consumed = src_base - source;
  coding->produced = coding->produced_char = dst - destination;
}
2833
2834 \f
2835 /*** 4. SJIS and BIG5 handlers ***/
2836
2837 /* Although SJIS and BIG5 are not ISO coding systems, they are used
2838 quite widely. So, for the moment, Emacs supports them in the bare
2839 C code. But, in the future, they may be supported only by CCL. */
2840
2841 /* SJIS is a coding system encoding three character sets: ASCII, right
2842 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
2843 as is. A character of charset katakana-jisx0201 is encoded by
2844 "position-code + 0x80". A character of charset japanese-jisx0208
2845 is encoded in 2-byte but two position-codes are divided and shifted
2846 so that it fits in the range below.
2847
2848 --- CODE RANGE of SJIS ---
2849 (character set) (range)
2850 ASCII 0x00 .. 0x7F
2851 KATAKANA-JISX0201 0xA1 .. 0xDF
2852 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF
2853 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
2854 -------------------------------
2855
2856 */
2857
2858 /* BIG5 is a coding system encoding two character sets: ASCII and
2859 Big5. An ASCII character is encoded as is. Big5 is a two-byte
2860 character set and is encoded in two bytes.
2861
2862 --- CODE RANGE of BIG5 ---
2863 (character set) (range)
2864 ASCII 0x00 .. 0x7F
2865 Big5 (1st byte) 0xA1 .. 0xFE
2866 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
2867 --------------------------
2868
2869 Since the number of characters in Big5 is larger than maximum
2870 characters in Emacs' charset (96x96), it can't be handled as one
2871 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
2872 and `charset-big5-2'. Both are DIMENSION2 and CHARS94. The former
2873 contains frequently used characters and the latter contains less
2874 frequently used characters. */
2875
2876 /* Macros to decode or encode a character of Big5 in BIG5. B1 and B2
2877 are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
2878 C1 and C2 are the 1st and 2nd position-codes of Emacs' internal
2879 format. CHARSET is `charset_big5_1' or `charset_big5_2'. */
2880
/* Number of Big5 characters which have the same code in 1st byte:
   the (0x7F - 0x40) second bytes of the low range plus the
   (0xFF - 0xA1) second bytes of the high range.  */
#define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)

/* Convert the Big5 code B1 B2 to Emacs' internal representation: set
   CHARSET to charset_big5_1 (B1 below 0xC9) or charset_big5_2, and C1
   and C2 to the position-codes within that charset.

   B1 and B2 may be arbitrary expressions (they are parenthesized),
   but they are evaluated more than once.  All inputs are read before
   any output is stored, so C1 and C2 may name the same variables as
   B1 and B2.  */
#define DECODE_BIG5(b1, b2, charset, c1, c2)                            \
  do {                                                                  \
    unsigned int temp                                                   \
      = (((b1) - 0xA1) * BIG5_SAME_ROW                                  \
         + (b2) - ((b2) < 0x7F ? 0x40 : 0x62));                         \
    if ((b1) < 0xC9)                                                    \
      (charset) = charset_big5_1;                                       \
    else                                                                \
      {                                                                 \
        (charset) = charset_big5_2;                                     \
        temp -= (0xC9 - 0xA1) * BIG5_SAME_ROW;                          \
      }                                                                 \
    (c1) = temp / (0xFF - 0xA1) + 0x21;                                 \
    (c2) = temp % (0xFF - 0xA1) + 0x21;                                 \
  } while (0)

/* Inverse of DECODE_BIG5: convert position-codes C1 and C2 of CHARSET
   (charset_big5_1 or charset_big5_2) to the Big5 code B1 B2.

   C1 and C2 may be arbitrary expressions (they are parenthesized).
   Both are read before any output is stored, so B1 and B2 may name
   the same variables as C1 and C2.  */
#define ENCODE_BIG5(charset, c1, c2, b1, b2)                            \
  do {                                                                  \
    unsigned int temp                                                   \
      = ((c1) - 0x21) * (0xFF - 0xA1) + ((c2) - 0x21);                  \
    if ((charset) == charset_big5_2)                                    \
      temp += BIG5_SAME_ROW * (0xC9 - 0xA1);                            \
    (b1) = temp / BIG5_SAME_ROW + 0xA1;                                 \
    (b2) = temp % BIG5_SAME_ROW;                                        \
    (b2) += (b2) < 0x3F ? 0x40 : 0x62;                                  \
  } while (0)
2908
2909 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2910 Check if a text is encoded in SJIS. If it is, return
2911 CODING_CATEGORY_MASK_SJIS, else return 0. */
2912
2913 static int
2914 detect_coding_sjis (src, src_end, multibytep)
2915 unsigned char *src, *src_end;
2916 int multibytep;
2917 {
2918 int c;
2919 /* Dummy for ONE_MORE_BYTE. */
2920 struct coding_system dummy_coding;
2921 struct coding_system *coding = &dummy_coding;
2922
2923 while (1)
2924 {
2925 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, CODING_CATEGORY_MASK_SJIS);
2926 if (c < 0x80)
2927 continue;
2928 if (c == 0x80 || c == 0xA0 || c > 0xEF)
2929 return 0;
2930 if (c <= 0x9F || c >= 0xE0)
2931 {
2932 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, 0);
2933 if (c < 0x40 || c == 0x7F || c > 0xFC)
2934 return 0;
2935 }
2936 }
2937 }
2938
2939 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2940 Check if a text is encoded in BIG5. If it is, return
2941 CODING_CATEGORY_MASK_BIG5, else return 0. */
2942
2943 static int
2944 detect_coding_big5 (src, src_end, multibytep)
2945 unsigned char *src, *src_end;
2946 int multibytep;
2947 {
2948 int c;
2949 /* Dummy for ONE_MORE_BYTE. */
2950 struct coding_system dummy_coding;
2951 struct coding_system *coding = &dummy_coding;
2952
2953 while (1)
2954 {
2955 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, CODING_CATEGORY_MASK_BIG5);
2956 if (c < 0x80)
2957 continue;
2958 if (c < 0xA1 || c > 0xFE)
2959 return 0;
2960 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, 0);
2961 if (c < 0x40 || (c > 0x7F && c < 0xA1) || c > 0xFE)
2962 return 0;
2963 }
2964 }
2965
2966 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2967 Check if a text is encoded in UTF-8. If it is, return
2968 CODING_CATEGORY_MASK_UTF_8, else return 0. */
2969
2970 #define UTF_8_1_OCTET_P(c) ((c) < 0x80)
2971 #define UTF_8_EXTRA_OCTET_P(c) (((c) & 0xC0) == 0x80)
2972 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
2973 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
2974 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
2975 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
2976 #define UTF_8_6_OCTET_LEADING_P(c) (((c) & 0xFE) == 0xFC)
2977
2978 static int
2979 detect_coding_utf_8 (src, src_end, multibytep)
2980 unsigned char *src, *src_end;
2981 int multibytep;
2982 {
2983 unsigned char c;
2984 int seq_maybe_bytes;
2985 /* Dummy for ONE_MORE_BYTE. */
2986 struct coding_system dummy_coding;
2987 struct coding_system *coding = &dummy_coding;
2988
2989 while (1)
2990 {
2991 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, CODING_CATEGORY_MASK_UTF_8);
2992 if (UTF_8_1_OCTET_P (c))
2993 continue;
2994 else if (UTF_8_2_OCTET_LEADING_P (c))
2995 seq_maybe_bytes = 1;
2996 else if (UTF_8_3_OCTET_LEADING_P (c))
2997 seq_maybe_bytes = 2;
2998 else if (UTF_8_4_OCTET_LEADING_P (c))
2999 seq_maybe_bytes = 3;
3000 else if (UTF_8_5_OCTET_LEADING_P (c))
3001 seq_maybe_bytes = 4;
3002 else if (UTF_8_6_OCTET_LEADING_P (c))
3003 seq_maybe_bytes = 5;
3004 else
3005 return 0;
3006
3007 do
3008 {
3009 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, 0);
3010 if (!UTF_8_EXTRA_OCTET_P (c))
3011 return 0;
3012 seq_maybe_bytes--;
3013 }
3014 while (seq_maybe_bytes > 0);
3015 }
3016 }
3017
/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
   Check if a text starts with a UTF-16 byte order mark.  If the
   first two bytes are 0xFE 0xFF, return
   CODING_CATEGORY_MASK_UTF_16_BE; if they are 0xFF 0xFE, return
   CODING_CATEGORY_MASK_UTF_16_LE; else return 0.  */
3023
/* Nonzero if the 16-bit value VAL is one of the two code values that
   are never valid in UTF-16 text.  */
#define UTF_16_INVALID_P(val)   \
  (((val) == 0xFFFE)            \
   || ((val) == 0xFFFF))

/* Nonzero if VAL is a high (leading) surrogate, 0xD800..0xDBFF.  The
   top six bits identify the surrogate kind, hence the 0xFC00 mask.  */
#define UTF_16_HIGH_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xD800)

/* Nonzero if VAL is a low (trailing) surrogate, 0xDC00..0xDFFF.  */
#define UTF_16_LOW_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xDC00)
3033
3034 static int
3035 detect_coding_utf_16 (src, src_end, multibytep)
3036 unsigned char *src, *src_end;
3037 int multibytep;
3038 {
3039 unsigned char c1, c2;
3040 /* Dummy for ONE_MORE_BYTE_CHECK_MULTIBYTE. */
3041 struct coding_system dummy_coding;
3042 struct coding_system *coding = &dummy_coding;
3043
3044 ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep, 0);
3045 ONE_MORE_BYTE_CHECK_MULTIBYTE (c2, multibytep, 0);
3046
3047 if ((c1 == 0xFF) && (c2 == 0xFE))
3048 return CODING_CATEGORY_MASK_UTF_16_LE;
3049 else if ((c1 == 0xFE) && (c2 == 0xFF))
3050 return CODING_CATEGORY_MASK_UTF_16_BE;
3051 return 0;
3052 }
3053
3054 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
3055 If SJIS_P is 1, decode SJIS text, else decode BIG5 test. */
3056
3057 static void
3058 decode_coding_sjis_big5 (coding, source, destination,
3059 src_bytes, dst_bytes, sjis_p)
3060 struct coding_system *coding;
3061 const unsigned char *source;
3062 unsigned char *destination;
3063 int src_bytes, dst_bytes;
3064 int sjis_p;
3065 {
3066 const unsigned char *src = source;
3067 const unsigned char *src_end = source + src_bytes;
3068 unsigned char *dst = destination;
3069 unsigned char *dst_end = destination + dst_bytes;
3070 /* SRC_BASE remembers the start position in source in each loop.
3071 The loop will be exited when there's not enough source code
3072 (within macro ONE_MORE_BYTE), or when there's not enough
3073 destination area to produce a character (within macro
3074 EMIT_CHAR). */
3075 const unsigned char *src_base;
3076 Lisp_Object translation_table;
3077
3078 if (NILP (Venable_character_translation))
3079 translation_table = Qnil;
3080 else
3081 {
3082 translation_table = coding->translation_table_for_decode;
3083 if (NILP (translation_table))
3084 translation_table = Vstandard_translation_table_for_decode;
3085 }
3086
3087 coding->produced_char = 0;
3088 while (1)
3089 {
3090 int c, charset, c1, c2 = 0;
3091
3092 src_base = src;
3093 ONE_MORE_BYTE (c1);
3094
3095 if (c1 < 0x80)
3096 {
3097 charset = CHARSET_ASCII;
3098 if (c1 < 0x20)
3099 {
3100 if (c1 == '\r')
3101 {
3102 if (coding->eol_type == CODING_EOL_CRLF)
3103 {
3104 ONE_MORE_BYTE (c2);
3105 if (c2 == '\n')
3106 c1 = c2;
3107 else
3108 /* To process C2 again, SRC is subtracted by 1. */
3109 src--;
3110 }
3111 else if (coding->eol_type == CODING_EOL_CR)
3112 c1 = '\n';
3113 }
3114 else if (c1 == '\n'
3115 && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
3116 && (coding->eol_type == CODING_EOL_CR
3117 || coding->eol_type == CODING_EOL_CRLF))
3118 {
3119 coding->result = CODING_FINISH_INCONSISTENT_EOL;
3120 goto label_end_of_loop;
3121 }
3122 }
3123 }
3124 else
3125 {
3126 if (sjis_p)
3127 {
3128 if (c1 == 0x80 || c1 == 0xA0 || c1 > 0xEF)
3129 goto label_invalid_code;
3130 if (c1 <= 0x9F || c1 >= 0xE0)
3131 {
3132 /* SJIS -> JISX0208 */
3133 ONE_MORE_BYTE (c2);
3134 if (c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
3135 goto label_invalid_code;
3136 DECODE_SJIS (c1, c2, c1, c2);
3137 charset = charset_jisx0208;
3138 }
3139 else
3140 /* SJIS -> JISX0201-Kana */
3141 charset = charset_katakana_jisx0201;
3142 }
3143 else
3144 {
3145 /* BIG5 -> Big5 */
3146 if (c1 < 0xA0 || c1 > 0xFE)
3147 goto label_invalid_code;
3148 ONE_MORE_BYTE (c2);
3149 if (c2 < 0x40 || (c2 > 0x7E && c2 < 0xA1) || c2 > 0xFE)
3150 goto label_invalid_code;
3151 DECODE_BIG5 (c1, c2, charset, c1, c2);
3152 }
3153 }
3154
3155 c = DECODE_ISO_CHARACTER (charset, c1, c2);
3156 EMIT_CHAR (c);
3157 continue;
3158
3159 label_invalid_code:
3160 coding->errors++;
3161 src = src_base;
3162 c = *src++;
3163 EMIT_CHAR (c);
3164 }
3165
3166 label_end_of_loop:
3167 coding->consumed = coding->consumed_char = src_base - source;
3168 coding->produced = dst - destination;
3169 return;
3170 }
3171
3172 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
3173 This function can encode charsets `ascii', `katakana-jisx0201',
3174 `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'. We
3175 are sure that all these charsets are registered as official charset
3176 (i.e. do not have extended leading-codes). Characters of other
3177 charsets are produced without any encoding. If SJIS_P is 1, encode
3178 SJIS text, else encode BIG5 text. */
3179
3180 static void
3181 encode_coding_sjis_big5 (coding, source, destination,
3182 src_bytes, dst_bytes, sjis_p)
3183 struct coding_system *coding;
3184 unsigned char *source, *destination;
3185 int src_bytes, dst_bytes;
3186 int sjis_p;
3187 {
3188 unsigned char *src = source;
3189 unsigned char *src_end = source + src_bytes;
3190 unsigned char *dst = destination;
3191 unsigned char *dst_end = destination + dst_bytes;
3192 /* SRC_BASE remembers the start position in source in each loop.
3193 The loop will be exited when there's not enough source text to
3194 analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
3195 there's not enough destination area to produce encoded codes
3196 (within macro EMIT_BYTES). */
3197 unsigned char *src_base;
3198 Lisp_Object translation_table;
3199
3200 if (NILP (Venable_character_translation))
3201 translation_table = Qnil;
3202 else
3203 {
3204 translation_table = coding->translation_table_for_encode;
3205 if (NILP (translation_table))
3206 translation_table = Vstandard_translation_table_for_encode;
3207 }
3208
3209 while (1)
3210 {
3211 int c, charset, c1, c2;
3212
3213 src_base = src;
3214 ONE_MORE_CHAR (c);
3215
3216 /* Now encode the character C. */
3217 if (SINGLE_BYTE_CHAR_P (c))
3218 {
3219 switch (c)
3220 {
3221 case '\r':
3222 if (!(coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
3223 {
3224 EMIT_ONE_BYTE (c);
3225 break;
3226 }
3227 c = '\n';
3228 case '\n':
3229 if (coding->eol_type == CODING_EOL_CRLF)
3230 {
3231 EMIT_TWO_BYTES ('\r', c);
3232 break;
3233 }
3234 else if (coding->eol_type == CODING_EOL_CR)
3235 c = '\r';
3236 default:
3237 EMIT_ONE_BYTE (c);
3238 }
3239 }
3240 else
3241 {
3242 SPLIT_CHAR (c, charset, c1, c2);
3243 if (sjis_p)
3244 {
3245 if (charset == charset_jisx0208
3246 || charset == charset_jisx0208_1978)
3247 {
3248 ENCODE_SJIS (c1, c2, c1, c2);
3249 EMIT_TWO_BYTES (c1, c2);
3250 }
3251 else if (charset == charset_katakana_jisx0201)
3252 EMIT_ONE_BYTE (c1 | 0x80);
3253 else if (charset == charset_latin_jisx0201)
3254 EMIT_ONE_BYTE (c1);
3255 else if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR)
3256 {
3257 EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER);
3258 if (CHARSET_WIDTH (charset) > 1)
3259 EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER);
3260 }
3261 else
3262 /* There's no way other than producing the internal
3263 codes as is. */
3264 EMIT_BYTES (src_base, src);
3265 }
3266 else
3267 {
3268 if (charset == charset_big5_1 || charset == charset_big5_2)
3269 {
3270 ENCODE_BIG5 (charset, c1, c2, c1, c2);
3271 EMIT_TWO_BYTES (c1, c2);
3272 }
3273 else if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR)
3274 {
3275 EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER);
3276 if (CHARSET_WIDTH (charset) > 1)
3277 EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER);
3278 }
3279 else
3280 /* There's no way other than producing the internal
3281 codes as is. */
3282 EMIT_BYTES (src_base, src);
3283 }
3284 }
3285 coding->consumed_char++;
3286 }
3287
3288 label_end_of_loop:
3289 coding->consumed = src_base - source;
3290 coding->produced = coding->produced_char = dst - destination;
3291 }
3292
3293 \f
3294 /*** 5. CCL handlers ***/
3295
3296 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3297 Check if a text is encoded in a coding system of which
3298 encoder/decoder are written in CCL program. If it is, return
3299 CODING_CATEGORY_MASK_CCL, else return 0. */
3300
3301 static int
3302 detect_coding_ccl (src, src_end, multibytep)
3303 unsigned char *src, *src_end;
3304 int multibytep;
3305 {
3306 unsigned char *valid;
3307 int c;
3308 /* Dummy for ONE_MORE_BYTE. */
3309 struct coding_system dummy_coding;
3310 struct coding_system *coding = &dummy_coding;
3311
3312 /* No coding system is assigned to coding-category-ccl. */
3313 if (!coding_system_table[CODING_CATEGORY_IDX_CCL])
3314 return 0;
3315
3316 valid = coding_system_table[CODING_CATEGORY_IDX_CCL]->spec.ccl.valid_codes;
3317 while (1)
3318 {
3319 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, CODING_CATEGORY_MASK_CCL);
3320 if (! valid[c])
3321 return 0;
3322 }
3323 }
3324
3325 \f
3326 /*** 6. End-of-line handlers ***/
3327
3328 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
3329
3330 static void
3331 decode_eol (coding, source, destination, src_bytes, dst_bytes)
3332 struct coding_system *coding;
3333 const unsigned char *source;
3334 unsigned char *destination;
3335 int src_bytes, dst_bytes;
3336 {
3337 const unsigned char *src = source;
3338 unsigned char *dst = destination;
3339 const unsigned char *src_end = src + src_bytes;
3340 unsigned char *dst_end = dst + dst_bytes;
3341 Lisp_Object translation_table;
3342 /* SRC_BASE remembers the start position in source in each loop.
3343 The loop will be exited when there's not enough source code
3344 (within macro ONE_MORE_BYTE), or when there's not enough
3345 destination area to produce a character (within macro
3346 EMIT_CHAR). */
3347 const unsigned char *src_base;
3348 int c;
3349
3350 translation_table = Qnil;
3351 switch (coding->eol_type)
3352 {
3353 case CODING_EOL_CRLF:
3354 while (1)
3355 {
3356 src_base = src;
3357 ONE_MORE_BYTE (c);
3358 if (c == '\r')
3359 {
3360 ONE_MORE_BYTE (c);
3361 if (c != '\n')
3362 {
3363 src--;
3364 c = '\r';
3365 }
3366 }
3367 else if (c == '\n'
3368 && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL))
3369 {
3370 coding->result = CODING_FINISH_INCONSISTENT_EOL;
3371 goto label_end_of_loop;
3372 }
3373 EMIT_CHAR (c);
3374 }
3375 break;
3376
3377 case CODING_EOL_CR:
3378 while (1)
3379 {
3380 src_base = src;
3381 ONE_MORE_BYTE (c);
3382 if (c == '\n')
3383 {
3384 if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
3385 {
3386 coding->result = CODING_FINISH_INCONSISTENT_EOL;
3387 goto label_end_of_loop;
3388 }
3389 }
3390 else if (c == '\r')
3391 c = '\n';
3392 EMIT_CHAR (c);
3393 }
3394 break;
3395
3396 default: /* no need for EOL handling */
3397 while (1)
3398 {
3399 src_base = src;
3400 ONE_MORE_BYTE (c);
3401 EMIT_CHAR (c);
3402 }
3403 }
3404
3405 label_end_of_loop:
3406 coding->consumed = coding->consumed_char = src_base - source;
3407 coding->produced = dst - destination;
3408 return;
3409 }
3410
3411 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions". Encode
3412 format of end-of-line according to `coding->eol_type'. It also
3413 convert multibyte form 8-bit characters to unibyte if
3414 CODING->src_multibyte is nonzero. If `coding->mode &
3415 CODING_MODE_SELECTIVE_DISPLAY' is nonzero, code '\r' in source text
3416 also means end-of-line. */
3417
3418 static void
3419 encode_eol (coding, source, destination, src_bytes, dst_bytes)
3420 struct coding_system *coding;
3421 const unsigned char *source;
3422 unsigned char *destination;
3423 int src_bytes, dst_bytes;
3424 {
3425 const unsigned char *src = source;
3426 unsigned char *dst = destination;
3427 const unsigned char *src_end = src + src_bytes;
3428 unsigned char *dst_end = dst + dst_bytes;
3429 Lisp_Object translation_table;
3430 /* SRC_BASE remembers the start position in source in each loop.
3431 The loop will be exited when there's not enough source text to
3432 analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
3433 there's not enough destination area to produce encoded codes
3434 (within macro EMIT_BYTES). */
3435 const unsigned char *src_base;
3436 unsigned char *tmp;
3437 int c;
3438 int selective_display = coding->mode & CODING_MODE_SELECTIVE_DISPLAY;
3439
3440 translation_table = Qnil;
3441 if (coding->src_multibyte
3442 && *(src_end - 1) == LEADING_CODE_8_BIT_CONTROL)
3443 {
3444 src_end--;
3445 src_bytes--;
3446 coding->result = CODING_FINISH_INSUFFICIENT_SRC;
3447 }
3448
3449 if (coding->eol_type == CODING_EOL_CRLF)
3450 {
3451 while (src < src_end)
3452 {
3453 src_base = src;
3454 c = *src++;
3455 if (c >= 0x20)
3456 EMIT_ONE_BYTE (c);
3457 else if (c == '\n' || (c == '\r' && selective_display))
3458 EMIT_TWO_BYTES ('\r', '\n');
3459 else
3460 EMIT_ONE_BYTE (c);
3461 }
3462 src_base = src;
3463 label_end_of_loop:
3464 ;
3465 }
3466 else
3467 {
3468 if (!dst_bytes || src_bytes <= dst_bytes)
3469 {
3470 safe_bcopy (src, dst, src_bytes);
3471 src_base = src_end;
3472 dst += src_bytes;
3473 }
3474 else
3475 {
3476 if (coding->src_multibyte
3477 && *(src + dst_bytes - 1) == LEADING_CODE_8_BIT_CONTROL)
3478 dst_bytes--;
3479 safe_bcopy (src, dst, dst_bytes);
3480 src_base = src + dst_bytes;
3481 dst = destination + dst_bytes;
3482 coding->result = CODING_FINISH_INSUFFICIENT_DST;
3483 }
3484 if (coding->eol_type == CODING_EOL_CR)
3485 {
3486 for (tmp = destination; tmp < dst; tmp++)
3487 if (*tmp == '\n') *tmp = '\r';
3488 }
3489 else if (selective_display)
3490 {
3491 for (tmp = destination; tmp < dst; tmp++)
3492 if (*tmp == '\r') *tmp = '\n';
3493 }
3494 }
3495 if (coding->src_multibyte)
3496 dst = destination + str_as_unibyte (destination, dst - destination);
3497
3498 coding->consumed = src_base - source;
3499 coding->produced = dst - destination;
3500 coding->produced_char = coding->produced;
3501 }
3502
3503 \f
3504 /*** 7. C library functions ***/
3505
3506 /* In Emacs Lisp, a coding system is represented by a Lisp symbol which
3507 has a property `coding-system'. The value of this property is a
3508 vector of length 5 (called the coding-vector). Among elements of
3509 this vector, the first (element[0]) and the fifth (element[4])
3510 carry important information for decoding/encoding. Before
3511 decoding/encoding, this information should be set in fields of a
3512 structure of type `coding_system'.
3513
3514 The value of the property `coding-system' can be a symbol of another
3515 subsidiary coding-system. In that case, Emacs gets coding-vector
3516 from that symbol.
3517
   `element[0]' contains information to be set in `coding->type'.  The
   values and their meanings are as follows:

   0 -- coding_type_emacs_mule
   1 -- coding_type_sjis
   2 -- coding_type_iso2022
   3 -- coding_type_big5
   4 -- coding_type_ccl encoder/decoder written in CCL
   5 -- coding_type_raw_text
   nil -- coding_type_no_conversion
   t -- coding_type_undecided (automatic conversion on decoding,
        no-conversion on encoding)
3529
3530 `element[4]' contains information to be set in `coding->flags' and
3531 `coding->spec'. The meaning varies by `coding->type'.
3532
   If `coding->type' is `coding_type_iso2022', element[4] is a vector
   of length 32 (of which the first 17 sub-elements are used now).
   Meanings of these sub-elements are:
3536
3537 sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
3538 If the value is an integer of valid charset, the charset is
3539 assumed to be designated to graphic register N initially.
3540
3541 If the value is minus, it is a minus value of charset which
3542 reserves graphic register N, which means that the charset is
3543 not designated initially but should be designated to graphic
3544 register N just before encoding a character in that charset.
3545
3546 If the value is nil, graphic register N is never used on
3547 encoding.
3548
   sub-element[N] where N is 4 through 16: to be set in `coding->flags'
   Each value takes t or nil.  See the section ISO2022 of
   `coding.h' for more information.
3552
3553 If `coding->type' is `coding_type_big5', element[4] is t to denote
3554 BIG5-ETen or nil to denote BIG5-HKU.
3555
3556 If `coding->type' takes the other value, element[4] is ignored.
3557
3558 Emacs Lisp's coding systems also carry information about format of
3559 end-of-line in a value of property `eol-type'. If the value is
3560 integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
3561 means CODING_EOL_CR. If it is not integer, it should be a vector
3562 of subsidiary coding systems of which property `eol-type' has one
3563 of the above values.
3564
3565 */
3566
3567 /* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
3568 and set it in CODING. If CODING_SYSTEM_SYMBOL is invalid, CODING
3569 is setup so that no conversion is necessary and return -1, else
3570 return 0. */
3571
3572 int
3573 setup_coding_system (coding_system, coding)
3574 Lisp_Object coding_system;
3575 struct coding_system *coding;
3576 {
3577 Lisp_Object coding_spec, coding_type, eol_type, plist;
3578 Lisp_Object val;
3579
3580 /* At first, zero clear all members. */
3581 bzero (coding, sizeof (struct coding_system));
3582
3583 /* Initialize some fields required for all kinds of coding systems. */
3584 coding->symbol = coding_system;
3585 coding->heading_ascii = -1;
3586 coding->post_read_conversion = coding->pre_write_conversion = Qnil;
3587 coding->composing = COMPOSITION_DISABLED;
3588 coding->cmp_data = NULL;
3589
3590 if (NILP (coding_system))
3591 goto label_invalid_coding_system;
3592
3593 coding_spec = Fget (coding_system, Qcoding_system);
3594
3595 if (!VECTORP (coding_spec)
3596 || XVECTOR (coding_spec)->size != 5
3597 || !CONSP (XVECTOR (coding_spec)->contents[3]))
3598 goto label_invalid_coding_system;
3599
3600 eol_type = inhibit_eol_conversion ? Qnil : Fget (coding_system, Qeol_type);
3601 if (VECTORP (eol_type))
3602 {
3603 coding->eol_type = CODING_EOL_UNDECIDED;
3604 coding->common_flags = CODING_REQUIRE_DETECTION_MASK;
3605 if (system_eol_type != CODING_EOL_LF)
3606 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
3607 }
3608 else if (XFASTINT (eol_type) == 1)
3609 {
3610 coding->eol_type = CODING_EOL_CRLF;
3611 coding->common_flags
3612 = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3613 }
3614 else if (XFASTINT (eol_type) == 2)
3615 {
3616 coding->eol_type = CODING_EOL_CR;
3617 coding->common_flags
3618 = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3619 }
3620 else
3621 {
3622 coding->common_flags = 0;
3623 coding->eol_type = CODING_EOL_LF;
3624 }
3625
3626 coding_type = XVECTOR (coding_spec)->contents[0];
3627 /* Try short cut. */
3628 if (SYMBOLP (coding_type))
3629 {
3630 if (EQ (coding_type, Qt))
3631 {
3632 coding->type = coding_type_undecided;
3633 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
3634 }
3635 else
3636 coding->type = coding_type_no_conversion;
3637 /* Initialize this member. Any thing other than
3638 CODING_CATEGORY_IDX_UTF_16_BE and
3639 CODING_CATEGORY_IDX_UTF_16_LE are ok because they have
3640 special treatment in detect_eol. */
3641 coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE;
3642
3643 return 0;
3644 }
3645
3646 /* Get values of coding system properties:
3647 `post-read-conversion', `pre-write-conversion',
3648 `translation-table-for-decode', `translation-table-for-encode'. */
3649 plist = XVECTOR (coding_spec)->contents[3];
3650 /* Pre & post conversion functions should be disabled if
3651 inhibit_eol_conversion is nonzero. This is the case that a code
3652 conversion function is called while those functions are running. */
3653 if (! inhibit_pre_post_conversion)
3654 {
3655 coding->post_read_conversion = Fplist_get (plist, Qpost_read_conversion);
3656 coding->pre_write_conversion = Fplist_get (plist, Qpre_write_conversion);
3657 }
3658 val = Fplist_get (plist, Qtranslation_table_for_decode);
3659 if (SYMBOLP (val))
3660 val = Fget (val, Qtranslation_table_for_decode);
3661 coding->translation_table_for_decode = CHAR_TABLE_P (val) ? val : Qnil;
3662 val = Fplist_get (plist, Qtranslation_table_for_encode);
3663 if (SYMBOLP (val))
3664 val = Fget (val, Qtranslation_table_for_encode);
3665 coding->translation_table_for_encode = CHAR_TABLE_P (val) ? val : Qnil;
3666 val = Fplist_get (plist, Qcoding_category);
3667 if (!NILP (val))
3668 {
3669 val = Fget (val, Qcoding_category_index);
3670 if (INTEGERP (val))
3671 coding->category_idx = XINT (val);
3672 else
3673 goto label_invalid_coding_system;
3674 }
3675 else
3676 goto label_invalid_coding_system;
3677
3678 /* If the coding system has non-nil `composition' property, enable
3679 composition handling. */
3680 val = Fplist_get (plist, Qcomposition);
3681 if (!NILP (val))
3682 coding->composing = COMPOSITION_NO;
3683
3684 /* If the coding system is ascii-incompatible, record it in
3685 common_flags. */
3686 val = Fplist_get (plist, Qascii_incompatible);
3687 if (! NILP (val))
3688 coding->common_flags |= CODING_ASCII_INCOMPATIBLE_MASK;
3689
3690 switch (XFASTINT (coding_type))
3691 {
3692 case 0:
3693 coding->type = coding_type_emacs_mule;
3694 coding->common_flags
3695 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3696 if (!NILP (coding->post_read_conversion))
3697 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
3698 if (!NILP (coding->pre_write_conversion))
3699 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
3700 break;
3701
3702 case 1:
3703 coding->type = coding_type_sjis;
3704 coding->common_flags
3705 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3706 break;
3707
3708 case 2:
3709 coding->type = coding_type_iso2022;
3710 coding->common_flags
3711 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3712 {
3713 Lisp_Object val, temp;
3714 Lisp_Object *flags;
3715 int i, charset, reg_bits = 0;
3716
3717 val = XVECTOR (coding_spec)->contents[4];
3718
3719 if (!VECTORP (val) || XVECTOR (val)->size != 32)
3720 goto label_invalid_coding_system;
3721
3722 flags = XVECTOR (val)->contents;
3723 coding->flags
3724 = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
3725 | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
3726 | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
3727 | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
3728 | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
3729 | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
3730 | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
3731 | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
3732 | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION)
3733 | (NILP (flags[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL)
3734 | (NILP (flags[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL)
3735 | (NILP (flags[15]) ? 0 : CODING_FLAG_ISO_SAFE)
3736 | (NILP (flags[16]) ? 0 : CODING_FLAG_ISO_LATIN_EXTRA)
3737 );
3738
3739 /* Invoke graphic register 0 to plane 0. */
3740 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
3741 /* Invoke graphic register 1 to plane 1 if we can use full 8-bit. */
3742 CODING_SPEC_ISO_INVOCATION (coding, 1)
3743 = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
3744 /* Not single shifting at first. */
3745 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;
3746 /* Beginning of buffer should also be regarded as bol. */
3747 CODING_SPEC_ISO_BOL (coding) = 1;
3748
3749 for (charset = 0; charset <= MAX_CHARSET; charset++)
3750 CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = 255;
3751 val = Vcharset_revision_alist;
3752 while (CONSP (val))
3753 {
3754 charset = get_charset_id (Fcar_safe (XCAR (val)));
3755 if (charset >= 0
3756 && (temp = Fcdr_safe (XCAR (val)), INTEGERP (temp))
3757 && (i = XINT (temp), (i >= 0 && (i + '@') < 128)))
3758 CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = i;
3759 val = XCDR (val);
3760 }
3761
3762 /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
3763 FLAGS[REG] can be one of below:
3764 integer CHARSET: CHARSET occupies register I,
3765 t: designate nothing to REG initially, but can be used
3766 by any charsets,
3767 list of integer, nil, or t: designate the first
3768 element (if integer) to REG initially, the remaining
3769 elements (if integer) is designated to REG on request,
3770 if an element is t, REG can be used by any charsets,
3771 nil: REG is never used. */
3772 for (charset = 0; charset <= MAX_CHARSET; charset++)
3773 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3774 = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION;
3775 for (i = 0; i < 4; i++)
3776 {
3777 if ((INTEGERP (flags[i])
3778 && (charset = XINT (flags[i]), CHARSET_VALID_P (charset)))
3779 || (charset = get_charset_id (flags[i])) >= 0)
3780 {
3781 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3782 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
3783 }
3784 else if (EQ (flags[i], Qt))
3785 {
3786 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3787 reg_bits |= 1 << i;
3788 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
3789 }
3790 else if (CONSP (flags[i]))
3791 {
3792 Lisp_Object tail;
3793 tail = flags[i];
3794
3795 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
3796 if ((INTEGERP (XCAR (tail))
3797 && (charset = XINT (XCAR (tail)),
3798 CHARSET_VALID_P (charset)))
3799 || (charset = get_charset_id (XCAR (tail))) >= 0)
3800 {
3801 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3802 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
3803 }
3804 else
3805 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3806 tail = XCDR (tail);
3807 while (CONSP (tail))
3808 {
3809 if ((INTEGERP (XCAR (tail))
3810 && (charset = XINT (XCAR (tail)),
3811 CHARSET_VALID_P (charset)))
3812 || (charset = get_charset_id (XCAR (tail))) >= 0)
3813 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3814 = i;
3815 else if (EQ (XCAR (tail), Qt))
3816 reg_bits |= 1 << i;
3817 tail = XCDR (tail);
3818 }
3819 }
3820 else
3821 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3822
3823 CODING_SPEC_ISO_DESIGNATION (coding, i)
3824 = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
3825 }
3826
3827 if (reg_bits && ! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
3828 {
3829 /* REG 1 can be used only by locking shift in 7-bit env. */
3830 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
3831 reg_bits &= ~2;
3832 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
3833 /* Without any shifting, only REG 0 and 1 can be used. */
3834 reg_bits &= 3;
3835 }
3836
3837 if (reg_bits)
3838 for (charset = 0; charset <= MAX_CHARSET; charset++)
3839 {
3840 if (CHARSET_DEFINED_P (charset)
3841 && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3842 == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
3843 {
3844 /* There exist some default graphic registers to be
3845 used by CHARSET. */
3846
3847 /* We had better avoid designating a charset of
3848 CHARS96 to REG 0 as far as possible. */
3849 if (CHARSET_CHARS (charset) == 96)
3850 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3851 = (reg_bits & 2
3852 ? 1 : (reg_bits & 4 ? 2 : (reg_bits & 8 ? 3 : 0)));
3853 else
3854 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3855 = (reg_bits & 1
3856 ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
3857 }
3858 }
3859 }
3860 coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
3861 coding->spec.iso2022.last_invalid_designation_register = -1;
3862 break;
3863
3864 case 3:
3865 coding->type = coding_type_big5;
3866 coding->common_flags
3867 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3868 coding->flags
3869 = (NILP (XVECTOR (coding_spec)->contents[4])
3870 ? CODING_FLAG_BIG5_HKU
3871 : CODING_FLAG_BIG5_ETEN);
3872 break;
3873
3874 case 4:
3875 coding->type = coding_type_ccl;
3876 coding->common_flags
3877 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3878 {
3879 val = XVECTOR (coding_spec)->contents[4];
3880 if (! CONSP (val)
3881 || setup_ccl_program (&(coding->spec.ccl.decoder),
3882 XCAR (val)) < 0
3883 || setup_ccl_program (&(coding->spec.ccl.encoder),
3884 XCDR (val)) < 0)
3885 goto label_invalid_coding_system;
3886
3887 bzero (coding->spec.ccl.valid_codes, 256);
3888 val = Fplist_get (plist, Qvalid_codes);
3889 if (CONSP (val))
3890 {
3891 Lisp_Object this;
3892
3893 for (; CONSP (val); val = XCDR (val))
3894 {
3895 this = XCAR (val);
3896 if (INTEGERP (this)
3897 && XINT (this) >= 0 && XINT (this) < 256)
3898 coding->spec.ccl.valid_codes[XINT (this)] = 1;
3899 else if (CONSP (this)
3900 && INTEGERP (XCAR (this))
3901 && INTEGERP (XCDR (this)))
3902 {
3903 int start = XINT (XCAR (this));
3904 int end = XINT (XCDR (this));
3905
3906 if (start >= 0 && start <= end && end < 256)
3907 while (start <= end)
3908 coding->spec.ccl.valid_codes[start++] = 1;
3909 }
3910 }
3911 }
3912 }
3913 coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
3914 coding->spec.ccl.cr_carryover = 0;
3915 coding->spec.ccl.eight_bit_carryover[0] = 0;
3916 break;
3917
3918 case 5:
3919 coding->type = coding_type_raw_text;
3920 break;
3921
3922 default:
3923 goto label_invalid_coding_system;
3924 }
3925 return 0;
3926
3927 label_invalid_coding_system:
3928 coding->type = coding_type_no_conversion;
3929 coding->category_idx = CODING_CATEGORY_IDX_BINARY;
3930 coding->common_flags = 0;
3931 coding->eol_type = CODING_EOL_UNDECIDED;
3932 coding->pre_write_conversion = coding->post_read_conversion = Qnil;
3933 return NILP (coding_system) ? 0 : -1;
3934 }
3935
3936 /* Free memory blocks allocated for storing composition information. */
3937
3938 void
3939 coding_free_composition_data (coding)
3940 struct coding_system *coding;
3941 {
3942 struct composition_data *cmp_data = coding->cmp_data, *next;
3943
3944 if (!cmp_data)
3945 return;
3946 /* Memory blocks are chained. At first, rewind to the first, then,
3947 free blocks one by one. */
3948 while (cmp_data->prev)
3949 cmp_data = cmp_data->prev;
3950 while (cmp_data)
3951 {
3952 next = cmp_data->next;
3953 xfree (cmp_data);
3954 cmp_data = next;
3955 }
3956 coding->cmp_data = NULL;
3957 }
3958
3959 /* Set `char_offset' member of all memory blocks pointed by
3960 coding->cmp_data to POS. */
3961
3962 void
3963 coding_adjust_composition_offset (coding, pos)
3964 struct coding_system *coding;
3965 int pos;
3966 {
3967 struct composition_data *cmp_data;
3968
3969 for (cmp_data = coding->cmp_data; cmp_data; cmp_data = cmp_data->next)
3970 cmp_data->char_offset = pos;
3971 }
3972
3973 /* Setup raw-text or one of its subsidiaries in the structure
3974 coding_system CODING according to the already setup value eol_type
3975 in CODING. CODING should be setup for some coding system in
3976 advance. */
3977
3978 void
3979 setup_raw_text_coding_system (coding)
3980 struct coding_system *coding;
3981 {
3982 if (coding->type != coding_type_raw_text)
3983 {
3984 coding->symbol = Qraw_text;
3985 coding->type = coding_type_raw_text;
3986 if (coding->eol_type != CODING_EOL_UNDECIDED)
3987 {
3988 Lisp_Object subsidiaries;
3989 subsidiaries = Fget (Qraw_text, Qeol_type);
3990
3991 if (VECTORP (subsidiaries)
3992 && XVECTOR (subsidiaries)->size == 3)
3993 coding->symbol
3994 = XVECTOR (subsidiaries)->contents[coding->eol_type];
3995 }
3996 setup_coding_system (coding->symbol, coding);
3997 }
3998 return;
3999 }
4000
4001 /* Emacs has a mechanism to automatically detect a coding system if it
4002 is one of Emacs' internal format, ISO2022, SJIS, BIG5, UTF-8, UTF-16,
4003 or a coding system implemented by CCL programs. But it is impossible
4004 to distinguish some coding systems accurately because they use the
4005 same range of codes. So, at first, coding systems are grouped into the following categories:
4006
4007 o coding-category-emacs-mule
4008
4009 The category for a coding system which has the same code range
4010 as Emacs' internal format. Assigned the coding-system (Lisp
4011 symbol) `emacs-mule' by default.
4012
4013 o coding-category-sjis
4014
4015 The category for a coding system which has the same code range
4016 as SJIS. Assigned the coding-system (Lisp
4017 symbol) `japanese-shift-jis' by default.
4018
4019 o coding-category-iso-7
4020
4021 The category for a coding system which has the same code range
4022 as ISO2022 of 7-bit environment. This doesn't use any locking
4023 shift and single shift functions. This can encode/decode all
4024 charsets. Assigned the coding-system (Lisp symbol)
4025 `iso-2022-7bit' by default.
4026
4027 o coding-category-iso-7-tight
4028
4029 Same as coding-category-iso-7 except that this can
4030 encode/decode only the specified charsets.
4031
4032 o coding-category-iso-8-1
4033
4034 The category for a coding system which has the same code range
4035 as ISO2022 of 8-bit environment and graphic plane 1 used only
4036 for DIMENSION1 charset. This doesn't use any locking shift
4037 and single shift functions. Assigned the coding-system (Lisp
4038 symbol) `iso-latin-1' by default.
4039
4040 o coding-category-iso-8-2
4041
4042 The category for a coding system which has the same code range
4043 as ISO2022 of 8-bit environment and graphic plane 1 used only
4044 for DIMENSION2 charset. This doesn't use any locking shift
4045 and single shift functions. Assigned the coding-system (Lisp
4046 symbol) `japanese-iso-8bit' by default.
4047
4048 o coding-category-iso-7-else
4049
4050 The category for a coding system which has the same code range
4051 as ISO2022 of 7-bit environment but uses locking shift or
4052 single shift functions. Assigned the coding-system (Lisp
4053 symbol) `iso-2022-7bit-lock' by default.
4054
4055 o coding-category-iso-8-else
4056
4057 The category for a coding system which has the same code range
4058 as ISO2022 of 8-bit environment but uses locking shift or
4059 single shift functions. Assigned the coding-system (Lisp
4060 symbol) `iso-2022-8bit-ss2' by default.
4061
4062 o coding-category-big5
4063
4064 The category for a coding system which has the same code range
4065 as BIG5. Assigned the coding-system (Lisp symbol)
4066 `cn-big5' by default.
4067
4068 o coding-category-utf-8
4069
4070 The category for a coding system which has the same code range
4071 as UTF-8 (cf. RFC3629). Assigned the coding-system (Lisp
4072 symbol) `utf-8' by default.
4073
4074 o coding-category-utf-16-be
4075
4076 The category for a coding system in which a text has an
4077 Unicode signature (cf. Unicode Standard) in the order of BIG
4078 endian at the head. Assigned the coding-system (Lisp symbol)
4079 `utf-16-be' by default.
4080
4081 o coding-category-utf-16-le
4082
4083 The category for a coding system in which a text has an
4084 Unicode signature (cf. Unicode Standard) in the order of
4085 LITTLE endian at the head. Assigned the coding-system (Lisp
4086 symbol) `utf-16-le' by default.
4087
4088 o coding-category-ccl
4089
4090 The category for a coding system of which encoder/decoder is
4091 written in CCL programs. The default value is nil, i.e., no
4092 coding system is assigned.
4093
4094 o coding-category-binary
4095
4096 The category for a coding system not categorized in any of the
4097 above. Assigned the coding-system (Lisp symbol)
4098 `no-conversion' by default.
4099
4100 Each of them is a Lisp symbol and the value is an actual
4101 `coding-system' (this is also a Lisp symbol) assigned by a user.
4102 What Emacs does actually is to detect a category of coding system.
4103 Then, it uses a `coding-system' assigned to it. If Emacs can't
4104 decide a single possible category, it selects a category of the
4105 highest priority. Priorities of categories are also specified by a
4106 user in a Lisp variable `coding-category-list'.
4107
4108 */
4109
/* Per-byte skip table consulted by detect_coding_mask: while scanning
   for the first significant byte, a byte whose entry is nonzero is
   passed over.  detect_coding_mask itself clears and sets the entries
   for ISO_CODE_SO, ISO_CODE_SI and ISO_CODE_ESC as it goes.  */
static int ascii_skip_code[256];
4112
4113 /* Detect how a text of length SRC_BYTES pointed by SOURCE is encoded.
4114 If it detects possible coding systems, return an integer in which
4115 appropriate flag bits are set. Flag bits are defined by macros
4116 CODING_CATEGORY_MASK_XXX in `coding.h'. If PRIORITIES is non-NULL,
4117 it should point the table `coding_priorities'. In that case, only
4118 the flag bit for a coding system of the highest priority is set in
4119 the returned value. If MULTIBYTEP is nonzero, 8-bit codes of the
4120 range 0x80..0x9F are in multibyte form.
4121
4122 How many ASCII characters are at the head is returned as *SKIP. */
4123
4124 static int
4125 detect_coding_mask (source, src_bytes, priorities, skip, multibytep)
4126 unsigned char *source;
4127 int src_bytes, *priorities, *skip;
4128 int multibytep;
4129 {
4130 register unsigned char c;
4131 unsigned char *src = source, *src_end = source + src_bytes;
4132 unsigned int mask, utf16_examined_p, iso2022_examined_p;
4133 int i;
4134
4135 /* At first, skip all ASCII characters and control characters except
4136 for three ISO2022 specific control characters. */
4137 ascii_skip_code[ISO_CODE_SO] = 0;
4138 ascii_skip_code[ISO_CODE_SI] = 0;
4139 ascii_skip_code[ISO_CODE_ESC] = 0;
4140
4141 label_loop_detect_coding:
4142 while (src < src_end && ascii_skip_code[*src]) src++;
4143 *skip = src - source;
4144
4145 if (src >= src_end)
4146 /* We found nothing other than ASCII. There's nothing to do. */
4147 return 0;
4148
4149 c = *src;
4150 /* The text seems to be encoded in some multilingual coding system.
4151 Now, try to find in which coding system the text is encoded. */
4152 if (c < 0x80)
4153 {
4154 /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
4155 /* C is an ISO2022 specific control code of C0. */
4156 mask = detect_coding_iso2022 (src, src_end, multibytep);
4157 if (mask == 0)
4158 {
4159 /* No valid ISO2022 code follows C. Try again. */
4160 src++;
4161 if (c == ISO_CODE_ESC)
4162 ascii_skip_code[ISO_CODE_ESC] = 1;
4163 else
4164 ascii_skip_code[ISO_CODE_SO] = ascii_skip_code[ISO_CODE_SI] = 1;
4165 goto label_loop_detect_coding;
4166 }
4167 if (priorities)
4168 {
4169 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
4170 {
4171 if (mask & priorities[i])
4172 return priorities[i];
4173 }
4174 return CODING_CATEGORY_MASK_RAW_TEXT;
4175 }
4176 }
4177 else
4178 {
4179 int try;
4180
4181 if (multibytep && c == LEADING_CODE_8_BIT_CONTROL)
4182 c = src[1] - 0x20;
4183
4184 if (c < 0xA0)
4185 {
4186 /* C is the first byte of SJIS character code,
4187 or a leading-code of Emacs' internal format (emacs-mule),
4188 or the first byte of UTF-16. */
4189 try = (CODING_CATEGORY_MASK_SJIS
4190 | CODING_CATEGORY_MASK_EMACS_MULE
4191 | CODING_CATEGORY_MASK_UTF_16_BE
4192 | CODING_CATEGORY_MASK_UTF_16_LE);
4193
4194 /* Or, if C is a special latin extra code,
4195 or is an ISO2022 specific control code of C1 (SS2 or SS3),
4196 or is an ISO2022 control-sequence-introducer (CSI),
4197 we should also consider the possibility of ISO2022 codings. */
4198 if ((VECTORP (Vlatin_extra_code_table)
4199 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
4200 || (c == ISO_CODE_SS2 || c == ISO_CODE_SS3)
4201 || (c == ISO_CODE_CSI
4202 && (src < src_end
4203 && (*src == ']'
4204 || ((*src == '0' || *src == '1' || *src == '2')
4205 && src + 1 < src_end
4206 && src[1] == ']')))))
4207 try |= (CODING_CATEGORY_MASK_ISO_8_ELSE
4208 | CODING_CATEGORY_MASK_ISO_8BIT);
4209 }
4210 else
4211 /* C is a character of ISO2022 in graphic plane right,
4212 or a SJIS's 1-byte character code (i.e. JISX0201),
4213 or the first byte of BIG5's 2-byte code,
4214 or the first byte of UTF-8/16. */
4215 try = (CODING_CATEGORY_MASK_ISO_8_ELSE
4216 | CODING_CATEGORY_MASK_ISO_8BIT
4217 | CODING_CATEGORY_MASK_SJIS
4218 | CODING_CATEGORY_MASK_BIG5
4219 | CODING_CATEGORY_MASK_UTF_8
4220 | CODING_CATEGORY_MASK_UTF_16_BE
4221 | CODING_CATEGORY_MASK_UTF_16_LE);
4222
4223 /* Or, we may have to consider the possibility of CCL. */
4224 if (coding_system_table[CODING_CATEGORY_IDX_CCL]
4225 && (coding_system_table[CODING_CATEGORY_IDX_CCL]
4226 ->spec.ccl.valid_codes)[c])
4227 try |= CODING_CATEGORY_MASK_CCL;
4228
4229 mask = 0;
4230 utf16_examined_p = iso2022_examined_p = 0;
4231 if (priorities)
4232 {
4233 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
4234 {
4235 if (!iso2022_examined_p
4236 && (priorities[i] & try & CODING_CATEGORY_MASK_ISO))
4237 {
4238 mask |= detect_coding_iso2022 (src, src_end, multibytep);
4239 iso2022_examined_p = 1;
4240 }
4241 else if (priorities[i] & try & CODING_CATEGORY_MASK_SJIS)
4242 mask |= detect_coding_sjis (src, src_end, multibytep);
4243 else if (priorities[i] & try & CODING_CATEGORY_MASK_UTF_8)
4244 mask |= detect_coding_utf_8 (src, src_end, multibytep);
4245 else if (!utf16_examined_p
4246 && (priorities[i] & try &
4247 CODING_CATEGORY_MASK_UTF_16_BE_LE))
4248 {
4249 mask |= detect_coding_utf_16 (src, src_end, multibytep);
4250 utf16_examined_p = 1;
4251 }
4252 else if (priorities[i] & try & CODING_CATEGORY_MASK_BIG5)
4253 mask |= detect_coding_big5 (src, src_end, multibytep);
4254 else if (priorities[i] & try & CODING_CATEGORY_MASK_EMACS_MULE)
4255 mask |= detect_coding_emacs_mule (src, src_end, multibytep);
4256 else if (priorities[i] & try & CODING_CATEGORY_MASK_CCL)
4257 mask |= detect_coding_ccl (src, src_end, multibytep);
4258 else if (priorities[i] & CODING_CATEGORY_MASK_RAW_TEXT)
4259 mask |= CODING_CATEGORY_MASK_RAW_TEXT;
4260 else if (priorities[i] & CODING_CATEGORY_MASK_BINARY)
4261 mask |= CODING_CATEGORY_MASK_BINARY;
4262 if (mask & priorities[i])
4263 return priorities[i];
4264 }
4265 return CODING_CATEGORY_MASK_RAW_TEXT;
4266 }
4267 if (try & CODING_CATEGORY_MASK_ISO)
4268 mask |= detect_coding_iso2022 (src, src_end, multibytep);
4269 if (try & CODING_CATEGORY_MASK_SJIS)
4270 mask |= detect_coding_sjis (src, src_end, multibytep);
4271 if (try & CODING_CATEGORY_MASK_BIG5)
4272 mask |= detect_coding_big5 (src, src_end, multibytep);
4273 if (try & CODING_CATEGORY_MASK_UTF_8)
4274 mask |= detect_coding_utf_8 (src, src_end, multibytep);
4275 if (try & CODING_CATEGORY_MASK_UTF_16_BE_LE)
4276 mask |= detect_coding_utf_16 (src, src_end, multibytep);
4277 if (try & CODING_CATEGORY_MASK_EMACS_MULE)
4278 mask |= detect_coding_emacs_mule (src, src_end, multibytep);
4279 if (try & CODING_CATEGORY_MASK_CCL)
4280 mask |= detect_coding_ccl (src, src_end, multibytep);
4281 }
4282 return (mask | CODING_CATEGORY_MASK_RAW_TEXT | CODING_CATEGORY_MASK_BINARY);
4283 }
4284
4285 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
4286 The information of the detected coding system is set in CODING. */
4287
4288 void
4289 detect_coding (coding, src, src_bytes)
4290 struct coding_system *coding;
4291 const unsigned char *src;
4292 int src_bytes;
4293 {
4294 unsigned int idx;
4295 int skip, mask;
4296 Lisp_Object val;
4297
4298 val = Vcoding_category_list;
4299 mask = detect_coding_mask (src, src_bytes, coding_priorities, &skip,
4300 coding->src_multibyte);
4301 coding->heading_ascii = skip;
4302
4303 if (!mask) return;
4304
4305 /* We found a single coding system of the highest priority in MASK. */
4306 idx = 0;
4307 while (mask && ! (mask & 1)) mask >>= 1, idx++;
4308 if (! mask)
4309 idx = CODING_CATEGORY_IDX_RAW_TEXT;
4310
4311 val = SYMBOL_VALUE (XVECTOR (Vcoding_category_table)->contents[idx]);
4312
4313 if (coding->eol_type != CODING_EOL_UNDECIDED)
4314 {
4315 Lisp_Object tmp;
4316
4317 tmp = Fget (val, Qeol_type);
4318 if (VECTORP (tmp))
4319 val = XVECTOR (tmp)->contents[coding->eol_type];
4320 }
4321
4322 /* Setup this new coding system while preserving some slots. */
4323 {
4324 int src_multibyte = coding->src_multibyte;
4325 int dst_multibyte = coding->dst_multibyte;
4326
4327 setup_coding_system (val, coding);
4328 coding->src_multibyte = src_multibyte;
4329 coding->dst_multibyte = dst_multibyte;
4330 coding->heading_ascii = skip;
4331 }
4332 }
4333
/* Detect how end-of-line of a text of length SRC_BYTES pointed by
   SOURCE is encoded.  Return one of CODING_EOL_LF, CODING_EOL_CRLF,
   CODING_EOL_CR, CODING_EOL_INCONSISTENT (the examined end-of-lines
   disagree), and CODING_EOL_UNDECIDED (no end-of-line was found).
   At most MAX_EOL_CHECK_COUNT end-of-lines are examined.

   How many non-eol characters are at the head is returned as *SKIP;
   it is 0 when the text starts with an end-of-line and SRC_BYTES when
   the text contains none.  */

#define MAX_EOL_CHECK_COUNT 3

static int
detect_eol_type (source, src_bytes, skip)
     unsigned char *source;
     int src_bytes, *skip;
{
  unsigned char *src = source, *src_end = src + src_bytes;
  unsigned char c;
  int total = 0;  /* How many end-of-lines are found so far.  */
  int eol_type = CODING_EOL_UNDECIDED;
  int this_eol_type;

  /* Until an end-of-line shows up, the whole text counts as head.  */
  *skip = src_bytes;

  while (src < src_end && total < MAX_EOL_CHECK_COUNT)
    {
      c = *src++;
      if (c == '\n' || c == '\r')
        {
          /* Record the offset of the first end-of-line only.  TOTAL,
             not the value of *SKIP, tells whether this is the first
             one, because offset 0 is a valid answer.  */
          if (total == 0)
            *skip = src - 1 - source;
          total++;
          if (c == '\n')
            this_eol_type = CODING_EOL_LF;
          else if (src >= src_end || *src != '\n')
            this_eol_type = CODING_EOL_CR;
          else
            this_eol_type = CODING_EOL_CRLF, src++;

          if (eol_type == CODING_EOL_UNDECIDED)
            /* This is the first end-of-line.  */
            eol_type = this_eol_type;
          else if (eol_type != this_eol_type)
            {
              /* The found type is different from what found before.  */
              eol_type = CODING_EOL_INCONSISTENT;
              break;
            }
        }
    }

  return eol_type;
}
4386
/* Like detect_eol_type, but detect EOL type in 2-octet
   big-endian/little-endian format for coding systems utf-16-be and
   utf-16-le.  The text is read as 16-bit units, big-endian when
   BIG_ENDIAN_P is nonzero and little-endian otherwise; a trailing odd
   byte is ignored.

   *SKIP receives the byte offset of the first end-of-line unit; it
   is 0 when the text starts with one and SRC_BYTES when none was
   found.  */

static int
detect_eol_type_in_2_octet_form (source, src_bytes, skip, big_endian_p)
     unsigned char *source;
     int src_bytes, *skip, big_endian_p;
{
  unsigned char *src = source, *src_end = src + src_bytes;
  unsigned int c1, c2;
  int total = 0;  /* How many end-of-lines are found so far.  */
  int eol_type = CODING_EOL_UNDECIDED;
  int this_eol_type;
  int msb, lsb;

  /* Offsets of the high and low octet inside one unit.  */
  if (big_endian_p)
    msb = 0, lsb = 1;
  else
    msb = 1, lsb = 0;

  /* Until an end-of-line shows up, the whole text counts as head.  */
  *skip = src_bytes;

  while ((src + 1) < src_end && total < MAX_EOL_CHECK_COUNT)
    {
      c1 = (src[msb] << 8) | (src[lsb]);
      src += 2;

      if (c1 == '\n' || c1 == '\r')
        {
          /* Record the offset of the first end-of-line only.  TOTAL,
             not the value of *SKIP, tells whether this is the first
             one, because offset 0 is a valid answer.  */
          if (total == 0)
            *skip = src - 2 - source;
          total++;
          if (c1 == '\n')
            {
              this_eol_type = CODING_EOL_LF;
            }
          else
            {
              if ((src + 1) >= src_end)
                {
                  /* No complete unit follows the CR.  */
                  this_eol_type = CODING_EOL_CR;
                }
              else
                {
                  c2 = (src[msb] << 8) | (src[lsb]);
                  if (c2 == '\n')
                    this_eol_type = CODING_EOL_CRLF, src += 2;
                  else
                    this_eol_type = CODING_EOL_CR;
                }
            }

          if (eol_type == CODING_EOL_UNDECIDED)
            /* This is the first end-of-line.  */
            eol_type = this_eol_type;
          else if (eol_type != this_eol_type)
            {
              /* The found type is different from what found before.  */
              eol_type = CODING_EOL_INCONSISTENT;
              break;
            }
        }
    }

  return eol_type;
}
4456
4457 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
4458 is encoded. If it detects an appropriate format of end-of-line, it
4459 sets the information in *CODING. */
4460
4461 void
4462 detect_eol (coding, src, src_bytes)
4463 struct coding_system *coding;
4464 const unsigned char *src;
4465 int src_bytes;
4466 {
4467 Lisp_Object val;
4468 int skip;
4469 int eol_type;
4470
4471 switch (coding->category_idx)
4472 {
4473 case CODING_CATEGORY_IDX_UTF_16_BE:
4474 eol_type = detect_eol_type_in_2_octet_form (src, src_bytes, &skip, 1);
4475 break;
4476 case CODING_CATEGORY_IDX_UTF_16_LE:
4477 eol_type = detect_eol_type_in_2_octet_form (src, src_bytes, &skip, 0);
4478 break;
4479 default:
4480 eol_type = detect_eol_type (src, src_bytes, &skip);
4481 break;
4482 }
4483
4484 if (coding->heading_ascii > skip)
4485 coding->heading_ascii = skip;
4486 else
4487 skip = coding->heading_ascii;
4488
4489 if (eol_type == CODING_EOL_UNDECIDED)
4490 return;
4491 if (eol_type == CODING_EOL_INCONSISTENT)
4492 {
4493 #if 0
4494 /* This code is suppressed until we find a better way to
4495 distinguish raw text file and binary file. */
4496
4497 /* If we have already detected that the coding is raw-text, the
4498 coding should actually be no-conversion. */
4499 if (coding->type == coding_type_raw_text)
4500 {
4501 setup_coding_system (Qno_conversion, coding);
4502 return;
4503 }
4504 /* Else, let's decode only text code anyway. */
4505 #endif /* 0 */
4506 eol_type = CODING_EOL_LF;
4507 }
4508
4509 val = Fget (coding->symbol, Qeol_type);
4510 if (VECTORP (val) && XVECTOR (val)->size == 3)
4511 {
4512 int src_multibyte = coding->src_multibyte;
4513 int dst_multibyte = coding->dst_multibyte;
4514 struct composition_data *cmp_data = coding->cmp_data;
4515
4516 setup_coding_system (XVECTOR (val)->contents[eol_type], coding);
4517 coding->src_multibyte = src_multibyte;
4518 coding->dst_multibyte = dst_multibyte;
4519 coding->heading_ascii = skip;
4520 coding->cmp_data = cmp_data;
4521 }
4522 }
4523
/* Constant number of bytes added on top of every computed conversion
   buffer size.  */
#define CONVERSION_BUFFER_EXTRA_ROOM 256

/* Factor by which the source size is multiplied to size a decoding
   buffer for CODING (a pointer to struct coding_system): 3 for
   ISO2022, the decoder's own buf_magnification for CCL, 2 otherwise.
   CODING is parenthesized so that any pointer expression, e.g. `&x',
   can be passed; it may be evaluated more than once.  */
#define DECODING_BUFFER_MAG(coding)                             \
  ((coding)->type == coding_type_iso2022                        \
   ? 3                                                          \
   : ((coding)->type == coding_type_ccl                         \
      ? (coding)->spec.ccl.decoder.buf_magnification            \
      : 2))
4532
4533 /* Return maximum size (bytes) of a buffer enough for decoding
4534 SRC_BYTES of text encoded in CODING. */
4535
4536 int
4537 decoding_buffer_size (coding, src_bytes)
4538 struct coding_system *coding;
4539 int src_bytes;
4540 {
4541 return (src_bytes * DECODING_BUFFER_MAG (coding)
4542 + CONVERSION_BUFFER_EXTRA_ROOM);
4543 }
4544
4545 /* Return maximum size (bytes) of a buffer enough for encoding
4546 SRC_BYTES of text to CODING. */
4547
4548 int
4549 encoding_buffer_size (coding, src_bytes)
4550 struct coding_system *coding;
4551 int src_bytes;
4552 {
4553 int magnification;
4554
4555 if (coding->type == coding_type_ccl)
4556 {
4557 magnification = coding->spec.ccl.encoder.buf_magnification;
4558 if (coding->eol_type == CODING_EOL_CRLF)
4559 magnification *= 2;
4560 }
4561 else if (CODING_REQUIRE_ENCODING (coding))
4562 magnification = 3;
4563 else
4564 magnification = 1;
4565
4566 return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
4567 }
4568
/* Scratch buffer used while converting text.  It is filled in by
   allocate_conversion_buffer, grown by extend_conversion_buffer and
   released by free_conversion_buffer.  */
struct conversion_buffer
{
  /* Number of bytes DATA points to.  */
  int size;
  /* 1 if DATA was obtained with alloca, 0 if it came from xmalloc
     or xrealloc.  */
  int on_stack;
  /* The storage itself.  */
  unsigned char *data;
};
4576
/* Allocate LEN bytes of memory for BUF (struct conversion_buffer, an
   lvalue, not a pointer).  Requests below MAX_ALLOCA are served with
   alloca, so this has to stay a macro: the memory must belong to the
   caller's stack frame.  Larger requests use xmalloc.  BUF.on_stack
   records which one was used.

   BUF and LEN are parenthesized so that any expression can be passed,
   but both are evaluated more than once; don't pass expressions with
   side effects.  */
#define allocate_conversion_buffer(buf, len)            \
  do {                                                  \
    if ((len) < MAX_ALLOCA)                             \
      {                                                 \
        (buf).data = (unsigned char *) alloca (len);    \
        (buf).on_stack = 1;                             \
      }                                                 \
    else                                                \
      {                                                 \
        (buf).data = (unsigned char *) xmalloc (len);   \
        (buf).on_stack = 0;                             \
      }                                                 \
    (buf).size = (len);                                 \
  } while (0)
4592
4593 /* Double the allocated memory for *BUF. */
4594 static void
4595 extend_conversion_buffer (buf)
4596 struct conversion_buffer *buf;
4597 {
4598 if (buf->on_stack)
4599 {
4600 unsigned char *save = buf->data;
4601 buf->data = (unsigned char *) xmalloc (buf->size * 2);
4602 bcopy (save, buf->data, buf->size);
4603 buf->on_stack = 0;
4604 }
4605 else
4606 {
4607 buf->data = (unsigned char *) xrealloc (buf->data, buf->size * 2);
4608 }
4609 buf->size *= 2;
4610 }
4611
4612 /* Free the allocated memory for BUF if it is not on stack. */
4613 static void
4614 free_conversion_buffer (buf)
4615 struct conversion_buffer *buf;
4616 {
4617 if (!buf->on_stack)
4618 xfree (buf->data);
4619 }
4620
4621 int
4622 ccl_coding_driver (coding, source, destination, src_bytes, dst_bytes, encodep)
4623 struct coding_system *coding;
4624 unsigned char *source, *destination;
4625 int src_bytes, dst_bytes, encodep;
4626 {
4627 struct ccl_program *ccl
4628 = encodep ? &coding->spec.ccl.encoder : &coding->spec.ccl.decoder;
4629 unsigned char *dst = destination;
4630
4631 ccl->suppress_error = coding->suppress_error;
4632 ccl->last_block = coding->mode & CODING_MODE_LAST_BLOCK;
4633 if (encodep)
4634 {
4635 /* On encoding, EOL format is converted within ccl_driver. For
4636 that, setup proper information in the structure CCL. */
4637 ccl->eol_type = coding->eol_type;
4638 if (ccl->eol_type ==CODING_EOL_UNDECIDED)
4639 ccl->eol_type = CODING_EOL_LF;
4640 ccl->cr_consumed = coding->spec.ccl.cr_carryover;
4641 ccl->eight_bit_control = coding->dst_multibyte;
4642 }
4643 else
4644 ccl->eight_bit_control = 1;
4645 ccl->multibyte = coding->src_multibyte;
4646 if (coding->spec.ccl.eight_bit_carryover[0] != 0)
4647 {
4648 /* Move carryover bytes to DESTINATION. */
4649 unsigned char *p = coding->spec.ccl.eight_bit_carryover;
4650 while (*p)
4651 *dst++ = *p++;
4652 coding->spec.ccl.eight_bit_carryover[0] = 0;
4653 if (dst_bytes)
4654 dst_bytes -= dst - destination;
4655 }
4656
4657 coding->produced = (ccl_driver (ccl, source, dst, src_bytes, dst_bytes,
4658 &(coding->consumed))
4659 + dst - destination);
4660
4661 if (encodep)
4662 {
4663 coding->produced_char = coding->produced;
4664 coding->spec.ccl.cr_carryover = ccl->cr_consumed;
4665 }
4666 else if (!ccl->eight_bit_control)
4667 {
4668 /* The produced bytes forms a valid multibyte sequence. */
4669 coding->produced_char
4670 = multibyte_chars_in_text (destination, coding->produced);
4671 coding->spec.ccl.eight_bit_carryover[0] = 0;
4672 }
4673 else
4674 {
4675 /* On decoding, the destination should always multibyte. But,
4676 CCL program might have been generated an invalid multibyte
4677 sequence. Here we make such a sequence valid as
4678 multibyte. */
4679 int bytes
4680 = dst_bytes ? dst_bytes : source + coding->consumed - destination;
4681
4682 if ((coding->consumed < src_bytes
4683 || !ccl->last_block)
4684 && coding->produced >= 1
4685 && destination[coding->produced - 1] >= 0x80)
4686 {
4687 /* We should not convert the tailing 8-bit codes to
4688 multibyte form even if they doesn't form a valid
4689 multibyte sequence. They may form a valid sequence in
4690 the next call. */
4691 int carryover = 0;
4692
4693 if (destination[coding->produced - 1] < 0xA0)
4694 carryover = 1;
4695 else if (coding->produced >= 2)
4696 {
4697 if (destination[coding->produced - 2] >= 0x80)
4698 {
4699 if (destination[coding->produced - 2] < 0xA0)
4700 carryover = 2;
4701 else if (coding->produced >= 3
4702 && destination[coding->produced - 3] >= 0x80
4703 && destination[coding->produced - 3] < 0xA0)
4704 carryover = 3;
4705 }
4706 }
4707 if (carryover > 0)
4708 {
4709 BCOPY_SHORT (destination + coding->produced - carryover,
4710 coding->spec.ccl.eight_bit_carryover,
4711 carryover);
4712 coding->spec.ccl.eight_bit_carryover[carryover] = 0;
4713 coding->produced -= carryover;
4714 }
4715 }
4716 coding->produced = str_as_multibyte (destination, bytes,
4717 coding->produced,
4718 &(coding->produced_char));
4719 }
4720
4721 switch (ccl->status)
4722 {
4723 case CCL_STAT_SUSPEND_BY_SRC:
4724 coding->result = CODING_FINISH_INSUFFICIENT_SRC;
4725 break;
4726 case CCL_STAT_SUSPEND_BY_DST:
4727 coding->result = CODING_FINISH_INSUFFICIENT_DST;
4728 break;
4729 case CCL_STAT_QUIT:
4730 case CCL_STAT_INVALID_CMD:
4731 coding->result = CODING_FINISH_INTERRUPT;
4732 break;
4733 default:
4734 coding->result = CODING_FINISH_NORMAL;
4735 break;
4736 }
4737 return coding->result;
4738 }
4739
4740 /* Decode EOL format of the text at PTR of BYTES length destructively
4741 according to CODING->eol_type. This is called after the CCL
4742 program produced a decoded text at PTR. If we do CRLF->LF
4743 conversion, update CODING->produced and CODING->produced_char. */
4744
4745 static void
4746 decode_eol_post_ccl (coding, ptr, bytes)
4747 struct coding_system *coding;
4748 unsigned char *ptr;
4749 int bytes;
4750 {
4751 Lisp_Object val, saved_coding_symbol;
4752 unsigned char *pend = ptr + bytes;
4753 int dummy;
4754
4755 /* Remember the current coding system symbol. We set it back when
4756 an inconsistent EOL is found so that `last-coding-system-used' is
4757 set to the coding system that doesn't specify EOL conversion. */
4758 saved_coding_symbol = coding->symbol;
4759
4760 coding->spec.ccl.cr_carryover = 0;
4761 if (coding->eol_type == CODING_EOL_UNDECIDED)
4762 {
4763 /* Here, to avoid the call of setup_coding_system, we directly
4764 call detect_eol_type. */
4765 coding->eol_type = detect_eol_type (ptr, bytes, &dummy);
4766 if (coding->eol_type == CODING_EOL_INCONSISTENT)
4767 coding->eol_type = CODING_EOL_LF;
4768 if (coding->eol_type != CODING_EOL_UNDECIDED)
4769 {
4770 val = Fget (coding->symbol, Qeol_type);
4771 if (VECTORP (val) && XVECTOR (val)->size == 3)
4772 coding->symbol = XVECTOR (val)->contents[coding->eol_type];
4773 }
4774 coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4775 }
4776
4777 if (coding->eol_type == CODING_EOL_LF
4778 || coding->eol_type == CODING_EOL_UNDECIDED)
4779 {
4780 /* We have nothing to do. */
4781 ptr = pend;
4782 }
4783 else if (coding->eol_type == CODING_EOL_CRLF)
4784 {
4785 unsigned char *pstart = ptr, *p = ptr;
4786
4787 if (! (coding->mode & CODING_MODE_LAST_BLOCK)
4788 && *(pend - 1) == '\r')
4789 {
4790 /* If the last character is CR, we can't handle it here
4791 because LF will be in the not-yet-decoded source text.
4792 Record that the CR is not yet processed. */
4793 coding->spec.ccl.cr_carryover = 1;
4794 coding->produced--;
4795 coding->produced_char--;
4796 pend--;
4797 }
4798 while (ptr < pend)
4799 {
4800 if (*ptr == '\r')
4801 {
4802 if (ptr + 1 < pend && *(ptr + 1) == '\n')
4803 {
4804 *p++ = '\n';
4805 ptr += 2;
4806 }
4807 else
4808 {
4809 if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4810 goto undo_eol_conversion;
4811 *p++ = *ptr++;
4812 }
4813 }
4814 else if (*ptr == '\n'
4815 && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4816 goto undo_eol_conversion;
4817 else
4818 *p++ = *ptr++;
4819 continue;
4820
4821 undo_eol_conversion:
4822 /* We have faced with inconsistent EOL format at PTR.
4823 Convert all LFs before PTR back to CRLFs. */
4824 for (p--, ptr--; p >= pstart; p--)
4825 {
4826 if (*p == '\n')
4827 *ptr-- = '\n', *ptr-- = '\r';
4828 else
4829 *ptr-- = *p;
4830 }
4831 /* If carryover is recorded, cancel it because we don't
4832 convert CRLF anymore. */
4833 if (coding->spec.ccl.cr_carryover)
4834 {
4835 coding->spec.ccl.cr_carryover = 0;
4836 coding->produced++;
4837 coding->produced_char++;
4838 pend++;
4839 }
4840 p = ptr = pend;
4841 coding->eol_type = CODING_EOL_LF;
4842 coding->symbol = saved_coding_symbol;
4843 }
4844 if (p < pend)
4845 {
4846 /* As each two-byte sequence CRLF was converted to LF, (PEND
4847 - P) is the number of deleted characters. */
4848 coding->produced -= pend - p;
4849 coding->produced_char -= pend - p;
4850 }
4851 }
4852 else /* i.e. coding->eol_type == CODING_EOL_CR */
4853 {
4854 unsigned char *p = ptr;
4855
4856 for (; ptr < pend; ptr++)
4857 {
4858 if (*ptr == '\r')
4859 *ptr = '\n';
4860 else if (*ptr == '\n'
4861 && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4862 {
4863 for (; p < ptr; p++)
4864 {
4865 if (*p == '\n')
4866 *p = '\r';
4867 }
4868 ptr = pend;
4869 coding->eol_type = CODING_EOL_LF;
4870 coding->symbol = saved_coding_symbol;
4871 }
4872 }
4873 }
4874 }
4875
4876 /* See "GENERAL NOTES about `decode_coding_XXX ()' functions". Before
4877 decoding, it may detect coding system and format of end-of-line if
4878 those are not yet decided. The source should be unibyte, the
4879 result is multibyte if CODING->dst_multibyte is nonzero, else
4880 unibyte. */
4881
4882 int
4883 decode_coding (coding, source, destination, src_bytes, dst_bytes)
4884 struct coding_system *coding;
4885 const unsigned char *source;
4886 unsigned char *destination;
4887 int src_bytes, dst_bytes;
4888 {
4889 int extra = 0;
4890
4891 if (coding->type == coding_type_undecided)
4892 detect_coding (coding, source, src_bytes);
4893
4894 if (coding->eol_type == CODING_EOL_UNDECIDED
4895 && coding->type != coding_type_ccl)
4896 {
4897 detect_eol (coding, source, src_bytes);
4898 /* We had better recover the original eol format if we
4899 encounter an inconsistent eol format while decoding. */
4900 coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4901 }
4902
4903 coding->produced = coding->produced_char = 0;
4904 coding->consumed = coding->consumed_char = 0;
4905 coding->errors = 0;
4906 coding->result = CODING_FINISH_NORMAL;
4907
4908 switch (coding->type)
4909 {
4910 case coding_type_sjis:
4911 decode_coding_sjis_big5 (coding, source, destination,
4912 src_bytes, dst_bytes, 1);
4913 break;
4914
4915 case coding_type_iso2022:
4916 decode_coding_iso2022 (coding, source, destination,
4917 src_bytes, dst_bytes);
4918 break;
4919
4920 case coding_type_big5:
4921 decode_coding_sjis_big5 (coding, source, destination,
4922 src_bytes, dst_bytes, 0);
4923 break;
4924
4925 case coding_type_emacs_mule:
4926 decode_coding_emacs_mule (coding, source, destination,
4927 src_bytes, dst_bytes);
4928 break;
4929
4930 case coding_type_ccl:
4931 if (coding->spec.ccl.cr_carryover)
4932 {
4933 /* Put the CR which was not processed by the previous call
4934 of decode_eol_post_ccl in DESTINATION. It will be
4935 decoded together with the following LF by the call to
4936 decode_eol_post_ccl below. */
4937 *destination = '\r';
4938 coding->produced++;
4939 coding->produced_char++;
4940 dst_bytes--;
4941 extra = coding->spec.ccl.cr_carryover;
4942 }
4943 ccl_coding_driver (coding, source, destination + extra,
4944 src_bytes, dst_bytes, 0);
4945 if (coding->eol_type != CODING_EOL_LF)
4946 {
4947 coding->produced += extra;
4948 coding->produced_char += extra;
4949 decode_eol_post_ccl (coding, destination, coding->produced);
4950 }
4951 break;
4952
4953 default:
4954 decode_eol (coding, source, destination, src_bytes, dst_bytes);
4955 }
4956
4957 if (coding->result == CODING_FINISH_INSUFFICIENT_SRC
4958 && coding->mode & CODING_MODE_LAST_BLOCK
4959 && coding->consumed == src_bytes)
4960 coding->result = CODING_FINISH_NORMAL;
4961
4962 if (coding->mode & CODING_MODE_LAST_BLOCK
4963 && coding->result == CODING_FINISH_INSUFFICIENT_SRC)
4964 {
4965 const unsigned char *src = source + coding->consumed;
4966 unsigned char *dst = destination + coding->produced;
4967
4968 src_bytes -= coding->consumed;
4969 coding->errors++;
4970 if (COMPOSING_P (coding))
4971 DECODE_COMPOSITION_END ('1');
4972 while (src_bytes--)
4973 {
4974 int c = *src++;
4975 dst += CHAR_STRING (c, dst);
4976 coding->produced_char++;
4977 }
4978 coding->consumed = coding->consumed_char = src - source;
4979 coding->produced = dst - destination;
4980 coding->result = CODING_FINISH_NORMAL;
4981 }
4982
4983 if (!coding->dst_multibyte)
4984 {
4985 coding->produced = str_as_unibyte (destination, coding->produced);
4986 coding->produced_char = coding->produced;
4987 }
4988
4989 return coding->result;
4990 }
4991
4992 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions". The
4993 multibyteness of the source is CODING->src_multibyte, the
4994 multibyteness of the result is always unibyte. */
4995
4996 int
4997 encode_coding (coding, source, destination, src_bytes, dst_bytes)
4998 struct coding_system *coding;
4999 const unsigned char *source;
5000 unsigned char *destination;
5001 int src_bytes, dst_bytes;
5002 {
5003 coding->produced = coding->produced_char = 0;
5004 coding->consumed = coding->consumed_char = 0;
5005 coding->errors = 0;
5006 coding->result = CODING_FINISH_NORMAL;
5007 if (coding->eol_type == CODING_EOL_UNDECIDED)
5008 coding->eol_type = CODING_EOL_LF;
5009
5010 switch (coding->type)
5011 {
5012 case coding_type_sjis:
5013 encode_coding_sjis_big5 (coding, source, destination,
5014 src_bytes, dst_bytes, 1);
5015 break;
5016
5017 case coding_type_iso2022:
5018 encode_coding_iso2022 (coding, source, destination,
5019 src_bytes, dst_bytes);
5020 break;
5021
5022 case coding_type_big5:
5023 encode_coding_sjis_big5 (coding, source, destination,
5024 src_bytes, dst_bytes, 0);
5025 break;
5026
5027 case coding_type_emacs_mule:
5028 encode_coding_emacs_mule (coding, source, destination,
5029 src_bytes, dst_bytes);
5030 break;
5031
5032 case coding_type_ccl:
5033 ccl_coding_driver (coding, source, destination,
5034 src_bytes, dst_bytes, 1);
5035 break;
5036
5037 default:
5038 encode_eol (coding, source, destination, src_bytes, dst_bytes);
5039 }
5040
5041 if (coding->mode & CODING_MODE_LAST_BLOCK
5042 && coding->result == CODING_FINISH_INSUFFICIENT_SRC)
5043 {
5044 const unsigned char *src = source + coding->consumed;
5045 unsigned char *dst = destination + coding->produced;
5046
5047 if (coding->type == coding_type_iso2022)
5048 ENCODE_RESET_PLANE_AND_REGISTER;
5049 if (COMPOSING_P (coding))
5050 *dst++ = ISO_CODE_ESC, *dst++ = '1';
5051 if (coding->consumed < src_bytes)
5052 {
5053 int len = src_bytes - coding->consumed;
5054
5055 BCOPY_SHORT (src, dst, len);
5056 if (coding->src_multibyte)
5057 len = str_as_unibyte (dst, len);
5058 dst += len;
5059 coding->consumed = src_bytes;
5060 }
5061 coding->produced = coding->produced_char = dst - destination;
5062 coding->result = CODING_FINISH_NORMAL;
5063 }
5064
5065 if (coding->result == CODING_FINISH_INSUFFICIENT_SRC
5066 && coding->consumed == src_bytes)
5067 coding->result = CODING_FINISH_NORMAL;
5068
5069 return coding->result;
5070 }
5071
5072 /* Scan text in the region between *BEG and *END (byte positions),
5073 skip characters which we don't have to decode by coding system
5074 CODING at the head and tail, then set *BEG and *END to the region
5075 of the text we actually have to convert. The caller should move
5076 the gap out of the region in advance if the region is from a
5077 buffer.
5078
5079 If STR is not NULL, *BEG and *END are indices into STR. */
5080
5081 static void
5082 shrink_decoding_region (beg, end, coding, str)
5083 int *beg, *end;
5084 struct coding_system *coding;
5085 unsigned char *str;
5086 {
5087 unsigned char *begp_orig, *begp, *endp_orig, *endp, c;
5088 int eol_conversion;
5089 Lisp_Object translation_table;
5090
5091 if (coding->type == coding_type_ccl
5092 || coding->type == coding_type_undecided
5093 || coding->eol_type != CODING_EOL_LF
5094 || !NILP (coding->post_read_conversion)
5095 || coding->composing != COMPOSITION_DISABLED)
5096 {
5097 /* We can't skip any data. */
5098 return;
5099 }
5100 if (coding->type == coding_type_no_conversion
5101 || coding->type == coding_type_raw_text
5102 || coding->type == coding_type_emacs_mule)
5103 {
5104 /* We need no conversion, but don't have to skip any data here.
5105 Decoding routine handles them effectively anyway. */
5106 return;
5107 }
5108
5109 translation_table = coding->translation_table_for_decode;
5110 if (NILP (translation_table) && !NILP (Venable_character_translation))
5111 translation_table = Vstandard_translation_table_for_decode;
5112 if (CHAR_TABLE_P (translation_table))
5113 {
5114 int i;
5115 for (i = 0; i < 128; i++)
5116 if (!NILP (CHAR_TABLE_REF (translation_table, i)))
5117 break;
5118 if (i < 128)
5119 /* Some ASCII character should be translated. We give up
5120 shrinking. */
5121 return;
5122 }
5123
5124 if (coding->heading_ascii >= 0)
5125 /* Detection routine has already found how much we can skip at the
5126 head. */
5127 *beg += coding->heading_ascii;
5128
5129 if (str)
5130 {
5131 begp_orig = begp = str + *beg;
5132 endp_orig = endp = str + *end;
5133 }
5134 else
5135 {
5136 begp_orig = begp = BYTE_POS_ADDR (*beg);
5137 endp_orig = endp = begp + *end - *beg;
5138 }
5139
5140 eol_conversion = (coding->eol_type == CODING_EOL_CR
5141 || coding->eol_type == CODING_EOL_CRLF);
5142
5143 switch (coding->type)
5144 {
5145 case coding_type_sjis:
5146 case coding_type_big5:
5147 /* We can skip all ASCII characters at the head. */
5148 if (coding->heading_ascii < 0)
5149 {
5150 if (eol_conversion)
5151 while (begp < endp && *begp < 0x80 && *begp != '\r') begp++;
5152 else
5153 while (begp < endp && *begp < 0x80) begp++;
5154 }
5155 /* We can skip all ASCII characters at the tail except for the
5156 second byte of SJIS or BIG5 code. */
5157 if (eol_conversion)
5158 while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\r') endp--;
5159 else
5160 while (begp < endp && endp[-1] < 0x80) endp--;
5161 /* Do not consider LF as ascii if preceded by CR, since that
5162 confuses eol decoding. */
5163 if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
5164 endp++;
5165 if (begp < endp && endp < endp_orig && endp[-1] >= 0x80)
5166 endp++;
5167 break;
5168
5169 case coding_type_iso2022:
5170 if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
5171 /* We can't skip any data. */
5172 break;
5173 if (coding->heading_ascii < 0)
5174 {
5175 /* We can skip all ASCII characters at the head except for a
5176 few control codes. */
5177 while (begp < endp && (c = *begp) < 0x80
5178 && c != ISO_CODE_CR && c != ISO_CODE_SO
5179 && c != ISO_CODE_SI && c != ISO_CODE_ESC
5180 && (!eol_conversion || c != ISO_CODE_LF))
5181 begp++;
5182 }
5183 switch (coding->category_idx)
5184 {
5185 case CODING_CATEGORY_IDX_ISO_8_1:
5186 case CODING_CATEGORY_IDX_ISO_8_2:
5187 /* We can skip all ASCII characters at the tail. */
5188 if (eol_conversion)
5189 while (begp < endp && (c = endp[-1]) < 0x80 && c != '\r') endp--;
5190 else
5191 while (begp < endp && endp[-1] < 0x80) endp--;
5192 /* Do not consider LF as ascii if preceded by CR, since that
5193 confuses eol decoding. */
5194 if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
5195 endp++;
5196 break;
5197
5198 case CODING_CATEGORY_IDX_ISO_7:
5199 case CODING_CATEGORY_IDX_ISO_7_TIGHT:
5200 {
5201 /* We can skip all characters at the tail except for 8-bit
5202 codes and ESC and the following 2-byte at the tail. */
5203 unsigned char *eight_bit = NULL;
5204
5205 if (eol_conversion)
5206 while (begp < endp
5207 && (c = endp[-1]) != ISO_CODE_ESC && c != '\r')
5208 {
5209 if (!eight_bit && c & 0x80) eight_bit = endp;
5210 endp--;
5211 }
5212 else
5213 while (begp < endp
5214 && (c = endp[-1]) != ISO_CODE_ESC)
5215 {
5216 if (!eight_bit && c & 0x80) eight_bit = endp;
5217 endp--;
5218 }
5219 /* Do not consider LF as ascii if preceded by CR, since that
5220 confuses eol decoding. */
5221 if (begp < endp && endp < endp_orig
5222 && endp[-1] == '\r' && endp[0] == '\n')
5223 endp++;
5224 if (begp < endp && endp[-1] == ISO_CODE_ESC)
5225 {
5226 if (endp + 1 < endp_orig && end[0] == '(' && end[1] == 'B')
5227 /* This is an ASCII designation sequence. We can
5228 surely skip the tail. But, if we have
5229 encountered an 8-bit code, skip only the codes
5230 after that. */
5231 endp = eight_bit ? eight_bit : endp + 2;
5232 else
5233 /* Hmmm, we can't skip the tail. */
5234 endp = endp_orig;
5235 }
5236 else if (eight_bit)
5237 endp = eight_bit;
5238 }
5239 }
5240 break;
5241
5242 default:
5243 abort ();
5244 }
5245 *beg += begp - begp_orig;
5246 *end += endp - endp_orig;
5247 return;
5248 }
5249
5250 /* Like shrink_decoding_region but for encoding. */
5251
5252 static void
5253 shrink_encoding_region (beg, end, coding, str)
5254 int *beg, *end;
5255 struct coding_system *coding;
5256 unsigned char *str;
5257 {
5258 unsigned char *begp_orig, *begp, *endp_orig, *endp;
5259 int eol_conversion;
5260 Lisp_Object translation_table;
5261
5262 if (coding->type == coding_type_ccl
5263 || coding->eol_type == CODING_EOL_CRLF
5264 || coding->eol_type == CODING_EOL_CR
5265 || (coding->cmp_data && coding->cmp_data->used > 0))
5266 {
5267 /* We can't skip any data. */
5268 return;
5269 }
5270 if (coding->type == coding_type_no_conversion
5271 || coding->type == coding_type_raw_text
5272 || coding->type == coding_type_emacs_mule
5273 || coding->type == coding_type_undecided)
5274 {
5275 /* We need no conversion, but don't have to skip any data here.
5276 Encoding routine handles them effectively anyway. */
5277 return;
5278 }
5279
5280 translation_table = coding->translation_table_for_encode;
5281 if (NILP (translation_table) && !NILP (Venable_character_translation))
5282 translation_table = Vstandard_translation_table_for_encode;
5283 if (CHAR_TABLE_P (translation_table))
5284 {
5285 int i;
5286 for (i = 0; i < 128; i++)
5287 if (!NILP (CHAR_TABLE_REF (translation_table, i)))
5288 break;
5289 if (i < 128)
5290 /* Some ASCII character should be translated. We give up
5291 shrinking. */
5292 return;
5293 }
5294
5295 if (str)
5296 {
5297 begp_orig = begp = str + *beg;
5298 endp_orig = endp = str + *end;
5299 }
5300 else
5301 {
5302 begp_orig = begp = BYTE_POS_ADDR (*beg);
5303 endp_orig = endp = begp + *end - *beg;
5304 }
5305
5306 eol_conversion = (coding->eol_type == CODING_EOL_CR
5307 || coding->eol_type == CODING_EOL_CRLF);
5308
5309 /* Here, we don't have to check coding->pre_write_conversion because
5310 the caller is expected to have handled it already. */
5311 switch (coding->type)
5312 {
5313 case coding_type_iso2022:
5314 if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
5315 /* We can't skip any data. */
5316 break;
5317 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL)
5318 {
5319 unsigned char *bol = begp;
5320 while (begp < endp && *begp < 0x80)
5321 {
5322 begp++;
5323 if (begp[-1] == '\n')
5324 bol = begp;
5325 }
5326 begp = bol;
5327 goto label_skip_tail;
5328 }
5329 /* fall down ... */
5330
5331 case coding_type_sjis:
5332 case coding_type_big5:
5333 /* We can skip all ASCII characters at the head and tail. */
5334 if (eol_conversion)
5335 while (begp < endp && *begp < 0x80 && *begp != '\n') begp++;
5336 else
5337 while (begp < endp && *begp < 0x80) begp++;
5338 label_skip_tail:
5339 if (eol_conversion)
5340 while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\n') endp--;
5341 else
5342 while (begp < endp && *(endp - 1) < 0x80) endp--;
5343 break;
5344
5345 default:
5346 abort ();
5347 }
5348
5349 *beg += begp - begp_orig;
5350 *end += endp - endp_orig;
5351 return;
5352 }
5353
/* Shrinking the conversion region has some overhead of its own, so it
   is attempted only when the region is longer than this many
   bytes.  */
static int shrink_conversion_region_threshhold = 1024;

/* Shrink the region *BEG .. *END for conversion by CODING, using the
   encoding variant if ENCODEP is nonzero and the decoding variant
   otherwise.  Regions not longer than the threshold above are left
   alone.  */
#define SHRINK_CONVERSION_REGION(beg, end, coding, str, encodep)        \
  do {                                                                  \
    if (*(end) - *(beg) > shrink_conversion_region_threshhold)          \
      {                                                                 \
        if (!(encodep))                                                 \
          shrink_decoding_region (beg, end, coding, str);               \
        else                                                            \
          shrink_encoding_region (beg, end, coding, str);               \
      }                                                                 \
  } while (0)
5367
5368 /* ARG is (CODING BUFFER ...) where CODING is what to be set in
5369 Vlast_coding_system_used and the remaining elements are buffers to
5370 kill. */
5371 static Lisp_Object
5372 code_convert_region_unwind (arg)
5373 Lisp_Object arg;
5374 {
5375 struct gcpro gcpro1;
5376 GCPRO1 (arg);
5377
5378 inhibit_pre_post_conversion = 0;
5379 Vlast_coding_system_used = XCAR (arg);
5380 for (arg = XCDR (arg); ! NILP (arg); arg = XCDR (arg))
5381 Fkill_buffer (XCAR (arg));
5382
5383 UNGCPRO;
5384 return Qnil;
5385 }
5386
5387 /* Store information about all compositions in the range FROM and TO
5388 of OBJ in memory blocks pointed by CODING->cmp_data. OBJ is a
5389 buffer or a string, defaults to the current buffer. */
5390
5391 void
5392 coding_save_composition (coding, from, to, obj)
5393 struct coding_system *coding;
5394 int from, to;
5395 Lisp_Object obj;
5396 {
5397 Lisp_Object prop;
5398 int start, end;
5399
5400 if (coding->composing == COMPOSITION_DISABLED)
5401 return;
5402 if (!coding->cmp_data)
5403 coding_allocate_composition_data (coding, from);
5404 if (!find_composition (from, to, &start, &end, &prop, obj)
5405 || end > to)
5406 return;
5407 if (start < from
5408 && (!find_composition (end, to, &start, &end, &prop, obj)
5409 || end > to))
5410 return;
5411 coding->composing = COMPOSITION_NO;
5412 do
5413 {
5414 if (COMPOSITION_VALID_P (start, end, prop))
5415 {
5416 enum composition_method method = COMPOSITION_METHOD (prop);
5417 if (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH
5418 >= COMPOSITION_DATA_SIZE)
5419 coding_allocate_composition_data (coding, from);
5420 /* For relative composition, we remember start and end
5421 positions, for the other compositions, we also remember
5422 components. */
5423 CODING_ADD_COMPOSITION_START (coding, start - from, method);
5424 if (method != COMPOSITION_RELATIVE)
5425 {
5426 /* We must store a*/
5427 Lisp_Object val, ch;
5428
5429 val = COMPOSITION_COMPONENTS (prop);
5430 if (CONSP (val))
5431 while (CONSP (val))
5432 {
5433 ch = XCAR (val), val = XCDR (val);
5434 CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (ch));
5435 }
5436 else if (VECTORP (val) || STRINGP (val))
5437 {
5438 int len = (VECTORP (val)
5439 ? XVECTOR (val)->size : SCHARS (val));
5440 int i;
5441 for (i = 0; i < len; i++)
5442 {
5443 ch = (STRINGP (val)
5444 ? Faref (val, make_number (i))
5445 : XVECTOR (val)->contents[i]);
5446 CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (ch));
5447 }
5448 }
5449 else /* INTEGERP (val) */
5450 CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (val));
5451 }
5452 CODING_ADD_COMPOSITION_END (coding, end - from);
5453 }
5454 start = end;
5455 }
5456 while (start < to
5457 && find_composition (start, to, &start, &end, &prop, obj)
5458 && end <= to);
5459
5460 /* Make coding->cmp_data point to the first memory block. */
5461 while (coding->cmp_data->prev)
5462 coding->cmp_data = coding->cmp_data->prev;
5463 coding->cmp_data_start = 0;
5464 }
5465
5466 /* Reflect the saved information about compositions to OBJ.
5467 CODING->cmp_data points to a memory block for the information. OBJ
5468 is a buffer or a string, defaults to the current buffer. */
5469
5470 void
5471 coding_restore_composition (coding, obj)
5472 struct coding_system *coding;
5473 Lisp_Object obj;
5474 {
5475 struct composition_data *cmp_data = coding->cmp_data;
5476
5477 if (!cmp_data)
5478 return;
5479
5480 while (cmp_data->prev)
5481 cmp_data = cmp_data->prev;
5482
5483 while (cmp_data)
5484 {
5485 int i;
5486
5487 for (i = 0; i < cmp_data->used && cmp_data->data[i] > 0;
5488 i += cmp_data->data[i])
5489 {
5490 int *data = cmp_data->data + i;
5491 enum composition_method method = (enum composition_method) data[3];
5492 Lisp_Object components;
5493
5494 if (data[0] < 0 || i + data[0] > cmp_data->used)
5495 /* Invalid composition data. */
5496 break;
5497
5498 if (method == COMPOSITION_RELATIVE)
5499 components = Qnil;
5500 else
5501 {
5502 int len = data[0] - 4, j;
5503 Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
5504
5505 if (method == COMPOSITION_WITH_RULE_ALTCHARS
5506 && len % 2 == 0)
5507 len --;
5508 if (len < 1)
5509 /* Invalid composition data. */
5510 break;
5511 for (j = 0; j < len; j++)
5512 args[j] = make_number (data[4 + j]);
5513 components = (method == COMPOSITION_WITH_ALTCHARS
5514 ? Fstring (len, args)
5515 : Fvector (len, args));
5516 }
5517 compose_text (data[1], data[2], components, Qnil, obj);
5518 }
5519 cmp_data = cmp_data->next;
5520 }
5521 }
5522
5523 /* Decode (if ENCODEP is zero) or encode (if ENCODEP is nonzero) the
5524 text from FROM to TO (byte positions are FROM_BYTE and TO_BYTE) by
5525 coding system CODING, and return the status code of code conversion
5526 (currently, this value has no meaning).
5527
5528 How many characters (and bytes) are converted to how many
5529 characters (and bytes) are recorded in members of the structure
5530 CODING.
5531
5532 If REPLACE is nonzero, we do various things as if the original text
5533 is deleted and a new text is inserted. See the comments in
5534 replace_range (insdel.c) to know what we are doing.
5535
5536 If REPLACE is zero, it is assumed that the source text is unibyte.
5537 Otherwise, it is assumed that the source text is multibyte. */
5538
5539 int
5540 code_convert_region (from, from_byte, to, to_byte, coding, encodep, replace)
5541 int from, from_byte, to, to_byte, encodep, replace;
5542 struct coding_system *coding;
5543 {
5544 int len = to - from, len_byte = to_byte - from_byte;
5545 int nchars_del = 0, nbytes_del = 0;
5546 int require, inserted, inserted_byte;
5547 int head_skip, tail_skip, total_skip = 0;
5548 Lisp_Object saved_coding_symbol;
5549 int first = 1;
5550 unsigned char *src, *dst;
5551 Lisp_Object deletion;
5552 int orig_point = PT, orig_len = len;
5553 int prev_Z;
5554 int multibyte_p = !NILP (current_buffer->enable_multibyte_characters);
5555
5556 deletion = Qnil;
5557 saved_coding_symbol = coding->symbol;
5558
5559 if (from < PT && PT < to)
5560 {
5561 TEMP_SET_PT_BOTH (from, from_byte);
5562 orig_point = from;
5563 }
5564
5565 if (replace)
5566 {
5567 int saved_from = from;
5568 int saved_inhibit_modification_hooks;
5569
5570 prepare_to_modify_buffer (from, to, &from);
5571 if (saved_from != from)
5572 {
5573 to = from + len;
5574 from_byte = CHAR_TO_BYTE (from), to_byte = CHAR_TO_BYTE (to);
5575 len_byte = to_byte - from_byte;
5576 }
5577
5578 /* The code conversion routine can not preserve text properties
5579 for now. So, we must remove all text properties in the
5580 region. Here, we must suppress all modification hooks. */
5581 saved_inhibit_modification_hooks = inhibit_modification_hooks;
5582 inhibit_modification_hooks = 1;
5583 Fset_text_properties (make_number (from), make_number (to), Qnil, Qnil);
5584 inhibit_modification_hooks = saved_inhibit_modification_hooks;
5585 }
5586
5587 coding->heading_ascii = 0;
5588
5589 if (! encodep && CODING_REQUIRE_DETECTION (coding))
5590 {
5591 /* We must detect encoding of text and eol format. */
5592
5593 if (from < GPT && to > GPT)
5594 move_gap_both (from, from_byte);
5595 if (coding->type == coding_type_undecided)
5596 {
5597 detect_coding (coding, BYTE_POS_ADDR (from_byte), len_byte);
5598 if (coding->type == coding_type_undecided)
5599 {
5600 /* It seems that the text contains only ASCII, but we
5601 should not leave it undecided because the deeper
5602 decoding routine (decode_coding) tries to detect the
5603 encodings again in vain. */
5604 coding->type = coding_type_emacs_mule;
5605 coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE;
5606 /* As emacs-mule decoder will handle composition, we
5607 need this setting to allocate coding->cmp_data
5608 later. */
5609 coding->composing = COMPOSITION_NO;
5610 }
5611 }
5612 if (coding->eol_type == CODING_EOL_UNDECIDED
5613 && coding->type != coding_type_ccl)
5614 {
5615 detect_eol (coding, BYTE_POS_ADDR (from_byte), len_byte);
5616 if (coding->eol_type == CODING_EOL_UNDECIDED)
5617 coding->eol_type = CODING_EOL_LF;
5618 /* We had better recover the original eol format if we
5619 encounter an inconsistent eol format while decoding. */
5620 coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
5621 }
5622 }
5623
5624 /* Now we convert the text. */
5625
5626 /* For encoding, we must process pre-write-conversion in advance. */
5627 if (! inhibit_pre_post_conversion
5628 && encodep
5629 && SYMBOLP (coding->pre_write_conversion)
5630 && ! NILP (Ffboundp (coding->pre_write_conversion)))
5631 {
5632 /* The function in pre-write-conversion may put a new text in a
5633 new buffer. */
5634 struct buffer *prev = current_buffer;
5635 Lisp_Object new;
5636
5637 record_unwind_protect (code_convert_region_unwind,
5638 Fcons (Vlast_coding_system_used, Qnil));
5639 /* We should not call any more pre-write/post-read-conversion
5640 functions while this pre-write-conversion is running. */
5641 inhibit_pre_post_conversion = 1;
5642 call2 (coding->pre_write_conversion,
5643 make_number (from), make_number (to));
5644 inhibit_pre_post_conversion = 0;
5645 /* Discard the unwind protect. */
5646 specpdl_ptr--;
5647
5648 if (current_buffer != prev)
5649 {
5650 len = ZV - BEGV;
5651 new = Fcurrent_buffer ();
5652 set_buffer_internal_1 (prev);
5653 del_range_2 (from, from_byte, to, to_byte, 0);
5654 TEMP_SET_PT_BOTH (from, from_byte);
5655 insert_from_buffer (XBUFFER (new), 1, len, 0);
5656 Fkill_buffer (new);
5657 if (orig_point >= to)
5658 orig_point += len - orig_len;
5659 else if (orig_point > from)
5660 orig_point = from;
5661 orig_len = len;
5662 to = from + len;
5663 from_byte = CHAR_TO_BYTE (from);
5664 to_byte = CHAR_TO_BYTE (to);
5665 len_byte = to_byte - from_byte;
5666 TEMP_SET_PT_BOTH (from, from_byte);
5667 }
5668 }
5669
5670 if (replace)
5671 {
5672 if (! EQ (current_buffer->undo_list, Qt))
5673 deletion = make_buffer_string_both (from, from_byte, to, to_byte, 1);
5674 else
5675 {
5676 nchars_del = to - from;
5677 nbytes_del = to_byte - from_byte;
5678 }
5679 }
5680
5681 if (coding->composing != COMPOSITION_DISABLED)
5682 {
5683 if (encodep)
5684 coding_save_composition (coding, from, to, Fcurrent_buffer ());
5685 else
5686 coding_allocate_composition_data (coding, from);
5687 }
5688
5689 /* Try to skip the heading and tailing ASCIIs. We can't skip them
5690 if we must run CCL program or there are compositions to
5691 encode. */
5692 if (coding->type != coding_type_ccl
5693 && (! coding->cmp_data || coding->cmp_data->used == 0))
5694 {
5695 int from_byte_orig = from_byte, to_byte_orig = to_byte;
5696
5697 if (from < GPT && GPT < to)
5698 move_gap_both (from, from_byte);
5699 SHRINK_CONVERSION_REGION (&from_byte, &to_byte, coding, NULL, encodep);
5700 if (from_byte == to_byte
5701 && (encodep || NILP (coding->post_read_conversion))
5702 && ! CODING_REQUIRE_FLUSHING (coding))
5703 {
5704 coding->produced = len_byte;
5705 coding->produced_char = len;
5706 if (!replace)
5707 /* We must record and adjust for this new text now. */
5708 adjust_after_insert (from, from_byte_orig, to, to_byte_orig, len);
5709 coding_free_composition_data (coding);
5710 return 0;
5711 }
5712
5713 head_skip = from_byte - from_byte_orig;
5714 tail_skip = to_byte_orig - to_byte;
5715 total_skip = head_skip + tail_skip;
5716 from += head_skip;
5717 to -= tail_skip;
5718 len -= total_skip; len_byte -= total_skip;
5719 }
5720
5721 /* For conversion, we must put the gap before the text in addition to
5722 making the gap larger for efficient decoding. The required gap
5723 size starts from 2000 which is the magic number used in make_gap.
5724 But, after one batch of conversion, it will be incremented if we
5725 find that it is not enough . */
5726 require = 2000;
5727
5728 if (GAP_SIZE < require)
5729 make_gap (require - GAP_SIZE);
5730 move_gap_both (from, from_byte);
5731
5732 inserted = inserted_byte = 0;
5733
5734 GAP_SIZE += len_byte;
5735 ZV -= len;
5736 Z -= len;
5737 ZV_BYTE -= len_byte;
5738 Z_BYTE -= len_byte;
5739
5740 if (GPT - BEG < BEG_UNCHANGED)
5741 BEG_UNCHANGED = GPT - BEG;
5742 if (Z - GPT < END_UNCHANGED)
5743 END_UNCHANGED = Z - GPT;
5744
5745 if (!encodep && coding->src_multibyte)
5746 {
5747 /* Decoding routines expects that the source text is unibyte.
5748 We must convert 8-bit characters of multibyte form to
5749 unibyte. */
5750 int len_byte_orig = len_byte;
5751 len_byte = str_as_unibyte (GAP_END_ADDR - len_byte, len_byte);
5752 if (len_byte < len_byte_orig)
5753 safe_bcopy (GAP_END_ADDR - len_byte_orig, GAP_END_ADDR - len_byte,
5754 len_byte);
5755 coding->src_multibyte = 0;
5756 }
5757
5758 for (;;)
5759 {
5760 int result;
5761
5762 /* The buffer memory is now:
5763 +--------+converted-text+---------+-------original-text-------+---+
5764 |<-from->|<--inserted-->|---------|<--------len_byte--------->|---|
5765 |<---------------------- GAP ----------------------->| */
5766 src = GAP_END_ADDR - len_byte;
5767 dst = GPT_ADDR + inserted_byte;
5768
5769 if (encodep)
5770 result = encode_coding (coding, src, dst, len_byte, 0);
5771 else
5772 {
5773 if (coding->composing != COMPOSITION_DISABLED)
5774 coding->cmp_data->char_offset = from + inserted;
5775 result = decode_coding (coding, src, dst, len_byte, 0);
5776 }
5777
5778 /* The buffer memory is now:
5779 +--------+-------converted-text----+--+------original-text----+---+
5780 |<-from->|<-inserted->|<-produced->|--|<-(len_byte-consumed)->|---|
5781 |<---------------------- GAP ----------------------->| */
5782
5783 inserted += coding->produced_char;
5784 inserted_byte += coding->produced;
5785 len_byte -= coding->consumed;
5786
5787 if (result == CODING_FINISH_INSUFFICIENT_CMP)
5788 {
5789 coding_allocate_composition_data (coding, from + inserted);
5790 continue;
5791 }
5792
5793 src += coding->consumed;
5794 dst += coding->produced;
5795
5796 if (result == CODING_FINISH_NORMAL)
5797 {
5798 src += len_byte;
5799 break;
5800 }
5801 if (! encodep && result == CODING_FINISH_INCONSISTENT_EOL)
5802 {
5803 unsigned char *pend = dst, *p = pend - inserted_byte;
5804 Lisp_Object eol_type;
5805
5806 /* Encode LFs back to the original eol format (CR or CRLF). */
5807 if (coding->eol_type == CODING_EOL_CR)
5808 {
5809 while (p < pend) if (*p++ == '\n') p[-1] = '\r';
5810 }
5811 else
5812 {
5813 int count = 0;
5814
5815 while (p < pend) if (*p++ == '\n') count++;
5816 if (src - dst < count)
5817 {
5818 /* We don't have sufficient room for encoding LFs
5819 back to CRLF. We must record converted and
5820 not-yet-converted text back to the buffer
5821 content, enlarge the gap, then record them out of
5822 the buffer contents again. */
5823 int add = len_byte + inserted_byte;
5824
5825 GAP_SIZE -= add;
5826 ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
5827 GPT += inserted_byte; GPT_BYTE += inserted_byte;
5828 make_gap (count - GAP_SIZE);
5829 GAP_SIZE += add;
5830 ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
5831 GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5832 /* Don't forget to update SRC, DST, and PEND. */
5833 src = GAP_END_ADDR - len_byte;
5834 dst = GPT_ADDR + inserted_byte;
5835 pend = dst;
5836 }
5837 inserted += count;
5838 inserted_byte += count;
5839 coding->produced += count;
5840 p = dst = pend + count;
5841 while (count)
5842 {
5843 *--p = *--pend;
5844 if (*p == '\n') count--, *--p = '\r';
5845 }
5846 }
5847
5848 /* Suppress eol-format conversion in the further conversion. */
5849 coding->eol_type = CODING_EOL_LF;
5850
5851 /* Set the coding system symbol to that for Unix-like EOL. */
5852 eol_type = Fget (saved_coding_symbol, Qeol_type);
5853 if (VECTORP (eol_type)
5854 && XVECTOR (eol_type)->size == 3
5855 && SYMBOLP (XVECTOR (eol_type)->contents[CODING_EOL_LF]))
5856 coding->symbol = XVECTOR (eol_type)->contents[CODING_EOL_LF];
5857 else
5858 coding->symbol = saved_coding_symbol;
5859
5860 continue;
5861 }
5862 if (len_byte <= 0)
5863 {
5864 if (coding->type != coding_type_ccl
5865 || coding->mode & CODING_MODE_LAST_BLOCK)
5866 break;
5867 coding->mode |= CODING_MODE_LAST_BLOCK;
5868 continue;
5869 }
5870 if (result == CODING_FINISH_INSUFFICIENT_SRC)
5871 {
5872 /* The source text ends in invalid codes. Let's just
5873 make them valid buffer contents, and finish conversion. */
5874 if (multibyte_p)
5875 {
5876 unsigned char *start = dst;
5877
5878 inserted += len_byte;
5879 while (len_byte--)
5880 {
5881 int c = *src++;
5882 dst += CHAR_STRING (c, dst);
5883 }
5884
5885 inserted_byte += dst - start;
5886 }
5887 else
5888 {
5889 inserted += len_byte;
5890 inserted_byte += len_byte;
5891 while (len_byte--)
5892 *dst++ = *src++;
5893 }
5894 break;
5895 }
5896 if (result == CODING_FINISH_INTERRUPT)
5897 {
5898 /* The conversion procedure was interrupted by a user. */
5899 break;
5900 }
5901 /* Now RESULT == CODING_FINISH_INSUFFICIENT_DST */
5902 if (coding->consumed < 1)
5903 {
5904 /* It's quite strange to require more memory without
5905 consuming any bytes. Perhaps CCL program bug. */
5906 break;
5907 }
5908 if (first)
5909 {
5910 /* We have just done the first batch of conversion which was
5911 stopped because of insufficient gap. Let's reconsider the
5912 required gap size (i.e. SRT - DST) now.
5913
5914 We have converted ORIG bytes (== coding->consumed) into
5915 NEW bytes (coding->produced). To convert the remaining
5916 LEN bytes, we may need REQUIRE bytes of gap, where:
5917 REQUIRE + LEN_BYTE = LEN_BYTE * (NEW / ORIG)
5918 REQUIRE = LEN_BYTE * (NEW - ORIG) / ORIG
5919 Here, we are sure that NEW >= ORIG. */
5920
5921 if (coding->produced <= coding->consumed)
5922 {
5923 /* This happens because of CCL-based coding system with
5924 eol-type CRLF. */
5925 require = 0;
5926 }
5927 else
5928 {
5929 float ratio = coding->produced - coding->consumed;
5930 ratio /= coding->consumed;
5931 require = len_byte * ratio;
5932 }
5933 first = 0;
5934 }
5935 if ((src - dst) < (require + 2000))
5936 {
5937 /* See the comment above the previous call of make_gap. */
5938 int add = len_byte + inserted_byte;
5939
5940 GAP_SIZE -= add;
5941 ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
5942 GPT += inserted_byte; GPT_BYTE += inserted_byte;
5943 make_gap (require + 2000);
5944 GAP_SIZE += add;
5945 ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
5946 GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5947 }
5948 }
5949 if (src - dst > 0) *dst = 0; /* Put an anchor. */
5950
5951 if (encodep && coding->dst_multibyte)
5952 {
5953 /* The output is unibyte. We must convert 8-bit characters to
5954 multibyte form. */
5955 if (inserted_byte * 2 > GAP_SIZE)
5956 {
5957 GAP_SIZE -= inserted_byte;
5958 ZV += inserted_byte; Z += inserted_byte;
5959 ZV_BYTE += inserted_byte; Z_BYTE += inserted_byte;
5960 GPT += inserted_byte; GPT_BYTE += inserted_byte;
5961 make_gap (inserted_byte - GAP_SIZE);
5962 GAP_SIZE += inserted_byte;
5963 ZV -= inserted_byte; Z -= inserted_byte;
5964 ZV_BYTE -= inserted_byte; Z_BYTE -= inserted_byte;
5965 GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5966 }
5967 inserted_byte = str_to_multibyte (GPT_ADDR, GAP_SIZE, inserted_byte);
5968 }
5969
5970 /* If we shrank the conversion area, adjust it now. */
5971 if (total_skip > 0)
5972 {
5973 if (tail_skip > 0)
5974 safe_bcopy (GAP_END_ADDR, GPT_ADDR + inserted_byte, tail_skip);
5975 inserted += total_skip; inserted_byte += total_skip;
5976 GAP_SIZE += total_skip;
5977 GPT -= head_skip; GPT_BYTE -= head_skip;
5978 ZV -= total_skip; ZV_BYTE -= total_skip;
5979 Z -= total_skip; Z_BYTE -= total_skip;
5980 from -= head_skip; from_byte -= head_skip;
5981 to += tail_skip; to_byte += tail_skip;
5982 }
5983
5984 prev_Z = Z;
5985 if (! EQ (current_buffer->undo_list, Qt))
5986 adjust_after_replace (from, from_byte, deletion, inserted, inserted_byte);
5987 else
5988 adjust_after_replace_noundo (from, from_byte, nchars_del, nbytes_del,
5989 inserted, inserted_byte);
5990 inserted = Z - prev_Z;
5991
5992 if (!encodep && coding->cmp_data && coding->cmp_data->used)
5993 coding_restore_composition (coding, Fcurrent_buffer ());
5994 coding_free_composition_data (coding);
5995
5996 if (! inhibit_pre_post_conversion
5997 && ! encodep && ! NILP (coding->post_read_conversion))
5998 {
5999 Lisp_Object val;
6000 Lisp_Object saved_coding_system;
6001
6002 if (from != PT)
6003 TEMP_SET_PT_BOTH (from, from_byte);
6004 prev_Z = Z;
6005 record_unwind_protect (code_convert_region_unwind,
6006 Fcons (Vlast_coding_system_used, Qnil));
6007 saved_coding_system = Vlast_coding_system_used;
6008 Vlast_coding_system_used = coding->symbol;
6009 /* We should not call any more pre-write/post-read-conversion
6010 functions while this post-read-conversion is running. */
6011 inhibit_pre_post_conversion = 1;
6012 val = call1 (coding->post_read_conversion, make_number (inserted));
6013 inhibit_pre_post_conversion = 0;
6014 coding->symbol = Vlast_coding_system_used;
6015 Vlast_coding_system_used = saved_coding_system;
6016 /* Discard the unwind protect. */
6017 specpdl_ptr--;
6018 CHECK_NUMBER (val);
6019 inserted += Z - prev_Z;
6020 }
6021
6022 if (orig_point >= from)
6023 {
6024 if (orig_point >= from + orig_len)
6025 orig_point += inserted - orig_len;
6026 else
6027 orig_point = from;
6028 TEMP_SET_PT (orig_point);
6029 }
6030
6031 if (replace)
6032 {
6033 signal_after_change (from, to - from, inserted);
6034 update_compositions (from, from + inserted, CHECK_BORDER);
6035 }
6036
6037 {
6038 coding->consumed = to_byte - from_byte;
6039 coding->consumed_char = to - from;
6040 coding->produced = inserted_byte;
6041 coding->produced_char = inserted;
6042 }
6043
6044 return 0;
6045 }
6046
6047 /* Name (or base name) of work buffer for code conversion. */
6048 static Lisp_Object Vcode_conversion_workbuf_name;
6049
6050 /* Set the current buffer to the working buffer prepared for
6051 code-conversion. MULTIBYTE specifies the multibyteness of the
6052 buffer. Return the buffer we set if it must be killed after use.
6053 Otherwise return Qnil. */
6054
6055 static Lisp_Object
6056 set_conversion_work_buffer (multibyte)
6057 int multibyte;
6058 {
6059 Lisp_Object buffer, buffer_to_kill;
6060 struct buffer *buf;
6061
6062 buffer = Fget_buffer_create (Vcode_conversion_workbuf_name);
6063 buf = XBUFFER (buffer);
6064 if (buf == current_buffer)
6065 {
6066 /* As we are already in the work buffer, we must generate a new
6067 buffer for the work. */
6068 Lisp_Object name;
6069
6070 name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
6071 buffer = buffer_to_kill = Fget_buffer_create (name);
6072 buf = XBUFFER (buffer);
6073 }
6074 else
6075 buffer_to_kill = Qnil;
6076
6077 delete_all_overlays (buf);
6078 buf->directory = current_buffer->directory;
6079 buf->read_only = Qnil;
6080 buf->filename = Qnil;
6081 buf->undo_list = Qt;
6082 eassert (buf->overlays_before == NULL);
6083 eassert (buf->overlays_after == NULL);
6084 set_buffer_internal (buf);
6085 if (BEG != BEGV || Z != ZV)
6086 Fwiden ();
6087 del_range_2 (BEG, BEG_BYTE, Z, Z_BYTE, 0);
6088 buf->enable_multibyte_characters = multibyte ? Qt : Qnil;
6089 return buffer_to_kill;
6090 }
6091
6092 Lisp_Object
6093 run_pre_post_conversion_on_str (str, coding, encodep)
6094 Lisp_Object str;
6095 struct coding_system *coding;
6096 int encodep;
6097 {
6098 int count = SPECPDL_INDEX ();
6099 struct gcpro gcpro1, gcpro2;
6100 int multibyte = STRING_MULTIBYTE (str);
6101 Lisp_Object old_deactivate_mark;
6102 Lisp_Object buffer_to_kill;
6103 Lisp_Object unwind_arg;
6104
6105 record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
6106 /* It is not crucial to specbind this. */
6107 old_deactivate_mark = Vdeactivate_mark;
6108 GCPRO2 (str, old_deactivate_mark);
6109
6110 /* We must insert the contents of STR as is without
6111 unibyte<->multibyte conversion. For that, we adjust the
6112 multibyteness of the working buffer to that of STR. */
6113 buffer_to_kill = set_conversion_work_buffer (multibyte);
6114 if (NILP (buffer_to_kill))
6115 unwind_arg = Fcons (Vlast_coding_system_used, Qnil);
6116 else
6117 unwind_arg = list2 (Vlast_coding_system_used, buffer_to_kill);
6118 record_unwind_protect (code_convert_region_unwind, unwind_arg);
6119
6120 insert_from_string (str, 0, 0,
6121 SCHARS (str), SBYTES (str), 0);
6122 UNGCPRO;
6123 inhibit_pre_post_conversion = 1;
6124 if (encodep)
6125 {
6126 struct buffer *prev = current_buffer;
6127
6128 call2 (coding->pre_write_conversion, make_number (BEG), make_number (Z));
6129 if (prev != current_buffer)
6130 /* We must kill the current buffer too. */
6131 Fsetcdr (unwind_arg, Fcons (Fcurrent_buffer (), XCDR (unwind_arg)));
6132 }
6133 else
6134 {
6135 Vlast_coding_system_used = coding->symbol;
6136 TEMP_SET_PT_BOTH (BEG, BEG_BYTE);
6137 call1 (coding->post_read_conversion, make_number (Z - BEG));
6138 coding->symbol = Vlast_coding_system_used;
6139 }
6140 inhibit_pre_post_conversion = 0;
6141 Vdeactivate_mark = old_deactivate_mark;
6142 str = make_buffer_string (BEG, Z, 1);
6143 return unbind_to (count, str);
6144 }
6145
6146
6147 /* Run pre-write-conversion function of CODING on NCHARS/NBYTES
6148 text in *STR. *SIZE is the allocated bytes for STR. As it
6149 is intended that this function is called from encode_terminal_code,
6150 the pre-write-conversion function is run by safe_call and thus
6151 "Error during redisplay: ..." is logged when an error occurs.
6152
6153 Store the resulting text in *STR and set CODING->produced_char and
6154 CODING->produced to the number of characters and bytes
6155 respectively. If the size of *STR is too small, enlarge it by
6156 xrealloc and update *STR and *SIZE. */
6157
6158 void
6159 run_pre_write_conversin_on_c_str (str, size, nchars, nbytes, coding)
6160 unsigned char **str;
6161 int *size, nchars, nbytes;
6162 struct coding_system *coding;
6163 {
6164 struct gcpro gcpro1, gcpro2;
6165 struct buffer *cur = current_buffer;
6166 struct buffer *prev;
6167 Lisp_Object old_deactivate_mark, old_last_coding_system_used;
6168 Lisp_Object args[3];
6169 Lisp_Object buffer_to_kill;
6170
6171 /* It is not crucial to specbind this. */
6172 old_deactivate_mark = Vdeactivate_mark;
6173 old_last_coding_system_used = Vlast_coding_system_used;
6174 GCPRO2 (old_deactivate_mark, old_last_coding_system_used);
6175
6176 /* We must insert the contents of STR as is without
6177 unibyte<->multibyte conversion. For that, we adjust the
6178 multibyteness of the working buffer to that of STR. */
6179 buffer_to_kill = set_conversion_work_buffer (coding->src_multibyte);
6180 insert_1_both (*str, nchars, nbytes, 0, 0, 0);
6181 UNGCPRO;
6182 inhibit_pre_post_conversion = 1;
6183 prev = current_buffer;
6184 args[0] = coding->pre_write_conversion;
6185 args[1] = make_number (BEG);
6186 args[2] = make_number (Z);
6187 safe_call (3, args);
6188 inhibit_pre_post_conversion = 0;
6189 Vdeactivate_mark = old_deactivate_mark;
6190 Vlast_coding_system_used = old_last_coding_system_used;
6191 coding->produced_char = Z - BEG;
6192 coding->produced = Z_BYTE - BEG_BYTE;
6193 if (coding->produced > *size)
6194 {
6195 *size = coding->produced;
6196 *str = xrealloc (*str, *size);
6197 }
6198 if (BEG < GPT && GPT < Z)
6199 move_gap (BEG);
6200 bcopy (BEG_ADDR, *str, coding->produced);
6201 coding->src_multibyte
6202 = ! NILP (current_buffer->enable_multibyte_characters);
6203 if (prev != current_buffer)
6204 Fkill_buffer (Fcurrent_buffer ());
6205 set_buffer_internal (cur);
6206 if (! NILP (buffer_to_kill))
6207 Fkill_buffer (buffer_to_kill);
6208 }
6209
6210
6211 Lisp_Object
6212 decode_coding_string (str, coding, nocopy)
6213 Lisp_Object str;
6214 struct coding_system *coding;
6215 int nocopy;
6216 {
6217 int len;
6218 struct conversion_buffer buf;
6219 int from, to_byte;
6220 Lisp_Object saved_coding_symbol;
6221 int result;
6222 int require_decoding;
6223 int shrinked_bytes = 0;
6224 Lisp_Object newstr;
6225 int consumed, consumed_char, produced, produced_char;
6226
6227 from = 0;
6228 to_byte = SBYTES (str);
6229
6230 saved_coding_symbol = coding->symbol;
6231 coding->src_multibyte = STRING_MULTIBYTE (str);
6232 coding->dst_multibyte = 1;
6233 coding->heading_ascii = 0;
6234
6235 if (CODING_REQUIRE_DETECTION (coding))
6236 {
6237 /* See the comments in code_convert_region. */
6238 if (coding->type == coding_type_undecided)
6239 {
6240 detect_coding (coding, SDATA (str), to_byte);
6241 if (coding->type == coding_type_undecided)
6242 {
6243 coding->type = coding_type_emacs_mule;
6244 coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE;
6245 /* As emacs-mule decoder will handle composition, we
6246 need this setting to allocate coding->cmp_data
6247 later. */
6248 coding->composing = COMPOSITION_NO;
6249 }
6250 }
6251 if (coding->eol_type == CODING_EOL_UNDECIDED
6252 && coding->type != coding_type_ccl)
6253 {
6254 saved_coding_symbol = coding->symbol;
6255 detect_eol (coding, SDATA (str), to_byte);
6256 if (coding->eol_type == CODING_EOL_UNDECIDED)
6257 coding->eol_type = CODING_EOL_LF;
6258 /* We had better recover the original eol format if we
6259 encounter an inconsistent eol format while decoding. */
6260 coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
6261 }
6262 }
6263
6264 if (coding->type == coding_type_no_conversion
6265 || coding->type == coding_type_raw_text)
6266 coding->dst_multibyte = 0;
6267
6268 require_decoding = CODING_REQUIRE_DECODING (coding);
6269
6270 if (STRING_MULTIBYTE (str))
6271 {
6272 /* Decoding routines expect the source text to be unibyte. */
6273 str = Fstring_as_unibyte (str);
6274 to_byte = SBYTES (str);
6275 nocopy = 1;
6276 coding->src_multibyte = 0;
6277 }
6278
6279 /* Try to skip the heading and tailing ASCIIs. */
6280 if (require_decoding && coding->type != coding_type_ccl)
6281 {
6282 SHRINK_CONVERSION_REGION (&from, &to_byte, coding, SDATA (str),
6283 0);
6284 if (from == to_byte)
6285 require_decoding = 0;
6286 shrinked_bytes = from + (SBYTES (str) - to_byte);
6287 }
6288
6289 if (!require_decoding
6290 && !(SYMBOLP (coding->post_read_conversion)
6291 && !NILP (Ffboundp (coding->post_read_conversion))))
6292 {
6293 coding->consumed = SBYTES (str);
6294 coding->consumed_char = SCHARS (str);
6295 if (coding->dst_multibyte)
6296 {
6297 str = Fstring_as_multibyte (str);
6298 nocopy = 1;
6299 }
6300 coding->produced = SBYTES (str);
6301 coding->produced_char = SCHARS (str);
6302 return (nocopy ? str : Fcopy_sequence (str));
6303 }
6304
6305 if (coding->composing != COMPOSITION_DISABLED)
6306 coding_allocate_composition_data (coding, from);
6307 len = decoding_buffer_size (coding, to_byte - from);
6308 allocate_conversion_buffer (buf, len);
6309
6310 consumed = consumed_char = produced = produced_char = 0;
6311 while (1)
6312 {
6313 result = decode_coding (coding, SDATA (str) + from + consumed,
6314 buf.data + produced, to_byte - from - consumed,
6315 buf.size - produced);
6316 consumed += coding->consumed;
6317 consumed_char += coding->consumed_char;
6318 produced += coding->produced;
6319 produced_char += coding->produced_char;
6320 if (result == CODING_FINISH_NORMAL
6321 || result == CODING_FINISH_INTERRUPT
6322 || (result == CODING_FINISH_INSUFFICIENT_SRC
6323 && coding->consumed == 0))
6324 break;
6325 if (result == CODING_FINISH_INSUFFICIENT_CMP)
6326 coding_allocate_composition_data (coding, from + produced_char);
6327 else if (result == CODING_FINISH_INSUFFICIENT_DST)
6328 extend_conversion_buffer (&buf);
6329 else if (result == CODING_FINISH_INCONSISTENT_EOL)
6330 {
6331 Lisp_Object eol_type;
6332
6333 /* Recover the original EOL format. */
6334 if (coding->eol_type == CODING_EOL_CR)
6335 {
6336 unsigned char *p;
6337 for (p = buf.data; p < buf.data + produced; p++)
6338 if (*p == '\n') *p = '\r';
6339 }
6340 else if (coding->eol_type == CODING_EOL_CRLF)
6341 {
6342 int num_eol = 0;
6343 unsigned char *p0, *p1;
6344 for (p0 = buf.data, p1 = p0 + produced; p0 < p1; p0++)
6345 if (*p0 == '\n') num_eol++;
6346 if (produced + num_eol >= buf.size)
6347 extend_conversion_buffer (&buf);
6348 for (p0 = buf.data + produced, p1 = p0 + num_eol; p0 > buf.data;)
6349 {
6350 *--p1 = *--p0;
6351 if (*p0 == '\n') *--p1 = '\r';
6352 }
6353 produced += num_eol;
6354 produced_char += num_eol;
6355 }
6356 /* Suppress eol-format conversion in the further conversion. */
6357 coding->eol_type = CODING_EOL_LF;
6358
6359 /* Set the coding system symbol to that for Unix-like EOL. */
6360 eol_type = Fget (saved_coding_symbol, Qeol_type);
6361 if (VECTORP (eol_type)
6362 && XVECTOR (eol_type)->size == 3
6363 && SYMBOLP (XVECTOR (eol_type)->contents[CODING_EOL_LF]))
6364 coding->symbol = XVECTOR (eol_type)->contents[CODING_EOL_LF];
6365 else
6366 coding->symbol = saved_coding_symbol;
6367
6368
6369 }
6370 }
6371
6372 coding->consumed = consumed;
6373 coding->consumed_char = consumed_char;
6374 coding->produced = produced;
6375 coding->produced_char = produced_char;
6376
6377 if (coding->dst_multibyte)
6378 newstr = make_uninit_multibyte_string (produced_char + shrinked_bytes,
6379 produced + shrinked_bytes);
6380 else
6381 newstr = make_uninit_string (produced + shrinked_bytes);
6382 if (from > 0)
6383 STRING_COPYIN (newstr, 0, SDATA (str), from);
6384 STRING_COPYIN (newstr, from, buf.data, produced);
6385 if (shrinked_bytes > from)
6386 STRING_COPYIN (newstr, from + produced,
6387 SDATA (str) + to_byte,
6388 shrinked_bytes - from);
6389 free_conversion_buffer (&buf);
6390
6391 coding->consumed += shrinked_bytes;
6392 coding->consumed_char += shrinked_bytes;
6393 coding->produced += shrinked_bytes;
6394 coding->produced_char += shrinked_bytes;
6395
6396 if (coding->cmp_data && coding->cmp_data->used)
6397 coding_restore_composition (coding, newstr);
6398 coding_free_composition_data (coding);
6399
6400 if (SYMBOLP (coding->post_read_conversion)
6401 && !NILP (Ffboundp (coding->post_read_conversion)))
6402 newstr = run_pre_post_conversion_on_str (newstr, coding, 0);
6403
6404 return newstr;
6405 }
6406
6407 Lisp_Object
6408 encode_coding_string (str, coding, nocopy)
6409 Lisp_Object str;
6410 struct coding_system *coding;
6411 int nocopy;
6412 {
6413 int len;
6414 struct conversion_buffer buf;
6415 int from, to, to_byte;
6416 int result;
6417 int shrinked_bytes = 0;
6418 Lisp_Object newstr;
6419 int consumed, consumed_char, produced, produced_char;
6420
6421 if (SYMBOLP (coding->pre_write_conversion)
6422 && !NILP (Ffboundp (coding->pre_write_conversion)))
6423 {
6424 str = run_pre_post_conversion_on_str (str, coding, 1);
6425 /* As STR is just newly generated, we don't have to copy it
6426 anymore. */
6427 nocopy = 1;
6428 }
6429
6430 from = 0;
6431 to = SCHARS (str);
6432 to_byte = SBYTES (str);
6433
6434 /* Encoding routines determine the multibyteness of the source text
6435 by coding->src_multibyte. */
6436 coding->src_multibyte = SCHARS (str) < SBYTES (str);
6437 coding->dst_multibyte = 0;
6438 if (! CODING_REQUIRE_ENCODING (coding))
6439 goto no_need_of_encoding;
6440
6441 if (coding->composing != COMPOSITION_DISABLED)
6442 coding_save_composition (coding, from, to, str);
6443
6444 /* Try to skip the heading and tailing ASCIIs. We can't skip them
6445 if we must run CCL program or there are compositions to
6446 encode. */
6447 coding->heading_ascii = 0;
6448 if (coding->type != coding_type_ccl
6449 && (! coding->cmp_data || coding->cmp_data->used == 0))
6450 {
6451 SHRINK_CONVERSION_REGION (&from, &to_byte, coding, SDATA (str),
6452 1);
6453 if (from == to_byte)
6454 {
6455 coding_free_composition_data (coding);
6456 goto no_need_of_encoding;
6457 }
6458 shrinked_bytes = from + (SBYTES (str) - to_byte);
6459 }
6460
6461 len = encoding_buffer_size (coding, to_byte - from);
6462 allocate_conversion_buffer (buf, len);
6463
6464 consumed = consumed_char = produced = produced_char = 0;
6465 while (1)
6466 {
6467 result = encode_coding (coding, SDATA (str) + from + consumed,
6468 buf.data + produced, to_byte - from - consumed,
6469 buf.size - produced);
6470 consumed += coding->consumed;
6471 consumed_char += coding->consumed_char;
6472 produced += coding->produced;
6473 produced_char += coding->produced_char;
6474 if (result == CODING_FINISH_NORMAL
6475 || result == CODING_FINISH_INTERRUPT
6476 || (result == CODING_FINISH_INSUFFICIENT_SRC
6477 && coding->consumed == 0))
6478 break;
6479 /* Now result should be CODING_FINISH_INSUFFICIENT_DST. */
6480 extend_conversion_buffer (&buf);
6481 }
6482
6483 coding->consumed = consumed;
6484 coding->consumed_char = consumed_char;
6485 coding->produced = produced;
6486 coding->produced_char = produced_char;
6487
6488 newstr = make_uninit_string (produced + shrinked_bytes);
6489 if (from > 0)
6490 STRING_COPYIN (newstr, 0, SDATA (str), from);
6491 STRING_COPYIN (newstr, from, buf.data, produced);
6492 if (shrinked_bytes > from)
6493 STRING_COPYIN (newstr, from + produced,
6494 SDATA (str) + to_byte,
6495 shrinked_bytes - from);
6496
6497 free_conversion_buffer (&buf);
6498 coding_free_composition_data (coding);
6499
6500 return newstr;
6501
6502 no_need_of_encoding:
6503 coding->consumed = SBYTES (str);
6504 coding->consumed_char = SCHARS (str);
6505 if (STRING_MULTIBYTE (str))
6506 {
6507 if (nocopy)
6508 /* We are sure that STR doesn't contain a multibyte
6509 character. */
6510 STRING_SET_UNIBYTE (str);
6511 else
6512 {
6513 str = Fstring_as_unibyte (str);
6514 nocopy = 1;
6515 }
6516 }
6517 coding->produced = SBYTES (str);
6518 coding->produced_char = SCHARS (str);
6519 return (nocopy ? str : Fcopy_sequence (str));
6520 }
6521
6522 \f
6523 #ifdef emacs
6524 /*** 8. Emacs Lisp library functions ***/
6525
6526 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
6527 doc: /* Return t if OBJECT is nil or a coding-system.
6528 See the documentation of `make-coding-system' for information
6529 about coding-system objects. */)
6530 (obj)
6531 Lisp_Object obj;
6532 {
6533 if (NILP (obj))
6534 return Qt;
6535 if (!SYMBOLP (obj))
6536 return Qnil;
6537 if (! NILP (Fget (obj, Qcoding_system_define_form)))
6538 return Qt;
6539 /* Get coding-spec vector for OBJ. */
6540 obj = Fget (obj, Qcoding_system);
6541 return ((VECTORP (obj) && XVECTOR (obj)->size == 5)
6542 ? Qt : Qnil);
6543 }
6544
6545 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
6546 Sread_non_nil_coding_system, 1, 1, 0,
6547 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT. */)
6548 (prompt)
6549 Lisp_Object prompt;
6550 {
6551 Lisp_Object val;
6552 do
6553 {
6554 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6555 Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
6556 }
6557 while (SCHARS (val) == 0);
6558 return (Fintern (val, Qnil));
6559 }
6560
6561 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
6562 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
6563 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.
6564 Ignores case when completing coding systems (all Emacs coding systems
6565 are lower-case). */)
6566 (prompt, default_coding_system)
6567 Lisp_Object prompt, default_coding_system;
6568 {
6569 Lisp_Object val;
6570 int count = SPECPDL_INDEX ();
6571
6572 if (SYMBOLP (default_coding_system))
6573 default_coding_system = SYMBOL_NAME (default_coding_system);
6574 specbind (Qcompletion_ignore_case, Qt);
6575 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6576 Qt, Qnil, Qcoding_system_history,
6577 default_coding_system, Qnil);
6578 unbind_to (count, Qnil);
6579 return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
6580 }
6581
6582 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
6583 1, 1, 0,
6584 doc: /* Check validity of CODING-SYSTEM.
6585 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
6586 It is valid if it is nil or a symbol with a non-nil `coding-system' property.
6587 The value of this property should be a vector of length 5. */)
6588 (coding_system)
6589 Lisp_Object coding_system;
6590 {
6591 Lisp_Object define_form;
6592
6593 define_form = Fget (coding_system, Qcoding_system_define_form);
6594 if (! NILP (define_form))
6595 {
6596 Fput (coding_system, Qcoding_system_define_form, Qnil);
6597 safe_eval (define_form);
6598 }
6599 if (!NILP (Fcoding_system_p (coding_system)))
6600 return coding_system;
6601 xsignal1 (Qcoding_system_error, coding_system);
6602 }
6603 \f
6604 Lisp_Object
6605 detect_coding_system (src, src_bytes, highest, multibytep)
6606 const unsigned char *src;
6607 int src_bytes, highest;
6608 int multibytep;
6609 {
6610 int coding_mask, eol_type;
6611 Lisp_Object val, tmp;
6612 int dummy;
6613
6614 coding_mask = detect_coding_mask (src, src_bytes, NULL, &dummy, multibytep);
6615 eol_type = detect_eol_type (src, src_bytes, &dummy);
6616 if (eol_type == CODING_EOL_INCONSISTENT)
6617 eol_type = CODING_EOL_UNDECIDED;
6618
6619 if (!coding_mask)
6620 {
6621 val = Qundecided;
6622 if (eol_type != CODING_EOL_UNDECIDED)
6623 {
6624 Lisp_Object val2;
6625 val2 = Fget (Qundecided, Qeol_type);
6626 if (VECTORP (val2))
6627 val = XVECTOR (val2)->contents[eol_type];
6628 }
6629 return (highest ? val : Fcons (val, Qnil));
6630 }
6631
6632 /* At first, gather possible coding systems in VAL. */
6633 val = Qnil;
6634 for (tmp = Vcoding_category_list; CONSP (tmp); tmp = XCDR (tmp))
6635 {
6636 Lisp_Object category_val, category_index;
6637
6638 category_index = Fget (XCAR (tmp), Qcoding_category_index);
6639 category_val = Fsymbol_value (XCAR (tmp));
6640 if (!NILP (category_val)
6641 && NATNUMP (category_index)
6642 && (coding_mask & (1 << XFASTINT (category_index))))
6643 {
6644 val = Fcons (category_val, val);
6645 if (highest)
6646 break;
6647 }
6648 }
6649 if (!highest)
6650 val = Fnreverse (val);
6651
6652 /* Then, replace the elements with subsidiary coding systems. */
6653 for (tmp = val; CONSP (tmp); tmp = XCDR (tmp))
6654 {
6655 if (eol_type != CODING_EOL_UNDECIDED
6656 && eol_type != CODING_EOL_INCONSISTENT)
6657 {
6658 Lisp_Object eol;
6659 eol = Fget (XCAR (tmp), Qeol_type);
6660 if (VECTORP (eol))
6661 XSETCAR (tmp, XVECTOR (eol)->contents[eol_type]);
6662 }
6663 }
6664 return (highest ? XCAR (val) : val);
6665 }
6666
6667 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
6668 2, 3, 0,
6669 doc: /* Detect how the byte sequence in the region is encoded.
6670 Return a list of possible coding systems used on decoding a byte
6671 sequence containing the bytes in the region between START and END when
6672 the coding system `undecided' is specified. The list is ordered by
6673 priority decided in the current language environment.
6674
6675 If only ASCII characters are found (except for such ISO-2022 control
6676 characters ISO-2022 as ESC), it returns a list of single element
6677 `undecided' or its subsidiary coding system according to a detected
6678 end-of-line format.
6679
6680 If optional argument HIGHEST is non-nil, return the coding system of
6681 highest priority. */)
6682 (start, end, highest)
6683 Lisp_Object start, end, highest;
6684 {
6685 int from, to;
6686 int from_byte, to_byte;
6687 int include_anchor_byte = 0;
6688
6689 CHECK_NUMBER_COERCE_MARKER (start);
6690 CHECK_NUMBER_COERCE_MARKER (end);
6691
6692 validate_region (&start, &end);
6693 from = XINT (start), to = XINT (end);
6694 from_byte = CHAR_TO_BYTE (from);
6695 to_byte = CHAR_TO_BYTE (to);
6696
6697 if (from < GPT && to >= GPT)
6698 move_gap_both (to, to_byte);
6699 /* If we an anchor byte `\0' follows the region, we include it in
6700 the detecting source. Then code detectors can handle the tailing
6701 byte sequence more accurately.
6702
6703 Fix me: This is not a perfect solution. It is better that we
6704 add one more argument, say LAST_BLOCK, to all detect_coding_XXX.
6705 */
6706 if (to == Z || (to == GPT && GAP_SIZE > 0))
6707 include_anchor_byte = 1;
6708 return detect_coding_system (BYTE_POS_ADDR (from_byte),
6709 to_byte - from_byte + include_anchor_byte,
6710 !NILP (highest),
6711 !NILP (current_buffer
6712 ->enable_multibyte_characters));
6713 }
6714
6715 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
6716 1, 2, 0,
6717 doc: /* Detect how the byte sequence in STRING is encoded.
6718 Return a list of possible coding systems used on decoding a byte
6719 sequence containing the bytes in STRING when the coding system
6720 `undecided' is specified. The list is ordered by priority decided in
6721 the current language environment.
6722
6723 If only ASCII characters are found (except for such ISO-2022 control
6724 characters ISO-2022 as ESC), it returns a list of single element
6725 `undecided' or its subsidiary coding system according to a detected
6726 end-of-line format.
6727
6728 If optional argument HIGHEST is non-nil, return the coding system of
6729 highest priority. */)
6730 (string, highest)
6731 Lisp_Object string, highest;
6732 {
6733 CHECK_STRING (string);
6734
6735 return detect_coding_system (SDATA (string),
6736 /* "+ 1" is to include the anchor byte
6737 `\0'. With this, code detectors can
6738 handle the tailing bytes more
6739 accurately. */
6740 SBYTES (string) + 1,
6741 !NILP (highest),
6742 STRING_MULTIBYTE (string));
6743 }
6744
6745 /* Subroutine for Ffind_coding_systems_region_internal.
6746
6747 Return a list of coding systems that safely encode the multibyte
6748 text between P and PEND. SAFE_CODINGS, if non-nil, is an alist of
6749 possible coding systems. If it is nil, it means that we have not
6750 yet found any coding systems.
6751
6752 WORK_TABLE a char-table of which element is set to t once the
6753 element is looked up.
6754
6755 If a non-ASCII single byte char is found, set
6756 *single_byte_char_found to 1. */
6757
6758 static Lisp_Object
6759 find_safe_codings (p, pend, safe_codings, work_table, single_byte_char_found)
6760 unsigned char *p, *pend;
6761 Lisp_Object safe_codings, work_table;
6762 int *single_byte_char_found;
6763 {
6764 int c, len;
6765 Lisp_Object val, ch;
6766 Lisp_Object prev, tail;
6767
6768 if (NILP (safe_codings))
6769 goto done_safe_codings;
6770 while (p < pend)
6771 {
6772 c = STRING_CHAR_AND_LENGTH (p, pend - p, len);
6773 p += len;
6774 if (ASCII_BYTE_P (c))
6775 /* We can ignore ASCII characters here. */
6776 continue;
6777 if (SINGLE_BYTE_CHAR_P (c))
6778 *single_byte_char_found = 1;
6779 /* Check the safe coding systems for C. */
6780 ch = make_number (c);
6781 val = Faref (work_table, ch);
6782 if (EQ (val, Qt))
6783 /* This element was already checked. Ignore it. */
6784 continue;
6785 /* Remember that we checked this element. */
6786 Faset (work_table, ch, Qt);
6787
6788 for (prev = tail = safe_codings; CONSP (tail); tail = XCDR (tail))
6789 {
6790 Lisp_Object elt, translation_table, hash_table, accept_latin_extra;
6791 int encodable;
6792
6793 elt = XCAR (tail);
6794 if (CONSP (XCDR (elt)))
6795 {
6796 /* This entry has this format now:
6797 ( CODING SAFE-CHARS TRANSLATION-TABLE HASH-TABLE
6798 ACCEPT-LATIN-EXTRA ) */
6799 val = XCDR (elt);
6800 encodable = ! NILP (Faref (XCAR (val), ch));
6801 if (! encodable)
6802 {
6803 val = XCDR (val);
6804 translation_table = XCAR (val);
6805 hash_table = XCAR (XCDR (val));
6806 accept_latin_extra = XCAR (XCDR (XCDR (val)));
6807 }
6808 }
6809 else
6810 {
6811 /* This entry has this format now: ( CODING . SAFE-CHARS) */
6812 encodable = ! NILP (Faref (XCDR (elt), ch));
6813 if (! encodable)
6814 {
6815 /* Transform the format to:
6816 ( CODING SAFE-CHARS TRANSLATION-TABLE HASH-TABLE
6817 ACCEPT-LATIN-EXTRA ) */
6818 val = Fget (XCAR (elt), Qcoding_system);
6819 translation_table
6820 = Fplist_get (AREF (val, 3),
6821 Qtranslation_table_for_encode);
6822 if (SYMBOLP (translation_table))
6823 translation_table = Fget (translation_table,
6824 Qtranslation_table);
6825 hash_table
6826 = (CHAR_TABLE_P (translation_table)
6827 ? XCHAR_TABLE (translation_table)->extras[1]
6828 : Qnil);
6829 accept_latin_extra
6830 = ((EQ (AREF (val, 0), make_number (2))
6831 && VECTORP (AREF (val, 4)))
6832 ? AREF (AREF (val, 4), 16)
6833 : Qnil);
6834 XSETCAR (tail, list5 (XCAR (elt), XCDR (elt),
6835 translation_table, hash_table,
6836 accept_latin_extra));
6837 }
6838 }
6839
6840 if (! encodable
6841 && ((CHAR_TABLE_P (translation_table)
6842 && ! NILP (Faref (translation_table, ch)))
6843 || (HASH_TABLE_P (hash_table)
6844 && ! NILP (Fgethash (ch, hash_table, Qnil)))
6845 || (SINGLE_BYTE_CHAR_P (c)
6846 && ! NILP (accept_latin_extra)
6847 && VECTORP (Vlatin_extra_code_table)
6848 && ! NILP (AREF (Vlatin_extra_code_table, c)))))
6849 encodable = 1;
6850 if (encodable)
6851 prev = tail;
6852 else
6853 {
6854 /* Exclude this coding system from SAFE_CODINGS. */
6855 if (EQ (tail, safe_codings))
6856 {
6857 safe_codings = XCDR (safe_codings);
6858 if (NILP (safe_codings))
6859 goto done_safe_codings;
6860 }
6861 else
6862 XSETCDR (prev, XCDR (tail));
6863 }
6864 }
6865 }
6866
6867 done_safe_codings:
6868 /* If the above loop was terminated before P reaches PEND, it means
6869 SAFE_CODINGS was set to nil. If we have not yet found an
6870 non-ASCII single-byte char, check it now. */
6871 if (! *single_byte_char_found)
6872 while (p < pend)
6873 {
6874 c = STRING_CHAR_AND_LENGTH (p, pend - p, len);
6875 p += len;
6876 if (! ASCII_BYTE_P (c)
6877 && SINGLE_BYTE_CHAR_P (c))
6878 {
6879 *single_byte_char_found = 1;
6880 break;
6881 }
6882 }
6883 return safe_codings;
6884 }
6885
6886 DEFUN ("find-coding-systems-region-internal",
6887 Ffind_coding_systems_region_internal,
6888 Sfind_coding_systems_region_internal, 2, 2, 0,
6889 doc: /* Internal use only. */)
6890 (start, end)
6891 Lisp_Object start, end;
6892 {
6893 Lisp_Object work_table, safe_codings;
6894 int non_ascii_p = 0;
6895 int single_byte_char_found = 0;
6896 const unsigned char *p1, *p1end, *p2, *p2end, *p;
6897
6898 if (STRINGP (start))
6899 {
6900 if (!STRING_MULTIBYTE (start))
6901 return Qt;
6902 p1 = SDATA (start), p1end = p1 + SBYTES (start);
6903 p2 = p2end = p1end;
6904 if (SCHARS (start) != SBYTES (start))
6905 non_ascii_p = 1;
6906 }
6907 else
6908 {
6909 int from, to, stop;
6910
6911 CHECK_NUMBER_COERCE_MARKER (start);
6912 CHECK_NUMBER_COERCE_MARKER (end);
6913 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
6914 args_out_of_range (start, end);
6915 if (NILP (current_buffer->enable_multibyte_characters))
6916 return Qt;
6917 from = CHAR_TO_BYTE (XINT (start));
6918 to = CHAR_TO_BYTE (XINT (end));
6919 stop = from < GPT_BYTE && GPT_BYTE < to ? GPT_BYTE : to;
6920 p1 = BYTE_POS_ADDR (from), p1end = p1 + (stop - from);
6921 if (stop == to)
6922 p2 = p2end = p1end;
6923 else
6924 p2 = BYTE_POS_ADDR (stop), p2end = p2 + (to - stop);
6925 if (XINT (end) - XINT (start) != to - from)
6926 non_ascii_p = 1;
6927 }
6928
6929 if (!non_ascii_p)
6930 {
6931 /* We are sure that the text contains no multibyte character.
6932 Check if it contains eight-bit-graphic. */
6933 p = p1;
6934 for (p = p1; p < p1end && ASCII_BYTE_P (*p); p++);
6935 if (p == p1end)
6936 {
6937 for (p = p2; p < p2end && ASCII_BYTE_P (*p); p++);
6938 if (p == p2end)
6939 return Qt;
6940 }
6941 }
6942
6943 /* The text contains non-ASCII characters. */
6944
6945 work_table = Fmake_char_table (Qchar_coding_system, Qnil);
6946 safe_codings = Fcopy_sequence (XCDR (Vcoding_system_safe_chars));
6947
6948 safe_codings = find_safe_codings (p1, p1end, safe_codings, work_table,
6949 &single_byte_char_found);
6950 if (p2 < p2end)
6951 safe_codings = find_safe_codings (p2, p2end, safe_codings, work_table,
6952 &single_byte_char_found);
6953 if (EQ (safe_codings, XCDR (Vcoding_system_safe_chars)))
6954 safe_codings = Qt;
6955 else
6956 {
6957 /* Turn safe_codings to a list of coding systems... */
6958 Lisp_Object val;
6959
6960 if (single_byte_char_found)
6961 /* ... and append these for eight-bit chars. */
6962 val = Fcons (Qraw_text,
6963 Fcons (Qemacs_mule, Fcons (Qno_conversion, Qnil)));
6964 else
6965 /* ... and append generic coding systems. */
6966 val = Fcopy_sequence (XCAR (Vcoding_system_safe_chars));
6967
6968 for (; CONSP (safe_codings); safe_codings = XCDR (safe_codings))
6969 val = Fcons (XCAR (XCAR (safe_codings)), val);
6970 safe_codings = val;
6971 }
6972
6973 return safe_codings;
6974 }
6975
6976
6977 /* Search from position POS for such characters that are unencodable
6978 accoding to SAFE_CHARS, and return a list of their positions. P
6979 points where in the memory the character at POS exists. Limit the
6980 search at PEND or when Nth unencodable characters are found.
6981
6982 If SAFE_CHARS is a char table, an element for an unencodable
6983 character is nil.
6984
6985 If SAFE_CHARS is nil, all non-ASCII characters are unencodable.
6986
6987 Otherwise, SAFE_CHARS is t, and only eight-bit-contrl and
6988 eight-bit-graphic characters are unencodable. */
6989
6990 static Lisp_Object
6991 unencodable_char_position (safe_chars, pos, p, pend, n)
6992 Lisp_Object safe_chars;
6993 int pos;
6994 unsigned char *p, *pend;
6995 int n;
6996 {
6997 Lisp_Object pos_list;
6998
6999 pos_list = Qnil;
7000 while (p < pend)
7001 {
7002 int len;
7003 int c = STRING_CHAR_AND_LENGTH (p, MAX_MULTIBYTE_LENGTH, len);
7004
7005 if (c >= 128
7006 && (CHAR_TABLE_P (safe_chars)
7007 ? NILP (CHAR_TABLE_REF (safe_chars, c))
7008 : (NILP (safe_chars) || c < 256)))
7009 {
7010 pos_list = Fcons (make_number (pos), pos_list);
7011 if (--n <= 0)
7012 break;
7013 }
7014 pos++;
7015 p += len;
7016 }
7017 return Fnreverse (pos_list);
7018 }
7019
7020
7021 DEFUN ("unencodable-char-position", Funencodable_char_position,
7022 Sunencodable_char_position, 3, 5, 0,
7023 doc: /*
7024 Return position of first un-encodable character in a region.
7025 START and END specfiy the region and CODING-SYSTEM specifies the
7026 encoding to check. Return nil if CODING-SYSTEM does encode the region.
7027
7028 If optional 4th argument COUNT is non-nil, it specifies at most how
7029 many un-encodable characters to search. In this case, the value is a
7030 list of positions.
7031
7032 If optional 5th argument STRING is non-nil, it is a string to search
7033 for un-encodable characters. In that case, START and END are indexes
7034 to the string. */)
7035 (start, end, coding_system, count, string)
7036 Lisp_Object start, end, coding_system, count, string;
7037 {
7038 int n;
7039 Lisp_Object safe_chars;
7040 struct coding_system coding;
7041 Lisp_Object positions;
7042 int from, to;
7043 unsigned char *p, *pend;
7044
7045 if (NILP (string))
7046 {
7047 validate_region (&start, &end);
7048 from = XINT (start);
7049 to = XINT (end);
7050 if (NILP (current_buffer->enable_multibyte_characters))
7051 return Qnil;
7052 p = CHAR_POS_ADDR (from);
7053 if (to == GPT)
7054 pend = GPT_ADDR;
7055 else
7056 pend = CHAR_POS_ADDR (to);
7057 }
7058 else
7059 {
7060 CHECK_STRING (string);
7061 CHECK_NATNUM (start);
7062 CHECK_NATNUM (end);
7063 from = XINT (start);
7064 to = XINT (end);
7065 if (from > to
7066 || to > SCHARS (string))
7067 args_out_of_range_3 (string, start, end);
7068 if (! STRING_MULTIBYTE (string))
7069 return Qnil;
7070 p = SDATA (string) + string_char_to_byte (string, from);
7071 pend = SDATA (string) + string_char_to_byte (string, to);
7072 }
7073
7074 setup_coding_system (Fcheck_coding_system (coding_system), &coding);
7075
7076 if (NILP (count))
7077 n = 1;
7078 else
7079 {
7080 CHECK_NATNUM (count);
7081 n = XINT (count);
7082 }
7083
7084 if (coding.type == coding_type_no_conversion
7085 || coding.type == coding_type_raw_text)
7086 return Qnil;
7087
7088 if (coding.type == coding_type_undecided)
7089 safe_chars = Qnil;
7090 else
7091 safe_chars = coding_safe_chars (coding_system);
7092
7093 if (STRINGP (string)
7094 || from >= GPT || to <= GPT)
7095 positions = unencodable_char_position (safe_chars, from, p, pend, n);
7096 else
7097 {
7098 Lisp_Object args[2];
7099
7100 args[0] = unencodable_char_position (safe_chars, from, p, GPT_ADDR, n);
7101 n -= XINT (Flength (args[0]));
7102 if (n <= 0)
7103 positions = args[0];
7104 else
7105 {
7106 args[1] = unencodable_char_position (safe_chars, GPT, GAP_END_ADDR,
7107 pend, n);
7108 positions = Fappend (2, args);
7109 }
7110 }
7111
7112 return (NILP (count) ? Fcar (positions) : positions);
7113 }
7114
7115
7116 Lisp_Object
7117 code_convert_region1 (start, end, coding_system, encodep)
7118 Lisp_Object start, end, coding_system;
7119 int encodep;
7120 {
7121 struct coding_system coding;
7122 int from, to;
7123
7124 CHECK_NUMBER_COERCE_MARKER (start);
7125 CHECK_NUMBER_COERCE_MARKER (end);
7126 CHECK_SYMBOL (coding_system);
7127
7128 validate_region (&start, &end);
7129 from = XFASTINT (start);
7130 to = XFASTINT (end);
7131
7132 if (NILP (coding_system))
7133 return make_number (to - from);
7134
7135 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
7136 error ("Invalid coding system: %s", SDATA (SYMBOL_NAME (coding_system)));
7137
7138 coding.mode |= CODING_MODE_LAST_BLOCK;
7139 coding.src_multibyte = coding.dst_multibyte
7140 = !NILP (current_buffer->enable_multibyte_characters);
7141 code_convert_region (from, CHAR_TO_BYTE (from), to, CHAR_TO_BYTE (to),
7142 &coding, encodep, 1);
7143 Vlast_coding_system_used = coding.symbol;
7144 return make_number (coding.produced_char);
7145 }
7146
7147 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
7148 3, 3, "r\nzCoding system: ",
7149 doc: /* Decode the current region from the specified coding system.
7150 When called from a program, takes three arguments:
7151 START, END, and CODING-SYSTEM. START and END are buffer positions.
7152 This function sets `last-coding-system-used' to the precise coding system
7153 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7154 not fully specified.)
7155 It returns the length of the decoded text. */)
7156 (start, end, coding_system)
7157 Lisp_Object start, end, coding_system;
7158 {
7159 return code_convert_region1 (start, end, coding_system, 0);
7160 }
7161
7162 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
7163 3, 3, "r\nzCoding system: ",
7164 doc: /* Encode the current region into the specified coding system.
7165 When called from a program, takes three arguments:
7166 START, END, and CODING-SYSTEM. START and END are buffer positions.
7167 This function sets `last-coding-system-used' to the precise coding system
7168 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7169 not fully specified.)
7170 It returns the length of the encoded text. */)
7171 (start, end, coding_system)
7172 Lisp_Object start, end, coding_system;
7173 {
7174 return code_convert_region1 (start, end, coding_system, 1);
7175 }
7176
7177 Lisp_Object
7178 code_convert_string1 (string, coding_system, nocopy, encodep)
7179 Lisp_Object string, coding_system, nocopy;
7180 int encodep;
7181 {
7182 struct coding_system coding;
7183
7184 CHECK_STRING (string);
7185 CHECK_SYMBOL (coding_system);
7186
7187 if (NILP (coding_system))
7188 return (NILP (nocopy) ? Fcopy_sequence (string) : string);
7189
7190 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
7191 error ("Invalid coding system: %s", SDATA (SYMBOL_NAME (coding_system)));
7192
7193 coding.mode |= CODING_MODE_LAST_BLOCK;
7194 string = (encodep
7195 ? encode_coding_string (string, &coding, !NILP (nocopy))
7196 : decode_coding_string (string, &coding, !NILP (nocopy)));
7197 Vlast_coding_system_used = coding.symbol;
7198
7199 return string;
7200 }
7201
7202 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
7203 2, 3, 0,
7204 doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
7205 Optional arg NOCOPY non-nil means it is OK to return STRING itself
7206 if the decoding operation is trivial.
7207 This function sets `last-coding-system-used' to the precise coding system
7208 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7209 not fully specified.) */)
7210 (string, coding_system, nocopy)
7211 Lisp_Object string, coding_system, nocopy;
7212 {
7213 return code_convert_string1 (string, coding_system, nocopy, 0);
7214 }
7215
7216 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
7217 2, 3, 0,
7218 doc: /* Encode STRING to CODING-SYSTEM, and return the result.
7219 Optional arg NOCOPY non-nil means it is OK to return STRING itself
7220 if the encoding operation is trivial.
7221 This function sets `last-coding-system-used' to the precise coding system
7222 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7223 not fully specified.) */)
7224 (string, coding_system, nocopy)
7225 Lisp_Object string, coding_system, nocopy;
7226 {
7227 return code_convert_string1 (string, coding_system, nocopy, 1);
7228 }
7229
7230 /* Encode or decode STRING according to CODING_SYSTEM.
7231 Do not set Vlast_coding_system_used.
7232
7233 This function is called only from macros DECODE_FILE and
7234 ENCODE_FILE, thus we ignore character composition. */
7235
7236 Lisp_Object
7237 code_convert_string_norecord (string, coding_system, encodep)
7238 Lisp_Object string, coding_system;
7239 int encodep;
7240 {
7241 struct coding_system coding;
7242
7243 CHECK_STRING (string);
7244 CHECK_SYMBOL (coding_system);
7245
7246 if (NILP (coding_system))
7247 return string;
7248
7249 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
7250 error ("Invalid coding system: %s", SDATA (SYMBOL_NAME (coding_system)));
7251
7252 coding.composing = COMPOSITION_DISABLED;
7253 coding.mode |= CODING_MODE_LAST_BLOCK;
7254 return (encodep
7255 ? encode_coding_string (string, &coding, 1)
7256 : decode_coding_string (string, &coding, 1));
7257 }
7258 \f
7259 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
7260 doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
7261 Return the corresponding character. */)
7262 (code)
7263 Lisp_Object code;
7264 {
7265 unsigned char c1, c2, s1, s2;
7266 Lisp_Object val;
7267
7268 CHECK_NUMBER (code);
7269 s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
7270 if (s1 == 0)
7271 {
7272 if (s2 < 0x80)
7273 XSETFASTINT (val, s2);
7274 else if (s2 >= 0xA0 || s2 <= 0xDF)
7275 XSETFASTINT (val, MAKE_CHAR (charset_katakana_jisx0201, s2, 0));
7276 else
7277 error ("Invalid Shift JIS code: %x", XFASTINT (code));
7278 }
7279 else
7280 {
7281 if ((s1 < 0x80 || (s1 > 0x9F && s1 < 0xE0) || s1 > 0xEF)
7282 || (s2 < 0x40 || s2 == 0x7F || s2 > 0xFC))
7283 error ("Invalid Shift JIS code: %x", XFASTINT (code));
7284 DECODE_SJIS (s1, s2, c1, c2);
7285 XSETFASTINT (val, MAKE_CHAR (charset_jisx0208, c1, c2));
7286 }
7287 return val;
7288 }
7289
7290 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
7291 doc: /* Encode a Japanese character CH to shift_jis encoding.
7292 Return the corresponding code in SJIS. */)
7293 (ch)
7294 Lisp_Object ch;
7295 {
7296 int charset, c1, c2, s1, s2;
7297 Lisp_Object val;
7298
7299 CHECK_NUMBER (ch);
7300 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
7301 if (charset == CHARSET_ASCII)
7302 {
7303 val = ch;
7304 }
7305 else if (charset == charset_jisx0208
7306 && c1 > 0x20 && c1 < 0x7F && c2 > 0x20 && c2 < 0x7F)
7307 {
7308 ENCODE_SJIS (c1, c2, s1, s2);
7309 XSETFASTINT (val, (s1 << 8) | s2);
7310 }
7311 else if (charset == charset_katakana_jisx0201
7312 && c1 > 0x20 && c2 < 0xE0)
7313 {
7314 XSETFASTINT (val, c1 | 0x80);
7315 }
7316 else
7317 error ("Can't encode to shift_jis: %d", XFASTINT (ch));
7318 return val;
7319 }
7320
7321 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
7322 doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
7323 Return the corresponding character. */)
7324 (code)
7325 Lisp_Object code;
7326 {
7327 int charset;
7328 unsigned char b1, b2, c1, c2;
7329 Lisp_Object val;
7330
7331 CHECK_NUMBER (code);
7332 b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF;
7333 if (b1 == 0)
7334 {
7335 if (b2 >= 0x80)
7336 error ("Invalid BIG5 code: %x", XFASTINT (code));
7337 val = code;
7338 }
7339 else
7340 {
7341 if ((b1 < 0xA1 || b1 > 0xFE)
7342 || (b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE))
7343 error ("Invalid BIG5 code: %x", XFASTINT (code));
7344 DECODE_BIG5 (b1, b2, charset, c1, c2);
7345 XSETFASTINT (val, MAKE_CHAR (charset, c1, c2));
7346 }
7347 return val;
7348 }
7349
7350 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
7351 doc: /* Encode the Big5 character CH to BIG5 coding system.
7352 Return the corresponding character code in Big5. */)
7353 (ch)
7354 Lisp_Object ch;
7355 {
7356 int charset, c1, c2, b1, b2;
7357 Lisp_Object val;
7358
7359 CHECK_NUMBER (ch);
7360 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
7361 if (charset == CHARSET_ASCII)
7362 {
7363 val = ch;
7364 }
7365 else if ((charset == charset_big5_1
7366 && (XFASTINT (ch) >= 0x250a1 && XFASTINT (ch) <= 0x271ec))
7367 || (charset == charset_big5_2
7368 && XFASTINT (ch) >= 0x290a1 && XFASTINT (ch) <= 0x2bdb2))
7369 {
7370 ENCODE_BIG5 (charset, c1, c2, b1, b2);
7371 XSETFASTINT (val, (b1 << 8) | b2);
7372 }
7373 else
7374 error ("Can't encode to Big5: %d", XFASTINT (ch));
7375 return val;
7376 }
7377 \f
7378 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
7379 Sset_terminal_coding_system_internal, 1, 1, 0,
7380 doc: /* Internal use only. */)
7381 (coding_system)
7382 Lisp_Object coding_system;
7383 {
7384 CHECK_SYMBOL (coding_system);
7385 setup_coding_system (Fcheck_coding_system (coding_system), &terminal_coding);
7386 /* We had better not send unsafe characters to terminal. */
7387 terminal_coding.mode |= CODING_MODE_INHIBIT_UNENCODABLE_CHAR;
7388 /* Character composition should be disabled. */
7389 terminal_coding.composing = COMPOSITION_DISABLED;
7390 /* Error notification should be suppressed. */
7391 terminal_coding.suppress_error = 1;
7392 terminal_coding.src_multibyte = 1;
7393 terminal_coding.dst_multibyte = 0;
7394 return Qnil;
7395 }
7396
7397 DEFUN ("set-safe-terminal-coding-system-internal", Fset_safe_terminal_coding_system_internal,
7398 Sset_safe_terminal_coding_system_internal, 1, 1, 0,
7399 doc: /* Internal use only. */)
7400 (coding_system)
7401 Lisp_Object coding_system;
7402 {
7403 CHECK_SYMBOL (coding_system);
7404 setup_coding_system (Fcheck_coding_system (coding_system),
7405 &safe_terminal_coding);
7406 /* Character composition should be disabled. */
7407 safe_terminal_coding.composing = COMPOSITION_DISABLED;
7408 /* Error notification should be suppressed. */
7409 safe_terminal_coding.suppress_error = 1;
7410 safe_terminal_coding.src_multibyte = 1;
7411 safe_terminal_coding.dst_multibyte = 0;
7412 return Qnil;
7413 }
7414
7415 DEFUN ("terminal-coding-system", Fterminal_coding_system,
7416 Sterminal_coding_system, 0, 0, 0,
7417 doc: /* Return coding system specified for terminal output. */)
7418 ()
7419 {
7420 return terminal_coding.symbol;
7421 }
7422
7423 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
7424 Sset_keyboard_coding_system_internal, 1, 1, 0,
7425 doc: /* Internal use only. */)
7426 (coding_system)
7427 Lisp_Object coding_system;
7428 {
7429 CHECK_SYMBOL (coding_system);
7430 setup_coding_system (Fcheck_coding_system (coding_system), &keyboard_coding);
7431 /* Character composition should be disabled. */
7432 keyboard_coding.composing = COMPOSITION_DISABLED;
7433 return Qnil;
7434 }
7435
7436 DEFUN ("keyboard-coding-system", Fkeyboard_coding_system,
7437 Skeyboard_coding_system, 0, 0, 0,
7438 doc: /* Return coding system specified for decoding keyboard input. */)
7439 ()
7440 {
7441 return keyboard_coding.symbol;
7442 }
7443
7444 \f
7445 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
7446 Sfind_operation_coding_system, 1, MANY, 0,
7447 doc: /* Choose a coding system for an operation based on the target name.
7448 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
7449 DECODING-SYSTEM is the coding system to use for decoding
7450 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
7451 for encoding (in case OPERATION does encoding).
7452
7453 The first argument OPERATION specifies an I/O primitive:
7454 For file I/O, `insert-file-contents' or `write-region'.
7455 For process I/O, `call-process', `call-process-region', or `start-process'.
7456 For network I/O, `open-network-stream'.
7457
7458 The remaining arguments should be the same arguments that were passed
7459 to the primitive. Depending on which primitive, one of those arguments
7460 is selected as the TARGET. For example, if OPERATION does file I/O,
7461 whichever argument specifies the file name is TARGET.
7462
7463 TARGET has a meaning which depends on OPERATION:
7464 For file I/O, TARGET is a file name (except for the special case below).
7465 For process I/O, TARGET is a process name.
7466 For network I/O, TARGET is a service name or a port number
7467
7468 This function looks up what specified for TARGET in,
7469 `file-coding-system-alist', `process-coding-system-alist',
7470 or `network-coding-system-alist' depending on OPERATION.
7471 They may specify a coding system, a cons of coding systems,
7472 or a function symbol to call.
7473 In the last case, we call the function with one argument,
7474 which is a list of all the arguments given to this function.
7475
7476 If OPERATION is `insert-file-contents', the argument corresponding to
7477 TARGET may be a cons (FILENAME . BUFFER). In that case, FILENAME is a
7478 file name to look up, and BUFFER is a buffer that contains the file's
7479 contents (not yet decoded). If `file-coding-system-alist' specifies a
7480 function to call for FILENAME, that function should examine the
7481 contents of BUFFER instead of reading the file.
7482
7483 usage: (find-operation-coding-system OPERATION ARGUMENTS...) */)
7484 (nargs, args)
7485 int nargs;
7486 Lisp_Object *args;
7487 {
7488 Lisp_Object operation, target_idx, target, val;
7489 register Lisp_Object chain;
7490
7491 if (nargs < 2)
7492 error ("Too few arguments");
7493 operation = args[0];
7494 if (!SYMBOLP (operation)
7495 || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
7496 error ("Invalid first argument");
7497 if (nargs < 1 + XINT (target_idx))
7498 error ("Too few arguments for operation: %s",
7499 SDATA (SYMBOL_NAME (operation)));
7500 /* For write-region, if the 6th argument (i.e. VISIT, the 5th
7501 argument to write-region) is string, it must be treated as a
7502 target file name. */
7503 if (EQ (operation, Qwrite_region)
7504 && nargs > 5
7505 && STRINGP (args[5]))
7506 target_idx = make_number (4);
7507 target = args[XINT (target_idx) + 1];
7508 if (!(STRINGP (target)
7509 || (EQ (operation, Qinsert_file_contents) && CONSP (target)
7510 && STRINGP (XCAR (target)) && BUFFERP (XCDR (target)))
7511 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
7512 error ("Invalid argument %d", XINT (target_idx) + 1);
7513 if (CONSP (target))
7514 target = XCAR (target);
7515
7516 chain = ((EQ (operation, Qinsert_file_contents)
7517 || EQ (operation, Qwrite_region))
7518 ? Vfile_coding_system_alist
7519 : (EQ (operation, Qopen_network_stream)
7520 ? Vnetwork_coding_system_alist
7521 : Vprocess_coding_system_alist));
7522 if (NILP (chain))
7523 return Qnil;
7524
7525 for (; CONSP (chain); chain = XCDR (chain))
7526 {
7527 Lisp_Object elt;
7528 elt = XCAR (chain);
7529
7530 if (CONSP (elt)
7531 && ((STRINGP (target)
7532 && STRINGP (XCAR (elt))
7533 && fast_string_match (XCAR (elt), target) >= 0)
7534 || (INTEGERP (target) && EQ (target, XCAR (elt)))))
7535 {
7536 val = XCDR (elt);
7537 /* Here, if VAL is both a valid coding system and a valid
7538 function symbol, we return VAL as a coding system. */
7539 if (CONSP (val))
7540 return val;
7541 if (! SYMBOLP (val))
7542 return Qnil;
7543 if (! NILP (Fcoding_system_p (val)))
7544 return Fcons (val, val);
7545 if (! NILP (Ffboundp (val)))
7546 {
7547 /* We use call1 rather than safe_call1
7548 so as to get bug reports about functions called here
7549 which don't handle the current interface. */
7550 val = call1 (val, Flist (nargs, args));
7551 if (CONSP (val))
7552 return val;
7553 if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
7554 return Fcons (val, val);
7555 }
7556 return Qnil;
7557 }
7558 }
7559 return Qnil;
7560 }
7561
7562 DEFUN ("update-coding-systems-internal", Fupdate_coding_systems_internal,
7563 Supdate_coding_systems_internal, 0, 0, 0,
7564 doc: /* Update internal database for ISO2022 and CCL based coding systems.
7565 When values of any coding categories are changed, you must
7566 call this function. */)
7567 ()
7568 {
7569 int i;
7570
7571 for (i = CODING_CATEGORY_IDX_EMACS_MULE; i < CODING_CATEGORY_IDX_MAX; i++)
7572 {
7573 Lisp_Object val;
7574
7575 val = SYMBOL_VALUE (XVECTOR (Vcoding_category_table)->contents[i]);
7576 if (!NILP (val))
7577 {
7578 if (! coding_system_table[i])
7579 coding_system_table[i] = ((struct coding_system *)
7580 xmalloc (sizeof (struct coding_system)));
7581 setup_coding_system (val, coding_system_table[i]);
7582 }
7583 else if (coding_system_table[i])
7584 {
7585 xfree (coding_system_table[i]);
7586 coding_system_table[i] = NULL;
7587 }
7588 }
7589
7590 return Qnil;
7591 }
7592
7593 DEFUN ("set-coding-priority-internal", Fset_coding_priority_internal,
7594 Sset_coding_priority_internal, 0, 0, 0,
7595 doc: /* Update internal database for the current value of `coding-category-list'.
7596 This function is internal use only. */)
7597 ()
7598 {
7599 int i = 0, idx;
7600 Lisp_Object val;
7601
7602 val = Vcoding_category_list;
7603
7604 while (CONSP (val) && i < CODING_CATEGORY_IDX_MAX)
7605 {
7606 if (! SYMBOLP (XCAR (val)))
7607 break;
7608 idx = XFASTINT (Fget (XCAR (val), Qcoding_category_index));
7609 if (idx >= CODING_CATEGORY_IDX_MAX)
7610 break;
7611 coding_priorities[i++] = (1 << idx);
7612 val = XCDR (val);
7613 }
7614 /* If coding-category-list is valid and contains all coding
7615 categories, `i' should be CODING_CATEGORY_IDX_MAX now. If not,
7616 the following code saves Emacs from crashing. */
7617 while (i < CODING_CATEGORY_IDX_MAX)
7618 coding_priorities[i++] = CODING_CATEGORY_MASK_RAW_TEXT;
7619
7620 return Qnil;
7621 }
7622
7623 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
7624 Sdefine_coding_system_internal, 1, 1, 0,
7625 doc: /* Register CODING-SYSTEM as a base coding system.
7626 This function is internal use only. */)
7627 (coding_system)
7628 Lisp_Object coding_system;
7629 {
7630 Lisp_Object safe_chars, slot;
7631
7632 if (NILP (Fcheck_coding_system (coding_system)))
7633 xsignal1 (Qcoding_system_error, coding_system);
7634
7635 safe_chars = coding_safe_chars (coding_system);
7636 if (! EQ (safe_chars, Qt) && ! CHAR_TABLE_P (safe_chars))
7637 error ("No valid safe-chars property for %s",
7638 SDATA (SYMBOL_NAME (coding_system)));
7639
7640 if (EQ (safe_chars, Qt))
7641 {
7642 if (NILP (Fmemq (coding_system, XCAR (Vcoding_system_safe_chars))))
7643 XSETCAR (Vcoding_system_safe_chars,
7644 Fcons (coding_system, XCAR (Vcoding_system_safe_chars)));
7645 }
7646 else
7647 {
7648 slot = Fassq (coding_system, XCDR (Vcoding_system_safe_chars));
7649 if (NILP (slot))
7650 XSETCDR (Vcoding_system_safe_chars,
7651 nconc2 (XCDR (Vcoding_system_safe_chars),
7652 Fcons (Fcons (coding_system, safe_chars), Qnil)));
7653 else
7654 XSETCDR (slot, safe_chars);
7655 }
7656 return Qnil;
7657 }
7658
7659 #endif /* emacs */
7660
7661 \f
7662 /*** 9. Post-amble ***/
7663
7664 void
7665 init_coding_once ()
7666 {
7667 int i;
7668
7669 /* Emacs' internal format specific initialize routine. */
7670 for (i = 0; i <= 0x20; i++)
7671 emacs_code_class[i] = EMACS_control_code;
7672 emacs_code_class[0x0A] = EMACS_linefeed_code;
7673 emacs_code_class[0x0D] = EMACS_carriage_return_code;
7674 for (i = 0x21 ; i < 0x7F; i++)
7675 emacs_code_class[i] = EMACS_ascii_code;
7676 emacs_code_class[0x7F] = EMACS_control_code;
7677 for (i = 0x80; i < 0xFF; i++)
7678 emacs_code_class[i] = EMACS_invalid_code;
7679 emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
7680 emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
7681 emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
7682 emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;
7683
7684 /* ISO2022 specific initialize routine. */
7685 for (i = 0; i < 0x20; i++)
7686 iso_code_class[i] = ISO_control_0;
7687 for (i = 0x21; i < 0x7F; i++)
7688 iso_code_class[i] = ISO_graphic_plane_0;
7689 for (i = 0x80; i < 0xA0; i++)
7690 iso_code_class[i] = ISO_control_1;
7691 for (i = 0xA1; i < 0xFF; i++)
7692 iso_code_class[i] = ISO_graphic_plane_1;
7693 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
7694 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
7695 iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
7696 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
7697 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
7698 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
7699 iso_code_class[ISO_CODE_ESC] = ISO_escape;
7700 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
7701 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
7702 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
7703
7704 setup_coding_system (Qnil, &keyboard_coding);
7705 setup_coding_system (Qnil, &terminal_coding);
7706 setup_coding_system (Qnil, &safe_terminal_coding);
7707 setup_coding_system (Qnil, &default_buffer_file_coding);
7708
7709 bzero (coding_system_table, sizeof coding_system_table);
7710
7711 bzero (ascii_skip_code, sizeof ascii_skip_code);
7712 for (i = 0; i < 128; i++)
7713 ascii_skip_code[i] = 1;
7714
7715 #if defined (MSDOS) || defined (WINDOWSNT)
7716 system_eol_type = CODING_EOL_CRLF;
7717 #else
7718 system_eol_type = CODING_EOL_LF;
7719 #endif
7720
7721 inhibit_pre_post_conversion = 0;
7722 }
7723
7724 #ifdef emacs
7725
7726 void
7727 syms_of_coding ()
7728 {
7729 staticpro (&Vcode_conversion_workbuf_name);
7730 Vcode_conversion_workbuf_name = build_string (" *code-conversion-work*");
7731
7732 Qtarget_idx = intern ("target-idx");
7733 staticpro (&Qtarget_idx);
7734
7735 Qcoding_system_history = intern ("coding-system-history");
7736 staticpro (&Qcoding_system_history);
7737 Fset (Qcoding_system_history, Qnil);
7738
7739 /* Target FILENAME is the first argument. */
7740 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
7741 /* Target FILENAME is the third argument. */
7742 Fput (Qwrite_region, Qtarget_idx, make_number (2));
7743
7744 Qcall_process = intern ("call-process");
7745 staticpro (&Qcall_process);
7746 /* Target PROGRAM is the first argument. */
7747 Fput (Qcall_process, Qtarget_idx, make_number (0));
7748
7749 Qcall_process_region = intern ("call-process-region");
7750 staticpro (&Qcall_process_region);
7751 /* Target PROGRAM is the third argument. */
7752 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
7753
7754 Qstart_process = intern ("start-process");
7755 staticpro (&Qstart_process);
7756 /* Target PROGRAM is the third argument. */
7757 Fput (Qstart_process, Qtarget_idx, make_number (2));
7758
7759 Qopen_network_stream = intern ("open-network-stream");
7760 staticpro (&Qopen_network_stream);
7761 /* Target SERVICE is the fourth argument. */
7762 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
7763
7764 Qcoding_system = intern ("coding-system");
7765 staticpro (&Qcoding_system);
7766
7767 Qeol_type = intern ("eol-type");
7768 staticpro (&Qeol_type);
7769
7770 Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
7771 staticpro (&Qbuffer_file_coding_system);
7772
7773 Qpost_read_conversion = intern ("post-read-conversion");
7774 staticpro (&Qpost_read_conversion);
7775
7776 Qpre_write_conversion = intern ("pre-write-conversion");
7777 staticpro (&Qpre_write_conversion);
7778
7779 Qno_conversion = intern ("no-conversion");
7780 staticpro (&Qno_conversion);
7781
7782 Qundecided = intern ("undecided");
7783 staticpro (&Qundecided);
7784
7785 Qcoding_system_p = intern ("coding-system-p");
7786 staticpro (&Qcoding_system_p);
7787
7788 Qcoding_system_error = intern ("coding-system-error");
7789 staticpro (&Qcoding_system_error);
7790
7791 Fput (Qcoding_system_error, Qerror_conditions,
7792 Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
7793 Fput (Qcoding_system_error, Qerror_message,
7794 build_string ("Invalid coding system"));
7795
7796 Qcoding_category = intern ("coding-category");
7797 staticpro (&Qcoding_category);
7798 Qcoding_category_index = intern ("coding-category-index");
7799 staticpro (&Qcoding_category_index);
7800
7801 Vcoding_category_table
7802 = Fmake_vector (make_number (CODING_CATEGORY_IDX_MAX), Qnil);
7803 staticpro (&Vcoding_category_table);
7804 {
7805 int i;
7806 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
7807 {
7808 XVECTOR (Vcoding_category_table)->contents[i]
7809 = intern (coding_category_name[i]);
7810 Fput (XVECTOR (Vcoding_category_table)->contents[i],
7811 Qcoding_category_index, make_number (i));
7812 }
7813 }
7814
7815 Vcoding_system_safe_chars = Fcons (Qnil, Qnil);
7816 staticpro (&Vcoding_system_safe_chars);
7817
7818 Qtranslation_table = intern ("translation-table");
7819 staticpro (&Qtranslation_table);
7820 Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
7821
7822 Qtranslation_table_id = intern ("translation-table-id");
7823 staticpro (&Qtranslation_table_id);
7824
7825 Qtranslation_table_for_decode = intern ("translation-table-for-decode");
7826 staticpro (&Qtranslation_table_for_decode);
7827
7828 Qtranslation_table_for_encode = intern ("translation-table-for-encode");
7829 staticpro (&Qtranslation_table_for_encode);
7830
7831 Qsafe_chars = intern ("safe-chars");
7832 staticpro (&Qsafe_chars);
7833
7834 Qchar_coding_system = intern ("char-coding-system");
7835 staticpro (&Qchar_coding_system);
7836
7837 /* Intern this now in case it isn't already done.
7838 Setting this variable twice is harmless.
7839 But don't staticpro it here--that is done in alloc.c. */
7840 Qchar_table_extra_slots = intern ("char-table-extra-slots");
7841 Fput (Qsafe_chars, Qchar_table_extra_slots, make_number (0));
7842 Fput (Qchar_coding_system, Qchar_table_extra_slots, make_number (0));
7843
7844 Qvalid_codes = intern ("valid-codes");
7845 staticpro (&Qvalid_codes);
7846
7847 Qascii_incompatible = intern ("ascii-incompatible");
7848 staticpro (&Qascii_incompatible);
7849
7850 Qemacs_mule = intern ("emacs-mule");
7851 staticpro (&Qemacs_mule);
7852
7853 Qraw_text = intern ("raw-text");
7854 staticpro (&Qraw_text);
7855
7856 Qutf_8 = intern ("utf-8");
7857 staticpro (&Qutf_8);
7858
7859 Qcoding_system_define_form = intern ("coding-system-define-form");
7860 staticpro (&Qcoding_system_define_form);
7861
7862 defsubr (&Scoding_system_p);
7863 defsubr (&Sread_coding_system);
7864 defsubr (&Sread_non_nil_coding_system);
7865 defsubr (&Scheck_coding_system);
7866 defsubr (&Sdetect_coding_region);
7867 defsubr (&Sdetect_coding_string);
7868 defsubr (&Sfind_coding_systems_region_internal);
7869 defsubr (&Sunencodable_char_position);
7870 defsubr (&Sdecode_coding_region);
7871 defsubr (&Sencode_coding_region);
7872 defsubr (&Sdecode_coding_string);
7873 defsubr (&Sencode_coding_string);
7874 defsubr (&Sdecode_sjis_char);
7875 defsubr (&Sencode_sjis_char);
7876 defsubr (&Sdecode_big5_char);
7877 defsubr (&Sencode_big5_char);
7878 defsubr (&Sset_terminal_coding_system_internal);
7879 defsubr (&Sset_safe_terminal_coding_system_internal);
7880 defsubr (&Sterminal_coding_system);
7881 defsubr (&Sset_keyboard_coding_system_internal);
7882 defsubr (&Skeyboard_coding_system);
7883 defsubr (&Sfind_operation_coding_system);
7884 defsubr (&Supdate_coding_systems_internal);
7885 defsubr (&Sset_coding_priority_internal);
7886 defsubr (&Sdefine_coding_system_internal);
7887
7888 DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
7889 doc: /* List of coding systems.
7890
7891 Do not alter the value of this variable manually. This variable should be
7892 updated by the functions `make-coding-system' and
7893 `define-coding-system-alias'. */);
7894 Vcoding_system_list = Qnil;
7895
7896 DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
7897 doc: /* Alist of coding system names.
7898 Each element is one element list of coding system name.
7899 This variable is given to `completing-read' as TABLE argument.
7900
7901 Do not alter the value of this variable manually. This variable should be
7902 updated by the functions `make-coding-system' and
7903 `define-coding-system-alias'. */);
7904 Vcoding_system_alist = Qnil;
7905
7906 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
7907 doc: /* List of coding-categories (symbols) ordered by priority.
7908
7909 On detecting a coding system, Emacs tries code detection algorithms
7910 associated with each coding-category one by one in this order. When
7911 one algorithm agrees with a byte sequence of source text, the coding
7912 system bound to the corresponding coding-category is selected.
7913
7914 Don't modify this variable directly, but use `set-coding-priority'. */);
7915 {
7916 int i;
7917
7918 Vcoding_category_list = Qnil;
7919 for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
7920 Vcoding_category_list
7921 = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
7922 Vcoding_category_list);
7923 }
7924
7925 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
7926 doc: /* Specify the coding system for read operations.
7927 It is useful to bind this variable with `let', but do not set it globally.
7928 If the value is a coding system, it is used for decoding on read operation.
7929 If not, an appropriate element is used from one of the coding system alists:
7930 There are three such tables, `file-coding-system-alist',
7931 `process-coding-system-alist', and `network-coding-system-alist'. */);
7932 Vcoding_system_for_read = Qnil;
7933
7934 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
7935 doc: /* Specify the coding system for write operations.
7936 Programs bind this variable with `let', but you should not set it globally.
7937 If the value is a coding system, it is used for encoding of output,
7938 when writing it to a file and when sending it to a file or subprocess.
7939
7940 If this does not specify a coding system, an appropriate element
7941 is used from one of the coding system alists:
7942 There are three such tables, `file-coding-system-alist',
7943 `process-coding-system-alist', and `network-coding-system-alist'.
7944 For output to files, if the above procedure does not specify a coding system,
7945 the value of `buffer-file-coding-system' is used. */);
7946 Vcoding_system_for_write = Qnil;
7947
7948 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
7949 doc: /* Coding system used in the latest file or process I/O.
7950 Also set by `encode-coding-region', `decode-coding-region',
7951 `encode-coding-string' and `decode-coding-string'. */);
7952 Vlast_coding_system_used = Qnil;
7953
7954 DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
7955 doc: /* *Non-nil means always inhibit code conversion of end-of-line format.
7956 See info node `Coding Systems' and info node `Text and Binary' concerning
7957 such conversion. */);
7958 inhibit_eol_conversion = 0;
7959
7960 DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
7961 doc: /* Non-nil means process buffer inherits coding system of process output.
7962 Bind it to t if the process output is to be treated as if it were a file
7963 read from some filesystem. */);
7964 inherit_process_coding_system = 0;
7965
7966 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
7967 doc: /* Alist to decide a coding system to use for a file I/O operation.
7968 The format is ((PATTERN . VAL) ...),
7969 where PATTERN is a regular expression matching a file name,
7970 VAL is a coding system, a cons of coding systems, or a function symbol.
7971 If VAL is a coding system, it is used for both decoding and encoding
7972 the file contents.
7973 If VAL is a cons of coding systems, the car part is used for decoding,
7974 and the cdr part is used for encoding.
7975 If VAL is a function symbol, the function must return a coding system
7976 or a cons of coding systems which are used as above. The function is
7977 called with an argument that is a list of the arguments with which
7978 `find-operation-coding-system' was called.
7979
7980 See also the function `find-operation-coding-system'
7981 and the variable `auto-coding-alist'. */);
7982 Vfile_coding_system_alist = Qnil;
7983
7984 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
7985 doc: /* Alist to decide a coding system to use for a process I/O operation.
7986 The format is ((PATTERN . VAL) ...),
7987 where PATTERN is a regular expression matching a program name,
7988 VAL is a coding system, a cons of coding systems, or a function symbol.
7989 If VAL is a coding system, it is used for both decoding what received
7990 from the program and encoding what sent to the program.
7991 If VAL is a cons of coding systems, the car part is used for decoding,
7992 and the cdr part is used for encoding.
7993 If VAL is a function symbol, the function must return a coding system
7994 or a cons of coding systems which are used as above.
7995
7996 See also the function `find-operation-coding-system'. */);
7997 Vprocess_coding_system_alist = Qnil;
7998
7999 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
8000 doc: /* Alist to decide a coding system to use for a network I/O operation.
8001 The format is ((PATTERN . VAL) ...),
8002 where PATTERN is a regular expression matching a network service name
8003 or is a port number to connect to,
8004 VAL is a coding system, a cons of coding systems, or a function symbol.
8005 If VAL is a coding system, it is used for both decoding what received
8006 from the network stream and encoding what sent to the network stream.
8007 If VAL is a cons of coding systems, the car part is used for decoding,
8008 and the cdr part is used for encoding.
8009 If VAL is a function symbol, the function must return a coding system
8010 or a cons of coding systems which are used as above.
8011
8012 See also the function `find-operation-coding-system'. */);
8013 Vnetwork_coding_system_alist = Qnil;
8014
8015 DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
8016 doc: /* Coding system to use with system messages.
8017 Also used for decoding keyboard input on X Window system. */);
8018 Vlocale_coding_system = Qnil;
8019
8020 /* The eol mnemonics are reset in startup.el system-dependently. */
8021 DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
8022 doc: /* *String displayed in mode line for UNIX-like (LF) end-of-line format. */);
8023 eol_mnemonic_unix = build_string (":");
8024
8025 DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
8026 doc: /* *String displayed in mode line for DOS-like (CRLF) end-of-line format. */);
8027 eol_mnemonic_dos = build_string ("\\");
8028
8029 DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
8030 doc: /* *String displayed in mode line for MAC-like (CR) end-of-line format. */);
8031 eol_mnemonic_mac = build_string ("/");
8032
8033 DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
8034 doc: /* *String displayed in mode line when end-of-line format is not yet determined. */);
8035 eol_mnemonic_undecided = build_string (":");
8036
8037 DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
8038 doc: /* *Non-nil enables character translation while encoding and decoding. */);
8039 Venable_character_translation = Qt;
8040
8041 DEFVAR_LISP ("standard-translation-table-for-decode",
8042 &Vstandard_translation_table_for_decode,
8043 doc: /* Table for translating characters while decoding. */);
8044 Vstandard_translation_table_for_decode = Qnil;
8045
8046 DEFVAR_LISP ("standard-translation-table-for-encode",
8047 &Vstandard_translation_table_for_encode,
8048 doc: /* Table for translating characters while encoding. */);
8049 Vstandard_translation_table_for_encode = Qnil;
8050
8051 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
8052 doc: /* Alist of charsets vs revision numbers.
8053 While encoding, if a charset (car part of an element) is found,
8054 designate it with the escape sequence identifying revision (cdr part of the element). */);
8055 Vcharset_revision_alist = Qnil;
8056
8057 DEFVAR_LISP ("default-process-coding-system",
8058 &Vdefault_process_coding_system,
8059 doc: /* Cons of coding systems used for process I/O by default.
8060 The car part is used for decoding a process output,
8061 the cdr part is used for encoding a text to be sent to a process. */);
8062 Vdefault_process_coding_system = Qnil;
8063
8064 DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
8065 doc: /* Table of extra Latin codes in the range 128..159 (inclusive).
8066 This is a vector of length 256.
8067 If Nth element is non-nil, the existence of code N in a file
8068 \(or output of subprocess) doesn't prevent it to be detected as
8069 a coding system of ISO 2022 variant which has a flag
8070 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
8071 or reading output of a subprocess.
8072 Only 128th through 159th elements has a meaning. */);
8073 Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
8074
8075 DEFVAR_LISP ("select-safe-coding-system-function",
8076 &Vselect_safe_coding_system_function,
8077 doc: /* Function to call to select safe coding system for encoding a text.
8078
8079 If set, this function is called to force a user to select a proper
8080 coding system which can encode the text in the case that a default
8081 coding system used in each operation can't encode the text.
8082
8083 The default value is `select-safe-coding-system' (which see). */);
8084 Vselect_safe_coding_system_function = Qnil;
8085
8086 DEFVAR_BOOL ("coding-system-require-warning",
8087 &coding_system_require_warning,
8088 doc: /* Internal use only.
8089 If non-nil, on writing a file, `select-safe-coding-system-function' is
8090 called even if `coding-system-for-write' is non-nil. The command
8091 `universal-coding-system-argument' binds this variable to t temporarily. */);
8092 coding_system_require_warning = 0;
8093
8094
8095 DEFVAR_BOOL ("inhibit-iso-escape-detection",
8096 &inhibit_iso_escape_detection,
8097 doc: /* If non-nil, Emacs ignores ISO2022's escape sequence on code detection.
8098
8099 By default, on reading a file, Emacs tries to detect how the text is
8100 encoded. This code detection is sensitive to escape sequences. If
8101 the sequence is valid as ISO2022, the code is determined as one of
8102 the ISO2022 encodings, and the file is decoded by the corresponding
8103 coding system (e.g. `iso-2022-7bit').
8104
8105 However, there may be a case that you want to read escape sequences in
8106 a file as is. In such a case, you can set this variable to non-nil.
8107 Then, as the code detection ignores any escape sequences, no file is
8108 detected as encoded in some ISO2022 encoding. The result is that all
8109 escape sequences become visible in a buffer.
8110
8111 The default value is nil, and it is strongly recommended not to change
8112 it. That is because many Emacs Lisp source files that contain
8113 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
8114 in Emacs's distribution, and they won't be decoded correctly on
8115 reading if you suppress escape sequence detection.
8116
8117 The other way to read escape sequences in a file without decoding is
8118 to explicitly specify some coding system that doesn't use ISO2022's
8119 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument]. */);
8120 inhibit_iso_escape_detection = 0;
8121
8122 DEFVAR_LISP ("translation-table-for-input", &Vtranslation_table_for_input,
8123 doc: /* Char table for translating self-inserting characters.
8124 This is applied to the result of input methods, not their input. See also
8125 `keyboard-translate-table'. */);
8126 Vtranslation_table_for_input = Qnil;
8127 }
8128
8129 char *
8130 emacs_strerror (error_number)
8131 int error_number;
8132 {
8133 char *str;
8134
8135 synchronize_system_messages_locale ();
8136 str = strerror (error_number);
8137
8138 if (! NILP (Vlocale_coding_system))
8139 {
8140 Lisp_Object dec = code_convert_string_norecord (build_string (str),
8141 Vlocale_coding_system,
8142 0);
8143 str = (char *) SDATA (dec);
8144 }
8145
8146 return str;
8147 }
8148
8149 #endif /* emacs */
8150
8151 /* arch-tag: 3a3a2b01-5ff6-4071-9afe-f5b808d9229d
8152 (do not change this comment) */