Merge from emacs--rel--22
[bpt/emacs.git] / src / coding.c
1 /* Coding system handler (conversion, detection, and etc).
2 Copyright (C) 2001, 2002, 2003, 2004, 2005,
3 2006, 2007, 2008 Free Software Foundation, Inc.
4 Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
5 2005, 2006, 2007, 2008
6 National Institute of Advanced Industrial Science and Technology (AIST)
7 Registration Number H14PRO021
8
9 This file is part of GNU Emacs.
10
11 GNU Emacs is free software; you can redistribute it and/or modify
12 it under the terms of the GNU General Public License as published by
13 the Free Software Foundation; either version 3, or (at your option)
14 any later version.
15
16 GNU Emacs is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 GNU General Public License for more details.
20
21 You should have received a copy of the GNU General Public License
22 along with GNU Emacs; see the file COPYING. If not, write to
23 the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
24 Boston, MA 02110-1301, USA. */
25
26 /*** TABLE OF CONTENTS ***
27
28 0. General comments
29 1. Preamble
30 2. Emacs' internal format (emacs-mule) handlers
31 3. ISO2022 handlers
32 4. Shift-JIS and BIG5 handlers
33 5. CCL handlers
34 6. End-of-line handlers
35 7. C library functions
36 8. Emacs Lisp library functions
37 9. Post-amble
38
39 */
40
41 /*** 0. General comments ***/
42
43
44 /*** GENERAL NOTE on CODING SYSTEMS ***
45
46 A coding system is an encoding mechanism for one or more character
47 sets. Here's a list of coding systems which Emacs can handle. When
48 we say "decode", it means converting some other coding system to
49 Emacs' internal format (emacs-mule), and when we say "encode",
50 it means converting the coding system emacs-mule to some other
51 coding system.
52
53 0. Emacs' internal format (emacs-mule)
54
55 Emacs itself holds a multi-lingual character in buffers and strings
56 in a special format. Details are described in section 2.
57
58 1. ISO2022
59
60 The most famous coding system for multiple character sets. X's
61 Compound Text, various EUCs (Extended Unix Code), and coding
62 systems used in Internet communication such as ISO-2022-JP are
63 all variants of ISO2022. Details are described in section 3.
64
65 2. SJIS (or Shift-JIS or MS-Kanji-Code)
66
67 A coding system to encode character sets: ASCII, JISX0201, and
68 JISX0208. Widely used for PC's in Japan. Details are described in
69 section 4.
70
71 3. BIG5
72
73 A coding system to encode the character sets ASCII and Big5. Widely
74 used for Chinese (mainly in Taiwan and Hong Kong). Details are
75 described in section 4. In this file, when we write "BIG5"
76 (all uppercase), we mean the coding system, and when we write
77 "Big5" (capitalized), we mean the character set.
78
79 4. Raw text
80
81 A coding system for text containing random 8-bit code. Emacs does
82 no code conversion on such text except for end-of-line format.
83
84 5. Other
85
86 If a user wants to read/write text encoded in a coding system not
87 listed above, he can supply a decoder and an encoder for it as CCL
88 (Code Conversion Language) programs. Emacs executes the CCL program
89 while reading/writing.
90
91 Emacs represents a coding system by a Lisp symbol that has a property
92 `coding-system'. But, before actually using the coding system, the
93 information about it is set in a structure of type `struct
94 coding_system' for rapid processing. See section 6 for more details.
95
96 */
97
98 /*** GENERAL NOTES on END-OF-LINE FORMAT ***
99
100 How end-of-line of text is encoded depends on the operating system.
101 For instance, Unix's format is just one byte of `line-feed' code,
102 whereas DOS's format is two-byte sequence of `carriage-return' and
103 `line-feed' codes. MacOS's format is usually one byte of
104 `carriage-return'.
105
106 Since text character encoding and end-of-line encoding are
107 independent, any coding system described above can have any
108 end-of-line format. So Emacs has information about end-of-line
109 format in each coding-system. See section 6 for more details.
110
111 */
112
113 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
114
115 These functions check if a text between SRC and SRC_END is encoded
116 in the coding system category XXX. Each returns an integer value in
117 which appropriate flag bits for the category XXX are set. The flag
118 bits are defined in macros CODING_CATEGORY_MASK_XXX. Below is the
119 template for these functions. If MULTIBYTEP is nonzero, 8-bit codes
120 of the range 0x80..0x9F are in multibyte form. */
121 #if 0
122 int
123 detect_coding_emacs_mule (src, src_end, multibytep)
124 unsigned char *src, *src_end;
125 int multibytep;
126 {
127 ...
128 }
129 #endif
130
131 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
132
133 These functions decode SRC_BYTES length of unibyte text at SOURCE
134 encoded in CODING to Emacs' internal format. The resulting
135 multibyte text goes to a place pointed to by DESTINATION, the length
136 of which should not exceed DST_BYTES.
137
138 These functions set the information about original and decoded texts
139 in the members `produced', `produced_char', `consumed', and
140 `consumed_char' of the structure *CODING. They also set the member
141 `result' to one of CODING_FINISH_XXX indicating how the decoding
142 finished.
143
144 DST_BYTES zero means that the source area and destination area are
145 overlapped, which means that we can produce a decoded text until it
146 reaches the head of the not-yet-decoded source text.
147
148 Below is a template for these functions. */
149 #if 0
150 static void
151 decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
152 struct coding_system *coding;
153 const unsigned char *source;
154 unsigned char *destination;
155 int src_bytes, dst_bytes;
156 {
157 ...
158 }
159 #endif
160
161 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
162
163 These functions encode SRC_BYTES length text at SOURCE from Emacs'
164 internal multibyte format to CODING. The resulting unibyte text
165 goes to a place pointed to by DESTINATION, the length of which
166 should not exceed DST_BYTES.
167
168 These functions set the information about original and encoded texts
169 in the members `produced', `produced_char', `consumed', and
170 `consumed_char' of the structure *CODING. They also set the member
171 `result' to one of CODING_FINISH_XXX indicating how the encoding
172 finished.
173
174 DST_BYTES zero means that the source area and destination area are
175 overlapped, which means that we can produce encoded text until it
176 reaches the head of the not-yet-encoded source text.
177
178 Below is a template for these functions. */
179 #if 0
180 static void
181 encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
182 struct coding_system *coding;
183 unsigned char *source, *destination;
184 int src_bytes, dst_bytes;
185 {
186 ...
187 }
188 #endif
189
190 /*** COMMONLY USED MACROS ***/
191
/* The following two macros ONE_MORE_BYTE and TWO_MORE_BYTES safely
   get one and two bytes from the source text respectively, advancing
   `src' past the bytes they fetch.  If there are not enough bytes
   left in the source, they store CODING_FINISH_INSUFFICIENT_SRC in
   coding->result and jump to `label_end_of_loop' without touching
   `src'.  The caller must have variables `coding', `src' and
   `src_end' in scope and must provide the label.  These macros are
   called from decoding routines `decode_coding_XXX', thus it is
   assumed that the source text is unibyte.  */

#define ONE_MORE_BYTE(c1)					\
  do {								\
    if (src < src_end)						\
      c1 = *src++;						\
    else							\
      {								\
	coding->result = CODING_FINISH_INSUFFICIENT_SRC;	\
	goto label_end_of_loop;					\
      }								\
  } while (0)
209
/* Fetch the next two source bytes into C1 and C2 (in that order) and
   advance `src' past both.  If fewer than two bytes remain, fetch
   nothing, record CODING_FINISH_INSUFFICIENT_SRC in coding->result
   and jump to `label_end_of_loop'.  */
#define TWO_MORE_BYTES(c1, c2)					\
  do {								\
    if (src + 1 < src_end)					\
      {								\
	c1 = *src++;						\
	c2 = *src++;						\
      }								\
    else							\
      {								\
	coding->result = CODING_FINISH_INSUFFICIENT_SRC;	\
	goto label_end_of_loop;					\
      }								\
  } while (0)
220
221
/* Like ONE_MORE_BYTE, but 8-bit bytes of data at SRC are in multibyte
   form if MULTIBYTEP is nonzero: a LEADING_CODE_8_BIT_CONTROL byte is
   then followed by a second byte, and C1 is set to that second byte
   minus 0x20.  In addition, if SRC is not less than SRC_END, set
   coding->result to CODING_FINISH_INSUFFICIENT_SRC and return from
   the enclosing function with RET instead of jumping to a label.

   The second byte of a two-byte form is bounds-checked too: if the
   source ends right after LEADING_CODE_8_BIT_CONTROL, the macro
   returns RET rather than reading the byte at SRC_END.  */

#define ONE_MORE_BYTE_CHECK_MULTIBYTE(c1, multibytep, ret)	\
  do {								\
    if (src >= src_end)						\
      {								\
	coding->result = CODING_FINISH_INSUFFICIENT_SRC;	\
	return ret;						\
      }								\
    c1 = *src++;						\
    if (multibytep && c1 == LEADING_CODE_8_BIT_CONTROL)		\
      {								\
	/* Truncated two-byte form: do not read past SRC_END.  */	\
	if (src >= src_end)					\
	  {							\
	    coding->result = CODING_FINISH_INSUFFICIENT_SRC;	\
	    return ret;						\
	  }							\
	c1 = *src++ - 0x20;					\
      }								\
  } while (0)
237
/* Set C to the next character at the source text pointed by `src'.
   If there are not enough characters in the source, jump to
   `label_end_of_loop'.  The caller should set variables `coding'
   `src', `src_end', and `translation_table' to appropriate pointers
   in advance.  This macro is used in encoding routines
   `encode_coding_XXX', thus it assumes that the source text is in
   multibyte form except for 8-bit characters.  8-bit characters are
   in multibyte form if coding->src_multibyte is nonzero, else they
   are represented by a single byte.

   When `translation_table' is non-nil, C is passed through
   translate_char before the macro finishes.  `src' is advanced by
   the number of bytes the character occupied.

   The macro declares block-local variables named `len' and `bytes';
   do not pass an argument that refers to caller variables of those
   names.  */

#define ONE_MORE_CHAR(c)					\
  do {								\
    int len = src_end - src;					\
    int bytes;							\
    if (len <= 0)						\
      {								\
	coding->result = CODING_FINISH_INSUFFICIENT_SRC;	\
	goto label_end_of_loop;					\
      }								\
    if (coding->src_multibyte					\
	|| UNIBYTE_STR_AS_MULTIBYTE_P (src, len, bytes))	\
      c = STRING_CHAR_AND_LENGTH (src, len, bytes);		\
    else							\
      /* Not a multibyte form: take the single byte as is.  */	\
      c = *src, bytes = 1;					\
    if (!NILP (translation_table))				\
      c = translate_char (translation_table, c, -1, 0, 0);	\
    src += bytes;						\
  } while (0)
266
267
/* Produce a multibyte form of character C to `dst'.  Jump to
   `label_end_of_loop' if there's not enough space at `dst'.

   If we are now in the middle of a composition sequence, the decoded
   character may be ALTCHAR (for the current composition).  In that
   case, the character goes to coding->cmp_data->data instead of
   `dst'.

   The available space ends at `dst_end' when `dst_bytes' is nonzero,
   and at `src' otherwise (source and destination overlap).  On
   insufficient space coding->result is set to
   CODING_FINISH_INSUFFICIENT_DST.  Each character written to `dst'
   increments coding->produced_char.

   This macro is used in decoding routines.  */

#define EMIT_CHAR(c)							\
  do {									\
    /* Write to `dst' unless composing with alternate chars.  */	\
    if (! COMPOSING_P (coding)						\
	|| coding->composing == COMPOSITION_RELATIVE			\
	|| coding->composing == COMPOSITION_WITH_RULE)			\
      {									\
	int bytes = CHAR_BYTES (c);					\
	if ((dst + bytes) > (dst_bytes ? dst_end : src))		\
	  {								\
	    coding->result = CODING_FINISH_INSUFFICIENT_DST;		\
	    goto label_end_of_loop;					\
	  }								\
	dst += CHAR_STRING (c, dst);					\
	coding->produced_char++;					\
      }									\
									\
    /* Record C as a component of any non-relative composition.  */	\
    if (COMPOSING_P (coding)						\
	&& coding->composing != COMPOSITION_RELATIVE)			\
      {									\
	CODING_ADD_COMPOSITION_COMPONENT (coding, c);			\
	coding->composition_rule_follows				\
	  = coding->composing != COMPOSITION_WITH_ALTCHARS;		\
      }									\
  } while (0)
302
303
/* Store the single byte C at `dst' and advance `dst'.  The writable
   area ends at `dst_end' when `dst_bytes' is nonzero, otherwise at
   `src'.  If no room is left, set coding->result to
   CODING_FINISH_INSUFFICIENT_DST and jump to `label_end_of_loop'.  */
#define EMIT_ONE_BYTE(c)					\
  do {								\
    if (dst < (dst_bytes ? dst_end : src))			\
      *dst++ = c;						\
    else							\
      {								\
	coding->result = CODING_FINISH_INSUFFICIENT_DST;	\
	goto label_end_of_loop;					\
      }								\
  } while (0)
313
/* Store the bytes C1 and C2 (in that order) at `dst' and advance
   `dst' by two.  The writable area ends at `dst_end' when `dst_bytes'
   is nonzero, otherwise at `src'.  If fewer than two bytes of room
   remain, write nothing, set coding->result to
   CODING_FINISH_INSUFFICIENT_DST and jump to `label_end_of_loop'.  */
#define EMIT_TWO_BYTES(c1, c2)					\
  do {								\
    if (dst + 2 <= (dst_bytes ? dst_end : src))			\
      {								\
	*dst++ = c1;						\
	*dst++ = c2;						\
      }								\
    else							\
      {								\
	coding->result = CODING_FINISH_INSUFFICIENT_DST;	\
	goto label_end_of_loop;					\
      }								\
  } while (0)
323
/* Copy the bytes in the range [FROM, TO) to `dst', advancing both
   FROM and `dst'; FROM must therefore be a modifiable pointer
   variable.  The writable area ends at `dst_end' when `dst_bytes' is
   nonzero, otherwise at `src'.  If the whole range does not fit,
   copy nothing, set coding->result to CODING_FINISH_INSUFFICIENT_DST
   and jump to `label_end_of_loop'.  */
#define EMIT_BYTES(from, to)					\
  do {								\
    if (dst + (to - from) <= (dst_bytes ? dst_end : src))	\
      {								\
	for (; from < to; from++)				\
	  *dst++ = *from;					\
      }								\
    else							\
      {								\
	coding->result = CODING_FINISH_INSUFFICIENT_DST;	\
	goto label_end_of_loop;					\
      }								\
  } while (0)
334
335 \f
336 /*** 1. Preamble ***/
337
338 #ifdef emacs
339 #include <config.h>
340 #endif
341
342 #include <stdio.h>
343
344 #ifdef emacs
345
346 #include "lisp.h"
347 #include "buffer.h"
348 #include "charset.h"
349 #include "composite.h"
350 #include "ccl.h"
351 #include "coding.h"
352 #include "window.h"
353 #include "intervals.h"
354 #include "frame.h"
355 #include "termhooks.h"
356
357 #else /* not emacs */
358
359 #include "mulelib.h"
360
361 #endif /* not emacs */
362
/* Lisp symbols referenced by the coding-system code.  Qcoding_system
   names the symbol property from which coding_safe_chars (below)
   fetches a coding system's spec vector, and Qsafe_chars is the
   plist key it looks up in that vector.
   NOTE(review): these objects are only declared here; presumably
   they are initialized elsewhere in this file -- confirm.  */
Lisp_Object Qcoding_system, Qeol_type;
Lisp_Object Qbuffer_file_coding_system;
Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
Lisp_Object Qno_conversion, Qundecided;
Lisp_Object Qcoding_system_history;
Lisp_Object Qsafe_chars;
Lisp_Object Qvalid_codes;
Lisp_Object Qascii_incompatible;

/* Qinsert_file_contents and Qwrite_region are defined in another
   file; the remaining symbols are defined here.  */
extern Lisp_Object Qinsert_file_contents, Qwrite_region;
Lisp_Object Qcall_process, Qcall_process_region;
Lisp_Object Qstart_process, Qopen_network_stream;
Lisp_Object Qtarget_idx;

/* Defined in another file.  */
extern Lisp_Object Qcompletion_ignore_case;

/* If a symbol has this property, evaluate the value to define the
   symbol as a coding system.  */
Lisp_Object Qcoding_system_define_form;

/* NOTE(review): presumably holds the Lisp function used to select a
   safe coding system -- its use is not visible in this part of the
   file; confirm.  */
Lisp_Object Vselect_safe_coding_system_function;

/* NOTE(review): flag whose use is not visible in this part of the
   file.  */
int coding_system_require_warning;

/* Mnemonic string for each format of end-of-line.  */
Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
/* Mnemonic string to indicate format of end-of-line is not yet
   decided.  */
Lisp_Object eol_mnemonic_undecided;

/* Format of end-of-line decided by system.  This is CODING_EOL_LF on
   Unix, CODING_EOL_CRLF on DOS/Windows, and CODING_EOL_CR on Mac.
   This has an effect only for external encoding (i.e. for output to
   file and process), not for in-buffer or Lisp string encoding.  */
int system_eol_type;
398
/* The variables in this section exist only when building Emacs
   itself (`emacs' defined).  */
#ifdef emacs

/* Information about which coding system is safe for which chars.
   The value has the form (GENERIC-LIST . NON-GENERIC-ALIST).

   GENERIC-LIST is a list of generic coding systems which can encode
   any characters.

   NON-GENERIC-ALIST is an alist of non generic coding systems vs the
   corresponding char table that contains safe chars.  */
Lisp_Object Vcoding_system_safe_chars;

/* NOTE(review): presumably the list and the alist of defined coding
   systems; how they are filled is not visible here.  */
Lisp_Object Vcoding_system_list, Vcoding_system_alist;

/* NOTE(review): symbols, presumably `coding-system-p' and
   `coding-system-error' -- confirm against the initialization code.  */
Lisp_Object Qcoding_system_p, Qcoding_system_error;

/* Coding system emacs-mule and raw-text are for converting only
   end-of-line format.  */
Lisp_Object Qemacs_mule, Qraw_text;

/* NOTE(review): presumably the symbol `utf-8'.  */
Lisp_Object Qutf_8;

/* Coding-systems are handed between Emacs Lisp programs and C internal
   routines by the following three variables.  */
/* Coding-system for reading files and receiving data from process.  */
Lisp_Object Vcoding_system_for_read;
/* Coding-system for writing files and sending data to process.  */
Lisp_Object Vcoding_system_for_write;
/* Coding-system actually used in the latest I/O.  */
Lisp_Object Vlast_coding_system_used;

/* A vector of length 256 which contains information about special
   Latin codes (especially for dealing with Microsoft codes).  */
Lisp_Object Vlatin_extra_code_table;

/* Flag to inhibit code conversion of end-of-line format.  */
int inhibit_eol_conversion;

/* Flag to inhibit ISO2022 escape sequence detection.  */
int inhibit_iso_escape_detection;

/* Flag to make buffer-file-coding-system inherit from process-coding.  */
int inherit_process_coding_system;

/* Coding system to be used to encode text for terminal display when
   terminal coding system is nil.  */
struct coding_system safe_terminal_coding;

/* Default coding system to be used to write a file.  */
struct coding_system default_buffer_file_coding;

/* NOTE(review): presumably alists that select a coding system for
   file, process and network I/O respectively -- the lookup code is
   not visible here.  */
Lisp_Object Vfile_coding_system_alist;
Lisp_Object Vprocess_coding_system_alist;
Lisp_Object Vnetwork_coding_system_alist;

/* NOTE(review): presumably the coding system matching the current
   locale -- confirm.  */
Lisp_Object Vlocale_coding_system;

#endif /* emacs */
457
/* NOTE(review): symbols, presumably `coding-category' and
   `coding-category-index' -- confirm against the initialization
   code.  */
Lisp_Object Qcoding_category, Qcoding_category_index;

/* List of symbols `coding-category-xxx' ordered by priority.  */
Lisp_Object Vcoding_category_list;

/* Table of coding categories (Lisp symbols).  */
Lisp_Object Vcoding_category_table;

/* Table of names of symbol for each coding-category.  The table has
   CODING_CATEGORY_IDX_MAX slots.
   NOTE(review): the order of the names presumably has to match the
   CODING_CATEGORY_IDX_XXX index values, which are defined outside
   this file -- confirm before reordering.  */
char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
  "coding-category-emacs-mule",
  "coding-category-sjis",
  "coding-category-iso-7",
  "coding-category-iso-7-tight",
  "coding-category-iso-8-1",
  "coding-category-iso-8-2",
  "coding-category-iso-7-else",
  "coding-category-iso-8-else",
  "coding-category-ccl",
  "coding-category-big5",
  "coding-category-utf-8",
  "coding-category-utf-16-be",
  "coding-category-utf-16-le",
  "coding-category-raw-text",
  "coding-category-binary"
};
484
/* Table of pointers to coding systems corresponding to each coding
   categories.  */
struct coding_system *coding_system_table[CODING_CATEGORY_IDX_MAX];

/* Table of coding category masks.  Nth element is a mask for a coding
   category of which priority is Nth.  */
static
int coding_priorities[CODING_CATEGORY_IDX_MAX];

/* Flag to tell if we look up translation table on character code
   conversion.  */
Lisp_Object Venable_character_translation;
/* Standard translation table to look up on decoding (reading).  */
Lisp_Object Vstandard_translation_table_for_decode;
/* Standard translation table to look up on encoding (writing).  */
Lisp_Object Vstandard_translation_table_for_encode;

/* NOTE(review): symbols related to translation tables; their exact
   Lisp names are set in initialization code not visible here.  */
Lisp_Object Qtranslation_table;
Lisp_Object Qtranslation_table_id;
Lisp_Object Qtranslation_table_for_decode;
Lisp_Object Qtranslation_table_for_encode;

/* Alist of charsets vs revision number.  */
Lisp_Object Vcharset_revision_alist;

/* Default coding systems used for process I/O.  */
Lisp_Object Vdefault_process_coding_system;

/* Char table for translating Quail and self-inserting input.  */
Lisp_Object Vtranslation_table_for_input;

/* Global flag to tell that we can't call post-read-conversion and
   pre-write-conversion functions.  Usually the value is zero, but it
   is set to 1 temporarily while such functions are running.  This is
   to avoid infinite recursive call.  */
static int inhibit_pre_post_conversion;

/* NOTE(review): symbol, presumably `char-coding-system' -- confirm.  */
Lisp_Object Qchar_coding_system;
523
524 /* Return `safe-chars' property of CODING_SYSTEM (symbol). Don't check
525 its validity. */
526
527 Lisp_Object
528 coding_safe_chars (coding_system)
529 Lisp_Object coding_system;
530 {
531 Lisp_Object coding_spec, plist, safe_chars;
532
533 coding_spec = Fget (coding_system, Qcoding_system);
534 plist = XVECTOR (coding_spec)->contents[3];
535 safe_chars = Fplist_get (XVECTOR (coding_spec)->contents[3], Qsafe_chars);
536 return (CHAR_TABLE_P (safe_chars) ? safe_chars : Qt);
537 }
538
/* Nonzero if character C is safe according to SAFE_CHARS, i.e. if
   SAFE_CHARS is Qt or its entry for C is non-nil.  The table is
   consulted only when SAFE_CHARS is not Qt.  */
#define CODING_SAFE_CHAR_P(safe_chars, c) \
  (! (! EQ (safe_chars, Qt) && NILP (CHAR_TABLE_REF (safe_chars, c))))
541
542 \f
543 /*** 2. Emacs internal format (emacs-mule) handlers ***/
544
545 /* Emacs' internal format for representation of multiple character
546 sets is a kind of multi-byte encoding, i.e. characters are
547 represented by variable-length sequences of one-byte codes.
548
549 ASCII characters and control characters (e.g. `tab', `newline') are
550 represented by one-byte sequences which are their ASCII codes, in
551 the range 0x00 through 0x7F.
552
553 8-bit characters of the range 0x80..0x9F are represented by
554 two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
555 code + 0x20).
556
557 8-bit characters of the range 0xA0..0xFF are represented by
558 one-byte sequences which are their 8-bit code.
559
560 The other characters are represented by a sequence of `base
561 leading-code', optional `extended leading-code', and one or two
562 `position-code's. The length of the sequence is determined by the
563 base leading-code. Leading-code takes the range 0x81 through 0x9D,
564 whereas extended leading-code and position-code take the range 0xA0
565 through 0xFF. See `charset.h' for more details about leading-code
566 and position-code.
567
568 --- CODE RANGE of Emacs' internal format ---
569 character set range
570 ------------- -----
571 ascii 0x00..0x7F
572 eight-bit-control LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
573 eight-bit-graphic 0xA0..0xFF
574 ELSE 0x81..0x9D + [0xA0..0xFF]+
575 ---------------------------------------------
576
577 As this is the internal character representation, the format is
578 usually not used externally (i.e. in a file or in a data sent to a
579 process). But, it is possible to have a text externally in this
580 format (i.e. by encoding by the coding system `emacs-mule').
581
582 In that case, a sequence of one-byte codes has a slightly different
583 form.
584
585 Firstly, all characters in eight-bit-control are represented by
586 one-byte sequences which are their 8-bit code.
587
588 Next, character composition data are represented by the byte
589 sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
590 where,
591 METHOD is 0xF0 plus one of composition method (enum
592 composition_method),
593
594 BYTES is 0xA0 plus the byte length of these composition data,
595
596 CHARS is 0xA0 plus the number of characters composed by these
597 data,
598
599 COMPONENTs are characters of multibyte form or composition
600 rules encoded by two-byte of ASCII codes.
601
602 In addition, for backward compatibility, the following formats are
603 also recognized as composition data on decoding.
604
605 0x80 MSEQ ...
606 0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
607
608 Here,
609 MSEQ is a multibyte form but in these special format:
610 ASCII: 0xA0 ASCII_CODE+0x80,
611 other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
612 RULE is a one byte code of the range 0xA0..0xF0 that
613 represents a composition rule.
614 */
615
/* Table holding one `enum emacs_code_class_type' value for each of
   the 256 possible byte values.
   NOTE(review): presumably it classifies each byte's role in the
   emacs-mule format and is filled in by initialization code; neither
   the enum nor the initialization is visible here -- confirm.  */
enum emacs_code_class_type emacs_code_class[256];
617
618 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
619 Check if a text is encoded in Emacs' internal format. If it is,
620 return CODING_CATEGORY_MASK_EMACS_MULE, else return 0. */
621
622 static int
623 detect_coding_emacs_mule (src, src_end, multibytep)
624 unsigned char *src, *src_end;
625 int multibytep;
626 {
627 unsigned char c;
628 int composing = 0;
629 /* Dummy for ONE_MORE_BYTE. */
630 struct coding_system dummy_coding;
631 struct coding_system *coding = &dummy_coding;
632
633 while (1)
634 {
635 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep,
636 CODING_CATEGORY_MASK_EMACS_MULE);
637 if (composing)
638 {
639 if (c < 0xA0)
640 composing = 0;
641 else if (c == 0xA0)
642 {
643 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, 0);
644 c &= 0x7F;
645 }
646 else
647 c -= 0x20;
648 }
649
650 if (c < 0x20)
651 {
652 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
653 return 0;
654 }
655 else if (c >= 0x80 && c < 0xA0)
656 {
657 if (c == 0x80)
658 /* Old leading code for a composite character. */
659 composing = 1;
660 else
661 {
662 unsigned char *src_base = src - 1;
663 int bytes;
664
665 if (!UNIBYTE_STR_AS_MULTIBYTE_P (src_base, src_end - src_base,
666 bytes))
667 return 0;
668 src = src_base + bytes;
669 }
670 }
671 }
672 }
673
674
/* Record the starting position START and METHOD of one composition.

   Four slots are reserved at the current end of
   CODING->cmp_data->data, and CODING->cmp_data_start is set to the
   index of the first of them.  Slot 0 is set to -1 for now, slot 1
   gets START offset by cmp_data->char_offset, and slot 3 gets
   METHOD.  Slots 0 and 2 are filled in later by
   CODING_ADD_COMPOSITION_END.

   The macro declares block-local variables `cmp_data' and `data'.  */

#define CODING_ADD_COMPOSITION_START(coding, start, method)		\
  do {									\
    struct composition_data *cmp_data = coding->cmp_data;		\
    int *data = cmp_data->data + cmp_data->used;			\
    coding->cmp_data_start = cmp_data->used;				\
    data[0] = -1;							\
    data[1] = cmp_data->char_offset + start;				\
    data[3] = (int) method;						\
    cmp_data->used += 4;						\
  } while (0)
687
/* Record the ending position END of the current composition.

   The current composition is the one whose data begin at index
   CODING->cmp_data_start.  Slot 0 of that data is set to the number
   of slots used since the start, and slot 2 to END offset by
   cmp_data->char_offset.

   The macro declares block-local variables `cmp_data' and `data'.  */

#define CODING_ADD_COMPOSITION_END(coding, end)				\
  do {									\
    struct composition_data *cmp_data = coding->cmp_data;		\
    int *data = cmp_data->data + coding->cmp_data_start;		\
    data[0] = cmp_data->used - coding->cmp_data_start;			\
    data[2] = cmp_data->char_offset + end;				\
  } while (0)
697
/* Record one COMPONENT (alternate character or composition rule).

   COMPONENT is appended to CODING->cmp_data->data.  If the data of
   the current composition thereby reach
   COMPOSITION_DATA_MAX_BUNCH_LENGTH slots, the composition is closed
   at CODING->produced_char and CODING->composing is reset to
   COMPOSITION_NO.  */

#define CODING_ADD_COMPOSITION_COMPONENT(coding, component)		\
  do {									\
    coding->cmp_data->data[coding->cmp_data->used] = component;		\
    coding->cmp_data->used++;						\
    if (COMPOSITION_DATA_MAX_BUNCH_LENGTH				\
	== coding->cmp_data->used - coding->cmp_data_start)		\
      {									\
	CODING_ADD_COMPOSITION_END (coding, coding->produced_char);	\
	coding->composing = COMPOSITION_NO;				\
      }									\
  } while (0)
710
711
/* Get one byte from the data pointed to by SRC and increment SRC.
   If SRC is not less than SRC_END, yield -1 instead and leave SRC
   unchanged.  */

#define SAFE_ONE_MORE_BYTE() (src < src_end ? *src++ : -1)
716
717
/* Decode a character represented as a component of composition
   sequence of Emacs 20 style at SRC.  Set C to that character, store
   its multibyte form sequence at P, and set P to the end of that
   sequence.  If no valid character is found, set C to -1.

   C is also negative when the source is exhausted before the first
   byte can be read.  The caller must have `src', `src_end' and
   `coding' in scope.  The macro declares block-local variables
   `bytes' and `p0'.  */

#define DECODE_EMACS_MULE_COMPOSITION_CHAR(c, p)			\
  do {									\
    int bytes;								\
									\
    c  = SAFE_ONE_MORE_BYTE ();						\
    /* No byte left: leave the macro with C negative.  */		\
    if (c < 0)								\
      break;								\
    if (CHAR_HEAD_P (c))						\
      c = -1;								\
    else if (c == 0xA0)							\
      {									\
	/* ASCII is stored as 0xA0 followed by ASCII_CODE+0x80.  */	\
	c = SAFE_ONE_MORE_BYTE ();					\
	if (c < 0xA0)							\
	  c = -1;							\
	else								\
	  {								\
	    c -= 0x80;							\
	    *p++ = c;							\
	  }								\
      }									\
    else if (BASE_LEADING_CODE_P (c - 0x20))				\
      {									\
	unsigned char *p0 = p;						\
									\
	/* The leading code is stored with 0x20 added.  */		\
	c -= 0x20;							\
	*p++ = c;							\
	bytes = BYTES_BY_CHAR_HEAD (c);					\
	while (--bytes)							\
	  {								\
	    c = SAFE_ONE_MORE_BYTE ();					\
	    /* This `break' leaves only the inner loop.  */		\
	    if (c < 0)							\
	      break;							\
	    *p++ = c;							\
	  }								\
	if (UNIBYTE_STR_AS_MULTIBYTE_P (p0, p - p0, bytes)		\
	    || (coding->flags /* We are recovering a file.  */		\
		&& p0[0] == LEADING_CODE_8_BIT_CONTROL			\
		&& ! CHAR_HEAD_P (p0[1])))				\
	  c = STRING_CHAR (p0, bytes);					\
	else								\
	  c = -1;							\
      }									\
    else								\
      c = -1;								\
  } while (0)
768
769
/* Decode a composition rule represented as a component of composition
   sequence of Emacs 20 style at SRC.  Set C to the rule.  If no
   valid rule is found, set C to -1.

   A rule byte is 0xA0 plus a value in the range 0..80; that value
   divided by 9 gives the quotient GREF and the remainder NREF, which
   are stored in the caller's variables `gref' and `nref' before
   being combined with COMPOSITION_ENCODE_RULE.  */

#define DECODE_EMACS_MULE_COMPOSITION_RULE(c)		\
  do {							\
    c = SAFE_ONE_MORE_BYTE () - 0xA0;			\
    if (c >= 0 && c < 81)				\
      {							\
	gref = c / 9;					\
	nref = c % 9;					\
	c = COMPOSITION_ENCODE_RULE (gref, nref);	\
      }							\
    else						\
      c = -1;						\
  } while (0)
786
787
788 /* Decode composition sequence encoded by `emacs-mule' at the source
789 pointed by SRC. SRC_END is the end of source. Store information
790 of the composition in CODING->cmp_data.
791
792 For backward compatibility, decode also a composition sequence of
793 Emacs 20 style. In that case, the composition sequence contains
794 characters that should be extracted into a buffer or string. Store
795 those characters at *DESTINATION in multibyte form.
796
797 If we encounter an invalid byte sequence, return 0.
798 If we encounter an insufficient source or destination, or
799 insufficient space in CODING->cmp_data, return 1.
800 Otherwise, return consumed bytes in the source.
801
802 */
803 static INLINE int
804 decode_composition_emacs_mule (coding, src, src_end,
805 destination, dst_end, dst_bytes)
806 struct coding_system *coding;
807 const unsigned char *src, *src_end;
808 unsigned char **destination, *dst_end;
809 int dst_bytes;
810 {
811 unsigned char *dst = *destination;
812 int method, data_len, nchars;
813 const unsigned char *src_base = src++;
814 /* Store components of composition. */
815 int component[COMPOSITION_DATA_MAX_BUNCH_LENGTH];
816 int ncomponent;
817 /* Store multibyte form of characters to be composed. This is for
818 Emacs 20 style composition sequence. */
819 unsigned char buf[MAX_COMPOSITION_COMPONENTS * MAX_MULTIBYTE_LENGTH];
820 unsigned char *bufp = buf;
821 int c, i, gref, nref;
822
823 if (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH
824 >= COMPOSITION_DATA_SIZE)
825 {
826 coding->result = CODING_FINISH_INSUFFICIENT_CMP;
827 return -1;
828 }
829
830 ONE_MORE_BYTE (c);
831 if (c - 0xF0 >= COMPOSITION_RELATIVE
832 && c - 0xF0 <= COMPOSITION_WITH_RULE_ALTCHARS)
833 {
834 int with_rule;
835
836 method = c - 0xF0;
837 with_rule = (method == COMPOSITION_WITH_RULE
838 || method == COMPOSITION_WITH_RULE_ALTCHARS);
839 ONE_MORE_BYTE (c);
840 data_len = c - 0xA0;
841 if (data_len < 4
842 || src_base + data_len > src_end)
843 return 0;
844 ONE_MORE_BYTE (c);
845 nchars = c - 0xA0;
846 if (c < 1)
847 return 0;
848 for (ncomponent = 0; src < src_base + data_len; ncomponent++)
849 {
850 /* If it is longer than this, it can't be valid. */
851 if (ncomponent >= COMPOSITION_DATA_MAX_BUNCH_LENGTH)
852 return 0;
853
854 if (ncomponent % 2 && with_rule)
855 {
856 ONE_MORE_BYTE (gref);
857 gref -= 32;
858 ONE_MORE_BYTE (nref);
859 nref -= 32;
860 c = COMPOSITION_ENCODE_RULE (gref, nref);
861 }
862 else
863 {
864 int bytes;
865 if (UNIBYTE_STR_AS_MULTIBYTE_P (src, src_end - src, bytes)
866 || (coding->flags /* We are recovering a file. */
867 && src[0] == LEADING_CODE_8_BIT_CONTROL
868 && ! CHAR_HEAD_P (src[1])))
869 c = STRING_CHAR (src, bytes);
870 else
871 c = *src, bytes = 1;
872 src += bytes;
873 }
874 component[ncomponent] = c;
875 }
876 }
877 else if (c >= 0x80)
878 {
879 /* This may be an old Emacs 20 style format. See the comment at
880 the section 2 of this file. */
881 while (src < src_end && !CHAR_HEAD_P (*src)) src++;
882 if (src == src_end
883 && !(coding->mode & CODING_MODE_LAST_BLOCK))
884 goto label_end_of_loop;
885
886 src_end = src;
887 src = src_base + 1;
888 if (c < 0xC0)
889 {
890 method = COMPOSITION_RELATIVE;
891 for (ncomponent = 0; ncomponent < MAX_COMPOSITION_COMPONENTS;)
892 {
893 DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp);
894 if (c < 0)
895 break;
896 component[ncomponent++] = c;
897 }
898 if (ncomponent < 2)
899 return 0;
900 nchars = ncomponent;
901 }
902 else if (c == 0xFF)
903 {
904 method = COMPOSITION_WITH_RULE;
905 src++;
906 DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp);
907 if (c < 0)
908 return 0;
909 component[0] = c;
910 for (ncomponent = 1;
911 ncomponent < MAX_COMPOSITION_COMPONENTS * 2 - 1;)
912 {
913 DECODE_EMACS_MULE_COMPOSITION_RULE (c);
914 if (c < 0)
915 break;
916 component[ncomponent++] = c;
917 DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp);
918 if (c < 0)
919 break;
920 component[ncomponent++] = c;
921 }
922 if (ncomponent < 3)
923 return 0;
924 nchars = (ncomponent + 1) / 2;
925 }
926 else
927 return 0;
928 }
929 else
930 return 0;
931
932 if (buf == bufp || dst + (bufp - buf) <= (dst_bytes ? dst_end : src))
933 {
934 CODING_ADD_COMPOSITION_START (coding, coding->produced_char, method);
935 for (i = 0; i < ncomponent; i++)
936 CODING_ADD_COMPOSITION_COMPONENT (coding, component[i]);
937 CODING_ADD_COMPOSITION_END (coding, coding->produced_char + nchars);
938 if (buf < bufp)
939 {
940 unsigned char *p = buf;
941 EMIT_BYTES (p, bufp);
942 *destination += bufp - buf;
943 coding->produced_char += nchars;
944 }
945 return (src - src_base);
946 }
947 label_end_of_loop:
948 return -1;
949 }
950
/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".

   Decode SRC_BYTES bytes at SOURCE into DESTINATION.  Besides copying
   valid multibyte sequences, this converts end-of-line according to
   CODING->eol_type and, when CODING->cmp_data is non-null, hands
   sequences starting with 0x80 to decode_composition_emacs_mule.
   On return CODING->consumed, CODING->consumed_char, CODING->produced
   and CODING->produced_char are set; CODING->result is set only when
   decoding stops early.  */

static void
decode_coding_emacs_mule (coding, source, destination, src_bytes, dst_bytes)
     struct coding_system *coding;
     const unsigned char *source;
     unsigned char *destination;
     int src_bytes, dst_bytes;
{
  const unsigned char *src = source;
  const unsigned char *src_end = source + src_bytes;
  unsigned char *dst = destination;
  unsigned char *dst_end = destination + dst_bytes;
  /* SRC_BASE remembers the start position in source in each loop.
     The loop will be exited when there's not enough source code, or
     when there's not enough destination area to produce a
     character.  */
  const unsigned char *src_base;

  coding->produced_char = 0;
  while ((src_base = src) < src_end)
    {
      /* TMP holds the multibyte form built for a byte that cannot be
	 copied as is; P points at the BYTES bytes to copy to DST.  */
      unsigned char tmp[MAX_MULTIBYTE_LENGTH];
      const unsigned char *p;
      int bytes;

      if (*src == '\r')
	{
	  int c = *src++;

	  if (coding->eol_type == CODING_EOL_CR)
	    c = '\n';
	  else if (coding->eol_type == CODING_EOL_CRLF)
	    {
	      /* CR LF becomes a single newline; a CR followed by
		 anything else is kept and the following byte is left
		 for the next iteration.  */
	      ONE_MORE_BYTE (c);
	      if (c != '\n')
		{
		  src--;
		  c = '\r';
		}
	    }
	  /* NOTE(review): no destination-space check on this path.  */
	  *dst++ = c;
	  coding->produced_char++;
	  continue;
	}
      else if (*src == '\n')
	{
	  /* A bare LF does not match a CR or CRLF end-of-line type.  */
	  if ((coding->eol_type == CODING_EOL_CR
	       || coding->eol_type == CODING_EOL_CRLF)
	      && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
	    {
	      coding->result = CODING_FINISH_INCONSISTENT_EOL;
	      goto label_end_of_loop;
	    }
	  /* NOTE(review): no destination-space check on this path.  */
	  *dst++ = *src++;
	  coding->produced_char++;
	  continue;
	}
      else if (*src == 0x80 && coding->cmp_data)
	{
	  /* Start of composition data.  */
	  int consumed = decode_composition_emacs_mule (coding, src, src_end,
							&dst, dst_end,
							dst_bytes);
	  if (consumed < 0)
	    goto label_end_of_loop;
	  else if (consumed > 0)
	    {
	      src += consumed;
	      continue;
	    }
	  /* CONSUMED is zero: invalid composition sequence.  Emit
	     the 0x80 byte itself as a character.  */
	  bytes = CHAR_STRING (*src, tmp);
	  p = tmp;
	  src++;
	}
      else if (UNIBYTE_STR_AS_MULTIBYTE_P (src, src_end - src, bytes)
	       || (coding->flags /* We are recovering a file.  */
		   && src[0] == LEADING_CODE_8_BIT_CONTROL
		   && ! CHAR_HEAD_P (src[1])))
	{
	  /* Accepted multibyte sequence: copy it as is.  */
	  p = src;
	  src += bytes;
	}
      else
	{
	  int i, c;

	  bytes = BYTES_BY_CHAR_HEAD (*src);
	  src++;
	  /* Read the following bytes; if the source runs out here,
	     ONE_MORE_BYTE jumps to label_end_of_loop and the partial
	     sequence is left unconsumed.  */
	  for (i = 1; i < bytes; i++)
	    {
	      ONE_MORE_BYTE (c);
	      if (CHAR_HEAD_P (c))
		break;
	    }
	  if (i < bytes)
	    {
	      /* The sequence was cut short by another character
		 head: emit only the first byte, as a character, and
		 resume right after it.  */
	      bytes = CHAR_STRING (*src_base, tmp);
	      p = tmp;
	      src = src_base + 1;
	    }
	  else
	    {
	      p = src_base;
	    }
	}
      /* With DST_BYTES zero the destination overlaps the source, so
	 the limit is the current source position.  */
      if (dst + bytes >= (dst_bytes ? dst_end : src))
	{
	  coding->result = CODING_FINISH_INSUFFICIENT_DST;
	  break;
	}
      while (bytes--) *dst++ = *p++;
      coding->produced_char++;
    }
 label_end_of_loop:
  /* Only whole units up to SRC_BASE count as consumed.  */
  coding->consumed = coding->consumed_char = src_base - source;
  coding->produced = dst - destination;
}
1069
1070
/* Serialize the composition record at DATA as a byte sequence that
   starts with 0x80 and copy it to DST.  The sequence is
     0x80 METHOD COMPONENTS-BYTES COMPOSED-CHARS COMPONENT ...
   If the destination has no room, set CODING->result to
   CODING_FINISH_INSUFFICIENT_DST and jump to label_end_of_loop.
   Afterwards advance CODING->cmp_data_start past the record, moving
   CODING->cmp_data to the next block when the current one is used up
   and a next block exists.  */

#define ENCODE_COMPOSITION_EMACS_MULE(coding, data) \
  do { \
    unsigned char cmp_bytes[1024]; \
    unsigned char *cmp_tail = cmp_bytes + 4; \
    unsigned char *cmp_out; \
    int cmp_len = data[0]; \
    int cmp_idx; \
    \
    cmp_bytes[0] = 0x80; \
    cmp_bytes[1] = 0xF0 + data[3]; /* METHOD */ \
    cmp_bytes[3] = 0xA0 + (data[2] - data[1]); /* COMPOSED-CHARS */ \
    if (data[3] != COMPOSITION_WITH_RULE \
        && data[3] != COMPOSITION_WITH_RULE_ALTCHARS) \
      { \
        /* Components are plain characters.  */ \
        cmp_idx = 4; \
        while (cmp_idx < cmp_len) \
          { \
            cmp_tail += CHAR_STRING (data[cmp_idx], cmp_tail); \
            cmp_idx++; \
          } \
      } \
    else \
      { \
        /* Components alternate CHAR, RULE, CHAR, ...  */ \
        cmp_tail += CHAR_STRING (data[4], cmp_tail); \
        cmp_idx = 5; \
        while (cmp_idx < cmp_len) \
          { \
            int gref, nref; \
            COMPOSITION_DECODE_RULE (data[cmp_idx], gref, nref); \
            *cmp_tail++ = 0x20 + gref; \
            *cmp_tail++ = 0x20 + nref; \
            cmp_tail += CHAR_STRING (data[cmp_idx + 1], cmp_tail); \
            cmp_idx += 2; \
          } \
      } \
    cmp_bytes[2] = 0xA0 + (cmp_tail - cmp_bytes); /* COMPONENTS-BYTES */ \
    \
    if (dst + (cmp_tail - cmp_bytes) + 4 > (dst_bytes ? dst_end : src)) \
      { \
        coding->result = CODING_FINISH_INSUFFICIENT_DST; \
        goto label_end_of_loop; \
      } \
    for (cmp_out = cmp_bytes; cmp_out < cmp_tail; cmp_out++) \
      *dst++ = *cmp_out; \
    coding->cmp_data_start += data[0]; \
    if (coding->cmp_data_start == coding->cmp_data->used \
        && coding->cmp_data->next) \
      { \
        coding->cmp_data = coding->cmp_data->next; \
        coding->cmp_data_start = 0; \
      } \
  } while (0)
1120
1121
1122 static void encode_eol P_ ((struct coding_system *, const unsigned char *,
1123 unsigned char *, int, int));
1124
1125 static void
1126 encode_coding_emacs_mule (coding, source, destination, src_bytes, dst_bytes)
1127 struct coding_system *coding;
1128 const unsigned char *source;
1129 unsigned char *destination;
1130 int src_bytes, dst_bytes;
1131 {
1132 const unsigned char *src = source;
1133 const unsigned char *src_end = source + src_bytes;
1134 unsigned char *dst = destination;
1135 unsigned char *dst_end = destination + dst_bytes;
1136 const unsigned char *src_base;
1137 int c;
1138 int char_offset;
1139 int *data;
1140
1141 Lisp_Object translation_table;
1142
1143 translation_table = Qnil;
1144
1145 /* Optimization for the case that there's no composition. */
1146 if (!coding->cmp_data || coding->cmp_data->used == 0)
1147 {
1148 encode_eol (coding, source, destination, src_bytes, dst_bytes);
1149 return;
1150 }
1151
1152 char_offset = coding->cmp_data->char_offset;
1153 data = coding->cmp_data->data + coding->cmp_data_start;
1154 while (1)
1155 {
1156 src_base = src;
1157
1158 /* If SRC starts a composition, encode the information about the
1159 composition in advance. */
1160 if (coding->cmp_data_start < coding->cmp_data->used
1161 && char_offset + coding->consumed_char == data[1])
1162 {
1163 ENCODE_COMPOSITION_EMACS_MULE (coding, data);
1164 char_offset = coding->cmp_data->char_offset;
1165 data = coding->cmp_data->data + coding->cmp_data_start;
1166 }
1167
1168 ONE_MORE_CHAR (c);
1169 if (c == '\n' && (coding->eol_type == CODING_EOL_CRLF
1170 || coding->eol_type == CODING_EOL_CR))
1171 {
1172 if (coding->eol_type == CODING_EOL_CRLF)
1173 EMIT_TWO_BYTES ('\r', c);
1174 else
1175 EMIT_ONE_BYTE ('\r');
1176 }
1177 else if (SINGLE_BYTE_CHAR_P (c))
1178 {
1179 if (coding->flags && ! ASCII_BYTE_P (c))
1180 {
1181 /* As we are auto saving, retain the multibyte form for
1182 8-bit chars. */
1183 unsigned char buf[MAX_MULTIBYTE_LENGTH];
1184 int bytes = CHAR_STRING (c, buf);
1185
1186 if (bytes == 1)
1187 EMIT_ONE_BYTE (buf[0]);
1188 else
1189 EMIT_TWO_BYTES (buf[0], buf[1]);
1190 }
1191 else
1192 EMIT_ONE_BYTE (c);
1193 }
1194 else
1195 EMIT_BYTES (src_base, src);
1196 coding->consumed_char++;
1197 }
1198 label_end_of_loop:
1199 coding->consumed = src_base - source;
1200 coding->produced = coding->produced_char = dst - destination;
1201 return;
1202 }
1203
1204 \f
1205 /*** 3. ISO2022 handlers ***/
1206
1207 /* The following note describes the coding system ISO2022 briefly.
1208 Since the intention of this note is to help understand the
1209 functions in this file, some parts are NOT ACCURATE or are OVERLY
1210 SIMPLIFIED. For thorough understanding, please refer to the
1211 original document of ISO2022. This is equivalent to the standard
1212 ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
1213
1214 ISO2022 provides many mechanisms to encode several character sets
1215 in 7-bit and 8-bit environments. For 7-bit environments, all text
1216 is encoded using bytes less than 128. This may make the encoded
1217 text a little bit longer, but the text passes more easily through
1218 several types of gateway, some of which strip off the MSB (Most
1219 Significant Bit).
1220
1221 There are two kinds of character sets: control character sets and
1222 graphic character sets. The former contain control characters such
1223 as `newline' and `escape' to provide control functions (control
1224 functions are also provided by escape sequences). The latter
1225 contain graphic characters such as 'A' and '-'. Emacs recognizes
1226 two control character sets and many graphic character sets.
1227
1228 Graphic character sets are classified into one of the following
1229 four classes, according to the number of bytes (DIMENSION) and
1230 number of characters in one dimension (CHARS) of the set:
1231 - DIMENSION1_CHARS94
1232 - DIMENSION1_CHARS96
1233 - DIMENSION2_CHARS94
1234 - DIMENSION2_CHARS96
1235
1236 In addition, each character set is assigned an identification tag,
1237 unique for each set, called the "final character" (denoted as <F>
1238 hereafter). The <F> of each character set is decided by ECMA(*)
1239 when it is registered in ISO. The code range of <F> is 0x30..0x7F
1240 (0x30..0x3F are for private use only).
1241
1242 Note (*): ECMA = European Computer Manufacturers Association
1243
1244 Here are examples of graphic character sets [NAME(<F>)]:
1245 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
1246 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
1247 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
1248 o DIMENSION2_CHARS96 -- none for the moment
1249
1250 A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
1251 C0 [0x00..0x1F] -- control character plane 0
1252 GL [0x20..0x7F] -- graphic character plane 0
1253 C1 [0x80..0x9F] -- control character plane 1
1254 GR [0xA0..0xFF] -- graphic character plane 1
1255
1256 A control character set is directly designated and invoked to C0 or
1257 C1 by an escape sequence. The most common case is that:
1258 - ISO646's control character set is designated/invoked to C0, and
1259 - ISO6429's control character set is designated/invoked to C1,
1260 and usually these designations/invocations are omitted in encoded
1261 text. In a 7-bit environment, only C0 can be used, and a control
1262 character for C1 is encoded by an appropriate escape sequence to
1263 fit into the environment. All control characters for C1 are
1264 defined to have corresponding escape sequences.
1265
1266 A graphic character set is at first designated to one of four
1267 graphic registers (G0 through G3), then these graphic registers are
1268 invoked to GL or GR. These designations and invocations can be
1269 done independently. The most common case is that G0 is invoked to
1270 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
1271 these invocations and designations are omitted in encoded text.
1272 In a 7-bit environment, only GL can be used.
1273
1274 When a graphic character set of CHARS94 is invoked to GL, codes
1275 0x20 and 0x7F of the GL area work as control characters SPACE and
1276 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
1277 be used.
1278
1279 There are two ways of invocation: locking-shift and single-shift.
1280 With locking-shift, the invocation lasts until the next different
1281 invocation, whereas with single-shift, the invocation affects the
1282 following character only and doesn't affect the locking-shift
1283 state. Invocations are done by the following control characters or
1284 escape sequences:
1285
1286 ----------------------------------------------------------------------
1287 abbrev function cntrl escape seq description
1288 ----------------------------------------------------------------------
1289 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
1290 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
1291 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
1292 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
1293 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
1294 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
1295 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
1296 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
1297 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
1298 ----------------------------------------------------------------------
1299 (*) These are not used by any known coding system.
1300
1301 Control characters for these functions are defined by macros
1302 ISO_CODE_XXX in `coding.h'.
1303
1304 Designations are done by the following escape sequences:
1305 ----------------------------------------------------------------------
1306 escape sequence description
1307 ----------------------------------------------------------------------
1308 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
1309 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
1310 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
1311 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
1312 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
1313 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
1314 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
1315 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
1316 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
1317 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
1318 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
1319 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
1320 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
1321 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
1322 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
1323 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
1324 ----------------------------------------------------------------------
1325
1326 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
1327 of dimension 1, chars 94, and final character <F>, etc...
1328
1329 Note (*): Although these designations are not allowed in ISO2022,
1330 Emacs accepts them on decoding, and produces them on encoding
1331 CHARS96 character sets in a coding system which is characterized as
1332 7-bit environment, non-locking-shift, and non-single-shift.
1333
1334 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
1335 '(' can be omitted. We refer to this as "short-form" hereafter.
1336
1337 Now you may notice that there are a lot of ways of encoding the
1338 same multilingual text in ISO2022. Actually, there exist many
1339 coding systems such as Compound Text (used in X11's inter client
1340 communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
1341 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
1342 localized platforms), and all of these are variants of ISO2022.
1343
1344 In addition to the above, Emacs handles two more kinds of escape
1345 sequences: ISO6429's direction specification and Emacs' private
1346 sequence for specifying character composition.
1347
1348 ISO6429's direction specification takes the following form:
1349 o CSI ']' -- end of the current direction
1350 o CSI '0' ']' -- end of the current direction
1351 o CSI '1' ']' -- start of left-to-right text
1352 o CSI '2' ']' -- start of right-to-left text
1353 The control character CSI (0x9B: control sequence introducer) is
1354 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
1355
1356 Character composition specification takes the following form:
1357 o ESC '0' -- start relative composition
1358 o ESC '1' -- end composition
1359 o ESC '2' -- start rule-base composition (*)
1360 o ESC '3' -- start relative composition with alternate chars (**)
1361 o ESC '4' -- start rule-base composition with alternate chars (**)
1362 Since these are not standard escape sequences of any ISO standard,
1363 the use of them with these meanings is restricted to Emacs only.
1364
1365 (*) This form is used only in Emacs 20.5 and older versions,
1366 but the newer versions can safely decode it.
1367 (**) This form is used only in Emacs 21.1 and newer versions,
1368 and the older versions can't decode it.
1369
1370 Here's a list of example usages of these composition escape
1371 sequences (categorized by `enum composition_method').
1372
1373 COMPOSITION_RELATIVE:
1374 ESC 0 CHAR [ CHAR ] ESC 1
1375 COMPOSITION_WITH_RULE:
1376 ESC 2 CHAR [ RULE CHAR ] ESC 1
1377 COMPOSITION_WITH_ALTCHARS:
1378 ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
1379 COMPOSITION_WITH_RULE_ALTCHARS:
1380 ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
1381
1382 enum iso_code_class_type iso_code_class[256];
1383
1384 #define CHARSET_OK(idx, charset, c) \
1385 (coding_system_table[idx] \
1386 && (charset == CHARSET_ASCII \
1387 || (safe_chars = coding_safe_chars (coding_system_table[idx]->symbol), \
1388 CODING_SAFE_CHAR_P (safe_chars, c))) \
1389 && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding_system_table[idx], \
1390 charset) \
1391 != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
1392
1393 #define SHIFT_OUT_OK(idx) \
1394 (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding_system_table[idx], 1) >= 0)
1395
1396 #define COMPOSITION_OK(idx) \
1397 (coding_system_table[idx]->composing != COMPOSITION_DISABLED)
1398
1399 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1400 Check if a text is encoded in ISO2022. If it is, return an
1401 integer in which appropriate flag bits any of:
1402 CODING_CATEGORY_MASK_ISO_7
1403 CODING_CATEGORY_MASK_ISO_7_TIGHT
1404 CODING_CATEGORY_MASK_ISO_8_1
1405 CODING_CATEGORY_MASK_ISO_8_2
1406 CODING_CATEGORY_MASK_ISO_7_ELSE
1407 CODING_CATEGORY_MASK_ISO_8_ELSE
1408 are set. If a code which should never appear in ISO2022 is found,
1409 returns 0.
1410
1411 If *latin_extra_code_state is zero and Latin extra codes are found,
1412 set *latin_extra_code_state to 1 and return 0. If it is nonzero,
1413 accept Latin extra codes. */
1414
1415 static int
1416 detect_coding_iso2022 (src, src_end, multibytep, latin_extra_code_state)
1417 unsigned char *src, *src_end;
1418 int multibytep;
1419 int *latin_extra_code_state;
1420 {
1421 int mask = CODING_CATEGORY_MASK_ISO;
1422 int mask_found = 0;
1423 int reg[4], shift_out = 0, single_shifting = 0;
1424 int c, c1, charset;
1425 /* Dummy for ONE_MORE_BYTE. */
1426 struct coding_system dummy_coding;
1427 struct coding_system *coding = &dummy_coding;
1428 Lisp_Object safe_chars;
1429
1430 reg[0] = CHARSET_ASCII, reg[1] = reg[2] = reg[3] = -1;
1431 while (mask)
1432 {
1433 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, mask & mask_found);
1434 retry:
1435 switch (c)
1436 {
1437 case ISO_CODE_ESC:
1438 if (inhibit_iso_escape_detection)
1439 break;
1440 single_shifting = 0;
1441 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, mask & mask_found);
1442 if (c >= '(' && c <= '/')
1443 {
1444 /* Designation sequence for a charset of dimension 1. */
1445 ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep, mask & mask_found);
1446 if (c1 < ' ' || c1 >= 0x80
1447 || (charset = iso_charset_table[0][c >= ','][c1]) < 0)
1448 /* Invalid designation sequence. Just ignore. */
1449 break;
1450 reg[(c - '(') % 4] = charset;
1451 }
1452 else if (c == '$')
1453 {
1454 /* Designation sequence for a charset of dimension 2. */
1455 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, mask & mask_found);
1456 if (c >= '@' && c <= 'B')
1457 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
1458 reg[0] = charset = iso_charset_table[1][0][c];
1459 else if (c >= '(' && c <= '/')
1460 {
1461 ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep,
1462 mask & mask_found);
1463 if (c1 < ' ' || c1 >= 0x80
1464 || (charset = iso_charset_table[1][c >= ','][c1]) < 0)
1465 /* Invalid designation sequence. Just ignore. */
1466 break;
1467 reg[(c - '(') % 4] = charset;
1468 }
1469 else
1470 /* Invalid designation sequence. Just ignore. */
1471 break;
1472 }
1473 else if (c == 'N' || c == 'O')
1474 {
1475 /* ESC <Fe> for SS2 or SS3. */
1476 mask &= CODING_CATEGORY_MASK_ISO_7_ELSE;
1477 break;
1478 }
1479 else if (c >= '0' && c <= '4')
1480 {
1481 /* ESC <Fp> for start/end composition. */
1482 if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_7))
1483 mask_found |= CODING_CATEGORY_MASK_ISO_7;
1484 else
1485 mask &= ~CODING_CATEGORY_MASK_ISO_7;
1486 if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT))
1487 mask_found |= CODING_CATEGORY_MASK_ISO_7_TIGHT;
1488 else
1489 mask &= ~CODING_CATEGORY_MASK_ISO_7_TIGHT;
1490 if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_8_1))
1491 mask_found |= CODING_CATEGORY_MASK_ISO_8_1;
1492 else
1493 mask &= ~CODING_CATEGORY_MASK_ISO_8_1;
1494 if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_8_2))
1495 mask_found |= CODING_CATEGORY_MASK_ISO_8_2;
1496 else
1497 mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
1498 if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_7_ELSE))
1499 mask_found |= CODING_CATEGORY_MASK_ISO_7_ELSE;
1500 else
1501 mask &= ~CODING_CATEGORY_MASK_ISO_7_ELSE;
1502 if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_8_ELSE))
1503 mask_found |= CODING_CATEGORY_MASK_ISO_8_ELSE;
1504 else
1505 mask &= ~CODING_CATEGORY_MASK_ISO_8_ELSE;
1506 break;
1507 }
1508 else
1509 /* Invalid escape sequence. Just ignore. */
1510 break;
1511
1512 /* We found a valid designation sequence for CHARSET. */
1513 mask &= ~CODING_CATEGORY_MASK_ISO_8BIT;
1514 c = MAKE_CHAR (charset, 0, 0);
1515 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7, charset, c))
1516 mask_found |= CODING_CATEGORY_MASK_ISO_7;
1517 else
1518 mask &= ~CODING_CATEGORY_MASK_ISO_7;
1519 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT, charset, c))
1520 mask_found |= CODING_CATEGORY_MASK_ISO_7_TIGHT;
1521 else
1522 mask &= ~CODING_CATEGORY_MASK_ISO_7_TIGHT;
1523 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_ELSE, charset, c))
1524 mask_found |= CODING_CATEGORY_MASK_ISO_7_ELSE;
1525 else
1526 mask &= ~CODING_CATEGORY_MASK_ISO_7_ELSE;
1527 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_8_ELSE, charset, c))
1528 mask_found |= CODING_CATEGORY_MASK_ISO_8_ELSE;
1529 else
1530 mask &= ~CODING_CATEGORY_MASK_ISO_8_ELSE;
1531 break;
1532
1533 case ISO_CODE_SO:
1534 if (inhibit_iso_escape_detection)
1535 break;
1536 single_shifting = 0;
1537 if (shift_out == 0
1538 && (reg[1] >= 0
1539 || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_7_ELSE)
1540 || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_8_ELSE)))
1541 {
1542 /* Locking shift out. */
1543 mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
1544 mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
1545 }
1546 break;
1547
1548 case ISO_CODE_SI:
1549 if (inhibit_iso_escape_detection)
1550 break;
1551 single_shifting = 0;
1552 if (shift_out == 1)
1553 {
1554 /* Locking shift in. */
1555 mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
1556 mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
1557 }
1558 break;
1559
1560 case ISO_CODE_CSI:
1561 single_shifting = 0;
1562 case ISO_CODE_SS2:
1563 case ISO_CODE_SS3:
1564 {
1565 int newmask = CODING_CATEGORY_MASK_ISO_8_ELSE;
1566
1567 if (inhibit_iso_escape_detection)
1568 break;
1569 if (c != ISO_CODE_CSI)
1570 {
1571 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1572 & CODING_FLAG_ISO_SINGLE_SHIFT)
1573 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
1574 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1575 & CODING_FLAG_ISO_SINGLE_SHIFT)
1576 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
1577 single_shifting = 1;
1578 }
1579 if (VECTORP (Vlatin_extra_code_table)
1580 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
1581 {
1582 if (! *latin_extra_code_state)
1583 {
1584 *latin_extra_code_state = 1;
1585 return 0;
1586 }
1587 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1588 & CODING_FLAG_ISO_LATIN_EXTRA)
1589 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
1590 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1591 & CODING_FLAG_ISO_LATIN_EXTRA)
1592 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
1593 }
1594 mask &= newmask;
1595 mask_found |= newmask;
1596 }
1597 break;
1598
1599 default:
1600 if (c < 0x80)
1601 {
1602 single_shifting = 0;
1603 break;
1604 }
1605 else if (c < 0xA0)
1606 {
1607 single_shifting = 0;
1608 if (VECTORP (Vlatin_extra_code_table)
1609 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
1610 {
1611 int newmask = 0;
1612
1613 if (! *latin_extra_code_state)
1614 {
1615 *latin_extra_code_state = 1;
1616 return 0;
1617 }
1618 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1619 & CODING_FLAG_ISO_LATIN_EXTRA)
1620 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
1621 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1622 & CODING_FLAG_ISO_LATIN_EXTRA)
1623 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
1624 mask &= newmask;
1625 mask_found |= newmask;
1626 }
1627 else
1628 return 0;
1629 }
1630 else
1631 {
1632 mask &= ~(CODING_CATEGORY_MASK_ISO_7BIT
1633 | CODING_CATEGORY_MASK_ISO_7_ELSE);
1634 mask_found |= CODING_CATEGORY_MASK_ISO_8_1;
1635 /* Check the length of succeeding codes of the range
1636 0xA0..0FF. If the byte length is odd, we exclude
1637 CODING_CATEGORY_MASK_ISO_8_2. We can check this only
1638 when we are not single shifting. */
1639 if (!single_shifting
1640 && mask & CODING_CATEGORY_MASK_ISO_8_2)
1641 {
1642 int i = 1;
1643
1644 c = -1;
1645 while (src < src_end)
1646 {
1647 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep,
1648 mask & mask_found);
1649 if (c < 0xA0)
1650 break;
1651 i++;
1652 }
1653
1654 if (i & 1 && src < src_end)
1655 mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
1656 else
1657 mask_found |= CODING_CATEGORY_MASK_ISO_8_2;
1658 if (c >= 0)
1659 /* This means that we have read one extra byte. */
1660 goto retry;
1661 }
1662 }
1663 break;
1664 }
1665 }
1666 return (mask & mask_found);
1667 }
1668
/* Build the character that belongs to CHARSET and has position codes
   C1 and C2.  When the variable `translation_table' is non-nil the
   character is produced by translate_char through that table;
   otherwise it is composed directly with MAKE_CHAR.  */

#define DECODE_ISO_CHARACTER(charset, c1, c2) \
  (! NILP (translation_table) \
   ? translate_char (translation_table, -1, charset, c1, c2) \
   : MAKE_CHAR (charset, c1, c2))
1678
/* Set designation state into CODING.

   Look up the charset identified by DIMENSION, CHARS and FINAL_CHAR
   and designate it to graphic register REG of CODING.  Jump to
   label_invalid_code when FINAL_CHAR is out of range, when the
   charset is unknown, when it is neither requested for REG nor safe
   for this coding system, or when the sequence must be kept as is
   (see below).  Uses the variables `coding' and `safe_chars' of the
   expansion site.  */
#define DECODE_DESIGNATION(reg, dimension, chars, final_char) \
  do { \
    int charset, c = 0; \
    \
    if (final_char < '0' || final_char >= 128) \
      goto label_invalid_code; \
    charset = ISO_CHARSET_TABLE (make_number (dimension), \
                                 make_number (chars), \
                                 make_number (final_char)); \
    /* Build the representative character only for a valid charset; \
       MAKE_CHAR is never given a negative charset.  */ \
    if (charset >= 0) \
      c = MAKE_CHAR (charset, 0, 0); \
    if (charset >= 0 \
        && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) == reg \
            || CODING_SAFE_CHAR_P (safe_chars, c))) \
      { \
        if (coding->spec.iso2022.last_invalid_designation_register == 0 \
            && reg == 0 \
            && charset == CHARSET_ASCII) \
          { \
            /* We should insert this designation sequence as is so \
               that it is surely written back to a file.  */ \
            coding->spec.iso2022.last_invalid_designation_register = -1; \
            goto label_invalid_code; \
          } \
        coding->spec.iso2022.last_invalid_designation_register = -1; \
        if ((coding->mode & CODING_MODE_DIRECTION) \
            && CHARSET_REVERSE_CHARSET (charset) >= 0) \
          charset = CHARSET_REVERSE_CHARSET (charset); \
        CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset; \
      } \
    else \
      { \
        /* Remember which register received the rejected designation.  */ \
        coding->spec.iso2022.last_invalid_designation_register = reg; \
        goto label_invalid_code; \
      } \
  } while (0)
1715
1716 /* Allocate a memory block for storing information about compositions.
1717 The block is chained to the already allocated blocks. */
1718
1719 void
1720 coding_allocate_composition_data (coding, char_offset)
1721 struct coding_system *coding;
1722 int char_offset;
1723 {
1724 struct composition_data *cmp_data
1725 = (struct composition_data *) xmalloc (sizeof *cmp_data);
1726
1727 cmp_data->char_offset = char_offset;
1728 cmp_data->used = 0;
1729 cmp_data->prev = coding->cmp_data;
1730 cmp_data->next = NULL;
1731 if (coding->cmp_data)
1732 coding->cmp_data->next = cmp_data;
1733 coding->cmp_data = cmp_data;
1734 coding->cmp_data_start = 0;
1735 coding->composing = COMPOSITION_NO;
1736 }
1737
/* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
   ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
   ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
   ESC 3 : altchar composition :  ESC 3 ALT ... ESC 0 CHAR ... ESC 1
   ESC 4 : alt&rule composition : ESC 4 ALT RULE .. ALT ESC 0 CHAR ... ESC 1

   C1 is the byte that followed ESC.  When composition is disabled the
   two bytes are copied to DST unchanged.  */

#define DECODE_COMPOSITION_START(c1) \
  do { \
    if (coding->composing == COMPOSITION_DISABLED) \
      { \
        *dst++ = ISO_CODE_ESC; \
        *dst++ = c1 & 0x7f; \
        coding->produced_char += 2; \
      } \
    else if (COMPOSING_P (coding)) \
      { \
        /* A composition is already in progress.  For the two \
           ALTCHARS methods this escape sequence introduces the \
           actual characters stored in a buffer, which are handled \
           as a relative composition.  */ \
        if (coding->composing == COMPOSITION_WITH_ALTCHARS \
            || coding->composing == COMPOSITION_WITH_RULE_ALTCHARS) \
          { \
            coding->composing = COMPOSITION_RELATIVE; \
            coding->composition_rule_follows = 0; \
          } \
      } \
    else \
      { \
        /* This is surely the start of a composition.  We must be \
           sure that coding->cmp_data has enough space to store the \
           information about the composition.  If not, terminate the \
           current decoding loop, allocate one more memory block for \
           coding->cmp_data in the caller, then start the decoding \
           loop again.  We can't allocate memory here directly \
           because it may cause buffer/string relocation.  */ \
        if (!coding->cmp_data \
            || (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH \
                >= COMPOSITION_DATA_SIZE)) \
          { \
            coding->result = CODING_FINISH_INSUFFICIENT_CMP; \
            goto label_end_of_loop; \
          } \
        if (c1 == '0') \
          coding->composing = COMPOSITION_RELATIVE; \
        else if (c1 == '2') \
          coding->composing = COMPOSITION_WITH_RULE; \
        else if (c1 == '3') \
          coding->composing = COMPOSITION_WITH_ALTCHARS; \
        else \
          coding->composing = COMPOSITION_WITH_RULE_ALTCHARS; \
        CODING_ADD_COMPOSITION_START (coding, coding->produced_char, \
                                      coding->composing); \
        coding->composition_rule_follows = 0; \
      } \
  } while (0)
1790
/* Handle composition end sequence ESC 1.  While a composition is in
   progress, record its end at the current character count and leave
   composing state.  Otherwise copy ESC and C1 to DST unchanged.  */

#define DECODE_COMPOSITION_END(c1) \
  do { \
    if (COMPOSING_P (coding)) \
      { \
        CODING_ADD_COMPOSITION_END (coding, coding->produced_char); \
        coding->composing = COMPOSITION_NO; \
      } \
    else \
      { \
        *dst++ = ISO_CODE_ESC; \
        *dst++ = c1; \
        coding->produced_char += 2; \
      } \
  } while (0)
1807
/* Decode a composition rule from the byte C1 (and maybe one more byte
   from SRC, read into the variable `c2' of the expansion site) and
   store one encoded composition rule in coding->cmp_data.  C1 is
   modified: 32 is subtracted from it.  A value outside both formats
   stores the rule 0.  */

#define DECODE_COMPOSITION_RULE(c1) \
  do { \
    int encoded_rule = 0; \
    (c1) = (c1) - 32; \
    if (c1 >= 81) \
      { \
        if (c1 < 93) /* new format (after ver.21) */ \
          { \
            ONE_MORE_BYTE (c2); \
            encoded_rule = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32); \
          } \
      } \
    else /* old format (before ver.21) */ \
      { \
        int gref = (c1) / 9; \
        int nref = (c1) % 9; \
        gref = (gref == 4) ? 10 : gref; \
        nref = (nref == 4) ? 10 : nref; \
        encoded_rule = COMPOSITION_ENCODE_RULE (gref, nref); \
      } \
    CODING_ADD_COMPOSITION_COMPONENT (coding, encoded_rule); \
    coding->composition_rule_follows = 0; \
  } while (0)
1832
1833
1834 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
1835
1836 static void
1837 decode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
1838 struct coding_system *coding;
1839 const unsigned char *source;
1840 unsigned char *destination;
1841 int src_bytes, dst_bytes;
1842 {
1843 const unsigned char *src = source;
1844 const unsigned char *src_end = source + src_bytes;
1845 unsigned char *dst = destination;
1846 unsigned char *dst_end = destination + dst_bytes;
1847 /* Charsets invoked to graphic plane 0 and 1 respectively. */
1848 int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1849 int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1850 /* SRC_BASE remembers the start position in source in each loop.
1851 The loop will be exited when there's not enough source code
1852 (within macro ONE_MORE_BYTE), or when there's not enough
1853 destination area to produce a character (within macro
1854 EMIT_CHAR). */
1855 const unsigned char *src_base;
1856 int c, charset;
1857 Lisp_Object translation_table;
1858 Lisp_Object safe_chars;
1859
1860 safe_chars = coding_safe_chars (coding->symbol);
1861
1862 if (NILP (Venable_character_translation))
1863 translation_table = Qnil;
1864 else
1865 {
1866 translation_table = coding->translation_table_for_decode;
1867 if (NILP (translation_table))
1868 translation_table = Vstandard_translation_table_for_decode;
1869 }
1870
1871 coding->result = CODING_FINISH_NORMAL;
1872
1873 while (1)
1874 {
1875 int c1, c2 = 0;
1876
1877 src_base = src;
1878 ONE_MORE_BYTE (c1);
1879
1880 /* We produce no character or one character. */
1881 switch (iso_code_class [c1])
1882 {
1883 case ISO_0x20_or_0x7F:
1884 if (COMPOSING_P (coding) && coding->composition_rule_follows)
1885 {
1886 DECODE_COMPOSITION_RULE (c1);
1887 continue;
1888 }
1889 if (charset0 < 0 || CHARSET_CHARS (charset0) == 94)
1890 {
1891 /* This is SPACE or DEL. */
1892 charset = CHARSET_ASCII;
1893 break;
1894 }
1895 /* This is a graphic character, we fall down ... */
1896
1897 case ISO_graphic_plane_0:
1898 if (COMPOSING_P (coding) && coding->composition_rule_follows)
1899 {
1900 DECODE_COMPOSITION_RULE (c1);
1901 continue;
1902 }
1903 charset = charset0;
1904 break;
1905
1906 case ISO_0xA0_or_0xFF:
1907 if (charset1 < 0 || CHARSET_CHARS (charset1) == 94
1908 || coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
1909 goto label_invalid_code;
1910 /* This is a graphic character, we fall down ... */
1911
1912 case ISO_graphic_plane_1:
1913 if (charset1 < 0)
1914 goto label_invalid_code;
1915 charset = charset1;
1916 break;
1917
1918 case ISO_control_0:
1919 if (COMPOSING_P (coding))
1920 DECODE_COMPOSITION_END ('1');
1921
1922 /* All ISO2022 control characters in this class have the
1923 same representation in Emacs internal format. */
1924 if (c1 == '\n'
1925 && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1926 && (coding->eol_type == CODING_EOL_CR
1927 || coding->eol_type == CODING_EOL_CRLF))
1928 {
1929 coding->result = CODING_FINISH_INCONSISTENT_EOL;
1930 goto label_end_of_loop;
1931 }
1932 charset = CHARSET_ASCII;
1933 break;
1934
1935 case ISO_control_1:
1936 if (COMPOSING_P (coding))
1937 DECODE_COMPOSITION_END ('1');
1938 goto label_invalid_code;
1939
1940 case ISO_carriage_return:
1941 if (COMPOSING_P (coding))
1942 DECODE_COMPOSITION_END ('1');
1943
1944 if (coding->eol_type == CODING_EOL_CR)
1945 c1 = '\n';
1946 else if (coding->eol_type == CODING_EOL_CRLF)
1947 {
1948 ONE_MORE_BYTE (c1);
1949 if (c1 != ISO_CODE_LF)
1950 {
1951 src--;
1952 c1 = '\r';
1953 }
1954 }
1955 charset = CHARSET_ASCII;
1956 break;
1957
1958 case ISO_shift_out:
1959 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1960 || CODING_SPEC_ISO_DESIGNATION (coding, 1) < 0)
1961 goto label_invalid_code;
1962 CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
1963 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1964 continue;
1965
1966 case ISO_shift_in:
1967 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
1968 goto label_invalid_code;
1969 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
1970 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1971 continue;
1972
1973 case ISO_single_shift_2_7:
1974 case ISO_single_shift_2:
1975 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1976 goto label_invalid_code;
1977 /* SS2 is handled as an escape sequence of ESC 'N' */
1978 c1 = 'N';
1979 goto label_escape_sequence;
1980
1981 case ISO_single_shift_3:
1982 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1983 goto label_invalid_code;
1984 /* SS3 is handled as an escape sequence of ESC 'O' */
1985 c1 = 'O';
1986 goto label_escape_sequence;
1987
1988 case ISO_control_sequence_introducer:
1989 /* CSI is handled as an escape sequence of ESC '[' ... */
1990 c1 = '[';
1991 goto label_escape_sequence;
1992
1993 case ISO_escape:
1994 ONE_MORE_BYTE (c1);
1995 label_escape_sequence:
1996 /* Escape sequences handled by Emacs are invocation,
1997 designation, direction specification, and character
1998 composition specification. */
1999 switch (c1)
2000 {
2001 case '&': /* revision of following character set */
2002 ONE_MORE_BYTE (c1);
2003 if (!(c1 >= '@' && c1 <= '~'))
2004 goto label_invalid_code;
2005 ONE_MORE_BYTE (c1);
2006 if (c1 != ISO_CODE_ESC)
2007 goto label_invalid_code;
2008 ONE_MORE_BYTE (c1);
2009 goto label_escape_sequence;
2010
2011 case '$': /* designation of 2-byte character set */
2012 if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
2013 goto label_invalid_code;
2014 ONE_MORE_BYTE (c1);
2015 if (c1 >= '@' && c1 <= 'B')
2016 { /* designation of JISX0208.1978, GB2312.1980,
2017 or JISX0208.1980 */
2018 DECODE_DESIGNATION (0, 2, 94, c1);
2019 }
2020 else if (c1 >= 0x28 && c1 <= 0x2B)
2021 { /* designation of DIMENSION2_CHARS94 character set */
2022 ONE_MORE_BYTE (c2);
2023 DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
2024 }
2025 else if (c1 >= 0x2C && c1 <= 0x2F)
2026 { /* designation of DIMENSION2_CHARS96 character set */
2027 ONE_MORE_BYTE (c2);
2028 DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
2029 }
2030 else
2031 goto label_invalid_code;
2032 /* We must update these variables now. */
2033 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
2034 charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
2035 continue;
2036
2037 case 'n': /* invocation of locking-shift-2 */
2038 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
2039 || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
2040 goto label_invalid_code;
2041 CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
2042 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
2043 continue;
2044
2045 case 'o': /* invocation of locking-shift-3 */
2046 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
2047 || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
2048 goto label_invalid_code;
2049 CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
2050 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
2051 continue;
2052
2053 case 'N': /* invocation of single-shift-2 */
2054 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2055 || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
2056 goto label_invalid_code;
2057 charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
2058 ONE_MORE_BYTE (c1);
2059 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
2060 goto label_invalid_code;
2061 break;
2062
2063 case 'O': /* invocation of single-shift-3 */
2064 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2065 || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
2066 goto label_invalid_code;
2067 charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
2068 ONE_MORE_BYTE (c1);
2069 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
2070 goto label_invalid_code;
2071 break;
2072
2073 case '0': case '2': case '3': case '4': /* start composition */
2074 DECODE_COMPOSITION_START (c1);
2075 continue;
2076
2077 case '1': /* end composition */
2078 DECODE_COMPOSITION_END (c1);
2079 continue;
2080
2081 case '[': /* specification of direction */
2082 if (coding->flags & CODING_FLAG_ISO_NO_DIRECTION)
2083 goto label_invalid_code;
2084 /* For the moment, nested direction is not supported.
2085 So, `coding->mode & CODING_MODE_DIRECTION' zero means
2086 left-to-right, and nonzero means right-to-left. */
2087 ONE_MORE_BYTE (c1);
2088 switch (c1)
2089 {
2090 case ']': /* end of the current direction */
2091 coding->mode &= ~CODING_MODE_DIRECTION;
2092
2093 case '0': /* end of the current direction */
2094 case '1': /* start of left-to-right direction */
2095 ONE_MORE_BYTE (c1);
2096 if (c1 == ']')
2097 coding->mode &= ~CODING_MODE_DIRECTION;
2098 else
2099 goto label_invalid_code;
2100 break;
2101
2102 case '2': /* start of right-to-left direction */
2103 ONE_MORE_BYTE (c1);
2104 if (c1 == ']')
2105 coding->mode |= CODING_MODE_DIRECTION;
2106 else
2107 goto label_invalid_code;
2108 break;
2109
2110 default:
2111 goto label_invalid_code;
2112 }
2113 continue;
2114
2115 case '%':
2116 if (COMPOSING_P (coding))
2117 DECODE_COMPOSITION_END ('1');
2118 ONE_MORE_BYTE (c1);
2119 if (c1 == '/')
2120 {
2121 /* CTEXT extended segment:
2122 ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
2123 We keep these bytes as is for the moment.
2124 They may be decoded by post-read-conversion. */
2125 int dim, M, L;
2126 int size, required;
2127 int produced_chars;
2128
2129 ONE_MORE_BYTE (dim);
2130 ONE_MORE_BYTE (M);
2131 ONE_MORE_BYTE (L);
2132 size = ((M - 128) * 128) + (L - 128);
2133 required = 8 + size * 2;
2134 if (dst + required > (dst_bytes ? dst_end : src))
2135 goto label_end_of_loop;
2136 *dst++ = ISO_CODE_ESC;
2137 *dst++ = '%';
2138 *dst++ = '/';
2139 *dst++ = dim;
2140 produced_chars = 4;
2141 dst += CHAR_STRING (M, dst), produced_chars++;
2142 dst += CHAR_STRING (L, dst), produced_chars++;
2143 while (size-- > 0)
2144 {
2145 ONE_MORE_BYTE (c1);
2146 dst += CHAR_STRING (c1, dst), produced_chars++;
2147 }
2148 coding->produced_char += produced_chars;
2149 }
2150 else if (c1 == 'G')
2151 {
2152 unsigned char *d = dst;
2153 int produced_chars;
2154
2155 /* XFree86 extension for embedding UTF-8 in CTEXT:
2156 ESC % G --UTF-8-BYTES-- ESC % @
2157 We keep these bytes as is for the moment.
2158 They may be decoded by post-read-conversion. */
2159 if (d + 6 > (dst_bytes ? dst_end : src))
2160 goto label_end_of_loop;
2161 *d++ = ISO_CODE_ESC;
2162 *d++ = '%';
2163 *d++ = 'G';
2164 produced_chars = 3;
2165 while (d + 1 < (dst_bytes ? dst_end : src))
2166 {
2167 ONE_MORE_BYTE (c1);
2168 if (c1 == ISO_CODE_ESC
2169 && src + 1 < src_end
2170 && src[0] == '%'
2171 && src[1] == '@')
2172 {
2173 src += 2;
2174 break;
2175 }
2176 d += CHAR_STRING (c1, d), produced_chars++;
2177 }
2178 if (d + 3 > (dst_bytes ? dst_end : src))
2179 goto label_end_of_loop;
2180 *d++ = ISO_CODE_ESC;
2181 *d++ = '%';
2182 *d++ = '@';
2183 dst = d;
2184 coding->produced_char += produced_chars + 3;
2185 }
2186 else
2187 goto label_invalid_code;
2188 continue;
2189
2190 default:
2191 if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
2192 goto label_invalid_code;
2193 if (c1 >= 0x28 && c1 <= 0x2B)
2194 { /* designation of DIMENSION1_CHARS94 character set */
2195 ONE_MORE_BYTE (c2);
2196 DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
2197 }
2198 else if (c1 >= 0x2C && c1 <= 0x2F)
2199 { /* designation of DIMENSION1_CHARS96 character set */
2200 ONE_MORE_BYTE (c2);
2201 DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
2202 }
2203 else
2204 goto label_invalid_code;
2205 /* We must update these variables now. */
2206 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
2207 charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
2208 continue;
2209 }
2210 }
2211
2212 /* Now we know CHARSET and 1st position code C1 of a character.
2213 Produce a multibyte sequence for that character while getting
2214 2nd position code C2 if necessary. */
2215 if (CHARSET_DIMENSION (charset) == 2)
2216 {
2217 ONE_MORE_BYTE (c2);
2218 if (c1 < 0x80 ? c2 < 0x20 || c2 >= 0x80 : c2 < 0xA0)
2219 /* C2 is not in a valid range. */
2220 goto label_invalid_code;
2221 }
2222 c = DECODE_ISO_CHARACTER (charset, c1, c2);
2223 EMIT_CHAR (c);
2224 continue;
2225
2226 label_invalid_code:
2227 coding->errors++;
2228 if (COMPOSING_P (coding))
2229 DECODE_COMPOSITION_END ('1');
2230 src = src_base;
2231 c = *src++;
2232 if (! NILP (translation_table))
2233 c = translate_char (translation_table, c, 0, 0, 0);
2234 EMIT_CHAR (c);
2235 }
2236
2237 label_end_of_loop:
2238 coding->consumed = coding->consumed_char = src_base - source;
2239 coding->produced = dst - destination;
2240 return;
2241 }
2242
2243
2244 /* ISO2022 encoding stuff. */
2245
2246 /*
2247 It is not enough to say just "ISO2022" on encoding, we have to
2248 specify more details. In Emacs, each ISO2022 coding system
2249 variant has the following specifications:
2250 1. Initial designation to G0 through G3.
2251 2. Allows short-form designation?
2252 3. ASCII should be designated to G0 before control characters?
2253 4. ASCII should be designated to G0 at end of line?
2254 5. 7-bit environment or 8-bit environment?
2255 6. Use locking-shift?
2256 7. Use Single-shift?
2257 And the following two are only for Japanese:
2258 8. Use ASCII in place of JIS0201-1976-Roman?
2259 9. Use JISX0208-1983 in place of JISX0208-1978?
2260 These specifications are encoded in `coding->flags' as flag bits
2261 defined by macros CODING_FLAG_ISO_XXX. See `coding.h' for more
2262 details.
2263 */
2264
/* Emit at DST the escape sequence designating CHARSET to graphic
   register REG, advance DST past it, and record the designation in
   CODING.  A revision number below 255 is announced first with
   `ESC & <'@' + revision>'.  The intermediate character is dropped
   (short form) only for a two-byte 94-charset going to register 0
   whose final character is '@', 'A', or 'B', and only when CODING
   has CODING_FLAG_ISO_SHORT_FORM set.  */

#define ENCODE_DESIGNATION(charset, reg, coding)                        \
  do {                                                                  \
    /* Intermediates for 94-charsets, then those for 96-charsets.  */   \
    const char *designation_intermediates = "()*+,-./";                 \
    unsigned char designation_final = CHARSET_ISO_FINAL_CHAR (charset); \
    int designation_revision                                            \
      = CODING_SPEC_ISO_REVISION_NUMBER(coding, charset);               \
                                                                        \
    if (designation_revision < 255)                                     \
      {                                                                 \
        dst[0] = ISO_CODE_ESC;                                          \
        dst[1] = '&';                                                   \
        dst[2] = '@' + designation_revision;                            \
        dst += 3;                                                       \
      }                                                                 \
    *dst++ = ISO_CODE_ESC;                                              \
    if (CHARSET_DIMENSION (charset) != 1)                               \
      *dst++ = '$';                                                     \
    if (CHARSET_CHARS (charset) != 94)                                  \
      *dst++ = (unsigned char) (designation_intermediates[4 + (reg)]);  \
    else if (CHARSET_DIMENSION (charset) == 1                           \
             || ! (coding->flags & CODING_FLAG_ISO_SHORT_FORM)          \
             || reg != 0                                                \
             || designation_final < '@' || designation_final > 'B')     \
      *dst++ = (unsigned char) (designation_intermediates[reg]);        \
    *dst++ = designation_final;                                         \
    CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;                \
  } while (0)
2307
/* Single-shift functions.  Each macro emits at DST the code for
   single-shift-2 (resp. single-shift-3) -- the two bytes ESC 'N'
   (resp. ESC 'O') under CODING_FLAG_ISO_SEVEN_BITS, the single byte
   ISO_CODE_SS2 (resp. ISO_CODE_SS3) otherwise -- and marks CODING as
   being in single-shifting state.  */

#define ENCODE_SINGLE_SHIFT_2                                   \
  do {                                                          \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;               \
    if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS))         \
      *dst++ = ISO_CODE_SS2;                                    \
    else                                                        \
      {                                                         \
        *dst++ = ISO_CODE_ESC;                                  \
        *dst++ = 'N';                                           \
      }                                                         \
  } while (0)

#define ENCODE_SINGLE_SHIFT_3                                   \
  do {                                                          \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;               \
    if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS))         \
      *dst++ = ISO_CODE_SS3;                                    \
    else                                                        \
      {                                                         \
        *dst++ = ISO_CODE_ESC;                                  \
        *dst++ = 'O';                                           \
      }                                                         \
  } while (0)
2329
/* Locking-shift functions.  Each macro emits at DST the code for one
   locking shift -- ISO_CODE_SI (shift-in), ISO_CODE_SO (shift-out),
   ESC 'n' (locking-shift-2), ESC 'o' (locking-shift-3) -- and records
   in CODING that graphic plane 0 now invokes register 0, 1, 2, or 3
   respectively.  */

#define ENCODE_SHIFT_IN                                 \
  do {                                                  \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;         \
    *dst++ = ISO_CODE_SI;                               \
  } while (0)

#define ENCODE_SHIFT_OUT                                \
  do {                                                  \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;         \
    *dst++ = ISO_CODE_SO;                               \
  } while (0)

#define ENCODE_LOCKING_SHIFT_2                          \
  do {                                                  \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;         \
    dst[0] = ISO_CODE_ESC;                              \
    dst[1] = 'n';                                       \
    dst += 2;                                           \
  } while (0)

#define ENCODE_LOCKING_SHIFT_3                          \
  do {                                                  \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;         \
    dst[0] = ISO_CODE_ESC;                              \
    dst[1] = 'o';                                       \
    dst += 2;                                           \
  } while (0)
2357
/* Emit at DST the single byte encoding position code C1 of the
   one-dimensional charset CHARSET.  After a single shift the byte is
   written with bit 8 cleared or set according to
   CODING_FLAG_ISO_SEVEN_BITS and the single-shifting state is
   cleared.  Otherwise the byte is written with bit 8 cleared when
   CHARSET is on graphic plane 0, or set when it is on plane 1.  When
   CHARSET is on neither plane, encode_invocation_designation is
   called and the tests are repeated.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
  do {                                                                  \
    for (;;)                                                            \
      {                                                                 \
        if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                   \
          {                                                             \
            *dst++ = ((coding->flags & CODING_FLAG_ISO_SEVEN_BITS)      \
                      ? c1 & 0x7F : c1 | 0x80);                         \
            CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;               \
            break;                                                      \
          }                                                             \
        if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))       \
          {                                                             \
            *dst++ = c1 & 0x7F;                                         \
            break;                                                      \
          }                                                             \
        if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))       \
          {                                                             \
            *dst++ = c1 | 0x80;                                         \
            break;                                                      \
          }                                                             \
        /* CHARSET is on no graphic plane yet: designate and/or     */  \
        /* invoke it, then go round again to emit the byte.         */  \
        dst = encode_invocation_designation (charset, coding, dst);     \
      }                                                                 \
  } while (0)
2390
/* Emit at DST the two bytes encoding position codes C1 and C2 of the
   two-dimensional charset CHARSET.  The choice between clearing and
   setting bit 8 of both bytes follows the same rules as in
   ENCODE_ISO_CHARACTER_DIMENSION1: the seven-bit flag after a single
   shift, otherwise the graphic plane CHARSET is invoked to; when it
   is on neither plane, encode_invocation_designation is called and
   the tests are repeated.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
  do {                                                                  \
    for (;;)                                                            \
      {                                                                 \
        if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                   \
          {                                                             \
            if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)             \
              {                                                         \
                *dst++ = c1 & 0x7F;                                     \
                *dst++ = c2 & 0x7F;                                     \
              }                                                         \
            else                                                        \
              {                                                         \
                *dst++ = c1 | 0x80;                                     \
                *dst++ = c2 | 0x80;                                     \
              }                                                         \
            CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;               \
            break;                                                      \
          }                                                             \
        if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))       \
          {                                                             \
            *dst++ = c1 & 0x7F;                                         \
            *dst++ = c2 & 0x7F;                                         \
            break;                                                      \
          }                                                             \
        if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))       \
          {                                                             \
            *dst++ = c1 | 0x80;                                         \
            *dst++ = c2 | 0x80;                                         \
            break;                                                      \
          }                                                             \
        /* CHARSET is on no graphic plane yet: designate and/or     */  \
        /* invoke it, then go round again to emit the bytes.        */  \
        dst = encode_invocation_designation (charset, coding, dst);     \
      }                                                                 \
  } while (0)
2423
/* Encode character C at DST.  C is split into its charset and position
   codes.  A defined charset goes through the DIMENSION1 or DIMENSION2
   macro, after substituting charset_latin_jisx0201 for ASCII under
   CODING_FLAG_ISO_USE_ROMAN and charset_jisx0208_1978 for
   charset_jisx0208 under CODING_FLAG_ISO_USE_OLDJIS.  For a charset
   that is not defined, the first position code is written as is,
   followed by the second one when it is not negative.  */

#define ENCODE_ISO_CHARACTER(c)                                         \
  do {                                                                  \
    int iso_charset, iso_c1, iso_c2;                                    \
                                                                        \
    SPLIT_CHAR (c, iso_charset, iso_c1, iso_c2);                        \
    if (! CHARSET_DEFINED_P (iso_charset))                              \
      {                                                                 \
        *dst++ = iso_c1;                                                \
        if (iso_c2 >= 0)                                                \
          *dst++ = iso_c2;                                              \
      }                                                                 \
    else if (CHARSET_DIMENSION (iso_charset) != 1)                      \
      {                                                                 \
        if (iso_charset == charset_jisx0208                             \
            && (coding->flags & CODING_FLAG_ISO_USE_OLDJIS))            \
          iso_charset = charset_jisx0208_1978;                          \
        ENCODE_ISO_CHARACTER_DIMENSION2 (iso_charset, iso_c1, iso_c2);  \
      }                                                                 \
    else                                                                \
      {                                                                 \
        if (iso_charset == CHARSET_ASCII                                \
            && (coding->flags & CODING_FLAG_ISO_USE_ROMAN))             \
          iso_charset = charset_latin_jisx0201;                         \
        ENCODE_ISO_CHARACTER_DIMENSION1 (iso_charset, iso_c1);          \
      }                                                                 \
  } while (0)
2453
2454
/* Stand-in for a character C that must not be encoded: emit
   CODING_REPLACEMENT_CHARACTER once, or twice when the charset of C
   has a width greater than 1.  */

#define ENCODE_UNSAFE_CHARACTER(c)                                      \
  do {                                                                  \
    int unsafe_repeat = CHARSET_WIDTH (CHAR_CHARSET (c)) > 1 ? 2 : 1;   \
                                                                        \
    while (unsafe_repeat-- > 0)                                         \
      ENCODE_ISO_CHARACTER (CODING_REPLACEMENT_CHARACTER);              \
  } while (0)
2463
2464
2465 /* Produce designation and invocation codes at a place pointed by DST
2466 to use CHARSET. The element `spec.iso2022' of *CODING is updated.
2467 Return new DST. */
2468
2469 unsigned char *
2470 encode_invocation_designation (charset, coding, dst)
2471 int charset;
2472 struct coding_system *coding;
2473 unsigned char *dst;
2474 {
2475 int reg; /* graphic register number */
2476
2477 /* At first, check designations. */
2478 for (reg = 0; reg < 4; reg++)
2479 if (charset == CODING_SPEC_ISO_DESIGNATION (coding, reg))
2480 break;
2481
2482 if (reg >= 4)
2483 {
2484 /* CHARSET is not yet designated to any graphic registers. */
2485 /* At first check the requested designation. */
2486 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
2487 if (reg == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
2488 /* Since CHARSET requests no special designation, designate it
2489 to graphic register 0. */
2490 reg = 0;
2491
2492 ENCODE_DESIGNATION (charset, reg, coding);
2493 }
2494
2495 if (CODING_SPEC_ISO_INVOCATION (coding, 0) != reg
2496 && CODING_SPEC_ISO_INVOCATION (coding, 1) != reg)
2497 {
2498 /* Since the graphic register REG is not invoked to any graphic
2499 planes, invoke it to graphic plane 0. */
2500 switch (reg)
2501 {
2502 case 0: /* graphic register 0 */
2503 ENCODE_SHIFT_IN;
2504 break;
2505
2506 case 1: /* graphic register 1 */
2507 ENCODE_SHIFT_OUT;
2508 break;
2509
2510 case 2: /* graphic register 2 */
2511 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2512 ENCODE_SINGLE_SHIFT_2;
2513 else
2514 ENCODE_LOCKING_SHIFT_2;
2515 break;
2516
2517 case 3: /* graphic register 3 */
2518 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2519 ENCODE_SINGLE_SHIFT_3;
2520 else
2521 ENCODE_LOCKING_SHIFT_3;
2522 break;
2523 }
2524 }
2525
2526 return dst;
2527 }
2528
/* Emit at DST the two bytes encoding composition rule RULE: RULE is
   decoded by COMPOSITION_DECODE_RULE into a pair of reference points,
   written as 32 + 81 + <first> followed by 32 + <second>.  */

#define ENCODE_COMPOSITION_RULE(rule)                           \
  do {                                                          \
    int rule_gref, rule_nref;                                   \
                                                                \
    COMPOSITION_DECODE_RULE (rule, rule_gref, rule_nref);       \
    dst[0] = 32 + 81 + rule_gref;                               \
    dst[1] = 32 + rule_nref;                                    \
    dst += 2;                                                   \
  } while (0)
2538
/* Emit at DST the sequence opening a composition and set up CODING
   for it.  DATA points to the integers describing the composition
   (see the comment in coding.h for the format); DATA[3] becomes
   CODING->composing.  A relative composition is announced by ESC 0.
   Any other kind is announced by ESC 3 (COMPOSITION_WITH_ALTCHARS) or
   ESC 4, and CODING->cmp_data_index is placed 4 entries past
   CODING->cmp_data_start with no composition rule pending.  */

#define ENCODE_COMPOSITION_START(coding, data)                          \
  do {                                                                  \
    coding->composing = data[3];                                        \
    *dst++ = ISO_CODE_ESC;                                              \
    if (coding->composing != COMPOSITION_RELATIVE)                      \
      {                                                                 \
        coding->composition_rule_follows = 0;                           \
        coding->cmp_data_index = coding->cmp_data_start + 4;            \
        *dst++ = (coding->composing != COMPOSITION_WITH_ALTCHARS        \
                  ? '4' : '3');                                         \
      }                                                                 \
    else                                                                \
      *dst++ = '0';                                                     \
  } while (0)
2558
/* Emit ESC 1 at DST to close the current composition, advance
   CODING->cmp_data_start by DATA[0], and mark CODING as no longer
   composing.  When that exhausts the current composition data block
   and another block is chained after it, switch CODING to the next
   block and restart at its beginning.  */

#define ENCODE_COMPOSITION_END(coding, data)                            \
  do {                                                                  \
    dst[0] = ISO_CODE_ESC;                                              \
    dst[1] = '1';                                                       \
    dst += 2;                                                           \
    coding->cmp_data_start += data[0];                                  \
    coding->composing = COMPOSITION_NO;                                 \
    if (coding->cmp_data->next                                          \
        && coding->cmp_data_start == coding->cmp_data->used)            \
      {                                                                 \
        coding->cmp_data = coding->cmp_data->next;                      \
        coding->cmp_data_start = 0;                                     \
      }                                                                 \
  } while (0)
2574
/* Emit ESC 0 at DST and switch CODING to COMPOSITION_RELATIVE.  The
   sequence does not open a new composition here: it tells the reader
   that the components (alternate chars and composition rules) of the
   current composition have all been produced and that the actual
   text follows in SRC.  */

#define ENCODE_COMPOSITION_FAKE_START(coding)           \
  do {                                                  \
    coding->composing = COMPOSITION_RELATIVE;           \
    dst[0] = ISO_CODE_ESC;                              \
    dst[1] = '0';                                       \
    dst += 2;                                           \
  } while (0)
2586
/* The following three macros produce codes for indicating direction
   of text.  All of them write at DST and advance it, and read the
   flags of the coding system through the variable `coding'.

   ENCODE_CONTROL_SEQUENCE_INTRODUCER emits the two bytes ESC '[' when
   CODING_FLAG_ISO_SEVEN_BITS is set in the flags, and the single byte
   ISO_CODE_CSI otherwise.  The flag is tested with `&', like every
   other flag test in this file; comparing the whole flag word with
   `==' would miss the seven-bit case as soon as any other flag is
   set.

   ENCODE_DIRECTION_R2L and ENCODE_DIRECTION_L2R follow the introducer
   with "2]" and "0]" respectively.  They are complete statements
   wrapped in do ... while (0), to be used as `ENCODE_DIRECTION_R2L;'.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER              \
  do {                                                  \
    if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)     \
      *dst++ = ISO_CODE_ESC, *dst++ = '[';              \
    else                                                \
      *dst++ = ISO_CODE_CSI;                            \
  } while (0)

#define ENCODE_DIRECTION_R2L                            \
  do {                                                  \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;                 \
    *dst++ = '2';                                       \
    *dst++ = ']';                                       \
  } while (0)

#define ENCODE_DIRECTION_L2R                            \
  do {                                                  \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;                 \
    *dst++ = '0';                                       \
    *dst++ = ']';                                       \
  } while (0)
2602
/* Bring the graphic planes and registers of CODING back to their
   initial state, emitting the needed codes at DST: shift-in when
   graphic plane 0 does not invoke register 0, then a designation for
   every register whose initial charset is set (not negative) and
   differs from the charset currently designated to it.  */
#define ENCODE_RESET_PLANE_AND_REGISTER                                 \
  do {                                                                  \
    int reset_reg;                                                      \
                                                                        \
    if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0)                    \
      ENCODE_SHIFT_IN;                                                  \
    for (reset_reg = 0; reset_reg < 4; reset_reg++)                     \
      {                                                                 \
        int reset_initial                                               \
          = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reset_reg);    \
                                                                        \
        if (reset_initial < 0                                           \
            || (CODING_SPEC_ISO_DESIGNATION (coding, reset_reg)         \
                == reset_initial))                                      \
          continue;                                                     \
        ENCODE_DESIGNATION (reset_initial, reset_reg, coding);          \
      }                                                                 \
  } while (0)
2617
2618 /* Produce designation sequences of charsets in the line started from
2619 SRC to a place pointed by DST, and return updated DST.
2620
2621 If the current block ends before any end-of-line, we may fail to
2622 find all the necessary designations. */
2623
2624 static unsigned char *
2625 encode_designation_at_bol (coding, translation_table, src, src_end, dst)
2626 struct coding_system *coding;
2627 Lisp_Object translation_table;
2628 const unsigned char *src, *src_end;
2629 unsigned char *dst;
2630 {
2631 int charset, c, found = 0, reg;
2632 /* Table of charsets to be designated to each graphic register. */
2633 int r[4];
2634
2635 for (reg = 0; reg < 4; reg++)
2636 r[reg] = -1;
2637
2638 while (found < 4)
2639 {
2640 ONE_MORE_CHAR (c);
2641 if (c == '\n')
2642 break;
2643
2644 charset = CHAR_CHARSET (c);
2645 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
2646 if (reg != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION && r[reg] < 0)
2647 {
2648 found++;
2649 r[reg] = charset;
2650 }
2651 }
2652
2653 label_end_of_loop:
2654 if (found)
2655 {
2656 for (reg = 0; reg < 4; reg++)
2657 if (r[reg] >= 0
2658 && CODING_SPEC_ISO_DESIGNATION (coding, reg) != r[reg])
2659 ENCODE_DESIGNATION (r[reg], reg, coding);
2660 }
2661
2662 return dst;
2663 }
2664
2665 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
/* Encode the SRC_BYTES bytes at SOURCE into DESTINATION according to
   the ISO2022 variant described by CODING.  Before returning, the
   function stores the number of source bytes consumed in
   CODING->consumed and the number of bytes produced in both
   CODING->produced and CODING->produced_char.  CODING->consumed_char
   is reset to 0 and incremented each time an iteration reaches the
   bottom of the loop; CODING->errors is reset to 0 and counts
   non-ASCII single-byte characters that were copied through
   unchanged.  A zero DST_BYTES makes the output limit follow the
   current source position instead of DESTINATION + DST_BYTES --
   NOTE(review): presumably the case where both areas share one
   buffer; confirm against the callers.  */
2666
2667 static void
2668 encode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
2669 struct coding_system *coding;
2670 const unsigned char *source;
2671 unsigned char *destination;
2672 int src_bytes, dst_bytes;
2673 {
2674 const unsigned char *src = source;
2675 const unsigned char *src_end = source + src_bytes;
2676 unsigned char *dst = destination;
2677 unsigned char *dst_end = destination + dst_bytes;
2678 /* Since the maximum bytes produced by each loop is 20, we subtract 19
2679 from DST_END to assure overflow checking is necessary only at the
2680 head of loop. */
2681 unsigned char *adjusted_dst_end = dst_end - 19;
2682 /* SRC_BASE remembers the start position in source in each loop.
2683 The loop will be exited when there's not enough source text to
2684 analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
2685 there's not enough destination area to produce encoded codes
2686 (within macro EMIT_BYTES). */
2687 const unsigned char *src_base;
2688 int c;
2689 Lisp_Object translation_table;
2690 Lisp_Object safe_chars;
2691
/* A coding system flagged CODING_FLAG_ISO_SAFE always runs with
   CODING_MODE_INHIBIT_UNENCODABLE_CHAR, the mode bit tested below
   before ENCODE_UNSAFE_CHARACTER is used.  */
2692 if (coding->flags & CODING_FLAG_ISO_SAFE)
2693 coding->mode |= CODING_MODE_INHIBIT_UNENCODABLE_CHAR;
2694
2695 safe_chars = coding_safe_chars (coding->symbol);
2696
/* Choose the translation table: none when character translation is
   disabled, otherwise the table of CODING, falling back to the
   standard encoding table.  */
2697 if (NILP (Venable_character_translation))
2698 translation_table = Qnil;
2699 else
2700 {
2701 translation_table = coding->translation_table_for_encode;
2702 if (NILP (translation_table))
2703 translation_table = Vstandard_translation_table_for_encode;
2704 }
2705
2706 coding->consumed_char = 0;
2707 coding->errors = 0;
2708 while (1)
2709 {
2710 src_base = src;
2711
/* Stop as soon as DST has reached the limit, which leaves 19 bytes of
   slack below DST_END, or below SRC when DST_BYTES is zero.  */
2712 if (dst >= (dst_bytes ? adjusted_dst_end : (src - 19)))
2713 {
2714 coding->result = CODING_FINISH_INSUFFICIENT_DST;
2715 break;
2716 }
2717
2718 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL
2719 && CODING_SPEC_ISO_BOL (coding))
2720 {
2721 /* We have to produce designation sequences if any now. */
2722 dst = encode_designation_at_bol (coding, translation_table,
2723 src, src_end, dst);
2724 CODING_SPEC_ISO_BOL (coding) = 0;
2725 }
2726
2727 /* Check composition start and end. */
2728 if (coding->composing != COMPOSITION_DISABLED
2729 && coding->cmp_data_start < coding->cmp_data->used)
2730 {
2731 struct composition_data *cmp_data = coding->cmp_data;
2732 int *data = cmp_data->data + coding->cmp_data_start;
2733 int this_pos = cmp_data->char_offset + coding->consumed_char;
2734
2735 if (coding->composing == COMPOSITION_RELATIVE)
2736 {
/* The relative composition is closed when the current character
   position reaches data[2]; CMP_DATA and DATA are then reloaded
   because ENCODE_COMPOSITION_END may have moved CODING to another
   composition record.  */
2737 if (this_pos == data[2])
2738 {
2739 ENCODE_COMPOSITION_END (coding, data);
2740 cmp_data = coding->cmp_data;
2741 data = cmp_data->data + coding->cmp_data_start;
2742 }
2743 }
2744 else if (COMPOSING_P (coding))
2745 {
2746 /* COMPOSITION_WITH_ALTCHARS or COMPOSITION_WITH_RULE_ALTCHARS */
2747 if (coding->cmp_data_index == coding->cmp_data_start + data[0])
2748 /* We have consumed components of the composition.
2749 What follows in SRC is the composition's base
2750 text. */
2751 ENCODE_COMPOSITION_FAKE_START (coding);
2752 else
2753 {
/* Emit the next component taken from the composition data (not from
   SRC): a composition rule when one is due, otherwise a character.
   This local C hides the function-level C.  */
2754 int c = cmp_data->data[coding->cmp_data_index++];
2755 if (coding->composition_rule_follows)
2756 {
2757 ENCODE_COMPOSITION_RULE (c);
2758 coding->composition_rule_follows = 0;
2759 }
2760 else
2761 {
2762 if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR
2763 && ! CODING_SAFE_CHAR_P (safe_chars, c))
2764 ENCODE_UNSAFE_CHARACTER (c);
2765 else
2766 ENCODE_ISO_CHARACTER (c);
2767 if (coding->composing == COMPOSITION_WITH_RULE_ALTCHARS)
2768 coding->composition_rule_follows = 1;
2769 }
2770 continue;
2771 }
2772 }
2773 if (!COMPOSING_P (coding))
2774 {
/* A new composition is opened when the current character position
   reaches data[1].  */
2775 if (this_pos == data[1])
2776 {
2777 ENCODE_COMPOSITION_START (coding, data);
2778 continue;
2779 }
2780 }
2781 }
2782
2783 ONE_MORE_CHAR (c);
2784
2785 /* Now encode the character C. */
2786 if (c < 0x20 || c == 0x7F)
2787 {
2788 if (c == '\r')
2789 {
2790 if (! (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
2791 {
2792 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
2793 ENCODE_RESET_PLANE_AND_REGISTER;
2794 *dst++ = c;
/* NOTE(review): this `continue' skips the consumed_char increment at
   the bottom of the loop, unlike the other control characters --
   confirm that this is intended.  */
2795 continue;
2796 }
2797 /* fall down to treat '\r' as '\n' ... */
2798 c = '\n';
2799 }
2800 if (c == '\n')
2801 {
2802 if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
2803 ENCODE_RESET_PLANE_AND_REGISTER;
2804 if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL)
2805 bcopy (coding->spec.iso2022.initial_designation,
2806 coding->spec.iso2022.current_designation,
2807 sizeof coding->spec.iso2022.initial_designation);
/* Write the end-of-line in the format selected by CODING->eol_type:
   LF (also when undecided), CR LF, or CR.  */
2808 if (coding->eol_type == CODING_EOL_LF
2809 || coding->eol_type == CODING_EOL_UNDECIDED)
2810 *dst++ = ISO_CODE_LF;
2811 else if (coding->eol_type == CODING_EOL_CRLF)
2812 *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
2813 else
2814 *dst++ = ISO_CODE_CR;
2815 CODING_SPEC_ISO_BOL (coding) = 1;
2816 }
2817 else
2818 {
2819 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
2820 ENCODE_RESET_PLANE_AND_REGISTER;
2821 *dst++ = c;
2822 }
2823 }
2824 else if (ASCII_BYTE_P (c))
2825 ENCODE_ISO_CHARACTER (c);
2826 else if (SINGLE_BYTE_CHAR_P (c))
2827 {
/* A non-ASCII single-byte character is copied as is and counted as
   an error.  */
2828 *dst++ = c;
2829 coding->errors++;
2830 }
2831 else if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR
2832 && ! CODING_SAFE_CHAR_P (safe_chars, c))
2833 ENCODE_UNSAFE_CHARACTER (c);
2834 else
2835 ENCODE_ISO_CHARACTER (c);
2836
2837 coding->consumed_char++;
2838 }
2839
/* SRC_BASE is the position where the last iteration started, so the
   source bytes read by an interrupted iteration are not reported as
   consumed.  */
2840 label_end_of_loop:
2841 coding->consumed = src_base - source;
2842 coding->produced = coding->produced_char = dst - destination;
2843 }
2844
2845 \f
2846 /*** 4. SJIS and BIG5 handlers ***/
2847
2848 /* Although SJIS and BIG5 are not ISO coding systems, they are used
2849 quite widely. So, for the moment, Emacs supports them in the bare
2850 C code. But, in the future, they may be supported only by CCL. */
2851
2852 /* SJIS is a coding system encoding three character sets: ASCII, right
2853 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
2854 as is. A character of charset katakana-jisx0201 is encoded by
2855 "position-code + 0x80". A character of charset japanese-jisx0208
2856 is encoded in 2-byte but two position-codes are divided and shifted
2857 so that it fits in the range below.
2858
2859 --- CODE RANGE of SJIS ---
2860 (character set) (range)
2861 ASCII 0x00 .. 0x7F
2862 KATAKANA-JISX0201 0xA1 .. 0xDF
2863 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF
2864 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
2865 -------------------------------
2866
2867 */
2868
2869 /* BIG5 is a coding system encoding two character sets: ASCII and
2870 Big5. An ASCII character is encoded as is. Big5 is a two-byte
2871 character set and is encoded in two bytes.
2872
2873 --- CODE RANGE of BIG5 ---
2874 (character set) (range)
2875 ASCII 0x00 .. 0x7F
2876 Big5 (1st byte) 0xA1 .. 0xFE
2877 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
2878 --------------------------
2879
2880 Since the number of characters in Big5 is larger than maximum
2881 characters in Emacs' charset (96x96), it can't be handled as one
2882 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
2883 and `charset-big5-2'. Both are DIMENSION2 and CHARS94. The former
2884 contains frequently used characters and the latter contains less
2885 frequently used characters. */
2886
2887 /* Macros to decode or encode a character of Big5 in BIG5. B1 and B2
2888 are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
2889 C1 and C2 are the 1st and 2nd position-codes of Emacs' internal
2890 format. CHARSET is `charset_big5_1' or `charset_big5_2'. */
2891
/* Number of Big5 characters which have the same code in 1st byte:
   the 2nd byte ranges 0x40..0x7E and 0xA1..0xFE taken together.  */
#define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)

/* Convert the BIG5 byte pair B1, B2 into Emacs' internal form: store
   charset_big5_1 (B1 below 0xC9) or charset_big5_2 into CHARSET and
   the two position codes into C1 and C2.  Every argument is
   parenthesized in the expansion so that expressions can be passed;
   B1 and B2 are evaluated more than once.  */
#define DECODE_BIG5(b1, b2, charset, c1, c2)                            \
  do {                                                                  \
    unsigned int big5_index                                             \
      = (((b1) - 0xA1) * BIG5_SAME_ROW                                  \
         + (b2) - ((b2) < 0x7F ? 0x40 : 0x62));                         \
    if ((b1) < 0xC9)                                                    \
      (charset) = charset_big5_1;                                       \
    else                                                                \
      {                                                                 \
        (charset) = charset_big5_2;                                     \
        big5_index -= (0xC9 - 0xA1) * BIG5_SAME_ROW;                    \
      }                                                                 \
    (c1) = big5_index / (0xFF - 0xA1) + 0x21;                           \
    (c2) = big5_index % (0xFF - 0xA1) + 0x21;                           \
  } while (0)

/* Inverse of DECODE_BIG5: convert position codes C1, C2 of CHARSET
   (charset_big5_1 or charset_big5_2) into the BIG5 byte pair stored
   into B1 and B2.  B2 is used as scratch storage and must be an
   lvalue able to hold values up to 0xFE.  */
#define ENCODE_BIG5(charset, c1, c2, b1, b2)                            \
  do {                                                                  \
    unsigned int big5_index                                             \
      = ((c1) - 0x21) * (0xFF - 0xA1) + ((c2) - 0x21);                  \
    if ((charset) == charset_big5_2)                                    \
      big5_index += BIG5_SAME_ROW * (0xC9 - 0xA1);                      \
    (b1) = big5_index / BIG5_SAME_ROW + 0xA1;                           \
    (b2) = big5_index % BIG5_SAME_ROW;                                  \
    (b2) += (b2) < 0x3F ? 0x40 : 0x62;                                  \
  } while (0)
2919
2920 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2921 Check if a text is encoded in SJIS. If it is, return
2922 CODING_CATEGORY_MASK_SJIS, else return 0. */
2923
2924 static int
2925 detect_coding_sjis (src, src_end, multibytep)
2926 unsigned char *src, *src_end;
2927 int multibytep;
2928 {
2929 int c;
2930 /* Dummy for ONE_MORE_BYTE. */
2931 struct coding_system dummy_coding;
2932 struct coding_system *coding = &dummy_coding;
2933
2934 while (1)
2935 {
2936 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, CODING_CATEGORY_MASK_SJIS);
2937 if (c < 0x80)
2938 continue;
2939 if (c == 0x80 || c == 0xA0 || c > 0xEF)
2940 return 0;
2941 if (c <= 0x9F || c >= 0xE0)
2942 {
2943 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, 0);
2944 if (c < 0x40 || c == 0x7F || c > 0xFC)
2945 return 0;
2946 }
2947 }
2948 }
2949
2950 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2951 Check if a text is encoded in BIG5. If it is, return
2952 CODING_CATEGORY_MASK_BIG5, else return 0. */
2953
2954 static int
2955 detect_coding_big5 (src, src_end, multibytep)
2956 unsigned char *src, *src_end;
2957 int multibytep;
2958 {
2959 int c;
2960 /* Dummy for ONE_MORE_BYTE. */
2961 struct coding_system dummy_coding;
2962 struct coding_system *coding = &dummy_coding;
2963
2964 while (1)
2965 {
2966 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, CODING_CATEGORY_MASK_BIG5);
2967 if (c < 0x80)
2968 continue;
2969 if (c < 0xA1 || c > 0xFE)
2970 return 0;
2971 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, 0);
2972 if (c < 0x40 || (c > 0x7F && c < 0xA1) || c > 0xFE)
2973 return 0;
2974 }
2975 }
2976
2977 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2978 Check if a text is encoded in UTF-8. If it is, return
2979 CODING_CATEGORY_MASK_UTF_8, else return 0. */
2980
/* Classification of a single octet C of UTF-8 text.  An octet is
   recognized by comparing its high-order bits, selected by MASK,
   with the fixed bit pattern BITS of its class.  */
#define UTF_8_OCTET_CLASS_P(c, mask, bits) (((c) & (mask)) == (bits))

/* C is a complete one-octet character.  */
#define UTF_8_1_OCTET_P(c)         ((c) < 0x80)
/* C is a continuation octet (10xxxxxx).  */
#define UTF_8_EXTRA_OCTET_P(c)     UTF_8_OCTET_CLASS_P (c, 0xC0, 0x80)
/* C is the leading octet of a 2-, 3-, 4-, 5- or 6-octet sequence.  */
#define UTF_8_2_OCTET_LEADING_P(c) UTF_8_OCTET_CLASS_P (c, 0xE0, 0xC0)
#define UTF_8_3_OCTET_LEADING_P(c) UTF_8_OCTET_CLASS_P (c, 0xF0, 0xE0)
#define UTF_8_4_OCTET_LEADING_P(c) UTF_8_OCTET_CLASS_P (c, 0xF8, 0xF0)
#define UTF_8_5_OCTET_LEADING_P(c) UTF_8_OCTET_CLASS_P (c, 0xFC, 0xF8)
#define UTF_8_6_OCTET_LEADING_P(c) UTF_8_OCTET_CLASS_P (c, 0xFE, 0xFC)
2988
2989 static int
2990 detect_coding_utf_8 (src, src_end, multibytep)
2991 unsigned char *src, *src_end;
2992 int multibytep;
2993 {
2994 unsigned char c;
2995 int seq_maybe_bytes;
2996 /* Dummy for ONE_MORE_BYTE. */
2997 struct coding_system dummy_coding;
2998 struct coding_system *coding = &dummy_coding;
2999
3000 while (1)
3001 {
3002 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, CODING_CATEGORY_MASK_UTF_8);
3003 if (UTF_8_1_OCTET_P (c))
3004 continue;
3005 else if (UTF_8_2_OCTET_LEADING_P (c))
3006 seq_maybe_bytes = 1;
3007 else if (UTF_8_3_OCTET_LEADING_P (c))
3008 seq_maybe_bytes = 2;
3009 else if (UTF_8_4_OCTET_LEADING_P (c))
3010 seq_maybe_bytes = 3;
3011 else if (UTF_8_5_OCTET_LEADING_P (c))
3012 seq_maybe_bytes = 4;
3013 else if (UTF_8_6_OCTET_LEADING_P (c))
3014 seq_maybe_bytes = 5;
3015 else
3016 return 0;
3017
3018 do
3019 {
3020 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, 0);
3021 if (!UTF_8_EXTRA_OCTET_P (c))
3022 return 0;
3023 seq_maybe_bytes--;
3024 }
3025 while (seq_maybe_bytes > 0);
3026 }
3027 }
3028
/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
   Check if a text is encoded in UTF-16 Big Endian or Little Endian.
   Only the first two bytes are examined: if they are 0xFE 0xFF
   return CODING_CATEGORY_MASK_UTF_16_BE, if they are 0xFF 0xFE
   return CODING_CATEGORY_MASK_UTF_16_LE, else return 0.  */
3034
/* VAL is a 16-bit code unit that must not appear in UTF-16 text.  */
#define UTF_16_INVALID_P(val)   \
  (((val) == 0xFFFE)            \
   || ((val) == 0xFFFF))

/* VAL is a high (leading) surrogate, i.e. in 0xD800..0xDBFF.  The
   top six bits identify the surrogate half, so they are all
   compared.  */
#define UTF_16_HIGH_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xD800)

/* VAL is a low (trailing) surrogate, i.e. in 0xDC00..0xDFFF.  */
#define UTF_16_LOW_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xDC00)
3044
3045 static int
3046 detect_coding_utf_16 (src, src_end, multibytep)
3047 unsigned char *src, *src_end;
3048 int multibytep;
3049 {
3050 unsigned char c1, c2;
3051 /* Dummy for ONE_MORE_BYTE_CHECK_MULTIBYTE. */
3052 struct coding_system dummy_coding;
3053 struct coding_system *coding = &dummy_coding;
3054
3055 ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep, 0);
3056 ONE_MORE_BYTE_CHECK_MULTIBYTE (c2, multibytep, 0);
3057
3058 if ((c1 == 0xFF) && (c2 == 0xFE))
3059 return CODING_CATEGORY_MASK_UTF_16_LE;
3060 else if ((c1 == 0xFE) && (c2 == 0xFF))
3061 return CODING_CATEGORY_MASK_UTF_16_BE;
3062 return 0;
3063 }
3064
3065 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
3066 If SJIS_P is 1, decode SJIS text, else decode BIG5 test. */
3067
3068 static void
3069 decode_coding_sjis_big5 (coding, source, destination,
3070 src_bytes, dst_bytes, sjis_p)
3071 struct coding_system *coding;
3072 const unsigned char *source;
3073 unsigned char *destination;
3074 int src_bytes, dst_bytes;
3075 int sjis_p;
3076 {
3077 const unsigned char *src = source;
3078 const unsigned char *src_end = source + src_bytes;
3079 unsigned char *dst = destination;
3080 unsigned char *dst_end = destination + dst_bytes;
3081 /* SRC_BASE remembers the start position in source in each loop.
3082 The loop will be exited when there's not enough source code
3083 (within macro ONE_MORE_BYTE), or when there's not enough
3084 destination area to produce a character (within macro
3085 EMIT_CHAR). */
3086 const unsigned char *src_base;
3087 Lisp_Object translation_table;
3088
3089 if (NILP (Venable_character_translation))
3090 translation_table = Qnil;
3091 else
3092 {
3093 translation_table = coding->translation_table_for_decode;
3094 if (NILP (translation_table))
3095 translation_table = Vstandard_translation_table_for_decode;
3096 }
3097
3098 coding->produced_char = 0;
3099 while (1)
3100 {
3101 int c, charset, c1, c2 = 0;
3102
3103 src_base = src;
3104 ONE_MORE_BYTE (c1);
3105
3106 if (c1 < 0x80)
3107 {
3108 charset = CHARSET_ASCII;
3109 if (c1 < 0x20)
3110 {
3111 if (c1 == '\r')
3112 {
3113 if (coding->eol_type == CODING_EOL_CRLF)
3114 {
3115 ONE_MORE_BYTE (c2);
3116 if (c2 == '\n')
3117 c1 = c2;
3118 else
3119 /* To process C2 again, SRC is subtracted by 1. */
3120 src--;
3121 }
3122 else if (coding->eol_type == CODING_EOL_CR)
3123 c1 = '\n';
3124 }
3125 else if (c1 == '\n'
3126 && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
3127 && (coding->eol_type == CODING_EOL_CR
3128 || coding->eol_type == CODING_EOL_CRLF))
3129 {
3130 coding->result = CODING_FINISH_INCONSISTENT_EOL;
3131 goto label_end_of_loop;
3132 }
3133 }
3134 }
3135 else
3136 {
3137 if (sjis_p)
3138 {
3139 if (c1 == 0x80 || c1 == 0xA0 || c1 > 0xEF)
3140 goto label_invalid_code;
3141 if (c1 <= 0x9F || c1 >= 0xE0)
3142 {
3143 /* SJIS -> JISX0208 */
3144 ONE_MORE_BYTE (c2);
3145 if (c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
3146 goto label_invalid_code;
3147 DECODE_SJIS (c1, c2, c1, c2);
3148 charset = charset_jisx0208;
3149 }
3150 else
3151 /* SJIS -> JISX0201-Kana */
3152 charset = charset_katakana_jisx0201;
3153 }
3154 else
3155 {
3156 /* BIG5 -> Big5 */
3157 if (c1 < 0xA0 || c1 > 0xFE)
3158 goto label_invalid_code;
3159 ONE_MORE_BYTE (c2);
3160 if (c2 < 0x40 || (c2 > 0x7E && c2 < 0xA1) || c2 > 0xFE)
3161 goto label_invalid_code;
3162 DECODE_BIG5 (c1, c2, charset, c1, c2);
3163 }
3164 }
3165
3166 c = DECODE_ISO_CHARACTER (charset, c1, c2);
3167 EMIT_CHAR (c);
3168 continue;
3169
3170 label_invalid_code:
3171 coding->errors++;
3172 src = src_base;
3173 c = *src++;
3174 EMIT_CHAR (c);
3175 }
3176
3177 label_end_of_loop:
3178 coding->consumed = coding->consumed_char = src_base - source;
3179 coding->produced = dst - destination;
3180 return;
3181 }
3182
3183 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
3184 This function can encode charsets `ascii', `katakana-jisx0201',
3185 `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'. We
3186 are sure that all these charsets are registered as official charset
3187 (i.e. do not have extended leading-codes). Characters of other
3188 charsets are produced without any encoding. If SJIS_P is 1, encode
3189 SJIS text, else encode BIG5 text. */
3190
3191 static void
3192 encode_coding_sjis_big5 (coding, source, destination,
3193 src_bytes, dst_bytes, sjis_p)
3194 struct coding_system *coding;
3195 unsigned char *source, *destination;
3196 int src_bytes, dst_bytes;
3197 int sjis_p;
3198 {
3199 unsigned char *src = source;
3200 unsigned char *src_end = source + src_bytes;
3201 unsigned char *dst = destination;
3202 unsigned char *dst_end = destination + dst_bytes;
3203 /* SRC_BASE remembers the start position in source in each loop.
3204 The loop will be exited when there's not enough source text to
3205 analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
3206 there's not enough destination area to produce encoded codes
3207 (within macro EMIT_BYTES). */
3208 unsigned char *src_base;
3209 Lisp_Object translation_table;
3210
3211 if (NILP (Venable_character_translation))
3212 translation_table = Qnil;
3213 else
3214 {
3215 translation_table = coding->translation_table_for_encode;
3216 if (NILP (translation_table))
3217 translation_table = Vstandard_translation_table_for_encode;
3218 }
3219
3220 while (1)
3221 {
3222 int c, charset, c1, c2;
3223
3224 src_base = src;
3225 ONE_MORE_CHAR (c);
3226
3227 /* Now encode the character C. */
3228 if (SINGLE_BYTE_CHAR_P (c))
3229 {
3230 switch (c)
3231 {
3232 case '\r':
3233 if (!(coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
3234 {
3235 EMIT_ONE_BYTE (c);
3236 break;
3237 }
3238 c = '\n';
3239 case '\n':
3240 if (coding->eol_type == CODING_EOL_CRLF)
3241 {
3242 EMIT_TWO_BYTES ('\r', c);
3243 break;
3244 }
3245 else if (coding->eol_type == CODING_EOL_CR)
3246 c = '\r';
3247 default:
3248 EMIT_ONE_BYTE (c);
3249 }
3250 }
3251 else
3252 {
3253 SPLIT_CHAR (c, charset, c1, c2);
3254 if (sjis_p)
3255 {
3256 if (charset == charset_jisx0208
3257 || charset == charset_jisx0208_1978)
3258 {
3259 ENCODE_SJIS (c1, c2, c1, c2);
3260 EMIT_TWO_BYTES (c1, c2);
3261 }
3262 else if (charset == charset_katakana_jisx0201)
3263 EMIT_ONE_BYTE (c1 | 0x80);
3264 else if (charset == charset_latin_jisx0201)
3265 EMIT_ONE_BYTE (c1);
3266 else if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR)
3267 {
3268 EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER);
3269 if (CHARSET_WIDTH (charset) > 1)
3270 EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER);
3271 }
3272 else
3273 /* There's no way other than producing the internal
3274 codes as is. */
3275 EMIT_BYTES (src_base, src);
3276 }
3277 else
3278 {
3279 if (charset == charset_big5_1 || charset == charset_big5_2)
3280 {
3281 ENCODE_BIG5 (charset, c1, c2, c1, c2);
3282 EMIT_TWO_BYTES (c1, c2);
3283 }
3284 else if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR)
3285 {
3286 EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER);
3287 if (CHARSET_WIDTH (charset) > 1)
3288 EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER);
3289 }
3290 else
3291 /* There's no way other than producing the internal
3292 codes as is. */
3293 EMIT_BYTES (src_base, src);
3294 }
3295 }
3296 coding->consumed_char++;
3297 }
3298
3299 label_end_of_loop:
3300 coding->consumed = src_base - source;
3301 coding->produced = coding->produced_char = dst - destination;
3302 }
3303
3304 \f
3305 /*** 5. CCL handlers ***/
3306
3307 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3308 Check if a text is encoded in a coding system of which
3309 encoder/decoder are written in CCL program. If it is, return
3310 CODING_CATEGORY_MASK_CCL, else return 0. */
3311
3312 static int
3313 detect_coding_ccl (src, src_end, multibytep)
3314 unsigned char *src, *src_end;
3315 int multibytep;
3316 {
3317 unsigned char *valid;
3318 int c;
3319 /* Dummy for ONE_MORE_BYTE. */
3320 struct coding_system dummy_coding;
3321 struct coding_system *coding = &dummy_coding;
3322
3323 /* No coding system is assigned to coding-category-ccl. */
3324 if (!coding_system_table[CODING_CATEGORY_IDX_CCL])
3325 return 0;
3326
3327 valid = coding_system_table[CODING_CATEGORY_IDX_CCL]->spec.ccl.valid_codes;
3328 while (1)
3329 {
3330 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, CODING_CATEGORY_MASK_CCL);
3331 if (! valid[c])
3332 return 0;
3333 }
3334 }
3335
3336 \f
3337 /*** 6. End-of-line handlers ***/
3338
3339 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
3340
3341 static void
3342 decode_eol (coding, source, destination, src_bytes, dst_bytes)
3343 struct coding_system *coding;
3344 const unsigned char *source;
3345 unsigned char *destination;
3346 int src_bytes, dst_bytes;
3347 {
3348 const unsigned char *src = source;
3349 unsigned char *dst = destination;
3350 const unsigned char *src_end = src + src_bytes;
3351 unsigned char *dst_end = dst + dst_bytes;
3352 Lisp_Object translation_table;
3353 /* SRC_BASE remembers the start position in source in each loop.
3354 The loop will be exited when there's not enough source code
3355 (within macro ONE_MORE_BYTE), or when there's not enough
3356 destination area to produce a character (within macro
3357 EMIT_CHAR). */
3358 const unsigned char *src_base;
3359 int c;
3360
3361 translation_table = Qnil;
3362 switch (coding->eol_type)
3363 {
3364 case CODING_EOL_CRLF:
3365 while (1)
3366 {
3367 src_base = src;
3368 ONE_MORE_BYTE (c);
3369 if (c == '\r')
3370 {
3371 ONE_MORE_BYTE (c);
3372 if (c != '\n')
3373 {
3374 src--;
3375 c = '\r';
3376 }
3377 }
3378 else if (c == '\n'
3379 && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL))
3380 {
3381 coding->result = CODING_FINISH_INCONSISTENT_EOL;
3382 goto label_end_of_loop;
3383 }
3384 EMIT_CHAR (c);
3385 }
3386 break;
3387
3388 case CODING_EOL_CR:
3389 while (1)
3390 {
3391 src_base = src;
3392 ONE_MORE_BYTE (c);
3393 if (c == '\n')
3394 {
3395 if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
3396 {
3397 coding->result = CODING_FINISH_INCONSISTENT_EOL;
3398 goto label_end_of_loop;
3399 }
3400 }
3401 else if (c == '\r')
3402 c = '\n';
3403 EMIT_CHAR (c);
3404 }
3405 break;
3406
3407 default: /* no need for EOL handling */
3408 while (1)
3409 {
3410 src_base = src;
3411 ONE_MORE_BYTE (c);
3412 EMIT_CHAR (c);
3413 }
3414 }
3415
3416 label_end_of_loop:
3417 coding->consumed = coding->consumed_char = src_base - source;
3418 coding->produced = dst - destination;
3419 return;
3420 }
3421
3422 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions". Encode
3423 format of end-of-line according to `coding->eol_type'. It also
3424 convert multibyte form 8-bit characters to unibyte if
3425 CODING->src_multibyte is nonzero. If `coding->mode &
3426 CODING_MODE_SELECTIVE_DISPLAY' is nonzero, code '\r' in source text
3427 also means end-of-line. */
3428
3429 static void
3430 encode_eol (coding, source, destination, src_bytes, dst_bytes)
3431 struct coding_system *coding;
3432 const unsigned char *source;
3433 unsigned char *destination;
3434 int src_bytes, dst_bytes;
3435 {
3436 const unsigned char *src = source;
3437 unsigned char *dst = destination;
3438 const unsigned char *src_end = src + src_bytes;
3439 unsigned char *dst_end = dst + dst_bytes;
3440 Lisp_Object translation_table;
3441 /* SRC_BASE remembers the start position in source in each loop.
3442 The loop will be exited when there's not enough source text to
3443 analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
3444 there's not enough destination area to produce encoded codes
3445 (within macro EMIT_BYTES). */
3446 const unsigned char *src_base;
3447 unsigned char *tmp;
3448 int c;
3449 int selective_display = coding->mode & CODING_MODE_SELECTIVE_DISPLAY;
3450
3451 translation_table = Qnil;
3452 if (coding->src_multibyte
3453 && *(src_end - 1) == LEADING_CODE_8_BIT_CONTROL)
3454 {
3455 src_end--;
3456 src_bytes--;
3457 coding->result = CODING_FINISH_INSUFFICIENT_SRC;
3458 }
3459
3460 if (coding->eol_type == CODING_EOL_CRLF)
3461 {
3462 while (src < src_end)
3463 {
3464 src_base = src;
3465 c = *src++;
3466 if (c >= 0x20)
3467 EMIT_ONE_BYTE (c);
3468 else if (c == '\n' || (c == '\r' && selective_display))
3469 EMIT_TWO_BYTES ('\r', '\n');
3470 else
3471 EMIT_ONE_BYTE (c);
3472 }
3473 src_base = src;
3474 label_end_of_loop:
3475 ;
3476 }
3477 else
3478 {
3479 if (!dst_bytes || src_bytes <= dst_bytes)
3480 {
3481 safe_bcopy (src, dst, src_bytes);
3482 src_base = src_end;
3483 dst += src_bytes;
3484 }
3485 else
3486 {
3487 if (coding->src_multibyte
3488 && *(src + dst_bytes - 1) == LEADING_CODE_8_BIT_CONTROL)
3489 dst_bytes--;
3490 safe_bcopy (src, dst, dst_bytes);
3491 src_base = src + dst_bytes;
3492 dst = destination + dst_bytes;
3493 coding->result = CODING_FINISH_INSUFFICIENT_DST;
3494 }
3495 if (coding->eol_type == CODING_EOL_CR)
3496 {
3497 for (tmp = destination; tmp < dst; tmp++)
3498 if (*tmp == '\n') *tmp = '\r';
3499 }
3500 else if (selective_display)
3501 {
3502 for (tmp = destination; tmp < dst; tmp++)
3503 if (*tmp == '\r') *tmp = '\n';
3504 }
3505 }
3506 if (coding->src_multibyte)
3507 dst = destination + str_as_unibyte (destination, dst - destination);
3508
3509 coding->consumed = src_base - source;
3510 coding->produced = dst - destination;
3511 coding->produced_char = coding->produced;
3512 }
3513
3514 \f
3515 /*** 7. C library functions ***/
3516
3517 /* In Emacs Lisp, a coding system is represented by a Lisp symbol which
3518 has a property `coding-system'. The value of this property is a
3519 vector of length 5 (called the coding-vector). Among elements of
3520 this vector, the first (element[0]) and the fifth (element[4])
3521 carry important information for decoding/encoding. Before
3522 decoding/encoding, this information should be set in fields of a
3523 structure of type `coding_system'.
3524
3525 The value of the property `coding-system' can be a symbol of another
3526 subsidiary coding-system. In that case, Emacs gets coding-vector
3527 from that symbol.
3528
3529 `element[0]' contains information to be set in `coding->type'. The
3530 value and its meaning is as follows:
3531
3532 0 -- coding_type_emacs_mule
3533 1 -- coding_type_sjis
3534 2 -- coding_type_iso2022
3535 3 -- coding_type_big5
   4 -- coding_type_ccl  encoder/decoder written in CCL
   5 -- coding_type_raw_text
3537 nil -- coding_type_no_conversion
3538 t -- coding_type_undecided (automatic conversion on decoding,
3539 no-conversion on encoding)
3540
3541 `element[4]' contains information to be set in `coding->flags' and
3542 `coding->spec'. The meaning varies by `coding->type'.
3543
   If `coding->type' is `coding_type_iso2022', element[4] is a vector
   of length 32 (of which the first 17 sub-elements are used now).
3546 Meanings of these sub-elements are:
3547
3548 sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
3549 If the value is an integer of valid charset, the charset is
3550 assumed to be designated to graphic register N initially.
3551
3552 If the value is minus, it is a minus value of charset which
3553 reserves graphic register N, which means that the charset is
3554 not designated initially but should be designated to graphic
3555 register N just before encoding a character in that charset.
3556
3557 If the value is nil, graphic register N is never used on
3558 encoding.
3559
   sub-element[N] where N is 4 through 16: to be set in `coding->flags'
3561 Each value takes t or nil. See the section ISO2022 of
3562 `coding.h' for more information.
3563
3564 If `coding->type' is `coding_type_big5', element[4] is t to denote
3565 BIG5-ETen or nil to denote BIG5-HKU.
3566
3567 If `coding->type' takes the other value, element[4] is ignored.
3568
3569 Emacs Lisp's coding systems also carry information about format of
3570 end-of-line in a value of property `eol-type'. If the value is
3571 integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
3572 means CODING_EOL_CR. If it is not integer, it should be a vector
3573 of subsidiary coding systems of which property `eol-type' has one
3574 of the above values.
3575
3576 */
3577
3578 /* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
3579 and set it in CODING. If CODING_SYSTEM_SYMBOL is invalid, CODING
3580 is setup so that no conversion is necessary and return -1, else
3581 return 0. */
3582
3583 int
3584 setup_coding_system (coding_system, coding)
3585 Lisp_Object coding_system;
3586 struct coding_system *coding;
3587 {
3588 Lisp_Object coding_spec, coding_type, eol_type, plist;
3589 Lisp_Object val;
3590
3591 /* At first, zero clear all members. */
3592 bzero (coding, sizeof (struct coding_system));
3593
3594 /* Initialize some fields required for all kinds of coding systems. */
3595 coding->symbol = coding_system;
3596 coding->heading_ascii = -1;
3597 coding->post_read_conversion = coding->pre_write_conversion = Qnil;
3598 coding->composing = COMPOSITION_DISABLED;
3599 coding->cmp_data = NULL;
3600
3601 if (NILP (coding_system))
3602 goto label_invalid_coding_system;
3603
3604 coding_spec = Fget (coding_system, Qcoding_system);
3605
3606 if (!VECTORP (coding_spec)
3607 || XVECTOR (coding_spec)->size != 5
3608 || !CONSP (XVECTOR (coding_spec)->contents[3]))
3609 goto label_invalid_coding_system;
3610
3611 eol_type = inhibit_eol_conversion ? Qnil : Fget (coding_system, Qeol_type);
3612 if (VECTORP (eol_type))
3613 {
3614 coding->eol_type = CODING_EOL_UNDECIDED;
3615 coding->common_flags = CODING_REQUIRE_DETECTION_MASK;
3616 if (system_eol_type != CODING_EOL_LF)
3617 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
3618 }
3619 else if (XFASTINT (eol_type) == 1)
3620 {
3621 coding->eol_type = CODING_EOL_CRLF;
3622 coding->common_flags
3623 = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3624 }
3625 else if (XFASTINT (eol_type) == 2)
3626 {
3627 coding->eol_type = CODING_EOL_CR;
3628 coding->common_flags
3629 = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3630 }
3631 else
3632 {
3633 coding->common_flags = 0;
3634 coding->eol_type = CODING_EOL_LF;
3635 }
3636
3637 coding_type = XVECTOR (coding_spec)->contents[0];
3638 /* Try short cut. */
3639 if (SYMBOLP (coding_type))
3640 {
3641 if (EQ (coding_type, Qt))
3642 {
3643 coding->type = coding_type_undecided;
3644 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
3645 }
3646 else
3647 coding->type = coding_type_no_conversion;
3648 /* Initialize this member. Any thing other than
3649 CODING_CATEGORY_IDX_UTF_16_BE and
3650 CODING_CATEGORY_IDX_UTF_16_LE are ok because they have
3651 special treatment in detect_eol. */
3652 coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE;
3653
3654 return 0;
3655 }
3656
3657 /* Get values of coding system properties:
3658 `post-read-conversion', `pre-write-conversion',
3659 `translation-table-for-decode', `translation-table-for-encode'. */
3660 plist = XVECTOR (coding_spec)->contents[3];
3661 /* Pre & post conversion functions should be disabled if
3662 inhibit_eol_conversion is nonzero. This is the case that a code
3663 conversion function is called while those functions are running. */
3664 if (! inhibit_pre_post_conversion)
3665 {
3666 coding->post_read_conversion = Fplist_get (plist, Qpost_read_conversion);
3667 coding->pre_write_conversion = Fplist_get (plist, Qpre_write_conversion);
3668 }
3669 val = Fplist_get (plist, Qtranslation_table_for_decode);
3670 if (SYMBOLP (val))
3671 val = Fget (val, Qtranslation_table_for_decode);
3672 coding->translation_table_for_decode = CHAR_TABLE_P (val) ? val : Qnil;
3673 val = Fplist_get (plist, Qtranslation_table_for_encode);
3674 if (SYMBOLP (val))
3675 val = Fget (val, Qtranslation_table_for_encode);
3676 coding->translation_table_for_encode = CHAR_TABLE_P (val) ? val : Qnil;
3677 val = Fplist_get (plist, Qcoding_category);
3678 if (!NILP (val))
3679 {
3680 val = Fget (val, Qcoding_category_index);
3681 if (INTEGERP (val))
3682 coding->category_idx = XINT (val);
3683 else
3684 goto label_invalid_coding_system;
3685 }
3686 else
3687 goto label_invalid_coding_system;
3688
3689 /* If the coding system has non-nil `composition' property, enable
3690 composition handling. */
3691 val = Fplist_get (plist, Qcomposition);
3692 if (!NILP (val))
3693 coding->composing = COMPOSITION_NO;
3694
3695 /* If the coding system is ascii-incompatible, record it in
3696 common_flags. */
3697 val = Fplist_get (plist, Qascii_incompatible);
3698 if (! NILP (val))
3699 coding->common_flags |= CODING_ASCII_INCOMPATIBLE_MASK;
3700
3701 switch (XFASTINT (coding_type))
3702 {
3703 case 0:
3704 coding->type = coding_type_emacs_mule;
3705 coding->common_flags
3706 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3707 if (!NILP (coding->post_read_conversion))
3708 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
3709 if (!NILP (coding->pre_write_conversion))
3710 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
3711 break;
3712
3713 case 1:
3714 coding->type = coding_type_sjis;
3715 coding->common_flags
3716 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3717 break;
3718
3719 case 2:
3720 coding->type = coding_type_iso2022;
3721 coding->common_flags
3722 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3723 {
3724 Lisp_Object val, temp;
3725 Lisp_Object *flags;
3726 int i, charset, reg_bits = 0;
3727
3728 val = XVECTOR (coding_spec)->contents[4];
3729
3730 if (!VECTORP (val) || XVECTOR (val)->size != 32)
3731 goto label_invalid_coding_system;
3732
3733 flags = XVECTOR (val)->contents;
3734 coding->flags
3735 = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
3736 | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
3737 | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
3738 | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
3739 | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
3740 | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
3741 | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
3742 | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
3743 | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION)
3744 | (NILP (flags[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL)
3745 | (NILP (flags[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL)
3746 | (NILP (flags[15]) ? 0 : CODING_FLAG_ISO_SAFE)
3747 | (NILP (flags[16]) ? 0 : CODING_FLAG_ISO_LATIN_EXTRA)
3748 );
3749
3750 /* Invoke graphic register 0 to plane 0. */
3751 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
3752 /* Invoke graphic register 1 to plane 1 if we can use full 8-bit. */
3753 CODING_SPEC_ISO_INVOCATION (coding, 1)
3754 = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
3755 /* Not single shifting at first. */
3756 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;
3757 /* Beginning of buffer should also be regarded as bol. */
3758 CODING_SPEC_ISO_BOL (coding) = 1;
3759
3760 for (charset = 0; charset <= MAX_CHARSET; charset++)
3761 CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = 255;
3762 val = Vcharset_revision_alist;
3763 while (CONSP (val))
3764 {
3765 charset = get_charset_id (Fcar_safe (XCAR (val)));
3766 if (charset >= 0
3767 && (temp = Fcdr_safe (XCAR (val)), INTEGERP (temp))
3768 && (i = XINT (temp), (i >= 0 && (i + '@') < 128)))
3769 CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = i;
3770 val = XCDR (val);
3771 }
3772
3773 /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
3774 FLAGS[REG] can be one of below:
3775 integer CHARSET: CHARSET occupies register I,
3776 t: designate nothing to REG initially, but can be used
3777 by any charsets,
3778 list of integer, nil, or t: designate the first
3779 element (if integer) to REG initially, the remaining
3780 elements (if integer) is designated to REG on request,
3781 if an element is t, REG can be used by any charsets,
3782 nil: REG is never used. */
3783 for (charset = 0; charset <= MAX_CHARSET; charset++)
3784 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3785 = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION;
3786 for (i = 0; i < 4; i++)
3787 {
3788 if ((INTEGERP (flags[i])
3789 && (charset = XINT (flags[i]), CHARSET_VALID_P (charset)))
3790 || (charset = get_charset_id (flags[i])) >= 0)
3791 {
3792 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3793 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
3794 }
3795 else if (EQ (flags[i], Qt))
3796 {
3797 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3798 reg_bits |= 1 << i;
3799 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
3800 }
3801 else if (CONSP (flags[i]))
3802 {
3803 Lisp_Object tail;
3804 tail = flags[i];
3805
3806 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
3807 if ((INTEGERP (XCAR (tail))
3808 && (charset = XINT (XCAR (tail)),
3809 CHARSET_VALID_P (charset)))
3810 || (charset = get_charset_id (XCAR (tail))) >= 0)
3811 {
3812 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3813 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
3814 }
3815 else
3816 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3817 tail = XCDR (tail);
3818 while (CONSP (tail))
3819 {
3820 if ((INTEGERP (XCAR (tail))
3821 && (charset = XINT (XCAR (tail)),
3822 CHARSET_VALID_P (charset)))
3823 || (charset = get_charset_id (XCAR (tail))) >= 0)
3824 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3825 = i;
3826 else if (EQ (XCAR (tail), Qt))
3827 reg_bits |= 1 << i;
3828 tail = XCDR (tail);
3829 }
3830 }
3831 else
3832 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3833
3834 CODING_SPEC_ISO_DESIGNATION (coding, i)
3835 = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
3836 }
3837
3838 if (reg_bits && ! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
3839 {
3840 /* REG 1 can be used only by locking shift in 7-bit env. */
3841 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
3842 reg_bits &= ~2;
3843 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
3844 /* Without any shifting, only REG 0 and 1 can be used. */
3845 reg_bits &= 3;
3846 }
3847
3848 if (reg_bits)
3849 for (charset = 0; charset <= MAX_CHARSET; charset++)
3850 {
3851 if (CHARSET_DEFINED_P (charset)
3852 && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3853 == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
3854 {
3855 /* There exist some default graphic registers to be
3856 used by CHARSET. */
3857
3858 /* We had better avoid designating a charset of
3859 CHARS96 to REG 0 as far as possible. */
3860 if (CHARSET_CHARS (charset) == 96)
3861 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3862 = (reg_bits & 2
3863 ? 1 : (reg_bits & 4 ? 2 : (reg_bits & 8 ? 3 : 0)));
3864 else
3865 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3866 = (reg_bits & 1
3867 ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
3868 }
3869 }
3870 }
3871 coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
3872 coding->spec.iso2022.last_invalid_designation_register = -1;
3873 break;
3874
3875 case 3:
3876 coding->type = coding_type_big5;
3877 coding->common_flags
3878 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3879 coding->flags
3880 = (NILP (XVECTOR (coding_spec)->contents[4])
3881 ? CODING_FLAG_BIG5_HKU
3882 : CODING_FLAG_BIG5_ETEN);
3883 break;
3884
3885 case 4:
3886 coding->type = coding_type_ccl;
3887 coding->common_flags
3888 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3889 {
3890 val = XVECTOR (coding_spec)->contents[4];
3891 if (! CONSP (val)
3892 || setup_ccl_program (&(coding->spec.ccl.decoder),
3893 XCAR (val)) < 0
3894 || setup_ccl_program (&(coding->spec.ccl.encoder),
3895 XCDR (val)) < 0)
3896 goto label_invalid_coding_system;
3897
3898 bzero (coding->spec.ccl.valid_codes, 256);
3899 val = Fplist_get (plist, Qvalid_codes);
3900 if (CONSP (val))
3901 {
3902 Lisp_Object this;
3903
3904 for (; CONSP (val); val = XCDR (val))
3905 {
3906 this = XCAR (val);
3907 if (INTEGERP (this)
3908 && XINT (this) >= 0 && XINT (this) < 256)
3909 coding->spec.ccl.valid_codes[XINT (this)] = 1;
3910 else if (CONSP (this)
3911 && INTEGERP (XCAR (this))
3912 && INTEGERP (XCDR (this)))
3913 {
3914 int start = XINT (XCAR (this));
3915 int end = XINT (XCDR (this));
3916
3917 if (start >= 0 && start <= end && end < 256)
3918 while (start <= end)
3919 coding->spec.ccl.valid_codes[start++] = 1;
3920 }
3921 }
3922 }
3923 }
3924 coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
3925 coding->spec.ccl.cr_carryover = 0;
3926 coding->spec.ccl.eight_bit_carryover[0] = 0;
3927 break;
3928
3929 case 5:
3930 coding->type = coding_type_raw_text;
3931 break;
3932
3933 default:
3934 goto label_invalid_coding_system;
3935 }
3936 return 0;
3937
3938 label_invalid_coding_system:
3939 coding->type = coding_type_no_conversion;
3940 coding->category_idx = CODING_CATEGORY_IDX_BINARY;
3941 coding->common_flags = 0;
3942 coding->eol_type = CODING_EOL_UNDECIDED;
3943 coding->pre_write_conversion = coding->post_read_conversion = Qnil;
3944 return NILP (coding_system) ? 0 : -1;
3945 }
3946
3947 /* Free memory blocks allocated for storing composition information. */
3948
3949 void
3950 coding_free_composition_data (coding)
3951 struct coding_system *coding;
3952 {
3953 struct composition_data *cmp_data = coding->cmp_data, *next;
3954
3955 if (!cmp_data)
3956 return;
3957 /* Memory blocks are chained. At first, rewind to the first, then,
3958 free blocks one by one. */
3959 while (cmp_data->prev)
3960 cmp_data = cmp_data->prev;
3961 while (cmp_data)
3962 {
3963 next = cmp_data->next;
3964 xfree (cmp_data);
3965 cmp_data = next;
3966 }
3967 coding->cmp_data = NULL;
3968 }
3969
3970 /* Set `char_offset' member of all memory blocks pointed by
3971 coding->cmp_data to POS. */
3972
3973 void
3974 coding_adjust_composition_offset (coding, pos)
3975 struct coding_system *coding;
3976 int pos;
3977 {
3978 struct composition_data *cmp_data;
3979
3980 for (cmp_data = coding->cmp_data; cmp_data; cmp_data = cmp_data->next)
3981 cmp_data->char_offset = pos;
3982 }
3983
3984 /* Setup raw-text or one of its subsidiaries in the structure
3985 coding_system CODING according to the already setup value eol_type
3986 in CODING. CODING should be setup for some coding system in
3987 advance. */
3988
3989 void
3990 setup_raw_text_coding_system (coding)
3991 struct coding_system *coding;
3992 {
3993 if (coding->type != coding_type_raw_text)
3994 {
3995 coding->symbol = Qraw_text;
3996 coding->type = coding_type_raw_text;
3997 if (coding->eol_type != CODING_EOL_UNDECIDED)
3998 {
3999 Lisp_Object subsidiaries;
4000 subsidiaries = Fget (Qraw_text, Qeol_type);
4001
4002 if (VECTORP (subsidiaries)
4003 && XVECTOR (subsidiaries)->size == 3)
4004 coding->symbol
4005 = XVECTOR (subsidiaries)->contents[coding->eol_type];
4006 }
4007 setup_coding_system (coding->symbol, coding);
4008 }
4009 return;
4010 }
4011
4012 /* Emacs has a mechanism to automatically detect a coding system if it
4013 is one of Emacs' internal format, ISO2022, SJIS, BIG5, UTF-8, or UTF-16.
4014 But it is impossible to distinguish some coding systems accurately
4015 because they use the same range of codes. So, at first, coding
4016 systems are grouped into the following categories:
4017
4018 o coding-category-emacs-mule
4019
4020 The category for a coding system which has the same code range
4021 as Emacs' internal format. Assigned the coding-system (Lisp
4022 symbol) `emacs-mule' by default.
4023
4024 o coding-category-sjis
4025
4026 The category for a coding system which has the same code range
4027 as SJIS. Assigned the coding-system (Lisp
4028 symbol) `japanese-shift-jis' by default.
4029
4030 o coding-category-iso-7
4031
4032 The category for a coding system which has the same code range
4033 as ISO2022 of 7-bit environment. This doesn't use any locking
4034 shift and single shift functions. This can encode/decode all
4035 charsets. Assigned the coding-system (Lisp symbol)
4036 `iso-2022-7bit' by default.
4037
4038 o coding-category-iso-7-tight
4039
4040 Same as coding-category-iso-7 except that this can
4041 encode/decode only the specified charsets.
4042
4043 o coding-category-iso-8-1
4044
4045 The category for a coding system which has the same code range
4046 as ISO2022 of 8-bit environment and graphic plane 1 used only
4047 for DIMENSION1 charset. This doesn't use any locking shift
4048 and single shift functions. Assigned the coding-system (Lisp
4049 symbol) `iso-latin-1' by default.
4050
4051 o coding-category-iso-8-2
4052
4053 The category for a coding system which has the same code range
4054 as ISO2022 of 8-bit environment and graphic plane 1 used only
4055 for DIMENSION2 charset. This doesn't use any locking shift
4056 and single shift functions. Assigned the coding-system (Lisp
4057 symbol) `japanese-iso-8bit' by default.
4058
4059 o coding-category-iso-7-else
4060
4061 The category for a coding system which has the same code range
4062 as ISO2022 of 7-bit environment but uses locking shift or
4063 single shift functions. Assigned the coding-system (Lisp
4064 symbol) `iso-2022-7bit-lock' by default.
4065
4066 o coding-category-iso-8-else
4067
4068 The category for a coding system which has the same code range
4069 as ISO2022 of 8-bit environment but uses locking shift or
4070 single shift functions. Assigned the coding-system (Lisp
4071 symbol) `iso-2022-8bit-ss2' by default.
4072
4073 o coding-category-big5
4074
4075 The category for a coding system which has the same code range
4076 as BIG5. Assigned the coding-system (Lisp symbol)
4077 `cn-big5' by default.
4078
4079 o coding-category-utf-8
4080
4081 The category for a coding system which has the same code range
4082 as UTF-8 (cf. RFC3629). Assigned the coding-system (Lisp
4083 symbol) `utf-8' by default.
4084
4085 o coding-category-utf-16-be
4086
4087 The category for a coding system in which a text has an
4088 Unicode signature (cf. Unicode Standard) in the order of BIG
4089 endian at the head. Assigned the coding-system (Lisp symbol)
4090 `utf-16-be' by default.
4091
4092 o coding-category-utf-16-le
4093
4094 The category for a coding system in which a text has an
4095 Unicode signature (cf. Unicode Standard) in the order of
4096 LITTLE endian at the head. Assigned the coding-system (Lisp
4097 symbol) `utf-16-le' by default.
4098
4099 o coding-category-ccl
4100
4101 The category for a coding system of which encoder/decoder is
4102 written in CCL programs. The default value is nil, i.e., no
4103 coding system is assigned.
4104
4105 o coding-category-binary
4106
4107 The category for a coding system not categorized in any of the
4108 above. Assigned the coding-system (Lisp symbol)
4109 `no-conversion' by default.
4110
4111 Each of them is a Lisp symbol and the value is an actual
4112 `coding-system' (this is also a Lisp symbol) assigned by a user.
4113 What Emacs does actually is to detect a category of coding system.
4114 Then, it uses a `coding-system' assigned to it. If Emacs can't
4115 decide a single possible category, it selects a category of the
4116 highest priority. Priorities of categories are also specified by a
4117 user in a Lisp variable `coding-category-list'.
4118
4119 */
4120
/* Per-byte skip table consulted by detect_coding_mask: a byte whose
   entry is nonzero is stepped over while scanning the leading part
   of the text.  detect_coding_mask clears the entries for
   ISO_CODE_SO, ISO_CODE_SI and ISO_CODE_ESC on entry and may set
   them again while retrying.  */
4121 static
4122 int ascii_skip_code[256];
4123
4124 /* Detect how a text of length SRC_BYTES pointed by SOURCE is encoded.
4125 If it detects possible coding systems, return an integer in which
4126 appropriate flag bits are set. Flag bits are defined by macros
4127 CODING_CATEGORY_MASK_XXX in `coding.h'. If PRIORITIES is non-NULL,
4128 it should point the table `coding_priorities'. In that case, only
4129 the flag bit for a coding system of the highest priority is set in
4130 the returned value. If MULTIBYTEP is nonzero, 8-bit codes of the
4131 range 0x80..0x9F are in multibyte form.
4132
4133 How many ASCII characters are at the head is returned as *SKIP. */
/* Return value as implemented below:
   - 0 when the whole text consists of skippable bytes;
   - with PRIORITIES: the first entry of PRIORITIES that matches a
     detected category, or CODING_CATEGORY_MASK_RAW_TEXT if none does;
   - without PRIORITIES: all detected category bits, always combined
     with CODING_CATEGORY_MASK_RAW_TEXT and CODING_CATEGORY_MASK_BINARY.  */
4134
4135 static int
4136 detect_coding_mask (source, src_bytes, priorities, skip, multibytep)
4137 unsigned char *source;
4138 int src_bytes, *priorities, *skip;
4139 int multibytep;
4140 {
4141 register unsigned char c;
4142 unsigned char *src = source, *src_end = source + src_bytes;
4143 unsigned int mask, utf16_examined_p, iso2022_examined_p;
4144 int i;
4145 int null_byte_found;
4146 int latin_extra_code_state = 1;
4147
4148 /* At first, skip all ASCII characters and control characters except
4149 for three ISO2022 specific control characters. */
/* These three entries are reset on every call because the retry
   path further down sets them to 1.  */
4150 ascii_skip_code[ISO_CODE_SO] = 0;
4151 ascii_skip_code[ISO_CODE_SI] = 0;
4152 ascii_skip_code[ISO_CODE_ESC] = 0;
4153
4154 label_loop_detect_coding:
4155 null_byte_found = 0;
4156 /* We stop this loop before the last byte because it may be a NULL
4157 anchor byte. */
4158 while (src < src_end - 1 && ascii_skip_code[*src])
4159 null_byte_found |= (! *src++);
/* Examine the byte at which the loop stopped.  If it is skippable
   too, step past it; otherwise, when no NUL byte has been seen so
   far, scan the rest of the text (except its final byte) for one.  */
4160 if (ascii_skip_code[*src])
4161 src++;
4162 else if (! null_byte_found)
4163 {
4164 unsigned char *p = src + 1;
4165 while (p < src_end - 1)
4166 null_byte_found |= (! *p++);
4167 }
4168 *skip = src - source;
4169
4170 if (src >= src_end)
4171 /* We found nothing other than ASCII (and NULL byte). There's
4172 nothing to do. */
4173 return 0;
4174
4175 c = *src;
4176 /* The text seems to be encoded in some multilingual coding system.
4177 Now, try to find in which coding system the text is encoded. */
4178 if (! null_byte_found && c < 0x80)
4179 {
4180 /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
4181 /* C is an ISO2022 specific control code of C0. */
4182 latin_extra_code_state = 1;
4183 mask = detect_coding_iso2022 (src, src_end, multibytep,
4184 &latin_extra_code_state);
4185 if (mask == 0)
4186 {
4187 /* No valid ISO2022 code follows C. Try again. */
/* Mark the offending control code(s) as skippable so that the
   restarted scan does not stop at them again.  */
4188 src++;
4189 if (c == ISO_CODE_ESC)
4190 ascii_skip_code[ISO_CODE_ESC] = 1;
4191 else
4192 ascii_skip_code[ISO_CODE_SO] = ascii_skip_code[ISO_CODE_SI] = 1;
4193 goto label_loop_detect_coding;
4194 }
4195 if (priorities)
4196 {
4197 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
4198 {
4199 if (mask & priorities[i])
4200 return priorities[i];
4201 }
4202 return CODING_CATEGORY_MASK_RAW_TEXT;
4203 }
4204 }
4205 else
4206 {
/* TRY collects the categories worth testing for the first
   non-skippable byte C.  */
4207 int try;
4208
4209 if (multibytep && c == LEADING_CODE_8_BIT_CONTROL)
4210 c = src[1] - 0x20;
4211
4212 if (null_byte_found)
4213 {
4214 try = (CODING_CATEGORY_MASK_UTF_16_BE
4215 | CODING_CATEGORY_MASK_UTF_16_LE);
4216 }
4217 else if (c < 0xA0)
4218 {
4219 /* C is the first byte of SJIS character code,
4220 or a leading-code of Emacs' internal format (emacs-mule),
4221 or the first byte of UTF-16. */
4222 try = (CODING_CATEGORY_MASK_SJIS
4223 | CODING_CATEGORY_MASK_EMACS_MULE
4224 | CODING_CATEGORY_MASK_UTF_16_BE
4225 | CODING_CATEGORY_MASK_UTF_16_LE);
4226
4227 /* Or, if C is a special latin extra code,
4228 or is an ISO2022 specific control code of C1 (SS2 or SS3),
4229 or is an ISO2022 control-sequence-introducer (CSI),
4230 we should also consider the possibility of ISO2022 codings. */
/* NOTE(review): SRC still points at the byte C was read from (or at
   the leading code when MULTIBYTEP), so the `*src == ']'' tests below
   look at that byte rather than at the one following it -- confirm
   whether SRC[1] and SRC[2] were intended here.  */
4231 if ((latin_extra_code_state
4232 && VECTORP (Vlatin_extra_code_table)
4233 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
4234 || (c == ISO_CODE_SS2 || c == ISO_CODE_SS3)
4235 || (c == ISO_CODE_CSI
4236 && (src < src_end
4237 && (*src == ']'
4238 || ((*src == '0' || *src == '1' || *src == '2')
4239 && src + 1 < src_end
4240 && src[1] == ']')))))
4241 try |= (CODING_CATEGORY_MASK_ISO_8_ELSE
4242 | CODING_CATEGORY_MASK_ISO_8BIT);
4243 }
4244 else
4245 /* C is a character of ISO2022 in graphic plane right,
4246 or a SJIS's 1-byte character code (i.e. JISX0201),
4247 or the first byte of BIG5's 2-byte code,
4248 or the first byte of UTF-8/16. */
4249 try = (CODING_CATEGORY_MASK_ISO_8_ELSE
4250 | CODING_CATEGORY_MASK_ISO_8BIT
4251 | CODING_CATEGORY_MASK_SJIS
4252 | CODING_CATEGORY_MASK_BIG5
4253 | CODING_CATEGORY_MASK_UTF_8
4254 | CODING_CATEGORY_MASK_UTF_16_BE
4255 | CODING_CATEGORY_MASK_UTF_16_LE);
4256
4257 /* Or, we may have to consider the possibility of CCL. */
4258 if (! null_byte_found
4259 && coding_system_table[CODING_CATEGORY_IDX_CCL]
4260 && (coding_system_table[CODING_CATEGORY_IDX_CCL]
4261 ->spec.ccl.valid_codes)[c])
4262 try |= CODING_CATEGORY_MASK_CCL;
4263
4264 mask = 0;
4265 if (priorities)
4266 {
4267 /* At first try detection with Latin extra codes not-allowed.
4268 If no proper coding system is found because of Latin extra
4269 codes, try detection with Latin extra codes allowed. */
4270 latin_extra_code_state = 0;
4271 label_retry:
/* The ISO2022 and UTF-16 detectors each cover several categories,
   so these flags make sure each runs at most once per pass.  */
4272 utf16_examined_p = iso2022_examined_p = 0;
4273 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
4274 {
4275 if (!iso2022_examined_p
4276 && (priorities[i] & try & CODING_CATEGORY_MASK_ISO))
4277 {
4278 mask |= detect_coding_iso2022 (src, src_end, multibytep,
4279 &latin_extra_code_state);
4280 iso2022_examined_p = 1;
4281 }
4282 else if (priorities[i] & try & CODING_CATEGORY_MASK_SJIS)
4283 mask |= detect_coding_sjis (src, src_end, multibytep);
4284 else if (priorities[i] & try & CODING_CATEGORY_MASK_UTF_8)
4285 mask |= detect_coding_utf_8 (src, src_end, multibytep);
4286 else if (!utf16_examined_p
4287 && (priorities[i] & try &
4288 CODING_CATEGORY_MASK_UTF_16_BE_LE))
4289 {
4290 mask |= detect_coding_utf_16 (src, src_end, multibytep);
4291 utf16_examined_p = 1;
4292 }
4293 else if (priorities[i] & try & CODING_CATEGORY_MASK_BIG5)
4294 mask |= detect_coding_big5 (src, src_end, multibytep);
4295 else if (priorities[i] & try & CODING_CATEGORY_MASK_EMACS_MULE)
4296 mask |= detect_coding_emacs_mule (src, src_end, multibytep);
4297 else if (priorities[i] & try & CODING_CATEGORY_MASK_CCL)
4298 mask |= detect_coding_ccl (src, src_end, multibytep);
4299 else if (priorities[i] & CODING_CATEGORY_MASK_RAW_TEXT)
4300 {
4301 if (latin_extra_code_state == 1)
4302 {
4303 /* Detection of ISO-2022 based coding system
4304 failed because of Latin extra codes. Before
4305 falling back to raw-text, try again with
4306 Latin extra codes allowed. */
4307 latin_extra_code_state = 2;
4308 try = (mask | CODING_CATEGORY_MASK_ISO_8_ELSE
4309 | CODING_CATEGORY_MASK_ISO_8BIT);
4310 goto label_retry;
4311 }
4312 mask |= CODING_CATEGORY_MASK_RAW_TEXT;
4313 }
4314 else if (priorities[i] & CODING_CATEGORY_MASK_BINARY)
4315 {
4316 if (latin_extra_code_state == 1)
4317 {
4318 /* See the above comment. */
4319 latin_extra_code_state = 2;
4320 try = (mask | CODING_CATEGORY_MASK_ISO_8_ELSE
4321 | CODING_CATEGORY_MASK_ISO_8BIT);
4322 goto label_retry;
4323 }
4324 mask |= CODING_CATEGORY_MASK_BINARY;
4325 }
/* Stop at the first priority entry for which a category has been
   detected so far.  */
4326 if (mask & priorities[i])
4327 return priorities[i];
4328 }
4329 return CODING_CATEGORY_MASK_RAW_TEXT;
4330 }
/* No priority table: run every detector selected in TRY and
   accumulate all the categories they report.  */
4331 if (try & CODING_CATEGORY_MASK_ISO)
4332 mask |= detect_coding_iso2022 (src, src_end, multibytep,
4333 &latin_extra_code_state);
4334 if (try & CODING_CATEGORY_MASK_SJIS)
4335 mask |= detect_coding_sjis (src, src_end, multibytep);
4336 if (try & CODING_CATEGORY_MASK_BIG5)
4337 mask |= detect_coding_big5 (src, src_end, multibytep);
4338 if (try & CODING_CATEGORY_MASK_UTF_8)
4339 mask |= detect_coding_utf_8 (src, src_end, multibytep);
4340 if (try & CODING_CATEGORY_MASK_UTF_16_BE_LE)
4341 mask |= detect_coding_utf_16 (src, src_end, multibytep);
4342 if (try & CODING_CATEGORY_MASK_EMACS_MULE)
4343 mask |= detect_coding_emacs_mule (src, src_end, multibytep);
4344 if (try & CODING_CATEGORY_MASK_CCL)
4345 mask |= detect_coding_ccl (src, src_end, multibytep);
4346 }
4347 return (mask | CODING_CATEGORY_MASK_RAW_TEXT | CODING_CATEGORY_MASK_BINARY);
4348 }
4349
4350 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
4351 The information of the detected coding system is set in CODING. */
4352
4353 void
4354 detect_coding (coding, src, src_bytes)
4355 struct coding_system *coding;
4356 const unsigned char *src;
4357 int src_bytes;
4358 {
4359 unsigned int idx;
4360 int skip, mask;
4361 Lisp_Object val;
4362
4363 val = Vcoding_category_list;
4364 mask = detect_coding_mask (src, src_bytes, coding_priorities, &skip,
4365 coding->src_multibyte);
4366 coding->heading_ascii = skip;
4367
4368 if (!mask) return;
4369
4370 /* We found a single coding system of the highest priority in MASK. */
4371 idx = 0;
4372 while (mask && ! (mask & 1)) mask >>= 1, idx++;
4373 if (! mask)
4374 idx = CODING_CATEGORY_IDX_RAW_TEXT;
4375
4376 val = find_symbol_value (XVECTOR (Vcoding_category_table)->contents[idx]);
4377
4378 if (coding->eol_type != CODING_EOL_UNDECIDED)
4379 {
4380 Lisp_Object tmp;
4381
4382 tmp = Fget (val, Qeol_type);
4383 if (VECTORP (tmp))
4384 val = XVECTOR (tmp)->contents[coding->eol_type];
4385 }
4386
4387 /* Setup this new coding system while preserving some slots. */
4388 {
4389 int src_multibyte = coding->src_multibyte;
4390 int dst_multibyte = coding->dst_multibyte;
4391
4392 setup_coding_system (val, coding);
4393 coding->src_multibyte = src_multibyte;
4394 coding->dst_multibyte = dst_multibyte;
4395 coding->heading_ascii = skip;
4396 }
4397 }
4398
4399 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
4400 SOURCE is encoded. Return one of CODING_EOL_LF, CODING_EOL_CRLF,
4401 CODING_EOL_CR, and CODING_EOL_UNDECIDED.
4402
4403 How many non-eol characters are at the head is returned as *SKIP. */
4404
4405 #define MAX_EOL_CHECK_COUNT 3
4406
4407 static int
4408 detect_eol_type (source, src_bytes, skip)
4409 const unsigned char *source;
4410 int src_bytes, *skip;
4411 {
4412 const unsigned char *src = source, *src_end = src + src_bytes;
4413 unsigned char c;
4414 int total = 0; /* How many end-of-lines are found so far. */
4415 int eol_type = CODING_EOL_UNDECIDED;
4416 int this_eol_type;
4417
4418 *skip = 0;
4419
4420 while (src < src_end && total < MAX_EOL_CHECK_COUNT)
4421 {
4422 c = *src++;
4423 if (c == '\n' || c == '\r')
4424 {
4425 if (*skip == 0)
4426 *skip = src - 1 - source;
4427 total++;
4428 if (c == '\n')
4429 this_eol_type = CODING_EOL_LF;
4430 else if (src >= src_end || *src != '\n')
4431 this_eol_type = CODING_EOL_CR;
4432 else
4433 this_eol_type = CODING_EOL_CRLF, src++;
4434
4435 if (eol_type == CODING_EOL_UNDECIDED)
4436 /* This is the first end-of-line. */
4437 eol_type = this_eol_type;
4438 else if (eol_type != this_eol_type)
4439 {
4440 /* The found type is different from what found before. */
4441 eol_type = CODING_EOL_INCONSISTENT;
4442 break;
4443 }
4444 }
4445 }
4446
4447 if (*skip == 0)
4448 *skip = src_end - source;
4449 return eol_type;
4450 }
4451
4452 /* Like detect_eol_type, but detect EOL type in 2-octet
4453 big-endian/little-endian format for coding systems utf-16-be and
4454 utf-16-le. */
4455
4456 static int
4457 detect_eol_type_in_2_octet_form (source, src_bytes, skip, big_endian_p)
4458 const unsigned char *source;
4459 int src_bytes, *skip, big_endian_p;
4460 {
4461 const unsigned char *src = source, *src_end = src + src_bytes;
4462 unsigned int c1, c2;
4463 int total = 0; /* How many end-of-lines are found so far. */
4464 int eol_type = CODING_EOL_UNDECIDED;
4465 int this_eol_type;
4466 int msb, lsb;
4467
4468 if (big_endian_p)
4469 msb = 0, lsb = 1;
4470 else
4471 msb = 1, lsb = 0;
4472
4473 *skip = 0;
4474
4475 while ((src + 1) < src_end && total < MAX_EOL_CHECK_COUNT)
4476 {
4477 c1 = (src[msb] << 8) | (src[lsb]);
4478 src += 2;
4479
4480 if (c1 == '\n' || c1 == '\r')
4481 {
4482 if (*skip == 0)
4483 *skip = src - 2 - source;
4484 total++;
4485 if (c1 == '\n')
4486 {
4487 this_eol_type = CODING_EOL_LF;
4488 }
4489 else
4490 {
4491 if ((src + 1) >= src_end)
4492 {
4493 this_eol_type = CODING_EOL_CR;
4494 }
4495 else
4496 {
4497 c2 = (src[msb] << 8) | (src[lsb]);
4498 if (c2 == '\n')
4499 this_eol_type = CODING_EOL_CRLF, src += 2;
4500 else
4501 this_eol_type = CODING_EOL_CR;
4502 }
4503 }
4504
4505 if (eol_type == CODING_EOL_UNDECIDED)
4506 /* This is the first end-of-line. */
4507 eol_type = this_eol_type;
4508 else if (eol_type != this_eol_type)
4509 {
4510 /* The found type is different from what found before. */
4511 eol_type = CODING_EOL_INCONSISTENT;
4512 break;
4513 }
4514 }
4515 }
4516
4517 if (*skip == 0)
4518 *skip = src_end - source;
4519 return eol_type;
4520 }
4521
4522 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
4523 is encoded. If it detects an appropriate format of end-of-line, it
4524 sets the information in *CODING. */
4525
4526 void
4527 detect_eol (coding, src, src_bytes)
4528 struct coding_system *coding;
4529 const unsigned char *src;
4530 int src_bytes;
4531 {
4532 Lisp_Object val;
4533 int skip;
4534 int eol_type;
4535
4536 switch (coding->category_idx)
4537 {
4538 case CODING_CATEGORY_IDX_UTF_16_BE:
4539 eol_type = detect_eol_type_in_2_octet_form (src, src_bytes, &skip, 1);
4540 break;
4541 case CODING_CATEGORY_IDX_UTF_16_LE:
4542 eol_type = detect_eol_type_in_2_octet_form (src, src_bytes, &skip, 0);
4543 break;
4544 default:
4545 eol_type = detect_eol_type (src, src_bytes, &skip);
4546 break;
4547 }
4548
4549 if (coding->heading_ascii > skip)
4550 coding->heading_ascii = skip;
4551 else
4552 skip = coding->heading_ascii;
4553
4554 if (eol_type == CODING_EOL_UNDECIDED)
4555 return;
4556 if (eol_type == CODING_EOL_INCONSISTENT)
4557 {
4558 #if 0
4559 /* This code is suppressed until we find a better way to
4560 distinguish raw text file and binary file. */
4561
4562 /* If we have already detected that the coding is raw-text, the
4563 coding should actually be no-conversion. */
4564 if (coding->type == coding_type_raw_text)
4565 {
4566 setup_coding_system (Qno_conversion, coding);
4567 return;
4568 }
4569 /* Else, let's decode only text code anyway. */
4570 #endif /* 0 */
4571 eol_type = CODING_EOL_LF;
4572 }
4573
4574 val = Fget (coding->symbol, Qeol_type);
4575 if (VECTORP (val) && XVECTOR (val)->size == 3)
4576 {
4577 int src_multibyte = coding->src_multibyte;
4578 int dst_multibyte = coding->dst_multibyte;
4579 struct composition_data *cmp_data = coding->cmp_data;
4580
4581 setup_coding_system (XVECTOR (val)->contents[eol_type], coding);
4582 coding->src_multibyte = src_multibyte;
4583 coding->dst_multibyte = dst_multibyte;
4584 coding->heading_ascii = skip;
4585 coding->cmp_data = cmp_data;
4586 }
4587 }
4588
/* Fixed number of bytes added to every conversion buffer size
   estimate (see decoding_buffer_size and encoding_buffer_size).  */
#define CONVERSION_BUFFER_EXTRA_ROOM 256

/* Factor by which source bytes are multiplied when estimating the
   size of a decoding buffer for CODING (a pointer to struct
   coding_system): 3 for ISO2022, the decoder's own buf_magnification
   for CCL, 2 otherwise.  The argument is parenthesized so that any
   pointer expression may be passed; note it is evaluated more than
   once.  */
#define DECODING_BUFFER_MAG(coding)                             \
  ((coding)->type == coding_type_iso2022                        \
   ? 3                                                          \
   : ((coding)->type == coding_type_ccl                         \
      ? (coding)->spec.ccl.decoder.buf_magnification            \
      : 2))
4597
4598 /* Return maximum size (bytes) of a buffer enough for decoding
4599 SRC_BYTES of text encoded in CODING. */
4600
4601 int
4602 decoding_buffer_size (coding, src_bytes)
4603 struct coding_system *coding;
4604 int src_bytes;
4605 {
4606 return (src_bytes * DECODING_BUFFER_MAG (coding)
4607 + CONVERSION_BUFFER_EXTRA_ROOM);
4608 }
4609
4610 /* Return maximum size (bytes) of a buffer enough for encoding
4611 SRC_BYTES of text to CODING. */
4612
4613 int
4614 encoding_buffer_size (coding, src_bytes)
4615 struct coding_system *coding;
4616 int src_bytes;
4617 {
4618 int magnification;
4619
4620 if (coding->type == coding_type_ccl)
4621 {
4622 magnification = coding->spec.ccl.encoder.buf_magnification;
4623 if (coding->eol_type == CODING_EOL_CRLF)
4624 magnification *= 2;
4625 }
4626 else if (CODING_REQUIRE_ENCODING (coding))
4627 magnification = 3;
4628 else
4629 magnification = 1;
4630
4631 return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
4632 }
4633
4634 /* Working buffer for code conversion. */
/* Filled in by allocate_conversion_buffer, grown by
   extend_conversion_buffer and released by free_conversion_buffer.  */
4635 struct conversion_buffer
4636 {
4637 int size; /* Number of bytes allocated for DATA. */
4638 int on_stack; /* 1 if DATA was allocated by alloca, 0 if by xmalloc/xrealloc. */
4639 unsigned char *data;
4640 };
4641
/* Allocate LEN bytes of memory for BUF (struct conversion_buffer).
   Requests smaller than MAX_ALLOCA are served by alloca, so this has
   to stay a macro: the memory then belongs to the calling function's
   stack frame.  Larger requests use xmalloc.  BUF.on_stack records
   which allocator was used and BUF.size is set to LEN.  Both
   arguments are parenthesized so that arbitrary expressions may be
   passed; note that each is evaluated more than once.  */
#define allocate_conversion_buffer(buf, len)            \
  do {                                                  \
    if ((len) < MAX_ALLOCA)                             \
      {                                                 \
        (buf).data = (unsigned char *) alloca (len);    \
        (buf).on_stack = 1;                             \
      }                                                 \
    else                                                \
      {                                                 \
        (buf).data = (unsigned char *) xmalloc (len);   \
        (buf).on_stack = 0;                             \
      }                                                 \
    (buf).size = (len);                                 \
  } while (0)
4657
4658 /* Double the allocated memory for *BUF. */
4659 static void
4660 extend_conversion_buffer (buf)
4661 struct conversion_buffer *buf;
4662 {
4663 if (buf->on_stack)
4664 {
4665 unsigned char *save = buf->data;
4666 buf->data = (unsigned char *) xmalloc (buf->size * 2);
4667 bcopy (save, buf->data, buf->size);
4668 buf->on_stack = 0;
4669 }
4670 else
4671 {
4672 buf->data = (unsigned char *) xrealloc (buf->data, buf->size * 2);
4673 }
4674 buf->size *= 2;
4675 }
4676
4677 /* Free the allocated memory for BUF if it is not on stack. */
4678 static void
4679 free_conversion_buffer (buf)
4680 struct conversion_buffer *buf;
4681 {
4682 if (!buf->on_stack)
4683 xfree (buf->data);
4684 }
4685
/* Run the CCL program of CODING -- the encoder if ENCODEP is nonzero,
   else the decoder -- on SRC_BYTES bytes at SOURCE, storing the
   result at DESTINATION.  DST_BYTES is handed on to ccl_driver as the
   destination size.

   Sets CODING->produced, CODING->produced_char, CODING->consumed and
   CODING->result, and returns CODING->result (one of the
   CODING_FINISH_XXX values).  When decoding, trailing 8-bit bytes that
   might be completed by the next call are held back in
   CODING->spec.ccl.eight_bit_carryover and emitted first on that
   call.  */
4686 int
4687 ccl_coding_driver (coding, source, destination, src_bytes, dst_bytes, encodep)
4688 struct coding_system *coding;
4689 unsigned char *source, *destination;
4690 int src_bytes, dst_bytes, encodep;
4691 {
4692 struct ccl_program *ccl
4693 = encodep ? &coding->spec.ccl.encoder : &coding->spec.ccl.decoder;
4694 unsigned char *dst = destination;
4695
4696 ccl->suppress_error = coding->suppress_error;
4697 ccl->last_block = coding->mode & CODING_MODE_LAST_BLOCK;
4698 if (encodep)
4699 {
4700 /* On encoding, EOL format is converted within ccl_driver. For
4701 that, setup proper information in the structure CCL. */
4702 ccl->eol_type = coding->eol_type;
4703 if (ccl->eol_type ==CODING_EOL_UNDECIDED)
4704 ccl->eol_type = CODING_EOL_LF;
4705 ccl->cr_consumed = coding->spec.ccl.cr_carryover;
4706 ccl->eight_bit_control = coding->dst_multibyte;
4707 }
4708 else
4709 ccl->eight_bit_control = 1;
4710 ccl->multibyte = coding->src_multibyte;
4711 if (coding->spec.ccl.eight_bit_carryover[0] != 0)
4712 {
4713 /* Move carryover bytes to DESTINATION. */
4714 unsigned char *p = coding->spec.ccl.eight_bit_carryover;
4715 while (*p)
4716 *dst++ = *p++;
4717 coding->spec.ccl.eight_bit_carryover[0] = 0;
/* A DST_BYTES of 0 is left untouched rather than reduced.  */
4718 if (dst_bytes)
4719 dst_bytes -= dst - destination;
4720 }
4721
/* The byte count includes the carryover bytes copied above.  */
4722 coding->produced = (ccl_driver (ccl, source, dst, src_bytes, dst_bytes,
4723 &(coding->consumed))
4724 + dst - destination);
4725
4726 if (encodep)
4727 {
4728 coding->produced_char = coding->produced;
4729 coding->spec.ccl.cr_carryover = ccl->cr_consumed;
4730 }
4731 else if (!ccl->eight_bit_control)
4732 {
4733 /* The produced bytes forms a valid multibyte sequence. */
4734 coding->produced_char
4735 = multibyte_chars_in_text (destination, coding->produced);
4736 coding->spec.ccl.eight_bit_carryover[0] = 0;
4737 }
4738 else
4739 {
4740 /* On decoding, the destination should always multibyte. But,
4741 CCL program might have been generated an invalid multibyte
4742 sequence. Here we make such a sequence valid as
4743 multibyte. */
/* NOTE(review): when DST_BYTES is 0 the size handed to
   str_as_multibyte is derived from the distance between SOURCE plus
   the consumed bytes and DESTINATION, which presumably assumes both
   lie in one buffer -- confirm against the callers.  */
4744 int bytes
4745 = dst_bytes ? dst_bytes : source + coding->consumed - destination;
4746
4747 if ((coding->consumed < src_bytes
4748 || !ccl->last_block)
4749 && coding->produced >= 1
4750 && destination[coding->produced - 1] >= 0x80)
4751 {
4752 /* We should not convert the tailing 8-bit codes to
4753 multibyte form even if they doesn't form a valid
4754 multibyte sequence. They may form a valid sequence in
4755 the next call. */
/* Look back over at most three trailing bytes for one in the range
   0x80..0x9F; CARRYOVER is the number of bytes from that byte to the
   end of the output.  */
4756 int carryover = 0;
4757
4758 if (destination[coding->produced - 1] < 0xA0)
4759 carryover = 1;
4760 else if (coding->produced >= 2)
4761 {
4762 if (destination[coding->produced - 2] >= 0x80)
4763 {
4764 if (destination[coding->produced - 2] < 0xA0)
4765 carryover = 2;
4766 else if (coding->produced >= 3
4767 && destination[coding->produced - 3] >= 0x80
4768 && destination[coding->produced - 3] < 0xA0)
4769 carryover = 3;
4770 }
4771 }
4772 if (carryover > 0)
4773 {
/* Stash the held-back bytes as a NUL-terminated sequence and drop
   them from this call's output.  */
4774 BCOPY_SHORT (destination + coding->produced - carryover,
4775 coding->spec.ccl.eight_bit_carryover,
4776 carryover);
4777 coding->spec.ccl.eight_bit_carryover[carryover] = 0;
4778 coding->produced -= carryover;
4779 }
4780 }
4781 coding->produced = str_as_multibyte (destination, bytes,
4782 coding->produced,
4783 &(coding->produced_char));
4784 }
4785
/* Translate the CCL interpreter status into a CODING_FINISH_XXX
   result.  */
4786 switch (ccl->status)
4787 {
4788 case CCL_STAT_SUSPEND_BY_SRC:
4789 coding->result = CODING_FINISH_INSUFFICIENT_SRC;
4790 break;
4791 case CCL_STAT_SUSPEND_BY_DST:
4792 coding->result = CODING_FINISH_INSUFFICIENT_DST;
4793 break;
4794 case CCL_STAT_QUIT:
4795 case CCL_STAT_INVALID_CMD:
4796 coding->result = CODING_FINISH_INTERRUPT;
4797 break;
4798 default:
4799 coding->result = CODING_FINISH_NORMAL;
4800 break;
4801 }
4802 return coding->result;
4803 }
4804
4805 /* Decode EOL format of the text at PTR of BYTES length destructively
4806 according to CODING->eol_type. This is called after the CCL
4807 program produced a decoded text at PTR. If we do CRLF->LF
4808 conversion, update CODING->produced and CODING->produced_char. */
4809
4810 static void
4811 decode_eol_post_ccl (coding, ptr, bytes)
4812 struct coding_system *coding;
4813 unsigned char *ptr;
4814 int bytes;
4815 {
4816 Lisp_Object val, saved_coding_symbol;
4817 unsigned char *pend = ptr + bytes;
4818 int dummy;
4819
4820 /* Remember the current coding system symbol. We set it back when
4821 an inconsistent EOL is found so that `last-coding-system-used' is
4822 set to the coding system that doesn't specify EOL conversion. */
4823 saved_coding_symbol = coding->symbol;
4824
4825 coding->spec.ccl.cr_carryover = 0;
4826 if (coding->eol_type == CODING_EOL_UNDECIDED)
4827 {
4828 /* Here, to avoid the call of setup_coding_system, we directly
4829 call detect_eol_type. */
4830 coding->eol_type = detect_eol_type (ptr, bytes, &dummy);
4831 if (coding->eol_type == CODING_EOL_INCONSISTENT)
4832 coding->eol_type = CODING_EOL_LF;
4833 if (coding->eol_type != CODING_EOL_UNDECIDED)
4834 {
4835 val = Fget (coding->symbol, Qeol_type);
4836 if (VECTORP (val) && XVECTOR (val)->size == 3)
4837 coding->symbol = XVECTOR (val)->contents[coding->eol_type];
4838 }
4839 coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4840 }
4841
4842 if (coding->eol_type == CODING_EOL_LF
4843 || coding->eol_type == CODING_EOL_UNDECIDED)
4844 {
4845 /* We have nothing to do. */
4846 ptr = pend;
4847 }
4848 else if (coding->eol_type == CODING_EOL_CRLF)
4849 {
4850 unsigned char *pstart = ptr, *p = ptr;
4851
4852 if (! (coding->mode & CODING_MODE_LAST_BLOCK)
4853 && *(pend - 1) == '\r')
4854 {
4855 /* If the last character is CR, we can't handle it here
4856 because LF will be in the not-yet-decoded source text.
4857 Record that the CR is not yet processed. */
4858 coding->spec.ccl.cr_carryover = 1;
4859 coding->produced--;
4860 coding->produced_char--;
4861 pend--;
4862 }
4863 while (ptr < pend)
4864 {
4865 if (*ptr == '\r')
4866 {
4867 if (ptr + 1 < pend && *(ptr + 1) == '\n')
4868 {
4869 *p++ = '\n';
4870 ptr += 2;
4871 }
4872 else
4873 {
4874 if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4875 goto undo_eol_conversion;
4876 *p++ = *ptr++;
4877 }
4878 }
4879 else if (*ptr == '\n'
4880 && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4881 goto undo_eol_conversion;
4882 else
4883 *p++ = *ptr++;
4884 continue;
4885
4886 undo_eol_conversion:
4887 /* We have faced with inconsistent EOL format at PTR.
4888 Convert all LFs before PTR back to CRLFs. */
4889 for (p--, ptr--; p >= pstart; p--)
4890 {
4891 if (*p == '\n')
4892 *ptr-- = '\n', *ptr-- = '\r';
4893 else
4894 *ptr-- = *p;
4895 }
4896 /* If carryover is recorded, cancel it because we don't
4897 convert CRLF anymore. */
4898 if (coding->spec.ccl.cr_carryover)
4899 {
4900 coding->spec.ccl.cr_carryover = 0;
4901 coding->produced++;
4902 coding->produced_char++;
4903 pend++;
4904 }
4905 p = ptr = pend;
4906 coding->eol_type = CODING_EOL_LF;
4907 coding->symbol = saved_coding_symbol;
4908 }
4909 if (p < pend)
4910 {
4911 /* As each two-byte sequence CRLF was converted to LF, (PEND
4912 - P) is the number of deleted characters. */
4913 coding->produced -= pend - p;
4914 coding->produced_char -= pend - p;
4915 }
4916 }
4917 else /* i.e. coding->eol_type == CODING_EOL_CR */
4918 {
4919 unsigned char *p = ptr;
4920
4921 for (; ptr < pend; ptr++)
4922 {
4923 if (*ptr == '\r')
4924 *ptr = '\n';
4925 else if (*ptr == '\n'
4926 && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4927 {
4928 for (; p < ptr; p++)
4929 {
4930 if (*p == '\n')
4931 *p = '\r';
4932 }
4933 ptr = pend;
4934 coding->eol_type = CODING_EOL_LF;
4935 coding->symbol = saved_coding_symbol;
4936 }
4937 }
4938 }
4939 }
4940
4941 /* See "GENERAL NOTES about `decode_coding_XXX ()' functions". Before
4942 decoding, it may detect coding system and format of end-of-line if
4943 those are not yet decided. The source should be unibyte, the
4944 result is multibyte if CODING->dst_multibyte is nonzero, else
4945 unibyte. */
4946
4947 int
4948 decode_coding (coding, source, destination, src_bytes, dst_bytes)
4949 struct coding_system *coding;
4950 const unsigned char *source;
4951 unsigned char *destination;
4952 int src_bytes, dst_bytes;
4953 {
4954 int extra = 0;
4955
4956 if (coding->type == coding_type_undecided)
4957 detect_coding (coding, source, src_bytes);
4958
4959 if (coding->eol_type == CODING_EOL_UNDECIDED
4960 && coding->type != coding_type_ccl)
4961 {
4962 detect_eol (coding, source, src_bytes);
4963 /* We had better recover the original eol format if we
4964 encounter an inconsistent eol format while decoding. */
4965 coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4966 }
4967
4968 coding->produced = coding->produced_char = 0;
4969 coding->consumed = coding->consumed_char = 0;
4970 coding->errors = 0;
4971 coding->result = CODING_FINISH_NORMAL;
4972
4973 switch (coding->type)
4974 {
4975 case coding_type_sjis:
4976 decode_coding_sjis_big5 (coding, source, destination,
4977 src_bytes, dst_bytes, 1);
4978 break;
4979
4980 case coding_type_iso2022:
4981 decode_coding_iso2022 (coding, source, destination,
4982 src_bytes, dst_bytes);
4983 break;
4984
4985 case coding_type_big5:
4986 decode_coding_sjis_big5 (coding, source, destination,
4987 src_bytes, dst_bytes, 0);
4988 break;
4989
4990 case coding_type_emacs_mule:
4991 decode_coding_emacs_mule (coding, source, destination,
4992 src_bytes, dst_bytes);
4993 break;
4994
4995 case coding_type_ccl:
4996 if (coding->spec.ccl.cr_carryover)
4997 {
4998 /* Put the CR which was not processed by the previous call
4999 of decode_eol_post_ccl in DESTINATION. It will be
5000 decoded together with the following LF by the call to
5001 decode_eol_post_ccl below. */
5002 *destination = '\r';
5003 coding->produced++;
5004 coding->produced_char++;
5005 dst_bytes--;
5006 extra = coding->spec.ccl.cr_carryover;
5007 }
5008 ccl_coding_driver (coding, source, destination + extra,
5009 src_bytes, dst_bytes, 0);
5010 if (coding->eol_type != CODING_EOL_LF)
5011 {
5012 coding->produced += extra;
5013 coding->produced_char += extra;
5014 decode_eol_post_ccl (coding, destination, coding->produced);
5015 }
5016 break;
5017
5018 default:
5019 decode_eol (coding, source, destination, src_bytes, dst_bytes);
5020 }
5021
5022 if (coding->result == CODING_FINISH_INSUFFICIENT_SRC
5023 && coding->mode & CODING_MODE_LAST_BLOCK
5024 && coding->consumed == src_bytes)
5025 coding->result = CODING_FINISH_NORMAL;
5026
5027 if (coding->mode & CODING_MODE_LAST_BLOCK
5028 && coding->result == CODING_FINISH_INSUFFICIENT_SRC)
5029 {
5030 const unsigned char *src = source + coding->consumed;
5031 unsigned char *dst = destination + coding->produced;
5032
5033 src_bytes -= coding->consumed;
5034 coding->errors++;
5035 if (COMPOSING_P (coding))
5036 DECODE_COMPOSITION_END ('1');
5037 while (src_bytes--)
5038 {
5039 int c = *src++;
5040 dst += CHAR_STRING (c, dst);
5041 coding->produced_char++;
5042 }
5043 coding->consumed = coding->consumed_char = src - source;
5044 coding->produced = dst - destination;
5045 coding->result = CODING_FINISH_NORMAL;
5046 }
5047
5048 if (!coding->dst_multibyte)
5049 {
5050 coding->produced = str_as_unibyte (destination, coding->produced);
5051 coding->produced_char = coding->produced;
5052 }
5053
5054 return coding->result;
5055 }
5056
5057 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions". The
5058 multibyteness of the source is CODING->src_multibyte, the
5059 multibyteness of the result is always unibyte. */
5060
5061 int
5062 encode_coding (coding, source, destination, src_bytes, dst_bytes)
5063 struct coding_system *coding;
5064 const unsigned char *source;
5065 unsigned char *destination;
5066 int src_bytes, dst_bytes;
5067 {
5068 coding->produced = coding->produced_char = 0;
5069 coding->consumed = coding->consumed_char = 0;
5070 coding->errors = 0;
5071 coding->result = CODING_FINISH_NORMAL;
5072 if (coding->eol_type == CODING_EOL_UNDECIDED)
5073 coding->eol_type = CODING_EOL_LF;
5074
5075 switch (coding->type)
5076 {
5077 case coding_type_sjis:
5078 encode_coding_sjis_big5 (coding, source, destination,
5079 src_bytes, dst_bytes, 1);
5080 break;
5081
5082 case coding_type_iso2022:
5083 encode_coding_iso2022 (coding, source, destination,
5084 src_bytes, dst_bytes);
5085 break;
5086
5087 case coding_type_big5:
5088 encode_coding_sjis_big5 (coding, source, destination,
5089 src_bytes, dst_bytes, 0);
5090 break;
5091
5092 case coding_type_emacs_mule:
5093 encode_coding_emacs_mule (coding, source, destination,
5094 src_bytes, dst_bytes);
5095 break;
5096
5097 case coding_type_ccl:
5098 ccl_coding_driver (coding, source, destination,
5099 src_bytes, dst_bytes, 1);
5100 break;
5101
5102 default:
5103 encode_eol (coding, source, destination, src_bytes, dst_bytes);
5104 }
5105
5106 if (coding->mode & CODING_MODE_LAST_BLOCK
5107 && coding->result == CODING_FINISH_INSUFFICIENT_SRC)
5108 {
5109 const unsigned char *src = source + coding->consumed;
5110 unsigned char *dst = destination + coding->produced;
5111
5112 if (coding->type == coding_type_iso2022)
5113 ENCODE_RESET_PLANE_AND_REGISTER;
5114 if (COMPOSING_P (coding))
5115 *dst++ = ISO_CODE_ESC, *dst++ = '1';
5116 if (coding->consumed < src_bytes)
5117 {
5118 int len = src_bytes - coding->consumed;
5119
5120 BCOPY_SHORT (src, dst, len);
5121 if (coding->src_multibyte)
5122 len = str_as_unibyte (dst, len);
5123 dst += len;
5124 coding->consumed = src_bytes;
5125 }
5126 coding->produced = coding->produced_char = dst - destination;
5127 coding->result = CODING_FINISH_NORMAL;
5128 }
5129
5130 if (coding->result == CODING_FINISH_INSUFFICIENT_SRC
5131 && coding->consumed == src_bytes)
5132 coding->result = CODING_FINISH_NORMAL;
5133
5134 return coding->result;
5135 }
5136
5137 /* Scan text in the region between *BEG and *END (byte positions),
5138 skip characters which we don't have to decode by coding system
5139 CODING at the head and tail, then set *BEG and *END to the region
5140 of the text we actually have to convert. The caller should move
5141 the gap out of the region in advance if the region is from a
5142 buffer.
5143
5144 If STR is not NULL, *BEG and *END are indices into STR. */
5145
5146 static void
5147 shrink_decoding_region (beg, end, coding, str)
5148 int *beg, *end;
5149 struct coding_system *coding;
5150 unsigned char *str;
5151 {
5152 unsigned char *begp_orig, *begp, *endp_orig, *endp, c;
5153 int eol_conversion;
5154 Lisp_Object translation_table;
5155
5156 if (coding->type == coding_type_ccl
5157 || coding->type == coding_type_undecided
5158 || coding->eol_type != CODING_EOL_LF
5159 || !NILP (coding->post_read_conversion)
5160 || coding->composing != COMPOSITION_DISABLED)
5161 {
5162 /* We can't skip any data. */
5163 return;
5164 }
5165 if (coding->type == coding_type_no_conversion
5166 || coding->type == coding_type_raw_text
5167 || coding->type == coding_type_emacs_mule)
5168 {
5169 /* We need no conversion, but don't have to skip any data here.
5170 Decoding routine handles them effectively anyway. */
5171 return;
5172 }
5173
5174 translation_table = coding->translation_table_for_decode;
5175 if (NILP (translation_table) && !NILP (Venable_character_translation))
5176 translation_table = Vstandard_translation_table_for_decode;
5177 if (CHAR_TABLE_P (translation_table))
5178 {
5179 int i;
5180 for (i = 0; i < 128; i++)
5181 if (!NILP (CHAR_TABLE_REF (translation_table, i)))
5182 break;
5183 if (i < 128)
5184 /* Some ASCII character should be translated. We give up
5185 shrinking. */
5186 return;
5187 }
5188
5189 if (coding->heading_ascii >= 0)
5190 /* Detection routine has already found how much we can skip at the
5191 head. */
5192 *beg += coding->heading_ascii;
5193
5194 if (str)
5195 {
5196 begp_orig = begp = str + *beg;
5197 endp_orig = endp = str + *end;
5198 }
5199 else
5200 {
5201 begp_orig = begp = BYTE_POS_ADDR (*beg);
5202 endp_orig = endp = begp + *end - *beg;
5203 }
5204
5205 eol_conversion = (coding->eol_type == CODING_EOL_CR
5206 || coding->eol_type == CODING_EOL_CRLF);
5207
5208 switch (coding->type)
5209 {
5210 case coding_type_sjis:
5211 case coding_type_big5:
5212 /* We can skip all ASCII characters at the head. */
5213 if (coding->heading_ascii < 0)
5214 {
5215 if (eol_conversion)
5216 while (begp < endp && *begp < 0x80 && *begp != '\r') begp++;
5217 else
5218 while (begp < endp && *begp < 0x80) begp++;
5219 }
5220 /* We can skip all ASCII characters at the tail except for the
5221 second byte of SJIS or BIG5 code. */
5222 if (eol_conversion)
5223 while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\r') endp--;
5224 else
5225 while (begp < endp && endp[-1] < 0x80) endp--;
5226 /* Do not consider LF as ascii if preceded by CR, since that
5227 confuses eol decoding. */
5228 if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
5229 endp++;
5230 if (begp < endp && endp < endp_orig && endp[-1] >= 0x80)
5231 endp++;
5232 break;
5233
5234 case coding_type_iso2022:
5235 if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
5236 /* We can't skip any data. */
5237 break;
5238 if (coding->heading_ascii < 0)
5239 {
5240 /* We can skip all ASCII characters at the head except for a
5241 few control codes. */
5242 while (begp < endp && (c = *begp) < 0x80
5243 && c != ISO_CODE_CR && c != ISO_CODE_SO
5244 && c != ISO_CODE_SI && c != ISO_CODE_ESC
5245 && (!eol_conversion || c != ISO_CODE_LF))
5246 begp++;
5247 }
5248 switch (coding->category_idx)
5249 {
5250 case CODING_CATEGORY_IDX_ISO_8_1:
5251 case CODING_CATEGORY_IDX_ISO_8_2:
5252 /* We can skip all ASCII characters at the tail. */
5253 if (eol_conversion)
5254 while (begp < endp && (c = endp[-1]) < 0x80 && c != '\r') endp--;
5255 else
5256 while (begp < endp && endp[-1] < 0x80) endp--;
5257 /* Do not consider LF as ascii if preceded by CR, since that
5258 confuses eol decoding. */
5259 if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
5260 endp++;
5261 break;
5262
5263 case CODING_CATEGORY_IDX_ISO_7:
5264 case CODING_CATEGORY_IDX_ISO_7_TIGHT:
5265 {
5266 /* We can skip all characters at the tail except for 8-bit
5267 codes and ESC and the following 2-byte at the tail. */
5268 unsigned char *eight_bit = NULL;
5269
5270 if (eol_conversion)
5271 while (begp < endp
5272 && (c = endp[-1]) != ISO_CODE_ESC && c != '\r')
5273 {
5274 if (!eight_bit && c & 0x80) eight_bit = endp;
5275 endp--;
5276 }
5277 else
5278 while (begp < endp
5279 && (c = endp[-1]) != ISO_CODE_ESC)
5280 {
5281 if (!eight_bit && c & 0x80) eight_bit = endp;
5282 endp--;
5283 }
5284 /* Do not consider LF as ascii if preceded by CR, since that
5285 confuses eol decoding. */
5286 if (begp < endp && endp < endp_orig
5287 && endp[-1] == '\r' && endp[0] == '\n')
5288 endp++;
5289 if (begp < endp && endp[-1] == ISO_CODE_ESC)
5290 {
5291 if (endp + 1 < endp_orig && end[0] == '(' && end[1] == 'B')
5292 /* This is an ASCII designation sequence. We can
5293 surely skip the tail. But, if we have
5294 encountered an 8-bit code, skip only the codes
5295 after that. */
5296 endp = eight_bit ? eight_bit : endp + 2;
5297 else
5298 /* Hmmm, we can't skip the tail. */
5299 endp = endp_orig;
5300 }
5301 else if (eight_bit)
5302 endp = eight_bit;
5303 }
5304 }
5305 break;
5306
5307 default:
5308 abort ();
5309 }
5310 *beg += begp - begp_orig;
5311 *end += endp - endp_orig;
5312 return;
5313 }
5314
5315 /* Like shrink_decoding_region but for encoding. */
5316
5317 static void
5318 shrink_encoding_region (beg, end, coding, str)
5319 int *beg, *end;
5320 struct coding_system *coding;
5321 unsigned char *str;
5322 {
5323 unsigned char *begp_orig, *begp, *endp_orig, *endp;
5324 int eol_conversion;
5325 Lisp_Object translation_table;
5326
5327 if (coding->type == coding_type_ccl
5328 || coding->eol_type == CODING_EOL_CRLF
5329 || coding->eol_type == CODING_EOL_CR
5330 || (coding->cmp_data && coding->cmp_data->used > 0))
5331 {
5332 /* We can't skip any data. */
5333 return;
5334 }
5335 if (coding->type == coding_type_no_conversion
5336 || coding->type == coding_type_raw_text
5337 || coding->type == coding_type_emacs_mule
5338 || coding->type == coding_type_undecided)
5339 {
5340 /* We need no conversion, but don't have to skip any data here.
5341 Encoding routine handles them effectively anyway. */
5342 return;
5343 }
5344
5345 translation_table = coding->translation_table_for_encode;
5346 if (NILP (translation_table) && !NILP (Venable_character_translation))
5347 translation_table = Vstandard_translation_table_for_encode;
5348 if (CHAR_TABLE_P (translation_table))
5349 {
5350 int i;
5351 for (i = 0; i < 128; i++)
5352 if (!NILP (CHAR_TABLE_REF (translation_table, i)))
5353 break;
5354 if (i < 128)
5355 /* Some ASCII character should be translated. We give up
5356 shrinking. */
5357 return;
5358 }
5359
5360 if (str)
5361 {
5362 begp_orig = begp = str + *beg;
5363 endp_orig = endp = str + *end;
5364 }
5365 else
5366 {
5367 begp_orig = begp = BYTE_POS_ADDR (*beg);
5368 endp_orig = endp = begp + *end - *beg;
5369 }
5370
5371 eol_conversion = (coding->eol_type == CODING_EOL_CR
5372 || coding->eol_type == CODING_EOL_CRLF);
5373
5374 /* Here, we don't have to check coding->pre_write_conversion because
5375 the caller is expected to have handled it already. */
5376 switch (coding->type)
5377 {
5378 case coding_type_iso2022:
5379 if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
5380 /* We can't skip any data. */
5381 break;
5382 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL)
5383 {
5384 unsigned char *bol = begp;
5385 while (begp < endp && *begp < 0x80)
5386 {
5387 begp++;
5388 if (begp[-1] == '\n')
5389 bol = begp;
5390 }
5391 begp = bol;
5392 goto label_skip_tail;
5393 }
5394 /* fall down ... */
5395
5396 case coding_type_sjis:
5397 case coding_type_big5:
5398 /* We can skip all ASCII characters at the head and tail. */
5399 if (eol_conversion)
5400 while (begp < endp && *begp < 0x80 && *begp != '\n') begp++;
5401 else
5402 while (begp < endp && *begp < 0x80) begp++;
5403 label_skip_tail:
5404 if (eol_conversion)
5405 while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\n') endp--;
5406 else
5407 while (begp < endp && *(endp - 1) < 0x80) endp--;
5408 break;
5409
5410 default:
5411 abort ();
5412 }
5413
5414 *beg += begp - begp_orig;
5415 *end += endp - endp_orig;
5416 return;
5417 }
5418
/* Shrinking a conversion region has some overhead of its own, so we
   don't try it unless the region is longer than this value.  */
static int shrink_conversion_region_threshhold = 1024;

/* If the region *BEG..*END is longer than the threshold above, shrink
   it with shrink_encoding_region (when ENCODEP is nonzero) or with
   shrink_decoding_region (when ENCODEP is zero).  */
#define SHRINK_CONVERSION_REGION(beg, end, coding, str, encodep)       \
  do {                                                                  \
    if (*(end) - *(beg) > shrink_conversion_region_threshhold)          \
      {                                                                 \
        if (encodep)                                                    \
          shrink_encoding_region (beg, end, coding, str);               \
        else                                                            \
          shrink_decoding_region (beg, end, coding, str);               \
      }                                                                 \
  } while (0)
5432
5433 /* ARG is (CODING BUFFER ...) where CODING is what to be set in
5434 Vlast_coding_system_used and the remaining elements are buffers to
5435 kill. */
5436 static Lisp_Object
5437 code_convert_region_unwind (arg)
5438 Lisp_Object arg;
5439 {
5440 struct gcpro gcpro1;
5441 GCPRO1 (arg);
5442
5443 inhibit_pre_post_conversion = 0;
5444 Vlast_coding_system_used = XCAR (arg);
5445 for (arg = XCDR (arg); CONSP (arg); arg = XCDR (arg))
5446 Fkill_buffer (XCAR (arg));
5447
5448 UNGCPRO;
5449 return Qnil;
5450 }
5451
5452 /* Store information about all compositions in the range FROM and TO
5453 of OBJ in memory blocks pointed by CODING->cmp_data. OBJ is a
5454 buffer or a string, defaults to the current buffer. */
5455
5456 void
5457 coding_save_composition (coding, from, to, obj)
5458 struct coding_system *coding;
5459 int from, to;
5460 Lisp_Object obj;
5461 {
5462 Lisp_Object prop;
5463 int start, end;
5464
5465 if (coding->composing == COMPOSITION_DISABLED)
5466 return;
5467 if (!coding->cmp_data)
5468 coding_allocate_composition_data (coding, from);
5469 if (!find_composition (from, to, &start, &end, &prop, obj)
5470 || end > to)
5471 return;
5472 if (start < from
5473 && (!find_composition (end, to, &start, &end, &prop, obj)
5474 || end > to))
5475 return;
5476 coding->composing = COMPOSITION_NO;
5477 do
5478 {
5479 if (COMPOSITION_VALID_P (start, end, prop))
5480 {
5481 enum composition_method method = COMPOSITION_METHOD (prop);
5482 if (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH
5483 >= COMPOSITION_DATA_SIZE)
5484 coding_allocate_composition_data (coding, from);
5485 /* For relative composition, we remember start and end
5486 positions, for the other compositions, we also remember
5487 components. */
5488 CODING_ADD_COMPOSITION_START (coding, start - from, method);
5489 if (method != COMPOSITION_RELATIVE)
5490 {
5491 /* We must store a*/
5492 Lisp_Object val, ch;
5493
5494 val = COMPOSITION_COMPONENTS (prop);
5495 if (CONSP (val))
5496 while (CONSP (val))
5497 {
5498 ch = XCAR (val), val = XCDR (val);
5499 CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (ch));
5500 }
5501 else if (VECTORP (val) || STRINGP (val))
5502 {
5503 int len = (VECTORP (val)
5504 ? XVECTOR (val)->size : SCHARS (val));
5505 int i;
5506 for (i = 0; i < len; i++)
5507 {
5508 ch = (STRINGP (val)
5509 ? Faref (val, make_number (i))
5510 : XVECTOR (val)->contents[i]);
5511 CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (ch));
5512 }
5513 }
5514 else /* INTEGERP (val) */
5515 CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (val));
5516 }
5517 CODING_ADD_COMPOSITION_END (coding, end - from);
5518 }
5519 start = end;
5520 }
5521 while (start < to
5522 && find_composition (start, to, &start, &end, &prop, obj)
5523 && end <= to);
5524
5525 /* Make coding->cmp_data point to the first memory block. */
5526 while (coding->cmp_data->prev)
5527 coding->cmp_data = coding->cmp_data->prev;
5528 coding->cmp_data_start = 0;
5529 }
5530
5531 /* Reflect the saved information about compositions to OBJ.
5532 CODING->cmp_data points to a memory block for the information. OBJ
5533 is a buffer or a string, defaults to the current buffer. */
5534
5535 void
5536 coding_restore_composition (coding, obj)
5537 struct coding_system *coding;
5538 Lisp_Object obj;
5539 {
5540 struct composition_data *cmp_data = coding->cmp_data;
5541
5542 if (!cmp_data)
5543 return;
5544
5545 while (cmp_data->prev)
5546 cmp_data = cmp_data->prev;
5547
5548 while (cmp_data)
5549 {
5550 int i;
5551
5552 for (i = 0; i < cmp_data->used && cmp_data->data[i] > 0;
5553 i += cmp_data->data[i])
5554 {
5555 int *data = cmp_data->data + i;
5556 enum composition_method method = (enum composition_method) data[3];
5557 Lisp_Object components;
5558
5559 if (data[0] < 0 || i + data[0] > cmp_data->used)
5560 /* Invalid composition data. */
5561 break;
5562
5563 if (method == COMPOSITION_RELATIVE)
5564 components = Qnil;
5565 else
5566 {
5567 int len = data[0] - 4, j;
5568 Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
5569
5570 if (method == COMPOSITION_WITH_RULE_ALTCHARS
5571 && len % 2 == 0)
5572 len --;
5573 if (len < 1)
5574 /* Invalid composition data. */
5575 break;
5576 for (j = 0; j < len; j++)
5577 args[j] = make_number (data[4 + j]);
5578 components = (method == COMPOSITION_WITH_ALTCHARS
5579 ? Fstring (len, args)
5580 : Fvector (len, args));
5581 }
5582 compose_text (data[1], data[2], components, Qnil, obj);
5583 }
5584 cmp_data = cmp_data->next;
5585 }
5586 }
5587
5588 /* Decode (if ENCODEP is zero) or encode (if ENCODEP is nonzero) the
5589 text from FROM to TO (byte positions are FROM_BYTE and TO_BYTE) by
5590 coding system CODING, and return the status code of code conversion
5591 (currently, this value has no meaning).
5592
5593 How many characters (and bytes) are converted to how many
5594 characters (and bytes) are recorded in members of the structure
5595 CODING.
5596
5597 If REPLACE is nonzero, we do various things as if the original text
5598 is deleted and a new text is inserted. See the comments in
5599 replace_range (insdel.c) to know what we are doing.
5600
5601 If REPLACE is zero, it is assumed that the source text is unibyte.
5602 Otherwise, it is assumed that the source text is multibyte. */
5603
5604 int
5605 code_convert_region (from, from_byte, to, to_byte, coding, encodep, replace)
5606 int from, from_byte, to, to_byte, encodep, replace;
5607 struct coding_system *coding;
5608 {
5609 int len = to - from, len_byte = to_byte - from_byte;
5610 int nchars_del = 0, nbytes_del = 0;
5611 int require, inserted, inserted_byte;
5612 int head_skip, tail_skip, total_skip = 0;
5613 Lisp_Object saved_coding_symbol;
5614 int first = 1;
5615 unsigned char *src, *dst;
5616 Lisp_Object deletion;
5617 int orig_point = PT, orig_len = len;
5618 int prev_Z;
5619 int multibyte_p = !NILP (current_buffer->enable_multibyte_characters);
5620
5621 deletion = Qnil;
5622 saved_coding_symbol = coding->symbol;
5623
5624 if (from < PT && PT < to)
5625 {
5626 TEMP_SET_PT_BOTH (from, from_byte);
5627 orig_point = from;
5628 }
5629
5630 if (replace)
5631 {
5632 int saved_from = from;
5633 int saved_inhibit_modification_hooks;
5634
5635 prepare_to_modify_buffer (from, to, &from);
5636 if (saved_from != from)
5637 {
5638 to = from + len;
5639 from_byte = CHAR_TO_BYTE (from), to_byte = CHAR_TO_BYTE (to);
5640 len_byte = to_byte - from_byte;
5641 }
5642
5643 /* The code conversion routine can not preserve text properties
5644 for now. So, we must remove all text properties in the
5645 region. Here, we must suppress all modification hooks. */
5646 saved_inhibit_modification_hooks = inhibit_modification_hooks;
5647 inhibit_modification_hooks = 1;
5648 Fset_text_properties (make_number (from), make_number (to), Qnil, Qnil);
5649 inhibit_modification_hooks = saved_inhibit_modification_hooks;
5650 }
5651
5652 coding->heading_ascii = 0;
5653
5654 if (! encodep && CODING_REQUIRE_DETECTION (coding))
5655 {
5656 /* We must detect encoding of text and eol format. */
5657
5658 if (from < GPT && to > GPT)
5659 move_gap_both (from, from_byte);
5660 if (coding->type == coding_type_undecided)
5661 {
5662 detect_coding (coding, BYTE_POS_ADDR (from_byte), len_byte);
5663 if (coding->type == coding_type_undecided)
5664 {
5665 /* It seems that the text contains only ASCII, but we
5666 should not leave it undecided because the deeper
5667 decoding routine (decode_coding) tries to detect the
5668 encodings again in vain. */
5669 coding->type = coding_type_emacs_mule;
5670 coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE;
5671 /* As emacs-mule decoder will handle composition, we
5672 need this setting to allocate coding->cmp_data
5673 later. */
5674 coding->composing = COMPOSITION_NO;
5675 }
5676 }
5677 if (coding->eol_type == CODING_EOL_UNDECIDED
5678 && coding->type != coding_type_ccl)
5679 {
5680 detect_eol (coding, BYTE_POS_ADDR (from_byte), len_byte);
5681 if (coding->eol_type == CODING_EOL_UNDECIDED)
5682 coding->eol_type = CODING_EOL_LF;
5683 /* We had better recover the original eol format if we
5684 encounter an inconsistent eol format while decoding. */
5685 coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
5686 }
5687 }
5688
5689 /* Now we convert the text. */
5690
5691 /* For encoding, we must process pre-write-conversion in advance. */
5692 if (! inhibit_pre_post_conversion
5693 && encodep
5694 && SYMBOLP (coding->pre_write_conversion)
5695 && ! NILP (Ffboundp (coding->pre_write_conversion)))
5696 {
5697 /* The function in pre-write-conversion may put a new text in a
5698 new buffer. */
5699 struct buffer *prev = current_buffer;
5700 Lisp_Object new;
5701
5702 record_unwind_protect (code_convert_region_unwind,
5703 Fcons (Vlast_coding_system_used, Qnil));
5704 /* We should not call any more pre-write/post-read-conversion
5705 functions while this pre-write-conversion is running. */
5706 inhibit_pre_post_conversion = 1;
5707 call2 (coding->pre_write_conversion,
5708 make_number (from), make_number (to));
5709 inhibit_pre_post_conversion = 0;
5710 /* Discard the unwind protect. */
5711 specpdl_ptr--;
5712
5713 if (current_buffer != prev)
5714 {
5715 len = ZV - BEGV;
5716 new = Fcurrent_buffer ();
5717 set_buffer_internal_1 (prev);
5718 del_range_2 (from, from_byte, to, to_byte, 0);
5719 TEMP_SET_PT_BOTH (from, from_byte);
5720 insert_from_buffer (XBUFFER (new), 1, len, 0);
5721 Fkill_buffer (new);
5722 if (orig_point >= to)
5723 orig_point += len - orig_len;
5724 else if (orig_point > from)
5725 orig_point = from;
5726 orig_len = len;
5727 to = from + len;
5728 from_byte = CHAR_TO_BYTE (from);
5729 to_byte = CHAR_TO_BYTE (to);
5730 len_byte = to_byte - from_byte;
5731 TEMP_SET_PT_BOTH (from, from_byte);
5732 }
5733 }
5734
5735 if (replace)
5736 {
5737 if (! EQ (current_buffer->undo_list, Qt))
5738 deletion = make_buffer_string_both (from, from_byte, to, to_byte, 1);
5739 else
5740 {
5741 nchars_del = to - from;
5742 nbytes_del = to_byte - from_byte;
5743 }
5744 }
5745
5746 if (coding->composing != COMPOSITION_DISABLED)
5747 {
5748 if (encodep)
5749 coding_save_composition (coding, from, to, Fcurrent_buffer ());
5750 else
5751 coding_allocate_composition_data (coding, from);
5752 }
5753
5754 /* Try to skip the heading and tailing ASCIIs. We can't skip them
5755 if we must run CCL program or there are compositions to
5756 encode. */
5757 if (coding->type != coding_type_ccl
5758 && (! coding->cmp_data || coding->cmp_data->used == 0))
5759 {
5760 int from_byte_orig = from_byte, to_byte_orig = to_byte;
5761
5762 if (from < GPT && GPT < to)
5763 move_gap_both (from, from_byte);
5764 SHRINK_CONVERSION_REGION (&from_byte, &to_byte, coding, NULL, encodep);
5765 if (from_byte == to_byte
5766 && (encodep || NILP (coding->post_read_conversion))
5767 && ! CODING_REQUIRE_FLUSHING (coding))
5768 {
5769 coding->produced = len_byte;
5770 coding->produced_char = len;
5771 if (!replace)
5772 /* We must record and adjust for this new text now. */
5773 adjust_after_insert (from, from_byte_orig, to, to_byte_orig, len);
5774 coding_free_composition_data (coding);
5775 return 0;
5776 }
5777
5778 head_skip = from_byte - from_byte_orig;
5779 tail_skip = to_byte_orig - to_byte;
5780 total_skip = head_skip + tail_skip;
5781 from += head_skip;
5782 to -= tail_skip;
5783 len -= total_skip; len_byte -= total_skip;
5784 }
5785
5786 /* For conversion, we must put the gap before the text in addition to
5787 making the gap larger for efficient decoding. The required gap
5788 size starts from 2000 which is the magic number used in make_gap.
5789 But, after one batch of conversion, it will be incremented if we
5790 find that it is not enough . */
5791 require = 2000;
5792
5793 if (GAP_SIZE < require)
5794 make_gap (require - GAP_SIZE);
5795 move_gap_both (from, from_byte);
5796
5797 inserted = inserted_byte = 0;
5798
5799 GAP_SIZE += len_byte;
5800 ZV -= len;
5801 Z -= len;
5802 ZV_BYTE -= len_byte;
5803 Z_BYTE -= len_byte;
5804
5805 if (GPT - BEG < BEG_UNCHANGED)
5806 BEG_UNCHANGED = GPT - BEG;
5807 if (Z - GPT < END_UNCHANGED)
5808 END_UNCHANGED = Z - GPT;
5809
5810 if (!encodep && coding->src_multibyte)
5811 {
5812 /* Decoding routines expects that the source text is unibyte.
5813 We must convert 8-bit characters of multibyte form to
5814 unibyte. */
5815 int len_byte_orig = len_byte;
5816 len_byte = str_as_unibyte (GAP_END_ADDR - len_byte, len_byte);
5817 if (len_byte < len_byte_orig)
5818 safe_bcopy (GAP_END_ADDR - len_byte_orig, GAP_END_ADDR - len_byte,
5819 len_byte);
5820 coding->src_multibyte = 0;
5821 }
5822
5823 for (;;)
5824 {
5825 int result;
5826
5827 /* The buffer memory is now:
5828 +--------+converted-text+---------+-------original-text-------+---+
5829 |<-from->|<--inserted-->|---------|<--------len_byte--------->|---|
5830 |<---------------------- GAP ----------------------->| */
5831 src = GAP_END_ADDR - len_byte;
5832 dst = GPT_ADDR + inserted_byte;
5833
5834 if (encodep)
5835 result = encode_coding (coding, src, dst, len_byte, 0);
5836 else
5837 {
5838 if (coding->composing != COMPOSITION_DISABLED)
5839 coding->cmp_data->char_offset = from + inserted;
5840 result = decode_coding (coding, src, dst, len_byte, 0);
5841 }
5842
5843 /* The buffer memory is now:
5844 +--------+-------converted-text----+--+------original-text----+---+
5845 |<-from->|<-inserted->|<-produced->|--|<-(len_byte-consumed)->|---|
5846 |<---------------------- GAP ----------------------->| */
5847
5848 inserted += coding->produced_char;
5849 inserted_byte += coding->produced;
5850 len_byte -= coding->consumed;
5851
5852 if (result == CODING_FINISH_INSUFFICIENT_CMP)
5853 {
5854 coding_allocate_composition_data (coding, from + inserted);
5855 continue;
5856 }
5857
5858 src += coding->consumed;
5859 dst += coding->produced;
5860
5861 if (result == CODING_FINISH_NORMAL)
5862 {
5863 src += len_byte;
5864 break;
5865 }
5866 if (! encodep && result == CODING_FINISH_INCONSISTENT_EOL)
5867 {
5868 unsigned char *pend = dst, *p = pend - inserted_byte;
5869 Lisp_Object eol_type;
5870
5871 /* Encode LFs back to the original eol format (CR or CRLF). */
5872 if (coding->eol_type == CODING_EOL_CR)
5873 {
5874 while (p < pend) if (*p++ == '\n') p[-1] = '\r';
5875 }
5876 else
5877 {
5878 int count = 0;
5879
5880 while (p < pend) if (*p++ == '\n') count++;
5881 if (src - dst < count)
5882 {
5883 /* We don't have sufficient room for encoding LFs
5884 back to CRLF. We must record converted and
5885 not-yet-converted text back to the buffer
5886 content, enlarge the gap, then record them out of
5887 the buffer contents again. */
5888 int add = len_byte + inserted_byte;
5889
5890 GAP_SIZE -= add;
5891 ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
5892 GPT += inserted_byte; GPT_BYTE += inserted_byte;
5893 make_gap (count - GAP_SIZE);
5894 GAP_SIZE += add;
5895 ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
5896 GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5897 /* Don't forget to update SRC, DST, and PEND. */
5898 src = GAP_END_ADDR - len_byte;
5899 dst = GPT_ADDR + inserted_byte;
5900 pend = dst;
5901 }
5902 inserted += count;
5903 inserted_byte += count;
5904 coding->produced += count;
5905 p = dst = pend + count;
5906 while (count)
5907 {
5908 *--p = *--pend;
5909 if (*p == '\n') count--, *--p = '\r';
5910 }
5911 }
5912
5913 /* Suppress eol-format conversion in the further conversion. */
5914 coding->eol_type = CODING_EOL_LF;
5915
5916 /* Set the coding system symbol to that for Unix-like EOL. */
5917 eol_type = Fget (saved_coding_symbol, Qeol_type);
5918 if (VECTORP (eol_type)
5919 && XVECTOR (eol_type)->size == 3
5920 && SYMBOLP (XVECTOR (eol_type)->contents[CODING_EOL_LF]))
5921 coding->symbol = XVECTOR (eol_type)->contents[CODING_EOL_LF];
5922 else
5923 coding->symbol = saved_coding_symbol;
5924
5925 continue;
5926 }
5927 if (len_byte <= 0)
5928 {
5929 if (coding->type != coding_type_ccl
5930 || coding->mode & CODING_MODE_LAST_BLOCK)
5931 break;
5932 coding->mode |= CODING_MODE_LAST_BLOCK;
5933 continue;
5934 }
5935 if (result == CODING_FINISH_INSUFFICIENT_SRC)
5936 {
5937 /* The source text ends in invalid codes. Let's just
5938 make them valid buffer contents, and finish conversion. */
5939 if (multibyte_p)
5940 {
5941 unsigned char *start = dst;
5942
5943 inserted += len_byte;
5944 while (len_byte--)
5945 {
5946 int c = *src++;
5947 dst += CHAR_STRING (c, dst);
5948 }
5949
5950 inserted_byte += dst - start;
5951 }
5952 else
5953 {
5954 inserted += len_byte;
5955 inserted_byte += len_byte;
5956 while (len_byte--)
5957 *dst++ = *src++;
5958 }
5959 break;
5960 }
5961 if (result == CODING_FINISH_INTERRUPT)
5962 {
5963 /* The conversion procedure was interrupted by a user. */
5964 break;
5965 }
5966 /* Now RESULT == CODING_FINISH_INSUFFICIENT_DST */
5967 if (coding->consumed < 1)
5968 {
5969 /* It's quite strange to require more memory without
5970 consuming any bytes. Perhaps CCL program bug. */
5971 break;
5972 }
5973 if (first)
5974 {
5975 /* We have just done the first batch of conversion which was
5976 stopped because of insufficient gap. Let's reconsider the
5977 required gap size (i.e. SRC - DST) now.
5978
5979 We have converted ORIG bytes (== coding->consumed) into
5980 NEW bytes (coding->produced). To convert the remaining
5981 LEN bytes, we may need REQUIRE bytes of gap, where:
5982 REQUIRE + LEN_BYTE = LEN_BYTE * (NEW / ORIG)
5983 REQUIRE = LEN_BYTE * (NEW - ORIG) / ORIG
5984 Here, we are sure that NEW >= ORIG. */
5985
5986 if (coding->produced <= coding->consumed)
5987 {
5988 /* This happens because of CCL-based coding system with
5989 eol-type CRLF. */
5990 require = 0;
5991 }
5992 else
5993 {
5994 float ratio = coding->produced - coding->consumed;
5995 ratio /= coding->consumed;
5996 require = len_byte * ratio;
5997 }
5998 first = 0;
5999 }
6000 if ((src - dst) < (require + 2000))
6001 {
6002 /* See the comment above the previous call of make_gap. */
6003 int add = len_byte + inserted_byte;
6004
6005 GAP_SIZE -= add;
6006 ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
6007 GPT += inserted_byte; GPT_BYTE += inserted_byte;
6008 make_gap (require + 2000);
6009 GAP_SIZE += add;
6010 ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
6011 GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
6012 }
6013 }
6014 if (src - dst > 0) *dst = 0; /* Put an anchor. */
6015
6016 if (encodep && coding->dst_multibyte)
6017 {
6018 /* The output is unibyte. We must convert 8-bit characters to
6019 multibyte form. */
6020 if (inserted_byte * 2 > GAP_SIZE)
6021 {
6022 GAP_SIZE -= inserted_byte;
6023 ZV += inserted_byte; Z += inserted_byte;
6024 ZV_BYTE += inserted_byte; Z_BYTE += inserted_byte;
6025 GPT += inserted_byte; GPT_BYTE += inserted_byte;
6026 make_gap (inserted_byte - GAP_SIZE);
6027 GAP_SIZE += inserted_byte;
6028 ZV -= inserted_byte; Z -= inserted_byte;
6029 ZV_BYTE -= inserted_byte; Z_BYTE -= inserted_byte;
6030 GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
6031 }
6032 inserted_byte = str_to_multibyte (GPT_ADDR, GAP_SIZE, inserted_byte);
6033 }
6034
6035 /* If we shrank the conversion area, adjust it now. */
6036 if (total_skip > 0)
6037 {
6038 if (tail_skip > 0)
6039 safe_bcopy (GAP_END_ADDR, GPT_ADDR + inserted_byte, tail_skip);
6040 inserted += total_skip; inserted_byte += total_skip;
6041 GAP_SIZE += total_skip;
6042 GPT -= head_skip; GPT_BYTE -= head_skip;
6043 ZV -= total_skip; ZV_BYTE -= total_skip;
6044 Z -= total_skip; Z_BYTE -= total_skip;
6045 from -= head_skip; from_byte -= head_skip;
6046 to += tail_skip; to_byte += tail_skip;
6047 }
6048
6049 prev_Z = Z;
6050 if (! EQ (current_buffer->undo_list, Qt))
6051 adjust_after_replace (from, from_byte, deletion, inserted, inserted_byte);
6052 else
6053 adjust_after_replace_noundo (from, from_byte, nchars_del, nbytes_del,
6054 inserted, inserted_byte);
6055 inserted = Z - prev_Z;
6056
6057 if (!encodep && coding->cmp_data && coding->cmp_data->used)
6058 coding_restore_composition (coding, Fcurrent_buffer ());
6059 coding_free_composition_data (coding);
6060
6061 if (! inhibit_pre_post_conversion
6062 && ! encodep && ! NILP (coding->post_read_conversion))
6063 {
6064 Lisp_Object val;
6065 Lisp_Object saved_coding_system;
6066
6067 if (from != PT)
6068 TEMP_SET_PT_BOTH (from, from_byte);
6069 prev_Z = Z;
6070 record_unwind_protect (code_convert_region_unwind,
6071 Fcons (Vlast_coding_system_used, Qnil));
6072 saved_coding_system = Vlast_coding_system_used;
6073 Vlast_coding_system_used = coding->symbol;
6074 /* We should not call any more pre-write/post-read-conversion
6075 functions while this post-read-conversion is running. */
6076 inhibit_pre_post_conversion = 1;
6077 val = call1 (coding->post_read_conversion, make_number (inserted));
6078 inhibit_pre_post_conversion = 0;
6079 coding->symbol = Vlast_coding_system_used;
6080 Vlast_coding_system_used = saved_coding_system;
6081 /* Discard the unwind protect. */
6082 specpdl_ptr--;
6083 CHECK_NUMBER (val);
6084 inserted += Z - prev_Z;
6085 }
6086
6087 if (orig_point >= from)
6088 {
6089 if (orig_point >= from + orig_len)
6090 orig_point += inserted - orig_len;
6091 else
6092 orig_point = from;
6093 TEMP_SET_PT (orig_point);
6094 }
6095
6096 if (replace)
6097 {
6098 signal_after_change (from, to - from, inserted);
6099 update_compositions (from, from + inserted, CHECK_BORDER);
6100 }
6101
6102 {
6103 coding->consumed = to_byte - from_byte;
6104 coding->consumed_char = to - from;
6105 coding->produced = inserted_byte;
6106 coding->produced_char = inserted;
6107 }
6108
6109 return 0;
6110 }
6111
6112 /* Name (or base name) of work buffer for code conversion.
   set_conversion_work_buffer passes it to Fget_buffer_create, and to
   Fgenerate_new_buffer_name when the current buffer already is the
   buffer of that name. */
6113 static Lisp_Object Vcode_conversion_workbuf_name;
6114
6115 /* Set the current buffer to the working buffer prepared for
6116 code-conversion. MULTIBYTE specifies the multibyteness of the
6117 buffer. Return the buffer we set if it must be killed after use.
6118 Otherwise return Qnil. */
6119
6120 static Lisp_Object
6121 set_conversion_work_buffer (multibyte)
6122 int multibyte;
6123 {
6124 Lisp_Object buffer, buffer_to_kill;
6125 struct buffer *buf;
6126
6127 buffer = Fget_buffer_create (Vcode_conversion_workbuf_name);
6128 buf = XBUFFER (buffer);
6129 if (buf == current_buffer)
6130 {
6131 /* As we are already in the work buffer, we must generate a new
6132 buffer for the work. */
6133 Lisp_Object name;
6134
6135 name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
6136 buffer = buffer_to_kill = Fget_buffer_create (name);
6137 buf = XBUFFER (buffer);
6138 }
6139 else
6140 buffer_to_kill = Qnil;
6141
6142 delete_all_overlays (buf);
6143 buf->directory = current_buffer->directory;
6144 buf->read_only = Qnil;
6145 buf->filename = Qnil;
6146 buf->undo_list = Qt;
6147 eassert (buf->overlays_before == NULL);
6148 eassert (buf->overlays_after == NULL);
6149 set_buffer_internal (buf);
6150 if (BEG != BEGV || Z != ZV)
6151 Fwiden ();
6152 del_range_2 (BEG, BEG_BYTE, Z, Z_BYTE, 0);
6153 buf->enable_multibyte_characters = multibyte ? Qt : Qnil;
6154 return buffer_to_kill;
6155 }
6156
6157 Lisp_Object
6158 run_pre_post_conversion_on_str (str, coding, encodep)
6159 Lisp_Object str;
6160 struct coding_system *coding;
6161 int encodep;
6162 {
6163 int count = SPECPDL_INDEX ();
6164 struct gcpro gcpro1, gcpro2;
6165 int multibyte = STRING_MULTIBYTE (str);
6166 Lisp_Object old_deactivate_mark;
6167 Lisp_Object buffer_to_kill;
6168 Lisp_Object unwind_arg;
6169
6170 record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
6171 /* It is not crucial to specbind this. */
6172 old_deactivate_mark = Vdeactivate_mark;
6173 GCPRO2 (str, old_deactivate_mark);
6174
6175 /* We must insert the contents of STR as is without
6176 unibyte<->multibyte conversion. For that, we adjust the
6177 multibyteness of the working buffer to that of STR. */
6178 buffer_to_kill = set_conversion_work_buffer (multibyte);
6179 if (NILP (buffer_to_kill))
6180 unwind_arg = Fcons (Vlast_coding_system_used, Qnil);
6181 else
6182 unwind_arg = list2 (Vlast_coding_system_used, buffer_to_kill);
6183 record_unwind_protect (code_convert_region_unwind, unwind_arg);
6184
6185 insert_from_string (str, 0, 0,
6186 SCHARS (str), SBYTES (str), 0);
6187 UNGCPRO;
6188 inhibit_pre_post_conversion = 1;
6189 if (encodep)
6190 {
6191 struct buffer *prev = current_buffer;
6192
6193 call2 (coding->pre_write_conversion, make_number (BEG), make_number (Z));
6194 if (prev != current_buffer)
6195 /* We must kill the current buffer too. */
6196 Fsetcdr (unwind_arg, Fcons (Fcurrent_buffer (), XCDR (unwind_arg)));
6197 }
6198 else
6199 {
6200 Vlast_coding_system_used = coding->symbol;
6201 TEMP_SET_PT_BOTH (BEG, BEG_BYTE);
6202 call1 (coding->post_read_conversion, make_number (Z - BEG));
6203 coding->symbol = Vlast_coding_system_used;
6204 }
6205 inhibit_pre_post_conversion = 0;
6206 Vdeactivate_mark = old_deactivate_mark;
6207 str = make_buffer_string (BEG, Z, 1);
6208 return unbind_to (count, str);
6209 }
6210
6211
6212 /* Run pre-write-conversion function of CODING on NCHARS/NBYTES
6213 text in *STR. *SIZE is the allocated bytes for STR. As it
6214 is intended that this function is called from encode_terminal_code,
6215 the pre-write-conversion function is run by safe_call and thus
6216 "Error during redisplay: ..." is logged when an error occurs.
6217
6218 Store the resulting text in *STR and set CODING->produced_char and
6219 CODING->produced to the number of characters and bytes
6220 respectively. If the size of *STR is too small, enlarge it by
6221 xrealloc and update *STR and *SIZE. */
6222
6223 void
6224 run_pre_write_conversin_on_c_str (str, size, nchars, nbytes, coding)
6225 unsigned char **str;
6226 int *size, nchars, nbytes;
6227 struct coding_system *coding;
6228 {
6229 struct gcpro gcpro1, gcpro2;
6230 struct buffer *cur = current_buffer;
6231 struct buffer *prev;
6232 Lisp_Object old_deactivate_mark, old_last_coding_system_used;
6233 Lisp_Object args[3];
6234 Lisp_Object buffer_to_kill;
6235
6236 /* It is not crucial to specbind this. */
6237 old_deactivate_mark = Vdeactivate_mark;
6238 old_last_coding_system_used = Vlast_coding_system_used;
6239 GCPRO2 (old_deactivate_mark, old_last_coding_system_used);
6240
6241 /* We must insert the contents of STR as is without
6242 unibyte<->multibyte conversion. For that, we adjust the
6243 multibyteness of the working buffer to that of STR. */
6244 buffer_to_kill = set_conversion_work_buffer (coding->src_multibyte);
6245 insert_1_both (*str, nchars, nbytes, 0, 0, 0);
6246 UNGCPRO;
6247 inhibit_pre_post_conversion = 1;
6248 prev = current_buffer;
6249 args[0] = coding->pre_write_conversion;
6250 args[1] = make_number (BEG);
6251 args[2] = make_number (Z);
6252 safe_call (3, args);
6253 inhibit_pre_post_conversion = 0;
6254 Vdeactivate_mark = old_deactivate_mark;
6255 Vlast_coding_system_used = old_last_coding_system_used;
6256 coding->produced_char = Z - BEG;
6257 coding->produced = Z_BYTE - BEG_BYTE;
6258 if (coding->produced > *size)
6259 {
6260 *size = coding->produced;
6261 *str = xrealloc (*str, *size);
6262 }
6263 if (BEG < GPT && GPT < Z)
6264 move_gap (BEG);
6265 bcopy (BEG_ADDR, *str, coding->produced);
6266 coding->src_multibyte
6267 = ! NILP (current_buffer->enable_multibyte_characters);
6268 if (prev != current_buffer)
6269 Fkill_buffer (Fcurrent_buffer ());
6270 set_buffer_internal (cur);
6271 if (! NILP (buffer_to_kill))
6272 Fkill_buffer (buffer_to_kill);
6273 }
6274
6275
/* Decode the string STR according to CODING and return the result.

   When CODING requires detection, the text type and the eol format
   are detected from the bytes of STR first. A multibyte STR is
   converted with Fstring_as_unibyte before decoding. The destination
   is multibyte unless the type of CODING is coding_type_no_conversion
   or coding_type_raw_text.

   If no decoding is required and CODING->post_read_conversion is not
   a symbol with a function definition, STR itself is returned when
   NOCOPY is nonzero (or when it was already replaced by a converted
   string), and a copy made by Fcopy_sequence otherwise.

   Otherwise the text is decoded into a conversion buffer, a new
   string is built from it, compositions are restored, and
   CODING->post_read_conversion, if callable, is run on the new string
   by run_pre_post_conversion_on_str.

   CODING->consumed, CODING->consumed_char, CODING->produced and
   CODING->produced_char are set before returning. */
6276 Lisp_Object
6277 decode_coding_string (str, coding, nocopy)
6278 Lisp_Object str;
6279 struct coding_system *coding;
6280 int nocopy;
6281 {
6282 int len;
6283 struct conversion_buffer buf;
6284 int from, to_byte;
6285 Lisp_Object saved_coding_symbol;
6286 int result;
6287 int require_decoding;
/* Number of heading plus tailing bytes excluded from the conversion. */
6288 int shrinked_bytes = 0;
6289 Lisp_Object newstr;
/* Totals accumulated over all the calls of decode_coding below. */
6290 int consumed, consumed_char, produced, produced_char;
6291
6292 from = 0;
6293 to_byte = SBYTES (str);
6294
6295 saved_coding_symbol = coding->symbol;
6296 coding->src_multibyte = STRING_MULTIBYTE (str);
6297 coding->dst_multibyte = 1;
6298 coding->heading_ascii = 0;
6299
6300 if (CODING_REQUIRE_DETECTION (coding))
6301 {
6302 /* See the comments in code_convert_region. */
6303 if (coding->type == coding_type_undecided)
6304 {
6305 detect_coding (coding, SDATA (str), to_byte);
6306 if (coding->type == coding_type_undecided)
6307 {
6308 coding->type = coding_type_emacs_mule;
6309 coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE;
6310 /* As emacs-mule decoder will handle composition, we
6311 need this setting to allocate coding->cmp_data
6312 later. */
6313 coding->composing = COMPOSITION_NO;
6314 }
6315 }
6316 if (coding->eol_type == CODING_EOL_UNDECIDED
6317 && coding->type != coding_type_ccl)
6318 {
6319 saved_coding_symbol = coding->symbol;
6320 detect_eol (coding, SDATA (str), to_byte);
6321 if (coding->eol_type == CODING_EOL_UNDECIDED)
6322 coding->eol_type = CODING_EOL_LF;
6323 /* We had better recover the original eol format if we
6324 encounter an inconsistent eol format while decoding. */
6325 coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
6326 }
6327 }
6328
6329 if (coding->type == coding_type_no_conversion
6330 || coding->type == coding_type_raw_text)
6331 coding->dst_multibyte = 0;
6332
6333 require_decoding = CODING_REQUIRE_DECODING (coding);
6334
6335 if (STRING_MULTIBYTE (str))
6336 {
6337 /* Decoding routines expect the source text to be unibyte. */
6338 str = Fstring_as_unibyte (str);
6339 to_byte = SBYTES (str);
6340 nocopy = 1;
6341 coding->src_multibyte = 0;
6342 }
6343
6344 /* Try to skip the heading and tailing ASCIIs. */
6345 if (require_decoding && coding->type != coding_type_ccl)
6346 {
6347 SHRINK_CONVERSION_REGION (&from, &to_byte, coding, SDATA (str),
6348 0);
6349 if (from == to_byte)
6350 require_decoding = 0;
6351 shrinked_bytes = from + (SBYTES (str) - to_byte);
6352 }
6353
/* Nothing to decode and no callable post-read-conversion function:
   return STR (or a copy of it) without using a conversion buffer. */
6354 if (!require_decoding
6355 && !(SYMBOLP (coding->post_read_conversion)
6356 && !NILP (Ffboundp (coding->post_read_conversion))))
6357 {
6358 coding->consumed = SBYTES (str);
6359 coding->consumed_char = SCHARS (str);
6360 if (coding->dst_multibyte)
6361 {
6362 str = Fstring_as_multibyte (str);
6363 nocopy = 1;
6364 }
6365 coding->produced = SBYTES (str);
6366 coding->produced_char = SCHARS (str);
6367 return (nocopy ? str : Fcopy_sequence (str));
6368 }
6369
6370 if (coding->composing != COMPOSITION_DISABLED)
6371 coding_allocate_composition_data (coding, from);
6372 len = decoding_buffer_size (coding, to_byte - from);
6373 allocate_conversion_buffer (buf, len);
6374
/* Decode batch by batch. Each call resumes after the bytes already
   consumed and appends after the bytes already produced; the loop
   ends on normal finish, on interrupt, or when the source is
   insufficient and the last call consumed nothing. */
6375 consumed = consumed_char = produced = produced_char = 0;
6376 while (1)
6377 {
6378 result = decode_coding (coding, SDATA (str) + from + consumed,
6379 buf.data + produced, to_byte - from - consumed,
6380 buf.size - produced);
6381 consumed += coding->consumed;
6382 consumed_char += coding->consumed_char;
6383 produced += coding->produced;
6384 produced_char += coding->produced_char;
6385 if (result == CODING_FINISH_NORMAL
6386 || result == CODING_FINISH_INTERRUPT
6387 || (result == CODING_FINISH_INSUFFICIENT_SRC
6388 && coding->consumed == 0))
6389 break;
6390 if (result == CODING_FINISH_INSUFFICIENT_CMP)
6391 coding_allocate_composition_data (coding, from + produced_char);
6392 else if (result == CODING_FINISH_INSUFFICIENT_DST)
6393 extend_conversion_buffer (&buf);
6394 else if (result == CODING_FINISH_INCONSISTENT_EOL)
6395 {
6396 Lisp_Object eol_type;
6397
6398 /* Recover the original EOL format. */
6399 if (coding->eol_type == CODING_EOL_CR)
6400 {
6401 unsigned char *p;
6402 for (p = buf.data; p < buf.data + produced; p++)
6403 if (*p == '\n') *p = '\r';
6404 }
6405 else if (coding->eol_type == CODING_EOL_CRLF)
6406 {
6407 int num_eol = 0;
6408 unsigned char *p0, *p1;
6409 for (p0 = buf.data, p1 = p0 + produced; p0 < p1; p0++)
6410 if (*p0 == '\n') num_eol++;
6411 if (produced + num_eol >= buf.size)
6412 extend_conversion_buffer (&buf);
/* Copy backward from the end so that the text can be expanded in
   place, putting a '\r' in front of each '\n'. */
6413 for (p0 = buf.data + produced, p1 = p0 + num_eol; p0 > buf.data;)
6414 {
6415 *--p1 = *--p0;
6416 if (*p0 == '\n') *--p1 = '\r';
6417 }
6418 produced += num_eol;
6419 produced_char += num_eol;
6420 }
6421 /* Suppress eol-format conversion in the further conversion. */
6422 coding->eol_type = CODING_EOL_LF;
6423
6424 /* Set the coding system symbol to that for Unix-like EOL. */
6425 eol_type = Fget (saved_coding_symbol, Qeol_type);
6426 if (VECTORP (eol_type)
6427 && XVECTOR (eol_type)->size == 3
6428 && SYMBOLP (XVECTOR (eol_type)->contents[CODING_EOL_LF]))
6429 coding->symbol = XVECTOR (eol_type)->contents[CODING_EOL_LF];
6430 else
6431 coding->symbol = saved_coding_symbol;
6432
6433
6434 }
6435 }
6436
6437 coding->consumed = consumed;
6438 coding->consumed_char = consumed_char;
6439 coding->produced = produced;
6440 coding->produced_char = produced_char;
6441
/* Build the result: the skipped head of STR, the decoded text, then
   the skipped tail of STR. */
6442 if (coding->dst_multibyte)
6443 newstr = make_uninit_multibyte_string (produced_char + shrinked_bytes,
6444 produced + shrinked_bytes);
6445 else
6446 newstr = make_uninit_string (produced + shrinked_bytes);
6447 if (from > 0)
6448 STRING_COPYIN (newstr, 0, SDATA (str), from);
6449 STRING_COPYIN (newstr, from, buf.data, produced);
6450 if (shrinked_bytes > from)
6451 STRING_COPYIN (newstr, from + produced,
6452 SDATA (str) + to_byte,
6453 shrinked_bytes - from);
6454 free_conversion_buffer (&buf);
6455
/* The skipped bytes are added to the byte counts and to the character
   counts alike. */
6456 coding->consumed += shrinked_bytes;
6457 coding->consumed_char += shrinked_bytes;
6458 coding->produced += shrinked_bytes;
6459 coding->produced_char += shrinked_bytes;
6460
6461 if (coding->cmp_data && coding->cmp_data->used)
6462 coding_restore_composition (coding, newstr);
6463 coding_free_composition_data (coding);
6464
6465 if (SYMBOLP (coding->post_read_conversion)
6466 && !NILP (Ffboundp (coding->post_read_conversion)))
6467 newstr = run_pre_post_conversion_on_str (newstr, coding, 0);
6468
6469 return newstr;
6470 }
6471
6472 Lisp_Object
6473 encode_coding_string (str, coding, nocopy)
6474 Lisp_Object str;
6475 struct coding_system *coding;
6476 int nocopy;
6477 {
6478 int len;
6479 struct conversion_buffer buf;
6480 int from, to, to_byte;
6481 int result;
6482 int shrinked_bytes = 0;
6483 Lisp_Object newstr;
6484 int consumed, consumed_char, produced, produced_char;
6485
6486 if (SYMBOLP (coding->pre_write_conversion)
6487 && !NILP (Ffboundp (coding->pre_write_conversion)))
6488 {
6489 str = run_pre_post_conversion_on_str (str, coding, 1);
6490 /* As STR is just newly generated, we don't have to copy it
6491 anymore. */
6492 nocopy = 1;
6493 }
6494
6495 from = 0;
6496 to = SCHARS (str);
6497 to_byte = SBYTES (str);
6498
6499 /* Encoding routines determine the multibyteness of the source text
6500 by coding->src_multibyte. */
6501 coding->src_multibyte = SCHARS (str) < SBYTES (str);
6502 coding->dst_multibyte = 0;
6503 if (! CODING_REQUIRE_ENCODING (coding))
6504 goto no_need_of_encoding;
6505
6506 if (coding->composing != COMPOSITION_DISABLED)
6507 coding_save_composition (coding, from, to, str);
6508
6509 /* Try to skip the heading and tailing ASCIIs. We can't skip them
6510 if we must run CCL program or there are compositions to
6511 encode. */
6512 coding->heading_ascii = 0;
6513 if (coding->type != coding_type_ccl
6514 && (! coding->cmp_data || coding->cmp_data->used == 0))
6515 {
6516 SHRINK_CONVERSION_REGION (&from, &to_byte, coding, SDATA (str),
6517 1);
6518 if (from == to_byte)
6519 {
6520 coding_free_composition_data (coding);
6521 goto no_need_of_encoding;
6522 }
6523 shrinked_bytes = from + (SBYTES (str) - to_byte);
6524 }
6525
6526 len = encoding_buffer_size (coding, to_byte - from);
6527 allocate_conversion_buffer (buf, len);
6528
6529 consumed = consumed_char = produced = produced_char = 0;
6530 while (1)
6531 {
6532 result = encode_coding (coding, SDATA (str) + from + consumed,
6533 buf.data + produced, to_byte - from - consumed,
6534 buf.size - produced);
6535 consumed += coding->consumed;
6536 consumed_char += coding->consumed_char;
6537 produced += coding->produced;
6538 produced_char += coding->produced_char;
6539 if (result == CODING_FINISH_NORMAL
6540 || result == CODING_FINISH_INTERRUPT
6541 || (result == CODING_FINISH_INSUFFICIENT_SRC
6542 && coding->consumed == 0))
6543 break;
6544 /* Now result should be CODING_FINISH_INSUFFICIENT_DST. */
6545 extend_conversion_buffer (&buf);
6546 }
6547
6548 coding->consumed = consumed;
6549 coding->consumed_char = consumed_char;
6550 coding->produced = produced;
6551 coding->produced_char = produced_char;
6552
6553 newstr = make_uninit_string (produced + shrinked_bytes);
6554 if (from > 0)
6555 STRING_COPYIN (newstr, 0, SDATA (str), from);
6556 STRING_COPYIN (newstr, from, buf.data, produced);
6557 if (shrinked_bytes > from)
6558 STRING_COPYIN (newstr, from + produced,
6559 SDATA (str) + to_byte,
6560 shrinked_bytes - from);
6561
6562 free_conversion_buffer (&buf);
6563 coding_free_composition_data (coding);
6564
6565 return newstr;
6566
6567 no_need_of_encoding:
6568 coding->consumed = SBYTES (str);
6569 coding->consumed_char = SCHARS (str);
6570 if (STRING_MULTIBYTE (str))
6571 {
6572 if (nocopy)
6573 /* We are sure that STR doesn't contain a multibyte
6574 character. */
6575 STRING_SET_UNIBYTE (str);
6576 else
6577 {
6578 str = Fstring_as_unibyte (str);
6579 nocopy = 1;
6580 }
6581 }
6582 coding->produced = SBYTES (str);
6583 coding->produced_char = SCHARS (str);
6584 return (nocopy ? str : Fcopy_sequence (str));
6585 }
6586
6587 \f
6588 #ifdef emacs
6589 /*** 8. Emacs Lisp library functions ***/
6590
6591 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
6592 doc: /* Return t if OBJECT is nil or a coding-system.
6593 See the documentation of `make-coding-system' for information
6594 about coding-system objects. */)
6595 (obj)
6596 Lisp_Object obj;
6597 {
6598 if (NILP (obj))
6599 return Qt;
6600 if (!SYMBOLP (obj))
6601 return Qnil;
6602 if (! NILP (Fget (obj, Qcoding_system_define_form)))
6603 return Qt;
6604 /* Get coding-spec vector for OBJ. */
6605 obj = Fget (obj, Qcoding_system);
6606 return ((VECTORP (obj) && XVECTOR (obj)->size == 5)
6607 ? Qt : Qnil);
6608 }
6609
6610 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
6611 Sread_non_nil_coding_system, 1, 1, 0,
6612 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT. */)
6613 (prompt)
6614 Lisp_Object prompt;
6615 {
6616 Lisp_Object val;
6617 do
6618 {
6619 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6620 Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
6621 }
6622 while (SCHARS (val) == 0);
6623 return (Fintern (val, Qnil));
6624 }
6625
6626 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
6627 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
6628 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.
6629 Ignores case when completing coding systems (all Emacs coding systems
6630 are lower-case). */)
6631 (prompt, default_coding_system)
6632 Lisp_Object prompt, default_coding_system;
6633 {
6634 Lisp_Object val;
6635 int count = SPECPDL_INDEX ();
6636
6637 if (SYMBOLP (default_coding_system))
6638 default_coding_system = SYMBOL_NAME (default_coding_system);
6639 specbind (Qcompletion_ignore_case, Qt);
6640 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6641 Qt, Qnil, Qcoding_system_history,
6642 default_coding_system, Qnil);
6643 unbind_to (count, Qnil);
6644 return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
6645 }
6646
6647 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
6648 1, 1, 0,
6649 doc: /* Check validity of CODING-SYSTEM.
6650 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
6651 It is valid if it is nil or a symbol with a non-nil `coding-system' property.
6652 The value of this property should be a vector of length 5. */)
6653 (coding_system)
6654 Lisp_Object coding_system;
6655 {
6656 Lisp_Object define_form;
6657
6658 define_form = Fget (coding_system, Qcoding_system_define_form);
6659 if (! NILP (define_form))
6660 {
6661 Fput (coding_system, Qcoding_system_define_form, Qnil);
6662 safe_eval (define_form);
6663 }
6664 if (!NILP (Fcoding_system_p (coding_system)))
6665 return coding_system;
6666 xsignal1 (Qcoding_system_error, coding_system);
6667 }
6668 \f
6669 Lisp_Object
6670 detect_coding_system (src, src_bytes, highest, multibytep)
6671 const unsigned char *src;
6672 int src_bytes, highest;
6673 int multibytep;
6674 {
6675 int coding_mask, eol_type;
6676 Lisp_Object val, tmp;
6677 int dummy;
6678
6679 coding_mask = detect_coding_mask (src, src_bytes, NULL, &dummy, multibytep);
6680 eol_type = detect_eol_type (src, src_bytes, &dummy);
6681 if (eol_type == CODING_EOL_INCONSISTENT)
6682 eol_type = CODING_EOL_UNDECIDED;
6683
6684 if (!coding_mask)
6685 {
6686 val = Qundecided;
6687 if (eol_type != CODING_EOL_UNDECIDED)
6688 {
6689 Lisp_Object val2;
6690 val2 = Fget (Qundecided, Qeol_type);
6691 if (VECTORP (val2))
6692 val = XVECTOR (val2)->contents[eol_type];
6693 }
6694 return (highest ? val : Fcons (val, Qnil));
6695 }
6696
6697 /* At first, gather possible coding systems in VAL. */
6698 val = Qnil;
6699 for (tmp = Vcoding_category_list; CONSP (tmp); tmp = XCDR (tmp))
6700 {
6701 Lisp_Object category_val, category_index;
6702
6703 category_index = Fget (XCAR (tmp), Qcoding_category_index);
6704 category_val = Fsymbol_value (XCAR (tmp));
6705 if (!NILP (category_val)
6706 && NATNUMP (category_index)
6707 && (coding_mask & (1 << XFASTINT (category_index))))
6708 {
6709 val = Fcons (category_val, val);
6710 if (highest)
6711 break;
6712 }
6713 }
6714 if (!highest)
6715 val = Fnreverse (val);
6716
6717 /* Then, replace the elements with subsidiary coding systems. */
6718 for (tmp = val; CONSP (tmp); tmp = XCDR (tmp))
6719 {
6720 if (eol_type != CODING_EOL_UNDECIDED
6721 && eol_type != CODING_EOL_INCONSISTENT)
6722 {
6723 Lisp_Object eol;
6724 eol = Fget (XCAR (tmp), Qeol_type);
6725 if (VECTORP (eol))
6726 XSETCAR (tmp, XVECTOR (eol)->contents[eol_type]);
6727 }
6728 }
6729 return (highest ? XCAR (val) : val);
6730 }
6731
6732 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
6733 2, 3, 0,
6734 doc: /* Detect how the byte sequence in the region is encoded.
6735 Return a list of possible coding systems used on decoding a byte
6736 sequence containing the bytes in the region between START and END when
6737 the coding system `undecided' is specified. The list is ordered by
6738 priority decided in the current language environment.
6739
6740 If only ASCII characters are found (except for such ISO-2022 control
6741 characters ISO-2022 as ESC), it returns a list of single element
6742 `undecided' or its subsidiary coding system according to a detected
6743 end-of-line format.
6744
6745 If optional argument HIGHEST is non-nil, return the coding system of
6746 highest priority. */)
6747 (start, end, highest)
6748 Lisp_Object start, end, highest;
6749 {
6750 int from, to;
6751 int from_byte, to_byte;
6752 int include_anchor_byte = 0;
6753
6754 CHECK_NUMBER_COERCE_MARKER (start);
6755 CHECK_NUMBER_COERCE_MARKER (end);
6756
6757 validate_region (&start, &end);
6758 from = XINT (start), to = XINT (end);
6759 from_byte = CHAR_TO_BYTE (from);
6760 to_byte = CHAR_TO_BYTE (to);
6761
6762 if (from < GPT && to >= GPT)
6763 move_gap_both (to, to_byte);
6764 /* If we an anchor byte `\0' follows the region, we include it in
6765 the detecting source. Then code detectors can handle the tailing
6766 byte sequence more accurately.
6767
6768 Fix me: This is not a perfect solution. It is better that we
6769 add one more argument, say LAST_BLOCK, to all detect_coding_XXX.
6770 */
6771 if (to == Z || (to == GPT && GAP_SIZE > 0))
6772 include_anchor_byte = 1;
6773 return detect_coding_system (BYTE_POS_ADDR (from_byte),
6774 to_byte - from_byte + include_anchor_byte,
6775 !NILP (highest),
6776 !NILP (current_buffer
6777 ->enable_multibyte_characters));
6778 }
6779
6780 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
6781 1, 2, 0,
6782 doc: /* Detect how the byte sequence in STRING is encoded.
6783 Return a list of possible coding systems used on decoding a byte
6784 sequence containing the bytes in STRING when the coding system
6785 `undecided' is specified. The list is ordered by priority decided in
6786 the current language environment.
6787
6788 If only ASCII characters are found (except for such ISO-2022 control
6789 characters ISO-2022 as ESC), it returns a list of single element
6790 `undecided' or its subsidiary coding system according to a detected
6791 end-of-line format.
6792
6793 If optional argument HIGHEST is non-nil, return the coding system of
6794 highest priority. */)
6795 (string, highest)
6796 Lisp_Object string, highest;
6797 {
6798 CHECK_STRING (string);
6799
6800 return detect_coding_system (SDATA (string),
6801 /* "+ 1" is to include the anchor byte
6802 `\0'. With this, code detectors can
6803 handle the tailing bytes more
6804 accurately. */
6805 SBYTES (string) + 1,
6806 !NILP (highest),
6807 STRING_MULTIBYTE (string));
6808 }
6809
6810 /* Subroutine for Ffind_coding_systems_region_internal.
6811
6812 Return a list of coding systems that safely encode the multibyte
6813 text between P and PEND. SAFE_CODINGS, if non-nil, is an alist of
6814 possible coding systems. If it is nil, it means that we have not
6815 yet found any coding systems.
6816
6817 WORK_TABLE is a char-table whose element for a character is set to
6818 t once that character has been looked up.
6819
6820 If a non-ASCII single byte char is found, set
6821 *single_byte_char_found to 1.

   SAFE_CODINGS is modified destructively: an entry of the form
   (CODING . SAFE-CHARS) may be replaced by a five-element list, and
   an entry that cannot encode some character is unlinked. The
   return value is what remains of the list. */
6822
6823 static Lisp_Object
6824 find_safe_codings (p, pend, safe_codings, work_table, single_byte_char_found)
6825 unsigned char *p, *pend;
6826 Lisp_Object safe_codings, work_table;
6827 int *single_byte_char_found;
6828 {
6829 int c, len;
6830 Lisp_Object val, ch;
6831 Lisp_Object prev, tail;
6832
/* With no candidate at all, only the scan for a non-ASCII single-byte
   char at the end of this function is left to do. */
6833 if (NILP (safe_codings))
6834 goto done_safe_codings;
6835 while (p < pend)
6836 {
6837 c = STRING_CHAR_AND_LENGTH (p, pend - p, len);
6838 p += len;
6839 if (ASCII_BYTE_P (c))
6840 /* We can ignore ASCII characters here. */
6841 continue;
6842 if (SINGLE_BYTE_CHAR_P (c))
6843 *single_byte_char_found = 1;
6844 /* Check the safe coding systems for C. */
6845 ch = make_number (c);
6846 val = Faref (work_table, ch);
6847 if (EQ (val, Qt))
6848 /* This element was already checked. Ignore it. */
6849 continue;
6850 /* Remember that we checked this element. */
6851 Faset (work_table, ch, Qt);
6852
/* Test C against every remaining candidate. PREV follows TAIL as the
   last cons that was kept, so that a rejected entry can be unlinked
   with XSETCDR. */
6853 for (prev = tail = safe_codings; CONSP (tail); tail = XCDR (tail))
6854 {
/* TRANSLATION_TABLE, HASH_TABLE and ACCEPT_LATIN_EXTRA are set only
   when ENCODABLE is zero, which is the only case in which they are
   read below. */
6855 Lisp_Object elt, translation_table, hash_table, accept_latin_extra;
6856 int encodable;
6857
6858 elt = XCAR (tail);
6859 if (CONSP (XCDR (elt)))
6860 {
6861 /* This entry has this format now:
6862 ( CODING SAFE-CHARS TRANSLATION-TABLE HASH-TABLE
6863 ACCEPT-LATIN-EXTRA ) */
6864 val = XCDR (elt);
6865 encodable = ! NILP (Faref (XCAR (val), ch));
6866 if (! encodable)
6867 {
6868 val = XCDR (val);
6869 translation_table = XCAR (val);
6870 hash_table = XCAR (XCDR (val));
6871 accept_latin_extra = XCAR (XCDR (XCDR (val)));
6872 }
6873 }
6874 else
6875 {
6876 /* This entry has this format now: ( CODING . SAFE-CHARS) */
6877 encodable = ! NILP (Faref (XCDR (elt), ch));
6878 if (! encodable)
6879 {
6880 /* Transform the format to:
6881 ( CODING SAFE-CHARS TRANSLATION-TABLE HASH-TABLE
6882 ACCEPT-LATIN-EXTRA ) */
6883 val = Fget (XCAR (elt), Qcoding_system);
6884 translation_table
6885 = Fplist_get (AREF (val, 3),
6886 Qtranslation_table_for_encode);
6887 if (SYMBOLP (translation_table))
6888 translation_table = Fget (translation_table,
6889 Qtranslation_table);
6890 hash_table
6891 = (CHAR_TABLE_P (translation_table)
6892 ? XCHAR_TABLE (translation_table)->extras[1]
6893 : Qnil);
6894 accept_latin_extra
6895 = ((EQ (AREF (val, 0), make_number (2))
6896 && VECTORP (AREF (val, 4)))
6897 ? AREF (AREF (val, 4), 16)
6898 : Qnil);
6899 XSETCAR (tail, list5 (XCAR (elt), XCDR (elt),
6900 translation_table, hash_table,
6901 accept_latin_extra));
6902 }
6903 }
6904
/* C not being in SAFE-CHARS is not final: it is still accepted if the
   translation table or the hash table has a non-nil entry for it, or
   if it is a single-byte char with a non-nil element in
   Vlatin_extra_code_table and ACCEPT-LATIN-EXTRA is non-nil. */
6905 if (! encodable
6906 && ((CHAR_TABLE_P (translation_table)
6907 && ! NILP (Faref (translation_table, ch)))
6908 || (HASH_TABLE_P (hash_table)
6909 && ! NILP (Fgethash (ch, hash_table, Qnil)))
6910 || (SINGLE_BYTE_CHAR_P (c)
6911 && ! NILP (accept_latin_extra)
6912 && VECTORP (Vlatin_extra_code_table)
6913 && ! NILP (AREF (Vlatin_extra_code_table, c)))))
6914 encodable = 1;
6915 if (encodable)
6916 prev = tail;
6917 else
6918 {
6919 /* Exclude this coding system from SAFE_CODINGS. */
6920 if (EQ (tail, safe_codings))
6921 {
6922 safe_codings = XCDR (safe_codings);
6923 if (NILP (safe_codings))
6924 goto done_safe_codings;
6925 }
6926 else
6927 XSETCDR (prev, XCDR (tail));
6928 }
6929 }
6930 }
6931
6932 done_safe_codings:
6933 /* If the above loop was terminated before P reaches PEND, it means
6934 SAFE_CODINGS was set to nil. If we have not yet found an
6935 non-ASCII single-byte char, check it now. */
6936 if (! *single_byte_char_found)
6937 while (p < pend)
6938 {
6939 c = STRING_CHAR_AND_LENGTH (p, pend - p, len);
6940 p += len;
6941 if (! ASCII_BYTE_P (c)
6942 && SINGLE_BYTE_CHAR_P (c))
6943 {
6944 *single_byte_char_found = 1;
6945 break;
6946 }
6947 }
6948 return safe_codings;
6949 }
6950
6951 DEFUN ("find-coding-systems-region-internal",
6952 Ffind_coding_systems_region_internal,
6953 Sfind_coding_systems_region_internal, 2, 2, 0,
6954 doc: /* Internal use only. */)
6955 (start, end)
6956 Lisp_Object start, end;
6957 {
6958 Lisp_Object work_table, safe_codings;
6959 int non_ascii_p = 0;
6960 int single_byte_char_found = 0;
6961 const unsigned char *p1, *p1end, *p2, *p2end, *p;
6962
6963 if (STRINGP (start))
6964 {
6965 if (!STRING_MULTIBYTE (start))
6966 return Qt;
6967 p1 = SDATA (start), p1end = p1 + SBYTES (start);
6968 p2 = p2end = p1end;
6969 if (SCHARS (start) != SBYTES (start))
6970 non_ascii_p = 1;
6971 }
6972 else
6973 {
6974 int from, to, stop;
6975
6976 CHECK_NUMBER_COERCE_MARKER (start);
6977 CHECK_NUMBER_COERCE_MARKER (end);
6978 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
6979 args_out_of_range (start, end);
6980 if (NILP (current_buffer->enable_multibyte_characters))
6981 return Qt;
6982 from = CHAR_TO_BYTE (XINT (start));
6983 to = CHAR_TO_BYTE (XINT (end));
6984 stop = from < GPT_BYTE && GPT_BYTE < to ? GPT_BYTE : to;
6985 p1 = BYTE_POS_ADDR (from), p1end = p1 + (stop - from);
6986 if (stop == to)
6987 p2 = p2end = p1end;
6988 else
6989 p2 = BYTE_POS_ADDR (stop), p2end = p2 + (to - stop);
6990 if (XINT (end) - XINT (start) != to - from)
6991 non_ascii_p = 1;
6992 }
6993
6994 if (!non_ascii_p)
6995 {
6996 /* We are sure that the text contains no multibyte character.
6997 Check if it contains eight-bit-graphic. */
6998 p = p1;
6999 for (p = p1; p < p1end && ASCII_BYTE_P (*p); p++);
7000 if (p == p1end)
7001 {
7002 for (p = p2; p < p2end && ASCII_BYTE_P (*p); p++);
7003 if (p == p2end)
7004 return Qt;
7005 }
7006 }
7007
7008 /* The text contains non-ASCII characters. */
7009
7010 work_table = Fmake_char_table (Qchar_coding_system, Qnil);
7011 safe_codings = Fcopy_sequence (XCDR (Vcoding_system_safe_chars));
7012
7013 safe_codings = find_safe_codings (p1, p1end, safe_codings, work_table,
7014 &single_byte_char_found);
7015 if (p2 < p2end)
7016 safe_codings = find_safe_codings (p2, p2end, safe_codings, work_table,
7017 &single_byte_char_found);
7018 if (EQ (safe_codings, XCDR (Vcoding_system_safe_chars)))
7019 safe_codings = Qt;
7020 else
7021 {
7022 /* Turn safe_codings to a list of coding systems... */
7023 Lisp_Object val;
7024
7025 if (single_byte_char_found)
7026 /* ... and append these for eight-bit chars. */
7027 val = Fcons (Qraw_text,
7028 Fcons (Qemacs_mule, Fcons (Qno_conversion, Qnil)));
7029 else
7030 /* ... and append generic coding systems. */
7031 val = Fcopy_sequence (XCAR (Vcoding_system_safe_chars));
7032
7033 for (; CONSP (safe_codings); safe_codings = XCDR (safe_codings))
7034 val = Fcons (XCAR (XCAR (safe_codings)), val);
7035 safe_codings = val;
7036 }
7037
7038 return safe_codings;
7039 }
7040
7041
7042 /* Search from position POS for such characters that are unencodable
7043 accoding to SAFE_CHARS, and return a list of their positions. P
7044 points where in the memory the character at POS exists. Limit the
7045 search at PEND or when Nth unencodable characters are found.
7046
7047 If SAFE_CHARS is a char table, an element for an unencodable
7048 character is nil.
7049
7050 If SAFE_CHARS is nil, all non-ASCII characters are unencodable.
7051
7052 Otherwise, SAFE_CHARS is t, and only eight-bit-contrl and
7053 eight-bit-graphic characters are unencodable. */
7054
7055 static Lisp_Object
7056 unencodable_char_position (safe_chars, pos, p, pend, n)
7057 Lisp_Object safe_chars;
7058 int pos;
7059 unsigned char *p, *pend;
7060 int n;
7061 {
7062 Lisp_Object pos_list;
7063
7064 pos_list = Qnil;
7065 while (p < pend)
7066 {
7067 int len;
7068 int c = STRING_CHAR_AND_LENGTH (p, MAX_MULTIBYTE_LENGTH, len);
7069
7070 if (c >= 128
7071 && (CHAR_TABLE_P (safe_chars)
7072 ? NILP (CHAR_TABLE_REF (safe_chars, c))
7073 : (NILP (safe_chars) || c < 256)))
7074 {
7075 pos_list = Fcons (make_number (pos), pos_list);
7076 if (--n <= 0)
7077 break;
7078 }
7079 pos++;
7080 p += len;
7081 }
7082 return Fnreverse (pos_list);
7083 }
7084
7085
7086 DEFUN ("unencodable-char-position", Funencodable_char_position,
7087 Sunencodable_char_position, 3, 5, 0,
7088 doc: /*
7089 Return position of first un-encodable character in a region.
7090 START and END specfiy the region and CODING-SYSTEM specifies the
7091 encoding to check. Return nil if CODING-SYSTEM does encode the region.
7092
7093 If optional 4th argument COUNT is non-nil, it specifies at most how
7094 many un-encodable characters to search. In this case, the value is a
7095 list of positions.
7096
7097 If optional 5th argument STRING is non-nil, it is a string to search
7098 for un-encodable characters. In that case, START and END are indexes
7099 to the string. */)
7100 (start, end, coding_system, count, string)
7101 Lisp_Object start, end, coding_system, count, string;
7102 {
7103 int n;
7104 Lisp_Object safe_chars;
7105 struct coding_system coding;
7106 Lisp_Object positions;
7107 int from, to;
7108 unsigned char *p, *pend;
7109
7110 if (NILP (string))
7111 {
7112 validate_region (&start, &end);
7113 from = XINT (start);
7114 to = XINT (end);
7115 if (NILP (current_buffer->enable_multibyte_characters))
7116 return Qnil;
7117 p = CHAR_POS_ADDR (from);
7118 if (to == GPT)
7119 pend = GPT_ADDR;
7120 else
7121 pend = CHAR_POS_ADDR (to);
7122 }
7123 else
7124 {
7125 CHECK_STRING (string);
7126 CHECK_NATNUM (start);
7127 CHECK_NATNUM (end);
7128 from = XINT (start);
7129 to = XINT (end);
7130 if (from > to
7131 || to > SCHARS (string))
7132 args_out_of_range_3 (string, start, end);
7133 if (! STRING_MULTIBYTE (string))
7134 return Qnil;
7135 p = SDATA (string) + string_char_to_byte (string, from);
7136 pend = SDATA (string) + string_char_to_byte (string, to);
7137 }
7138
7139 setup_coding_system (Fcheck_coding_system (coding_system), &coding);
7140
7141 if (NILP (count))
7142 n = 1;
7143 else
7144 {
7145 CHECK_NATNUM (count);
7146 n = XINT (count);
7147 }
7148
7149 if (coding.type == coding_type_no_conversion
7150 || coding.type == coding_type_raw_text)
7151 return Qnil;
7152
7153 if (coding.type == coding_type_undecided)
7154 safe_chars = Qnil;
7155 else
7156 safe_chars = coding_safe_chars (coding_system);
7157
7158 if (STRINGP (string)
7159 || from >= GPT || to <= GPT)
7160 positions = unencodable_char_position (safe_chars, from, p, pend, n);
7161 else
7162 {
7163 Lisp_Object args[2];
7164
7165 args[0] = unencodable_char_position (safe_chars, from, p, GPT_ADDR, n);
7166 n -= XINT (Flength (args[0]));
7167 if (n <= 0)
7168 positions = args[0];
7169 else
7170 {
7171 args[1] = unencodable_char_position (safe_chars, GPT, GAP_END_ADDR,
7172 pend, n);
7173 positions = Fappend (2, args);
7174 }
7175 }
7176
7177 return (NILP (count) ? Fcar (positions) : positions);
7178 }
7179
7180
7181 Lisp_Object
7182 code_convert_region1 (start, end, coding_system, encodep)
7183 Lisp_Object start, end, coding_system;
7184 int encodep;
7185 {
7186 struct coding_system coding;
7187 int from, to;
7188
7189 CHECK_NUMBER_COERCE_MARKER (start);
7190 CHECK_NUMBER_COERCE_MARKER (end);
7191 CHECK_SYMBOL (coding_system);
7192
7193 validate_region (&start, &end);
7194 from = XFASTINT (start);
7195 to = XFASTINT (end);
7196
7197 if (NILP (coding_system))
7198 return make_number (to - from);
7199
7200 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
7201 error ("Invalid coding system: %s", SDATA (SYMBOL_NAME (coding_system)));
7202
7203 coding.mode |= CODING_MODE_LAST_BLOCK;
7204 coding.src_multibyte = coding.dst_multibyte
7205 = !NILP (current_buffer->enable_multibyte_characters);
7206 code_convert_region (from, CHAR_TO_BYTE (from), to, CHAR_TO_BYTE (to),
7207 &coding, encodep, 1);
7208 Vlast_coding_system_used = coding.symbol;
7209 return make_number (coding.produced_char);
7210 }
7211
7212 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
7213 3, 3, "r\nzCoding system: ",
7214 doc: /* Decode the current region from the specified coding system.
7215 When called from a program, takes three arguments:
7216 START, END, and CODING-SYSTEM. START and END are buffer positions.
7217 This function sets `last-coding-system-used' to the precise coding system
7218 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7219 not fully specified.)
7220 It returns the length of the decoded text. */)
7221 (start, end, coding_system)
7222 Lisp_Object start, end, coding_system;
7223 {
7224 return code_convert_region1 (start, end, coding_system, 0);
7225 }
7226
7227 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
7228 3, 3, "r\nzCoding system: ",
7229 doc: /* Encode the current region into the specified coding system.
7230 When called from a program, takes three arguments:
7231 START, END, and CODING-SYSTEM. START and END are buffer positions.
7232 This function sets `last-coding-system-used' to the precise coding system
7233 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7234 not fully specified.)
7235 It returns the length of the encoded text. */)
7236 (start, end, coding_system)
7237 Lisp_Object start, end, coding_system;
7238 {
7239 return code_convert_region1 (start, end, coding_system, 1);
7240 }
7241
7242 Lisp_Object
7243 code_convert_string1 (string, coding_system, nocopy, encodep)
7244 Lisp_Object string, coding_system, nocopy;
7245 int encodep;
7246 {
7247 struct coding_system coding;
7248
7249 CHECK_STRING (string);
7250 CHECK_SYMBOL (coding_system);
7251
7252 if (NILP (coding_system))
7253 return (NILP (nocopy) ? Fcopy_sequence (string) : string);
7254
7255 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
7256 error ("Invalid coding system: %s", SDATA (SYMBOL_NAME (coding_system)));
7257
7258 coding.mode |= CODING_MODE_LAST_BLOCK;
7259 string = (encodep
7260 ? encode_coding_string (string, &coding, !NILP (nocopy))
7261 : decode_coding_string (string, &coding, !NILP (nocopy)));
7262 Vlast_coding_system_used = coding.symbol;
7263
7264 return string;
7265 }
7266
7267 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
7268 2, 3, 0,
7269 doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
7270 Optional arg NOCOPY non-nil means it is OK to return STRING itself
7271 if the decoding operation is trivial.
7272 This function sets `last-coding-system-used' to the precise coding system
7273 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7274 not fully specified.) */)
7275 (string, coding_system, nocopy)
7276 Lisp_Object string, coding_system, nocopy;
7277 {
7278 return code_convert_string1 (string, coding_system, nocopy, 0);
7279 }
7280
7281 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
7282 2, 3, 0,
7283 doc: /* Encode STRING to CODING-SYSTEM, and return the result.
7284 Optional arg NOCOPY non-nil means it is OK to return STRING itself
7285 if the encoding operation is trivial.
7286 This function sets `last-coding-system-used' to the precise coding system
7287 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7288 not fully specified.) */)
7289 (string, coding_system, nocopy)
7290 Lisp_Object string, coding_system, nocopy;
7291 {
7292 return code_convert_string1 (string, coding_system, nocopy, 1);
7293 }
7294
7295 /* Encode or decode STRING according to CODING_SYSTEM.
7296 Do not set Vlast_coding_system_used.
7297
7298 This function is called only from macros DECODE_FILE and
7299 ENCODE_FILE, thus we ignore character composition. */
7300
7301 Lisp_Object
7302 code_convert_string_norecord (string, coding_system, encodep)
7303 Lisp_Object string, coding_system;
7304 int encodep;
7305 {
7306 struct coding_system coding;
7307
7308 CHECK_STRING (string);
7309 CHECK_SYMBOL (coding_system);
7310
7311 if (NILP (coding_system))
7312 return string;
7313
7314 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
7315 error ("Invalid coding system: %s", SDATA (SYMBOL_NAME (coding_system)));
7316
7317 coding.composing = COMPOSITION_DISABLED;
7318 coding.mode |= CODING_MODE_LAST_BLOCK;
7319 return (encodep
7320 ? encode_coding_string (string, &coding, 1)
7321 : decode_coding_string (string, &coding, 1));
7322 }
7323 \f
7324 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
7325 doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
7326 Return the corresponding character. */)
7327 (code)
7328 Lisp_Object code;
7329 {
7330 unsigned char c1, c2, s1, s2;
7331 Lisp_Object val;
7332
7333 CHECK_NUMBER (code);
7334 s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
7335 if (s1 == 0)
7336 {
7337 if (s2 < 0x80)
7338 XSETFASTINT (val, s2);
7339 else if (s2 >= 0xA0 || s2 <= 0xDF)
7340 XSETFASTINT (val, MAKE_CHAR (charset_katakana_jisx0201, s2, 0));
7341 else
7342 error ("Invalid Shift JIS code: %x", XFASTINT (code));
7343 }
7344 else
7345 {
7346 if ((s1 < 0x80 || (s1 > 0x9F && s1 < 0xE0) || s1 > 0xEF)
7347 || (s2 < 0x40 || s2 == 0x7F || s2 > 0xFC))
7348 error ("Invalid Shift JIS code: %x", XFASTINT (code));
7349 DECODE_SJIS (s1, s2, c1, c2);
7350 XSETFASTINT (val, MAKE_CHAR (charset_jisx0208, c1, c2));
7351 }
7352 return val;
7353 }
7354
7355 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
7356 doc: /* Encode a Japanese character CH to shift_jis encoding.
7357 Return the corresponding code in SJIS. */)
7358 (ch)
7359 Lisp_Object ch;
7360 {
7361 int charset, c1, c2, s1, s2;
7362 Lisp_Object val;
7363
7364 CHECK_NUMBER (ch);
7365 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
7366 if (charset == CHARSET_ASCII)
7367 {
7368 val = ch;
7369 }
7370 else if (charset == charset_jisx0208
7371 && c1 > 0x20 && c1 < 0x7F && c2 > 0x20 && c2 < 0x7F)
7372 {
7373 ENCODE_SJIS (c1, c2, s1, s2);
7374 XSETFASTINT (val, (s1 << 8) | s2);
7375 }
7376 else if (charset == charset_katakana_jisx0201
7377 && c1 > 0x20 && c2 < 0xE0)
7378 {
7379 XSETFASTINT (val, c1 | 0x80);
7380 }
7381 else
7382 error ("Can't encode to shift_jis: %d", XFASTINT (ch));
7383 return val;
7384 }
7385
7386 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
7387 doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
7388 Return the corresponding character. */)
7389 (code)
7390 Lisp_Object code;
7391 {
7392 int charset;
7393 unsigned char b1, b2, c1, c2;
7394 Lisp_Object val;
7395
7396 CHECK_NUMBER (code);
7397 b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF;
7398 if (b1 == 0)
7399 {
7400 if (b2 >= 0x80)
7401 error ("Invalid BIG5 code: %x", XFASTINT (code));
7402 val = code;
7403 }
7404 else
7405 {
7406 if ((b1 < 0xA1 || b1 > 0xFE)
7407 || (b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE))
7408 error ("Invalid BIG5 code: %x", XFASTINT (code));
7409 DECODE_BIG5 (b1, b2, charset, c1, c2);
7410 XSETFASTINT (val, MAKE_CHAR (charset, c1, c2));
7411 }
7412 return val;
7413 }
7414
7415 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
7416 doc: /* Encode the Big5 character CH to BIG5 coding system.
7417 Return the corresponding character code in Big5. */)
7418 (ch)
7419 Lisp_Object ch;
7420 {
7421 int charset, c1, c2, b1, b2;
7422 Lisp_Object val;
7423
7424 CHECK_NUMBER (ch);
7425 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
7426 if (charset == CHARSET_ASCII)
7427 {
7428 val = ch;
7429 }
7430 else if ((charset == charset_big5_1
7431 && (XFASTINT (ch) >= 0x250a1 && XFASTINT (ch) <= 0x271ec))
7432 || (charset == charset_big5_2
7433 && XFASTINT (ch) >= 0x290a1 && XFASTINT (ch) <= 0x2bdb2))
7434 {
7435 ENCODE_BIG5 (charset, c1, c2, b1, b2);
7436 XSETFASTINT (val, (b1 << 8) | b2);
7437 }
7438 else
7439 error ("Can't encode to Big5: %d", XFASTINT (ch));
7440 return val;
7441 }
7442 \f
7443 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
7444 Sset_terminal_coding_system_internal, 1, 2, 0,
7445 doc: /* Internal use only. */)
7446 (coding_system, terminal)
7447 Lisp_Object coding_system;
7448 Lisp_Object terminal;
7449 {
7450 struct coding_system *terminal_coding = TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1));
7451 CHECK_SYMBOL (coding_system);
7452 setup_coding_system (Fcheck_coding_system (coding_system), terminal_coding);
7453 /* We had better not send unsafe characters to terminal. */
7454 terminal_coding->mode |= CODING_MODE_INHIBIT_UNENCODABLE_CHAR;
7455 /* Character composition should be disabled. */
7456 terminal_coding->composing = COMPOSITION_DISABLED;
7457 /* Error notification should be suppressed. */
7458 terminal_coding->suppress_error = 1;
7459 terminal_coding->src_multibyte = 1;
7460 terminal_coding->dst_multibyte = 0;
7461 return Qnil;
7462 }
7463
7464 DEFUN ("set-safe-terminal-coding-system-internal", Fset_safe_terminal_coding_system_internal,
7465 Sset_safe_terminal_coding_system_internal, 1, 1, 0,
7466 doc: /* Internal use only. */)
7467 (coding_system)
7468 Lisp_Object coding_system;
7469 {
7470 CHECK_SYMBOL (coding_system);
7471 setup_coding_system (Fcheck_coding_system (coding_system),
7472 &safe_terminal_coding);
7473 /* Character composition should be disabled. */
7474 safe_terminal_coding.composing = COMPOSITION_DISABLED;
7475 /* Error notification should be suppressed. */
7476 safe_terminal_coding.suppress_error = 1;
7477 safe_terminal_coding.src_multibyte = 1;
7478 safe_terminal_coding.dst_multibyte = 0;
7479 return Qnil;
7480 }
7481
7482 DEFUN ("terminal-coding-system", Fterminal_coding_system,
7483 Sterminal_coding_system, 0, 1, 0,
7484 doc: /* Return coding system specified for terminal output on the given terminal.
7485 TERMINAL may be a terminal id, a frame, or nil for the selected
7486 frame's terminal device. */)
7487 (terminal)
7488 Lisp_Object terminal;
7489 {
7490 return TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1))->symbol;
7491 }
7492
7493 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
7494 Sset_keyboard_coding_system_internal, 1, 2, 0,
7495 doc: /* Internal use only. */)
7496 (coding_system, terminal)
7497 Lisp_Object coding_system;
7498 Lisp_Object terminal;
7499 {
7500 struct terminal *t = get_terminal (terminal, 1);
7501 CHECK_SYMBOL (coding_system);
7502
7503 setup_coding_system (Fcheck_coding_system (coding_system),
7504 TERMINAL_KEYBOARD_CODING (t));
7505 /* Character composition should be disabled. */
7506 TERMINAL_KEYBOARD_CODING (t)->composing = COMPOSITION_DISABLED;
7507 return Qnil;
7508 }
7509
7510 DEFUN ("keyboard-coding-system", Fkeyboard_coding_system,
7511 Skeyboard_coding_system, 0, 1, 0,
7512 doc: /* Return coding system for decoding keyboard input on TERMINAL.
7513 TERMINAL may be a terminal id, a frame, or nil for the selected
7514 frame's terminal device. */)
7515 (terminal)
7516 Lisp_Object terminal;
7517 {
7518 return TERMINAL_KEYBOARD_CODING (get_terminal (terminal, 1))->symbol;
7519 }
7520
7521 \f
7522 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
7523 Sfind_operation_coding_system, 1, MANY, 0,
7524 doc: /* Choose a coding system for an operation based on the target name.
7525 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
7526 DECODING-SYSTEM is the coding system to use for decoding
7527 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
7528 for encoding (in case OPERATION does encoding).
7529
7530 The first argument OPERATION specifies an I/O primitive:
7531 For file I/O, `insert-file-contents' or `write-region'.
7532 For process I/O, `call-process', `call-process-region', or `start-process'.
7533 For network I/O, `open-network-stream'.
7534
7535 The remaining arguments should be the same arguments that were passed
7536 to the primitive. Depending on which primitive, one of those arguments
7537 is selected as the TARGET. For example, if OPERATION does file I/O,
7538 whichever argument specifies the file name is TARGET.
7539
7540 TARGET has a meaning which depends on OPERATION:
7541 For file I/O, TARGET is a file name (except for the special case below).
7542 For process I/O, TARGET is a process name.
7543 For network I/O, TARGET is a service name or a port number
7544
7545 This function looks up what specified for TARGET in,
7546 `file-coding-system-alist', `process-coding-system-alist',
7547 or `network-coding-system-alist' depending on OPERATION.
7548 They may specify a coding system, a cons of coding systems,
7549 or a function symbol to call.
7550 In the last case, we call the function with one argument,
7551 which is a list of all the arguments given to this function.
7552 If the function can't decide a coding system, it can return
7553 `undecided' so that the normal code-detection is performed.
7554
7555 If OPERATION is `insert-file-contents', the argument corresponding to
7556 TARGET may be a cons (FILENAME . BUFFER). In that case, FILENAME is a
7557 file name to look up, and BUFFER is a buffer that contains the file's
7558 contents (not yet decoded). If `file-coding-system-alist' specifies a
7559 function to call for FILENAME, that function should examine the
7560 contents of BUFFER instead of reading the file.
7561
7562 usage: (find-operation-coding-system OPERATION ARGUMENTS...) */)
7563 (nargs, args)
7564 int nargs;
7565 Lisp_Object *args;
7566 {
7567 Lisp_Object operation, target_idx, target, val;
7568 register Lisp_Object chain;
7569
7570 if (nargs < 2)
7571 error ("Too few arguments");
7572 operation = args[0];
7573 if (!SYMBOLP (operation)
7574 || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
7575 error ("Invalid first argument");
7576 if (nargs < 1 + XINT (target_idx))
7577 error ("Too few arguments for operation: %s",
7578 SDATA (SYMBOL_NAME (operation)));
7579 /* For write-region, if the 6th argument (i.e. VISIT, the 5th
7580 argument to write-region) is string, it must be treated as a
7581 target file name. */
7582 if (EQ (operation, Qwrite_region)
7583 && nargs > 5
7584 && STRINGP (args[5]))
7585 target_idx = make_number (4);
7586 target = args[XINT (target_idx) + 1];
7587 if (!(STRINGP (target)
7588 || (EQ (operation, Qinsert_file_contents) && CONSP (target)
7589 && STRINGP (XCAR (target)) && BUFFERP (XCDR (target)))
7590 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
7591 error ("Invalid argument %d", XINT (target_idx) + 1);
7592 if (CONSP (target))
7593 target = XCAR (target);
7594
7595 chain = ((EQ (operation, Qinsert_file_contents)
7596 || EQ (operation, Qwrite_region))
7597 ? Vfile_coding_system_alist
7598 : (EQ (operation, Qopen_network_stream)
7599 ? Vnetwork_coding_system_alist
7600 : Vprocess_coding_system_alist));
7601 if (NILP (chain))
7602 return Qnil;
7603
7604 for (; CONSP (chain); chain = XCDR (chain))
7605 {
7606 Lisp_Object elt;
7607 elt = XCAR (chain);
7608
7609 if (CONSP (elt)
7610 && ((STRINGP (target)
7611 && STRINGP (XCAR (elt))
7612 && fast_string_match (XCAR (elt), target) >= 0)
7613 || (INTEGERP (target) && EQ (target, XCAR (elt)))))
7614 {
7615 val = XCDR (elt);
7616 /* Here, if VAL is both a valid coding system and a valid
7617 function symbol, we return VAL as a coding system. */
7618 if (CONSP (val))
7619 return val;
7620 if (! SYMBOLP (val))
7621 return Qnil;
7622 if (! NILP (Fcoding_system_p (val)))
7623 return Fcons (val, val);
7624 if (! NILP (Ffboundp (val)))
7625 {
7626 /* We use call1 rather than safe_call1
7627 so as to get bug reports about functions called here
7628 which don't handle the current interface. */
7629 val = call1 (val, Flist (nargs, args));
7630 if (CONSP (val))
7631 return val;
7632 if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
7633 return Fcons (val, val);
7634 }
7635 return Qnil;
7636 }
7637 }
7638 return Qnil;
7639 }
7640
7641 DEFUN ("update-coding-systems-internal", Fupdate_coding_systems_internal,
7642 Supdate_coding_systems_internal, 0, 0, 0,
7643 doc: /* Update internal database for ISO2022 and CCL based coding systems.
7644 When values of any coding categories are changed, you must
7645 call this function. */)
7646 ()
7647 {
7648 int i;
7649
7650 for (i = CODING_CATEGORY_IDX_EMACS_MULE; i < CODING_CATEGORY_IDX_MAX; i++)
7651 {
7652 Lisp_Object val;
7653
7654 val = find_symbol_value (XVECTOR (Vcoding_category_table)->contents[i]);
7655 if (!NILP (val))
7656 {
7657 if (! coding_system_table[i])
7658 coding_system_table[i] = ((struct coding_system *)
7659 xmalloc (sizeof (struct coding_system)));
7660 setup_coding_system (val, coding_system_table[i]);
7661 }
7662 else if (coding_system_table[i])
7663 {
7664 xfree (coding_system_table[i]);
7665 coding_system_table[i] = NULL;
7666 }
7667 }
7668
7669 return Qnil;
7670 }
7671
7672 DEFUN ("set-coding-priority-internal", Fset_coding_priority_internal,
7673 Sset_coding_priority_internal, 0, 0, 0,
7674 doc: /* Update internal database for the current value of `coding-category-list'.
7675 This function is internal use only. */)
7676 ()
7677 {
7678 int i = 0, idx;
7679 Lisp_Object val;
7680
7681 val = Vcoding_category_list;
7682
7683 while (CONSP (val) && i < CODING_CATEGORY_IDX_MAX)
7684 {
7685 if (! SYMBOLP (XCAR (val)))
7686 break;
7687 idx = XFASTINT (Fget (XCAR (val), Qcoding_category_index));
7688 if (idx >= CODING_CATEGORY_IDX_MAX)
7689 break;
7690 coding_priorities[i++] = (1 << idx);
7691 val = XCDR (val);
7692 }
7693 /* If coding-category-list is valid and contains all coding
7694 categories, `i' should be CODING_CATEGORY_IDX_MAX now. If not,
7695 the following code saves Emacs from crashing. */
7696 while (i < CODING_CATEGORY_IDX_MAX)
7697 coding_priorities[i++] = CODING_CATEGORY_MASK_RAW_TEXT;
7698
7699 return Qnil;
7700 }
7701
7702 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
7703 Sdefine_coding_system_internal, 1, 1, 0,
7704 doc: /* Register CODING-SYSTEM as a base coding system.
7705 This function is internal use only. */)
7706 (coding_system)
7707 Lisp_Object coding_system;
7708 {
7709 Lisp_Object safe_chars, slot;
7710
7711 if (NILP (Fcheck_coding_system (coding_system)))
7712 xsignal1 (Qcoding_system_error, coding_system);
7713
7714 safe_chars = coding_safe_chars (coding_system);
7715 if (! EQ (safe_chars, Qt) && ! CHAR_TABLE_P (safe_chars))
7716 error ("No valid safe-chars property for %s",
7717 SDATA (SYMBOL_NAME (coding_system)));
7718
7719 if (EQ (safe_chars, Qt))
7720 {
7721 if (NILP (Fmemq (coding_system, XCAR (Vcoding_system_safe_chars))))
7722 XSETCAR (Vcoding_system_safe_chars,
7723 Fcons (coding_system, XCAR (Vcoding_system_safe_chars)));
7724 }
7725 else
7726 {
7727 slot = Fassq (coding_system, XCDR (Vcoding_system_safe_chars));
7728 if (NILP (slot))
7729 XSETCDR (Vcoding_system_safe_chars,
7730 nconc2 (XCDR (Vcoding_system_safe_chars),
7731 Fcons (Fcons (coding_system, safe_chars), Qnil)));
7732 else
7733 XSETCDR (slot, safe_chars);
7734 }
7735 return Qnil;
7736 }
7737
7738 #endif /* emacs */
7739
7740 \f
7741 /*** 9. Post-amble ***/
7742
7743 void
7744 init_coding_once ()
7745 {
7746 int i;
7747
7748 /* Emacs' internal format specific initialize routine. */
7749 for (i = 0; i <= 0x20; i++)
7750 emacs_code_class[i] = EMACS_control_code;
7751 emacs_code_class[0x0A] = EMACS_linefeed_code;
7752 emacs_code_class[0x0D] = EMACS_carriage_return_code;
7753 for (i = 0x21 ; i < 0x7F; i++)
7754 emacs_code_class[i] = EMACS_ascii_code;
7755 emacs_code_class[0x7F] = EMACS_control_code;
7756 for (i = 0x80; i < 0xFF; i++)
7757 emacs_code_class[i] = EMACS_invalid_code;
7758 emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
7759 emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
7760 emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
7761 emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;
7762
7763 /* ISO2022 specific initialize routine. */
7764 for (i = 0; i < 0x20; i++)
7765 iso_code_class[i] = ISO_control_0;
7766 for (i = 0x21; i < 0x7F; i++)
7767 iso_code_class[i] = ISO_graphic_plane_0;
7768 for (i = 0x80; i < 0xA0; i++)
7769 iso_code_class[i] = ISO_control_1;
7770 for (i = 0xA1; i < 0xFF; i++)
7771 iso_code_class[i] = ISO_graphic_plane_1;
7772 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
7773 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
7774 iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
7775 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
7776 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
7777 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
7778 iso_code_class[ISO_CODE_ESC] = ISO_escape;
7779 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
7780 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
7781 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
7782
7783 setup_coding_system (Qnil, &safe_terminal_coding);
7784 setup_coding_system (Qnil, &default_buffer_file_coding);
7785
7786 bzero (coding_system_table, sizeof coding_system_table);
7787
7788 bzero (ascii_skip_code, sizeof ascii_skip_code);
7789 for (i = 0; i < 128; i++)
7790 ascii_skip_code[i] = 1;
7791
7792 #if defined (MSDOS) || defined (WINDOWSNT)
7793 system_eol_type = CODING_EOL_CRLF;
7794 #else
7795 system_eol_type = CODING_EOL_LF;
7796 #endif
7797
7798 inhibit_pre_post_conversion = 0;
7799 }
7800
7801 #ifdef emacs
7802
7803 void
7804 syms_of_coding ()
7805 {
7806 staticpro (&Vcode_conversion_workbuf_name);
7807 Vcode_conversion_workbuf_name = build_string (" *code-conversion-work*");
7808
7809 Qtarget_idx = intern ("target-idx");
7810 staticpro (&Qtarget_idx);
7811
7812 Qcoding_system_history = intern ("coding-system-history");
7813 staticpro (&Qcoding_system_history);
7814 Fset (Qcoding_system_history, Qnil);
7815
7816 /* Target FILENAME is the first argument. */
7817 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
7818 /* Target FILENAME is the third argument. */
7819 Fput (Qwrite_region, Qtarget_idx, make_number (2));
7820
7821 Qcall_process = intern ("call-process");
7822 staticpro (&Qcall_process);
7823 /* Target PROGRAM is the first argument. */
7824 Fput (Qcall_process, Qtarget_idx, make_number (0));
7825
7826 Qcall_process_region = intern ("call-process-region");
7827 staticpro (&Qcall_process_region);
7828 /* Target PROGRAM is the third argument. */
7829 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
7830
7831 Qstart_process = intern ("start-process");
7832 staticpro (&Qstart_process);
7833 /* Target PROGRAM is the third argument. */
7834 Fput (Qstart_process, Qtarget_idx, make_number (2));
7835
7836 Qopen_network_stream = intern ("open-network-stream");
7837 staticpro (&Qopen_network_stream);
7838 /* Target SERVICE is the fourth argument. */
7839 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
7840
7841 Qcoding_system = intern ("coding-system");
7842 staticpro (&Qcoding_system);
7843
7844 Qeol_type = intern ("eol-type");
7845 staticpro (&Qeol_type);
7846
7847 Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
7848 staticpro (&Qbuffer_file_coding_system);
7849
7850 Qpost_read_conversion = intern ("post-read-conversion");
7851 staticpro (&Qpost_read_conversion);
7852
7853 Qpre_write_conversion = intern ("pre-write-conversion");
7854 staticpro (&Qpre_write_conversion);
7855
7856 Qno_conversion = intern ("no-conversion");
7857 staticpro (&Qno_conversion);
7858
7859 Qundecided = intern ("undecided");
7860 staticpro (&Qundecided);
7861
7862 Qcoding_system_p = intern ("coding-system-p");
7863 staticpro (&Qcoding_system_p);
7864
7865 Qcoding_system_error = intern ("coding-system-error");
7866 staticpro (&Qcoding_system_error);
7867
7868 Fput (Qcoding_system_error, Qerror_conditions,
7869 Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
7870 Fput (Qcoding_system_error, Qerror_message,
7871 build_string ("Invalid coding system"));
7872
7873 Qcoding_category = intern ("coding-category");
7874 staticpro (&Qcoding_category);
7875 Qcoding_category_index = intern ("coding-category-index");
7876 staticpro (&Qcoding_category_index);
7877
7878 Vcoding_category_table
7879 = Fmake_vector (make_number (CODING_CATEGORY_IDX_MAX), Qnil);
7880 staticpro (&Vcoding_category_table);
7881 {
7882 int i;
7883 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
7884 {
7885 XVECTOR (Vcoding_category_table)->contents[i]
7886 = intern (coding_category_name[i]);
7887 Fput (XVECTOR (Vcoding_category_table)->contents[i],
7888 Qcoding_category_index, make_number (i));
7889 }
7890 }
7891
7892 Vcoding_system_safe_chars = Fcons (Qnil, Qnil);
7893 staticpro (&Vcoding_system_safe_chars);
7894
7895 Qtranslation_table = intern ("translation-table");
7896 staticpro (&Qtranslation_table);
7897 Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
7898
7899 Qtranslation_table_id = intern ("translation-table-id");
7900 staticpro (&Qtranslation_table_id);
7901
7902 Qtranslation_table_for_decode = intern ("translation-table-for-decode");
7903 staticpro (&Qtranslation_table_for_decode);
7904
7905 Qtranslation_table_for_encode = intern ("translation-table-for-encode");
7906 staticpro (&Qtranslation_table_for_encode);
7907
7908 Qsafe_chars = intern ("safe-chars");
7909 staticpro (&Qsafe_chars);
7910
7911 Qchar_coding_system = intern ("char-coding-system");
7912 staticpro (&Qchar_coding_system);
7913
7914 /* Intern this now in case it isn't already done.
7915 Setting this variable twice is harmless.
7916 But don't staticpro it here--that is done in alloc.c. */
7917 Qchar_table_extra_slots = intern ("char-table-extra-slots");
7918 Fput (Qsafe_chars, Qchar_table_extra_slots, make_number (0));
7919 Fput (Qchar_coding_system, Qchar_table_extra_slots, make_number (0));
7920
7921 Qvalid_codes = intern ("valid-codes");
7922 staticpro (&Qvalid_codes);
7923
7924 Qascii_incompatible = intern ("ascii-incompatible");
7925 staticpro (&Qascii_incompatible);
7926
7927 Qemacs_mule = intern ("emacs-mule");
7928 staticpro (&Qemacs_mule);
7929
7930 Qraw_text = intern ("raw-text");
7931 staticpro (&Qraw_text);
7932
7933 Qutf_8 = intern ("utf-8");
7934 staticpro (&Qutf_8);
7935
7936 Qcoding_system_define_form = intern ("coding-system-define-form");
7937 staticpro (&Qcoding_system_define_form);
7938
7939 defsubr (&Scoding_system_p);
7940 defsubr (&Sread_coding_system);
7941 defsubr (&Sread_non_nil_coding_system);
7942 defsubr (&Scheck_coding_system);
7943 defsubr (&Sdetect_coding_region);
7944 defsubr (&Sdetect_coding_string);
7945 defsubr (&Sfind_coding_systems_region_internal);
7946 defsubr (&Sunencodable_char_position);
7947 defsubr (&Sdecode_coding_region);
7948 defsubr (&Sencode_coding_region);
7949 defsubr (&Sdecode_coding_string);
7950 defsubr (&Sencode_coding_string);
7951 defsubr (&Sdecode_sjis_char);
7952 defsubr (&Sencode_sjis_char);
7953 defsubr (&Sdecode_big5_char);
7954 defsubr (&Sencode_big5_char);
7955 defsubr (&Sset_terminal_coding_system_internal);
7956 defsubr (&Sset_safe_terminal_coding_system_internal);
7957 defsubr (&Sterminal_coding_system);
7958 defsubr (&Sset_keyboard_coding_system_internal);
7959 defsubr (&Skeyboard_coding_system);
7960 defsubr (&Sfind_operation_coding_system);
7961 defsubr (&Supdate_coding_systems_internal);
7962 defsubr (&Sset_coding_priority_internal);
7963 defsubr (&Sdefine_coding_system_internal);
7964
7965 DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
7966 doc: /* List of coding systems.
7967
7968 Do not alter the value of this variable manually. This variable should be
7969 updated by the functions `make-coding-system' and
7970 `define-coding-system-alias'. */);
7971 Vcoding_system_list = Qnil;
7972
7973 DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
7974 doc: /* Alist of coding system names.
7975 Each element is one element list of coding system name.
7976 This variable is given to `completing-read' as TABLE argument.
7977
7978 Do not alter the value of this variable manually. This variable should be
7979 updated by the functions `make-coding-system' and
7980 `define-coding-system-alias'. */);
7981 Vcoding_system_alist = Qnil;
7982
7983 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
7984 doc: /* List of coding-categories (symbols) ordered by priority.
7985
7986 On detecting a coding system, Emacs tries code detection algorithms
7987 associated with each coding-category one by one in this order. When
7988 one algorithm agrees with a byte sequence of source text, the coding
7989 system bound to the corresponding coding-category is selected.
7990
7991 Don't modify this variable directly, but use `set-coding-priority'. */);
7992 {
7993 int i;
7994
7995 Vcoding_category_list = Qnil;
7996 for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
7997 Vcoding_category_list
7998 = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
7999 Vcoding_category_list);
8000 }
8001
8002 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
8003 doc: /* Specify the coding system for read operations.
8004 It is useful to bind this variable with `let', but do not set it globally.
8005 If the value is a coding system, it is used for decoding on read operation.
8006 If not, an appropriate element is used from one of the coding system alists:
8007 There are three such tables, `file-coding-system-alist',
8008 `process-coding-system-alist', and `network-coding-system-alist'. */);
8009 Vcoding_system_for_read = Qnil;
8010
8011 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
8012 doc: /* Specify the coding system for write operations.
8013 Programs bind this variable with `let', but you should not set it globally.
8014 If the value is a coding system, it is used for encoding of output,
8015 when writing it to a file and when sending it to a file or subprocess.
8016
8017 If this does not specify a coding system, an appropriate element
8018 is used from one of the coding system alists:
8019 There are three such tables, `file-coding-system-alist',
8020 `process-coding-system-alist', and `network-coding-system-alist'.
8021 For output to files, if the above procedure does not specify a coding system,
8022 the value of `buffer-file-coding-system' is used. */);
8023 Vcoding_system_for_write = Qnil;
8024
8025 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
8026 doc: /* Coding system used in the latest file or process I/O.
8027 Also set by `encode-coding-region', `decode-coding-region',
8028 `encode-coding-string' and `decode-coding-string'. */);
8029 Vlast_coding_system_used = Qnil;
8030
8031 DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
8032 doc: /* *Non-nil means always inhibit code conversion of end-of-line format.
8033 See info node `Coding Systems' and info node `Text and Binary' concerning
8034 such conversion. */);
8035 inhibit_eol_conversion = 0;
8036
8037 DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
8038 doc: /* Non-nil means process buffer inherits coding system of process output.
8039 Bind it to t if the process output is to be treated as if it were a file
8040 read from some filesystem. */);
8041 inherit_process_coding_system = 0;
8042
8043 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
8044 doc: /* Alist to decide a coding system to use for a file I/O operation.
8045 The format is ((PATTERN . VAL) ...),
8046 where PATTERN is a regular expression matching a file name,
8047 VAL is a coding system, a cons of coding systems, or a function symbol.
8048 If VAL is a coding system, it is used for both decoding and encoding
8049 the file contents.
8050 If VAL is a cons of coding systems, the car part is used for decoding,
8051 and the cdr part is used for encoding.
8052 If VAL is a function symbol, the function must return a coding system
8053 or a cons of coding systems which are used as above. The function is
8054 called with an argument that is a list of the arguments with which
8055 `find-operation-coding-system' was called. If the function can't decide
8056 a coding system, it can return `undecided' so that the normal
8057 code-detection is performed.
8058
8059 See also the function `find-operation-coding-system'
8060 and the variable `auto-coding-alist'. */);
8061 Vfile_coding_system_alist = Qnil;
8062
8063 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
8064 doc: /* Alist to decide a coding system to use for a process I/O operation.
8065 The format is ((PATTERN . VAL) ...),
8066 where PATTERN is a regular expression matching a program name,
8067 VAL is a coding system, a cons of coding systems, or a function symbol.
8068 If VAL is a coding system, it is used for both decoding what received
8069 from the program and encoding what sent to the program.
8070 If VAL is a cons of coding systems, the car part is used for decoding,
8071 and the cdr part is used for encoding.
8072 If VAL is a function symbol, the function must return a coding system
8073 or a cons of coding systems which are used as above.
8074
8075 See also the function `find-operation-coding-system'. */);
8076 Vprocess_coding_system_alist = Qnil;
8077
8078 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
8079 doc: /* Alist to decide a coding system to use for a network I/O operation.
8080 The format is ((PATTERN . VAL) ...),
8081 where PATTERN is a regular expression matching a network service name
8082 or is a port number to connect to,
8083 VAL is a coding system, a cons of coding systems, or a function symbol.
8084 If VAL is a coding system, it is used for both decoding what received
8085 from the network stream and encoding what sent to the network stream.
8086 If VAL is a cons of coding systems, the car part is used for decoding,
8087 and the cdr part is used for encoding.
8088 If VAL is a function symbol, the function must return a coding system
8089 or a cons of coding systems which are used as above.
8090
8091 See also the function `find-operation-coding-system'. */);
8092 Vnetwork_coding_system_alist = Qnil;
8093
8094 DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
8095 doc: /* Coding system to use with system messages.
8096 Also used for decoding keyboard input on X Window system. */);
8097 Vlocale_coding_system = Qnil;
8098
8099 /* The eol mnemonics are reset in startup.el system-dependently. */
8100 DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
8101 doc: /* *String displayed in mode line for UNIX-like (LF) end-of-line format. */);
8102 eol_mnemonic_unix = build_string (":");
8103
8104 DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
8105 doc: /* *String displayed in mode line for DOS-like (CRLF) end-of-line format. */);
8106 eol_mnemonic_dos = build_string ("\\");
8107
8108 DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
8109 doc: /* *String displayed in mode line for MAC-like (CR) end-of-line format. */);
8110 eol_mnemonic_mac = build_string ("/");
8111
8112 DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
8113 doc: /* *String displayed in mode line when end-of-line format is not yet determined. */);
8114 eol_mnemonic_undecided = build_string (":");
8115
8116 DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
8117 doc: /* *Non-nil enables character translation while encoding and decoding. */);
8118 Venable_character_translation = Qt;
8119
8120 DEFVAR_LISP ("standard-translation-table-for-decode",
8121 &Vstandard_translation_table_for_decode,
8122 doc: /* Table for translating characters while decoding. */);
8123 Vstandard_translation_table_for_decode = Qnil;
8124
8125 DEFVAR_LISP ("standard-translation-table-for-encode",
8126 &Vstandard_translation_table_for_encode,
8127 doc: /* Table for translating characters while encoding. */);
8128 Vstandard_translation_table_for_encode = Qnil;
8129
8130 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
8131 doc: /* Alist of charsets vs revision numbers.
8132 While encoding, if a charset (car part of an element) is found,
8133 designate it with the escape sequence identifying revision (cdr part of the element). */);
8134 Vcharset_revision_alist = Qnil;
8135
8136 DEFVAR_LISP ("default-process-coding-system",
8137 &Vdefault_process_coding_system,
8138 doc: /* Cons of coding systems used for process I/O by default.
8139 The car part is used for decoding a process output,
8140 the cdr part is used for encoding a text to be sent to a process. */);
8141 Vdefault_process_coding_system = Qnil;
8142
8143 DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
8144 doc: /* Table of extra Latin codes in the range 128..159 (inclusive).
8145 This is a vector of length 256.
8146 If Nth element is non-nil, the existence of code N in a file
8147 \(or output of subprocess) doesn't prevent it to be detected as
8148 a coding system of ISO 2022 variant which has a flag
8149 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
8150 or reading output of a subprocess.
8151 Only 128th through 159th elements has a meaning. */);
8152 Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
8153
8154 DEFVAR_LISP ("select-safe-coding-system-function",
8155 &Vselect_safe_coding_system_function,
8156 doc: /* Function to call to select safe coding system for encoding a text.
8157
8158 If set, this function is called to force a user to select a proper
8159 coding system which can encode the text in the case that a default
8160 coding system used in each operation can't encode the text. The
8161 function should take care that the buffer is not modified while
8162 the coding system is being selected.
8163
8164 The default value is `select-safe-coding-system' (which see). */);
8165 Vselect_safe_coding_system_function = Qnil;
8166
8167 DEFVAR_BOOL ("coding-system-require-warning",
8168 &coding_system_require_warning,
8169 doc: /* Internal use only.
8170 If non-nil, on writing a file, `select-safe-coding-system-function' is
8171 called even if `coding-system-for-write' is non-nil. The command
8172 `universal-coding-system-argument' binds this variable to t temporarily. */);
8173 coding_system_require_warning = 0;
8174
8175
8176 DEFVAR_BOOL ("inhibit-iso-escape-detection",
8177 &inhibit_iso_escape_detection,
8178 doc: /* If non-nil, Emacs ignores ISO2022's escape sequence on code detection.
8179
8180 By default, on reading a file, Emacs tries to detect how the text is
8181 encoded. This code detection is sensitive to escape sequences. If
8182 the sequence is valid as ISO2022, the code is determined as one of
8183 the ISO2022 encodings, and the file is decoded by the corresponding
8184 coding system (e.g. `iso-2022-7bit').
8185
8186 However, there may be a case that you want to read escape sequences in
8187 a file as is. In such a case, you can set this variable to non-nil.
8188 Then, as the code detection ignores any escape sequences, no file is
8189 detected as encoded in some ISO2022 encoding. The result is that all
8190 escape sequences become visible in a buffer.
8191
8192 The default value is nil, and it is strongly recommended not to change
8193 it. That is because many Emacs Lisp source files that contain
8194 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
8195 in Emacs's distribution, and they won't be decoded correctly on
8196 reading if you suppress escape sequence detection.
8197
8198 The other way to read escape sequences in a file without decoding is
8199 to explicitly specify some coding system that doesn't use ISO2022's
8200 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument]. */);
8201 inhibit_iso_escape_detection = 0;
8202
8203 DEFVAR_LISP ("translation-table-for-input", &Vtranslation_table_for_input,
8204 doc: /* Char table for translating self-inserting characters.
8205 This is applied to the result of input methods, not their input. See also
8206 `keyboard-translate-table'. */);
8207 Vtranslation_table_for_input = Qnil;
8208 }
8209
8210 char *
8211 emacs_strerror (error_number)
8212 int error_number;
8213 {
8214 char *str;
8215
8216 synchronize_system_messages_locale ();
8217 str = strerror (error_number);
8218
8219 if (! NILP (Vlocale_coding_system))
8220 {
8221 Lisp_Object dec = code_convert_string_norecord (build_string (str),
8222 Vlocale_coding_system,
8223 0);
8224 str = (char *) SDATA (dec);
8225 }
8226
8227 return str;
8228 }
8229
8230 #endif /* emacs */
8231
8232 /* arch-tag: 3a3a2b01-5ff6-4071-9afe-f5b808d9229d
8233 (do not change this comment) */