(decode_coding_emacs_mule, decode_coding_iso2022)
[bpt/emacs.git] / src / coding.c
1 /* Coding system handler (conversion, detection, and etc).
2 Copyright (C) 1995, 1997, 1998, 2002 Electrotechnical Laboratory, JAPAN.
3 Licensed to the Free Software Foundation.
4 Copyright (C) 2001,2002 Free Software Foundation, Inc.
5
6 This file is part of GNU Emacs.
7
8 GNU Emacs is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 2, or (at your option)
11 any later version.
12
13 GNU Emacs is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GNU Emacs; see the file COPYING. If not, write to
20 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
21 Boston, MA 02111-1307, USA. */
22
23 /*** TABLE OF CONTENTS ***
24
25 0. General comments
26 1. Preamble
27 2. Emacs' internal format (emacs-mule) handlers
28 3. ISO2022 handlers
29 4. Shift-JIS and BIG5 handlers
30 5. CCL handlers
31 6. End-of-line handlers
32 7. C library functions
33 8. Emacs Lisp library functions
34 9. Post-amble
35
36 */
37
38 /*** 0. General comments ***/
39
40
41 /*** GENERAL NOTE on CODING SYSTEMS ***
42
43 A coding system is an encoding mechanism for one or more character
44 sets. Here's a list of coding systems which Emacs can handle. When
45 we say "decode", it means converting some other coding system to
46 Emacs' internal format (emacs-mule), and when we say "encode",
47 it means converting the coding system emacs-mule to some other
48 coding system.
49
50 0. Emacs' internal format (emacs-mule)
51
52 Emacs itself holds a multi-lingual character in buffers and strings
53 in a special format. Details are described in section 2.
54
55 1. ISO2022
56
57 The most famous coding system for multiple character sets. X's
58 Compound Text, various EUCs (Extended Unix Code), and coding
59 systems used in Internet communication such as ISO-2022-JP are
60 all variants of ISO2022. Details are described in section 3.
61
62 2. SJIS (or Shift-JIS or MS-Kanji-Code)
63
64 A coding system to encode character sets: ASCII, JISX0201, and
65 JISX0208. Widely used for PC's in Japan. Details are described in
66 section 4.
67
68 3. BIG5
69
70 A coding system to encode the character sets ASCII and Big5. Widely
71 used for Chinese (mainly in Taiwan and Hong Kong). Details are
72 described in section 4. In this file, when we write "BIG5"
73 (all uppercase), we mean the coding system, and when we write
74 "Big5" (capitalized), we mean the character set.
75
76 4. Raw text
77
78 A coding system for text containing random 8-bit code. Emacs does
79 no code conversion on such text except for end-of-line format.
80
81 5. Other
82
83 If a user wants to read/write text encoded in a coding system not
84 listed above, he can supply a decoder and an encoder for it as CCL
85 (Code Conversion Language) programs. Emacs executes the CCL program
86 while reading/writing.
87
88 Emacs represents a coding system by a Lisp symbol that has a property
89 `coding-system'. But, before actually using the coding system, the
90 information about it is set in a structure of type `struct
91 coding_system' for rapid processing. See section 6 for more details.
92
93 */
94
95 /*** GENERAL NOTES on END-OF-LINE FORMAT ***
96
97 How end-of-line of text is encoded depends on the operating system.
98 For instance, Unix's format is just one byte of `line-feed' code,
99 whereas DOS's format is two-byte sequence of `carriage-return' and
100 `line-feed' codes. MacOS's format is usually one byte of
101 `carriage-return'.
102
103 Since text character encoding and end-of-line encoding are
104 independent, any coding system described above can have any
105 end-of-line format. So Emacs has information about end-of-line
106 format in each coding-system. See section 6 for more details.
107
108 */
109
110 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
111
112 These functions check if a text between SRC and SRC_END is encoded
113 in the coding system category XXX. Each returns an integer value in
114 which appropriate flag bits for the category XXX are set. The flag
115 bits are defined in macros CODING_CATEGORY_MASK_XXX. Below is the
116 template for these functions. If MULTIBYTEP is nonzero, 8-bit codes
117 of the range 0x80..0x9F are in multibyte form. */
118 #if 0
119 int
120 detect_coding_emacs_mule (src, src_end, multibytep)
121 unsigned char *src, *src_end;
122 int multibytep;
123 {
124 ...
125 }
126 #endif
127
128 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
129
130 These functions decode SRC_BYTES length of unibyte text at SOURCE
131 encoded in CODING to Emacs' internal format. The resulting
132 multibyte text goes to a place pointed to by DESTINATION, the length
133 of which should not exceed DST_BYTES.
134
135 These functions set the information about original and decoded texts
136 in the members `produced', `produced_char', `consumed', and
137 `consumed_char' of the structure *CODING. They also set the member
138 `result' to one of CODING_FINISH_XXX indicating how the decoding
139 finished.
140
141 DST_BYTES zero means that the source area and destination area are
142 overlapped, which means that we can produce a decoded text until it
143 reaches the head of the not-yet-decoded source text.
144
145 Below is a template for these functions. */
146 #if 0
147 static void
148 decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
149 struct coding_system *coding;
150 unsigned char *source, *destination;
151 int src_bytes, dst_bytes;
152 {
153 ...
154 }
155 #endif
156
157 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
158
159 These functions encode SRC_BYTES length text at SOURCE from Emacs'
160 internal multibyte format to CODING. The resulting unibyte text
161 goes to a place pointed to by DESTINATION, the length of which
162 should not exceed DST_BYTES.
163
164 These functions set the information about original and encoded texts
165 in the members `produced', `produced_char', `consumed', and
166 `consumed_char' of the structure *CODING. They also set the member
167 `result' to one of CODING_FINISH_XXX indicating how the encoding
168 finished.
169
170 DST_BYTES zero means that the source area and destination area are
171 overlapped, which means that we can produce encoded text until it
172 reaches the head of the not-yet-encoded source text.
173
174 Below is a template for these functions. */
175 #if 0
176 static void
177 encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
178 struct coding_system *coding;
179 unsigned char *source, *destination;
180 int src_bytes, dst_bytes;
181 {
182 ...
183 }
184 #endif
185
186 /*** COMMONLY USED MACROS ***/
187
/* ONE_MORE_BYTE and TWO_MORE_BYTES safely fetch one and two bytes,
   respectively, from the source text.  When the source does not hold
   enough bytes, they store CODING_FINISH_INSUFFICIENT_SRC in
   `coding->result' and jump to `label_end_of_loop'.  The caller must
   have set the variables `coding', `src' and `src_end' to the
   appropriate pointers in advance.  These macros are called from the
   decoding routines `decode_coding_XXX', so the source text is
   assumed to be unibyte.  */

#define ONE_MORE_BYTE(c1)                                       \
  do {                                                          \
    if (src < src_end)                                          \
      c1 = *src++;                                              \
    else                                                        \
      {                                                         \
        coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
        goto label_end_of_loop;                                 \
      }                                                         \
  } while (0)
205
/* Fetch the next two source bytes into C1 and C2.  If fewer than two
   bytes remain, consume nothing, store CODING_FINISH_INSUFFICIENT_SRC
   in `coding->result' and jump to `label_end_of_loop'.  */

#define TWO_MORE_BYTES(c1, c2)                                  \
  do {                                                          \
    if (src_end - src < 2)                                      \
      {                                                         \
        coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
        goto label_end_of_loop;                                 \
      }                                                         \
    c1 = src[0];                                                \
    c2 = src[1];                                                \
    src += 2;                                                   \
  } while (0)
216
217
/* Like ONE_MORE_BYTE, but 8-bit bytes of data at SRC are in multibyte
   form if MULTIBYTEP is nonzero.  In that case a byte equal to
   LEADING_CODE_8_BIT_CONTROL is followed by (8-bit code + 0x20), and
   C1 is set to the original 8-bit code.

   If the source ends right after LEADING_CODE_8_BIT_CONTROL, the
   trailing byte is not read: CODING_FINISH_INSUFFICIENT_SRC is stored
   in `coding->result' and control jumps to `label_end_of_loop', just
   as when no byte is available at all.  */

#define ONE_MORE_BYTE_CHECK_MULTIBYTE(c1, multibytep)           \
  do {                                                          \
    if (src >= src_end)                                         \
      {                                                         \
        coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
        goto label_end_of_loop;                                 \
      }                                                         \
    c1 = *src++;                                                \
    if ((multibytep) && c1 == LEADING_CODE_8_BIT_CONTROL)       \
      {                                                         \
        /* Never read the second byte past SRC_END.  */         \
        if (src >= src_end)                                     \
          {                                                     \
            coding->result = CODING_FINISH_INSUFFICIENT_SRC;    \
            goto label_end_of_loop;                             \
          }                                                     \
        c1 = *src++ - 0x20;                                     \
      }                                                         \
  } while (0)
232
/* Fetch the next character of the source text at `src' into C and
   advance `src' past it.  If no byte is left, store
   CODING_FINISH_INSUFFICIENT_SRC in `coding->result' and jump to
   `label_end_of_loop'.  The caller must have set the variables
   `coding', `src', `src_end' and `translation_table' in advance.

   This macro serves the encoding routines `encode_coding_XXX', so the
   source text is taken to be in multibyte form except for 8-bit
   characters: those are in multibyte form when coding->src_multibyte
   is nonzero, and are single bytes otherwise.

   When `translation_table' is non-nil, C is passed through
   translate_char before being handed back.  */

#define ONE_MORE_CHAR(c)                                        \
  do {                                                          \
    int len = src_end - src;                                    \
    int bytes;                                                  \
    if (len <= 0)                                               \
      {                                                         \
        coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
        goto label_end_of_loop;                                 \
      }                                                         \
    if (!coding->src_multibyte                                  \
        && !UNIBYTE_STR_AS_MULTIBYTE_P (src, len, bytes))       \
      {                                                         \
        /* A lone byte that is not a multibyte sequence.  */    \
        c = *src;                                               \
        bytes = 1;                                              \
      }                                                         \
    else                                                        \
      c = STRING_CHAR_AND_LENGTH (src, len, bytes);             \
    if (!NILP (translation_table))                              \
      c = translate_char (translation_table, c, -1, 0, 0);      \
    src += bytes;                                               \
  } while (0)
261
262
/* Write the multibyte form of character C at `dst' and count it in
   coding->produced_char.  If `dst' lacks room for it, store
   CODING_FINISH_INSUFFICIENT_DST in `coding->result' and jump to
   `label_end_of_loop'.

   While a composition sequence is being decoded, the character may
   be an ALTCHAR of the current composition.  It is written to `dst'
   only when the composition is relative or rule-based; for every
   composition other than a relative one it is also recorded in
   coding->cmp_data->data.

   This macro is used in decoding routines.  */

#define EMIT_CHAR(c)                                                    \
  do {                                                                  \
    if (! COMPOSING_P (coding)                                          \
        || coding->composing == COMPOSITION_RELATIVE                    \
        || coding->composing == COMPOSITION_WITH_RULE)                  \
      {                                                                 \
        int bytes = CHAR_BYTES (c);                                     \
        if ((dst_bytes ? dst_end : src) < (dst + bytes))                \
          {                                                             \
            coding->result = CODING_FINISH_INSUFFICIENT_DST;            \
            goto label_end_of_loop;                                     \
          }                                                             \
        dst += CHAR_STRING (c, dst);                                    \
        coding->produced_char += 1;                                     \
      }                                                                 \
                                                                        \
    if (COMPOSING_P (coding))                                           \
      {                                                                 \
        if (coding->composing != COMPOSITION_RELATIVE)                  \
          {                                                             \
            CODING_ADD_COMPOSITION_COMPONENT (coding, c);               \
            /* A rule follows unless the components are altchars.  */   \
            coding->composition_rule_follows                            \
              = (coding->composing == COMPOSITION_WITH_ALTCHARS        \
                 ? 0 : 1);                                              \
          }                                                             \
      }                                                                 \
  } while (0)
297
298
/* Store the single byte C at `dst' and advance `dst'.  The limit is
   `dst_end' when `dst_bytes' is nonzero and `src' otherwise; when it
   has been reached, store CODING_FINISH_INSUFFICIENT_DST in
   `coding->result' and jump to `label_end_of_loop'.  */

#define EMIT_ONE_BYTE(c)                                        \
  do {                                                          \
    if (dst < (dst_bytes ? dst_end : src))                      \
      *dst++ = c;                                               \
    else                                                        \
      {                                                         \
        coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
        goto label_end_of_loop;                                 \
      }                                                         \
  } while (0)
308
/* Store the bytes C1 and C2, in this order, at `dst' and advance
   `dst' by two.  If two bytes do not fit below the limit (`dst_end'
   when `dst_bytes' is nonzero, `src' otherwise), write nothing, store
   CODING_FINISH_INSUFFICIENT_DST in `coding->result' and jump to
   `label_end_of_loop'.  */

#define EMIT_TWO_BYTES(c1, c2)                                  \
  do {                                                          \
    if (dst + 2 > (dst_bytes ? dst_end : src))                  \
      {                                                         \
        coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
        goto label_end_of_loop;                                 \
      }                                                         \
    *dst++ = c1;                                                \
    *dst++ = c2;                                                \
  } while (0)
318
/* Copy the bytes in the range FROM (inclusive) .. TO (exclusive) to
   `dst', advancing both `dst' and FROM; FROM equals TO afterwards.
   If the whole range does not fit below the limit (`dst_end' when
   `dst_bytes' is nonzero, `src' otherwise), copy nothing, store
   CODING_FINISH_INSUFFICIENT_DST in `coding->result' and jump to
   `label_end_of_loop'.  */

#define EMIT_BYTES(from, to)                                    \
  do {                                                          \
    if (dst + (to - from) > (dst_bytes ? dst_end : src))        \
      {                                                         \
        coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
        goto label_end_of_loop;                                 \
      }                                                         \
    for (; from < to; from++)                                   \
      *dst++ = *from;                                           \
  } while (0)
329
330 \f
331 /*** 1. Preamble ***/
332
333 #ifdef emacs
334 #include <config.h>
335 #endif
336
337 #include <stdio.h>
338
339 #ifdef emacs
340
341 #include "lisp.h"
342 #include "buffer.h"
343 #include "charset.h"
344 #include "composite.h"
345 #include "ccl.h"
346 #include "coding.h"
347 #include "window.h"
348
349 #else /* not emacs */
350
351 #include "mulelib.h"
352
353 #endif /* not emacs */
354
355 Lisp_Object Qcoding_system, Qeol_type;
356 Lisp_Object Qbuffer_file_coding_system;
357 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
358 Lisp_Object Qno_conversion, Qundecided;
359 Lisp_Object Qcoding_system_history;
360 Lisp_Object Qsafe_chars;
361 Lisp_Object Qvalid_codes;
362
363 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
364 Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
365 Lisp_Object Qstart_process, Qopen_network_stream;
366 Lisp_Object Qtarget_idx;
367
368 Lisp_Object Vselect_safe_coding_system_function;
369
370 /* Mnemonic string for each format of end-of-line. */
371 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
372 /* Mnemonic string to indicate format of end-of-line is not yet
373 decided. */
374 Lisp_Object eol_mnemonic_undecided;
375
376 /* Format of end-of-line decided by system. This is CODING_EOL_LF on
377 Unix, CODING_EOL_CRLF on DOS/Windows, and CODING_EOL_CR on Mac. */
378 int system_eol_type;
379
#ifdef emacs

Lisp_Object Vcoding_system_list;
Lisp_Object Vcoding_system_alist;

Lisp_Object Qcoding_system_p;
Lisp_Object Qcoding_system_error;

/* The coding systems emacs-mule and raw-text convert nothing but the
   end-of-line format.  */
Lisp_Object Qemacs_mule;
Lisp_Object Qraw_text;

/* The next three variables are the channel through which coding
   systems pass between Emacs Lisp programs and the C internal
   routines.  */

/* Coding system used to read files and to receive data from a
   process.  */
Lisp_Object Vcoding_system_for_read;

/* Coding system used to write files and to send data to a
   process.  */
Lisp_Object Vcoding_system_for_write;

/* Coding system that the most recent I/O actually used.  */
Lisp_Object Vlast_coding_system_used;

/* Vector of length 256 holding information about special Latin codes
   (especially for dealing with Microsoft codes).  */
Lisp_Object Vlatin_extra_code_table;

/* Nonzero inhibits code conversion of the end-of-line format.  */
int inhibit_eol_conversion;

/* Nonzero inhibits detection of ISO2022 escape sequences.  */
int inhibit_iso_escape_detection;

/* Nonzero makes buffer-file-coding-system inherit from the process
   coding system.  */
int inherit_process_coding_system;

/* Coding system used to encode text for terminal display.  */
struct coding_system terminal_coding;

/* Coding system used to encode text for terminal display when the
   terminal coding system is nil.  */
struct coding_system safe_terminal_coding;

/* Coding system of what arrives from the terminal keyboard.  */
struct coding_system keyboard_coding;

/* Default coding system used to write a file.  */
struct coding_system default_buffer_file_coding;

Lisp_Object Vfile_coding_system_alist;
Lisp_Object Vprocess_coding_system_alist;
Lisp_Object Vnetwork_coding_system_alist;

Lisp_Object Vlocale_coding_system;

#endif /* emacs */
432
433 Lisp_Object Qcoding_category, Qcoding_category_index;
434
435 /* List of symbols `coding-category-xxx' ordered by priority. */
436 Lisp_Object Vcoding_category_list;
437
438 /* Table of coding categories (Lisp symbols). */
439 Lisp_Object Vcoding_category_table;
440
441 /* Table of names of symbol for each coding-category. */
442 char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
443 "coding-category-emacs-mule",
444 "coding-category-sjis",
445 "coding-category-iso-7",
446 "coding-category-iso-7-tight",
447 "coding-category-iso-8-1",
448 "coding-category-iso-8-2",
449 "coding-category-iso-7-else",
450 "coding-category-iso-8-else",
451 "coding-category-ccl",
452 "coding-category-big5",
453 "coding-category-utf-8",
454 "coding-category-utf-16-be",
455 "coding-category-utf-16-le",
456 "coding-category-raw-text",
457 "coding-category-binary"
458 };
459
460 /* Table of pointers to coding systems corresponding to each coding
461 categories. */
462 struct coding_system *coding_system_table[CODING_CATEGORY_IDX_MAX];
463
464 /* Table of coding category masks. Nth element is a mask for a coding
465 category of which priority is Nth. */
466 static
467 int coding_priorities[CODING_CATEGORY_IDX_MAX];
468
469 /* Flag to tell if we look up translation table on character code
470 conversion. */
471 Lisp_Object Venable_character_translation;
472 /* Standard translation table to look up on decoding (reading). */
473 Lisp_Object Vstandard_translation_table_for_decode;
474 /* Standard translation table to look up on encoding (writing). */
475 Lisp_Object Vstandard_translation_table_for_encode;
476
477 Lisp_Object Qtranslation_table;
478 Lisp_Object Qtranslation_table_id;
479 Lisp_Object Qtranslation_table_for_decode;
480 Lisp_Object Qtranslation_table_for_encode;
481
482 /* Alist of charsets vs revision number. */
483 Lisp_Object Vcharset_revision_alist;
484
485 /* Default coding systems used for process I/O. */
486 Lisp_Object Vdefault_process_coding_system;
487
488 /* Global flag to tell that we can't call post-read-conversion and
489 pre-write-conversion functions. Usually the value is zero, but it
490 is set to 1 temporarily while such functions are running. This is
491 to avoid infinite recursive call. */
492 static int inhibit_pre_post_conversion;
493
494 /* Char-table containing safe coding systems of each character. */
495 Lisp_Object Vchar_coding_system_table;
496 Lisp_Object Qchar_coding_system;
497
498 /* Return `safe-chars' property of coding system CODING. Don't check
499 validity of CODING. */
500
501 Lisp_Object
502 coding_safe_chars (coding)
503 struct coding_system *coding;
504 {
505 Lisp_Object coding_spec, plist, safe_chars;
506
507 coding_spec = Fget (coding->symbol, Qcoding_system);
508 plist = XVECTOR (coding_spec)->contents[3];
509 safe_chars = Fplist_get (XVECTOR (coding_spec)->contents[3], Qsafe_chars);
510 return (CHAR_TABLE_P (safe_chars) ? safe_chars : Qt);
511 }
512
/* Nonzero if character C is safe according to SAFE_CHARS: either
   SAFE_CHARS is Qt, or the entry for C in the char-table SAFE_CHARS
   is non-nil.  The table is consulted only when SAFE_CHARS is not
   Qt.  */
#define CODING_SAFE_CHAR_P(safe_chars, c)               \
  (EQ (safe_chars, Qt)                                  \
   ? 1                                                  \
   : !NILP (CHAR_TABLE_REF (safe_chars, c)))
515
516 \f
517 /*** 2. Emacs internal format (emacs-mule) handlers ***/
518
519 /* Emacs' internal format for representation of multiple character
520 sets is a kind of multi-byte encoding, i.e. characters are
521 represented by variable-length sequences of one-byte codes.
522
523 ASCII characters and control characters (e.g. `tab', `newline') are
524 represented by one-byte sequences which are their ASCII codes, in
525 the range 0x00 through 0x7F.
526
527 8-bit characters of the range 0x80..0x9F are represented by
528 two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
529 code + 0x20).
530
531 8-bit characters of the range 0xA0..0xFF are represented by
532 one-byte sequences which are their 8-bit code.
533
534 The other characters are represented by a sequence of `base
535 leading-code', optional `extended leading-code', and one or two
536 `position-code's. The length of the sequence is determined by the
537 base leading-code. Leading-code takes the range 0x81 through 0x9D,
538 whereas extended leading-code and position-code take the range 0xA0
539 through 0xFF. See `charset.h' for more details about leading-code
540 and position-code.
541
542 --- CODE RANGE of Emacs' internal format ---
543 character set range
544 ------------- -----
545 ascii 0x00..0x7F
546 eight-bit-control LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
547 eight-bit-graphic 0xA0..0xFF
548 ELSE 0x81..0x9D + [0xA0..0xFF]+
549 ---------------------------------------------
550
551 As this is the internal character representation, the format is
552 usually not used externally (i.e. in a file or in a data sent to a
553 process). But, it is possible to have a text externally in this
554 format (i.e. by encoding by the coding system `emacs-mule').
555
556 In that case, a sequence of one-byte codes has a slightly different
557 form.
558
559 Firstly, all characters in eight-bit-control are represented by
560 one-byte sequences which are their 8-bit code.
561
562 Next, character composition data are represented by the byte
563 sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
564 where,
565 METHOD is 0xF0 plus one of composition method (enum
566 composition_method),
567
568 BYTES is 0xA0 plus the byte length of these composition data,
569
570 CHARS is 0xA0 plus the number of characters composed by these
571 data,
572
573 COMPONENTs are characters of multibyte form or composition
574 rules encoded by two-byte of ASCII codes.
575
576 In addition, for backward compatibility, the following formats are
577 also recognized as composition data on decoding.
578
579 0x80 MSEQ ...
580 0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
581
582 Here,
583 MSEQ is a multibyte form but in these special format:
584 ASCII: 0xA0 ASCII_CODE+0x80,
585 other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
586 RULE is a one byte code of the range 0xA0..0xF0 that
587 represents a composition rule.
588 */
589
590 enum emacs_code_class_type emacs_code_class[256];
591
592 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
593 Check if a text is encoded in Emacs' internal format. If it is,
594 return CODING_CATEGORY_MASK_EMACS_MULE, else return 0. */
595
596 static int
597 detect_coding_emacs_mule (src, src_end, multibytep)
598 unsigned char *src, *src_end;
599 int multibytep;
600 {
601 unsigned char c;
602 int composing = 0;
603 /* Dummy for ONE_MORE_BYTE. */
604 struct coding_system dummy_coding;
605 struct coding_system *coding = &dummy_coding;
606
607 while (1)
608 {
609 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
610
611 if (composing)
612 {
613 if (c < 0xA0)
614 composing = 0;
615 else if (c == 0xA0)
616 {
617 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
618 c &= 0x7F;
619 }
620 else
621 c -= 0x20;
622 }
623
624 if (c < 0x20)
625 {
626 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
627 return 0;
628 }
629 else if (c >= 0x80 && c < 0xA0)
630 {
631 if (c == 0x80)
632 /* Old leading code for a composite character. */
633 composing = 1;
634 else
635 {
636 unsigned char *src_base = src - 1;
637 int bytes;
638
639 if (!UNIBYTE_STR_AS_MULTIBYTE_P (src_base, src_end - src_base,
640 bytes))
641 return 0;
642 src = src_base + bytes;
643 }
644 }
645 }
646 label_end_of_loop:
647 return CODING_CATEGORY_MASK_EMACS_MULE;
648 }
649
650
/* Open a new composition record in CODING->cmp_data: remember where
   the record begins in CODING->cmp_data_start, mark its length slot
   as not yet known (-1), store the starting position START (offset
   by cmp_data->char_offset) and the METHOD, and reserve the four
   header slots.  The slot for the ending position is left for
   CODING_ADD_COMPOSITION_END.  */

#define CODING_ADD_COMPOSITION_START(coding, start, method)     \
  do {                                                          \
    struct composition_data *cmp_data = coding->cmp_data;       \
    int *data = &cmp_data->data[cmp_data->used];                \
    coding->cmp_data_start = cmp_data->used;                    \
    data[0] = -1;                                               \
    data[1] = cmp_data->char_offset + start;                    \
    data[3] = (int) method;                                     \
    cmp_data->used = cmp_data->used + 4;                        \
  } while (0)
663
/* Close the composition record that begins at
   CODING->cmp_data_start: store the record's length (in slots) in
   its first slot and the ending position END (offset by
   cmp_data->char_offset) in its third slot.  */

#define CODING_ADD_COMPOSITION_END(coding, end)                 \
  do {                                                          \
    struct composition_data *cmp_data = coding->cmp_data;       \
    int *data = &cmp_data->data[coding->cmp_data_start];        \
    data[0] = cmp_data->used - coding->cmp_data_start;          \
    data[2] = cmp_data->char_offset + end;                      \
  } while (0)
673
/* Append one COMPONENT (an alternate character or a composition
   rule) to CODING->cmp_data->data and bump the count of used
   slots.  The value of the expression is COMPONENT as stored.  */

#define CODING_ADD_COMPOSITION_COMPONENT(coding, component)             \
  (*(coding->cmp_data->data + coding->cmp_data->used++) = component)
678
679
/* Yield the byte that SRC points to and advance SRC past it.  When
   SRC is not less than SRC_END, yield -1 and leave SRC as it is.  */

#define SAFE_ONE_MORE_BYTE() (src < src_end ? *src++ : -1)
684
685
/* Decode a character represented as a component of a composition
   sequence of Emacs 20 style at SRC.  Set C to that character, store
   its multibyte form sequence at P, and set P to the end of that
   sequence.  If no valid character is found, set C to -1.

   As described in the section 2 comment, a component is either

     ASCII: 0xA0 ASCII_CODE+0x80, or
     other: LEADING_CODE+0x20 FOLLOWING-BYTE ...

   so an ASCII component is recovered by subtracting 0x80 from the
   byte that follows 0xA0.  A byte satisfying CHAR_HEAD_P cannot
   start a component.  */

#define DECODE_EMACS_MULE_COMPOSITION_CHAR(c, p)                        \
  do {                                                                  \
    int bytes;                                                          \
                                                                        \
    c = SAFE_ONE_MORE_BYTE ();                                          \
    if (c < 0)                                                          \
      break;                                                            \
    if (CHAR_HEAD_P (c))                                                \
      c = -1;                                                           \
    else if (c == 0xA0)                                                 \
      {                                                                 \
        c = SAFE_ONE_MORE_BYTE ();                                      \
        /* This also rejects the -1 of an exhausted source.  */         \
        if (c < 0xA0)                                                   \
          c = -1;                                                       \
        else                                                            \
          {                                                             \
            /* The byte is ASCII_CODE + 0x80.  */                       \
            c -= 0x80;                                                  \
            *p++ = c;                                                   \
          }                                                             \
      }                                                                 \
    else if (BASE_LEADING_CODE_P (c - 0x20))                            \
      {                                                                 \
        unsigned char *p0 = p;                                          \
                                                                        \
        c -= 0x20;                                                      \
        *p++ = c;                                                       \
        bytes = BYTES_BY_CHAR_HEAD (c);                                 \
        /* Copy the following bytes of the sequence verbatim.  */       \
        while (--bytes)                                                 \
          {                                                             \
            c = SAFE_ONE_MORE_BYTE ();                                  \
            if (c < 0)                                                  \
              break;                                                    \
            *p++ = c;                                                   \
          }                                                             \
        if (UNIBYTE_STR_AS_MULTIBYTE_P (p0, p - p0, bytes))             \
          c = STRING_CHAR (p0, bytes);                                  \
        else                                                            \
          c = -1;                                                       \
      }                                                                 \
    else                                                                \
      c = -1;                                                           \
  } while (0)
733
734
/* Decode a composition rule represented as a component of a
   composition sequence of Emacs 20 style at SRC.  The rule is one
   byte, 0xA0 plus a value in 0..80 that packs GREF * 9 + NREF.  Set
   C to the rule as built by COMPOSITION_ENCODE_RULE; the caller's
   variables `gref' and `nref' are overwritten on the way.  If no
   valid rule is found (including when the source is exhausted), set
   C to -1.  */

#define DECODE_EMACS_MULE_COMPOSITION_RULE(c)           \
  do {                                                  \
    c = SAFE_ONE_MORE_BYTE () - 0xA0;                   \
    if (c >= 0 && c < 81)                               \
      {                                                 \
        gref = c / 9;                                   \
        nref = c % 9;                                   \
        c = COMPOSITION_ENCODE_RULE (gref, nref);       \
      }                                                 \
    else                                                \
      c = -1;                                           \
  } while (0)
751
752
753 /* Decode composition sequence encoded by `emacs-mule' at the source
754 pointed by SRC. SRC_END is the end of source. Store information
755 of the composition in CODING->cmp_data.
756
757 For backward compatibility, decode also a composition sequence of
758 Emacs 20 style. In that case, the composition sequence contains
759 characters that should be extracted into a buffer or string. Store
760 those characters at *DESTINATION in multibyte form.
761
762 If we encounter an invalid byte sequence, return 0.
763 If we encounter an insufficient source or destination, or
764 insufficient space in CODING->cmp_data, return 1.
765 Otherwise, return consumed bytes in the source.
766
767 */
768 static INLINE int
769 decode_composition_emacs_mule (coding, src, src_end,
770 destination, dst_end, dst_bytes)
771 struct coding_system *coding;
772 unsigned char *src, *src_end, **destination, *dst_end;
773 int dst_bytes;
774 {
775 unsigned char *dst = *destination;
776 int method, data_len, nchars;
777 unsigned char *src_base = src++;
778 /* Store components of composition. */
779 int component[COMPOSITION_DATA_MAX_BUNCH_LENGTH];
780 int ncomponent;
781 /* Store multibyte form of characters to be composed. This is for
782 Emacs 20 style composition sequence. */
783 unsigned char buf[MAX_COMPOSITION_COMPONENTS * MAX_MULTIBYTE_LENGTH];
784 unsigned char *bufp = buf;
785 int c, i, gref, nref;
786
787 if (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH
788 >= COMPOSITION_DATA_SIZE)
789 {
790 coding->result = CODING_FINISH_INSUFFICIENT_CMP;
791 return -1;
792 }
793
794 ONE_MORE_BYTE (c);
795 if (c - 0xF0 >= COMPOSITION_RELATIVE
796 && c - 0xF0 <= COMPOSITION_WITH_RULE_ALTCHARS)
797 {
798 int with_rule;
799
800 method = c - 0xF0;
801 with_rule = (method == COMPOSITION_WITH_RULE
802 || method == COMPOSITION_WITH_RULE_ALTCHARS);
803 ONE_MORE_BYTE (c);
804 data_len = c - 0xA0;
805 if (data_len < 4
806 || src_base + data_len > src_end)
807 return 0;
808 ONE_MORE_BYTE (c);
809 nchars = c - 0xA0;
810 if (c < 1)
811 return 0;
812 for (ncomponent = 0; src < src_base + data_len; ncomponent++)
813 {
814 /* If it is longer than this, it can't be valid. */
815 if (ncomponent >= COMPOSITION_DATA_MAX_BUNCH_LENGTH)
816 return 0;
817
818 if (ncomponent % 2 && with_rule)
819 {
820 ONE_MORE_BYTE (gref);
821 gref -= 32;
822 ONE_MORE_BYTE (nref);
823 nref -= 32;
824 c = COMPOSITION_ENCODE_RULE (gref, nref);
825 }
826 else
827 {
828 int bytes;
829 if (UNIBYTE_STR_AS_MULTIBYTE_P (src, src_end - src, bytes))
830 c = STRING_CHAR (src, bytes);
831 else
832 c = *src, bytes = 1;
833 src += bytes;
834 }
835 component[ncomponent] = c;
836 }
837 }
838 else
839 {
840 /* This may be an old Emacs 20 style format. See the comment at
841 the section 2 of this file. */
842 while (src < src_end && !CHAR_HEAD_P (*src)) src++;
843 if (src == src_end
844 && !(coding->mode & CODING_MODE_LAST_BLOCK))
845 goto label_end_of_loop;
846
847 src_end = src;
848 src = src_base + 1;
849 if (c < 0xC0)
850 {
851 method = COMPOSITION_RELATIVE;
852 for (ncomponent = 0; ncomponent < MAX_COMPOSITION_COMPONENTS;)
853 {
854 DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp);
855 if (c < 0)
856 break;
857 component[ncomponent++] = c;
858 }
859 if (ncomponent < 2)
860 return 0;
861 nchars = ncomponent;
862 }
863 else if (c == 0xFF)
864 {
865 method = COMPOSITION_WITH_RULE;
866 src++;
867 DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp);
868 if (c < 0)
869 return 0;
870 component[0] = c;
871 for (ncomponent = 1;
872 ncomponent < MAX_COMPOSITION_COMPONENTS * 2 - 1;)
873 {
874 DECODE_EMACS_MULE_COMPOSITION_RULE (c);
875 if (c < 0)
876 break;
877 component[ncomponent++] = c;
878 DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp);
879 if (c < 0)
880 break;
881 component[ncomponent++] = c;
882 }
883 if (ncomponent < 3)
884 return 0;
885 nchars = (ncomponent + 1) / 2;
886 }
887 else
888 return 0;
889 }
890
891 if (buf == bufp || dst + (bufp - buf) <= (dst_bytes ? dst_end : src))
892 {
893 CODING_ADD_COMPOSITION_START (coding, coding->produced_char, method);
894 for (i = 0; i < ncomponent; i++)
895 CODING_ADD_COMPOSITION_COMPONENT (coding, component[i]);
896 CODING_ADD_COMPOSITION_END (coding, coding->produced_char + nchars);
897 if (buf < bufp)
898 {
899 unsigned char *p = buf;
900 EMIT_BYTES (p, bufp);
901 *destination += bufp - buf;
902 coding->produced_char += nchars;
903 }
904 return (src - src_base);
905 }
906 label_end_of_loop:
907 return -1;
908 }
909
910 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
911
912 static void
913 decode_coding_emacs_mule (coding, source, destination, src_bytes, dst_bytes)
914 struct coding_system *coding;
915 unsigned char *source, *destination;
916 int src_bytes, dst_bytes;
917 {
918 unsigned char *src = source;
919 unsigned char *src_end = source + src_bytes;
920 unsigned char *dst = destination;
921 unsigned char *dst_end = destination + dst_bytes;
922 /* SRC_BASE remembers the start position in source in each loop.
923 The loop will be exited when there's not enough source code, or
924 when there's not enough destination area to produce a
925 character. */
926 unsigned char *src_base;
927
928 coding->produced_char = 0;
929 while ((src_base = src) < src_end)
930 {
931 unsigned char tmp[MAX_MULTIBYTE_LENGTH], *p;
932 int bytes;
933
934 if (*src == '\r')
935 {
936 int c = *src++;
937
938 if (coding->eol_type == CODING_EOL_CR)
939 c = '\n';
940 else if (coding->eol_type == CODING_EOL_CRLF)
941 {
942 ONE_MORE_BYTE (c);
943 if (c != '\n')
944 {
945 src--;
946 c = '\r';
947 }
948 }
949 *dst++ = c;
950 coding->produced_char++;
951 continue;
952 }
953 else if (*src == '\n')
954 {
955 if ((coding->eol_type == CODING_EOL_CR
956 || coding->eol_type == CODING_EOL_CRLF)
957 && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
958 {
959 coding->result = CODING_FINISH_INCONSISTENT_EOL;
960 goto label_end_of_loop;
961 }
962 *dst++ = *src++;
963 coding->produced_char++;
964 continue;
965 }
966 else if (*src == 0x80 && coding->cmp_data)
967 {
968 /* Start of composition data. */
969 int consumed = decode_composition_emacs_mule (coding, src, src_end,
970 &dst, dst_end,
971 dst_bytes);
972 if (consumed < 0)
973 goto label_end_of_loop;
974 else if (consumed > 0)
975 {
976 src += consumed;
977 continue;
978 }
979 bytes = CHAR_STRING (*src, tmp);
980 p = tmp;
981 src++;
982 }
983 else if (UNIBYTE_STR_AS_MULTIBYTE_P (src, src_end - src, bytes))
984 {
985 p = src;
986 src += bytes;
987 }
988 else
989 {
990 bytes = CHAR_STRING (*src, tmp);
991 p = tmp;
992 src++;
993 }
994 if (dst + bytes >= (dst_bytes ? dst_end : src))
995 {
996 coding->result = CODING_FINISH_INSUFFICIENT_DST;
997 break;
998 }
999 while (bytes--) *dst++ = *p++;
1000 coding->produced_char++;
1001 }
1002 label_end_of_loop:
1003 coding->consumed = coding->consumed_char = src_base - source;
1004 coding->produced = dst - destination;
1005 }
1006
1007
/* Encode the composition record at DATA into the special byte
   sequence

     0x80 METHOD BYTES CHARS COMPONENT ...

   and write it at `dst'.  For the rule-based methods the components
   alternate between characters and rules, a rule being written as
   the two bytes 0x20+GREF and 0x20+NREF; otherwise every component
   is a character.  If the destination lacks room, store
   CODING_FINISH_INSUFFICIENT_DST in CODING->result and jump to
   `label_end_of_loop'.  Afterwards advance CODING->cmp_data_start
   past the record, moving on to CODING->cmp_data->next when the
   current chunk is used up and a next one exists.  */

#define ENCODE_COMPOSITION_EMACS_MULE(coding, data)                     \
  do {                                                                  \
    unsigned char buf[1024], *p0 = buf, *p = buf + 4;                   \
    int len = data[0];                                                  \
    int i;                                                              \
                                                                        \
    buf[0] = 0x80;                                                      \
    buf[1] = 0xF0 + data[3];    /* METHOD */                            \
    buf[3] = 0xA0 + (data[2] - data[1]); /* COMPOSED-CHARS */           \
    if (data[3] != COMPOSITION_WITH_RULE                                \
        && data[3] != COMPOSITION_WITH_RULE_ALTCHARS)                   \
      {                                                                 \
        i = 4;                                                          \
        while (i < len)                                                 \
          {                                                             \
            p += CHAR_STRING (data[i], p);                              \
            i++;                                                        \
          }                                                             \
      }                                                                 \
    else                                                                \
      {                                                                 \
        p += CHAR_STRING (data[4], p);                                  \
        i = 5;                                                          \
        while (i < len)                                                 \
          {                                                             \
            int gref, nref;                                             \
            COMPOSITION_DECODE_RULE (data[i], gref, nref);              \
            *p++ = 0x20 + gref;                                         \
            *p++ = 0x20 + nref;                                         \
            p += CHAR_STRING (data[i + 1], p);                          \
            i += 2;                                                     \
          }                                                             \
      }                                                                 \
    buf[2] = 0xA0 + (p - buf);  /* COMPONENTS-BYTES */                  \
                                                                        \
    if (dst + (p - buf) + 4 > (dst_bytes ? dst_end : src))              \
      {                                                                 \
        coding->result = CODING_FINISH_INSUFFICIENT_DST;                \
        goto label_end_of_loop;                                         \
      }                                                                 \
    for (; p0 < p; p0++)                                                \
      *dst++ = *p0;                                                     \
    coding->cmp_data_start += data[0];                                  \
    if (coding->cmp_data_start == coding->cmp_data->used                \
        && coding->cmp_data->next)                                      \
      {                                                                 \
        coding->cmp_data = coding->cmp_data->next;                      \
        coding->cmp_data_start = 0;                                     \
      }                                                                 \
  } while (0)
1057
1058
1059 static void encode_eol P_ ((struct coding_system *, const unsigned char *,
1060 unsigned char *, int, int));
1061
1062 static void
1063 encode_coding_emacs_mule (coding, source, destination, src_bytes, dst_bytes)
1064 struct coding_system *coding;
1065 unsigned char *source, *destination;
1066 int src_bytes, dst_bytes;
1067 {
1068 unsigned char *src = source;
1069 unsigned char *src_end = source + src_bytes;
1070 unsigned char *dst = destination;
1071 unsigned char *dst_end = destination + dst_bytes;
1072 unsigned char *src_base;
1073 int c;
1074 int char_offset;
1075 int *data;
1076
1077 Lisp_Object translation_table;
1078
1079 translation_table = Qnil;
1080
1081 /* Optimization for the case that there's no composition. */
1082 if (!coding->cmp_data || coding->cmp_data->used == 0)
1083 {
1084 encode_eol (coding, source, destination, src_bytes, dst_bytes);
1085 return;
1086 }
1087
1088 char_offset = coding->cmp_data->char_offset;
1089 data = coding->cmp_data->data + coding->cmp_data_start;
1090 while (1)
1091 {
1092 src_base = src;
1093
1094 /* If SRC starts a composition, encode the information about the
1095 composition in advance. */
1096 if (coding->cmp_data_start < coding->cmp_data->used
1097 && char_offset + coding->consumed_char == data[1])
1098 {
1099 ENCODE_COMPOSITION_EMACS_MULE (coding, data);
1100 char_offset = coding->cmp_data->char_offset;
1101 data = coding->cmp_data->data + coding->cmp_data_start;
1102 }
1103
1104 ONE_MORE_CHAR (c);
1105 if (c == '\n' && (coding->eol_type == CODING_EOL_CRLF
1106 || coding->eol_type == CODING_EOL_CR))
1107 {
1108 if (coding->eol_type == CODING_EOL_CRLF)
1109 EMIT_TWO_BYTES ('\r', c);
1110 else
1111 EMIT_ONE_BYTE ('\r');
1112 }
1113 else if (SINGLE_BYTE_CHAR_P (c))
1114 EMIT_ONE_BYTE (c);
1115 else
1116 EMIT_BYTES (src_base, src);
1117 coding->consumed_char++;
1118 }
1119 label_end_of_loop:
1120 coding->consumed = src_base - source;
1121 coding->produced = coding->produced_char = dst - destination;
1122 return;
1123 }
1124
1125 \f
1126 /*** 3. ISO2022 handlers ***/
1127
1128 /* The following note describes the coding system ISO2022 briefly.
1129 Since the intention of this note is to help understand the
1130 functions in this file, some parts are NOT ACCURATE or are OVERLY
1131 SIMPLIFIED. For thorough understanding, please refer to the
1132 original document of ISO2022. This is equivalent to the standard
1133 ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
1134
1135 ISO2022 provides many mechanisms to encode several character sets
1136 in 7-bit and 8-bit environments. For 7-bit environments, all text
1137 is encoded using bytes less than 128. This may make the encoded
1138 text a little bit longer, but the text passes more easily through
1139 several types of gateway, some of which strip off the MSB (Most
1140 Significant Bit).
1141
1142 There are two kinds of character sets: control character sets and
1143 graphic character sets. The former contain control characters such
1144 as `newline' and `escape' to provide control functions (control
1145 functions are also provided by escape sequences). The latter
1146 contain graphic characters such as 'A' and '-'. Emacs recognizes
1147 two control character sets and many graphic character sets.
1148
1149 Graphic character sets are classified into one of the following
1150 four classes, according to the number of bytes (DIMENSION) and
1151 number of characters in one dimension (CHARS) of the set:
1152 - DIMENSION1_CHARS94
1153 - DIMENSION1_CHARS96
1154 - DIMENSION2_CHARS94
1155 - DIMENSION2_CHARS96
1156
1157 In addition, each character set is assigned an identification tag,
1158 unique for each set, called the "final character" (denoted as <F>
1159 hereafter). The <F> of each character set is decided by ECMA(*)
1160 when it is registered in ISO. The code range of <F> is 0x30..0x7F
1161 (0x30..0x3F are for private use only).
1162
1163 Note (*): ECMA = European Computer Manufacturers Association
1164
1165 Here are examples of graphic character sets [NAME(<F>)]:
1166 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
1167 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
1168 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
1169 o DIMENSION2_CHARS96 -- none for the moment
1170
1171 A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
1172 C0 [0x00..0x1F] -- control character plane 0
1173 GL [0x20..0x7F] -- graphic character plane 0
1174 C1 [0x80..0x9F] -- control character plane 1
1175 GR [0xA0..0xFF] -- graphic character plane 1
1176
1177 A control character set is directly designated and invoked to C0 or
1178 C1 by an escape sequence. The most common case is that:
1179 - ISO646's control character set is designated/invoked to C0, and
1180 - ISO6429's control character set is designated/invoked to C1,
1181 and usually these designations/invocations are omitted in encoded
1182 text. In a 7-bit environment, only C0 can be used, and a control
1183 character for C1 is encoded by an appropriate escape sequence to
1184 fit into the environment. All control characters for C1 are
1185 defined to have corresponding escape sequences.
1186
1187 A graphic character set is at first designated to one of four
1188 graphic registers (G0 through G3), then these graphic registers are
1189 invoked to GL or GR. These designations and invocations can be
1190 done independently. The most common case is that G0 is invoked to
1191 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
1192 these invocations and designations are omitted in encoded text.
1193 In a 7-bit environment, only GL can be used.
1194
1195 When a graphic character set of CHARS94 is invoked to GL, codes
1196 0x20 and 0x7F of the GL area work as control characters SPACE and
1197 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
1198 be used.
1199
1200 There are two ways of invocation: locking-shift and single-shift.
1201 With locking-shift, the invocation lasts until the next different
1202 invocation, whereas with single-shift, the invocation affects the
1203 following character only and doesn't affect the locking-shift
1204 state. Invocations are done by the following control characters or
1205 escape sequences:
1206
1207 ----------------------------------------------------------------------
1208 abbrev function cntrl escape seq description
1209 ----------------------------------------------------------------------
1210 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
1211 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
1212 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
1213 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
1214 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
1215 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
1216 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
1217 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
1218 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
1219 ----------------------------------------------------------------------
1220 (*) These are not used by any known coding system.
1221
1222 Control characters for these functions are defined by macros
1223 ISO_CODE_XXX in `coding.h'.
1224
1225 Designations are done by the following escape sequences:
1226 ----------------------------------------------------------------------
1227 escape sequence description
1228 ----------------------------------------------------------------------
1229 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
1230 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
1231 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
1232 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
1233 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
1234 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
1235 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
1236 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
1237 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
1238 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
1239 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
1240 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
1241 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
1242 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
1243 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
1244 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
1245 ----------------------------------------------------------------------
1246
1247 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
1248 of dimension 1, chars 94, and final character <F>, etc...
1249
1250 Note (*): Although these designations are not allowed in ISO2022,
1251 Emacs accepts them on decoding, and produces them on encoding
1252 CHARS96 character sets in a coding system which is characterized as
1253 7-bit environment, non-locking-shift, and non-single-shift.
1254
1255 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
1256 '(' can be omitted. We refer to this as "short-form" hereafter.
1257
1258 Now you may notice that there are a lot of ways of encoding the
1259 same multilingual text in ISO2022. Actually, there exist many
1260 coding systems such as Compound Text (used in X11's inter-client
1261 communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
1262 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
1263 localized platforms), and all of these are variants of ISO2022.
1264
1265 In addition to the above, Emacs handles two more kinds of escape
1266 sequences: ISO6429's direction specification and Emacs' private
1267 sequence for specifying character composition.
1268
1269 ISO6429's direction specification takes the following form:
1270 o CSI ']' -- end of the current direction
1271 o CSI '0' ']' -- end of the current direction
1272 o CSI '1' ']' -- start of left-to-right text
1273 o CSI '2' ']' -- start of right-to-left text
1274 The control character CSI (0x9B: control sequence introducer) is
1275 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
1276
1277 Character composition specification takes the following form:
1278 o ESC '0' -- start relative composition
1279 o ESC '1' -- end composition
1280 o ESC '2' -- start rule-base composition (*)
1281 o ESC '3' -- start relative composition with alternate chars (**)
1282 o ESC '4' -- start rule-base composition with alternate chars (**)
1283 Since these are not standard escape sequences of any ISO standard,
1284 the use of them with these meanings is restricted to Emacs only.
1285
1286 (*) This form is used only in Emacs 20.5 and older versions,
1287 but the newer versions can safely decode it.
1288 (**) This form is used only in Emacs 21.1 and newer versions,
1289 and the older versions can't decode it.
1290
1291 Here's a list of example usages of these composition escape
1292 sequences (categorized by `enum composition_method').
1293
1294 COMPOSITION_RELATIVE:
1295 ESC 0 CHAR [ CHAR ] ESC 1
1296 COMPOSITION_WITH_RULE:
1297 ESC 2 CHAR [ RULE CHAR ] ESC 1
1298 COMPOSITION_WITH_ALTCHARS:
1299 ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
1300 COMPOSITION_WITH_RULE_ALTCHARS:
1301 ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
1302
1303 enum iso_code_class_type iso_code_class[256];
1304
1305 #define CHARSET_OK(idx, charset, c) \
1306 (coding_system_table[idx] \
1307 && (charset == CHARSET_ASCII \
1308 || (safe_chars = coding_safe_chars (coding_system_table[idx]), \
1309 CODING_SAFE_CHAR_P (safe_chars, c))) \
1310 && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding_system_table[idx], \
1311 charset) \
1312 != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
1313
1314 #define SHIFT_OUT_OK(idx) \
1315 (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding_system_table[idx], 1) >= 0)
1316
1317 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1318 Check if a text is encoded in ISO2022. If it is, return an
1319 integer in which appropriate flag bits any of:
1320 CODING_CATEGORY_MASK_ISO_7
1321 CODING_CATEGORY_MASK_ISO_7_TIGHT
1322 CODING_CATEGORY_MASK_ISO_8_1
1323 CODING_CATEGORY_MASK_ISO_8_2
1324 CODING_CATEGORY_MASK_ISO_7_ELSE
1325 CODING_CATEGORY_MASK_ISO_8_ELSE
1326 are set. If a code which should never appear in ISO2022 is found,
1327 returns 0. */
1328
1329 static int
1330 detect_coding_iso2022 (src, src_end, multibytep)
1331 unsigned char *src, *src_end;
1332 int multibytep;
1333 {
1334 int mask = CODING_CATEGORY_MASK_ISO;
1335 int mask_found = 0;
1336 int reg[4], shift_out = 0, single_shifting = 0;
1337 int c, c1, charset;
1338 /* Dummy for ONE_MORE_BYTE. */
1339 struct coding_system dummy_coding;
1340 struct coding_system *coding = &dummy_coding;
1341 Lisp_Object safe_chars;
1342
1343 reg[0] = CHARSET_ASCII, reg[1] = reg[2] = reg[3] = -1;
1344 while (mask && src < src_end)
1345 {
1346 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
1347 retry:
1348 switch (c)
1349 {
1350 case ISO_CODE_ESC:
1351 if (inhibit_iso_escape_detection)
1352 break;
1353 single_shifting = 0;
1354 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
1355 if (c >= '(' && c <= '/')
1356 {
1357 /* Designation sequence for a charset of dimension 1. */
1358 ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep);
1359 if (c1 < ' ' || c1 >= 0x80
1360 || (charset = iso_charset_table[0][c >= ','][c1]) < 0)
1361 /* Invalid designation sequence. Just ignore. */
1362 break;
1363 reg[(c - '(') % 4] = charset;
1364 }
1365 else if (c == '$')
1366 {
1367 /* Designation sequence for a charset of dimension 2. */
1368 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
1369 if (c >= '@' && c <= 'B')
1370 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
1371 reg[0] = charset = iso_charset_table[1][0][c];
1372 else if (c >= '(' && c <= '/')
1373 {
1374 ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep);
1375 if (c1 < ' ' || c1 >= 0x80
1376 || (charset = iso_charset_table[1][c >= ','][c1]) < 0)
1377 /* Invalid designation sequence. Just ignore. */
1378 break;
1379 reg[(c - '(') % 4] = charset;
1380 }
1381 else
1382 /* Invalid designation sequence. Just ignore. */
1383 break;
1384 }
1385 else if (c == 'N' || c == 'O')
1386 {
1387 /* ESC <Fe> for SS2 or SS3. */
1388 mask &= CODING_CATEGORY_MASK_ISO_7_ELSE;
1389 break;
1390 }
1391 else if (c >= '0' && c <= '4')
1392 {
1393 /* ESC <Fp> for start/end composition. */
1394 mask_found |= CODING_CATEGORY_MASK_ISO;
1395 break;
1396 }
1397 else
1398 /* Invalid escape sequence. Just ignore. */
1399 break;
1400
1401 /* We found a valid designation sequence for CHARSET. */
1402 mask &= ~CODING_CATEGORY_MASK_ISO_8BIT;
1403 c = MAKE_CHAR (charset, 0, 0);
1404 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7, charset, c))
1405 mask_found |= CODING_CATEGORY_MASK_ISO_7;
1406 else
1407 mask &= ~CODING_CATEGORY_MASK_ISO_7;
1408 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT, charset, c))
1409 mask_found |= CODING_CATEGORY_MASK_ISO_7_TIGHT;
1410 else
1411 mask &= ~CODING_CATEGORY_MASK_ISO_7_TIGHT;
1412 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_ELSE, charset, c))
1413 mask_found |= CODING_CATEGORY_MASK_ISO_7_ELSE;
1414 else
1415 mask &= ~CODING_CATEGORY_MASK_ISO_7_ELSE;
1416 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_8_ELSE, charset, c))
1417 mask_found |= CODING_CATEGORY_MASK_ISO_8_ELSE;
1418 else
1419 mask &= ~CODING_CATEGORY_MASK_ISO_8_ELSE;
1420 break;
1421
1422 case ISO_CODE_SO:
1423 if (inhibit_iso_escape_detection)
1424 break;
1425 single_shifting = 0;
1426 if (shift_out == 0
1427 && (reg[1] >= 0
1428 || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_7_ELSE)
1429 || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_8_ELSE)))
1430 {
1431 /* Locking shift out. */
1432 mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
1433 mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
1434 }
1435 break;
1436
1437 case ISO_CODE_SI:
1438 if (inhibit_iso_escape_detection)
1439 break;
1440 single_shifting = 0;
1441 if (shift_out == 1)
1442 {
1443 /* Locking shift in. */
1444 mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
1445 mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
1446 }
1447 break;
1448
1449 case ISO_CODE_CSI:
1450 single_shifting = 0;
1451 case ISO_CODE_SS2:
1452 case ISO_CODE_SS3:
1453 {
1454 int newmask = CODING_CATEGORY_MASK_ISO_8_ELSE;
1455
1456 if (inhibit_iso_escape_detection)
1457 break;
1458 if (c != ISO_CODE_CSI)
1459 {
1460 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1461 & CODING_FLAG_ISO_SINGLE_SHIFT)
1462 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
1463 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1464 & CODING_FLAG_ISO_SINGLE_SHIFT)
1465 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
1466 single_shifting = 1;
1467 }
1468 if (VECTORP (Vlatin_extra_code_table)
1469 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
1470 {
1471 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1472 & CODING_FLAG_ISO_LATIN_EXTRA)
1473 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
1474 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1475 & CODING_FLAG_ISO_LATIN_EXTRA)
1476 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
1477 }
1478 mask &= newmask;
1479 mask_found |= newmask;
1480 }
1481 break;
1482
1483 default:
1484 if (c < 0x80)
1485 {
1486 single_shifting = 0;
1487 break;
1488 }
1489 else if (c < 0xA0)
1490 {
1491 single_shifting = 0;
1492 if (VECTORP (Vlatin_extra_code_table)
1493 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
1494 {
1495 int newmask = 0;
1496
1497 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1498 & CODING_FLAG_ISO_LATIN_EXTRA)
1499 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
1500 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1501 & CODING_FLAG_ISO_LATIN_EXTRA)
1502 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
1503 mask &= newmask;
1504 mask_found |= newmask;
1505 }
1506 else
1507 return 0;
1508 }
1509 else
1510 {
1511 mask &= ~(CODING_CATEGORY_MASK_ISO_7BIT
1512 | CODING_CATEGORY_MASK_ISO_7_ELSE);
1513 mask_found |= CODING_CATEGORY_MASK_ISO_8_1;
1514 /* Check the length of succeeding codes of the range
1515 0xA0..0FF. If the byte length is odd, we exclude
1516 CODING_CATEGORY_MASK_ISO_8_2. We can check this only
1517 when we are not single shifting. */
1518 if (!single_shifting
1519 && mask & CODING_CATEGORY_MASK_ISO_8_2)
1520 {
1521 int i = 1;
1522
1523 c = -1;
1524 while (src < src_end)
1525 {
1526 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
1527 if (c < 0xA0)
1528 break;
1529 i++;
1530 }
1531
1532 if (i & 1 && src < src_end)
1533 mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
1534 else
1535 mask_found |= CODING_CATEGORY_MASK_ISO_8_2;
1536 if (c >= 0)
1537 /* This means that we have read one extra byte. */
1538 goto retry;
1539 }
1540 }
1541 break;
1542 }
1543 }
1544 label_end_of_loop:
1545 return (mask & mask_found);
1546 }
1547
/* Return the character code of the character whose charset is
   CHARSET, whose 1st position code is C1 and whose 2nd position code
   is C2.  When the variable `translation_table' of the enclosing
   function is non-nil, the code is obtained through translate_char
   with that table; otherwise it is made directly by MAKE_CHAR.  */

#define DECODE_ISO_CHARACTER(charset, c1, c2) \
  (!NILP (translation_table) \
   ? translate_char (translation_table, -1, charset, c1, c2) \
   : MAKE_CHAR (charset, c1, c2))
1557
/* Set designation state into CODING: the charset identified by
   DIMENSION, CHARS and FINAL_CHAR becomes the one designated to
   graphic register REG.

   The macro uses `coding', `safe_chars' and the label
   `label_invalid_code' of the enclosing function.  It jumps to that
   label, leaving the designation untouched, when
     - FINAL_CHAR is below '0' or not below 128,
     - no charset matches, or the charset is neither requested for REG
       nor contained in `safe_chars' (REG is then remembered in
       last_invalid_designation_register), or
     - ASCII is designated to register 0 right after an invalid
       designation to register 0.
   If CODING_MODE_DIRECTION is set and the charset has a reverse
   charset, the reverse charset is designated instead.  */
#define DECODE_DESIGNATION(reg, dimension, chars, final_char) \
  do { \
    int charset, c; \
 \
    if (final_char < '0' || final_char >= 128) \
      goto label_invalid_code; \
    charset = ISO_CHARSET_TABLE (make_number (dimension), \
                                 make_number (chars), \
                                 make_number (final_char)); \
    c = MAKE_CHAR (charset, 0, 0); \
    if (charset < 0 \
        || (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) != reg \
            && ! CODING_SAFE_CHAR_P (safe_chars, c))) \
      { \
        /* Unknown or unacceptable charset.  */ \
        coding->spec.iso2022.last_invalid_designation_register = reg; \
        goto label_invalid_code; \
      } \
    if (coding->spec.iso2022.last_invalid_designation_register == 0 \
        && reg == 0 \
        && charset == CHARSET_ASCII) \
      { \
        /* We should insert this designation sequence as is so \
           that it is surely written back to a file. */ \
        coding->spec.iso2022.last_invalid_designation_register = -1; \
        goto label_invalid_code; \
      } \
    coding->spec.iso2022.last_invalid_designation_register = -1; \
    if ((coding->mode & CODING_MODE_DIRECTION) \
        && CHARSET_REVERSE_CHARSET (charset) >= 0) \
      charset = CHARSET_REVERSE_CHARSET (charset); \
    CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset; \
  } while (0)
1594
1595 /* Allocate a memory block for storing information about compositions.
1596 The block is chained to the already allocated blocks. */
1597
1598 void
1599 coding_allocate_composition_data (coding, char_offset)
1600 struct coding_system *coding;
1601 int char_offset;
1602 {
1603 struct composition_data *cmp_data
1604 = (struct composition_data *) xmalloc (sizeof *cmp_data);
1605
1606 cmp_data->char_offset = char_offset;
1607 cmp_data->used = 0;
1608 cmp_data->prev = coding->cmp_data;
1609 cmp_data->next = NULL;
1610 if (coding->cmp_data)
1611 coding->cmp_data->next = cmp_data;
1612 coding->cmp_data = cmp_data;
1613 coding->cmp_data_start = 0;
1614 }
1615
/* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
   C1 is the byte that followed ESC.
     ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
     ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
     ESC 3 : altchar composition  : ESC 3 ALT ... ESC 0 CHAR ... ESC 1
     ESC 4 : alt&rule composition : ESC 4 ALT RULE .. ALT ESC 0 CHAR ... ESC 1

   The macro uses `coding', `dst' and the label `label_end_of_loop' of
   the enclosing function.  When composition is disabled, the escape
   sequence itself is written at DST as two characters.  */

#define DECODE_COMPOSITION_START(c1) \
  do { \
    if (coding->composing == COMPOSITION_DISABLED) \
      { \
        /* Pass the sequence through literally.  */ \
        *dst++ = ISO_CODE_ESC; \
        *dst++ = c1 & 0x7f; \
        coding->produced_char += 2; \
      } \
    else if (!COMPOSING_P (coding)) \
      { \
        /* This is surely the start of a composition.  We must be sure \
           that coding->cmp_data has enough space to store the \
           information about the composition.  If not, terminate the \
           current decoding loop, allocate one more memory block for \
           coding->cmp_data in the caller, then start the decoding \
           loop again.  We can't allocate memory here directly because \
           it may cause buffer/string relocation.  */ \
        if (!coding->cmp_data \
            || (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH \
                >= COMPOSITION_DATA_SIZE)) \
          { \
            coding->result = CODING_FINISH_INSUFFICIENT_CMP; \
            goto label_end_of_loop; \
          } \
        switch (c1) \
          { \
          case '0': \
            coding->composing = COMPOSITION_RELATIVE; \
            break; \
          case '2': \
            coding->composing = COMPOSITION_WITH_RULE; \
            break; \
          case '3': \
            coding->composing = COMPOSITION_WITH_ALTCHARS; \
            break; \
          default: \
            coding->composing = COMPOSITION_WITH_RULE_ALTCHARS; \
            break; \
          } \
        CODING_ADD_COMPOSITION_START (coding, coding->produced_char, \
                                      coding->composing); \
        coding->composition_rule_follows = 0; \
      } \
    else if (coding->composing == COMPOSITION_WITH_ALTCHARS \
             || coding->composing == COMPOSITION_WITH_RULE_ALTCHARS) \
      { \
        /* We are already handling a composition of one of these two \
           methods, so the codes following the current escape \
           sequence are actual characters stored in a buffer.  */ \
        coding->composing = COMPOSITION_RELATIVE; \
        coding->composition_rule_follows = 0; \
      } \
  } while (0)
1668
/* Handle composition end sequence ESC 1.  C1 is the byte that
   followed ESC.  While a composition is being decoded, its end is
   recorded at the current character position and CODING leaves the
   composing state.  Otherwise the escape sequence is written at DST
   as two characters.  Uses `coding' and `dst' of the enclosing
   function.  */

#define DECODE_COMPOSITION_END(c1) \
  do { \
    if (COMPOSING_P (coding)) \
      { \
        CODING_ADD_COMPOSITION_END (coding, coding->produced_char); \
        coding->composing = COMPOSITION_NO; \
      } \
    else \
      { \
        /* Stray ESC 1: keep it in the output.  */ \
        *dst++ = ISO_CODE_ESC; \
        *dst++ = c1; \
        coding->produced_char += 2; \
      } \
  } while (0)
1685
/* Decode a composition rule from the byte C1 (and maybe one more byte
   read from SRC into the variable `c2' of the enclosing function) and
   store one encoded composition rule in coding->cmp_data.

   C1 is modified: 32 is subtracted from it first.  A resulting value
   below 81 is the old one-byte format; a value from 81 up to 92 is
   the new two-byte format.  For any other value the rule stored
   is 0.  */

#define DECODE_COMPOSITION_RULE(c1) \
  do { \
    int rule = 0; \
    (c1) -= 32; \
    if (c1 >= 81) \
      { \
        if (c1 < 93) /* new format (after ver.21) */ \
          { \
            ONE_MORE_BYTE (c2); \
            rule = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32); \
          } \
      } \
    else /* old format (before ver.21) */ \
      { \
        int gref = (c1) / 9; \
        int nref = (c1) % 9; \
        /* Reference point 4 of the old format is stored as 10.  */ \
        if (gref == 4) gref = 10; \
        if (nref == 4) nref = 10; \
        rule = COMPOSITION_ENCODE_RULE (gref, nref); \
      } \
    CODING_ADD_COMPOSITION_COMPONENT (coding, rule); \
    coding->composition_rule_follows = 0; \
  } while (0)
1710
1711
1712 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
1713
1714 static void
1715 decode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
1716 struct coding_system *coding;
1717 unsigned char *source, *destination;
1718 int src_bytes, dst_bytes;
1719 {
1720 unsigned char *src = source;
1721 unsigned char *src_end = source + src_bytes;
1722 unsigned char *dst = destination;
1723 unsigned char *dst_end = destination + dst_bytes;
1724 /* Charsets invoked to graphic plane 0 and 1 respectively. */
1725 int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1726 int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1727 /* SRC_BASE remembers the start position in source in each loop.
1728 The loop will be exited when there's not enough source code
1729 (within macro ONE_MORE_BYTE), or when there's not enough
1730 destination area to produce a character (within macro
1731 EMIT_CHAR). */
1732 unsigned char *src_base;
1733 int c, charset;
1734 Lisp_Object translation_table;
1735 Lisp_Object safe_chars;
1736
1737 safe_chars = coding_safe_chars (coding);
1738
1739 if (NILP (Venable_character_translation))
1740 translation_table = Qnil;
1741 else
1742 {
1743 translation_table = coding->translation_table_for_decode;
1744 if (NILP (translation_table))
1745 translation_table = Vstandard_translation_table_for_decode;
1746 }
1747
1748 coding->result = CODING_FINISH_NORMAL;
1749
1750 while (1)
1751 {
1752 int c1, c2;
1753
1754 src_base = src;
1755 ONE_MORE_BYTE (c1);
1756
1757 /* We produce no character or one character. */
1758 switch (iso_code_class [c1])
1759 {
1760 case ISO_0x20_or_0x7F:
1761 if (COMPOSING_P (coding) && coding->composition_rule_follows)
1762 {
1763 DECODE_COMPOSITION_RULE (c1);
1764 continue;
1765 }
1766 if (charset0 < 0 || CHARSET_CHARS (charset0) == 94)
1767 {
1768 /* This is SPACE or DEL. */
1769 charset = CHARSET_ASCII;
1770 break;
1771 }
1772 /* This is a graphic character, we fall down ... */
1773
1774 case ISO_graphic_plane_0:
1775 if (COMPOSING_P (coding) && coding->composition_rule_follows)
1776 {
1777 DECODE_COMPOSITION_RULE (c1);
1778 continue;
1779 }
1780 charset = charset0;
1781 break;
1782
1783 case ISO_0xA0_or_0xFF:
1784 if (charset1 < 0 || CHARSET_CHARS (charset1) == 94
1785 || coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
1786 goto label_invalid_code;
1787 /* This is a graphic character, we fall down ... */
1788
1789 case ISO_graphic_plane_1:
1790 if (charset1 < 0)
1791 goto label_invalid_code;
1792 charset = charset1;
1793 break;
1794
1795 case ISO_control_0:
1796 if (COMPOSING_P (coding))
1797 DECODE_COMPOSITION_END ('1');
1798
1799 /* All ISO2022 control characters in this class have the
1800 same representation in Emacs internal format. */
1801 if (c1 == '\n'
1802 && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1803 && (coding->eol_type == CODING_EOL_CR
1804 || coding->eol_type == CODING_EOL_CRLF))
1805 {
1806 coding->result = CODING_FINISH_INCONSISTENT_EOL;
1807 goto label_end_of_loop;
1808 }
1809 charset = CHARSET_ASCII;
1810 break;
1811
1812 case ISO_control_1:
1813 if (COMPOSING_P (coding))
1814 DECODE_COMPOSITION_END ('1');
1815 goto label_invalid_code;
1816
1817 case ISO_carriage_return:
1818 if (COMPOSING_P (coding))
1819 DECODE_COMPOSITION_END ('1');
1820
1821 if (coding->eol_type == CODING_EOL_CR)
1822 c1 = '\n';
1823 else if (coding->eol_type == CODING_EOL_CRLF)
1824 {
1825 ONE_MORE_BYTE (c1);
1826 if (c1 != ISO_CODE_LF)
1827 {
1828 src--;
1829 c1 = '\r';
1830 }
1831 }
1832 charset = CHARSET_ASCII;
1833 break;
1834
1835 case ISO_shift_out:
1836 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1837 || CODING_SPEC_ISO_DESIGNATION (coding, 1) < 0)
1838 goto label_invalid_code;
1839 CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
1840 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1841 continue;
1842
1843 case ISO_shift_in:
1844 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
1845 goto label_invalid_code;
1846 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
1847 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1848 continue;
1849
1850 case ISO_single_shift_2_7:
1851 case ISO_single_shift_2:
1852 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1853 goto label_invalid_code;
1854 /* SS2 is handled as an escape sequence of ESC 'N' */
1855 c1 = 'N';
1856 goto label_escape_sequence;
1857
1858 case ISO_single_shift_3:
1859 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1860 goto label_invalid_code;
1861 /* SS3 is handled as an escape sequence of ESC 'O' */
1862 c1 = 'O';
1863 goto label_escape_sequence;
1864
1865 case ISO_control_sequence_introducer:
1866 /* CSI is handled as an escape sequence of ESC '[' ... */
1867 c1 = '[';
1868 goto label_escape_sequence;
1869
1870 case ISO_escape:
1871 ONE_MORE_BYTE (c1);
1872 label_escape_sequence:
1873 /* Escape sequences handled by Emacs are invocation,
1874 designation, direction specification, and character
1875 composition specification. */
1876 switch (c1)
1877 {
1878 case '&': /* revision of following character set */
1879 ONE_MORE_BYTE (c1);
1880 if (!(c1 >= '@' && c1 <= '~'))
1881 goto label_invalid_code;
1882 ONE_MORE_BYTE (c1);
1883 if (c1 != ISO_CODE_ESC)
1884 goto label_invalid_code;
1885 ONE_MORE_BYTE (c1);
1886 goto label_escape_sequence;
1887
1888 case '$': /* designation of 2-byte character set */
1889 if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
1890 goto label_invalid_code;
1891 ONE_MORE_BYTE (c1);
1892 if (c1 >= '@' && c1 <= 'B')
1893 { /* designation of JISX0208.1978, GB2312.1980,
1894 or JISX0208.1980 */
1895 DECODE_DESIGNATION (0, 2, 94, c1);
1896 }
1897 else if (c1 >= 0x28 && c1 <= 0x2B)
1898 { /* designation of DIMENSION2_CHARS94 character set */
1899 ONE_MORE_BYTE (c2);
1900 DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
1901 }
1902 else if (c1 >= 0x2C && c1 <= 0x2F)
1903 { /* designation of DIMENSION2_CHARS96 character set */
1904 ONE_MORE_BYTE (c2);
1905 DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
1906 }
1907 else
1908 goto label_invalid_code;
1909 /* We must update these variables now. */
1910 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1911 charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1912 continue;
1913
1914 case 'n': /* invocation of locking-shift-2 */
1915 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1916 || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
1917 goto label_invalid_code;
1918 CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
1919 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1920 continue;
1921
1922 case 'o': /* invocation of locking-shift-3 */
1923 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1924 || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
1925 goto label_invalid_code;
1926 CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
1927 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1928 continue;
1929
1930 case 'N': /* invocation of single-shift-2 */
1931 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1932 || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
1933 goto label_invalid_code;
1934 charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
1935 ONE_MORE_BYTE (c1);
1936 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
1937 goto label_invalid_code;
1938 break;
1939
1940 case 'O': /* invocation of single-shift-3 */
1941 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1942 || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
1943 goto label_invalid_code;
1944 charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
1945 ONE_MORE_BYTE (c1);
1946 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
1947 goto label_invalid_code;
1948 break;
1949
1950 case '0': case '2': case '3': case '4': /* start composition */
1951 DECODE_COMPOSITION_START (c1);
1952 continue;
1953
1954 case '1': /* end composition */
1955 DECODE_COMPOSITION_END (c1);
1956 continue;
1957
1958 case '[': /* specification of direction */
1959 if (coding->flags & CODING_FLAG_ISO_NO_DIRECTION)
1960 goto label_invalid_code;
1961 /* For the moment, nested direction is not supported.
1962 So, `coding->mode & CODING_MODE_DIRECTION' zero means
1963 left-to-right, and nonzero means right-to-left. */
1964 ONE_MORE_BYTE (c1);
1965 switch (c1)
1966 {
1967 case ']': /* end of the current direction */
1968 coding->mode &= ~CODING_MODE_DIRECTION;
1969
1970 case '0': /* end of the current direction */
1971 case '1': /* start of left-to-right direction */
1972 ONE_MORE_BYTE (c1);
1973 if (c1 == ']')
1974 coding->mode &= ~CODING_MODE_DIRECTION;
1975 else
1976 goto label_invalid_code;
1977 break;
1978
1979 case '2': /* start of right-to-left direction */
1980 ONE_MORE_BYTE (c1);
1981 if (c1 == ']')
1982 coding->mode |= CODING_MODE_DIRECTION;
1983 else
1984 goto label_invalid_code;
1985 break;
1986
1987 default:
1988 goto label_invalid_code;
1989 }
1990 continue;
1991
1992 default:
1993 if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
1994 goto label_invalid_code;
1995 if (c1 >= 0x28 && c1 <= 0x2B)
1996 { /* designation of DIMENSION1_CHARS94 character set */
1997 ONE_MORE_BYTE (c2);
1998 DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
1999 }
2000 else if (c1 >= 0x2C && c1 <= 0x2F)
2001 { /* designation of DIMENSION1_CHARS96 character set */
2002 ONE_MORE_BYTE (c2);
2003 DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
2004 }
2005 else
2006 goto label_invalid_code;
2007 /* We must update these variables now. */
2008 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
2009 charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
2010 continue;
2011 }
2012 }
2013
2014 /* Now we know CHARSET and 1st position code C1 of a character.
2015 Produce a multibyte sequence for that character while getting
2016 2nd position code C2 if necessary. */
2017 if (CHARSET_DIMENSION (charset) == 2)
2018 {
2019 ONE_MORE_BYTE (c2);
2020 if (c1 < 0x80 ? c2 < 0x20 || c2 >= 0x80 : c2 < 0xA0)
2021 /* C2 is not in a valid range. */
2022 goto label_invalid_code;
2023 }
2024 c = DECODE_ISO_CHARACTER (charset, c1, c2);
2025 EMIT_CHAR (c);
2026 continue;
2027
2028 label_invalid_code:
2029 coding->errors++;
2030 if (COMPOSING_P (coding))
2031 DECODE_COMPOSITION_END ('1');
2032 src = src_base;
2033 c = *src++;
2034 EMIT_CHAR (c);
2035 }
2036
2037 label_end_of_loop:
2038 coding->consumed = coding->consumed_char = src_base - source;
2039 coding->produced = dst - destination;
2040 return;
2041 }
2042
2043
2044 /* ISO2022 encoding stuff. */
2045
2046 /*
2047 It is not enough to say just "ISO2022" on encoding, we have to
2048 specify more details. In Emacs, each ISO2022 coding system
2049 variant has the following specifications:
2050 1. Initial designation to G0 through G3.
2051 2. Allows short-form designation?
2052 3. ASCII should be designated to G0 before control characters?
2053 4. ASCII should be designated to G0 at end of line?
2054 5. 7-bit environment or 8-bit environment?
2055 6. Use locking-shift?
2056 7. Use Single-shift?
2057 And the following two are only for Japanese:
2058 8. Use ASCII in place of JIS0201-1976-Roman?
2059 9. Use JISX0208-1983 in place of JISX0208-1978?
2060 These specifications are encoded in `coding->flags' as flag bits
2061 defined by macros CODING_FLAG_ISO_XXX. See `coding.h' for more
2062 details.
2063 */
2064
/* Emit at DST (advancing it) the escape sequence that designates
   CHARSET to graphic register REG, and record the designation in
   CODING.  When the revision number CODING holds for CHARSET is below
   255, the sequence is preceded by `ESC & <'@' + revision>'.  For a
   2-byte 94-char set designated to register 0 whose <final-char> is
   '@', 'A', or 'B', the intermediate character is omitted (short
   form) if CODING has CODING_FLAG_ISO_SHORT_FORM set.  */

#define ENCODE_DESIGNATION(charset, reg, coding)                        \
  do {                                                                  \
    unsigned char final_char = CHARSET_ISO_FINAL_CHAR (charset);        \
    int revision = CODING_SPEC_ISO_REVISION_NUMBER (coding, charset);   \
    int chars94_p = CHARSET_CHARS (charset) == 94;                      \
    /* Intermediate characters selecting G0..G3, indexed by REG.  */    \
    const char *inter = chars94_p ? "()*+" : ",-./";                    \
                                                                        \
    if (revision < 255)                                                 \
      {                                                                 \
        *dst++ = ISO_CODE_ESC;                                          \
        *dst++ = '&';                                                   \
        *dst++ = '@' + revision;                                        \
      }                                                                 \
    *dst++ = ISO_CODE_ESC;                                              \
    if (CHARSET_DIMENSION (charset) != 1)                               \
      {                                                                 \
        *dst++ = '$';                                                   \
        /* Skip the intermediate character only in short form.  */     \
        if (! (chars94_p                                                \
               && ((coding)->flags & CODING_FLAG_ISO_SHORT_FORM)        \
               && (reg) == 0                                            \
               && final_char >= '@' && final_char <= 'B'))              \
          *dst++ = (unsigned char) (inter[reg]);                        \
      }                                                                 \
    else                                                                \
      *dst++ = (unsigned char) (inter[reg]);                            \
    *dst++ = final_char;                                                \
    CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;                \
  } while (0)
2107
/* Emit the ISO2022 single-shift-2 / single-shift-3 function at DST.
   With CODING_FLAG_ISO_SEVEN_BITS set the function is written as the
   escape sequence ESC 'N' / ESC 'O', otherwise as the single control
   code ISO_CODE_SS2 / ISO_CODE_SS3.  Either way the single-shifting
   state of CODING is set to 1.  */

#define ENCODE_SINGLE_SHIFT_2                                   \
  do {                                                          \
    if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS))         \
      *dst++ = ISO_CODE_SS2;                                    \
    else                                                        \
      {                                                         \
        *dst++ = ISO_CODE_ESC;                                  \
        *dst++ = 'N';                                           \
      }                                                         \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;               \
  } while (0)

#define ENCODE_SINGLE_SHIFT_3                                   \
  do {                                                          \
    if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS))         \
      *dst++ = ISO_CODE_SS3;                                    \
    else                                                        \
      {                                                         \
        *dst++ = ISO_CODE_ESC;                                  \
        *dst++ = 'O';                                           \
      }                                                         \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;               \
  } while (0)

2129
/* Emit one of the ISO2022 locking-shift functions at DST and record
   which graphic register is now invoked to graphic plane 0 of CODING:
   shift-in (register 0), shift-out (register 1), locking-shift-2
   (ESC 'n', register 2), and locking-shift-3 (ESC 'o', register 3).  */

#define ENCODE_SHIFT_IN                                 \
  do {                                                  \
    *dst = ISO_CODE_SI;                                 \
    dst++;                                              \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;         \
  } while (0)

#define ENCODE_SHIFT_OUT                                \
  do {                                                  \
    *dst = ISO_CODE_SO;                                 \
    dst++;                                              \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;         \
  } while (0)

#define ENCODE_LOCKING_SHIFT_2                          \
  do {                                                  \
    *dst++ = ISO_CODE_ESC;                              \
    *dst++ = 'n';                                       \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;         \
  } while (0)

#define ENCODE_LOCKING_SHIFT_3                          \
  do {                                                  \
    *dst++ = ISO_CODE_ESC;                              \
    *dst++ = 'o';                                       \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;         \
  } while (0)

2157
/* Emit at DST the one-byte code of a DIMENSION1 character whose
   character set is CHARSET and whose position-code is C1.  If CHARSET
   is invoked to neither graphic plane, designation and invocation
   sequences are produced first by encode_invocation_designation and
   the test is repeated.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
  do {                                                                  \
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
      {                                                                 \
        /* A single-shift was just emitted: the byte form depends      \
           only on the 7-bit/8-bit environment.  */                     \
        *dst++ = ((coding->flags & CODING_FLAG_ISO_SEVEN_BITS)          \
                  ? ((c1) & 0x7F) : ((c1) | 0x80));                     \
        CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
        break;                                                          \
      }                                                                 \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))           \
      {                                                                 \
        *dst++ = (c1) & 0x7F;                                           \
        break;                                                          \
      }                                                                 \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))           \
      {                                                                 \
        *dst++ = (c1) | 0x80;                                           \
        break;                                                          \
      }                                                                 \
    /* CHARSET is not yet invoked to any graphic plane.  Invoke it     \
       (designating it to a graphic register first if needed), then    \
       go round the loop again to emit the character itself.  */       \
    dst = encode_invocation_designation (charset, coding, dst);         \
  } while (1)
2190
/* Emit at DST the two-byte code of a DIMENSION2 character whose
   character set is CHARSET and whose position-codes are C1 and C2.
   If CHARSET is invoked to neither graphic plane, designation and
   invocation sequences are produced first by
   encode_invocation_designation and the test is repeated.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
  do {                                                                  \
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
      {                                                                 \
        /* A single-shift was just emitted: the byte form depends      \
           only on the 7-bit/8-bit environment.  */                     \
        if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                 \
          {                                                             \
            *dst++ = (c1) & 0x7F;                                       \
            *dst++ = (c2) & 0x7F;                                       \
          }                                                             \
        else                                                            \
          {                                                             \
            *dst++ = (c1) | 0x80;                                       \
            *dst++ = (c2) | 0x80;                                       \
          }                                                             \
        CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
        break;                                                          \
      }                                                                 \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))           \
      {                                                                 \
        *dst++ = (c1) & 0x7F;                                           \
        *dst++ = (c2) & 0x7F;                                           \
        break;                                                          \
      }                                                                 \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))           \
      {                                                                 \
        *dst++ = (c1) | 0x80;                                           \
        *dst++ = (c2) | 0x80;                                           \
        break;                                                          \
      }                                                                 \
    /* CHARSET is not yet invoked to any graphic plane.  Invoke it     \
       (designating it to a graphic register first if needed), then    \
       go round the loop again to emit the character itself.  */       \
    dst = encode_invocation_designation (charset, coding, dst);         \
  } while (1)
2223
/* Encode character C at DST.  C is split into its charset and
   position codes.  If the charset is not defined, the position codes
   are copied to DST as they are (the second one only when it is
   non-negative).  Otherwise the character goes through the DIMENSION1
   or DIMENSION2 encoder, after replacing ASCII by latin-jisx0201 when
   CODING_FLAG_ISO_USE_ROMAN is set, or jisx0208 by jisx0208-1978 when
   CODING_FLAG_ISO_USE_OLDJIS is set.  */

#define ENCODE_ISO_CHARACTER(c)                                         \
  do {                                                                  \
    int charset, c1, c2;                                                \
                                                                        \
    SPLIT_CHAR (c, charset, c1, c2);                                    \
    if (! CHARSET_DEFINED_P (charset))                                  \
      {                                                                 \
        *dst++ = c1;                                                    \
        if (c2 >= 0)                                                    \
          *dst++ = c2;                                                  \
      }                                                                 \
    else if (CHARSET_DIMENSION (charset) != 1)                          \
      {                                                                 \
        if (charset == charset_jisx0208                                 \
            && coding->flags & CODING_FLAG_ISO_USE_OLDJIS)              \
          charset = charset_jisx0208_1978;                              \
        ENCODE_ISO_CHARACTER_DIMENSION2 (charset, c1, c2);              \
      }                                                                 \
    else                                                                \
      {                                                                 \
        if (charset == CHARSET_ASCII                                    \
            && coding->flags & CODING_FLAG_ISO_USE_ROMAN)               \
          charset = charset_latin_jisx0201;                             \
        ENCODE_ISO_CHARACTER_DIMENSION1 (charset, c1);                  \
      }                                                                 \
  } while (0)

2253
2254
/* Instead of encoding character C, encode the substitution character
   CODING_INHIBIT_CHARACTER_SUBSTITUTION: twice when the width of C's
   charset is greater than 1, otherwise once.  */

#define ENCODE_UNSAFE_CHARACTER(c)                                      \
  do {                                                                  \
    int n_subst = CHARSET_WIDTH (CHAR_CHARSET (c)) > 1 ? 2 : 1;         \
                                                                        \
    while (n_subst-- > 0)                                               \
      ENCODE_ISO_CHARACTER (CODING_INHIBIT_CHARACTER_SUBSTITUTION);     \
  } while (0)
2263
2264
2265 /* Produce designation and invocation codes at a place pointed by DST
2266 to use CHARSET. The element `spec.iso2022' of *CODING is updated.
2267 Return new DST. */
2268
2269 unsigned char *
2270 encode_invocation_designation (charset, coding, dst)
2271 int charset;
2272 struct coding_system *coding;
2273 unsigned char *dst;
2274 {
2275 int reg; /* graphic register number */
2276
2277 /* At first, check designations. */
2278 for (reg = 0; reg < 4; reg++)
2279 if (charset == CODING_SPEC_ISO_DESIGNATION (coding, reg))
2280 break;
2281
2282 if (reg >= 4)
2283 {
2284 /* CHARSET is not yet designated to any graphic registers. */
2285 /* At first check the requested designation. */
2286 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
2287 if (reg == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
2288 /* Since CHARSET requests no special designation, designate it
2289 to graphic register 0. */
2290 reg = 0;
2291
2292 ENCODE_DESIGNATION (charset, reg, coding);
2293 }
2294
2295 if (CODING_SPEC_ISO_INVOCATION (coding, 0) != reg
2296 && CODING_SPEC_ISO_INVOCATION (coding, 1) != reg)
2297 {
2298 /* Since the graphic register REG is not invoked to any graphic
2299 planes, invoke it to graphic plane 0. */
2300 switch (reg)
2301 {
2302 case 0: /* graphic register 0 */
2303 ENCODE_SHIFT_IN;
2304 break;
2305
2306 case 1: /* graphic register 1 */
2307 ENCODE_SHIFT_OUT;
2308 break;
2309
2310 case 2: /* graphic register 2 */
2311 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2312 ENCODE_SINGLE_SHIFT_2;
2313 else
2314 ENCODE_LOCKING_SHIFT_2;
2315 break;
2316
2317 case 3: /* graphic register 3 */
2318 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2319 ENCODE_SINGLE_SHIFT_3;
2320 else
2321 ENCODE_LOCKING_SHIFT_3;
2322 break;
2323 }
2324 }
2325
2326 return dst;
2327 }
2328
/* Write at DST the 2-byte code of the encoded composition rule RULE:
   RULE is decoded into its two reference points, which are stored as
   `32 + 81 + gref' and `32 + nref'.  */

#define ENCODE_COMPOSITION_RULE(rule)                   \
  do {                                                  \
    int gref, nref;                                     \
                                                        \
    COMPOSITION_DECODE_RULE (rule, gref, nref);         \
    dst[0] = 32 + 81 + gref;                            \
    dst[1] = 32 + nref;                                 \
    dst += 2;                                           \
  } while (0)
2338
/* Write at DST the sequence that starts a composition (ESC 0, ESC 3,
   or ESC 4).  DATA points to an array of integers describing the
   composition (see the comment in coding.h for its format); DATA[3]
   becomes the current composition method of CODING.  For methods
   other than COMPOSITION_RELATIVE, the component index of CODING is
   set to four entries past the start of the composition data and the
   rule-follows flag is cleared.  */

#define ENCODE_COMPOSITION_START(coding, data)                          \
  do {                                                                  \
    (coding)->composing = (data)[3];                                    \
    *dst++ = ISO_CODE_ESC;                                              \
    if ((coding)->composing != COMPOSITION_RELATIVE)                    \
      {                                                                 \
        if ((coding)->composing == COMPOSITION_WITH_ALTCHARS)           \
          *dst++ = '3';                                                 \
        else                                                            \
          *dst++ = '4';                                                 \
        (coding)->cmp_data_index = (coding)->cmp_data_start + 4;        \
        (coding)->composition_rule_follows = 0;                         \
      }                                                                 \
    else                                                                \
      *dst++ = '0';                                                     \
  } while (0)
2358
/* Write ESC 1 at DST to end the current composition, advance the
   composition data start of CODING by DATA[0], and mark CODING as not
   composing.  When that exhausts the current composition data block
   and another block follows, switch to the next block.  */

#define ENCODE_COMPOSITION_END(coding, data)                            \
  do {                                                                  \
    *dst++ = ISO_CODE_ESC;                                              \
    *dst++ = '1';                                                       \
    (coding)->cmp_data_start += (data)[0];                              \
    (coding)->composing = COMPOSITION_NO;                               \
    if ((coding)->cmp_data_start == (coding)->cmp_data->used)           \
      {                                                                 \
        if ((coding)->cmp_data->next)                                   \
          {                                                             \
            (coding)->cmp_data = (coding)->cmp_data->next;              \
            (coding)->cmp_data_start = 0;                               \
          }                                                             \
      }                                                                 \
  } while (0)
2374
/* Write the composition start sequence ESC 0 at DST and switch CODING
   to COMPOSITION_RELATIVE.  The sequence does not open a new
   composition here: it marks that the components (alternate chars and
   composition rules) have just been produced and that the actual text
   of the composition follows in SRC.  */

#define ENCODE_COMPOSITION_FAKE_START(coding)           \
  do {                                                  \
    dst[0] = ISO_CODE_ESC;                              \
    dst[1] = '0';                                       \
    dst += 2;                                           \
    (coding)->composing = COMPOSITION_RELATIVE;         \
  } while (0)
2386
/* The following three macros produce codes for indicating direction
   of text.  Each one expands to a single statement that writes at DST
   and advances it.

   ENCODE_CONTROL_SEQUENCE_INTRODUCER writes CSI: as ESC '[' when
   CODING_FLAG_ISO_SEVEN_BITS is set in `coding->flags' (the flag is
   tested as a bit, so other flags may be set too), otherwise as the
   single code ISO_CODE_CSI.  ENCODE_DIRECTION_R2L and
   ENCODE_DIRECTION_L2R write CSI followed by "2]" and "0]"
   respectively.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER                      \
  do {                                                          \
    if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)             \
      {                                                         \
        *dst++ = ISO_CODE_ESC;                                  \
        *dst++ = '[';                                           \
      }                                                         \
    else                                                        \
      *dst++ = ISO_CODE_CSI;                                    \
  } while (0)

#define ENCODE_DIRECTION_R2L                    \
  do {                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
    *dst++ = '2';                               \
    *dst++ = ']';                               \
  } while (0)

#define ENCODE_DIRECTION_L2R                    \
  do {                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
    *dst++ = '0';                               \
    *dst++ = ']';                               \
  } while (0)

2402
/* Write at DST the codes that bring the graphic planes and registers
   of CODING back to the initial state: a shift-in when graphic plane
   0 does not hold register 0, then a designation for every register
   whose initial designation is non-negative and differs from the
   current one.  */
#define ENCODE_RESET_PLANE_AND_REGISTER                                 \
  do {                                                                  \
    int reg = 0;                                                        \
                                                                        \
    if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0)                    \
      ENCODE_SHIFT_IN;                                                  \
    while (reg < 4)                                                     \
      {                                                                 \
        int initial = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg); \
                                                                        \
        if (initial >= 0                                                \
            && CODING_SPEC_ISO_DESIGNATION (coding, reg) != initial)    \
          ENCODE_DESIGNATION (initial, reg, coding);                    \
        reg++;                                                          \
      }                                                                 \
  } while (0)

2417
2418 /* Produce designation sequences of charsets in the line started from
2419 SRC to a place pointed by DST, and return updated DST.
2420
2421 If the current block ends before any end-of-line, we may fail to
2422 find all the necessary designations. */
2423
2424 static unsigned char *
2425 encode_designation_at_bol (coding, translation_table, src, src_end, dst)
2426 struct coding_system *coding;
2427 Lisp_Object translation_table;
2428 unsigned char *src, *src_end, *dst;
2429 {
2430 int charset, c, found = 0, reg;
2431 /* Table of charsets to be designated to each graphic register. */
2432 int r[4];
2433
2434 for (reg = 0; reg < 4; reg++)
2435 r[reg] = -1;
2436
2437 while (found < 4)
2438 {
2439 ONE_MORE_CHAR (c);
2440 if (c == '\n')
2441 break;
2442
2443 charset = CHAR_CHARSET (c);
2444 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
2445 if (reg != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION && r[reg] < 0)
2446 {
2447 found++;
2448 r[reg] = charset;
2449 }
2450 }
2451
2452 label_end_of_loop:
2453 if (found)
2454 {
2455 for (reg = 0; reg < 4; reg++)
2456 if (r[reg] >= 0
2457 && CODING_SPEC_ISO_DESIGNATION (coding, reg) != r[reg])
2458 ENCODE_DESIGNATION (r[reg], reg, coding);
2459 }
2460
2461 return dst;
2462 }
2463
/* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".

   Encode the SRC_BYTES bytes at SOURCE into an ISO2022 variant and
   store the result at DESTINATION, using the state and flags held in
   CODING.  On return `coding->consumed' is the number of source bytes
   processed, `coding->produced' and `coding->produced_char' are the
   number of bytes written, `coding->consumed_char' counts the source
   characters encoded, and `coding->errors' counts the single-byte
   non-ASCII characters that were copied through unencoded.
   `coding->result' is set to CODING_FINISH_INSUFFICIENT_DST when the
   loop stops for lack of destination room.  */

static void
encode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
     struct coding_system *coding;
     unsigned char *source, *destination;
     int src_bytes, dst_bytes;
{
  unsigned char *src = source;
  unsigned char *src_end = source + src_bytes;
  unsigned char *dst = destination;
  unsigned char *dst_end = destination + dst_bytes;
  /* Since the maximum bytes produced by each loop is 20, we subtract 19
     from DST_END to assure overflow checking is necessary only at the
     head of loop.  */
  unsigned char *adjusted_dst_end = dst_end - 19;
  /* SRC_BASE remembers the start position in source in each loop.
     The loop will be exited when there's not enough source text to
     analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
     there's not enough destination area to produce encoded codes
     (within macro EMIT_BYTES).  */
  unsigned char *src_base;
  int c;
  Lisp_Object translation_table;
  /* Consulted below only when CODING_FLAG_ISO_SAFE is set.  */
  Lisp_Object safe_chars;

  safe_chars = coding_safe_chars (coding);

  /* Pick the translation table: none when character translation is
     disabled, else the coding system's own table, else the standard
     one.  */
  if (NILP (Venable_character_translation))
    translation_table = Qnil;
  else
    {
      translation_table = coding->translation_table_for_encode;
      if (NILP (translation_table))
        translation_table = Vstandard_translation_table_for_encode;
    }

  coding->consumed_char = 0;
  coding->errors = 0;
  while (1)
    {
      src_base = src;

      /* When DST_BYTES is zero the limit is taken relative to SRC
         instead of DST_END.  NOTE(review): presumably source and
         destination then share one buffer -- confirm with callers.  */
      if (dst >= (dst_bytes ? adjusted_dst_end : (src - 19)))
        {
          coding->result = CODING_FINISH_INSUFFICIENT_DST;
          break;
        }

      if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL
          && CODING_SPEC_ISO_BOL (coding))
        {
          /* We have to produce designation sequences if any now.  */
          dst = encode_designation_at_bol (coding, translation_table,
                                           src, src_end, dst);
          CODING_SPEC_ISO_BOL (coding) = 0;
        }

      /* Check composition start and end.  */
      if (coding->composing != COMPOSITION_DISABLED
          && coding->cmp_data_start < coding->cmp_data->used)
        {
          struct composition_data *cmp_data = coding->cmp_data;
          int *data = cmp_data->data + coding->cmp_data_start;
          /* Position of the next source character, comparable with
             the start (DATA[1]) and end (DATA[2]) tested below.  */
          int this_pos = cmp_data->char_offset + coding->consumed_char;

          if (coding->composing == COMPOSITION_RELATIVE)
            {
              if (this_pos == data[2])
                {
                  ENCODE_COMPOSITION_END (coding, data);
                  /* The macro may have advanced the composition data;
                     reload the local views of it.  */
                  cmp_data = coding->cmp_data;
                  data = cmp_data->data + coding->cmp_data_start;
                }
            }
          else if (COMPOSING_P (coding))
            {
              /* COMPOSITION_WITH_ALTCHARS or COMPOSITION_WITH_RULE_ALTCHAR  */
              if (coding->cmp_data_index == coding->cmp_data_start + data[0])
                /* We have consumed components of the composition.
                   What follows in SRC is the composition's base
                   text.  */
                ENCODE_COMPOSITION_FAKE_START (coding);
              else
                {
                  /* Next component, taken from the composition data
                     rather than from SRC.  This C shadows the
                     function-level C.  */
                  int c = cmp_data->data[coding->cmp_data_index++];
                  if (coding->composition_rule_follows)
                    {
                      ENCODE_COMPOSITION_RULE (c);
                      coding->composition_rule_follows = 0;
                    }
                  else
                    {
                      if (coding->flags & CODING_FLAG_ISO_SAFE
                          && ! CODING_SAFE_CHAR_P (safe_chars, c))
                        ENCODE_UNSAFE_CHARACTER (c);
                      else
                        ENCODE_ISO_CHARACTER (c);
                      if (coding->composing == COMPOSITION_WITH_RULE_ALTCHARS)
                        coding->composition_rule_follows = 1;
                    }
                  /* No source character was consumed in this round.  */
                  continue;
                }
            }
          if (!COMPOSING_P (coding))
            {
              if (this_pos == data[1])
                {
                  ENCODE_COMPOSITION_START (coding, data);
                  continue;
                }
            }
        }

      ONE_MORE_CHAR (c);

      /* Now encode the character C.  */
      if (c < 0x20 || c == 0x7F)
        {
          if (c == '\r')
            {
              if (! (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
                {
                  if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
                    ENCODE_RESET_PLANE_AND_REGISTER;
                  *dst++ = c;
                  /* NOTE(review): this path skips the consumed_char
                     increment at the bottom of the loop -- confirm
                     that is intended.  */
                  continue;
                }
              /* fall down to treat '\r' as '\n' ...  */
              c = '\n';
            }
          if (c == '\n')
            {
              if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
                ENCODE_RESET_PLANE_AND_REGISTER;
              if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL)
                bcopy (coding->spec.iso2022.initial_designation,
                       coding->spec.iso2022.current_designation,
                       sizeof coding->spec.iso2022.initial_designation);
              /* Write the end-of-line in the form the coding system
                 asks for: LF (also when undecided), CR LF, or CR.  */
              if (coding->eol_type == CODING_EOL_LF
                  || coding->eol_type == CODING_EOL_UNDECIDED)
                *dst++ = ISO_CODE_LF;
              else if (coding->eol_type == CODING_EOL_CRLF)
                *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
              else
                *dst++ = ISO_CODE_CR;
              CODING_SPEC_ISO_BOL (coding) = 1;
            }
          else
            {
              if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
                ENCODE_RESET_PLANE_AND_REGISTER;
              *dst++ = c;
            }
        }
      else if (ASCII_BYTE_P (c))
        ENCODE_ISO_CHARACTER (c);
      else if (SINGLE_BYTE_CHAR_P (c))
        {
          /* A single-byte non-ASCII character is copied as is and
             counted as an error.  */
          *dst++ = c;
          coding->errors++;
        }
      else if (coding->flags & CODING_FLAG_ISO_SAFE
               && ! CODING_SAFE_CHAR_P (safe_chars, c))
        ENCODE_UNSAFE_CHARACTER (c);
      else
        ENCODE_ISO_CHARACTER (c);

      coding->consumed_char++;
    }

 label_end_of_loop:
  /* SRC_BASE, not SRC: a character whose handling was cut short is
     not reported as consumed.  */
  coding->consumed = src_base - source;
  coding->produced = coding->produced_char = dst - destination;
}
2639
2640 \f
2641 /*** 4. SJIS and BIG5 handlers ***/
2642
2643 /* Although SJIS and BIG5 are not ISO coding systems, they are used
2644 quite widely. So, for the moment, Emacs supports them in the bare
2645 C code. But, in the future, they may be supported only by CCL. */
2646
2647 /* SJIS is a coding system encoding three character sets: ASCII, right
2648 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
2649 as is. A character of charset katakana-jisx0201 is encoded by
2650 "position-code + 0x80". A character of charset japanese-jisx0208
2651 is encoded in 2-byte but two position-codes are divided and shifted
2652 so that it fits in the range below.
2653
2654 --- CODE RANGE of SJIS ---
2655 (character set) (range)
2656 ASCII 0x00 .. 0x7F
2657 KATAKANA-JISX0201 0xA1 .. 0xDF
2658 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF
2659 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
2660 -------------------------------
2661
2662 */
2663
2664 /* BIG5 is a coding system encoding two character sets: ASCII and
2665 Big5. An ASCII character is encoded as is. Big5 is a two-byte
2666 character set and is encoded in two bytes.
2667
2668 --- CODE RANGE of BIG5 ---
2669 (character set) (range)
2670 ASCII 0x00 .. 0x7F
2671 Big5 (1st byte) 0xA1 .. 0xFE
2672 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
2673 --------------------------
2674
2675 Since the number of characters in Big5 is larger than maximum
2676 characters in Emacs' charset (96x96), it can't be handled as one
2677 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
2678 and `charset-big5-2'. Both are DIMENSION2 and CHARS94. The former
2679 contains frequently used characters and the latter contains less
2680 frequently used characters. */
2681
/* Conversion between a Big5 code as it appears in the BIG5 coding
   system and Emacs' internal representation.  B1 and B2 are the 1st
   and 2nd bytes of the BIG5 code; C1 and C2 are the 1st and 2nd
   position-codes of Emacs' internal format; CHARSET is
   `charset_big5_1' or `charset_big5_2'.

   Both macros read all of their inputs before storing any output, so
   the same variables may be passed for B1/C1 and B2/C2.  */

/* Number of Big5 characters which have the same code in 1st byte:
   the 2nd bytes 0x40..0x7E plus the 2nd bytes 0xA1..0xFE.  */
#define BIG5_SAME_ROW ((0x7F - 0x40) + (0xFF - 0xA1))

#define DECODE_BIG5(b1, b2, charset, c1, c2)                            \
  do {                                                                  \
    /* Linear index of the code, counted from (0xA1, 0x40).  */         \
    unsigned int big5_idx                                               \
      = (((b1) - 0xA1) * BIG5_SAME_ROW                                  \
         + ((b2) < 0x7F ? (b2) - 0x40 : (b2) - 0x62));                  \
                                                                        \
    if ((b1) >= 0xC9)                                                   \
      {                                                                 \
        charset = charset_big5_2;                                       \
        big5_idx -= (0xC9 - 0xA1) * BIG5_SAME_ROW;                      \
      }                                                                 \
    else                                                                \
      charset = charset_big5_1;                                         \
    c1 = big5_idx / (0xFF - 0xA1) + 0x21;                               \
    c2 = big5_idx % (0xFF - 0xA1) + 0x21;                               \
  } while (0)

#define ENCODE_BIG5(charset, c1, c2, b1, b2)                            \
  do {                                                                  \
    /* Linear index of the character within its Emacs charset.  */     \
    unsigned int big5_idx                                               \
      = ((c1) - 0x21) * (0xFF - 0xA1) + ((c2) - 0x21);                  \
                                                                        \
    if (charset == charset_big5_2)                                      \
      big5_idx += BIG5_SAME_ROW * (0xC9 - 0xA1);                        \
    b1 = big5_idx / BIG5_SAME_ROW + 0xA1;                               \
    b2 = big5_idx % BIG5_SAME_ROW;                                      \
    /* Map the column onto 0x40..0x7E or 0xA1..0xFE.  */                \
    b2 += b2 < 0x3F ? 0x40 : 0x62;                                      \
  } while (0)

2714
2715 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2716 Check if a text is encoded in SJIS. If it is, return
2717 CODING_CATEGORY_MASK_SJIS, else return 0. */
2718
2719 static int
2720 detect_coding_sjis (src, src_end, multibytep)
2721 unsigned char *src, *src_end;
2722 int multibytep;
2723 {
2724 int c;
2725 /* Dummy for ONE_MORE_BYTE. */
2726 struct coding_system dummy_coding;
2727 struct coding_system *coding = &dummy_coding;
2728
2729 while (1)
2730 {
2731 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2732 if (c < 0x80)
2733 continue;
2734 if (c == 0x80 || c == 0xA0 || c > 0xEF)
2735 return 0;
2736 if (c <= 0x9F || c >= 0xE0)
2737 {
2738 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2739 if (c < 0x40 || c == 0x7F || c > 0xFC)
2740 return 0;
2741 }
2742 }
2743 label_end_of_loop:
2744 return CODING_CATEGORY_MASK_SJIS;
2745 }
2746
2747 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2748 Check if a text is encoded in BIG5. If it is, return
2749 CODING_CATEGORY_MASK_BIG5, else return 0. */
2750
2751 static int
2752 detect_coding_big5 (src, src_end, multibytep)
2753 unsigned char *src, *src_end;
2754 int multibytep;
2755 {
2756 int c;
2757 /* Dummy for ONE_MORE_BYTE. */
2758 struct coding_system dummy_coding;
2759 struct coding_system *coding = &dummy_coding;
2760
2761 while (1)
2762 {
2763 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2764 if (c < 0x80)
2765 continue;
2766 if (c < 0xA1 || c > 0xFE)
2767 return 0;
2768 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2769 if (c < 0x40 || (c > 0x7F && c < 0xA1) || c > 0xFE)
2770 return 0;
2771 }
2772 label_end_of_loop:
2773 return CODING_CATEGORY_MASK_BIG5;
2774 }
2775
2776 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2777 Check if a text is encoded in UTF-8. If it is, return
2778 CODING_CATEGORY_MASK_UTF_8, else return 0. */
2779
2780 #define UTF_8_1_OCTET_P(c) ((c) < 0x80)
2781 #define UTF_8_EXTRA_OCTET_P(c) (((c) & 0xC0) == 0x80)
2782 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
2783 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
2784 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
2785 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
2786 #define UTF_8_6_OCTET_LEADING_P(c) (((c) & 0xFE) == 0xFC)
2787
2788 static int
2789 detect_coding_utf_8 (src, src_end, multibytep)
2790 unsigned char *src, *src_end;
2791 int multibytep;
2792 {
2793 unsigned char c;
2794 int seq_maybe_bytes;
2795 /* Dummy for ONE_MORE_BYTE. */
2796 struct coding_system dummy_coding;
2797 struct coding_system *coding = &dummy_coding;
2798
2799 while (1)
2800 {
2801 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2802 if (UTF_8_1_OCTET_P (c))
2803 continue;
2804 else if (UTF_8_2_OCTET_LEADING_P (c))
2805 seq_maybe_bytes = 1;
2806 else if (UTF_8_3_OCTET_LEADING_P (c))
2807 seq_maybe_bytes = 2;
2808 else if (UTF_8_4_OCTET_LEADING_P (c))
2809 seq_maybe_bytes = 3;
2810 else if (UTF_8_5_OCTET_LEADING_P (c))
2811 seq_maybe_bytes = 4;
2812 else if (UTF_8_6_OCTET_LEADING_P (c))
2813 seq_maybe_bytes = 5;
2814 else
2815 return 0;
2816
2817 do
2818 {
2819 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2820 if (!UTF_8_EXTRA_OCTET_P (c))
2821 return 0;
2822 seq_maybe_bytes--;
2823 }
2824 while (seq_maybe_bytes > 0);
2825 }
2826
2827 label_end_of_loop:
2828 return CODING_CATEGORY_MASK_UTF_8;
2829 }
2830
2831 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2832 Check if a text is encoded in UTF-16 Big Endian (endian == 1) or
2833 Little Endian (otherwise). If it is, return
2834 CODING_CATEGORY_MASK_UTF_16_BE or CODING_CATEGORY_MASK_UTF_16_LE,
2835 else return 0. */
2836
/* Non-zero if the 16-bit code unit VAL is one of the values 0xFFFE
   and 0xFFFF.  */
#define UTF_16_INVALID_P(val)   \
  (((val) == 0xFFFE)            \
   || ((val) == 0xFFFF))

/* Non-zero if the 16-bit code unit VAL is a high surrogate, i.e. lies
   in 0xD800..0xDBFF.  The top six bits select the surrogate block, so
   they are all included in the mask.  */
#define UTF_16_HIGH_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xD800)

/* Non-zero if the 16-bit code unit VAL is a low surrogate, i.e. lies
   in 0xDC00..0xDFFF.  */
#define UTF_16_LOW_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xDC00)

2846
2847 static int
2848 detect_coding_utf_16 (src, src_end, multibytep)
2849 unsigned char *src, *src_end;
2850 int multibytep;
2851 {
2852 unsigned char c1, c2;
2853 /* Dummy for TWO_MORE_BYTES. */
2854 struct coding_system dummy_coding;
2855 struct coding_system *coding = &dummy_coding;
2856
2857 ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep);
2858 ONE_MORE_BYTE_CHECK_MULTIBYTE (c2, multibytep);
2859
2860 if ((c1 == 0xFF) && (c2 == 0xFE))
2861 return CODING_CATEGORY_MASK_UTF_16_LE;
2862 else if ((c1 == 0xFE) && (c2 == 0xFF))
2863 return CODING_CATEGORY_MASK_UTF_16_BE;
2864
2865 label_end_of_loop:
2866 return 0;
2867 }
2868
2869 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
2870 If SJIS_P is 1, decode SJIS text, else decode BIG5 test. */
2871
2872 static void
2873 decode_coding_sjis_big5 (coding, source, destination,
2874 src_bytes, dst_bytes, sjis_p)
2875 struct coding_system *coding;
2876 unsigned char *source, *destination;
2877 int src_bytes, dst_bytes;
2878 int sjis_p;
2879 {
2880 unsigned char *src = source;
2881 unsigned char *src_end = source + src_bytes;
2882 unsigned char *dst = destination;
2883 unsigned char *dst_end = destination + dst_bytes;
2884 /* SRC_BASE remembers the start position in source in each loop.
2885 The loop will be exited when there's not enough source code
2886 (within macro ONE_MORE_BYTE), or when there's not enough
2887 destination area to produce a character (within macro
2888 EMIT_CHAR). */
2889 unsigned char *src_base;
2890 Lisp_Object translation_table;
2891
2892 if (NILP (Venable_character_translation))
2893 translation_table = Qnil;
2894 else
2895 {
2896 translation_table = coding->translation_table_for_decode;
2897 if (NILP (translation_table))
2898 translation_table = Vstandard_translation_table_for_decode;
2899 }
2900
2901 coding->produced_char = 0;
2902 while (1)
2903 {
2904 int c, charset, c1, c2;
2905
2906 src_base = src;
2907 ONE_MORE_BYTE (c1);
2908
2909 if (c1 < 0x80)
2910 {
2911 charset = CHARSET_ASCII;
2912 if (c1 < 0x20)
2913 {
2914 if (c1 == '\r')
2915 {
2916 if (coding->eol_type == CODING_EOL_CRLF)
2917 {
2918 ONE_MORE_BYTE (c2);
2919 if (c2 == '\n')
2920 c1 = c2;
2921 else
2922 /* To process C2 again, SRC is subtracted by 1. */
2923 src--;
2924 }
2925 else if (coding->eol_type == CODING_EOL_CR)
2926 c1 = '\n';
2927 }
2928 else if (c1 == '\n'
2929 && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2930 && (coding->eol_type == CODING_EOL_CR
2931 || coding->eol_type == CODING_EOL_CRLF))
2932 {
2933 coding->result = CODING_FINISH_INCONSISTENT_EOL;
2934 goto label_end_of_loop;
2935 }
2936 }
2937 }
2938 else
2939 {
2940 if (sjis_p)
2941 {
2942 if (c1 == 0x80 || c1 == 0xA0 || c1 > 0xEF)
2943 goto label_invalid_code;
2944 if (c1 <= 0x9F || c1 >= 0xE0)
2945 {
2946 /* SJIS -> JISX0208 */
2947 ONE_MORE_BYTE (c2);
2948 if (c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
2949 goto label_invalid_code;
2950 DECODE_SJIS (c1, c2, c1, c2);
2951 charset = charset_jisx0208;
2952 }
2953 else
2954 /* SJIS -> JISX0201-Kana */
2955 charset = charset_katakana_jisx0201;
2956 }
2957 else
2958 {
2959 /* BIG5 -> Big5 */
2960 if (c1 < 0xA0 || c1 > 0xFE)
2961 goto label_invalid_code;
2962 ONE_MORE_BYTE (c2);
2963 if (c2 < 0x40 || (c2 > 0x7E && c2 < 0xA1) || c2 > 0xFE)
2964 goto label_invalid_code;
2965 DECODE_BIG5 (c1, c2, charset, c1, c2);
2966 }
2967 }
2968
2969 c = DECODE_ISO_CHARACTER (charset, c1, c2);
2970 EMIT_CHAR (c);
2971 continue;
2972
2973 label_invalid_code:
2974 coding->errors++;
2975 src = src_base;
2976 c = *src++;
2977 EMIT_CHAR (c);
2978 }
2979
2980 label_end_of_loop:
2981 coding->consumed = coding->consumed_char = src_base - source;
2982 coding->produced = dst - destination;
2983 return;
2984 }
2985
2986 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
2987 This function can encode charsets `ascii', `katakana-jisx0201',
2988 `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'. We
2989 are sure that all these charsets are registered as official charset
2990 (i.e. do not have extended leading-codes). Characters of other
2991 charsets are produced without any encoding. If SJIS_P is 1, encode
2992 SJIS text, else encode BIG5 text. */
2993
2994 static void
2995 encode_coding_sjis_big5 (coding, source, destination,
2996 src_bytes, dst_bytes, sjis_p)
2997 struct coding_system *coding;
2998 unsigned char *source, *destination;
2999 int src_bytes, dst_bytes;
3000 int sjis_p;
3001 {
3002 unsigned char *src = source;
3003 unsigned char *src_end = source + src_bytes;
3004 unsigned char *dst = destination;
3005 unsigned char *dst_end = destination + dst_bytes;
3006 /* SRC_BASE remembers the start position in source in each loop.
3007 The loop will be exited when there's not enough source text to
3008 analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
3009 there's not enough destination area to produce encoded codes
3010 (within macro EMIT_BYTES). */
3011 unsigned char *src_base;
3012 Lisp_Object translation_table;
3013
3014 if (NILP (Venable_character_translation))
3015 translation_table = Qnil;
3016 else
3017 {
3018 translation_table = coding->translation_table_for_encode;
3019 if (NILP (translation_table))
3020 translation_table = Vstandard_translation_table_for_encode;
3021 }
3022
3023 while (1)
3024 {
3025 int c, charset, c1, c2;
3026
3027 src_base = src;
3028 ONE_MORE_CHAR (c);
3029
3030 /* Now encode the character C. */
3031 if (SINGLE_BYTE_CHAR_P (c))
3032 {
3033 switch (c)
3034 {
3035 case '\r':
3036 if (!(coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
3037 {
3038 EMIT_ONE_BYTE (c);
3039 break;
3040 }
3041 c = '\n';
3042 case '\n':
3043 if (coding->eol_type == CODING_EOL_CRLF)
3044 {
3045 EMIT_TWO_BYTES ('\r', c);
3046 break;
3047 }
3048 else if (coding->eol_type == CODING_EOL_CR)
3049 c = '\r';
3050 default:
3051 EMIT_ONE_BYTE (c);
3052 }
3053 }
3054 else
3055 {
3056 SPLIT_CHAR (c, charset, c1, c2);
3057 if (sjis_p)
3058 {
3059 if (charset == charset_jisx0208
3060 || charset == charset_jisx0208_1978)
3061 {
3062 ENCODE_SJIS (c1, c2, c1, c2);
3063 EMIT_TWO_BYTES (c1, c2);
3064 }
3065 else if (charset == charset_katakana_jisx0201)
3066 EMIT_ONE_BYTE (c1 | 0x80);
3067 else if (charset == charset_latin_jisx0201)
3068 EMIT_ONE_BYTE (c1);
3069 else
3070 /* There's no way other than producing the internal
3071 codes as is. */
3072 EMIT_BYTES (src_base, src);
3073 }
3074 else
3075 {
3076 if (charset == charset_big5_1 || charset == charset_big5_2)
3077 {
3078 ENCODE_BIG5 (charset, c1, c2, c1, c2);
3079 EMIT_TWO_BYTES (c1, c2);
3080 }
3081 else
3082 /* There's no way other than producing the internal
3083 codes as is. */
3084 EMIT_BYTES (src_base, src);
3085 }
3086 }
3087 coding->consumed_char++;
3088 }
3089
3090 label_end_of_loop:
3091 coding->consumed = src_base - source;
3092 coding->produced = coding->produced_char = dst - destination;
3093 }
3094
3095 \f
3096 /*** 5. CCL handlers ***/
3097
3098 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3099 Check if a text is encoded in a coding system of which
3100 encoder/decoder are written in CCL program. If it is, return
3101 CODING_CATEGORY_MASK_CCL, else return 0. */
3102
3103 static int
3104 detect_coding_ccl (src, src_end, multibytep)
3105 unsigned char *src, *src_end;
3106 int multibytep;
3107 {
3108 unsigned char *valid;
3109 int c;
3110 /* Dummy for ONE_MORE_BYTE. */
3111 struct coding_system dummy_coding;
3112 struct coding_system *coding = &dummy_coding;
3113
3114 /* No coding system is assigned to coding-category-ccl. */
3115 if (!coding_system_table[CODING_CATEGORY_IDX_CCL])
3116 return 0;
3117
3118 valid = coding_system_table[CODING_CATEGORY_IDX_CCL]->spec.ccl.valid_codes;
3119 while (1)
3120 {
3121 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
3122 if (! valid[c])
3123 return 0;
3124 }
3125 label_end_of_loop:
3126 return CODING_CATEGORY_MASK_CCL;
3127 }
3128
3129 \f
3130 /*** 6. End-of-line handlers ***/
3131
3132 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
3133
3134 static void
3135 decode_eol (coding, source, destination, src_bytes, dst_bytes)
3136 struct coding_system *coding;
3137 unsigned char *source, *destination;
3138 int src_bytes, dst_bytes;
3139 {
3140 unsigned char *src = source;
3141 unsigned char *dst = destination;
3142 unsigned char *src_end = src + src_bytes;
3143 unsigned char *dst_end = dst + dst_bytes;
3144 Lisp_Object translation_table;
3145 /* SRC_BASE remembers the start position in source in each loop.
3146 The loop will be exited when there's not enough source code
3147 (within macro ONE_MORE_BYTE), or when there's not enough
3148 destination area to produce a character (within macro
3149 EMIT_CHAR). */
3150 unsigned char *src_base;
3151 int c;
3152
3153 translation_table = Qnil;
3154 switch (coding->eol_type)
3155 {
3156 case CODING_EOL_CRLF:
3157 while (1)
3158 {
3159 src_base = src;
3160 ONE_MORE_BYTE (c);
3161 if (c == '\r')
3162 {
3163 ONE_MORE_BYTE (c);
3164 if (c != '\n')
3165 {
3166 src--;
3167 c = '\r';
3168 }
3169 }
3170 else if (c == '\n'
3171 && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL))
3172 {
3173 coding->result = CODING_FINISH_INCONSISTENT_EOL;
3174 goto label_end_of_loop;
3175 }
3176 EMIT_CHAR (c);
3177 }
3178 break;
3179
3180 case CODING_EOL_CR:
3181 while (1)
3182 {
3183 src_base = src;
3184 ONE_MORE_BYTE (c);
3185 if (c == '\n')
3186 {
3187 if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
3188 {
3189 coding->result = CODING_FINISH_INCONSISTENT_EOL;
3190 goto label_end_of_loop;
3191 }
3192 }
3193 else if (c == '\r')
3194 c = '\n';
3195 EMIT_CHAR (c);
3196 }
3197 break;
3198
3199 default: /* no need for EOL handling */
3200 while (1)
3201 {
3202 src_base = src;
3203 ONE_MORE_BYTE (c);
3204 EMIT_CHAR (c);
3205 }
3206 }
3207
3208 label_end_of_loop:
3209 coding->consumed = coding->consumed_char = src_base - source;
3210 coding->produced = dst - destination;
3211 return;
3212 }
3213
3214 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions". Encode
3215 format of end-of-line according to `coding->eol_type'. It also
3216 convert multibyte form 8-bit characters to unibyte if
3217 CODING->src_multibyte is nonzero. If `coding->mode &
3218 CODING_MODE_SELECTIVE_DISPLAY' is nonzero, code '\r' in source text
3219 also means end-of-line. */
3220
3221 static void
3222 encode_eol (coding, source, destination, src_bytes, dst_bytes)
3223 struct coding_system *coding;
3224 const unsigned char *source;
3225 unsigned char *destination;
3226 int src_bytes, dst_bytes;
3227 {
3228 const unsigned char *src = source;
3229 unsigned char *dst = destination;
3230 const unsigned char *src_end = src + src_bytes;
3231 unsigned char *dst_end = dst + dst_bytes;
3232 Lisp_Object translation_table;
3233 /* SRC_BASE remembers the start position in source in each loop.
3234 The loop will be exited when there's not enough source text to
3235 analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
3236 there's not enough destination area to produce encoded codes
3237 (within macro EMIT_BYTES). */
3238 const unsigned char *src_base;
3239 unsigned char *tmp;
3240 int c;
3241 int selective_display = coding->mode & CODING_MODE_SELECTIVE_DISPLAY;
3242
3243 translation_table = Qnil;
3244 if (coding->src_multibyte
3245 && *(src_end - 1) == LEADING_CODE_8_BIT_CONTROL)
3246 {
3247 src_end--;
3248 src_bytes--;
3249 coding->result = CODING_FINISH_INSUFFICIENT_SRC;
3250 }
3251
3252 if (coding->eol_type == CODING_EOL_CRLF)
3253 {
3254 while (src < src_end)
3255 {
3256 src_base = src;
3257 c = *src++;
3258 if (c >= 0x20)
3259 EMIT_ONE_BYTE (c);
3260 else if (c == '\n' || (c == '\r' && selective_display))
3261 EMIT_TWO_BYTES ('\r', '\n');
3262 else
3263 EMIT_ONE_BYTE (c);
3264 }
3265 src_base = src;
3266 label_end_of_loop:
3267 ;
3268 }
3269 else
3270 {
3271 if (!dst_bytes || src_bytes <= dst_bytes)
3272 {
3273 safe_bcopy (src, dst, src_bytes);
3274 src_base = src_end;
3275 dst += src_bytes;
3276 }
3277 else
3278 {
3279 if (coding->src_multibyte
3280 && *(src + dst_bytes - 1) == LEADING_CODE_8_BIT_CONTROL)
3281 dst_bytes--;
3282 safe_bcopy (src, dst, dst_bytes);
3283 src_base = src + dst_bytes;
3284 dst = destination + dst_bytes;
3285 coding->result = CODING_FINISH_INSUFFICIENT_DST;
3286 }
3287 if (coding->eol_type == CODING_EOL_CR)
3288 {
3289 for (tmp = destination; tmp < dst; tmp++)
3290 if (*tmp == '\n') *tmp = '\r';
3291 }
3292 else if (selective_display)
3293 {
3294 for (tmp = destination; tmp < dst; tmp++)
3295 if (*tmp == '\r') *tmp = '\n';
3296 }
3297 }
3298 if (coding->src_multibyte)
3299 dst = destination + str_as_unibyte (destination, dst - destination);
3300
3301 coding->consumed = src_base - source;
3302 coding->produced = dst - destination;
3303 coding->produced_char = coding->produced;
3304 }
3305
3306 \f
3307 /*** 7. C library functions ***/
3308
3309 /* In Emacs Lisp, a coding system is represented by a Lisp symbol which
3310 has a property `coding-system'. The value of this property is a
3311 vector of length 5 (called the coding-vector). Among elements of
3312 this vector, the first (element[0]) and the fifth (element[4])
3313 carry important information for decoding/encoding. Before
3314 decoding/encoding, this information should be set in fields of a
3315 structure of type `coding_system'.
3316
3317 The value of the property `coding-system' can be a symbol of another
3318 subsidiary coding-system. In that case, Emacs gets coding-vector
3319 from that symbol.
3320
3321 `element[0]' contains information to be set in `coding->type'. The
3322 value and its meaning is as follows:
3323
   0 -- coding_type_emacs_mule
   1 -- coding_type_sjis
   2 -- coding_type_iso2022
   3 -- coding_type_big5
   4 -- coding_type_ccl	encoder/decoder written in CCL
   5 -- coding_type_raw_text
   nil -- coding_type_no_conversion
   t -- coding_type_undecided (automatic conversion on decoding,
			       no-conversion on encoding)
3332
3333 `element[4]' contains information to be set in `coding->flags' and
3334 `coding->spec'. The meaning varies by `coding->type'.
3335
   If `coding->type' is `coding_type_iso2022', element[4] is a vector
   of length 32 (of which the first 17 sub-elements are used now).
   Meanings of these sub-elements are:
3339
   sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
	If the value is an integer of valid charset, the charset is
	assumed to be designated to graphic register N initially.

	If the value is t, nothing is designated to graphic register
	N initially, but the register can be used by any charset.

	If the value is a list, the first element (if it is a
	charset) is designated to graphic register N initially, and
	the remaining charsets are designated to graphic register N
	on request, i.e. just before encoding a character in that
	charset.  An element t in the list means that the register
	can be used by any charset.

	If the value is nil, graphic register N is never used on
	encoding.
3351
   sub-element[N] where N is 4 through 16: to be set in `coding->flags'
	Each value takes t or nil.  See the section ISO2022 of
	`coding.h' for more information.
3355
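   If `coding->type' is `coding_type_big5', element[4] is t to denote
   BIG5-ETen or nil to denote BIG5-HKU.

   If `coding->type' is `coding_type_ccl', element[4] is a cons whose
   car is set up as the CCL decoder and whose cdr is set up as the
   CCL encoder.

   If `coding->type' takes the other value, element[4] is ignored.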
3360
3361 Emacs Lisp's coding systems also carry information about format of
3362 end-of-line in a value of property `eol-type'. If the value is
3363 integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
3364 means CODING_EOL_CR. If it is not integer, it should be a vector
3365 of subsidiary coding systems of which property `eol-type' has one
3366 of the above values.
3367
3368 */
3369
3370 /* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
3371 and set it in CODING. If CODING_SYSTEM_SYMBOL is invalid, CODING
3372 is setup so that no conversion is necessary and return -1, else
3373 return 0. */
3374
3375 int
3376 setup_coding_system (coding_system, coding)
3377 Lisp_Object coding_system;
3378 struct coding_system *coding;
3379 {
3380 Lisp_Object coding_spec, coding_type, eol_type, plist;
3381 Lisp_Object val;
3382
3383 /* At first, zero clear all members. */
3384 bzero (coding, sizeof (struct coding_system));
3385
3386 /* Initialize some fields required for all kinds of coding systems. */
3387 coding->symbol = coding_system;
3388 coding->heading_ascii = -1;
3389 coding->post_read_conversion = coding->pre_write_conversion = Qnil;
3390 coding->composing = COMPOSITION_DISABLED;
3391 coding->cmp_data = NULL;
3392
3393 if (NILP (coding_system))
3394 goto label_invalid_coding_system;
3395
3396 coding_spec = Fget (coding_system, Qcoding_system);
3397
3398 if (!VECTORP (coding_spec)
3399 || XVECTOR (coding_spec)->size != 5
3400 || !CONSP (XVECTOR (coding_spec)->contents[3]))
3401 goto label_invalid_coding_system;
3402
3403 eol_type = inhibit_eol_conversion ? Qnil : Fget (coding_system, Qeol_type);
3404 if (VECTORP (eol_type))
3405 {
3406 coding->eol_type = CODING_EOL_UNDECIDED;
3407 coding->common_flags = CODING_REQUIRE_DETECTION_MASK;
3408 }
3409 else if (XFASTINT (eol_type) == 1)
3410 {
3411 coding->eol_type = CODING_EOL_CRLF;
3412 coding->common_flags
3413 = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3414 }
3415 else if (XFASTINT (eol_type) == 2)
3416 {
3417 coding->eol_type = CODING_EOL_CR;
3418 coding->common_flags
3419 = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3420 }
3421 else
3422 coding->eol_type = CODING_EOL_LF;
3423
3424 coding_type = XVECTOR (coding_spec)->contents[0];
3425 /* Try short cut. */
3426 if (SYMBOLP (coding_type))
3427 {
3428 if (EQ (coding_type, Qt))
3429 {
3430 coding->type = coding_type_undecided;
3431 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
3432 }
3433 else
3434 coding->type = coding_type_no_conversion;
3435 /* Initialize this member. Any thing other than
3436 CODING_CATEGORY_IDX_UTF_16_BE and
3437 CODING_CATEGORY_IDX_UTF_16_LE are ok because they have
3438 special treatment in detect_eol. */
3439 coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE;
3440
3441 return 0;
3442 }
3443
3444 /* Get values of coding system properties:
3445 `post-read-conversion', `pre-write-conversion',
3446 `translation-table-for-decode', `translation-table-for-encode'. */
3447 plist = XVECTOR (coding_spec)->contents[3];
3448 /* Pre & post conversion functions should be disabled if
3449 inhibit_eol_conversion is nonzero. This is the case that a code
3450 conversion function is called while those functions are running. */
3451 if (! inhibit_pre_post_conversion)
3452 {
3453 coding->post_read_conversion = Fplist_get (plist, Qpost_read_conversion);
3454 coding->pre_write_conversion = Fplist_get (plist, Qpre_write_conversion);
3455 }
3456 val = Fplist_get (plist, Qtranslation_table_for_decode);
3457 if (SYMBOLP (val))
3458 val = Fget (val, Qtranslation_table_for_decode);
3459 coding->translation_table_for_decode = CHAR_TABLE_P (val) ? val : Qnil;
3460 val = Fplist_get (plist, Qtranslation_table_for_encode);
3461 if (SYMBOLP (val))
3462 val = Fget (val, Qtranslation_table_for_encode);
3463 coding->translation_table_for_encode = CHAR_TABLE_P (val) ? val : Qnil;
3464 val = Fplist_get (plist, Qcoding_category);
3465 if (!NILP (val))
3466 {
3467 val = Fget (val, Qcoding_category_index);
3468 if (INTEGERP (val))
3469 coding->category_idx = XINT (val);
3470 else
3471 goto label_invalid_coding_system;
3472 }
3473 else
3474 goto label_invalid_coding_system;
3475
3476 /* If the coding system has non-nil `composition' property, enable
3477 composition handling. */
3478 val = Fplist_get (plist, Qcomposition);
3479 if (!NILP (val))
3480 coding->composing = COMPOSITION_NO;
3481
3482 switch (XFASTINT (coding_type))
3483 {
3484 case 0:
3485 coding->type = coding_type_emacs_mule;
3486 coding->common_flags
3487 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3488 if (!NILP (coding->post_read_conversion))
3489 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
3490 if (!NILP (coding->pre_write_conversion))
3491 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
3492 break;
3493
3494 case 1:
3495 coding->type = coding_type_sjis;
3496 coding->common_flags
3497 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3498 break;
3499
3500 case 2:
3501 coding->type = coding_type_iso2022;
3502 coding->common_flags
3503 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3504 {
3505 Lisp_Object val, temp;
3506 Lisp_Object *flags;
3507 int i, charset, reg_bits = 0;
3508
3509 val = XVECTOR (coding_spec)->contents[4];
3510
3511 if (!VECTORP (val) || XVECTOR (val)->size != 32)
3512 goto label_invalid_coding_system;
3513
3514 flags = XVECTOR (val)->contents;
3515 coding->flags
3516 = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
3517 | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
3518 | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
3519 | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
3520 | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
3521 | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
3522 | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
3523 | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
3524 | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION)
3525 | (NILP (flags[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL)
3526 | (NILP (flags[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL)
3527 | (NILP (flags[15]) ? 0 : CODING_FLAG_ISO_SAFE)
3528 | (NILP (flags[16]) ? 0 : CODING_FLAG_ISO_LATIN_EXTRA)
3529 );
3530
3531 /* Invoke graphic register 0 to plane 0. */
3532 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
3533 /* Invoke graphic register 1 to plane 1 if we can use full 8-bit. */
3534 CODING_SPEC_ISO_INVOCATION (coding, 1)
3535 = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
3536 /* Not single shifting at first. */
3537 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;
3538 /* Beginning of buffer should also be regarded as bol. */
3539 CODING_SPEC_ISO_BOL (coding) = 1;
3540
3541 for (charset = 0; charset <= MAX_CHARSET; charset++)
3542 CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = 255;
3543 val = Vcharset_revision_alist;
3544 while (CONSP (val))
3545 {
3546 charset = get_charset_id (Fcar_safe (XCAR (val)));
3547 if (charset >= 0
3548 && (temp = Fcdr_safe (XCAR (val)), INTEGERP (temp))
3549 && (i = XINT (temp), (i >= 0 && (i + '@') < 128)))
3550 CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = i;
3551 val = XCDR (val);
3552 }
3553
3554 /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
3555 FLAGS[REG] can be one of below:
3556 integer CHARSET: CHARSET occupies register I,
3557 t: designate nothing to REG initially, but can be used
3558 by any charsets,
3559 list of integer, nil, or t: designate the first
3560 element (if integer) to REG initially, the remaining
3561 elements (if integer) is designated to REG on request,
3562 if an element is t, REG can be used by any charsets,
3563 nil: REG is never used. */
3564 for (charset = 0; charset <= MAX_CHARSET; charset++)
3565 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3566 = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION;
3567 for (i = 0; i < 4; i++)
3568 {
3569 if ((INTEGERP (flags[i])
3570 && (charset = XINT (flags[i]), CHARSET_VALID_P (charset)))
3571 || (charset = get_charset_id (flags[i])) >= 0)
3572 {
3573 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3574 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
3575 }
3576 else if (EQ (flags[i], Qt))
3577 {
3578 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3579 reg_bits |= 1 << i;
3580 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
3581 }
3582 else if (CONSP (flags[i]))
3583 {
3584 Lisp_Object tail;
3585 tail = flags[i];
3586
3587 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
3588 if ((INTEGERP (XCAR (tail))
3589 && (charset = XINT (XCAR (tail)),
3590 CHARSET_VALID_P (charset)))
3591 || (charset = get_charset_id (XCAR (tail))) >= 0)
3592 {
3593 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3594 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
3595 }
3596 else
3597 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3598 tail = XCDR (tail);
3599 while (CONSP (tail))
3600 {
3601 if ((INTEGERP (XCAR (tail))
3602 && (charset = XINT (XCAR (tail)),
3603 CHARSET_VALID_P (charset)))
3604 || (charset = get_charset_id (XCAR (tail))) >= 0)
3605 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3606 = i;
3607 else if (EQ (XCAR (tail), Qt))
3608 reg_bits |= 1 << i;
3609 tail = XCDR (tail);
3610 }
3611 }
3612 else
3613 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3614
3615 CODING_SPEC_ISO_DESIGNATION (coding, i)
3616 = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
3617 }
3618
3619 if (reg_bits && ! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
3620 {
3621 /* REG 1 can be used only by locking shift in 7-bit env. */
3622 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
3623 reg_bits &= ~2;
3624 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
3625 /* Without any shifting, only REG 0 and 1 can be used. */
3626 reg_bits &= 3;
3627 }
3628
3629 if (reg_bits)
3630 for (charset = 0; charset <= MAX_CHARSET; charset++)
3631 {
3632 if (CHARSET_DEFINED_P (charset)
3633 && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3634 == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
3635 {
3636 /* There exist some default graphic registers to be
3637 used by CHARSET. */
3638
3639 /* We had better avoid designating a charset of
3640 CHARS96 to REG 0 as far as possible. */
3641 if (CHARSET_CHARS (charset) == 96)
3642 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3643 = (reg_bits & 2
3644 ? 1 : (reg_bits & 4 ? 2 : (reg_bits & 8 ? 3 : 0)));
3645 else
3646 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3647 = (reg_bits & 1
3648 ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
3649 }
3650 }
3651 }
3652 coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
3653 coding->spec.iso2022.last_invalid_designation_register = -1;
3654 break;
3655
3656 case 3:
3657 coding->type = coding_type_big5;
3658 coding->common_flags
3659 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3660 coding->flags
3661 = (NILP (XVECTOR (coding_spec)->contents[4])
3662 ? CODING_FLAG_BIG5_HKU
3663 : CODING_FLAG_BIG5_ETEN);
3664 break;
3665
3666 case 4:
3667 coding->type = coding_type_ccl;
3668 coding->common_flags
3669 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3670 {
3671 val = XVECTOR (coding_spec)->contents[4];
3672 if (! CONSP (val)
3673 || setup_ccl_program (&(coding->spec.ccl.decoder),
3674 XCAR (val)) < 0
3675 || setup_ccl_program (&(coding->spec.ccl.encoder),
3676 XCDR (val)) < 0)
3677 goto label_invalid_coding_system;
3678
3679 bzero (coding->spec.ccl.valid_codes, 256);
3680 val = Fplist_get (plist, Qvalid_codes);
3681 if (CONSP (val))
3682 {
3683 Lisp_Object this;
3684
3685 for (; CONSP (val); val = XCDR (val))
3686 {
3687 this = XCAR (val);
3688 if (INTEGERP (this)
3689 && XINT (this) >= 0 && XINT (this) < 256)
3690 coding->spec.ccl.valid_codes[XINT (this)] = 1;
3691 else if (CONSP (this)
3692 && INTEGERP (XCAR (this))
3693 && INTEGERP (XCDR (this)))
3694 {
3695 int start = XINT (XCAR (this));
3696 int end = XINT (XCDR (this));
3697
3698 if (start >= 0 && start <= end && end < 256)
3699 while (start <= end)
3700 coding->spec.ccl.valid_codes[start++] = 1;
3701 }
3702 }
3703 }
3704 }
3705 coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
3706 coding->spec.ccl.cr_carryover = 0;
3707 coding->spec.ccl.eight_bit_carryover[0] = 0;
3708 break;
3709
3710 case 5:
3711 coding->type = coding_type_raw_text;
3712 break;
3713
3714 default:
3715 goto label_invalid_coding_system;
3716 }
3717 return 0;
3718
3719 label_invalid_coding_system:
3720 coding->type = coding_type_no_conversion;
3721 coding->category_idx = CODING_CATEGORY_IDX_BINARY;
3722 coding->common_flags = 0;
3723 coding->eol_type = CODING_EOL_LF;
3724 coding->pre_write_conversion = coding->post_read_conversion = Qnil;
3725 return -1;
3726 }
3727
3728 /* Free memory blocks allocated for storing composition information. */
3729
3730 void
3731 coding_free_composition_data (coding)
3732 struct coding_system *coding;
3733 {
3734 struct composition_data *cmp_data = coding->cmp_data, *next;
3735
3736 if (!cmp_data)
3737 return;
3738 /* Memory blocks are chained. At first, rewind to the first, then,
3739 free blocks one by one. */
3740 while (cmp_data->prev)
3741 cmp_data = cmp_data->prev;
3742 while (cmp_data)
3743 {
3744 next = cmp_data->next;
3745 xfree (cmp_data);
3746 cmp_data = next;
3747 }
3748 coding->cmp_data = NULL;
3749 }
3750
3751 /* Set `char_offset' member of all memory blocks pointed by
3752 coding->cmp_data to POS. */
3753
3754 void
3755 coding_adjust_composition_offset (coding, pos)
3756 struct coding_system *coding;
3757 int pos;
3758 {
3759 struct composition_data *cmp_data;
3760
3761 for (cmp_data = coding->cmp_data; cmp_data; cmp_data = cmp_data->next)
3762 cmp_data->char_offset = pos;
3763 }
3764
3765 /* Setup raw-text or one of its subsidiaries in the structure
3766 coding_system CODING according to the already setup value eol_type
3767 in CODING. CODING should be setup for some coding system in
3768 advance. */
3769
3770 void
3771 setup_raw_text_coding_system (coding)
3772 struct coding_system *coding;
3773 {
3774 if (coding->type != coding_type_raw_text)
3775 {
3776 coding->symbol = Qraw_text;
3777 coding->type = coding_type_raw_text;
3778 if (coding->eol_type != CODING_EOL_UNDECIDED)
3779 {
3780 Lisp_Object subsidiaries;
3781 subsidiaries = Fget (Qraw_text, Qeol_type);
3782
3783 if (VECTORP (subsidiaries)
3784 && XVECTOR (subsidiaries)->size == 3)
3785 coding->symbol
3786 = XVECTOR (subsidiaries)->contents[coding->eol_type];
3787 }
3788 setup_coding_system (coding->symbol, coding);
3789 }
3790 return;
3791 }
3792
/* Emacs has a mechanism to automatically detect a coding system if it
   is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
   it's impossible to distinguish some coding systems accurately
   because they use the same range of codes.  So, at first, coding
   systems are categorized into the following 14 categories:
3798
3799 o coding-category-emacs-mule
3800
3801 The category for a coding system which has the same code range
3802 as Emacs' internal format. Assigned the coding-system (Lisp
3803 symbol) `emacs-mule' by default.
3804
3805 o coding-category-sjis
3806
3807 The category for a coding system which has the same code range
3808 as SJIS. Assigned the coding-system (Lisp
3809 symbol) `japanese-shift-jis' by default.
3810
3811 o coding-category-iso-7
3812
3813 The category for a coding system which has the same code range
3814 as ISO2022 of 7-bit environment. This doesn't use any locking
3815 shift and single shift functions. This can encode/decode all
3816 charsets. Assigned the coding-system (Lisp symbol)
3817 `iso-2022-7bit' by default.
3818
3819 o coding-category-iso-7-tight
3820
3821 Same as coding-category-iso-7 except that this can
3822 encode/decode only the specified charsets.
3823
3824 o coding-category-iso-8-1
3825
3826 The category for a coding system which has the same code range
3827 as ISO2022 of 8-bit environment and graphic plane 1 used only
3828 for DIMENSION1 charset. This doesn't use any locking shift
3829 and single shift functions. Assigned the coding-system (Lisp
3830 symbol) `iso-latin-1' by default.
3831
3832 o coding-category-iso-8-2
3833
3834 The category for a coding system which has the same code range
3835 as ISO2022 of 8-bit environment and graphic plane 1 used only
3836 for DIMENSION2 charset. This doesn't use any locking shift
3837 and single shift functions. Assigned the coding-system (Lisp
3838 symbol) `japanese-iso-8bit' by default.
3839
3840 o coding-category-iso-7-else
3841
3842 The category for a coding system which has the same code range
3843 as ISO2022 of 7-bit environment but uses locking shift or
3844 single shift functions. Assigned the coding-system (Lisp
3845 symbol) `iso-2022-7bit-lock' by default.
3846
3847 o coding-category-iso-8-else
3848
3849 The category for a coding system which has the same code range
3850 as ISO2022 of 8-bit environment but uses locking shift or
3851 single shift functions. Assigned the coding-system (Lisp
3852 symbol) `iso-2022-8bit-ss2' by default.
3853
3854 o coding-category-big5
3855
3856 The category for a coding system which has the same code range
3857 as BIG5. Assigned the coding-system (Lisp symbol)
3858 `cn-big5' by default.
3859
3860 o coding-category-utf-8
3861
3862 The category for a coding system which has the same code range
3863 as UTF-8 (cf. RFC2279). Assigned the coding-system (Lisp
3864 symbol) `utf-8' by default.
3865
3866 o coding-category-utf-16-be
3867
3868 The category for a coding system in which a text has an
3869 Unicode signature (cf. Unicode Standard) in the order of BIG
3870 endian at the head. Assigned the coding-system (Lisp symbol)
3871 `utf-16-be' by default.
3872
3873 o coding-category-utf-16-le
3874
3875 The category for a coding system in which a text has an
3876 Unicode signature (cf. Unicode Standard) in the order of
3877 LITTLE endian at the head. Assigned the coding-system (Lisp
3878 symbol) `utf-16-le' by default.
3879
3880 o coding-category-ccl
3881
3882 The category for a coding system of which encoder/decoder is
3883 written in CCL programs. The default value is nil, i.e., no
3884 coding system is assigned.
3885
3886 o coding-category-binary
3887
3888 The category for a coding system not categorized in any of the
3889 above. Assigned the coding-system (Lisp symbol)
3890 `no-conversion' by default.
3891
3892 Each of them is a Lisp symbol and the value is an actual
3893 `coding-system' (this is also a Lisp symbol) assigned by a user.
3894 What Emacs does actually is to detect a category of coding system.
3895 Then, it uses a `coding-system' assigned to it. If Emacs can't
3896 decide a single possible category, it selects a category of the
3897 highest priority. Priorities of categories are also specified by a
3898 user in a Lisp variable `coding-category-list'.
3899
3900 */
3901
/* Table indexed by a byte value and consulted by detect_coding_mask:
   a byte whose entry is nonzero is skipped while looking for the
   first byte worth examining.  */
static int ascii_skip_code[256];
3904
3905 /* Detect how a text of length SRC_BYTES pointed to by SOURCE is encoded.
3906 Return an integer in which the flag bits of the possible coding
3907 categories are set, or 0 if every byte was skipped through
3908 `ascii_skip_code'. Flag bits are defined by macros
3909 CODING_CATEGORY_MASK_XXX in `coding.h'. If PRIORITIES is non-NULL,
3910 it should point to the table `coding_priorities'; in that case only the
3911 flag bit of the first matching entry of PRIORITIES is returned, or
3912 CODING_CATEGORY_MASK_RAW_TEXT if no entry matches. If MULTIBYTEP is nonzero, 8-bit codes of the range 0x80..0x9F are in multibyte form.
3913
3914 The number of bytes skipped at the head is returned as *SKIP. */
3915
3916 static int
3917 detect_coding_mask (source, src_bytes, priorities, skip, multibytep)
3918 unsigned char *source;
3919 int src_bytes, *priorities, *skip;
3920 int multibytep;
3921 {
3922 register unsigned char c;
3923 unsigned char *src = source, *src_end = source + src_bytes;
3924 unsigned int mask, utf16_examined_p, iso2022_examined_p;
3925 int i;
3926
3927 /* At first, skip all ASCII characters and control characters except
3928 for three ISO2022 specific control characters. Their table entries
   are cleared here so that the skipping loop below stops at them. */
3929 ascii_skip_code[ISO_CODE_SO] = 0;
3930 ascii_skip_code[ISO_CODE_SI] = 0;
3931 ascii_skip_code[ISO_CODE_ESC] = 0;
3932
3933 label_loop_detect_coding:
3934 while (src < src_end && ascii_skip_code[*src]) src++;
3935 *skip = src - source;
3936
3937 if (src >= src_end)
3938 /* We found nothing other than ASCII. There's nothing to do. */
3939 return 0;
3940
3941 c = *src;
3942 /* The text seems to be encoded in some multilingual coding system.
3943 Now, try to find in which coding system the text is encoded. */
3944 if (c < 0x80)
3945 {
3946 /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
3947 /* C is an ISO2022 specific control code of C0. */
3948 mask = detect_coding_iso2022 (src, src_end, multibytep);
3949 if (mask == 0)
3950 {
3951 /* No valid ISO2022 code follows C. Try again, this time letting
   the skipping loop pass over this kind of control code. */
3952 src++;
3953 if (c == ISO_CODE_ESC)
3954 ascii_skip_code[ISO_CODE_ESC] = 1;
3955 else
3956 ascii_skip_code[ISO_CODE_SO] = ascii_skip_code[ISO_CODE_SI] = 1;
3957 goto label_loop_detect_coding;
3958 }
3959 if (priorities)
3960 {
   /* Return the first entry of PRIORITIES that shares a bit with MASK. */
3961 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3962 {
3963 if (mask & priorities[i])
3964 return priorities[i];
3965 }
3966 return CODING_CATEGORY_MASK_RAW_TEXT;
3967 }
3968 }
3969 else
3970 {
3971 int try; /* Mask of the categories worth examining for C. */
3972
   /* NOTE(review): src[1] is read without comparing SRC + 1 against
      SRC_END -- confirm that a multibyte source always has a byte
      following LEADING_CODE_8_BIT_CONTROL. */
3973 if (multibytep && c == LEADING_CODE_8_BIT_CONTROL)
3974 c = src[1] - 0x20;
3975
3976 if (c < 0xA0)
3977 {
3978 /* C is the first byte of SJIS character code,
3979 or a leading-code of Emacs' internal format (emacs-mule),
3980 or the first byte of UTF-16. */
3981 try = (CODING_CATEGORY_MASK_SJIS
3982 | CODING_CATEGORY_MASK_EMACS_MULE
3983 | CODING_CATEGORY_MASK_UTF_16_BE
3984 | CODING_CATEGORY_MASK_UTF_16_LE);
3985
3986 /* Or, if C is a special latin extra code,
3987 or is an ISO2022 specific control code of C1 (SS2 or SS3),
3988 or is an ISO2022 control-sequence-introducer (CSI),
3989 we should also consider the possibility of ISO2022 codings.
   NOTE(review): SRC still points at the byte C was read from, so
   *SRC is >= 0x80 here and the comparisons of *SRC against ASCII
   characters in the CSI case look unsatisfiable -- confirm whether
   the bytes after C were meant to be examined. */
3990 if ((VECTORP (Vlatin_extra_code_table)
3991 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
3992 || (c == ISO_CODE_SS2 || c == ISO_CODE_SS3)
3993 || (c == ISO_CODE_CSI
3994 && (src < src_end
3995 && (*src == ']'
3996 || ((*src == '0' || *src == '1' || *src == '2')
3997 && src + 1 < src_end
3998 && src[1] == ']')))))
3999 try |= (CODING_CATEGORY_MASK_ISO_8_ELSE
4000 | CODING_CATEGORY_MASK_ISO_8BIT);
4001 }
4002 else
4003 /* C is a character of ISO2022 in graphic plane right,
4004 or a SJIS's 1-byte character code (i.e. JISX0201),
4005 or the first byte of BIG5's 2-byte code,
4006 or the first byte of UTF-8/16. */
4007 try = (CODING_CATEGORY_MASK_ISO_8_ELSE
4008 | CODING_CATEGORY_MASK_ISO_8BIT
4009 | CODING_CATEGORY_MASK_SJIS
4010 | CODING_CATEGORY_MASK_BIG5
4011 | CODING_CATEGORY_MASK_UTF_8
4012 | CODING_CATEGORY_MASK_UTF_16_BE
4013 | CODING_CATEGORY_MASK_UTF_16_LE);
4014
4015 /* Or, we may have to consider the possibility of CCL. */
4016 if (coding_system_table[CODING_CATEGORY_IDX_CCL]
4017 && (coding_system_table[CODING_CATEGORY_IDX_CCL]
4018 ->spec.ccl.valid_codes)[c])
4019 try |= CODING_CATEGORY_MASK_CCL;
4020
4021 mask = 0;
4022 utf16_examined_p = iso2022_examined_p = 0;
4023 if (priorities)
4024 {
   /* Walk PRIORITIES in order.  Because the tests below form one
      else-if chain, at most one detector runs per entry; the ISO2022
      and UTF-16 detectors run at most once in total.  Stop at the
      first entry that shares a bit with the accumulated MASK. */
4025 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
4026 {
4027 if (!iso2022_examined_p
4028 && (priorities[i] & try & CODING_CATEGORY_MASK_ISO))
4029 {
4030 mask |= detect_coding_iso2022 (src, src_end, multibytep);
4031 iso2022_examined_p = 1;
4032 }
4033 else if (priorities[i] & try & CODING_CATEGORY_MASK_SJIS)
4034 mask |= detect_coding_sjis (src, src_end, multibytep);
4035 else if (priorities[i] & try & CODING_CATEGORY_MASK_UTF_8)
4036 mask |= detect_coding_utf_8 (src, src_end, multibytep);
4037 else if (!utf16_examined_p
4038 && (priorities[i] & try &
4039 CODING_CATEGORY_MASK_UTF_16_BE_LE))
4040 {
4041 mask |= detect_coding_utf_16 (src, src_end, multibytep);
4042 utf16_examined_p = 1;
4043 }
4044 else if (priorities[i] & try & CODING_CATEGORY_MASK_BIG5)
4045 mask |= detect_coding_big5 (src, src_end, multibytep);
4046 else if (priorities[i] & try & CODING_CATEGORY_MASK_EMACS_MULE)
4047 mask |= detect_coding_emacs_mule (src, src_end, multibytep);
4048 else if (priorities[i] & try & CODING_CATEGORY_MASK_CCL)
4049 mask |= detect_coding_ccl (src, src_end, multibytep);
4050 else if (priorities[i] & CODING_CATEGORY_MASK_RAW_TEXT) /* Not filtered by TRY. */
4051 mask |= CODING_CATEGORY_MASK_RAW_TEXT;
4052 else if (priorities[i] & CODING_CATEGORY_MASK_BINARY) /* Not filtered by TRY. */
4053 mask |= CODING_CATEGORY_MASK_BINARY;
4054 if (mask & priorities[i])
4055 return priorities[i];
4056 }
4057 return CODING_CATEGORY_MASK_RAW_TEXT;
4058 }
   /* No priority table: run every detector selected by TRY and
      collect all the bits they report. */
4059 if (try & CODING_CATEGORY_MASK_ISO)
4060 mask |= detect_coding_iso2022 (src, src_end, multibytep);
4061 if (try & CODING_CATEGORY_MASK_SJIS)
4062 mask |= detect_coding_sjis (src, src_end, multibytep);
4063 if (try & CODING_CATEGORY_MASK_BIG5)
4064 mask |= detect_coding_big5 (src, src_end, multibytep);
4065 if (try & CODING_CATEGORY_MASK_UTF_8)
4066 mask |= detect_coding_utf_8 (src, src_end, multibytep);
4067 if (try & CODING_CATEGORY_MASK_UTF_16_BE_LE)
4068 mask |= detect_coding_utf_16 (src, src_end, multibytep);
4069 if (try & CODING_CATEGORY_MASK_EMACS_MULE)
4070 mask |= detect_coding_emacs_mule (src, src_end, multibytep);
4071 if (try & CODING_CATEGORY_MASK_CCL)
4072 mask |= detect_coding_ccl (src, src_end, multibytep);
4073 }
   /* Reached only when PRIORITIES is NULL: raw-text and binary are
      always reported as possible. */
4074 return (mask | CODING_CATEGORY_MASK_RAW_TEXT | CODING_CATEGORY_MASK_BINARY);
4075 }
4076
4077 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
4078 The information of the detected coding system is set in CODING. */
4079
4080 void
4081 detect_coding (coding, src, src_bytes)
4082 struct coding_system *coding;
4083 const unsigned char *src;
4084 int src_bytes;
4085 {
4086 unsigned int idx;
4087 int skip, mask;
4088 Lisp_Object val;
4089
4090 val = Vcoding_category_list;
4091 mask = detect_coding_mask (src, src_bytes, coding_priorities, &skip,
4092 coding->src_multibyte);
4093 coding->heading_ascii = skip;
4094
4095 if (!mask) return;
4096
4097 /* We found a single coding system of the highest priority in MASK. */
4098 idx = 0;
4099 while (mask && ! (mask & 1)) mask >>= 1, idx++;
4100 if (! mask)
4101 idx = CODING_CATEGORY_IDX_RAW_TEXT;
4102
4103 val = SYMBOL_VALUE (XVECTOR (Vcoding_category_table)->contents[idx]);
4104
4105 if (coding->eol_type != CODING_EOL_UNDECIDED)
4106 {
4107 Lisp_Object tmp;
4108
4109 tmp = Fget (val, Qeol_type);
4110 if (VECTORP (tmp))
4111 val = XVECTOR (tmp)->contents[coding->eol_type];
4112 }
4113
4114 /* Setup this new coding system while preserving some slots. */
4115 {
4116 int src_multibyte = coding->src_multibyte;
4117 int dst_multibyte = coding->dst_multibyte;
4118
4119 setup_coding_system (val, coding);
4120 coding->src_multibyte = src_multibyte;
4121 coding->dst_multibyte = dst_multibyte;
4122 coding->heading_ascii = skip;
4123 }
4124 }
4125
4126 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
4127 SOURCE is encoded. Return one of CODING_EOL_LF, CODING_EOL_CRLF,
4128 CODING_EOL_CR, and CODING_EOL_UNDECIDED.
4129
4130 How many non-eol characters are at the head is returned as *SKIP. */
4131
4132 #define MAX_EOL_CHECK_COUNT 3
4133
4134 static int
4135 detect_eol_type (source, src_bytes, skip)
4136 unsigned char *source;
4137 int src_bytes, *skip;
4138 {
4139 unsigned char *src = source, *src_end = src + src_bytes;
4140 unsigned char c;
4141 int total = 0; /* How many end-of-lines are found so far. */
4142 int eol_type = CODING_EOL_UNDECIDED;
4143 int this_eol_type;
4144
4145 *skip = 0;
4146
4147 while (src < src_end && total < MAX_EOL_CHECK_COUNT)
4148 {
4149 c = *src++;
4150 if (c == '\n' || c == '\r')
4151 {
4152 if (*skip == 0)
4153 *skip = src - 1 - source;
4154 total++;
4155 if (c == '\n')
4156 this_eol_type = CODING_EOL_LF;
4157 else if (src >= src_end || *src != '\n')
4158 this_eol_type = CODING_EOL_CR;
4159 else
4160 this_eol_type = CODING_EOL_CRLF, src++;
4161
4162 if (eol_type == CODING_EOL_UNDECIDED)
4163 /* This is the first end-of-line. */
4164 eol_type = this_eol_type;
4165 else if (eol_type != this_eol_type)
4166 {
4167 /* The found type is different from what found before. */
4168 eol_type = CODING_EOL_INCONSISTENT;
4169 break;
4170 }
4171 }
4172 }
4173
4174 if (*skip == 0)
4175 *skip = src_end - source;
4176 return eol_type;
4177 }
4178
4179 /* Like detect_eol_type, but detect EOL type in 2-octet
4180 big-endian/little-endian format for coding systems utf-16-be and
4181 utf-16-le. */
4182
4183 static int
4184 detect_eol_type_in_2_octet_form (source, src_bytes, skip, big_endian_p)
4185 unsigned char *source;
4186 int src_bytes, *skip, big_endian_p;
4187 {
4188 unsigned char *src = source, *src_end = src + src_bytes;
4189 unsigned int c1, c2;
4190 int total = 0; /* How many end-of-lines are found so far. */
4191 int eol_type = CODING_EOL_UNDECIDED;
4192 int this_eol_type;
4193 int msb, lsb;
4194
4195 if (big_endian_p)
4196 msb = 0, lsb = 1;
4197 else
4198 msb = 1, lsb = 0;
4199
4200 *skip = 0;
4201
4202 while ((src + 1) < src_end && total < MAX_EOL_CHECK_COUNT)
4203 {
4204 c1 = (src[msb] << 8) | (src[lsb]);
4205 src += 2;
4206
4207 if (c1 == '\n' || c1 == '\r')
4208 {
4209 if (*skip == 0)
4210 *skip = src - 2 - source;
4211 total++;
4212 if (c1 == '\n')
4213 {
4214 this_eol_type = CODING_EOL_LF;
4215 }
4216 else
4217 {
4218 if ((src + 1) >= src_end)
4219 {
4220 this_eol_type = CODING_EOL_CR;
4221 }
4222 else
4223 {
4224 c2 = (src[msb] << 8) | (src[lsb]);
4225 if (c2 == '\n')
4226 this_eol_type = CODING_EOL_CRLF, src += 2;
4227 else
4228 this_eol_type = CODING_EOL_CR;
4229 }
4230 }
4231
4232 if (eol_type == CODING_EOL_UNDECIDED)
4233 /* This is the first end-of-line. */
4234 eol_type = this_eol_type;
4235 else if (eol_type != this_eol_type)
4236 {
4237 /* The found type is different from what found before. */
4238 eol_type = CODING_EOL_INCONSISTENT;
4239 break;
4240 }
4241 }
4242 }
4243
4244 if (*skip == 0)
4245 *skip = src_end - source;
4246 return eol_type;
4247 }
4248
4249 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
4250 is encoded. If it detects an appropriate format of end-of-line, it
4251 sets the information in *CODING. */
4252
4253 void
4254 detect_eol (coding, src, src_bytes)
4255 struct coding_system *coding;
4256 const unsigned char *src;
4257 int src_bytes;
4258 {
4259 Lisp_Object val;
4260 int skip;
4261 int eol_type;
4262
4263 switch (coding->category_idx)
4264 {
4265 case CODING_CATEGORY_IDX_UTF_16_BE:
4266 eol_type = detect_eol_type_in_2_octet_form (src, src_bytes, &skip, 1);
4267 break;
4268 case CODING_CATEGORY_IDX_UTF_16_LE:
4269 eol_type = detect_eol_type_in_2_octet_form (src, src_bytes, &skip, 0);
4270 break;
4271 default:
4272 eol_type = detect_eol_type (src, src_bytes, &skip);
4273 break;
4274 }
4275
4276 if (coding->heading_ascii > skip)
4277 coding->heading_ascii = skip;
4278 else
4279 skip = coding->heading_ascii;
4280
4281 if (eol_type == CODING_EOL_UNDECIDED)
4282 return;
4283 if (eol_type == CODING_EOL_INCONSISTENT)
4284 {
4285 #if 0
4286 /* This code is suppressed until we find a better way to
4287 distinguish raw text file and binary file. */
4288
4289 /* If we have already detected that the coding is raw-text, the
4290 coding should actually be no-conversion. */
4291 if (coding->type == coding_type_raw_text)
4292 {
4293 setup_coding_system (Qno_conversion, coding);
4294 return;
4295 }
4296 /* Else, let's decode only text code anyway. */
4297 #endif /* 0 */
4298 eol_type = CODING_EOL_LF;
4299 }
4300
4301 val = Fget (coding->symbol, Qeol_type);
4302 if (VECTORP (val) && XVECTOR (val)->size == 3)
4303 {
4304 int src_multibyte = coding->src_multibyte;
4305 int dst_multibyte = coding->dst_multibyte;
4306 struct composition_data *cmp_data = coding->cmp_data;
4307
4308 setup_coding_system (XVECTOR (val)->contents[eol_type], coding);
4309 coding->src_multibyte = src_multibyte;
4310 coding->dst_multibyte = dst_multibyte;
4311 coding->heading_ascii = skip;
4312 coding->cmp_data = cmp_data;
4313 }
4314 }
4315
4316 #define CONVERSION_BUFFER_EXTRA_ROOM 256
4317
4318 #define DECODING_BUFFER_MAG(coding) \
4319 (coding->type == coding_type_iso2022 \
4320 ? 3 \
4321 : (coding->type == coding_type_ccl \
4322 ? coding->spec.ccl.decoder.buf_magnification \
4323 : 2))
4324
4325 /* Return maximum size (bytes) of a buffer enough for decoding
4326 SRC_BYTES of text encoded in CODING. */
4327
4328 int
4329 decoding_buffer_size (coding, src_bytes)
4330 struct coding_system *coding;
4331 int src_bytes;
4332 {
4333 return (src_bytes * DECODING_BUFFER_MAG (coding)
4334 + CONVERSION_BUFFER_EXTRA_ROOM);
4335 }
4336
4337 /* Return maximum size (bytes) of a buffer enough for encoding
4338 SRC_BYTES of text to CODING. */
4339
4340 int
4341 encoding_buffer_size (coding, src_bytes)
4342 struct coding_system *coding;
4343 int src_bytes;
4344 {
4345 int magnification;
4346
4347 if (coding->type == coding_type_ccl)
4348 magnification = coding->spec.ccl.encoder.buf_magnification;
4349 else if (CODING_REQUIRE_ENCODING (coding))
4350 magnification = 3;
4351 else
4352 magnification = 1;
4353
4354 return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
4355 }
4356
/* Working buffer for code conversion.  */
struct conversion_buffer
{
  int size;                     /* size of data.  */
  int on_stack;                 /* 1 if allocated by alloca.  */
  unsigned char *data;
};

/* Don't use alloca for allocating memory space larger than this, lest
   we overflow their stack.  */
#define MAX_ALLOCA 16*1024

/* Allocate LEN bytes of memory for BUF (struct conversion_buffer).
   Memory comes from alloca when LEN is below MAX_ALLOCA and from
   xmalloc otherwise; BUF.on_stack records which.  BUF and LEN are
   parenthesized so that expressions such as `*bufp' work, but both
   are still evaluated more than once, so they must be free of side
   effects.  */
#define allocate_conversion_buffer(buf, len)            \
  do {                                                  \
    if ((len) < MAX_ALLOCA)                             \
      {                                                 \
        (buf).data = (unsigned char *) alloca (len);    \
        (buf).on_stack = 1;                             \
      }                                                 \
    else                                                \
      {                                                 \
        (buf).data = (unsigned char *) xmalloc (len);   \
        (buf).on_stack = 0;                             \
      }                                                 \
    (buf).size = (len);                                 \
  } while (0)
4384
4385 /* Double the allocated memory for *BUF. */
4386 static void
4387 extend_conversion_buffer (buf)
4388 struct conversion_buffer *buf;
4389 {
4390 if (buf->on_stack)
4391 {
4392 unsigned char *save = buf->data;
4393 buf->data = (unsigned char *) xmalloc (buf->size * 2);
4394 bcopy (save, buf->data, buf->size);
4395 buf->on_stack = 0;
4396 }
4397 else
4398 {
4399 buf->data = (unsigned char *) xrealloc (buf->data, buf->size * 2);
4400 }
4401 buf->size *= 2;
4402 }
4403
4404 /* Free the allocated memory for BUF if it is not on stack. */
4405 static void
4406 free_conversion_buffer (buf)
4407 struct conversion_buffer *buf;
4408 {
4409 if (!buf->on_stack)
4410 xfree (buf->data);
4411 }
4412
/* Run the CCL encoder (if ENCODEP is nonzero) or decoder of CODING on
   SRC_BYTES bytes at SOURCE by calling ccl_driver, storing the result
   at DESTINATION.  Set CODING->produced, CODING->produced_char,
   CODING->consumed and CODING->result, and return CODING->result.
   8-bit bytes left over by the previous call (kept in
   CODING->spec.ccl.eight_bit_carryover) are put at the head of
   DESTINATION first.  */
4413 int
4414 ccl_coding_driver (coding, source, destination, src_bytes, dst_bytes, encodep)
4415 struct coding_system *coding;
4416 unsigned char *source, *destination;
4417 int src_bytes, dst_bytes, encodep;
4418 {
4419 struct ccl_program *ccl
4420 = encodep ? &coding->spec.ccl.encoder : &coding->spec.ccl.decoder;
4421 unsigned char *dst = destination;
4422
4423 ccl->suppress_error = coding->suppress_error;
4424 ccl->last_block = coding->mode & CODING_MODE_LAST_BLOCK;
4425 if (encodep)
4426 {
4427 /* On encoding, EOL format is converted within ccl_driver. For
4428 that, setup proper information in the structure CCL. An
   undecided EOL format is treated as LF. */
4429 ccl->eol_type = coding->eol_type;
4430 if (ccl->eol_type ==CODING_EOL_UNDECIDED)
4431 ccl->eol_type = CODING_EOL_LF;
4432 ccl->cr_consumed = coding->spec.ccl.cr_carryover;
4433 }
4434 ccl->multibyte = coding->src_multibyte;
4435 if (coding->spec.ccl.eight_bit_carryover[0] != 0)
4436 {
4437 /* Move carryover bytes to DESTINATION. The carryover is a
   zero-terminated byte string. The copy itself is not checked
   against DST_BYTES. */
4438 unsigned char *p = coding->spec.ccl.eight_bit_carryover;
4439 while (*p)
4440 *dst++ = *p++;
4441 coding->spec.ccl.eight_bit_carryover[0] = 0;
4442 if (dst_bytes)
4443 dst_bytes -= dst - destination;
4444 }
4445
   /* The value of ccl_driver plus the number of carryover bytes
      copied above (DST - DESTINATION) is the total produced. */
4446 coding->produced = (ccl_driver (ccl, source, dst, src_bytes, dst_bytes,
4447 &(coding->consumed))
4448 + dst - destination);
4449
4450 if (encodep)
4451 {
4452 coding->produced_char = coding->produced;
4453 coding->spec.ccl.cr_carryover = ccl->cr_consumed;
4454 }
4455 else if (!ccl->eight_bit_control)
4456 {
4457 /* The produced bytes forms a valid multibyte sequence. */
4458 coding->produced_char
4459 = multibyte_chars_in_text (destination, coding->produced);
4460 coding->spec.ccl.eight_bit_carryover[0] = 0;
4461 }
4462 else
4463 {
4464 /* On decoding, the destination should always multibyte. But,
4465 CCL program might have been generated an invalid multibyte
4466 sequence. Here we make such a sequence valid as
4467 multibyte. BYTES is handed to str_as_multibyte below.
   NOTE(review): when DST_BYTES is zero, BYTES is derived from
   SOURCE + consumed - DESTINATION; presumably SOURCE and
   DESTINATION then lie in one buffer -- confirm with the callers. */
4468 int bytes
4469 = dst_bytes ? dst_bytes : source + coding->consumed - destination;
4470
4471 if ((coding->consumed < src_bytes
4472 || !ccl->last_block)
4473 && coding->produced >= 1
4474 && destination[coding->produced - 1] >= 0x80)
4475 {
4476 /* We should not convert the tailing 8-bit codes to
4477 multibyte form even if they doesn't form a valid
4478 multibyte sequence. They may form a valid sequence in
4479 the next call. CARRYOVER counts the bytes from the nearest
   byte in 0x80..0x9F (looking back at most three bytes, with only
   bytes >= 0xA0 after it) to the end of the output. */
4480 int carryover = 0;
4481
4482 if (destination[coding->produced - 1] < 0xA0)
4483 carryover = 1;
4484 else if (coding->produced >= 2)
4485 {
4486 if (destination[coding->produced - 2] >= 0x80)
4487 {
4488 if (destination[coding->produced - 2] < 0xA0)
4489 carryover = 2;
4490 else if (coding->produced >= 3
4491 && destination[coding->produced - 3] >= 0x80
4492 && destination[coding->produced - 3] < 0xA0)
4493 carryover = 3;
4494 }
4495 }
4496 if (carryover > 0)
4497 {
   /* Save the tail bytes as a zero-terminated string for the next
      call and take them out of the produced count. */
4498 BCOPY_SHORT (destination + coding->produced - carryover,
4499 coding->spec.ccl.eight_bit_carryover,
4500 carryover);
4501 coding->spec.ccl.eight_bit_carryover[carryover] = 0;
4502 coding->produced -= carryover;
4503 }
4504 }
4505 coding->produced = str_as_multibyte (destination, bytes,
4506 coding->produced,
4507 &(coding->produced_char));
4508 }
4509
   /* Translate the status left in CCL into a CODING_FINISH_XXX result. */
4510 switch (ccl->status)
4511 {
4512 case CCL_STAT_SUSPEND_BY_SRC:
4513 coding->result = CODING_FINISH_INSUFFICIENT_SRC;
4514 break;
4515 case CCL_STAT_SUSPEND_BY_DST:
4516 coding->result = CODING_FINISH_INSUFFICIENT_DST;
4517 break;
4518 case CCL_STAT_QUIT:
4519 case CCL_STAT_INVALID_CMD:
4520 coding->result = CODING_FINISH_INTERRUPT;
4521 break;
4522 default:
4523 coding->result = CODING_FINISH_NORMAL;
4524 break;
4525 }
4526 return coding->result;
4527 }
4528
4529 /* Decode EOL format of the text at PTR of BYTES length destructively
4530 according to CODING->eol_type. This is called after the CCL
4531 program produced a decoded text at PTR. If we do CRLF->LF
4532 conversion, update CODING->produced and CODING->produced_char.
   If CODING->eol_type is undecided, it is detected from the text
   first. A CR at the very end of a non-final block is held back by
   setting CODING->spec.ccl.cr_carryover. */
4533
4534 static void
4535 decode_eol_post_ccl (coding, ptr, bytes)
4536 struct coding_system *coding;
4537 unsigned char *ptr;
4538 int bytes;
4539 {
4540 Lisp_Object val, saved_coding_symbol;
4541 unsigned char *pend = ptr + bytes;
4542 int dummy; /* Receives the skip count of detect_eol_type; not used. */
4543
4544 /* Remember the current coding system symbol. We set it back when
4545 an inconsistent EOL is found so that `last-coding-system-used' is
4546 set to the coding system that doesn't specify EOL conversion. */
4547 saved_coding_symbol = coding->symbol;
4548
4549 coding->spec.ccl.cr_carryover = 0;
4550 if (coding->eol_type == CODING_EOL_UNDECIDED)
4551 {
4552 /* Here, to avoid the call of setup_coding_system, we directly
4553 call detect_eol_type. An inconsistent result is treated as LF. */
4554 coding->eol_type = detect_eol_type (ptr, bytes, &dummy);
4555 if (coding->eol_type == CODING_EOL_INCONSISTENT)
4556 coding->eol_type = CODING_EOL_LF;
4557 if (coding->eol_type != CODING_EOL_UNDECIDED)
4558 {
4559 val = Fget (coding->symbol, Qeol_type);
4560 if (VECTORP (val) && XVECTOR (val)->size == 3)
4561 coding->symbol = XVECTOR (val)->contents[coding->eol_type];
4562 }
4563 coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4564 }
4565
4566 if (coding->eol_type == CODING_EOL_LF
4567 || coding->eol_type == CODING_EOL_UNDECIDED)
4568 {
4569 /* We have nothing to do. */
4570 ptr = pend;
4571 }
4572 else if (coding->eol_type == CODING_EOL_CRLF)
4573 {
   /* PTR reads the text and P writes the converted text in place;
      P never gets ahead of PTR. */
4574 unsigned char *pstart = ptr, *p = ptr;
4575
   /* NOTE(review): *(pend - 1) is read even if BYTES is 0 -- confirm
      that callers never pass an empty text here. */
4576 if (! (coding->mode & CODING_MODE_LAST_BLOCK)
4577 && *(pend - 1) == '\r')
4578 {
4579 /* If the last character is CR, we can't handle it here
4580 because LF will be in the not-yet-decoded source text.
4581 Record that the CR is not yet processed. */
4582 coding->spec.ccl.cr_carryover = 1;
4583 coding->produced--;
4584 coding->produced_char--;
4585 pend--;
4586 }
4587 while (ptr < pend)
4588 {
4589 if (*ptr == '\r')
4590 {
4591 if (ptr + 1 < pend && *(ptr + 1) == '\n')
4592 {
4593 *p++ = '\n';
4594 ptr += 2;
4595 }
4596 else
4597 {
   /* A CR not followed by LF. */
4598 if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4599 goto undo_eol_conversion;
4600 *p++ = *ptr++;
4601 }
4602 }
4603 else if (*ptr == '\n'
4604 && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
   /* An LF not preceded by CR. */
4605 goto undo_eol_conversion;
4606 else
4607 *p++ = *ptr++;
4608 continue;
4609
   /* The code below is reached only by the gotos above. */
4610 undo_eol_conversion:
4611 /* We have faced with inconsistent EOL format at PTR.
4612 Convert all LFs before PTR back to CRLFs. This runs backward
   from PTR, expanding the converted text between PSTART and P. */
4613 for (p--, ptr--; p >= pstart; p--)
4614 {
4615 if (*p == '\n')
4616 *ptr-- = '\n', *ptr-- = '\r';
4617 else
4618 *ptr-- = *p;
4619 }
4620 /* If carryover is recorded, cancel it because we don't
4621 convert CRLF anymore. */
4622 if (coding->spec.ccl.cr_carryover)
4623 {
4624 coding->spec.ccl.cr_carryover = 0;
4625 coding->produced++;
4626 coding->produced_char++;
4627 pend++;
4628 }
   /* Setting PTR to PEND ends the loop; P == PEND makes the
      adjustment after the loop a no-op. */
4629 p = ptr = pend;
4630 coding->eol_type = CODING_EOL_LF;
4631 coding->symbol = saved_coding_symbol;
4632 }
4633 if (p < pend)
4634 {
4635 /* As each two-byte sequence CRLF was converted to LF, (PEND
4636 - P) is the number of deleted characters. */
4637 coding->produced -= pend - p;
4638 coding->produced_char -= pend - p;
4639 }
4640 }
4641 else /* i.e. coding->eol_type == CODING_EOL_CR */
4642 {
4643 unsigned char *p = ptr; /* Start of the text, kept for undoing. */
4644
4645 for (; ptr < pend; ptr++)
4646 {
4647 if (*ptr == '\r')
4648 *ptr = '\n';
4649 else if (*ptr == '\n'
4650 && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4651 {
   /* Inconsistent EOL: turn the LFs written so far back into CRs
      and stop converting. */
4652 for (; p < ptr; p++)
4653 {
4654 if (*p == '\n')
4655 *p = '\r';
4656 }
4657 ptr = pend;
4658 coding->eol_type = CODING_EOL_LF;
4659 coding->symbol = saved_coding_symbol;
4660 }
4661 }
4662 }
4663 }
4664
4665 /* See "GENERAL NOTES about `decode_coding_XXX ()' functions". Before
4666 decoding, it may detect coding system and format of end-of-line if
4667 those are not yet decided. The source should be unibyte, the
4668 result is multibyte if CODING->dst_multibyte is nonzero, else
4669 unibyte. The value is CODING->result. */
4670
4671 int
4672 decode_coding (coding, source, destination, src_bytes, dst_bytes)
4673 struct coding_system *coding;
4674 const unsigned char *source;
4675 unsigned char *destination;
4676 int src_bytes, dst_bytes;
4677 {
4678 int extra = 0; /* Nonzero if a carried-over CR was put at DESTINATION. */
4679
4680 if (coding->type == coding_type_undecided)
4681 detect_coding (coding, source, src_bytes);
4682
   /* For CCL, EOL detection is left to decode_eol_post_ccl. */
4683 if (coding->eol_type == CODING_EOL_UNDECIDED
4684 && coding->type != coding_type_ccl)
4685 {
4686 detect_eol (coding, source, src_bytes);
4687 /* We had better recover the original eol format if we
4688 encounter an inconsistent eol format while decoding. */
4689 coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4690 }
4691
4692 coding->produced = coding->produced_char = 0;
4693 coding->consumed = coding->consumed_char = 0;
4694 coding->errors = 0;
4695 coding->result = CODING_FINISH_NORMAL;
4696
4697 switch (coding->type)
4698 {
4699 case coding_type_sjis:
4700 decode_coding_sjis_big5 (coding, source, destination,
4701 src_bytes, dst_bytes, 1);
4702 break;
4703
4704 case coding_type_iso2022:
4705 decode_coding_iso2022 (coding, source, destination,
4706 src_bytes, dst_bytes);
4707 break;
4708
4709 case coding_type_big5:
4710 decode_coding_sjis_big5 (coding, source, destination,
4711 src_bytes, dst_bytes, 0);
4712 break;
4713
4714 case coding_type_emacs_mule:
4715 decode_coding_emacs_mule (coding, source, destination,
4716 src_bytes, dst_bytes);
4717 break;
4718
4719 case coding_type_ccl:
4720 if (coding->spec.ccl.cr_carryover)
4721 {
4722 /* Put the CR which was not processed by the previous call
4723 of decode_eol_post_ccl in DESTINATION. It will be
4724 decoded together with the following LF by the call to
4725 decode_eol_post_ccl below. */
4726 *destination = '\r';
4727 coding->produced++;
4728 coding->produced_char++;
4729 dst_bytes--;
4730 extra = coding->spec.ccl.cr_carryover;
4731 }
4732 ccl_coding_driver (coding, source, destination + extra,
4733 src_bytes, dst_bytes, 0);
4734 if (coding->eol_type != CODING_EOL_LF)
4735 {
   /* ccl_coding_driver assigns CODING->produced and
      CODING->produced_char itself, so the carried-over CR is
      counted here. */
4736 coding->produced += extra;
4737 coding->produced_char += extra;
4738 decode_eol_post_ccl (coding, destination, coding->produced);
4739 }
4740 break;
4741
4742 default:
   /* Every other type goes through decode_eol. */
4743 decode_eol (coding, source, destination, src_bytes, dst_bytes);
4744 }
4745
   /* In the last block, running out of source after consuming all of
      it is a normal finish. */
4746 if (coding->result == CODING_FINISH_INSUFFICIENT_SRC
4747 && coding->mode & CODING_MODE_LAST_BLOCK
4748 && coding->consumed == src_bytes)
4749 coding->result = CODING_FINISH_NORMAL;
4750
4751 if (coding->mode & CODING_MODE_LAST_BLOCK
4752 && coding->result == CODING_FINISH_INSUFFICIENT_SRC)
4753 {
   /* The last block ended with source bytes the decoder could not
      consume. Count an error and store each remaining byte with
      CHAR_STRING. The stores are not checked against DST_BYTES. */
4754 const unsigned char *src = source + coding->consumed;
4755 unsigned char *dst = destination + coding->produced;
4756
4757 src_bytes -= coding->consumed;
4758 coding->errors++;
4759 if (COMPOSING_P (coding))
4760 DECODE_COMPOSITION_END ('1');
4761 while (src_bytes--)
4762 {
4763 int c = *src++;
4764 dst += CHAR_STRING (c, dst);
4765 coding->produced_char++;
4766 }
4767 coding->consumed = coding->consumed_char = src - source;
4768 coding->produced = dst - destination;
4769 coding->result = CODING_FINISH_NORMAL;
4770 }
4771
4772 if (!coding->dst_multibyte)
4773 {
   /* A unibyte result was requested: convert in place. */
4774 coding->produced = str_as_unibyte (destination, coding->produced);
4775 coding->produced_char = coding->produced;
4776 }
4777
4778 return coding->result;
4779 }
4780
4781 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions". The
4782 multibyteness of the source is CODING->src_multibyte, the
4783 multibyteness of the result is always unibyte. The value is
   CODING->result. */
4784
4785 int
4786 encode_coding (coding, source, destination, src_bytes, dst_bytes)
4787 struct coding_system *coding;
4788 const unsigned char *source;
4789 unsigned char *destination;
4790 int src_bytes, dst_bytes;
4791 {
4792 coding->produced = coding->produced_char = 0;
4793 coding->consumed = coding->consumed_char = 0;
4794 coding->errors = 0;
4795 coding->result = CODING_FINISH_NORMAL;
4796
   /* Dispatch on the type of CODING. */
4797 switch (coding->type)
4798 {
4799 case coding_type_sjis:
4800 encode_coding_sjis_big5 (coding, source, destination,
4801 src_bytes, dst_bytes, 1);
4802 break;
4803
4804 case coding_type_iso2022:
4805 encode_coding_iso2022 (coding, source, destination,
4806 src_bytes, dst_bytes);
4807 break;
4808
4809 case coding_type_big5:
4810 encode_coding_sjis_big5 (coding, source, destination,
4811 src_bytes, dst_bytes, 0);
4812 break;
4813
4814 case coding_type_emacs_mule:
4815 encode_coding_emacs_mule (coding, source, destination,
4816 src_bytes, dst_bytes);
4817 break;
4818
4819 case coding_type_ccl:
4820 ccl_coding_driver (coding, source, destination,
4821 src_bytes, dst_bytes, 1);
4822 break;
4823
4824 default:
   /* Every other type goes through encode_eol. */
4825 encode_eol (coding, source, destination, src_bytes, dst_bytes);
4826 }
4827
4828 if (coding->mode & CODING_MODE_LAST_BLOCK
4829 && coding->result == CODING_FINISH_INSUFFICIENT_SRC)
4830 {
   /* The last block ended before the encoder could finish. Close
      what is open, then copy any unconsumed source bytes as they
      are. The macro below refers to the locals here by name. The
      stores are not checked against DST_BYTES. */
4831 const unsigned char *src = source + coding->consumed;
4832 unsigned char *dst = destination + coding->produced;
4833
4834 if (coding->type == coding_type_iso2022)
4835 ENCODE_RESET_PLANE_AND_REGISTER;
4836 if (COMPOSING_P (coding))
4837 *dst++ = ISO_CODE_ESC, *dst++ = '1';
4838 if (coding->consumed < src_bytes)
4839 {
4840 int len = src_bytes - coding->consumed;
4841
4842 BCOPY_SHORT (src, dst, len);
4843 if (coding->src_multibyte)
4844 len = str_as_unibyte (dst, len);
4845 dst += len;
4846 coding->consumed = src_bytes;
4847 }
4848 coding->produced = coding->produced_char = dst - destination;
4849 coding->result = CODING_FINISH_NORMAL;
4850 }
4851
   /* Running out of source after consuming all of it is a normal
      finish. */
4852 if (coding->result == CODING_FINISH_INSUFFICIENT_SRC
4853 && coding->consumed == src_bytes)
4854 coding->result = CODING_FINISH_NORMAL;
4855
4856 return coding->result;
4857 }
4858
4859 /* Scan text in the region between *BEG and *END (byte positions),
4860 skip characters which we don't have to decode by coding system
4861 CODING at the head and tail, then set *BEG and *END to the region
4862 of the text we actually have to convert. The caller should move
4863 the gap out of the region in advance if the region is from a
4864 buffer.
4865
4866 If STR is not NULL, *BEG and *END are indices into STR. */
4867
4868 static void
4869 shrink_decoding_region (beg, end, coding, str)
4870 int *beg, *end;
4871 struct coding_system *coding;
4872 unsigned char *str;
4873 {
4874 unsigned char *begp_orig, *begp, *endp_orig, *endp, c;
4875 int eol_conversion;
4876 Lisp_Object translation_table;
4877
4878 if (coding->type == coding_type_ccl
4879 || coding->type == coding_type_undecided
4880 || coding->eol_type != CODING_EOL_LF
4881 || !NILP (coding->post_read_conversion)
4882 || coding->composing != COMPOSITION_DISABLED)
4883 {
4884 /* We can't skip any data. */
4885 return;
4886 }
4887 if (coding->type == coding_type_no_conversion
4888 || coding->type == coding_type_raw_text
4889 || coding->type == coding_type_emacs_mule)
4890 {
4891 /* We need no conversion, but don't have to skip any data here.
4892 Decoding routine handles them effectively anyway. */
4893 return;
4894 }
4895
4896 translation_table = coding->translation_table_for_decode;
4897 if (NILP (translation_table) && !NILP (Venable_character_translation))
4898 translation_table = Vstandard_translation_table_for_decode;
4899 if (CHAR_TABLE_P (translation_table))
4900 {
4901 int i;
4902 for (i = 0; i < 128; i++)
4903 if (!NILP (CHAR_TABLE_REF (translation_table, i)))
4904 break;
4905 if (i < 128)
4906 /* Some ASCII character should be translated. We give up
4907 shrinking. */
4908 return;
4909 }
4910
4911 if (coding->heading_ascii >= 0)
4912 /* Detection routine has already found how much we can skip at the
4913 head. */
4914 *beg += coding->heading_ascii;
4915
4916 if (str)
4917 {
4918 begp_orig = begp = str + *beg;
4919 endp_orig = endp = str + *end;
4920 }
4921 else
4922 {
4923 begp_orig = begp = BYTE_POS_ADDR (*beg);
4924 endp_orig = endp = begp + *end - *beg;
4925 }
4926
4927 eol_conversion = (coding->eol_type == CODING_EOL_CR
4928 || coding->eol_type == CODING_EOL_CRLF);
4929
4930 switch (coding->type)
4931 {
4932 case coding_type_sjis:
4933 case coding_type_big5:
4934 /* We can skip all ASCII characters at the head. */
4935 if (coding->heading_ascii < 0)
4936 {
4937 if (eol_conversion)
4938 while (begp < endp && *begp < 0x80 && *begp != '\r') begp++;
4939 else
4940 while (begp < endp && *begp < 0x80) begp++;
4941 }
4942 /* We can skip all ASCII characters at the tail except for the
4943 second byte of SJIS or BIG5 code. */
4944 if (eol_conversion)
4945 while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\r') endp--;
4946 else
4947 while (begp < endp && endp[-1] < 0x80) endp--;
4948 /* Do not consider LF as ascii if preceded by CR, since that
4949 confuses eol decoding. */
4950 if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
4951 endp++;
4952 if (begp < endp && endp < endp_orig && endp[-1] >= 0x80)
4953 endp++;
4954 break;
4955
4956 case coding_type_iso2022:
4957 if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
4958 /* We can't skip any data. */
4959 break;
4960 if (coding->heading_ascii < 0)
4961 {
4962 /* We can skip all ASCII characters at the head except for a
4963 few control codes. */
4964 while (begp < endp && (c = *begp) < 0x80
4965 && c != ISO_CODE_CR && c != ISO_CODE_SO
4966 && c != ISO_CODE_SI && c != ISO_CODE_ESC
4967 && (!eol_conversion || c != ISO_CODE_LF))
4968 begp++;
4969 }
4970 switch (coding->category_idx)
4971 {
4972 case CODING_CATEGORY_IDX_ISO_8_1:
4973 case CODING_CATEGORY_IDX_ISO_8_2:
4974 /* We can skip all ASCII characters at the tail. */
4975 if (eol_conversion)
4976 while (begp < endp && (c = endp[-1]) < 0x80 && c != '\r') endp--;
4977 else
4978 while (begp < endp && endp[-1] < 0x80) endp--;
4979 /* Do not consider LF as ascii if preceded by CR, since that
4980 confuses eol decoding. */
4981 if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
4982 endp++;
4983 break;
4984
4985 case CODING_CATEGORY_IDX_ISO_7:
4986 case CODING_CATEGORY_IDX_ISO_7_TIGHT:
4987 {
4988 /* We can skip all characters at the tail except for 8-bit
4989 codes and ESC and the following 2-byte at the tail. */
4990 unsigned char *eight_bit = NULL;
4991
4992 if (eol_conversion)
4993 while (begp < endp
4994 && (c = endp[-1]) != ISO_CODE_ESC && c != '\r')
4995 {
4996 if (!eight_bit && c & 0x80) eight_bit = endp;
4997 endp--;
4998 }
4999 else
5000 while (begp < endp
5001 && (c = endp[-1]) != ISO_CODE_ESC)
5002 {
5003 if (!eight_bit && c & 0x80) eight_bit = endp;
5004 endp--;
5005 }
5006 /* Do not consider LF as ascii if preceded by CR, since that
5007 confuses eol decoding. */
5008 if (begp < endp && endp < endp_orig
5009 && endp[-1] == '\r' && endp[0] == '\n')
5010 endp++;
5011 if (begp < endp && endp[-1] == ISO_CODE_ESC)
5012 {
5013 if (endp + 1 < endp_orig && end[0] == '(' && end[1] == 'B')
5014 /* This is an ASCII designation sequence. We can
5015 surely skip the tail. But, if we have
5016 encountered an 8-bit code, skip only the codes
5017 after that. */
5018 endp = eight_bit ? eight_bit : endp + 2;
5019 else
5020 /* Hmmm, we can't skip the tail. */
5021 endp = endp_orig;
5022 }
5023 else if (eight_bit)
5024 endp = eight_bit;
5025 }
5026 }
5027 break;
5028
5029 default:
5030 abort ();
5031 }
5032 *beg += begp - begp_orig;
5033 *end += endp - endp_orig;
5034 return;
5035 }
5036
5037 /* Like shrink_decoding_region but for encoding. */
5038
5039 static void
5040 shrink_encoding_region (beg, end, coding, str)
5041 int *beg, *end;
5042 struct coding_system *coding;
5043 unsigned char *str;
5044 {
5045 unsigned char *begp_orig, *begp, *endp_orig, *endp;
5046 int eol_conversion;
5047 Lisp_Object translation_table;
5048
5049 if (coding->type == coding_type_ccl
5050 || coding->eol_type == CODING_EOL_CRLF
5051 || coding->eol_type == CODING_EOL_CR
5052 || (coding->cmp_data && coding->cmp_data->used > 0))
5053 {
5054 /* We can't skip any data. */
5055 return;
5056 }
5057 if (coding->type == coding_type_no_conversion
5058 || coding->type == coding_type_raw_text
5059 || coding->type == coding_type_emacs_mule
5060 || coding->type == coding_type_undecided)
5061 {
5062 /* We need no conversion, but don't have to skip any data here.
5063 Encoding routine handles them effectively anyway. */
5064 return;
5065 }
5066
5067 translation_table = coding->translation_table_for_encode;
5068 if (NILP (translation_table) && !NILP (Venable_character_translation))
5069 translation_table = Vstandard_translation_table_for_encode;
5070 if (CHAR_TABLE_P (translation_table))
5071 {
5072 int i;
5073 for (i = 0; i < 128; i++)
5074 if (!NILP (CHAR_TABLE_REF (translation_table, i)))
5075 break;
5076 if (i < 128)
5077 /* Some ASCII character should be translated. We give up
5078 shrinking. */
5079 return;
5080 }
5081
5082 if (str)
5083 {
5084 begp_orig = begp = str + *beg;
5085 endp_orig = endp = str + *end;
5086 }
5087 else
5088 {
5089 begp_orig = begp = BYTE_POS_ADDR (*beg);
5090 endp_orig = endp = begp + *end - *beg;
5091 }
5092
5093 eol_conversion = (coding->eol_type == CODING_EOL_CR
5094 || coding->eol_type == CODING_EOL_CRLF);
5095
5096 /* Here, we don't have to check coding->pre_write_conversion because
5097 the caller is expected to have handled it already. */
5098 switch (coding->type)
5099 {
5100 case coding_type_iso2022:
5101 if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
5102 /* We can't skip any data. */
5103 break;
5104 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL)
5105 {
5106 unsigned char *bol = begp;
5107 while (begp < endp && *begp < 0x80)
5108 {
5109 begp++;
5110 if (begp[-1] == '\n')
5111 bol = begp;
5112 }
5113 begp = bol;
5114 goto label_skip_tail;
5115 }
5116 /* fall down ... */
5117
5118 case coding_type_sjis:
5119 case coding_type_big5:
5120 /* We can skip all ASCII characters at the head and tail. */
5121 if (eol_conversion)
5122 while (begp < endp && *begp < 0x80 && *begp != '\n') begp++;
5123 else
5124 while (begp < endp && *begp < 0x80) begp++;
5125 label_skip_tail:
5126 if (eol_conversion)
5127 while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\n') endp--;
5128 else
5129 while (begp < endp && *(endp - 1) < 0x80) endp--;
5130 break;
5131
5132 default:
5133 abort ();
5134 }
5135
5136 *beg += begp - begp_orig;
5137 *end += endp - endp_orig;
5138 return;
5139 }
5140
/* Shrinking a conversion region has its own cost, so it is attempted
   only when the region is longer than this many bytes.  */
static int shrink_conversion_region_threshhold = 1024;

/* Shrink the region [*BEG, *END) for CODING, using the encoding
   variant when ENCODEP is nonzero and the decoding variant
   otherwise.  Regions not longer than the threshold are left
   untouched.  */
#define SHRINK_CONVERSION_REGION(beg, end, coding, str, encodep)        \
  do {                                                                  \
    if (*(end) - *(beg) <= shrink_conversion_region_threshhold)         \
      break;                                                            \
    if (encodep)                                                        \
      shrink_encoding_region (beg, end, coding, str);                   \
    else                                                                \
      shrink_decoding_region (beg, end, coding, str);                   \
  } while (0)
5154
5155 static Lisp_Object
5156 code_convert_region_unwind (dummy)
5157 Lisp_Object dummy;
5158 {
5159 inhibit_pre_post_conversion = 0;
5160 return Qnil;
5161 }
5162
5163 /* Store information about all compositions in the range FROM and TO
5164 of OBJ in memory blocks pointed by CODING->cmp_data. OBJ is a
5165 buffer or a string, defaults to the current buffer. */
5166
5167 void
5168 coding_save_composition (coding, from, to, obj)
5169 struct coding_system *coding;
5170 int from, to;
5171 Lisp_Object obj;
5172 {
5173 Lisp_Object prop;
5174 int start, end;
5175
5176 if (coding->composing == COMPOSITION_DISABLED)
5177 return;
5178 if (!coding->cmp_data)
5179 coding_allocate_composition_data (coding, from);
5180 if (!find_composition (from, to, &start, &end, &prop, obj)
5181 || end > to)
5182 return;
5183 if (start < from
5184 && (!find_composition (end, to, &start, &end, &prop, obj)
5185 || end > to))
5186 return;
5187 coding->composing = COMPOSITION_NO;
5188 do
5189 {
5190 if (COMPOSITION_VALID_P (start, end, prop))
5191 {
5192 enum composition_method method = COMPOSITION_METHOD (prop);
5193 if (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH
5194 >= COMPOSITION_DATA_SIZE)
5195 coding_allocate_composition_data (coding, from);
5196 /* For relative composition, we remember start and end
5197 positions, for the other compositions, we also remember
5198 components. */
5199 CODING_ADD_COMPOSITION_START (coding, start - from, method);
5200 if (method != COMPOSITION_RELATIVE)
5201 {
5202 /* We must store a*/
5203 Lisp_Object val, ch;
5204
5205 val = COMPOSITION_COMPONENTS (prop);
5206 if (CONSP (val))
5207 while (CONSP (val))
5208 {
5209 ch = XCAR (val), val = XCDR (val);
5210 CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (ch));
5211 }
5212 else if (VECTORP (val) || STRINGP (val))
5213 {
5214 int len = (VECTORP (val)
5215 ? XVECTOR (val)->size : SCHARS (val));
5216 int i;
5217 for (i = 0; i < len; i++)
5218 {
5219 ch = (STRINGP (val)
5220 ? Faref (val, make_number (i))
5221 : XVECTOR (val)->contents[i]);
5222 CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (ch));
5223 }
5224 }
5225 else /* INTEGERP (val) */
5226 CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (val));
5227 }
5228 CODING_ADD_COMPOSITION_END (coding, end - from);
5229 }
5230 start = end;
5231 }
5232 while (start < to
5233 && find_composition (start, to, &start, &end, &prop, obj)
5234 && end <= to);
5235
5236 /* Make coding->cmp_data point to the first memory block. */
5237 while (coding->cmp_data->prev)
5238 coding->cmp_data = coding->cmp_data->prev;
5239 coding->cmp_data_start = 0;
5240 }
5241
5242 /* Reflect the saved information about compositions to OBJ.
5243 CODING->cmp_data points to a memory block for the information. OBJ
5244 is a buffer or a string, defaults to the current buffer. */
5245
5246 void
5247 coding_restore_composition (coding, obj)
5248 struct coding_system *coding;
5249 Lisp_Object obj;
5250 {
5251 struct composition_data *cmp_data = coding->cmp_data;
5252
5253 if (!cmp_data)
5254 return;
5255
5256 while (cmp_data->prev)
5257 cmp_data = cmp_data->prev;
5258
5259 while (cmp_data)
5260 {
5261 int i;
5262
5263 for (i = 0; i < cmp_data->used && cmp_data->data[i] > 0;
5264 i += cmp_data->data[i])
5265 {
5266 int *data = cmp_data->data + i;
5267 enum composition_method method = (enum composition_method) data[3];
5268 Lisp_Object components;
5269
5270 if (method == COMPOSITION_RELATIVE)
5271 components = Qnil;
5272 else
5273 {
5274 int len = data[0] - 4, j;
5275 Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
5276
5277 for (j = 0; j < len; j++)
5278 args[j] = make_number (data[4 + j]);
5279 components = (method == COMPOSITION_WITH_ALTCHARS
5280 ? Fstring (len, args) : Fvector (len, args));
5281 }
5282 compose_text (data[1], data[2], components, Qnil, obj);
5283 }
5284 cmp_data = cmp_data->next;
5285 }
5286 }
5287
5288 /* Decode (if ENCODEP is zero) or encode (if ENCODEP is nonzero) the
5289 text from FROM to TO (byte positions are FROM_BYTE and TO_BYTE) by
5290 coding system CODING, and return the status code of code conversion
5291 (currently, this value has no meaning).
5292
5293 How many characters (and bytes) are converted to how many
5294 characters (and bytes) are recorded in members of the structure
5295 CODING.
5296
5297 If REPLACE is nonzero, we do various things as if the original text
5298 is deleted and a new text is inserted. See the comments in
5299 replace_range (insdel.c) to know what we are doing.
5300
5301 If REPLACE is zero, it is assumed that the source text is unibyte.
5302 Otherwise, it is assumed that the source text is multibyte. */
5303
5304 int
5305 code_convert_region (from, from_byte, to, to_byte, coding, encodep, replace)
5306 int from, from_byte, to, to_byte, encodep, replace;
5307 struct coding_system *coding;
5308 {
5309 int len = to - from, len_byte = to_byte - from_byte;
5310 int nchars_del = 0, nbytes_del = 0;
5311 int require, inserted, inserted_byte;
5312 int head_skip, tail_skip, total_skip = 0;
5313 Lisp_Object saved_coding_symbol;
5314 int first = 1;
5315 unsigned char *src, *dst;
5316 Lisp_Object deletion;
5317 int orig_point = PT, orig_len = len;
5318 int prev_Z;
5319 int multibyte_p = !NILP (current_buffer->enable_multibyte_characters);
5320
5321 deletion = Qnil;
5322 saved_coding_symbol = coding->symbol;
5323
5324 if (from < PT && PT < to)
5325 {
5326 TEMP_SET_PT_BOTH (from, from_byte);
5327 orig_point = from;
5328 }
5329
5330 if (replace)
5331 {
5332 int saved_from = from;
5333 int saved_inhibit_modification_hooks;
5334
5335 prepare_to_modify_buffer (from, to, &from);
5336 if (saved_from != from)
5337 {
5338 to = from + len;
5339 from_byte = CHAR_TO_BYTE (from), to_byte = CHAR_TO_BYTE (to);
5340 len_byte = to_byte - from_byte;
5341 }
5342
5343 /* The code conversion routine can not preserve text properties
5344 for now. So, we must remove all text properties in the
5345 region. Here, we must suppress all modification hooks. */
5346 saved_inhibit_modification_hooks = inhibit_modification_hooks;
5347 inhibit_modification_hooks = 1;
5348 Fset_text_properties (make_number (from), make_number (to), Qnil, Qnil);
5349 inhibit_modification_hooks = saved_inhibit_modification_hooks;
5350 }
5351
5352 if (! encodep && CODING_REQUIRE_DETECTION (coding))
5353 {
5354 /* We must detect encoding of text and eol format. */
5355
5356 if (from < GPT && to > GPT)
5357 move_gap_both (from, from_byte);
5358 if (coding->type == coding_type_undecided)
5359 {
5360 detect_coding (coding, BYTE_POS_ADDR (from_byte), len_byte);
5361 if (coding->type == coding_type_undecided)
5362 {
5363 /* It seems that the text contains only ASCII, but we
5364 should not leave it undecided because the deeper
5365 decoding routine (decode_coding) tries to detect the
5366 encodings again in vain. */
5367 coding->type = coding_type_emacs_mule;
5368 coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE;
5369 /* As emacs-mule decoder will handle composition, we
5370 need this setting to allocate coding->cmp_data
5371 later. */
5372 coding->composing = COMPOSITION_NO;
5373 }
5374 }
5375 if (coding->eol_type == CODING_EOL_UNDECIDED
5376 && coding->type != coding_type_ccl)
5377 {
5378 detect_eol (coding, BYTE_POS_ADDR (from_byte), len_byte);
5379 if (coding->eol_type == CODING_EOL_UNDECIDED)
5380 coding->eol_type = CODING_EOL_LF;
5381 /* We had better recover the original eol format if we
5382 encounter an inconsistent eol format while decoding. */
5383 coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
5384 }
5385 }
5386
5387 /* Now we convert the text. */
5388
5389 /* For encoding, we must process pre-write-conversion in advance. */
5390 if (! inhibit_pre_post_conversion
5391 && encodep
5392 && SYMBOLP (coding->pre_write_conversion)
5393 && ! NILP (Ffboundp (coding->pre_write_conversion)))
5394 {
5395 /* The function in pre-write-conversion may put a new text in a
5396 new buffer. */
5397 struct buffer *prev = current_buffer;
5398 Lisp_Object new;
5399
5400 record_unwind_protect (code_convert_region_unwind, Qnil);
5401 /* We should not call any more pre-write/post-read-conversion
5402 functions while this pre-write-conversion is running. */
5403 inhibit_pre_post_conversion = 1;
5404 call2 (coding->pre_write_conversion,
5405 make_number (from), make_number (to));
5406 inhibit_pre_post_conversion = 0;
5407 /* Discard the unwind protect. */
5408 specpdl_ptr--;
5409
5410 if (current_buffer != prev)
5411 {
5412 len = ZV - BEGV;
5413 new = Fcurrent_buffer ();
5414 set_buffer_internal_1 (prev);
5415 del_range_2 (from, from_byte, to, to_byte, 0);
5416 TEMP_SET_PT_BOTH (from, from_byte);
5417 insert_from_buffer (XBUFFER (new), 1, len, 0);
5418 Fkill_buffer (new);
5419 if (orig_point >= to)
5420 orig_point += len - orig_len;
5421 else if (orig_point > from)
5422 orig_point = from;
5423 orig_len = len;
5424 to = from + len;
5425 from_byte = CHAR_TO_BYTE (from);
5426 to_byte = CHAR_TO_BYTE (to);
5427 len_byte = to_byte - from_byte;
5428 TEMP_SET_PT_BOTH (from, from_byte);
5429 }
5430 }
5431
5432 if (replace)
5433 {
5434 if (! EQ (current_buffer->undo_list, Qt))
5435 deletion = make_buffer_string_both (from, from_byte, to, to_byte, 1);
5436 else
5437 {
5438 nchars_del = to - from;
5439 nbytes_del = to_byte - from_byte;
5440 }
5441 }
5442
5443 if (coding->composing != COMPOSITION_DISABLED)
5444 {
5445 if (encodep)
5446 coding_save_composition (coding, from, to, Fcurrent_buffer ());
5447 else
5448 coding_allocate_composition_data (coding, from);
5449 }
5450
5451 /* Try to skip the heading and tailing ASCIIs. */
5452 if (coding->type != coding_type_ccl)
5453 {
5454 int from_byte_orig = from_byte, to_byte_orig = to_byte;
5455
5456 if (from < GPT && GPT < to)
5457 move_gap_both (from, from_byte);
5458 SHRINK_CONVERSION_REGION (&from_byte, &to_byte, coding, NULL, encodep);
5459 if (from_byte == to_byte
5460 && (encodep || NILP (coding->post_read_conversion))
5461 && ! CODING_REQUIRE_FLUSHING (coding))
5462 {
5463 coding->produced = len_byte;
5464 coding->produced_char = len;
5465 if (!replace)
5466 /* We must record and adjust for this new text now. */
5467 adjust_after_insert (from, from_byte_orig, to, to_byte_orig, len);
5468 return 0;
5469 }
5470
5471 head_skip = from_byte - from_byte_orig;
5472 tail_skip = to_byte_orig - to_byte;
5473 total_skip = head_skip + tail_skip;
5474 from += head_skip;
5475 to -= tail_skip;
5476 len -= total_skip; len_byte -= total_skip;
5477 }
5478
5479 /* For conversion, we must put the gap before the text in addition to
5480 making the gap larger for efficient decoding. The required gap
5481 size starts from 2000 which is the magic number used in make_gap.
5482 But, after one batch of conversion, it will be incremented if we
5483 find that it is not enough . */
5484 require = 2000;
5485
5486 if (GAP_SIZE < require)
5487 make_gap (require - GAP_SIZE);
5488 move_gap_both (from, from_byte);
5489
5490 inserted = inserted_byte = 0;
5491
5492 GAP_SIZE += len_byte;
5493 ZV -= len;
5494 Z -= len;
5495 ZV_BYTE -= len_byte;
5496 Z_BYTE -= len_byte;
5497
5498 if (GPT - BEG < BEG_UNCHANGED)
5499 BEG_UNCHANGED = GPT - BEG;
5500 if (Z - GPT < END_UNCHANGED)
5501 END_UNCHANGED = Z - GPT;
5502
5503 if (!encodep && coding->src_multibyte)
5504 {
5505 /* Decoding routines expects that the source text is unibyte.
5506 We must convert 8-bit characters of multibyte form to
5507 unibyte. */
5508 int len_byte_orig = len_byte;
5509 len_byte = str_as_unibyte (GAP_END_ADDR - len_byte, len_byte);
5510 if (len_byte < len_byte_orig)
5511 safe_bcopy (GAP_END_ADDR - len_byte_orig, GAP_END_ADDR - len_byte,
5512 len_byte);
5513 coding->src_multibyte = 0;
5514 }
5515
5516 for (;;)
5517 {
5518 int result;
5519
5520 /* The buffer memory is now:
5521 +--------+converted-text+---------+-------original-text-------+---+
5522 |<-from->|<--inserted-->|---------|<--------len_byte--------->|---|
5523 |<---------------------- GAP ----------------------->| */
5524 src = GAP_END_ADDR - len_byte;
5525 dst = GPT_ADDR + inserted_byte;
5526
5527 if (encodep)
5528 result = encode_coding (coding, src, dst, len_byte, 0);
5529 else
5530 {
5531 if (coding->composing != COMPOSITION_DISABLED)
5532 coding->cmp_data->char_offset = from + inserted;
5533 result = decode_coding (coding, src, dst, len_byte, 0);
5534 }
5535
5536 /* The buffer memory is now:
5537 +--------+-------converted-text----+--+------original-text----+---+
5538 |<-from->|<-inserted->|<-produced->|--|<-(len_byte-consumed)->|---|
5539 |<---------------------- GAP ----------------------->| */
5540
5541 inserted += coding->produced_char;
5542 inserted_byte += coding->produced;
5543 len_byte -= coding->consumed;
5544
5545 if (result == CODING_FINISH_INSUFFICIENT_CMP)
5546 {
5547 coding_allocate_composition_data (coding, from + inserted);
5548 continue;
5549 }
5550
5551 src += coding->consumed;
5552 dst += coding->produced;
5553
5554 if (result == CODING_FINISH_NORMAL)
5555 {
5556 src += len_byte;
5557 break;
5558 }
5559 if (! encodep && result == CODING_FINISH_INCONSISTENT_EOL)
5560 {
5561 unsigned char *pend = dst, *p = pend - inserted_byte;
5562 Lisp_Object eol_type;
5563
5564 /* Encode LFs back to the original eol format (CR or CRLF). */
5565 if (coding->eol_type == CODING_EOL_CR)
5566 {
5567 while (p < pend) if (*p++ == '\n') p[-1] = '\r';
5568 }
5569 else
5570 {
5571 int count = 0;
5572
5573 while (p < pend) if (*p++ == '\n') count++;
5574 if (src - dst < count)
5575 {
5576 /* We don't have sufficient room for encoding LFs
5577 back to CRLF. We must record converted and
5578 not-yet-converted text back to the buffer
5579 content, enlarge the gap, then record them out of
5580 the buffer contents again. */
5581 int add = len_byte + inserted_byte;
5582
5583 GAP_SIZE -= add;
5584 ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
5585 GPT += inserted_byte; GPT_BYTE += inserted_byte;
5586 make_gap (count - GAP_SIZE);
5587 GAP_SIZE += add;
5588 ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
5589 GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5590 /* Don't forget to update SRC, DST, and PEND. */
5591 src = GAP_END_ADDR - len_byte;
5592 dst = GPT_ADDR + inserted_byte;
5593 pend = dst;
5594 }
5595 inserted += count;
5596 inserted_byte += count;
5597 coding->produced += count;
5598 p = dst = pend + count;
5599 while (count)
5600 {
5601 *--p = *--pend;
5602 if (*p == '\n') count--, *--p = '\r';
5603 }
5604 }
5605
5606 /* Suppress eol-format conversion in the further conversion. */
5607 coding->eol_type = CODING_EOL_LF;
5608
5609 /* Set the coding system symbol to that for Unix-like EOL. */
5610 eol_type = Fget (saved_coding_symbol, Qeol_type);
5611 if (VECTORP (eol_type)
5612 && XVECTOR (eol_type)->size == 3
5613 && SYMBOLP (XVECTOR (eol_type)->contents[CODING_EOL_LF]))
5614 coding->symbol = XVECTOR (eol_type)->contents[CODING_EOL_LF];
5615 else
5616 coding->symbol = saved_coding_symbol;
5617
5618 continue;
5619 }
5620 if (len_byte <= 0)
5621 {
5622 if (coding->type != coding_type_ccl
5623 || coding->mode & CODING_MODE_LAST_BLOCK)
5624 break;
5625 coding->mode |= CODING_MODE_LAST_BLOCK;
5626 continue;
5627 }
5628 if (result == CODING_FINISH_INSUFFICIENT_SRC)
5629 {
5630 /* The source text ends in invalid codes. Let's just
5631 make them valid buffer contents, and finish conversion. */
5632 if (multibyte_p)
5633 {
5634 unsigned char *start = dst;
5635
5636 inserted += len_byte;
5637 while (len_byte--)
5638 {
5639 int c = *src++;
5640 dst += CHAR_STRING (c, dst);
5641 }
5642
5643 inserted_byte += dst - start;
5644 }
5645 else
5646 {
5647 inserted += len_byte;
5648 inserted_byte += len_byte;
5649 while (len_byte--)
5650 *dst++ = *src++;
5651 }
5652 break;
5653 }
5654 if (result == CODING_FINISH_INTERRUPT)
5655 {
5656 /* The conversion procedure was interrupted by a user. */
5657 break;
5658 }
5659 /* Now RESULT == CODING_FINISH_INSUFFICIENT_DST */
5660 if (coding->consumed < 1)
5661 {
5662 /* It's quite strange to require more memory without
5663 consuming any bytes. Perhaps CCL program bug. */
5664 break;
5665 }
5666 if (first)
5667 {
5668 /* We have just done the first batch of conversion which was
5669 stopped because of insufficient gap. Let's reconsider the
5670 required gap size (i.e. SRT - DST) now.
5671
5672 We have converted ORIG bytes (== coding->consumed) into
5673 NEW bytes (coding->produced). To convert the remaining
5674 LEN bytes, we may need REQUIRE bytes of gap, where:
5675 REQUIRE + LEN_BYTE = LEN_BYTE * (NEW / ORIG)
5676 REQUIRE = LEN_BYTE * (NEW - ORIG) / ORIG
5677 Here, we are sure that NEW >= ORIG. */
5678 float ratio;
5679
5680 if (coding->produced <= coding->consumed)
5681 {
5682 /* This happens because of CCL-based coding system with
5683 eol-type CRLF. */
5684 require = 0;
5685 }
5686 else
5687 {
5688 ratio = (coding->produced - coding->consumed) / coding->consumed;
5689 require = len_byte * ratio;
5690 }
5691 first = 0;
5692 }
5693 if ((src - dst) < (require + 2000))
5694 {
5695 /* See the comment above the previous call of make_gap. */
5696 int add = len_byte + inserted_byte;
5697
5698 GAP_SIZE -= add;
5699 ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
5700 GPT += inserted_byte; GPT_BYTE += inserted_byte;
5701 make_gap (require + 2000);
5702 GAP_SIZE += add;
5703 ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
5704 GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5705 }
5706 }
5707 if (src - dst > 0) *dst = 0; /* Put an anchor. */
5708
5709 if (encodep && coding->dst_multibyte)
5710 {
5711 /* The output is unibyte. We must convert 8-bit characters to
5712 multibyte form. */
5713 if (inserted_byte * 2 > GAP_SIZE)
5714 {
5715 GAP_SIZE -= inserted_byte;
5716 ZV += inserted_byte; Z += inserted_byte;
5717 ZV_BYTE += inserted_byte; Z_BYTE += inserted_byte;
5718 GPT += inserted_byte; GPT_BYTE += inserted_byte;
5719 make_gap (inserted_byte - GAP_SIZE);
5720 GAP_SIZE += inserted_byte;
5721 ZV -= inserted_byte; Z -= inserted_byte;
5722 ZV_BYTE -= inserted_byte; Z_BYTE -= inserted_byte;
5723 GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5724 }
5725 inserted_byte = str_to_multibyte (GPT_ADDR, GAP_SIZE, inserted_byte);
5726 }
5727
5728 /* If we shrank the conversion area, adjust it now. */
5729 if (total_skip > 0)
5730 {
5731 if (tail_skip > 0)
5732 safe_bcopy (GAP_END_ADDR, GPT_ADDR + inserted_byte, tail_skip);
5733 inserted += total_skip; inserted_byte += total_skip;
5734 GAP_SIZE += total_skip;
5735 GPT -= head_skip; GPT_BYTE -= head_skip;
5736 ZV -= total_skip; ZV_BYTE -= total_skip;
5737 Z -= total_skip; Z_BYTE -= total_skip;
5738 from -= head_skip; from_byte -= head_skip;
5739 to += tail_skip; to_byte += tail_skip;
5740 }
5741
5742 prev_Z = Z;
5743 if (! EQ (current_buffer->undo_list, Qt))
5744 adjust_after_replace (from, from_byte, deletion, inserted, inserted_byte);
5745 else
5746 adjust_after_replace_noundo (from, from_byte, nchars_del, nbytes_del,
5747 inserted, inserted_byte);
5748 inserted = Z - prev_Z;
5749
5750 if (!encodep && coding->cmp_data && coding->cmp_data->used)
5751 coding_restore_composition (coding, Fcurrent_buffer ());
5752 coding_free_composition_data (coding);
5753
5754 if (! inhibit_pre_post_conversion
5755 && ! encodep && ! NILP (coding->post_read_conversion))
5756 {
5757 Lisp_Object val;
5758
5759 if (from != PT)
5760 TEMP_SET_PT_BOTH (from, from_byte);
5761 prev_Z = Z;
5762 record_unwind_protect (code_convert_region_unwind, Qnil);
5763 /* We should not call any more pre-write/post-read-conversion
5764 functions while this post-read-conversion is running. */
5765 inhibit_pre_post_conversion = 1;
5766 val = call1 (coding->post_read_conversion, make_number (inserted));
5767 inhibit_pre_post_conversion = 0;
5768 /* Discard the unwind protect. */
5769 specpdl_ptr--;
5770 CHECK_NUMBER (val);
5771 inserted += Z - prev_Z;
5772 }
5773
5774 if (orig_point >= from)
5775 {
5776 if (orig_point >= from + orig_len)
5777 orig_point += inserted - orig_len;
5778 else
5779 orig_point = from;
5780 TEMP_SET_PT (orig_point);
5781 }
5782
5783 if (replace)
5784 {
5785 signal_after_change (from, to - from, inserted);
5786 update_compositions (from, from + inserted, CHECK_BORDER);
5787 }
5788
5789 {
5790 coding->consumed = to_byte - from_byte;
5791 coding->consumed_char = to - from;
5792 coding->produced = inserted_byte;
5793 coding->produced_char = inserted;
5794 }
5795
5796 return 0;
5797 }
5798
5799 Lisp_Object
5800 run_pre_post_conversion_on_str (str, coding, encodep)
5801 Lisp_Object str;
5802 struct coding_system *coding;
5803 int encodep;
5804 {
5805 int count = SPECPDL_INDEX ();
5806 struct gcpro gcpro1, gcpro2;
5807 int multibyte = STRING_MULTIBYTE (str);
5808 Lisp_Object buffer;
5809 struct buffer *buf;
5810 Lisp_Object old_deactivate_mark;
5811
5812 record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
5813 record_unwind_protect (code_convert_region_unwind, Qnil);
5814 /* It is not crucial to specbind this. */
5815 old_deactivate_mark = Vdeactivate_mark;
5816 GCPRO2 (str, old_deactivate_mark);
5817
5818 buffer = Fget_buffer_create (build_string (" *code-converting-work*"));
5819 buf = XBUFFER (buffer);
5820
5821 buf->directory = current_buffer->directory;
5822 buf->read_only = Qnil;
5823 buf->filename = Qnil;
5824 buf->undo_list = Qt;
5825 buf->overlays_before = Qnil;
5826 buf->overlays_after = Qnil;
5827
5828 set_buffer_internal (buf);
5829 /* We must insert the contents of STR as is without
5830 unibyte<->multibyte conversion. For that, we adjust the
5831 multibyteness of the working buffer to that of STR. */
5832 Ferase_buffer ();
5833 buf->enable_multibyte_characters = multibyte ? Qt : Qnil;
5834
5835 insert_from_string (str, 0, 0,
5836 SCHARS (str), SBYTES (str), 0);
5837 UNGCPRO;
5838 inhibit_pre_post_conversion = 1;
5839 if (encodep)
5840 call2 (coding->pre_write_conversion, make_number (BEG), make_number (Z));
5841 else
5842 {
5843 TEMP_SET_PT_BOTH (BEG, BEG_BYTE);
5844 call1 (coding->post_read_conversion, make_number (Z - BEG));
5845 }
5846 inhibit_pre_post_conversion = 0;
5847 Vdeactivate_mark = old_deactivate_mark;
5848 str = make_buffer_string (BEG, Z, 1);
5849 return unbind_to (count, str);
5850 }
5851
5852 Lisp_Object
5853 decode_coding_string (str, coding, nocopy)
5854 Lisp_Object str;
5855 struct coding_system *coding;
5856 int nocopy;
5857 {
5858 int len;
5859 struct conversion_buffer buf;
5860 int from, to_byte;
5861 Lisp_Object saved_coding_symbol;
5862 int result;
5863 int require_decoding;
5864 int shrinked_bytes = 0;
5865 Lisp_Object newstr;
5866 int consumed, consumed_char, produced, produced_char;
5867
5868 from = 0;
5869 to_byte = SBYTES (str);
5870
5871 saved_coding_symbol = coding->symbol;
5872 coding->src_multibyte = STRING_MULTIBYTE (str);
5873 coding->dst_multibyte = 1;
5874 if (CODING_REQUIRE_DETECTION (coding))
5875 {
5876 /* See the comments in code_convert_region. */
5877 if (coding->type == coding_type_undecided)
5878 {
5879 detect_coding (coding, SDATA (str), to_byte);
5880 if (coding->type == coding_type_undecided)
5881 {
5882 coding->type = coding_type_emacs_mule;
5883 coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE;
5884 /* As emacs-mule decoder will handle composition, we
5885 need this setting to allocate coding->cmp_data
5886 later. */
5887 coding->composing = COMPOSITION_NO;
5888 }
5889 }
5890 if (coding->eol_type == CODING_EOL_UNDECIDED
5891 && coding->type != coding_type_ccl)
5892 {
5893 saved_coding_symbol = coding->symbol;
5894 detect_eol (coding, SDATA (str), to_byte);
5895 if (coding->eol_type == CODING_EOL_UNDECIDED)
5896 coding->eol_type = CODING_EOL_LF;
5897 /* We had better recover the original eol format if we
5898 encounter an inconsistent eol format while decoding. */
5899 coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
5900 }
5901 }
5902
5903 if (coding->type == coding_type_no_conversion
5904 || coding->type == coding_type_raw_text)
5905 coding->dst_multibyte = 0;
5906
5907 require_decoding = CODING_REQUIRE_DECODING (coding);
5908
5909 if (STRING_MULTIBYTE (str))
5910 {
5911 /* Decoding routines expect the source text to be unibyte. */
5912 str = Fstring_as_unibyte (str);
5913 to_byte = SBYTES (str);
5914 nocopy = 1;
5915 coding->src_multibyte = 0;
5916 }
5917
5918 /* Try to skip the heading and tailing ASCIIs. */
5919 if (require_decoding && coding->type != coding_type_ccl)
5920 {
5921 SHRINK_CONVERSION_REGION (&from, &to_byte, coding, SDATA (str),
5922 0);
5923 if (from == to_byte)
5924 require_decoding = 0;
5925 shrinked_bytes = from + (SBYTES (str) - to_byte);
5926 }
5927
5928 if (!require_decoding)
5929 {
5930 coding->consumed = SBYTES (str);
5931 coding->consumed_char = SCHARS (str);
5932 if (coding->dst_multibyte)
5933 {
5934 str = Fstring_as_multibyte (str);
5935 nocopy = 1;
5936 }
5937 coding->produced = SBYTES (str);
5938 coding->produced_char = SCHARS (str);
5939 return (nocopy ? str : Fcopy_sequence (str));
5940 }
5941
5942 if (coding->composing != COMPOSITION_DISABLED)
5943 coding_allocate_composition_data (coding, from);
5944 len = decoding_buffer_size (coding, to_byte - from);
5945 allocate_conversion_buffer (buf, len);
5946
5947 consumed = consumed_char = produced = produced_char = 0;
5948 while (1)
5949 {
5950 result = decode_coding (coding, SDATA (str) + from + consumed,
5951 buf.data + produced, to_byte - from - consumed,
5952 buf.size - produced);
5953 consumed += coding->consumed;
5954 consumed_char += coding->consumed_char;
5955 produced += coding->produced;
5956 produced_char += coding->produced_char;
5957 if (result == CODING_FINISH_NORMAL
5958 || (result == CODING_FINISH_INSUFFICIENT_SRC
5959 && coding->consumed == 0))
5960 break;
5961 if (result == CODING_FINISH_INSUFFICIENT_CMP)
5962 coding_allocate_composition_data (coding, from + produced_char);
5963 else if (result == CODING_FINISH_INSUFFICIENT_DST)
5964 extend_conversion_buffer (&buf);
5965 else if (result == CODING_FINISH_INCONSISTENT_EOL)
5966 {
5967 Lisp_Object eol_type;
5968
5969 /* Recover the original EOL format. */
5970 if (coding->eol_type == CODING_EOL_CR)
5971 {
5972 unsigned char *p;
5973 for (p = buf.data; p < buf.data + produced; p++)
5974 if (*p == '\n') *p = '\r';
5975 }
5976 else if (coding->eol_type == CODING_EOL_CRLF)
5977 {
5978 int num_eol = 0;
5979 unsigned char *p0, *p1;
5980 for (p0 = buf.data, p1 = p0 + produced; p0 < p1; p0++)
5981 if (*p0 == '\n') num_eol++;
5982 if (produced + num_eol >= buf.size)
5983 extend_conversion_buffer (&buf);
5984 for (p0 = buf.data + produced, p1 = p0 + num_eol; p0 > buf.data;)
5985 {
5986 *--p1 = *--p0;
5987 if (*p0 == '\n') *--p1 = '\r';
5988 }
5989 produced += num_eol;
5990 produced_char += num_eol;
5991 }
5992 /* Suppress eol-format conversion in the further conversion. */
5993 coding->eol_type = CODING_EOL_LF;
5994
5995 /* Set the coding system symbol to that for Unix-like EOL. */
5996 eol_type = Fget (saved_coding_symbol, Qeol_type);
5997 if (VECTORP (eol_type)
5998 && XVECTOR (eol_type)->size == 3
5999 && SYMBOLP (XVECTOR (eol_type)->contents[CODING_EOL_LF]))
6000 coding->symbol = XVECTOR (eol_type)->contents[CODING_EOL_LF];
6001 else
6002 coding->symbol = saved_coding_symbol;
6003
6004
6005 }
6006 }
6007
6008 coding->consumed = consumed;
6009 coding->consumed_char = consumed_char;
6010 coding->produced = produced;
6011 coding->produced_char = produced_char;
6012
6013 if (coding->dst_multibyte)
6014 newstr = make_uninit_multibyte_string (produced_char + shrinked_bytes,
6015 produced + shrinked_bytes);
6016 else
6017 newstr = make_uninit_string (produced + shrinked_bytes);
6018 if (from > 0)
6019 STRING_COPYIN (newstr, 0, SDATA (str), from);
6020 STRING_COPYIN (newstr, from, buf.data, produced);
6021 if (shrinked_bytes > from)
6022 STRING_COPYIN (newstr, from + produced,
6023 SDATA (str) + to_byte,
6024 shrinked_bytes - from);
6025 free_conversion_buffer (&buf);
6026
6027 if (coding->cmp_data && coding->cmp_data->used)
6028 coding_restore_composition (coding, newstr);
6029 coding_free_composition_data (coding);
6030
6031 if (SYMBOLP (coding->post_read_conversion)
6032 && !NILP (Ffboundp (coding->post_read_conversion)))
6033 newstr = run_pre_post_conversion_on_str (newstr, coding, 0);
6034
6035 return newstr;
6036 }
6037
6038 Lisp_Object
6039 encode_coding_string (str, coding, nocopy)
6040 Lisp_Object str;
6041 struct coding_system *coding;
6042 int nocopy;
6043 {
6044 int len;
6045 struct conversion_buffer buf;
6046 int from, to, to_byte;
6047 int result;
6048 int shrinked_bytes = 0;
6049 Lisp_Object newstr;
6050 int consumed, consumed_char, produced, produced_char;
6051
6052 if (SYMBOLP (coding->pre_write_conversion)
6053 && !NILP (Ffboundp (coding->pre_write_conversion)))
6054 str = run_pre_post_conversion_on_str (str, coding, 1);
6055
6056 from = 0;
6057 to = SCHARS (str);
6058 to_byte = SBYTES (str);
6059
6060 /* Encoding routines determine the multibyteness of the source text
6061 by coding->src_multibyte. */
6062 coding->src_multibyte = STRING_MULTIBYTE (str);
6063 coding->dst_multibyte = 0;
6064 if (! CODING_REQUIRE_ENCODING (coding))
6065 {
6066 coding->consumed = SBYTES (str);
6067 coding->consumed_char = SCHARS (str);
6068 if (STRING_MULTIBYTE (str))
6069 {
6070 str = Fstring_as_unibyte (str);
6071 nocopy = 1;
6072 }
6073 coding->produced = SBYTES (str);
6074 coding->produced_char = SCHARS (str);
6075 return (nocopy ? str : Fcopy_sequence (str));
6076 }
6077
6078 if (coding->composing != COMPOSITION_DISABLED)
6079 coding_save_composition (coding, from, to, str);
6080
6081 /* Try to skip the heading and tailing ASCIIs. */
6082 if (coding->type != coding_type_ccl)
6083 {
6084 SHRINK_CONVERSION_REGION (&from, &to_byte, coding, SDATA (str),
6085 1);
6086 if (from == to_byte)
6087 return (nocopy ? str : Fcopy_sequence (str));
6088 shrinked_bytes = from + (SBYTES (str) - to_byte);
6089 }
6090
6091 len = encoding_buffer_size (coding, to_byte - from);
6092 allocate_conversion_buffer (buf, len);
6093
6094 consumed = consumed_char = produced = produced_char = 0;
6095 while (1)
6096 {
6097 result = encode_coding (coding, SDATA (str) + from + consumed,
6098 buf.data + produced, to_byte - from - consumed,
6099 buf.size - produced);
6100 consumed += coding->consumed;
6101 consumed_char += coding->consumed_char;
6102 produced += coding->produced;
6103 produced_char += coding->produced_char;
6104 if (result == CODING_FINISH_NORMAL
6105 || (result == CODING_FINISH_INSUFFICIENT_SRC
6106 && coding->consumed == 0))
6107 break;
6108 /* Now result should be CODING_FINISH_INSUFFICIENT_DST. */
6109 extend_conversion_buffer (&buf);
6110 }
6111
6112 coding->consumed = consumed;
6113 coding->consumed_char = consumed_char;
6114 coding->produced = produced;
6115 coding->produced_char = produced_char;
6116
6117 newstr = make_uninit_string (produced + shrinked_bytes);
6118 if (from > 0)
6119 STRING_COPYIN (newstr, 0, SDATA (str), from);
6120 STRING_COPYIN (newstr, from, buf.data, produced);
6121 if (shrinked_bytes > from)
6122 STRING_COPYIN (newstr, from + produced,
6123 SDATA (str) + to_byte,
6124 shrinked_bytes - from);
6125
6126 free_conversion_buffer (&buf);
6127 coding_free_composition_data (coding);
6128
6129 return newstr;
6130 }
6131
6132 \f
6133 #ifdef emacs
6134 /*** 8. Emacs Lisp library functions ***/
6135
6136 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
6137 doc: /* Return t if OBJECT is nil or a coding-system.
6138 See the documentation of `make-coding-system' for information
6139 about coding-system objects. */)
6140 (obj)
6141 Lisp_Object obj;
6142 {
6143 if (NILP (obj))
6144 return Qt;
6145 if (!SYMBOLP (obj))
6146 return Qnil;
6147 /* Get coding-spec vector for OBJ. */
6148 obj = Fget (obj, Qcoding_system);
6149 return ((VECTORP (obj) && XVECTOR (obj)->size == 5)
6150 ? Qt : Qnil);
6151 }
6152
6153 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
6154 Sread_non_nil_coding_system, 1, 1, 0,
6155 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT. */)
6156 (prompt)
6157 Lisp_Object prompt;
6158 {
6159 Lisp_Object val;
6160 do
6161 {
6162 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6163 Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
6164 }
6165 while (SCHARS (val) == 0);
6166 return (Fintern (val, Qnil));
6167 }
6168
6169 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
6170 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
6171 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM. */)
6172 (prompt, default_coding_system)
6173 Lisp_Object prompt, default_coding_system;
6174 {
6175 Lisp_Object val;
6176 if (SYMBOLP (default_coding_system))
6177 default_coding_system = SYMBOL_NAME (default_coding_system);
6178 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6179 Qt, Qnil, Qcoding_system_history,
6180 default_coding_system, Qnil);
6181 return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
6182 }
6183
6184 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
6185 1, 1, 0,
6186 doc: /* Check validity of CODING-SYSTEM.
6187 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
6188 It is valid if it is a symbol with a non-nil `coding-system' property.
6189 The value of property should be a vector of length 5. */)
6190 (coding_system)
6191 Lisp_Object coding_system;
6192 {
6193 CHECK_SYMBOL (coding_system);
6194 if (!NILP (Fcoding_system_p (coding_system)))
6195 return coding_system;
6196 while (1)
6197 Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
6198 }
6199 \f
6200 Lisp_Object
6201 detect_coding_system (src, src_bytes, highest, multibytep)
6202 const unsigned char *src;
6203 int src_bytes, highest;
6204 int multibytep;
6205 {
6206 int coding_mask, eol_type;
6207 Lisp_Object val, tmp;
6208 int dummy;
6209
6210 coding_mask = detect_coding_mask (src, src_bytes, NULL, &dummy, multibytep);
6211 eol_type = detect_eol_type (src, src_bytes, &dummy);
6212 if (eol_type == CODING_EOL_INCONSISTENT)
6213 eol_type = CODING_EOL_UNDECIDED;
6214
6215 if (!coding_mask)
6216 {
6217 val = Qundecided;
6218 if (eol_type != CODING_EOL_UNDECIDED)
6219 {
6220 Lisp_Object val2;
6221 val2 = Fget (Qundecided, Qeol_type);
6222 if (VECTORP (val2))
6223 val = XVECTOR (val2)->contents[eol_type];
6224 }
6225 return (highest ? val : Fcons (val, Qnil));
6226 }
6227
6228 /* At first, gather possible coding systems in VAL. */
6229 val = Qnil;
6230 for (tmp = Vcoding_category_list; CONSP (tmp); tmp = XCDR (tmp))
6231 {
6232 Lisp_Object category_val, category_index;
6233
6234 category_index = Fget (XCAR (tmp), Qcoding_category_index);
6235 category_val = Fsymbol_value (XCAR (tmp));
6236 if (!NILP (category_val)
6237 && NATNUMP (category_index)
6238 && (coding_mask & (1 << XFASTINT (category_index))))
6239 {
6240 val = Fcons (category_val, val);
6241 if (highest)
6242 break;
6243 }
6244 }
6245 if (!highest)
6246 val = Fnreverse (val);
6247
6248 /* Then, replace the elements with subsidiary coding systems. */
6249 for (tmp = val; CONSP (tmp); tmp = XCDR (tmp))
6250 {
6251 if (eol_type != CODING_EOL_UNDECIDED
6252 && eol_type != CODING_EOL_INCONSISTENT)
6253 {
6254 Lisp_Object eol;
6255 eol = Fget (XCAR (tmp), Qeol_type);
6256 if (VECTORP (eol))
6257 XSETCAR (tmp, XVECTOR (eol)->contents[eol_type]);
6258 }
6259 }
6260 return (highest ? XCAR (val) : val);
6261 }
6262
6263 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
6264 2, 3, 0,
6265 doc: /* Detect coding system of the text in the region between START and END.
6266 Return a list of possible coding systems ordered by priority.
6267
6268 If only ASCII characters are found, it returns a list of single element
6269 `undecided' or its subsidiary coding system according to a detected
6270 end-of-line format.
6271
6272 If optional argument HIGHEST is non-nil, return the coding system of
6273 highest priority. */)
6274 (start, end, highest)
6275 Lisp_Object start, end, highest;
6276 {
6277 int from, to;
6278 int from_byte, to_byte;
6279 int include_anchor_byte = 0;
6280
6281 CHECK_NUMBER_COERCE_MARKER (start);
6282 CHECK_NUMBER_COERCE_MARKER (end);
6283
6284 validate_region (&start, &end);
6285 from = XINT (start), to = XINT (end);
6286 from_byte = CHAR_TO_BYTE (from);
6287 to_byte = CHAR_TO_BYTE (to);
6288
6289 if (from < GPT && to >= GPT)
6290 move_gap_both (to, to_byte);
6291 /* If we an anchor byte `\0' follows the region, we include it in
6292 the detecting source. Then code detectors can handle the tailing
6293 byte sequence more accurately.
6294
6295 Fix me: This is not a perfect solution. It is better that we
6296 add one more argument, say LAST_BLOCK, to all detect_coding_XXX.
6297 */
6298 if (to == Z || (to == GPT && GAP_SIZE > 0))
6299 include_anchor_byte = 1;
6300 return detect_coding_system (BYTE_POS_ADDR (from_byte),
6301 to_byte - from_byte + include_anchor_byte,
6302 !NILP (highest),
6303 !NILP (current_buffer
6304 ->enable_multibyte_characters));
6305 }
6306
6307 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
6308 1, 2, 0,
6309 doc: /* Detect coding system of the text in STRING.
6310 Return a list of possible coding systems ordered by priority.
6311
6312 If only ASCII characters are found, it returns a list of single element
6313 `undecided' or its subsidiary coding system according to a detected
6314 end-of-line format.
6315
6316 If optional argument HIGHEST is non-nil, return the coding system of
6317 highest priority. */)
6318 (string, highest)
6319 Lisp_Object string, highest;
6320 {
6321 CHECK_STRING (string);
6322
6323 return detect_coding_system (SDATA (string),
6324 /* "+ 1" is to include the anchor byte
6325 `\0'. With this, code detectors can
6326 handle the tailing bytes more
6327 accurately. */
6328 SBYTES (string) + 1,
6329 !NILP (highest),
6330 STRING_MULTIBYTE (string));
6331 }
6332
6333 /* Return an intersection of lists L1 and L2. */
6334
6335 static Lisp_Object
6336 intersection (l1, l2)
6337 Lisp_Object l1, l2;
6338 {
6339 Lisp_Object val = Fcons (Qnil, Qnil), tail;
6340
6341 for (tail = val; CONSP (l1); l1 = XCDR (l1))
6342 {
6343 if (!NILP (Fmemq (XCAR (l1), l2)))
6344 {
6345 XSETCDR (tail, Fcons (XCAR (l1), Qnil));
6346 tail = XCDR (tail);
6347 }
6348 }
6349 return XCDR (val);
6350 }
6351
6352
6353 /* Subroutine for Fsafe_coding_systems_region_internal.
6354
6355 Return a list of coding systems that safely encode the multibyte
6356 text between P and PEND. SAFE_CODINGS, if non-nil, is a list of
6357 possible coding systems. If it is nil, it means that we have not
6358 yet found any coding systems.
6359
6360 WORK_TABLE is a copy of the char-table Vchar_coding_system_table. An
6361 element of WORK_TABLE is set to t once the element is looked up.
6362
6363 If a non-ASCII single byte char is found, set
6364 *single_byte_char_found to 1. */
6365
6366 static Lisp_Object
6367 find_safe_codings (p, pend, safe_codings, work_table, single_byte_char_found)
6368 unsigned char *p, *pend;
6369 Lisp_Object safe_codings, work_table;
6370 int *single_byte_char_found;
6371 {
6372 int c, len, idx;
6373 Lisp_Object val;
6374
6375 while (p < pend)
6376 {
6377 c = STRING_CHAR_AND_LENGTH (p, pend - p, len);
6378 p += len;
6379 if (ASCII_BYTE_P (c))
6380 /* We can ignore ASCII characters here. */
6381 continue;
6382 if (SINGLE_BYTE_CHAR_P (c))
6383 *single_byte_char_found = 1;
6384 if (NILP (safe_codings))
6385 continue;
6386 /* Check the safe coding systems for C. */
6387 val = char_table_ref_and_index (work_table, c, &idx);
6388 if (EQ (val, Qt))
6389 /* This element was already checked. Ignore it. */
6390 continue;
6391 /* Remember that we checked this element. */
6392 CHAR_TABLE_SET (work_table, make_number (idx), Qt);
6393
6394 /* If there are some safe coding systems for C and we have
6395 already found the other set of coding systems for the
6396 different characters, get the intersection of them. */
6397 if (!EQ (safe_codings, Qt) && !NILP (val))
6398 val = intersection (safe_codings, val);
6399 safe_codings = val;
6400 }
6401 return safe_codings;
6402 }
6403
6404
6405 /* Return a list of coding systems that safely encode the text between
6406 START and END. If the text contains only ASCII or is unibyte,
6407 return t. */
6408
6409 DEFUN ("find-coding-systems-region-internal",
6410 Ffind_coding_systems_region_internal,
6411 Sfind_coding_systems_region_internal, 2, 2, 0,
6412 doc: /* Internal use only. */)
6413 (start, end)
6414 Lisp_Object start, end;
6415 {
6416 Lisp_Object work_table, safe_codings;
6417 int non_ascii_p = 0;
6418 int single_byte_char_found = 0;
6419 const unsigned char *p1, *p1end, *p2, *p2end, *p;
6420
6421 if (STRINGP (start))
6422 {
6423 if (!STRING_MULTIBYTE (start))
6424 return Qt;
6425 p1 = SDATA (start), p1end = p1 + SBYTES (start);
6426 p2 = p2end = p1end;
6427 if (SCHARS (start) != SBYTES (start))
6428 non_ascii_p = 1;
6429 }
6430 else
6431 {
6432 int from, to, stop;
6433
6434 CHECK_NUMBER_COERCE_MARKER (start);
6435 CHECK_NUMBER_COERCE_MARKER (end);
6436 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
6437 args_out_of_range (start, end);
6438 if (NILP (current_buffer->enable_multibyte_characters))
6439 return Qt;
6440 from = CHAR_TO_BYTE (XINT (start));
6441 to = CHAR_TO_BYTE (XINT (end));
6442 stop = from < GPT_BYTE && GPT_BYTE < to ? GPT_BYTE : to;
6443 p1 = BYTE_POS_ADDR (from), p1end = p1 + (stop - from);
6444 if (stop == to)
6445 p2 = p2end = p1end;
6446 else
6447 p2 = BYTE_POS_ADDR (stop), p2end = p2 + (to - stop);
6448 if (XINT (end) - XINT (start) != to - from)
6449 non_ascii_p = 1;
6450 }
6451
6452 if (!non_ascii_p)
6453 {
6454 /* We are sure that the text contains no multibyte character.
6455 Check if it contains eight-bit-graphic. */
6456 p = p1;
6457 for (p = p1; p < p1end && ASCII_BYTE_P (*p); p++);
6458 if (p == p1end)
6459 {
6460 for (p = p2; p < p2end && ASCII_BYTE_P (*p); p++);
6461 if (p == p2end)
6462 return Qt;
6463 }
6464 }
6465
6466 /* The text contains non-ASCII characters. */
6467 work_table = Fcopy_sequence (Vchar_coding_system_table);
6468 safe_codings = find_safe_codings (p1, p1end, Qt, work_table,
6469 &single_byte_char_found);
6470 if (p2 < p2end)
6471 safe_codings = find_safe_codings (p2, p2end, safe_codings, work_table,
6472 &single_byte_char_found);
6473
6474 if (EQ (safe_codings, Qt))
6475 ; /* Nothing to be done. */
6476 else if (!single_byte_char_found)
6477 {
6478 /* Append generic coding systems. */
6479 Lisp_Object args[2];
6480 args[0] = safe_codings;
6481 args[1] = Fchar_table_extra_slot (Vchar_coding_system_table,
6482 make_number (0));
6483 safe_codings = Fappend (2, args);
6484 }
6485 else
6486 safe_codings = Fcons (Qraw_text,
6487 Fcons (Qemacs_mule,
6488 Fcons (Qno_conversion, safe_codings)));
6489 return safe_codings;
6490 }
6491
6492
6493 /* Search from position POS for such characters that are unencodable
6494 accoding to SAFE_CHARS, and return a list of their positions. P
6495 points where in the memory the character at POS exists. Limit the
6496 search at PEND or when Nth unencodable characters are found.
6497
6498 If SAFE_CHARS is a char table, an element for an unencodable
6499 character is nil.
6500
6501 If SAFE_CHARS is nil, all non-ASCII characters are unencodable.
6502
6503 Otherwise, SAFE_CHARS is t, and only eight-bit-contrl and
6504 eight-bit-graphic characters are unencodable. */
6505
6506 static Lisp_Object
6507 unencodable_char_position (safe_chars, pos, p, pend, n)
6508 Lisp_Object safe_chars;
6509 int pos;
6510 unsigned char *p, *pend;
6511 int n;
6512 {
6513 Lisp_Object pos_list;
6514
6515 pos_list = Qnil;
6516 while (p < pend)
6517 {
6518 int len;
6519 int c = STRING_CHAR_AND_LENGTH (p, MAX_MULTIBYTE_LENGTH, len);
6520
6521 if (c >= 128
6522 && (CHAR_TABLE_P (safe_chars)
6523 ? NILP (CHAR_TABLE_REF (safe_chars, c))
6524 : (NILP (safe_chars) || c < 256)))
6525 {
6526 pos_list = Fcons (make_number (pos), pos_list);
6527 if (--n <= 0)
6528 break;
6529 }
6530 pos++;
6531 p += len;
6532 }
6533 return Fnreverse (pos_list);
6534 }
6535
6536
6537 DEFUN ("unencodable-char-position", Funencodable_char_position,
6538 Sunencodable_char_position, 3, 5, 0,
6539 doc: /*
6540 Return position of first un-encodable character in a region.
6541 START and END specfiy the region and CODING-SYSTEM specifies the
6542 encoding to check. Return nil if CODING-SYSTEM does encode the region.
6543
6544 If optional 4th argument COUNT is non-nil, it specifies at most how
6545 many un-encodable characters to search. In this case, the value is a
6546 list of positions.
6547
6548 If optional 5th argument STRING is non-nil, it is a string to search
6549 for un-encodable characters. In that case, START and END are indexes
6550 to the string. */)
6551 (start, end, coding_system, count, string)
6552 Lisp_Object start, end, coding_system, count, string;
6553 {
6554 int n;
6555 Lisp_Object safe_chars;
6556 struct coding_system coding;
6557 Lisp_Object positions;
6558 int from, to;
6559 unsigned char *p, *pend;
6560
6561 if (NILP (string))
6562 {
6563 validate_region (&start, &end);
6564 from = XINT (start);
6565 to = XINT (end);
6566 if (NILP (current_buffer->enable_multibyte_characters))
6567 return Qnil;
6568 p = CHAR_POS_ADDR (from);
6569 pend = CHAR_POS_ADDR (to);
6570 }
6571 else
6572 {
6573 CHECK_STRING (string);
6574 CHECK_NATNUM (start);
6575 CHECK_NATNUM (end);
6576 from = XINT (start);
6577 to = XINT (end);
6578 if (from > to
6579 || to > SCHARS (string))
6580 args_out_of_range_3 (string, start, end);
6581 if (! STRING_MULTIBYTE (string))
6582 return Qnil;
6583 p = SDATA (string) + string_char_to_byte (string, from);
6584 pend = SDATA (string) + string_char_to_byte (string, to);
6585 }
6586
6587 setup_coding_system (Fcheck_coding_system (coding_system), &coding);
6588
6589 if (NILP (count))
6590 n = 1;
6591 else
6592 {
6593 CHECK_NATNUM (count);
6594 n = XINT (count);
6595 }
6596
6597 if (coding.type == coding_type_no_conversion
6598 || coding.type == coding_type_raw_text)
6599 return Qnil;
6600
6601 if (coding.type == coding_type_undecided)
6602 safe_chars = Qnil;
6603 else
6604 safe_chars = coding_safe_chars (&coding);
6605
6606 if (STRINGP (string)
6607 || from >= GPT || to <= GPT)
6608 positions = unencodable_char_position (safe_chars, from, p, pend, n);
6609 else
6610 {
6611 Lisp_Object args[2];
6612
6613 args[0] = unencodable_char_position (safe_chars, from, p, GPT_ADDR, n);
6614 n -= XINT (Flength (args[0]));
6615 if (n <= 0)
6616 positions = args[0];
6617 else
6618 {
6619 args[1] = unencodable_char_position (safe_chars, GPT, GAP_END_ADDR,
6620 pend, n);
6621 positions = Fappend (2, args);
6622 }
6623 }
6624
6625 return (NILP (count) ? Fcar (positions) : positions);
6626 }
6627
6628
6629 Lisp_Object
6630 code_convert_region1 (start, end, coding_system, encodep)
6631 Lisp_Object start, end, coding_system;
6632 int encodep;
6633 {
6634 struct coding_system coding;
6635 int from, to;
6636
6637 CHECK_NUMBER_COERCE_MARKER (start);
6638 CHECK_NUMBER_COERCE_MARKER (end);
6639 CHECK_SYMBOL (coding_system);
6640
6641 validate_region (&start, &end);
6642 from = XFASTINT (start);
6643 to = XFASTINT (end);
6644
6645 if (NILP (coding_system))
6646 return make_number (to - from);
6647
6648 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
6649 error ("Invalid coding system: %s", SDATA (SYMBOL_NAME (coding_system)));
6650
6651 coding.mode |= CODING_MODE_LAST_BLOCK;
6652 coding.src_multibyte = coding.dst_multibyte
6653 = !NILP (current_buffer->enable_multibyte_characters);
6654 code_convert_region (from, CHAR_TO_BYTE (from), to, CHAR_TO_BYTE (to),
6655 &coding, encodep, 1);
6656 Vlast_coding_system_used = coding.symbol;
6657 return make_number (coding.produced_char);
6658 }
6659
6660 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
6661 3, 3, "r\nzCoding system: ",
6662 doc: /* Decode the current region from the specified coding system.
6663 When called from a program, takes three arguments:
6664 START, END, and CODING-SYSTEM. START and END are buffer positions.
6665 This function sets `last-coding-system-used' to the precise coding system
6666 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
6667 not fully specified.)
6668 It returns the length of the decoded text. */)
6669 (start, end, coding_system)
6670 Lisp_Object start, end, coding_system;
6671 {
6672 return code_convert_region1 (start, end, coding_system, 0);
6673 }
6674
6675 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
6676 3, 3, "r\nzCoding system: ",
6677 doc: /* Encode the current region into the specified coding system.
6678 When called from a program, takes three arguments:
6679 START, END, and CODING-SYSTEM. START and END are buffer positions.
6680 This function sets `last-coding-system-used' to the precise coding system
6681 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
6682 not fully specified.)
6683 It returns the length of the encoded text. */)
6684 (start, end, coding_system)
6685 Lisp_Object start, end, coding_system;
6686 {
6687 return code_convert_region1 (start, end, coding_system, 1);
6688 }
6689
6690 Lisp_Object
6691 code_convert_string1 (string, coding_system, nocopy, encodep)
6692 Lisp_Object string, coding_system, nocopy;
6693 int encodep;
6694 {
6695 struct coding_system coding;
6696
6697 CHECK_STRING (string);
6698 CHECK_SYMBOL (coding_system);
6699
6700 if (NILP (coding_system))
6701 return (NILP (nocopy) ? Fcopy_sequence (string) : string);
6702
6703 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
6704 error ("Invalid coding system: %s", SDATA (SYMBOL_NAME (coding_system)));
6705
6706 coding.mode |= CODING_MODE_LAST_BLOCK;
6707 string = (encodep
6708 ? encode_coding_string (string, &coding, !NILP (nocopy))
6709 : decode_coding_string (string, &coding, !NILP (nocopy)));
6710 Vlast_coding_system_used = coding.symbol;
6711
6712 return string;
6713 }
6714
6715 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
6716 2, 3, 0,
6717 doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
6718 Optional arg NOCOPY non-nil means it is OK to return STRING itself
6719 if the decoding operation is trivial.
6720 This function sets `last-coding-system-used' to the precise coding system
6721 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
6722 not fully specified.) */)
6723 (string, coding_system, nocopy)
6724 Lisp_Object string, coding_system, nocopy;
6725 {
6726 return code_convert_string1 (string, coding_system, nocopy, 0);
6727 }
6728
6729 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
6730 2, 3, 0,
6731 doc: /* Encode STRING to CODING-SYSTEM, and return the result.
6732 Optional arg NOCOPY non-nil means it is OK to return STRING itself
6733 if the encoding operation is trivial.
6734 This function sets `last-coding-system-used' to the precise coding system
6735 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
6736 not fully specified.) */)
6737 (string, coding_system, nocopy)
6738 Lisp_Object string, coding_system, nocopy;
6739 {
6740 return code_convert_string1 (string, coding_system, nocopy, 1);
6741 }
6742
6743 /* Encode or decode STRING according to CODING_SYSTEM.
6744 Do not set Vlast_coding_system_used.
6745
6746 This function is called only from macros DECODE_FILE and
6747 ENCODE_FILE, thus we ignore character composition. */
6748
6749 Lisp_Object
6750 code_convert_string_norecord (string, coding_system, encodep)
6751 Lisp_Object string, coding_system;
6752 int encodep;
6753 {
6754 struct coding_system coding;
6755
6756 CHECK_STRING (string);
6757 CHECK_SYMBOL (coding_system);
6758
6759 if (NILP (coding_system))
6760 return string;
6761
6762 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
6763 error ("Invalid coding system: %s", SDATA (SYMBOL_NAME (coding_system)));
6764
6765 coding.composing = COMPOSITION_DISABLED;
6766 coding.mode |= CODING_MODE_LAST_BLOCK;
6767 return (encodep
6768 ? encode_coding_string (string, &coding, 1)
6769 : decode_coding_string (string, &coding, 1));
6770 }
6771 \f
6772 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
6773 doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
6774 Return the corresponding character. */)
6775 (code)
6776 Lisp_Object code;
6777 {
6778 unsigned char c1, c2, s1, s2;
6779 Lisp_Object val;
6780
6781 CHECK_NUMBER (code);
6782 s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
6783 if (s1 == 0)
6784 {
6785 if (s2 < 0x80)
6786 XSETFASTINT (val, s2);
6787 else if (s2 >= 0xA0 || s2 <= 0xDF)
6788 XSETFASTINT (val, MAKE_CHAR (charset_katakana_jisx0201, s2, 0));
6789 else
6790 error ("Invalid Shift JIS code: %x", XFASTINT (code));
6791 }
6792 else
6793 {
6794 if ((s1 < 0x80 || (s1 > 0x9F && s1 < 0xE0) || s1 > 0xEF)
6795 || (s2 < 0x40 || s2 == 0x7F || s2 > 0xFC))
6796 error ("Invalid Shift JIS code: %x", XFASTINT (code));
6797 DECODE_SJIS (s1, s2, c1, c2);
6798 XSETFASTINT (val, MAKE_CHAR (charset_jisx0208, c1, c2));
6799 }
6800 return val;
6801 }
6802
6803 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
6804 doc: /* Encode a Japanese character CHAR to shift_jis encoding.
6805 Return the corresponding code in SJIS. */)
6806 (ch)
6807 Lisp_Object ch;
6808 {
6809 int charset, c1, c2, s1, s2;
6810 Lisp_Object val;
6811
6812 CHECK_NUMBER (ch);
6813 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
6814 if (charset == CHARSET_ASCII)
6815 {
6816 val = ch;
6817 }
6818 else if (charset == charset_jisx0208
6819 && c1 > 0x20 && c1 < 0x7F && c2 > 0x20 && c2 < 0x7F)
6820 {
6821 ENCODE_SJIS (c1, c2, s1, s2);
6822 XSETFASTINT (val, (s1 << 8) | s2);
6823 }
6824 else if (charset == charset_katakana_jisx0201
6825 && c1 > 0x20 && c2 < 0xE0)
6826 {
6827 XSETFASTINT (val, c1 | 0x80);
6828 }
6829 else
6830 error ("Can't encode to shift_jis: %d", XFASTINT (ch));
6831 return val;
6832 }
6833
6834 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
6835 doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
6836 Return the corresponding character. */)
6837 (code)
6838 Lisp_Object code;
6839 {
6840 int charset;
6841 unsigned char b1, b2, c1, c2;
6842 Lisp_Object val;
6843
6844 CHECK_NUMBER (code);
6845 b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF;
6846 if (b1 == 0)
6847 {
6848 if (b2 >= 0x80)
6849 error ("Invalid BIG5 code: %x", XFASTINT (code));
6850 val = code;
6851 }
6852 else
6853 {
6854 if ((b1 < 0xA1 || b1 > 0xFE)
6855 || (b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE))
6856 error ("Invalid BIG5 code: %x", XFASTINT (code));
6857 DECODE_BIG5 (b1, b2, charset, c1, c2);
6858 XSETFASTINT (val, MAKE_CHAR (charset, c1, c2));
6859 }
6860 return val;
6861 }
6862
6863 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
6864 doc: /* Encode the Big5 character CHAR to BIG5 coding system.
6865 Return the corresponding character code in Big5. */)
6866 (ch)
6867 Lisp_Object ch;
6868 {
6869 int charset, c1, c2, b1, b2;
6870 Lisp_Object val;
6871
6872 CHECK_NUMBER (ch);
6873 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
6874 if (charset == CHARSET_ASCII)
6875 {
6876 val = ch;
6877 }
6878 else if ((charset == charset_big5_1
6879 && (XFASTINT (ch) >= 0x250a1 && XFASTINT (ch) <= 0x271ec))
6880 || (charset == charset_big5_2
6881 && XFASTINT (ch) >= 0x290a1 && XFASTINT (ch) <= 0x2bdb2))
6882 {
6883 ENCODE_BIG5 (charset, c1, c2, b1, b2);
6884 XSETFASTINT (val, (b1 << 8) | b2);
6885 }
6886 else
6887 error ("Can't encode to Big5: %d", XFASTINT (ch));
6888 return val;
6889 }
6890 \f
6891 DEFUN ("set-terminal-coding-system-internal",
6892 Fset_terminal_coding_system_internal,
6893 Sset_terminal_coding_system_internal, 1, 1, 0,
6894 doc: /* Internal use only. */)
6895 (coding_system)
6896 Lisp_Object coding_system;
6897 {
6898 CHECK_SYMBOL (coding_system);
6899 setup_coding_system (Fcheck_coding_system (coding_system), &terminal_coding);
6900 /* We had better not send unsafe characters to terminal. */
6901 terminal_coding.flags |= CODING_FLAG_ISO_SAFE;
6902 /* Character composition should be disabled. */
6903 terminal_coding.composing = COMPOSITION_DISABLED;
6904 /* Error notification should be suppressed. */
6905 terminal_coding.suppress_error = 1;
6906 terminal_coding.src_multibyte = 1;
6907 terminal_coding.dst_multibyte = 0;
6908 return Qnil;
6909 }
6910
6911 DEFUN ("set-safe-terminal-coding-system-internal",
6912 Fset_safe_terminal_coding_system_internal,
6913 Sset_safe_terminal_coding_system_internal, 1, 1, 0,
6914 doc: /* Internal use only. */)
6915 (coding_system)
6916 Lisp_Object coding_system;
6917 {
6918 CHECK_SYMBOL (coding_system);
6919 setup_coding_system (Fcheck_coding_system (coding_system),
6920 &safe_terminal_coding);
6921 /* Character composition should be disabled. */
6922 safe_terminal_coding.composing = COMPOSITION_DISABLED;
6923 /* Error notification should be suppressed. */
6924 terminal_coding.suppress_error = 1;
6925 safe_terminal_coding.src_multibyte = 1;
6926 safe_terminal_coding.dst_multibyte = 0;
6927 return Qnil;
6928 }
6929
6930 DEFUN ("terminal-coding-system",
6931 Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
6932 doc: /* Return coding system specified for terminal output. */)
6933 ()
6934 {
6935 return terminal_coding.symbol;
6936 }
6937
6938 DEFUN ("set-keyboard-coding-system-internal",
6939 Fset_keyboard_coding_system_internal,
6940 Sset_keyboard_coding_system_internal, 1, 1, 0,
6941 doc: /* Internal use only. */)
6942 (coding_system)
6943 Lisp_Object coding_system;
6944 {
6945 CHECK_SYMBOL (coding_system);
6946 setup_coding_system (Fcheck_coding_system (coding_system), &keyboard_coding);
6947 /* Character composition should be disabled. */
6948 keyboard_coding.composing = COMPOSITION_DISABLED;
6949 return Qnil;
6950 }
6951
6952 DEFUN ("keyboard-coding-system",
6953 Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
6954 doc: /* Return coding system specified for decoding keyboard input. */)
6955 ()
6956 {
6957 return keyboard_coding.symbol;
6958 }
6959
6960 \f
6961 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
6962 Sfind_operation_coding_system, 1, MANY, 0,
6963 doc: /* Choose a coding system for an operation based on the target name.
6964 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
6965 DECODING-SYSTEM is the coding system to use for decoding
6966 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
6967 for encoding (in case OPERATION does encoding).
6968
6969 The first argument OPERATION specifies an I/O primitive:
6970 For file I/O, `insert-file-contents' or `write-region'.
6971 For process I/O, `call-process', `call-process-region', or `start-process'.
6972 For network I/O, `open-network-stream'.
6973
6974 The remaining arguments should be the same arguments that were passed
6975 to the primitive. Depending on which primitive, one of those arguments
6976 is selected as the TARGET. For example, if OPERATION does file I/O,
6977 whichever argument specifies the file name is TARGET.
6978
6979 TARGET has a meaning which depends on OPERATION:
6980 For file I/O, TARGET is a file name.
6981 For process I/O, TARGET is a process name.
6982 For network I/O, TARGET is a service name or a port number
6983
6984 This function looks up what specified for TARGET in,
6985 `file-coding-system-alist', `process-coding-system-alist',
6986 or `network-coding-system-alist' depending on OPERATION.
6987 They may specify a coding system, a cons of coding systems,
6988 or a function symbol to call.
6989 In the last case, we call the function with one argument,
6990 which is a list of all the arguments given to this function.
6991
6992 usage: (find-operation-coding-system OPERATION ARGUMENTS ...) */)
6993 (nargs, args)
6994 int nargs;
6995 Lisp_Object *args;
6996 {
6997 Lisp_Object operation, target_idx, target, val;
6998 register Lisp_Object chain;
6999
7000 if (nargs < 2)
7001 error ("Too few arguments");
7002 operation = args[0];
7003 if (!SYMBOLP (operation)
7004 || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
7005 error ("Invalid first argument");
7006 if (nargs < 1 + XINT (target_idx))
7007 error ("Too few arguments for operation: %s",
7008 SDATA (SYMBOL_NAME (operation)));
7009 /* For write-region, if the 6th argument (i.e. VISIT, the 5th
7010 argument to write-region) is string, it must be treated as a
7011 target file name. */
7012 if (EQ (operation, Qwrite_region)
7013 && nargs > 5
7014 && STRINGP (args[5]))
7015 target_idx = make_number (4);
7016 target = args[XINT (target_idx) + 1];
7017 if (!(STRINGP (target)
7018 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
7019 error ("Invalid argument %d", XINT (target_idx) + 1);
7020
7021 chain = ((EQ (operation, Qinsert_file_contents)
7022 || EQ (operation, Qwrite_region))
7023 ? Vfile_coding_system_alist
7024 : (EQ (operation, Qopen_network_stream)
7025 ? Vnetwork_coding_system_alist
7026 : Vprocess_coding_system_alist));
7027 if (NILP (chain))
7028 return Qnil;
7029
7030 for (; CONSP (chain); chain = XCDR (chain))
7031 {
7032 Lisp_Object elt;
7033 elt = XCAR (chain);
7034
7035 if (CONSP (elt)
7036 && ((STRINGP (target)
7037 && STRINGP (XCAR (elt))
7038 && fast_string_match (XCAR (elt), target) >= 0)
7039 || (INTEGERP (target) && EQ (target, XCAR (elt)))))
7040 {
7041 val = XCDR (elt);
7042 /* Here, if VAL is both a valid coding system and a valid
7043 function symbol, we return VAL as a coding system. */
7044 if (CONSP (val))
7045 return val;
7046 if (! SYMBOLP (val))
7047 return Qnil;
7048 if (! NILP (Fcoding_system_p (val)))
7049 return Fcons (val, val);
7050 if (! NILP (Ffboundp (val)))
7051 {
7052 val = call1 (val, Flist (nargs, args));
7053 if (CONSP (val))
7054 return val;
7055 if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
7056 return Fcons (val, val);
7057 }
7058 return Qnil;
7059 }
7060 }
7061 return Qnil;
7062 }
7063
7064 DEFUN ("update-coding-systems-internal", Fupdate_coding_systems_internal,
7065 Supdate_coding_systems_internal, 0, 0, 0,
7066 doc: /* Update internal database for ISO2022 and CCL based coding systems.
7067 When values of any coding categories are changed, you must
7068 call this function. */)
7069 ()
7070 {
7071 int i;
7072
7073 for (i = CODING_CATEGORY_IDX_EMACS_MULE; i < CODING_CATEGORY_IDX_MAX; i++)
7074 {
7075 Lisp_Object val;
7076
7077 val = SYMBOL_VALUE (XVECTOR (Vcoding_category_table)->contents[i]);
7078 if (!NILP (val))
7079 {
7080 if (! coding_system_table[i])
7081 coding_system_table[i] = ((struct coding_system *)
7082 xmalloc (sizeof (struct coding_system)));
7083 setup_coding_system (val, coding_system_table[i]);
7084 }
7085 else if (coding_system_table[i])
7086 {
7087 xfree (coding_system_table[i]);
7088 coding_system_table[i] = NULL;
7089 }
7090 }
7091
7092 return Qnil;
7093 }
7094
7095 DEFUN ("set-coding-priority-internal", Fset_coding_priority_internal,
7096 Sset_coding_priority_internal, 0, 0, 0,
7097 doc: /* Update internal database for the current value of `coding-category-list'.
7098 This function is internal use only. */)
7099 ()
7100 {
7101 int i = 0, idx;
7102 Lisp_Object val;
7103
7104 val = Vcoding_category_list;
7105
7106 while (CONSP (val) && i < CODING_CATEGORY_IDX_MAX)
7107 {
7108 if (! SYMBOLP (XCAR (val)))
7109 break;
7110 idx = XFASTINT (Fget (XCAR (val), Qcoding_category_index));
7111 if (idx >= CODING_CATEGORY_IDX_MAX)
7112 break;
7113 coding_priorities[i++] = (1 << idx);
7114 val = XCDR (val);
7115 }
7116 /* If coding-category-list is valid and contains all coding
7117 categories, `i' should be CODING_CATEGORY_IDX_MAX now. If not,
7118 the following code saves Emacs from crashing. */
7119 while (i < CODING_CATEGORY_IDX_MAX)
7120 coding_priorities[i++] = CODING_CATEGORY_MASK_RAW_TEXT;
7121
7122 return Qnil;
7123 }
7124
7125 #endif /* emacs */
7126
7127 \f
7128 /*** 9. Post-amble ***/
7129
7130 void
7131 init_coding_once ()
7132 {
7133 int i;
7134
7135 /* Emacs' internal format specific initialize routine. */
7136 for (i = 0; i <= 0x20; i++)
7137 emacs_code_class[i] = EMACS_control_code;
7138 emacs_code_class[0x0A] = EMACS_linefeed_code;
7139 emacs_code_class[0x0D] = EMACS_carriage_return_code;
7140 for (i = 0x21 ; i < 0x7F; i++)
7141 emacs_code_class[i] = EMACS_ascii_code;
7142 emacs_code_class[0x7F] = EMACS_control_code;
7143 for (i = 0x80; i < 0xFF; i++)
7144 emacs_code_class[i] = EMACS_invalid_code;
7145 emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
7146 emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
7147 emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
7148 emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;
7149
7150 /* ISO2022 specific initialize routine. */
7151 for (i = 0; i < 0x20; i++)
7152 iso_code_class[i] = ISO_control_0;
7153 for (i = 0x21; i < 0x7F; i++)
7154 iso_code_class[i] = ISO_graphic_plane_0;
7155 for (i = 0x80; i < 0xA0; i++)
7156 iso_code_class[i] = ISO_control_1;
7157 for (i = 0xA1; i < 0xFF; i++)
7158 iso_code_class[i] = ISO_graphic_plane_1;
7159 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
7160 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
7161 iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
7162 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
7163 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
7164 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
7165 iso_code_class[ISO_CODE_ESC] = ISO_escape;
7166 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
7167 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
7168 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
7169
7170 setup_coding_system (Qnil, &keyboard_coding);
7171 setup_coding_system (Qnil, &terminal_coding);
7172 setup_coding_system (Qnil, &safe_terminal_coding);
7173 setup_coding_system (Qnil, &default_buffer_file_coding);
7174
7175 bzero (coding_system_table, sizeof coding_system_table);
7176
7177 bzero (ascii_skip_code, sizeof ascii_skip_code);
7178 for (i = 0; i < 128; i++)
7179 ascii_skip_code[i] = 1;
7180
7181 #if defined (MSDOS) || defined (WINDOWSNT)
7182 system_eol_type = CODING_EOL_CRLF;
7183 #else
7184 system_eol_type = CODING_EOL_LF;
7185 #endif
7186
7187 inhibit_pre_post_conversion = 0;
7188 }
7189
7190 #ifdef emacs
7191
7192 void
7193 syms_of_coding ()
7194 {
7195 Qtarget_idx = intern ("target-idx");
7196 staticpro (&Qtarget_idx);
7197
7198 Qcoding_system_history = intern ("coding-system-history");
7199 staticpro (&Qcoding_system_history);
7200 Fset (Qcoding_system_history, Qnil);
7201
7202 /* Target FILENAME is the first argument. */
7203 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
7204 /* Target FILENAME is the third argument. */
7205 Fput (Qwrite_region, Qtarget_idx, make_number (2));
7206
7207 Qcall_process = intern ("call-process");
7208 staticpro (&Qcall_process);
7209 /* Target PROGRAM is the first argument. */
7210 Fput (Qcall_process, Qtarget_idx, make_number (0));
7211
7212 Qcall_process_region = intern ("call-process-region");
7213 staticpro (&Qcall_process_region);
7214 /* Target PROGRAM is the third argument. */
7215 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
7216
7217 Qstart_process = intern ("start-process");
7218 staticpro (&Qstart_process);
7219 /* Target PROGRAM is the third argument. */
7220 Fput (Qstart_process, Qtarget_idx, make_number (2));
7221
7222 Qopen_network_stream = intern ("open-network-stream");
7223 staticpro (&Qopen_network_stream);
7224 /* Target SERVICE is the fourth argument. */
7225 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
7226
7227 Qcoding_system = intern ("coding-system");
7228 staticpro (&Qcoding_system);
7229
7230 Qeol_type = intern ("eol-type");
7231 staticpro (&Qeol_type);
7232
7233 Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
7234 staticpro (&Qbuffer_file_coding_system);
7235
7236 Qpost_read_conversion = intern ("post-read-conversion");
7237 staticpro (&Qpost_read_conversion);
7238
7239 Qpre_write_conversion = intern ("pre-write-conversion");
7240 staticpro (&Qpre_write_conversion);
7241
7242 Qno_conversion = intern ("no-conversion");
7243 staticpro (&Qno_conversion);
7244
7245 Qundecided = intern ("undecided");
7246 staticpro (&Qundecided);
7247
7248 Qcoding_system_p = intern ("coding-system-p");
7249 staticpro (&Qcoding_system_p);
7250
7251 Qcoding_system_error = intern ("coding-system-error");
7252 staticpro (&Qcoding_system_error);
7253
7254 Fput (Qcoding_system_error, Qerror_conditions,
7255 Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
7256 Fput (Qcoding_system_error, Qerror_message,
7257 build_string ("Invalid coding system"));
7258
7259 Qcoding_category = intern ("coding-category");
7260 staticpro (&Qcoding_category);
7261 Qcoding_category_index = intern ("coding-category-index");
7262 staticpro (&Qcoding_category_index);
7263
7264 Vcoding_category_table
7265 = Fmake_vector (make_number (CODING_CATEGORY_IDX_MAX), Qnil);
7266 staticpro (&Vcoding_category_table);
7267 {
7268 int i;
7269 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
7270 {
7271 XVECTOR (Vcoding_category_table)->contents[i]
7272 = intern (coding_category_name[i]);
7273 Fput (XVECTOR (Vcoding_category_table)->contents[i],
7274 Qcoding_category_index, make_number (i));
7275 }
7276 }
7277
7278 Qtranslation_table = intern ("translation-table");
7279 staticpro (&Qtranslation_table);
7280 Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (1));
7281
7282 Qtranslation_table_id = intern ("translation-table-id");
7283 staticpro (&Qtranslation_table_id);
7284
7285 Qtranslation_table_for_decode = intern ("translation-table-for-decode");
7286 staticpro (&Qtranslation_table_for_decode);
7287
7288 Qtranslation_table_for_encode = intern ("translation-table-for-encode");
7289 staticpro (&Qtranslation_table_for_encode);
7290
7291 Qsafe_chars = intern ("safe-chars");
7292 staticpro (&Qsafe_chars);
7293
7294 Qchar_coding_system = intern ("char-coding-system");
7295 staticpro (&Qchar_coding_system);
7296
7297 /* Intern this now in case it isn't already done.
7298 Setting this variable twice is harmless.
7299 But don't staticpro it here--that is done in alloc.c. */
7300 Qchar_table_extra_slots = intern ("char-table-extra-slots");
7301 Fput (Qsafe_chars, Qchar_table_extra_slots, make_number (0));
7302 Fput (Qchar_coding_system, Qchar_table_extra_slots, make_number (2));
7303
7304 Qvalid_codes = intern ("valid-codes");
7305 staticpro (&Qvalid_codes);
7306
7307 Qemacs_mule = intern ("emacs-mule");
7308 staticpro (&Qemacs_mule);
7309
7310 Qraw_text = intern ("raw-text");
7311 staticpro (&Qraw_text);
7312
7313 defsubr (&Scoding_system_p);
7314 defsubr (&Sread_coding_system);
7315 defsubr (&Sread_non_nil_coding_system);
7316 defsubr (&Scheck_coding_system);
7317 defsubr (&Sdetect_coding_region);
7318 defsubr (&Sdetect_coding_string);
7319 defsubr (&Sfind_coding_systems_region_internal);
7320 defsubr (&Sunencodable_char_position);
7321 defsubr (&Sdecode_coding_region);
7322 defsubr (&Sencode_coding_region);
7323 defsubr (&Sdecode_coding_string);
7324 defsubr (&Sencode_coding_string);
7325 defsubr (&Sdecode_sjis_char);
7326 defsubr (&Sencode_sjis_char);
7327 defsubr (&Sdecode_big5_char);
7328 defsubr (&Sencode_big5_char);
7329 defsubr (&Sset_terminal_coding_system_internal);
7330 defsubr (&Sset_safe_terminal_coding_system_internal);
7331 defsubr (&Sterminal_coding_system);
7332 defsubr (&Sset_keyboard_coding_system_internal);
7333 defsubr (&Skeyboard_coding_system);
7334 defsubr (&Sfind_operation_coding_system);
7335 defsubr (&Supdate_coding_systems_internal);
7336 defsubr (&Sset_coding_priority_internal);
7337
7338 DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
7339 doc: /* List of coding systems.
7340
7341 Do not alter the value of this variable manually. This variable should be
7342 updated by the functions `make-coding-system' and
7343 `define-coding-system-alias'. */);
7344 Vcoding_system_list = Qnil;
7345
7346 DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
7347 doc: /* Alist of coding system names.
7348 Each element is one element list of coding system name.
7349 This variable is given to `completing-read' as TABLE argument.
7350
7351 Do not alter the value of this variable manually. This variable should be
7352 updated by the functions `make-coding-system' and
7353 `define-coding-system-alias'. */);
7354 Vcoding_system_alist = Qnil;
7355
7356 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
7357 doc: /* List of coding-categories (symbols) ordered by priority.
7358
7359 On detecting a coding system, Emacs tries code detection algorithms
7360 associated with each coding-category one by one in this order. When
7361 one algorithm agrees with a byte sequence of source text, the coding
7362 system bound to the corresponding coding-category is selected. */);
7363 {
7364 int i;
7365
7366 Vcoding_category_list = Qnil;
7367 for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
7368 Vcoding_category_list
7369 = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
7370 Vcoding_category_list);
7371 }
7372
7373 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
7374 doc: /* Specify the coding system for read operations.
7375 It is useful to bind this variable with `let', but do not set it globally.
7376 If the value is a coding system, it is used for decoding on read operation.
7377 If not, an appropriate element is used from one of the coding system alists:
7378 There are three such tables, `file-coding-system-alist',
7379 `process-coding-system-alist', and `network-coding-system-alist'. */);
7380 Vcoding_system_for_read = Qnil;
7381
7382 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
7383 doc: /* Specify the coding system for write operations.
7384 Programs bind this variable with `let', but you should not set it globally.
7385 If the value is a coding system, it is used for encoding of output,
7386 when writing it to a file and when sending it to a file or subprocess.
7387
7388 If this does not specify a coding system, an appropriate element
7389 is used from one of the coding system alists:
7390 There are three such tables, `file-coding-system-alist',
7391 `process-coding-system-alist', and `network-coding-system-alist'.
7392 For output to files, if the above procedure does not specify a coding system,
7393 the value of `buffer-file-coding-system' is used. */);
7394 Vcoding_system_for_write = Qnil;
7395
7396 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
7397 doc: /* Coding system used in the latest file or process I/O. */);
7398 Vlast_coding_system_used = Qnil;
7399
7400 DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
7401 doc: /* *Non-nil means always inhibit code conversion of end-of-line format.
7402 See info node `Coding Systems' and info node `Text and Binary' concerning
7403 such conversion. */);
7404 inhibit_eol_conversion = 0;
7405
7406 DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
7407 doc: /* Non-nil means process buffer inherits coding system of process output.
7408 Bind it to t if the process output is to be treated as if it were a file
7409 read from some filesystem. */);
7410 inherit_process_coding_system = 0;
7411
7412 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
7413 doc: /* Alist to decide a coding system to use for a file I/O operation.
7414 The format is ((PATTERN . VAL) ...),
7415 where PATTERN is a regular expression matching a file name,
7416 VAL is a coding system, a cons of coding systems, or a function symbol.
7417 If VAL is a coding system, it is used for both decoding and encoding
7418 the file contents.
7419 If VAL is a cons of coding systems, the car part is used for decoding,
7420 and the cdr part is used for encoding.
7421 If VAL is a function symbol, the function must return a coding system
7422 or a cons of coding systems which are used as above. The function gets
7423 the arguments with which `find-operation-coding-system' was called.
7424
7425 See also the function `find-operation-coding-system'
7426 and the variable `auto-coding-alist'. */);
7427 Vfile_coding_system_alist = Qnil;
7428
7429 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
7430 doc: /* Alist to decide a coding system to use for a process I/O operation.
7431 The format is ((PATTERN . VAL) ...),
7432 where PATTERN is a regular expression matching a program name,
7433 VAL is a coding system, a cons of coding systems, or a function symbol.
7434 If VAL is a coding system, it is used for both decoding what received
7435 from the program and encoding what sent to the program.
7436 If VAL is a cons of coding systems, the car part is used for decoding,
7437 and the cdr part is used for encoding.
7438 If VAL is a function symbol, the function must return a coding system
7439 or a cons of coding systems which are used as above.
7440
7441 See also the function `find-operation-coding-system'. */);
7442 Vprocess_coding_system_alist = Qnil;
7443
7444 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
7445 doc: /* Alist to decide a coding system to use for a network I/O operation.
7446 The format is ((PATTERN . VAL) ...),
7447 where PATTERN is a regular expression matching a network service name
7448 or is a port number to connect to,
7449 VAL is a coding system, a cons of coding systems, or a function symbol.
7450 If VAL is a coding system, it is used for both decoding what received
7451 from the network stream and encoding what sent to the network stream.
7452 If VAL is a cons of coding systems, the car part is used for decoding,
7453 and the cdr part is used for encoding.
7454 If VAL is a function symbol, the function must return a coding system
7455 or a cons of coding systems which are used as above.
7456
7457 See also the function `find-operation-coding-system'. */);
7458 Vnetwork_coding_system_alist = Qnil;
7459
7460 DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
7461 doc: /* Coding system to use with system messages.
7462 Also used for decoding keyboard input on X Window system. */);
7463 Vlocale_coding_system = Qnil;
7464
7465 /* The eol mnemonics are reset in startup.el system-dependently. */
7466 DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
7467 doc: /* *String displayed in mode line for UNIX-like (LF) end-of-line format. */);
7468 eol_mnemonic_unix = build_string (":");
7469
7470 DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
7471 doc: /* *String displayed in mode line for DOS-like (CRLF) end-of-line format. */);
7472 eol_mnemonic_dos = build_string ("\\");
7473
7474 DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
7475 doc: /* *String displayed in mode line for MAC-like (CR) end-of-line format. */);
7476 eol_mnemonic_mac = build_string ("/");
7477
7478 DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
7479 doc: /* *String displayed in mode line when end-of-line format is not yet determined. */);
7480 eol_mnemonic_undecided = build_string (":");
7481
7482 DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
7483 doc: /* *Non-nil enables character translation while encoding and decoding. */);
7484 Venable_character_translation = Qt;
7485
7486 DEFVAR_LISP ("standard-translation-table-for-decode",
7487 &Vstandard_translation_table_for_decode,
7488 doc: /* Table for translating characters while decoding. */);
7489 Vstandard_translation_table_for_decode = Qnil;
7490
7491 DEFVAR_LISP ("standard-translation-table-for-encode",
7492 &Vstandard_translation_table_for_encode,
7493 doc: /* Table for translating characters while encoding. */);
7494 Vstandard_translation_table_for_encode = Qnil;
7495
7496 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
7497 doc: /* Alist of charsets vs revision numbers.
7498 While encoding, if a charset (car part of an element) is found,
7499 designate it with the escape sequence identifying revision (cdr part of the element). */);
7500 Vcharset_revision_alist = Qnil;
7501
7502 DEFVAR_LISP ("default-process-coding-system",
7503 &Vdefault_process_coding_system,
7504 doc: /* Cons of coding systems used for process I/O by default.
7505 The car part is used for decoding a process output,
7506 the cdr part is used for encoding a text to be sent to a process. */);
7507 Vdefault_process_coding_system = Qnil;
7508
7509 DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
7510 doc: /* Table of extra Latin codes in the range 128..159 (inclusive).
7511 This is a vector of length 256.
7512 If Nth element is non-nil, the existence of code N in a file
7513 \(or output of subprocess) doesn't prevent it to be detected as
7514 a coding system of ISO 2022 variant which has a flag
7515 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
7516 or reading output of a subprocess.
7517 Only 128th through 159th elements has a meaning. */);
7518 Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
7519
7520 DEFVAR_LISP ("select-safe-coding-system-function",
7521 &Vselect_safe_coding_system_function,
7522 doc: /* Function to call to select safe coding system for encoding a text.
7523
7524 If set, this function is called to force a user to select a proper
7525 coding system which can encode the text in the case that a default
7526 coding system used in each operation can't encode the text.
7527
7528 The default value is `select-safe-coding-system' (which see). */);
7529 Vselect_safe_coding_system_function = Qnil;
7530
7531 DEFVAR_LISP ("char-coding-system-table", &Vchar_coding_system_table,
7532 doc: /* Char-table containing safe coding systems of each characters.
7533 Each element doesn't include such generic coding systems that can
7534 encode any characters. They are in the first extra slot. */);
7535 Vchar_coding_system_table = Fmake_char_table (Qchar_coding_system, Qnil);
7536
7537 DEFVAR_BOOL ("inhibit-iso-escape-detection",
7538 &inhibit_iso_escape_detection,
7539 doc: /* If non-nil, Emacs ignores ISO2022's escape sequence on code detection.
7540
7541 By default, on reading a file, Emacs tries to detect how the text is
7542 encoded. This code detection is sensitive to escape sequences. If
7543 the sequence is valid as ISO2022, the code is determined as one of
7544 the ISO2022 encodings, and the file is decoded by the corresponding
7545 coding system (e.g. `iso-2022-7bit').
7546
7547 However, there may be a case that you want to read escape sequences in
7548 a file as is. In such a case, you can set this variable to non-nil.
7549 Then, as the code detection ignores any escape sequences, no file is
7550 detected as encoded in some ISO2022 encoding. The result is that all
7551 escape sequences become visible in a buffer.
7552
7553 The default value is nil, and it is strongly recommended not to change
7554 it. That is because many Emacs Lisp source files that contain
7555 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
7556 in Emacs's distribution, and they won't be decoded correctly on
7557 reading if you suppress escape sequence detection.
7558
7559 The other way to read escape sequences in a file without decoding is
7560 to explicitly specify some coding system that doesn't use ISO2022's
7561 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument]. */);
7562 inhibit_iso_escape_detection = 0;
7563 }
7564
7565 char *
7566 emacs_strerror (error_number)
7567 int error_number;
7568 {
7569 char *str;
7570
7571 synchronize_system_messages_locale ();
7572 str = strerror (error_number);
7573
7574 if (! NILP (Vlocale_coding_system))
7575 {
7576 Lisp_Object dec = code_convert_string_norecord (build_string (str),
7577 Vlocale_coding_system,
7578 0);
7579 str = (char *) SDATA (dec);
7580 }
7581
7582 return str;
7583 }
7584
7585 #endif /* emacs */
7586