(openp): Initialized encoded_fn before GCPRO it.
[bpt/emacs.git] / src / coding.c
... / ...
CommitLineData
1/* Coding system handler (conversion, detection, etc.).
2 Copyright (C) 2001, 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
3 Copyright (C) 1995, 1997, 1998, 2002, 2003, 2004, 2005
4 National Institute of Advanced Industrial Science and Technology (AIST)
5 Registration Number H14PRO021
6
7This file is part of GNU Emacs.
8
9GNU Emacs is free software; you can redistribute it and/or modify
10it under the terms of the GNU General Public License as published by
11the Free Software Foundation; either version 2, or (at your option)
12any later version.
13
14GNU Emacs is distributed in the hope that it will be useful,
15but WITHOUT ANY WARRANTY; without even the implied warranty of
16MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17GNU General Public License for more details.
18
19You should have received a copy of the GNU General Public License
20along with GNU Emacs; see the file COPYING. If not, write to
21the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
22Boston, MA 02110-1301, USA. */
23
24/*** TABLE OF CONTENTS ***
25
26 0. General comments
27 1. Preamble
28 2. Emacs' internal format (emacs-mule) handlers
29 3. ISO2022 handlers
30 4. Shift-JIS and BIG5 handlers
31 5. CCL handlers
32 6. End-of-line handlers
33 7. C library functions
34 8. Emacs Lisp library functions
35 9. Post-amble
36
37*/
38
39/*** 0. General comments ***/
40
41
42/*** GENERAL NOTE on CODING SYSTEMS ***
43
44 A coding system is an encoding mechanism for one or more character
45 sets. Here's a list of coding systems which Emacs can handle. When
46 we say "decode", it means converting some other coding system to
47 Emacs' internal format (emacs-mule), and when we say "encode",
48 it means converting the coding system emacs-mule to some other
49 coding system.
50
51 0. Emacs' internal format (emacs-mule)
52
53 Emacs itself holds a multi-lingual character in buffers and strings
54 in a special format. Details are described in section 2.
55
56 1. ISO2022
57
58 The most famous coding system for multiple character sets. X's
59 Compound Text, various EUCs (Extended Unix Code), and coding
60 systems used in Internet communication such as ISO-2022-JP are
61 all variants of ISO2022. Details are described in section 3.
62
63 2. SJIS (or Shift-JIS or MS-Kanji-Code)
64
65 A coding system to encode character sets: ASCII, JISX0201, and
66 JISX0208. Widely used for PC's in Japan. Details are described in
67 section 4.
68
69 3. BIG5
70
71 A coding system to encode the character sets ASCII and Big5. Widely
72 used for Chinese (mainly in Taiwan and Hong Kong). Details are
73 described in section 4. In this file, when we write "BIG5"
74 (all uppercase), we mean the coding system, and when we write
75 "Big5" (capitalized), we mean the character set.
76
77 4. Raw text
78
79 A coding system for text containing random 8-bit code. Emacs does
80 no code conversion on such text except for end-of-line format.
81
82 5. Other
83
84 If a user wants to read/write text encoded in a coding system not
85 listed above, he can supply a decoder and an encoder for it as CCL
86 (Code Conversion Language) programs. Emacs executes the CCL program
87 while reading/writing.
88
89 Emacs represents a coding system by a Lisp symbol that has a property
90 `coding-system'. But, before actually using the coding system, the
91 information about it is set in a structure of type `struct
92 coding_system' for rapid processing. See section 6 for more details.
93
94*/
95
96/*** GENERAL NOTES on END-OF-LINE FORMAT ***
97
98 How end-of-line of text is encoded depends on the operating system.
99 For instance, Unix's format is just one byte of `line-feed' code,
100 whereas DOS's format is two-byte sequence of `carriage-return' and
101 `line-feed' codes. MacOS's format is usually one byte of
102 `carriage-return'.
103
104 Since text character encoding and end-of-line encoding are
105 independent, any coding system described above can have any
106 end-of-line format. So Emacs has information about end-of-line
107 format in each coding-system. See section 6 for more details.
108
109*/
110
111/*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
112
113 These functions check if a text between SRC and SRC_END is encoded
114 in the coding system category XXX. Each returns an integer value in
115 which appropriate flag bits for the category XXX are set. The flag
116 bits are defined in macros CODING_CATEGORY_MASK_XXX. Below is the
117 template for these functions. If MULTIBYTEP is nonzero, 8-bit codes
118 of the range 0x80..0x9F are in multibyte form. */
119#if 0
120int
121detect_coding_emacs_mule (src, src_end, multibytep)
122 unsigned char *src, *src_end;
123 int multibytep;
124{
125 ...
126}
127#endif
128
129/*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
130
131 These functions decode SRC_BYTES length of unibyte text at SOURCE
132 encoded in CODING to Emacs' internal format. The resulting
133 multibyte text goes to a place pointed to by DESTINATION, the length
134 of which should not exceed DST_BYTES.
135
136 These functions set the information about original and decoded texts
137 in the members `produced', `produced_char', `consumed', and
138 `consumed_char' of the structure *CODING. They also set the member
139 `result' to one of CODING_FINISH_XXX indicating how the decoding
140 finished.
141
142 DST_BYTES zero means that the source area and destination area are
143 overlapped, which means that we can produce a decoded text until it
144 reaches the head of the not-yet-decoded source text.
145
146 Below is a template for these functions. */
147#if 0
148static void
149decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
150 struct coding_system *coding;
151 const unsigned char *source;
152 unsigned char *destination;
153 int src_bytes, dst_bytes;
154{
155 ...
156}
157#endif
158
159/*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
160
161 These functions encode SRC_BYTES length text at SOURCE from Emacs'
162 internal multibyte format to CODING. The resulting unibyte text
163 goes to a place pointed to by DESTINATION, the length of which
164 should not exceed DST_BYTES.
165
166 These functions set the information about original and encoded texts
167 in the members `produced', `produced_char', `consumed', and
168 `consumed_char' of the structure *CODING. They also set the member
169 `result' to one of CODING_FINISH_XXX indicating how the encoding
170 finished.
171
172 DST_BYTES zero means that the source area and destination area are
173 overlapped, which means that we can produce encoded text until it
174 reaches the head of the not-yet-encoded source text.
175
176 Below is a template for these functions. */
177#if 0
178static void
179encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
180 struct coding_system *coding;
181 unsigned char *source, *destination;
182 int src_bytes, dst_bytes;
183{
184 ...
185}
186#endif
187
188/*** COMMONLY USED MACROS ***/
189
190/* The following two macros ONE_MORE_BYTE and TWO_MORE_BYTES safely
191 get one and two bytes from the source text respectively.
192 If there are not enough bytes in the source, they jump to
193 `label_end_of_loop'. The caller should set variables `coding',
194 `src' and `src_end' to appropriate pointer in advance. These
195 macros are called from decoding routines `decode_coding_XXX', thus
196 it is assumed that the source text is unibyte. */
197
/* Fetch the next source byte into C1 and advance `src'.  When the
   source is exhausted, record CODING_FINISH_INSUFFICIENT_SRC in
   `coding->result' and leave through `label_end_of_loop'.  */
#define ONE_MORE_BYTE(c1)                                       \
  do                                                            \
    {                                                           \
      if (src < src_end)                                        \
        c1 = *src++;                                            \
      else                                                      \
        {                                                       \
          coding->result = CODING_FINISH_INSUFFICIENT_SRC;      \
          goto label_end_of_loop;                               \
        }                                                       \
    }                                                           \
  while (0)
207
/* Fetch the next two source bytes into C1 and C2 (in that order) and
   advance `src' past them.  When fewer than two bytes remain, record
   CODING_FINISH_INSUFFICIENT_SRC in `coding->result' and leave
   through `label_end_of_loop' without consuming anything.  */
#define TWO_MORE_BYTES(c1, c2)                                  \
  do                                                            \
    {                                                           \
      if (src_end - src < 2)                                    \
        {                                                       \
          coding->result = CODING_FINISH_INSUFFICIENT_SRC;      \
          goto label_end_of_loop;                               \
        }                                                       \
      c1 = *src++;                                              \
      c2 = *src++;                                              \
    }                                                           \
  while (0)
218
219
/* Like ONE_MORE_BYTE, but 8-bit bytes of data at SRC are in multibyte
   form if MULTIBYTEP is nonzero.  In that case a byte equal to
   LEADING_CODE_8_BIT_CONTROL is followed by (8-bit code + 0x20), and
   C1 is set to the 8-bit code itself.

   If the source ends right after the leading code, treat it like any
   other shortage of source: set `coding->result' to
   CODING_FINISH_INSUFFICIENT_SRC and jump to `label_end_of_loop'
   instead of reading the byte at SRC_END.  */

#define ONE_MORE_BYTE_CHECK_MULTIBYTE(c1, multibytep)           \
  do {                                                          \
    if (src >= src_end)                                         \
      {                                                         \
        coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
        goto label_end_of_loop;                                 \
      }                                                         \
    c1 = *src++;                                                \
    if ((multibytep) && c1 == LEADING_CODE_8_BIT_CONTROL)       \
      {                                                         \
        /* The trailing byte must also lie inside the source.  */ \
        if (src >= src_end)                                     \
          {                                                     \
            coding->result = CODING_FINISH_INSUFFICIENT_SRC;    \
            goto label_end_of_loop;                             \
          }                                                     \
        c1 = *src++ - 0x20;                                     \
      }                                                         \
  } while (0)
234
/* Read the next character of the source text at `src' into C and
   advance `src' past it.  With no source left, set `coding->result'
   to CODING_FINISH_INSUFFICIENT_SRC and jump to `label_end_of_loop'.
   The caller must have `coding', `src', `src_end' and
   `translation_table' set up beforehand.

   Used by the encoding routines `encode_coding_XXX', so the source is
   taken to be multibyte text.  When coding->src_multibyte is zero and
   the bytes at `src' do not form a multibyte sequence, the single
   byte at `src' is taken as the character.  A non-nil
   `translation_table' is applied to the character before it is
   handed back.  */

#define ONE_MORE_CHAR(c)                                                \
  do                                                                    \
    {                                                                   \
      int len = src_end - src;                                          \
      int bytes;                                                        \
                                                                        \
      if (len <= 0)                                                     \
        {                                                               \
          coding->result = CODING_FINISH_INSUFFICIENT_SRC;              \
          goto label_end_of_loop;                                       \
        }                                                               \
      if (! coding->src_multibyte                                       \
          && ! UNIBYTE_STR_AS_MULTIBYTE_P (src, len, bytes))            \
        {                                                               \
          c = *src;                                                     \
          bytes = 1;                                                    \
        }                                                               \
      else                                                              \
        c = STRING_CHAR_AND_LENGTH (src, len, bytes);                   \
      if (! NILP (translation_table))                                   \
        c = translate_char (translation_table, c, -1, 0, 0);            \
      src += bytes;                                                     \
    }                                                                   \
  while (0)
263
264
/* Write the multibyte form of character C at `dst' and count it in
   coding->produced_char.  If the destination cannot hold it, set
   `coding->result' to CODING_FINISH_INSUFFICIENT_DST and jump to
   `label_end_of_loop'.  The destination limit is `dst_end' when
   `dst_bytes' is nonzero, otherwise the current source position
   `src'.

   While a composition is in progress the character is handled
   according to coding->composing: for every method other than
   COMPOSITION_RELATIVE it is recorded as a component in
   coding->cmp_data, and only for COMPOSITION_RELATIVE and
   COMPOSITION_WITH_RULE is it also written to `dst'.

   This macro is used in decoding routines.  */

#define EMIT_CHAR(c)                                                    \
  do                                                                    \
    {                                                                   \
      if (! COMPOSING_P (coding)                                        \
          || coding->composing == COMPOSITION_RELATIVE                  \
          || coding->composing == COMPOSITION_WITH_RULE)                \
        {                                                               \
          int bytes = CHAR_BYTES (c);                                   \
                                                                        \
          if ((dst_bytes ? dst_end : src) < (dst + bytes))              \
            {                                                           \
              coding->result = CODING_FINISH_INSUFFICIENT_DST;          \
              goto label_end_of_loop;                                   \
            }                                                           \
          dst += CHAR_STRING (c, dst);                                  \
          coding->produced_char++;                                      \
        }                                                               \
                                                                        \
      if (COMPOSING_P (coding)                                          \
          && coding->composing != COMPOSITION_RELATIVE)                 \
        {                                                               \
          /* Recording the component may end the composition, so the    \
             flag below is computed only afterwards.  */                \
          CODING_ADD_COMPOSITION_COMPONENT (coding, c);                 \
          coding->composition_rule_follows                              \
            = coding->composing != COMPOSITION_WITH_ALTCHARS;           \
        }                                                               \
    }                                                                   \
  while (0)
299
300
/* Store the single byte C at `dst' and advance `dst'.  If no room is
   left (the limit is `dst_end' when `dst_bytes' is nonzero, otherwise
   `src'), set `coding->result' to CODING_FINISH_INSUFFICIENT_DST and
   jump to `label_end_of_loop'.  */
#define EMIT_ONE_BYTE(c)                                        \
  do                                                            \
    {                                                           \
      if ((dst_bytes ? dst_end : src) <= dst)                   \
        {                                                       \
          coding->result = CODING_FINISH_INSUFFICIENT_DST;      \
          goto label_end_of_loop;                               \
        }                                                       \
      *dst++ = c;                                               \
    }                                                           \
  while (0)
310
/* Store the bytes C1 and C2, in that order, at `dst' and advance
   `dst' by two.  If fewer than two bytes of room are left (the limit
   is `dst_end' when `dst_bytes' is nonzero, otherwise `src'), set
   `coding->result' to CODING_FINISH_INSUFFICIENT_DST and jump to
   `label_end_of_loop' without writing anything.  */
#define EMIT_TWO_BYTES(c1, c2)                                  \
  do                                                            \
    {                                                           \
      if ((dst_bytes ? dst_end : src) < dst + 2)                \
        {                                                       \
          coding->result = CODING_FINISH_INSUFFICIENT_DST;      \
          goto label_end_of_loop;                               \
        }                                                       \
      *dst++ = c1;                                              \
      *dst++ = c2;                                              \
    }                                                           \
  while (0)
320
/* Copy the bytes in the range FROM (inclusive) to TO (exclusive) to
   `dst', advancing both `dst' and FROM; FROM must therefore be a
   modifiable pointer variable.  If the whole range does not fit (the
   limit is `dst_end' when `dst_bytes' is nonzero, otherwise `src'),
   set `coding->result' to CODING_FINISH_INSUFFICIENT_DST and jump to
   `label_end_of_loop' without copying anything.  */
#define EMIT_BYTES(from, to)                                    \
  do                                                            \
    {                                                           \
      if ((dst_bytes ? dst_end : src) < dst + (to - from))      \
        {                                                       \
          coding->result = CODING_FINISH_INSUFFICIENT_DST;      \
          goto label_end_of_loop;                               \
        }                                                       \
      for (; from < to; from++)                                 \
        *dst++ = *from;                                         \
    }                                                           \
  while (0)
331
332\f
333/*** 1. Preamble ***/
334
335#ifdef emacs
336#include <config.h>
337#endif
338
339#include <stdio.h>
340
341#ifdef emacs
342
343#include "lisp.h"
344#include "buffer.h"
345#include "charset.h"
346#include "composite.h"
347#include "ccl.h"
348#include "coding.h"
349#include "window.h"
350#include "intervals.h"
351
352#else /* not emacs */
353
354#include "mulelib.h"
355
356#endif /* not emacs */
357
358Lisp_Object Qcoding_system, Qeol_type;
359Lisp_Object Qbuffer_file_coding_system;
360Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
361Lisp_Object Qno_conversion, Qundecided;
362Lisp_Object Qcoding_system_history;
363Lisp_Object Qsafe_chars;
364Lisp_Object Qvalid_codes;
365
366extern Lisp_Object Qinsert_file_contents, Qwrite_region;
367Lisp_Object Qcall_process, Qcall_process_region;
368Lisp_Object Qstart_process, Qopen_network_stream;
369Lisp_Object Qtarget_idx;
370
371/* If a symbol has this property, evaluate the value to define the
372 symbol as a coding system. */
373Lisp_Object Qcoding_system_define_form;
374
375Lisp_Object Vselect_safe_coding_system_function;
376
377int coding_system_require_warning;
378
379/* Mnemonic string for each format of end-of-line. */
380Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
381/* Mnemonic string to indicate format of end-of-line is not yet
382 decided. */
383Lisp_Object eol_mnemonic_undecided;
384
385/* Format of end-of-line decided by system. This is CODING_EOL_LF on
386 Unix, CODING_EOL_CRLF on DOS/Windows, and CODING_EOL_CR on Mac. */
387int system_eol_type;
388
389#ifdef emacs
390
391/* Information about which coding system is safe for which chars.
392 The value has the form (GENERIC-LIST . NON-GENERIC-ALIST).
393
394 GENERIC-LIST is a list of generic coding systems which can encode
395 any characters.
396
397 NON-GENERIC-ALIST is an alist of non generic coding systems vs the
398 corresponding char table that contains safe chars. */
399Lisp_Object Vcoding_system_safe_chars;
400
401Lisp_Object Vcoding_system_list, Vcoding_system_alist;
402
403Lisp_Object Qcoding_system_p, Qcoding_system_error;
404
405/* Coding system emacs-mule and raw-text are for converting only
406 end-of-line format. */
407Lisp_Object Qemacs_mule, Qraw_text;
408
409Lisp_Object Qutf_8;
410
411/* Coding-systems are handed between Emacs Lisp programs and C internal
412 routines by the following three variables. */
413/* Coding-system for reading files and receiving data from process. */
414Lisp_Object Vcoding_system_for_read;
415/* Coding-system for writing files and sending data to process. */
416Lisp_Object Vcoding_system_for_write;
417/* Coding-system actually used in the latest I/O. */
418Lisp_Object Vlast_coding_system_used;
419
420/* A vector of length 256 which contains information about special
421 Latin codes (especially for dealing with Microsoft codes). */
422Lisp_Object Vlatin_extra_code_table;
423
424/* Flag to inhibit code conversion of end-of-line format. */
425int inhibit_eol_conversion;
426
427/* Flag to inhibit ISO2022 escape sequence detection. */
428int inhibit_iso_escape_detection;
429
430/* Flag to make buffer-file-coding-system inherit from process-coding. */
431int inherit_process_coding_system;
432
433/* Coding system to be used to encode text for terminal display. */
434struct coding_system terminal_coding;
435
436/* Coding system to be used to encode text for terminal display when
437 terminal coding system is nil. */
438struct coding_system safe_terminal_coding;
439
440/* Coding system of what is sent from terminal keyboard. */
441struct coding_system keyboard_coding;
442
443/* Default coding system to be used to write a file. */
444struct coding_system default_buffer_file_coding;
445
446Lisp_Object Vfile_coding_system_alist;
447Lisp_Object Vprocess_coding_system_alist;
448Lisp_Object Vnetwork_coding_system_alist;
449
450Lisp_Object Vlocale_coding_system;
451
452#endif /* emacs */
453
454Lisp_Object Qcoding_category, Qcoding_category_index;
455
456/* List of symbols `coding-category-xxx' ordered by priority. */
457Lisp_Object Vcoding_category_list;
458
459/* Table of coding categories (Lisp symbols). */
460Lisp_Object Vcoding_category_table;
461
462/* Table of names of symbol for each coding-category. */
/* Fifteen names are listed here; any further slots up to
   CODING_CATEGORY_IDX_MAX are left as null pointers by the
   initializer.
   NOTE(review): the entries are presumably indexed by the
   CODING_CATEGORY_IDX_XXX constants declared outside this file, in
   which case their order must match those constants -- confirm
   against coding.h before reordering.  */
463char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
464 "coding-category-emacs-mule",
465 "coding-category-sjis",
466 "coding-category-iso-7",
467 "coding-category-iso-7-tight",
468 "coding-category-iso-8-1",
469 "coding-category-iso-8-2",
470 "coding-category-iso-7-else",
471 "coding-category-iso-8-else",
472 "coding-category-ccl",
473 "coding-category-big5",
474 "coding-category-utf-8",
475 "coding-category-utf-16-be",
476 "coding-category-utf-16-le",
477 "coding-category-raw-text",
478 "coding-category-binary"
479};
480
481/* Table of pointers to coding systems corresponding to each coding
482 categories. */
483struct coding_system *coding_system_table[CODING_CATEGORY_IDX_MAX];
484
485/* Table of coding category masks. Nth element is a mask for a coding
486 category of which priority is Nth. */
487static
488int coding_priorities[CODING_CATEGORY_IDX_MAX];
489
490/* Flag to tell if we look up translation table on character code
491 conversion. */
492Lisp_Object Venable_character_translation;
493/* Standard translation table to look up on decoding (reading). */
494Lisp_Object Vstandard_translation_table_for_decode;
495/* Standard translation table to look up on encoding (writing). */
496Lisp_Object Vstandard_translation_table_for_encode;
497
498Lisp_Object Qtranslation_table;
499Lisp_Object Qtranslation_table_id;
500Lisp_Object Qtranslation_table_for_decode;
501Lisp_Object Qtranslation_table_for_encode;
502
503/* Alist of charsets vs revision number. */
504Lisp_Object Vcharset_revision_alist;
505
506/* Default coding systems used for process I/O. */
507Lisp_Object Vdefault_process_coding_system;
508
509/* Char table for translating Quail and self-inserting input. */
510Lisp_Object Vtranslation_table_for_input;
511
512/* Global flag to tell that we can't call post-read-conversion and
513 pre-write-conversion functions. Usually the value is zero, but it
514 is set to 1 temporarily while such functions are running. This is
515 to avoid infinite recursive call. */
516static int inhibit_pre_post_conversion;
517
518Lisp_Object Qchar_coding_system;
519
520/* Return `safe-chars' property of CODING_SYSTEM (symbol). Don't check
521 its validity. */
522
523Lisp_Object
524coding_safe_chars (coding_system)
525 Lisp_Object coding_system;
526{
527 Lisp_Object coding_spec, plist, safe_chars;
528
529 coding_spec = Fget (coding_system, Qcoding_system);
530 plist = XVECTOR (coding_spec)->contents[3];
531 safe_chars = Fplist_get (XVECTOR (coding_spec)->contents[3], Qsafe_chars);
532 return (CHAR_TABLE_P (safe_chars) ? safe_chars : Qt);
533}
534
/* Nonzero if character C is safe according to SAFE_CHARS, which is
   either Qt (every character is safe) or a char table whose entry
   for a safe character is non-nil.  The char table is consulted only
   when SAFE_CHARS is not Qt.  */
#define CODING_SAFE_CHAR_P(safe_chars, c)       \
  (EQ (safe_chars, Qt)                          \
   ? 1                                          \
   : !NILP (CHAR_TABLE_REF (safe_chars, c)))
537
538\f
539/*** 2. Emacs internal format (emacs-mule) handlers ***/
540
541/* Emacs' internal format for representation of multiple character
542 sets is a kind of multi-byte encoding, i.e. characters are
543 represented by variable-length sequences of one-byte codes.
544
545 ASCII characters and control characters (e.g. `tab', `newline') are
546 represented by one-byte sequences which are their ASCII codes, in
547 the range 0x00 through 0x7F.
548
549 8-bit characters of the range 0x80..0x9F are represented by
550 two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
551 code + 0x20).
552
553 8-bit characters of the range 0xA0..0xFF are represented by
554 one-byte sequences which are their 8-bit code.
555
556 The other characters are represented by a sequence of `base
557 leading-code', optional `extended leading-code', and one or two
558 `position-code's. The length of the sequence is determined by the
559 base leading-code. Leading-code takes the range 0x81 through 0x9D,
560 whereas extended leading-code and position-code take the range 0xA0
561 through 0xFF. See `charset.h' for more details about leading-code
562 and position-code.
563
564 --- CODE RANGE of Emacs' internal format ---
565 character set range
566 ------------- -----
567 ascii 0x00..0x7F
568 eight-bit-control LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
569 eight-bit-graphic 0xA0..0xFF
570 ELSE 0x81..0x9D + [0xA0..0xFF]+
571 ---------------------------------------------
572
573 As this is the internal character representation, the format is
574 usually not used externally (i.e. in a file or in a data sent to a
575 process). But, it is possible to have a text externally in this
576 format (i.e. by encoding by the coding system `emacs-mule').
577
578 In that case, a sequence of one-byte codes has a slightly different
579 form.
580
581 Firstly, all characters in eight-bit-control are represented by
582 one-byte sequences which are their 8-bit code.
583
584 Next, character composition data are represented by the byte
585 sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
586 where,
587 METHOD is 0xF0 plus one of composition method (enum
588 composition_method),
589
590 BYTES is 0xA0 plus the byte length of these composition data,
591
592 CHARS is 0xA0 plus the number of characters composed by these
593 data,
594
595 COMPONENTs are characters of multibyte form or composition
596 rules encoded by two-byte of ASCII codes.
597
598 In addition, for backward compatibility, the following formats are
599 also recognized as composition data on decoding.
600
601 0x80 MSEQ ...
602 0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
603
604 Here,
605 MSEQ is a multibyte form but in this special format:
606 ASCII: 0xA0 ASCII_CODE+0x80,
607 other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
608 RULE is a one byte code of the range 0xA0..0xF0 that
609 represents a composition rule.
610 */
611
612enum emacs_code_class_type emacs_code_class[256];
613
614/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
615 Check if a text is encoded in Emacs' internal format. If it is,
616 return CODING_CATEGORY_MASK_EMACS_MULE, else return 0. */
617
618static int
619detect_coding_emacs_mule (src, src_end, multibytep)
620 unsigned char *src, *src_end;
621 int multibytep;
622{
623 unsigned char c;
624 int composing = 0;
625 /* Dummy for ONE_MORE_BYTE. */
626 struct coding_system dummy_coding;
627 struct coding_system *coding = &dummy_coding;
628
629 while (1)
630 {
631 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
632
633 if (composing)
634 {
635 if (c < 0xA0)
636 composing = 0;
637 else if (c == 0xA0)
638 {
639 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
640 c &= 0x7F;
641 }
642 else
643 c -= 0x20;
644 }
645
646 if (c < 0x20)
647 {
648 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
649 return 0;
650 }
651 else if (c >= 0x80 && c < 0xA0)
652 {
653 if (c == 0x80)
654 /* Old leading code for a composite character. */
655 composing = 1;
656 else
657 {
658 unsigned char *src_base = src - 1;
659 int bytes;
660
661 if (!UNIBYTE_STR_AS_MULTIBYTE_P (src_base, src_end - src_base,
662 bytes))
663 return 0;
664 src = src_base + bytes;
665 }
666 }
667 }
668 label_end_of_loop:
669 return CODING_CATEGORY_MASK_EMACS_MULE;
670}
671
672
/* Open a new composition record in CODING->cmp_data.  Four slots are
   reserved at the current end of the data: slot 0 is set to -1 until
   the record is closed, slot 1 receives the starting position START
   (offset by cmp_data->char_offset), and slot 3 the composition
   METHOD.  Slot 2 is left for CODING_ADD_COMPOSITION_END.  The index
   of the record is remembered in CODING->cmp_data_start.  */

#define CODING_ADD_COMPOSITION_START(coding, start, method)     \
  do                                                            \
    {                                                           \
      struct composition_data *cmp_data = coding->cmp_data;     \
      int *data = &cmp_data->data[cmp_data->used];              \
                                                                \
      coding->cmp_data_start = cmp_data->used;                  \
      data[0] = -1;                                             \
      data[1] = cmp_data->char_offset + start;                  \
      data[3] = (int) method;                                   \
      cmp_data->used += 4;                                      \
    }                                                           \
  while (0)
685
/* Close the composition record that starts at CODING->cmp_data_start:
   store the length of the record (in slots) in its slot 0 and the
   ending position END (offset by cmp_data->char_offset) in its
   slot 2.  */

#define CODING_ADD_COMPOSITION_END(coding, end)                         \
  do                                                                    \
    {                                                                   \
      struct composition_data *cmp_data = coding->cmp_data;             \
      int *data = &cmp_data->data[coding->cmp_data_start];              \
                                                                        \
      data[0] = cmp_data->used - coding->cmp_data_start;                \
      data[2] = cmp_data->char_offset + end;                            \
    }                                                                   \
  while (0)
695
/* Append one COMPONENT (alternate character or composition rule) to
   the current composition record.  When the record thereby reaches
   COMPOSITION_DATA_MAX_BUNCH_LENGTH slots, close it at
   CODING->produced_char and switch composing off.  */

#define CODING_ADD_COMPOSITION_COMPONENT(coding, component)             \
  do                                                                    \
    {                                                                   \
      coding->cmp_data->data[coding->cmp_data->used++] = component;     \
      if (COMPOSITION_DATA_MAX_BUNCH_LENGTH                             \
          == coding->cmp_data->used - coding->cmp_data_start)           \
        {                                                               \
          CODING_ADD_COMPOSITION_END (coding, coding->produced_char);   \
          coding->composing = COMPOSITION_NO;                           \
        }                                                               \
    }                                                                   \
  while (0)
708
709
/* Yield the byte at `src' and advance `src'; if `src' has already
   reached `src_end', yield -1 and leave `src' untouched.  */

#define SAFE_ONE_MORE_BYTE() (src < src_end ? *src++ : -1)
714
715
/* Decode a character represented as a component of composition
   sequence of Emacs 20 style at SRC.  Set C to that character, store
   its multibyte form sequence at P, and set P to the end of that
   sequence.  If no valid character is found, set C to -1.

   Two component forms are accepted, as described in the section 2
   comment of this file:
     ASCII: 0xA0 ASCII_CODE+0x80
     other: LEADING_CODE+0x20 FOLLOWING-BYTE ...

   The caller must have `coding', `src' and `src_end' in scope, and P
   must be a modifiable `unsigned char *'.  */

#define DECODE_EMACS_MULE_COMPOSITION_CHAR(c, p)                        \
  do {                                                                  \
    int bytes;                                                          \
                                                                        \
    c = SAFE_ONE_MORE_BYTE ();                                          \
    if (c < 0)                                                          \
      break;                                                            \
    if (CHAR_HEAD_P (c))                                                \
      c = -1;                                                           \
    else if (c == 0xA0)                                                 \
      {                                                                 \
        c = SAFE_ONE_MORE_BYTE ();                                      \
        if (c < 0xA0)                                                   \
          c = -1;                                                       \
        else                                                            \
          {                                                             \
            /* The byte is ASCII_CODE + 0x80, so strip 0x80 (not       \
               0xA0) to recover the ASCII character.  */                \
            c -= 0x80;                                                  \
            *p++ = c;                                                   \
          }                                                             \
      }                                                                 \
    else if (BASE_LEADING_CODE_P (c - 0x20))                            \
      {                                                                 \
        unsigned char *p0 = p;                                          \
                                                                        \
        /* Undo the +0x20 shift of the leading code, then copy the     \
           following bytes that the leading code calls for.  */         \
        c -= 0x20;                                                      \
        *p++ = c;                                                       \
        bytes = BYTES_BY_CHAR_HEAD (c);                                 \
        while (--bytes)                                                 \
          {                                                             \
            c = SAFE_ONE_MORE_BYTE ();                                  \
            if (c < 0)                                                  \
              break;                                                    \
            *p++ = c;                                                   \
          }                                                             \
        if (UNIBYTE_STR_AS_MULTIBYTE_P (p0, p - p0, bytes)              \
            || (coding->flags /* We are recovering a file.  */          \
                && p0[0] == LEADING_CODE_8_BIT_CONTROL                  \
                && ! CHAR_HEAD_P (p0[1])))                              \
          c = STRING_CHAR (p0, bytes);                                  \
        else                                                            \
          c = -1;                                                       \
      }                                                                 \
    else                                                                \
      c = -1;                                                           \
  } while (0)
766
767
/* Decode a composition rule represented as a component of composition
   sequence of Emacs 20 style at SRC.  The rule is a single byte,
   0xA0 plus a value in 0..80 that combines two reference points as
   GREF * 9 + NREF.  Set C to the rule encoded by
   COMPOSITION_ENCODE_RULE, or to -1 if no valid rule is found
   (including when the source is exhausted).  The caller's variables
   `gref' and `nref' are overwritten.  */

#define DECODE_EMACS_MULE_COMPOSITION_RULE(c)           \
  do                                                    \
    {                                                   \
      c = SAFE_ONE_MORE_BYTE () - 0xA0;                 \
      if (c >= 0 && c < 81)                             \
        {                                               \
          gref = c / 9;                                 \
          nref = c % 9;                                 \
          c = COMPOSITION_ENCODE_RULE (gref, nref);     \
        }                                               \
      else                                              \
        c = -1;                                         \
    }                                                   \
  while (0)
784
785
786/* Decode composition sequence encoded by `emacs-mule' at the source
787 pointed by SRC. SRC_END is the end of source. Store information
788 of the composition in CODING->cmp_data.
789
790 For backward compatibility, decode also a composition sequence of
791 Emacs 20 style. In that case, the composition sequence contains
792 characters that should be extracted into a buffer or string. Store
793 those characters at *DESTINATION in multibyte form.
794
795 If we encounter an invalid byte sequence, return 0.
796 If we encounter an insufficient source or destination, or
797 insufficient space in CODING->cmp_data, return 1.
798 Otherwise, return consumed bytes in the source.
799
800*/
801static INLINE int
802decode_composition_emacs_mule (coding, src, src_end,
803 destination, dst_end, dst_bytes)
804 struct coding_system *coding;
805 const unsigned char *src, *src_end;
806 unsigned char **destination, *dst_end;
807 int dst_bytes;
808{
809 unsigned char *dst = *destination;
810 int method, data_len, nchars;
811 const unsigned char *src_base = src++;
812 /* Store components of composition. */
813 int component[COMPOSITION_DATA_MAX_BUNCH_LENGTH];
814 int ncomponent;
815 /* Store multibyte form of characters to be composed. This is for
816 Emacs 20 style composition sequence. */
817 unsigned char buf[MAX_COMPOSITION_COMPONENTS * MAX_MULTIBYTE_LENGTH];
818 unsigned char *bufp = buf;
819 int c, i, gref, nref;
820
821 if (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH
822 >= COMPOSITION_DATA_SIZE)
823 {
824 coding->result = CODING_FINISH_INSUFFICIENT_CMP;
825 return -1;
826 }
827
828 ONE_MORE_BYTE (c);
829 if (c - 0xF0 >= COMPOSITION_RELATIVE
830 && c - 0xF0 <= COMPOSITION_WITH_RULE_ALTCHARS)
831 {
832 int with_rule;
833
834 method = c - 0xF0;
835 with_rule = (method == COMPOSITION_WITH_RULE
836 || method == COMPOSITION_WITH_RULE_ALTCHARS);
837 ONE_MORE_BYTE (c);
838 data_len = c - 0xA0;
839 if (data_len < 4
840 || src_base + data_len > src_end)
841 return 0;
842 ONE_MORE_BYTE (c);
843 nchars = c - 0xA0;
844 if (c < 1)
845 return 0;
846 for (ncomponent = 0; src < src_base + data_len; ncomponent++)
847 {
848 /* If it is longer than this, it can't be valid. */
849 if (ncomponent >= COMPOSITION_DATA_MAX_BUNCH_LENGTH)
850 return 0;
851
852 if (ncomponent % 2 && with_rule)
853 {
854 ONE_MORE_BYTE (gref);
855 gref -= 32;
856 ONE_MORE_BYTE (nref);
857 nref -= 32;
858 c = COMPOSITION_ENCODE_RULE (gref, nref);
859 }
860 else
861 {
862 int bytes;
863 if (UNIBYTE_STR_AS_MULTIBYTE_P (src, src_end - src, bytes)
864 || (coding->flags /* We are recovering a file. */
865 && src[0] == LEADING_CODE_8_BIT_CONTROL
866 && ! CHAR_HEAD_P (src[1])))
867 c = STRING_CHAR (src, bytes);
868 else
869 c = *src, bytes = 1;
870 src += bytes;
871 }
872 component[ncomponent] = c;
873 }
874 }
875 else if (c >= 0x80)
876 {
877 /* This may be an old Emacs 20 style format. See the comment at
878 the section 2 of this file. */
879 while (src < src_end && !CHAR_HEAD_P (*src)) src++;
880 if (src == src_end
881 && !(coding->mode & CODING_MODE_LAST_BLOCK))
882 goto label_end_of_loop;
883
884 src_end = src;
885 src = src_base + 1;
886 if (c < 0xC0)
887 {
888 method = COMPOSITION_RELATIVE;
889 for (ncomponent = 0; ncomponent < MAX_COMPOSITION_COMPONENTS;)
890 {
891 DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp);
892 if (c < 0)
893 break;
894 component[ncomponent++] = c;
895 }
896 if (ncomponent < 2)
897 return 0;
898 nchars = ncomponent;
899 }
900 else if (c == 0xFF)
901 {
902 method = COMPOSITION_WITH_RULE;
903 src++;
904 DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp);
905 if (c < 0)
906 return 0;
907 component[0] = c;
908 for (ncomponent = 1;
909 ncomponent < MAX_COMPOSITION_COMPONENTS * 2 - 1;)
910 {
911 DECODE_EMACS_MULE_COMPOSITION_RULE (c);
912 if (c < 0)
913 break;
914 component[ncomponent++] = c;
915 DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp);
916 if (c < 0)
917 break;
918 component[ncomponent++] = c;
919 }
920 if (ncomponent < 3)
921 return 0;
922 nchars = (ncomponent + 1) / 2;
923 }
924 else
925 return 0;
926 }
927 else
928 return 0;
929
930 if (buf == bufp || dst + (bufp - buf) <= (dst_bytes ? dst_end : src))
931 {
932 CODING_ADD_COMPOSITION_START (coding, coding->produced_char, method);
933 for (i = 0; i < ncomponent; i++)
934 CODING_ADD_COMPOSITION_COMPONENT (coding, component[i]);
935 CODING_ADD_COMPOSITION_END (coding, coding->produced_char + nchars);
936 if (buf < bufp)
937 {
938 unsigned char *p = buf;
939 EMIT_BYTES (p, bufp);
940 *destination += bufp - buf;
941 coding->produced_char += nchars;
942 }
943 return (src - src_base);
944 }
945 label_end_of_loop:
946 return -1;
947}
948
949/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
950
/* Decode SRC_BYTES bytes at SOURCE, which are in the external
   `emacs-mule' form, into DESTINATION.  Each iteration handles one
   unit of source: a carriage return, a newline, a composition
   sequence introduced by 0x80 (only when CODING->cmp_data is
   non-null), a valid multibyte sequence, or an invalid sequence.
   On exit CODING->consumed, CODING->consumed_char, CODING->produced
   and CODING->produced_char are set; CODING->result is set on the
   paths that stop early.  */
951static void
952decode_coding_emacs_mule (coding, source, destination, src_bytes, dst_bytes)
953 struct coding_system *coding;
954 const unsigned char *source;
955 unsigned char *destination;
956 int src_bytes, dst_bytes;
957{
958 const unsigned char *src = source;
959 const unsigned char *src_end = source + src_bytes;
960 unsigned char *dst = destination;
961 unsigned char *dst_end = destination + dst_bytes;
962 /* SRC_BASE remembers the start position in source in each loop.
963 The loop will be exited when there's not enough source code, or
964 when there's not enough destination area to produce a
965 character. */
966 const unsigned char *src_base;
967
968 coding->produced_char = 0;
969 while ((src_base = src) < src_end)
970 {
 /* TMP holds the multibyte form built for a byte that has to be
    emitted as a character of its own; P points at the bytes to copy
    (either TMP or the source itself) and BYTES is their count.  */
971 unsigned char tmp[MAX_MULTIBYTE_LENGTH];
972 const unsigned char *p;
973 int bytes;
974
 /* CR: becomes a newline under CODING_EOL_CR.  Under
    CODING_EOL_CRLF it becomes a newline only when the next byte is
    LF; otherwise the byte just read is pushed back and the CR is
    kept.  For any other eol type the CR is copied unchanged.  */
975 if (*src == '\r')
976 {
977 int c = *src++;
978
979 if (coding->eol_type == CODING_EOL_CR)
980 c = '\n';
981 else if (coding->eol_type == CODING_EOL_CRLF)
982 {
983 ONE_MORE_BYTE (c);
984 if (c != '\n')
985 {
986 src--;
987 c = '\r';
988 }
989 }
990 *dst++ = c;
991 coding->produced_char++;
992 continue;
993 }
 /* A bare LF in text whose eol type is CR or CRLF is inconsistent;
    stop with CODING_FINISH_INCONSISTENT_EOL if the mode flag asks
    for that, otherwise copy the LF.  */
994 else if (*src == '\n')
995 {
996 if ((coding->eol_type == CODING_EOL_CR
997 || coding->eol_type == CODING_EOL_CRLF)
998 && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
999 {
1000 coding->result = CODING_FINISH_INCONSISTENT_EOL;
1001 goto label_end_of_loop;
1002 }
1003 *dst++ = *src++;
1004 coding->produced_char++;
1005 continue;
1006 }
1007 else if (*src == 0x80 && coding->cmp_data)
1008 {
1009 /* Start of composition data. */
 /* A negative result means the source, the destination or
    cmp_data was insufficient: stop here.  A positive result is
    the number of source bytes the composition occupied.  Zero
    means the sequence was invalid, in which case the 0x80 byte
    is emitted as a character through CHAR_STRING.  */
1010 int consumed = decode_composition_emacs_mule (coding, src, src_end,
1011 &dst, dst_end,
1012 dst_bytes);
1013 if (consumed < 0)
1014 goto label_end_of_loop;
1015 else if (consumed > 0)
1016 {
1017 src += consumed;
1018 continue;
1019 }
1020 bytes = CHAR_STRING (*src, tmp);
1021 p = tmp;
1022 src++;
1023 }
 /* A valid multibyte sequence (BYTES is set by the macro as a side
    effect) is copied as it is.  */
1024 else if (UNIBYTE_STR_AS_MULTIBYTE_P (src, src_end - src, bytes)
1025 || (coding->flags /* We are recovering a file. */
1026 && src[0] == LEADING_CODE_8_BIT_CONTROL
1027 && ! CHAR_HEAD_P (src[1])))
1028 {
1029 p = src;
1030 src += bytes;
1031 }
1032 else
1033 {
 /* Not a valid sequence.  Walk over the trailing bytes that the
    first byte calls for.  If a char head turns up before they
    are all seen, emit the first byte alone as a character and
    resume right after it; otherwise copy the bytes unchanged
    from the source.  Running out of source inside the walk
    leaves through ONE_MORE_BYTE.  */
1034 int i, c;
1035
1036 bytes = BYTES_BY_CHAR_HEAD (*src);
1037 src++;
1038 for (i = 1; i < bytes; i++)
1039 {
1040 ONE_MORE_BYTE (c);
1041 if (CHAR_HEAD_P (c))
1042 break;
1043 }
1044 if (i < bytes)
1045 {
1046 bytes = CHAR_STRING (*src_base, tmp);
1047 p = tmp;
1048 src = src_base + 1;
1049 }
1050 else
1051 {
1052 p = src_base;
1053 }
1054 }
 /* The output limit is DST_END when DST_BYTES is nonzero, and the
    current read position SRC otherwise.  */
1055 if (dst + bytes >= (dst_bytes ? dst_end : src))
1056 {
1057 coding->result = CODING_FINISH_INSUFFICIENT_DST;
1058 break;
1059 }
1060 while (bytes--) *dst++ = *p++;
1061 coding->produced_char++;
1062 }
1063 label_end_of_loop:
 /* SRC_BASE, not SRC, is used here so that a unit that was only
    partly processed when the loop stopped is not counted as
    consumed.  */
1064 coding->consumed = coding->consumed_char = src_base - source;
1065 coding->produced = dst - destination;
1066}
1067
1068
/* Encode the composition data stored at DATA into a special byte
   sequence starting with 0x80, and append it at DST.  The sequence is

       0x80  0xF0+METHOD  0xA0+COMPONENTS-BYTES  0xA0+COMPOSED-CHARS
       COMPONENT ...

   where, for the two rule-based methods, each component after the
   first is preceded by two bytes 0x20+GREF and 0x20+NREF.

   Update CODING->cmp_data_start and maybe CODING->cmp_data for the
   next call.

   The macro uses the caller's variables `dst', `dst_end', `dst_bytes'
   and `src'.  If the destination has no room, it sets CODING->result
   to CODING_FINISH_INSUFFICIENT_DST and jumps to the caller's
   `label_end_of_loop' without emitting anything.  Both macro
   arguments are evaluated more than once.  */

#define ENCODE_COMPOSITION_EMACS_MULE(coding, data)                     \
  do {                                                                  \
    unsigned char buf[1024], *p0 = buf, *p;                             \
    int len = (data)[0];                                                \
    int i;                                                              \
                                                                        \
    buf[0] = 0x80;                                                      \
    buf[1] = 0xF0 + (data)[3];  /* METHOD */                            \
    buf[3] = 0xA0 + ((data)[2] - (data)[1]); /* COMPOSED-CHARS */       \
    p = buf + 4;                                                        \
    if ((data)[3] == COMPOSITION_WITH_RULE                              \
        || (data)[3] == COMPOSITION_WITH_RULE_ALTCHARS)                 \
      {                                                                 \
        /* Components alternate: CHAR, then (RULE, CHAR) pairs.  */     \
        p += CHAR_STRING ((data)[4], p);                                \
        for (i = 5; i < len; i += 2)                                    \
          {                                                             \
            int gref, nref;                                             \
            COMPOSITION_DECODE_RULE ((data)[i], gref, nref);            \
            *p++ = 0x20 + gref;                                         \
            *p++ = 0x20 + nref;                                         \
            p += CHAR_STRING ((data)[i + 1], p);                        \
          }                                                             \
      }                                                                 \
    else                                                                \
      {                                                                 \
        for (i = 4; i < len; i++)                                       \
          p += CHAR_STRING ((data)[i], p);                              \
      }                                                                 \
    /* Known only now that all components are laid out.  */            \
    buf[2] = 0xA0 + (p - buf);  /* COMPONENTS-BYTES */                  \
                                                                        \
    if (dst + (p - buf) + 4 > (dst_bytes ? dst_end : src))              \
      {                                                                 \
        (coding)->result = CODING_FINISH_INSUFFICIENT_DST;              \
        goto label_end_of_loop;                                         \
      }                                                                 \
    while (p0 < p)                                                      \
      *dst++ = *p0++;                                                   \
    (coding)->cmp_data_start += (data)[0];                              \
    if ((coding)->cmp_data_start == (coding)->cmp_data->used            \
        && (coding)->cmp_data->next)                                    \
      {                                                                 \
        /* This block is used up; continue with the next one.  */      \
        (coding)->cmp_data = (coding)->cmp_data->next;                  \
        (coding)->cmp_data_start = 0;                                   \
      }                                                                 \
  } while (0)
1118
1119
1120static void encode_eol P_ ((struct coding_system *, const unsigned char *,
1121 unsigned char *, int, int));
1122
1123static void
1124encode_coding_emacs_mule (coding, source, destination, src_bytes, dst_bytes)
1125 struct coding_system *coding;
1126 const unsigned char *source;
1127 unsigned char *destination;
1128 int src_bytes, dst_bytes;
1129{
1130 const unsigned char *src = source;
1131 const unsigned char *src_end = source + src_bytes;
1132 unsigned char *dst = destination;
1133 unsigned char *dst_end = destination + dst_bytes;
1134 const unsigned char *src_base;
1135 int c;
1136 int char_offset;
1137 int *data;
1138
1139 Lisp_Object translation_table;
1140
1141 translation_table = Qnil;
1142
1143 /* Optimization for the case that there's no composition. */
1144 if (!coding->cmp_data || coding->cmp_data->used == 0)
1145 {
1146 encode_eol (coding, source, destination, src_bytes, dst_bytes);
1147 return;
1148 }
1149
1150 char_offset = coding->cmp_data->char_offset;
1151 data = coding->cmp_data->data + coding->cmp_data_start;
1152 while (1)
1153 {
1154 src_base = src;
1155
1156 /* If SRC starts a composition, encode the information about the
1157 composition in advance. */
1158 if (coding->cmp_data_start < coding->cmp_data->used
1159 && char_offset + coding->consumed_char == data[1])
1160 {
1161 ENCODE_COMPOSITION_EMACS_MULE (coding, data);
1162 char_offset = coding->cmp_data->char_offset;
1163 data = coding->cmp_data->data + coding->cmp_data_start;
1164 }
1165
1166 ONE_MORE_CHAR (c);
1167 if (c == '\n' && (coding->eol_type == CODING_EOL_CRLF
1168 || coding->eol_type == CODING_EOL_CR))
1169 {
1170 if (coding->eol_type == CODING_EOL_CRLF)
1171 EMIT_TWO_BYTES ('\r', c);
1172 else
1173 EMIT_ONE_BYTE ('\r');
1174 }
1175 else if (SINGLE_BYTE_CHAR_P (c))
1176 {
1177 if (coding->flags && ! ASCII_BYTE_P (c))
1178 {
1179 /* As we are auto saving, retain the multibyte form for
1180 8-bit chars. */
1181 unsigned char buf[MAX_MULTIBYTE_LENGTH];
1182 int bytes = CHAR_STRING (c, buf);
1183
1184 if (bytes == 1)
1185 EMIT_ONE_BYTE (buf[0]);
1186 else
1187 EMIT_TWO_BYTES (buf[0], buf[1]);
1188 }
1189 else
1190 EMIT_ONE_BYTE (c);
1191 }
1192 else
1193 EMIT_BYTES (src_base, src);
1194 coding->consumed_char++;
1195 }
1196 label_end_of_loop:
1197 coding->consumed = src_base - source;
1198 coding->produced = coding->produced_char = dst - destination;
1199 return;
1200}
1201
1202\f
1203/*** 3. ISO2022 handlers ***/
1204
1205/* The following note describes the coding system ISO2022 briefly.
1206 Since the intention of this note is to help understand the
1207 functions in this file, some parts are NOT ACCURATE or are OVERLY
1208 SIMPLIFIED. For thorough understanding, please refer to the
1209 original document of ISO2022. This is equivalent to the standard
1210 ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
1211
1212 ISO2022 provides many mechanisms to encode several character sets
1213 in 7-bit and 8-bit environments. For 7-bit environments, all text
1214 is encoded using bytes less than 128. This may make the encoded
1215 text a little bit longer, but the text passes more easily through
1216 several types of gateway, some of which strip off the MSB (Most
1217 Significant Bit).
1218
1219 There are two kinds of character sets: control character sets and
1220 graphic character sets. The former contain control characters such
1221 as `newline' and `escape' to provide control functions (control
1222 functions are also provided by escape sequences). The latter
1223 contain graphic characters such as 'A' and '-'. Emacs recognizes
1224 two control character sets and many graphic character sets.
1225
1226 Graphic character sets are classified into one of the following
1227 four classes, according to the number of bytes (DIMENSION) and
1228 number of characters in one dimension (CHARS) of the set:
1229 - DIMENSION1_CHARS94
1230 - DIMENSION1_CHARS96
1231 - DIMENSION2_CHARS94
1232 - DIMENSION2_CHARS96
1233
1234 In addition, each character set is assigned an identification tag,
1235 unique for each set, called the "final character" (denoted as <F>
1236 hereafter). The <F> of each character set is decided by ECMA(*)
1237 when it is registered in ISO. The code range of <F> is 0x30..0x7F
1238 (0x30..0x3F are for private use only).
1239
1240 Note (*): ECMA = European Computer Manufacturers Association
1241
1242 Here are examples of graphic character sets [NAME(<F>)]:
1243 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
1244 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
1245 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
1246 o DIMENSION2_CHARS96 -- none for the moment
1247
1248 A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
1249 C0 [0x00..0x1F] -- control character plane 0
1250 GL [0x20..0x7F] -- graphic character plane 0
1251 C1 [0x80..0x9F] -- control character plane 1
1252 GR [0xA0..0xFF] -- graphic character plane 1
1253
1254 A control character set is directly designated and invoked to C0 or
1255 C1 by an escape sequence. The most common case is that:
1256 - ISO646's control character set is designated/invoked to C0, and
1257 - ISO6429's control character set is designated/invoked to C1,
1258 and usually these designations/invocations are omitted in encoded
1259 text. In a 7-bit environment, only C0 can be used, and a control
1260 character for C1 is encoded by an appropriate escape sequence to
1261 fit into the environment. All control characters for C1 are
1262 defined to have corresponding escape sequences.
1263
1264 A graphic character set is at first designated to one of four
1265 graphic registers (G0 through G3), then these graphic registers are
1266 invoked to GL or GR. These designations and invocations can be
1267 done independently. The most common case is that G0 is invoked to
1268 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
1269 these invocations and designations are omitted in encoded text.
1270 In a 7-bit environment, only GL can be used.
1271
1272 When a graphic character set of CHARS94 is invoked to GL, codes
1273 0x20 and 0x7F of the GL area work as control characters SPACE and
1274 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
1275 be used.
1276
1277 There are two ways of invocation: locking-shift and single-shift.
1278 With locking-shift, the invocation lasts until the next different
1279 invocation, whereas with single-shift, the invocation affects the
1280 following character only and doesn't affect the locking-shift
1281 state. Invocations are done by the following control characters or
1282 escape sequences:
1283
1284 ----------------------------------------------------------------------
1285 abbrev function cntrl escape seq description
1286 ----------------------------------------------------------------------
1287 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
1288 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
1289 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
1290 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
1291 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
1292 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
1293 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
1294 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
1295 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
1296 ----------------------------------------------------------------------
1297 (*) These are not used by any known coding system.
1298
1299 Control characters for these functions are defined by macros
1300 ISO_CODE_XXX in `coding.h'.
1301
1302 Designations are done by the following escape sequences:
1303 ----------------------------------------------------------------------
1304 escape sequence description
1305 ----------------------------------------------------------------------
1306 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
1307 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
1308 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
1309 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
1310 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
1311 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
1312 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
1313 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
1314 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
1315 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
1316 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
1317 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
1318 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
1319 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
1320 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
1321 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
1322 ----------------------------------------------------------------------
1323
1324 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
1325 of dimension 1, chars 94, and final character <F>, etc...
1326
1327 Note (*): Although these designations are not allowed in ISO2022,
1328 Emacs accepts them on decoding, and produces them on encoding
1329 CHARS96 character sets in a coding system which is characterized as
1330 7-bit environment, non-locking-shift, and non-single-shift.
1331
1332 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
1333 '(' can be omitted. We refer to this as "short-form" hereafter.
1334
1335 Now you may notice that there are a lot of ways of encoding the
1336 same multilingual text in ISO2022. Actually, there exist many
1337 coding systems such as Compound Text (used in X11's inter-client
1338 communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
1339 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
1340 localized platforms), and all of these are variants of ISO2022.
1341
1342 In addition to the above, Emacs handles two more kinds of escape
1343 sequences: ISO6429's direction specification and Emacs' private
1344 sequence for specifying character composition.
1345
1346 ISO6429's direction specification takes the following form:
1347 o CSI ']' -- end of the current direction
1348 o CSI '0' ']' -- end of the current direction
1349 o CSI '1' ']' -- start of left-to-right text
1350 o CSI '2' ']' -- start of right-to-left text
1351 The control character CSI (0x9B: control sequence introducer) is
1352 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
1353
1354 Character composition specification takes the following form:
1355 o ESC '0' -- start relative composition
1356 o ESC '1' -- end composition
1357 o ESC '2' -- start rule-base composition (*)
1358 o ESC '3' -- start relative composition with alternate chars (**)
1359 o ESC '4' -- start rule-base composition with alternate chars (**)
1360 Since these are not standard escape sequences of any ISO standard,
1361 the use of them with these meanings is restricted to Emacs only.
1362
1363 (*) This form is used only in Emacs 20.5 and older versions,
1364 but the newer versions can safely decode it.
1365 (**) This form is used only in Emacs 21.1 and newer versions,
1366 and the older versions can't decode it.
1367
1368 Here's a list of example usages of these composition escape
1369 sequences (categorized by `enum composition_method').
1370
1371 COMPOSITION_RELATIVE:
1372 ESC 0 CHAR [ CHAR ] ESC 1
1373 COMPOSITION_WITH_RULE:
1374 ESC 2 CHAR [ RULE CHAR ] ESC 1
1375 COMPOSITION_WITH_ALTCHARS:
1376 ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
1377 COMPOSITION_WITH_RULE_ALTCHARS:
1378 ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
1379
1380enum iso_code_class_type iso_code_class[256];
1381
1382#define CHARSET_OK(idx, charset, c) \
1383 (coding_system_table[idx] \
1384 && (charset == CHARSET_ASCII \
1385 || (safe_chars = coding_safe_chars (coding_system_table[idx]->symbol), \
1386 CODING_SAFE_CHAR_P (safe_chars, c))) \
1387 && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding_system_table[idx], \
1388 charset) \
1389 != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
1390
1391#define SHIFT_OUT_OK(idx) \
1392 (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding_system_table[idx], 1) >= 0)
1393
1394#define COMPOSITION_OK(idx) \
1395 (coding_system_table[idx]->composing != COMPOSITION_DISABLED)
1396
1397/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1398 Check if a text is encoded in ISO2022. If it is, return an
1399 integer in which appropriate flag bits any of:
1400 CODING_CATEGORY_MASK_ISO_7
1401 CODING_CATEGORY_MASK_ISO_7_TIGHT
1402 CODING_CATEGORY_MASK_ISO_8_1
1403 CODING_CATEGORY_MASK_ISO_8_2
1404 CODING_CATEGORY_MASK_ISO_7_ELSE
1405 CODING_CATEGORY_MASK_ISO_8_ELSE
1406 are set. If a code which should never appear in ISO2022 is found,
1407 returns 0. */
1408
1409static int
1410detect_coding_iso2022 (src, src_end, multibytep)
1411 unsigned char *src, *src_end;
1412 int multibytep;
1413{
1414 int mask = CODING_CATEGORY_MASK_ISO;
1415 int mask_found = 0;
1416 int reg[4], shift_out = 0, single_shifting = 0;
1417 int c, c1, charset;
1418 /* Dummy for ONE_MORE_BYTE. */
1419 struct coding_system dummy_coding;
1420 struct coding_system *coding = &dummy_coding;
1421 Lisp_Object safe_chars;
1422
1423 reg[0] = CHARSET_ASCII, reg[1] = reg[2] = reg[3] = -1;
1424 while (mask && src < src_end)
1425 {
1426 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
1427 retry:
1428 switch (c)
1429 {
1430 case ISO_CODE_ESC:
1431 if (inhibit_iso_escape_detection)
1432 break;
1433 single_shifting = 0;
1434 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
1435 if (c >= '(' && c <= '/')
1436 {
1437 /* Designation sequence for a charset of dimension 1. */
1438 ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep);
1439 if (c1 < ' ' || c1 >= 0x80
1440 || (charset = iso_charset_table[0][c >= ','][c1]) < 0)
1441 /* Invalid designation sequence. Just ignore. */
1442 break;
1443 reg[(c - '(') % 4] = charset;
1444 }
1445 else if (c == '$')
1446 {
1447 /* Designation sequence for a charset of dimension 2. */
1448 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
1449 if (c >= '@' && c <= 'B')
1450 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
1451 reg[0] = charset = iso_charset_table[1][0][c];
1452 else if (c >= '(' && c <= '/')
1453 {
1454 ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep);
1455 if (c1 < ' ' || c1 >= 0x80
1456 || (charset = iso_charset_table[1][c >= ','][c1]) < 0)
1457 /* Invalid designation sequence. Just ignore. */
1458 break;
1459 reg[(c - '(') % 4] = charset;
1460 }
1461 else
1462 /* Invalid designation sequence. Just ignore. */
1463 break;
1464 }
1465 else if (c == 'N' || c == 'O')
1466 {
1467 /* ESC <Fe> for SS2 or SS3. */
1468 mask &= CODING_CATEGORY_MASK_ISO_7_ELSE;
1469 break;
1470 }
1471 else if (c >= '0' && c <= '4')
1472 {
1473 /* ESC <Fp> for start/end composition. */
1474 if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_7))
1475 mask_found |= CODING_CATEGORY_MASK_ISO_7;
1476 else
1477 mask &= ~CODING_CATEGORY_MASK_ISO_7;
1478 if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT))
1479 mask_found |= CODING_CATEGORY_MASK_ISO_7_TIGHT;
1480 else
1481 mask &= ~CODING_CATEGORY_MASK_ISO_7_TIGHT;
1482 if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_8_1))
1483 mask_found |= CODING_CATEGORY_MASK_ISO_8_1;
1484 else
1485 mask &= ~CODING_CATEGORY_MASK_ISO_8_1;
1486 if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_8_2))
1487 mask_found |= CODING_CATEGORY_MASK_ISO_8_2;
1488 else
1489 mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
1490 if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_7_ELSE))
1491 mask_found |= CODING_CATEGORY_MASK_ISO_7_ELSE;
1492 else
1493 mask &= ~CODING_CATEGORY_MASK_ISO_7_ELSE;
1494 if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_8_ELSE))
1495 mask_found |= CODING_CATEGORY_MASK_ISO_8_ELSE;
1496 else
1497 mask &= ~CODING_CATEGORY_MASK_ISO_8_ELSE;
1498 break;
1499 }
1500 else
1501 /* Invalid escape sequence. Just ignore. */
1502 break;
1503
1504 /* We found a valid designation sequence for CHARSET. */
1505 mask &= ~CODING_CATEGORY_MASK_ISO_8BIT;
1506 c = MAKE_CHAR (charset, 0, 0);
1507 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7, charset, c))
1508 mask_found |= CODING_CATEGORY_MASK_ISO_7;
1509 else
1510 mask &= ~CODING_CATEGORY_MASK_ISO_7;
1511 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT, charset, c))
1512 mask_found |= CODING_CATEGORY_MASK_ISO_7_TIGHT;
1513 else
1514 mask &= ~CODING_CATEGORY_MASK_ISO_7_TIGHT;
1515 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_ELSE, charset, c))
1516 mask_found |= CODING_CATEGORY_MASK_ISO_7_ELSE;
1517 else
1518 mask &= ~CODING_CATEGORY_MASK_ISO_7_ELSE;
1519 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_8_ELSE, charset, c))
1520 mask_found |= CODING_CATEGORY_MASK_ISO_8_ELSE;
1521 else
1522 mask &= ~CODING_CATEGORY_MASK_ISO_8_ELSE;
1523 break;
1524
1525 case ISO_CODE_SO:
1526 if (inhibit_iso_escape_detection)
1527 break;
1528 single_shifting = 0;
1529 if (shift_out == 0
1530 && (reg[1] >= 0
1531 || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_7_ELSE)
1532 || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_8_ELSE)))
1533 {
1534 /* Locking shift out. */
1535 mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
1536 mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
1537 }
1538 break;
1539
1540 case ISO_CODE_SI:
1541 if (inhibit_iso_escape_detection)
1542 break;
1543 single_shifting = 0;
1544 if (shift_out == 1)
1545 {
1546 /* Locking shift in. */
1547 mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
1548 mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
1549 }
1550 break;
1551
1552 case ISO_CODE_CSI:
1553 single_shifting = 0;
1554 case ISO_CODE_SS2:
1555 case ISO_CODE_SS3:
1556 {
1557 int newmask = CODING_CATEGORY_MASK_ISO_8_ELSE;
1558
1559 if (inhibit_iso_escape_detection)
1560 break;
1561 if (c != ISO_CODE_CSI)
1562 {
1563 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1564 & CODING_FLAG_ISO_SINGLE_SHIFT)
1565 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
1566 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1567 & CODING_FLAG_ISO_SINGLE_SHIFT)
1568 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
1569 single_shifting = 1;
1570 }
1571 if (VECTORP (Vlatin_extra_code_table)
1572 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
1573 {
1574 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1575 & CODING_FLAG_ISO_LATIN_EXTRA)
1576 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
1577 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1578 & CODING_FLAG_ISO_LATIN_EXTRA)
1579 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
1580 }
1581 mask &= newmask;
1582 mask_found |= newmask;
1583 }
1584 break;
1585
1586 default:
1587 if (c < 0x80)
1588 {
1589 single_shifting = 0;
1590 break;
1591 }
1592 else if (c < 0xA0)
1593 {
1594 single_shifting = 0;
1595 if (VECTORP (Vlatin_extra_code_table)
1596 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
1597 {
1598 int newmask = 0;
1599
1600 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1601 & CODING_FLAG_ISO_LATIN_EXTRA)
1602 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
1603 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1604 & CODING_FLAG_ISO_LATIN_EXTRA)
1605 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
1606 mask &= newmask;
1607 mask_found |= newmask;
1608 }
1609 else
1610 return 0;
1611 }
1612 else
1613 {
1614 mask &= ~(CODING_CATEGORY_MASK_ISO_7BIT
1615 | CODING_CATEGORY_MASK_ISO_7_ELSE);
1616 mask_found |= CODING_CATEGORY_MASK_ISO_8_1;
1617 /* Check the length of succeeding codes of the range
1618 0xA0..0FF. If the byte length is odd, we exclude
1619 CODING_CATEGORY_MASK_ISO_8_2. We can check this only
1620 when we are not single shifting. */
1621 if (!single_shifting
1622 && mask & CODING_CATEGORY_MASK_ISO_8_2)
1623 {
1624 int i = 1;
1625
1626 c = -1;
1627 while (src < src_end)
1628 {
1629 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
1630 if (c < 0xA0)
1631 break;
1632 i++;
1633 }
1634
1635 if (i & 1 && src < src_end)
1636 mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
1637 else
1638 mask_found |= CODING_CATEGORY_MASK_ISO_8_2;
1639 if (c >= 0)
1640 /* This means that we have read one extra byte. */
1641 goto retry;
1642 }
1643 }
1644 break;
1645 }
1646 }
1647 label_end_of_loop:
1648 return (mask & mask_found);
1649}
1650
/* Decode a character whose charset is CHARSET, whose 1st position
   code is C1 and whose 2nd position code is C2, and yield the decoded
   character code.  If the caller's variable `translation_table' is
   non-nil, yield the translated code instead.  */

#define DECODE_ISO_CHARACTER(charset, c1, c2)                   \
  (! NILP (translation_table)                                   \
   ? translate_char (translation_table, -1, charset, c1, c2)    \
   : MAKE_CHAR (charset, c1, c2))
1660
/* Set designation state into CODING: record that the charset
   identified by DIMENSION, CHARS and FINAL_CHAR is designated to
   graphic register REG.

   Control jumps to the caller's `label_invalid_code' when FINAL_CHAR
   is out of range, when the charset is unknown or is neither
   requested for REG nor safe for CODING, or when ASCII is designated
   to register 0 right after an invalid designation to register 0.
   Uses the caller's variables `coding' and `safe_chars'.  */
#define DECODE_DESIGNATION(reg, dimension, chars, final_char)              \
  do {                                                                     \
    int charset, c;                                                        \
                                                                           \
    if (final_char < '0' || final_char >= 128)                             \
      goto label_invalid_code;                                             \
    charset = ISO_CHARSET_TABLE (make_number (dimension),                  \
                                 make_number (chars),                      \
                                 make_number (final_char));                \
    c = MAKE_CHAR (charset, 0, 0);                                         \
    if (charset < 0                                                        \
        || (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) != reg \
            && ! (CODING_SAFE_CHAR_P (safe_chars, c))))                    \
      {                                                                    \
        /* Remember which register the rejected designation named.  */    \
        coding->spec.iso2022.last_invalid_designation_register = reg;      \
        goto label_invalid_code;                                           \
      }                                                                    \
    if (coding->spec.iso2022.last_invalid_designation_register == 0        \
        && reg == 0                                                        \
        && charset == CHARSET_ASCII)                                       \
      {                                                                    \
        /* We should insert this designation sequence as is so             \
           that it is surely written back to a file.  */                   \
        coding->spec.iso2022.last_invalid_designation_register = -1;       \
        goto label_invalid_code;                                           \
      }                                                                    \
    coding->spec.iso2022.last_invalid_designation_register = -1;           \
    if ((coding->mode & CODING_MODE_DIRECTION)                             \
        && CHARSET_REVERSE_CHARSET (charset) >= 0)                         \
      charset = CHARSET_REVERSE_CHARSET (charset);                         \
    CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;                   \
  } while (0)
1697
1698/* Allocate a memory block for storing information about compositions.
1699 The block is chained to the already allocated blocks. */
1700
1701void
1702coding_allocate_composition_data (coding, char_offset)
1703 struct coding_system *coding;
1704 int char_offset;
1705{
1706 struct composition_data *cmp_data
1707 = (struct composition_data *) xmalloc (sizeof *cmp_data);
1708
1709 cmp_data->char_offset = char_offset;
1710 cmp_data->used = 0;
1711 cmp_data->prev = coding->cmp_data;
1712 cmp_data->next = NULL;
1713 if (coding->cmp_data)
1714 coding->cmp_data->next = cmp_data;
1715 coding->cmp_data = cmp_data;
1716 coding->cmp_data_start = 0;
1717 coding->composing = COMPOSITION_NO;
1718}
1719
/* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
   ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
   ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
   ESC 3 : altchar composition :  ESC 3 ALT ... ESC 0 CHAR ... ESC 1
   ESC 4 : alt&rule composition : ESC 4 ALT RULE .. ALT ESC 0 CHAR ... ESC 1

   C1 is the character following ESC.  Uses the caller's variables
   `coding' and `dst', and may jump to the caller's
   `label_end_of_loop'.  */

#define DECODE_COMPOSITION_START(c1)                                       \
  do {                                                                     \
    if (coding->composing == COMPOSITION_DISABLED)                         \
      {                                                                    \
        /* Composition handling is off: emit the escape sequence           \
           itself as two characters.  */                                   \
        dst[0] = ISO_CODE_ESC;                                             \
        dst[1] = c1 & 0x7f;                                                \
        dst += 2;                                                          \
        coding->produced_char += 2;                                        \
      }                                                                    \
    else if (COMPOSING_P (coding))                                         \
      {                                                                    \
        /* We are already handling a composition.  If the method is        \
           one of the following two, the codes following the current       \
           escape sequence are actual characters stored in a buffer.  */   \
        if (coding->composing == COMPOSITION_WITH_ALTCHARS                 \
            || coding->composing == COMPOSITION_WITH_RULE_ALTCHARS)        \
          {                                                                \
            coding->composing = COMPOSITION_RELATIVE;                      \
            coding->composition_rule_follows = 0;                          \
          }                                                                \
      }                                                                    \
    else                                                                   \
      {                                                                    \
        /* This is surely the start of a composition.  We must be sure     \
           that coding->cmp_data has enough space to store the             \
           information about the composition.  If not, terminate the       \
           current decoding loop, allocate one more memory block for       \
           coding->cmp_data in the caller, then start the decoding         \
           loop again.  We can't allocate memory here directly because     \
           it may cause buffer/string relocation.  */                      \
        if (!coding->cmp_data                                              \
            || (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH \
                >= COMPOSITION_DATA_SIZE))                                 \
          {                                                                \
            coding->result = CODING_FINISH_INSUFFICIENT_CMP;               \
            goto label_end_of_loop;                                        \
          }                                                                \
        if (c1 == '0')                                                     \
          coding->composing = COMPOSITION_RELATIVE;                        \
        else if (c1 == '2')                                                \
          coding->composing = COMPOSITION_WITH_RULE;                       \
        else if (c1 == '3')                                                \
          coding->composing = COMPOSITION_WITH_ALTCHARS;                   \
        else                                                               \
          coding->composing = COMPOSITION_WITH_RULE_ALTCHARS;              \
        CODING_ADD_COMPOSITION_START (coding, coding->produced_char,       \
                                      coding->composing);                  \
        coding->composition_rule_follows = 0;                              \
      }                                                                    \
  } while (0)
1772
/* Handle composition end sequence ESC 1.  C1 is the character
   following ESC.  If a composition is in progress, close it at the
   current character position; otherwise emit the escape sequence
   itself as two characters.  Uses the caller's variables `coding'
   and `dst'.  */

#define DECODE_COMPOSITION_END(c1)                                      \
  do {                                                                  \
    if (COMPOSING_P (coding))                                           \
      {                                                                 \
        CODING_ADD_COMPOSITION_END (coding, coding->produced_char);     \
        coding->composing = COMPOSITION_NO;                             \
      }                                                                 \
    else                                                                \
      {                                                                 \
        dst[0] = ISO_CODE_ESC;                                          \
        dst[1] = c1;                                                    \
        dst += 2;                                                       \
        coding->produced_char += 2;                                     \
      }                                                                 \
  } while (0)
1789
/* Decode a composition rule from the byte C1 (and maybe one more byte
   read from SRC into the caller's variable `c2') and store one
   encoded composition rule in coding->cmp_data.  C1 must be an
   lvalue; it is decremented by 32.  A value outside both formats
   stores rule 0.  */

#define DECODE_COMPOSITION_RULE(c1)                                     \
  do {                                                                  \
    int rule = 0;                                                       \
    (c1) -= 32;                                                         \
    if (c1 >= 81 && c1 < 93)    /* new format (after ver.21) */         \
      {                                                                 \
        ONE_MORE_BYTE (c2);                                             \
        rule = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32);              \
      }                                                                 \
    else if (c1 < 81)           /* old format (before ver.21) */        \
      {                                                                 \
        int gref = (c1) / 9;                                            \
        int nref = (c1) % 9;                                            \
        /* In the old format, reference point 4 stands for 10.  */     \
        if (gref == 4) gref = 10;                                       \
        if (nref == 4) nref = 10;                                       \
        rule = COMPOSITION_ENCODE_RULE (gref, nref);                    \
      }                                                                 \
    CODING_ADD_COMPOSITION_COMPONENT (coding, rule);                    \
    coding->composition_rule_follows = 0;                               \
  } while (0)
1814
1815
1816/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
1817
1818static void
1819decode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
1820 struct coding_system *coding;
1821 const unsigned char *source;
1822 unsigned char *destination;
1823 int src_bytes, dst_bytes;
1824{
1825 const unsigned char *src = source;
1826 const unsigned char *src_end = source + src_bytes;
1827 unsigned char *dst = destination;
1828 unsigned char *dst_end = destination + dst_bytes;
1829 /* Charsets invoked to graphic plane 0 and 1 respectively. */
1830 int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1831 int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1832 /* SRC_BASE remembers the start position in source in each loop.
1833 The loop will be exited when there's not enough source code
1834 (within macro ONE_MORE_BYTE), or when there's not enough
1835 destination area to produce a character (within macro
1836 EMIT_CHAR). */
1837 const unsigned char *src_base;
1838 int c, charset;
1839 Lisp_Object translation_table;
1840 Lisp_Object safe_chars;
1841
1842 safe_chars = coding_safe_chars (coding->symbol);
1843
1844 if (NILP (Venable_character_translation))
1845 translation_table = Qnil;
1846 else
1847 {
1848 translation_table = coding->translation_table_for_decode;
1849 if (NILP (translation_table))
1850 translation_table = Vstandard_translation_table_for_decode;
1851 }
1852
1853 coding->result = CODING_FINISH_NORMAL;
1854
1855 while (1)
1856 {
1857 int c1, c2 = 0;
1858
1859 src_base = src;
1860 ONE_MORE_BYTE (c1);
1861
1862 /* We produce no character or one character. */
1863 switch (iso_code_class [c1])
1864 {
1865 case ISO_0x20_or_0x7F:
1866 if (COMPOSING_P (coding) && coding->composition_rule_follows)
1867 {
1868 DECODE_COMPOSITION_RULE (c1);
1869 continue;
1870 }
1871 if (charset0 < 0 || CHARSET_CHARS (charset0) == 94)
1872 {
1873 /* This is SPACE or DEL. */
1874 charset = CHARSET_ASCII;
1875 break;
1876 }
1877 /* This is a graphic character, we fall down ... */
1878
1879 case ISO_graphic_plane_0:
1880 if (COMPOSING_P (coding) && coding->composition_rule_follows)
1881 {
1882 DECODE_COMPOSITION_RULE (c1);
1883 continue;
1884 }
1885 charset = charset0;
1886 break;
1887
1888 case ISO_0xA0_or_0xFF:
1889 if (charset1 < 0 || CHARSET_CHARS (charset1) == 94
1890 || coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
1891 goto label_invalid_code;
1892 /* This is a graphic character, we fall down ... */
1893
1894 case ISO_graphic_plane_1:
1895 if (charset1 < 0)
1896 goto label_invalid_code;
1897 charset = charset1;
1898 break;
1899
1900 case ISO_control_0:
1901 if (COMPOSING_P (coding))
1902 DECODE_COMPOSITION_END ('1');
1903
1904 /* All ISO2022 control characters in this class have the
1905 same representation in Emacs internal format. */
1906 if (c1 == '\n'
1907 && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1908 && (coding->eol_type == CODING_EOL_CR
1909 || coding->eol_type == CODING_EOL_CRLF))
1910 {
1911 coding->result = CODING_FINISH_INCONSISTENT_EOL;
1912 goto label_end_of_loop;
1913 }
1914 charset = CHARSET_ASCII;
1915 break;
1916
1917 case ISO_control_1:
1918 if (COMPOSING_P (coding))
1919 DECODE_COMPOSITION_END ('1');
1920 goto label_invalid_code;
1921
1922 case ISO_carriage_return:
1923 if (COMPOSING_P (coding))
1924 DECODE_COMPOSITION_END ('1');
1925
1926 if (coding->eol_type == CODING_EOL_CR)
1927 c1 = '\n';
1928 else if (coding->eol_type == CODING_EOL_CRLF)
1929 {
1930 ONE_MORE_BYTE (c1);
1931 if (c1 != ISO_CODE_LF)
1932 {
1933 src--;
1934 c1 = '\r';
1935 }
1936 }
1937 charset = CHARSET_ASCII;
1938 break;
1939
1940 case ISO_shift_out:
1941 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1942 || CODING_SPEC_ISO_DESIGNATION (coding, 1) < 0)
1943 goto label_invalid_code;
1944 CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
1945 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1946 continue;
1947
1948 case ISO_shift_in:
1949 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
1950 goto label_invalid_code;
1951 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
1952 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1953 continue;
1954
1955 case ISO_single_shift_2_7:
1956 case ISO_single_shift_2:
1957 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1958 goto label_invalid_code;
1959 /* SS2 is handled as an escape sequence of ESC 'N' */
1960 c1 = 'N';
1961 goto label_escape_sequence;
1962
1963 case ISO_single_shift_3:
1964 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1965 goto label_invalid_code;
1966 /* SS2 is handled as an escape sequence of ESC 'O' */
1967 c1 = 'O';
1968 goto label_escape_sequence;
1969
1970 case ISO_control_sequence_introducer:
1971 /* CSI is handled as an escape sequence of ESC '[' ... */
1972 c1 = '[';
1973 goto label_escape_sequence;
1974
1975 case ISO_escape:
1976 ONE_MORE_BYTE (c1);
1977 label_escape_sequence:
1978 /* Escape sequences handled by Emacs are invocation,
1979 designation, direction specification, and character
1980 composition specification. */
1981 switch (c1)
1982 {
1983 case '&': /* revision of following character set */
1984 ONE_MORE_BYTE (c1);
1985 if (!(c1 >= '@' && c1 <= '~'))
1986 goto label_invalid_code;
1987 ONE_MORE_BYTE (c1);
1988 if (c1 != ISO_CODE_ESC)
1989 goto label_invalid_code;
1990 ONE_MORE_BYTE (c1);
1991 goto label_escape_sequence;
1992
1993 case '$': /* designation of 2-byte character set */
1994 if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
1995 goto label_invalid_code;
1996 ONE_MORE_BYTE (c1);
1997 if (c1 >= '@' && c1 <= 'B')
1998 { /* designation of JISX0208.1978, GB2312.1980,
1999 or JISX0208.1980 */
2000 DECODE_DESIGNATION (0, 2, 94, c1);
2001 }
2002 else if (c1 >= 0x28 && c1 <= 0x2B)
2003 { /* designation of DIMENSION2_CHARS94 character set */
2004 ONE_MORE_BYTE (c2);
2005 DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
2006 }
2007 else if (c1 >= 0x2C && c1 <= 0x2F)
2008 { /* designation of DIMENSION2_CHARS96 character set */
2009 ONE_MORE_BYTE (c2);
2010 DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
2011 }
2012 else
2013 goto label_invalid_code;
2014 /* We must update these variables now. */
2015 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
2016 charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
2017 continue;
2018
2019 case 'n': /* invocation of locking-shift-2 */
2020 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
2021 || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
2022 goto label_invalid_code;
2023 CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
2024 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
2025 continue;
2026
2027 case 'o': /* invocation of locking-shift-3 */
2028 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
2029 || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
2030 goto label_invalid_code;
2031 CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
2032 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
2033 continue;
2034
2035 case 'N': /* invocation of single-shift-2 */
2036 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2037 || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
2038 goto label_invalid_code;
2039 charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
2040 ONE_MORE_BYTE (c1);
2041 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
2042 goto label_invalid_code;
2043 break;
2044
2045 case 'O': /* invocation of single-shift-3 */
2046 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2047 || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
2048 goto label_invalid_code;
2049 charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
2050 ONE_MORE_BYTE (c1);
2051 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
2052 goto label_invalid_code;
2053 break;
2054
2055 case '0': case '2': case '3': case '4': /* start composition */
2056 DECODE_COMPOSITION_START (c1);
2057 continue;
2058
2059 case '1': /* end composition */
2060 DECODE_COMPOSITION_END (c1);
2061 continue;
2062
2063 case '[': /* specification of direction */
2064 if (coding->flags & CODING_FLAG_ISO_NO_DIRECTION)
2065 goto label_invalid_code;
2066 /* For the moment, nested direction is not supported.
2067 So, `coding->mode & CODING_MODE_DIRECTION' zero means
2068 left-to-right, and nonzero means right-to-left. */
2069 ONE_MORE_BYTE (c1);
2070 switch (c1)
2071 {
2072 case ']': /* end of the current direction */
2073 coding->mode &= ~CODING_MODE_DIRECTION;
2074
2075 case '0': /* end of the current direction */
2076 case '1': /* start of left-to-right direction */
2077 ONE_MORE_BYTE (c1);
2078 if (c1 == ']')
2079 coding->mode &= ~CODING_MODE_DIRECTION;
2080 else
2081 goto label_invalid_code;
2082 break;
2083
2084 case '2': /* start of right-to-left direction */
2085 ONE_MORE_BYTE (c1);
2086 if (c1 == ']')
2087 coding->mode |= CODING_MODE_DIRECTION;
2088 else
2089 goto label_invalid_code;
2090 break;
2091
2092 default:
2093 goto label_invalid_code;
2094 }
2095 continue;
2096
2097 case '%':
2098 if (COMPOSING_P (coding))
2099 DECODE_COMPOSITION_END ('1');
2100 ONE_MORE_BYTE (c1);
2101 if (c1 == '/')
2102 {
2103 /* CTEXT extended segment:
2104 ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
2105 We keep these bytes as is for the moment.
2106 They may be decoded by post-read-conversion. */
2107 int dim, M, L;
2108 int size, required;
2109 int produced_chars;
2110
2111 ONE_MORE_BYTE (dim);
2112 ONE_MORE_BYTE (M);
2113 ONE_MORE_BYTE (L);
2114 size = ((M - 128) * 128) + (L - 128);
2115 required = 8 + size * 2;
2116 if (dst + required > (dst_bytes ? dst_end : src))
2117 goto label_end_of_loop;
2118 *dst++ = ISO_CODE_ESC;
2119 *dst++ = '%';
2120 *dst++ = '/';
2121 *dst++ = dim;
2122 produced_chars = 4;
2123 dst += CHAR_STRING (M, dst), produced_chars++;
2124 dst += CHAR_STRING (L, dst), produced_chars++;
2125 while (size-- > 0)
2126 {
2127 ONE_MORE_BYTE (c1);
2128 dst += CHAR_STRING (c1, dst), produced_chars++;
2129 }
2130 coding->produced_char += produced_chars;
2131 }
2132 else if (c1 == 'G')
2133 {
2134 unsigned char *d = dst;
2135 int produced_chars;
2136
2137 /* XFree86 extension for embedding UTF-8 in CTEXT:
2138 ESC % G --UTF-8-BYTES-- ESC % @
2139 We keep these bytes as is for the moment.
2140 They may be decoded by post-read-conversion. */
2141 if (d + 6 > (dst_bytes ? dst_end : src))
2142 goto label_end_of_loop;
2143 *d++ = ISO_CODE_ESC;
2144 *d++ = '%';
2145 *d++ = 'G';
2146 produced_chars = 3;
2147 while (d + 1 < (dst_bytes ? dst_end : src))
2148 {
2149 ONE_MORE_BYTE (c1);
2150 if (c1 == ISO_CODE_ESC
2151 && src + 1 < src_end
2152 && src[0] == '%'
2153 && src[1] == '@')
2154 {
2155 src += 2;
2156 break;
2157 }
2158 d += CHAR_STRING (c1, d), produced_chars++;
2159 }
2160 if (d + 3 > (dst_bytes ? dst_end : src))
2161 goto label_end_of_loop;
2162 *d++ = ISO_CODE_ESC;
2163 *d++ = '%';
2164 *d++ = '@';
2165 dst = d;
2166 coding->produced_char += produced_chars + 3;
2167 }
2168 else
2169 goto label_invalid_code;
2170 continue;
2171
2172 default:
2173 if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
2174 goto label_invalid_code;
2175 if (c1 >= 0x28 && c1 <= 0x2B)
2176 { /* designation of DIMENSION1_CHARS94 character set */
2177 ONE_MORE_BYTE (c2);
2178 DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
2179 }
2180 else if (c1 >= 0x2C && c1 <= 0x2F)
2181 { /* designation of DIMENSION1_CHARS96 character set */
2182 ONE_MORE_BYTE (c2);
2183 DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
2184 }
2185 else
2186 goto label_invalid_code;
2187 /* We must update these variables now. */
2188 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
2189 charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
2190 continue;
2191 }
2192 }
2193
2194 /* Now we know CHARSET and 1st position code C1 of a character.
2195 Produce a multibyte sequence for that character while getting
2196 2nd position code C2 if necessary. */
2197 if (CHARSET_DIMENSION (charset) == 2)
2198 {
2199 ONE_MORE_BYTE (c2);
2200 if (c1 < 0x80 ? c2 < 0x20 || c2 >= 0x80 : c2 < 0xA0)
2201 /* C2 is not in a valid range. */
2202 goto label_invalid_code;
2203 }
2204 c = DECODE_ISO_CHARACTER (charset, c1, c2);
2205 EMIT_CHAR (c);
2206 continue;
2207
2208 label_invalid_code:
2209 coding->errors++;
2210 if (COMPOSING_P (coding))
2211 DECODE_COMPOSITION_END ('1');
2212 src = src_base;
2213 c = *src++;
2214 if (! NILP (translation_table))
2215 c = translate_char (translation_table, c, 0, 0, 0);
2216 EMIT_CHAR (c);
2217 }
2218
2219 label_end_of_loop:
2220 coding->consumed = coding->consumed_char = src_base - source;
2221 coding->produced = dst - destination;
2222 return;
2223}
2224
2225
2226/* ISO2022 encoding stuff. */
2227
2228/*
2229 It is not enough to say just "ISO2022" on encoding, we have to
2230 specify more details. In Emacs, each ISO2022 coding system
2231 variant has the following specifications:
2232 1. Initial designation to G0 through G3.
2233 2. Allows short-form designation?
2234 3. ASCII should be designated to G0 before control characters?
2235 4. ASCII should be designated to G0 at end of line?
2236 5. 7-bit environment or 8-bit environment?
2237 6. Use locking-shift?
2238 7. Use Single-shift?
2239 And the following two are only for Japanese:
2240 8. Use ASCII in place of JIS0201-1976-Roman?
2241 9. Use JISX0208-1983 in place of JISX0208-1978?
2242 These specifications are encoded in `coding->flags' as flag bits
2243 defined by macros CODING_FLAG_ISO_XXX. See `coding.h' for more
2244 details.
2245*/
2246
/* Emit at DST the escape sequence designating CHARSET to graphic
   register REG, advance DST past the bytes written, and record the
   new designation in CODING.

   When the revision number of CHARSET is below 255, the sequence
   ESC '&' <'@' + revision> is emitted first.  For a charset whose
   dimension is not 1, '$' follows ESC.  The intermediate character
   is taken from "()*+" (94-char sets) or ",-./" (other sets), indexed
   by REG; it is omitted (short form) only for a multi-dimensional
   94-char set designated to register 0 whose final character is
   '@', 'A', or 'B' when CODING has CODING_FLAG_ISO_SHORT_FORM set.

   The macro writes through the variable `dst' of the expansion site.  */

#define ENCODE_DESIGNATION(charset, reg, coding) \
  do { \
    const char *desig_inter_94 = "()*+"; \
    const char *desig_inter_96 = ",-./"; \
    unsigned char desig_final = CHARSET_ISO_FINAL_CHAR (charset); \
    int desig_revision \
      = CODING_SPEC_ISO_REVISION_NUMBER (coding, charset); \
 \
    if (desig_revision < 255) \
      { \
        *dst++ = ISO_CODE_ESC; \
        *dst++ = '&'; \
        *dst++ = '@' + desig_revision; \
      } \
    *dst++ = ISO_CODE_ESC; \
    if (CHARSET_DIMENSION (charset) != 1) \
      { \
        *dst++ = '$'; \
        if (CHARSET_CHARS (charset) != 94) \
          *dst++ = (unsigned char) (desig_inter_96[reg]); \
        else if (! ((coding->flags & CODING_FLAG_ISO_SHORT_FORM) \
                    && reg == 0 \
                    && desig_final >= '@' && desig_final <= 'B')) \
          /* Short form is not applicable: keep the intermediate.  */ \
          *dst++ = (unsigned char) (desig_inter_94[reg]); \
      } \
    else \
      *dst++ = (unsigned char) (CHARSET_CHARS (charset) == 94 \
                                ? desig_inter_94[reg] \
                                : desig_inter_96[reg]); \
    *dst++ = desig_final; \
    CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset; \
  } while (0)
2289
/* Emit the ISO2022 single-shift-2 function at `dst': the escape
   sequence ESC 'N' when `coding' has CODING_FLAG_ISO_SEVEN_BITS set,
   otherwise the single code ISO_CODE_SS2.  Afterwards mark `coding'
   as being in the single-shifting state.  */

#define ENCODE_SINGLE_SHIFT_2 \
  do { \
    if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)) \
      *dst++ = ISO_CODE_SS2; \
    else \
      { \
        *dst++ = ISO_CODE_ESC; \
        *dst++ = 'N'; \
      } \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1; \
  } while (0)

/* Likewise for single-shift-3: ESC 'O' or ISO_CODE_SS3.  */

#define ENCODE_SINGLE_SHIFT_3 \
  do { \
    if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)) \
      *dst++ = ISO_CODE_SS3; \
    else \
      { \
        *dst++ = ISO_CODE_ESC; \
        *dst++ = 'O'; \
      } \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1; \
  } while (0)
2311
/* The four macros below emit an ISO2022 locking-shift function at
   `dst' and record in `coding' which graphic register is now invoked
   to graphic plane 0:

     ENCODE_SHIFT_IN          ISO_CODE_SI   register 0
     ENCODE_SHIFT_OUT         ISO_CODE_SO   register 1
     ENCODE_LOCKING_SHIFT_2   ESC 'n'       register 2
     ENCODE_LOCKING_SHIFT_3   ESC 'o'       register 3  */

#define ENCODE_SHIFT_IN \
  do { \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 0; \
    *dst++ = ISO_CODE_SI; \
  } while (0)

#define ENCODE_SHIFT_OUT \
  do { \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 1; \
    *dst++ = ISO_CODE_SO; \
  } while (0)

#define ENCODE_LOCKING_SHIFT_2 \
  do { \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 2; \
    *dst++ = ISO_CODE_ESC; \
    *dst++ = 'n'; \
  } while (0)

#define ENCODE_LOCKING_SHIFT_3 \
  do { \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 3; \
    *dst++ = ISO_CODE_ESC; \
    *dst++ = 'o'; \
  } while (0)
2339
/* Emit at `dst' the one-byte code of a DIMENSION1 character whose
   charset is CHARSET and whose position-code is C1.

   The byte is produced as soon as one of these holds, tested in
   this order:
     - `coding' is in the single-shifting state: the byte is C1 with
       the high bit cleared (7-bit coding) or set (otherwise), and the
       single-shifting state is cleared;
     - CHARSET is invoked to graphic plane 0: C1 with high bit cleared;
     - CHARSET is invoked to graphic plane 1: C1 with high bit set.
   Otherwise encode_invocation_designation is called to produce the
   needed designation/invocation codes and the tests are repeated.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1) \
  do { \
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding)) \
      { \
        *dst++ = ((coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
                  ? (c1 & 0x7F) : (c1 | 0x80)); \
        CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0; \
        break; \
      } \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0)) \
      { \
        *dst++ = c1 & 0x7F; \
        break; \
      } \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1)) \
      { \
        *dst++ = c1 | 0x80; \
        break; \
      } \
    /* CHARSET is not invoked to either plane yet: emit the codes \
       that make it available, then go round again.  */ \
    dst = encode_invocation_designation (charset, coding, dst); \
  } while (1)
2372
/* Emit at `dst' the two-byte code of a DIMENSION2 character whose
   charset is CHARSET and whose position-codes are C1 and C2.

   The selection logic mirrors the one-byte case: in the
   single-shifting state both bytes get the high bit cleared (7-bit
   coding) or set (otherwise) and the state is cleared; if CHARSET is
   invoked to plane 0 both bytes have the high bit cleared; if it is
   invoked to plane 1 both have it set.  Otherwise
   encode_invocation_designation produces the required designation
   and invocation codes and the tests are repeated.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2) \
  do { \
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding)) \
      { \
        if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
          { \
            *dst++ = c1 & 0x7F; \
            *dst++ = c2 & 0x7F; \
          } \
        else \
          { \
            *dst++ = c1 | 0x80; \
            *dst++ = c2 | 0x80; \
          } \
        CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0; \
        break; \
      } \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0)) \
      { \
        *dst++ = c1 & 0x7F; \
        *dst++ = c2 & 0x7F; \
        break; \
      } \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1)) \
      { \
        *dst++ = c1 | 0x80; \
        *dst++ = c2 | 0x80; \
        break; \
      } \
    /* CHARSET is not invoked to either plane yet: emit the codes \
       that make it available, then go round again.  */ \
    dst = encode_invocation_designation (charset, coding, dst); \
  } while (1)
2405
/* Encode character C at `dst'.  C is split into a charset and
   position codes with SPLIT_CHAR.  If the charset is not defined,
   the first position code is emitted as is, followed by the second
   one when it is non-negative.  Otherwise the character is emitted
   through the DIMENSION1 or DIMENSION2 macro, after these charset
   substitutions requested by `coding->flags':
     CODING_FLAG_ISO_USE_ROMAN:  ASCII -> charset_latin_jisx0201
     CODING_FLAG_ISO_USE_OLDJIS: charset_jisx0208 -> charset_jisx0208_1978  */

#define ENCODE_ISO_CHARACTER(c) \
  do { \
    int iso_charset, iso_c1, iso_c2; \
 \
    SPLIT_CHAR (c, iso_charset, iso_c1, iso_c2); \
    if (! CHARSET_DEFINED_P (iso_charset)) \
      { \
        *dst++ = iso_c1; \
        if (iso_c2 >= 0) \
          *dst++ = iso_c2; \
      } \
    else if (CHARSET_DIMENSION (iso_charset) != 1) \
      { \
        if (iso_charset == charset_jisx0208 \
            && (coding->flags & CODING_FLAG_ISO_USE_OLDJIS)) \
          iso_charset = charset_jisx0208_1978; \
        ENCODE_ISO_CHARACTER_DIMENSION2 (iso_charset, iso_c1, iso_c2); \
      } \
    else \
      { \
        if (iso_charset == CHARSET_ASCII \
            && (coding->flags & CODING_FLAG_ISO_USE_ROMAN)) \
          iso_charset = charset_latin_jisx0201; \
        ENCODE_ISO_CHARACTER_DIMENSION1 (iso_charset, iso_c1); \
      } \
  } while (0)
2435
2436
/* Instead of encoding character C, encode
   CODING_REPLACEMENT_CHARACTER: twice when the width of C's charset
   is greater than 1, once otherwise.  */

#define ENCODE_UNSAFE_CHARACTER(c) \
  do { \
    int unsafe_left = CHARSET_WIDTH (CHAR_CHARSET (c)) > 1 ? 2 : 1; \
 \
    while (unsafe_left-- > 0) \
      ENCODE_ISO_CHARACTER (CODING_REPLACEMENT_CHARACTER); \
  } while (0)
2445
2446
2447/* Produce designation and invocation codes at a place pointed by DST
2448 to use CHARSET. The element `spec.iso2022' of *CODING is updated.
2449 Return new DST. */
2450
2451unsigned char *
2452encode_invocation_designation (charset, coding, dst)
2453 int charset;
2454 struct coding_system *coding;
2455 unsigned char *dst;
2456{
2457 int reg; /* graphic register number */
2458
2459 /* At first, check designations. */
2460 for (reg = 0; reg < 4; reg++)
2461 if (charset == CODING_SPEC_ISO_DESIGNATION (coding, reg))
2462 break;
2463
2464 if (reg >= 4)
2465 {
2466 /* CHARSET is not yet designated to any graphic registers. */
2467 /* At first check the requested designation. */
2468 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
2469 if (reg == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
2470 /* Since CHARSET requests no special designation, designate it
2471 to graphic register 0. */
2472 reg = 0;
2473
2474 ENCODE_DESIGNATION (charset, reg, coding);
2475 }
2476
2477 if (CODING_SPEC_ISO_INVOCATION (coding, 0) != reg
2478 && CODING_SPEC_ISO_INVOCATION (coding, 1) != reg)
2479 {
2480 /* Since the graphic register REG is not invoked to any graphic
2481 planes, invoke it to graphic plane 0. */
2482 switch (reg)
2483 {
2484 case 0: /* graphic register 0 */
2485 ENCODE_SHIFT_IN;
2486 break;
2487
2488 case 1: /* graphic register 1 */
2489 ENCODE_SHIFT_OUT;
2490 break;
2491
2492 case 2: /* graphic register 2 */
2493 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2494 ENCODE_SINGLE_SHIFT_2;
2495 else
2496 ENCODE_LOCKING_SHIFT_2;
2497 break;
2498
2499 case 3: /* graphic register 3 */
2500 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2501 ENCODE_SINGLE_SHIFT_3;
2502 else
2503 ENCODE_LOCKING_SHIFT_3;
2504 break;
2505 }
2506 }
2507
2508 return dst;
2509}
2510
/* Emit at `dst' the two bytes that represent the encoded composition
   rule RULE.  RULE is split by COMPOSITION_DECODE_RULE into two
   values; the first byte is 32 + 81 + the first value and the second
   byte is 32 + the second value.  */

#define ENCODE_COMPOSITION_RULE(rule) \
  do { \
    int rule_gref, rule_nref; \
 \
    COMPOSITION_DECODE_RULE (rule, rule_gref, rule_nref); \
    dst[0] = 32 + 81 + rule_gref; \
    dst[1] = 32 + rule_nref; \
    dst += 2; \
  } while (0)
2520
/* Emit at `dst' the escape sequence that starts a composition and
   record the composition method, taken from DATA[3], in CODING.
   DATA points to an array of integers describing the composition
   (see the comment in coding.h for its format).

   ESC '0' is emitted for COMPOSITION_RELATIVE.  For the other
   methods ESC '3' (COMPOSITION_WITH_ALTCHARS) or ESC '4' (anything
   else) is emitted, the component index of CODING is set to four
   entries past the start of the current composition data, and the
   rule-follows flag is cleared.  */

#define ENCODE_COMPOSITION_START(coding, data) \
  do { \
    coding->composing = data[3]; \
    *dst++ = ISO_CODE_ESC; \
    if (coding->composing != COMPOSITION_RELATIVE) \
      { \
        if (coding->composing == COMPOSITION_WITH_ALTCHARS) \
          *dst++ = '3'; \
        else \
          *dst++ = '4'; \
        coding->cmp_data_index = coding->cmp_data_start + 4; \
        coding->composition_rule_follows = 0; \
      } \
    else \
      *dst++ = '0'; \
  } while (0)
2540
/* Emit ESC '1' at `dst' to end the current composition, mark CODING
   as no longer composing, and advance its composition data start by
   DATA[0].  If that reaches the used size of the current composition
   data block and a next block exists, switch to the next block and
   restart from its beginning.  */

#define ENCODE_COMPOSITION_END(coding, data) \
  do { \
    *dst++ = ISO_CODE_ESC; \
    *dst++ = '1'; \
    coding->composing = COMPOSITION_NO; \
    coding->cmp_data_start += data[0]; \
    if (coding->cmp_data_start == coding->cmp_data->used) \
      { \
        if (coding->cmp_data->next) \
          { \
            coding->cmp_data = coding->cmp_data->next; \
            coding->cmp_data_start = 0; \
          } \
      } \
  } while (0)
2556
/* Emit the composition start sequence ESC '0' at `dst' and set the
   composition method of CODING to COMPOSITION_RELATIVE.  The sequence
   does not open a new composition here: it tells the reader that the
   components (alternate chars and composition rules) have just been
   produced and that the actual text of the composition follows.  */

#define ENCODE_COMPOSITION_FAKE_START(coding) \
  do { \
    coding->composing = COMPOSITION_RELATIVE; \
    dst[0] = ISO_CODE_ESC; \
    dst[1] = '0'; \
    dst += 2; \
  } while (0)
2568
/* The following three macros produce codes for indicating direction
   of text.  All of them write through `dst' and read `coding' at the
   expansion site, and each expands to a single statement.  */

/* Emit the control sequence introducer: ESC '[' when `coding' has
   CODING_FLAG_ISO_SEVEN_BITS set, otherwise the single code
   ISO_CODE_CSI.  The flag is tested with `&', as the other encoder
   macros do, because `coding->flags' is a set of flag bits and other
   bits may be set alongside the 7-bit flag.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER \
  do { \
    if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
      { \
        *dst++ = ISO_CODE_ESC; \
        *dst++ = '['; \
      } \
    else \
      *dst++ = ISO_CODE_CSI; \
  } while (0)

/* Emit CSI '2' ']': start of right-to-left text.  */
#define ENCODE_DIRECTION_R2L \
  do { \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER; \
    *dst++ = '2'; \
    *dst++ = ']'; \
  } while (0)

/* Emit CSI '0' ']': end of the current direction (left-to-right).  */
#define ENCODE_DIRECTION_L2R \
  do { \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER; \
    *dst++ = '0'; \
    *dst++ = ']'; \
  } while (0)
2584
/* Emit at `dst' the codes that bring the graphic planes and
   registers of `coding' back to the initial state: a shift-in when
   graphic plane 0 does not currently invoke register 0, then, for
   each register 0..3 that has an initial designation (>= 0)
   differing from the current one, a designation of the initial
   charset.  */
#define ENCODE_RESET_PLANE_AND_REGISTER \
  do { \
    int reset_reg = 0; \
 \
    if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0) \
      ENCODE_SHIFT_IN; \
    while (reset_reg < 4) \
      { \
        if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reset_reg) >= 0 \
            && (CODING_SPEC_ISO_DESIGNATION (coding, reset_reg) \
                != CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, \
                                                        reset_reg))) \
          ENCODE_DESIGNATION \
            (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reset_reg), \
             reset_reg, coding); \
        reset_reg++; \
      } \
  } while (0)
2599
2600/* Produce designation sequences of charsets in the line started from
2601 SRC to a place pointed by DST, and return updated DST.
2602
2603 If the current block ends before any end-of-line, we may fail to
2604 find all the necessary designations. */
2605
2606static unsigned char *
2607encode_designation_at_bol (coding, translation_table, src, src_end, dst)
2608 struct coding_system *coding;
2609 Lisp_Object translation_table;
2610 const unsigned char *src, *src_end;
2611 unsigned char *dst;
2612{
2613 int charset, c, found = 0, reg;
2614 /* Table of charsets to be designated to each graphic register. */
2615 int r[4];
2616
2617 for (reg = 0; reg < 4; reg++)
2618 r[reg] = -1;
2619
2620 while (found < 4)
2621 {
2622 ONE_MORE_CHAR (c);
2623 if (c == '\n')
2624 break;
2625
2626 charset = CHAR_CHARSET (c);
2627 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
2628 if (reg != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION && r[reg] < 0)
2629 {
2630 found++;
2631 r[reg] = charset;
2632 }
2633 }
2634
2635 label_end_of_loop:
2636 if (found)
2637 {
2638 for (reg = 0; reg < 4; reg++)
2639 if (r[reg] >= 0
2640 && CODING_SPEC_ISO_DESIGNATION (coding, reg) != r[reg])
2641 ENCODE_DESIGNATION (r[reg], reg, coding);
2642 }
2643
2644 return dst;
2645}
2646
2647/* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
2648
2649static void
2650encode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
2651 struct coding_system *coding;
2652 const unsigned char *source;
2653 unsigned char *destination;
2654 int src_bytes, dst_bytes;
2655{
2656 const unsigned char *src = source;
2657 const unsigned char *src_end = source + src_bytes;
2658 unsigned char *dst = destination;
2659 unsigned char *dst_end = destination + dst_bytes;
2660 /* Since the maximum bytes produced by each loop is 20, we subtract 19
2661 from DST_END to assure overflow checking is necessary only at the
2662 head of loop. */
2663 unsigned char *adjusted_dst_end = dst_end - 19;
2664 /* SRC_BASE remembers the start position in source in each loop.
2665 The loop will be exited when there's not enough source text to
2666 analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
2667 there's not enough destination area to produce encoded codes
2668 (within macro EMIT_BYTES). */
2669 const unsigned char *src_base;
2670 int c;
2671 Lisp_Object translation_table;
2672 Lisp_Object safe_chars;
2673
2674 if (coding->flags & CODING_FLAG_ISO_SAFE)
2675 coding->mode |= CODING_MODE_INHIBIT_UNENCODABLE_CHAR;
2676
2677 safe_chars = coding_safe_chars (coding->symbol);
2678
2679 if (NILP (Venable_character_translation))
2680 translation_table = Qnil;
2681 else
2682 {
2683 translation_table = coding->translation_table_for_encode;
2684 if (NILP (translation_table))
2685 translation_table = Vstandard_translation_table_for_encode;
2686 }
2687
2688 coding->consumed_char = 0;
2689 coding->errors = 0;
2690 while (1)
2691 {
2692 src_base = src;
2693
2694 if (dst >= (dst_bytes ? adjusted_dst_end : (src - 19)))
2695 {
2696 coding->result = CODING_FINISH_INSUFFICIENT_DST;
2697 break;
2698 }
2699
2700 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL
2701 && CODING_SPEC_ISO_BOL (coding))
2702 {
2703 /* We have to produce designation sequences if any now. */
2704 dst = encode_designation_at_bol (coding, translation_table,
2705 src, src_end, dst);
2706 CODING_SPEC_ISO_BOL (coding) = 0;
2707 }
2708
2709 /* Check composition start and end. */
2710 if (coding->composing != COMPOSITION_DISABLED
2711 && coding->cmp_data_start < coding->cmp_data->used)
2712 {
2713 struct composition_data *cmp_data = coding->cmp_data;
2714 int *data = cmp_data->data + coding->cmp_data_start;
2715 int this_pos = cmp_data->char_offset + coding->consumed_char;
2716
2717 if (coding->composing == COMPOSITION_RELATIVE)
2718 {
2719 if (this_pos == data[2])
2720 {
2721 ENCODE_COMPOSITION_END (coding, data);
2722 cmp_data = coding->cmp_data;
2723 data = cmp_data->data + coding->cmp_data_start;
2724 }
2725 }
2726 else if (COMPOSING_P (coding))
2727 {
2728 /* COMPOSITION_WITH_ALTCHARS or COMPOSITION_WITH_RULE_ALTCHAR */
2729 if (coding->cmp_data_index == coding->cmp_data_start + data[0])
2730 /* We have consumed components of the composition.
2731 What follows in SRC is the composition's base
2732 text. */
2733 ENCODE_COMPOSITION_FAKE_START (coding);
2734 else
2735 {
2736 int c = cmp_data->data[coding->cmp_data_index++];
2737 if (coding->composition_rule_follows)
2738 {
2739 ENCODE_COMPOSITION_RULE (c);
2740 coding->composition_rule_follows = 0;
2741 }
2742 else
2743 {
2744 if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR
2745 && ! CODING_SAFE_CHAR_P (safe_chars, c))
2746 ENCODE_UNSAFE_CHARACTER (c);
2747 else
2748 ENCODE_ISO_CHARACTER (c);
2749 if (coding->composing == COMPOSITION_WITH_RULE_ALTCHARS)
2750 coding->composition_rule_follows = 1;
2751 }
2752 continue;
2753 }
2754 }
2755 if (!COMPOSING_P (coding))
2756 {
2757 if (this_pos == data[1])
2758 {
2759 ENCODE_COMPOSITION_START (coding, data);
2760 continue;
2761 }
2762 }
2763 }
2764
2765 ONE_MORE_CHAR (c);
2766
2767 /* Now encode the character C. */
2768 if (c < 0x20 || c == 0x7F)
2769 {
2770 if (c == '\r')
2771 {
2772 if (! (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
2773 {
2774 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
2775 ENCODE_RESET_PLANE_AND_REGISTER;
2776 *dst++ = c;
2777 continue;
2778 }
2779 /* fall down to treat '\r' as '\n' ... */
2780 c = '\n';
2781 }
2782 if (c == '\n')
2783 {
2784 if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
2785 ENCODE_RESET_PLANE_AND_REGISTER;
2786 if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL)
2787 bcopy (coding->spec.iso2022.initial_designation,
2788 coding->spec.iso2022.current_designation,
2789 sizeof coding->spec.iso2022.initial_designation);
2790 if (coding->eol_type == CODING_EOL_LF
2791 || coding->eol_type == CODING_EOL_UNDECIDED)
2792 *dst++ = ISO_CODE_LF;
2793 else if (coding->eol_type == CODING_EOL_CRLF)
2794 *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
2795 else
2796 *dst++ = ISO_CODE_CR;
2797 CODING_SPEC_ISO_BOL (coding) = 1;
2798 }
2799 else
2800 {
2801 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
2802 ENCODE_RESET_PLANE_AND_REGISTER;
2803 *dst++ = c;
2804 }
2805 }
2806 else if (ASCII_BYTE_P (c))
2807 ENCODE_ISO_CHARACTER (c);
2808 else if (SINGLE_BYTE_CHAR_P (c))
2809 {
2810 *dst++ = c;
2811 coding->errors++;
2812 }
2813 else if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR
2814 && ! CODING_SAFE_CHAR_P (safe_chars, c))
2815 ENCODE_UNSAFE_CHARACTER (c);
2816 else
2817 ENCODE_ISO_CHARACTER (c);
2818
2819 coding->consumed_char++;
2820 }
2821
2822 label_end_of_loop:
2823 coding->consumed = src_base - source;
2824 coding->produced = coding->produced_char = dst - destination;
2825}
2826
2827\f
2828/*** 4. SJIS and BIG5 handlers ***/
2829
2830/* Although SJIS and BIG5 are not ISO coding systems, they are used
2831 quite widely. So, for the moment, Emacs supports them in the bare
2832 C code. But, in the future, they may be supported only by CCL. */
2833
2834/* SJIS is a coding system encoding three character sets: ASCII, right
2835 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
2836 as is. A character of charset katakana-jisx0201 is encoded by
2837 "position-code + 0x80". A character of charset japanese-jisx0208
2838 is encoded in 2-byte but two position-codes are divided and shifted
2839 so that it fits in the range below.
2840
2841 --- CODE RANGE of SJIS ---
2842 (character set) (range)
2843 ASCII 0x00 .. 0x7F
2844 KATAKANA-JISX0201 0xA1 .. 0xDF
2845 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF
2846 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
2847 -------------------------------
2848
2849*/
2850
2851/* BIG5 is a coding system encoding two character sets: ASCII and
2852 Big5. An ASCII character is encoded as is. Big5 is a two-byte
2853 character set and is encoded in two bytes.
2854
2855 --- CODE RANGE of BIG5 ---
2856 (character set) (range)
2857 ASCII 0x00 .. 0x7F
2858 Big5 (1st byte) 0xA1 .. 0xFE
2859 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
2860 --------------------------
2861
2862 Since the number of characters in Big5 is larger than maximum
2863 characters in Emacs' charset (96x96), it can't be handled as one
2864 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
2865 and `charset-big5-2'. Both are DIMENSION2 and CHARS94. The former
2866 contains frequently used characters and the latter contains less
2867 frequently used characters. */
2868
/* Macros to decode or encode a character of Big5 in BIG5.  B1 and B2
   are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
   C1 and C2 are the 1st and 2nd position-codes of Emacs' internal
   format.  CHARSET is `charset_big5_1' or `charset_big5_2'.

   Both macros read their input arguments exactly once, before any
   output argument is assigned, so an expression may be passed as an
   input and the same variables may be used for input and output
   (e.g. DECODE_BIG5 (c1, c2, charset, c1, c2)).  */

/* Number of Big5 characters which have the same code in 1st byte.  */
#define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)

/* Convert the BIG5 byte pair B1, B2 into CHARSET, C1 and C2.  A
   first byte below 0xC9 selects charset_big5_1, anything else
   charset_big5_2.  */
#define DECODE_BIG5(b1, b2, charset, c1, c2) \
  do { \
    int big5_b1 = (b1); \
    int big5_b2 = (b2); \
    /* Linear index of the character within the whole Big5 table.  */ \
    unsigned int big5_temp \
      = ((big5_b1 - 0xA1) * BIG5_SAME_ROW \
         + big5_b2 - (big5_b2 < 0x7F ? 0x40 : 0x62)); \
    if (big5_b1 < 0xC9) \
      (charset) = charset_big5_1; \
    else \
      { \
        (charset) = charset_big5_2; \
        big5_temp -= (0xC9 - 0xA1) * BIG5_SAME_ROW; \
      } \
    (c1) = big5_temp / (0xFF - 0xA1) + 0x21; \
    (c2) = big5_temp % (0xFF - 0xA1) + 0x21; \
  } while (0)

/* Convert CHARSET, C1 and C2 into the BIG5 byte pair B1, B2.  */
#define ENCODE_BIG5(charset, c1, c2, b1, b2) \
  do { \
    unsigned int big5_temp \
      = ((c1) - 0x21) * (0xFF - 0xA1) + ((c2) - 0x21); \
    if ((charset) == charset_big5_2) \
      big5_temp += BIG5_SAME_ROW * (0xC9 - 0xA1); \
    (b1) = big5_temp / BIG5_SAME_ROW + 0xA1; \
    /* Offsets below 0x3F map to 0x40..0x7E, the rest to 0xA1..0xFE.  */ \
    big5_temp %= BIG5_SAME_ROW; \
    (b2) = big5_temp + (big5_temp < 0x3F ? 0x40 : 0x62); \
  } while (0)
2901
2902/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2903 Check if a text is encoded in SJIS. If it is, return
2904 CODING_CATEGORY_MASK_SJIS, else return 0. */
2905
2906static int
2907detect_coding_sjis (src, src_end, multibytep)
2908 unsigned char *src, *src_end;
2909 int multibytep;
2910{
2911 int c;
2912 /* Dummy for ONE_MORE_BYTE. */
2913 struct coding_system dummy_coding;
2914 struct coding_system *coding = &dummy_coding;
2915
2916 while (1)
2917 {
2918 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2919 if (c < 0x80)
2920 continue;
2921 if (c == 0x80 || c == 0xA0 || c > 0xEF)
2922 return 0;
2923 if (c <= 0x9F || c >= 0xE0)
2924 {
2925 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2926 if (c < 0x40 || c == 0x7F || c > 0xFC)
2927 return 0;
2928 }
2929 }
2930 label_end_of_loop:
2931 return CODING_CATEGORY_MASK_SJIS;
2932}
2933
2934/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2935 Check if a text is encoded in BIG5. If it is, return
2936 CODING_CATEGORY_MASK_BIG5, else return 0. */
2937
2938static int
2939detect_coding_big5 (src, src_end, multibytep)
2940 unsigned char *src, *src_end;
2941 int multibytep;
2942{
2943 int c;
2944 /* Dummy for ONE_MORE_BYTE. */
2945 struct coding_system dummy_coding;
2946 struct coding_system *coding = &dummy_coding;
2947
2948 while (1)
2949 {
2950 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2951 if (c < 0x80)
2952 continue;
2953 if (c < 0xA1 || c > 0xFE)
2954 return 0;
2955 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2956 if (c < 0x40 || (c > 0x7F && c < 0xA1) || c > 0xFE)
2957 return 0;
2958 }
2959 label_end_of_loop:
2960 return CODING_CATEGORY_MASK_BIG5;
2961}
2962
2963/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2964 Check if a text is encoded in UTF-8. If it is, return
2965 CODING_CATEGORY_MASK_UTF_8, else return 0. */
2966
/* Nonzero if the bits of C selected by MASK are equal to BITS.  */
#define UTF_8_MASKED_EQ_P(c, mask, bits) (((c) & (mask)) == (bits))

/* Classification of an octet C by its high-order bits:
   below 0x80 it is a whole character by itself, 10xxxxxx is a
   trailing octet, and 110xxxxx through 1111110x lead sequences of
   2 through 6 octets.  */
#define UTF_8_1_OCTET_P(c)         ((c) < 0x80)
#define UTF_8_EXTRA_OCTET_P(c)     UTF_8_MASKED_EQ_P (c, 0xC0, 0x80)
#define UTF_8_2_OCTET_LEADING_P(c) UTF_8_MASKED_EQ_P (c, 0xE0, 0xC0)
#define UTF_8_3_OCTET_LEADING_P(c) UTF_8_MASKED_EQ_P (c, 0xF0, 0xE0)
#define UTF_8_4_OCTET_LEADING_P(c) UTF_8_MASKED_EQ_P (c, 0xF8, 0xF0)
#define UTF_8_5_OCTET_LEADING_P(c) UTF_8_MASKED_EQ_P (c, 0xFC, 0xF8)
#define UTF_8_6_OCTET_LEADING_P(c) UTF_8_MASKED_EQ_P (c, 0xFE, 0xFC)
2974
2975static int
2976detect_coding_utf_8 (src, src_end, multibytep)
2977 unsigned char *src, *src_end;
2978 int multibytep;
2979{
2980 unsigned char c;
2981 int seq_maybe_bytes;
2982 /* Dummy for ONE_MORE_BYTE. */
2983 struct coding_system dummy_coding;
2984 struct coding_system *coding = &dummy_coding;
2985
2986 while (1)
2987 {
2988 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2989 if (UTF_8_1_OCTET_P (c))
2990 continue;
2991 else if (UTF_8_2_OCTET_LEADING_P (c))
2992 seq_maybe_bytes = 1;
2993 else if (UTF_8_3_OCTET_LEADING_P (c))
2994 seq_maybe_bytes = 2;
2995 else if (UTF_8_4_OCTET_LEADING_P (c))
2996 seq_maybe_bytes = 3;
2997 else if (UTF_8_5_OCTET_LEADING_P (c))
2998 seq_maybe_bytes = 4;
2999 else if (UTF_8_6_OCTET_LEADING_P (c))
3000 seq_maybe_bytes = 5;
3001 else
3002 return 0;
3003
3004 do
3005 {
3006 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
3007 if (!UTF_8_EXTRA_OCTET_P (c))
3008 return 0;
3009 seq_maybe_bytes--;
3010 }
3011 while (seq_maybe_bytes > 0);
3012 }
3013
3014 label_end_of_loop:
3015 return CODING_CATEGORY_MASK_UTF_8;
3016}
3017
3018/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3019 Check if a text is encoded in UTF-16 Big Endian (endian == 1) or
3020 Little Endian (otherwise). If it is, return
3021 CODING_CATEGORY_MASK_UTF_16_BE or CODING_CATEGORY_MASK_UTF_16_LE,
3022 else return 0. */
3023
/* Nonzero if the 16-bit code unit VAL is 0xFFFE or 0xFFFF.  */
#define UTF_16_INVALID_P(val)   \
  (((val) == 0xFFFE)            \
   || ((val) == 0xFFFF))

/* Nonzero if VAL is a high surrogate, i.e. in 0xD800..0xDBFF.
   The top six bits (mask 0xFC00) select the 1024-code-unit block.
   Masking with 0xD800 itself would also accept the low surrogates and
   values such as 0xFFFF.  */
#define UTF_16_HIGH_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xD800)

/* Nonzero if VAL is a low surrogate, i.e. in 0xDC00..0xDFFF.  */
#define UTF_16_LOW_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xDC00)
3033
3034static int
3035detect_coding_utf_16 (src, src_end, multibytep)
3036 unsigned char *src, *src_end;
3037 int multibytep;
3038{
3039 unsigned char c1, c2;
3040 /* Dummy for ONE_MORE_BYTE_CHECK_MULTIBYTE. */
3041 struct coding_system dummy_coding;
3042 struct coding_system *coding = &dummy_coding;
3043
3044 ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep);
3045 ONE_MORE_BYTE_CHECK_MULTIBYTE (c2, multibytep);
3046
3047 if ((c1 == 0xFF) && (c2 == 0xFE))
3048 return CODING_CATEGORY_MASK_UTF_16_LE;
3049 else if ((c1 == 0xFE) && (c2 == 0xFF))
3050 return CODING_CATEGORY_MASK_UTF_16_BE;
3051
3052 label_end_of_loop:
3053 return 0;
3054}
3055
3056/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
3057 If SJIS_P is 1, decode SJIS text, else decode BIG5 test. */
3058
3059static void
3060decode_coding_sjis_big5 (coding, source, destination,
3061 src_bytes, dst_bytes, sjis_p)
3062 struct coding_system *coding;
3063 const unsigned char *source;
3064 unsigned char *destination;
3065 int src_bytes, dst_bytes;
3066 int sjis_p;
3067{
3068 const unsigned char *src = source;
3069 const unsigned char *src_end = source + src_bytes;
3070 unsigned char *dst = destination;
3071 unsigned char *dst_end = destination + dst_bytes;
3072 /* SRC_BASE remembers the start position in source in each loop.
3073 The loop will be exited when there's not enough source code
3074 (within macro ONE_MORE_BYTE), or when there's not enough
3075 destination area to produce a character (within macro
3076 EMIT_CHAR). */
3077 const unsigned char *src_base;
3078 Lisp_Object translation_table;
3079
3080 if (NILP (Venable_character_translation))
3081 translation_table = Qnil;
3082 else
3083 {
3084 translation_table = coding->translation_table_for_decode;
3085 if (NILP (translation_table))
3086 translation_table = Vstandard_translation_table_for_decode;
3087 }
3088
3089 coding->produced_char = 0;
3090 while (1)
3091 {
3092 int c, charset, c1, c2 = 0;
3093
3094 src_base = src;
3095 ONE_MORE_BYTE (c1);
3096
3097 if (c1 < 0x80)
3098 {
3099 charset = CHARSET_ASCII;
3100 if (c1 < 0x20)
3101 {
3102 if (c1 == '\r')
3103 {
3104 if (coding->eol_type == CODING_EOL_CRLF)
3105 {
3106 ONE_MORE_BYTE (c2);
3107 if (c2 == '\n')
3108 c1 = c2;
3109 else
3110 /* To process C2 again, SRC is subtracted by 1. */
3111 src--;
3112 }
3113 else if (coding->eol_type == CODING_EOL_CR)
3114 c1 = '\n';
3115 }
3116 else if (c1 == '\n'
3117 && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
3118 && (coding->eol_type == CODING_EOL_CR
3119 || coding->eol_type == CODING_EOL_CRLF))
3120 {
3121 coding->result = CODING_FINISH_INCONSISTENT_EOL;
3122 goto label_end_of_loop;
3123 }
3124 }
3125 }
3126 else
3127 {
3128 if (sjis_p)
3129 {
3130 if (c1 == 0x80 || c1 == 0xA0 || c1 > 0xEF)
3131 goto label_invalid_code;
3132 if (c1 <= 0x9F || c1 >= 0xE0)
3133 {
3134 /* SJIS -> JISX0208 */
3135 ONE_MORE_BYTE (c2);
3136 if (c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
3137 goto label_invalid_code;
3138 DECODE_SJIS (c1, c2, c1, c2);
3139 charset = charset_jisx0208;
3140 }
3141 else
3142 /* SJIS -> JISX0201-Kana */
3143 charset = charset_katakana_jisx0201;
3144 }
3145 else
3146 {
3147 /* BIG5 -> Big5 */
3148 if (c1 < 0xA0 || c1 > 0xFE)
3149 goto label_invalid_code;
3150 ONE_MORE_BYTE (c2);
3151 if (c2 < 0x40 || (c2 > 0x7E && c2 < 0xA1) || c2 > 0xFE)
3152 goto label_invalid_code;
3153 DECODE_BIG5 (c1, c2, charset, c1, c2);
3154 }
3155 }
3156
3157 c = DECODE_ISO_CHARACTER (charset, c1, c2);
3158 EMIT_CHAR (c);
3159 continue;
3160
3161 label_invalid_code:
3162 coding->errors++;
3163 src = src_base;
3164 c = *src++;
3165 EMIT_CHAR (c);
3166 }
3167
3168 label_end_of_loop:
3169 coding->consumed = coding->consumed_char = src_base - source;
3170 coding->produced = dst - destination;
3171 return;
3172}
3173
3174/* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
3175 This function can encode charsets `ascii', `katakana-jisx0201',
3176 `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'. We
3177 are sure that all these charsets are registered as official charset
3178 (i.e. do not have extended leading-codes). Characters of other
3179 charsets are produced without any encoding. If SJIS_P is 1, encode
3180 SJIS text, else encode BIG5 text. */
3181
3182static void
3183encode_coding_sjis_big5 (coding, source, destination,
3184 src_bytes, dst_bytes, sjis_p)
3185 struct coding_system *coding;
3186 unsigned char *source, *destination;
3187 int src_bytes, dst_bytes;
3188 int sjis_p;
3189{
3190 unsigned char *src = source;
3191 unsigned char *src_end = source + src_bytes;
3192 unsigned char *dst = destination;
3193 unsigned char *dst_end = destination + dst_bytes;
3194 /* SRC_BASE remembers the start position in source in each loop.
3195 The loop will be exited when there's not enough source text to
3196 analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
3197 there's not enough destination area to produce encoded codes
3198 (within macro EMIT_BYTES). */
3199 unsigned char *src_base;
3200 Lisp_Object translation_table;
3201
3202 if (NILP (Venable_character_translation))
3203 translation_table = Qnil;
3204 else
3205 {
3206 translation_table = coding->translation_table_for_encode;
3207 if (NILP (translation_table))
3208 translation_table = Vstandard_translation_table_for_encode;
3209 }
3210
3211 while (1)
3212 {
3213 int c, charset, c1, c2;
3214
3215 src_base = src;
3216 ONE_MORE_CHAR (c);
3217
3218 /* Now encode the character C. */
3219 if (SINGLE_BYTE_CHAR_P (c))
3220 {
3221 switch (c)
3222 {
3223 case '\r':
3224 if (!(coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
3225 {
3226 EMIT_ONE_BYTE (c);
3227 break;
3228 }
3229 c = '\n';
3230 case '\n':
3231 if (coding->eol_type == CODING_EOL_CRLF)
3232 {
3233 EMIT_TWO_BYTES ('\r', c);
3234 break;
3235 }
3236 else if (coding->eol_type == CODING_EOL_CR)
3237 c = '\r';
3238 default:
3239 EMIT_ONE_BYTE (c);
3240 }
3241 }
3242 else
3243 {
3244 SPLIT_CHAR (c, charset, c1, c2);
3245 if (sjis_p)
3246 {
3247 if (charset == charset_jisx0208
3248 || charset == charset_jisx0208_1978)
3249 {
3250 ENCODE_SJIS (c1, c2, c1, c2);
3251 EMIT_TWO_BYTES (c1, c2);
3252 }
3253 else if (charset == charset_katakana_jisx0201)
3254 EMIT_ONE_BYTE (c1 | 0x80);
3255 else if (charset == charset_latin_jisx0201)
3256 EMIT_ONE_BYTE (c1);
3257 else if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR)
3258 {
3259 EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER);
3260 if (CHARSET_WIDTH (charset) > 1)
3261 EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER);
3262 }
3263 else
3264 /* There's no way other than producing the internal
3265 codes as is. */
3266 EMIT_BYTES (src_base, src);
3267 }
3268 else
3269 {
3270 if (charset == charset_big5_1 || charset == charset_big5_2)
3271 {
3272 ENCODE_BIG5 (charset, c1, c2, c1, c2);
3273 EMIT_TWO_BYTES (c1, c2);
3274 }
3275 else if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR)
3276 {
3277 EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER);
3278 if (CHARSET_WIDTH (charset) > 1)
3279 EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER);
3280 }
3281 else
3282 /* There's no way other than producing the internal
3283 codes as is. */
3284 EMIT_BYTES (src_base, src);
3285 }
3286 }
3287 coding->consumed_char++;
3288 }
3289
3290 label_end_of_loop:
3291 coding->consumed = src_base - source;
3292 coding->produced = coding->produced_char = dst - destination;
3293}
3294
3295\f
3296/*** 5. CCL handlers ***/
3297
3298/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3299 Check if a text is encoded in a coding system of which
3300 encoder/decoder are written in CCL program. If it is, return
3301 CODING_CATEGORY_MASK_CCL, else return 0. */
3302
3303static int
3304detect_coding_ccl (src, src_end, multibytep)
3305 unsigned char *src, *src_end;
3306 int multibytep;
3307{
3308 unsigned char *valid;
3309 int c;
3310 /* Dummy for ONE_MORE_BYTE. */
3311 struct coding_system dummy_coding;
3312 struct coding_system *coding = &dummy_coding;
3313
3314 /* No coding system is assigned to coding-category-ccl. */
3315 if (!coding_system_table[CODING_CATEGORY_IDX_CCL])
3316 return 0;
3317
3318 valid = coding_system_table[CODING_CATEGORY_IDX_CCL]->spec.ccl.valid_codes;
3319 while (1)
3320 {
3321 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
3322 if (! valid[c])
3323 return 0;
3324 }
3325 label_end_of_loop:
3326 return CODING_CATEGORY_MASK_CCL;
3327}
3328
3329\f
3330/*** 6. End-of-line handlers ***/
3331
3332/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
3333
3334static void
3335decode_eol (coding, source, destination, src_bytes, dst_bytes)
3336 struct coding_system *coding;
3337 const unsigned char *source;
3338 unsigned char *destination;
3339 int src_bytes, dst_bytes;
3340{
3341 const unsigned char *src = source;
3342 unsigned char *dst = destination;
3343 const unsigned char *src_end = src + src_bytes;
3344 unsigned char *dst_end = dst + dst_bytes;
3345 Lisp_Object translation_table;
3346 /* SRC_BASE remembers the start position in source in each loop.
3347 The loop will be exited when there's not enough source code
3348 (within macro ONE_MORE_BYTE), or when there's not enough
3349 destination area to produce a character (within macro
3350 EMIT_CHAR). */
3351 const unsigned char *src_base;
3352 int c;
3353
3354 translation_table = Qnil;
3355 switch (coding->eol_type)
3356 {
3357 case CODING_EOL_CRLF:
3358 while (1)
3359 {
3360 src_base = src;
3361 ONE_MORE_BYTE (c);
3362 if (c == '\r')
3363 {
3364 ONE_MORE_BYTE (c);
3365 if (c != '\n')
3366 {
3367 src--;
3368 c = '\r';
3369 }
3370 }
3371 else if (c == '\n'
3372 && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL))
3373 {
3374 coding->result = CODING_FINISH_INCONSISTENT_EOL;
3375 goto label_end_of_loop;
3376 }
3377 EMIT_CHAR (c);
3378 }
3379 break;
3380
3381 case CODING_EOL_CR:
3382 while (1)
3383 {
3384 src_base = src;
3385 ONE_MORE_BYTE (c);
3386 if (c == '\n')
3387 {
3388 if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
3389 {
3390 coding->result = CODING_FINISH_INCONSISTENT_EOL;
3391 goto label_end_of_loop;
3392 }
3393 }
3394 else if (c == '\r')
3395 c = '\n';
3396 EMIT_CHAR (c);
3397 }
3398 break;
3399
3400 default: /* no need for EOL handling */
3401 while (1)
3402 {
3403 src_base = src;
3404 ONE_MORE_BYTE (c);
3405 EMIT_CHAR (c);
3406 }
3407 }
3408
3409 label_end_of_loop:
3410 coding->consumed = coding->consumed_char = src_base - source;
3411 coding->produced = dst - destination;
3412 return;
3413}
3414
3415/* See "GENERAL NOTES about `encode_coding_XXX ()' functions". Encode
3416 format of end-of-line according to `coding->eol_type'. It also
3417 convert multibyte form 8-bit characters to unibyte if
3418 CODING->src_multibyte is nonzero. If `coding->mode &
3419 CODING_MODE_SELECTIVE_DISPLAY' is nonzero, code '\r' in source text
3420 also means end-of-line. */
3421
3422static void
3423encode_eol (coding, source, destination, src_bytes, dst_bytes)
3424 struct coding_system *coding;
3425 const unsigned char *source;
3426 unsigned char *destination;
3427 int src_bytes, dst_bytes;
3428{
3429 const unsigned char *src = source;
3430 unsigned char *dst = destination;
3431 const unsigned char *src_end = src + src_bytes;
3432 unsigned char *dst_end = dst + dst_bytes;
3433 Lisp_Object translation_table;
3434 /* SRC_BASE remembers the start position in source in each loop.
3435 The loop will be exited when there's not enough source text to
3436 analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
3437 there's not enough destination area to produce encoded codes
3438 (within macro EMIT_BYTES). */
3439 const unsigned char *src_base;
3440 unsigned char *tmp;
3441 int c;
3442 int selective_display = coding->mode & CODING_MODE_SELECTIVE_DISPLAY;
3443
3444 translation_table = Qnil;
3445 if (coding->src_multibyte
3446 && *(src_end - 1) == LEADING_CODE_8_BIT_CONTROL)
3447 {
3448 src_end--;
3449 src_bytes--;
3450 coding->result = CODING_FINISH_INSUFFICIENT_SRC;
3451 }
3452
3453 if (coding->eol_type == CODING_EOL_CRLF)
3454 {
3455 while (src < src_end)
3456 {
3457 src_base = src;
3458 c = *src++;
3459 if (c >= 0x20)
3460 EMIT_ONE_BYTE (c);
3461 else if (c == '\n' || (c == '\r' && selective_display))
3462 EMIT_TWO_BYTES ('\r', '\n');
3463 else
3464 EMIT_ONE_BYTE (c);
3465 }
3466 src_base = src;
3467 label_end_of_loop:
3468 ;
3469 }
3470 else
3471 {
3472 if (!dst_bytes || src_bytes <= dst_bytes)
3473 {
3474 safe_bcopy (src, dst, src_bytes);
3475 src_base = src_end;
3476 dst += src_bytes;
3477 }
3478 else
3479 {
3480 if (coding->src_multibyte
3481 && *(src + dst_bytes - 1) == LEADING_CODE_8_BIT_CONTROL)
3482 dst_bytes--;
3483 safe_bcopy (src, dst, dst_bytes);
3484 src_base = src + dst_bytes;
3485 dst = destination + dst_bytes;
3486 coding->result = CODING_FINISH_INSUFFICIENT_DST;
3487 }
3488 if (coding->eol_type == CODING_EOL_CR)
3489 {
3490 for (tmp = destination; tmp < dst; tmp++)
3491 if (*tmp == '\n') *tmp = '\r';
3492 }
3493 else if (selective_display)
3494 {
3495 for (tmp = destination; tmp < dst; tmp++)
3496 if (*tmp == '\r') *tmp = '\n';
3497 }
3498 }
3499 if (coding->src_multibyte)
3500 dst = destination + str_as_unibyte (destination, dst - destination);
3501
3502 coding->consumed = src_base - source;
3503 coding->produced = dst - destination;
3504 coding->produced_char = coding->produced;
3505}
3506
3507\f
3508/*** 7. C library functions ***/
3509
/* In Emacs Lisp, a coding system is represented by a Lisp symbol which
   has a property `coding-system'.  The value of this property is a
   vector of length 5 (called the coding-vector).  Among elements of
   this vector, the first (element[0]) and the fifth (element[4])
   carry important information for decoding/encoding.  Before
   decoding/encoding, this information should be set in fields of a
   structure of type `coding_system'.

   The value of the property `coding-system' can be a symbol of another
   subsidiary coding-system.  In that case, Emacs gets coding-vector
   from that symbol.

   `element[0]' contains information to be set in `coding->type'.  The
   value and its meaning is as follows:

   0 -- coding_type_emacs_mule
   1 -- coding_type_sjis
   2 -- coding_type_iso2022
   3 -- coding_type_big5
   4 -- coding_type_ccl  encoder/decoder written in CCL
   5 -- coding_type_raw_text
   nil -- coding_type_no_conversion
   t -- coding_type_undecided (automatic conversion on decoding,
                               no-conversion on encoding)

   `element[4]' contains information to be set in `coding->flags' and
   `coding->spec'.  The meaning varies by `coding->type'.

   If `coding->type' is `coding_type_iso2022', element[4] is a vector
   of length 32 (of which the first 17 sub-elements are used now).
   Meanings of these sub-elements are:

   sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
        If the value is an integer of valid charset (or a symbol
        naming a charset), the charset is assumed to be designated to
        graphic register N initially.

        If the value is t, nothing is designated to graphic register
        N initially, but the register can be used by any charset.

        If the value is a list, its first element (if it is a
        charset) is designated to graphic register N initially, and
        the remaining charsets are designated to graphic register N
        on request, i.e. just before encoding a character in that
        charset.  An element t in the list means that the register
        can be used by any charset.

        If the value is nil, graphic register N is never used on
        encoding.

   sub-element[N] where N is 4 through 16: to be set in `coding->flags'
        Each value takes t or nil.  See the section ISO2022 of
        `coding.h' for more information.

   If `coding->type' is `coding_type_big5', element[4] is t to denote
   BIG5-ETen or nil to denote BIG5-HKU.

   If `coding->type' is `coding_type_ccl', element[4] is a cons of the
   CCL programs for decoding and encoding.

   If `coding->type' takes the other value, element[4] is ignored.

   Emacs Lisp's coding systems also carry information about format of
   end-of-line in a value of property `eol-type'.  If the value is
   integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
   means CODING_EOL_CR.  If it is not integer, it should be a vector
   of subsidiary coding systems of which property `eol-type' has one
   of the above values.

*/
3570
3571/* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
3572 and set it in CODING. If CODING_SYSTEM_SYMBOL is invalid, CODING
3573 is setup so that no conversion is necessary and return -1, else
3574 return 0. */
3575
3576int
3577setup_coding_system (coding_system, coding)
3578 Lisp_Object coding_system;
3579 struct coding_system *coding;
3580{
3581 Lisp_Object coding_spec, coding_type, eol_type, plist;
3582 Lisp_Object val;
3583
3584 /* At first, zero clear all members. */
3585 bzero (coding, sizeof (struct coding_system));
3586
3587 /* Initialize some fields required for all kinds of coding systems. */
3588 coding->symbol = coding_system;
3589 coding->heading_ascii = -1;
3590 coding->post_read_conversion = coding->pre_write_conversion = Qnil;
3591 coding->composing = COMPOSITION_DISABLED;
3592 coding->cmp_data = NULL;
3593
3594 if (NILP (coding_system))
3595 goto label_invalid_coding_system;
3596
3597 coding_spec = Fget (coding_system, Qcoding_system);
3598
3599 if (!VECTORP (coding_spec)
3600 || XVECTOR (coding_spec)->size != 5
3601 || !CONSP (XVECTOR (coding_spec)->contents[3]))
3602 goto label_invalid_coding_system;
3603
3604 eol_type = inhibit_eol_conversion ? Qnil : Fget (coding_system, Qeol_type);
3605 if (VECTORP (eol_type))
3606 {
3607 coding->eol_type = CODING_EOL_UNDECIDED;
3608 coding->common_flags = CODING_REQUIRE_DETECTION_MASK;
3609 }
3610 else if (XFASTINT (eol_type) == 1)
3611 {
3612 coding->eol_type = CODING_EOL_CRLF;
3613 coding->common_flags
3614 = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3615 }
3616 else if (XFASTINT (eol_type) == 2)
3617 {
3618 coding->eol_type = CODING_EOL_CR;
3619 coding->common_flags
3620 = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3621 }
3622 else
3623 coding->eol_type = CODING_EOL_LF;
3624
3625 coding_type = XVECTOR (coding_spec)->contents[0];
3626 /* Try short cut. */
3627 if (SYMBOLP (coding_type))
3628 {
3629 if (EQ (coding_type, Qt))
3630 {
3631 coding->type = coding_type_undecided;
3632 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
3633 }
3634 else
3635 coding->type = coding_type_no_conversion;
3636 /* Initialize this member. Any thing other than
3637 CODING_CATEGORY_IDX_UTF_16_BE and
3638 CODING_CATEGORY_IDX_UTF_16_LE are ok because they have
3639 special treatment in detect_eol. */
3640 coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE;
3641
3642 return 0;
3643 }
3644
3645 /* Get values of coding system properties:
3646 `post-read-conversion', `pre-write-conversion',
3647 `translation-table-for-decode', `translation-table-for-encode'. */
3648 plist = XVECTOR (coding_spec)->contents[3];
3649 /* Pre & post conversion functions should be disabled if
3650 inhibit_eol_conversion is nonzero. This is the case that a code
3651 conversion function is called while those functions are running. */
3652 if (! inhibit_pre_post_conversion)
3653 {
3654 coding->post_read_conversion = Fplist_get (plist, Qpost_read_conversion);
3655 coding->pre_write_conversion = Fplist_get (plist, Qpre_write_conversion);
3656 }
3657 val = Fplist_get (plist, Qtranslation_table_for_decode);
3658 if (SYMBOLP (val))
3659 val = Fget (val, Qtranslation_table_for_decode);
3660 coding->translation_table_for_decode = CHAR_TABLE_P (val) ? val : Qnil;
3661 val = Fplist_get (plist, Qtranslation_table_for_encode);
3662 if (SYMBOLP (val))
3663 val = Fget (val, Qtranslation_table_for_encode);
3664 coding->translation_table_for_encode = CHAR_TABLE_P (val) ? val : Qnil;
3665 val = Fplist_get (plist, Qcoding_category);
3666 if (!NILP (val))
3667 {
3668 val = Fget (val, Qcoding_category_index);
3669 if (INTEGERP (val))
3670 coding->category_idx = XINT (val);
3671 else
3672 goto label_invalid_coding_system;
3673 }
3674 else
3675 goto label_invalid_coding_system;
3676
3677 /* If the coding system has non-nil `composition' property, enable
3678 composition handling. */
3679 val = Fplist_get (plist, Qcomposition);
3680 if (!NILP (val))
3681 coding->composing = COMPOSITION_NO;
3682
3683 switch (XFASTINT (coding_type))
3684 {
3685 case 0:
3686 coding->type = coding_type_emacs_mule;
3687 coding->common_flags
3688 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3689 if (!NILP (coding->post_read_conversion))
3690 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
3691 if (!NILP (coding->pre_write_conversion))
3692 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
3693 break;
3694
3695 case 1:
3696 coding->type = coding_type_sjis;
3697 coding->common_flags
3698 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3699 break;
3700
3701 case 2:
3702 coding->type = coding_type_iso2022;
3703 coding->common_flags
3704 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3705 {
3706 Lisp_Object val, temp;
3707 Lisp_Object *flags;
3708 int i, charset, reg_bits = 0;
3709
3710 val = XVECTOR (coding_spec)->contents[4];
3711
3712 if (!VECTORP (val) || XVECTOR (val)->size != 32)
3713 goto label_invalid_coding_system;
3714
3715 flags = XVECTOR (val)->contents;
3716 coding->flags
3717 = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
3718 | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
3719 | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
3720 | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
3721 | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
3722 | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
3723 | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
3724 | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
3725 | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION)
3726 | (NILP (flags[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL)
3727 | (NILP (flags[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL)
3728 | (NILP (flags[15]) ? 0 : CODING_FLAG_ISO_SAFE)
3729 | (NILP (flags[16]) ? 0 : CODING_FLAG_ISO_LATIN_EXTRA)
3730 );
3731
3732 /* Invoke graphic register 0 to plane 0. */
3733 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
3734 /* Invoke graphic register 1 to plane 1 if we can use full 8-bit. */
3735 CODING_SPEC_ISO_INVOCATION (coding, 1)
3736 = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
3737 /* Not single shifting at first. */
3738 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;
3739 /* Beginning of buffer should also be regarded as bol. */
3740 CODING_SPEC_ISO_BOL (coding) = 1;
3741
3742 for (charset = 0; charset <= MAX_CHARSET; charset++)
3743 CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = 255;
3744 val = Vcharset_revision_alist;
3745 while (CONSP (val))
3746 {
3747 charset = get_charset_id (Fcar_safe (XCAR (val)));
3748 if (charset >= 0
3749 && (temp = Fcdr_safe (XCAR (val)), INTEGERP (temp))
3750 && (i = XINT (temp), (i >= 0 && (i + '@') < 128)))
3751 CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = i;
3752 val = XCDR (val);
3753 }
3754
3755 /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
3756 FLAGS[REG] can be one of below:
3757 integer CHARSET: CHARSET occupies register I,
3758 t: designate nothing to REG initially, but can be used
3759 by any charsets,
3760 list of integer, nil, or t: designate the first
3761 element (if integer) to REG initially, the remaining
3762 elements (if integer) is designated to REG on request,
3763 if an element is t, REG can be used by any charsets,
3764 nil: REG is never used. */
3765 for (charset = 0; charset <= MAX_CHARSET; charset++)
3766 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3767 = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION;
3768 for (i = 0; i < 4; i++)
3769 {
3770 if ((INTEGERP (flags[i])
3771 && (charset = XINT (flags[i]), CHARSET_VALID_P (charset)))
3772 || (charset = get_charset_id (flags[i])) >= 0)
3773 {
3774 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3775 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
3776 }
3777 else if (EQ (flags[i], Qt))
3778 {
3779 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3780 reg_bits |= 1 << i;
3781 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
3782 }
3783 else if (CONSP (flags[i]))
3784 {
3785 Lisp_Object tail;
3786 tail = flags[i];
3787
3788 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
3789 if ((INTEGERP (XCAR (tail))
3790 && (charset = XINT (XCAR (tail)),
3791 CHARSET_VALID_P (charset)))
3792 || (charset = get_charset_id (XCAR (tail))) >= 0)
3793 {
3794 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3795 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
3796 }
3797 else
3798 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3799 tail = XCDR (tail);
3800 while (CONSP (tail))
3801 {
3802 if ((INTEGERP (XCAR (tail))
3803 && (charset = XINT (XCAR (tail)),
3804 CHARSET_VALID_P (charset)))
3805 || (charset = get_charset_id (XCAR (tail))) >= 0)
3806 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3807 = i;
3808 else if (EQ (XCAR (tail), Qt))
3809 reg_bits |= 1 << i;
3810 tail = XCDR (tail);
3811 }
3812 }
3813 else
3814 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3815
3816 CODING_SPEC_ISO_DESIGNATION (coding, i)
3817 = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
3818 }
3819
3820 if (reg_bits && ! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
3821 {
3822 /* REG 1 can be used only by locking shift in 7-bit env. */
3823 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
3824 reg_bits &= ~2;
3825 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
3826 /* Without any shifting, only REG 0 and 1 can be used. */
3827 reg_bits &= 3;
3828 }
3829
3830 if (reg_bits)
3831 for (charset = 0; charset <= MAX_CHARSET; charset++)
3832 {
3833 if (CHARSET_DEFINED_P (charset)
3834 && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3835 == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
3836 {
3837 /* There exist some default graphic registers to be
3838 used by CHARSET. */
3839
3840 /* We had better avoid designating a charset of
3841 CHARS96 to REG 0 as far as possible. */
3842 if (CHARSET_CHARS (charset) == 96)
3843 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3844 = (reg_bits & 2
3845 ? 1 : (reg_bits & 4 ? 2 : (reg_bits & 8 ? 3 : 0)));
3846 else
3847 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3848 = (reg_bits & 1
3849 ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
3850 }
3851 }
3852 }
3853 coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
3854 coding->spec.iso2022.last_invalid_designation_register = -1;
3855 break;
3856
3857 case 3:
3858 coding->type = coding_type_big5;
3859 coding->common_flags
3860 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3861 coding->flags
3862 = (NILP (XVECTOR (coding_spec)->contents[4])
3863 ? CODING_FLAG_BIG5_HKU
3864 : CODING_FLAG_BIG5_ETEN);
3865 break;
3866
3867 case 4:
3868 coding->type = coding_type_ccl;
3869 coding->common_flags
3870 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3871 {
3872 val = XVECTOR (coding_spec)->contents[4];
3873 if (! CONSP (val)
3874 || setup_ccl_program (&(coding->spec.ccl.decoder),
3875 XCAR (val)) < 0
3876 || setup_ccl_program (&(coding->spec.ccl.encoder),
3877 XCDR (val)) < 0)
3878 goto label_invalid_coding_system;
3879
3880 bzero (coding->spec.ccl.valid_codes, 256);
3881 val = Fplist_get (plist, Qvalid_codes);
3882 if (CONSP (val))
3883 {
3884 Lisp_Object this;
3885
3886 for (; CONSP (val); val = XCDR (val))
3887 {
3888 this = XCAR (val);
3889 if (INTEGERP (this)
3890 && XINT (this) >= 0 && XINT (this) < 256)
3891 coding->spec.ccl.valid_codes[XINT (this)] = 1;
3892 else if (CONSP (this)
3893 && INTEGERP (XCAR (this))
3894 && INTEGERP (XCDR (this)))
3895 {
3896 int start = XINT (XCAR (this));
3897 int end = XINT (XCDR (this));
3898
3899 if (start >= 0 && start <= end && end < 256)
3900 while (start <= end)
3901 coding->spec.ccl.valid_codes[start++] = 1;
3902 }
3903 }
3904 }
3905 }
3906 coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
3907 coding->spec.ccl.cr_carryover = 0;
3908 coding->spec.ccl.eight_bit_carryover[0] = 0;
3909 break;
3910
3911 case 5:
3912 coding->type = coding_type_raw_text;
3913 break;
3914
3915 default:
3916 goto label_invalid_coding_system;
3917 }
3918 return 0;
3919
3920 label_invalid_coding_system:
3921 coding->type = coding_type_no_conversion;
3922 coding->category_idx = CODING_CATEGORY_IDX_BINARY;
3923 coding->common_flags = 0;
3924 coding->eol_type = CODING_EOL_LF;
3925 coding->pre_write_conversion = coding->post_read_conversion = Qnil;
3926 return -1;
3927}
3928
3929/* Free memory blocks allocated for storing composition information. */
3930
3931void
3932coding_free_composition_data (coding)
3933 struct coding_system *coding;
3934{
3935 struct composition_data *cmp_data = coding->cmp_data, *next;
3936
3937 if (!cmp_data)
3938 return;
3939 /* Memory blocks are chained. At first, rewind to the first, then,
3940 free blocks one by one. */
3941 while (cmp_data->prev)
3942 cmp_data = cmp_data->prev;
3943 while (cmp_data)
3944 {
3945 next = cmp_data->next;
3946 xfree (cmp_data);
3947 cmp_data = next;
3948 }
3949 coding->cmp_data = NULL;
3950}
3951
3952/* Set `char_offset' member of all memory blocks pointed by
3953 coding->cmp_data to POS. */
3954
3955void
3956coding_adjust_composition_offset (coding, pos)
3957 struct coding_system *coding;
3958 int pos;
3959{
3960 struct composition_data *cmp_data;
3961
3962 for (cmp_data = coding->cmp_data; cmp_data; cmp_data = cmp_data->next)
3963 cmp_data->char_offset = pos;
3964}
3965
3966/* Setup raw-text or one of its subsidiaries in the structure
3967 coding_system CODING according to the already setup value eol_type
3968 in CODING. CODING should be setup for some coding system in
3969 advance. */
3970
3971void
3972setup_raw_text_coding_system (coding)
3973 struct coding_system *coding;
3974{
3975 if (coding->type != coding_type_raw_text)
3976 {
3977 coding->symbol = Qraw_text;
3978 coding->type = coding_type_raw_text;
3979 if (coding->eol_type != CODING_EOL_UNDECIDED)
3980 {
3981 Lisp_Object subsidiaries;
3982 subsidiaries = Fget (Qraw_text, Qeol_type);
3983
3984 if (VECTORP (subsidiaries)
3985 && XVECTOR (subsidiaries)->size == 3)
3986 coding->symbol
3987 = XVECTOR (subsidiaries)->contents[coding->eol_type];
3988 }
3989 setup_coding_system (coding->symbol, coding);
3990 }
3991 return;
3992}
3993
3994/* Emacs has a mechanism to automatically detect a coding system if it
3995 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
3996 it's impossible to distinguish some coding systems accurately
3997 because they use the same range of codes. So, at first, coding
3998 systems are classified into the categories listed below:
3999
4000 o coding-category-emacs-mule
4001
4002 The category for a coding system which has the same code range
4003 as Emacs' internal format. Assigned the coding-system (Lisp
4004 symbol) `emacs-mule' by default.
4005
4006 o coding-category-sjis
4007
4008 The category for a coding system which has the same code range
4009 as SJIS. Assigned the coding-system (Lisp
4010 symbol) `japanese-shift-jis' by default.
4011
4012 o coding-category-iso-7
4013
4014 The category for a coding system which has the same code range
4015 as ISO2022 of 7-bit environment. This doesn't use any locking
4016 shift and single shift functions. This can encode/decode all
4017 charsets. Assigned the coding-system (Lisp symbol)
4018 `iso-2022-7bit' by default.
4019
4020 o coding-category-iso-7-tight
4021
4022 Same as coding-category-iso-7 except that this can
4023 encode/decode only the specified charsets.
4024
4025 o coding-category-iso-8-1
4026
4027 The category for a coding system which has the same code range
4028 as ISO2022 of 8-bit environment and graphic plane 1 used only
4029 for DIMENSION1 charset. This doesn't use any locking shift
4030 and single shift functions. Assigned the coding-system (Lisp
4031 symbol) `iso-latin-1' by default.
4032
4033 o coding-category-iso-8-2
4034
4035 The category for a coding system which has the same code range
4036 as ISO2022 of 8-bit environment and graphic plane 1 used only
4037 for DIMENSION2 charset. This doesn't use any locking shift
4038 and single shift functions. Assigned the coding-system (Lisp
4039 symbol) `japanese-iso-8bit' by default.
4040
4041 o coding-category-iso-7-else
4042
4043 The category for a coding system which has the same code range
4044 as ISO2022 of 7-bit environment but uses locking shift or
4045 single shift functions. Assigned the coding-system (Lisp
4046 symbol) `iso-2022-7bit-lock' by default.
4047
4048 o coding-category-iso-8-else
4049
4050 The category for a coding system which has the same code range
4051 as ISO2022 of 8-bit environment but uses locking shift or
4052 single shift functions. Assigned the coding-system (Lisp
4053 symbol) `iso-2022-8bit-ss2' by default.
4054
4055 o coding-category-big5
4056
4057 The category for a coding system which has the same code range
4058 as BIG5. Assigned the coding-system (Lisp symbol)
4059 `cn-big5' by default.
4060
4061 o coding-category-utf-8
4062
4063 The category for a coding system which has the same code range
4064 as UTF-8 (cf. RFC3629). Assigned the coding-system (Lisp
4065 symbol) `utf-8' by default.
4066
4067 o coding-category-utf-16-be
4068
4069 The category for a coding system in which a text has an
4070 Unicode signature (cf. Unicode Standard) in the order of BIG
4071 endian at the head. Assigned the coding-system (Lisp symbol)
4072 `utf-16-be' by default.
4073
4074 o coding-category-utf-16-le
4075
4076 The category for a coding system in which a text has an
4077 Unicode signature (cf. Unicode Standard) in the order of
4078 LITTLE endian at the head. Assigned the coding-system (Lisp
4079 symbol) `utf-16-le' by default.
4080
4081 o coding-category-ccl
4082
4083 The category for a coding system of which encoder/decoder is
4084 written in CCL programs. The default value is nil, i.e., no
4085 coding system is assigned.
4086
4087 o coding-category-binary
4088
4089 The category for a coding system not categorized in any of the
4090 above. Assigned the coding-system (Lisp symbol)
4091 `no-conversion' by default.
4092
4093 Each of them is a Lisp symbol and the value is an actual
4094 `coding-system' (this is also a Lisp symbol) assigned by a user.
4095 What Emacs does actually is to detect a category of coding system.
4096 Then, it uses a `coding-system' assigned to it. If Emacs can't
4097 decide a single possible category, it selects a category of the
4098 highest priority. Priorities of categories are also specified by a
4099 user in a Lisp variable `coding-category-list'.
4100
4101*/
4102
4103static
4104int ascii_skip_code[256];
4105
4106/* Detect how a text of length SRC_BYTES pointed by SOURCE is encoded.
4107 If it detects possible coding systems, return an integer in which
4108 appropriate flag bits are set. Flag bits are defined by macros
4109 CODING_CATEGORY_MASK_XXX in `coding.h'. If PRIORITIES is non-NULL,
4110 it should point the table `coding_priorities'. In that case, only
4111 the flag bit for a coding system of the highest priority is set in
4112 the returned value. If MULTIBYTEP is nonzero, 8-bit codes of the
4113 range 0x80..0x9F are in multibyte form.
4114
4115 How many ASCII characters are at the head is returned as *SKIP. */
4116
4117static int
4118detect_coding_mask (source, src_bytes, priorities, skip, multibytep)
4119 unsigned char *source;
4120 int src_bytes, *priorities, *skip;
4121 int multibytep;
4122{
4123 register unsigned char c;
4124 unsigned char *src = source, *src_end = source + src_bytes;
4125 unsigned int mask, utf16_examined_p, iso2022_examined_p;
4126 int i;
4127
4128 /* At first, skip all ASCII characters and control characters except
4129 for three ISO2022 specific control characters. */
4130 ascii_skip_code[ISO_CODE_SO] = 0;
4131 ascii_skip_code[ISO_CODE_SI] = 0;
4132 ascii_skip_code[ISO_CODE_ESC] = 0;
4133
4134 label_loop_detect_coding:
4135 while (src < src_end && ascii_skip_code[*src]) src++;
4136 *skip = src - source;
4137
4138 if (src >= src_end)
4139 /* We found nothing other than ASCII. There's nothing to do. */
4140 return 0;
4141
4142 c = *src;
4143 /* The text seems to be encoded in some multilingual coding system.
4144 Now, try to find in which coding system the text is encoded. */
4145 if (c < 0x80)
4146 {
4147 /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
4148 /* C is an ISO2022 specific control code of C0. */
4149 mask = detect_coding_iso2022 (src, src_end, multibytep);
4150 if (mask == 0)
4151 {
4152 /* No valid ISO2022 code follows C. Try again. */
4153 src++;
4154 if (c == ISO_CODE_ESC)
4155 ascii_skip_code[ISO_CODE_ESC] = 1;
4156 else
4157 ascii_skip_code[ISO_CODE_SO] = ascii_skip_code[ISO_CODE_SI] = 1;
4158 goto label_loop_detect_coding;
4159 }
4160 if (priorities)
4161 {
4162 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
4163 {
4164 if (mask & priorities[i])
4165 return priorities[i];
4166 }
4167 return CODING_CATEGORY_MASK_RAW_TEXT;
4168 }
4169 }
4170 else
4171 {
4172 int try;
4173
4174 if (multibytep && c == LEADING_CODE_8_BIT_CONTROL)
4175 c = src[1] - 0x20;
4176
4177 if (c < 0xA0)
4178 {
4179 /* C is the first byte of SJIS character code,
4180 or a leading-code of Emacs' internal format (emacs-mule),
4181 or the first byte of UTF-16. */
4182 try = (CODING_CATEGORY_MASK_SJIS
4183 | CODING_CATEGORY_MASK_EMACS_MULE
4184 | CODING_CATEGORY_MASK_UTF_16_BE
4185 | CODING_CATEGORY_MASK_UTF_16_LE);
4186
4187 /* Or, if C is a special latin extra code,
4188 or is an ISO2022 specific control code of C1 (SS2 or SS3),
4189 or is an ISO2022 control-sequence-introducer (CSI),
4190 we should also consider the possibility of ISO2022 codings. */
4191 if ((VECTORP (Vlatin_extra_code_table)
4192 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
4193 || (c == ISO_CODE_SS2 || c == ISO_CODE_SS3)
4194 || (c == ISO_CODE_CSI
4195 && (src < src_end
4196 && (*src == ']'
4197 || ((*src == '0' || *src == '1' || *src == '2')
4198 && src + 1 < src_end
4199 && src[1] == ']')))))
4200 try |= (CODING_CATEGORY_MASK_ISO_8_ELSE
4201 | CODING_CATEGORY_MASK_ISO_8BIT);
4202 }
4203 else
4204 /* C is a character of ISO2022 in graphic plane right,
4205 or a SJIS's 1-byte character code (i.e. JISX0201),
4206 or the first byte of BIG5's 2-byte code,
4207 or the first byte of UTF-8/16. */
4208 try = (CODING_CATEGORY_MASK_ISO_8_ELSE
4209 | CODING_CATEGORY_MASK_ISO_8BIT
4210 | CODING_CATEGORY_MASK_SJIS
4211 | CODING_CATEGORY_MASK_BIG5
4212 | CODING_CATEGORY_MASK_UTF_8
4213 | CODING_CATEGORY_MASK_UTF_16_BE
4214 | CODING_CATEGORY_MASK_UTF_16_LE);
4215
4216 /* Or, we may have to consider the possibility of CCL. */
4217 if (coding_system_table[CODING_CATEGORY_IDX_CCL]
4218 && (coding_system_table[CODING_CATEGORY_IDX_CCL]
4219 ->spec.ccl.valid_codes)[c])
4220 try |= CODING_CATEGORY_MASK_CCL;
4221
4222 mask = 0;
4223 utf16_examined_p = iso2022_examined_p = 0;
4224 if (priorities)
4225 {
4226 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
4227 {
4228 if (!iso2022_examined_p
4229 && (priorities[i] & try & CODING_CATEGORY_MASK_ISO))
4230 {
4231 mask |= detect_coding_iso2022 (src, src_end, multibytep);
4232 iso2022_examined_p = 1;
4233 }
4234 else if (priorities[i] & try & CODING_CATEGORY_MASK_SJIS)
4235 mask |= detect_coding_sjis (src, src_end, multibytep);
4236 else if (priorities[i] & try & CODING_CATEGORY_MASK_UTF_8)
4237 mask |= detect_coding_utf_8 (src, src_end, multibytep);
4238 else if (!utf16_examined_p
4239 && (priorities[i] & try &
4240 CODING_CATEGORY_MASK_UTF_16_BE_LE))
4241 {
4242 mask |= detect_coding_utf_16 (src, src_end, multibytep);
4243 utf16_examined_p = 1;
4244 }
4245 else if (priorities[i] & try & CODING_CATEGORY_MASK_BIG5)
4246 mask |= detect_coding_big5 (src, src_end, multibytep);
4247 else if (priorities[i] & try & CODING_CATEGORY_MASK_EMACS_MULE)
4248 mask |= detect_coding_emacs_mule (src, src_end, multibytep);
4249 else if (priorities[i] & try & CODING_CATEGORY_MASK_CCL)
4250 mask |= detect_coding_ccl (src, src_end, multibytep);
4251 else if (priorities[i] & CODING_CATEGORY_MASK_RAW_TEXT)
4252 mask |= CODING_CATEGORY_MASK_RAW_TEXT;
4253 else if (priorities[i] & CODING_CATEGORY_MASK_BINARY)
4254 mask |= CODING_CATEGORY_MASK_BINARY;
4255 if (mask & priorities[i])
4256 return priorities[i];
4257 }
4258 return CODING_CATEGORY_MASK_RAW_TEXT;
4259 }
4260 if (try & CODING_CATEGORY_MASK_ISO)
4261 mask |= detect_coding_iso2022 (src, src_end, multibytep);
4262 if (try & CODING_CATEGORY_MASK_SJIS)
4263 mask |= detect_coding_sjis (src, src_end, multibytep);
4264 if (try & CODING_CATEGORY_MASK_BIG5)
4265 mask |= detect_coding_big5 (src, src_end, multibytep);
4266 if (try & CODING_CATEGORY_MASK_UTF_8)
4267 mask |= detect_coding_utf_8 (src, src_end, multibytep);
4268 if (try & CODING_CATEGORY_MASK_UTF_16_BE_LE)
4269 mask |= detect_coding_utf_16 (src, src_end, multibytep);
4270 if (try & CODING_CATEGORY_MASK_EMACS_MULE)
4271 mask |= detect_coding_emacs_mule (src, src_end, multibytep);
4272 if (try & CODING_CATEGORY_MASK_CCL)
4273 mask |= detect_coding_ccl (src, src_end, multibytep);
4274 }
4275 return (mask | CODING_CATEGORY_MASK_RAW_TEXT | CODING_CATEGORY_MASK_BINARY);
4276}
4277
4278/* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
4279 The information of the detected coding system is set in CODING. */
4280
4281void
4282detect_coding (coding, src, src_bytes)
4283 struct coding_system *coding;
4284 const unsigned char *src;
4285 int src_bytes;
4286{
4287 unsigned int idx;
4288 int skip, mask;
4289 Lisp_Object val;
4290
4291 val = Vcoding_category_list;
4292 mask = detect_coding_mask (src, src_bytes, coding_priorities, &skip,
4293 coding->src_multibyte);
4294 coding->heading_ascii = skip;
4295
4296 if (!mask) return;
4297
4298 /* We found a single coding system of the highest priority in MASK. */
4299 idx = 0;
4300 while (mask && ! (mask & 1)) mask >>= 1, idx++;
4301 if (! mask)
4302 idx = CODING_CATEGORY_IDX_RAW_TEXT;
4303
4304 val = SYMBOL_VALUE (XVECTOR (Vcoding_category_table)->contents[idx]);
4305
4306 if (coding->eol_type != CODING_EOL_UNDECIDED)
4307 {
4308 Lisp_Object tmp;
4309
4310 tmp = Fget (val, Qeol_type);
4311 if (VECTORP (tmp))
4312 val = XVECTOR (tmp)->contents[coding->eol_type];
4313 }
4314
4315 /* Setup this new coding system while preserving some slots. */
4316 {
4317 int src_multibyte = coding->src_multibyte;
4318 int dst_multibyte = coding->dst_multibyte;
4319
4320 setup_coding_system (val, coding);
4321 coding->src_multibyte = src_multibyte;
4322 coding->dst_multibyte = dst_multibyte;
4323 coding->heading_ascii = skip;
4324 }
4325}
4326
4327/* Detect how end-of-line of a text of length SRC_BYTES pointed by
4328 SOURCE is encoded. Return one of CODING_EOL_LF, CODING_EOL_CRLF,
4329 CODING_EOL_CR, and CODING_EOL_UNDECIDED.
4330
4331 How many non-eol characters are at the head is returned as *SKIP. */
4332
4333#define MAX_EOL_CHECK_COUNT 3
4334
4335static int
4336detect_eol_type (source, src_bytes, skip)
4337 unsigned char *source;
4338 int src_bytes, *skip;
4339{
4340 unsigned char *src = source, *src_end = src + src_bytes;
4341 unsigned char c;
4342 int total = 0; /* How many end-of-lines are found so far. */
4343 int eol_type = CODING_EOL_UNDECIDED;
4344 int this_eol_type;
4345
4346 *skip = 0;
4347
4348 while (src < src_end && total < MAX_EOL_CHECK_COUNT)
4349 {
4350 c = *src++;
4351 if (c == '\n' || c == '\r')
4352 {
4353 if (*skip == 0)
4354 *skip = src - 1 - source;
4355 total++;
4356 if (c == '\n')
4357 this_eol_type = CODING_EOL_LF;
4358 else if (src >= src_end || *src != '\n')
4359 this_eol_type = CODING_EOL_CR;
4360 else
4361 this_eol_type = CODING_EOL_CRLF, src++;
4362
4363 if (eol_type == CODING_EOL_UNDECIDED)
4364 /* This is the first end-of-line. */
4365 eol_type = this_eol_type;
4366 else if (eol_type != this_eol_type)
4367 {
4368 /* The found type is different from what found before. */
4369 eol_type = CODING_EOL_INCONSISTENT;
4370 break;
4371 }
4372 }
4373 }
4374
4375 if (*skip == 0)
4376 *skip = src_end - source;
4377 return eol_type;
4378}
4379
4380/* Like detect_eol_type, but detect EOL type in 2-octet
4381 big-endian/little-endian format for coding systems utf-16-be and
4382 utf-16-le. */
4383
4384static int
4385detect_eol_type_in_2_octet_form (source, src_bytes, skip, big_endian_p)
4386 unsigned char *source;
4387 int src_bytes, *skip, big_endian_p;
4388{
4389 unsigned char *src = source, *src_end = src + src_bytes;
4390 unsigned int c1, c2;
4391 int total = 0; /* How many end-of-lines are found so far. */
4392 int eol_type = CODING_EOL_UNDECIDED;
4393 int this_eol_type;
4394 int msb, lsb;
4395
4396 if (big_endian_p)
4397 msb = 0, lsb = 1;
4398 else
4399 msb = 1, lsb = 0;
4400
4401 *skip = 0;
4402
4403 while ((src + 1) < src_end && total < MAX_EOL_CHECK_COUNT)
4404 {
4405 c1 = (src[msb] << 8) | (src[lsb]);
4406 src += 2;
4407
4408 if (c1 == '\n' || c1 == '\r')
4409 {
4410 if (*skip == 0)
4411 *skip = src - 2 - source;
4412 total++;
4413 if (c1 == '\n')
4414 {
4415 this_eol_type = CODING_EOL_LF;
4416 }
4417 else
4418 {
4419 if ((src + 1) >= src_end)
4420 {
4421 this_eol_type = CODING_EOL_CR;
4422 }
4423 else
4424 {
4425 c2 = (src[msb] << 8) | (src[lsb]);
4426 if (c2 == '\n')
4427 this_eol_type = CODING_EOL_CRLF, src += 2;
4428 else
4429 this_eol_type = CODING_EOL_CR;
4430 }
4431 }
4432
4433 if (eol_type == CODING_EOL_UNDECIDED)
4434 /* This is the first end-of-line. */
4435 eol_type = this_eol_type;
4436 else if (eol_type != this_eol_type)
4437 {
4438 /* The found type is different from what found before. */
4439 eol_type = CODING_EOL_INCONSISTENT;
4440 break;
4441 }
4442 }
4443 }
4444
4445 if (*skip == 0)
4446 *skip = src_end - source;
4447 return eol_type;
4448}
4449
4450/* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
4451 is encoded. If it detects an appropriate format of end-of-line, it
4452 sets the information in *CODING. */
4453
4454void
4455detect_eol (coding, src, src_bytes)
4456 struct coding_system *coding;
4457 const unsigned char *src;
4458 int src_bytes;
4459{
4460 Lisp_Object val;
4461 int skip;
4462 int eol_type;
4463
4464 switch (coding->category_idx)
4465 {
4466 case CODING_CATEGORY_IDX_UTF_16_BE:
4467 eol_type = detect_eol_type_in_2_octet_form (src, src_bytes, &skip, 1);
4468 break;
4469 case CODING_CATEGORY_IDX_UTF_16_LE:
4470 eol_type = detect_eol_type_in_2_octet_form (src, src_bytes, &skip, 0);
4471 break;
4472 default:
4473 eol_type = detect_eol_type (src, src_bytes, &skip);
4474 break;
4475 }
4476
4477 if (coding->heading_ascii > skip)
4478 coding->heading_ascii = skip;
4479 else
4480 skip = coding->heading_ascii;
4481
4482 if (eol_type == CODING_EOL_UNDECIDED)
4483 return;
4484 if (eol_type == CODING_EOL_INCONSISTENT)
4485 {
4486#if 0
4487 /* This code is suppressed until we find a better way to
4488 distinguish raw text file and binary file. */
4489
4490 /* If we have already detected that the coding is raw-text, the
4491 coding should actually be no-conversion. */
4492 if (coding->type == coding_type_raw_text)
4493 {
4494 setup_coding_system (Qno_conversion, coding);
4495 return;
4496 }
4497 /* Else, let's decode only text code anyway. */
4498#endif /* 0 */
4499 eol_type = CODING_EOL_LF;
4500 }
4501
4502 val = Fget (coding->symbol, Qeol_type);
4503 if (VECTORP (val) && XVECTOR (val)->size == 3)
4504 {
4505 int src_multibyte = coding->src_multibyte;
4506 int dst_multibyte = coding->dst_multibyte;
4507 struct composition_data *cmp_data = coding->cmp_data;
4508
4509 setup_coding_system (XVECTOR (val)->contents[eol_type], coding);
4510 coding->src_multibyte = src_multibyte;
4511 coding->dst_multibyte = dst_multibyte;
4512 coding->heading_ascii = skip;
4513 coding->cmp_data = cmp_data;
4514 }
4515}
4516
/* Number of bytes added on top of the estimated size of a conversion
   buffer.  */
#define CONVERSION_BUFFER_EXTRA_ROOM 256

/* Factor by which the source size is multiplied to estimate the size
   of a decoding buffer for CODING (a pointer to struct
   coding_system): 3 for ISO2022, the decoder's own magnification for
   CCL, 2 otherwise.  CODING is parenthesized so that any pointer
   expression may be passed; it is evaluated more than once.  */
#define DECODING_BUFFER_MAG(coding)                             \
  ((coding)->type == coding_type_iso2022                        \
   ? 3                                                          \
   : ((coding)->type == coding_type_ccl                         \
      ? (coding)->spec.ccl.decoder.buf_magnification            \
      : 2))
4525
4526/* Return maximum size (bytes) of a buffer enough for decoding
4527 SRC_BYTES of text encoded in CODING. */
4528
4529int
4530decoding_buffer_size (coding, src_bytes)
4531 struct coding_system *coding;
4532 int src_bytes;
4533{
4534 return (src_bytes * DECODING_BUFFER_MAG (coding)
4535 + CONVERSION_BUFFER_EXTRA_ROOM);
4536}
4537
4538/* Return maximum size (bytes) of a buffer enough for encoding
4539 SRC_BYTES of text to CODING. */
4540
4541int
4542encoding_buffer_size (coding, src_bytes)
4543 struct coding_system *coding;
4544 int src_bytes;
4545{
4546 int magnification;
4547
4548 if (coding->type == coding_type_ccl)
4549 {
4550 magnification = coding->spec.ccl.encoder.buf_magnification;
4551 if (coding->eol_type == CODING_EOL_CRLF)
4552 magnification *= 2;
4553 }
4554 else if (CODING_REQUIRE_ENCODING (coding))
4555 magnification = 3;
4556 else
4557 magnification = 1;
4558
4559 return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
4560}
4561
/* Working buffer for code conversion.  */
struct conversion_buffer
{
  int size;                     /* Number of bytes allocated at DATA.  */
  int on_stack;                 /* 1 if allocated by alloca.  */
  unsigned char *data;
};

/* Allocate LEN bytes of memory for BUF (struct conversion_buffer, not
   a pointer to one).  Requests smaller than MAX_ALLOCA are served by
   alloca, larger ones by xmalloc; BUF.on_stack records which.

   This has to stay a macro: memory obtained from alloca belongs to
   the function in which the macro is expanded.  Both arguments are
   parenthesized so that arbitrary expressions can be passed, but LEN
   is evaluated more than once, so it must not have side effects.  */
#define allocate_conversion_buffer(buf, len)            \
  do {                                                  \
    if ((len) < MAX_ALLOCA)                             \
      {                                                 \
        (buf).data = (unsigned char *) alloca (len);    \
        (buf).on_stack = 1;                             \
      }                                                 \
    else                                                \
      {                                                 \
        (buf).data = (unsigned char *) xmalloc (len);   \
        (buf).on_stack = 0;                             \
      }                                                 \
    (buf).size = (len);                                 \
  } while (0)
4585
4586/* Double the allocated memory for *BUF. */
4587static void
4588extend_conversion_buffer (buf)
4589 struct conversion_buffer *buf;
4590{
4591 if (buf->on_stack)
4592 {
4593 unsigned char *save = buf->data;
4594 buf->data = (unsigned char *) xmalloc (buf->size * 2);
4595 bcopy (save, buf->data, buf->size);
4596 buf->on_stack = 0;
4597 }
4598 else
4599 {
4600 buf->data = (unsigned char *) xrealloc (buf->data, buf->size * 2);
4601 }
4602 buf->size *= 2;
4603}
4604
4605/* Free the allocated memory for BUF if it is not on stack. */
4606static void
4607free_conversion_buffer (buf)
4608 struct conversion_buffer *buf;
4609{
4610 if (!buf->on_stack)
4611 xfree (buf->data);
4612}
4613
4614int
4615ccl_coding_driver (coding, source, destination, src_bytes, dst_bytes, encodep)
4616 struct coding_system *coding;
4617 unsigned char *source, *destination;
4618 int src_bytes, dst_bytes, encodep;
4619{
4620 struct ccl_program *ccl
4621 = encodep ? &coding->spec.ccl.encoder : &coding->spec.ccl.decoder;
4622 unsigned char *dst = destination;
4623
4624 ccl->suppress_error = coding->suppress_error;
4625 ccl->last_block = coding->mode & CODING_MODE_LAST_BLOCK;
4626 if (encodep)
4627 {
4628 /* On encoding, EOL format is converted within ccl_driver. For
4629 that, setup proper information in the structure CCL. */
4630 ccl->eol_type = coding->eol_type;
4631 if (ccl->eol_type ==CODING_EOL_UNDECIDED)
4632 ccl->eol_type = CODING_EOL_LF;
4633 ccl->cr_consumed = coding->spec.ccl.cr_carryover;
4634 ccl->eight_bit_control = coding->dst_multibyte;
4635 }
4636 else
4637 ccl->eight_bit_control = 1;
4638 ccl->multibyte = coding->src_multibyte;
4639 if (coding->spec.ccl.eight_bit_carryover[0] != 0)
4640 {
4641 /* Move carryover bytes to DESTINATION. */
4642 unsigned char *p = coding->spec.ccl.eight_bit_carryover;
4643 while (*p)
4644 *dst++ = *p++;
4645 coding->spec.ccl.eight_bit_carryover[0] = 0;
4646 if (dst_bytes)
4647 dst_bytes -= dst - destination;
4648 }
4649
4650 coding->produced = (ccl_driver (ccl, source, dst, src_bytes, dst_bytes,
4651 &(coding->consumed))
4652 + dst - destination);
4653
4654 if (encodep)
4655 {
4656 coding->produced_char = coding->produced;
4657 coding->spec.ccl.cr_carryover = ccl->cr_consumed;
4658 }
4659 else if (!ccl->eight_bit_control)
4660 {
4661 /* The produced bytes forms a valid multibyte sequence. */
4662 coding->produced_char
4663 = multibyte_chars_in_text (destination, coding->produced);
4664 coding->spec.ccl.eight_bit_carryover[0] = 0;
4665 }
4666 else
4667 {
4668 /* On decoding, the destination should always multibyte. But,
4669 CCL program might have been generated an invalid multibyte
4670 sequence. Here we make such a sequence valid as
4671 multibyte. */
4672 int bytes
4673 = dst_bytes ? dst_bytes : source + coding->consumed - destination;
4674
4675 if ((coding->consumed < src_bytes
4676 || !ccl->last_block)
4677 && coding->produced >= 1
4678 && destination[coding->produced - 1] >= 0x80)
4679 {
4680 /* We should not convert the tailing 8-bit codes to
4681 multibyte form even if they doesn't form a valid
4682 multibyte sequence. They may form a valid sequence in
4683 the next call. */
4684 int carryover = 0;
4685
4686 if (destination[coding->produced - 1] < 0xA0)
4687 carryover = 1;
4688 else if (coding->produced >= 2)
4689 {
4690 if (destination[coding->produced - 2] >= 0x80)
4691 {
4692 if (destination[coding->produced - 2] < 0xA0)
4693 carryover = 2;
4694 else if (coding->produced >= 3
4695 && destination[coding->produced - 3] >= 0x80
4696 && destination[coding->produced - 3] < 0xA0)
4697 carryover = 3;
4698 }
4699 }
4700 if (carryover > 0)
4701 {
4702 BCOPY_SHORT (destination + coding->produced - carryover,
4703 coding->spec.ccl.eight_bit_carryover,
4704 carryover);
4705 coding->spec.ccl.eight_bit_carryover[carryover] = 0;
4706 coding->produced -= carryover;
4707 }
4708 }
4709 coding->produced = str_as_multibyte (destination, bytes,
4710 coding->produced,
4711 &(coding->produced_char));
4712 }
4713
4714 switch (ccl->status)
4715 {
4716 case CCL_STAT_SUSPEND_BY_SRC:
4717 coding->result = CODING_FINISH_INSUFFICIENT_SRC;
4718 break;
4719 case CCL_STAT_SUSPEND_BY_DST:
4720 coding->result = CODING_FINISH_INSUFFICIENT_DST;
4721 break;
4722 case CCL_STAT_QUIT:
4723 case CCL_STAT_INVALID_CMD:
4724 coding->result = CODING_FINISH_INTERRUPT;
4725 break;
4726 default:
4727 coding->result = CODING_FINISH_NORMAL;
4728 break;
4729 }
4730 return coding->result;
4731}
4732
4733/* Decode EOL format of the text at PTR of BYTES length destructively
4734 according to CODING->eol_type. This is called after the CCL
4735 program produced a decoded text at PTR. If we do CRLF->LF
4736 conversion, update CODING->produced and CODING->produced_char. */
4737
4738static void
4739decode_eol_post_ccl (coding, ptr, bytes)
4740 struct coding_system *coding;
4741 unsigned char *ptr;
4742 int bytes;
4743{
4744 Lisp_Object val, saved_coding_symbol;
4745 unsigned char *pend = ptr + bytes;
4746 int dummy;
4747
4748 /* Remember the current coding system symbol. We set it back when
4749 an inconsistent EOL is found so that `last-coding-system-used' is
4750 set to the coding system that doesn't specify EOL conversion. */
4751 saved_coding_symbol = coding->symbol;
4752
4753 coding->spec.ccl.cr_carryover = 0;
4754 if (coding->eol_type == CODING_EOL_UNDECIDED)
4755 {
4756 /* Here, to avoid the call of setup_coding_system, we directly
4757 call detect_eol_type. */
4758 coding->eol_type = detect_eol_type (ptr, bytes, &dummy);
4759 if (coding->eol_type == CODING_EOL_INCONSISTENT)
4760 coding->eol_type = CODING_EOL_LF;
4761 if (coding->eol_type != CODING_EOL_UNDECIDED)
4762 {
4763 val = Fget (coding->symbol, Qeol_type);
4764 if (VECTORP (val) && XVECTOR (val)->size == 3)
4765 coding->symbol = XVECTOR (val)->contents[coding->eol_type];
4766 }
4767 coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4768 }
4769
4770 if (coding->eol_type == CODING_EOL_LF
4771 || coding->eol_type == CODING_EOL_UNDECIDED)
4772 {
4773 /* We have nothing to do. */
4774 ptr = pend;
4775 }
4776 else if (coding->eol_type == CODING_EOL_CRLF)
4777 {
4778 unsigned char *pstart = ptr, *p = ptr;
4779
4780 if (! (coding->mode & CODING_MODE_LAST_BLOCK)
4781 && *(pend - 1) == '\r')
4782 {
4783 /* If the last character is CR, we can't handle it here
4784 because LF will be in the not-yet-decoded source text.
4785 Record that the CR is not yet processed. */
4786 coding->spec.ccl.cr_carryover = 1;
4787 coding->produced--;
4788 coding->produced_char--;
4789 pend--;
4790 }
4791 while (ptr < pend)
4792 {
4793 if (*ptr == '\r')
4794 {
4795 if (ptr + 1 < pend && *(ptr + 1) == '\n')
4796 {
4797 *p++ = '\n';
4798 ptr += 2;
4799 }
4800 else
4801 {
4802 if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4803 goto undo_eol_conversion;
4804 *p++ = *ptr++;
4805 }
4806 }
4807 else if (*ptr == '\n'
4808 && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4809 goto undo_eol_conversion;
4810 else
4811 *p++ = *ptr++;
4812 continue;
4813
4814 undo_eol_conversion:
4815 /* We have faced with inconsistent EOL format at PTR.
4816 Convert all LFs before PTR back to CRLFs. */
4817 for (p--, ptr--; p >= pstart; p--)
4818 {
4819 if (*p == '\n')
4820 *ptr-- = '\n', *ptr-- = '\r';
4821 else
4822 *ptr-- = *p;
4823 }
4824 /* If carryover is recorded, cancel it because we don't
4825 convert CRLF anymore. */
4826 if (coding->spec.ccl.cr_carryover)
4827 {
4828 coding->spec.ccl.cr_carryover = 0;
4829 coding->produced++;
4830 coding->produced_char++;
4831 pend++;
4832 }
4833 p = ptr = pend;
4834 coding->eol_type = CODING_EOL_LF;
4835 coding->symbol = saved_coding_symbol;
4836 }
4837 if (p < pend)
4838 {
4839 /* As each two-byte sequence CRLF was converted to LF, (PEND
4840 - P) is the number of deleted characters. */
4841 coding->produced -= pend - p;
4842 coding->produced_char -= pend - p;
4843 }
4844 }
4845 else /* i.e. coding->eol_type == CODING_EOL_CR */
4846 {
4847 unsigned char *p = ptr;
4848
4849 for (; ptr < pend; ptr++)
4850 {
4851 if (*ptr == '\r')
4852 *ptr = '\n';
4853 else if (*ptr == '\n'
4854 && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4855 {
4856 for (; p < ptr; p++)
4857 {
4858 if (*p == '\n')
4859 *p = '\r';
4860 }
4861 ptr = pend;
4862 coding->eol_type = CODING_EOL_LF;
4863 coding->symbol = saved_coding_symbol;
4864 }
4865 }
4866 }
4867}
4868
4869/* See "GENERAL NOTES about `decode_coding_XXX ()' functions". Before
4870 decoding, it may detect coding system and format of end-of-line if
4871 those are not yet decided. The source should be unibyte, the
4872 result is multibyte if CODING->dst_multibyte is nonzero, else
4873 unibyte. */
4874
4875int
4876decode_coding (coding, source, destination, src_bytes, dst_bytes)
4877 struct coding_system *coding;
4878 const unsigned char *source;
4879 unsigned char *destination;
4880 int src_bytes, dst_bytes;
4881{
4882 int extra = 0;
4883
4884 if (coding->type == coding_type_undecided)
4885 detect_coding (coding, source, src_bytes);
4886
4887 if (coding->eol_type == CODING_EOL_UNDECIDED
4888 && coding->type != coding_type_ccl)
4889 {
4890 detect_eol (coding, source, src_bytes);
4891 /* We had better recover the original eol format if we
4892 encounter an inconsistent eol format while decoding. */
4893 coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4894 }
4895
4896 coding->produced = coding->produced_char = 0;
4897 coding->consumed = coding->consumed_char = 0;
4898 coding->errors = 0;
4899 coding->result = CODING_FINISH_NORMAL;
4900
4901 switch (coding->type)
4902 {
4903 case coding_type_sjis:
4904 decode_coding_sjis_big5 (coding, source, destination,
4905 src_bytes, dst_bytes, 1);
4906 break;
4907
4908 case coding_type_iso2022:
4909 decode_coding_iso2022 (coding, source, destination,
4910 src_bytes, dst_bytes);
4911 break;
4912
4913 case coding_type_big5:
4914 decode_coding_sjis_big5 (coding, source, destination,
4915 src_bytes, dst_bytes, 0);
4916 break;
4917
4918 case coding_type_emacs_mule:
4919 decode_coding_emacs_mule (coding, source, destination,
4920 src_bytes, dst_bytes);
4921 break;
4922
4923 case coding_type_ccl:
4924 if (coding->spec.ccl.cr_carryover)
4925 {
4926 /* Put the CR which was not processed by the previous call
4927 of decode_eol_post_ccl in DESTINATION. It will be
4928 decoded together with the following LF by the call to
4929 decode_eol_post_ccl below. */
4930 *destination = '\r';
4931 coding->produced++;
4932 coding->produced_char++;
4933 dst_bytes--;
4934 extra = coding->spec.ccl.cr_carryover;
4935 }
4936 ccl_coding_driver (coding, source, destination + extra,
4937 src_bytes, dst_bytes, 0);
4938 if (coding->eol_type != CODING_EOL_LF)
4939 {
4940 coding->produced += extra;
4941 coding->produced_char += extra;
4942 decode_eol_post_ccl (coding, destination, coding->produced);
4943 }
4944 break;
4945
4946 default:
4947 decode_eol (coding, source, destination, src_bytes, dst_bytes);
4948 }
4949
4950 if (coding->result == CODING_FINISH_INSUFFICIENT_SRC
4951 && coding->mode & CODING_MODE_LAST_BLOCK
4952 && coding->consumed == src_bytes)
4953 coding->result = CODING_FINISH_NORMAL;
4954
4955 if (coding->mode & CODING_MODE_LAST_BLOCK
4956 && coding->result == CODING_FINISH_INSUFFICIENT_SRC)
4957 {
4958 const unsigned char *src = source + coding->consumed;
4959 unsigned char *dst = destination + coding->produced;
4960
4961 src_bytes -= coding->consumed;
4962 coding->errors++;
4963 if (COMPOSING_P (coding))
4964 DECODE_COMPOSITION_END ('1');
4965 while (src_bytes--)
4966 {
4967 int c = *src++;
4968 dst += CHAR_STRING (c, dst);
4969 coding->produced_char++;
4970 }
4971 coding->consumed = coding->consumed_char = src - source;
4972 coding->produced = dst - destination;
4973 coding->result = CODING_FINISH_NORMAL;
4974 }
4975
4976 if (!coding->dst_multibyte)
4977 {
4978 coding->produced = str_as_unibyte (destination, coding->produced);
4979 coding->produced_char = coding->produced;
4980 }
4981
4982 return coding->result;
4983}
4984
4985/* See "GENERAL NOTES about `encode_coding_XXX ()' functions". The
4986 multibyteness of the source is CODING->src_multibyte, the
4987 multibyteness of the result is always unibyte. */
4988
4989int
4990encode_coding (coding, source, destination, src_bytes, dst_bytes)
4991 struct coding_system *coding;
4992 const unsigned char *source;
4993 unsigned char *destination;
4994 int src_bytes, dst_bytes;
4995{
4996 coding->produced = coding->produced_char = 0;
4997 coding->consumed = coding->consumed_char = 0;
4998 coding->errors = 0;
4999 coding->result = CODING_FINISH_NORMAL;
5000
5001 switch (coding->type)
5002 {
5003 case coding_type_sjis:
5004 encode_coding_sjis_big5 (coding, source, destination,
5005 src_bytes, dst_bytes, 1);
5006 break;
5007
5008 case coding_type_iso2022:
5009 encode_coding_iso2022 (coding, source, destination,
5010 src_bytes, dst_bytes);
5011 break;
5012
5013 case coding_type_big5:
5014 encode_coding_sjis_big5 (coding, source, destination,
5015 src_bytes, dst_bytes, 0);
5016 break;
5017
5018 case coding_type_emacs_mule:
5019 encode_coding_emacs_mule (coding, source, destination,
5020 src_bytes, dst_bytes);
5021 break;
5022
5023 case coding_type_ccl:
5024 ccl_coding_driver (coding, source, destination,
5025 src_bytes, dst_bytes, 1);
5026 break;
5027
5028 default:
5029 encode_eol (coding, source, destination, src_bytes, dst_bytes);
5030 }
5031
5032 if (coding->mode & CODING_MODE_LAST_BLOCK
5033 && coding->result == CODING_FINISH_INSUFFICIENT_SRC)
5034 {
5035 const unsigned char *src = source + coding->consumed;
5036 unsigned char *dst = destination + coding->produced;
5037
5038 if (coding->type == coding_type_iso2022)
5039 ENCODE_RESET_PLANE_AND_REGISTER;
5040 if (COMPOSING_P (coding))
5041 *dst++ = ISO_CODE_ESC, *dst++ = '1';
5042 if (coding->consumed < src_bytes)
5043 {
5044 int len = src_bytes - coding->consumed;
5045
5046 BCOPY_SHORT (src, dst, len);
5047 if (coding->src_multibyte)
5048 len = str_as_unibyte (dst, len);
5049 dst += len;
5050 coding->consumed = src_bytes;
5051 }
5052 coding->produced = coding->produced_char = dst - destination;
5053 coding->result = CODING_FINISH_NORMAL;
5054 }
5055
5056 if (coding->result == CODING_FINISH_INSUFFICIENT_SRC
5057 && coding->consumed == src_bytes)
5058 coding->result = CODING_FINISH_NORMAL;
5059
5060 return coding->result;
5061}
5062
5063/* Scan text in the region between *BEG and *END (byte positions),
5064 skip characters which we don't have to decode by coding system
5065 CODING at the head and tail, then set *BEG and *END to the region
5066 of the text we actually have to convert. The caller should move
5067 the gap out of the region in advance if the region is from a
5068 buffer.
5069
5070 If STR is not NULL, *BEG and *END are indices into STR. */
5071
5072static void
5073shrink_decoding_region (beg, end, coding, str)
5074 int *beg, *end;
5075 struct coding_system *coding;
5076 unsigned char *str;
5077{
5078 unsigned char *begp_orig, *begp, *endp_orig, *endp, c;
5079 int eol_conversion;
5080 Lisp_Object translation_table;
5081
5082 if (coding->type == coding_type_ccl
5083 || coding->type == coding_type_undecided
5084 || coding->eol_type != CODING_EOL_LF
5085 || !NILP (coding->post_read_conversion)
5086 || coding->composing != COMPOSITION_DISABLED)
5087 {
5088 /* We can't skip any data. */
5089 return;
5090 }
5091 if (coding->type == coding_type_no_conversion
5092 || coding->type == coding_type_raw_text
5093 || coding->type == coding_type_emacs_mule)
5094 {
5095 /* We need no conversion, but don't have to skip any data here.
5096 Decoding routine handles them effectively anyway. */
5097 return;
5098 }
5099
5100 translation_table = coding->translation_table_for_decode;
5101 if (NILP (translation_table) && !NILP (Venable_character_translation))
5102 translation_table = Vstandard_translation_table_for_decode;
5103 if (CHAR_TABLE_P (translation_table))
5104 {
5105 int i;
5106 for (i = 0; i < 128; i++)
5107 if (!NILP (CHAR_TABLE_REF (translation_table, i)))
5108 break;
5109 if (i < 128)
5110 /* Some ASCII character should be translated. We give up
5111 shrinking. */
5112 return;
5113 }
5114
5115 if (coding->heading_ascii >= 0)
5116 /* Detection routine has already found how much we can skip at the
5117 head. */
5118 *beg += coding->heading_ascii;
5119
5120 if (str)
5121 {
5122 begp_orig = begp = str + *beg;
5123 endp_orig = endp = str + *end;
5124 }
5125 else
5126 {
5127 begp_orig = begp = BYTE_POS_ADDR (*beg);
5128 endp_orig = endp = begp + *end - *beg;
5129 }
5130
5131 eol_conversion = (coding->eol_type == CODING_EOL_CR
5132 || coding->eol_type == CODING_EOL_CRLF);
5133
5134 switch (coding->type)
5135 {
5136 case coding_type_sjis:
5137 case coding_type_big5:
5138 /* We can skip all ASCII characters at the head. */
5139 if (coding->heading_ascii < 0)
5140 {
5141 if (eol_conversion)
5142 while (begp < endp && *begp < 0x80 && *begp != '\r') begp++;
5143 else
5144 while (begp < endp && *begp < 0x80) begp++;
5145 }
5146 /* We can skip all ASCII characters at the tail except for the
5147 second byte of SJIS or BIG5 code. */
5148 if (eol_conversion)
5149 while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\r') endp--;
5150 else
5151 while (begp < endp && endp[-1] < 0x80) endp--;
5152 /* Do not consider LF as ascii if preceded by CR, since that
5153 confuses eol decoding. */
5154 if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
5155 endp++;
5156 if (begp < endp && endp < endp_orig && endp[-1] >= 0x80)
5157 endp++;
5158 break;
5159
5160 case coding_type_iso2022:
5161 if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
5162 /* We can't skip any data. */
5163 break;
5164 if (coding->heading_ascii < 0)
5165 {
5166 /* We can skip all ASCII characters at the head except for a
5167 few control codes. */
5168 while (begp < endp && (c = *begp) < 0x80
5169 && c != ISO_CODE_CR && c != ISO_CODE_SO
5170 && c != ISO_CODE_SI && c != ISO_CODE_ESC
5171 && (!eol_conversion || c != ISO_CODE_LF))
5172 begp++;
5173 }
5174 switch (coding->category_idx)
5175 {
5176 case CODING_CATEGORY_IDX_ISO_8_1:
5177 case CODING_CATEGORY_IDX_ISO_8_2:
5178 /* We can skip all ASCII characters at the tail. */
5179 if (eol_conversion)
5180 while (begp < endp && (c = endp[-1]) < 0x80 && c != '\r') endp--;
5181 else
5182 while (begp < endp && endp[-1] < 0x80) endp--;
5183 /* Do not consider LF as ascii if preceded by CR, since that
5184 confuses eol decoding. */
5185 if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
5186 endp++;
5187 break;
5188
5189 case CODING_CATEGORY_IDX_ISO_7:
5190 case CODING_CATEGORY_IDX_ISO_7_TIGHT:
5191 {
5192 /* We can skip all characters at the tail except for 8-bit
5193 codes and ESC and the following 2-byte at the tail. */
5194 unsigned char *eight_bit = NULL;
5195
5196 if (eol_conversion)
5197 while (begp < endp
5198 && (c = endp[-1]) != ISO_CODE_ESC && c != '\r')
5199 {
5200 if (!eight_bit && c & 0x80) eight_bit = endp;
5201 endp--;
5202 }
5203 else
5204 while (begp < endp
5205 && (c = endp[-1]) != ISO_CODE_ESC)
5206 {
5207 if (!eight_bit && c & 0x80) eight_bit = endp;
5208 endp--;
5209 }
5210 /* Do not consider LF as ascii if preceded by CR, since that
5211 confuses eol decoding. */
5212 if (begp < endp && endp < endp_orig
5213 && endp[-1] == '\r' && endp[0] == '\n')
5214 endp++;
5215 if (begp < endp && endp[-1] == ISO_CODE_ESC)
5216 {
5217 if (endp + 1 < endp_orig && end[0] == '(' && end[1] == 'B')
5218 /* This is an ASCII designation sequence. We can
5219 surely skip the tail. But, if we have
5220 encountered an 8-bit code, skip only the codes
5221 after that. */
5222 endp = eight_bit ? eight_bit : endp + 2;
5223 else
5224 /* Hmmm, we can't skip the tail. */
5225 endp = endp_orig;
5226 }
5227 else if (eight_bit)
5228 endp = eight_bit;
5229 }
5230 }
5231 break;
5232
5233 default:
5234 abort ();
5235 }
5236 *beg += begp - begp_orig;
5237 *end += endp - endp_orig;
5238 return;
5239}
5240
5241/* Like shrink_decoding_region but for encoding. */
5242
5243static void
5244shrink_encoding_region (beg, end, coding, str)
5245 int *beg, *end;
5246 struct coding_system *coding;
5247 unsigned char *str;
5248{
5249 unsigned char *begp_orig, *begp, *endp_orig, *endp;
5250 int eol_conversion;
5251 Lisp_Object translation_table;
5252
5253 if (coding->type == coding_type_ccl
5254 || coding->eol_type == CODING_EOL_CRLF
5255 || coding->eol_type == CODING_EOL_CR
5256 || (coding->cmp_data && coding->cmp_data->used > 0))
5257 {
5258 /* We can't skip any data. */
5259 return;
5260 }
5261 if (coding->type == coding_type_no_conversion
5262 || coding->type == coding_type_raw_text
5263 || coding->type == coding_type_emacs_mule
5264 || coding->type == coding_type_undecided)
5265 {
5266 /* We need no conversion, but don't have to skip any data here.
5267 Encoding routine handles them effectively anyway. */
5268 return;
5269 }
5270
5271 translation_table = coding->translation_table_for_encode;
5272 if (NILP (translation_table) && !NILP (Venable_character_translation))
5273 translation_table = Vstandard_translation_table_for_encode;
5274 if (CHAR_TABLE_P (translation_table))
5275 {
5276 int i;
5277 for (i = 0; i < 128; i++)
5278 if (!NILP (CHAR_TABLE_REF (translation_table, i)))
5279 break;
5280 if (i < 128)
5281 /* Some ASCII character should be translated. We give up
5282 shrinking. */
5283 return;
5284 }
5285
5286 if (str)
5287 {
5288 begp_orig = begp = str + *beg;
5289 endp_orig = endp = str + *end;
5290 }
5291 else
5292 {
5293 begp_orig = begp = BYTE_POS_ADDR (*beg);
5294 endp_orig = endp = begp + *end - *beg;
5295 }
5296
5297 eol_conversion = (coding->eol_type == CODING_EOL_CR
5298 || coding->eol_type == CODING_EOL_CRLF);
5299
5300 /* Here, we don't have to check coding->pre_write_conversion because
5301 the caller is expected to have handled it already. */
5302 switch (coding->type)
5303 {
5304 case coding_type_iso2022:
5305 if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
5306 /* We can't skip any data. */
5307 break;
5308 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL)
5309 {
5310 unsigned char *bol = begp;
5311 while (begp < endp && *begp < 0x80)
5312 {
5313 begp++;
5314 if (begp[-1] == '\n')
5315 bol = begp;
5316 }
5317 begp = bol;
5318 goto label_skip_tail;
5319 }
5320 /* fall down ... */
5321
5322 case coding_type_sjis:
5323 case coding_type_big5:
5324 /* We can skip all ASCII characters at the head and tail. */
5325 if (eol_conversion)
5326 while (begp < endp && *begp < 0x80 && *begp != '\n') begp++;
5327 else
5328 while (begp < endp && *begp < 0x80) begp++;
5329 label_skip_tail:
5330 if (eol_conversion)
5331 while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\n') endp--;
5332 else
5333 while (begp < endp && *(endp - 1) < 0x80) endp--;
5334 break;
5335
5336 default:
5337 abort ();
5338 }
5339
5340 *beg += begp - begp_orig;
5341 *end += endp - endp_orig;
5342 return;
5343}
5344
5345/* As shrinking conversion region requires some overhead, we don't try
5346 shrinking if the length of conversion region is less than this
5347 value. */
5348static int shrink_conversion_region_threshhold = 1024;
5349
/* Shrink the conversion region *BEG..*END with shrink_encoding_region
   (if ENCODEP is nonzero) or shrink_decoding_region (otherwise), but
   only when *END - *BEG is greater than
   shrink_conversion_region_threshhold.  BEG and END are pointers to
   the positions; CODING and STR are handed on unchanged.  */
5350#define SHRINK_CONVERSION_REGION(beg, end, coding, str, encodep) \
5351 do { \
5352 if (*(end) - *(beg) > shrink_conversion_region_threshhold) \
5353 { \
5354 if (encodep) shrink_encoding_region (beg, end, coding, str); \
5355 else shrink_decoding_region (beg, end, coding, str); \
5356 } \
5357 } while (0)
5358
5359/* ARG is (CODING BUFFER ...) where CODING is what to be set in
5360 Vlast_coding_system_used and the remaining elements are buffers to
5361 kill. */
5362static Lisp_Object
5363code_convert_region_unwind (arg)
5364 Lisp_Object arg;
5365{
5366 struct gcpro gcpro1;
5367 GCPRO1 (arg);
5368
5369 inhibit_pre_post_conversion = 0;
5370 Vlast_coding_system_used = XCAR (arg);
5371 for (arg = XCDR (arg); ! NILP (arg); arg = XCDR (arg))
5372 Fkill_buffer (XCAR (arg));
5373
5374 UNGCPRO;
5375 return Qnil;
5376}
5377
5378/* Store information about all compositions in the range FROM and TO
5379 of OBJ in memory blocks pointed by CODING->cmp_data. OBJ is a
5380 buffer or a string, defaults to the current buffer. */
5381
5382void
5383coding_save_composition (coding, from, to, obj)
5384 struct coding_system *coding;
5385 int from, to;
5386 Lisp_Object obj;
5387{
5388 Lisp_Object prop;
5389 int start, end;
5390
5391 if (coding->composing == COMPOSITION_DISABLED)
5392 return;
5393 if (!coding->cmp_data)
5394 coding_allocate_composition_data (coding, from);
5395 if (!find_composition (from, to, &start, &end, &prop, obj)
5396 || end > to)
5397 return;
5398 if (start < from
5399 && (!find_composition (end, to, &start, &end, &prop, obj)
5400 || end > to))
5401 return;
5402 coding->composing = COMPOSITION_NO;
5403 do
5404 {
5405 if (COMPOSITION_VALID_P (start, end, prop))
5406 {
5407 enum composition_method method = COMPOSITION_METHOD (prop);
5408 if (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH
5409 >= COMPOSITION_DATA_SIZE)
5410 coding_allocate_composition_data (coding, from);
5411 /* For relative composition, we remember start and end
5412 positions, for the other compositions, we also remember
5413 components. */
5414 CODING_ADD_COMPOSITION_START (coding, start - from, method);
5415 if (method != COMPOSITION_RELATIVE)
5416 {
5417 /* We must store a*/
5418 Lisp_Object val, ch;
5419
5420 val = COMPOSITION_COMPONENTS (prop);
5421 if (CONSP (val))
5422 while (CONSP (val))
5423 {
5424 ch = XCAR (val), val = XCDR (val);
5425 CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (ch));
5426 }
5427 else if (VECTORP (val) || STRINGP (val))
5428 {
5429 int len = (VECTORP (val)
5430 ? XVECTOR (val)->size : SCHARS (val));
5431 int i;
5432 for (i = 0; i < len; i++)
5433 {
5434 ch = (STRINGP (val)
5435 ? Faref (val, make_number (i))
5436 : XVECTOR (val)->contents[i]);
5437 CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (ch));
5438 }
5439 }
5440 else /* INTEGERP (val) */
5441 CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (val));
5442 }
5443 CODING_ADD_COMPOSITION_END (coding, end - from);
5444 }
5445 start = end;
5446 }
5447 while (start < to
5448 && find_composition (start, to, &start, &end, &prop, obj)
5449 && end <= to);
5450
5451 /* Make coding->cmp_data point to the first memory block. */
5452 while (coding->cmp_data->prev)
5453 coding->cmp_data = coding->cmp_data->prev;
5454 coding->cmp_data_start = 0;
5455}
5456
5457/* Reflect the saved information about compositions to OBJ.
5458 CODING->cmp_data points to a memory block for the information. OBJ
5459 is a buffer or a string, defaults to the current buffer. */
5460
5461void
5462coding_restore_composition (coding, obj)
5463 struct coding_system *coding;
5464 Lisp_Object obj;
5465{
5466 struct composition_data *cmp_data = coding->cmp_data;
5467
5468 if (!cmp_data)
5469 return;
5470
5471 while (cmp_data->prev)
5472 cmp_data = cmp_data->prev;
5473
5474 while (cmp_data)
5475 {
5476 int i;
5477
5478 for (i = 0; i < cmp_data->used && cmp_data->data[i] > 0;
5479 i += cmp_data->data[i])
5480 {
5481 int *data = cmp_data->data + i;
5482 enum composition_method method = (enum composition_method) data[3];
5483 Lisp_Object components;
5484
5485 if (data[0] < 0 || i + data[0] > cmp_data->used)
5486 /* Invalid composition data. */
5487 break;
5488
5489 if (method == COMPOSITION_RELATIVE)
5490 components = Qnil;
5491 else
5492 {
5493 int len = data[0] - 4, j;
5494 Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
5495
5496 if (method == COMPOSITION_WITH_RULE_ALTCHARS
5497 && len % 2 == 0)
5498 len --;
5499 if (len < 1)
5500 /* Invalid composition data. */
5501 break;
5502 for (j = 0; j < len; j++)
5503 args[j] = make_number (data[4 + j]);
5504 components = (method == COMPOSITION_WITH_ALTCHARS
5505 ? Fstring (len, args)
5506 : Fvector (len, args));
5507 }
5508 compose_text (data[1], data[2], components, Qnil, obj);
5509 }
5510 cmp_data = cmp_data->next;
5511 }
5512}
5513
5514/* Decode (if ENCODEP is zero) or encode (if ENCODEP is nonzero) the
5515 text from FROM to TO (byte positions are FROM_BYTE and TO_BYTE) by
5516 coding system CODING, and return the status code of code conversion
5517 (currently, this value has no meaning).
5518
5519 How many characters (and bytes) are converted to how many
5520 characters (and bytes) are recorded in members of the structure
5521 CODING.
5522
5523 If REPLACE is nonzero, we do various things as if the original text
5524 is deleted and a new text is inserted. See the comments in
5525 replace_range (insdel.c) to know what we are doing.
5526
5527 If REPLACE is zero, it is assumed that the source text is unibyte.
5528 Otherwise, it is assumed that the source text is multibyte. */
5529
/* Both return statements below return 0.  The amounts converted are
   reported through CODING: the normal exit sets consumed,
   consumed_char, produced and produced_char; the early exit taken
   when shrinking leaves nothing to convert sets only produced and
   produced_char.  */
5530int
5531code_convert_region (from, from_byte, to, to_byte, coding, encodep, replace)
5532 int from, from_byte, to, to_byte, encodep, replace;
5533 struct coding_system *coding;
5534{
5535 int len = to - from, len_byte = to_byte - from_byte;
5536 int nchars_del = 0, nbytes_del = 0;
5537 int require, inserted, inserted_byte;
5538 int head_skip, tail_skip, total_skip = 0;
5539 Lisp_Object saved_coding_symbol;
5540 int first = 1;
5541 unsigned char *src, *dst;
5542 Lisp_Object deletion;
5543 int orig_point = PT, orig_len = len;
5544 int prev_Z;
5545 int multibyte_p = !NILP (current_buffer->enable_multibyte_characters);
5546
5547 deletion = Qnil;
5548 saved_coding_symbol = coding->symbol;
5549
 /* If point is strictly inside the region, move it to FROM and
    treat FROM as the original point from now on. */
5550 if (from < PT && PT < to)
5551 {
5552 TEMP_SET_PT_BOTH (from, from_byte);
5553 orig_point = from;
5554 }
5555
5556 if (replace)
5557 {
5558 int saved_from = from;
5559 int saved_inhibit_modification_hooks;
5560
 /* prepare_to_modify_buffer is given the address of FROM; if FROM
    comes back changed, TO and the byte positions are recomputed
    from it. */
5561 prepare_to_modify_buffer (from, to, &from);
5562 if (saved_from != from)
5563 {
5564 to = from + len;
5565 from_byte = CHAR_TO_BYTE (from), to_byte = CHAR_TO_BYTE (to);
5566 len_byte = to_byte - from_byte;
5567 }
5568
5569 /* The code conversion routine can not preserve text properties
5570 for now. So, we must remove all text properties in the
5571 region. Here, we must suppress all modification hooks. */
5572 saved_inhibit_modification_hooks = inhibit_modification_hooks;
5573 inhibit_modification_hooks = 1;
5574 Fset_text_properties (make_number (from), make_number (to), Qnil, Qnil);
5575 inhibit_modification_hooks = saved_inhibit_modification_hooks;
5576 }
5577
5578 if (! encodep && CODING_REQUIRE_DETECTION (coding))
5579 {
5580 /* We must detect encoding of text and eol format. */
5581
5582 if (from < GPT && to > GPT)
5583 move_gap_both (from, from_byte);
5584 if (coding->type == coding_type_undecided)
5585 {
5586 detect_coding (coding, BYTE_POS_ADDR (from_byte), len_byte);
5587 if (coding->type == coding_type_undecided)
5588 {
5589 /* It seems that the text contains only ASCII, but we
5590 should not leave it undecided because the deeper
5591 decoding routine (decode_coding) tries to detect the
5592 encodings again in vain. */
5593 coding->type = coding_type_emacs_mule;
5594 coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE;
5595 /* As emacs-mule decoder will handle composition, we
5596 need this setting to allocate coding->cmp_data
5597 later. */
5598 coding->composing = COMPOSITION_NO;
5599 }
5600 }
5601 if (coding->eol_type == CODING_EOL_UNDECIDED
5602 && coding->type != coding_type_ccl)
5603 {
5604 detect_eol (coding, BYTE_POS_ADDR (from_byte), len_byte);
5605 if (coding->eol_type == CODING_EOL_UNDECIDED)
5606 coding->eol_type = CODING_EOL_LF;
5607 /* We had better recover the original eol format if we
5608 encounter an inconsistent eol format while decoding. */
5609 coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
5610 }
5611 }
5612
5613 /* Now we convert the text. */
5614
5615 /* For encoding, we must process pre-write-conversion in advance. */
5616 if (! inhibit_pre_post_conversion
5617 && encodep
5618 && SYMBOLP (coding->pre_write_conversion)
5619 && ! NILP (Ffboundp (coding->pre_write_conversion)))
5620 {
5621 /* The function in pre-write-conversion may put a new text in a
5622 new buffer. */
5623 struct buffer *prev = current_buffer;
5624 Lisp_Object new;
5625
5626 record_unwind_protect (code_convert_region_unwind,
5627 Fcons (Vlast_coding_system_used, Qnil));
5628 /* We should not call any more pre-write/post-read-conversion
5629 functions while this pre-write-conversion is running. */
5630 inhibit_pre_post_conversion = 1;
5631 call2 (coding->pre_write_conversion,
5632 make_number (from), make_number (to));
5633 inhibit_pre_post_conversion = 0;
5634 /* Discard the unwind protect. */
5635 specpdl_ptr--;
5636
 /* The function left us in another buffer: replace the region in
    the previous buffer with the accessible text of the new buffer,
    kill the new buffer, and recompute the positions and lengths. */
5637 if (current_buffer != prev)
5638 {
5639 len = ZV - BEGV;
5640 new = Fcurrent_buffer ();
5641 set_buffer_internal_1 (prev);
5642 del_range_2 (from, from_byte, to, to_byte, 0);
5643 TEMP_SET_PT_BOTH (from, from_byte);
5644 insert_from_buffer (XBUFFER (new), 1, len, 0);
5645 Fkill_buffer (new);
5646 if (orig_point >= to)
5647 orig_point += len - orig_len;
5648 else if (orig_point > from)
5649 orig_point = from;
5650 orig_len = len;
5651 to = from + len;
5652 from_byte = CHAR_TO_BYTE (from);
5653 to_byte = CHAR_TO_BYTE (to);
5654 len_byte = to_byte - from_byte;
5655 TEMP_SET_PT_BOTH (from, from_byte);
5656 }
5657 }
5658
 /* When replacing, keep what the final adjust_after_replace call
    needs: a copy of the old text if the undo list is not t,
    otherwise just the character and byte counts being deleted. */
5659 if (replace)
5660 {
5661 if (! EQ (current_buffer->undo_list, Qt))
5662 deletion = make_buffer_string_both (from, from_byte, to, to_byte, 1);
5663 else
5664 {
5665 nchars_del = to - from;
5666 nbytes_del = to_byte - from_byte;
5667 }
5668 }
5669
5670 if (coding->composing != COMPOSITION_DISABLED)
5671 {
5672 if (encodep)
5673 coding_save_composition (coding, from, to, Fcurrent_buffer ());
5674 else
5675 coding_allocate_composition_data (coding, from);
5676 }
5677
5678 /* Try to skip the heading and tailing ASCIIs. We can't skip them
5679 if we must run CCL program or there are compositions to
5680 encode. */
5681 if (coding->type != coding_type_ccl
5682 && (! coding->cmp_data || coding->cmp_data->used == 0))
5683 {
5684 int from_byte_orig = from_byte, to_byte_orig = to_byte;
5685
5686 if (from < GPT && GPT < to)
5687 move_gap_both (from, from_byte);
5688 SHRINK_CONVERSION_REGION (&from_byte, &to_byte, coding, NULL, encodep);
 /* Shrinking left an empty region: nothing has to be converted,
    so report the original lengths as produced and return. */
5689 if (from_byte == to_byte
5690 && (encodep || NILP (coding->post_read_conversion))
5691 && ! CODING_REQUIRE_FLUSHING (coding))
5692 {
5693 coding->produced = len_byte;
5694 coding->produced_char = len;
5695 if (!replace)
5696 /* We must record and adjust for this new text now. */
5697 adjust_after_insert (from, from_byte_orig, to, to_byte_orig, len);
5698 coding_free_composition_data (coding);
5699 return 0;
5700 }
5701
 /* The skipped byte counts are applied to the character positions
    and lengths as well -- presumably because only ASCII text (one
    byte per character) is ever skipped; confirm against the shrink
    functions. */
5702 head_skip = from_byte - from_byte_orig;
5703 tail_skip = to_byte_orig - to_byte;
5704 total_skip = head_skip + tail_skip;
5705 from += head_skip;
5706 to -= tail_skip;
5707 len -= total_skip; len_byte -= total_skip;
5708 }
5709
5710 /* For conversion, we must put the gap before the text in addition to
5711 making the gap larger for efficient decoding. The required gap
5712 size starts from 2000 which is the magic number used in make_gap.
5713 But, after one batch of conversion, it will be incremented if we
5714 find that it is not enough. */
5715 require = 2000;
5716
5717 if (GAP_SIZE < require)
5718 make_gap (require - GAP_SIZE);
5719 move_gap_both (from, from_byte);
5720
5721 inserted = inserted_byte = 0;
5722
 /* Count the source text as part of the gap and take it out of the
    buffer sizes; the conversion loop below reads it from the end of
    the gap and writes the result at the start of the gap (see the
    memory diagrams in the loop). */
5723 GAP_SIZE += len_byte;
5724 ZV -= len;
5725 Z -= len;
5726 ZV_BYTE -= len_byte;
5727 Z_BYTE -= len_byte;
5728
5729 if (GPT - BEG < BEG_UNCHANGED)
5730 BEG_UNCHANGED = GPT - BEG;
5731 if (Z - GPT < END_UNCHANGED)
5732 END_UNCHANGED = Z - GPT;
5733
5734 if (!encodep && coding->src_multibyte)
5735 {
5736 /* Decoding routines expects that the source text is unibyte.
5737 We must convert 8-bit characters of multibyte form to
5738 unibyte. */
5739 int len_byte_orig = len_byte;
5740 len_byte = str_as_unibyte (GAP_END_ADDR - len_byte, len_byte);
 /* If the text got shorter, move it so that it again ends at
    GAP_END_ADDR. */
5741 if (len_byte < len_byte_orig)
5742 safe_bcopy (GAP_END_ADDR - len_byte_orig, GAP_END_ADDR - len_byte,
5743 len_byte);
5744 coding->src_multibyte = 0;
5745 }
5746
5747 for (;;)
5748 {
5749 int result;
5750
5751 /* The buffer memory is now:
5752 +--------+converted-text+---------+-------original-text-------+---+
5753 |<-from->|<--inserted-->|---------|<--------len_byte--------->|---|
5754 |<---------------------- GAP ----------------------->| */
5755 src = GAP_END_ADDR - len_byte;
5756 dst = GPT_ADDR + inserted_byte;
5757
5758 if (encodep)
5759 result = encode_coding (coding, src, dst, len_byte, 0);
5760 else
5761 {
5762 if (coding->composing != COMPOSITION_DISABLED)
5763 coding->cmp_data->char_offset = from + inserted;
5764 result = decode_coding (coding, src, dst, len_byte, 0);
5765 }
5766
5767 /* The buffer memory is now:
5768 +--------+-------converted-text----+--+------original-text----+---+
5769 |<-from->|<-inserted->|<-produced->|--|<-(len_byte-consumed)->|---|
5770 |<---------------------- GAP ----------------------->| */
5771
5772 inserted += coding->produced_char;
5773 inserted_byte += coding->produced;
5774 len_byte -= coding->consumed;
5775
 /* The composition data block is full: allocate another one and
    retry.  SRC and DST are recomputed at the top of the loop. */
5776 if (result == CODING_FINISH_INSUFFICIENT_CMP)
5777 {
5778 coding_allocate_composition_data (coding, from + inserted);
5779 continue;
5780 }
5781
5782 src += coding->consumed;
5783 dst += coding->produced;
5784
5785 if (result == CODING_FINISH_NORMAL)
5786 {
5787 src += len_byte;
5788 break;
5789 }
5790 if (! encodep && result == CODING_FINISH_INCONSISTENT_EOL)
5791 {
5792 unsigned char *pend = dst, *p = pend - inserted_byte;
5793 Lisp_Object eol_type;
5794
5795 /* Encode LFs back to the original eol format (CR or CRLF). */
5796 if (coding->eol_type == CODING_EOL_CR)
5797 {
5798 while (p < pend) if (*p++ == '\n') p[-1] = '\r';
5799 }
5800 else
5801 {
5802 int count = 0;
5803
5804 while (p < pend) if (*p++ == '\n') count++;
5805 if (src - dst < count)
5806 {
5807 /* We don't have sufficient room for encoding LFs
5808 back to CRLF. We must record converted and
5809 not-yet-converted text back to the buffer
5810 content, enlarge the gap, then record them out of
5811 the buffer contents again. */
5812 int add = len_byte + inserted_byte;
5813
5814 GAP_SIZE -= add;
5815 ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
5816 GPT += inserted_byte; GPT_BYTE += inserted_byte;
5817 make_gap (count - GAP_SIZE);
5818 GAP_SIZE += add;
5819 ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
5820 GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5821 /* Don't forget to update SRC, DST, and PEND. */
5822 src = GAP_END_ADDR - len_byte;
5823 dst = GPT_ADDR + inserted_byte;
5824 pend = dst;
5825 }
5826 inserted += count;
5827 inserted_byte += count;
5828 coding->produced += count;
 /* Copy the converted text backward into its final place,
    putting a CR before each LF, until all COUNT LFs are done. */
5829 p = dst = pend + count;
5830 while (count)
5831 {
5832 *--p = *--pend;
5833 if (*p == '\n') count--, *--p = '\r';
5834 }
5835 }
5836
5837 /* Suppress eol-format conversion in the further conversion. */
5838 coding->eol_type = CODING_EOL_LF;
5839
5840 /* Set the coding system symbol to that for Unix-like EOL. */
5841 eol_type = Fget (saved_coding_symbol, Qeol_type);
5842 if (VECTORP (eol_type)
5843 && XVECTOR (eol_type)->size == 3
5844 && SYMBOLP (XVECTOR (eol_type)->contents[CODING_EOL_LF]))
5845 coding->symbol = XVECTOR (eol_type)->contents[CODING_EOL_LF];
5846 else
5847 coding->symbol = saved_coding_symbol;
5848
5849 continue;
5850 }
 /* The whole source is consumed.  A CCL-based coding system gets
    one more round with CODING_MODE_LAST_BLOCK set before the loop
    ends. */
5851 if (len_byte <= 0)
5852 {
5853 if (coding->type != coding_type_ccl
5854 || coding->mode & CODING_MODE_LAST_BLOCK)
5855 break;
5856 coding->mode |= CODING_MODE_LAST_BLOCK;
5857 continue;
5858 }
5859 if (result == CODING_FINISH_INSUFFICIENT_SRC)
5860 {
5861 /* The source text ends in invalid codes. Let's just
5862 make them valid buffer contents, and finish conversion. */
5863 if (multibyte_p)
5864 {
5865 unsigned char *start = dst;
5866
5867 inserted += len_byte;
5868 while (len_byte--)
5869 {
5870 int c = *src++;
5871 dst += CHAR_STRING (c, dst);
5872 }
5873
5874 inserted_byte += dst - start;
5875 }
5876 else
5877 {
5878 inserted += len_byte;
5879 inserted_byte += len_byte;
5880 while (len_byte--)
5881 *dst++ = *src++;
5882 }
5883 break;
5884 }
5885 if (result == CODING_FINISH_INTERRUPT)
5886 {
5887 /* The conversion procedure was interrupted by a user. */
5888 break;
5889 }
5890 /* Now RESULT == CODING_FINISH_INSUFFICIENT_DST */
5891 if (coding->consumed < 1)
5892 {
5893 /* It's quite strange to require more memory without
5894 consuming any bytes. Perhaps CCL program bug. */
5895 break;
5896 }
5897 if (first)
5898 {
5899 /* We have just done the first batch of conversion which was
5900 stopped because of insufficient gap. Let's reconsider the
5901 required gap size (i.e. SRC - DST) now.
5902
5903 We have converted ORIG bytes (== coding->consumed) into
5904 NEW bytes (coding->produced). To convert the remaining
5905 LEN bytes, we may need REQUIRE bytes of gap, where:
5906 REQUIRE + LEN_BYTE = LEN_BYTE * (NEW / ORIG)
5907 REQUIRE = LEN_BYTE * (NEW - ORIG) / ORIG
5908 Here, we are sure that NEW >= ORIG. */
5909
5910 if (coding->produced <= coding->consumed)
5911 {
5912 /* This happens because of CCL-based coding system with
5913 eol-type CRLF. */
5914 require = 0;
5915 }
5916 else
5917 {
5918 float ratio = coding->produced - coding->consumed;
5919 ratio /= coding->consumed;
5920 require = len_byte * ratio;
5921 }
5922 first = 0;
5923 }
5924 if ((src - dst) < (require + 2000))
5925 {
5926 /* See the comment above the previous call of make_gap. */
5927 int add = len_byte + inserted_byte;
5928
5929 GAP_SIZE -= add;
5930 ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
5931 GPT += inserted_byte; GPT_BYTE += inserted_byte;
5932 make_gap (require + 2000);
5933 GAP_SIZE += add;
5934 ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
5935 GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5936 }
5937 }
5938 if (src - dst > 0) *dst = 0; /* Put an anchor. */
5939
5940 if (encodep && coding->dst_multibyte)
5941 {
5942 /* The output is unibyte. We must convert 8-bit characters to
5943 multibyte form. */
 /* Enlarge the gap first if it is smaller than twice the size of
    the converted text, using the same bookkeeping as for the
    make_gap calls in the loop above. */
5944 if (inserted_byte * 2 > GAP_SIZE)
5945 {
5946 GAP_SIZE -= inserted_byte;
5947 ZV += inserted_byte; Z += inserted_byte;
5948 ZV_BYTE += inserted_byte; Z_BYTE += inserted_byte;
5949 GPT += inserted_byte; GPT_BYTE += inserted_byte;
5950 make_gap (inserted_byte - GAP_SIZE);
5951 GAP_SIZE += inserted_byte;
5952 ZV -= inserted_byte; Z -= inserted_byte;
5953 ZV_BYTE -= inserted_byte; Z_BYTE -= inserted_byte;
5954 GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5955 }
5956 inserted_byte = str_to_multibyte (GPT_ADDR, GAP_SIZE, inserted_byte);
5957 }
5958
5959 /* If we shrank the conversion area, adjust it now. */
5960 if (total_skip > 0)
5961 {
 /* Bring the skipped tail up right behind the converted text,
    then widen all the positions again by the skipped amounts. */
5962 if (tail_skip > 0)
5963 safe_bcopy (GAP_END_ADDR, GPT_ADDR + inserted_byte, tail_skip);
5964 inserted += total_skip; inserted_byte += total_skip;
5965 GAP_SIZE += total_skip;
5966 GPT -= head_skip; GPT_BYTE -= head_skip;
5967 ZV -= total_skip; ZV_BYTE -= total_skip;
5968 Z -= total_skip; Z_BYTE -= total_skip;
5969 from -= head_skip; from_byte -= head_skip;
5970 to += tail_skip; to_byte += tail_skip;
5971 }
5972
 /* INSERTED is recomputed as the change of Z across the
    adjust_after_replace call. */
5973 prev_Z = Z;
5974 if (! EQ (current_buffer->undo_list, Qt))
5975 adjust_after_replace (from, from_byte, deletion, inserted, inserted_byte);
5976 else
5977 adjust_after_replace_noundo (from, from_byte, nchars_del, nbytes_del,
5978 inserted, inserted_byte);
5979 inserted = Z - prev_Z;
5980
5981 if (!encodep && coding->cmp_data && coding->cmp_data->used)
5982 coding_restore_composition (coding, Fcurrent_buffer ());
5983 coding_free_composition_data (coding);
5984
5985 if (! inhibit_pre_post_conversion
5986 && ! encodep && ! NILP (coding->post_read_conversion))
5987 {
5988 Lisp_Object val;
5989 Lisp_Object saved_coding_system;
5990
5991 if (from != PT)
5992 TEMP_SET_PT_BOTH (from, from_byte);
5993 prev_Z = Z;
5994 record_unwind_protect (code_convert_region_unwind,
5995 Fcons (Vlast_coding_system_used, Qnil));
5996 saved_coding_system = Vlast_coding_system_used;
5997 Vlast_coding_system_used = coding->symbol;
5998 /* We should not call any more pre-write/post-read-conversion
5999 functions while this post-read-conversion is running. */
6000 inhibit_pre_post_conversion = 1;
6001 val = call1 (coding->post_read_conversion, make_number (inserted));
6002 inhibit_pre_post_conversion = 0;
6003 coding->symbol = Vlast_coding_system_used;
6004 Vlast_coding_system_used = saved_coding_system;
6005 /* Discard the unwind protect. */
6006 specpdl_ptr--;
 /* VAL is only checked to be a number; the change in the amount
    of inserted text is taken from the change of Z instead. */
6007 CHECK_NUMBER (val);
6008 inserted += Z - prev_Z;
6009 }
6010
 /* Put point back: a point that was at or after the end of the
    original text is shifted by the change in length; one that was
    inside the text goes to FROM. */
6011 if (orig_point >= from)
6012 {
6013 if (orig_point >= from + orig_len)
6014 orig_point += inserted - orig_len;
6015 else
6016 orig_point = from;
6017 TEMP_SET_PT (orig_point);
6018 }
6019
6020 if (replace)
6021 {
6022 signal_after_change (from, to - from, inserted);
6023 update_compositions (from, from + inserted, CHECK_BORDER);
6024 }
6025
 /* Report the totals for the whole region through CODING. */
6026 {
6027 coding->consumed = to_byte - from_byte;
6028 coding->consumed_char = to - from;
6029 coding->produced = inserted_byte;
6030 coding->produced_char = inserted;
6031 }
6032
6033 return 0;
6034}
6035
6036/* Name (or base name) of work buffer for code conversion. */
6037static Lisp_Object Vcode_conversion_workbuf_name;
6038
6039/* Set the current buffer to the working buffer prepared for
6040 code-conversion. MULTIBYTE specifies the multibyteness of the
6041 buffer. Return the buffer we set if it must be killed after use.
6042 Otherwise return Qnil. */
6043
6044static Lisp_Object
6045set_conversion_work_buffer (multibyte)
6046 int multibyte;
6047{
6048 Lisp_Object buffer, buffer_to_kill;
6049 struct buffer *buf;
6050
6051 buffer = Fget_buffer_create (Vcode_conversion_workbuf_name);
6052 buf = XBUFFER (buffer);
6053 if (buf == current_buffer)
6054 {
6055 /* As we are already in the work buffer, we must generate a new
6056 buffer for the work. */
6057 Lisp_Object name;
6058
6059 name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
6060 buffer = buffer_to_kill = Fget_buffer_create (name);
6061 buf = XBUFFER (buffer);
6062 }
6063 else
6064 buffer_to_kill = Qnil;
6065
6066 delete_all_overlays (buf);
6067 buf->directory = current_buffer->directory;
6068 buf->read_only = Qnil;
6069 buf->filename = Qnil;
6070 buf->undo_list = Qt;
6071 eassert (buf->overlays_before == NULL);
6072 eassert (buf->overlays_after == NULL);
6073 set_buffer_internal (buf);
6074 if (BEG != BEGV || Z != ZV)
6075 Fwiden ();
6076 del_range_2 (BEG, BEG_BYTE, Z, Z_BYTE, 0);
6077 buf->enable_multibyte_characters = multibyte ? Qt : Qnil;
6078 return buffer_to_kill;
6079}
6080
6081Lisp_Object
6082run_pre_post_conversion_on_str (str, coding, encodep)
6083 Lisp_Object str;
6084 struct coding_system *coding;
6085 int encodep;
6086{
6087 int count = SPECPDL_INDEX ();
6088 struct gcpro gcpro1, gcpro2;
6089 int multibyte = STRING_MULTIBYTE (str);
6090 Lisp_Object old_deactivate_mark;
6091 Lisp_Object buffer_to_kill;
6092 Lisp_Object unwind_arg;
6093
6094 record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
6095 /* It is not crucial to specbind this. */
6096 old_deactivate_mark = Vdeactivate_mark;
6097 GCPRO2 (str, old_deactivate_mark);
6098
6099 /* We must insert the contents of STR as is without
6100 unibyte<->multibyte conversion. For that, we adjust the
6101 multibyteness of the working buffer to that of STR. */
6102 buffer_to_kill = set_conversion_work_buffer (multibyte);
6103 if (NILP (buffer_to_kill))
6104 unwind_arg = Fcons (Vlast_coding_system_used, Qnil);
6105 else
6106 unwind_arg = list2 (Vlast_coding_system_used, buffer_to_kill);
6107 record_unwind_protect (code_convert_region_unwind, unwind_arg);
6108
6109 insert_from_string (str, 0, 0,
6110 SCHARS (str), SBYTES (str), 0);
6111 UNGCPRO;
6112 inhibit_pre_post_conversion = 1;
6113 if (encodep)
6114 {
6115 struct buffer *prev = current_buffer;
6116
6117 call2 (coding->pre_write_conversion, make_number (BEG), make_number (Z));
6118 if (prev != current_buffer)
6119 /* We must kill the current buffer too. */
6120 Fsetcdr (unwind_arg, Fcons (Fcurrent_buffer (), XCDR (unwind_arg)));
6121 }
6122 else
6123 {
6124 Vlast_coding_system_used = coding->symbol;
6125 TEMP_SET_PT_BOTH (BEG, BEG_BYTE);
6126 call1 (coding->post_read_conversion, make_number (Z - BEG));
6127 coding->symbol = Vlast_coding_system_used;
6128 }
6129 inhibit_pre_post_conversion = 0;
6130 Vdeactivate_mark = old_deactivate_mark;
6131 str = make_buffer_string (BEG, Z, 1);
6132 return unbind_to (count, str);
6133}
6134
6135
6136/* Run pre-write-conversion function of CODING on NCHARS/NBYTES
6137 text in *STR. *SIZE is the allocated bytes for STR. As it
6138 is intended that this function is called from encode_terminal_code,
6139 the pre-write-conversion function is run by safe_call and thus
6140 "Error during redisplay: ..." is logged when an error occurs.
6141
6142 Store the resulting text in *STR and set CODING->produced_char and
6143 CODING->produced to the number of characters and bytes
6144 respectively. If the size of *STR is too small, enlarge it by
6145 xrealloc and update *STR and *SIZE. */
6146
6147void
6148run_pre_write_conversin_on_c_str (str, size, nchars, nbytes, coding)
6149 unsigned char **str;
6150 int *size, nchars, nbytes;
6151 struct coding_system *coding;
6152{
6153 struct gcpro gcpro1, gcpro2;
6154 struct buffer *cur = current_buffer;
6155 struct buffer *prev;
6156 Lisp_Object old_deactivate_mark, old_last_coding_system_used;
6157 Lisp_Object args[3];
6158 Lisp_Object buffer_to_kill;
6159
6160 /* It is not crucial to specbind this. */
6161 old_deactivate_mark = Vdeactivate_mark;
6162 old_last_coding_system_used = Vlast_coding_system_used;
6163 GCPRO2 (old_deactivate_mark, old_last_coding_system_used);
6164
6165 /* We must insert the contents of STR as is without
6166 unibyte<->multibyte conversion. For that, we adjust the
6167 multibyteness of the working buffer to that of STR. */
6168 buffer_to_kill = set_conversion_work_buffer (coding->src_multibyte);
6169 insert_1_both (*str, nchars, nbytes, 0, 0, 0);
6170 UNGCPRO;
6171 inhibit_pre_post_conversion = 1;
6172 prev = current_buffer;
6173 args[0] = coding->pre_write_conversion;
6174 args[1] = make_number (BEG);
6175 args[2] = make_number (Z);
6176 safe_call (3, args);
6177 inhibit_pre_post_conversion = 0;
6178 Vdeactivate_mark = old_deactivate_mark;
6179 Vlast_coding_system_used = old_last_coding_system_used;
6180 coding->produced_char = Z - BEG;
6181 coding->produced = Z_BYTE - BEG_BYTE;
6182 if (coding->produced > *size)
6183 {
6184 *size = coding->produced;
6185 *str = xrealloc (*str, *size);
6186 }
6187 if (BEG < GPT && GPT < Z)
6188 move_gap (BEG);
6189 bcopy (BEG_ADDR, *str, coding->produced);
6190 coding->src_multibyte
6191 = ! NILP (current_buffer->enable_multibyte_characters);
6192 if (prev != current_buffer)
6193 Fkill_buffer (Fcurrent_buffer ());
6194 set_buffer_internal (cur);
6195 if (! NILP (buffer_to_kill))
6196 Fkill_buffer (buffer_to_kill);
6197}
6198
6199
6200Lisp_Object
6201decode_coding_string (str, coding, nocopy)
6202 Lisp_Object str;
6203 struct coding_system *coding;
6204 int nocopy;
6205{
6206 int len;
6207 struct conversion_buffer buf;
6208 int from, to_byte;
6209 Lisp_Object saved_coding_symbol;
6210 int result;
6211 int require_decoding;
6212 int shrinked_bytes = 0;
6213 Lisp_Object newstr;
6214 int consumed, consumed_char, produced, produced_char;
6215
6216 from = 0;
6217 to_byte = SBYTES (str);
6218
6219 saved_coding_symbol = coding->symbol;
6220 coding->src_multibyte = STRING_MULTIBYTE (str);
6221 coding->dst_multibyte = 1;
6222 if (CODING_REQUIRE_DETECTION (coding))
6223 {
6224 /* See the comments in code_convert_region. */
6225 if (coding->type == coding_type_undecided)
6226 {
6227 detect_coding (coding, SDATA (str), to_byte);
6228 if (coding->type == coding_type_undecided)
6229 {
6230 coding->type = coding_type_emacs_mule;
6231 coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE;
6232 /* As emacs-mule decoder will handle composition, we
6233 need this setting to allocate coding->cmp_data
6234 later. */
6235 coding->composing = COMPOSITION_NO;
6236 }
6237 }
6238 if (coding->eol_type == CODING_EOL_UNDECIDED
6239 && coding->type != coding_type_ccl)
6240 {
6241 saved_coding_symbol = coding->symbol;
6242 detect_eol (coding, SDATA (str), to_byte);
6243 if (coding->eol_type == CODING_EOL_UNDECIDED)
6244 coding->eol_type = CODING_EOL_LF;
6245 /* We had better recover the original eol format if we
6246 encounter an inconsistent eol format while decoding. */
6247 coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
6248 }
6249 }
6250
6251 if (coding->type == coding_type_no_conversion
6252 || coding->type == coding_type_raw_text)
6253 coding->dst_multibyte = 0;
6254
6255 require_decoding = CODING_REQUIRE_DECODING (coding);
6256
6257 if (STRING_MULTIBYTE (str))
6258 {
6259 /* Decoding routines expect the source text to be unibyte. */
6260 str = Fstring_as_unibyte (str);
6261 to_byte = SBYTES (str);
6262 nocopy = 1;
6263 coding->src_multibyte = 0;
6264 }
6265
6266 /* Try to skip the heading and tailing ASCIIs. */
6267 if (require_decoding && coding->type != coding_type_ccl)
6268 {
6269 SHRINK_CONVERSION_REGION (&from, &to_byte, coding, SDATA (str),
6270 0);
6271 if (from == to_byte)
6272 require_decoding = 0;
6273 shrinked_bytes = from + (SBYTES (str) - to_byte);
6274 }
6275
6276 if (!require_decoding
6277 && !(SYMBOLP (coding->post_read_conversion)
6278 && !NILP (Ffboundp (coding->post_read_conversion))))
6279 {
6280 coding->consumed = SBYTES (str);
6281 coding->consumed_char = SCHARS (str);
6282 if (coding->dst_multibyte)
6283 {
6284 str = Fstring_as_multibyte (str);
6285 nocopy = 1;
6286 }
6287 coding->produced = SBYTES (str);
6288 coding->produced_char = SCHARS (str);
6289 return (nocopy ? str : Fcopy_sequence (str));
6290 }
6291
6292 if (coding->composing != COMPOSITION_DISABLED)
6293 coding_allocate_composition_data (coding, from);
6294 len = decoding_buffer_size (coding, to_byte - from);
6295 allocate_conversion_buffer (buf, len);
6296
6297 consumed = consumed_char = produced = produced_char = 0;
6298 while (1)
6299 {
6300 result = decode_coding (coding, SDATA (str) + from + consumed,
6301 buf.data + produced, to_byte - from - consumed,
6302 buf.size - produced);
6303 consumed += coding->consumed;
6304 consumed_char += coding->consumed_char;
6305 produced += coding->produced;
6306 produced_char += coding->produced_char;
6307 if (result == CODING_FINISH_NORMAL
6308 || result == CODING_FINISH_INTERRUPT
6309 || (result == CODING_FINISH_INSUFFICIENT_SRC
6310 && coding->consumed == 0))
6311 break;
6312 if (result == CODING_FINISH_INSUFFICIENT_CMP)
6313 coding_allocate_composition_data (coding, from + produced_char);
6314 else if (result == CODING_FINISH_INSUFFICIENT_DST)
6315 extend_conversion_buffer (&buf);
6316 else if (result == CODING_FINISH_INCONSISTENT_EOL)
6317 {
6318 Lisp_Object eol_type;
6319
6320 /* Recover the original EOL format. */
6321 if (coding->eol_type == CODING_EOL_CR)
6322 {
6323 unsigned char *p;
6324 for (p = buf.data; p < buf.data + produced; p++)
6325 if (*p == '\n') *p = '\r';
6326 }
6327 else if (coding->eol_type == CODING_EOL_CRLF)
6328 {
6329 int num_eol = 0;
6330 unsigned char *p0, *p1;
6331 for (p0 = buf.data, p1 = p0 + produced; p0 < p1; p0++)
6332 if (*p0 == '\n') num_eol++;
6333 if (produced + num_eol >= buf.size)
6334 extend_conversion_buffer (&buf);
6335 for (p0 = buf.data + produced, p1 = p0 + num_eol; p0 > buf.data;)
6336 {
6337 *--p1 = *--p0;
6338 if (*p0 == '\n') *--p1 = '\r';
6339 }
6340 produced += num_eol;
6341 produced_char += num_eol;
6342 }
6343 /* Suppress eol-format conversion in the further conversion. */
6344 coding->eol_type = CODING_EOL_LF;
6345
6346 /* Set the coding system symbol to that for Unix-like EOL. */
6347 eol_type = Fget (saved_coding_symbol, Qeol_type);
6348 if (VECTORP (eol_type)
6349 && XVECTOR (eol_type)->size == 3
6350 && SYMBOLP (XVECTOR (eol_type)->contents[CODING_EOL_LF]))
6351 coding->symbol = XVECTOR (eol_type)->contents[CODING_EOL_LF];
6352 else
6353 coding->symbol = saved_coding_symbol;
6354
6355
6356 }
6357 }
6358
6359 coding->consumed = consumed;
6360 coding->consumed_char = consumed_char;
6361 coding->produced = produced;
6362 coding->produced_char = produced_char;
6363
6364 if (coding->dst_multibyte)
6365 newstr = make_uninit_multibyte_string (produced_char + shrinked_bytes,
6366 produced + shrinked_bytes);
6367 else
6368 newstr = make_uninit_string (produced + shrinked_bytes);
6369 if (from > 0)
6370 STRING_COPYIN (newstr, 0, SDATA (str), from);
6371 STRING_COPYIN (newstr, from, buf.data, produced);
6372 if (shrinked_bytes > from)
6373 STRING_COPYIN (newstr, from + produced,
6374 SDATA (str) + to_byte,
6375 shrinked_bytes - from);
6376 free_conversion_buffer (&buf);
6377
6378 coding->consumed += shrinked_bytes;
6379 coding->consumed_char += shrinked_bytes;
6380 coding->produced += shrinked_bytes;
6381 coding->produced_char += shrinked_bytes;
6382
6383 if (coding->cmp_data && coding->cmp_data->used)
6384 coding_restore_composition (coding, newstr);
6385 coding_free_composition_data (coding);
6386
6387 if (SYMBOLP (coding->post_read_conversion)
6388 && !NILP (Ffboundp (coding->post_read_conversion)))
6389 newstr = run_pre_post_conversion_on_str (newstr, coding, 0);
6390
6391 return newstr;
6392}
6393
6394Lisp_Object
6395encode_coding_string (str, coding, nocopy)
6396 Lisp_Object str;
6397 struct coding_system *coding;
6398 int nocopy;
6399{
6400 int len;
6401 struct conversion_buffer buf;
6402 int from, to, to_byte;
6403 int result;
6404 int shrinked_bytes = 0;
6405 Lisp_Object newstr;
6406 int consumed, consumed_char, produced, produced_char;
6407
6408 if (SYMBOLP (coding->pre_write_conversion)
6409 && !NILP (Ffboundp (coding->pre_write_conversion)))
6410 {
6411 str = run_pre_post_conversion_on_str (str, coding, 1);
6412 /* As STR is just newly generated, we don't have to copy it
6413 anymore. */
6414 nocopy = 1;
6415 }
6416
6417 from = 0;
6418 to = SCHARS (str);
6419 to_byte = SBYTES (str);
6420
6421 /* Encoding routines determine the multibyteness of the source text
6422 by coding->src_multibyte. */
6423 coding->src_multibyte = SCHARS (str) < SBYTES (str);
6424 coding->dst_multibyte = 0;
6425 if (! CODING_REQUIRE_ENCODING (coding))
6426 goto no_need_of_encoding;
6427
6428 if (coding->composing != COMPOSITION_DISABLED)
6429 coding_save_composition (coding, from, to, str);
6430
6431 /* Try to skip the heading and tailing ASCIIs. We can't skip them
6432 if we must run CCL program or there are compositions to
6433 encode. */
6434 if (coding->type != coding_type_ccl
6435 && (! coding->cmp_data || coding->cmp_data->used == 0))
6436 {
6437 SHRINK_CONVERSION_REGION (&from, &to_byte, coding, SDATA (str),
6438 1);
6439 if (from == to_byte)
6440 {
6441 coding_free_composition_data (coding);
6442 goto no_need_of_encoding;
6443 }
6444 shrinked_bytes = from + (SBYTES (str) - to_byte);
6445 }
6446
6447 len = encoding_buffer_size (coding, to_byte - from);
6448 allocate_conversion_buffer (buf, len);
6449
6450 consumed = consumed_char = produced = produced_char = 0;
6451 while (1)
6452 {
6453 result = encode_coding (coding, SDATA (str) + from + consumed,
6454 buf.data + produced, to_byte - from - consumed,
6455 buf.size - produced);
6456 consumed += coding->consumed;
6457 consumed_char += coding->consumed_char;
6458 produced += coding->produced;
6459 produced_char += coding->produced_char;
6460 if (result == CODING_FINISH_NORMAL
6461 || result == CODING_FINISH_INTERRUPT
6462 || (result == CODING_FINISH_INSUFFICIENT_SRC
6463 && coding->consumed == 0))
6464 break;
6465 /* Now result should be CODING_FINISH_INSUFFICIENT_DST. */
6466 extend_conversion_buffer (&buf);
6467 }
6468
6469 coding->consumed = consumed;
6470 coding->consumed_char = consumed_char;
6471 coding->produced = produced;
6472 coding->produced_char = produced_char;
6473
6474 newstr = make_uninit_string (produced + shrinked_bytes);
6475 if (from > 0)
6476 STRING_COPYIN (newstr, 0, SDATA (str), from);
6477 STRING_COPYIN (newstr, from, buf.data, produced);
6478 if (shrinked_bytes > from)
6479 STRING_COPYIN (newstr, from + produced,
6480 SDATA (str) + to_byte,
6481 shrinked_bytes - from);
6482
6483 free_conversion_buffer (&buf);
6484 coding_free_composition_data (coding);
6485
6486 return newstr;
6487
6488 no_need_of_encoding:
6489 coding->consumed = SBYTES (str);
6490 coding->consumed_char = SCHARS (str);
6491 if (STRING_MULTIBYTE (str))
6492 {
6493 if (nocopy)
6494 /* We are sure that STR doesn't contain a multibyte
6495 character. */
6496 STRING_SET_UNIBYTE (str);
6497 else
6498 {
6499 str = Fstring_as_unibyte (str);
6500 nocopy = 1;
6501 }
6502 }
6503 coding->produced = SBYTES (str);
6504 coding->produced_char = SCHARS (str);
6505 return (nocopy ? str : Fcopy_sequence (str));
6506}
6507
6508\f
6509#ifdef emacs
6510/*** 8. Emacs Lisp library functions ***/
6511
6512DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
6513 doc: /* Return t if OBJECT is nil or a coding-system.
6514See the documentation of `make-coding-system' for information
6515about coding-system objects. */)
6516 (obj)
6517 Lisp_Object obj;
6518{
6519 if (NILP (obj))
6520 return Qt;
6521 if (!SYMBOLP (obj))
6522 return Qnil;
6523 if (! NILP (Fget (obj, Qcoding_system_define_form)))
6524 return Qt;
6525 /* Get coding-spec vector for OBJ. */
6526 obj = Fget (obj, Qcoding_system);
6527 return ((VECTORP (obj) && XVECTOR (obj)->size == 5)
6528 ? Qt : Qnil);
6529}
6530
6531DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
6532 Sread_non_nil_coding_system, 1, 1, 0,
6533 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT. */)
6534 (prompt)
6535 Lisp_Object prompt;
6536{
6537 Lisp_Object val;
6538 do
6539 {
6540 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6541 Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
6542 }
6543 while (SCHARS (val) == 0);
6544 return (Fintern (val, Qnil));
6545}
6546
6547DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
6548 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
6549If the user enters null input, return second argument DEFAULT-CODING-SYSTEM. */)
6550 (prompt, default_coding_system)
6551 Lisp_Object prompt, default_coding_system;
6552{
6553 Lisp_Object val;
6554 if (SYMBOLP (default_coding_system))
6555 default_coding_system = SYMBOL_NAME (default_coding_system);
6556 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6557 Qt, Qnil, Qcoding_system_history,
6558 default_coding_system, Qnil);
6559 return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
6560}
6561
6562DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
6563 1, 1, 0,
6564 doc: /* Check validity of CODING-SYSTEM.
6565If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
6566It is valid if it is nil or a symbol with a non-nil `coding-system' property.
6567The value of this property should be a vector of length 5. */)
6568 (coding_system)
6569 Lisp_Object coding_system;
6570{
6571 Lisp_Object define_form;
6572
6573 define_form = Fget (coding_system, Qcoding_system_define_form);
6574 if (! NILP (define_form))
6575 {
6576 Fput (coding_system, Qcoding_system_define_form, Qnil);
6577 safe_eval (define_form);
6578 }
6579 if (!NILP (Fcoding_system_p (coding_system)))
6580 return coding_system;
6581 while (1)
6582 Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
6583}
6584\f
6585Lisp_Object
6586detect_coding_system (src, src_bytes, highest, multibytep)
6587 const unsigned char *src;
6588 int src_bytes, highest;
6589 int multibytep;
6590{
6591 int coding_mask, eol_type;
6592 Lisp_Object val, tmp;
6593 int dummy;
6594
6595 coding_mask = detect_coding_mask (src, src_bytes, NULL, &dummy, multibytep);
6596 eol_type = detect_eol_type (src, src_bytes, &dummy);
6597 if (eol_type == CODING_EOL_INCONSISTENT)
6598 eol_type = CODING_EOL_UNDECIDED;
6599
6600 if (!coding_mask)
6601 {
6602 val = Qundecided;
6603 if (eol_type != CODING_EOL_UNDECIDED)
6604 {
6605 Lisp_Object val2;
6606 val2 = Fget (Qundecided, Qeol_type);
6607 if (VECTORP (val2))
6608 val = XVECTOR (val2)->contents[eol_type];
6609 }
6610 return (highest ? val : Fcons (val, Qnil));
6611 }
6612
6613 /* At first, gather possible coding systems in VAL. */
6614 val = Qnil;
6615 for (tmp = Vcoding_category_list; CONSP (tmp); tmp = XCDR (tmp))
6616 {
6617 Lisp_Object category_val, category_index;
6618
6619 category_index = Fget (XCAR (tmp), Qcoding_category_index);
6620 category_val = Fsymbol_value (XCAR (tmp));
6621 if (!NILP (category_val)
6622 && NATNUMP (category_index)
6623 && (coding_mask & (1 << XFASTINT (category_index))))
6624 {
6625 val = Fcons (category_val, val);
6626 if (highest)
6627 break;
6628 }
6629 }
6630 if (!highest)
6631 val = Fnreverse (val);
6632
6633 /* Then, replace the elements with subsidiary coding systems. */
6634 for (tmp = val; CONSP (tmp); tmp = XCDR (tmp))
6635 {
6636 if (eol_type != CODING_EOL_UNDECIDED
6637 && eol_type != CODING_EOL_INCONSISTENT)
6638 {
6639 Lisp_Object eol;
6640 eol = Fget (XCAR (tmp), Qeol_type);
6641 if (VECTORP (eol))
6642 XSETCAR (tmp, XVECTOR (eol)->contents[eol_type]);
6643 }
6644 }
6645 return (highest ? XCAR (val) : val);
6646}
6647
6648DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
6649 2, 3, 0,
6650 doc: /* Detect how the byte sequence in the region is encoded.
6651Return a list of possible coding systems used on decoding a byte
6652sequence containing the bytes in the region between START and END when
6653the coding system `undecided' is specified. The list is ordered by
6654priority decided in the current language environment.
6655
6656If only ASCII characters are found, it returns a list of single element
6657`undecided' or its subsidiary coding system according to a detected
6658end-of-line format.
6659
6660If optional argument HIGHEST is non-nil, return the coding system of
6661highest priority. */)
6662 (start, end, highest)
6663 Lisp_Object start, end, highest;
6664{
6665 int from, to;
6666 int from_byte, to_byte;
6667 int include_anchor_byte = 0;
6668
6669 CHECK_NUMBER_COERCE_MARKER (start);
6670 CHECK_NUMBER_COERCE_MARKER (end);
6671
6672 validate_region (&start, &end);
6673 from = XINT (start), to = XINT (end);
6674 from_byte = CHAR_TO_BYTE (from);
6675 to_byte = CHAR_TO_BYTE (to);
6676
6677 if (from < GPT && to >= GPT)
6678 move_gap_both (to, to_byte);
6679 /* If we an anchor byte `\0' follows the region, we include it in
6680 the detecting source. Then code detectors can handle the tailing
6681 byte sequence more accurately.
6682
6683 Fix me: This is not a perfect solution. It is better that we
6684 add one more argument, say LAST_BLOCK, to all detect_coding_XXX.
6685 */
6686 if (to == Z || (to == GPT && GAP_SIZE > 0))
6687 include_anchor_byte = 1;
6688 return detect_coding_system (BYTE_POS_ADDR (from_byte),
6689 to_byte - from_byte + include_anchor_byte,
6690 !NILP (highest),
6691 !NILP (current_buffer
6692 ->enable_multibyte_characters));
6693}
6694
6695DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
6696 1, 2, 0,
6697 doc: /* Detect how the byte sequence in STRING is encoded.
6698Return a list of possible coding systems used on decoding a byte
6699sequence containing the bytes in STRING when the coding system
6700`undecided' is specified. The list is ordered by priority decided in
6701the current language environment.
6702
6703If only ASCII characters are found, it returns a list of single element
6704`undecided' or its subsidiary coding system according to a detected
6705end-of-line format.
6706
6707If optional argument HIGHEST is non-nil, return the coding system of
6708highest priority. */)
6709 (string, highest)
6710 Lisp_Object string, highest;
6711{
6712 CHECK_STRING (string);
6713
6714 return detect_coding_system (SDATA (string),
6715 /* "+ 1" is to include the anchor byte
6716 `\0'. With this, code detectors can
6717 handle the tailing bytes more
6718 accurately. */
6719 SBYTES (string) + 1,
6720 !NILP (highest),
6721 STRING_MULTIBYTE (string));
6722}
6723
6724/* Subroutine for Ffind_coding_systems_region_internal.
6725
6726 Return a list of coding systems that safely encode the multibyte
6727 text between P and PEND. SAFE_CODINGS, if non-nil, is an alist of
6728 possible coding systems. If it is nil, it means that we have not
6729 yet found any coding systems.
6730
6731 WORK_TABLE a char-table of which element is set to t once the
6732 element is looked up.
6733
6734 If a non-ASCII single byte char is found, set
6735 *single_byte_char_found to 1. */
6736
6737static Lisp_Object
6738find_safe_codings (p, pend, safe_codings, work_table, single_byte_char_found)
6739 unsigned char *p, *pend;
6740 Lisp_Object safe_codings, work_table;
6741 int *single_byte_char_found;
6742{
6743 int c, len;
6744 Lisp_Object val, ch;
6745 Lisp_Object prev, tail;
6746
6747 if (NILP (safe_codings))
6748 goto done_safe_codings;
6749 while (p < pend)
6750 {
6751 c = STRING_CHAR_AND_LENGTH (p, pend - p, len);
6752 p += len;
6753 if (ASCII_BYTE_P (c))
6754 /* We can ignore ASCII characters here. */
6755 continue;
6756 if (SINGLE_BYTE_CHAR_P (c))
6757 *single_byte_char_found = 1;
6758 /* Check the safe coding systems for C. */
6759 ch = make_number (c);
6760 val = Faref (work_table, ch);
6761 if (EQ (val, Qt))
6762 /* This element was already checked. Ignore it. */
6763 continue;
6764 /* Remember that we checked this element. */
6765 Faset (work_table, ch, Qt);
6766
6767 for (prev = tail = safe_codings; CONSP (tail); tail = XCDR (tail))
6768 {
6769 Lisp_Object elt, translation_table, hash_table, accept_latin_extra;
6770 int encodable;
6771
6772 elt = XCAR (tail);
6773 if (CONSP (XCDR (elt)))
6774 {
6775 /* This entry has this format now:
6776 ( CODING SAFE-CHARS TRANSLATION-TABLE HASH-TABLE
6777 ACCEPT-LATIN-EXTRA ) */
6778 val = XCDR (elt);
6779 encodable = ! NILP (Faref (XCAR (val), ch));
6780 if (! encodable)
6781 {
6782 val = XCDR (val);
6783 translation_table = XCAR (val);
6784 hash_table = XCAR (XCDR (val));
6785 accept_latin_extra = XCAR (XCDR (XCDR (val)));
6786 }
6787 }
6788 else
6789 {
6790 /* This entry has this format now: ( CODING . SAFE-CHARS) */
6791 encodable = ! NILP (Faref (XCDR (elt), ch));
6792 if (! encodable)
6793 {
6794 /* Transform the format to:
6795 ( CODING SAFE-CHARS TRANSLATION-TABLE HASH-TABLE
6796 ACCEPT-LATIN-EXTRA ) */
6797 val = Fget (XCAR (elt), Qcoding_system);
6798 translation_table
6799 = Fplist_get (AREF (val, 3),
6800 Qtranslation_table_for_encode);
6801 if (SYMBOLP (translation_table))
6802 translation_table = Fget (translation_table,
6803 Qtranslation_table);
6804 hash_table
6805 = (CHAR_TABLE_P (translation_table)
6806 ? XCHAR_TABLE (translation_table)->extras[1]
6807 : Qnil);
6808 accept_latin_extra
6809 = ((EQ (AREF (val, 0), make_number (2))
6810 && VECTORP (AREF (val, 4)))
6811 ? AREF (AREF (val, 4), 16)
6812 : Qnil);
6813 XSETCAR (tail, list5 (XCAR (elt), XCDR (elt),
6814 translation_table, hash_table,
6815 accept_latin_extra));
6816 }
6817 }
6818
6819 if (! encodable
6820 && ((CHAR_TABLE_P (translation_table)
6821 && ! NILP (Faref (translation_table, ch)))
6822 || (HASH_TABLE_P (hash_table)
6823 && ! NILP (Fgethash (ch, hash_table, Qnil)))
6824 || (SINGLE_BYTE_CHAR_P (c)
6825 && ! NILP (accept_latin_extra)
6826 && VECTORP (Vlatin_extra_code_table)
6827 && ! NILP (AREF (Vlatin_extra_code_table, c)))))
6828 encodable = 1;
6829 if (encodable)
6830 prev = tail;
6831 else
6832 {
6833 /* Exclude this coding system from SAFE_CODINGS. */
6834 if (EQ (tail, safe_codings))
6835 {
6836 safe_codings = XCDR (safe_codings);
6837 if (NILP (safe_codings))
6838 goto done_safe_codings;
6839 }
6840 else
6841 XSETCDR (prev, XCDR (tail));
6842 }
6843 }
6844 }
6845
6846 done_safe_codings:
6847 /* If the above loop was terminated before P reaches PEND, it means
6848 SAFE_CODINGS was set to nil. If we have not yet found an
6849 non-ASCII single-byte char, check it now. */
6850 if (! *single_byte_char_found)
6851 while (p < pend)
6852 {
6853 c = STRING_CHAR_AND_LENGTH (p, pend - p, len);
6854 p += len;
6855 if (! ASCII_BYTE_P (c)
6856 && SINGLE_BYTE_CHAR_P (c))
6857 {
6858 *single_byte_char_found = 1;
6859 break;
6860 }
6861 }
6862 return safe_codings;
6863}
6864
6865DEFUN ("find-coding-systems-region-internal",
6866 Ffind_coding_systems_region_internal,
6867 Sfind_coding_systems_region_internal, 2, 2, 0,
6868 doc: /* Internal use only. */)
6869 (start, end)
6870 Lisp_Object start, end;
6871{
6872 Lisp_Object work_table, safe_codings;
6873 int non_ascii_p = 0;
6874 int single_byte_char_found = 0;
6875 const unsigned char *p1, *p1end, *p2, *p2end, *p;
6876
6877 if (STRINGP (start))
6878 {
6879 if (!STRING_MULTIBYTE (start))
6880 return Qt;
6881 p1 = SDATA (start), p1end = p1 + SBYTES (start);
6882 p2 = p2end = p1end;
6883 if (SCHARS (start) != SBYTES (start))
6884 non_ascii_p = 1;
6885 }
6886 else
6887 {
6888 int from, to, stop;
6889
6890 CHECK_NUMBER_COERCE_MARKER (start);
6891 CHECK_NUMBER_COERCE_MARKER (end);
6892 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
6893 args_out_of_range (start, end);
6894 if (NILP (current_buffer->enable_multibyte_characters))
6895 return Qt;
6896 from = CHAR_TO_BYTE (XINT (start));
6897 to = CHAR_TO_BYTE (XINT (end));
6898 stop = from < GPT_BYTE && GPT_BYTE < to ? GPT_BYTE : to;
6899 p1 = BYTE_POS_ADDR (from), p1end = p1 + (stop - from);
6900 if (stop == to)
6901 p2 = p2end = p1end;
6902 else
6903 p2 = BYTE_POS_ADDR (stop), p2end = p2 + (to - stop);
6904 if (XINT (end) - XINT (start) != to - from)
6905 non_ascii_p = 1;
6906 }
6907
6908 if (!non_ascii_p)
6909 {
6910 /* We are sure that the text contains no multibyte character.
6911 Check if it contains eight-bit-graphic. */
6912 p = p1;
6913 for (p = p1; p < p1end && ASCII_BYTE_P (*p); p++);
6914 if (p == p1end)
6915 {
6916 for (p = p2; p < p2end && ASCII_BYTE_P (*p); p++);
6917 if (p == p2end)
6918 return Qt;
6919 }
6920 }
6921
6922 /* The text contains non-ASCII characters. */
6923
6924 work_table = Fmake_char_table (Qchar_coding_system, Qnil);
6925 safe_codings = Fcopy_sequence (XCDR (Vcoding_system_safe_chars));
6926
6927 safe_codings = find_safe_codings (p1, p1end, safe_codings, work_table,
6928 &single_byte_char_found);
6929 if (p2 < p2end)
6930 safe_codings = find_safe_codings (p2, p2end, safe_codings, work_table,
6931 &single_byte_char_found);
6932 if (EQ (safe_codings, XCDR (Vcoding_system_safe_chars)))
6933 safe_codings = Qt;
6934 else
6935 {
6936 /* Turn safe_codings to a list of coding systems... */
6937 Lisp_Object val;
6938
6939 if (single_byte_char_found)
6940 /* ... and append these for eight-bit chars. */
6941 val = Fcons (Qraw_text,
6942 Fcons (Qemacs_mule, Fcons (Qno_conversion, Qnil)));
6943 else
6944 /* ... and append generic coding systems. */
6945 val = Fcopy_sequence (XCAR (Vcoding_system_safe_chars));
6946
6947 for (; CONSP (safe_codings); safe_codings = XCDR (safe_codings))
6948 val = Fcons (XCAR (XCAR (safe_codings)), val);
6949 safe_codings = val;
6950 }
6951
6952 return safe_codings;
6953}
6954
6955
6956/* Search from position POS for such characters that are unencodable
6957 accoding to SAFE_CHARS, and return a list of their positions. P
6958 points where in the memory the character at POS exists. Limit the
6959 search at PEND or when Nth unencodable characters are found.
6960
6961 If SAFE_CHARS is a char table, an element for an unencodable
6962 character is nil.
6963
6964 If SAFE_CHARS is nil, all non-ASCII characters are unencodable.
6965
6966 Otherwise, SAFE_CHARS is t, and only eight-bit-contrl and
6967 eight-bit-graphic characters are unencodable. */
6968
6969static Lisp_Object
6970unencodable_char_position (safe_chars, pos, p, pend, n)
6971 Lisp_Object safe_chars;
6972 int pos;
6973 unsigned char *p, *pend;
6974 int n;
6975{
6976 Lisp_Object pos_list;
6977
6978 pos_list = Qnil;
6979 while (p < pend)
6980 {
6981 int len;
6982 int c = STRING_CHAR_AND_LENGTH (p, MAX_MULTIBYTE_LENGTH, len);
6983
6984 if (c >= 128
6985 && (CHAR_TABLE_P (safe_chars)
6986 ? NILP (CHAR_TABLE_REF (safe_chars, c))
6987 : (NILP (safe_chars) || c < 256)))
6988 {
6989 pos_list = Fcons (make_number (pos), pos_list);
6990 if (--n <= 0)
6991 break;
6992 }
6993 pos++;
6994 p += len;
6995 }
6996 return Fnreverse (pos_list);
6997}
6998
6999
7000DEFUN ("unencodable-char-position", Funencodable_char_position,
7001 Sunencodable_char_position, 3, 5, 0,
7002 doc: /*
7003Return position of first un-encodable character in a region.
7004START and END specfiy the region and CODING-SYSTEM specifies the
7005encoding to check. Return nil if CODING-SYSTEM does encode the region.
7006
7007If optional 4th argument COUNT is non-nil, it specifies at most how
7008many un-encodable characters to search. In this case, the value is a
7009list of positions.
7010
7011If optional 5th argument STRING is non-nil, it is a string to search
7012for un-encodable characters. In that case, START and END are indexes
7013to the string. */)
7014 (start, end, coding_system, count, string)
7015 Lisp_Object start, end, coding_system, count, string;
7016{
7017 int n;
7018 Lisp_Object safe_chars;
7019 struct coding_system coding;
7020 Lisp_Object positions;
7021 int from, to;
7022 unsigned char *p, *pend;
7023
7024 if (NILP (string))
7025 {
7026 validate_region (&start, &end);
7027 from = XINT (start);
7028 to = XINT (end);
7029 if (NILP (current_buffer->enable_multibyte_characters))
7030 return Qnil;
7031 p = CHAR_POS_ADDR (from);
7032 if (to == GPT)
7033 pend = GPT_ADDR;
7034 else
7035 pend = CHAR_POS_ADDR (to);
7036 }
7037 else
7038 {
7039 CHECK_STRING (string);
7040 CHECK_NATNUM (start);
7041 CHECK_NATNUM (end);
7042 from = XINT (start);
7043 to = XINT (end);
7044 if (from > to
7045 || to > SCHARS (string))
7046 args_out_of_range_3 (string, start, end);
7047 if (! STRING_MULTIBYTE (string))
7048 return Qnil;
7049 p = SDATA (string) + string_char_to_byte (string, from);
7050 pend = SDATA (string) + string_char_to_byte (string, to);
7051 }
7052
7053 setup_coding_system (Fcheck_coding_system (coding_system), &coding);
7054
7055 if (NILP (count))
7056 n = 1;
7057 else
7058 {
7059 CHECK_NATNUM (count);
7060 n = XINT (count);
7061 }
7062
7063 if (coding.type == coding_type_no_conversion
7064 || coding.type == coding_type_raw_text)
7065 return Qnil;
7066
7067 if (coding.type == coding_type_undecided)
7068 safe_chars = Qnil;
7069 else
7070 safe_chars = coding_safe_chars (coding_system);
7071
7072 if (STRINGP (string)
7073 || from >= GPT || to <= GPT)
7074 positions = unencodable_char_position (safe_chars, from, p, pend, n);
7075 else
7076 {
7077 Lisp_Object args[2];
7078
7079 args[0] = unencodable_char_position (safe_chars, from, p, GPT_ADDR, n);
7080 n -= XINT (Flength (args[0]));
7081 if (n <= 0)
7082 positions = args[0];
7083 else
7084 {
7085 args[1] = unencodable_char_position (safe_chars, GPT, GAP_END_ADDR,
7086 pend, n);
7087 positions = Fappend (2, args);
7088 }
7089 }
7090
7091 return (NILP (count) ? Fcar (positions) : positions);
7092}
7093
7094
7095Lisp_Object
7096code_convert_region1 (start, end, coding_system, encodep)
7097 Lisp_Object start, end, coding_system;
7098 int encodep;
7099{
7100 struct coding_system coding;
7101 int from, to;
7102
7103 CHECK_NUMBER_COERCE_MARKER (start);
7104 CHECK_NUMBER_COERCE_MARKER (end);
7105 CHECK_SYMBOL (coding_system);
7106
7107 validate_region (&start, &end);
7108 from = XFASTINT (start);
7109 to = XFASTINT (end);
7110
7111 if (NILP (coding_system))
7112 return make_number (to - from);
7113
7114 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
7115 error ("Invalid coding system: %s", SDATA (SYMBOL_NAME (coding_system)));
7116
7117 coding.mode |= CODING_MODE_LAST_BLOCK;
7118 coding.src_multibyte = coding.dst_multibyte
7119 = !NILP (current_buffer->enable_multibyte_characters);
7120 code_convert_region (from, CHAR_TO_BYTE (from), to, CHAR_TO_BYTE (to),
7121 &coding, encodep, 1);
7122 Vlast_coding_system_used = coding.symbol;
7123 return make_number (coding.produced_char);
7124}
7125
DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
       3, 3, "r\nzCoding system: ",
       doc: /* Decode the current region from the specified coding system.
When called from a program, takes three arguments:
START, END, and CODING-SYSTEM.  START and END are buffer positions.
This function sets `last-coding-system-used' to the precise coding system
used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
not fully specified.)
It returns the length of the decoded text.  */)
     (start, end, coding_system)
     Lisp_Object start, end, coding_system;
{
  /* Delegate to the shared region converter; a zero ENCODEP argument
     requests decoding.  */
  return code_convert_region1 (start, end, coding_system, 0);
}
7140
DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
       3, 3, "r\nzCoding system: ",
       doc: /* Encode the current region into the specified coding system.
When called from a program, takes three arguments:
START, END, and CODING-SYSTEM.  START and END are buffer positions.
This function sets `last-coding-system-used' to the precise coding system
used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
not fully specified.)
It returns the length of the encoded text.  */)
     (start, end, coding_system)
     Lisp_Object start, end, coding_system;
{
  /* Delegate to the shared region converter; a nonzero ENCODEP
     argument requests encoding.  */
  return code_convert_region1 (start, end, coding_system, 1);
}
7155
7156Lisp_Object
7157code_convert_string1 (string, coding_system, nocopy, encodep)
7158 Lisp_Object string, coding_system, nocopy;
7159 int encodep;
7160{
7161 struct coding_system coding;
7162
7163 CHECK_STRING (string);
7164 CHECK_SYMBOL (coding_system);
7165
7166 if (NILP (coding_system))
7167 return (NILP (nocopy) ? Fcopy_sequence (string) : string);
7168
7169 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
7170 error ("Invalid coding system: %s", SDATA (SYMBOL_NAME (coding_system)));
7171
7172 coding.mode |= CODING_MODE_LAST_BLOCK;
7173 string = (encodep
7174 ? encode_coding_string (string, &coding, !NILP (nocopy))
7175 : decode_coding_string (string, &coding, !NILP (nocopy)));
7176 Vlast_coding_system_used = coding.symbol;
7177
7178 return string;
7179}
7180
DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
       2, 3, 0,
       doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
Optional arg NOCOPY non-nil means it is OK to return STRING itself
if the decoding operation is trivial.
This function sets `last-coding-system-used' to the precise coding system
used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
not fully specified.)  */)
     (string, coding_system, nocopy)
     Lisp_Object string, coding_system, nocopy;
{
  /* Delegate to the shared string converter; a zero ENCODEP argument
     selects decode_coding_string there.  */
  return code_convert_string1 (string, coding_system, nocopy, 0);
}
7194
DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
       2, 3, 0,
       doc: /* Encode STRING to CODING-SYSTEM, and return the result.
Optional arg NOCOPY non-nil means it is OK to return STRING itself
if the encoding operation is trivial.
This function sets `last-coding-system-used' to the precise coding system
used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
not fully specified.)  */)
     (string, coding_system, nocopy)
     Lisp_Object string, coding_system, nocopy;
{
  /* Delegate to the shared string converter; a nonzero ENCODEP
     argument selects encode_coding_string there.  */
  return code_convert_string1 (string, coding_system, nocopy, 1);
}
7208
7209/* Encode or decode STRING according to CODING_SYSTEM.
7210 Do not set Vlast_coding_system_used.
7211
7212 This function is called only from macros DECODE_FILE and
7213 ENCODE_FILE, thus we ignore character composition. */
7214
7215Lisp_Object
7216code_convert_string_norecord (string, coding_system, encodep)
7217 Lisp_Object string, coding_system;
7218 int encodep;
7219{
7220 struct coding_system coding;
7221
7222 CHECK_STRING (string);
7223 CHECK_SYMBOL (coding_system);
7224
7225 if (NILP (coding_system))
7226 return string;
7227
7228 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
7229 error ("Invalid coding system: %s", SDATA (SYMBOL_NAME (coding_system)));
7230
7231 coding.composing = COMPOSITION_DISABLED;
7232 coding.mode |= CODING_MODE_LAST_BLOCK;
7233 return (encodep
7234 ? encode_coding_string (string, &coding, 1)
7235 : decode_coding_string (string, &coding, 1));
7236}
7237\f
7238DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
7239 doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
7240Return the corresponding character. */)
7241 (code)
7242 Lisp_Object code;
7243{
7244 unsigned char c1, c2, s1, s2;
7245 Lisp_Object val;
7246
7247 CHECK_NUMBER (code);
7248 s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
7249 if (s1 == 0)
7250 {
7251 if (s2 < 0x80)
7252 XSETFASTINT (val, s2);
7253 else if (s2 >= 0xA0 || s2 <= 0xDF)
7254 XSETFASTINT (val, MAKE_CHAR (charset_katakana_jisx0201, s2, 0));
7255 else
7256 error ("Invalid Shift JIS code: %x", XFASTINT (code));
7257 }
7258 else
7259 {
7260 if ((s1 < 0x80 || (s1 > 0x9F && s1 < 0xE0) || s1 > 0xEF)
7261 || (s2 < 0x40 || s2 == 0x7F || s2 > 0xFC))
7262 error ("Invalid Shift JIS code: %x", XFASTINT (code));
7263 DECODE_SJIS (s1, s2, c1, c2);
7264 XSETFASTINT (val, MAKE_CHAR (charset_jisx0208, c1, c2));
7265 }
7266 return val;
7267}
7268
7269DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
7270 doc: /* Encode a Japanese character CHAR to shift_jis encoding.
7271Return the corresponding code in SJIS. */)
7272 (ch)
7273 Lisp_Object ch;
7274{
7275 int charset, c1, c2, s1, s2;
7276 Lisp_Object val;
7277
7278 CHECK_NUMBER (ch);
7279 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
7280 if (charset == CHARSET_ASCII)
7281 {
7282 val = ch;
7283 }
7284 else if (charset == charset_jisx0208
7285 && c1 > 0x20 && c1 < 0x7F && c2 > 0x20 && c2 < 0x7F)
7286 {
7287 ENCODE_SJIS (c1, c2, s1, s2);
7288 XSETFASTINT (val, (s1 << 8) | s2);
7289 }
7290 else if (charset == charset_katakana_jisx0201
7291 && c1 > 0x20 && c2 < 0xE0)
7292 {
7293 XSETFASTINT (val, c1 | 0x80);
7294 }
7295 else
7296 error ("Can't encode to shift_jis: %d", XFASTINT (ch));
7297 return val;
7298}
7299
7300DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
7301 doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
7302Return the corresponding character. */)
7303 (code)
7304 Lisp_Object code;
7305{
7306 int charset;
7307 unsigned char b1, b2, c1, c2;
7308 Lisp_Object val;
7309
7310 CHECK_NUMBER (code);
7311 b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF;
7312 if (b1 == 0)
7313 {
7314 if (b2 >= 0x80)
7315 error ("Invalid BIG5 code: %x", XFASTINT (code));
7316 val = code;
7317 }
7318 else
7319 {
7320 if ((b1 < 0xA1 || b1 > 0xFE)
7321 || (b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE))
7322 error ("Invalid BIG5 code: %x", XFASTINT (code));
7323 DECODE_BIG5 (b1, b2, charset, c1, c2);
7324 XSETFASTINT (val, MAKE_CHAR (charset, c1, c2));
7325 }
7326 return val;
7327}
7328
7329DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
7330 doc: /* Encode the Big5 character CHAR to BIG5 coding system.
7331Return the corresponding character code in Big5. */)
7332 (ch)
7333 Lisp_Object ch;
7334{
7335 int charset, c1, c2, b1, b2;
7336 Lisp_Object val;
7337
7338 CHECK_NUMBER (ch);
7339 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
7340 if (charset == CHARSET_ASCII)
7341 {
7342 val = ch;
7343 }
7344 else if ((charset == charset_big5_1
7345 && (XFASTINT (ch) >= 0x250a1 && XFASTINT (ch) <= 0x271ec))
7346 || (charset == charset_big5_2
7347 && XFASTINT (ch) >= 0x290a1 && XFASTINT (ch) <= 0x2bdb2))
7348 {
7349 ENCODE_BIG5 (charset, c1, c2, b1, b2);
7350 XSETFASTINT (val, (b1 << 8) | b2);
7351 }
7352 else
7353 error ("Can't encode to Big5: %d", XFASTINT (ch));
7354 return val;
7355}
7356\f
7357DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
7358 Sset_terminal_coding_system_internal, 1, 1, 0,
7359 doc: /* Internal use only. */)
7360 (coding_system)
7361 Lisp_Object coding_system;
7362{
7363 CHECK_SYMBOL (coding_system);
7364 setup_coding_system (Fcheck_coding_system (coding_system), &terminal_coding);
7365 /* We had better not send unsafe characters to terminal. */
7366 terminal_coding.mode |= CODING_MODE_INHIBIT_UNENCODABLE_CHAR;
7367 /* Character composition should be disabled. */
7368 terminal_coding.composing = COMPOSITION_DISABLED;
7369 /* Error notification should be suppressed. */
7370 terminal_coding.suppress_error = 1;
7371 terminal_coding.src_multibyte = 1;
7372 terminal_coding.dst_multibyte = 0;
7373 return Qnil;
7374}
7375
7376DEFUN ("set-safe-terminal-coding-system-internal", Fset_safe_terminal_coding_system_internal,
7377 Sset_safe_terminal_coding_system_internal, 1, 1, 0,
7378 doc: /* Internal use only. */)
7379 (coding_system)
7380 Lisp_Object coding_system;
7381{
7382 CHECK_SYMBOL (coding_system);
7383 setup_coding_system (Fcheck_coding_system (coding_system),
7384 &safe_terminal_coding);
7385 /* Character composition should be disabled. */
7386 safe_terminal_coding.composing = COMPOSITION_DISABLED;
7387 /* Error notification should be suppressed. */
7388 safe_terminal_coding.suppress_error = 1;
7389 safe_terminal_coding.src_multibyte = 1;
7390 safe_terminal_coding.dst_multibyte = 0;
7391 return Qnil;
7392}
7393
DEFUN ("terminal-coding-system", Fterminal_coding_system,
       Sterminal_coding_system, 0, 0, 0,
       doc: /* Return coding system specified for terminal output.  */)
     ()
{
  /* The symbol recorded in the global `terminal_coding' structure.  */
  return terminal_coding.symbol;
}
7401
DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
       Sset_keyboard_coding_system_internal, 1, 1, 0,
       doc: /* Internal use only.  */)
     (coding_system)
     Lisp_Object coding_system;
{
  CHECK_SYMBOL (coding_system);
  /* Set up the global `keyboard_coding' structure for CODING_SYSTEM.  */
  setup_coding_system (Fcheck_coding_system (coding_system), &keyboard_coding);
  /* Character composition should be disabled.  */
  keyboard_coding.composing = COMPOSITION_DISABLED;
  return Qnil;
}
7414
DEFUN ("keyboard-coding-system", Fkeyboard_coding_system,
       Skeyboard_coding_system, 0, 0, 0,
       doc: /* Return coding system specified for decoding keyboard input.  */)
     ()
{
  /* The symbol recorded in the global `keyboard_coding' structure.  */
  return keyboard_coding.symbol;
}
7422
7423\f
7424DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
7425 Sfind_operation_coding_system, 1, MANY, 0,
7426 doc: /* Choose a coding system for an operation based on the target name.
7427The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
7428DECODING-SYSTEM is the coding system to use for decoding
7429\(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
7430for encoding (in case OPERATION does encoding).
7431
7432The first argument OPERATION specifies an I/O primitive:
7433 For file I/O, `insert-file-contents' or `write-region'.
7434 For process I/O, `call-process', `call-process-region', or `start-process'.
7435 For network I/O, `open-network-stream'.
7436
7437The remaining arguments should be the same arguments that were passed
7438to the primitive. Depending on which primitive, one of those arguments
7439is selected as the TARGET. For example, if OPERATION does file I/O,
7440whichever argument specifies the file name is TARGET.
7441
7442TARGET has a meaning which depends on OPERATION:
7443 For file I/O, TARGET is a file name.
7444 For process I/O, TARGET is a process name.
7445 For network I/O, TARGET is a service name or a port number
7446
7447This function looks up what specified for TARGET in,
7448`file-coding-system-alist', `process-coding-system-alist',
7449or `network-coding-system-alist' depending on OPERATION.
7450They may specify a coding system, a cons of coding systems,
7451or a function symbol to call.
7452In the last case, we call the function with one argument,
7453which is a list of all the arguments given to this function.
7454
7455usage: (find-operation-coding-system OPERATION ARGUMENTS ...) */)
7456 (nargs, args)
7457 int nargs;
7458 Lisp_Object *args;
7459{
7460 Lisp_Object operation, target_idx, target, val;
7461 register Lisp_Object chain;
7462
7463 if (nargs < 2)
7464 error ("Too few arguments");
7465 operation = args[0];
7466 if (!SYMBOLP (operation)
7467 || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
7468 error ("Invalid first argument");
7469 if (nargs < 1 + XINT (target_idx))
7470 error ("Too few arguments for operation: %s",
7471 SDATA (SYMBOL_NAME (operation)));
7472 /* For write-region, if the 6th argument (i.e. VISIT, the 5th
7473 argument to write-region) is string, it must be treated as a
7474 target file name. */
7475 if (EQ (operation, Qwrite_region)
7476 && nargs > 5
7477 && STRINGP (args[5]))
7478 target_idx = make_number (4);
7479 target = args[XINT (target_idx) + 1];
7480 if (!(STRINGP (target)
7481 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
7482 error ("Invalid argument %d", XINT (target_idx) + 1);
7483
7484 chain = ((EQ (operation, Qinsert_file_contents)
7485 || EQ (operation, Qwrite_region))
7486 ? Vfile_coding_system_alist
7487 : (EQ (operation, Qopen_network_stream)
7488 ? Vnetwork_coding_system_alist
7489 : Vprocess_coding_system_alist));
7490 if (NILP (chain))
7491 return Qnil;
7492
7493 for (; CONSP (chain); chain = XCDR (chain))
7494 {
7495 Lisp_Object elt;
7496 elt = XCAR (chain);
7497
7498 if (CONSP (elt)
7499 && ((STRINGP (target)
7500 && STRINGP (XCAR (elt))
7501 && fast_string_match (XCAR (elt), target) >= 0)
7502 || (INTEGERP (target) && EQ (target, XCAR (elt)))))
7503 {
7504 val = XCDR (elt);
7505 /* Here, if VAL is both a valid coding system and a valid
7506 function symbol, we return VAL as a coding system. */
7507 if (CONSP (val))
7508 return val;
7509 if (! SYMBOLP (val))
7510 return Qnil;
7511 if (! NILP (Fcoding_system_p (val)))
7512 return Fcons (val, val);
7513 if (! NILP (Ffboundp (val)))
7514 {
7515 val = call1 (val, Flist (nargs, args));
7516 if (CONSP (val))
7517 return val;
7518 if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
7519 return Fcons (val, val);
7520 }
7521 return Qnil;
7522 }
7523 }
7524 return Qnil;
7525}
7526
7527DEFUN ("update-coding-systems-internal", Fupdate_coding_systems_internal,
7528 Supdate_coding_systems_internal, 0, 0, 0,
7529 doc: /* Update internal database for ISO2022 and CCL based coding systems.
7530When values of any coding categories are changed, you must
7531call this function. */)
7532 ()
7533{
7534 int i;
7535
7536 for (i = CODING_CATEGORY_IDX_EMACS_MULE; i < CODING_CATEGORY_IDX_MAX; i++)
7537 {
7538 Lisp_Object val;
7539
7540 val = SYMBOL_VALUE (XVECTOR (Vcoding_category_table)->contents[i]);
7541 if (!NILP (val))
7542 {
7543 if (! coding_system_table[i])
7544 coding_system_table[i] = ((struct coding_system *)
7545 xmalloc (sizeof (struct coding_system)));
7546 setup_coding_system (val, coding_system_table[i]);
7547 }
7548 else if (coding_system_table[i])
7549 {
7550 xfree (coding_system_table[i]);
7551 coding_system_table[i] = NULL;
7552 }
7553 }
7554
7555 return Qnil;
7556}
7557
7558DEFUN ("set-coding-priority-internal", Fset_coding_priority_internal,
7559 Sset_coding_priority_internal, 0, 0, 0,
7560 doc: /* Update internal database for the current value of `coding-category-list'.
7561This function is internal use only. */)
7562 ()
7563{
7564 int i = 0, idx;
7565 Lisp_Object val;
7566
7567 val = Vcoding_category_list;
7568
7569 while (CONSP (val) && i < CODING_CATEGORY_IDX_MAX)
7570 {
7571 if (! SYMBOLP (XCAR (val)))
7572 break;
7573 idx = XFASTINT (Fget (XCAR (val), Qcoding_category_index));
7574 if (idx >= CODING_CATEGORY_IDX_MAX)
7575 break;
7576 coding_priorities[i++] = (1 << idx);
7577 val = XCDR (val);
7578 }
7579 /* If coding-category-list is valid and contains all coding
7580 categories, `i' should be CODING_CATEGORY_IDX_MAX now. If not,
7581 the following code saves Emacs from crashing. */
7582 while (i < CODING_CATEGORY_IDX_MAX)
7583 coding_priorities[i++] = CODING_CATEGORY_MASK_RAW_TEXT;
7584
7585 return Qnil;
7586}
7587
7588DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
7589 Sdefine_coding_system_internal, 1, 1, 0,
7590 doc: /* Register CODING-SYSTEM as a base coding system.
7591This function is internal use only. */)
7592 (coding_system)
7593 Lisp_Object coding_system;
7594{
7595 Lisp_Object safe_chars, slot;
7596
7597 if (NILP (Fcheck_coding_system (coding_system)))
7598 Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
7599 safe_chars = coding_safe_chars (coding_system);
7600 if (! EQ (safe_chars, Qt) && ! CHAR_TABLE_P (safe_chars))
7601 error ("No valid safe-chars property for %s",
7602 SDATA (SYMBOL_NAME (coding_system)));
7603 if (EQ (safe_chars, Qt))
7604 {
7605 if (NILP (Fmemq (coding_system, XCAR (Vcoding_system_safe_chars))))
7606 XSETCAR (Vcoding_system_safe_chars,
7607 Fcons (coding_system, XCAR (Vcoding_system_safe_chars)));
7608 }
7609 else
7610 {
7611 slot = Fassq (coding_system, XCDR (Vcoding_system_safe_chars));
7612 if (NILP (slot))
7613 XSETCDR (Vcoding_system_safe_chars,
7614 nconc2 (XCDR (Vcoding_system_safe_chars),
7615 Fcons (Fcons (coding_system, safe_chars), Qnil)));
7616 else
7617 XSETCDR (slot, safe_chars);
7618 }
7619 return Qnil;
7620}
7621
7622#endif /* emacs */
7623
7624\f
7625/*** 9. Post-amble ***/
7626
/* Initialize the static tables and global coding structures of this
   file: the byte classification tables emacs_code_class and
   iso_code_class, the keyboard/terminal/default-file coding
   structures (set up with a nil coding system), coding_system_table,
   ascii_skip_code, system_eol_type and inhibit_pre_post_conversion.

   The order of the statements matters: the individual stores that
   follow each group of loops override what the loops stored.  */

void
init_coding_once ()
{
  int i;

  /* Emacs' internal format specific initialize routine.  */
  for (i = 0; i <= 0x20; i++)
    emacs_code_class[i] = EMACS_control_code;
  emacs_code_class[0x0A] = EMACS_linefeed_code;
  emacs_code_class[0x0D] = EMACS_carriage_return_code;
  for (i = 0x21 ; i < 0x7F; i++)
    emacs_code_class[i] = EMACS_ascii_code;
  emacs_code_class[0x7F] = EMACS_control_code;
  /* Note that this loop stops before 0xFF, so no value is stored in
     emacs_code_class[0xFF] by it.  */
  for (i = 0x80; i < 0xFF; i++)
    emacs_code_class[i] = EMACS_invalid_code;
  /* Override the classes of the private leading codes.  */
  emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
  emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
  emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
  emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;

  /* ISO2022 specific initialize routine.  */
  for (i = 0; i < 0x20; i++)
    iso_code_class[i] = ISO_control_0;
  for (i = 0x21; i < 0x7F; i++)
    iso_code_class[i] = ISO_graphic_plane_0;
  for (i = 0x80; i < 0xA0; i++)
    iso_code_class[i] = ISO_control_1;
  for (i = 0xA1; i < 0xFF; i++)
    iso_code_class[i] = ISO_graphic_plane_1;
  /* The four bytes skipped by the loops above get their own classes.  */
  iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
  iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
  /* Bytes with a special role override the generic classes stored by
     the loops.  */
  iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
  iso_code_class[ISO_CODE_SO] = ISO_shift_out;
  iso_code_class[ISO_CODE_SI] = ISO_shift_in;
  iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
  iso_code_class[ISO_CODE_ESC] = ISO_escape;
  iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
  iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
  iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;

  /* Set up the global coding structures with a nil coding system.  */
  setup_coding_system (Qnil, &keyboard_coding);
  setup_coding_system (Qnil, &terminal_coding);
  setup_coding_system (Qnil, &safe_terminal_coding);
  setup_coding_system (Qnil, &default_buffer_file_coding);

  bzero (coding_system_table, sizeof coding_system_table);

  /* Only the first 128 entries (the ASCII bytes) are set to 1.  */
  bzero (ascii_skip_code, sizeof ascii_skip_code);
  for (i = 0; i < 128; i++)
    ascii_skip_code[i] = 1;

  /* CRLF is the end-of-line type on MSDOS and Windows NT, LF
     elsewhere.  */
#if defined (MSDOS) || defined (WINDOWSNT)
  system_eol_type = CODING_EOL_CRLF;
#else
  system_eol_type = CODING_EOL_LF;
#endif

  inhibit_pre_post_conversion = 0;
}
7686
7687#ifdef emacs
7688
7689void
7690syms_of_coding ()
7691{
7692 staticpro (&Vcode_conversion_workbuf_name);
7693 Vcode_conversion_workbuf_name = build_string (" *code-conversion-work*");
7694
7695 Qtarget_idx = intern ("target-idx");
7696 staticpro (&Qtarget_idx);
7697
7698 Qcoding_system_history = intern ("coding-system-history");
7699 staticpro (&Qcoding_system_history);
7700 Fset (Qcoding_system_history, Qnil);
7701
7702 /* Target FILENAME is the first argument. */
7703 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
7704 /* Target FILENAME is the third argument. */
7705 Fput (Qwrite_region, Qtarget_idx, make_number (2));
7706
7707 Qcall_process = intern ("call-process");
7708 staticpro (&Qcall_process);
7709 /* Target PROGRAM is the first argument. */
7710 Fput (Qcall_process, Qtarget_idx, make_number (0));
7711
7712 Qcall_process_region = intern ("call-process-region");
7713 staticpro (&Qcall_process_region);
7714 /* Target PROGRAM is the third argument. */
7715 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
7716
7717 Qstart_process = intern ("start-process");
7718 staticpro (&Qstart_process);
7719 /* Target PROGRAM is the third argument. */
7720 Fput (Qstart_process, Qtarget_idx, make_number (2));
7721
7722 Qopen_network_stream = intern ("open-network-stream");
7723 staticpro (&Qopen_network_stream);
7724 /* Target SERVICE is the fourth argument. */
7725 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
7726
7727 Qcoding_system = intern ("coding-system");
7728 staticpro (&Qcoding_system);
7729
7730 Qeol_type = intern ("eol-type");
7731 staticpro (&Qeol_type);
7732
7733 Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
7734 staticpro (&Qbuffer_file_coding_system);
7735
7736 Qpost_read_conversion = intern ("post-read-conversion");
7737 staticpro (&Qpost_read_conversion);
7738
7739 Qpre_write_conversion = intern ("pre-write-conversion");
7740 staticpro (&Qpre_write_conversion);
7741
7742 Qno_conversion = intern ("no-conversion");
7743 staticpro (&Qno_conversion);
7744
7745 Qundecided = intern ("undecided");
7746 staticpro (&Qundecided);
7747
7748 Qcoding_system_p = intern ("coding-system-p");
7749 staticpro (&Qcoding_system_p);
7750
7751 Qcoding_system_error = intern ("coding-system-error");
7752 staticpro (&Qcoding_system_error);
7753
7754 Fput (Qcoding_system_error, Qerror_conditions,
7755 Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
7756 Fput (Qcoding_system_error, Qerror_message,
7757 build_string ("Invalid coding system"));
7758
7759 Qcoding_category = intern ("coding-category");
7760 staticpro (&Qcoding_category);
7761 Qcoding_category_index = intern ("coding-category-index");
7762 staticpro (&Qcoding_category_index);
7763
7764 Vcoding_category_table
7765 = Fmake_vector (make_number (CODING_CATEGORY_IDX_MAX), Qnil);
7766 staticpro (&Vcoding_category_table);
7767 {
7768 int i;
7769 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
7770 {
7771 XVECTOR (Vcoding_category_table)->contents[i]
7772 = intern (coding_category_name[i]);
7773 Fput (XVECTOR (Vcoding_category_table)->contents[i],
7774 Qcoding_category_index, make_number (i));
7775 }
7776 }
7777
7778 Vcoding_system_safe_chars = Fcons (Qnil, Qnil);
7779 staticpro (&Vcoding_system_safe_chars);
7780
7781 Qtranslation_table = intern ("translation-table");
7782 staticpro (&Qtranslation_table);
7783 Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
7784
7785 Qtranslation_table_id = intern ("translation-table-id");
7786 staticpro (&Qtranslation_table_id);
7787
7788 Qtranslation_table_for_decode = intern ("translation-table-for-decode");
7789 staticpro (&Qtranslation_table_for_decode);
7790
7791 Qtranslation_table_for_encode = intern ("translation-table-for-encode");
7792 staticpro (&Qtranslation_table_for_encode);
7793
7794 Qsafe_chars = intern ("safe-chars");
7795 staticpro (&Qsafe_chars);
7796
7797 Qchar_coding_system = intern ("char-coding-system");
7798 staticpro (&Qchar_coding_system);
7799
7800 /* Intern this now in case it isn't already done.
7801 Setting this variable twice is harmless.
7802 But don't staticpro it here--that is done in alloc.c. */
7803 Qchar_table_extra_slots = intern ("char-table-extra-slots");
7804 Fput (Qsafe_chars, Qchar_table_extra_slots, make_number (0));
7805 Fput (Qchar_coding_system, Qchar_table_extra_slots, make_number (0));
7806
7807 Qvalid_codes = intern ("valid-codes");
7808 staticpro (&Qvalid_codes);
7809
7810 Qemacs_mule = intern ("emacs-mule");
7811 staticpro (&Qemacs_mule);
7812
7813 Qraw_text = intern ("raw-text");
7814 staticpro (&Qraw_text);
7815
7816 Qutf_8 = intern ("utf-8");
7817 staticpro (&Qutf_8);
7818
7819 Qcoding_system_define_form = intern ("coding-system-define-form");
7820 staticpro (&Qcoding_system_define_form);
7821
7822 defsubr (&Scoding_system_p);
7823 defsubr (&Sread_coding_system);
7824 defsubr (&Sread_non_nil_coding_system);
7825 defsubr (&Scheck_coding_system);
7826 defsubr (&Sdetect_coding_region);
7827 defsubr (&Sdetect_coding_string);
7828 defsubr (&Sfind_coding_systems_region_internal);
7829 defsubr (&Sunencodable_char_position);
7830 defsubr (&Sdecode_coding_region);
7831 defsubr (&Sencode_coding_region);
7832 defsubr (&Sdecode_coding_string);
7833 defsubr (&Sencode_coding_string);
7834 defsubr (&Sdecode_sjis_char);
7835 defsubr (&Sencode_sjis_char);
7836 defsubr (&Sdecode_big5_char);
7837 defsubr (&Sencode_big5_char);
7838 defsubr (&Sset_terminal_coding_system_internal);
7839 defsubr (&Sset_safe_terminal_coding_system_internal);
7840 defsubr (&Sterminal_coding_system);
7841 defsubr (&Sset_keyboard_coding_system_internal);
7842 defsubr (&Skeyboard_coding_system);
7843 defsubr (&Sfind_operation_coding_system);
7844 defsubr (&Supdate_coding_systems_internal);
7845 defsubr (&Sset_coding_priority_internal);
7846 defsubr (&Sdefine_coding_system_internal);
7847
7848 DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
7849 doc: /* List of coding systems.
7850
7851Do not alter the value of this variable manually. This variable should be
7852updated by the functions `make-coding-system' and
7853`define-coding-system-alias'. */);
7854 Vcoding_system_list = Qnil;
7855
7856 DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
7857 doc: /* Alist of coding system names.
7858Each element is one element list of coding system name.
7859This variable is given to `completing-read' as TABLE argument.
7860
7861Do not alter the value of this variable manually. This variable should be
7862updated by the functions `make-coding-system' and
7863`define-coding-system-alias'. */);
7864 Vcoding_system_alist = Qnil;
7865
7866 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
7867 doc: /* List of coding-categories (symbols) ordered by priority.
7868
7869On detecting a coding system, Emacs tries code detection algorithms
7870associated with each coding-category one by one in this order. When
7871one algorithm agrees with a byte sequence of source text, the coding
7872system bound to the corresponding coding-category is selected.
7873
7874Don't modify this variable directly, but use `set-coding-priority'. */);
7875 {
7876 int i;
7877
7878 Vcoding_category_list = Qnil;
7879 for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
7880 Vcoding_category_list
7881 = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
7882 Vcoding_category_list);
7883 }
7884
7885 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
7886 doc: /* Specify the coding system for read operations.
7887It is useful to bind this variable with `let', but do not set it globally.
7888If the value is a coding system, it is used for decoding on read operation.
7889If not, an appropriate element is used from one of the coding system alists:
7890There are three such tables, `file-coding-system-alist',
7891`process-coding-system-alist', and `network-coding-system-alist'. */);
7892 Vcoding_system_for_read = Qnil;
7893
7894 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
7895 doc: /* Specify the coding system for write operations.
7896Programs bind this variable with `let', but you should not set it globally.
7897If the value is a coding system, it is used for encoding of output,
7898when writing it to a file and when sending it to a file or subprocess.
7899
7900If this does not specify a coding system, an appropriate element
7901is used from one of the coding system alists:
7902There are three such tables, `file-coding-system-alist',
7903`process-coding-system-alist', and `network-coding-system-alist'.
7904For output to files, if the above procedure does not specify a coding system,
7905the value of `buffer-file-coding-system' is used. */);
7906 Vcoding_system_for_write = Qnil;
7907
7908 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
7909 doc: /* Coding system used in the latest file or process I/O.
7910Also set by `encode-coding-region', `decode-coding-region',
7911`encode-coding-string' and `decode-coding-string'. */);
7912 Vlast_coding_system_used = Qnil;
7913
7914 DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
7915 doc: /* *Non-nil means always inhibit code conversion of end-of-line format.
7916See info node `Coding Systems' and info node `Text and Binary' concerning
7917such conversion. */);
7918 inhibit_eol_conversion = 0;
7919
7920 DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
7921 doc: /* Non-nil means process buffer inherits coding system of process output.
7922Bind it to t if the process output is to be treated as if it were a file
7923read from some filesystem. */);
7924 inherit_process_coding_system = 0;
7925
7926 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
7927 doc: /* Alist to decide a coding system to use for a file I/O operation.
7928The format is ((PATTERN . VAL) ...),
7929where PATTERN is a regular expression matching a file name,
7930VAL is a coding system, a cons of coding systems, or a function symbol.
7931If VAL is a coding system, it is used for both decoding and encoding
7932the file contents.
7933If VAL is a cons of coding systems, the car part is used for decoding,
7934and the cdr part is used for encoding.
7935If VAL is a function symbol, the function must return a coding system
7936or a cons of coding systems which are used as above. The function gets
7937the arguments with which `find-operation-coding-system' was called.
7938
7939See also the function `find-operation-coding-system'
7940and the variable `auto-coding-alist'. */);
7941 Vfile_coding_system_alist = Qnil;
7942
7943 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
7944 doc: /* Alist to decide a coding system to use for a process I/O operation.
7945The format is ((PATTERN . VAL) ...),
7946where PATTERN is a regular expression matching a program name,
7947VAL is a coding system, a cons of coding systems, or a function symbol.
7948If VAL is a coding system, it is used for both decoding what received
7949from the program and encoding what sent to the program.
7950If VAL is a cons of coding systems, the car part is used for decoding,
7951and the cdr part is used for encoding.
7952If VAL is a function symbol, the function must return a coding system
7953or a cons of coding systems which are used as above.
7954
7955See also the function `find-operation-coding-system'. */);
7956 Vprocess_coding_system_alist = Qnil;
7957
7958 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
7959 doc: /* Alist to decide a coding system to use for a network I/O operation.
7960The format is ((PATTERN . VAL) ...),
7961where PATTERN is a regular expression matching a network service name
7962or is a port number to connect to,
7963VAL is a coding system, a cons of coding systems, or a function symbol.
7964If VAL is a coding system, it is used for both decoding what received
7965from the network stream and encoding what sent to the network stream.
7966If VAL is a cons of coding systems, the car part is used for decoding,
7967and the cdr part is used for encoding.
7968If VAL is a function symbol, the function must return a coding system
7969or a cons of coding systems which are used as above.
7970
7971See also the function `find-operation-coding-system'. */);
7972 Vnetwork_coding_system_alist = Qnil;
7973
7974 DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
7975 doc: /* Coding system to use with system messages.
7976Also used for decoding keyboard input on X Window system. */);
7977 Vlocale_coding_system = Qnil;
7978
7979 /* The eol mnemonics are reset in startup.el system-dependently. */
7980 DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
7981 doc: /* *String displayed in mode line for UNIX-like (LF) end-of-line format. */);
7982 eol_mnemonic_unix = build_string (":");
7983
7984 DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
7985 doc: /* *String displayed in mode line for DOS-like (CRLF) end-of-line format. */);
7986 eol_mnemonic_dos = build_string ("\\");
7987
7988 DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
7989 doc: /* *String displayed in mode line for MAC-like (CR) end-of-line format. */);
7990 eol_mnemonic_mac = build_string ("/");
7991
7992 DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
7993 doc: /* *String displayed in mode line when end-of-line format is not yet determined. */);
7994 eol_mnemonic_undecided = build_string (":");
7995
7996 DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
7997 doc: /* *Non-nil enables character translation while encoding and decoding. */);
7998 Venable_character_translation = Qt;
7999
8000 DEFVAR_LISP ("standard-translation-table-for-decode",
8001 &Vstandard_translation_table_for_decode,
8002 doc: /* Table for translating characters while decoding. */);
8003 Vstandard_translation_table_for_decode = Qnil;
8004
8005 DEFVAR_LISP ("standard-translation-table-for-encode",
8006 &Vstandard_translation_table_for_encode,
8007 doc: /* Table for translating characters while encoding. */);
8008 Vstandard_translation_table_for_encode = Qnil;
8009
8010 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
8011 doc: /* Alist of charsets vs revision numbers.
8012While encoding, if a charset (car part of an element) is found,
8013designate it with the escape sequence identifying revision (cdr part of the element). */);
8014 Vcharset_revision_alist = Qnil;
8015
8016 DEFVAR_LISP ("default-process-coding-system",
8017 &Vdefault_process_coding_system,
8018 doc: /* Cons of coding systems used for process I/O by default.
8019The car part is used for decoding a process output,
8020the cdr part is used for encoding a text to be sent to a process. */);
8021 Vdefault_process_coding_system = Qnil;
8022
8023 DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
8024 doc: /* Table of extra Latin codes in the range 128..159 (inclusive).
8025This is a vector of length 256.
8026If Nth element is non-nil, the existence of code N in a file
8027\(or output of subprocess) doesn't prevent it to be detected as
8028a coding system of ISO 2022 variant which has a flag
8029`accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
8030or reading output of a subprocess.
8031Only 128th through 159th elements has a meaning. */);
8032 Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
8033
8034 DEFVAR_LISP ("select-safe-coding-system-function",
8035 &Vselect_safe_coding_system_function,
8036 doc: /* Function to call to select safe coding system for encoding a text.
8037
8038If set, this function is called to force a user to select a proper
8039coding system which can encode the text in the case that a default
8040coding system used in each operation can't encode the text.
8041
8042The default value is `select-safe-coding-system' (which see). */);
8043 Vselect_safe_coding_system_function = Qnil;
8044
8045 DEFVAR_BOOL ("coding-system-require-warning",
8046 &coding_system_require_warning,
8047 doc: /* Internal use only.
8048If non-nil, on writing a file, `select-safe-coding-system-function' is
8049called even if `coding-system-for-write' is non-nil. The command
8050`universal-coding-system-argument' binds this variable to t temporarily. */);
8051 coding_system_require_warning = 0;
8052
8053
8054 DEFVAR_BOOL ("inhibit-iso-escape-detection",
8055 &inhibit_iso_escape_detection,
8056 doc: /* If non-nil, Emacs ignores ISO2022's escape sequence on code detection.
8057
8058By default, on reading a file, Emacs tries to detect how the text is
8059encoded. This code detection is sensitive to escape sequences. If
8060the sequence is valid as ISO2022, the code is determined as one of
8061the ISO2022 encodings, and the file is decoded by the corresponding
8062coding system (e.g. `iso-2022-7bit').
8063
8064However, there may be a case that you want to read escape sequences in
8065a file as is. In such a case, you can set this variable to non-nil.
8066Then, as the code detection ignores any escape sequences, no file is
8067detected as encoded in some ISO2022 encoding. The result is that all
8068escape sequences become visible in a buffer.
8069
8070The default value is nil, and it is strongly recommended not to change
8071it. That is because many Emacs Lisp source files that contain
8072non-ASCII characters are encoded by the coding system `iso-2022-7bit'
8073in Emacs's distribution, and they won't be decoded correctly on
8074reading if you suppress escape sequence detection.
8075
8076The other way to read escape sequences in a file without decoding is
8077to explicitly specify some coding system that doesn't use ISO2022's
8078escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument]. */);
8079 inhibit_iso_escape_detection = 0;
8080
8081 DEFVAR_LISP ("translation-table-for-input", &Vtranslation_table_for_input,
8082 doc: /* Char table for translating self-inserting characters.
8083This is applied to the result of input methods, not their input. See also
8084`keyboard-translate-table'. */);
8085 Vtranslation_table_for_input = Qnil;
8086}
8087
8088char *
8089emacs_strerror (error_number)
8090 int error_number;
8091{
8092 char *str;
8093
8094 synchronize_system_messages_locale ();
8095 str = strerror (error_number);
8096
8097 if (! NILP (Vlocale_coding_system))
8098 {
8099 Lisp_Object dec = code_convert_string_norecord (build_string (str),
8100 Vlocale_coding_system,
8101 0);
8102 str = (char *) SDATA (dec);
8103 }
8104
8105 return str;
8106}
8107
8108#endif /* emacs */
8109
8110/* arch-tag: 3a3a2b01-5ff6-4071-9afe-f5b808d9229d
8111 (do not change this comment) */