Add support for large files, 64-bit Solaris, system locale codings.
[bpt/emacs.git] / src / coding.c
1 /* Coding system handler (conversion, detection, and etc).
2 Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
3 Licensed to the Free Software Foundation.
4
5 This file is part of GNU Emacs.
6
7 GNU Emacs is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 2, or (at your option)
10 any later version.
11
12 GNU Emacs is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GNU Emacs; see the file COPYING. If not, write to
19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20 Boston, MA 02111-1307, USA. */
21
22 /*** TABLE OF CONTENTS ***
23
24 1. Preamble
25 2. Emacs' internal format (emacs-mule) handlers
26 3. ISO2022 handlers
27 4. Shift-JIS and BIG5 handlers
28 5. CCL handlers
29 6. End-of-line handlers
30 7. C library functions
31 8. Emacs Lisp library functions
32 9. Post-amble
33
34 */
35
36 /*** GENERAL NOTE on CODING SYSTEM ***
37
38 Coding system is an encoding mechanism of one or more character
39 sets. Here's a list of coding systems which Emacs can handle. When
40 we say "decode", it means converting some other coding system to
41 Emacs' internal format (emacs-mule), and when we say "encode",
42 it means converting the coding system emacs-mule to some other
43 coding system.
44
45 0. Emacs' internal format (emacs-mule)
46
47 Emacs itself holds a multi-lingual character in a buffer and a string
48 in a special format. Details are described in section 2.
49
50 1. ISO2022
51
52 The most famous coding system for multiple character sets. X's
53 Compound Text, various EUCs (Extended Unix Code), and coding
54 systems used in Internet communication such as ISO-2022-JP are
55 all variants of ISO2022. Details are described in section 3.
56
57 2. SJIS (or Shift-JIS or MS-Kanji-Code)
58
59 A coding system to encode character sets: ASCII, JISX0201, and
60 JISX0208. Widely used for PC's in Japan. Details are described in
61 section 4.
62
63 3. BIG5
64
65 A coding system to encode character sets: ASCII and Big5. Widely
66 used by Chinese (mainly in Taiwan and Hong Kong). Details are
67 described in section 4. In this file, when we write "BIG5"
68 (all uppercase), we mean the coding system, and when we write
69 "Big5" (capitalized), we mean the character set.
70
71 4. Raw text
72
73 A coding system for a text containing random 8-bit code. Emacs does
74 no code conversion on such a text except for end-of-line format.
75
76 5. Other
77
78 If a user wants to read/write a text encoded in a coding system not
79 listed above, he can supply a decoder and an encoder for it in CCL
80 (Code Conversion Language) programs. Emacs executes the CCL program
81 while reading/writing.
82
83 Emacs represents a coding system by a Lisp symbol that has a property
84 `coding-system'. But, before actually using the coding system, the
85 information about it is set in a structure of type `struct
86 coding_system' for rapid processing. See section 6 for more details.
87
88 */
89
90 /*** GENERAL NOTES on END-OF-LINE FORMAT ***
91
92 How end-of-line of a text is encoded depends on a system. For
93 instance, Unix's format is just one byte of `line-feed' code,
94 whereas DOS's format is two-byte sequence of `carriage-return' and
95 `line-feed' codes. MacOS's format is usually one byte of
96 `carriage-return'.
97
98 Since text characters encoding and end-of-line encoding are
99 independent, any coding system described above can take
100 any format of end-of-line. So, Emacs has information of format of
101 end-of-line in each coding-system. See section 6 for more details.
102
103 */
104
105 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
106
107 These functions check if a text between SRC and SRC_END is encoded
108 in the coding system category XXX. Each returns an integer value in
109 which appropriate flag bits for the category XXX are set. The flag
110 bits are defined in macros CODING_CATEGORY_MASK_XXX. Below is the
111 template of these functions. */
112 #if 0
113 int
114 detect_coding_emacs_mule (src, src_end)
115 unsigned char *src, *src_end;
116 {
117 ...
118 }
119 #endif
120
121 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
122
123 These functions decode SRC_BYTES length text at SOURCE encoded in
124 CODING to Emacs' internal format (emacs-mule). The resulting text
125 goes to a place pointed to by DESTINATION, the length of which
126 should not exceed DST_BYTES. These functions set the information of
127 original and decoded texts in the members produced, produced_char,
128 consumed, and consumed_char of the structure *CODING.
129
130 The return value is an integer (CODING_FINISH_XXX) indicating how
131 the decoding finished.
132
133 DST_BYTES zero means that source area and destination area are
134 overlapped, which means that we can produce decoded text until it
135 reaches the head of the not-yet-decoded source text.
136
137 Below is a template of these functions. */
138 #if 0
139 decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
140 struct coding_system *coding;
141 unsigned char *source, *destination;
142 int src_bytes, dst_bytes;
143 {
144 ...
145 }
146 #endif
147
148 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
149
150 These functions encode SRC_BYTES length text at SOURCE of Emacs'
151 internal format (emacs-mule) to CODING. The resulting text goes to
152 a place pointed to by DESTINATION, the length of which should not
153 exceed DST_BYTES. These functions set the information of
154 original and encoded texts in the members produced, produced_char,
155 consumed, and consumed_char of the structure *CODING.
156
157 The return value is an integer (CODING_FINISH_XXX) indicating how
158 the encoding finished.
159
160 DST_BYTES zero means that source area and destination area are
161 overlapped, which means that we can produce encoded text until it
162 reaches the head of the not-yet-encoded source text.
163
164 Below is a template of these functions. */
165 #if 0
166 encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
167 struct coding_system *coding;
168 unsigned char *source, *destination;
169 int src_bytes, dst_bytes;
170 {
171 ...
172 }
173 #endif
174
175 /*** COMMONLY USED MACROS ***/
176
177 /* The following three macros ONE_MORE_BYTE, TWO_MORE_BYTES, and
178 THREE_MORE_BYTES safely get one, two, and three bytes from the
179 source text respectively. If there are not enough bytes in the
180 source, they jump to `label_end_of_loop'. The caller should set
181 variables `src' and `src_end' to appropriate areas in advance. */
182
/* Fetch the next source byte into C1 and advance `src' past it.
   When `src' has already reached `src_end', nothing is read and
   control jumps to `label_end_of_loop' instead.  */
#define ONE_MORE_BYTE(c1)               \
  do {                                  \
    if (src >= src_end)                 \
      goto label_end_of_loop;           \
    c1 = *src++;                        \
  } while (0)
190
/* Fetch the next two source bytes into C1 and C2 (in that order) and
   advance `src' past both.  If fewer than two bytes remain before
   `src_end', nothing is read and control jumps to
   `label_end_of_loop'.  The remaining length is computed as
   `src_end - src' so that no pointer beyond `src_end' is ever formed
   (the former `src + 1 < src_end' test did so when `src' was already
   equal to `src_end').  */
#define TWO_MORE_BYTES(c1, c2)          \
  do {                                  \
    if (src_end - src >= 2)             \
      (c1) = *src++, (c2) = *src++;     \
    else                                \
      goto label_end_of_loop;           \
  } while (0)
198
/* Fetch the next three source bytes into C1, C2 and C3 (in that
   order) and advance `src' past them.  If fewer than three bytes
   remain before `src_end', nothing is read and control jumps to
   `label_end_of_loop'.  The remaining length is computed as
   `src_end - src' so that no pointer beyond `src_end' is ever formed
   (the former `src + 2 < src_end' test did so near the end of the
   source).  */
#define THREE_MORE_BYTES(c1, c2, c3)                    \
  do {                                                  \
    if (src_end - src >= 3)                             \
      (c1) = *src++, (c2) = *src++, (c3) = *src++;      \
    else                                                \
      goto label_end_of_loop;                           \
  } while (0)
206
207 /* The following three macros DECODE_CHARACTER_ASCII,
208 DECODE_CHARACTER_DIMENSION1, and DECODE_CHARACTER_DIMENSION2 put
209 the multi-byte form of a character of each class at the place
210 pointed by `dst'. The caller should set the variable `dst' to
211 point to an appropriate area and the variable `coding' to point to
212 the coding-system of the currently decoding text in advance. */
213
/* Store one ASCII character C at `dst' and advance `dst'.

   Outside a composition, C is written as a single byte with its MSB
   cleared (C may arrive with the MSB set when the ASCII charset is
   invoked to GR) and coding->produced_char is incremented.

   Inside a composition, the two bytes 0xA0 and C | 0x80 are written,
   coding->composed_chars is incremented, and coding->fake_multibyte
   is set when C | 0x80 is below 0xA0.

   C is evaluated more than once in the composing case.  */

#define DECODE_CHARACTER_ASCII(c)                       \
  do {                                                  \
    if (! COMPOSING_P (coding->composing))              \
      {                                                 \
        *dst++ = (c) & 0x7F;                            \
        coding->produced_char++;                        \
      }                                                 \
    else                                                \
      {                                                 \
        *dst++ = 0xA0;                                  \
        *dst++ = (c) | 0x80;                            \
        coding->composed_chars++;                       \
        if (((c) | 0x80) < 0xA0)                        \
          coding->fake_multibyte = 1;                   \
      }                                                 \
  } while (0)
233
/* Store at `dst' the multi-byte form of one DIMENSION1 character
   whose charset is CHARSET and whose position-code is C.

   The base leading code of CHARSET is written first; inside a
   composition it is written with 0x20 added and
   coding->composed_chars is incremented, otherwise
   coding->produced_char is incremented.  The extended leading code
   follows when CHARSET has a nonzero one, and finally C | 0x80.
   coding->fake_multibyte is set when C | 0x80 is below 0xA0.

   CHARSET and C are each evaluated more than once.  */

#define DECODE_CHARACTER_DIMENSION1(charset, c)                         \
  do {                                                                  \
    unsigned char leading_code = CHARSET_LEADING_CODE_BASE (charset);   \
    if (! COMPOSING_P (coding->composing))                              \
      {                                                                 \
        *dst++ = leading_code;                                          \
        coding->produced_char++;                                        \
      }                                                                 \
    else                                                                \
      {                                                                 \
        *dst++ = leading_code + 0x20;                                   \
        coding->composed_chars++;                                       \
      }                                                                 \
    leading_code = CHARSET_LEADING_CODE_EXT (charset);                  \
    if (leading_code != 0)                                              \
      *dst++ = leading_code;                                            \
    *dst++ = (c) | 0x80;                                                \
    if (((c) | 0x80) < 0xA0)                                            \
      coding->fake_multibyte = 1;                                       \
  } while (0)
256
/* Store at `dst' the multi-byte form of one DIMENSION2 character
   whose charset is CHARSET and whose position-codes are C1 and C2.
   Everything up to and including the first position-code is produced
   by DECODE_CHARACTER_DIMENSION1; C2 | 0x80 is then appended, and
   coding->fake_multibyte is set when that byte is below 0xA0.  */

#define DECODE_CHARACTER_DIMENSION2(charset, c1, c2)    \
  do {                                                  \
    DECODE_CHARACTER_DIMENSION1 (charset, c1);          \
    *dst++ = (c2) | 0x80;                               \
    if (0xA0 > ((c2) | 0x80))                           \
      coding->fake_multibyte = 1;                       \
  } while (0)
267
268 \f
269 /*** 1. Preamble ***/
270
271 #ifdef emacs
272 #include <config.h>
273 #endif
274
275 #include <stdio.h>
276
277 #ifdef emacs
278
279 #include "lisp.h"
280 #include "buffer.h"
281 #include "charset.h"
282 #include "ccl.h"
283 #include "coding.h"
284 #include "window.h"
285
286 #else /* not emacs */
287
288 #include "mulelib.h"
289
290 #endif /* not emacs */
291
/* Lisp_Object variables named after symbols this file works with.
   NOTE(review): going by the Q prefix these presumably hold interned
   symbols that are set up during initialization, which is outside
   this chunk -- confirm.  */
292 Lisp_Object Qcoding_system, Qeol_type;
293 Lisp_Object Qbuffer_file_coding_system;
294 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
295 Lisp_Object Qno_conversion, Qundecided;
296 Lisp_Object Qcoding_system_history;
297 Lisp_Object Qsafe_charsets;
298 Lisp_Object Qvalid_codes;
299
/* The first two are only declared here (`extern'); their definitions
   live in another file.  The rest are defined in this file.  */
300 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
301 Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
302 Lisp_Object Qstart_process, Qopen_network_stream;
303 Lisp_Object Qtarget_idx;
304
/* NOTE(review): presumably a Lisp-visible variable holding a function
   used to pick a safe coding system; its documentation is not in this
   chunk -- confirm.  */
305 Lisp_Object Vselect_safe_coding_system_function;
306
307 /* Mnemonic string for each format of end-of-line. */
308 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
309 /* Mnemonic string to indicate format of end-of-line is not yet
310 decided. */
311 Lisp_Object eol_mnemonic_undecided;
312
313 /* Format of end-of-line decided by system. This is CODING_EOL_LF on
314 Unix, CODING_EOL_CRLF on DOS/Windows, and CODING_EOL_CR on Mac. */
315 int system_eol_type;
316
317 #ifdef emacs
/* Everything down to the matching #endif is compiled only when
   `emacs' is defined.  */
318
/* NOTE(review): presumably the list and the completion alist of all
   defined coding systems -- their documentation is not in this chunk;
   confirm.  */
319 Lisp_Object Vcoding_system_list, Vcoding_system_alist;
320
321 Lisp_Object Qcoding_system_p, Qcoding_system_error;
322
323 /* Coding system emacs-mule and raw-text are for converting only
324 end-of-line format. */
325 Lisp_Object Qemacs_mule, Qraw_text;
326
327 /* Coding-systems are handed between Emacs Lisp programs and C internal
328 routines by the following three variables. */
329 /* Coding-system for reading files and receiving data from process. */
330 Lisp_Object Vcoding_system_for_read;
331 /* Coding-system for writing files and sending data to process. */
332 Lisp_Object Vcoding_system_for_write;
333 /* Coding-system actually used in the latest I/O. */
334 Lisp_Object Vlast_coding_system_used;
335
336 /* A vector of length 256 which contains information about special
337 Latin codes (especially for dealing with Microsoft codes). */
338 Lisp_Object Vlatin_extra_code_table;
339
340 /* Flag to inhibit code conversion of end-of-line format. */
341 int inhibit_eol_conversion;
342
343 /* Flag to make buffer-file-coding-system inherit from process-coding. */
344 int inherit_process_coding_system;
345
346 /* Coding system to be used to encode text for terminal display. */
347 struct coding_system terminal_coding;
348
349 /* Coding system to be used to encode text for terminal display when
350 terminal coding system is nil. */
351 struct coding_system safe_terminal_coding;
352
353 /* Coding system of what is sent from terminal keyboard. */
354 struct coding_system keyboard_coding;
355
356 /* Default coding system to be used to write a file. */
357 struct coding_system default_buffer_file_coding;
358
/* NOTE(review): judging by the names, alists that select a coding
   system for file, process and network I/O respectively; the lookup
   code is not in this chunk -- confirm.  */
359 Lisp_Object Vfile_coding_system_alist;
360 Lisp_Object Vprocess_coding_system_alist;
361 Lisp_Object Vnetwork_coding_system_alist;
362
/* NOTE(review): presumably the coding system associated with the
   system locale; how it is used is not visible here -- confirm.  */
363 Lisp_Object Vlocale_coding_system;
364
365 #endif /* emacs */
366
367 Lisp_Object Qcoding_category, Qcoding_category_index;
368
369 /* List of symbols `coding-category-xxx' ordered by priority. */
370 Lisp_Object Vcoding_category_list;
371
372 /* Table of coding categories (Lisp symbols). */
373 Lisp_Object Vcoding_category_table;
374
375 /* Table of names of symbol for each coding-category. */
/* The initializer below lists twelve names.  NOTE(review): the order
   presumably has to match the CODING_CATEGORY_IDX_XXX values and the
   count CODING_CATEGORY_IDX_MAX, both defined outside this chunk --
   confirm when adding a category.  */
376 char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
377 "coding-category-emacs-mule",
378 "coding-category-sjis",
379 "coding-category-iso-7",
380 "coding-category-iso-7-tight",
381 "coding-category-iso-8-1",
382 "coding-category-iso-8-2",
383 "coding-category-iso-7-else",
384 "coding-category-iso-8-else",
385 "coding-category-ccl",
386 "coding-category-big5",
387 "coding-category-raw-text",
388 "coding-category-binary"
389 };
390
391 /* Table of pointers to coding systems corresponding to each coding
392 categories. */
/* An entry may be a null pointer: CHARSET_OK tests the entry before
   using it.  */
393 struct coding_system *coding_system_table[CODING_CATEGORY_IDX_MAX];
394
395 /* Table of coding category masks. Nth element is a mask for a coding
396 category of which priority is Nth. */
397 static
398 int coding_priorities[CODING_CATEGORY_IDX_MAX];
399
400 /* Flag to tell if we look up translation table on character code
401 conversion. */
402 Lisp_Object Venable_character_translation;
403 /* Standard translation table to look up on decoding (reading). */
404 Lisp_Object Vstandard_translation_table_for_decode;
405 /* Standard translation table to look up on encoding (writing). */
406 Lisp_Object Vstandard_translation_table_for_encode;
407
/* NOTE(review): going by the Q prefix, symbols related to translation
   tables; where they are initialized and used is outside this chunk
   -- confirm.  */
408 Lisp_Object Qtranslation_table;
409 Lisp_Object Qtranslation_table_id;
410 Lisp_Object Qtranslation_table_for_decode;
411 Lisp_Object Qtranslation_table_for_encode;
412
413 /* Alist of charsets vs revision number. */
414 Lisp_Object Vcharset_revision_alist;
415
416 /* Default coding systems used for process I/O. */
417 Lisp_Object Vdefault_process_coding_system;
418
419 /* Global flag to tell that we can't call post-read-conversion and
420 pre-write-conversion functions. Usually the value is zero, but it
421 is set to 1 temporarily while such functions are running. This is
422 to avoid infinite recursive call. */
/* File-local (`static').  */
423 static int inhibit_pre_post_conversion;
424
425 \f
426 /*** 2. Emacs internal format (emacs-mule) handlers ***/
427
428 /* Emacs' internal format for encoding multiple character sets is a
429 kind of multi-byte encoding, i.e. characters are encoded by
430 variable-length sequences of one-byte codes. ASCII characters
431 and control characters (e.g. `tab', `newline') are represented by
432 one-byte sequences which are their ASCII codes, in the range 0x00
433 through 0x7F. The other characters are represented by a sequence
434 of `base leading-code', optional `extended leading-code', and one
435 or two `position-code's. The length of the sequence is determined
436 by the base leading-code. Leading-code takes the range 0x80
437 through 0x9F, whereas extended leading-code and position-code take
438 the range 0xA0 through 0xFF. See `charset.h' for more details
439 about leading-code and position-code.
440
441 There's one exception to this rule. Special leading-code
442 `leading-code-composition' denotes that the following several
443 characters should be composed into one character. Leading-codes of
444 components (except for ASCII) are added 0x20. An ASCII character
445 component is represented by a 2-byte sequence of `0xA0' and
446 `ASCII-code + 0x80'. See also the comments in `charset.h' for the
447 details of composite character. Hence, we can summarize the code
448 range as follows:
449
450 --- CODE RANGE of Emacs' internal format ---
451 (character set) (range)
452 ASCII 0x00 .. 0x7F
453 ELSE (1st byte) 0x80 .. 0x9F
454 (rest bytes) 0xA0 .. 0xFF
455 ---------------------------------------------
456
457 */
458
/* Class of each of the 256 possible byte values in Emacs' internal
   format, indexed by the byte itself; detect_coding_emacs_mule
   dispatches on it.  NOTE(review): the table is filled in outside
   this chunk.  */
459 enum emacs_code_class_type emacs_code_class[256];
460
/* Consume one byte at `src' and fall through to the next statement
   only if that byte is 0xA0 or greater.  If the byte is below 0xA0,
   the enclosing function returns 0.  If no byte is left (`src' has
   reached `src_end'), control jumps to `label_end_of_switch' without
   consuming anything.  */
#define CHECK_CODE_RANGE_A0_FF                  \
  do {                                          \
    if (src >= src_end)                         \
      goto label_end_of_switch;                 \
    if (*src++ < 0xA0)                          \
      return 0;                                 \
  } while (0)
470
471 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
472 Check if a text is encoded in Emacs' internal format. If it is,
473 return CODING_CATEGORY_MASK_EMACS_MULE, else return 0. */
474
475 int
476 detect_coding_emacs_mule (src, src_end)
477 unsigned char *src, *src_end;
478 {
479 unsigned char c;
480 int composing = 0;
481
482 while (src < src_end)
483 {
484 c = *src++;
485
486 if (composing)
487 {
488 if (c < 0xA0)
489 composing = 0;
490 else
491 c -= 0x20;
492 }
493
494 switch (emacs_code_class[c])
495 {
496 case EMACS_ascii_code:
497 case EMACS_linefeed_code:
498 break;
499
500 case EMACS_control_code:
501 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
502 return 0;
503 break;
504
505 case EMACS_invalid_code:
506 return 0;
507
508 case EMACS_leading_code_composition: /* c == 0x80 */
509 if (composing)
510 CHECK_CODE_RANGE_A0_FF;
511 else
512 composing = 1;
513 break;
514
515 case EMACS_leading_code_4:
516 CHECK_CODE_RANGE_A0_FF;
517 /* fall down to check it two more times ... */
518
519 case EMACS_leading_code_3:
520 CHECK_CODE_RANGE_A0_FF;
521 /* fall down to check it one more time ... */
522
523 case EMACS_leading_code_2:
524 CHECK_CODE_RANGE_A0_FF;
525 break;
526
527 default:
528 label_end_of_switch:
529 break;
530 }
531 }
532 return CODING_CATEGORY_MASK_EMACS_MULE;
533 }
534
535 \f
536 /*** 3. ISO2022 handlers ***/
537
538 /* The following note describes the coding system ISO2022 briefly.
539 Since the intention of this note is to help understand the
540 functions in this file, some parts are NOT ACCURATE or OVERLY
541 SIMPLIFIED. For thorough understanding, please refer to the
542 original document of ISO2022.
543
544 ISO2022 provides many mechanisms to encode several character sets
545 in 7-bit and 8-bit environments. For 7-bit environments, all text
546 is encoded using bytes less than 128. This may make the encoded
547 text a little bit longer, but the text passes more easily through
548 several gateways, some of which strip off MSB (Most Significant Bit).
549
550 There are two kinds of character sets: control character set and
551 graphic character set. The former contains control characters such
552 as `newline' and `escape' to provide control functions (control
553 functions are also provided by escape sequences). The latter
554 contains graphic characters such as 'A' and '-'. Emacs recognizes
555 two control character sets and many graphic character sets.
556
557 Graphic character sets are classified into one of the following
558 four classes, according to the number of bytes (DIMENSION) and
559 number of characters in one dimension (CHARS) of the set:
560 - DIMENSION1_CHARS94
561 - DIMENSION1_CHARS96
562 - DIMENSION2_CHARS94
563 - DIMENSION2_CHARS96
564
565 In addition, each character set is assigned an identification tag,
566 unique for each set, called "final character" (denoted as <F>
567 hereafter). The <F> of each character set is decided by ECMA(*)
568 when it is registered in ISO. The code range of <F> is 0x30..0x7F
569 (0x30..0x3F are for private use only).
570
571 Note (*): ECMA = European Computer Manufacturers Association
572
573 Here are examples of graphic character set [NAME(<F>)]:
574 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
575 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
576 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
577 o DIMENSION2_CHARS96 -- none for the moment
578
579 A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
580 C0 [0x00..0x1F] -- control character plane 0
581 GL [0x20..0x7F] -- graphic character plane 0
582 C1 [0x80..0x9F] -- control character plane 1
583 GR [0xA0..0xFF] -- graphic character plane 1
584
585 A control character set is directly designated and invoked to C0 or
586 C1 by an escape sequence. The most common case is that:
587 - ISO646's control character set is designated/invoked to C0, and
588 - ISO6429's control character set is designated/invoked to C1,
589 and usually these designations/invocations are omitted in encoded
590 text. In a 7-bit environment, only C0 can be used, and a control
591 character for C1 is encoded by an appropriate escape sequence to
592 fit into the environment. All control characters for C1 are
593 defined to have corresponding escape sequences.
594
595 A graphic character set is at first designated to one of four
596 graphic registers (G0 through G3), then these graphic registers are
597 invoked to GL or GR. These designations and invocations can be
598 done independently. The most common case is that G0 is invoked to
599 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
600 these invocations and designations are omitted in encoded text.
601 In a 7-bit environment, only GL can be used.
602
603 When a graphic character set of CHARS94 is invoked to GL, codes
604 0x20 and 0x7F of the GL area work as control characters SPACE and
605 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
606 be used.
607
608 There are two ways of invocation: locking-shift and single-shift.
609 With locking-shift, the invocation lasts until the next different
610 invocation, whereas with single-shift, the invocation affects the
611 following character only and doesn't affect the locking-shift
612 state. Invocations are done by the following control characters or
613 escape sequences:
614
615 ----------------------------------------------------------------------
616 abbrev function cntrl escape seq description
617 ----------------------------------------------------------------------
618 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
619 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
620 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
621 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
622 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
623 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
624 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
625 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
626 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
627 ----------------------------------------------------------------------
628 (*) These are not used by any known coding system.
629
630 Control characters for these functions are defined by macros
631 ISO_CODE_XXX in `coding.h'.
632
633 Designations are done by the following escape sequences:
634 ----------------------------------------------------------------------
635 escape sequence description
636 ----------------------------------------------------------------------
637 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
638 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
639 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
640 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
641 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
642 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
643 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
644 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
645 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
646 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
647 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
648 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
649 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
650 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
651 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
652 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
653 ----------------------------------------------------------------------
654
655 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
656 of dimension 1, chars 94, and final character <F>, etc...
657
658 Note (*): Although these designations are not allowed in ISO2022,
659 Emacs accepts them on decoding, and produces them on encoding
660 CHARS96 character sets in a coding system which is characterized as
661 7-bit environment, non-locking-shift, and non-single-shift.
662
663 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
664 '(' can be omitted. We refer to this as "short-form" hereafter.
665
666 Now you may notice that there are a lot of ways for encoding the
667 same multilingual text in ISO2022. Actually, there exist many
668 coding systems such as Compound Text (used in X11's inter-client
669 communication), ISO-2022-JP (used in Japanese internet), ISO-2022-KR
670 (used in Korean internet), EUC (Extended UNIX Code, used in Asian
671 localized platforms), and all of these are variants of ISO2022.
672
673 In addition to the above, Emacs handles two more kinds of escape
674 sequences: ISO6429's direction specification and Emacs' private
675 sequence for specifying character composition.
676
677 ISO6429's direction specification takes the following form:
678 o CSI ']' -- end of the current direction
679 o CSI '0' ']' -- end of the current direction
680 o CSI '1' ']' -- start of left-to-right text
681 o CSI '2' ']' -- start of right-to-left text
682 The control character CSI (0x9B: control sequence introducer) is
683 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
684
685 Character composition specification takes the following form:
686 o ESC '0' -- start character composition
687 o ESC '1' -- end character composition
688 Since these are not standard escape sequences of any ISO standard,
689 the use of them for these meaning is restricted to Emacs only. */
690
/* ISO2022 code class of each of the 256 possible byte values, indexed
   by the byte itself.  NOTE(review): the table is filled in outside
   this chunk.  */
691 enum iso_code_class_type iso_code_class[256];
692
/* Evaluate to 1 if the coding system registered for category IDX
   exists and accepts CHARSET, i.e. CHARSET is flagged in its
   `safe_charsets' table or has a requested designation; evaluate to
   0 otherwise, including when the table slot for IDX is null.
   IDX and CHARSET are evaluated more than once.  */
#define CHARSET_OK(idx, charset)                                \
  (coding_system_table[idx]                                     \
   ? (coding_system_table[idx]->safe_charsets[charset]          \
      || (CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION              \
          != CODING_SPEC_ISO_REQUESTED_DESIGNATION              \
               (coding_system_table[idx], charset)))            \
   : 0)
699
/* Nonzero if the coding system registered for category IDX exists
   and its initial designation for graphic register 1 is not negative.
   The table slot is tested first, as CHARSET_OK does, so a category
   with no coding system yields 0 instead of dereferencing a null
   pointer.  IDX is evaluated more than once.  */
#define SHIFT_OUT_OK(idx)                                               \
  (coding_system_table[idx]                                             \
   && (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding_system_table[idx], 1) \
       >= 0))
702
703 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
704 Check if a text is encoded in ISO2022. If it is, return an
705 integer in which any of the following flag bits:
706 CODING_CATEGORY_MASK_ISO_7
707 CODING_CATEGORY_MASK_ISO_7_TIGHT
708 CODING_CATEGORY_MASK_ISO_8_1
709 CODING_CATEGORY_MASK_ISO_8_2
710 CODING_CATEGORY_MASK_ISO_7_ELSE
711 CODING_CATEGORY_MASK_ISO_8_ELSE
712 are set. If a code which should never appear in ISO2022 is found,
713 return 0.

Two masks are maintained while scanning: MASK holds the categories
that have not been ruled out yet, and MASK_FOUND holds the categories
for which some positive evidence was seen.  The result is their
intersection, so text without any such evidence (for instance plain
7-bit text with no escape sequence or shift code) also yields 0.  */
714
715 int
716 detect_coding_iso2022 (src, src_end)
717 unsigned char *src, *src_end;
718 {
719 int mask = CODING_CATEGORY_MASK_ISO;
720 int mask_found = 0;
/* NOTE(review): `shift_out' is initialized to 0 here and is not
   assigned anywhere else in this function, so the `shift_out == 1'
   branch under ISO_CODE_SI below cannot be taken as written.  */
721 int reg[4], shift_out = 0, single_shifting = 0;
/* NOTE(review): `i' is declared but not used in this function.  */
722 int c, c1, i, charset;
723
/* reg[N] records the charset most recently designated to graphic
   register GN.  G0 starts as ASCII; -1 marks G1..G3 as having no
   designation yet.  */
724 reg[0] = CHARSET_ASCII, reg[1] = reg[2] = reg[3] = -1;
/* Stop early once MASK is zero: every category has been ruled out.  */
725 while (mask && src < src_end)
726 {
727 c = *src++;
728 switch (c)
729 {
730 case ISO_CODE_ESC:
731 single_shifting = 0;
732 if (src >= src_end)
733 break;
734 c = *src++;
/* In this case, every `break' before the "valid designation" code
   below leaves the switch without touching MASK or MASK_FOUND, except
   the SS2/SS3 branch which narrows MASK first.  */
735 if (c >= '(' && c <= '/')
736 {
737 /* Designation sequence for a charset of dimension 1. */
738 if (src >= src_end)
739 break;
740 c1 = *src++;
/* The second index of iso_charset_table is 0 for the intermediates
   '(' ')' '*' '+' (CHARS94) and 1 for ',' '-' '.' '/' (CHARS96); see
   the table of designation sequences in the note above.  */
741 if (c1 < ' ' || c1 >= 0x80
742 || (charset = iso_charset_table[0][c >= ','][c1]) < 0)
743 /* Invalid designation sequence. Just ignore. */
744 break;
/* The eight intermediates map onto G0..G3 twice over, hence `% 4'.  */
745 reg[(c - '(') % 4] = charset;
746 }
747 else if (c == '$')
748 {
749 /* Designation sequence for a charset of dimension 2. */
750 if (src >= src_end)
751 break;
752 c = *src++;
753 if (c >= '@' && c <= 'B')
754 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
/* This is the "short-form" designation to G0 (ESC '$' <F>).
   NOTE(review): unlike the long forms, the table value is not checked
   for being negative before it is used below -- confirm that these
   three entries are always defined.  */
755 reg[0] = charset = iso_charset_table[1][0][c];
756 else if (c >= '(' && c <= '/')
757 {
758 if (src >= src_end)
759 break;
760 c1 = *src++;
761 if (c1 < ' ' || c1 >= 0x80
762 || (charset = iso_charset_table[1][c >= ','][c1]) < 0)
763 /* Invalid designation sequence. Just ignore. */
764 break;
765 reg[(c - '(') % 4] = charset;
766 }
767 else
768 /* Invalid designation sequence. Just ignore. */
769 break;
770 }
771 else if (c == 'N' || c == 'O')
772 {
773 /* ESC <Fe> for SS2 or SS3. */
774 mask &= CODING_CATEGORY_MASK_ISO_7_ELSE;
775 break;
776 }
/* The note at the top of this section lists only ESC '0' and ESC '1'
   for composition; '2' is accepted here as well.  */
777 else if (c == '0' || c == '1' || c == '2')
778 /* ESC <Fp> for start/end composition. Just ignore. */
779 break;
780 else
781 /* Invalid escape sequence. Just ignore. */
782 break;
783
784 /* We found a valid designation sequence for CHARSET. */
/* A designation escape sequence rules out the categories in
   CODING_CATEGORY_MASK_ISO_8BIT.  Each of the four categories tested
   below is then either confirmed (added to MASK_FOUND) or ruled out
   (removed from MASK), depending on whether its coding system accepts
   CHARSET.  */
785 mask &= ~CODING_CATEGORY_MASK_ISO_8BIT;
786 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7, charset))
787 mask_found |= CODING_CATEGORY_MASK_ISO_7;
788 else
789 mask &= ~CODING_CATEGORY_MASK_ISO_7;
790 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT, charset))
791 mask_found |= CODING_CATEGORY_MASK_ISO_7_TIGHT;
792 else
793 mask &= ~CODING_CATEGORY_MASK_ISO_7_TIGHT;
794 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_ELSE, charset))
795 mask_found |= CODING_CATEGORY_MASK_ISO_7_ELSE;
796 else
797 mask &= ~CODING_CATEGORY_MASK_ISO_7_ELSE;
798 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_8_ELSE, charset))
799 mask_found |= CODING_CATEGORY_MASK_ISO_8_ELSE;
800 else
801 mask &= ~CODING_CATEGORY_MASK_ISO_8_ELSE;
802 break;
803
804 case ISO_CODE_SO:
805 single_shifting = 0;
/* SO counts as a locking shift only if something was designated to G1
   in this text, or one of the two "else" categories passes
   SHIFT_OUT_OK.  */
806 if (shift_out == 0
807 && (reg[1] >= 0
808 || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_7_ELSE)
809 || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_8_ELSE)))
810 {
811 /* Locking shift out. */
812 mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
813 mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
814 }
815 break;
816
817 case ISO_CODE_SI:
818 single_shifting = 0;
/* NOTE(review): never true as written, see the note at the
   declaration of `shift_out'.  */
819 if (shift_out == 1)
820 {
821 /* Locking shift in. */
822 mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
823 mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
824 }
825 break;
826
827 case ISO_CODE_CSI:
828 single_shifting = 0;
/* Deliberate fall through: CSI shares the code below with SS2 and
   SS3, which tells the cases apart with `c != ISO_CODE_CSI'.  */
829 case ISO_CODE_SS2:
830 case ISO_CODE_SS3:
831 {
/* NEWMASK collects the categories in which this byte is acceptable;
   MASK is narrowed to it afterwards.  */
832 int newmask = CODING_CATEGORY_MASK_ISO_8_ELSE;
833
834 if (c != ISO_CODE_CSI)
835 {
/* NOTE(review): the two coding_system_table entries used here and
   below are dereferenced without the null test that CHARSET_OK
   applies to the same table -- confirm they are always set by the
   time detection runs.  */
836 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
837 & CODING_FLAG_ISO_SINGLE_SHIFT)
838 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
839 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
840 & CODING_FLAG_ISO_SINGLE_SHIFT)
841 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
842 single_shifting = 1;
843 }
844 if (VECTORP (Vlatin_extra_code_table)
845 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
846 {
847 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
848 & CODING_FLAG_ISO_LATIN_EXTRA)
849 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
850 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
851 & CODING_FLAG_ISO_LATIN_EXTRA)
852 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
853 }
854 mask &= newmask;
855 mask_found |= newmask;
856 }
857 break;
858
859 default:
860 if (c < 0x80)
861 {
/* Any other byte below 0x80 carries no information for detection; it
   only ends a pending single shift.  */
862 single_shifting = 0;
863 break;
864 }
865 else if (c < 0xA0)
866 {
/* A byte in 0x80..0x9F other than CSI, SS2 and SS3 is acceptable only
   if Vlatin_extra_code_table has a non-nil entry for it; otherwise
   the text is not ISO2022 and 0 is returned.  */
867 single_shifting = 0;
868 if (VECTORP (Vlatin_extra_code_table)
869 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
870 {
871 int newmask = 0;
872
873 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
874 & CODING_FLAG_ISO_LATIN_EXTRA)
875 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
876 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
877 & CODING_FLAG_ISO_LATIN_EXTRA)
878 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
879 mask &= newmask;
880 mask_found |= newmask;
881 }
882 else
883 return 0;
884 }
885 else
886 {
/* C is in 0xA0..0xFF.  SRC_BEGIN points just past it, i.e. at the
   second byte of a possible run of such bytes.  */
887 unsigned char *src_begin = src;
888
889 mask &= ~(CODING_CATEGORY_MASK_ISO_7BIT
890 | CODING_CATEGORY_MASK_ISO_7_ELSE);
891 mask_found |= CODING_CATEGORY_MASK_ISO_8_1;
892 /* Check the length of succeeding codes of the range
893 0xA0..0xFF. If the byte length is odd, we exclude
894 CODING_CATEGORY_MASK_ISO_8_2. We can check this only
895 when we are not single shifting. */
896 if (!single_shifting)
897 {
898 while (src < src_end && *src >= 0xA0)
899 src++;
/* The run length including C is (src - src_begin) + 1, which has the
   same parity as (src - src_begin - 1).  The `src < src_end' test
   applies the exclusion only when the run was ended by a byte below
   0xA0, not when it reached the end of the input.  */
900 if ((src - src_begin - 1) & 1 && src < src_end)
901 mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
902 else
903 mask_found |= CODING_CATEGORY_MASK_ISO_8_2;
904 }
905 }
906 break;
907 }
908 }
909
/* Report only categories that are both still possible and supported
   by some evidence.  */
910 return (mask & mask_found);
911 }
912
/* Decode one character whose charset is CHARSET and whose first
   position code is C1, appending the result at DST.

   For a 2-dimensional CHARSET the second position code is read from
   SRC into C2 (a variable of the expansion site).  A negative CHARSET
   means the text is ill formed; C1 is then emitted as if it were
   ASCII.  TRANSLATION_TABLE, when non-nil, is consulted before the
   character is produced.  The composition state in CODING->composing
   is advanced as a side effect.  */

#define DECODE_ISO_CHARACTER(charset, c1) \
  do { \
    int iso_charset = (charset); \
    int iso_translated; \
    \
    if (COMPOSING_HEAD_P (coding->composing)) \
      { \
        *dst++ = LEADING_CODE_COMPOSITION; \
        if (COMPOSING_WITH_RULE_P (coding->composing)) \
          { \
            /* To tell composition rules are embedded.  */ \
            *dst++ = 0xFF; \
          } \
        coding->composing += 2; \
      } \
    if (iso_charset >= 0) \
      { \
        if (CHARSET_DIMENSION (iso_charset) == 2) \
          { \
            ONE_MORE_BYTE (c2); \
            /* A second byte that is not a graphic code cannot belong \
               to this character: push it back and treat C1 as ASCII.  */ \
            if (! (iso_code_class[(c2) & 0x7F] == ISO_0x20_or_0x7F \
                   || iso_code_class[(c2) & 0x7F] == ISO_graphic_plane_0)) \
              { \
                src--; \
                iso_charset = CHARSET_ASCII; \
              } \
          } \
        if (!NILP (translation_table)) \
          { \
            iso_translated = translate_char (translation_table, \
                                             -1, iso_charset, c1, c2); \
            if (iso_translated >= 0) \
              SPLIT_CHAR (iso_translated, iso_charset, c1, c2); \
          } \
      } \
    if (iso_charset < 0 || iso_charset == CHARSET_ASCII) \
      DECODE_CHARACTER_ASCII (c1); \
    else if (CHARSET_DIMENSION (iso_charset) == 1) \
      DECODE_CHARACTER_DIMENSION1 (iso_charset, c1); \
    else \
      DECODE_CHARACTER_DIMENSION2 (iso_charset, c1, c2); \
    if (COMPOSING_WITH_RULE_P (coding->composing)) \
      { \
        /* To tell a composition rule follows.  */ \
        coding->composing = COMPOSING_WITH_RULE_RULE; \
      } \
  } while (0)
957
/* Set designation state into CODING: designate the charset identified
   by DIMENSION, CHARS and FINAL_CHAR to graphic register REG.

   Jumps to `label_invalid_code' (which must exist at the expansion
   site) when FINAL_CHAR is out of range, when no charset matches, or
   when the charset is neither requested for REG nor listed in
   CODING->safe_charsets.  The arguments are evaluated more than once,
   so they must be free of side effects; they are parenthesized here
   because callers pass expressions such as `c1 - 0x28'.  */
#define DECODE_DESIGNATION(reg, dimension, chars, final_char) \
  do { \
    int charset; \
    \
    if ((final_char) < '0' || (final_char) >= 128) \
      goto label_invalid_code; \
    charset = ISO_CHARSET_TABLE (make_number (dimension), \
                                 make_number (chars), \
                                 make_number (final_char)); \
    if (charset >= 0 \
        && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) == (reg) \
            || coding->safe_charsets[charset])) \
      { \
        if (coding->spec.iso2022.last_invalid_designation_register == 0 \
            && (reg) == 0 \
            && charset == CHARSET_ASCII) \
          { \
            /* We should insert this designation sequence as is so \
               that it is surely written back to a file.  */ \
            coding->spec.iso2022.last_invalid_designation_register = -1; \
            goto label_invalid_code; \
          } \
        coding->spec.iso2022.last_invalid_designation_register = -1; \
        /* In right-to-left mode use the reverse charset if one exists.  */ \
        if ((coding->mode & CODING_MODE_DIRECTION) \
            && CHARSET_REVERSE_CHARSET (charset) >= 0) \
          charset = CHARSET_REVERSE_CHARSET (charset); \
        CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset; \
      } \
    else \
      { \
        /* Remember which register received the invalid designation.  */ \
        coding->spec.iso2022.last_invalid_designation_register = (reg); \
        goto label_invalid_code; \
      } \
  } while (0)
993
994 /* Return 0 if there's a valid composing sequence starting at SRC and
995 ending before SRC_END, else return -1. */
996
997 int
998 check_composing_code (coding, src, src_end)
999 struct coding_system *coding;
1000 unsigned char *src, *src_end;
1001 {
1002 int charset, c, c1, dim;
1003
1004 while (src < src_end)
1005 {
1006 c = *src++;
1007 if (c >= 0x20)
1008 continue;
1009 if (c != ISO_CODE_ESC || src >= src_end)
1010 return -1;
1011 c = *src++;
1012 if (c == '1') /* end of compsition */
1013 return 0;
1014 if (src + 2 >= src_end
1015 || !coding->flags & CODING_FLAG_ISO_DESIGNATION)
1016 return -1;
1017
1018 dim = (c == '$');
1019 if (dim == 1)
1020 c = (*src >= '@' && *src <= 'B') ? '(' : *src++;
1021 if (c >= '(' && c <= '/')
1022 {
1023 c1 = *src++;
1024 if ((c1 < ' ' || c1 >= 0x80)
1025 || (charset = iso_charset_table[dim][c >= ','][c1]) < 0
1026 || ! coding->safe_charsets[charset]
1027 || (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
1028 == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
1029 return -1;
1030 }
1031 else
1032 return -1;
1033 }
1034
1035 /* We have not found the sequence "ESC 1". */
1036 return -1;
1037 }
1038
1039 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
1040
1041 int
1042 decode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
1043 struct coding_system *coding;
1044 unsigned char *source, *destination;
1045 int src_bytes, dst_bytes;
1046 {
1047 unsigned char *src = source;
1048 unsigned char *src_end = source + src_bytes;
1049 unsigned char *dst = destination;
1050 unsigned char *dst_end = destination + dst_bytes;
1051 /* Since the maximum bytes produced by each loop is 7, we subtract 6
1052 from DST_END to assure that overflow checking is necessary only
1053 at the head of loop. */
1054 unsigned char *adjusted_dst_end = dst_end - 6;
1055 int charset;
1056 /* Charsets invoked to graphic plane 0 and 1 respectively. */
1057 int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1058 int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1059 Lisp_Object translation_table
1060 = coding->translation_table_for_decode;
1061 int result = CODING_FINISH_NORMAL;
1062
1063 if (!NILP (Venable_character_translation) && NILP (translation_table))
1064 translation_table = Vstandard_translation_table_for_decode;
1065
1066 coding->produced_char = 0;
1067 coding->fake_multibyte = 0;
1068 while (src < src_end && (dst_bytes
1069 ? (dst < adjusted_dst_end)
1070 : (dst < src - 6)))
1071 {
1072 /* SRC_BASE remembers the start position in source in each loop.
1073 The loop will be exited when there's not enough source text
1074 to analyze long escape sequence or 2-byte code (within macros
1075 ONE_MORE_BYTE or TWO_MORE_BYTES). In that case, SRC is reset
1076 to SRC_BASE before exiting. */
1077 unsigned char *src_base = src;
1078 int c1 = *src++, c2;
1079
1080 switch (iso_code_class [c1])
1081 {
1082 case ISO_0x20_or_0x7F:
1083 if (!coding->composing
1084 && (charset0 < 0 || CHARSET_CHARS (charset0) == 94))
1085 {
1086 /* This is SPACE or DEL. */
1087 *dst++ = c1;
1088 coding->produced_char++;
1089 break;
1090 }
1091 /* This is a graphic character, we fall down ... */
1092
1093 case ISO_graphic_plane_0:
1094 if (coding->composing == COMPOSING_WITH_RULE_RULE)
1095 {
1096 /* This is a composition rule. */
1097 *dst++ = c1 | 0x80;
1098 coding->composing = COMPOSING_WITH_RULE_TAIL;
1099 }
1100 else
1101 DECODE_ISO_CHARACTER (charset0, c1);
1102 break;
1103
1104 case ISO_0xA0_or_0xFF:
1105 if (charset1 < 0 || CHARSET_CHARS (charset1) == 94
1106 || coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
1107 goto label_invalid_code;
1108 /* This is a graphic character, we fall down ... */
1109
1110 case ISO_graphic_plane_1:
1111 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
1112 goto label_invalid_code;
1113 else
1114 DECODE_ISO_CHARACTER (charset1, c1);
1115 break;
1116
1117 case ISO_control_code:
1118 /* All ISO2022 control characters in this class have the
1119 same representation in Emacs internal format. */
1120 if (c1 == '\n'
1121 && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1122 && (coding->eol_type == CODING_EOL_CR
1123 || coding->eol_type == CODING_EOL_CRLF))
1124 {
1125 result = CODING_FINISH_INCONSISTENT_EOL;
1126 goto label_end_of_loop_2;
1127 }
1128 *dst++ = c1;
1129 coding->produced_char++;
1130 if (c1 >= 0x80)
1131 coding->fake_multibyte = 1;
1132 break;
1133
1134 case ISO_carriage_return:
1135 if (coding->eol_type == CODING_EOL_CR)
1136 *dst++ = '\n';
1137 else if (coding->eol_type == CODING_EOL_CRLF)
1138 {
1139 ONE_MORE_BYTE (c1);
1140 if (c1 == ISO_CODE_LF)
1141 *dst++ = '\n';
1142 else
1143 {
1144 if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1145 {
1146 result = CODING_FINISH_INCONSISTENT_EOL;
1147 goto label_end_of_loop_2;
1148 }
1149 src--;
1150 *dst++ = '\r';
1151 }
1152 }
1153 else
1154 *dst++ = c1;
1155 coding->produced_char++;
1156 break;
1157
1158 case ISO_shift_out:
1159 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1160 || CODING_SPEC_ISO_DESIGNATION (coding, 1) < 0)
1161 goto label_invalid_code;
1162 CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
1163 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1164 break;
1165
1166 case ISO_shift_in:
1167 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
1168 goto label_invalid_code;
1169 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
1170 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1171 break;
1172
1173 case ISO_single_shift_2_7:
1174 case ISO_single_shift_2:
1175 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1176 goto label_invalid_code;
1177 /* SS2 is handled as an escape sequence of ESC 'N' */
1178 c1 = 'N';
1179 goto label_escape_sequence;
1180
1181 case ISO_single_shift_3:
1182 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1183 goto label_invalid_code;
1184 /* SS2 is handled as an escape sequence of ESC 'O' */
1185 c1 = 'O';
1186 goto label_escape_sequence;
1187
1188 case ISO_control_sequence_introducer:
1189 /* CSI is handled as an escape sequence of ESC '[' ... */
1190 c1 = '[';
1191 goto label_escape_sequence;
1192
1193 case ISO_escape:
1194 ONE_MORE_BYTE (c1);
1195 label_escape_sequence:
1196 /* Escape sequences handled by Emacs are invocation,
1197 designation, direction specification, and character
1198 composition specification. */
1199 switch (c1)
1200 {
1201 case '&': /* revision of following character set */
1202 ONE_MORE_BYTE (c1);
1203 if (!(c1 >= '@' && c1 <= '~'))
1204 goto label_invalid_code;
1205 ONE_MORE_BYTE (c1);
1206 if (c1 != ISO_CODE_ESC)
1207 goto label_invalid_code;
1208 ONE_MORE_BYTE (c1);
1209 goto label_escape_sequence;
1210
1211 case '$': /* designation of 2-byte character set */
1212 if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
1213 goto label_invalid_code;
1214 ONE_MORE_BYTE (c1);
1215 if (c1 >= '@' && c1 <= 'B')
1216 { /* designation of JISX0208.1978, GB2312.1980,
1217 or JISX0208.1980 */
1218 DECODE_DESIGNATION (0, 2, 94, c1);
1219 }
1220 else if (c1 >= 0x28 && c1 <= 0x2B)
1221 { /* designation of DIMENSION2_CHARS94 character set */
1222 ONE_MORE_BYTE (c2);
1223 DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
1224 }
1225 else if (c1 >= 0x2C && c1 <= 0x2F)
1226 { /* designation of DIMENSION2_CHARS96 character set */
1227 ONE_MORE_BYTE (c2);
1228 DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
1229 }
1230 else
1231 goto label_invalid_code;
1232 break;
1233
1234 case 'n': /* invocation of locking-shift-2 */
1235 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1236 || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
1237 goto label_invalid_code;
1238 CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
1239 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1240 break;
1241
1242 case 'o': /* invocation of locking-shift-3 */
1243 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1244 || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
1245 goto label_invalid_code;
1246 CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
1247 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1248 break;
1249
1250 case 'N': /* invocation of single-shift-2 */
1251 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1252 || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
1253 goto label_invalid_code;
1254 ONE_MORE_BYTE (c1);
1255 charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
1256 DECODE_ISO_CHARACTER (charset, c1);
1257 break;
1258
1259 case 'O': /* invocation of single-shift-3 */
1260 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1261 || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
1262 goto label_invalid_code;
1263 ONE_MORE_BYTE (c1);
1264 charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
1265 DECODE_ISO_CHARACTER (charset, c1);
1266 break;
1267
1268 case '0': case '2': /* start composing */
1269 /* Before processing composing, we must be sure that all
1270 characters being composed are supported by CODING.
1271 If not, we must give up composing. */
1272 if (check_composing_code (coding, src, src_end) == 0)
1273 {
1274 /* We are looking at a valid composition sequence. */
1275 coding->composing = (c1 == '0'
1276 ? COMPOSING_NO_RULE_HEAD
1277 : COMPOSING_WITH_RULE_HEAD);
1278 coding->composed_chars = 0;
1279 }
1280 else
1281 {
1282 *dst++ = ISO_CODE_ESC;
1283 *dst++ = c1;
1284 coding->produced_char += 2;
1285 }
1286 break;
1287
1288 case '1': /* end composing */
1289 if (!coding->composing)
1290 {
1291 *dst++ = ISO_CODE_ESC;
1292 *dst++ = c1;
1293 coding->produced_char += 2;
1294 break;
1295 }
1296
1297 if (coding->composed_chars > 0)
1298 {
1299 if (coding->composed_chars == 1)
1300 {
1301 unsigned char *this_char_start = dst;
1302 int this_bytes;
1303
1304 /* Only one character is in the composing
1305 sequence. Make it a normal character. */
1306 while (*--this_char_start != LEADING_CODE_COMPOSITION);
1307 dst = (this_char_start
1308 + (coding->composing == COMPOSING_NO_RULE_TAIL
1309 ? 1 : 2));
1310 *dst -= 0x20;
1311 if (*dst == 0x80)
1312 *++dst &= 0x7F;
1313 this_bytes = BYTES_BY_CHAR_HEAD (*dst);
1314 while (this_bytes--) *this_char_start++ = *dst++;
1315 dst = this_char_start;
1316 }
1317 coding->produced_char++;
1318 }
1319 coding->composing = COMPOSING_NO;
1320 break;
1321
1322 case '[': /* specification of direction */
1323 if (coding->flags & CODING_FLAG_ISO_NO_DIRECTION)
1324 goto label_invalid_code;
1325 /* For the moment, nested direction is not supported.
1326 So, `coding->mode & CODING_MODE_DIRECTION' zero means
1327 left-to-right, and nozero means right-to-left. */
1328 ONE_MORE_BYTE (c1);
1329 switch (c1)
1330 {
1331 case ']': /* end of the current direction */
1332 coding->mode &= ~CODING_MODE_DIRECTION;
1333
1334 case '0': /* end of the current direction */
1335 case '1': /* start of left-to-right direction */
1336 ONE_MORE_BYTE (c1);
1337 if (c1 == ']')
1338 coding->mode &= ~CODING_MODE_DIRECTION;
1339 else
1340 goto label_invalid_code;
1341 break;
1342
1343 case '2': /* start of right-to-left direction */
1344 ONE_MORE_BYTE (c1);
1345 if (c1 == ']')
1346 coding->mode |= CODING_MODE_DIRECTION;
1347 else
1348 goto label_invalid_code;
1349 break;
1350
1351 default:
1352 goto label_invalid_code;
1353 }
1354 break;
1355
1356 default:
1357 if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
1358 goto label_invalid_code;
1359 if (c1 >= 0x28 && c1 <= 0x2B)
1360 { /* designation of DIMENSION1_CHARS94 character set */
1361 ONE_MORE_BYTE (c2);
1362 DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
1363 }
1364 else if (c1 >= 0x2C && c1 <= 0x2F)
1365 { /* designation of DIMENSION1_CHARS96 character set */
1366 ONE_MORE_BYTE (c2);
1367 DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
1368 }
1369 else
1370 {
1371 goto label_invalid_code;
1372 }
1373 }
1374 /* We must update these variables now. */
1375 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1376 charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1377 break;
1378
1379 label_invalid_code:
1380 while (src_base < src)
1381 *dst++ = *src_base++;
1382 coding->fake_multibyte = 1;
1383 }
1384 continue;
1385
1386 label_end_of_loop:
1387 result = CODING_FINISH_INSUFFICIENT_SRC;
1388 label_end_of_loop_2:
1389 src = src_base;
1390 break;
1391 }
1392
1393 if (src < src_end)
1394 {
1395 if (result == CODING_FINISH_NORMAL)
1396 result = CODING_FINISH_INSUFFICIENT_DST;
1397 else if (result != CODING_FINISH_INCONSISTENT_EOL
1398 && coding->mode & CODING_MODE_LAST_BLOCK)
1399 {
1400 /* This is the last block of the text to be decoded. We had
1401 better just flush out all remaining codes in the text
1402 although they are not valid characters. */
1403 src_bytes = src_end - src;
1404 if (dst_bytes && (dst_end - dst < src_bytes))
1405 src_bytes = dst_end - dst;
1406 bcopy (src, dst, src_bytes);
1407 dst += src_bytes;
1408 src += src_bytes;
1409 coding->fake_multibyte = 1;
1410 }
1411 }
1412
1413 coding->consumed = coding->consumed_char = src - source;
1414 coding->produced = dst - destination;
1415 return result;
1416 }
1417
1418 /* ISO2022 encoding stuff. */
1419
1420 /*
1421 It is not enough to say just "ISO2022" on encoding, we have to
1422 specify more details. In Emacs, each coding system of ISO2022
1423 variant has the following specifications:
1424 1. Initial designation to G0 thru G3.
1425 2. Allows short-form designation?
1426 3. ASCII should be designated to G0 before control characters?
1427 4. ASCII should be designated to G0 at end of line?
1428 5. 7-bit environment or 8-bit environment?
1429 6. Use locking-shift?
1430 7. Use Single-shift?
1431 And the following two are only for Japanese:
1432 8. Use ASCII in place of JIS0201-1976-Roman?
1433 9. Use JISX0208-1983 in place of JISX0208-1978?
1434 These specifications are encoded in `coding->flags' as flag bits
1435 defined by macros CODING_FLAG_ISO_XXX. See `coding.h' for more
1436 details.
1437 */
1438
/* Emit at DST the escape sequence that designates CHARSET to graphic
   register REG, and record the designation in CODING.

   A revision sequence "ESC & <rev>" is emitted first when the revision
   number of CHARSET is below 255.  For a 2-byte 94-charset designated
   to register 0 whose <final-char> is '@', 'A' or 'B', the
   intermediate character is omitted (short form) if CODING has
   CODING_FLAG_ISO_SHORT_FORM set.  */

#define ENCODE_DESIGNATION(charset, reg, coding) \
  do { \
    unsigned char designation_final = CHARSET_ISO_FINAL_CHAR (charset); \
    int designation_rev = CODING_SPEC_ISO_REVISION_NUMBER(coding, charset); \
    int designation_two_byte = (CHARSET_DIMENSION (charset) != 1); \
    int designation_is_94 = (CHARSET_CHARS (charset) == 94); \
    /* Intermediate characters indexed by register number.  */ \
    char *designation_inter = designation_is_94 ? "()*+" : ",-./"; \
    \
    if (designation_rev < 255) \
      { \
        *dst++ = ISO_CODE_ESC; \
        *dst++ = '&'; \
        *dst++ = '@' + designation_rev; \
      } \
    *dst++ = ISO_CODE_ESC; \
    if (designation_two_byte) \
      *dst++ = '$'; \
    /* Everything except the short form carries an intermediate.  */ \
    if (! (designation_two_byte \
           && designation_is_94 \
           && (coding->flags & CODING_FLAG_ISO_SHORT_FORM) \
           && reg == 0 \
           && designation_final >= '@' && designation_final <= 'B')) \
      *dst++ = (unsigned char) (designation_inter[reg]); \
    *dst++ = designation_final; \
    CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset; \
  } while (0)
1480
/* ISO2022 single-shift functions.  Each macro emits at DST either the
   8-bit control character (SS2 / SS3) or, when CODING is a 7-bit
   environment, the equivalent escape sequence (ESC 'N' / ESC 'O'),
   and then marks CODING as single-shifting.  Emitting the 8-bit
   control also sets CODING->fake_multibyte.  */

#define ENCODE_SINGLE_SHIFT_2 \
  do { \
    if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)) \
      { \
        *dst++ = ISO_CODE_SS2; \
        coding->fake_multibyte = 1; \
      } \
    else \
      { \
        *dst++ = ISO_CODE_ESC; \
        *dst++ = 'N'; \
      } \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1; \
  } while (0)

#define ENCODE_SINGLE_SHIFT_3 \
  do { \
    if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)) \
      { \
        *dst++ = ISO_CODE_SS3; \
        coding->fake_multibyte = 1; \
      } \
    else \
      { \
        *dst++ = ISO_CODE_ESC; \
        *dst++ = 'O'; \
      } \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1; \
  } while (0)
1508
/* ISO2022 locking-shift functions (shift-in, shift-out,
   locking-shift-2 and locking-shift-3).  Each macro records in CODING
   which graphic register is now invoked to graphic plane 0 and emits
   the corresponding control character or escape sequence at DST.  */

#define ENCODE_SHIFT_IN \
  do { \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 0; \
    *dst++ = ISO_CODE_SI; \
  } while (0)

#define ENCODE_SHIFT_OUT \
  do { \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 1; \
    *dst++ = ISO_CODE_SO; \
  } while (0)

#define ENCODE_LOCKING_SHIFT_2 \
  do { \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 2; \
    *dst++ = ISO_CODE_ESC; \
    *dst++ = 'n'; \
  } while (0)

#define ENCODE_LOCKING_SHIFT_3 \
  do { \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 3; \
    *dst++ = ISO_CODE_ESC; \
    *dst++ = 'o'; \
  } while (0)
1536
/* Emit at DST the code for a DIMENSION1 character of CHARSET whose
   position-code is C1.

   The macro loops: if CHARSET is not yet reachable (neither
   single-shifted nor invoked to graphic plane 0 or 1), the needed
   designation and invocation sequences are produced by
   encode_invocation_designation and the test is repeated.  */


#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1) \
  do { \
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding)) \
      { \
        /* A single-shift was just produced; emit the byte and leave \
           single-shift state.  */ \
        *dst++ = ((coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
                  ? (c1 & 0x7F) : (c1 | 0x80)); \
        CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0; \
        break; \
      } \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0)) \
      { \
        *dst++ = c1 & 0x7F; \
        break; \
      } \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1)) \
      { \
        *dst++ = c1 | 0x80; \
        break; \
      } \
    if ((coding->flags & CODING_FLAG_ISO_SAFE) \
        && !coding->safe_charsets[charset]) \
      { \
        /* We should not encode this character, instead produce one or \
           two `?'s.  */ \
        *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION; \
        if (CHARSET_WIDTH (charset) == 2) \
          *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION; \
        break; \
      } \
    /* Since CHARSET is not yet invoked to any graphic planes, we \
       must invoke it, or, at first, designate it to some graphic \
       register.  Then repeat the loop to actually produce the \
       character.  */ \
    dst = encode_invocation_designation (charset, coding, dst); \
  } while (1)
1580
/* Emit at DST the codes for a DIMENSION2 character of CHARSET whose
   position-codes are C1 and C2.

   The macro loops: if CHARSET is not yet reachable (neither
   single-shifted nor invoked to graphic plane 0 or 1), the needed
   designation and invocation sequences are produced by
   encode_invocation_designation and the test is repeated.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2) \
  do { \
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding)) \
      { \
        /* A single-shift was just produced; emit both bytes and \
           leave single-shift state.  */ \
        if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
          { \
            *dst++ = c1 & 0x7F; \
            *dst++ = c2 & 0x7F; \
          } \
        else \
          { \
            *dst++ = c1 | 0x80; \
            *dst++ = c2 | 0x80; \
          } \
        CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0; \
        break; \
      } \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0)) \
      { \
        *dst++ = c1 & 0x7F; \
        *dst++ = c2 & 0x7F; \
        break; \
      } \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1)) \
      { \
        *dst++ = c1 | 0x80; \
        *dst++ = c2 | 0x80; \
        break; \
      } \
    if ((coding->flags & CODING_FLAG_ISO_SAFE) \
        && !coding->safe_charsets[charset]) \
      { \
        /* We should not encode this character, instead produce one or \
           two `?'s.  */ \
        *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION; \
        if (CHARSET_WIDTH (charset) == 2) \
          *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION; \
        break; \
      } \
    /* Since CHARSET is not yet invoked to any graphic planes, we \
       must invoke it, or, at first, designate it to some graphic \
       register.  Then repeat the loop to actually produce the \
       character.  */ \
    dst = encode_invocation_designation (charset, coding, dst); \
  } while (1)
1623
/* Encode one character of CHARSET with position-codes C1 and C2 at
   DST.  C1 and C2 must be lvalues: they are overwritten when
   TRANSLATION_TABLE maps the character to another one.

   A character whose (possibly translated) charset is not defined is
   written out as raw bytes, masked to 7 bits in a 7-bit environment.
   CODING->consumed_char is incremented unless a composition is in
   progress.

   NOTE(review): the USE_ROMAN and USE_OLDJIS substitutions test the
   original CHARSET, not the translated one -- confirm this is
   intended.  */
#define ENCODE_ISO_CHARACTER(charset, c1, c2) \
  do { \
    int iso_charset_alt = charset; \
    int iso_c_alt; \
    \
    if (!NILP (translation_table)) \
      { \
        iso_c_alt = translate_char (translation_table, -1, \
                                    charset, c1, c2); \
        if (iso_c_alt >= 0) \
          SPLIT_CHAR (iso_c_alt, iso_charset_alt, c1, c2); \
      } \
    if (! CHARSET_DEFINED_P (iso_charset_alt)) \
      { \
        if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
          { \
            *dst++ = charset & 0x7f; \
            *dst++ = c1 & 0x7f; \
            if (c2) \
              *dst++ = c2 & 0x7f; \
          } \
        else \
          { \
            *dst++ = charset; \
            *dst++ = c1; \
            if (c2) \
              *dst++ = c2; \
          } \
      } \
    else if (CHARSET_DIMENSION (iso_charset_alt) == 1) \
      { \
        if (charset == CHARSET_ASCII \
            && (coding->flags & CODING_FLAG_ISO_USE_ROMAN)) \
          iso_charset_alt = charset_latin_jisx0201; \
        ENCODE_ISO_CHARACTER_DIMENSION1 (iso_charset_alt, c1); \
      } \
    else \
      { \
        if (charset == charset_jisx0208 \
            && (coding->flags & CODING_FLAG_ISO_USE_OLDJIS)) \
          iso_charset_alt = charset_jisx0208_1978; \
        ENCODE_ISO_CHARACTER_DIMENSION2 (iso_charset_alt, c1, c2); \
      } \
    if (! COMPOSING_P (coding->composing)) \
      coding->consumed_char++; \
  } while (0)
1671
1672 /* Produce designation and invocation codes at a place pointed by DST
1673 to use CHARSET. The element `spec.iso2022' of *CODING is updated.
1674 Return new DST. */
1675
1676 unsigned char *
1677 encode_invocation_designation (charset, coding, dst)
1678 int charset;
1679 struct coding_system *coding;
1680 unsigned char *dst;
1681 {
1682 int reg; /* graphic register number */
1683
1684 /* At first, check designations. */
1685 for (reg = 0; reg < 4; reg++)
1686 if (charset == CODING_SPEC_ISO_DESIGNATION (coding, reg))
1687 break;
1688
1689 if (reg >= 4)
1690 {
1691 /* CHARSET is not yet designated to any graphic registers. */
1692 /* At first check the requested designation. */
1693 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1694 if (reg == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
1695 /* Since CHARSET requests no special designation, designate it
1696 to graphic register 0. */
1697 reg = 0;
1698
1699 ENCODE_DESIGNATION (charset, reg, coding);
1700 }
1701
1702 if (CODING_SPEC_ISO_INVOCATION (coding, 0) != reg
1703 && CODING_SPEC_ISO_INVOCATION (coding, 1) != reg)
1704 {
1705 /* Since the graphic register REG is not invoked to any graphic
1706 planes, invoke it to graphic plane 0. */
1707 switch (reg)
1708 {
1709 case 0: /* graphic register 0 */
1710 ENCODE_SHIFT_IN;
1711 break;
1712
1713 case 1: /* graphic register 1 */
1714 ENCODE_SHIFT_OUT;
1715 break;
1716
1717 case 2: /* graphic register 2 */
1718 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1719 ENCODE_SINGLE_SHIFT_2;
1720 else
1721 ENCODE_LOCKING_SHIFT_2;
1722 break;
1723
1724 case 3: /* graphic register 3 */
1725 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1726 ENCODE_SINGLE_SHIFT_3;
1727 else
1728 ENCODE_LOCKING_SHIFT_3;
1729 break;
1730 }
1731 }
1732 return dst;
1733 }
1734
/* The following three macros produce at DST the escape sequences that
   start a composition (without or with rules) and end one.  */
#define ENCODE_COMPOSITION_NO_RULE_START \
  (*dst++ = ISO_CODE_ESC, *dst++ = '0')
#define ENCODE_COMPOSITION_WITH_RULE_START \
  (*dst++ = ISO_CODE_ESC, *dst++ = '2')
#define ENCODE_COMPOSITION_END \
  (*dst++ = ISO_CODE_ESC, *dst++ = '1')
1739
/* The following three macros produce codes for indicating direction
   of text.

   ENCODE_CONTROL_SEQUENCE_INTRODUCER emits ESC '[' when CODING has the
   7-bit flag set and the single CSI control character otherwise.  The
   flag is tested as a bit, like every other use of
   CODING_FLAG_ISO_SEVEN_BITS; comparing the whole flag word for
   equality would fail whenever any other flag is also set.

   ENCODE_DIRECTION_R2L and ENCODE_DIRECTION_L2R are statement macros:
   a `do ... while (0)' cannot be followed by a comma operator.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER \
  do { \
    if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
      *dst++ = ISO_CODE_ESC, *dst++ = '['; \
    else \
      *dst++ = ISO_CODE_CSI; \
  } while (0)

#define ENCODE_DIRECTION_R2L \
  do { \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER; \
    *dst++ = '2'; \
    *dst++ = ']'; \
  } while (0)

#define ENCODE_DIRECTION_L2R \
  do { \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER; \
    *dst++ = '0'; \
    *dst++ = ']'; \
  } while (0)
1755
/* Bring the graphic planes and registers back to their initial state:
   emit a shift-in if graphic plane 0 does not currently invoke
   register 0, then re-designate every register whose current charset
   differs from its (non-negative) initial designation.  */
#define ENCODE_RESET_PLANE_AND_REGISTER \
  do { \
    int reset_reg = 0; \
    \
    if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0) \
      { \
        ENCODE_SHIFT_IN; \
      } \
    while (reset_reg < 4) \
      { \
        if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reset_reg) >= 0 \
            && (CODING_SPEC_ISO_DESIGNATION (coding, reset_reg) \
                != CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reset_reg))) \
          { \
            ENCODE_DESIGNATION \
              (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reset_reg), \
               reset_reg, coding); \
          } \
        reset_reg++; \
      } \
  } while (0)
1770
1771 /* Produce designation sequences of charsets in the line started from
1772 SRC to a place pointed by *DSTP, and update DSTP.
1773
1774 If the current block ends before any end-of-line, we may fail to
1775 find all the necessary designations. */
1776
1777 void
1778 encode_designation_at_bol (coding, table, src, src_end, dstp)
1779 struct coding_system *coding;
1780 Lisp_Object table;
1781 unsigned char *src, *src_end, **dstp;
1782 {
1783 int charset, c, found = 0, reg;
1784 /* Table of charsets to be designated to each graphic register. */
1785 int r[4];
1786 unsigned char *dst = *dstp;
1787
1788 for (reg = 0; reg < 4; reg++)
1789 r[reg] = -1;
1790
1791 while (src < src_end && *src != '\n' && found < 4)
1792 {
1793 int bytes = BYTES_BY_CHAR_HEAD (*src);
1794
1795 if (NILP (table))
1796 charset = CHARSET_AT (src);
1797 else
1798 {
1799 int c_alt;
1800 unsigned char c1, c2;
1801
1802 SPLIT_STRING(src, bytes, charset, c1, c2);
1803 if ((c_alt = translate_char (table, -1, charset, c1, c2)) >= 0)
1804 charset = CHAR_CHARSET (c_alt);
1805 }
1806
1807 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1808 if (reg != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION && r[reg] < 0)
1809 {
1810 found++;
1811 r[reg] = charset;
1812 }
1813
1814 src += bytes;
1815 }
1816
1817 if (found)
1818 {
1819 for (reg = 0; reg < 4; reg++)
1820 if (r[reg] >= 0
1821 && CODING_SPEC_ISO_DESIGNATION (coding, reg) != r[reg])
1822 ENCODE_DESIGNATION (r[reg], reg, coding);
1823 *dstp = dst;
1824 }
1825 }
1826
1827 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
1828
1829 int
1830 encode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
1831 struct coding_system *coding;
1832 unsigned char *source, *destination;
1833 int src_bytes, dst_bytes;
1834 {
1835 unsigned char *src = source;
1836 unsigned char *src_end = source + src_bytes;
1837 unsigned char *dst = destination;
1838 unsigned char *dst_end = destination + dst_bytes;
1839 /* Since the maximum bytes produced by each loop is 20, we subtract 19
1840 from DST_END to assure overflow checking is necessary only at the
1841 head of loop. */
1842 unsigned char *adjusted_dst_end = dst_end - 19;
1843 Lisp_Object translation_table
1844 = coding->translation_table_for_encode;
1845 int result = CODING_FINISH_NORMAL;
1846
1847 if (!NILP (Venable_character_translation) && NILP (translation_table))
1848 translation_table = Vstandard_translation_table_for_encode;
1849
1850 coding->consumed_char = 0;
1851 coding->fake_multibyte = 0;
1852 while (src < src_end && (dst_bytes
1853 ? (dst < adjusted_dst_end)
1854 : (dst < src - 19)))
1855 {
1856 /* SRC_BASE remembers the start position in source in each loop.
1857 The loop will be exited when there's not enough source text
1858 to analyze multi-byte codes (within macros ONE_MORE_BYTE,
1859 TWO_MORE_BYTES, and THREE_MORE_BYTES). In that case, SRC is
1860 reset to SRC_BASE before exiting. */
1861 unsigned char *src_base = src;
1862 int charset, c1, c2, c3, c4;
1863
1864 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL
1865 && CODING_SPEC_ISO_BOL (coding))
1866 {
1867 /* We have to produce designation sequences if any now. */
1868 encode_designation_at_bol (coding, translation_table,
1869 src, src_end, &dst);
1870 CODING_SPEC_ISO_BOL (coding) = 0;
1871 }
1872
1873 c1 = *src++;
1874 /* If we are seeing a component of a composite character, we are
1875 seeing a leading-code encoded irregularly for composition, or
1876 a composition rule if composing with rule. We must set C1 to
1877 a normal leading-code or an ASCII code. If we are not seeing
1878 a composite character, we must reset composition,
1879 designation, and invocation states. */
1880 if (COMPOSING_P (coding->composing))
1881 {
1882 if (c1 < 0xA0)
1883 {
1884 /* We are not in a composite character any longer. */
1885 coding->composing = COMPOSING_NO;
1886 ENCODE_RESET_PLANE_AND_REGISTER;
1887 ENCODE_COMPOSITION_END;
1888 }
1889 else
1890 {
1891 if (coding->composing == COMPOSING_WITH_RULE_RULE)
1892 {
1893 *dst++ = c1 & 0x7F;
1894 coding->composing = COMPOSING_WITH_RULE_HEAD;
1895 continue;
1896 }
1897 else if (coding->composing == COMPOSING_WITH_RULE_HEAD)
1898 coding->composing = COMPOSING_WITH_RULE_RULE;
1899 if (c1 == 0xA0)
1900 {
1901 /* This is an ASCII component. */
1902 ONE_MORE_BYTE (c1);
1903 c1 &= 0x7F;
1904 }
1905 else
1906 /* This is a leading-code of non ASCII component. */
1907 c1 -= 0x20;
1908 }
1909 }
1910
1911 /* Now encode one character. C1 is a control character, an
1912 ASCII character, or a leading-code of multi-byte character. */
1913 switch (emacs_code_class[c1])
1914 {
1915 case EMACS_ascii_code:
1916 c2 = 0;
1917 ENCODE_ISO_CHARACTER (CHARSET_ASCII, c1, /* dummy */ c2);
1918 break;
1919
1920 case EMACS_control_code:
1921 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
1922 ENCODE_RESET_PLANE_AND_REGISTER;
1923 *dst++ = c1;
1924 coding->consumed_char++;
1925 break;
1926
1927 case EMACS_carriage_return_code:
1928 if (! (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
1929 {
1930 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
1931 ENCODE_RESET_PLANE_AND_REGISTER;
1932 *dst++ = c1;
1933 coding->consumed_char++;
1934 break;
1935 }
1936 /* fall down to treat '\r' as '\n' ... */
1937
1938 case EMACS_linefeed_code:
1939 if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
1940 ENCODE_RESET_PLANE_AND_REGISTER;
1941 if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL)
1942 bcopy (coding->spec.iso2022.initial_designation,
1943 coding->spec.iso2022.current_designation,
1944 sizeof coding->spec.iso2022.initial_designation);
1945 if (coding->eol_type == CODING_EOL_LF
1946 || coding->eol_type == CODING_EOL_UNDECIDED)
1947 *dst++ = ISO_CODE_LF;
1948 else if (coding->eol_type == CODING_EOL_CRLF)
1949 *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
1950 else
1951 *dst++ = ISO_CODE_CR;
1952 CODING_SPEC_ISO_BOL (coding) = 1;
1953 coding->consumed_char++;
1954 break;
1955
1956 case EMACS_leading_code_2:
1957 ONE_MORE_BYTE (c2);
1958 c3 = 0;
1959 if (c2 < 0xA0)
1960 {
1961 /* invalid sequence */
1962 *dst++ = c1;
1963 src--;
1964 coding->consumed_char++;
1965 }
1966 else
1967 ENCODE_ISO_CHARACTER (c1, c2, /* dummy */ c3);
1968 break;
1969
1970 case EMACS_leading_code_3:
1971 TWO_MORE_BYTES (c2, c3);
1972 c4 = 0;
1973 if (c2 < 0xA0 || c3 < 0xA0)
1974 {
1975 /* invalid sequence */
1976 *dst++ = c1;
1977 src -= 2;
1978 coding->consumed_char++;
1979 }
1980 else if (c1 < LEADING_CODE_PRIVATE_11)
1981 ENCODE_ISO_CHARACTER (c1, c2, c3);
1982 else
1983 ENCODE_ISO_CHARACTER (c2, c3, /* dummy */ c4);
1984 break;
1985
1986 case EMACS_leading_code_4:
1987 THREE_MORE_BYTES (c2, c3, c4);
1988 if (c2 < 0xA0 || c3 < 0xA0 || c4 < 0xA0)
1989 {
1990 /* invalid sequence */
1991 *dst++ = c1;
1992 src -= 3;
1993 coding->consumed_char++;
1994 }
1995 else
1996 ENCODE_ISO_CHARACTER (c2, c3, c4);
1997 break;
1998
1999 case EMACS_leading_code_composition:
2000 ONE_MORE_BYTE (c2);
2001 if (c2 < 0xA0)
2002 {
2003 /* invalid sequence */
2004 *dst++ = c1;
2005 src--;
2006 coding->consumed_char++;
2007 }
2008 else if (c2 == 0xFF)
2009 {
2010 ENCODE_RESET_PLANE_AND_REGISTER;
2011 coding->composing = COMPOSING_WITH_RULE_HEAD;
2012 ENCODE_COMPOSITION_WITH_RULE_START;
2013 coding->consumed_char++;
2014 }
2015 else
2016 {
2017 ENCODE_RESET_PLANE_AND_REGISTER;
2018 /* Rewind one byte because it is a character code of
2019 composition elements. */
2020 src--;
2021 coding->composing = COMPOSING_NO_RULE_HEAD;
2022 ENCODE_COMPOSITION_NO_RULE_START;
2023 coding->consumed_char++;
2024 }
2025 break;
2026
2027 case EMACS_invalid_code:
2028 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
2029 ENCODE_RESET_PLANE_AND_REGISTER;
2030 *dst++ = c1;
2031 coding->consumed_char++;
2032 break;
2033 }
2034 continue;
2035 label_end_of_loop:
2036 result = CODING_FINISH_INSUFFICIENT_SRC;
2037 src = src_base;
2038 break;
2039 }
2040
2041 if (src < src_end && result == CODING_FINISH_NORMAL)
2042 result = CODING_FINISH_INSUFFICIENT_DST;
2043
2044 /* If this is the last block of the text to be encoded, we must
2045 reset graphic planes and registers to the initial state, and
2046 flush out the carryover if any. */
2047 if (coding->mode & CODING_MODE_LAST_BLOCK)
2048 {
2049 ENCODE_RESET_PLANE_AND_REGISTER;
2050 if (COMPOSING_P (coding->composing))
2051 ENCODE_COMPOSITION_END;
2052 if (result == CODING_FINISH_INSUFFICIENT_SRC)
2053 {
2054 while (src < src_end && dst < dst_end)
2055 *dst++ = *src++;
2056 }
2057 }
2058 coding->consumed = src - source;
2059 coding->produced = coding->produced_char = dst - destination;
2060 return result;
2061 }
2062
2063 \f
2064 /*** 4. SJIS and BIG5 handlers ***/
2065
2066 /* Although SJIS and BIG5 are not ISO's coding system, they are used
2067 quite widely. So, for the moment, Emacs supports them in the bare
2068 C code. But, in the future, they may be supported only by CCL. */
2069
/* SJIS is a coding system encoding three character sets: ASCII, right
   half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
   as is.  A character of charset katakana-jisx0201 is encoded by
   "position-code + 0x80".  A character of charset japanese-jisx0208
   is encoded in two bytes, but its two position-codes are divided and
   shifted so that they fit in the ranges below.

   --- CODE RANGE of SJIS ---
   (character set)      (range)
   ASCII                0x00 .. 0x7F
   KATAKANA-JISX0201    0xA0 .. 0xDF
   JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
            (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
   -------------------------------

*/
2086
2087 /* BIG5 is a coding system encoding two character sets: ASCII and
2088 Big5. An ASCII character is encoded as is. Big5 is a two-byte
2089 character set and is encoded in two-byte.
2090
2091 --- CODE RANGE of BIG5 ---
2092 (character set) (range)
2093 ASCII 0x00 .. 0x7F
2094 Big5 (1st byte) 0xA1 .. 0xFE
2095 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
2096 --------------------------
2097
2098 Since the number of characters in Big5 is larger than maximum
2099 characters in Emacs' charset (96x96), it can't be handled as one
2100 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
2101 and `charset-big5-2'. Both are DIMENSION2 and CHARS94. The former
2102 contains frequently used characters and the latter contains less
2103 frequently used characters. */
2104
/* Macros to decode or encode a character of Big5 in BIG5.  B1 and B2
   are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
   C1 and C2 are the 1st and 2nd position-codes of Emacs' internal
   format.  CHARSET is `charset_big5_1' or `charset_big5_2'.

   DECODE_BIG5 reads B1 and B2 and stores into CHARSET, C1 and C2;
   ENCODE_BIG5 reads CHARSET, C1 and C2 and stores into B1 and B2.
   The output arguments must therefore be lvalues.  Every argument is
   parenthesized in the expansion so that an expression such as
   `c & 0x7F' can be passed safely, but arguments are evaluated more
   than once and so must not have side effects.  */

/* Number of Big5 characters which have the same code in 1st byte.  */
#define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)

#define DECODE_BIG5(b1, b2, charset, c1, c2)                            \
  do {                                                                  \
    unsigned int temp                                                   \
      = (((b1) - 0xA1) * BIG5_SAME_ROW                                  \
         + (b2) - ((b2) < 0x7F ? 0x40 : 0x62));                         \
    if ((b1) < 0xC9)                                                    \
      (charset) = charset_big5_1;                                       \
    else                                                                \
      {                                                                 \
        (charset) = charset_big5_2;                                     \
        temp -= (0xC9 - 0xA1) * BIG5_SAME_ROW;                          \
      }                                                                 \
    (c1) = temp / (0xFF - 0xA1) + 0x21;                                 \
    (c2) = temp % (0xFF - 0xA1) + 0x21;                                 \
  } while (0)

#define ENCODE_BIG5(charset, c1, c2, b1, b2)                            \
  do {                                                                  \
    unsigned int temp                                                   \
      = ((c1) - 0x21) * (0xFF - 0xA1) + ((c2) - 0x21);                  \
    if ((charset) == charset_big5_2)                                    \
      temp += BIG5_SAME_ROW * (0xC9 - 0xA1);                            \
    (b1) = temp / BIG5_SAME_ROW + 0xA1;                                 \
    (b2) = temp % BIG5_SAME_ROW;                                        \
    (b2) += (b2) < 0x3F ? 0x40 : 0x62;                                  \
  } while (0)
2137
/* Hand the character (C1, C2) of CHARSET, just decoded from SJIS or
   BIG5 text, to the DECODE_CHARACTER_XXX macro that fits it.  The
   variable `translation_table' must be in scope where this is used:
   when it is non-nil and translate_char returns a non-negative
   character, that character is split back into a charset and
   position codes (C1 and C2 are passed to SPLIT_CHAR, so they
   presumably must be lvalues) and is used instead.  A negative
   charset is handled like ASCII.  */
#define DECODE_SJIS_BIG5_CHARACTER(charset, c1, c2)                     \
  do {                                                                  \
    int c_alt;                                                          \
    int charset_alt = (charset);                                        \
    if (!NILP (translation_table))                                      \
      {                                                                 \
        c_alt = translate_char (translation_table,                      \
                                -1, (charset), c1, c2);                 \
        if (c_alt >= 0)                                                 \
          SPLIT_CHAR (c_alt, charset_alt, c1, c2);                      \
      }                                                                 \
    if (charset_alt < 0 || charset_alt == CHARSET_ASCII)                \
      DECODE_CHARACTER_ASCII (c1);                                      \
    else if (CHARSET_DIMENSION (charset_alt) != 1)                      \
      DECODE_CHARACTER_DIMENSION2 (charset_alt, c1, c2);                \
    else                                                                \
      DECODE_CHARACTER_DIMENSION1 (charset_alt, c1);                    \
  } while (0)
2152
/* Encode the character (C1, C2) of CHARSET in SJIS (if the variable
   `sjis_p' is nonzero) or in BIG5 (otherwise) and store the result
   through `dst'.  The variables `coding', `dst', `sjis_p' and
   `translation_table' must be in scope where this is used, and C1 and
   C2 must be lvalues because they are modified here.

   When `translation_table' is non-nil and translate_char returns a
   non-negative character, that character is encoded instead.  A
   character that the target coding system cannot represent is stored
   as its charset followed by its position code(s) and
   `coding->fake_multibyte' is set, so up to three bytes may be
   stored.  `coding->consumed_char' is incremented in every case.

   The definition deliberately has no trailing semicolon, so that an
   invocation followed by `;' is exactly one statement and can be the
   body of an `if' that has an `else'.  */
#define ENCODE_SJIS_BIG5_CHARACTER(charset, c1, c2)                     \
  do {                                                                  \
    int c_alt, charset_alt;                                             \
    if (!NILP (translation_table)                                       \
        && ((c_alt = translate_char (translation_table, -1,             \
                                     (charset), c1, c2))                \
            >= 0))                                                      \
      SPLIT_CHAR (c_alt, charset_alt, c1, c2);                          \
    else                                                                \
      charset_alt = (charset);                                          \
    if (charset_alt == charset_ascii)                                   \
      *dst++ = c1;                                                      \
    else if (CHARSET_DIMENSION (charset_alt) == 1)                      \
      {                                                                 \
        if (sjis_p && charset_alt == charset_katakana_jisx0201)         \
          *dst++ = c1;                                                  \
        else if (sjis_p && charset_alt == charset_latin_jisx0201)       \
          *dst++ = c1 & 0x7F;                                           \
        else                                                            \
          {                                                             \
            *dst++ = charset_alt, *dst++ = c1;                          \
            coding->fake_multibyte = 1;                                 \
          }                                                             \
      }                                                                 \
    else                                                                \
      {                                                                 \
        c1 &= 0x7F, c2 &= 0x7F;                                         \
        if (sjis_p && (charset_alt == charset_jisx0208                  \
                       || charset_alt == charset_jisx0208_1978))        \
          {                                                             \
            unsigned char s1, s2;                                       \
                                                                        \
            ENCODE_SJIS (c1, c2, s1, s2);                               \
            *dst++ = s1, *dst++ = s2;                                   \
            coding->fake_multibyte = 1;                                 \
          }                                                             \
        else if (!sjis_p                                                \
                 && (charset_alt == charset_big5_1                      \
                     || charset_alt == charset_big5_2))                 \
          {                                                             \
            unsigned char b1, b2;                                       \
                                                                        \
            ENCODE_BIG5 (charset_alt, c1, c2, b1, b2);                  \
            *dst++ = b1, *dst++ = b2;                                   \
          }                                                             \
        else                                                            \
          {                                                             \
            *dst++ = charset_alt, *dst++ = c1, *dst++ = c2;             \
            coding->fake_multibyte = 1;                                 \
          }                                                             \
      }                                                                 \
    coding->consumed_char++;                                            \
  } while (0)
2206
2207 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2208 Check if a text is encoded in SJIS. If it is, return
2209 CODING_CATEGORY_MASK_SJIS, else return 0. */
2210
2211 int
2212 detect_coding_sjis (src, src_end)
2213 unsigned char *src, *src_end;
2214 {
2215 unsigned char c;
2216
2217 while (src < src_end)
2218 {
2219 c = *src++;
2220 if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
2221 {
2222 if (src < src_end && *src++ < 0x40)
2223 return 0;
2224 }
2225 }
2226 return CODING_CATEGORY_MASK_SJIS;
2227 }
2228
2229 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2230 Check if a text is encoded in BIG5. If it is, return
2231 CODING_CATEGORY_MASK_BIG5, else return 0. */
2232
2233 int
2234 detect_coding_big5 (src, src_end)
2235 unsigned char *src, *src_end;
2236 {
2237 unsigned char c;
2238
2239 while (src < src_end)
2240 {
2241 c = *src++;
2242 if (c >= 0xA1)
2243 {
2244 if (src >= src_end)
2245 break;
2246 c = *src++;
2247 if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
2248 return 0;
2249 }
2250 }
2251 return CODING_CATEGORY_MASK_BIG5;
2252 }
2253
2254 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
2255 If SJIS_P is 1, decode SJIS text, else decode BIG5 test. */
2256
2257 int
2258 decode_coding_sjis_big5 (coding, source, destination,
2259 src_bytes, dst_bytes, sjis_p)
2260 struct coding_system *coding;
2261 unsigned char *source, *destination;
2262 int src_bytes, dst_bytes;
2263 int sjis_p;
2264 {
2265 unsigned char *src = source;
2266 unsigned char *src_end = source + src_bytes;
2267 unsigned char *dst = destination;
2268 unsigned char *dst_end = destination + dst_bytes;
2269 /* Since the maximum bytes produced by each loop is 4, we subtract 3
2270 from DST_END to assure overflow checking is necessary only at the
2271 head of loop. */
2272 unsigned char *adjusted_dst_end = dst_end - 3;
2273 Lisp_Object translation_table
2274 = coding->translation_table_for_decode;
2275 int result = CODING_FINISH_NORMAL;
2276
2277 if (!NILP (Venable_character_translation) && NILP (translation_table))
2278 translation_table = Vstandard_translation_table_for_decode;
2279
2280 coding->produced_char = 0;
2281 coding->fake_multibyte = 0;
2282 while (src < src_end && (dst_bytes
2283 ? (dst < adjusted_dst_end)
2284 : (dst < src - 3)))
2285 {
2286 /* SRC_BASE remembers the start position in source in each loop.
2287 The loop will be exited when there's not enough source text
2288 to analyze two-byte character (within macro ONE_MORE_BYTE).
2289 In that case, SRC is reset to SRC_BASE before exiting. */
2290 unsigned char *src_base = src;
2291 unsigned char c1 = *src++, c2, c3, c4;
2292
2293 if (c1 < 0x20)
2294 {
2295 if (c1 == '\r')
2296 {
2297 if (coding->eol_type == CODING_EOL_CRLF)
2298 {
2299 ONE_MORE_BYTE (c2);
2300 if (c2 == '\n')
2301 *dst++ = c2;
2302 else if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2303 {
2304 result = CODING_FINISH_INCONSISTENT_EOL;
2305 goto label_end_of_loop_2;
2306 }
2307 else
2308 /* To process C2 again, SRC is subtracted by 1. */
2309 *dst++ = c1, src--;
2310 }
2311 else if (coding->eol_type == CODING_EOL_CR)
2312 *dst++ = '\n';
2313 else
2314 *dst++ = c1;
2315 }
2316 else if (c1 == '\n'
2317 && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2318 && (coding->eol_type == CODING_EOL_CR
2319 || coding->eol_type == CODING_EOL_CRLF))
2320 {
2321 result = CODING_FINISH_INCONSISTENT_EOL;
2322 goto label_end_of_loop_2;
2323 }
2324 else
2325 *dst++ = c1;
2326 coding->produced_char++;
2327 }
2328 else if (c1 < 0x80)
2329 {
2330 c2 = 0; /* avoid warning */
2331 DECODE_SJIS_BIG5_CHARACTER (charset_ascii, c1, /* dummy */ c2);
2332 }
2333 else
2334 {
2335 if (sjis_p)
2336 {
2337 if (c1 < 0xA0 || (c1 >= 0xE0 && c1 < 0xF0))
2338 {
2339 /* SJIS -> JISX0208 */
2340 ONE_MORE_BYTE (c2);
2341 if (c2 >= 0x40 && c2 != 0x7F && c2 <= 0xFC)
2342 {
2343 DECODE_SJIS (c1, c2, c3, c4);
2344 DECODE_SJIS_BIG5_CHARACTER (charset_jisx0208, c3, c4);
2345 }
2346 else
2347 goto label_invalid_code_2;
2348 }
2349 else if (c1 < 0xE0)
2350 /* SJIS -> JISX0201-Kana */
2351 {
2352 c2 = 0; /* avoid warning */
2353 DECODE_SJIS_BIG5_CHARACTER (charset_katakana_jisx0201, c1,
2354 /* dummy */ c2);
2355 }
2356 else
2357 goto label_invalid_code_1;
2358 }
2359 else
2360 {
2361 /* BIG5 -> Big5 */
2362 if (c1 >= 0xA1 && c1 <= 0xFE)
2363 {
2364 ONE_MORE_BYTE (c2);
2365 if ((c2 >= 0x40 && c2 <= 0x7E) || (c2 >= 0xA1 && c2 <= 0xFE))
2366 {
2367 int charset;
2368
2369 DECODE_BIG5 (c1, c2, charset, c3, c4);
2370 DECODE_SJIS_BIG5_CHARACTER (charset, c3, c4);
2371 }
2372 else
2373 goto label_invalid_code_2;
2374 }
2375 else
2376 goto label_invalid_code_1;
2377 }
2378 }
2379 continue;
2380
2381 label_invalid_code_1:
2382 *dst++ = c1;
2383 coding->produced_char++;
2384 coding->fake_multibyte = 1;
2385 continue;
2386
2387 label_invalid_code_2:
2388 *dst++ = c1; *dst++= c2;
2389 coding->produced_char += 2;
2390 coding->fake_multibyte = 1;
2391 continue;
2392
2393 label_end_of_loop:
2394 result = CODING_FINISH_INSUFFICIENT_SRC;
2395 label_end_of_loop_2:
2396 src = src_base;
2397 break;
2398 }
2399
2400 if (src < src_end)
2401 {
2402 if (result == CODING_FINISH_NORMAL)
2403 result = CODING_FINISH_INSUFFICIENT_DST;
2404 else if (result != CODING_FINISH_INCONSISTENT_EOL
2405 && coding->mode & CODING_MODE_LAST_BLOCK)
2406 {
2407 src_bytes = src_end - src;
2408 if (dst_bytes && (dst_end - dst < src_bytes))
2409 src_bytes = dst_end - dst;
2410 bcopy (dst, src, src_bytes);
2411 src += src_bytes;
2412 dst += src_bytes;
2413 coding->fake_multibyte = 1;
2414 }
2415 }
2416
2417 coding->consumed = coding->consumed_char = src - source;
2418 coding->produced = dst - destination;
2419 return result;
2420 }
2421
2422 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
2423 This function can encode `charset_ascii', `charset_katakana_jisx0201',
2424 `charset_jisx0208', `charset_big5_1', and `charset_big5-2'. We are
2425 sure that all these charsets are registered as official charset
2426 (i.e. do not have extended leading-codes). Characters of other
2427 charsets are produced without any encoding. If SJIS_P is 1, encode
2428 SJIS text, else encode BIG5 text. */
2429
2430 int
2431 encode_coding_sjis_big5 (coding, source, destination,
2432 src_bytes, dst_bytes, sjis_p)
2433 struct coding_system *coding;
2434 unsigned char *source, *destination;
2435 int src_bytes, dst_bytes;
2436 int sjis_p;
2437 {
2438 unsigned char *src = source;
2439 unsigned char *src_end = source + src_bytes;
2440 unsigned char *dst = destination;
2441 unsigned char *dst_end = destination + dst_bytes;
2442 /* Since the maximum bytes produced by each loop is 2, we subtract 1
2443 from DST_END to assure overflow checking is necessary only at the
2444 head of loop. */
2445 unsigned char *adjusted_dst_end = dst_end - 1;
2446 Lisp_Object translation_table
2447 = coding->translation_table_for_encode;
2448 int result = CODING_FINISH_NORMAL;
2449
2450 if (!NILP (Venable_character_translation) && NILP (translation_table))
2451 translation_table = Vstandard_translation_table_for_encode;
2452
2453 coding->consumed_char = 0;
2454 coding->fake_multibyte = 0;
2455 while (src < src_end && (dst_bytes
2456 ? (dst < adjusted_dst_end)
2457 : (dst < src - 1)))
2458 {
2459 /* SRC_BASE remembers the start position in source in each loop.
2460 The loop will be exited when there's not enough source text
2461 to analyze multi-byte codes (within macros ONE_MORE_BYTE and
2462 TWO_MORE_BYTES). In that case, SRC is reset to SRC_BASE
2463 before exiting. */
2464 unsigned char *src_base = src;
2465 unsigned char c1 = *src++, c2, c3, c4;
2466
2467 if (coding->composing)
2468 {
2469 if (c1 == 0xA0)
2470 {
2471 ONE_MORE_BYTE (c1);
2472 c1 &= 0x7F;
2473 }
2474 else if (c1 >= 0xA0)
2475 c1 -= 0x20;
2476 else
2477 coding->composing = 0;
2478 }
2479
2480 switch (emacs_code_class[c1])
2481 {
2482 case EMACS_ascii_code:
2483 ENCODE_SJIS_BIG5_CHARACTER (charset_ascii, c1, /* dummy */ c2);
2484 break;
2485
2486 case EMACS_control_code:
2487 *dst++ = c1;
2488 coding->consumed_char++;
2489 break;
2490
2491 case EMACS_carriage_return_code:
2492 if (! (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
2493 {
2494 *dst++ = c1;
2495 coding->consumed_char++;
2496 break;
2497 }
2498 /* fall down to treat '\r' as '\n' ... */
2499
2500 case EMACS_linefeed_code:
2501 if (coding->eol_type == CODING_EOL_LF
2502 || coding->eol_type == CODING_EOL_UNDECIDED)
2503 *dst++ = '\n';
2504 else if (coding->eol_type == CODING_EOL_CRLF)
2505 *dst++ = '\r', *dst++ = '\n';
2506 else
2507 *dst++ = '\r';
2508 coding->consumed_char++;
2509 break;
2510
2511 case EMACS_leading_code_2:
2512 ONE_MORE_BYTE (c2);
2513 ENCODE_SJIS_BIG5_CHARACTER (c1, c2, /* dummy */ c3);
2514 break;
2515
2516 case EMACS_leading_code_3:
2517 TWO_MORE_BYTES (c2, c3);
2518 ENCODE_SJIS_BIG5_CHARACTER (c1, c2, c3);
2519 break;
2520
2521 case EMACS_leading_code_4:
2522 THREE_MORE_BYTES (c2, c3, c4);
2523 ENCODE_SJIS_BIG5_CHARACTER (c2, c3, c4);
2524 break;
2525
2526 case EMACS_leading_code_composition:
2527 coding->composing = 1;
2528 break;
2529
2530 default: /* i.e. case EMACS_invalid_code: */
2531 *dst++ = c1;
2532 coding->consumed_char++;
2533 }
2534 continue;
2535
2536 label_end_of_loop:
2537 result = CODING_FINISH_INSUFFICIENT_SRC;
2538 src = src_base;
2539 break;
2540 }
2541
2542 if (result == CODING_FINISH_NORMAL
2543 && src < src_end)
2544 result = CODING_FINISH_INSUFFICIENT_DST;
2545 coding->consumed = src - source;
2546 coding->produced = coding->produced_char = dst - destination;
2547 return result;
2548 }
2549
2550 \f
2551 /*** 5. CCL handlers ***/
2552
2553 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2554 Check if a text is encoded in a coding system of which
2555 encoder/decoder are written in CCL program. If it is, return
2556 CODING_CATEGORY_MASK_CCL, else return 0. */
2557
2558 int
2559 detect_coding_ccl (src, src_end)
2560 unsigned char *src, *src_end;
2561 {
2562 unsigned char *valid;
2563
2564 /* No coding system is assigned to coding-category-ccl. */
2565 if (!coding_system_table[CODING_CATEGORY_IDX_CCL])
2566 return 0;
2567
2568 valid = coding_system_table[CODING_CATEGORY_IDX_CCL]->spec.ccl.valid_codes;
2569 while (src < src_end)
2570 {
2571 if (! valid[*src]) return 0;
2572 src++;
2573 }
2574 return CODING_CATEGORY_MASK_CCL;
2575 }
2576
2577 \f
2578 /*** 6. End-of-line handlers ***/
2579
2580 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
2581 This function is called only when `coding->eol_type' is
2582 CODING_EOL_CRLF or CODING_EOL_CR. */
2583
2584 int
2585 decode_eol (coding, source, destination, src_bytes, dst_bytes)
2586 struct coding_system *coding;
2587 unsigned char *source, *destination;
2588 int src_bytes, dst_bytes;
2589 {
2590 unsigned char *src = source;
2591 unsigned char *src_end = source + src_bytes;
2592 unsigned char *dst = destination;
2593 unsigned char *dst_end = destination + dst_bytes;
2594 unsigned char c;
2595 int result = CODING_FINISH_NORMAL;
2596
2597 coding->fake_multibyte = 0;
2598
2599 if (src_bytes <= 0)
2600 {
2601 coding->produced = coding->produced_char = 0;
2602 coding->consumed = coding->consumed_char = 0;
2603 return result;
2604 }
2605
2606 switch (coding->eol_type)
2607 {
2608 case CODING_EOL_CRLF:
2609 {
2610 /* Since the maximum bytes produced by each loop is 2, we
2611 subtract 1 from DST_END to assure overflow checking is
2612 necessary only at the head of loop. */
2613 unsigned char *adjusted_dst_end = dst_end - 1;
2614
2615 while (src < src_end && (dst_bytes
2616 ? (dst < adjusted_dst_end)
2617 : (dst < src - 1)))
2618 {
2619 unsigned char *src_base = src;
2620
2621 c = *src++;
2622 if (c == '\r')
2623 {
2624 ONE_MORE_BYTE (c);
2625 if (c == '\n')
2626 *dst++ = c;
2627 else
2628 {
2629 if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2630 {
2631 result = CODING_FINISH_INCONSISTENT_EOL;
2632 goto label_end_of_loop_2;
2633 }
2634 src--;
2635 *dst++ = '\r';
2636 if (BASE_LEADING_CODE_P (c))
2637 coding->fake_multibyte = 1;
2638 }
2639 }
2640 else if (c == '\n'
2641 && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL))
2642 {
2643 result = CODING_FINISH_INCONSISTENT_EOL;
2644 goto label_end_of_loop_2;
2645 }
2646 else
2647 {
2648 *dst++ = c;
2649 if (BASE_LEADING_CODE_P (c))
2650 coding->fake_multibyte = 1;
2651 }
2652 continue;
2653
2654 label_end_of_loop:
2655 result = CODING_FINISH_INSUFFICIENT_SRC;
2656 label_end_of_loop_2:
2657 src = src_base;
2658 break;
2659 }
2660 if (src < src_end)
2661 {
2662 if (result == CODING_FINISH_NORMAL)
2663 result = CODING_FINISH_INSUFFICIENT_DST;
2664 else if (result != CODING_FINISH_INCONSISTENT_EOL
2665 && coding->mode & CODING_MODE_LAST_BLOCK)
2666 {
2667 /* This is the last block of the text to be decoded.
2668 We flush out all remaining codes. */
2669 src_bytes = src_end - src;
2670 if (dst_bytes && (dst_end - dst < src_bytes))
2671 src_bytes = dst_end - dst;
2672 bcopy (src, dst, src_bytes);
2673 dst += src_bytes;
2674 src += src_bytes;
2675 }
2676 }
2677 }
2678 break;
2679
2680 case CODING_EOL_CR:
2681 if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2682 {
2683 while (src < src_end)
2684 {
2685 if ((c = *src++) == '\n')
2686 break;
2687 if (BASE_LEADING_CODE_P (c))
2688 coding->fake_multibyte = 1;
2689 }
2690 if (*--src == '\n')
2691 {
2692 src_bytes = src - source;
2693 result = CODING_FINISH_INCONSISTENT_EOL;
2694 }
2695 }
2696 if (dst_bytes && src_bytes > dst_bytes)
2697 {
2698 result = CODING_FINISH_INSUFFICIENT_DST;
2699 src_bytes = dst_bytes;
2700 }
2701 if (dst_bytes)
2702 bcopy (source, destination, src_bytes);
2703 else
2704 safe_bcopy (source, destination, src_bytes);
2705 src = source + src_bytes;
2706 while (src_bytes--) if (*dst++ == '\r') dst[-1] = '\n';
2707 break;
2708
2709 default: /* i.e. case: CODING_EOL_LF */
2710 if (dst_bytes && src_bytes > dst_bytes)
2711 {
2712 result = CODING_FINISH_INSUFFICIENT_DST;
2713 src_bytes = dst_bytes;
2714 }
2715 if (dst_bytes)
2716 bcopy (source, destination, src_bytes);
2717 else
2718 safe_bcopy (source, destination, src_bytes);
2719 src += src_bytes;
2720 dst += src_bytes;
2721 coding->fake_multibyte = 1;
2722 break;
2723 }
2724
2725 coding->consumed = coding->consumed_char = src - source;
2726 coding->produced = coding->produced_char = dst - destination;
2727 return result;
2728 }
2729
2730 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions". Encode
2731 format of end-of-line according to `coding->eol_type'. If
2732 `coding->mode & CODING_MODE_SELECTIVE_DISPLAY' is nonzero, code
2733 '\r' in source text also means end-of-line. */
2734
2735 int
2736 encode_eol (coding, source, destination, src_bytes, dst_bytes)
2737 struct coding_system *coding;
2738 unsigned char *source, *destination;
2739 int src_bytes, dst_bytes;
2740 {
2741 unsigned char *src = source;
2742 unsigned char *dst = destination;
2743 int result = CODING_FINISH_NORMAL;
2744
2745 coding->fake_multibyte = 0;
2746
2747 if (coding->eol_type == CODING_EOL_CRLF)
2748 {
2749 unsigned char c;
2750 unsigned char *src_end = source + src_bytes;
2751 unsigned char *dst_end = destination + dst_bytes;
2752 /* Since the maximum bytes produced by each loop is 2, we
2753 subtract 1 from DST_END to assure overflow checking is
2754 necessary only at the head of loop. */
2755 unsigned char *adjusted_dst_end = dst_end - 1;
2756
2757 while (src < src_end && (dst_bytes
2758 ? (dst < adjusted_dst_end)
2759 : (dst < src - 1)))
2760 {
2761 c = *src++;
2762 if (c == '\n'
2763 || (c == '\r' && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY)))
2764 *dst++ = '\r', *dst++ = '\n';
2765 else
2766 {
2767 *dst++ = c;
2768 if (BASE_LEADING_CODE_P (c))
2769 coding->fake_multibyte = 1;
2770 }
2771 }
2772 if (src < src_end)
2773 result = CODING_FINISH_INSUFFICIENT_DST;
2774 }
2775 else
2776 {
2777 unsigned char c;
2778
2779 if (dst_bytes && src_bytes > dst_bytes)
2780 {
2781 src_bytes = dst_bytes;
2782 result = CODING_FINISH_INSUFFICIENT_DST;
2783 }
2784 if (dst_bytes)
2785 bcopy (source, destination, src_bytes);
2786 else
2787 safe_bcopy (source, destination, src_bytes);
2788 dst_bytes = src_bytes;
2789 if (coding->eol_type == CODING_EOL_CR)
2790 {
2791 while (src_bytes--)
2792 {
2793 if ((c = *dst++) == '\n')
2794 dst[-1] = '\r';
2795 else if (BASE_LEADING_CODE_P (c))
2796 coding->fake_multibyte = 1;
2797 }
2798 }
2799 else
2800 {
2801 if (coding->mode & CODING_MODE_SELECTIVE_DISPLAY)
2802 {
2803 while (src_bytes--)
2804 if (*dst++ == '\r') dst[-1] = '\n';
2805 }
2806 coding->fake_multibyte = 1;
2807 }
2808 src = source + dst_bytes;
2809 dst = destination + dst_bytes;
2810 }
2811
2812 coding->consumed = coding->consumed_char = src - source;
2813 coding->produced = coding->produced_char = dst - destination;
2814 return result;
2815 }
2816
2817 \f
2818 /*** 7. C library functions ***/
2819
2820 /* In Emacs Lisp, coding system is represented by a Lisp symbol which
2821 has a property `coding-system'. The value of this property is a
2822 vector of length 5 (called as coding-vector). Among elements of
2823 this vector, the first (element[0]) and the fifth (element[4])
2824 carry important information for decoding/encoding. Before
2825 decoding/encoding, this information should be set in fields of a
2826 structure of type `coding_system'.
2827
2828 A value of property `coding-system' can be a symbol of another
2829 subsidiary coding-system. In that case, Emacs gets coding-vector
2830 from that symbol.
2831
2832 `element[0]' contains information to be set in `coding->type'. The
2833 value and its meaning is as follows:
2834
2835 0 -- coding_type_emacs_mule
2836 1 -- coding_type_sjis
2837 2 -- coding_type_iso2022
2838 3 -- coding_type_big5
2839 4 -- coding_type_ccl encoder/decoder written in CCL
2840 nil -- coding_type_no_conversion
2841 t -- coding_type_undecided (automatic conversion on decoding,
2842 no-conversion on encoding)
2843
2844 `element[4]' contains information to be set in `coding->flags' and
2845 `coding->spec'. The meaning varies by `coding->type'.
2846
2847 If `coding->type' is `coding_type_iso2022', element[4] is a vector
2848 of length 32 (of which the first 13 sub-elements are used now).
2849 Meanings of these sub-elements are:
2850
2851 sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
2852 If the value is an integer of valid charset, the charset is
2853 assumed to be designated to graphic register N initially.
2854
2855 If the value is minus, it is a minus value of charset which
2856 reserves graphic register N, which means that the charset is
2857 not designated initially but should be designated to graphic
2858 register N just before encoding a character in that charset.
2859
2860 If the value is nil, graphic register N is never used on
2861 encoding.
2862
2863 sub-element[N] where N is 4 through 11: to be set in `coding->flags'
2864 Each value takes t or nil. See the section ISO2022 of
2865 `coding.h' for more information.
2866
2867 If `coding->type' is `coding_type_big5', element[4] is t to denote
2868 BIG5-ETen or nil to denote BIG5-HKU.
2869
2870 If `coding->type' takes the other value, element[4] is ignored.
2871
2872 Emacs Lisp's coding system also carries information about format of
2873 end-of-line in a value of property `eol-type'. If the value is
2874 integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
2875 means CODING_EOL_CR. If it is not integer, it should be a vector
2876 of subsidiary coding systems of which property `eol-type' has one
2877 of above values.
2878
2879 */
2880
2881 /* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
2882 and set it in CODING. If CODING_SYSTEM_SYMBOL is invalid, CODING
2883 is setup so that no conversion is necessary and return -1, else
2884 return 0. */
2885
2886 int
2887 setup_coding_system (coding_system, coding)
2888 Lisp_Object coding_system;
2889 struct coding_system *coding;
2890 {
2891 Lisp_Object coding_spec, coding_type, eol_type, plist;
2892 Lisp_Object val;
2893 int i;
2894
2895 /* Initialize some fields required for all kinds of coding systems. */
2896 coding->symbol = coding_system;
2897 coding->common_flags = 0;
2898 coding->mode = 0;
2899 coding->heading_ascii = -1;
2900 coding->post_read_conversion = coding->pre_write_conversion = Qnil;
2901
2902 if (NILP (coding_system))
2903 goto label_invalid_coding_system;
2904
2905 coding_spec = Fget (coding_system, Qcoding_system);
2906
2907 if (!VECTORP (coding_spec)
2908 || XVECTOR (coding_spec)->size != 5
2909 || !CONSP (XVECTOR (coding_spec)->contents[3]))
2910 goto label_invalid_coding_system;
2911
2912 eol_type = inhibit_eol_conversion ? Qnil : Fget (coding_system, Qeol_type);
2913 if (VECTORP (eol_type))
2914 {
2915 coding->eol_type = CODING_EOL_UNDECIDED;
2916 coding->common_flags = CODING_REQUIRE_DETECTION_MASK;
2917 }
2918 else if (XFASTINT (eol_type) == 1)
2919 {
2920 coding->eol_type = CODING_EOL_CRLF;
2921 coding->common_flags
2922 = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
2923 }
2924 else if (XFASTINT (eol_type) == 2)
2925 {
2926 coding->eol_type = CODING_EOL_CR;
2927 coding->common_flags
2928 = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
2929 }
2930 else
2931 coding->eol_type = CODING_EOL_LF;
2932
2933 coding_type = XVECTOR (coding_spec)->contents[0];
2934 /* Try short cut. */
2935 if (SYMBOLP (coding_type))
2936 {
2937 if (EQ (coding_type, Qt))
2938 {
2939 coding->type = coding_type_undecided;
2940 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
2941 }
2942 else
2943 coding->type = coding_type_no_conversion;
2944 return 0;
2945 }
2946
2947 /* Initialize remaining fields. */
2948 coding->composing = 0;
2949 coding->composed_chars = 0;
2950
2951 /* Get values of coding system properties:
2952 `post-read-conversion', `pre-write-conversion',
2953 `translation-table-for-decode', `translation-table-for-encode'. */
2954 plist = XVECTOR (coding_spec)->contents[3];
  /* Pre & post conversion functions should be disabled if
     inhibit_pre_post_conversion is nonzero.  This is the case when a
     code conversion function is called while those functions are
     running.  */
2958 if (! inhibit_pre_post_conversion)
2959 {
2960 coding->post_read_conversion = Fplist_get (plist, Qpost_read_conversion);
2961 coding->pre_write_conversion = Fplist_get (plist, Qpre_write_conversion);
2962 }
2963 val = Fplist_get (plist, Qtranslation_table_for_decode);
2964 if (SYMBOLP (val))
2965 val = Fget (val, Qtranslation_table_for_decode);
2966 coding->translation_table_for_decode = CHAR_TABLE_P (val) ? val : Qnil;
2967 val = Fplist_get (plist, Qtranslation_table_for_encode);
2968 if (SYMBOLP (val))
2969 val = Fget (val, Qtranslation_table_for_encode);
2970 coding->translation_table_for_encode = CHAR_TABLE_P (val) ? val : Qnil;
2971 val = Fplist_get (plist, Qcoding_category);
2972 if (!NILP (val))
2973 {
2974 val = Fget (val, Qcoding_category_index);
2975 if (INTEGERP (val))
2976 coding->category_idx = XINT (val);
2977 else
2978 goto label_invalid_coding_system;
2979 }
2980 else
2981 goto label_invalid_coding_system;
2982
2983 val = Fplist_get (plist, Qsafe_charsets);
2984 if (EQ (val, Qt))
2985 {
2986 for (i = 0; i <= MAX_CHARSET; i++)
2987 coding->safe_charsets[i] = 1;
2988 }
2989 else
2990 {
2991 bzero (coding->safe_charsets, MAX_CHARSET + 1);
2992 while (CONSP (val))
2993 {
2994 if ((i = get_charset_id (XCAR (val))) >= 0)
2995 coding->safe_charsets[i] = 1;
2996 val = XCDR (val);
2997 }
2998 }
2999
3000 switch (XFASTINT (coding_type))
3001 {
3002 case 0:
3003 coding->type = coding_type_emacs_mule;
3004 if (!NILP (coding->post_read_conversion))
3005 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
3006 if (!NILP (coding->pre_write_conversion))
3007 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
3008 break;
3009
3010 case 1:
3011 coding->type = coding_type_sjis;
3012 coding->common_flags
3013 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3014 break;
3015
3016 case 2:
3017 coding->type = coding_type_iso2022;
3018 coding->common_flags
3019 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3020 {
3021 Lisp_Object val, temp;
3022 Lisp_Object *flags;
3023 int i, charset, reg_bits = 0;
3024
3025 val = XVECTOR (coding_spec)->contents[4];
3026
3027 if (!VECTORP (val) || XVECTOR (val)->size != 32)
3028 goto label_invalid_coding_system;
3029
3030 flags = XVECTOR (val)->contents;
3031 coding->flags
3032 = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
3033 | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
3034 | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
3035 | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
3036 | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
3037 | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
3038 | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
3039 | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
3040 | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION)
3041 | (NILP (flags[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL)
3042 | (NILP (flags[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL)
3043 | (NILP (flags[15]) ? 0 : CODING_FLAG_ISO_SAFE)
3044 | (NILP (flags[16]) ? 0 : CODING_FLAG_ISO_LATIN_EXTRA)
3045 );
3046
3047 /* Invoke graphic register 0 to plane 0. */
3048 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
3049 /* Invoke graphic register 1 to plane 1 if we can use full 8-bit. */
3050 CODING_SPEC_ISO_INVOCATION (coding, 1)
3051 = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
3052 /* Not single shifting at first. */
3053 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;
3054 /* Beginning of buffer should also be regarded as bol. */
3055 CODING_SPEC_ISO_BOL (coding) = 1;
3056
3057 for (charset = 0; charset <= MAX_CHARSET; charset++)
3058 CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = 255;
3059 val = Vcharset_revision_alist;
3060 while (CONSP (val))
3061 {
3062 charset = get_charset_id (Fcar_safe (XCAR (val)));
3063 if (charset >= 0
3064 && (temp = Fcdr_safe (XCAR (val)), INTEGERP (temp))
3065 && (i = XINT (temp), (i >= 0 && (i + '@') < 128)))
3066 CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = i;
3067 val = XCDR (val);
3068 }
3069
3070 /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
3071 FLAGS[REG] can be one of below:
3072 integer CHARSET: CHARSET occupies register I,
3073 t: designate nothing to REG initially, but can be used
3074 by any charsets,
3075 list of integer, nil, or t: designate the first
3076 element (if integer) to REG initially, the remaining
3077 elements (if integer) is designated to REG on request,
3078 if an element is t, REG can be used by any charsets,
3079 nil: REG is never used. */
3080 for (charset = 0; charset <= MAX_CHARSET; charset++)
3081 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3082 = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION;
3083 for (i = 0; i < 4; i++)
3084 {
3085 if (INTEGERP (flags[i])
3086 && (charset = XINT (flags[i]), CHARSET_VALID_P (charset))
3087 || (charset = get_charset_id (flags[i])) >= 0)
3088 {
3089 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3090 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
3091 }
3092 else if (EQ (flags[i], Qt))
3093 {
3094 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3095 reg_bits |= 1 << i;
3096 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
3097 }
3098 else if (CONSP (flags[i]))
3099 {
3100 Lisp_Object tail;
3101 tail = flags[i];
3102
3103 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
3104 if (INTEGERP (XCAR (tail))
3105 && (charset = XINT (XCAR (tail)),
3106 CHARSET_VALID_P (charset))
3107 || (charset = get_charset_id (XCAR (tail))) >= 0)
3108 {
3109 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3110 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
3111 }
3112 else
3113 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3114 tail = XCDR (tail);
3115 while (CONSP (tail))
3116 {
3117 if (INTEGERP (XCAR (tail))
3118 && (charset = XINT (XCAR (tail)),
3119 CHARSET_VALID_P (charset))
3120 || (charset = get_charset_id (XCAR (tail))) >= 0)
3121 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3122 = i;
3123 else if (EQ (XCAR (tail), Qt))
3124 reg_bits |= 1 << i;
3125 tail = XCDR (tail);
3126 }
3127 }
3128 else
3129 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3130
3131 CODING_SPEC_ISO_DESIGNATION (coding, i)
3132 = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
3133 }
3134
3135 if (reg_bits && ! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
3136 {
3137 /* REG 1 can be used only by locking shift in 7-bit env. */
3138 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
3139 reg_bits &= ~2;
3140 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
3141 /* Without any shifting, only REG 0 and 1 can be used. */
3142 reg_bits &= 3;
3143 }
3144
3145 if (reg_bits)
3146 for (charset = 0; charset <= MAX_CHARSET; charset++)
3147 {
3148 if (CHARSET_VALID_P (charset))
3149 {
3150 /* There exist some default graphic registers to be
3151 used CHARSET. */
3152
3153 /* We had better avoid designating a charset of
3154 CHARS96 to REG 0 as far as possible. */
3155 if (CHARSET_CHARS (charset) == 96)
3156 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3157 = (reg_bits & 2
3158 ? 1 : (reg_bits & 4 ? 2 : (reg_bits & 8 ? 3 : 0)));
3159 else
3160 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3161 = (reg_bits & 1
3162 ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
3163 }
3164 }
3165 }
3166 coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
3167 coding->spec.iso2022.last_invalid_designation_register = -1;
3168 break;
3169
3170 case 3:
3171 coding->type = coding_type_big5;
3172 coding->common_flags
3173 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3174 coding->flags
3175 = (NILP (XVECTOR (coding_spec)->contents[4])
3176 ? CODING_FLAG_BIG5_HKU
3177 : CODING_FLAG_BIG5_ETEN);
3178 break;
3179
3180 case 4:
3181 coding->type = coding_type_ccl;
3182 coding->common_flags
3183 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3184 {
3185 val = XVECTOR (coding_spec)->contents[4];
3186 if (! CONSP (val)
3187 || setup_ccl_program (&(coding->spec.ccl.decoder),
3188 XCAR (val)) < 0
3189 || setup_ccl_program (&(coding->spec.ccl.encoder),
3190 XCDR (val)) < 0)
3191 goto label_invalid_coding_system;
3192
3193 bzero (coding->spec.ccl.valid_codes, 256);
3194 val = Fplist_get (plist, Qvalid_codes);
3195 if (CONSP (val))
3196 {
3197 Lisp_Object this;
3198
3199 for (; CONSP (val); val = XCDR (val))
3200 {
3201 this = XCAR (val);
3202 if (INTEGERP (this)
3203 && XINT (this) >= 0 && XINT (this) < 256)
3204 coding->spec.ccl.valid_codes[XINT (this)] = 1;
3205 else if (CONSP (this)
3206 && INTEGERP (XCAR (this))
3207 && INTEGERP (XCDR (this)))
3208 {
3209 int start = XINT (XCAR (this));
3210 int end = XINT (XCDR (this));
3211
3212 if (start >= 0 && start <= end && end < 256)
3213 while (start <= end)
3214 coding->spec.ccl.valid_codes[start++] = 1;
3215 }
3216 }
3217 }
3218 }
3219 coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
3220 break;
3221
3222 case 5:
3223 coding->type = coding_type_raw_text;
3224 break;
3225
3226 default:
3227 goto label_invalid_coding_system;
3228 }
3229 return 0;
3230
3231 label_invalid_coding_system:
3232 coding->type = coding_type_no_conversion;
3233 coding->category_idx = CODING_CATEGORY_IDX_BINARY;
3234 coding->common_flags = 0;
3235 coding->eol_type = CODING_EOL_LF;
3236 coding->pre_write_conversion = coding->post_read_conversion = Qnil;
3237 return -1;
3238 }
3239
3240 /* Setup raw-text or one of its subsidiaries in the structure
3241 coding_system CODING according to the already setup value eol_type
3242 in CODING. CODING should be setup for some coding system in
3243 advance. */
3244
3245 void
3246 setup_raw_text_coding_system (coding)
3247 struct coding_system *coding;
3248 {
3249 if (coding->type != coding_type_raw_text)
3250 {
3251 coding->symbol = Qraw_text;
3252 coding->type = coding_type_raw_text;
3253 if (coding->eol_type != CODING_EOL_UNDECIDED)
3254 {
3255 Lisp_Object subsidiaries;
3256 subsidiaries = Fget (Qraw_text, Qeol_type);
3257
3258 if (VECTORP (subsidiaries)
3259 && XVECTOR (subsidiaries)->size == 3)
3260 coding->symbol
3261 = XVECTOR (subsidiaries)->contents[coding->eol_type];
3262 }
3263 setup_coding_system (coding->symbol, coding);
3264 }
3265 return;
3266 }
3267
3268 /* Emacs has a mechanism to automatically detect a coding system if it
3269 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
3270 it's impossible to distinguish some coding systems accurately
because they use the same range of codes.  So, at first, coding
systems are categorized into the following categories:
3273
3274 o coding-category-emacs-mule
3275
3276 The category for a coding system which has the same code range
3277 as Emacs' internal format. Assigned the coding-system (Lisp
3278 symbol) `emacs-mule' by default.
3279
3280 o coding-category-sjis
3281
3282 The category for a coding system which has the same code range
3283 as SJIS. Assigned the coding-system (Lisp
3284 symbol) `japanese-shift-jis' by default.
3285
3286 o coding-category-iso-7
3287
3288 The category for a coding system which has the same code range
3289 as ISO2022 of 7-bit environment. This doesn't use any locking
3290 shift and single shift functions. This can encode/decode all
3291 charsets. Assigned the coding-system (Lisp symbol)
3292 `iso-2022-7bit' by default.
3293
3294 o coding-category-iso-7-tight
3295
3296 Same as coding-category-iso-7 except that this can
3297 encode/decode only the specified charsets.
3298
3299 o coding-category-iso-8-1
3300
3301 The category for a coding system which has the same code range
3302 as ISO2022 of 8-bit environment and graphic plane 1 used only
3303 for DIMENSION1 charset. This doesn't use any locking shift
3304 and single shift functions. Assigned the coding-system (Lisp
3305 symbol) `iso-latin-1' by default.
3306
3307 o coding-category-iso-8-2
3308
3309 The category for a coding system which has the same code range
3310 as ISO2022 of 8-bit environment and graphic plane 1 used only
3311 for DIMENSION2 charset. This doesn't use any locking shift
3312 and single shift functions. Assigned the coding-system (Lisp
3313 symbol) `japanese-iso-8bit' by default.
3314
3315 o coding-category-iso-7-else
3316
3317 The category for a coding system which has the same code range
as ISO2022 of 7-bit environment but uses locking shift or
3319 single shift functions. Assigned the coding-system (Lisp
3320 symbol) `iso-2022-7bit-lock' by default.
3321
3322 o coding-category-iso-8-else
3323
3324 The category for a coding system which has the same code range
as ISO2022 of 8-bit environment but uses locking shift or
3326 single shift functions. Assigned the coding-system (Lisp
3327 symbol) `iso-2022-8bit-ss2' by default.
3328
3329 o coding-category-big5
3330
3331 The category for a coding system which has the same code range
3332 as BIG5. Assigned the coding-system (Lisp symbol)
3333 `cn-big5' by default.
3334
3335 o coding-category-ccl
3336
3337 The category for a coding system of which encoder/decoder is
3338 written in CCL programs. The default value is nil, i.e., no
3339 coding system is assigned.
3340
3341 o coding-category-binary
3342
3343 The category for a coding system not categorized in any of the
3344 above. Assigned the coding-system (Lisp symbol)
3345 `no-conversion' by default.
3346
3347 Each of them is a Lisp symbol and the value is an actual
3348 `coding-system's (this is also a Lisp symbol) assigned by a user.
3349 What Emacs does actually is to detect a category of coding system.
3350 Then, it uses a `coding-system' assigned to it. If Emacs can't
3351 decide only one possible category, it selects a category of the
3352 highest priority. Priorities of categories are also specified by a
3353 user in a Lisp variable `coding-category-list'.
3354
3355 */
3356
3357 static
3358 int ascii_skip_code[256];
3359
3360 /* Detect how a text of length SRC_BYTES pointed by SOURCE is encoded.
3361 If it detects possible coding systems, return an integer in which
3362 appropriate flag bits are set. Flag bits are defined by macros
3363 CODING_CATEGORY_MASK_XXX in `coding.h'.
3364
3365 How many ASCII characters are at the head is returned as *SKIP. */
3366
3367 static int
3368 detect_coding_mask (source, src_bytes, priorities, skip)
3369 unsigned char *source;
3370 int src_bytes, *priorities, *skip;
3371 {
3372 register unsigned char c;
3373 unsigned char *src = source, *src_end = source + src_bytes;
3374 unsigned int mask;
3375 int i;
3376
3377 /* At first, skip all ASCII characters and control characters except
3378 for three ISO2022 specific control characters. */
3379 ascii_skip_code[ISO_CODE_SO] = 0;
3380 ascii_skip_code[ISO_CODE_SI] = 0;
3381 ascii_skip_code[ISO_CODE_ESC] = 0;
3382
3383 label_loop_detect_coding:
3384 while (src < src_end && ascii_skip_code[*src]) src++;
3385 *skip = src - source;
3386
3387 if (src >= src_end)
3388 /* We found nothing other than ASCII. There's nothing to do. */
3389 return 0;
3390
3391 c = *src;
3392 /* The text seems to be encoded in some multilingual coding system.
3393 Now, try to find in which coding system the text is encoded. */
3394 if (c < 0x80)
3395 {
3396 /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
3397 /* C is an ISO2022 specific control code of C0. */
3398 mask = detect_coding_iso2022 (src, src_end);
3399 if (mask == 0)
3400 {
3401 /* No valid ISO2022 code follows C. Try again. */
3402 src++;
3403 if (c == ISO_CODE_ESC)
3404 ascii_skip_code[ISO_CODE_ESC] = 1;
3405 else
3406 ascii_skip_code[ISO_CODE_SO] = ascii_skip_code[ISO_CODE_SI] = 1;
3407 goto label_loop_detect_coding;
3408 }
3409 if (priorities)
3410 goto label_return_highest_only;
3411 }
3412 else
3413 {
3414 int try;
3415
3416 if (c < 0xA0)
3417 {
3418 /* C is the first byte of SJIS character code,
3419 or a leading-code of Emacs' internal format (emacs-mule). */
3420 try = CODING_CATEGORY_MASK_SJIS | CODING_CATEGORY_MASK_EMACS_MULE;
3421
3422 /* Or, if C is a special latin extra code,
3423 or is an ISO2022 specific control code of C1 (SS2 or SS3),
3424 or is an ISO2022 control-sequence-introducer (CSI),
3425 we should also consider the possibility of ISO2022 codings. */
3426 if ((VECTORP (Vlatin_extra_code_table)
3427 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
3428 || (c == ISO_CODE_SS2 || c == ISO_CODE_SS3)
3429 || (c == ISO_CODE_CSI
3430 && (src < src_end
3431 && (*src == ']'
3432 || ((*src == '0' || *src == '1' || *src == '2')
3433 && src + 1 < src_end
3434 && src[1] == ']')))))
3435 try |= (CODING_CATEGORY_MASK_ISO_8_ELSE
3436 | CODING_CATEGORY_MASK_ISO_8BIT);
3437 }
3438 else
3439 /* C is a character of ISO2022 in graphic plane right,
3440 or a SJIS's 1-byte character code (i.e. JISX0201),
3441 or the first byte of BIG5's 2-byte code. */
3442 try = (CODING_CATEGORY_MASK_ISO_8_ELSE
3443 | CODING_CATEGORY_MASK_ISO_8BIT
3444 | CODING_CATEGORY_MASK_SJIS
3445 | CODING_CATEGORY_MASK_BIG5);
3446
3447 /* Or, we may have to consider the possibility of CCL. */
3448 if (coding_system_table[CODING_CATEGORY_IDX_CCL]
3449 && (coding_system_table[CODING_CATEGORY_IDX_CCL]
3450 ->spec.ccl.valid_codes)[c])
3451 try |= CODING_CATEGORY_MASK_CCL;
3452
3453 mask = 0;
3454 if (priorities)
3455 {
3456 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3457 {
3458 if (priorities[i] & try & CODING_CATEGORY_MASK_ISO)
3459 mask = detect_coding_iso2022 (src, src_end);
3460 else if (priorities[i] & try & CODING_CATEGORY_MASK_SJIS)
3461 mask = detect_coding_sjis (src, src_end);
3462 else if (priorities[i] & try & CODING_CATEGORY_MASK_BIG5)
3463 mask = detect_coding_big5 (src, src_end);
3464 else if (priorities[i] & try & CODING_CATEGORY_MASK_EMACS_MULE)
3465 mask = detect_coding_emacs_mule (src, src_end);
3466 else if (priorities[i] & try & CODING_CATEGORY_MASK_CCL)
3467 mask = detect_coding_ccl (src, src_end);
3468 else if (priorities[i] & CODING_CATEGORY_MASK_RAW_TEXT)
3469 mask = CODING_CATEGORY_MASK_RAW_TEXT;
3470 else if (priorities[i] & CODING_CATEGORY_MASK_BINARY)
3471 mask = CODING_CATEGORY_MASK_BINARY;
3472 if (mask)
3473 goto label_return_highest_only;
3474 }
3475 return CODING_CATEGORY_MASK_RAW_TEXT;
3476 }
3477 if (try & CODING_CATEGORY_MASK_ISO)
3478 mask |= detect_coding_iso2022 (src, src_end);
3479 if (try & CODING_CATEGORY_MASK_SJIS)
3480 mask |= detect_coding_sjis (src, src_end);
3481 if (try & CODING_CATEGORY_MASK_BIG5)
3482 mask |= detect_coding_big5 (src, src_end);
3483 if (try & CODING_CATEGORY_MASK_EMACS_MULE)
3484 mask |= detect_coding_emacs_mule (src, src_end);
3485 if (try & CODING_CATEGORY_MASK_CCL)
3486 mask |= detect_coding_ccl (src, src_end);
3487 }
3488 return (mask | CODING_CATEGORY_MASK_RAW_TEXT | CODING_CATEGORY_MASK_BINARY);
3489
3490 label_return_highest_only:
3491 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3492 {
3493 if (mask & priorities[i])
3494 return priorities[i];
3495 }
3496 return CODING_CATEGORY_MASK_RAW_TEXT;
3497 }
3498
3499 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
3500 The information of the detected coding system is set in CODING. */
3501
3502 void
3503 detect_coding (coding, src, src_bytes)
3504 struct coding_system *coding;
3505 unsigned char *src;
3506 int src_bytes;
3507 {
3508 unsigned int idx;
3509 int skip, mask, i;
3510 Lisp_Object val;
3511
3512 val = Vcoding_category_list;
3513 mask = detect_coding_mask (src, src_bytes, coding_priorities, &skip);
3514 coding->heading_ascii = skip;
3515
3516 if (!mask) return;
3517
3518 /* We found a single coding system of the highest priority in MASK. */
3519 idx = 0;
3520 while (mask && ! (mask & 1)) mask >>= 1, idx++;
3521 if (! mask)
3522 idx = CODING_CATEGORY_IDX_RAW_TEXT;
3523
3524 val = XSYMBOL (XVECTOR (Vcoding_category_table)->contents[idx])->value;
3525
3526 if (coding->eol_type != CODING_EOL_UNDECIDED)
3527 {
3528 Lisp_Object tmp;
3529
3530 tmp = Fget (val, Qeol_type);
3531 if (VECTORP (tmp))
3532 val = XVECTOR (tmp)->contents[coding->eol_type];
3533 }
3534 setup_coding_system (val, coding);
3535 /* Set this again because setup_coding_system reset this member. */
3536 coding->heading_ascii = skip;
3537 }
3538
3539 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
3540 SOURCE is encoded. Return one of CODING_EOL_LF, CODING_EOL_CRLF,
3541 CODING_EOL_CR, and CODING_EOL_UNDECIDED.
3542
3543 How many non-eol characters are at the head is returned as *SKIP. */
3544
3545 #define MAX_EOL_CHECK_COUNT 3
3546
3547 static int
3548 detect_eol_type (source, src_bytes, skip)
3549 unsigned char *source;
3550 int src_bytes, *skip;
3551 {
3552 unsigned char *src = source, *src_end = src + src_bytes;
3553 unsigned char c;
3554 int total = 0; /* How many end-of-lines are found so far. */
3555 int eol_type = CODING_EOL_UNDECIDED;
3556 int this_eol_type;
3557
3558 *skip = 0;
3559
3560 while (src < src_end && total < MAX_EOL_CHECK_COUNT)
3561 {
3562 c = *src++;
3563 if (c == '\n' || c == '\r')
3564 {
3565 if (*skip == 0)
3566 *skip = src - 1 - source;
3567 total++;
3568 if (c == '\n')
3569 this_eol_type = CODING_EOL_LF;
3570 else if (src >= src_end || *src != '\n')
3571 this_eol_type = CODING_EOL_CR;
3572 else
3573 this_eol_type = CODING_EOL_CRLF, src++;
3574
3575 if (eol_type == CODING_EOL_UNDECIDED)
3576 /* This is the first end-of-line. */
3577 eol_type = this_eol_type;
3578 else if (eol_type != this_eol_type)
3579 {
3580 /* The found type is different from what found before. */
3581 eol_type = CODING_EOL_INCONSISTENT;
3582 break;
3583 }
3584 }
3585 }
3586
3587 if (*skip == 0)
3588 *skip = src_end - source;
3589 return eol_type;
3590 }
3591
3592 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
3593 is encoded. If it detects an appropriate format of end-of-line, it
3594 sets the information in *CODING. */
3595
3596 void
3597 detect_eol (coding, src, src_bytes)
3598 struct coding_system *coding;
3599 unsigned char *src;
3600 int src_bytes;
3601 {
3602 Lisp_Object val;
3603 int skip;
3604 int eol_type = detect_eol_type (src, src_bytes, &skip);
3605
3606 if (coding->heading_ascii > skip)
3607 coding->heading_ascii = skip;
3608 else
3609 skip = coding->heading_ascii;
3610
3611 if (eol_type == CODING_EOL_UNDECIDED)
3612 return;
3613 if (eol_type == CODING_EOL_INCONSISTENT)
3614 {
3615 #if 0
3616 /* This code is suppressed until we find a better way to
3617 distinguish raw text file and binary file. */
3618
3619 /* If we have already detected that the coding is raw-text, the
3620 coding should actually be no-conversion. */
3621 if (coding->type == coding_type_raw_text)
3622 {
3623 setup_coding_system (Qno_conversion, coding);
3624 return;
3625 }
3626 /* Else, let's decode only text code anyway. */
3627 #endif /* 0 */
3628 eol_type = CODING_EOL_LF;
3629 }
3630
3631 val = Fget (coding->symbol, Qeol_type);
3632 if (VECTORP (val) && XVECTOR (val)->size == 3)
3633 {
3634 setup_coding_system (XVECTOR (val)->contents[eol_type], coding);
3635 coding->heading_ascii = skip;
3636 }
3637 }
3638
/* Extra bytes added on top of the estimated conversion buffer size.  */
#define CONVERSION_BUFFER_EXTRA_ROOM 256

/* Multiplier applied to the source size when sizing a decoding
   buffer for CODING (a pointer to struct coding_system).  CODING is
   parenthesized so that any pointer expression may be passed; note
   that it is evaluated more than once.  */
#define DECODING_BUFFER_MAG(coding)					\
  ((coding)->type == coding_type_iso2022				\
   ? 3									\
   : (((coding)->type == coding_type_sjis				\
       || (coding)->type == coding_type_big5)				\
      ? 2								\
      : ((coding)->type == coding_type_raw_text				\
	 ? 1								\
	 : ((coding)->type == coding_type_ccl				\
	    ? (coding)->spec.ccl.decoder.buf_magnification		\
	    : 2))))
3651
3652 /* Return maximum size (bytes) of a buffer enough for decoding
3653 SRC_BYTES of text encoded in CODING. */
3654
3655 int
3656 decoding_buffer_size (coding, src_bytes)
3657 struct coding_system *coding;
3658 int src_bytes;
3659 {
3660 return (src_bytes * DECODING_BUFFER_MAG (coding)
3661 + CONVERSION_BUFFER_EXTRA_ROOM);
3662 }
3663
3664 /* Return maximum size (bytes) of a buffer enough for encoding
3665 SRC_BYTES of text to CODING. */
3666
3667 int
3668 encoding_buffer_size (coding, src_bytes)
3669 struct coding_system *coding;
3670 int src_bytes;
3671 {
3672 int magnification;
3673
3674 if (coding->type == coding_type_ccl)
3675 magnification = coding->spec.ccl.encoder.buf_magnification;
3676 else
3677 magnification = 3;
3678
3679 return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
3680 }
3681
#ifndef MINIMUM_CONVERSION_BUFFER_SIZE
#define MINIMUM_CONVERSION_BUFFER_SIZE 1024
#endif

/* Shared work buffer handed out by get_conversion_buffer, and its
   current size in bytes.  */
char *conversion_buffer;
int conversion_buffer_size;

/* Return a pointer to a SIZE bytes of buffer to be used for encoding
   or decoding.  The shared buffer is reused when it is already large
   enough; otherwise it is replaced by a new one whose size is the
   current size doubled until it reaches SIZE.  The old contents are
   not preserved.

   NOTE(review): this function does not check the result of xmalloc,
   so whether NULL can ever be returned depends on how xmalloc behaves
   when memory runs out -- confirm against its definition.  */

char *
get_conversion_buffer (size)
     int size;
{
  if (size > conversion_buffer_size)
    {
      char *buf;
      int real_size = conversion_buffer_size * 2;

      /* Doubling zero never grows, which would make the loop below
	 spin forever if we are called while conversion_buffer_size is
	 still 0.  Start from the minimum size in that case.  */
      if (real_size < MINIMUM_CONVERSION_BUFFER_SIZE)
	real_size = MINIMUM_CONVERSION_BUFFER_SIZE;

      while (real_size < size) real_size *= 2;
      buf = (char *) xmalloc (real_size);
      xfree (conversion_buffer);
      conversion_buffer = buf;
      conversion_buffer_size = real_size;
    }
  return conversion_buffer;
}
3710
3711 int
3712 ccl_coding_driver (coding, source, destination, src_bytes, dst_bytes, encodep)
3713 struct coding_system *coding;
3714 unsigned char *source, *destination;
3715 int src_bytes, dst_bytes, encodep;
3716 {
3717 struct ccl_program *ccl
3718 = encodep ? &coding->spec.ccl.encoder : &coding->spec.ccl.decoder;
3719 int result;
3720
3721 ccl->last_block = coding->mode & CODING_MODE_LAST_BLOCK;
3722
3723 coding->produced = ccl_driver (ccl, source, destination,
3724 src_bytes, dst_bytes, &(coding->consumed));
3725 coding->produced_char
3726 = (encodep
3727 ? coding->produced
3728 : multibyte_chars_in_text (destination, coding->produced));
3729 coding->consumed_char
3730 = multibyte_chars_in_text (source, coding->consumed);
3731
3732 switch (ccl->status)
3733 {
3734 case CCL_STAT_SUSPEND_BY_SRC:
3735 result = CODING_FINISH_INSUFFICIENT_SRC;
3736 break;
3737 case CCL_STAT_SUSPEND_BY_DST:
3738 result = CODING_FINISH_INSUFFICIENT_DST;
3739 break;
3740 case CCL_STAT_QUIT:
3741 case CCL_STAT_INVALID_CMD:
3742 result = CODING_FINISH_INTERRUPT;
3743 break;
3744 default:
3745 result = CODING_FINISH_NORMAL;
3746 break;
3747 }
3748 return result;
3749 }
3750
3751 /* See "GENERAL NOTES about `decode_coding_XXX ()' functions". Before
3752 decoding, it may detect coding system and format of end-of-line if
3753 those are not yet decided.
3754
3755 This function does not make full use of DESTINATION buffer. For
3756 instance, if coding->type is coding_type_iso2022, it uses only
3757 (DST_BYTES - 7) bytes of DESTINATION buffer. In the case that
3758 DST_BYTES is decided by the function decoding_buffer_size, it
3759 contains extra 256 bytes (defined by CONVERSION_BUFFER_EXTRA_ROOM).
3760 So, this function can decode the full SOURCE. But, in the other
3761 case, if you want to avoid carry over, you must supply at least 7
3762 bytes more area in DESTINATION buffer than expected maximum bytes
3763 that will be produced by this function. */
3764
3765 int
3766 decode_coding (coding, source, destination, src_bytes, dst_bytes)
3767 struct coding_system *coding;
3768 unsigned char *source, *destination;
3769 int src_bytes, dst_bytes;
3770 {
3771 int result;
3772
3773 if (src_bytes <= 0
3774 && coding->type != coding_type_ccl
3775 && ! (coding->mode & CODING_MODE_LAST_BLOCK
3776 && CODING_REQUIRE_FLUSHING (coding)))
3777 {
3778 coding->produced = coding->produced_char = 0;
3779 coding->consumed = coding->consumed_char = 0;
3780 coding->fake_multibyte = 0;
3781 return CODING_FINISH_NORMAL;
3782 }
3783
3784 if (coding->type == coding_type_undecided)
3785 detect_coding (coding, source, src_bytes);
3786
3787 if (coding->eol_type == CODING_EOL_UNDECIDED)
3788 detect_eol (coding, source, src_bytes);
3789
3790 switch (coding->type)
3791 {
3792 case coding_type_emacs_mule:
3793 case coding_type_undecided:
3794 case coding_type_raw_text:
3795 if (coding->eol_type == CODING_EOL_LF
3796 || coding->eol_type == CODING_EOL_UNDECIDED)
3797 goto label_no_conversion;
3798 result = decode_eol (coding, source, destination, src_bytes, dst_bytes);
3799 break;
3800
3801 case coding_type_sjis:
3802 result = decode_coding_sjis_big5 (coding, source, destination,
3803 src_bytes, dst_bytes, 1);
3804 break;
3805
3806 case coding_type_iso2022:
3807 result = decode_coding_iso2022 (coding, source, destination,
3808 src_bytes, dst_bytes);
3809 break;
3810
3811 case coding_type_big5:
3812 result = decode_coding_sjis_big5 (coding, source, destination,
3813 src_bytes, dst_bytes, 0);
3814 break;
3815
3816 case coding_type_ccl:
3817 result = ccl_coding_driver (coding, source, destination,
3818 src_bytes, dst_bytes, 0);
3819 break;
3820
3821 default: /* i.e. case coding_type_no_conversion: */
3822 label_no_conversion:
3823 if (dst_bytes && src_bytes > dst_bytes)
3824 {
3825 coding->produced = dst_bytes;
3826 result = CODING_FINISH_INSUFFICIENT_DST;
3827 }
3828 else
3829 {
3830 coding->produced = src_bytes;
3831 result = CODING_FINISH_NORMAL;
3832 }
3833 if (dst_bytes)
3834 bcopy (source, destination, coding->produced);
3835 else
3836 safe_bcopy (source, destination, coding->produced);
3837 coding->fake_multibyte = 1;
3838 coding->consumed
3839 = coding->consumed_char = coding->produced_char = coding->produced;
3840 break;
3841 }
3842
3843 return result;
3844 }
3845
3846 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions".
3847
3848 This function does not make full use of DESTINATION buffer. For
3849 instance, if coding->type is coding_type_iso2022, it uses only
3850 (DST_BYTES - 20) bytes of DESTINATION buffer. In the case that
3851 DST_BYTES is decided by the function encoding_buffer_size, it
3852 contains extra 256 bytes (defined by CONVERSION_BUFFER_EXTRA_ROOM).
3853 So, this function can encode the full SOURCE. But, in the other
3854 case, if you want to avoid carry over, you must supply at least 20
3855 bytes more area in DESTINATION buffer than expected maximum bytes
3856 that will be produced by this function. */
3857
3858 int
3859 encode_coding (coding, source, destination, src_bytes, dst_bytes)
3860 struct coding_system *coding;
3861 unsigned char *source, *destination;
3862 int src_bytes, dst_bytes;
3863 {
3864 int result;
3865
3866 if (src_bytes <= 0
3867 && ! (coding->mode & CODING_MODE_LAST_BLOCK
3868 && CODING_REQUIRE_FLUSHING (coding)))
3869 {
3870 coding->produced = coding->produced_char = 0;
3871 coding->consumed = coding->consumed_char = 0;
3872 coding->fake_multibyte = 0;
3873 return CODING_FINISH_NORMAL;
3874 }
3875
3876 switch (coding->type)
3877 {
3878 case coding_type_emacs_mule:
3879 case coding_type_undecided:
3880 case coding_type_raw_text:
3881 if (coding->eol_type == CODING_EOL_LF
3882 || coding->eol_type == CODING_EOL_UNDECIDED)
3883 goto label_no_conversion;
3884 result = encode_eol (coding, source, destination, src_bytes, dst_bytes);
3885 break;
3886
3887 case coding_type_sjis:
3888 result = encode_coding_sjis_big5 (coding, source, destination,
3889 src_bytes, dst_bytes, 1);
3890 break;
3891
3892 case coding_type_iso2022:
3893 result = encode_coding_iso2022 (coding, source, destination,
3894 src_bytes, dst_bytes);
3895 break;
3896
3897 case coding_type_big5:
3898 result = encode_coding_sjis_big5 (coding, source, destination,
3899 src_bytes, dst_bytes, 0);
3900 break;
3901
3902 case coding_type_ccl:
3903 result = ccl_coding_driver (coding, source, destination,
3904 src_bytes, dst_bytes, 1);
3905 break;
3906
3907 default: /* i.e. case coding_type_no_conversion: */
3908 label_no_conversion:
3909 if (dst_bytes && src_bytes > dst_bytes)
3910 {
3911 coding->produced = dst_bytes;
3912 result = CODING_FINISH_INSUFFICIENT_DST;
3913 }
3914 else
3915 {
3916 coding->produced = src_bytes;
3917 result = CODING_FINISH_NORMAL;
3918 }
3919 if (dst_bytes)
3920 bcopy (source, destination, coding->produced);
3921 else
3922 safe_bcopy (source, destination, coding->produced);
3923 if (coding->mode & CODING_MODE_SELECTIVE_DISPLAY)
3924 {
3925 unsigned char *p = destination, *pend = p + coding->produced;
3926 while (p < pend)
3927 if (*p++ == '\015') p[-1] = '\n';
3928 }
3929 coding->fake_multibyte = 1;
3930 coding->consumed
3931 = coding->consumed_char = coding->produced_char = coding->produced;
3932 break;
3933 }
3934
3935 return result;
3936 }
3937
3938 /* Scan text in the region between *BEG and *END (byte positions),
3939 skip characters which we don't have to decode by coding system
3940 CODING at the head and tail, then set *BEG and *END to the region
3941 of the text we actually have to convert. The caller should move
3942 the gap out of the region in advance.
3943
3944 If STR is not NULL, *BEG and *END are indices into STR. */
3945
3946 static void
3947 shrink_decoding_region (beg, end, coding, str)
3948 int *beg, *end;
3949 struct coding_system *coding;
3950 unsigned char *str;
3951 {
3952 unsigned char *begp_orig, *begp, *endp_orig, *endp, c;
3953 int eol_conversion;
3954 Lisp_Object translation_table;
3955
3956 if (coding->type == coding_type_ccl
3957 || coding->type == coding_type_undecided
3958 || !NILP (coding->post_read_conversion))
3959 {
3960 /* We can't skip any data. */
3961 return;
3962 }
3963 else if (coding->type == coding_type_no_conversion)
3964 {
3965 /* We need no conversion, but don't have to skip any data here.
3966 Decoding routine handles them effectively anyway. */
3967 return;
3968 }
3969
3970 translation_table = coding->translation_table_for_decode;
3971 if (NILP (translation_table) && !NILP (Venable_character_translation))
3972 translation_table = Vstandard_translation_table_for_decode;
3973 if (CHAR_TABLE_P (translation_table))
3974 {
3975 int i;
3976 for (i = 0; i < 128; i++)
3977 if (!NILP (CHAR_TABLE_REF (translation_table, i)))
3978 break;
3979 if (i < 128)
3980 /* Some ASCII character should be tranlsated. We give up
3981 shrinking. */
3982 return;
3983 }
3984
3985 eol_conversion = (coding->eol_type != CODING_EOL_LF);
3986
3987 if ((! eol_conversion) && (coding->heading_ascii >= 0))
3988 /* Detection routine has already found how much we can skip at the
3989 head. */
3990 *beg += coding->heading_ascii;
3991
3992 if (str)
3993 {
3994 begp_orig = begp = str + *beg;
3995 endp_orig = endp = str + *end;
3996 }
3997 else
3998 {
3999 begp_orig = begp = BYTE_POS_ADDR (*beg);
4000 endp_orig = endp = begp + *end - *beg;
4001 }
4002
4003 switch (coding->type)
4004 {
4005 case coding_type_emacs_mule:
4006 case coding_type_raw_text:
4007 if (eol_conversion)
4008 {
4009 if (coding->heading_ascii < 0)
4010 while (begp < endp && *begp != '\r' && *begp < 0x80) begp++;
4011 while (begp < endp && endp[-1] != '\r' && endp[-1] < 0x80)
4012 endp--;
4013 /* Do not consider LF as ascii if preceded by CR, since that
4014 confuses eol decoding. */
4015 if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
4016 endp++;
4017 }
4018 else
4019 begp = endp;
4020 break;
4021
4022 case coding_type_sjis:
4023 case coding_type_big5:
4024 /* We can skip all ASCII characters at the head. */
4025 if (coding->heading_ascii < 0)
4026 {
4027 if (eol_conversion)
4028 while (begp < endp && *begp < 0x80 && *begp != '\r') begp++;
4029 else
4030 while (begp < endp && *begp < 0x80) begp++;
4031 }
4032 /* We can skip all ASCII characters at the tail except for the
4033 second byte of SJIS or BIG5 code. */
4034 if (eol_conversion)
4035 while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\r') endp--;
4036 else
4037 while (begp < endp && endp[-1] < 0x80) endp--;
4038 /* Do not consider LF as ascii if preceded by CR, since that
4039 confuses eol decoding. */
4040 if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
4041 endp++;
4042 if (begp < endp && endp < endp_orig && endp[-1] >= 0x80)
4043 endp++;
4044 break;
4045
4046 default: /* i.e. case coding_type_iso2022: */
4047 if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
4048 /* We can't skip any data. */
4049 break;
4050 if (coding->heading_ascii < 0)
4051 {
4052 /* We can skip all ASCII characters at the head except for a
4053 few control codes. */
4054 while (begp < endp && (c = *begp) < 0x80
4055 && c != ISO_CODE_CR && c != ISO_CODE_SO
4056 && c != ISO_CODE_SI && c != ISO_CODE_ESC
4057 && (!eol_conversion || c != ISO_CODE_LF))
4058 begp++;
4059 }
4060 switch (coding->category_idx)
4061 {
4062 case CODING_CATEGORY_IDX_ISO_8_1:
4063 case CODING_CATEGORY_IDX_ISO_8_2:
4064 /* We can skip all ASCII characters at the tail. */
4065 if (eol_conversion)
4066 while (begp < endp && (c = endp[-1]) < 0x80 && c != '\r') endp--;
4067 else
4068 while (begp < endp && endp[-1] < 0x80) endp--;
4069 /* Do not consider LF as ascii if preceded by CR, since that
4070 confuses eol decoding. */
4071 if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
4072 endp++;
4073 break;
4074
4075 case CODING_CATEGORY_IDX_ISO_7:
4076 case CODING_CATEGORY_IDX_ISO_7_TIGHT:
4077 {
4078 /* We can skip all charactes at the tail except for 8-bit
4079 codes and ESC and the following 2-byte at the tail. */
4080 unsigned char *eight_bit = NULL;
4081
4082 if (eol_conversion)
4083 while (begp < endp
4084 && (c = endp[-1]) != ISO_CODE_ESC && c != '\r')
4085 {
4086 if (!eight_bit && c & 0x80) eight_bit = endp;
4087 endp--;
4088 }
4089 else
4090 while (begp < endp
4091 && (c = endp[-1]) != ISO_CODE_ESC)
4092 {
4093 if (!eight_bit && c & 0x80) eight_bit = endp;
4094 endp--;
4095 }
4096 /* Do not consider LF as ascii if preceded by CR, since that
4097 confuses eol decoding. */
4098 if (begp < endp && endp < endp_orig
4099 && endp[-1] == '\r' && endp[0] == '\n')
4100 endp++;
4101 if (begp < endp && endp[-1] == ISO_CODE_ESC)
4102 {
4103 if (endp + 1 < endp_orig && end[0] == '(' && end[1] == 'B')
4104 /* This is an ASCII designation sequence. We can
4105 surely skip the tail. But, if we have
4106 encountered an 8-bit code, skip only the codes
4107 after that. */
4108 endp = eight_bit ? eight_bit : endp + 2;
4109 else
4110 /* Hmmm, we can't skip the tail. */
4111 endp = endp_orig;
4112 }
4113 else if (eight_bit)
4114 endp = eight_bit;
4115 }
4116 }
4117 }
4118 *beg += begp - begp_orig;
4119 *end += endp - endp_orig;
4120 return;
4121 }
4122
4123 /* Like shrink_decoding_region but for encoding. */
4124
4125 static void
4126 shrink_encoding_region (beg, end, coding, str)
4127 int *beg, *end;
4128 struct coding_system *coding;
4129 unsigned char *str;
4130 {
4131 unsigned char *begp_orig, *begp, *endp_orig, *endp;
4132 int eol_conversion;
4133 Lisp_Object translation_table;
4134
4135 if (coding->type == coding_type_ccl)
4136 /* We can't skip any data. */
4137 return;
4138 else if (coding->type == coding_type_no_conversion)
4139 {
4140 /* We need no conversion. */
4141 *beg = *end;
4142 return;
4143 }
4144
4145 translation_table = coding->translation_table_for_encode;
4146 if (NILP (translation_table) && !NILP (Venable_character_translation))
4147 translation_table = Vstandard_translation_table_for_encode;
4148 if (CHAR_TABLE_P (translation_table))
4149 {
4150 int i;
4151 for (i = 0; i < 128; i++)
4152 if (!NILP (CHAR_TABLE_REF (translation_table, i)))
4153 break;
4154 if (i < 128)
4155 /* Some ASCII character should be tranlsated. We give up
4156 shrinking. */
4157 return;
4158 }
4159
4160 if (str)
4161 {
4162 begp_orig = begp = str + *beg;
4163 endp_orig = endp = str + *end;
4164 }
4165 else
4166 {
4167 begp_orig = begp = BYTE_POS_ADDR (*beg);
4168 endp_orig = endp = begp + *end - *beg;
4169 }
4170
4171 eol_conversion = (coding->eol_type == CODING_EOL_CR
4172 || coding->eol_type == CODING_EOL_CRLF);
4173
4174 /* Here, we don't have to check coding->pre_write_conversion because
4175 the caller is expected to have handled it already. */
4176 switch (coding->type)
4177 {
4178 case coding_type_undecided:
4179 case coding_type_emacs_mule:
4180 case coding_type_raw_text:
4181 if (eol_conversion)
4182 {
4183 while (begp < endp && *begp != '\n') begp++;
4184 while (begp < endp && endp[-1] != '\n') endp--;
4185 }
4186 else
4187 begp = endp;
4188 break;
4189
4190 case coding_type_iso2022:
4191 if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
4192 /* We can't skip any data. */
4193 break;
4194 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL)
4195 {
4196 unsigned char *bol = begp;
4197 while (begp < endp && *begp < 0x80)
4198 {
4199 begp++;
4200 if (begp[-1] == '\n')
4201 bol = begp;
4202 }
4203 begp = bol;
4204 goto label_skip_tail;
4205 }
4206 /* fall down ... */
4207
4208 default:
4209 /* We can skip all ASCII characters at the head and tail. */
4210 if (eol_conversion)
4211 while (begp < endp && *begp < 0x80 && *begp != '\n') begp++;
4212 else
4213 while (begp < endp && *begp < 0x80) begp++;
4214 label_skip_tail:
4215 if (eol_conversion)
4216 while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\n') endp--;
4217 else
4218 while (begp < endp && *(endp - 1) < 0x80) endp--;
4219 break;
4220 }
4221
4222 *beg += begp - begp_orig;
4223 *end += endp - endp_orig;
4224 return;
4225 }
4226
/* Shrinking the conversion region has a cost of its own, so it is
   attempted only when the region is longer than this value.  */
static int shrink_conversion_region_threshhold = 1024;

/* If the region from *BEG to *END is longer than
   shrink_conversion_region_threshhold, shrink it by calling
   shrink_encoding_region (if ENCODEP is nonzero) or
   shrink_decoding_region (otherwise) with the same arguments.
   BEG and END are evaluated more than once.  */
#define SHRINK_CONVERSION_REGION(beg, end, coding, str, encodep)        \
  do {                                                                  \
    if (*(end) - *(beg) > shrink_conversion_region_threshhold)          \
      {                                                                 \
        if (encodep)                                                    \
          shrink_encoding_region ((beg), (end), (coding), (str));       \
        else                                                            \
          shrink_decoding_region ((beg), (end), (coding), (str));       \
      }                                                                 \
  } while (0)
4240
4241 static Lisp_Object
4242 code_convert_region_unwind (dummy)
4243 Lisp_Object dummy;
4244 {
4245 inhibit_pre_post_conversion = 0;
4246 return Qnil;
4247 }
4248
4249 /* Decode (if ENCODEP is zero) or encode (if ENCODEP is nonzero) the
4250 text from FROM to TO (byte positions are FROM_BYTE and TO_BYTE) by
4251 coding system CODING, and return the status code of code conversion
4252 (currently, this value has no meaning).
4253
4254 How many characters (and bytes) are converted to how many
4255 characters (and bytes) are recorded in members of the structure
4256 CODING.
4257
4258 If REPLACE is nonzero, we do various things as if the original text
4259 is deleted and a new text is inserted. See the comments in
4260 replace_range (insdel.c) to know what we are doing. */
4261
4262 int
4263 code_convert_region (from, from_byte, to, to_byte, coding, encodep, replace)
4264 int from, from_byte, to, to_byte, encodep, replace;
4265 struct coding_system *coding;
4266 {
4267 int len = to - from, len_byte = to_byte - from_byte;
4268 int require, inserted, inserted_byte;
4269 int head_skip, tail_skip, total_skip;
4270 Lisp_Object saved_coding_symbol;
4271 int multibyte = !NILP (current_buffer->enable_multibyte_characters);
4272 int first = 1;
4273 int fake_multibyte = 0;
4274 unsigned char *src, *dst;
4275 Lisp_Object deletion;
4276 int orig_point = PT, orig_len = len;
4277 int prev_Z;
4278
4279 deletion = Qnil;
4280 saved_coding_symbol = Qnil;
4281
4282 if (from < PT && PT < to)
4283 {
4284 TEMP_SET_PT_BOTH (from, from_byte);
4285 orig_point = from;
4286 }
4287
4288 if (replace)
4289 {
4290 int saved_from = from;
4291
4292 prepare_to_modify_buffer (from, to, &from);
4293 if (saved_from != from)
4294 {
4295 to = from + len;
4296 if (multibyte)
4297 from_byte = CHAR_TO_BYTE (from), to_byte = CHAR_TO_BYTE (to);
4298 else
4299 from_byte = from, to_byte = to;
4300 len_byte = to_byte - from_byte;
4301 }
4302 }
4303
4304 if (! encodep && CODING_REQUIRE_DETECTION (coding))
4305 {
4306 /* We must detect encoding of text and eol format. */
4307
4308 if (from < GPT && to > GPT)
4309 move_gap_both (from, from_byte);
4310 if (coding->type == coding_type_undecided)
4311 {
4312 detect_coding (coding, BYTE_POS_ADDR (from_byte), len_byte);
4313 if (coding->type == coding_type_undecided)
4314 /* It seems that the text contains only ASCII, but we
4315 should not left it undecided because the deeper
4316 decoding routine (decode_coding) tries to detect the
4317 encodings again in vain. */
4318 coding->type = coding_type_emacs_mule;
4319 }
4320 if (coding->eol_type == CODING_EOL_UNDECIDED)
4321 {
4322 saved_coding_symbol = coding->symbol;
4323 detect_eol (coding, BYTE_POS_ADDR (from_byte), len_byte);
4324 if (coding->eol_type == CODING_EOL_UNDECIDED)
4325 coding->eol_type = CODING_EOL_LF;
4326 /* We had better recover the original eol format if we
4327 encounter an inconsitent eol format while decoding. */
4328 coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4329 }
4330 }
4331
4332 coding->consumed_char = len, coding->consumed = len_byte;
4333
4334 if (encodep
4335 ? ! CODING_REQUIRE_ENCODING (coding)
4336 : ! CODING_REQUIRE_DECODING (coding))
4337 {
4338 coding->produced = len_byte;
4339 if (multibyte
4340 && ! replace
4341 /* See the comment of the member heading_ascii in coding.h. */
4342 && coding->heading_ascii < len_byte)
4343 {
4344 /* We still may have to combine byte at the head and the
4345 tail of the text in the region. */
4346 if (from < GPT && GPT < to)
4347 move_gap_both (to, to_byte);
4348 len = multibyte_chars_in_text (BYTE_POS_ADDR (from_byte), len_byte);
4349 adjust_after_insert (from, from_byte, to, to_byte, len);
4350 coding->produced_char = len;
4351 }
4352 else
4353 {
4354 if (!replace)
4355 adjust_after_insert (from, from_byte, to, to_byte, len_byte);
4356 coding->produced_char = len_byte;
4357 }
4358 return 0;
4359 }
4360
4361 /* Now we convert the text. */
4362
4363 /* For encoding, we must process pre-write-conversion in advance. */
4364 if (encodep
4365 && ! NILP (coding->pre_write_conversion)
4366 && SYMBOLP (coding->pre_write_conversion)
4367 && ! NILP (Ffboundp (coding->pre_write_conversion)))
4368 {
4369 /* The function in pre-write-conversion may put a new text in a
4370 new buffer. */
4371 struct buffer *prev = current_buffer;
4372 Lisp_Object new;
4373 int count = specpdl_ptr - specpdl;
4374
4375 record_unwind_protect (code_convert_region_unwind, Qnil);
4376 /* We should not call any more pre-write/post-read-conversion
4377 functions while this pre-write-conversion is running. */
4378 inhibit_pre_post_conversion = 1;
4379 call2 (coding->pre_write_conversion,
4380 make_number (from), make_number (to));
4381 inhibit_pre_post_conversion = 0;
4382 /* Discard the unwind protect. */
4383 specpdl_ptr--;
4384
4385 if (current_buffer != prev)
4386 {
4387 len = ZV - BEGV;
4388 new = Fcurrent_buffer ();
4389 set_buffer_internal_1 (prev);
4390 del_range_2 (from, from_byte, to, to_byte);
4391 TEMP_SET_PT_BOTH (from, from_byte);
4392 insert_from_buffer (XBUFFER (new), 1, len, 0);
4393 Fkill_buffer (new);
4394 if (orig_point >= to)
4395 orig_point += len - orig_len;
4396 else if (orig_point > from)
4397 orig_point = from;
4398 orig_len = len;
4399 to = from + len;
4400 from_byte = multibyte ? CHAR_TO_BYTE (from) : from_byte;
4401 to_byte = multibyte ? CHAR_TO_BYTE (to) : to;
4402 len_byte = to_byte - from_byte;
4403 TEMP_SET_PT_BOTH (from, from_byte);
4404 }
4405 }
4406
4407 if (replace)
4408 deletion = make_buffer_string_both (from, from_byte, to, to_byte, 1);
4409
4410 /* Try to skip the heading and tailing ASCIIs. */
4411 {
4412 int from_byte_orig = from_byte, to_byte_orig = to_byte;
4413
4414 if (from < GPT && GPT < to)
4415 move_gap_both (from, from_byte);
4416 SHRINK_CONVERSION_REGION (&from_byte, &to_byte, coding, NULL, encodep);
4417 if (from_byte == to_byte
4418 && coding->type != coding_type_ccl
4419 && ! (coding->mode & CODING_MODE_LAST_BLOCK
4420 && CODING_REQUIRE_FLUSHING (coding)))
4421 {
4422 coding->produced = len_byte;
4423 coding->produced_char = multibyte ? len : len_byte;
4424 if (!replace)
4425 /* We must record and adjust for this new text now. */
4426 adjust_after_insert (from, from_byte_orig, to, to_byte_orig, len);
4427 return 0;
4428 }
4429
4430 head_skip = from_byte - from_byte_orig;
4431 tail_skip = to_byte_orig - to_byte;
4432 total_skip = head_skip + tail_skip;
4433 from += head_skip;
4434 to -= tail_skip;
4435 len -= total_skip; len_byte -= total_skip;
4436 }
4437
4438 /* The code conversion routine can not preserve text properties for
4439 now. So, we must remove all text properties in the region.
4440 Here, we must suppress all modification hooks. */
4441 if (replace)
4442 {
4443 int saved_inhibit_modification_hooks = inhibit_modification_hooks;
4444 inhibit_modification_hooks = 1;
4445 Fset_text_properties (make_number (from), make_number (to), Qnil, Qnil);
4446 inhibit_modification_hooks = saved_inhibit_modification_hooks;
4447 }
4448
4449 /* For converion, we must put the gap before the text in addition to
4450 making the gap larger for efficient decoding. The required gap
4451 size starts from 2000 which is the magic number used in make_gap.
4452 But, after one batch of conversion, it will be incremented if we
4453 find that it is not enough . */
4454 require = 2000;
4455
4456 if (GAP_SIZE < require)
4457 make_gap (require - GAP_SIZE);
4458 move_gap_both (from, from_byte);
4459
4460 inserted = inserted_byte = 0;
4461 src = GAP_END_ADDR, dst = GPT_ADDR;
4462
4463 GAP_SIZE += len_byte;
4464 ZV -= len;
4465 Z -= len;
4466 ZV_BYTE -= len_byte;
4467 Z_BYTE -= len_byte;
4468
4469 if (GPT - BEG < BEG_UNCHANGED)
4470 BEG_UNCHANGED = GPT - BEG;
4471 if (Z - GPT < END_UNCHANGED)
4472 END_UNCHANGED = Z - GPT;
4473
4474 for (;;)
4475 {
4476 int result;
4477
4478 /* The buffer memory is changed from:
4479 +--------+converted-text+---------+-------original-text------+---+
4480 |<-from->|<--inserted-->|---------|<-----------len---------->|---|
4481 |<------------------- GAP_SIZE -------------------->| */
4482 if (encodep)
4483 result = encode_coding (coding, src, dst, len_byte, 0);
4484 else
4485 result = decode_coding (coding, src, dst, len_byte, 0);
4486 /* to:
4487 +--------+-------converted-text--------+--+---original-text--+---+
4488 |<-from->|<--inserted-->|<--produced-->|--|<-(len-consumed)->|---|
4489 |<------------------- GAP_SIZE -------------------->| */
4490 if (coding->fake_multibyte)
4491 fake_multibyte = 1;
4492
4493 if (!encodep && !multibyte)
4494 coding->produced_char = coding->produced;
4495 inserted += coding->produced_char;
4496 inserted_byte += coding->produced;
4497 len_byte -= coding->consumed;
4498 src += coding->consumed;
4499 dst += inserted_byte;
4500
4501 if (result == CODING_FINISH_NORMAL)
4502 {
4503 src += len_byte;
4504 break;
4505 }
4506 if (! encodep && result == CODING_FINISH_INCONSISTENT_EOL)
4507 {
4508 unsigned char *pend = dst, *p = pend - inserted_byte;
4509 Lisp_Object eol_type;
4510
4511 /* Encode LFs back to the original eol format (CR or CRLF). */
4512 if (coding->eol_type == CODING_EOL_CR)
4513 {
4514 while (p < pend) if (*p++ == '\n') p[-1] = '\r';
4515 }
4516 else
4517 {
4518 int count = 0;
4519
4520 while (p < pend) if (*p++ == '\n') count++;
4521 if (src - dst < count)
4522 {
4523 /* We don't have sufficient room for encoding LFs
4524 back to CRLF. We must record converted and
4525 not-yet-converted text back to the buffer
4526 content, enlarge the gap, then record them out of
4527 the buffer contents again. */
4528 int add = len_byte + inserted_byte;
4529
4530 GAP_SIZE -= add;
4531 ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
4532 GPT += inserted_byte; GPT_BYTE += inserted_byte;
4533 make_gap (count - GAP_SIZE);
4534 GAP_SIZE += add;
4535 ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
4536 GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
4537 /* Don't forget to update SRC, DST, and PEND. */
4538 src = GAP_END_ADDR - len_byte;
4539 dst = GPT_ADDR + inserted_byte;
4540 pend = dst;
4541 }
4542 inserted += count;
4543 inserted_byte += count;
4544 coding->produced += count;
4545 p = dst = pend + count;
4546 while (count)
4547 {
4548 *--p = *--pend;
4549 if (*p == '\n') count--, *--p = '\r';
4550 }
4551 }
4552
4553 /* Suppress eol-format conversion in the further conversion. */
4554 coding->eol_type = CODING_EOL_LF;
4555
4556 /* Set the coding system symbol to that for Unix-like EOL. */
4557 eol_type = Fget (saved_coding_symbol, Qeol_type);
4558 if (VECTORP (eol_type)
4559 && XVECTOR (eol_type)->size == 3
4560 && SYMBOLP (XVECTOR (eol_type)->contents[CODING_EOL_LF]))
4561 coding->symbol = XVECTOR (eol_type)->contents[CODING_EOL_LF];
4562 else
4563 coding->symbol = saved_coding_symbol;
4564
4565 continue;
4566 }
4567 if (len_byte <= 0)
4568 {
4569 if (coding->type != coding_type_ccl
4570 || coding->mode & CODING_MODE_LAST_BLOCK)
4571 break;
4572 coding->mode |= CODING_MODE_LAST_BLOCK;
4573 continue;
4574 }
4575 if (result == CODING_FINISH_INSUFFICIENT_SRC)
4576 {
4577 /* The source text ends in invalid codes. Let's just
4578 make them valid buffer contents, and finish conversion. */
4579 inserted += len_byte;
4580 inserted_byte += len_byte;
4581 while (len_byte--)
4582 *dst++ = *src++;
4583 fake_multibyte = 1;
4584 break;
4585 }
4586 if (result == CODING_FINISH_INTERRUPT)
4587 {
4588 /* The conversion procedure was interrupted by a user. */
4589 fake_multibyte = 1;
4590 break;
4591 }
4592 /* Now RESULT == CODING_FINISH_INSUFFICIENT_DST */
4593 if (coding->consumed < 1)
4594 {
4595 /* It's quite strange to require more memory without
4596 consuming any bytes. Perhaps CCL program bug. */
4597 fake_multibyte = 1;
4598 break;
4599 }
4600 if (first)
4601 {
4602 /* We have just done the first batch of conversion which was
4603 stoped because of insufficient gap. Let's reconsider the
4604 required gap size (i.e. SRT - DST) now.
4605
4606 We have converted ORIG bytes (== coding->consumed) into
4607 NEW bytes (coding->produced). To convert the remaining
4608 LEN bytes, we may need REQUIRE bytes of gap, where:
4609 REQUIRE + LEN_BYTE = LEN_BYTE * (NEW / ORIG)
4610 REQUIRE = LEN_BYTE * (NEW - ORIG) / ORIG
4611 Here, we are sure that NEW >= ORIG. */
4612 float ratio = coding->produced - coding->consumed;
4613 ratio /= coding->consumed;
4614 require = len_byte * ratio;
4615 first = 0;
4616 }
4617 if ((src - dst) < (require + 2000))
4618 {
4619 /* See the comment above the previous call of make_gap. */
4620 int add = len_byte + inserted_byte;
4621
4622 GAP_SIZE -= add;
4623 ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
4624 GPT += inserted_byte; GPT_BYTE += inserted_byte;
4625 make_gap (require + 2000);
4626 GAP_SIZE += add;
4627 ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
4628 GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
4629 /* Don't forget to update SRC, DST. */
4630 src = GAP_END_ADDR - len_byte;
4631 dst = GPT_ADDR + inserted_byte;
4632 }
4633 }
4634 if (src - dst > 0) *dst = 0; /* Put an anchor. */
4635
4636 if (multibyte
4637 && (encodep
4638 || fake_multibyte
4639 || (to - from) != (to_byte - from_byte)))
4640 inserted = multibyte_chars_in_text (GPT_ADDR, inserted_byte);
4641
4642 /* If we have shrinked the conversion area, adjust it now. */
4643 if (total_skip > 0)
4644 {
4645 if (tail_skip > 0)
4646 safe_bcopy (GAP_END_ADDR, GPT_ADDR + inserted_byte, tail_skip);
4647 inserted += total_skip; inserted_byte += total_skip;
4648 GAP_SIZE += total_skip;
4649 GPT -= head_skip; GPT_BYTE -= head_skip;
4650 ZV -= total_skip; ZV_BYTE -= total_skip;
4651 Z -= total_skip; Z_BYTE -= total_skip;
4652 from -= head_skip; from_byte -= head_skip;
4653 to += tail_skip; to_byte += tail_skip;
4654 }
4655
4656 prev_Z = Z;
4657 adjust_after_replace (from, from_byte, deletion, inserted, inserted_byte);
4658 inserted = Z - prev_Z;
4659
4660 if (! encodep && ! NILP (coding->post_read_conversion))
4661 {
4662 Lisp_Object val;
4663 int count = specpdl_ptr - specpdl;
4664
4665 if (from != PT)
4666 TEMP_SET_PT_BOTH (from, from_byte);
4667 prev_Z = Z;
4668 record_unwind_protect (code_convert_region_unwind, Qnil);
4669 /* We should not call any more pre-write/post-read-conversion
4670 functions while this post-read-conversion is running. */
4671 inhibit_pre_post_conversion = 1;
4672 val = call1 (coding->post_read_conversion, make_number (inserted));
4673 inhibit_pre_post_conversion = 0;
4674 /* Discard the unwind protect. */
4675 specpdl_ptr--;
4676 CHECK_NUMBER (val, 0);
4677 inserted += Z - prev_Z;
4678 }
4679
4680 if (orig_point >= from)
4681 {
4682 if (orig_point >= from + orig_len)
4683 orig_point += inserted - orig_len;
4684 else
4685 orig_point = from;
4686 TEMP_SET_PT (orig_point);
4687 }
4688
4689 signal_after_change (from, to - from, inserted);
4690
4691 {
4692 coding->consumed = to_byte - from_byte;
4693 coding->consumed_char = to - from;
4694 coding->produced = inserted_byte;
4695 coding->produced_char = inserted;
4696 }
4697
4698 return 0;
4699 }
4700
4701 Lisp_Object
4702 code_convert_string (str, coding, encodep, nocopy)
4703 Lisp_Object str;
4704 struct coding_system *coding;
4705 int encodep, nocopy;
4706 {
4707 int len;
4708 char *buf;
4709 int from = 0, to = XSTRING (str)->size;
4710 int to_byte = STRING_BYTES (XSTRING (str));
4711 struct gcpro gcpro1;
4712 Lisp_Object saved_coding_symbol;
4713 int result;
4714
4715 saved_coding_symbol = Qnil;
4716 if ((encodep && !NILP (coding->pre_write_conversion)
4717 || !encodep && !NILP (coding->post_read_conversion)))
4718 {
4719 /* Since we have to call Lisp functions which assume target text
4720 is in a buffer, after setting a temporary buffer, call
4721 code_convert_region. */
4722 int count = specpdl_ptr - specpdl;
4723 struct buffer *prev = current_buffer;
4724 int multibyte = STRING_MULTIBYTE (str);
4725
4726 record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
4727 record_unwind_protect (code_convert_region_unwind, Qnil);
4728 inhibit_pre_post_conversion = 1;
4729 GCPRO1 (str);
4730 temp_output_buffer_setup (" *code-converting-work*");
4731 set_buffer_internal (XBUFFER (Vstandard_output));
4732 /* We must insert the contents of STR as is without
4733 unibyte<->multibyte conversion. For that, we adjust the
4734 multibyteness of the working buffer to that of STR. */
4735 Ferase_buffer (); /* for safety */
4736 current_buffer->enable_multibyte_characters = multibyte ? Qt : Qnil;
4737 insert_from_string (str, 0, 0, to, to_byte, 0);
4738 UNGCPRO;
4739 code_convert_region (BEGV, BEGV_BYTE, ZV, ZV_BYTE, coding, encodep, 1);
4740 /* Make a unibyte string if we are encoding, otherwise make a
4741 multibyte string. */
4742 Fset_buffer_multibyte (encodep ? Qnil : Qt);
4743 str = make_buffer_string (BEGV, ZV, 0);
4744 return unbind_to (count, str);
4745 }
4746
4747 if (! encodep && CODING_REQUIRE_DETECTION (coding))
4748 {
4749 /* See the comments in code_convert_region. */
4750 if (coding->type == coding_type_undecided)
4751 {
4752 detect_coding (coding, XSTRING (str)->data, to_byte);
4753 if (coding->type == coding_type_undecided)
4754 coding->type = coding_type_emacs_mule;
4755 }
4756 if (coding->eol_type == CODING_EOL_UNDECIDED)
4757 {
4758 saved_coding_symbol = coding->symbol;
4759 detect_eol (coding, XSTRING (str)->data, to_byte);
4760 if (coding->eol_type == CODING_EOL_UNDECIDED)
4761 coding->eol_type = CODING_EOL_LF;
4762 /* We had better recover the original eol format if we
4763 encounter an inconsitent eol format while decoding. */
4764 coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4765 }
4766 }
4767
4768 if (encodep
4769 ? ! CODING_REQUIRE_ENCODING (coding)
4770 : ! CODING_REQUIRE_DECODING (coding))
4771 from = to_byte;
4772 else
4773 {
4774 /* Try to skip the heading and tailing ASCIIs. */
4775 SHRINK_CONVERSION_REGION (&from, &to_byte, coding, XSTRING (str)->data,
4776 encodep);
4777 }
4778 if (from == to_byte
4779 && coding->type != coding_type_ccl)
4780 return (nocopy ? str : Fcopy_sequence (str));
4781
4782 if (encodep)
4783 len = encoding_buffer_size (coding, to_byte - from);
4784 else
4785 len = decoding_buffer_size (coding, to_byte - from);
4786 len += from + STRING_BYTES (XSTRING (str)) - to_byte;
4787 GCPRO1 (str);
4788 buf = get_conversion_buffer (len);
4789 UNGCPRO;
4790
4791 if (from > 0)
4792 bcopy (XSTRING (str)->data, buf, from);
4793 result = (encodep
4794 ? encode_coding (coding, XSTRING (str)->data + from,
4795 buf + from, to_byte - from, len)
4796 : decode_coding (coding, XSTRING (str)->data + from,
4797 buf + from, to_byte - from, len));
4798 if (! encodep && result == CODING_FINISH_INCONSISTENT_EOL)
4799 {
4800 /* We simple try to decode the whole string again but without
4801 eol-conversion this time. */
4802 coding->eol_type = CODING_EOL_LF;
4803 coding->symbol = saved_coding_symbol;
4804 return code_convert_string (str, coding, encodep, nocopy);
4805 }
4806
4807 bcopy (XSTRING (str)->data + to_byte, buf + from + coding->produced,
4808 STRING_BYTES (XSTRING (str)) - to_byte);
4809
4810 len = from + STRING_BYTES (XSTRING (str)) - to_byte;
4811 if (encodep)
4812 str = make_unibyte_string (buf, len + coding->produced);
4813 else
4814 {
4815 int chars= (coding->fake_multibyte
4816 ? multibyte_chars_in_text (buf + from, coding->produced)
4817 : coding->produced_char);
4818 str = make_multibyte_string (buf, len + chars, len + coding->produced);
4819 }
4820
4821 return str;
4822 }
4823
4824 \f
4825 #ifdef emacs
4826 /*** 8. Emacs Lisp library functions ***/
4827
4828 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
4829 "Return t if OBJECT is nil or a coding-system.\n\
4830 See the documentation of `make-coding-system' for information\n\
4831 about coding-system objects.")
4832 (obj)
4833 Lisp_Object obj;
4834 {
4835 if (NILP (obj))
4836 return Qt;
4837 if (!SYMBOLP (obj))
4838 return Qnil;
4839 /* Get coding-spec vector for OBJ. */
4840 obj = Fget (obj, Qcoding_system);
4841 return ((VECTORP (obj) && XVECTOR (obj)->size == 5)
4842 ? Qt : Qnil);
4843 }
4844
4845 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
4846 Sread_non_nil_coding_system, 1, 1, 0,
4847 "Read a coding system from the minibuffer, prompting with string PROMPT.")
4848 (prompt)
4849 Lisp_Object prompt;
4850 {
4851 Lisp_Object val;
4852 do
4853 {
4854 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
4855 Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
4856 }
4857 while (XSTRING (val)->size == 0);
4858 return (Fintern (val, Qnil));
4859 }
4860
4861 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
4862 "Read a coding system from the minibuffer, prompting with string PROMPT.\n\
4863 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.")
4864 (prompt, default_coding_system)
4865 Lisp_Object prompt, default_coding_system;
4866 {
4867 Lisp_Object val;
4868 if (SYMBOLP (default_coding_system))
4869 XSETSTRING (default_coding_system, XSYMBOL (default_coding_system)->name);
4870 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
4871 Qt, Qnil, Qcoding_system_history,
4872 default_coding_system, Qnil);
4873 return (XSTRING (val)->size == 0 ? Qnil : Fintern (val, Qnil));
4874 }
4875
4876 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
4877 1, 1, 0,
4878 "Check validity of CODING-SYSTEM.\n\
4879 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.\n\
4880 It is valid if it is a symbol with a non-nil `coding-system' property.\n\
4881 The value of property should be a vector of length 5.")
4882 (coding_system)
4883 Lisp_Object coding_system;
4884 {
4885 CHECK_SYMBOL (coding_system, 0);
4886 if (!NILP (Fcoding_system_p (coding_system)))
4887 return coding_system;
4888 while (1)
4889 Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
4890 }
4891 \f
4892 Lisp_Object
4893 detect_coding_system (src, src_bytes, highest)
4894 unsigned char *src;
4895 int src_bytes, highest;
4896 {
4897 int coding_mask, eol_type;
4898 Lisp_Object val, tmp;
4899 int dummy;
4900
4901 coding_mask = detect_coding_mask (src, src_bytes, NULL, &dummy);
4902 eol_type = detect_eol_type (src, src_bytes, &dummy);
4903 if (eol_type == CODING_EOL_INCONSISTENT)
4904 eol_type = CODING_EOL_UNDECIDED;
4905
4906 if (!coding_mask)
4907 {
4908 val = Qundecided;
4909 if (eol_type != CODING_EOL_UNDECIDED)
4910 {
4911 Lisp_Object val2;
4912 val2 = Fget (Qundecided, Qeol_type);
4913 if (VECTORP (val2))
4914 val = XVECTOR (val2)->contents[eol_type];
4915 }
4916 return (highest ? val : Fcons (val, Qnil));
4917 }
4918
4919 /* At first, gather possible coding systems in VAL. */
4920 val = Qnil;
4921 for (tmp = Vcoding_category_list; !NILP (tmp); tmp = XCDR (tmp))
4922 {
4923 int idx
4924 = XFASTINT (Fget (XCAR (tmp), Qcoding_category_index));
4925 if (coding_mask & (1 << idx))
4926 {
4927 val = Fcons (Fsymbol_value (XCAR (tmp)), val);
4928 if (highest)
4929 break;
4930 }
4931 }
4932 if (!highest)
4933 val = Fnreverse (val);
4934
4935 /* Then, replace the elements with subsidiary coding systems. */
4936 for (tmp = val; !NILP (tmp); tmp = XCDR (tmp))
4937 {
4938 if (eol_type != CODING_EOL_UNDECIDED
4939 && eol_type != CODING_EOL_INCONSISTENT)
4940 {
4941 Lisp_Object eol;
4942 eol = Fget (XCAR (tmp), Qeol_type);
4943 if (VECTORP (eol))
4944 XCAR (tmp) = XVECTOR (eol)->contents[eol_type];
4945 }
4946 }
4947 return (highest ? XCAR (val) : val);
4948 }
4949
4950 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
4951 2, 3, 0,
4952 "Detect coding system of the text in the region between START and END.\n\
4953 Return a list of possible coding systems ordered by priority.\n\
4954 \n\
4955 If only ASCII characters are found, it returns a list of single element\n\
4956 `undecided' or its subsidiary coding system according to a detected\n\
4957 end-of-line format.\n\
4958 \n\
4959 If optional argument HIGHEST is non-nil, return the coding system of\n\
4960 highest priority.")
4961 (start, end, highest)
4962 Lisp_Object start, end, highest;
4963 {
4964 int from, to;
4965 int from_byte, to_byte;
4966
4967 CHECK_NUMBER_COERCE_MARKER (start, 0);
4968 CHECK_NUMBER_COERCE_MARKER (end, 1);
4969
4970 validate_region (&start, &end);
4971 from = XINT (start), to = XINT (end);
4972 from_byte = CHAR_TO_BYTE (from);
4973 to_byte = CHAR_TO_BYTE (to);
4974
4975 if (from < GPT && to >= GPT)
4976 move_gap_both (to, to_byte);
4977
4978 return detect_coding_system (BYTE_POS_ADDR (from_byte),
4979 to_byte - from_byte,
4980 !NILP (highest));
4981 }
4982
4983 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
4984 1, 2, 0,
4985 "Detect coding system of the text in STRING.\n\
4986 Return a list of possible coding systems ordered by priority.\n\
4987 \n\
4988 If only ASCII characters are found, it returns a list of single element\n\
4989 `undecided' or its subsidiary coding system according to a detected\n\
4990 end-of-line format.\n\
4991 \n\
4992 If optional argument HIGHEST is non-nil, return the coding system of\n\
4993 highest priority.")
4994 (string, highest)
4995 Lisp_Object string, highest;
4996 {
4997 CHECK_STRING (string, 0);
4998
4999 return detect_coding_system (XSTRING (string)->data,
5000 STRING_BYTES (XSTRING (string)),
5001 !NILP (highest));
5002 }
5003
5004 Lisp_Object
5005 code_convert_region1 (start, end, coding_system, encodep)
5006 Lisp_Object start, end, coding_system;
5007 int encodep;
5008 {
5009 struct coding_system coding;
5010 int from, to, len;
5011
5012 CHECK_NUMBER_COERCE_MARKER (start, 0);
5013 CHECK_NUMBER_COERCE_MARKER (end, 1);
5014 CHECK_SYMBOL (coding_system, 2);
5015
5016 validate_region (&start, &end);
5017 from = XFASTINT (start);
5018 to = XFASTINT (end);
5019
5020 if (NILP (coding_system))
5021 return make_number (to - from);
5022
5023 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
5024 error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data);
5025
5026 coding.mode |= CODING_MODE_LAST_BLOCK;
5027 code_convert_region (from, CHAR_TO_BYTE (from), to, CHAR_TO_BYTE (to),
5028 &coding, encodep, 1);
5029 Vlast_coding_system_used = coding.symbol;
5030 return make_number (coding.produced_char);
5031 }
5032
5033 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
5034 3, 3, "r\nzCoding system: ",
5035 "Decode the current region by specified coding system.\n\
5036 When called from a program, takes three arguments:\n\
5037 START, END, and CODING-SYSTEM. START and END are buffer positions.\n\
5038 This function sets `last-coding-system-used' to the precise coding system\n\
5039 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
5040 not fully specified.)\n\
5041 It returns the length of the decoded text.")
5042 (start, end, coding_system)
5043 Lisp_Object start, end, coding_system;
5044 {
5045 return code_convert_region1 (start, end, coding_system, 0);
5046 }
5047
5048 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
5049 3, 3, "r\nzCoding system: ",
5050 "Encode the current region by specified coding system.\n\
5051 When called from a program, takes three arguments:\n\
5052 START, END, and CODING-SYSTEM. START and END are buffer positions.\n\
5053 This function sets `last-coding-system-used' to the precise coding system\n\
5054 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
5055 not fully specified.)\n\
5056 It returns the length of the encoded text.")
5057 (start, end, coding_system)
5058 Lisp_Object start, end, coding_system;
5059 {
5060 return code_convert_region1 (start, end, coding_system, 1);
5061 }
5062
5063 Lisp_Object
5064 code_convert_string1 (string, coding_system, nocopy, encodep)
5065 Lisp_Object string, coding_system, nocopy;
5066 int encodep;
5067 {
5068 struct coding_system coding;
5069
5070 CHECK_STRING (string, 0);
5071 CHECK_SYMBOL (coding_system, 1);
5072
5073 if (NILP (coding_system))
5074 return (NILP (nocopy) ? Fcopy_sequence (string) : string);
5075
5076 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
5077 error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data);
5078
5079 coding.mode |= CODING_MODE_LAST_BLOCK;
5080 Vlast_coding_system_used = coding.symbol;
5081 return code_convert_string (string, &coding, encodep, !NILP (nocopy));
5082 }
5083
5084 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
5085 2, 3, 0,
5086 "Decode STRING which is encoded in CODING-SYSTEM, and return the result.\n\
5087 Optional arg NOCOPY non-nil means it is ok to return STRING itself\n\
5088 if the decoding operation is trivial.\n\
5089 This function sets `last-coding-system-used' to the precise coding system\n\
5090 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
5091 not fully specified.)")
5092 (string, coding_system, nocopy)
5093 Lisp_Object string, coding_system, nocopy;
5094 {
5095 return code_convert_string1 (string, coding_system, nocopy, 0);
5096 }
5097
5098 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
5099 2, 3, 0,
5100 "Encode STRING to CODING-SYSTEM, and return the result.\n\
5101 Optional arg NOCOPY non-nil means it is ok to return STRING itself\n\
5102 if the encoding operation is trivial.\n\
5103 This function sets `last-coding-system-used' to the precise coding system\n\
5104 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
5105 not fully specified.)")
5106 (string, coding_system, nocopy)
5107 Lisp_Object string, coding_system, nocopy;
5108 {
5109 return code_convert_string1 (string, coding_system, nocopy, 1);
5110 }
5111
5112 /* Encode or decode STRING according to CODING_SYSTEM.
5113 Do not set Vlast_coding_system_used. */
5114
5115 Lisp_Object
5116 code_convert_string_norecord (string, coding_system, encodep)
5117 Lisp_Object string, coding_system;
5118 int encodep;
5119 {
5120 struct coding_system coding;
5121
5122 CHECK_STRING (string, 0);
5123 CHECK_SYMBOL (coding_system, 1);
5124
5125 if (NILP (coding_system))
5126 return string;
5127
5128 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
5129 error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data);
5130
5131 coding.mode |= CODING_MODE_LAST_BLOCK;
5132 return code_convert_string (string, &coding, encodep, Qt);
5133 }
5134 \f
5135 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
5136 "Decode a Japanese character which has CODE in shift_jis encoding.\n\
5137 Return the corresponding character.")
5138 (code)
5139 Lisp_Object code;
5140 {
5141 unsigned char c1, c2, s1, s2;
5142 Lisp_Object val;
5143
5144 CHECK_NUMBER (code, 0);
5145 s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
5146 if (s1 == 0)
5147 {
5148 if (s2 < 0x80)
5149 XSETFASTINT (val, s2);
5150 else if (s2 >= 0xA0 || s2 <= 0xDF)
5151 XSETFASTINT (val,
5152 MAKE_NON_ASCII_CHAR (charset_katakana_jisx0201, s2, 0));
5153 else
5154 error ("Invalid Shift JIS code: %x", XFASTINT (code));
5155 }
5156 else
5157 {
5158 if ((s1 < 0x80 || s1 > 0x9F && s1 < 0xE0 || s1 > 0xEF)
5159 || (s2 < 0x40 || s2 == 0x7F || s2 > 0xFC))
5160 error ("Invalid Shift JIS code: %x", XFASTINT (code));
5161 DECODE_SJIS (s1, s2, c1, c2);
5162 XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset_jisx0208, c1, c2));
5163 }
5164 return val;
5165 }
5166
5167 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
5168 "Encode a Japanese character CHAR to shift_jis encoding.\n\
5169 Return the corresponding code in SJIS.")
5170 (ch)
5171 Lisp_Object ch;
5172 {
5173 int charset, c1, c2, s1, s2;
5174 Lisp_Object val;
5175
5176 CHECK_NUMBER (ch, 0);
5177 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
5178 if (charset == CHARSET_ASCII)
5179 {
5180 val = ch;
5181 }
5182 else if (charset == charset_jisx0208
5183 && c1 > 0x20 && c1 < 0x7F && c2 > 0x20 && c2 < 0x7F)
5184 {
5185 ENCODE_SJIS (c1, c2, s1, s2);
5186 XSETFASTINT (val, (s1 << 8) | s2);
5187 }
5188 else if (charset == charset_katakana_jisx0201
5189 && c1 > 0x20 && c2 < 0xE0)
5190 {
5191 XSETFASTINT (val, c1 | 0x80);
5192 }
5193 else
5194 error ("Can't encode to shift_jis: %d", XFASTINT (ch));
5195 return val;
5196 }
5197
5198 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
5199 "Decode a Big5 character which has CODE in BIG5 coding system.\n\
5200 Return the corresponding character.")
5201 (code)
5202 Lisp_Object code;
5203 {
5204 int charset;
5205 unsigned char b1, b2, c1, c2;
5206 Lisp_Object val;
5207
5208 CHECK_NUMBER (code, 0);
5209 b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF;
5210 if (b1 == 0)
5211 {
5212 if (b2 >= 0x80)
5213 error ("Invalid BIG5 code: %x", XFASTINT (code));
5214 val = code;
5215 }
5216 else
5217 {
5218 if ((b1 < 0xA1 || b1 > 0xFE)
5219 || (b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE))
5220 error ("Invalid BIG5 code: %x", XFASTINT (code));
5221 DECODE_BIG5 (b1, b2, charset, c1, c2);
5222 XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset, c1, c2));
5223 }
5224 return val;
5225 }
5226
5227 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
5228 "Encode the Big5 character CHAR to BIG5 coding system.\n\
5229 Return the corresponding character code in Big5.")
5230 (ch)
5231 Lisp_Object ch;
5232 {
5233 int charset, c1, c2, b1, b2;
5234 Lisp_Object val;
5235
5236 CHECK_NUMBER (ch, 0);
5237 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
5238 if (charset == CHARSET_ASCII)
5239 {
5240 val = ch;
5241 }
5242 else if ((charset == charset_big5_1
5243 && (XFASTINT (ch) >= 0x250a1 && XFASTINT (ch) <= 0x271ec))
5244 || (charset == charset_big5_2
5245 && XFASTINT (ch) >= 0x290a1 && XFASTINT (ch) <= 0x2bdb2))
5246 {
5247 ENCODE_BIG5 (charset, c1, c2, b1, b2);
5248 XSETFASTINT (val, (b1 << 8) | b2);
5249 }
5250 else
5251 error ("Can't encode to Big5: %d", XFASTINT (ch));
5252 return val;
5253 }
5254 \f
5255 DEFUN ("set-terminal-coding-system-internal",
5256 Fset_terminal_coding_system_internal,
5257 Sset_terminal_coding_system_internal, 1, 1, 0, "")
5258 (coding_system)
5259 Lisp_Object coding_system;
5260 {
5261 CHECK_SYMBOL (coding_system, 0);
5262 setup_coding_system (Fcheck_coding_system (coding_system), &terminal_coding);
5263 /* We had better not send unsafe characters to terminal. */
5264 terminal_coding.flags |= CODING_FLAG_ISO_SAFE;
5265
5266 return Qnil;
5267 }
5268
5269 DEFUN ("set-safe-terminal-coding-system-internal",
5270 Fset_safe_terminal_coding_system_internal,
5271 Sset_safe_terminal_coding_system_internal, 1, 1, 0, "")
5272 (coding_system)
5273 Lisp_Object coding_system;
5274 {
5275 CHECK_SYMBOL (coding_system, 0);
5276 setup_coding_system (Fcheck_coding_system (coding_system),
5277 &safe_terminal_coding);
5278 return Qnil;
5279 }
5280
5281 DEFUN ("terminal-coding-system",
5282 Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
5283 "Return coding system specified for terminal output.")
5284 ()
5285 {
5286 return terminal_coding.symbol;
5287 }
5288
5289 DEFUN ("set-keyboard-coding-system-internal",
5290 Fset_keyboard_coding_system_internal,
5291 Sset_keyboard_coding_system_internal, 1, 1, 0, "")
5292 (coding_system)
5293 Lisp_Object coding_system;
5294 {
5295 CHECK_SYMBOL (coding_system, 0);
5296 setup_coding_system (Fcheck_coding_system (coding_system), &keyboard_coding);
5297 return Qnil;
5298 }
5299
5300 DEFUN ("keyboard-coding-system",
5301 Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
5302 "Return coding system specified for decoding keyboard input.")
5303 ()
5304 {
5305 return keyboard_coding.symbol;
5306 }
5307
5308 \f
5309 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
5310 Sfind_operation_coding_system, 1, MANY, 0,
5311 "Choose a coding system for an operation based on the target name.\n\
5312 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).\n\
5313 DECODING-SYSTEM is the coding system to use for decoding\n\
5314 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system\n\
5315 for encoding (in case OPERATION does encoding).\n\
5316 \n\
5317 The first argument OPERATION specifies an I/O primitive:\n\
5318 For file I/O, `insert-file-contents' or `write-region'.\n\
5319 For process I/O, `call-process', `call-process-region', or `start-process'.\n\
5320 For network I/O, `open-network-stream'.\n\
5321 \n\
5322 The remaining arguments should be the same arguments that were passed\n\
5323 to the primitive. Depending on which primitive, one of those arguments\n\
5324 is selected as the TARGET. For example, if OPERATION does file I/O,\n\
5325 whichever argument specifies the file name is TARGET.\n\
5326 \n\
5327 TARGET has a meaning which depends on OPERATION:\n\
5328 For file I/O, TARGET is a file name.\n\
5329 For process I/O, TARGET is a process name.\n\
5330 For network I/O, TARGET is a service name or a port number\n\
5331 \n\
5332 This function looks up what specified for TARGET in,\n\
5333 `file-coding-system-alist', `process-coding-system-alist',\n\
5334 or `network-coding-system-alist' depending on OPERATION.\n\
5335 They may specify a coding system, a cons of coding systems,\n\
5336 or a function symbol to call.\n\
5337 In the last case, we call the function with one argument,\n\
5338 which is a list of all the arguments given to this function.")
5339 (nargs, args)
5340 int nargs;
5341 Lisp_Object *args;
5342 {
5343 Lisp_Object operation, target_idx, target, val;
5344 register Lisp_Object chain;
5345
5346 if (nargs < 2)
5347 error ("Too few arguments");
5348 operation = args[0];
5349 if (!SYMBOLP (operation)
5350 || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
5351 error ("Invalid first arguement");
5352 if (nargs < 1 + XINT (target_idx))
5353 error ("Too few arguments for operation: %s",
5354 XSYMBOL (operation)->name->data);
5355 target = args[XINT (target_idx) + 1];
5356 if (!(STRINGP (target)
5357 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
5358 error ("Invalid %dth argument", XINT (target_idx) + 1);
5359
5360 chain = ((EQ (operation, Qinsert_file_contents)
5361 || EQ (operation, Qwrite_region))
5362 ? Vfile_coding_system_alist
5363 : (EQ (operation, Qopen_network_stream)
5364 ? Vnetwork_coding_system_alist
5365 : Vprocess_coding_system_alist));
5366 if (NILP (chain))
5367 return Qnil;
5368
5369 for (; CONSP (chain); chain = XCDR (chain))
5370 {
5371 Lisp_Object elt;
5372 elt = XCAR (chain);
5373
5374 if (CONSP (elt)
5375 && ((STRINGP (target)
5376 && STRINGP (XCAR (elt))
5377 && fast_string_match (XCAR (elt), target) >= 0)
5378 || (INTEGERP (target) && EQ (target, XCAR (elt)))))
5379 {
5380 val = XCDR (elt);
5381 /* Here, if VAL is both a valid coding system and a valid
5382 function symbol, we return VAL as a coding system. */
5383 if (CONSP (val))
5384 return val;
5385 if (! SYMBOLP (val))
5386 return Qnil;
5387 if (! NILP (Fcoding_system_p (val)))
5388 return Fcons (val, val);
5389 if (! NILP (Ffboundp (val)))
5390 {
5391 val = call1 (val, Flist (nargs, args));
5392 if (CONSP (val))
5393 return val;
5394 if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
5395 return Fcons (val, val);
5396 }
5397 return Qnil;
5398 }
5399 }
5400 return Qnil;
5401 }
5402
5403 DEFUN ("update-coding-systems-internal", Fupdate_coding_systems_internal,
5404 Supdate_coding_systems_internal, 0, 0, 0,
5405 "Update internal database for ISO2022 and CCL based coding systems.\n\
5406 When values of the following coding categories are changed, you must\n\
5407 call this function:\n\
5408 coding-category-iso-7, coding-category-iso-7-tight,\n\
5409 coding-category-iso-8-1, coding-category-iso-8-2,\n\
5410 coding-category-iso-7-else, coding-category-iso-8-else,\n\
5411 coding-category-ccl")
5412 ()
5413 {
5414 int i;
5415
5416 for (i = CODING_CATEGORY_IDX_ISO_7; i <= CODING_CATEGORY_IDX_CCL; i++)
5417 {
5418 Lisp_Object val;
5419
5420 val = XSYMBOL (XVECTOR (Vcoding_category_table)->contents[i])->value;
5421 if (!NILP (val))
5422 {
5423 if (! coding_system_table[i])
5424 coding_system_table[i] = ((struct coding_system *)
5425 xmalloc (sizeof (struct coding_system)));
5426 setup_coding_system (val, coding_system_table[i]);
5427 }
5428 else if (coding_system_table[i])
5429 {
5430 xfree (coding_system_table[i]);
5431 coding_system_table[i] = NULL;
5432 }
5433 }
5434
5435 return Qnil;
5436 }
5437
5438 DEFUN ("set-coding-priority-internal", Fset_coding_priority_internal,
5439 Sset_coding_priority_internal, 0, 0, 0,
5440 "Update internal database for the current value of `coding-category-list'.\n\
5441 This function is internal use only.")
5442 ()
5443 {
5444 int i = 0, idx;
5445 Lisp_Object val;
5446
5447 val = Vcoding_category_list;
5448
5449 while (CONSP (val) && i < CODING_CATEGORY_IDX_MAX)
5450 {
5451 if (! SYMBOLP (XCAR (val)))
5452 break;
5453 idx = XFASTINT (Fget (XCAR (val), Qcoding_category_index));
5454 if (idx >= CODING_CATEGORY_IDX_MAX)
5455 break;
5456 coding_priorities[i++] = (1 << idx);
5457 val = XCDR (val);
5458 }
5459 /* If coding-category-list is valid and contains all coding
5460 categories, `i' should be CODING_CATEGORY_IDX_MAX now. If not,
5461 the following code saves Emacs from craching. */
5462 while (i < CODING_CATEGORY_IDX_MAX)
5463 coding_priorities[i++] = CODING_CATEGORY_MASK_RAW_TEXT;
5464
5465 return Qnil;
5466 }
5467
5468 #endif /* emacs */
5469
5470 \f
5471 /*** 9. Post-amble ***/
5472
5473 void
5474 init_coding ()
5475 {
5476 conversion_buffer = (char *) xmalloc (MINIMUM_CONVERSION_BUFFER_SIZE);
5477 }
5478
5479 void
5480 init_coding_once ()
5481 {
5482 int i;
5483
5484 /* Emacs' internal format specific initialize routine. */
5485 for (i = 0; i <= 0x20; i++)
5486 emacs_code_class[i] = EMACS_control_code;
5487 emacs_code_class[0x0A] = EMACS_linefeed_code;
5488 emacs_code_class[0x0D] = EMACS_carriage_return_code;
5489 for (i = 0x21 ; i < 0x7F; i++)
5490 emacs_code_class[i] = EMACS_ascii_code;
5491 emacs_code_class[0x7F] = EMACS_control_code;
5492 emacs_code_class[0x80] = EMACS_leading_code_composition;
5493 for (i = 0x81; i < 0xFF; i++)
5494 emacs_code_class[i] = EMACS_invalid_code;
5495 emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
5496 emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
5497 emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
5498 emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;
5499
5500 /* ISO2022 specific initialize routine. */
5501 for (i = 0; i < 0x20; i++)
5502 iso_code_class[i] = ISO_control_code;
5503 for (i = 0x21; i < 0x7F; i++)
5504 iso_code_class[i] = ISO_graphic_plane_0;
5505 for (i = 0x80; i < 0xA0; i++)
5506 iso_code_class[i] = ISO_control_code;
5507 for (i = 0xA1; i < 0xFF; i++)
5508 iso_code_class[i] = ISO_graphic_plane_1;
5509 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
5510 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
5511 iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
5512 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
5513 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
5514 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
5515 iso_code_class[ISO_CODE_ESC] = ISO_escape;
5516 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
5517 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
5518 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
5519
5520 conversion_buffer_size = MINIMUM_CONVERSION_BUFFER_SIZE;
5521
5522 setup_coding_system (Qnil, &keyboard_coding);
5523 setup_coding_system (Qnil, &terminal_coding);
5524 setup_coding_system (Qnil, &safe_terminal_coding);
5525 setup_coding_system (Qnil, &default_buffer_file_coding);
5526
5527 bzero (coding_system_table, sizeof coding_system_table);
5528
5529 bzero (ascii_skip_code, sizeof ascii_skip_code);
5530 for (i = 0; i < 128; i++)
5531 ascii_skip_code[i] = 1;
5532
5533 #if defined (MSDOS) || defined (WINDOWSNT)
5534 system_eol_type = CODING_EOL_CRLF;
5535 #else
5536 system_eol_type = CODING_EOL_LF;
5537 #endif
5538
5539 inhibit_pre_post_conversion = 0;
5540 }
5541
5542 #ifdef emacs
5543
5544 void
5545 syms_of_coding ()
5546 {
5547 Qtarget_idx = intern ("target-idx");
5548 staticpro (&Qtarget_idx);
5549
5550 Qcoding_system_history = intern ("coding-system-history");
5551 staticpro (&Qcoding_system_history);
5552 Fset (Qcoding_system_history, Qnil);
5553
5554 /* Target FILENAME is the first argument. */
5555 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
5556 /* Target FILENAME is the third argument. */
5557 Fput (Qwrite_region, Qtarget_idx, make_number (2));
5558
5559 Qcall_process = intern ("call-process");
5560 staticpro (&Qcall_process);
5561 /* Target PROGRAM is the first argument. */
5562 Fput (Qcall_process, Qtarget_idx, make_number (0));
5563
5564 Qcall_process_region = intern ("call-process-region");
5565 staticpro (&Qcall_process_region);
5566 /* Target PROGRAM is the third argument. */
5567 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
5568
5569 Qstart_process = intern ("start-process");
5570 staticpro (&Qstart_process);
5571 /* Target PROGRAM is the third argument. */
5572 Fput (Qstart_process, Qtarget_idx, make_number (2));
5573
5574 Qopen_network_stream = intern ("open-network-stream");
5575 staticpro (&Qopen_network_stream);
5576 /* Target SERVICE is the fourth argument. */
5577 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
5578
5579 Qcoding_system = intern ("coding-system");
5580 staticpro (&Qcoding_system);
5581
5582 Qeol_type = intern ("eol-type");
5583 staticpro (&Qeol_type);
5584
5585 Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
5586 staticpro (&Qbuffer_file_coding_system);
5587
5588 Qpost_read_conversion = intern ("post-read-conversion");
5589 staticpro (&Qpost_read_conversion);
5590
5591 Qpre_write_conversion = intern ("pre-write-conversion");
5592 staticpro (&Qpre_write_conversion);
5593
5594 Qno_conversion = intern ("no-conversion");
5595 staticpro (&Qno_conversion);
5596
5597 Qundecided = intern ("undecided");
5598 staticpro (&Qundecided);
5599
5600 Qcoding_system_p = intern ("coding-system-p");
5601 staticpro (&Qcoding_system_p);
5602
5603 Qcoding_system_error = intern ("coding-system-error");
5604 staticpro (&Qcoding_system_error);
5605
5606 Fput (Qcoding_system_error, Qerror_conditions,
5607 Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
5608 Fput (Qcoding_system_error, Qerror_message,
5609 build_string ("Invalid coding system"));
5610
5611 Qcoding_category = intern ("coding-category");
5612 staticpro (&Qcoding_category);
5613 Qcoding_category_index = intern ("coding-category-index");
5614 staticpro (&Qcoding_category_index);
5615
5616 Vcoding_category_table
5617 = Fmake_vector (make_number (CODING_CATEGORY_IDX_MAX), Qnil);
5618 staticpro (&Vcoding_category_table);
5619 {
5620 int i;
5621 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
5622 {
5623 XVECTOR (Vcoding_category_table)->contents[i]
5624 = intern (coding_category_name[i]);
5625 Fput (XVECTOR (Vcoding_category_table)->contents[i],
5626 Qcoding_category_index, make_number (i));
5627 }
5628 }
5629
5630 Qtranslation_table = intern ("translation-table");
5631 staticpro (&Qtranslation_table);
5632 Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (1));
5633
5634 Qtranslation_table_id = intern ("translation-table-id");
5635 staticpro (&Qtranslation_table_id);
5636
5637 Qtranslation_table_for_decode = intern ("translation-table-for-decode");
5638 staticpro (&Qtranslation_table_for_decode);
5639
5640 Qtranslation_table_for_encode = intern ("translation-table-for-encode");
5641 staticpro (&Qtranslation_table_for_encode);
5642
5643 Qsafe_charsets = intern ("safe-charsets");
5644 staticpro (&Qsafe_charsets);
5645
5646 Qvalid_codes = intern ("valid-codes");
5647 staticpro (&Qvalid_codes);
5648
5649 Qemacs_mule = intern ("emacs-mule");
5650 staticpro (&Qemacs_mule);
5651
5652 Qraw_text = intern ("raw-text");
5653 staticpro (&Qraw_text);
5654
5655 defsubr (&Scoding_system_p);
5656 defsubr (&Sread_coding_system);
5657 defsubr (&Sread_non_nil_coding_system);
5658 defsubr (&Scheck_coding_system);
5659 defsubr (&Sdetect_coding_region);
5660 defsubr (&Sdetect_coding_string);
5661 defsubr (&Sdecode_coding_region);
5662 defsubr (&Sencode_coding_region);
5663 defsubr (&Sdecode_coding_string);
5664 defsubr (&Sencode_coding_string);
5665 defsubr (&Sdecode_sjis_char);
5666 defsubr (&Sencode_sjis_char);
5667 defsubr (&Sdecode_big5_char);
5668 defsubr (&Sencode_big5_char);
5669 defsubr (&Sset_terminal_coding_system_internal);
5670 defsubr (&Sset_safe_terminal_coding_system_internal);
5671 defsubr (&Sterminal_coding_system);
5672 defsubr (&Sset_keyboard_coding_system_internal);
5673 defsubr (&Skeyboard_coding_system);
5674 defsubr (&Sfind_operation_coding_system);
5675 defsubr (&Supdate_coding_systems_internal);
5676 defsubr (&Sset_coding_priority_internal);
5677
5678 DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
5679 "List of coding systems.\n\
5680 \n\
5681 Do not alter the value of this variable manually. This variable should be\n\
5682 updated by the functions `make-coding-system' and\n\
5683 `define-coding-system-alias'.");
5684 Vcoding_system_list = Qnil;
5685
5686 DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
5687 "Alist of coding system names.\n\
5688 Each element is one element list of coding system name.\n\
5689 This variable is given to `completing-read' as TABLE argument.\n\
5690 \n\
5691 Do not alter the value of this variable manually. This variable should be\n\
5692 updated by the functions `make-coding-system' and\n\
5693 `define-coding-system-alias'.");
5694 Vcoding_system_alist = Qnil;
5695
5696 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
5697 "List of coding-categories (symbols) ordered by priority.");
5698 {
5699 int i;
5700
5701 Vcoding_category_list = Qnil;
5702 for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
5703 Vcoding_category_list
5704 = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
5705 Vcoding_category_list);
5706 }
5707
5708 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
5709 "Specify the coding system for read operations.\n\
5710 It is useful to bind this variable with `let', but do not set it globally.\n\
5711 If the value is a coding system, it is used for decoding on read operation.\n\
5712 If not, an appropriate element is used from one of the coding system alists:\n\
5713 There are three such tables, `file-coding-system-alist',\n\
5714 `process-coding-system-alist', and `network-coding-system-alist'.");
5715 Vcoding_system_for_read = Qnil;
5716
5717 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
5718 "Specify the coding system for write operations.\n\
5719 Programs bind this variable with `let', but you should not set it globally.\n\
5720 If the value is a coding system, it is used for encoding of output,\n\
5721 when writing it to a file and when sending it to a file or subprocess.\n\
5722 \n\
5723 If this does not specify a coding system, an appropriate element\n\
5724 is used from one of the coding system alists:\n\
5725 There are three such tables, `file-coding-system-alist',\n\
5726 `process-coding-system-alist', and `network-coding-system-alist'.\n\
5727 For output to files, if the above procedure does not specify a coding system,\n\
5728 the value of `buffer-file-coding-system' is used.");
5729 Vcoding_system_for_write = Qnil;
5730
5731 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
5732 "Coding system used in the latest file or process I/O.");
5733 Vlast_coding_system_used = Qnil;
5734
5735 DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
5736 "*Non-nil means always inhibit code conversion of end-of-line format.\n\
5737 See info node `Coding Systems' and info node `Text and Binary' concerning\n\
5738 such conversion.");
5739 inhibit_eol_conversion = 0;
5740
5741 DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
5742 "Non-nil means process buffer inherits coding system of process output.\n\
5743 Bind it to t if the process output is to be treated as if it were a file\n\
5744 read from some filesystem.");
5745 inherit_process_coding_system = 0;
5746
5747 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
5748 "Alist to decide a coding system to use for a file I/O operation.\n\
5749 The format is ((PATTERN . VAL) ...),\n\
5750 where PATTERN is a regular expression matching a file name,\n\
5751 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
5752 If VAL is a coding system, it is used for both decoding and encoding\n\
5753 the file contents.\n\
5754 If VAL is a cons of coding systems, the car part is used for decoding,\n\
5755 and the cdr part is used for encoding.\n\
5756 If VAL is a function symbol, the function must return a coding system\n\
5757 or a cons of coding systems which are used as above.\n\
5758 \n\
5759 See also the function `find-operation-coding-system'\n\
5760 and the variable `auto-coding-alist'.");
5761 Vfile_coding_system_alist = Qnil;
5762
5763 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
5764 "Alist to decide a coding system to use for a process I/O operation.\n\
5765 The format is ((PATTERN . VAL) ...),\n\
5766 where PATTERN is a regular expression matching a program name,\n\
5767 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
5768 If VAL is a coding system, it is used for both decoding what received\n\
5769 from the program and encoding what sent to the program.\n\
5770 If VAL is a cons of coding systems, the car part is used for decoding,\n\
5771 and the cdr part is used for encoding.\n\
5772 If VAL is a function symbol, the function must return a coding system\n\
5773 or a cons of coding systems which are used as above.\n\
5774 \n\
5775 See also the function `find-operation-coding-system'.");
5776 Vprocess_coding_system_alist = Qnil;
5777
5778 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
5779 "Alist to decide a coding system to use for a network I/O operation.\n\
5780 The format is ((PATTERN . VAL) ...),\n\
5781 where PATTERN is a regular expression matching a network service name\n\
5782 or is a port number to connect to,\n\
5783 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
5784 If VAL is a coding system, it is used for both decoding what received\n\
5785 from the network stream and encoding what sent to the network stream.\n\
5786 If VAL is a cons of coding systems, the car part is used for decoding,\n\
5787 and the cdr part is used for encoding.\n\
5788 If VAL is a function symbol, the function must return a coding system\n\
5789 or a cons of coding systems which are used as above.\n\
5790 \n\
5791 See also the function `find-operation-coding-system'.");
5792 Vnetwork_coding_system_alist = Qnil;
5793
5794 DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
5795 "Coding system to use with system messages.");
5796 Vlocale_coding_system = Qnil;
5797
5798 DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
5799 "*String displayed in mode line for UNIX-like (LF) end-of-line format.");
5800 eol_mnemonic_unix = build_string (":");
5801
5802 DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
5803 "*String displayed in mode line for DOS-like (CRLF) end-of-line format.");
5804 eol_mnemonic_dos = build_string ("\\");
5805
5806 DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
5807 "*String displayed in mode line for MAC-like (CR) end-of-line format.");
5808 eol_mnemonic_mac = build_string ("/");
5809
5810 DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
5811 "*String displayed in mode line when end-of-line format is not yet determined.");
5812 eol_mnemonic_undecided = build_string (":");
5813
5814 DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
5815 "*Non-nil enables character translation while encoding and decoding.");
5816 Venable_character_translation = Qt;
5817
5818 DEFVAR_LISP ("standard-translation-table-for-decode",
5819 &Vstandard_translation_table_for_decode,
5820 "Table for translating characters while decoding.");
5821 Vstandard_translation_table_for_decode = Qnil;
5822
5823 DEFVAR_LISP ("standard-translation-table-for-encode",
5824 &Vstandard_translation_table_for_encode,
5825 "Table for translating characters while encoding.");
5826 Vstandard_translation_table_for_encode = Qnil;
5827
5828 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
5829 "Alist of charsets vs revision numbers.\n\
5830 While encoding, if a charset (car part of an element) is found,\n\
5831 designate it with the escape sequence identifying revision (cdr part of the element).");
5832 Vcharset_revision_alist = Qnil;
5833
5834 DEFVAR_LISP ("default-process-coding-system",
5835 &Vdefault_process_coding_system,
5836 "Cons of coding systems used for process I/O by default.\n\
5837 The car part is used for decoding a process output,\n\
5838 the cdr part is used for encoding a text to be sent to a process.");
5839 Vdefault_process_coding_system = Qnil;
5840
5841 DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
5842 "Table of extra Latin codes in the range 128..159 (inclusive).\n\
5843 This is a vector of length 256.\n\
5844 If Nth element is non-nil, the existence of code N in a file\n\
5845 \(or output of subprocess) doesn't prevent it to be detected as\n\
5846 a coding system of ISO 2022 variant which has a flag\n\
5847 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file\n\
5848 or reading output of a subprocess.\n\
5849 Only 128th through 159th elements has a meaning.");
5850 Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
5851
5852 DEFVAR_LISP ("select-safe-coding-system-function",
5853 &Vselect_safe_coding_system_function,
5854 "Function to call to select safe coding system for encoding a text.\n\
5855 \n\
5856 If set, this function is called to force a user to select a proper\n\
5857 coding system which can encode the text in the case that a default\n\
5858 coding system used in each operation can't encode the text.\n\
5859 \n\
5860 The default value is `select-safe-coding-system' (which see).");
5861 Vselect_safe_coding_system_function = Qnil;
5862
5863 }
5864
5865 char *
5866 emacs_strerror (error_number)
5867 int error_number;
5868 {
5869 char *str;
5870
5871 synchronize_messages_locale ();
5872 str = strerror (error_number);
5873
5874 if (! NILP (Vlocale_coding_system))
5875 {
5876 Lisp_Object dec = code_convert_string_norecord (build_string (str),
5877 Vlocale_coding_system,
5878 0);
5879 str = (char *) XSTRING (dec)->data;
5880 }
5881
5882 return str;
5883 }
5884
5885 #endif /* emacs */