Use XCAR, XCDR, and XFLOAT_DATA instead of explicit member access.
[bpt/emacs.git] / src / coding.c
1 /* Coding system handler (conversion, detection, and etc).
2 Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
3 Licensed to the Free Software Foundation.
4
5 This file is part of GNU Emacs.
6
7 GNU Emacs is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 2, or (at your option)
10 any later version.
11
12 GNU Emacs is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GNU Emacs; see the file COPYING. If not, write to
19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20 Boston, MA 02111-1307, USA. */
21
22 /*** TABLE OF CONTENTS ***
23
24 1. Preamble
25 2. Emacs' internal format (emacs-mule) handlers
26 3. ISO2022 handlers
27 4. Shift-JIS and BIG5 handlers
28 5. CCL handlers
29 6. End-of-line handlers
30 7. C library functions
31 8. Emacs Lisp library functions
32 9. Post-amble
33
34 */
35
36 /*** GENERAL NOTE on CODING SYSTEM ***
37
38 Coding system is an encoding mechanism of one or more character
39 sets. Here's a list of coding systems which Emacs can handle. When
40 we say "decode", it means converting some other coding system to
41 Emacs' internal format (emacs-internal), and when we say "encode",
42 it means converting the coding system emacs-mule to some other
43 coding system.
44
45 0. Emacs' internal format (emacs-mule)
46
47 Emacs itself holds a multi-lingual character in a buffer and a string
48 in a special format. Details are described in section 2.
49
50 1. ISO2022
51
52 The most famous coding system for multiple character sets. X's
53 Compound Text, various EUCs (Extended Unix Code), and coding
54 systems used in Internet communication such as ISO-2022-JP are
55 all variants of ISO2022. Details are described in section 3.
56
57 2. SJIS (or Shift-JIS or MS-Kanji-Code)
58
59 A coding system to encode character sets: ASCII, JISX0201, and
60 JISX0208. Widely used for PC's in Japan. Details are described in
61 section 4.
62
63 3. BIG5
64
65 A coding system to encode character sets: ASCII and Big5. Widely
66 used by Chinese (mainly in Taiwan and Hong Kong). Details are
67 described in section 4. In this file, when we write "BIG5"
68 (all uppercase), we mean the coding system, and when we write
69 "Big5" (capitalized), we mean the character set.
70
71 4. Raw text
72
73 A coding system for a text containing random 8-bit code. Emacs does
74 no code conversion on such a text except for end-of-line format.
75
76 5. Other
77
78 If a user wants to read/write a text encoded in a coding system not
79 listed above, he can supply a decoder and an encoder for it in CCL
80 (Code Conversion Language) programs. Emacs executes the CCL program
81 while reading/writing.
82
83 Emacs represents a coding system by a Lisp symbol that has a property
84 `coding-system'. But, before actually using the coding system, the
85 information about it is set in a structure of type `struct
86 coding_system' for rapid processing. See section 6 for more details.
87
88 */
89
90 /*** GENERAL NOTES on END-OF-LINE FORMAT ***
91
92 How end-of-line of a text is encoded depends on a system. For
93 instance, Unix's format is just one byte of `line-feed' code,
94 whereas DOS's format is two-byte sequence of `carriage-return' and
95 `line-feed' codes. MacOS's format is usually one byte of
96 `carriage-return'.
97
98 Since text characters encoding and end-of-line encoding are
99 independent, any coding system described above can take
100 any format of end-of-line. So, Emacs has information of format of
101 end-of-line in each coding-system. See section 6 for more details.
102
103 */
104
105 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
106
107 These functions check if a text between SRC and SRC_END is encoded
108 in the coding system category XXX. Each returns an integer value in
109 which appropriate flag bits for the category XXX are set. The flag
110 bits are defined in macros CODING_CATEGORY_MASK_XXX. Below is the
111 template of these functions. */
112 #if 0
113 int
114 detect_coding_emacs_mule (src, src_end)
115 unsigned char *src, *src_end;
116 {
117 ...
118 }
119 #endif
120
121 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
122
123 These functions decode SRC_BYTES length text at SOURCE encoded in
124 CODING to Emacs' internal format (emacs-mule). The resulting text
125 goes to a place pointed to by DESTINATION, the length of which
126 should not exceed DST_BYTES. These functions set the information of
127 original and decoded texts in the members produced, produced_char,
128 consumed, and consumed_char of the structure *CODING.
129
130 The return value is an integer (CODING_FINISH_XXX) indicating how
131 the decoding finished.
132
133 DST_BYTES zero means that source area and destination area are
134 overlapped, which means that we can produce a decoded text until it
135 reaches at the head of not-yet-decoded source text.
136
137 Below is a template of these functions. */
138 #if 0
139 decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
140 struct coding_system *coding;
141 unsigned char *source, *destination;
142 int src_bytes, dst_bytes;
143 {
144 ...
145 }
146 #endif
147
148 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
149
150 These functions encode SRC_BYTES length text at SOURCE of Emacs'
151 internal format (emacs-mule) to CODING. The resulting text goes to
152 a place pointed to by DESTINATION, the length of which should not
153 exceed DST_BYTES. These functions set the information of
154 original and encoded texts in the members produced, produced_char,
155 consumed, and consumed_char of the structure *CODING.
156
157 The return value is an integer (CODING_FINISH_XXX) indicating how
158 the encoding finished.
159
160 DST_BYTES zero means that source area and destination area are
161 overlapped, which means that we can produce an encoded text until it
162 reaches the head of not-yet-encoded source text.
163
164 Below is a template of these functions. */
165 #if 0
166 encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
167 struct coding_system *coding;
168 unsigned char *source, *destination;
169 int src_bytes, dst_bytes;
170 {
171 ...
172 }
173 #endif
174
175 /*** COMMONLY USED MACROS ***/
176
/* The following three macros ONE_MORE_BYTE, TWO_MORE_BYTES, and
   THREE_MORE_BYTES safely get one, two, and three bytes from the
   source text respectively.  Each macro either fetches all of the
   requested bytes, advancing `src' past them, or, if there are not
   enough bytes in the source, leaves `src' untouched and jumps to
   `label_end_of_loop'.  The caller should set variables `src' and
   `src_end' to appropriate areas in advance (with `src' not beyond
   `src_end') and must provide the label `label_end_of_loop'.

   The number of remaining bytes is computed as `src_end - src'
   instead of forming `src + N': creating a pointer more than one
   element past the end of the source area is undefined behavior even
   when the pointer is only compared.  */

#define ONE_MORE_BYTE(c1)                       \
  do {                                          \
    if (src < src_end)                          \
      (c1) = *src++;                            \
    else                                        \
      goto label_end_of_loop;                   \
  } while (0)

#define TWO_MORE_BYTES(c1, c2)                  \
  do {                                          \
    if (src_end - src >= 2)                     \
      {                                         \
        (c1) = *src++;                          \
        (c2) = *src++;                          \
      }                                         \
    else                                        \
      goto label_end_of_loop;                   \
  } while (0)

#define THREE_MORE_BYTES(c1, c2, c3)            \
  do {                                          \
    if (src_end - src >= 3)                     \
      {                                         \
        (c1) = *src++;                          \
        (c2) = *src++;                          \
        (c3) = *src++;                          \
      }                                         \
    else                                        \
      goto label_end_of_loop;                   \
  } while (0)
206
207 /* The following three macros DECODE_CHARACTER_ASCII,
208 DECODE_CHARACTER_DIMENSION1, and DECODE_CHARACTER_DIMENSION2 put
209 the multi-byte form of a character of each class at the place
210 pointed by `dst'. The caller should set the variable `dst' to
211 point to an appropriate area and the variable `coding' to point to
212 the coding-system of the currently decoding text in advance. */
213
/* Decode one ASCII character C.

   When not composing, C is stored as is and counted in
   coding->produced_char; coding->fake_multibyte is set if C is 0x80
   or greater.  When composing, C is stored as the two bytes 0xA0 and
   C | 0x80 and counted in coding->composed_chars;
   coding->fake_multibyte is set if C | 0x80 is below 0xA0.

   C is evaluated more than once, so it must not have side effects.  */

#define DECODE_CHARACTER_ASCII(c)               \
  do {                                          \
    if (! COMPOSING_P (coding->composing))      \
      {                                         \
        *dst++ = (c);                           \
        coding->produced_char++;                \
        if ((c) >= 0x80)                        \
          coding->fake_multibyte = 1;           \
      }                                         \
    else                                        \
      {                                         \
        *dst++ = 0xA0;                          \
        *dst++ = (c) | 0x80;                    \
        coding->composed_chars++;               \
        if (((c) | 0x80) < 0xA0)                \
          coding->fake_multibyte = 1;           \
      }                                         \
  } while (0)
233
/* Decode one DIMENSION1 character whose charset is CHARSET and whose
   position-code is C.

   The base leading-code of CHARSET is stored first (with 0x20 added
   while composing), then the extended leading-code if CHARSET has a
   nonzero one, then C | 0x80.  The character is counted in
   coding->composed_chars while composing and in coding->produced_char
   otherwise.  coding->fake_multibyte is set if C | 0x80 is below
   0xA0.

   CHARSET and C are evaluated more than once.  The expansion declares
   a local named `leading_code', so callers must not pass an argument
   of that name.  */

#define DECODE_CHARACTER_DIMENSION1(charset, c)                         \
  do {                                                                  \
    unsigned char leading_code = CHARSET_LEADING_CODE_BASE (charset);   \
    if (! COMPOSING_P (coding->composing))                              \
      {                                                                 \
        *dst++ = leading_code;                                          \
        coding->produced_char++;                                        \
      }                                                                 \
    else                                                                \
      {                                                                 \
        *dst++ = leading_code + 0x20;                                   \
        coding->composed_chars++;                                       \
      }                                                                 \
    leading_code = CHARSET_LEADING_CODE_EXT (charset);                  \
    if (leading_code != 0)                                              \
      *dst++ = leading_code;                                            \
    *dst++ = (c) | 0x80;                                                \
    if (((c) | 0x80) < 0xA0)                                            \
      coding->fake_multibyte = 1;                                       \
  } while (0)

/* Decode one DIMENSION2 character whose charset is CHARSET and whose
   position-codes are C1 and C2.  The leading-code(s) and C1 are
   produced exactly as for a DIMENSION1 character; C2 | 0x80 is then
   appended, and coding->fake_multibyte is set if that byte is below
   0xA0.  */

#define DECODE_CHARACTER_DIMENSION2(charset, c1, c2)    \
  do {                                                  \
    DECODE_CHARACTER_DIMENSION1 (charset, c1);          \
    *dst++ = (c2) | 0x80;                               \
    if (((c2) | 0x80) < 0xA0)                           \
      {                                                 \
        coding->fake_multibyte = 1;                     \
      }                                                 \
  } while (0)
267
268 \f
269 /*** 1. Preamble ***/
270
271 #include <stdio.h>
272
273 #ifdef emacs
274
275 #include <config.h>
276 #include "lisp.h"
277 #include "buffer.h"
278 #include "charset.h"
279 #include "ccl.h"
280 #include "coding.h"
281 #include "window.h"
282
283 #else /* not emacs */
284
285 #include "mulelib.h"
286
287 #endif /* not emacs */
288
289 Lisp_Object Qcoding_system, Qeol_type;
290 Lisp_Object Qbuffer_file_coding_system;
291 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
292 Lisp_Object Qno_conversion, Qundecided;
293 Lisp_Object Qcoding_system_history;
294 Lisp_Object Qsafe_charsets;
295 Lisp_Object Qvalid_codes;
296
297 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
298 Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
299 Lisp_Object Qstart_process, Qopen_network_stream;
300 Lisp_Object Qtarget_idx;
301
302 Lisp_Object Vselect_safe_coding_system_function;
303
304 /* Mnemonic string for each format of end-of-line. */
305 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
306 /* Mnemonic string to indicate format of end-of-line is not yet
307 decided. */
308 Lisp_Object eol_mnemonic_undecided;
309
310 /* Format of end-of-line decided by system. This is CODING_EOL_LF on
311 Unix, CODING_EOL_CRLF on DOS/Windows, and CODING_EOL_CR on Mac. */
312 int system_eol_type;
313
314 #ifdef emacs
315
316 Lisp_Object Vcoding_system_list, Vcoding_system_alist;
317
318 Lisp_Object Qcoding_system_p, Qcoding_system_error;
319
320 /* Coding system emacs-mule and raw-text are for converting only
321 end-of-line format. */
322 Lisp_Object Qemacs_mule, Qraw_text;
323
324 /* Coding-systems are handed between Emacs Lisp programs and C internal
325 routines by the following three variables. */
326 /* Coding-system for reading files and receiving data from process. */
327 Lisp_Object Vcoding_system_for_read;
328 /* Coding-system for writing files and sending data to process. */
329 Lisp_Object Vcoding_system_for_write;
330 /* Coding-system actually used in the latest I/O. */
331 Lisp_Object Vlast_coding_system_used;
332
333 /* A vector of length 256 which contains information about special
334 Latin codes (especially for dealing with Microsoft codes). */
335 Lisp_Object Vlatin_extra_code_table;
336
337 /* Flag to inhibit code conversion of end-of-line format. */
338 int inhibit_eol_conversion;
339
340 /* Flag to make buffer-file-coding-system inherit from process-coding. */
341 int inherit_process_coding_system;
342
343 /* Coding system to be used to encode text for terminal display. */
344 struct coding_system terminal_coding;
345
346 /* Coding system to be used to encode text for terminal display when
347 terminal coding system is nil. */
348 struct coding_system safe_terminal_coding;
349
350 /* Coding system of what is sent from terminal keyboard. */
351 struct coding_system keyboard_coding;
352
353 /* Default coding system to be used to write a file. */
354 struct coding_system default_buffer_file_coding;
355
356 Lisp_Object Vfile_coding_system_alist;
357 Lisp_Object Vprocess_coding_system_alist;
358 Lisp_Object Vnetwork_coding_system_alist;
359
360 #endif /* emacs */
361
362 Lisp_Object Qcoding_category, Qcoding_category_index;
363
364 /* List of symbols `coding-category-xxx' ordered by priority. */
365 Lisp_Object Vcoding_category_list;
366
367 /* Table of coding categories (Lisp symbols). */
368 Lisp_Object Vcoding_category_table;
369
370 /* Table of names of symbol for each coding-category. */
371 char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
372 "coding-category-emacs-mule",
373 "coding-category-sjis",
374 "coding-category-iso-7",
375 "coding-category-iso-7-tight",
376 "coding-category-iso-8-1",
377 "coding-category-iso-8-2",
378 "coding-category-iso-7-else",
379 "coding-category-iso-8-else",
380 "coding-category-ccl",
381 "coding-category-big5",
382 "coding-category-raw-text",
383 "coding-category-binary"
384 };
385
386 /* Table of pointers to coding systems corresponding to each coding
387 categories. */
388 struct coding_system *coding_system_table[CODING_CATEGORY_IDX_MAX];
389
390 /* Table of coding category masks. Nth element is a mask for a coding
391 category of which priority is Nth. */
392 static
393 int coding_priorities[CODING_CATEGORY_IDX_MAX];
394
395 /* Flag to tell if we look up translation table on character code
396 conversion. */
397 Lisp_Object Venable_character_translation;
398 /* Standard translation table to look up on decoding (reading). */
399 Lisp_Object Vstandard_translation_table_for_decode;
400 /* Standard translation table to look up on encoding (writing). */
401 Lisp_Object Vstandard_translation_table_for_encode;
402
403 Lisp_Object Qtranslation_table;
404 Lisp_Object Qtranslation_table_id;
405 Lisp_Object Qtranslation_table_for_decode;
406 Lisp_Object Qtranslation_table_for_encode;
407
408 /* Alist of charsets vs revision number. */
409 Lisp_Object Vcharset_revision_alist;
410
411 /* Default coding systems used for process I/O. */
412 Lisp_Object Vdefault_process_coding_system;
413
414 \f
415 /*** 2. Emacs internal format (emacs-mule) handlers ***/
416
417 /* Emacs' internal format for encoding multiple character sets is a
418 kind of multi-byte encoding, i.e. characters are encoded by
419 variable-length sequences of one-byte codes. ASCII characters
420 and control characters (e.g. `tab', `newline') are represented by
421 one-byte sequences which are their ASCII codes, in the range 0x00
422 through 0x7F. The other characters are represented by a sequence
423 of `base leading-code', optional `extended leading-code', and one
424 or two `position-code's. The length of the sequence is determined
425 by the base leading-code. Leading-code takes the range 0x80
426 through 0x9F, whereas extended leading-code and position-code take
427 the range 0xA0 through 0xFF. See `charset.h' for more details
428 about leading-code and position-code.
429
430 There's one exception to this rule. Special leading-code
431 `leading-code-composition' denotes that the following several
432 characters should be composed into one character. Leading-codes of
433 components (except for ASCII) are added 0x20. An ASCII character
434 component is represented by a 2-byte sequence of `0xA0' and
435 `ASCII-code + 0x80'. See also the comments in `charset.h' for the
436 details of composite character. Hence, we can summarize the code
437 range as follows:
438
439 --- CODE RANGE of Emacs' internal format ---
440 (character set) (range)
441 ASCII 0x00 .. 0x7F
442 ELSE (1st byte) 0x80 .. 0x9F
443 (rest bytes) 0xA0 .. 0xFF
444 ---------------------------------------------
445
446 */
447
448 enum emacs_code_class_type emacs_code_class[256];
449
/* Consume one byte from SRC and go on to the next statement only if
   that byte is in the range 0xA0..0xFF (0xA0 itself is accepted).
   If the byte is below 0xA0, make the enclosing function return 0.
   If no byte is left (SRC has reached SRC_END), jump to
   `label_end_of_switch' without consuming anything.  The enclosing
   function must provide `src', `src_end', and that label.  */
#define CHECK_CODE_RANGE_A0_FF                  \
  do {                                          \
    if (src >= src_end)                         \
      goto label_end_of_switch;                 \
    else if (*src++ < 0xA0)                     \
      return 0;                                 \
  } while (0)
459
460 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
461 Check if a text is encoded in Emacs' internal format. If it is,
462 return CODING_CATEGORY_MASK_EMACS_MULE, else return 0. */
463
464 int
465 detect_coding_emacs_mule (src, src_end)
466 unsigned char *src, *src_end;
467 {
468 unsigned char c;
469 int composing = 0;
470
471 while (src < src_end)
472 {
473 c = *src++;
474
475 if (composing)
476 {
477 if (c < 0xA0)
478 composing = 0;
479 else
480 c -= 0x20;
481 }
482
483 switch (emacs_code_class[c])
484 {
485 case EMACS_ascii_code:
486 case EMACS_linefeed_code:
487 break;
488
489 case EMACS_control_code:
490 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
491 return 0;
492 break;
493
494 case EMACS_invalid_code:
495 return 0;
496
497 case EMACS_leading_code_composition: /* c == 0x80 */
498 if (composing)
499 CHECK_CODE_RANGE_A0_FF;
500 else
501 composing = 1;
502 break;
503
504 case EMACS_leading_code_4:
505 CHECK_CODE_RANGE_A0_FF;
506 /* fall down to check it two more times ... */
507
508 case EMACS_leading_code_3:
509 CHECK_CODE_RANGE_A0_FF;
510 /* fall down to check it one more time ... */
511
512 case EMACS_leading_code_2:
513 CHECK_CODE_RANGE_A0_FF;
514 break;
515
516 default:
517 label_end_of_switch:
518 break;
519 }
520 }
521 return CODING_CATEGORY_MASK_EMACS_MULE;
522 }
523
524 \f
525 /*** 3. ISO2022 handlers ***/
526
527 /* The following note describes the coding system ISO2022 briefly.
528 Since the intention of this note is to help understand the
529 functions in this file, some parts are NOT ACCURATE or OVERLY
530 SIMPLIFIED. For thorough understanding, please refer to the
531 original document of ISO2022.
532
533 ISO2022 provides many mechanisms to encode several character sets
534 in 7-bit and 8-bit environments. For 7-bit environments, all text
535 is encoded using bytes less than 128. This may make the encoded
536 text a little bit longer, but the text passes more easily through
537 several gateways, some of which strip off MSB (Most Significant Bit).
538
539 There are two kinds of character sets: control character set and
540 graphic character set. The former contains control characters such
541 as `newline' and `escape' to provide control functions (control
542 functions are also provided by escape sequences). The latter
543 contains graphic characters such as 'A' and '-'. Emacs recognizes
544 two control character sets and many graphic character sets.
545
546 Graphic character sets are classified into one of the following
547 four classes, according to the number of bytes (DIMENSION) and
548 number of characters in one dimension (CHARS) of the set:
549 - DIMENSION1_CHARS94
550 - DIMENSION1_CHARS96
551 - DIMENSION2_CHARS94
552 - DIMENSION2_CHARS96
553
554 In addition, each character set is assigned an identification tag,
555 unique for each set, called "final character" (denoted as <F>
556 hereafter). The <F> of each character set is decided by ECMA(*)
557 when it is registered in ISO. The code range of <F> is 0x30..0x7F
558 (0x30..0x3F are for private use only).
559
560 Note (*): ECMA = European Computer Manufacturers Association
561
562 Here are examples of graphic character set [NAME(<F>)]:
563 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
564 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
565 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
566 o DIMENSION2_CHARS96 -- none for the moment
567
568 A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
569 C0 [0x00..0x1F] -- control character plane 0
570 GL [0x20..0x7F] -- graphic character plane 0
571 C1 [0x80..0x9F] -- control character plane 1
572 GR [0xA0..0xFF] -- graphic character plane 1
573
574 A control character set is directly designated and invoked to C0 or
575 C1 by an escape sequence. The most common case is that:
576 - ISO646's control character set is designated/invoked to C0, and
577 - ISO6429's control character set is designated/invoked to C1,
578 and usually these designations/invocations are omitted in encoded
579 text. In a 7-bit environment, only C0 can be used, and a control
580 character for C1 is encoded by an appropriate escape sequence to
581 fit into the environment. All control characters for C1 are
582 defined to have corresponding escape sequences.
583
584 A graphic character set is at first designated to one of four
585 graphic registers (G0 through G3), then these graphic registers are
586 invoked to GL or GR. These designations and invocations can be
587 done independently. The most common case is that G0 is invoked to
588 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
589 these invocations and designations are omitted in encoded text.
590 In a 7-bit environment, only GL can be used.
591
592 When a graphic character set of CHARS94 is invoked to GL, codes
593 0x20 and 0x7F of the GL area work as control characters SPACE and
594 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
595 be used.
596
597 There are two ways of invocation: locking-shift and single-shift.
598 With locking-shift, the invocation lasts until the next different
599 invocation, whereas with single-shift, the invocation affects the
600 following character only and doesn't affect the locking-shift
601 state. Invocations are done by the following control characters or
602 escape sequences:
603
604 ----------------------------------------------------------------------
605 abbrev function cntrl escape seq description
606 ----------------------------------------------------------------------
607 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
608 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
609 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
610 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
611 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
612 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
613 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
614 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
615 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
616 ----------------------------------------------------------------------
617 (*) These are not used by any known coding system.
618
619 Control characters for these functions are defined by macros
620 ISO_CODE_XXX in `coding.h'.
621
622 Designations are done by the following escape sequences:
623 ----------------------------------------------------------------------
624 escape sequence description
625 ----------------------------------------------------------------------
626 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
627 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
628 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
629 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
630 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
631 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
632 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
633 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
634 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
635 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
636 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
637 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
638 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
639 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
640 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
641 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
642 ----------------------------------------------------------------------
643
644 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
645 of dimension 1, chars 94, and final character <F>, etc...
646
647 Note (*): Although these designations are not allowed in ISO2022,
648 Emacs accepts them on decoding, and produces them on encoding
649 CHARS96 character sets in a coding system which is characterized as
650 7-bit environment, non-locking-shift, and non-single-shift.
651
652 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
653 '(' can be omitted. We refer to this as "short-form" hereafter.
654
655 Now you may notice that there are a lot of ways for encoding the
656 same multilingual text in ISO2022. Actually, there exist many
657 coding systems such as Compound Text (used in X11's inter client
658 communication), ISO-2022-JP (used in Japanese internet), ISO-2022-KR
659 (used in Korean internet), EUC (Extended UNIX Code, used in Asian
660 localized platforms), and all of these are variants of ISO2022.
661
662 In addition to the above, Emacs handles two more kinds of escape
663 sequences: ISO6429's direction specification and Emacs' private
664 sequence for specifying character composition.
665
666 ISO6429's direction specification takes the following form:
667 o CSI ']' -- end of the current direction
668 o CSI '0' ']' -- end of the current direction
669 o CSI '1' ']' -- start of left-to-right text
670 o CSI '2' ']' -- start of right-to-left text
671 The control character CSI (0x9B: control sequence introducer) is
672 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
673
674 Character composition specification takes the following form:
675 o ESC '0' -- start character composition
676 o ESC '1' -- end character composition
677 Since these are not standard escape sequences of any ISO standard,
678 the use of them for these meaning is restricted to Emacs only. */
679
680 enum iso_code_class_type iso_code_class[256];
681
/* Nonzero if a coding system is registered for the coding category
   index IDX and it accepts CHARSET: either CHARSET is marked in the
   coding system's safe_charsets table, or the coding system has a
   requested designation for CHARSET.  IDX and CHARSET are evaluated
   more than once.  */
#define CHARSET_OK(idx, charset)                                        \
  (coding_system_table[idx] != 0                                        \
   && (coding_system_table[idx]->safe_charsets[charset]                 \
       || (CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION                     \
           != CODING_SPEC_ISO_REQUESTED_DESIGNATION                     \
                (coding_system_table[idx], charset))))
688
/* Nonzero if a coding system is registered for the coding category
   index IDX and its initial designation for graphic register 1 is
   non-negative.  The table entry is checked first, as CHARSET_OK
   does, so that a category without a coding system yields 0 instead
   of dereferencing a null pointer.  IDX is evaluated more than
   once.  */
#define SHIFT_OUT_OK(idx)                                               \
  (coding_system_table[idx]                                             \
   && (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding_system_table[idx], 1) \
       >= 0))
691
692 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
693 Check if a text is encoded in ISO2022. If it is, returns an
694 integer in which appropriate flag bits any of:
695 CODING_CATEGORY_MASK_ISO_7
696 CODING_CATEGORY_MASK_ISO_7_TIGHT
697 CODING_CATEGORY_MASK_ISO_8_1
698 CODING_CATEGORY_MASK_ISO_8_2
699 CODING_CATEGORY_MASK_ISO_7_ELSE
700 CODING_CATEGORY_MASK_ISO_8_ELSE
701 are set. If a code which should never appear in ISO2022 is found,
702 returns 0. */
703
704 int
705 detect_coding_iso2022 (src, src_end)
706 unsigned char *src, *src_end;
707 {
708 int mask = CODING_CATEGORY_MASK_ISO;
709 int mask_found = 0;
710 int reg[4], shift_out = 0, single_shifting = 0;
711 int c, c1, i, charset;
712
713 reg[0] = CHARSET_ASCII, reg[1] = reg[2] = reg[3] = -1;
714 while (mask && src < src_end)
715 {
716 c = *src++;
717 switch (c)
718 {
719 case ISO_CODE_ESC:
720 single_shifting = 0;
721 if (src >= src_end)
722 break;
723 c = *src++;
724 if (c >= '(' && c <= '/')
725 {
726 /* Designation sequence for a charset of dimension 1. */
727 if (src >= src_end)
728 break;
729 c1 = *src++;
730 if (c1 < ' ' || c1 >= 0x80
731 || (charset = iso_charset_table[0][c >= ','][c1]) < 0)
732 /* Invalid designation sequence. Just ignore. */
733 break;
734 reg[(c - '(') % 4] = charset;
735 }
736 else if (c == '$')
737 {
738 /* Designation sequence for a charset of dimension 2. */
739 if (src >= src_end)
740 break;
741 c = *src++;
742 if (c >= '@' && c <= 'B')
743 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
744 reg[0] = charset = iso_charset_table[1][0][c];
745 else if (c >= '(' && c <= '/')
746 {
747 if (src >= src_end)
748 break;
749 c1 = *src++;
750 if (c1 < ' ' || c1 >= 0x80
751 || (charset = iso_charset_table[1][c >= ','][c1]) < 0)
752 /* Invalid designation sequence. Just ignore. */
753 break;
754 reg[(c - '(') % 4] = charset;
755 }
756 else
757 /* Invalid designation sequence. Just ignore. */
758 break;
759 }
760 else if (c == 'N' || c == 'O')
761 {
762 /* ESC <Fe> for SS2 or SS3. */
763 mask &= CODING_CATEGORY_MASK_ISO_7_ELSE;
764 break;
765 }
766 else if (c == '0' || c == '1' || c == '2')
767 /* ESC <Fp> for start/end composition. Just ignore. */
768 break;
769 else
770 /* Invalid escape sequence. Just ignore. */
771 break;
772
773 /* We found a valid designation sequence for CHARSET. */
774 mask &= ~CODING_CATEGORY_MASK_ISO_8BIT;
775 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7, charset))
776 mask_found |= CODING_CATEGORY_MASK_ISO_7;
777 else
778 mask &= ~CODING_CATEGORY_MASK_ISO_7;
779 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT, charset))
780 mask_found |= CODING_CATEGORY_MASK_ISO_7_TIGHT;
781 else
782 mask &= ~CODING_CATEGORY_MASK_ISO_7_TIGHT;
783 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_ELSE, charset))
784 mask_found |= CODING_CATEGORY_MASK_ISO_7_ELSE;
785 else
786 mask &= ~CODING_CATEGORY_MASK_ISO_7_ELSE;
787 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_8_ELSE, charset))
788 mask_found |= CODING_CATEGORY_MASK_ISO_8_ELSE;
789 else
790 mask &= ~CODING_CATEGORY_MASK_ISO_8_ELSE;
791 break;
792
793 case ISO_CODE_SO:
794 single_shifting = 0;
795 if (shift_out == 0
796 && (reg[1] >= 0
797 || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_7_ELSE)
798 || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_8_ELSE)))
799 {
800 /* Locking shift out. */
801 mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
802 mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
803 }
804 break;
805
806 case ISO_CODE_SI:
807 single_shifting = 0;
808 if (shift_out == 1)
809 {
810 /* Locking shift in. */
811 mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
812 mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
813 }
814 break;
815
816 case ISO_CODE_CSI:
817 single_shifting = 0;
818 case ISO_CODE_SS2:
819 case ISO_CODE_SS3:
820 {
821 int newmask = CODING_CATEGORY_MASK_ISO_8_ELSE;
822
823 if (c != ISO_CODE_CSI)
824 {
825 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
826 & CODING_FLAG_ISO_SINGLE_SHIFT)
827 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
828 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
829 & CODING_FLAG_ISO_SINGLE_SHIFT)
830 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
831 single_shifting = 1;
832 }
833 if (VECTORP (Vlatin_extra_code_table)
834 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
835 {
836 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
837 & CODING_FLAG_ISO_LATIN_EXTRA)
838 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
839 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
840 & CODING_FLAG_ISO_LATIN_EXTRA)
841 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
842 }
843 mask &= newmask;
844 mask_found |= newmask;
845 }
846 break;
847
848 default:
849 if (c < 0x80)
850 {
851 single_shifting = 0;
852 break;
853 }
854 else if (c < 0xA0)
855 {
856 single_shifting = 0;
857 if (VECTORP (Vlatin_extra_code_table)
858 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
859 {
860 int newmask = 0;
861
862 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
863 & CODING_FLAG_ISO_LATIN_EXTRA)
864 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
865 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
866 & CODING_FLAG_ISO_LATIN_EXTRA)
867 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
868 mask &= newmask;
869 mask_found |= newmask;
870 }
871 else
872 return 0;
873 }
874 else
875 {
876 unsigned char *src_begin = src;
877
878 mask &= ~(CODING_CATEGORY_MASK_ISO_7BIT
879 | CODING_CATEGORY_MASK_ISO_7_ELSE);
880 mask_found |= CODING_CATEGORY_MASK_ISO_8_1;
881 /* Check the length of succeeding codes of the range
882 0xA0..0FF. If the byte length is odd, we exclude
883 CODING_CATEGORY_MASK_ISO_8_2. We can check this only
884 when we are not single shifting. */
885 if (!single_shifting)
886 {
887 while (src < src_end && *src >= 0xA0)
888 src++;
889 if ((src - src_begin - 1) & 1 && src < src_end)
890 mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
891 else
892 mask_found |= CODING_CATEGORY_MASK_ISO_8_2;
893 }
894 }
895 break;
896 }
897 }
898
899 return (mask & mask_found);
900 }
901
/* Decode one character of CHARSET whose first position code is C1 and
   store its internal representation at DST.  When CHARSET is
   two-dimensional, the second position code is read from SRC into C2.
   A negative CHARSET means we are decoding ill-formed text; all we can
   do then is to emit C1 as is.  */

#define DECODE_ISO_CHARACTER(charset, c1) \
  do { \
    int translated_char; \
    int effective_charset = (charset); \
 \
    if (COMPOSING_HEAD_P (coding->composing)) \
      { \
        *dst++ = LEADING_CODE_COMPOSITION; \
        if (COMPOSING_WITH_RULE_P (coding->composing)) \
          /* Tell that composition rules are embedded.  */ \
          *dst++ = 0xFF; \
        coding->composing += 2; \
      } \
    if (effective_charset >= 0) \
      { \
        if (CHARSET_DIMENSION (effective_charset) == 2) \
          { \
            ONE_MORE_BYTE (c2); \
            if (! (iso_code_class[(c2) & 0x7F] == ISO_0x20_or_0x7F \
                   || iso_code_class[(c2) & 0x7F] == ISO_graphic_plane_0)) \
              { \
                /* C2 is not a graphic byte: give it back to the \
                   source and handle C1 as ASCII.  */ \
                src--; \
                effective_charset = CHARSET_ASCII; \
              } \
          } \
        if (!NILP (translation_table)) \
          { \
            translated_char = translate_char (translation_table, -1, \
                                              effective_charset, c1, c2); \
            if (translated_char >= 0) \
              SPLIT_CHAR (translated_char, effective_charset, c1, c2); \
          } \
      } \
    if (effective_charset < 0 || effective_charset == CHARSET_ASCII) \
      DECODE_CHARACTER_ASCII (c1); \
    else if (CHARSET_DIMENSION (effective_charset) == 1) \
      DECODE_CHARACTER_DIMENSION1 (effective_charset, c1); \
    else \
      DECODE_CHARACTER_DIMENSION2 (effective_charset, c1, c2); \
    if (COMPOSING_WITH_RULE_P (coding->composing)) \
      /* Tell that a composition rule comes next.  */ \
      coding->composing = COMPOSING_WITH_RULE_RULE; \
  } while (0)
946
/* Set designation state into CODING: designate the charset identified
   by DIMENSION, CHARS and FINAL_CHAR to graphic register REG.

   Control jumps to label_invalid_code when FINAL_CHAR is outside
   '0'..127, when the charset found is neither requested for REG nor
   marked safe in CODING (or no charset is found), and when ASCII is
   designated to register 0 while last_invalid_designation_register
   is 0.

   The arguments are expressions at some call sites (e.g.
   `c1 - 0x28'), so each use is parenthesized.  */
#define DECODE_DESIGNATION(reg, dimension, chars, final_char) \
  do { \
    int charset; \
 \
    if ((final_char) < '0' || (final_char) >= 128) \
      goto label_invalid_code; \
    charset = ISO_CHARSET_TABLE (make_number (dimension), \
                                 make_number (chars), \
                                 make_number (final_char)); \
    if (charset >= 0 \
        && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) == (reg) \
            || coding->safe_charsets[charset])) \
      { \
        if (coding->spec.iso2022.last_invalid_designation_register == 0 \
            && (reg) == 0 \
            && charset == CHARSET_ASCII) \
          { \
            /* We should insert this designation sequence as is so \
               that it is surely written back to a file.  */ \
            coding->spec.iso2022.last_invalid_designation_register = -1; \
            goto label_invalid_code; \
          } \
        coding->spec.iso2022.last_invalid_designation_register = -1; \
        if ((coding->mode & CODING_MODE_DIRECTION) \
            && CHARSET_REVERSE_CHARSET (charset) >= 0) \
          charset = CHARSET_REVERSE_CHARSET (charset); \
        CODING_SPEC_ISO_DESIGNATION (coding, (reg)) = charset; \
      } \
    else \
      { \
        /* Remember which register the rejected designation named.  */ \
        coding->spec.iso2022.last_invalid_designation_register = (reg); \
        goto label_invalid_code; \
      } \
  } while (0)
982
983 /* Return 0 if there's a valid composing sequence starting at SRC and
984 ending before SRC_END, else return -1. */
985
986 int
987 check_composing_code (coding, src, src_end)
988 struct coding_system *coding;
989 unsigned char *src, *src_end;
990 {
991 int charset, c, c1, dim;
992
993 while (src < src_end)
994 {
995 c = *src++;
996 if (c >= 0x20)
997 continue;
998 if (c != ISO_CODE_ESC || src >= src_end)
999 return -1;
1000 c = *src++;
1001 if (c == '1') /* end of compsition */
1002 return 0;
1003 if (src + 2 >= src_end
1004 || !coding->flags & CODING_FLAG_ISO_DESIGNATION)
1005 return -1;
1006
1007 dim = (c == '$');
1008 if (dim == 1)
1009 c = (*src >= '@' && *src <= 'B') ? '(' : *src++;
1010 if (c >= '(' && c <= '/')
1011 {
1012 c1 = *src++;
1013 if ((c1 < ' ' || c1 >= 0x80)
1014 || (charset = iso_charset_table[dim][c >= ','][c1]) < 0
1015 || ! coding->safe_charsets[charset]
1016 || (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
1017 == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
1018 return -1;
1019 }
1020 else
1021 return -1;
1022 }
1023
1024 /* We have not found the sequence "ESC 1". */
1025 return -1;
1026 }
1027
1028 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
/* Decode the SRC_BYTES bytes of ISO2022 text at SOURCE and write the
   result at DESTINATION.  When DST_BYTES is nonzero, the main loop
   runs only while DST is below DESTINATION + DST_BYTES - 6; when
   DST_BYTES is zero it runs only while DST < SRC - 6 (presumably the
   case of SOURCE and DESTINATION sharing a buffer -- confirm against
   callers).  On return CODING->consumed and CODING->consumed_char
   hold the number of source bytes consumed and CODING->produced the
   number of bytes written.  The value returned is one of the
   CODING_FINISH_* codes assigned to RESULT below.  */
1029
1030 int
1031 decode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
1032 struct coding_system *coding;
1033 unsigned char *source, *destination;
1034 int src_bytes, dst_bytes;
1035 {
1036 unsigned char *src = source;
1037 unsigned char *src_end = source + src_bytes;
1038 unsigned char *dst = destination;
1039 unsigned char *dst_end = destination + dst_bytes;
1040 /* Since the maximum bytes produced by each loop is 7, we subtract 6
1041 from DST_END to assure that overflow checking is necessary only
1042 at the head of loop. */
1043 unsigned char *adjusted_dst_end = dst_end - 6;
1044 int charset;
1045 /* Charsets invoked to graphic plane 0 and 1 respectively. */
1046 int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1047 int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1048 Lisp_Object translation_table
1049 = coding->translation_table_for_decode;
1050 int result = CODING_FINISH_NORMAL;
1051
/* Fall back to the standard decode translation table when character
   translation is enabled and CODING supplies no table of its own.  */
1052 if (!NILP (Venable_character_translation) && NILP (translation_table))
1053 translation_table = Vstandard_translation_table_for_decode;
1054
1055 coding->produced_char = 0;
1056 coding->fake_multibyte = 0;
1057 while (src < src_end && (dst_bytes
1058 ? (dst < adjusted_dst_end)
1059 : (dst < src - 6)))
1060 {
1061 /* SRC_BASE remembers the start position in source in each loop.
1062 The loop will be exited when there's not enough source text
1063 to analyze long escape sequence or 2-byte code (within macros
1064 ONE_MORE_BYTE or TWO_MORE_BYTES). In that case, SRC is reset
1065 to SRC_BASE before exiting. */
1066 unsigned char *src_base = src;
1067 int c1 = *src++, c2;
1068
1069 switch (iso_code_class [c1])
1070 {
1071 case ISO_0x20_or_0x7F:
1072 if (!coding->composing
1073 && (charset0 < 0 || CHARSET_CHARS (charset0) == 94))
1074 {
1075 /* This is SPACE or DEL. */
1076 *dst++ = c1;
1077 coding->produced_char++;
1078 break;
1079 }
1080 /* This is a graphic character, we fall down ... */
1081
1082 case ISO_graphic_plane_0:
1083 if (coding->composing == COMPOSING_WITH_RULE_RULE)
1084 {
1085 /* This is a composition rule. */
1086 *dst++ = c1 | 0x80;
1087 coding->composing = COMPOSING_WITH_RULE_TAIL;
1088 }
1089 else
1090 DECODE_ISO_CHARACTER (charset0, c1);
1091 break;
1092
1093 case ISO_0xA0_or_0xFF:
1094 if (charset1 < 0 || CHARSET_CHARS (charset1) == 94
1095 || coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
1096 goto label_invalid_code;
1097 /* This is a graphic character, we fall down ... */
1098
1099 case ISO_graphic_plane_1:
1100 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
1101 goto label_invalid_code;
1102 else
1103 DECODE_ISO_CHARACTER (charset1, c1);
1104 break;
1105
1106 case ISO_control_code:
1107 /* All ISO2022 control characters in this class have the
1108 same representation in Emacs internal format. */
1109 if (c1 == '\n'
1110 && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1111 && (coding->eol_type == CODING_EOL_CR
1112 || coding->eol_type == CODING_EOL_CRLF))
1113 {
1114 result = CODING_FINISH_INCONSISTENT_EOL;
1115 goto label_end_of_loop_2;
1116 }
1117 *dst++ = c1;
1118 coding->produced_char++;
1119 if (c1 >= 0x80)
1120 coding->fake_multibyte = 1;
1121 break;
1122
1123 case ISO_carriage_return:
1124 if (coding->eol_type == CODING_EOL_CR)
1125 *dst++ = '\n';
1126 else if (coding->eol_type == CODING_EOL_CRLF)
1127 {
1128 ONE_MORE_BYTE (c1);
1129 if (c1 == ISO_CODE_LF)
1130 *dst++ = '\n';
1131 else
1132 {
1133 if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1134 {
1135 result = CODING_FINISH_INCONSISTENT_EOL;
1136 goto label_end_of_loop_2;
1137 }
/* A lone CR: give back the byte just read and emit the CR itself.  */
1138 src--;
1139 *dst++ = '\r';
1140 }
1141 }
1142 else
1143 *dst++ = c1;
1144 coding->produced_char++;
1145 break;
1146
1147 case ISO_shift_out:
1148 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1149 || CODING_SPEC_ISO_DESIGNATION (coding, 1) < 0)
1150 goto label_invalid_code;
1151 CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
1152 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1153 break;
1154
1155 case ISO_shift_in:
1156 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
1157 goto label_invalid_code;
1158 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
1159 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1160 break;
1161
1162 case ISO_single_shift_2_7:
1163 case ISO_single_shift_2:
1164 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1165 goto label_invalid_code;
1166 /* SS2 is handled as an escape sequence of ESC 'N' */
1167 c1 = 'N';
1168 goto label_escape_sequence;
1169
1170 case ISO_single_shift_3:
1171 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1172 goto label_invalid_code;
1173 /* SS3 is handled as an escape sequence of ESC 'O' */
1174 c1 = 'O';
1175 goto label_escape_sequence;
1176
1177 case ISO_control_sequence_introducer:
1178 /* CSI is handled as an escape sequence of ESC '[' ... */
1179 c1 = '[';
1180 goto label_escape_sequence;
1181
1182 case ISO_escape:
1183 ONE_MORE_BYTE (c1);
1184 label_escape_sequence:
1185 /* Escape sequences handled by Emacs are invocation,
1186 designation, direction specification, and character
1187 composition specification. */
1188 switch (c1)
1189 {
1190 case '&': /* revision of following character set */
1191 ONE_MORE_BYTE (c1);
1192 if (!(c1 >= '@' && c1 <= '~'))
1193 goto label_invalid_code;
1194 ONE_MORE_BYTE (c1);
1195 if (c1 != ISO_CODE_ESC)
1196 goto label_invalid_code;
1197 ONE_MORE_BYTE (c1);
1198 goto label_escape_sequence;
1199
1200 case '$': /* designation of 2-byte character set */
1201 if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
1202 goto label_invalid_code;
1203 ONE_MORE_BYTE (c1);
1204 if (c1 >= '@' && c1 <= 'B')
1205 { /* designation of JISX0208.1978, GB2312.1980,
1206 or JISX0208.1980 */
1207 DECODE_DESIGNATION (0, 2, 94, c1);
1208 }
1209 else if (c1 >= 0x28 && c1 <= 0x2B)
1210 { /* designation of DIMENSION2_CHARS94 character set */
1211 ONE_MORE_BYTE (c2);
1212 DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
1213 }
1214 else if (c1 >= 0x2C && c1 <= 0x2F)
1215 { /* designation of DIMENSION2_CHARS96 character set */
1216 ONE_MORE_BYTE (c2);
1217 DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
1218 }
1219 else
1220 goto label_invalid_code;
1221 break;
1222
1223 case 'n': /* invocation of locking-shift-2 */
1224 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1225 || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
1226 goto label_invalid_code;
1227 CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
1228 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1229 break;
1230
1231 case 'o': /* invocation of locking-shift-3 */
1232 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1233 || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
1234 goto label_invalid_code;
1235 CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
1236 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1237 break;
1238
1239 case 'N': /* invocation of single-shift-2 */
1240 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1241 || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
1242 goto label_invalid_code;
1243 ONE_MORE_BYTE (c1);
1244 charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
1245 DECODE_ISO_CHARACTER (charset, c1);
1246 break;
1247
1248 case 'O': /* invocation of single-shift-3 */
1249 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1250 || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
1251 goto label_invalid_code;
1252 ONE_MORE_BYTE (c1);
1253 charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
1254 DECODE_ISO_CHARACTER (charset, c1);
1255 break;
1256
1257 case '0': case '2': /* start composing */
1258 /* Before processing composing, we must be sure that all
1259 characters being composed are supported by CODING.
1260 If not, we must give up composing. */
1261 if (check_composing_code (coding, src, src_end) == 0)
1262 {
1263 /* We are looking at a valid composition sequence. */
1264 coding->composing = (c1 == '0'
1265 ? COMPOSING_NO_RULE_HEAD
1266 : COMPOSING_WITH_RULE_HEAD);
1267 coding->composed_chars = 0;
1268 }
1269 else
1270 {
/* Not a usable composition: pass the escape sequence through.  */
1271 *dst++ = ISO_CODE_ESC;
1272 *dst++ = c1;
1273 coding->produced_char += 2;
1274 }
1275 break;
1276
1277 case '1': /* end composing */
1278 if (!coding->composing)
1279 {
1280 *dst++ = ISO_CODE_ESC;
1281 *dst++ = c1;
1282 coding->produced_char += 2;
1283 break;
1284 }
1285
1286 if (coding->composed_chars > 0)
1287 {
1288 if (coding->composed_chars == 1)
1289 {
1290 unsigned char *this_char_start = dst;
1291 int this_bytes;
1292
1293 /* Only one character is in the composing
1294 sequence. Make it a normal character. */
/* Scan backward from DST for the composition leading code.  */
1295 while (*--this_char_start != LEADING_CODE_COMPOSITION);
1296 dst = (this_char_start
1297 + (coding->composing == COMPOSING_NO_RULE_TAIL
1298 ? 1 : 2));
1299 *dst -= 0x20;
1300 if (*dst == 0x80)
1301 *++dst &= 0x7F;
1302 this_bytes = BYTES_BY_CHAR_HEAD (*dst);
/* Move the character down over the composition header bytes.  */
1303 while (this_bytes--) *this_char_start++ = *dst++;
1304 dst = this_char_start;
1305 }
1306 coding->produced_char++;
1307 }
1308 coding->composing = COMPOSING_NO;
1309 break;
1310
1311 case '[': /* specification of direction */
1312 if (coding->flags & CODING_FLAG_ISO_NO_DIRECTION)
1313 goto label_invalid_code;
1314 /* For the moment, nested direction is not supported.
1315 So, `coding->mode & CODING_MODE_DIRECTION' zero means
1316 left-to-right, and nonzero means right-to-left. */
1317 ONE_MORE_BYTE (c1);
1318 switch (c1)
1319 {
1320 case ']': /* end of the current direction */
1321 coding->mode &= ~CODING_MODE_DIRECTION;
/* NOTE(review): there is no `break' here, so control falls into
   the '0'/'1' case, which reads one more byte and requires it to
   be ']'.  This looks like a missing `break' -- confirm.  */
1322
1323 case '0': /* end of the current direction */
1324 case '1': /* start of left-to-right direction */
1325 ONE_MORE_BYTE (c1);
1326 if (c1 == ']')
1327 coding->mode &= ~CODING_MODE_DIRECTION;
1328 else
1329 goto label_invalid_code;
1330 break;
1331
1332 case '2': /* start of right-to-left direction */
1333 ONE_MORE_BYTE (c1);
1334 if (c1 == ']')
1335 coding->mode |= CODING_MODE_DIRECTION;
1336 else
1337 goto label_invalid_code;
1338 break;
1339
1340 default:
1341 goto label_invalid_code;
1342 }
1343 break;
1344
1345 default:
1346 if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
1347 goto label_invalid_code;
1348 if (c1 >= 0x28 && c1 <= 0x2B)
1349 { /* designation of DIMENSION1_CHARS94 character set */
1350 ONE_MORE_BYTE (c2);
1351 DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
1352 }
1353 else if (c1 >= 0x2C && c1 <= 0x2F)
1354 { /* designation of DIMENSION1_CHARS96 character set */
1355 ONE_MORE_BYTE (c2);
1356 DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
1357 }
1358 else
1359 {
1360 goto label_invalid_code;
1361 }
1362 }
1363 /* We must update these variables now. */
1364 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1365 charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1366 break;
1367
/* Copy the source bytes consumed in this iteration to the output
   unchanged.  */
1368 label_invalid_code:
1369 while (src_base < src)
1370 *dst++ = *src_base++;
1371 coding->fake_multibyte = 1;
1372 }
1373 continue;
1374
/* Reached only by `goto': rewind SRC to the start of the current
   iteration and leave the loop.  */
1375 label_end_of_loop:
1376 result = CODING_FINISH_INSUFFICIENT_SRC;
1377 label_end_of_loop_2:
1378 src = src_base;
1379 break;
1380 }
1381
1382 if (src < src_end)
1383 {
1384 if (result == CODING_FINISH_NORMAL)
1385 result = CODING_FINISH_INSUFFICIENT_DST;
1386 else if (result != CODING_FINISH_INCONSISTENT_EOL
1387 && coding->mode & CODING_MODE_LAST_BLOCK)
1388 {
1389 /* This is the last block of the text to be decoded. We had
1390 better just flush out all remaining codes in the text
1391 although they are not valid characters. */
1392 src_bytes = src_end - src;
1393 if (dst_bytes && (dst_end - dst < src_bytes))
1394 src_bytes = dst_end - dst;
1395 bcopy (src, dst, src_bytes);
1396 dst += src_bytes;
1397 src += src_bytes;
1398 coding->fake_multibyte = 1;
1399 }
1400 }
1401
1402 coding->consumed = coding->consumed_char = src - source;
1403 coding->produced = dst - destination;
1404 return result;
1405 }
1406
1407 /* ISO2022 encoding stuff. */
1408
1409 /*
1410 It is not enough to say just "ISO2022" on encoding, we have to
1411 specify more details. In Emacs, each coding system of ISO2022
1412 variant has the following specifications:
1413 1. Initial designation to G0 thru G3.
1414 2. Allows short-form designation?
1415 3. ASCII should be designated to G0 before control characters?
1416 4. ASCII should be designated to G0 at end of line?
1417 5. 7-bit environment or 8-bit environment?
1418 6. Use locking-shift?
1419 7. Use Single-shift?
1420 And the following two are only for Japanese:
1421 8. Use ASCII in place of JIS0201-1976-Roman?
1422 9. Use JISX0208-1983 in place of JISX0208-1978?
1423 These specifications are encoded in `coding->flags' as flag bits
1424 defined by macros CODING_FLAG_ISO_XXX. See `coding.h' for more
1425 details.
1426 */
1427
/* Write at DST the escape sequence that designates CHARSET to graphic
   register REG, and record the designation in CODING.  The sequence
   is preceded by a revision sequence (ESC '&' ...) when the revision
   number of CHARSET is below 255.  For a 2-byte 94-charset designated
   to register 0 whose <final-char> is '@', 'A', or 'B', the
   intermediate character is left out (short-form) if CODING has
   CODING_FLAG_ISO_SHORT_FORM set.  */

#define ENCODE_DESIGNATION(charset, reg, coding) \
  do { \
    unsigned char final_byte = CHARSET_ISO_FINAL_CHAR (charset); \
    int revision_number = CODING_SPEC_ISO_REVISION_NUMBER(coding, charset); \
    /* Intermediate characters for 94- and 96-charsets, indexed by \
       graphic register.  */ \
    char *intermediates_94 = "()*+"; \
    char *intermediates_96 = ",-./"; \
 \
    if (revision_number < 255) \
      { \
        *dst++ = ISO_CODE_ESC; \
        *dst++ = '&'; \
        *dst++ = '@' + revision_number; \
      } \
    *dst++ = ISO_CODE_ESC; \
    if (CHARSET_DIMENSION (charset) != 1) \
      { \
        *dst++ = '$'; \
        if (CHARSET_CHARS (charset) != 94) \
          *dst++ = (unsigned char) (intermediates_96[reg]); \
        else if (! ((coding->flags & CODING_FLAG_ISO_SHORT_FORM) \
                    && reg == 0 \
                    && final_byte >= '@' && final_byte <= 'B')) \
          /* Short-form does not apply: emit the intermediate.  */ \
          *dst++ = (unsigned char) (intermediates_94[reg]); \
      } \
    else if (CHARSET_CHARS (charset) == 94) \
      *dst++ = (unsigned char) (intermediates_94[reg]); \
    else \
      *dst++ = (unsigned char) (intermediates_96[reg]); \
    *dst++ = final_byte; \
    CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset; \
  } while (0)
1469
/* The following two macros produce codes (control character or escape
   sequence) for ISO2022 single-shift functions (single-shift-2 and
   single-shift-3).  The escape sequence is used when CODING has
   CODING_FLAG_ISO_SEVEN_BITS set, the control character otherwise.  */

#define ENCODE_SINGLE_SHIFT_2 \
  do { \
    if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)) \
      { \
        *dst++ = ISO_CODE_SS2; \
        coding->fake_multibyte = 1; \
      } \
    else \
      { \
        *dst++ = ISO_CODE_ESC; \
        *dst++ = 'N'; \
      } \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1; \
  } while (0)
1485
/* Produce the code for single-shift-3: ESC 'O' when CODING has
   CODING_FLAG_ISO_SEVEN_BITS set, else the SS3 control character.  */
#define ENCODE_SINGLE_SHIFT_3 \
  do { \
    if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)) \
      { \
        *dst++ = ISO_CODE_SS3; \
        coding->fake_multibyte = 1; \
      } \
    else \
      { \
        *dst++ = ISO_CODE_ESC; \
        *dst++ = 'O'; \
      } \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1; \
  } while (0)
1497
/* The following four macros produce codes (control character or
   escape sequence) for ISO2022 locking-shift functions (shift-in,
   shift-out, locking-shift-2, and locking-shift-3).  Each one also
   records in CODING which graphic register is now invoked to graphic
   plane 0.  */

#define ENCODE_SHIFT_IN \
  do { \
    *dst = ISO_CODE_SI; \
    dst++; \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 0; \
  } while (0)
1507
/* Produce shift-out and record register 1 as invoked to plane 0.  */
#define ENCODE_SHIFT_OUT \
  do { \
    *dst = ISO_CODE_SO; \
    dst++; \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 1; \
  } while (0)
1513
/* Produce ESC 'n' (locking-shift-2) and record register 2 as invoked
   to plane 0.  */
#define ENCODE_LOCKING_SHIFT_2 \
  do { \
    *dst++ = ISO_CODE_ESC; \
    *dst++ = 'n'; \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 2; \
  } while (0)
1519
/* Produce ESC 'o' (locking-shift-3) and record register 3 as invoked
   to plane 0.  */
#define ENCODE_LOCKING_SHIFT_3 \
  do { \
    *dst++ = ISO_CODE_ESC; \
    *dst++ = 'o'; \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 3; \
  } while (0)
1525
/* Produce codes for a DIMENSION1 character whose character set is
   CHARSET and whose position-code is C1.  Designation and invocation
   sequences are also produced in advance if necessary: the body is a
   loop that is left as soon as a byte has been written, and repeated
   after each call of encode_invocation_designation.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1) \
  do { \
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding)) \
      { \
        /* A single-shift code was just produced; this byte uses it \
           up.  */ \
        *dst++ = ((coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
                  ? (c1 & 0x7F) : (c1 | 0x80)); \
        CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0; \
        break; \
      } \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0)) \
      { \
        *dst++ = c1 & 0x7F; \
        break; \
      } \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1)) \
      { \
        *dst++ = c1 | 0x80; \
        break; \
      } \
    if ((coding->flags & CODING_FLAG_ISO_SAFE) \
        && !coding->safe_charsets[charset]) \
      { \
        /* We should not encode this character, instead produce one \
           or two `?'s.  */ \
        *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION; \
        if (CHARSET_WIDTH (charset) == 2) \
          *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION; \
        break; \
      } \
    /* Since CHARSET is not yet invoked to any graphic planes, we \
       must invoke it, or, at first, designate it to some graphic \
       register.  Then repeat the loop to actually produce the \
       character.  */ \
    dst = encode_invocation_designation (charset, coding, dst); \
  } while (1)
1569
/* Produce codes for a DIMENSION2 character whose character set is
   CHARSET and whose position-codes are C1 and C2.  Designation and
   invocation codes are also produced in advance if necessary: the
   body is a loop that is left as soon as the bytes have been written,
   and repeated after each call of encode_invocation_designation.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2) \
  do { \
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding)) \
      { \
        /* A single-shift code was just produced; these bytes use it \
           up.  */ \
        if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
          { \
            *dst++ = c1 & 0x7F; \
            *dst++ = c2 & 0x7F; \
          } \
        else \
          { \
            *dst++ = c1 | 0x80; \
            *dst++ = c2 | 0x80; \
          } \
        CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0; \
        break; \
      } \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0)) \
      { \
        *dst++ = c1 & 0x7F; \
        *dst++ = c2 & 0x7F; \
        break; \
      } \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1)) \
      { \
        *dst++ = c1 | 0x80; \
        *dst++ = c2 | 0x80; \
        break; \
      } \
    if ((coding->flags & CODING_FLAG_ISO_SAFE) \
        && !coding->safe_charsets[charset]) \
      { \
        /* We should not encode this character, instead produce one \
           or two `?'s.  */ \
        *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION; \
        if (CHARSET_WIDTH (charset) == 2) \
          *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION; \
        break; \
      } \
    /* Since CHARSET is not yet invoked to any graphic planes, we \
       must invoke it, or, at first, designate it to some graphic \
       register.  Then repeat the loop to actually produce the \
       character.  */ \
    dst = encode_invocation_designation (charset, coding, dst); \
  } while (1)
1612
/* Produce codes for the character of CHARSET whose position codes are
   C1 and C2.  The character is first mapped through TRANSLATION_TABLE
   if that is non-nil.  If the resulting charset is not defined, the
   bytes CHARSET, C1 and (if nonzero) C2 are written as they are,
   masked to 7 bits when CODING has CODING_FLAG_ISO_SEVEN_BITS set.
   CODING->consumed_char is incremented unless we are composing.  */
#define ENCODE_ISO_CHARACTER(charset, c1, c2) \
  do { \
    int translated_char, target_charset; \
 \
    if (NILP (translation_table)) \
      target_charset = charset; \
    else \
      { \
        translated_char = translate_char (translation_table, -1, \
                                          charset, c1, c2); \
        if (translated_char >= 0) \
          SPLIT_CHAR (translated_char, target_charset, c1, c2); \
        else \
          target_charset = charset; \
      } \
    if (! CHARSET_DEFINED_P (target_charset)) \
      { \
        /* 0xFF leaves a byte as it is once stored through DST.  */ \
        int byte_mask \
          = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) ? 0x7f : 0xff; \
 \
        *dst++ = charset & byte_mask; \
        *dst++ = c1 & byte_mask; \
        if (c2) \
          *dst++ = c2 & byte_mask; \
      } \
    else if (CHARSET_DIMENSION (target_charset) == 1) \
      { \
        if (charset == CHARSET_ASCII \
            && (coding->flags & CODING_FLAG_ISO_USE_ROMAN)) \
          target_charset = charset_latin_jisx0201; \
        ENCODE_ISO_CHARACTER_DIMENSION1 (target_charset, c1); \
      } \
    else \
      { \
        if (charset == charset_jisx0208 \
            && (coding->flags & CODING_FLAG_ISO_USE_OLDJIS)) \
          target_charset = charset_jisx0208_1978; \
        ENCODE_ISO_CHARACTER_DIMENSION2 (target_charset, c1, c2); \
      } \
    if (! COMPOSING_P (coding->composing)) \
      coding->consumed_char++; \
  } while (0)
1660
1661 /* Produce designation and invocation codes at a place pointed by DST
1662 to use CHARSET. The element `spec.iso2022' of *CODING is updated.
1663 Return new DST. */
1664
1665 unsigned char *
1666 encode_invocation_designation (charset, coding, dst)
1667 int charset;
1668 struct coding_system *coding;
1669 unsigned char *dst;
1670 {
1671 int reg; /* graphic register number */
1672
1673 /* At first, check designations. */
1674 for (reg = 0; reg < 4; reg++)
1675 if (charset == CODING_SPEC_ISO_DESIGNATION (coding, reg))
1676 break;
1677
1678 if (reg >= 4)
1679 {
1680 /* CHARSET is not yet designated to any graphic registers. */
1681 /* At first check the requested designation. */
1682 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1683 if (reg == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
1684 /* Since CHARSET requests no special designation, designate it
1685 to graphic register 0. */
1686 reg = 0;
1687
1688 ENCODE_DESIGNATION (charset, reg, coding);
1689 }
1690
1691 if (CODING_SPEC_ISO_INVOCATION (coding, 0) != reg
1692 && CODING_SPEC_ISO_INVOCATION (coding, 1) != reg)
1693 {
1694 /* Since the graphic register REG is not invoked to any graphic
1695 planes, invoke it to graphic plane 0. */
1696 switch (reg)
1697 {
1698 case 0: /* graphic register 0 */
1699 ENCODE_SHIFT_IN;
1700 break;
1701
1702 case 1: /* graphic register 1 */
1703 ENCODE_SHIFT_OUT;
1704 break;
1705
1706 case 2: /* graphic register 2 */
1707 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1708 ENCODE_SINGLE_SHIFT_2;
1709 else
1710 ENCODE_LOCKING_SHIFT_2;
1711 break;
1712
1713 case 3: /* graphic register 3 */
1714 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1715 ENCODE_SINGLE_SHIFT_3;
1716 else
1717 ENCODE_LOCKING_SHIFT_3;
1718 break;
1719 }
1720 }
1721 return dst;
1722 }
1723
/* The following three macros produce the escape sequences indicating
   composition: start without rules (ESC '0'), start with rules
   (ESC '2'), and end (ESC '1').  */
#define ENCODE_COMPOSITION_NO_RULE_START \
  (*dst++ = ISO_CODE_ESC, *dst++ = '0')
#define ENCODE_COMPOSITION_WITH_RULE_START \
  (*dst++ = ISO_CODE_ESC, *dst++ = '2')
#define ENCODE_COMPOSITION_END \
  (*dst++ = ISO_CODE_ESC, *dst++ = '1')
1728
/* The following three macros produce codes for indicating direction
   of text.

   ENCODE_CONTROL_SEQUENCE_INTRODUCER produces ESC '[' when CODING has
   CODING_FLAG_ISO_SEVEN_BITS set, else the CSI control character.
   CODING->flags is a set of flag bits, so the bit is tested with `&';
   comparing the whole word with `==' would match only when no other
   flag is set.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER \
  do { \
    if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
      *dst++ = ISO_CODE_ESC, *dst++ = '['; \
    else \
      *dst++ = ISO_CODE_CSI; \
  } while (0)
1738
/* Produce the control sequence starting right-to-left text
   (CSI '2' ']').  ENCODE_CONTROL_SEQUENCE_INTRODUCER is a
   `do ... while (0)' statement and cannot be an operand of the comma
   operator, so this macro is a statement block too.  */
#define ENCODE_DIRECTION_R2L \
  do { \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER; \
    *dst++ = '2'; \
    *dst++ = ']'; \
  } while (0)

/* Likewise, produce the control sequence CSI '0' ']'.  */
#define ENCODE_DIRECTION_L2R \
  do { \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER; \
    *dst++ = '0'; \
    *dst++ = ']'; \
  } while (0)
1744
/* Produce codes for designation and invocation to reset the graphic
   planes and registers to initial state: shift-in if register 0 is
   not invoked to plane 0, then a designation for each register whose
   initial charset is non-negative and differs from the current one.  */
#define ENCODE_RESET_PLANE_AND_REGISTER \
  do { \
    int reg_idx = 0; \
 \
    if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0) \
      ENCODE_SHIFT_IN; \
    while (reg_idx < 4) \
      { \
        int initial_charset \
          = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg_idx); \
 \
        if (initial_charset >= 0 \
            && (CODING_SPEC_ISO_DESIGNATION (coding, reg_idx) \
                != initial_charset)) \
          ENCODE_DESIGNATION (initial_charset, reg_idx, coding); \
        reg_idx++; \
      } \
  } while (0)
1759
1760 /* Produce designation sequences of charsets in the line started from
1761 SRC to a place pointed by *DSTP, and update DSTP.
1762
1763 If the current block ends before any end-of-line, we may fail to
1764 find all the necessary designations. */
1765
1766 void
1767 encode_designation_at_bol (coding, table, src, src_end, dstp)
1768 struct coding_system *coding;
1769 Lisp_Object table;
1770 unsigned char *src, *src_end, **dstp;
1771 {
1772 int charset, c, found = 0, reg;
1773 /* Table of charsets to be designated to each graphic register. */
1774 int r[4];
1775 unsigned char *dst = *dstp;
1776
1777 for (reg = 0; reg < 4; reg++)
1778 r[reg] = -1;
1779
1780 while (src < src_end && *src != '\n' && found < 4)
1781 {
1782 int bytes = BYTES_BY_CHAR_HEAD (*src);
1783
1784 if (NILP (table))
1785 charset = CHARSET_AT (src);
1786 else
1787 {
1788 int c_alt;
1789 unsigned char c1, c2;
1790
1791 SPLIT_STRING(src, bytes, charset, c1, c2);
1792 if ((c_alt = translate_char (table, -1, charset, c1, c2)) >= 0)
1793 charset = CHAR_CHARSET (c_alt);
1794 }
1795
1796 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1797 if (reg != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION && r[reg] < 0)
1798 {
1799 found++;
1800 r[reg] = charset;
1801 }
1802
1803 src += bytes;
1804 }
1805
1806 if (found)
1807 {
1808 for (reg = 0; reg < 4; reg++)
1809 if (r[reg] >= 0
1810 && CODING_SPEC_ISO_DESIGNATION (coding, reg) != r[reg])
1811 ENCODE_DESIGNATION (r[reg], reg, coding);
1812 *dstp = dst;
1813 }
1814 }
1815
1816 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
1817
1818 int
1819 encode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
1820 struct coding_system *coding;
1821 unsigned char *source, *destination;
1822 int src_bytes, dst_bytes;
1823 {
1824 unsigned char *src = source;
1825 unsigned char *src_end = source + src_bytes;
1826 unsigned char *dst = destination;
1827 unsigned char *dst_end = destination + dst_bytes;
1828 /* Since the maximum bytes produced by each loop is 20, we subtract 19
1829 from DST_END to assure overflow checking is necessary only at the
1830 head of loop. */
1831 unsigned char *adjusted_dst_end = dst_end - 19;
1832 Lisp_Object translation_table
1833 = coding->translation_table_for_encode;
1834 int result = CODING_FINISH_NORMAL;
1835
1836 if (!NILP (Venable_character_translation) && NILP (translation_table))
1837 translation_table = Vstandard_translation_table_for_encode;
1838
1839 coding->consumed_char = 0;
1840 coding->fake_multibyte = 0;
1841 while (src < src_end && (dst_bytes
1842 ? (dst < adjusted_dst_end)
1843 : (dst < src - 19)))
1844 {
1845 /* SRC_BASE remembers the start position in source in each loop.
1846 The loop will be exited when there's not enough source text
1847 to analyze multi-byte codes (within macros ONE_MORE_BYTE,
1848 TWO_MORE_BYTES, and THREE_MORE_BYTES). In that case, SRC is
1849 reset to SRC_BASE before exiting. */
1850 unsigned char *src_base = src;
1851 int charset, c1, c2, c3, c4;
1852
1853 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL
1854 && CODING_SPEC_ISO_BOL (coding))
1855 {
1856 /* We have to produce designation sequences if any now. */
1857 encode_designation_at_bol (coding, translation_table,
1858 src, src_end, &dst);
1859 CODING_SPEC_ISO_BOL (coding) = 0;
1860 }
1861
1862 c1 = *src++;
1863 /* If we are seeing a component of a composite character, we are
1864 seeing a leading-code encoded irregularly for composition, or
1865 a composition rule if composing with rule. We must set C1 to
1866 a normal leading-code or an ASCII code. If we are not seeing
1867 a composite character, we must reset composition,
1868 designation, and invocation states. */
1869 if (COMPOSING_P (coding->composing))
1870 {
1871 if (c1 < 0xA0)
1872 {
1873 /* We are not in a composite character any longer. */
1874 coding->composing = COMPOSING_NO;
1875 ENCODE_RESET_PLANE_AND_REGISTER;
1876 ENCODE_COMPOSITION_END;
1877 }
1878 else
1879 {
1880 if (coding->composing == COMPOSING_WITH_RULE_RULE)
1881 {
1882 *dst++ = c1 & 0x7F;
1883 coding->composing = COMPOSING_WITH_RULE_HEAD;
1884 continue;
1885 }
1886 else if (coding->composing == COMPOSING_WITH_RULE_HEAD)
1887 coding->composing = COMPOSING_WITH_RULE_RULE;
1888 if (c1 == 0xA0)
1889 {
1890 /* This is an ASCII component. */
1891 ONE_MORE_BYTE (c1);
1892 c1 &= 0x7F;
1893 }
1894 else
1895 /* This is a leading-code of non ASCII component. */
1896 c1 -= 0x20;
1897 }
1898 }
1899
1900 /* Now encode one character. C1 is a control character, an
1901 ASCII character, or a leading-code of multi-byte character. */
1902 switch (emacs_code_class[c1])
1903 {
1904 case EMACS_ascii_code:
1905 c2 = 0;
1906 ENCODE_ISO_CHARACTER (CHARSET_ASCII, c1, /* dummy */ c2);
1907 break;
1908
1909 case EMACS_control_code:
1910 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
1911 ENCODE_RESET_PLANE_AND_REGISTER;
1912 *dst++ = c1;
1913 coding->consumed_char++;
1914 break;
1915
1916 case EMACS_carriage_return_code:
1917 if (! (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
1918 {
1919 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
1920 ENCODE_RESET_PLANE_AND_REGISTER;
1921 *dst++ = c1;
1922 coding->consumed_char++;
1923 break;
1924 }
1925 /* fall down to treat '\r' as '\n' ... */
1926
1927 case EMACS_linefeed_code:
1928 if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
1929 ENCODE_RESET_PLANE_AND_REGISTER;
1930 if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL)
1931 bcopy (coding->spec.iso2022.initial_designation,
1932 coding->spec.iso2022.current_designation,
1933 sizeof coding->spec.iso2022.initial_designation);
1934 if (coding->eol_type == CODING_EOL_LF
1935 || coding->eol_type == CODING_EOL_UNDECIDED)
1936 *dst++ = ISO_CODE_LF;
1937 else if (coding->eol_type == CODING_EOL_CRLF)
1938 *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
1939 else
1940 *dst++ = ISO_CODE_CR;
1941 CODING_SPEC_ISO_BOL (coding) = 1;
1942 coding->consumed_char++;
1943 break;
1944
1945 case EMACS_leading_code_2:
1946 ONE_MORE_BYTE (c2);
1947 c3 = 0;
1948 if (c2 < 0xA0)
1949 {
1950 /* invalid sequence */
1951 *dst++ = c1;
1952 src--;
1953 coding->consumed_char++;
1954 }
1955 else
1956 ENCODE_ISO_CHARACTER (c1, c2, /* dummy */ c3);
1957 break;
1958
1959 case EMACS_leading_code_3:
1960 TWO_MORE_BYTES (c2, c3);
1961 c4 = 0;
1962 if (c2 < 0xA0 || c3 < 0xA0)
1963 {
1964 /* invalid sequence */
1965 *dst++ = c1;
1966 src -= 2;
1967 coding->consumed_char++;
1968 }
1969 else if (c1 < LEADING_CODE_PRIVATE_11)
1970 ENCODE_ISO_CHARACTER (c1, c2, c3);
1971 else
1972 ENCODE_ISO_CHARACTER (c2, c3, /* dummy */ c4);
1973 break;
1974
1975 case EMACS_leading_code_4:
1976 THREE_MORE_BYTES (c2, c3, c4);
1977 if (c2 < 0xA0 || c3 < 0xA0 || c4 < 0xA0)
1978 {
1979 /* invalid sequence */
1980 *dst++ = c1;
1981 src -= 3;
1982 coding->consumed_char++;
1983 }
1984 else
1985 ENCODE_ISO_CHARACTER (c2, c3, c4);
1986 break;
1987
1988 case EMACS_leading_code_composition:
1989 ONE_MORE_BYTE (c2);
1990 if (c2 < 0xA0)
1991 {
1992 /* invalid sequence */
1993 *dst++ = c1;
1994 src--;
1995 coding->consumed_char++;
1996 }
1997 else if (c2 == 0xFF)
1998 {
1999 ENCODE_RESET_PLANE_AND_REGISTER;
2000 coding->composing = COMPOSING_WITH_RULE_HEAD;
2001 ENCODE_COMPOSITION_WITH_RULE_START;
2002 coding->consumed_char++;
2003 }
2004 else
2005 {
2006 ENCODE_RESET_PLANE_AND_REGISTER;
2007 /* Rewind one byte because it is a character code of
2008 composition elements. */
2009 src--;
2010 coding->composing = COMPOSING_NO_RULE_HEAD;
2011 ENCODE_COMPOSITION_NO_RULE_START;
2012 coding->consumed_char++;
2013 }
2014 break;
2015
2016 case EMACS_invalid_code:
2017 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
2018 ENCODE_RESET_PLANE_AND_REGISTER;
2019 *dst++ = c1;
2020 coding->consumed_char++;
2021 break;
2022 }
2023 continue;
2024 label_end_of_loop:
2025 result = CODING_FINISH_INSUFFICIENT_SRC;
2026 src = src_base;
2027 break;
2028 }
2029
2030 if (src < src_end && result == CODING_FINISH_NORMAL)
2031 result = CODING_FINISH_INSUFFICIENT_DST;
2032
2033 /* If this is the last block of the text to be encoded, we must
2034 reset graphic planes and registers to the initial state, and
2035 flush out the carryover if any. */
2036 if (coding->mode & CODING_MODE_LAST_BLOCK)
2037 {
2038 ENCODE_RESET_PLANE_AND_REGISTER;
2039 if (COMPOSING_P (coding->composing))
2040 ENCODE_COMPOSITION_END;
2041 if (result == CODING_FINISH_INSUFFICIENT_SRC)
2042 {
2043 while (src < src_end && dst < dst_end)
2044 *dst++ = *src++;
2045 }
2046 }
2047 coding->consumed = src - source;
2048 coding->produced = coding->produced_char = dst - destination;
2049 return result;
2050 }
2051
2052 \f
2053 /*** 4. SJIS and BIG5 handlers ***/
2054
2055 /* Although SJIS and BIG5 are not ISO's coding system, they are used
2056 quite widely. So, for the moment, Emacs supports them in the bare
2057 C code. But, in the future, they may be supported only by CCL. */
2058
/* SJIS is a coding system encoding three character sets: ASCII, right
   half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
   as is.  A character of charset katakana-jisx0201 is encoded by
   "position-code + 0x80".  A character of charset japanese-jisx0208
   is encoded in two bytes, but the two position-codes are divided and
   shifted so that they fit in the ranges below.
2065
2066 --- CODE RANGE of SJIS ---
2067 (character set) (range)
2068 ASCII 0x00 .. 0x7F
2069 KATAKANA-JISX0201 0xA0 .. 0xDF
2070 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF
2071 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
2072 -------------------------------
2073
2074 */
2075
2076 /* BIG5 is a coding system encoding two character sets: ASCII and
2077 Big5. An ASCII character is encoded as is. Big5 is a two-byte
2078 character set and is encoded in two-byte.
2079
2080 --- CODE RANGE of BIG5 ---
2081 (character set) (range)
2082 ASCII 0x00 .. 0x7F
2083 Big5 (1st byte) 0xA1 .. 0xFE
2084 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
2085 --------------------------
2086
2087 Since the number of characters in Big5 is larger than maximum
2088 characters in Emacs' charset (96x96), it can't be handled as one
2089 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
2090 and `charset-big5-2'. Both are DIMENSION2 and CHARS94. The former
2091 contains frequently used characters and the latter contains less
2092 frequently used characters. */
2093
/* Macros to decode or encode a character of Big5 in BIG5.  B1 and B2
   are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
   C1 and C2 are the 1st and 2nd position-codes of Emacs' internal
   format.  CHARSET is `charset_big5_1' or `charset_big5_2'.  */
2098
/* Number of Big5 characters which have the same code in 1st byte:
   63 second bytes in 0x40..0x7E plus 94 second bytes in 0xA1..0xFE.  */
#define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)

/* Decode the BIG5 byte pair B1, B2.  CHARSET is set to
   `charset_big5_1' when B1 is below 0xC9 and to `charset_big5_2'
   otherwise; C1 and C2 receive the two position-codes.

   B1 and B2 may be arbitrary expressions (every use is parenthesized)
   but are evaluated more than once, so they must not have side
   effects.  CHARSET, C1 and C2 must be lvalues.  */
#define DECODE_BIG5(b1, b2, charset, c1, c2)                            \
  do {                                                                  \
    unsigned int temp                                                   \
      = (((b1) - 0xA1) * BIG5_SAME_ROW                                  \
         + (b2) - ((b2) < 0x7F ? 0x40 : 0x62));                         \
    if ((b1) < 0xC9)                                                    \
      (charset) = charset_big5_1;                                       \
    else                                                                \
      {                                                                 \
        (charset) = charset_big5_2;                                     \
        /* Rebase the linear index to the start of the 2nd charset.  */ \
        temp -= (0xC9 - 0xA1) * BIG5_SAME_ROW;                          \
      }                                                                 \
    (c1) = temp / (0xFF - 0xA1) + 0x21;                                 \
    (c2) = temp % (0xFF - 0xA1) + 0x21;                                 \
  } while (0)

/* Inverse of DECODE_BIG5: encode the character of CHARSET
   (`charset_big5_1' or `charset_big5_2') with position-codes C1, C2
   into the BIG5 byte pair B1, B2.

   CHARSET, C1 and C2 may be arbitrary side-effect-free expressions;
   B1 and B2 must be lvalues.  */
#define ENCODE_BIG5(charset, c1, c2, b1, b2)                            \
  do {                                                                  \
    unsigned int temp                                                   \
      = ((c1) - 0x21) * (0xFF - 0xA1) + ((c2) - 0x21);                  \
    if ((charset) == charset_big5_2)                                    \
      temp += BIG5_SAME_ROW * (0xC9 - 0xA1);                            \
    (b1) = temp / BIG5_SAME_ROW + 0xA1;                                 \
    (b2) = temp % BIG5_SAME_ROW;                                        \
    /* The first 63 columns map to 0x40..0x7E, the rest to 0xA1..0xFE.  */ \
    (b2) += (b2) < 0x3F ? 0x40 : 0x62;                                  \
  } while (0)
2126
/* Emit one decoded character of CHARSET whose position-codes are C1
   and C2.  When `translation_table' is non-nil the character is first
   looked up with translate_char; a non-negative result is split back
   into charset and position-codes with SPLIT_CHAR, overwriting C1 and
   C2.  The character is then handed to DECODE_CHARACTER_ASCII,
   DECODE_CHARACTER_DIMENSION1 or DECODE_CHARACTER_DIMENSION2 according
   to the resulting charset.

   Expects `translation_table' in the caller's scope; C1 and C2 must
   be lvalues.  */
#define DECODE_SJIS_BIG5_CHARACTER(charset, c1, c2)                     \
  do {                                                                  \
    int charset_alt = (charset);                                        \
    int c_alt = -1;                                                     \
    if (!NILP (translation_table))                                      \
      c_alt = translate_char (translation_table,                        \
                              -1, (charset), c1, c2);                   \
    if (c_alt >= 0)                                                     \
      SPLIT_CHAR (c_alt, charset_alt, c1, c2);                          \
    if (charset_alt < 0 || charset_alt == CHARSET_ASCII)                \
      DECODE_CHARACTER_ASCII (c1);                                      \
    else if (CHARSET_DIMENSION (charset_alt) != 1)                      \
      DECODE_CHARACTER_DIMENSION2 (charset_alt, c1, c2);                \
    else                                                                \
      DECODE_CHARACTER_DIMENSION1 (charset_alt, c1);                    \
  } while (0)
2141
/* Encode one character of CHARSET with position-codes C1 and C2 and
   append the result at DST.  If `translation_table' is non-nil the
   character is first translated with translate_char.

   ASCII is stored as C1.  With SJIS_P set, katakana-jisx0201 is
   stored as C1, latin-jisx0201 as C1 with the high bit cleared, and
   jisx0208 / jisx0208-1978 as the two bytes made by ENCODE_SJIS.
   Without SJIS_P, big5-1 / big5-2 are stored as the two bytes made by
   ENCODE_BIG5.  A character of any other charset is stored unencoded
   as the charset code followed by its position-code(s), i.e. up to
   three bytes.  `coding->fake_multibyte' is set for the SJIS two-byte
   form and for every unencoded form.  `coding->consumed_char' is
   incremented once.

   Expects `coding', `dst', `sjis_p' and `translation_table' in the
   caller's scope; C1 and C2 must be lvalues.  The expansion
   deliberately ends without a semicolon so that
   `ENCODE_SJIS_BIG5_CHARACTER (...);' is exactly one statement and
   can be used as the body of an `if' that has an `else'.  */
#define ENCODE_SJIS_BIG5_CHARACTER(charset, c1, c2)                     \
  do {                                                                  \
    int c_alt, charset_alt;                                             \
    if (!NILP (translation_table)                                       \
        && ((c_alt = translate_char (translation_table, -1,             \
                                     (charset), c1, c2))                \
            >= 0))                                                      \
      SPLIT_CHAR (c_alt, charset_alt, c1, c2);                          \
    else                                                                \
      charset_alt = (charset);                                          \
    if (charset_alt == charset_ascii)                                   \
      *dst++ = c1;                                                      \
    else if (CHARSET_DIMENSION (charset_alt) == 1)                      \
      {                                                                 \
        if (sjis_p && charset_alt == charset_katakana_jisx0201)         \
          *dst++ = c1;                                                  \
        else if (sjis_p && charset_alt == charset_latin_jisx0201)       \
          *dst++ = c1 & 0x7F;                                           \
        else                                                            \
          {                                                             \
            /* Unsupported charset: pass through unencoded.  */         \
            *dst++ = charset_alt, *dst++ = c1;                          \
            coding->fake_multibyte = 1;                                 \
          }                                                             \
      }                                                                 \
    else                                                                \
      {                                                                 \
        c1 &= 0x7F, c2 &= 0x7F;                                         \
        if (sjis_p && (charset_alt == charset_jisx0208                  \
                       || charset_alt == charset_jisx0208_1978))        \
          {                                                             \
            unsigned char s1, s2;                                       \
                                                                        \
            ENCODE_SJIS (c1, c2, s1, s2);                               \
            *dst++ = s1, *dst++ = s2;                                   \
            coding->fake_multibyte = 1;                                 \
          }                                                             \
        else if (!sjis_p                                                \
                 && (charset_alt == charset_big5_1                      \
                     || charset_alt == charset_big5_2))                 \
          {                                                             \
            unsigned char b1, b2;                                       \
                                                                        \
            ENCODE_BIG5 (charset_alt, c1, c2, b1, b2);                  \
            *dst++ = b1, *dst++ = b2;                                   \
          }                                                             \
        else                                                            \
          {                                                             \
            /* Unsupported charset: pass through unencoded (3 bytes).  */ \
            *dst++ = charset_alt, *dst++ = c1, *dst++ = c2;             \
            coding->fake_multibyte = 1;                                 \
          }                                                             \
      }                                                                 \
    coding->consumed_char++;                                            \
  } while (0)
2195
2196 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2197 Check if a text is encoded in SJIS. If it is, return
2198 CODING_CATEGORY_MASK_SJIS, else return 0. */
2199
2200 int
2201 detect_coding_sjis (src, src_end)
2202 unsigned char *src, *src_end;
2203 {
2204 unsigned char c;
2205
2206 while (src < src_end)
2207 {
2208 c = *src++;
2209 if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
2210 {
2211 if (src < src_end && *src++ < 0x40)
2212 return 0;
2213 }
2214 }
2215 return CODING_CATEGORY_MASK_SJIS;
2216 }
2217
2218 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2219 Check if a text is encoded in BIG5. If it is, return
2220 CODING_CATEGORY_MASK_BIG5, else return 0. */
2221
2222 int
2223 detect_coding_big5 (src, src_end)
2224 unsigned char *src, *src_end;
2225 {
2226 unsigned char c;
2227
2228 while (src < src_end)
2229 {
2230 c = *src++;
2231 if (c >= 0xA1)
2232 {
2233 if (src >= src_end)
2234 break;
2235 c = *src++;
2236 if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
2237 return 0;
2238 }
2239 }
2240 return CODING_CATEGORY_MASK_BIG5;
2241 }
2242
2243 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
2244 If SJIS_P is 1, decode SJIS text, else decode BIG5 test. */
2245
2246 int
2247 decode_coding_sjis_big5 (coding, source, destination,
2248 src_bytes, dst_bytes, sjis_p)
2249 struct coding_system *coding;
2250 unsigned char *source, *destination;
2251 int src_bytes, dst_bytes;
2252 int sjis_p;
2253 {
2254 unsigned char *src = source;
2255 unsigned char *src_end = source + src_bytes;
2256 unsigned char *dst = destination;
2257 unsigned char *dst_end = destination + dst_bytes;
2258 /* Since the maximum bytes produced by each loop is 4, we subtract 3
2259 from DST_END to assure overflow checking is necessary only at the
2260 head of loop. */
2261 unsigned char *adjusted_dst_end = dst_end - 3;
2262 Lisp_Object translation_table
2263 = coding->translation_table_for_decode;
2264 int result = CODING_FINISH_NORMAL;
2265
2266 if (!NILP (Venable_character_translation) && NILP (translation_table))
2267 translation_table = Vstandard_translation_table_for_decode;
2268
2269 coding->produced_char = 0;
2270 coding->fake_multibyte = 0;
2271 while (src < src_end && (dst_bytes
2272 ? (dst < adjusted_dst_end)
2273 : (dst < src - 3)))
2274 {
2275 /* SRC_BASE remembers the start position in source in each loop.
2276 The loop will be exited when there's not enough source text
2277 to analyze two-byte character (within macro ONE_MORE_BYTE).
2278 In that case, SRC is reset to SRC_BASE before exiting. */
2279 unsigned char *src_base = src;
2280 unsigned char c1 = *src++, c2, c3, c4;
2281
2282 if (c1 < 0x20)
2283 {
2284 if (c1 == '\r')
2285 {
2286 if (coding->eol_type == CODING_EOL_CRLF)
2287 {
2288 ONE_MORE_BYTE (c2);
2289 if (c2 == '\n')
2290 *dst++ = c2;
2291 else if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2292 {
2293 result = CODING_FINISH_INCONSISTENT_EOL;
2294 goto label_end_of_loop_2;
2295 }
2296 else
2297 /* To process C2 again, SRC is subtracted by 1. */
2298 *dst++ = c1, src--;
2299 }
2300 else if (coding->eol_type == CODING_EOL_CR)
2301 *dst++ = '\n';
2302 else
2303 *dst++ = c1;
2304 }
2305 else if (c1 == '\n'
2306 && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2307 && (coding->eol_type == CODING_EOL_CR
2308 || coding->eol_type == CODING_EOL_CRLF))
2309 {
2310 result = CODING_FINISH_INCONSISTENT_EOL;
2311 goto label_end_of_loop_2;
2312 }
2313 else
2314 *dst++ = c1;
2315 coding->produced_char++;
2316 }
2317 else if (c1 < 0x80)
2318 {
2319 c2 = 0; /* avoid warning */
2320 DECODE_SJIS_BIG5_CHARACTER (charset_ascii, c1, /* dummy */ c2);
2321 }
2322 else
2323 {
2324 if (sjis_p)
2325 {
2326 if (c1 < 0xA0 || (c1 >= 0xE0 && c1 < 0xF0))
2327 {
2328 /* SJIS -> JISX0208 */
2329 ONE_MORE_BYTE (c2);
2330 if (c2 >= 0x40 && c2 != 0x7F && c2 <= 0xFC)
2331 {
2332 DECODE_SJIS (c1, c2, c3, c4);
2333 DECODE_SJIS_BIG5_CHARACTER (charset_jisx0208, c3, c4);
2334 }
2335 else
2336 goto label_invalid_code_2;
2337 }
2338 else if (c1 < 0xE0)
2339 /* SJIS -> JISX0201-Kana */
2340 {
2341 c2 = 0; /* avoid warning */
2342 DECODE_SJIS_BIG5_CHARACTER (charset_katakana_jisx0201, c1,
2343 /* dummy */ c2);
2344 }
2345 else
2346 goto label_invalid_code_1;
2347 }
2348 else
2349 {
2350 /* BIG5 -> Big5 */
2351 if (c1 >= 0xA1 && c1 <= 0xFE)
2352 {
2353 ONE_MORE_BYTE (c2);
2354 if ((c2 >= 0x40 && c2 <= 0x7E) || (c2 >= 0xA1 && c2 <= 0xFE))
2355 {
2356 int charset;
2357
2358 DECODE_BIG5 (c1, c2, charset, c3, c4);
2359 DECODE_SJIS_BIG5_CHARACTER (charset, c3, c4);
2360 }
2361 else
2362 goto label_invalid_code_2;
2363 }
2364 else
2365 goto label_invalid_code_1;
2366 }
2367 }
2368 continue;
2369
2370 label_invalid_code_1:
2371 *dst++ = c1;
2372 coding->produced_char++;
2373 coding->fake_multibyte = 1;
2374 continue;
2375
2376 label_invalid_code_2:
2377 *dst++ = c1; *dst++= c2;
2378 coding->produced_char += 2;
2379 coding->fake_multibyte = 1;
2380 continue;
2381
2382 label_end_of_loop:
2383 result = CODING_FINISH_INSUFFICIENT_SRC;
2384 label_end_of_loop_2:
2385 src = src_base;
2386 break;
2387 }
2388
2389 if (src < src_end)
2390 {
2391 if (result == CODING_FINISH_NORMAL)
2392 result = CODING_FINISH_INSUFFICIENT_DST;
2393 else if (result != CODING_FINISH_INCONSISTENT_EOL
2394 && coding->mode & CODING_MODE_LAST_BLOCK)
2395 {
2396 src_bytes = src_end - src;
2397 if (dst_bytes && (dst_end - dst < src_bytes))
2398 src_bytes = dst_end - dst;
2399 bcopy (dst, src, src_bytes);
2400 src += src_bytes;
2401 dst += src_bytes;
2402 coding->fake_multibyte = 1;
2403 }
2404 }
2405
2406 coding->consumed = coding->consumed_char = src - source;
2407 coding->produced = dst - destination;
2408 return result;
2409 }
2410
2411 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
2412 This function can encode `charset_ascii', `charset_katakana_jisx0201',
2413 `charset_jisx0208', `charset_big5_1', and `charset_big5-2'. We are
2414 sure that all these charsets are registered as official charset
2415 (i.e. do not have extended leading-codes). Characters of other
2416 charsets are produced without any encoding. If SJIS_P is 1, encode
2417 SJIS text, else encode BIG5 text. */
2418
2419 int
2420 encode_coding_sjis_big5 (coding, source, destination,
2421 src_bytes, dst_bytes, sjis_p)
2422 struct coding_system *coding;
2423 unsigned char *source, *destination;
2424 int src_bytes, dst_bytes;
2425 int sjis_p;
2426 {
2427 unsigned char *src = source;
2428 unsigned char *src_end = source + src_bytes;
2429 unsigned char *dst = destination;
2430 unsigned char *dst_end = destination + dst_bytes;
2431 /* Since the maximum bytes produced by each loop is 2, we subtract 1
2432 from DST_END to assure overflow checking is necessary only at the
2433 head of loop. */
2434 unsigned char *adjusted_dst_end = dst_end - 1;
2435 Lisp_Object translation_table
2436 = coding->translation_table_for_encode;
2437 int result = CODING_FINISH_NORMAL;
2438
2439 if (!NILP (Venable_character_translation) && NILP (translation_table))
2440 translation_table = Vstandard_translation_table_for_encode;
2441
2442 coding->consumed_char = 0;
2443 coding->fake_multibyte = 0;
2444 while (src < src_end && (dst_bytes
2445 ? (dst < adjusted_dst_end)
2446 : (dst < src - 1)))
2447 {
2448 /* SRC_BASE remembers the start position in source in each loop.
2449 The loop will be exited when there's not enough source text
2450 to analyze multi-byte codes (within macros ONE_MORE_BYTE and
2451 TWO_MORE_BYTES). In that case, SRC is reset to SRC_BASE
2452 before exiting. */
2453 unsigned char *src_base = src;
2454 unsigned char c1 = *src++, c2, c3, c4;
2455
2456 if (coding->composing)
2457 {
2458 if (c1 == 0xA0)
2459 {
2460 ONE_MORE_BYTE (c1);
2461 c1 &= 0x7F;
2462 }
2463 else if (c1 >= 0xA0)
2464 c1 -= 0x20;
2465 else
2466 coding->composing = 0;
2467 }
2468
2469 switch (emacs_code_class[c1])
2470 {
2471 case EMACS_ascii_code:
2472 ENCODE_SJIS_BIG5_CHARACTER (charset_ascii, c1, /* dummy */ c2);
2473 break;
2474
2475 case EMACS_control_code:
2476 *dst++ = c1;
2477 coding->consumed_char++;
2478 break;
2479
2480 case EMACS_carriage_return_code:
2481 if (! (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
2482 {
2483 *dst++ = c1;
2484 coding->consumed_char++;
2485 break;
2486 }
2487 /* fall down to treat '\r' as '\n' ... */
2488
2489 case EMACS_linefeed_code:
2490 if (coding->eol_type == CODING_EOL_LF
2491 || coding->eol_type == CODING_EOL_UNDECIDED)
2492 *dst++ = '\n';
2493 else if (coding->eol_type == CODING_EOL_CRLF)
2494 *dst++ = '\r', *dst++ = '\n';
2495 else
2496 *dst++ = '\r';
2497 coding->consumed_char++;
2498 break;
2499
2500 case EMACS_leading_code_2:
2501 ONE_MORE_BYTE (c2);
2502 ENCODE_SJIS_BIG5_CHARACTER (c1, c2, /* dummy */ c3);
2503 break;
2504
2505 case EMACS_leading_code_3:
2506 TWO_MORE_BYTES (c2, c3);
2507 ENCODE_SJIS_BIG5_CHARACTER (c1, c2, c3);
2508 break;
2509
2510 case EMACS_leading_code_4:
2511 THREE_MORE_BYTES (c2, c3, c4);
2512 ENCODE_SJIS_BIG5_CHARACTER (c2, c3, c4);
2513 break;
2514
2515 case EMACS_leading_code_composition:
2516 coding->composing = 1;
2517 break;
2518
2519 default: /* i.e. case EMACS_invalid_code: */
2520 *dst++ = c1;
2521 coding->consumed_char++;
2522 }
2523 continue;
2524
2525 label_end_of_loop:
2526 result = CODING_FINISH_INSUFFICIENT_SRC;
2527 src = src_base;
2528 break;
2529 }
2530
2531 if (result == CODING_FINISH_NORMAL
2532 && src < src_end)
2533 result = CODING_FINISH_INSUFFICIENT_DST;
2534 coding->consumed = src - source;
2535 coding->produced = coding->produced_char = dst - destination;
2536 return result;
2537 }
2538
2539 \f
2540 /*** 5. CCL handlers ***/
2541
2542 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2543 Check if a text is encoded in a coding system of which
2544 encoder/decoder are written in CCL program. If it is, return
2545 CODING_CATEGORY_MASK_CCL, else return 0. */
2546
2547 int
2548 detect_coding_ccl (src, src_end)
2549 unsigned char *src, *src_end;
2550 {
2551 unsigned char *valid;
2552
2553 /* No coding system is assigned to coding-category-ccl. */
2554 if (!coding_system_table[CODING_CATEGORY_IDX_CCL])
2555 return 0;
2556
2557 valid = coding_system_table[CODING_CATEGORY_IDX_CCL]->spec.ccl.valid_codes;
2558 while (src < src_end)
2559 {
2560 if (! valid[*src]) return 0;
2561 src++;
2562 }
2563 return CODING_CATEGORY_MASK_CCL;
2564 }
2565
2566 \f
2567 /*** 6. End-of-line handlers ***/
2568
2569 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
2570 This function is called only when `coding->eol_type' is
2571 CODING_EOL_CRLF or CODING_EOL_CR. */
2572
2573 int
2574 decode_eol (coding, source, destination, src_bytes, dst_bytes)
2575 struct coding_system *coding;
2576 unsigned char *source, *destination;
2577 int src_bytes, dst_bytes;
2578 {
2579 unsigned char *src = source;
2580 unsigned char *src_end = source + src_bytes;
2581 unsigned char *dst = destination;
2582 unsigned char *dst_end = destination + dst_bytes;
2583 unsigned char c;
2584 int result = CODING_FINISH_NORMAL;
2585
2586 coding->fake_multibyte = 0;
2587
2588 if (src_bytes <= 0)
2589 {
2590 coding->produced = coding->produced_char = 0;
2591 coding->consumed = coding->consumed_char = 0;
2592 return result;
2593 }
2594
2595 switch (coding->eol_type)
2596 {
2597 case CODING_EOL_CRLF:
2598 {
2599 /* Since the maximum bytes produced by each loop is 2, we
2600 subtract 1 from DST_END to assure overflow checking is
2601 necessary only at the head of loop. */
2602 unsigned char *adjusted_dst_end = dst_end - 1;
2603
2604 while (src < src_end && (dst_bytes
2605 ? (dst < adjusted_dst_end)
2606 : (dst < src - 1)))
2607 {
2608 unsigned char *src_base = src;
2609
2610 c = *src++;
2611 if (c == '\r')
2612 {
2613 ONE_MORE_BYTE (c);
2614 if (c == '\n')
2615 *dst++ = c;
2616 else
2617 {
2618 if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2619 {
2620 result = CODING_FINISH_INCONSISTENT_EOL;
2621 goto label_end_of_loop_2;
2622 }
2623 src--;
2624 *dst++ = '\r';
2625 if (BASE_LEADING_CODE_P (c))
2626 coding->fake_multibyte = 1;
2627 }
2628 }
2629 else if (c == '\n'
2630 && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL))
2631 {
2632 result = CODING_FINISH_INCONSISTENT_EOL;
2633 goto label_end_of_loop_2;
2634 }
2635 else
2636 {
2637 *dst++ = c;
2638 if (BASE_LEADING_CODE_P (c))
2639 coding->fake_multibyte = 1;
2640 }
2641 continue;
2642
2643 label_end_of_loop:
2644 result = CODING_FINISH_INSUFFICIENT_SRC;
2645 label_end_of_loop_2:
2646 src = src_base;
2647 break;
2648 }
2649 if (src < src_end)
2650 {
2651 if (result == CODING_FINISH_NORMAL)
2652 result = CODING_FINISH_INSUFFICIENT_DST;
2653 else if (result != CODING_FINISH_INCONSISTENT_EOL
2654 && coding->mode & CODING_MODE_LAST_BLOCK)
2655 {
2656 /* This is the last block of the text to be decoded.
2657 We flush out all remaining codes. */
2658 src_bytes = src_end - src;
2659 if (dst_bytes && (dst_end - dst < src_bytes))
2660 src_bytes = dst_end - dst;
2661 bcopy (src, dst, src_bytes);
2662 dst += src_bytes;
2663 src += src_bytes;
2664 }
2665 }
2666 }
2667 break;
2668
2669 case CODING_EOL_CR:
2670 if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2671 {
2672 while (src < src_end)
2673 {
2674 if ((c = *src++) == '\n')
2675 break;
2676 if (BASE_LEADING_CODE_P (c))
2677 coding->fake_multibyte = 1;
2678 }
2679 if (*--src == '\n')
2680 {
2681 src_bytes = src - source;
2682 result = CODING_FINISH_INCONSISTENT_EOL;
2683 }
2684 }
2685 if (dst_bytes && src_bytes > dst_bytes)
2686 {
2687 result = CODING_FINISH_INSUFFICIENT_DST;
2688 src_bytes = dst_bytes;
2689 }
2690 if (dst_bytes)
2691 bcopy (source, destination, src_bytes);
2692 else
2693 safe_bcopy (source, destination, src_bytes);
2694 src = source + src_bytes;
2695 while (src_bytes--) if (*dst++ == '\r') dst[-1] = '\n';
2696 break;
2697
2698 default: /* i.e. case: CODING_EOL_LF */
2699 if (dst_bytes && src_bytes > dst_bytes)
2700 {
2701 result = CODING_FINISH_INSUFFICIENT_DST;
2702 src_bytes = dst_bytes;
2703 }
2704 if (dst_bytes)
2705 bcopy (source, destination, src_bytes);
2706 else
2707 safe_bcopy (source, destination, src_bytes);
2708 src += src_bytes;
2709 dst += src_bytes;
2710 coding->fake_multibyte = 1;
2711 break;
2712 }
2713
2714 coding->consumed = coding->consumed_char = src - source;
2715 coding->produced = coding->produced_char = dst - destination;
2716 return result;
2717 }
2718
2719 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions". Encode
2720 format of end-of-line according to `coding->eol_type'. If
2721 `coding->mode & CODING_MODE_SELECTIVE_DISPLAY' is nonzero, code
2722 '\r' in source text also means end-of-line. */
2723
2724 int
2725 encode_eol (coding, source, destination, src_bytes, dst_bytes)
2726 struct coding_system *coding;
2727 unsigned char *source, *destination;
2728 int src_bytes, dst_bytes;
2729 {
2730 unsigned char *src = source;
2731 unsigned char *dst = destination;
2732 int result = CODING_FINISH_NORMAL;
2733
2734 coding->fake_multibyte = 0;
2735
2736 if (coding->eol_type == CODING_EOL_CRLF)
2737 {
2738 unsigned char c;
2739 unsigned char *src_end = source + src_bytes;
2740 unsigned char *dst_end = destination + dst_bytes;
2741 /* Since the maximum bytes produced by each loop is 2, we
2742 subtract 1 from DST_END to assure overflow checking is
2743 necessary only at the head of loop. */
2744 unsigned char *adjusted_dst_end = dst_end - 1;
2745
2746 while (src < src_end && (dst_bytes
2747 ? (dst < adjusted_dst_end)
2748 : (dst < src - 1)))
2749 {
2750 c = *src++;
2751 if (c == '\n'
2752 || (c == '\r' && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY)))
2753 *dst++ = '\r', *dst++ = '\n';
2754 else
2755 {
2756 *dst++ = c;
2757 if (BASE_LEADING_CODE_P (c))
2758 coding->fake_multibyte = 1;
2759 }
2760 }
2761 if (src < src_end)
2762 result = CODING_FINISH_INSUFFICIENT_DST;
2763 }
2764 else
2765 {
2766 unsigned char c;
2767
2768 if (dst_bytes && src_bytes > dst_bytes)
2769 {
2770 src_bytes = dst_bytes;
2771 result = CODING_FINISH_INSUFFICIENT_DST;
2772 }
2773 if (dst_bytes)
2774 bcopy (source, destination, src_bytes);
2775 else
2776 safe_bcopy (source, destination, src_bytes);
2777 dst_bytes = src_bytes;
2778 if (coding->eol_type == CODING_EOL_CR)
2779 {
2780 while (src_bytes--)
2781 {
2782 if ((c = *dst++) == '\n')
2783 dst[-1] = '\r';
2784 else if (BASE_LEADING_CODE_P (c))
2785 coding->fake_multibyte = 1;
2786 }
2787 }
2788 else
2789 {
2790 if (coding->mode & CODING_MODE_SELECTIVE_DISPLAY)
2791 {
2792 while (src_bytes--)
2793 if (*dst++ == '\r') dst[-1] = '\n';
2794 }
2795 coding->fake_multibyte = 1;
2796 }
2797 src = source + dst_bytes;
2798 dst = destination + dst_bytes;
2799 }
2800
2801 coding->consumed = coding->consumed_char = src - source;
2802 coding->produced = coding->produced_char = dst - destination;
2803 return result;
2804 }
2805
2806 \f
2807 /*** 7. C library functions ***/
2808
2809 /* In Emacs Lisp, coding system is represented by a Lisp symbol which
2810 has a property `coding-system'. The value of this property is a
2811 vector of length 5 (called as coding-vector). Among elements of
2812 this vector, the first (element[0]) and the fifth (element[4])
2813 carry important information for decoding/encoding. Before
2814 decoding/encoding, this information should be set in fields of a
2815 structure of type `coding_system'.
2816
2817 A value of property `coding-system' can be a symbol of another
2818 subsidiary coding-system. In that case, Emacs gets coding-vector
2819 from that symbol.
2820
2821 `element[0]' contains information to be set in `coding->type'. The
2822 value and its meaning is as follows:
2823
2824 0 -- coding_type_emacs_mule
2825 1 -- coding_type_sjis
2826 2 -- coding_type_iso2022
2827 3 -- coding_type_big5
2828 4 -- coding_type_ccl encoder/decoder written in CCL
2829 nil -- coding_type_no_conversion
2830 t -- coding_type_undecided (automatic conversion on decoding,
2831 no-conversion on encoding)
2832
2833 `element[4]' contains information to be set in `coding->flags' and
2834 `coding->spec'. The meaning varies by `coding->type'.
2835
2836 If `coding->type' is `coding_type_iso2022', element[4] is a vector
2837 of length 32 (of which the first 13 sub-elements are used now).
2838 Meanings of these sub-elements are:
2839
2840 sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
2841 If the value is an integer of valid charset, the charset is
2842 assumed to be designated to graphic register N initially.
2843
2844 If the value is minus, it is a minus value of charset which
2845 reserves graphic register N, which means that the charset is
2846 not designated initially but should be designated to graphic
2847 register N just before encoding a character in that charset.
2848
2849 If the value is nil, graphic register N is never used on
2850 encoding.
2851
2852 sub-element[N] where N is 4 through 11: to be set in `coding->flags'
2853 Each value takes t or nil. See the section ISO2022 of
2854 `coding.h' for more information.
2855
2856 If `coding->type' is `coding_type_big5', element[4] is t to denote
2857 BIG5-ETen or nil to denote BIG5-HKU.
2858
2859 If `coding->type' takes the other value, element[4] is ignored.
2860
2861 Emacs Lisp's coding system also carries information about format of
2862 end-of-line in a value of property `eol-type'. If the value is
2863 integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
2864 means CODING_EOL_CR. If it is not integer, it should be a vector
2865 of subsidiary coding systems of which property `eol-type' has one
2866 of above values.
2867
2868 */
2869
2870 /* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
2871 and set it in CODING. If CODING_SYSTEM_SYMBOL is invalid, CODING
2872 is setup so that no conversion is necessary and return -1, else
2873 return 0. */
2874
2875 int
2876 setup_coding_system (coding_system, coding)
2877 Lisp_Object coding_system;
2878 struct coding_system *coding;
2879 {
2880 Lisp_Object coding_spec, coding_type, eol_type, plist;
2881 Lisp_Object val;
2882 int i;
2883
2884 /* Initialize some fields required for all kinds of coding systems. */
2885 coding->symbol = coding_system;
2886 coding->common_flags = 0;
2887 coding->mode = 0;
2888 coding->heading_ascii = -1;
2889 coding->post_read_conversion = coding->pre_write_conversion = Qnil;
2890
2891 if (NILP (coding_system))
2892 goto label_invalid_coding_system;
2893
2894 coding_spec = Fget (coding_system, Qcoding_system);
2895
2896 if (!VECTORP (coding_spec)
2897 || XVECTOR (coding_spec)->size != 5
2898 || !CONSP (XVECTOR (coding_spec)->contents[3]))
2899 goto label_invalid_coding_system;
2900
2901 eol_type = inhibit_eol_conversion ? Qnil : Fget (coding_system, Qeol_type);
2902 if (VECTORP (eol_type))
2903 {
2904 coding->eol_type = CODING_EOL_UNDECIDED;
2905 coding->common_flags = CODING_REQUIRE_DETECTION_MASK;
2906 }
2907 else if (XFASTINT (eol_type) == 1)
2908 {
2909 coding->eol_type = CODING_EOL_CRLF;
2910 coding->common_flags
2911 = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
2912 }
2913 else if (XFASTINT (eol_type) == 2)
2914 {
2915 coding->eol_type = CODING_EOL_CR;
2916 coding->common_flags
2917 = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
2918 }
2919 else
2920 coding->eol_type = CODING_EOL_LF;
2921
2922 coding_type = XVECTOR (coding_spec)->contents[0];
2923 /* Try short cut. */
2924 if (SYMBOLP (coding_type))
2925 {
2926 if (EQ (coding_type, Qt))
2927 {
2928 coding->type = coding_type_undecided;
2929 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
2930 }
2931 else
2932 coding->type = coding_type_no_conversion;
2933 return 0;
2934 }
2935
2936 /* Initialize remaining fields. */
2937 coding->composing = 0;
2938 coding->composed_chars = 0;
2939
2940 /* Get values of coding system properties:
2941 `post-read-conversion', `pre-write-conversion',
2942 `translation-table-for-decode', `translation-table-for-encode'. */
2943 plist = XVECTOR (coding_spec)->contents[3];
2944 coding->post_read_conversion = Fplist_get (plist, Qpost_read_conversion);
2945 coding->pre_write_conversion = Fplist_get (plist, Qpre_write_conversion);
2946 val = Fplist_get (plist, Qtranslation_table_for_decode);
2947 if (SYMBOLP (val))
2948 val = Fget (val, Qtranslation_table_for_decode);
2949 coding->translation_table_for_decode = CHAR_TABLE_P (val) ? val : Qnil;
2950 val = Fplist_get (plist, Qtranslation_table_for_encode);
2951 if (SYMBOLP (val))
2952 val = Fget (val, Qtranslation_table_for_encode);
2953 coding->translation_table_for_encode = CHAR_TABLE_P (val) ? val : Qnil;
2954 val = Fplist_get (plist, Qcoding_category);
2955 if (!NILP (val))
2956 {
2957 val = Fget (val, Qcoding_category_index);
2958 if (INTEGERP (val))
2959 coding->category_idx = XINT (val);
2960 else
2961 goto label_invalid_coding_system;
2962 }
2963 else
2964 goto label_invalid_coding_system;
2965
2966 val = Fplist_get (plist, Qsafe_charsets);
2967 if (EQ (val, Qt))
2968 {
2969 for (i = 0; i <= MAX_CHARSET; i++)
2970 coding->safe_charsets[i] = 1;
2971 }
2972 else
2973 {
2974 bzero (coding->safe_charsets, MAX_CHARSET + 1);
2975 while (CONSP (val))
2976 {
2977 if ((i = get_charset_id (XCAR (val))) >= 0)
2978 coding->safe_charsets[i] = 1;
2979 val = XCDR (val);
2980 }
2981 }
2982
2983 switch (XFASTINT (coding_type))
2984 {
2985 case 0:
2986 coding->type = coding_type_emacs_mule;
2987 if (!NILP (coding->post_read_conversion))
2988 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
2989 if (!NILP (coding->pre_write_conversion))
2990 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
2991 break;
2992
2993 case 1:
2994 coding->type = coding_type_sjis;
2995 coding->common_flags
2996 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
2997 break;
2998
2999 case 2:
3000 coding->type = coding_type_iso2022;
3001 coding->common_flags
3002 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3003 {
3004 Lisp_Object val, temp;
3005 Lisp_Object *flags;
3006 int i, charset, reg_bits = 0;
3007
3008 val = XVECTOR (coding_spec)->contents[4];
3009
3010 if (!VECTORP (val) || XVECTOR (val)->size != 32)
3011 goto label_invalid_coding_system;
3012
3013 flags = XVECTOR (val)->contents;
3014 coding->flags
3015 = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
3016 | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
3017 | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
3018 | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
3019 | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
3020 | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
3021 | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
3022 | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
3023 | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION)
3024 | (NILP (flags[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL)
3025 | (NILP (flags[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL)
3026 | (NILP (flags[15]) ? 0 : CODING_FLAG_ISO_SAFE)
3027 | (NILP (flags[16]) ? 0 : CODING_FLAG_ISO_LATIN_EXTRA)
3028 );
3029
3030 /* Invoke graphic register 0 to plane 0. */
3031 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
3032 /* Invoke graphic register 1 to plane 1 if we can use full 8-bit. */
3033 CODING_SPEC_ISO_INVOCATION (coding, 1)
3034 = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
3035 /* Not single shifting at first. */
3036 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;
3037 /* Beginning of buffer should also be regarded as bol. */
3038 CODING_SPEC_ISO_BOL (coding) = 1;
3039
3040 for (charset = 0; charset <= MAX_CHARSET; charset++)
3041 CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = 255;
3042 val = Vcharset_revision_alist;
3043 while (CONSP (val))
3044 {
3045 charset = get_charset_id (Fcar_safe (XCAR (val)));
3046 if (charset >= 0
3047 && (temp = Fcdr_safe (XCAR (val)), INTEGERP (temp))
3048 && (i = XINT (temp), (i >= 0 && (i + '@') < 128)))
3049 CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = i;
3050 val = XCDR (val);
3051 }
3052
3053 /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
3054 FLAGS[REG] can be one of below:
3055 integer CHARSET: CHARSET occupies register I,
3056 t: designate nothing to REG initially, but can be used
3057 by any charsets,
3058 list of integer, nil, or t: designate the first
3059 element (if integer) to REG initially, the remaining
3060 elements (if integer) is designated to REG on request,
3061 if an element is t, REG can be used by any charsets,
3062 nil: REG is never used. */
3063 for (charset = 0; charset <= MAX_CHARSET; charset++)
3064 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3065 = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION;
3066 for (i = 0; i < 4; i++)
3067 {
3068 if (INTEGERP (flags[i])
3069 && (charset = XINT (flags[i]), CHARSET_VALID_P (charset))
3070 || (charset = get_charset_id (flags[i])) >= 0)
3071 {
3072 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3073 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
3074 }
3075 else if (EQ (flags[i], Qt))
3076 {
3077 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3078 reg_bits |= 1 << i;
3079 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
3080 }
3081 else if (CONSP (flags[i]))
3082 {
3083 Lisp_Object tail;
3084 tail = flags[i];
3085
3086 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
3087 if (INTEGERP (XCAR (tail))
3088 && (charset = XINT (XCAR (tail)),
3089 CHARSET_VALID_P (charset))
3090 || (charset = get_charset_id (XCAR (tail))) >= 0)
3091 {
3092 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3093 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
3094 }
3095 else
3096 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3097 tail = XCDR (tail);
3098 while (CONSP (tail))
3099 {
3100 if (INTEGERP (XCAR (tail))
3101 && (charset = XINT (XCAR (tail)),
3102 CHARSET_VALID_P (charset))
3103 || (charset = get_charset_id (XCAR (tail))) >= 0)
3104 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3105 = i;
3106 else if (EQ (XCAR (tail), Qt))
3107 reg_bits |= 1 << i;
3108 tail = XCDR (tail);
3109 }
3110 }
3111 else
3112 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3113
3114 CODING_SPEC_ISO_DESIGNATION (coding, i)
3115 = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
3116 }
3117
3118 if (reg_bits && ! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
3119 {
3120 /* REG 1 can be used only by locking shift in 7-bit env. */
3121 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
3122 reg_bits &= ~2;
3123 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
3124 /* Without any shifting, only REG 0 and 1 can be used. */
3125 reg_bits &= 3;
3126 }
3127
3128 if (reg_bits)
3129 for (charset = 0; charset <= MAX_CHARSET; charset++)
3130 {
3131 if (CHARSET_VALID_P (charset))
3132 {
3133 /* There exist some default graphic registers to be
3134 used CHARSET. */
3135
3136 /* We had better avoid designating a charset of
3137 CHARS96 to REG 0 as far as possible. */
3138 if (CHARSET_CHARS (charset) == 96)
3139 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3140 = (reg_bits & 2
3141 ? 1 : (reg_bits & 4 ? 2 : (reg_bits & 8 ? 3 : 0)));
3142 else
3143 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3144 = (reg_bits & 1
3145 ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
3146 }
3147 }
3148 }
3149 coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
3150 coding->spec.iso2022.last_invalid_designation_register = -1;
3151 break;
3152
3153 case 3:
3154 coding->type = coding_type_big5;
3155 coding->common_flags
3156 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3157 coding->flags
3158 = (NILP (XVECTOR (coding_spec)->contents[4])
3159 ? CODING_FLAG_BIG5_HKU
3160 : CODING_FLAG_BIG5_ETEN);
3161 break;
3162
3163 case 4:
3164 coding->type = coding_type_ccl;
3165 coding->common_flags
3166 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3167 {
3168 val = XVECTOR (coding_spec)->contents[4];
3169 if (! CONSP (val)
3170 || setup_ccl_program (&(coding->spec.ccl.decoder),
3171 XCAR (val)) < 0
3172 || setup_ccl_program (&(coding->spec.ccl.encoder),
3173 XCDR (val)) < 0)
3174 goto label_invalid_coding_system;
3175
3176 bzero (coding->spec.ccl.valid_codes, 256);
3177 val = Fplist_get (plist, Qvalid_codes);
3178 if (CONSP (val))
3179 {
3180 Lisp_Object this;
3181
3182 for (; CONSP (val); val = XCDR (val))
3183 {
3184 this = XCAR (val);
3185 if (INTEGERP (this)
3186 && XINT (this) >= 0 && XINT (this) < 256)
3187 coding->spec.ccl.valid_codes[XINT (this)] = 1;
3188 else if (CONSP (this)
3189 && INTEGERP (XCAR (this))
3190 && INTEGERP (XCDR (this)))
3191 {
3192 int start = XINT (XCAR (this));
3193 int end = XINT (XCDR (this));
3194
3195 if (start >= 0 && start <= end && end < 256)
3196 while (start <= end)
3197 coding->spec.ccl.valid_codes[start++] = 1;
3198 }
3199 }
3200 }
3201 }
3202 coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
3203 break;
3204
3205 case 5:
3206 coding->type = coding_type_raw_text;
3207 break;
3208
3209 default:
3210 goto label_invalid_coding_system;
3211 }
3212 return 0;
3213
3214 label_invalid_coding_system:
3215 coding->type = coding_type_no_conversion;
3216 coding->category_idx = CODING_CATEGORY_IDX_BINARY;
3217 coding->common_flags = 0;
3218 coding->eol_type = CODING_EOL_LF;
3219 coding->pre_write_conversion = coding->post_read_conversion = Qnil;
3220 return -1;
3221 }
3222
3223 /* Setup raw-text or one of its subsidiaries in the structure
3224 coding_system CODING according to the already setup value eol_type
3225 in CODING. CODING should be setup for some coding system in
3226 advance. */
3227
3228 void
3229 setup_raw_text_coding_system (coding)
3230 struct coding_system *coding;
3231 {
3232 if (coding->type != coding_type_raw_text)
3233 {
3234 coding->symbol = Qraw_text;
3235 coding->type = coding_type_raw_text;
3236 if (coding->eol_type != CODING_EOL_UNDECIDED)
3237 {
3238 Lisp_Object subsidiaries;
3239 subsidiaries = Fget (Qraw_text, Qeol_type);
3240
3241 if (VECTORP (subsidiaries)
3242 && XVECTOR (subsidiaries)->size == 3)
3243 coding->symbol
3244 = XVECTOR (subsidiaries)->contents[coding->eol_type];
3245 }
3246 setup_coding_system (coding->symbol, coding);
3247 }
3248 return;
3249 }
3250
3251 /* Emacs has a mechanism to automatically detect a coding system if it
3252 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
3253 it's impossible to distinguish some coding systems accurately
3254 because they use the same range of codes. So, at first, coding
3255 systems are classified into the categories listed below:
3256
3257 o coding-category-emacs-mule
3258
3259 The category for a coding system which has the same code range
3260 as Emacs' internal format. Assigned the coding-system (Lisp
3261 symbol) `emacs-mule' by default.
3262
3263 o coding-category-sjis
3264
3265 The category for a coding system which has the same code range
3266 as SJIS. Assigned the coding-system (Lisp
3267 symbol) `japanese-shift-jis' by default.
3268
3269 o coding-category-iso-7
3270
3271 The category for a coding system which has the same code range
3272 as ISO2022 of 7-bit environment. This doesn't use any locking
3273 shift and single shift functions. This can encode/decode all
3274 charsets. Assigned the coding-system (Lisp symbol)
3275 `iso-2022-7bit' by default.
3276
3277 o coding-category-iso-7-tight
3278
3279 Same as coding-category-iso-7 except that this can
3280 encode/decode only the specified charsets.
3281
3282 o coding-category-iso-8-1
3283
3284 The category for a coding system which has the same code range
3285 as ISO2022 of 8-bit environment and graphic plane 1 used only
3286 for DIMENSION1 charset. This doesn't use any locking shift
3287 and single shift functions. Assigned the coding-system (Lisp
3288 symbol) `iso-latin-1' by default.
3289
3290 o coding-category-iso-8-2
3291
3292 The category for a coding system which has the same code range
3293 as ISO2022 of 8-bit environment and graphic plane 1 used only
3294 for DIMENSION2 charset. This doesn't use any locking shift
3295 and single shift functions. Assigned the coding-system (Lisp
3296 symbol) `japanese-iso-8bit' by default.
3297
3298 o coding-category-iso-7-else
3299
3300 The category for a coding system which has the same code range
3301 as ISO2022 of 7-bit environment but uses locking shift or
3302 single shift functions. Assigned the coding-system (Lisp
3303 symbol) `iso-2022-7bit-lock' by default.
3304
3305 o coding-category-iso-8-else
3306
3307 The category for a coding system which has the same code range
3308 as ISO2022 of 8-bit environment but uses locking shift or
3309 single shift functions. Assigned the coding-system (Lisp
3310 symbol) `iso-2022-8bit-ss2' by default.
3311
3312 o coding-category-big5
3313
3314 The category for a coding system which has the same code range
3315 as BIG5. Assigned the coding-system (Lisp symbol)
3316 `cn-big5' by default.
3317
3318 o coding-category-ccl
3319
3320 The category for a coding system of which encoder/decoder is
3321 written in CCL programs. The default value is nil, i.e., no
3322 coding system is assigned.
3323
3324 o coding-category-binary
3325
3326 The category for a coding system not categorized in any of the
3327 above. Assigned the coding-system (Lisp symbol)
3328 `no-conversion' by default.
3329
3330 Each of them is a Lisp symbol and the value is an actual
3331 `coding-system's (this is also a Lisp symbol) assigned by a user.
3332 What Emacs does actually is to detect a category of coding system.
3333 Then, it uses a `coding-system' assigned to it. If Emacs can't
3334 decide only one possible category, it selects a category of the
3335 highest priority. Priorities of categories are also specified by a
3336 user in a Lisp variable `coding-category-list'.
3337
3338 */
3339
3340 static
3341 int ascii_skip_code[256];
3342
3343 /* Detect how a text of length SRC_BYTES pointed by SOURCE is encoded.
3344 If it detects possible coding systems, return an integer in which
3345 appropriate flag bits are set. Flag bits are defined by macros
3346 CODING_CATEGORY_MASK_XXX in `coding.h'.
3347
3348 How many ASCII characters are at the head is returned as *SKIP. */
3349
3350 static int
3351 detect_coding_mask (source, src_bytes, priorities, skip)
3352 unsigned char *source;
3353 int src_bytes, *priorities, *skip;
3354 {
3355 register unsigned char c;
3356 unsigned char *src = source, *src_end = source + src_bytes;
3357 unsigned int mask;
3358 int i;
3359
3360 /* At first, skip all ASCII characters and control characters except
3361 for three ISO2022 specific control characters. */
3362 ascii_skip_code[ISO_CODE_SO] = 0;
3363 ascii_skip_code[ISO_CODE_SI] = 0;
3364 ascii_skip_code[ISO_CODE_ESC] = 0;
3365
3366 label_loop_detect_coding:
3367 while (src < src_end && ascii_skip_code[*src]) src++;
3368 *skip = src - source;
3369
3370 if (src >= src_end)
3371 /* We found nothing other than ASCII. There's nothing to do. */
3372 return 0;
3373
3374 c = *src;
3375 /* The text seems to be encoded in some multilingual coding system.
3376 Now, try to find in which coding system the text is encoded. */
3377 if (c < 0x80)
3378 {
3379 /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
3380 /* C is an ISO2022 specific control code of C0. */
3381 mask = detect_coding_iso2022 (src, src_end);
3382 if (mask == 0)
3383 {
3384 /* No valid ISO2022 code follows C. Try again. */
3385 src++;
3386 if (c == ISO_CODE_ESC)
3387 ascii_skip_code[ISO_CODE_ESC] = 1;
3388 else
3389 ascii_skip_code[ISO_CODE_SO] = ascii_skip_code[ISO_CODE_SI] = 1;
3390 goto label_loop_detect_coding;
3391 }
3392 if (priorities)
3393 goto label_return_highest_only;
3394 }
3395 else
3396 {
3397 int try;
3398
3399 if (c < 0xA0)
3400 {
3401 /* C is the first byte of SJIS character code,
3402 or a leading-code of Emacs' internal format (emacs-mule). */
3403 try = CODING_CATEGORY_MASK_SJIS | CODING_CATEGORY_MASK_EMACS_MULE;
3404
3405 /* Or, if C is a special latin extra code,
3406 or is an ISO2022 specific control code of C1 (SS2 or SS3),
3407 or is an ISO2022 control-sequence-introducer (CSI),
3408 we should also consider the possibility of ISO2022 codings. */
3409 if ((VECTORP (Vlatin_extra_code_table)
3410 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
3411 || (c == ISO_CODE_SS2 || c == ISO_CODE_SS3)
3412 || (c == ISO_CODE_CSI
3413 && (src < src_end
3414 && (*src == ']'
3415 || ((*src == '0' || *src == '1' || *src == '2')
3416 && src + 1 < src_end
3417 && src[1] == ']')))))
3418 try |= (CODING_CATEGORY_MASK_ISO_8_ELSE
3419 | CODING_CATEGORY_MASK_ISO_8BIT);
3420 }
3421 else
3422 /* C is a character of ISO2022 in graphic plane right,
3423 or a SJIS's 1-byte character code (i.e. JISX0201),
3424 or the first byte of BIG5's 2-byte code. */
3425 try = (CODING_CATEGORY_MASK_ISO_8_ELSE
3426 | CODING_CATEGORY_MASK_ISO_8BIT
3427 | CODING_CATEGORY_MASK_SJIS
3428 | CODING_CATEGORY_MASK_BIG5);
3429
3430 /* Or, we may have to consider the possibility of CCL. */
3431 if (coding_system_table[CODING_CATEGORY_IDX_CCL]
3432 && (coding_system_table[CODING_CATEGORY_IDX_CCL]
3433 ->spec.ccl.valid_codes)[c])
3434 try |= CODING_CATEGORY_MASK_CCL;
3435
3436 mask = 0;
3437 if (priorities)
3438 {
3439 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3440 {
3441 if (priorities[i] & try & CODING_CATEGORY_MASK_ISO)
3442 mask = detect_coding_iso2022 (src, src_end);
3443 else if (priorities[i] & try & CODING_CATEGORY_MASK_SJIS)
3444 mask = detect_coding_sjis (src, src_end);
3445 else if (priorities[i] & try & CODING_CATEGORY_MASK_BIG5)
3446 mask = detect_coding_big5 (src, src_end);
3447 else if (priorities[i] & try & CODING_CATEGORY_MASK_EMACS_MULE)
3448 mask = detect_coding_emacs_mule (src, src_end);
3449 else if (priorities[i] & try & CODING_CATEGORY_MASK_CCL)
3450 mask = detect_coding_ccl (src, src_end);
3451 else if (priorities[i] & CODING_CATEGORY_MASK_RAW_TEXT)
3452 mask = CODING_CATEGORY_MASK_RAW_TEXT;
3453 else if (priorities[i] & CODING_CATEGORY_MASK_BINARY)
3454 mask = CODING_CATEGORY_MASK_BINARY;
3455 if (mask)
3456 goto label_return_highest_only;
3457 }
3458 return CODING_CATEGORY_MASK_RAW_TEXT;
3459 }
3460 if (try & CODING_CATEGORY_MASK_ISO)
3461 mask |= detect_coding_iso2022 (src, src_end);
3462 if (try & CODING_CATEGORY_MASK_SJIS)
3463 mask |= detect_coding_sjis (src, src_end);
3464 if (try & CODING_CATEGORY_MASK_BIG5)
3465 mask |= detect_coding_big5 (src, src_end);
3466 if (try & CODING_CATEGORY_MASK_EMACS_MULE)
3467 mask |= detect_coding_emacs_mule (src, src_end);
3468 if (try & CODING_CATEGORY_MASK_CCL)
3469 mask |= detect_coding_ccl (src, src_end);
3470 }
3471 return (mask | CODING_CATEGORY_MASK_RAW_TEXT | CODING_CATEGORY_MASK_BINARY);
3472
3473 label_return_highest_only:
3474 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3475 {
3476 if (mask & priorities[i])
3477 return priorities[i];
3478 }
3479 return CODING_CATEGORY_MASK_RAW_TEXT;
3480 }
3481
3482 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
3483 The information of the detected coding system is set in CODING. */
3484
3485 void
3486 detect_coding (coding, src, src_bytes)
3487 struct coding_system *coding;
3488 unsigned char *src;
3489 int src_bytes;
3490 {
3491 unsigned int idx;
3492 int skip, mask, i;
3493 Lisp_Object val;
3494
3495 val = Vcoding_category_list;
3496 mask = detect_coding_mask (src, src_bytes, coding_priorities, &skip);
3497 coding->heading_ascii = skip;
3498
3499 if (!mask) return;
3500
3501 /* We found a single coding system of the highest priority in MASK. */
3502 idx = 0;
3503 while (mask && ! (mask & 1)) mask >>= 1, idx++;
3504 if (! mask)
3505 idx = CODING_CATEGORY_IDX_RAW_TEXT;
3506
3507 val = XSYMBOL (XVECTOR (Vcoding_category_table)->contents[idx])->value;
3508
3509 if (coding->eol_type != CODING_EOL_UNDECIDED)
3510 {
3511 Lisp_Object tmp;
3512
3513 tmp = Fget (val, Qeol_type);
3514 if (VECTORP (tmp))
3515 val = XVECTOR (tmp)->contents[coding->eol_type];
3516 }
3517 setup_coding_system (val, coding);
3518 /* Set this again because setup_coding_system reset this member. */
3519 coding->heading_ascii = skip;
3520 }
3521
3522 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
3523 SOURCE is encoded. Return one of CODING_EOL_LF, CODING_EOL_CRLF,
3524 CODING_EOL_CR, and CODING_EOL_UNDECIDED.
3525
3526 How many non-eol characters are at the head is returned as *SKIP. */
3527
3528 #define MAX_EOL_CHECK_COUNT 3
3529
3530 static int
3531 detect_eol_type (source, src_bytes, skip)
3532 unsigned char *source;
3533 int src_bytes, *skip;
3534 {
3535 unsigned char *src = source, *src_end = src + src_bytes;
3536 unsigned char c;
3537 int total = 0; /* How many end-of-lines are found so far. */
3538 int eol_type = CODING_EOL_UNDECIDED;
3539 int this_eol_type;
3540
3541 *skip = 0;
3542
3543 while (src < src_end && total < MAX_EOL_CHECK_COUNT)
3544 {
3545 c = *src++;
3546 if (c == '\n' || c == '\r')
3547 {
3548 if (*skip == 0)
3549 *skip = src - 1 - source;
3550 total++;
3551 if (c == '\n')
3552 this_eol_type = CODING_EOL_LF;
3553 else if (src >= src_end || *src != '\n')
3554 this_eol_type = CODING_EOL_CR;
3555 else
3556 this_eol_type = CODING_EOL_CRLF, src++;
3557
3558 if (eol_type == CODING_EOL_UNDECIDED)
3559 /* This is the first end-of-line. */
3560 eol_type = this_eol_type;
3561 else if (eol_type != this_eol_type)
3562 {
3563 /* The found type is different from what found before. */
3564 eol_type = CODING_EOL_INCONSISTENT;
3565 break;
3566 }
3567 }
3568 }
3569
3570 if (*skip == 0)
3571 *skip = src_end - source;
3572 return eol_type;
3573 }
3574
3575 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
3576 is encoded. If it detects an appropriate format of end-of-line, it
3577 sets the information in *CODING. */
3578
3579 void
3580 detect_eol (coding, src, src_bytes)
3581 struct coding_system *coding;
3582 unsigned char *src;
3583 int src_bytes;
3584 {
3585 Lisp_Object val;
3586 int skip;
3587 int eol_type = detect_eol_type (src, src_bytes, &skip);
3588
3589 if (coding->heading_ascii > skip)
3590 coding->heading_ascii = skip;
3591 else
3592 skip = coding->heading_ascii;
3593
3594 if (eol_type == CODING_EOL_UNDECIDED)
3595 return;
3596 if (eol_type == CODING_EOL_INCONSISTENT)
3597 {
3598 #if 0
3599 /* This code is suppressed until we find a better way to
3600 distinguish raw text file and binary file. */
3601
3602 /* If we have already detected that the coding is raw-text, the
3603 coding should actually be no-conversion. */
3604 if (coding->type == coding_type_raw_text)
3605 {
3606 setup_coding_system (Qno_conversion, coding);
3607 return;
3608 }
3609 /* Else, let's decode only text code anyway. */
3610 #endif /* 0 */
3611 eol_type = CODING_EOL_LF;
3612 }
3613
3614 val = Fget (coding->symbol, Qeol_type);
3615 if (VECTORP (val) && XVECTOR (val)->size == 3)
3616 {
3617 setup_coding_system (XVECTOR (val)->contents[eol_type], coding);
3618 coding->heading_ascii = skip;
3619 }
3620 }
3621
/* Extra bytes added to every conversion buffer size estimate.  */
#define CONVERSION_BUFFER_EXTRA_ROOM 256

/* Factor by which decoding may enlarge text, chosen by the type of
   CODING (a pointer to struct coding_system): 3 for ISO2022, 2 for
   SJIS and BIG5, 1 for raw-text, the decoder's own buf_magnification
   for CCL, and 2 for everything else.  CODING is evaluated more than
   once, so it must not have side effects.  */
#define DECODING_BUFFER_MAG(coding)                                     \
  ((coding)->type == coding_type_iso2022                                \
   ? 3                                                                  \
   : (((coding)->type == coding_type_sjis                               \
       || (coding)->type == coding_type_big5)                           \
      ? 2                                                               \
      : ((coding)->type == coding_type_raw_text                         \
         ? 1                                                            \
         : ((coding)->type == coding_type_ccl                           \
            ? (coding)->spec.ccl.decoder.buf_magnification              \
            : 2))))
3634
3635 /* Return maximum size (bytes) of a buffer enough for decoding
3636 SRC_BYTES of text encoded in CODING. */
3637
3638 int
3639 decoding_buffer_size (coding, src_bytes)
3640 struct coding_system *coding;
3641 int src_bytes;
3642 {
3643 return (src_bytes * DECODING_BUFFER_MAG (coding)
3644 + CONVERSION_BUFFER_EXTRA_ROOM);
3645 }
3646
3647 /* Return maximum size (bytes) of a buffer enough for encoding
3648 SRC_BYTES of text to CODING. */
3649
3650 int
3651 encoding_buffer_size (coding, src_bytes)
3652 struct coding_system *coding;
3653 int src_bytes;
3654 {
3655 int magnification;
3656
3657 if (coding->type == coding_type_ccl)
3658 magnification = coding->spec.ccl.encoder.buf_magnification;
3659 else
3660 magnification = 3;
3661
3662 return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
3663 }
3664
/* Smallest size get_conversion_buffer ever allocates.  */
#ifndef MINIMUM_CONVERSION_BUFFER_SIZE
#define MINIMUM_CONVERSION_BUFFER_SIZE 1024
#endif

/* The shared work buffer for encoding and decoding, and its current
   size in bytes.  */
char *conversion_buffer;
int conversion_buffer_size;

/* Return a pointer to a SIZE bytes of buffer to be used for encoding
   or decoding.  Sufficient memory is allocated automatically: when
   the current buffer is too small, it is replaced by a new one at
   least twice as large.  The old contents are NOT preserved, and
   pointers obtained from earlier calls become invalid.

   NOTE(review): this comment used to promise a NULL return when
   memory runs out, but the result of xmalloc is used unchecked here;
   confirm how xmalloc reports failure.  */

char *
get_conversion_buffer (size)
     int size;
{
  if (size > conversion_buffer_size)
    {
      char *buf;
      int real_size = conversion_buffer_size * 2;

      /* Never start doubling from zero (e.g. when the buffer has not
         been allocated yet); the loop below would not terminate.  */
      if (real_size < MINIMUM_CONVERSION_BUFFER_SIZE)
        real_size = MINIMUM_CONVERSION_BUFFER_SIZE;

      while (real_size < size) real_size *= 2;
      buf = (char *) xmalloc (real_size);
      xfree (conversion_buffer);
      conversion_buffer = buf;
      conversion_buffer_size = real_size;
    }
  return conversion_buffer;
}
3693
3694 int
3695 ccl_coding_driver (coding, source, destination, src_bytes, dst_bytes, encodep)
3696 struct coding_system *coding;
3697 unsigned char *source, *destination;
3698 int src_bytes, dst_bytes, encodep;
3699 {
3700 struct ccl_program *ccl
3701 = encodep ? &coding->spec.ccl.encoder : &coding->spec.ccl.decoder;
3702 int result;
3703
3704 ccl->last_block = coding->mode & CODING_MODE_LAST_BLOCK;
3705
3706 coding->produced = ccl_driver (ccl, source, destination,
3707 src_bytes, dst_bytes, &(coding->consumed));
3708 coding->produced_char
3709 = (encodep
3710 ? coding->produced
3711 : multibyte_chars_in_text (destination, coding->produced));
3712 coding->consumed_char
3713 = multibyte_chars_in_text (source, coding->consumed);
3714
3715 switch (ccl->status)
3716 {
3717 case CCL_STAT_SUSPEND_BY_SRC:
3718 result = CODING_FINISH_INSUFFICIENT_SRC;
3719 break;
3720 case CCL_STAT_SUSPEND_BY_DST:
3721 result = CODING_FINISH_INSUFFICIENT_DST;
3722 break;
3723 case CCL_STAT_QUIT:
3724 case CCL_STAT_INVALID_CMD:
3725 result = CODING_FINISH_INTERRUPT;
3726 break;
3727 default:
3728 result = CODING_FINISH_NORMAL;
3729 break;
3730 }
3731 return result;
3732 }
3733
3734 /* See "GENERAL NOTES about `decode_coding_XXX ()' functions". Before
3735 decoding, it may detect coding system and format of end-of-line if
3736 those are not yet decided.
3737
3738 This function does not make full use of DESTINATION buffer. For
3739 instance, if coding->type is coding_type_iso2022, it uses only
3740 (DST_BYTES - 7) bytes of DESTINATION buffer. In the case that
3741 DST_BYTES is decided by the function decoding_buffer_size, it
3742 contains extra 256 bytes (defined by CONVERSION_BUFFER_EXTRA_ROOM).
3743 So, this function can decode the full SOURCE. But, in the other
3744 case, if you want to avoid carry over, you must supply at least 7
3745 bytes more area in DESTINATION buffer than expected maximum bytes
3746 that will be produced by this function. */
3747
3748 int
3749 decode_coding (coding, source, destination, src_bytes, dst_bytes)
3750 struct coding_system *coding;
3751 unsigned char *source, *destination;
3752 int src_bytes, dst_bytes;
3753 {
3754 int result;
3755
3756 if (src_bytes <= 0
3757 && coding->type != coding_type_ccl
3758 && ! (coding->mode & CODING_MODE_LAST_BLOCK
3759 && CODING_REQUIRE_FLUSHING (coding)))
3760 {
3761 coding->produced = coding->produced_char = 0;
3762 coding->consumed = coding->consumed_char = 0;
3763 coding->fake_multibyte = 0;
3764 return CODING_FINISH_NORMAL;
3765 }
3766
3767 if (coding->type == coding_type_undecided)
3768 detect_coding (coding, source, src_bytes);
3769
3770 if (coding->eol_type == CODING_EOL_UNDECIDED)
3771 detect_eol (coding, source, src_bytes);
3772
3773 switch (coding->type)
3774 {
3775 case coding_type_emacs_mule:
3776 case coding_type_undecided:
3777 case coding_type_raw_text:
3778 if (coding->eol_type == CODING_EOL_LF
3779 || coding->eol_type == CODING_EOL_UNDECIDED)
3780 goto label_no_conversion;
3781 result = decode_eol (coding, source, destination, src_bytes, dst_bytes);
3782 break;
3783
3784 case coding_type_sjis:
3785 result = decode_coding_sjis_big5 (coding, source, destination,
3786 src_bytes, dst_bytes, 1);
3787 break;
3788
3789 case coding_type_iso2022:
3790 result = decode_coding_iso2022 (coding, source, destination,
3791 src_bytes, dst_bytes);
3792 break;
3793
3794 case coding_type_big5:
3795 result = decode_coding_sjis_big5 (coding, source, destination,
3796 src_bytes, dst_bytes, 0);
3797 break;
3798
3799 case coding_type_ccl:
3800 result = ccl_coding_driver (coding, source, destination,
3801 src_bytes, dst_bytes, 0);
3802 break;
3803
3804 default: /* i.e. case coding_type_no_conversion: */
3805 label_no_conversion:
3806 if (dst_bytes && src_bytes > dst_bytes)
3807 {
3808 coding->produced = dst_bytes;
3809 result = CODING_FINISH_INSUFFICIENT_DST;
3810 }
3811 else
3812 {
3813 coding->produced = src_bytes;
3814 result = CODING_FINISH_NORMAL;
3815 }
3816 if (dst_bytes)
3817 bcopy (source, destination, coding->produced);
3818 else
3819 safe_bcopy (source, destination, coding->produced);
3820 coding->fake_multibyte = 1;
3821 coding->consumed
3822 = coding->consumed_char = coding->produced_char = coding->produced;
3823 break;
3824 }
3825
3826 return result;
3827 }
3828
3829 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions".
3830
3831 This function does not make full use of DESTINATION buffer. For
3832 instance, if coding->type is coding_type_iso2022, it uses only
3833 (DST_BYTES - 20) bytes of DESTINATION buffer. In the case that
3834 DST_BYTES is decided by the function encoding_buffer_size, it
3835 contains extra 256 bytes (defined by CONVERSION_BUFFER_EXTRA_ROOM).
3836 So, this function can encode the full SOURCE. But, in the other
3837 case, if you want to avoid carry over, you must supply at least 20
3838 bytes more area in DESTINATION buffer than expected maximum bytes
3839 that will be produced by this function. */
3840
3841 int
3842 encode_coding (coding, source, destination, src_bytes, dst_bytes)
3843 struct coding_system *coding;
3844 unsigned char *source, *destination;
3845 int src_bytes, dst_bytes;
3846 {
3847 int result;
3848
3849 if (src_bytes <= 0
3850 && ! (coding->mode & CODING_MODE_LAST_BLOCK
3851 && CODING_REQUIRE_FLUSHING (coding)))
3852 {
3853 coding->produced = coding->produced_char = 0;
3854 coding->consumed = coding->consumed_char = 0;
3855 coding->fake_multibyte = 0;
3856 return CODING_FINISH_NORMAL;
3857 }
3858
3859 switch (coding->type)
3860 {
3861 case coding_type_emacs_mule:
3862 case coding_type_undecided:
3863 case coding_type_raw_text:
3864 if (coding->eol_type == CODING_EOL_LF
3865 || coding->eol_type == CODING_EOL_UNDECIDED)
3866 goto label_no_conversion;
3867 result = encode_eol (coding, source, destination, src_bytes, dst_bytes);
3868 break;
3869
3870 case coding_type_sjis:
3871 result = encode_coding_sjis_big5 (coding, source, destination,
3872 src_bytes, dst_bytes, 1);
3873 break;
3874
3875 case coding_type_iso2022:
3876 result = encode_coding_iso2022 (coding, source, destination,
3877 src_bytes, dst_bytes);
3878 break;
3879
3880 case coding_type_big5:
3881 result = encode_coding_sjis_big5 (coding, source, destination,
3882 src_bytes, dst_bytes, 0);
3883 break;
3884
3885 case coding_type_ccl:
3886 result = ccl_coding_driver (coding, source, destination,
3887 src_bytes, dst_bytes, 1);
3888 break;
3889
3890 default: /* i.e. case coding_type_no_conversion: */
3891 label_no_conversion:
3892 if (dst_bytes && src_bytes > dst_bytes)
3893 {
3894 coding->produced = dst_bytes;
3895 result = CODING_FINISH_INSUFFICIENT_DST;
3896 }
3897 else
3898 {
3899 coding->produced = src_bytes;
3900 result = CODING_FINISH_NORMAL;
3901 }
3902 if (dst_bytes)
3903 bcopy (source, destination, coding->produced);
3904 else
3905 safe_bcopy (source, destination, coding->produced);
3906 if (coding->mode & CODING_MODE_SELECTIVE_DISPLAY)
3907 {
3908 unsigned char *p = destination, *pend = p + coding->produced;
3909 while (p < pend)
3910 if (*p++ == '\015') p[-1] = '\n';
3911 }
3912 coding->fake_multibyte = 1;
3913 coding->consumed
3914 = coding->consumed_char = coding->produced_char = coding->produced;
3915 break;
3916 }
3917
3918 return result;
3919 }
3920
3921 /* Scan text in the region between *BEG and *END (byte positions),
3922 skip characters which we don't have to decode by coding system
3923 CODING at the head and tail, then set *BEG and *END to the region
3924 of the text we actually have to convert. The caller should move
3925 the gap out of the region in advance.
3926
3927 If STR is not NULL, *BEG and *END are indices into STR. */
3928
3929 static void
3930 shrink_decoding_region (beg, end, coding, str)
3931 int *beg, *end;
3932 struct coding_system *coding;
3933 unsigned char *str;
3934 {
3935 unsigned char *begp_orig, *begp, *endp_orig, *endp, c;
3936 int eol_conversion;
3937 Lisp_Object translation_table;
3938
3939 if (coding->type == coding_type_ccl
3940 || coding->type == coding_type_undecided
3941 || !NILP (coding->post_read_conversion))
3942 {
3943 /* We can't skip any data. */
3944 return;
3945 }
3946 else if (coding->type == coding_type_no_conversion)
3947 {
3948 /* We need no conversion, but don't have to skip any data here.
3949 Decoding routine handles them effectively anyway. */
3950 return;
3951 }
3952
3953 translation_table = coding->translation_table_for_decode;
3954 if (NILP (translation_table) && !NILP (Venable_character_translation))
3955 translation_table = Vstandard_translation_table_for_decode;
3956 if (CHAR_TABLE_P (translation_table))
3957 {
3958 int i;
3959 for (i = 0; i < 128; i++)
3960 if (!NILP (CHAR_TABLE_REF (translation_table, i)))
3961 break;
3962 if (i < 128)
3963 /* Some ASCII character should be tranlsated. We give up
3964 shrinking. */
3965 return;
3966 }
3967
3968 eol_conversion = (coding->eol_type != CODING_EOL_LF);
3969
3970 if ((! eol_conversion) && (coding->heading_ascii >= 0))
3971 /* Detection routine has already found how much we can skip at the
3972 head. */
3973 *beg += coding->heading_ascii;
3974
3975 if (str)
3976 {
3977 begp_orig = begp = str + *beg;
3978 endp_orig = endp = str + *end;
3979 }
3980 else
3981 {
3982 begp_orig = begp = BYTE_POS_ADDR (*beg);
3983 endp_orig = endp = begp + *end - *beg;
3984 }
3985
3986 switch (coding->type)
3987 {
3988 case coding_type_emacs_mule:
3989 case coding_type_raw_text:
3990 if (eol_conversion)
3991 {
3992 if (coding->heading_ascii < 0)
3993 while (begp < endp && *begp != '\r' && *begp < 0x80) begp++;
3994 while (begp < endp && endp[-1] != '\r' && endp[-1] < 0x80)
3995 endp--;
3996 /* Do not consider LF as ascii if preceded by CR, since that
3997 confuses eol decoding. */
3998 if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
3999 endp++;
4000 }
4001 else
4002 begp = endp;
4003 break;
4004
4005 case coding_type_sjis:
4006 case coding_type_big5:
4007 /* We can skip all ASCII characters at the head. */
4008 if (coding->heading_ascii < 0)
4009 {
4010 if (eol_conversion)
4011 while (begp < endp && *begp < 0x80 && *begp != '\r') begp++;
4012 else
4013 while (begp < endp && *begp < 0x80) begp++;
4014 }
4015 /* We can skip all ASCII characters at the tail except for the
4016 second byte of SJIS or BIG5 code. */
4017 if (eol_conversion)
4018 while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\r') endp--;
4019 else
4020 while (begp < endp && endp[-1] < 0x80) endp--;
4021 /* Do not consider LF as ascii if preceded by CR, since that
4022 confuses eol decoding. */
4023 if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
4024 endp++;
4025 if (begp < endp && endp < endp_orig && endp[-1] >= 0x80)
4026 endp++;
4027 break;
4028
4029 default: /* i.e. case coding_type_iso2022: */
4030 if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
4031 /* We can't skip any data. */
4032 break;
4033 if (coding->heading_ascii < 0)
4034 {
4035 /* We can skip all ASCII characters at the head except for a
4036 few control codes. */
4037 while (begp < endp && (c = *begp) < 0x80
4038 && c != ISO_CODE_CR && c != ISO_CODE_SO
4039 && c != ISO_CODE_SI && c != ISO_CODE_ESC
4040 && (!eol_conversion || c != ISO_CODE_LF))
4041 begp++;
4042 }
4043 switch (coding->category_idx)
4044 {
4045 case CODING_CATEGORY_IDX_ISO_8_1:
4046 case CODING_CATEGORY_IDX_ISO_8_2:
4047 /* We can skip all ASCII characters at the tail. */
4048 if (eol_conversion)
4049 while (begp < endp && (c = endp[-1]) < 0x80 && c != '\r') endp--;
4050 else
4051 while (begp < endp && endp[-1] < 0x80) endp--;
4052 /* Do not consider LF as ascii if preceded by CR, since that
4053 confuses eol decoding. */
4054 if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
4055 endp++;
4056 break;
4057
4058 case CODING_CATEGORY_IDX_ISO_7:
4059 case CODING_CATEGORY_IDX_ISO_7_TIGHT:
4060 {
4061 /* We can skip all charactes at the tail except for 8-bit
4062 codes and ESC and the following 2-byte at the tail. */
4063 unsigned char *eight_bit = NULL;
4064
4065 if (eol_conversion)
4066 while (begp < endp
4067 && (c = endp[-1]) != ISO_CODE_ESC && c != '\r')
4068 {
4069 if (!eight_bit && c & 0x80) eight_bit = endp;
4070 endp--;
4071 }
4072 else
4073 while (begp < endp
4074 && (c = endp[-1]) != ISO_CODE_ESC)
4075 {
4076 if (!eight_bit && c & 0x80) eight_bit = endp;
4077 endp--;
4078 }
4079 /* Do not consider LF as ascii if preceded by CR, since that
4080 confuses eol decoding. */
4081 if (begp < endp && endp < endp_orig
4082 && endp[-1] == '\r' && endp[0] == '\n')
4083 endp++;
4084 if (begp < endp && endp[-1] == ISO_CODE_ESC)
4085 {
4086 if (endp + 1 < endp_orig && end[0] == '(' && end[1] == 'B')
4087 /* This is an ASCII designation sequence. We can
4088 surely skip the tail. But, if we have
4089 encountered an 8-bit code, skip only the codes
4090 after that. */
4091 endp = eight_bit ? eight_bit : endp + 2;
4092 else
4093 /* Hmmm, we can't skip the tail. */
4094 endp = endp_orig;
4095 }
4096 else if (eight_bit)
4097 endp = eight_bit;
4098 }
4099 }
4100 }
4101 *beg += begp - begp_orig;
4102 *end += endp - endp_orig;
4103 return;
4104 }
4105
4106 /* Like shrink_decoding_region but for encoding. */
4107
4108 static void
4109 shrink_encoding_region (beg, end, coding, str)
4110 int *beg, *end;
4111 struct coding_system *coding;
4112 unsigned char *str;
4113 {
4114 unsigned char *begp_orig, *begp, *endp_orig, *endp;
4115 int eol_conversion;
4116 Lisp_Object translation_table;
4117
4118 if (coding->type == coding_type_ccl)
4119 /* We can't skip any data. */
4120 return;
4121 else if (coding->type == coding_type_no_conversion)
4122 {
4123 /* We need no conversion. */
4124 *beg = *end;
4125 return;
4126 }
4127
4128 translation_table = coding->translation_table_for_encode;
4129 if (NILP (translation_table) && !NILP (Venable_character_translation))
4130 translation_table = Vstandard_translation_table_for_encode;
4131 if (CHAR_TABLE_P (translation_table))
4132 {
4133 int i;
4134 for (i = 0; i < 128; i++)
4135 if (!NILP (CHAR_TABLE_REF (translation_table, i)))
4136 break;
4137 if (i < 128)
4138 /* Some ASCII character should be tranlsated. We give up
4139 shrinking. */
4140 return;
4141 }
4142
4143 if (str)
4144 {
4145 begp_orig = begp = str + *beg;
4146 endp_orig = endp = str + *end;
4147 }
4148 else
4149 {
4150 begp_orig = begp = BYTE_POS_ADDR (*beg);
4151 endp_orig = endp = begp + *end - *beg;
4152 }
4153
4154 eol_conversion = (coding->eol_type == CODING_EOL_CR
4155 || coding->eol_type == CODING_EOL_CRLF);
4156
4157 /* Here, we don't have to check coding->pre_write_conversion because
4158 the caller is expected to have handled it already. */
4159 switch (coding->type)
4160 {
4161 case coding_type_undecided:
4162 case coding_type_emacs_mule:
4163 case coding_type_raw_text:
4164 if (eol_conversion)
4165 {
4166 while (begp < endp && *begp != '\n') begp++;
4167 while (begp < endp && endp[-1] != '\n') endp--;
4168 }
4169 else
4170 begp = endp;
4171 break;
4172
4173 case coding_type_iso2022:
4174 if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
4175 /* We can't skip any data. */
4176 break;
4177 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL)
4178 {
4179 unsigned char *bol = begp;
4180 while (begp < endp && *begp < 0x80)
4181 {
4182 begp++;
4183 if (begp[-1] == '\n')
4184 bol = begp;
4185 }
4186 begp = bol;
4187 goto label_skip_tail;
4188 }
4189 /* fall down ... */
4190
4191 default:
4192 /* We can skip all ASCII characters at the head and tail. */
4193 if (eol_conversion)
4194 while (begp < endp && *begp < 0x80 && *begp != '\n') begp++;
4195 else
4196 while (begp < endp && *begp < 0x80) begp++;
4197 label_skip_tail:
4198 if (eol_conversion)
4199 while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\n') endp--;
4200 else
4201 while (begp < endp && *(endp - 1) < 0x80) endp--;
4202 break;
4203 }
4204
4205 *beg += begp - begp_orig;
4206 *end += endp - endp_orig;
4207 return;
4208 }
4209
/* Shrinking the conversion region has some overhead of its own, so
   it is not attempted unless the length of the region is greater
   than this value.  */
static int shrink_conversion_region_threshhold = 1024;

/* If *END - *BEG exceeds the threshold above, call
   shrink_encoding_region (when ENCODEP is nonzero) or
   shrink_decoding_region (otherwise) with the remaining arguments;
   else do nothing.  */
#define SHRINK_CONVERSION_REGION(beg, end, coding, str, encodep)	\
  do {									\
    if (*(end) - *(beg) <= shrink_conversion_region_threshhold)	\
      break;								\
    if (encodep)							\
      shrink_encoding_region (beg, end, coding, str);			\
    else								\
      shrink_decoding_region (beg, end, coding, str);			\
  } while (0)
4223
4224 /* Decode (if ENCODEP is zero) or encode (if ENCODEP is nonzero) the
4225 text from FROM to TO (byte positions are FROM_BYTE and TO_BYTE) by
4226 coding system CODING, and return the status code of code conversion
4227 (currently, this value has no meaning).
4228
4229 How many characters (and bytes) are converted to how many
4230 characters (and bytes) are recorded in members of the structure
4231 CODING.
4232
4233 If REPLACE is nonzero, we do various things as if the original text
4234 is deleted and a new text is inserted. See the comments in
4235 replace_range (insdel.c) to know what we are doing. */
4236
4237 int
4238 code_convert_region (from, from_byte, to, to_byte, coding, encodep, replace)
4239 int from, from_byte, to, to_byte, encodep, replace;
4240 struct coding_system *coding;
4241 {
4242 int len = to - from, len_byte = to_byte - from_byte;
4243 int require, inserted, inserted_byte;
4244 int head_skip, tail_skip, total_skip;
4245 Lisp_Object saved_coding_symbol;
4246 int multibyte = !NILP (current_buffer->enable_multibyte_characters);
4247 int first = 1;
4248 int fake_multibyte = 0;
4249 unsigned char *src, *dst;
4250 Lisp_Object deletion;
4251 int orig_point = PT, orig_len = len;
4252 int prev_Z;
4253
4254 deletion = Qnil;
4255 saved_coding_symbol = Qnil;
4256
4257 if (from < PT && PT < to)
4258 {
4259 TEMP_SET_PT_BOTH (from, from_byte);
4260 orig_point = from;
4261 }
4262
4263 if (replace)
4264 {
4265 int saved_from = from;
4266
4267 prepare_to_modify_buffer (from, to, &from);
4268 if (saved_from != from)
4269 {
4270 to = from + len;
4271 if (multibyte)
4272 from_byte = CHAR_TO_BYTE (from), to_byte = CHAR_TO_BYTE (to);
4273 else
4274 from_byte = from, to_byte = to;
4275 len_byte = to_byte - from_byte;
4276 }
4277 }
4278
4279 if (! encodep && CODING_REQUIRE_DETECTION (coding))
4280 {
4281 /* We must detect encoding of text and eol format. */
4282
4283 if (from < GPT && to > GPT)
4284 move_gap_both (from, from_byte);
4285 if (coding->type == coding_type_undecided)
4286 {
4287 detect_coding (coding, BYTE_POS_ADDR (from_byte), len_byte);
4288 if (coding->type == coding_type_undecided)
4289 /* It seems that the text contains only ASCII, but we
4290 should not left it undecided because the deeper
4291 decoding routine (decode_coding) tries to detect the
4292 encodings again in vain. */
4293 coding->type = coding_type_emacs_mule;
4294 }
4295 if (coding->eol_type == CODING_EOL_UNDECIDED)
4296 {
4297 saved_coding_symbol = coding->symbol;
4298 detect_eol (coding, BYTE_POS_ADDR (from_byte), len_byte);
4299 if (coding->eol_type == CODING_EOL_UNDECIDED)
4300 coding->eol_type = CODING_EOL_LF;
4301 /* We had better recover the original eol format if we
4302 encounter an inconsitent eol format while decoding. */
4303 coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4304 }
4305 }
4306
4307 coding->consumed_char = len, coding->consumed = len_byte;
4308
4309 if (encodep
4310 ? ! CODING_REQUIRE_ENCODING (coding)
4311 : ! CODING_REQUIRE_DECODING (coding))
4312 {
4313 coding->produced = len_byte;
4314 if (multibyte
4315 && ! replace
4316 /* See the comment of the member heading_ascii in coding.h. */
4317 && coding->heading_ascii < len_byte)
4318 {
4319 /* We still may have to combine byte at the head and the
4320 tail of the text in the region. */
4321 if (from < GPT && GPT < to)
4322 move_gap_both (to, to_byte);
4323 len = multibyte_chars_in_text (BYTE_POS_ADDR (from_byte), len_byte);
4324 adjust_after_insert (from, from_byte, to, to_byte, len);
4325 coding->produced_char = len;
4326 }
4327 else
4328 {
4329 if (!replace)
4330 adjust_after_insert (from, from_byte, to, to_byte, len_byte);
4331 coding->produced_char = len_byte;
4332 }
4333 return 0;
4334 }
4335
4336 /* Now we convert the text. */
4337
4338 /* For encoding, we must process pre-write-conversion in advance. */
4339 if (encodep
4340 && ! NILP (coding->pre_write_conversion)
4341 && SYMBOLP (coding->pre_write_conversion)
4342 && ! NILP (Ffboundp (coding->pre_write_conversion)))
4343 {
4344 /* The function in pre-write-conversion may put a new text in a
4345 new buffer. */
4346 struct buffer *prev = current_buffer;
4347 Lisp_Object new;
4348
4349 call2 (coding->pre_write_conversion,
4350 make_number (from), make_number (to));
4351 if (current_buffer != prev)
4352 {
4353 len = ZV - BEGV;
4354 new = Fcurrent_buffer ();
4355 set_buffer_internal_1 (prev);
4356 del_range_2 (from, from_byte, to, to_byte);
4357 TEMP_SET_PT_BOTH (from, from_byte);
4358 insert_from_buffer (XBUFFER (new), 1, len, 0);
4359 Fkill_buffer (new);
4360 if (orig_point >= to)
4361 orig_point += len - orig_len;
4362 else if (orig_point > from)
4363 orig_point = from;
4364 orig_len = len;
4365 to = from + len;
4366 from_byte = multibyte ? CHAR_TO_BYTE (from) : from_byte;
4367 to_byte = multibyte ? CHAR_TO_BYTE (to) : to;
4368 len_byte = to_byte - from_byte;
4369 TEMP_SET_PT_BOTH (from, from_byte);
4370 }
4371 }
4372
4373 if (replace)
4374 deletion = make_buffer_string_both (from, from_byte, to, to_byte, 1);
4375
4376 /* Try to skip the heading and tailing ASCIIs. */
4377 {
4378 int from_byte_orig = from_byte, to_byte_orig = to_byte;
4379
4380 if (from < GPT && GPT < to)
4381 move_gap_both (from, from_byte);
4382 SHRINK_CONVERSION_REGION (&from_byte, &to_byte, coding, NULL, encodep);
4383 if (from_byte == to_byte
4384 && coding->type != coding_type_ccl
4385 && ! (coding->mode & CODING_MODE_LAST_BLOCK
4386 && CODING_REQUIRE_FLUSHING (coding)))
4387 {
4388 coding->produced = len_byte;
4389 coding->produced_char = multibyte ? len : len_byte;
4390 if (!replace)
4391 /* We must record and adjust for this new text now. */
4392 adjust_after_insert (from, from_byte_orig, to, to_byte_orig, len);
4393 return 0;
4394 }
4395
4396 head_skip = from_byte - from_byte_orig;
4397 tail_skip = to_byte_orig - to_byte;
4398 total_skip = head_skip + tail_skip;
4399 from += head_skip;
4400 to -= tail_skip;
4401 len -= total_skip; len_byte -= total_skip;
4402 }
4403
4404 /* The code conversion routine can not preserve text properties for
4405 now. So, we must remove all text properties in the region.
4406 Here, we must suppress all modification hooks. */
4407 if (replace)
4408 {
4409 int saved_inhibit_modification_hooks = inhibit_modification_hooks;
4410 inhibit_modification_hooks = 1;
4411 Fset_text_properties (make_number (from), make_number (to), Qnil, Qnil);
4412 inhibit_modification_hooks = saved_inhibit_modification_hooks;
4413 }
4414
4415 /* For converion, we must put the gap before the text in addition to
4416 making the gap larger for efficient decoding. The required gap
4417 size starts from 2000 which is the magic number used in make_gap.
4418 But, after one batch of conversion, it will be incremented if we
4419 find that it is not enough . */
4420 require = 2000;
4421
4422 if (GAP_SIZE < require)
4423 make_gap (require - GAP_SIZE);
4424 move_gap_both (from, from_byte);
4425
4426 inserted = inserted_byte = 0;
4427 src = GAP_END_ADDR, dst = GPT_ADDR;
4428
4429 GAP_SIZE += len_byte;
4430 ZV -= len;
4431 Z -= len;
4432 ZV_BYTE -= len_byte;
4433 Z_BYTE -= len_byte;
4434
4435 if (GPT - BEG < BEG_UNCHANGED)
4436 BEG_UNCHANGED = GPT - BEG;
4437 if (Z - GPT < END_UNCHANGED)
4438 END_UNCHANGED = Z - GPT;
4439
4440 for (;;)
4441 {
4442 int result;
4443
4444 /* The buffer memory is changed from:
4445 +--------+converted-text+---------+-------original-text------+---+
4446 |<-from->|<--inserted-->|---------|<-----------len---------->|---|
4447 |<------------------- GAP_SIZE -------------------->| */
4448 if (encodep)
4449 result = encode_coding (coding, src, dst, len_byte, 0);
4450 else
4451 result = decode_coding (coding, src, dst, len_byte, 0);
4452 /* to:
4453 +--------+-------converted-text--------+--+---original-text--+---+
4454 |<-from->|<--inserted-->|<--produced-->|--|<-(len-consumed)->|---|
4455 |<------------------- GAP_SIZE -------------------->| */
4456 if (coding->fake_multibyte)
4457 fake_multibyte = 1;
4458
4459 if (!encodep && !multibyte)
4460 coding->produced_char = coding->produced;
4461 inserted += coding->produced_char;
4462 inserted_byte += coding->produced;
4463 len_byte -= coding->consumed;
4464 src += coding->consumed;
4465 dst += inserted_byte;
4466
4467 if (result == CODING_FINISH_NORMAL)
4468 {
4469 src += len_byte;
4470 break;
4471 }
4472 if (! encodep && result == CODING_FINISH_INCONSISTENT_EOL)
4473 {
4474 unsigned char *pend = dst, *p = pend - inserted_byte;
4475 Lisp_Object eol_type;
4476
4477 /* Encode LFs back to the original eol format (CR or CRLF). */
4478 if (coding->eol_type == CODING_EOL_CR)
4479 {
4480 while (p < pend) if (*p++ == '\n') p[-1] = '\r';
4481 }
4482 else
4483 {
4484 int count = 0;
4485
4486 while (p < pend) if (*p++ == '\n') count++;
4487 if (src - dst < count)
4488 {
4489 /* We don't have sufficient room for encoding LFs
4490 back to CRLF. We must record converted and
4491 not-yet-converted text back to the buffer
4492 content, enlarge the gap, then record them out of
4493 the buffer contents again. */
4494 int add = len_byte + inserted_byte;
4495
4496 GAP_SIZE -= add;
4497 ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
4498 GPT += inserted_byte; GPT_BYTE += inserted_byte;
4499 make_gap (count - GAP_SIZE);
4500 GAP_SIZE += add;
4501 ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
4502 GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
4503 /* Don't forget to update SRC, DST, and PEND. */
4504 src = GAP_END_ADDR - len_byte;
4505 dst = GPT_ADDR + inserted_byte;
4506 pend = dst;
4507 }
4508 inserted += count;
4509 inserted_byte += count;
4510 coding->produced += count;
4511 p = dst = pend + count;
4512 while (count)
4513 {
4514 *--p = *--pend;
4515 if (*p == '\n') count--, *--p = '\r';
4516 }
4517 }
4518
4519 /* Suppress eol-format conversion in the further conversion. */
4520 coding->eol_type = CODING_EOL_LF;
4521
4522 /* Set the coding system symbol to that for Unix-like EOL. */
4523 eol_type = Fget (saved_coding_symbol, Qeol_type);
4524 if (VECTORP (eol_type)
4525 && XVECTOR (eol_type)->size == 3
4526 && SYMBOLP (XVECTOR (eol_type)->contents[CODING_EOL_LF]))
4527 coding->symbol = XVECTOR (eol_type)->contents[CODING_EOL_LF];
4528 else
4529 coding->symbol = saved_coding_symbol;
4530
4531 continue;
4532 }
4533 if (len_byte <= 0)
4534 {
4535 if (coding->type != coding_type_ccl
4536 || coding->mode & CODING_MODE_LAST_BLOCK)
4537 break;
4538 coding->mode |= CODING_MODE_LAST_BLOCK;
4539 continue;
4540 }
4541 if (result == CODING_FINISH_INSUFFICIENT_SRC)
4542 {
4543 /* The source text ends in invalid codes. Let's just
4544 make them valid buffer contents, and finish conversion. */
4545 inserted += len_byte;
4546 inserted_byte += len_byte;
4547 while (len_byte--)
4548 *dst++ = *src++;
4549 fake_multibyte = 1;
4550 break;
4551 }
4552 if (result == CODING_FINISH_INTERRUPT)
4553 {
4554 /* The conversion procedure was interrupted by a user. */
4555 fake_multibyte = 1;
4556 break;
4557 }
4558 /* Now RESULT == CODING_FINISH_INSUFFICIENT_DST */
4559 if (coding->consumed < 1)
4560 {
4561 /* It's quite strange to require more memory without
4562 consuming any bytes. Perhaps CCL program bug. */
4563 fake_multibyte = 1;
4564 break;
4565 }
4566 if (first)
4567 {
4568 /* We have just done the first batch of conversion which was
4569 stoped because of insufficient gap. Let's reconsider the
4570 required gap size (i.e. SRT - DST) now.
4571
4572 We have converted ORIG bytes (== coding->consumed) into
4573 NEW bytes (coding->produced). To convert the remaining
4574 LEN bytes, we may need REQUIRE bytes of gap, where:
4575 REQUIRE + LEN_BYTE = LEN_BYTE * (NEW / ORIG)
4576 REQUIRE = LEN_BYTE * (NEW - ORIG) / ORIG
4577 Here, we are sure that NEW >= ORIG. */
4578 float ratio = coding->produced - coding->consumed;
4579 ratio /= coding->consumed;
4580 require = len_byte * ratio;
4581 first = 0;
4582 }
4583 if ((src - dst) < (require + 2000))
4584 {
4585 /* See the comment above the previous call of make_gap. */
4586 int add = len_byte + inserted_byte;
4587
4588 GAP_SIZE -= add;
4589 ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
4590 GPT += inserted_byte; GPT_BYTE += inserted_byte;
4591 make_gap (require + 2000);
4592 GAP_SIZE += add;
4593 ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
4594 GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
4595 /* Don't forget to update SRC, DST. */
4596 src = GAP_END_ADDR - len_byte;
4597 dst = GPT_ADDR + inserted_byte;
4598 }
4599 }
4600 if (src - dst > 0) *dst = 0; /* Put an anchor. */
4601
4602 if (multibyte
4603 && (encodep
4604 || fake_multibyte
4605 || (to - from) != (to_byte - from_byte)))
4606 inserted = multibyte_chars_in_text (GPT_ADDR, inserted_byte);
4607
4608 /* If we have shrinked the conversion area, adjust it now. */
4609 if (total_skip > 0)
4610 {
4611 if (tail_skip > 0)
4612 safe_bcopy (GAP_END_ADDR, GPT_ADDR + inserted_byte, tail_skip);
4613 inserted += total_skip; inserted_byte += total_skip;
4614 GAP_SIZE += total_skip;
4615 GPT -= head_skip; GPT_BYTE -= head_skip;
4616 ZV -= total_skip; ZV_BYTE -= total_skip;
4617 Z -= total_skip; Z_BYTE -= total_skip;
4618 from -= head_skip; from_byte -= head_skip;
4619 to += tail_skip; to_byte += tail_skip;
4620 }
4621
4622 prev_Z = Z;
4623 adjust_after_replace (from, from_byte, deletion, inserted, inserted_byte);
4624 inserted = Z - prev_Z;
4625
4626 if (! encodep && ! NILP (coding->post_read_conversion))
4627 {
4628 Lisp_Object val;
4629
4630 if (from != PT)
4631 TEMP_SET_PT_BOTH (from, from_byte);
4632 prev_Z = Z;
4633 val = call1 (coding->post_read_conversion, make_number (inserted));
4634 CHECK_NUMBER (val, 0);
4635 inserted += Z - prev_Z;
4636 }
4637
4638 if (orig_point >= from)
4639 {
4640 if (orig_point >= from + orig_len)
4641 orig_point += inserted - orig_len;
4642 else
4643 orig_point = from;
4644 TEMP_SET_PT (orig_point);
4645 }
4646
4647 signal_after_change (from, to - from, inserted);
4648
4649 {
4650 coding->consumed = to_byte - from_byte;
4651 coding->consumed_char = to - from;
4652 coding->produced = inserted_byte;
4653 coding->produced_char = inserted;
4654 }
4655
4656 return 0;
4657 }
4658
4659 Lisp_Object
4660 code_convert_string (str, coding, encodep, nocopy)
4661 Lisp_Object str;
4662 struct coding_system *coding;
4663 int encodep, nocopy;
4664 {
4665 int len;
4666 char *buf;
4667 int from = 0, to = XSTRING (str)->size;
4668 int to_byte = STRING_BYTES (XSTRING (str));
4669 struct gcpro gcpro1;
4670 Lisp_Object saved_coding_symbol;
4671 int result;
4672
4673 saved_coding_symbol = Qnil;
4674 if (encodep && !NILP (coding->pre_write_conversion)
4675 || !encodep && !NILP (coding->post_read_conversion))
4676 {
4677 /* Since we have to call Lisp functions which assume target text
4678 is in a buffer, after setting a temporary buffer, call
4679 code_convert_region. */
4680 int count = specpdl_ptr - specpdl;
4681 struct buffer *prev = current_buffer;
4682
4683 record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
4684 temp_output_buffer_setup (" *code-converting-work*");
4685 set_buffer_internal (XBUFFER (Vstandard_output));
4686 if (encodep)
4687 insert_from_string (str, 0, 0, to, to_byte, 0);
4688 else
4689 {
4690 /* We must insert the contents of STR as is without
4691 unibyte<->multibyte conversion. */
4692 current_buffer->enable_multibyte_characters = Qnil;
4693 insert_from_string (str, 0, 0, to_byte, to_byte, 0);
4694 current_buffer->enable_multibyte_characters = Qt;
4695 }
4696 code_convert_region (BEGV, BEGV_BYTE, ZV, ZV_BYTE, coding, encodep, 1);
4697 if (encodep)
4698 /* We must return the buffer contents as unibyte string. */
4699 current_buffer->enable_multibyte_characters = Qnil;
4700 str = make_buffer_string (BEGV, ZV, 0);
4701 set_buffer_internal (prev);
4702 return unbind_to (count, str);
4703 }
4704
4705 if (! encodep && CODING_REQUIRE_DETECTION (coding))
4706 {
4707 /* See the comments in code_convert_region. */
4708 if (coding->type == coding_type_undecided)
4709 {
4710 detect_coding (coding, XSTRING (str)->data, to_byte);
4711 if (coding->type == coding_type_undecided)
4712 coding->type = coding_type_emacs_mule;
4713 }
4714 if (coding->eol_type == CODING_EOL_UNDECIDED)
4715 {
4716 saved_coding_symbol = coding->symbol;
4717 detect_eol (coding, XSTRING (str)->data, to_byte);
4718 if (coding->eol_type == CODING_EOL_UNDECIDED)
4719 coding->eol_type = CODING_EOL_LF;
4720 /* We had better recover the original eol format if we
4721 encounter an inconsitent eol format while decoding. */
4722 coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4723 }
4724 }
4725
4726 if (encodep
4727 ? ! CODING_REQUIRE_ENCODING (coding)
4728 : ! CODING_REQUIRE_DECODING (coding))
4729 from = to_byte;
4730 else
4731 {
4732 /* Try to skip the heading and tailing ASCIIs. */
4733 SHRINK_CONVERSION_REGION (&from, &to_byte, coding, XSTRING (str)->data,
4734 encodep);
4735 }
4736 if (from == to_byte
4737 && coding->type != coding_type_ccl)
4738 return (nocopy ? str : Fcopy_sequence (str));
4739
4740 if (encodep)
4741 len = encoding_buffer_size (coding, to_byte - from);
4742 else
4743 len = decoding_buffer_size (coding, to_byte - from);
4744 len += from + STRING_BYTES (XSTRING (str)) - to_byte;
4745 GCPRO1 (str);
4746 buf = get_conversion_buffer (len);
4747 UNGCPRO;
4748
4749 if (from > 0)
4750 bcopy (XSTRING (str)->data, buf, from);
4751 result = (encodep
4752 ? encode_coding (coding, XSTRING (str)->data + from,
4753 buf + from, to_byte - from, len)
4754 : decode_coding (coding, XSTRING (str)->data + from,
4755 buf + from, to_byte - from, len));
4756 if (! encodep && result == CODING_FINISH_INCONSISTENT_EOL)
4757 {
4758 /* We simple try to decode the whole string again but without
4759 eol-conversion this time. */
4760 coding->eol_type = CODING_EOL_LF;
4761 coding->symbol = saved_coding_symbol;
4762 return code_convert_string (str, coding, encodep, nocopy);
4763 }
4764
4765 bcopy (XSTRING (str)->data + to_byte, buf + from + coding->produced,
4766 STRING_BYTES (XSTRING (str)) - to_byte);
4767
4768 len = from + STRING_BYTES (XSTRING (str)) - to_byte;
4769 if (encodep)
4770 str = make_unibyte_string (buf, len + coding->produced);
4771 else
4772 {
4773 int chars= (coding->fake_multibyte
4774 ? multibyte_chars_in_text (buf + from, coding->produced)
4775 : coding->produced_char);
4776 str = make_multibyte_string (buf, len + chars, len + coding->produced);
4777 }
4778
4779 return str;
4780 }
4781
4782 \f
4783 #ifdef emacs
4784 /*** 8. Emacs Lisp library functions ***/
4785
4786 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
4787 "Return t if OBJECT is nil or a coding-system.\n\
4788 See the documentation of `make-coding-system' for information\n\
4789 about coding-system objects.")
4790 (obj)
4791 Lisp_Object obj;
4792 {
4793 if (NILP (obj))
4794 return Qt;
4795 if (!SYMBOLP (obj))
4796 return Qnil;
4797 /* Get coding-spec vector for OBJ. */
4798 obj = Fget (obj, Qcoding_system);
4799 return ((VECTORP (obj) && XVECTOR (obj)->size == 5)
4800 ? Qt : Qnil);
4801 }
4802
4803 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
4804 Sread_non_nil_coding_system, 1, 1, 0,
4805 "Read a coding system from the minibuffer, prompting with string PROMPT.")
4806 (prompt)
4807 Lisp_Object prompt;
4808 {
4809 Lisp_Object val;
4810 do
4811 {
4812 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
4813 Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
4814 }
4815 while (XSTRING (val)->size == 0);
4816 return (Fintern (val, Qnil));
4817 }
4818
4819 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
4820 "Read a coding system from the minibuffer, prompting with string PROMPT.\n\
4821 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.")
4822 (prompt, default_coding_system)
4823 Lisp_Object prompt, default_coding_system;
4824 {
4825 Lisp_Object val;
4826 if (SYMBOLP (default_coding_system))
4827 XSETSTRING (default_coding_system, XSYMBOL (default_coding_system)->name);
4828 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
4829 Qt, Qnil, Qcoding_system_history,
4830 default_coding_system, Qnil);
4831 return (XSTRING (val)->size == 0 ? Qnil : Fintern (val, Qnil));
4832 }
4833
4834 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
4835 1, 1, 0,
4836 "Check validity of CODING-SYSTEM.\n\
4837 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.\n\
4838 It is valid if it is a symbol with a non-nil `coding-system' property.\n\
4839 The value of property should be a vector of length 5.")
4840 (coding_system)
4841 Lisp_Object coding_system;
4842 {
4843 CHECK_SYMBOL (coding_system, 0);
4844 if (!NILP (Fcoding_system_p (coding_system)))
4845 return coding_system;
4846 while (1)
4847 Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
4848 }
4849 \f
4850 Lisp_Object
4851 detect_coding_system (src, src_bytes, highest)
4852 unsigned char *src;
4853 int src_bytes, highest;
4854 {
4855 int coding_mask, eol_type;
4856 Lisp_Object val, tmp;
4857 int dummy;
4858
4859 coding_mask = detect_coding_mask (src, src_bytes, NULL, &dummy);
4860 eol_type = detect_eol_type (src, src_bytes, &dummy);
4861 if (eol_type == CODING_EOL_INCONSISTENT)
4862 eol_type = CODING_EOL_UNDECIDED;
4863
4864 if (!coding_mask)
4865 {
4866 val = Qundecided;
4867 if (eol_type != CODING_EOL_UNDECIDED)
4868 {
4869 Lisp_Object val2;
4870 val2 = Fget (Qundecided, Qeol_type);
4871 if (VECTORP (val2))
4872 val = XVECTOR (val2)->contents[eol_type];
4873 }
4874 return (highest ? val : Fcons (val, Qnil));
4875 }
4876
4877 /* At first, gather possible coding systems in VAL. */
4878 val = Qnil;
4879 for (tmp = Vcoding_category_list; !NILP (tmp); tmp = XCDR (tmp))
4880 {
4881 int idx
4882 = XFASTINT (Fget (XCAR (tmp), Qcoding_category_index));
4883 if (coding_mask & (1 << idx))
4884 {
4885 val = Fcons (Fsymbol_value (XCAR (tmp)), val);
4886 if (highest)
4887 break;
4888 }
4889 }
4890 if (!highest)
4891 val = Fnreverse (val);
4892
4893 /* Then, replace the elements with subsidiary coding systems. */
4894 for (tmp = val; !NILP (tmp); tmp = XCDR (tmp))
4895 {
4896 if (eol_type != CODING_EOL_UNDECIDED
4897 && eol_type != CODING_EOL_INCONSISTENT)
4898 {
4899 Lisp_Object eol;
4900 eol = Fget (XCAR (tmp), Qeol_type);
4901 if (VECTORP (eol))
4902 XCAR (tmp) = XVECTOR (eol)->contents[eol_type];
4903 }
4904 }
4905 return (highest ? XCAR (val) : val);
4906 }
4907
4908 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
4909 2, 3, 0,
4910 "Detect coding system of the text in the region between START and END.\n\
4911 Return a list of possible coding systems ordered by priority.\n\
4912 \n\
4913 If only ASCII characters are found, it returns a list of single element\n\
4914 `undecided' or its subsidiary coding system according to a detected\n\
4915 end-of-line format.\n\
4916 \n\
4917 If optional argument HIGHEST is non-nil, return the coding system of\n\
4918 highest priority.")
4919 (start, end, highest)
4920 Lisp_Object start, end, highest;
4921 {
4922 int from, to;
4923 int from_byte, to_byte;
4924
4925 CHECK_NUMBER_COERCE_MARKER (start, 0);
4926 CHECK_NUMBER_COERCE_MARKER (end, 1);
4927
4928 validate_region (&start, &end);
4929 from = XINT (start), to = XINT (end);
4930 from_byte = CHAR_TO_BYTE (from);
4931 to_byte = CHAR_TO_BYTE (to);
4932
4933 if (from < GPT && to >= GPT)
4934 move_gap_both (to, to_byte);
4935
4936 return detect_coding_system (BYTE_POS_ADDR (from_byte),
4937 to_byte - from_byte,
4938 !NILP (highest));
4939 }
4940
4941 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
4942 1, 2, 0,
4943 "Detect coding system of the text in STRING.\n\
4944 Return a list of possible coding systems ordered by priority.\n\
4945 \n\
4946 If only ASCII characters are found, it returns a list of single element\n\
4947 `undecided' or its subsidiary coding system according to a detected\n\
4948 end-of-line format.\n\
4949 \n\
4950 If optional argument HIGHEST is non-nil, return the coding system of\n\
4951 highest priority.")
4952 (string, highest)
4953 Lisp_Object string, highest;
4954 {
4955 CHECK_STRING (string, 0);
4956
4957 return detect_coding_system (XSTRING (string)->data,
4958 STRING_BYTES (XSTRING (string)),
4959 !NILP (highest));
4960 }
4961
4962 Lisp_Object
4963 code_convert_region1 (start, end, coding_system, encodep)
4964 Lisp_Object start, end, coding_system;
4965 int encodep;
4966 {
4967 struct coding_system coding;
4968 int from, to, len;
4969
4970 CHECK_NUMBER_COERCE_MARKER (start, 0);
4971 CHECK_NUMBER_COERCE_MARKER (end, 1);
4972 CHECK_SYMBOL (coding_system, 2);
4973
4974 validate_region (&start, &end);
4975 from = XFASTINT (start);
4976 to = XFASTINT (end);
4977
4978 if (NILP (coding_system))
4979 return make_number (to - from);
4980
4981 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
4982 error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data);
4983
4984 coding.mode |= CODING_MODE_LAST_BLOCK;
4985 code_convert_region (from, CHAR_TO_BYTE (from), to, CHAR_TO_BYTE (to),
4986 &coding, encodep, 1);
4987 Vlast_coding_system_used = coding.symbol;
4988 return make_number (coding.produced_char);
4989 }
4990
4991 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
4992 3, 3, "r\nzCoding system: ",
4993 "Decode the current region by specified coding system.\n\
4994 When called from a program, takes three arguments:\n\
4995 START, END, and CODING-SYSTEM. START and END are buffer positions.\n\
4996 This function sets `last-coding-system-used' to the precise coding system\n\
4997 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
4998 not fully specified.)\n\
4999 It returns the length of the decoded text.")
5000 (start, end, coding_system)
5001 Lisp_Object start, end, coding_system;
5002 {
5003 return code_convert_region1 (start, end, coding_system, 0);
5004 }
5005
5006 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
5007 3, 3, "r\nzCoding system: ",
5008 "Encode the current region by specified coding system.\n\
5009 When called from a program, takes three arguments:\n\
5010 START, END, and CODING-SYSTEM. START and END are buffer positions.\n\
5011 This function sets `last-coding-system-used' to the precise coding system\n\
5012 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
5013 not fully specified.)\n\
5014 It returns the length of the encoded text.")
5015 (start, end, coding_system)
5016 Lisp_Object start, end, coding_system;
5017 {
5018 return code_convert_region1 (start, end, coding_system, 1);
5019 }
5020
5021 Lisp_Object
5022 code_convert_string1 (string, coding_system, nocopy, encodep)
5023 Lisp_Object string, coding_system, nocopy;
5024 int encodep;
5025 {
5026 struct coding_system coding;
5027
5028 CHECK_STRING (string, 0);
5029 CHECK_SYMBOL (coding_system, 1);
5030
5031 if (NILP (coding_system))
5032 return (NILP (nocopy) ? Fcopy_sequence (string) : string);
5033
5034 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
5035 error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data);
5036
5037 coding.mode |= CODING_MODE_LAST_BLOCK;
5038 Vlast_coding_system_used = coding.symbol;
5039 return code_convert_string (string, &coding, encodep, !NILP (nocopy));
5040 }
5041
5042 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
5043 2, 3, 0,
5044 "Decode STRING which is encoded in CODING-SYSTEM, and return the result.\n\
5045 Optional arg NOCOPY non-nil means it is ok to return STRING itself\n\
5046 if the decoding operation is trivial.\n\
5047 This function sets `last-coding-system-used' to the precise coding system\n\
5048 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
5049 not fully specified.)")
5050 (string, coding_system, nocopy)
5051 Lisp_Object string, coding_system, nocopy;
5052 {
5053 return code_convert_string1 (string, coding_system, nocopy, 0);
5054 }
5055
5056 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
5057 2, 3, 0,
5058 "Encode STRING to CODING-SYSTEM, and return the result.\n\
5059 Optional arg NOCOPY non-nil means it is ok to return STRING itself\n\
5060 if the encoding operation is trivial.\n\
5061 This function sets `last-coding-system-used' to the precise coding system\n\
5062 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
5063 not fully specified.)")
5064 (string, coding_system, nocopy)
5065 Lisp_Object string, coding_system, nocopy;
5066 {
5067 return code_convert_string1 (string, coding_system, nocopy, 1);
5068 }
5069
5070 /* Encode or decode STRING according to CODING_SYSTEM.
5071 Do not set Vlast_coding_system_used. */
5072
5073 Lisp_Object
5074 code_convert_string_norecord (string, coding_system, encodep)
5075 Lisp_Object string, coding_system;
5076 int encodep;
5077 {
5078 struct coding_system coding;
5079
5080 CHECK_STRING (string, 0);
5081 CHECK_SYMBOL (coding_system, 1);
5082
5083 if (NILP (coding_system))
5084 return string;
5085
5086 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
5087 error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data);
5088
5089 coding.mode |= CODING_MODE_LAST_BLOCK;
5090 return code_convert_string (string, &coding, encodep, Qt);
5091 }
5092 \f
5093 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
5094 "Decode a Japanese character which has CODE in shift_jis encoding.\n\
5095 Return the corresponding character.")
5096 (code)
5097 Lisp_Object code;
5098 {
5099 unsigned char c1, c2, s1, s2;
5100 Lisp_Object val;
5101
5102 CHECK_NUMBER (code, 0);
5103 s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
5104 if (s1 == 0)
5105 {
5106 if (s2 < 0x80)
5107 XSETFASTINT (val, s2);
5108 else if (s2 >= 0xA0 || s2 <= 0xDF)
5109 XSETFASTINT (val,
5110 MAKE_NON_ASCII_CHAR (charset_katakana_jisx0201, s2, 0));
5111 else
5112 error ("Invalid Shift JIS code: %x", XFASTINT (code));
5113 }
5114 else
5115 {
5116 if ((s1 < 0x80 || s1 > 0x9F && s1 < 0xE0 || s1 > 0xEF)
5117 || (s2 < 0x40 || s2 == 0x7F || s2 > 0xFC))
5118 error ("Invalid Shift JIS code: %x", XFASTINT (code));
5119 DECODE_SJIS (s1, s2, c1, c2);
5120 XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset_jisx0208, c1, c2));
5121 }
5122 return val;
5123 }
5124
5125 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
5126 "Encode a Japanese character CHAR to shift_jis encoding.\n\
5127 Return the corresponding code in SJIS.")
5128 (ch)
5129 Lisp_Object ch;
5130 {
5131 int charset, c1, c2, s1, s2;
5132 Lisp_Object val;
5133
5134 CHECK_NUMBER (ch, 0);
5135 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
5136 if (charset == CHARSET_ASCII)
5137 {
5138 val = ch;
5139 }
5140 else if (charset == charset_jisx0208
5141 && c1 > 0x20 && c1 < 0x7F && c2 > 0x20 && c2 < 0x7F)
5142 {
5143 ENCODE_SJIS (c1, c2, s1, s2);
5144 XSETFASTINT (val, (s1 << 8) | s2);
5145 }
5146 else if (charset == charset_katakana_jisx0201
5147 && c1 > 0x20 && c2 < 0xE0)
5148 {
5149 XSETFASTINT (val, c1 | 0x80);
5150 }
5151 else
5152 error ("Can't encode to shift_jis: %d", XFASTINT (ch));
5153 return val;
5154 }
5155
5156 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
5157 "Decode a Big5 character which has CODE in BIG5 coding system.\n\
5158 Return the corresponding character.")
5159 (code)
5160 Lisp_Object code;
5161 {
5162 int charset;
5163 unsigned char b1, b2, c1, c2;
5164 Lisp_Object val;
5165
5166 CHECK_NUMBER (code, 0);
5167 b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF;
5168 if (b1 == 0)
5169 {
5170 if (b2 >= 0x80)
5171 error ("Invalid BIG5 code: %x", XFASTINT (code));
5172 val = code;
5173 }
5174 else
5175 {
5176 if ((b1 < 0xA1 || b1 > 0xFE)
5177 || (b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE))
5178 error ("Invalid BIG5 code: %x", XFASTINT (code));
5179 DECODE_BIG5 (b1, b2, charset, c1, c2);
5180 XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset, c1, c2));
5181 }
5182 return val;
5183 }
5184
5185 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
5186 "Encode the Big5 character CHAR to BIG5 coding system.\n\
5187 Return the corresponding character code in Big5.")
5188 (ch)
5189 Lisp_Object ch;
5190 {
5191 int charset, c1, c2, b1, b2;
5192 Lisp_Object val;
5193
5194 CHECK_NUMBER (ch, 0);
5195 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
5196 if (charset == CHARSET_ASCII)
5197 {
5198 val = ch;
5199 }
5200 else if ((charset == charset_big5_1
5201 && (XFASTINT (ch) >= 0x250a1 && XFASTINT (ch) <= 0x271ec))
5202 || (charset == charset_big5_2
5203 && XFASTINT (ch) >= 0x290a1 && XFASTINT (ch) <= 0x2bdb2))
5204 {
5205 ENCODE_BIG5 (charset, c1, c2, b1, b2);
5206 XSETFASTINT (val, (b1 << 8) | b2);
5207 }
5208 else
5209 error ("Can't encode to Big5: %d", XFASTINT (ch));
5210 return val;
5211 }
5212 \f
5213 DEFUN ("set-terminal-coding-system-internal",
5214 Fset_terminal_coding_system_internal,
5215 Sset_terminal_coding_system_internal, 1, 1, 0, "")
5216 (coding_system)
5217 Lisp_Object coding_system;
5218 {
5219 CHECK_SYMBOL (coding_system, 0);
5220 setup_coding_system (Fcheck_coding_system (coding_system), &terminal_coding);
5221 /* We had better not send unsafe characters to terminal. */
5222 terminal_coding.flags |= CODING_FLAG_ISO_SAFE;
5223
5224 return Qnil;
5225 }
5226
5227 DEFUN ("set-safe-terminal-coding-system-internal",
5228 Fset_safe_terminal_coding_system_internal,
5229 Sset_safe_terminal_coding_system_internal, 1, 1, 0, "")
5230 (coding_system)
5231 Lisp_Object coding_system;
5232 {
5233 CHECK_SYMBOL (coding_system, 0);
5234 setup_coding_system (Fcheck_coding_system (coding_system),
5235 &safe_terminal_coding);
5236 return Qnil;
5237 }
5238
5239 DEFUN ("terminal-coding-system",
5240 Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
5241 "Return coding system specified for terminal output.")
5242 ()
5243 {
5244 return terminal_coding.symbol;
5245 }
5246
5247 DEFUN ("set-keyboard-coding-system-internal",
5248 Fset_keyboard_coding_system_internal,
5249 Sset_keyboard_coding_system_internal, 1, 1, 0, "")
5250 (coding_system)
5251 Lisp_Object coding_system;
5252 {
5253 CHECK_SYMBOL (coding_system, 0);
5254 setup_coding_system (Fcheck_coding_system (coding_system), &keyboard_coding);
5255 return Qnil;
5256 }
5257
5258 DEFUN ("keyboard-coding-system",
5259 Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
5260 "Return coding system specified for decoding keyboard input.")
5261 ()
5262 {
5263 return keyboard_coding.symbol;
5264 }
5265
5266 \f
5267 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
5268 Sfind_operation_coding_system, 1, MANY, 0,
5269 "Choose a coding system for an operation based on the target name.\n\
5270 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).\n\
5271 DECODING-SYSTEM is the coding system to use for decoding\n\
5272 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system\n\
5273 for encoding (in case OPERATION does encoding).\n\
5274 \n\
5275 The first argument OPERATION specifies an I/O primitive:\n\
5276 For file I/O, `insert-file-contents' or `write-region'.\n\
5277 For process I/O, `call-process', `call-process-region', or `start-process'.\n\
5278 For network I/O, `open-network-stream'.\n\
5279 \n\
5280 The remaining arguments should be the same arguments that were passed\n\
5281 to the primitive. Depending on which primitive, one of those arguments\n\
5282 is selected as the TARGET. For example, if OPERATION does file I/O,\n\
5283 whichever argument specifies the file name is TARGET.\n\
5284 \n\
5285 TARGET has a meaning which depends on OPERATION:\n\
5286 For file I/O, TARGET is a file name.\n\
5287 For process I/O, TARGET is a process name.\n\
5288 For network I/O, TARGET is a service name or a port number\n\
5289 \n\
5290 This function looks up what specified for TARGET in,\n\
5291 `file-coding-system-alist', `process-coding-system-alist',\n\
5292 or `network-coding-system-alist' depending on OPERATION.\n\
5293 They may specify a coding system, a cons of coding systems,\n\
5294 or a function symbol to call.\n\
5295 In the last case, we call the function with one argument,\n\
5296 which is a list of all the arguments given to this function.")
5297 (nargs, args)
5298 int nargs;
5299 Lisp_Object *args;
5300 {
5301 Lisp_Object operation, target_idx, target, val;
5302 register Lisp_Object chain;
5303
5304 if (nargs < 2)
5305 error ("Too few arguments");
5306 operation = args[0];
5307 if (!SYMBOLP (operation)
5308 || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
5309 error ("Invalid first arguement");
5310 if (nargs < 1 + XINT (target_idx))
5311 error ("Too few arguments for operation: %s",
5312 XSYMBOL (operation)->name->data);
5313 target = args[XINT (target_idx) + 1];
5314 if (!(STRINGP (target)
5315 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
5316 error ("Invalid %dth argument", XINT (target_idx) + 1);
5317
5318 chain = ((EQ (operation, Qinsert_file_contents)
5319 || EQ (operation, Qwrite_region))
5320 ? Vfile_coding_system_alist
5321 : (EQ (operation, Qopen_network_stream)
5322 ? Vnetwork_coding_system_alist
5323 : Vprocess_coding_system_alist));
5324 if (NILP (chain))
5325 return Qnil;
5326
5327 for (; CONSP (chain); chain = XCDR (chain))
5328 {
5329 Lisp_Object elt;
5330 elt = XCAR (chain);
5331
5332 if (CONSP (elt)
5333 && ((STRINGP (target)
5334 && STRINGP (XCAR (elt))
5335 && fast_string_match (XCAR (elt), target) >= 0)
5336 || (INTEGERP (target) && EQ (target, XCAR (elt)))))
5337 {
5338 val = XCDR (elt);
5339 /* Here, if VAL is both a valid coding system and a valid
5340 function symbol, we return VAL as a coding system. */
5341 if (CONSP (val))
5342 return val;
5343 if (! SYMBOLP (val))
5344 return Qnil;
5345 if (! NILP (Fcoding_system_p (val)))
5346 return Fcons (val, val);
5347 if (! NILP (Ffboundp (val)))
5348 {
5349 val = call1 (val, Flist (nargs, args));
5350 if (CONSP (val))
5351 return val;
5352 if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
5353 return Fcons (val, val);
5354 }
5355 return Qnil;
5356 }
5357 }
5358 return Qnil;
5359 }
5360
5361 DEFUN ("update-coding-systems-internal", Fupdate_coding_systems_internal,
5362 Supdate_coding_systems_internal, 0, 0, 0,
5363 "Update internal database for ISO2022 and CCL based coding systems.\n\
5364 When values of the following coding categories are changed, you must\n\
5365 call this function:\n\
5366 coding-category-iso-7, coding-category-iso-7-tight,\n\
5367 coding-category-iso-8-1, coding-category-iso-8-2,\n\
5368 coding-category-iso-7-else, coding-category-iso-8-else,\n\
5369 coding-category-ccl")
5370 ()
5371 {
5372 int i;
5373
5374 for (i = CODING_CATEGORY_IDX_ISO_7; i <= CODING_CATEGORY_IDX_CCL; i++)
5375 {
5376 Lisp_Object val;
5377
5378 val = XSYMBOL (XVECTOR (Vcoding_category_table)->contents[i])->value;
5379 if (!NILP (val))
5380 {
5381 if (! coding_system_table[i])
5382 coding_system_table[i] = ((struct coding_system *)
5383 xmalloc (sizeof (struct coding_system)));
5384 setup_coding_system (val, coding_system_table[i]);
5385 }
5386 else if (coding_system_table[i])
5387 {
5388 xfree (coding_system_table[i]);
5389 coding_system_table[i] = NULL;
5390 }
5391 }
5392
5393 return Qnil;
5394 }
5395
5396 DEFUN ("set-coding-priority-internal", Fset_coding_priority_internal,
5397 Sset_coding_priority_internal, 0, 0, 0,
5398 "Update internal database for the current value of `coding-category-list'.\n\
5399 This function is internal use only.")
5400 ()
5401 {
5402 int i = 0, idx;
5403 Lisp_Object val;
5404
5405 val = Vcoding_category_list;
5406
5407 while (CONSP (val) && i < CODING_CATEGORY_IDX_MAX)
5408 {
5409 if (! SYMBOLP (XCAR (val)))
5410 break;
5411 idx = XFASTINT (Fget (XCAR (val), Qcoding_category_index));
5412 if (idx >= CODING_CATEGORY_IDX_MAX)
5413 break;
5414 coding_priorities[i++] = (1 << idx);
5415 val = XCDR (val);
5416 }
5417 /* If coding-category-list is valid and contains all coding
5418 categories, `i' should be CODING_CATEGORY_IDX_MAX now. If not,
5419 the following code saves Emacs from craching. */
5420 while (i < CODING_CATEGORY_IDX_MAX)
5421 coding_priorities[i++] = CODING_CATEGORY_MASK_RAW_TEXT;
5422
5423 return Qnil;
5424 }
5425
5426 #endif /* emacs */
5427
5428 \f
5429 /*** 9. Post-amble ***/
5430
5431 void
5432 init_coding ()
5433 {
5434 conversion_buffer = (char *) xmalloc (MINIMUM_CONVERSION_BUFFER_SIZE);
5435 }
5436
5437 void
5438 init_coding_once ()
5439 {
5440 int i;
5441
5442 /* Emacs' internal format specific initialize routine. */
5443 for (i = 0; i <= 0x20; i++)
5444 emacs_code_class[i] = EMACS_control_code;
5445 emacs_code_class[0x0A] = EMACS_linefeed_code;
5446 emacs_code_class[0x0D] = EMACS_carriage_return_code;
5447 for (i = 0x21 ; i < 0x7F; i++)
5448 emacs_code_class[i] = EMACS_ascii_code;
5449 emacs_code_class[0x7F] = EMACS_control_code;
5450 emacs_code_class[0x80] = EMACS_leading_code_composition;
5451 for (i = 0x81; i < 0xFF; i++)
5452 emacs_code_class[i] = EMACS_invalid_code;
5453 emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
5454 emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
5455 emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
5456 emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;
5457
5458 /* ISO2022 specific initialize routine. */
5459 for (i = 0; i < 0x20; i++)
5460 iso_code_class[i] = ISO_control_code;
5461 for (i = 0x21; i < 0x7F; i++)
5462 iso_code_class[i] = ISO_graphic_plane_0;
5463 for (i = 0x80; i < 0xA0; i++)
5464 iso_code_class[i] = ISO_control_code;
5465 for (i = 0xA1; i < 0xFF; i++)
5466 iso_code_class[i] = ISO_graphic_plane_1;
5467 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
5468 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
5469 iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
5470 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
5471 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
5472 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
5473 iso_code_class[ISO_CODE_ESC] = ISO_escape;
5474 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
5475 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
5476 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
5477
5478 conversion_buffer_size = MINIMUM_CONVERSION_BUFFER_SIZE;
5479
5480 setup_coding_system (Qnil, &keyboard_coding);
5481 setup_coding_system (Qnil, &terminal_coding);
5482 setup_coding_system (Qnil, &safe_terminal_coding);
5483 setup_coding_system (Qnil, &default_buffer_file_coding);
5484
5485 bzero (coding_system_table, sizeof coding_system_table);
5486
5487 bzero (ascii_skip_code, sizeof ascii_skip_code);
5488 for (i = 0; i < 128; i++)
5489 ascii_skip_code[i] = 1;
5490
5491 #if defined (MSDOS) || defined (WINDOWSNT)
5492 system_eol_type = CODING_EOL_CRLF;
5493 #else
5494 system_eol_type = CODING_EOL_LF;
5495 #endif
5496 }
5497
5498 #ifdef emacs
5499
/* Set up the Lisp-visible side of this file: intern the symbols used
   here and register their C variables with staticpro, attach the
   `target-idx' property to the I/O primitives, fill
   Vcoding_category_table, register the primitives with defsubr, and
   declare the Lisp variables with DEFVAR_LISP / DEFVAR_BOOL together
   with their initial values.  */

void
syms_of_coding ()
{
  Qtarget_idx = intern ("target-idx");
  staticpro (&Qtarget_idx);

  Qcoding_system_history = intern ("coding-system-history");
  staticpro (&Qcoding_system_history);
  Fset (Qcoding_system_history, Qnil);

  /* The `target-idx' property set below is what
     `find-operation-coding-system' reads to locate the TARGET
     argument of each primitive (counted from zero).  */

  /* Target FILENAME is the first argument.  */
  Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
  /* Target FILENAME is the third argument.  */
  Fput (Qwrite_region, Qtarget_idx, make_number (2));

  Qcall_process = intern ("call-process");
  staticpro (&Qcall_process);
  /* Target PROGRAM is the first argument.  */
  Fput (Qcall_process, Qtarget_idx, make_number (0));

  Qcall_process_region = intern ("call-process-region");
  staticpro (&Qcall_process_region);
  /* Target PROGRAM is the third argument.  */
  Fput (Qcall_process_region, Qtarget_idx, make_number (2));

  Qstart_process = intern ("start-process");
  staticpro (&Qstart_process);
  /* Target PROGRAM is the third argument.  */
  Fput (Qstart_process, Qtarget_idx, make_number (2));

  Qopen_network_stream = intern ("open-network-stream");
  staticpro (&Qopen_network_stream);
  /* Target SERVICE is the fourth argument.  */
  Fput (Qopen_network_stream, Qtarget_idx, make_number (3));

  /* Symbols referred to from C code in this file.  */
  Qcoding_system = intern ("coding-system");
  staticpro (&Qcoding_system);

  Qeol_type = intern ("eol-type");
  staticpro (&Qeol_type);

  Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
  staticpro (&Qbuffer_file_coding_system);

  Qpost_read_conversion = intern ("post-read-conversion");
  staticpro (&Qpost_read_conversion);

  Qpre_write_conversion = intern ("pre-write-conversion");
  staticpro (&Qpre_write_conversion);

  Qno_conversion = intern ("no-conversion");
  staticpro (&Qno_conversion);

  Qundecided = intern ("undecided");
  staticpro (&Qundecided);

  Qcoding_system_p = intern ("coding-system-p");
  staticpro (&Qcoding_system_p);

  Qcoding_system_error = intern ("coding-system-error");
  staticpro (&Qcoding_system_error);

  /* Give `coding-system-error' its error conditions and message.  */
  Fput (Qcoding_system_error, Qerror_conditions,
	Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
  Fput (Qcoding_system_error, Qerror_message,
	build_string ("Invalid coding system"));

  Qcoding_category = intern ("coding-category");
  staticpro (&Qcoding_category);
  Qcoding_category_index = intern ("coding-category-index");
  staticpro (&Qcoding_category_index);

  /* Vcoding_category_table maps a category index to its symbol; each
     symbol in turn records its index in the `coding-category-index'
     property.  */
  Vcoding_category_table
    = Fmake_vector (make_number (CODING_CATEGORY_IDX_MAX), Qnil);
  staticpro (&Vcoding_category_table);
  {
    int i;
    for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
      {
	XVECTOR (Vcoding_category_table)->contents[i]
	  = intern (coding_category_name[i]);
	Fput (XVECTOR (Vcoding_category_table)->contents[i],
	      Qcoding_category_index, make_number (i));
      }
  }

  Qtranslation_table = intern ("translation-table");
  staticpro (&Qtranslation_table);
  Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (1));

  Qtranslation_table_id = intern ("translation-table-id");
  staticpro (&Qtranslation_table_id);

  Qtranslation_table_for_decode = intern ("translation-table-for-decode");
  staticpro (&Qtranslation_table_for_decode);

  Qtranslation_table_for_encode = intern ("translation-table-for-encode");
  staticpro (&Qtranslation_table_for_encode);

  Qsafe_charsets = intern ("safe-charsets");
  staticpro (&Qsafe_charsets);

  Qvalid_codes = intern ("valid-codes");
  staticpro (&Qvalid_codes);

  Qemacs_mule = intern ("emacs-mule");
  staticpro (&Qemacs_mule);

  Qraw_text = intern ("raw-text");
  staticpro (&Qraw_text);

  /* Register the Lisp primitives defined in this file.  */
  defsubr (&Scoding_system_p);
  defsubr (&Sread_coding_system);
  defsubr (&Sread_non_nil_coding_system);
  defsubr (&Scheck_coding_system);
  defsubr (&Sdetect_coding_region);
  defsubr (&Sdetect_coding_string);
  defsubr (&Sdecode_coding_region);
  defsubr (&Sencode_coding_region);
  defsubr (&Sdecode_coding_string);
  defsubr (&Sencode_coding_string);
  defsubr (&Sdecode_sjis_char);
  defsubr (&Sencode_sjis_char);
  defsubr (&Sdecode_big5_char);
  defsubr (&Sencode_big5_char);
  defsubr (&Sset_terminal_coding_system_internal);
  defsubr (&Sset_safe_terminal_coding_system_internal);
  defsubr (&Sterminal_coding_system);
  defsubr (&Sset_keyboard_coding_system_internal);
  defsubr (&Skeyboard_coding_system);
  defsubr (&Sfind_operation_coding_system);
  defsubr (&Supdate_coding_systems_internal);
  defsubr (&Sset_coding_priority_internal);

  /* Lisp variables and their initial values.  */
  DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
    "List of coding systems.\n\
\n\
Do not alter the value of this variable manually.  This variable should be\n\
updated by the functions `make-coding-system' and\n\
`define-coding-system-alias'.");
  Vcoding_system_list = Qnil;

  DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
    "Alist of coding system names.\n\
Each element is one element list of coding system name.\n\
This variable is given to `completing-read' as TABLE argument.\n\
\n\
Do not alter the value of this variable manually.  This variable should be\n\
updated by the functions `make-coding-system' and\n\
`define-coding-system-alias'.");
  Vcoding_system_alist = Qnil;

  DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
    "List of coding-categories (symbols) ordered by priority.");
  {
    int i;

    /* Consing from the last index down yields a list in ascending
       index order.  */
    Vcoding_category_list = Qnil;
    for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
      Vcoding_category_list
	= Fcons (XVECTOR (Vcoding_category_table)->contents[i],
		 Vcoding_category_list);
  }

  DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
    "Specify the coding system for read operations.\n\
It is useful to bind this variable with `let', but do not set it globally.\n\
If the value is a coding system, it is used for decoding on read operation.\n\
If not, an appropriate element is used from one of the coding system alists:\n\
There are three such tables, `file-coding-system-alist',\n\
`process-coding-system-alist', and `network-coding-system-alist'.");
  Vcoding_system_for_read = Qnil;

  DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
    "Specify the coding system for write operations.\n\
Programs bind this variable with `let', but you should not set it globally.\n\
If the value is a coding system, it is used for encoding of output,\n\
when writing it to a file and when sending it to a file or subprocess.\n\
\n\
If this does not specify a coding system, an appropriate element\n\
is used from one of the coding system alists:\n\
There are three such tables, `file-coding-system-alist',\n\
`process-coding-system-alist', and `network-coding-system-alist'.\n\
For output to files, if the above procedure does not specify a coding system,\n\
the value of `buffer-file-coding-system' is used.");
  Vcoding_system_for_write = Qnil;

  DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
    "Coding system used in the latest file or process I/O.");
  Vlast_coding_system_used = Qnil;

  DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
    "*Non-nil means always inhibit code conversion of end-of-line format.\n\
See info node `Coding Systems' and info node `Text and Binary' concerning\n\
such conversion.");
  inhibit_eol_conversion = 0;

  DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
    "Non-nil means process buffer inherits coding system of process output.\n\
Bind it to t if the process output is to be treated as if it were a file\n\
read from some filesystem.");
  inherit_process_coding_system = 0;

  /* The three alists consulted by `find-operation-coding-system'.  */
  DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
    "Alist to decide a coding system to use for a file I/O operation.\n\
The format is ((PATTERN . VAL) ...),\n\
where PATTERN is a regular expression matching a file name,\n\
VAL is a coding system, a cons of coding systems, or a function symbol.\n\
If VAL is a coding system, it is used for both decoding and encoding\n\
the file contents.\n\
If VAL is a cons of coding systems, the car part is used for decoding,\n\
and the cdr part is used for encoding.\n\
If VAL is a function symbol, the function must return a coding system\n\
or a cons of coding systems which are used as above.\n\
\n\
See also the function `find-operation-coding-system'\n\
and the variable `auto-coding-alist'.");
  Vfile_coding_system_alist = Qnil;

  DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
    "Alist to decide a coding system to use for a process I/O operation.\n\
The format is ((PATTERN . VAL) ...),\n\
where PATTERN is a regular expression matching a program name,\n\
VAL is a coding system, a cons of coding systems, or a function symbol.\n\
If VAL is a coding system, it is used for both decoding what received\n\
from the program and encoding what sent to the program.\n\
If VAL is a cons of coding systems, the car part is used for decoding,\n\
and the cdr part is used for encoding.\n\
If VAL is a function symbol, the function must return a coding system\n\
or a cons of coding systems which are used as above.\n\
\n\
See also the function `find-operation-coding-system'.");
  Vprocess_coding_system_alist = Qnil;

  DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
    "Alist to decide a coding system to use for a network I/O operation.\n\
The format is ((PATTERN . VAL) ...),\n\
where PATTERN is a regular expression matching a network service name\n\
or is a port number to connect to,\n\
VAL is a coding system, a cons of coding systems, or a function symbol.\n\
If VAL is a coding system, it is used for both decoding what received\n\
from the network stream and encoding what sent to the network stream.\n\
If VAL is a cons of coding systems, the car part is used for decoding,\n\
and the cdr part is used for encoding.\n\
If VAL is a function symbol, the function must return a coding system\n\
or a cons of coding systems which are used as above.\n\
\n\
See also the function `find-operation-coding-system'.");
  Vnetwork_coding_system_alist = Qnil;

  /* Mode-line mnemonics for the end-of-line formats.  */
  DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
    "*String displayed in mode line for UNIX-like (LF) end-of-line format.");
  eol_mnemonic_unix = build_string (":");

  DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
    "*String displayed in mode line for DOS-like (CRLF) end-of-line format.");
  eol_mnemonic_dos = build_string ("\\");

  DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
    "*String displayed in mode line for MAC-like (CR) end-of-line format.");
  eol_mnemonic_mac = build_string ("/");

  DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
    "*String displayed in mode line when end-of-line format is not yet determined.");
  eol_mnemonic_undecided = build_string (":");

  DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
    "*Non-nil enables character translation while encoding and decoding.");
  Venable_character_translation = Qt;

  DEFVAR_LISP ("standard-translation-table-for-decode",
    &Vstandard_translation_table_for_decode,
    "Table for translating characters while decoding.");
  Vstandard_translation_table_for_decode = Qnil;

  DEFVAR_LISP ("standard-translation-table-for-encode",
    &Vstandard_translation_table_for_encode,
    "Table for translationg characters while encoding.");
  Vstandard_translation_table_for_encode = Qnil;

  /* NOTE(review): the Lisp name says "table" while the C variable
     says "alist" -- confirm which is intended before renaming
     either.  */
  DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
    "Alist of charsets vs revision numbers.\n\
While encoding, if a charset (car part of an element) is found,\n\
designate it with the escape sequence identifing revision (cdr part of the element).");
  Vcharset_revision_alist = Qnil;

  DEFVAR_LISP ("default-process-coding-system",
	       &Vdefault_process_coding_system,
    "Cons of coding systems used for process I/O by default.\n\
The car part is used for decoding a process output,\n\
the cdr part is used for encoding a text to be sent to a process.");
  Vdefault_process_coding_system = Qnil;

  DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
    "Table of extra Latin codes in the range 128..159 (inclusive).\n\
This is a vector of length 256.\n\
If Nth element is non-nil, the existence of code N in a file\n\
\(or output of subprocess) doesn't prevent it to be detected as\n\
a coding system of ISO 2022 variant which has a flag\n\
`accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file\n\
or reading output of a subprocess.\n\
Only 128th through 159th elements has a meaning.");
  Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);

  DEFVAR_LISP ("select-safe-coding-system-function",
	       &Vselect_safe_coding_system_function,
    "Function to call to select safe coding system for encoding a text.\n\
\n\
If set, this function is called to force a user to select a proper\n\
coding system which can encode the text in the case that a default\n\
coding system used in each operation can't encode the text.\n\
\n\
The default value is `select-safe-coding-system' (which see).");
  /* The C-level initial value is nil; the default named in the doc
     string is presumably installed from Lisp -- TODO confirm.  */
  Vselect_safe_coding_system_function = Qnil;

}
5816
5817 #endif /* emacs */