Fix -Wimplicit warnings.
[bpt/emacs.git] / src / coding.c
1 /* Coding system handler (conversion, detection, etc.).
2 Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
3 Licensed to the Free Software Foundation.
4
5 This file is part of GNU Emacs.
6
7 GNU Emacs is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 2, or (at your option)
10 any later version.
11
12 GNU Emacs is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GNU Emacs; see the file COPYING. If not, write to
19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20 Boston, MA 02111-1307, USA. */
21
22 /*** TABLE OF CONTENTS ***
23
24 1. Preamble
25 2. Emacs' internal format (emacs-mule) handlers
26 3. ISO2022 handlers
27 4. Shift-JIS and BIG5 handlers
28 5. End-of-line handlers
29 6. C library functions
30 7. Emacs Lisp library functions
31 8. Post-amble
32
33 */
34
35 /*** GENERAL NOTE on CODING SYSTEM ***
36
37 Coding system is an encoding mechanism of one or more character
38 sets. Here's a list of coding systems which Emacs can handle. When
39 we say "decode", it means converting some other coding system to
40 Emacs' internal format (emacs-mule), and when we say "encode",
41 it means converting the coding system emacs-mule to some other
42 coding system.
43
44 0. Emacs' internal format (emacs-mule)
45
46 Emacs itself holds a multi-lingual character in a buffer and a string
47 in a special format. Details are described in section 2.
48
49 1. ISO2022
50
51 The most famous coding system for multiple character sets. X's
52 Compound Text, various EUCs (Extended Unix Code), and coding
53 systems used in Internet communication such as ISO-2022-JP are
54 all variants of ISO2022. Details are described in section 3.
55
56 2. SJIS (or Shift-JIS or MS-Kanji-Code)
57
58 A coding system to encode character sets: ASCII, JISX0201, and
59 JISX0208. Widely used for PC's in Japan. Details are described in
60 section 4.
61
62 3. BIG5
63
64 A coding system to encode character sets: ASCII and Big5. Widely
65 used by Chinese (mainly in Taiwan and Hong Kong). Details are
66 described in section 4. In this file, when we write "BIG5"
67 (all uppercase), we mean the coding system, and when we write
68 "Big5" (capitalized), we mean the character set.
69
70 4. Raw text
71
72 A coding system for a text containing random 8-bit code. Emacs does
73 no code conversion on such a text except for end-of-line format.
74
75 5. Other
76
77 If a user wants to read/write a text encoded in a coding system not
78 listed above, he can supply a decoder and an encoder for it in CCL
79 (Code Conversion Language) programs. Emacs executes the CCL program
80 while reading/writing.
81
82 Emacs represents a coding system by a Lisp symbol that has a property
83 `coding-system'. But, before actually using the coding system, the
84 information about it is set in a structure of type `struct
85 coding_system' for rapid processing. See section 6 for more details.
86
87 */
88
89 /*** GENERAL NOTES on END-OF-LINE FORMAT ***
90
91 How end-of-line of a text is encoded depends on a system. For
92 instance, Unix's format is just one byte of `line-feed' code,
93 whereas DOS's format is two-byte sequence of `carriage-return' and
94 `line-feed' codes. MacOS's format is usually one byte of
95 `carriage-return'.
96
97 Since text characters encoding and end-of-line encoding are
98 independent, any coding system described above can take
99 any format of end-of-line. So, Emacs has information of format of
100 end-of-line in each coding-system. See section 6 for more details.
101
102 */
103
104 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
105
106 These functions check if a text between SRC and SRC_END is encoded
107 in the coding system category XXX. Each returns an integer value in
108 which the appropriate flag bits for the category XXX are set. The flag
109 bits are defined in macros CODING_CATEGORY_MASK_XXX. Below is the
110 template of these functions. */
111 #if 0
112 int
113 detect_coding_emacs_mule (src, src_end)
114 unsigned char *src, *src_end;
115 {
116 ...
117 }
118 #endif
119
120 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
121
122 These functions decode SRC_BYTES length text at SOURCE encoded in
123 CODING to Emacs' internal format (emacs-mule). The resulting text
124 goes to a place pointed to by DESTINATION, the length of which
125 should not exceed DST_BYTES. These functions set the information of
126 original and decoded texts in the members produced, produced_char,
127 consumed, and consumed_char of the structure *CODING.
128
129 The return value is an integer (CODING_FINISH_XXX) indicating how
130 the decoding finished.
131
132 DST_BYTES zero means that the source area and the destination area
133 overlap, which means that we can produce decoded text only until it
134 reaches the head of the not-yet-decoded source text.
135
136 Below is a template of these functions. */
137 #if 0
138 decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
139 struct coding_system *coding;
140 unsigned char *source, *destination;
141 int src_bytes, dst_bytes;
142 {
143 ...
144 }
145 #endif
146
147 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
148
149 These functions encode SRC_BYTES length text at SOURCE of Emacs'
150 internal format (emacs-mule) to CODING. The resulting text goes to
151 a place pointed to by DESTINATION, the length of which should not
152 exceed DST_BYTES. These functions set the information of
153 original and encoded texts in the members produced, produced_char,
154 consumed, and consumed_char of the structure *CODING.
155
156 The return value is an integer (CODING_FINISH_XXX) indicating how
157 the encoding finished.
158
159 DST_BYTES zero means that the source area and the destination area
160 overlap, which means that we can produce encoded text only until it
161 reaches the head of the not-yet-encoded source text.
162
163 Below is a template of these functions. */
164 #if 0
165 encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
166 struct coding_system *coding;
167 unsigned char *source, *destination;
168 int src_bytes, dst_bytes;
169 {
170 ...
171 }
172 #endif
173
174 /*** COMMONLY USED MACROS ***/
175
176 /* The following three macros ONE_MORE_BYTE, TWO_MORE_BYTES, and
177 THREE_MORE_BYTES safely get one, two, and three bytes from the
178 source text respectively. If there are not enough bytes in the
179 source, they jump to `label_end_of_loop'. The caller should set
180 variables `src' and `src_end' to appropriate areas in advance. */
181
/* Fetch the next source byte into C1 and advance `src'.  When the
   source is exhausted, jump to `label_end_of_loop' instead.  */
#define ONE_MORE_BYTE(c1)                       \
  do {                                          \
    if (src >= src_end)                         \
      goto label_end_of_loop;                   \
    c1 = *src++;                                \
  } while (0)
189
/* Fetch the next two source bytes into C1 and C2 and advance `src'
   past them.  When fewer than two bytes remain, jump to
   `label_end_of_loop' without consuming anything.  The remaining
   length is computed as `src_end - src' so that no pointer beyond
   `src_end' is ever formed.  */
#define TWO_MORE_BYTES(c1, c2)                  \
  do {                                          \
    if (src_end - src >= 2)                     \
      c1 = *src++, c2 = *src++;                 \
    else                                        \
      goto label_end_of_loop;                   \
  } while (0)
197
/* Fetch the next three source bytes into C1, C2 and C3 and advance
   `src' past them.  When fewer than three bytes remain, jump to
   `label_end_of_loop' without consuming anything.  The remaining
   length is computed as `src_end - src' so that no pointer beyond
   `src_end' is ever formed.  */
#define THREE_MORE_BYTES(c1, c2, c3)            \
  do {                                          \
    if (src_end - src >= 3)                     \
      c1 = *src++, c2 = *src++, c3 = *src++;    \
    else                                        \
      goto label_end_of_loop;                   \
  } while (0)
205
206 /* The following three macros DECODE_CHARACTER_ASCII,
207 DECODE_CHARACTER_DIMENSION1, and DECODE_CHARACTER_DIMENSION2 put
208 the multi-byte form of a character of each class at the place
209 pointed by `dst'. The caller should set the variable `dst' to
210 point to an appropriate area and the variable `coding' to point to
211 the coding-system of the currently decoding text in advance. */
212
213 /* Decode one ASCII character C. */
214
/* Store ASCII character C at `dst'.  Outside a composition the byte
   is written as is and counted in `coding->produced_char'.  While
   composing, it is written as the two bytes 0xA0 and C + 0x80, and
   the character count is left alone.  */
#define DECODE_CHARACTER_ASCII(c)               \
  do {                                          \
    if (! COMPOSING_P (coding->composing))      \
      {                                         \
        *dst++ = (c);                           \
        coding->produced_char++;                \
      }                                         \
    else                                        \
      {                                         \
        *dst++ = 0xA0;                          \
        *dst++ = (c) | 0x80;                    \
      }                                         \
  } while (0)
225
226 /* Decode one DIMENSION1 character whose charset is CHARSET and whose
227 position-code is C. */
228
/* Store at `dst' the multi-byte form of the DIMENSION1 character whose
   charset is CHARSET and whose position-code is C.  The base
   leading-code is written first (offset by 0x20 while composing; only
   outside a composition is `coding->produced_char' incremented), then
   the extended leading-code if CHARSET has a nonzero one, then
   C with the MSB set.  */
#define DECODE_CHARACTER_DIMENSION1(charset, c)                         \
  do {                                                                  \
    unsigned char leading_code = CHARSET_LEADING_CODE_BASE (charset);   \
    if (COMPOSING_P (coding->composing))                                \
      *dst++ = leading_code + 0x20;                                     \
    else                                                                \
      {                                                                 \
        *dst++ = leading_code;                                          \
        coding->produced_char++;                                        \
      }                                                                 \
    /* Assign first, then test explicitly, so that the condition        \
       cannot be mistaken for a mistyped `=='.  */                      \
    leading_code = CHARSET_LEADING_CODE_EXT (charset);                  \
    if (leading_code != 0)                                              \
      *dst++ = leading_code;                                            \
    *dst++ = (c) | 0x80;                                                \
  } while (0)
243
244 /* Decode one DIMENSION2 character whose charset is CHARSET and whose
245 position-codes are C1 and C2. */
246
/* Store at `dst' the multi-byte form of the DIMENSION2 character whose
   charset is CHARSET and whose position-codes are C1 and C2: the
   DIMENSION1 form for C1, followed by C2 with the MSB set.  */
#define DECODE_CHARACTER_DIMENSION2(charset, c1, c2)    \
  do {                                                  \
    DECODE_CHARACTER_DIMENSION1 (charset, c1);          \
    dst[0] = (c2) | 0x80;                               \
    dst += 1;                                           \
  } while (0)
252
253 \f
254 /*** 1. Preamble ***/
255
256 #include <stdio.h>
257
258 #ifdef emacs
259
260 #include <config.h>
261 #include "lisp.h"
262 #include "buffer.h"
263 #include "charset.h"
264 #include "ccl.h"
265 #include "coding.h"
266 #include "window.h"
267
268 #else /* not emacs */
269
270 #include "mulelib.h"
271
272 #endif /* not emacs */
273
274 Lisp_Object Qcoding_system, Qeol_type;
275 Lisp_Object Qbuffer_file_coding_system;
276 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
277 Lisp_Object Qno_conversion, Qundecided;
278 Lisp_Object Qcoding_system_history;
279 Lisp_Object Qsafe_charsets;
280
281 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
282 Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
283 Lisp_Object Qstart_process, Qopen_network_stream;
284 Lisp_Object Qtarget_idx;
285
286 Lisp_Object Vselect_safe_coding_system_function;
287
288 /* Mnemonic character of each format of end-of-line. */
289 int eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
290 /* Mnemonic character to indicate format of end-of-line is not yet
291 decided. */
292 int eol_mnemonic_undecided;
293
294 /* Format of end-of-line decided by system. This is CODING_EOL_LF on
295 Unix, CODING_EOL_CRLF on DOS/Windows, and CODING_EOL_CR on Mac. */
296 int system_eol_type;
297
298 #ifdef emacs
299
300 Lisp_Object Vcoding_system_list, Vcoding_system_alist;
301
302 Lisp_Object Qcoding_system_p, Qcoding_system_error;
303
304 /* Coding system emacs-mule and raw-text are for converting only
305 end-of-line format. */
306 Lisp_Object Qemacs_mule, Qraw_text;
307
308 /* Coding-systems are handed between Emacs Lisp programs and C internal
309 routines by the following three variables. */
310 /* Coding-system for reading files and receiving data from process. */
311 Lisp_Object Vcoding_system_for_read;
312 /* Coding-system for writing files and sending data to process. */
313 Lisp_Object Vcoding_system_for_write;
314 /* Coding-system actually used in the latest I/O. */
315 Lisp_Object Vlast_coding_system_used;
316
317 /* A vector of length 256 which contains information about special
318 Latin codes (especially for dealing with Microsoft code). */
319 Lisp_Object Vlatin_extra_code_table;
320
321 /* Flag to inhibit code conversion of end-of-line format. */
322 int inhibit_eol_conversion;
323
324 /* Coding system to be used to encode text for terminal display. */
325 struct coding_system terminal_coding;
326
327 /* Coding system to be used to encode text for terminal display when
328 terminal coding system is nil. */
329 struct coding_system safe_terminal_coding;
330
331 /* Coding system of what is sent from terminal keyboard. */
332 struct coding_system keyboard_coding;
333
334 Lisp_Object Vfile_coding_system_alist;
335 Lisp_Object Vprocess_coding_system_alist;
336 Lisp_Object Vnetwork_coding_system_alist;
337
338 #endif /* emacs */
339
340 Lisp_Object Qcoding_category, Qcoding_category_index;
341
342 /* List of symbols `coding-category-xxx' ordered by priority. */
343 Lisp_Object Vcoding_category_list;
344
345 /* Table of coding categories (Lisp symbols). */
346 Lisp_Object Vcoding_category_table;
347
348 /* Table of the names of the `coding-category-xxx' symbols, one entry per coding category.  NOTE(review): the order presumably has to match the CODING_CATEGORY_IDX_XXX values declared in `coding.h' -- confirm before reordering or adding entries. */
349 char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
350 "coding-category-emacs-mule",
351 "coding-category-sjis",
352 "coding-category-iso-7",
353 "coding-category-iso-7-tight",
354 "coding-category-iso-8-1",
355 "coding-category-iso-8-2",
356 "coding-category-iso-7-else",
357 "coding-category-iso-8-else",
358 "coding-category-big5",
359 "coding-category-raw-text",
360 "coding-category-binary"
361 };
362
363 /* Table pointers to coding systems corresponding to each coding
364 categories. */
365 struct coding_system *coding_system_table[CODING_CATEGORY_IDX_MAX];
366
367 /* Flag to tell if we look up unification table on character code
368 conversion. */
369 Lisp_Object Venable_character_unification;
370 /* Standard unification table to look up on decoding (reading). */
371 Lisp_Object Vstandard_character_unification_table_for_decode;
372 /* Standard unification table to look up on encoding (writing). */
373 Lisp_Object Vstandard_character_unification_table_for_encode;
374
375 Lisp_Object Qcharacter_unification_table;
376 Lisp_Object Qcharacter_unification_table_for_decode;
377 Lisp_Object Qcharacter_unification_table_for_encode;
378
379 /* Alist of charsets vs revision number. */
380 Lisp_Object Vcharset_revision_alist;
381
382 /* Default coding systems used for process I/O. */
383 Lisp_Object Vdefault_process_coding_system;
384
385 \f
386 /*** 2. Emacs internal format (emacs-mule) handlers ***/
387
388 /* Emacs' internal format for encoding multiple character sets is a
389 kind of multi-byte encoding, i.e. characters are encoded by
390 variable-length sequences of one-byte codes. ASCII characters
391 and control characters (e.g. `tab', `newline') are represented by
392 one-byte sequences which are their ASCII codes, in the range 0x00
393 through 0x7F. The other characters are represented by a sequence
394 of `base leading-code', optional `extended leading-code', and one
395 or two `position-code's. The length of the sequence is determined
396 by the base leading-code. Leading-code takes the range 0x80
397 through 0x9F, whereas extended leading-code and position-code take
398 the range 0xA0 through 0xFF. See `charset.h' for more details
399 about leading-code and position-code.
400
401 There's one exception to this rule. Special leading-code
402 `leading-code-composition' denotes that the following several
403 characters should be composed into one character. Leading-codes of
404 components (except for ASCII) are added 0x20. An ASCII character
405 component is represented by a 2-byte sequence of `0xA0' and
406 `ASCII-code + 0x80'. See also the comments in `charset.h' for the
407 details of composite character. Hence, we can summarize the code
408 range as follows:
409
410 --- CODE RANGE of Emacs' internal format ---
411 (character set) (range)
412 ASCII 0x00 .. 0x7F
413 ELSE (1st byte) 0x80 .. 0x9F
414 (rest bytes) 0xA0 .. 0xFF
415 ---------------------------------------------
416
417 */
418
419 enum emacs_code_class_type emacs_code_class[256];
420
/* Consume one more byte of a multi-byte sequence.  If no byte is left
   before `src_end', jump to `label_end_of_switch'.  If the byte is
   less than 0xA0, make the enclosing function return 0.  Otherwise
   (the byte is in the range 0xA0..0xFF) go on to the next statement
   with `src' advanced by one.  */
#define CHECK_CODE_RANGE_A0_FF          \
  do {                                  \
    if (src < src_end)                  \
      {                                 \
        if (*src++ < 0xA0)              \
          return 0;                     \
      }                                 \
    else                                \
      goto label_end_of_switch;         \
  } while (0)
430
431 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
432 Check if a text is encoded in Emacs' internal format. If it is,
433 return CODING_CATEGORY_MASK_EMACS_MULE, else return 0. */
434
435 int
436 detect_coding_emacs_mule (src, src_end)
437 unsigned char *src, *src_end;
438 {
439 unsigned char c;
440 int composing = 0;
441
442 while (src < src_end)
443 {
444 c = *src++;
445
446 if (composing)
447 {
448 if (c < 0xA0)
449 composing = 0;
450 else
451 c -= 0x20;
452 }
453
454 switch (emacs_code_class[c])
455 {
456 case EMACS_ascii_code:
457 case EMACS_linefeed_code:
458 break;
459
460 case EMACS_control_code:
461 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
462 return 0;
463 break;
464
465 case EMACS_invalid_code:
466 return 0;
467
468 case EMACS_leading_code_composition: /* c == 0x80 */
469 if (composing)
470 CHECK_CODE_RANGE_A0_FF;
471 else
472 composing = 1;
473 break;
474
475 case EMACS_leading_code_4:
476 CHECK_CODE_RANGE_A0_FF;
477 /* fall down to check it two more times ... */
478
479 case EMACS_leading_code_3:
480 CHECK_CODE_RANGE_A0_FF;
481 /* fall down to check it one more time ... */
482
483 case EMACS_leading_code_2:
484 CHECK_CODE_RANGE_A0_FF;
485 break;
486
487 default:
488 label_end_of_switch:
489 break;
490 }
491 }
492 return CODING_CATEGORY_MASK_EMACS_MULE;
493 }
494
495 \f
496 /*** 3. ISO2022 handlers ***/
497
498 /* The following note describes the coding system ISO2022 briefly.
499 Since the intention of this note is to help in understanding of
500 the programs in this file, some parts are NOT ACCURATE or OVERLY
501 SIMPLIFIED. For the thorough understanding, please refer to the
502 original document of ISO2022.
503
504 ISO2022 provides many mechanisms to encode several character sets
505 in 7-bit and 8-bit environment. If one chooses 7-bit environment,
506 all text is encoded by codes of less than 128. This may make the
507 encoded text a little bit longer, but the text gets more stability
508 to pass through several gateways (some of them strip off the MSB).
509
510 There are two kinds of character set: control character set and
511 graphic character set. The former contains control characters such
512 as `newline' and `escape' to provide control functions (control
513 functions are provided also by escape sequences). The latter
514 contains graphic characters such as 'A' and '-'. Emacs recognizes
515 two control character sets and many graphic character sets.
516
517 Graphic character sets are classified into one of the following
518 four classes, DIMENSION1_CHARS94, DIMENSION1_CHARS96,
519 DIMENSION2_CHARS94, DIMENSION2_CHARS96 according to the number of
520 bytes (DIMENSION) and the number of characters in one dimension
521 (CHARS) of the set. In addition, each character set is assigned an
522 identification tag (called "final character" and denoted as <F>
523 here after) which is unique in each class. <F> of each character
524 set is decided by ECMA(*) when it is registered in ISO. Code range
525 of <F> is 0x30..0x7F (0x30..0x3F are for private use only).
526
527 Note (*): ECMA = European Computer Manufacturers Association
528
529 Here are examples of graphic character set [NAME(<F>)]:
530 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
531 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
532 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
533 o DIMENSION2_CHARS96 -- none for the moment
534
535 A code area (1byte=8bits) is divided into 4 areas, C0, GL, C1, and GR.
536 C0 [0x00..0x1F] -- control character plane 0
537 GL [0x20..0x7F] -- graphic character plane 0
538 C1 [0x80..0x9F] -- control character plane 1
539 GR [0xA0..0xFF] -- graphic character plane 1
540
541 A control character set is directly designated and invoked to C0 or
542 C1 by an escape sequence. The most common case is that ISO646's
543 control character set is designated/invoked to C0 and ISO6429's
544 control character set is designated/invoked to C1, and usually
545 these designations/invocations are omitted in a coded text. With
546 7-bit environment, only C0 can be used, and a control character for
547 C1 is encoded by an appropriate escape sequence to fit in the
548 environment. All control characters for C1 have corresponding
549 escape sequences defined.
550
551 A graphic character set is at first designated to one of four
552 graphic registers (G0 through G3), then these graphic registers are
553 invoked to GL or GR. These designations and invocations can be
554 done independently. The most common case is that G0 is invoked to
555 GL, G1 is invoked to GR, and ASCII is designated to G0, and usually
556 these invocations and designations are omitted in a coded text.
557 With 7-bit environment, only GL can be used.
558
559 When a graphic character set of CHARS94 is invoked to GL, code 0x20
560 and 0x7F of GL area work as control characters SPACE and DEL
561 respectively, and code 0xA0 and 0xFF of GR area should not be used.
562
563 There are two ways of invocation: locking-shift and single-shift.
564 With locking-shift, the invocation lasts until the next different
565 invocation, whereas with single-shift, the invocation works only
566 for the following character and doesn't affect locking-shift.
567 Invocations are done by the following control characters or escape
568 sequences.
569
570 ----------------------------------------------------------------------
571 function control char escape sequence description
572 ----------------------------------------------------------------------
573 SI (shift-in) 0x0F none invoke G0 to GL
574 SO (shift-out) 0x0E none invoke G1 to GL
575 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
576 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
577 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 into GL
578 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 into GL
579 ----------------------------------------------------------------------
580 The first four are for locking-shift. Control characters for these
581 functions are defined by macros ISO_CODE_XXX in `coding.h'.
582
583 Designations are done by the following escape sequences.
584 ----------------------------------------------------------------------
585 escape sequence description
586 ----------------------------------------------------------------------
587 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
588 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
589 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
590 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
591 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
592 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
593 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
594 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
595 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
596 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
597 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
598 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
599 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
600 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
601 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
602 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
603 ----------------------------------------------------------------------
604
605 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
606 of dimension 1, chars 94, and final character <F>, and etc.
607
608 Note (*): Although these designations are not allowed in ISO2022,
609 Emacs accepts them on decoding, and produces them on encoding
610 CHARS96 character set in a coding system which is characterized as
611 7-bit environment, non-locking-shift, and non-single-shift.
612
613 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
614 '(' can be omitted. We call this as "short-form" here after.
615
616 Now you may notice that there are a lot of ways for encoding the
617 same multilingual text in ISO2022. Actually, there exists many
618 coding systems such as Compound Text (used in X's inter client
619 communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
620 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
621 localized platforms), and all of these are variants of ISO2022.
622
623 In addition to the above, Emacs handles two more kinds of escape
624 sequences: ISO6429's direction specification and Emacs' private
625 sequence for specifying character composition.
626
627 ISO6429's direction specification takes the following format:
628 o CSI ']' -- end of the current direction
629 o CSI '0' ']' -- end of the current direction
630 o CSI '1' ']' -- start of left-to-right text
631 o CSI '2' ']' -- start of right-to-left text
632 The control character CSI (0x9B: control sequence introducer) is
633 abbreviated to the escape sequence ESC '[' in 7-bit environment.
634
635 Character composition specification takes the following format:
636 o ESC '0' -- start character composition
637 o ESC '1' -- end character composition
638 Since these are not standard escape sequences of any ISO, the use
639 of them for these meaning is restricted to Emacs only. */
640
641 enum iso_code_class_type iso_code_class[256];
642
/* Nonzero if the coding system registered for category IDX accepts
   CHARSET: either CHARSET is marked in its `safe_charsets' table or a
   designation is requested for CHARSET.  Yields 0 when no coding
   system is registered for IDX (the table entry is a null pointer),
   instead of dereferencing it.  */
#define CHARSET_OK(idx, charset)                                \
  (coding_system_table[idx]                                     \
   && (coding_system_table[idx]->safe_charsets[charset]         \
       || (CODING_SPEC_ISO_REQUESTED_DESIGNATION                \
           (coding_system_table[idx], charset)                  \
           != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)))
648
/* Nonzero if the coding system registered for category IDX has an
   initial designation for graphic register 1.  Yields 0 when no coding
   system is registered for IDX (the table entry is a null pointer),
   instead of dereferencing it.  */
#define SHIFT_OUT_OK(idx)                                               \
  (coding_system_table[idx]                                             \
   && (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding_system_table[idx], 1) \
       >= 0))
651
652 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
653 Check if a text is encoded in ISO2022. If it is, returns an
654 integer in which appropriate flag bits any of:
655 CODING_CATEGORY_MASK_ISO_7
656 CODING_CATEGORY_MASK_ISO_7_TIGHT
657 CODING_CATEGORY_MASK_ISO_8_1
658 CODING_CATEGORY_MASK_ISO_8_2
659 CODING_CATEGORY_MASK_ISO_7_ELSE
660 CODING_CATEGORY_MASK_ISO_8_ELSE
661 are set. If a code which should never appear in ISO2022 is found,
662 returns 0. */
663
664 int
665 detect_coding_iso2022 (src, src_end)
666 unsigned char *src, *src_end;
667 {
668 int mask = CODING_CATEGORY_MASK_ISO;
669 int mask_found = 0;
670 int reg[4], shift_out = 0;
671 int c, c1, i, charset;
672
673 reg[0] = CHARSET_ASCII, reg[1] = reg[2] = reg[3] = -1;
674 while (mask && src < src_end)
675 {
676 c = *src++;
677 switch (c)
678 {
679 case ISO_CODE_ESC:
680 if (src >= src_end)
681 break;
682 c = *src++;
683 if (c >= '(' && c <= '/')
684 {
685 /* Designation sequence for a charset of dimension 1. */
686 if (src >= src_end)
687 break;
688 c1 = *src++;
689 if (c1 < ' ' || c1 >= 0x80
690 || (charset = iso_charset_table[0][c >= ','][c1]) < 0)
691 /* Invalid designation sequence. Just ignore. */
692 break;
693 reg[(c - '(') % 4] = charset;
694 }
695 else if (c == '$')
696 {
697 /* Designation sequence for a charset of dimension 2. */
698 if (src >= src_end)
699 break;
700 c = *src++;
701 if (c >= '@' && c <= 'B')
702 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
703 reg[0] = charset = iso_charset_table[1][0][c];
704 else if (c >= '(' && c <= '/')
705 {
706 if (src >= src_end)
707 break;
708 c1 = *src++;
709 if (c1 < ' ' || c1 >= 0x80
710 || (charset = iso_charset_table[1][c >= ','][c1]) < 0)
711 /* Invalid designation sequence. Just ignore. */
712 break;
713 reg[(c - '(') % 4] = charset;
714 }
715 else
716 /* Invalid designation sequence. Just ignore. */
717 break;
718 }
719 else if (c == 'N' || c == 'n')
720 {
721 if (shift_out == 0
722 && (reg[1] >= 0
723 || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_7_ELSE)
724 || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_8_ELSE)))
725 {
726 /* Locking shift out. */
727 mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
728 mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
729 shift_out = 1;
730 }
731 break;
732 }
733 else if (c == 'O' || c == 'o')
734 {
735 if (shift_out == 1)
736 {
737 /* Locking shift in. */
738 mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
739 mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
740 shift_out = 0;
741 }
742 break;
743 }
744 else if (c == '0' || c == '1' || c == '2')
745 /* Start/end composition. Just ignore. */
746 break;
747 else
748 /* Invalid escape sequence. Just ignore. */
749 break;
750
751 /* We found a valid designation sequence for CHARSET. */
752 mask &= ~CODING_CATEGORY_MASK_ISO_8BIT;
753 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7, charset))
754 mask_found |= CODING_CATEGORY_MASK_ISO_7;
755 else
756 mask &= ~CODING_CATEGORY_MASK_ISO_7;
757 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT, charset))
758 mask_found |= CODING_CATEGORY_MASK_ISO_7_TIGHT;
759 else
760 mask &= ~CODING_CATEGORY_MASK_ISO_7_TIGHT;
761 if (! CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_ELSE, charset))
762 mask &= ~CODING_CATEGORY_MASK_ISO_7_ELSE;
763 if (! CHARSET_OK (CODING_CATEGORY_IDX_ISO_8_ELSE, charset))
764 mask &= ~CODING_CATEGORY_MASK_ISO_8_ELSE;
765 break;
766
767 case ISO_CODE_SO:
768 if (shift_out == 0
769 && (reg[1] >= 0
770 || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_7_ELSE)
771 || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_8_ELSE)))
772 {
773 /* Locking shift out. */
774 mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
775 mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
776 }
777 break;
778
779 case ISO_CODE_SI:
780 if (shift_out == 1)
781 {
782 /* Locking shift in. */
783 mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
784 mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
785 }
786 break;
787
788 case ISO_CODE_CSI:
789 case ISO_CODE_SS2:
790 case ISO_CODE_SS3:
791 {
792 int newmask = CODING_CATEGORY_MASK_ISO_8_ELSE;
793
794 if (c != ISO_CODE_CSI)
795 {
796 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
797 & CODING_FLAG_ISO_SINGLE_SHIFT)
798 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
799 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
800 & CODING_FLAG_ISO_SINGLE_SHIFT)
801 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
802 }
803 if (VECTORP (Vlatin_extra_code_table)
804 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
805 {
806 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
807 & CODING_FLAG_ISO_LATIN_EXTRA)
808 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
809 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
810 & CODING_FLAG_ISO_LATIN_EXTRA)
811 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
812 }
813 mask &= newmask;
814 mask_found |= newmask;
815 }
816 break;
817
818 default:
819 if (c < 0x80)
820 break;
821 else if (c < 0xA0)
822 {
823 if (VECTORP (Vlatin_extra_code_table)
824 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
825 {
826 int newmask = 0;
827
828 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
829 & CODING_FLAG_ISO_LATIN_EXTRA)
830 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
831 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
832 & CODING_FLAG_ISO_LATIN_EXTRA)
833 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
834 mask &= newmask;
835 mask_found |= newmask;
836 }
837 else
838 return 0;
839 }
840 else
841 {
842 unsigned char *src_begin = src;
843
844 mask &= ~(CODING_CATEGORY_MASK_ISO_7BIT
845 | CODING_CATEGORY_MASK_ISO_7_ELSE);
846 mask_found |= CODING_CATEGORY_MASK_ISO_8_1;
847 while (src < src_end && *src >= 0xA0)
848 src++;
849 if ((src - src_begin - 1) & 1 && src < src_end)
850 mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
851 else
852 mask_found |= CODING_CATEGORY_MASK_ISO_8_2;
853 }
854 break;
855 }
856 }
857
858 return (mask & mask_found);
859 }
860
861 /* Decode a character whose charset is CHARSET and whose 1st position code
862 is C1. If the dimension of CHARSET is 2, the 2nd position code is fetched
863 from SRC and set to the caller's variable C2. If CHARSET is negative, it
864 means that we are decoding ill-formed text, and all we can do is to read
865 C1 as is. */
866
867 #define DECODE_ISO_CHARACTER(charset, c1) \
868 do { \
869 int c_alt, charset_alt = (charset); \
870 if (COMPOSING_HEAD_P (coding->composing)) \
871 { \
872 *dst++ = LEADING_CODE_COMPOSITION; \
873 if (COMPOSING_WITH_RULE_P (coding->composing)) \
874 /* To tell composition rules are embedded. */ \
875 *dst++ = 0xFF; \
876 coding->composing += 2; \
877 } \
878 if ((charset) >= 0) \
879 { \
880 if (CHARSET_DIMENSION (charset) == 2) \
881 { \
882 ONE_MORE_BYTE (c2); /* Jumps to `label_end_of_loop' if no byte is left. */ \
883 if (iso_code_class[(c2) & 0x7F] != ISO_0x20_or_0x7F \
884 && iso_code_class[(c2) & 0x7F] != ISO_graphic_plane_0) \
885 { \
886 src--; /* Leave the unexpected byte unread ... */ \
887 c2 = ' '; /* ... and use a space as the 2nd position code. */ \
888 } \
889 } \
890 if (!NILP (unification_table) \
891 && ((c_alt = unify_char (unification_table, \
892 -1, (charset), c1, c2)) >= 0)) \
893 SPLIT_CHAR (c_alt, charset_alt, c1, c2); /* Continue with the character unify_char returned. */ \
894 } \
895 if (charset_alt == CHARSET_ASCII || charset_alt < 0) \
896 DECODE_CHARACTER_ASCII (c1); \
897 else if (CHARSET_DIMENSION (charset_alt) == 1) \
898 DECODE_CHARACTER_DIMENSION1 (charset_alt, c1); \
899 else \
900 DECODE_CHARACTER_DIMENSION2 (charset_alt, c1, c2); \
901 if (COMPOSING_WITH_RULE_P (coding->composing)) \
902 /* To tell a composition rule follows. */ \
903 coding->composing = COMPOSING_WITH_RULE_RULE; \
904 } while (0)
905
/* Record in CODING that the charset identified by DIMENSION, CHARS
   and FINAL_CHAR is designated to graphic register REG.

   The designation is accepted when the charset is known and either
   CODING requests it for REG or it is one of CODING's safe charsets.
   Otherwise REG is remembered as the last invalid designation
   register and control jumps to `label_invalid_code' of the enclosing
   function.  */
#define DECODE_DESIGNATION(reg, dimension, chars, final_char) \
  do { \
    int desig_charset_ = ISO_CHARSET_TABLE (make_number (dimension), \
                                            make_number (chars), \
                                            make_number (final_char)); \
    int desig_reg_ = (reg); \
    int desig_prev_invalid_; \
    if (desig_charset_ < 0 \
        || ((CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, \
                                                    desig_charset_) \
             != desig_reg_) \
            && ! coding->safe_charsets[desig_charset_])) \
      { \
        coding->spec.iso2022.last_invalid_designation_register \
          = desig_reg_; \
        goto label_invalid_code; \
      } \
    desig_prev_invalid_ \
      = coding->spec.iso2022.last_invalid_designation_register; \
    coding->spec.iso2022.last_invalid_designation_register = -1; \
    if (desig_prev_invalid_ == 0 \
        && desig_reg_ == 0 \
        && desig_charset_ == CHARSET_ASCII) \
      { \
        /* We should insert this designation sequence as is so \
           that it is surely written back to a file.  */ \
        goto label_invalid_code; \
      } \
    if ((coding->mode & CODING_MODE_DIRECTION) \
        && CHARSET_REVERSE_CHARSET (desig_charset_) >= 0) \
      desig_charset_ = CHARSET_REVERSE_CHARSET (desig_charset_); \
    CODING_SPEC_ISO_DESIGNATION (coding, desig_reg_) = desig_charset_; \
  } while (0)
937
938 /* Check if the current composing sequence contains only valid codes.
939 If the composing sequence doesn't end before SRC_END, return -1.
940 Else, if it contains only valid codes, return 0.
941 Else return the length of the composing sequence. */
942
943 int check_composing_code (coding, src, src_end)
944 struct coding_system *coding;
945 unsigned char *src, *src_end;
946 {
947 unsigned char *src_start = src;
948 int invalid_code_found = 0;
949 int charset, c, c1, dim;
950
951 while (src < src_end)
952 {
953 if (*src++ != ISO_CODE_ESC) continue;
954 if (src >= src_end) break;
955 if ((c = *src++) == '1') /* end of compsition */
956 return (invalid_code_found ? src - src_start : 0);
957 if (src + 2 >= src_end) break;
958 if (!coding->flags & CODING_FLAG_ISO_DESIGNATION)
959 invalid_code_found = 1;
960 else
961 {
962 dim = 0;
963 if (c == '$')
964 {
965 dim = 1;
966 c = (*src >= '@' && *src <= 'B') ? '(' : *src++;
967 }
968 if (c >= '(' && c <= '/')
969 {
970 c1 = *src++;
971 if ((c1 < ' ' || c1 >= 0x80)
972 || (charset = iso_charset_table[dim][c >= ','][c1]) < 0
973 || ! coding->safe_charsets[charset]
974 || (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
975 == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
976 invalid_code_found = 1;
977 }
978 else
979 invalid_code_found = 1;
980 }
981 }
982 return ((coding->mode & CODING_MODE_LAST_BLOCK) ? src_end - src_start : -1);
983 }
984
985 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
986
987 int
988 decode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
989 struct coding_system *coding;
990 unsigned char *source, *destination;
991 int src_bytes, dst_bytes;
992 {
993 unsigned char *src = source;
994 unsigned char *src_end = source + src_bytes;
995 unsigned char *dst = destination;
996 unsigned char *dst_end = destination + dst_bytes;
997 /* Since the maximum bytes produced by each loop is 7, we subtract 6
998 from DST_END to assure that overflow checking is necessary only
999 at the head of loop. */
1000 unsigned char *adjusted_dst_end = dst_end - 6;
1001 int charset;
1002 /* Charsets invoked to graphic plane 0 and 1 respectively. */
1003 int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1004 int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1005 Lisp_Object unification_table
1006 = coding->character_unification_table_for_decode;
1007 int result = CODING_FINISH_NORMAL;
1008
1009 if (!NILP (Venable_character_unification) && NILP (unification_table))
1010 unification_table = Vstandard_character_unification_table_for_decode;
1011
1012 coding->produced_char = 0;
1013 coding->fake_multibyte = 0;
1014 while (src < src_end && (dst_bytes
1015 ? (dst < adjusted_dst_end)
1016 : (dst < src - 6)))
1017 {
1018 /* SRC_BASE remembers the start position in source in each loop.
1019 The loop will be exited when there's not enough source text
1020 to analyze long escape sequence or 2-byte code (within macros
1021 ONE_MORE_BYTE or TWO_MORE_BYTES). In that case, SRC is reset
1022 to SRC_BASE before exiting. */
1023 unsigned char *src_base = src;
1024 int c1 = *src++, c2;
1025
1026 switch (iso_code_class [c1])
1027 {
1028 case ISO_0x20_or_0x7F:
1029 if (!coding->composing
1030 && (charset0 < 0 || CHARSET_CHARS (charset0) == 94))
1031 {
1032 /* This is SPACE or DEL. */
1033 *dst++ = c1;
1034 coding->produced_char++;
1035 break;
1036 }
1037 /* This is a graphic character, we fall down ... */
1038
1039 case ISO_graphic_plane_0:
1040 if (coding->composing == COMPOSING_WITH_RULE_RULE)
1041 {
1042 /* This is a composition rule. */
1043 *dst++ = c1 | 0x80;
1044 coding->composing = COMPOSING_WITH_RULE_TAIL;
1045 }
1046 else
1047 DECODE_ISO_CHARACTER (charset0, c1);
1048 break;
1049
1050 case ISO_0xA0_or_0xFF:
1051 if (charset1 < 0 || CHARSET_CHARS (charset1) == 94
1052 || coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
1053 goto label_invalid_code;
1054 /* This is a graphic character, we fall down ... */
1055
1056 case ISO_graphic_plane_1:
1057 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
1058 goto label_invalid_code;
1059 else
1060 DECODE_ISO_CHARACTER (charset1, c1);
1061 break;
1062
1063 case ISO_control_code:
1064 /* All ISO2022 control characters in this class have the
1065 same representation in Emacs internal format. */
1066 if (c1 == '\n'
1067 && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1068 && (coding->eol_type == CODING_EOL_CR
1069 || coding->eol_type == CODING_EOL_CRLF))
1070 {
1071 result = CODING_FINISH_INCONSISTENT_EOL;
1072 goto label_end_of_loop_2;
1073 }
1074 *dst++ = c1;
1075 coding->produced_char++;
1076 break;
1077
1078 case ISO_carriage_return:
1079 if (coding->eol_type == CODING_EOL_CR)
1080 *dst++ = '\n';
1081 else if (coding->eol_type == CODING_EOL_CRLF)
1082 {
1083 ONE_MORE_BYTE (c1);
1084 if (c1 == ISO_CODE_LF)
1085 *dst++ = '\n';
1086 else
1087 {
1088 if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1089 {
1090 result = CODING_FINISH_INCONSISTENT_EOL;
1091 goto label_end_of_loop_2;
1092 }
1093 src--;
1094 *dst++ = '\r';
1095 }
1096 }
1097 else
1098 *dst++ = c1;
1099 coding->produced_char++;
1100 break;
1101
1102 case ISO_shift_out:
1103 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1104 || CODING_SPEC_ISO_DESIGNATION (coding, 1) < 0)
1105 goto label_invalid_code;
1106 CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
1107 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1108 break;
1109
1110 case ISO_shift_in:
1111 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
1112 goto label_invalid_code;
1113 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
1114 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1115 break;
1116
1117 case ISO_single_shift_2_7:
1118 case ISO_single_shift_2:
1119 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1120 goto label_invalid_code;
1121 /* SS2 is handled as an escape sequence of ESC 'N' */
1122 c1 = 'N';
1123 goto label_escape_sequence;
1124
1125 case ISO_single_shift_3:
1126 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1127 goto label_invalid_code;
1128 /* SS2 is handled as an escape sequence of ESC 'O' */
1129 c1 = 'O';
1130 goto label_escape_sequence;
1131
1132 case ISO_control_sequence_introducer:
1133 /* CSI is handled as an escape sequence of ESC '[' ... */
1134 c1 = '[';
1135 goto label_escape_sequence;
1136
1137 case ISO_escape:
1138 ONE_MORE_BYTE (c1);
1139 label_escape_sequence:
1140 /* Escape sequences handled by Emacs are invocation,
1141 designation, direction specification, and character
1142 composition specification. */
1143 switch (c1)
1144 {
1145 case '&': /* revision of following character set */
1146 ONE_MORE_BYTE (c1);
1147 if (!(c1 >= '@' && c1 <= '~'))
1148 goto label_invalid_code;
1149 ONE_MORE_BYTE (c1);
1150 if (c1 != ISO_CODE_ESC)
1151 goto label_invalid_code;
1152 ONE_MORE_BYTE (c1);
1153 goto label_escape_sequence;
1154
1155 case '$': /* designation of 2-byte character set */
1156 if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
1157 goto label_invalid_code;
1158 ONE_MORE_BYTE (c1);
1159 if (c1 >= '@' && c1 <= 'B')
1160 { /* designation of JISX0208.1978, GB2312.1980,
1161 or JISX0208.1980 */
1162 DECODE_DESIGNATION (0, 2, 94, c1);
1163 }
1164 else if (c1 >= 0x28 && c1 <= 0x2B)
1165 { /* designation of DIMENSION2_CHARS94 character set */
1166 ONE_MORE_BYTE (c2);
1167 DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
1168 }
1169 else if (c1 >= 0x2C && c1 <= 0x2F)
1170 { /* designation of DIMENSION2_CHARS96 character set */
1171 ONE_MORE_BYTE (c2);
1172 DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
1173 }
1174 else
1175 goto label_invalid_code;
1176 break;
1177
1178 case 'n': /* invocation of locking-shift-2 */
1179 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1180 || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
1181 goto label_invalid_code;
1182 CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
1183 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1184 break;
1185
1186 case 'o': /* invocation of locking-shift-3 */
1187 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1188 || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
1189 goto label_invalid_code;
1190 CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
1191 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1192 break;
1193
1194 case 'N': /* invocation of single-shift-2 */
1195 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1196 || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
1197 goto label_invalid_code;
1198 ONE_MORE_BYTE (c1);
1199 charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
1200 DECODE_ISO_CHARACTER (charset, c1);
1201 break;
1202
1203 case 'O': /* invocation of single-shift-3 */
1204 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1205 || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
1206 goto label_invalid_code;
1207 ONE_MORE_BYTE (c1);
1208 charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
1209 DECODE_ISO_CHARACTER (charset, c1);
1210 break;
1211
1212 case '0': case '2': /* start composing */
1213 /* Before processing composing, we must be sure that all
1214 characters being composed are supported by CODING.
1215 If not, we must give up composing and insert the
1216 bunch of codes for composing as is without decoding. */
1217 {
1218 int result1;
1219
1220 result1 = check_composing_code (coding, src, src_end);
1221 if (result1 == 0)
1222 coding->composing = (c1 == '0'
1223 ? COMPOSING_NO_RULE_HEAD
1224 : COMPOSING_WITH_RULE_HEAD);
1225 else if (result1 > 0)
1226 {
1227 if (result1 + 2 < (dst_bytes ? dst_end : src_base) - dst)
1228 {
1229 bcopy (src_base, dst, result1 + 2);
1230 src += result1;
1231 dst += result1 + 2;
1232 coding->produced_char += result1 + 2;
1233 }
1234 else
1235 {
1236 result = CODING_FINISH_INSUFFICIENT_DST;
1237 goto label_end_of_loop_2;
1238 }
1239 }
1240 else
1241 goto label_end_of_loop;
1242 }
1243 break;
1244
1245 case '1': /* end composing */
1246 coding->composing = COMPOSING_NO;
1247 coding->produced_char++;
1248 break;
1249
1250 case '[': /* specification of direction */
1251 if (coding->flags & CODING_FLAG_ISO_NO_DIRECTION)
1252 goto label_invalid_code;
1253 /* For the moment, nested direction is not supported.
1254 So, `coding->mode & CODING_MODE_DIRECTION' zero means
1255 left-to-right, and nozero means right-to-left. */
1256 ONE_MORE_BYTE (c1);
1257 switch (c1)
1258 {
1259 case ']': /* end of the current direction */
1260 coding->mode &= ~CODING_MODE_DIRECTION;
1261
1262 case '0': /* end of the current direction */
1263 case '1': /* start of left-to-right direction */
1264 ONE_MORE_BYTE (c1);
1265 if (c1 == ']')
1266 coding->mode &= ~CODING_MODE_DIRECTION;
1267 else
1268 goto label_invalid_code;
1269 break;
1270
1271 case '2': /* start of right-to-left direction */
1272 ONE_MORE_BYTE (c1);
1273 if (c1 == ']')
1274 coding->mode |= CODING_MODE_DIRECTION;
1275 else
1276 goto label_invalid_code;
1277 break;
1278
1279 default:
1280 goto label_invalid_code;
1281 }
1282 break;
1283
1284 default:
1285 if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
1286 goto label_invalid_code;
1287 if (c1 >= 0x28 && c1 <= 0x2B)
1288 { /* designation of DIMENSION1_CHARS94 character set */
1289 ONE_MORE_BYTE (c2);
1290 DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
1291 }
1292 else if (c1 >= 0x2C && c1 <= 0x2F)
1293 { /* designation of DIMENSION1_CHARS96 character set */
1294 ONE_MORE_BYTE (c2);
1295 DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
1296 }
1297 else
1298 {
1299 goto label_invalid_code;
1300 }
1301 }
1302 /* We must update these variables now. */
1303 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1304 charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1305 break;
1306
1307 label_invalid_code:
1308 while (src_base < src)
1309 *dst++ = *src_base++;
1310 coding->fake_multibyte = 1;
1311 }
1312 continue;
1313
1314 label_end_of_loop:
1315 result = CODING_FINISH_INSUFFICIENT_SRC;
1316 label_end_of_loop_2:
1317 src = src_base;
1318 break;
1319 }
1320
1321 if (src < src_end)
1322 {
1323 if (result == CODING_FINISH_NORMAL)
1324 result = CODING_FINISH_INSUFFICIENT_DST;
1325 else if (result != CODING_FINISH_INCONSISTENT_EOL
1326 && coding->mode & CODING_MODE_LAST_BLOCK)
1327 {
1328 /* This is the last block of the text to be decoded. We had
1329 better just flush out all remaining codes in the text
1330 although they are not valid characters. */
1331 src_bytes = src_end - src;
1332 if (dst_bytes && (dst_end - dst < src_bytes))
1333 src_bytes = dst_end - dst;
1334 bcopy (src, dst, src_bytes);
1335 dst += src_bytes;
1336 src += src_bytes;
1337 coding->fake_multibyte = 1;
1338 }
1339 }
1340
1341 coding->consumed = coding->consumed_char = src - source;
1342 coding->produced = dst - destination;
1343 return result;
1344 }
1345
1346 /* ISO2022 encoding stuff. */
1347
1348 /*
1349 It is not enough to say just "ISO2022" on encoding, we have to
1350 specify more details. In Emacs, each coding system of ISO2022
1351 variant has the following specifications:
1352 1. Initial designation to G0 thru G3.
1353 2. Allows short-form designation?
1354 3. ASCII should be designated to G0 before control characters?
1355 4. ASCII should be designated to G0 at end of line?
1356 5. 7-bit environment or 8-bit environment?
1357 6. Use locking-shift?
1358 7. Use Single-shift?
1359 And the following two are only for Japanese:
1360 8. Use ASCII in place of JIS0201-1976-Roman?
1361 9. Use JISX0208-1983 in place of JISX0208-1978?
1362 These specifications are encoded in `coding->flags' as flag bits
1363 defined by macros CODING_FLAG_ISO_XXX. See `coding.h' for more
1364 details.
1365 */
1366
/* Write at DST the escape sequence that designates CHARSET to graphic
   register REG, and record the designation in CODING.  A revision
   sequence (ESC '&' <rev>) comes first when the revision number
   registered for CHARSET is below 255.  For a 2-byte 94-charset whose
   <final-char> is '@', 'A' or 'B', designated to register 0, the
   intermediate character is omitted (short form) if CODING allows.  */

#define ENCODE_DESIGNATION(charset, reg, coding) \
  do { \
    unsigned char desig_final_ = CHARSET_ISO_FINAL_CHAR (charset); \
    /* Intermediate characters, indexed by graphic register.  */ \
    char *desig_inter94_ = "()*+"; \
    char *desig_inter96_ = ",-./"; \
    int desig_rev_ = CODING_SPEC_ISO_REVISION_NUMBER(coding, charset); \
    if (desig_rev_ < 255) \
      { \
        *dst++ = ISO_CODE_ESC; \
        *dst++ = '&'; \
        *dst++ = '@' + desig_rev_; \
      } \
    *dst++ = ISO_CODE_ESC; \
    if (CHARSET_DIMENSION (charset) != 1) \
      { \
        *dst++ = '$'; \
        if (CHARSET_CHARS (charset) != 94) \
          *dst++ = (unsigned char) (desig_inter96_[reg]); \
        else if (! (coding->flags & CODING_FLAG_ISO_SHORT_FORM) \
                 || reg != 0 \
                 || desig_final_ < '@' || desig_final_ > 'B') \
          *dst++ = (unsigned char) (desig_inter94_[reg]); \
      } \
    else if (CHARSET_CHARS (charset) == 94) \
      *dst++ = (unsigned char) (desig_inter94_[reg]); \
    else \
      *dst++ = (unsigned char) (desig_inter96_[reg]); \
    *dst++ = desig_final_; \
    CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset; \
  } while (0)
1408
/* ENCODE_SINGLE_SHIFT_2 and ENCODE_SINGLE_SHIFT_3 write at DST the
   code for the ISO2022 single-shift-2 / single-shift-3 function and
   mark CODING as being in the single-shifting state.  With
   CODING_FLAG_ISO_SEVEN_BITS set the function is written as an escape
   sequence (ESC 'N' / ESC 'O'); otherwise the single control byte is
   written and `fake_multibyte' is set.  */

#define ENCODE_SINGLE_SHIFT_2 \
  do { \
    if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)) \
      { \
        *dst++ = ISO_CODE_SS2; \
        coding->fake_multibyte = 1; \
      } \
    else \
      { \
        *dst++ = ISO_CODE_ESC; \
        *dst++ = 'N'; \
      } \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1; \
  } while (0)

#define ENCODE_SINGLE_SHIFT_3 \
  do { \
    if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)) \
      { \
        *dst++ = ISO_CODE_SS3; \
        coding->fake_multibyte = 1; \
      } \
    else \
      { \
        *dst++ = ISO_CODE_ESC; \
        *dst++ = 'O'; \
      } \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1; \
  } while (0)
1436
/* The four macros below write at DST the code for an ISO2022
   locking-shift function and record in CODING which graphic register
   is now invoked to graphic plane 0:
     ENCODE_SHIFT_IN         SI       register 0
     ENCODE_SHIFT_OUT        SO       register 1
     ENCODE_LOCKING_SHIFT_2  ESC 'n'  register 2
     ENCODE_LOCKING_SHIFT_3  ESC 'o'  register 3  */

#define ENCODE_SHIFT_IN \
  do { \
    *dst++ = ISO_CODE_SI; \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 0; \
  } while (0)

#define ENCODE_SHIFT_OUT \
  do { \
    *dst++ = ISO_CODE_SO; \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 1; \
  } while (0)

#define ENCODE_LOCKING_SHIFT_2 \
  do { \
    *dst++ = ISO_CODE_ESC; \
    *dst++ = 'n'; \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 2; \
  } while (0)

#define ENCODE_LOCKING_SHIFT_3 \
  do { \
    *dst++ = ISO_CODE_ESC; \
    *dst++ = 'o'; \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 3; \
  } while (0)
1464
/* Write at DST the code for a DIMENSION1 character whose character
   set is CHARSET and whose position-code is C1.  If CHARSET is not
   yet invoked to a graphic plane, the necessary designation and
   invocation sequences are written first.  */


#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1) \
  do { \
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding)) \
      { \
        /* A single-shift was just written; this byte completes it.  */ \
        if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
          *dst++ = c1 & 0x7F; \
        else \
          *dst++ = c1 | 0x80; \
        CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0; \
        break; \
      } \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0)) \
      { \
        *dst++ = c1 & 0x7F; \
        break; \
      } \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1)) \
      { \
        *dst++ = c1 | 0x80; \
        break; \
      } \
    if (coding->flags & CODING_FLAG_ISO_SAFE \
        && !coding->safe_charsets[charset]) \
      { \
        /* We should not encode this character, instead produce one or \
           two `?'s.  */ \
        *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION; \
        if (CHARSET_WIDTH (charset) == 2) \
          *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION; \
        break; \
      } \
    /* Since CHARSET is not yet invoked to any graphic planes, we \
       must invoke it, or, at first, designate it to some graphic \
       register.  Then repeat the loop to actually produce the \
       character.  */ \
    dst = encode_invocation_designation (charset, coding, dst); \
  } while (1)
1508
/* Write at DST the codes for a DIMENSION2 character whose character
   set is CHARSET and whose position-codes are C1 and C2.  If CHARSET
   is not yet invoked to a graphic plane, the necessary designation
   and invocation sequences are written first.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2) \
  do { \
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding)) \
      { \
        /* A single-shift was just written; these bytes complete it.  */ \
        if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
          { \
            *dst++ = c1 & 0x7F; \
            *dst++ = c2 & 0x7F; \
          } \
        else \
          { \
            *dst++ = c1 | 0x80; \
            *dst++ = c2 | 0x80; \
          } \
        CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0; \
        break; \
      } \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0)) \
      { \
        *dst++ = c1 & 0x7F; \
        *dst++ = c2 & 0x7F; \
        break; \
      } \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1)) \
      { \
        *dst++ = c1 | 0x80; \
        *dst++ = c2 | 0x80; \
        break; \
      } \
    if (coding->flags & CODING_FLAG_ISO_SAFE \
        && !coding->safe_charsets[charset]) \
      { \
        /* We should not encode this character, instead produce one or \
           two `?'s.  */ \
        *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION; \
        if (CHARSET_WIDTH (charset) == 2) \
          *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION; \
        break; \
      } \
    /* Since CHARSET is not yet invoked to any graphic planes, we \
       must invoke it, or, at first, designate it to some graphic \
       register.  Then repeat the loop to actually produce the \
       character.  */ \
    dst = encode_invocation_designation (charset, coding, dst); \
  } while (1)
1551
/* Write at DST the ISO2022 codes for the character whose charset is
   CHARSET and whose position codes are C1 and C2 (C2 is a dummy for
   one-byte charsets).  When a unification table is in effect and maps
   the character, the mapped character is encoded instead; C1 and C2
   are then overwritten with its position codes.  `consumed_char' of
   CODING is incremented unless a composition is in progress.

   The expansion refers to `coding', `dst' and `unification_table' of
   the enclosing function.  */
#define ENCODE_ISO_CHARACTER(charset, c1, c2) \
  do { \
    int enc_unified_; \
    int enc_charset_ = charset; \
    if (!NILP (unification_table)) \
      { \
        enc_unified_ = unify_char (unification_table, -1, charset, c1, c2); \
        if (enc_unified_ >= 0) \
          { \
            SPLIT_CHAR (enc_unified_, enc_charset_, c1, c2); \
          } \
      } \
    if (CHARSET_DIMENSION (enc_charset_) != 1) \
      { \
        /* CODING may ask for JISX0208-1978 in place of JISX0208.  */ \
        if (charset == charset_jisx0208 \
            && coding->flags & CODING_FLAG_ISO_USE_OLDJIS) \
          enc_charset_ = charset_jisx0208_1978; \
        ENCODE_ISO_CHARACTER_DIMENSION2 (enc_charset_, c1, c2); \
      } \
    else \
      { \
        /* CODING may ask for JISX0201-Roman in place of ASCII.  */ \
        if (charset == CHARSET_ASCII \
            && coding->flags & CODING_FLAG_ISO_USE_ROMAN) \
          enc_charset_ = charset_latin_jisx0201; \
        ENCODE_ISO_CHARACTER_DIMENSION1 (enc_charset_, c1); \
      } \
    if (! COMPOSING_P (coding->composing)) \
      coding->consumed_char++; \
  } while (0)
1578
1579 /* Produce designation and invocation codes at a place pointed by DST
1580 to use CHARSET. The element `spec.iso2022' of *CODING is updated.
1581 Return new DST. */
1582
1583 unsigned char *
1584 encode_invocation_designation (charset, coding, dst)
1585 int charset;
1586 struct coding_system *coding;
1587 unsigned char *dst;
1588 {
1589 int reg; /* graphic register number */
1590
1591 /* At first, check designations. */
1592 for (reg = 0; reg < 4; reg++)
1593 if (charset == CODING_SPEC_ISO_DESIGNATION (coding, reg))
1594 break;
1595
1596 if (reg >= 4)
1597 {
1598 /* CHARSET is not yet designated to any graphic registers. */
1599 /* At first check the requested designation. */
1600 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1601 if (reg == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
1602 /* Since CHARSET requests no special designation, designate it
1603 to graphic register 0. */
1604 reg = 0;
1605
1606 ENCODE_DESIGNATION (charset, reg, coding);
1607 }
1608
1609 if (CODING_SPEC_ISO_INVOCATION (coding, 0) != reg
1610 && CODING_SPEC_ISO_INVOCATION (coding, 1) != reg)
1611 {
1612 /* Since the graphic register REG is not invoked to any graphic
1613 planes, invoke it to graphic plane 0. */
1614 switch (reg)
1615 {
1616 case 0: /* graphic register 0 */
1617 ENCODE_SHIFT_IN;
1618 break;
1619
1620 case 1: /* graphic register 1 */
1621 ENCODE_SHIFT_OUT;
1622 break;
1623
1624 case 2: /* graphic register 2 */
1625 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1626 ENCODE_SINGLE_SHIFT_2;
1627 else
1628 ENCODE_LOCKING_SHIFT_2;
1629 break;
1630
1631 case 3: /* graphic register 3 */
1632 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1633 ENCODE_SINGLE_SHIFT_3;
1634 else
1635 ENCODE_LOCKING_SHIFT_3;
1636 break;
1637 }
1638 }
1639 return dst;
1640 }
1641
/* The following three macros write at DST the escape sequence that
   starts a composition without rules (ESC '0'), starts a composition
   with rules (ESC '2'), or ends a composition (ESC '1').  Each
   expansion is a parenthesized expression so that it stays a single
   unit wherever it is used.  */
#define ENCODE_COMPOSITION_NO_RULE_START (*dst++ = ISO_CODE_ESC, *dst++ = '0')
#define ENCODE_COMPOSITION_WITH_RULE_START (*dst++ = ISO_CODE_ESC, *dst++ = '2')
#define ENCODE_COMPOSITION_END (*dst++ = ISO_CODE_ESC, *dst++ = '1')
1646
/* The following three macros produce codes for indicating direction
   of text.  Each expands to a single statement.

   ENCODE_CONTROL_SEQUENCE_INTRODUCER writes ESC '[' when CODING has
   CODING_FLAG_ISO_SEVEN_BITS set (tested as a bit, like the
   single-shift macros do) and the one-byte CSI otherwise.
   ENCODE_DIRECTION_R2L and ENCODE_DIRECTION_L2R follow the introducer
   with "2]" and "0]" respectively.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER \
  do { \
    if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
      *dst++ = ISO_CODE_ESC, *dst++ = '['; \
    else \
      *dst++ = ISO_CODE_CSI; \
  } while (0)

#define ENCODE_DIRECTION_R2L \
  do { \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER; \
    *dst++ = '2'; \
    *dst++ = ']'; \
  } while (0)

#define ENCODE_DIRECTION_L2R \
  do { \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER; \
    *dst++ = '0'; \
    *dst++ = ']'; \
  } while (0)
1662
/* Write at DST the invocation and designation codes that bring the
   graphic planes and registers back to their initial state: shift-in
   if graphic plane 0 does not have register 0 invoked, then a
   designation for every register whose initial charset is defined
   and differs from the current one.  */
#define ENCODE_RESET_PLANE_AND_REGISTER \
  do { \
    int reset_reg_ = 0; \
    if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0) \
      { \
        ENCODE_SHIFT_IN; \
      } \
    while (reset_reg_ < 4) \
      { \
        if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reset_reg_) >= 0 \
            && (CODING_SPEC_ISO_DESIGNATION (coding, reset_reg_) \
                != CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, \
                                                        reset_reg_))) \
          { \
            ENCODE_DESIGNATION \
              (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reset_reg_), \
               reset_reg_, coding); \
          } \
        reset_reg_++; \
      } \
  } while (0)
1677
1678 /* Produce designation sequences of charsets in the line started from
1679 SRC to a place pointed by *DSTP, and update DSTP.
1680
1681 If the current block ends before any end-of-line, we may fail to
1682 find all the necessary designations. */
1683
1684 void
1685 encode_designation_at_bol (coding, table, src, src_end, dstp)
1686 struct coding_system *coding;
1687 Lisp_Object table;
1688 unsigned char *src, *src_end, **dstp;
1689 {
1690 int charset, c, found = 0, reg;
1691 /* Table of charsets to be designated to each graphic register. */
1692 int r[4];
1693 unsigned char *dst = *dstp;
1694
1695 for (reg = 0; reg < 4; reg++)
1696 r[reg] = -1;
1697
1698 while (src < src_end && *src != '\n' && found < 4)
1699 {
1700 int bytes = BYTES_BY_CHAR_HEAD (*src);
1701
1702 if (NILP (table))
1703 charset = CHARSET_AT (src);
1704 else
1705 {
1706 int c_alt;
1707 unsigned char c1, c2;
1708
1709 SPLIT_STRING(src, bytes, charset, c1, c2);
1710 if ((c_alt = unify_char (table, -1, charset, c1, c2)) >= 0)
1711 charset = CHAR_CHARSET (c_alt);
1712 }
1713
1714 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1715 if (reg != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION && r[reg] < 0)
1716 {
1717 found++;
1718 r[reg] = charset;
1719 }
1720
1721 src += bytes;
1722 }
1723
1724 if (found)
1725 {
1726 for (reg = 0; reg < 4; reg++)
1727 if (r[reg] >= 0
1728 && CODING_SPEC_ISO_DESIGNATION (coding, reg) != r[reg])
1729 ENCODE_DESIGNATION (r[reg], reg, coding);
1730 *dstp = dst;
1731 }
1732 }
1733
1734 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
1735
1736 int
1737 encode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
1738 struct coding_system *coding;
1739 unsigned char *source, *destination;
1740 int src_bytes, dst_bytes;
1741 {
1742 unsigned char *src = source;
1743 unsigned char *src_end = source + src_bytes;
1744 unsigned char *dst = destination;
1745 unsigned char *dst_end = destination + dst_bytes;
1746 /* Since the maximum bytes produced by each loop is 20, we subtract 19
1747 from DST_END to assure overflow checking is necessary only at the
1748 head of loop. */
1749 unsigned char *adjusted_dst_end = dst_end - 19;
1750 Lisp_Object unification_table
1751 = coding->character_unification_table_for_encode;
1752 int result = CODING_FINISH_NORMAL;
1753
1754 if (!NILP (Venable_character_unification) && NILP (unification_table))
1755 unification_table = Vstandard_character_unification_table_for_encode;
1756
1757 coding->consumed_char = 0;
1758 coding->fake_multibyte = 0;
1759 while (src < src_end && (dst_bytes
1760 ? (dst < adjusted_dst_end)
1761 : (dst < src - 19)))
1762 {
1763 /* SRC_BASE remembers the start position in source in each loop.
1764 The loop will be exited when there's not enough source text
1765 to analyze multi-byte codes (within macros ONE_MORE_BYTE,
1766 TWO_MORE_BYTES, and THREE_MORE_BYTES). In that case, SRC is
1767 reset to SRC_BASE before exiting. */
1768 unsigned char *src_base = src;
1769 int charset, c1, c2, c3, c4;
1770
1771 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL
1772 && CODING_SPEC_ISO_BOL (coding))
1773 {
1774 /* We have to produce designation sequences if any now. */
1775 encode_designation_at_bol (coding, unification_table,
1776 src, src_end, &dst);
1777 CODING_SPEC_ISO_BOL (coding) = 0;
1778 }
1779
1780 c1 = *src++;
1781 /* If we are seeing a component of a composite character, we are
1782 seeing a leading-code encoded irregularly for composition, or
1783 a composition rule if composing with rule. We must set C1 to
1784 a normal leading-code or an ASCII code. If we are not seeing
1785 a composite character, we must reset composition,
1786 designation, and invocation states. */
1787 if (COMPOSING_P (coding->composing))
1788 {
1789 if (c1 < 0xA0)
1790 {
1791 /* We are not in a composite character any longer. */
1792 coding->composing = COMPOSING_NO;
1793 ENCODE_RESET_PLANE_AND_REGISTER;
1794 ENCODE_COMPOSITION_END;
1795 }
1796 else
1797 {
1798 if (coding->composing == COMPOSING_WITH_RULE_RULE)
1799 {
1800 *dst++ = c1 & 0x7F;
1801 coding->composing = COMPOSING_WITH_RULE_HEAD;
1802 continue;
1803 }
1804 else if (coding->composing == COMPOSING_WITH_RULE_HEAD)
1805 coding->composing = COMPOSING_WITH_RULE_RULE;
1806 if (c1 == 0xA0)
1807 {
1808 /* This is an ASCII component. */
1809 ONE_MORE_BYTE (c1);
1810 c1 &= 0x7F;
1811 }
1812 else
1813 /* This is a leading-code of non ASCII component. */
1814 c1 -= 0x20;
1815 }
1816 }
1817
1818 /* Now encode one character. C1 is a control character, an
1819 ASCII character, or a leading-code of multi-byte character. */
1820 switch (emacs_code_class[c1])
1821 {
1822 case EMACS_ascii_code:
1823 ENCODE_ISO_CHARACTER (CHARSET_ASCII, c1, /* dummy */ c2);
1824 break;
1825
1826 case EMACS_control_code:
1827 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
1828 ENCODE_RESET_PLANE_AND_REGISTER;
1829 *dst++ = c1;
1830 coding->consumed_char++;
1831 break;
1832
1833 case EMACS_carriage_return_code:
1834 if (! (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
1835 {
1836 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
1837 ENCODE_RESET_PLANE_AND_REGISTER;
1838 *dst++ = c1;
1839 coding->consumed_char++;
1840 break;
1841 }
1842 /* fall down to treat '\r' as '\n' ... */
1843
1844 case EMACS_linefeed_code:
1845 if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
1846 ENCODE_RESET_PLANE_AND_REGISTER;
1847 if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL)
1848 bcopy (coding->spec.iso2022.initial_designation,
1849 coding->spec.iso2022.current_designation,
1850 sizeof coding->spec.iso2022.initial_designation);
1851 if (coding->eol_type == CODING_EOL_LF
1852 || coding->eol_type == CODING_EOL_UNDECIDED)
1853 *dst++ = ISO_CODE_LF;
1854 else if (coding->eol_type == CODING_EOL_CRLF)
1855 *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
1856 else
1857 *dst++ = ISO_CODE_CR;
1858 CODING_SPEC_ISO_BOL (coding) = 1;
1859 coding->consumed_char++;
1860 break;
1861
1862 case EMACS_leading_code_2:
1863 ONE_MORE_BYTE (c2);
1864 if (c2 < 0xA0)
1865 {
1866 /* invalid sequence */
1867 *dst++ = c1;
1868 *dst++ = c2;
1869 coding->consumed_char += 2;
1870 }
1871 else
1872 ENCODE_ISO_CHARACTER (c1, c2, /* dummy */ c3);
1873 break;
1874
1875 case EMACS_leading_code_3:
1876 TWO_MORE_BYTES (c2, c3);
1877 if (c2 < 0xA0 || c3 < 0xA0)
1878 {
1879 /* invalid sequence */
1880 *dst++ = c1;
1881 *dst++ = c2;
1882 *dst++ = c3;
1883 coding->consumed_char += 3;
1884 }
1885 else if (c1 < LEADING_CODE_PRIVATE_11)
1886 ENCODE_ISO_CHARACTER (c1, c2, c3);
1887 else
1888 ENCODE_ISO_CHARACTER (c2, c3, /* dummy */ c4);
1889 break;
1890
1891 case EMACS_leading_code_4:
1892 THREE_MORE_BYTES (c2, c3, c4);
1893 if (c2 < 0xA0 || c3 < 0xA0 || c4 < 0xA0)
1894 {
1895 /* invalid sequence */
1896 *dst++ = c1;
1897 *dst++ = c2;
1898 *dst++ = c3;
1899 *dst++ = c4;
1900 coding->consumed_char += 4;
1901 }
1902 else
1903 ENCODE_ISO_CHARACTER (c2, c3, c4);
1904 break;
1905
1906 case EMACS_leading_code_composition:
1907 ONE_MORE_BYTE (c2);
1908 if (c2 < 0xA0)
1909 {
1910 /* invalid sequence */
1911 *dst++ = c1;
1912 *dst++ = c2;
1913 coding->consumed_char += 2;
1914 }
1915 else if (c2 == 0xFF)
1916 {
1917 ENCODE_RESET_PLANE_AND_REGISTER;
1918 coding->composing = COMPOSING_WITH_RULE_HEAD;
1919 ENCODE_COMPOSITION_WITH_RULE_START;
1920 coding->consumed_char++;
1921 }
1922 else
1923 {
1924 ENCODE_RESET_PLANE_AND_REGISTER;
1925 /* Rewind one byte because it is a character code of
1926 composition elements. */
1927 src--;
1928 coding->composing = COMPOSING_NO_RULE_HEAD;
1929 ENCODE_COMPOSITION_NO_RULE_START;
1930 coding->consumed_char++;
1931 }
1932 break;
1933
1934 case EMACS_invalid_code:
1935 *dst++ = c1;
1936 coding->consumed_char++;
1937 break;
1938 }
1939 continue;
1940 label_end_of_loop:
1941 result = CODING_FINISH_INSUFFICIENT_SRC;
1942 src = src_base;
1943 break;
1944 }
1945
1946 if (src < src_end)
1947 {
1948 if (result == CODING_FINISH_NORMAL)
1949 result = CODING_FINISH_INSUFFICIENT_DST;
1950 else
1951 /* If this is the last block of the text to be encoded, we
1952 must reset graphic planes and registers to the initial
1953 state, and flush out the carryover if any. */
1954 if (coding->mode & CODING_MODE_LAST_BLOCK)
1955 ENCODE_RESET_PLANE_AND_REGISTER;
1956 }
1957
1958 coding->consumed = src - source;
1959 coding->produced = coding->produced_char = dst - destination;
1960 return result;
1961 }
1962
1963 \f
1964 /*** 4. SJIS and BIG5 handlers ***/
1965
1966 /* Although SJIS and BIG5 are not ISO's coding system, they are used
1967 quite widely. So, for the moment, Emacs supports them in the bare
1968 C code. But, in the future, they may be supported only by CCL. */
1969
/* SJIS is a coding system encoding three character sets: ASCII, right
   half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
   as is.  A character of charset katakana-jisx0201 is encoded by
   "position-code + 0x80".  A character of charset japanese-jisx0208
   is encoded in 2 bytes, but the two position-codes are divided and
   shifted so that they fit in the ranges below.
1976
1977 --- CODE RANGE of SJIS ---
1978 (character set) (range)
1979 ASCII 0x00 .. 0x7F
1980 KATAKANA-JISX0201 0xA0 .. 0xDF
1981 JISX0208 (1st byte) 0x80 .. 0x9F and 0xE0 .. 0xFF
1982 (2nd byte) 0x40 .. 0xFF
1983 -------------------------------
1984
1985 */
1986
1987 /* BIG5 is a coding system encoding two character sets: ASCII and
1988 Big5. An ASCII character is encoded as is. Big5 is a two-byte
1989 character set and is encoded in two-byte.
1990
1991 --- CODE RANGE of BIG5 ---
1992 (character set) (range)
1993 ASCII 0x00 .. 0x7F
1994 Big5 (1st byte) 0xA1 .. 0xFE
1995 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
1996 --------------------------
1997
1998 Since the number of characters in Big5 is larger than maximum
1999 characters in Emacs' charset (96x96), it can't be handled as one
2000 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
2001 and `charset-big5-2'. Both are DIMENSION2 and CHARS94. The former
2002 contains frequently used characters and the latter contains less
2003 frequently used characters. */
2004
/* Macros to decode or encode a character of Big5 in BIG5.  B1 and B2
   are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
   C1 and C2 are the 1st and 2nd position-codes of Emacs' internal
   format.  CHARSET is `charset_big5_1' or `charset_big5_2'.  */
2009
/* Number of Big5 characters which have the same code in 1st byte:
   the 2nd byte is either in 0x40..0x7E (63 codes) or in 0xA1..0xFE
   (94 codes).  */
#define BIG5_SAME_ROW ((0x7E - 0x40 + 1) + (0xFE - 0xA1 + 1))
2012
/* Decode the BIG5 byte pair (B1, B2).  Store `charset_big5_1' in
   CHARSET when B1 is below 0xC9, else `charset_big5_2', and store the
   two resulting position-codes (each offset by 0x21) in C1 and C2.
   B1 and B2 are evaluated more than once, so they must not have side
   effects.  All parameters are parenthesized so that expression
   arguments are handled correctly.  */
#define DECODE_BIG5(b1, b2, charset, c1, c2)                            \
  do {                                                                  \
    /* Linear index of the character counted from code 0xA140.  */     \
    unsigned int temp                                                   \
      = (((b1) - 0xA1) * BIG5_SAME_ROW                                  \
         + (b2) - ((b2) < 0x7F ? 0x40 : 0x62));                         \
    if ((b1) < 0xC9)                                                    \
      (charset) = charset_big5_1;                                       \
    else                                                                \
      {                                                                 \
        (charset) = charset_big5_2;                                     \
        /* Make the index relative to the first code of big5-2.  */    \
        temp -= (0xC9 - 0xA1) * BIG5_SAME_ROW;                          \
      }                                                                 \
    (c1) = temp / (0xFF - 0xA1) + 0x21;                                 \
    (c2) = temp % (0xFF - 0xA1) + 0x21;                                 \
  } while (0)
2027
/* Encode the position-codes C1 and C2 (each offset by 0x21) of a
   character in CHARSET (`charset_big5_1' or `charset_big5_2') and
   store the resulting BIG5 byte pair in B1 and B2.  B2 is used as
   scratch storage while computing, so it must be an lvalue distinct
   from the inputs.  All parameters are parenthesized so that
   expression arguments are handled correctly.  */
#define ENCODE_BIG5(charset, c1, c2, b1, b2)                            \
  do {                                                                  \
    /* Linear index of the character within its Emacs charset.  */     \
    unsigned int temp                                                   \
      = ((c1) - 0x21) * (0xFF - 0xA1) + ((c2) - 0x21);                  \
    if ((charset) == charset_big5_2)                                    \
      temp += BIG5_SAME_ROW * (0xC9 - 0xA1);                            \
    (b1) = temp / BIG5_SAME_ROW + 0xA1;                                 \
    (b2) = temp % BIG5_SAME_ROW;                                        \
    /* The first 0x3F columns map to 0x40..0x7E, the rest to          \
       0xA1..0xFE.  */                                                  \
    (b2) += (b2) < 0x3F ? 0x40 : 0x62;                                  \
  } while (0)
2037
/* Produce one decoded character of CHARSET whose position-codes are
   C1 and C2.  If `unification_table' is non-nil and `unify_char'
   yields a non-negative character, that character (split back into
   charset and position-codes, overwriting C1 and C2) is produced
   instead.  The character is then handed to the DECODE_CHARACTER_XXX
   macro matching its charset.  Expects `unification_table' to be in
   scope at the point of use.  */
#define DECODE_SJIS_BIG5_CHARACTER(charset, c1, c2)                     \
  do {                                                                  \
    int charset_alt = (charset);                                        \
    if (!NILP (unification_table))                                      \
      {                                                                 \
        int unified = unify_char (unification_table,                    \
                                  -1, (charset), c1, c2);               \
        if (unified >= 0)                                               \
          SPLIT_CHAR (unified, charset_alt, c1, c2);                    \
      }                                                                 \
    if (charset_alt < 0 || charset_alt == CHARSET_ASCII)                \
      DECODE_CHARACTER_ASCII (c1);                                      \
    else if (CHARSET_DIMENSION (charset_alt) != 1)                      \
      DECODE_CHARACTER_DIMENSION2 (charset_alt, c1, c2);                \
    else                                                                \
      DECODE_CHARACTER_DIMENSION1 (charset_alt, c1);                    \
  } while (0)
2052
/* Encode one character of CHARSET whose position-codes are C1 and C2
   and store the result at `dst', then count it in
   `coding->consumed_char'.  If `unification_table' is non-nil and
   `unify_char' yields a non-negative character, that character is
   encoded instead (C1 and C2 are overwritten).

   ASCII is stored as is.  With `sjis_p' nonzero, katakana-jisx0201 is
   stored as the single byte C1 and jisx0208 is converted by
   ENCODE_SJIS; with `sjis_p' zero, big5-1 and big5-2 are converted by
   ENCODE_BIG5.  Any other charset is stored unconverted as CHARSET
   followed by its position-codes (up to 3 bytes), and
   `coding->fake_multibyte' is set.

   Expects `dst', `coding', `sjis_p' and `unification_table' to be in
   scope at the point of use.  The definition deliberately has no
   trailing semicolon so that an invocation followed by `;' is exactly
   one statement.  */
#define ENCODE_SJIS_BIG5_CHARACTER(charset, c1, c2)                     \
  do {                                                                  \
    int c_alt, charset_alt;                                             \
    if (!NILP (unification_table)                                       \
        && ((c_alt = unify_char (unification_table, -1, (charset),      \
                                 c1, c2))                               \
            >= 0))                                                      \
      SPLIT_CHAR (c_alt, charset_alt, c1, c2);                          \
    else                                                                \
      charset_alt = (charset);                                          \
    if (charset_alt == charset_ascii)                                   \
      *dst++ = c1;                                                      \
    else if (CHARSET_DIMENSION (charset_alt) == 1)                      \
      {                                                                 \
        if (sjis_p && charset_alt == charset_katakana_jisx0201)         \
          *dst++ = c1;                                                  \
        else                                                            \
          {                                                             \
            *dst++ = charset_alt, *dst++ = c1;                          \
            coding->fake_multibyte = 1;                                 \
          }                                                             \
      }                                                                 \
    else                                                                \
      {                                                                 \
        c1 &= 0x7F, c2 &= 0x7F;                                         \
        if (sjis_p && charset_alt == charset_jisx0208)                  \
          {                                                             \
            unsigned char s1, s2;                                       \
                                                                        \
            ENCODE_SJIS (c1, c2, s1, s2);                               \
            *dst++ = s1, *dst++ = s2;                                   \
            coding->fake_multibyte = 1;                                 \
          }                                                             \
        else if (!sjis_p                                                \
                 && (charset_alt == charset_big5_1                      \
                     || charset_alt == charset_big5_2))                 \
          {                                                             \
            unsigned char b1, b2;                                       \
                                                                        \
            ENCODE_BIG5 (charset_alt, c1, c2, b1, b2);                  \
            *dst++ = b1, *dst++ = b2;                                   \
          }                                                             \
        else                                                            \
          {                                                             \
            *dst++ = charset_alt, *dst++ = c1, *dst++ = c2;             \
            coding->fake_multibyte = 1;                                 \
          }                                                             \
      }                                                                 \
    coding->consumed_char++;                                            \
  } while (0)
2102
2103 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2104 Check if a text is encoded in SJIS. If it is, return
2105 CODING_CATEGORY_MASK_SJIS, else return 0. */
2106
2107 int
2108 detect_coding_sjis (src, src_end)
2109 unsigned char *src, *src_end;
2110 {
2111 unsigned char c;
2112
2113 while (src < src_end)
2114 {
2115 c = *src++;
2116 if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
2117 {
2118 if (src < src_end && *src++ < 0x40)
2119 return 0;
2120 }
2121 }
2122 return CODING_CATEGORY_MASK_SJIS;
2123 }
2124
2125 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2126 Check if a text is encoded in BIG5. If it is, return
2127 CODING_CATEGORY_MASK_BIG5, else return 0. */
2128
2129 int
2130 detect_coding_big5 (src, src_end)
2131 unsigned char *src, *src_end;
2132 {
2133 unsigned char c;
2134
2135 while (src < src_end)
2136 {
2137 c = *src++;
2138 if (c >= 0xA1)
2139 {
2140 if (src >= src_end)
2141 break;
2142 c = *src++;
2143 if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
2144 return 0;
2145 }
2146 }
2147 return CODING_CATEGORY_MASK_BIG5;
2148 }
2149
2150 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
2151 If SJIS_P is 1, decode SJIS text, else decode BIG5 test. */
2152
2153 int
2154 decode_coding_sjis_big5 (coding, source, destination,
2155 src_bytes, dst_bytes, sjis_p)
2156 struct coding_system *coding;
2157 unsigned char *source, *destination;
2158 int src_bytes, dst_bytes;
2159 int sjis_p;
2160 {
2161 unsigned char *src = source;
2162 unsigned char *src_end = source + src_bytes;
2163 unsigned char *dst = destination;
2164 unsigned char *dst_end = destination + dst_bytes;
2165 /* Since the maximum bytes produced by each loop is 4, we subtract 3
2166 from DST_END to assure overflow checking is necessary only at the
2167 head of loop. */
2168 unsigned char *adjusted_dst_end = dst_end - 3;
2169 Lisp_Object unification_table
2170 = coding->character_unification_table_for_decode;
2171 int result = CODING_FINISH_NORMAL;
2172
2173 if (!NILP (Venable_character_unification) && NILP (unification_table))
2174 unification_table = Vstandard_character_unification_table_for_decode;
2175
2176 coding->produced_char = 0;
2177 coding->fake_multibyte = 0;
2178 while (src < src_end && (dst_bytes
2179 ? (dst < adjusted_dst_end)
2180 : (dst < src - 3)))
2181 {
2182 /* SRC_BASE remembers the start position in source in each loop.
2183 The loop will be exited when there's not enough source text
2184 to analyze two-byte character (within macro ONE_MORE_BYTE).
2185 In that case, SRC is reset to SRC_BASE before exiting. */
2186 unsigned char *src_base = src;
2187 unsigned char c1 = *src++, c2, c3, c4;
2188
2189 if (c1 < 0x20)
2190 {
2191 if (c1 == '\r')
2192 {
2193 if (coding->eol_type == CODING_EOL_CRLF)
2194 {
2195 ONE_MORE_BYTE (c2);
2196 if (c2 == '\n')
2197 *dst++ = c2;
2198 else if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2199 {
2200 result = CODING_FINISH_INCONSISTENT_EOL;
2201 goto label_end_of_loop_2;
2202 }
2203 else
2204 /* To process C2 again, SRC is subtracted by 1. */
2205 *dst++ = c1, src--;
2206 }
2207 else if (coding->eol_type == CODING_EOL_CR)
2208 *dst++ = '\n';
2209 else
2210 *dst++ = c1;
2211 }
2212 else if (c1 == '\n'
2213 && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2214 && (coding->eol_type == CODING_EOL_CR
2215 || coding->eol_type == CODING_EOL_CRLF))
2216 {
2217 result = CODING_FINISH_INCONSISTENT_EOL;
2218 goto label_end_of_loop_2;
2219 }
2220 else
2221 *dst++ = c1;
2222 coding->produced_char++;
2223 }
2224 else if (c1 < 0x80)
2225 DECODE_SJIS_BIG5_CHARACTER (charset_ascii, c1, /* dummy */ c2);
2226 else if (c1 < 0xA0)
2227 {
2228 /* SJIS -> JISX0208 */
2229 if (sjis_p)
2230 {
2231 ONE_MORE_BYTE (c2);
2232 if (c2 >= 0x40)
2233 {
2234 DECODE_SJIS (c1, c2, c3, c4);
2235 DECODE_SJIS_BIG5_CHARACTER (charset_jisx0208, c3, c4);
2236 }
2237 else
2238 goto label_invalid_code_2;
2239 }
2240 else
2241 goto label_invalid_code_1;
2242 }
2243 else if (c1 < 0xE0)
2244 {
2245 /* SJIS -> JISX0201-Kana, BIG5 -> Big5 */
2246 if (sjis_p)
2247 DECODE_SJIS_BIG5_CHARACTER (charset_katakana_jisx0201, c1,
2248 /* dummy */ c2);
2249 else
2250 {
2251 int charset;
2252
2253 ONE_MORE_BYTE (c2);
2254 if ((c2 >= 0x40 && c2 <= 0x7E) || (c2 >= 0xA1 && c2 <= 0xFE))
2255 {
2256 DECODE_BIG5 (c1, c2, charset, c3, c4);
2257 DECODE_SJIS_BIG5_CHARACTER (charset, c3, c4);
2258 }
2259 else
2260 goto label_invalid_code_2;
2261 }
2262 }
2263 else /* C1 >= 0xE0 */
2264 {
2265 /* SJIS -> JISX0208, BIG5 -> Big5 */
2266 if (sjis_p)
2267 {
2268 ONE_MORE_BYTE (c2);
2269 if (c2 >= 0x40)
2270 {
2271 DECODE_SJIS (c1, c2, c3, c4);
2272 DECODE_SJIS_BIG5_CHARACTER (charset_jisx0208, c3, c4);
2273 }
2274 else
2275 goto label_invalid_code_2;
2276 }
2277 else
2278 {
2279 int charset;
2280
2281 ONE_MORE_BYTE (c2);
2282 if ((c2 >= 0x40 && c2 <= 0x7E) || (c2 >= 0xA1 && c2 <= 0xFE))
2283 {
2284 DECODE_BIG5 (c1, c2, charset, c3, c4);
2285 DECODE_SJIS_BIG5_CHARACTER (charset, c3, c4);
2286 }
2287 else
2288 goto label_invalid_code_2;
2289 }
2290 }
2291 continue;
2292
2293 label_invalid_code_1:
2294 *dst++ = c1;
2295 coding->produced_char++;
2296 coding->fake_multibyte = 1;
2297 continue;
2298
2299 label_invalid_code_2:
2300 *dst++ = c1; *dst++= c2;
2301 coding->produced_char += 2;
2302 coding->fake_multibyte = 1;
2303 continue;
2304
2305 label_end_of_loop:
2306 result = CODING_FINISH_INSUFFICIENT_SRC;
2307 label_end_of_loop_2:
2308 src = src_base;
2309 break;
2310 }
2311
2312 if (src < src_end)
2313 {
2314 if (result == CODING_FINISH_NORMAL)
2315 result = CODING_FINISH_INSUFFICIENT_DST;
2316 else if (result != CODING_FINISH_INCONSISTENT_EOL
2317 && coding->mode & CODING_MODE_LAST_BLOCK)
2318 {
2319 src_bytes = src_end - src;
2320 if (dst_bytes && (dst_end - dst < src_bytes))
2321 src_bytes = dst_end - dst;
2322 bcopy (dst, src, src_bytes);
2323 src += src_bytes;
2324 dst += src_bytes;
2325 coding->fake_multibyte = 1;
2326 }
2327 }
2328
2329 coding->consumed = coding->consumed_char = src - source;
2330 coding->produced = dst - destination;
2331 return result;
2332 }
2333
2334 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
2335 This function can encode `charset_ascii', `charset_katakana_jisx0201',
2336 `charset_jisx0208', `charset_big5_1', and `charset_big5-2'. We are
2337 sure that all these charsets are registered as official charset
2338 (i.e. do not have extended leading-codes). Characters of other
2339 charsets are produced without any encoding. If SJIS_P is 1, encode
2340 SJIS text, else encode BIG5 text. */
2341
2342 int
2343 encode_coding_sjis_big5 (coding, source, destination,
2344 src_bytes, dst_bytes, sjis_p)
2345 struct coding_system *coding;
2346 unsigned char *source, *destination;
2347 int src_bytes, dst_bytes;
2348 int sjis_p;
2349 {
2350 unsigned char *src = source;
2351 unsigned char *src_end = source + src_bytes;
2352 unsigned char *dst = destination;
2353 unsigned char *dst_end = destination + dst_bytes;
2354 /* Since the maximum bytes produced by each loop is 2, we subtract 1
2355 from DST_END to assure overflow checking is necessary only at the
2356 head of loop. */
2357 unsigned char *adjusted_dst_end = dst_end - 1;
2358 Lisp_Object unification_table
2359 = coding->character_unification_table_for_encode;
2360 int result = CODING_FINISH_NORMAL;
2361
2362 if (!NILP (Venable_character_unification) && NILP (unification_table))
2363 unification_table = Vstandard_character_unification_table_for_encode;
2364
2365 coding->consumed_char = 0;
2366 coding->fake_multibyte = 0;
2367 while (src < src_end && (dst_bytes
2368 ? (dst < adjusted_dst_end)
2369 : (dst < src - 1)))
2370 {
2371 /* SRC_BASE remembers the start position in source in each loop.
2372 The loop will be exited when there's not enough source text
2373 to analyze multi-byte codes (within macros ONE_MORE_BYTE and
2374 TWO_MORE_BYTES). In that case, SRC is reset to SRC_BASE
2375 before exiting. */
2376 unsigned char *src_base = src;
2377 unsigned char c1 = *src++, c2, c3, c4;
2378
2379 if (coding->composing)
2380 {
2381 if (c1 == 0xA0)
2382 {
2383 ONE_MORE_BYTE (c1);
2384 c1 &= 0x7F;
2385 }
2386 else if (c1 >= 0xA0)
2387 c1 -= 0x20;
2388 else
2389 coding->composing = 0;
2390 }
2391
2392 switch (emacs_code_class[c1])
2393 {
2394 case EMACS_ascii_code:
2395 ENCODE_SJIS_BIG5_CHARACTER (charset_ascii, c1, /* dummy */ c2);
2396 break;
2397
2398 case EMACS_control_code:
2399 *dst++ = c1;
2400 coding->consumed_char++;
2401 break;
2402
2403 case EMACS_carriage_return_code:
2404 if (! (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
2405 {
2406 *dst++ = c1;
2407 coding->consumed_char++;
2408 break;
2409 }
2410 /* fall down to treat '\r' as '\n' ... */
2411
2412 case EMACS_linefeed_code:
2413 if (coding->eol_type == CODING_EOL_LF
2414 || coding->eol_type == CODING_EOL_UNDECIDED)
2415 *dst++ = '\n';
2416 else if (coding->eol_type == CODING_EOL_CRLF)
2417 *dst++ = '\r', *dst++ = '\n';
2418 else
2419 *dst++ = '\r';
2420 coding->consumed_char++;
2421 break;
2422
2423 case EMACS_leading_code_2:
2424 ONE_MORE_BYTE (c2);
2425 ENCODE_SJIS_BIG5_CHARACTER (c1, c2, /* dummy */ c3);
2426 break;
2427
2428 case EMACS_leading_code_3:
2429 TWO_MORE_BYTES (c2, c3);
2430 ENCODE_SJIS_BIG5_CHARACTER (c1, c2, c3);
2431 break;
2432
2433 case EMACS_leading_code_4:
2434 THREE_MORE_BYTES (c2, c3, c4);
2435 ENCODE_SJIS_BIG5_CHARACTER (c2, c3, c4);
2436 break;
2437
2438 case EMACS_leading_code_composition:
2439 coding->composing = 1;
2440 break;
2441
2442 default: /* i.e. case EMACS_invalid_code: */
2443 *dst++ = c1;
2444 coding->consumed_char++;
2445 }
2446 continue;
2447
2448 label_end_of_loop:
2449 result = CODING_FINISH_INSUFFICIENT_SRC;
2450 src = src_base;
2451 break;
2452 }
2453
2454 if (result == CODING_FINISH_NORMAL
2455 && src < src_end)
2456 result = CODING_FINISH_INSUFFICIENT_DST;
2457 coding->consumed = src - source;
2458 coding->produced = coding->produced_char = dst - destination;
2459 return result;
2460 }
2461
2462 \f
2463 /*** 5. End-of-line handlers ***/
2464
2465 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
2466 This function is called only when `coding->eol_type' is
2467 CODING_EOL_CRLF or CODING_EOL_CR. */
2468
2469 int
2470 decode_eol (coding, source, destination, src_bytes, dst_bytes)
2471 struct coding_system *coding;
2472 unsigned char *source, *destination;
2473 int src_bytes, dst_bytes;
2474 {
2475 unsigned char *src = source;
2476 unsigned char *src_end = source + src_bytes;
2477 unsigned char *dst = destination;
2478 unsigned char *dst_end = destination + dst_bytes;
2479 unsigned char c;
2480 int result = CODING_FINISH_NORMAL;
2481
2482 coding->fake_multibyte = 0;
2483
2484 if (src_bytes <= 0)
2485 return result;
2486
2487 switch (coding->eol_type)
2488 {
2489 case CODING_EOL_CRLF:
2490 {
2491 /* Since the maximum bytes produced by each loop is 2, we
2492 subtract 1 from DST_END to assure overflow checking is
2493 necessary only at the head of loop. */
2494 unsigned char *adjusted_dst_end = dst_end - 1;
2495
2496 while (src < src_end && (dst_bytes
2497 ? (dst < adjusted_dst_end)
2498 : (dst < src - 1)))
2499 {
2500 unsigned char *src_base = src;
2501
2502 c = *src++;
2503 if (c == '\r')
2504 {
2505 ONE_MORE_BYTE (c);
2506 if (c != '\n')
2507 {
2508 if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2509 {
2510 result = CODING_FINISH_INCONSISTENT_EOL;
2511 goto label_end_of_loop_2;
2512 }
2513 *dst++ = '\r';
2514 if (BASE_LEADING_CODE_P (c))
2515 coding->fake_multibyte = 1;
2516 }
2517 *dst++ = c;
2518 }
2519 else if (c == '\n'
2520 && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL))
2521 {
2522 result = CODING_FINISH_INCONSISTENT_EOL;
2523 goto label_end_of_loop_2;
2524 }
2525 else
2526 {
2527 *dst++ = c;
2528 if (BASE_LEADING_CODE_P (c))
2529 coding->fake_multibyte = 1;
2530 }
2531 continue;
2532
2533 label_end_of_loop:
2534 result = CODING_FINISH_INSUFFICIENT_SRC;
2535 label_end_of_loop_2:
2536 src = src_base;
2537 break;
2538 }
2539 if (result == CODING_FINISH_NORMAL
2540 && src < src_end)
2541 result = CODING_FINISH_INSUFFICIENT_DST;
2542 }
2543 break;
2544
2545 case CODING_EOL_CR:
2546 if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2547 {
2548 while (src < src_end)
2549 {
2550 if ((c = *src++) == '\n')
2551 break;
2552 if (BASE_LEADING_CODE_P (c))
2553 coding->fake_multibyte = 1;
2554 }
2555 if (*--src == '\n')
2556 {
2557 src_bytes = src - source;
2558 result = CODING_FINISH_INCONSISTENT_EOL;
2559 }
2560 }
2561 if (dst_bytes && src_bytes > dst_bytes)
2562 {
2563 result = CODING_FINISH_INSUFFICIENT_DST;
2564 src_bytes = dst_bytes;
2565 }
2566 if (dst_bytes)
2567 bcopy (source, destination, src_bytes);
2568 else
2569 safe_bcopy (source, destination, src_bytes);
2570 src = source + src_bytes;
2571 while (src_bytes--) if (*dst++ == '\r') dst[-1] = '\n';
2572 break;
2573
2574 default: /* i.e. case: CODING_EOL_LF */
2575 if (dst_bytes && src_bytes > dst_bytes)
2576 {
2577 result = CODING_FINISH_INSUFFICIENT_DST;
2578 src_bytes = dst_bytes;
2579 }
2580 if (dst_bytes)
2581 bcopy (source, destination, src_bytes);
2582 else
2583 safe_bcopy (source, destination, src_bytes);
2584 src += src_bytes;
2585 dst += dst_bytes;
2586 coding->fake_multibyte = 1;
2587 break;
2588 }
2589
2590 coding->consumed = coding->consumed_char = src - source;
2591 coding->produced = coding->produced_char = dst - destination;
2592 return result;
2593 }
2594
2595 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions". Encode
2596 format of end-of-line according to `coding->eol_type'. If
2597 `coding->mode & CODING_MODE_SELECTIVE_DISPLAY' is nonzero, code
2598 '\r' in source text also means end-of-line. */
2599
2600 int
2601 encode_eol (coding, source, destination, src_bytes, dst_bytes)
2602 struct coding_system *coding;
2603 unsigned char *source, *destination;
2604 int src_bytes, dst_bytes;
2605 {
2606 unsigned char *src = source;
2607 unsigned char *dst = destination;
2608 int result = CODING_FINISH_NORMAL;
2609
2610 coding->fake_multibyte = 0;
2611
2612 if (coding->eol_type == CODING_EOL_CRLF)
2613 {
2614 unsigned char c;
2615 unsigned char *src_end = source + src_bytes;
2616 unsigned char *dst_end = destination + dst_bytes;
2617 /* Since the maximum bytes produced by each loop is 2, we
2618 subtract 1 from DST_END to assure overflow checking is
2619 necessary only at the head of loop. */
2620 unsigned char *adjusted_dst_end = dst_end - 1;
2621
2622 while (src < src_end && (dst_bytes
2623 ? (dst < adjusted_dst_end)
2624 : (dst < src - 1)))
2625 {
2626 c = *src++;
2627 if (c == '\n'
2628 || (c == '\r' && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY)))
2629 *dst++ = '\r', *dst++ = '\n';
2630 else
2631 {
2632 *dst++ = c;
2633 if (BASE_LEADING_CODE_P (c))
2634 coding->fake_multibyte = 1;
2635 }
2636 }
2637 if (src < src_end)
2638 result = CODING_FINISH_INSUFFICIENT_DST;
2639 }
2640 else
2641 {
2642 unsigned char c;
2643
2644 if (dst_bytes && src_bytes > dst_bytes)
2645 {
2646 src_bytes = dst_bytes;
2647 result = CODING_FINISH_INSUFFICIENT_DST;
2648 }
2649 if (dst_bytes)
2650 bcopy (source, destination, src_bytes);
2651 else
2652 {
2653 safe_bcopy (source, destination, src_bytes);
2654 dst_bytes = src_bytes;
2655 }
2656 if (coding->eol_type == CODING_EOL_CRLF)
2657 {
2658 while (src_bytes--)
2659 {
2660 if ((c = *dst++) == '\n')
2661 dst[-1] = '\r';
2662 else if (BASE_LEADING_CODE_P (c))
2663 coding->fake_multibyte = 1;
2664 }
2665 }
2666 else
2667 {
2668 if (coding->mode & CODING_MODE_SELECTIVE_DISPLAY)
2669 {
2670 while (src_bytes--)
2671 if (*dst++ == '\r') dst[-1] = '\n';
2672 }
2673 coding->fake_multibyte = 1;
2674 }
2675 src = source + dst_bytes;
2676 dst = destination + dst_bytes;
2677 }
2678
2679 coding->consumed = coding->consumed_char = src - source;
2680 coding->produced = coding->produced_char = dst - destination;
2681 return result;
2682 }
2683
2684 \f
2685 /*** 6. C library functions ***/
2686
2687 /* In Emacs Lisp, coding system is represented by a Lisp symbol which
2688 has a property `coding-system'. The value of this property is a
2689 vector of length 5 (called as coding-vector). Among elements of
2690 this vector, the first (element[0]) and the fifth (element[4])
2691 carry important information for decoding/encoding. Before
2692 decoding/encoding, this information should be set in fields of a
2693 structure of type `coding_system'.
2694
2695 A value of property `coding-system' can be a symbol of another
2696 subsidiary coding-system. In that case, Emacs gets coding-vector
2697 from that symbol.
2698
2699 `element[0]' contains information to be set in `coding->type'. The
2700 value and its meaning is as follows:
2701
2702 0 -- coding_type_emacs_mule
2703 1 -- coding_type_sjis
2704 2 -- coding_type_iso2022
2705 3 -- coding_type_big5
2706 4 -- coding_type_ccl encoder/decoder written in CCL
2707 nil -- coding_type_no_conversion
2708 t -- coding_type_undecided (automatic conversion on decoding,
2709 no-conversion on encoding)
2710
2711 `element[4]' contains information to be set in `coding->flags' and
2712 `coding->spec'. The meaning varies by `coding->type'.
2713
2714 If `coding->type' is `coding_type_iso2022', element[4] is a vector
2715 of length 32 (of which the first 13 sub-elements are used now).
2716 Meanings of these sub-elements are:
2717
2718 sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
2719 If the value is an integer of valid charset, the charset is
2720 assumed to be designated to graphic register N initially.
2721
2722 If the value is minus, it is a minus value of charset which
2723 reserves graphic register N, which means that the charset is
2724 not designated initially but should be designated to graphic
2725 register N just before encoding a character in that charset.
2726
2727 If the value is nil, graphic register N is never used on
2728 encoding.
2729
2730 sub-element[N] where N is 4 through 11: to be set in `coding->flags'
2731 Each value takes t or nil. See the section ISO2022 of
2732 `coding.h' for more information.
2733
2734 If `coding->type' is `coding_type_big5', element[4] is t to denote
2735 BIG5-ETen or nil to denote BIG5-HKU.
2736
2737 If `coding->type' takes the other value, element[4] is ignored.
2738
2739 Emacs Lisp's coding system also carries information about format of
2740 end-of-line in a value of property `eol-type'. If the value is
2741 integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
2742 means CODING_EOL_CR. If it is not integer, it should be a vector
2743 of subsidiary coding systems of which property `eol-type' has one
2744 of above values.
2745
2746 */
2747
2748 /* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
2749 and set it in CODING. If CODING_SYSTEM_SYMBOL is invalid, CODING
2750 is setup so that no conversion is necessary and return -1, else
2751 return 0. */
2752
2753 int
2754 setup_coding_system (coding_system, coding)
2755 Lisp_Object coding_system;
2756 struct coding_system *coding;
2757 {
2758 Lisp_Object coding_spec, coding_type, eol_type, plist;
2759 Lisp_Object val;
2760 int i;
2761
2762 /* Initialize some fields required for all kinds of coding systems. */
2763 coding->symbol = coding_system;
2764 coding->common_flags = 0;
2765 coding->mode = 0;
2766 coding->heading_ascii = -1;
2767 coding->post_read_conversion = coding->pre_write_conversion = Qnil;
2768 coding_spec = Fget (coding_system, Qcoding_system);
2769 if (!VECTORP (coding_spec)
2770 || XVECTOR (coding_spec)->size != 5
2771 || !CONSP (XVECTOR (coding_spec)->contents[3]))
2772 goto label_invalid_coding_system;
2773
2774 eol_type = inhibit_eol_conversion ? Qnil : Fget (coding_system, Qeol_type);
2775 if (VECTORP (eol_type))
2776 {
2777 coding->eol_type = CODING_EOL_UNDECIDED;
2778 coding->common_flags = CODING_REQUIRE_DETECTION_MASK;
2779 }
2780 else if (XFASTINT (eol_type) == 1)
2781 {
2782 coding->eol_type = CODING_EOL_CRLF;
2783 coding->common_flags
2784 = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
2785 }
2786 else if (XFASTINT (eol_type) == 2)
2787 {
2788 coding->eol_type = CODING_EOL_CR;
2789 coding->common_flags
2790 = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
2791 }
2792 else
2793 coding->eol_type = CODING_EOL_LF;
2794
2795 coding_type = XVECTOR (coding_spec)->contents[0];
2796 /* Try short cut. */
2797 if (SYMBOLP (coding_type))
2798 {
2799 if (EQ (coding_type, Qt))
2800 {
2801 coding->type = coding_type_undecided;
2802 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
2803 }
2804 else
2805 coding->type = coding_type_no_conversion;
2806 return 0;
2807 }
2808
2809 /* Initialize remaining fields. */
2810 coding->composing = 0;
2811 coding->character_unification_table_for_decode = Qnil;
2812 coding->character_unification_table_for_encode = Qnil;
2813
2814 /* Get values of coding system properties:
2815 `post-read-conversion', `pre-write-conversion',
2816 `character-unification-table-for-decode',
2817 `character-unification-table-for-encode'. */
2818 plist = XVECTOR (coding_spec)->contents[3];
2819 coding->post_read_conversion = Fplist_get (plist, Qpost_read_conversion);
2820 coding->pre_write_conversion = Fplist_get (plist, Qpre_write_conversion);
2821 val = Fplist_get (plist, Qcharacter_unification_table_for_decode);
2822 if (SYMBOLP (val))
2823 val = Fget (val, Qcharacter_unification_table_for_decode);
2824 coding->character_unification_table_for_decode
2825 = CHAR_TABLE_P (val) ? val : Qnil;
2826 val = Fplist_get (plist, Qcharacter_unification_table_for_encode);
2827 if (SYMBOLP (val))
2828 val = Fget (val, Qcharacter_unification_table_for_encode);
2829 coding->character_unification_table_for_encode
2830 = CHAR_TABLE_P (val) ? val : Qnil;
2831 val = Fplist_get (plist, Qcoding_category);
2832 if (!NILP (val))
2833 {
2834 val = Fget (val, Qcoding_category_index);
2835 if (INTEGERP (val))
2836 coding->category_idx = XINT (val);
2837 else
2838 goto label_invalid_coding_system;
2839 }
2840 else
2841 goto label_invalid_coding_system;
2842
2843 val = Fplist_get (plist, Qsafe_charsets);
2844 if (EQ (val, Qt))
2845 {
2846 for (i = 0; i <= MAX_CHARSET; i++)
2847 coding->safe_charsets[i] = 1;
2848 }
2849 else
2850 {
2851 bzero (coding->safe_charsets, MAX_CHARSET + 1);
2852 while (CONSP (val))
2853 {
2854 if ((i = get_charset_id (XCONS (val)->car)) >= 0)
2855 coding->safe_charsets[i] = 1;
2856 val = XCONS (val)->cdr;
2857 }
2858 }
2859
2860 switch (XFASTINT (coding_type))
2861 {
2862 case 0:
2863 coding->type = coding_type_emacs_mule;
2864 if (!NILP (coding->post_read_conversion))
2865 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
2866 if (!NILP (coding->pre_write_conversion))
2867 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
2868 break;
2869
2870 case 1:
2871 coding->type = coding_type_sjis;
2872 coding->common_flags
2873 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
2874 break;
2875
2876 case 2:
2877 coding->type = coding_type_iso2022;
2878 coding->common_flags
2879 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
2880 {
2881 Lisp_Object val, temp;
2882 Lisp_Object *flags;
2883 int i, charset, reg_bits = 0;
2884
2885 val = XVECTOR (coding_spec)->contents[4];
2886
2887 if (!VECTORP (val) || XVECTOR (val)->size != 32)
2888 goto label_invalid_coding_system;
2889
2890 flags = XVECTOR (val)->contents;
2891 coding->flags
2892 = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
2893 | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
2894 | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
2895 | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
2896 | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
2897 | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
2898 | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
2899 | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
2900 | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION)
2901 | (NILP (flags[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL)
2902 | (NILP (flags[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL)
2903 | (NILP (flags[15]) ? 0 : CODING_FLAG_ISO_SAFE)
2904 | (NILP (flags[16]) ? 0 : CODING_FLAG_ISO_LATIN_EXTRA)
2905 );
2906
2907 /* Invoke graphic register 0 to plane 0. */
2908 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
2909 /* Invoke graphic register 1 to plane 1 if we can use full 8-bit. */
2910 CODING_SPEC_ISO_INVOCATION (coding, 1)
2911 = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
2912 /* Not single shifting at first. */
2913 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;
2914 /* Beginning of buffer should also be regarded as bol. */
2915 CODING_SPEC_ISO_BOL (coding) = 1;
2916
2917 for (charset = 0; charset <= MAX_CHARSET; charset++)
2918 CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = 255;
2919 val = Vcharset_revision_alist;
2920 while (CONSP (val))
2921 {
2922 charset = get_charset_id (Fcar_safe (XCONS (val)->car));
2923 if (charset >= 0
2924 && (temp = Fcdr_safe (XCONS (val)->car), INTEGERP (temp))
2925 && (i = XINT (temp), (i >= 0 && (i + '@') < 128)))
2926 CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = i;
2927 val = XCONS (val)->cdr;
2928 }
2929
2930 /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
2931 FLAGS[REG] can be one of below:
2932 integer CHARSET: CHARSET occupies register I,
2933 t: designate nothing to REG initially, but can be used
2934 by any charsets,
2935 list of integer, nil, or t: designate the first
2936 element (if integer) to REG initially, the remaining
2937 elements (if integer) is designated to REG on request,
2938 if an element is t, REG can be used by any charsets,
2939 nil: REG is never used. */
2940 for (charset = 0; charset <= MAX_CHARSET; charset++)
2941 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2942 = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION;
2943 for (i = 0; i < 4; i++)
2944 {
2945 if (INTEGERP (flags[i])
2946 && (charset = XINT (flags[i]), CHARSET_VALID_P (charset))
2947 || (charset = get_charset_id (flags[i])) >= 0)
2948 {
2949 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
2950 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
2951 }
2952 else if (EQ (flags[i], Qt))
2953 {
2954 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2955 reg_bits |= 1 << i;
2956 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
2957 }
2958 else if (CONSP (flags[i]))
2959 {
2960 Lisp_Object tail = flags[i];
2961
2962 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
2963 if (INTEGERP (XCONS (tail)->car)
2964 && (charset = XINT (XCONS (tail)->car),
2965 CHARSET_VALID_P (charset))
2966 || (charset = get_charset_id (XCONS (tail)->car)) >= 0)
2967 {
2968 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
2969 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
2970 }
2971 else
2972 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2973 tail = XCONS (tail)->cdr;
2974 while (CONSP (tail))
2975 {
2976 if (INTEGERP (XCONS (tail)->car)
2977 && (charset = XINT (XCONS (tail)->car),
2978 CHARSET_VALID_P (charset))
2979 || (charset = get_charset_id (XCONS (tail)->car)) >= 0)
2980 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2981 = i;
2982 else if (EQ (XCONS (tail)->car, Qt))
2983 reg_bits |= 1 << i;
2984 tail = XCONS (tail)->cdr;
2985 }
2986 }
2987 else
2988 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2989
2990 CODING_SPEC_ISO_DESIGNATION (coding, i)
2991 = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
2992 }
2993
2994 if (reg_bits && ! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
2995 {
2996 /* REG 1 can be used only by locking shift in 7-bit env. */
2997 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
2998 reg_bits &= ~2;
2999 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
3000 /* Without any shifting, only REG 0 and 1 can be used. */
3001 reg_bits &= 3;
3002 }
3003
3004 if (reg_bits)
3005 for (charset = 0; charset <= MAX_CHARSET; charset++)
3006 {
3007 if (CHARSET_VALID_P (charset))
3008 {
3009 /* There exist some default graphic registers to be
3010 used CHARSET. */
3011
3012 /* We had better avoid designating a charset of
3013 CHARS96 to REG 0 as far as possible. */
3014 if (CHARSET_CHARS (charset) == 96)
3015 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3016 = (reg_bits & 2
3017 ? 1 : (reg_bits & 4 ? 2 : (reg_bits & 8 ? 3 : 0)));
3018 else
3019 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3020 = (reg_bits & 1
3021 ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
3022 }
3023 }
3024 }
3025 coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
3026 coding->spec.iso2022.last_invalid_designation_register = -1;
3027 break;
3028
3029 case 3:
3030 coding->type = coding_type_big5;
3031 coding->common_flags
3032 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3033 coding->flags
3034 = (NILP (XVECTOR (coding_spec)->contents[4])
3035 ? CODING_FLAG_BIG5_HKU
3036 : CODING_FLAG_BIG5_ETEN);
3037 break;
3038
3039 case 4:
3040 coding->type = coding_type_ccl;
3041 coding->common_flags
3042 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3043 {
3044 Lisp_Object val = XVECTOR (coding_spec)->contents[4];
3045 if (CONSP (val)
3046 && VECTORP (XCONS (val)->car)
3047 && VECTORP (XCONS (val)->cdr))
3048 {
3049 setup_ccl_program (&(coding->spec.ccl.decoder), XCONS (val)->car);
3050 setup_ccl_program (&(coding->spec.ccl.encoder), XCONS (val)->cdr);
3051 }
3052 else
3053 goto label_invalid_coding_system;
3054 }
3055 coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
3056 break;
3057
3058 case 5:
3059 coding->type = coding_type_raw_text;
3060 break;
3061
3062 default:
3063 goto label_invalid_coding_system;
3064 }
3065 return 0;
3066
3067 label_invalid_coding_system:
3068 coding->type = coding_type_no_conversion;
3069 coding->category_idx = CODING_CATEGORY_IDX_BINARY;
3070 coding->common_flags = 0;
3071 coding->eol_type = CODING_EOL_LF;
3072 coding->pre_write_conversion = coding->post_read_conversion = Qnil;
3073 return -1;
3074 }
3075
3076 /* Emacs has a mechanism to automatically detect a coding system if it
3077 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
3078 it's impossible to distinguish some coding systems accurately
3079 because they use the same range of codes. So, at first, coding
3080 systems are categorized into the following categories:
3081
3082 o coding-category-emacs-mule
3083
3084 The category for a coding system which has the same code range
3085 as Emacs' internal format. Assigned the coding-system (Lisp
3086 symbol) `emacs-mule' by default.
3087
3088 o coding-category-sjis
3089
3090 The category for a coding system which has the same code range
3091 as SJIS. Assigned the coding-system (Lisp
3092 symbol) `japanese-shift-jis' by default.
3093
3094 o coding-category-iso-7
3095
3096 The category for a coding system which has the same code range
3097 as ISO2022 of 7-bit environment. This doesn't use any locking
3098 shift and single shift functions. This can encode/decode all
3099 charsets. Assigned the coding-system (Lisp symbol)
3100 `iso-2022-7bit' by default.
3101
3102 o coding-category-iso-7-tight
3103
3104 Same as coding-category-iso-7 except that this can
3105 encode/decode only the specified charsets.
3106
3107 o coding-category-iso-8-1
3108
3109 The category for a coding system which has the same code range
3110 as ISO2022 of 8-bit environment and graphic plane 1 used only
3111 for DIMENSION1 charset. This doesn't use any locking shift
3112 and single shift functions. Assigned the coding-system (Lisp
3113 symbol) `iso-latin-1' by default.
3114
3115 o coding-category-iso-8-2
3116
3117 The category for a coding system which has the same code range
3118 as ISO2022 of 8-bit environment and graphic plane 1 used only
3119 for DIMENSION2 charset. This doesn't use any locking shift
3120 and single shift functions. Assigned the coding-system (Lisp
3121 symbol) `japanese-iso-8bit' by default.
3122
3123 o coding-category-iso-7-else
3124
3125 The category for a coding system which has the same code range
3126 as ISO2022 of 7-bit environment but uses locking shift or
3127 single shift functions. Assigned the coding-system (Lisp
3128 symbol) `iso-2022-7bit-lock' by default.
3129
3130 o coding-category-iso-8-else
3131
3132 The category for a coding system which has the same code range
3133 as ISO2022 of 8-bit environment but uses locking shift or
3134 single shift functions. Assigned the coding-system (Lisp
3135 symbol) `iso-2022-8bit-ss2' by default.
3136
3137 o coding-category-big5
3138
3139 The category for a coding system which has the same code range
3140 as BIG5. Assigned the coding-system (Lisp symbol)
3141 `cn-big5' by default.
3142
3143 o coding-category-binary
3144
3145 The category for a coding system not categorized in any of the
3146 above. Assigned the coding-system (Lisp symbol)
3147 `no-conversion' by default.
3148
3149 Each of them is a Lisp symbol and the value is an actual
3150 `coding-system's (this is also a Lisp symbol) assigned by a user.
3151 What Emacs does actually is to detect a category of coding system.
3152 Then, it uses a `coding-system' assigned to it. If Emacs can't
3153 decide only one possible category, it selects a category of the
3154 highest priority. Priorities of categories are also specified by a
3155 user in a Lisp variable `coding-category-list'.
3156
3157 */
3158
3159 /* Detect how a text of length SRC_BYTES pointed by SOURCE is encoded.
3160 If it detects possible coding systems, return an integer in which
3161 appropriate flag bits are set. Flag bits are defined by macros
3162 CODING_CATEGORY_MASK_XXX in `coding.h'.
3163
3164 How many ASCII characters are at the head is returned as *SKIP. */
3165
3166 static int
3167 detect_coding_mask (source, src_bytes, priorities, skip)
3168 unsigned char *source;
3169 int src_bytes, *priorities, *skip;
3170 {
3171 register unsigned char c;
3172 unsigned char *src = source, *src_end = source + src_bytes;
3173 unsigned int mask = (CODING_CATEGORY_MASK_ISO_7BIT
3174 | CODING_CATEGORY_MASK_ISO_SHIFT);
3175 int i;
3176
3177 /* At first, skip all ASCII characters and control characters except
3178 for three ISO2022 specific control characters. */
3179 label_loop_detect_coding:
3180 while (src < src_end)
3181 {
3182 c = *src;
3183 if (c >= 0x80
3184 || ((mask & CODING_CATEGORY_MASK_ISO_7BIT)
3185 && c == ISO_CODE_ESC)
3186 || ((mask & CODING_CATEGORY_MASK_ISO_SHIFT)
3187 && (c == ISO_CODE_SI || c == ISO_CODE_SO)))
3188 break;
3189 src++;
3190 }
3191 *skip = src - source;
3192
3193 if (src >= src_end)
3194 /* We found nothing other than ASCII. There's nothing to do. */
3195 return 0;
3196
3197 /* The text seems to be encoded in some multilingual coding system.
3198 Now, try to find in which coding system the text is encoded. */
3199 if (c < 0x80)
3200 {
3201 /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
3202 /* C is an ISO2022 specific control code of C0. */
3203 mask = detect_coding_iso2022 (src, src_end);
3204 if (mask == 0)
3205 {
3206 /* No valid ISO2022 code follows C. Try again. */
3207 src++;
3208 mask = (c != ISO_CODE_ESC
3209 ? CODING_CATEGORY_MASK_ISO_7BIT
3210 : CODING_CATEGORY_MASK_ISO_SHIFT);
3211 goto label_loop_detect_coding;
3212 }
3213 if (priorities)
3214 goto label_return_highest_only;
3215 }
3216 else
3217 {
3218 int try;
3219
3220 if (c < 0xA0)
3221 {
3222 /* C is the first byte of SJIS character code,
3223 or a leading-code of Emacs' internal format (emacs-mule). */
3224 try = CODING_CATEGORY_MASK_SJIS | CODING_CATEGORY_MASK_EMACS_MULE;
3225
3226 /* Or, if C is a special latin extra code,
3227 or is an ISO2022 specific control code of C1 (SS2 or SS3),
3228 or is an ISO2022 control-sequence-introducer (CSI),
3229 we should also consider the possibility of ISO2022 codings. */
3230 if ((VECTORP (Vlatin_extra_code_table)
3231 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
3232 || (c == ISO_CODE_SS2 || c == ISO_CODE_SS3)
3233 || (c == ISO_CODE_CSI
3234 && (src < src_end
3235 && (*src == ']'
3236 || ((*src == '0' || *src == '1' || *src == '2')
3237 && src + 1 < src_end
3238 && src[1] == ']')))))
3239 try |= (CODING_CATEGORY_MASK_ISO_8_ELSE
3240 | CODING_CATEGORY_MASK_ISO_8BIT);
3241 }
3242 else
3243 /* C is a character of ISO2022 in graphic plane right,
3244 or a SJIS's 1-byte character code (i.e. JISX0201),
3245 or the first byte of BIG5's 2-byte code. */
3246 try = (CODING_CATEGORY_MASK_ISO_8_ELSE
3247 | CODING_CATEGORY_MASK_ISO_8BIT
3248 | CODING_CATEGORY_MASK_SJIS
3249 | CODING_CATEGORY_MASK_BIG5);
3250
3251 mask = 0;
3252 if (priorities)
3253 {
3254 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3255 {
3256 priorities[i] &= try;
3257 if (priorities[i] & CODING_CATEGORY_MASK_ISO)
3258 mask = detect_coding_iso2022 (src, src_end);
3259 else if (priorities[i] & CODING_CATEGORY_MASK_SJIS)
3260 mask = detect_coding_sjis (src, src_end);
3261 else if (priorities[i] & CODING_CATEGORY_MASK_BIG5)
3262 mask = detect_coding_big5 (src, src_end);
3263 else if (priorities[i] & CODING_CATEGORY_MASK_EMACS_MULE)
3264 mask = detect_coding_emacs_mule (src, src_end);
3265 if (mask)
3266 goto label_return_highest_only;
3267 }
3268 return CODING_CATEGORY_MASK_RAW_TEXT;
3269 }
3270 if (try & CODING_CATEGORY_MASK_ISO)
3271 mask |= detect_coding_iso2022 (src, src_end);
3272 if (try & CODING_CATEGORY_MASK_SJIS)
3273 mask |= detect_coding_sjis (src, src_end);
3274 if (try & CODING_CATEGORY_MASK_BIG5)
3275 mask |= detect_coding_big5 (src, src_end);
3276 if (try & CODING_CATEGORY_MASK_EMACS_MULE)
3277 mask |= detect_coding_emacs_mule (src, src_end);
3278 }
3279 return (mask | CODING_CATEGORY_MASK_RAW_TEXT);
3280
3281 label_return_highest_only:
3282 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3283 {
3284 if (mask & priorities[i])
3285 return priorities[i];
3286 }
3287 return CODING_CATEGORY_MASK_RAW_TEXT;
3288 }
3289
3290 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
3291 The information of the detected coding system is set in CODING. */
3292
3293 void
3294 detect_coding (coding, src, src_bytes)
3295 struct coding_system *coding;
3296 unsigned char *src;
3297 int src_bytes;
3298 {
3299 unsigned int idx;
3300 int skip, mask, i;
3301 int priorities[CODING_CATEGORY_IDX_MAX];
3302 Lisp_Object val = Vcoding_category_list;
3303
3304 i = 0;
3305 while (CONSP (val) && i < CODING_CATEGORY_IDX_MAX)
3306 {
3307 if (! SYMBOLP (XCONS (val)->car))
3308 break;
3309 idx = XFASTINT (Fget (XCONS (val)->car, Qcoding_category_index));
3310 if (idx >= CODING_CATEGORY_IDX_MAX)
3311 break;
3312 priorities[i++] = (1 << idx);
3313 val = XCONS (val)->cdr;
3314 }
3315 /* If coding-category-list is valid and contains all coding
3316 categories, `i' should be CODING_CATEGORY_IDX_MAX now. If not,
3317 the following code saves Emacs from craching. */
3318 while (i < CODING_CATEGORY_IDX_MAX)
3319 priorities[i++] = CODING_CATEGORY_MASK_RAW_TEXT;
3320
3321 mask = detect_coding_mask (src, src_bytes, priorities, &skip);
3322 coding->heading_ascii = skip;
3323
3324 if (!mask) return;
3325
3326 /* We found a single coding system of the highest priority in MASK. */
3327 idx = 0;
3328 while (mask && ! (mask & 1)) mask >>= 1, idx++;
3329 if (! mask)
3330 idx = CODING_CATEGORY_IDX_RAW_TEXT;
3331
3332 val = XSYMBOL (XVECTOR (Vcoding_category_table)->contents[idx])->value;
3333
3334 if (coding->eol_type != CODING_EOL_UNDECIDED)
3335 {
3336 Lisp_Object tmp = Fget (val, Qeol_type);
3337
3338 if (VECTORP (tmp))
3339 val = XVECTOR (tmp)->contents[coding->eol_type];
3340 }
3341 setup_coding_system (val, coding);
3342 /* Set this again because setup_coding_system reset this member. */
3343 coding->heading_ascii = skip;
3344 }
3345
3346 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
3347 SOURCE is encoded. Return one of CODING_EOL_LF, CODING_EOL_CRLF,
3348 CODING_EOL_CR, and CODING_EOL_UNDECIDED.
3349
3350 How many non-eol characters are at the head is returned as *SKIP. */
3351
3352 #define MAX_EOL_CHECK_COUNT 3
3353
3354 static int
3355 detect_eol_type (source, src_bytes, skip)
3356 unsigned char *source;
3357 int src_bytes, *skip;
3358 {
3359 unsigned char *src = source, *src_end = src + src_bytes;
3360 unsigned char c;
3361 int total = 0; /* How many end-of-lines are found so far. */
3362 int eol_type = CODING_EOL_UNDECIDED;
3363 int this_eol_type;
3364
3365 *skip = 0;
3366
3367 while (src < src_end && total < MAX_EOL_CHECK_COUNT)
3368 {
3369 c = *src++;
3370 if (c == '\n' || c == '\r')
3371 {
3372 if (*skip == 0)
3373 *skip = src - 1 - source;
3374 total++;
3375 if (c == '\n')
3376 this_eol_type = CODING_EOL_LF;
3377 else if (src >= src_end || *src != '\n')
3378 this_eol_type = CODING_EOL_CR;
3379 else
3380 this_eol_type = CODING_EOL_CRLF, src++;
3381
3382 if (eol_type == CODING_EOL_UNDECIDED)
3383 /* This is the first end-of-line. */
3384 eol_type = this_eol_type;
3385 else if (eol_type != this_eol_type)
3386 {
3387 /* The found type is different from what found before. */
3388 eol_type = CODING_EOL_INCONSISTENT;
3389 break;
3390 }
3391 }
3392 }
3393
3394 if (*skip == 0)
3395 *skip = src_end - source;
3396 return eol_type;
3397 }
3398
3399 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
3400 is encoded. If it detects an appropriate format of end-of-line, it
3401 sets the information in *CODING. */
3402
3403 void
3404 detect_eol (coding, src, src_bytes)
3405 struct coding_system *coding;
3406 unsigned char *src;
3407 int src_bytes;
3408 {
3409 Lisp_Object val;
3410 int skip;
3411 int eol_type = detect_eol_type (src, src_bytes, &skip);
3412
3413 if (coding->heading_ascii > skip)
3414 coding->heading_ascii = skip;
3415 else
3416 skip = coding->heading_ascii;
3417
3418 if (eol_type == CODING_EOL_UNDECIDED)
3419 return;
3420 if (eol_type == CODING_EOL_INCONSISTENT)
3421 {
3422 #if 0
3423 /* This code is suppressed until we find a better way to
3424 distinguish raw text file and binary file. */
3425
3426 /* If we have already detected that the coding is raw-text, the
3427 coding should actually be no-conversion. */
3428 if (coding->type == coding_type_raw_text)
3429 {
3430 setup_coding_system (Qno_conversion, coding);
3431 return;
3432 }
3433 /* Else, let's decode only text code anyway. */
3434 #endif /* 0 */
3435 eol_type = CODING_EOL_LF;
3436 }
3437
3438 val = Fget (coding->symbol, Qeol_type);
3439 if (VECTORP (val) && XVECTOR (val)->size == 3)
3440 {
3441 setup_coding_system (XVECTOR (val)->contents[eol_type], coding);
3442 coding->heading_ascii = skip;
3443 }
3444 }
3445
3446 #define CONVERSION_BUFFER_EXTRA_ROOM 256
3447
3448 #define DECODING_BUFFER_MAG(coding) \
3449 (coding->type == coding_type_iso2022 \
3450 ? 3 \
3451 : ((coding->type == coding_type_sjis || coding->type == coding_type_big5) \
3452 ? 2 \
3453 : (coding->type == coding_type_raw_text \
3454 ? 1 \
3455 : (coding->type == coding_type_ccl \
3456 ? coding->spec.ccl.decoder.buf_magnification \
3457 : 2))))
3458
3459 /* Return maximum size (bytes) of a buffer enough for decoding
3460 SRC_BYTES of text encoded in CODING. */
3461
3462 int
3463 decoding_buffer_size (coding, src_bytes)
3464 struct coding_system *coding;
3465 int src_bytes;
3466 {
3467 return (src_bytes * DECODING_BUFFER_MAG (coding)
3468 + CONVERSION_BUFFER_EXTRA_ROOM);
3469 }
3470
3471 /* Return maximum size (bytes) of a buffer enough for encoding
3472 SRC_BYTES of text to CODING. */
3473
3474 int
3475 encoding_buffer_size (coding, src_bytes)
3476 struct coding_system *coding;
3477 int src_bytes;
3478 {
3479 int magnification;
3480
3481 if (coding->type == coding_type_ccl)
3482 magnification = coding->spec.ccl.encoder.buf_magnification;
3483 else
3484 magnification = 3;
3485
3486 return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
3487 }
3488
#ifndef MINIMUM_CONVERSION_BUFFER_SIZE
#define MINIMUM_CONVERSION_BUFFER_SIZE 1024
#endif

/* Work buffer handed out by get_conversion_buffer, and its current
   size in bytes.  */
char *conversion_buffer;
int conversion_buffer_size;

/* Return a pointer to a buffer of at least SIZE bytes to be used for
   encoding or decoding.  The buffer is shared: when it is too small,
   it is replaced by a larger one and the previous contents are
   discarded.

   NOTE(review): the value returned by xmalloc is not checked here;
   presumably xmalloc does not return on memory exhaustion -- confirm
   before relying on a NULL return.  */

char *
get_conversion_buffer (size)
     int size;
{
  if (size > conversion_buffer_size)
    {
      char *buf;
      int real_size = conversion_buffer_size * 2;

      /* If the buffer has not been set up yet, its size is 0 and
         doubling it would never reach SIZE.  Start from the minimum
         size in that case.  */
      if (real_size < MINIMUM_CONVERSION_BUFFER_SIZE)
        real_size = MINIMUM_CONVERSION_BUFFER_SIZE;

      while (real_size < size) real_size *= 2;
      buf = (char *) xmalloc (real_size);
      /* How xfree treats a null pointer is not visible here, so do
         not pass one.  */
      if (conversion_buffer)
        xfree (conversion_buffer);
      conversion_buffer = buf;
      conversion_buffer_size = real_size;
    }
  return conversion_buffer;
}
3517
3518 int
3519 ccl_coding_driver (coding, source, destination, src_bytes, dst_bytes, encodep)
3520 struct coding_system *coding;
3521 unsigned char *source, *destination;
3522 int src_bytes, dst_bytes, encodep;
3523 {
3524 struct ccl_program *ccl
3525 = encodep ? &coding->spec.ccl.encoder : &coding->spec.ccl.decoder;
3526 int result;
3527
3528 coding->produced = ccl_driver (ccl, source, destination,
3529 src_bytes, dst_bytes, &(coding->consumed));
3530 if (encodep)
3531 {
3532 coding->produced_char = coding->produced;
3533 coding->consumed_char
3534 = multibyte_chars_in_text (source, coding->consumed);
3535 }
3536 else
3537 {
3538 coding->produced_char
3539 = multibyte_chars_in_text (destination, coding->produced);
3540 coding->consumed_char = coding->consumed;
3541 }
3542 switch (ccl->status)
3543 {
3544 case CCL_STAT_SUSPEND_BY_SRC:
3545 result = CODING_FINISH_INSUFFICIENT_SRC;
3546 break;
3547 case CCL_STAT_SUSPEND_BY_DST:
3548 result = CODING_FINISH_INSUFFICIENT_DST;
3549 break;
3550 default:
3551 result = CODING_FINISH_NORMAL;
3552 break;
3553 }
3554 return result;
3555 }
3556
3557 /* See "GENERAL NOTES about `decode_coding_XXX ()' functions". Before
3558 decoding, it may detect coding system and format of end-of-line if
3559 those are not yet decided. */
3560
3561 int
3562 decode_coding (coding, source, destination, src_bytes, dst_bytes)
3563 struct coding_system *coding;
3564 unsigned char *source, *destination;
3565 int src_bytes, dst_bytes;
3566 {
3567 int result;
3568
3569 if (src_bytes <= 0)
3570 {
3571 coding->produced = coding->produced_char = 0;
3572 coding->consumed = coding->consumed_char = 0;
3573 coding->fake_multibyte = 0;
3574 return CODING_FINISH_NORMAL;
3575 }
3576
3577 if (coding->type == coding_type_undecided)
3578 detect_coding (coding, source, src_bytes);
3579
3580 if (coding->eol_type == CODING_EOL_UNDECIDED)
3581 detect_eol (coding, source, src_bytes);
3582
3583 switch (coding->type)
3584 {
3585 case coding_type_emacs_mule:
3586 case coding_type_undecided:
3587 case coding_type_raw_text:
3588 if (coding->eol_type == CODING_EOL_LF
3589 || coding->eol_type == CODING_EOL_UNDECIDED)
3590 goto label_no_conversion;
3591 result = decode_eol (coding, source, destination, src_bytes, dst_bytes);
3592 break;
3593
3594 case coding_type_sjis:
3595 result = decode_coding_sjis_big5 (coding, source, destination,
3596 src_bytes, dst_bytes, 1);
3597 break;
3598
3599 case coding_type_iso2022:
3600 result = decode_coding_iso2022 (coding, source, destination,
3601 src_bytes, dst_bytes);
3602 break;
3603
3604 case coding_type_big5:
3605 result = decode_coding_sjis_big5 (coding, source, destination,
3606 src_bytes, dst_bytes, 0);
3607 break;
3608
3609 case coding_type_ccl:
3610 result = ccl_coding_driver (coding, source, destination,
3611 src_bytes, dst_bytes, 0);
3612 break;
3613
3614 default: /* i.e. case coding_type_no_conversion: */
3615 label_no_conversion:
3616 if (dst_bytes && src_bytes > dst_bytes)
3617 {
3618 coding->produced = dst_bytes;
3619 result = CODING_FINISH_INSUFFICIENT_DST;
3620 }
3621 else
3622 {
3623 coding->produced = src_bytes;
3624 result = CODING_FINISH_NORMAL;
3625 }
3626 if (dst_bytes)
3627 bcopy (source, destination, coding->produced);
3628 else
3629 safe_bcopy (source, destination, coding->produced);
3630 coding->fake_multibyte = 1;
3631 coding->consumed
3632 = coding->consumed_char = coding->produced_char = coding->produced;
3633 break;
3634 }
3635
3636 return result;
3637 }
3638
3639 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions". */
3640
3641 int
3642 encode_coding (coding, source, destination, src_bytes, dst_bytes)
3643 struct coding_system *coding;
3644 unsigned char *source, *destination;
3645 int src_bytes, dst_bytes;
3646 {
3647 int result;
3648
3649 if (src_bytes <= 0)
3650 {
3651 coding->produced = coding->produced_char = 0;
3652 coding->consumed = coding->consumed_char = 0;
3653 coding->fake_multibyte = 0;
3654 return CODING_FINISH_NORMAL;
3655 }
3656
3657 switch (coding->type)
3658 {
3659 case coding_type_emacs_mule:
3660 case coding_type_undecided:
3661 case coding_type_raw_text:
3662 if (coding->eol_type == CODING_EOL_LF
3663 || coding->eol_type == CODING_EOL_UNDECIDED)
3664 goto label_no_conversion;
3665 result = encode_eol (coding, source, destination, src_bytes, dst_bytes);
3666 break;
3667
3668 case coding_type_sjis:
3669 result = encode_coding_sjis_big5 (coding, source, destination,
3670 src_bytes, dst_bytes, 1);
3671 break;
3672
3673 case coding_type_iso2022:
3674 result = encode_coding_iso2022 (coding, source, destination,
3675 src_bytes, dst_bytes);
3676 break;
3677
3678 case coding_type_big5:
3679 result = encode_coding_sjis_big5 (coding, source, destination,
3680 src_bytes, dst_bytes, 0);
3681 break;
3682
3683 case coding_type_ccl:
3684 result = ccl_coding_driver (coding, source, destination,
3685 src_bytes, dst_bytes, 1);
3686 break;
3687
3688 default: /* i.e. case coding_type_no_conversion: */
3689 label_no_conversion:
3690 if (dst_bytes && src_bytes > dst_bytes)
3691 {
3692 coding->produced = dst_bytes;
3693 result = CODING_FINISH_INSUFFICIENT_DST;
3694 }
3695 else
3696 {
3697 coding->produced = src_bytes;
3698 result = CODING_FINISH_NORMAL;
3699 }
3700 if (dst_bytes)
3701 bcopy (source, destination, coding->produced);
3702 else
3703 safe_bcopy (source, destination, coding->produced);
3704 if (coding->mode & CODING_MODE_SELECTIVE_DISPLAY)
3705 {
3706 unsigned char *p = destination, *pend = p + coding->produced;
3707 while (p < pend)
3708 if (*p++ == '\015') p[-1] = '\n';
3709 }
3710 coding->fake_multibyte = 1;
3711 coding->consumed
3712 = coding->consumed_char = coding->produced_char = coding->produced;
3713 break;
3714 }
3715
3716 return result;
3717 }
3718
3719 /* Scan text in the region between *BEG and *END (byte positions),
3720 skip characters which we don't have to decode by coding system
3721 CODING at the head and tail, then set *BEG and *END to the region
3722 of the text we actually have to convert. The caller should move
3723 the gap out of the region in advance.
3724
3725 If STR is not NULL, *BEG and *END are indices into STR. */
3726
3727 static void
3728 shrink_decoding_region (beg, end, coding, str)
3729 int *beg, *end;
3730 struct coding_system *coding;
3731 unsigned char *str;
3732 {
3733 unsigned char *begp_orig, *begp, *endp_orig, *endp, c;
3734 int eol_conversion;
3735
3736 if (coding->type == coding_type_ccl
3737 || coding->type == coding_type_undecided
3738 || !NILP (coding->post_read_conversion))
3739 {
3740 /* We can't skip any data. */
3741 return;
3742 }
3743 else if (coding->type == coding_type_no_conversion)
3744 {
3745 /* We need no conversion, but don't have to skip any data here.
3746 Decoding routine handles them effectively anyway. */
3747 return;
3748 }
3749
3750 if (coding->heading_ascii >= 0)
3751 /* Detection routine has already found how much we can skip at the
3752 head. */
3753 *beg += coding->heading_ascii;
3754
3755 if (str)
3756 {
3757 begp_orig = begp = str + *beg;
3758 endp_orig = endp = str + *end;
3759 }
3760 else
3761 {
3762 begp_orig = begp = BYTE_POS_ADDR (*beg);
3763 endp_orig = endp = begp + *end - *beg;
3764 }
3765
3766 eol_conversion = (coding->eol_type != CODING_EOL_LF);
3767
3768 switch (coding->type)
3769 {
3770 case coding_type_emacs_mule:
3771 case coding_type_raw_text:
3772 if (eol_conversion)
3773 {
3774 if (coding->heading_ascii < 0)
3775 while (begp < endp && *begp != '\r' && *begp < 0x80) begp++;
3776 while (begp < endp && *(endp - 1) != '\r' && *(endp - 1) < 0x80)
3777 endp--;
3778 }
3779 else
3780 begp = endp;
3781 break;
3782
3783 case coding_type_sjis:
3784 case coding_type_big5:
3785 /* We can skip all ASCII characters at the head. */
3786 if (coding->heading_ascii < 0)
3787 {
3788 if (eol_conversion)
3789 while (begp < endp && *begp < 0x80 && *begp != '\r') begp++;
3790 else
3791 while (begp < endp && *begp < 0x80) begp++;
3792 }
3793 /* We can skip all ASCII characters at the tail except for the
3794 second byte of SJIS or BIG5 code. */
3795 if (eol_conversion)
3796 while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\r') endp--;
3797 else
3798 while (begp < endp && endp[-1] < 0x80) endp--;
3799 if (begp < endp && endp < endp_orig && endp[-1] >= 0x80)
3800 endp++;
3801 break;
3802
3803 default: /* i.e. case coding_type_iso2022: */
3804 if (coding->heading_ascii < 0)
3805 {
3806 /* We can skip all ASCII characters at the head except for a
3807 few control codes. */
3808 while (begp < endp && (c = *begp) < 0x80
3809 && c != ISO_CODE_CR && c != ISO_CODE_SO
3810 && c != ISO_CODE_SI && c != ISO_CODE_ESC
3811 && (!eol_conversion || c != ISO_CODE_LF))
3812 begp++;
3813 }
3814 switch (coding->category_idx)
3815 {
3816 case CODING_CATEGORY_IDX_ISO_8_1:
3817 case CODING_CATEGORY_IDX_ISO_8_2:
3818 /* We can skip all ASCII characters at the tail. */
3819 if (eol_conversion)
3820 while (begp < endp && (c = endp[-1]) < 0x80 && c != '\r') endp--;
3821 else
3822 while (begp < endp && endp[-1] < 0x80) endp--;
3823 break;
3824
3825 case CODING_CATEGORY_IDX_ISO_7:
3826 case CODING_CATEGORY_IDX_ISO_7_TIGHT:
3827 /* We can skip all charactes at the tail except for ESC and
3828 the following 2-byte at the tail. */
3829 if (eol_conversion)
3830 while (begp < endp
3831 && (c = endp[-1]) < 0x80 && c != ISO_CODE_ESC && c != '\r')
3832 endp--;
3833 else
3834 while (begp < endp
3835 && (c = endp[-1]) < 0x80 && c != ISO_CODE_ESC)
3836 endp--;
3837 if (begp < endp && endp[-1] == ISO_CODE_ESC)
3838 {
3839 if (endp + 1 < endp_orig && end[0] == '(' && end[1] == 'B')
3840 /* This is an ASCII designation sequence. We can
3841 surely skip the tail. */
3842 endp += 2;
3843 else
3844 /* Hmmm, we can't skip the tail. */
3845 endp = endp_orig;
3846 }
3847 }
3848 }
3849 *beg += begp - begp_orig;
3850 *end += endp - endp_orig;
3851 return;
3852 }
3853
3854 /* Like shrink_decoding_region but for encoding. */
3855
3856 static void
3857 shrink_encoding_region (beg, end, coding, str)
3858 int *beg, *end;
3859 struct coding_system *coding;
3860 unsigned char *str;
3861 {
3862 unsigned char *begp_orig, *begp, *endp_orig, *endp;
3863 int eol_conversion;
3864
3865 if (coding->type == coding_type_ccl)
3866 /* We can't skip any data. */
3867 return;
3868 else if (coding->type == coding_type_no_conversion)
3869 {
3870 /* We need no conversion. */
3871 *beg = *end;
3872 return;
3873 }
3874
3875 if (str)
3876 {
3877 begp_orig = begp = str + *beg;
3878 endp_orig = endp = str + *end;
3879 }
3880 else
3881 {
3882 begp_orig = begp = BYTE_POS_ADDR (*beg);
3883 endp_orig = endp = begp + *end - *beg;
3884 }
3885
3886 eol_conversion = (coding->eol_type == CODING_EOL_CR
3887 || coding->eol_type == CODING_EOL_CRLF);
3888
3889 /* Here, we don't have to check coding->pre_write_conversion because
3890 the caller is expected to have handled it already. */
3891 switch (coding->type)
3892 {
3893 case coding_type_undecided:
3894 case coding_type_emacs_mule:
3895 case coding_type_raw_text:
3896 if (eol_conversion)
3897 {
3898 while (begp < endp && *begp != '\n') begp++;
3899 while (begp < endp && endp[-1] != '\n') endp--;
3900 }
3901 else
3902 begp = endp;
3903 break;
3904
3905 case coding_type_iso2022:
3906 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL)
3907 {
3908 unsigned char *bol = begp;
3909 while (begp < endp && *begp < 0x80)
3910 {
3911 begp++;
3912 if (begp[-1] == '\n')
3913 bol = begp;
3914 }
3915 begp = bol;
3916 goto label_skip_tail;
3917 }
3918 /* fall down ... */
3919
3920 default:
3921 /* We can skip all ASCII characters at the head and tail. */
3922 if (eol_conversion)
3923 while (begp < endp && *begp < 0x80 && *begp != '\n') begp++;
3924 else
3925 while (begp < endp && *begp < 0x80) begp++;
3926 label_skip_tail:
3927 if (eol_conversion)
3928 while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\n') endp--;
3929 else
3930 while (begp < endp && *(endp - 1) < 0x80) endp--;
3931 break;
3932 }
3933
3934 *beg += begp - begp_orig;
3935 *end += endp - endp_orig;
3936 return;
3937 }
3938
3939 /* Decode (if ENCODEP is zero) or encode (if ENCODEP is nonzero) the
3940 text from FROM to TO (byte positions are FROM_BYTE and TO_BYTE) by
3941 coding system CODING, and return the status code of code conversion
3942 (currently, this value has no meaning).
3943
3944 How many characters (and bytes) are converted to how many
3945 characters (and bytes) are recorded in members of the structure
3946 CODING.
3947
3948 If REPLACE is nonzero, we do various things as if the original text
3949 is deleted and a new text is inserted. See the comments in
3950 replace_range (insdel.c) to know what we are doing. */
3951
3952 int
3953 code_convert_region (from, from_byte, to, to_byte, coding, encodep, replace)
3954 int from, from_byte, to, to_byte, encodep, replace;
3955 struct coding_system *coding;
3956 {
3957 int len = to - from, len_byte = to_byte - from_byte;
3958 int require, inserted, inserted_byte;
3959 int head_skip, tail_skip, total_skip;
3960 Lisp_Object saved_coding_symbol = Qnil;
3961 int multibyte = !NILP (current_buffer->enable_multibyte_characters);
3962 int first = 1;
3963 int fake_multibyte = 0;
3964 unsigned char *src, *dst;
3965 Lisp_Object deletion = Qnil;
3966
3967 if (replace)
3968 {
3969 int saved_from = from;
3970
3971 prepare_to_modify_buffer (from, to, &from);
3972 if (saved_from != from)
3973 {
3974 to = from + len;
3975 if (multibyte)
3976 from_byte = CHAR_TO_BYTE (from), to_byte = CHAR_TO_BYTE (to);
3977 else
3978 from_byte = from, to_byte = to;
3979 len_byte = to_byte - from_byte;
3980 }
3981 }
3982
3983 if (! encodep && CODING_REQUIRE_DETECTION (coding))
3984 {
3985 /* We must detect encoding of text and eol format. */
3986
3987 if (from < GPT && to > GPT)
3988 move_gap_both (from, from_byte);
3989 if (coding->type == coding_type_undecided)
3990 {
3991 detect_coding (coding, BYTE_POS_ADDR (from_byte), len_byte);
3992 if (coding->type == coding_type_undecided)
3993 /* It seems that the text contains only ASCII, but we
3994 should not left it undecided because the deeper
3995 decoding routine (decode_coding) tries to detect the
3996 encodings again in vain. */
3997 coding->type = coding_type_emacs_mule;
3998 }
3999 if (coding->eol_type == CODING_EOL_UNDECIDED)
4000 {
4001 saved_coding_symbol = coding->symbol;
4002 detect_eol (coding, BYTE_POS_ADDR (from_byte), len_byte);
4003 if (coding->eol_type == CODING_EOL_UNDECIDED)
4004 coding->eol_type = CODING_EOL_LF;
4005 /* We had better recover the original eol format if we
4006 encounter an inconsitent eol format while decoding. */
4007 coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4008 }
4009 }
4010
4011 coding->consumed_char = len, coding->consumed = len_byte;
4012
4013 if (encodep
4014 ? ! CODING_REQUIRE_ENCODING (coding)
4015 : ! CODING_REQUIRE_DECODING (coding))
4016 {
4017 coding->produced = len_byte;
4018 if (multibyte
4019 && ! replace
4020 /* See the comment of the member heading_ascii in coding.h. */
4021 && coding->heading_ascii < len_byte)
4022 {
4023 /* We still may have to combine byte at the head and the
4024 tail of the text in the region. */
4025 if (from < GPT && GPT < to)
4026 move_gap_both (to, to_byte);
4027 len = multibyte_chars_in_text (BYTE_POS_ADDR (from_byte), len_byte);
4028 adjust_after_insert (from, from_byte, to, to_byte, len);
4029 coding->produced_char = len;
4030 }
4031 else
4032 {
4033 if (!replace)
4034 adjust_after_insert (from, from_byte, to, to_byte, len_byte);
4035 coding->produced_char = len_byte;
4036 }
4037 return 0;
4038 }
4039
4040 /* Now we convert the text. */
4041
4042 /* For encoding, we must process pre-write-conversion in advance. */
4043 if (encodep
4044 && ! NILP (coding->pre_write_conversion)
4045 && SYMBOLP (coding->pre_write_conversion)
4046 && ! NILP (Ffboundp (coding->pre_write_conversion)))
4047 {
4048 /* The function in pre-write-conversion may put a new text in a
4049 new buffer. */
4050 struct buffer *prev = current_buffer, *new;
4051
4052 call2 (coding->pre_write_conversion, from, to);
4053 if (current_buffer != prev)
4054 {
4055 len = ZV - BEGV;
4056 new = current_buffer;
4057 set_buffer_internal_1 (prev);
4058 del_range_2 (from, from_byte, to, to_byte);
4059 insert_from_buffer (new, BEG, len, 0);
4060 to = from + len;
4061 to_byte = multibyte ? CHAR_TO_BYTE (to) : to;
4062 len_byte = to_byte - from_byte;
4063 }
4064 }
4065
4066 if (replace)
4067 deletion = make_buffer_string_both (from, from_byte, to, to_byte, 1);
4068
4069 /* Try to skip the heading and tailing ASCIIs. */
4070 {
4071 int from_byte_orig = from_byte, to_byte_orig = to_byte;
4072
4073 if (from < GPT && GPT < to)
4074 move_gap_both (from, from_byte);
4075 if (encodep)
4076 shrink_encoding_region (&from_byte, &to_byte, coding, NULL);
4077 else
4078 shrink_decoding_region (&from_byte, &to_byte, coding, NULL);
4079 if (from_byte == to_byte)
4080 {
4081 coding->produced = len_byte;
4082 coding->produced_char = multibyte ? len : len_byte;
4083 if (!replace)
4084 /* We must record and adjust for this new text now. */
4085 adjust_after_insert (from, from_byte_orig, to, to_byte_orig, len);
4086 return 0;
4087 }
4088
4089 head_skip = from_byte - from_byte_orig;
4090 tail_skip = to_byte_orig - to_byte;
4091 total_skip = head_skip + tail_skip;
4092 from += head_skip;
4093 to -= tail_skip;
4094 len -= total_skip; len_byte -= total_skip;
4095 }
4096
4097 /* For converion, we must put the gap before the text in addition to
4098 making the gap larger for efficient decoding. The required gap
4099 size starts from 2000 which is the magic number used in make_gap.
4100 But, after one batch of conversion, it will be incremented if we
4101 find that it is not enough . */
4102 require = 2000;
4103
4104 if (GAP_SIZE < require)
4105 make_gap (require - GAP_SIZE);
4106 move_gap_both (from, from_byte);
4107
4108 if (GPT - BEG < beg_unchanged)
4109 beg_unchanged = GPT - BEG;
4110 if (Z - GPT < end_unchanged)
4111 end_unchanged = Z - GPT;
4112
4113 inserted = inserted_byte = 0;
4114 src = GAP_END_ADDR, dst = GPT_ADDR;
4115
4116 GAP_SIZE += len_byte;
4117 ZV -= len;
4118 Z -= len;
4119 ZV_BYTE -= len_byte;
4120 Z_BYTE -= len_byte;
4121
4122 for (;;)
4123 {
4124 int result;
4125
4126 /* The buffer memory is changed from:
4127 +--------+converted-text+---------+-------original-text------+---+
4128 |<-from->|<--inserted-->|---------|<-----------len---------->|---|
4129 |<------------------- GAP_SIZE -------------------->| */
4130 if (encodep)
4131 result = encode_coding (coding, src, dst, len_byte, 0);
4132 else
4133 result = decode_coding (coding, src, dst, len_byte, 0);
4134 /* to:
4135 +--------+-------converted-text--------+--+---original-text--+---+
4136 |<-from->|<--inserted-->|<--produced-->|--|<-(len-consumed)->|---|
4137 |<------------------- GAP_SIZE -------------------->| */
4138 if (coding->fake_multibyte)
4139 fake_multibyte = 1;
4140
4141 if (!encodep && !multibyte)
4142 coding->produced_char = coding->produced;
4143 inserted += coding->produced_char;
4144 inserted_byte += coding->produced;
4145 len_byte -= coding->consumed;
4146 src += coding->consumed;
4147 dst += inserted_byte;
4148
4149 if (! encodep && result == CODING_FINISH_INCONSISTENT_EOL)
4150 {
4151 unsigned char *pend = dst, *p = pend - inserted_byte;
4152
4153 /* Encode LFs back to the original eol format (CR or CRLF). */
4154 if (coding->eol_type == CODING_EOL_CR)
4155 {
4156 while (p < pend) if (*p++ == '\n') p[-1] = '\r';
4157 }
4158 else
4159 {
4160 int count = 0;
4161
4162 while (p < pend) if (*p++ == '\n') count++;
4163 if (src - dst < count)
4164 {
4165 /* We don't have sufficient room for putting LFs
4166 back to CRLF. We must record converted and
4167 not-yet-converted text back to the buffer
4168 content, enlarge the gap, then record them out of
4169 the buffer contents again. */
4170 int add = len_byte + inserted_byte;
4171
4172 GAP_SIZE -= add;
4173 ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
4174 GPT += inserted_byte; GPT_BYTE += inserted_byte;
4175 make_gap (count - GAP_SIZE);
4176 GAP_SIZE += add;
4177 ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
4178 GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
4179 /* Don't forget to update SRC, DST, and PEND. */
4180 src = GAP_END_ADDR - len_byte;
4181 dst = GPT_ADDR + inserted_byte;
4182 pend = dst;
4183 }
4184 inserted += count;
4185 inserted_byte += count;
4186 coding->produced += count;
4187 p = dst = pend + count;
4188 while (count)
4189 {
4190 *--p = *--pend;
4191 if (*p == '\n') count--, *--p = '\r';
4192 }
4193 }
4194
4195 /* Suppress eol-format conversion in the further conversion. */
4196 coding->eol_type = CODING_EOL_LF;
4197
4198 /* Restore the original symbol. */
4199 coding->symbol = saved_coding_symbol;
4200
4201 continue;
4202 }
4203 if (len_byte <= 0)
4204 break;
4205 if (result == CODING_FINISH_INSUFFICIENT_SRC)
4206 {
4207 /* The source text ends in invalid codes. Let's just
4208 make them valid buffer contents, and finish conversion. */
4209 inserted += len_byte;
4210 inserted_byte += len_byte;
4211 while (len_byte--)
4212 *src++ = *dst++;
4213 fake_multibyte = 1;
4214 break;
4215 }
4216 if (first)
4217 {
4218 /* We have just done the first batch of conversion which was
4219 stoped because of insufficient gap. Let's reconsider the
4220 required gap size (i.e. SRT - DST) now.
4221
4222 We have converted ORIG bytes (== coding->consumed) into
4223 NEW bytes (coding->produced). To convert the remaining
4224 LEN bytes, we may need REQUIRE bytes of gap, where:
4225 REQUIRE + LEN_BYTE = LEN_BYTE * (NEW / ORIG)
4226 REQUIRE = LEN_BYTE * (NEW - ORIG) / ORIG
4227 Here, we are sure that NEW >= ORIG. */
4228 float ratio = coding->produced - coding->consumed;
4229 ratio /= coding->consumed;
4230 require = len_byte * ratio;
4231 first = 0;
4232 }
4233 if ((src - dst) < (require + 2000))
4234 {
4235 /* See the comment above the previous call of make_gap. */
4236 int add = len_byte + inserted_byte;
4237
4238 GAP_SIZE -= add;
4239 ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
4240 GPT += inserted_byte; GPT_BYTE += inserted_byte;
4241 make_gap (require + 2000);
4242 GAP_SIZE += add;
4243 ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
4244 GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
4245 /* Don't forget to update SRC, DST. */
4246 src = GAP_END_ADDR - len_byte;
4247 dst = GPT_ADDR + inserted_byte;
4248 }
4249 }
4250 if (src - dst > 0) *dst = 0; /* Put an anchor. */
4251
4252 if (multibyte
4253 && (fake_multibyte
4254 || !encodep && (to - from) != (to_byte - from_byte)))
4255 inserted = multibyte_chars_in_text (GPT_ADDR, inserted_byte);
4256
4257 /* If we have shrinked the conversion area, adjust it now. */
4258 if (total_skip > 0)
4259 {
4260 if (tail_skip > 0)
4261 safe_bcopy (GAP_END_ADDR, GPT_ADDR + inserted_byte, tail_skip);
4262 inserted += total_skip; inserted_byte += total_skip;
4263 GAP_SIZE += total_skip;
4264 GPT -= head_skip; GPT_BYTE -= head_skip;
4265 ZV -= total_skip; ZV_BYTE -= total_skip;
4266 Z -= total_skip; Z_BYTE -= total_skip;
4267 from -= head_skip; from_byte -= head_skip;
4268 to += tail_skip; to_byte += tail_skip;
4269 }
4270
4271 adjust_after_replace (from, from_byte, deletion, inserted, inserted_byte);
4272
4273 if (! encodep && ! NILP (coding->post_read_conversion))
4274 {
4275 Lisp_Object val;
4276 int orig_inserted = inserted, pos = PT;
4277
4278 if (from != pos)
4279 temp_set_point_both (current_buffer, from, from_byte);
4280 val = call1 (coding->post_read_conversion, make_number (inserted));
4281 if (! NILP (val))
4282 {
4283 CHECK_NUMBER (val, 0);
4284 inserted = XFASTINT (val);
4285 }
4286 if (pos >= from + orig_inserted)
4287 temp_set_point (current_buffer, pos + (inserted - orig_inserted));
4288 }
4289
4290 signal_after_change (from, to - from, inserted);
4291
4292 {
4293 coding->consumed = to_byte - from_byte;
4294 coding->consumed_char = to - from;
4295 coding->produced = inserted_byte;
4296 coding->produced_char = inserted;
4297 }
4298
4299 return 0;
4300 }
4301
4302 Lisp_Object
4303 code_convert_string (str, coding, encodep, nocopy)
4304 Lisp_Object str;
4305 struct coding_system *coding;
4306 int encodep, nocopy;
4307 {
4308 int len;
4309 char *buf;
4310 int from = 0, to = XSTRING (str)->size;
4311 int to_byte = STRING_BYTES (XSTRING (str));
4312 struct gcpro gcpro1;
4313 Lisp_Object saved_coding_symbol = Qnil;
4314 int result;
4315
4316 if (encodep && !NILP (coding->pre_write_conversion)
4317 || !encodep && !NILP (coding->post_read_conversion))
4318 {
4319 /* Since we have to call Lisp functions which assume target text
4320 is in a buffer, after setting a temporary buffer, call
4321 code_convert_region. */
4322 int count = specpdl_ptr - specpdl;
4323 struct buffer *prev = current_buffer;
4324
4325 record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
4326 temp_output_buffer_setup (" *code-converting-work*");
4327 set_buffer_internal (XBUFFER (Vstandard_output));
4328 if (encodep)
4329 insert_from_string (str, 0, 0, to, to_byte, 0);
4330 else
4331 {
4332 /* We must insert the contents of STR as is without
4333 unibyte<->multibyte conversion. */
4334 current_buffer->enable_multibyte_characters = Qnil;
4335 insert_from_string (str, 0, 0, to_byte, to_byte, 0);
4336 current_buffer->enable_multibyte_characters = Qt;
4337 }
4338 code_convert_region (BEGV, BEGV_BYTE, ZV, ZV_BYTE, coding, encodep, 1);
4339 if (encodep)
4340 /* We must return the buffer contents as unibyte string. */
4341 current_buffer->enable_multibyte_characters = Qnil;
4342 str = make_buffer_string (BEGV, ZV, 0);
4343 set_buffer_internal (prev);
4344 return unbind_to (count, str);
4345 }
4346
4347 if (! encodep && CODING_REQUIRE_DETECTION (coding))
4348 {
4349 /* See the comments in code_convert_region. */
4350 if (coding->type == coding_type_undecided)
4351 {
4352 detect_coding (coding, XSTRING (str)->data, to_byte);
4353 if (coding->type == coding_type_undecided)
4354 coding->type = coding_type_emacs_mule;
4355 }
4356 if (coding->eol_type == CODING_EOL_UNDECIDED)
4357 {
4358 saved_coding_symbol = coding->symbol;
4359 detect_eol (coding, XSTRING (str)->data, to_byte);
4360 if (coding->eol_type == CODING_EOL_UNDECIDED)
4361 coding->eol_type = CODING_EOL_LF;
4362 /* We had better recover the original eol format if we
4363 encounter an inconsitent eol format while decoding. */
4364 coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4365 }
4366 }
4367
4368 if (encodep
4369 ? ! CODING_REQUIRE_ENCODING (coding)
4370 : ! CODING_REQUIRE_DECODING (coding))
4371 from = to_byte;
4372 else
4373 {
4374 /* Try to skip the heading and tailing ASCIIs. */
4375 if (encodep)
4376 shrink_encoding_region (&from, &to_byte, coding, XSTRING (str)->data);
4377 else
4378 shrink_decoding_region (&from, &to_byte, coding, XSTRING (str)->data);
4379 }
4380 if (from == to_byte)
4381 return (nocopy ? str : Fcopy_sequence (str));
4382
4383 if (encodep)
4384 len = encoding_buffer_size (coding, to_byte - from);
4385 else
4386 len = decoding_buffer_size (coding, to_byte - from);
4387 len += from + STRING_BYTES (XSTRING (str)) - to_byte;
4388 GCPRO1 (str);
4389 buf = get_conversion_buffer (len);
4390 UNGCPRO;
4391
4392 if (from > 0)
4393 bcopy (XSTRING (str)->data, buf, from);
4394 result = (encodep
4395 ? encode_coding (coding, XSTRING (str)->data + from,
4396 buf + from, to_byte - from, len)
4397 : decode_coding (coding, XSTRING (str)->data + from,
4398 buf + from, to_byte - from, len));
4399 if (! encodep && result == CODING_FINISH_INCONSISTENT_EOL)
4400 {
4401 /* We simple try to decode the whole string again but without
4402 eol-conversion this time. */
4403 coding->eol_type = CODING_EOL_LF;
4404 coding->symbol = saved_coding_symbol;
4405 return code_convert_string (str, coding, encodep, nocopy);
4406 }
4407
4408 bcopy (XSTRING (str)->data + to_byte, buf + from + coding->produced,
4409 STRING_BYTES (XSTRING (str)) - to_byte);
4410
4411 len = from + STRING_BYTES (XSTRING (str)) - to_byte;
4412 if (encodep)
4413 str = make_unibyte_string (buf, len + coding->produced);
4414 else
4415 str = make_string_from_bytes (buf, len + coding->produced_char,
4416 len + coding->produced);
4417 return str;
4418 }
4419
4420 \f
4421 #ifdef emacs
4422 /*** 7. Emacs Lisp library functions ***/
4423
4424 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
4425 "Return t if OBJECT is nil or a coding-system.\n\
4426 See the documentation of `make-coding-system' for information\n\
4427 about coding-system objects.")
4428 (obj)
4429 Lisp_Object obj;
4430 {
4431 if (NILP (obj))
4432 return Qt;
4433 if (!SYMBOLP (obj))
4434 return Qnil;
4435 /* Get coding-spec vector for OBJ. */
4436 obj = Fget (obj, Qcoding_system);
4437 return ((VECTORP (obj) && XVECTOR (obj)->size == 5)
4438 ? Qt : Qnil);
4439 }
4440
4441 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
4442 Sread_non_nil_coding_system, 1, 1, 0,
4443 "Read a coding system from the minibuffer, prompting with string PROMPT.")
4444 (prompt)
4445 Lisp_Object prompt;
4446 {
4447 Lisp_Object val;
4448 do
4449 {
4450 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
4451 Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
4452 }
4453 while (XSTRING (val)->size == 0);
4454 return (Fintern (val, Qnil));
4455 }
4456
4457 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
4458 "Read a coding system from the minibuffer, prompting with string PROMPT.\n\
4459 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.")
4460 (prompt, default_coding_system)
4461 Lisp_Object prompt, default_coding_system;
4462 {
4463 Lisp_Object val;
4464 if (SYMBOLP (default_coding_system))
4465 XSETSTRING (default_coding_system, XSYMBOL (default_coding_system)->name);
4466 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
4467 Qt, Qnil, Qcoding_system_history,
4468 default_coding_system, Qnil);
4469 return (XSTRING (val)->size == 0 ? Qnil : Fintern (val, Qnil));
4470 }
4471
4472 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
4473 1, 1, 0,
4474 "Check validity of CODING-SYSTEM.\n\
4475 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.\n\
4476 It is valid if it is a symbol with a non-nil `coding-system' property.\n\
4477 The value of property should be a vector of length 5.")
4478 (coding_system)
4479 Lisp_Object coding_system;
4480 {
4481 CHECK_SYMBOL (coding_system, 0);
4482 if (!NILP (Fcoding_system_p (coding_system)))
4483 return coding_system;
4484 while (1)
4485 Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
4486 }
4487 \f
4488 Lisp_Object
4489 detect_coding_system (src, src_bytes, highest)
4490 unsigned char *src;
4491 int src_bytes, highest;
4492 {
4493 int coding_mask, eol_type;
4494 Lisp_Object val, tmp;
4495 int dummy;
4496
4497 coding_mask = detect_coding_mask (src, src_bytes, NULL, &dummy);
4498 eol_type = detect_eol_type (src, src_bytes, &dummy);
4499 if (eol_type == CODING_EOL_INCONSISTENT)
4500 eol_type == CODING_EOL_UNDECIDED;
4501
4502 if (!coding_mask)
4503 {
4504 val = Qundecided;
4505 if (eol_type != CODING_EOL_UNDECIDED)
4506 {
4507 Lisp_Object val2;
4508 val2 = Fget (Qundecided, Qeol_type);
4509 if (VECTORP (val2))
4510 val = XVECTOR (val2)->contents[eol_type];
4511 }
4512 return val;
4513 }
4514
4515 /* At first, gather possible coding systems in VAL. */
4516 val = Qnil;
4517 for (tmp = Vcoding_category_list; !NILP (tmp); tmp = XCONS (tmp)->cdr)
4518 {
4519 int idx
4520 = XFASTINT (Fget (XCONS (tmp)->car, Qcoding_category_index));
4521 if (coding_mask & (1 << idx))
4522 {
4523 val = Fcons (Fsymbol_value (XCONS (tmp)->car), val);
4524 if (highest)
4525 break;
4526 }
4527 }
4528 if (!highest)
4529 val = Fnreverse (val);
4530
4531 /* Then, substitute the elements by subsidiary coding systems. */
4532 for (tmp = val; !NILP (tmp); tmp = XCONS (tmp)->cdr)
4533 {
4534 if (eol_type != CODING_EOL_UNDECIDED)
4535 {
4536 Lisp_Object eol;
4537 eol = Fget (XCONS (tmp)->car, Qeol_type);
4538 if (VECTORP (eol))
4539 XCONS (tmp)->car = XVECTOR (eol)->contents[eol_type];
4540 }
4541 }
4542 return (highest ? XCONS (val)->car : val);
4543 }
4544
4545 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
4546 2, 3, 0,
4547 "Detect coding system of the text in the region between START and END.\n\
4548 Return a list of possible coding systems ordered by priority.\n\
4549 \n\
4550 If only ASCII characters are found, it returns `undecided'\n\
4551 or its subsidiary coding system according to a detected end-of-line format.\n\
4552 \n\
4553 If optional argument HIGHEST is non-nil, return the coding system of\n\
4554 highest priority.")
4555 (start, end, highest)
4556 Lisp_Object start, end, highest;
4557 {
4558 int from, to;
4559 int from_byte, to_byte;
4560
4561 CHECK_NUMBER_COERCE_MARKER (start, 0);
4562 CHECK_NUMBER_COERCE_MARKER (end, 1);
4563
4564 validate_region (&start, &end);
4565 from = XINT (start), to = XINT (end);
4566 from_byte = CHAR_TO_BYTE (from);
4567 to_byte = CHAR_TO_BYTE (to);
4568
4569 if (from < GPT && to >= GPT)
4570 move_gap_both (to, to_byte);
4571
4572 return detect_coding_system (BYTE_POS_ADDR (from_byte),
4573 to_byte - from_byte,
4574 !NILP (highest));
4575 }
4576
4577 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
4578 1, 2, 0,
4579 "Detect coding system of the text in STRING.\n\
4580 Return a list of possible coding systems ordered by priority.\n\
4581 \n\
4582 If only ASCII characters are found, it returns `undecided'\n\
4583 or its subsidiary coding system according to a detected end-of-line format.\n\
4584 \n\
4585 If optional argument HIGHEST is non-nil, return the coding system of\n\
4586 highest priority.")
4587 (string, highest)
4588 Lisp_Object string, highest;
4589 {
4590 CHECK_STRING (string, 0);
4591
4592 return detect_coding_system (XSTRING (string)->data,
4593 STRING_BYTES (XSTRING (string)),
4594 !NILP (highest));
4595 }
4596
4597 Lisp_Object
4598 code_convert_region1 (start, end, coding_system, encodep)
4599 Lisp_Object start, end, coding_system;
4600 int encodep;
4601 {
4602 struct coding_system coding;
4603 int from, to, len;
4604
4605 CHECK_NUMBER_COERCE_MARKER (start, 0);
4606 CHECK_NUMBER_COERCE_MARKER (end, 1);
4607 CHECK_SYMBOL (coding_system, 2);
4608
4609 validate_region (&start, &end);
4610 from = XFASTINT (start);
4611 to = XFASTINT (end);
4612
4613 if (NILP (coding_system))
4614 return make_number (to - from);
4615
4616 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
4617 error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data);
4618
4619 coding.mode |= CODING_MODE_LAST_BLOCK;
4620 code_convert_region (from, CHAR_TO_BYTE (from), to, CHAR_TO_BYTE (to),
4621 &coding, encodep, 1);
4622 return make_number (coding.produced_char);
4623 }
4624
4625 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
4626 3, 3, "r\nzCoding system: ",
4627 "Decode the current region by specified coding system.\n\
4628 When called from a program, takes three arguments:\n\
4629 START, END, and CODING-SYSTEM. START and END are buffer positions.\n\
4630 Return length of decoded text.")
4631 (start, end, coding_system)
4632 Lisp_Object start, end, coding_system;
4633 {
4634 return code_convert_region1 (start, end, coding_system, 0);
4635 }
4636
4637 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
4638 3, 3, "r\nzCoding system: ",
4639 "Encode the current region by specified coding system.\n\
4640 When called from a program, takes three arguments:\n\
4641 START, END, and CODING-SYSTEM. START and END are buffer positions.\n\
4642 Return length of encoded text.")
4643 (start, end, coding_system)
4644 Lisp_Object start, end, coding_system;
4645 {
4646 return code_convert_region1 (start, end, coding_system, 1);
4647 }
4648
4649 Lisp_Object
4650 code_convert_string1 (string, coding_system, nocopy, encodep)
4651 Lisp_Object string, coding_system, nocopy;
4652 int encodep;
4653 {
4654 struct coding_system coding;
4655
4656 CHECK_STRING (string, 0);
4657 CHECK_SYMBOL (coding_system, 1);
4658
4659 if (NILP (coding_system))
4660 return (NILP (nocopy) ? Fcopy_sequence (string) : string);
4661
4662 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
4663 error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data);
4664
4665 coding.mode |= CODING_MODE_LAST_BLOCK;
4666 return code_convert_string (string, &coding, encodep, !NILP (nocopy));
4667 }
4668
4669 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
4670 2, 3, 0,
4671 "Decode STRING which is encoded in CODING-SYSTEM, and return the result.\n\
4672 Optional arg NOCOPY non-nil means it is ok to return STRING itself\n\
4673 if the decoding operation is trivial.")
4674 (string, coding_system, nocopy)
4675 Lisp_Object string, coding_system, nocopy;
4676 {
4677 return code_convert_string1(string, coding_system, nocopy, 0);
4678 }
4679
4680 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
4681 2, 3, 0,
4682 "Encode STRING to CODING-SYSTEM, and return the result.\n\
4683 Optional arg NOCOPY non-nil means it is ok to return STRING itself\n\
4684 if the encoding operation is trivial.")
4685 (string, coding_system, nocopy)
4686 Lisp_Object string, coding_system, nocopy;
4687 {
4688 return code_convert_string1(string, coding_system, nocopy, 1);
4689 }
4690
4691 \f
4692 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
4693 "Decode a JISX0208 character of shift-jis encoding.\n\
4694 CODE is the character code in SJIS.\n\
4695 Return the corresponding character.")
4696 (code)
4697 Lisp_Object code;
4698 {
4699 unsigned char c1, c2, s1, s2;
4700 Lisp_Object val;
4701
4702 CHECK_NUMBER (code, 0);
4703 s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
4704 DECODE_SJIS (s1, s2, c1, c2);
4705 XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset_jisx0208, c1, c2));
4706 return val;
4707 }
4708
4709 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
4710 "Encode a JISX0208 character CHAR to SJIS coding system.\n\
4711 Return the corresponding character code in SJIS.")
4712 (ch)
4713 Lisp_Object ch;
4714 {
4715 int charset, c1, c2, s1, s2;
4716 Lisp_Object val;
4717
4718 CHECK_NUMBER (ch, 0);
4719 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
4720 if (charset == charset_jisx0208)
4721 {
4722 ENCODE_SJIS (c1, c2, s1, s2);
4723 XSETFASTINT (val, (s1 << 8) | s2);
4724 }
4725 else
4726 XSETFASTINT (val, 0);
4727 return val;
4728 }
4729
4730 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
4731 "Decode a Big5 character CODE of BIG5 coding system.\n\
4732 CODE is the character code in BIG5.\n\
4733 Return the corresponding character.")
4734 (code)
4735 Lisp_Object code;
4736 {
4737 int charset;
4738 unsigned char b1, b2, c1, c2;
4739 Lisp_Object val;
4740
4741 CHECK_NUMBER (code, 0);
4742 b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF;
4743 DECODE_BIG5 (b1, b2, charset, c1, c2);
4744 XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset, c1, c2));
4745 return val;
4746 }
4747
4748 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
4749 "Encode the Big5 character CHAR to BIG5 coding system.\n\
4750 Return the corresponding character code in Big5.")
4751 (ch)
4752 Lisp_Object ch;
4753 {
4754 int charset, c1, c2, b1, b2;
4755 Lisp_Object val;
4756
4757 CHECK_NUMBER (ch, 0);
4758 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
4759 if (charset == charset_big5_1 || charset == charset_big5_2)
4760 {
4761 ENCODE_BIG5 (charset, c1, c2, b1, b2);
4762 XSETFASTINT (val, (b1 << 8) | b2);
4763 }
4764 else
4765 XSETFASTINT (val, 0);
4766 return val;
4767 }
4768 \f
4769 DEFUN ("set-terminal-coding-system-internal",
4770 Fset_terminal_coding_system_internal,
4771 Sset_terminal_coding_system_internal, 1, 1, 0, "")
4772 (coding_system)
4773 Lisp_Object coding_system;
4774 {
4775 CHECK_SYMBOL (coding_system, 0);
4776 setup_coding_system (Fcheck_coding_system (coding_system), &terminal_coding);
4777 /* We had better not send unsafe characters to terminal. */
4778 terminal_coding.flags |= CODING_FLAG_ISO_SAFE;
4779
4780 return Qnil;
4781 }
4782
4783 DEFUN ("set-safe-terminal-coding-system-internal",
4784 Fset_safe_terminal_coding_system_internal,
4785 Sset_safe_terminal_coding_system_internal, 1, 1, 0, "")
4786 (coding_system)
4787 Lisp_Object coding_system;
4788 {
4789 CHECK_SYMBOL (coding_system, 0);
4790 setup_coding_system (Fcheck_coding_system (coding_system),
4791 &safe_terminal_coding);
4792 return Qnil;
4793 }
4794
4795 DEFUN ("terminal-coding-system",
4796 Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
4797 "Return coding system specified for terminal output.")
4798 ()
4799 {
4800 return terminal_coding.symbol;
4801 }
4802
4803 DEFUN ("set-keyboard-coding-system-internal",
4804 Fset_keyboard_coding_system_internal,
4805 Sset_keyboard_coding_system_internal, 1, 1, 0, "")
4806 (coding_system)
4807 Lisp_Object coding_system;
4808 {
4809 CHECK_SYMBOL (coding_system, 0);
4810 setup_coding_system (Fcheck_coding_system (coding_system), &keyboard_coding);
4811 return Qnil;
4812 }
4813
4814 DEFUN ("keyboard-coding-system",
4815 Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
4816 "Return coding system specified for decoding keyboard input.")
4817 ()
4818 {
4819 return keyboard_coding.symbol;
4820 }
4821
4822 \f
4823 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
4824 Sfind_operation_coding_system, 1, MANY, 0,
4825 "Choose a coding system for an operation based on the target name.\n\
4826 The value names a pair of coding systems: (DECODING-SYSTEM ENCODING-SYSTEM).\n\
4827 DECODING-SYSTEM is the coding system to use for decoding\n\
4828 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system\n\
4829 for encoding (in case OPERATION does encoding).\n\
4830 \n\
4831 The first argument OPERATION specifies an I/O primitive:\n\
4832 For file I/O, `insert-file-contents' or `write-region'.\n\
4833 For process I/O, `call-process', `call-process-region', or `start-process'.\n\
4834 For network I/O, `open-network-stream'.\n\
4835 \n\
4836 The remaining arguments should be the same arguments that were passed\n\
4837 to the primitive. Depending on which primitive, one of those arguments\n\
4838 is selected as the TARGET. For example, if OPERATION does file I/O,\n\
4839 whichever argument specifies the file name is TARGET.\n\
4840 \n\
4841 TARGET has a meaning which depends on OPERATION:\n\
4842 For file I/O, TARGET is a file name.\n\
4843 For process I/O, TARGET is a process name.\n\
4844 For network I/O, TARGET is a service name or a port number\n\
4845 \n\
4846 This function looks up what specified for TARGET in,\n\
4847 `file-coding-system-alist', `process-coding-system-alist',\n\
4848 or `network-coding-system-alist' depending on OPERATION.\n\
4849 They may specify a coding system, a cons of coding systems,\n\
4850 or a function symbol to call.\n\
4851 In the last case, we call the function with one argument,\n\
4852 which is a list of all the arguments given to this function.")
4853 (nargs, args)
4854 int nargs;
4855 Lisp_Object *args;
4856 {
4857 Lisp_Object operation, target_idx, target, val;
4858 register Lisp_Object chain;
4859
4860 if (nargs < 2)
4861 error ("Too few arguments");
4862 operation = args[0];
4863 if (!SYMBOLP (operation)
4864 || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
4865 error ("Invalid first arguement");
4866 if (nargs < 1 + XINT (target_idx))
4867 error ("Too few arguments for operation: %s",
4868 XSYMBOL (operation)->name->data);
4869 target = args[XINT (target_idx) + 1];
4870 if (!(STRINGP (target)
4871 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
4872 error ("Invalid %dth argument", XINT (target_idx) + 1);
4873
4874 chain = ((EQ (operation, Qinsert_file_contents)
4875 || EQ (operation, Qwrite_region))
4876 ? Vfile_coding_system_alist
4877 : (EQ (operation, Qopen_network_stream)
4878 ? Vnetwork_coding_system_alist
4879 : Vprocess_coding_system_alist));
4880 if (NILP (chain))
4881 return Qnil;
4882
4883 for (; CONSP (chain); chain = XCONS (chain)->cdr)
4884 {
4885 Lisp_Object elt;
4886 elt = XCONS (chain)->car;
4887
4888 if (CONSP (elt)
4889 && ((STRINGP (target)
4890 && STRINGP (XCONS (elt)->car)
4891 && fast_string_match (XCONS (elt)->car, target) >= 0)
4892 || (INTEGERP (target) && EQ (target, XCONS (elt)->car))))
4893 {
4894 val = XCONS (elt)->cdr;
4895 /* Here, if VAL is both a valid coding system and a valid
4896 function symbol, we return VAL as a coding system. */
4897 if (CONSP (val))
4898 return val;
4899 if (! SYMBOLP (val))
4900 return Qnil;
4901 if (! NILP (Fcoding_system_p (val)))
4902 return Fcons (val, val);
4903 if (! NILP (Ffboundp (val)))
4904 {
4905 val = call1 (val, Flist (nargs, args));
4906 if (CONSP (val))
4907 return val;
4908 if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
4909 return Fcons (val, val);
4910 }
4911 return Qnil;
4912 }
4913 }
4914 return Qnil;
4915 }
4916
4917 DEFUN ("update-iso-coding-systems", Fupdate_iso_coding_systems,
4918 Supdate_iso_coding_systems, 0, 0, 0,
4919 "Update internal database for ISO2022 based coding systems.\n\
4920 When values of the following coding categories are changed, you must\n\
4921 call this function:\n\
4922 coding-category-iso-7, coding-category-iso-7-tight,\n\
4923 coding-category-iso-8-1, coding-category-iso-8-2,\n\
4924 coding-category-iso-7-else, coding-category-iso-8-else")
4925 ()
4926 {
4927 int i;
4928
4929 for (i = CODING_CATEGORY_IDX_ISO_7; i <= CODING_CATEGORY_IDX_ISO_8_ELSE;
4930 i++)
4931 {
4932 if (! coding_system_table[i])
4933 coding_system_table[i]
4934 = (struct coding_system *) xmalloc (sizeof (struct coding_system));
4935 setup_coding_system
4936 (XSYMBOL (XVECTOR (Vcoding_category_table)->contents[i])->value,
4937 coding_system_table[i]);
4938 }
4939 return Qnil;
4940 }
4941
4942 #endif /* emacs */
4943
4944 \f
4945 /*** 8. Post-amble ***/
4946
4947 void
4948 init_coding_once ()
4949 {
4950 int i;
4951
4952 /* Emacs' internal format specific initialize routine. */
4953 for (i = 0; i <= 0x20; i++)
4954 emacs_code_class[i] = EMACS_control_code;
4955 emacs_code_class[0x0A] = EMACS_linefeed_code;
4956 emacs_code_class[0x0D] = EMACS_carriage_return_code;
4957 for (i = 0x21 ; i < 0x7F; i++)
4958 emacs_code_class[i] = EMACS_ascii_code;
4959 emacs_code_class[0x7F] = EMACS_control_code;
4960 emacs_code_class[0x80] = EMACS_leading_code_composition;
4961 for (i = 0x81; i < 0xFF; i++)
4962 emacs_code_class[i] = EMACS_invalid_code;
4963 emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
4964 emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
4965 emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
4966 emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;
4967
4968 /* ISO2022 specific initialize routine. */
4969 for (i = 0; i < 0x20; i++)
4970 iso_code_class[i] = ISO_control_code;
4971 for (i = 0x21; i < 0x7F; i++)
4972 iso_code_class[i] = ISO_graphic_plane_0;
4973 for (i = 0x80; i < 0xA0; i++)
4974 iso_code_class[i] = ISO_control_code;
4975 for (i = 0xA1; i < 0xFF; i++)
4976 iso_code_class[i] = ISO_graphic_plane_1;
4977 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
4978 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
4979 iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
4980 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
4981 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
4982 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
4983 iso_code_class[ISO_CODE_ESC] = ISO_escape;
4984 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
4985 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
4986 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
4987
4988 conversion_buffer_size = MINIMUM_CONVERSION_BUFFER_SIZE;
4989 conversion_buffer = (char *) xmalloc (MINIMUM_CONVERSION_BUFFER_SIZE);
4990
4991 setup_coding_system (Qnil, &keyboard_coding);
4992 setup_coding_system (Qnil, &terminal_coding);
4993 setup_coding_system (Qnil, &safe_terminal_coding);
4994
4995 bzero (coding_system_table, sizeof coding_system_table);
4996
4997 #if defined (MSDOS) || defined (WINDOWSNT)
4998 system_eol_type = CODING_EOL_CRLF;
4999 #else
5000 system_eol_type = CODING_EOL_LF;
5001 #endif
5002 }
5003
5004 #ifdef emacs
5005
5006 void
5007 syms_of_coding ()
5008 {
5009 Qtarget_idx = intern ("target-idx");
5010 staticpro (&Qtarget_idx);
5011
5012 Qcoding_system_history = intern ("coding-system-history");
5013 staticpro (&Qcoding_system_history);
5014 Fset (Qcoding_system_history, Qnil);
5015
5016 /* Target FILENAME is the first argument. */
5017 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
5018 /* Target FILENAME is the third argument. */
5019 Fput (Qwrite_region, Qtarget_idx, make_number (2));
5020
5021 Qcall_process = intern ("call-process");
5022 staticpro (&Qcall_process);
5023 /* Target PROGRAM is the first argument. */
5024 Fput (Qcall_process, Qtarget_idx, make_number (0));
5025
5026 Qcall_process_region = intern ("call-process-region");
5027 staticpro (&Qcall_process_region);
5028 /* Target PROGRAM is the third argument. */
5029 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
5030
5031 Qstart_process = intern ("start-process");
5032 staticpro (&Qstart_process);
5033 /* Target PROGRAM is the third argument. */
5034 Fput (Qstart_process, Qtarget_idx, make_number (2));
5035
5036 Qopen_network_stream = intern ("open-network-stream");
5037 staticpro (&Qopen_network_stream);
5038 /* Target SERVICE is the fourth argument. */
5039 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
5040
5041 Qcoding_system = intern ("coding-system");
5042 staticpro (&Qcoding_system);
5043
5044 Qeol_type = intern ("eol-type");
5045 staticpro (&Qeol_type);
5046
5047 Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
5048 staticpro (&Qbuffer_file_coding_system);
5049
5050 Qpost_read_conversion = intern ("post-read-conversion");
5051 staticpro (&Qpost_read_conversion);
5052
5053 Qpre_write_conversion = intern ("pre-write-conversion");
5054 staticpro (&Qpre_write_conversion);
5055
5056 Qno_conversion = intern ("no-conversion");
5057 staticpro (&Qno_conversion);
5058
5059 Qundecided = intern ("undecided");
5060 staticpro (&Qundecided);
5061
5062 Qcoding_system_p = intern ("coding-system-p");
5063 staticpro (&Qcoding_system_p);
5064
5065 Qcoding_system_error = intern ("coding-system-error");
5066 staticpro (&Qcoding_system_error);
5067
5068 Fput (Qcoding_system_error, Qerror_conditions,
5069 Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
5070 Fput (Qcoding_system_error, Qerror_message,
5071 build_string ("Invalid coding system"));
5072
5073 Qcoding_category = intern ("coding-category");
5074 staticpro (&Qcoding_category);
5075 Qcoding_category_index = intern ("coding-category-index");
5076 staticpro (&Qcoding_category_index);
5077
5078 Vcoding_category_table
5079 = Fmake_vector (make_number (CODING_CATEGORY_IDX_MAX), Qnil);
5080 staticpro (&Vcoding_category_table);
5081 {
5082 int i;
5083 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
5084 {
5085 XVECTOR (Vcoding_category_table)->contents[i]
5086 = intern (coding_category_name[i]);
5087 Fput (XVECTOR (Vcoding_category_table)->contents[i],
5088 Qcoding_category_index, make_number (i));
5089 }
5090 }
5091
5092 Qcharacter_unification_table = intern ("character-unification-table");
5093 staticpro (&Qcharacter_unification_table);
5094 Fput (Qcharacter_unification_table, Qchar_table_extra_slots,
5095 make_number (0));
5096
5097 Qcharacter_unification_table_for_decode
5098 = intern ("character-unification-table-for-decode");
5099 staticpro (&Qcharacter_unification_table_for_decode);
5100
5101 Qcharacter_unification_table_for_encode
5102 = intern ("character-unification-table-for-encode");
5103 staticpro (&Qcharacter_unification_table_for_encode);
5104
5105 Qsafe_charsets = intern ("safe-charsets");
5106 staticpro (&Qsafe_charsets);
5107
5108 Qemacs_mule = intern ("emacs-mule");
5109 staticpro (&Qemacs_mule);
5110
5111 Qraw_text = intern ("raw-text");
5112 staticpro (&Qraw_text);
5113
5114 defsubr (&Scoding_system_p);
5115 defsubr (&Sread_coding_system);
5116 defsubr (&Sread_non_nil_coding_system);
5117 defsubr (&Scheck_coding_system);
5118 defsubr (&Sdetect_coding_region);
5119 defsubr (&Sdetect_coding_string);
5120 defsubr (&Sdecode_coding_region);
5121 defsubr (&Sencode_coding_region);
5122 defsubr (&Sdecode_coding_string);
5123 defsubr (&Sencode_coding_string);
5124 defsubr (&Sdecode_sjis_char);
5125 defsubr (&Sencode_sjis_char);
5126 defsubr (&Sdecode_big5_char);
5127 defsubr (&Sencode_big5_char);
5128 defsubr (&Sset_terminal_coding_system_internal);
5129 defsubr (&Sset_safe_terminal_coding_system_internal);
5130 defsubr (&Sterminal_coding_system);
5131 defsubr (&Sset_keyboard_coding_system_internal);
5132 defsubr (&Skeyboard_coding_system);
5133 defsubr (&Sfind_operation_coding_system);
5134 defsubr (&Supdate_iso_coding_systems);
5135
5136 DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
5137 "List of coding systems.\n\
5138 \n\
5139 Do not alter the value of this variable manually. This variable should be\n\
5140 updated by the functions `make-coding-system' and\n\
5141 `define-coding-system-alias'.");
5142 Vcoding_system_list = Qnil;
5143
5144 DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
5145 "Alist of coding system names.\n\
5146 Each element is one element list of coding system name.\n\
5147 This variable is given to `completing-read' as TABLE argument.\n\
5148 \n\
5149 Do not alter the value of this variable manually. This variable should be\n\
5150 updated by the functions `make-coding-system' and\n\
5151 `define-coding-system-alias'.");
5152 Vcoding_system_alist = Qnil;
5153
5154 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
5155 "List of coding-categories (symbols) ordered by priority.");
5156 {
5157 int i;
5158
5159 Vcoding_category_list = Qnil;
5160 for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
5161 Vcoding_category_list
5162 = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
5163 Vcoding_category_list);
5164 }
5165
5166 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
5167 "Specify the coding system for read operations.\n\
5168 It is useful to bind this variable with `let', but do not set it globally.\n\
5169 If the value is a coding system, it is used for decoding on read operation.\n\
5170 If not, an appropriate element is used from one of the coding system alists:\n\
5171 There are three such tables, `file-coding-system-alist',\n\
5172 `process-coding-system-alist', and `network-coding-system-alist'.");
5173 Vcoding_system_for_read = Qnil;
5174
5175 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
5176 "Specify the coding system for write operations.\n\
5177 It is useful to bind this variable with `let', but do not set it globally.\n\
5178 If the value is a coding system, it is used for encoding on write operation.\n\
5179 If not, an appropriate element is used from one of the coding system alists:\n\
5180 There are three such tables, `file-coding-system-alist',\n\
5181 `process-coding-system-alist', and `network-coding-system-alist'.");
5182 Vcoding_system_for_write = Qnil;
5183
5184 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
5185 "Coding system used in the latest file or process I/O.");
5186 Vlast_coding_system_used = Qnil;
5187
5188 DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
5189 "*Non-nil inhibit code conversion of end-of-line format in any cases.");
5190 inhibit_eol_conversion = 0;
5191
5192 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
5193 "Alist to decide a coding system to use for a file I/O operation.\n\
5194 The format is ((PATTERN . VAL) ...),\n\
5195 where PATTERN is a regular expression matching a file name,\n\
5196 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
5197 If VAL is a coding system, it is used for both decoding and encoding\n\
5198 the file contents.\n\
5199 If VAL is a cons of coding systems, the car part is used for decoding,\n\
5200 and the cdr part is used for encoding.\n\
5201 If VAL is a function symbol, the function must return a coding system\n\
5202 or a cons of coding systems which are used as above.\n\
5203 \n\
5204 See also the function `find-operation-coding-system'.");
5205 Vfile_coding_system_alist = Qnil;
5206
5207 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
5208 "Alist to decide a coding system to use for a process I/O operation.\n\
5209 The format is ((PATTERN . VAL) ...),\n\
5210 where PATTERN is a regular expression matching a program name,\n\
5211 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
5212 If VAL is a coding system, it is used for both decoding what received\n\
5213 from the program and encoding what sent to the program.\n\
5214 If VAL is a cons of coding systems, the car part is used for decoding,\n\
5215 and the cdr part is used for encoding.\n\
5216 If VAL is a function symbol, the function must return a coding system\n\
5217 or a cons of coding systems which are used as above.\n\
5218 \n\
5219 See also the function `find-operation-coding-system'.");
5220 Vprocess_coding_system_alist = Qnil;
5221
5222 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
5223 "Alist to decide a coding system to use for a network I/O operation.\n\
5224 The format is ((PATTERN . VAL) ...),\n\
5225 where PATTERN is a regular expression matching a network service name\n\
5226 or is a port number to connect to,\n\
5227 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
5228 If VAL is a coding system, it is used for both decoding what received\n\
5229 from the network stream and encoding what sent to the network stream.\n\
5230 If VAL is a cons of coding systems, the car part is used for decoding,\n\
5231 and the cdr part is used for encoding.\n\
5232 If VAL is a function symbol, the function must return a coding system\n\
5233 or a cons of coding systems which are used as above.\n\
5234 \n\
5235 See also the function `find-operation-coding-system'.");
5236 Vnetwork_coding_system_alist = Qnil;
5237
5238 DEFVAR_INT ("eol-mnemonic-unix", &eol_mnemonic_unix,
5239 "Mnemonic character indicating UNIX-like end-of-line format (i.e. LF) .");
5240 eol_mnemonic_unix = ':';
5241
5242 DEFVAR_INT ("eol-mnemonic-dos", &eol_mnemonic_dos,
5243 "Mnemonic character indicating DOS-like end-of-line format (i.e. CRLF).");
5244 eol_mnemonic_dos = '\\';
5245
5246 DEFVAR_INT ("eol-mnemonic-mac", &eol_mnemonic_mac,
5247 "Mnemonic character indicating MAC-like end-of-line format (i.e. CR).");
5248 eol_mnemonic_mac = '/';
5249
5250 DEFVAR_INT ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
5251 "Mnemonic character indicating end-of-line format is not yet decided.");
5252 eol_mnemonic_undecided = ':';
5253
5254 DEFVAR_LISP ("enable-character-unification", &Venable_character_unification,
5255 "Non-nil means ISO 2022 encoder/decoder do character unification.");
5256 Venable_character_unification = Qt;
5257
5258 DEFVAR_LISP ("standard-character-unification-table-for-decode",
5259 &Vstandard_character_unification_table_for_decode,
5260 "Table for unifying characters when reading.");
5261 Vstandard_character_unification_table_for_decode = Qnil;
5262
5263 DEFVAR_LISP ("standard-character-unification-table-for-encode",
5264 &Vstandard_character_unification_table_for_encode,
5265 "Table for unifying characters when writing.");
5266 Vstandard_character_unification_table_for_encode = Qnil;
5267
5268 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
5269 "Alist of charsets vs revision numbers.\n\
5270 While encoding, if a charset (car part of an element) is found,\n\
5271 designate it with the escape sequence identifing revision (cdr part of the element).");
5272 Vcharset_revision_alist = Qnil;
5273
5274 DEFVAR_LISP ("default-process-coding-system",
5275 &Vdefault_process_coding_system,
5276 "Cons of coding systems used for process I/O by default.\n\
5277 The car part is used for decoding a process output,\n\
5278 the cdr part is used for encoding a text to be sent to a process.");
5279 Vdefault_process_coding_system = Qnil;
5280
5281 DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
5282 "Table of extra Latin codes in the range 128..159 (inclusive).\n\
5283 This is a vector of length 256.\n\
5284 If Nth element is non-nil, the existence of code N in a file\n\
5285 \(or output of subprocess) doesn't prevent it to be detected as\n\
5286 a coding system of ISO 2022 variant which has a flag\n\
5287 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file\n\
5288 or reading output of a subprocess.\n\
5289 Only 128th through 159th elements has a meaning.");
5290 Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
5291
5292 DEFVAR_LISP ("select-safe-coding-system-function",
5293 &Vselect_safe_coding_system_function,
5294 "Function to call to select safe coding system for encoding a text.\n\
5295 \n\
5296 If set, this function is called to force a user to select a proper\n\
5297 coding system which can encode the text in the case that a default\n\
5298 coding system used in each operation can't encode the text.\n\
5299 \n\
5300 The default value is `select-safe-codign-system' (which see).");
5301 Vselect_safe_coding_system_function = Qnil;
5302
5303 }
5304
5305 #endif /* emacs */