(describe_command): Use quotes around symbol name.
[bpt/emacs.git] / src / coding.c
1 /* Coding system handler (conversion, detection, and etc).
2 Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
3 Licensed to the Free Software Foundation.
4
5 This file is part of GNU Emacs.
6
7 GNU Emacs is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 2, or (at your option)
10 any later version.
11
12 GNU Emacs is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GNU Emacs; see the file COPYING. If not, write to
19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20 Boston, MA 02111-1307, USA. */
21
22 /*** TABLE OF CONTENTS ***
23
24 1. Preamble
25 2. Emacs' internal format (emacs-mule) handlers
26 3. ISO2022 handlers
27 4. Shift-JIS and BIG5 handlers
28 5. End-of-line handlers
29 6. C library functions
30 7. Emacs Lisp library functions
31 8. Post-amble
32
33 */
34
35 /*** GENERAL NOTE on CODING SYSTEM ***
36
37 Coding system is an encoding mechanism of one or more character
38 sets. Here's a list of coding systems which Emacs can handle. When
39 we say "decode", it means converting some other coding system to
40 Emacs' internal format (emacs-mule), and when we say "encode",
41 it means converting the coding system emacs-mule to some other
42 coding system.
43
44 0. Emacs' internal format (emacs-mule)
45
46 Emacs itself holds a multi-lingual character in a buffer and a string
47 in a special format. Details are described in section 2.
48
49 1. ISO2022
50
51 The most famous coding system for multiple character sets. X's
52 Compound Text, various EUCs (Extended Unix Code), and coding
53 systems used in Internet communication such as ISO-2022-JP are
54 all variants of ISO2022. Details are described in section 3.
55
56 2. SJIS (or Shift-JIS or MS-Kanji-Code)
57
58 A coding system to encode character sets: ASCII, JISX0201, and
59 JISX0208. Widely used for PC's in Japan. Details are described in
60 section 4.
61
62 3. BIG5
63
64 A coding system to encode character sets: ASCII and Big5. Widely
65 used by Chinese (mainly in Taiwan and Hong Kong). Details are
66 described in section 4. In this file, when we write "BIG5"
67 (all uppercase), we mean the coding system, and when we write
68 "Big5" (capitalized), we mean the character set.
69
70 4. Raw text
71
72 A coding system for a text containing random 8-bit code. Emacs does
73 no code conversion on such a text except for end-of-line format.
74
75 5. Other
76
77 If a user wants to read/write a text encoded in a coding system not
78 listed above, he can supply a decoder and an encoder for it in CCL
79 (Code Conversion Language) programs. Emacs executes the CCL program
80 while reading/writing.
81
82 Emacs represents a coding system by a Lisp symbol that has a property
83 `coding-system'. But, before actually using the coding system, the
84 information about it is set in a structure of type `struct
85 coding_system' for rapid processing. See section 6 for more details.
86
87 */
88
89 /*** GENERAL NOTES on END-OF-LINE FORMAT ***
90
91 How end-of-line of a text is encoded depends on a system. For
92 instance, Unix's format is just one byte of `line-feed' code,
93 whereas DOS's format is two-byte sequence of `carriage-return' and
94 `line-feed' codes. MacOS's format is usually one byte of
95 `carriage-return'.
96
97 Since text characters encoding and end-of-line encoding are
98 independent, any coding system described above can take
99 any format of end-of-line. So, Emacs has information of format of
100 end-of-line in each coding-system. See section 6 for more details.
101
102 */
103
104 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
105
106 These functions check if a text between SRC and SRC_END is encoded
107 in the coding system category XXX. Each returns an integer value in
108 which appropriate flag bits for the category XXX are set. The flag
109 bits are defined in macros CODING_CATEGORY_MASK_XXX. Below is the
110 template of these functions. */
111 #if 0
112 int
113 detect_coding_emacs_mule (src, src_end)
114 unsigned char *src, *src_end;
115 {
116 ...
117 }
118 #endif
119
120 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
121
122 These functions decode SRC_BYTES length text at SOURCE encoded in
123 CODING to Emacs' internal format (emacs-mule). The resulting text
124 goes to a place pointed to by DESTINATION, the length of which
125 should not exceed DST_BYTES. These functions set the information of
126 original and decoded texts in the members produced, produced_char,
127 consumed, and consumed_char of the structure *CODING.
128
129 The return value is an integer (CODING_FINISH_XXX) indicating how
130 the decoding finished.
131
132 DST_BYTES zero means that source area and destination area are
133 overlapped, which means that we can produce a decoded text until it
134 reaches at the head of not-yet-decoded source text.
135
136 Below is a template of these functions. */
137 #if 0
138 decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
139 struct coding_system *coding;
140 unsigned char *source, *destination;
141 int src_bytes, dst_bytes;
142 {
143 ...
144 }
145 #endif
146
147 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
148
149 These functions encode SRC_BYTES length text at SOURCE of Emacs'
150 internal format (emacs-mule) to CODING. The resulting text goes to
151 a place pointed to by DESTINATION, the length of which should not
152 exceed DST_BYTES. These functions set the information of
153 original and encoded texts in the members produced, produced_char,
154 consumed, and consumed_char of the structure *CODING.
155
156 The return value is an integer (CODING_FINISH_XXX) indicating how
157 the encoding finished.
158
159 DST_BYTES zero means that source area and destination area are
160 overlapped, which means that we can produce encoded text until it
161 reaches the head of the not-yet-encoded source text.
162
163 Below is a template of these functions. */
164 #if 0
165 encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
166 struct coding_system *coding;
167 unsigned char *source, *destination;
168 int src_bytes, dst_bytes;
169 {
170 ...
171 }
172 #endif
173
174 /*** COMMONLY USED MACROS ***/
175
176 /* The following three macros ONE_MORE_BYTE, TWO_MORE_BYTES, and
177 THREE_MORE_BYTES safely get one, two, and three bytes from the
178 source text respectively. If there are not enough bytes in the
179 source, they jump to `label_end_of_loop'. The caller should set
180 variables `src' and `src_end' to appropriate areas in advance. */
181
/* Store the next source byte in C1 and advance `src'.  When no byte
   is left before `src_end', jump to `label_end_of_loop' instead and
   leave C1 and `src' untouched.  */
#define ONE_MORE_BYTE(c1)                       \
  do {                                          \
    if (src >= src_end)                         \
      goto label_end_of_loop;                   \
    (c1) = *src++;                              \
  } while (0)
189
/* Store the next two source bytes in C1 and C2 (in that order) and
   advance `src' past them.  When fewer than two bytes are left before
   `src_end', jump to `label_end_of_loop' without consuming anything.

   The remaining length is measured as `src_end - src' instead of
   comparing `src + 1' with `src_end': when the source is already
   exhausted, `src + 1' would point beyond one-past-the-end of the
   buffer, which is undefined behavior in C.  */
#define TWO_MORE_BYTES(c1, c2)                  \
  do {                                          \
    if (src_end - src < 2)                      \
      goto label_end_of_loop;                   \
    (c1) = *src++;                              \
    (c2) = *src++;                              \
  } while (0)
197
/* Store the next three source bytes in C1, C2 and C3 (in that order)
   and advance `src' past them.  When fewer than three bytes are left
   before `src_end', jump to `label_end_of_loop' without consuming
   anything.

   The remaining length is measured as `src_end - src' instead of
   comparing `src + 2' with `src_end': near the end of the source,
   `src + 2' would point beyond one-past-the-end of the buffer, which
   is undefined behavior in C.  */
#define THREE_MORE_BYTES(c1, c2, c3)            \
  do {                                          \
    if (src_end - src < 3)                      \
      goto label_end_of_loop;                   \
    (c1) = *src++;                              \
    (c2) = *src++;                              \
    (c3) = *src++;                              \
  } while (0)
205
206 /* The following three macros DECODE_CHARACTER_ASCII,
207 DECODE_CHARACTER_DIMENSION1, and DECODE_CHARACTER_DIMENSION2 put
208 the multi-byte form of a character of each class at the place
209 pointed by `dst'. The caller should set the variable `dst' to
210 point to an appropriate area and the variable `coding' to point to
211 the coding-system of the currently decoding text in advance. */
212
213 /* Decode one ASCII character C. */
214
/* Write ASCII character C at `dst'.  Outside a composition the byte
   is stored as is and `coding->produced_char' is incremented.  While
   composing, the two-byte form 0xA0, C | 0x80 is stored instead and
   the character count is left alone.  */
#define DECODE_CHARACTER_ASCII(c)               \
  do {                                          \
    if (!COMPOSING_P (coding->composing))       \
      {                                         \
        *dst++ = (c);                           \
        coding->produced_char++;                \
      }                                         \
    else                                        \
      {                                         \
        *dst++ = 0xA0;                          \
        *dst++ = (c) | 0x80;                    \
      }                                         \
  } while (0)
225
226 /* Decode one DIMENSION1 character whose charset is CHARSET and whose
227 position-code is C. */
228
/* Write the multi-byte form of a one-position-code character at
   `dst': the base leading code of CHARSET (raised by 0x20 while
   composing), then the extended leading code if CHARSET has a
   nonzero one, then C with its high bit set.
   `coding->produced_char' is incremented only when not composing.  */
#define DECODE_CHARACTER_DIMENSION1(charset, c)                         \
  do {                                                                  \
    unsigned char leading_code = CHARSET_LEADING_CODE_BASE (charset);   \
    if (!COMPOSING_P (coding->composing))                               \
      {                                                                 \
        *dst++ = leading_code;                                          \
        coding->produced_char++;                                        \
      }                                                                 \
    else                                                                \
      *dst++ = leading_code + 0x20;                                     \
    leading_code = CHARSET_LEADING_CODE_EXT (charset);                  \
    if (leading_code != 0)                                              \
      *dst++ = leading_code;                                            \
    *dst++ = (c) | 0x80;                                                \
  } while (0)
243
244 /* Decode one DIMENSION2 character whose charset is CHARSET and whose
245 position-codes are C1 and C2. */
246
/* Write the multi-byte form of a two-position-code character at
   `dst': everything DECODE_CHARACTER_DIMENSION1 emits for CHARSET
   and C1, followed by C2 with its high bit set.  */
#define DECODE_CHARACTER_DIMENSION2(charset, c1, c2)    \
  do {                                                  \
    DECODE_CHARACTER_DIMENSION1 (charset, c1);          \
    *dst = (c2) | 0x80;                                 \
    dst++;                                              \
  } while (0)
252
253 \f
254 /*** 1. Preamble ***/
255
256 #include <stdio.h>
257
258 #ifdef emacs
259
260 #include <config.h>
261 #include "lisp.h"
262 #include "buffer.h"
263 #include "charset.h"
264 #include "ccl.h"
265 #include "coding.h"
266 #include "window.h"
267
268 #else /* not emacs */
269
270 #include "mulelib.h"
271
272 #endif /* not emacs */
273
/* Lisp objects with the `Q' prefix, defined in this file.
   NOTE(review): their initialization is outside this excerpt; each
   presumably holds the Lisp symbol spelled like the variable name
   without the prefix (e.g. `coding-system', `eol-type') -- confirm
   where they are set up.  */
274 Lisp_Object Qcoding_system, Qeol_type;
275 Lisp_Object Qbuffer_file_coding_system;
276 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
277 Lisp_Object Qno_conversion, Qundecided;
278 Lisp_Object Qcoding_system_history;
279 Lisp_Object Qsafe_charsets;
280
/* These two are only declared here (`extern'); their definitions live
   in another file.  */
281 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
282 Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
283 Lisp_Object Qstart_process, Qopen_network_stream;
284 Lisp_Object Qtarget_idx;
285
/* NOTE(review): judging by the name, a Lisp-level function used to
   pick a safe coding system; its use is not visible in this excerpt
   -- confirm.  */
286 Lisp_Object Vselect_safe_coding_system_function;
287
288 /* Mnemonic character of each format of end-of-line. */
289 int eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
290 /* Mnemonic character to indicate format of end-of-line is not yet
291 decided. */
292 int eol_mnemonic_undecided;
293
294 /* Format of end-of-line decided by system. This is CODING_EOL_LF on
295 Unix, CODING_EOL_CRLF on DOS/Windows, and CODING_EOL_CR on Mac. */
296 int system_eol_type;
297
/* Everything up to the matching `#endif' is compiled only when the
   macro `emacs' is defined.  */
298 #ifdef emacs
299
/* NOTE(review): presumably the list and the alist of all defined
   coding systems; how they are filled is not visible in this excerpt
   -- confirm.  */
300 Lisp_Object Vcoding_system_list, Vcoding_system_alist;
301
302 Lisp_Object Qcoding_system_p, Qcoding_system_error;
303
304 /* Coding system emacs-mule and raw-text are for converting only
305 end-of-line format. */
306 Lisp_Object Qemacs_mule, Qraw_text;
307
308 /* Coding-systems are handed between Emacs Lisp programs and C internal
309 routines by the following three variables. */
310 /* Coding-system for reading files and receiving data from process. */
311 Lisp_Object Vcoding_system_for_read;
312 /* Coding-system for writing files and sending data to process. */
313 Lisp_Object Vcoding_system_for_write;
314 /* Coding-system actually used in the latest I/O. */
315 Lisp_Object Vlast_coding_system_used;
316
317 /* A vector of length 256 which contains information about special
318 Latin codes (especially for dealing with Microsoft code). */
319 Lisp_Object Vlatin_extra_code_table;
320
321 /* Flag to inhibit code conversion of end-of-line format. */
322 int inhibit_eol_conversion;
323
324 /* Coding system to be used to encode text for terminal display. */
325 struct coding_system terminal_coding;
326
327 /* Coding system to be used to encode text for terminal display when
328 terminal coding system is nil. */
329 struct coding_system safe_terminal_coding;
330
331 /* Coding system of what is sent from terminal keyboard. */
332 struct coding_system keyboard_coding;
333
/* NOTE(review): judging by the names, alists selecting coding systems
   for file, process and network I/O respectively; their format and
   use are not visible in this excerpt -- confirm.  */
334 Lisp_Object Vfile_coding_system_alist;
335 Lisp_Object Vprocess_coding_system_alist;
336 Lisp_Object Vnetwork_coding_system_alist;
337
338 #endif /* emacs */
339
340 Lisp_Object Qcoding_category, Qcoding_category_index;
341
342 /* List of symbols `coding-category-xxx' ordered by priority. */
343 Lisp_Object Vcoding_category_list;
344
345 /* Table of coding categories (Lisp symbols). */
346 Lisp_Object Vcoding_category_table;
347
348 /* Table of names of symbol for each coding-category. */
/* The array is sized by CODING_CATEGORY_IDX_MAX and lists eleven names.
   NOTE(review): the order presumably follows the CODING_CATEGORY_IDX_*
   indices declared elsewhere -- confirm before reordering or adding
   entries.  */
349 char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
350 "coding-category-emacs-mule",
351 "coding-category-sjis",
352 "coding-category-iso-7",
353 "coding-category-iso-7-tight",
354 "coding-category-iso-8-1",
355 "coding-category-iso-8-2",
356 "coding-category-iso-7-else",
357 "coding-category-iso-8-else",
358 "coding-category-big5",
359 "coding-category-raw-text",
360 "coding-category-binary"
361 };
362
363 /* Table of pointers to coding systems corresponding to each coding
364 category. */
/* detect_coding_iso2022 in this file reads entries of this table
   through the CODING_CATEGORY_IDX_ISO_* indices.  */
365 struct coding_system *coding_system_table[CODING_CATEGORY_IDX_MAX];
366
367 /* Flag to tell if we look up unification table on character code
368 conversion. */
369 Lisp_Object Venable_character_unification;
370 /* Standard unification table to look up on decoding (reading). */
371 Lisp_Object Vstandard_character_unification_table_for_decode;
372 /* Standard unification table to look up on encoding (writing). */
373 Lisp_Object Vstandard_character_unification_table_for_encode;
374
375 Lisp_Object Qcharacter_unification_table;
376 Lisp_Object Qcharacter_unification_table_for_decode;
377 Lisp_Object Qcharacter_unification_table_for_encode;
378
379 /* Alist of charsets vs revision number. */
380 Lisp_Object Vcharset_revision_alist;
381
382 /* Default coding systems used for process I/O. */
383 Lisp_Object Vdefault_process_coding_system;
384
385 \f
386 /*** 2. Emacs internal format (emacs-mule) handlers ***/
387
388 /* Emacs' internal format for encoding multiple character sets is a
389 kind of multi-byte encoding, i.e. characters are encoded by
390 variable-length sequences of one-byte codes. ASCII characters
391 and control characters (e.g. `tab', `newline') are represented by
392 one-byte sequences which are their ASCII codes, in the range 0x00
393 through 0x7F. The other characters are represented by a sequence
394 of `base leading-code', optional `extended leading-code', and one
395 or two `position-code's. The length of the sequence is determined
396 by the base leading-code. Leading-code takes the range 0x80
397 through 0x9F, whereas extended leading-code and position-code take
398 the range 0xA0 through 0xFF. See `charset.h' for more details
399 about leading-code and position-code.
400
401 There's one exception to this rule. Special leading-code
402 `leading-code-composition' denotes that the following several
403 characters should be composed into one character. Leading-codes of
404 components (except for ASCII) are added 0x20. An ASCII character
405 component is represented by a 2-byte sequence of `0xA0' and
406 `ASCII-code + 0x80'. See also the comments in `charset.h' for the
407 details of composite character. Hence, we can summarize the code
408 range as follows:
409
410 --- CODE RANGE of Emacs' internal format ---
411 (character set) (range)
412 ASCII 0x00 .. 0x7F
413 ELSE (1st byte) 0x80 .. 0x9F
414 (rest bytes) 0xA0 .. 0xFF
415 ---------------------------------------------
416
417 */
418
/* Class of each byte value as seen in Emacs' internal format, indexed
   by the byte itself (0..255).  detect_coding_emacs_mule switches on
   these classes.  The table is filled in outside this excerpt.  */
419 enum emacs_code_class_type emacs_code_class[256];
420
/* Consume one byte from `src' and go on to the next statement only if
   that byte is in the range 0xA0..0xFF.  If no byte is left before
   `src_end', jump to `label_end_of_switch' without consuming
   anything; if the byte is below 0xA0, make the enclosing function
   return 0.  */
#define CHECK_CODE_RANGE_A0_FF          \
  do {                                  \
    if (src >= src_end)                 \
      goto label_end_of_switch;         \
    if (*src++ < 0xA0)                  \
      return 0;                         \
  } while (0)
430
431 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
432 Check if a text is encoded in Emacs' internal format. If it is,
433 return CODING_CATEGORY_MASK_EMACS_MULE, else return 0. */
434
435 int
436 detect_coding_emacs_mule (src, src_end)
437 unsigned char *src, *src_end;
438 {
439 unsigned char c;
440 int composing = 0;
441
442 while (src < src_end)
443 {
444 c = *src++;
445
446 if (composing)
447 {
448 if (c < 0xA0)
449 composing = 0;
450 else
451 c -= 0x20;
452 }
453
454 switch (emacs_code_class[c])
455 {
456 case EMACS_ascii_code:
457 case EMACS_linefeed_code:
458 break;
459
460 case EMACS_control_code:
461 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
462 return 0;
463 break;
464
465 case EMACS_invalid_code:
466 return 0;
467
468 case EMACS_leading_code_composition: /* c == 0x80 */
469 if (composing)
470 CHECK_CODE_RANGE_A0_FF;
471 else
472 composing = 1;
473 break;
474
475 case EMACS_leading_code_4:
476 CHECK_CODE_RANGE_A0_FF;
477 /* fall down to check it two more times ... */
478
479 case EMACS_leading_code_3:
480 CHECK_CODE_RANGE_A0_FF;
481 /* fall down to check it one more time ... */
482
483 case EMACS_leading_code_2:
484 CHECK_CODE_RANGE_A0_FF;
485 break;
486
487 default:
488 label_end_of_switch:
489 break;
490 }
491 }
492 return CODING_CATEGORY_MASK_EMACS_MULE;
493 }
494
495 \f
496 /*** 3. ISO2022 handlers ***/
497
498 /* The following note describes the coding system ISO2022 briefly.
499 Since the intention of this note is to help in understanding of
500 the programs in this file, some parts are NOT ACCURATE or OVERLY
501 SIMPLIFIED. For the thorough understanding, please refer to the
502 original document of ISO2022.
503
504 ISO2022 provides many mechanisms to encode several character sets
505 in 7-bit and 8-bit environments. If one chooses a 7-bit environment,
506 all text is encoded by codes of less than 128. This may make the
507 encoded text a little bit longer, but the text gets more stability
508 to pass through several gateways (some of them strip off the MSB).
509
510 There are two kinds of character set: control character set and
511 graphic character set. The former contains control characters such
512 as `newline' and `escape' to provide control functions (control
513 functions are provided also by escape sequences). The latter
514 contains graphic characters such as 'A' and '-'. Emacs recognizes
515 two control character sets and many graphic character sets.
516
517 Graphic character sets are classified into one of the following
518 four classes, DIMENSION1_CHARS94, DIMENSION1_CHARS96,
519 DIMENSION2_CHARS94, DIMENSION2_CHARS96 according to the number of
520 bytes (DIMENSION) and the number of characters in one dimension
521 (CHARS) of the set. In addition, each character set is assigned an
522 identification tag (called "final character" and denoted as <F>
523 here after) which is unique in each class. <F> of each character
524 set is decided by ECMA(*) when it is registered in ISO. Code range
525 of <F> is 0x30..0x7F (0x30..0x3F are for private use only).
526
527 Note (*): ECMA = European Computer Manufacturers Association
528
529 Here are examples of graphic character set [NAME(<F>)]:
530 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
531 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
532 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
533 o DIMENSION2_CHARS96 -- none for the moment
534
535 A code area (1byte=8bits) is divided into 4 areas, C0, GL, C1, and GR.
536 C0 [0x00..0x1F] -- control character plane 0
537 GL [0x20..0x7F] -- graphic character plane 0
538 C1 [0x80..0x9F] -- control character plane 1
539 GR [0xA0..0xFF] -- graphic character plane 1
540
541 A control character set is directly designated and invoked to C0 or
542 C1 by an escape sequence. The most common case is that ISO646's
543 control character set is designated/invoked to C0 and ISO6429's
544 control character set is designated/invoked to C1, and usually
545 these designations/invocations are omitted in a coded text. With
546 7-bit environment, only C0 can be used, and a control character for
547 C1 is encoded by an appropriate escape sequence to fit in the
548 environment. Every control character for C1 has a corresponding
549 escape sequence defined for it.
550
551 A graphic character set is at first designated to one of four
552 graphic registers (G0 through G3), then these graphic registers are
553 invoked to GL or GR. These designations and invocations can be
554 done independently. The most common case is that G0 is invoked to
555 GL, G1 is invoked to GR, and ASCII is designated to G0, and usually
556 these invocations and designations are omitted in a coded text.
557 With 7-bit environment, only GL can be used.
558
559 When a graphic character set of CHARS94 is invoked to GL, code 0x20
560 and 0x7F of GL area work as control characters SPACE and DEL
561 respectively, and code 0xA0 and 0xFF of GR area should not be used.
562
563 There are two ways of invocation: locking-shift and single-shift.
564 With locking-shift, the invocation lasts until the next different
565 invocation, whereas with single-shift, the invocation works only
566 for the following character and doesn't affect locking-shift.
567 Invocations are done by the following control characters or escape
568 sequences.
569
570 ----------------------------------------------------------------------
571 function control char escape sequence description
572 ----------------------------------------------------------------------
573 SI (shift-in) 0x0F none invoke G0 to GL
574 SO (shift-out) 0x0E none invoke G1 to GL
575 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
576 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
577 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 into GL
578 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 into GL
579 ----------------------------------------------------------------------
580 The first four are for locking-shift. Control characters for these
581 functions are defined by macros ISO_CODE_XXX in `coding.h'.
582
583 Designations are done by the following escape sequences.
584 ----------------------------------------------------------------------
585 escape sequence description
586 ----------------------------------------------------------------------
587 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
588 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
589 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
590 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
591 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
592 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
593 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
594 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
595 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
596 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
597 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
598 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
599 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
600 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
601 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
602 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
603 ----------------------------------------------------------------------
604
605 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
606 of dimension 1, chars 94, and final character <F>, and etc.
607
608 Note (*): Although these designations are not allowed in ISO2022,
609 Emacs accepts them on decoding, and produces them on encoding
610 CHARS96 character set in a coding system which is characterized as
611 7-bit environment, non-locking-shift, and non-single-shift.
612
613 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
614 '(' can be omitted. We call this as "short-form" here after.
615
616 Now you may notice that there are a lot of ways for encoding the
617 same multilingual text in ISO2022. Actually, there exists many
618 coding systems such as Compound Text (used in X's inter client
619 communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
620 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
621 localized platforms), and all of these are variants of ISO2022.
622
623 In addition to the above, Emacs handles two more kinds of escape
624 sequences: ISO6429's direction specification and Emacs' private
625 sequence for specifying character composition.
626
627 ISO6429's direction specification takes the following format:
628 o CSI ']' -- end of the current direction
629 o CSI '0' ']' -- end of the current direction
630 o CSI '1' ']' -- start of left-to-right text
631 o CSI '2' ']' -- start of right-to-left text
632 The control character CSI (0x9B: control sequence introducer) is
633 abbreviated to the escape sequence ESC '[' in 7-bit environment.
634
635 Character composition specification takes the following format:
636 o ESC '0' -- start character composition
637 o ESC '1' -- end character composition
638 Since these are not standard escape sequences of any ISO, the use
639 of them for these meaning is restricted to Emacs only. */
640
/* Class of each byte value in ISO2022 text, indexed by the byte itself
   (0..255).  DECODE_ISO_CHARACTER in this file indexes it with a code
   masked by 0x7F.  The table is filled in outside this excerpt.  */
641 enum iso_code_class_type iso_code_class[256];
642
/* Nonzero if CHARSET is acceptable for the coding system registered
   at index IDX of `coding_system_table': either the charset is
   flagged in that coding system's `safe_charsets', or the coding
   system requests some designation for it.  */
#define CHARSET_OK(idx, charset)                                \
  (coding_system_table[idx]->safe_charsets[charset] != 0        \
   || (CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION                 \
       != CODING_SPEC_ISO_REQUESTED_DESIGNATION                 \
            (coding_system_table[idx], charset)))

/* Nonzero if the coding system registered at index IDX of
   `coding_system_table' has a non-negative initial designation for
   graphic register 1.  */
#define SHIFT_OUT_OK(idx)                                       \
  (0 <= CODING_SPEC_ISO_INITIAL_DESIGNATION (coding_system_table[idx], 1))
651
652 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
653 Check if a text is encoded in ISO2022. If it is, returns an
654 integer in which appropriate flag bits any of:
655 CODING_CATEGORY_MASK_ISO_7
656 CODING_CATEGORY_MASK_ISO_7_TIGHT
657 CODING_CATEGORY_MASK_ISO_8_1
658 CODING_CATEGORY_MASK_ISO_8_2
659 CODING_CATEGORY_MASK_ISO_7_ELSE
660 CODING_CATEGORY_MASK_ISO_8_ELSE
661 are set. If a code which should never appear in ISO2022 is found,
662 returns 0. */
663
664 int
665 detect_coding_iso2022 (src, src_end)
666 unsigned char *src, *src_end;
667 {
668 int mask = CODING_CATEGORY_MASK_ISO;
669 int mask_found = 0;
670 int reg[4], shift_out = 0;
671 int c, c1, i, charset;
672
673 reg[0] = CHARSET_ASCII, reg[1] = reg[2] = reg[3] = -1;
674 while (mask && src < src_end)
675 {
676 c = *src++;
677 switch (c)
678 {
679 case ISO_CODE_ESC:
680 if (src >= src_end)
681 break;
682 c = *src++;
683 if (c >= '(' && c <= '/')
684 {
685 /* Designation sequence for a charset of dimension 1. */
686 if (src >= src_end)
687 break;
688 c1 = *src++;
689 if (c1 < ' ' || c1 >= 0x80
690 || (charset = iso_charset_table[0][c >= ','][c1]) < 0)
691 /* Invalid designation sequence. Just ignore. */
692 break;
693 reg[(c - '(') % 4] = charset;
694 }
695 else if (c == '$')
696 {
697 /* Designation sequence for a charset of dimension 2. */
698 if (src >= src_end)
699 break;
700 c = *src++;
701 if (c >= '@' && c <= 'B')
702 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
703 reg[0] = charset = iso_charset_table[1][0][c];
704 else if (c >= '(' && c <= '/')
705 {
706 if (src >= src_end)
707 break;
708 c1 = *src++;
709 if (c1 < ' ' || c1 >= 0x80
710 || (charset = iso_charset_table[1][c >= ','][c1]) < 0)
711 /* Invalid designation sequence. Just ignore. */
712 break;
713 reg[(c - '(') % 4] = charset;
714 }
715 else
716 /* Invalid designation sequence. Just ignore. */
717 break;
718 }
719 else if (c == 'N' || c == 'n')
720 {
721 if (shift_out == 0
722 && (reg[1] >= 0
723 || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_7_ELSE)
724 || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_8_ELSE)))
725 {
726 /* Locking shift out. */
727 mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
728 mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
729 shift_out = 1;
730 }
731 break;
732 }
733 else if (c == 'O' || c == 'o')
734 {
735 if (shift_out == 1)
736 {
737 /* Locking shift in. */
738 mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
739 mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
740 shift_out = 0;
741 }
742 break;
743 }
744 else if (c == '0' || c == '1' || c == '2')
745 /* Start/end composition. Just ignore. */
746 break;
747 else
748 /* Invalid escape sequence. Just ignore. */
749 break;
750
751 /* We found a valid designation sequence for CHARSET. */
752 mask &= ~CODING_CATEGORY_MASK_ISO_8BIT;
753 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7, charset))
754 mask_found |= CODING_CATEGORY_MASK_ISO_7;
755 else
756 mask &= ~CODING_CATEGORY_MASK_ISO_7;
757 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT, charset))
758 mask_found |= CODING_CATEGORY_MASK_ISO_7_TIGHT;
759 else
760 mask &= ~CODING_CATEGORY_MASK_ISO_7_TIGHT;
761 if (! CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_ELSE, charset))
762 mask &= ~CODING_CATEGORY_MASK_ISO_7_ELSE;
763 if (! CHARSET_OK (CODING_CATEGORY_IDX_ISO_8_ELSE, charset))
764 mask &= ~CODING_CATEGORY_MASK_ISO_8_ELSE;
765 break;
766
767 case ISO_CODE_SO:
768 if (shift_out == 0
769 && (reg[1] >= 0
770 || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_7_ELSE)
771 || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_8_ELSE)))
772 {
773 /* Locking shift out. */
774 mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
775 mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
776 }
777 break;
778
779 case ISO_CODE_SI:
780 if (shift_out == 1)
781 {
782 /* Locking shift in. */
783 mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
784 mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
785 }
786 break;
787
788 case ISO_CODE_CSI:
789 case ISO_CODE_SS2:
790 case ISO_CODE_SS3:
791 {
792 int newmask = CODING_CATEGORY_MASK_ISO_8_ELSE;
793
794 if (c != ISO_CODE_CSI)
795 {
796 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
797 & CODING_FLAG_ISO_SINGLE_SHIFT)
798 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
799 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
800 & CODING_FLAG_ISO_SINGLE_SHIFT)
801 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
802 }
803 if (VECTORP (Vlatin_extra_code_table)
804 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
805 {
806 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
807 & CODING_FLAG_ISO_LATIN_EXTRA)
808 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
809 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
810 & CODING_FLAG_ISO_LATIN_EXTRA)
811 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
812 }
813 mask &= newmask;
814 mask_found |= newmask;
815 }
816 break;
817
818 default:
819 if (c < 0x80)
820 break;
821 else if (c < 0xA0)
822 {
823 if (VECTORP (Vlatin_extra_code_table)
824 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
825 {
826 int newmask = 0;
827
828 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
829 & CODING_FLAG_ISO_LATIN_EXTRA)
830 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
831 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
832 & CODING_FLAG_ISO_LATIN_EXTRA)
833 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
834 mask &= newmask;
835 mask_found |= newmask;
836 }
837 else
838 return 0;
839 }
840 else
841 {
842 unsigned char *src_begin = src;
843
844 mask &= ~(CODING_CATEGORY_MASK_ISO_7BIT
845 | CODING_CATEGORY_MASK_ISO_7_ELSE);
846 mask_found |= CODING_CATEGORY_MASK_ISO_8_1;
847 while (src < src_end && *src >= 0xA0)
848 src++;
849 if ((src - src_begin - 1) & 1 && src < src_end)
850 mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
851 else
852 mask_found |= CODING_CATEGORY_MASK_ISO_8_2;
853 }
854 break;
855 }
856 }
857
858 return (mask & mask_found);
859 }
860
861 /* Decode a character of which charset is CHARSET and the 1st position
862 code is C1. If dimension of CHARSET is 2, the 2nd position code is
863 fetched from SRC and set to C2. If CHARSET is negative, it means
864 that we are decoding ill formed text, and what we can do is just to
865 read C1 as is. */
866
/* Besides its two arguments, this macro uses variables that must
   exist in the caller's scope: `coding', `src', `src_end', `dst',
   `c2' and `unification_table'.  Because it uses ONE_MORE_BYTE, it
   may jump to the caller's `label_end_of_loop' when the second byte
   of a two-dimensional character is not available.  C1 (and `c2')
   may be overwritten when the character is unified, so C1 must be
   an lvalue in that case.  */
867 #define DECODE_ISO_CHARACTER(charset, c1) \
868 do { \
/* CHARSET_ALT starts as CHARSET and is replaced, together with C1 \
   and `c2', when unification yields another character. */ \
869 int c_alt, charset_alt = (charset); \
/* At the head of a composition, emit the composition leading code \
   first and bump the composing state by 2. */ \
870 if (COMPOSING_HEAD_P (coding->composing)) \
871 { \
872 *dst++ = LEADING_CODE_COMPOSITION; \
873 if (COMPOSING_WITH_RULE_P (coding->composing)) \
874 /* To tell composition rules are embedded. */ \
875 *dst++ = 0xFF; \
876 coding->composing += 2; \
877 } \
878 if ((charset) >= 0) \
879 { \
880 if (CHARSET_DIMENSION (charset) == 2) \
881 { \
882 ONE_MORE_BYTE (c2); \
/* If the byte just read is not classified as a graphic code (after \
   masking with 0x7F), push it back and use a space as the second \
   position code. */ \
883 if (iso_code_class[(c2) & 0x7F] != ISO_0x20_or_0x7F \
884 && iso_code_class[(c2) & 0x7F] != ISO_graphic_plane_0) \
885 { \
886 src--; \
887 c2 = ' '; \
888 } \
889 } \
/* Look the character up in the unification table, if there is one; \
   a non-negative result replaces the charset and position codes. */ \
890 if (!NILP (unification_table) \
891 && ((c_alt = unify_char (unification_table, \
892 -1, (charset), c1, c2)) >= 0)) \
893 SPLIT_CHAR (c_alt, charset_alt, c1, c2); \
894 } \
/* Emit the character in the form matching its (possibly unified) \
   charset; a negative charset is written like ASCII. */ \
895 if (charset_alt == CHARSET_ASCII || charset_alt < 0) \
896 DECODE_CHARACTER_ASCII (c1); \
897 else if (CHARSET_DIMENSION (charset_alt) == 1) \
898 DECODE_CHARACTER_DIMENSION1 (charset_alt, c1); \
899 else \
900 DECODE_CHARACTER_DIMENSION2 (charset_alt, c1, c2); \
901 if (COMPOSING_WITH_RULE_P (coding->composing)) \
902 /* To tell a composition rule follows. */ \
903 coding->composing = COMPOSING_WITH_RULE_RULE; \
904 } while (0)
905
/* Record in CODING that the charset identified by DIMENSION, CHARS
   and FINAL_CHAR is designated to graphic register REG.

   The designation is accepted only when the charset is known and
   either requests REG or is one of CODING's safe charsets; otherwise
   REG is remembered as the last invalid designation register and
   control jumps to `label_invalid_code' of the expansion site.  */
#define DECODE_DESIGNATION(reg, dimension, chars, final_char)              \
  do {                                                                     \
    int designated = ISO_CHARSET_TABLE (make_number (dimension),           \
                                        make_number (chars),               \
                                        make_number (final_char));         \
    int previous_invalid;                                                  \
    if (designated < 0                                                     \
        || (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, designated)     \
            != (reg)                                                       \
            && ! coding->safe_charsets[designated]))                       \
      {                                                                    \
        coding->spec.iso2022.last_invalid_designation_register = (reg);    \
        goto label_invalid_code;                                           \
      }                                                                    \
    previous_invalid                                                       \
      = coding->spec.iso2022.last_invalid_designation_register;            \
    coding->spec.iso2022.last_invalid_designation_register = -1;           \
    if (previous_invalid == 0                                              \
        && (reg) == 0                                                      \
        && designated == CHARSET_ASCII)                                    \
      {                                                                    \
        /* Insert this designation sequence as is, so that it is           \
           surely written back to a file.  */                              \
        goto label_invalid_code;                                           \
      }                                                                    \
    if ((coding->mode & CODING_MODE_DIRECTION)                             \
        && CHARSET_REVERSE_CHARSET (designated) >= 0)                      \
      designated = CHARSET_REVERSE_CHARSET (designated);                   \
    CODING_SPEC_ISO_DESIGNATION (coding, reg) = designated;                \
  } while (0)
937
938 /* Check if the current composing sequence contains only valid codes.
939 If the composing sequence doesn't end before SRC_END, return -1.
940 Else, if it contains only valid codes, return 0.
941 Else return the length of the composing sequence. */
942
943 int check_composing_code (coding, src, src_end)
944 struct coding_system *coding;
945 unsigned char *src, *src_end;
946 {
947 unsigned char *src_start = src;
948 int invalid_code_found = 0;
949 int charset, c, c1, dim;
950
951 while (src < src_end)
952 {
953 if (*src++ != ISO_CODE_ESC) continue;
954 if (src >= src_end) break;
955 if ((c = *src++) == '1') /* end of compsition */
956 return (invalid_code_found ? src - src_start : 0);
957 if (src + 2 >= src_end) break;
958 if (!coding->flags & CODING_FLAG_ISO_DESIGNATION)
959 invalid_code_found = 1;
960 else
961 {
962 dim = 0;
963 if (c == '$')
964 {
965 dim = 1;
966 c = (*src >= '@' && *src <= 'B') ? '(' : *src++;
967 }
968 if (c >= '(' && c <= '/')
969 {
970 c1 = *src++;
971 if ((c1 < ' ' || c1 >= 0x80)
972 || (charset = iso_charset_table[dim][c >= ','][c1]) < 0
973 || ! coding->safe_charsets[charset]
974 || (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
975 == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
976 invalid_code_found = 1;
977 }
978 else
979 invalid_code_found = 1;
980 }
981 }
982 return ((coding->mode & CODING_MODE_LAST_BLOCK) ? src_end - src_start : -1);
983 }
984
985 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
986
987 int
988 decode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
989 struct coding_system *coding;
990 unsigned char *source, *destination;
991 int src_bytes, dst_bytes;
992 {
993 unsigned char *src = source;
994 unsigned char *src_end = source + src_bytes;
995 unsigned char *dst = destination;
996 unsigned char *dst_end = destination + dst_bytes;
997 /* Since the maximum bytes produced by each loop is 7, we subtract 6
998 from DST_END to assure that overflow checking is necessary only
999 at the head of loop. */
1000 unsigned char *adjusted_dst_end = dst_end - 6;
1001 int charset;
1002 /* Charsets invoked to graphic plane 0 and 1 respectively. */
1003 int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1004 int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1005 Lisp_Object unification_table
1006 = coding->character_unification_table_for_decode;
1007 int result = CODING_FINISH_NORMAL;
1008
1009 if (!NILP (Venable_character_unification) && NILP (unification_table))
1010 unification_table = Vstandard_character_unification_table_for_decode;
1011
1012 coding->produced_char = 0;
1013 coding->fake_multibyte = 0;
1014 while (src < src_end && (dst_bytes
1015 ? (dst < adjusted_dst_end)
1016 : (dst < src - 6)))
1017 {
1018 /* SRC_BASE remembers the start position in source in each loop.
1019 The loop will be exited when there's not enough source text
1020 to analyze long escape sequence or 2-byte code (within macros
1021 ONE_MORE_BYTE or TWO_MORE_BYTES). In that case, SRC is reset
1022 to SRC_BASE before exiting. */
1023 unsigned char *src_base = src;
1024 int c1 = *src++, c2;
1025
1026 switch (iso_code_class [c1])
1027 {
1028 case ISO_0x20_or_0x7F:
1029 if (!coding->composing
1030 && (charset0 < 0 || CHARSET_CHARS (charset0) == 94))
1031 {
1032 /* This is SPACE or DEL. */
1033 *dst++ = c1;
1034 coding->produced_char++;
1035 break;
1036 }
1037 /* This is a graphic character, we fall down ... */
1038
1039 case ISO_graphic_plane_0:
1040 if (coding->composing == COMPOSING_WITH_RULE_RULE)
1041 {
1042 /* This is a composition rule. */
1043 *dst++ = c1 | 0x80;
1044 coding->composing = COMPOSING_WITH_RULE_TAIL;
1045 }
1046 else
1047 DECODE_ISO_CHARACTER (charset0, c1);
1048 break;
1049
1050 case ISO_0xA0_or_0xFF:
1051 if (charset1 < 0 || CHARSET_CHARS (charset1) == 94
1052 || coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
1053 goto label_invalid_code;
1054 /* This is a graphic character, we fall down ... */
1055
1056 case ISO_graphic_plane_1:
1057 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
1058 goto label_invalid_code;
1059 else
1060 DECODE_ISO_CHARACTER (charset1, c1);
1061 break;
1062
1063 case ISO_control_code:
1064 /* All ISO2022 control characters in this class have the
1065 same representation in Emacs internal format. */
1066 if (c1 == '\n'
1067 && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1068 && (coding->eol_type == CODING_EOL_CR
1069 || coding->eol_type == CODING_EOL_CRLF))
1070 {
1071 result = CODING_FINISH_INCONSISTENT_EOL;
1072 goto label_end_of_loop_2;
1073 }
1074 *dst++ = c1;
1075 coding->produced_char++;
1076 break;
1077
1078 case ISO_carriage_return:
1079 if (coding->eol_type == CODING_EOL_CR)
1080 *dst++ = '\n';
1081 else if (coding->eol_type == CODING_EOL_CRLF)
1082 {
1083 ONE_MORE_BYTE (c1);
1084 if (c1 == ISO_CODE_LF)
1085 *dst++ = '\n';
1086 else
1087 {
1088 if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1089 {
1090 result = CODING_FINISH_INCONSISTENT_EOL;
1091 goto label_end_of_loop_2;
1092 }
1093 src--;
1094 *dst++ = '\r';
1095 }
1096 }
1097 else
1098 *dst++ = c1;
1099 coding->produced_char++;
1100 break;
1101
1102 case ISO_shift_out:
1103 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1104 || CODING_SPEC_ISO_DESIGNATION (coding, 1) < 0)
1105 goto label_invalid_code;
1106 CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
1107 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1108 break;
1109
1110 case ISO_shift_in:
1111 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
1112 goto label_invalid_code;
1113 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
1114 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1115 break;
1116
1117 case ISO_single_shift_2_7:
1118 case ISO_single_shift_2:
1119 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1120 goto label_invalid_code;
1121 /* SS2 is handled as an escape sequence of ESC 'N' */
1122 c1 = 'N';
1123 goto label_escape_sequence;
1124
1125 case ISO_single_shift_3:
1126 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1127 goto label_invalid_code;
1128 /* SS2 is handled as an escape sequence of ESC 'O' */
1129 c1 = 'O';
1130 goto label_escape_sequence;
1131
1132 case ISO_control_sequence_introducer:
1133 /* CSI is handled as an escape sequence of ESC '[' ... */
1134 c1 = '[';
1135 goto label_escape_sequence;
1136
1137 case ISO_escape:
1138 ONE_MORE_BYTE (c1);
1139 label_escape_sequence:
1140 /* Escape sequences handled by Emacs are invocation,
1141 designation, direction specification, and character
1142 composition specification. */
1143 switch (c1)
1144 {
1145 case '&': /* revision of following character set */
1146 ONE_MORE_BYTE (c1);
1147 if (!(c1 >= '@' && c1 <= '~'))
1148 goto label_invalid_code;
1149 ONE_MORE_BYTE (c1);
1150 if (c1 != ISO_CODE_ESC)
1151 goto label_invalid_code;
1152 ONE_MORE_BYTE (c1);
1153 goto label_escape_sequence;
1154
1155 case '$': /* designation of 2-byte character set */
1156 if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
1157 goto label_invalid_code;
1158 ONE_MORE_BYTE (c1);
1159 if (c1 >= '@' && c1 <= 'B')
1160 { /* designation of JISX0208.1978, GB2312.1980,
1161 or JISX0208.1980 */
1162 DECODE_DESIGNATION (0, 2, 94, c1);
1163 }
1164 else if (c1 >= 0x28 && c1 <= 0x2B)
1165 { /* designation of DIMENSION2_CHARS94 character set */
1166 ONE_MORE_BYTE (c2);
1167 DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
1168 }
1169 else if (c1 >= 0x2C && c1 <= 0x2F)
1170 { /* designation of DIMENSION2_CHARS96 character set */
1171 ONE_MORE_BYTE (c2);
1172 DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
1173 }
1174 else
1175 goto label_invalid_code;
1176 break;
1177
1178 case 'n': /* invocation of locking-shift-2 */
1179 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1180 || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
1181 goto label_invalid_code;
1182 CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
1183 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1184 break;
1185
1186 case 'o': /* invocation of locking-shift-3 */
1187 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1188 || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
1189 goto label_invalid_code;
1190 CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
1191 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1192 break;
1193
1194 case 'N': /* invocation of single-shift-2 */
1195 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1196 || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
1197 goto label_invalid_code;
1198 ONE_MORE_BYTE (c1);
1199 charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
1200 DECODE_ISO_CHARACTER (charset, c1);
1201 break;
1202
1203 case 'O': /* invocation of single-shift-3 */
1204 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1205 || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
1206 goto label_invalid_code;
1207 ONE_MORE_BYTE (c1);
1208 charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
1209 DECODE_ISO_CHARACTER (charset, c1);
1210 break;
1211
1212 case '0': case '2': /* start composing */
1213 /* Before processing composing, we must be sure that all
1214 characters being composed are supported by CODING.
1215 If not, we must give up composing and insert the
1216 bunch of codes for composing as is without decoding. */
1217 {
1218 int result1;
1219
1220 result1 = check_composing_code (coding, src, src_end);
1221 if (result1 == 0)
1222 coding->composing = (c1 == '0'
1223 ? COMPOSING_NO_RULE_HEAD
1224 : COMPOSING_WITH_RULE_HEAD);
1225 else if (result1 > 0)
1226 {
1227 if (result1 + 2 < (dst_bytes ? dst_end : src_base) - dst)
1228 {
1229 bcopy (src_base, dst, result1 + 2);
1230 src += result1;
1231 dst += result1 + 2;
1232 coding->produced_char += result1 + 2;
1233 }
1234 else
1235 {
1236 result = CODING_FINISH_INSUFFICIENT_DST;
1237 goto label_end_of_loop_2;
1238 }
1239 }
1240 else
1241 goto label_end_of_loop;
1242 }
1243 break;
1244
1245 case '1': /* end composing */
1246 coding->composing = COMPOSING_NO;
1247 coding->produced_char++;
1248 break;
1249
1250 case '[': /* specification of direction */
1251 if (coding->flags & CODING_FLAG_ISO_NO_DIRECTION)
1252 goto label_invalid_code;
1253 /* For the moment, nested direction is not supported.
1254 So, `coding->mode & CODING_MODE_DIRECTION' zero means
1255 left-to-right, and nozero means right-to-left. */
1256 ONE_MORE_BYTE (c1);
1257 switch (c1)
1258 {
1259 case ']': /* end of the current direction */
1260 coding->mode &= ~CODING_MODE_DIRECTION;
1261
1262 case '0': /* end of the current direction */
1263 case '1': /* start of left-to-right direction */
1264 ONE_MORE_BYTE (c1);
1265 if (c1 == ']')
1266 coding->mode &= ~CODING_MODE_DIRECTION;
1267 else
1268 goto label_invalid_code;
1269 break;
1270
1271 case '2': /* start of right-to-left direction */
1272 ONE_MORE_BYTE (c1);
1273 if (c1 == ']')
1274 coding->mode |= CODING_MODE_DIRECTION;
1275 else
1276 goto label_invalid_code;
1277 break;
1278
1279 default:
1280 goto label_invalid_code;
1281 }
1282 break;
1283
1284 default:
1285 if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
1286 goto label_invalid_code;
1287 if (c1 >= 0x28 && c1 <= 0x2B)
1288 { /* designation of DIMENSION1_CHARS94 character set */
1289 ONE_MORE_BYTE (c2);
1290 DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
1291 }
1292 else if (c1 >= 0x2C && c1 <= 0x2F)
1293 { /* designation of DIMENSION1_CHARS96 character set */
1294 ONE_MORE_BYTE (c2);
1295 DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
1296 }
1297 else
1298 {
1299 goto label_invalid_code;
1300 }
1301 }
1302 /* We must update these variables now. */
1303 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1304 charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1305 break;
1306
1307 label_invalid_code:
1308 while (src_base < src)
1309 *dst++ = *src_base++;
1310 coding->fake_multibyte = 1;
1311 }
1312 continue;
1313
1314 label_end_of_loop:
1315 result = CODING_FINISH_INSUFFICIENT_SRC;
1316 label_end_of_loop_2:
1317 src = src_base;
1318 break;
1319 }
1320
1321 if (src < src_end)
1322 {
1323 if (result == CODING_FINISH_NORMAL)
1324 result = CODING_FINISH_INSUFFICIENT_DST;
1325 else if (result != CODING_FINISH_INCONSISTENT_EOL
1326 && coding->mode & CODING_MODE_LAST_BLOCK)
1327 {
1328 /* This is the last block of the text to be decoded. We had
1329 better just flush out all remaining codes in the text
1330 although they are not valid characters. */
1331 src_bytes = src_end - src;
1332 if (dst_bytes && (dst_end - dst < src_bytes))
1333 src_bytes = dst_end - dst;
1334 bcopy (src, dst, src_bytes);
1335 dst += src_bytes;
1336 src += src_bytes;
1337 coding->fake_multibyte = 1;
1338 }
1339 }
1340
1341 coding->consumed = coding->consumed_char = src - source;
1342 coding->produced = dst - destination;
1343 return result;
1344 }
1345
1346 /* ISO2022 encoding stuff. */
1347
1348 /*
1349 It is not enough to say just "ISO2022" on encoding, we have to
1350 specify more details. In Emacs, each coding system of ISO2022
1351 variant has the following specifications:
1352 1. Initial designation to G0 thru G3.
1353 2. Allows short-form designation?
1354 3. ASCII should be designated to G0 before control characters?
1355 4. ASCII should be designated to G0 at end of line?
1356 5. 7-bit environment or 8-bit environment?
1357 6. Use locking-shift?
1358 7. Use Single-shift?
1359 And the following two are only for Japanese:
1360 8. Use ASCII in place of JIS0201-1976-Roman?
1361 9. Use JISX0208-1983 in place of JISX0208-1978?
1362 These specifications are encoded in `coding->flags' as flag bits
1363 defined by macros CODING_FLAG_ISO_XXX. See `coding.h' for more
1364 details.
1365 */
1366
/* Produce at DST the escape sequence that designates CHARSET to
   graphic register REG, and record the designation in CODING.

   A revision sequence (ESC '&' <revision>) comes first when the
   revision number of CHARSET is less than 255.  The intermediate
   character is omitted (short form) only for a 2-byte 94-character
   set designated to register 0 whose <final-char> is '@', 'A' or 'B',
   and only when CODING allows the short form.  */

#define ENCODE_DESIGNATION(charset, reg, coding)                           \
  do {                                                                     \
    unsigned char final_char = CHARSET_ISO_FINAL_CHAR (charset);           \
    int revision = CODING_SPEC_ISO_REVISION_NUMBER(coding, charset);       \
    int two_byte = CHARSET_DIMENSION (charset) != 1;                       \
    int chars_94 = CHARSET_CHARS (charset) == 94;                          \
    /* Intermediate characters for registers 0..3.  */                     \
    char *intermediates = chars_94 ? "()*+" : ",-./";                      \
    if (revision < 255)                                                    \
      {                                                                    \
        *dst++ = ISO_CODE_ESC;                                             \
        *dst++ = '&';                                                      \
        *dst++ = '@' + revision;                                           \
      }                                                                    \
    *dst++ = ISO_CODE_ESC;                                                 \
    if (two_byte)                                                          \
      *dst++ = '$';                                                        \
    if (! (two_byte                                                        \
           && chars_94                                                     \
           && ((coding)->flags & CODING_FLAG_ISO_SHORT_FORM)               \
           && (reg) == 0                                                   \
           && final_char >= '@' && final_char <= 'B'))                     \
      *dst++ = (unsigned char) (intermediates[reg]);                       \
    *dst++ = final_char;                                                   \
    CODING_SPEC_ISO_DESIGNATION (coding, reg) = (charset);                 \
  } while (0)
1408
/* The following two macros produce at DST the code for the ISO2022
   single-shift functions single-shift-2 and single-shift-3: an escape
   sequence when CODING is restricted to seven bits, the 8-bit control
   character otherwise.  Both then set the single-shifting flag of
   CODING.  */

#define ENCODE_SINGLE_SHIFT_2                                   \
  do {                                                          \
    if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS))         \
      {                                                         \
        *dst++ = ISO_CODE_SS2;                                  \
        coding->fake_multibyte = 1;                             \
      }                                                         \
    else                                                        \
      {                                                         \
        *dst++ = ISO_CODE_ESC;                                  \
        *dst++ = 'N';                                           \
      }                                                         \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;               \
  } while (0)

#define ENCODE_SINGLE_SHIFT_3                                   \
  do {                                                          \
    if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS))         \
      {                                                         \
        *dst++ = ISO_CODE_SS3;                                  \
        coding->fake_multibyte = 1;                             \
      }                                                         \
    else                                                        \
      {                                                         \
        *dst++ = ISO_CODE_ESC;                                  \
        *dst++ = 'O';                                           \
      }                                                         \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;               \
  } while (0)
1436
/* The following four macros handle the ISO2022 locking-shift
   functions (shift-in, shift-out, locking-shift-2, and
   locking-shift-3).  Each one records in CODING which graphic
   register is now invoked to graphic plane 0 and produces the
   corresponding control character or escape sequence at DST.  */

#define ENCODE_SHIFT_IN                                 \
  do {                                                  \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;         \
    *dst++ = ISO_CODE_SI;                               \
  } while (0)

#define ENCODE_SHIFT_OUT                                \
  do {                                                  \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;         \
    *dst++ = ISO_CODE_SO;                               \
  } while (0)

#define ENCODE_LOCKING_SHIFT_2                          \
  do {                                                  \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;         \
    *dst++ = ISO_CODE_ESC;                              \
    *dst++ = 'n';                                       \
  } while (0)

#define ENCODE_LOCKING_SHIFT_3                          \
  do {                                                  \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;         \
    *dst++ = ISO_CODE_ESC;                              \
    *dst++ = 'o';                                       \
  } while (0)
1464
/* Produce at DST the code of a DIMENSION1 character whose character
   set is CHARSET and whose position-code is C1.  The body loops: as
   long as CHARSET is not invoked to a graphic plane, the necessary
   designation and invocation sequences are produced and the checks
   are repeated.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                       \
  do {                                                                     \
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                          \
      {                                                                    \
        /* The single-shifting flag is set: this byte uses it up.  */      \
        *dst++ = ((coding->flags & CODING_FLAG_ISO_SEVEN_BITS)             \
                  ? (c1 & 0x7F)                                            \
                  : (c1 | 0x80));                                          \
        CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                      \
        break;                                                             \
      }                                                                    \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))              \
      {                                                                    \
        /* CHARSET is invoked to graphic plane 0.  */                      \
        *dst++ = c1 & 0x7F;                                                \
        break;                                                             \
      }                                                                    \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))              \
      {                                                                    \
        /* CHARSET is invoked to graphic plane 1.  */                      \
        *dst++ = c1 | 0x80;                                                \
        break;                                                             \
      }                                                                    \
    if ((coding->flags & CODING_FLAG_ISO_SAFE)                             \
        && !coding->safe_charsets[charset])                                \
      {                                                                    \
        /* This character must not be encoded; produce one or two          \
           substitution characters instead, according to the width of      \
           CHARSET.  */                                                    \
        *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;                    \
        if (CHARSET_WIDTH (charset) == 2)                                  \
          *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;                  \
        break;                                                             \
      }                                                                    \
    /* CHARSET is not yet invoked to any graphic plane: invoke it, or      \
       first designate it to some graphic register, then go round the      \
       loop again to actually produce the character.  */                   \
    dst = encode_invocation_designation (charset, coding, dst);            \
  } while (1)
1508
/* Produce at DST the codes of a DIMENSION2 character whose character
   set is CHARSET and whose position-codes are C1 and C2.  The body
   loops: as long as CHARSET is not invoked to a graphic plane, the
   necessary designation and invocation sequences are produced and the
   checks are repeated.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                   \
  do {                                                                     \
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                          \
      {                                                                    \
        /* The single-shifting flag is set: this character uses it up.  */ \
        if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                    \
          {                                                                \
            *dst++ = c1 & 0x7F;                                            \
            *dst++ = c2 & 0x7F;                                            \
          }                                                                \
        else                                                               \
          {                                                                \
            *dst++ = c1 | 0x80;                                            \
            *dst++ = c2 | 0x80;                                            \
          }                                                                \
        CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                      \
        break;                                                             \
      }                                                                    \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))              \
      {                                                                    \
        /* CHARSET is invoked to graphic plane 0.  */                      \
        *dst++ = c1 & 0x7F;                                                \
        *dst++ = c2 & 0x7F;                                                \
        break;                                                             \
      }                                                                    \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))              \
      {                                                                    \
        /* CHARSET is invoked to graphic plane 1.  */                      \
        *dst++ = c1 | 0x80;                                                \
        *dst++ = c2 | 0x80;                                                \
        break;                                                             \
      }                                                                    \
    if ((coding->flags & CODING_FLAG_ISO_SAFE)                             \
        && !coding->safe_charsets[charset])                                \
      {                                                                    \
        /* This character must not be encoded; produce one or two          \
           substitution characters instead, according to the width of      \
           CHARSET.  */                                                    \
        *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;                    \
        if (CHARSET_WIDTH (charset) == 2)                                  \
          *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;                  \
        break;                                                             \
      }                                                                    \
    /* CHARSET is not yet invoked to any graphic plane: invoke it, or      \
       first designate it to some graphic register, then go round the      \
       loop again to actually produce the character.  */                   \
    dst = encode_invocation_designation (charset, coding, dst);            \
  } while (1)
1551
/* Encode at DST the character whose charset is CHARSET and whose
   position codes are C1 and C2 (C2 is a dummy for 1-byte charsets).

   The character is first passed through `unification_table' when
   that is non-nil; C1 and C2 are then overwritten with the codes of
   the unified character.  ASCII is replaced by JISX0201-Roman and
   JISX0208 by JISX0208-1978 when the corresponding flags of CODING
   are set.  Finally `coding->consumed_char' is incremented unless a
   composition is in progress.  */

#define ENCODE_ISO_CHARACTER(charset, c1, c2)                              \
  do {                                                                     \
    int unified_char = -1;                                                 \
    int target_charset = charset;                                          \
    if (!NILP (unification_table))                                         \
      unified_char = unify_char (unification_table, -1, charset, c1, c2);  \
    if (unified_char >= 0)                                                 \
      SPLIT_CHAR (unified_char, target_charset, c1, c2);                   \
    if (CHARSET_DIMENSION (target_charset) != 1)                           \
      {                                                                    \
        if (charset == charset_jisx0208                                    \
            && (coding->flags & CODING_FLAG_ISO_USE_OLDJIS))               \
          target_charset = charset_jisx0208_1978;                          \
        ENCODE_ISO_CHARACTER_DIMENSION2 (target_charset, c1, c2);          \
      }                                                                    \
    else                                                                   \
      {                                                                    \
        if (charset == CHARSET_ASCII                                       \
            && (coding->flags & CODING_FLAG_ISO_USE_ROMAN))                \
          target_charset = charset_latin_jisx0201;                         \
        ENCODE_ISO_CHARACTER_DIMENSION1 (target_charset, c1);              \
      }                                                                    \
    if (! COMPOSING_P (coding->composing))                                 \
      coding->consumed_char++;                                             \
  } while (0)
1578
1579 /* Produce designation and invocation codes at a place pointed by DST
1580 to use CHARSET. The element `spec.iso2022' of *CODING is updated.
1581 Return new DST. */
1582
1583 unsigned char *
1584 encode_invocation_designation (charset, coding, dst)
1585 int charset;
1586 struct coding_system *coding;
1587 unsigned char *dst;
1588 {
1589 int reg; /* graphic register number */
1590
1591 /* At first, check designations. */
1592 for (reg = 0; reg < 4; reg++)
1593 if (charset == CODING_SPEC_ISO_DESIGNATION (coding, reg))
1594 break;
1595
1596 if (reg >= 4)
1597 {
1598 /* CHARSET is not yet designated to any graphic registers. */
1599 /* At first check the requested designation. */
1600 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1601 if (reg == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
1602 /* Since CHARSET requests no special designation, designate it
1603 to graphic register 0. */
1604 reg = 0;
1605
1606 ENCODE_DESIGNATION (charset, reg, coding);
1607 }
1608
1609 if (CODING_SPEC_ISO_INVOCATION (coding, 0) != reg
1610 && CODING_SPEC_ISO_INVOCATION (coding, 1) != reg)
1611 {
1612 /* Since the graphic register REG is not invoked to any graphic
1613 planes, invoke it to graphic plane 0. */
1614 switch (reg)
1615 {
1616 case 0: /* graphic register 0 */
1617 ENCODE_SHIFT_IN;
1618 break;
1619
1620 case 1: /* graphic register 1 */
1621 ENCODE_SHIFT_OUT;
1622 break;
1623
1624 case 2: /* graphic register 2 */
1625 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1626 ENCODE_SINGLE_SHIFT_2;
1627 else
1628 ENCODE_LOCKING_SHIFT_2;
1629 break;
1630
1631 case 3: /* graphic register 3 */
1632 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1633 ENCODE_SINGLE_SHIFT_3;
1634 else
1635 ENCODE_LOCKING_SHIFT_3;
1636 break;
1637 }
1638 }
1639 return dst;
1640 }
1641
/* The following three macros produce at DST the escape sequences
   that start a composition without rules (ESC '0'), start a
   composition with rules (ESC '2'), and end a composition (ESC '1').  */
#define ENCODE_COMPOSITION_NO_RULE_START \
  (*dst++ = ISO_CODE_ESC, *dst++ = '0')
#define ENCODE_COMPOSITION_WITH_RULE_START \
  (*dst++ = ISO_CODE_ESC, *dst++ = '2')
#define ENCODE_COMPOSITION_END \
  (*dst++ = ISO_CODE_ESC, *dst++ = '1')
1646
/* The following three macros produce codes for indicating direction
   of text.

   ENCODE_CONTROL_SEQUENCE_INTRODUCER produces ESC '[' when CODING is
   restricted to seven bits and the 8-bit CSI control character
   otherwise.  The seven-bit restriction is one bit of the flag mask
   `coding->flags', so it is tested with `&', as everywhere else in
   this file.

   All three macros are statements, to be used as `MACRO;'.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER              \
  do {                                                  \
    if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)     \
      *dst++ = ISO_CODE_ESC, *dst++ = '[';              \
    else                                                \
      *dst++ = ISO_CODE_CSI;                            \
  } while (0)

/* Start of right-to-left direction: CSI '2' ']'.  */
#define ENCODE_DIRECTION_R2L                            \
  do {                                                  \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;                 \
    *dst++ = '2', *dst++ = ']';                         \
  } while (0)

/* End of the current direction: CSI '0' ']'.  */
#define ENCODE_DIRECTION_L2R                            \
  do {                                                  \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;                 \
    *dst++ = '0', *dst++ = ']';                         \
  } while (0)
1662
/* Produce at DST the invocation and designation codes that bring the
   graphic planes and registers of CODING back to their initial
   state: shift-in if graphic plane 0 does not have register 0
   invoked, then a designation for every register whose current
   charset differs from its (non-negative) initial one.  */
#define ENCODE_RESET_PLANE_AND_REGISTER                                    \
  do {                                                                     \
    int reg;                                                               \
    if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0)                       \
      ENCODE_SHIFT_IN;                                                     \
    for (reg = 0; reg < 4; reg++)                                          \
      {                                                                    \
        int initial_charset                                                \
          = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg);             \
        if (initial_charset >= 0                                           \
            && CODING_SPEC_ISO_DESIGNATION (coding, reg) != initial_charset) \
          ENCODE_DESIGNATION (initial_charset, reg, coding);               \
      }                                                                    \
  } while (0)
1677
1678 /* Produce designation sequences of charsets in the line started from
1679 SRC to a place pointed by *DSTP, and update DSTP.
1680
1681 If the current block ends before any end-of-line, we may fail to
1682 find all the necessary designations. */
1683
1684 encode_designation_at_bol (coding, table, src, src_end, dstp)
1685 struct coding_system *coding;
1686 Lisp_Object table;
1687 unsigned char *src, *src_end, **dstp;
1688 {
1689 int charset, c, found = 0, reg;
1690 /* Table of charsets to be designated to each graphic register. */
1691 int r[4];
1692 unsigned char *dst = *dstp;
1693
1694 for (reg = 0; reg < 4; reg++)
1695 r[reg] = -1;
1696
1697 while (src < src_end && *src != '\n' && found < 4)
1698 {
1699 int bytes = BYTES_BY_CHAR_HEAD (*src);
1700
1701 if (NILP (table))
1702 charset = CHARSET_AT (src);
1703 else
1704 {
1705 int c_alt;
1706 unsigned char c1, c2;
1707
1708 SPLIT_STRING(src, bytes, charset, c1, c2);
1709 if ((c_alt = unify_char (table, -1, charset, c1, c2)) >= 0)
1710 charset = CHAR_CHARSET (c_alt);
1711 }
1712
1713 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1714 if (reg != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION && r[reg] < 0)
1715 {
1716 found++;
1717 r[reg] = charset;
1718 }
1719
1720 src += bytes;
1721 }
1722
1723 if (found)
1724 {
1725 for (reg = 0; reg < 4; reg++)
1726 if (r[reg] >= 0
1727 && CODING_SPEC_ISO_DESIGNATION (coding, reg) != r[reg])
1728 ENCODE_DESIGNATION (r[reg], reg, coding);
1729 *dstp = dst;
1730 }
1731 }
1732
1733 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
1734
1735 int
1736 encode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
1737 struct coding_system *coding;
1738 unsigned char *source, *destination;
1739 int src_bytes, dst_bytes;
1740 {
1741 unsigned char *src = source;
1742 unsigned char *src_end = source + src_bytes;
1743 unsigned char *dst = destination;
1744 unsigned char *dst_end = destination + dst_bytes;
1745 /* Since the maximum bytes produced by each loop is 20, we subtract 19
1746 from DST_END to assure overflow checking is necessary only at the
1747 head of loop. */
1748 unsigned char *adjusted_dst_end = dst_end - 19;
1749 Lisp_Object unification_table
1750 = coding->character_unification_table_for_encode;
1751 int result = CODING_FINISH_NORMAL;
1752
1753 if (!NILP (Venable_character_unification) && NILP (unification_table))
1754 unification_table = Vstandard_character_unification_table_for_encode;
1755
1756 coding->consumed_char = 0;
1757 coding->fake_multibyte = 0;
1758 while (src < src_end && (dst_bytes
1759 ? (dst < adjusted_dst_end)
1760 : (dst < src - 19)))
1761 {
1762 /* SRC_BASE remembers the start position in source in each loop.
1763 The loop will be exited when there's not enough source text
1764 to analyze multi-byte codes (within macros ONE_MORE_BYTE,
1765 TWO_MORE_BYTES, and THREE_MORE_BYTES). In that case, SRC is
1766 reset to SRC_BASE before exiting. */
1767 unsigned char *src_base = src;
1768 int charset, c1, c2, c3, c4;
1769
1770 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL
1771 && CODING_SPEC_ISO_BOL (coding))
1772 {
1773 /* We have to produce designation sequences if any now. */
1774 encode_designation_at_bol (coding, unification_table,
1775 src, src_end, &dst);
1776 CODING_SPEC_ISO_BOL (coding) = 0;
1777 }
1778
1779 c1 = *src++;
1780 /* If we are seeing a component of a composite character, we are
1781 seeing a leading-code encoded irregularly for composition, or
1782 a composition rule if composing with rule. We must set C1 to
1783 a normal leading-code or an ASCII code. If we are not seeing
1784 a composite character, we must reset composition,
1785 designation, and invocation states. */
1786 if (COMPOSING_P (coding->composing))
1787 {
1788 if (c1 < 0xA0)
1789 {
1790 /* We are not in a composite character any longer. */
1791 coding->composing = COMPOSING_NO;
1792 ENCODE_RESET_PLANE_AND_REGISTER;
1793 ENCODE_COMPOSITION_END;
1794 }
1795 else
1796 {
1797 if (coding->composing == COMPOSING_WITH_RULE_RULE)
1798 {
1799 *dst++ = c1 & 0x7F;
1800 coding->composing = COMPOSING_WITH_RULE_HEAD;
1801 continue;
1802 }
1803 else if (coding->composing == COMPOSING_WITH_RULE_HEAD)
1804 coding->composing = COMPOSING_WITH_RULE_RULE;
1805 if (c1 == 0xA0)
1806 {
1807 /* This is an ASCII component. */
1808 ONE_MORE_BYTE (c1);
1809 c1 &= 0x7F;
1810 }
1811 else
1812 /* This is a leading-code of non ASCII component. */
1813 c1 -= 0x20;
1814 }
1815 }
1816
1817 /* Now encode one character. C1 is a control character, an
1818 ASCII character, or a leading-code of multi-byte character. */
1819 switch (emacs_code_class[c1])
1820 {
1821 case EMACS_ascii_code:
1822 ENCODE_ISO_CHARACTER (CHARSET_ASCII, c1, /* dummy */ c2);
1823 break;
1824
1825 case EMACS_control_code:
1826 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
1827 ENCODE_RESET_PLANE_AND_REGISTER;
1828 *dst++ = c1;
1829 coding->consumed_char++;
1830 break;
1831
1832 case EMACS_carriage_return_code:
1833 if (! (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
1834 {
1835 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
1836 ENCODE_RESET_PLANE_AND_REGISTER;
1837 *dst++ = c1;
1838 coding->consumed_char++;
1839 break;
1840 }
1841 /* fall down to treat '\r' as '\n' ... */
1842
1843 case EMACS_linefeed_code:
1844 if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
1845 ENCODE_RESET_PLANE_AND_REGISTER;
1846 if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL)
1847 bcopy (coding->spec.iso2022.initial_designation,
1848 coding->spec.iso2022.current_designation,
1849 sizeof coding->spec.iso2022.initial_designation);
1850 if (coding->eol_type == CODING_EOL_LF
1851 || coding->eol_type == CODING_EOL_UNDECIDED)
1852 *dst++ = ISO_CODE_LF;
1853 else if (coding->eol_type == CODING_EOL_CRLF)
1854 *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
1855 else
1856 *dst++ = ISO_CODE_CR;
1857 CODING_SPEC_ISO_BOL (coding) = 1;
1858 coding->consumed_char++;
1859 break;
1860
1861 case EMACS_leading_code_2:
1862 ONE_MORE_BYTE (c2);
1863 if (c2 < 0xA0)
1864 {
1865 /* invalid sequence */
1866 *dst++ = c1;
1867 *dst++ = c2;
1868 coding->consumed_char += 2;
1869 }
1870 else
1871 ENCODE_ISO_CHARACTER (c1, c2, /* dummy */ c3);
1872 break;
1873
1874 case EMACS_leading_code_3:
1875 TWO_MORE_BYTES (c2, c3);
1876 if (c2 < 0xA0 || c3 < 0xA0)
1877 {
1878 /* invalid sequence */
1879 *dst++ = c1;
1880 *dst++ = c2;
1881 *dst++ = c3;
1882 coding->consumed_char += 3;
1883 }
1884 else if (c1 < LEADING_CODE_PRIVATE_11)
1885 ENCODE_ISO_CHARACTER (c1, c2, c3);
1886 else
1887 ENCODE_ISO_CHARACTER (c2, c3, /* dummy */ c4);
1888 break;
1889
1890 case EMACS_leading_code_4:
1891 THREE_MORE_BYTES (c2, c3, c4);
1892 if (c2 < 0xA0 || c3 < 0xA0 || c4 < 0xA0)
1893 {
1894 /* invalid sequence */
1895 *dst++ = c1;
1896 *dst++ = c2;
1897 *dst++ = c3;
1898 *dst++ = c4;
1899 coding->consumed_char += 4;
1900 }
1901 else
1902 ENCODE_ISO_CHARACTER (c2, c3, c4);
1903 break;
1904
1905 case EMACS_leading_code_composition:
1906 ONE_MORE_BYTE (c2);
1907 if (c2 < 0xA0)
1908 {
1909 /* invalid sequence */
1910 *dst++ = c1;
1911 *dst++ = c2;
1912 coding->consumed_char += 2;
1913 }
1914 else if (c2 == 0xFF)
1915 {
1916 ENCODE_RESET_PLANE_AND_REGISTER;
1917 coding->composing = COMPOSING_WITH_RULE_HEAD;
1918 ENCODE_COMPOSITION_WITH_RULE_START;
1919 coding->consumed_char++;
1920 }
1921 else
1922 {
1923 ENCODE_RESET_PLANE_AND_REGISTER;
1924 /* Rewind one byte because it is a character code of
1925 composition elements. */
1926 src--;
1927 coding->composing = COMPOSING_NO_RULE_HEAD;
1928 ENCODE_COMPOSITION_NO_RULE_START;
1929 coding->consumed_char++;
1930 }
1931 break;
1932
1933 case EMACS_invalid_code:
1934 *dst++ = c1;
1935 coding->consumed_char++;
1936 break;
1937 }
1938 continue;
1939 label_end_of_loop:
1940 result = CODING_FINISH_INSUFFICIENT_SRC;
1941 src = src_base;
1942 break;
1943 }
1944
1945 if (src < src_end)
1946 {
1947 if (result == CODING_FINISH_NORMAL)
1948 result = CODING_FINISH_INSUFFICIENT_DST;
1949 else
1950 /* If this is the last block of the text to be encoded, we
1951 must reset graphic planes and registers to the initial
1952 state, and flush out the carryover if any. */
1953 if (coding->mode & CODING_MODE_LAST_BLOCK)
1954 ENCODE_RESET_PLANE_AND_REGISTER;
1955 }
1956
1957 coding->consumed = src - source;
1958 coding->produced = coding->produced_char = dst - destination;
1959 return result;
1960 }
1961
1962 \f
1963 /*** 4. SJIS and BIG5 handlers ***/
1964
1965 /* Although SJIS and BIG5 are not ISO's coding system, they are used
1966 quite widely. So, for the moment, Emacs supports them in the bare
1967 C code. But, in the future, they may be supported only by CCL. */
1968
/* SJIS is a coding system encoding three character sets: ASCII, right
   half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
   as is.  A character of charset katakana-jisx0201 is encoded by
   "position-code + 0x80".  A character of charset japanese-jisx0208
   is encoded in two bytes, but its two position-codes are divided and
   shifted so that they fit in the ranges below.
1975
1976 --- CODE RANGE of SJIS ---
1977 (character set) (range)
1978 ASCII 0x00 .. 0x7F
1979 KATAKANA-JISX0201 0xA0 .. 0xDF
1980 JISX0208 (1st byte) 0x80 .. 0x9F and 0xE0 .. 0xFF
1981 (2nd byte) 0x40 .. 0xFF
1982 -------------------------------
1983
1984 */
1985
1986 /* BIG5 is a coding system encoding two character sets: ASCII and
1987 Big5. An ASCII character is encoded as is. Big5 is a two-byte
1988 character set and is encoded in two-byte.
1989
1990 --- CODE RANGE of BIG5 ---
1991 (character set) (range)
1992 ASCII 0x00 .. 0x7F
1993 Big5 (1st byte) 0xA1 .. 0xFE
1994 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
1995 --------------------------
1996
1997 Since the number of characters in Big5 is larger than maximum
1998 characters in Emacs' charset (96x96), it can't be handled as one
1999 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
2000 and `charset-big5-2'. Both are DIMENSION2 and CHARS94. The former
2001 contains frequently used characters and the latter contains less
2002 frequently used characters. */
2003
/* Macros to decode or encode a character of Big5 in BIG5.  B1 and B2
   are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
   C1 and C2 are the 1st and 2nd position-codes of Emacs' internal
   format.  CHARSET is `charset_big5_1' or `charset_big5_2'.  */
2008
/* How many Big5 code points share one 1st byte.  The 2nd byte takes
   either 0x40..0x7E (0x7F - 0x40 values) or 0xA1..0xFE (0xFF - 0xA1
   values).  */
#define BIG5_SAME_ROW ((0xFF - 0xA1) + (0x7F - 0x40))
2011
/* Convert the BIG5 byte pair B1, B2 into an Emacs charset and two
   position-codes.  The pair is first turned into a linear index
   (BIG5_SAME_ROW code points per 1st byte, with the unused 2nd-byte
   range 0x7F..0xA0 squeezed out).  Rows whose 1st byte is below 0xC9
   go to `charset_big5_1'; the others go to `charset_big5_2', with the
   index rebased to zero.  The index is then split into a row and a
   column of (0xFF - 0xA1) cells each, offset by 0x21, and stored in
   C1 and C2.  CHARSET, C1 and C2 are assigned to.  */
#define DECODE_BIG5(b1, b2, charset, c1, c2) \
  do { \
    unsigned int big5_idx \
      = (((b1) - 0xA1) * BIG5_SAME_ROW \
         + (b2) - ((b2) < 0x7F ? 0x40 : 0x62)); \
 \
    if ((b1) >= 0xC9) \
      { \
        (charset) = charset_big5_2; \
        big5_idx -= (0xC9 - 0xA1) * BIG5_SAME_ROW; \
      } \
    else \
      (charset) = charset_big5_1; \
    (c1) = big5_idx / (0xFF - 0xA1) + 0x21; \
    (c2) = big5_idx % (0xFF - 0xA1) + 0x21; \
  } while (0)
2026
/* Inverse of DECODE_BIG5: convert the position-codes C1, C2 of
   CHARSET (`charset_big5_1' or `charset_big5_2') into the BIG5 byte
   pair B1, B2.  C1 and C2 must already have their high bit cleared
   (i.e. be based at 0x21).  The linear index is rebuilt, shifted past
   the `charset_big5_1' rows for `charset_big5_2', and split into a
   1st byte based at 0xA1 and a 2nd byte that skips the unused range
   0x7F..0xA0.  B1 and B2 are assigned to.  */
#define ENCODE_BIG5(charset, c1, c2, b1, b2) \
  do { \
    unsigned int big5_idx \
      = ((c1) - 0x21) * (0xFF - 0xA1) + ((c2) - 0x21); \
    unsigned int big5_col; \
 \
    if ((charset) == charset_big5_2) \
      big5_idx += BIG5_SAME_ROW * (0xC9 - 0xA1); \
    big5_col = big5_idx % BIG5_SAME_ROW; \
    (b1) = big5_idx / BIG5_SAME_ROW + 0xA1; \
    (b2) = big5_col + (big5_col < 0x3F ? 0x40 : 0x62); \
  } while (0)
2036
/* Produce one decoded character of CHARSET with position-codes C1
   and C2 (C2 is a dummy for one-dimensional charsets).  If the
   variable `unification_table' in the expansion scope is non-nil and
   `unify_char' yields a non-negative character for it, C1 and C2 are
   overwritten with the components of that character and its charset
   is used instead.  The character is then handed to the
   DECODE_CHARACTER_XXX macro matching the dimension of the resulting
   charset; ASCII and a negative charset both go through
   DECODE_CHARACTER_ASCII.  */
#define DECODE_SJIS_BIG5_CHARACTER(charset, c1, c2) \
  do { \
    int charset_alt = (charset); \
    int c_alt; \
 \
    if (!NILP (unification_table)) \
      { \
        c_alt = unify_char (unification_table, -1, (charset), c1, c2); \
        if (c_alt >= 0) \
          SPLIT_CHAR (c_alt, charset_alt, c1, c2); \
      } \
    if (charset_alt < 0 || charset_alt == CHARSET_ASCII) \
      { \
        DECODE_CHARACTER_ASCII (c1); \
      } \
    else if (CHARSET_DIMENSION (charset_alt) == 1) \
      { \
        DECODE_CHARACTER_DIMENSION1 (charset_alt, c1); \
      } \
    else \
      { \
        DECODE_CHARACTER_DIMENSION2 (charset_alt, c1, c2); \
      } \
  } while (0)
2051
/* Encode one character of CHARSET with position-codes C1 and C2 (C2
   is a dummy for one-dimensional charsets) and store the result at
   DST.  The expansion uses the variables `dst', `coding', `sjis_p'
   and `unification_table' of the enclosing function.

   If `unification_table' is non-nil and `unify_char' yields a
   non-negative character, that character's charset and position-codes
   replace the given ones (C1 and C2 are overwritten).  Then:

     ASCII                               -> C1                (1 byte)
     katakana-jisx0201 when SJIS_P       -> C1                (1 byte)
     other one-dimensional charset       -> charset, C1       (2 bytes)
     jisx0208 when SJIS_P                -> SJIS pair         (2 bytes)
     big5-1 or big5-2 when !SJIS_P       -> BIG5 pair         (2 bytes)
     other two-dimensional charset       -> charset, C1, C2   (3 bytes)

   For two-dimensional charsets the high bit of C1 and C2 is cleared
   first.  `coding->fake_multibyte' is set in every case except the
   ASCII, katakana-jisx0201 and BIG5 ones.  `coding->consumed_char' is
   incremented once.

   The expansion deliberately has no trailing semicolon, so that
   `ENCODE_SJIS_BIG5_CHARACTER (...);' behaves as exactly one
   statement, even in an unbraced if/else.  */
#define ENCODE_SJIS_BIG5_CHARACTER(charset, c1, c2) \
  do { \
    int c_alt, charset_alt; \
 \
    if (!NILP (unification_table) \
        && ((c_alt = unify_char (unification_table, -1, charset, c1, c2)) \
            >= 0)) \
      SPLIT_CHAR (c_alt, charset_alt, c1, c2); \
    else \
      charset_alt = charset; \
    if (charset_alt == charset_ascii) \
      *dst++ = c1; \
    else if (CHARSET_DIMENSION (charset_alt) == 1) \
      { \
        if (sjis_p && charset_alt == charset_katakana_jisx0201) \
          *dst++ = c1; \
        else \
          { \
            /* Not representable: emit the internal form as is.  */ \
            *dst++ = charset_alt, *dst++ = c1; \
            coding->fake_multibyte = 1; \
          } \
      } \
    else \
      { \
        c1 &= 0x7F, c2 &= 0x7F; \
        if (sjis_p && charset_alt == charset_jisx0208) \
          { \
            unsigned char s1, s2; \
 \
            ENCODE_SJIS (c1, c2, s1, s2); \
            *dst++ = s1, *dst++ = s2; \
            coding->fake_multibyte = 1; \
          } \
        else if (!sjis_p \
                 && (charset_alt == charset_big5_1 \
                     || charset_alt == charset_big5_2)) \
          { \
            unsigned char b1, b2; \
 \
            ENCODE_BIG5 (charset_alt, c1, c2, b1, b2); \
            *dst++ = b1, *dst++ = b2; \
          } \
        else \
          { \
            /* Not representable: this is the 3-byte worst case.  */ \
            *dst++ = charset_alt, *dst++ = c1, *dst++ = c2; \
            coding->fake_multibyte = 1; \
          } \
      } \
    coding->consumed_char++; \
  } while (0)
2101
2102 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2103 Check if a text is encoded in SJIS. If it is, return
2104 CODING_CATEGORY_MASK_SJIS, else return 0. */
2105
2106 int
2107 detect_coding_sjis (src, src_end)
2108 unsigned char *src, *src_end;
2109 {
2110 unsigned char c;
2111
2112 while (src < src_end)
2113 {
2114 c = *src++;
2115 if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
2116 {
2117 if (src < src_end && *src++ < 0x40)
2118 return 0;
2119 }
2120 }
2121 return CODING_CATEGORY_MASK_SJIS;
2122 }
2123
2124 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2125 Check if a text is encoded in BIG5. If it is, return
2126 CODING_CATEGORY_MASK_BIG5, else return 0. */
2127
2128 int
2129 detect_coding_big5 (src, src_end)
2130 unsigned char *src, *src_end;
2131 {
2132 unsigned char c;
2133
2134 while (src < src_end)
2135 {
2136 c = *src++;
2137 if (c >= 0xA1)
2138 {
2139 if (src >= src_end)
2140 break;
2141 c = *src++;
2142 if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
2143 return 0;
2144 }
2145 }
2146 return CODING_CATEGORY_MASK_BIG5;
2147 }
2148
2149 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
2150 If SJIS_P is 1, decode SJIS text, else decode BIG5 test. */
2151
2152 int
2153 decode_coding_sjis_big5 (coding, source, destination,
2154 src_bytes, dst_bytes, sjis_p)
2155 struct coding_system *coding;
2156 unsigned char *source, *destination;
2157 int src_bytes, dst_bytes;
2158 int sjis_p;
2159 {
2160 unsigned char *src = source;
2161 unsigned char *src_end = source + src_bytes;
2162 unsigned char *dst = destination;
2163 unsigned char *dst_end = destination + dst_bytes;
2164 /* Since the maximum bytes produced by each loop is 4, we subtract 3
2165 from DST_END to assure overflow checking is necessary only at the
2166 head of loop. */
2167 unsigned char *adjusted_dst_end = dst_end - 3;
2168 Lisp_Object unification_table
2169 = coding->character_unification_table_for_decode;
2170 int result = CODING_FINISH_NORMAL;
2171
2172 if (!NILP (Venable_character_unification) && NILP (unification_table))
2173 unification_table = Vstandard_character_unification_table_for_decode;
2174
2175 coding->produced_char = 0;
2176 coding->fake_multibyte = 0;
2177 while (src < src_end && (dst_bytes
2178 ? (dst < adjusted_dst_end)
2179 : (dst < src - 3)))
2180 {
2181 /* SRC_BASE remembers the start position in source in each loop.
2182 The loop will be exited when there's not enough source text
2183 to analyze two-byte character (within macro ONE_MORE_BYTE).
2184 In that case, SRC is reset to SRC_BASE before exiting. */
2185 unsigned char *src_base = src;
2186 unsigned char c1 = *src++, c2, c3, c4;
2187
2188 if (c1 < 0x20)
2189 {
2190 if (c1 == '\r')
2191 {
2192 if (coding->eol_type == CODING_EOL_CRLF)
2193 {
2194 ONE_MORE_BYTE (c2);
2195 if (c2 == '\n')
2196 *dst++ = c2;
2197 else if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2198 {
2199 result = CODING_FINISH_INCONSISTENT_EOL;
2200 goto label_end_of_loop_2;
2201 }
2202 else
2203 /* To process C2 again, SRC is subtracted by 1. */
2204 *dst++ = c1, src--;
2205 }
2206 else if (coding->eol_type == CODING_EOL_CR)
2207 *dst++ = '\n';
2208 else
2209 *dst++ = c1;
2210 }
2211 else if (c1 == '\n'
2212 && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2213 && (coding->eol_type == CODING_EOL_CR
2214 || coding->eol_type == CODING_EOL_CRLF))
2215 {
2216 result = CODING_FINISH_INCONSISTENT_EOL;
2217 goto label_end_of_loop_2;
2218 }
2219 else
2220 *dst++ = c1;
2221 coding->produced_char++;
2222 }
2223 else if (c1 < 0x80)
2224 DECODE_SJIS_BIG5_CHARACTER (charset_ascii, c1, /* dummy */ c2);
2225 else if (c1 < 0xA0)
2226 {
2227 /* SJIS -> JISX0208 */
2228 if (sjis_p)
2229 {
2230 ONE_MORE_BYTE (c2);
2231 if (c2 >= 0x40)
2232 {
2233 DECODE_SJIS (c1, c2, c3, c4);
2234 DECODE_SJIS_BIG5_CHARACTER (charset_jisx0208, c3, c4);
2235 }
2236 else
2237 goto label_invalid_code_2;
2238 }
2239 else
2240 goto label_invalid_code_1;
2241 }
2242 else if (c1 < 0xE0)
2243 {
2244 /* SJIS -> JISX0201-Kana, BIG5 -> Big5 */
2245 if (sjis_p)
2246 DECODE_SJIS_BIG5_CHARACTER (charset_katakana_jisx0201, c1,
2247 /* dummy */ c2);
2248 else
2249 {
2250 int charset;
2251
2252 ONE_MORE_BYTE (c2);
2253 if ((c2 >= 0x40 && c2 <= 0x7E) || (c2 >= 0xA1 && c2 <= 0xFE))
2254 {
2255 DECODE_BIG5 (c1, c2, charset, c3, c4);
2256 DECODE_SJIS_BIG5_CHARACTER (charset, c3, c4);
2257 }
2258 else
2259 goto label_invalid_code_2;
2260 }
2261 }
2262 else /* C1 >= 0xE0 */
2263 {
2264 /* SJIS -> JISX0208, BIG5 -> Big5 */
2265 if (sjis_p)
2266 {
2267 ONE_MORE_BYTE (c2);
2268 if (c2 >= 0x40)
2269 {
2270 DECODE_SJIS (c1, c2, c3, c4);
2271 DECODE_SJIS_BIG5_CHARACTER (charset_jisx0208, c3, c4);
2272 }
2273 else
2274 goto label_invalid_code_2;
2275 }
2276 else
2277 {
2278 int charset;
2279
2280 ONE_MORE_BYTE (c2);
2281 if ((c2 >= 0x40 && c2 <= 0x7E) || (c2 >= 0xA1 && c2 <= 0xFE))
2282 {
2283 DECODE_BIG5 (c1, c2, charset, c3, c4);
2284 DECODE_SJIS_BIG5_CHARACTER (charset, c3, c4);
2285 }
2286 else
2287 goto label_invalid_code_2;
2288 }
2289 }
2290 continue;
2291
2292 label_invalid_code_1:
2293 *dst++ = c1;
2294 coding->produced_char++;
2295 coding->fake_multibyte = 1;
2296 continue;
2297
2298 label_invalid_code_2:
2299 *dst++ = c1; *dst++= c2;
2300 coding->produced_char += 2;
2301 coding->fake_multibyte = 1;
2302 continue;
2303
2304 label_end_of_loop:
2305 result = CODING_FINISH_INSUFFICIENT_SRC;
2306 label_end_of_loop_2:
2307 src = src_base;
2308 break;
2309 }
2310
2311 if (src < src_end)
2312 {
2313 if (result == CODING_FINISH_NORMAL)
2314 result = CODING_FINISH_INSUFFICIENT_DST;
2315 else if (result != CODING_FINISH_INCONSISTENT_EOL
2316 && coding->mode & CODING_MODE_LAST_BLOCK)
2317 {
2318 src_bytes = src_end - src;
2319 if (dst_bytes && (dst_end - dst < src_bytes))
2320 src_bytes = dst_end - dst;
2321 bcopy (dst, src, src_bytes);
2322 src += src_bytes;
2323 dst += src_bytes;
2324 coding->fake_multibyte = 1;
2325 }
2326 }
2327
2328 coding->consumed = coding->consumed_char = src - source;
2329 coding->produced = dst - destination;
2330 return result;
2331 }
2332
2333 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
2334 This function can encode `charset_ascii', `charset_katakana_jisx0201',
2335 `charset_jisx0208', `charset_big5_1', and `charset_big5-2'. We are
2336 sure that all these charsets are registered as official charset
2337 (i.e. do not have extended leading-codes). Characters of other
2338 charsets are produced without any encoding. If SJIS_P is 1, encode
2339 SJIS text, else encode BIG5 text. */
2340
2341 int
2342 encode_coding_sjis_big5 (coding, source, destination,
2343 src_bytes, dst_bytes, sjis_p)
2344 struct coding_system *coding;
2345 unsigned char *source, *destination;
2346 int src_bytes, dst_bytes;
2347 int sjis_p;
2348 {
2349 unsigned char *src = source;
2350 unsigned char *src_end = source + src_bytes;
2351 unsigned char *dst = destination;
2352 unsigned char *dst_end = destination + dst_bytes;
2353 /* Since the maximum bytes produced by each loop is 2, we subtract 1
2354 from DST_END to assure overflow checking is necessary only at the
2355 head of loop. */
2356 unsigned char *adjusted_dst_end = dst_end - 1;
2357 Lisp_Object unification_table
2358 = coding->character_unification_table_for_encode;
2359 int result = CODING_FINISH_NORMAL;
2360
2361 if (!NILP (Venable_character_unification) && NILP (unification_table))
2362 unification_table = Vstandard_character_unification_table_for_encode;
2363
2364 coding->consumed_char = 0;
2365 coding->fake_multibyte = 0;
2366 while (src < src_end && (dst_bytes
2367 ? (dst < adjusted_dst_end)
2368 : (dst < src - 1)))
2369 {
2370 /* SRC_BASE remembers the start position in source in each loop.
2371 The loop will be exited when there's not enough source text
2372 to analyze multi-byte codes (within macros ONE_MORE_BYTE and
2373 TWO_MORE_BYTES). In that case, SRC is reset to SRC_BASE
2374 before exiting. */
2375 unsigned char *src_base = src;
2376 unsigned char c1 = *src++, c2, c3, c4;
2377
2378 if (coding->composing)
2379 {
2380 if (c1 == 0xA0)
2381 {
2382 ONE_MORE_BYTE (c1);
2383 c1 &= 0x7F;
2384 }
2385 else if (c1 >= 0xA0)
2386 c1 -= 0x20;
2387 else
2388 coding->composing = 0;
2389 }
2390
2391 switch (emacs_code_class[c1])
2392 {
2393 case EMACS_ascii_code:
2394 ENCODE_SJIS_BIG5_CHARACTER (charset_ascii, c1, /* dummy */ c2);
2395 break;
2396
2397 case EMACS_control_code:
2398 *dst++ = c1;
2399 coding->consumed_char++;
2400 break;
2401
2402 case EMACS_carriage_return_code:
2403 if (! (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
2404 {
2405 *dst++ = c1;
2406 coding->consumed_char++;
2407 break;
2408 }
2409 /* fall down to treat '\r' as '\n' ... */
2410
2411 case EMACS_linefeed_code:
2412 if (coding->eol_type == CODING_EOL_LF
2413 || coding->eol_type == CODING_EOL_UNDECIDED)
2414 *dst++ = '\n';
2415 else if (coding->eol_type == CODING_EOL_CRLF)
2416 *dst++ = '\r', *dst++ = '\n';
2417 else
2418 *dst++ = '\r';
2419 coding->consumed_char++;
2420 break;
2421
2422 case EMACS_leading_code_2:
2423 ONE_MORE_BYTE (c2);
2424 ENCODE_SJIS_BIG5_CHARACTER (c1, c2, /* dummy */ c3);
2425 break;
2426
2427 case EMACS_leading_code_3:
2428 TWO_MORE_BYTES (c2, c3);
2429 ENCODE_SJIS_BIG5_CHARACTER (c1, c2, c3);
2430 break;
2431
2432 case EMACS_leading_code_4:
2433 THREE_MORE_BYTES (c2, c3, c4);
2434 ENCODE_SJIS_BIG5_CHARACTER (c2, c3, c4);
2435 break;
2436
2437 case EMACS_leading_code_composition:
2438 coding->composing = 1;
2439 break;
2440
2441 default: /* i.e. case EMACS_invalid_code: */
2442 *dst++ = c1;
2443 coding->consumed_char++;
2444 }
2445 continue;
2446
2447 label_end_of_loop:
2448 result = CODING_FINISH_INSUFFICIENT_SRC;
2449 src = src_base;
2450 break;
2451 }
2452
2453 if (result == CODING_FINISH_NORMAL
2454 && src < src_end)
2455 result = CODING_FINISH_INSUFFICIENT_DST;
2456 coding->consumed = src - source;
2457 coding->produced = coding->produced_char = dst - destination;
2458 return result;
2459 }
2460
2461 \f
2462 /*** 5. End-of-line handlers ***/
2463
2464 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
2465 This function is called only when `coding->eol_type' is
2466 CODING_EOL_CRLF or CODING_EOL_CR. */
2467
2468 decode_eol (coding, source, destination, src_bytes, dst_bytes)
2469 struct coding_system *coding;
2470 unsigned char *source, *destination;
2471 int src_bytes, dst_bytes;
2472 {
2473 unsigned char *src = source;
2474 unsigned char *src_end = source + src_bytes;
2475 unsigned char *dst = destination;
2476 unsigned char *dst_end = destination + dst_bytes;
2477 unsigned char c;
2478 int result = CODING_FINISH_NORMAL;
2479
2480 coding->fake_multibyte = 0;
2481
2482 if (src_bytes <= 0)
2483 return result;
2484
2485 switch (coding->eol_type)
2486 {
2487 case CODING_EOL_CRLF:
2488 {
2489 /* Since the maximum bytes produced by each loop is 2, we
2490 subtract 1 from DST_END to assure overflow checking is
2491 necessary only at the head of loop. */
2492 unsigned char *adjusted_dst_end = dst_end - 1;
2493
2494 while (src < src_end && (dst_bytes
2495 ? (dst < adjusted_dst_end)
2496 : (dst < src - 1)))
2497 {
2498 unsigned char *src_base = src;
2499
2500 c = *src++;
2501 if (c == '\r')
2502 {
2503 ONE_MORE_BYTE (c);
2504 if (c != '\n')
2505 {
2506 if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2507 {
2508 result = CODING_FINISH_INCONSISTENT_EOL;
2509 goto label_end_of_loop_2;
2510 }
2511 *dst++ = '\r';
2512 if (BASE_LEADING_CODE_P (c))
2513 coding->fake_multibyte = 1;
2514 }
2515 *dst++ = c;
2516 }
2517 else if (c == '\n'
2518 && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL))
2519 {
2520 result = CODING_FINISH_INCONSISTENT_EOL;
2521 goto label_end_of_loop_2;
2522 }
2523 else
2524 {
2525 *dst++ = c;
2526 if (BASE_LEADING_CODE_P (c))
2527 coding->fake_multibyte = 1;
2528 }
2529 continue;
2530
2531 label_end_of_loop:
2532 result = CODING_FINISH_INSUFFICIENT_SRC;
2533 label_end_of_loop_2:
2534 src = src_base;
2535 break;
2536 }
2537 if (result == CODING_FINISH_NORMAL
2538 && src < src_end)
2539 result = CODING_FINISH_INSUFFICIENT_DST;
2540 }
2541 break;
2542
2543 case CODING_EOL_CR:
2544 if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2545 {
2546 while (src < src_end)
2547 {
2548 if ((c = *src++) == '\n')
2549 break;
2550 if (BASE_LEADING_CODE_P (c))
2551 coding->fake_multibyte = 1;
2552 }
2553 if (*--src == '\n')
2554 {
2555 src_bytes = src - source;
2556 result = CODING_FINISH_INCONSISTENT_EOL;
2557 }
2558 }
2559 if (dst_bytes && src_bytes > dst_bytes)
2560 {
2561 result = CODING_FINISH_INSUFFICIENT_DST;
2562 src_bytes = dst_bytes;
2563 }
2564 if (dst_bytes)
2565 bcopy (source, destination, src_bytes);
2566 else
2567 safe_bcopy (source, destination, src_bytes);
2568 src = source + src_bytes;
2569 while (src_bytes--) if (*dst++ == '\r') dst[-1] = '\n';
2570 break;
2571
2572 default: /* i.e. case: CODING_EOL_LF */
2573 if (dst_bytes && src_bytes > dst_bytes)
2574 {
2575 result = CODING_FINISH_INSUFFICIENT_DST;
2576 src_bytes = dst_bytes;
2577 }
2578 if (dst_bytes)
2579 bcopy (source, destination, src_bytes);
2580 else
2581 safe_bcopy (source, destination, src_bytes);
2582 src += src_bytes;
2583 dst += dst_bytes;
2584 coding->fake_multibyte = 1;
2585 break;
2586 }
2587
2588 coding->consumed = coding->consumed_char = src - source;
2589 coding->produced = coding->produced_char = dst - destination;
2590 return result;
2591 }
2592
2593 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions". Encode
2594 format of end-of-line according to `coding->eol_type'. If
2595 `coding->mode & CODING_MODE_SELECTIVE_DISPLAY' is nonzero, code
2596 '\r' in source text also means end-of-line. */
2597
2598 encode_eol (coding, source, destination, src_bytes, dst_bytes)
2599 struct coding_system *coding;
2600 unsigned char *source, *destination;
2601 int src_bytes, dst_bytes;
2602 {
2603 unsigned char *src = source;
2604 unsigned char *dst = destination;
2605 int result = CODING_FINISH_NORMAL;
2606
2607 coding->fake_multibyte = 0;
2608
2609 if (coding->eol_type == CODING_EOL_CRLF)
2610 {
2611 unsigned char c;
2612 unsigned char *src_end = source + src_bytes;
2613 unsigned char *dst_end = destination + dst_bytes;
2614 /* Since the maximum bytes produced by each loop is 2, we
2615 subtract 1 from DST_END to assure overflow checking is
2616 necessary only at the head of loop. */
2617 unsigned char *adjusted_dst_end = dst_end - 1;
2618
2619 while (src < src_end && (dst_bytes
2620 ? (dst < adjusted_dst_end)
2621 : (dst < src - 1)))
2622 {
2623 c = *src++;
2624 if (c == '\n'
2625 || (c == '\r' && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY)))
2626 *dst++ = '\r', *dst++ = '\n';
2627 else
2628 {
2629 *dst++ = c;
2630 if (BASE_LEADING_CODE_P (c))
2631 coding->fake_multibyte = 1;
2632 }
2633 }
2634 if (src < src_end)
2635 result = CODING_FINISH_INSUFFICIENT_DST;
2636 }
2637 else
2638 {
2639 unsigned char c;
2640
2641 if (dst_bytes && src_bytes > dst_bytes)
2642 {
2643 src_bytes = dst_bytes;
2644 result = CODING_FINISH_INSUFFICIENT_DST;
2645 }
2646 if (dst_bytes)
2647 bcopy (source, destination, src_bytes);
2648 else
2649 {
2650 safe_bcopy (source, destination, src_bytes);
2651 dst_bytes = src_bytes;
2652 }
2653 if (coding->eol_type == CODING_EOL_CRLF)
2654 {
2655 while (src_bytes--)
2656 {
2657 if ((c = *dst++) == '\n')
2658 dst[-1] = '\r';
2659 else if (BASE_LEADING_CODE_P (c))
2660 coding->fake_multibyte = 1;
2661 }
2662 }
2663 else
2664 {
2665 if (coding->mode & CODING_MODE_SELECTIVE_DISPLAY)
2666 {
2667 while (src_bytes--)
2668 if (*dst++ == '\r') dst[-1] = '\n';
2669 }
2670 coding->fake_multibyte = 1;
2671 }
2672 src = source + dst_bytes;
2673 dst = destination + dst_bytes;
2674 }
2675
2676 coding->consumed = coding->consumed_char = src - source;
2677 coding->produced = coding->produced_char = dst - destination;
2678 return result;
2679 }
2680
2681 \f
2682 /*** 6. C library functions ***/
2683
/* In Emacs Lisp, a coding system is represented by a Lisp symbol which
   has a property `coding-system'.  The value of this property is a
   vector of length 5 (called a coding-vector).  Among the elements of
   this vector, the first (element[0]) and the fifth (element[4])
   carry important information for decoding/encoding.  Before
   decoding/encoding, this information should be set in fields of a
   structure of type `coding_system'.
2691
2692 A value of property `coding-system' can be a symbol of another
2693 subsidiary coding-system. In that case, Emacs gets coding-vector
2694 from that symbol.
2695
2696 `element[0]' contains information to be set in `coding->type'. The
2697 value and its meaning is as follows:
2698
2699 0 -- coding_type_emacs_mule
2700 1 -- coding_type_sjis
2701 2 -- coding_type_iso2022
2702 3 -- coding_type_big5
2703 4 -- coding_type_ccl encoder/decoder written in CCL
2704 nil -- coding_type_no_conversion
2705 t -- coding_type_undecided (automatic conversion on decoding,
2706 no-conversion on encoding)
2707
2708 `element[4]' contains information to be set in `coding->flags' and
2709 `coding->spec'. The meaning varies by `coding->type'.
2710
2711 If `coding->type' is `coding_type_iso2022', element[4] is a vector
2712 of length 32 (of which the first 13 sub-elements are used now).
2713 Meanings of these sub-elements are:
2714
2715 sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
2716 If the value is an integer of valid charset, the charset is
2717 assumed to be designated to graphic register N initially.
2718
2719 If the value is minus, it is a minus value of charset which
2720 reserves graphic register N, which means that the charset is
2721 not designated initially but should be designated to graphic
2722 register N just before encoding a character in that charset.
2723
2724 If the value is nil, graphic register N is never used on
2725 encoding.
2726
2727 sub-element[N] where N is 4 through 11: to be set in `coding->flags'
2728 Each value takes t or nil. See the section ISO2022 of
2729 `coding.h' for more information.
2730
2731 If `coding->type' is `coding_type_big5', element[4] is t to denote
2732 BIG5-ETen or nil to denote BIG5-HKU.
2733
2734 If `coding->type' takes the other value, element[4] is ignored.
2735
2736 Emacs Lisp's coding system also carries information about format of
2737 end-of-line in a value of property `eol-type'. If the value is
2738 integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
2739 means CODING_EOL_CR. If it is not integer, it should be a vector
2740 of subsidiary coding systems of which property `eol-type' has one
2741 of above values.
2742
2743 */
2744
2745 /* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
2746 and set it in CODING. If CODING_SYSTEM_SYMBOL is invalid, CODING
2747 is setup so that no conversion is necessary and return -1, else
2748 return 0. */
2749
2750 int
2751 setup_coding_system (coding_system, coding)
2752 Lisp_Object coding_system;
2753 struct coding_system *coding;
2754 {
2755 Lisp_Object coding_spec, coding_type, eol_type, plist;
2756 Lisp_Object val;
2757 int i;
2758
2759 /* Initialize some fields required for all kinds of coding systems. */
2760 coding->symbol = coding_system;
2761 coding->common_flags = 0;
2762 coding->mode = 0;
2763 coding->heading_ascii = -1;
2764 coding->post_read_conversion = coding->pre_write_conversion = Qnil;
2765 coding_spec = Fget (coding_system, Qcoding_system);
2766 if (!VECTORP (coding_spec)
2767 || XVECTOR (coding_spec)->size != 5
2768 || !CONSP (XVECTOR (coding_spec)->contents[3]))
2769 goto label_invalid_coding_system;
2770
2771 eol_type = inhibit_eol_conversion ? Qnil : Fget (coding_system, Qeol_type);
2772 if (VECTORP (eol_type))
2773 {
2774 coding->eol_type = CODING_EOL_UNDECIDED;
2775 coding->common_flags = CODING_REQUIRE_DETECTION_MASK;
2776 }
2777 else if (XFASTINT (eol_type) == 1)
2778 {
2779 coding->eol_type = CODING_EOL_CRLF;
2780 coding->common_flags
2781 = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
2782 }
2783 else if (XFASTINT (eol_type) == 2)
2784 {
2785 coding->eol_type = CODING_EOL_CR;
2786 coding->common_flags
2787 = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
2788 }
2789 else
2790 coding->eol_type = CODING_EOL_LF;
2791
2792 coding_type = XVECTOR (coding_spec)->contents[0];
2793 /* Try short cut. */
2794 if (SYMBOLP (coding_type))
2795 {
2796 if (EQ (coding_type, Qt))
2797 {
2798 coding->type = coding_type_undecided;
2799 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
2800 }
2801 else
2802 coding->type = coding_type_no_conversion;
2803 return 0;
2804 }
2805
2806 /* Initialize remaining fields. */
2807 coding->composing = 0;
2808 coding->character_unification_table_for_decode = Qnil;
2809 coding->character_unification_table_for_encode = Qnil;
2810
2811 /* Get values of coding system properties:
2812 `post-read-conversion', `pre-write-conversion',
2813 `character-unification-table-for-decode',
2814 `character-unification-table-for-encode'. */
2815 plist = XVECTOR (coding_spec)->contents[3];
2816 coding->post_read_conversion = Fplist_get (plist, Qpost_read_conversion);
2817 coding->pre_write_conversion = Fplist_get (plist, Qpre_write_conversion);
2818 val = Fplist_get (plist, Qcharacter_unification_table_for_decode);
2819 if (SYMBOLP (val))
2820 val = Fget (val, Qcharacter_unification_table_for_decode);
2821 coding->character_unification_table_for_decode
2822 = CHAR_TABLE_P (val) ? val : Qnil;
2823 val = Fplist_get (plist, Qcharacter_unification_table_for_encode);
2824 if (SYMBOLP (val))
2825 val = Fget (val, Qcharacter_unification_table_for_encode);
2826 coding->character_unification_table_for_encode
2827 = CHAR_TABLE_P (val) ? val : Qnil;
2828 val = Fplist_get (plist, Qcoding_category);
2829 if (!NILP (val))
2830 {
2831 val = Fget (val, Qcoding_category_index);
2832 if (INTEGERP (val))
2833 coding->category_idx = XINT (val);
2834 else
2835 goto label_invalid_coding_system;
2836 }
2837 else
2838 goto label_invalid_coding_system;
2839
2840 val = Fplist_get (plist, Qsafe_charsets);
2841 if (EQ (val, Qt))
2842 {
2843 for (i = 0; i <= MAX_CHARSET; i++)
2844 coding->safe_charsets[i] = 1;
2845 }
2846 else
2847 {
2848 bzero (coding->safe_charsets, MAX_CHARSET + 1);
2849 while (CONSP (val))
2850 {
2851 if ((i = get_charset_id (XCONS (val)->car)) >= 0)
2852 coding->safe_charsets[i] = 1;
2853 val = XCONS (val)->cdr;
2854 }
2855 }
2856
2857 switch (XFASTINT (coding_type))
2858 {
2859 case 0:
2860 coding->type = coding_type_emacs_mule;
2861 if (!NILP (coding->post_read_conversion))
2862 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
2863 if (!NILP (coding->pre_write_conversion))
2864 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
2865 break;
2866
2867 case 1:
2868 coding->type = coding_type_sjis;
2869 coding->common_flags
2870 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
2871 break;
2872
2873 case 2:
2874 coding->type = coding_type_iso2022;
2875 coding->common_flags
2876 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
2877 {
2878 Lisp_Object val, temp;
2879 Lisp_Object *flags;
2880 int i, charset, reg_bits = 0;
2881
2882 val = XVECTOR (coding_spec)->contents[4];
2883
2884 if (!VECTORP (val) || XVECTOR (val)->size != 32)
2885 goto label_invalid_coding_system;
2886
2887 flags = XVECTOR (val)->contents;
2888 coding->flags
2889 = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
2890 | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
2891 | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
2892 | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
2893 | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
2894 | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
2895 | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
2896 | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
2897 | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION)
2898 | (NILP (flags[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL)
2899 | (NILP (flags[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL)
2900 | (NILP (flags[15]) ? 0 : CODING_FLAG_ISO_SAFE)
2901 | (NILP (flags[16]) ? 0 : CODING_FLAG_ISO_LATIN_EXTRA)
2902 );
2903
2904 /* Invoke graphic register 0 to plane 0. */
2905 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
2906 /* Invoke graphic register 1 to plane 1 if we can use full 8-bit. */
2907 CODING_SPEC_ISO_INVOCATION (coding, 1)
2908 = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
2909 /* Not single shifting at first. */
2910 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;
2911 /* Beginning of buffer should also be regarded as bol. */
2912 CODING_SPEC_ISO_BOL (coding) = 1;
2913
2914 for (charset = 0; charset <= MAX_CHARSET; charset++)
2915 CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = 255;
2916 val = Vcharset_revision_alist;
2917 while (CONSP (val))
2918 {
2919 charset = get_charset_id (Fcar_safe (XCONS (val)->car));
2920 if (charset >= 0
2921 && (temp = Fcdr_safe (XCONS (val)->car), INTEGERP (temp))
2922 && (i = XINT (temp), (i >= 0 && (i + '@') < 128)))
2923 CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = i;
2924 val = XCONS (val)->cdr;
2925 }
2926
2927 /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
2928 FLAGS[REG] can be one of below:
2929 integer CHARSET: CHARSET occupies register I,
2930 t: designate nothing to REG initially, but can be used
2931 by any charsets,
2932 list of integer, nil, or t: designate the first
2933 element (if integer) to REG initially, the remaining
2934 elements (if integer) is designated to REG on request,
2935 if an element is t, REG can be used by any charsets,
2936 nil: REG is never used. */
2937 for (charset = 0; charset <= MAX_CHARSET; charset++)
2938 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2939 = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION;
2940 for (i = 0; i < 4; i++)
2941 {
2942 if (INTEGERP (flags[i])
2943 && (charset = XINT (flags[i]), CHARSET_VALID_P (charset))
2944 || (charset = get_charset_id (flags[i])) >= 0)
2945 {
2946 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
2947 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
2948 }
2949 else if (EQ (flags[i], Qt))
2950 {
2951 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2952 reg_bits |= 1 << i;
2953 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
2954 }
2955 else if (CONSP (flags[i]))
2956 {
2957 Lisp_Object tail = flags[i];
2958
2959 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
2960 if (INTEGERP (XCONS (tail)->car)
2961 && (charset = XINT (XCONS (tail)->car),
2962 CHARSET_VALID_P (charset))
2963 || (charset = get_charset_id (XCONS (tail)->car)) >= 0)
2964 {
2965 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
2966 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
2967 }
2968 else
2969 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2970 tail = XCONS (tail)->cdr;
2971 while (CONSP (tail))
2972 {
2973 if (INTEGERP (XCONS (tail)->car)
2974 && (charset = XINT (XCONS (tail)->car),
2975 CHARSET_VALID_P (charset))
2976 || (charset = get_charset_id (XCONS (tail)->car)) >= 0)
2977 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2978 = i;
2979 else if (EQ (XCONS (tail)->car, Qt))
2980 reg_bits |= 1 << i;
2981 tail = XCONS (tail)->cdr;
2982 }
2983 }
2984 else
2985 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2986
2987 CODING_SPEC_ISO_DESIGNATION (coding, i)
2988 = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
2989 }
2990
2991 if (reg_bits && ! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
2992 {
2993 /* REG 1 can be used only by locking shift in 7-bit env. */
2994 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
2995 reg_bits &= ~2;
2996 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
2997 /* Without any shifting, only REG 0 and 1 can be used. */
2998 reg_bits &= 3;
2999 }
3000
3001 if (reg_bits)
3002 for (charset = 0; charset <= MAX_CHARSET; charset++)
3003 {
3004 if (CHARSET_VALID_P (charset))
3005 {
3006 /* There exist some default graphic registers to be
3007 used CHARSET. */
3008
3009 /* We had better avoid designating a charset of
3010 CHARS96 to REG 0 as far as possible. */
3011 if (CHARSET_CHARS (charset) == 96)
3012 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3013 = (reg_bits & 2
3014 ? 1 : (reg_bits & 4 ? 2 : (reg_bits & 8 ? 3 : 0)));
3015 else
3016 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3017 = (reg_bits & 1
3018 ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
3019 }
3020 }
3021 }
3022 coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
3023 coding->spec.iso2022.last_invalid_designation_register = -1;
3024 break;
3025
3026 case 3:
3027 coding->type = coding_type_big5;
3028 coding->common_flags
3029 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3030 coding->flags
3031 = (NILP (XVECTOR (coding_spec)->contents[4])
3032 ? CODING_FLAG_BIG5_HKU
3033 : CODING_FLAG_BIG5_ETEN);
3034 break;
3035
3036 case 4:
3037 coding->type = coding_type_ccl;
3038 coding->common_flags
3039 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3040 {
3041 Lisp_Object val = XVECTOR (coding_spec)->contents[4];
3042 if (CONSP (val)
3043 && VECTORP (XCONS (val)->car)
3044 && VECTORP (XCONS (val)->cdr))
3045 {
3046 setup_ccl_program (&(coding->spec.ccl.decoder), XCONS (val)->car);
3047 setup_ccl_program (&(coding->spec.ccl.encoder), XCONS (val)->cdr);
3048 }
3049 else
3050 goto label_invalid_coding_system;
3051 }
3052 coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
3053 break;
3054
3055 case 5:
3056 coding->type = coding_type_raw_text;
3057 break;
3058
3059 default:
3060 goto label_invalid_coding_system;
3061 }
3062 return 0;
3063
3064 label_invalid_coding_system:
3065 coding->type = coding_type_no_conversion;
3066 coding->category_idx = CODING_CATEGORY_IDX_BINARY;
3067 coding->common_flags = 0;
3068 coding->eol_type = CODING_EOL_LF;
3069 coding->pre_write_conversion = coding->post_read_conversion = Qnil;
3070 return -1;
3071 }
3072
3073 /* Emacs has a mechanism to automatically detect a coding system if it
3074 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
3075 it's impossible to distinguish some coding systems accurately
3076 because they use the same range of codes. So, at first, coding
3077 systems are grouped into the following categories:
3078
3079 o coding-category-emacs-mule
3080
3081 The category for a coding system which has the same code range
3082 as Emacs' internal format. Assigned the coding-system (Lisp
3083 symbol) `emacs-mule' by default.
3084
3085 o coding-category-sjis
3086
3087 The category for a coding system which has the same code range
3088 as SJIS. Assigned the coding-system (Lisp
3089 symbol) `japanese-shift-jis' by default.
3090
3091 o coding-category-iso-7
3092
3093 The category for a coding system which has the same code range
3094 as ISO2022 of 7-bit environment. This doesn't use any locking
3095 shift and single shift functions. This can encode/decode all
3096 charsets. Assigned the coding-system (Lisp symbol)
3097 `iso-2022-7bit' by default.
3098
3099 o coding-category-iso-7-tight
3100
3101 Same as coding-category-iso-7 except that this can
3102 encode/decode only the specified charsets.
3103
3104 o coding-category-iso-8-1
3105
3106 The category for a coding system which has the same code range
3107 as ISO2022 of 8-bit environment and graphic plane 1 used only
3108 for DIMENSION1 charset. This doesn't use any locking shift
3109 and single shift functions. Assigned the coding-system (Lisp
3110 symbol) `iso-latin-1' by default.
3111
3112 o coding-category-iso-8-2
3113
3114 The category for a coding system which has the same code range
3115 as ISO2022 of 8-bit environment and graphic plane 1 used only
3116 for DIMENSION2 charset. This doesn't use any locking shift
3117 and single shift functions. Assigned the coding-system (Lisp
3118 symbol) `japanese-iso-8bit' by default.
3119
3120 o coding-category-iso-7-else
3121
3122 The category for a coding system which has the same code range
3123 as ISO2022 of 7-bit environment but uses locking shift or
3124 single shift functions. Assigned the coding-system (Lisp
3125 symbol) `iso-2022-7bit-lock' by default.
3126
3127 o coding-category-iso-8-else
3128
3129 The category for a coding system which has the same code range
3130 as ISO2022 of 8-bit environment but uses locking shift or
3131 single shift functions. Assigned the coding-system (Lisp
3132 symbol) `iso-2022-8bit-ss2' by default.
3133
3134 o coding-category-big5
3135
3136 The category for a coding system which has the same code range
3137 as BIG5. Assigned the coding-system (Lisp symbol)
3138 `cn-big5' by default.
3139
3140 o coding-category-binary
3141
3142 The category for a coding system not categorized in any of the
3143 above. Assigned the coding-system (Lisp symbol)
3144 `no-conversion' by default.
3145
3146 Each of them is a Lisp symbol whose value is an actual
3147 `coding-system' (which is also a Lisp symbol) assigned by a user.
3148 What Emacs does actually is to detect a category of coding system.
3149 Then, it uses a `coding-system' assigned to it. If Emacs can't
3150 decide only one possible category, it selects a category of the
3151 highest priority. Priorities of categories are also specified by a
3152 user in a Lisp variable `coding-category-list'.
3153
3154 */
3155
3156 /* Detect how a text of length SRC_BYTES pointed by SOURCE is encoded.
3157 If it detects possible coding systems, return an integer in which
3158 appropriate flag bits are set. Flag bits are defined by macros
3159 CODING_CATEGORY_MASK_XXX in `coding.h'.
3160
3161 How many ASCII characters are at the head is returned as *SKIP. */
3162
3163 static int
3164 detect_coding_mask (source, src_bytes, priorities, skip)
3165 unsigned char *source;
3166 int src_bytes, *priorities, *skip;
3167 {
3168 register unsigned char c;
3169 unsigned char *src = source, *src_end = source + src_bytes;
3170 unsigned int mask = (CODING_CATEGORY_MASK_ISO_7BIT
3171 | CODING_CATEGORY_MASK_ISO_SHIFT);
3172 int i;
3173
3174 /* At first, skip all ASCII characters and control characters except
3175 for three ISO2022 specific control characters. */
3176 label_loop_detect_coding:
3177 while (src < src_end)
3178 {
3179 c = *src;
3180 if (c >= 0x80
3181 || ((mask & CODING_CATEGORY_MASK_ISO_7BIT)
3182 && c == ISO_CODE_ESC)
3183 || ((mask & CODING_CATEGORY_MASK_ISO_SHIFT)
3184 && (c == ISO_CODE_SI || c == ISO_CODE_SO)))
3185 break;
3186 src++;
3187 }
3188 *skip = src - source;
3189
3190 if (src >= src_end)
3191 /* We found nothing other than ASCII. There's nothing to do. */
3192 return 0;
3193
3194 /* The text seems to be encoded in some multilingual coding system.
3195 Now, try to find in which coding system the text is encoded. */
3196 if (c < 0x80)
3197 {
3198 /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
3199 /* C is an ISO2022 specific control code of C0. */
3200 mask = detect_coding_iso2022 (src, src_end);
3201 if (mask == 0)
3202 {
3203 /* No valid ISO2022 code follows C. Try again. */
3204 src++;
3205 mask = (c != ISO_CODE_ESC
3206 ? CODING_CATEGORY_MASK_ISO_7BIT
3207 : CODING_CATEGORY_MASK_ISO_SHIFT);
3208 goto label_loop_detect_coding;
3209 }
3210 if (priorities)
3211 goto label_return_highest_only;
3212 }
3213 else
3214 {
3215 int try;
3216
3217 if (c < 0xA0)
3218 {
3219 /* C is the first byte of SJIS character code,
3220 or a leading-code of Emacs' internal format (emacs-mule). */
3221 try = CODING_CATEGORY_MASK_SJIS | CODING_CATEGORY_MASK_EMACS_MULE;
3222
3223 /* Or, if C is a special latin extra code,
3224 or is an ISO2022 specific control code of C1 (SS2 or SS3),
3225 or is an ISO2022 control-sequence-introducer (CSI),
3226 we should also consider the possibility of ISO2022 codings. */
3227 if ((VECTORP (Vlatin_extra_code_table)
3228 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
3229 || (c == ISO_CODE_SS2 || c == ISO_CODE_SS3)
3230 || (c == ISO_CODE_CSI
3231 && (src < src_end
3232 && (*src == ']'
3233 || ((*src == '0' || *src == '1' || *src == '2')
3234 && src + 1 < src_end
3235 && src[1] == ']')))))
3236 try |= (CODING_CATEGORY_MASK_ISO_8_ELSE
3237 | CODING_CATEGORY_MASK_ISO_8BIT);
3238 }
3239 else
3240 /* C is a character of ISO2022 in graphic plane right,
3241 or a SJIS's 1-byte character code (i.e. JISX0201),
3242 or the first byte of BIG5's 2-byte code. */
3243 try = (CODING_CATEGORY_MASK_ISO_8_ELSE
3244 | CODING_CATEGORY_MASK_ISO_8BIT
3245 | CODING_CATEGORY_MASK_SJIS
3246 | CODING_CATEGORY_MASK_BIG5);
3247
3248 mask = 0;
3249 if (priorities)
3250 {
3251 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3252 {
3253 priorities[i] &= try;
3254 if (priorities[i] & CODING_CATEGORY_MASK_ISO)
3255 mask = detect_coding_iso2022 (src, src_end);
3256 else if (priorities[i] & CODING_CATEGORY_MASK_SJIS)
3257 mask = detect_coding_sjis (src, src_end);
3258 else if (priorities[i] & CODING_CATEGORY_MASK_BIG5)
3259 mask = detect_coding_big5 (src, src_end);
3260 else if (priorities[i] & CODING_CATEGORY_MASK_EMACS_MULE)
3261 mask = detect_coding_emacs_mule (src, src_end);
3262 if (mask)
3263 goto label_return_highest_only;
3264 }
3265 return CODING_CATEGORY_MASK_RAW_TEXT;
3266 }
3267 if (try & CODING_CATEGORY_MASK_ISO)
3268 mask |= detect_coding_iso2022 (src, src_end);
3269 if (try & CODING_CATEGORY_MASK_SJIS)
3270 mask |= detect_coding_sjis (src, src_end);
3271 if (try & CODING_CATEGORY_MASK_BIG5)
3272 mask |= detect_coding_big5 (src, src_end);
3273 if (try & CODING_CATEGORY_MASK_EMACS_MULE)
3274 mask |= detect_coding_emacs_mule (src, src_end);
3275 }
3276 return (mask | CODING_CATEGORY_MASK_RAW_TEXT);
3277
3278 label_return_highest_only:
3279 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3280 {
3281 if (mask & priorities[i])
3282 return priorities[i];
3283 }
3284 return CODING_CATEGORY_MASK_RAW_TEXT;
3285 }
3286
3287 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
3288 The information of the detected coding system is set in CODING. */
3289
3290 void
3291 detect_coding (coding, src, src_bytes)
3292 struct coding_system *coding;
3293 unsigned char *src;
3294 int src_bytes;
3295 {
3296 unsigned int idx;
3297 int skip, mask, i;
3298 int priorities[CODING_CATEGORY_IDX_MAX];
3299 Lisp_Object val = Vcoding_category_list;
3300
3301 i = 0;
3302 while (CONSP (val) && i < CODING_CATEGORY_IDX_MAX)
3303 {
3304 if (! SYMBOLP (XCONS (val)->car))
3305 break;
3306 idx = XFASTINT (Fget (XCONS (val)->car, Qcoding_category_index));
3307 if (idx >= CODING_CATEGORY_IDX_MAX)
3308 break;
3309 priorities[i++] = (1 << idx);
3310 val = XCONS (val)->cdr;
3311 }
3312 /* If coding-category-list is valid and contains all coding
3313 categories, `i' should be CODING_CATEGORY_IDX_MAX now. If not,
3314 the following code saves Emacs from craching. */
3315 while (i < CODING_CATEGORY_IDX_MAX)
3316 priorities[i++] = CODING_CATEGORY_MASK_RAW_TEXT;
3317
3318 mask = detect_coding_mask (src, src_bytes, priorities, &skip);
3319 coding->heading_ascii = skip;
3320
3321 if (!mask) return;
3322
3323 /* We found a single coding system of the highest priority in MASK. */
3324 idx = 0;
3325 while (mask && ! (mask & 1)) mask >>= 1, idx++;
3326 if (! mask)
3327 idx = CODING_CATEGORY_IDX_RAW_TEXT;
3328
3329 val = XSYMBOL (XVECTOR (Vcoding_category_table)->contents[idx])->value;
3330
3331 if (coding->eol_type != CODING_EOL_UNDECIDED)
3332 {
3333 Lisp_Object tmp = Fget (val, Qeol_type);
3334
3335 if (VECTORP (tmp))
3336 val = XVECTOR (tmp)->contents[coding->eol_type];
3337 }
3338 setup_coding_system (val, coding);
3339 /* Set this again because setup_coding_system reset this member. */
3340 coding->heading_ascii = skip;
3341 }
3342
3343 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
3344 SOURCE is encoded. Return one of CODING_EOL_LF, CODING_EOL_CRLF,
3345 CODING_EOL_CR, and CODING_EOL_UNDECIDED.
3346
3347 How many non-eol characters are at the head is returned as *SKIP. */
3348
3349 #define MAX_EOL_CHECK_COUNT 3
3350
3351 static int
3352 detect_eol_type (source, src_bytes, skip)
3353 unsigned char *source;
3354 int src_bytes, *skip;
3355 {
3356 unsigned char *src = source, *src_end = src + src_bytes;
3357 unsigned char c;
3358 int total = 0; /* How many end-of-lines are found so far. */
3359 int eol_type = CODING_EOL_UNDECIDED;
3360 int this_eol_type;
3361
3362 *skip = 0;
3363
3364 while (src < src_end && total < MAX_EOL_CHECK_COUNT)
3365 {
3366 c = *src++;
3367 if (c == '\n' || c == '\r')
3368 {
3369 if (*skip == 0)
3370 *skip = src - 1 - source;
3371 total++;
3372 if (c == '\n')
3373 this_eol_type = CODING_EOL_LF;
3374 else if (src >= src_end || *src != '\n')
3375 this_eol_type = CODING_EOL_CR;
3376 else
3377 this_eol_type = CODING_EOL_CRLF, src++;
3378
3379 if (eol_type == CODING_EOL_UNDECIDED)
3380 /* This is the first end-of-line. */
3381 eol_type = this_eol_type;
3382 else if (eol_type != this_eol_type)
3383 {
3384 /* The found type is different from what found before. */
3385 eol_type = CODING_EOL_INCONSISTENT;
3386 break;
3387 }
3388 }
3389 }
3390
3391 if (*skip == 0)
3392 *skip = src_end - source;
3393 return eol_type;
3394 }
3395
3396 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
3397 is encoded. If it detects an appropriate format of end-of-line, it
3398 sets the information in *CODING. */
3399
3400 void
3401 detect_eol (coding, src, src_bytes)
3402 struct coding_system *coding;
3403 unsigned char *src;
3404 int src_bytes;
3405 {
3406 Lisp_Object val;
3407 int skip;
3408 int eol_type = detect_eol_type (src, src_bytes, &skip);
3409
3410 if (coding->heading_ascii > skip)
3411 coding->heading_ascii = skip;
3412 else
3413 skip = coding->heading_ascii;
3414
3415 if (eol_type == CODING_EOL_UNDECIDED)
3416 return;
3417 if (eol_type == CODING_EOL_INCONSISTENT)
3418 {
3419 #if 0
3420 /* This code is suppressed until we find a better way to
3421 distinguish raw text file and binary file. */
3422
3423 /* If we have already detected that the coding is raw-text, the
3424 coding should actually be no-conversion. */
3425 if (coding->type == coding_type_raw_text)
3426 {
3427 setup_coding_system (Qno_conversion, coding);
3428 return;
3429 }
3430 /* Else, let's decode only text code anyway. */
3431 #endif /* 0 */
3432 eol_type = CODING_EOL_LF;
3433 }
3434
3435 val = Fget (coding->symbol, Qeol_type);
3436 if (VECTORP (val) && XVECTOR (val)->size == 3)
3437 {
3438 setup_coding_system (XVECTOR (val)->contents[eol_type], coding);
3439 coding->heading_ascii = skip;
3440 }
3441 }
3442
/* Constant number of bytes added to every conversion buffer size
   estimate.  */
#define CONVERSION_BUFFER_EXTRA_ROOM 256

/* Factor by which SRC_BYTES is multiplied when estimating the size of
   a decoding buffer for CODING (a pointer to struct coding_system):
   3 for ISO2022, 2 for SJIS and BIG5, 1 for raw text, the decoder's
   own buf_magnification for CCL, and 2 for everything else.  The
   argument is parenthesized so that any pointer expression may be
   passed.  */
#define DECODING_BUFFER_MAG(coding)                                     \
  ((coding)->type == coding_type_iso2022                                \
   ? 3                                                                  \
   : (((coding)->type == coding_type_sjis                               \
       || (coding)->type == coding_type_big5)                           \
      ? 2                                                               \
      : ((coding)->type == coding_type_raw_text                         \
         ? 1                                                            \
         : ((coding)->type == coding_type_ccl                           \
            ? (coding)->spec.ccl.decoder.buf_magnification              \
            : 2))))
3455
3456 /* Return maximum size (bytes) of a buffer enough for decoding
3457 SRC_BYTES of text encoded in CODING. */
3458
3459 int
3460 decoding_buffer_size (coding, src_bytes)
3461 struct coding_system *coding;
3462 int src_bytes;
3463 {
3464 return (src_bytes * DECODING_BUFFER_MAG (coding)
3465 + CONVERSION_BUFFER_EXTRA_ROOM);
3466 }
3467
3468 /* Return maximum size (bytes) of a buffer enough for encoding
3469 SRC_BYTES of text to CODING. */
3470
3471 int
3472 encoding_buffer_size (coding, src_bytes)
3473 struct coding_system *coding;
3474 int src_bytes;
3475 {
3476 int magnification;
3477
3478 if (coding->type == coding_type_ccl)
3479 magnification = coding->spec.ccl.encoder.buf_magnification;
3480 else
3481 magnification = 3;
3482
3483 return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
3484 }
3485
#ifndef MINIMUM_CONVERSION_BUFFER_SIZE
#define MINIMUM_CONVERSION_BUFFER_SIZE 1024
#endif

/* The shared conversion buffer and its current size in bytes.  */
char *conversion_buffer;
int conversion_buffer_size;

/* Return a pointer to a SIZE bytes of buffer to be used for encoding
   or decoding.  The shared buffer is replaced by a larger one when it
   is too small; its previous contents are not preserved.

   NOTE(review): the result of xmalloc is used unchecked, so this
   function relies on xmalloc not returning on failure -- confirm.  */

char *
get_conversion_buffer (size)
     int size;
{
  if (size > conversion_buffer_size)
    {
      char *buf;
      /* Grow by doubling.  Start from MINIMUM_CONVERSION_BUFFER_SIZE
         when no buffer size has been set yet: doubling zero would
         never reach SIZE and the loop below would not terminate.  */
      int real_size = (conversion_buffer_size > 0
                       ? conversion_buffer_size * 2
                       : MINIMUM_CONVERSION_BUFFER_SIZE);

      while (real_size < size) real_size *= 2;
      buf = (char *) xmalloc (real_size);
      /* There is nothing to release before the first allocation.  */
      if (conversion_buffer)
        xfree (conversion_buffer);
      conversion_buffer = buf;
      conversion_buffer_size = real_size;
    }
  return conversion_buffer;
}
3514
3515 int
3516 ccl_coding_driver (coding, source, destination, src_bytes, dst_bytes, encodep)
3517 struct coding_system *coding;
3518 unsigned char *source, *destination;
3519 int src_bytes, dst_bytes, encodep;
3520 {
3521 struct ccl_program *ccl
3522 = encodep ? &coding->spec.ccl.encoder : &coding->spec.ccl.decoder;
3523 int result;
3524
3525 coding->produced = ccl_driver (ccl, source, destination,
3526 src_bytes, dst_bytes, &(coding->consumed));
3527 if (encodep)
3528 {
3529 coding->produced_char = coding->produced;
3530 coding->consumed_char
3531 = multibyte_chars_in_text (source, coding->consumed);
3532 }
3533 else
3534 {
3535 coding->produced_char
3536 = multibyte_chars_in_text (destination, coding->produced);
3537 coding->consumed_char = coding->consumed;
3538 }
3539 switch (ccl->status)
3540 {
3541 case CCL_STAT_SUSPEND_BY_SRC:
3542 result = CODING_FINISH_INSUFFICIENT_SRC;
3543 break;
3544 case CCL_STAT_SUSPEND_BY_DST:
3545 result = CODING_FINISH_INSUFFICIENT_DST;
3546 break;
3547 default:
3548 result = CODING_FINISH_NORMAL;
3549 break;
3550 }
3551 return result;
3552 }
3553
3554 /* See "GENERAL NOTES about `decode_coding_XXX ()' functions". Before
3555 decoding, it may detect coding system and format of end-of-line if
3556 those are not yet decided. */
3557
3558 int
3559 decode_coding (coding, source, destination, src_bytes, dst_bytes)
3560 struct coding_system *coding;
3561 unsigned char *source, *destination;
3562 int src_bytes, dst_bytes;
3563 {
3564 int result;
3565
3566 if (src_bytes <= 0)
3567 {
3568 coding->produced = coding->produced_char = 0;
3569 coding->consumed = coding->consumed_char = 0;
3570 coding->fake_multibyte = 0;
3571 return CODING_FINISH_NORMAL;
3572 }
3573
3574 if (coding->type == coding_type_undecided)
3575 detect_coding (coding, source, src_bytes);
3576
3577 if (coding->eol_type == CODING_EOL_UNDECIDED)
3578 detect_eol (coding, source, src_bytes);
3579
3580 switch (coding->type)
3581 {
3582 case coding_type_emacs_mule:
3583 case coding_type_undecided:
3584 case coding_type_raw_text:
3585 if (coding->eol_type == CODING_EOL_LF
3586 || coding->eol_type == CODING_EOL_UNDECIDED)
3587 goto label_no_conversion;
3588 result = decode_eol (coding, source, destination, src_bytes, dst_bytes);
3589 break;
3590
3591 case coding_type_sjis:
3592 result = decode_coding_sjis_big5 (coding, source, destination,
3593 src_bytes, dst_bytes, 1);
3594 break;
3595
3596 case coding_type_iso2022:
3597 result = decode_coding_iso2022 (coding, source, destination,
3598 src_bytes, dst_bytes);
3599 break;
3600
3601 case coding_type_big5:
3602 result = decode_coding_sjis_big5 (coding, source, destination,
3603 src_bytes, dst_bytes, 0);
3604 break;
3605
3606 case coding_type_ccl:
3607 result = ccl_coding_driver (coding, source, destination,
3608 src_bytes, dst_bytes, 0);
3609 break;
3610
3611 default: /* i.e. case coding_type_no_conversion: */
3612 label_no_conversion:
3613 if (dst_bytes && src_bytes > dst_bytes)
3614 {
3615 coding->produced = dst_bytes;
3616 result = CODING_FINISH_INSUFFICIENT_DST;
3617 }
3618 else
3619 {
3620 coding->produced = src_bytes;
3621 result = CODING_FINISH_NORMAL;
3622 }
3623 if (dst_bytes)
3624 bcopy (source, destination, coding->produced);
3625 else
3626 safe_bcopy (source, destination, coding->produced);
3627 coding->fake_multibyte = 1;
3628 coding->consumed
3629 = coding->consumed_char = coding->produced_char = coding->produced;
3630 break;
3631 }
3632
3633 return result;
3634 }
3635
3636 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions". */
3637
3638 int
3639 encode_coding (coding, source, destination, src_bytes, dst_bytes)
3640 struct coding_system *coding;
3641 unsigned char *source, *destination;
3642 int src_bytes, dst_bytes;
3643 {
3644 int result;
3645
3646 if (src_bytes <= 0)
3647 {
3648 coding->produced = coding->produced_char = 0;
3649 coding->consumed = coding->consumed_char = 0;
3650 coding->fake_multibyte = 0;
3651 return CODING_FINISH_NORMAL;
3652 }
3653
3654 switch (coding->type)
3655 {
3656 case coding_type_emacs_mule:
3657 case coding_type_undecided:
3658 case coding_type_raw_text:
3659 if (coding->eol_type == CODING_EOL_LF
3660 || coding->eol_type == CODING_EOL_UNDECIDED)
3661 goto label_no_conversion;
3662 result = encode_eol (coding, source, destination, src_bytes, dst_bytes);
3663 break;
3664
3665 case coding_type_sjis:
3666 result = encode_coding_sjis_big5 (coding, source, destination,
3667 src_bytes, dst_bytes, 1);
3668 break;
3669
3670 case coding_type_iso2022:
3671 result = encode_coding_iso2022 (coding, source, destination,
3672 src_bytes, dst_bytes);
3673 break;
3674
3675 case coding_type_big5:
3676 result = encode_coding_sjis_big5 (coding, source, destination,
3677 src_bytes, dst_bytes, 0);
3678 break;
3679
3680 case coding_type_ccl:
3681 result = ccl_coding_driver (coding, source, destination,
3682 src_bytes, dst_bytes, 1);
3683 break;
3684
3685 default: /* i.e. case coding_type_no_conversion: */
3686 label_no_conversion:
3687 if (dst_bytes && src_bytes > dst_bytes)
3688 {
3689 coding->produced = dst_bytes;
3690 result = CODING_FINISH_INSUFFICIENT_DST;
3691 }
3692 else
3693 {
3694 coding->produced = src_bytes;
3695 result = CODING_FINISH_NORMAL;
3696 }
3697 if (dst_bytes)
3698 bcopy (source, destination, coding->produced);
3699 else
3700 safe_bcopy (source, destination, coding->produced);
3701 if (coding->mode & CODING_MODE_SELECTIVE_DISPLAY)
3702 {
3703 unsigned char *p = destination, *pend = p + coding->produced;
3704 while (p < pend)
3705 if (*p++ == '\015') p[-1] = '\n';
3706 }
3707 coding->fake_multibyte = 1;
3708 coding->consumed
3709 = coding->consumed_char = coding->produced_char = coding->produced;
3710 break;
3711 }
3712
3713 return result;
3714 }
3715
3716 /* Scan text in the region between *BEG and *END (byte positions),
3717 skip characters which we don't have to decode by coding system
3718 CODING at the head and tail, then set *BEG and *END to the region
3719 of the text we actually have to convert. The caller should move
3720 the gap out of the region in advance.
3721
3722 If STR is not NULL, *BEG and *END are indices into STR. */
3723
3724 static void
3725 shrink_decoding_region (beg, end, coding, str)
3726 int *beg, *end;
3727 struct coding_system *coding;
3728 unsigned char *str;
3729 {
3730 unsigned char *begp_orig, *begp, *endp_orig, *endp, c;
3731 int eol_conversion;
3732
3733 if (coding->type == coding_type_ccl
3734 || coding->type == coding_type_undecided
3735 || !NILP (coding->post_read_conversion))
3736 {
3737 /* We can't skip any data. */
3738 return;
3739 }
3740 else if (coding->type == coding_type_no_conversion)
3741 {
3742 /* We need no conversion, but don't have to skip any data here.
3743 Decoding routine handles them effectively anyway. */
3744 return;
3745 }
3746
3747 if (coding->heading_ascii >= 0)
3748 /* Detection routine has already found how much we can skip at the
3749 head. */
3750 *beg += coding->heading_ascii;
3751
3752 if (str)
3753 {
3754 begp_orig = begp = str + *beg;
3755 endp_orig = endp = str + *end;
3756 }
3757 else
3758 {
3759 begp_orig = begp = BYTE_POS_ADDR (*beg);
3760 endp_orig = endp = begp + *end - *beg;
3761 }
3762
3763 eol_conversion = (coding->eol_type != CODING_EOL_LF);
3764
3765 switch (coding->type)
3766 {
3767 case coding_type_emacs_mule:
3768 case coding_type_raw_text:
3769 if (eol_conversion)
3770 {
3771 if (coding->heading_ascii < 0)
3772 while (begp < endp && *begp != '\r' && *begp < 0x80) begp++;
3773 while (begp < endp && *(endp - 1) != '\r' && *(endp - 1) < 0x80)
3774 endp--;
3775 }
3776 else
3777 begp = endp;
3778 break;
3779
3780 case coding_type_sjis:
3781 case coding_type_big5:
3782 /* We can skip all ASCII characters at the head. */
3783 if (coding->heading_ascii < 0)
3784 {
3785 if (eol_conversion)
3786 while (begp < endp && *begp < 0x80 && *begp != '\r') begp++;
3787 else
3788 while (begp < endp && *begp < 0x80) begp++;
3789 }
3790 /* We can skip all ASCII characters at the tail except for the
3791 second byte of SJIS or BIG5 code. */
3792 if (eol_conversion)
3793 while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\r') endp--;
3794 else
3795 while (begp < endp && endp[-1] < 0x80) endp--;
3796 if (begp < endp && endp < endp_orig && endp[-1] >= 0x80)
3797 endp++;
3798 break;
3799
3800 default: /* i.e. case coding_type_iso2022: */
3801 if (coding->heading_ascii < 0)
3802 {
3803 /* We can skip all ASCII characters at the head except for a
3804 few control codes. */
3805 while (begp < endp && (c = *begp) < 0x80
3806 && c != ISO_CODE_CR && c != ISO_CODE_SO
3807 && c != ISO_CODE_SI && c != ISO_CODE_ESC
3808 && (!eol_conversion || c != ISO_CODE_LF))
3809 begp++;
3810 }
3811 switch (coding->category_idx)
3812 {
3813 case CODING_CATEGORY_IDX_ISO_8_1:
3814 case CODING_CATEGORY_IDX_ISO_8_2:
3815 /* We can skip all ASCII characters at the tail. */
3816 if (eol_conversion)
3817 while (begp < endp && (c = endp[-1]) < 0x80 && c != '\r') endp--;
3818 else
3819 while (begp < endp && endp[-1] < 0x80) endp--;
3820 break;
3821
3822 case CODING_CATEGORY_IDX_ISO_7:
3823 case CODING_CATEGORY_IDX_ISO_7_TIGHT:
3824 /* We can skip all charactes at the tail except for ESC and
3825 the following 2-byte at the tail. */
3826 if (eol_conversion)
3827 while (begp < endp
3828 && (c = endp[-1]) < 0x80 && c != ISO_CODE_ESC && c != '\r')
3829 endp--;
3830 else
3831 while (begp < endp
3832 && (c = endp[-1]) < 0x80 && c != ISO_CODE_ESC)
3833 endp--;
3834 if (begp < endp && endp[-1] == ISO_CODE_ESC)
3835 {
3836 if (endp + 1 < endp_orig && end[0] == '(' && end[1] == 'B')
3837 /* This is an ASCII designation sequence. We can
3838 surely skip the tail. */
3839 endp += 2;
3840 else
3841 /* Hmmm, we can't skip the tail. */
3842 endp = endp_orig;
3843 }
3844 }
3845 }
3846 *beg += begp - begp_orig;
3847 *end += endp - endp_orig;
3848 return;
3849 }
3850
3851 /* Like shrink_decoding_region but for encoding. */
3852
3853 static void
3854 shrink_encoding_region (beg, end, coding, str)
3855 int *beg, *end;
3856 struct coding_system *coding;
3857 unsigned char *str;
3858 {
3859 unsigned char *begp_orig, *begp, *endp_orig, *endp;
3860 int eol_conversion;
3861
3862 if (coding->type == coding_type_ccl)
3863 /* We can't skip any data. */
3864 return;
3865 else if (coding->type == coding_type_no_conversion)
3866 {
3867 /* We need no conversion. */
3868 *beg = *end;
3869 return;
3870 }
3871
3872 if (str)
3873 {
3874 begp_orig = begp = str + *beg;
3875 endp_orig = endp = str + *end;
3876 }
3877 else
3878 {
3879 begp_orig = begp = BYTE_POS_ADDR (*beg);
3880 endp_orig = endp = begp + *end - *beg;
3881 }
3882
3883 eol_conversion = (coding->eol_type == CODING_EOL_CR
3884 || coding->eol_type == CODING_EOL_CRLF);
3885
3886 /* Here, we don't have to check coding->pre_write_conversion because
3887 the caller is expected to have handled it already. */
3888 switch (coding->type)
3889 {
3890 case coding_type_undecided:
3891 case coding_type_emacs_mule:
3892 case coding_type_raw_text:
3893 if (eol_conversion)
3894 {
3895 while (begp < endp && *begp != '\n') begp++;
3896 while (begp < endp && endp[-1] != '\n') endp--;
3897 }
3898 else
3899 begp = endp;
3900 break;
3901
3902 case coding_type_iso2022:
3903 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL)
3904 {
3905 unsigned char *bol = begp;
3906 while (begp < endp && *begp < 0x80)
3907 {
3908 begp++;
3909 if (begp[-1] == '\n')
3910 bol = begp;
3911 }
3912 begp = bol;
3913 goto label_skip_tail;
3914 }
3915 /* fall down ... */
3916
3917 default:
3918 /* We can skip all ASCII characters at the head and tail. */
3919 if (eol_conversion)
3920 while (begp < endp && *begp < 0x80 && *begp != '\n') begp++;
3921 else
3922 while (begp < endp && *begp < 0x80) begp++;
3923 label_skip_tail:
3924 if (eol_conversion)
3925 while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\n') endp--;
3926 else
3927 while (begp < endp && *(endp - 1) < 0x80) endp--;
3928 break;
3929 }
3930
3931 *beg += begp - begp_orig;
3932 *end += endp - endp_orig;
3933 return;
3934 }
3935
3936 /* Decode (if ENCODEP is zero) or encode (if ENCODEP is nonzero) the
3937 text from FROM to TO (byte positions are FROM_BYTE and TO_BYTE) by
3938 coding system CODING, and return the status code of code conversion
3939 (currently, this value has no meaning).
3940
3941 How many characters (and bytes) are converted to how many
3942 characters (and bytes) are recorded in members of the structure
3943 CODING.
3944
3945 If REPLACE is nonzero, we do various things as if the original text
3946 is deleted and a new text is inserted. See the comments in
3947 replace_range (insdel.c) to know what we are doing. */
3948
3949 int
3950 code_convert_region (from, from_byte, to, to_byte, coding, encodep, replace)
3951 int from, from_byte, to, to_byte, encodep, replace;
3952 struct coding_system *coding;
3953 {
3954 int len = to - from, len_byte = to_byte - from_byte;
3955 int require, inserted, inserted_byte;
3956 int head_skip, tail_skip, total_skip;
3957 Lisp_Object saved_coding_symbol = Qnil;
3958 int multibyte = !NILP (current_buffer->enable_multibyte_characters);
3959 int first = 1;
3960 int fake_multibyte = 0;
3961 unsigned char *src, *dst;
3962 Lisp_Object deletion = Qnil;
3963
3964 if (replace)
3965 {
3966 int saved_from = from;
3967
3968 prepare_to_modify_buffer (from, to, &from);
3969 if (saved_from != from)
3970 {
3971 to = from + len;
3972 if (multibyte)
3973 from_byte = CHAR_TO_BYTE (from), to_byte = CHAR_TO_BYTE (to);
3974 else
3975 from_byte = from, to_byte = to;
3976 len_byte = to_byte - from_byte;
3977 }
3978 }
3979
3980 if (! encodep && CODING_REQUIRE_DETECTION (coding))
3981 {
3982 /* We must detect encoding of text and eol format. */
3983
3984 if (from < GPT && to > GPT)
3985 move_gap_both (from, from_byte);
3986 if (coding->type == coding_type_undecided)
3987 {
3988 detect_coding (coding, BYTE_POS_ADDR (from_byte), len_byte);
3989 if (coding->type == coding_type_undecided)
3990 /* It seems that the text contains only ASCII, but we
3991 should not left it undecided because the deeper
3992 decoding routine (decode_coding) tries to detect the
3993 encodings again in vain. */
3994 coding->type = coding_type_emacs_mule;
3995 }
3996 if (coding->eol_type == CODING_EOL_UNDECIDED)
3997 {
3998 saved_coding_symbol = coding->symbol;
3999 detect_eol (coding, BYTE_POS_ADDR (from_byte), len_byte);
4000 if (coding->eol_type == CODING_EOL_UNDECIDED)
4001 coding->eol_type = CODING_EOL_LF;
4002 /* We had better recover the original eol format if we
4003 encounter an inconsitent eol format while decoding. */
4004 coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4005 }
4006 }
4007
4008 coding->consumed_char = len, coding->consumed = len_byte;
4009
4010 if (encodep
4011 ? ! CODING_REQUIRE_ENCODING (coding)
4012 : ! CODING_REQUIRE_DECODING (coding))
4013 {
4014 coding->produced = len_byte;
4015 if (multibyte
4016 && ! replace
4017 /* See the comment of the member heading_ascii in coding.h. */
4018 && coding->heading_ascii < len_byte)
4019 {
4020 /* We still may have to combine byte at the head and the
4021 tail of the text in the region. */
4022 if (from < GPT && GPT < to)
4023 move_gap_both (to, to_byte);
4024 len = multibyte_chars_in_text (BYTE_POS_ADDR (from_byte), len_byte);
4025 adjust_after_insert (from, from_byte, to, to_byte, len);
4026 coding->produced_char = len;
4027 }
4028 else
4029 coding->produced_char = len_byte;
4030 return 0;
4031 }
4032
4033 /* Now we convert the text. */
4034
4035 /* For encoding, we must process pre-write-conversion in advance. */
4036 if (encodep
4037 && ! NILP (coding->pre_write_conversion)
4038 && SYMBOLP (coding->pre_write_conversion)
4039 && ! NILP (Ffboundp (coding->pre_write_conversion)))
4040 {
4041 /* The function in pre-write-conversion may put a new text in a
4042 new buffer. */
4043 struct buffer *prev = current_buffer, *new;
4044
4045 call2 (coding->pre_write_conversion, from, to);
4046 if (current_buffer != prev)
4047 {
4048 len = ZV - BEGV;
4049 new = current_buffer;
4050 set_buffer_internal_1 (prev);
4051 del_range_2 (from, from_byte, to, to_byte);
4052 insert_from_buffer (new, BEG, len, 0);
4053 to = from + len;
4054 to_byte = multibyte ? CHAR_TO_BYTE (to) : to;
4055 len_byte = to_byte - from_byte;
4056 }
4057 }
4058
4059 if (replace)
4060 deletion = make_buffer_string_both (from, from_byte, to, to_byte, 1);
4061
4062 /* Try to skip the heading and tailing ASCIIs. */
4063 {
4064 int from_byte_orig = from_byte, to_byte_orig = to_byte;
4065
4066 if (from < GPT && GPT < to)
4067 move_gap_both (from, from_byte);
4068 if (encodep)
4069 shrink_encoding_region (&from_byte, &to_byte, coding, NULL);
4070 else
4071 shrink_decoding_region (&from_byte, &to_byte, coding, NULL);
4072 if (from_byte == to_byte)
4073 {
4074 coding->produced = len_byte;
4075 coding->produced_char = multibyte ? len : len_byte;
4076 if (!replace)
4077 /* We must record and adjust for this new text now. */
4078 adjust_after_insert (from, from_byte_orig, to, to_byte_orig, len);
4079 return 0;
4080 }
4081
4082 head_skip = from_byte - from_byte_orig;
4083 tail_skip = to_byte_orig - to_byte;
4084 total_skip = head_skip + tail_skip;
4085 from += head_skip;
4086 to -= tail_skip;
4087 len -= total_skip; len_byte -= total_skip;
4088 }
4089
4090 /* For converion, we must put the gap before the text in addition to
4091 making the gap larger for efficient decoding. The required gap
4092 size starts from 2000 which is the magic number used in make_gap.
4093 But, after one batch of conversion, it will be incremented if we
4094 find that it is not enough . */
4095 require = 2000;
4096
4097 if (GAP_SIZE < require)
4098 make_gap (require - GAP_SIZE);
4099 move_gap_both (from, from_byte);
4100
4101 if (GPT - BEG < beg_unchanged)
4102 beg_unchanged = GPT - BEG;
4103 if (Z - GPT < end_unchanged)
4104 end_unchanged = Z - GPT;
4105
4106 inserted = inserted_byte = 0;
4107 src = GAP_END_ADDR, dst = GPT_ADDR;
4108
4109 GAP_SIZE += len_byte;
4110 ZV -= len;
4111 Z -= len;
4112 ZV_BYTE -= len_byte;
4113 Z_BYTE -= len_byte;
4114
4115 for (;;)
4116 {
4117 int result;
4118
4119 /* The buffer memory is changed from:
4120 +--------+converted-text+---------+-------original-text------+---+
4121 |<-from->|<--inserted-->|---------|<-----------len---------->|---|
4122 |<------------------- GAP_SIZE -------------------->| */
4123 if (encodep)
4124 result = encode_coding (coding, src, dst, len_byte, 0);
4125 else
4126 result = decode_coding (coding, src, dst, len_byte, 0);
4127 /* to:
4128 +--------+-------converted-text--------+--+---original-text--+---+
4129 |<-from->|<--inserted-->|<--produced-->|--|<-(len-consumed)->|---|
4130 |<------------------- GAP_SIZE -------------------->| */
4131 if (coding->fake_multibyte)
4132 fake_multibyte = 1;
4133
4134 if (!encodep && !multibyte)
4135 coding->produced_char = coding->produced;
4136 inserted += coding->produced_char;
4137 inserted_byte += coding->produced;
4138 len_byte -= coding->consumed;
4139 src += coding->consumed;
4140 dst += inserted_byte;
4141
4142 if (! encodep && result == CODING_FINISH_INCONSISTENT_EOL)
4143 {
4144 unsigned char *pend = dst, *p = pend - inserted_byte;
4145
4146 /* Encode LFs back to the original eol format (CR or CRLF). */
4147 if (coding->eol_type == CODING_EOL_CR)
4148 {
4149 while (p < pend) if (*p++ == '\n') p[-1] = '\r';
4150 }
4151 else
4152 {
4153 int count = 0;
4154
4155 while (p < pend) if (*p++ == '\n') count++;
4156 if (src - dst < count)
4157 {
4158 /* We don't have sufficient room for putting LFs
4159 back to CRLF. We must record converted and
4160 not-yet-converted text back to the buffer
4161 content, enlarge the gap, then record them out of
4162 the buffer contents again. */
4163 int add = len_byte + inserted_byte;
4164
4165 GAP_SIZE -= add;
4166 ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
4167 GPT += inserted_byte; GPT_BYTE += inserted_byte;
4168 make_gap (count - GAP_SIZE);
4169 GAP_SIZE += add;
4170 ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
4171 GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
4172 /* Don't forget to update SRC, DST, and PEND. */
4173 src = GAP_END_ADDR - len_byte;
4174 dst = GPT_ADDR + inserted_byte;
4175 pend = dst;
4176 }
4177 inserted += count;
4178 inserted_byte += count;
4179 coding->produced += count;
4180 p = dst = pend + count;
4181 while (count)
4182 {
4183 *--p = *--pend;
4184 if (*p == '\n') count--, *--p = '\r';
4185 }
4186 }
4187
4188 /* Suppress eol-format conversion in the further conversion. */
4189 coding->eol_type = CODING_EOL_LF;
4190
4191 /* Restore the original symbol. */
4192 coding->symbol = saved_coding_symbol;
4193
4194 continue;
4195 }
4196 if (len_byte <= 0)
4197 break;
4198 if (result == CODING_FINISH_INSUFFICIENT_SRC)
4199 {
4200 /* The source text ends in invalid codes. Let's just
4201 make them valid buffer contents, and finish conversion. */
4202 inserted += len_byte;
4203 inserted_byte += len_byte;
4204 while (len_byte--)
4205 *src++ = *dst++;
4206 fake_multibyte = 1;
4207 break;
4208 }
4209 if (first)
4210 {
4211 /* We have just done the first batch of conversion which was
4212 stoped because of insufficient gap. Let's reconsider the
4213 required gap size (i.e. SRT - DST) now.
4214
4215 We have converted ORIG bytes (== coding->consumed) into
4216 NEW bytes (coding->produced). To convert the remaining
4217 LEN bytes, we may need REQUIRE bytes of gap, where:
4218 REQUIRE + LEN_BYTE = LEN_BYTE * (NEW / ORIG)
4219 REQUIRE = LEN_BYTE * (NEW - ORIG) / ORIG
4220 Here, we are sure that NEW >= ORIG. */
4221 float ratio = coding->produced - coding->consumed;
4222 ratio /= coding->consumed;
4223 require = len_byte * ratio;
4224 first = 0;
4225 }
4226 if ((src - dst) < (require + 2000))
4227 {
4228 /* See the comment above the previous call of make_gap. */
4229 int add = len_byte + inserted_byte;
4230
4231 GAP_SIZE -= add;
4232 ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
4233 GPT += inserted_byte; GPT_BYTE += inserted_byte;
4234 make_gap (require + 2000);
4235 GAP_SIZE += add;
4236 ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
4237 GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
4238 /* Don't forget to update SRC, DST. */
4239 src = GAP_END_ADDR - len_byte;
4240 dst = GPT_ADDR + inserted_byte;
4241 }
4242 }
4243 if (src - dst > 0) *dst = 0; /* Put an anchor. */
4244
4245 if (multibyte
4246 && (fake_multibyte
4247 || !encodep && (to - from) != (to_byte - from_byte)))
4248 inserted = multibyte_chars_in_text (GPT_ADDR, inserted_byte);
4249
4250 /* If we have shrinked the conversion area, adjust it now. */
4251 if (total_skip > 0)
4252 {
4253 if (tail_skip > 0)
4254 safe_bcopy (GAP_END_ADDR, GPT_ADDR + inserted_byte, tail_skip);
4255 inserted += total_skip; inserted_byte += total_skip;
4256 GAP_SIZE += total_skip;
4257 GPT -= head_skip; GPT_BYTE -= head_skip;
4258 ZV -= total_skip; ZV_BYTE -= total_skip;
4259 Z -= total_skip; Z_BYTE -= total_skip;
4260 from -= head_skip; from_byte -= head_skip;
4261 to += tail_skip; to_byte += tail_skip;
4262 }
4263
4264 adjust_after_replace (from, from_byte, deletion, inserted, inserted_byte);
4265
4266 if (! encodep && ! NILP (coding->post_read_conversion))
4267 {
4268 Lisp_Object val;
4269 int orig_inserted = inserted, pos = PT;
4270
4271 if (from != pos)
4272 temp_set_point_both (current_buffer, from, from_byte);
4273 val = call1 (coding->post_read_conversion, make_number (inserted));
4274 if (! NILP (val))
4275 {
4276 CHECK_NUMBER (val, 0);
4277 inserted = XFASTINT (val);
4278 }
4279 if (pos >= from + orig_inserted)
4280 temp_set_point (current_buffer, pos + (inserted - orig_inserted));
4281 }
4282
4283 signal_after_change (from, to - from, inserted);
4284
4285 {
4286 coding->consumed = to_byte - from_byte;
4287 coding->consumed_char = to - from;
4288 coding->produced = inserted_byte;
4289 coding->produced_char = inserted;
4290 }
4291
4292 return 0;
4293 }
4294
4295 Lisp_Object
4296 code_convert_string (str, coding, encodep, nocopy)
4297 Lisp_Object str;
4298 struct coding_system *coding;
4299 int encodep, nocopy;
4300 {
4301 int len;
4302 char *buf;
4303 int from = 0, to = XSTRING (str)->size;
4304 int to_byte = STRING_BYTES (XSTRING (str));
4305 struct gcpro gcpro1;
4306 Lisp_Object saved_coding_symbol = Qnil;
4307 int result;
4308
4309 if (encodep && !NILP (coding->pre_write_conversion)
4310 || !encodep && !NILP (coding->post_read_conversion))
4311 {
4312 /* Since we have to call Lisp functions which assume target text
4313 is in a buffer, after setting a temporary buffer, call
4314 code_convert_region. */
4315 int count = specpdl_ptr - specpdl;
4316 struct buffer *prev = current_buffer;
4317
4318 record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
4319 temp_output_buffer_setup (" *code-converting-work*");
4320 set_buffer_internal (XBUFFER (Vstandard_output));
4321 if (encodep)
4322 insert_from_string (str, 0, 0, to, to_byte, 0);
4323 else
4324 {
4325 /* We must insert the contents of STR as is without
4326 unibyte<->multibyte conversion. */
4327 current_buffer->enable_multibyte_characters = Qnil;
4328 insert_from_string (str, 0, 0, to_byte, to_byte, 0);
4329 current_buffer->enable_multibyte_characters = Qt;
4330 }
4331 code_convert_region (BEGV, BEGV_BYTE, ZV, ZV_BYTE, coding, encodep, 1);
4332 if (encodep)
4333 /* We must return the buffer contents as unibyte string. */
4334 current_buffer->enable_multibyte_characters = Qnil;
4335 str = make_buffer_string (BEGV, ZV, 0);
4336 set_buffer_internal (prev);
4337 return unbind_to (count, str);
4338 }
4339
4340 if (! encodep && CODING_REQUIRE_DETECTION (coding))
4341 {
4342 /* See the comments in code_convert_region. */
4343 if (coding->type == coding_type_undecided)
4344 {
4345 detect_coding (coding, XSTRING (str)->data, to_byte);
4346 if (coding->type == coding_type_undecided)
4347 coding->type = coding_type_emacs_mule;
4348 }
4349 if (coding->eol_type == CODING_EOL_UNDECIDED)
4350 {
4351 saved_coding_symbol = coding->symbol;
4352 detect_eol (coding, XSTRING (str)->data, to_byte);
4353 if (coding->eol_type == CODING_EOL_UNDECIDED)
4354 coding->eol_type = CODING_EOL_LF;
4355 /* We had better recover the original eol format if we
4356 encounter an inconsitent eol format while decoding. */
4357 coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4358 }
4359 }
4360
4361 if (encodep
4362 ? ! CODING_REQUIRE_ENCODING (coding)
4363 : ! CODING_REQUIRE_DECODING (coding))
4364 from = to_byte;
4365 else
4366 {
4367 /* Try to skip the heading and tailing ASCIIs. */
4368 if (encodep)
4369 shrink_encoding_region (&from, &to_byte, coding, XSTRING (str)->data);
4370 else
4371 shrink_decoding_region (&from, &to_byte, coding, XSTRING (str)->data);
4372 }
4373 if (from == to_byte)
4374 return (nocopy ? str : Fcopy_sequence (str));
4375
4376 if (encodep)
4377 len = encoding_buffer_size (coding, to_byte - from);
4378 else
4379 len = decoding_buffer_size (coding, to_byte - from);
4380 len += from + STRING_BYTES (XSTRING (str)) - to_byte;
4381 GCPRO1 (str);
4382 buf = get_conversion_buffer (len);
4383 UNGCPRO;
4384
4385 if (from > 0)
4386 bcopy (XSTRING (str)->data, buf, from);
4387 result = (encodep
4388 ? encode_coding (coding, XSTRING (str)->data + from,
4389 buf + from, to_byte - from, len)
4390 : decode_coding (coding, XSTRING (str)->data + from,
4391 buf + from, to - from, len));
4392 if (! encodep && result == CODING_FINISH_INCONSISTENT_EOL)
4393 {
4394 /* We simple try to decode the whole string again but without
4395 eol-conversion this time. */
4396 coding->eol_type = CODING_EOL_LF;
4397 coding->symbol = saved_coding_symbol;
4398 return code_convert_string (str, coding, encodep, nocopy);
4399 }
4400
4401 bcopy (XSTRING (str)->data + to_byte, buf + from + coding->produced,
4402 STRING_BYTES (XSTRING (str)) - to_byte);
4403
4404 len = from + STRING_BYTES (XSTRING (str)) - to_byte;
4405 if (encodep)
4406 str = make_unibyte_string (buf, len + coding->produced);
4407 else
4408 str = make_string_from_bytes (buf, len + coding->produced_char,
4409 len + coding->produced);
4410 return str;
4411 }
4412
4413 \f
4414 #ifdef emacs
4415 /*** 7. Emacs Lisp library functions ***/
4416
4417 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
4418 "Return t if OBJECT is nil or a coding-system.\n\
4419 See the documentation of `make-coding-system' for information\n\
4420 about coding-system objects.")
4421 (obj)
4422 Lisp_Object obj;
4423 {
4424 if (NILP (obj))
4425 return Qt;
4426 if (!SYMBOLP (obj))
4427 return Qnil;
4428 /* Get coding-spec vector for OBJ. */
4429 obj = Fget (obj, Qcoding_system);
4430 return ((VECTORP (obj) && XVECTOR (obj)->size == 5)
4431 ? Qt : Qnil);
4432 }
4433
4434 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
4435 Sread_non_nil_coding_system, 1, 1, 0,
4436 "Read a coding system from the minibuffer, prompting with string PROMPT.")
4437 (prompt)
4438 Lisp_Object prompt;
4439 {
4440 Lisp_Object val;
4441 do
4442 {
4443 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
4444 Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
4445 }
4446 while (XSTRING (val)->size == 0);
4447 return (Fintern (val, Qnil));
4448 }
4449
4450 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
4451 "Read a coding system from the minibuffer, prompting with string PROMPT.\n\
4452 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.")
4453 (prompt, default_coding_system)
4454 Lisp_Object prompt, default_coding_system;
4455 {
4456 Lisp_Object val;
4457 if (SYMBOLP (default_coding_system))
4458 XSETSTRING (default_coding_system, XSYMBOL (default_coding_system)->name);
4459 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
4460 Qt, Qnil, Qcoding_system_history,
4461 default_coding_system, Qnil);
4462 return (XSTRING (val)->size == 0 ? Qnil : Fintern (val, Qnil));
4463 }
4464
4465 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
4466 1, 1, 0,
4467 "Check validity of CODING-SYSTEM.\n\
4468 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.\n\
4469 It is valid if it is a symbol with a non-nil `coding-system' property.\n\
4470 The value of property should be a vector of length 5.")
4471 (coding_system)
4472 Lisp_Object coding_system;
4473 {
4474 CHECK_SYMBOL (coding_system, 0);
4475 if (!NILP (Fcoding_system_p (coding_system)))
4476 return coding_system;
4477 while (1)
4478 Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
4479 }
4480 \f
4481 Lisp_Object
4482 detect_coding_system (src, src_bytes, highest)
4483 unsigned char *src;
4484 int src_bytes, highest;
4485 {
4486 int coding_mask, eol_type;
4487 Lisp_Object val, tmp;
4488 int dummy;
4489
4490 coding_mask = detect_coding_mask (src, src_bytes, NULL, &dummy);
4491 eol_type = detect_eol_type (src, src_bytes, &dummy);
4492 if (eol_type == CODING_EOL_INCONSISTENT)
4493 eol_type == CODING_EOL_UNDECIDED;
4494
4495 if (!coding_mask)
4496 {
4497 val = Qundecided;
4498 if (eol_type != CODING_EOL_UNDECIDED)
4499 {
4500 Lisp_Object val2;
4501 val2 = Fget (Qundecided, Qeol_type);
4502 if (VECTORP (val2))
4503 val = XVECTOR (val2)->contents[eol_type];
4504 }
4505 return val;
4506 }
4507
4508 /* At first, gather possible coding systems in VAL. */
4509 val = Qnil;
4510 for (tmp = Vcoding_category_list; !NILP (tmp); tmp = XCONS (tmp)->cdr)
4511 {
4512 int idx
4513 = XFASTINT (Fget (XCONS (tmp)->car, Qcoding_category_index));
4514 if (coding_mask & (1 << idx))
4515 {
4516 val = Fcons (Fsymbol_value (XCONS (tmp)->car), val);
4517 if (highest)
4518 break;
4519 }
4520 }
4521 if (!highest)
4522 val = Fnreverse (val);
4523
4524 /* Then, substitute the elements by subsidiary coding systems. */
4525 for (tmp = val; !NILP (tmp); tmp = XCONS (tmp)->cdr)
4526 {
4527 if (eol_type != CODING_EOL_UNDECIDED)
4528 {
4529 Lisp_Object eol;
4530 eol = Fget (XCONS (tmp)->car, Qeol_type);
4531 if (VECTORP (eol))
4532 XCONS (tmp)->car = XVECTOR (eol)->contents[eol_type];
4533 }
4534 }
4535 return (highest ? XCONS (val)->car : val);
4536 }
4537
4538 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
4539 2, 3, 0,
4540 "Detect coding system of the text in the region between START and END.\n\
4541 Return a list of possible coding systems ordered by priority.\n\
4542 \n\
4543 If only ASCII characters are found, it returns `undecided'\n\
4544 or its subsidiary coding system according to a detected end-of-line format.\n\
4545 \n\
4546 If optional argument HIGHEST is non-nil, return the coding system of\n\
4547 highest priority.")
4548 (start, end, highest)
4549 Lisp_Object start, end, highest;
4550 {
4551 int from, to;
4552 int from_byte, to_byte;
4553
4554 CHECK_NUMBER_COERCE_MARKER (start, 0);
4555 CHECK_NUMBER_COERCE_MARKER (end, 1);
4556
4557 validate_region (&start, &end);
4558 from = XINT (start), to = XINT (end);
4559 from_byte = CHAR_TO_BYTE (from);
4560 to_byte = CHAR_TO_BYTE (to);
4561
4562 if (from < GPT && to >= GPT)
4563 move_gap_both (to, to_byte);
4564
4565 return detect_coding_system (BYTE_POS_ADDR (from_byte),
4566 to_byte - from_byte,
4567 !NILP (highest));
4568 }
4569
4570 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
4571 1, 2, 0,
4572 "Detect coding system of the text in STRING.\n\
4573 Return a list of possible coding systems ordered by priority.\n\
4574 \n\
4575 If only ASCII characters are found, it returns `undecided'\n\
4576 or its subsidiary coding system according to a detected end-of-line format.\n\
4577 \n\
4578 If optional argument HIGHEST is non-nil, return the coding system of\n\
4579 highest priority.")
4580 (string, highest)
4581 Lisp_Object string, highest;
4582 {
4583 CHECK_STRING (string, 0);
4584
4585 return detect_coding_system (XSTRING (string)->data,
4586 STRING_BYTES (XSTRING (string)),
4587 !NILP (highest));
4588 }
4589
4590 Lisp_Object
4591 code_convert_region1 (start, end, coding_system, encodep)
4592 Lisp_Object start, end, coding_system;
4593 int encodep;
4594 {
4595 struct coding_system coding;
4596 int from, to, len;
4597
4598 CHECK_NUMBER_COERCE_MARKER (start, 0);
4599 CHECK_NUMBER_COERCE_MARKER (end, 1);
4600 CHECK_SYMBOL (coding_system, 2);
4601
4602 validate_region (&start, &end);
4603 from = XFASTINT (start);
4604 to = XFASTINT (end);
4605
4606 if (NILP (coding_system))
4607 return make_number (to - from);
4608
4609 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
4610 error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data);
4611
4612 coding.mode |= CODING_MODE_LAST_BLOCK;
4613 code_convert_region (from, CHAR_TO_BYTE (from), to, CHAR_TO_BYTE (to),
4614 &coding, encodep, 1);
4615 return make_number (coding.produced_char);
4616 }
4617
4618 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
4619 3, 3, "r\nzCoding system: ",
4620 "Decode the current region by specified coding system.\n\
4621 When called from a program, takes three arguments:\n\
4622 START, END, and CODING-SYSTEM. START and END are buffer positions.\n\
4623 Return length of decoded text.")
4624 (start, end, coding_system)
4625 Lisp_Object start, end, coding_system;
4626 {
4627 return code_convert_region1 (start, end, coding_system, 0);
4628 }
4629
4630 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
4631 3, 3, "r\nzCoding system: ",
4632 "Encode the current region by specified coding system.\n\
4633 When called from a program, takes three arguments:\n\
4634 START, END, and CODING-SYSTEM. START and END are buffer positions.\n\
4635 Return length of encoded text.")
4636 (start, end, coding_system)
4637 Lisp_Object start, end, coding_system;
4638 {
4639 return code_convert_region1 (start, end, coding_system, 1);
4640 }
4641
4642 Lisp_Object
4643 code_convert_string1 (string, coding_system, nocopy, encodep)
4644 Lisp_Object string, coding_system, nocopy;
4645 int encodep;
4646 {
4647 struct coding_system coding;
4648
4649 CHECK_STRING (string, 0);
4650 CHECK_SYMBOL (coding_system, 1);
4651
4652 if (NILP (coding_system))
4653 return (NILP (nocopy) ? Fcopy_sequence (string) : string);
4654
4655 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
4656 error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data);
4657
4658 coding.mode |= CODING_MODE_LAST_BLOCK;
4659 return code_convert_string (string, &coding, encodep, !NILP (nocopy));
4660 }
4661
4662 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
4663 2, 3, 0,
4664 "Decode STRING which is encoded in CODING-SYSTEM, and return the result.\n\
4665 Optional arg NOCOPY non-nil means it is ok to return STRING itself\n\
4666 if the decoding operation is trivial.")
4667 (string, coding_system, nocopy)
4668 Lisp_Object string, coding_system, nocopy;
4669 {
4670 return code_convert_string1(string, coding_system, nocopy, 0);
4671 }
4672
4673 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
4674 2, 3, 0,
4675 "Encode STRING to CODING-SYSTEM, and return the result.\n\
4676 Optional arg NOCOPY non-nil means it is ok to return STRING itself\n\
4677 if the encoding operation is trivial.")
4678 (string, coding_system, nocopy)
4679 Lisp_Object string, coding_system, nocopy;
4680 {
4681 return code_convert_string1(string, coding_system, nocopy, 1);
4682 }
4683
4684 \f
4685 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
4686 "Decode a JISX0208 character of shift-jis encoding.\n\
4687 CODE is the character code in SJIS.\n\
4688 Return the corresponding character.")
4689 (code)
4690 Lisp_Object code;
4691 {
4692 unsigned char c1, c2, s1, s2;
4693 Lisp_Object val;
4694
4695 CHECK_NUMBER (code, 0);
4696 s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
4697 DECODE_SJIS (s1, s2, c1, c2);
4698 XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset_jisx0208, c1, c2));
4699 return val;
4700 }
4701
4702 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
4703 "Encode a JISX0208 character CHAR to SJIS coding system.\n\
4704 Return the corresponding character code in SJIS.")
4705 (ch)
4706 Lisp_Object ch;
4707 {
4708 int charset, c1, c2, s1, s2;
4709 Lisp_Object val;
4710
4711 CHECK_NUMBER (ch, 0);
4712 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
4713 if (charset == charset_jisx0208)
4714 {
4715 ENCODE_SJIS (c1, c2, s1, s2);
4716 XSETFASTINT (val, (s1 << 8) | s2);
4717 }
4718 else
4719 XSETFASTINT (val, 0);
4720 return val;
4721 }
4722
4723 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
4724 "Decode a Big5 character CODE of BIG5 coding system.\n\
4725 CODE is the character code in BIG5.\n\
4726 Return the corresponding character.")
4727 (code)
4728 Lisp_Object code;
4729 {
4730 int charset;
4731 unsigned char b1, b2, c1, c2;
4732 Lisp_Object val;
4733
4734 CHECK_NUMBER (code, 0);
4735 b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF;
4736 DECODE_BIG5 (b1, b2, charset, c1, c2);
4737 XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset, c1, c2));
4738 return val;
4739 }
4740
4741 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
4742 "Encode the Big5 character CHAR to BIG5 coding system.\n\
4743 Return the corresponding character code in Big5.")
4744 (ch)
4745 Lisp_Object ch;
4746 {
4747 int charset, c1, c2, b1, b2;
4748 Lisp_Object val;
4749
4750 CHECK_NUMBER (ch, 0);
4751 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
4752 if (charset == charset_big5_1 || charset == charset_big5_2)
4753 {
4754 ENCODE_BIG5 (charset, c1, c2, b1, b2);
4755 XSETFASTINT (val, (b1 << 8) | b2);
4756 }
4757 else
4758 XSETFASTINT (val, 0);
4759 return val;
4760 }
4761 \f
4762 DEFUN ("set-terminal-coding-system-internal",
4763 Fset_terminal_coding_system_internal,
4764 Sset_terminal_coding_system_internal, 1, 1, 0, "")
4765 (coding_system)
4766 Lisp_Object coding_system;
4767 {
4768 CHECK_SYMBOL (coding_system, 0);
4769 setup_coding_system (Fcheck_coding_system (coding_system), &terminal_coding);
4770 /* We had better not send unsafe characters to terminal. */
4771 terminal_coding.flags |= CODING_FLAG_ISO_SAFE;
4772
4773 return Qnil;
4774 }
4775
4776 DEFUN ("set-safe-terminal-coding-system-internal",
4777 Fset_safe_terminal_coding_system_internal,
4778 Sset_safe_terminal_coding_system_internal, 1, 1, 0, "")
4779 (coding_system)
4780 Lisp_Object coding_system;
4781 {
4782 CHECK_SYMBOL (coding_system, 0);
4783 setup_coding_system (Fcheck_coding_system (coding_system),
4784 &safe_terminal_coding);
4785 return Qnil;
4786 }
4787
4788 DEFUN ("terminal-coding-system",
4789 Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
4790 "Return coding system specified for terminal output.")
4791 ()
4792 {
4793 return terminal_coding.symbol;
4794 }
4795
4796 DEFUN ("set-keyboard-coding-system-internal",
4797 Fset_keyboard_coding_system_internal,
4798 Sset_keyboard_coding_system_internal, 1, 1, 0, "")
4799 (coding_system)
4800 Lisp_Object coding_system;
4801 {
4802 CHECK_SYMBOL (coding_system, 0);
4803 setup_coding_system (Fcheck_coding_system (coding_system), &keyboard_coding);
4804 return Qnil;
4805 }
4806
4807 DEFUN ("keyboard-coding-system",
4808 Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
4809 "Return coding system specified for decoding keyboard input.")
4810 ()
4811 {
4812 return keyboard_coding.symbol;
4813 }
4814
4815 \f
4816 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
4817 Sfind_operation_coding_system, 1, MANY, 0,
4818 "Choose a coding system for an operation based on the target name.\n\
4819 The value names a pair of coding systems: (DECODING-SYSTEM ENCODING-SYSTEM).\n\
4820 DECODING-SYSTEM is the coding system to use for decoding\n\
4821 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system\n\
4822 for encoding (in case OPERATION does encoding).\n\
4823 \n\
4824 The first argument OPERATION specifies an I/O primitive:\n\
4825 For file I/O, `insert-file-contents' or `write-region'.\n\
4826 For process I/O, `call-process', `call-process-region', or `start-process'.\n\
4827 For network I/O, `open-network-stream'.\n\
4828 \n\
4829 The remaining arguments should be the same arguments that were passed\n\
4830 to the primitive. Depending on which primitive, one of those arguments\n\
4831 is selected as the TARGET. For example, if OPERATION does file I/O,\n\
4832 whichever argument specifies the file name is TARGET.\n\
4833 \n\
4834 TARGET has a meaning which depends on OPERATION:\n\
4835 For file I/O, TARGET is a file name.\n\
4836 For process I/O, TARGET is a process name.\n\
4837 For network I/O, TARGET is a service name or a port number\n\
4838 \n\
4839 This function looks up what specified for TARGET in,\n\
4840 `file-coding-system-alist', `process-coding-system-alist',\n\
4841 or `network-coding-system-alist' depending on OPERATION.\n\
4842 They may specify a coding system, a cons of coding systems,\n\
4843 or a function symbol to call.\n\
4844 In the last case, we call the function with one argument,\n\
4845 which is a list of all the arguments given to this function.")
4846 (nargs, args)
4847 int nargs;
4848 Lisp_Object *args;
4849 {
4850 Lisp_Object operation, target_idx, target, val;
4851 register Lisp_Object chain;
4852
4853 if (nargs < 2)
4854 error ("Too few arguments");
4855 operation = args[0];
4856 if (!SYMBOLP (operation)
4857 || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
4858 error ("Invalid first arguement");
4859 if (nargs < 1 + XINT (target_idx))
4860 error ("Too few arguments for operation: %s",
4861 XSYMBOL (operation)->name->data);
4862 target = args[XINT (target_idx) + 1];
4863 if (!(STRINGP (target)
4864 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
4865 error ("Invalid %dth argument", XINT (target_idx) + 1);
4866
4867 chain = ((EQ (operation, Qinsert_file_contents)
4868 || EQ (operation, Qwrite_region))
4869 ? Vfile_coding_system_alist
4870 : (EQ (operation, Qopen_network_stream)
4871 ? Vnetwork_coding_system_alist
4872 : Vprocess_coding_system_alist));
4873 if (NILP (chain))
4874 return Qnil;
4875
4876 for (; CONSP (chain); chain = XCONS (chain)->cdr)
4877 {
4878 Lisp_Object elt;
4879 elt = XCONS (chain)->car;
4880
4881 if (CONSP (elt)
4882 && ((STRINGP (target)
4883 && STRINGP (XCONS (elt)->car)
4884 && fast_string_match (XCONS (elt)->car, target) >= 0)
4885 || (INTEGERP (target) && EQ (target, XCONS (elt)->car))))
4886 {
4887 val = XCONS (elt)->cdr;
4888 /* Here, if VAL is both a valid coding system and a valid
4889 function symbol, we return VAL as a coding system. */
4890 if (CONSP (val))
4891 return val;
4892 if (! SYMBOLP (val))
4893 return Qnil;
4894 if (! NILP (Fcoding_system_p (val)))
4895 return Fcons (val, val);
4896 if (! NILP (Ffboundp (val)))
4897 {
4898 val = call1 (val, Flist (nargs, args));
4899 if (CONSP (val))
4900 return val;
4901 if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
4902 return Fcons (val, val);
4903 }
4904 return Qnil;
4905 }
4906 }
4907 return Qnil;
4908 }
4909
4910 DEFUN ("update-iso-coding-systems", Fupdate_iso_coding_systems,
4911 Supdate_iso_coding_systems, 0, 0, 0,
4912 "Update internal database for ISO2022 based coding systems.\n\
4913 When values of the following coding categories are changed, you must\n\
4914 call this function:\n\
4915 coding-category-iso-7, coding-category-iso-7-tight,\n\
4916 coding-category-iso-8-1, coding-category-iso-8-2,\n\
4917 coding-category-iso-7-else, coding-category-iso-8-else")
4918 ()
4919 {
4920 int i;
4921
4922 for (i = CODING_CATEGORY_IDX_ISO_7; i <= CODING_CATEGORY_IDX_ISO_8_ELSE;
4923 i++)
4924 {
4925 if (! coding_system_table[i])
4926 coding_system_table[i]
4927 = (struct coding_system *) xmalloc (sizeof (struct coding_system));
4928 setup_coding_system
4929 (XSYMBOL (XVECTOR (Vcoding_category_table)->contents[i])->value,
4930 coding_system_table[i]);
4931 }
4932 return Qnil;
4933 }
4934
4935 #endif /* emacs */
4936
4937 \f
4938 /*** 8. Post-amble ***/
4939
4940 init_coding_once ()
4941 {
4942 int i;
4943
4944 /* Emacs' internal format specific initialize routine. */
4945 for (i = 0; i <= 0x20; i++)
4946 emacs_code_class[i] = EMACS_control_code;
4947 emacs_code_class[0x0A] = EMACS_linefeed_code;
4948 emacs_code_class[0x0D] = EMACS_carriage_return_code;
4949 for (i = 0x21 ; i < 0x7F; i++)
4950 emacs_code_class[i] = EMACS_ascii_code;
4951 emacs_code_class[0x7F] = EMACS_control_code;
4952 emacs_code_class[0x80] = EMACS_leading_code_composition;
4953 for (i = 0x81; i < 0xFF; i++)
4954 emacs_code_class[i] = EMACS_invalid_code;
4955 emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
4956 emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
4957 emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
4958 emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;
4959
4960 /* ISO2022 specific initialize routine. */
4961 for (i = 0; i < 0x20; i++)
4962 iso_code_class[i] = ISO_control_code;
4963 for (i = 0x21; i < 0x7F; i++)
4964 iso_code_class[i] = ISO_graphic_plane_0;
4965 for (i = 0x80; i < 0xA0; i++)
4966 iso_code_class[i] = ISO_control_code;
4967 for (i = 0xA1; i < 0xFF; i++)
4968 iso_code_class[i] = ISO_graphic_plane_1;
4969 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
4970 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
4971 iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
4972 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
4973 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
4974 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
4975 iso_code_class[ISO_CODE_ESC] = ISO_escape;
4976 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
4977 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
4978 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
4979
4980 conversion_buffer_size = MINIMUM_CONVERSION_BUFFER_SIZE;
4981 conversion_buffer = (char *) xmalloc (MINIMUM_CONVERSION_BUFFER_SIZE);
4982
4983 setup_coding_system (Qnil, &keyboard_coding);
4984 setup_coding_system (Qnil, &terminal_coding);
4985 setup_coding_system (Qnil, &safe_terminal_coding);
4986
4987 bzero (coding_system_table, sizeof coding_system_table);
4988
4989 #if defined (MSDOS) || defined (WINDOWSNT)
4990 system_eol_type = CODING_EOL_CRLF;
4991 #else
4992 system_eol_type = CODING_EOL_LF;
4993 #endif
4994 }
4995
4996 #ifdef emacs
4997
4998 syms_of_coding ()
4999 {
5000 Qtarget_idx = intern ("target-idx");
5001 staticpro (&Qtarget_idx);
5002
5003 Qcoding_system_history = intern ("coding-system-history");
5004 staticpro (&Qcoding_system_history);
5005 Fset (Qcoding_system_history, Qnil);
5006
5007 /* Target FILENAME is the first argument. */
5008 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
5009 /* Target FILENAME is the third argument. */
5010 Fput (Qwrite_region, Qtarget_idx, make_number (2));
5011
5012 Qcall_process = intern ("call-process");
5013 staticpro (&Qcall_process);
5014 /* Target PROGRAM is the first argument. */
5015 Fput (Qcall_process, Qtarget_idx, make_number (0));
5016
5017 Qcall_process_region = intern ("call-process-region");
5018 staticpro (&Qcall_process_region);
5019 /* Target PROGRAM is the third argument. */
5020 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
5021
5022 Qstart_process = intern ("start-process");
5023 staticpro (&Qstart_process);
5024 /* Target PROGRAM is the third argument. */
5025 Fput (Qstart_process, Qtarget_idx, make_number (2));
5026
5027 Qopen_network_stream = intern ("open-network-stream");
5028 staticpro (&Qopen_network_stream);
5029 /* Target SERVICE is the fourth argument. */
5030 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
5031
5032 Qcoding_system = intern ("coding-system");
5033 staticpro (&Qcoding_system);
5034
5035 Qeol_type = intern ("eol-type");
5036 staticpro (&Qeol_type);
5037
5038 Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
5039 staticpro (&Qbuffer_file_coding_system);
5040
5041 Qpost_read_conversion = intern ("post-read-conversion");
5042 staticpro (&Qpost_read_conversion);
5043
5044 Qpre_write_conversion = intern ("pre-write-conversion");
5045 staticpro (&Qpre_write_conversion);
5046
5047 Qno_conversion = intern ("no-conversion");
5048 staticpro (&Qno_conversion);
5049
5050 Qundecided = intern ("undecided");
5051 staticpro (&Qundecided);
5052
5053 Qcoding_system_p = intern ("coding-system-p");
5054 staticpro (&Qcoding_system_p);
5055
5056 Qcoding_system_error = intern ("coding-system-error");
5057 staticpro (&Qcoding_system_error);
5058
5059 Fput (Qcoding_system_error, Qerror_conditions,
5060 Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
5061 Fput (Qcoding_system_error, Qerror_message,
5062 build_string ("Invalid coding system"));
5063
5064 Qcoding_category = intern ("coding-category");
5065 staticpro (&Qcoding_category);
5066 Qcoding_category_index = intern ("coding-category-index");
5067 staticpro (&Qcoding_category_index);
5068
5069 Vcoding_category_table
5070 = Fmake_vector (make_number (CODING_CATEGORY_IDX_MAX), Qnil);
5071 staticpro (&Vcoding_category_table);
5072 {
5073 int i;
5074 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
5075 {
5076 XVECTOR (Vcoding_category_table)->contents[i]
5077 = intern (coding_category_name[i]);
5078 Fput (XVECTOR (Vcoding_category_table)->contents[i],
5079 Qcoding_category_index, make_number (i));
5080 }
5081 }
5082
5083 Qcharacter_unification_table = intern ("character-unification-table");
5084 staticpro (&Qcharacter_unification_table);
5085 Fput (Qcharacter_unification_table, Qchar_table_extra_slots,
5086 make_number (0));
5087
5088 Qcharacter_unification_table_for_decode
5089 = intern ("character-unification-table-for-decode");
5090 staticpro (&Qcharacter_unification_table_for_decode);
5091
5092 Qcharacter_unification_table_for_encode
5093 = intern ("character-unification-table-for-encode");
5094 staticpro (&Qcharacter_unification_table_for_encode);
5095
5096 Qsafe_charsets = intern ("safe-charsets");
5097 staticpro (&Qsafe_charsets);
5098
5099 Qemacs_mule = intern ("emacs-mule");
5100 staticpro (&Qemacs_mule);
5101
5102 Qraw_text = intern ("raw-text");
5103 staticpro (&Qraw_text);
5104
5105 defsubr (&Scoding_system_p);
5106 defsubr (&Sread_coding_system);
5107 defsubr (&Sread_non_nil_coding_system);
5108 defsubr (&Scheck_coding_system);
5109 defsubr (&Sdetect_coding_region);
5110 defsubr (&Sdetect_coding_string);
5111 defsubr (&Sdecode_coding_region);
5112 defsubr (&Sencode_coding_region);
5113 defsubr (&Sdecode_coding_string);
5114 defsubr (&Sencode_coding_string);
5115 defsubr (&Sdecode_sjis_char);
5116 defsubr (&Sencode_sjis_char);
5117 defsubr (&Sdecode_big5_char);
5118 defsubr (&Sencode_big5_char);
5119 defsubr (&Sset_terminal_coding_system_internal);
5120 defsubr (&Sset_safe_terminal_coding_system_internal);
5121 defsubr (&Sterminal_coding_system);
5122 defsubr (&Sset_keyboard_coding_system_internal);
5123 defsubr (&Skeyboard_coding_system);
5124 defsubr (&Sfind_operation_coding_system);
5125 defsubr (&Supdate_iso_coding_systems);
5126
5127 DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
5128 "List of coding systems.\n\
5129 \n\
5130 Do not alter the value of this variable manually. This variable should be\n\
5131 updated by the functions `make-coding-system' and\n\
5132 `define-coding-system-alias'.");
5133 Vcoding_system_list = Qnil;
5134
5135 DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
5136 "Alist of coding system names.\n\
5137 Each element is one element list of coding system name.\n\
5138 This variable is given to `completing-read' as TABLE argument.\n\
5139 \n\
5140 Do not alter the value of this variable manually. This variable should be\n\
5141 updated by the functions `make-coding-system' and\n\
5142 `define-coding-system-alias'.");
5143 Vcoding_system_alist = Qnil;
5144
5145 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
5146 "List of coding-categories (symbols) ordered by priority.");
5147 {
5148 int i;
5149
5150 Vcoding_category_list = Qnil;
5151 for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
5152 Vcoding_category_list
5153 = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
5154 Vcoding_category_list);
5155 }
5156
5157 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
5158 "Specify the coding system for read operations.\n\
5159 It is useful to bind this variable with `let', but do not set it globally.\n\
5160 If the value is a coding system, it is used for decoding on read operation.\n\
5161 If not, an appropriate element is used from one of the coding system alists:\n\
5162 There are three such tables, `file-coding-system-alist',\n\
5163 `process-coding-system-alist', and `network-coding-system-alist'.");
5164 Vcoding_system_for_read = Qnil;
5165
5166 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
5167 "Specify the coding system for write operations.\n\
5168 It is useful to bind this variable with `let', but do not set it globally.\n\
5169 If the value is a coding system, it is used for encoding on write operation.\n\
5170 If not, an appropriate element is used from one of the coding system alists:\n\
5171 There are three such tables, `file-coding-system-alist',\n\
5172 `process-coding-system-alist', and `network-coding-system-alist'.");
5173 Vcoding_system_for_write = Qnil;
5174
5175 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
5176 "Coding system used in the latest file or process I/O.");
5177 Vlast_coding_system_used = Qnil;
5178
5179 DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
5180 "*Non-nil inhibit code conversion of end-of-line format in any cases.");
5181 inhibit_eol_conversion = 0;
5182
5183 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
5184 "Alist to decide a coding system to use for a file I/O operation.\n\
5185 The format is ((PATTERN . VAL) ...),\n\
5186 where PATTERN is a regular expression matching a file name,\n\
5187 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
5188 If VAL is a coding system, it is used for both decoding and encoding\n\
5189 the file contents.\n\
5190 If VAL is a cons of coding systems, the car part is used for decoding,\n\
5191 and the cdr part is used for encoding.\n\
5192 If VAL is a function symbol, the function must return a coding system\n\
5193 or a cons of coding systems which are used as above.\n\
5194 \n\
5195 See also the function `find-operation-coding-system'.");
5196 Vfile_coding_system_alist = Qnil;
5197
5198 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
5199 "Alist to decide a coding system to use for a process I/O operation.\n\
5200 The format is ((PATTERN . VAL) ...),\n\
5201 where PATTERN is a regular expression matching a program name,\n\
5202 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
5203 If VAL is a coding system, it is used for both decoding what received\n\
5204 from the program and encoding what sent to the program.\n\
5205 If VAL is a cons of coding systems, the car part is used for decoding,\n\
5206 and the cdr part is used for encoding.\n\
5207 If VAL is a function symbol, the function must return a coding system\n\
5208 or a cons of coding systems which are used as above.\n\
5209 \n\
5210 See also the function `find-operation-coding-system'.");
5211 Vprocess_coding_system_alist = Qnil;
5212
5213 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
5214 "Alist to decide a coding system to use for a network I/O operation.\n\
5215 The format is ((PATTERN . VAL) ...),\n\
5216 where PATTERN is a regular expression matching a network service name\n\
5217 or is a port number to connect to,\n\
5218 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
5219 If VAL is a coding system, it is used for both decoding what received\n\
5220 from the network stream and encoding what sent to the network stream.\n\
5221 If VAL is a cons of coding systems, the car part is used for decoding,\n\
5222 and the cdr part is used for encoding.\n\
5223 If VAL is a function symbol, the function must return a coding system\n\
5224 or a cons of coding systems which are used as above.\n\
5225 \n\
5226 See also the function `find-operation-coding-system'.");
5227 Vnetwork_coding_system_alist = Qnil;
5228
5229 DEFVAR_INT ("eol-mnemonic-unix", &eol_mnemonic_unix,
5230 "Mnemonic character indicating UNIX-like end-of-line format (i.e. LF) .");
5231 eol_mnemonic_unix = ':';
5232
5233 DEFVAR_INT ("eol-mnemonic-dos", &eol_mnemonic_dos,
5234 "Mnemonic character indicating DOS-like end-of-line format (i.e. CRLF).");
5235 eol_mnemonic_dos = '\\';
5236
5237 DEFVAR_INT ("eol-mnemonic-mac", &eol_mnemonic_mac,
5238 "Mnemonic character indicating MAC-like end-of-line format (i.e. CR).");
5239 eol_mnemonic_mac = '/';
5240
5241 DEFVAR_INT ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
5242 "Mnemonic character indicating end-of-line format is not yet decided.");
5243 eol_mnemonic_undecided = ':';
5244
5245 DEFVAR_LISP ("enable-character-unification", &Venable_character_unification,
5246 "Non-nil means ISO 2022 encoder/decoder do character unification.");
5247 Venable_character_unification = Qt;
5248
5249 DEFVAR_LISP ("standard-character-unification-table-for-decode",
5250 &Vstandard_character_unification_table_for_decode,
5251 "Table for unifying characters when reading.");
5252 Vstandard_character_unification_table_for_decode = Qnil;
5253
5254 DEFVAR_LISP ("standard-character-unification-table-for-encode",
5255 &Vstandard_character_unification_table_for_encode,
5256 "Table for unifying characters when writing.");
5257 Vstandard_character_unification_table_for_encode = Qnil;
5258
5259 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
5260 "Alist of charsets vs revision numbers.\n\
5261 While encoding, if a charset (car part of an element) is found,\n\
5262 designate it with the escape sequence identifing revision (cdr part of the element).");
5263 Vcharset_revision_alist = Qnil;
5264
5265 DEFVAR_LISP ("default-process-coding-system",
5266 &Vdefault_process_coding_system,
5267 "Cons of coding systems used for process I/O by default.\n\
5268 The car part is used for decoding a process output,\n\
5269 the cdr part is used for encoding a text to be sent to a process.");
5270 Vdefault_process_coding_system = Qnil;
5271
5272 DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
5273 "Table of extra Latin codes in the range 128..159 (inclusive).\n\
5274 This is a vector of length 256.\n\
5275 If Nth element is non-nil, the existence of code N in a file\n\
5276 \(or output of subprocess) doesn't prevent it to be detected as\n\
5277 a coding system of ISO 2022 variant which has a flag\n\
5278 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file\n\
5279 or reading output of a subprocess.\n\
5280 Only 128th through 159th elements has a meaning.");
5281 Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
5282
5283 DEFVAR_LISP ("select-safe-coding-system-function",
5284 &Vselect_safe_coding_system_function,
5285 "Function to call to select safe coding system for encoding a text.\n\
5286 \n\
5287 If set, this function is called to force a user to select a proper\n\
5288 coding system which can encode the text in the case that a default\n\
5289 coding system used in each operation can't encode the text.\n\
5290 \n\
5291 The default value is `select-safe-codign-system' (which see).");
5292 Vselect_safe_coding_system_function = Qnil;
5293
5294 }
5295
5296 #endif /* emacs */