*** empty log message ***
[bpt/emacs.git] / src / coding.c
1 /* Coding system handler (conversion, detection, etc).
2 Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
3 Licensed to the Free Software Foundation.
4 Copyright (C) 2001, 2002 Free Software Foundation, Inc.
5 Copyright (C) 2001, 2002
6 National Institute of Advanced Industrial Science and Technology (AIST)
7 Registration Number H13PRO009
8
9 This file is part of GNU Emacs.
10
11 GNU Emacs is free software; you can redistribute it and/or modify
12 it under the terms of the GNU General Public License as published by
13 the Free Software Foundation; either version 2, or (at your option)
14 any later version.
15
16 GNU Emacs is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 GNU General Public License for more details.
20
21 You should have received a copy of the GNU General Public License
22 along with GNU Emacs; see the file COPYING. If not, write to
23 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24 Boston, MA 02111-1307, USA. */
25
26 /*** TABLE OF CONTENTS ***
27
28 0. General comments
29 1. Preamble
30 2. Emacs' internal format (emacs-utf-8) handlers
31 3. UTF-8 handlers
32 4. UTF-16 handlers
33 5. Charset-base coding systems handlers
34 6. emacs-mule (old Emacs' internal format) handlers
35 7. ISO2022 handlers
36 8. Shift-JIS and BIG5 handlers
37 9. CCL handlers
38 10. C library functions
39 11. Emacs Lisp library functions
40 12. Postamble
41
42 */
43
44 /*** 0. General comments ***
45
46
47 CODING SYSTEM
48
49 A coding system is an object for an encoding mechanism that contains
50 information about how to convert byte sequences to character
51 sequences and vice versa. When we say "decode", it means converting
52 a byte sequence of a specific coding system into a character
53 sequence that is represented by Emacs' internal coding system
54 `emacs-utf-8', and when we say "encode", it means converting a
55 character sequence of emacs-utf-8 to a byte sequence of a specific
56 coding system.
57
58 In Emacs Lisp, a coding system is represented by a Lisp symbol. In
59 C level, a coding system is represented by a vector of attributes
60 stored in the hash table Vcharset_hash_table. The conversion from
61 coding system symbol to attributes vector is done by looking up
62 Vcharset_hash_table by the symbol.
63
64 Coding systems are classified into the following types depending on
65 the encoding mechanism. Here's a brief description of the types.
66
67 o UTF-8
68
69 o UTF-16
70
71 o Charset-base coding system
72
73 A coding system defined by one or more (coded) character sets.
74 Decoding and encoding are done by a code converter defined for each
75 character set.
76
77 o Old Emacs internal format (emacs-mule)
78
79 The coding system adopted by old versions of Emacs (20 and 21).
80
81 o ISO2022-base coding system
82
83 The most famous coding system for multiple character sets. X's
84 Compound Text, various EUCs (Extended Unix Code), and coding systems
85 used in the Internet communication such as ISO-2022-JP are all
86 variants of ISO2022.
87
88 o SJIS (or Shift-JIS or MS-Kanji-Code)
89
90 A coding system to encode character sets: ASCII, JISX0201, and
91 JISX0208. Widely used for PC's in Japan. Details are described in
92 section 8.
93
94 o BIG5
95
96 A coding system to encode character sets: ASCII and Big5. Widely
97 used for Chinese (mainly in Taiwan and Hong Kong). Details are
98 described in section 8. In this file, when we write "big5" (all
99 lowercase), we mean the coding system, and when we write "Big5"
100 (capitalized), we mean the character set.
101
102 o CCL
103
104 If a user wants to decode/encode text encoded in a coding system
105 not listed above, he can supply a decoder and an encoder for it in
106 CCL (Code Conversion Language) programs. Emacs executes the CCL
107 program while decoding/encoding.
108
109 o Raw-text
110
111 A coding system for text containing raw eight-bit data. Emacs
112 treats each byte of source text as a character (except for
113 end-of-line conversion).
114
115 o No-conversion
116
117 Like raw text, but don't do end-of-line conversion.
118
119
120 END-OF-LINE FORMAT
121
122 How text end-of-line is encoded depends on operating system. For
123 instance, Unix's format is just one byte of LF (line-feed) code,
124 whereas DOS's format is two-byte sequence of `carriage-return' and
125 `line-feed' codes. MacOS's format is usually one byte of
126 `carriage-return'.
127
128 Since text character encoding and end-of-line encoding are
129 independent, any coding system described above can take any format
130 of end-of-line (except for no-conversion).
131
132 STRUCT CODING_SYSTEM
133
134 Before using a coding system for code conversion (i.e. decoding and
135 encoding), we setup a structure of type `struct coding_system'.
136 This structure keeps various information about a specific code
137 conversion (e.g. the location of source and destination data).
138
139 */
140
141 /* COMMON MACROS */
142
143
144 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
145
146 These functions check if a byte sequence specified as a source in
147 CODING conforms to the format of XXX, and update the members of
148 DETECT_INFO.
149
150 Return 1 if the byte sequence conforms to XXX, otherwise return 0.
151
152 Below is the template of these functions. */
153
154 #if 0
155 static int
156 detect_coding_XXX (coding, detect_info)
157 struct coding_system *coding;
158 struct coding_detection_info *detect_info;
159 {
160 unsigned char *src = coding->source;
161 unsigned char *src_end = coding->source + coding->src_bytes;
162 int multibytep = coding->src_multibyte;
163 int consumed_chars = 0;
164 int found = 0;
165 ...;
166
167 while (1)
168 {
169 /* Get one byte from the source. If the souce is exausted, jump
170 to no_more_source:. */
171 ONE_MORE_BYTE (c);
172
173 if (! __C_conforms_to_XXX___ (c))
174 break;
175 if (! __C_strongly_suggests_XXX__ (c))
176 found = CATEGORY_MASK_XXX;
177 }
178 /* The byte sequence is invalid for XXX. */
179 detect_info->rejected |= CATEGORY_MASK_XXX;
180 return 0;
181
182 no_more_source:
183 /* The source exausted successfully. */
184 detect_info->found |= found;
185 return 1;
186 }
187 #endif
188
189 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
190
191 These functions decode a byte sequence specified as a source by
192 CODING. The resulting multibyte text goes to a place pointed to by
193 CODING->charbuf, the length of which should not exceed
194 CODING->charbuf_size;
195
196 These functions set the information of original and decoded texts in
197 CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
198 They also set CODING->result to one of CODING_RESULT_XXX indicating
199 how the decoding is finished.
200
201 Below is the template of these functions. */
202
203 #if 0
204 static void
205 decode_coding_XXXX (coding)
206 struct coding_system *coding;
207 {
208 unsigned char *src = coding->source + coding->consumed;
209 unsigned char *src_end = coding->source + coding->src_bytes;
210 /* SRC_BASE remembers the start position in source in each loop.
211 The loop will be exited when there's not enough source code, or
212 when there's no room in CHARBUF for a decoded character. */
213 unsigned char *src_base;
214 /* A buffer to produce decoded characters. */
215 int *charbuf = coding->charbuf;
216 int *charbuf_end = charbuf + coding->charbuf_size;
217 int multibytep = coding->src_multibyte;
218
219 while (1)
220 {
221 src_base = src;
222 if (charbuf < charbuf_end)
223 /* No more room to produce a decoded character. */
224 break;
225 ONE_MORE_BYTE (c);
226 /* Decode it. */
227 }
228
229 no_more_source:
230 if (src_base < src_end
231 && coding->mode & CODING_MODE_LAST_BLOCK)
232 /* If the source ends by partial bytes to construct a character,
233 treat them as eight-bit raw data. */
234 while (src_base < src_end && charbuf < charbuf_end)
235 *charbuf++ = *src_base++;
236 /* Remember how many bytes and characters we consumed. If the
237 source is multibyte, the bytes and chars are not identical. */
238 coding->consumed = coding->consumed_char = src_base - coding->source;
239 /* Remember how many characters we produced. */
240 coding->charbuf_used = charbuf - coding->charbuf;
241 }
242 #endif
243
244 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
245
246 These functions encode SRC_BYTES length text at SOURCE of Emacs'
247 internal multibyte format by CODING. The resulting byte sequence
248 goes to a place pointed to by DESTINATION, the length of which
249 should not exceed DST_BYTES.
250
251 These functions set the information of original and encoded texts in
252 the members produced, produced_char, consumed, and consumed_char of
253 the structure *CODING. They also set the member result to one of
254 CODING_RESULT_XXX indicating how the encoding finished.
255
256 DST_BYTES zero means that source area and destination area are
257 overlapped, which means that we can produce a encoded text until it
258 reaches at the head of not-yet-encoded source text.
259
260 Below is a template of these functions. */
261 #if 0
262 static void
263 encode_coding_XXX (coding)
264 struct coding_system *coding;
265 {
266 int multibytep = coding->dst_multibyte;
267 int *charbuf = coding->charbuf;
268 int *charbuf_end = charbuf->charbuf + coding->charbuf_used;
269 unsigned char *dst = coding->destination + coding->produced;
270 unsigned char *dst_end = coding->destination + coding->dst_bytes;
271 unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
272 int produced_chars = 0;
273
274 for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
275 {
276 int c = *charbuf;
277 /* Encode C into DST, and increment DST. */
278 }
279 label_no_more_destination:
280 /* How many chars and bytes we produced. */
281 coding->produced_char += produced_chars;
282 coding->produced = dst - coding->destination;
283 }
284 #endif
285
286 \f
287 /*** 1. Preamble ***/
288
289 #include <config.h>
290 #include <stdio.h>
291
292 #include "lisp.h"
293 #include "buffer.h"
294 #include "character.h"
295 #include "charset.h"
296 #include "ccl.h"
297 #include "composite.h"
298 #include "coding.h"
299 #include "window.h"
300
301 Lisp_Object Vcoding_system_hash_table;
302
303 Lisp_Object Qcoding_system, Qcoding_aliases, Qeol_type;
304 Lisp_Object Qunix, Qdos;
305 extern Lisp_Object Qmac; /* frame.c */
306 Lisp_Object Qbuffer_file_coding_system;
307 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
308 Lisp_Object Qdefault_char;
309 Lisp_Object Qno_conversion, Qundecided;
310 Lisp_Object Qcharset, Qiso_2022, Qutf_8, Qutf_16, Qshift_jis, Qbig5;
311 Lisp_Object Qsignature, Qendian, Qbig, Qlittle;
312 Lisp_Object Qcoding_system_history;
313 Lisp_Object Qvalid_codes;
314
315 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
316 Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
317 Lisp_Object Qstart_process, Qopen_network_stream;
318 Lisp_Object Qtarget_idx;
319
320 Lisp_Object Vselect_safe_coding_system_function;
321
322 /* Mnemonic string for each format of end-of-line. */
323 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
324 /* Mnemonic string to indicate format of end-of-line is not yet
325 decided. */
326 Lisp_Object eol_mnemonic_undecided;
327
328 #ifdef emacs
329
330 Lisp_Object Vcoding_system_list, Vcoding_system_alist;
331
332 Lisp_Object Qcoding_system_p, Qcoding_system_error;
333
334 /* Coding system emacs-mule and raw-text are for converting only
335 end-of-line format. */
336 Lisp_Object Qemacs_mule, Qraw_text;
337
338 /* Coding-systems are handed between Emacs Lisp programs and C internal
339 routines by the following three variables. */
340 /* Coding-system for reading files and receiving data from process. */
341 Lisp_Object Vcoding_system_for_read;
342 /* Coding-system for writing files and sending data to process. */
343 Lisp_Object Vcoding_system_for_write;
344 /* Coding-system actually used in the latest I/O. */
345 Lisp_Object Vlast_coding_system_used;
346
347 /* A vector of length 256 which contains information about special
348 Latin codes (especially for dealing with Microsoft codes). */
349 Lisp_Object Vlatin_extra_code_table;
350
351 /* Flag to inhibit code conversion of end-of-line format. */
352 int inhibit_eol_conversion;
353
354 /* Flag to inhibit ISO2022 escape sequence detection. */
355 int inhibit_iso_escape_detection;
356
357 /* Flag to make buffer-file-coding-system inherit from process-coding. */
358 int inherit_process_coding_system;
359
360 /* Coding system to be used to encode text for terminal display. */
361 struct coding_system terminal_coding;
362
363 /* Coding system to be used to encode text for terminal display when
364 terminal coding system is nil. */
365 struct coding_system safe_terminal_coding;
366
367 /* Coding system of what is sent from terminal keyboard. */
368 struct coding_system keyboard_coding;
369
370 Lisp_Object Vfile_coding_system_alist;
371 Lisp_Object Vprocess_coding_system_alist;
372 Lisp_Object Vnetwork_coding_system_alist;
373
374 Lisp_Object Vlocale_coding_system;
375
376 #endif /* emacs */
377
378 /* Flag to tell if we look up translation table on character code
379 conversion. */
380 Lisp_Object Venable_character_translation;
381 /* Standard translation table to look up on decoding (reading). */
382 Lisp_Object Vstandard_translation_table_for_decode;
383 /* Standard translation table to look up on encoding (writing). */
384 Lisp_Object Vstandard_translation_table_for_encode;
385
386 Lisp_Object Qtranslation_table;
387 Lisp_Object Qtranslation_table_id;
388 Lisp_Object Qtranslation_table_for_decode;
389 Lisp_Object Qtranslation_table_for_encode;
390
391 /* Alist of charsets vs revision number. */
392 static Lisp_Object Vcharset_revision_table;
393
394 /* Default coding systems used for process I/O. */
395 Lisp_Object Vdefault_process_coding_system;
396
397 /* Global flag to tell that we can't call post-read-conversion and
398 pre-write-conversion functions. Usually the value is zero, but it
399 is set to 1 temporarily while such functions are running. This is
400 to avoid infinite recursive call. */
401 static int inhibit_pre_post_conversion;
402
403 /* Two special coding systems. */
404 Lisp_Object Vsjis_coding_system;
405 Lisp_Object Vbig5_coding_system;
406
407
408 static int detect_coding_utf_8 P_ ((struct coding_system *,
409 struct coding_detection_info *info));
410 static void decode_coding_utf_8 P_ ((struct coding_system *));
411 static int encode_coding_utf_8 P_ ((struct coding_system *));
412
413 static int detect_coding_utf_16 P_ ((struct coding_system *,
414 struct coding_detection_info *info));
415 static void decode_coding_utf_16 P_ ((struct coding_system *));
416 static int encode_coding_utf_16 P_ ((struct coding_system *));
417
418 static int detect_coding_iso_2022 P_ ((struct coding_system *,
419 struct coding_detection_info *info));
420 static void decode_coding_iso_2022 P_ ((struct coding_system *));
421 static int encode_coding_iso_2022 P_ ((struct coding_system *));
422
423 static int detect_coding_emacs_mule P_ ((struct coding_system *,
424 struct coding_detection_info *info));
425 static void decode_coding_emacs_mule P_ ((struct coding_system *));
426 static int encode_coding_emacs_mule P_ ((struct coding_system *));
427
428 static int detect_coding_sjis P_ ((struct coding_system *,
429 struct coding_detection_info *info));
430 static void decode_coding_sjis P_ ((struct coding_system *));
431 static int encode_coding_sjis P_ ((struct coding_system *));
432
433 static int detect_coding_big5 P_ ((struct coding_system *,
434 struct coding_detection_info *info));
435 static void decode_coding_big5 P_ ((struct coding_system *));
436 static int encode_coding_big5 P_ ((struct coding_system *));
437
438 static int detect_coding_ccl P_ ((struct coding_system *,
439 struct coding_detection_info *info));
440 static void decode_coding_ccl P_ ((struct coding_system *));
441 static int encode_coding_ccl P_ ((struct coding_system *));
442
443 static void decode_coding_raw_text P_ ((struct coding_system *));
444 static int encode_coding_raw_text P_ ((struct coding_system *));
445
446
447 /* ISO2022 section */
448
449 #define CODING_ISO_INITIAL(coding, reg) \
450 (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id), \
451 coding_attr_iso_initial), \
452 reg)))
453
454
455 #define CODING_ISO_REQUEST(coding, charset_id) \
456 ((charset_id <= (coding)->max_charset_id \
457 ? (coding)->safe_charsets[charset_id] \
458 : -1))
459
460
461 #define CODING_ISO_FLAGS(coding) \
462 ((coding)->spec.iso_2022.flags)
463 #define CODING_ISO_DESIGNATION(coding, reg) \
464 ((coding)->spec.iso_2022.current_designation[reg])
465 #define CODING_ISO_INVOCATION(coding, plane) \
466 ((coding)->spec.iso_2022.current_invocation[plane])
467 #define CODING_ISO_SINGLE_SHIFTING(coding) \
468 ((coding)->spec.iso_2022.single_shifting)
469 #define CODING_ISO_BOL(coding) \
470 ((coding)->spec.iso_2022.bol)
471 #define CODING_ISO_INVOKED_CHARSET(coding, plane) \
472 CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
473
474 /* Control characters of ISO2022. */
475 /* code */ /* function */
476 #define ISO_CODE_LF 0x0A /* line-feed */
477 #define ISO_CODE_CR 0x0D /* carriage-return */
478 #define ISO_CODE_SO 0x0E /* shift-out */
479 #define ISO_CODE_SI 0x0F /* shift-in */
480 #define ISO_CODE_SS2_7 0x19 /* single-shift-2 for 7-bit code */
481 #define ISO_CODE_ESC 0x1B /* escape */
482 #define ISO_CODE_SS2 0x8E /* single-shift-2 */
483 #define ISO_CODE_SS3 0x8F /* single-shift-3 */
484 #define ISO_CODE_CSI 0x9B /* control-sequence-introducer */
485
486 /* All code (1-byte) of ISO2022 is classified into one of the
487 followings. */
488 enum iso_code_class_type
489 {
490 ISO_control_0, /* Control codes in the range
491 0x00..0x1F and 0x7F, except for the
492 following 5 codes. */
493 ISO_carriage_return, /* ISO_CODE_CR (0x0D) */
494 ISO_shift_out, /* ISO_CODE_SO (0x0E) */
495 ISO_shift_in, /* ISO_CODE_SI (0x0F) */
496 ISO_single_shift_2_7, /* ISO_CODE_SS2_7 (0x19) */
497 ISO_escape, /* ISO_CODE_SO (0x1B) */
498 ISO_control_1, /* Control codes in the range
499 0x80..0x9F, except for the
500 following 3 codes. */
501 ISO_single_shift_2, /* ISO_CODE_SS2 (0x8E) */
502 ISO_single_shift_3, /* ISO_CODE_SS3 (0x8F) */
503 ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
504 ISO_0x20_or_0x7F, /* Codes of the values 0x20 or 0x7F. */
505 ISO_graphic_plane_0, /* Graphic codes in the range 0x21..0x7E. */
506 ISO_0xA0_or_0xFF, /* Codes of the values 0xA0 or 0xFF. */
507 ISO_graphic_plane_1 /* Graphic codes in the range 0xA1..0xFE. */
508 };
509
510 /** The macros CODING_ISO_FLAG_XXX defines a flag bit of the
511 `iso-flags' attribute of an iso2022 coding system. */
512
513 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
514 instead of the correct short-form sequence (e.g. ESC $ A). */
515 #define CODING_ISO_FLAG_LONG_FORM 0x0001
516
517 /* If set, reset graphic planes and registers at end-of-line to the
518 initial state. */
519 #define CODING_ISO_FLAG_RESET_AT_EOL 0x0002
520
521 /* If set, reset graphic planes and registers before any control
522 characters to the initial state. */
523 #define CODING_ISO_FLAG_RESET_AT_CNTL 0x0004
524
525 /* If set, encode by 7-bit environment. */
526 #define CODING_ISO_FLAG_SEVEN_BITS 0x0008
527
528 /* If set, use locking-shift function. */
529 #define CODING_ISO_FLAG_LOCKING_SHIFT 0x0010
530
531 /* If set, use single-shift function. Overwrite
532 CODING_ISO_FLAG_LOCKING_SHIFT. */
533 #define CODING_ISO_FLAG_SINGLE_SHIFT 0x0020
534
535 /* If set, use designation escape sequence. */
536 #define CODING_ISO_FLAG_DESIGNATION 0x0040
537
538 /* If set, produce revision number sequence. */
539 #define CODING_ISO_FLAG_REVISION 0x0080
540
541 /* If set, produce ISO6429's direction specifying sequence. */
542 #define CODING_ISO_FLAG_DIRECTION 0x0100
543
544 /* If set, assume designation states are reset at beginning of line on
545 output. */
546 #define CODING_ISO_FLAG_INIT_AT_BOL 0x0200
547
548 /* If set, designation sequence should be placed at beginning of line
549 on output. */
550 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
551
552 /* If set, do not encode unsafe charactes on output. */
553 #define CODING_ISO_FLAG_SAFE 0x0800
554
555 /* If set, extra latin codes (128..159) are accepted as a valid code
556 on input. */
557 #define CODING_ISO_FLAG_LATIN_EXTRA 0x1000
558
559 #define CODING_ISO_FLAG_COMPOSITION 0x2000
560
561 #define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000
562
563 #define CODING_ISO_FLAG_USE_ROMAN 0x8000
564
565 #define CODING_ISO_FLAG_USE_OLDJIS 0x10000
566
567 #define CODING_ISO_FLAG_FULL_SUPPORT 0x100000
568
569 /* A character to be produced on output if encoding of the original
570 character is prohibited by CODING_ISO_FLAG_SAFE. */
571 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION '?'
572
573
574 /* UTF-16 section */
575 #define CODING_UTF_16_BOM(coding) \
576 ((coding)->spec.utf_16.bom)
577
578 #define CODING_UTF_16_ENDIAN(coding) \
579 ((coding)->spec.utf_16.endian)
580
581 #define CODING_UTF_16_SURROGATE(coding) \
582 ((coding)->spec.utf_16.surrogate)
583
584
585 /* CCL section */
586 #define CODING_CCL_DECODER(coding) \
587 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
588 #define CODING_CCL_ENCODER(coding) \
589 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
590 #define CODING_CCL_VALIDS(coding) \
591 (XSTRING (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)) \
592 ->data)
593
594 /* Index for each coding category in `coding_categories' */
595
596 enum coding_category
597 {
598 coding_category_iso_7,
599 coding_category_iso_7_tight,
600 coding_category_iso_8_1,
601 coding_category_iso_8_2,
602 coding_category_iso_7_else,
603 coding_category_iso_8_else,
604 coding_category_utf_8,
605 coding_category_utf_16_auto,
606 coding_category_utf_16_be,
607 coding_category_utf_16_le,
608 coding_category_utf_16_be_nosig,
609 coding_category_utf_16_le_nosig,
610 coding_category_charset,
611 coding_category_sjis,
612 coding_category_big5,
613 coding_category_ccl,
614 coding_category_emacs_mule,
615 /* All above are targets of code detection. */
616 coding_category_raw_text,
617 coding_category_undecided,
618 coding_category_max
619 };
620
621 /* Definitions of flag bits used in detect_coding_XXXX. */
622 #define CATEGORY_MASK_ISO_7 (1 << coding_category_iso_7)
623 #define CATEGORY_MASK_ISO_7_TIGHT (1 << coding_category_iso_7_tight)
624 #define CATEGORY_MASK_ISO_8_1 (1 << coding_category_iso_8_1)
625 #define CATEGORY_MASK_ISO_8_2 (1 << coding_category_iso_8_2)
626 #define CATEGORY_MASK_ISO_7_ELSE (1 << coding_category_iso_7_else)
627 #define CATEGORY_MASK_ISO_8_ELSE (1 << coding_category_iso_8_else)
628 #define CATEGORY_MASK_UTF_8 (1 << coding_category_utf_8)
629 #define CATEGORY_MASK_UTF_16_BE (1 << coding_category_utf_16_be)
630 #define CATEGORY_MASK_UTF_16_LE (1 << coding_category_utf_16_le)
631 #define CATEGORY_MASK_UTF_16_BE_NOSIG (1 << coding_category_utf_16_be_nosig)
632 #define CATEGORY_MASK_UTF_16_LE_NOSIG (1 << coding_category_utf_16_le_nosig)
633 #define CATEGORY_MASK_CHARSET (1 << coding_category_charset)
634 #define CATEGORY_MASK_SJIS (1 << coding_category_sjis)
635 #define CATEGORY_MASK_BIG5 (1 << coding_category_big5)
636 #define CATEGORY_MASK_CCL (1 << coding_category_ccl)
637 #define CATEGORY_MASK_EMACS_MULE (1 << coding_category_emacs_mule)
638 #define CATEGORY_MASK_RAW_TEXT (1 << coding_category_raw_text)
639
640 /* This value is returned if detect_coding_mask () find nothing other
641 than ASCII characters. */
642 #define CATEGORY_MASK_ANY \
643 (CATEGORY_MASK_ISO_7 \
644 | CATEGORY_MASK_ISO_7_TIGHT \
645 | CATEGORY_MASK_ISO_8_1 \
646 | CATEGORY_MASK_ISO_8_2 \
647 | CATEGORY_MASK_ISO_7_ELSE \
648 | CATEGORY_MASK_ISO_8_ELSE \
649 | CATEGORY_MASK_UTF_8 \
650 | CATEGORY_MASK_UTF_16_BE \
651 | CATEGORY_MASK_UTF_16_LE \
652 | CATEGORY_MASK_UTF_16_BE_NOSIG \
653 | CATEGORY_MASK_UTF_16_LE_NOSIG \
654 | CATEGORY_MASK_CHARSET \
655 | CATEGORY_MASK_SJIS \
656 | CATEGORY_MASK_BIG5 \
657 | CATEGORY_MASK_CCL \
658 | CATEGORY_MASK_EMACS_MULE)
659
660
661 #define CATEGORY_MASK_ISO_7BIT \
662 (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
663
664 #define CATEGORY_MASK_ISO_8BIT \
665 (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
666
667 #define CATEGORY_MASK_ISO_ELSE \
668 (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
669
670 #define CATEGORY_MASK_ISO_ESCAPE \
671 (CATEGORY_MASK_ISO_7 \
672 | CATEGORY_MASK_ISO_7_TIGHT \
673 | CATEGORY_MASK_ISO_7_ELSE \
674 | CATEGORY_MASK_ISO_8_ELSE)
675
676 #define CATEGORY_MASK_ISO \
677 ( CATEGORY_MASK_ISO_7BIT \
678 | CATEGORY_MASK_ISO_8BIT \
679 | CATEGORY_MASK_ISO_ELSE)
680
681 #define CATEGORY_MASK_UTF_16 \
682 (CATEGORY_MASK_UTF_16_BE \
683 | CATEGORY_MASK_UTF_16_LE \
684 | CATEGORY_MASK_UTF_16_BE_NOSIG \
685 | CATEGORY_MASK_UTF_16_LE_NOSIG)
686
687
688 /* List of symbols `coding-category-xxx' ordered by priority. This
689 variable is exposed to Emacs Lisp. */
690 static Lisp_Object Vcoding_category_list;
691
692 /* Table of coding categories (Lisp symbols). This variable is for
693 internal use oly. */
694 static Lisp_Object Vcoding_category_table;
695
696 /* Table of coding-categories ordered by priority. */
697 static enum coding_category coding_priorities[coding_category_max];
698
699 /* Nth element is a coding context for the coding system bound to the
700 Nth coding category. */
701 static struct coding_system coding_categories[coding_category_max];
702
703 static int detected_mask[coding_category_raw_text] =
704 { CATEGORY_MASK_ISO,
705 CATEGORY_MASK_ISO,
706 CATEGORY_MASK_ISO,
707 CATEGORY_MASK_ISO,
708 CATEGORY_MASK_ISO,
709 CATEGORY_MASK_ISO,
710 CATEGORY_MASK_UTF_8,
711 CATEGORY_MASK_UTF_16,
712 CATEGORY_MASK_UTF_16,
713 CATEGORY_MASK_UTF_16,
714 CATEGORY_MASK_UTF_16,
715 CATEGORY_MASK_UTF_16,
716 CATEGORY_MASK_CHARSET,
717 CATEGORY_MASK_SJIS,
718 CATEGORY_MASK_BIG5,
719 CATEGORY_MASK_CCL,
720 CATEGORY_MASK_EMACS_MULE
721 };
722
723 /*** Commonly used macros and functions ***/
724
725 #ifndef min
726 #define min(a, b) ((a) < (b) ? (a) : (b))
727 #endif
728 #ifndef max
729 #define max(a, b) ((a) > (b) ? (a) : (b))
730 #endif
731
732 #define CODING_GET_INFO(coding, attrs, eol_type, charset_list) \
733 do { \
734 attrs = CODING_ID_ATTRS (coding->id); \
735 eol_type = CODING_ID_EOL_TYPE (coding->id); \
736 if (VECTORP (eol_type)) \
737 eol_type = Qunix; \
738 charset_list = CODING_ATTR_CHARSET_LIST (attrs); \
739 } while (0)
740
741
742 /* Safely get one byte from the source text pointed by SRC which ends
743 at SRC_END, and set C to that byte. If there are not enough bytes
744 in the source, it jumps to `no_more_source'. The caller
745 should declare and set these variables appropriately in advance:
746 src, src_end, multibytep
747 */
748
749 #define ONE_MORE_BYTE(c) \
750 do { \
751 if (src == src_end) \
752 { \
753 if (src_base < src) \
754 coding->result = CODING_RESULT_INSUFFICIENT_SRC; \
755 goto no_more_source; \
756 } \
757 c = *src++; \
758 if (multibytep && (c & 0x80)) \
759 { \
760 if ((c & 0xFE) != 0xC0) \
761 error ("Undecodable char found"); \
762 c = ((c & 1) << 6) | *src++; \
763 } \
764 consumed_chars++; \
765 } while (0)
766
767
768 #define ONE_MORE_BYTE_NO_CHECK(c) \
769 do { \
770 c = *src++; \
771 if (multibytep && (c & 0x80)) \
772 { \
773 if ((c & 0xFE) != 0xC0) \
774 error ("Undecodable char found"); \
775 c = ((c & 1) << 6) | *src++; \
776 } \
777 consumed_chars++; \
778 } while (0)
779
780
781 /* Store a byte C in the place pointed by DST and increment DST to the
782 next free point, and increment PRODUCED_CHARS. The caller should
783 assure that C is 0..127, and declare and set the variable `dst'
784 appropriately in advance.
785 */
786
787
788 #define EMIT_ONE_ASCII_BYTE(c) \
789 do { \
790 produced_chars++; \
791 *dst++ = (c); \
792 } while (0)
793
794
795 /* Like EMIT_ONE_ASCII_BYTE byt store two bytes; C1 and C2. */
796
797 #define EMIT_TWO_ASCII_BYTES(c1, c2) \
798 do { \
799 produced_chars += 2; \
800 *dst++ = (c1), *dst++ = (c2); \
801 } while (0)
802
803
804 /* Store a byte C in the place pointed by DST and increment DST to the
805 next free point, and increment PRODUCED_CHARS. If MULTIBYTEP is
806 nonzero, store in an appropriate multibyte from. The caller should
807 declare and set the variables `dst' and `multibytep' appropriately
808 in advance. */
809
810 #define EMIT_ONE_BYTE(c) \
811 do { \
812 produced_chars++; \
813 if (multibytep) \
814 { \
815 int ch = (c); \
816 if (ch >= 0x80) \
817 ch = BYTE8_TO_CHAR (ch); \
818 CHAR_STRING_ADVANCE (ch, dst); \
819 } \
820 else \
821 *dst++ = (c); \
822 } while (0)
823
824
825 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2. */
826
827 #define EMIT_TWO_BYTES(c1, c2) \
828 do { \
829 produced_chars += 2; \
830 if (multibytep) \
831 { \
832 int ch; \
833 \
834 ch = (c1); \
835 if (ch >= 0x80) \
836 ch = BYTE8_TO_CHAR (ch); \
837 CHAR_STRING_ADVANCE (ch, dst); \
838 ch = (c2); \
839 if (ch >= 0x80) \
840 ch = BYTE8_TO_CHAR (ch); \
841 CHAR_STRING_ADVANCE (ch, dst); \
842 } \
843 else \
844 { \
845 *dst++ = (c1); \
846 *dst++ = (c2); \
847 } \
848 } while (0)
849
850
851 #define EMIT_THREE_BYTES(c1, c2, c3) \
852 do { \
853 EMIT_ONE_BYTE (c1); \
854 EMIT_TWO_BYTES (c2, c3); \
855 } while (0)
856
857
858 #define EMIT_FOUR_BYTES(c1, c2, c3, c4) \
859 do { \
860 EMIT_TWO_BYTES (c1, c2); \
861 EMIT_TWO_BYTES (c3, c4); \
862 } while (0)
863
864
865 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
866 do { \
867 charset_map_loaded = 0; \
868 c = DECODE_CHAR (charset, code); \
869 if (charset_map_loaded) \
870 { \
871 unsigned char *orig = coding->source; \
872 EMACS_INT offset; \
873 \
874 coding_set_source (coding); \
875 offset = coding->source - orig; \
876 src += offset; \
877 src_base += offset; \
878 src_end += offset; \
879 } \
880 } while (0)
881
882
883 #define ASSURE_DESTINATION(bytes) \
884 do { \
885 if (dst + (bytes) >= dst_end) \
886 { \
887 int more_bytes = charbuf_end - charbuf + (bytes); \
888 \
889 dst = alloc_destination (coding, more_bytes, dst); \
890 dst_end = coding->destination + coding->dst_bytes; \
891 } \
892 } while (0)
893
894
895
896 static void
897 coding_set_source (coding)
898 struct coding_system *coding;
899 {
900 if (BUFFERP (coding->src_object))
901 {
902 struct buffer *buf = XBUFFER (coding->src_object);
903
904 if (coding->src_pos < 0)
905 coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
906 else
907 coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
908 }
909 else if (STRINGP (coding->src_object))
910 {
911 coding->source = (XSTRING (coding->src_object)->data
912 + coding->src_pos_byte);
913 }
914 else
915 /* Otherwise, the source is C string and is never relocated
916 automatically. Thus we don't have to update anything. */
917 ;
918 }
919
920 static void
921 coding_set_destination (coding)
922 struct coding_system *coding;
923 {
924 if (BUFFERP (coding->dst_object))
925 {
926 if (coding->src_pos < 0)
927 {
928 coding->destination = BEG_ADDR + coding->dst_pos_byte - 1;
929 coding->dst_bytes = (GAP_END_ADDR
930 - (coding->src_bytes - coding->consumed)
931 - coding->destination);
932 }
933 else
934 {
935 /* We are sure that coding->dst_pos_byte is before the gap
936 of the buffer. */
937 coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
938 + coding->dst_pos_byte - 1);
939 coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
940 - coding->destination);
941 }
942 }
943 else
944 /* Otherwise, the destination is C string and is never relocated
945 automatically. Thus we don't have to update anything. */
946 ;
947 }
948
949
950 static void
951 coding_alloc_by_realloc (coding, bytes)
952 struct coding_system *coding;
953 EMACS_INT bytes;
954 {
955 coding->destination = (unsigned char *) xrealloc (coding->destination,
956 coding->dst_bytes + bytes);
957 coding->dst_bytes += bytes;
958 }
959
960 static void
961 coding_alloc_by_making_gap (coding, bytes)
962 struct coding_system *coding;
963 EMACS_INT bytes;
964 {
965 if (BUFFERP (coding->dst_object)
966 && EQ (coding->src_object, coding->dst_object))
967 {
968 EMACS_INT add = coding->src_bytes - coding->consumed;
969
970 GAP_SIZE -= add; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
971 make_gap (bytes);
972 GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
973 }
974 else
975 {
976 Lisp_Object this_buffer;
977
978 this_buffer = Fcurrent_buffer ();
979 set_buffer_internal (XBUFFER (coding->dst_object));
980 make_gap (bytes);
981 set_buffer_internal (XBUFFER (this_buffer));
982 }
983 }
984
985
986 static unsigned char *
987 alloc_destination (coding, nbytes, dst)
988 struct coding_system *coding;
989 int nbytes;
990 unsigned char *dst;
991 {
992 EMACS_INT offset = dst - coding->destination;
993
994 if (BUFFERP (coding->dst_object))
995 coding_alloc_by_making_gap (coding, nbytes);
996 else
997 coding_alloc_by_realloc (coding, nbytes);
998 coding->result = CODING_RESULT_SUCCESS;
999 coding_set_destination (coding);
1000 dst = coding->destination + offset;
1001 return dst;
1002 }
1003
1004 /** Macros for annotations. */
1005
1006 /* Maximum length of annotation data (sum of annotations for
1007 composition and charset). */
1008 #define MAX_ANNOTATION_LENGTH (5 + (MAX_COMPOSITION_COMPONENTS * 2) - 1 + 5)
1009
1010 /* An annotation data is stored in the array coding->charbuf in this
1011 format:
1012 [ -LENGTH ANNOTATION_MASK FROM TO ... ]
1013 LENGTH is the number of elements in the annotation.
1014 ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
1015 FROM and TO specify the range of text annotated. They are relative
1016 to coding->src_pos (on encoding) or coding->dst_pos (on decoding).
1017
1018 The format of the following elements depend on ANNOTATION_MASK.
1019
1020 In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
1021 follows:
1022 ... METHOD [ COMPOSITION-COMPONENTS ... ]
1023 METHOD is one of enum composition_method.
1024 Optionnal COMPOSITION-COMPONENTS are characters and composition
1025 rules.
1026
1027 In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1028 follows. */
1029
1030 #define ADD_ANNOTATION_DATA(buf, len, mask, from, to) \
1031 do { \
1032 *(buf)++ = -(len); \
1033 *(buf)++ = (mask); \
1034 *(buf)++ = (from); \
1035 *(buf)++ = (to); \
1036 coding->annotated = 1; \
1037 } while (0);
1038
1039 #define ADD_COMPOSITION_DATA(buf, from, to, method) \
1040 do { \
1041 ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, from, to); \
1042 *buf++ = method; \
1043 } while (0)
1044
1045
1046 #define ADD_CHARSET_DATA(buf, from, to, id) \
1047 do { \
1048 ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_CHARSET_MASK, from, to); \
1049 *buf++ = id; \
1050 } while (0)
1051
1052 \f
1053 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1054
1055
1056
1057 \f
1058 /*** 3. UTF-8 ***/
1059
1060 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1061 Check if a text is encoded in UTF-8. If it is, return 1, else
1062 return 0. */
1063
1064 #define UTF_8_1_OCTET_P(c) ((c) < 0x80)
1065 #define UTF_8_EXTRA_OCTET_P(c) (((c) & 0xC0) == 0x80)
1066 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
1067 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
1068 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
1069 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
1070
1071 static int
1072 detect_coding_utf_8 (coding, detect_info)
1073 struct coding_system *coding;
1074 struct coding_detection_info *detect_info;
1075 {
1076 unsigned char *src = coding->source, *src_base = src;
1077 unsigned char *src_end = coding->source + coding->src_bytes;
1078 int multibytep = coding->src_multibyte;
1079 int consumed_chars = 0;
1080 int found = 0;
1081 int incomplete;
1082
1083 detect_info->checked |= CATEGORY_MASK_UTF_8;
1084 /* A coding system of this category is always ASCII compatible. */
1085 src += coding->head_ascii;
1086
1087 while (1)
1088 {
1089 int c, c1, c2, c3, c4;
1090
1091 incomplete = 0;
1092 ONE_MORE_BYTE (c);
1093 if (UTF_8_1_OCTET_P (c))
1094 continue;
1095 incomplete = 1;
1096 ONE_MORE_BYTE (c1);
1097 if (! UTF_8_EXTRA_OCTET_P (c1))
1098 break;
1099 if (UTF_8_2_OCTET_LEADING_P (c))
1100 {
1101 found = CATEGORY_MASK_UTF_8;
1102 continue;
1103 }
1104 ONE_MORE_BYTE (c2);
1105 if (! UTF_8_EXTRA_OCTET_P (c2))
1106 break;
1107 if (UTF_8_3_OCTET_LEADING_P (c))
1108 {
1109 found = CATEGORY_MASK_UTF_8;
1110 continue;
1111 }
1112 ONE_MORE_BYTE (c3);
1113 if (! UTF_8_EXTRA_OCTET_P (c3))
1114 break;
1115 if (UTF_8_4_OCTET_LEADING_P (c))
1116 {
1117 found = CATEGORY_MASK_UTF_8;
1118 continue;
1119 }
1120 ONE_MORE_BYTE (c4);
1121 if (! UTF_8_EXTRA_OCTET_P (c4))
1122 break;
1123 if (UTF_8_5_OCTET_LEADING_P (c))
1124 {
1125 found = CATEGORY_MASK_UTF_8;
1126 continue;
1127 }
1128 break;
1129 }
1130 detect_info->rejected |= CATEGORY_MASK_UTF_8;
1131 return 0;
1132
1133 no_more_source:
1134 if (incomplete && coding->mode & CODING_MODE_LAST_BLOCK)
1135 {
1136 detect_info->rejected |= CATEGORY_MASK_UTF_8;
1137 return 0;
1138 }
1139 detect_info->found |= found;
1140 return 1;
1141 }
1142
1143
1144 static void
1145 decode_coding_utf_8 (coding)
1146 struct coding_system *coding;
1147 {
1148 unsigned char *src = coding->source + coding->consumed;
1149 unsigned char *src_end = coding->source + coding->src_bytes;
1150 unsigned char *src_base;
1151 int *charbuf = coding->charbuf;
1152 int *charbuf_end = charbuf + coding->charbuf_size;
1153 int consumed_chars = 0, consumed_chars_base;
1154 int multibytep = coding->src_multibyte;
1155 Lisp_Object attr, eol_type, charset_list;
1156
1157 CODING_GET_INFO (coding, attr, eol_type, charset_list);
1158
1159 while (1)
1160 {
1161 int c, c1, c2, c3, c4, c5;
1162
1163 src_base = src;
1164 consumed_chars_base = consumed_chars;
1165
1166 if (charbuf >= charbuf_end)
1167 break;
1168
1169 ONE_MORE_BYTE (c1);
1170 if (UTF_8_1_OCTET_P(c1))
1171 {
1172 c = c1;
1173 if (c == '\r')
1174 {
1175 if (EQ (eol_type, Qdos))
1176 {
1177 if (src == src_end)
1178 {
1179 coding->result = CODING_RESULT_INSUFFICIENT_SRC;
1180 goto no_more_source;
1181 }
1182 if (*src == '\n')
1183 ONE_MORE_BYTE (c);
1184 }
1185 else if (EQ (eol_type, Qmac))
1186 c = '\n';
1187 }
1188 }
1189 else
1190 {
1191 ONE_MORE_BYTE (c2);
1192 if (! UTF_8_EXTRA_OCTET_P (c2))
1193 goto invalid_code;
1194 if (UTF_8_2_OCTET_LEADING_P (c1))
1195 {
1196 c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1197 /* Reject overlong sequences here and below. Encoders
1198 producing them are incorrect, they can be misleading,
1199 and they mess up read/write invariance. */
1200 if (c < 128)
1201 goto invalid_code;
1202 }
1203 else
1204 {
1205 ONE_MORE_BYTE (c3);
1206 if (! UTF_8_EXTRA_OCTET_P (c3))
1207 goto invalid_code;
1208 if (UTF_8_3_OCTET_LEADING_P (c1))
1209 {
1210 c = (((c1 & 0xF) << 12)
1211 | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
1212 if (c < 0x800
1213 || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
1214 goto invalid_code;
1215 }
1216 else
1217 {
1218 ONE_MORE_BYTE (c4);
1219 if (! UTF_8_EXTRA_OCTET_P (c4))
1220 goto invalid_code;
1221 if (UTF_8_4_OCTET_LEADING_P (c1))
1222 {
1223 c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1224 | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
1225 if (c < 0x10000)
1226 goto invalid_code;
1227 }
1228 else
1229 {
1230 ONE_MORE_BYTE (c5);
1231 if (! UTF_8_EXTRA_OCTET_P (c5))
1232 goto invalid_code;
1233 if (UTF_8_5_OCTET_LEADING_P (c1))
1234 {
1235 c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1236 | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1237 | (c5 & 0x3F));
1238 if ((c > MAX_CHAR) || (c < 0x200000))
1239 goto invalid_code;
1240 }
1241 else
1242 goto invalid_code;
1243 }
1244 }
1245 }
1246 }
1247
1248 *charbuf++ = c;
1249 continue;
1250
1251 invalid_code:
1252 src = src_base;
1253 consumed_chars = consumed_chars_base;
1254 ONE_MORE_BYTE (c);
1255 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1256 coding->errors++;
1257 }
1258
1259 no_more_source:
1260 coding->consumed_char += consumed_chars_base;
1261 coding->consumed = src_base - coding->source;
1262 coding->charbuf_used = charbuf - coding->charbuf;
1263 }
1264
1265
1266 static int
1267 encode_coding_utf_8 (coding)
1268 struct coding_system *coding;
1269 {
1270 int multibytep = coding->dst_multibyte;
1271 int *charbuf = coding->charbuf;
1272 int *charbuf_end = charbuf + coding->charbuf_used;
1273 unsigned char *dst = coding->destination + coding->produced;
1274 unsigned char *dst_end = coding->destination + coding->dst_bytes;
1275 int produced_chars = 0;
1276 int c;
1277
1278 if (multibytep)
1279 {
1280 int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1281
1282 while (charbuf < charbuf_end)
1283 {
1284 unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1285
1286 ASSURE_DESTINATION (safe_room);
1287 c = *charbuf++;
1288 if (CHAR_BYTE8_P (c))
1289 {
1290 c = CHAR_TO_BYTE8 (c);
1291 EMIT_ONE_BYTE (c);
1292 }
1293 else
1294 {
1295 CHAR_STRING_ADVANCE (c, pend);
1296 for (p = str; p < pend; p++)
1297 EMIT_ONE_BYTE (*p);
1298 }
1299 }
1300 }
1301 else
1302 {
1303 int safe_room = MAX_MULTIBYTE_LENGTH;
1304
1305 while (charbuf < charbuf_end)
1306 {
1307 ASSURE_DESTINATION (safe_room);
1308 c = *charbuf++;
1309 dst += CHAR_STRING (c, dst);
1310 produced_chars++;
1311 }
1312 }
1313 coding->result = CODING_RESULT_SUCCESS;
1314 coding->produced_char += produced_chars;
1315 coding->produced = dst - coding->destination;
1316 return 0;
1317 }
1318
1319
1320 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1321 Check if a text is encoded in one of UTF-16 based coding systems.
1322 If it is, return 1, else return 0. */
1323
1324 #define UTF_16_HIGH_SURROGATE_P(val) \
1325 (((val) & 0xFC00) == 0xD800)
1326
1327 #define UTF_16_LOW_SURROGATE_P(val) \
1328 (((val) & 0xFC00) == 0xDC00)
1329
1330 #define UTF_16_INVALID_P(val) \
1331 (((val) == 0xFFFE) \
1332 || ((val) == 0xFFFF) \
1333 || UTF_16_LOW_SURROGATE_P (val))
1334
1335
1336 static int
1337 detect_coding_utf_16 (coding, detect_info)
1338 struct coding_system *coding;
1339 struct coding_detection_info *detect_info;
1340 {
1341 unsigned char *src = coding->source, *src_base = src;
1342 unsigned char *src_end = coding->source + coding->src_bytes;
1343 int multibytep = coding->src_multibyte;
1344 int consumed_chars = 0;
1345 int c1, c2;
1346
1347 detect_info->checked |= CATEGORY_MASK_UTF_16;
1348
1349 if (coding->mode & CODING_MODE_LAST_BLOCK
1350 && (coding->src_bytes & 1))
1351 {
1352 detect_info->rejected |= CATEGORY_MASK_UTF_16;
1353 return 0;
1354 }
1355 ONE_MORE_BYTE (c1);
1356 ONE_MORE_BYTE (c2);
1357
1358 if ((c1 == 0xFF) && (c2 == 0xFE))
1359 {
1360 detect_info->found |= CATEGORY_MASK_UTF_16_LE;
1361 detect_info->rejected |= CATEGORY_MASK_UTF_16_BE;
1362 }
1363 else if ((c1 == 0xFE) && (c2 == 0xFF))
1364 {
1365 detect_info->found |= CATEGORY_MASK_UTF_16_BE;
1366 detect_info->rejected |= CATEGORY_MASK_UTF_16_LE;
1367 }
1368 no_more_source:
1369 return 1;
1370 }
1371
1372 static void
1373 decode_coding_utf_16 (coding)
1374 struct coding_system *coding;
1375 {
1376 unsigned char *src = coding->source + coding->consumed;
1377 unsigned char *src_end = coding->source + coding->src_bytes;
1378 unsigned char *src_base;
1379 int *charbuf = coding->charbuf;
1380 int *charbuf_end = charbuf + coding->charbuf_size;
1381 int consumed_chars = 0, consumed_chars_base;
1382 int multibytep = coding->src_multibyte;
1383 enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding);
1384 enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1385 int surrogate = CODING_UTF_16_SURROGATE (coding);
1386 Lisp_Object attr, eol_type, charset_list;
1387
1388 CODING_GET_INFO (coding, attr, eol_type, charset_list);
1389
1390 if (bom != utf_16_without_bom)
1391 {
1392 int c, c1, c2;
1393
1394 src_base = src;
1395 ONE_MORE_BYTE (c1);
1396 ONE_MORE_BYTE (c2);
1397 c = (c1 << 8) | c2;
1398 if (bom == utf_16_with_bom)
1399 {
1400 if (endian == utf_16_big_endian
1401 ? c != 0xFEFF : c != 0xFFFE)
1402 {
1403 /* We are sure that there's enouph room at CHARBUF. */
1404 *charbuf++ = c1;
1405 *charbuf++ = c2;
1406 coding->errors++;
1407 }
1408 }
1409 else
1410 {
1411 if (c == 0xFEFF)
1412 CODING_UTF_16_ENDIAN (coding)
1413 = endian = utf_16_big_endian;
1414 else if (c == 0xFFFE)
1415 CODING_UTF_16_ENDIAN (coding)
1416 = endian = utf_16_little_endian;
1417 else
1418 {
1419 CODING_UTF_16_ENDIAN (coding)
1420 = endian = utf_16_big_endian;
1421 src = src_base;
1422 }
1423 }
1424 CODING_UTF_16_BOM (coding) = utf_16_with_bom;
1425 }
1426
1427 while (1)
1428 {
1429 int c, c1, c2;
1430
1431 src_base = src;
1432 consumed_chars_base = consumed_chars;
1433
1434 if (charbuf + 2 >= charbuf_end)
1435 break;
1436
1437 ONE_MORE_BYTE (c1);
1438 ONE_MORE_BYTE (c2);
1439 c = (endian == utf_16_big_endian
1440 ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1441 if (surrogate)
1442 {
1443 if (! UTF_16_LOW_SURROGATE_P (c))
1444 {
1445 if (endian == utf_16_big_endian)
1446 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1447 else
1448 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1449 *charbuf++ = c1;
1450 *charbuf++ = c2;
1451 coding->errors++;
1452 if (UTF_16_HIGH_SURROGATE_P (c))
1453 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1454 else
1455 *charbuf++ = c;
1456 }
1457 else
1458 {
1459 c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1460 CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1461 *charbuf++ = c;
1462 }
1463 }
1464 else
1465 {
1466 if (UTF_16_HIGH_SURROGATE_P (c))
1467 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1468 else
1469 *charbuf++ = c;
1470 }
1471 }
1472
1473 no_more_source:
1474 coding->consumed_char += consumed_chars_base;
1475 coding->consumed = src_base - coding->source;
1476 coding->charbuf_used = charbuf - coding->charbuf;
1477 }
1478
1479 static int
1480 encode_coding_utf_16 (coding)
1481 struct coding_system *coding;
1482 {
1483 int multibytep = coding->dst_multibyte;
1484 int *charbuf = coding->charbuf;
1485 int *charbuf_end = charbuf + coding->charbuf_used;
1486 unsigned char *dst = coding->destination + coding->produced;
1487 unsigned char *dst_end = coding->destination + coding->dst_bytes;
1488 int safe_room = 8;
1489 enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding);
1490 int big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1491 int produced_chars = 0;
1492 Lisp_Object attrs, eol_type, charset_list;
1493 int c;
1494
1495 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
1496
1497 if (bom == utf_16_with_bom)
1498 {
1499 ASSURE_DESTINATION (safe_room);
1500 if (big_endian)
1501 EMIT_TWO_BYTES (0xFE, 0xFF);
1502 else
1503 EMIT_TWO_BYTES (0xFF, 0xFE);
1504 CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1505 }
1506
1507 while (charbuf < charbuf_end)
1508 {
1509 ASSURE_DESTINATION (safe_room);
1510 c = *charbuf++;
1511 if (c >= MAX_UNICODE_CHAR)
1512 c = coding->default_char;
1513
1514 if (c < 0x10000)
1515 {
1516 if (big_endian)
1517 EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1518 else
1519 EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1520 }
1521 else
1522 {
1523 int c1, c2;
1524
1525 c -= 0x10000;
1526 c1 = (c >> 10) + 0xD800;
1527 c2 = (c & 0x3FF) + 0xDC00;
1528 if (big_endian)
1529 EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1530 else
1531 EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1532 }
1533 }
1534 coding->result = CODING_RESULT_SUCCESS;
1535 coding->produced = dst - coding->destination;
1536 coding->produced_char += produced_chars;
1537 return 0;
1538 }
1539
1540 \f
1541 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1542
1543 /* Emacs' internal format for representation of multiple character
1544 sets is a kind of multi-byte encoding, i.e. characters are
1545 represented by variable-length sequences of one-byte codes.
1546
1547 ASCII characters and control characters (e.g. `tab', `newline') are
1548 represented by one-byte sequences which are their ASCII codes, in
1549 the range 0x00 through 0x7F.
1550
1551 8-bit characters of the range 0x80..0x9F are represented by
1552 two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1553 code + 0x20).
1554
1555 8-bit characters of the range 0xA0..0xFF are represented by
1556 one-byte sequences which are their 8-bit code.
1557
1558 The other characters are represented by a sequence of `base
1559 leading-code', optional `extended leading-code', and one or two
1560 `position-code's. The length of the sequence is determined by the
1561 base leading-code. Leading-code takes the range 0x81 through 0x9D,
1562 whereas extended leading-code and position-code take the range 0xA0
1563 through 0xFF. See `charset.h' for more details about leading-code
1564 and position-code.
1565
1566 --- CODE RANGE of Emacs' internal format ---
1567 character set range
1568 ------------- -----
1569 ascii 0x00..0x7F
1570 eight-bit-control LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1571 eight-bit-graphic 0xA0..0xBF
1572 ELSE 0x81..0x9D + [0xA0..0xFF]+
1573 ---------------------------------------------
1574
1575 As this is the internal character representation, the format is
1576 usually not used externally (i.e. in a file or in a data sent to a
1577 process). But, it is possible to have a text externally in this
1578 format (i.e. by encoding by the coding system `emacs-mule').
1579
1580 In that case, a sequence of one-byte codes has a slightly different
1581 form.
1582
1583 At first, all characters in eight-bit-control are represented by
1584 one-byte sequences which are their 8-bit code.
1585
1586 Next, character composition data are represented by the byte
1587 sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1588 where,
1589 METHOD is 0xF0 plus one of composition method (enum
1590 composition_method),
1591
1592 BYTES is 0xA0 plus a byte length of this composition data,
1593
1594 CHARS is 0x20 plus a number of characters composed by this
1595 data,
1596
1597 COMPONENTs are characters of multibye form or composition
1598 rules encoded by two-byte of ASCII codes.
1599
1600 In addition, for backward compatibility, the following formats are
1601 also recognized as composition data on decoding.
1602
1603 0x80 MSEQ ...
1604 0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1605
1606 Here,
1607 MSEQ is a multibyte form but in these special format:
1608 ASCII: 0xA0 ASCII_CODE+0x80,
1609 other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1610 RULE is a one byte code of the range 0xA0..0xF0 that
1611 represents a composition rule.
1612 */
1613
1614 char emacs_mule_bytes[256];
1615
1616 int
1617 emacs_mule_char (coding, src, nbytes, nchars, id)
1618 struct coding_system *coding;
1619 unsigned char *src;
1620 int *nbytes, *nchars, *id;
1621 {
1622 unsigned char *src_end = coding->source + coding->src_bytes;
1623 int multibytep = coding->src_multibyte;
1624 unsigned char *src_base = src;
1625 struct charset *charset;
1626 unsigned code;
1627 int c;
1628 int consumed_chars = 0;
1629
1630 ONE_MORE_BYTE (c);
1631 switch (emacs_mule_bytes[c])
1632 {
1633 case 2:
1634 if (! (charset = emacs_mule_charset[c]))
1635 goto invalid_code;
1636 ONE_MORE_BYTE (c);
1637 code = c & 0x7F;
1638 break;
1639
1640 case 3:
1641 if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
1642 || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
1643 {
1644 ONE_MORE_BYTE (c);
1645 if (! (charset = emacs_mule_charset[c]))
1646 goto invalid_code;
1647 ONE_MORE_BYTE (c);
1648 code = c & 0x7F;
1649 }
1650 else
1651 {
1652 if (! (charset = emacs_mule_charset[c]))
1653 goto invalid_code;
1654 ONE_MORE_BYTE (c);
1655 code = (c & 0x7F) << 8;
1656 ONE_MORE_BYTE (c);
1657 code |= c & 0x7F;
1658 }
1659 break;
1660
1661 case 4:
1662 ONE_MORE_BYTE (c);
1663 if (! (charset = emacs_mule_charset[c]))
1664 goto invalid_code;
1665 ONE_MORE_BYTE (c);
1666 code = (c & 0x7F) << 8;
1667 ONE_MORE_BYTE (c);
1668 code |= c & 0x7F;
1669 break;
1670
1671 case 1:
1672 code = c;
1673 charset = CHARSET_FROM_ID (ASCII_BYTE_P (code)
1674 ? charset_ascii : charset_eight_bit);
1675 break;
1676
1677 default:
1678 abort ();
1679 }
1680 c = DECODE_CHAR (charset, code);
1681 if (c < 0)
1682 goto invalid_code;
1683 *nbytes = src - src_base;
1684 *nchars = consumed_chars;
1685 if (id)
1686 *id = charset->id;
1687 return c;
1688
1689 no_more_source:
1690 return -2;
1691
1692 invalid_code:
1693 return -1;
1694 }
1695
1696
1697 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1698 Check if a text is encoded in `emacs-mule'. If it is, return 1,
1699 else return 0. */
1700
1701 static int
1702 detect_coding_emacs_mule (coding, detect_info)
1703 struct coding_system *coding;
1704 struct coding_detection_info *detect_info;
1705 {
1706 unsigned char *src = coding->source, *src_base = src;
1707 unsigned char *src_end = coding->source + coding->src_bytes;
1708 int multibytep = coding->src_multibyte;
1709 int consumed_chars = 0;
1710 int c;
1711 int found = 0;
1712 int incomplete;
1713
1714 detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1715 /* A coding system of this category is always ASCII compatible. */
1716 src += coding->head_ascii;
1717
1718 while (1)
1719 {
1720 incomplete = 0;
1721 ONE_MORE_BYTE (c);
1722 incomplete = 1;
1723
1724 if (c == 0x80)
1725 {
1726 /* Perhaps the start of composite character. We simple skip
1727 it because analyzing it is too heavy for detecting. But,
1728 at least, we check that the composite character
1729 constitues of more than 4 bytes. */
1730 unsigned char *src_base;
1731
1732 repeat:
1733 src_base = src;
1734 do
1735 {
1736 ONE_MORE_BYTE (c);
1737 }
1738 while (c >= 0xA0);
1739
1740 if (src - src_base <= 4)
1741 break;
1742 found = CATEGORY_MASK_EMACS_MULE;
1743 if (c == 0x80)
1744 goto repeat;
1745 }
1746
1747 if (c < 0x80)
1748 {
1749 if (c < 0x20
1750 && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
1751 break;
1752 }
1753 else
1754 {
1755 unsigned char *src_base = src - 1;
1756
1757 do
1758 {
1759 ONE_MORE_BYTE (c);
1760 }
1761 while (c >= 0xA0);
1762 if (src - src_base != emacs_mule_bytes[*src_base])
1763 break;
1764 found = CATEGORY_MASK_EMACS_MULE;
1765 }
1766 }
1767 detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1768 return 0;
1769
1770 no_more_source:
1771 if (incomplete && coding->mode & CODING_MODE_LAST_BLOCK)
1772 {
1773 detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1774 return 0;
1775 }
1776 detect_info->found |= found;
1777 return 1;
1778 }
1779
1780
1781 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
1782
1783 /* Decode a character represented as a component of composition
1784 sequence of Emacs 20/21 style at SRC. Set C to that character and
1785 update SRC to the head of next character (or an encoded composition
1786 rule). If SRC doesn't points a composition component, set C to -1.
1787 If SRC points an invalid byte sequence, global exit by a return
1788 value 0. */
1789
1790 #define DECODE_EMACS_MULE_COMPOSITION_CHAR(buf) \
1791 if (1) \
1792 { \
1793 int c; \
1794 int nbytes, nchars; \
1795 \
1796 if (src == src_end) \
1797 break; \
1798 c = emacs_mule_char (coding, src, &nbytes, &nchars, NULL);\
1799 if (c < 0) \
1800 { \
1801 if (c == -2) \
1802 break; \
1803 goto invalid_code; \
1804 } \
1805 *buf++ = c; \
1806 src += nbytes; \
1807 consumed_chars += nchars; \
1808 } \
1809 else
1810
1811
1812 /* Decode a composition rule represented as a component of composition
1813 sequence of Emacs 20 style at SRC. Store the decoded rule in *BUF,
1814 and increment BUF. If SRC points an invalid byte sequence, set C
1815 to -1. */
1816
1817 #define DECODE_EMACS_MULE_COMPOSITION_RULE_20(buf) \
1818 do { \
1819 int c, gref, nref; \
1820 \
1821 if (src >= src_end) \
1822 goto invalid_code; \
1823 ONE_MORE_BYTE_NO_CHECK (c); \
1824 c -= 0x20; \
1825 if (c < 0 || c >= 81) \
1826 goto invalid_code; \
1827 \
1828 gref = c / 9, nref = c % 9; \
1829 *buf++ = COMPOSITION_ENCODE_RULE (gref, nref); \
1830 } while (0)
1831
1832
1833 /* Decode a composition rule represented as a component of composition
1834 sequence of Emacs 21 style at SRC. Store the decoded rule in *BUF,
1835 and increment BUF. If SRC points an invalid byte sequence, set C
1836 to -1. */
1837
1838 #define DECODE_EMACS_MULE_COMPOSITION_RULE_21(buf) \
1839 do { \
1840 int gref, nref; \
1841 \
1842 if (src + 1>= src_end) \
1843 goto invalid_code; \
1844 ONE_MORE_BYTE_NO_CHECK (gref); \
1845 gref -= 0x20; \
1846 ONE_MORE_BYTE_NO_CHECK (nref); \
1847 nref -= 0x20; \
1848 if (gref < 0 || gref >= 81 \
1849 || nref < 0 || nref >= 81) \
1850 goto invalid_code; \
1851 *buf++ = COMPOSITION_ENCODE_RULE (gref, nref); \
1852 } while (0)
1853
1854
1855 #define DECODE_EMACS_MULE_21_COMPOSITION(c) \
1856 do { \
1857 /* Emacs 21 style format. The first three bytes at SRC are \
1858 (METHOD - 0xF2), (BYTES - 0xA0), (CHARS - 0xA0), where BYTES is \
1859 the byte length of this composition information, CHARS is the \
1860 number of characters composed by this composition. */ \
1861 enum composition_method method = c - 0xF2; \
1862 int *charbuf_base = charbuf; \
1863 int from, to; \
1864 int consumed_chars_limit; \
1865 int nbytes, nchars; \
1866 \
1867 ONE_MORE_BYTE (c); \
1868 nbytes = c - 0xA0; \
1869 if (nbytes < 3) \
1870 goto invalid_code; \
1871 ONE_MORE_BYTE (c); \
1872 nchars = c - 0xA0; \
1873 from = coding->produced + char_offset; \
1874 to = from + nchars; \
1875 ADD_COMPOSITION_DATA (charbuf, from, to, method); \
1876 consumed_chars_limit = consumed_chars_base + nbytes; \
1877 if (method != COMPOSITION_RELATIVE) \
1878 { \
1879 int i = 0; \
1880 while (consumed_chars < consumed_chars_limit) \
1881 { \
1882 if (i % 2 && method != COMPOSITION_WITH_ALTCHARS) \
1883 DECODE_EMACS_MULE_COMPOSITION_RULE_21 (charbuf); \
1884 else \
1885 DECODE_EMACS_MULE_COMPOSITION_CHAR (charbuf); \
1886 i++; \
1887 } \
1888 if (consumed_chars < consumed_chars_limit) \
1889 goto invalid_code; \
1890 charbuf_base[0] -= i; \
1891 } \
1892 } while (0)
1893
1894
1895 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION(c) \
1896 do { \
1897 /* Emacs 20 style format for relative composition. */ \
1898 /* Store multibyte form of characters to be composed. */ \
1899 enum composition_method method = COMPOSITION_RELATIVE; \
1900 int components[MAX_COMPOSITION_COMPONENTS * 2 - 1]; \
1901 int *buf = components; \
1902 int i, j; \
1903 int from, to; \
1904 \
1905 src = src_base; \
1906 ONE_MORE_BYTE (c); /* skip 0x80 */ \
1907 for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++) \
1908 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
1909 if (i < 2) \
1910 goto invalid_code; \
1911 from = coding->produced_char + char_offset; \
1912 to = from + i; \
1913 ADD_COMPOSITION_DATA (charbuf, from, to, method); \
1914 for (j = 0; j < i; j++) \
1915 *charbuf++ = components[j]; \
1916 } while (0)
1917
1918
1919 #define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION(c) \
1920 do { \
1921 /* Emacs 20 style format for rule-base composition. */ \
1922 /* Store multibyte form of characters to be composed. */ \
1923 enum composition_method method = COMPOSITION_WITH_RULE; \
1924 int components[MAX_COMPOSITION_COMPONENTS * 2 - 1]; \
1925 int *buf = components; \
1926 int i, j; \
1927 int from, to; \
1928 \
1929 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
1930 for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++) \
1931 { \
1932 DECODE_EMACS_MULE_COMPOSITION_RULE_20 (buf); \
1933 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
1934 } \
1935 if (i < 1 || (buf - components) % 2 == 0) \
1936 goto invalid_code; \
1937 if (charbuf + i + (i / 2) + 1 < charbuf_end) \
1938 goto no_more_source; \
1939 from = coding->produced_char + char_offset; \
1940 to = from + i; \
1941 ADD_COMPOSITION_DATA (buf, from, to, method); \
1942 for (j = 0; j < i; j++) \
1943 *charbuf++ = components[j]; \
1944 for (j = 0; j < i; j += 2) \
1945 *charbuf++ = components[j]; \
1946 } while (0)
1947
1948
1949 static void
1950 decode_coding_emacs_mule (coding)
1951 struct coding_system *coding;
1952 {
1953 unsigned char *src = coding->source + coding->consumed;
1954 unsigned char *src_end = coding->source + coding->src_bytes;
1955 unsigned char *src_base;
1956 int *charbuf = coding->charbuf;
1957 int *charbuf_end = charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
1958 int consumed_chars = 0, consumed_chars_base;
1959 int multibytep = coding->src_multibyte;
1960 Lisp_Object attrs, eol_type, charset_list;
1961 int char_offset = coding->produced_char;
1962 int last_offset = char_offset;
1963 int last_id = charset_ascii;
1964
1965 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
1966
1967 while (1)
1968 {
1969 int c;
1970
1971 src_base = src;
1972 consumed_chars_base = consumed_chars;
1973
1974 if (charbuf >= charbuf_end)
1975 break;
1976
1977 ONE_MORE_BYTE (c);
1978
1979 if (c < 0x80)
1980 {
1981 if (c == '\r')
1982 {
1983 if (EQ (eol_type, Qdos))
1984 {
1985 if (src == src_end)
1986 {
1987 coding->result = CODING_RESULT_INSUFFICIENT_SRC;
1988 goto no_more_source;
1989 }
1990 if (*src == '\n')
1991 ONE_MORE_BYTE (c);
1992 }
1993 else if (EQ (eol_type, Qmac))
1994 c = '\n';
1995 }
1996 *charbuf++ = c;
1997 char_offset++;
1998 }
1999 else if (c == 0x80)
2000 {
2001 ONE_MORE_BYTE (c);
2002 if (c - 0xF2 >= COMPOSITION_RELATIVE
2003 && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)
2004 DECODE_EMACS_MULE_21_COMPOSITION (c);
2005 else if (c < 0xC0)
2006 DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION (c);
2007 else if (c == 0xFF)
2008 DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION (c);
2009 else
2010 goto invalid_code;
2011 }
2012 else if (c < 0xA0 && emacs_mule_bytes[c] > 1)
2013 {
2014 int nbytes, nchars;
2015 int id;
2016
2017 src = src_base;
2018 consumed_chars = consumed_chars_base;
2019 c = emacs_mule_char (coding, src, &nbytes, &nchars, &id);
2020 if (c < 0)
2021 {
2022 if (c == -2)
2023 break;
2024 goto invalid_code;
2025 }
2026 if (last_id != id)
2027 {
2028 if (last_id != charset_ascii)
2029 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
2030 last_id = id;
2031 last_offset = char_offset;
2032 }
2033 *charbuf++ = c;
2034 src += nbytes;
2035 consumed_chars += nchars;
2036 char_offset++;
2037 }
2038 continue;
2039
2040 invalid_code:
2041 src = src_base;
2042 consumed_chars = consumed_chars_base;
2043 ONE_MORE_BYTE (c);
2044 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
2045 char_offset++;
2046 coding->errors++;
2047 }
2048
2049 no_more_source:
2050 if (last_id != charset_ascii)
2051 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
2052 coding->consumed_char += consumed_chars_base;
2053 coding->consumed = src_base - coding->source;
2054 coding->charbuf_used = charbuf - coding->charbuf;
2055 }
2056
2057
2058 #define EMACS_MULE_LEADING_CODES(id, codes) \
2059 do { \
2060 if (id < 0xA0) \
2061 codes[0] = id, codes[1] = 0; \
2062 else if (id < 0xE0) \
2063 codes[0] = 0x9A, codes[1] = id; \
2064 else if (id < 0xF0) \
2065 codes[0] = 0x9B, codes[1] = id; \
2066 else if (id < 0xF5) \
2067 codes[0] = 0x9C, codes[1] = id; \
2068 else \
2069 codes[0] = 0x9D, codes[1] = id; \
2070 } while (0);
2071
2072
2073 static int
2074 encode_coding_emacs_mule (coding)
2075 struct coding_system *coding;
2076 {
2077 int multibytep = coding->dst_multibyte;
2078 int *charbuf = coding->charbuf;
2079 int *charbuf_end = charbuf + coding->charbuf_used;
2080 unsigned char *dst = coding->destination + coding->produced;
2081 unsigned char *dst_end = coding->destination + coding->dst_bytes;
2082 int safe_room = 8;
2083 int produced_chars = 0;
2084 Lisp_Object attrs, eol_type, charset_list;
2085 int c;
2086 int preferred_charset_id = -1;
2087
2088 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
2089
2090 while (charbuf < charbuf_end)
2091 {
2092 ASSURE_DESTINATION (safe_room);
2093 c = *charbuf++;
2094
2095 if (c < 0)
2096 {
2097 /* Handle an annotation. */
2098 switch (*charbuf)
2099 {
2100 case CODING_ANNOTATE_COMPOSITION_MASK:
2101 /* Not yet implemented. */
2102 break;
2103 case CODING_ANNOTATE_CHARSET_MASK:
2104 preferred_charset_id = charbuf[3];
2105 if (preferred_charset_id >= 0
2106 && NILP (Fmemq (make_number (preferred_charset_id),
2107 charset_list)))
2108 preferred_charset_id = -1;
2109 break;
2110 default:
2111 abort ();
2112 }
2113 charbuf += -c - 1;
2114 continue;
2115 }
2116
2117 if (ASCII_CHAR_P (c))
2118 EMIT_ONE_ASCII_BYTE (c);
2119 else if (CHAR_BYTE8_P (c))
2120 {
2121 c = CHAR_TO_BYTE8 (c);
2122 EMIT_ONE_BYTE (c);
2123 }
2124 else
2125 {
2126 struct charset *charset;
2127 unsigned code;
2128 int dimension;
2129 int emacs_mule_id;
2130 unsigned char leading_codes[2];
2131
2132 if (preferred_charset_id >= 0)
2133 {
2134 charset = CHARSET_FROM_ID (preferred_charset_id);
2135 if (! CHAR_CHARSET_P (c, charset))
2136 charset = char_charset (c, charset_list, NULL);
2137 }
2138 else
2139 charset = char_charset (c, charset_list, &code);
2140 if (! charset)
2141 {
2142 c = coding->default_char;
2143 if (ASCII_CHAR_P (c))
2144 {
2145 EMIT_ONE_ASCII_BYTE (c);
2146 continue;
2147 }
2148 charset = char_charset (c, charset_list, &code);
2149 }
2150 dimension = CHARSET_DIMENSION (charset);
2151 emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2152 EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2153 EMIT_ONE_BYTE (leading_codes[0]);
2154 if (leading_codes[1])
2155 EMIT_ONE_BYTE (leading_codes[1]);
2156 if (dimension == 1)
2157 EMIT_ONE_BYTE (code);
2158 else
2159 {
2160 EMIT_ONE_BYTE (code >> 8);
2161 EMIT_ONE_BYTE (code & 0xFF);
2162 }
2163 }
2164 }
2165 coding->result = CODING_RESULT_SUCCESS;
2166 coding->produced_char += produced_chars;
2167 coding->produced = dst - coding->destination;
2168 return 0;
2169 }
2170
2171 \f
2172 /*** 7. ISO2022 handlers ***/
2173
2174 /* The following note describes the coding system ISO2022 briefly.
2175 Since the intention of this note is to help understand the
2176 functions in this file, some parts are NOT ACCURATE or are OVERLY
2177 SIMPLIFIED. For thorough understanding, please refer to the
2178 original document of ISO2022. This is equivalent to the standard
2179 ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2180
2181 ISO2022 provides many mechanisms to encode several character sets
2182 in 7-bit and 8-bit environments. For 7-bit environments, all text
2183 is encoded using bytes less than 128. This may make the encoded
2184 text a little bit longer, but the text passes more easily through
2185 several types of gateway, some of which strip off the MSB (Most
2186 Significant Bit).
2187
2188 There are two kinds of character sets: control character sets and
2189 graphic character sets. The former contain control characters such
2190 as `newline' and `escape' to provide control functions (control
2191 functions are also provided by escape sequences). The latter
2192 contain graphic characters such as 'A' and '-'. Emacs recognizes
2193 two control character sets and many graphic character sets.
2194
2195 Graphic character sets are classified into one of the following
2196 four classes, according to the number of bytes (DIMENSION) and
2197 number of characters in one dimension (CHARS) of the set:
2198 - DIMENSION1_CHARS94
2199 - DIMENSION1_CHARS96
2200 - DIMENSION2_CHARS94
2201 - DIMENSION2_CHARS96
2202
2203 In addition, each character set is assigned an identification tag,
2204 unique for each set, called the "final character" (denoted as <F>
2205 hereafter). The <F> of each character set is decided by ECMA(*)
2206 when it is registered in ISO. The code range of <F> is 0x30..0x7F
2207 (0x30..0x3F are for private use only).
2208
2209 Note (*): ECMA = European Computer Manufacturers Association
2210
2211 Here are examples of graphic character sets [NAME(<F>)]:
2212 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2213 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2214 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2215 o DIMENSION2_CHARS96 -- none for the moment
2216
2217 A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2218 C0 [0x00..0x1F] -- control character plane 0
2219 GL [0x20..0x7F] -- graphic character plane 0
2220 C1 [0x80..0x9F] -- control character plane 1
2221 GR [0xA0..0xFF] -- graphic character plane 1
2222
2223 A control character set is directly designated and invoked to C0 or
2224 C1 by an escape sequence. The most common case is that:
2225 - ISO646's control character set is designated/invoked to C0, and
2226 - ISO6429's control character set is designated/invoked to C1,
2227 and usually these designations/invocations are omitted in encoded
2228 text. In a 7-bit environment, only C0 can be used, and a control
2229 character for C1 is encoded by an appropriate escape sequence to
2230 fit into the environment. All control characters for C1 are
2231 defined to have corresponding escape sequences.
2232
2233 A graphic character set is at first designated to one of four
2234 graphic registers (G0 through G3), then these graphic registers are
2235 invoked to GL or GR. These designations and invocations can be
2236 done independently. The most common case is that G0 is invoked to
2237 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
2238 these invocations and designations are omitted in encoded text.
2239 In a 7-bit environment, only GL can be used.
2240
2241 When a graphic character set of CHARS94 is invoked to GL, codes
2242 0x20 and 0x7F of the GL area work as control characters SPACE and
2243 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2244 be used.
2245
2246 There are two ways of invocation: locking-shift and single-shift.
2247 With locking-shift, the invocation lasts until the next different
2248 invocation, whereas with single-shift, the invocation affects the
2249 following character only and doesn't affect the locking-shift
2250 state. Invocations are done by the following control characters or
2251 escape sequences:
2252
2253 ----------------------------------------------------------------------
2254 abbrev function cntrl escape seq description
2255 ----------------------------------------------------------------------
2256 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
2257 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
2258 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
2259 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
2260 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
2261 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
2262 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
2263 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
2264 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
2265 ----------------------------------------------------------------------
2266 (*) These are not used by any known coding system.
2267
2268 Control characters for these functions are defined by macros
2269 ISO_CODE_XXX in `coding.h'.
2270
2271 Designations are done by the following escape sequences:
2272 ----------------------------------------------------------------------
2273 escape sequence description
2274 ----------------------------------------------------------------------
2275 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
2276 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
2277 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
2278 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
2279 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
2280 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
2281 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
2282 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
2283 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
2284 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
2285 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
2286 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
2287 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
2288 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
2289 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
2290 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
2291 ----------------------------------------------------------------------
2292
2293 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2294 of dimension 1, chars 94, and final character <F>, etc...
2295
2296 Note (*): Although these designations are not allowed in ISO2022,
2297 Emacs accepts them on decoding, and produces them on encoding
2298 CHARS96 character sets in a coding system which is characterized as
2299 7-bit environment, non-locking-shift, and non-single-shift.
2300
2301 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2302 '(' must be omitted. We refer to this as "short-form" hereafter.
2303
2304 Now you may notice that there are a lot of ways of encoding the
2305 same multilingual text in ISO2022. Actually, there exist many
2306 coding systems such as Compound Text (used in X11's inter client
2307 communication, ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2308 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2309 localized platforms), and all of these are variants of ISO2022.
2310
2311 In addition to the above, Emacs handles two more kinds of escape
2312 sequences: ISO6429's direction specification and Emacs' private
2313 sequence for specifying character composition.
2314
2315 ISO6429's direction specification takes the following form:
2316 o CSI ']' -- end of the current direction
2317 o CSI '0' ']' -- end of the current direction
2318 o CSI '1' ']' -- start of left-to-right text
2319 o CSI '2' ']' -- start of right-to-left text
2320 The control character CSI (0x9B: control sequence introducer) is
2321 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2322
2323 Character composition specification takes the following form:
2324 o ESC '0' -- start relative composition
2325 o ESC '1' -- end composition
2326 o ESC '2' -- start rule-base composition (*)
2327 o ESC '3' -- start relative composition with alternate chars (**)
2328 o ESC '4' -- start rule-base composition with alternate chars (**)
2329 Since these are not standard escape sequences of any ISO standard,
2330 the use of them with these meanings is restricted to Emacs only.
2331
2332 (*) This form is used only in Emacs 20.7 and older versions,
2333 but newer versions can safely decode it.
2334 (**) This form is used only in Emacs 21.1 and newer versions,
2335 and older versions can't decode it.
2336
2337 Here's a list of example usages of these composition escape
2338 sequences (categorized by `enum composition_method').
2339
2340 COMPOSITION_RELATIVE:
2341 ESC 0 CHAR [ CHAR ] ESC 1
2342 COMPOSITION_WITH_RULE:
2343 ESC 2 CHAR [ RULE CHAR ] ESC 1
2344 COMPOSITION_WITH_ALTCHARS:
2345 ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2346 COMPOSITION_WITH_RULE_ALTCHARS:
2347 ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2348
2349 enum iso_code_class_type iso_code_class[256];
2350
2351 #define SAFE_CHARSET_P(coding, id) \
2352 ((id) <= (coding)->max_charset_id \
2353 && (coding)->safe_charsets[id] >= 0)
2354
2355
2356 #define SHIFT_OUT_OK(category) \
2357 (CODING_ISO_INITIAL (&coding_categories[category], 1) >= 0)
2358
2359 static void
2360 setup_iso_safe_charsets (attrs)
2361 Lisp_Object attrs;
2362 {
2363 Lisp_Object charset_list, safe_charsets;
2364 Lisp_Object request;
2365 Lisp_Object reg_usage;
2366 Lisp_Object tail;
2367 int reg94, reg96;
2368 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2369 int max_charset_id;
2370
2371 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2372 if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2373 && ! EQ (charset_list, Viso_2022_charset_list))
2374 {
2375 CODING_ATTR_CHARSET_LIST (attrs)
2376 = charset_list = Viso_2022_charset_list;
2377 ASET (attrs, coding_attr_safe_charsets, Qnil);
2378 }
2379
2380 if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2381 return;
2382
2383 max_charset_id = 0;
2384 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2385 {
2386 int id = XINT (XCAR (tail));
2387 if (max_charset_id < id)
2388 max_charset_id = id;
2389 }
2390
2391 safe_charsets = Fmake_string (make_number (max_charset_id + 1),
2392 make_number (255));
2393 request = AREF (attrs, coding_attr_iso_request);
2394 reg_usage = AREF (attrs, coding_attr_iso_usage);
2395 reg94 = XINT (XCAR (reg_usage));
2396 reg96 = XINT (XCDR (reg_usage));
2397
2398 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2399 {
2400 Lisp_Object id;
2401 Lisp_Object reg;
2402 struct charset *charset;
2403
2404 id = XCAR (tail);
2405 charset = CHARSET_FROM_ID (XINT (id));
2406 reg = Fcdr (Fassq (id, request));
2407 if (! NILP (reg))
2408 XSTRING (safe_charsets)->data[XINT (id)] = XINT (reg);
2409 else if (charset->iso_chars_96)
2410 {
2411 if (reg96 < 4)
2412 XSTRING (safe_charsets)->data[XINT (id)] = reg96;
2413 }
2414 else
2415 {
2416 if (reg94 < 4)
2417 XSTRING (safe_charsets)->data[XINT (id)] = reg94;
2418 }
2419 }
2420 ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2421 }
2422
2423
2424 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2425 Check if a text is encoded in one of ISO-2022 based codig systems.
2426 If it is, return 1, else return 0. */
2427
2428 static int
2429 detect_coding_iso_2022 (coding, detect_info)
2430 struct coding_system *coding;
2431 struct coding_detection_info *detect_info;
2432 {
2433 unsigned char *src = coding->source, *src_base = src;
2434 unsigned char *src_end = coding->source + coding->src_bytes;
2435 int multibytep = coding->src_multibyte;
2436 int single_shifting = 0;
2437 int id;
2438 int c, c1;
2439 int consumed_chars = 0;
2440 int i;
2441 int rejected = 0;
2442 int found = 0;
2443
2444 detect_info->checked |= CATEGORY_MASK_ISO;
2445
2446 for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
2447 {
2448 struct coding_system *this = &(coding_categories[i]);
2449 Lisp_Object attrs, val;
2450
2451 attrs = CODING_ID_ATTRS (this->id);
2452 if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2453 && ! EQ (CODING_ATTR_SAFE_CHARSETS (attrs), Viso_2022_charset_list))
2454 setup_iso_safe_charsets (attrs);
2455 val = CODING_ATTR_SAFE_CHARSETS (attrs);
2456 this->max_charset_id = XSTRING (val)->size - 1;
2457 this->safe_charsets = (char *) XSTRING (val)->data;
2458 }
2459
2460 /* A coding system of this category is always ASCII compatible. */
2461 src += coding->head_ascii;
2462
2463 while (rejected != CATEGORY_MASK_ISO)
2464 {
2465 ONE_MORE_BYTE (c);
2466 switch (c)
2467 {
2468 case ISO_CODE_ESC:
2469 if (inhibit_iso_escape_detection)
2470 break;
2471 single_shifting = 0;
2472 ONE_MORE_BYTE (c);
2473 if (c >= '(' && c <= '/')
2474 {
2475 /* Designation sequence for a charset of dimension 1. */
2476 ONE_MORE_BYTE (c1);
2477 if (c1 < ' ' || c1 >= 0x80
2478 || (id = iso_charset_table[0][c >= ','][c1]) < 0)
2479 /* Invalid designation sequence. Just ignore. */
2480 break;
2481 }
2482 else if (c == '$')
2483 {
2484 /* Designation sequence for a charset of dimension 2. */
2485 ONE_MORE_BYTE (c);
2486 if (c >= '@' && c <= 'B')
2487 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
2488 id = iso_charset_table[1][0][c];
2489 else if (c >= '(' && c <= '/')
2490 {
2491 ONE_MORE_BYTE (c1);
2492 if (c1 < ' ' || c1 >= 0x80
2493 || (id = iso_charset_table[1][c >= ','][c1]) < 0)
2494 /* Invalid designation sequence. Just ignore. */
2495 break;
2496 }
2497 else
2498 /* Invalid designation sequence. Just ignore it. */
2499 break;
2500 }
2501 else if (c == 'N' || c == 'O')
2502 {
2503 /* ESC <Fe> for SS2 or SS3. */
2504 single_shifting = 1;
2505 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
2506 break;
2507 }
2508 else if (c >= '0' && c <= '4')
2509 {
2510 /* ESC <Fp> for start/end composition. */
2511 found |= CATEGORY_MASK_ISO;
2512 break;
2513 }
2514 else
2515 {
2516 /* Invalid escape sequence. Just ignore it. */
2517 break;
2518 }
2519
2520 /* We found a valid designation sequence for CHARSET. */
2521 rejected |= CATEGORY_MASK_ISO_8BIT;
2522 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
2523 id))
2524 found |= CATEGORY_MASK_ISO_7;
2525 else
2526 rejected |= CATEGORY_MASK_ISO_7;
2527 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
2528 id))
2529 found |= CATEGORY_MASK_ISO_7_TIGHT;
2530 else
2531 rejected |= CATEGORY_MASK_ISO_7_TIGHT;
2532 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
2533 id))
2534 found |= CATEGORY_MASK_ISO_7_ELSE;
2535 else
2536 rejected |= CATEGORY_MASK_ISO_7_ELSE;
2537 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
2538 id))
2539 found |= CATEGORY_MASK_ISO_8_ELSE;
2540 else
2541 rejected |= CATEGORY_MASK_ISO_8_ELSE;
2542 break;
2543
2544 case ISO_CODE_SO:
2545 case ISO_CODE_SI:
2546 /* Locking shift out/in. */
2547 if (inhibit_iso_escape_detection)
2548 break;
2549 single_shifting = 0;
2550 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
2551 found |= CATEGORY_MASK_ISO_ELSE;
2552 break;
2553
2554 case ISO_CODE_CSI:
2555 /* Control sequence introducer. */
2556 single_shifting = 0;
2557 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
2558 found |= CATEGORY_MASK_ISO_8_ELSE;
2559 goto check_extra_latin;
2560
2561
2562 case ISO_CODE_SS2:
2563 case ISO_CODE_SS3:
2564 /* Single shift. */
2565 if (inhibit_iso_escape_detection)
2566 break;
2567 single_shifting = 1;
2568 rejected |= CATEGORY_MASK_ISO_7BIT;
2569 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2570 & CODING_ISO_FLAG_SINGLE_SHIFT)
2571 found |= CATEGORY_MASK_ISO_8_1;
2572 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
2573 & CODING_ISO_FLAG_SINGLE_SHIFT)
2574 found |= CATEGORY_MASK_ISO_8_2;
2575 goto check_extra_latin;
2576
2577 default:
2578 if (c < 0x80)
2579 {
2580 single_shifting = 0;
2581 break;
2582 }
2583 if (c >= 0xA0)
2584 {
2585 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
2586 found |= CATEGORY_MASK_ISO_8_1;
2587 /* Check the length of succeeding codes of the range
2588 0xA0..0FF. If the byte length is even, we include
2589 CATEGORY_MASK_ISO_8_2 in `found'. We can check this
2590 only when we are not single shifting. */
2591 if (! single_shifting
2592 && ! (rejected & CATEGORY_MASK_ISO_8_2))
2593 {
2594 int i = 1;
2595 while (src < src_end)
2596 {
2597 ONE_MORE_BYTE (c);
2598 if (c < 0xA0)
2599 break;
2600 i++;
2601 }
2602
2603 if (i & 1 && src < src_end)
2604 rejected |= CATEGORY_MASK_ISO_8_2;
2605 else
2606 found |= CATEGORY_MASK_ISO_8_2;
2607 }
2608 break;
2609 }
2610 check_extra_latin:
2611 single_shifting = 0;
2612 if (! VECTORP (Vlatin_extra_code_table)
2613 || NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
2614 {
2615 rejected = CATEGORY_MASK_ISO;
2616 break;
2617 }
2618 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2619 & CODING_ISO_FLAG_LATIN_EXTRA)
2620 found |= CATEGORY_MASK_ISO_8_1;
2621 else
2622 rejected |= CATEGORY_MASK_ISO_8_1;
2623 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
2624 & CODING_ISO_FLAG_LATIN_EXTRA)
2625 found |= CATEGORY_MASK_ISO_8_2;
2626 else
2627 rejected |= CATEGORY_MASK_ISO_8_2;
2628 }
2629 }
2630 detect_info->rejected |= CATEGORY_MASK_ISO;
2631 return 0;
2632
2633 no_more_source:
2634 detect_info->rejected |= rejected;
2635 detect_info->found |= (found & ~rejected);
2636 return 1;
2637 }
2638
2639
2640 /* Set designation state into CODING. */
2641 #define DECODE_DESIGNATION(reg, dim, chars_96, final) \
2642 do { \
2643 int id, prev; \
2644 \
2645 if (final < '0' || final >= 128 \
2646 || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0) \
2647 || !SAFE_CHARSET_P (coding, id)) \
2648 { \
2649 CODING_ISO_DESIGNATION (coding, reg) = -2; \
2650 goto invalid_code; \
2651 } \
2652 prev = CODING_ISO_DESIGNATION (coding, reg); \
2653 if (id == charset_jisx0201_roman) \
2654 { \
2655 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN) \
2656 id = charset_ascii; \
2657 } \
2658 else if (id == charset_jisx0208_1978) \
2659 { \
2660 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
2661 id = charset_jisx0208; \
2662 } \
2663 CODING_ISO_DESIGNATION (coding, reg) = id; \
2664 /* If there was an invalid designation to REG previously, and this \
2665 designation is ASCII to REG, we should keep this designation \
2666 sequence. */ \
2667 if (prev == -2 && id == charset_ascii) \
2668 goto invalid_code; \
2669 } while (0)
2670
2671
2672 #define MAYBE_FINISH_COMPOSITION() \
2673 do { \
2674 int i; \
2675 if (composition_state == COMPOSING_NO) \
2676 break; \
2677 /* It is assured that we have enough room for producing \
2678 characters stored in the table `components'. */ \
2679 if (charbuf + component_idx > charbuf_end) \
2680 goto no_more_source; \
2681 composition_state = COMPOSING_NO; \
2682 if (method == COMPOSITION_RELATIVE \
2683 || method == COMPOSITION_WITH_ALTCHARS) \
2684 { \
2685 for (i = 0; i < component_idx; i++) \
2686 *charbuf++ = components[i]; \
2687 char_offset += component_idx; \
2688 } \
2689 else \
2690 { \
2691 for (i = 0; i < component_idx; i += 2) \
2692 *charbuf++ = components[i]; \
2693 char_offset += (component_idx / 2) + 1; \
2694 } \
2695 } while (0)
2696
2697
2698 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
2699 ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
2700 ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
2701 ESC 3 : altchar composition : ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
2702 ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1
2703 */
2704
2705 #define DECODE_COMPOSITION_START(c1) \
2706 do { \
2707 if (c1 == '0' \
2708 && composition_state == COMPOSING_COMPONENT_RULE) \
2709 { \
2710 component_len = component_idx; \
2711 composition_state = COMPOSING_CHAR; \
2712 } \
2713 else \
2714 { \
2715 unsigned char *p; \
2716 \
2717 MAYBE_FINISH_COMPOSITION (); \
2718 if (charbuf + MAX_COMPOSITION_COMPONENTS > charbuf_end) \
2719 goto no_more_source; \
2720 for (p = src; p < src_end - 1; p++) \
2721 if (*p == ISO_CODE_ESC && p[1] == '1') \
2722 break; \
2723 if (p == src_end - 1) \
2724 { \
2725 if (coding->mode & CODING_MODE_LAST_BLOCK) \
2726 goto invalid_code; \
2727 goto no_more_source; \
2728 } \
2729 \
2730 /* This is surely the start of a composition. */ \
2731 method = (c1 == '0' ? COMPOSITION_RELATIVE \
2732 : c1 == '2' ? COMPOSITION_WITH_RULE \
2733 : c1 == '3' ? COMPOSITION_WITH_ALTCHARS \
2734 : COMPOSITION_WITH_RULE_ALTCHARS); \
2735 composition_state = (c1 <= '2' ? COMPOSING_CHAR \
2736 : COMPOSING_COMPONENT_CHAR); \
2737 component_idx = component_len = 0; \
2738 } \
2739 } while (0)
2740
2741
2742 /* Handle compositoin end sequence ESC 1. */
2743
2744 #define DECODE_COMPOSITION_END() \
2745 do { \
2746 int nchars = (component_len > 0 ? component_idx - component_len \
2747 : method == COMPOSITION_RELATIVE ? component_idx \
2748 : (component_idx + 1) / 2); \
2749 int i; \
2750 int *saved_charbuf = charbuf; \
2751 int from = coding->produced_char + char_offset; \
2752 int to = from + nchars; \
2753 \
2754 ADD_COMPOSITION_DATA (charbuf, from, to, method); \
2755 if (method != COMPOSITION_RELATIVE) \
2756 { \
2757 if (component_len == 0) \
2758 for (i = 0; i < component_idx; i++) \
2759 *charbuf++ = components[i]; \
2760 else \
2761 for (i = 0; i < component_len; i++) \
2762 *charbuf++ = components[i]; \
2763 *saved_charbuf = saved_charbuf - charbuf; \
2764 } \
2765 if (method == COMPOSITION_WITH_RULE) \
2766 for (i = 0; i < component_idx; i += 2, char_offset++) \
2767 *charbuf++ = components[i]; \
2768 else \
2769 for (i = component_len; i < component_idx; i++, char_offset++) \
2770 *charbuf++ = components[i]; \
2771 coding->annotated = 1; \
2772 composition_state = COMPOSING_NO; \
2773 } while (0)
2774
2775
2776 /* Decode a composition rule from the byte C1 (and maybe one more byte
2777 from SRC) and store one encoded composition rule in
2778 coding->cmp_data. */
2779
2780 #define DECODE_COMPOSITION_RULE(c1) \
2781 do { \
2782 (c1) -= 32; \
2783 if (c1 < 81) /* old format (before ver.21) */ \
2784 { \
2785 int gref = (c1) / 9; \
2786 int nref = (c1) % 9; \
2787 if (gref == 4) gref = 10; \
2788 if (nref == 4) nref = 10; \
2789 c1 = COMPOSITION_ENCODE_RULE (gref, nref); \
2790 } \
2791 else if (c1 < 93) /* new format (after ver.21) */ \
2792 { \
2793 ONE_MORE_BYTE (c2); \
2794 c1 = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32); \
2795 } \
2796 else \
2797 c1 = 0; \
2798 } while (0)
2799
2800
2801 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
2802
2803 static void
2804 decode_coding_iso_2022 (coding)
2805 struct coding_system *coding;
2806 {
2807 unsigned char *src = coding->source + coding->consumed;
2808 unsigned char *src_end = coding->source + coding->src_bytes;
2809 unsigned char *src_base;
2810 int *charbuf = coding->charbuf;
2811 int *charbuf_end
2812 = charbuf + coding->charbuf_size - 4 - MAX_ANNOTATION_LENGTH;
2813 int consumed_chars = 0, consumed_chars_base;
2814 int multibytep = coding->src_multibyte;
2815 /* Charsets invoked to graphic plane 0 and 1 respectively. */
2816 int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2817 int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
2818 struct charset *charset;
2819 int c;
2820 /* For handling composition sequence. */
2821 #define COMPOSING_NO 0
2822 #define COMPOSING_CHAR 1
2823 #define COMPOSING_RULE 2
2824 #define COMPOSING_COMPONENT_CHAR 3
2825 #define COMPOSING_COMPONENT_RULE 4
2826
2827 int composition_state = COMPOSING_NO;
2828 enum composition_method method;
2829 int components[MAX_COMPOSITION_COMPONENTS * 2 + 1];
2830 int component_idx;
2831 int component_len;
2832 Lisp_Object attrs, eol_type, charset_list;
2833 int char_offset = coding->produced_char;
2834 int last_offset = char_offset;
2835 int last_id = charset_ascii;
2836
2837 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
2838 setup_iso_safe_charsets (attrs);
2839
2840 while (1)
2841 {
2842 int c1, c2;
2843
2844 src_base = src;
2845 consumed_chars_base = consumed_chars;
2846
2847 if (charbuf >= charbuf_end)
2848 break;
2849
2850 ONE_MORE_BYTE (c1);
2851
2852 /* We produce at most one character. */
2853 switch (iso_code_class [c1])
2854 {
2855 case ISO_0x20_or_0x7F:
2856 if (composition_state != COMPOSING_NO)
2857 {
2858 if (composition_state == COMPOSING_RULE
2859 || composition_state == COMPOSING_COMPONENT_RULE)
2860 {
2861 DECODE_COMPOSITION_RULE (c1);
2862 components[component_idx++] = c1;
2863 composition_state--;
2864 continue;
2865 }
2866 }
2867 if (charset_id_0 < 0
2868 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
2869 /* This is SPACE or DEL. */
2870 charset = CHARSET_FROM_ID (charset_ascii);
2871 else
2872 charset = CHARSET_FROM_ID (charset_id_0);
2873 break;
2874
2875 case ISO_graphic_plane_0:
2876 if (composition_state != COMPOSING_NO)
2877 {
2878 if (composition_state == COMPOSING_RULE
2879 || composition_state == COMPOSING_COMPONENT_RULE)
2880 {
2881 DECODE_COMPOSITION_RULE (c1);
2882 components[component_idx++] = c1;
2883 composition_state--;
2884 continue;
2885 }
2886 }
2887 charset = CHARSET_FROM_ID (charset_id_0);
2888 break;
2889
2890 case ISO_0xA0_or_0xFF:
2891 if (charset_id_1 < 0
2892 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
2893 || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
2894 goto invalid_code;
2895 /* This is a graphic character, we fall down ... */
2896
2897 case ISO_graphic_plane_1:
2898 if (charset_id_1 < 0)
2899 goto invalid_code;
2900 charset = CHARSET_FROM_ID (charset_id_1);
2901 break;
2902
2903 case ISO_carriage_return:
2904 if (c1 == '\r')
2905 {
2906 if (EQ (eol_type, Qdos))
2907 {
2908 if (src == src_end)
2909 {
2910 coding->result = CODING_RESULT_INSUFFICIENT_SRC;
2911 goto no_more_source;
2912 }
2913 if (*src == '\n')
2914 ONE_MORE_BYTE (c1);
2915 }
2916 else if (EQ (eol_type, Qmac))
2917 c1 = '\n';
2918 }
2919 /* fall through */
2920
2921 case ISO_control_0:
2922 MAYBE_FINISH_COMPOSITION ();
2923 charset = CHARSET_FROM_ID (charset_ascii);
2924 break;
2925
2926 case ISO_control_1:
2927 MAYBE_FINISH_COMPOSITION ();
2928 goto invalid_code;
2929
2930 case ISO_shift_out:
2931 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
2932 || CODING_ISO_DESIGNATION (coding, 1) < 0)
2933 goto invalid_code;
2934 CODING_ISO_INVOCATION (coding, 0) = 1;
2935 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2936 continue;
2937
2938 case ISO_shift_in:
2939 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
2940 goto invalid_code;
2941 CODING_ISO_INVOCATION (coding, 0) = 0;
2942 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2943 continue;
2944
2945 case ISO_single_shift_2_7:
2946 case ISO_single_shift_2:
2947 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
2948 goto invalid_code;
2949 /* SS2 is handled as an escape sequence of ESC 'N' */
2950 c1 = 'N';
2951 goto label_escape_sequence;
2952
2953 case ISO_single_shift_3:
2954 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
2955 goto invalid_code;
2956 /* SS2 is handled as an escape sequence of ESC 'O' */
2957 c1 = 'O';
2958 goto label_escape_sequence;
2959
2960 case ISO_control_sequence_introducer:
2961 /* CSI is handled as an escape sequence of ESC '[' ... */
2962 c1 = '[';
2963 goto label_escape_sequence;
2964
2965 case ISO_escape:
2966 ONE_MORE_BYTE (c1);
2967 label_escape_sequence:
2968 /* Escape sequences handled here are invocation,
2969 designation, direction specification, and character
2970 composition specification. */
2971 switch (c1)
2972 {
2973 case '&': /* revision of following character set */
2974 ONE_MORE_BYTE (c1);
2975 if (!(c1 >= '@' && c1 <= '~'))
2976 goto invalid_code;
2977 ONE_MORE_BYTE (c1);
2978 if (c1 != ISO_CODE_ESC)
2979 goto invalid_code;
2980 ONE_MORE_BYTE (c1);
2981 goto label_escape_sequence;
2982
2983 case '$': /* designation of 2-byte character set */
2984 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
2985 goto invalid_code;
2986 ONE_MORE_BYTE (c1);
2987 if (c1 >= '@' && c1 <= 'B')
2988 { /* designation of JISX0208.1978, GB2312.1980,
2989 or JISX0208.1980 */
2990 DECODE_DESIGNATION (0, 2, 0, c1);
2991 }
2992 else if (c1 >= 0x28 && c1 <= 0x2B)
2993 { /* designation of DIMENSION2_CHARS94 character set */
2994 ONE_MORE_BYTE (c2);
2995 DECODE_DESIGNATION (c1 - 0x28, 2, 0, c2);
2996 }
2997 else if (c1 >= 0x2C && c1 <= 0x2F)
2998 { /* designation of DIMENSION2_CHARS96 character set */
2999 ONE_MORE_BYTE (c2);
3000 DECODE_DESIGNATION (c1 - 0x2C, 2, 1, c2);
3001 }
3002 else
3003 goto invalid_code;
3004 /* We must update these variables now. */
3005 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3006 charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3007 continue;
3008
3009 case 'n': /* invocation of locking-shift-2 */
3010 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3011 || CODING_ISO_DESIGNATION (coding, 2) < 0)
3012 goto invalid_code;
3013 CODING_ISO_INVOCATION (coding, 0) = 2;
3014 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3015 continue;
3016
3017 case 'o': /* invocation of locking-shift-3 */
3018 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3019 || CODING_ISO_DESIGNATION (coding, 3) < 0)
3020 goto invalid_code;
3021 CODING_ISO_INVOCATION (coding, 0) = 3;
3022 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3023 continue;
3024
3025 case 'N': /* invocation of single-shift-2 */
3026 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3027 || CODING_ISO_DESIGNATION (coding, 2) < 0)
3028 goto invalid_code;
3029 charset = CHARSET_FROM_ID (CODING_ISO_DESIGNATION (coding, 2));
3030 ONE_MORE_BYTE (c1);
3031 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3032 goto invalid_code;
3033 break;
3034
3035 case 'O': /* invocation of single-shift-3 */
3036 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3037 || CODING_ISO_DESIGNATION (coding, 3) < 0)
3038 goto invalid_code;
3039 charset = CHARSET_FROM_ID (CODING_ISO_DESIGNATION (coding, 3));
3040 ONE_MORE_BYTE (c1);
3041 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3042 goto invalid_code;
3043 break;
3044
3045 case '0': case '2': case '3': case '4': /* start composition */
3046 if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3047 goto invalid_code;
3048 DECODE_COMPOSITION_START (c1);
3049 continue;
3050
3051 case '1': /* end composition */
3052 if (composition_state == COMPOSING_NO)
3053 goto invalid_code;
3054 DECODE_COMPOSITION_END ();
3055 continue;
3056
3057 case '[': /* specification of direction */
3058 if (! CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION)
3059 goto invalid_code;
3060 /* For the moment, nested direction is not supported.
3061 So, `coding->mode & CODING_MODE_DIRECTION' zero means
3062 left-to-right, and nozero means right-to-left. */
3063 ONE_MORE_BYTE (c1);
3064 switch (c1)
3065 {
3066 case ']': /* end of the current direction */
3067 coding->mode &= ~CODING_MODE_DIRECTION;
3068
3069 case '0': /* end of the current direction */
3070 case '1': /* start of left-to-right direction */
3071 ONE_MORE_BYTE (c1);
3072 if (c1 == ']')
3073 coding->mode &= ~CODING_MODE_DIRECTION;
3074 else
3075 goto invalid_code;
3076 break;
3077
3078 case '2': /* start of right-to-left direction */
3079 ONE_MORE_BYTE (c1);
3080 if (c1 == ']')
3081 coding->mode |= CODING_MODE_DIRECTION;
3082 else
3083 goto invalid_code;
3084 break;
3085
3086 default:
3087 goto invalid_code;
3088 }
3089 continue;
3090
3091 default:
3092 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3093 goto invalid_code;
3094 if (c1 >= 0x28 && c1 <= 0x2B)
3095 { /* designation of DIMENSION1_CHARS94 character set */
3096 ONE_MORE_BYTE (c2);
3097 DECODE_DESIGNATION (c1 - 0x28, 1, 0, c2);
3098 }
3099 else if (c1 >= 0x2C && c1 <= 0x2F)
3100 { /* designation of DIMENSION1_CHARS96 character set */
3101 ONE_MORE_BYTE (c2);
3102 DECODE_DESIGNATION (c1 - 0x2C, 1, 1, c2);
3103 }
3104 else
3105 goto invalid_code;
3106 /* We must update these variables now. */
3107 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3108 charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3109 continue;
3110 }
3111 }
3112
3113 if (charset->id != charset_ascii
3114 && last_id != charset->id)
3115 {
3116 if (last_id != charset_ascii)
3117 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
3118 last_id = charset->id;
3119 last_offset = char_offset;
3120 }
3121
3122 /* Now we know CHARSET and 1st position code C1 of a character.
3123 Produce a decoded character while getting 2nd position code
3124 C2 if necessary. */
3125 c1 &= 0x7F;
3126 if (CHARSET_DIMENSION (charset) > 1)
3127 {
3128 ONE_MORE_BYTE (c2);
3129 if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
3130 /* C2 is not in a valid range. */
3131 goto invalid_code;
3132 c1 = (c1 << 8) | (c2 & 0x7F);
3133 if (CHARSET_DIMENSION (charset) > 2)
3134 {
3135 ONE_MORE_BYTE (c2);
3136 if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
3137 /* C2 is not in a valid range. */
3138 goto invalid_code;
3139 c1 = (c1 << 8) | (c2 & 0x7F);
3140 }
3141 }
3142
3143 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3144 if (c < 0)
3145 {
3146 MAYBE_FINISH_COMPOSITION ();
3147 for (; src_base < src; src_base++, char_offset++)
3148 {
3149 if (ASCII_BYTE_P (*src_base))
3150 *charbuf++ = *src_base;
3151 else
3152 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3153 char_offset++;
3154 }
3155 }
3156 else if (composition_state == COMPOSING_NO)
3157 {
3158 *charbuf++ = c;
3159 char_offset++;
3160 }
3161 else
3162 {
3163 components[component_idx++] = c;
3164 if (method == COMPOSITION_WITH_RULE
3165 || (method == COMPOSITION_WITH_RULE_ALTCHARS
3166 && composition_state == COMPOSING_COMPONENT_CHAR))
3167 composition_state++;
3168 }
3169 continue;
3170
3171 invalid_code:
3172 MAYBE_FINISH_COMPOSITION ();
3173 src = src_base;
3174 consumed_chars = consumed_chars_base;
3175 ONE_MORE_BYTE (c);
3176 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
3177 char_offset++;
3178 coding->errors++;
3179 }
3180
3181 no_more_source:
3182 if (last_id != charset_ascii)
3183 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
3184 coding->consumed_char += consumed_chars_base;
3185 coding->consumed = src_base - coding->source;
3186 coding->charbuf_used = charbuf - coding->charbuf;
3187 }
3188
3189
3190 /* ISO2022 encoding stuff. */
3191
3192 /*
3193 It is not enough to say just "ISO2022" on encoding, we have to
3194 specify more details. In Emacs, each coding system of ISO2022
3195 variant has the following specifications:
3196 1. Initial designation to G0 thru G3.
3197 2. Allows short-form designation?
3198 3. ASCII should be designated to G0 before control characters?
3199 4. ASCII should be designated to G0 at end of line?
3200 5. 7-bit environment or 8-bit environment?
3201 6. Use locking-shift?
3202 7. Use Single-shift?
3203 And the following two are only for Japanese:
3204 8. Use ASCII in place of JIS0201-1976-Roman?
3205 9. Use JISX0208-1983 in place of JISX0208-1978?
3206 These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
3207 defined by macros CODING_ISO_FLAG_XXX. See `coding.h' for more
3208 details.
3209 */
3210
3211 /* Produce codes (escape sequence) for designating CHARSET to graphic
3212 register REG at DST, and increment DST. If <final-char> of CHARSET is
3213 '@', 'A', or 'B' and the coding system CODING allows, produce
3214 designation sequence of short-form. */
3215
3216 #define ENCODE_DESIGNATION(charset, reg, coding) \
3217 do { \
3218 unsigned char final_char = CHARSET_ISO_FINAL (charset); \
3219 char *intermediate_char_94 = "()*+"; \
3220 char *intermediate_char_96 = ",-./"; \
3221 int revision = -1; \
3222 int c; \
3223 \
3224 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION) \
3225 revision = CHARSET_ISO_REVISION (charset); \
3226 \
3227 if (revision >= 0) \
3228 { \
3229 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&'); \
3230 EMIT_ONE_BYTE ('@' + revision); \
3231 } \
3232 EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC); \
3233 if (CHARSET_DIMENSION (charset) == 1) \
3234 { \
3235 if (! CHARSET_ISO_CHARS_96 (charset)) \
3236 c = intermediate_char_94[reg]; \
3237 else \
3238 c = intermediate_char_96[reg]; \
3239 EMIT_ONE_ASCII_BYTE (c); \
3240 } \
3241 else \
3242 { \
3243 EMIT_ONE_ASCII_BYTE ('$'); \
3244 if (! CHARSET_ISO_CHARS_96 (charset)) \
3245 { \
3246 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM \
3247 || reg != 0 \
3248 || final_char < '@' || final_char > 'B') \
3249 EMIT_ONE_ASCII_BYTE (intermediate_char_94[reg]); \
3250 } \
3251 else \
3252 EMIT_ONE_ASCII_BYTE (intermediate_char_96[reg]); \
3253 } \
3254 EMIT_ONE_ASCII_BYTE (final_char); \
3255 \
3256 CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset); \
3257 } while (0)
3258
3259
3260 /* The following two macros produce codes (control character or escape
3261 sequence) for ISO2022 single-shift functions (single-shift-2 and
3262 single-shift-3). */
3263
3264 #define ENCODE_SINGLE_SHIFT_2 \
3265 do { \
3266 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3267 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N'); \
3268 else \
3269 EMIT_ONE_BYTE (ISO_CODE_SS2); \
3270 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \
3271 } while (0)
3272
3273
3274 #define ENCODE_SINGLE_SHIFT_3 \
3275 do { \
3276 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3277 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O'); \
3278 else \
3279 EMIT_ONE_BYTE (ISO_CODE_SS3); \
3280 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \
3281 } while (0)
3282
3283
3284 /* The following four macros produce codes (control character or
3285 escape sequence) for ISO2022 locking-shift functions (shift-in,
3286 shift-out, locking-shift-2, and locking-shift-3). */
3287
3288 #define ENCODE_SHIFT_IN \
3289 do { \
3290 EMIT_ONE_ASCII_BYTE (ISO_CODE_SI); \
3291 CODING_ISO_INVOCATION (coding, 0) = 0; \
3292 } while (0)
3293
3294
3295 #define ENCODE_SHIFT_OUT \
3296 do { \
3297 EMIT_ONE_ASCII_BYTE (ISO_CODE_SO); \
3298 CODING_ISO_INVOCATION (coding, 0) = 1; \
3299 } while (0)
3300
3301
3302 #define ENCODE_LOCKING_SHIFT_2 \
3303 do { \
3304 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n'); \
3305 CODING_ISO_INVOCATION (coding, 0) = 2; \
3306 } while (0)
3307
3308
3309 #define ENCODE_LOCKING_SHIFT_3 \
3310 do { \
3311 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n'); \
3312 CODING_ISO_INVOCATION (coding, 0) = 3; \
3313 } while (0)
3314
3315
3316 /* Produce codes for a DIMENSION1 character whose character set is
3317 CHARSET and whose position-code is C1. Designation and invocation
3318 sequences are also produced in advance if necessary. */
3319
3320 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1) \
3321 do { \
3322 int id = CHARSET_ID (charset); \
3323 \
3324 if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN) \
3325 && id == charset_ascii) \
3326 { \
3327 id = charset_jisx0201_roman; \
3328 charset = CHARSET_FROM_ID (id); \
3329 } \
3330 \
3331 if (CODING_ISO_SINGLE_SHIFTING (coding)) \
3332 { \
3333 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3334 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \
3335 else \
3336 EMIT_ONE_BYTE (c1 | 0x80); \
3337 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \
3338 break; \
3339 } \
3340 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \
3341 { \
3342 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \
3343 break; \
3344 } \
3345 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \
3346 { \
3347 EMIT_ONE_BYTE (c1 | 0x80); \
3348 break; \
3349 } \
3350 else \
3351 /* Since CHARSET is not yet invoked to any graphic planes, we \
3352 must invoke it, or, at first, designate it to some graphic \
3353 register. Then repeat the loop to actually produce the \
3354 character. */ \
3355 dst = encode_invocation_designation (charset, coding, dst, \
3356 &produced_chars); \
3357 } while (1)
3358
3359
3360 /* Produce codes for a DIMENSION2 character whose character set is
3361 CHARSET and whose position-codes are C1 and C2. Designation and
3362 invocation codes are also produced in advance if necessary. */
3363
3364 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2) \
3365 do { \
3366 int id = CHARSET_ID (charset); \
3367 \
3368 if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
3369 && id == charset_jisx0208) \
3370 { \
3371 id = charset_jisx0208_1978; \
3372 charset = CHARSET_FROM_ID (id); \
3373 } \
3374 \
3375 if (CODING_ISO_SINGLE_SHIFTING (coding)) \
3376 { \
3377 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3378 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \
3379 else \
3380 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \
3381 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \
3382 break; \
3383 } \
3384 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \
3385 { \
3386 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \
3387 break; \
3388 } \
3389 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \
3390 { \
3391 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \
3392 break; \
3393 } \
3394 else \
3395 /* Since CHARSET is not yet invoked to any graphic planes, we \
3396 must invoke it, or, at first, designate it to some graphic \
3397 register. Then repeat the loop to actually produce the \
3398 character. */ \
3399 dst = encode_invocation_designation (charset, coding, dst, \
3400 &produced_chars); \
3401 } while (1)
3402
3403
3404 #define ENCODE_ISO_CHARACTER(charset, c) \
3405 do { \
3406 int code = ENCODE_CHAR ((charset),(c)); \
3407 \
3408 if (CHARSET_DIMENSION (charset) == 1) \
3409 ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code); \
3410 else \
3411 ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
3412 } while (0)
3413
3414
3415 /* Produce designation and invocation codes at a place pointed by DST
3416 to use CHARSET. The element `spec.iso_2022' of *CODING is updated.
3417 Return new DST. */
3418
3419 unsigned char *
3420 encode_invocation_designation (charset, coding, dst, p_nchars)
3421 struct charset *charset;
3422 struct coding_system *coding;
3423 unsigned char *dst;
3424 int *p_nchars;
3425 {
3426 int multibytep = coding->dst_multibyte;
3427 int produced_chars = *p_nchars;
3428 int reg; /* graphic register number */
3429 int id = CHARSET_ID (charset);
3430
3431 /* At first, check designations. */
3432 for (reg = 0; reg < 4; reg++)
3433 if (id == CODING_ISO_DESIGNATION (coding, reg))
3434 break;
3435
3436 if (reg >= 4)
3437 {
3438 /* CHARSET is not yet designated to any graphic registers. */
3439 /* At first check the requested designation. */
3440 reg = CODING_ISO_REQUEST (coding, id);
3441 if (reg < 0)
3442 /* Since CHARSET requests no special designation, designate it
3443 to graphic register 0. */
3444 reg = 0;
3445
3446 ENCODE_DESIGNATION (charset, reg, coding);
3447 }
3448
3449 if (CODING_ISO_INVOCATION (coding, 0) != reg
3450 && CODING_ISO_INVOCATION (coding, 1) != reg)
3451 {
3452 /* Since the graphic register REG is not invoked to any graphic
3453 planes, invoke it to graphic plane 0. */
3454 switch (reg)
3455 {
3456 case 0: /* graphic register 0 */
3457 ENCODE_SHIFT_IN;
3458 break;
3459
3460 case 1: /* graphic register 1 */
3461 ENCODE_SHIFT_OUT;
3462 break;
3463
3464 case 2: /* graphic register 2 */
3465 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3466 ENCODE_SINGLE_SHIFT_2;
3467 else
3468 ENCODE_LOCKING_SHIFT_2;
3469 break;
3470
3471 case 3: /* graphic register 3 */
3472 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3473 ENCODE_SINGLE_SHIFT_3;
3474 else
3475 ENCODE_LOCKING_SHIFT_3;
3476 break;
3477 }
3478 }
3479
3480 *p_nchars = produced_chars;
3481 return dst;
3482 }
3483
3484 /* The following three macros produce codes for indicating direction
3485 of text. */
3486 #define ENCODE_CONTROL_SEQUENCE_INTRODUCER \
3487 do { \
3488 if (CODING_ISO_FLAGS (coding) == CODING_ISO_FLAG_SEVEN_BITS) \
3489 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '['); \
3490 else \
3491 EMIT_ONE_BYTE (ISO_CODE_CSI); \
3492 } while (0)
3493
3494
3495 #define ENCODE_DIRECTION_R2L() \
3496 do { \
3497 ENCODE_CONTROL_SEQUENCE_INTRODUCER (dst); \
3498 EMIT_TWO_ASCII_BYTES ('2', ']'); \
3499 } while (0)
3500
3501
3502 #define ENCODE_DIRECTION_L2R() \
3503 do { \
3504 ENCODE_CONTROL_SEQUENCE_INTRODUCER (dst); \
3505 EMIT_TWO_ASCII_BYTES ('0', ']'); \
3506 } while (0)
3507
3508
3509 /* Produce codes for designation and invocation to reset the graphic
3510 planes and registers to initial state. */
3511 #define ENCODE_RESET_PLANE_AND_REGISTER() \
3512 do { \
3513 int reg; \
3514 struct charset *charset; \
3515 \
3516 if (CODING_ISO_INVOCATION (coding, 0) != 0) \
3517 ENCODE_SHIFT_IN; \
3518 for (reg = 0; reg < 4; reg++) \
3519 if (CODING_ISO_INITIAL (coding, reg) >= 0 \
3520 && (CODING_ISO_DESIGNATION (coding, reg) \
3521 != CODING_ISO_INITIAL (coding, reg))) \
3522 { \
3523 charset = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg)); \
3524 ENCODE_DESIGNATION (charset, reg, coding); \
3525 } \
3526 } while (0)
3527
3528
3529 /* Produce designation sequences of charsets in the line started from
3530 SRC to a place pointed by DST, and return updated DST.
3531
3532 If the current block ends before any end-of-line, we may fail to
3533 find all the necessary designations. */
3534
3535 static unsigned char *
3536 encode_designation_at_bol (coding, charbuf, charbuf_end, dst)
3537 struct coding_system *coding;
3538 int *charbuf, *charbuf_end;
3539 unsigned char *dst;
3540 {
3541 struct charset *charset;
3542 /* Table of charsets to be designated to each graphic register. */
3543 int r[4];
3544 int c, found = 0, reg;
3545 int produced_chars = 0;
3546 int multibytep = coding->dst_multibyte;
3547 Lisp_Object attrs;
3548 Lisp_Object charset_list;
3549
3550 attrs = CODING_ID_ATTRS (coding->id);
3551 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
3552 if (EQ (charset_list, Qiso_2022))
3553 charset_list = Viso_2022_charset_list;
3554
3555 for (reg = 0; reg < 4; reg++)
3556 r[reg] = -1;
3557
3558 while (found < 4)
3559 {
3560 int id;
3561
3562 c = *charbuf++;
3563 if (c == '\n')
3564 break;
3565 charset = char_charset (c, charset_list, NULL);
3566 id = CHARSET_ID (charset);
3567 reg = CODING_ISO_REQUEST (coding, id);
3568 if (reg >= 0 && r[reg] < 0)
3569 {
3570 found++;
3571 r[reg] = id;
3572 }
3573 }
3574
3575 if (found)
3576 {
3577 for (reg = 0; reg < 4; reg++)
3578 if (r[reg] >= 0
3579 && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
3580 ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
3581 }
3582
3583 return dst;
3584 }
3585
3586 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
3587
3588 static int
3589 encode_coding_iso_2022 (coding)
3590 struct coding_system *coding;
3591 {
3592 int multibytep = coding->dst_multibyte;
3593 int *charbuf = coding->charbuf;
3594 int *charbuf_end = charbuf + coding->charbuf_used;
3595 unsigned char *dst = coding->destination + coding->produced;
3596 unsigned char *dst_end = coding->destination + coding->dst_bytes;
3597 int safe_room = 16;
3598 int bol_designation
3599 = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
3600 && CODING_ISO_BOL (coding));
3601 int produced_chars = 0;
3602 Lisp_Object attrs, eol_type, charset_list;
3603 int ascii_compatible;
3604 int c;
3605 int preferred_charset_id = -1;
3606
3607 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
3608 setup_iso_safe_charsets (attrs);
3609 /* Charset list may have been changed. */
3610 charset_list = CODING_ATTR_CHARSET_LIST (attrs); \
3611 coding->safe_charsets
3612 = (char *) XSTRING (CODING_ATTR_SAFE_CHARSETS(attrs))->data;
3613
3614 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
3615
3616 while (charbuf < charbuf_end)
3617 {
3618 ASSURE_DESTINATION (safe_room);
3619
3620 if (bol_designation)
3621 {
3622 unsigned char *dst_prev = dst;
3623
3624 /* We have to produce designation sequences if any now. */
3625 dst = encode_designation_at_bol (coding, charbuf, charbuf_end, dst);
3626 bol_designation = 0;
3627 /* We are sure that designation sequences are all ASCII bytes. */
3628 produced_chars += dst - dst_prev;
3629 }
3630
3631 c = *charbuf++;
3632
3633 if (c < 0)
3634 {
3635 /* Handle an annotation. */
3636 switch (*charbuf)
3637 {
3638 case CODING_ANNOTATE_COMPOSITION_MASK:
3639 /* Not yet implemented. */
3640 break;
3641 case CODING_ANNOTATE_CHARSET_MASK:
3642 preferred_charset_id = charbuf[3];
3643 if (preferred_charset_id >= 0
3644 && NILP (Fmemq (make_number (preferred_charset_id),
3645 charset_list)))
3646 preferred_charset_id = -1;
3647 break;
3648 default:
3649 abort ();
3650 }
3651 charbuf += -c - 1;
3652 continue;
3653 }
3654
3655 /* Now encode the character C. */
3656 if (c < 0x20 || c == 0x7F)
3657 {
3658 if (c == '\n'
3659 || (c == '\r' && EQ (eol_type, Qmac)))
3660 {
3661 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
3662 ENCODE_RESET_PLANE_AND_REGISTER ();
3663 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
3664 {
3665 int i;
3666
3667 for (i = 0; i < 4; i++)
3668 CODING_ISO_DESIGNATION (coding, i)
3669 = CODING_ISO_INITIAL (coding, i);
3670 }
3671 bol_designation
3672 = CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL;
3673 }
3674 else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
3675 ENCODE_RESET_PLANE_AND_REGISTER ();
3676 EMIT_ONE_ASCII_BYTE (c);
3677 }
3678 else if (ASCII_CHAR_P (c))
3679 {
3680 if (ascii_compatible)
3681 EMIT_ONE_ASCII_BYTE (c);
3682 else
3683 {
3684 struct charset *charset = CHARSET_FROM_ID (charset_ascii);
3685 ENCODE_ISO_CHARACTER (charset, c);
3686 }
3687 }
3688 else if (CHAR_BYTE8_P (c))
3689 {
3690 c = CHAR_TO_BYTE8 (c);
3691 EMIT_ONE_BYTE (c);
3692 }
3693 else
3694 {
3695 struct charset *charset;
3696
3697 if (preferred_charset_id >= 0)
3698 {
3699 charset = CHARSET_FROM_ID (preferred_charset_id);
3700 if (! CHAR_CHARSET_P (c, charset))
3701 charset = char_charset (c, charset_list, NULL);
3702 }
3703 else
3704 charset = char_charset (c, charset_list, NULL);
3705 if (!charset)
3706 {
3707 if (coding->mode & CODING_MODE_SAFE_ENCODING)
3708 {
3709 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
3710 charset = CHARSET_FROM_ID (charset_ascii);
3711 }
3712 else
3713 {
3714 c = coding->default_char;
3715 charset = char_charset (c, charset_list, NULL);
3716 }
3717 }
3718 ENCODE_ISO_CHARACTER (charset, c);
3719 }
3720 }
3721
3722 if (coding->mode & CODING_MODE_LAST_BLOCK
3723 && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
3724 {
3725 ASSURE_DESTINATION (safe_room);
3726 ENCODE_RESET_PLANE_AND_REGISTER ();
3727 }
3728 coding->result = CODING_RESULT_SUCCESS;
3729 CODING_ISO_BOL (coding) = bol_designation;
3730 coding->produced_char += produced_chars;
3731 coding->produced = dst - coding->destination;
3732 return 0;
3733 }
3734
3735 \f
3736 /*** 8,9. SJIS and BIG5 handlers ***/
3737
3738 /* Although SJIS and BIG5 are not ISO's coding system, they are used
3739 quite widely. So, for the moment, Emacs supports them in the bare
3740 C code. But, in the future, they may be supported only by CCL. */
3741
3742 /* SJIS is a coding system encoding three character sets: ASCII, right
3743 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
3744 as is. A character of charset katakana-jisx0201 is encoded by
3745 "position-code + 0x80". A character of charset japanese-jisx0208
3746 is encoded in 2-byte but two position-codes are divided and shifted
3747 so that it fit in the range below.
3748
3749 --- CODE RANGE of SJIS ---
3750 (character set) (range)
3751 ASCII 0x00 .. 0x7F
3752 KATAKANA-JISX0201 0xA0 .. 0xDF
3753 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF
3754 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
3755 -------------------------------
3756
3757 */
3758
3759 /* BIG5 is a coding system encoding two character sets: ASCII and
3760 Big5. An ASCII character is encoded as is. Big5 is a two-byte
3761 character set and is encoded in two-byte.
3762
3763 --- CODE RANGE of BIG5 ---
3764 (character set) (range)
3765 ASCII 0x00 .. 0x7F
3766 Big5 (1st byte) 0xA1 .. 0xFE
3767 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
3768 --------------------------
3769
3770 */
3771
3772 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3773 Check if a text is encoded in SJIS. If it is, return
3774 CATEGORY_MASK_SJIS, else return 0. */
3775
3776 static int
3777 detect_coding_sjis (coding, detect_info)
3778 struct coding_system *coding;
3779 struct coding_detection_info *detect_info;
3780 {
3781 unsigned char *src = coding->source, *src_base = src;
3782 unsigned char *src_end = coding->source + coding->src_bytes;
3783 int multibytep = coding->src_multibyte;
3784 int consumed_chars = 0;
3785 int found = 0;
3786 int c;
3787 int incomplete;
3788
3789 detect_info->checked |= CATEGORY_MASK_SJIS;
3790 /* A coding system of this category is always ASCII compatible. */
3791 src += coding->head_ascii;
3792
3793 while (1)
3794 {
3795 incomplete = 0;
3796 ONE_MORE_BYTE (c);
3797 incomplete = 1;
3798 if (c < 0x80)
3799 continue;
3800 if ((c >= 0x81 && c <= 0x9F) || (c >= 0xE0 && c <= 0xEF))
3801 {
3802 ONE_MORE_BYTE (c);
3803 if (c < 0x40 || c == 0x7F || c > 0xFC)
3804 break;
3805 found = CATEGORY_MASK_SJIS;
3806 }
3807 else if (c >= 0xA0 && c < 0xE0)
3808 found = CATEGORY_MASK_SJIS;
3809 else
3810 break;
3811 }
3812 detect_info->rejected |= CATEGORY_MASK_SJIS;
3813 return 0;
3814
3815 no_more_source:
3816 if (incomplete && coding->mode & CODING_MODE_LAST_BLOCK)
3817 {
3818 detect_info->rejected |= CATEGORY_MASK_SJIS;
3819 return 0;
3820 }
3821 detect_info->found |= found;
3822 return 1;
3823 }
3824
3825 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3826 Check if a text is encoded in BIG5. If it is, return
3827 CATEGORY_MASK_BIG5, else return 0. */
3828
3829 static int
3830 detect_coding_big5 (coding, detect_info)
3831 struct coding_system *coding;
3832 struct coding_detection_info *detect_info;
3833 {
3834 unsigned char *src = coding->source, *src_base = src;
3835 unsigned char *src_end = coding->source + coding->src_bytes;
3836 int multibytep = coding->src_multibyte;
3837 int consumed_chars = 0;
3838 int found = 0;
3839 int c;
3840 int incomplete;
3841
3842 detect_info->checked |= CATEGORY_MASK_BIG5;
3843 /* A coding system of this category is always ASCII compatible. */
3844 src += coding->head_ascii;
3845
3846 while (1)
3847 {
3848 incomplete = 0;
3849 ONE_MORE_BYTE (c);
3850 incomplete = 1;
3851 if (c < 0x80)
3852 continue;
3853 if (c >= 0xA1)
3854 {
3855 ONE_MORE_BYTE (c);
3856 if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
3857 return 0;
3858 found = CATEGORY_MASK_BIG5;
3859 }
3860 else
3861 break;
3862 }
3863 detect_info->rejected |= CATEGORY_MASK_BIG5;
3864 return 0;
3865
3866 no_more_source:
3867 if (incomplete && coding->mode & CODING_MODE_LAST_BLOCK)
3868 {
3869 detect_info->rejected |= CATEGORY_MASK_BIG5;
3870 return 0;
3871 }
3872 detect_info->found |= found;
3873 return 1;
3874 }
3875
3876 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
3877 If SJIS_P is 1, decode SJIS text, else decode BIG5 test. */
3878
3879 static void
3880 decode_coding_sjis (coding)
3881 struct coding_system *coding;
3882 {
3883 unsigned char *src = coding->source + coding->consumed;
3884 unsigned char *src_end = coding->source + coding->src_bytes;
3885 unsigned char *src_base;
3886 int *charbuf = coding->charbuf;
3887 int *charbuf_end = charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
3888 int consumed_chars = 0, consumed_chars_base;
3889 int multibytep = coding->src_multibyte;
3890 struct charset *charset_roman, *charset_kanji, *charset_kana;
3891 Lisp_Object attrs, eol_type, charset_list, val;
3892 int char_offset = coding->produced_char;
3893 int last_offset = char_offset;
3894 int last_id = charset_ascii;
3895
3896 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
3897
3898 val = charset_list;
3899 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
3900 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
3901 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
3902
3903 while (1)
3904 {
3905 int c, c1;
3906
3907 src_base = src;
3908 consumed_chars_base = consumed_chars;
3909
3910 if (charbuf >= charbuf_end)
3911 break;
3912
3913 ONE_MORE_BYTE (c);
3914
3915 if (c == '\r')
3916 {
3917 if (EQ (eol_type, Qdos))
3918 {
3919 if (src == src_end)
3920 {
3921 coding->result = CODING_RESULT_INSUFFICIENT_SRC;
3922 goto no_more_source;
3923 }
3924 if (*src == '\n')
3925 ONE_MORE_BYTE (c);
3926 }
3927 else if (EQ (eol_type, Qmac))
3928 c = '\n';
3929 }
3930 else
3931 {
3932 struct charset *charset;
3933
3934 if (c < 0x80)
3935 charset = charset_roman;
3936 else
3937 {
3938 if (c >= 0xF0)
3939 goto invalid_code;
3940 if (c < 0xA0 || c >= 0xE0)
3941 {
3942 /* SJIS -> JISX0208 */
3943 ONE_MORE_BYTE (c1);
3944 if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
3945 goto invalid_code;
3946 c = (c << 8) | c1;
3947 SJIS_TO_JIS (c);
3948 charset = charset_kanji;
3949 }
3950 else
3951 {
3952 /* SJIS -> JISX0201-Kana */
3953 c &= 0x7F;
3954 charset = charset_kana;
3955 }
3956 }
3957 if (charset->id != charset_ascii
3958 && last_id != charset->id)
3959 {
3960 if (last_id != charset_ascii)
3961 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
3962 last_id = charset->id;
3963 last_offset = char_offset;
3964 }
3965 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
3966 }
3967 *charbuf++ = c;
3968 char_offset++;
3969 continue;
3970
3971 invalid_code:
3972 src = src_base;
3973 consumed_chars = consumed_chars_base;
3974 ONE_MORE_BYTE (c);
3975 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
3976 char_offset++;
3977 coding->errors++;
3978 }
3979
3980 no_more_source:
3981 if (last_id != charset_ascii)
3982 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
3983 coding->consumed_char += consumed_chars_base;
3984 coding->consumed = src_base - coding->source;
3985 coding->charbuf_used = charbuf - coding->charbuf;
3986 }
3987
3988 static void
3989 decode_coding_big5 (coding)
3990 struct coding_system *coding;
3991 {
3992 unsigned char *src = coding->source + coding->consumed;
3993 unsigned char *src_end = coding->source + coding->src_bytes;
3994 unsigned char *src_base;
3995 int *charbuf = coding->charbuf;
3996 int *charbuf_end = charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
3997 int consumed_chars = 0, consumed_chars_base;
3998 int multibytep = coding->src_multibyte;
3999 struct charset *charset_roman, *charset_big5;
4000 Lisp_Object attrs, eol_type, charset_list, val;
4001 int char_offset = coding->produced_char;
4002 int last_offset = char_offset;
4003 int last_id = charset_ascii;
4004
4005 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
4006 val = charset_list;
4007 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4008 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4009
4010 while (1)
4011 {
4012 int c, c1;
4013
4014 src_base = src;
4015 consumed_chars_base = consumed_chars;
4016
4017 if (charbuf >= charbuf_end)
4018 break;
4019
4020 ONE_MORE_BYTE (c);
4021
4022 if (c == '\r')
4023 {
4024 if (EQ (eol_type, Qdos))
4025 {
4026 if (src == src_end)
4027 {
4028 coding->result = CODING_RESULT_INSUFFICIENT_SRC;
4029 goto no_more_source;
4030 }
4031 if (*src == '\n')
4032 ONE_MORE_BYTE (c);
4033 }
4034 else if (EQ (eol_type, Qmac))
4035 c = '\n';
4036 }
4037 else
4038 {
4039 struct charset *charset;
4040 if (c < 0x80)
4041 charset = charset_roman;
4042 else
4043 {
4044 /* BIG5 -> Big5 */
4045 if (c < 0xA1 || c > 0xFE)
4046 goto invalid_code;
4047 ONE_MORE_BYTE (c1);
4048 if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4049 goto invalid_code;
4050 c = c << 8 | c1;
4051 charset = charset_big5;
4052 }
4053 if (charset->id != charset_ascii
4054 && last_id != charset->id)
4055 {
4056 if (last_id != charset_ascii)
4057 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4058 last_id = charset->id;
4059 last_offset = char_offset;
4060 }
4061 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4062 }
4063
4064 *charbuf++ = c;
4065 char_offset++;
4066 continue;
4067
4068 invalid_code:
4069 src = src_base;
4070 consumed_chars = consumed_chars_base;
4071 ONE_MORE_BYTE (c);
4072 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
4073 char_offset++;
4074 coding->errors++;
4075 }
4076
4077 no_more_source:
4078 if (last_id != charset_ascii)
4079 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4080 coding->consumed_char += consumed_chars_base;
4081 coding->consumed = src_base - coding->source;
4082 coding->charbuf_used = charbuf - coding->charbuf;
4083 }
4084
4085 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4086 This function can encode charsets `ascii', `katakana-jisx0201',
4087 `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'. We
4088 are sure that all these charsets are registered as official charset
4089 (i.e. do not have extended leading-codes). Characters of other
4090 charsets are produced without any encoding. If SJIS_P is 1, encode
4091 SJIS text, else encode BIG5 text. */
4092
4093 static int
4094 encode_coding_sjis (coding)
4095 struct coding_system *coding;
4096 {
4097 int multibytep = coding->dst_multibyte;
4098 int *charbuf = coding->charbuf;
4099 int *charbuf_end = charbuf + coding->charbuf_used;
4100 unsigned char *dst = coding->destination + coding->produced;
4101 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4102 int safe_room = 4;
4103 int produced_chars = 0;
4104 Lisp_Object attrs, eol_type, charset_list, val;
4105 int ascii_compatible;
4106 struct charset *charset_roman, *charset_kanji, *charset_kana;
4107 int c;
4108
4109 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
4110 val = charset_list;
4111 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4112 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4113 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
4114
4115 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4116
4117 while (charbuf < charbuf_end)
4118 {
4119 ASSURE_DESTINATION (safe_room);
4120 c = *charbuf++;
4121 /* Now encode the character C. */
4122 if (ASCII_CHAR_P (c) && ascii_compatible)
4123 EMIT_ONE_ASCII_BYTE (c);
4124 else if (CHAR_BYTE8_P (c))
4125 {
4126 c = CHAR_TO_BYTE8 (c);
4127 EMIT_ONE_BYTE (c);
4128 }
4129 else
4130 {
4131 unsigned code;
4132 struct charset *charset = char_charset (c, charset_list, &code);
4133
4134 if (!charset)
4135 {
4136 if (coding->mode & CODING_MODE_SAFE_ENCODING)
4137 {
4138 code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4139 charset = CHARSET_FROM_ID (charset_ascii);
4140 }
4141 else
4142 {
4143 c = coding->default_char;
4144 charset = char_charset (c, charset_list, &code);
4145 }
4146 }
4147 if (code == CHARSET_INVALID_CODE (charset))
4148 abort ();
4149 if (charset == charset_kanji)
4150 {
4151 int c1, c2;
4152 JIS_TO_SJIS (code);
4153 c1 = code >> 8, c2 = code & 0xFF;
4154 EMIT_TWO_BYTES (c1, c2);
4155 }
4156 else if (charset == charset_kana)
4157 EMIT_ONE_BYTE (code | 0x80);
4158 else
4159 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4160 }
4161 }
4162 coding->result = CODING_RESULT_SUCCESS;
4163 coding->produced_char += produced_chars;
4164 coding->produced = dst - coding->destination;
4165 return 0;
4166 }
4167
4168 static int
4169 encode_coding_big5 (coding)
4170 struct coding_system *coding;
4171 {
4172 int multibytep = coding->dst_multibyte;
4173 int *charbuf = coding->charbuf;
4174 int *charbuf_end = charbuf + coding->charbuf_used;
4175 unsigned char *dst = coding->destination + coding->produced;
4176 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4177 int safe_room = 4;
4178 int produced_chars = 0;
4179 Lisp_Object attrs, eol_type, charset_list, val;
4180 int ascii_compatible;
4181 struct charset *charset_roman, *charset_big5;
4182 int c;
4183
4184 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
4185 val = charset_list;
4186 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4187 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4188 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4189
4190 while (charbuf < charbuf_end)
4191 {
4192 ASSURE_DESTINATION (safe_room);
4193 c = *charbuf++;
4194 /* Now encode the character C. */
4195 if (ASCII_CHAR_P (c) && ascii_compatible)
4196 EMIT_ONE_ASCII_BYTE (c);
4197 else if (CHAR_BYTE8_P (c))
4198 {
4199 c = CHAR_TO_BYTE8 (c);
4200 EMIT_ONE_BYTE (c);
4201 }
4202 else
4203 {
4204 unsigned code;
4205 struct charset *charset = char_charset (c, charset_list, &code);
4206
4207 if (! charset)
4208 {
4209 if (coding->mode & CODING_MODE_SAFE_ENCODING)
4210 {
4211 code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4212 charset = CHARSET_FROM_ID (charset_ascii);
4213 }
4214 else
4215 {
4216 c = coding->default_char;
4217 charset = char_charset (c, charset_list, &code);
4218 }
4219 }
4220 if (code == CHARSET_INVALID_CODE (charset))
4221 abort ();
4222 if (charset == charset_big5)
4223 {
4224 int c1, c2;
4225
4226 c1 = code >> 8, c2 = code & 0xFF;
4227 EMIT_TWO_BYTES (c1, c2);
4228 }
4229 else
4230 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4231 }
4232 }
4233 coding->result = CODING_RESULT_SUCCESS;
4234 coding->produced_char += produced_chars;
4235 coding->produced = dst - coding->destination;
4236 return 0;
4237 }
4238
4239 \f
4240 /*** 10. CCL handlers ***/
4241
4242 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4243 Check if a text is encoded in a coding system of which
4244 encoder/decoder are written in CCL program. If it is, return
4245 CATEGORY_MASK_CCL, else return 0. */
4246
4247 static int
4248 detect_coding_ccl (coding, detect_info)
4249 struct coding_system *coding;
4250 struct coding_detection_info *detect_info;
4251 {
4252 unsigned char *src = coding->source, *src_base = src;
4253 unsigned char *src_end = coding->source + coding->src_bytes;
4254 int multibytep = coding->src_multibyte;
4255 int consumed_chars = 0;
4256 int found = 0;
4257 unsigned char *valids = CODING_CCL_VALIDS (coding);
4258 int head_ascii = coding->head_ascii;
4259 Lisp_Object attrs;
4260
4261 detect_info->checked |= CATEGORY_MASK_CCL;
4262
4263 coding = &coding_categories[coding_category_ccl];
4264 attrs = CODING_ID_ATTRS (coding->id);
4265 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
4266 src += head_ascii;
4267
4268 while (1)
4269 {
4270 int c;
4271 ONE_MORE_BYTE (c);
4272 if (! valids[c])
4273 break;
4274 if ((valids[c] > 1))
4275 found = CATEGORY_MASK_CCL;
4276 }
4277 detect_info->rejected |= CATEGORY_MASK_CCL;
4278 return 0;
4279
4280 no_more_source:
4281 detect_info->found |= found;
4282 return 1;
4283 }
4284
4285 static void
4286 decode_coding_ccl (coding)
4287 struct coding_system *coding;
4288 {
4289 const unsigned char *src = coding->source + coding->consumed;
4290 unsigned char *src_end = coding->source + coding->src_bytes;
4291 int *charbuf = coding->charbuf;
4292 int *charbuf_end = charbuf + coding->charbuf_size;
4293 int consumed_chars = 0;
4294 int multibytep = coding->src_multibyte;
4295 struct ccl_program ccl;
4296 int source_charbuf[1024];
4297 int source_byteidx[1024];
4298 Lisp_Object attrs, eol_type, charset_list, valids;
4299
4300 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
4301 setup_ccl_program (&ccl, CODING_CCL_DECODER (coding));
4302
4303 while (src < src_end)
4304 {
4305 const unsigned char *p = src;
4306 int *source, *source_end;
4307 int i = 0;
4308
4309 if (multibytep)
4310 while (i < 1024 && p < src_end)
4311 {
4312 source_byteidx[i] = p - src;
4313 source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
4314 }
4315 else
4316 while (i < 1024 && p < src_end)
4317 source_charbuf[i++] = *p++;
4318
4319 if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
4320 ccl.last_block = 1;
4321
4322 source = source_charbuf;
4323 source_end = source + i;
4324 while (source < source_end)
4325 {
4326 ccl_driver (&ccl, source, charbuf,
4327 source_end - source, charbuf_end - charbuf,
4328 charset_list);
4329 source += ccl.consumed;
4330 charbuf += ccl.produced;
4331 if (ccl.status != CCL_STAT_SUSPEND_BY_DST)
4332 break;
4333 }
4334 if (source < source_end)
4335 src += source_byteidx[source - source_charbuf];
4336 else
4337 src = p;
4338 consumed_chars += source - source_charbuf;
4339
4340 if (ccl.status != CCL_STAT_SUSPEND_BY_SRC
4341 && ccl.status != CODING_RESULT_INSUFFICIENT_SRC)
4342 break;
4343 }
4344
4345 switch (ccl.status)
4346 {
4347 case CCL_STAT_SUSPEND_BY_SRC:
4348 coding->result = CODING_RESULT_INSUFFICIENT_SRC;
4349 break;
4350 case CCL_STAT_SUSPEND_BY_DST:
4351 break;
4352 case CCL_STAT_QUIT:
4353 case CCL_STAT_INVALID_CMD:
4354 coding->result = CODING_RESULT_INTERRUPT;
4355 break;
4356 default:
4357 coding->result = CODING_RESULT_SUCCESS;
4358 break;
4359 }
4360 coding->consumed_char += consumed_chars;
4361 coding->consumed = src - coding->source;
4362 coding->charbuf_used = charbuf - coding->charbuf;
4363 }
4364
4365 static int
4366 encode_coding_ccl (coding)
4367 struct coding_system *coding;
4368 {
4369 struct ccl_program ccl;
4370 int multibytep = coding->dst_multibyte;
4371 int *charbuf = coding->charbuf;
4372 int *charbuf_end = charbuf + coding->charbuf_used;
4373 unsigned char *dst = coding->destination + coding->produced;
4374 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4375 unsigned char *adjusted_dst_end = dst_end - 1;
4376 int destination_charbuf[1024];
4377 int i, produced_chars = 0;
4378 Lisp_Object attrs, eol_type, charset_list;
4379
4380 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
4381 setup_ccl_program (&ccl, CODING_CCL_ENCODER (coding));
4382
4383 ccl.last_block = coding->mode & CODING_MODE_LAST_BLOCK;
4384 ccl.dst_multibyte = coding->dst_multibyte;
4385
4386 while (charbuf < charbuf_end && dst < adjusted_dst_end)
4387 {
4388 int dst_bytes = dst_end - dst;
4389 if (dst_bytes > 1024)
4390 dst_bytes = 1024;
4391
4392 ccl_driver (&ccl, charbuf, destination_charbuf,
4393 charbuf_end - charbuf, dst_bytes, charset_list);
4394 charbuf += ccl.consumed;
4395 if (multibytep)
4396 for (i = 0; i < ccl.produced; i++)
4397 EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
4398 else
4399 {
4400 for (i = 0; i < ccl.produced; i++)
4401 *dst++ = destination_charbuf[i] & 0xFF;
4402 produced_chars += ccl.produced;
4403 }
4404 }
4405
4406 switch (ccl.status)
4407 {
4408 case CCL_STAT_SUSPEND_BY_SRC:
4409 coding->result = CODING_RESULT_INSUFFICIENT_SRC;
4410 break;
4411 case CCL_STAT_SUSPEND_BY_DST:
4412 coding->result = CODING_RESULT_INSUFFICIENT_DST;
4413 break;
4414 case CCL_STAT_QUIT:
4415 case CCL_STAT_INVALID_CMD:
4416 coding->result = CODING_RESULT_INTERRUPT;
4417 break;
4418 default:
4419 coding->result = CODING_RESULT_SUCCESS;
4420 break;
4421 }
4422
4423 coding->produced_char += produced_chars;
4424 coding->produced = dst - coding->destination;
4425 return 0;
4426 }
4427
4428
4429 \f
4430 /*** 10, 11. no-conversion handlers ***/
4431
4432 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
4433
4434 static void
4435 decode_coding_raw_text (coding)
4436 struct coding_system *coding;
4437 {
4438 coding->chars_at_source = 1;
4439 coding->consumed_char = 0;
4440 coding->consumed = 0;
4441 coding->result = CODING_RESULT_SUCCESS;
4442 }
4443
4444 static int
4445 encode_coding_raw_text (coding)
4446 struct coding_system *coding;
4447 {
4448 int multibytep = coding->dst_multibyte;
4449 int *charbuf = coding->charbuf;
4450 int *charbuf_end = coding->charbuf + coding->charbuf_used;
4451 unsigned char *dst = coding->destination + coding->produced;
4452 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4453 int produced_chars = 0;
4454 int c;
4455
4456 if (multibytep)
4457 {
4458 int safe_room = MAX_MULTIBYTE_LENGTH * 2;
4459
4460 if (coding->src_multibyte)
4461 while (charbuf < charbuf_end)
4462 {
4463 ASSURE_DESTINATION (safe_room);
4464 c = *charbuf++;
4465 if (ASCII_CHAR_P (c))
4466 EMIT_ONE_ASCII_BYTE (c);
4467 else if (CHAR_BYTE8_P (c))
4468 {
4469 c = CHAR_TO_BYTE8 (c);
4470 EMIT_ONE_BYTE (c);
4471 }
4472 else
4473 {
4474 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
4475
4476 CHAR_STRING_ADVANCE (c, p1);
4477 while (p0 < p1)
4478 {
4479 EMIT_ONE_BYTE (*p0);
4480 p0++;
4481 }
4482 }
4483 }
4484 else
4485 while (charbuf < charbuf_end)
4486 {
4487 ASSURE_DESTINATION (safe_room);
4488 c = *charbuf++;
4489 EMIT_ONE_BYTE (c);
4490 }
4491 }
4492 else
4493 {
4494 if (coding->src_multibyte)
4495 {
4496 int safe_room = MAX_MULTIBYTE_LENGTH;
4497
4498 while (charbuf < charbuf_end)
4499 {
4500 ASSURE_DESTINATION (safe_room);
4501 c = *charbuf++;
4502 if (ASCII_CHAR_P (c))
4503 *dst++ = c;
4504 else if (CHAR_BYTE8_P (c))
4505 *dst++ = CHAR_TO_BYTE8 (c);
4506 else
4507 CHAR_STRING_ADVANCE (c, dst);
4508 produced_chars++;
4509 }
4510 }
4511 else
4512 {
4513 ASSURE_DESTINATION (charbuf_end - charbuf);
4514 while (charbuf < charbuf_end && dst < dst_end)
4515 *dst++ = *charbuf++;
4516 produced_chars = dst - (coding->destination + coding->dst_bytes);
4517 }
4518 }
4519 coding->result = CODING_RESULT_SUCCESS;
4520 coding->produced_char += produced_chars;
4521 coding->produced = dst - coding->destination;
4522 return 0;
4523 }
4524
4525 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4526 Check if a text is encoded in a charset-based coding system. If it
4527 is, return 1, else return 0. */
4528
4529 static int
4530 detect_coding_charset (coding, detect_info)
4531 struct coding_system *coding;
4532 struct coding_detection_info *detect_info;
4533 {
4534 unsigned char *src = coding->source, *src_base = src;
4535 unsigned char *src_end = coding->source + coding->src_bytes;
4536 int multibytep = coding->src_multibyte;
4537 int consumed_chars = 0;
4538 Lisp_Object attrs, valids;
4539 int found = 0;
4540
4541 detect_info->checked |= CATEGORY_MASK_CHARSET;
4542
4543 coding = &coding_categories[coding_category_charset];
4544 attrs = CODING_ID_ATTRS (coding->id);
4545 valids = AREF (attrs, coding_attr_charset_valids);
4546
4547 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
4548 src += coding->head_ascii;
4549
4550 while (1)
4551 {
4552 int c;
4553
4554 ONE_MORE_BYTE (c);
4555 if (NILP (AREF (valids, c)))
4556 break;
4557 if (c >= 0x80)
4558 found = CATEGORY_MASK_CHARSET;
4559 }
4560 detect_info->rejected |= CATEGORY_MASK_CHARSET;
4561 return 0;
4562
4563 no_more_source:
4564 detect_info->found |= found;
4565 return 1;
4566 }
4567
4568 static void
4569 decode_coding_charset (coding)
4570 struct coding_system *coding;
4571 {
4572 unsigned char *src = coding->source + coding->consumed;
4573 unsigned char *src_end = coding->source + coding->src_bytes;
4574 unsigned char *src_base;
4575 int *charbuf = coding->charbuf;
4576 int *charbuf_end = charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
4577 int consumed_chars = 0, consumed_chars_base;
4578 int multibytep = coding->src_multibyte;
4579 Lisp_Object attrs, eol_type, charset_list, valids;
4580 int char_offset = coding->produced_char;
4581 int last_offset = char_offset;
4582 int last_id = charset_ascii;
4583
4584 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
4585 valids = AREF (attrs, coding_attr_charset_valids);
4586
4587 while (1)
4588 {
4589 int c;
4590
4591 src_base = src;
4592 consumed_chars_base = consumed_chars;
4593
4594 if (charbuf >= charbuf_end)
4595 break;
4596
4597 ONE_MORE_BYTE (c);
4598 if (c == '\r')
4599 {
4600 /* Here we assume that no charset maps '\r' to something
4601 else. */
4602 if (EQ (eol_type, Qdos))
4603 {
4604 if (src == src_end)
4605 {
4606 coding->result = CODING_RESULT_INSUFFICIENT_SRC;
4607 goto no_more_source;
4608 }
4609 if (*src == '\n')
4610 ONE_MORE_BYTE (c);
4611 }
4612 else if (EQ (eol_type, Qmac))
4613 c = '\n';
4614 }
4615 else
4616 {
4617 Lisp_Object val;
4618 struct charset *charset;
4619 int dim;
4620 int len = 1;
4621 unsigned code = c;
4622
4623 val = AREF (valids, c);
4624 if (NILP (val))
4625 goto invalid_code;
4626 if (INTEGERP (val))
4627 {
4628 charset = CHARSET_FROM_ID (XFASTINT (val));
4629 dim = CHARSET_DIMENSION (charset);
4630 while (len < dim)
4631 {
4632 ONE_MORE_BYTE (c);
4633 code = (code << 8) | c;
4634 len++;
4635 }
4636 CODING_DECODE_CHAR (coding, src, src_base, src_end,
4637 charset, code, c);
4638 }
4639 else
4640 {
4641 /* VAL is a list of charset IDs. It is assured that the
4642 list is sorted by charset dimensions (smaller one
4643 comes first). */
4644 while (CONSP (val))
4645 {
4646 charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
4647 dim = CHARSET_DIMENSION (charset);
4648 while (len < dim)
4649 {
4650 ONE_MORE_BYTE (c);
4651 code = (code << 8) | c;
4652 len++;
4653 }
4654 CODING_DECODE_CHAR (coding, src, src_base,
4655 src_end, charset, code, c);
4656 if (c >= 0)
4657 break;
4658 val = XCDR (val);
4659 }
4660 }
4661 if (c < 0)
4662 goto invalid_code;
4663 if (charset->id != charset_ascii
4664 && last_id != charset->id)
4665 {
4666 if (last_id != charset_ascii)
4667 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4668 last_id = charset->id;
4669 last_offset = char_offset;
4670 }
4671 }
4672 *charbuf++ = c;
4673 char_offset++;
4674 continue;
4675
4676 invalid_code:
4677 src = src_base;
4678 consumed_chars = consumed_chars_base;
4679 ONE_MORE_BYTE (c);
4680 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
4681 char_offset++;
4682 coding->errors++;
4683 }
4684
4685 no_more_source:
4686 if (last_id != charset_ascii)
4687 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4688 coding->consumed_char += consumed_chars_base;
4689 coding->consumed = src_base - coding->source;
4690 coding->charbuf_used = charbuf - coding->charbuf;
4691 }
4692
4693 static int
4694 encode_coding_charset (coding)
4695 struct coding_system *coding;
4696 {
4697 int multibytep = coding->dst_multibyte;
4698 int *charbuf = coding->charbuf;
4699 int *charbuf_end = charbuf + coding->charbuf_used;
4700 unsigned char *dst = coding->destination + coding->produced;
4701 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4702 int safe_room = MAX_MULTIBYTE_LENGTH;
4703 int produced_chars = 0;
4704 Lisp_Object attrs, eol_type, charset_list;
4705 int ascii_compatible;
4706 int c;
4707
4708 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
4709 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4710
4711 while (charbuf < charbuf_end)
4712 {
4713 struct charset *charset;
4714 unsigned code;
4715
4716 ASSURE_DESTINATION (safe_room);
4717 c = *charbuf++;
4718 if (ascii_compatible && ASCII_CHAR_P (c))
4719 EMIT_ONE_ASCII_BYTE (c);
4720 else if (CHAR_BYTE8_P (c))
4721 {
4722 c = CHAR_TO_BYTE8 (c);
4723 EMIT_ONE_BYTE (c);
4724 }
4725 else
4726 {
4727 charset = char_charset (c, charset_list, &code);
4728 if (charset)
4729 {
4730 if (CHARSET_DIMENSION (charset) == 1)
4731 EMIT_ONE_BYTE (code);
4732 else if (CHARSET_DIMENSION (charset) == 2)
4733 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
4734 else if (CHARSET_DIMENSION (charset) == 3)
4735 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
4736 else
4737 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
4738 (code >> 8) & 0xFF, code & 0xFF);
4739 }
4740 else
4741 {
4742 if (coding->mode & CODING_MODE_SAFE_ENCODING)
4743 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4744 else
4745 c = coding->default_char;
4746 EMIT_ONE_BYTE (c);
4747 }
4748 }
4749 }
4750
4751 coding->result = CODING_RESULT_SUCCESS;
4752 coding->produced_char += produced_chars;
4753 coding->produced = dst - coding->destination;
4754 return 0;
4755 }
4756
4757 \f
4758 /*** 7. C library functions ***/
4759
4760 /* Setup coding context CODING from information about CODING_SYSTEM.
4761 If CODING_SYSTEM is nil, `no-conversion' is assumed. If
4762 CODING_SYSTEM is invalid, signal an error. */
4763
4764 void
4765 setup_coding_system (coding_system, coding)
4766 Lisp_Object coding_system;
4767 struct coding_system *coding;
4768 {
4769 Lisp_Object attrs;
4770 Lisp_Object eol_type;
4771 Lisp_Object coding_type;
4772 Lisp_Object val;
4773
4774 if (NILP (coding_system))
4775 coding_system = Qno_conversion;
4776
4777 CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
4778
4779 attrs = CODING_ID_ATTRS (coding->id);
4780 eol_type = CODING_ID_EOL_TYPE (coding->id);
4781
4782 coding->mode = 0;
4783 coding->head_ascii = -1;
4784 coding->common_flags
4785 = (VECTORP (eol_type) ? CODING_REQUIRE_DETECTION_MASK : 0);
4786
4787 val = CODING_ATTR_SAFE_CHARSETS (attrs);
4788 coding->max_charset_id = XSTRING (val)->size - 1;
4789 coding->safe_charsets = (char *) XSTRING (val)->data;
4790 coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
4791
4792 coding_type = CODING_ATTR_TYPE (attrs);
4793 if (EQ (coding_type, Qundecided))
4794 {
4795 coding->detector = NULL;
4796 coding->decoder = decode_coding_raw_text;
4797 coding->encoder = encode_coding_raw_text;
4798 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
4799 }
4800 else if (EQ (coding_type, Qiso_2022))
4801 {
4802 int i;
4803 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
4804 enum coding_category category = XINT (CODING_ATTR_CATEGORY (attrs));
4805
4806 /* Invoke graphic register 0 to plane 0. */
4807 CODING_ISO_INVOCATION (coding, 0) = 0;
4808 /* Invoke graphic register 1 to plane 1 if we can use 8-bit. */
4809 CODING_ISO_INVOCATION (coding, 1)
4810 = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
4811 /* Setup the initial status of designation. */
4812 for (i = 0; i < 4; i++)
4813 CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
4814 /* Not single shifting initially. */
4815 CODING_ISO_SINGLE_SHIFTING (coding) = 0;
4816 /* Beginning of buffer should also be regarded as bol. */
4817 CODING_ISO_BOL (coding) = 1;
4818 coding->detector = detect_coding_iso_2022;
4819 coding->decoder = decode_coding_iso_2022;
4820 coding->encoder = encode_coding_iso_2022;
4821 if (flags & CODING_ISO_FLAG_SAFE)
4822 coding->mode |= CODING_MODE_SAFE_ENCODING;
4823 coding->common_flags
4824 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
4825 | CODING_REQUIRE_FLUSHING_MASK);
4826 if (flags & CODING_ISO_FLAG_COMPOSITION)
4827 coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
4828 if (flags & CODING_ISO_FLAG_DESIGNATION)
4829 coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
4830 if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
4831 {
4832 setup_iso_safe_charsets (attrs);
4833 val = CODING_ATTR_SAFE_CHARSETS (attrs);
4834 coding->max_charset_id = XSTRING (val)->size - 1;
4835 coding->safe_charsets = (char *) XSTRING (val)->data;
4836 }
4837 CODING_ISO_FLAGS (coding) = flags;
4838 }
4839 else if (EQ (coding_type, Qcharset))
4840 {
4841 coding->detector = detect_coding_charset;
4842 coding->decoder = decode_coding_charset;
4843 coding->encoder = encode_coding_charset;
4844 coding->common_flags
4845 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4846 }
4847 else if (EQ (coding_type, Qutf_8))
4848 {
4849 coding->detector = detect_coding_utf_8;
4850 coding->decoder = decode_coding_utf_8;
4851 coding->encoder = encode_coding_utf_8;
4852 coding->common_flags
4853 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4854 }
4855 else if (EQ (coding_type, Qutf_16))
4856 {
4857 val = AREF (attrs, coding_attr_utf_16_bom);
4858 CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_16_detect_bom
4859 : EQ (val, Qt) ? utf_16_with_bom
4860 : utf_16_without_bom);
4861 val = AREF (attrs, coding_attr_utf_16_endian);
4862 CODING_UTF_16_ENDIAN (coding) = (NILP (val) ? utf_16_big_endian
4863 : utf_16_little_endian);
4864 CODING_UTF_16_SURROGATE (coding) = 0;
4865 coding->detector = detect_coding_utf_16;
4866 coding->decoder = decode_coding_utf_16;
4867 coding->encoder = encode_coding_utf_16;
4868 coding->common_flags
4869 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4870 }
4871 else if (EQ (coding_type, Qccl))
4872 {
4873 coding->detector = detect_coding_ccl;
4874 coding->decoder = decode_coding_ccl;
4875 coding->encoder = encode_coding_ccl;
4876 coding->common_flags
4877 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
4878 | CODING_REQUIRE_FLUSHING_MASK);
4879 }
4880 else if (EQ (coding_type, Qemacs_mule))
4881 {
4882 coding->detector = detect_coding_emacs_mule;
4883 coding->decoder = decode_coding_emacs_mule;
4884 coding->encoder = encode_coding_emacs_mule;
4885 coding->common_flags
4886 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4887 if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
4888 && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
4889 {
4890 Lisp_Object tail, safe_charsets;
4891 int max_charset_id = 0;
4892
4893 for (tail = Vemacs_mule_charset_list; CONSP (tail);
4894 tail = XCDR (tail))
4895 if (max_charset_id < XFASTINT (XCAR (tail)))
4896 max_charset_id = XFASTINT (XCAR (tail));
4897 safe_charsets = Fmake_string (make_number (max_charset_id + 1),
4898 make_number (255));
4899 for (tail = Vemacs_mule_charset_list; CONSP (tail);
4900 tail = XCDR (tail))
4901 XSTRING (safe_charsets)->data[XFASTINT (XCAR (tail))] = 0;
4902 coding->max_charset_id = max_charset_id;
4903 coding->safe_charsets = (char *) XSTRING (safe_charsets)->data;
4904 }
4905 }
4906 else if (EQ (coding_type, Qshift_jis))
4907 {
4908 coding->detector = detect_coding_sjis;
4909 coding->decoder = decode_coding_sjis;
4910 coding->encoder = encode_coding_sjis;
4911 coding->common_flags
4912 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4913 }
4914 else if (EQ (coding_type, Qbig5))
4915 {
4916 coding->detector = detect_coding_big5;
4917 coding->decoder = decode_coding_big5;
4918 coding->encoder = encode_coding_big5;
4919 coding->common_flags
4920 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4921 }
4922 else /* EQ (coding_type, Qraw_text) */
4923 {
4924 coding->detector = NULL;
4925 coding->decoder = decode_coding_raw_text;
4926 coding->encoder = encode_coding_raw_text;
4927 coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
4928 }
4929
4930 return;
4931 }
4932
4933 /* Return raw-text or one of its subsidiaries that has the same
4934 eol_type as CODING-SYSTEM. */
4935
4936 Lisp_Object
4937 raw_text_coding_system (coding_system)
4938 Lisp_Object coding_system;
4939 {
4940 Lisp_Object spec, attrs;
4941 Lisp_Object eol_type, raw_text_eol_type;
4942
4943 spec = CODING_SYSTEM_SPEC (coding_system);
4944 attrs = AREF (spec, 0);
4945
4946 if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
4947 return coding_system;
4948
4949 eol_type = AREF (spec, 2);
4950 if (VECTORP (eol_type))
4951 return Qraw_text;
4952 spec = CODING_SYSTEM_SPEC (Qraw_text);
4953 raw_text_eol_type = AREF (spec, 2);
4954 return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
4955 : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
4956 : AREF (raw_text_eol_type, 2));
4957 }
4958
4959
4960 /* If CODING_SYSTEM doesn't specify end-of-line format but PARENT
4961 does, return one of the subsidiary that has the same eol-spec as
4962 PARENT. Otherwise, return CODING_SYSTEM. */
4963
4964 Lisp_Object
4965 coding_inherit_eol_type (coding_system, parent)
4966 Lisp_Object coding_system, parent;
4967 {
4968 Lisp_Object spec, attrs, eol_type;
4969
4970 spec = CODING_SYSTEM_SPEC (coding_system);
4971 attrs = AREF (spec, 0);
4972 eol_type = AREF (spec, 2);
4973 if (VECTORP (eol_type))
4974 {
4975 Lisp_Object parent_spec;
4976 Lisp_Object parent_eol_type;
4977
4978 parent_spec
4979 = CODING_SYSTEM_SPEC (buffer_defaults.buffer_file_coding_system);
4980 parent_eol_type = AREF (parent_spec, 2);
4981 if (EQ (parent_eol_type, Qunix))
4982 coding_system = AREF (eol_type, 0);
4983 else if (EQ (parent_eol_type, Qdos))
4984 coding_system = AREF (eol_type, 1);
4985 else if (EQ (parent_eol_type, Qmac))
4986 coding_system = AREF (eol_type, 2);
4987 }
4988 return coding_system;
4989 }
4990
4991 /* Emacs has a mechanism to automatically detect a coding system if it
4992 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
4993 it's impossible to distinguish some coding systems accurately
4994 because they use the same range of codes. So, at first, coding
4995 systems are categorized into 7, those are:
4996
4997 o coding-category-emacs-mule
4998
4999 The category for a coding system which has the same code range
5000 as Emacs' internal format. Assigned the coding-system (Lisp
5001 symbol) `emacs-mule' by default.
5002
5003 o coding-category-sjis
5004
5005 The category for a coding system which has the same code range
5006 as SJIS. Assigned the coding-system (Lisp
5007 symbol) `japanese-shift-jis' by default.
5008
5009 o coding-category-iso-7
5010
5011 The category for a coding system which has the same code range
5012 as ISO2022 of 7-bit environment. This doesn't use any locking
5013 shift and single shift functions. This can encode/decode all
5014 charsets. Assigned the coding-system (Lisp symbol)
5015 `iso-2022-7bit' by default.
5016
5017 o coding-category-iso-7-tight
5018
5019 Same as coding-category-iso-7 except that this can
5020 encode/decode only the specified charsets.
5021
5022 o coding-category-iso-8-1
5023
5024 The category for a coding system which has the same code range
5025 as ISO2022 of 8-bit environment and graphic plane 1 used only
5026 for DIMENSION1 charset. This doesn't use any locking shift
5027 and single shift functions. Assigned the coding-system (Lisp
5028 symbol) `iso-latin-1' by default.
5029
5030 o coding-category-iso-8-2
5031
5032 The category for a coding system which has the same code range
5033 as ISO2022 of 8-bit environment and graphic plane 1 used only
5034 for DIMENSION2 charset. This doesn't use any locking shift
5035 and single shift functions. Assigned the coding-system (Lisp
5036 symbol) `japanese-iso-8bit' by default.
5037
5038 o coding-category-iso-7-else
5039
5040 The category for a coding system which has the same code range
5041 as ISO2022 of 7-bit environemnt but uses locking shift or
5042 single shift functions. Assigned the coding-system (Lisp
5043 symbol) `iso-2022-7bit-lock' by default.
5044
5045 o coding-category-iso-8-else
5046
5047 The category for a coding system which has the same code range
5048 as ISO2022 of 8-bit environemnt but uses locking shift or
5049 single shift functions. Assigned the coding-system (Lisp
5050 symbol) `iso-2022-8bit-ss2' by default.
5051
5052 o coding-category-big5
5053
5054 The category for a coding system which has the same code range
5055 as BIG5. Assigned the coding-system (Lisp symbol)
5056 `cn-big5' by default.
5057
5058 o coding-category-utf-8
5059
5060 The category for a coding system which has the same code range
5061 as UTF-8 (cf. RFC2279). Assigned the coding-system (Lisp
5062 symbol) `utf-8' by default.
5063
5064 o coding-category-utf-16-be
5065
5066 The category for a coding system in which a text has an
5067 Unicode signature (cf. Unicode Standard) in the order of BIG
5068 endian at the head. Assigned the coding-system (Lisp symbol)
5069 `utf-16-be' by default.
5070
5071 o coding-category-utf-16-le
5072
5073 The category for a coding system in which a text has an
5074 Unicode signature (cf. Unicode Standard) in the order of
5075 LITTLE endian at the head. Assigned the coding-system (Lisp
5076 symbol) `utf-16-le' by default.
5077
5078 o coding-category-ccl
5079
5080 The category for a coding system of which encoder/decoder is
5081 written in CCL programs. The default value is nil, i.e., no
5082 coding system is assigned.
5083
5084 o coding-category-binary
5085
5086 The category for a coding system not categorized in any of the
5087 above. Assigned the coding-system (Lisp symbol)
5088 `no-conversion' by default.
5089
5090 Each of them is a Lisp symbol and the value is an actual
5091 `coding-system's (this is also a Lisp symbol) assigned by a user.
5092 What Emacs does actually is to detect a category of coding system.
5093 Then, it uses a `coding-system' assigned to it. If Emacs can't
5094 decide only one possible category, it selects a category of the
5095 highest priority. Priorities of categories are also specified by a
5096 user in a Lisp variable `coding-category-list'.
5097
5098 */
5099
5100 #define EOL_SEEN_NONE 0
5101 #define EOL_SEEN_LF 1
5102 #define EOL_SEEN_CR 2
5103 #define EOL_SEEN_CRLF 4
5104
5105 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
5106 SOURCE is encoded. If CATEGORY is one of
5107 coding_category_utf_16_XXXX, assume that CR and LF are encoded by
5108 two-byte, else they are encoded by one-byte.
5109
5110 Return one of EOL_SEEN_XXX. */
5111
5112 #define MAX_EOL_CHECK_COUNT 3
5113
5114 static int
5115 detect_eol (source, src_bytes, category)
5116 unsigned char *source;
5117 EMACS_INT src_bytes;
5118 enum coding_category category;
5119 {
5120 unsigned char *src = source, *src_end = src + src_bytes;
5121 unsigned char c;
5122 int total = 0;
5123 int eol_seen = EOL_SEEN_NONE;
5124
5125 if ((1 << category) & CATEGORY_MASK_UTF_16)
5126 {
5127 int msb, lsb;
5128
5129 msb = category == (coding_category_utf_16_le
5130 | coding_category_utf_16_le_nosig);
5131 lsb = 1 - msb;
5132
5133 while (src + 1 < src_end)
5134 {
5135 c = src[lsb];
5136 if (src[msb] == 0 && (c == '\n' || c == '\r'))
5137 {
5138 int this_eol;
5139
5140 if (c == '\n')
5141 this_eol = EOL_SEEN_LF;
5142 else if (src + 3 >= src_end
5143 || src[msb + 2] != 0
5144 || src[lsb + 2] != '\n')
5145 this_eol = EOL_SEEN_CR;
5146 else
5147 this_eol = EOL_SEEN_CRLF;
5148
5149 if (eol_seen == EOL_SEEN_NONE)
5150 /* This is the first end-of-line. */
5151 eol_seen = this_eol;
5152 else if (eol_seen != this_eol)
5153 {
5154 /* The found type is different from what found before. */
5155 eol_seen = EOL_SEEN_LF;
5156 break;
5157 }
5158 if (++total == MAX_EOL_CHECK_COUNT)
5159 break;
5160 }
5161 src += 2;
5162 }
5163 }
5164 else
5165 {
5166 while (src < src_end)
5167 {
5168 c = *src++;
5169 if (c == '\n' || c == '\r')
5170 {
5171 int this_eol;
5172
5173 if (c == '\n')
5174 this_eol = EOL_SEEN_LF;
5175 else if (src >= src_end || *src != '\n')
5176 this_eol = EOL_SEEN_CR;
5177 else
5178 this_eol = EOL_SEEN_CRLF, src++;
5179
5180 if (eol_seen == EOL_SEEN_NONE)
5181 /* This is the first end-of-line. */
5182 eol_seen = this_eol;
5183 else if (eol_seen != this_eol)
5184 {
5185 /* The found type is different from what found before. */
5186 eol_seen = EOL_SEEN_LF;
5187 break;
5188 }
5189 if (++total == MAX_EOL_CHECK_COUNT)
5190 break;
5191 }
5192 }
5193 }
5194 return eol_seen;
5195 }
5196
5197
5198 static void
5199 adjust_coding_eol_type (coding, eol_seen)
5200 struct coding_system *coding;
5201 int eol_seen;
5202 {
5203 Lisp_Object eol_type;
5204
5205 eol_type = CODING_ID_EOL_TYPE (coding->id);
5206 if (eol_seen & EOL_SEEN_LF)
5207 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
5208 else if (eol_seen & EOL_SEEN_CRLF)
5209 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
5210 else if (eol_seen & EOL_SEEN_CR)
5211 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
5212 }
5213
5214 /* Detect how a text specified in CODING is encoded. If a coding
5215 system is detected, update fields of CODING by the detected coding
5216 system. */
5217
5218 void
5219 detect_coding (coding)
5220 struct coding_system *coding;
5221 {
5222 unsigned char *src, *src_end;
5223 Lisp_Object attrs, coding_type;
5224
5225 coding->consumed = coding->consumed_char = 0;
5226 coding->produced = coding->produced_char = 0;
5227 coding_set_source (coding);
5228
5229 src_end = coding->source + coding->src_bytes;
5230
5231 /* If we have not yet decided the text encoding type, detect it
5232 now. */
5233 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
5234 {
5235 int c, i;
5236
5237 for (src = coding->source; src < src_end; src++)
5238 {
5239 c = *src;
5240 if (c & 0x80 || (c < 0x20 && (c == ISO_CODE_ESC
5241 || c == ISO_CODE_SI
5242 || c == ISO_CODE_SO)))
5243 break;
5244 }
5245 coding->head_ascii = src - (coding->source + coding->consumed);
5246
5247 if (coding->head_ascii < coding->src_bytes)
5248 {
5249 struct coding_detection_info detect_info;
5250 enum coding_category category;
5251 struct coding_system *this;
5252
5253 detect_info.checked = detect_info.found = detect_info.rejected = 0;
5254 for (i = 0; i < coding_category_raw_text; i++)
5255 {
5256 category = coding_priorities[i];
5257 this = coding_categories + category;
5258 if (this->id < 0)
5259 {
5260 /* No coding system of this category is defined. */
5261 detect_info.rejected |= (1 << category);
5262 }
5263 else if (category >= coding_category_raw_text)
5264 continue;
5265 else if (detect_info.checked & (1 << category))
5266 {
5267 if (detect_info.found & (1 << category))
5268 break;
5269 }
5270 else if ((*(this->detector)) (coding, &detect_info)
5271 && detect_info.found & (1 << category))
5272 break;
5273 }
5274 if (i < coding_category_raw_text)
5275 setup_coding_system (CODING_ID_NAME (this->id), coding);
5276 else if (detect_info.rejected == CATEGORY_MASK_ANY)
5277 setup_coding_system (Qraw_text, coding);
5278 else if (detect_info.rejected)
5279 for (i = 0; i < coding_category_raw_text; i++)
5280 if (! (detect_info.rejected & (1 << coding_priorities[i])))
5281 {
5282 this = coding_categories + coding_priorities[i];
5283 setup_coding_system (CODING_ID_NAME (this->id), coding);
5284 break;
5285 }
5286 }
5287 }
5288
5289 attrs = CODING_ID_ATTRS (coding->id);
5290 coding_type = CODING_ATTR_TYPE (attrs);
5291
5292 /* If we have not yet decided the EOL type, detect it now. But, the
5293 detection is impossible for a CCL based coding system, in which
5294 case, we detct the EOL type after decoding. */
5295 if (VECTORP (CODING_ID_EOL_TYPE (coding->id))
5296 && ! EQ (coding_type, Qccl))
5297 {
5298 int eol_seen = detect_eol (coding->source, coding->src_bytes,
5299 XINT (CODING_ATTR_CATEGORY (attrs)));
5300
5301 if (eol_seen != EOL_SEEN_NONE)
5302 adjust_coding_eol_type (coding, eol_seen);
5303 }
5304 }
5305
5306
5307 static void
5308 decode_eol (coding)
5309 struct coding_system *coding;
5310 {
5311 if (VECTORP (CODING_ID_EOL_TYPE (coding->id)))
5312 {
5313 unsigned char *p = CHAR_POS_ADDR (coding->dst_pos);
5314 unsigned char *pend = p + coding->produced;
5315 int eol_seen = EOL_SEEN_NONE;
5316
5317 for (; p < pend; p++)
5318 {
5319 if (*p == '\n')
5320 eol_seen |= EOL_SEEN_LF;
5321 else if (*p == '\r')
5322 {
5323 if (p + 1 < pend && *(p + 1) == '\n')
5324 {
5325 eol_seen |= EOL_SEEN_CRLF;
5326 p++;
5327 }
5328 else
5329 eol_seen |= EOL_SEEN_CR;
5330 }
5331 }
5332 if (eol_seen != EOL_SEEN_NONE)
5333 adjust_coding_eol_type (coding, eol_seen);
5334 }
5335
5336 if (EQ (CODING_ID_EOL_TYPE (coding->id), Qmac))
5337 {
5338 unsigned char *p = CHAR_POS_ADDR (coding->dst_pos);
5339 unsigned char *pend = p + coding->produced;
5340
5341 for (; p < pend; p++)
5342 if (*p == '\r')
5343 *p = '\n';
5344 }
5345 else if (EQ (CODING_ID_EOL_TYPE (coding->id), Qdos))
5346 {
5347 unsigned char *p, *pbeg, *pend;
5348 Lisp_Object undo_list;
5349
5350 move_gap_both (coding->dst_pos + coding->produced_char,
5351 coding->dst_pos_byte + coding->produced);
5352 undo_list = current_buffer->undo_list;
5353 current_buffer->undo_list = Qt;
5354 del_range_2 (coding->dst_pos, coding->dst_pos_byte, GPT, GPT_BYTE, 0);
5355 current_buffer->undo_list = undo_list;
5356 pbeg = GPT_ADDR;
5357 pend = pbeg + coding->produced;
5358
5359 for (p = pend - 1; p >= pbeg; p--)
5360 if (*p == '\r')
5361 {
5362 safe_bcopy ((char *) (p + 1), (char *) p, pend - p - 1);
5363 pend--;
5364 }
5365 coding->produced_char -= coding->produced - (pend - pbeg);
5366 coding->produced = pend - pbeg;
5367 insert_from_gap (coding->produced_char, coding->produced);
5368 }
5369 }
5370
5371 static void
5372 translate_chars (coding, table)
5373 struct coding_system *coding;
5374 Lisp_Object table;
5375 {
5376 int *charbuf = coding->charbuf;
5377 int *charbuf_end = charbuf + coding->charbuf_used;
5378 int c;
5379
5380 if (coding->chars_at_source)
5381 return;
5382
5383 while (charbuf < charbuf_end)
5384 {
5385 c = *charbuf;
5386 if (c < 0)
5387 charbuf += c;
5388 else
5389 *charbuf++ = translate_char (table, c);
5390 }
5391 }
5392
5393 static int
5394 produce_chars (coding)
5395 struct coding_system *coding;
5396 {
5397 unsigned char *dst = coding->destination + coding->produced;
5398 unsigned char *dst_end = coding->destination + coding->dst_bytes;
5399 int produced;
5400 int produced_chars = 0;
5401
5402 if (! coding->chars_at_source)
5403 {
5404 /* Characters are in coding->charbuf. */
5405 int *buf = coding->charbuf;
5406 int *buf_end = buf + coding->charbuf_used;
5407 unsigned char *adjusted_dst_end;
5408
5409 if (BUFFERP (coding->src_object)
5410 && EQ (coding->src_object, coding->dst_object))
5411 dst_end = coding->source + coding->consumed;
5412 adjusted_dst_end = dst_end - MAX_MULTIBYTE_LENGTH;
5413
5414 while (buf < buf_end)
5415 {
5416 int c = *buf++;
5417
5418 if (dst >= adjusted_dst_end)
5419 {
5420 dst = alloc_destination (coding,
5421 buf_end - buf + MAX_MULTIBYTE_LENGTH,
5422 dst);
5423 dst_end = coding->destination + coding->dst_bytes;
5424 adjusted_dst_end = dst_end - MAX_MULTIBYTE_LENGTH;
5425 }
5426 if (c >= 0)
5427 {
5428 if (coding->dst_multibyte
5429 || ! CHAR_BYTE8_P (c))
5430 CHAR_STRING_ADVANCE (c, dst);
5431 else
5432 *dst++ = CHAR_TO_BYTE8 (c);
5433 produced_chars++;
5434 }
5435 else
5436 /* This is an annotation datum. */
5437 buf -= c + 1;
5438 }
5439 }
5440 else
5441 {
5442 unsigned char *src = coding->source;
5443 unsigned char *src_end = src + coding->src_bytes;
5444 Lisp_Object eol_type;
5445
5446 eol_type = CODING_ID_EOL_TYPE (coding->id);
5447
5448 if (coding->src_multibyte != coding->dst_multibyte)
5449 {
5450 if (coding->src_multibyte)
5451 {
5452 int multibytep = 1;
5453 int consumed_chars;
5454
5455 while (1)
5456 {
5457 unsigned char *src_base = src;
5458 int c;
5459
5460 ONE_MORE_BYTE (c);
5461 if (c == '\r')
5462 {
5463 if (EQ (eol_type, Qdos))
5464 {
5465 if (src == src_end)
5466 {
5467 coding->result = CODING_RESULT_INSUFFICIENT_SRC;
5468 goto no_more_source;
5469 }
5470 if (*src == '\n')
5471 c = *src++;
5472 }
5473 else if (EQ (eol_type, Qmac))
5474 c = '\n';
5475 }
5476 if (dst == dst_end)
5477 {
5478 coding->consumed = src - coding->source;
5479
5480 if (EQ (coding->src_object, coding->dst_object))
5481 dst_end = src;
5482 if (dst == dst_end)
5483 {
5484 dst = alloc_destination (coding, src_end - src + 1,
5485 dst);
5486 dst_end = coding->destination + coding->dst_bytes;
5487 coding_set_source (coding);
5488 src = coding->source + coding->consumed;
5489 src_end = coding->source + coding->src_bytes;
5490 }
5491 }
5492 *dst++ = c;
5493 produced_chars++;
5494 }
5495 no_more_source:
5496 ;
5497 }
5498 else
5499 while (src < src_end)
5500 {
5501 int multibytep = 1;
5502 int c = *src++;
5503
5504 if (c == '\r')
5505 {
5506 if (EQ (eol_type, Qdos))
5507 {
5508 if (src < src_end
5509 && *src == '\n')
5510 c = *src++;
5511 }
5512 else if (EQ (eol_type, Qmac))
5513 c = '\n';
5514 }
5515 if (dst >= dst_end - 1)
5516 {
5517 coding->consumed = src - coding->source;
5518
5519 if (EQ (coding->src_object, coding->dst_object))
5520 dst_end = src;
5521 if (dst >= dst_end - 1)
5522 {
5523 dst = alloc_destination (coding, src_end - src + 2,
5524 dst);
5525 dst_end = coding->destination + coding->dst_bytes;
5526 coding_set_source (coding);
5527 src = coding->source + coding->consumed;
5528 src_end = coding->source + coding->src_bytes;
5529 }
5530 }
5531 EMIT_ONE_BYTE (c);
5532 }
5533 }
5534 else
5535 {
5536 if (!EQ (coding->src_object, coding->dst_object))
5537 {
5538 int require = coding->src_bytes - coding->dst_bytes;
5539
5540 if (require > 0)
5541 {
5542 EMACS_INT offset = src - coding->source;
5543
5544 dst = alloc_destination (coding, require, dst);
5545 coding_set_source (coding);
5546 src = coding->source + offset;
5547 src_end = coding->source + coding->src_bytes;
5548 }
5549 }
5550 produced_chars = coding->src_chars;
5551 while (src < src_end)
5552 {
5553 int c = *src++;
5554
5555 if (c == '\r')
5556 {
5557 if (EQ (eol_type, Qdos))
5558 {
5559 if (src < src_end
5560 && *src == '\n')
5561 c = *src++;
5562 produced_chars--;
5563 }
5564 else if (EQ (eol_type, Qmac))
5565 c = '\n';
5566 }
5567 *dst++ = c;
5568 }
5569 }
5570 coding->consumed = coding->src_bytes;
5571 coding->consumed_char = coding->src_chars;
5572 }
5573
5574 produced = dst - (coding->destination + coding->produced);
5575 if (BUFFERP (coding->dst_object))
5576 insert_from_gap (produced_chars, produced);
5577 coding->produced += produced;
5578 coding->produced_char += produced_chars;
5579 return produced_chars;
5580 }
5581
5582 /* Compose text in CODING->object according to the annotation data at
5583 CHARBUF. CHARBUF is an array:
5584 [ -LENGTH ANNOTATION_MASK FROM TO METHOD COMP_LEN [ COMPONENTS... ] ]
5585 */
5586
5587 static INLINE void
5588 produce_composition (coding, charbuf)
5589 struct coding_system *coding;
5590 int *charbuf;
5591 {
5592 int len;
5593 EMACS_INT from, to;
5594 enum composition_method method;
5595 Lisp_Object components;
5596
5597 len = -charbuf[0];
5598 from = coding->dst_pos + charbuf[2];
5599 to = coding->dst_pos + charbuf[3];
5600 method = (enum composition_method) (charbuf[4]);
5601
5602 if (method == COMPOSITION_RELATIVE)
5603 components = Qnil;
5604 else
5605 {
5606 Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
5607 int i;
5608
5609 len -= 5;
5610 charbuf += 5;
5611 for (i = 0; i < len; i++)
5612 args[i] = make_number (charbuf[i]);
5613 components = (method == COMPOSITION_WITH_ALTCHARS
5614 ? Fstring (len, args) : Fvector (len, args));
5615 }
5616 compose_text (from, to, components, Qnil, coding->dst_object);
5617 }
5618
5619
5620 /* Put `charset' property on text in CODING->object according to
5621 the annotation data at CHARBUF. CHARBUF is an array:
5622 [ -LENGTH ANNOTATION_MASK FROM TO CHARSET-ID ]
5623 */
5624
5625 static INLINE void
5626 produce_charset (coding, charbuf)
5627 struct coding_system *coding;
5628 int *charbuf;
5629 {
5630 EMACS_INT from = coding->dst_pos + charbuf[2];
5631 EMACS_INT to = coding->dst_pos + charbuf[3];
5632 struct charset *charset = CHARSET_FROM_ID (charbuf[4]);
5633
5634 Fput_text_property (make_number (from), make_number (to),
5635 Qcharset, CHARSET_NAME (charset),
5636 coding->dst_object);
5637 }
5638
5639
5640 #define CHARBUF_SIZE 0x4000
5641
5642 #define ALLOC_CONVERSION_WORK_AREA(coding) \
5643 do { \
5644 int size = CHARBUF_SIZE;; \
5645 \
5646 coding->charbuf = NULL; \
5647 while (size > 1024) \
5648 { \
5649 coding->charbuf = (int *) alloca (sizeof (int) * size); \
5650 if (coding->charbuf) \
5651 break; \
5652 size >>= 1; \
5653 } \
5654 if (! coding->charbuf) \
5655 { \
5656 coding->result = CODING_RESULT_INSUFFICIENT_MEM; \
5657 return coding->result; \
5658 } \
5659 coding->charbuf_size = size; \
5660 } while (0)
5661
5662
5663 static void
5664 produce_annotation (coding)
5665 struct coding_system *coding;
5666 {
5667 int *charbuf = coding->charbuf;
5668 int *charbuf_end = charbuf + coding->charbuf_used;
5669
5670 if (NILP (coding->dst_object))
5671 return;
5672
5673 while (charbuf < charbuf_end)
5674 {
5675 if (*charbuf >= 0)
5676 charbuf++;
5677 else
5678 {
5679 int len = -*charbuf;
5680 switch (charbuf[1])
5681 {
5682 case CODING_ANNOTATE_COMPOSITION_MASK:
5683 produce_composition (coding, charbuf);
5684 break;
5685 case CODING_ANNOTATE_CHARSET_MASK:
5686 produce_charset (coding, charbuf);
5687 break;
5688 default:
5689 abort ();
5690 }
5691 charbuf += len;
5692 }
5693 }
5694 }
5695
5696 /* Decode the data at CODING->src_object into CODING->dst_object.
5697 CODING->src_object is a buffer, a string, or nil.
5698 CODING->dst_object is a buffer.
5699
5700 If CODING->src_object is a buffer, it must be the current buffer.
5701 In this case, if CODING->src_pos is positive, it is a position of
5702 the source text in the buffer, otherwise, the source text is in the
5703 gap area of the buffer, and CODING->src_pos specifies the offset of
5704 the text from GPT (which must be the same as PT). If this is the
5705 same buffer as CODING->dst_object, CODING->src_pos must be
5706 negative.
5707
5708 If CODING->src_object is a string, CODING->src_pos in an index to
5709 that string.
5710
5711 If CODING->src_object is nil, CODING->source must already point to
5712 the non-relocatable memory area. In this case, CODING->src_pos is
5713 an offset from CODING->source.
5714
5715 The decoded data is inserted at the current point of the buffer
5716 CODING->dst_object.
5717 */
5718
5719 static int
5720 decode_coding (coding)
5721 struct coding_system *coding;
5722 {
5723 Lisp_Object attrs;
5724
5725 if (BUFFERP (coding->src_object)
5726 && coding->src_pos > 0
5727 && coding->src_pos < GPT
5728 && coding->src_pos + coding->src_chars > GPT)
5729 move_gap_both (coding->src_pos, coding->src_pos_byte);
5730
5731 if (BUFFERP (coding->dst_object))
5732 {
5733 if (current_buffer != XBUFFER (coding->dst_object))
5734 set_buffer_internal (XBUFFER (coding->dst_object));
5735 if (GPT != PT)
5736 move_gap_both (PT, PT_BYTE);
5737 }
5738
5739 coding->consumed = coding->consumed_char = 0;
5740 coding->produced = coding->produced_char = 0;
5741 coding->chars_at_source = 0;
5742 coding->result = CODING_RESULT_SUCCESS;
5743 coding->errors = 0;
5744
5745 ALLOC_CONVERSION_WORK_AREA (coding);
5746
5747 attrs = CODING_ID_ATTRS (coding->id);
5748
5749 do
5750 {
5751 coding_set_source (coding);
5752 coding->annotated = 0;
5753 (*(coding->decoder)) (coding);
5754 if (!NILP (CODING_ATTR_DECODE_TBL (attrs)))
5755 translate_chars (coding, CODING_ATTR_DECODE_TBL (attrs));
5756 else if (!NILP (Vstandard_translation_table_for_decode))
5757 translate_chars (coding, Vstandard_translation_table_for_decode);
5758 coding_set_destination (coding);
5759 produce_chars (coding);
5760 if (coding->annotated)
5761 produce_annotation (coding);
5762 }
5763 while (coding->consumed < coding->src_bytes
5764 && ! coding->result);
5765
5766 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qccl)
5767 && SYMBOLP (CODING_ID_EOL_TYPE (coding->id))
5768 && ! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix))
5769 decode_eol (coding);
5770
5771 coding->carryover_bytes = 0;
5772 if (coding->consumed < coding->src_bytes)
5773 {
5774 int nbytes = coding->src_bytes - coding->consumed;
5775 unsigned char *src;
5776
5777 coding_set_source (coding);
5778 coding_set_destination (coding);
5779 src = coding->source + coding->consumed;
5780
5781 if (coding->mode & CODING_MODE_LAST_BLOCK)
5782 {
5783 /* Flush out unprocessed data as binary chars. We are sure
5784 that the number of data is less than the size of
5785 coding->charbuf. */
5786 while (nbytes-- > 0)
5787 {
5788 int c = *src++;
5789
5790 coding->charbuf[coding->charbuf_used++] = (c & 0x80 ? - c : c);
5791 }
5792 produce_chars (coding);
5793 }
5794 else
5795 {
5796 /* Record unprocessed bytes in coding->carryover. We are
5797 sure that the number of data is less than the size of
5798 coding->carryover. */
5799 unsigned char *p = coding->carryover;
5800
5801 coding->carryover_bytes = nbytes;
5802 while (nbytes-- > 0)
5803 *p++ = *src++;
5804 }
5805 coding->consumed = coding->src_bytes;
5806 }
5807
5808 return coding->result;
5809 }
5810
5811
5812 /* Extract an annotation datum from a composition starting at POS and
5813 ending before LIMIT of CODING->src_object (buffer or string), store
5814 the data in BUF, set *STOP to a starting position of the next
5815 composition (if any) or to LIMIT, and return the address of the
5816 next element of BUF.
5817
5818 If such an annotation is not found, set *STOP to a starting
5819 position of a composition after POS (if any) or to LIMIT, and
5820 return BUF. */
5821
5822 static INLINE int *
5823 handle_composition_annotation (pos, limit, coding, buf, stop)
5824 EMACS_INT pos, limit;
5825 struct coding_system *coding;
5826 int *buf;
5827 EMACS_INT *stop;
5828 {
5829 EMACS_INT start, end;
5830 Lisp_Object prop;
5831
5832 if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
5833 || end > limit)
5834 *stop = limit;
5835 else if (start > pos)
5836 *stop = start;
5837 else
5838 {
5839 if (start == pos)
5840 {
5841 /* We found a composition. Store the corresponding
5842 annotation data in BUF. */
5843 int *head = buf;
5844 enum composition_method method = COMPOSITION_METHOD (prop);
5845 int nchars = COMPOSITION_LENGTH (prop);
5846
5847 ADD_COMPOSITION_DATA (buf, 0, nchars, method);
5848 if (method != COMPOSITION_RELATIVE)
5849 {
5850 Lisp_Object components;
5851 int len, i, i_byte;
5852
5853 components = COMPOSITION_COMPONENTS (prop);
5854 if (VECTORP (components))
5855 {
5856 len = XVECTOR (components)->size;
5857 for (i = 0; i < len; i++)
5858 *buf++ = XINT (AREF (components, i));
5859 }
5860 else if (STRINGP (components))
5861 {
5862 len = XSTRING (components)->size;
5863 i = i_byte = 0;
5864 while (i < len)
5865 {
5866 FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
5867 buf++;
5868 }
5869 }
5870 else if (INTEGERP (components))
5871 {
5872 len = 1;
5873 *buf++ = XINT (components);
5874 }
5875 else if (CONSP (components))
5876 {
5877 for (len = 0; CONSP (components);
5878 len++, components = XCDR (components))
5879 *buf++ = XINT (XCAR (components));
5880 }
5881 else
5882 abort ();
5883 *head -= len;
5884 }
5885 }
5886
5887 if (find_composition (end, limit, &start, &end, &prop,
5888 coding->src_object)
5889 && end <= limit)
5890 *stop = start;
5891 else
5892 *stop = limit;
5893 }
5894 return buf;
5895 }
5896
5897
5898 /* Extract an annotation datum from a text property `charset' at POS of
5899 CODING->src_object (buffer of string), store the data in BUF, set
5900 *STOP to the position where the value of `charset' property changes
5901 (limiting by LIMIT), and return the address of the next element of
5902 BUF.
5903
5904 If the property value is nil, set *STOP to the position where the
5905 property value is non-nil (limiting by LIMIT), and return BUF. */
5906
5907 static INLINE int *
5908 handle_charset_annotation (pos, limit, coding, buf, stop)
5909 EMACS_INT pos, limit;
5910 struct coding_system *coding;
5911 int *buf;
5912 EMACS_INT *stop;
5913 {
5914 Lisp_Object val, next;
5915 int id;
5916
5917 val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
5918 if (! NILP (val) && CHARSETP (val))
5919 id = XINT (CHARSET_SYMBOL_ID (val));
5920 else
5921 id = -1;
5922 ADD_CHARSET_DATA (buf, 0, 0, id);
5923 next = Fnext_single_property_change (make_number (pos), Qcharset,
5924 coding->src_object,
5925 make_number (limit));
5926 *stop = XINT (next);
5927 return buf;
5928 }
5929
5930
5931 static void
5932 consume_chars (coding)
5933 struct coding_system *coding;
5934 {
5935 int *buf = coding->charbuf;
5936 int *buf_end = coding->charbuf + coding->charbuf_size;
5937 const unsigned char *src = coding->source + coding->consumed;
5938 EMACS_INT pos = coding->src_pos + coding->consumed_char;
5939 EMACS_INT end_pos = coding->src_pos + coding->src_chars;
5940 int multibytep = coding->src_multibyte;
5941 Lisp_Object eol_type;
5942 int c;
5943 EMACS_INT stop, stop_composition, stop_charset;
5944 int id;
5945
5946 eol_type = CODING_ID_EOL_TYPE (coding->id);
5947 if (VECTORP (eol_type))
5948 eol_type = Qunix;
5949
5950 /* Note: composition handling is not yet implemented. */
5951 coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
5952
5953 if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
5954 stop = stop_composition = pos;
5955 else
5956 stop = stop_composition = end_pos;
5957 if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
5958 stop = stop_charset = pos;
5959 else
5960 stop_charset = end_pos;
5961
5962 /* Compensate for CRLF and annotation. */
5963 buf_end -= 1 + MAX_ANNOTATION_LENGTH;
5964 while (buf < buf_end)
5965 {
5966 if (pos == stop)
5967 {
5968 int *p;
5969
5970 if (pos == end_pos)
5971 break;
5972 if (pos == stop_composition)
5973 buf = handle_composition_annotation (pos, end_pos, coding,
5974 buf, &stop_composition);
5975 if (pos == stop_charset)
5976 buf = handle_charset_annotation (pos, end_pos, coding,
5977 buf, &stop_charset);
5978 stop = (stop_composition < stop_charset
5979 ? stop_composition : stop_charset);
5980 }
5981
5982 if (! multibytep)
5983 c = *src++;
5984 else
5985 c = STRING_CHAR_ADVANCE (src);
5986 if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
5987 c = '\n';
5988 if (! EQ (eol_type, Qunix))
5989 {
5990 if (c == '\n')
5991 {
5992 if (EQ (eol_type, Qdos))
5993 *buf++ = '\r';
5994 else
5995 c = '\r';
5996 }
5997 }
5998 *buf++ = c;
5999 pos++;
6000 }
6001
6002 coding->consumed = src - coding->source;
6003 coding->consumed_char = pos - coding->src_pos;
6004 coding->charbuf_used = buf - coding->charbuf;
6005 coding->chars_at_source = 0;
6006 }
6007
6008
6009 /* Encode the text at CODING->src_object into CODING->dst_object.
6010 CODING->src_object is a buffer or a string.
6011 CODING->dst_object is a buffer or nil.
6012
6013 If CODING->src_object is a buffer, it must be the current buffer.
6014 In this case, if CODING->src_pos is positive, it is a position of
6015 the source text in the buffer, otherwise. the source text is in the
6016 gap area of the buffer, and coding->src_pos specifies the offset of
6017 the text from GPT (which must be the same as PT). If this is the
6018 same buffer as CODING->dst_object, CODING->src_pos must be
6019 negative and CODING should not have `pre-write-conversion'.
6020
6021 If CODING->src_object is a string, CODING should not have
6022 `pre-write-conversion'.
6023
6024 If CODING->dst_object is a buffer, the encoded data is inserted at
6025 the current point of that buffer.
6026
6027 If CODING->dst_object is nil, the encoded data is placed at the
6028 memory area specified by CODING->destination. */
6029
6030 static int
6031 encode_coding (coding)
6032 struct coding_system *coding;
6033 {
6034 Lisp_Object attrs;
6035
6036 attrs = CODING_ID_ATTRS (coding->id);
6037
6038 if (BUFFERP (coding->dst_object))
6039 {
6040 set_buffer_internal (XBUFFER (coding->dst_object));
6041 coding->dst_multibyte
6042 = ! NILP (current_buffer->enable_multibyte_characters);
6043 }
6044
6045 coding->consumed = coding->consumed_char = 0;
6046 coding->produced = coding->produced_char = 0;
6047 coding->result = CODING_RESULT_SUCCESS;
6048 coding->errors = 0;
6049
6050 ALLOC_CONVERSION_WORK_AREA (coding);
6051
6052 do {
6053 coding_set_source (coding);
6054 consume_chars (coding);
6055
6056 if (!NILP (CODING_ATTR_ENCODE_TBL (attrs)))
6057 translate_chars (coding, CODING_ATTR_ENCODE_TBL (attrs));
6058 else if (!NILP (Vstandard_translation_table_for_encode))
6059 translate_chars (coding, Vstandard_translation_table_for_encode);
6060
6061 coding_set_destination (coding);
6062 (*(coding->encoder)) (coding);
6063 } while (coding->consumed_char < coding->src_chars);
6064
6065 if (BUFFERP (coding->dst_object))
6066 insert_from_gap (coding->produced_char, coding->produced);
6067
6068 return (coding->result);
6069 }
6070
6071 /* Work buffer */
6072
6073 /* List of currently used working buffer. */
6074 Lisp_Object Vcode_conversion_work_buf_list;
6075
6076 /* A working buffer used by the top level conversion. */
6077 Lisp_Object Vcode_conversion_reused_work_buf;
6078
6079
6080 /* Return a working buffer that can be freely used by the following
6081 code conversion. MULTIBYTEP specifies the multibyteness of the
6082 buffer. */
6083
6084 Lisp_Object
6085 make_conversion_work_buffer (multibytep)
6086 int multibytep;
6087 {
6088 struct buffer *current = current_buffer;
6089 Lisp_Object buf;
6090
6091 if (NILP (Vcode_conversion_work_buf_list))
6092 {
6093 if (NILP (Vcode_conversion_reused_work_buf))
6094 Vcode_conversion_reused_work_buf
6095 = Fget_buffer_create (build_string (" *code-conversion-work*"));
6096 Vcode_conversion_work_buf_list
6097 = Fcons (Vcode_conversion_reused_work_buf, Qnil);
6098 }
6099 else
6100 {
6101 int depth = XINT (Flength (Vcode_conversion_work_buf_list));
6102 char str[128];
6103
6104 sprintf (str, " *code-conversion-work*<%d>", depth);
6105 Vcode_conversion_work_buf_list
6106 = Fcons (Fget_buffer_create (build_string (str)),
6107 Vcode_conversion_work_buf_list);
6108 }
6109
6110 buf = XCAR (Vcode_conversion_work_buf_list);
6111 set_buffer_internal (XBUFFER (buf));
6112 current_buffer->undo_list = Qt;
6113 Ferase_buffer ();
6114 Fset_buffer_multibyte (multibytep ? Qt : Qnil, Qnil);
6115 set_buffer_internal (current);
6116 return buf;
6117 }
6118
6119 static struct coding_system *saved_coding;
6120
6121 Lisp_Object
6122 code_conversion_restore (info)
6123 Lisp_Object info;
6124 {
6125 int depth = XINT (Flength (Vcode_conversion_work_buf_list));
6126 Lisp_Object buf;
6127
6128 if (depth > 0)
6129 {
6130 buf = XCAR (Vcode_conversion_work_buf_list);
6131 Vcode_conversion_work_buf_list = XCDR (Vcode_conversion_work_buf_list);
6132 if (depth > 1 && !NILP (Fbuffer_live_p (buf)))
6133 Fkill_buffer (buf);
6134 }
6135
6136 if (EQ (saved_coding->dst_object, Qt)
6137 && saved_coding->destination)
6138 xfree (saved_coding->destination);
6139
6140 return save_excursion_restore (info);
6141 }
6142
6143
6144 int
6145 decode_coding_gap (coding, chars, bytes)
6146 struct coding_system *coding;
6147 EMACS_INT chars, bytes;
6148 {
6149 int count = specpdl_ptr - specpdl;
6150
6151 saved_coding = coding;
6152 record_unwind_protect (code_conversion_restore, save_excursion_save ());
6153
6154 coding->src_object = Fcurrent_buffer ();
6155 coding->src_chars = chars;
6156 coding->src_bytes = bytes;
6157 coding->src_pos = -chars;
6158 coding->src_pos_byte = -bytes;
6159 coding->src_multibyte = chars < bytes;
6160 coding->dst_object = coding->src_object;
6161 coding->dst_pos = PT;
6162 coding->dst_pos_byte = PT_BYTE;
6163 coding->dst_multibyte = ! NILP (current_buffer->enable_multibyte_characters);
6164 coding->mode |= CODING_MODE_LAST_BLOCK;
6165
6166 if (CODING_REQUIRE_DETECTION (coding))
6167 detect_coding (coding);
6168
6169 decode_coding (coding);
6170
6171 unbind_to (count, Qnil);
6172 return coding->result;
6173 }
6174
6175 int
6176 encode_coding_gap (coding, chars, bytes)
6177 struct coding_system *coding;
6178 EMACS_INT chars, bytes;
6179 {
6180 int count = specpdl_ptr - specpdl;
6181 Lisp_Object buffer;
6182
6183 saved_coding = coding;
6184 record_unwind_protect (code_conversion_restore, save_excursion_save ());
6185
6186 buffer = Fcurrent_buffer ();
6187 coding->src_object = buffer;
6188 coding->src_chars = chars;
6189 coding->src_bytes = bytes;
6190 coding->src_pos = -chars;
6191 coding->src_pos_byte = -bytes;
6192 coding->src_multibyte = chars < bytes;
6193 coding->dst_object = coding->src_object;
6194 coding->dst_pos = PT;
6195 coding->dst_pos_byte = PT_BYTE;
6196
6197 encode_coding (coding);
6198
6199 unbind_to (count, Qnil);
6200 return coding->result;
6201 }
6202
6203
6204 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
6205 SRC_OBJECT into DST_OBJECT by coding context CODING.
6206
6207 SRC_OBJECT is a buffer, a string, or Qnil.
6208
6209 If it is a buffer, the text is at point of the buffer. FROM and TO
6210 are positions in the buffer.
6211
6212 If it is a string, the text is at the beginning of the string.
6213 FROM and TO are indices to the string.
6214
6215 If it is nil, the text is at coding->source. FROM and TO are
6216 indices to coding->source.
6217
6218 DST_OBJECT is a buffer, Qt, or Qnil.
6219
6220 If it is a buffer, the decoded text is inserted at point of the
6221 buffer. If the buffer is the same as SRC_OBJECT, the source text
6222 is deleted.
6223
6224 If it is Qt, a string is made from the decoded text, and
6225 set in CODING->dst_object.
6226
6227 If it is Qnil, the decoded text is stored at CODING->destination.
6228 The caller must allocate CODING->dst_bytes bytes at
6229 CODING->destination by xmalloc. If the decoded text is longer than
6230 CODING->dst_bytes, CODING->destination is relocated by xrealloc.
6231 */
6232
6233 void
6234 decode_coding_object (coding, src_object, from, from_byte, to, to_byte,
6235 dst_object)
6236 struct coding_system *coding;
6237 Lisp_Object src_object;
6238 EMACS_INT from, from_byte, to, to_byte;
6239 Lisp_Object dst_object;
6240 {
6241 int count = specpdl_ptr - specpdl;
6242 unsigned char *destination;
6243 EMACS_INT dst_bytes;
6244 EMACS_INT chars = to - from;
6245 EMACS_INT bytes = to_byte - from_byte;
6246 Lisp_Object attrs;
6247
6248 saved_coding = coding;
6249 record_unwind_protect (code_conversion_restore, save_excursion_save ());
6250
6251 if (NILP (dst_object))
6252 {
6253 destination = coding->destination;
6254 dst_bytes = coding->dst_bytes;
6255 }
6256
6257 coding->src_object = src_object;
6258 coding->src_chars = chars;
6259 coding->src_bytes = bytes;
6260 coding->src_multibyte = chars < bytes;
6261
6262 if (STRINGP (src_object))
6263 {
6264 coding->src_pos = from;
6265 coding->src_pos_byte = from_byte;
6266 }
6267 else if (BUFFERP (src_object))
6268 {
6269 set_buffer_internal (XBUFFER (src_object));
6270 if (from != GPT)
6271 move_gap_both (from, from_byte);
6272 if (EQ (src_object, dst_object))
6273 {
6274 TEMP_SET_PT_BOTH (from, from_byte);
6275 del_range_both (from, from_byte, to, to_byte, 1);
6276 coding->src_pos = -chars;
6277 coding->src_pos_byte = -bytes;
6278 }
6279 else
6280 {
6281 coding->src_pos = from;
6282 coding->src_pos_byte = from_byte;
6283 }
6284 }
6285
6286 if (CODING_REQUIRE_DETECTION (coding))
6287 detect_coding (coding);
6288 attrs = CODING_ID_ATTRS (coding->id);
6289
6290 if (EQ (dst_object, Qt)
6291 || (! NILP (CODING_ATTR_POST_READ (attrs))
6292 && NILP (dst_object)))
6293 {
6294 coding->dst_object = make_conversion_work_buffer (1);
6295 coding->dst_pos = BEG;
6296 coding->dst_pos_byte = BEG_BYTE;
6297 coding->dst_multibyte = 1;
6298 }
6299 else if (BUFFERP (dst_object))
6300 {
6301 coding->dst_object = dst_object;
6302 coding->dst_pos = BUF_PT (XBUFFER (dst_object));
6303 coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
6304 coding->dst_multibyte
6305 = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
6306 }
6307 else
6308 {
6309 coding->dst_object = Qnil;
6310 coding->dst_multibyte = 1;
6311 }
6312
6313 decode_coding (coding);
6314
6315 if (BUFFERP (coding->dst_object))
6316 set_buffer_internal (XBUFFER (coding->dst_object));
6317
6318 if (! NILP (CODING_ATTR_POST_READ (attrs)))
6319 {
6320 struct gcpro gcpro1, gcpro2;
6321 EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
6322 Lisp_Object val;
6323
6324 TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
6325 GCPRO2 (coding->src_object, coding->dst_object);
6326 val = call1 (CODING_ATTR_POST_READ (attrs),
6327 make_number (coding->produced_char));
6328 UNGCPRO;
6329 CHECK_NATNUM (val);
6330 coding->produced_char += Z - prev_Z;
6331 coding->produced += Z_BYTE - prev_Z_BYTE;
6332 }
6333
6334 if (EQ (dst_object, Qt))
6335 {
6336 coding->dst_object = Fbuffer_string ();
6337 }
6338 else if (NILP (dst_object) && BUFFERP (coding->dst_object))
6339 {
6340 set_buffer_internal (XBUFFER (coding->dst_object));
6341 if (dst_bytes < coding->produced)
6342 {
6343 destination
6344 = (unsigned char *) xrealloc (destination, coding->produced);
6345 if (! destination)
6346 {
6347 coding->result = CODING_RESULT_INSUFFICIENT_DST;
6348 unbind_to (count, Qnil);
6349 return;
6350 }
6351 if (BEGV < GPT && GPT < BEGV + coding->produced_char)
6352 move_gap_both (BEGV, BEGV_BYTE);
6353 bcopy (BEGV_ADDR, destination, coding->produced);
6354 coding->destination = destination;
6355 }
6356 }
6357
6358 unbind_to (count, Qnil);
6359 }
6360
6361
6362 void
6363 encode_coding_object (coding, src_object, from, from_byte, to, to_byte,
6364 dst_object)
6365 struct coding_system *coding;
6366 Lisp_Object src_object;
6367 EMACS_INT from, from_byte, to, to_byte;
6368 Lisp_Object dst_object;
6369 {
6370 int count = specpdl_ptr - specpdl;
6371 EMACS_INT chars = to - from;
6372 EMACS_INT bytes = to_byte - from_byte;
6373 Lisp_Object attrs;
6374
6375 saved_coding = coding;
6376 record_unwind_protect (code_conversion_restore, save_excursion_save ());
6377
6378 coding->src_object = src_object;
6379 coding->src_chars = chars;
6380 coding->src_bytes = bytes;
6381 coding->src_multibyte = chars < bytes;
6382
6383 attrs = CODING_ID_ATTRS (coding->id);
6384
6385 if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
6386 {
6387 coding->src_object = make_conversion_work_buffer (coding->src_multibyte);
6388 set_buffer_internal (XBUFFER (coding->src_object));
6389 if (STRINGP (src_object))
6390 insert_from_string (src_object, from, from_byte, chars, bytes, 0);
6391 else if (BUFFERP (src_object))
6392 insert_from_buffer (XBUFFER (src_object), from, chars, 0);
6393 else
6394 insert_1_both (coding->source + from, chars, bytes, 0, 0, 0);
6395
6396 if (EQ (src_object, dst_object))
6397 {
6398 set_buffer_internal (XBUFFER (src_object));
6399 del_range_both (from, from_byte, to, to_byte, 1);
6400 set_buffer_internal (XBUFFER (coding->src_object));
6401 }
6402
6403 call2 (CODING_ATTR_PRE_WRITE (attrs),
6404 make_number (BEG), make_number (Z));
6405 coding->src_object = Fcurrent_buffer ();
6406 if (BEG != GPT)
6407 move_gap_both (BEG, BEG_BYTE);
6408 coding->src_chars = Z - BEG;
6409 coding->src_bytes = Z_BYTE - BEG_BYTE;
6410 coding->src_pos = BEG;
6411 coding->src_pos_byte = BEG_BYTE;
6412 coding->src_multibyte = Z < Z_BYTE;
6413 }
6414 else if (STRINGP (src_object))
6415 {
6416 coding->src_pos = from;
6417 coding->src_pos_byte = from_byte;
6418 }
6419 else if (BUFFERP (src_object))
6420 {
6421 set_buffer_internal (XBUFFER (src_object));
6422 if (EQ (src_object, dst_object))
6423 {
6424 coding->src_object = del_range_1 (from, to, 1, 1);
6425 coding->src_pos = 0;
6426 coding->src_pos_byte = 0;
6427 }
6428 else
6429 {
6430 if (from < GPT && to >= GPT)
6431 move_gap_both (from, from_byte);
6432 coding->src_pos = from;
6433 coding->src_pos_byte = from_byte;
6434 }
6435 }
6436
6437 if (BUFFERP (dst_object))
6438 {
6439 coding->dst_object = dst_object;
6440 if (EQ (src_object, dst_object))
6441 {
6442 coding->dst_pos = from;
6443 coding->dst_pos_byte = from_byte;
6444 }
6445 else
6446 {
6447 coding->dst_pos = BUF_PT (XBUFFER (dst_object));
6448 coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
6449 }
6450 coding->dst_multibyte
6451 = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
6452 }
6453 else if (EQ (dst_object, Qt))
6454 {
6455 coding->dst_object = Qnil;
6456 coding->dst_bytes = coding->src_chars;
6457 if (coding->dst_bytes == 0)
6458 coding->dst_bytes = 1;
6459 coding->destination = (unsigned char *) xmalloc (coding->dst_bytes);
6460 coding->dst_multibyte = 0;
6461 }
6462 else
6463 {
6464 coding->dst_object = Qnil;
6465 coding->dst_multibyte = 0;
6466 }
6467
6468 encode_coding (coding);
6469
6470 if (EQ (dst_object, Qt))
6471 {
6472 if (BUFFERP (coding->dst_object))
6473 coding->dst_object = Fbuffer_string ();
6474 else
6475 {
6476 coding->dst_object
6477 = make_unibyte_string ((char *) coding->destination,
6478 coding->produced);
6479 xfree (coding->destination);
6480 }
6481 }
6482
6483 unbind_to (count, Qnil);
6484 }
6485
6486
6487 Lisp_Object
6488 preferred_coding_system ()
6489 {
6490 int id = coding_categories[coding_priorities[0]].id;
6491
6492 return CODING_ID_NAME (id);
6493 }
6494
6495 \f
6496 #ifdef emacs
6497 /*** 8. Emacs Lisp library functions ***/
6498
6499 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
6500 doc: /* Return t if OBJECT is nil or a coding-system.
6501 See the documentation of `define-coding-system' for information
6502 about coding-system objects. */)
6503 (obj)
6504 Lisp_Object obj;
6505 {
6506 return ((NILP (obj) || CODING_SYSTEM_P (obj)) ? Qt : Qnil);
6507 }
6508
6509 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
6510 Sread_non_nil_coding_system, 1, 1, 0,
6511 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT. */)
6512 (prompt)
6513 Lisp_Object prompt;
6514 {
6515 Lisp_Object val;
6516 do
6517 {
6518 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6519 Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
6520 }
6521 while (XSTRING (val)->size == 0);
6522 return (Fintern (val, Qnil));
6523 }
6524
6525 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
6526 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
6527 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM. */)
6528 (prompt, default_coding_system)
6529 Lisp_Object prompt, default_coding_system;
6530 {
6531 Lisp_Object val;
6532 if (SYMBOLP (default_coding_system))
6533 XSETSTRING (default_coding_system, XSYMBOL (default_coding_system)->name);
6534 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6535 Qt, Qnil, Qcoding_system_history,
6536 default_coding_system, Qnil);
6537 return (XSTRING (val)->size == 0 ? Qnil : Fintern (val, Qnil));
6538 }
6539
6540 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
6541 1, 1, 0,
6542 doc: /* Check validity of CODING-SYSTEM.
6543 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error. */)
6544 (coding_system)
6545 Lisp_Object coding_system;
6546 {
6547 CHECK_SYMBOL (coding_system);
6548 if (!NILP (Fcoding_system_p (coding_system)))
6549 return coding_system;
6550 while (1)
6551 Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
6552 }
6553
6554 \f
6555 /* Detect how the bytes at SRC of length SRC_BYTES are encoded. If
6556 HIGHEST is nonzero, return the coding system of the highest
6557 priority among the detected coding systems. Otherwize return a
6558 list of detected coding systems sorted by their priorities. If
6559 MULTIBYTEP is nonzero, it is assumed that the bytes are in correct
6560 multibyte form but contains only ASCII and eight-bit chars.
6561 Otherwise, the bytes are raw bytes.
6562
6563 CODING-SYSTEM controls the detection as below:
6564
6565 If it is nil, detect both text-format and eol-format. If the
6566 text-format part of CODING-SYSTEM is already specified
6567 (e.g. `iso-latin-1'), detect only eol-format. If the eol-format
6568 part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
6569 detect only text-format. */
6570
6571 Lisp_Object
6572 detect_coding_system (src, src_bytes, highest, multibytep, coding_system)
6573 unsigned char *src;
6574 int src_bytes, highest;
6575 int multibytep;
6576 Lisp_Object coding_system;
6577 {
6578 unsigned char *src_end = src + src_bytes;
6579 int mask = CATEGORY_MASK_ANY;
6580 Lisp_Object attrs, eol_type;
6581 Lisp_Object val;
6582 struct coding_system coding;
6583 int id;
6584 struct coding_detection_info detect_info;
6585
6586 if (NILP (coding_system))
6587 coding_system = Qundecided;
6588 setup_coding_system (coding_system, &coding);
6589 attrs = CODING_ID_ATTRS (coding.id);
6590 eol_type = CODING_ID_EOL_TYPE (coding.id);
6591 coding_system = CODING_ATTR_BASE_NAME (attrs);
6592
6593 coding.source = src;
6594 coding.src_bytes = src_bytes;
6595 coding.src_multibyte = multibytep;
6596 coding.consumed = 0;
6597 coding.mode |= CODING_MODE_LAST_BLOCK;
6598
6599 detect_info.checked = detect_info.found = detect_info.rejected = 0;
6600
6601 /* At first, detect text-format if necessary. */
6602 if (XINT (CODING_ATTR_CATEGORY (attrs)) == coding_category_undecided)
6603 {
6604 enum coding_category category;
6605 struct coding_system *this;
6606 int c, i;
6607
6608 for (; src < src_end; src++)
6609 {
6610 c = *src;
6611 if (c & 0x80
6612 || (c < 0x20 && (c == ISO_CODE_ESC
6613 || c == ISO_CODE_SI
6614 || c == ISO_CODE_SO)))
6615 break;
6616 }
6617 coding.head_ascii = src - coding.source;
6618
6619 if (src < src_end)
6620 for (i = 0; i < coding_category_raw_text; i++)
6621 {
6622 category = coding_priorities[i];
6623 this = coding_categories + category;
6624
6625 if (this->id < 0)
6626 {
6627 /* No coding system of this category is defined. */
6628 detect_info.rejected |= (1 << category);
6629 }
6630 else if (category >= coding_category_raw_text)
6631 continue;
6632 else if (detect_info.checked & (1 << category))
6633 {
6634 if (highest
6635 && (detect_info.found & (1 << category)))
6636 break;
6637 }
6638 else
6639 {
6640 if ((*(this->detector)) (&coding, &detect_info)
6641 && highest
6642 && (detect_info.found & (1 << category)))
6643 break;
6644 }
6645 }
6646
6647
6648 if (detect_info.rejected == CATEGORY_MASK_ANY)
6649 {
6650 detect_info.found = CATEGORY_MASK_RAW_TEXT;
6651 id = coding_categories[coding_category_raw_text].id;
6652 val = Fcons (make_number (id), Qnil);
6653 }
6654 else if (! detect_info.rejected && ! detect_info.found)
6655 {
6656 detect_info.found = CATEGORY_MASK_ANY;
6657 id = coding_categories[coding_category_undecided].id;
6658 val = Fcons (make_number (id), Qnil);
6659 }
6660 else if (highest)
6661 {
6662 if (detect_info.found)
6663 {
6664 detect_info.found = 1 << category;
6665 val = Fcons (make_number (this->id), Qnil);
6666 }
6667 else
6668 for (i = 0; i < coding_category_raw_text; i++)
6669 if (! (detect_info.rejected & (1 << coding_priorities[i])))
6670 {
6671 detect_info.found = 1 << coding_priorities[i];
6672 id = coding_categories[coding_priorities[i]].id;
6673 val = Fcons (make_number (id), Qnil);
6674 break;
6675 }
6676 }
6677 else
6678 {
6679 int mask = detect_info.rejected | detect_info.found;
6680 int found = 0;
6681 val = Qnil;
6682
6683 for (i = coding_category_raw_text - 1; i >= 0; i--)
6684 {
6685 category = coding_priorities[i];
6686 if (! (mask & (1 << category)))
6687 {
6688 found |= 1 << category;
6689 id = coding_categories[category].id;
6690 val = Fcons (make_number (id), val);
6691 }
6692 }
6693 for (i = coding_category_raw_text - 1; i >= 0; i--)
6694 {
6695 category = coding_priorities[i];
6696 if (detect_info.found & (1 << category))
6697 {
6698 id = coding_categories[category].id;
6699 val = Fcons (make_number (id), val);
6700 }
6701 }
6702 detect_info.found |= found;
6703 }
6704 }
6705 else
6706 {
6707 detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
6708 val = Fcons (make_number (coding.id), Qnil);
6709 }
6710
6711 /* Then, detect eol-format if necessary. */
6712 {
6713 int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol;
6714 Lisp_Object tail;
6715
6716 if (VECTORP (eol_type))
6717 {
6718 if (detect_info.found & ~CATEGORY_MASK_UTF_16)
6719 normal_eol = detect_eol (coding.source, src_bytes,
6720 coding_category_raw_text);
6721 if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
6722 | CATEGORY_MASK_UTF_16_BE_NOSIG))
6723 utf_16_be_eol = detect_eol (coding.source, src_bytes,
6724 coding_category_utf_16_be);
6725 if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
6726 | CATEGORY_MASK_UTF_16_LE_NOSIG))
6727 utf_16_le_eol = detect_eol (coding.source, src_bytes,
6728 coding_category_utf_16_le);
6729 }
6730 else
6731 {
6732 if (EQ (eol_type, Qunix))
6733 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
6734 else if (EQ (eol_type, Qdos))
6735 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
6736 else
6737 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
6738 }
6739
6740 for (tail = val; CONSP (tail); tail = XCDR (tail))
6741 {
6742 enum coding_category category;
6743 int this_eol;
6744
6745 id = XINT (XCAR (tail));
6746 attrs = CODING_ID_ATTRS (id);
6747 category = XINT (CODING_ATTR_CATEGORY (attrs));
6748 eol_type = CODING_ID_EOL_TYPE (id);
6749 if (VECTORP (eol_type))
6750 {
6751 if (category == coding_category_utf_16_be
6752 || category == coding_category_utf_16_be_nosig)
6753 this_eol = utf_16_be_eol;
6754 else if (category == coding_category_utf_16_le
6755 || category == coding_category_utf_16_le_nosig)
6756 this_eol = utf_16_le_eol;
6757 else
6758 this_eol = normal_eol;
6759
6760 if (this_eol == EOL_SEEN_LF)
6761 XSETCAR (tail, AREF (eol_type, 0));
6762 else if (this_eol == EOL_SEEN_CRLF)
6763 XSETCAR (tail, AREF (eol_type, 1));
6764 else if (this_eol == EOL_SEEN_CR)
6765 XSETCAR (tail, AREF (eol_type, 2));
6766 else
6767 XSETCAR (tail, CODING_ID_NAME (id));
6768 }
6769 else
6770 XSETCAR (tail, CODING_ID_NAME (id));
6771 }
6772 }
6773
6774 return (highest ? XCAR (val) : val);
6775 }
6776
6777
6778 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
6779 2, 3, 0,
6780 doc: /* Detect coding system of the text in the region between START and END.
6781 Return a list of possible coding systems ordered by priority.
6782
6783 If only ASCII characters are found, it returns a list of single element
6784 `undecided' or its subsidiary coding system according to a detected
6785 end-of-line format.
6786
6787 If optional argument HIGHEST is non-nil, return the coding system of
6788 highest priority. */)
6789 (start, end, highest)
6790 Lisp_Object start, end, highest;
6791 {
6792 int from, to;
6793 int from_byte, to_byte;
6794
6795 CHECK_NUMBER_COERCE_MARKER (start);
6796 CHECK_NUMBER_COERCE_MARKER (end);
6797
6798 validate_region (&start, &end);
6799 from = XINT (start), to = XINT (end);
6800 from_byte = CHAR_TO_BYTE (from);
6801 to_byte = CHAR_TO_BYTE (to);
6802
6803 if (from < GPT && to >= GPT)
6804 move_gap_both (to, to_byte);
6805
6806 return detect_coding_system (BYTE_POS_ADDR (from_byte),
6807 to_byte - from_byte,
6808 !NILP (highest),
6809 !NILP (current_buffer
6810 ->enable_multibyte_characters),
6811 Qnil);
6812 }
6813
6814 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
6815 1, 2, 0,
6816 doc: /* Detect coding system of the text in STRING.
6817 Return a list of possible coding systems ordered by priority.
6818
6819 If only ASCII characters are found, it returns a list of single element
6820 `undecided' or its subsidiary coding system according to a detected
6821 end-of-line format.
6822
6823 If optional argument HIGHEST is non-nil, return the coding system of
6824 highest priority. */)
6825 (string, highest)
6826 Lisp_Object string, highest;
6827 {
6828 CHECK_STRING (string);
6829
6830 return detect_coding_system (XSTRING (string)->data,
6831 STRING_BYTES (XSTRING (string)),
6832 !NILP (highest),
6833 STRING_MULTIBYTE (string),
6834 Qnil);
6835 }
6836
6837
6838 static INLINE int
6839 char_encodable_p (c, attrs)
6840 int c;
6841 Lisp_Object attrs;
6842 {
6843 Lisp_Object tail;
6844 struct charset *charset;
6845
6846 for (tail = CODING_ATTR_CHARSET_LIST (attrs);
6847 CONSP (tail); tail = XCDR (tail))
6848 {
6849 charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
6850 if (CHAR_CHARSET_P (c, charset))
6851 break;
6852 }
6853 return (! NILP (tail));
6854 }
6855
6856
6857 /* Return a list of coding systems that safely encode the text between
6858 START and END. If EXCLUDE is non-nil, it is a list of coding
6859 systems not to check. The returned list doesn't contain any such
6860 coding systems. In any case, if the text contains only ASCII or is
6861 unibyte, return t. */
6862
6863 DEFUN ("find-coding-systems-region-internal",
6864 Ffind_coding_systems_region_internal,
6865 Sfind_coding_systems_region_internal, 2, 3, 0,
6866 doc: /* Internal use only. */)
6867 (start, end, exclude)
6868 Lisp_Object start, end, exclude;
6869 {
6870 Lisp_Object coding_attrs_list, safe_codings;
6871 EMACS_INT start_byte, end_byte;
6872 const unsigned char *p, *pbeg, *pend;
6873 int c;
6874 Lisp_Object tail, elt;
6875
6876 if (STRINGP (start))
6877 {
6878 if (!STRING_MULTIBYTE (start)
6879 || XSTRING (start)->size == STRING_BYTES (XSTRING (start)))
6880 return Qt;
6881 start_byte = 0;
6882 end_byte = STRING_BYTES (XSTRING (start));
6883 }
6884 else
6885 {
6886 CHECK_NUMBER_COERCE_MARKER (start);
6887 CHECK_NUMBER_COERCE_MARKER (end);
6888 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
6889 args_out_of_range (start, end);
6890 if (NILP (current_buffer->enable_multibyte_characters))
6891 return Qt;
6892 start_byte = CHAR_TO_BYTE (XINT (start));
6893 end_byte = CHAR_TO_BYTE (XINT (end));
6894 if (XINT (end) - XINT (start) == end_byte - start_byte)
6895 return Qt;
6896
6897 if (XINT (start) < GPT && XINT (end) > GPT)
6898 {
6899 if ((GPT - XINT (start)) < (XINT (end) - GPT))
6900 move_gap_both (XINT (start), start_byte);
6901 else
6902 move_gap_both (XINT (end), end_byte);
6903 }
6904 }
6905
6906 coding_attrs_list = Qnil;
6907 for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
6908 if (NILP (exclude)
6909 || NILP (Fmemq (XCAR (tail), exclude)))
6910 {
6911 Lisp_Object attrs;
6912
6913 attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
6914 if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs))
6915 && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
6916 coding_attrs_list = Fcons (attrs, coding_attrs_list);
6917 }
6918
6919 if (STRINGP (start))
6920 p = pbeg = XSTRING (start)->data;
6921 else
6922 p = pbeg = BYTE_POS_ADDR (start_byte);
6923 pend = p + (end_byte - start_byte);
6924
6925 while (p < pend && ASCII_BYTE_P (*p)) p++;
6926 while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
6927
6928 while (p < pend)
6929 {
6930 if (ASCII_BYTE_P (*p))
6931 p++;
6932 else
6933 {
6934 c = STRING_CHAR_ADVANCE (p);
6935
6936 charset_map_loaded = 0;
6937 for (tail = coding_attrs_list; CONSP (tail);)
6938 {
6939 elt = XCAR (tail);
6940 if (NILP (elt))
6941 tail = XCDR (tail);
6942 else if (char_encodable_p (c, elt))
6943 tail = XCDR (tail);
6944 else if (CONSP (XCDR (tail)))
6945 {
6946 XSETCAR (tail, XCAR (XCDR (tail)));
6947 XSETCDR (tail, XCDR (XCDR (tail)));
6948 }
6949 else
6950 {
6951 XSETCAR (tail, Qnil);
6952 tail = XCDR (tail);
6953 }
6954 }
6955 if (charset_map_loaded)
6956 {
6957 EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
6958
6959 if (STRINGP (start))
6960 pbeg = XSTRING (start)->data;
6961 else
6962 pbeg = BYTE_POS_ADDR (start_byte);
6963 p = pbeg + p_offset;
6964 pend = pbeg + pend_offset;
6965 }
6966 }
6967 }
6968
6969 safe_codings = Qnil;
6970 for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
6971 if (! NILP (XCAR (tail)))
6972 safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
6973
6974 return safe_codings;
6975 }
6976
6977
6978 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
6979 Scheck_coding_systems_region, 3, 3, 0,
6980 doc: /* Check if the region is encodable by coding systems.
6981
6982 START and END are buffer positions specifying the region.
6983 CODING-SYSTEM-LIST is a list of coding systems to check.
6984
6985 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
6986 CODING-SYSTEM is a member of CODING-SYSTEM-LIst and can't encode the
6987 whole region, POS0, POS1, ... are buffer positions where non-encodable
6988 characters are found.
6989
6990 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
6991 value is nil.
6992
6993 START may be a string. In that case, check if the string is
6994 encodable, and the value contains indices to the string instead of
6995 buffer positions. END is ignored. */)
6996 (start, end, coding_system_list)
6997 Lisp_Object start, end, coding_system_list;
6998 {
6999 Lisp_Object list;
7000 EMACS_INT start_byte, end_byte;
7001 int pos;
7002 const unsigned char *p, *pbeg, *pend;
7003 int c;
7004 Lisp_Object tail, elt;
7005
7006 if (STRINGP (start))
7007 {
7008 if (!STRING_MULTIBYTE (start)
7009 && XSTRING (start)->size != STRING_BYTES (XSTRING (start)))
7010 return Qnil;
7011 start_byte = 0;
7012 end_byte = STRING_BYTES (XSTRING (start));
7013 pos = 0;
7014 }
7015 else
7016 {
7017 CHECK_NUMBER_COERCE_MARKER (start);
7018 CHECK_NUMBER_COERCE_MARKER (end);
7019 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
7020 args_out_of_range (start, end);
7021 if (NILP (current_buffer->enable_multibyte_characters))
7022 return Qnil;
7023 start_byte = CHAR_TO_BYTE (XINT (start));
7024 end_byte = CHAR_TO_BYTE (XINT (end));
7025 if (XINT (end) - XINT (start) == end_byte - start_byte)
7026 return Qt;
7027
7028 if (XINT (start) < GPT && XINT (end) > GPT)
7029 {
7030 if ((GPT - XINT (start)) < (XINT (end) - GPT))
7031 move_gap_both (XINT (start), start_byte);
7032 else
7033 move_gap_both (XINT (end), end_byte);
7034 }
7035 pos = XINT (start);
7036 }
7037
7038 list = Qnil;
7039 for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
7040 {
7041 elt = XCAR (tail);
7042 list = Fcons (Fcons (elt, Fcons (AREF (CODING_SYSTEM_SPEC (elt), 0),
7043 Qnil)),
7044 list);
7045 }
7046
7047 if (STRINGP (start))
7048 p = pbeg = XSTRING (start)->data;
7049 else
7050 p = pbeg = BYTE_POS_ADDR (start_byte);
7051 pend = p + (end_byte - start_byte);
7052
7053 while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
7054 while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
7055
7056 while (p < pend)
7057 {
7058 if (ASCII_BYTE_P (*p))
7059 p++;
7060 else
7061 {
7062 c = STRING_CHAR_ADVANCE (p);
7063
7064 charset_map_loaded = 0;
7065 for (tail = list; CONSP (tail); tail = XCDR (tail))
7066 {
7067 elt = XCDR (XCAR (tail));
7068 if (! char_encodable_p (c, XCAR (elt)))
7069 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
7070 }
7071 if (charset_map_loaded)
7072 {
7073 EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
7074
7075 if (STRINGP (start))
7076 pbeg = XSTRING (start)->data;
7077 else
7078 pbeg = BYTE_POS_ADDR (start_byte);
7079 p = pbeg + p_offset;
7080 pend = pbeg + pend_offset;
7081 }
7082 }
7083 pos++;
7084 }
7085
7086 tail = list;
7087 list = Qnil;
7088 for (; CONSP (tail); tail = XCDR (tail))
7089 {
7090 elt = XCAR (tail);
7091 if (CONSP (XCDR (XCDR (elt))))
7092 list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
7093 list);
7094 }
7095
7096 return list;
7097 }
7098
7099
7100
7101 Lisp_Object
7102 code_convert_region (start, end, coding_system, dst_object, encodep, norecord)
7103 Lisp_Object start, end, coding_system, dst_object;
7104 int encodep, norecord;
7105 {
7106 struct coding_system coding;
7107 EMACS_INT from, from_byte, to, to_byte;
7108 Lisp_Object src_object;
7109
7110 CHECK_NUMBER_COERCE_MARKER (start);
7111 CHECK_NUMBER_COERCE_MARKER (end);
7112 if (NILP (coding_system))
7113 coding_system = Qno_conversion;
7114 else
7115 CHECK_CODING_SYSTEM (coding_system);
7116 src_object = Fcurrent_buffer ();
7117 if (NILP (dst_object))
7118 dst_object = src_object;
7119 else if (! EQ (dst_object, Qt))
7120 CHECK_BUFFER (dst_object);
7121
7122 validate_region (&start, &end);
7123 from = XFASTINT (start);
7124 from_byte = CHAR_TO_BYTE (from);
7125 to = XFASTINT (end);
7126 to_byte = CHAR_TO_BYTE (to);
7127
7128 setup_coding_system (coding_system, &coding);
7129 coding.mode |= CODING_MODE_LAST_BLOCK;
7130
7131 if (encodep)
7132 encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
7133 dst_object);
7134 else
7135 decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
7136 dst_object);
7137 if (! norecord)
7138 Vlast_coding_system_used = CODING_ID_NAME (coding.id);
7139
7140 if (coding.result != CODING_RESULT_SUCCESS)
7141 error ("Code conversion error: %d", coding.result);
7142
7143 return (BUFFERP (dst_object)
7144 ? make_number (coding.produced_char)
7145 : coding.dst_object);
7146 }
7147
7148
7149 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
7150 3, 4, "r\nzCoding system: ",
7151 doc: /* Decode the current region from the specified coding system.
7152 When called from a program, takes four arguments:
7153 START, END, CODING-SYSTEM, and DESTINATION.
7154 START and END are buffer positions.
7155
7156 Optional 4th arguments DESTINATION specifies where the decoded text goes.
7157 If nil, the region between START and END is replace by the decoded text.
7158 If buffer, the decoded text is inserted in the buffer.
7159 If t, the decoded text is returned.
7160
7161 This function sets `last-coding-system-used' to the precise coding system
7162 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7163 not fully specified.)
7164 It returns the length of the decoded text. */)
7165 (start, end, coding_system, destination)
7166 Lisp_Object start, end, coding_system, destination;
7167 {
7168 return code_convert_region (start, end, coding_system, destination, 0, 0);
7169 }
7170
7171 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
7172 3, 4, "r\nzCoding system: ",
7173 doc: /* Encode the current region by specified coding system.
7174 When called from a program, takes three arguments:
7175 START, END, and CODING-SYSTEM. START and END are buffer positions.
7176
7177 Optional 4th arguments DESTINATION specifies where the encoded text goes.
7178 If nil, the region between START and END is replace by the encoded text.
7179 If buffer, the encoded text is inserted in the buffer.
7180 If t, the encoded text is returned.
7181
7182 This function sets `last-coding-system-used' to the precise coding system
7183 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7184 not fully specified.)
7185 It returns the length of the encoded text. */)
7186 (start, end, coding_system, destination)
7187 Lisp_Object start, end, coding_system, destination;
7188 {
7189 return code_convert_region (start, end, coding_system, destination, 1, 0);
7190 }
7191
7192 Lisp_Object
7193 code_convert_string (string, coding_system, dst_object,
7194 encodep, nocopy, norecord)
7195 Lisp_Object string, coding_system, dst_object;
7196 int encodep, nocopy, norecord;
7197 {
7198 struct coding_system coding;
7199 EMACS_INT chars, bytes;
7200
7201 CHECK_STRING (string);
7202 if (NILP (coding_system))
7203 {
7204 if (! norecord)
7205 Vlast_coding_system_used = Qno_conversion;
7206 if (NILP (dst_object))
7207 return (nocopy ? Fcopy_sequence (string) : string);
7208 }
7209
7210 if (NILP (coding_system))
7211 coding_system = Qno_conversion;
7212 else
7213 CHECK_CODING_SYSTEM (coding_system);
7214 if (NILP (dst_object))
7215 dst_object = Qt;
7216 else if (! EQ (dst_object, Qt))
7217 CHECK_BUFFER (dst_object);
7218
7219 setup_coding_system (coding_system, &coding);
7220 coding.mode |= CODING_MODE_LAST_BLOCK;
7221 chars = XSTRING (string)->size;
7222 bytes = STRING_BYTES (XSTRING (string));
7223 if (encodep)
7224 encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
7225 else
7226 decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
7227 if (! norecord)
7228 Vlast_coding_system_used = CODING_ID_NAME (coding.id);
7229
7230 if (coding.result != CODING_RESULT_SUCCESS)
7231 error ("Code conversion error: %d", coding.result);
7232
7233 return (BUFFERP (dst_object)
7234 ? make_number (coding.produced_char)
7235 : coding.dst_object);
7236 }
7237
7238
7239 /* Encode or decode STRING according to CODING_SYSTEM.
7240 Do not set Vlast_coding_system_used.
7241
7242 This function is called only from macros DECODE_FILE and
7243 ENCODE_FILE, thus we ignore character composition. */
7244
7245 Lisp_Object
7246 code_convert_string_norecord (string, coding_system, encodep)
7247 Lisp_Object string, coding_system;
7248 int encodep;
7249 {
7250 return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
7251 }
7252
7253
7254 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
7255 2, 4, 0,
7256 doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
7257
7258 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
7259 if the decoding operation is trivial.
7260
7261 Optional fourth arg BUFFER non-nil meant that the decoded text is
7262 inserted in BUFFER instead of returned as a string. In this case,
7263 the return value is BUFFER.
7264
7265 This function sets `last-coding-system-used' to the precise coding system
7266 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7267 not fully specified. */)
7268 (string, coding_system, nocopy, buffer)
7269 Lisp_Object string, coding_system, nocopy, buffer;
7270 {
7271 return code_convert_string (string, coding_system, buffer,
7272 0, ! NILP (nocopy), 0);
7273 }
7274
7275 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
7276 2, 4, 0,
7277 doc: /* Encode STRING to CODING-SYSTEM, and return the result.
7278
7279 Optional third arg NOCOPY non-nil means it is OK to return STRING
7280 itself if the encoding operation is trivial.
7281
7282 Optional fourth arg BUFFER non-nil meant that the encoded text is
7283 inserted in BUFFER instead of returned as a string. In this case,
7284 the return value is BUFFER.
7285
7286 This function sets `last-coding-system-used' to the precise coding system
7287 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7288 not fully specified.) */)
7289 (string, coding_system, nocopy, buffer)
7290 Lisp_Object string, coding_system, nocopy, buffer;
7291 {
7292 return code_convert_string (string, coding_system, buffer,
7293 1, ! NILP (nocopy), 1);
7294 }
7295
7296 \f
7297 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
7298 doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
7299 Return the corresponding character. */)
7300 (code)
7301 Lisp_Object code;
7302 {
7303 Lisp_Object spec, attrs, val;
7304 struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
7305 int c;
7306
7307 CHECK_NATNUM (code);
7308 c = XFASTINT (code);
7309 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
7310 attrs = AREF (spec, 0);
7311
7312 if (ASCII_BYTE_P (c)
7313 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
7314 return code;
7315
7316 val = CODING_ATTR_CHARSET_LIST (attrs);
7317 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
7318 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
7319 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
7320
7321 if (c <= 0x7F)
7322 charset = charset_roman;
7323 else if (c >= 0xA0 && c < 0xDF)
7324 {
7325 charset = charset_kana;
7326 c -= 0x80;
7327 }
7328 else
7329 {
7330 int s1 = c >> 8, s2 = c & 0xFF;
7331
7332 if (s1 < 0x81 || (s1 > 0x9F && s1 < 0xE0) || s1 > 0xEF
7333 || s2 < 0x40 || s2 == 0x7F || s2 > 0xFC)
7334 error ("Invalid code: %d", code);
7335 SJIS_TO_JIS (c);
7336 charset = charset_kanji;
7337 }
7338 c = DECODE_CHAR (charset, c);
7339 if (c < 0)
7340 error ("Invalid code: %d", code);
7341 return make_number (c);
7342 }
7343
7344
7345 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
7346 doc: /* Encode a Japanese character CHAR to shift_jis encoding.
7347 Return the corresponding code in SJIS. */)
7348 (ch)
7349 Lisp_Object ch;
7350 {
7351 Lisp_Object spec, attrs, charset_list;
7352 int c;
7353 struct charset *charset;
7354 unsigned code;
7355
7356 CHECK_CHARACTER (ch);
7357 c = XFASTINT (ch);
7358 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
7359 attrs = AREF (spec, 0);
7360
7361 if (ASCII_CHAR_P (c)
7362 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
7363 return ch;
7364
7365 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
7366 charset = char_charset (c, charset_list, &code);
7367 if (code == CHARSET_INVALID_CODE (charset))
7368 error ("Can't encode by shift_jis encoding: %d", c);
7369 JIS_TO_SJIS (code);
7370
7371 return make_number (code);
7372 }
7373
7374 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
7375 doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
7376 Return the corresponding character. */)
7377 (code)
7378 Lisp_Object code;
7379 {
7380 Lisp_Object spec, attrs, val;
7381 struct charset *charset_roman, *charset_big5, *charset;
7382 int c;
7383
7384 CHECK_NATNUM (code);
7385 c = XFASTINT (code);
7386 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
7387 attrs = AREF (spec, 0);
7388
7389 if (ASCII_BYTE_P (c)
7390 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
7391 return code;
7392
7393 val = CODING_ATTR_CHARSET_LIST (attrs);
7394 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
7395 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
7396
7397 if (c <= 0x7F)
7398 charset = charset_roman;
7399 else
7400 {
7401 int b1 = c >> 8, b2 = c & 0x7F;
7402 if (b1 < 0xA1 || b1 > 0xFE
7403 || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
7404 error ("Invalid code: %d", code);
7405 charset = charset_big5;
7406 }
7407 c = DECODE_CHAR (charset, (unsigned )c);
7408 if (c < 0)
7409 error ("Invalid code: %d", code);
7410 return make_number (c);
7411 }
7412
7413 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
7414 doc: /* Encode the Big5 character CHAR to BIG5 coding system.
7415 Return the corresponding character code in Big5. */)
7416 (ch)
7417 Lisp_Object ch;
7418 {
7419 Lisp_Object spec, attrs, charset_list;
7420 struct charset *charset;
7421 int c;
7422 unsigned code;
7423
7424 CHECK_CHARACTER (ch);
7425 c = XFASTINT (ch);
7426 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
7427 attrs = AREF (spec, 0);
7428 if (ASCII_CHAR_P (c)
7429 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
7430 return ch;
7431
7432 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
7433 charset = char_charset (c, charset_list, &code);
7434 if (code == CHARSET_INVALID_CODE (charset))
7435 error ("Can't encode by Big5 encoding: %d", c);
7436
7437 return make_number (code);
7438 }
7439
7440 \f
7441 DEFUN ("set-terminal-coding-system-internal",
7442 Fset_terminal_coding_system_internal,
7443 Sset_terminal_coding_system_internal, 1, 1, 0,
7444 doc: /* Internal use only. */)
7445 (coding_system)
7446 Lisp_Object coding_system;
7447 {
7448 CHECK_SYMBOL (coding_system);
7449 setup_coding_system (Fcheck_coding_system (coding_system),
7450 &terminal_coding);
7451
7452 /* We had better not send unsafe characters to terminal. */
7453 terminal_coding.mode |= CODING_MODE_SAFE_ENCODING;
7454 /* Characer composition should be disabled. */
7455 terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7456 terminal_coding.src_multibyte = 1;
7457 terminal_coding.dst_multibyte = 0;
7458 return Qnil;
7459 }
7460
7461 DEFUN ("set-safe-terminal-coding-system-internal",
7462 Fset_safe_terminal_coding_system_internal,
7463 Sset_safe_terminal_coding_system_internal, 1, 1, 0,
7464 doc: /* Internal use only. */)
7465 (coding_system)
7466 Lisp_Object coding_system;
7467 {
7468 CHECK_SYMBOL (coding_system);
7469 setup_coding_system (Fcheck_coding_system (coding_system),
7470 &safe_terminal_coding);
7471 /* Characer composition should be disabled. */
7472 safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7473 safe_terminal_coding.src_multibyte = 1;
7474 safe_terminal_coding.dst_multibyte = 0;
7475 return Qnil;
7476 }
7477
7478 DEFUN ("terminal-coding-system",
7479 Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
7480 doc: /* Return coding system specified for terminal output. */)
7481 ()
7482 {
7483 return CODING_ID_NAME (terminal_coding.id);
7484 }
7485
7486 DEFUN ("set-keyboard-coding-system-internal",
7487 Fset_keyboard_coding_system_internal,
7488 Sset_keyboard_coding_system_internal, 1, 1, 0,
7489 doc: /* Internal use only. */)
7490 (coding_system)
7491 Lisp_Object coding_system;
7492 {
7493 CHECK_SYMBOL (coding_system);
7494 setup_coding_system (Fcheck_coding_system (coding_system),
7495 &keyboard_coding);
7496 /* Characer composition should be disabled. */
7497 keyboard_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7498 return Qnil;
7499 }
7500
7501 DEFUN ("keyboard-coding-system",
7502 Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
7503 doc: /* Return coding system specified for decoding keyboard input. */)
7504 ()
7505 {
7506 return CODING_ID_NAME (keyboard_coding.id);
7507 }
7508
7509 \f
7510 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
7511 Sfind_operation_coding_system, 1, MANY, 0,
7512 doc: /* Choose a coding system for an operation based on the target name.
7513 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
7514 DECODING-SYSTEM is the coding system to use for decoding
7515 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
7516 for encoding (in case OPERATION does encoding).
7517
7518 The first argument OPERATION specifies an I/O primitive:
7519 For file I/O, `insert-file-contents' or `write-region'.
7520 For process I/O, `call-process', `call-process-region', or `start-process'.
7521 For network I/O, `open-network-stream'.
7522
7523 The remaining arguments should be the same arguments that were passed
7524 to the primitive. Depending on which primitive, one of those arguments
7525 is selected as the TARGET. For example, if OPERATION does file I/O,
7526 whichever argument specifies the file name is TARGET.
7527
7528 TARGET has a meaning which depends on OPERATION:
7529 For file I/O, TARGET is a file name.
7530 For process I/O, TARGET is a process name.
7531 For network I/O, TARGET is a service name or a port number
7532
7533 This function looks up what specified for TARGET in,
7534 `file-coding-system-alist', `process-coding-system-alist',
7535 or `network-coding-system-alist' depending on OPERATION.
7536 They may specify a coding system, a cons of coding systems,
7537 or a function symbol to call.
7538 In the last case, we call the function with one argument,
7539 which is a list of all the arguments given to this function.
7540
7541 usage: (find-operation-coding-system OPERATION ARGUMENTS ...) */)
7542 (nargs, args)
7543 int nargs;
7544 Lisp_Object *args;
7545 {
7546 Lisp_Object operation, target_idx, target, val;
7547 register Lisp_Object chain;
7548
7549 if (nargs < 2)
7550 error ("Too few arguments");
7551 operation = args[0];
7552 if (!SYMBOLP (operation)
7553 || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
7554 error ("Invalid first arguement");
7555 if (nargs < 1 + XINT (target_idx))
7556 error ("Too few arguments for operation: %s",
7557 XSYMBOL (operation)->name->data);
7558 target = args[XINT (target_idx) + 1];
7559 if (!(STRINGP (target)
7560 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
7561 error ("Invalid %dth argument", XINT (target_idx) + 1);
7562
7563 chain = ((EQ (operation, Qinsert_file_contents)
7564 || EQ (operation, Qwrite_region))
7565 ? Vfile_coding_system_alist
7566 : (EQ (operation, Qopen_network_stream)
7567 ? Vnetwork_coding_system_alist
7568 : Vprocess_coding_system_alist));
7569 if (NILP (chain))
7570 return Qnil;
7571
7572 for (; CONSP (chain); chain = XCDR (chain))
7573 {
7574 Lisp_Object elt;
7575
7576 elt = XCAR (chain);
7577 if (CONSP (elt)
7578 && ((STRINGP (target)
7579 && STRINGP (XCAR (elt))
7580 && fast_string_match (XCAR (elt), target) >= 0)
7581 || (INTEGERP (target) && EQ (target, XCAR (elt)))))
7582 {
7583 val = XCDR (elt);
7584 /* Here, if VAL is both a valid coding system and a valid
7585 function symbol, we return VAL as a coding system. */
7586 if (CONSP (val))
7587 return val;
7588 if (! SYMBOLP (val))
7589 return Qnil;
7590 if (! NILP (Fcoding_system_p (val)))
7591 return Fcons (val, val);
7592 if (! NILP (Ffboundp (val)))
7593 {
7594 val = call1 (val, Flist (nargs, args));
7595 if (CONSP (val))
7596 return val;
7597 if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
7598 return Fcons (val, val);
7599 }
7600 return Qnil;
7601 }
7602 }
7603 return Qnil;
7604 }
7605
7606 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
7607 Sset_coding_system_priority, 0, MANY, 0,
7608 doc: /* Assign higher priority to the coding systems given as arguments.
7609 usage: (set-coding-system-priority CODING-SYSTEM ...) */)
7610 (nargs, args)
7611 int nargs;
7612 Lisp_Object *args;
7613 {
7614 int i, j;
7615 int changed[coding_category_max];
7616 enum coding_category priorities[coding_category_max];
7617
7618 bzero (changed, sizeof changed);
7619
7620 for (i = j = 0; i < nargs; i++)
7621 {
7622 enum coding_category category;
7623 Lisp_Object spec, attrs;
7624
7625 CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
7626 attrs = AREF (spec, 0);
7627 category = XINT (CODING_ATTR_CATEGORY (attrs));
7628 if (changed[category])
7629 /* Ignore this coding system because a coding system of the
7630 same category already had a higher priority. */
7631 continue;
7632 changed[category] = 1;
7633 priorities[j++] = category;
7634 if (coding_categories[category].id >= 0
7635 && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
7636 setup_coding_system (args[i], &coding_categories[category]);
7637 }
7638
7639 /* Now we have decided top J priorities. Reflect the order of the
7640 original priorities to the remaining priorities. */
7641
7642 for (i = j, j = 0; i < coding_category_max; i++, j++)
7643 {
7644 while (j < coding_category_max
7645 && changed[coding_priorities[j]])
7646 j++;
7647 if (j == coding_category_max)
7648 abort ();
7649 priorities[i] = coding_priorities[j];
7650 }
7651
7652 bcopy (priorities, coding_priorities, sizeof priorities);
7653 return Qnil;
7654 }
7655
7656 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
7657 Scoding_system_priority_list, 0, 1, 0,
7658 doc: /* Return a list of coding systems ordered by their priorities.
7659 HIGHESTP non-nil means just return the highest priority one. */)
7660 (highestp)
7661 Lisp_Object highestp;
7662 {
7663 int i;
7664 Lisp_Object val;
7665
7666 for (i = 0, val = Qnil; i < coding_category_max; i++)
7667 {
7668 enum coding_category category = coding_priorities[i];
7669 int id = coding_categories[category].id;
7670 Lisp_Object attrs;
7671
7672 if (id < 0)
7673 continue;
7674 attrs = CODING_ID_ATTRS (id);
7675 if (! NILP (highestp))
7676 return CODING_ATTR_BASE_NAME (attrs);
7677 val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
7678 }
7679 return Fnreverse (val);
7680 }
7681
7682 static char *suffixes[] = { "-unix", "-dos", "-mac" };
7683
7684 static Lisp_Object
7685 make_subsidiaries (base)
7686 Lisp_Object base;
7687 {
7688 Lisp_Object subsidiaries;
7689 int base_name_len = STRING_BYTES (XSYMBOL (base)->name);
7690 char *buf = (char *) alloca (base_name_len + 6);
7691 int i;
7692
7693 bcopy (XSYMBOL (base)->name->data, buf, base_name_len);
7694 subsidiaries = Fmake_vector (make_number (3), Qnil);
7695 for (i = 0; i < 3; i++)
7696 {
7697 bcopy (suffixes[i], buf + base_name_len, strlen (suffixes[i]) + 1);
7698 ASET (subsidiaries, i, intern (buf));
7699 }
7700 return subsidiaries;
7701 }
7702
7703
7704 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
7705 Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
7706 doc: /* For internal use only.
7707 usage: (define-coding-system-internal ...) */)
7708 (nargs, args)
7709 int nargs;
7710 Lisp_Object *args;
7711 {
7712 Lisp_Object name;
7713 Lisp_Object spec_vec; /* [ ATTRS ALIASE EOL_TYPE ] */
7714 Lisp_Object attrs; /* Vector of attributes. */
7715 Lisp_Object eol_type;
7716 Lisp_Object aliases;
7717 Lisp_Object coding_type, charset_list, safe_charsets;
7718 enum coding_category category;
7719 Lisp_Object tail, val;
7720 int max_charset_id = 0;
7721 int i;
7722
7723 if (nargs < coding_arg_max)
7724 goto short_args;
7725
7726 attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
7727
7728 name = args[coding_arg_name];
7729 CHECK_SYMBOL (name);
7730 CODING_ATTR_BASE_NAME (attrs) = name;
7731
7732 val = args[coding_arg_mnemonic];
7733 if (! STRINGP (val))
7734 CHECK_CHARACTER (val);
7735 CODING_ATTR_MNEMONIC (attrs) = val;
7736
7737 coding_type = args[coding_arg_coding_type];
7738 CHECK_SYMBOL (coding_type);
7739 CODING_ATTR_TYPE (attrs) = coding_type;
7740
7741 charset_list = args[coding_arg_charset_list];
7742 if (SYMBOLP (charset_list))
7743 {
7744 if (EQ (charset_list, Qiso_2022))
7745 {
7746 if (! EQ (coding_type, Qiso_2022))
7747 error ("Invalid charset-list");
7748 charset_list = Viso_2022_charset_list;
7749 }
7750 else if (EQ (charset_list, Qemacs_mule))
7751 {
7752 if (! EQ (coding_type, Qemacs_mule))
7753 error ("Invalid charset-list");
7754 charset_list = Vemacs_mule_charset_list;
7755 }
7756 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
7757 if (max_charset_id < XFASTINT (XCAR (tail)))
7758 max_charset_id = XFASTINT (XCAR (tail));
7759 }
7760 else
7761 {
7762 charset_list = Fcopy_sequence (charset_list);
7763 for (tail = charset_list; !NILP (tail); tail = Fcdr (tail))
7764 {
7765 struct charset *charset;
7766
7767 val = Fcar (tail);
7768 CHECK_CHARSET_GET_CHARSET (val, charset);
7769 if (EQ (coding_type, Qiso_2022)
7770 ? CHARSET_ISO_FINAL (charset) < 0
7771 : EQ (coding_type, Qemacs_mule)
7772 ? CHARSET_EMACS_MULE_ID (charset) < 0
7773 : 0)
7774 error ("Can't handle charset `%s'",
7775 XSYMBOL (CHARSET_NAME (charset))->name->data);
7776
7777 XCAR (tail) = make_number (charset->id);
7778 if (max_charset_id < charset->id)
7779 max_charset_id = charset->id;
7780 }
7781 }
7782 CODING_ATTR_CHARSET_LIST (attrs) = charset_list;
7783
7784 safe_charsets = Fmake_string (make_number (max_charset_id + 1),
7785 make_number (255));
7786 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
7787 XSTRING (safe_charsets)->data[XFASTINT (XCAR (tail))] = 0;
7788 CODING_ATTR_SAFE_CHARSETS (attrs) = safe_charsets;
7789
7790 CODING_ATTR_ASCII_COMPAT (attrs) = args[coding_arg_ascii_compatible_p];
7791
7792 val = args[coding_arg_decode_translation_table];
7793 if (! NILP (val))
7794 CHECK_CHAR_TABLE (val);
7795 CODING_ATTR_DECODE_TBL (attrs) = val;
7796
7797 val = args[coding_arg_encode_translation_table];
7798 if (! NILP (val))
7799 CHECK_CHAR_TABLE (val);
7800 CODING_ATTR_ENCODE_TBL (attrs) = val;
7801
7802 val = args[coding_arg_post_read_conversion];
7803 CHECK_SYMBOL (val);
7804 CODING_ATTR_POST_READ (attrs) = val;
7805
7806 val = args[coding_arg_pre_write_conversion];
7807 CHECK_SYMBOL (val);
7808 CODING_ATTR_PRE_WRITE (attrs) = val;
7809
7810 val = args[coding_arg_default_char];
7811 if (NILP (val))
7812 CODING_ATTR_DEFAULT_CHAR (attrs) = make_number (' ');
7813 else
7814 {
7815 CHECK_CHARACTER (val);
7816 CODING_ATTR_DEFAULT_CHAR (attrs) = val;
7817 }
7818
7819 val = args[coding_arg_plist];
7820 CHECK_LIST (val);
7821 CODING_ATTR_PLIST (attrs) = val;
7822
7823 if (EQ (coding_type, Qcharset))
7824 {
7825 /* Generate a lisp vector of 256 elements. Each element is nil,
7826 integer, or a list of charset IDs.
7827
7828 If Nth element is nil, the byte code N is invalid in this
7829 coding system.
7830
7831 If Nth element is a number NUM, N is the first byte of a
7832 charset whose ID is NUM.
7833
7834 If Nth element is a list of charset IDs, N is the first byte
7835 of one of them. The list is sorted by dimensions of the
7836 charsets. A charset of smaller dimension comes firtst.
7837 */
7838 val = Fmake_vector (make_number (256), Qnil);
7839
7840 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
7841 {
7842 struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
7843 int dim = CHARSET_DIMENSION (charset);
7844 int idx = (dim - 1) * 4;
7845
7846 if (CHARSET_ASCII_COMPATIBLE_P (charset))
7847 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
7848
7849 for (i = charset->code_space[idx];
7850 i <= charset->code_space[idx + 1]; i++)
7851 {
7852 Lisp_Object tmp, tmp2;
7853 int dim2;
7854
7855 tmp = AREF (val, i);
7856 if (NILP (tmp))
7857 tmp = XCAR (tail);
7858 else if (NUMBERP (tmp))
7859 {
7860 dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
7861 if (dim < dim2)
7862 tmp = Fcons (XCAR (tail), Fcons (tmp, Qnil));
7863 else
7864 tmp = Fcons (tmp, Fcons (XCAR (tail), Qnil));
7865 }
7866 else
7867 {
7868 for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
7869 {
7870 dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
7871 if (dim < dim2)
7872 break;
7873 }
7874 if (NILP (tmp2))
7875 tmp = nconc2 (tmp, Fcons (XCAR (tail), Qnil));
7876 else
7877 {
7878 XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
7879 XSETCAR (tmp2, XCAR (tail));
7880 }
7881 }
7882 ASET (val, i, tmp);
7883 }
7884 }
7885 ASET (attrs, coding_attr_charset_valids, val);
7886 category = coding_category_charset;
7887 }
7888 else if (EQ (coding_type, Qccl))
7889 {
7890 Lisp_Object valids;
7891
7892 if (nargs < coding_arg_ccl_max)
7893 goto short_args;
7894
7895 val = args[coding_arg_ccl_decoder];
7896 CHECK_CCL_PROGRAM (val);
7897 if (VECTORP (val))
7898 val = Fcopy_sequence (val);
7899 ASET (attrs, coding_attr_ccl_decoder, val);
7900
7901 val = args[coding_arg_ccl_encoder];
7902 CHECK_CCL_PROGRAM (val);
7903 if (VECTORP (val))
7904 val = Fcopy_sequence (val);
7905 ASET (attrs, coding_attr_ccl_encoder, val);
7906
7907 val = args[coding_arg_ccl_valids];
7908 valids = Fmake_string (make_number (256), make_number (0));
7909 for (tail = val; !NILP (tail); tail = Fcdr (tail))
7910 {
7911 int from, to;
7912
7913 val = Fcar (tail);
7914 if (INTEGERP (val))
7915 {
7916 from = to = XINT (val);
7917 if (from < 0 || from > 255)
7918 args_out_of_range_3 (val, make_number (0), make_number (255));
7919 }
7920 else
7921 {
7922 CHECK_CONS (val);
7923 CHECK_NUMBER (XCAR (val));
7924 CHECK_NUMBER (XCDR (val));
7925 from = XINT (XCAR (val));
7926 if (from < 0 || from > 255)
7927 args_out_of_range_3 (XCAR (val),
7928 make_number (0), make_number (255));
7929 to = XINT (XCDR (val));
7930 if (to < from || to > 255)
7931 args_out_of_range_3 (XCDR (val),
7932 XCAR (val), make_number (255));
7933 }
7934 for (i = from; i <= to; i++)
7935 XSTRING (valids)->data[i] = 1;
7936 }
7937 ASET (attrs, coding_attr_ccl_valids, valids);
7938
7939 category = coding_category_ccl;
7940 }
7941 else if (EQ (coding_type, Qutf_16))
7942 {
7943 Lisp_Object bom, endian;
7944
7945 CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
7946
7947 if (nargs < coding_arg_utf16_max)
7948 goto short_args;
7949
7950 bom = args[coding_arg_utf16_bom];
7951 if (! NILP (bom) && ! EQ (bom, Qt))
7952 {
7953 CHECK_CONS (bom);
7954 CHECK_CODING_SYSTEM (XCAR (bom));
7955 CHECK_CODING_SYSTEM (XCDR (bom));
7956 }
7957 ASET (attrs, coding_attr_utf_16_bom, bom);
7958
7959 endian = args[coding_arg_utf16_endian];
7960 ASET (attrs, coding_attr_utf_16_endian, endian);
7961
7962 category = (CONSP (bom)
7963 ? coding_category_utf_16_auto
7964 : NILP (bom)
7965 ? (NILP (endian)
7966 ? coding_category_utf_16_be_nosig
7967 : coding_category_utf_16_le_nosig)
7968 : (NILP (endian)
7969 ? coding_category_utf_16_be
7970 : coding_category_utf_16_le));
7971 }
7972 else if (EQ (coding_type, Qiso_2022))
7973 {
7974 Lisp_Object initial, reg_usage, request, flags;
7975 int i, id;
7976
7977 if (nargs < coding_arg_iso2022_max)
7978 goto short_args;
7979
7980 initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
7981 CHECK_VECTOR (initial);
7982 for (i = 0; i < 4; i++)
7983 {
7984 val = Faref (initial, make_number (i));
7985 if (! NILP (val))
7986 {
7987 struct charset *charset;
7988
7989 CHECK_CHARSET_GET_CHARSET (val, charset);
7990 ASET (initial, i, make_number (CHARSET_ID (charset)));
7991 if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
7992 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
7993 }
7994 else
7995 ASET (initial, i, make_number (-1));
7996 }
7997
7998 reg_usage = args[coding_arg_iso2022_reg_usage];
7999 CHECK_CONS (reg_usage);
8000 CHECK_NATNUM (XCAR (reg_usage));
8001 CHECK_NATNUM (XCDR (reg_usage));
8002
8003 request = Fcopy_sequence (args[coding_arg_iso2022_request]);
8004 for (tail = request; ! NILP (tail); tail = Fcdr (tail))
8005 {
8006 int id;
8007
8008 val = Fcar (tail);
8009 CHECK_CONS (val);
8010 CHECK_CHARSET_GET_ID (XCAR (val), id);
8011 CHECK_NATNUM (XCDR (val));
8012 if (XINT (XCDR (val)) >= 4)
8013 error ("Invalid graphic register number: %d", XINT (XCDR (val)));
8014 XCAR (val) = make_number (id);
8015 }
8016
8017 flags = args[coding_arg_iso2022_flags];
8018 CHECK_NATNUM (flags);
8019 i = XINT (flags);
8020 if (EQ (args[coding_arg_charset_list], Qiso_2022))
8021 flags = make_number (i | CODING_ISO_FLAG_FULL_SUPPORT);
8022
8023 ASET (attrs, coding_attr_iso_initial, initial);
8024 ASET (attrs, coding_attr_iso_usage, reg_usage);
8025 ASET (attrs, coding_attr_iso_request, request);
8026 ASET (attrs, coding_attr_iso_flags, flags);
8027 setup_iso_safe_charsets (attrs);
8028
8029 if (i & CODING_ISO_FLAG_SEVEN_BITS)
8030 category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
8031 | CODING_ISO_FLAG_SINGLE_SHIFT))
8032 ? coding_category_iso_7_else
8033 : EQ (args[coding_arg_charset_list], Qiso_2022)
8034 ? coding_category_iso_7
8035 : coding_category_iso_7_tight);
8036 else
8037 {
8038 int id = XINT (AREF (initial, 1));
8039
8040 category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
8041 || EQ (args[coding_arg_charset_list], Qiso_2022)
8042 || id < 0)
8043 ? coding_category_iso_8_else
8044 : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
8045 ? coding_category_iso_8_1
8046 : coding_category_iso_8_2);
8047 }
8048 if (category != coding_category_iso_8_1
8049 && category != coding_category_iso_8_2)
8050 CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
8051 }
8052 else if (EQ (coding_type, Qemacs_mule))
8053 {
8054 if (EQ (args[coding_arg_charset_list], Qemacs_mule))
8055 ASET (attrs, coding_attr_emacs_mule_full, Qt);
8056 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8057 category = coding_category_emacs_mule;
8058 }
8059 else if (EQ (coding_type, Qshift_jis))
8060 {
8061
8062 struct charset *charset;
8063
8064 if (XINT (Flength (charset_list)) != 3)
8065 error ("There should be just three charsets");
8066
8067 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8068 if (CHARSET_DIMENSION (charset) != 1)
8069 error ("Dimension of charset %s is not one",
8070 XSYMBOL (CHARSET_NAME (charset))->name->data);
8071 if (CHARSET_ASCII_COMPATIBLE_P (charset))
8072 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8073
8074 charset_list = XCDR (charset_list);
8075 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8076 if (CHARSET_DIMENSION (charset) != 1)
8077 error ("Dimension of charset %s is not one",
8078 XSYMBOL (CHARSET_NAME (charset))->name->data);
8079
8080 charset_list = XCDR (charset_list);
8081 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8082 if (CHARSET_DIMENSION (charset) != 2)
8083 error ("Dimension of charset %s is not two",
8084 XSYMBOL (CHARSET_NAME (charset))->name->data);
8085
8086 category = coding_category_sjis;
8087 Vsjis_coding_system = name;
8088 }
8089 else if (EQ (coding_type, Qbig5))
8090 {
8091 struct charset *charset;
8092
8093 if (XINT (Flength (charset_list)) != 2)
8094 error ("There should be just two charsets");
8095
8096 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8097 if (CHARSET_DIMENSION (charset) != 1)
8098 error ("Dimension of charset %s is not one",
8099 XSYMBOL (CHARSET_NAME (charset))->name->data);
8100 if (CHARSET_ASCII_COMPATIBLE_P (charset))
8101 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8102
8103 charset_list = XCDR (charset_list);
8104 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8105 if (CHARSET_DIMENSION (charset) != 2)
8106 error ("Dimension of charset %s is not two",
8107 XSYMBOL (CHARSET_NAME (charset))->name->data);
8108
8109 category = coding_category_big5;
8110 Vbig5_coding_system = name;
8111 }
8112 else if (EQ (coding_type, Qraw_text))
8113 {
8114 category = coding_category_raw_text;
8115 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8116 }
8117 else if (EQ (coding_type, Qutf_8))
8118 {
8119 category = coding_category_utf_8;
8120 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8121 }
8122 else if (EQ (coding_type, Qundecided))
8123 category = coding_category_undecided;
8124 else
8125 error ("Invalid coding system type: %s",
8126 XSYMBOL (coding_type)->name->data);
8127
8128 CODING_ATTR_CATEGORY (attrs) = make_number (category);
8129
8130 eol_type = args[coding_arg_eol_type];
8131 if (! NILP (eol_type)
8132 && ! EQ (eol_type, Qunix)
8133 && ! EQ (eol_type, Qdos)
8134 && ! EQ (eol_type, Qmac))
8135 error ("Invalid eol-type");
8136
8137 aliases = Fcons (name, Qnil);
8138
8139 if (NILP (eol_type))
8140 {
8141 eol_type = make_subsidiaries (name);
8142 for (i = 0; i < 3; i++)
8143 {
8144 Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
8145
8146 this_name = AREF (eol_type, i);
8147 this_aliases = Fcons (this_name, Qnil);
8148 this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
8149 this_spec = Fmake_vector (make_number (3), attrs);
8150 ASET (this_spec, 1, this_aliases);
8151 ASET (this_spec, 2, this_eol_type);
8152 Fputhash (this_name, this_spec, Vcoding_system_hash_table);
8153 Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
8154 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
8155 Vcoding_system_alist);
8156 }
8157 }
8158
8159 spec_vec = Fmake_vector (make_number (3), attrs);
8160 ASET (spec_vec, 1, aliases);
8161 ASET (spec_vec, 2, eol_type);
8162
8163 Fputhash (name, spec_vec, Vcoding_system_hash_table);
8164 Vcoding_system_list = Fcons (name, Vcoding_system_list);
8165 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
8166 Vcoding_system_alist);
8167
8168 {
8169 int id = coding_categories[category].id;
8170
8171 if (id < 0 || EQ (name, CODING_ID_NAME (id)))
8172 setup_coding_system (name, &coding_categories[category]);
8173 }
8174
8175 return Qnil;
8176
8177 short_args:
8178 return Fsignal (Qwrong_number_of_arguments,
8179 Fcons (intern ("define-coding-system-internal"),
8180 make_number (nargs)));
8181 }
8182
8183 /* Fixme: should this record the alias relationships for
8184 diagnostics? Should it update coding-system-list? */
8185 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
8186 Sdefine_coding_system_alias, 2, 2, 0,
8187 doc: /* Define ALIAS as an alias for CODING-SYSTEM. */)
8188 (alias, coding_system)
8189 Lisp_Object alias, coding_system;
8190 {
8191 Lisp_Object spec, aliases, eol_type;
8192
8193 CHECK_SYMBOL (alias);
8194 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
8195 aliases = AREF (spec, 1);
8196 while (!NILP (XCDR (aliases)))
8197 aliases = XCDR (aliases);
8198 XCDR (aliases) = Fcons (alias, Qnil);
8199
8200 eol_type = AREF (spec, 2);
8201 if (VECTORP (eol_type))
8202 {
8203 Lisp_Object subsidiaries;
8204 int i;
8205
8206 subsidiaries = make_subsidiaries (alias);
8207 for (i = 0; i < 3; i++)
8208 Fdefine_coding_system_alias (AREF (subsidiaries, i),
8209 AREF (eol_type, i));
8210
8211 ASET (spec, 2, subsidiaries);
8212 }
8213
8214 Fputhash (alias, spec, Vcoding_system_hash_table);
8215 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
8216 Vcoding_system_alist);
8217
8218 return Qnil;
8219 }
8220
8221 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
8222 1, 1, 0,
8223 doc: /* Return the base of CODING-SYSTEM.
8224 Any alias or subsidiary coding system is not a base coding system. */)
8225 (coding_system)
8226 Lisp_Object coding_system;
8227 {
8228 Lisp_Object spec, attrs;
8229
8230 if (NILP (coding_system))
8231 return (Qno_conversion);
8232 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
8233 attrs = AREF (spec, 0);
8234 return CODING_ATTR_BASE_NAME (attrs);
8235 }
8236
8237 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
8238 1, 1, 0,
8239 doc: "Return the property list of CODING-SYSTEM.")
8240 (coding_system)
8241 Lisp_Object coding_system;
8242 {
8243 Lisp_Object spec, attrs;
8244
8245 if (NILP (coding_system))
8246 coding_system = Qno_conversion;
8247 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
8248 attrs = AREF (spec, 0);
8249 return CODING_ATTR_PLIST (attrs);
8250 }
8251
8252
8253 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
8254 1, 1, 0,
8255 doc: /* Return the list of aliases of CODING-SYSTEM. */)
8256 (coding_system)
8257 Lisp_Object coding_system;
8258 {
8259 Lisp_Object spec;
8260
8261 if (NILP (coding_system))
8262 coding_system = Qno_conversion;
8263 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
8264 return AREF (spec, 1);
8265 }
8266
8267 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
8268 Scoding_system_eol_type, 1, 1, 0,
8269 doc: /* Return eol-type of CODING-SYSTEM.
8270 An eol-type is integer 0, 1, 2, or a vector of coding systems.
8271
8272 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
8273 and CR respectively.
8274
8275 A vector value indicates that a format of end-of-line should be
8276 detected automatically. Nth element of the vector is the subsidiary
8277 coding system whose eol-type is N. */)
8278 (coding_system)
8279 Lisp_Object coding_system;
8280 {
8281 Lisp_Object spec, eol_type;
8282 int n;
8283
8284 if (NILP (coding_system))
8285 coding_system = Qno_conversion;
8286 if (! CODING_SYSTEM_P (coding_system))
8287 return Qnil;
8288 spec = CODING_SYSTEM_SPEC (coding_system);
8289 eol_type = AREF (spec, 2);
8290 if (VECTORP (eol_type))
8291 return Fcopy_sequence (eol_type);
8292 n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
8293 return make_number (n);
8294 }
8295
8296 #endif /* emacs */
8297
8298 \f
8299 /*** 9. Post-amble ***/
8300
8301 void
8302 init_coding_once ()
8303 {
8304 int i;
8305
8306 for (i = 0; i < coding_category_max; i++)
8307 {
8308 coding_categories[i].id = -1;
8309 coding_priorities[i] = i;
8310 }
8311
8312 /* ISO2022 specific initialize routine. */
8313 for (i = 0; i < 0x20; i++)
8314 iso_code_class[i] = ISO_control_0;
8315 for (i = 0x21; i < 0x7F; i++)
8316 iso_code_class[i] = ISO_graphic_plane_0;
8317 for (i = 0x80; i < 0xA0; i++)
8318 iso_code_class[i] = ISO_control_1;
8319 for (i = 0xA1; i < 0xFF; i++)
8320 iso_code_class[i] = ISO_graphic_plane_1;
8321 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
8322 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
8323 iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
8324 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
8325 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
8326 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
8327 iso_code_class[ISO_CODE_ESC] = ISO_escape;
8328 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
8329 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
8330 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
8331
8332 inhibit_pre_post_conversion = 0;
8333
8334 for (i = 0; i < 256; i++)
8335 {
8336 emacs_mule_bytes[i] = 1;
8337 }
8338 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
8339 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
8340 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
8341 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
8342 }
8343
8344 #ifdef emacs
8345
8346 void
8347 syms_of_coding ()
8348 {
8349 staticpro (&Vcoding_system_hash_table);
8350 Vcoding_system_hash_table = Fmakehash (Qeq);
8351
8352 staticpro (&Vsjis_coding_system);
8353 Vsjis_coding_system = Qnil;
8354
8355 staticpro (&Vbig5_coding_system);
8356 Vbig5_coding_system = Qnil;
8357
8358 staticpro (&Vcode_conversion_work_buf_list);
8359 Vcode_conversion_work_buf_list = Qnil;
8360
8361 staticpro (&Vcode_conversion_reused_work_buf);
8362 Vcode_conversion_reused_work_buf = Qnil;
8363
8364 DEFSYM (Qcharset, "charset");
8365 DEFSYM (Qtarget_idx, "target-idx");
8366 DEFSYM (Qcoding_system_history, "coding-system-history");
8367 Fset (Qcoding_system_history, Qnil);
8368
8369 /* Target FILENAME is the first argument. */
8370 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
8371 /* Target FILENAME is the third argument. */
8372 Fput (Qwrite_region, Qtarget_idx, make_number (2));
8373
8374 DEFSYM (Qcall_process, "call-process");
8375 /* Target PROGRAM is the first argument. */
8376 Fput (Qcall_process, Qtarget_idx, make_number (0));
8377
8378 DEFSYM (Qcall_process_region, "call-process-region");
8379 /* Target PROGRAM is the third argument. */
8380 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
8381
8382 DEFSYM (Qstart_process, "start-process");
8383 /* Target PROGRAM is the third argument. */
8384 Fput (Qstart_process, Qtarget_idx, make_number (2));
8385
8386 DEFSYM (Qopen_network_stream, "open-network-stream");
8387 /* Target SERVICE is the fourth argument. */
8388 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
8389
8390 DEFSYM (Qcoding_system, "coding-system");
8391 DEFSYM (Qcoding_aliases, "coding-aliases");
8392
8393 DEFSYM (Qeol_type, "eol-type");
8394 DEFSYM (Qunix, "unix");
8395 DEFSYM (Qdos, "dos");
8396
8397 DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
8398 DEFSYM (Qpost_read_conversion, "post-read-conversion");
8399 DEFSYM (Qpre_write_conversion, "pre-write-conversion");
8400 DEFSYM (Qdefault_char, "default-char");
8401 DEFSYM (Qundecided, "undecided");
8402 DEFSYM (Qno_conversion, "no-conversion");
8403 DEFSYM (Qraw_text, "raw-text");
8404
8405 DEFSYM (Qiso_2022, "iso-2022");
8406
8407 DEFSYM (Qutf_8, "utf-8");
8408
8409 DEFSYM (Qutf_16, "utf-16");
8410 DEFSYM (Qsignature, "signature");
8411 DEFSYM (Qendian, "endian");
8412 DEFSYM (Qbig, "big");
8413 DEFSYM (Qlittle, "little");
8414
8415 DEFSYM (Qshift_jis, "shift-jis");
8416 DEFSYM (Qbig5, "big5");
8417
8418 DEFSYM (Qcoding_system_p, "coding-system-p");
8419
8420 DEFSYM (Qcoding_system_error, "coding-system-error");
8421 Fput (Qcoding_system_error, Qerror_conditions,
8422 Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
8423 Fput (Qcoding_system_error, Qerror_message,
8424 build_string ("Invalid coding system"));
8425
8426 /* Intern this now in case it isn't already done.
8427 Setting this variable twice is harmless.
8428 But don't staticpro it here--that is done in alloc.c. */
8429 Qchar_table_extra_slots = intern ("char-table-extra-slots");
8430
8431 DEFSYM (Qtranslation_table, "translation-table");
8432 Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (1));
8433 DEFSYM (Qtranslation_table_id, "translation-table-id");
8434 DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
8435 DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
8436
8437 DEFSYM (Qvalid_codes, "valid-codes");
8438
8439 DEFSYM (Qemacs_mule, "emacs-mule");
8440
8441 Vcoding_category_table
8442 = Fmake_vector (make_number (coding_category_max), Qnil);
8443 staticpro (&Vcoding_category_table);
8444 /* Followings are target of code detection. */
8445 ASET (Vcoding_category_table, coding_category_iso_7,
8446 intern ("coding-category-iso-7"));
8447 ASET (Vcoding_category_table, coding_category_iso_7_tight,
8448 intern ("coding-category-iso-7-tight"));
8449 ASET (Vcoding_category_table, coding_category_iso_8_1,
8450 intern ("coding-category-iso-8-1"));
8451 ASET (Vcoding_category_table, coding_category_iso_8_2,
8452 intern ("coding-category-iso-8-2"));
8453 ASET (Vcoding_category_table, coding_category_iso_7_else,
8454 intern ("coding-category-iso-7-else"));
8455 ASET (Vcoding_category_table, coding_category_iso_8_else,
8456 intern ("coding-category-iso-8-else"));
8457 ASET (Vcoding_category_table, coding_category_utf_8,
8458 intern ("coding-category-utf-8"));
8459 ASET (Vcoding_category_table, coding_category_utf_16_be,
8460 intern ("coding-category-utf-16-be"));
8461 ASET (Vcoding_category_table, coding_category_utf_16_le,
8462 intern ("coding-category-utf-16-le"));
8463 ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
8464 intern ("coding-category-utf-16-be-nosig"));
8465 ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
8466 intern ("coding-category-utf-16-le-nosig"));
8467 ASET (Vcoding_category_table, coding_category_charset,
8468 intern ("coding-category-charset"));
8469 ASET (Vcoding_category_table, coding_category_sjis,
8470 intern ("coding-category-sjis"));
8471 ASET (Vcoding_category_table, coding_category_big5,
8472 intern ("coding-category-big5"));
8473 ASET (Vcoding_category_table, coding_category_ccl,
8474 intern ("coding-category-ccl"));
8475 ASET (Vcoding_category_table, coding_category_emacs_mule,
8476 intern ("coding-category-emacs-mule"));
8477 /* Followings are NOT target of code detection. */
8478 ASET (Vcoding_category_table, coding_category_raw_text,
8479 intern ("coding-category-raw-text"));
8480 ASET (Vcoding_category_table, coding_category_undecided,
8481 intern ("coding-category-undecided"));
8482
8483 defsubr (&Scoding_system_p);
8484 defsubr (&Sread_coding_system);
8485 defsubr (&Sread_non_nil_coding_system);
8486 defsubr (&Scheck_coding_system);
8487 defsubr (&Sdetect_coding_region);
8488 defsubr (&Sdetect_coding_string);
8489 defsubr (&Sfind_coding_systems_region_internal);
8490 defsubr (&Scheck_coding_systems_region);
8491 defsubr (&Sdecode_coding_region);
8492 defsubr (&Sencode_coding_region);
8493 defsubr (&Sdecode_coding_string);
8494 defsubr (&Sencode_coding_string);
8495 defsubr (&Sdecode_sjis_char);
8496 defsubr (&Sencode_sjis_char);
8497 defsubr (&Sdecode_big5_char);
8498 defsubr (&Sencode_big5_char);
8499 defsubr (&Sset_terminal_coding_system_internal);
8500 defsubr (&Sset_safe_terminal_coding_system_internal);
8501 defsubr (&Sterminal_coding_system);
8502 defsubr (&Sset_keyboard_coding_system_internal);
8503 defsubr (&Skeyboard_coding_system);
8504 defsubr (&Sfind_operation_coding_system);
8505 defsubr (&Sset_coding_system_priority);
8506 defsubr (&Sdefine_coding_system_internal);
8507 defsubr (&Sdefine_coding_system_alias);
8508 defsubr (&Scoding_system_base);
8509 defsubr (&Scoding_system_plist);
8510 defsubr (&Scoding_system_aliases);
8511 defsubr (&Scoding_system_eol_type);
8512 defsubr (&Scoding_system_priority_list);
8513
8514 DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
8515 doc: /* List of coding systems.
8516
8517 Do not alter the value of this variable manually. This variable should be
8518 updated by the functions `define-coding-system' and
8519 `define-coding-system-alias'. */);
8520 Vcoding_system_list = Qnil;
8521
8522 DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
8523 doc: /* Alist of coding system names.
8524 Each element is one element list of coding system name.
8525 This variable is given to `completing-read' as TABLE argument.
8526
8527 Do not alter the value of this variable manually. This variable should be
8528 updated by the functions `make-coding-system' and
8529 `define-coding-system-alias'. */);
8530 Vcoding_system_alist = Qnil;
8531
8532 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
8533 doc: /* List of coding-categories (symbols) ordered by priority.
8534
8535 On detecting a coding system, Emacs tries code detection algorithms
8536 associated with each coding-category one by one in this order. When
8537 one algorithm agrees with a byte sequence of source text, the coding
8538 system bound to the corresponding coding-category is selected. */);
8539 {
8540 int i;
8541
8542 Vcoding_category_list = Qnil;
8543 for (i = coding_category_max - 1; i >= 0; i--)
8544 Vcoding_category_list
8545 = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
8546 Vcoding_category_list);
8547 }
8548
8549 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
8550 doc: /* Specify the coding system for read operations.
8551 It is useful to bind this variable with `let', but do not set it globally.
8552 If the value is a coding system, it is used for decoding on read operation.
8553 If not, an appropriate element is used from one of the coding system alists:
8554 There are three such tables, `file-coding-system-alist',
8555 `process-coding-system-alist', and `network-coding-system-alist'. */);
8556 Vcoding_system_for_read = Qnil;
8557
8558 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
8559 doc: /* Specify the coding system for write operations.
8560 Programs bind this variable with `let', but you should not set it globally.
8561 If the value is a coding system, it is used for encoding of output,
8562 when writing it to a file and when sending it to a file or subprocess.
8563
8564 If this does not specify a coding system, an appropriate element
8565 is used from one of the coding system alists:
8566 There are three such tables, `file-coding-system-alist',
8567 `process-coding-system-alist', and `network-coding-system-alist'.
8568 For output to files, if the above procedure does not specify a coding system,
8569 the value of `buffer-file-coding-system' is used. */);
8570 Vcoding_system_for_write = Qnil;
8571
8572 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
8573 doc: /*
8574 Coding system used in the latest file or process I/O. */);
8575 Vlast_coding_system_used = Qnil;
8576
8577 DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
8578 doc: /*
8579 *Non-nil means always inhibit code conversion of end-of-line format.
8580 See info node `Coding Systems' and info node `Text and Binary' concerning
8581 such conversion. */);
8582 inhibit_eol_conversion = 0;
8583
8584 DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
8585 doc: /*
8586 Non-nil means process buffer inherits coding system of process output.
8587 Bind it to t if the process output is to be treated as if it were a file
8588 read from some filesystem. */);
8589 inherit_process_coding_system = 0;
8590
8591 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
8592 doc: /*
8593 Alist to decide a coding system to use for a file I/O operation.
8594 The format is ((PATTERN . VAL) ...),
8595 where PATTERN is a regular expression matching a file name,
8596 VAL is a coding system, a cons of coding systems, or a function symbol.
8597 If VAL is a coding system, it is used for both decoding and encoding
8598 the file contents.
8599 If VAL is a cons of coding systems, the car part is used for decoding,
8600 and the cdr part is used for encoding.
8601 If VAL is a function symbol, the function must return a coding system
8602 or a cons of coding systems which are used as above. The function gets
8603 the arguments with which `find-operation-coding-systems' was called.
8604
8605 See also the function `find-operation-coding-system'
8606 and the variable `auto-coding-alist'. */);
8607 Vfile_coding_system_alist = Qnil;
8608
8609 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
8610 doc: /*
8611 Alist to decide a coding system to use for a process I/O operation.
8612 The format is ((PATTERN . VAL) ...),
8613 where PATTERN is a regular expression matching a program name,
8614 VAL is a coding system, a cons of coding systems, or a function symbol.
8615 If VAL is a coding system, it is used for both decoding what received
8616 from the program and encoding what sent to the program.
8617 If VAL is a cons of coding systems, the car part is used for decoding,
8618 and the cdr part is used for encoding.
8619 If VAL is a function symbol, the function must return a coding system
8620 or a cons of coding systems which are used as above.
8621
8622 See also the function `find-operation-coding-system'. */);
8623 Vprocess_coding_system_alist = Qnil;
8624
8625 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
8626 doc: /*
8627 Alist to decide a coding system to use for a network I/O operation.
8628 The format is ((PATTERN . VAL) ...),
8629 where PATTERN is a regular expression matching a network service name
8630 or is a port number to connect to,
8631 VAL is a coding system, a cons of coding systems, or a function symbol.
8632 If VAL is a coding system, it is used for both decoding what received
8633 from the network stream and encoding what sent to the network stream.
8634 If VAL is a cons of coding systems, the car part is used for decoding,
8635 and the cdr part is used for encoding.
8636 If VAL is a function symbol, the function must return a coding system
8637 or a cons of coding systems which are used as above.
8638
8639 See also the function `find-operation-coding-system'. */);
8640 Vnetwork_coding_system_alist = Qnil;
8641
8642 DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
8643 doc: /* Coding system to use with system messages.
8644 Also used for decoding keyboard input on X Window system. */);
8645 Vlocale_coding_system = Qnil;
8646
8647 /* The eol mnemonics are reset in startup.el system-dependently. */
8648 DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
8649 doc: /*
8650 *String displayed in mode line for UNIX-like (LF) end-of-line format. */);
8651 eol_mnemonic_unix = build_string (":");
8652
8653 DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
8654 doc: /*
8655 *String displayed in mode line for DOS-like (CRLF) end-of-line format. */);
8656 eol_mnemonic_dos = build_string ("\\");
8657
8658 DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
8659 doc: /*
8660 *String displayed in mode line for MAC-like (CR) end-of-line format. */);
8661 eol_mnemonic_mac = build_string ("/");
8662
8663 DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
8664 doc: /*
8665 *String displayed in mode line when end-of-line format is not yet determined. */);
8666 eol_mnemonic_undecided = build_string (":");
8667
8668 DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
8669 doc: /*
8670 *Non-nil enables character translation while encoding and decoding. */);
8671 Venable_character_translation = Qt;
8672
8673 DEFVAR_LISP ("standard-translation-table-for-decode",
8674 &Vstandard_translation_table_for_decode,
8675 doc: /* Table for translating characters while decoding. */);
8676 Vstandard_translation_table_for_decode = Qnil;
8677
8678 DEFVAR_LISP ("standard-translation-table-for-encode",
8679 &Vstandard_translation_table_for_encode,
8680 doc: /* Table for translating characters while encoding. */);
8681 Vstandard_translation_table_for_encode = Qnil;
8682
8683 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_table,
8684 doc: /* Alist of charsets vs revision numbers.
8685 While encoding, if a charset (car part of an element) is found,
8686 designate it with the escape sequence identifying revision (cdr part
8687 of the element). */);
8688 Vcharset_revision_table = Qnil;
8689
8690 DEFVAR_LISP ("default-process-coding-system",
8691 &Vdefault_process_coding_system,
8692 doc: /* Cons of coding systems used for process I/O by default.
8693 The car part is used for decoding a process output,
8694 the cdr part is used for encoding a text to be sent to a process. */);
8695 Vdefault_process_coding_system = Qnil;
8696
8697 DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
8698 doc: /*
8699 Table of extra Latin codes in the range 128..159 (inclusive).
8700 This is a vector of length 256.
8701 If Nth element is non-nil, the existence of code N in a file
8702 \(or output of subprocess) doesn't prevent it to be detected as
8703 a coding system of ISO 2022 variant which has a flag
8704 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
8705 or reading output of a subprocess.
8706 Only 128th through 159th elements has a meaning. */);
8707 Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
8708
8709 DEFVAR_LISP ("select-safe-coding-system-function",
8710 &Vselect_safe_coding_system_function,
8711 doc: /*
8712 Function to call to select safe coding system for encoding a text.
8713
8714 If set, this function is called to force a user to select a proper
8715 coding system which can encode the text in the case that a default
8716 coding system used in each operation can't encode the text.
8717
8718 The default value is `select-safe-coding-system' (which see). */);
8719 Vselect_safe_coding_system_function = Qnil;
8720
8721 DEFVAR_BOOL ("inhibit-iso-escape-detection",
8722 &inhibit_iso_escape_detection,
8723 doc: /*
8724 If non-nil, Emacs ignores ISO2022's escape sequence on code detection.
8725
8726 By default, on reading a file, Emacs tries to detect how the text is
8727 encoded. This code detection is sensitive to escape sequences. If
8728 the sequence is valid as ISO2022, the code is determined as one of
8729 the ISO2022 encodings, and the file is decoded by the corresponding
8730 coding system (e.g. `iso-2022-7bit').
8731
8732 However, there may be a case that you want to read escape sequences in
8733 a file as is. In such a case, you can set this variable to non-nil.
8734 Then, as the code detection ignores any escape sequences, no file is
8735 detected as encoded in some ISO2022 encoding. The result is that all
8736 escape sequences become visible in a buffer.
8737
8738 The default value is nil, and it is strongly recommended not to change
8739 it. That is because many Emacs Lisp source files that contain
8740 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
8741 in Emacs's distribution, and they won't be decoded correctly on
8742 reading if you suppress escape sequence detection.
8743
8744 The other way to read escape sequences in a file without decoding is
8745 to explicitly specify some coding system that doesn't use ISO2022's
8746 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument]. */);
8747 inhibit_iso_escape_detection = 0;
8748
8749 {
8750 Lisp_Object args[coding_arg_max];
8751 Lisp_Object plist[14];
8752 int i;
8753
8754 for (i = 0; i < coding_arg_max; i++)
8755 args[i] = Qnil;
8756
8757 plist[0] = intern (":name");
8758 plist[1] = args[coding_arg_name] = Qno_conversion;
8759 plist[2] = intern (":mnemonic");
8760 plist[3] = args[coding_arg_mnemonic] = make_number ('=');
8761 plist[4] = intern (":coding-type");
8762 plist[5] = args[coding_arg_coding_type] = Qraw_text;
8763 plist[6] = intern (":ascii-compatible-p");
8764 plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
8765 plist[8] = intern (":default-char");
8766 plist[9] = args[coding_arg_default_char] = make_number (0);
8767 plist[10] = intern (":docstring");
8768 plist[11] = build_string ("Do no conversion.\n\
8769 \n\
8770 When you visit a file with this coding, the file is read into a\n\
8771 unibyte buffer as is, thus each byte of a file is treated as a\n\
8772 character.");
8773 plist[12] = intern (":eol-type");
8774 plist[13] = args[coding_arg_eol_type] = Qunix;
8775 args[coding_arg_plist] = Flist (14, plist);
8776 Fdefine_coding_system_internal (coding_arg_max, args);
8777 }
8778
8779 setup_coding_system (Qno_conversion, &keyboard_coding);
8780 setup_coding_system (Qno_conversion, &terminal_coding);
8781 setup_coding_system (Qno_conversion, &safe_terminal_coding);
8782 }
8783
8784 char *
8785 emacs_strerror (error_number)
8786 int error_number;
8787 {
8788 char *str;
8789
8790 synchronize_system_messages_locale ();
8791 str = strerror (error_number);
8792
8793 if (! NILP (Vlocale_coding_system))
8794 {
8795 Lisp_Object dec = code_convert_string_norecord (build_string (str),
8796 Vlocale_coding_system,
8797 0);
8798 str = (char *) XSTRING (dec)->data;
8799 }
8800
8801 return str;
8802 }
8803
8804 #endif /* emacs */