(Qutf_16_be_nosig, Qutf_16_be, Qutf_16_le_nosig)
[bpt/emacs.git] / src / coding.c
1 /* Coding system handler (conversion, detection, etc).
2 Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
3 Licensed to the Free Software Foundation.
4 Copyright (C) 2001, 2002 Free Software Foundation, Inc.
5 Copyright (C) 2001, 2002
6 National Institute of Advanced Industrial Science and Technology (AIST)
7 Registration Number H13PRO009
8
9 This file is part of GNU Emacs.
10
11 GNU Emacs is free software; you can redistribute it and/or modify
12 it under the terms of the GNU General Public License as published by
13 the Free Software Foundation; either version 2, or (at your option)
14 any later version.
15
16 GNU Emacs is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 GNU General Public License for more details.
20
21 You should have received a copy of the GNU General Public License
22 along with GNU Emacs; see the file COPYING. If not, write to
23 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24 Boston, MA 02111-1307, USA. */
25
26 /*** TABLE OF CONTENTS ***
27
28 0. General comments
29 1. Preamble
30 2. Emacs' internal format (emacs-utf-8) handlers
31 3. UTF-8 handlers
32 4. UTF-16 handlers
33 5. Charset-base coding systems handlers
34 6. emacs-mule (old Emacs' internal format) handlers
35 7. ISO2022 handlers
36 8. Shift-JIS and BIG5 handlers
37 9. CCL handlers
38 10. C library functions
39 11. Emacs Lisp library functions
40 12. Postamble
41
42 */
43
44 /*** 0. General comments ***
45
46
47 CODING SYSTEM
48
49 A coding system is an object for an encoding mechanism that contains
50 information about how to convert byte sequences to character
51 sequences and vice versa. When we say "decode", it means converting
52 a byte sequence of a specific coding system into a character
53 sequence that is represented by Emacs' internal coding system
54 `emacs-utf-8', and when we say "encode", it means converting a
55 character sequence of emacs-utf-8 to a byte sequence of a specific
56 coding system.
57
58 In Emacs Lisp, a coding system is represented by a Lisp symbol. In
59 C level, a coding system is represented by a vector of attributes
60 stored in the hash table Vcharset_hash_table. The conversion from
61 coding system symbol to attributes vector is done by looking up
62 Vcharset_hash_table by the symbol.
63
64 Coding systems are classified into the following types depending on
65 the encoding mechanism. Here's a brief description of the types.
66
67 o UTF-8
68
69 o UTF-16
70
71 o Charset-base coding system
72
73 A coding system defined by one or more (coded) character sets.
74 Decoding and encoding are done by a code converter defined for each
75 character set.
76
77 o Old Emacs internal format (emacs-mule)
78
79 The coding system adopted by old versions of Emacs (20 and 21).
80
81 o ISO2022-base coding system
82
83 The most famous coding system for multiple character sets. X's
84 Compound Text, various EUCs (Extended Unix Code), and coding systems
85 used in the Internet communication such as ISO-2022-JP are all
86 variants of ISO2022.
87
88 o SJIS (or Shift-JIS or MS-Kanji-Code)
89
90 A coding system to encode character sets: ASCII, JISX0201, and
91 JISX0208. Widely used for PC's in Japan. Details are described in
92 section 8.
93
94 o BIG5
95
96 A coding system to encode character sets: ASCII and Big5. Widely
97 used for Chinese (mainly in Taiwan and Hong Kong). Details are
98 described in section 8. In this file, when we write "big5" (all
99 lowercase), we mean the coding system, and when we write "Big5"
100 (capitalized), we mean the character set.
101
102 o CCL
103
104 If a user wants to decode/encode text encoded in a coding system
105 not listed above, he can supply a decoder and an encoder for it in
106 CCL (Code Conversion Language) programs. Emacs executes the CCL
107 program while decoding/encoding.
108
109 o Raw-text
110
111 A coding system for text containing raw eight-bit data. Emacs
112 treats each byte of source text as a character (except for
113 end-of-line conversion).
114
115 o No-conversion
116
117 Like raw text, but don't do end-of-line conversion.
118
119
120 END-OF-LINE FORMAT
121
122 How text end-of-line is encoded depends on operating system. For
123 instance, Unix's format is just one byte of LF (line-feed) code,
124 whereas DOS's format is two-byte sequence of `carriage-return' and
125 `line-feed' codes. MacOS's format is usually one byte of
126 `carriage-return'.
127
128 Since text character encoding and end-of-line encoding are
129 independent, any coding system described above can take any format
130 of end-of-line (except for no-conversion).
131
132 STRUCT CODING_SYSTEM
133
134 Before using a coding system for code conversion (i.e. decoding and
135 encoding), we setup a structure of type `struct coding_system'.
136 This structure keeps various information about a specific code
137 conversion (e.g. the location of source and destination data).
138
139 */
140
141 /* COMMON MACROS */
142
143
144 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
145
146 These functions check if a byte sequence specified as a source in
147 CODING conforms to the format of XXX, and update the members of
148 DETECT_INFO.
149
150 Return 1 if the byte sequence conforms to XXX, otherwise return 0.
151
152 Below is the template of these functions. */
153
154 #if 0
155 static int
156 detect_coding_XXX (coding, detect_info)
157 struct coding_system *coding;
158 struct coding_detection_info *detect_info;
159 {
160 unsigned char *src = coding->source;
161 unsigned char *src_end = coding->source + coding->src_bytes;
162 int multibytep = coding->src_multibyte;
163 int consumed_chars = 0;
164 int found = 0;
165 ...;
166
167 while (1)
168 {
169 /* Get one byte from the source. If the souce is exausted, jump
170 to no_more_source:. */
171 ONE_MORE_BYTE (c);
172
173 if (! __C_conforms_to_XXX___ (c))
174 break;
175 if (! __C_strongly_suggests_XXX__ (c))
176 found = CATEGORY_MASK_XXX;
177 }
178 /* The byte sequence is invalid for XXX. */
179 detect_info->rejected |= CATEGORY_MASK_XXX;
180 return 0;
181
182 no_more_source:
183 /* The source exausted successfully. */
184 detect_info->found |= found;
185 return 1;
186 }
187 #endif
188
189 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
190
191 These functions decode a byte sequence specified as a source by
192 CODING. The resulting multibyte text goes to a place pointed to by
193 CODING->charbuf, the length of which should not exceed
194 CODING->charbuf_size;
195
196 These functions set the information of original and decoded texts in
197 CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
198 They also set CODING->result to one of CODING_RESULT_XXX indicating
199 how the decoding is finished.
200
201 Below is the template of these functions. */
202
203 #if 0
204 static void
205 decode_coding_XXXX (coding)
206 struct coding_system *coding;
207 {
208 unsigned char *src = coding->source + coding->consumed;
209 unsigned char *src_end = coding->source + coding->src_bytes;
210 /* SRC_BASE remembers the start position in source in each loop.
211 The loop will be exited when there's not enough source code, or
212 when there's no room in CHARBUF for a decoded character. */
213 unsigned char *src_base;
214 /* A buffer to produce decoded characters. */
215 int *charbuf = coding->charbuf;
216 int *charbuf_end = charbuf + coding->charbuf_size;
217 int multibytep = coding->src_multibyte;
218
219 while (1)
220 {
221 src_base = src;
222 if (charbuf < charbuf_end)
223 /* No more room to produce a decoded character. */
224 break;
225 ONE_MORE_BYTE (c);
226 /* Decode it. */
227 }
228
229 no_more_source:
230 if (src_base < src_end
231 && coding->mode & CODING_MODE_LAST_BLOCK)
232 /* If the source ends by partial bytes to construct a character,
233 treat them as eight-bit raw data. */
234 while (src_base < src_end && charbuf < charbuf_end)
235 *charbuf++ = *src_base++;
236 /* Remember how many bytes and characters we consumed. If the
237 source is multibyte, the bytes and chars are not identical. */
238 coding->consumed = coding->consumed_char = src_base - coding->source;
239 /* Remember how many characters we produced. */
240 coding->charbuf_used = charbuf - coding->charbuf;
241 }
242 #endif
243
244 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
245
246 These functions encode SRC_BYTES length text at SOURCE of Emacs'
247 internal multibyte format by CODING. The resulting byte sequence
248 goes to a place pointed to by DESTINATION, the length of which
249 should not exceed DST_BYTES.
250
251 These functions set the information of original and encoded texts in
252 the members produced, produced_char, consumed, and consumed_char of
253 the structure *CODING. They also set the member result to one of
254 CODING_RESULT_XXX indicating how the encoding finished.
255
256 DST_BYTES zero means that source area and destination area are
257 overlapped, which means that we can produce a encoded text until it
258 reaches at the head of not-yet-encoded source text.
259
260 Below is a template of these functions. */
261 #if 0
262 static void
263 encode_coding_XXX (coding)
264 struct coding_system *coding;
265 {
266 int multibytep = coding->dst_multibyte;
267 int *charbuf = coding->charbuf;
268 int *charbuf_end = charbuf->charbuf + coding->charbuf_used;
269 unsigned char *dst = coding->destination + coding->produced;
270 unsigned char *dst_end = coding->destination + coding->dst_bytes;
271 unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
272 int produced_chars = 0;
273
274 for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
275 {
276 int c = *charbuf;
277 /* Encode C into DST, and increment DST. */
278 }
279 label_no_more_destination:
280 /* How many chars and bytes we produced. */
281 coding->produced_char += produced_chars;
282 coding->produced = dst - coding->destination;
283 }
284 #endif
285
286 \f
287 /*** 1. Preamble ***/
288
289 #include <config.h>
290 #include <stdio.h>
291
292 #include "lisp.h"
293 #include "buffer.h"
294 #include "character.h"
295 #include "charset.h"
296 #include "ccl.h"
297 #include "composite.h"
298 #include "coding.h"
299 #include "window.h"
300
301 Lisp_Object Vcoding_system_hash_table;
302
303 Lisp_Object Qcoding_system, Qcoding_aliases, Qeol_type;
304 Lisp_Object Qunix, Qdos;
305 extern Lisp_Object Qmac; /* frame.c */
306 Lisp_Object Qbuffer_file_coding_system;
307 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
308 Lisp_Object Qdefault_char;
309 Lisp_Object Qno_conversion, Qundecided;
310 Lisp_Object Qcharset, Qiso_2022, Qutf_8, Qutf_16, Qshift_jis, Qbig5;
311 Lisp_Object Qsignature, Qendian, Qbig, Qlittle;
312 Lisp_Object Qcoding_system_history;
313 Lisp_Object Qvalid_codes;
314
315 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
316 Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
317 Lisp_Object Qstart_process, Qopen_network_stream;
318 Lisp_Object Qtarget_idx;
319
320 Lisp_Object Vselect_safe_coding_system_function;
321
322 /* Mnemonic string for each format of end-of-line. */
323 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
324 /* Mnemonic string to indicate format of end-of-line is not yet
325 decided. */
326 Lisp_Object eol_mnemonic_undecided;
327
328 #ifdef emacs
329
330 Lisp_Object Vcoding_system_list, Vcoding_system_alist;
331
332 Lisp_Object Qcoding_system_p, Qcoding_system_error;
333
334 /* Coding system emacs-mule and raw-text are for converting only
335 end-of-line format. */
336 Lisp_Object Qemacs_mule, Qraw_text;
337
338 /* Coding-systems are handed between Emacs Lisp programs and C internal
339 routines by the following three variables. */
340 /* Coding-system for reading files and receiving data from process. */
341 Lisp_Object Vcoding_system_for_read;
342 /* Coding-system for writing files and sending data to process. */
343 Lisp_Object Vcoding_system_for_write;
344 /* Coding-system actually used in the latest I/O. */
345 Lisp_Object Vlast_coding_system_used;
346
347 /* A vector of length 256 which contains information about special
348 Latin codes (especially for dealing with Microsoft codes). */
349 Lisp_Object Vlatin_extra_code_table;
350
351 /* Flag to inhibit code conversion of end-of-line format. */
352 int inhibit_eol_conversion;
353
354 /* Flag to inhibit ISO2022 escape sequence detection. */
355 int inhibit_iso_escape_detection;
356
357 /* Flag to make buffer-file-coding-system inherit from process-coding. */
358 int inherit_process_coding_system;
359
360 /* Coding system to be used to encode text for terminal display. */
361 struct coding_system terminal_coding;
362
363 /* Coding system to be used to encode text for terminal display when
364 terminal coding system is nil. */
365 struct coding_system safe_terminal_coding;
366
367 /* Coding system of what is sent from terminal keyboard. */
368 struct coding_system keyboard_coding;
369
370 Lisp_Object Vfile_coding_system_alist;
371 Lisp_Object Vprocess_coding_system_alist;
372 Lisp_Object Vnetwork_coding_system_alist;
373
374 Lisp_Object Vlocale_coding_system;
375
376 #endif /* emacs */
377
378 /* Flag to tell if we look up translation table on character code
379 conversion. */
380 Lisp_Object Venable_character_translation;
381 /* Standard translation table to look up on decoding (reading). */
382 Lisp_Object Vstandard_translation_table_for_decode;
383 /* Standard translation table to look up on encoding (writing). */
384 Lisp_Object Vstandard_translation_table_for_encode;
385
386 Lisp_Object Qtranslation_table;
387 Lisp_Object Qtranslation_table_id;
388 Lisp_Object Qtranslation_table_for_decode;
389 Lisp_Object Qtranslation_table_for_encode;
390
391 /* Alist of charsets vs revision number. */
392 static Lisp_Object Vcharset_revision_table;
393
394 /* Default coding systems used for process I/O. */
395 Lisp_Object Vdefault_process_coding_system;
396
397 /* Global flag to tell that we can't call post-read-conversion and
398 pre-write-conversion functions. Usually the value is zero, but it
399 is set to 1 temporarily while such functions are running. This is
400 to avoid infinite recursive call. */
401 static int inhibit_pre_post_conversion;
402
403 /* Two special coding systems. */
404 Lisp_Object Vsjis_coding_system;
405 Lisp_Object Vbig5_coding_system;
406
407
408 static int detect_coding_utf_8 P_ ((struct coding_system *,
409 struct coding_detection_info *info));
410 static void decode_coding_utf_8 P_ ((struct coding_system *));
411 static int encode_coding_utf_8 P_ ((struct coding_system *));
412
413 static int detect_coding_utf_16 P_ ((struct coding_system *,
414 struct coding_detection_info *info));
415 static void decode_coding_utf_16 P_ ((struct coding_system *));
416 static int encode_coding_utf_16 P_ ((struct coding_system *));
417
418 static int detect_coding_iso_2022 P_ ((struct coding_system *,
419 struct coding_detection_info *info));
420 static void decode_coding_iso_2022 P_ ((struct coding_system *));
421 static int encode_coding_iso_2022 P_ ((struct coding_system *));
422
423 static int detect_coding_emacs_mule P_ ((struct coding_system *,
424 struct coding_detection_info *info));
425 static void decode_coding_emacs_mule P_ ((struct coding_system *));
426 static int encode_coding_emacs_mule P_ ((struct coding_system *));
427
428 static int detect_coding_sjis P_ ((struct coding_system *,
429 struct coding_detection_info *info));
430 static void decode_coding_sjis P_ ((struct coding_system *));
431 static int encode_coding_sjis P_ ((struct coding_system *));
432
433 static int detect_coding_big5 P_ ((struct coding_system *,
434 struct coding_detection_info *info));
435 static void decode_coding_big5 P_ ((struct coding_system *));
436 static int encode_coding_big5 P_ ((struct coding_system *));
437
438 static int detect_coding_ccl P_ ((struct coding_system *,
439 struct coding_detection_info *info));
440 static void decode_coding_ccl P_ ((struct coding_system *));
441 static int encode_coding_ccl P_ ((struct coding_system *));
442
443 static void decode_coding_raw_text P_ ((struct coding_system *));
444 static int encode_coding_raw_text P_ ((struct coding_system *));
445
446
447 /* ISO2022 section */
448
449 #define CODING_ISO_INITIAL(coding, reg) \
450 (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id), \
451 coding_attr_iso_initial), \
452 reg)))
453
454
455 #define CODING_ISO_REQUEST(coding, charset_id) \
456 ((charset_id <= (coding)->max_charset_id \
457 ? (coding)->safe_charsets[charset_id] \
458 : -1))
459
460
461 #define CODING_ISO_FLAGS(coding) \
462 ((coding)->spec.iso_2022.flags)
463 #define CODING_ISO_DESIGNATION(coding, reg) \
464 ((coding)->spec.iso_2022.current_designation[reg])
465 #define CODING_ISO_INVOCATION(coding, plane) \
466 ((coding)->spec.iso_2022.current_invocation[plane])
467 #define CODING_ISO_SINGLE_SHIFTING(coding) \
468 ((coding)->spec.iso_2022.single_shifting)
469 #define CODING_ISO_BOL(coding) \
470 ((coding)->spec.iso_2022.bol)
471 #define CODING_ISO_INVOKED_CHARSET(coding, plane) \
472 CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
473
474 /* Control characters of ISO2022. */
475 /* code */ /* function */
476 #define ISO_CODE_LF 0x0A /* line-feed */
477 #define ISO_CODE_CR 0x0D /* carriage-return */
478 #define ISO_CODE_SO 0x0E /* shift-out */
479 #define ISO_CODE_SI 0x0F /* shift-in */
480 #define ISO_CODE_SS2_7 0x19 /* single-shift-2 for 7-bit code */
481 #define ISO_CODE_ESC 0x1B /* escape */
482 #define ISO_CODE_SS2 0x8E /* single-shift-2 */
483 #define ISO_CODE_SS3 0x8F /* single-shift-3 */
484 #define ISO_CODE_CSI 0x9B /* control-sequence-introducer */
485
486 /* All code (1-byte) of ISO2022 is classified into one of the
487 followings. */
488 enum iso_code_class_type
489 {
490 ISO_control_0, /* Control codes in the range
491 0x00..0x1F and 0x7F, except for the
492 following 5 codes. */
493 ISO_carriage_return, /* ISO_CODE_CR (0x0D) */
494 ISO_shift_out, /* ISO_CODE_SO (0x0E) */
495 ISO_shift_in, /* ISO_CODE_SI (0x0F) */
496 ISO_single_shift_2_7, /* ISO_CODE_SS2_7 (0x19) */
497 ISO_escape, /* ISO_CODE_SO (0x1B) */
498 ISO_control_1, /* Control codes in the range
499 0x80..0x9F, except for the
500 following 3 codes. */
501 ISO_single_shift_2, /* ISO_CODE_SS2 (0x8E) */
502 ISO_single_shift_3, /* ISO_CODE_SS3 (0x8F) */
503 ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
504 ISO_0x20_or_0x7F, /* Codes of the values 0x20 or 0x7F. */
505 ISO_graphic_plane_0, /* Graphic codes in the range 0x21..0x7E. */
506 ISO_0xA0_or_0xFF, /* Codes of the values 0xA0 or 0xFF. */
507 ISO_graphic_plane_1 /* Graphic codes in the range 0xA1..0xFE. */
508 };
509
510 /** The macros CODING_ISO_FLAG_XXX defines a flag bit of the
511 `iso-flags' attribute of an iso2022 coding system. */
512
513 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
514 instead of the correct short-form sequence (e.g. ESC $ A). */
515 #define CODING_ISO_FLAG_LONG_FORM 0x0001
516
517 /* If set, reset graphic planes and registers at end-of-line to the
518 initial state. */
519 #define CODING_ISO_FLAG_RESET_AT_EOL 0x0002
520
521 /* If set, reset graphic planes and registers before any control
522 characters to the initial state. */
523 #define CODING_ISO_FLAG_RESET_AT_CNTL 0x0004
524
525 /* If set, encode by 7-bit environment. */
526 #define CODING_ISO_FLAG_SEVEN_BITS 0x0008
527
528 /* If set, use locking-shift function. */
529 #define CODING_ISO_FLAG_LOCKING_SHIFT 0x0010
530
531 /* If set, use single-shift function. Overwrite
532 CODING_ISO_FLAG_LOCKING_SHIFT. */
533 #define CODING_ISO_FLAG_SINGLE_SHIFT 0x0020
534
535 /* If set, use designation escape sequence. */
536 #define CODING_ISO_FLAG_DESIGNATION 0x0040
537
538 /* If set, produce revision number sequence. */
539 #define CODING_ISO_FLAG_REVISION 0x0080
540
541 /* If set, produce ISO6429's direction specifying sequence. */
542 #define CODING_ISO_FLAG_DIRECTION 0x0100
543
544 /* If set, assume designation states are reset at beginning of line on
545 output. */
546 #define CODING_ISO_FLAG_INIT_AT_BOL 0x0200
547
548 /* If set, designation sequence should be placed at beginning of line
549 on output. */
550 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
551
552 /* If set, do not encode unsafe charactes on output. */
553 #define CODING_ISO_FLAG_SAFE 0x0800
554
555 /* If set, extra latin codes (128..159) are accepted as a valid code
556 on input. */
557 #define CODING_ISO_FLAG_LATIN_EXTRA 0x1000
558
559 #define CODING_ISO_FLAG_COMPOSITION 0x2000
560
561 #define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000
562
563 #define CODING_ISO_FLAG_USE_ROMAN 0x8000
564
565 #define CODING_ISO_FLAG_USE_OLDJIS 0x10000
566
567 #define CODING_ISO_FLAG_FULL_SUPPORT 0x100000
568
569 /* A character to be produced on output if encoding of the original
570 character is prohibited by CODING_ISO_FLAG_SAFE. */
571 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION '?'
572
573
574 /* UTF-16 section */
575 #define CODING_UTF_16_BOM(coding) \
576 ((coding)->spec.utf_16.bom)
577
578 #define CODING_UTF_16_ENDIAN(coding) \
579 ((coding)->spec.utf_16.endian)
580
581 #define CODING_UTF_16_SURROGATE(coding) \
582 ((coding)->spec.utf_16.surrogate)
583
584
585 /* CCL section */
586 #define CODING_CCL_DECODER(coding) \
587 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
588 #define CODING_CCL_ENCODER(coding) \
589 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
590 #define CODING_CCL_VALIDS(coding) \
591 (XSTRING (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)) \
592 ->data)
593
594 /* Index for each coding category in `coding_categories' */
595
596 enum coding_category
597 {
598 coding_category_iso_7,
599 coding_category_iso_7_tight,
600 coding_category_iso_8_1,
601 coding_category_iso_8_2,
602 coding_category_iso_7_else,
603 coding_category_iso_8_else,
604 coding_category_utf_8,
605 coding_category_utf_16_auto,
606 coding_category_utf_16_be,
607 coding_category_utf_16_le,
608 coding_category_utf_16_be_nosig,
609 coding_category_utf_16_le_nosig,
610 coding_category_charset,
611 coding_category_sjis,
612 coding_category_big5,
613 coding_category_ccl,
614 coding_category_emacs_mule,
615 /* All above are targets of code detection. */
616 coding_category_raw_text,
617 coding_category_undecided,
618 coding_category_max
619 };
620
621 /* Definitions of flag bits used in detect_coding_XXXX. */
622 #define CATEGORY_MASK_ISO_7 (1 << coding_category_iso_7)
623 #define CATEGORY_MASK_ISO_7_TIGHT (1 << coding_category_iso_7_tight)
624 #define CATEGORY_MASK_ISO_8_1 (1 << coding_category_iso_8_1)
625 #define CATEGORY_MASK_ISO_8_2 (1 << coding_category_iso_8_2)
626 #define CATEGORY_MASK_ISO_7_ELSE (1 << coding_category_iso_7_else)
627 #define CATEGORY_MASK_ISO_8_ELSE (1 << coding_category_iso_8_else)
628 #define CATEGORY_MASK_UTF_8 (1 << coding_category_utf_8)
629 #define CATEGORY_MASK_UTF_16_BE (1 << coding_category_utf_16_be)
630 #define CATEGORY_MASK_UTF_16_LE (1 << coding_category_utf_16_le)
631 #define CATEGORY_MASK_UTF_16_BE_NOSIG (1 << coding_category_utf_16_be_nosig)
632 #define CATEGORY_MASK_UTF_16_LE_NOSIG (1 << coding_category_utf_16_le_nosig)
633 #define CATEGORY_MASK_CHARSET (1 << coding_category_charset)
634 #define CATEGORY_MASK_SJIS (1 << coding_category_sjis)
635 #define CATEGORY_MASK_BIG5 (1 << coding_category_big5)
636 #define CATEGORY_MASK_CCL (1 << coding_category_ccl)
637 #define CATEGORY_MASK_EMACS_MULE (1 << coding_category_emacs_mule)
638 #define CATEGORY_MASK_RAW_TEXT (1 << coding_category_raw_text)
639
640 /* This value is returned if detect_coding_mask () find nothing other
641 than ASCII characters. */
642 #define CATEGORY_MASK_ANY \
643 (CATEGORY_MASK_ISO_7 \
644 | CATEGORY_MASK_ISO_7_TIGHT \
645 | CATEGORY_MASK_ISO_8_1 \
646 | CATEGORY_MASK_ISO_8_2 \
647 | CATEGORY_MASK_ISO_7_ELSE \
648 | CATEGORY_MASK_ISO_8_ELSE \
649 | CATEGORY_MASK_UTF_8 \
650 | CATEGORY_MASK_UTF_16_BE \
651 | CATEGORY_MASK_UTF_16_LE \
652 | CATEGORY_MASK_UTF_16_BE_NOSIG \
653 | CATEGORY_MASK_UTF_16_LE_NOSIG \
654 | CATEGORY_MASK_CHARSET \
655 | CATEGORY_MASK_SJIS \
656 | CATEGORY_MASK_BIG5 \
657 | CATEGORY_MASK_CCL \
658 | CATEGORY_MASK_EMACS_MULE)
659
660
661 #define CATEGORY_MASK_ISO_7BIT \
662 (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
663
664 #define CATEGORY_MASK_ISO_8BIT \
665 (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
666
667 #define CATEGORY_MASK_ISO_ELSE \
668 (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
669
670 #define CATEGORY_MASK_ISO_ESCAPE \
671 (CATEGORY_MASK_ISO_7 \
672 | CATEGORY_MASK_ISO_7_TIGHT \
673 | CATEGORY_MASK_ISO_7_ELSE \
674 | CATEGORY_MASK_ISO_8_ELSE)
675
676 #define CATEGORY_MASK_ISO \
677 ( CATEGORY_MASK_ISO_7BIT \
678 | CATEGORY_MASK_ISO_8BIT \
679 | CATEGORY_MASK_ISO_ELSE)
680
681 #define CATEGORY_MASK_UTF_16 \
682 (CATEGORY_MASK_UTF_16_BE \
683 | CATEGORY_MASK_UTF_16_LE \
684 | CATEGORY_MASK_UTF_16_BE_NOSIG \
685 | CATEGORY_MASK_UTF_16_LE_NOSIG)
686
687
688 /* List of symbols `coding-category-xxx' ordered by priority. This
689 variable is exposed to Emacs Lisp. */
690 static Lisp_Object Vcoding_category_list;
691
692 /* Table of coding categories (Lisp symbols). This variable is for
693 internal use oly. */
694 static Lisp_Object Vcoding_category_table;
695
696 /* Table of coding-categories ordered by priority. */
697 static enum coding_category coding_priorities[coding_category_max];
698
699 /* Nth element is a coding context for the coding system bound to the
700 Nth coding category. */
701 static struct coding_system coding_categories[coding_category_max];
702
703 static int detected_mask[coding_category_raw_text] =
704 { CATEGORY_MASK_ISO,
705 CATEGORY_MASK_ISO,
706 CATEGORY_MASK_ISO,
707 CATEGORY_MASK_ISO,
708 CATEGORY_MASK_ISO,
709 CATEGORY_MASK_ISO,
710 CATEGORY_MASK_UTF_8,
711 CATEGORY_MASK_UTF_16,
712 CATEGORY_MASK_UTF_16,
713 CATEGORY_MASK_UTF_16,
714 CATEGORY_MASK_UTF_16,
715 CATEGORY_MASK_UTF_16,
716 CATEGORY_MASK_CHARSET,
717 CATEGORY_MASK_SJIS,
718 CATEGORY_MASK_BIG5,
719 CATEGORY_MASK_CCL,
720 CATEGORY_MASK_EMACS_MULE
721 };
722
723 /*** Commonly used macros and functions ***/
724
725 #ifndef min
726 #define min(a, b) ((a) < (b) ? (a) : (b))
727 #endif
728 #ifndef max
729 #define max(a, b) ((a) > (b) ? (a) : (b))
730 #endif
731
732 #define CODING_GET_INFO(coding, attrs, eol_type, charset_list) \
733 do { \
734 attrs = CODING_ID_ATTRS (coding->id); \
735 eol_type = CODING_ID_EOL_TYPE (coding->id); \
736 if (VECTORP (eol_type)) \
737 eol_type = Qunix; \
738 charset_list = CODING_ATTR_CHARSET_LIST (attrs); \
739 } while (0)
740
741
742 /* Safely get one byte from the source text pointed by SRC which ends
743 at SRC_END, and set C to that byte. If there are not enough bytes
744 in the source, it jumps to `no_more_source'. The caller
745 should declare and set these variables appropriately in advance:
746 src, src_end, multibytep
747 */
748
749 #define ONE_MORE_BYTE(c) \
750 do { \
751 if (src == src_end) \
752 { \
753 if (src_base < src) \
754 coding->result = CODING_RESULT_INSUFFICIENT_SRC; \
755 goto no_more_source; \
756 } \
757 c = *src++; \
758 if (multibytep && (c & 0x80)) \
759 { \
760 if ((c & 0xFE) != 0xC0) \
761 error ("Undecodable char found"); \
762 c = ((c & 1) << 6) | *src++; \
763 } \
764 consumed_chars++; \
765 } while (0)
766
767
768 #define ONE_MORE_BYTE_NO_CHECK(c) \
769 do { \
770 c = *src++; \
771 if (multibytep && (c & 0x80)) \
772 { \
773 if ((c & 0xFE) != 0xC0) \
774 error ("Undecodable char found"); \
775 c = ((c & 1) << 6) | *src++; \
776 } \
777 consumed_chars++; \
778 } while (0)
779
780
781 /* Store a byte C in the place pointed by DST and increment DST to the
782 next free point, and increment PRODUCED_CHARS. The caller should
783 assure that C is 0..127, and declare and set the variable `dst'
784 appropriately in advance.
785 */
786
787
788 #define EMIT_ONE_ASCII_BYTE(c) \
789 do { \
790 produced_chars++; \
791 *dst++ = (c); \
792 } while (0)
793
794
795 /* Like EMIT_ONE_ASCII_BYTE byt store two bytes; C1 and C2. */
796
797 #define EMIT_TWO_ASCII_BYTES(c1, c2) \
798 do { \
799 produced_chars += 2; \
800 *dst++ = (c1), *dst++ = (c2); \
801 } while (0)
802
803
804 /* Store a byte C in the place pointed by DST and increment DST to the
805 next free point, and increment PRODUCED_CHARS. If MULTIBYTEP is
806 nonzero, store in an appropriate multibyte from. The caller should
807 declare and set the variables `dst' and `multibytep' appropriately
808 in advance. */
809
810 #define EMIT_ONE_BYTE(c) \
811 do { \
812 produced_chars++; \
813 if (multibytep) \
814 { \
815 int ch = (c); \
816 if (ch >= 0x80) \
817 ch = BYTE8_TO_CHAR (ch); \
818 CHAR_STRING_ADVANCE (ch, dst); \
819 } \
820 else \
821 *dst++ = (c); \
822 } while (0)
823
824
825 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2. */
826
827 #define EMIT_TWO_BYTES(c1, c2) \
828 do { \
829 produced_chars += 2; \
830 if (multibytep) \
831 { \
832 int ch; \
833 \
834 ch = (c1); \
835 if (ch >= 0x80) \
836 ch = BYTE8_TO_CHAR (ch); \
837 CHAR_STRING_ADVANCE (ch, dst); \
838 ch = (c2); \
839 if (ch >= 0x80) \
840 ch = BYTE8_TO_CHAR (ch); \
841 CHAR_STRING_ADVANCE (ch, dst); \
842 } \
843 else \
844 { \
845 *dst++ = (c1); \
846 *dst++ = (c2); \
847 } \
848 } while (0)
849
850
851 #define EMIT_THREE_BYTES(c1, c2, c3) \
852 do { \
853 EMIT_ONE_BYTE (c1); \
854 EMIT_TWO_BYTES (c2, c3); \
855 } while (0)
856
857
858 #define EMIT_FOUR_BYTES(c1, c2, c3, c4) \
859 do { \
860 EMIT_TWO_BYTES (c1, c2); \
861 EMIT_TWO_BYTES (c3, c4); \
862 } while (0)
863
864
865 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
866 do { \
867 charset_map_loaded = 0; \
868 c = DECODE_CHAR (charset, code); \
869 if (charset_map_loaded) \
870 { \
871 unsigned char *orig = coding->source; \
872 EMACS_INT offset; \
873 \
874 coding_set_source (coding); \
875 offset = coding->source - orig; \
876 src += offset; \
877 src_base += offset; \
878 src_end += offset; \
879 } \
880 } while (0)
881
882
883 #define ASSURE_DESTINATION(bytes) \
884 do { \
885 if (dst + (bytes) >= dst_end) \
886 { \
887 int more_bytes = charbuf_end - charbuf + (bytes); \
888 \
889 dst = alloc_destination (coding, more_bytes, dst); \
890 dst_end = coding->destination + coding->dst_bytes; \
891 } \
892 } while (0)
893
894
895
896 static void
897 coding_set_source (coding)
898 struct coding_system *coding;
899 {
900 if (BUFFERP (coding->src_object))
901 {
902 if (coding->src_pos < 0)
903 coding->source = GAP_END_ADDR + coding->src_pos_byte;
904 else
905 {
906 struct buffer *buf = XBUFFER (coding->src_object);
907 EMACS_INT gpt_byte = BUF_GPT_BYTE (buf);
908 unsigned char *beg_addr = BUF_BEG_ADDR (buf);
909
910 coding->source = beg_addr + coding->src_pos_byte - 1;
911 if (coding->src_pos_byte >= gpt_byte)
912 coding->source += BUF_GAP_SIZE (buf);
913 }
914 }
915 else if (STRINGP (coding->src_object))
916 {
917 coding->source = (XSTRING (coding->src_object)->data
918 + coding->src_pos_byte);
919 }
920 else
921 /* Otherwise, the source is C string and is never relocated
922 automatically. Thus we don't have to update anything. */
923 ;
924 }
925
926 static void
927 coding_set_destination (coding)
928 struct coding_system *coding;
929 {
930 if (BUFFERP (coding->dst_object))
931 {
932 if (coding->src_pos < 0)
933 {
934 coding->destination = BEG_ADDR + coding->dst_pos_byte - 1;
935 coding->dst_bytes = (GAP_END_ADDR
936 - (coding->src_bytes - coding->consumed)
937 - coding->destination);
938 }
939 else
940 {
941 /* We are sure that coding->dst_pos_byte is before the gap
942 of the buffer. */
943 coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
944 + coding->dst_pos_byte - 1);
945 coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
946 - coding->destination);
947 }
948 }
949 else
950 /* Otherwise, the destination is C string and is never relocated
951 automatically. Thus we don't have to update anything. */
952 ;
953 }
954
955
956 static void
957 coding_alloc_by_realloc (coding, bytes)
958 struct coding_system *coding;
959 EMACS_INT bytes;
960 {
961 coding->destination = (unsigned char *) xrealloc (coding->destination,
962 coding->dst_bytes + bytes);
963 coding->dst_bytes += bytes;
964 }
965
966 static void
967 coding_alloc_by_making_gap (coding, bytes)
968 struct coding_system *coding;
969 EMACS_INT bytes;
970 {
971 if (BUFFERP (coding->dst_object)
972 && EQ (coding->src_object, coding->dst_object))
973 {
974 EMACS_INT add = coding->src_bytes - coding->consumed;
975
976 GAP_SIZE -= add; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
977 make_gap (bytes);
978 GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
979 }
980 else
981 {
982 Lisp_Object this_buffer;
983
984 this_buffer = Fcurrent_buffer ();
985 set_buffer_internal (XBUFFER (coding->dst_object));
986 make_gap (bytes);
987 set_buffer_internal (XBUFFER (this_buffer));
988 }
989 }
990
991
992 static unsigned char *
993 alloc_destination (coding, nbytes, dst)
994 struct coding_system *coding;
995 int nbytes;
996 unsigned char *dst;
997 {
998 EMACS_INT offset = dst - coding->destination;
999
1000 if (BUFFERP (coding->dst_object))
1001 coding_alloc_by_making_gap (coding, nbytes);
1002 else
1003 coding_alloc_by_realloc (coding, nbytes);
1004 coding->result = CODING_RESULT_SUCCESS;
1005 coding_set_destination (coding);
1006 dst = coding->destination + offset;
1007 return dst;
1008 }
1009
1010 /** Macros for annotations. */
1011
1012 /* Maximum length of annotation data (sum of annotations for
1013 composition and charset). */
1014 #define MAX_ANNOTATION_LENGTH (5 + (MAX_COMPOSITION_COMPONENTS * 2) - 1 + 5)
1015
1016 /* An annotation data is stored in the array coding->charbuf in this
1017 format:
1018 [ -LENGTH ANNOTATION_MASK FROM TO ... ]
1019 LENGTH is the number of elements in the annotation.
1020 ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
1021 FROM and TO specify the range of text annotated. They are relative
1022 to coding->src_pos (on encoding) or coding->dst_pos (on decoding).
1023
1024 The format of the following elements depend on ANNOTATION_MASK.
1025
1026 In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
1027 follows:
1028 ... METHOD [ COMPOSITION-COMPONENTS ... ]
1029 METHOD is one of enum composition_method.
1030 Optionnal COMPOSITION-COMPONENTS are characters and composition
1031 rules.
1032
1033 In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1034 follows. */
1035
1036 #define ADD_ANNOTATION_DATA(buf, len, mask, from, to) \
1037 do { \
1038 *(buf)++ = -(len); \
1039 *(buf)++ = (mask); \
1040 *(buf)++ = (from); \
1041 *(buf)++ = (to); \
1042 coding->annotated = 1; \
1043 } while (0);
1044
1045 #define ADD_COMPOSITION_DATA(buf, from, to, method) \
1046 do { \
1047 ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, from, to); \
1048 *buf++ = method; \
1049 } while (0)
1050
1051
1052 #define ADD_CHARSET_DATA(buf, from, to, id) \
1053 do { \
1054 ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_CHARSET_MASK, from, to); \
1055 *buf++ = id; \
1056 } while (0)
1057
1058 \f
1059 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1060
1061
1062
1063 \f
1064 /*** 3. UTF-8 ***/
1065
1066 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1067 Check if a text is encoded in UTF-8. If it is, return 1, else
1068 return 0. */
1069
1070 #define UTF_8_1_OCTET_P(c) ((c) < 0x80)
1071 #define UTF_8_EXTRA_OCTET_P(c) (((c) & 0xC0) == 0x80)
1072 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
1073 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
1074 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
1075 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
1076
1077 static int
1078 detect_coding_utf_8 (coding, detect_info)
1079 struct coding_system *coding;
1080 struct coding_detection_info *detect_info;
1081 {
1082 unsigned char *src = coding->source, *src_base = src;
1083 unsigned char *src_end = coding->source + coding->src_bytes;
1084 int multibytep = coding->src_multibyte;
1085 int consumed_chars = 0;
1086 int found = 0;
1087 int incomplete;
1088
1089 detect_info->checked |= CATEGORY_MASK_UTF_8;
1090 /* A coding system of this category is always ASCII compatible. */
1091 src += coding->head_ascii;
1092
1093 while (1)
1094 {
1095 int c, c1, c2, c3, c4;
1096
1097 incomplete = 0;
1098 ONE_MORE_BYTE (c);
1099 if (UTF_8_1_OCTET_P (c))
1100 continue;
1101 incomplete = 1;
1102 ONE_MORE_BYTE (c1);
1103 if (! UTF_8_EXTRA_OCTET_P (c1))
1104 break;
1105 if (UTF_8_2_OCTET_LEADING_P (c))
1106 {
1107 found = CATEGORY_MASK_UTF_8;
1108 continue;
1109 }
1110 ONE_MORE_BYTE (c2);
1111 if (! UTF_8_EXTRA_OCTET_P (c2))
1112 break;
1113 if (UTF_8_3_OCTET_LEADING_P (c))
1114 {
1115 found = CATEGORY_MASK_UTF_8;
1116 continue;
1117 }
1118 ONE_MORE_BYTE (c3);
1119 if (! UTF_8_EXTRA_OCTET_P (c3))
1120 break;
1121 if (UTF_8_4_OCTET_LEADING_P (c))
1122 {
1123 found = CATEGORY_MASK_UTF_8;
1124 continue;
1125 }
1126 ONE_MORE_BYTE (c4);
1127 if (! UTF_8_EXTRA_OCTET_P (c4))
1128 break;
1129 if (UTF_8_5_OCTET_LEADING_P (c))
1130 {
1131 found = CATEGORY_MASK_UTF_8;
1132 continue;
1133 }
1134 break;
1135 }
1136 detect_info->rejected |= CATEGORY_MASK_UTF_8;
1137 return 0;
1138
1139 no_more_source:
1140 if (incomplete && coding->mode & CODING_MODE_LAST_BLOCK)
1141 {
1142 detect_info->rejected |= CATEGORY_MASK_UTF_8;
1143 return 0;
1144 }
1145 detect_info->found |= found;
1146 return 1;
1147 }
1148
1149
1150 static void
1151 decode_coding_utf_8 (coding)
1152 struct coding_system *coding;
1153 {
1154 unsigned char *src = coding->source + coding->consumed;
1155 unsigned char *src_end = coding->source + coding->src_bytes;
1156 unsigned char *src_base;
1157 int *charbuf = coding->charbuf;
1158 int *charbuf_end = charbuf + coding->charbuf_size;
1159 int consumed_chars = 0, consumed_chars_base;
1160 int multibytep = coding->src_multibyte;
1161 Lisp_Object attr, eol_type, charset_list;
1162
1163 CODING_GET_INFO (coding, attr, eol_type, charset_list);
1164
1165 while (1)
1166 {
1167 int c, c1, c2, c3, c4, c5;
1168
1169 src_base = src;
1170 consumed_chars_base = consumed_chars;
1171
1172 if (charbuf >= charbuf_end)
1173 break;
1174
1175 ONE_MORE_BYTE (c1);
1176 if (UTF_8_1_OCTET_P(c1))
1177 {
1178 c = c1;
1179 if (c == '\r')
1180 {
1181 if (EQ (eol_type, Qdos))
1182 {
1183 if (src == src_end)
1184 {
1185 coding->result = CODING_RESULT_INSUFFICIENT_SRC;
1186 goto no_more_source;
1187 }
1188 if (*src == '\n')
1189 ONE_MORE_BYTE (c);
1190 }
1191 else if (EQ (eol_type, Qmac))
1192 c = '\n';
1193 }
1194 }
1195 else
1196 {
1197 ONE_MORE_BYTE (c2);
1198 if (! UTF_8_EXTRA_OCTET_P (c2))
1199 goto invalid_code;
1200 if (UTF_8_2_OCTET_LEADING_P (c1))
1201 {
1202 c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1203 /* Reject overlong sequences here and below. Encoders
1204 producing them are incorrect, they can be misleading,
1205 and they mess up read/write invariance. */
1206 if (c < 128)
1207 goto invalid_code;
1208 }
1209 else
1210 {
1211 ONE_MORE_BYTE (c3);
1212 if (! UTF_8_EXTRA_OCTET_P (c3))
1213 goto invalid_code;
1214 if (UTF_8_3_OCTET_LEADING_P (c1))
1215 {
1216 c = (((c1 & 0xF) << 12)
1217 | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
1218 if (c < 0x800
1219 || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
1220 goto invalid_code;
1221 }
1222 else
1223 {
1224 ONE_MORE_BYTE (c4);
1225 if (! UTF_8_EXTRA_OCTET_P (c4))
1226 goto invalid_code;
1227 if (UTF_8_4_OCTET_LEADING_P (c1))
1228 {
1229 c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1230 | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
1231 if (c < 0x10000)
1232 goto invalid_code;
1233 }
1234 else
1235 {
1236 ONE_MORE_BYTE (c5);
1237 if (! UTF_8_EXTRA_OCTET_P (c5))
1238 goto invalid_code;
1239 if (UTF_8_5_OCTET_LEADING_P (c1))
1240 {
1241 c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1242 | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1243 | (c5 & 0x3F));
1244 if ((c > MAX_CHAR) || (c < 0x200000))
1245 goto invalid_code;
1246 }
1247 else
1248 goto invalid_code;
1249 }
1250 }
1251 }
1252 }
1253
1254 *charbuf++ = c;
1255 continue;
1256
1257 invalid_code:
1258 src = src_base;
1259 consumed_chars = consumed_chars_base;
1260 ONE_MORE_BYTE (c);
1261 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1262 coding->errors++;
1263 }
1264
1265 no_more_source:
1266 coding->consumed_char += consumed_chars_base;
1267 coding->consumed = src_base - coding->source;
1268 coding->charbuf_used = charbuf - coding->charbuf;
1269 }
1270
1271
1272 static int
1273 encode_coding_utf_8 (coding)
1274 struct coding_system *coding;
1275 {
1276 int multibytep = coding->dst_multibyte;
1277 int *charbuf = coding->charbuf;
1278 int *charbuf_end = charbuf + coding->charbuf_used;
1279 unsigned char *dst = coding->destination + coding->produced;
1280 unsigned char *dst_end = coding->destination + coding->dst_bytes;
1281 int produced_chars = 0;
1282 int c;
1283
1284 if (multibytep)
1285 {
1286 int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1287
1288 while (charbuf < charbuf_end)
1289 {
1290 unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1291
1292 ASSURE_DESTINATION (safe_room);
1293 c = *charbuf++;
1294 if (CHAR_BYTE8_P (c))
1295 {
1296 c = CHAR_TO_BYTE8 (c);
1297 EMIT_ONE_BYTE (c);
1298 }
1299 else
1300 {
1301 CHAR_STRING_ADVANCE (c, pend);
1302 for (p = str; p < pend; p++)
1303 EMIT_ONE_BYTE (*p);
1304 }
1305 }
1306 }
1307 else
1308 {
1309 int safe_room = MAX_MULTIBYTE_LENGTH;
1310
1311 while (charbuf < charbuf_end)
1312 {
1313 ASSURE_DESTINATION (safe_room);
1314 c = *charbuf++;
1315 dst += CHAR_STRING (c, dst);
1316 produced_chars++;
1317 }
1318 }
1319 coding->result = CODING_RESULT_SUCCESS;
1320 coding->produced_char += produced_chars;
1321 coding->produced = dst - coding->destination;
1322 return 0;
1323 }
1324
1325
1326 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1327 Check if a text is encoded in one of UTF-16 based coding systems.
1328 If it is, return 1, else return 0. */
1329
1330 #define UTF_16_HIGH_SURROGATE_P(val) \
1331 (((val) & 0xFC00) == 0xD800)
1332
1333 #define UTF_16_LOW_SURROGATE_P(val) \
1334 (((val) & 0xFC00) == 0xDC00)
1335
1336 #define UTF_16_INVALID_P(val) \
1337 (((val) == 0xFFFE) \
1338 || ((val) == 0xFFFF) \
1339 || UTF_16_LOW_SURROGATE_P (val))
1340
1341
1342 static int
1343 detect_coding_utf_16 (coding, detect_info)
1344 struct coding_system *coding;
1345 struct coding_detection_info *detect_info;
1346 {
1347 unsigned char *src = coding->source, *src_base = src;
1348 unsigned char *src_end = coding->source + coding->src_bytes;
1349 int multibytep = coding->src_multibyte;
1350 int consumed_chars = 0;
1351 int c1, c2;
1352
1353 detect_info->checked |= CATEGORY_MASK_UTF_16;
1354
1355 if (coding->mode & CODING_MODE_LAST_BLOCK
1356 && (coding->src_bytes & 1))
1357 {
1358 detect_info->rejected |= CATEGORY_MASK_UTF_16;
1359 return 0;
1360 }
1361 ONE_MORE_BYTE (c1);
1362 ONE_MORE_BYTE (c2);
1363
1364 if ((c1 == 0xFF) && (c2 == 0xFE))
1365 {
1366 detect_info->found |= CATEGORY_MASK_UTF_16_LE;
1367 detect_info->rejected |= CATEGORY_MASK_UTF_16_BE;
1368 }
1369 else if ((c1 == 0xFE) && (c2 == 0xFF))
1370 {
1371 detect_info->found |= CATEGORY_MASK_UTF_16_BE;
1372 detect_info->rejected |= CATEGORY_MASK_UTF_16_LE;
1373 }
1374 no_more_source:
1375 return 1;
1376 }
1377
1378 static void
1379 decode_coding_utf_16 (coding)
1380 struct coding_system *coding;
1381 {
1382 unsigned char *src = coding->source + coding->consumed;
1383 unsigned char *src_end = coding->source + coding->src_bytes;
1384 unsigned char *src_base;
1385 int *charbuf = coding->charbuf;
1386 int *charbuf_end = charbuf + coding->charbuf_size;
1387 int consumed_chars = 0, consumed_chars_base;
1388 int multibytep = coding->src_multibyte;
1389 enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding);
1390 enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1391 int surrogate = CODING_UTF_16_SURROGATE (coding);
1392 Lisp_Object attr, eol_type, charset_list;
1393
1394 CODING_GET_INFO (coding, attr, eol_type, charset_list);
1395
1396 if (bom != utf_16_without_bom)
1397 {
1398 int c, c1, c2;
1399
1400 src_base = src;
1401 ONE_MORE_BYTE (c1);
1402 ONE_MORE_BYTE (c2);
1403 c = (c1 << 8) | c2;
1404 if (bom == utf_16_with_bom)
1405 {
1406 if (endian == utf_16_big_endian
1407 ? c != 0xFEFF : c != 0xFFFE)
1408 {
1409 /* We are sure that there's enouph room at CHARBUF. */
1410 *charbuf++ = c1;
1411 *charbuf++ = c2;
1412 coding->errors++;
1413 }
1414 }
1415 else
1416 {
1417 if (c == 0xFEFF)
1418 CODING_UTF_16_ENDIAN (coding)
1419 = endian = utf_16_big_endian;
1420 else if (c == 0xFFFE)
1421 CODING_UTF_16_ENDIAN (coding)
1422 = endian = utf_16_little_endian;
1423 else
1424 {
1425 CODING_UTF_16_ENDIAN (coding)
1426 = endian = utf_16_big_endian;
1427 src = src_base;
1428 }
1429 }
1430 CODING_UTF_16_BOM (coding) = utf_16_with_bom;
1431 }
1432
1433 while (1)
1434 {
1435 int c, c1, c2;
1436
1437 src_base = src;
1438 consumed_chars_base = consumed_chars;
1439
1440 if (charbuf + 2 >= charbuf_end)
1441 break;
1442
1443 ONE_MORE_BYTE (c1);
1444 ONE_MORE_BYTE (c2);
1445 c = (endian == utf_16_big_endian
1446 ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1447 if (surrogate)
1448 {
1449 if (! UTF_16_LOW_SURROGATE_P (c))
1450 {
1451 if (endian == utf_16_big_endian)
1452 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1453 else
1454 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1455 *charbuf++ = c1;
1456 *charbuf++ = c2;
1457 coding->errors++;
1458 if (UTF_16_HIGH_SURROGATE_P (c))
1459 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1460 else
1461 *charbuf++ = c;
1462 }
1463 else
1464 {
1465 c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1466 CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1467 *charbuf++ = c;
1468 }
1469 }
1470 else
1471 {
1472 if (UTF_16_HIGH_SURROGATE_P (c))
1473 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1474 else
1475 *charbuf++ = c;
1476 }
1477 }
1478
1479 no_more_source:
1480 coding->consumed_char += consumed_chars_base;
1481 coding->consumed = src_base - coding->source;
1482 coding->charbuf_used = charbuf - coding->charbuf;
1483 }
1484
1485 static int
1486 encode_coding_utf_16 (coding)
1487 struct coding_system *coding;
1488 {
1489 int multibytep = coding->dst_multibyte;
1490 int *charbuf = coding->charbuf;
1491 int *charbuf_end = charbuf + coding->charbuf_used;
1492 unsigned char *dst = coding->destination + coding->produced;
1493 unsigned char *dst_end = coding->destination + coding->dst_bytes;
1494 int safe_room = 8;
1495 enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding);
1496 int big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1497 int produced_chars = 0;
1498 Lisp_Object attrs, eol_type, charset_list;
1499 int c;
1500
1501 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
1502
1503 if (bom == utf_16_with_bom)
1504 {
1505 ASSURE_DESTINATION (safe_room);
1506 if (big_endian)
1507 EMIT_TWO_BYTES (0xFE, 0xFF);
1508 else
1509 EMIT_TWO_BYTES (0xFF, 0xFE);
1510 CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1511 }
1512
1513 while (charbuf < charbuf_end)
1514 {
1515 ASSURE_DESTINATION (safe_room);
1516 c = *charbuf++;
1517 if (c >= MAX_UNICODE_CHAR)
1518 c = coding->default_char;
1519
1520 if (c < 0x10000)
1521 {
1522 if (big_endian)
1523 EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1524 else
1525 EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1526 }
1527 else
1528 {
1529 int c1, c2;
1530
1531 c -= 0x10000;
1532 c1 = (c >> 10) + 0xD800;
1533 c2 = (c & 0x3FF) + 0xDC00;
1534 if (big_endian)
1535 EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1536 else
1537 EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1538 }
1539 }
1540 coding->result = CODING_RESULT_SUCCESS;
1541 coding->produced = dst - coding->destination;
1542 coding->produced_char += produced_chars;
1543 return 0;
1544 }
1545
1546 \f
1547 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1548
1549 /* Emacs' internal format for representation of multiple character
1550 sets is a kind of multi-byte encoding, i.e. characters are
1551 represented by variable-length sequences of one-byte codes.
1552
1553 ASCII characters and control characters (e.g. `tab', `newline') are
1554 represented by one-byte sequences which are their ASCII codes, in
1555 the range 0x00 through 0x7F.
1556
1557 8-bit characters of the range 0x80..0x9F are represented by
1558 two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1559 code + 0x20).
1560
1561 8-bit characters of the range 0xA0..0xFF are represented by
1562 one-byte sequences which are their 8-bit code.
1563
1564 The other characters are represented by a sequence of `base
1565 leading-code', optional `extended leading-code', and one or two
1566 `position-code's. The length of the sequence is determined by the
1567 base leading-code. Leading-code takes the range 0x81 through 0x9D,
1568 whereas extended leading-code and position-code take the range 0xA0
1569 through 0xFF. See `charset.h' for more details about leading-code
1570 and position-code.
1571
1572 --- CODE RANGE of Emacs' internal format ---
1573 character set range
1574 ------------- -----
1575 ascii 0x00..0x7F
1576 eight-bit-control LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1577 eight-bit-graphic 0xA0..0xBF
1578 ELSE 0x81..0x9D + [0xA0..0xFF]+
1579 ---------------------------------------------
1580
1581 As this is the internal character representation, the format is
1582 usually not used externally (i.e. in a file or in a data sent to a
1583 process). But, it is possible to have a text externally in this
1584 format (i.e. by encoding by the coding system `emacs-mule').
1585
1586 In that case, a sequence of one-byte codes has a slightly different
1587 form.
1588
1589 At first, all characters in eight-bit-control are represented by
1590 one-byte sequences which are their 8-bit code.
1591
1592 Next, character composition data are represented by the byte
1593 sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1594 where,
1595 METHOD is 0xF0 plus one of composition method (enum
1596 composition_method),
1597
1598 BYTES is 0xA0 plus a byte length of this composition data,
1599
1600 CHARS is 0x20 plus a number of characters composed by this
1601 data,
1602
1603 COMPONENTs are characters of multibye form or composition
1604 rules encoded by two-byte of ASCII codes.
1605
1606 In addition, for backward compatibility, the following formats are
1607 also recognized as composition data on decoding.
1608
1609 0x80 MSEQ ...
1610 0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1611
1612 Here,
1613 MSEQ is a multibyte form but in these special format:
1614 ASCII: 0xA0 ASCII_CODE+0x80,
1615 other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1616 RULE is a one byte code of the range 0xA0..0xF0 that
1617 represents a composition rule.
1618 */
1619
1620 char emacs_mule_bytes[256];
1621
1622 int
1623 emacs_mule_char (coding, src, nbytes, nchars, id)
1624 struct coding_system *coding;
1625 unsigned char *src;
1626 int *nbytes, *nchars, *id;
1627 {
1628 unsigned char *src_end = coding->source + coding->src_bytes;
1629 int multibytep = coding->src_multibyte;
1630 unsigned char *src_base = src;
1631 struct charset *charset;
1632 unsigned code;
1633 int c;
1634 int consumed_chars = 0;
1635
1636 ONE_MORE_BYTE (c);
1637 switch (emacs_mule_bytes[c])
1638 {
1639 case 2:
1640 if (! (charset = emacs_mule_charset[c]))
1641 goto invalid_code;
1642 ONE_MORE_BYTE (c);
1643 code = c & 0x7F;
1644 break;
1645
1646 case 3:
1647 if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
1648 || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
1649 {
1650 ONE_MORE_BYTE (c);
1651 if (! (charset = emacs_mule_charset[c]))
1652 goto invalid_code;
1653 ONE_MORE_BYTE (c);
1654 code = c & 0x7F;
1655 }
1656 else
1657 {
1658 if (! (charset = emacs_mule_charset[c]))
1659 goto invalid_code;
1660 ONE_MORE_BYTE (c);
1661 code = (c & 0x7F) << 8;
1662 ONE_MORE_BYTE (c);
1663 code |= c & 0x7F;
1664 }
1665 break;
1666
1667 case 4:
1668 ONE_MORE_BYTE (c);
1669 if (! (charset = emacs_mule_charset[c]))
1670 goto invalid_code;
1671 ONE_MORE_BYTE (c);
1672 code = (c & 0x7F) << 8;
1673 ONE_MORE_BYTE (c);
1674 code |= c & 0x7F;
1675 break;
1676
1677 case 1:
1678 code = c;
1679 charset = CHARSET_FROM_ID (ASCII_BYTE_P (code)
1680 ? charset_ascii : charset_eight_bit);
1681 break;
1682
1683 default:
1684 abort ();
1685 }
1686 c = DECODE_CHAR (charset, code);
1687 if (c < 0)
1688 goto invalid_code;
1689 *nbytes = src - src_base;
1690 *nchars = consumed_chars;
1691 if (id)
1692 *id = charset->id;
1693 return c;
1694
1695 no_more_source:
1696 return -2;
1697
1698 invalid_code:
1699 return -1;
1700 }
1701
1702
1703 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1704 Check if a text is encoded in `emacs-mule'. If it is, return 1,
1705 else return 0. */
1706
1707 static int
1708 detect_coding_emacs_mule (coding, detect_info)
1709 struct coding_system *coding;
1710 struct coding_detection_info *detect_info;
1711 {
1712 unsigned char *src = coding->source, *src_base = src;
1713 unsigned char *src_end = coding->source + coding->src_bytes;
1714 int multibytep = coding->src_multibyte;
1715 int consumed_chars = 0;
1716 int c;
1717 int found = 0;
1718 int incomplete;
1719
1720 detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1721 /* A coding system of this category is always ASCII compatible. */
1722 src += coding->head_ascii;
1723
1724 while (1)
1725 {
1726 incomplete = 0;
1727 ONE_MORE_BYTE (c);
1728 incomplete = 1;
1729
1730 if (c == 0x80)
1731 {
1732 /* Perhaps the start of composite character. We simple skip
1733 it because analyzing it is too heavy for detecting. But,
1734 at least, we check that the composite character
1735 constitues of more than 4 bytes. */
1736 unsigned char *src_base;
1737
1738 repeat:
1739 src_base = src;
1740 do
1741 {
1742 ONE_MORE_BYTE (c);
1743 }
1744 while (c >= 0xA0);
1745
1746 if (src - src_base <= 4)
1747 break;
1748 found = CATEGORY_MASK_EMACS_MULE;
1749 if (c == 0x80)
1750 goto repeat;
1751 }
1752
1753 if (c < 0x80)
1754 {
1755 if (c < 0x20
1756 && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
1757 break;
1758 }
1759 else
1760 {
1761 unsigned char *src_base = src - 1;
1762
1763 do
1764 {
1765 ONE_MORE_BYTE (c);
1766 }
1767 while (c >= 0xA0);
1768 if (src - src_base != emacs_mule_bytes[*src_base])
1769 break;
1770 found = CATEGORY_MASK_EMACS_MULE;
1771 }
1772 }
1773 detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1774 return 0;
1775
1776 no_more_source:
1777 if (incomplete && coding->mode & CODING_MODE_LAST_BLOCK)
1778 {
1779 detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1780 return 0;
1781 }
1782 detect_info->found |= found;
1783 return 1;
1784 }
1785
1786
1787 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
1788
1789 /* Decode a character represented as a component of composition
1790 sequence of Emacs 20/21 style at SRC. Set C to that character and
1791 update SRC to the head of next character (or an encoded composition
1792 rule). If SRC doesn't points a composition component, set C to -1.
1793 If SRC points an invalid byte sequence, global exit by a return
1794 value 0. */
1795
1796 #define DECODE_EMACS_MULE_COMPOSITION_CHAR(buf) \
1797 if (1) \
1798 { \
1799 int c; \
1800 int nbytes, nchars; \
1801 \
1802 if (src == src_end) \
1803 break; \
1804 c = emacs_mule_char (coding, src, &nbytes, &nchars, NULL);\
1805 if (c < 0) \
1806 { \
1807 if (c == -2) \
1808 break; \
1809 goto invalid_code; \
1810 } \
1811 *buf++ = c; \
1812 src += nbytes; \
1813 consumed_chars += nchars; \
1814 } \
1815 else
1816
1817
1818 /* Decode a composition rule represented as a component of composition
1819 sequence of Emacs 20 style at SRC. Store the decoded rule in *BUF,
1820 and increment BUF. If SRC points an invalid byte sequence, set C
1821 to -1. */
1822
1823 #define DECODE_EMACS_MULE_COMPOSITION_RULE_20(buf) \
1824 do { \
1825 int c, gref, nref; \
1826 \
1827 if (src >= src_end) \
1828 goto invalid_code; \
1829 ONE_MORE_BYTE_NO_CHECK (c); \
1830 c -= 0x20; \
1831 if (c < 0 || c >= 81) \
1832 goto invalid_code; \
1833 \
1834 gref = c / 9, nref = c % 9; \
1835 *buf++ = COMPOSITION_ENCODE_RULE (gref, nref); \
1836 } while (0)
1837
1838
1839 /* Decode a composition rule represented as a component of composition
1840 sequence of Emacs 21 style at SRC. Store the decoded rule in *BUF,
1841 and increment BUF. If SRC points an invalid byte sequence, set C
1842 to -1. */
1843
1844 #define DECODE_EMACS_MULE_COMPOSITION_RULE_21(buf) \
1845 do { \
1846 int gref, nref; \
1847 \
1848 if (src + 1>= src_end) \
1849 goto invalid_code; \
1850 ONE_MORE_BYTE_NO_CHECK (gref); \
1851 gref -= 0x20; \
1852 ONE_MORE_BYTE_NO_CHECK (nref); \
1853 nref -= 0x20; \
1854 if (gref < 0 || gref >= 81 \
1855 || nref < 0 || nref >= 81) \
1856 goto invalid_code; \
1857 *buf++ = COMPOSITION_ENCODE_RULE (gref, nref); \
1858 } while (0)
1859
1860
1861 #define DECODE_EMACS_MULE_21_COMPOSITION(c) \
1862 do { \
1863 /* Emacs 21 style format. The first three bytes at SRC are \
1864 (METHOD - 0xF2), (BYTES - 0xA0), (CHARS - 0xA0), where BYTES is \
1865 the byte length of this composition information, CHARS is the \
1866 number of characters composed by this composition. */ \
1867 enum composition_method method = c - 0xF2; \
1868 int *charbuf_base = charbuf; \
1869 int from, to; \
1870 int consumed_chars_limit; \
1871 int nbytes, nchars; \
1872 \
1873 ONE_MORE_BYTE (c); \
1874 nbytes = c - 0xA0; \
1875 if (nbytes < 3) \
1876 goto invalid_code; \
1877 ONE_MORE_BYTE (c); \
1878 nchars = c - 0xA0; \
1879 from = coding->produced + char_offset; \
1880 to = from + nchars; \
1881 ADD_COMPOSITION_DATA (charbuf, from, to, method); \
1882 consumed_chars_limit = consumed_chars_base + nbytes; \
1883 if (method != COMPOSITION_RELATIVE) \
1884 { \
1885 int i = 0; \
1886 while (consumed_chars < consumed_chars_limit) \
1887 { \
1888 if (i % 2 && method != COMPOSITION_WITH_ALTCHARS) \
1889 DECODE_EMACS_MULE_COMPOSITION_RULE_21 (charbuf); \
1890 else \
1891 DECODE_EMACS_MULE_COMPOSITION_CHAR (charbuf); \
1892 i++; \
1893 } \
1894 if (consumed_chars < consumed_chars_limit) \
1895 goto invalid_code; \
1896 charbuf_base[0] -= i; \
1897 } \
1898 } while (0)
1899
1900
1901 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION(c) \
1902 do { \
1903 /* Emacs 20 style format for relative composition. */ \
1904 /* Store multibyte form of characters to be composed. */ \
1905 enum composition_method method = COMPOSITION_RELATIVE; \
1906 int components[MAX_COMPOSITION_COMPONENTS * 2 - 1]; \
1907 int *buf = components; \
1908 int i, j; \
1909 int from, to; \
1910 \
1911 src = src_base; \
1912 ONE_MORE_BYTE (c); /* skip 0x80 */ \
1913 for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++) \
1914 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
1915 if (i < 2) \
1916 goto invalid_code; \
1917 from = coding->produced_char + char_offset; \
1918 to = from + i; \
1919 ADD_COMPOSITION_DATA (charbuf, from, to, method); \
1920 for (j = 0; j < i; j++) \
1921 *charbuf++ = components[j]; \
1922 } while (0)
1923
1924
1925 #define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION(c) \
1926 do { \
1927 /* Emacs 20 style format for rule-base composition. */ \
1928 /* Store multibyte form of characters to be composed. */ \
1929 enum composition_method method = COMPOSITION_WITH_RULE; \
1930 int components[MAX_COMPOSITION_COMPONENTS * 2 - 1]; \
1931 int *buf = components; \
1932 int i, j; \
1933 int from, to; \
1934 \
1935 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
1936 for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++) \
1937 { \
1938 DECODE_EMACS_MULE_COMPOSITION_RULE_20 (buf); \
1939 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
1940 } \
1941 if (i < 1 || (buf - components) % 2 == 0) \
1942 goto invalid_code; \
1943 if (charbuf + i + (i / 2) + 1 < charbuf_end) \
1944 goto no_more_source; \
1945 from = coding->produced_char + char_offset; \
1946 to = from + i; \
1947 ADD_COMPOSITION_DATA (buf, from, to, method); \
1948 for (j = 0; j < i; j++) \
1949 *charbuf++ = components[j]; \
1950 for (j = 0; j < i; j += 2) \
1951 *charbuf++ = components[j]; \
1952 } while (0)
1953
1954
1955 static void
1956 decode_coding_emacs_mule (coding)
1957 struct coding_system *coding;
1958 {
1959 unsigned char *src = coding->source + coding->consumed;
1960 unsigned char *src_end = coding->source + coding->src_bytes;
1961 unsigned char *src_base;
1962 int *charbuf = coding->charbuf;
1963 int *charbuf_end = charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
1964 int consumed_chars = 0, consumed_chars_base;
1965 int multibytep = coding->src_multibyte;
1966 Lisp_Object attrs, eol_type, charset_list;
1967 int char_offset = coding->produced_char;
1968 int last_offset = char_offset;
1969 int last_id = charset_ascii;
1970
1971 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
1972
1973 while (1)
1974 {
1975 int c;
1976
1977 src_base = src;
1978 consumed_chars_base = consumed_chars;
1979
1980 if (charbuf >= charbuf_end)
1981 break;
1982
1983 ONE_MORE_BYTE (c);
1984
1985 if (c < 0x80)
1986 {
1987 if (c == '\r')
1988 {
1989 if (EQ (eol_type, Qdos))
1990 {
1991 if (src == src_end)
1992 {
1993 coding->result = CODING_RESULT_INSUFFICIENT_SRC;
1994 goto no_more_source;
1995 }
1996 if (*src == '\n')
1997 ONE_MORE_BYTE (c);
1998 }
1999 else if (EQ (eol_type, Qmac))
2000 c = '\n';
2001 }
2002 *charbuf++ = c;
2003 char_offset++;
2004 }
2005 else if (c == 0x80)
2006 {
2007 ONE_MORE_BYTE (c);
2008 if (c - 0xF2 >= COMPOSITION_RELATIVE
2009 && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)
2010 DECODE_EMACS_MULE_21_COMPOSITION (c);
2011 else if (c < 0xC0)
2012 DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION (c);
2013 else if (c == 0xFF)
2014 DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION (c);
2015 else
2016 goto invalid_code;
2017 }
2018 else if (c < 0xA0 && emacs_mule_bytes[c] > 1)
2019 {
2020 int nbytes, nchars;
2021 int id;
2022
2023 src = src_base;
2024 consumed_chars = consumed_chars_base;
2025 c = emacs_mule_char (coding, src, &nbytes, &nchars, &id);
2026 if (c < 0)
2027 {
2028 if (c == -2)
2029 break;
2030 goto invalid_code;
2031 }
2032 if (last_id != id)
2033 {
2034 if (last_id != charset_ascii)
2035 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
2036 last_id = id;
2037 last_offset = char_offset;
2038 }
2039 *charbuf++ = c;
2040 src += nbytes;
2041 consumed_chars += nchars;
2042 char_offset++;
2043 }
2044 continue;
2045
2046 invalid_code:
2047 src = src_base;
2048 consumed_chars = consumed_chars_base;
2049 ONE_MORE_BYTE (c);
2050 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
2051 char_offset++;
2052 coding->errors++;
2053 }
2054
2055 no_more_source:
2056 if (last_id != charset_ascii)
2057 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
2058 coding->consumed_char += consumed_chars_base;
2059 coding->consumed = src_base - coding->source;
2060 coding->charbuf_used = charbuf - coding->charbuf;
2061 }
2062
2063
2064 #define EMACS_MULE_LEADING_CODES(id, codes) \
2065 do { \
2066 if (id < 0xA0) \
2067 codes[0] = id, codes[1] = 0; \
2068 else if (id < 0xE0) \
2069 codes[0] = 0x9A, codes[1] = id; \
2070 else if (id < 0xF0) \
2071 codes[0] = 0x9B, codes[1] = id; \
2072 else if (id < 0xF5) \
2073 codes[0] = 0x9C, codes[1] = id; \
2074 else \
2075 codes[0] = 0x9D, codes[1] = id; \
2076 } while (0);
2077
2078
2079 static int
2080 encode_coding_emacs_mule (coding)
2081 struct coding_system *coding;
2082 {
2083 int multibytep = coding->dst_multibyte;
2084 int *charbuf = coding->charbuf;
2085 int *charbuf_end = charbuf + coding->charbuf_used;
2086 unsigned char *dst = coding->destination + coding->produced;
2087 unsigned char *dst_end = coding->destination + coding->dst_bytes;
2088 int safe_room = 8;
2089 int produced_chars = 0;
2090 Lisp_Object attrs, eol_type, charset_list;
2091 int c;
2092 int preferred_charset_id = -1;
2093
2094 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
2095
2096 while (charbuf < charbuf_end)
2097 {
2098 ASSURE_DESTINATION (safe_room);
2099 c = *charbuf++;
2100
2101 if (c < 0)
2102 {
2103 /* Handle an annotation. */
2104 switch (*charbuf)
2105 {
2106 case CODING_ANNOTATE_COMPOSITION_MASK:
2107 /* Not yet implemented. */
2108 break;
2109 case CODING_ANNOTATE_CHARSET_MASK:
2110 preferred_charset_id = charbuf[3];
2111 if (preferred_charset_id >= 0
2112 && NILP (Fmemq (make_number (preferred_charset_id),
2113 charset_list)))
2114 preferred_charset_id = -1;
2115 break;
2116 default:
2117 abort ();
2118 }
2119 charbuf += -c - 1;
2120 continue;
2121 }
2122
2123 if (ASCII_CHAR_P (c))
2124 EMIT_ONE_ASCII_BYTE (c);
2125 else if (CHAR_BYTE8_P (c))
2126 {
2127 c = CHAR_TO_BYTE8 (c);
2128 EMIT_ONE_BYTE (c);
2129 }
2130 else
2131 {
2132 struct charset *charset;
2133 unsigned code;
2134 int dimension;
2135 int emacs_mule_id;
2136 unsigned char leading_codes[2];
2137
2138 if (preferred_charset_id >= 0)
2139 {
2140 charset = CHARSET_FROM_ID (preferred_charset_id);
2141 if (! CHAR_CHARSET_P (c, charset))
2142 charset = char_charset (c, charset_list, NULL);
2143 }
2144 else
2145 charset = char_charset (c, charset_list, &code);
2146 if (! charset)
2147 {
2148 c = coding->default_char;
2149 if (ASCII_CHAR_P (c))
2150 {
2151 EMIT_ONE_ASCII_BYTE (c);
2152 continue;
2153 }
2154 charset = char_charset (c, charset_list, &code);
2155 }
2156 dimension = CHARSET_DIMENSION (charset);
2157 emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2158 EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2159 EMIT_ONE_BYTE (leading_codes[0]);
2160 if (leading_codes[1])
2161 EMIT_ONE_BYTE (leading_codes[1]);
2162 if (dimension == 1)
2163 EMIT_ONE_BYTE (code);
2164 else
2165 {
2166 EMIT_ONE_BYTE (code >> 8);
2167 EMIT_ONE_BYTE (code & 0xFF);
2168 }
2169 }
2170 }
2171 coding->result = CODING_RESULT_SUCCESS;
2172 coding->produced_char += produced_chars;
2173 coding->produced = dst - coding->destination;
2174 return 0;
2175 }
2176
2177 \f
2178 /*** 7. ISO2022 handlers ***/
2179
2180 /* The following note describes the coding system ISO2022 briefly.
2181 Since the intention of this note is to help understand the
2182 functions in this file, some parts are NOT ACCURATE or are OVERLY
2183 SIMPLIFIED. For thorough understanding, please refer to the
2184 original document of ISO2022. This is equivalent to the standard
2185 ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2186
2187 ISO2022 provides many mechanisms to encode several character sets
2188 in 7-bit and 8-bit environments. For 7-bit environments, all text
2189 is encoded using bytes less than 128. This may make the encoded
2190 text a little bit longer, but the text passes more easily through
2191 several types of gateway, some of which strip off the MSB (Most
2192 Significant Bit).
2193
2194 There are two kinds of character sets: control character sets and
2195 graphic character sets. The former contain control characters such
2196 as `newline' and `escape' to provide control functions (control
2197 functions are also provided by escape sequences). The latter
2198 contain graphic characters such as 'A' and '-'. Emacs recognizes
2199 two control character sets and many graphic character sets.
2200
2201 Graphic character sets are classified into one of the following
2202 four classes, according to the number of bytes (DIMENSION) and
2203 number of characters in one dimension (CHARS) of the set:
2204 - DIMENSION1_CHARS94
2205 - DIMENSION1_CHARS96
2206 - DIMENSION2_CHARS94
2207 - DIMENSION2_CHARS96
2208
2209 In addition, each character set is assigned an identification tag,
2210 unique for each set, called the "final character" (denoted as <F>
2211 hereafter). The <F> of each character set is decided by ECMA(*)
2212 when it is registered in ISO. The code range of <F> is 0x30..0x7F
2213 (0x30..0x3F are for private use only).
2214
2215 Note (*): ECMA = European Computer Manufacturers Association
2216
2217 Here are examples of graphic character sets [NAME(<F>)]:
2218 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2219 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2220 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2221 o DIMENSION2_CHARS96 -- none for the moment
2222
2223 A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2224 C0 [0x00..0x1F] -- control character plane 0
2225 GL [0x20..0x7F] -- graphic character plane 0
2226 C1 [0x80..0x9F] -- control character plane 1
2227 GR [0xA0..0xFF] -- graphic character plane 1
2228
2229 A control character set is directly designated and invoked to C0 or
2230 C1 by an escape sequence. The most common case is that:
2231 - ISO646's control character set is designated/invoked to C0, and
2232 - ISO6429's control character set is designated/invoked to C1,
2233 and usually these designations/invocations are omitted in encoded
2234 text. In a 7-bit environment, only C0 can be used, and a control
2235 character for C1 is encoded by an appropriate escape sequence to
2236 fit into the environment. All control characters for C1 are
2237 defined to have corresponding escape sequences.
2238
2239 A graphic character set is at first designated to one of four
2240 graphic registers (G0 through G3), then these graphic registers are
2241 invoked to GL or GR. These designations and invocations can be
2242 done independently. The most common case is that G0 is invoked to
2243 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
2244 these invocations and designations are omitted in encoded text.
2245 In a 7-bit environment, only GL can be used.
2246
2247 When a graphic character set of CHARS94 is invoked to GL, codes
2248 0x20 and 0x7F of the GL area work as control characters SPACE and
2249 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2250 be used.
2251
2252 There are two ways of invocation: locking-shift and single-shift.
2253 With locking-shift, the invocation lasts until the next different
2254 invocation, whereas with single-shift, the invocation affects the
2255 following character only and doesn't affect the locking-shift
2256 state. Invocations are done by the following control characters or
2257 escape sequences:
2258
2259 ----------------------------------------------------------------------
2260 abbrev function cntrl escape seq description
2261 ----------------------------------------------------------------------
2262 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
2263 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
2264 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
2265 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
2266 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
2267 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
2268 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
2269 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
2270 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
2271 ----------------------------------------------------------------------
2272 (*) These are not used by any known coding system.
2273
2274 Control characters for these functions are defined by macros
2275 ISO_CODE_XXX in `coding.h'.
2276
2277 Designations are done by the following escape sequences:
2278 ----------------------------------------------------------------------
2279 escape sequence description
2280 ----------------------------------------------------------------------
2281 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
2282 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
2283 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
2284 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
2285 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
2286 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
2287 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
2288 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
2289 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
2290 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
2291 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
2292 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
2293 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
2294 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
2295 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
2296 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
2297 ----------------------------------------------------------------------
2298
2299 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2300 of dimension 1, chars 94, and final character <F>, etc...
2301
2302 Note (*): Although these designations are not allowed in ISO2022,
2303 Emacs accepts them on decoding, and produces them on encoding
2304 CHARS96 character sets in a coding system which is characterized as
2305 7-bit environment, non-locking-shift, and non-single-shift.
2306
2307 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2308 '(' must be omitted. We refer to this as "short-form" hereafter.
2309
2310 Now you may notice that there are a lot of ways of encoding the
2311 same multilingual text in ISO2022. Actually, there exist many
2312 coding systems such as Compound Text (used in X11's inter client
2313 communication, ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2314 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2315 localized platforms), and all of these are variants of ISO2022.
2316
2317 In addition to the above, Emacs handles two more kinds of escape
2318 sequences: ISO6429's direction specification and Emacs' private
2319 sequence for specifying character composition.
2320
2321 ISO6429's direction specification takes the following form:
2322 o CSI ']' -- end of the current direction
2323 o CSI '0' ']' -- end of the current direction
2324 o CSI '1' ']' -- start of left-to-right text
2325 o CSI '2' ']' -- start of right-to-left text
2326 The control character CSI (0x9B: control sequence introducer) is
2327 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2328
2329 Character composition specification takes the following form:
2330 o ESC '0' -- start relative composition
2331 o ESC '1' -- end composition
2332 o ESC '2' -- start rule-base composition (*)
2333 o ESC '3' -- start relative composition with alternate chars (**)
2334 o ESC '4' -- start rule-base composition with alternate chars (**)
2335 Since these are not standard escape sequences of any ISO standard,
2336 the use of them with these meanings is restricted to Emacs only.
2337
2338 (*) This form is used only in Emacs 20.7 and older versions,
2339 but newer versions can safely decode it.
2340 (**) This form is used only in Emacs 21.1 and newer versions,
2341 and older versions can't decode it.
2342
2343 Here's a list of example usages of these composition escape
2344 sequences (categorized by `enum composition_method').
2345
2346 COMPOSITION_RELATIVE:
2347 ESC 0 CHAR [ CHAR ] ESC 1
2348 COMPOSITION_WITH_RULE:
2349 ESC 2 CHAR [ RULE CHAR ] ESC 1
2350 COMPOSITION_WITH_ALTCHARS:
2351 ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2352 COMPOSITION_WITH_RULE_ALTCHARS:
2353 ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2354
2355 enum iso_code_class_type iso_code_class[256];
2356
2357 #define SAFE_CHARSET_P(coding, id) \
2358 ((id) <= (coding)->max_charset_id \
2359 && (coding)->safe_charsets[id] >= 0)
2360
2361
2362 #define SHIFT_OUT_OK(category) \
2363 (CODING_ISO_INITIAL (&coding_categories[category], 1) >= 0)
2364
2365 static void
2366 setup_iso_safe_charsets (attrs)
2367 Lisp_Object attrs;
2368 {
2369 Lisp_Object charset_list, safe_charsets;
2370 Lisp_Object request;
2371 Lisp_Object reg_usage;
2372 Lisp_Object tail;
2373 int reg94, reg96;
2374 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2375 int max_charset_id;
2376
2377 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2378 if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2379 && ! EQ (charset_list, Viso_2022_charset_list))
2380 {
2381 CODING_ATTR_CHARSET_LIST (attrs)
2382 = charset_list = Viso_2022_charset_list;
2383 ASET (attrs, coding_attr_safe_charsets, Qnil);
2384 }
2385
2386 if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2387 return;
2388
2389 max_charset_id = 0;
2390 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2391 {
2392 int id = XINT (XCAR (tail));
2393 if (max_charset_id < id)
2394 max_charset_id = id;
2395 }
2396
2397 safe_charsets = Fmake_string (make_number (max_charset_id + 1),
2398 make_number (255));
2399 request = AREF (attrs, coding_attr_iso_request);
2400 reg_usage = AREF (attrs, coding_attr_iso_usage);
2401 reg94 = XINT (XCAR (reg_usage));
2402 reg96 = XINT (XCDR (reg_usage));
2403
2404 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2405 {
2406 Lisp_Object id;
2407 Lisp_Object reg;
2408 struct charset *charset;
2409
2410 id = XCAR (tail);
2411 charset = CHARSET_FROM_ID (XINT (id));
2412 reg = Fcdr (Fassq (id, request));
2413 if (! NILP (reg))
2414 XSTRING (safe_charsets)->data[XINT (id)] = XINT (reg);
2415 else if (charset->iso_chars_96)
2416 {
2417 if (reg96 < 4)
2418 XSTRING (safe_charsets)->data[XINT (id)] = reg96;
2419 }
2420 else
2421 {
2422 if (reg94 < 4)
2423 XSTRING (safe_charsets)->data[XINT (id)] = reg94;
2424 }
2425 }
2426 ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2427 }
2428
2429
2430 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2431 Check if a text is encoded in one of ISO-2022 based codig systems.
2432 If it is, return 1, else return 0. */
2433
2434 static int
2435 detect_coding_iso_2022 (coding, detect_info)
2436 struct coding_system *coding;
2437 struct coding_detection_info *detect_info;
2438 {
2439 unsigned char *src = coding->source, *src_base = src;
2440 unsigned char *src_end = coding->source + coding->src_bytes;
2441 int multibytep = coding->src_multibyte;
2442 int single_shifting = 0;
2443 int id;
2444 int c, c1;
2445 int consumed_chars = 0;
2446 int i;
2447 int rejected = 0;
2448 int found = 0;
2449
2450 detect_info->checked |= CATEGORY_MASK_ISO;
2451
2452 for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
2453 {
2454 struct coding_system *this = &(coding_categories[i]);
2455 Lisp_Object attrs, val;
2456
2457 attrs = CODING_ID_ATTRS (this->id);
2458 if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2459 && ! EQ (CODING_ATTR_SAFE_CHARSETS (attrs), Viso_2022_charset_list))
2460 setup_iso_safe_charsets (attrs);
2461 val = CODING_ATTR_SAFE_CHARSETS (attrs);
2462 this->max_charset_id = XSTRING (val)->size - 1;
2463 this->safe_charsets = (char *) XSTRING (val)->data;
2464 }
2465
2466 /* A coding system of this category is always ASCII compatible. */
2467 src += coding->head_ascii;
2468
2469 while (rejected != CATEGORY_MASK_ISO)
2470 {
2471 ONE_MORE_BYTE (c);
2472 switch (c)
2473 {
2474 case ISO_CODE_ESC:
2475 if (inhibit_iso_escape_detection)
2476 break;
2477 single_shifting = 0;
2478 ONE_MORE_BYTE (c);
2479 if (c >= '(' && c <= '/')
2480 {
2481 /* Designation sequence for a charset of dimension 1. */
2482 ONE_MORE_BYTE (c1);
2483 if (c1 < ' ' || c1 >= 0x80
2484 || (id = iso_charset_table[0][c >= ','][c1]) < 0)
2485 /* Invalid designation sequence. Just ignore. */
2486 break;
2487 }
2488 else if (c == '$')
2489 {
2490 /* Designation sequence for a charset of dimension 2. */
2491 ONE_MORE_BYTE (c);
2492 if (c >= '@' && c <= 'B')
2493 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
2494 id = iso_charset_table[1][0][c];
2495 else if (c >= '(' && c <= '/')
2496 {
2497 ONE_MORE_BYTE (c1);
2498 if (c1 < ' ' || c1 >= 0x80
2499 || (id = iso_charset_table[1][c >= ','][c1]) < 0)
2500 /* Invalid designation sequence. Just ignore. */
2501 break;
2502 }
2503 else
2504 /* Invalid designation sequence. Just ignore it. */
2505 break;
2506 }
2507 else if (c == 'N' || c == 'O')
2508 {
2509 /* ESC <Fe> for SS2 or SS3. */
2510 single_shifting = 1;
2511 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
2512 break;
2513 }
2514 else if (c >= '0' && c <= '4')
2515 {
2516 /* ESC <Fp> for start/end composition. */
2517 found |= CATEGORY_MASK_ISO;
2518 break;
2519 }
2520 else
2521 {
2522 /* Invalid escape sequence. Just ignore it. */
2523 break;
2524 }
2525
2526 /* We found a valid designation sequence for CHARSET. */
2527 rejected |= CATEGORY_MASK_ISO_8BIT;
2528 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
2529 id))
2530 found |= CATEGORY_MASK_ISO_7;
2531 else
2532 rejected |= CATEGORY_MASK_ISO_7;
2533 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
2534 id))
2535 found |= CATEGORY_MASK_ISO_7_TIGHT;
2536 else
2537 rejected |= CATEGORY_MASK_ISO_7_TIGHT;
2538 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
2539 id))
2540 found |= CATEGORY_MASK_ISO_7_ELSE;
2541 else
2542 rejected |= CATEGORY_MASK_ISO_7_ELSE;
2543 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
2544 id))
2545 found |= CATEGORY_MASK_ISO_8_ELSE;
2546 else
2547 rejected |= CATEGORY_MASK_ISO_8_ELSE;
2548 break;
2549
2550 case ISO_CODE_SO:
2551 case ISO_CODE_SI:
2552 /* Locking shift out/in. */
2553 if (inhibit_iso_escape_detection)
2554 break;
2555 single_shifting = 0;
2556 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
2557 found |= CATEGORY_MASK_ISO_ELSE;
2558 break;
2559
2560 case ISO_CODE_CSI:
2561 /* Control sequence introducer. */
2562 single_shifting = 0;
2563 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
2564 found |= CATEGORY_MASK_ISO_8_ELSE;
2565 goto check_extra_latin;
2566
2567
2568 case ISO_CODE_SS2:
2569 case ISO_CODE_SS3:
2570 /* Single shift. */
2571 if (inhibit_iso_escape_detection)
2572 break;
2573 single_shifting = 1;
2574 rejected |= CATEGORY_MASK_ISO_7BIT;
2575 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2576 & CODING_ISO_FLAG_SINGLE_SHIFT)
2577 found |= CATEGORY_MASK_ISO_8_1;
2578 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
2579 & CODING_ISO_FLAG_SINGLE_SHIFT)
2580 found |= CATEGORY_MASK_ISO_8_2;
2581 goto check_extra_latin;
2582
2583 default:
2584 if (c < 0x80)
2585 {
2586 single_shifting = 0;
2587 break;
2588 }
2589 if (c >= 0xA0)
2590 {
2591 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
2592 found |= CATEGORY_MASK_ISO_8_1;
2593 /* Check the length of succeeding codes of the range
2594 0xA0..0FF. If the byte length is even, we include
2595 CATEGORY_MASK_ISO_8_2 in `found'. We can check this
2596 only when we are not single shifting. */
2597 if (! single_shifting
2598 && ! (rejected & CATEGORY_MASK_ISO_8_2))
2599 {
2600 int i = 1;
2601 while (src < src_end)
2602 {
2603 ONE_MORE_BYTE (c);
2604 if (c < 0xA0)
2605 break;
2606 i++;
2607 }
2608
2609 if (i & 1 && src < src_end)
2610 rejected |= CATEGORY_MASK_ISO_8_2;
2611 else
2612 found |= CATEGORY_MASK_ISO_8_2;
2613 }
2614 break;
2615 }
2616 check_extra_latin:
2617 single_shifting = 0;
2618 if (! VECTORP (Vlatin_extra_code_table)
2619 || NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
2620 {
2621 rejected = CATEGORY_MASK_ISO;
2622 break;
2623 }
2624 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2625 & CODING_ISO_FLAG_LATIN_EXTRA)
2626 found |= CATEGORY_MASK_ISO_8_1;
2627 else
2628 rejected |= CATEGORY_MASK_ISO_8_1;
2629 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
2630 & CODING_ISO_FLAG_LATIN_EXTRA)
2631 found |= CATEGORY_MASK_ISO_8_2;
2632 else
2633 rejected |= CATEGORY_MASK_ISO_8_2;
2634 }
2635 }
2636 detect_info->rejected |= CATEGORY_MASK_ISO;
2637 return 0;
2638
2639 no_more_source:
2640 detect_info->rejected |= rejected;
2641 detect_info->found |= (found & ~rejected);
2642 return 1;
2643 }
2644
2645
2646 /* Set designation state into CODING. */
2647 #define DECODE_DESIGNATION(reg, dim, chars_96, final) \
2648 do { \
2649 int id, prev; \
2650 \
2651 if (final < '0' || final >= 128 \
2652 || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0) \
2653 || !SAFE_CHARSET_P (coding, id)) \
2654 { \
2655 CODING_ISO_DESIGNATION (coding, reg) = -2; \
2656 goto invalid_code; \
2657 } \
2658 prev = CODING_ISO_DESIGNATION (coding, reg); \
2659 if (id == charset_jisx0201_roman) \
2660 { \
2661 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN) \
2662 id = charset_ascii; \
2663 } \
2664 else if (id == charset_jisx0208_1978) \
2665 { \
2666 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
2667 id = charset_jisx0208; \
2668 } \
2669 CODING_ISO_DESIGNATION (coding, reg) = id; \
2670 /* If there was an invalid designation to REG previously, and this \
2671 designation is ASCII to REG, we should keep this designation \
2672 sequence. */ \
2673 if (prev == -2 && id == charset_ascii) \
2674 goto invalid_code; \
2675 } while (0)
2676
2677
2678 #define MAYBE_FINISH_COMPOSITION() \
2679 do { \
2680 int i; \
2681 if (composition_state == COMPOSING_NO) \
2682 break; \
2683 /* It is assured that we have enough room for producing \
2684 characters stored in the table `components'. */ \
2685 if (charbuf + component_idx > charbuf_end) \
2686 goto no_more_source; \
2687 composition_state = COMPOSING_NO; \
2688 if (method == COMPOSITION_RELATIVE \
2689 || method == COMPOSITION_WITH_ALTCHARS) \
2690 { \
2691 for (i = 0; i < component_idx; i++) \
2692 *charbuf++ = components[i]; \
2693 char_offset += component_idx; \
2694 } \
2695 else \
2696 { \
2697 for (i = 0; i < component_idx; i += 2) \
2698 *charbuf++ = components[i]; \
2699 char_offset += (component_idx / 2) + 1; \
2700 } \
2701 } while (0)
2702
2703
2704 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
2705 ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
2706 ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
2707 ESC 3 : altchar composition : ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
2708 ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1
2709 */
2710
2711 #define DECODE_COMPOSITION_START(c1) \
2712 do { \
2713 if (c1 == '0' \
2714 && composition_state == COMPOSING_COMPONENT_RULE) \
2715 { \
2716 component_len = component_idx; \
2717 composition_state = COMPOSING_CHAR; \
2718 } \
2719 else \
2720 { \
2721 unsigned char *p; \
2722 \
2723 MAYBE_FINISH_COMPOSITION (); \
2724 if (charbuf + MAX_COMPOSITION_COMPONENTS > charbuf_end) \
2725 goto no_more_source; \
2726 for (p = src; p < src_end - 1; p++) \
2727 if (*p == ISO_CODE_ESC && p[1] == '1') \
2728 break; \
2729 if (p == src_end - 1) \
2730 { \
2731 if (coding->mode & CODING_MODE_LAST_BLOCK) \
2732 goto invalid_code; \
2733 goto no_more_source; \
2734 } \
2735 \
2736 /* This is surely the start of a composition. */ \
2737 method = (c1 == '0' ? COMPOSITION_RELATIVE \
2738 : c1 == '2' ? COMPOSITION_WITH_RULE \
2739 : c1 == '3' ? COMPOSITION_WITH_ALTCHARS \
2740 : COMPOSITION_WITH_RULE_ALTCHARS); \
2741 composition_state = (c1 <= '2' ? COMPOSING_CHAR \
2742 : COMPOSING_COMPONENT_CHAR); \
2743 component_idx = component_len = 0; \
2744 } \
2745 } while (0)
2746
2747
2748 /* Handle compositoin end sequence ESC 1. */
2749
2750 #define DECODE_COMPOSITION_END() \
2751 do { \
2752 int nchars = (component_len > 0 ? component_idx - component_len \
2753 : method == COMPOSITION_RELATIVE ? component_idx \
2754 : (component_idx + 1) / 2); \
2755 int i; \
2756 int *saved_charbuf = charbuf; \
2757 int from = coding->produced_char + char_offset; \
2758 int to = from + nchars; \
2759 \
2760 ADD_COMPOSITION_DATA (charbuf, from, to, method); \
2761 if (method != COMPOSITION_RELATIVE) \
2762 { \
2763 if (component_len == 0) \
2764 for (i = 0; i < component_idx; i++) \
2765 *charbuf++ = components[i]; \
2766 else \
2767 for (i = 0; i < component_len; i++) \
2768 *charbuf++ = components[i]; \
2769 *saved_charbuf = saved_charbuf - charbuf; \
2770 } \
2771 if (method == COMPOSITION_WITH_RULE) \
2772 for (i = 0; i < component_idx; i += 2, char_offset++) \
2773 *charbuf++ = components[i]; \
2774 else \
2775 for (i = component_len; i < component_idx; i++, char_offset++) \
2776 *charbuf++ = components[i]; \
2777 coding->annotated = 1; \
2778 composition_state = COMPOSING_NO; \
2779 } while (0)
2780
2781
2782 /* Decode a composition rule from the byte C1 (and maybe one more byte
2783 from SRC) and store one encoded composition rule in
2784 coding->cmp_data. */
2785
2786 #define DECODE_COMPOSITION_RULE(c1) \
2787 do { \
2788 (c1) -= 32; \
2789 if (c1 < 81) /* old format (before ver.21) */ \
2790 { \
2791 int gref = (c1) / 9; \
2792 int nref = (c1) % 9; \
2793 if (gref == 4) gref = 10; \
2794 if (nref == 4) nref = 10; \
2795 c1 = COMPOSITION_ENCODE_RULE (gref, nref); \
2796 } \
2797 else if (c1 < 93) /* new format (after ver.21) */ \
2798 { \
2799 ONE_MORE_BYTE (c2); \
2800 c1 = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32); \
2801 } \
2802 else \
2803 c1 = 0; \
2804 } while (0)
2805
2806
2807 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
2808
2809 static void
2810 decode_coding_iso_2022 (coding)
2811 struct coding_system *coding;
2812 {
2813 unsigned char *src = coding->source + coding->consumed;
2814 unsigned char *src_end = coding->source + coding->src_bytes;
2815 unsigned char *src_base;
2816 int *charbuf = coding->charbuf;
2817 int *charbuf_end
2818 = charbuf + coding->charbuf_size - 4 - MAX_ANNOTATION_LENGTH;
2819 int consumed_chars = 0, consumed_chars_base;
2820 int multibytep = coding->src_multibyte;
2821 /* Charsets invoked to graphic plane 0 and 1 respectively. */
2822 int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2823 int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
2824 struct charset *charset;
2825 int c;
2826 /* For handling composition sequence. */
2827 #define COMPOSING_NO 0
2828 #define COMPOSING_CHAR 1
2829 #define COMPOSING_RULE 2
2830 #define COMPOSING_COMPONENT_CHAR 3
2831 #define COMPOSING_COMPONENT_RULE 4
2832
2833 int composition_state = COMPOSING_NO;
2834 enum composition_method method;
2835 int components[MAX_COMPOSITION_COMPONENTS * 2 + 1];
2836 int component_idx;
2837 int component_len;
2838 Lisp_Object attrs, eol_type, charset_list;
2839 int char_offset = coding->produced_char;
2840 int last_offset = char_offset;
2841 int last_id = charset_ascii;
2842
2843 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
2844 setup_iso_safe_charsets (attrs);
2845
2846 while (1)
2847 {
2848 int c1, c2;
2849
2850 src_base = src;
2851 consumed_chars_base = consumed_chars;
2852
2853 if (charbuf >= charbuf_end)
2854 break;
2855
2856 ONE_MORE_BYTE (c1);
2857
2858 /* We produce at most one character. */
2859 switch (iso_code_class [c1])
2860 {
2861 case ISO_0x20_or_0x7F:
2862 if (composition_state != COMPOSING_NO)
2863 {
2864 if (composition_state == COMPOSING_RULE
2865 || composition_state == COMPOSING_COMPONENT_RULE)
2866 {
2867 DECODE_COMPOSITION_RULE (c1);
2868 components[component_idx++] = c1;
2869 composition_state--;
2870 continue;
2871 }
2872 }
2873 if (charset_id_0 < 0
2874 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
2875 /* This is SPACE or DEL. */
2876 charset = CHARSET_FROM_ID (charset_ascii);
2877 else
2878 charset = CHARSET_FROM_ID (charset_id_0);
2879 break;
2880
2881 case ISO_graphic_plane_0:
2882 if (composition_state != COMPOSING_NO)
2883 {
2884 if (composition_state == COMPOSING_RULE
2885 || composition_state == COMPOSING_COMPONENT_RULE)
2886 {
2887 DECODE_COMPOSITION_RULE (c1);
2888 components[component_idx++] = c1;
2889 composition_state--;
2890 continue;
2891 }
2892 }
2893 charset = CHARSET_FROM_ID (charset_id_0);
2894 break;
2895
2896 case ISO_0xA0_or_0xFF:
2897 if (charset_id_1 < 0
2898 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
2899 || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
2900 goto invalid_code;
2901 /* This is a graphic character, we fall down ... */
2902
2903 case ISO_graphic_plane_1:
2904 if (charset_id_1 < 0)
2905 goto invalid_code;
2906 charset = CHARSET_FROM_ID (charset_id_1);
2907 break;
2908
2909 case ISO_carriage_return:
2910 if (c1 == '\r')
2911 {
2912 if (EQ (eol_type, Qdos))
2913 {
2914 if (src == src_end)
2915 {
2916 coding->result = CODING_RESULT_INSUFFICIENT_SRC;
2917 goto no_more_source;
2918 }
2919 if (*src == '\n')
2920 ONE_MORE_BYTE (c1);
2921 }
2922 else if (EQ (eol_type, Qmac))
2923 c1 = '\n';
2924 }
2925 /* fall through */
2926
2927 case ISO_control_0:
2928 MAYBE_FINISH_COMPOSITION ();
2929 charset = CHARSET_FROM_ID (charset_ascii);
2930 break;
2931
2932 case ISO_control_1:
2933 MAYBE_FINISH_COMPOSITION ();
2934 goto invalid_code;
2935
2936 case ISO_shift_out:
2937 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
2938 || CODING_ISO_DESIGNATION (coding, 1) < 0)
2939 goto invalid_code;
2940 CODING_ISO_INVOCATION (coding, 0) = 1;
2941 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2942 continue;
2943
2944 case ISO_shift_in:
2945 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
2946 goto invalid_code;
2947 CODING_ISO_INVOCATION (coding, 0) = 0;
2948 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2949 continue;
2950
2951 case ISO_single_shift_2_7:
2952 case ISO_single_shift_2:
2953 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
2954 goto invalid_code;
2955 /* SS2 is handled as an escape sequence of ESC 'N' */
2956 c1 = 'N';
2957 goto label_escape_sequence;
2958
2959 case ISO_single_shift_3:
2960 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
2961 goto invalid_code;
2962 /* SS2 is handled as an escape sequence of ESC 'O' */
2963 c1 = 'O';
2964 goto label_escape_sequence;
2965
2966 case ISO_control_sequence_introducer:
2967 /* CSI is handled as an escape sequence of ESC '[' ... */
2968 c1 = '[';
2969 goto label_escape_sequence;
2970
2971 case ISO_escape:
2972 ONE_MORE_BYTE (c1);
2973 label_escape_sequence:
2974 /* Escape sequences handled here are invocation,
2975 designation, direction specification, and character
2976 composition specification. */
2977 switch (c1)
2978 {
2979 case '&': /* revision of following character set */
2980 ONE_MORE_BYTE (c1);
2981 if (!(c1 >= '@' && c1 <= '~'))
2982 goto invalid_code;
2983 ONE_MORE_BYTE (c1);
2984 if (c1 != ISO_CODE_ESC)
2985 goto invalid_code;
2986 ONE_MORE_BYTE (c1);
2987 goto label_escape_sequence;
2988
2989 case '$': /* designation of 2-byte character set */
2990 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
2991 goto invalid_code;
2992 ONE_MORE_BYTE (c1);
2993 if (c1 >= '@' && c1 <= 'B')
2994 { /* designation of JISX0208.1978, GB2312.1980,
2995 or JISX0208.1980 */
2996 DECODE_DESIGNATION (0, 2, 0, c1);
2997 }
2998 else if (c1 >= 0x28 && c1 <= 0x2B)
2999 { /* designation of DIMENSION2_CHARS94 character set */
3000 ONE_MORE_BYTE (c2);
3001 DECODE_DESIGNATION (c1 - 0x28, 2, 0, c2);
3002 }
3003 else if (c1 >= 0x2C && c1 <= 0x2F)
3004 { /* designation of DIMENSION2_CHARS96 character set */
3005 ONE_MORE_BYTE (c2);
3006 DECODE_DESIGNATION (c1 - 0x2C, 2, 1, c2);
3007 }
3008 else
3009 goto invalid_code;
3010 /* We must update these variables now. */
3011 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3012 charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3013 continue;
3014
3015 case 'n': /* invocation of locking-shift-2 */
3016 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3017 || CODING_ISO_DESIGNATION (coding, 2) < 0)
3018 goto invalid_code;
3019 CODING_ISO_INVOCATION (coding, 0) = 2;
3020 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3021 continue;
3022
3023 case 'o': /* invocation of locking-shift-3 */
3024 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3025 || CODING_ISO_DESIGNATION (coding, 3) < 0)
3026 goto invalid_code;
3027 CODING_ISO_INVOCATION (coding, 0) = 3;
3028 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3029 continue;
3030
3031 case 'N': /* invocation of single-shift-2 */
3032 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3033 || CODING_ISO_DESIGNATION (coding, 2) < 0)
3034 goto invalid_code;
3035 charset = CHARSET_FROM_ID (CODING_ISO_DESIGNATION (coding, 2));
3036 ONE_MORE_BYTE (c1);
3037 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3038 goto invalid_code;
3039 break;
3040
3041 case 'O': /* invocation of single-shift-3 */
3042 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3043 || CODING_ISO_DESIGNATION (coding, 3) < 0)
3044 goto invalid_code;
3045 charset = CHARSET_FROM_ID (CODING_ISO_DESIGNATION (coding, 3));
3046 ONE_MORE_BYTE (c1);
3047 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3048 goto invalid_code;
3049 break;
3050
3051 case '0': case '2': case '3': case '4': /* start composition */
3052 if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3053 goto invalid_code;
3054 DECODE_COMPOSITION_START (c1);
3055 continue;
3056
3057 case '1': /* end composition */
3058 if (composition_state == COMPOSING_NO)
3059 goto invalid_code;
3060 DECODE_COMPOSITION_END ();
3061 continue;
3062
3063 case '[': /* specification of direction */
3064 if (! CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION)
3065 goto invalid_code;
3066 /* For the moment, nested direction is not supported.
3067 So, `coding->mode & CODING_MODE_DIRECTION' zero means
3068 left-to-right, and nozero means right-to-left. */
3069 ONE_MORE_BYTE (c1);
3070 switch (c1)
3071 {
3072 case ']': /* end of the current direction */
3073 coding->mode &= ~CODING_MODE_DIRECTION;
3074
3075 case '0': /* end of the current direction */
3076 case '1': /* start of left-to-right direction */
3077 ONE_MORE_BYTE (c1);
3078 if (c1 == ']')
3079 coding->mode &= ~CODING_MODE_DIRECTION;
3080 else
3081 goto invalid_code;
3082 break;
3083
3084 case '2': /* start of right-to-left direction */
3085 ONE_MORE_BYTE (c1);
3086 if (c1 == ']')
3087 coding->mode |= CODING_MODE_DIRECTION;
3088 else
3089 goto invalid_code;
3090 break;
3091
3092 default:
3093 goto invalid_code;
3094 }
3095 continue;
3096
3097 default:
3098 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3099 goto invalid_code;
3100 if (c1 >= 0x28 && c1 <= 0x2B)
3101 { /* designation of DIMENSION1_CHARS94 character set */
3102 ONE_MORE_BYTE (c2);
3103 DECODE_DESIGNATION (c1 - 0x28, 1, 0, c2);
3104 }
3105 else if (c1 >= 0x2C && c1 <= 0x2F)
3106 { /* designation of DIMENSION1_CHARS96 character set */
3107 ONE_MORE_BYTE (c2);
3108 DECODE_DESIGNATION (c1 - 0x2C, 1, 1, c2);
3109 }
3110 else
3111 goto invalid_code;
3112 /* We must update these variables now. */
3113 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3114 charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3115 continue;
3116 }
3117 }
3118
3119 if (charset->id != charset_ascii
3120 && last_id != charset->id)
3121 {
3122 if (last_id != charset_ascii)
3123 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
3124 last_id = charset->id;
3125 last_offset = char_offset;
3126 }
3127
3128 /* Now we know CHARSET and 1st position code C1 of a character.
3129 Produce a decoded character while getting 2nd position code
3130 C2 if necessary. */
3131 c1 &= 0x7F;
3132 if (CHARSET_DIMENSION (charset) > 1)
3133 {
3134 ONE_MORE_BYTE (c2);
3135 if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
3136 /* C2 is not in a valid range. */
3137 goto invalid_code;
3138 c1 = (c1 << 8) | (c2 & 0x7F);
3139 if (CHARSET_DIMENSION (charset) > 2)
3140 {
3141 ONE_MORE_BYTE (c2);
3142 if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
3143 /* C2 is not in a valid range. */
3144 goto invalid_code;
3145 c1 = (c1 << 8) | (c2 & 0x7F);
3146 }
3147 }
3148
3149 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3150 if (c < 0)
3151 {
3152 MAYBE_FINISH_COMPOSITION ();
3153 for (; src_base < src; src_base++, char_offset++)
3154 {
3155 if (ASCII_BYTE_P (*src_base))
3156 *charbuf++ = *src_base;
3157 else
3158 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3159 char_offset++;
3160 }
3161 }
3162 else if (composition_state == COMPOSING_NO)
3163 {
3164 *charbuf++ = c;
3165 char_offset++;
3166 }
3167 else
3168 {
3169 components[component_idx++] = c;
3170 if (method == COMPOSITION_WITH_RULE
3171 || (method == COMPOSITION_WITH_RULE_ALTCHARS
3172 && composition_state == COMPOSING_COMPONENT_CHAR))
3173 composition_state++;
3174 }
3175 continue;
3176
3177 invalid_code:
3178 MAYBE_FINISH_COMPOSITION ();
3179 src = src_base;
3180 consumed_chars = consumed_chars_base;
3181 ONE_MORE_BYTE (c);
3182 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
3183 char_offset++;
3184 coding->errors++;
3185 }
3186
3187 no_more_source:
3188 if (last_id != charset_ascii)
3189 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
3190 coding->consumed_char += consumed_chars_base;
3191 coding->consumed = src_base - coding->source;
3192 coding->charbuf_used = charbuf - coding->charbuf;
3193 }
3194
3195
3196 /* ISO2022 encoding stuff. */
3197
3198 /*
3199 It is not enough to say just "ISO2022" on encoding, we have to
3200 specify more details. In Emacs, each coding system of ISO2022
3201 variant has the following specifications:
3202 1. Initial designation to G0 thru G3.
3203 2. Allows short-form designation?
3204 3. ASCII should be designated to G0 before control characters?
3205 4. ASCII should be designated to G0 at end of line?
3206 5. 7-bit environment or 8-bit environment?
3207 6. Use locking-shift?
3208 7. Use Single-shift?
3209 And the following two are only for Japanese:
3210 8. Use ASCII in place of JIS0201-1976-Roman?
3211 9. Use JISX0208-1983 in place of JISX0208-1978?
3212 These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
3213 defined by macros CODING_ISO_FLAG_XXX. See `coding.h' for more
3214 details.
3215 */
3216
3217 /* Produce codes (escape sequence) for designating CHARSET to graphic
3218 register REG at DST, and increment DST. If <final-char> of CHARSET is
3219 '@', 'A', or 'B' and the coding system CODING allows, produce
3220 designation sequence of short-form. */
3221
3222 #define ENCODE_DESIGNATION(charset, reg, coding) \
3223 do { \
3224 unsigned char final_char = CHARSET_ISO_FINAL (charset); \
3225 char *intermediate_char_94 = "()*+"; \
3226 char *intermediate_char_96 = ",-./"; \
3227 int revision = -1; \
3228 int c; \
3229 \
3230 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION) \
3231 revision = CHARSET_ISO_REVISION (charset); \
3232 \
3233 if (revision >= 0) \
3234 { \
3235 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&'); \
3236 EMIT_ONE_BYTE ('@' + revision); \
3237 } \
3238 EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC); \
3239 if (CHARSET_DIMENSION (charset) == 1) \
3240 { \
3241 if (! CHARSET_ISO_CHARS_96 (charset)) \
3242 c = intermediate_char_94[reg]; \
3243 else \
3244 c = intermediate_char_96[reg]; \
3245 EMIT_ONE_ASCII_BYTE (c); \
3246 } \
3247 else \
3248 { \
3249 EMIT_ONE_ASCII_BYTE ('$'); \
3250 if (! CHARSET_ISO_CHARS_96 (charset)) \
3251 { \
3252 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM \
3253 || reg != 0 \
3254 || final_char < '@' || final_char > 'B') \
3255 EMIT_ONE_ASCII_BYTE (intermediate_char_94[reg]); \
3256 } \
3257 else \
3258 EMIT_ONE_ASCII_BYTE (intermediate_char_96[reg]); \
3259 } \
3260 EMIT_ONE_ASCII_BYTE (final_char); \
3261 \
3262 CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset); \
3263 } while (0)
3264
3265
3266 /* The following two macros produce codes (control character or escape
3267 sequence) for ISO2022 single-shift functions (single-shift-2 and
3268 single-shift-3). */
3269
3270 #define ENCODE_SINGLE_SHIFT_2 \
3271 do { \
3272 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3273 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N'); \
3274 else \
3275 EMIT_ONE_BYTE (ISO_CODE_SS2); \
3276 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \
3277 } while (0)
3278
3279
3280 #define ENCODE_SINGLE_SHIFT_3 \
3281 do { \
3282 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3283 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O'); \
3284 else \
3285 EMIT_ONE_BYTE (ISO_CODE_SS3); \
3286 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \
3287 } while (0)
3288
3289
3290 /* The following four macros produce codes (control character or
3291 escape sequence) for ISO2022 locking-shift functions (shift-in,
3292 shift-out, locking-shift-2, and locking-shift-3). */
3293
3294 #define ENCODE_SHIFT_IN \
3295 do { \
3296 EMIT_ONE_ASCII_BYTE (ISO_CODE_SI); \
3297 CODING_ISO_INVOCATION (coding, 0) = 0; \
3298 } while (0)
3299
3300
3301 #define ENCODE_SHIFT_OUT \
3302 do { \
3303 EMIT_ONE_ASCII_BYTE (ISO_CODE_SO); \
3304 CODING_ISO_INVOCATION (coding, 0) = 1; \
3305 } while (0)
3306
3307
3308 #define ENCODE_LOCKING_SHIFT_2 \
3309 do { \
3310 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n'); \
3311 CODING_ISO_INVOCATION (coding, 0) = 2; \
3312 } while (0)
3313
3314
3315 #define ENCODE_LOCKING_SHIFT_3 \
3316 do { \
3317 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n'); \
3318 CODING_ISO_INVOCATION (coding, 0) = 3; \
3319 } while (0)
3320
3321
3322 /* Produce codes for a DIMENSION1 character whose character set is
3323 CHARSET and whose position-code is C1. Designation and invocation
3324 sequences are also produced in advance if necessary. */
3325
3326 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1) \
3327 do { \
3328 int id = CHARSET_ID (charset); \
3329 \
3330 if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN) \
3331 && id == charset_ascii) \
3332 { \
3333 id = charset_jisx0201_roman; \
3334 charset = CHARSET_FROM_ID (id); \
3335 } \
3336 \
3337 if (CODING_ISO_SINGLE_SHIFTING (coding)) \
3338 { \
3339 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3340 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \
3341 else \
3342 EMIT_ONE_BYTE (c1 | 0x80); \
3343 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \
3344 break; \
3345 } \
3346 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \
3347 { \
3348 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \
3349 break; \
3350 } \
3351 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \
3352 { \
3353 EMIT_ONE_BYTE (c1 | 0x80); \
3354 break; \
3355 } \
3356 else \
3357 /* Since CHARSET is not yet invoked to any graphic planes, we \
3358 must invoke it, or, at first, designate it to some graphic \
3359 register. Then repeat the loop to actually produce the \
3360 character. */ \
3361 dst = encode_invocation_designation (charset, coding, dst, \
3362 &produced_chars); \
3363 } while (1)
3364
3365
3366 /* Produce codes for a DIMENSION2 character whose character set is
3367 CHARSET and whose position-codes are C1 and C2. Designation and
3368 invocation codes are also produced in advance if necessary. */
3369
3370 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2) \
3371 do { \
3372 int id = CHARSET_ID (charset); \
3373 \
3374 if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
3375 && id == charset_jisx0208) \
3376 { \
3377 id = charset_jisx0208_1978; \
3378 charset = CHARSET_FROM_ID (id); \
3379 } \
3380 \
3381 if (CODING_ISO_SINGLE_SHIFTING (coding)) \
3382 { \
3383 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3384 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \
3385 else \
3386 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \
3387 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \
3388 break; \
3389 } \
3390 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \
3391 { \
3392 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \
3393 break; \
3394 } \
3395 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \
3396 { \
3397 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \
3398 break; \
3399 } \
3400 else \
3401 /* Since CHARSET is not yet invoked to any graphic planes, we \
3402 must invoke it, or, at first, designate it to some graphic \
3403 register. Then repeat the loop to actually produce the \
3404 character. */ \
3405 dst = encode_invocation_designation (charset, coding, dst, \
3406 &produced_chars); \
3407 } while (1)
3408
3409
3410 #define ENCODE_ISO_CHARACTER(charset, c) \
3411 do { \
3412 int code = ENCODE_CHAR ((charset),(c)); \
3413 \
3414 if (CHARSET_DIMENSION (charset) == 1) \
3415 ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code); \
3416 else \
3417 ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
3418 } while (0)
3419
3420
3421 /* Produce designation and invocation codes at a place pointed by DST
3422 to use CHARSET. The element `spec.iso_2022' of *CODING is updated.
3423 Return new DST. */
3424
3425 unsigned char *
3426 encode_invocation_designation (charset, coding, dst, p_nchars)
3427 struct charset *charset;
3428 struct coding_system *coding;
3429 unsigned char *dst;
3430 int *p_nchars;
3431 {
3432 int multibytep = coding->dst_multibyte;
3433 int produced_chars = *p_nchars;
3434 int reg; /* graphic register number */
3435 int id = CHARSET_ID (charset);
3436
3437 /* At first, check designations. */
3438 for (reg = 0; reg < 4; reg++)
3439 if (id == CODING_ISO_DESIGNATION (coding, reg))
3440 break;
3441
3442 if (reg >= 4)
3443 {
3444 /* CHARSET is not yet designated to any graphic registers. */
3445 /* At first check the requested designation. */
3446 reg = CODING_ISO_REQUEST (coding, id);
3447 if (reg < 0)
3448 /* Since CHARSET requests no special designation, designate it
3449 to graphic register 0. */
3450 reg = 0;
3451
3452 ENCODE_DESIGNATION (charset, reg, coding);
3453 }
3454
3455 if (CODING_ISO_INVOCATION (coding, 0) != reg
3456 && CODING_ISO_INVOCATION (coding, 1) != reg)
3457 {
3458 /* Since the graphic register REG is not invoked to any graphic
3459 planes, invoke it to graphic plane 0. */
3460 switch (reg)
3461 {
3462 case 0: /* graphic register 0 */
3463 ENCODE_SHIFT_IN;
3464 break;
3465
3466 case 1: /* graphic register 1 */
3467 ENCODE_SHIFT_OUT;
3468 break;
3469
3470 case 2: /* graphic register 2 */
3471 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3472 ENCODE_SINGLE_SHIFT_2;
3473 else
3474 ENCODE_LOCKING_SHIFT_2;
3475 break;
3476
3477 case 3: /* graphic register 3 */
3478 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3479 ENCODE_SINGLE_SHIFT_3;
3480 else
3481 ENCODE_LOCKING_SHIFT_3;
3482 break;
3483 }
3484 }
3485
3486 *p_nchars = produced_chars;
3487 return dst;
3488 }
3489
3490 /* The following three macros produce codes for indicating direction
3491 of text. */
3492 #define ENCODE_CONTROL_SEQUENCE_INTRODUCER \
3493 do { \
3494 if (CODING_ISO_FLAGS (coding) == CODING_ISO_FLAG_SEVEN_BITS) \
3495 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '['); \
3496 else \
3497 EMIT_ONE_BYTE (ISO_CODE_CSI); \
3498 } while (0)
3499
3500
3501 #define ENCODE_DIRECTION_R2L() \
3502 do { \
3503 ENCODE_CONTROL_SEQUENCE_INTRODUCER (dst); \
3504 EMIT_TWO_ASCII_BYTES ('2', ']'); \
3505 } while (0)
3506
3507
3508 #define ENCODE_DIRECTION_L2R() \
3509 do { \
3510 ENCODE_CONTROL_SEQUENCE_INTRODUCER (dst); \
3511 EMIT_TWO_ASCII_BYTES ('0', ']'); \
3512 } while (0)
3513
3514
3515 /* Produce codes for designation and invocation to reset the graphic
3516 planes and registers to initial state. */
3517 #define ENCODE_RESET_PLANE_AND_REGISTER() \
3518 do { \
3519 int reg; \
3520 struct charset *charset; \
3521 \
3522 if (CODING_ISO_INVOCATION (coding, 0) != 0) \
3523 ENCODE_SHIFT_IN; \
3524 for (reg = 0; reg < 4; reg++) \
3525 if (CODING_ISO_INITIAL (coding, reg) >= 0 \
3526 && (CODING_ISO_DESIGNATION (coding, reg) \
3527 != CODING_ISO_INITIAL (coding, reg))) \
3528 { \
3529 charset = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg)); \
3530 ENCODE_DESIGNATION (charset, reg, coding); \
3531 } \
3532 } while (0)
3533
3534
3535 /* Produce designation sequences of charsets in the line started from
3536 SRC to a place pointed by DST, and return updated DST.
3537
3538 If the current block ends before any end-of-line, we may fail to
3539 find all the necessary designations. */
3540
3541 static unsigned char *
3542 encode_designation_at_bol (coding, charbuf, charbuf_end, dst)
3543 struct coding_system *coding;
3544 int *charbuf, *charbuf_end;
3545 unsigned char *dst;
3546 {
3547 struct charset *charset;
3548 /* Table of charsets to be designated to each graphic register. */
3549 int r[4];
3550 int c, found = 0, reg;
3551 int produced_chars = 0;
3552 int multibytep = coding->dst_multibyte;
3553 Lisp_Object attrs;
3554 Lisp_Object charset_list;
3555
3556 attrs = CODING_ID_ATTRS (coding->id);
3557 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
3558 if (EQ (charset_list, Qiso_2022))
3559 charset_list = Viso_2022_charset_list;
3560
3561 for (reg = 0; reg < 4; reg++)
3562 r[reg] = -1;
3563
3564 while (found < 4)
3565 {
3566 int id;
3567
3568 c = *charbuf++;
3569 if (c == '\n')
3570 break;
3571 charset = char_charset (c, charset_list, NULL);
3572 id = CHARSET_ID (charset);
3573 reg = CODING_ISO_REQUEST (coding, id);
3574 if (reg >= 0 && r[reg] < 0)
3575 {
3576 found++;
3577 r[reg] = id;
3578 }
3579 }
3580
3581 if (found)
3582 {
3583 for (reg = 0; reg < 4; reg++)
3584 if (r[reg] >= 0
3585 && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
3586 ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
3587 }
3588
3589 return dst;
3590 }
3591
3592 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
3593
3594 static int
3595 encode_coding_iso_2022 (coding)
3596 struct coding_system *coding;
3597 {
3598 int multibytep = coding->dst_multibyte;
3599 int *charbuf = coding->charbuf;
3600 int *charbuf_end = charbuf + coding->charbuf_used;
3601 unsigned char *dst = coding->destination + coding->produced;
3602 unsigned char *dst_end = coding->destination + coding->dst_bytes;
3603 int safe_room = 16;
3604 int bol_designation
3605 = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
3606 && CODING_ISO_BOL (coding));
3607 int produced_chars = 0;
3608 Lisp_Object attrs, eol_type, charset_list;
3609 int ascii_compatible;
3610 int c;
3611 int preferred_charset_id = -1;
3612
3613 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
3614 setup_iso_safe_charsets (attrs);
3615 /* Charset list may have been changed. */
3616 charset_list = CODING_ATTR_CHARSET_LIST (attrs); \
3617 coding->safe_charsets
3618 = (char *) XSTRING (CODING_ATTR_SAFE_CHARSETS(attrs))->data;
3619
3620 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
3621
3622 while (charbuf < charbuf_end)
3623 {
3624 ASSURE_DESTINATION (safe_room);
3625
3626 if (bol_designation)
3627 {
3628 unsigned char *dst_prev = dst;
3629
3630 /* We have to produce designation sequences if any now. */
3631 dst = encode_designation_at_bol (coding, charbuf, charbuf_end, dst);
3632 bol_designation = 0;
3633 /* We are sure that designation sequences are all ASCII bytes. */
3634 produced_chars += dst - dst_prev;
3635 }
3636
3637 c = *charbuf++;
3638
3639 if (c < 0)
3640 {
3641 /* Handle an annotation. */
3642 switch (*charbuf)
3643 {
3644 case CODING_ANNOTATE_COMPOSITION_MASK:
3645 /* Not yet implemented. */
3646 break;
3647 case CODING_ANNOTATE_CHARSET_MASK:
3648 preferred_charset_id = charbuf[3];
3649 if (preferred_charset_id >= 0
3650 && NILP (Fmemq (make_number (preferred_charset_id),
3651 charset_list)))
3652 preferred_charset_id = -1;
3653 break;
3654 default:
3655 abort ();
3656 }
3657 charbuf += -c - 1;
3658 continue;
3659 }
3660
3661 /* Now encode the character C. */
3662 if (c < 0x20 || c == 0x7F)
3663 {
3664 if (c == '\n'
3665 || (c == '\r' && EQ (eol_type, Qmac)))
3666 {
3667 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
3668 ENCODE_RESET_PLANE_AND_REGISTER ();
3669 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
3670 {
3671 int i;
3672
3673 for (i = 0; i < 4; i++)
3674 CODING_ISO_DESIGNATION (coding, i)
3675 = CODING_ISO_INITIAL (coding, i);
3676 }
3677 bol_designation
3678 = CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL;
3679 }
3680 else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
3681 ENCODE_RESET_PLANE_AND_REGISTER ();
3682 EMIT_ONE_ASCII_BYTE (c);
3683 }
3684 else if (ASCII_CHAR_P (c))
3685 {
3686 if (ascii_compatible)
3687 EMIT_ONE_ASCII_BYTE (c);
3688 else
3689 {
3690 struct charset *charset = CHARSET_FROM_ID (charset_ascii);
3691 ENCODE_ISO_CHARACTER (charset, c);
3692 }
3693 }
3694 else if (CHAR_BYTE8_P (c))
3695 {
3696 c = CHAR_TO_BYTE8 (c);
3697 EMIT_ONE_BYTE (c);
3698 }
3699 else
3700 {
3701 struct charset *charset;
3702
3703 if (preferred_charset_id >= 0)
3704 {
3705 charset = CHARSET_FROM_ID (preferred_charset_id);
3706 if (! CHAR_CHARSET_P (c, charset))
3707 charset = char_charset (c, charset_list, NULL);
3708 }
3709 else
3710 charset = char_charset (c, charset_list, NULL);
3711 if (!charset)
3712 {
3713 if (coding->mode & CODING_MODE_SAFE_ENCODING)
3714 {
3715 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
3716 charset = CHARSET_FROM_ID (charset_ascii);
3717 }
3718 else
3719 {
3720 c = coding->default_char;
3721 charset = char_charset (c, charset_list, NULL);
3722 }
3723 }
3724 ENCODE_ISO_CHARACTER (charset, c);
3725 }
3726 }
3727
3728 if (coding->mode & CODING_MODE_LAST_BLOCK
3729 && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
3730 {
3731 ASSURE_DESTINATION (safe_room);
3732 ENCODE_RESET_PLANE_AND_REGISTER ();
3733 }
3734 coding->result = CODING_RESULT_SUCCESS;
3735 CODING_ISO_BOL (coding) = bol_designation;
3736 coding->produced_char += produced_chars;
3737 coding->produced = dst - coding->destination;
3738 return 0;
3739 }
3740
3741 \f
3742 /*** 8,9. SJIS and BIG5 handlers ***/
3743
3744 /* Although SJIS and BIG5 are not ISO's coding system, they are used
3745 quite widely. So, for the moment, Emacs supports them in the bare
3746 C code. But, in the future, they may be supported only by CCL. */
3747
3748 /* SJIS is a coding system encoding three character sets: ASCII, right
3749 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
3750 as is. A character of charset katakana-jisx0201 is encoded by
3751 "position-code + 0x80". A character of charset japanese-jisx0208
3752 is encoded in 2-byte but two position-codes are divided and shifted
3753 so that it fit in the range below.
3754
3755 --- CODE RANGE of SJIS ---
3756 (character set) (range)
3757 ASCII 0x00 .. 0x7F
3758 KATAKANA-JISX0201 0xA0 .. 0xDF
3759 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF
3760 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
3761 -------------------------------
3762
3763 */
3764
3765 /* BIG5 is a coding system encoding two character sets: ASCII and
3766 Big5. An ASCII character is encoded as is. Big5 is a two-byte
3767 character set and is encoded in two-byte.
3768
3769 --- CODE RANGE of BIG5 ---
3770 (character set) (range)
3771 ASCII 0x00 .. 0x7F
3772 Big5 (1st byte) 0xA1 .. 0xFE
3773 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
3774 --------------------------
3775
3776 */
3777
3778 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3779 Check if a text is encoded in SJIS. If it is, return
3780 CATEGORY_MASK_SJIS, else return 0. */
3781
3782 static int
3783 detect_coding_sjis (coding, detect_info)
3784 struct coding_system *coding;
3785 struct coding_detection_info *detect_info;
3786 {
3787 unsigned char *src = coding->source, *src_base = src;
3788 unsigned char *src_end = coding->source + coding->src_bytes;
3789 int multibytep = coding->src_multibyte;
3790 int consumed_chars = 0;
3791 int found = 0;
3792 int c;
3793 int incomplete;
3794
3795 detect_info->checked |= CATEGORY_MASK_SJIS;
3796 /* A coding system of this category is always ASCII compatible. */
3797 src += coding->head_ascii;
3798
3799 while (1)
3800 {
3801 incomplete = 0;
3802 ONE_MORE_BYTE (c);
3803 incomplete = 1;
3804 if (c < 0x80)
3805 continue;
3806 if ((c >= 0x81 && c <= 0x9F) || (c >= 0xE0 && c <= 0xEF))
3807 {
3808 ONE_MORE_BYTE (c);
3809 if (c < 0x40 || c == 0x7F || c > 0xFC)
3810 break;
3811 found = CATEGORY_MASK_SJIS;
3812 }
3813 else if (c >= 0xA0 && c < 0xE0)
3814 found = CATEGORY_MASK_SJIS;
3815 else
3816 break;
3817 }
3818 detect_info->rejected |= CATEGORY_MASK_SJIS;
3819 return 0;
3820
3821 no_more_source:
3822 if (incomplete && coding->mode & CODING_MODE_LAST_BLOCK)
3823 {
3824 detect_info->rejected |= CATEGORY_MASK_SJIS;
3825 return 0;
3826 }
3827 detect_info->found |= found;
3828 return 1;
3829 }
3830
3831 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3832 Check if a text is encoded in BIG5. If it is, return
3833 CATEGORY_MASK_BIG5, else return 0. */
3834
3835 static int
3836 detect_coding_big5 (coding, detect_info)
3837 struct coding_system *coding;
3838 struct coding_detection_info *detect_info;
3839 {
3840 unsigned char *src = coding->source, *src_base = src;
3841 unsigned char *src_end = coding->source + coding->src_bytes;
3842 int multibytep = coding->src_multibyte;
3843 int consumed_chars = 0;
3844 int found = 0;
3845 int c;
3846 int incomplete;
3847
3848 detect_info->checked |= CATEGORY_MASK_BIG5;
3849 /* A coding system of this category is always ASCII compatible. */
3850 src += coding->head_ascii;
3851
3852 while (1)
3853 {
3854 incomplete = 0;
3855 ONE_MORE_BYTE (c);
3856 incomplete = 1;
3857 if (c < 0x80)
3858 continue;
3859 if (c >= 0xA1)
3860 {
3861 ONE_MORE_BYTE (c);
3862 if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
3863 return 0;
3864 found = CATEGORY_MASK_BIG5;
3865 }
3866 else
3867 break;
3868 }
3869 detect_info->rejected |= CATEGORY_MASK_BIG5;
3870 return 0;
3871
3872 no_more_source:
3873 if (incomplete && coding->mode & CODING_MODE_LAST_BLOCK)
3874 {
3875 detect_info->rejected |= CATEGORY_MASK_BIG5;
3876 return 0;
3877 }
3878 detect_info->found |= found;
3879 return 1;
3880 }
3881
3882 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
3883 If SJIS_P is 1, decode SJIS text, else decode BIG5 test. */
3884
3885 static void
3886 decode_coding_sjis (coding)
3887 struct coding_system *coding;
3888 {
3889 unsigned char *src = coding->source + coding->consumed;
3890 unsigned char *src_end = coding->source + coding->src_bytes;
3891 unsigned char *src_base;
3892 int *charbuf = coding->charbuf;
3893 int *charbuf_end = charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
3894 int consumed_chars = 0, consumed_chars_base;
3895 int multibytep = coding->src_multibyte;
3896 struct charset *charset_roman, *charset_kanji, *charset_kana;
3897 Lisp_Object attrs, eol_type, charset_list, val;
3898 int char_offset = coding->produced_char;
3899 int last_offset = char_offset;
3900 int last_id = charset_ascii;
3901
3902 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
3903
3904 val = charset_list;
3905 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
3906 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
3907 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
3908
3909 while (1)
3910 {
3911 int c, c1;
3912
3913 src_base = src;
3914 consumed_chars_base = consumed_chars;
3915
3916 if (charbuf >= charbuf_end)
3917 break;
3918
3919 ONE_MORE_BYTE (c);
3920
3921 if (c == '\r')
3922 {
3923 if (EQ (eol_type, Qdos))
3924 {
3925 if (src == src_end)
3926 {
3927 coding->result = CODING_RESULT_INSUFFICIENT_SRC;
3928 goto no_more_source;
3929 }
3930 if (*src == '\n')
3931 ONE_MORE_BYTE (c);
3932 }
3933 else if (EQ (eol_type, Qmac))
3934 c = '\n';
3935 }
3936 else
3937 {
3938 struct charset *charset;
3939
3940 if (c < 0x80)
3941 charset = charset_roman;
3942 else
3943 {
3944 if (c >= 0xF0)
3945 goto invalid_code;
3946 if (c < 0xA0 || c >= 0xE0)
3947 {
3948 /* SJIS -> JISX0208 */
3949 ONE_MORE_BYTE (c1);
3950 if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
3951 goto invalid_code;
3952 c = (c << 8) | c1;
3953 SJIS_TO_JIS (c);
3954 charset = charset_kanji;
3955 }
3956 else
3957 {
3958 /* SJIS -> JISX0201-Kana */
3959 c &= 0x7F;
3960 charset = charset_kana;
3961 }
3962 }
3963 if (charset->id != charset_ascii
3964 && last_id != charset->id)
3965 {
3966 if (last_id != charset_ascii)
3967 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
3968 last_id = charset->id;
3969 last_offset = char_offset;
3970 }
3971 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
3972 }
3973 *charbuf++ = c;
3974 char_offset++;
3975 continue;
3976
3977 invalid_code:
3978 src = src_base;
3979 consumed_chars = consumed_chars_base;
3980 ONE_MORE_BYTE (c);
3981 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
3982 char_offset++;
3983 coding->errors++;
3984 }
3985
3986 no_more_source:
3987 if (last_id != charset_ascii)
3988 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
3989 coding->consumed_char += consumed_chars_base;
3990 coding->consumed = src_base - coding->source;
3991 coding->charbuf_used = charbuf - coding->charbuf;
3992 }
3993
3994 static void
3995 decode_coding_big5 (coding)
3996 struct coding_system *coding;
3997 {
3998 unsigned char *src = coding->source + coding->consumed;
3999 unsigned char *src_end = coding->source + coding->src_bytes;
4000 unsigned char *src_base;
4001 int *charbuf = coding->charbuf;
4002 int *charbuf_end = charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
4003 int consumed_chars = 0, consumed_chars_base;
4004 int multibytep = coding->src_multibyte;
4005 struct charset *charset_roman, *charset_big5;
4006 Lisp_Object attrs, eol_type, charset_list, val;
4007 int char_offset = coding->produced_char;
4008 int last_offset = char_offset;
4009 int last_id = charset_ascii;
4010
4011 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
4012 val = charset_list;
4013 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4014 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4015
4016 while (1)
4017 {
4018 int c, c1;
4019
4020 src_base = src;
4021 consumed_chars_base = consumed_chars;
4022
4023 if (charbuf >= charbuf_end)
4024 break;
4025
4026 ONE_MORE_BYTE (c);
4027
4028 if (c == '\r')
4029 {
4030 if (EQ (eol_type, Qdos))
4031 {
4032 if (src == src_end)
4033 {
4034 coding->result = CODING_RESULT_INSUFFICIENT_SRC;
4035 goto no_more_source;
4036 }
4037 if (*src == '\n')
4038 ONE_MORE_BYTE (c);
4039 }
4040 else if (EQ (eol_type, Qmac))
4041 c = '\n';
4042 }
4043 else
4044 {
4045 struct charset *charset;
4046 if (c < 0x80)
4047 charset = charset_roman;
4048 else
4049 {
4050 /* BIG5 -> Big5 */
4051 if (c < 0xA1 || c > 0xFE)
4052 goto invalid_code;
4053 ONE_MORE_BYTE (c1);
4054 if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4055 goto invalid_code;
4056 c = c << 8 | c1;
4057 charset = charset_big5;
4058 }
4059 if (charset->id != charset_ascii
4060 && last_id != charset->id)
4061 {
4062 if (last_id != charset_ascii)
4063 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4064 last_id = charset->id;
4065 last_offset = char_offset;
4066 }
4067 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4068 }
4069
4070 *charbuf++ = c;
4071 char_offset++;
4072 continue;
4073
4074 invalid_code:
4075 src = src_base;
4076 consumed_chars = consumed_chars_base;
4077 ONE_MORE_BYTE (c);
4078 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
4079 char_offset++;
4080 coding->errors++;
4081 }
4082
4083 no_more_source:
4084 if (last_id != charset_ascii)
4085 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4086 coding->consumed_char += consumed_chars_base;
4087 coding->consumed = src_base - coding->source;
4088 coding->charbuf_used = charbuf - coding->charbuf;
4089 }
4090
4091 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4092 This function can encode charsets `ascii', `katakana-jisx0201',
4093 `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'. We
4094 are sure that all these charsets are registered as official charset
4095 (i.e. do not have extended leading-codes). Characters of other
4096 charsets are produced without any encoding. If SJIS_P is 1, encode
4097 SJIS text, else encode BIG5 text. */
4098
4099 static int
4100 encode_coding_sjis (coding)
4101 struct coding_system *coding;
4102 {
4103 int multibytep = coding->dst_multibyte;
4104 int *charbuf = coding->charbuf;
4105 int *charbuf_end = charbuf + coding->charbuf_used;
4106 unsigned char *dst = coding->destination + coding->produced;
4107 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4108 int safe_room = 4;
4109 int produced_chars = 0;
4110 Lisp_Object attrs, eol_type, charset_list, val;
4111 int ascii_compatible;
4112 struct charset *charset_roman, *charset_kanji, *charset_kana;
4113 int c;
4114
4115 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
4116 val = charset_list;
4117 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4118 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4119 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
4120
4121 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4122
4123 while (charbuf < charbuf_end)
4124 {
4125 ASSURE_DESTINATION (safe_room);
4126 c = *charbuf++;
4127 /* Now encode the character C. */
4128 if (ASCII_CHAR_P (c) && ascii_compatible)
4129 EMIT_ONE_ASCII_BYTE (c);
4130 else if (CHAR_BYTE8_P (c))
4131 {
4132 c = CHAR_TO_BYTE8 (c);
4133 EMIT_ONE_BYTE (c);
4134 }
4135 else
4136 {
4137 unsigned code;
4138 struct charset *charset = char_charset (c, charset_list, &code);
4139
4140 if (!charset)
4141 {
4142 if (coding->mode & CODING_MODE_SAFE_ENCODING)
4143 {
4144 code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4145 charset = CHARSET_FROM_ID (charset_ascii);
4146 }
4147 else
4148 {
4149 c = coding->default_char;
4150 charset = char_charset (c, charset_list, &code);
4151 }
4152 }
4153 if (code == CHARSET_INVALID_CODE (charset))
4154 abort ();
4155 if (charset == charset_kanji)
4156 {
4157 int c1, c2;
4158 JIS_TO_SJIS (code);
4159 c1 = code >> 8, c2 = code & 0xFF;
4160 EMIT_TWO_BYTES (c1, c2);
4161 }
4162 else if (charset == charset_kana)
4163 EMIT_ONE_BYTE (code | 0x80);
4164 else
4165 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4166 }
4167 }
4168 coding->result = CODING_RESULT_SUCCESS;
4169 coding->produced_char += produced_chars;
4170 coding->produced = dst - coding->destination;
4171 return 0;
4172 }
4173
4174 static int
4175 encode_coding_big5 (coding)
4176 struct coding_system *coding;
4177 {
4178 int multibytep = coding->dst_multibyte;
4179 int *charbuf = coding->charbuf;
4180 int *charbuf_end = charbuf + coding->charbuf_used;
4181 unsigned char *dst = coding->destination + coding->produced;
4182 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4183 int safe_room = 4;
4184 int produced_chars = 0;
4185 Lisp_Object attrs, eol_type, charset_list, val;
4186 int ascii_compatible;
4187 struct charset *charset_roman, *charset_big5;
4188 int c;
4189
4190 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
4191 val = charset_list;
4192 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4193 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4194 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4195
4196 while (charbuf < charbuf_end)
4197 {
4198 ASSURE_DESTINATION (safe_room);
4199 c = *charbuf++;
4200 /* Now encode the character C. */
4201 if (ASCII_CHAR_P (c) && ascii_compatible)
4202 EMIT_ONE_ASCII_BYTE (c);
4203 else if (CHAR_BYTE8_P (c))
4204 {
4205 c = CHAR_TO_BYTE8 (c);
4206 EMIT_ONE_BYTE (c);
4207 }
4208 else
4209 {
4210 unsigned code;
4211 struct charset *charset = char_charset (c, charset_list, &code);
4212
4213 if (! charset)
4214 {
4215 if (coding->mode & CODING_MODE_SAFE_ENCODING)
4216 {
4217 code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4218 charset = CHARSET_FROM_ID (charset_ascii);
4219 }
4220 else
4221 {
4222 c = coding->default_char;
4223 charset = char_charset (c, charset_list, &code);
4224 }
4225 }
4226 if (code == CHARSET_INVALID_CODE (charset))
4227 abort ();
4228 if (charset == charset_big5)
4229 {
4230 int c1, c2;
4231
4232 c1 = code >> 8, c2 = code & 0xFF;
4233 EMIT_TWO_BYTES (c1, c2);
4234 }
4235 else
4236 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4237 }
4238 }
4239 coding->result = CODING_RESULT_SUCCESS;
4240 coding->produced_char += produced_chars;
4241 coding->produced = dst - coding->destination;
4242 return 0;
4243 }
4244
4245 \f
4246 /*** 10. CCL handlers ***/
4247
4248 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4249 Check if a text is encoded in a coding system of which
4250 encoder/decoder are written in CCL program. If it is, return
4251 CATEGORY_MASK_CCL, else return 0. */
4252
4253 static int
4254 detect_coding_ccl (coding, detect_info)
4255 struct coding_system *coding;
4256 struct coding_detection_info *detect_info;
4257 {
4258 unsigned char *src = coding->source, *src_base = src;
4259 unsigned char *src_end = coding->source + coding->src_bytes;
4260 int multibytep = coding->src_multibyte;
4261 int consumed_chars = 0;
4262 int found = 0;
4263 unsigned char *valids = CODING_CCL_VALIDS (coding);
4264 int head_ascii = coding->head_ascii;
4265 Lisp_Object attrs;
4266
4267 detect_info->checked |= CATEGORY_MASK_CCL;
4268
4269 coding = &coding_categories[coding_category_ccl];
4270 attrs = CODING_ID_ATTRS (coding->id);
4271 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
4272 src += head_ascii;
4273
4274 while (1)
4275 {
4276 int c;
4277 ONE_MORE_BYTE (c);
4278 if (! valids[c])
4279 break;
4280 if ((valids[c] > 1))
4281 found = CATEGORY_MASK_CCL;
4282 }
4283 detect_info->rejected |= CATEGORY_MASK_CCL;
4284 return 0;
4285
4286 no_more_source:
4287 detect_info->found |= found;
4288 return 1;
4289 }
4290
4291 static void
4292 decode_coding_ccl (coding)
4293 struct coding_system *coding;
4294 {
4295 const unsigned char *src = coding->source + coding->consumed;
4296 unsigned char *src_end = coding->source + coding->src_bytes;
4297 int *charbuf = coding->charbuf;
4298 int *charbuf_end = charbuf + coding->charbuf_size;
4299 int consumed_chars = 0;
4300 int multibytep = coding->src_multibyte;
4301 struct ccl_program ccl;
4302 int source_charbuf[1024];
4303 int source_byteidx[1024];
4304 Lisp_Object attrs, eol_type, charset_list, valids;
4305
4306 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
4307 setup_ccl_program (&ccl, CODING_CCL_DECODER (coding));
4308
4309 while (src < src_end)
4310 {
4311 const unsigned char *p = src;
4312 int *source, *source_end;
4313 int i = 0;
4314
4315 if (multibytep)
4316 while (i < 1024 && p < src_end)
4317 {
4318 source_byteidx[i] = p - src;
4319 source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
4320 }
4321 else
4322 while (i < 1024 && p < src_end)
4323 source_charbuf[i++] = *p++;
4324
4325 if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
4326 ccl.last_block = 1;
4327
4328 source = source_charbuf;
4329 source_end = source + i;
4330 while (source < source_end)
4331 {
4332 ccl_driver (&ccl, source, charbuf,
4333 source_end - source, charbuf_end - charbuf,
4334 charset_list);
4335 source += ccl.consumed;
4336 charbuf += ccl.produced;
4337 if (ccl.status != CCL_STAT_SUSPEND_BY_DST)
4338 break;
4339 }
4340 if (source < source_end)
4341 src += source_byteidx[source - source_charbuf];
4342 else
4343 src = p;
4344 consumed_chars += source - source_charbuf;
4345
4346 if (ccl.status != CCL_STAT_SUSPEND_BY_SRC
4347 && ccl.status != CODING_RESULT_INSUFFICIENT_SRC)
4348 break;
4349 }
4350
4351 switch (ccl.status)
4352 {
4353 case CCL_STAT_SUSPEND_BY_SRC:
4354 coding->result = CODING_RESULT_INSUFFICIENT_SRC;
4355 break;
4356 case CCL_STAT_SUSPEND_BY_DST:
4357 break;
4358 case CCL_STAT_QUIT:
4359 case CCL_STAT_INVALID_CMD:
4360 coding->result = CODING_RESULT_INTERRUPT;
4361 break;
4362 default:
4363 coding->result = CODING_RESULT_SUCCESS;
4364 break;
4365 }
4366 coding->consumed_char += consumed_chars;
4367 coding->consumed = src - coding->source;
4368 coding->charbuf_used = charbuf - coding->charbuf;
4369 }
4370
4371 static int
4372 encode_coding_ccl (coding)
4373 struct coding_system *coding;
4374 {
4375 struct ccl_program ccl;
4376 int multibytep = coding->dst_multibyte;
4377 int *charbuf = coding->charbuf;
4378 int *charbuf_end = charbuf + coding->charbuf_used;
4379 unsigned char *dst = coding->destination + coding->produced;
4380 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4381 unsigned char *adjusted_dst_end = dst_end - 1;
4382 int destination_charbuf[1024];
4383 int i, produced_chars = 0;
4384 Lisp_Object attrs, eol_type, charset_list;
4385
4386 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
4387 setup_ccl_program (&ccl, CODING_CCL_ENCODER (coding));
4388
4389 ccl.last_block = coding->mode & CODING_MODE_LAST_BLOCK;
4390 ccl.dst_multibyte = coding->dst_multibyte;
4391
4392 while (charbuf < charbuf_end && dst < adjusted_dst_end)
4393 {
4394 int dst_bytes = dst_end - dst;
4395 if (dst_bytes > 1024)
4396 dst_bytes = 1024;
4397
4398 ccl_driver (&ccl, charbuf, destination_charbuf,
4399 charbuf_end - charbuf, dst_bytes, charset_list);
4400 charbuf += ccl.consumed;
4401 if (multibytep)
4402 for (i = 0; i < ccl.produced; i++)
4403 EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
4404 else
4405 {
4406 for (i = 0; i < ccl.produced; i++)
4407 *dst++ = destination_charbuf[i] & 0xFF;
4408 produced_chars += ccl.produced;
4409 }
4410 }
4411
4412 switch (ccl.status)
4413 {
4414 case CCL_STAT_SUSPEND_BY_SRC:
4415 coding->result = CODING_RESULT_INSUFFICIENT_SRC;
4416 break;
4417 case CCL_STAT_SUSPEND_BY_DST:
4418 coding->result = CODING_RESULT_INSUFFICIENT_DST;
4419 break;
4420 case CCL_STAT_QUIT:
4421 case CCL_STAT_INVALID_CMD:
4422 coding->result = CODING_RESULT_INTERRUPT;
4423 break;
4424 default:
4425 coding->result = CODING_RESULT_SUCCESS;
4426 break;
4427 }
4428
4429 coding->produced_char += produced_chars;
4430 coding->produced = dst - coding->destination;
4431 return 0;
4432 }
4433
4434
4435 \f
4436 /*** 10, 11. no-conversion handlers ***/
4437
4438 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
4439
4440 static void
4441 decode_coding_raw_text (coding)
4442 struct coding_system *coding;
4443 {
4444 coding->chars_at_source = 1;
4445 coding->consumed_char = 0;
4446 coding->consumed = 0;
4447 coding->result = CODING_RESULT_SUCCESS;
4448 }
4449
4450 static int
4451 encode_coding_raw_text (coding)
4452 struct coding_system *coding;
4453 {
4454 int multibytep = coding->dst_multibyte;
4455 int *charbuf = coding->charbuf;
4456 int *charbuf_end = coding->charbuf + coding->charbuf_used;
4457 unsigned char *dst = coding->destination + coding->produced;
4458 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4459 int produced_chars = 0;
4460 int c;
4461
4462 if (multibytep)
4463 {
4464 int safe_room = MAX_MULTIBYTE_LENGTH * 2;
4465
4466 if (coding->src_multibyte)
4467 while (charbuf < charbuf_end)
4468 {
4469 ASSURE_DESTINATION (safe_room);
4470 c = *charbuf++;
4471 if (ASCII_CHAR_P (c))
4472 EMIT_ONE_ASCII_BYTE (c);
4473 else if (CHAR_BYTE8_P (c))
4474 {
4475 c = CHAR_TO_BYTE8 (c);
4476 EMIT_ONE_BYTE (c);
4477 }
4478 else
4479 {
4480 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
4481
4482 CHAR_STRING_ADVANCE (c, p1);
4483 while (p0 < p1)
4484 {
4485 EMIT_ONE_BYTE (*p0);
4486 p0++;
4487 }
4488 }
4489 }
4490 else
4491 while (charbuf < charbuf_end)
4492 {
4493 ASSURE_DESTINATION (safe_room);
4494 c = *charbuf++;
4495 EMIT_ONE_BYTE (c);
4496 }
4497 }
4498 else
4499 {
4500 if (coding->src_multibyte)
4501 {
4502 int safe_room = MAX_MULTIBYTE_LENGTH;
4503
4504 while (charbuf < charbuf_end)
4505 {
4506 ASSURE_DESTINATION (safe_room);
4507 c = *charbuf++;
4508 if (ASCII_CHAR_P (c))
4509 *dst++ = c;
4510 else if (CHAR_BYTE8_P (c))
4511 *dst++ = CHAR_TO_BYTE8 (c);
4512 else
4513 CHAR_STRING_ADVANCE (c, dst);
4514 produced_chars++;
4515 }
4516 }
4517 else
4518 {
4519 ASSURE_DESTINATION (charbuf_end - charbuf);
4520 while (charbuf < charbuf_end && dst < dst_end)
4521 *dst++ = *charbuf++;
4522 produced_chars = dst - (coding->destination + coding->dst_bytes);
4523 }
4524 }
4525 coding->result = CODING_RESULT_SUCCESS;
4526 coding->produced_char += produced_chars;
4527 coding->produced = dst - coding->destination;
4528 return 0;
4529 }
4530
4531 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4532 Check if a text is encoded in a charset-based coding system. If it
4533 is, return 1, else return 0. */
4534
4535 static int
4536 detect_coding_charset (coding, detect_info)
4537 struct coding_system *coding;
4538 struct coding_detection_info *detect_info;
4539 {
4540 unsigned char *src = coding->source, *src_base = src;
4541 unsigned char *src_end = coding->source + coding->src_bytes;
4542 int multibytep = coding->src_multibyte;
4543 int consumed_chars = 0;
4544 Lisp_Object attrs, valids;
4545 int found = 0;
4546
4547 detect_info->checked |= CATEGORY_MASK_CHARSET;
4548
4549 coding = &coding_categories[coding_category_charset];
4550 attrs = CODING_ID_ATTRS (coding->id);
4551 valids = AREF (attrs, coding_attr_charset_valids);
4552
4553 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
4554 src += coding->head_ascii;
4555
4556 while (1)
4557 {
4558 int c;
4559
4560 ONE_MORE_BYTE (c);
4561 if (NILP (AREF (valids, c)))
4562 break;
4563 if (c >= 0x80)
4564 found = CATEGORY_MASK_CHARSET;
4565 }
4566 detect_info->rejected |= CATEGORY_MASK_CHARSET;
4567 return 0;
4568
4569 no_more_source:
4570 detect_info->found |= found;
4571 return 1;
4572 }
4573
4574 static void
4575 decode_coding_charset (coding)
4576 struct coding_system *coding;
4577 {
4578 unsigned char *src = coding->source + coding->consumed;
4579 unsigned char *src_end = coding->source + coding->src_bytes;
4580 unsigned char *src_base;
4581 int *charbuf = coding->charbuf;
4582 int *charbuf_end = charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
4583 int consumed_chars = 0, consumed_chars_base;
4584 int multibytep = coding->src_multibyte;
4585 Lisp_Object attrs, eol_type, charset_list, valids;
4586 int char_offset = coding->produced_char;
4587 int last_offset = char_offset;
4588 int last_id = charset_ascii;
4589
4590 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
4591 valids = AREF (attrs, coding_attr_charset_valids);
4592
4593 while (1)
4594 {
4595 int c;
4596
4597 src_base = src;
4598 consumed_chars_base = consumed_chars;
4599
4600 if (charbuf >= charbuf_end)
4601 break;
4602
4603 ONE_MORE_BYTE (c);
4604 if (c == '\r')
4605 {
4606 /* Here we assume that no charset maps '\r' to something
4607 else. */
4608 if (EQ (eol_type, Qdos))
4609 {
4610 if (src == src_end)
4611 {
4612 coding->result = CODING_RESULT_INSUFFICIENT_SRC;
4613 goto no_more_source;
4614 }
4615 if (*src == '\n')
4616 ONE_MORE_BYTE (c);
4617 }
4618 else if (EQ (eol_type, Qmac))
4619 c = '\n';
4620 }
4621 else
4622 {
4623 Lisp_Object val;
4624 struct charset *charset;
4625 int dim;
4626 int len = 1;
4627 unsigned code = c;
4628
4629 val = AREF (valids, c);
4630 if (NILP (val))
4631 goto invalid_code;
4632 if (INTEGERP (val))
4633 {
4634 charset = CHARSET_FROM_ID (XFASTINT (val));
4635 dim = CHARSET_DIMENSION (charset);
4636 while (len < dim)
4637 {
4638 ONE_MORE_BYTE (c);
4639 code = (code << 8) | c;
4640 len++;
4641 }
4642 CODING_DECODE_CHAR (coding, src, src_base, src_end,
4643 charset, code, c);
4644 }
4645 else
4646 {
4647 /* VAL is a list of charset IDs. It is assured that the
4648 list is sorted by charset dimensions (smaller one
4649 comes first). */
4650 while (CONSP (val))
4651 {
4652 charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
4653 dim = CHARSET_DIMENSION (charset);
4654 while (len < dim)
4655 {
4656 ONE_MORE_BYTE (c);
4657 code = (code << 8) | c;
4658 len++;
4659 }
4660 CODING_DECODE_CHAR (coding, src, src_base,
4661 src_end, charset, code, c);
4662 if (c >= 0)
4663 break;
4664 val = XCDR (val);
4665 }
4666 }
4667 if (c < 0)
4668 goto invalid_code;
4669 if (charset->id != charset_ascii
4670 && last_id != charset->id)
4671 {
4672 if (last_id != charset_ascii)
4673 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4674 last_id = charset->id;
4675 last_offset = char_offset;
4676 }
4677 }
4678 *charbuf++ = c;
4679 char_offset++;
4680 continue;
4681
4682 invalid_code:
4683 src = src_base;
4684 consumed_chars = consumed_chars_base;
4685 ONE_MORE_BYTE (c);
4686 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
4687 char_offset++;
4688 coding->errors++;
4689 }
4690
4691 no_more_source:
4692 if (last_id != charset_ascii)
4693 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4694 coding->consumed_char += consumed_chars_base;
4695 coding->consumed = src_base - coding->source;
4696 coding->charbuf_used = charbuf - coding->charbuf;
4697 }
4698
4699 static int
4700 encode_coding_charset (coding)
4701 struct coding_system *coding;
4702 {
4703 int multibytep = coding->dst_multibyte;
4704 int *charbuf = coding->charbuf;
4705 int *charbuf_end = charbuf + coding->charbuf_used;
4706 unsigned char *dst = coding->destination + coding->produced;
4707 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4708 int safe_room = MAX_MULTIBYTE_LENGTH;
4709 int produced_chars = 0;
4710 Lisp_Object attrs, eol_type, charset_list;
4711 int ascii_compatible;
4712 int c;
4713
4714 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
4715 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4716
4717 while (charbuf < charbuf_end)
4718 {
4719 struct charset *charset;
4720 unsigned code;
4721
4722 ASSURE_DESTINATION (safe_room);
4723 c = *charbuf++;
4724 if (ascii_compatible && ASCII_CHAR_P (c))
4725 EMIT_ONE_ASCII_BYTE (c);
4726 else if (CHAR_BYTE8_P (c))
4727 {
4728 c = CHAR_TO_BYTE8 (c);
4729 EMIT_ONE_BYTE (c);
4730 }
4731 else
4732 {
4733 charset = char_charset (c, charset_list, &code);
4734 if (charset)
4735 {
4736 if (CHARSET_DIMENSION (charset) == 1)
4737 EMIT_ONE_BYTE (code);
4738 else if (CHARSET_DIMENSION (charset) == 2)
4739 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
4740 else if (CHARSET_DIMENSION (charset) == 3)
4741 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
4742 else
4743 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
4744 (code >> 8) & 0xFF, code & 0xFF);
4745 }
4746 else
4747 {
4748 if (coding->mode & CODING_MODE_SAFE_ENCODING)
4749 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4750 else
4751 c = coding->default_char;
4752 EMIT_ONE_BYTE (c);
4753 }
4754 }
4755 }
4756
4757 coding->result = CODING_RESULT_SUCCESS;
4758 coding->produced_char += produced_chars;
4759 coding->produced = dst - coding->destination;
4760 return 0;
4761 }
4762
4763 \f
4764 /*** 7. C library functions ***/
4765
4766 /* Setup coding context CODING from information about CODING_SYSTEM.
4767 If CODING_SYSTEM is nil, `no-conversion' is assumed. If
4768 CODING_SYSTEM is invalid, signal an error. */
4769
4770 void
4771 setup_coding_system (coding_system, coding)
4772 Lisp_Object coding_system;
4773 struct coding_system *coding;
4774 {
4775 Lisp_Object attrs;
4776 Lisp_Object eol_type;
4777 Lisp_Object coding_type;
4778 Lisp_Object val;
4779
4780 if (NILP (coding_system))
4781 coding_system = Qno_conversion;
4782
4783 CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
4784
4785 attrs = CODING_ID_ATTRS (coding->id);
4786 eol_type = CODING_ID_EOL_TYPE (coding->id);
4787
4788 coding->mode = 0;
4789 coding->head_ascii = -1;
4790 coding->common_flags
4791 = (VECTORP (eol_type) ? CODING_REQUIRE_DETECTION_MASK : 0);
4792
4793 val = CODING_ATTR_SAFE_CHARSETS (attrs);
4794 coding->max_charset_id = XSTRING (val)->size - 1;
4795 coding->safe_charsets = (char *) XSTRING (val)->data;
4796 coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
4797
4798 coding_type = CODING_ATTR_TYPE (attrs);
4799 if (EQ (coding_type, Qundecided))
4800 {
4801 coding->detector = NULL;
4802 coding->decoder = decode_coding_raw_text;
4803 coding->encoder = encode_coding_raw_text;
4804 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
4805 }
4806 else if (EQ (coding_type, Qiso_2022))
4807 {
4808 int i;
4809 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
4810 enum coding_category category = XINT (CODING_ATTR_CATEGORY (attrs));
4811
4812 /* Invoke graphic register 0 to plane 0. */
4813 CODING_ISO_INVOCATION (coding, 0) = 0;
4814 /* Invoke graphic register 1 to plane 1 if we can use 8-bit. */
4815 CODING_ISO_INVOCATION (coding, 1)
4816 = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
4817 /* Setup the initial status of designation. */
4818 for (i = 0; i < 4; i++)
4819 CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
4820 /* Not single shifting initially. */
4821 CODING_ISO_SINGLE_SHIFTING (coding) = 0;
4822 /* Beginning of buffer should also be regarded as bol. */
4823 CODING_ISO_BOL (coding) = 1;
4824 coding->detector = detect_coding_iso_2022;
4825 coding->decoder = decode_coding_iso_2022;
4826 coding->encoder = encode_coding_iso_2022;
4827 if (flags & CODING_ISO_FLAG_SAFE)
4828 coding->mode |= CODING_MODE_SAFE_ENCODING;
4829 coding->common_flags
4830 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
4831 | CODING_REQUIRE_FLUSHING_MASK);
4832 if (flags & CODING_ISO_FLAG_COMPOSITION)
4833 coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
4834 if (flags & CODING_ISO_FLAG_DESIGNATION)
4835 coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
4836 if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
4837 {
4838 setup_iso_safe_charsets (attrs);
4839 val = CODING_ATTR_SAFE_CHARSETS (attrs);
4840 coding->max_charset_id = XSTRING (val)->size - 1;
4841 coding->safe_charsets = (char *) XSTRING (val)->data;
4842 }
4843 CODING_ISO_FLAGS (coding) = flags;
4844 }
4845 else if (EQ (coding_type, Qcharset))
4846 {
4847 coding->detector = detect_coding_charset;
4848 coding->decoder = decode_coding_charset;
4849 coding->encoder = encode_coding_charset;
4850 coding->common_flags
4851 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4852 }
4853 else if (EQ (coding_type, Qutf_8))
4854 {
4855 coding->detector = detect_coding_utf_8;
4856 coding->decoder = decode_coding_utf_8;
4857 coding->encoder = encode_coding_utf_8;
4858 coding->common_flags
4859 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4860 }
4861 else if (EQ (coding_type, Qutf_16))
4862 {
4863 val = AREF (attrs, coding_attr_utf_16_bom);
4864 CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_16_detect_bom
4865 : EQ (val, Qt) ? utf_16_with_bom
4866 : utf_16_without_bom);
4867 val = AREF (attrs, coding_attr_utf_16_endian);
4868 CODING_UTF_16_ENDIAN (coding) = (NILP (val) ? utf_16_big_endian
4869 : utf_16_little_endian);
4870 CODING_UTF_16_SURROGATE (coding) = 0;
4871 coding->detector = detect_coding_utf_16;
4872 coding->decoder = decode_coding_utf_16;
4873 coding->encoder = encode_coding_utf_16;
4874 coding->common_flags
4875 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4876 }
4877 else if (EQ (coding_type, Qccl))
4878 {
4879 coding->detector = detect_coding_ccl;
4880 coding->decoder = decode_coding_ccl;
4881 coding->encoder = encode_coding_ccl;
4882 coding->common_flags
4883 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
4884 | CODING_REQUIRE_FLUSHING_MASK);
4885 }
4886 else if (EQ (coding_type, Qemacs_mule))
4887 {
4888 coding->detector = detect_coding_emacs_mule;
4889 coding->decoder = decode_coding_emacs_mule;
4890 coding->encoder = encode_coding_emacs_mule;
4891 coding->common_flags
4892 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4893 if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
4894 && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
4895 {
4896 Lisp_Object tail, safe_charsets;
4897 int max_charset_id = 0;
4898
4899 for (tail = Vemacs_mule_charset_list; CONSP (tail);
4900 tail = XCDR (tail))
4901 if (max_charset_id < XFASTINT (XCAR (tail)))
4902 max_charset_id = XFASTINT (XCAR (tail));
4903 safe_charsets = Fmake_string (make_number (max_charset_id + 1),
4904 make_number (255));
4905 for (tail = Vemacs_mule_charset_list; CONSP (tail);
4906 tail = XCDR (tail))
4907 XSTRING (safe_charsets)->data[XFASTINT (XCAR (tail))] = 0;
4908 coding->max_charset_id = max_charset_id;
4909 coding->safe_charsets = (char *) XSTRING (safe_charsets)->data;
4910 }
4911 }
4912 else if (EQ (coding_type, Qshift_jis))
4913 {
4914 coding->detector = detect_coding_sjis;
4915 coding->decoder = decode_coding_sjis;
4916 coding->encoder = encode_coding_sjis;
4917 coding->common_flags
4918 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4919 }
4920 else if (EQ (coding_type, Qbig5))
4921 {
4922 coding->detector = detect_coding_big5;
4923 coding->decoder = decode_coding_big5;
4924 coding->encoder = encode_coding_big5;
4925 coding->common_flags
4926 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4927 }
4928 else /* EQ (coding_type, Qraw_text) */
4929 {
4930 coding->detector = NULL;
4931 coding->decoder = decode_coding_raw_text;
4932 coding->encoder = encode_coding_raw_text;
4933 coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
4934 }
4935
4936 return;
4937 }
4938
4939 /* Return raw-text or one of its subsidiaries that has the same
4940 eol_type as CODING-SYSTEM. */
4941
4942 Lisp_Object
4943 raw_text_coding_system (coding_system)
4944 Lisp_Object coding_system;
4945 {
4946 Lisp_Object spec, attrs;
4947 Lisp_Object eol_type, raw_text_eol_type;
4948
4949 spec = CODING_SYSTEM_SPEC (coding_system);
4950 attrs = AREF (spec, 0);
4951
4952 if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
4953 return coding_system;
4954
4955 eol_type = AREF (spec, 2);
4956 if (VECTORP (eol_type))
4957 return Qraw_text;
4958 spec = CODING_SYSTEM_SPEC (Qraw_text);
4959 raw_text_eol_type = AREF (spec, 2);
4960 return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
4961 : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
4962 : AREF (raw_text_eol_type, 2));
4963 }
4964
4965
4966 /* If CODING_SYSTEM doesn't specify end-of-line format but PARENT
4967 does, return one of the subsidiary that has the same eol-spec as
4968 PARENT. Otherwise, return CODING_SYSTEM. */
4969
4970 Lisp_Object
4971 coding_inherit_eol_type (coding_system, parent)
4972 Lisp_Object coding_system, parent;
4973 {
4974 Lisp_Object spec, attrs, eol_type;
4975
4976 spec = CODING_SYSTEM_SPEC (coding_system);
4977 attrs = AREF (spec, 0);
4978 eol_type = AREF (spec, 2);
4979 if (VECTORP (eol_type))
4980 {
4981 Lisp_Object parent_spec;
4982 Lisp_Object parent_eol_type;
4983
4984 parent_spec
4985 = CODING_SYSTEM_SPEC (buffer_defaults.buffer_file_coding_system);
4986 parent_eol_type = AREF (parent_spec, 2);
4987 if (EQ (parent_eol_type, Qunix))
4988 coding_system = AREF (eol_type, 0);
4989 else if (EQ (parent_eol_type, Qdos))
4990 coding_system = AREF (eol_type, 1);
4991 else if (EQ (parent_eol_type, Qmac))
4992 coding_system = AREF (eol_type, 2);
4993 }
4994 return coding_system;
4995 }
4996
4997 /* Emacs has a mechanism to automatically detect a coding system if it
4998 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
4999 it's impossible to distinguish some coding systems accurately
5000 because they use the same range of codes. So, at first, coding
5001 systems are categorized into 7, those are:
5002
5003 o coding-category-emacs-mule
5004
5005 The category for a coding system which has the same code range
5006 as Emacs' internal format. Assigned the coding-system (Lisp
5007 symbol) `emacs-mule' by default.
5008
5009 o coding-category-sjis
5010
5011 The category for a coding system which has the same code range
5012 as SJIS. Assigned the coding-system (Lisp
5013 symbol) `japanese-shift-jis' by default.
5014
5015 o coding-category-iso-7
5016
5017 The category for a coding system which has the same code range
5018 as ISO2022 of 7-bit environment. This doesn't use any locking
5019 shift and single shift functions. This can encode/decode all
5020 charsets. Assigned the coding-system (Lisp symbol)
5021 `iso-2022-7bit' by default.
5022
5023 o coding-category-iso-7-tight
5024
5025 Same as coding-category-iso-7 except that this can
5026 encode/decode only the specified charsets.
5027
5028 o coding-category-iso-8-1
5029
5030 The category for a coding system which has the same code range
5031 as ISO2022 of 8-bit environment and graphic plane 1 used only
5032 for DIMENSION1 charset. This doesn't use any locking shift
5033 and single shift functions. Assigned the coding-system (Lisp
5034 symbol) `iso-latin-1' by default.
5035
5036 o coding-category-iso-8-2
5037
5038 The category for a coding system which has the same code range
5039 as ISO2022 of 8-bit environment and graphic plane 1 used only
5040 for DIMENSION2 charset. This doesn't use any locking shift
5041 and single shift functions. Assigned the coding-system (Lisp
5042 symbol) `japanese-iso-8bit' by default.
5043
5044 o coding-category-iso-7-else
5045
5046 The category for a coding system which has the same code range
5047 as ISO2022 of 7-bit environemnt but uses locking shift or
5048 single shift functions. Assigned the coding-system (Lisp
5049 symbol) `iso-2022-7bit-lock' by default.
5050
5051 o coding-category-iso-8-else
5052
5053 The category for a coding system which has the same code range
5054 as ISO2022 of 8-bit environemnt but uses locking shift or
5055 single shift functions. Assigned the coding-system (Lisp
5056 symbol) `iso-2022-8bit-ss2' by default.
5057
5058 o coding-category-big5
5059
5060 The category for a coding system which has the same code range
5061 as BIG5. Assigned the coding-system (Lisp symbol)
5062 `cn-big5' by default.
5063
5064 o coding-category-utf-8
5065
5066 The category for a coding system which has the same code range
5067 as UTF-8 (cf. RFC2279). Assigned the coding-system (Lisp
5068 symbol) `utf-8' by default.
5069
5070 o coding-category-utf-16-be
5071
5072 The category for a coding system in which a text has an
5073 Unicode signature (cf. Unicode Standard) in the order of BIG
5074 endian at the head. Assigned the coding-system (Lisp symbol)
5075 `utf-16-be' by default.
5076
5077 o coding-category-utf-16-le
5078
5079 The category for a coding system in which a text has an
5080 Unicode signature (cf. Unicode Standard) in the order of
5081 LITTLE endian at the head. Assigned the coding-system (Lisp
5082 symbol) `utf-16-le' by default.
5083
5084 o coding-category-ccl
5085
5086 The category for a coding system of which encoder/decoder is
5087 written in CCL programs. The default value is nil, i.e., no
5088 coding system is assigned.
5089
5090 o coding-category-binary
5091
5092 The category for a coding system not categorized in any of the
5093 above. Assigned the coding-system (Lisp symbol)
5094 `no-conversion' by default.
5095
5096 Each of them is a Lisp symbol and the value is an actual
5097 `coding-system's (this is also a Lisp symbol) assigned by a user.
5098 What Emacs does actually is to detect a category of coding system.
5099 Then, it uses a `coding-system' assigned to it. If Emacs can't
5100 decide only one possible category, it selects a category of the
5101 highest priority. Priorities of categories are also specified by a
5102 user in a Lisp variable `coding-category-list'.
5103
5104 */
5105
5106 #define EOL_SEEN_NONE 0
5107 #define EOL_SEEN_LF 1
5108 #define EOL_SEEN_CR 2
5109 #define EOL_SEEN_CRLF 4
5110
5111 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
5112 SOURCE is encoded. If CATEGORY is one of
5113 coding_category_utf_16_XXXX, assume that CR and LF are encoded by
5114 two-byte, else they are encoded by one-byte.
5115
5116 Return one of EOL_SEEN_XXX. */
5117
5118 #define MAX_EOL_CHECK_COUNT 3
5119
5120 static int
5121 detect_eol (source, src_bytes, category)
5122 unsigned char *source;
5123 EMACS_INT src_bytes;
5124 enum coding_category category;
5125 {
5126 unsigned char *src = source, *src_end = src + src_bytes;
5127 unsigned char c;
5128 int total = 0;
5129 int eol_seen = EOL_SEEN_NONE;
5130
5131 if ((1 << category) & CATEGORY_MASK_UTF_16)
5132 {
5133 int msb, lsb;
5134
5135 msb = category == (coding_category_utf_16_le
5136 | coding_category_utf_16_le_nosig);
5137 lsb = 1 - msb;
5138
5139 while (src + 1 < src_end)
5140 {
5141 c = src[lsb];
5142 if (src[msb] == 0 && (c == '\n' || c == '\r'))
5143 {
5144 int this_eol;
5145
5146 if (c == '\n')
5147 this_eol = EOL_SEEN_LF;
5148 else if (src + 3 >= src_end
5149 || src[msb + 2] != 0
5150 || src[lsb + 2] != '\n')
5151 this_eol = EOL_SEEN_CR;
5152 else
5153 this_eol = EOL_SEEN_CRLF;
5154
5155 if (eol_seen == EOL_SEEN_NONE)
5156 /* This is the first end-of-line. */
5157 eol_seen = this_eol;
5158 else if (eol_seen != this_eol)
5159 {
5160 /* The found type is different from what found before. */
5161 eol_seen = EOL_SEEN_LF;
5162 break;
5163 }
5164 if (++total == MAX_EOL_CHECK_COUNT)
5165 break;
5166 }
5167 src += 2;
5168 }
5169 }
5170 else
5171 {
5172 while (src < src_end)
5173 {
5174 c = *src++;
5175 if (c == '\n' || c == '\r')
5176 {
5177 int this_eol;
5178
5179 if (c == '\n')
5180 this_eol = EOL_SEEN_LF;
5181 else if (src >= src_end || *src != '\n')
5182 this_eol = EOL_SEEN_CR;
5183 else
5184 this_eol = EOL_SEEN_CRLF, src++;
5185
5186 if (eol_seen == EOL_SEEN_NONE)
5187 /* This is the first end-of-line. */
5188 eol_seen = this_eol;
5189 else if (eol_seen != this_eol)
5190 {
5191 /* The found type is different from what found before. */
5192 eol_seen = EOL_SEEN_LF;
5193 break;
5194 }
5195 if (++total == MAX_EOL_CHECK_COUNT)
5196 break;
5197 }
5198 }
5199 }
5200 return eol_seen;
5201 }
5202
5203
5204 static void
5205 adjust_coding_eol_type (coding, eol_seen)
5206 struct coding_system *coding;
5207 int eol_seen;
5208 {
5209 Lisp_Object eol_type;
5210
5211 eol_type = CODING_ID_EOL_TYPE (coding->id);
5212 if (eol_seen & EOL_SEEN_LF)
5213 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
5214 else if (eol_seen & EOL_SEEN_CRLF)
5215 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
5216 else if (eol_seen & EOL_SEEN_CR)
5217 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
5218 }
5219
5220 /* Detect how a text specified in CODING is encoded. If a coding
5221 system is detected, update fields of CODING by the detected coding
5222 system. */
5223
5224 void
5225 detect_coding (coding)
5226 struct coding_system *coding;
5227 {
5228 unsigned char *src, *src_end;
5229 Lisp_Object attrs, coding_type;
5230
5231 coding->consumed = coding->consumed_char = 0;
5232 coding->produced = coding->produced_char = 0;
5233 coding_set_source (coding);
5234
5235 src_end = coding->source + coding->src_bytes;
5236
5237 /* If we have not yet decided the text encoding type, detect it
5238 now. */
5239 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
5240 {
5241 int c, i;
5242
5243 for (src = coding->source; src < src_end; src++)
5244 {
5245 c = *src;
5246 if (c & 0x80 || (c < 0x20 && (c == ISO_CODE_ESC
5247 || c == ISO_CODE_SI
5248 || c == ISO_CODE_SO)))
5249 break;
5250 }
5251 coding->head_ascii = src - (coding->source + coding->consumed);
5252
5253 if (coding->head_ascii < coding->src_bytes)
5254 {
5255 struct coding_detection_info detect_info;
5256 enum coding_category category;
5257 struct coding_system *this;
5258
5259 detect_info.checked = detect_info.found = detect_info.rejected = 0;
5260 for (i = 0; i < coding_category_raw_text; i++)
5261 {
5262 category = coding_priorities[i];
5263 this = coding_categories + category;
5264 if (this->id < 0)
5265 {
5266 /* No coding system of this category is defined. */
5267 detect_info.rejected |= (1 << category);
5268 }
5269 else if (category >= coding_category_raw_text)
5270 continue;
5271 else if (detect_info.checked & (1 << category))
5272 {
5273 if (detect_info.found & (1 << category))
5274 break;
5275 }
5276 else if ((*(this->detector)) (coding, &detect_info)
5277 && detect_info.found & (1 << category))
5278 break;
5279 }
5280 if (i < coding_category_raw_text)
5281 setup_coding_system (CODING_ID_NAME (this->id), coding);
5282 else if (detect_info.rejected == CATEGORY_MASK_ANY)
5283 setup_coding_system (Qraw_text, coding);
5284 else if (detect_info.rejected)
5285 for (i = 0; i < coding_category_raw_text; i++)
5286 if (! (detect_info.rejected & (1 << coding_priorities[i])))
5287 {
5288 this = coding_categories + coding_priorities[i];
5289 setup_coding_system (CODING_ID_NAME (this->id), coding);
5290 break;
5291 }
5292 }
5293 }
5294
5295 attrs = CODING_ID_ATTRS (coding->id);
5296 coding_type = CODING_ATTR_TYPE (attrs);
5297
5298 /* If we have not yet decided the EOL type, detect it now. But, the
5299 detection is impossible for a CCL based coding system, in which
5300 case, we detct the EOL type after decoding. */
5301 if (VECTORP (CODING_ID_EOL_TYPE (coding->id))
5302 && ! EQ (coding_type, Qccl))
5303 {
5304 int eol_seen = detect_eol (coding->source, coding->src_bytes,
5305 XINT (CODING_ATTR_CATEGORY (attrs)));
5306
5307 if (eol_seen != EOL_SEEN_NONE)
5308 adjust_coding_eol_type (coding, eol_seen);
5309 }
5310 }
5311
5312
5313 static void
5314 decode_eol (coding)
5315 struct coding_system *coding;
5316 {
5317 if (VECTORP (CODING_ID_EOL_TYPE (coding->id)))
5318 {
5319 unsigned char *p = CHAR_POS_ADDR (coding->dst_pos);
5320 unsigned char *pend = p + coding->produced;
5321 int eol_seen = EOL_SEEN_NONE;
5322
5323 for (; p < pend; p++)
5324 {
5325 if (*p == '\n')
5326 eol_seen |= EOL_SEEN_LF;
5327 else if (*p == '\r')
5328 {
5329 if (p + 1 < pend && *(p + 1) == '\n')
5330 {
5331 eol_seen |= EOL_SEEN_CRLF;
5332 p++;
5333 }
5334 else
5335 eol_seen |= EOL_SEEN_CR;
5336 }
5337 }
5338 if (eol_seen != EOL_SEEN_NONE)
5339 adjust_coding_eol_type (coding, eol_seen);
5340 }
5341
5342 if (EQ (CODING_ID_EOL_TYPE (coding->id), Qmac))
5343 {
5344 unsigned char *p = CHAR_POS_ADDR (coding->dst_pos);
5345 unsigned char *pend = p + coding->produced;
5346
5347 for (; p < pend; p++)
5348 if (*p == '\r')
5349 *p = '\n';
5350 }
5351 else if (EQ (CODING_ID_EOL_TYPE (coding->id), Qdos))
5352 {
5353 unsigned char *p, *pbeg, *pend;
5354 Lisp_Object undo_list;
5355
5356 move_gap_both (coding->dst_pos + coding->produced_char,
5357 coding->dst_pos_byte + coding->produced);
5358 undo_list = current_buffer->undo_list;
5359 current_buffer->undo_list = Qt;
5360 del_range_2 (coding->dst_pos, coding->dst_pos_byte, GPT, GPT_BYTE, 0);
5361 current_buffer->undo_list = undo_list;
5362 pbeg = GPT_ADDR;
5363 pend = pbeg + coding->produced;
5364
5365 for (p = pend - 1; p >= pbeg; p--)
5366 if (*p == '\r')
5367 {
5368 safe_bcopy ((char *) (p + 1), (char *) p, pend - p - 1);
5369 pend--;
5370 }
5371 coding->produced_char -= coding->produced - (pend - pbeg);
5372 coding->produced = pend - pbeg;
5373 insert_from_gap (coding->produced_char, coding->produced);
5374 }
5375 }
5376
5377 static void
5378 translate_chars (coding, table)
5379 struct coding_system *coding;
5380 Lisp_Object table;
5381 {
5382 int *charbuf = coding->charbuf;
5383 int *charbuf_end = charbuf + coding->charbuf_used;
5384 int c;
5385
5386 if (coding->chars_at_source)
5387 return;
5388
5389 while (charbuf < charbuf_end)
5390 {
5391 c = *charbuf;
5392 if (c < 0)
5393 charbuf += c;
5394 else
5395 *charbuf++ = translate_char (table, c);
5396 }
5397 }
5398
5399 static int
5400 produce_chars (coding)
5401 struct coding_system *coding;
5402 {
5403 unsigned char *dst = coding->destination + coding->produced;
5404 unsigned char *dst_end = coding->destination + coding->dst_bytes;
5405 int produced;
5406 int produced_chars = 0;
5407
5408 if (! coding->chars_at_source)
5409 {
5410 /* Characters are in coding->charbuf. */
5411 int *buf = coding->charbuf;
5412 int *buf_end = buf + coding->charbuf_used;
5413 unsigned char *adjusted_dst_end;
5414
5415 if (BUFFERP (coding->src_object)
5416 && EQ (coding->src_object, coding->dst_object))
5417 dst_end = coding->source + coding->consumed;
5418 adjusted_dst_end = dst_end - MAX_MULTIBYTE_LENGTH;
5419
5420 while (buf < buf_end)
5421 {
5422 int c = *buf++;
5423
5424 if (dst >= adjusted_dst_end)
5425 {
5426 dst = alloc_destination (coding,
5427 buf_end - buf + MAX_MULTIBYTE_LENGTH,
5428 dst);
5429 dst_end = coding->destination + coding->dst_bytes;
5430 adjusted_dst_end = dst_end - MAX_MULTIBYTE_LENGTH;
5431 }
5432 if (c >= 0)
5433 {
5434 if (coding->dst_multibyte
5435 || ! CHAR_BYTE8_P (c))
5436 CHAR_STRING_ADVANCE (c, dst);
5437 else
5438 *dst++ = CHAR_TO_BYTE8 (c);
5439 produced_chars++;
5440 }
5441 else
5442 /* This is an annotation datum. */
5443 buf -= c + 1;
5444 }
5445 }
5446 else
5447 {
5448 unsigned char *src = coding->source;
5449 unsigned char *src_end = src + coding->src_bytes;
5450 Lisp_Object eol_type;
5451
5452 eol_type = CODING_ID_EOL_TYPE (coding->id);
5453
5454 if (coding->src_multibyte != coding->dst_multibyte)
5455 {
5456 if (coding->src_multibyte)
5457 {
5458 int multibytep = 1;
5459 int consumed_chars;
5460
5461 while (1)
5462 {
5463 unsigned char *src_base = src;
5464 int c;
5465
5466 ONE_MORE_BYTE (c);
5467 if (c == '\r')
5468 {
5469 if (EQ (eol_type, Qdos))
5470 {
5471 if (src == src_end)
5472 {
5473 coding->result = CODING_RESULT_INSUFFICIENT_SRC;
5474 goto no_more_source;
5475 }
5476 if (*src == '\n')
5477 c = *src++;
5478 }
5479 else if (EQ (eol_type, Qmac))
5480 c = '\n';
5481 }
5482 if (dst == dst_end)
5483 {
5484 coding->consumed = src - coding->source;
5485
5486 if (EQ (coding->src_object, coding->dst_object))
5487 dst_end = src;
5488 if (dst == dst_end)
5489 {
5490 dst = alloc_destination (coding, src_end - src + 1,
5491 dst);
5492 dst_end = coding->destination + coding->dst_bytes;
5493 coding_set_source (coding);
5494 src = coding->source + coding->consumed;
5495 src_end = coding->source + coding->src_bytes;
5496 }
5497 }
5498 *dst++ = c;
5499 produced_chars++;
5500 }
5501 no_more_source:
5502 ;
5503 }
5504 else
5505 while (src < src_end)
5506 {
5507 int multibytep = 1;
5508 int c = *src++;
5509
5510 if (c == '\r')
5511 {
5512 if (EQ (eol_type, Qdos))
5513 {
5514 if (src < src_end
5515 && *src == '\n')
5516 c = *src++;
5517 }
5518 else if (EQ (eol_type, Qmac))
5519 c = '\n';
5520 }
5521 if (dst >= dst_end - 1)
5522 {
5523 coding->consumed = src - coding->source;
5524
5525 if (EQ (coding->src_object, coding->dst_object))
5526 dst_end = src;
5527 if (dst >= dst_end - 1)
5528 {
5529 dst = alloc_destination (coding, src_end - src + 2,
5530 dst);
5531 dst_end = coding->destination + coding->dst_bytes;
5532 coding_set_source (coding);
5533 src = coding->source + coding->consumed;
5534 src_end = coding->source + coding->src_bytes;
5535 }
5536 }
5537 EMIT_ONE_BYTE (c);
5538 }
5539 }
5540 else
5541 {
5542 if (!EQ (coding->src_object, coding->dst_object))
5543 {
5544 int require = coding->src_bytes - coding->dst_bytes;
5545
5546 if (require > 0)
5547 {
5548 EMACS_INT offset = src - coding->source;
5549
5550 dst = alloc_destination (coding, require, dst);
5551 coding_set_source (coding);
5552 src = coding->source + offset;
5553 src_end = coding->source + coding->src_bytes;
5554 }
5555 }
5556 produced_chars = coding->src_chars;
5557 while (src < src_end)
5558 {
5559 int c = *src++;
5560
5561 if (c == '\r')
5562 {
5563 if (EQ (eol_type, Qdos))
5564 {
5565 if (src < src_end
5566 && *src == '\n')
5567 c = *src++;
5568 produced_chars--;
5569 }
5570 else if (EQ (eol_type, Qmac))
5571 c = '\n';
5572 }
5573 *dst++ = c;
5574 }
5575 }
5576 coding->consumed = coding->src_bytes;
5577 coding->consumed_char = coding->src_chars;
5578 }
5579
5580 produced = dst - (coding->destination + coding->produced);
5581 if (BUFFERP (coding->dst_object))
5582 insert_from_gap (produced_chars, produced);
5583 coding->produced += produced;
5584 coding->produced_char += produced_chars;
5585 return produced_chars;
5586 }
5587
5588 /* Compose text in CODING->object according to the annotation data at
5589 CHARBUF. CHARBUF is an array:
5590 [ -LENGTH ANNOTATION_MASK FROM TO METHOD COMP_LEN [ COMPONENTS... ] ]
5591 */
5592
5593 static INLINE void
5594 produce_composition (coding, charbuf)
5595 struct coding_system *coding;
5596 int *charbuf;
5597 {
5598 int len;
5599 EMACS_INT from, to;
5600 enum composition_method method;
5601 Lisp_Object components;
5602
5603 len = -charbuf[0];
5604 from = coding->dst_pos + charbuf[2];
5605 to = coding->dst_pos + charbuf[3];
5606 method = (enum composition_method) (charbuf[4]);
5607
5608 if (method == COMPOSITION_RELATIVE)
5609 components = Qnil;
5610 else
5611 {
5612 Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
5613 int i;
5614
5615 len -= 5;
5616 charbuf += 5;
5617 for (i = 0; i < len; i++)
5618 args[i] = make_number (charbuf[i]);
5619 components = (method == COMPOSITION_WITH_ALTCHARS
5620 ? Fstring (len, args) : Fvector (len, args));
5621 }
5622 compose_text (from, to, components, Qnil, coding->dst_object);
5623 }
5624
5625
5626 /* Put `charset' property on text in CODING->object according to
5627 the annotation data at CHARBUF. CHARBUF is an array:
5628 [ -LENGTH ANNOTATION_MASK FROM TO CHARSET-ID ]
5629 */
5630
5631 static INLINE void
5632 produce_charset (coding, charbuf)
5633 struct coding_system *coding;
5634 int *charbuf;
5635 {
5636 EMACS_INT from = coding->dst_pos + charbuf[2];
5637 EMACS_INT to = coding->dst_pos + charbuf[3];
5638 struct charset *charset = CHARSET_FROM_ID (charbuf[4]);
5639
5640 Fput_text_property (make_number (from), make_number (to),
5641 Qcharset, CHARSET_NAME (charset),
5642 coding->dst_object);
5643 }
5644
5645
5646 #define CHARBUF_SIZE 0x4000
5647
5648 #define ALLOC_CONVERSION_WORK_AREA(coding) \
5649 do { \
5650 int size = CHARBUF_SIZE;; \
5651 \
5652 coding->charbuf = NULL; \
5653 while (size > 1024) \
5654 { \
5655 coding->charbuf = (int *) alloca (sizeof (int) * size); \
5656 if (coding->charbuf) \
5657 break; \
5658 size >>= 1; \
5659 } \
5660 if (! coding->charbuf) \
5661 { \
5662 coding->result = CODING_RESULT_INSUFFICIENT_MEM; \
5663 return coding->result; \
5664 } \
5665 coding->charbuf_size = size; \
5666 } while (0)
5667
5668
5669 static void
5670 produce_annotation (coding)
5671 struct coding_system *coding;
5672 {
5673 int *charbuf = coding->charbuf;
5674 int *charbuf_end = charbuf + coding->charbuf_used;
5675
5676 if (NILP (coding->dst_object))
5677 return;
5678
5679 while (charbuf < charbuf_end)
5680 {
5681 if (*charbuf >= 0)
5682 charbuf++;
5683 else
5684 {
5685 int len = -*charbuf;
5686 switch (charbuf[1])
5687 {
5688 case CODING_ANNOTATE_COMPOSITION_MASK:
5689 produce_composition (coding, charbuf);
5690 break;
5691 case CODING_ANNOTATE_CHARSET_MASK:
5692 produce_charset (coding, charbuf);
5693 break;
5694 default:
5695 abort ();
5696 }
5697 charbuf += len;
5698 }
5699 }
5700 }
5701
5702 /* Decode the data at CODING->src_object into CODING->dst_object.
5703 CODING->src_object is a buffer, a string, or nil.
5704 CODING->dst_object is a buffer.
5705
5706 If CODING->src_object is a buffer, it must be the current buffer.
5707 In this case, if CODING->src_pos is positive, it is a position of
5708 the source text in the buffer, otherwise, the source text is in the
5709 gap area of the buffer, and CODING->src_pos specifies the offset of
5710 the text from GPT (which must be the same as PT). If this is the
5711 same buffer as CODING->dst_object, CODING->src_pos must be
5712 negative.
5713
5714 If CODING->src_object is a string, CODING->src_pos in an index to
5715 that string.
5716
5717 If CODING->src_object is nil, CODING->source must already point to
5718 the non-relocatable memory area. In this case, CODING->src_pos is
5719 an offset from CODING->source.
5720
5721 The decoded data is inserted at the current point of the buffer
5722 CODING->dst_object.
5723 */
5724
5725 static int
5726 decode_coding (coding)
5727 struct coding_system *coding;
5728 {
5729 Lisp_Object attrs;
5730
5731 if (BUFFERP (coding->src_object)
5732 && coding->src_pos > 0
5733 && coding->src_pos < GPT
5734 && coding->src_pos + coding->src_chars > GPT)
5735 move_gap_both (coding->src_pos, coding->src_pos_byte);
5736
5737 if (BUFFERP (coding->dst_object))
5738 {
5739 if (current_buffer != XBUFFER (coding->dst_object))
5740 set_buffer_internal (XBUFFER (coding->dst_object));
5741 if (GPT != PT)
5742 move_gap_both (PT, PT_BYTE);
5743 }
5744
5745 coding->consumed = coding->consumed_char = 0;
5746 coding->produced = coding->produced_char = 0;
5747 coding->chars_at_source = 0;
5748 coding->result = CODING_RESULT_SUCCESS;
5749 coding->errors = 0;
5750
5751 ALLOC_CONVERSION_WORK_AREA (coding);
5752
5753 attrs = CODING_ID_ATTRS (coding->id);
5754
5755 do
5756 {
5757 coding_set_source (coding);
5758 coding->annotated = 0;
5759 (*(coding->decoder)) (coding);
5760 if (!NILP (CODING_ATTR_DECODE_TBL (attrs)))
5761 translate_chars (coding, CODING_ATTR_DECODE_TBL (attrs));
5762 else if (!NILP (Vstandard_translation_table_for_decode))
5763 translate_chars (coding, Vstandard_translation_table_for_decode);
5764 coding_set_destination (coding);
5765 produce_chars (coding);
5766 if (coding->annotated)
5767 produce_annotation (coding);
5768 }
5769 while (coding->consumed < coding->src_bytes
5770 && ! coding->result);
5771
5772 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qccl)
5773 && SYMBOLP (CODING_ID_EOL_TYPE (coding->id))
5774 && ! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix))
5775 decode_eol (coding);
5776
5777 coding->carryover_bytes = 0;
5778 if (coding->consumed < coding->src_bytes)
5779 {
5780 int nbytes = coding->src_bytes - coding->consumed;
5781 unsigned char *src;
5782
5783 coding_set_source (coding);
5784 coding_set_destination (coding);
5785 src = coding->source + coding->consumed;
5786
5787 if (coding->mode & CODING_MODE_LAST_BLOCK)
5788 {
5789 /* Flush out unprocessed data as binary chars. We are sure
5790 that the number of data is less than the size of
5791 coding->charbuf. */
5792 while (nbytes-- > 0)
5793 {
5794 int c = *src++;
5795
5796 coding->charbuf[coding->charbuf_used++] = (c & 0x80 ? - c : c);
5797 }
5798 produce_chars (coding);
5799 }
5800 else
5801 {
5802 /* Record unprocessed bytes in coding->carryover. We are
5803 sure that the number of data is less than the size of
5804 coding->carryover. */
5805 unsigned char *p = coding->carryover;
5806
5807 coding->carryover_bytes = nbytes;
5808 while (nbytes-- > 0)
5809 *p++ = *src++;
5810 }
5811 coding->consumed = coding->src_bytes;
5812 }
5813
5814 return coding->result;
5815 }
5816
5817
5818 /* Extract an annotation datum from a composition starting at POS and
5819 ending before LIMIT of CODING->src_object (buffer or string), store
5820 the data in BUF, set *STOP to a starting position of the next
5821 composition (if any) or to LIMIT, and return the address of the
5822 next element of BUF.
5823
5824 If such an annotation is not found, set *STOP to a starting
5825 position of a composition after POS (if any) or to LIMIT, and
5826 return BUF. */
5827
5828 static INLINE int *
5829 handle_composition_annotation (pos, limit, coding, buf, stop)
5830 EMACS_INT pos, limit;
5831 struct coding_system *coding;
5832 int *buf;
5833 EMACS_INT *stop;
5834 {
5835 EMACS_INT start, end;
5836 Lisp_Object prop;
5837
5838 if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
5839 || end > limit)
5840 *stop = limit;
5841 else if (start > pos)
5842 *stop = start;
5843 else
5844 {
5845 if (start == pos)
5846 {
5847 /* We found a composition. Store the corresponding
5848 annotation data in BUF. */
5849 int *head = buf;
5850 enum composition_method method = COMPOSITION_METHOD (prop);
5851 int nchars = COMPOSITION_LENGTH (prop);
5852
5853 ADD_COMPOSITION_DATA (buf, 0, nchars, method);
5854 if (method != COMPOSITION_RELATIVE)
5855 {
5856 Lisp_Object components;
5857 int len, i, i_byte;
5858
5859 components = COMPOSITION_COMPONENTS (prop);
5860 if (VECTORP (components))
5861 {
5862 len = XVECTOR (components)->size;
5863 for (i = 0; i < len; i++)
5864 *buf++ = XINT (AREF (components, i));
5865 }
5866 else if (STRINGP (components))
5867 {
5868 len = XSTRING (components)->size;
5869 i = i_byte = 0;
5870 while (i < len)
5871 {
5872 FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
5873 buf++;
5874 }
5875 }
5876 else if (INTEGERP (components))
5877 {
5878 len = 1;
5879 *buf++ = XINT (components);
5880 }
5881 else if (CONSP (components))
5882 {
5883 for (len = 0; CONSP (components);
5884 len++, components = XCDR (components))
5885 *buf++ = XINT (XCAR (components));
5886 }
5887 else
5888 abort ();
5889 *head -= len;
5890 }
5891 }
5892
5893 if (find_composition (end, limit, &start, &end, &prop,
5894 coding->src_object)
5895 && end <= limit)
5896 *stop = start;
5897 else
5898 *stop = limit;
5899 }
5900 return buf;
5901 }
5902
5903
5904 /* Extract an annotation datum from a text property `charset' at POS of
5905 CODING->src_object (buffer of string), store the data in BUF, set
5906 *STOP to the position where the value of `charset' property changes
5907 (limiting by LIMIT), and return the address of the next element of
5908 BUF.
5909
5910 If the property value is nil, set *STOP to the position where the
5911 property value is non-nil (limiting by LIMIT), and return BUF. */
5912
5913 static INLINE int *
5914 handle_charset_annotation (pos, limit, coding, buf, stop)
5915 EMACS_INT pos, limit;
5916 struct coding_system *coding;
5917 int *buf;
5918 EMACS_INT *stop;
5919 {
5920 Lisp_Object val, next;
5921 int id;
5922
5923 val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
5924 if (! NILP (val) && CHARSETP (val))
5925 id = XINT (CHARSET_SYMBOL_ID (val));
5926 else
5927 id = -1;
5928 ADD_CHARSET_DATA (buf, 0, 0, id);
5929 next = Fnext_single_property_change (make_number (pos), Qcharset,
5930 coding->src_object,
5931 make_number (limit));
5932 *stop = XINT (next);
5933 return buf;
5934 }
5935
5936
5937 static void
5938 consume_chars (coding)
5939 struct coding_system *coding;
5940 {
5941 int *buf = coding->charbuf;
5942 int *buf_end = coding->charbuf + coding->charbuf_size;
5943 const unsigned char *src = coding->source + coding->consumed;
5944 EMACS_INT pos = coding->src_pos + coding->consumed_char;
5945 EMACS_INT end_pos = coding->src_pos + coding->src_chars;
5946 int multibytep = coding->src_multibyte;
5947 Lisp_Object eol_type;
5948 int c;
5949 EMACS_INT stop, stop_composition, stop_charset;
5950 int id;
5951
5952 eol_type = CODING_ID_EOL_TYPE (coding->id);
5953 if (VECTORP (eol_type))
5954 eol_type = Qunix;
5955
5956 /* Note: composition handling is not yet implemented. */
5957 coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
5958
5959 if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
5960 stop = stop_composition = pos;
5961 else
5962 stop = stop_composition = end_pos;
5963 if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
5964 stop = stop_charset = pos;
5965 else
5966 stop_charset = end_pos;
5967
5968 /* Compensate for CRLF and annotation. */
5969 buf_end -= 1 + MAX_ANNOTATION_LENGTH;
5970 while (buf < buf_end)
5971 {
5972 if (pos == stop)
5973 {
5974 int *p;
5975
5976 if (pos == end_pos)
5977 break;
5978 if (pos == stop_composition)
5979 buf = handle_composition_annotation (pos, end_pos, coding,
5980 buf, &stop_composition);
5981 if (pos == stop_charset)
5982 buf = handle_charset_annotation (pos, end_pos, coding,
5983 buf, &stop_charset);
5984 stop = (stop_composition < stop_charset
5985 ? stop_composition : stop_charset);
5986 }
5987
5988 if (! multibytep)
5989 c = *src++;
5990 else
5991 c = STRING_CHAR_ADVANCE (src);
5992 if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
5993 c = '\n';
5994 if (! EQ (eol_type, Qunix))
5995 {
5996 if (c == '\n')
5997 {
5998 if (EQ (eol_type, Qdos))
5999 *buf++ = '\r';
6000 else
6001 c = '\r';
6002 }
6003 }
6004 *buf++ = c;
6005 pos++;
6006 }
6007
6008 coding->consumed = src - coding->source;
6009 coding->consumed_char = pos - coding->src_pos;
6010 coding->charbuf_used = buf - coding->charbuf;
6011 coding->chars_at_source = 0;
6012 }
6013
6014
6015 /* Encode the text at CODING->src_object into CODING->dst_object.
6016 CODING->src_object is a buffer or a string.
6017 CODING->dst_object is a buffer or nil.
6018
6019 If CODING->src_object is a buffer, it must be the current buffer.
6020 In this case, if CODING->src_pos is positive, it is a position of
6021 the source text in the buffer, otherwise. the source text is in the
6022 gap area of the buffer, and coding->src_pos specifies the offset of
6023 the text from GPT (which must be the same as PT). If this is the
6024 same buffer as CODING->dst_object, CODING->src_pos must be
6025 negative and CODING should not have `pre-write-conversion'.
6026
6027 If CODING->src_object is a string, CODING should not have
6028 `pre-write-conversion'.
6029
6030 If CODING->dst_object is a buffer, the encoded data is inserted at
6031 the current point of that buffer.
6032
6033 If CODING->dst_object is nil, the encoded data is placed at the
6034 memory area specified by CODING->destination. */
6035
6036 static int
6037 encode_coding (coding)
6038 struct coding_system *coding;
6039 {
6040 Lisp_Object attrs;
6041
6042 attrs = CODING_ID_ATTRS (coding->id);
6043
6044 if (BUFFERP (coding->dst_object))
6045 {
6046 set_buffer_internal (XBUFFER (coding->dst_object));
6047 coding->dst_multibyte
6048 = ! NILP (current_buffer->enable_multibyte_characters);
6049 }
6050
6051 coding->consumed = coding->consumed_char = 0;
6052 coding->produced = coding->produced_char = 0;
6053 coding->result = CODING_RESULT_SUCCESS;
6054 coding->errors = 0;
6055
6056 ALLOC_CONVERSION_WORK_AREA (coding);
6057
6058 do {
6059 coding_set_source (coding);
6060 consume_chars (coding);
6061
6062 if (!NILP (CODING_ATTR_ENCODE_TBL (attrs)))
6063 translate_chars (coding, CODING_ATTR_ENCODE_TBL (attrs));
6064 else if (!NILP (Vstandard_translation_table_for_encode))
6065 translate_chars (coding, Vstandard_translation_table_for_encode);
6066
6067 coding_set_destination (coding);
6068 (*(coding->encoder)) (coding);
6069 } while (coding->consumed_char < coding->src_chars);
6070
6071 if (BUFFERP (coding->dst_object))
6072 insert_from_gap (coding->produced_char, coding->produced);
6073
6074 return (coding->result);
6075 }
6076
6077 /* Work buffer */
6078
6079 /* List of currently used working buffer. */
6080 Lisp_Object Vcode_conversion_work_buf_list;
6081
6082 /* A working buffer used by the top level conversion. */
6083 Lisp_Object Vcode_conversion_reused_work_buf;
6084
6085
6086 /* Return a working buffer that can be freely used by the following
6087 code conversion. MULTIBYTEP specifies the multibyteness of the
6088 buffer. */
6089
6090 Lisp_Object
6091 make_conversion_work_buffer (multibytep)
6092 int multibytep;
6093 {
6094 struct buffer *current = current_buffer;
6095 Lisp_Object buf;
6096
6097 if (NILP (Vcode_conversion_work_buf_list))
6098 {
6099 if (NILP (Vcode_conversion_reused_work_buf))
6100 Vcode_conversion_reused_work_buf
6101 = Fget_buffer_create (build_string (" *code-conversion-work*"));
6102 Vcode_conversion_work_buf_list
6103 = Fcons (Vcode_conversion_reused_work_buf, Qnil);
6104 }
6105 else
6106 {
6107 int depth = XINT (Flength (Vcode_conversion_work_buf_list));
6108 char str[128];
6109
6110 sprintf (str, " *code-conversion-work*<%d>", depth);
6111 Vcode_conversion_work_buf_list
6112 = Fcons (Fget_buffer_create (build_string (str)),
6113 Vcode_conversion_work_buf_list);
6114 }
6115
6116 buf = XCAR (Vcode_conversion_work_buf_list);
6117 set_buffer_internal (XBUFFER (buf));
6118 current_buffer->undo_list = Qt;
6119 Ferase_buffer ();
6120 Fset_buffer_multibyte (multibytep ? Qt : Qnil, Qnil);
6121 set_buffer_internal (current);
6122 return buf;
6123 }
6124
6125 static struct coding_system *saved_coding;
6126
6127 Lisp_Object
6128 code_conversion_restore (info)
6129 Lisp_Object info;
6130 {
6131 int depth = XINT (Flength (Vcode_conversion_work_buf_list));
6132 Lisp_Object buf;
6133
6134 if (depth > 0)
6135 {
6136 buf = XCAR (Vcode_conversion_work_buf_list);
6137 Vcode_conversion_work_buf_list = XCDR (Vcode_conversion_work_buf_list);
6138 if (depth > 1 && !NILP (Fbuffer_live_p (buf)))
6139 Fkill_buffer (buf);
6140 }
6141
6142 if (EQ (saved_coding->dst_object, Qt)
6143 && saved_coding->destination)
6144 xfree (saved_coding->destination);
6145
6146 return save_excursion_restore (info);
6147 }
6148
6149
6150 int
6151 decode_coding_gap (coding, chars, bytes)
6152 struct coding_system *coding;
6153 EMACS_INT chars, bytes;
6154 {
6155 int count = specpdl_ptr - specpdl;
6156
6157 saved_coding = coding;
6158 record_unwind_protect (code_conversion_restore, save_excursion_save ());
6159
6160 coding->src_object = Fcurrent_buffer ();
6161 coding->src_chars = chars;
6162 coding->src_bytes = bytes;
6163 coding->src_pos = -chars;
6164 coding->src_pos_byte = -bytes;
6165 coding->src_multibyte = chars < bytes;
6166 coding->dst_object = coding->src_object;
6167 coding->dst_pos = PT;
6168 coding->dst_pos_byte = PT_BYTE;
6169 coding->dst_multibyte = ! NILP (current_buffer->enable_multibyte_characters);
6170 coding->mode |= CODING_MODE_LAST_BLOCK;
6171
6172 if (CODING_REQUIRE_DETECTION (coding))
6173 detect_coding (coding);
6174
6175 decode_coding (coding);
6176
6177 unbind_to (count, Qnil);
6178 return coding->result;
6179 }
6180
6181 int
6182 encode_coding_gap (coding, chars, bytes)
6183 struct coding_system *coding;
6184 EMACS_INT chars, bytes;
6185 {
6186 int count = specpdl_ptr - specpdl;
6187 Lisp_Object buffer;
6188
6189 saved_coding = coding;
6190 record_unwind_protect (code_conversion_restore, save_excursion_save ());
6191
6192 buffer = Fcurrent_buffer ();
6193 coding->src_object = buffer;
6194 coding->src_chars = chars;
6195 coding->src_bytes = bytes;
6196 coding->src_pos = -chars;
6197 coding->src_pos_byte = -bytes;
6198 coding->src_multibyte = chars < bytes;
6199 coding->dst_object = coding->src_object;
6200 coding->dst_pos = PT;
6201 coding->dst_pos_byte = PT_BYTE;
6202
6203 encode_coding (coding);
6204
6205 unbind_to (count, Qnil);
6206 return coding->result;
6207 }
6208
6209
6210 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
6211 SRC_OBJECT into DST_OBJECT by coding context CODING.
6212
6213 SRC_OBJECT is a buffer, a string, or Qnil.
6214
6215 If it is a buffer, the text is at point of the buffer. FROM and TO
6216 are positions in the buffer.
6217
6218 If it is a string, the text is at the beginning of the string.
6219 FROM and TO are indices to the string.
6220
6221 If it is nil, the text is at coding->source. FROM and TO are
6222 indices to coding->source.
6223
6224 DST_OBJECT is a buffer, Qt, or Qnil.
6225
6226 If it is a buffer, the decoded text is inserted at point of the
6227 buffer. If the buffer is the same as SRC_OBJECT, the source text
6228 is deleted.
6229
6230 If it is Qt, a string is made from the decoded text, and
6231 set in CODING->dst_object.
6232
6233 If it is Qnil, the decoded text is stored at CODING->destination.
6234 The called must allocate CODING->dst_bytes bytes at
6235 CODING->destination by xmalloc. If the decoded text is longer than
6236 CODING->dst_bytes, CODING->destination is relocated by xrealloc.
6237 */
6238
6239 void
6240 decode_coding_object (coding, src_object, from, from_byte, to, to_byte,
6241 dst_object)
6242 struct coding_system *coding;
6243 Lisp_Object src_object;
6244 EMACS_INT from, from_byte, to, to_byte;
6245 Lisp_Object dst_object;
6246 {
6247 int count = specpdl_ptr - specpdl;
6248 unsigned char *destination;
6249 EMACS_INT dst_bytes;
6250 EMACS_INT chars = to - from;
6251 EMACS_INT bytes = to_byte - from_byte;
6252 Lisp_Object attrs;
6253
6254 saved_coding = coding;
6255 record_unwind_protect (code_conversion_restore, save_excursion_save ());
6256
6257 if (NILP (dst_object))
6258 {
6259 destination = coding->destination;
6260 dst_bytes = coding->dst_bytes;
6261 }
6262
6263 coding->src_object = src_object;
6264 coding->src_chars = chars;
6265 coding->src_bytes = bytes;
6266 coding->src_multibyte = chars < bytes;
6267
6268 if (STRINGP (src_object))
6269 {
6270 coding->src_pos = from;
6271 coding->src_pos_byte = from_byte;
6272 }
6273 else if (BUFFERP (src_object))
6274 {
6275 set_buffer_internal (XBUFFER (src_object));
6276 if (from != GPT)
6277 move_gap_both (from, from_byte);
6278 if (EQ (src_object, dst_object))
6279 {
6280 TEMP_SET_PT_BOTH (from, from_byte);
6281 del_range_both (from, from_byte, to, to_byte, 1);
6282 coding->src_pos = -chars;
6283 coding->src_pos_byte = -bytes;
6284 }
6285 else
6286 {
6287 coding->src_pos = from;
6288 coding->src_pos_byte = from_byte;
6289 }
6290 }
6291
6292 if (CODING_REQUIRE_DETECTION (coding))
6293 detect_coding (coding);
6294 attrs = CODING_ID_ATTRS (coding->id);
6295
6296 if (! NILP (CODING_ATTR_POST_READ (attrs))
6297 || EQ (dst_object, Qt))
6298 {
6299 coding->dst_object = make_conversion_work_buffer (1);
6300 coding->dst_pos = BEG;
6301 coding->dst_pos_byte = BEG_BYTE;
6302 coding->dst_multibyte = 1;
6303 }
6304 else if (BUFFERP (dst_object))
6305 {
6306 coding->dst_object = dst_object;
6307 coding->dst_pos = BUF_PT (XBUFFER (dst_object));
6308 coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
6309 coding->dst_multibyte
6310 = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
6311 }
6312 else
6313 {
6314 coding->dst_object = Qnil;
6315 coding->dst_multibyte = 1;
6316 }
6317
6318 decode_coding (coding);
6319
6320 if (BUFFERP (coding->dst_object))
6321 set_buffer_internal (XBUFFER (coding->dst_object));
6322
6323 if (! NILP (CODING_ATTR_POST_READ (attrs)))
6324 {
6325 struct gcpro gcpro1, gcpro2;
6326 EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
6327 Lisp_Object val;
6328
6329 TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
6330 GCPRO2 (coding->src_object, coding->dst_object);
6331 val = call1 (CODING_ATTR_POST_READ (attrs),
6332 make_number (coding->produced_char));
6333 UNGCPRO;
6334 CHECK_NATNUM (val);
6335 coding->produced_char += Z - prev_Z;
6336 coding->produced += Z_BYTE - prev_Z_BYTE;
6337 }
6338
6339 if (EQ (dst_object, Qt))
6340 {
6341 coding->dst_object = Fbuffer_string ();
6342 }
6343 else if (NILP (dst_object) && BUFFERP (coding->dst_object))
6344 {
6345 set_buffer_internal (XBUFFER (coding->dst_object));
6346 if (dst_bytes < coding->produced)
6347 {
6348 destination
6349 = (unsigned char *) xrealloc (destination, coding->produced);
6350 if (! destination)
6351 {
6352 coding->result = CODING_RESULT_INSUFFICIENT_DST;
6353 unbind_to (count, Qnil);
6354 return;
6355 }
6356 if (BEGV < GPT && GPT < BEGV + coding->produced_char)
6357 move_gap_both (BEGV, BEGV_BYTE);
6358 bcopy (BEGV_ADDR, destination, coding->produced);
6359 coding->destination = destination;
6360 }
6361 }
6362
6363 unbind_to (count, Qnil);
6364 }
6365
6366
6367 void
6368 encode_coding_object (coding, src_object, from, from_byte, to, to_byte,
6369 dst_object)
6370 struct coding_system *coding;
6371 Lisp_Object src_object;
6372 EMACS_INT from, from_byte, to, to_byte;
6373 Lisp_Object dst_object;
6374 {
6375 int count = specpdl_ptr - specpdl;
6376 EMACS_INT chars = to - from;
6377 EMACS_INT bytes = to_byte - from_byte;
6378 Lisp_Object attrs;
6379
6380 saved_coding = coding;
6381 record_unwind_protect (code_conversion_restore, save_excursion_save ());
6382
6383 coding->src_object = src_object;
6384 coding->src_chars = chars;
6385 coding->src_bytes = bytes;
6386 coding->src_multibyte = chars < bytes;
6387
6388 attrs = CODING_ID_ATTRS (coding->id);
6389
6390 if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
6391 {
6392 coding->src_object = make_conversion_work_buffer (coding->src_multibyte);
6393 set_buffer_internal (XBUFFER (coding->src_object));
6394 if (STRINGP (src_object))
6395 insert_from_string (src_object, from, from_byte, chars, bytes, 0);
6396 else if (BUFFERP (src_object))
6397 insert_from_buffer (XBUFFER (src_object), from, chars, 0);
6398 else
6399 insert_1_both (coding->source + from, chars, bytes, 0, 0, 0);
6400
6401 if (EQ (src_object, dst_object))
6402 {
6403 set_buffer_internal (XBUFFER (src_object));
6404 del_range_both (from, from_byte, to, to_byte, 1);
6405 set_buffer_internal (XBUFFER (coding->src_object));
6406 }
6407
6408 call2 (CODING_ATTR_PRE_WRITE (attrs),
6409 make_number (BEG), make_number (Z));
6410 coding->src_object = Fcurrent_buffer ();
6411 if (BEG != GPT)
6412 move_gap_both (BEG, BEG_BYTE);
6413 coding->src_chars = Z - BEG;
6414 coding->src_bytes = Z_BYTE - BEG_BYTE;
6415 coding->src_pos = BEG;
6416 coding->src_pos_byte = BEG_BYTE;
6417 coding->src_multibyte = Z < Z_BYTE;
6418 }
6419 else if (STRINGP (src_object))
6420 {
6421 coding->src_pos = from;
6422 coding->src_pos_byte = from_byte;
6423 }
6424 else if (BUFFERP (src_object))
6425 {
6426 set_buffer_internal (XBUFFER (src_object));
6427 if (EQ (src_object, dst_object))
6428 {
6429 coding->src_object = del_range_1 (from, to, 1, 1);
6430 coding->src_pos = 0;
6431 coding->src_pos_byte = 0;
6432 }
6433 else
6434 {
6435 if (from < GPT && to >= GPT)
6436 move_gap_both (from, from_byte);
6437 coding->src_pos = from;
6438 coding->src_pos_byte = from_byte;
6439 }
6440 }
6441
6442 if (BUFFERP (dst_object))
6443 {
6444 coding->dst_object = dst_object;
6445 if (EQ (src_object, dst_object))
6446 {
6447 coding->dst_pos = from;
6448 coding->dst_pos_byte = from_byte;
6449 }
6450 else
6451 {
6452 coding->dst_pos = BUF_PT (XBUFFER (dst_object));
6453 coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
6454 }
6455 coding->dst_multibyte
6456 = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
6457 }
6458 else if (EQ (dst_object, Qt))
6459 {
6460 coding->dst_object = Qnil;
6461 coding->dst_bytes = coding->src_chars;
6462 if (coding->dst_bytes == 0)
6463 coding->dst_bytes = 1;
6464 coding->destination = (unsigned char *) xmalloc (coding->dst_bytes);
6465 coding->dst_multibyte = 0;
6466 }
6467 else
6468 {
6469 coding->dst_object = Qnil;
6470 coding->dst_multibyte = 0;
6471 }
6472
6473 encode_coding (coding);
6474
6475 if (EQ (dst_object, Qt))
6476 {
6477 if (BUFFERP (coding->dst_object))
6478 coding->dst_object = Fbuffer_string ();
6479 else
6480 {
6481 coding->dst_object
6482 = make_unibyte_string ((char *) coding->destination,
6483 coding->produced);
6484 xfree (coding->destination);
6485 }
6486 }
6487
6488 unbind_to (count, Qnil);
6489 }
6490
6491
6492 Lisp_Object
6493 preferred_coding_system ()
6494 {
6495 int id = coding_categories[coding_priorities[0]].id;
6496
6497 return CODING_ID_NAME (id);
6498 }
6499
6500 \f
6501 #ifdef emacs
6502 /*** 8. Emacs Lisp library functions ***/
6503
6504 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
6505 doc: /* Return t if OBJECT is nil or a coding-system.
6506 See the documentation of `define-coding-system' for information
6507 about coding-system objects. */)
6508 (obj)
6509 Lisp_Object obj;
6510 {
6511 return ((NILP (obj) || CODING_SYSTEM_P (obj)) ? Qt : Qnil);
6512 }
6513
6514 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
6515 Sread_non_nil_coding_system, 1, 1, 0,
6516 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT. */)
6517 (prompt)
6518 Lisp_Object prompt;
6519 {
6520 Lisp_Object val;
6521 do
6522 {
6523 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6524 Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
6525 }
6526 while (XSTRING (val)->size == 0);
6527 return (Fintern (val, Qnil));
6528 }
6529
6530 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
6531 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
6532 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM. */)
6533 (prompt, default_coding_system)
6534 Lisp_Object prompt, default_coding_system;
6535 {
6536 Lisp_Object val;
6537 if (SYMBOLP (default_coding_system))
6538 XSETSTRING (default_coding_system, XSYMBOL (default_coding_system)->name);
6539 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6540 Qt, Qnil, Qcoding_system_history,
6541 default_coding_system, Qnil);
6542 return (XSTRING (val)->size == 0 ? Qnil : Fintern (val, Qnil));
6543 }
6544
6545 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
6546 1, 1, 0,
6547 doc: /* Check validity of CODING-SYSTEM.
6548 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error. */)
6549 (coding_system)
6550 Lisp_Object coding_system;
6551 {
6552 CHECK_SYMBOL (coding_system);
6553 if (!NILP (Fcoding_system_p (coding_system)))
6554 return coding_system;
6555 while (1)
6556 Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
6557 }
6558
6559 \f
6560 /* Detect how the bytes at SRC of length SRC_BYTES are encoded. If
6561 HIGHEST is nonzero, return the coding system of the highest
6562 priority among the detected coding systems. Otherwize return a
6563 list of detected coding systems sorted by their priorities. If
6564 MULTIBYTEP is nonzero, it is assumed that the bytes are in correct
6565 multibyte form but contains only ASCII and eight-bit chars.
6566 Otherwise, the bytes are raw bytes.
6567
6568 CODING-SYSTEM controls the detection as below:
6569
6570 If it is nil, detect both text-format and eol-format. If the
6571 text-format part of CODING-SYSTEM is already specified
6572 (e.g. `iso-latin-1'), detect only eol-format. If the eol-format
6573 part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
6574 detect only text-format. */
6575
6576 Lisp_Object
6577 detect_coding_system (src, src_bytes, highest, multibytep, coding_system)
6578 unsigned char *src;
6579 int src_bytes, highest;
6580 int multibytep;
6581 Lisp_Object coding_system;
6582 {
6583 unsigned char *src_end = src + src_bytes;
6584 int mask = CATEGORY_MASK_ANY;
6585 Lisp_Object attrs, eol_type;
6586 Lisp_Object val;
6587 struct coding_system coding;
6588 int id;
6589 struct coding_detection_info detect_info;
6590
6591 if (NILP (coding_system))
6592 coding_system = Qundecided;
6593 setup_coding_system (coding_system, &coding);
6594 attrs = CODING_ID_ATTRS (coding.id);
6595 eol_type = CODING_ID_EOL_TYPE (coding.id);
6596 coding_system = CODING_ATTR_BASE_NAME (attrs);
6597
6598 coding.source = src;
6599 coding.src_bytes = src_bytes;
6600 coding.src_multibyte = multibytep;
6601 coding.consumed = 0;
6602 coding.mode |= CODING_MODE_LAST_BLOCK;
6603
6604 detect_info.checked = detect_info.found = detect_info.rejected = 0;
6605
6606 /* At first, detect text-format if necessary. */
6607 if (XINT (CODING_ATTR_CATEGORY (attrs)) == coding_category_undecided)
6608 {
6609 enum coding_category category;
6610 struct coding_system *this;
6611 int c, i;
6612
6613 for (; src < src_end; src++)
6614 {
6615 c = *src;
6616 if (c & 0x80
6617 || (c < 0x20 && (c == ISO_CODE_ESC
6618 || c == ISO_CODE_SI
6619 || c == ISO_CODE_SO)))
6620 break;
6621 }
6622 coding.head_ascii = src - coding.source;
6623
6624 if (src < src_end)
6625 for (i = 0; i < coding_category_raw_text; i++)
6626 {
6627 category = coding_priorities[i];
6628 this = coding_categories + category;
6629
6630 if (this->id < 0)
6631 {
6632 /* No coding system of this category is defined. */
6633 detect_info.rejected |= (1 << category);
6634 }
6635 else if (category >= coding_category_raw_text)
6636 continue;
6637 else if (detect_info.checked & (1 << category))
6638 {
6639 if (highest
6640 && (detect_info.found & (1 << category)))
6641 break;
6642 }
6643 else
6644 {
6645 if ((*(this->detector)) (&coding, &detect_info)
6646 && highest
6647 && (detect_info.found & (1 << category)))
6648 break;
6649 }
6650 }
6651
6652
6653 if (detect_info.rejected == CATEGORY_MASK_ANY)
6654 {
6655 detect_info.found = CATEGORY_MASK_RAW_TEXT;
6656 id = coding_categories[coding_category_raw_text].id;
6657 val = Fcons (make_number (id), Qnil);
6658 }
6659 else if (! detect_info.rejected && ! detect_info.found)
6660 {
6661 detect_info.found = CATEGORY_MASK_ANY;
6662 id = coding_categories[coding_category_undecided].id;
6663 val = Fcons (make_number (id), Qnil);
6664 }
6665 else if (highest)
6666 {
6667 if (detect_info.found)
6668 {
6669 detect_info.found = 1 << category;
6670 val = Fcons (make_number (this->id), Qnil);
6671 }
6672 else
6673 for (i = 0; i < coding_category_raw_text; i++)
6674 if (! (detect_info.rejected & (1 << coding_priorities[i])))
6675 {
6676 detect_info.found = 1 << coding_priorities[i];
6677 id = coding_categories[coding_priorities[i]].id;
6678 val = Fcons (make_number (id), Qnil);
6679 break;
6680 }
6681 }
6682 else
6683 {
6684 int mask = detect_info.rejected | detect_info.found;
6685 int found = 0;
6686 val = Qnil;
6687
6688 for (i = coding_category_raw_text - 1; i >= 0; i--)
6689 {
6690 category = coding_priorities[i];
6691 if (! (mask & (1 << category)))
6692 {
6693 found |= 1 << category;
6694 id = coding_categories[category].id;
6695 val = Fcons (make_number (id), val);
6696 }
6697 }
6698 for (i = coding_category_raw_text - 1; i >= 0; i--)
6699 {
6700 category = coding_priorities[i];
6701 if (detect_info.found & (1 << category))
6702 {
6703 id = coding_categories[category].id;
6704 val = Fcons (make_number (id), val);
6705 }
6706 }
6707 detect_info.found |= found;
6708 }
6709 }
6710 else
6711 {
6712 detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
6713 val = Fcons (make_number (coding.id), Qnil);
6714 }
6715
6716 /* Then, detect eol-format if necessary. */
6717 {
6718 int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol;
6719 Lisp_Object tail;
6720
6721 if (VECTORP (eol_type))
6722 {
6723 if (detect_info.found & ~CATEGORY_MASK_UTF_16)
6724 normal_eol = detect_eol (coding.source, src_bytes,
6725 coding_category_raw_text);
6726 if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
6727 | CATEGORY_MASK_UTF_16_BE_NOSIG))
6728 utf_16_be_eol = detect_eol (coding.source, src_bytes,
6729 coding_category_utf_16_be);
6730 if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
6731 | CATEGORY_MASK_UTF_16_LE_NOSIG))
6732 utf_16_le_eol = detect_eol (coding.source, src_bytes,
6733 coding_category_utf_16_le);
6734 }
6735 else
6736 {
6737 if (EQ (eol_type, Qunix))
6738 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
6739 else if (EQ (eol_type, Qdos))
6740 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
6741 else
6742 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
6743 }
6744
6745 for (tail = val; CONSP (tail); tail = XCDR (tail))
6746 {
6747 enum coding_category category;
6748 int this_eol;
6749
6750 id = XINT (XCAR (tail));
6751 attrs = CODING_ID_ATTRS (id);
6752 category = XINT (CODING_ATTR_CATEGORY (attrs));
6753 eol_type = CODING_ID_EOL_TYPE (id);
6754 if (VECTORP (eol_type))
6755 {
6756 if (category == coding_category_utf_16_be
6757 || category == coding_category_utf_16_be_nosig)
6758 this_eol = utf_16_be_eol;
6759 else if (category == coding_category_utf_16_le
6760 || category == coding_category_utf_16_le_nosig)
6761 this_eol = utf_16_le_eol;
6762 else
6763 this_eol = normal_eol;
6764
6765 if (this_eol == EOL_SEEN_LF)
6766 XSETCAR (tail, AREF (eol_type, 0));
6767 else if (this_eol == EOL_SEEN_CRLF)
6768 XSETCAR (tail, AREF (eol_type, 1));
6769 else if (this_eol == EOL_SEEN_CR)
6770 XSETCAR (tail, AREF (eol_type, 2));
6771 else
6772 XSETCAR (tail, CODING_ID_NAME (id));
6773 }
6774 else
6775 XSETCAR (tail, CODING_ID_NAME (id));
6776 }
6777 }
6778
6779 return (highest ? XCAR (val) : val);
6780 }
6781
6782
6783 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
6784 2, 3, 0,
6785 doc: /* Detect coding system of the text in the region between START and END.
6786 Return a list of possible coding systems ordered by priority.
6787
6788 If only ASCII characters are found, it returns a list of single element
6789 `undecided' or its subsidiary coding system according to a detected
6790 end-of-line format.
6791
6792 If optional argument HIGHEST is non-nil, return the coding system of
6793 highest priority. */)
6794 (start, end, highest)
6795 Lisp_Object start, end, highest;
6796 {
6797 int from, to;
6798 int from_byte, to_byte;
6799
6800 CHECK_NUMBER_COERCE_MARKER (start);
6801 CHECK_NUMBER_COERCE_MARKER (end);
6802
6803 validate_region (&start, &end);
6804 from = XINT (start), to = XINT (end);
6805 from_byte = CHAR_TO_BYTE (from);
6806 to_byte = CHAR_TO_BYTE (to);
6807
6808 if (from < GPT && to >= GPT)
6809 move_gap_both (to, to_byte);
6810
6811 return detect_coding_system (BYTE_POS_ADDR (from_byte),
6812 to_byte - from_byte,
6813 !NILP (highest),
6814 !NILP (current_buffer
6815 ->enable_multibyte_characters),
6816 Qnil);
6817 }
6818
6819 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
6820 1, 2, 0,
6821 doc: /* Detect coding system of the text in STRING.
6822 Return a list of possible coding systems ordered by priority.
6823
6824 If only ASCII characters are found, it returns a list of single element
6825 `undecided' or its subsidiary coding system according to a detected
6826 end-of-line format.
6827
6828 If optional argument HIGHEST is non-nil, return the coding system of
6829 highest priority. */)
6830 (string, highest)
6831 Lisp_Object string, highest;
6832 {
6833 CHECK_STRING (string);
6834
6835 return detect_coding_system (XSTRING (string)->data,
6836 STRING_BYTES (XSTRING (string)),
6837 !NILP (highest),
6838 STRING_MULTIBYTE (string),
6839 Qnil);
6840 }
6841
6842
6843 static INLINE int
6844 char_encodable_p (c, attrs)
6845 int c;
6846 Lisp_Object attrs;
6847 {
6848 Lisp_Object tail;
6849 struct charset *charset;
6850
6851 for (tail = CODING_ATTR_CHARSET_LIST (attrs);
6852 CONSP (tail); tail = XCDR (tail))
6853 {
6854 charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
6855 if (CHAR_CHARSET_P (c, charset))
6856 break;
6857 }
6858 return (! NILP (tail));
6859 }
6860
6861
6862 /* Return a list of coding systems that safely encode the text between
6863 START and END. If EXCLUDE is non-nil, it is a list of coding
6864 systems not to check. The returned list doesn't contain any such
6865 coding systems. In any case, if the text contains only ASCII or is
6866 unibyte, return t. */
6867
6868 DEFUN ("find-coding-systems-region-internal",
6869 Ffind_coding_systems_region_internal,
6870 Sfind_coding_systems_region_internal, 2, 3, 0,
6871 doc: /* Internal use only. */)
6872 (start, end, exclude)
6873 Lisp_Object start, end, exclude;
6874 {
6875 Lisp_Object coding_attrs_list, safe_codings;
6876 EMACS_INT start_byte, end_byte;
6877 const unsigned char *p, *pbeg, *pend;
6878 int c;
6879 Lisp_Object tail, elt;
6880
6881 if (STRINGP (start))
6882 {
6883 if (!STRING_MULTIBYTE (start)
6884 || XSTRING (start)->size == STRING_BYTES (XSTRING (start)))
6885 return Qt;
6886 start_byte = 0;
6887 end_byte = STRING_BYTES (XSTRING (start));
6888 }
6889 else
6890 {
6891 CHECK_NUMBER_COERCE_MARKER (start);
6892 CHECK_NUMBER_COERCE_MARKER (end);
6893 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
6894 args_out_of_range (start, end);
6895 if (NILP (current_buffer->enable_multibyte_characters))
6896 return Qt;
6897 start_byte = CHAR_TO_BYTE (XINT (start));
6898 end_byte = CHAR_TO_BYTE (XINT (end));
6899 if (XINT (end) - XINT (start) == end_byte - start_byte)
6900 return Qt;
6901
6902 if (XINT (start) < GPT && XINT (end) > GPT)
6903 {
6904 if ((GPT - XINT (start)) < (XINT (end) - GPT))
6905 move_gap_both (XINT (start), start_byte);
6906 else
6907 move_gap_both (XINT (end), end_byte);
6908 }
6909 }
6910
6911 coding_attrs_list = Qnil;
6912 for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
6913 if (NILP (exclude)
6914 || NILP (Fmemq (XCAR (tail), exclude)))
6915 {
6916 Lisp_Object attrs;
6917
6918 attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
6919 if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs))
6920 && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
6921 coding_attrs_list = Fcons (attrs, coding_attrs_list);
6922 }
6923
6924 if (STRINGP (start))
6925 p = pbeg = XSTRING (start)->data;
6926 else
6927 p = pbeg = BYTE_POS_ADDR (start_byte);
6928 pend = p + (end_byte - start_byte);
6929
6930 while (p < pend && ASCII_BYTE_P (*p)) p++;
6931 while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
6932
6933 while (p < pend)
6934 {
6935 if (ASCII_BYTE_P (*p))
6936 p++;
6937 else
6938 {
6939 c = STRING_CHAR_ADVANCE (p);
6940
6941 charset_map_loaded = 0;
6942 for (tail = coding_attrs_list; CONSP (tail);)
6943 {
6944 elt = XCAR (tail);
6945 if (NILP (elt))
6946 tail = XCDR (tail);
6947 else if (char_encodable_p (c, elt))
6948 tail = XCDR (tail);
6949 else if (CONSP (XCDR (tail)))
6950 {
6951 XSETCAR (tail, XCAR (XCDR (tail)));
6952 XSETCDR (tail, XCDR (XCDR (tail)));
6953 }
6954 else
6955 {
6956 XSETCAR (tail, Qnil);
6957 tail = XCDR (tail);
6958 }
6959 }
6960 if (charset_map_loaded)
6961 {
6962 EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
6963
6964 if (STRINGP (start))
6965 pbeg = XSTRING (start)->data;
6966 else
6967 pbeg = BYTE_POS_ADDR (start_byte);
6968 p = pbeg + p_offset;
6969 pend = pbeg + pend_offset;
6970 }
6971 }
6972 }
6973
6974 safe_codings = Qnil;
6975 for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
6976 if (! NILP (XCAR (tail)))
6977 safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
6978
6979 return safe_codings;
6980 }
6981
6982
6983 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
6984 Scheck_coding_systems_region, 3, 3, 0,
6985 doc: /* Check if the region is encodable by coding systems.
6986
6987 START and END are buffer positions specifying the region.
6988 CODING-SYSTEM-LIST is a list of coding systems to check.
6989
6990 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
6991 CODING-SYSTEM is a member of CODING-SYSTEM-LIst and can't encode the
6992 whole region, POS0, POS1, ... are buffer positions where non-encodable
6993 characters are found.
6994
6995 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
6996 value is nil.
6997
6998 START may be a string. In that case, check if the string is
6999 encodable, and the value contains indices to the string instead of
7000 buffer positions. END is ignored. */)
7001 (start, end, coding_system_list)
7002 Lisp_Object start, end, coding_system_list;
7003 {
7004 Lisp_Object list;
7005 EMACS_INT start_byte, end_byte;
7006 int pos;
7007 const unsigned char *p, *pbeg, *pend;
7008 int c;
7009 Lisp_Object tail, elt;
7010
7011 if (STRINGP (start))
7012 {
7013 if (!STRING_MULTIBYTE (start)
7014 && XSTRING (start)->size != STRING_BYTES (XSTRING (start)))
7015 return Qnil;
7016 start_byte = 0;
7017 end_byte = STRING_BYTES (XSTRING (start));
7018 pos = 0;
7019 }
7020 else
7021 {
7022 CHECK_NUMBER_COERCE_MARKER (start);
7023 CHECK_NUMBER_COERCE_MARKER (end);
7024 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
7025 args_out_of_range (start, end);
7026 if (NILP (current_buffer->enable_multibyte_characters))
7027 return Qnil;
7028 start_byte = CHAR_TO_BYTE (XINT (start));
7029 end_byte = CHAR_TO_BYTE (XINT (end));
7030 if (XINT (end) - XINT (start) == end_byte - start_byte)
7031 return Qt;
7032
7033 if (XINT (start) < GPT && XINT (end) > GPT)
7034 {
7035 if ((GPT - XINT (start)) < (XINT (end) - GPT))
7036 move_gap_both (XINT (start), start_byte);
7037 else
7038 move_gap_both (XINT (end), end_byte);
7039 }
7040 pos = XINT (start);
7041 }
7042
7043 list = Qnil;
7044 for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
7045 {
7046 elt = XCAR (tail);
7047 list = Fcons (Fcons (elt, Fcons (AREF (CODING_SYSTEM_SPEC (elt), 0),
7048 Qnil)),
7049 list);
7050 }
7051
7052 if (STRINGP (start))
7053 p = pbeg = XSTRING (start)->data;
7054 else
7055 p = pbeg = BYTE_POS_ADDR (start_byte);
7056 pend = p + (end_byte - start_byte);
7057
7058 while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
7059 while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
7060
7061 while (p < pend)
7062 {
7063 if (ASCII_BYTE_P (*p))
7064 p++;
7065 else
7066 {
7067 c = STRING_CHAR_ADVANCE (p);
7068
7069 charset_map_loaded = 0;
7070 for (tail = list; CONSP (tail); tail = XCDR (tail))
7071 {
7072 elt = XCDR (XCAR (tail));
7073 if (! char_encodable_p (c, XCAR (elt)))
7074 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
7075 }
7076 if (charset_map_loaded)
7077 {
7078 EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
7079
7080 if (STRINGP (start))
7081 pbeg = XSTRING (start)->data;
7082 else
7083 pbeg = BYTE_POS_ADDR (start_byte);
7084 p = pbeg + p_offset;
7085 pend = pbeg + pend_offset;
7086 }
7087 }
7088 pos++;
7089 }
7090
7091 tail = list;
7092 list = Qnil;
7093 for (; CONSP (tail); tail = XCDR (tail))
7094 {
7095 elt = XCAR (tail);
7096 if (CONSP (XCDR (XCDR (elt))))
7097 list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
7098 list);
7099 }
7100
7101 return list;
7102 }
7103
7104
7105
7106 Lisp_Object
7107 code_convert_region (start, end, coding_system, dst_object, encodep, norecord)
7108 Lisp_Object start, end, coding_system, dst_object;
7109 int encodep, norecord;
7110 {
7111 struct coding_system coding;
7112 EMACS_INT from, from_byte, to, to_byte;
7113 Lisp_Object src_object;
7114
7115 CHECK_NUMBER_COERCE_MARKER (start);
7116 CHECK_NUMBER_COERCE_MARKER (end);
7117 if (NILP (coding_system))
7118 coding_system = Qno_conversion;
7119 else
7120 CHECK_CODING_SYSTEM (coding_system);
7121 src_object = Fcurrent_buffer ();
7122 if (NILP (dst_object))
7123 dst_object = src_object;
7124 else if (! EQ (dst_object, Qt))
7125 CHECK_BUFFER (dst_object);
7126
7127 validate_region (&start, &end);
7128 from = XFASTINT (start);
7129 from_byte = CHAR_TO_BYTE (from);
7130 to = XFASTINT (end);
7131 to_byte = CHAR_TO_BYTE (to);
7132
7133 setup_coding_system (coding_system, &coding);
7134 coding.mode |= CODING_MODE_LAST_BLOCK;
7135
7136 if (encodep)
7137 encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
7138 dst_object);
7139 else
7140 decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
7141 dst_object);
7142 if (! norecord)
7143 Vlast_coding_system_used = CODING_ID_NAME (coding.id);
7144
7145 if (coding.result != CODING_RESULT_SUCCESS)
7146 error ("Code conversion error: %d", coding.result);
7147
7148 return (BUFFERP (dst_object)
7149 ? make_number (coding.produced_char)
7150 : coding.dst_object);
7151 }
7152
7153
7154 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
7155 3, 4, "r\nzCoding system: ",
7156 doc: /* Decode the current region from the specified coding system.
7157 When called from a program, takes four arguments:
7158 START, END, CODING-SYSTEM, and DESTINATION.
7159 START and END are buffer positions.
7160
7161 Optional 4th arguments DESTINATION specifies where the decoded text goes.
7162 If nil, the region between START and END is replace by the decoded text.
7163 If buffer, the decoded text is inserted in the buffer.
7164 If t, the decoded text is returned.
7165
7166 This function sets `last-coding-system-used' to the precise coding system
7167 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7168 not fully specified.)
7169 It returns the length of the decoded text. */)
7170 (start, end, coding_system, destination)
7171 Lisp_Object start, end, coding_system, destination;
7172 {
7173 return code_convert_region (start, end, coding_system, destination, 0, 0);
7174 }
7175
7176 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
7177 3, 4, "r\nzCoding system: ",
7178 doc: /* Encode the current region by specified coding system.
7179 When called from a program, takes three arguments:
7180 START, END, and CODING-SYSTEM. START and END are buffer positions.
7181
7182 Optional 4th arguments DESTINATION specifies where the encoded text goes.
7183 If nil, the region between START and END is replace by the encoded text.
7184 If buffer, the encoded text is inserted in the buffer.
7185 If t, the encoded text is returned.
7186
7187 This function sets `last-coding-system-used' to the precise coding system
7188 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7189 not fully specified.)
7190 It returns the length of the encoded text. */)
7191 (start, end, coding_system, destination)
7192 Lisp_Object start, end, coding_system, destination;
7193 {
7194 return code_convert_region (start, end, coding_system, destination, 1, 0);
7195 }
7196
7197 Lisp_Object
7198 code_convert_string (string, coding_system, dst_object,
7199 encodep, nocopy, norecord)
7200 Lisp_Object string, coding_system, dst_object;
7201 int encodep, nocopy, norecord;
7202 {
7203 struct coding_system coding;
7204 EMACS_INT chars, bytes;
7205
7206 CHECK_STRING (string);
7207 if (NILP (coding_system))
7208 {
7209 if (! norecord)
7210 Vlast_coding_system_used = Qno_conversion;
7211 if (NILP (dst_object))
7212 return (nocopy ? Fcopy_sequence (string) : string);
7213 }
7214
7215 if (NILP (coding_system))
7216 coding_system = Qno_conversion;
7217 else
7218 CHECK_CODING_SYSTEM (coding_system);
7219 if (NILP (dst_object))
7220 dst_object = Qt;
7221 else if (! EQ (dst_object, Qt))
7222 CHECK_BUFFER (dst_object);
7223
7224 setup_coding_system (coding_system, &coding);
7225 coding.mode |= CODING_MODE_LAST_BLOCK;
7226 chars = XSTRING (string)->size;
7227 bytes = STRING_BYTES (XSTRING (string));
7228 if (encodep)
7229 encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
7230 else
7231 decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
7232 if (! norecord)
7233 Vlast_coding_system_used = CODING_ID_NAME (coding.id);
7234
7235 if (coding.result != CODING_RESULT_SUCCESS)
7236 error ("Code conversion error: %d", coding.result);
7237
7238 return (BUFFERP (dst_object)
7239 ? make_number (coding.produced_char)
7240 : coding.dst_object);
7241 }
7242
7243
7244 /* Encode or decode STRING according to CODING_SYSTEM.
7245 Do not set Vlast_coding_system_used.
7246
7247 This function is called only from macros DECODE_FILE and
7248 ENCODE_FILE, thus we ignore character composition. */
7249
7250 Lisp_Object
7251 code_convert_string_norecord (string, coding_system, encodep)
7252 Lisp_Object string, coding_system;
7253 int encodep;
7254 {
7255 return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
7256 }
7257
7258
7259 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
7260 2, 4, 0,
7261 doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
7262
7263 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
7264 if the decoding operation is trivial.
7265
7266 Optional fourth arg BUFFER non-nil meant that the decoded text is
7267 inserted in BUFFER instead of returned as a string. In this case,
7268 the return value is BUFFER.
7269
7270 This function sets `last-coding-system-used' to the precise coding system
7271 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7272 not fully specified. */)
7273 (string, coding_system, nocopy, buffer)
7274 Lisp_Object string, coding_system, nocopy, buffer;
7275 {
7276 return code_convert_string (string, coding_system, buffer,
7277 0, ! NILP (nocopy), 0);
7278 }
7279
7280 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
7281 2, 4, 0,
7282 doc: /* Encode STRING to CODING-SYSTEM, and return the result.
7283
7284 Optional third arg NOCOPY non-nil means it is OK to return STRING
7285 itself if the encoding operation is trivial.
7286
7287 Optional fourth arg BUFFER non-nil meant that the encoded text is
7288 inserted in BUFFER instead of returned as a string. In this case,
7289 the return value is BUFFER.
7290
7291 This function sets `last-coding-system-used' to the precise coding system
7292 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7293 not fully specified.) */)
7294 (string, coding_system, nocopy, buffer)
7295 Lisp_Object string, coding_system, nocopy, buffer;
7296 {
7297 return code_convert_string (string, coding_system, buffer,
7298 1, ! NILP (nocopy), 1);
7299 }
7300
7301 \f
7302 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
7303 doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
7304 Return the corresponding character. */)
7305 (code)
7306 Lisp_Object code;
7307 {
7308 Lisp_Object spec, attrs, val;
7309 struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
7310 int c;
7311
7312 CHECK_NATNUM (code);
7313 c = XFASTINT (code);
7314 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
7315 attrs = AREF (spec, 0);
7316
7317 if (ASCII_BYTE_P (c)
7318 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
7319 return code;
7320
7321 val = CODING_ATTR_CHARSET_LIST (attrs);
7322 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
7323 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
7324 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
7325
7326 if (c <= 0x7F)
7327 charset = charset_roman;
7328 else if (c >= 0xA0 && c < 0xDF)
7329 {
7330 charset = charset_kana;
7331 c -= 0x80;
7332 }
7333 else
7334 {
7335 int s1 = c >> 8, s2 = c & 0xFF;
7336
7337 if (s1 < 0x81 || (s1 > 0x9F && s1 < 0xE0) || s1 > 0xEF
7338 || s2 < 0x40 || s2 == 0x7F || s2 > 0xFC)
7339 error ("Invalid code: %d", code);
7340 SJIS_TO_JIS (c);
7341 charset = charset_kanji;
7342 }
7343 c = DECODE_CHAR (charset, c);
7344 if (c < 0)
7345 error ("Invalid code: %d", code);
7346 return make_number (c);
7347 }
7348
7349
7350 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
7351 doc: /* Encode a Japanese character CHAR to shift_jis encoding.
7352 Return the corresponding code in SJIS. */)
7353 (ch)
7354 Lisp_Object ch;
7355 {
7356 Lisp_Object spec, attrs, charset_list;
7357 int c;
7358 struct charset *charset;
7359 unsigned code;
7360
7361 CHECK_CHARACTER (ch);
7362 c = XFASTINT (ch);
7363 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
7364 attrs = AREF (spec, 0);
7365
7366 if (ASCII_CHAR_P (c)
7367 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
7368 return ch;
7369
7370 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
7371 charset = char_charset (c, charset_list, &code);
7372 if (code == CHARSET_INVALID_CODE (charset))
7373 error ("Can't encode by shift_jis encoding: %d", c);
7374 JIS_TO_SJIS (code);
7375
7376 return make_number (code);
7377 }
7378
7379 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
7380 doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
7381 Return the corresponding character. */)
7382 (code)
7383 Lisp_Object code;
7384 {
7385 Lisp_Object spec, attrs, val;
7386 struct charset *charset_roman, *charset_big5, *charset;
7387 int c;
7388
7389 CHECK_NATNUM (code);
7390 c = XFASTINT (code);
7391 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
7392 attrs = AREF (spec, 0);
7393
7394 if (ASCII_BYTE_P (c)
7395 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
7396 return code;
7397
7398 val = CODING_ATTR_CHARSET_LIST (attrs);
7399 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
7400 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
7401
7402 if (c <= 0x7F)
7403 charset = charset_roman;
7404 else
7405 {
7406 int b1 = c >> 8, b2 = c & 0x7F;
7407 if (b1 < 0xA1 || b1 > 0xFE
7408 || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
7409 error ("Invalid code: %d", code);
7410 charset = charset_big5;
7411 }
7412 c = DECODE_CHAR (charset, (unsigned )c);
7413 if (c < 0)
7414 error ("Invalid code: %d", code);
7415 return make_number (c);
7416 }
7417
7418 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
7419 doc: /* Encode the Big5 character CHAR to BIG5 coding system.
7420 Return the corresponding character code in Big5. */)
7421 (ch)
7422 Lisp_Object ch;
7423 {
7424 Lisp_Object spec, attrs, charset_list;
7425 struct charset *charset;
7426 int c;
7427 unsigned code;
7428
7429 CHECK_CHARACTER (ch);
7430 c = XFASTINT (ch);
7431 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
7432 attrs = AREF (spec, 0);
7433 if (ASCII_CHAR_P (c)
7434 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
7435 return ch;
7436
7437 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
7438 charset = char_charset (c, charset_list, &code);
7439 if (code == CHARSET_INVALID_CODE (charset))
7440 error ("Can't encode by Big5 encoding: %d", c);
7441
7442 return make_number (code);
7443 }
7444
7445 \f
7446 DEFUN ("set-terminal-coding-system-internal",
7447 Fset_terminal_coding_system_internal,
7448 Sset_terminal_coding_system_internal, 1, 1, 0,
7449 doc: /* Internal use only. */)
7450 (coding_system)
7451 Lisp_Object coding_system;
7452 {
7453 CHECK_SYMBOL (coding_system);
7454 setup_coding_system (Fcheck_coding_system (coding_system),
7455 &terminal_coding);
7456
7457 /* We had better not send unsafe characters to terminal. */
7458 terminal_coding.mode |= CODING_MODE_SAFE_ENCODING;
7459 /* Characer composition should be disabled. */
7460 terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7461 terminal_coding.src_multibyte = 1;
7462 terminal_coding.dst_multibyte = 0;
7463 return Qnil;
7464 }
7465
7466 DEFUN ("set-safe-terminal-coding-system-internal",
7467 Fset_safe_terminal_coding_system_internal,
7468 Sset_safe_terminal_coding_system_internal, 1, 1, 0,
7469 doc: /* Internal use only. */)
7470 (coding_system)
7471 Lisp_Object coding_system;
7472 {
7473 CHECK_SYMBOL (coding_system);
7474 setup_coding_system (Fcheck_coding_system (coding_system),
7475 &safe_terminal_coding);
7476 /* Characer composition should be disabled. */
7477 safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7478 safe_terminal_coding.src_multibyte = 1;
7479 safe_terminal_coding.dst_multibyte = 0;
7480 return Qnil;
7481 }
7482
7483 DEFUN ("terminal-coding-system",
7484 Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
7485 doc: /* Return coding system specified for terminal output. */)
7486 ()
7487 {
7488 return CODING_ID_NAME (terminal_coding.id);
7489 }
7490
7491 DEFUN ("set-keyboard-coding-system-internal",
7492 Fset_keyboard_coding_system_internal,
7493 Sset_keyboard_coding_system_internal, 1, 1, 0,
7494 doc: /* Internal use only. */)
7495 (coding_system)
7496 Lisp_Object coding_system;
7497 {
7498 CHECK_SYMBOL (coding_system);
7499 setup_coding_system (Fcheck_coding_system (coding_system),
7500 &keyboard_coding);
7501 /* Characer composition should be disabled. */
7502 keyboard_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7503 return Qnil;
7504 }
7505
7506 DEFUN ("keyboard-coding-system",
7507 Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
7508 doc: /* Return coding system specified for decoding keyboard input. */)
7509 ()
7510 {
7511 return CODING_ID_NAME (keyboard_coding.id);
7512 }
7513
7514 \f
7515 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
7516 Sfind_operation_coding_system, 1, MANY, 0,
7517 doc: /* Choose a coding system for an operation based on the target name.
7518 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
7519 DECODING-SYSTEM is the coding system to use for decoding
7520 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
7521 for encoding (in case OPERATION does encoding).
7522
7523 The first argument OPERATION specifies an I/O primitive:
7524 For file I/O, `insert-file-contents' or `write-region'.
7525 For process I/O, `call-process', `call-process-region', or `start-process'.
7526 For network I/O, `open-network-stream'.
7527
7528 The remaining arguments should be the same arguments that were passed
7529 to the primitive. Depending on which primitive, one of those arguments
7530 is selected as the TARGET. For example, if OPERATION does file I/O,
7531 whichever argument specifies the file name is TARGET.
7532
7533 TARGET has a meaning which depends on OPERATION:
7534 For file I/O, TARGET is a file name.
7535 For process I/O, TARGET is a process name.
7536 For network I/O, TARGET is a service name or a port number
7537
7538 This function looks up what specified for TARGET in,
7539 `file-coding-system-alist', `process-coding-system-alist',
7540 or `network-coding-system-alist' depending on OPERATION.
7541 They may specify a coding system, a cons of coding systems,
7542 or a function symbol to call.
7543 In the last case, we call the function with one argument,
7544 which is a list of all the arguments given to this function.
7545
7546 usage: (find-operation-coding-system OPERATION ARGUMENTS ...) */)
7547 (nargs, args)
7548 int nargs;
7549 Lisp_Object *args;
7550 {
7551 Lisp_Object operation, target_idx, target, val;
7552 register Lisp_Object chain;
7553
7554 if (nargs < 2)
7555 error ("Too few arguments");
7556 operation = args[0];
7557 if (!SYMBOLP (operation)
7558 || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
7559 error ("Invalid first arguement");
7560 if (nargs < 1 + XINT (target_idx))
7561 error ("Too few arguments for operation: %s",
7562 XSYMBOL (operation)->name->data);
7563 target = args[XINT (target_idx) + 1];
7564 if (!(STRINGP (target)
7565 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
7566 error ("Invalid %dth argument", XINT (target_idx) + 1);
7567
7568 chain = ((EQ (operation, Qinsert_file_contents)
7569 || EQ (operation, Qwrite_region))
7570 ? Vfile_coding_system_alist
7571 : (EQ (operation, Qopen_network_stream)
7572 ? Vnetwork_coding_system_alist
7573 : Vprocess_coding_system_alist));
7574 if (NILP (chain))
7575 return Qnil;
7576
7577 for (; CONSP (chain); chain = XCDR (chain))
7578 {
7579 Lisp_Object elt;
7580
7581 elt = XCAR (chain);
7582 if (CONSP (elt)
7583 && ((STRINGP (target)
7584 && STRINGP (XCAR (elt))
7585 && fast_string_match (XCAR (elt), target) >= 0)
7586 || (INTEGERP (target) && EQ (target, XCAR (elt)))))
7587 {
7588 val = XCDR (elt);
7589 /* Here, if VAL is both a valid coding system and a valid
7590 function symbol, we return VAL as a coding system. */
7591 if (CONSP (val))
7592 return val;
7593 if (! SYMBOLP (val))
7594 return Qnil;
7595 if (! NILP (Fcoding_system_p (val)))
7596 return Fcons (val, val);
7597 if (! NILP (Ffboundp (val)))
7598 {
7599 val = call1 (val, Flist (nargs, args));
7600 if (CONSP (val))
7601 return val;
7602 if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
7603 return Fcons (val, val);
7604 }
7605 return Qnil;
7606 }
7607 }
7608 return Qnil;
7609 }
7610
7611 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
7612 Sset_coding_system_priority, 0, MANY, 0,
7613 doc: /* Assign higher priority to the coding systems given as arguments.
7614 usage: (set-coding-system-priority CODING-SYSTEM ...) */)
7615 (nargs, args)
7616 int nargs;
7617 Lisp_Object *args;
7618 {
7619 int i, j;
7620 int changed[coding_category_max];
7621 enum coding_category priorities[coding_category_max];
7622
7623 bzero (changed, sizeof changed);
7624
7625 for (i = j = 0; i < nargs; i++)
7626 {
7627 enum coding_category category;
7628 Lisp_Object spec, attrs;
7629
7630 CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
7631 attrs = AREF (spec, 0);
7632 category = XINT (CODING_ATTR_CATEGORY (attrs));
7633 if (changed[category])
7634 /* Ignore this coding system because a coding system of the
7635 same category already had a higher priority. */
7636 continue;
7637 changed[category] = 1;
7638 priorities[j++] = category;
7639 if (coding_categories[category].id >= 0
7640 && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
7641 setup_coding_system (args[i], &coding_categories[category]);
7642 }
7643
7644 /* Now we have decided top J priorities. Reflect the order of the
7645 original priorities to the remaining priorities. */
7646
7647 for (i = j, j = 0; i < coding_category_max; i++, j++)
7648 {
7649 while (j < coding_category_max
7650 && changed[coding_priorities[j]])
7651 j++;
7652 if (j == coding_category_max)
7653 abort ();
7654 priorities[i] = coding_priorities[j];
7655 }
7656
7657 bcopy (priorities, coding_priorities, sizeof priorities);
7658 return Qnil;
7659 }
7660
7661 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
7662 Scoding_system_priority_list, 0, 1, 0,
7663 doc: /* Return a list of coding systems ordered by their priorities.
7664 HIGHESTP non-nil means just return the highest priority one. */)
7665 (highestp)
7666 Lisp_Object highestp;
7667 {
7668 int i;
7669 Lisp_Object val;
7670
7671 for (i = 0, val = Qnil; i < coding_category_max; i++)
7672 {
7673 enum coding_category category = coding_priorities[i];
7674 int id = coding_categories[category].id;
7675 Lisp_Object attrs;
7676
7677 if (id < 0)
7678 continue;
7679 attrs = CODING_ID_ATTRS (id);
7680 if (! NILP (highestp))
7681 return CODING_ATTR_BASE_NAME (attrs);
7682 val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
7683 }
7684 return Fnreverse (val);
7685 }
7686
7687 static char *suffixes[] = { "-unix", "-dos", "-mac" };
7688
7689 static Lisp_Object
7690 make_subsidiaries (base)
7691 Lisp_Object base;
7692 {
7693 Lisp_Object subsidiaries;
7694 int base_name_len = STRING_BYTES (XSYMBOL (base)->name);
7695 char *buf = (char *) alloca (base_name_len + 6);
7696 int i;
7697
7698 bcopy (XSYMBOL (base)->name->data, buf, base_name_len);
7699 subsidiaries = Fmake_vector (make_number (3), Qnil);
7700 for (i = 0; i < 3; i++)
7701 {
7702 bcopy (suffixes[i], buf + base_name_len, strlen (suffixes[i]) + 1);
7703 ASET (subsidiaries, i, intern (buf));
7704 }
7705 return subsidiaries;
7706 }
7707
7708
7709 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
7710 Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
7711 doc: /* For internal use only.
7712 usage: (define-coding-system-internal ...) */)
7713 (nargs, args)
7714 int nargs;
7715 Lisp_Object *args;
7716 {
7717 Lisp_Object name;
7718 Lisp_Object spec_vec; /* [ ATTRS ALIASE EOL_TYPE ] */
7719 Lisp_Object attrs; /* Vector of attributes. */
7720 Lisp_Object eol_type;
7721 Lisp_Object aliases;
7722 Lisp_Object coding_type, charset_list, safe_charsets;
7723 enum coding_category category;
7724 Lisp_Object tail, val;
7725 int max_charset_id = 0;
7726 int i;
7727
7728 if (nargs < coding_arg_max)
7729 goto short_args;
7730
7731 attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
7732
7733 name = args[coding_arg_name];
7734 CHECK_SYMBOL (name);
7735 CODING_ATTR_BASE_NAME (attrs) = name;
7736
7737 val = args[coding_arg_mnemonic];
7738 if (! STRINGP (val))
7739 CHECK_CHARACTER (val);
7740 CODING_ATTR_MNEMONIC (attrs) = val;
7741
7742 coding_type = args[coding_arg_coding_type];
7743 CHECK_SYMBOL (coding_type);
7744 CODING_ATTR_TYPE (attrs) = coding_type;
7745
7746 charset_list = args[coding_arg_charset_list];
7747 if (SYMBOLP (charset_list))
7748 {
7749 if (EQ (charset_list, Qiso_2022))
7750 {
7751 if (! EQ (coding_type, Qiso_2022))
7752 error ("Invalid charset-list");
7753 charset_list = Viso_2022_charset_list;
7754 }
7755 else if (EQ (charset_list, Qemacs_mule))
7756 {
7757 if (! EQ (coding_type, Qemacs_mule))
7758 error ("Invalid charset-list");
7759 charset_list = Vemacs_mule_charset_list;
7760 }
7761 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
7762 if (max_charset_id < XFASTINT (XCAR (tail)))
7763 max_charset_id = XFASTINT (XCAR (tail));
7764 }
7765 else
7766 {
7767 charset_list = Fcopy_sequence (charset_list);
7768 for (tail = charset_list; !NILP (tail); tail = Fcdr (tail))
7769 {
7770 struct charset *charset;
7771
7772 val = Fcar (tail);
7773 CHECK_CHARSET_GET_CHARSET (val, charset);
7774 if (EQ (coding_type, Qiso_2022)
7775 ? CHARSET_ISO_FINAL (charset) < 0
7776 : EQ (coding_type, Qemacs_mule)
7777 ? CHARSET_EMACS_MULE_ID (charset) < 0
7778 : 0)
7779 error ("Can't handle charset `%s'",
7780 XSYMBOL (CHARSET_NAME (charset))->name->data);
7781
7782 XCAR (tail) = make_number (charset->id);
7783 if (max_charset_id < charset->id)
7784 max_charset_id = charset->id;
7785 }
7786 }
7787 CODING_ATTR_CHARSET_LIST (attrs) = charset_list;
7788
7789 safe_charsets = Fmake_string (make_number (max_charset_id + 1),
7790 make_number (255));
7791 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
7792 XSTRING (safe_charsets)->data[XFASTINT (XCAR (tail))] = 0;
7793 CODING_ATTR_SAFE_CHARSETS (attrs) = safe_charsets;
7794
7795 CODING_ATTR_ASCII_COMPAT (attrs) = args[coding_arg_ascii_compatible_p];
7796
7797 val = args[coding_arg_decode_translation_table];
7798 if (! NILP (val))
7799 CHECK_CHAR_TABLE (val);
7800 CODING_ATTR_DECODE_TBL (attrs) = val;
7801
7802 val = args[coding_arg_encode_translation_table];
7803 if (! NILP (val))
7804 CHECK_CHAR_TABLE (val);
7805 CODING_ATTR_ENCODE_TBL (attrs) = val;
7806
7807 val = args[coding_arg_post_read_conversion];
7808 CHECK_SYMBOL (val);
7809 CODING_ATTR_POST_READ (attrs) = val;
7810
7811 val = args[coding_arg_pre_write_conversion];
7812 CHECK_SYMBOL (val);
7813 CODING_ATTR_PRE_WRITE (attrs) = val;
7814
7815 val = args[coding_arg_default_char];
7816 if (NILP (val))
7817 CODING_ATTR_DEFAULT_CHAR (attrs) = make_number (' ');
7818 else
7819 {
7820 CHECK_CHARACTER (val);
7821 CODING_ATTR_DEFAULT_CHAR (attrs) = val;
7822 }
7823
7824 val = args[coding_arg_plist];
7825 CHECK_LIST (val);
7826 CODING_ATTR_PLIST (attrs) = val;
7827
7828 if (EQ (coding_type, Qcharset))
7829 {
7830 /* Generate a lisp vector of 256 elements. Each element is nil,
7831 integer, or a list of charset IDs.
7832
7833 If Nth element is nil, the byte code N is invalid in this
7834 coding system.
7835
7836 If Nth element is a number NUM, N is the first byte of a
7837 charset whose ID is NUM.
7838
7839 If Nth element is a list of charset IDs, N is the first byte
7840 of one of them. The list is sorted by dimensions of the
7841 charsets. A charset of smaller dimension comes firtst.
7842 */
7843 val = Fmake_vector (make_number (256), Qnil);
7844
7845 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
7846 {
7847 struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
7848 int dim = CHARSET_DIMENSION (charset);
7849 int idx = (dim - 1) * 4;
7850
7851 if (CHARSET_ASCII_COMPATIBLE_P (charset))
7852 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
7853
7854 for (i = charset->code_space[idx];
7855 i <= charset->code_space[idx + 1]; i++)
7856 {
7857 Lisp_Object tmp, tmp2;
7858 int dim2;
7859
7860 tmp = AREF (val, i);
7861 if (NILP (tmp))
7862 tmp = XCAR (tail);
7863 else if (NUMBERP (tmp))
7864 {
7865 dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
7866 if (dim < dim2)
7867 tmp = Fcons (XCAR (tail), Fcons (tmp, Qnil));
7868 else
7869 tmp = Fcons (tmp, Fcons (XCAR (tail), Qnil));
7870 }
7871 else
7872 {
7873 for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
7874 {
7875 dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
7876 if (dim < dim2)
7877 break;
7878 }
7879 if (NILP (tmp2))
7880 tmp = nconc2 (tmp, Fcons (XCAR (tail), Qnil));
7881 else
7882 {
7883 XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
7884 XSETCAR (tmp2, XCAR (tail));
7885 }
7886 }
7887 ASET (val, i, tmp);
7888 }
7889 }
7890 ASET (attrs, coding_attr_charset_valids, val);
7891 category = coding_category_charset;
7892 }
7893 else if (EQ (coding_type, Qccl))
7894 {
7895 Lisp_Object valids;
7896
7897 if (nargs < coding_arg_ccl_max)
7898 goto short_args;
7899
7900 val = args[coding_arg_ccl_decoder];
7901 CHECK_CCL_PROGRAM (val);
7902 if (VECTORP (val))
7903 val = Fcopy_sequence (val);
7904 ASET (attrs, coding_attr_ccl_decoder, val);
7905
7906 val = args[coding_arg_ccl_encoder];
7907 CHECK_CCL_PROGRAM (val);
7908 if (VECTORP (val))
7909 val = Fcopy_sequence (val);
7910 ASET (attrs, coding_attr_ccl_encoder, val);
7911
7912 val = args[coding_arg_ccl_valids];
7913 valids = Fmake_string (make_number (256), make_number (0));
7914 for (tail = val; !NILP (tail); tail = Fcdr (tail))
7915 {
7916 int from, to;
7917
7918 val = Fcar (tail);
7919 if (INTEGERP (val))
7920 {
7921 from = to = XINT (val);
7922 if (from < 0 || from > 255)
7923 args_out_of_range_3 (val, make_number (0), make_number (255));
7924 }
7925 else
7926 {
7927 CHECK_CONS (val);
7928 CHECK_NUMBER (XCAR (val));
7929 CHECK_NUMBER (XCDR (val));
7930 from = XINT (XCAR (val));
7931 if (from < 0 || from > 255)
7932 args_out_of_range_3 (XCAR (val),
7933 make_number (0), make_number (255));
7934 to = XINT (XCDR (val));
7935 if (to < from || to > 255)
7936 args_out_of_range_3 (XCDR (val),
7937 XCAR (val), make_number (255));
7938 }
7939 for (i = from; i <= to; i++)
7940 XSTRING (valids)->data[i] = 1;
7941 }
7942 ASET (attrs, coding_attr_ccl_valids, valids);
7943
7944 category = coding_category_ccl;
7945 }
7946 else if (EQ (coding_type, Qutf_16))
7947 {
7948 Lisp_Object bom, endian;
7949
7950 CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
7951
7952 if (nargs < coding_arg_utf16_max)
7953 goto short_args;
7954
7955 bom = args[coding_arg_utf16_bom];
7956 if (! NILP (bom) && ! EQ (bom, Qt))
7957 {
7958 CHECK_CONS (bom);
7959 CHECK_CODING_SYSTEM (XCAR (bom));
7960 CHECK_CODING_SYSTEM (XCDR (bom));
7961 }
7962 ASET (attrs, coding_attr_utf_16_bom, bom);
7963
7964 endian = args[coding_arg_utf16_endian];
7965 ASET (attrs, coding_attr_utf_16_endian, endian);
7966
7967 category = (CONSP (bom)
7968 ? coding_category_utf_16_auto
7969 : NILP (bom)
7970 ? (NILP (endian)
7971 ? coding_category_utf_16_be_nosig
7972 : coding_category_utf_16_le_nosig)
7973 : (NILP (endian)
7974 ? coding_category_utf_16_be
7975 : coding_category_utf_16_le));
7976 }
7977 else if (EQ (coding_type, Qiso_2022))
7978 {
7979 Lisp_Object initial, reg_usage, request, flags;
7980 int i, id;
7981
7982 if (nargs < coding_arg_iso2022_max)
7983 goto short_args;
7984
7985 initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
7986 CHECK_VECTOR (initial);
7987 for (i = 0; i < 4; i++)
7988 {
7989 val = Faref (initial, make_number (i));
7990 if (! NILP (val))
7991 {
7992 struct charset *charset;
7993
7994 CHECK_CHARSET_GET_CHARSET (val, charset);
7995 ASET (initial, i, make_number (CHARSET_ID (charset)));
7996 if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
7997 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
7998 }
7999 else
8000 ASET (initial, i, make_number (-1));
8001 }
8002
8003 reg_usage = args[coding_arg_iso2022_reg_usage];
8004 CHECK_CONS (reg_usage);
8005 CHECK_NATNUM (XCAR (reg_usage));
8006 CHECK_NATNUM (XCDR (reg_usage));
8007
8008 request = Fcopy_sequence (args[coding_arg_iso2022_request]);
8009 for (tail = request; ! NILP (tail); tail = Fcdr (tail))
8010 {
8011 int id;
8012
8013 val = Fcar (tail);
8014 CHECK_CONS (val);
8015 CHECK_CHARSET_GET_ID (XCAR (val), id);
8016 CHECK_NATNUM (XCDR (val));
8017 if (XINT (XCDR (val)) >= 4)
8018 error ("Invalid graphic register number: %d", XINT (XCDR (val)));
8019 XCAR (val) = make_number (id);
8020 }
8021
8022 flags = args[coding_arg_iso2022_flags];
8023 CHECK_NATNUM (flags);
8024 i = XINT (flags);
8025 if (EQ (args[coding_arg_charset_list], Qiso_2022))
8026 flags = make_number (i | CODING_ISO_FLAG_FULL_SUPPORT);
8027
8028 ASET (attrs, coding_attr_iso_initial, initial);
8029 ASET (attrs, coding_attr_iso_usage, reg_usage);
8030 ASET (attrs, coding_attr_iso_request, request);
8031 ASET (attrs, coding_attr_iso_flags, flags);
8032 setup_iso_safe_charsets (attrs);
8033
8034 if (i & CODING_ISO_FLAG_SEVEN_BITS)
8035 category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
8036 | CODING_ISO_FLAG_SINGLE_SHIFT))
8037 ? coding_category_iso_7_else
8038 : EQ (args[coding_arg_charset_list], Qiso_2022)
8039 ? coding_category_iso_7
8040 : coding_category_iso_7_tight);
8041 else
8042 {
8043 int id = XINT (AREF (initial, 1));
8044
8045 category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
8046 || EQ (args[coding_arg_charset_list], Qiso_2022)
8047 || id < 0)
8048 ? coding_category_iso_8_else
8049 : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
8050 ? coding_category_iso_8_1
8051 : coding_category_iso_8_2);
8052 }
8053 if (category != coding_category_iso_8_1
8054 && category != coding_category_iso_8_2)
8055 CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
8056 }
8057 else if (EQ (coding_type, Qemacs_mule))
8058 {
8059 if (EQ (args[coding_arg_charset_list], Qemacs_mule))
8060 ASET (attrs, coding_attr_emacs_mule_full, Qt);
8061 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8062 category = coding_category_emacs_mule;
8063 }
8064 else if (EQ (coding_type, Qshift_jis))
8065 {
8066
8067 struct charset *charset;
8068
8069 if (XINT (Flength (charset_list)) != 3)
8070 error ("There should be just three charsets");
8071
8072 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8073 if (CHARSET_DIMENSION (charset) != 1)
8074 error ("Dimension of charset %s is not one",
8075 XSYMBOL (CHARSET_NAME (charset))->name->data);
8076 if (CHARSET_ASCII_COMPATIBLE_P (charset))
8077 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8078
8079 charset_list = XCDR (charset_list);
8080 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8081 if (CHARSET_DIMENSION (charset) != 1)
8082 error ("Dimension of charset %s is not one",
8083 XSYMBOL (CHARSET_NAME (charset))->name->data);
8084
8085 charset_list = XCDR (charset_list);
8086 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8087 if (CHARSET_DIMENSION (charset) != 2)
8088 error ("Dimension of charset %s is not two",
8089 XSYMBOL (CHARSET_NAME (charset))->name->data);
8090
8091 category = coding_category_sjis;
8092 Vsjis_coding_system = name;
8093 }
8094 else if (EQ (coding_type, Qbig5))
8095 {
8096 struct charset *charset;
8097
8098 if (XINT (Flength (charset_list)) != 2)
8099 error ("There should be just two charsets");
8100
8101 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8102 if (CHARSET_DIMENSION (charset) != 1)
8103 error ("Dimension of charset %s is not one",
8104 XSYMBOL (CHARSET_NAME (charset))->name->data);
8105 if (CHARSET_ASCII_COMPATIBLE_P (charset))
8106 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8107
8108 charset_list = XCDR (charset_list);
8109 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8110 if (CHARSET_DIMENSION (charset) != 2)
8111 error ("Dimension of charset %s is not two",
8112 XSYMBOL (CHARSET_NAME (charset))->name->data);
8113
8114 category = coding_category_big5;
8115 Vbig5_coding_system = name;
8116 }
8117 else if (EQ (coding_type, Qraw_text))
8118 {
8119 category = coding_category_raw_text;
8120 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8121 }
8122 else if (EQ (coding_type, Qutf_8))
8123 {
8124 category = coding_category_utf_8;
8125 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8126 }
8127 else if (EQ (coding_type, Qundecided))
8128 category = coding_category_undecided;
8129 else
8130 error ("Invalid coding system type: %s",
8131 XSYMBOL (coding_type)->name->data);
8132
8133 CODING_ATTR_CATEGORY (attrs) = make_number (category);
8134
8135 eol_type = args[coding_arg_eol_type];
8136 if (! NILP (eol_type)
8137 && ! EQ (eol_type, Qunix)
8138 && ! EQ (eol_type, Qdos)
8139 && ! EQ (eol_type, Qmac))
8140 error ("Invalid eol-type");
8141
8142 aliases = Fcons (name, Qnil);
8143
8144 if (NILP (eol_type))
8145 {
8146 eol_type = make_subsidiaries (name);
8147 for (i = 0; i < 3; i++)
8148 {
8149 Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
8150
8151 this_name = AREF (eol_type, i);
8152 this_aliases = Fcons (this_name, Qnil);
8153 this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
8154 this_spec = Fmake_vector (make_number (3), attrs);
8155 ASET (this_spec, 1, this_aliases);
8156 ASET (this_spec, 2, this_eol_type);
8157 Fputhash (this_name, this_spec, Vcoding_system_hash_table);
8158 Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
8159 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
8160 Vcoding_system_alist);
8161 }
8162 }
8163
8164 spec_vec = Fmake_vector (make_number (3), attrs);
8165 ASET (spec_vec, 1, aliases);
8166 ASET (spec_vec, 2, eol_type);
8167
8168 Fputhash (name, spec_vec, Vcoding_system_hash_table);
8169 Vcoding_system_list = Fcons (name, Vcoding_system_list);
8170 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
8171 Vcoding_system_alist);
8172
8173 {
8174 int id = coding_categories[category].id;
8175
8176 if (id < 0 || EQ (name, CODING_ID_NAME (id)))
8177 setup_coding_system (name, &coding_categories[category]);
8178 }
8179
8180 return Qnil;
8181
8182 short_args:
8183 return Fsignal (Qwrong_number_of_arguments,
8184 Fcons (intern ("define-coding-system-internal"),
8185 make_number (nargs)));
8186 }
8187
8188 /* Fixme: should this record the alias relationships for
8189 diagnostics? Should it update coding-system-list? */
8190 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
8191 Sdefine_coding_system_alias, 2, 2, 0,
8192 doc: /* Define ALIAS as an alias for CODING-SYSTEM. */)
8193 (alias, coding_system)
8194 Lisp_Object alias, coding_system;
8195 {
8196 Lisp_Object spec, aliases, eol_type;
8197
8198 CHECK_SYMBOL (alias);
8199 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
8200 aliases = AREF (spec, 1);
8201 while (!NILP (XCDR (aliases)))
8202 aliases = XCDR (aliases);
8203 XCDR (aliases) = Fcons (alias, Qnil);
8204
8205 eol_type = AREF (spec, 2);
8206 if (VECTORP (eol_type))
8207 {
8208 Lisp_Object subsidiaries;
8209 int i;
8210
8211 subsidiaries = make_subsidiaries (alias);
8212 for (i = 0; i < 3; i++)
8213 Fdefine_coding_system_alias (AREF (subsidiaries, i),
8214 AREF (eol_type, i));
8215
8216 ASET (spec, 2, subsidiaries);
8217 }
8218
8219 Fputhash (alias, spec, Vcoding_system_hash_table);
8220 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
8221 Vcoding_system_alist);
8222
8223 return Qnil;
8224 }
8225
8226 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
8227 1, 1, 0,
8228 doc: /* Return the base of CODING-SYSTEM.
8229 Any alias or subsidiary coding system is not a base coding system. */)
8230 (coding_system)
8231 Lisp_Object coding_system;
8232 {
8233 Lisp_Object spec, attrs;
8234
8235 if (NILP (coding_system))
8236 return (Qno_conversion);
8237 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
8238 attrs = AREF (spec, 0);
8239 return CODING_ATTR_BASE_NAME (attrs);
8240 }
8241
8242 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
8243 1, 1, 0,
8244 doc: "Return the property list of CODING-SYSTEM.")
8245 (coding_system)
8246 Lisp_Object coding_system;
8247 {
8248 Lisp_Object spec, attrs;
8249
8250 if (NILP (coding_system))
8251 coding_system = Qno_conversion;
8252 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
8253 attrs = AREF (spec, 0);
8254 return CODING_ATTR_PLIST (attrs);
8255 }
8256
8257
8258 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
8259 1, 1, 0,
8260 doc: /* Return the list of aliases of CODING-SYSTEM. */)
8261 (coding_system)
8262 Lisp_Object coding_system;
8263 {
8264 Lisp_Object spec;
8265
8266 if (NILP (coding_system))
8267 coding_system = Qno_conversion;
8268 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
8269 return AREF (spec, 1);
8270 }
8271
8272 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
8273 Scoding_system_eol_type, 1, 1, 0,
8274 doc: /* Return eol-type of CODING-SYSTEM.
8275 An eol-type is integer 0, 1, 2, or a vector of coding systems.
8276
8277 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
8278 and CR respectively.
8279
8280 A vector value indicates that a format of end-of-line should be
8281 detected automatically. Nth element of the vector is the subsidiary
8282 coding system whose eol-type is N. */)
8283 (coding_system)
8284 Lisp_Object coding_system;
8285 {
8286 Lisp_Object spec, eol_type;
8287 int n;
8288
8289 if (NILP (coding_system))
8290 coding_system = Qno_conversion;
8291 if (! CODING_SYSTEM_P (coding_system))
8292 return Qnil;
8293 spec = CODING_SYSTEM_SPEC (coding_system);
8294 eol_type = AREF (spec, 2);
8295 if (VECTORP (eol_type))
8296 return Fcopy_sequence (eol_type);
8297 n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
8298 return make_number (n);
8299 }
8300
8301 #endif /* emacs */
8302
8303 \f
8304 /*** 9. Post-amble ***/
8305
8306 void
8307 init_coding_once ()
8308 {
8309 int i;
8310
8311 for (i = 0; i < coding_category_max; i++)
8312 {
8313 coding_categories[i].id = -1;
8314 coding_priorities[i] = i;
8315 }
8316
8317 /* ISO2022 specific initialize routine. */
8318 for (i = 0; i < 0x20; i++)
8319 iso_code_class[i] = ISO_control_0;
8320 for (i = 0x21; i < 0x7F; i++)
8321 iso_code_class[i] = ISO_graphic_plane_0;
8322 for (i = 0x80; i < 0xA0; i++)
8323 iso_code_class[i] = ISO_control_1;
8324 for (i = 0xA1; i < 0xFF; i++)
8325 iso_code_class[i] = ISO_graphic_plane_1;
8326 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
8327 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
8328 iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
8329 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
8330 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
8331 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
8332 iso_code_class[ISO_CODE_ESC] = ISO_escape;
8333 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
8334 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
8335 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
8336
8337 inhibit_pre_post_conversion = 0;
8338
8339 for (i = 0; i < 256; i++)
8340 {
8341 emacs_mule_bytes[i] = 1;
8342 }
8343 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
8344 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
8345 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
8346 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
8347 }
8348
8349 #ifdef emacs
8350
8351 void
8352 syms_of_coding ()
8353 {
8354 staticpro (&Vcoding_system_hash_table);
8355 Vcoding_system_hash_table = Fmakehash (Qeq);
8356
8357 staticpro (&Vsjis_coding_system);
8358 Vsjis_coding_system = Qnil;
8359
8360 staticpro (&Vbig5_coding_system);
8361 Vbig5_coding_system = Qnil;
8362
8363 staticpro (&Vcode_conversion_work_buf_list);
8364 Vcode_conversion_work_buf_list = Qnil;
8365
8366 staticpro (&Vcode_conversion_reused_work_buf);
8367 Vcode_conversion_reused_work_buf = Qnil;
8368
8369 DEFSYM (Qcharset, "charset");
8370 DEFSYM (Qtarget_idx, "target-idx");
8371 DEFSYM (Qcoding_system_history, "coding-system-history");
8372 Fset (Qcoding_system_history, Qnil);
8373
8374 /* Target FILENAME is the first argument. */
8375 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
8376 /* Target FILENAME is the third argument. */
8377 Fput (Qwrite_region, Qtarget_idx, make_number (2));
8378
8379 DEFSYM (Qcall_process, "call-process");
8380 /* Target PROGRAM is the first argument. */
8381 Fput (Qcall_process, Qtarget_idx, make_number (0));
8382
8383 DEFSYM (Qcall_process_region, "call-process-region");
8384 /* Target PROGRAM is the third argument. */
8385 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
8386
8387 DEFSYM (Qstart_process, "start-process");
8388 /* Target PROGRAM is the third argument. */
8389 Fput (Qstart_process, Qtarget_idx, make_number (2));
8390
8391 DEFSYM (Qopen_network_stream, "open-network-stream");
8392 /* Target SERVICE is the fourth argument. */
8393 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
8394
8395 DEFSYM (Qcoding_system, "coding-system");
8396 DEFSYM (Qcoding_aliases, "coding-aliases");
8397
8398 DEFSYM (Qeol_type, "eol-type");
8399 DEFSYM (Qunix, "unix");
8400 DEFSYM (Qdos, "dos");
8401
8402 DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
8403 DEFSYM (Qpost_read_conversion, "post-read-conversion");
8404 DEFSYM (Qpre_write_conversion, "pre-write-conversion");
8405 DEFSYM (Qdefault_char, "default-char");
8406 DEFSYM (Qundecided, "undecided");
8407 DEFSYM (Qno_conversion, "no-conversion");
8408 DEFSYM (Qraw_text, "raw-text");
8409
8410 DEFSYM (Qiso_2022, "iso-2022");
8411
8412 DEFSYM (Qutf_8, "utf-8");
8413
8414 DEFSYM (Qutf_16, "utf-16");
8415 DEFSYM (Qsignature, "signature");
8416 DEFSYM (Qendian, "endian");
8417 DEFSYM (Qbig, "big");
8418 DEFSYM (Qlittle, "little");
8419
8420 DEFSYM (Qshift_jis, "shift-jis");
8421 DEFSYM (Qbig5, "big5");
8422
8423 DEFSYM (Qcoding_system_p, "coding-system-p");
8424
8425 DEFSYM (Qcoding_system_error, "coding-system-error");
8426 Fput (Qcoding_system_error, Qerror_conditions,
8427 Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
8428 Fput (Qcoding_system_error, Qerror_message,
8429 build_string ("Invalid coding system"));
8430
8431 /* Intern this now in case it isn't already done.
8432 Setting this variable twice is harmless.
8433 But don't staticpro it here--that is done in alloc.c. */
8434 Qchar_table_extra_slots = intern ("char-table-extra-slots");
8435
8436 DEFSYM (Qtranslation_table, "translation-table");
8437 Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (1));
8438 DEFSYM (Qtranslation_table_id, "translation-table-id");
8439 DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
8440 DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
8441
8442 DEFSYM (Qvalid_codes, "valid-codes");
8443
8444 DEFSYM (Qemacs_mule, "emacs-mule");
8445
8446 Vcoding_category_table
8447 = Fmake_vector (make_number (coding_category_max), Qnil);
8448 staticpro (&Vcoding_category_table);
8449 /* Followings are target of code detection. */
8450 ASET (Vcoding_category_table, coding_category_iso_7,
8451 intern ("coding-category-iso-7"));
8452 ASET (Vcoding_category_table, coding_category_iso_7_tight,
8453 intern ("coding-category-iso-7-tight"));
8454 ASET (Vcoding_category_table, coding_category_iso_8_1,
8455 intern ("coding-category-iso-8-1"));
8456 ASET (Vcoding_category_table, coding_category_iso_8_2,
8457 intern ("coding-category-iso-8-2"));
8458 ASET (Vcoding_category_table, coding_category_iso_7_else,
8459 intern ("coding-category-iso-7-else"));
8460 ASET (Vcoding_category_table, coding_category_iso_8_else,
8461 intern ("coding-category-iso-8-else"));
8462 ASET (Vcoding_category_table, coding_category_utf_8,
8463 intern ("coding-category-utf-8"));
8464 ASET (Vcoding_category_table, coding_category_utf_16_be,
8465 intern ("coding-category-utf-16-be"));
8466 ASET (Vcoding_category_table, coding_category_utf_16_le,
8467 intern ("coding-category-utf-16-le"));
8468 ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
8469 intern ("coding-category-utf-16-be-nosig"));
8470 ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
8471 intern ("coding-category-utf-16-le-nosig"));
8472 ASET (Vcoding_category_table, coding_category_charset,
8473 intern ("coding-category-charset"));
8474 ASET (Vcoding_category_table, coding_category_sjis,
8475 intern ("coding-category-sjis"));
8476 ASET (Vcoding_category_table, coding_category_big5,
8477 intern ("coding-category-big5"));
8478 ASET (Vcoding_category_table, coding_category_ccl,
8479 intern ("coding-category-ccl"));
8480 ASET (Vcoding_category_table, coding_category_emacs_mule,
8481 intern ("coding-category-emacs-mule"));
8482 /* Followings are NOT target of code detection. */
8483 ASET (Vcoding_category_table, coding_category_raw_text,
8484 intern ("coding-category-raw-text"));
8485 ASET (Vcoding_category_table, coding_category_undecided,
8486 intern ("coding-category-undecided"));
8487
8488 defsubr (&Scoding_system_p);
8489 defsubr (&Sread_coding_system);
8490 defsubr (&Sread_non_nil_coding_system);
8491 defsubr (&Scheck_coding_system);
8492 defsubr (&Sdetect_coding_region);
8493 defsubr (&Sdetect_coding_string);
8494 defsubr (&Sfind_coding_systems_region_internal);
8495 defsubr (&Scheck_coding_systems_region);
8496 defsubr (&Sdecode_coding_region);
8497 defsubr (&Sencode_coding_region);
8498 defsubr (&Sdecode_coding_string);
8499 defsubr (&Sencode_coding_string);
8500 defsubr (&Sdecode_sjis_char);
8501 defsubr (&Sencode_sjis_char);
8502 defsubr (&Sdecode_big5_char);
8503 defsubr (&Sencode_big5_char);
8504 defsubr (&Sset_terminal_coding_system_internal);
8505 defsubr (&Sset_safe_terminal_coding_system_internal);
8506 defsubr (&Sterminal_coding_system);
8507 defsubr (&Sset_keyboard_coding_system_internal);
8508 defsubr (&Skeyboard_coding_system);
8509 defsubr (&Sfind_operation_coding_system);
8510 defsubr (&Sset_coding_system_priority);
8511 defsubr (&Sdefine_coding_system_internal);
8512 defsubr (&Sdefine_coding_system_alias);
8513 defsubr (&Scoding_system_base);
8514 defsubr (&Scoding_system_plist);
8515 defsubr (&Scoding_system_aliases);
8516 defsubr (&Scoding_system_eol_type);
8517 defsubr (&Scoding_system_priority_list);
8518
8519 DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
8520 doc: /* List of coding systems.
8521
8522 Do not alter the value of this variable manually. This variable should be
8523 updated by the functions `define-coding-system' and
8524 `define-coding-system-alias'. */);
8525 Vcoding_system_list = Qnil;
8526
8527 DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
8528 doc: /* Alist of coding system names.
8529 Each element is one element list of coding system name.
8530 This variable is given to `completing-read' as TABLE argument.
8531
8532 Do not alter the value of this variable manually. This variable should be
8533 updated by the functions `make-coding-system' and
8534 `define-coding-system-alias'. */);
8535 Vcoding_system_alist = Qnil;
8536
8537 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
8538 doc: /* List of coding-categories (symbols) ordered by priority.
8539
8540 On detecting a coding system, Emacs tries code detection algorithms
8541 associated with each coding-category one by one in this order. When
8542 one algorithm agrees with a byte sequence of source text, the coding
8543 system bound to the corresponding coding-category is selected. */);
8544 {
8545 int i;
8546
8547 Vcoding_category_list = Qnil;
8548 for (i = coding_category_max - 1; i >= 0; i--)
8549 Vcoding_category_list
8550 = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
8551 Vcoding_category_list);
8552 }
8553
8554 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
8555 doc: /* Specify the coding system for read operations.
8556 It is useful to bind this variable with `let', but do not set it globally.
8557 If the value is a coding system, it is used for decoding on read operation.
8558 If not, an appropriate element is used from one of the coding system alists:
8559 There are three such tables, `file-coding-system-alist',
8560 `process-coding-system-alist', and `network-coding-system-alist'. */);
8561 Vcoding_system_for_read = Qnil;
8562
8563 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
8564 doc: /* Specify the coding system for write operations.
8565 Programs bind this variable with `let', but you should not set it globally.
8566 If the value is a coding system, it is used for encoding of output,
8567 when writing it to a file and when sending it to a file or subprocess.
8568
8569 If this does not specify a coding system, an appropriate element
8570 is used from one of the coding system alists:
8571 There are three such tables, `file-coding-system-alist',
8572 `process-coding-system-alist', and `network-coding-system-alist'.
8573 For output to files, if the above procedure does not specify a coding system,
8574 the value of `buffer-file-coding-system' is used. */);
8575 Vcoding_system_for_write = Qnil;
8576
8577 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
8578 doc: /*
8579 Coding system used in the latest file or process I/O. */);
8580 Vlast_coding_system_used = Qnil;
8581
8582 DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
8583 doc: /*
8584 *Non-nil means always inhibit code conversion of end-of-line format.
8585 See info node `Coding Systems' and info node `Text and Binary' concerning
8586 such conversion. */);
8587 inhibit_eol_conversion = 0;
8588
8589 DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
8590 doc: /*
8591 Non-nil means process buffer inherits coding system of process output.
8592 Bind it to t if the process output is to be treated as if it were a file
8593 read from some filesystem. */);
8594 inherit_process_coding_system = 0;
8595
8596 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
8597 doc: /*
8598 Alist to decide a coding system to use for a file I/O operation.
8599 The format is ((PATTERN . VAL) ...),
8600 where PATTERN is a regular expression matching a file name,
8601 VAL is a coding system, a cons of coding systems, or a function symbol.
8602 If VAL is a coding system, it is used for both decoding and encoding
8603 the file contents.
8604 If VAL is a cons of coding systems, the car part is used for decoding,
8605 and the cdr part is used for encoding.
8606 If VAL is a function symbol, the function must return a coding system
8607 or a cons of coding systems which are used as above. The function gets
8608 the arguments with which `find-operation-coding-systems' was called.
8609
8610 See also the function `find-operation-coding-system'
8611 and the variable `auto-coding-alist'. */);
8612 Vfile_coding_system_alist = Qnil;
8613
8614 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
8615 doc: /*
8616 Alist to decide a coding system to use for a process I/O operation.
8617 The format is ((PATTERN . VAL) ...),
8618 where PATTERN is a regular expression matching a program name,
8619 VAL is a coding system, a cons of coding systems, or a function symbol.
8620 If VAL is a coding system, it is used for both decoding what received
8621 from the program and encoding what sent to the program.
8622 If VAL is a cons of coding systems, the car part is used for decoding,
8623 and the cdr part is used for encoding.
8624 If VAL is a function symbol, the function must return a coding system
8625 or a cons of coding systems which are used as above.
8626
8627 See also the function `find-operation-coding-system'. */);
8628 Vprocess_coding_system_alist = Qnil;
8629
8630 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
8631 doc: /*
8632 Alist to decide a coding system to use for a network I/O operation.
8633 The format is ((PATTERN . VAL) ...),
8634 where PATTERN is a regular expression matching a network service name
8635 or is a port number to connect to,
8636 VAL is a coding system, a cons of coding systems, or a function symbol.
8637 If VAL is a coding system, it is used for both decoding what received
8638 from the network stream and encoding what sent to the network stream.
8639 If VAL is a cons of coding systems, the car part is used for decoding,
8640 and the cdr part is used for encoding.
8641 If VAL is a function symbol, the function must return a coding system
8642 or a cons of coding systems which are used as above.
8643
8644 See also the function `find-operation-coding-system'. */);
8645 Vnetwork_coding_system_alist = Qnil;
8646
8647 DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
8648 doc: /* Coding system to use with system messages.
8649 Also used for decoding keyboard input on X Window system. */);
8650 Vlocale_coding_system = Qnil;
8651
8652 /* The eol mnemonics are reset in startup.el system-dependently. */
8653 DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
8654 doc: /*
8655 *String displayed in mode line for UNIX-like (LF) end-of-line format. */);
8656 eol_mnemonic_unix = build_string (":");
8657
8658 DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
8659 doc: /*
8660 *String displayed in mode line for DOS-like (CRLF) end-of-line format. */);
8661 eol_mnemonic_dos = build_string ("\\");
8662
8663 DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
8664 doc: /*
8665 *String displayed in mode line for MAC-like (CR) end-of-line format. */);
8666 eol_mnemonic_mac = build_string ("/");
8667
8668 DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
8669 doc: /*
8670 *String displayed in mode line when end-of-line format is not yet determined. */);
8671 eol_mnemonic_undecided = build_string (":");
8672
8673 DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
8674 doc: /*
8675 *Non-nil enables character translation while encoding and decoding. */);
8676 Venable_character_translation = Qt;
8677
8678 DEFVAR_LISP ("standard-translation-table-for-decode",
8679 &Vstandard_translation_table_for_decode,
8680 doc: /* Table for translating characters while decoding. */);
8681 Vstandard_translation_table_for_decode = Qnil;
8682
8683 DEFVAR_LISP ("standard-translation-table-for-encode",
8684 &Vstandard_translation_table_for_encode,
8685 doc: /* Table for translating characters while encoding. */);
8686 Vstandard_translation_table_for_encode = Qnil;
8687
8688 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_table,
8689 doc: /* Alist of charsets vs revision numbers.
8690 While encoding, if a charset (car part of an element) is found,
8691 designate it with the escape sequence identifying revision (cdr part
8692 of the element). */);
8693 Vcharset_revision_table = Qnil;
8694
8695 DEFVAR_LISP ("default-process-coding-system",
8696 &Vdefault_process_coding_system,
8697 doc: /* Cons of coding systems used for process I/O by default.
8698 The car part is used for decoding a process output,
8699 the cdr part is used for encoding a text to be sent to a process. */);
8700 Vdefault_process_coding_system = Qnil;
8701
8702 DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
8703 doc: /*
8704 Table of extra Latin codes in the range 128..159 (inclusive).
8705 This is a vector of length 256.
8706 If Nth element is non-nil, the existence of code N in a file
8707 \(or output of subprocess) doesn't prevent it to be detected as
8708 a coding system of ISO 2022 variant which has a flag
8709 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
8710 or reading output of a subprocess.
8711 Only 128th through 159th elements has a meaning. */);
8712 Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
8713
8714 DEFVAR_LISP ("select-safe-coding-system-function",
8715 &Vselect_safe_coding_system_function,
8716 doc: /*
8717 Function to call to select safe coding system for encoding a text.
8718
8719 If set, this function is called to force a user to select a proper
8720 coding system which can encode the text in the case that a default
8721 coding system used in each operation can't encode the text.
8722
8723 The default value is `select-safe-coding-system' (which see). */);
8724 Vselect_safe_coding_system_function = Qnil;
8725
8726 DEFVAR_BOOL ("inhibit-iso-escape-detection",
8727 &inhibit_iso_escape_detection,
8728 doc: /*
8729 If non-nil, Emacs ignores ISO2022's escape sequence on code detection.
8730
8731 By default, on reading a file, Emacs tries to detect how the text is
8732 encoded. This code detection is sensitive to escape sequences. If
8733 the sequence is valid as ISO2022, the code is determined as one of
8734 the ISO2022 encodings, and the file is decoded by the corresponding
8735 coding system (e.g. `iso-2022-7bit').
8736
8737 However, there may be a case that you want to read escape sequences in
8738 a file as is. In such a case, you can set this variable to non-nil.
8739 Then, as the code detection ignores any escape sequences, no file is
8740 detected as encoded in some ISO2022 encoding. The result is that all
8741 escape sequences become visible in a buffer.
8742
8743 The default value is nil, and it is strongly recommended not to change
8744 it. That is because many Emacs Lisp source files that contain
8745 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
8746 in Emacs's distribution, and they won't be decoded correctly on
8747 reading if you suppress escape sequence detection.
8748
8749 The other way to read escape sequences in a file without decoding is
8750 to explicitly specify some coding system that doesn't use ISO2022's
8751 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument]. */);
8752 inhibit_iso_escape_detection = 0;
8753
8754 {
8755 Lisp_Object args[coding_arg_max];
8756 Lisp_Object plist[14];
8757 int i;
8758
8759 for (i = 0; i < coding_arg_max; i++)
8760 args[i] = Qnil;
8761
8762 plist[0] = intern (":name");
8763 plist[1] = args[coding_arg_name] = Qno_conversion;
8764 plist[2] = intern (":mnemonic");
8765 plist[3] = args[coding_arg_mnemonic] = make_number ('=');
8766 plist[4] = intern (":coding-type");
8767 plist[5] = args[coding_arg_coding_type] = Qraw_text;
8768 plist[6] = intern (":ascii-compatible-p");
8769 plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
8770 plist[8] = intern (":default-char");
8771 plist[9] = args[coding_arg_default_char] = make_number (0);
8772 plist[10] = intern (":docstring");
8773 plist[11] = build_string ("Do no conversion.\n\
8774 \n\
8775 When you visit a file with this coding, the file is read into a\n\
8776 unibyte buffer as is, thus each byte of a file is treated as a\n\
8777 character.");
8778 plist[12] = intern (":eol-type");
8779 plist[13] = args[coding_arg_eol_type] = Qunix;
8780 args[coding_arg_plist] = Flist (14, plist);
8781 Fdefine_coding_system_internal (coding_arg_max, args);
8782 }
8783
8784 setup_coding_system (Qno_conversion, &keyboard_coding);
8785 setup_coding_system (Qno_conversion, &terminal_coding);
8786 setup_coding_system (Qno_conversion, &safe_terminal_coding);
8787 }
8788
8789 char *
8790 emacs_strerror (error_number)
8791 int error_number;
8792 {
8793 char *str;
8794
8795 synchronize_system_messages_locale ();
8796 str = strerror (error_number);
8797
8798 if (! NILP (Vlocale_coding_system))
8799 {
8800 Lisp_Object dec = code_convert_string_norecord (build_string (str),
8801 Vlocale_coding_system,
8802 0);
8803 str = (char *) XSTRING (dec)->data;
8804 }
8805
8806 return str;
8807 }
8808
8809 #endif /* emacs */