(enum iso_code_class_type): Delete ISO_carriage_return.
[bpt/emacs.git] / src / coding.c
1 /* Coding system handler (conversion, detection, etc).
2 Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
3 Licensed to the Free Software Foundation.
4 Copyright (C) 2001, 2002 Free Software Foundation, Inc.
5 Copyright (C) 2003
6 National Institute of Advanced Industrial Science and Technology (AIST)
7 Registration Number H13PRO009
8
9 This file is part of GNU Emacs.
10
11 GNU Emacs is free software; you can redistribute it and/or modify
12 it under the terms of the GNU General Public License as published by
13 the Free Software Foundation; either version 2, or (at your option)
14 any later version.
15
16 GNU Emacs is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 GNU General Public License for more details.
20
21 You should have received a copy of the GNU General Public License
22 along with GNU Emacs; see the file COPYING. If not, write to
23 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24 Boston, MA 02111-1307, USA. */
25
26 /*** TABLE OF CONTENTS ***
27
28 0. General comments
29 1. Preamble
30 2. Emacs' internal format (emacs-utf-8) handlers
31 3. UTF-8 handlers
32 4. UTF-16 handlers
33 5. Charset-base coding systems handlers
34 6. emacs-mule (old Emacs' internal format) handlers
35 7. ISO2022 handlers
36 8. Shift-JIS and BIG5 handlers
37 9. CCL handlers
38 10. C library functions
39 11. Emacs Lisp library functions
40 12. Postamble
41
42 */
43
44 /*** 0. General comments ***
45
46
47 CODING SYSTEM
48
49 A coding system is an object for an encoding mechanism that contains
50 information about how to convert byte sequences to character
51 sequences and vice versa. When we say "decode", it means converting
52 a byte sequence of a specific coding system into a character
53 sequence that is represented by Emacs' internal coding system
54 `emacs-utf-8', and when we say "encode", it means converting a
55 character sequence of emacs-utf-8 to a byte sequence of a specific
56 coding system.
57
58 In Emacs Lisp, a coding system is represented by a Lisp symbol. In
59 C level, a coding system is represented by a vector of attributes
60 stored in the hash table Vcharset_hash_table. The conversion from
61 coding system symbol to attributes vector is done by looking up
62 Vcharset_hash_table by the symbol.
63
64 Coding systems are classified into the following types depending on
65 the encoding mechanism. Here's a brief description of the types.
66
67 o UTF-8
68
69 o UTF-16
70
71 o Charset-base coding system
72
73 A coding system defined by one or more (coded) character sets.
74 Decoding and encoding are done by a code converter defined for each
75 character set.
76
77 o Old Emacs internal format (emacs-mule)
78
79 The coding system adopted by old versions of Emacs (20 and 21).
80
81 o ISO2022-base coding system
82
83 The most famous coding system for multiple character sets. X's
84 Compound Text, various EUCs (Extended Unix Code), and coding systems
85 used in the Internet communication such as ISO-2022-JP are all
86 variants of ISO2022.
87
88 o SJIS (or Shift-JIS or MS-Kanji-Code)
89
90 A coding system to encode character sets: ASCII, JISX0201, and
91 JISX0208. Widely used for PC's in Japan. Details are described in
92 section 8.
93
94 o BIG5
95
96 A coding system to encode character sets: ASCII and Big5. Widely
97 used for Chinese (mainly in Taiwan and Hong Kong). Details are
98 described in section 8. In this file, when we write "big5" (all
99 lowercase), we mean the coding system, and when we write "Big5"
100 (capitalized), we mean the character set.
101
102 o CCL
103
104 If a user wants to decode/encode text encoded in a coding system
105 not listed above, he can supply a decoder and an encoder for it in
106 CCL (Code Conversion Language) programs. Emacs executes the CCL
107 program while decoding/encoding.
108
109 o Raw-text
110
111 A coding system for text containing raw eight-bit data. Emacs
112 treats each byte of source text as a character (except for
113 end-of-line conversion).
114
115 o No-conversion
116
117 Like raw text, but don't do end-of-line conversion.
118
119
120 END-OF-LINE FORMAT
121
122 How text end-of-line is encoded depends on operating system. For
123 instance, Unix's format is just one byte of LF (line-feed) code,
124 whereas DOS's format is two-byte sequence of `carriage-return' and
125 `line-feed' codes. MacOS's format is usually one byte of
126 `carriage-return'.
127
128 Since text character encoding and end-of-line encoding are
129 independent, any coding system described above can take any format
130 of end-of-line (except for no-conversion).
131
132 STRUCT CODING_SYSTEM
133
134 Before using a coding system for code conversion (i.e. decoding and
135 encoding), we setup a structure of type `struct coding_system'.
136 This structure keeps various information about a specific code
137 conversion (e.g. the location of source and destination data).
138
139 */
140
141 /* COMMON MACROS */
142
143
144 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
145
146 These functions check if a byte sequence specified as a source in
147 CODING conforms to the format of XXX, and update the members of
148 DETECT_INFO.
149
150 Return 1 if the byte sequence conforms to XXX, otherwise return 0.
151
152 Below is the template of these functions. */
153
154 #if 0
155 static int
156 detect_coding_XXX (coding, detect_info)
157 struct coding_system *coding;
158 struct coding_detection_info *detect_info;
159 {
160 unsigned char *src = coding->source;
161 unsigned char *src_end = coding->source + coding->src_bytes;
162 int multibytep = coding->src_multibyte;
163 int consumed_chars = 0;
164 int found = 0;
165 ...;
166
167 while (1)
168 {
169 /* Get one byte from the source. If the souce is exausted, jump
170 to no_more_source:. */
171 ONE_MORE_BYTE (c);
172
173 if (! __C_conforms_to_XXX___ (c))
174 break;
175 if (! __C_strongly_suggests_XXX__ (c))
176 found = CATEGORY_MASK_XXX;
177 }
178 /* The byte sequence is invalid for XXX. */
179 detect_info->rejected |= CATEGORY_MASK_XXX;
180 return 0;
181
182 no_more_source:
183 /* The source exausted successfully. */
184 detect_info->found |= found;
185 return 1;
186 }
187 #endif
188
189 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
190
191 These functions decode a byte sequence specified as a source by
192 CODING. The resulting multibyte text goes to a place pointed to by
193 CODING->charbuf, the length of which should not exceed
194 CODING->charbuf_size;
195
196 These functions set the information of original and decoded texts in
197 CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
198 They also set CODING->result to one of CODING_RESULT_XXX indicating
199 how the decoding is finished.
200
201 Below is the template of these functions. */
202
203 #if 0
204 static void
205 decode_coding_XXXX (coding)
206 struct coding_system *coding;
207 {
208 unsigned char *src = coding->source + coding->consumed;
209 unsigned char *src_end = coding->source + coding->src_bytes;
210 /* SRC_BASE remembers the start position in source in each loop.
211 The loop will be exited when there's not enough source code, or
212 when there's no room in CHARBUF for a decoded character. */
213 unsigned char *src_base;
214 /* A buffer to produce decoded characters. */
215 int *charbuf = coding->charbuf;
216 int *charbuf_end = charbuf + coding->charbuf_size;
217 int multibytep = coding->src_multibyte;
218
219 while (1)
220 {
221 src_base = src;
222 if (charbuf < charbuf_end)
223 /* No more room to produce a decoded character. */
224 break;
225 ONE_MORE_BYTE (c);
226 /* Decode it. */
227 }
228
229 no_more_source:
230 if (src_base < src_end
231 && coding->mode & CODING_MODE_LAST_BLOCK)
232 /* If the source ends by partial bytes to construct a character,
233 treat them as eight-bit raw data. */
234 while (src_base < src_end && charbuf < charbuf_end)
235 *charbuf++ = *src_base++;
236 /* Remember how many bytes and characters we consumed. If the
237 source is multibyte, the bytes and chars are not identical. */
238 coding->consumed = coding->consumed_char = src_base - coding->source;
239 /* Remember how many characters we produced. */
240 coding->charbuf_used = charbuf - coding->charbuf;
241 }
242 #endif
243
244 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
245
246 These functions encode SRC_BYTES length text at SOURCE of Emacs'
247 internal multibyte format by CODING. The resulting byte sequence
248 goes to a place pointed to by DESTINATION, the length of which
249 should not exceed DST_BYTES.
250
251 These functions set the information of original and encoded texts in
252 the members produced, produced_char, consumed, and consumed_char of
253 the structure *CODING. They also set the member result to one of
254 CODING_RESULT_XXX indicating how the encoding finished.
255
256 DST_BYTES zero means that source area and destination area are
257 overlapped, which means that we can produce a encoded text until it
258 reaches at the head of not-yet-encoded source text.
259
260 Below is a template of these functions. */
261 #if 0
262 static void
263 encode_coding_XXX (coding)
264 struct coding_system *coding;
265 {
266 int multibytep = coding->dst_multibyte;
267 int *charbuf = coding->charbuf;
268 int *charbuf_end = charbuf->charbuf + coding->charbuf_used;
269 unsigned char *dst = coding->destination + coding->produced;
270 unsigned char *dst_end = coding->destination + coding->dst_bytes;
271 unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
272 int produced_chars = 0;
273
274 for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
275 {
276 int c = *charbuf;
277 /* Encode C into DST, and increment DST. */
278 }
279 label_no_more_destination:
280 /* How many chars and bytes we produced. */
281 coding->produced_char += produced_chars;
282 coding->produced = dst - coding->destination;
283 }
284 #endif
285
286 \f
287 /*** 1. Preamble ***/
288
289 #include <config.h>
290 #include <stdio.h>
291
292 #include "lisp.h"
293 #include "buffer.h"
294 #include "character.h"
295 #include "charset.h"
296 #include "ccl.h"
297 #include "composite.h"
298 #include "coding.h"
299 #include "window.h"
300
301 Lisp_Object Vcoding_system_hash_table;
302
303 Lisp_Object Qcoding_system, Qcoding_aliases, Qeol_type;
304 Lisp_Object Qunix, Qdos;
305 extern Lisp_Object Qmac; /* frame.c */
306 Lisp_Object Qbuffer_file_coding_system;
307 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
308 Lisp_Object Qdefault_char;
309 Lisp_Object Qno_conversion, Qundecided;
310 Lisp_Object Qcharset, Qiso_2022, Qutf_8, Qutf_16, Qshift_jis, Qbig5;
311 Lisp_Object Qbig, Qlittle;
312 Lisp_Object Qcoding_system_history;
313 Lisp_Object Qvalid_codes;
314 Lisp_Object QCcategory;
315
316 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
317 Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
318 Lisp_Object Qstart_process, Qopen_network_stream;
319 Lisp_Object Qtarget_idx;
320
321 int coding_system_require_warning;
322
323 Lisp_Object Vselect_safe_coding_system_function;
324
325 /* Mnemonic string for each format of end-of-line. */
326 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
327 /* Mnemonic string to indicate format of end-of-line is not yet
328 decided. */
329 Lisp_Object eol_mnemonic_undecided;
330
331 #ifdef emacs
332
333 Lisp_Object Vcoding_system_list, Vcoding_system_alist;
334
335 Lisp_Object Qcoding_system_p, Qcoding_system_error;
336
337 /* Coding system emacs-mule and raw-text are for converting only
338 end-of-line format. */
339 Lisp_Object Qemacs_mule, Qraw_text;
340 Lisp_Object Qutf_8_emacs;
341
342 /* Coding-systems are handed between Emacs Lisp programs and C internal
343 routines by the following three variables. */
344 /* Coding-system for reading files and receiving data from process. */
345 Lisp_Object Vcoding_system_for_read;
346 /* Coding-system for writing files and sending data to process. */
347 Lisp_Object Vcoding_system_for_write;
348 /* Coding-system actually used in the latest I/O. */
349 Lisp_Object Vlast_coding_system_used;
350
351 /* A vector of length 256 which contains information about special
352 Latin codes (especially for dealing with Microsoft codes). */
353 Lisp_Object Vlatin_extra_code_table;
354
355 /* Flag to inhibit code conversion of end-of-line format. */
356 int inhibit_eol_conversion;
357
358 /* Flag to inhibit ISO2022 escape sequence detection. */
359 int inhibit_iso_escape_detection;
360
361 /* Flag to make buffer-file-coding-system inherit from process-coding. */
362 int inherit_process_coding_system;
363
364 /* Coding system to be used to encode text for terminal display. */
365 struct coding_system terminal_coding;
366
367 /* Coding system to be used to encode text for terminal display when
368 terminal coding system is nil. */
369 struct coding_system safe_terminal_coding;
370
371 /* Coding system of what is sent from terminal keyboard. */
372 struct coding_system keyboard_coding;
373
374 Lisp_Object Vfile_coding_system_alist;
375 Lisp_Object Vprocess_coding_system_alist;
376 Lisp_Object Vnetwork_coding_system_alist;
377
378 Lisp_Object Vlocale_coding_system;
379
380 #endif /* emacs */
381
382 /* Flag to tell if we look up translation table on character code
383 conversion. */
384 Lisp_Object Venable_character_translation;
385 /* Standard translation table to look up on decoding (reading). */
386 Lisp_Object Vstandard_translation_table_for_decode;
387 /* Standard translation table to look up on encoding (writing). */
388 Lisp_Object Vstandard_translation_table_for_encode;
389
390 Lisp_Object Qtranslation_table;
391 Lisp_Object Qtranslation_table_id;
392 Lisp_Object Qtranslation_table_for_decode;
393 Lisp_Object Qtranslation_table_for_encode;
394
395 /* Alist of charsets vs revision number. */
396 static Lisp_Object Vcharset_revision_table;
397
398 /* Default coding systems used for process I/O. */
399 Lisp_Object Vdefault_process_coding_system;
400
401 /* Char table for translating Quail and self-inserting input. */
402 Lisp_Object Vtranslation_table_for_input;
403
404 /* Two special coding systems. */
405 Lisp_Object Vsjis_coding_system;
406 Lisp_Object Vbig5_coding_system;
407
408
409 static int detect_coding_utf_8 P_ ((struct coding_system *,
410 struct coding_detection_info *info));
411 static void decode_coding_utf_8 P_ ((struct coding_system *));
412 static int encode_coding_utf_8 P_ ((struct coding_system *));
413
414 static int detect_coding_utf_16 P_ ((struct coding_system *,
415 struct coding_detection_info *info));
416 static void decode_coding_utf_16 P_ ((struct coding_system *));
417 static int encode_coding_utf_16 P_ ((struct coding_system *));
418
419 static int detect_coding_iso_2022 P_ ((struct coding_system *,
420 struct coding_detection_info *info));
421 static void decode_coding_iso_2022 P_ ((struct coding_system *));
422 static int encode_coding_iso_2022 P_ ((struct coding_system *));
423
424 static int detect_coding_emacs_mule P_ ((struct coding_system *,
425 struct coding_detection_info *info));
426 static void decode_coding_emacs_mule P_ ((struct coding_system *));
427 static int encode_coding_emacs_mule P_ ((struct coding_system *));
428
429 static int detect_coding_sjis P_ ((struct coding_system *,
430 struct coding_detection_info *info));
431 static void decode_coding_sjis P_ ((struct coding_system *));
432 static int encode_coding_sjis P_ ((struct coding_system *));
433
434 static int detect_coding_big5 P_ ((struct coding_system *,
435 struct coding_detection_info *info));
436 static void decode_coding_big5 P_ ((struct coding_system *));
437 static int encode_coding_big5 P_ ((struct coding_system *));
438
439 static int detect_coding_ccl P_ ((struct coding_system *,
440 struct coding_detection_info *info));
441 static void decode_coding_ccl P_ ((struct coding_system *));
442 static int encode_coding_ccl P_ ((struct coding_system *));
443
444 static void decode_coding_raw_text P_ ((struct coding_system *));
445 static int encode_coding_raw_text P_ ((struct coding_system *));
446
447
448 /* ISO2022 section */
449
450 #define CODING_ISO_INITIAL(coding, reg) \
451 (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id), \
452 coding_attr_iso_initial), \
453 reg)))
454
455
456 #define CODING_ISO_REQUEST(coding, charset_id) \
457 ((charset_id <= (coding)->max_charset_id \
458 ? (coding)->safe_charsets[charset_id] \
459 : -1))
460
461
462 #define CODING_ISO_FLAGS(coding) \
463 ((coding)->spec.iso_2022.flags)
464 #define CODING_ISO_DESIGNATION(coding, reg) \
465 ((coding)->spec.iso_2022.current_designation[reg])
466 #define CODING_ISO_INVOCATION(coding, plane) \
467 ((coding)->spec.iso_2022.current_invocation[plane])
468 #define CODING_ISO_SINGLE_SHIFTING(coding) \
469 ((coding)->spec.iso_2022.single_shifting)
470 #define CODING_ISO_BOL(coding) \
471 ((coding)->spec.iso_2022.bol)
472 #define CODING_ISO_INVOKED_CHARSET(coding, plane) \
473 CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
474
475 /* Control characters of ISO2022. */
476 /* code */ /* function */
477 #define ISO_CODE_LF 0x0A /* line-feed */
478 #define ISO_CODE_CR 0x0D /* carriage-return */
479 #define ISO_CODE_SO 0x0E /* shift-out */
480 #define ISO_CODE_SI 0x0F /* shift-in */
481 #define ISO_CODE_SS2_7 0x19 /* single-shift-2 for 7-bit code */
482 #define ISO_CODE_ESC 0x1B /* escape */
483 #define ISO_CODE_SS2 0x8E /* single-shift-2 */
484 #define ISO_CODE_SS3 0x8F /* single-shift-3 */
485 #define ISO_CODE_CSI 0x9B /* control-sequence-introducer */
486
487 /* All code (1-byte) of ISO2022 is classified into one of the
488 followings. */
489 enum iso_code_class_type
490 {
491 ISO_control_0, /* Control codes in the range
492 0x00..0x1F and 0x7F, except for the
493 following 5 codes. */
494 ISO_shift_out, /* ISO_CODE_SO (0x0E) */
495 ISO_shift_in, /* ISO_CODE_SI (0x0F) */
496 ISO_single_shift_2_7, /* ISO_CODE_SS2_7 (0x19) */
497 ISO_escape, /* ISO_CODE_SO (0x1B) */
498 ISO_control_1, /* Control codes in the range
499 0x80..0x9F, except for the
500 following 3 codes. */
501 ISO_single_shift_2, /* ISO_CODE_SS2 (0x8E) */
502 ISO_single_shift_3, /* ISO_CODE_SS3 (0x8F) */
503 ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
504 ISO_0x20_or_0x7F, /* Codes of the values 0x20 or 0x7F. */
505 ISO_graphic_plane_0, /* Graphic codes in the range 0x21..0x7E. */
506 ISO_0xA0_or_0xFF, /* Codes of the values 0xA0 or 0xFF. */
507 ISO_graphic_plane_1 /* Graphic codes in the range 0xA1..0xFE. */
508 };
509
510 /** The macros CODING_ISO_FLAG_XXX defines a flag bit of the
511 `iso-flags' attribute of an iso2022 coding system. */
512
513 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
514 instead of the correct short-form sequence (e.g. ESC $ A). */
515 #define CODING_ISO_FLAG_LONG_FORM 0x0001
516
517 /* If set, reset graphic planes and registers at end-of-line to the
518 initial state. */
519 #define CODING_ISO_FLAG_RESET_AT_EOL 0x0002
520
521 /* If set, reset graphic planes and registers before any control
522 characters to the initial state. */
523 #define CODING_ISO_FLAG_RESET_AT_CNTL 0x0004
524
525 /* If set, encode by 7-bit environment. */
526 #define CODING_ISO_FLAG_SEVEN_BITS 0x0008
527
528 /* If set, use locking-shift function. */
529 #define CODING_ISO_FLAG_LOCKING_SHIFT 0x0010
530
531 /* If set, use single-shift function. Overwrite
532 CODING_ISO_FLAG_LOCKING_SHIFT. */
533 #define CODING_ISO_FLAG_SINGLE_SHIFT 0x0020
534
535 /* If set, use designation escape sequence. */
536 #define CODING_ISO_FLAG_DESIGNATION 0x0040
537
538 /* If set, produce revision number sequence. */
539 #define CODING_ISO_FLAG_REVISION 0x0080
540
541 /* If set, produce ISO6429's direction specifying sequence. */
542 #define CODING_ISO_FLAG_DIRECTION 0x0100
543
544 /* If set, assume designation states are reset at beginning of line on
545 output. */
546 #define CODING_ISO_FLAG_INIT_AT_BOL 0x0200
547
548 /* If set, designation sequence should be placed at beginning of line
549 on output. */
550 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
551
552 /* If set, do not encode unsafe charactes on output. */
553 #define CODING_ISO_FLAG_SAFE 0x0800
554
555 /* If set, extra latin codes (128..159) are accepted as a valid code
556 on input. */
557 #define CODING_ISO_FLAG_LATIN_EXTRA 0x1000
558
559 #define CODING_ISO_FLAG_COMPOSITION 0x2000
560
561 #define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000
562
563 #define CODING_ISO_FLAG_USE_ROMAN 0x8000
564
565 #define CODING_ISO_FLAG_USE_OLDJIS 0x10000
566
567 #define CODING_ISO_FLAG_FULL_SUPPORT 0x100000
568
569 /* A character to be produced on output if encoding of the original
570 character is prohibited by CODING_ISO_FLAG_SAFE. */
571 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION '?'
572
573
574 /* UTF-16 section */
575 #define CODING_UTF_16_BOM(coding) \
576 ((coding)->spec.utf_16.bom)
577
578 #define CODING_UTF_16_ENDIAN(coding) \
579 ((coding)->spec.utf_16.endian)
580
581 #define CODING_UTF_16_SURROGATE(coding) \
582 ((coding)->spec.utf_16.surrogate)
583
584
585 /* CCL section */
586 #define CODING_CCL_DECODER(coding) \
587 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
588 #define CODING_CCL_ENCODER(coding) \
589 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
590 #define CODING_CCL_VALIDS(coding) \
591 (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
592
593 /* Index for each coding category in `coding_categories' */
594
595 enum coding_category
596 {
597 coding_category_iso_7,
598 coding_category_iso_7_tight,
599 coding_category_iso_8_1,
600 coding_category_iso_8_2,
601 coding_category_iso_7_else,
602 coding_category_iso_8_else,
603 coding_category_utf_8,
604 coding_category_utf_16_auto,
605 coding_category_utf_16_be,
606 coding_category_utf_16_le,
607 coding_category_utf_16_be_nosig,
608 coding_category_utf_16_le_nosig,
609 coding_category_charset,
610 coding_category_sjis,
611 coding_category_big5,
612 coding_category_ccl,
613 coding_category_emacs_mule,
614 /* All above are targets of code detection. */
615 coding_category_raw_text,
616 coding_category_undecided,
617 coding_category_max
618 };
619
620 /* Definitions of flag bits used in detect_coding_XXXX. */
621 #define CATEGORY_MASK_ISO_7 (1 << coding_category_iso_7)
622 #define CATEGORY_MASK_ISO_7_TIGHT (1 << coding_category_iso_7_tight)
623 #define CATEGORY_MASK_ISO_8_1 (1 << coding_category_iso_8_1)
624 #define CATEGORY_MASK_ISO_8_2 (1 << coding_category_iso_8_2)
625 #define CATEGORY_MASK_ISO_7_ELSE (1 << coding_category_iso_7_else)
626 #define CATEGORY_MASK_ISO_8_ELSE (1 << coding_category_iso_8_else)
627 #define CATEGORY_MASK_UTF_8 (1 << coding_category_utf_8)
628 #define CATEGORY_MASK_UTF_16_AUTO (1 << coding_category_utf_16_auto)
629 #define CATEGORY_MASK_UTF_16_BE (1 << coding_category_utf_16_be)
630 #define CATEGORY_MASK_UTF_16_LE (1 << coding_category_utf_16_le)
631 #define CATEGORY_MASK_UTF_16_BE_NOSIG (1 << coding_category_utf_16_be_nosig)
632 #define CATEGORY_MASK_UTF_16_LE_NOSIG (1 << coding_category_utf_16_le_nosig)
633 #define CATEGORY_MASK_CHARSET (1 << coding_category_charset)
634 #define CATEGORY_MASK_SJIS (1 << coding_category_sjis)
635 #define CATEGORY_MASK_BIG5 (1 << coding_category_big5)
636 #define CATEGORY_MASK_CCL (1 << coding_category_ccl)
637 #define CATEGORY_MASK_EMACS_MULE (1 << coding_category_emacs_mule)
638 #define CATEGORY_MASK_RAW_TEXT (1 << coding_category_raw_text)
639
640 /* This value is returned if detect_coding_mask () find nothing other
641 than ASCII characters. */
642 #define CATEGORY_MASK_ANY \
643 (CATEGORY_MASK_ISO_7 \
644 | CATEGORY_MASK_ISO_7_TIGHT \
645 | CATEGORY_MASK_ISO_8_1 \
646 | CATEGORY_MASK_ISO_8_2 \
647 | CATEGORY_MASK_ISO_7_ELSE \
648 | CATEGORY_MASK_ISO_8_ELSE \
649 | CATEGORY_MASK_UTF_8 \
650 | CATEGORY_MASK_UTF_16_BE \
651 | CATEGORY_MASK_UTF_16_LE \
652 | CATEGORY_MASK_UTF_16_BE_NOSIG \
653 | CATEGORY_MASK_UTF_16_LE_NOSIG \
654 | CATEGORY_MASK_CHARSET \
655 | CATEGORY_MASK_SJIS \
656 | CATEGORY_MASK_BIG5 \
657 | CATEGORY_MASK_CCL \
658 | CATEGORY_MASK_EMACS_MULE)
659
660
661 #define CATEGORY_MASK_ISO_7BIT \
662 (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
663
664 #define CATEGORY_MASK_ISO_8BIT \
665 (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
666
667 #define CATEGORY_MASK_ISO_ELSE \
668 (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
669
670 #define CATEGORY_MASK_ISO_ESCAPE \
671 (CATEGORY_MASK_ISO_7 \
672 | CATEGORY_MASK_ISO_7_TIGHT \
673 | CATEGORY_MASK_ISO_7_ELSE \
674 | CATEGORY_MASK_ISO_8_ELSE)
675
676 #define CATEGORY_MASK_ISO \
677 ( CATEGORY_MASK_ISO_7BIT \
678 | CATEGORY_MASK_ISO_8BIT \
679 | CATEGORY_MASK_ISO_ELSE)
680
681 #define CATEGORY_MASK_UTF_16 \
682 (CATEGORY_MASK_UTF_16_BE \
683 | CATEGORY_MASK_UTF_16_LE \
684 | CATEGORY_MASK_UTF_16_BE_NOSIG \
685 | CATEGORY_MASK_UTF_16_LE_NOSIG)
686
687
688 /* List of symbols `coding-category-xxx' ordered by priority. This
689 variable is exposed to Emacs Lisp. */
690 static Lisp_Object Vcoding_category_list;
691
692 /* Table of coding categories (Lisp symbols). This variable is for
693 internal use oly. */
694 static Lisp_Object Vcoding_category_table;
695
696 /* Table of coding-categories ordered by priority. */
697 static enum coding_category coding_priorities[coding_category_max];
698
699 /* Nth element is a coding context for the coding system bound to the
700 Nth coding category. */
701 static struct coding_system coding_categories[coding_category_max];
702
703 /*** Commonly used macros and functions ***/
704
705 #ifndef min
706 #define min(a, b) ((a) < (b) ? (a) : (b))
707 #endif
708 #ifndef max
709 #define max(a, b) ((a) > (b) ? (a) : (b))
710 #endif
711
712 #define CODING_GET_INFO(coding, attrs, charset_list) \
713 do { \
714 (attrs) = CODING_ID_ATTRS ((coding)->id); \
715 (charset_list) = CODING_ATTR_CHARSET_LIST (attrs); \
716 } while (0)
717
718
719 /* Safely get one byte from the source text pointed by SRC which ends
720 at SRC_END, and set C to that byte. If there are not enough bytes
721 in the source, it jumps to `no_more_source'. The caller
722 should declare and set these variables appropriately in advance:
723 src, src_end, multibytep
724 */
725
726 #define ONE_MORE_BYTE(c) \
727 do { \
728 if (src == src_end) \
729 { \
730 if (src_base < src) \
731 coding->result = CODING_RESULT_INSUFFICIENT_SRC; \
732 goto no_more_source; \
733 } \
734 c = *src++; \
735 if (multibytep && (c & 0x80)) \
736 { \
737 if ((c & 0xFE) != 0xC0) \
738 error ("Undecodable char found"); \
739 c = ((c & 1) << 6) | *src++; \
740 } \
741 consumed_chars++; \
742 } while (0)
743
744
745 #define ONE_MORE_BYTE_NO_CHECK(c) \
746 do { \
747 c = *src++; \
748 if (multibytep && (c & 0x80)) \
749 { \
750 if ((c & 0xFE) != 0xC0) \
751 error ("Undecodable char found"); \
752 c = ((c & 1) << 6) | *src++; \
753 } \
754 consumed_chars++; \
755 } while (0)
756
757
758 /* Store a byte C in the place pointed by DST and increment DST to the
759 next free point, and increment PRODUCED_CHARS. The caller should
760 assure that C is 0..127, and declare and set the variable `dst'
761 appropriately in advance.
762 */
763
764
765 #define EMIT_ONE_ASCII_BYTE(c) \
766 do { \
767 produced_chars++; \
768 *dst++ = (c); \
769 } while (0)
770
771
772 /* Like EMIT_ONE_ASCII_BYTE byt store two bytes; C1 and C2. */
773
774 #define EMIT_TWO_ASCII_BYTES(c1, c2) \
775 do { \
776 produced_chars += 2; \
777 *dst++ = (c1), *dst++ = (c2); \
778 } while (0)
779
780
781 /* Store a byte C in the place pointed by DST and increment DST to the
782 next free point, and increment PRODUCED_CHARS. If MULTIBYTEP is
783 nonzero, store in an appropriate multibyte from. The caller should
784 declare and set the variables `dst' and `multibytep' appropriately
785 in advance. */
786
787 #define EMIT_ONE_BYTE(c) \
788 do { \
789 produced_chars++; \
790 if (multibytep) \
791 { \
792 int ch = (c); \
793 if (ch >= 0x80) \
794 ch = BYTE8_TO_CHAR (ch); \
795 CHAR_STRING_ADVANCE (ch, dst); \
796 } \
797 else \
798 *dst++ = (c); \
799 } while (0)
800
801
802 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2. */
803
804 #define EMIT_TWO_BYTES(c1, c2) \
805 do { \
806 produced_chars += 2; \
807 if (multibytep) \
808 { \
809 int ch; \
810 \
811 ch = (c1); \
812 if (ch >= 0x80) \
813 ch = BYTE8_TO_CHAR (ch); \
814 CHAR_STRING_ADVANCE (ch, dst); \
815 ch = (c2); \
816 if (ch >= 0x80) \
817 ch = BYTE8_TO_CHAR (ch); \
818 CHAR_STRING_ADVANCE (ch, dst); \
819 } \
820 else \
821 { \
822 *dst++ = (c1); \
823 *dst++ = (c2); \
824 } \
825 } while (0)
826
827
828 #define EMIT_THREE_BYTES(c1, c2, c3) \
829 do { \
830 EMIT_ONE_BYTE (c1); \
831 EMIT_TWO_BYTES (c2, c3); \
832 } while (0)
833
834
835 #define EMIT_FOUR_BYTES(c1, c2, c3, c4) \
836 do { \
837 EMIT_TWO_BYTES (c1, c2); \
838 EMIT_TWO_BYTES (c3, c4); \
839 } while (0)
840
841
842 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
843 do { \
844 charset_map_loaded = 0; \
845 c = DECODE_CHAR (charset, code); \
846 if (charset_map_loaded) \
847 { \
848 const unsigned char *orig = coding->source; \
849 EMACS_INT offset; \
850 \
851 coding_set_source (coding); \
852 offset = coding->source - orig; \
853 src += offset; \
854 src_base += offset; \
855 src_end += offset; \
856 } \
857 } while (0)
858
859
860 #define ASSURE_DESTINATION(bytes) \
861 do { \
862 if (dst + (bytes) >= dst_end) \
863 { \
864 int more_bytes = charbuf_end - charbuf + (bytes); \
865 \
866 dst = alloc_destination (coding, more_bytes, dst); \
867 dst_end = coding->destination + coding->dst_bytes; \
868 } \
869 } while (0)
870
871
872
873 static void
874 coding_set_source (coding)
875 struct coding_system *coding;
876 {
877 if (BUFFERP (coding->src_object))
878 {
879 struct buffer *buf = XBUFFER (coding->src_object);
880
881 if (coding->src_pos < 0)
882 coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
883 else
884 coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
885 }
886 else if (STRINGP (coding->src_object))
887 {
888 coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
889 }
890 else
891 /* Otherwise, the source is C string and is never relocated
892 automatically. Thus we don't have to update anything. */
893 ;
894 }
895
896 static void
897 coding_set_destination (coding)
898 struct coding_system *coding;
899 {
900 if (BUFFERP (coding->dst_object))
901 {
902 if (coding->src_pos < 0)
903 {
904 coding->destination = BEG_ADDR + coding->dst_pos_byte - 1;
905 coding->dst_bytes = (GAP_END_ADDR
906 - (coding->src_bytes - coding->consumed)
907 - coding->destination);
908 }
909 else
910 {
911 /* We are sure that coding->dst_pos_byte is before the gap
912 of the buffer. */
913 coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
914 + coding->dst_pos_byte - 1);
915 coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
916 - coding->destination);
917 }
918 }
919 else
920 /* Otherwise, the destination is C string and is never relocated
921 automatically. Thus we don't have to update anything. */
922 ;
923 }
924
925
926 static void
927 coding_alloc_by_realloc (coding, bytes)
928 struct coding_system *coding;
929 EMACS_INT bytes;
930 {
931 coding->destination = (unsigned char *) xrealloc (coding->destination,
932 coding->dst_bytes + bytes);
933 coding->dst_bytes += bytes;
934 }
935
936 static void
937 coding_alloc_by_making_gap (coding, bytes)
938 struct coding_system *coding;
939 EMACS_INT bytes;
940 {
941 if (BUFFERP (coding->dst_object)
942 && EQ (coding->src_object, coding->dst_object))
943 {
944 EMACS_INT add = coding->src_bytes - coding->consumed;
945
946 GAP_SIZE -= add; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
947 make_gap (bytes);
948 GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
949 }
950 else
951 {
952 Lisp_Object this_buffer;
953
954 this_buffer = Fcurrent_buffer ();
955 set_buffer_internal (XBUFFER (coding->dst_object));
956 make_gap (bytes);
957 set_buffer_internal (XBUFFER (this_buffer));
958 }
959 }
960
961
962 static unsigned char *
963 alloc_destination (coding, nbytes, dst)
964 struct coding_system *coding;
965 EMACS_INT nbytes;
966 unsigned char *dst;
967 {
968 EMACS_INT offset = dst - coding->destination;
969
970 if (BUFFERP (coding->dst_object))
971 coding_alloc_by_making_gap (coding, nbytes);
972 else
973 coding_alloc_by_realloc (coding, nbytes);
974 coding->result = CODING_RESULT_SUCCESS;
975 coding_set_destination (coding);
976 dst = coding->destination + offset;
977 return dst;
978 }
979
980 /** Macros for annotations. */
981
982 /* Maximum length of annotation data (sum of annotations for
983 composition and charset). */
984 #define MAX_ANNOTATION_LENGTH (5 + (MAX_COMPOSITION_COMPONENTS * 2) - 1 + 5)
985
986 /* An annotation data is stored in the array coding->charbuf in this
987 format:
988 [ -LENGTH ANNOTATION_MASK FROM TO ... ]
989 LENGTH is the number of elements in the annotation.
990 ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
991 FROM and TO specify the range of text annotated. They are relative
992 to coding->src_pos (on encoding) or coding->dst_pos (on decoding).
993
994 The format of the following elements depend on ANNOTATION_MASK.
995
996 In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
997 follows:
998 ... METHOD [ COMPOSITION-COMPONENTS ... ]
999 METHOD is one of enum composition_method.
1000 Optionnal COMPOSITION-COMPONENTS are characters and composition
1001 rules.
1002
1003 In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1004 follows. */
1005
1006 #define ADD_ANNOTATION_DATA(buf, len, mask, from, to) \
1007 do { \
1008 *(buf)++ = -(len); \
1009 *(buf)++ = (mask); \
1010 *(buf)++ = (from); \
1011 *(buf)++ = (to); \
1012 coding->annotated = 1; \
1013 } while (0);
1014
1015 #define ADD_COMPOSITION_DATA(buf, from, to, method) \
1016 do { \
1017 ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, from, to); \
1018 *buf++ = method; \
1019 } while (0)
1020
1021
1022 #define ADD_CHARSET_DATA(buf, from, to, id) \
1023 do { \
1024 ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_CHARSET_MASK, from, to); \
1025 *buf++ = id; \
1026 } while (0)
1027
1028 \f
1029 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1030
1031
1032
1033 \f
1034 /*** 3. UTF-8 ***/
1035
1036 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1037 Check if a text is encoded in UTF-8. If it is, return 1, else
1038 return 0. */
1039
1040 #define UTF_8_1_OCTET_P(c) ((c) < 0x80)
1041 #define UTF_8_EXTRA_OCTET_P(c) (((c) & 0xC0) == 0x80)
1042 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
1043 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
1044 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
1045 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
1046
1047 static int
1048 detect_coding_utf_8 (coding, detect_info)
1049 struct coding_system *coding;
1050 struct coding_detection_info *detect_info;
1051 {
1052 const unsigned char *src = coding->source, *src_base = src;
1053 const unsigned char *src_end = coding->source + coding->src_bytes;
1054 int multibytep = coding->src_multibyte;
1055 int consumed_chars = 0;
1056 int found = 0;
1057 int incomplete;
1058
1059 detect_info->checked |= CATEGORY_MASK_UTF_8;
1060 /* A coding system of this category is always ASCII compatible. */
1061 src += coding->head_ascii;
1062
1063 while (1)
1064 {
1065 int c, c1, c2, c3, c4;
1066
1067 incomplete = 0;
1068 ONE_MORE_BYTE (c);
1069 if (UTF_8_1_OCTET_P (c))
1070 continue;
1071 incomplete = 1;
1072 ONE_MORE_BYTE (c1);
1073 if (! UTF_8_EXTRA_OCTET_P (c1))
1074 break;
1075 if (UTF_8_2_OCTET_LEADING_P (c))
1076 {
1077 found = CATEGORY_MASK_UTF_8;
1078 continue;
1079 }
1080 ONE_MORE_BYTE (c2);
1081 if (! UTF_8_EXTRA_OCTET_P (c2))
1082 break;
1083 if (UTF_8_3_OCTET_LEADING_P (c))
1084 {
1085 found = CATEGORY_MASK_UTF_8;
1086 continue;
1087 }
1088 ONE_MORE_BYTE (c3);
1089 if (! UTF_8_EXTRA_OCTET_P (c3))
1090 break;
1091 if (UTF_8_4_OCTET_LEADING_P (c))
1092 {
1093 found = CATEGORY_MASK_UTF_8;
1094 continue;
1095 }
1096 ONE_MORE_BYTE (c4);
1097 if (! UTF_8_EXTRA_OCTET_P (c4))
1098 break;
1099 if (UTF_8_5_OCTET_LEADING_P (c))
1100 {
1101 found = CATEGORY_MASK_UTF_8;
1102 continue;
1103 }
1104 break;
1105 }
1106 detect_info->rejected |= CATEGORY_MASK_UTF_8;
1107 return 0;
1108
1109 no_more_source:
1110 if (incomplete && coding->mode & CODING_MODE_LAST_BLOCK)
1111 {
1112 detect_info->rejected |= CATEGORY_MASK_UTF_8;
1113 return 0;
1114 }
1115 detect_info->found |= found;
1116 return 1;
1117 }
1118
1119
1120 static void
1121 decode_coding_utf_8 (coding)
1122 struct coding_system *coding;
1123 {
1124 const unsigned char *src = coding->source + coding->consumed;
1125 const unsigned char *src_end = coding->source + coding->src_bytes;
1126 const unsigned char *src_base;
1127 int *charbuf = coding->charbuf;
1128 int *charbuf_end = charbuf + coding->charbuf_size;
1129 int consumed_chars = 0, consumed_chars_base;
1130 int multibytep = coding->src_multibyte;
1131 Lisp_Object attr, charset_list;
1132
1133 CODING_GET_INFO (coding, attr, charset_list);
1134
1135 while (1)
1136 {
1137 int c, c1, c2, c3, c4, c5;
1138
1139 src_base = src;
1140 consumed_chars_base = consumed_chars;
1141
1142 if (charbuf >= charbuf_end)
1143 break;
1144
1145 ONE_MORE_BYTE (c1);
1146 if (UTF_8_1_OCTET_P(c1))
1147 {
1148 c = c1;
1149 }
1150 else
1151 {
1152 ONE_MORE_BYTE (c2);
1153 if (! UTF_8_EXTRA_OCTET_P (c2))
1154 goto invalid_code;
1155 if (UTF_8_2_OCTET_LEADING_P (c1))
1156 {
1157 c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1158 /* Reject overlong sequences here and below. Encoders
1159 producing them are incorrect, they can be misleading,
1160 and they mess up read/write invariance. */
1161 if (c < 128)
1162 goto invalid_code;
1163 }
1164 else
1165 {
1166 ONE_MORE_BYTE (c3);
1167 if (! UTF_8_EXTRA_OCTET_P (c3))
1168 goto invalid_code;
1169 if (UTF_8_3_OCTET_LEADING_P (c1))
1170 {
1171 c = (((c1 & 0xF) << 12)
1172 | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
1173 if (c < 0x800
1174 || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
1175 goto invalid_code;
1176 }
1177 else
1178 {
1179 ONE_MORE_BYTE (c4);
1180 if (! UTF_8_EXTRA_OCTET_P (c4))
1181 goto invalid_code;
1182 if (UTF_8_4_OCTET_LEADING_P (c1))
1183 {
1184 c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1185 | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
1186 if (c < 0x10000)
1187 goto invalid_code;
1188 }
1189 else
1190 {
1191 ONE_MORE_BYTE (c5);
1192 if (! UTF_8_EXTRA_OCTET_P (c5))
1193 goto invalid_code;
1194 if (UTF_8_5_OCTET_LEADING_P (c1))
1195 {
1196 c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1197 | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1198 | (c5 & 0x3F));
1199 if ((c > MAX_CHAR) || (c < 0x200000))
1200 goto invalid_code;
1201 }
1202 else
1203 goto invalid_code;
1204 }
1205 }
1206 }
1207 }
1208
1209 *charbuf++ = c;
1210 continue;
1211
1212 invalid_code:
1213 src = src_base;
1214 consumed_chars = consumed_chars_base;
1215 ONE_MORE_BYTE (c);
1216 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1217 coding->errors++;
1218 }
1219
1220 no_more_source:
1221 coding->consumed_char += consumed_chars_base;
1222 coding->consumed = src_base - coding->source;
1223 coding->charbuf_used = charbuf - coding->charbuf;
1224 }
1225
1226
1227 static int
1228 encode_coding_utf_8 (coding)
1229 struct coding_system *coding;
1230 {
1231 int multibytep = coding->dst_multibyte;
1232 int *charbuf = coding->charbuf;
1233 int *charbuf_end = charbuf + coding->charbuf_used;
1234 unsigned char *dst = coding->destination + coding->produced;
1235 unsigned char *dst_end = coding->destination + coding->dst_bytes;
1236 int produced_chars = 0;
1237 int c;
1238
1239 if (multibytep)
1240 {
1241 int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1242
1243 while (charbuf < charbuf_end)
1244 {
1245 unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1246
1247 ASSURE_DESTINATION (safe_room);
1248 c = *charbuf++;
1249 if (CHAR_BYTE8_P (c))
1250 {
1251 c = CHAR_TO_BYTE8 (c);
1252 EMIT_ONE_BYTE (c);
1253 }
1254 else
1255 {
1256 CHAR_STRING_ADVANCE (c, pend);
1257 for (p = str; p < pend; p++)
1258 EMIT_ONE_BYTE (*p);
1259 }
1260 }
1261 }
1262 else
1263 {
1264 int safe_room = MAX_MULTIBYTE_LENGTH;
1265
1266 while (charbuf < charbuf_end)
1267 {
1268 ASSURE_DESTINATION (safe_room);
1269 c = *charbuf++;
1270 dst += CHAR_STRING (c, dst);
1271 produced_chars++;
1272 }
1273 }
1274 coding->result = CODING_RESULT_SUCCESS;
1275 coding->produced_char += produced_chars;
1276 coding->produced = dst - coding->destination;
1277 return 0;
1278 }
1279
1280
1281 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1282 Check if a text is encoded in one of UTF-16 based coding systems.
1283 If it is, return 1, else return 0. */
1284
1285 #define UTF_16_HIGH_SURROGATE_P(val) \
1286 (((val) & 0xFC00) == 0xD800)
1287
1288 #define UTF_16_LOW_SURROGATE_P(val) \
1289 (((val) & 0xFC00) == 0xDC00)
1290
1291 #define UTF_16_INVALID_P(val) \
1292 (((val) == 0xFFFE) \
1293 || ((val) == 0xFFFF) \
1294 || UTF_16_LOW_SURROGATE_P (val))
1295
1296
1297 static int
1298 detect_coding_utf_16 (coding, detect_info)
1299 struct coding_system *coding;
1300 struct coding_detection_info *detect_info;
1301 {
1302 const unsigned char *src = coding->source, *src_base = src;
1303 const unsigned char *src_end = coding->source + coding->src_bytes;
1304 int multibytep = coding->src_multibyte;
1305 int consumed_chars = 0;
1306 int c1, c2;
1307
1308 detect_info->checked |= CATEGORY_MASK_UTF_16;
1309 if (coding->mode & CODING_MODE_LAST_BLOCK
1310 && (coding->src_chars & 1))
1311 {
1312 detect_info->rejected |= CATEGORY_MASK_UTF_16;
1313 return 0;
1314 }
1315
1316 ONE_MORE_BYTE (c1);
1317 ONE_MORE_BYTE (c2);
1318 if ((c1 == 0xFF) && (c2 == 0xFE))
1319 {
1320 detect_info->found |= (CATEGORY_MASK_UTF_16_LE
1321 | CATEGORY_MASK_UTF_16_AUTO);
1322 detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
1323 | CATEGORY_MASK_UTF_16_BE_NOSIG
1324 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1325 }
1326 else if ((c1 == 0xFE) && (c2 == 0xFF))
1327 {
1328 detect_info->found |= (CATEGORY_MASK_UTF_16_BE
1329 | CATEGORY_MASK_UTF_16_AUTO);
1330 detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
1331 | CATEGORY_MASK_UTF_16_BE_NOSIG
1332 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1333 }
1334 else
1335 {
1336 unsigned char b1[256], b2[256];
1337 int b1_variants = 1, b2_variants = 1;
1338 int n;
1339
1340 bzero (b1, 256), bzero (b2, 256);
1341 b1[c1]++, b2[c2]++;
1342 for (n = 0; n < 256 && src < src_end; n++)
1343 {
1344 ONE_MORE_BYTE (c1);
1345 ONE_MORE_BYTE (c2);
1346 if (! b1[c1++]) b1_variants++;
1347 if (! b2[c2++]) b2_variants++;
1348 }
1349 if (b1_variants < b2_variants)
1350 detect_info->found |= CATEGORY_MASK_UTF_16_BE_NOSIG;
1351 else
1352 detect_info->found |= CATEGORY_MASK_UTF_16_LE_NOSIG;
1353 detect_info->rejected
1354 |= (CATEGORY_MASK_UTF_16_BE | CATEGORY_MASK_UTF_16_LE);
1355 }
1356 no_more_source:
1357 return 1;
1358 }
1359
1360 static void
1361 decode_coding_utf_16 (coding)
1362 struct coding_system *coding;
1363 {
1364 const unsigned char *src = coding->source + coding->consumed;
1365 const unsigned char *src_end = coding->source + coding->src_bytes;
1366 const unsigned char *src_base;
1367 int *charbuf = coding->charbuf;
1368 int *charbuf_end = charbuf + coding->charbuf_size;
1369 int consumed_chars = 0, consumed_chars_base;
1370 int multibytep = coding->src_multibyte;
1371 enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding);
1372 enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1373 int surrogate = CODING_UTF_16_SURROGATE (coding);
1374 Lisp_Object attr, charset_list;
1375
1376 CODING_GET_INFO (coding, attr, charset_list);
1377
1378 if (bom == utf_16_with_bom)
1379 {
1380 int c, c1, c2;
1381
1382 src_base = src;
1383 ONE_MORE_BYTE (c1);
1384 ONE_MORE_BYTE (c2);
1385 c = (c1 << 8) | c2;
1386
1387 if (endian == utf_16_big_endian
1388 ? c != 0xFEFF : c != 0xFFFE)
1389 {
1390 /* The first two bytes are not BOM. Treat them as bytes
1391 for a normal character. */
1392 src = src_base;
1393 coding->errors++;
1394 }
1395 CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1396 }
1397 else if (bom == utf_16_detect_bom)
1398 {
1399 /* We have already tried to detect BOM and failed in
1400 detect_coding. */
1401 CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1402 }
1403
1404 while (1)
1405 {
1406 int c, c1, c2;
1407
1408 src_base = src;
1409 consumed_chars_base = consumed_chars;
1410
1411 if (charbuf + 2 >= charbuf_end)
1412 break;
1413
1414 ONE_MORE_BYTE (c1);
1415 ONE_MORE_BYTE (c2);
1416 c = (endian == utf_16_big_endian
1417 ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1418 if (surrogate)
1419 {
1420 if (! UTF_16_LOW_SURROGATE_P (c))
1421 {
1422 if (endian == utf_16_big_endian)
1423 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1424 else
1425 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1426 *charbuf++ = c1;
1427 *charbuf++ = c2;
1428 coding->errors++;
1429 if (UTF_16_HIGH_SURROGATE_P (c))
1430 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1431 else
1432 *charbuf++ = c;
1433 }
1434 else
1435 {
1436 c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1437 CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1438 *charbuf++ = c;
1439 }
1440 }
1441 else
1442 {
1443 if (UTF_16_HIGH_SURROGATE_P (c))
1444 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1445 else
1446 *charbuf++ = c;
1447 }
1448 }
1449
1450 no_more_source:
1451 coding->consumed_char += consumed_chars_base;
1452 coding->consumed = src_base - coding->source;
1453 coding->charbuf_used = charbuf - coding->charbuf;
1454 }
1455
1456 static int
1457 encode_coding_utf_16 (coding)
1458 struct coding_system *coding;
1459 {
1460 int multibytep = coding->dst_multibyte;
1461 int *charbuf = coding->charbuf;
1462 int *charbuf_end = charbuf + coding->charbuf_used;
1463 unsigned char *dst = coding->destination + coding->produced;
1464 unsigned char *dst_end = coding->destination + coding->dst_bytes;
1465 int safe_room = 8;
1466 enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding);
1467 int big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1468 int produced_chars = 0;
1469 Lisp_Object attrs, charset_list;
1470 int c;
1471
1472 CODING_GET_INFO (coding, attrs, charset_list);
1473
1474 if (bom != utf_16_without_bom)
1475 {
1476 ASSURE_DESTINATION (safe_room);
1477 if (big_endian)
1478 EMIT_TWO_BYTES (0xFE, 0xFF);
1479 else
1480 EMIT_TWO_BYTES (0xFF, 0xFE);
1481 CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1482 }
1483
1484 while (charbuf < charbuf_end)
1485 {
1486 ASSURE_DESTINATION (safe_room);
1487 c = *charbuf++;
1488 if (c >= MAX_UNICODE_CHAR)
1489 c = coding->default_char;
1490
1491 if (c < 0x10000)
1492 {
1493 if (big_endian)
1494 EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1495 else
1496 EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1497 }
1498 else
1499 {
1500 int c1, c2;
1501
1502 c -= 0x10000;
1503 c1 = (c >> 10) + 0xD800;
1504 c2 = (c & 0x3FF) + 0xDC00;
1505 if (big_endian)
1506 EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1507 else
1508 EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1509 }
1510 }
1511 coding->result = CODING_RESULT_SUCCESS;
1512 coding->produced = dst - coding->destination;
1513 coding->produced_char += produced_chars;
1514 return 0;
1515 }
1516
1517 \f
1518 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1519
1520 /* Emacs' internal format for representation of multiple character
1521 sets is a kind of multi-byte encoding, i.e. characters are
1522 represented by variable-length sequences of one-byte codes.
1523
1524 ASCII characters and control characters (e.g. `tab', `newline') are
1525 represented by one-byte sequences which are their ASCII codes, in
1526 the range 0x00 through 0x7F.
1527
1528 8-bit characters of the range 0x80..0x9F are represented by
1529 two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1530 code + 0x20).
1531
1532 8-bit characters of the range 0xA0..0xFF are represented by
1533 one-byte sequences which are their 8-bit code.
1534
1535 The other characters are represented by a sequence of `base
1536 leading-code', optional `extended leading-code', and one or two
1537 `position-code's. The length of the sequence is determined by the
1538 base leading-code. Leading-code takes the range 0x81 through 0x9D,
1539 whereas extended leading-code and position-code take the range 0xA0
1540 through 0xFF. See `charset.h' for more details about leading-code
1541 and position-code.
1542
1543 --- CODE RANGE of Emacs' internal format ---
1544 character set range
1545 ------------- -----
1546 ascii 0x00..0x7F
1547 eight-bit-control LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1548 eight-bit-graphic 0xA0..0xBF
1549 ELSE 0x81..0x9D + [0xA0..0xFF]+
1550 ---------------------------------------------
1551
1552 As this is the internal character representation, the format is
1553 usually not used externally (i.e. in a file or in a data sent to a
1554 process). But, it is possible to have a text externally in this
1555 format (i.e. by encoding by the coding system `emacs-mule').
1556
1557 In that case, a sequence of one-byte codes has a slightly different
1558 form.
1559
1560 At first, all characters in eight-bit-control are represented by
1561 one-byte sequences which are their 8-bit code.
1562
1563 Next, character composition data are represented by the byte
1564 sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1565 where,
1566 METHOD is 0xF0 plus one of composition method (enum
1567 composition_method),
1568
1569 BYTES is 0xA0 plus a byte length of this composition data,
1570
1571 CHARS is 0x20 plus a number of characters composed by this
1572 data,
1573
1574 COMPONENTs are characters of multibye form or composition
1575 rules encoded by two-byte of ASCII codes.
1576
1577 In addition, for backward compatibility, the following formats are
1578 also recognized as composition data on decoding.
1579
1580 0x80 MSEQ ...
1581 0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1582
1583 Here,
1584 MSEQ is a multibyte form but in these special format:
1585 ASCII: 0xA0 ASCII_CODE+0x80,
1586 other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1587 RULE is a one byte code of the range 0xA0..0xF0 that
1588 represents a composition rule.
1589 */
1590
1591 char emacs_mule_bytes[256];
1592
1593 int
1594 emacs_mule_char (coding, src, nbytes, nchars, id)
1595 struct coding_system *coding;
1596 unsigned char *src;
1597 int *nbytes, *nchars, *id;
1598 {
1599 const unsigned char *src_end = coding->source + coding->src_bytes;
1600 const unsigned char *src_base = src;
1601 int multibytep = coding->src_multibyte;
1602 struct charset *charset;
1603 unsigned code;
1604 int c;
1605 int consumed_chars = 0;
1606
1607 ONE_MORE_BYTE (c);
1608 switch (emacs_mule_bytes[c])
1609 {
1610 case 2:
1611 if (! (charset = emacs_mule_charset[c]))
1612 goto invalid_code;
1613 ONE_MORE_BYTE (c);
1614 code = c & 0x7F;
1615 break;
1616
1617 case 3:
1618 if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
1619 || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
1620 {
1621 ONE_MORE_BYTE (c);
1622 if (! (charset = emacs_mule_charset[c]))
1623 goto invalid_code;
1624 ONE_MORE_BYTE (c);
1625 code = c & 0x7F;
1626 }
1627 else
1628 {
1629 if (! (charset = emacs_mule_charset[c]))
1630 goto invalid_code;
1631 ONE_MORE_BYTE (c);
1632 code = (c & 0x7F) << 8;
1633 ONE_MORE_BYTE (c);
1634 code |= c & 0x7F;
1635 }
1636 break;
1637
1638 case 4:
1639 ONE_MORE_BYTE (c);
1640 if (! (charset = emacs_mule_charset[c]))
1641 goto invalid_code;
1642 ONE_MORE_BYTE (c);
1643 code = (c & 0x7F) << 8;
1644 ONE_MORE_BYTE (c);
1645 code |= c & 0x7F;
1646 break;
1647
1648 case 1:
1649 code = c;
1650 charset = CHARSET_FROM_ID (ASCII_BYTE_P (code)
1651 ? charset_ascii : charset_eight_bit);
1652 break;
1653
1654 default:
1655 abort ();
1656 }
1657 c = DECODE_CHAR (charset, code);
1658 if (c < 0)
1659 goto invalid_code;
1660 *nbytes = src - src_base;
1661 *nchars = consumed_chars;
1662 if (id)
1663 *id = charset->id;
1664 return c;
1665
1666 no_more_source:
1667 return -2;
1668
1669 invalid_code:
1670 return -1;
1671 }
1672
1673
1674 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1675 Check if a text is encoded in `emacs-mule'. If it is, return 1,
1676 else return 0. */
1677
1678 static int
1679 detect_coding_emacs_mule (coding, detect_info)
1680 struct coding_system *coding;
1681 struct coding_detection_info *detect_info;
1682 {
1683 const unsigned char *src = coding->source, *src_base = src;
1684 const unsigned char *src_end = coding->source + coding->src_bytes;
1685 int multibytep = coding->src_multibyte;
1686 int consumed_chars = 0;
1687 int c;
1688 int found = 0;
1689 int incomplete;
1690
1691 detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1692 /* A coding system of this category is always ASCII compatible. */
1693 src += coding->head_ascii;
1694
1695 while (1)
1696 {
1697 incomplete = 0;
1698 ONE_MORE_BYTE (c);
1699 incomplete = 1;
1700
1701 if (c == 0x80)
1702 {
1703 /* Perhaps the start of composite character. We simple skip
1704 it because analyzing it is too heavy for detecting. But,
1705 at least, we check that the composite character
1706 constitues of more than 4 bytes. */
1707 const unsigned char *src_base;
1708
1709 repeat:
1710 src_base = src;
1711 do
1712 {
1713 ONE_MORE_BYTE (c);
1714 }
1715 while (c >= 0xA0);
1716
1717 if (src - src_base <= 4)
1718 break;
1719 found = CATEGORY_MASK_EMACS_MULE;
1720 if (c == 0x80)
1721 goto repeat;
1722 }
1723
1724 if (c < 0x80)
1725 {
1726 if (c < 0x20
1727 && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
1728 break;
1729 }
1730 else
1731 {
1732 const unsigned char *src_base = src - 1;
1733
1734 do
1735 {
1736 ONE_MORE_BYTE (c);
1737 }
1738 while (c >= 0xA0);
1739 if (src - src_base != emacs_mule_bytes[*src_base])
1740 break;
1741 found = CATEGORY_MASK_EMACS_MULE;
1742 }
1743 }
1744 detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1745 return 0;
1746
1747 no_more_source:
1748 if (incomplete && coding->mode & CODING_MODE_LAST_BLOCK)
1749 {
1750 detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1751 return 0;
1752 }
1753 detect_info->found |= found;
1754 return 1;
1755 }
1756
1757
1758 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
1759
1760 /* Decode a character represented as a component of composition
1761 sequence of Emacs 20/21 style at SRC. Set C to that character and
1762 update SRC to the head of next character (or an encoded composition
1763 rule). If SRC doesn't points a composition component, set C to -1.
1764 If SRC points an invalid byte sequence, global exit by a return
1765 value 0. */
1766
1767 #define DECODE_EMACS_MULE_COMPOSITION_CHAR(buf) \
1768 if (1) \
1769 { \
1770 int c; \
1771 int nbytes, nchars; \
1772 \
1773 if (src == src_end) \
1774 break; \
1775 c = emacs_mule_char (coding, src, &nbytes, &nchars, NULL);\
1776 if (c < 0) \
1777 { \
1778 if (c == -2) \
1779 break; \
1780 goto invalid_code; \
1781 } \
1782 *buf++ = c; \
1783 src += nbytes; \
1784 consumed_chars += nchars; \
1785 } \
1786 else
1787
1788
1789 /* Decode a composition rule represented as a component of composition
1790 sequence of Emacs 20 style at SRC. Store the decoded rule in *BUF,
1791 and increment BUF. If SRC points an invalid byte sequence, set C
1792 to -1. */
1793
1794 #define DECODE_EMACS_MULE_COMPOSITION_RULE_20(buf) \
1795 do { \
1796 int c, gref, nref; \
1797 \
1798 if (src >= src_end) \
1799 goto invalid_code; \
1800 ONE_MORE_BYTE_NO_CHECK (c); \
1801 c -= 0x20; \
1802 if (c < 0 || c >= 81) \
1803 goto invalid_code; \
1804 \
1805 gref = c / 9, nref = c % 9; \
1806 *buf++ = COMPOSITION_ENCODE_RULE (gref, nref); \
1807 } while (0)
1808
1809
1810 /* Decode a composition rule represented as a component of composition
1811 sequence of Emacs 21 style at SRC. Store the decoded rule in *BUF,
1812 and increment BUF. If SRC points an invalid byte sequence, set C
1813 to -1. */
1814
1815 #define DECODE_EMACS_MULE_COMPOSITION_RULE_21(buf) \
1816 do { \
1817 int gref, nref; \
1818 \
1819 if (src + 1>= src_end) \
1820 goto invalid_code; \
1821 ONE_MORE_BYTE_NO_CHECK (gref); \
1822 gref -= 0x20; \
1823 ONE_MORE_BYTE_NO_CHECK (nref); \
1824 nref -= 0x20; \
1825 if (gref < 0 || gref >= 81 \
1826 || nref < 0 || nref >= 81) \
1827 goto invalid_code; \
1828 *buf++ = COMPOSITION_ENCODE_RULE (gref, nref); \
1829 } while (0)
1830
1831
1832 #define DECODE_EMACS_MULE_21_COMPOSITION(c) \
1833 do { \
1834 /* Emacs 21 style format. The first three bytes at SRC are \
1835 (METHOD - 0xF2), (BYTES - 0xA0), (CHARS - 0xA0), where BYTES is \
1836 the byte length of this composition information, CHARS is the \
1837 number of characters composed by this composition. */ \
1838 enum composition_method method = c - 0xF2; \
1839 int *charbuf_base = charbuf; \
1840 int from, to; \
1841 int consumed_chars_limit; \
1842 int nbytes, nchars; \
1843 \
1844 ONE_MORE_BYTE (c); \
1845 nbytes = c - 0xA0; \
1846 if (nbytes < 3) \
1847 goto invalid_code; \
1848 ONE_MORE_BYTE (c); \
1849 nchars = c - 0xA0; \
1850 from = coding->produced + char_offset; \
1851 to = from + nchars; \
1852 ADD_COMPOSITION_DATA (charbuf, from, to, method); \
1853 consumed_chars_limit = consumed_chars_base + nbytes; \
1854 if (method != COMPOSITION_RELATIVE) \
1855 { \
1856 int i = 0; \
1857 while (consumed_chars < consumed_chars_limit) \
1858 { \
1859 if (i % 2 && method != COMPOSITION_WITH_ALTCHARS) \
1860 DECODE_EMACS_MULE_COMPOSITION_RULE_21 (charbuf); \
1861 else \
1862 DECODE_EMACS_MULE_COMPOSITION_CHAR (charbuf); \
1863 i++; \
1864 } \
1865 if (consumed_chars < consumed_chars_limit) \
1866 goto invalid_code; \
1867 charbuf_base[0] -= i; \
1868 } \
1869 } while (0)
1870
1871
1872 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION(c) \
1873 do { \
1874 /* Emacs 20 style format for relative composition. */ \
1875 /* Store multibyte form of characters to be composed. */ \
1876 enum composition_method method = COMPOSITION_RELATIVE; \
1877 int components[MAX_COMPOSITION_COMPONENTS * 2 - 1]; \
1878 int *buf = components; \
1879 int i, j; \
1880 int from, to; \
1881 \
1882 src = src_base; \
1883 ONE_MORE_BYTE (c); /* skip 0x80 */ \
1884 for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++) \
1885 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
1886 if (i < 2) \
1887 goto invalid_code; \
1888 from = coding->produced_char + char_offset; \
1889 to = from + i; \
1890 ADD_COMPOSITION_DATA (charbuf, from, to, method); \
1891 for (j = 0; j < i; j++) \
1892 *charbuf++ = components[j]; \
1893 } while (0)
1894
1895
1896 #define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION(c) \
1897 do { \
1898 /* Emacs 20 style format for rule-base composition. */ \
1899 /* Store multibyte form of characters to be composed. */ \
1900 enum composition_method method = COMPOSITION_WITH_RULE; \
1901 int components[MAX_COMPOSITION_COMPONENTS * 2 - 1]; \
1902 int *buf = components; \
1903 int i, j; \
1904 int from, to; \
1905 \
1906 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
1907 for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++) \
1908 { \
1909 DECODE_EMACS_MULE_COMPOSITION_RULE_20 (buf); \
1910 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
1911 } \
1912 if (i < 1 || (buf - components) % 2 == 0) \
1913 goto invalid_code; \
1914 if (charbuf + i + (i / 2) + 1 < charbuf_end) \
1915 goto no_more_source; \
1916 from = coding->produced_char + char_offset; \
1917 to = from + i; \
1918 ADD_COMPOSITION_DATA (buf, from, to, method); \
1919 for (j = 0; j < i; j++) \
1920 *charbuf++ = components[j]; \
1921 for (j = 0; j < i; j += 2) \
1922 *charbuf++ = components[j]; \
1923 } while (0)
1924
1925
1926 static void
1927 decode_coding_emacs_mule (coding)
1928 struct coding_system *coding;
1929 {
1930 const unsigned char *src = coding->source + coding->consumed;
1931 const unsigned char *src_end = coding->source + coding->src_bytes;
1932 const unsigned char *src_base;
1933 int *charbuf = coding->charbuf;
1934 int *charbuf_end = charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
1935 int consumed_chars = 0, consumed_chars_base;
1936 int multibytep = coding->src_multibyte;
1937 Lisp_Object attrs, charset_list;
1938 int char_offset = coding->produced_char;
1939 int last_offset = char_offset;
1940 int last_id = charset_ascii;
1941
1942 CODING_GET_INFO (coding, attrs, charset_list);
1943
1944 while (1)
1945 {
1946 int c;
1947
1948 src_base = src;
1949 consumed_chars_base = consumed_chars;
1950
1951 if (charbuf >= charbuf_end)
1952 break;
1953
1954 ONE_MORE_BYTE (c);
1955
1956 if (c < 0x80)
1957 {
1958 *charbuf++ = c;
1959 char_offset++;
1960 }
1961 else if (c == 0x80)
1962 {
1963 ONE_MORE_BYTE (c);
1964 if (c - 0xF2 >= COMPOSITION_RELATIVE
1965 && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)
1966 DECODE_EMACS_MULE_21_COMPOSITION (c);
1967 else if (c < 0xC0)
1968 DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION (c);
1969 else if (c == 0xFF)
1970 DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION (c);
1971 else
1972 goto invalid_code;
1973 }
1974 else if (c < 0xA0 && emacs_mule_bytes[c] > 1)
1975 {
1976 int nbytes, nchars;
1977 int id;
1978
1979 src = src_base;
1980 consumed_chars = consumed_chars_base;
1981 c = emacs_mule_char (coding, src, &nbytes, &nchars, &id);
1982 if (c < 0)
1983 {
1984 if (c == -2)
1985 break;
1986 goto invalid_code;
1987 }
1988 if (last_id != id)
1989 {
1990 if (last_id != charset_ascii)
1991 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
1992 last_id = id;
1993 last_offset = char_offset;
1994 }
1995 *charbuf++ = c;
1996 src += nbytes;
1997 consumed_chars += nchars;
1998 char_offset++;
1999 }
2000 continue;
2001
2002 invalid_code:
2003 src = src_base;
2004 consumed_chars = consumed_chars_base;
2005 ONE_MORE_BYTE (c);
2006 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
2007 char_offset++;
2008 coding->errors++;
2009 }
2010
2011 no_more_source:
2012 if (last_id != charset_ascii)
2013 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
2014 coding->consumed_char += consumed_chars_base;
2015 coding->consumed = src_base - coding->source;
2016 coding->charbuf_used = charbuf - coding->charbuf;
2017 }
2018
2019
2020 #define EMACS_MULE_LEADING_CODES(id, codes) \
2021 do { \
2022 if (id < 0xA0) \
2023 codes[0] = id, codes[1] = 0; \
2024 else if (id < 0xE0) \
2025 codes[0] = 0x9A, codes[1] = id; \
2026 else if (id < 0xF0) \
2027 codes[0] = 0x9B, codes[1] = id; \
2028 else if (id < 0xF5) \
2029 codes[0] = 0x9C, codes[1] = id; \
2030 else \
2031 codes[0] = 0x9D, codes[1] = id; \
2032 } while (0);
2033
2034
2035 static int
2036 encode_coding_emacs_mule (coding)
2037 struct coding_system *coding;
2038 {
2039 int multibytep = coding->dst_multibyte;
2040 int *charbuf = coding->charbuf;
2041 int *charbuf_end = charbuf + coding->charbuf_used;
2042 unsigned char *dst = coding->destination + coding->produced;
2043 unsigned char *dst_end = coding->destination + coding->dst_bytes;
2044 int safe_room = 8;
2045 int produced_chars = 0;
2046 Lisp_Object attrs, charset_list;
2047 int c;
2048 int preferred_charset_id = -1;
2049
2050 CODING_GET_INFO (coding, attrs, charset_list);
2051 if (! EQ (charset_list, Vemacs_mule_charset_list))
2052 {
2053 CODING_ATTR_CHARSET_LIST (attrs)
2054 = charset_list = Vemacs_mule_charset_list;
2055 }
2056
2057 while (charbuf < charbuf_end)
2058 {
2059 ASSURE_DESTINATION (safe_room);
2060 c = *charbuf++;
2061
2062 if (c < 0)
2063 {
2064 /* Handle an annotation. */
2065 switch (*charbuf)
2066 {
2067 case CODING_ANNOTATE_COMPOSITION_MASK:
2068 /* Not yet implemented. */
2069 break;
2070 case CODING_ANNOTATE_CHARSET_MASK:
2071 preferred_charset_id = charbuf[3];
2072 if (preferred_charset_id >= 0
2073 && NILP (Fmemq (make_number (preferred_charset_id),
2074 charset_list)))
2075 preferred_charset_id = -1;
2076 break;
2077 default:
2078 abort ();
2079 }
2080 charbuf += -c - 1;
2081 continue;
2082 }
2083
2084 if (ASCII_CHAR_P (c))
2085 EMIT_ONE_ASCII_BYTE (c);
2086 else if (CHAR_BYTE8_P (c))
2087 {
2088 c = CHAR_TO_BYTE8 (c);
2089 EMIT_ONE_BYTE (c);
2090 }
2091 else
2092 {
2093 struct charset *charset;
2094 unsigned code;
2095 int dimension;
2096 int emacs_mule_id;
2097 unsigned char leading_codes[2];
2098
2099 if (preferred_charset_id >= 0)
2100 {
2101 charset = CHARSET_FROM_ID (preferred_charset_id);
2102 if (! CHAR_CHARSET_P (c, charset))
2103 charset = char_charset (c, charset_list, NULL);
2104 }
2105 else
2106 charset = char_charset (c, charset_list, &code);
2107 if (! charset)
2108 {
2109 c = coding->default_char;
2110 if (ASCII_CHAR_P (c))
2111 {
2112 EMIT_ONE_ASCII_BYTE (c);
2113 continue;
2114 }
2115 charset = char_charset (c, charset_list, &code);
2116 }
2117 dimension = CHARSET_DIMENSION (charset);
2118 emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2119 EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2120 EMIT_ONE_BYTE (leading_codes[0]);
2121 if (leading_codes[1])
2122 EMIT_ONE_BYTE (leading_codes[1]);
2123 if (dimension == 1)
2124 EMIT_ONE_BYTE (code | 0x80);
2125 else
2126 {
2127 code |= 0x8080;
2128 EMIT_ONE_BYTE (code >> 8);
2129 EMIT_ONE_BYTE (code & 0xFF);
2130 }
2131 }
2132 }
2133 coding->result = CODING_RESULT_SUCCESS;
2134 coding->produced_char += produced_chars;
2135 coding->produced = dst - coding->destination;
2136 return 0;
2137 }
2138
2139 \f
2140 /*** 7. ISO2022 handlers ***/
2141
2142 /* The following note describes the coding system ISO2022 briefly.
2143 Since the intention of this note is to help understand the
2144 functions in this file, some parts are NOT ACCURATE or are OVERLY
2145 SIMPLIFIED. For thorough understanding, please refer to the
2146 original document of ISO2022. This is equivalent to the standard
2147 ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2148
2149 ISO2022 provides many mechanisms to encode several character sets
2150 in 7-bit and 8-bit environments. For 7-bit environments, all text
2151 is encoded using bytes less than 128. This may make the encoded
2152 text a little bit longer, but the text passes more easily through
2153 several types of gateway, some of which strip off the MSB (Most
2154 Significant Bit).
2155
2156 There are two kinds of character sets: control character sets and
2157 graphic character sets. The former contain control characters such
2158 as `newline' and `escape' to provide control functions (control
2159 functions are also provided by escape sequences). The latter
2160 contain graphic characters such as 'A' and '-'. Emacs recognizes
2161 two control character sets and many graphic character sets.
2162
2163 Graphic character sets are classified into one of the following
2164 four classes, according to the number of bytes (DIMENSION) and
2165 number of characters in one dimension (CHARS) of the set:
2166 - DIMENSION1_CHARS94
2167 - DIMENSION1_CHARS96
2168 - DIMENSION2_CHARS94
2169 - DIMENSION2_CHARS96
2170
2171 In addition, each character set is assigned an identification tag,
2172 unique for each set, called the "final character" (denoted as <F>
2173 hereafter). The <F> of each character set is decided by ECMA(*)
2174 when it is registered in ISO. The code range of <F> is 0x30..0x7F
2175 (0x30..0x3F are for private use only).
2176
2177 Note (*): ECMA = European Computer Manufacturers Association
2178
2179 Here are examples of graphic character sets [NAME(<F>)]:
2180 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2181 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2182 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2183 o DIMENSION2_CHARS96 -- none for the moment
2184
2185 A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2186 C0 [0x00..0x1F] -- control character plane 0
2187 GL [0x20..0x7F] -- graphic character plane 0
2188 C1 [0x80..0x9F] -- control character plane 1
2189 GR [0xA0..0xFF] -- graphic character plane 1
2190
2191 A control character set is directly designated and invoked to C0 or
2192 C1 by an escape sequence. The most common case is that:
2193 - ISO646's control character set is designated/invoked to C0, and
2194 - ISO6429's control character set is designated/invoked to C1,
2195 and usually these designations/invocations are omitted in encoded
2196 text. In a 7-bit environment, only C0 can be used, and a control
2197 character for C1 is encoded by an appropriate escape sequence to
2198 fit into the environment. All control characters for C1 are
2199 defined to have corresponding escape sequences.
2200
2201 A graphic character set is at first designated to one of four
2202 graphic registers (G0 through G3), then these graphic registers are
2203 invoked to GL or GR. These designations and invocations can be
2204 done independently. The most common case is that G0 is invoked to
2205 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
2206 these invocations and designations are omitted in encoded text.
2207 In a 7-bit environment, only GL can be used.
2208
2209 When a graphic character set of CHARS94 is invoked to GL, codes
2210 0x20 and 0x7F of the GL area work as control characters SPACE and
2211 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2212 be used.
2213
2214 There are two ways of invocation: locking-shift and single-shift.
2215 With locking-shift, the invocation lasts until the next different
2216 invocation, whereas with single-shift, the invocation affects the
2217 following character only and doesn't affect the locking-shift
2218 state. Invocations are done by the following control characters or
2219 escape sequences:
2220
2221 ----------------------------------------------------------------------
2222 abbrev function cntrl escape seq description
2223 ----------------------------------------------------------------------
2224 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
2225 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
2226 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
2227 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
2228 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
2229 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
2230 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
2231 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
2232 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
2233 ----------------------------------------------------------------------
2234 (*) These are not used by any known coding system.
2235
2236 Control characters for these functions are defined by macros
2237 ISO_CODE_XXX in `coding.h'.
2238
2239 Designations are done by the following escape sequences:
2240 ----------------------------------------------------------------------
2241 escape sequence description
2242 ----------------------------------------------------------------------
2243 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
2244 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
2245 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
2246 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
2247 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
2248 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
2249 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
2250 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
2251 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
2252 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
2253 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
2254 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
2255 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
2256 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
2257 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
2258 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
2259 ----------------------------------------------------------------------
2260
2261 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2262 of dimension 1, chars 94, and final character <F>, etc...
2263
2264 Note (*): Although these designations are not allowed in ISO2022,
2265 Emacs accepts them on decoding, and produces them on encoding
2266 CHARS96 character sets in a coding system which is characterized as
2267 7-bit environment, non-locking-shift, and non-single-shift.
2268
2269 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2270 '(' must be omitted. We refer to this as "short-form" hereafter.
2271
2272 Now you may notice that there are a lot of ways of encoding the
2273 same multilingual text in ISO2022. Actually, there exist many
2274 coding systems such as Compound Text (used in X11's inter client
2275 communication, ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2276 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2277 localized platforms), and all of these are variants of ISO2022.
2278
2279 In addition to the above, Emacs handles two more kinds of escape
2280 sequences: ISO6429's direction specification and Emacs' private
2281 sequence for specifying character composition.
2282
2283 ISO6429's direction specification takes the following form:
2284 o CSI ']' -- end of the current direction
2285 o CSI '0' ']' -- end of the current direction
2286 o CSI '1' ']' -- start of left-to-right text
2287 o CSI '2' ']' -- start of right-to-left text
2288 The control character CSI (0x9B: control sequence introducer) is
2289 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2290
2291 Character composition specification takes the following form:
2292 o ESC '0' -- start relative composition
2293 o ESC '1' -- end composition
2294 o ESC '2' -- start rule-base composition (*)
2295 o ESC '3' -- start relative composition with alternate chars (**)
2296 o ESC '4' -- start rule-base composition with alternate chars (**)
2297 Since these are not standard escape sequences of any ISO standard,
2298 the use of them with these meanings is restricted to Emacs only.
2299
2300 (*) This form is used only in Emacs 20.7 and older versions,
2301 but newer versions can safely decode it.
2302 (**) This form is used only in Emacs 21.1 and newer versions,
2303 and older versions can't decode it.
2304
2305 Here's a list of example usages of these composition escape
2306 sequences (categorized by `enum composition_method').
2307
2308 COMPOSITION_RELATIVE:
2309 ESC 0 CHAR [ CHAR ] ESC 1
2310 COMPOSITION_WITH_RULE:
2311 ESC 2 CHAR [ RULE CHAR ] ESC 1
2312 COMPOSITION_WITH_ALTCHARS:
2313 ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2314 COMPOSITION_WITH_RULE_ALTCHARS:
2315 ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2316
2317 enum iso_code_class_type iso_code_class[256];
2318
2319 #define SAFE_CHARSET_P(coding, id) \
2320 ((id) <= (coding)->max_charset_id \
2321 && (coding)->safe_charsets[id] >= 0)
2322
2323
2324 #define SHIFT_OUT_OK(category) \
2325 (CODING_ISO_INITIAL (&coding_categories[category], 1) >= 0)
2326
2327 static void
2328 setup_iso_safe_charsets (attrs)
2329 Lisp_Object attrs;
2330 {
2331 Lisp_Object charset_list, safe_charsets;
2332 Lisp_Object request;
2333 Lisp_Object reg_usage;
2334 Lisp_Object tail;
2335 int reg94, reg96;
2336 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2337 int max_charset_id;
2338
2339 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2340 if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2341 && ! EQ (charset_list, Viso_2022_charset_list))
2342 {
2343 CODING_ATTR_CHARSET_LIST (attrs)
2344 = charset_list = Viso_2022_charset_list;
2345 ASET (attrs, coding_attr_safe_charsets, Qnil);
2346 }
2347
2348 if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2349 return;
2350
2351 max_charset_id = 0;
2352 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2353 {
2354 int id = XINT (XCAR (tail));
2355 if (max_charset_id < id)
2356 max_charset_id = id;
2357 }
2358
2359 safe_charsets = Fmake_string (make_number (max_charset_id + 1),
2360 make_number (255));
2361 request = AREF (attrs, coding_attr_iso_request);
2362 reg_usage = AREF (attrs, coding_attr_iso_usage);
2363 reg94 = XINT (XCAR (reg_usage));
2364 reg96 = XINT (XCDR (reg_usage));
2365
2366 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2367 {
2368 Lisp_Object id;
2369 Lisp_Object reg;
2370 struct charset *charset;
2371
2372 id = XCAR (tail);
2373 charset = CHARSET_FROM_ID (XINT (id));
2374 reg = Fcdr (Fassq (id, request));
2375 if (! NILP (reg))
2376 SSET (safe_charsets, XINT (id), XINT (reg));
2377 else if (charset->iso_chars_96)
2378 {
2379 if (reg96 < 4)
2380 SSET (safe_charsets, XINT (id), reg96);
2381 }
2382 else
2383 {
2384 if (reg94 < 4)
2385 SSET (safe_charsets, XINT (id), reg94);
2386 }
2387 }
2388 ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2389 }
2390
2391
2392 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2393 Check if a text is encoded in one of ISO-2022 based codig systems.
2394 If it is, return 1, else return 0. */
2395
2396 static int
2397 detect_coding_iso_2022 (coding, detect_info)
2398 struct coding_system *coding;
2399 struct coding_detection_info *detect_info;
2400 {
2401 const unsigned char *src = coding->source, *src_base = src;
2402 const unsigned char *src_end = coding->source + coding->src_bytes;
2403 int multibytep = coding->src_multibyte;
2404 int single_shifting = 0;
2405 int id;
2406 int c, c1;
2407 int consumed_chars = 0;
2408 int i;
2409 int rejected = 0;
2410 int found = 0;
2411
2412 detect_info->checked |= CATEGORY_MASK_ISO;
2413
2414 for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
2415 {
2416 struct coding_system *this = &(coding_categories[i]);
2417 Lisp_Object attrs, val;
2418
2419 attrs = CODING_ID_ATTRS (this->id);
2420 if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2421 && ! EQ (CODING_ATTR_SAFE_CHARSETS (attrs), Viso_2022_charset_list))
2422 setup_iso_safe_charsets (attrs);
2423 val = CODING_ATTR_SAFE_CHARSETS (attrs);
2424 this->max_charset_id = SCHARS (val) - 1;
2425 this->safe_charsets = (char *) SDATA (val);
2426 }
2427
2428 /* A coding system of this category is always ASCII compatible. */
2429 src += coding->head_ascii;
2430
2431 while (rejected != CATEGORY_MASK_ISO)
2432 {
2433 ONE_MORE_BYTE (c);
2434 switch (c)
2435 {
2436 case ISO_CODE_ESC:
2437 if (inhibit_iso_escape_detection)
2438 break;
2439 single_shifting = 0;
2440 ONE_MORE_BYTE (c);
2441 if (c >= '(' && c <= '/')
2442 {
2443 /* Designation sequence for a charset of dimension 1. */
2444 ONE_MORE_BYTE (c1);
2445 if (c1 < ' ' || c1 >= 0x80
2446 || (id = iso_charset_table[0][c >= ','][c1]) < 0)
2447 /* Invalid designation sequence. Just ignore. */
2448 break;
2449 }
2450 else if (c == '$')
2451 {
2452 /* Designation sequence for a charset of dimension 2. */
2453 ONE_MORE_BYTE (c);
2454 if (c >= '@' && c <= 'B')
2455 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
2456 id = iso_charset_table[1][0][c];
2457 else if (c >= '(' && c <= '/')
2458 {
2459 ONE_MORE_BYTE (c1);
2460 if (c1 < ' ' || c1 >= 0x80
2461 || (id = iso_charset_table[1][c >= ','][c1]) < 0)
2462 /* Invalid designation sequence. Just ignore. */
2463 break;
2464 }
2465 else
2466 /* Invalid designation sequence. Just ignore it. */
2467 break;
2468 }
2469 else if (c == 'N' || c == 'O')
2470 {
2471 /* ESC <Fe> for SS2 or SS3. */
2472 single_shifting = 1;
2473 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
2474 break;
2475 }
2476 else if (c >= '0' && c <= '4')
2477 {
2478 /* ESC <Fp> for start/end composition. */
2479 found |= CATEGORY_MASK_ISO;
2480 break;
2481 }
2482 else
2483 {
2484 /* Invalid escape sequence. Just ignore it. */
2485 break;
2486 }
2487
2488 /* We found a valid designation sequence for CHARSET. */
2489 rejected |= CATEGORY_MASK_ISO_8BIT;
2490 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
2491 id))
2492 found |= CATEGORY_MASK_ISO_7;
2493 else
2494 rejected |= CATEGORY_MASK_ISO_7;
2495 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
2496 id))
2497 found |= CATEGORY_MASK_ISO_7_TIGHT;
2498 else
2499 rejected |= CATEGORY_MASK_ISO_7_TIGHT;
2500 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
2501 id))
2502 found |= CATEGORY_MASK_ISO_7_ELSE;
2503 else
2504 rejected |= CATEGORY_MASK_ISO_7_ELSE;
2505 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
2506 id))
2507 found |= CATEGORY_MASK_ISO_8_ELSE;
2508 else
2509 rejected |= CATEGORY_MASK_ISO_8_ELSE;
2510 break;
2511
2512 case ISO_CODE_SO:
2513 case ISO_CODE_SI:
2514 /* Locking shift out/in. */
2515 if (inhibit_iso_escape_detection)
2516 break;
2517 single_shifting = 0;
2518 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
2519 found |= CATEGORY_MASK_ISO_ELSE;
2520 break;
2521
2522 case ISO_CODE_CSI:
2523 /* Control sequence introducer. */
2524 single_shifting = 0;
2525 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
2526 found |= CATEGORY_MASK_ISO_8_ELSE;
2527 goto check_extra_latin;
2528
2529
2530 case ISO_CODE_SS2:
2531 case ISO_CODE_SS3:
2532 /* Single shift. */
2533 if (inhibit_iso_escape_detection)
2534 break;
2535 single_shifting = 1;
2536 rejected |= CATEGORY_MASK_ISO_7BIT;
2537 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2538 & CODING_ISO_FLAG_SINGLE_SHIFT)
2539 found |= CATEGORY_MASK_ISO_8_1;
2540 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
2541 & CODING_ISO_FLAG_SINGLE_SHIFT)
2542 found |= CATEGORY_MASK_ISO_8_2;
2543 goto check_extra_latin;
2544
2545 default:
2546 if (c < 0x80)
2547 {
2548 single_shifting = 0;
2549 break;
2550 }
2551 if (c >= 0xA0)
2552 {
2553 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
2554 found |= CATEGORY_MASK_ISO_8_1;
2555 /* Check the length of succeeding codes of the range
2556 0xA0..0FF. If the byte length is even, we include
2557 CATEGORY_MASK_ISO_8_2 in `found'. We can check this
2558 only when we are not single shifting. */
2559 if (! single_shifting
2560 && ! (rejected & CATEGORY_MASK_ISO_8_2))
2561 {
2562 int i = 1;
2563 while (src < src_end)
2564 {
2565 ONE_MORE_BYTE (c);
2566 if (c < 0xA0)
2567 break;
2568 i++;
2569 }
2570
2571 if (i & 1 && src < src_end)
2572 rejected |= CATEGORY_MASK_ISO_8_2;
2573 else
2574 found |= CATEGORY_MASK_ISO_8_2;
2575 }
2576 break;
2577 }
2578 check_extra_latin:
2579 single_shifting = 0;
2580 if (! VECTORP (Vlatin_extra_code_table)
2581 || NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
2582 {
2583 rejected = CATEGORY_MASK_ISO;
2584 break;
2585 }
2586 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2587 & CODING_ISO_FLAG_LATIN_EXTRA)
2588 found |= CATEGORY_MASK_ISO_8_1;
2589 else
2590 rejected |= CATEGORY_MASK_ISO_8_1;
2591 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
2592 & CODING_ISO_FLAG_LATIN_EXTRA)
2593 found |= CATEGORY_MASK_ISO_8_2;
2594 else
2595 rejected |= CATEGORY_MASK_ISO_8_2;
2596 }
2597 }
2598 detect_info->rejected |= CATEGORY_MASK_ISO;
2599 return 0;
2600
2601 no_more_source:
2602 detect_info->rejected |= rejected;
2603 detect_info->found |= (found & ~rejected);
2604 return 1;
2605 }
2606
2607
2608 /* Set designation state into CODING. */
2609 #define DECODE_DESIGNATION(reg, dim, chars_96, final) \
2610 do { \
2611 int id, prev; \
2612 \
2613 if (final < '0' || final >= 128 \
2614 || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0) \
2615 || !SAFE_CHARSET_P (coding, id)) \
2616 { \
2617 CODING_ISO_DESIGNATION (coding, reg) = -2; \
2618 goto invalid_code; \
2619 } \
2620 prev = CODING_ISO_DESIGNATION (coding, reg); \
2621 if (id == charset_jisx0201_roman) \
2622 { \
2623 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN) \
2624 id = charset_ascii; \
2625 } \
2626 else if (id == charset_jisx0208_1978) \
2627 { \
2628 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
2629 id = charset_jisx0208; \
2630 } \
2631 CODING_ISO_DESIGNATION (coding, reg) = id; \
2632 /* If there was an invalid designation to REG previously, and this \
2633 designation is ASCII to REG, we should keep this designation \
2634 sequence. */ \
2635 if (prev == -2 && id == charset_ascii) \
2636 goto invalid_code; \
2637 } while (0)
2638
2639
2640 #define MAYBE_FINISH_COMPOSITION() \
2641 do { \
2642 int i; \
2643 if (composition_state == COMPOSING_NO) \
2644 break; \
2645 /* It is assured that we have enough room for producing \
2646 characters stored in the table `components'. */ \
2647 if (charbuf + component_idx > charbuf_end) \
2648 goto no_more_source; \
2649 composition_state = COMPOSING_NO; \
2650 if (method == COMPOSITION_RELATIVE \
2651 || method == COMPOSITION_WITH_ALTCHARS) \
2652 { \
2653 for (i = 0; i < component_idx; i++) \
2654 *charbuf++ = components[i]; \
2655 char_offset += component_idx; \
2656 } \
2657 else \
2658 { \
2659 for (i = 0; i < component_idx; i += 2) \
2660 *charbuf++ = components[i]; \
2661 char_offset += (component_idx / 2) + 1; \
2662 } \
2663 } while (0)
2664
2665
2666 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
2667 ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
2668 ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
2669 ESC 3 : altchar composition : ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
2670 ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1
2671 */
2672
2673 #define DECODE_COMPOSITION_START(c1) \
2674 do { \
2675 if (c1 == '0' \
2676 && composition_state == COMPOSING_COMPONENT_RULE) \
2677 { \
2678 component_len = component_idx; \
2679 composition_state = COMPOSING_CHAR; \
2680 } \
2681 else \
2682 { \
2683 const unsigned char *p; \
2684 \
2685 MAYBE_FINISH_COMPOSITION (); \
2686 if (charbuf + MAX_COMPOSITION_COMPONENTS > charbuf_end) \
2687 goto no_more_source; \
2688 for (p = src; p < src_end - 1; p++) \
2689 if (*p == ISO_CODE_ESC && p[1] == '1') \
2690 break; \
2691 if (p == src_end - 1) \
2692 { \
2693 if (coding->mode & CODING_MODE_LAST_BLOCK) \
2694 goto invalid_code; \
2695 goto no_more_source; \
2696 } \
2697 \
2698 /* This is surely the start of a composition. */ \
2699 method = (c1 == '0' ? COMPOSITION_RELATIVE \
2700 : c1 == '2' ? COMPOSITION_WITH_RULE \
2701 : c1 == '3' ? COMPOSITION_WITH_ALTCHARS \
2702 : COMPOSITION_WITH_RULE_ALTCHARS); \
2703 composition_state = (c1 <= '2' ? COMPOSING_CHAR \
2704 : COMPOSING_COMPONENT_CHAR); \
2705 component_idx = component_len = 0; \
2706 } \
2707 } while (0)
2708
2709
2710 /* Handle compositoin end sequence ESC 1. */
2711
2712 #define DECODE_COMPOSITION_END() \
2713 do { \
2714 int nchars = (component_len > 0 ? component_idx - component_len \
2715 : method == COMPOSITION_RELATIVE ? component_idx \
2716 : (component_idx + 1) / 2); \
2717 int i; \
2718 int *saved_charbuf = charbuf; \
2719 int from = char_offset; \
2720 int to = from + nchars; \
2721 \
2722 ADD_COMPOSITION_DATA (charbuf, from, to, method); \
2723 if (method != COMPOSITION_RELATIVE) \
2724 { \
2725 if (component_len == 0) \
2726 for (i = 0; i < component_idx; i++) \
2727 *charbuf++ = components[i]; \
2728 else \
2729 for (i = 0; i < component_len; i++) \
2730 *charbuf++ = components[i]; \
2731 *saved_charbuf = saved_charbuf - charbuf; \
2732 } \
2733 if (method == COMPOSITION_WITH_RULE) \
2734 for (i = 0; i < component_idx; i += 2, char_offset++) \
2735 *charbuf++ = components[i]; \
2736 else \
2737 for (i = component_len; i < component_idx; i++, char_offset++) \
2738 *charbuf++ = components[i]; \
2739 coding->annotated = 1; \
2740 composition_state = COMPOSING_NO; \
2741 } while (0)
2742
2743
2744 /* Decode a composition rule from the byte C1 (and maybe one more byte
2745 from SRC) and store one encoded composition rule in
2746 coding->cmp_data. */
2747
2748 #define DECODE_COMPOSITION_RULE(c1) \
2749 do { \
2750 (c1) -= 32; \
2751 if (c1 < 81) /* old format (before ver.21) */ \
2752 { \
2753 int gref = (c1) / 9; \
2754 int nref = (c1) % 9; \
2755 if (gref == 4) gref = 10; \
2756 if (nref == 4) nref = 10; \
2757 c1 = COMPOSITION_ENCODE_RULE (gref, nref); \
2758 } \
2759 else if (c1 < 93) /* new format (after ver.21) */ \
2760 { \
2761 ONE_MORE_BYTE (c2); \
2762 c1 = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32); \
2763 } \
2764 else \
2765 c1 = 0; \
2766 } while (0)
2767
2768
2769 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
2770
2771 static void
2772 decode_coding_iso_2022 (coding)
2773 struct coding_system *coding;
2774 {
2775 const unsigned char *src = coding->source + coding->consumed;
2776 const unsigned char *src_end = coding->source + coding->src_bytes;
2777 const unsigned char *src_base;
2778 int *charbuf = coding->charbuf;
2779 int *charbuf_end
2780 = charbuf + coding->charbuf_size - 4 - MAX_ANNOTATION_LENGTH;
2781 int consumed_chars = 0, consumed_chars_base;
2782 int multibytep = coding->src_multibyte;
2783 /* Charsets invoked to graphic plane 0 and 1 respectively. */
2784 int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2785 int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
2786 struct charset *charset;
2787 int c;
2788 /* For handling composition sequence. */
2789 #define COMPOSING_NO 0
2790 #define COMPOSING_CHAR 1
2791 #define COMPOSING_RULE 2
2792 #define COMPOSING_COMPONENT_CHAR 3
2793 #define COMPOSING_COMPONENT_RULE 4
2794
2795 int composition_state = COMPOSING_NO;
2796 enum composition_method method;
2797 int components[MAX_COMPOSITION_COMPONENTS * 2 + 1];
2798 int component_idx;
2799 int component_len;
2800 Lisp_Object attrs, charset_list;
2801 int char_offset = coding->produced_char;
2802 int last_offset = char_offset;
2803 int last_id = charset_ascii;
2804
2805 CODING_GET_INFO (coding, attrs, charset_list);
2806 setup_iso_safe_charsets (attrs);
2807
2808 while (1)
2809 {
2810 int c1, c2;
2811
2812 src_base = src;
2813 consumed_chars_base = consumed_chars;
2814
2815 if (charbuf >= charbuf_end)
2816 break;
2817
2818 ONE_MORE_BYTE (c1);
2819
2820 /* We produce at most one character. */
2821 switch (iso_code_class [c1])
2822 {
2823 case ISO_0x20_or_0x7F:
2824 if (composition_state != COMPOSING_NO)
2825 {
2826 if (composition_state == COMPOSING_RULE
2827 || composition_state == COMPOSING_COMPONENT_RULE)
2828 {
2829 DECODE_COMPOSITION_RULE (c1);
2830 components[component_idx++] = c1;
2831 composition_state--;
2832 continue;
2833 }
2834 }
2835 if (charset_id_0 < 0
2836 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
2837 /* This is SPACE or DEL. */
2838 charset = CHARSET_FROM_ID (charset_ascii);
2839 else
2840 charset = CHARSET_FROM_ID (charset_id_0);
2841 break;
2842
2843 case ISO_graphic_plane_0:
2844 if (composition_state != COMPOSING_NO)
2845 {
2846 if (composition_state == COMPOSING_RULE
2847 || composition_state == COMPOSING_COMPONENT_RULE)
2848 {
2849 DECODE_COMPOSITION_RULE (c1);
2850 components[component_idx++] = c1;
2851 composition_state--;
2852 continue;
2853 }
2854 }
2855 charset = CHARSET_FROM_ID (charset_id_0);
2856 break;
2857
2858 case ISO_0xA0_or_0xFF:
2859 if (charset_id_1 < 0
2860 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
2861 || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
2862 goto invalid_code;
2863 /* This is a graphic character, we fall down ... */
2864
2865 case ISO_graphic_plane_1:
2866 if (charset_id_1 < 0)
2867 goto invalid_code;
2868 charset = CHARSET_FROM_ID (charset_id_1);
2869 break;
2870
2871 case ISO_control_0:
2872 MAYBE_FINISH_COMPOSITION ();
2873 charset = CHARSET_FROM_ID (charset_ascii);
2874 break;
2875
2876 case ISO_control_1:
2877 MAYBE_FINISH_COMPOSITION ();
2878 goto invalid_code;
2879
2880 case ISO_shift_out:
2881 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
2882 || CODING_ISO_DESIGNATION (coding, 1) < 0)
2883 goto invalid_code;
2884 CODING_ISO_INVOCATION (coding, 0) = 1;
2885 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2886 continue;
2887
2888 case ISO_shift_in:
2889 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
2890 goto invalid_code;
2891 CODING_ISO_INVOCATION (coding, 0) = 0;
2892 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2893 continue;
2894
2895 case ISO_single_shift_2_7:
2896 case ISO_single_shift_2:
2897 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
2898 goto invalid_code;
2899 /* SS2 is handled as an escape sequence of ESC 'N' */
2900 c1 = 'N';
2901 goto label_escape_sequence;
2902
2903 case ISO_single_shift_3:
2904 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
2905 goto invalid_code;
2906 /* SS2 is handled as an escape sequence of ESC 'O' */
2907 c1 = 'O';
2908 goto label_escape_sequence;
2909
2910 case ISO_control_sequence_introducer:
2911 /* CSI is handled as an escape sequence of ESC '[' ... */
2912 c1 = '[';
2913 goto label_escape_sequence;
2914
2915 case ISO_escape:
2916 ONE_MORE_BYTE (c1);
2917 label_escape_sequence:
2918 /* Escape sequences handled here are invocation,
2919 designation, direction specification, and character
2920 composition specification. */
2921 switch (c1)
2922 {
2923 case '&': /* revision of following character set */
2924 ONE_MORE_BYTE (c1);
2925 if (!(c1 >= '@' && c1 <= '~'))
2926 goto invalid_code;
2927 ONE_MORE_BYTE (c1);
2928 if (c1 != ISO_CODE_ESC)
2929 goto invalid_code;
2930 ONE_MORE_BYTE (c1);
2931 goto label_escape_sequence;
2932
2933 case '$': /* designation of 2-byte character set */
2934 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
2935 goto invalid_code;
2936 ONE_MORE_BYTE (c1);
2937 if (c1 >= '@' && c1 <= 'B')
2938 { /* designation of JISX0208.1978, GB2312.1980,
2939 or JISX0208.1980 */
2940 DECODE_DESIGNATION (0, 2, 0, c1);
2941 }
2942 else if (c1 >= 0x28 && c1 <= 0x2B)
2943 { /* designation of DIMENSION2_CHARS94 character set */
2944 ONE_MORE_BYTE (c2);
2945 DECODE_DESIGNATION (c1 - 0x28, 2, 0, c2);
2946 }
2947 else if (c1 >= 0x2C && c1 <= 0x2F)
2948 { /* designation of DIMENSION2_CHARS96 character set */
2949 ONE_MORE_BYTE (c2);
2950 DECODE_DESIGNATION (c1 - 0x2C, 2, 1, c2);
2951 }
2952 else
2953 goto invalid_code;
2954 /* We must update these variables now. */
2955 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2956 charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
2957 continue;
2958
2959 case 'n': /* invocation of locking-shift-2 */
2960 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
2961 || CODING_ISO_DESIGNATION (coding, 2) < 0)
2962 goto invalid_code;
2963 CODING_ISO_INVOCATION (coding, 0) = 2;
2964 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2965 continue;
2966
2967 case 'o': /* invocation of locking-shift-3 */
2968 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
2969 || CODING_ISO_DESIGNATION (coding, 3) < 0)
2970 goto invalid_code;
2971 CODING_ISO_INVOCATION (coding, 0) = 3;
2972 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2973 continue;
2974
2975 case 'N': /* invocation of single-shift-2 */
2976 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
2977 || CODING_ISO_DESIGNATION (coding, 2) < 0)
2978 goto invalid_code;
2979 charset = CHARSET_FROM_ID (CODING_ISO_DESIGNATION (coding, 2));
2980 ONE_MORE_BYTE (c1);
2981 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
2982 goto invalid_code;
2983 break;
2984
2985 case 'O': /* invocation of single-shift-3 */
2986 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
2987 || CODING_ISO_DESIGNATION (coding, 3) < 0)
2988 goto invalid_code;
2989 charset = CHARSET_FROM_ID (CODING_ISO_DESIGNATION (coding, 3));
2990 ONE_MORE_BYTE (c1);
2991 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
2992 goto invalid_code;
2993 break;
2994
2995 case '0': case '2': case '3': case '4': /* start composition */
2996 if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
2997 goto invalid_code;
2998 DECODE_COMPOSITION_START (c1);
2999 continue;
3000
3001 case '1': /* end composition */
3002 if (composition_state == COMPOSING_NO)
3003 goto invalid_code;
3004 DECODE_COMPOSITION_END ();
3005 continue;
3006
3007 case '[': /* specification of direction */
3008 if (! CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION)
3009 goto invalid_code;
3010 /* For the moment, nested direction is not supported.
3011 So, `coding->mode & CODING_MODE_DIRECTION' zero means
3012 left-to-right, and nozero means right-to-left. */
3013 ONE_MORE_BYTE (c1);
3014 switch (c1)
3015 {
3016 case ']': /* end of the current direction */
3017 coding->mode &= ~CODING_MODE_DIRECTION;
3018
3019 case '0': /* end of the current direction */
3020 case '1': /* start of left-to-right direction */
3021 ONE_MORE_BYTE (c1);
3022 if (c1 == ']')
3023 coding->mode &= ~CODING_MODE_DIRECTION;
3024 else
3025 goto invalid_code;
3026 break;
3027
3028 case '2': /* start of right-to-left direction */
3029 ONE_MORE_BYTE (c1);
3030 if (c1 == ']')
3031 coding->mode |= CODING_MODE_DIRECTION;
3032 else
3033 goto invalid_code;
3034 break;
3035
3036 default:
3037 goto invalid_code;
3038 }
3039 continue;
3040
3041 case '%':
3042 ONE_MORE_BYTE (c1);
3043 if (c1 == '/')
3044 {
3045 /* CTEXT extended segment:
3046 ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3047 We keep these bytes as is for the moment.
3048 They may be decoded by post-read-conversion. */
3049 int dim, M, L;
3050 int size;
3051
3052 ONE_MORE_BYTE (dim);
3053 ONE_MORE_BYTE (M);
3054 ONE_MORE_BYTE (L);
3055 size = ((M - 128) * 128) + (L - 128);
3056 if (charbuf + 8 + size > charbuf_end)
3057 goto break_loop;
3058 *charbuf++ = ISO_CODE_ESC;
3059 *charbuf++ = '%';
3060 *charbuf++ = '/';
3061 *charbuf++ = dim;
3062 *charbuf++ = BYTE8_TO_CHAR (M);
3063 *charbuf++ = BYTE8_TO_CHAR (L);
3064 while (size-- > 0)
3065 {
3066 ONE_MORE_BYTE (c1);
3067 *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3068 }
3069 }
3070 else if (c1 == 'G')
3071 {
3072 /* XFree86 extension for embedding UTF-8 in CTEXT:
3073 ESC % G --UTF-8-BYTES-- ESC % @
3074 We keep these bytes as is for the moment.
3075 They may be decoded by post-read-conversion. */
3076 int *p = charbuf;
3077
3078 if (p + 6 > charbuf_end)
3079 goto break_loop;
3080 *p++ = ISO_CODE_ESC;
3081 *p++ = '%';
3082 *p++ = 'G';
3083 while (p < charbuf_end)
3084 {
3085 ONE_MORE_BYTE (c1);
3086 if (c1 == ISO_CODE_ESC
3087 && src + 1 < src_end
3088 && src[0] == '%'
3089 && src[1] == '@')
3090 break;
3091 *p++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3092 }
3093 if (p + 3 > charbuf_end)
3094 goto break_loop;
3095 *p++ = ISO_CODE_ESC;
3096 *p++ = '%';
3097 *p++ = '@';
3098 charbuf = p;
3099 }
3100 else
3101 goto invalid_code;
3102 continue;
3103 break;
3104
3105 default:
3106 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3107 goto invalid_code;
3108 if (c1 >= 0x28 && c1 <= 0x2B)
3109 { /* designation of DIMENSION1_CHARS94 character set */
3110 ONE_MORE_BYTE (c2);
3111 DECODE_DESIGNATION (c1 - 0x28, 1, 0, c2);
3112 }
3113 else if (c1 >= 0x2C && c1 <= 0x2F)
3114 { /* designation of DIMENSION1_CHARS96 character set */
3115 ONE_MORE_BYTE (c2);
3116 DECODE_DESIGNATION (c1 - 0x2C, 1, 1, c2);
3117 }
3118 else
3119 goto invalid_code;
3120 /* We must update these variables now. */
3121 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3122 charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3123 continue;
3124 }
3125 }
3126
3127 if (charset->id != charset_ascii
3128 && last_id != charset->id)
3129 {
3130 if (last_id != charset_ascii)
3131 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
3132 last_id = charset->id;
3133 last_offset = char_offset;
3134 }
3135
3136 /* Now we know CHARSET and 1st position code C1 of a character.
3137 Produce a decoded character while getting 2nd position code
3138 C2 if necessary. */
3139 c1 &= 0x7F;
3140 if (CHARSET_DIMENSION (charset) > 1)
3141 {
3142 ONE_MORE_BYTE (c2);
3143 if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
3144 /* C2 is not in a valid range. */
3145 goto invalid_code;
3146 c1 = (c1 << 8) | (c2 & 0x7F);
3147 if (CHARSET_DIMENSION (charset) > 2)
3148 {
3149 ONE_MORE_BYTE (c2);
3150 if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
3151 /* C2 is not in a valid range. */
3152 goto invalid_code;
3153 c1 = (c1 << 8) | (c2 & 0x7F);
3154 }
3155 }
3156
3157 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3158 if (c < 0)
3159 {
3160 MAYBE_FINISH_COMPOSITION ();
3161 for (; src_base < src; src_base++, char_offset++)
3162 {
3163 if (ASCII_BYTE_P (*src_base))
3164 *charbuf++ = *src_base;
3165 else
3166 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3167 }
3168 }
3169 else if (composition_state == COMPOSING_NO)
3170 {
3171 *charbuf++ = c;
3172 char_offset++;
3173 }
3174 else
3175 {
3176 components[component_idx++] = c;
3177 if (method == COMPOSITION_WITH_RULE
3178 || (method == COMPOSITION_WITH_RULE_ALTCHARS
3179 && composition_state == COMPOSING_COMPONENT_CHAR))
3180 composition_state++;
3181 }
3182 continue;
3183
3184 invalid_code:
3185 MAYBE_FINISH_COMPOSITION ();
3186 src = src_base;
3187 consumed_chars = consumed_chars_base;
3188 ONE_MORE_BYTE (c);
3189 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
3190 char_offset++;
3191 coding->errors++;
3192 continue;
3193
3194 break_loop:
3195 break;
3196 }
3197
3198 no_more_source:
3199 if (last_id != charset_ascii)
3200 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
3201 coding->consumed_char += consumed_chars_base;
3202 coding->consumed = src_base - coding->source;
3203 coding->charbuf_used = charbuf - coding->charbuf;
3204 }
3205
3206
3207 /* ISO2022 encoding stuff. */
3208
3209 /*
3210 It is not enough to say just "ISO2022" on encoding, we have to
3211 specify more details. In Emacs, each coding system of ISO2022
3212 variant has the following specifications:
3213 1. Initial designation to G0 thru G3.
3214 2. Allows short-form designation?
3215 3. ASCII should be designated to G0 before control characters?
3216 4. ASCII should be designated to G0 at end of line?
3217 5. 7-bit environment or 8-bit environment?
3218 6. Use locking-shift?
3219 7. Use Single-shift?
3220 And the following two are only for Japanese:
3221 8. Use ASCII in place of JIS0201-1976-Roman?
3222 9. Use JISX0208-1983 in place of JISX0208-1978?
3223 These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
3224 defined by macros CODING_ISO_FLAG_XXX. See `coding.h' for more
3225 details.
3226 */
3227
3228 /* Produce codes (escape sequence) for designating CHARSET to graphic
3229 register REG at DST, and increment DST. If <final-char> of CHARSET is
3230 '@', 'A', or 'B' and the coding system CODING allows, produce
3231 designation sequence of short-form. */
3232
3233 #define ENCODE_DESIGNATION(charset, reg, coding) \
3234 do { \
3235 unsigned char final_char = CHARSET_ISO_FINAL (charset); \
3236 char *intermediate_char_94 = "()*+"; \
3237 char *intermediate_char_96 = ",-./"; \
3238 int revision = -1; \
3239 int c; \
3240 \
3241 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION) \
3242 revision = CHARSET_ISO_REVISION (charset); \
3243 \
3244 if (revision >= 0) \
3245 { \
3246 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&'); \
3247 EMIT_ONE_BYTE ('@' + revision); \
3248 } \
3249 EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC); \
3250 if (CHARSET_DIMENSION (charset) == 1) \
3251 { \
3252 if (! CHARSET_ISO_CHARS_96 (charset)) \
3253 c = intermediate_char_94[reg]; \
3254 else \
3255 c = intermediate_char_96[reg]; \
3256 EMIT_ONE_ASCII_BYTE (c); \
3257 } \
3258 else \
3259 { \
3260 EMIT_ONE_ASCII_BYTE ('$'); \
3261 if (! CHARSET_ISO_CHARS_96 (charset)) \
3262 { \
3263 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM \
3264 || reg != 0 \
3265 || final_char < '@' || final_char > 'B') \
3266 EMIT_ONE_ASCII_BYTE (intermediate_char_94[reg]); \
3267 } \
3268 else \
3269 EMIT_ONE_ASCII_BYTE (intermediate_char_96[reg]); \
3270 } \
3271 EMIT_ONE_ASCII_BYTE (final_char); \
3272 \
3273 CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset); \
3274 } while (0)
3275
3276
3277 /* The following two macros produce codes (control character or escape
3278 sequence) for ISO2022 single-shift functions (single-shift-2 and
3279 single-shift-3). */
3280
3281 #define ENCODE_SINGLE_SHIFT_2 \
3282 do { \
3283 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3284 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N'); \
3285 else \
3286 EMIT_ONE_BYTE (ISO_CODE_SS2); \
3287 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \
3288 } while (0)
3289
3290
3291 #define ENCODE_SINGLE_SHIFT_3 \
3292 do { \
3293 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3294 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O'); \
3295 else \
3296 EMIT_ONE_BYTE (ISO_CODE_SS3); \
3297 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \
3298 } while (0)
3299
3300
3301 /* The following four macros produce codes (control character or
3302 escape sequence) for ISO2022 locking-shift functions (shift-in,
3303 shift-out, locking-shift-2, and locking-shift-3). */
3304
3305 #define ENCODE_SHIFT_IN \
3306 do { \
3307 EMIT_ONE_ASCII_BYTE (ISO_CODE_SI); \
3308 CODING_ISO_INVOCATION (coding, 0) = 0; \
3309 } while (0)
3310
3311
3312 #define ENCODE_SHIFT_OUT \
3313 do { \
3314 EMIT_ONE_ASCII_BYTE (ISO_CODE_SO); \
3315 CODING_ISO_INVOCATION (coding, 0) = 1; \
3316 } while (0)
3317
3318
3319 #define ENCODE_LOCKING_SHIFT_2 \
3320 do { \
3321 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n'); \
3322 CODING_ISO_INVOCATION (coding, 0) = 2; \
3323 } while (0)
3324
3325
3326 #define ENCODE_LOCKING_SHIFT_3 \
3327 do { \
3328 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n'); \
3329 CODING_ISO_INVOCATION (coding, 0) = 3; \
3330 } while (0)
3331
3332
3333 /* Produce codes for a DIMENSION1 character whose character set is
3334 CHARSET and whose position-code is C1. Designation and invocation
3335 sequences are also produced in advance if necessary. */
3336
3337 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1) \
3338 do { \
3339 int id = CHARSET_ID (charset); \
3340 \
3341 if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN) \
3342 && id == charset_ascii) \
3343 { \
3344 id = charset_jisx0201_roman; \
3345 charset = CHARSET_FROM_ID (id); \
3346 } \
3347 \
3348 if (CODING_ISO_SINGLE_SHIFTING (coding)) \
3349 { \
3350 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3351 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \
3352 else \
3353 EMIT_ONE_BYTE (c1 | 0x80); \
3354 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \
3355 break; \
3356 } \
3357 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \
3358 { \
3359 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \
3360 break; \
3361 } \
3362 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \
3363 { \
3364 EMIT_ONE_BYTE (c1 | 0x80); \
3365 break; \
3366 } \
3367 else \
3368 /* Since CHARSET is not yet invoked to any graphic planes, we \
3369 must invoke it, or, at first, designate it to some graphic \
3370 register. Then repeat the loop to actually produce the \
3371 character. */ \
3372 dst = encode_invocation_designation (charset, coding, dst, \
3373 &produced_chars); \
3374 } while (1)
3375
3376
3377 /* Produce codes for a DIMENSION2 character whose character set is
3378 CHARSET and whose position-codes are C1 and C2. Designation and
3379 invocation codes are also produced in advance if necessary. */
3380
3381 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2) \
3382 do { \
3383 int id = CHARSET_ID (charset); \
3384 \
3385 if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
3386 && id == charset_jisx0208) \
3387 { \
3388 id = charset_jisx0208_1978; \
3389 charset = CHARSET_FROM_ID (id); \
3390 } \
3391 \
3392 if (CODING_ISO_SINGLE_SHIFTING (coding)) \
3393 { \
3394 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3395 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \
3396 else \
3397 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \
3398 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \
3399 break; \
3400 } \
3401 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \
3402 { \
3403 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \
3404 break; \
3405 } \
3406 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \
3407 { \
3408 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \
3409 break; \
3410 } \
3411 else \
3412 /* Since CHARSET is not yet invoked to any graphic planes, we \
3413 must invoke it, or, at first, designate it to some graphic \
3414 register. Then repeat the loop to actually produce the \
3415 character. */ \
3416 dst = encode_invocation_designation (charset, coding, dst, \
3417 &produced_chars); \
3418 } while (1)
3419
3420
3421 #define ENCODE_ISO_CHARACTER(charset, c) \
3422 do { \
3423 int code = ENCODE_CHAR ((charset),(c)); \
3424 \
3425 if (CHARSET_DIMENSION (charset) == 1) \
3426 ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code); \
3427 else \
3428 ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
3429 } while (0)
3430
3431
3432 /* Produce designation and invocation codes at a place pointed by DST
3433 to use CHARSET. The element `spec.iso_2022' of *CODING is updated.
3434 Return new DST. */
3435
3436 unsigned char *
3437 encode_invocation_designation (charset, coding, dst, p_nchars)
3438 struct charset *charset;
3439 struct coding_system *coding;
3440 unsigned char *dst;
3441 int *p_nchars;
3442 {
3443 int multibytep = coding->dst_multibyte;
3444 int produced_chars = *p_nchars;
3445 int reg; /* graphic register number */
3446 int id = CHARSET_ID (charset);
3447
3448 /* At first, check designations. */
3449 for (reg = 0; reg < 4; reg++)
3450 if (id == CODING_ISO_DESIGNATION (coding, reg))
3451 break;
3452
3453 if (reg >= 4)
3454 {
3455 /* CHARSET is not yet designated to any graphic registers. */
3456 /* At first check the requested designation. */
3457 reg = CODING_ISO_REQUEST (coding, id);
3458 if (reg < 0)
3459 /* Since CHARSET requests no special designation, designate it
3460 to graphic register 0. */
3461 reg = 0;
3462
3463 ENCODE_DESIGNATION (charset, reg, coding);
3464 }
3465
3466 if (CODING_ISO_INVOCATION (coding, 0) != reg
3467 && CODING_ISO_INVOCATION (coding, 1) != reg)
3468 {
3469 /* Since the graphic register REG is not invoked to any graphic
3470 planes, invoke it to graphic plane 0. */
3471 switch (reg)
3472 {
3473 case 0: /* graphic register 0 */
3474 ENCODE_SHIFT_IN;
3475 break;
3476
3477 case 1: /* graphic register 1 */
3478 ENCODE_SHIFT_OUT;
3479 break;
3480
3481 case 2: /* graphic register 2 */
3482 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3483 ENCODE_SINGLE_SHIFT_2;
3484 else
3485 ENCODE_LOCKING_SHIFT_2;
3486 break;
3487
3488 case 3: /* graphic register 3 */
3489 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3490 ENCODE_SINGLE_SHIFT_3;
3491 else
3492 ENCODE_LOCKING_SHIFT_3;
3493 break;
3494 }
3495 }
3496
3497 *p_nchars = produced_chars;
3498 return dst;
3499 }
3500
3501 /* The following three macros produce codes for indicating direction
3502 of text. */
3503 #define ENCODE_CONTROL_SEQUENCE_INTRODUCER \
3504 do { \
3505 if (CODING_ISO_FLAGS (coding) == CODING_ISO_FLAG_SEVEN_BITS) \
3506 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '['); \
3507 else \
3508 EMIT_ONE_BYTE (ISO_CODE_CSI); \
3509 } while (0)
3510
3511
3512 #define ENCODE_DIRECTION_R2L() \
3513 do { \
3514 ENCODE_CONTROL_SEQUENCE_INTRODUCER (dst); \
3515 EMIT_TWO_ASCII_BYTES ('2', ']'); \
3516 } while (0)
3517
3518
3519 #define ENCODE_DIRECTION_L2R() \
3520 do { \
3521 ENCODE_CONTROL_SEQUENCE_INTRODUCER (dst); \
3522 EMIT_TWO_ASCII_BYTES ('0', ']'); \
3523 } while (0)
3524
3525
3526 /* Produce codes for designation and invocation to reset the graphic
3527 planes and registers to initial state. */
3528 #define ENCODE_RESET_PLANE_AND_REGISTER() \
3529 do { \
3530 int reg; \
3531 struct charset *charset; \
3532 \
3533 if (CODING_ISO_INVOCATION (coding, 0) != 0) \
3534 ENCODE_SHIFT_IN; \
3535 for (reg = 0; reg < 4; reg++) \
3536 if (CODING_ISO_INITIAL (coding, reg) >= 0 \
3537 && (CODING_ISO_DESIGNATION (coding, reg) \
3538 != CODING_ISO_INITIAL (coding, reg))) \
3539 { \
3540 charset = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg)); \
3541 ENCODE_DESIGNATION (charset, reg, coding); \
3542 } \
3543 } while (0)
3544
3545
3546 /* Produce designation sequences of charsets in the line started from
3547 SRC to a place pointed by DST, and return updated DST.
3548
3549 If the current block ends before any end-of-line, we may fail to
3550 find all the necessary designations. */
3551
3552 static unsigned char *
3553 encode_designation_at_bol (coding, charbuf, charbuf_end, dst)
3554 struct coding_system *coding;
3555 int *charbuf, *charbuf_end;
3556 unsigned char *dst;
3557 {
3558 struct charset *charset;
3559 /* Table of charsets to be designated to each graphic register. */
3560 int r[4];
3561 int c, found = 0, reg;
3562 int produced_chars = 0;
3563 int multibytep = coding->dst_multibyte;
3564 Lisp_Object attrs;
3565 Lisp_Object charset_list;
3566
3567 attrs = CODING_ID_ATTRS (coding->id);
3568 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
3569 if (EQ (charset_list, Qiso_2022))
3570 charset_list = Viso_2022_charset_list;
3571
3572 for (reg = 0; reg < 4; reg++)
3573 r[reg] = -1;
3574
3575 while (found < 4)
3576 {
3577 int id;
3578
3579 c = *charbuf++;
3580 if (c == '\n')
3581 break;
3582 charset = char_charset (c, charset_list, NULL);
3583 id = CHARSET_ID (charset);
3584 reg = CODING_ISO_REQUEST (coding, id);
3585 if (reg >= 0 && r[reg] < 0)
3586 {
3587 found++;
3588 r[reg] = id;
3589 }
3590 }
3591
3592 if (found)
3593 {
3594 for (reg = 0; reg < 4; reg++)
3595 if (r[reg] >= 0
3596 && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
3597 ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
3598 }
3599
3600 return dst;
3601 }
3602
3603 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
3604
3605 static int
3606 encode_coding_iso_2022 (coding)
3607 struct coding_system *coding;
3608 {
3609 int multibytep = coding->dst_multibyte;
3610 int *charbuf = coding->charbuf;
3611 int *charbuf_end = charbuf + coding->charbuf_used;
3612 unsigned char *dst = coding->destination + coding->produced;
3613 unsigned char *dst_end = coding->destination + coding->dst_bytes;
3614 int safe_room = 16;
3615 int bol_designation
3616 = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
3617 && CODING_ISO_BOL (coding));
3618 int produced_chars = 0;
3619 Lisp_Object attrs, eol_type, charset_list;
3620 int ascii_compatible;
3621 int c;
3622 int preferred_charset_id = -1;
3623
3624 CODING_GET_INFO (coding, attrs, charset_list);
3625 eol_type = CODING_ID_EOL_TYPE (coding->id);
3626 if (VECTORP (eol_type))
3627 eol_type = Qunix;
3628
3629 setup_iso_safe_charsets (attrs);
3630 /* Charset list may have been changed. */
3631 charset_list = CODING_ATTR_CHARSET_LIST (attrs); \
3632 coding->safe_charsets = (char *) SDATA (CODING_ATTR_SAFE_CHARSETS(attrs));
3633
3634 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
3635
3636 while (charbuf < charbuf_end)
3637 {
3638 ASSURE_DESTINATION (safe_room);
3639
3640 if (bol_designation)
3641 {
3642 unsigned char *dst_prev = dst;
3643
3644 /* We have to produce designation sequences if any now. */
3645 dst = encode_designation_at_bol (coding, charbuf, charbuf_end, dst);
3646 bol_designation = 0;
3647 /* We are sure that designation sequences are all ASCII bytes. */
3648 produced_chars += dst - dst_prev;
3649 }
3650
3651 c = *charbuf++;
3652
3653 if (c < 0)
3654 {
3655 /* Handle an annotation. */
3656 switch (*charbuf)
3657 {
3658 case CODING_ANNOTATE_COMPOSITION_MASK:
3659 /* Not yet implemented. */
3660 break;
3661 case CODING_ANNOTATE_CHARSET_MASK:
3662 preferred_charset_id = charbuf[3];
3663 if (preferred_charset_id >= 0
3664 && NILP (Fmemq (make_number (preferred_charset_id),
3665 charset_list)))
3666 preferred_charset_id = -1;
3667 break;
3668 default:
3669 abort ();
3670 }
3671 charbuf += -c - 1;
3672 continue;
3673 }
3674
3675 /* Now encode the character C. */
3676 if (c < 0x20 || c == 0x7F)
3677 {
3678 if (c == '\n'
3679 || (c == '\r' && EQ (eol_type, Qmac)))
3680 {
3681 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
3682 ENCODE_RESET_PLANE_AND_REGISTER ();
3683 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
3684 {
3685 int i;
3686
3687 for (i = 0; i < 4; i++)
3688 CODING_ISO_DESIGNATION (coding, i)
3689 = CODING_ISO_INITIAL (coding, i);
3690 }
3691 bol_designation
3692 = CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL;
3693 }
3694 else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
3695 ENCODE_RESET_PLANE_AND_REGISTER ();
3696 EMIT_ONE_ASCII_BYTE (c);
3697 }
3698 else if (ASCII_CHAR_P (c))
3699 {
3700 if (ascii_compatible)
3701 EMIT_ONE_ASCII_BYTE (c);
3702 else
3703 {
3704 struct charset *charset = CHARSET_FROM_ID (charset_ascii);
3705 ENCODE_ISO_CHARACTER (charset, c);
3706 }
3707 }
3708 else if (CHAR_BYTE8_P (c))
3709 {
3710 c = CHAR_TO_BYTE8 (c);
3711 EMIT_ONE_BYTE (c);
3712 }
3713 else
3714 {
3715 struct charset *charset;
3716
3717 if (preferred_charset_id >= 0)
3718 {
3719 charset = CHARSET_FROM_ID (preferred_charset_id);
3720 if (! CHAR_CHARSET_P (c, charset))
3721 charset = char_charset (c, charset_list, NULL);
3722 }
3723 else
3724 charset = char_charset (c, charset_list, NULL);
3725 if (!charset)
3726 {
3727 if (coding->mode & CODING_MODE_SAFE_ENCODING)
3728 {
3729 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
3730 charset = CHARSET_FROM_ID (charset_ascii);
3731 }
3732 else
3733 {
3734 c = coding->default_char;
3735 charset = char_charset (c, charset_list, NULL);
3736 }
3737 }
3738 ENCODE_ISO_CHARACTER (charset, c);
3739 }
3740 }
3741
3742 if (coding->mode & CODING_MODE_LAST_BLOCK
3743 && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
3744 {
3745 ASSURE_DESTINATION (safe_room);
3746 ENCODE_RESET_PLANE_AND_REGISTER ();
3747 }
3748 coding->result = CODING_RESULT_SUCCESS;
3749 CODING_ISO_BOL (coding) = bol_designation;
3750 coding->produced_char += produced_chars;
3751 coding->produced = dst - coding->destination;
3752 return 0;
3753 }
3754
3755 \f
3756 /*** 8,9. SJIS and BIG5 handlers ***/
3757
3758 /* Although SJIS and BIG5 are not ISO's coding system, they are used
3759 quite widely. So, for the moment, Emacs supports them in the bare
3760 C code. But, in the future, they may be supported only by CCL. */
3761
3762 /* SJIS is a coding system encoding three character sets: ASCII, right
3763 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
3764 as is. A character of charset katakana-jisx0201 is encoded by
3765 "position-code + 0x80". A character of charset japanese-jisx0208
3766 is encoded in 2-byte but two position-codes are divided and shifted
3767 so that it fit in the range below.
3768
3769 --- CODE RANGE of SJIS ---
3770 (character set) (range)
3771 ASCII 0x00 .. 0x7F
3772 KATAKANA-JISX0201 0xA0 .. 0xDF
3773 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF
3774 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
3775 -------------------------------
3776
3777 */
3778
3779 /* BIG5 is a coding system encoding two character sets: ASCII and
3780 Big5. An ASCII character is encoded as is. Big5 is a two-byte
3781 character set and is encoded in two-byte.
3782
3783 --- CODE RANGE of BIG5 ---
3784 (character set) (range)
3785 ASCII 0x00 .. 0x7F
3786 Big5 (1st byte) 0xA1 .. 0xFE
3787 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
3788 --------------------------
3789
3790 */
3791
3792 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3793 Check if a text is encoded in SJIS. If it is, return
3794 CATEGORY_MASK_SJIS, else return 0. */
3795
3796 static int
3797 detect_coding_sjis (coding, detect_info)
3798 struct coding_system *coding;
3799 struct coding_detection_info *detect_info;
3800 {
3801 const unsigned char *src = coding->source, *src_base = src;
3802 const unsigned char *src_end = coding->source + coding->src_bytes;
3803 int multibytep = coding->src_multibyte;
3804 int consumed_chars = 0;
3805 int found = 0;
3806 int c;
3807 int incomplete;
3808
3809 detect_info->checked |= CATEGORY_MASK_SJIS;
3810 /* A coding system of this category is always ASCII compatible. */
3811 src += coding->head_ascii;
3812
3813 while (1)
3814 {
3815 incomplete = 0;
3816 ONE_MORE_BYTE (c);
3817 incomplete = 1;
3818 if (c < 0x80)
3819 continue;
3820 if ((c >= 0x81 && c <= 0x9F) || (c >= 0xE0 && c <= 0xEF))
3821 {
3822 ONE_MORE_BYTE (c);
3823 if (c < 0x40 || c == 0x7F || c > 0xFC)
3824 break;
3825 found = CATEGORY_MASK_SJIS;
3826 }
3827 else if (c >= 0xA0 && c < 0xE0)
3828 found = CATEGORY_MASK_SJIS;
3829 else
3830 break;
3831 }
3832 detect_info->rejected |= CATEGORY_MASK_SJIS;
3833 return 0;
3834
3835 no_more_source:
3836 if (incomplete && coding->mode & CODING_MODE_LAST_BLOCK)
3837 {
3838 detect_info->rejected |= CATEGORY_MASK_SJIS;
3839 return 0;
3840 }
3841 detect_info->found |= found;
3842 return 1;
3843 }
3844
3845 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3846 Check if a text is encoded in BIG5. If it is, return
3847 CATEGORY_MASK_BIG5, else return 0. */
3848
3849 static int
3850 detect_coding_big5 (coding, detect_info)
3851 struct coding_system *coding;
3852 struct coding_detection_info *detect_info;
3853 {
3854 const unsigned char *src = coding->source, *src_base = src;
3855 const unsigned char *src_end = coding->source + coding->src_bytes;
3856 int multibytep = coding->src_multibyte;
3857 int consumed_chars = 0;
3858 int found = 0;
3859 int c;
3860 int incomplete;
3861
3862 detect_info->checked |= CATEGORY_MASK_BIG5;
3863 /* A coding system of this category is always ASCII compatible. */
3864 src += coding->head_ascii;
3865
3866 while (1)
3867 {
3868 incomplete = 0;
3869 ONE_MORE_BYTE (c);
3870 incomplete = 1;
3871 if (c < 0x80)
3872 continue;
3873 if (c >= 0xA1)
3874 {
3875 ONE_MORE_BYTE (c);
3876 if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
3877 return 0;
3878 found = CATEGORY_MASK_BIG5;
3879 }
3880 else
3881 break;
3882 }
3883 detect_info->rejected |= CATEGORY_MASK_BIG5;
3884 return 0;
3885
3886 no_more_source:
3887 if (incomplete && coding->mode & CODING_MODE_LAST_BLOCK)
3888 {
3889 detect_info->rejected |= CATEGORY_MASK_BIG5;
3890 return 0;
3891 }
3892 detect_info->found |= found;
3893 return 1;
3894 }
3895
3896 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
3897 If SJIS_P is 1, decode SJIS text, else decode BIG5 test. */
3898
3899 static void
3900 decode_coding_sjis (coding)
3901 struct coding_system *coding;
3902 {
3903 const unsigned char *src = coding->source + coding->consumed;
3904 const unsigned char *src_end = coding->source + coding->src_bytes;
3905 const unsigned char *src_base;
3906 int *charbuf = coding->charbuf;
3907 int *charbuf_end = charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
3908 int consumed_chars = 0, consumed_chars_base;
3909 int multibytep = coding->src_multibyte;
3910 struct charset *charset_roman, *charset_kanji, *charset_kana;
3911 Lisp_Object attrs, charset_list, val;
3912 int char_offset = coding->produced_char;
3913 int last_offset = char_offset;
3914 int last_id = charset_ascii;
3915
3916 CODING_GET_INFO (coding, attrs, charset_list);
3917
3918 val = charset_list;
3919 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
3920 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
3921 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
3922
3923 while (1)
3924 {
3925 int c, c1;
3926 struct charset *charset;
3927
3928 src_base = src;
3929 consumed_chars_base = consumed_chars;
3930
3931 if (charbuf >= charbuf_end)
3932 break;
3933
3934 ONE_MORE_BYTE (c);
3935
3936 if (c < 0x80)
3937 charset = charset_roman;
3938 else
3939 {
3940 if (c >= 0xF0)
3941 goto invalid_code;
3942 if (c < 0xA0 || c >= 0xE0)
3943 {
3944 /* SJIS -> JISX0208 */
3945 ONE_MORE_BYTE (c1);
3946 if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
3947 goto invalid_code;
3948 c = (c << 8) | c1;
3949 SJIS_TO_JIS (c);
3950 charset = charset_kanji;
3951 }
3952 else if (c > 0xA0)
3953 {
3954 /* SJIS -> JISX0201-Kana */
3955 c &= 0x7F;
3956 charset = charset_kana;
3957 }
3958 else
3959 goto invalid_code;
3960 }
3961 if (charset->id != charset_ascii
3962 && last_id != charset->id)
3963 {
3964 if (last_id != charset_ascii)
3965 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
3966 last_id = charset->id;
3967 last_offset = char_offset;
3968 }
3969 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
3970 *charbuf++ = c;
3971 char_offset++;
3972 continue;
3973
3974 invalid_code:
3975 src = src_base;
3976 consumed_chars = consumed_chars_base;
3977 ONE_MORE_BYTE (c);
3978 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
3979 char_offset++;
3980 coding->errors++;
3981 }
3982
3983 no_more_source:
3984 if (last_id != charset_ascii)
3985 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
3986 coding->consumed_char += consumed_chars_base;
3987 coding->consumed = src_base - coding->source;
3988 coding->charbuf_used = charbuf - coding->charbuf;
3989 }
3990
3991 static void
3992 decode_coding_big5 (coding)
3993 struct coding_system *coding;
3994 {
3995 const unsigned char *src = coding->source + coding->consumed;
3996 const unsigned char *src_end = coding->source + coding->src_bytes;
3997 const unsigned char *src_base;
3998 int *charbuf = coding->charbuf;
3999 int *charbuf_end = charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
4000 int consumed_chars = 0, consumed_chars_base;
4001 int multibytep = coding->src_multibyte;
4002 struct charset *charset_roman, *charset_big5;
4003 Lisp_Object attrs, charset_list, val;
4004 int char_offset = coding->produced_char;
4005 int last_offset = char_offset;
4006 int last_id = charset_ascii;
4007
4008 CODING_GET_INFO (coding, attrs, charset_list);
4009 val = charset_list;
4010 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4011 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4012
4013 while (1)
4014 {
4015 int c, c1;
4016 struct charset *charset;
4017
4018 src_base = src;
4019 consumed_chars_base = consumed_chars;
4020
4021 if (charbuf >= charbuf_end)
4022 break;
4023
4024 ONE_MORE_BYTE (c);
4025
4026 if (c < 0x80)
4027 charset = charset_roman;
4028 else
4029 {
4030 /* BIG5 -> Big5 */
4031 if (c < 0xA1 || c > 0xFE)
4032 goto invalid_code;
4033 ONE_MORE_BYTE (c1);
4034 if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4035 goto invalid_code;
4036 c = c << 8 | c1;
4037 charset = charset_big5;
4038 }
4039 if (charset->id != charset_ascii
4040 && last_id != charset->id)
4041 {
4042 if (last_id != charset_ascii)
4043 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4044 last_id = charset->id;
4045 last_offset = char_offset;
4046 }
4047 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4048 *charbuf++ = c;
4049 char_offset++;
4050 continue;
4051
4052 invalid_code:
4053 src = src_base;
4054 consumed_chars = consumed_chars_base;
4055 ONE_MORE_BYTE (c);
4056 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
4057 char_offset++;
4058 coding->errors++;
4059 }
4060
4061 no_more_source:
4062 if (last_id != charset_ascii)
4063 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4064 coding->consumed_char += consumed_chars_base;
4065 coding->consumed = src_base - coding->source;
4066 coding->charbuf_used = charbuf - coding->charbuf;
4067 }
4068
4069 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4070 This function can encode charsets `ascii', `katakana-jisx0201',
4071 `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'. We
4072 are sure that all these charsets are registered as official charset
4073 (i.e. do not have extended leading-codes). Characters of other
4074 charsets are produced without any encoding. If SJIS_P is 1, encode
4075 SJIS text, else encode BIG5 text. */
4076
4077 static int
4078 encode_coding_sjis (coding)
4079 struct coding_system *coding;
4080 {
4081 int multibytep = coding->dst_multibyte;
4082 int *charbuf = coding->charbuf;
4083 int *charbuf_end = charbuf + coding->charbuf_used;
4084 unsigned char *dst = coding->destination + coding->produced;
4085 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4086 int safe_room = 4;
4087 int produced_chars = 0;
4088 Lisp_Object attrs, charset_list, val;
4089 int ascii_compatible;
4090 struct charset *charset_roman, *charset_kanji, *charset_kana;
4091 int c;
4092
4093 CODING_GET_INFO (coding, attrs, charset_list);
4094 val = charset_list;
4095 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4096 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4097 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
4098
4099 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4100
4101 while (charbuf < charbuf_end)
4102 {
4103 ASSURE_DESTINATION (safe_room);
4104 c = *charbuf++;
4105 /* Now encode the character C. */
4106 if (ASCII_CHAR_P (c) && ascii_compatible)
4107 EMIT_ONE_ASCII_BYTE (c);
4108 else if (CHAR_BYTE8_P (c))
4109 {
4110 c = CHAR_TO_BYTE8 (c);
4111 EMIT_ONE_BYTE (c);
4112 }
4113 else
4114 {
4115 unsigned code;
4116 struct charset *charset = char_charset (c, charset_list, &code);
4117
4118 if (!charset)
4119 {
4120 if (coding->mode & CODING_MODE_SAFE_ENCODING)
4121 {
4122 code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4123 charset = CHARSET_FROM_ID (charset_ascii);
4124 }
4125 else
4126 {
4127 c = coding->default_char;
4128 charset = char_charset (c, charset_list, &code);
4129 }
4130 }
4131 if (code == CHARSET_INVALID_CODE (charset))
4132 abort ();
4133 if (charset == charset_kanji)
4134 {
4135 int c1, c2;
4136 JIS_TO_SJIS (code);
4137 c1 = code >> 8, c2 = code & 0xFF;
4138 EMIT_TWO_BYTES (c1, c2);
4139 }
4140 else if (charset == charset_kana)
4141 EMIT_ONE_BYTE (code | 0x80);
4142 else
4143 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4144 }
4145 }
4146 coding->result = CODING_RESULT_SUCCESS;
4147 coding->produced_char += produced_chars;
4148 coding->produced = dst - coding->destination;
4149 return 0;
4150 }
4151
4152 static int
4153 encode_coding_big5 (coding)
4154 struct coding_system *coding;
4155 {
4156 int multibytep = coding->dst_multibyte;
4157 int *charbuf = coding->charbuf;
4158 int *charbuf_end = charbuf + coding->charbuf_used;
4159 unsigned char *dst = coding->destination + coding->produced;
4160 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4161 int safe_room = 4;
4162 int produced_chars = 0;
4163 Lisp_Object attrs, charset_list, val;
4164 int ascii_compatible;
4165 struct charset *charset_roman, *charset_big5;
4166 int c;
4167
4168 CODING_GET_INFO (coding, attrs, charset_list);
4169 val = charset_list;
4170 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4171 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4172 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4173
4174 while (charbuf < charbuf_end)
4175 {
4176 ASSURE_DESTINATION (safe_room);
4177 c = *charbuf++;
4178 /* Now encode the character C. */
4179 if (ASCII_CHAR_P (c) && ascii_compatible)
4180 EMIT_ONE_ASCII_BYTE (c);
4181 else if (CHAR_BYTE8_P (c))
4182 {
4183 c = CHAR_TO_BYTE8 (c);
4184 EMIT_ONE_BYTE (c);
4185 }
4186 else
4187 {
4188 unsigned code;
4189 struct charset *charset = char_charset (c, charset_list, &code);
4190
4191 if (! charset)
4192 {
4193 if (coding->mode & CODING_MODE_SAFE_ENCODING)
4194 {
4195 code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4196 charset = CHARSET_FROM_ID (charset_ascii);
4197 }
4198 else
4199 {
4200 c = coding->default_char;
4201 charset = char_charset (c, charset_list, &code);
4202 }
4203 }
4204 if (code == CHARSET_INVALID_CODE (charset))
4205 abort ();
4206 if (charset == charset_big5)
4207 {
4208 int c1, c2;
4209
4210 c1 = code >> 8, c2 = code & 0xFF;
4211 EMIT_TWO_BYTES (c1, c2);
4212 }
4213 else
4214 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4215 }
4216 }
4217 coding->result = CODING_RESULT_SUCCESS;
4218 coding->produced_char += produced_chars;
4219 coding->produced = dst - coding->destination;
4220 return 0;
4221 }
4222
4223 \f
4224 /*** 10. CCL handlers ***/
4225
4226 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4227 Check if a text is encoded in a coding system of which
4228 encoder/decoder are written in CCL program. If it is, return
4229 CATEGORY_MASK_CCL, else return 0. */
4230
4231 static int
4232 detect_coding_ccl (coding, detect_info)
4233 struct coding_system *coding;
4234 struct coding_detection_info *detect_info;
4235 {
4236 const unsigned char *src = coding->source, *src_base = src;
4237 const unsigned char *src_end = coding->source + coding->src_bytes;
4238 int multibytep = coding->src_multibyte;
4239 int consumed_chars = 0;
4240 int found = 0;
4241 unsigned char *valids = CODING_CCL_VALIDS (coding);
4242 int head_ascii = coding->head_ascii;
4243 Lisp_Object attrs;
4244
4245 detect_info->checked |= CATEGORY_MASK_CCL;
4246
4247 coding = &coding_categories[coding_category_ccl];
4248 attrs = CODING_ID_ATTRS (coding->id);
4249 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
4250 src += head_ascii;
4251
4252 while (1)
4253 {
4254 int c;
4255 ONE_MORE_BYTE (c);
4256 if (! valids[c])
4257 break;
4258 if ((valids[c] > 1))
4259 found = CATEGORY_MASK_CCL;
4260 }
4261 detect_info->rejected |= CATEGORY_MASK_CCL;
4262 return 0;
4263
4264 no_more_source:
4265 detect_info->found |= found;
4266 return 1;
4267 }
4268
4269 static void
4270 decode_coding_ccl (coding)
4271 struct coding_system *coding;
4272 {
4273 const unsigned char *src = coding->source + coding->consumed;
4274 const unsigned char *src_end = coding->source + coding->src_bytes;
4275 int *charbuf = coding->charbuf;
4276 int *charbuf_end = charbuf + coding->charbuf_size;
4277 int consumed_chars = 0;
4278 int multibytep = coding->src_multibyte;
4279 struct ccl_program ccl;
4280 int source_charbuf[1024];
4281 int source_byteidx[1024];
4282 Lisp_Object attrs, charset_list;
4283
4284 CODING_GET_INFO (coding, attrs, charset_list);
4285 setup_ccl_program (&ccl, CODING_CCL_DECODER (coding));
4286
4287 while (src < src_end)
4288 {
4289 const unsigned char *p = src;
4290 int *source, *source_end;
4291 int i = 0;
4292
4293 if (multibytep)
4294 while (i < 1024 && p < src_end)
4295 {
4296 source_byteidx[i] = p - src;
4297 source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
4298 }
4299 else
4300 while (i < 1024 && p < src_end)
4301 source_charbuf[i++] = *p++;
4302
4303 if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
4304 ccl.last_block = 1;
4305
4306 source = source_charbuf;
4307 source_end = source + i;
4308 while (source < source_end)
4309 {
4310 ccl_driver (&ccl, source, charbuf,
4311 source_end - source, charbuf_end - charbuf,
4312 charset_list);
4313 source += ccl.consumed;
4314 charbuf += ccl.produced;
4315 if (ccl.status != CCL_STAT_SUSPEND_BY_DST)
4316 break;
4317 }
4318 if (source < source_end)
4319 src += source_byteidx[source - source_charbuf];
4320 else
4321 src = p;
4322 consumed_chars += source - source_charbuf;
4323
4324 if (ccl.status != CCL_STAT_SUSPEND_BY_SRC
4325 && ccl.status != CODING_RESULT_INSUFFICIENT_SRC)
4326 break;
4327 }
4328
4329 switch (ccl.status)
4330 {
4331 case CCL_STAT_SUSPEND_BY_SRC:
4332 coding->result = CODING_RESULT_INSUFFICIENT_SRC;
4333 break;
4334 case CCL_STAT_SUSPEND_BY_DST:
4335 break;
4336 case CCL_STAT_QUIT:
4337 case CCL_STAT_INVALID_CMD:
4338 coding->result = CODING_RESULT_INTERRUPT;
4339 break;
4340 default:
4341 coding->result = CODING_RESULT_SUCCESS;
4342 break;
4343 }
4344 coding->consumed_char += consumed_chars;
4345 coding->consumed = src - coding->source;
4346 coding->charbuf_used = charbuf - coding->charbuf;
4347 }
4348
4349 static int
4350 encode_coding_ccl (coding)
4351 struct coding_system *coding;
4352 {
4353 struct ccl_program ccl;
4354 int multibytep = coding->dst_multibyte;
4355 int *charbuf = coding->charbuf;
4356 int *charbuf_end = charbuf + coding->charbuf_used;
4357 unsigned char *dst = coding->destination + coding->produced;
4358 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4359 unsigned char *adjusted_dst_end = dst_end - 1;
4360 int destination_charbuf[1024];
4361 int i, produced_chars = 0;
4362 Lisp_Object attrs, charset_list;
4363
4364 CODING_GET_INFO (coding, attrs, charset_list);
4365 setup_ccl_program (&ccl, CODING_CCL_ENCODER (coding));
4366
4367 ccl.last_block = coding->mode & CODING_MODE_LAST_BLOCK;
4368 ccl.dst_multibyte = coding->dst_multibyte;
4369
4370 while (charbuf < charbuf_end && dst < adjusted_dst_end)
4371 {
4372 int dst_bytes = dst_end - dst;
4373 if (dst_bytes > 1024)
4374 dst_bytes = 1024;
4375
4376 ccl_driver (&ccl, charbuf, destination_charbuf,
4377 charbuf_end - charbuf, dst_bytes, charset_list);
4378 charbuf += ccl.consumed;
4379 if (multibytep)
4380 for (i = 0; i < ccl.produced; i++)
4381 EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
4382 else
4383 {
4384 for (i = 0; i < ccl.produced; i++)
4385 *dst++ = destination_charbuf[i] & 0xFF;
4386 produced_chars += ccl.produced;
4387 }
4388 }
4389
4390 switch (ccl.status)
4391 {
4392 case CCL_STAT_SUSPEND_BY_SRC:
4393 coding->result = CODING_RESULT_INSUFFICIENT_SRC;
4394 break;
4395 case CCL_STAT_SUSPEND_BY_DST:
4396 coding->result = CODING_RESULT_INSUFFICIENT_DST;
4397 break;
4398 case CCL_STAT_QUIT:
4399 case CCL_STAT_INVALID_CMD:
4400 coding->result = CODING_RESULT_INTERRUPT;
4401 break;
4402 default:
4403 coding->result = CODING_RESULT_SUCCESS;
4404 break;
4405 }
4406
4407 coding->produced_char += produced_chars;
4408 coding->produced = dst - coding->destination;
4409 return 0;
4410 }
4411
4412
4413 \f
4414 /*** 10, 11. no-conversion handlers ***/
4415
4416 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
4417
4418 static void
4419 decode_coding_raw_text (coding)
4420 struct coding_system *coding;
4421 {
4422 coding->chars_at_source = 1;
4423 coding->consumed_char = 0;
4424 coding->consumed = 0;
4425 coding->result = CODING_RESULT_SUCCESS;
4426 }
4427
4428 static int
4429 encode_coding_raw_text (coding)
4430 struct coding_system *coding;
4431 {
4432 int multibytep = coding->dst_multibyte;
4433 int *charbuf = coding->charbuf;
4434 int *charbuf_end = coding->charbuf + coding->charbuf_used;
4435 unsigned char *dst = coding->destination + coding->produced;
4436 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4437 int produced_chars = 0;
4438 int c;
4439
4440 if (multibytep)
4441 {
4442 int safe_room = MAX_MULTIBYTE_LENGTH * 2;
4443
4444 if (coding->src_multibyte)
4445 while (charbuf < charbuf_end)
4446 {
4447 ASSURE_DESTINATION (safe_room);
4448 c = *charbuf++;
4449 if (ASCII_CHAR_P (c))
4450 EMIT_ONE_ASCII_BYTE (c);
4451 else if (CHAR_BYTE8_P (c))
4452 {
4453 c = CHAR_TO_BYTE8 (c);
4454 EMIT_ONE_BYTE (c);
4455 }
4456 else
4457 {
4458 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
4459
4460 CHAR_STRING_ADVANCE (c, p1);
4461 while (p0 < p1)
4462 {
4463 EMIT_ONE_BYTE (*p0);
4464 p0++;
4465 }
4466 }
4467 }
4468 else
4469 while (charbuf < charbuf_end)
4470 {
4471 ASSURE_DESTINATION (safe_room);
4472 c = *charbuf++;
4473 EMIT_ONE_BYTE (c);
4474 }
4475 }
4476 else
4477 {
4478 if (coding->src_multibyte)
4479 {
4480 int safe_room = MAX_MULTIBYTE_LENGTH;
4481
4482 while (charbuf < charbuf_end)
4483 {
4484 ASSURE_DESTINATION (safe_room);
4485 c = *charbuf++;
4486 if (ASCII_CHAR_P (c))
4487 *dst++ = c;
4488 else if (CHAR_BYTE8_P (c))
4489 *dst++ = CHAR_TO_BYTE8 (c);
4490 else
4491 CHAR_STRING_ADVANCE (c, dst);
4492 produced_chars++;
4493 }
4494 }
4495 else
4496 {
4497 ASSURE_DESTINATION (charbuf_end - charbuf);
4498 while (charbuf < charbuf_end && dst < dst_end)
4499 *dst++ = *charbuf++;
4500 produced_chars = dst - (coding->destination + coding->dst_bytes);
4501 }
4502 }
4503 coding->result = CODING_RESULT_SUCCESS;
4504 coding->produced_char += produced_chars;
4505 coding->produced = dst - coding->destination;
4506 return 0;
4507 }
4508
4509 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4510 Check if a text is encoded in a charset-based coding system. If it
4511 is, return 1, else return 0. */
4512
4513 static int
4514 detect_coding_charset (coding, detect_info)
4515 struct coding_system *coding;
4516 struct coding_detection_info *detect_info;
4517 {
4518 const unsigned char *src = coding->source, *src_base = src;
4519 const unsigned char *src_end = coding->source + coding->src_bytes;
4520 int multibytep = coding->src_multibyte;
4521 int consumed_chars = 0;
4522 Lisp_Object attrs, valids;
4523 int found = 0;
4524
4525 detect_info->checked |= CATEGORY_MASK_CHARSET;
4526
4527 coding = &coding_categories[coding_category_charset];
4528 attrs = CODING_ID_ATTRS (coding->id);
4529 valids = AREF (attrs, coding_attr_charset_valids);
4530
4531 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
4532 src += coding->head_ascii;
4533
4534 while (1)
4535 {
4536 int c;
4537
4538 ONE_MORE_BYTE (c);
4539 if (NILP (AREF (valids, c)))
4540 break;
4541 if (c >= 0x80)
4542 found = CATEGORY_MASK_CHARSET;
4543 }
4544 detect_info->rejected |= CATEGORY_MASK_CHARSET;
4545 return 0;
4546
4547 no_more_source:
4548 detect_info->found |= found;
4549 return 1;
4550 }
4551
4552 static void
4553 decode_coding_charset (coding)
4554 struct coding_system *coding;
4555 {
4556 const unsigned char *src = coding->source + coding->consumed;
4557 const unsigned char *src_end = coding->source + coding->src_bytes;
4558 const unsigned char *src_base;
4559 int *charbuf = coding->charbuf;
4560 int *charbuf_end = charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
4561 int consumed_chars = 0, consumed_chars_base;
4562 int multibytep = coding->src_multibyte;
4563 Lisp_Object attrs, charset_list, valids;
4564 int char_offset = coding->produced_char;
4565 int last_offset = char_offset;
4566 int last_id = charset_ascii;
4567
4568 CODING_GET_INFO (coding, attrs, charset_list);
4569 valids = AREF (attrs, coding_attr_charset_valids);
4570
4571 while (1)
4572 {
4573 int c;
4574 Lisp_Object val;
4575 struct charset *charset;
4576 int dim;
4577 int len = 1;
4578 unsigned code;
4579
4580 src_base = src;
4581 consumed_chars_base = consumed_chars;
4582
4583 if (charbuf >= charbuf_end)
4584 break;
4585
4586 ONE_MORE_BYTE (c);
4587 code = c;
4588
4589 val = AREF (valids, c);
4590 if (NILP (val))
4591 goto invalid_code;
4592 if (INTEGERP (val))
4593 {
4594 charset = CHARSET_FROM_ID (XFASTINT (val));
4595 dim = CHARSET_DIMENSION (charset);
4596 while (len < dim)
4597 {
4598 ONE_MORE_BYTE (c);
4599 code = (code << 8) | c;
4600 len++;
4601 }
4602 CODING_DECODE_CHAR (coding, src, src_base, src_end,
4603 charset, code, c);
4604 }
4605 else
4606 {
4607 /* VAL is a list of charset IDs. It is assured that the
4608 list is sorted by charset dimensions (smaller one
4609 comes first). */
4610 while (CONSP (val))
4611 {
4612 charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
4613 dim = CHARSET_DIMENSION (charset);
4614 while (len < dim)
4615 {
4616 ONE_MORE_BYTE (c);
4617 code = (code << 8) | c;
4618 len++;
4619 }
4620 CODING_DECODE_CHAR (coding, src, src_base,
4621 src_end, charset, code, c);
4622 if (c >= 0)
4623 break;
4624 val = XCDR (val);
4625 }
4626 }
4627 if (c < 0)
4628 goto invalid_code;
4629 if (charset->id != charset_ascii
4630 && last_id != charset->id)
4631 {
4632 if (last_id != charset_ascii)
4633 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4634 last_id = charset->id;
4635 last_offset = char_offset;
4636 }
4637
4638 *charbuf++ = c;
4639 char_offset++;
4640 continue;
4641
4642 invalid_code:
4643 src = src_base;
4644 consumed_chars = consumed_chars_base;
4645 ONE_MORE_BYTE (c);
4646 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
4647 char_offset++;
4648 coding->errors++;
4649 }
4650
4651 no_more_source:
4652 if (last_id != charset_ascii)
4653 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4654 coding->consumed_char += consumed_chars_base;
4655 coding->consumed = src_base - coding->source;
4656 coding->charbuf_used = charbuf - coding->charbuf;
4657 }
4658
4659 static int
4660 encode_coding_charset (coding)
4661 struct coding_system *coding;
4662 {
4663 int multibytep = coding->dst_multibyte;
4664 int *charbuf = coding->charbuf;
4665 int *charbuf_end = charbuf + coding->charbuf_used;
4666 unsigned char *dst = coding->destination + coding->produced;
4667 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4668 int safe_room = MAX_MULTIBYTE_LENGTH;
4669 int produced_chars = 0;
4670 Lisp_Object attrs, charset_list;
4671 int ascii_compatible;
4672 int c;
4673
4674 CODING_GET_INFO (coding, attrs, charset_list);
4675 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4676
4677 while (charbuf < charbuf_end)
4678 {
4679 struct charset *charset;
4680 unsigned code;
4681
4682 ASSURE_DESTINATION (safe_room);
4683 c = *charbuf++;
4684 if (ascii_compatible && ASCII_CHAR_P (c))
4685 EMIT_ONE_ASCII_BYTE (c);
4686 else if (CHAR_BYTE8_P (c))
4687 {
4688 c = CHAR_TO_BYTE8 (c);
4689 EMIT_ONE_BYTE (c);
4690 }
4691 else
4692 {
4693 charset = char_charset (c, charset_list, &code);
4694 if (charset)
4695 {
4696 if (CHARSET_DIMENSION (charset) == 1)
4697 EMIT_ONE_BYTE (code);
4698 else if (CHARSET_DIMENSION (charset) == 2)
4699 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
4700 else if (CHARSET_DIMENSION (charset) == 3)
4701 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
4702 else
4703 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
4704 (code >> 8) & 0xFF, code & 0xFF);
4705 }
4706 else
4707 {
4708 if (coding->mode & CODING_MODE_SAFE_ENCODING)
4709 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4710 else
4711 c = coding->default_char;
4712 EMIT_ONE_BYTE (c);
4713 }
4714 }
4715 }
4716
4717 coding->result = CODING_RESULT_SUCCESS;
4718 coding->produced_char += produced_chars;
4719 coding->produced = dst - coding->destination;
4720 return 0;
4721 }
4722
4723 \f
4724 /*** 7. C library functions ***/
4725
4726 /* Setup coding context CODING from information about CODING_SYSTEM.
4727 If CODING_SYSTEM is nil, `no-conversion' is assumed. If
4728 CODING_SYSTEM is invalid, signal an error. */
4729
4730 void
4731 setup_coding_system (coding_system, coding)
4732 Lisp_Object coding_system;
4733 struct coding_system *coding;
4734 {
4735 Lisp_Object attrs;
4736 Lisp_Object eol_type;
4737 Lisp_Object coding_type;
4738 Lisp_Object val;
4739
4740 if (NILP (coding_system))
4741 coding_system = Qno_conversion;
4742
4743 CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
4744
4745 attrs = CODING_ID_ATTRS (coding->id);
4746 eol_type = CODING_ID_EOL_TYPE (coding->id);
4747
4748 coding->mode = 0;
4749 coding->head_ascii = -1;
4750 coding->common_flags
4751 = (VECTORP (eol_type) ? CODING_REQUIRE_DETECTION_MASK : 0);
4752 if (! NILP (CODING_ATTR_POST_READ (attrs)))
4753 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
4754 if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
4755 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
4756 if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
4757 coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
4758
4759 val = CODING_ATTR_SAFE_CHARSETS (attrs);
4760 coding->max_charset_id = SCHARS (val) - 1;
4761 coding->safe_charsets = (char *) SDATA (val);
4762 coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
4763
4764 coding_type = CODING_ATTR_TYPE (attrs);
4765 if (EQ (coding_type, Qundecided))
4766 {
4767 coding->detector = NULL;
4768 coding->decoder = decode_coding_raw_text;
4769 coding->encoder = encode_coding_raw_text;
4770 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
4771 }
4772 else if (EQ (coding_type, Qiso_2022))
4773 {
4774 int i;
4775 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
4776
4777 /* Invoke graphic register 0 to plane 0. */
4778 CODING_ISO_INVOCATION (coding, 0) = 0;
4779 /* Invoke graphic register 1 to plane 1 if we can use 8-bit. */
4780 CODING_ISO_INVOCATION (coding, 1)
4781 = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
4782 /* Setup the initial status of designation. */
4783 for (i = 0; i < 4; i++)
4784 CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
4785 /* Not single shifting initially. */
4786 CODING_ISO_SINGLE_SHIFTING (coding) = 0;
4787 /* Beginning of buffer should also be regarded as bol. */
4788 CODING_ISO_BOL (coding) = 1;
4789 coding->detector = detect_coding_iso_2022;
4790 coding->decoder = decode_coding_iso_2022;
4791 coding->encoder = encode_coding_iso_2022;
4792 if (flags & CODING_ISO_FLAG_SAFE)
4793 coding->mode |= CODING_MODE_SAFE_ENCODING;
4794 coding->common_flags
4795 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
4796 | CODING_REQUIRE_FLUSHING_MASK);
4797 if (flags & CODING_ISO_FLAG_COMPOSITION)
4798 coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
4799 if (flags & CODING_ISO_FLAG_DESIGNATION)
4800 coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
4801 if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
4802 {
4803 setup_iso_safe_charsets (attrs);
4804 val = CODING_ATTR_SAFE_CHARSETS (attrs);
4805 coding->max_charset_id = SCHARS (val) - 1;
4806 coding->safe_charsets = (char *) SDATA (val);
4807 }
4808 CODING_ISO_FLAGS (coding) = flags;
4809 }
4810 else if (EQ (coding_type, Qcharset))
4811 {
4812 coding->detector = detect_coding_charset;
4813 coding->decoder = decode_coding_charset;
4814 coding->encoder = encode_coding_charset;
4815 coding->common_flags
4816 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4817 }
4818 else if (EQ (coding_type, Qutf_8))
4819 {
4820 coding->detector = detect_coding_utf_8;
4821 coding->decoder = decode_coding_utf_8;
4822 coding->encoder = encode_coding_utf_8;
4823 coding->common_flags
4824 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4825 }
4826 else if (EQ (coding_type, Qutf_16))
4827 {
4828 val = AREF (attrs, coding_attr_utf_16_bom);
4829 CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_16_detect_bom
4830 : EQ (val, Qt) ? utf_16_with_bom
4831 : utf_16_without_bom);
4832 val = AREF (attrs, coding_attr_utf_16_endian);
4833 CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
4834 : utf_16_little_endian);
4835 CODING_UTF_16_SURROGATE (coding) = 0;
4836 coding->detector = detect_coding_utf_16;
4837 coding->decoder = decode_coding_utf_16;
4838 coding->encoder = encode_coding_utf_16;
4839 coding->common_flags
4840 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4841 if (CODING_UTF_16_BOM (coding) == utf_16_detect_bom)
4842 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
4843 }
4844 else if (EQ (coding_type, Qccl))
4845 {
4846 coding->detector = detect_coding_ccl;
4847 coding->decoder = decode_coding_ccl;
4848 coding->encoder = encode_coding_ccl;
4849 coding->common_flags
4850 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
4851 | CODING_REQUIRE_FLUSHING_MASK);
4852 }
4853 else if (EQ (coding_type, Qemacs_mule))
4854 {
4855 coding->detector = detect_coding_emacs_mule;
4856 coding->decoder = decode_coding_emacs_mule;
4857 coding->encoder = encode_coding_emacs_mule;
4858 coding->common_flags
4859 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4860 if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
4861 && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
4862 {
4863 Lisp_Object tail, safe_charsets;
4864 int max_charset_id = 0;
4865
4866 for (tail = Vemacs_mule_charset_list; CONSP (tail);
4867 tail = XCDR (tail))
4868 if (max_charset_id < XFASTINT (XCAR (tail)))
4869 max_charset_id = XFASTINT (XCAR (tail));
4870 safe_charsets = Fmake_string (make_number (max_charset_id + 1),
4871 make_number (255));
4872 for (tail = Vemacs_mule_charset_list; CONSP (tail);
4873 tail = XCDR (tail))
4874 SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
4875 coding->max_charset_id = max_charset_id;
4876 coding->safe_charsets = (char *) SDATA (safe_charsets);
4877 }
4878 }
4879 else if (EQ (coding_type, Qshift_jis))
4880 {
4881 coding->detector = detect_coding_sjis;
4882 coding->decoder = decode_coding_sjis;
4883 coding->encoder = encode_coding_sjis;
4884 coding->common_flags
4885 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4886 }
4887 else if (EQ (coding_type, Qbig5))
4888 {
4889 coding->detector = detect_coding_big5;
4890 coding->decoder = decode_coding_big5;
4891 coding->encoder = encode_coding_big5;
4892 coding->common_flags
4893 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4894 }
4895 else /* EQ (coding_type, Qraw_text) */
4896 {
4897 coding->detector = NULL;
4898 coding->decoder = decode_coding_raw_text;
4899 coding->encoder = encode_coding_raw_text;
4900 }
4901
4902 return;
4903 }
4904
4905 /* Return raw-text or one of its subsidiaries that has the same
4906 eol_type as CODING-SYSTEM. */
4907
4908 Lisp_Object
4909 raw_text_coding_system (coding_system)
4910 Lisp_Object coding_system;
4911 {
4912 Lisp_Object spec, attrs;
4913 Lisp_Object eol_type, raw_text_eol_type;
4914
4915 if (NILP (coding_system))
4916 return Qraw_text;
4917 spec = CODING_SYSTEM_SPEC (coding_system);
4918 attrs = AREF (spec, 0);
4919
4920 if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
4921 return coding_system;
4922
4923 eol_type = AREF (spec, 2);
4924 if (VECTORP (eol_type))
4925 return Qraw_text;
4926 spec = CODING_SYSTEM_SPEC (Qraw_text);
4927 raw_text_eol_type = AREF (spec, 2);
4928 return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
4929 : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
4930 : AREF (raw_text_eol_type, 2));
4931 }
4932
4933
4934 /* If CODING_SYSTEM doesn't specify end-of-line format but PARENT
4935 does, return one of the subsidiary that has the same eol-spec as
4936 PARENT. Otherwise, return CODING_SYSTEM. */
4937
4938 Lisp_Object
4939 coding_inherit_eol_type (coding_system, parent)
4940 Lisp_Object coding_system, parent;
4941 {
4942 Lisp_Object spec, eol_type;
4943
4944 if (NILP (coding_system))
4945 coding_system = Qraw_text;
4946 spec = CODING_SYSTEM_SPEC (coding_system);
4947 eol_type = AREF (spec, 2);
4948 if (VECTORP (eol_type)
4949 && ! NILP (parent))
4950 {
4951 Lisp_Object parent_spec;
4952 Lisp_Object parent_eol_type;
4953
4954 parent_spec
4955 = CODING_SYSTEM_SPEC (buffer_defaults.buffer_file_coding_system);
4956 parent_eol_type = AREF (parent_spec, 2);
4957 if (EQ (parent_eol_type, Qunix))
4958 coding_system = AREF (eol_type, 0);
4959 else if (EQ (parent_eol_type, Qdos))
4960 coding_system = AREF (eol_type, 1);
4961 else if (EQ (parent_eol_type, Qmac))
4962 coding_system = AREF (eol_type, 2);
4963 }
4964 return coding_system;
4965 }
4966
4967 /* Emacs has a mechanism to automatically detect a coding system if it
4968 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
4969 it's impossible to distinguish some coding systems accurately
4970 because they use the same range of codes. So, at first, coding
4971 systems are categorized into 7, those are:
4972
4973 o coding-category-emacs-mule
4974
4975 The category for a coding system which has the same code range
4976 as Emacs' internal format. Assigned the coding-system (Lisp
4977 symbol) `emacs-mule' by default.
4978
4979 o coding-category-sjis
4980
4981 The category for a coding system which has the same code range
4982 as SJIS. Assigned the coding-system (Lisp
4983 symbol) `japanese-shift-jis' by default.
4984
4985 o coding-category-iso-7
4986
4987 The category for a coding system which has the same code range
4988 as ISO2022 of 7-bit environment. This doesn't use any locking
4989 shift and single shift functions. This can encode/decode all
4990 charsets. Assigned the coding-system (Lisp symbol)
4991 `iso-2022-7bit' by default.
4992
4993 o coding-category-iso-7-tight
4994
4995 Same as coding-category-iso-7 except that this can
4996 encode/decode only the specified charsets.
4997
4998 o coding-category-iso-8-1
4999
5000 The category for a coding system which has the same code range
5001 as ISO2022 of 8-bit environment and graphic plane 1 used only
5002 for DIMENSION1 charset. This doesn't use any locking shift
5003 and single shift functions. Assigned the coding-system (Lisp
5004 symbol) `iso-latin-1' by default.
5005
5006 o coding-category-iso-8-2
5007
5008 The category for a coding system which has the same code range
5009 as ISO2022 of 8-bit environment and graphic plane 1 used only
5010 for DIMENSION2 charset. This doesn't use any locking shift
5011 and single shift functions. Assigned the coding-system (Lisp
5012 symbol) `japanese-iso-8bit' by default.
5013
5014 o coding-category-iso-7-else
5015
5016 The category for a coding system which has the same code range
5017 as ISO2022 of 7-bit environemnt but uses locking shift or
5018 single shift functions. Assigned the coding-system (Lisp
5019 symbol) `iso-2022-7bit-lock' by default.
5020
5021 o coding-category-iso-8-else
5022
5023 The category for a coding system which has the same code range
5024 as ISO2022 of 8-bit environemnt but uses locking shift or
5025 single shift functions. Assigned the coding-system (Lisp
5026 symbol) `iso-2022-8bit-ss2' by default.
5027
5028 o coding-category-big5
5029
5030 The category for a coding system which has the same code range
5031 as BIG5. Assigned the coding-system (Lisp symbol)
5032 `cn-big5' by default.
5033
5034 o coding-category-utf-8
5035
5036 The category for a coding system which has the same code range
5037 as UTF-8 (cf. RFC2279). Assigned the coding-system (Lisp
5038 symbol) `utf-8' by default.
5039
5040 o coding-category-utf-16-be
5041
5042 The category for a coding system in which a text has an
5043 Unicode signature (cf. Unicode Standard) in the order of BIG
5044 endian at the head. Assigned the coding-system (Lisp symbol)
5045 `utf-16-be' by default.
5046
5047 o coding-category-utf-16-le
5048
5049 The category for a coding system in which a text has an
5050 Unicode signature (cf. Unicode Standard) in the order of
5051 LITTLE endian at the head. Assigned the coding-system (Lisp
5052 symbol) `utf-16-le' by default.
5053
5054 o coding-category-ccl
5055
5056 The category for a coding system of which encoder/decoder is
5057 written in CCL programs. The default value is nil, i.e., no
5058 coding system is assigned.
5059
5060 o coding-category-binary
5061
5062 The category for a coding system not categorized in any of the
5063 above. Assigned the coding-system (Lisp symbol)
5064 `no-conversion' by default.
5065
5066 Each of them is a Lisp symbol and the value is an actual
5067 `coding-system's (this is also a Lisp symbol) assigned by a user.
5068 What Emacs does actually is to detect a category of coding system.
5069 Then, it uses a `coding-system' assigned to it. If Emacs can't
5070 decide only one possible category, it selects a category of the
5071 highest priority. Priorities of categories are also specified by a
5072 user in a Lisp variable `coding-category-list'.
5073
5074 */
5075
5076 #define EOL_SEEN_NONE 0
5077 #define EOL_SEEN_LF 1
5078 #define EOL_SEEN_CR 2
5079 #define EOL_SEEN_CRLF 4
5080
5081 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
5082 SOURCE is encoded. If CATEGORY is one of
5083 coding_category_utf_16_XXXX, assume that CR and LF are encoded by
5084 two-byte, else they are encoded by one-byte.
5085
5086 Return one of EOL_SEEN_XXX. */
5087
5088 #define MAX_EOL_CHECK_COUNT 3
5089
5090 static int
5091 detect_eol (source, src_bytes, category)
5092 unsigned char *source;
5093 EMACS_INT src_bytes;
5094 enum coding_category category;
5095 {
5096 unsigned char *src = source, *src_end = src + src_bytes;
5097 unsigned char c;
5098 int total = 0;
5099 int eol_seen = EOL_SEEN_NONE;
5100
5101 if ((1 << category) & CATEGORY_MASK_UTF_16)
5102 {
5103 int msb, lsb;
5104
5105 msb = category == (coding_category_utf_16_le
5106 | coding_category_utf_16_le_nosig);
5107 lsb = 1 - msb;
5108
5109 while (src + 1 < src_end)
5110 {
5111 c = src[lsb];
5112 if (src[msb] == 0 && (c == '\n' || c == '\r'))
5113 {
5114 int this_eol;
5115
5116 if (c == '\n')
5117 this_eol = EOL_SEEN_LF;
5118 else if (src + 3 >= src_end
5119 || src[msb + 2] != 0
5120 || src[lsb + 2] != '\n')
5121 this_eol = EOL_SEEN_CR;
5122 else
5123 this_eol = EOL_SEEN_CRLF;
5124
5125 if (eol_seen == EOL_SEEN_NONE)
5126 /* This is the first end-of-line. */
5127 eol_seen = this_eol;
5128 else if (eol_seen != this_eol)
5129 {
5130 /* The found type is different from what found before. */
5131 eol_seen = EOL_SEEN_LF;
5132 break;
5133 }
5134 if (++total == MAX_EOL_CHECK_COUNT)
5135 break;
5136 }
5137 src += 2;
5138 }
5139 }
5140 else
5141 {
5142 while (src < src_end)
5143 {
5144 c = *src++;
5145 if (c == '\n' || c == '\r')
5146 {
5147 int this_eol;
5148
5149 if (c == '\n')
5150 this_eol = EOL_SEEN_LF;
5151 else if (src >= src_end || *src != '\n')
5152 this_eol = EOL_SEEN_CR;
5153 else
5154 this_eol = EOL_SEEN_CRLF, src++;
5155
5156 if (eol_seen == EOL_SEEN_NONE)
5157 /* This is the first end-of-line. */
5158 eol_seen = this_eol;
5159 else if (eol_seen != this_eol)
5160 {
5161 /* The found type is different from what found before. */
5162 eol_seen = EOL_SEEN_LF;
5163 break;
5164 }
5165 if (++total == MAX_EOL_CHECK_COUNT)
5166 break;
5167 }
5168 }
5169 }
5170 return eol_seen;
5171 }
5172
5173
5174 static Lisp_Object
5175 adjust_coding_eol_type (coding, eol_seen)
5176 struct coding_system *coding;
5177 int eol_seen;
5178 {
5179 Lisp_Object eol_type;
5180
5181 eol_type = CODING_ID_EOL_TYPE (coding->id);
5182 if (eol_seen & EOL_SEEN_LF)
5183 {
5184 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
5185 eol_type = Qunix;
5186 }
5187 else if (eol_seen & EOL_SEEN_CRLF)
5188 {
5189 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
5190 eol_type = Qdos;
5191 }
5192 else if (eol_seen & EOL_SEEN_CR)
5193 {
5194 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
5195 eol_type = Qmac;
5196 }
5197 return eol_type;
5198 }
5199
5200 /* Detect how a text specified in CODING is encoded. If a coding
5201 system is detected, update fields of CODING by the detected coding
5202 system. */
5203
5204 void
5205 detect_coding (coding)
5206 struct coding_system *coding;
5207 {
5208 const unsigned char *src, *src_end;
5209 Lisp_Object attrs, coding_type;
5210
5211 coding->consumed = coding->consumed_char = 0;
5212 coding->produced = coding->produced_char = 0;
5213 coding_set_source (coding);
5214
5215 src_end = coding->source + coding->src_bytes;
5216
5217 /* If we have not yet decided the text encoding type, detect it
5218 now. */
5219 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
5220 {
5221 int c, i;
5222
5223 for (i = 0, src = coding->source; src < src_end; i++, src++)
5224 {
5225 c = *src;
5226 if (c & 0x80 || (c < 0x20 && (c == 0
5227 || c == ISO_CODE_ESC
5228 || c == ISO_CODE_SI
5229 || c == ISO_CODE_SO)))
5230 break;
5231 }
5232 /* Skipped bytes must be even for utf-16 detector. */
5233 if (i % 2)
5234 src--;
5235 coding->head_ascii = src - (coding->source + coding->consumed);
5236
5237 if (coding->head_ascii < coding->src_bytes)
5238 {
5239 struct coding_detection_info detect_info;
5240 enum coding_category category;
5241 struct coding_system *this;
5242
5243 detect_info.checked = detect_info.found = detect_info.rejected = 0;
5244 for (i = 0; i < coding_category_raw_text; i++)
5245 {
5246 category = coding_priorities[i];
5247 this = coding_categories + category;
5248 if (this->id < 0)
5249 {
5250 /* No coding system of this category is defined. */
5251 detect_info.rejected |= (1 << category);
5252 }
5253 else if (category >= coding_category_raw_text)
5254 continue;
5255 else if (detect_info.checked & (1 << category))
5256 {
5257 if (detect_info.found & (1 << category))
5258 break;
5259 }
5260 else if ((*(this->detector)) (coding, &detect_info)
5261 && detect_info.found & (1 << category))
5262 {
5263 if (category == coding_category_utf_16_auto)
5264 {
5265 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
5266 category = coding_category_utf_16_le;
5267 else
5268 category = coding_category_utf_16_be;
5269 }
5270 break;
5271 }
5272 }
5273 if (i < coding_category_raw_text)
5274 setup_coding_system (CODING_ID_NAME (this->id), coding);
5275 else if (detect_info.rejected == CATEGORY_MASK_ANY)
5276 setup_coding_system (Qraw_text, coding);
5277 else if (detect_info.rejected)
5278 for (i = 0; i < coding_category_raw_text; i++)
5279 if (! (detect_info.rejected & (1 << coding_priorities[i])))
5280 {
5281 this = coding_categories + coding_priorities[i];
5282 setup_coding_system (CODING_ID_NAME (this->id), coding);
5283 break;
5284 }
5285 }
5286 }
5287 else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
5288 == coding_category_utf_16_auto)
5289 {
5290 Lisp_Object coding_systems;
5291 struct coding_detection_info detect_info;
5292
5293 coding_systems
5294 = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_16_bom);
5295 detect_info.found = detect_info.rejected = 0;
5296 if (CONSP (coding_systems)
5297 && detect_coding_utf_16 (coding, &detect_info))
5298 {
5299 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
5300 setup_coding_system (XCAR (coding_systems), coding);
5301 else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
5302 setup_coding_system (XCDR (coding_systems), coding);
5303 }
5304 }
5305 }
5306
5307
5308 static void
5309 decode_eol (coding)
5310 struct coding_system *coding;
5311 {
5312 Lisp_Object eol_type;
5313 unsigned char *p, *pbeg, *pend;
5314
5315 eol_type = CODING_ID_EOL_TYPE (coding->id);
5316 if (EQ (eol_type, Qunix))
5317 return;
5318
5319 if (NILP (coding->dst_object))
5320 pbeg = coding->destination;
5321 else
5322 pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
5323 pend = pbeg + coding->produced;
5324
5325 if (VECTORP (eol_type))
5326 {
5327 int eol_seen = EOL_SEEN_NONE;
5328
5329 for (p = pbeg; p < pend; p++)
5330 {
5331 if (*p == '\n')
5332 eol_seen |= EOL_SEEN_LF;
5333 else if (*p == '\r')
5334 {
5335 if (p + 1 < pend && *(p + 1) == '\n')
5336 {
5337 eol_seen |= EOL_SEEN_CRLF;
5338 p++;
5339 }
5340 else
5341 eol_seen |= EOL_SEEN_CR;
5342 }
5343 }
5344 if (eol_seen != EOL_SEEN_NONE
5345 && eol_seen != EOL_SEEN_LF
5346 && eol_seen != EOL_SEEN_CRLF
5347 && eol_seen != EOL_SEEN_CR)
5348 eol_seen = EOL_SEEN_LF;
5349 if (eol_seen != EOL_SEEN_NONE)
5350 eol_type = adjust_coding_eol_type (coding, eol_seen);
5351 }
5352
5353 if (EQ (eol_type, Qmac))
5354 {
5355 for (p = pbeg; p < pend; p++)
5356 if (*p == '\r')
5357 *p = '\n';
5358 }
5359 else if (EQ (eol_type, Qdos))
5360 {
5361 int n = 0;
5362
5363 if (NILP (coding->dst_object))
5364 {
5365 for (p = pend - 2; p >= pbeg; p--)
5366 if (*p == '\r')
5367 {
5368 safe_bcopy ((char *) (p + 1), (char *) p, pend-- - p - 1);
5369 n++;
5370 }
5371 }
5372 else
5373 {
5374 for (p = pend - 2; p >= pbeg; p--)
5375 if (*p == '\r')
5376 {
5377 int pos_byte = coding->dst_pos_byte + (p - pbeg);
5378 int pos = BYTE_TO_CHAR (pos_byte);
5379
5380 del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
5381 n++;
5382 }
5383 }
5384 coding->produced -= n;
5385 coding->produced_char -= n;
5386 }
5387 }
5388
5389 static void
5390 translate_chars (coding, table)
5391 struct coding_system *coding;
5392 Lisp_Object table;
5393 {
5394 int *charbuf = coding->charbuf;
5395 int *charbuf_end = charbuf + coding->charbuf_used;
5396 int c;
5397
5398 if (coding->chars_at_source)
5399 return;
5400
5401 while (charbuf < charbuf_end)
5402 {
5403 c = *charbuf;
5404 if (c < 0)
5405 charbuf += c;
5406 else
5407 *charbuf++ = translate_char (table, c);
5408 }
5409 }
5410
5411 static int
5412 produce_chars (coding)
5413 struct coding_system *coding;
5414 {
5415 unsigned char *dst = coding->destination + coding->produced;
5416 unsigned char *dst_end = coding->destination + coding->dst_bytes;
5417 int produced;
5418 int produced_chars = 0;
5419
5420 if (! coding->chars_at_source)
5421 {
5422 /* Characters are in coding->charbuf. */
5423 int *buf = coding->charbuf;
5424 int *buf_end = buf + coding->charbuf_used;
5425 unsigned char *adjusted_dst_end;
5426
5427 if (BUFFERP (coding->src_object)
5428 && EQ (coding->src_object, coding->dst_object))
5429 dst_end = ((unsigned char *) coding->source) + coding->consumed;
5430 adjusted_dst_end = dst_end - MAX_MULTIBYTE_LENGTH;
5431
5432 while (buf < buf_end)
5433 {
5434 int c = *buf++;
5435
5436 if (dst >= adjusted_dst_end)
5437 {
5438 dst = alloc_destination (coding,
5439 buf_end - buf + MAX_MULTIBYTE_LENGTH,
5440 dst);
5441 dst_end = coding->destination + coding->dst_bytes;
5442 adjusted_dst_end = dst_end - MAX_MULTIBYTE_LENGTH;
5443 }
5444 if (c >= 0)
5445 {
5446 if (coding->dst_multibyte
5447 || ! CHAR_BYTE8_P (c))
5448 CHAR_STRING_ADVANCE (c, dst);
5449 else
5450 *dst++ = CHAR_TO_BYTE8 (c);
5451 produced_chars++;
5452 }
5453 else
5454 /* This is an annotation datum. (-C) is the length of
5455 it. */
5456 buf += -c - 1;
5457 }
5458 }
5459 else
5460 {
5461 const unsigned char *src = coding->source;
5462 const unsigned char *src_end = src + coding->src_bytes;
5463 Lisp_Object eol_type;
5464
5465 eol_type = CODING_ID_EOL_TYPE (coding->id);
5466
5467 if (coding->src_multibyte != coding->dst_multibyte)
5468 {
5469 if (coding->src_multibyte)
5470 {
5471 int multibytep = 1;
5472 int consumed_chars;
5473
5474 while (1)
5475 {
5476 const unsigned char *src_base = src;
5477 int c;
5478
5479 ONE_MORE_BYTE (c);
5480 if (c == '\r')
5481 {
5482 if (EQ (eol_type, Qdos))
5483 {
5484 if (src == src_end)
5485 {
5486 coding->result = CODING_RESULT_INSUFFICIENT_SRC;
5487 goto no_more_source;
5488 }
5489 if (*src == '\n')
5490 c = *src++;
5491 }
5492 else if (EQ (eol_type, Qmac))
5493 c = '\n';
5494 }
5495 if (dst == dst_end)
5496 {
5497 coding->consumed = src - coding->source;
5498
5499 if (EQ (coding->src_object, coding->dst_object))
5500 dst_end = (unsigned char *) src;
5501 if (dst == dst_end)
5502 {
5503 dst = alloc_destination (coding, src_end - src + 1,
5504 dst);
5505 dst_end = coding->destination + coding->dst_bytes;
5506 coding_set_source (coding);
5507 src = coding->source + coding->consumed;
5508 src_end = coding->source + coding->src_bytes;
5509 }
5510 }
5511 *dst++ = c;
5512 produced_chars++;
5513 }
5514 no_more_source:
5515 ;
5516 }
5517 else
5518 while (src < src_end)
5519 {
5520 int multibytep = 1;
5521 int c = *src++;
5522
5523 if (c == '\r')
5524 {
5525 if (EQ (eol_type, Qdos))
5526 {
5527 if (src < src_end
5528 && *src == '\n')
5529 c = *src++;
5530 }
5531 else if (EQ (eol_type, Qmac))
5532 c = '\n';
5533 }
5534 if (dst >= dst_end - 1)
5535 {
5536 coding->consumed = src - coding->source;
5537
5538 if (EQ (coding->src_object, coding->dst_object))
5539 dst_end = (unsigned char *) src;
5540 if (dst >= dst_end - 1)
5541 {
5542 dst = alloc_destination (coding, src_end - src + 2,
5543 dst);
5544 dst_end = coding->destination + coding->dst_bytes;
5545 coding_set_source (coding);
5546 src = coding->source + coding->consumed;
5547 src_end = coding->source + coding->src_bytes;
5548 }
5549 }
5550 EMIT_ONE_BYTE (c);
5551 }
5552 }
5553 else
5554 {
5555 if (!EQ (coding->src_object, coding->dst_object))
5556 {
5557 int require = coding->src_bytes - coding->dst_bytes;
5558
5559 if (require > 0)
5560 {
5561 EMACS_INT offset = src - coding->source;
5562
5563 dst = alloc_destination (coding, require, dst);
5564 coding_set_source (coding);
5565 src = coding->source + offset;
5566 src_end = coding->source + coding->src_bytes;
5567 }
5568 }
5569 produced_chars = coding->src_chars;
5570 while (src < src_end)
5571 {
5572 int c = *src++;
5573
5574 if (c == '\r')
5575 {
5576 if (EQ (eol_type, Qdos))
5577 {
5578 if (src < src_end
5579 && *src == '\n')
5580 c = *src++;
5581 produced_chars--;
5582 }
5583 else if (EQ (eol_type, Qmac))
5584 c = '\n';
5585 }
5586 *dst++ = c;
5587 }
5588 }
5589 coding->consumed = coding->src_bytes;
5590 coding->consumed_char = coding->src_chars;
5591 }
5592
5593 produced = dst - (coding->destination + coding->produced);
5594 if (BUFFERP (coding->dst_object))
5595 insert_from_gap (produced_chars, produced);
5596 coding->produced += produced;
5597 coding->produced_char += produced_chars;
5598 return produced_chars;
5599 }
5600
5601 /* Compose text in CODING->object according to the annotation data at
5602 CHARBUF. CHARBUF is an array:
5603 [ -LENGTH ANNOTATION_MASK FROM TO METHOD COMP_LEN [ COMPONENTS... ] ]
5604 */
5605
5606 static INLINE void
5607 produce_composition (coding, charbuf)
5608 struct coding_system *coding;
5609 int *charbuf;
5610 {
5611 int len;
5612 EMACS_INT from, to;
5613 enum composition_method method;
5614 Lisp_Object components;
5615
5616 len = -charbuf[0];
5617 from = coding->dst_pos + charbuf[2];
5618 to = coding->dst_pos + charbuf[3];
5619 method = (enum composition_method) (charbuf[4]);
5620
5621 if (method == COMPOSITION_RELATIVE)
5622 components = Qnil;
5623 else
5624 {
5625 Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
5626 int i;
5627
5628 len -= 5;
5629 charbuf += 5;
5630 for (i = 0; i < len; i++)
5631 args[i] = make_number (charbuf[i]);
5632 components = (method == COMPOSITION_WITH_ALTCHARS
5633 ? Fstring (len, args) : Fvector (len, args));
5634 }
5635 compose_text (from, to, components, Qnil, coding->dst_object);
5636 }
5637
5638
5639 /* Put `charset' property on text in CODING->object according to
5640 the annotation data at CHARBUF. CHARBUF is an array:
5641 [ -LENGTH ANNOTATION_MASK FROM TO CHARSET-ID ]
5642 */
5643
5644 static INLINE void
5645 produce_charset (coding, charbuf)
5646 struct coding_system *coding;
5647 int *charbuf;
5648 {
5649 EMACS_INT from = coding->dst_pos + charbuf[2];
5650 EMACS_INT to = coding->dst_pos + charbuf[3];
5651 struct charset *charset = CHARSET_FROM_ID (charbuf[4]);
5652
5653 Fput_text_property (make_number (from), make_number (to),
5654 Qcharset, CHARSET_NAME (charset),
5655 coding->dst_object);
5656 }
5657
5658
5659 #define CHARBUF_SIZE 0x4000
5660
5661 #define ALLOC_CONVERSION_WORK_AREA(coding) \
5662 do { \
5663 int size = CHARBUF_SIZE;; \
5664 \
5665 coding->charbuf = NULL; \
5666 while (size > 1024) \
5667 { \
5668 coding->charbuf = (int *) alloca (sizeof (int) * size); \
5669 if (coding->charbuf) \
5670 break; \
5671 size >>= 1; \
5672 } \
5673 if (! coding->charbuf) \
5674 { \
5675 coding->result = CODING_RESULT_INSUFFICIENT_MEM; \
5676 return coding->result; \
5677 } \
5678 coding->charbuf_size = size; \
5679 } while (0)
5680
5681
5682 static void
5683 produce_annotation (coding)
5684 struct coding_system *coding;
5685 {
5686 int *charbuf = coding->charbuf;
5687 int *charbuf_end = charbuf + coding->charbuf_used;
5688
5689 if (NILP (coding->dst_object))
5690 return;
5691
5692 while (charbuf < charbuf_end)
5693 {
5694 if (*charbuf >= 0)
5695 charbuf++;
5696 else
5697 {
5698 int len = -*charbuf;
5699 switch (charbuf[1])
5700 {
5701 case CODING_ANNOTATE_COMPOSITION_MASK:
5702 produce_composition (coding, charbuf);
5703 break;
5704 case CODING_ANNOTATE_CHARSET_MASK:
5705 produce_charset (coding, charbuf);
5706 break;
5707 default:
5708 abort ();
5709 }
5710 charbuf += len;
5711 }
5712 }
5713 }
5714
5715 /* Decode the data at CODING->src_object into CODING->dst_object.
5716 CODING->src_object is a buffer, a string, or nil.
5717 CODING->dst_object is a buffer.
5718
5719 If CODING->src_object is a buffer, it must be the current buffer.
5720 In this case, if CODING->src_pos is positive, it is a position of
5721 the source text in the buffer, otherwise, the source text is in the
5722 gap area of the buffer, and CODING->src_pos specifies the offset of
5723 the text from GPT (which must be the same as PT). If this is the
5724 same buffer as CODING->dst_object, CODING->src_pos must be
5725 negative.
5726
5727 If CODING->src_object is a string, CODING->src_pos in an index to
5728 that string.
5729
5730 If CODING->src_object is nil, CODING->source must already point to
5731 the non-relocatable memory area. In this case, CODING->src_pos is
5732 an offset from CODING->source.
5733
5734 The decoded data is inserted at the current point of the buffer
5735 CODING->dst_object.
5736 */
5737
5738 static int
5739 decode_coding (coding)
5740 struct coding_system *coding;
5741 {
5742 Lisp_Object attrs;
5743 Lisp_Object undo_list;
5744
5745 if (BUFFERP (coding->src_object)
5746 && coding->src_pos > 0
5747 && coding->src_pos < GPT
5748 && coding->src_pos + coding->src_chars > GPT)
5749 move_gap_both (coding->src_pos, coding->src_pos_byte);
5750
5751 undo_list = Qt;
5752 if (BUFFERP (coding->dst_object))
5753 {
5754 if (current_buffer != XBUFFER (coding->dst_object))
5755 set_buffer_internal (XBUFFER (coding->dst_object));
5756 if (GPT != PT)
5757 move_gap_both (PT, PT_BYTE);
5758 undo_list = current_buffer->undo_list;
5759 current_buffer->undo_list = Qt;
5760 }
5761
5762 coding->consumed = coding->consumed_char = 0;
5763 coding->produced = coding->produced_char = 0;
5764 coding->chars_at_source = 0;
5765 coding->result = CODING_RESULT_SUCCESS;
5766 coding->errors = 0;
5767
5768 ALLOC_CONVERSION_WORK_AREA (coding);
5769
5770 attrs = CODING_ID_ATTRS (coding->id);
5771
5772 do
5773 {
5774 coding_set_source (coding);
5775 coding->annotated = 0;
5776 (*(coding->decoder)) (coding);
5777 if (!NILP (CODING_ATTR_DECODE_TBL (attrs)))
5778 translate_chars (coding, CODING_ATTR_DECODE_TBL (attrs));
5779 else if (!NILP (Vstandard_translation_table_for_decode))
5780 translate_chars (coding, Vstandard_translation_table_for_decode);
5781 coding_set_destination (coding);
5782 produce_chars (coding);
5783 if (coding->annotated)
5784 produce_annotation (coding);
5785 }
5786 while (coding->consumed < coding->src_bytes
5787 && ! coding->result);
5788
5789 coding->carryover_bytes = 0;
5790 if (coding->consumed < coding->src_bytes)
5791 {
5792 int nbytes = coding->src_bytes - coding->consumed;
5793 const unsigned char *src;
5794
5795 coding_set_source (coding);
5796 coding_set_destination (coding);
5797 src = coding->source + coding->consumed;
5798
5799 if (coding->mode & CODING_MODE_LAST_BLOCK)
5800 {
5801 /* Flush out unprocessed data as binary chars. We are sure
5802 that the number of data is less than the size of
5803 coding->charbuf. */
5804 while (nbytes-- > 0)
5805 {
5806 int c = *src++;
5807
5808 coding->charbuf[coding->charbuf_used++] = (c & 0x80 ? - c : c);
5809 }
5810 produce_chars (coding);
5811 }
5812 else
5813 {
5814 /* Record unprocessed bytes in coding->carryover. We are
5815 sure that the number of data is less than the size of
5816 coding->carryover. */
5817 unsigned char *p = coding->carryover;
5818
5819 coding->carryover_bytes = nbytes;
5820 while (nbytes-- > 0)
5821 *p++ = *src++;
5822 }
5823 coding->consumed = coding->src_bytes;
5824 }
5825
5826 if (BUFFERP (coding->dst_object))
5827 {
5828 current_buffer->undo_list = undo_list;
5829 record_insert (coding->dst_pos, coding->produced_char);
5830 }
5831 if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix))
5832 decode_eol (coding);
5833 return coding->result;
5834 }
5835
5836
5837 /* Extract an annotation datum from a composition starting at POS and
5838 ending before LIMIT of CODING->src_object (buffer or string), store
5839 the data in BUF, set *STOP to a starting position of the next
5840 composition (if any) or to LIMIT, and return the address of the
5841 next element of BUF.
5842
5843 If such an annotation is not found, set *STOP to a starting
5844 position of a composition after POS (if any) or to LIMIT, and
5845 return BUF. */
5846
5847 static INLINE int *
5848 handle_composition_annotation (pos, limit, coding, buf, stop)
5849 EMACS_INT pos, limit;
5850 struct coding_system *coding;
5851 int *buf;
5852 EMACS_INT *stop;
5853 {
5854 EMACS_INT start, end;
5855 Lisp_Object prop;
5856
5857 if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
5858 || end > limit)
5859 *stop = limit;
5860 else if (start > pos)
5861 *stop = start;
5862 else
5863 {
5864 if (start == pos)
5865 {
5866 /* We found a composition. Store the corresponding
5867 annotation data in BUF. */
5868 int *head = buf;
5869 enum composition_method method = COMPOSITION_METHOD (prop);
5870 int nchars = COMPOSITION_LENGTH (prop);
5871
5872 ADD_COMPOSITION_DATA (buf, 0, nchars, method);
5873 if (method != COMPOSITION_RELATIVE)
5874 {
5875 Lisp_Object components;
5876 int len, i, i_byte;
5877
5878 components = COMPOSITION_COMPONENTS (prop);
5879 if (VECTORP (components))
5880 {
5881 len = XVECTOR (components)->size;
5882 for (i = 0; i < len; i++)
5883 *buf++ = XINT (AREF (components, i));
5884 }
5885 else if (STRINGP (components))
5886 {
5887 len = SCHARS (components);
5888 i = i_byte = 0;
5889 while (i < len)
5890 {
5891 FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
5892 buf++;
5893 }
5894 }
5895 else if (INTEGERP (components))
5896 {
5897 len = 1;
5898 *buf++ = XINT (components);
5899 }
5900 else if (CONSP (components))
5901 {
5902 for (len = 0; CONSP (components);
5903 len++, components = XCDR (components))
5904 *buf++ = XINT (XCAR (components));
5905 }
5906 else
5907 abort ();
5908 *head -= len;
5909 }
5910 }
5911
5912 if (find_composition (end, limit, &start, &end, &prop,
5913 coding->src_object)
5914 && end <= limit)
5915 *stop = start;
5916 else
5917 *stop = limit;
5918 }
5919 return buf;
5920 }
5921
5922
5923 /* Extract an annotation datum from a text property `charset' at POS of
5924 CODING->src_object (buffer of string), store the data in BUF, set
5925 *STOP to the position where the value of `charset' property changes
5926 (limiting by LIMIT), and return the address of the next element of
5927 BUF.
5928
5929 If the property value is nil, set *STOP to the position where the
5930 property value is non-nil (limiting by LIMIT), and return BUF. */
5931
5932 static INLINE int *
5933 handle_charset_annotation (pos, limit, coding, buf, stop)
5934 EMACS_INT pos, limit;
5935 struct coding_system *coding;
5936 int *buf;
5937 EMACS_INT *stop;
5938 {
5939 Lisp_Object val, next;
5940 int id;
5941
5942 val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
5943 if (! NILP (val) && CHARSETP (val))
5944 id = XINT (CHARSET_SYMBOL_ID (val));
5945 else
5946 id = -1;
5947 ADD_CHARSET_DATA (buf, 0, 0, id);
5948 next = Fnext_single_property_change (make_number (pos), Qcharset,
5949 coding->src_object,
5950 make_number (limit));
5951 *stop = XINT (next);
5952 return buf;
5953 }
5954
5955
5956 static void
5957 consume_chars (coding)
5958 struct coding_system *coding;
5959 {
5960 int *buf = coding->charbuf;
5961 int *buf_end = coding->charbuf + coding->charbuf_size;
5962 const unsigned char *src = coding->source + coding->consumed;
5963 const unsigned char *src_end = coding->source + coding->src_bytes;
5964 EMACS_INT pos = coding->src_pos + coding->consumed_char;
5965 EMACS_INT end_pos = coding->src_pos + coding->src_chars;
5966 int multibytep = coding->src_multibyte;
5967 Lisp_Object eol_type;
5968 int c;
5969 EMACS_INT stop, stop_composition, stop_charset;
5970
5971 eol_type = CODING_ID_EOL_TYPE (coding->id);
5972 if (VECTORP (eol_type))
5973 eol_type = Qunix;
5974
5975 /* Note: composition handling is not yet implemented. */
5976 coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
5977
5978 if (NILP (coding->src_object))
5979 stop = stop_composition = stop_charset = end_pos;
5980 else
5981 {
5982 if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
5983 stop = stop_composition = pos;
5984 else
5985 stop = stop_composition = end_pos;
5986 if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
5987 stop = stop_charset = pos;
5988 else
5989 stop_charset = end_pos;
5990 }
5991
5992 /* Compensate for CRLF and conversion. */
5993 buf_end -= 1 + MAX_ANNOTATION_LENGTH;
5994 while (buf < buf_end)
5995 {
5996 if (pos == stop)
5997 {
5998 if (pos == end_pos)
5999 break;
6000 if (pos == stop_composition)
6001 buf = handle_composition_annotation (pos, end_pos, coding,
6002 buf, &stop_composition);
6003 if (pos == stop_charset)
6004 buf = handle_charset_annotation (pos, end_pos, coding,
6005 buf, &stop_charset);
6006 stop = (stop_composition < stop_charset
6007 ? stop_composition : stop_charset);
6008 }
6009
6010 if (! multibytep)
6011 {
6012 EMACS_INT bytes;
6013
6014 if (! CODING_FOR_UNIBYTE (coding)
6015 && (bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
6016 c = STRING_CHAR_ADVANCE (src), pos += bytes;
6017 else
6018 c = *src++, pos++;
6019 }
6020 else
6021 c = STRING_CHAR_ADVANCE (src), pos++;
6022 if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
6023 c = '\n';
6024 if (! EQ (eol_type, Qunix))
6025 {
6026 if (c == '\n')
6027 {
6028 if (EQ (eol_type, Qdos))
6029 *buf++ = '\r';
6030 else
6031 c = '\r';
6032 }
6033 }
6034 *buf++ = c;
6035 }
6036
6037 coding->consumed = src - coding->source;
6038 coding->consumed_char = pos - coding->src_pos;
6039 coding->charbuf_used = buf - coding->charbuf;
6040 coding->chars_at_source = 0;
6041 }
6042
6043
6044 /* Encode the text at CODING->src_object into CODING->dst_object.
6045 CODING->src_object is a buffer or a string.
6046 CODING->dst_object is a buffer or nil.
6047
6048 If CODING->src_object is a buffer, it must be the current buffer.
6049 In this case, if CODING->src_pos is positive, it is a position of
6050 the source text in the buffer, otherwise. the source text is in the
6051 gap area of the buffer, and coding->src_pos specifies the offset of
6052 the text from GPT (which must be the same as PT). If this is the
6053 same buffer as CODING->dst_object, CODING->src_pos must be
6054 negative and CODING should not have `pre-write-conversion'.
6055
6056 If CODING->src_object is a string, CODING should not have
6057 `pre-write-conversion'.
6058
6059 If CODING->dst_object is a buffer, the encoded data is inserted at
6060 the current point of that buffer.
6061
6062 If CODING->dst_object is nil, the encoded data is placed at the
6063 memory area specified by CODING->destination. */
6064
6065 static int
6066 encode_coding (coding)
6067 struct coding_system *coding;
6068 {
6069 Lisp_Object attrs;
6070
6071 attrs = CODING_ID_ATTRS (coding->id);
6072
6073 if (BUFFERP (coding->dst_object))
6074 {
6075 set_buffer_internal (XBUFFER (coding->dst_object));
6076 coding->dst_multibyte
6077 = ! NILP (current_buffer->enable_multibyte_characters);
6078 }
6079
6080 coding->consumed = coding->consumed_char = 0;
6081 coding->produced = coding->produced_char = 0;
6082 coding->result = CODING_RESULT_SUCCESS;
6083 coding->errors = 0;
6084
6085 ALLOC_CONVERSION_WORK_AREA (coding);
6086
6087 do {
6088 coding_set_source (coding);
6089 consume_chars (coding);
6090
6091 if (!NILP (CODING_ATTR_ENCODE_TBL (attrs)))
6092 translate_chars (coding, CODING_ATTR_ENCODE_TBL (attrs));
6093 else if (!NILP (Vstandard_translation_table_for_encode))
6094 translate_chars (coding, Vstandard_translation_table_for_encode);
6095
6096 coding_set_destination (coding);
6097 (*(coding->encoder)) (coding);
6098 } while (coding->consumed_char < coding->src_chars);
6099
6100 if (BUFFERP (coding->dst_object))
6101 insert_from_gap (coding->produced_char, coding->produced);
6102
6103 return (coding->result);
6104 }
6105
6106
6107 /* Name (or base name) of work buffer for code conversion. */
6108 static Lisp_Object Vcode_conversion_workbuf_name;
6109
6110 /* A working buffer used by the top level conversion. Once it is
6111 created, it is never destroyed. It has the name
6112 Vcode_conversion_workbuf_name. The other working buffers are
6113 destroyed after the use is finished, and their names are modified
6114 versions of Vcode_conversion_workbuf_name. */
6115 static Lisp_Object Vcode_conversion_reused_workbuf;
6116
6117 /* 1 iff Vcode_conversion_reused_workbuf is already in use. */
6118 static int reused_workbuf_in_use;
6119
6120
6121 /* Return a working buffer of code convesion. MULTIBYTE specifies the
6122 multibyteness of returning buffer. */
6123
6124 Lisp_Object
6125 make_conversion_work_buffer (multibyte)
6126 {
6127 Lisp_Object name, workbuf;
6128 struct buffer *current;
6129
6130 if (reused_workbuf_in_use++)
6131 name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
6132 else
6133 name = Vcode_conversion_workbuf_name;
6134 workbuf = Fget_buffer_create (name);
6135 current = current_buffer;
6136 set_buffer_internal (XBUFFER (workbuf));
6137 Ferase_buffer ();
6138 current_buffer->undo_list = Qt;
6139 current_buffer->enable_multibyte_characters = multibyte ? Qt : Qnil;
6140 set_buffer_internal (current);
6141 return workbuf;
6142 }
6143
6144
6145 static Lisp_Object
6146 code_conversion_restore (arg)
6147 Lisp_Object arg;
6148 {
6149 Lisp_Object current, workbuf;
6150
6151 current = XCAR (arg);
6152 workbuf = XCDR (arg);
6153 if (! NILP (workbuf))
6154 {
6155 if (EQ (workbuf, Vcode_conversion_reused_workbuf))
6156 reused_workbuf_in_use = 0;
6157 else if (! NILP (Fbuffer_live_p (workbuf)))
6158 Fkill_buffer (workbuf);
6159 }
6160 set_buffer_internal (XBUFFER (current));
6161 return Qnil;
6162 }
6163
6164 Lisp_Object
6165 code_conversion_save (with_work_buf, multibyte)
6166 int with_work_buf, multibyte;
6167 {
6168 Lisp_Object workbuf = Qnil;
6169
6170 if (with_work_buf)
6171 workbuf = make_conversion_work_buffer (multibyte);
6172 record_unwind_protect (code_conversion_restore,
6173 Fcons (Fcurrent_buffer (), workbuf));
6174 return workbuf;
6175 }
6176
6177 int
6178 decode_coding_gap (coding, chars, bytes)
6179 struct coding_system *coding;
6180 EMACS_INT chars, bytes;
6181 {
6182 int count = specpdl_ptr - specpdl;
6183 Lisp_Object attrs;
6184
6185 code_conversion_save (0, 0);
6186
6187 coding->src_object = Fcurrent_buffer ();
6188 coding->src_chars = chars;
6189 coding->src_bytes = bytes;
6190 coding->src_pos = -chars;
6191 coding->src_pos_byte = -bytes;
6192 coding->src_multibyte = chars < bytes;
6193 coding->dst_object = coding->src_object;
6194 coding->dst_pos = PT;
6195 coding->dst_pos_byte = PT_BYTE;
6196 coding->dst_multibyte = ! NILP (current_buffer->enable_multibyte_characters);
6197 coding->mode |= CODING_MODE_LAST_BLOCK;
6198
6199 if (CODING_REQUIRE_DETECTION (coding))
6200 detect_coding (coding);
6201
6202 decode_coding (coding);
6203
6204 attrs = CODING_ID_ATTRS (coding->id);
6205 if (! NILP (CODING_ATTR_POST_READ (attrs)))
6206 {
6207 EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
6208 Lisp_Object val;
6209
6210 TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
6211 val = call1 (CODING_ATTR_POST_READ (attrs),
6212 make_number (coding->produced_char));
6213 CHECK_NATNUM (val);
6214 coding->produced_char += Z - prev_Z;
6215 coding->produced += Z_BYTE - prev_Z_BYTE;
6216 }
6217
6218 unbind_to (count, Qnil);
6219 return coding->result;
6220 }
6221
6222 int
6223 encode_coding_gap (coding, chars, bytes)
6224 struct coding_system *coding;
6225 EMACS_INT chars, bytes;
6226 {
6227 int count = specpdl_ptr - specpdl;
6228
6229 code_conversion_save (0, 0);
6230
6231 coding->src_object = Fcurrent_buffer ();
6232 coding->src_chars = chars;
6233 coding->src_bytes = bytes;
6234 coding->src_pos = -chars;
6235 coding->src_pos_byte = -bytes;
6236 coding->src_multibyte = chars < bytes;
6237 coding->dst_object = coding->src_object;
6238 coding->dst_pos = PT;
6239 coding->dst_pos_byte = PT_BYTE;
6240
6241 encode_coding (coding);
6242
6243 unbind_to (count, Qnil);
6244 return coding->result;
6245 }
6246
6247
6248 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
6249 SRC_OBJECT into DST_OBJECT by coding context CODING.
6250
6251 SRC_OBJECT is a buffer, a string, or Qnil.
6252
6253 If it is a buffer, the text is at point of the buffer. FROM and TO
6254 are positions in the buffer.
6255
6256 If it is a string, the text is at the beginning of the string.
6257 FROM and TO are indices to the string.
6258
6259 If it is nil, the text is at coding->source. FROM and TO are
6260 indices to coding->source.
6261
6262 DST_OBJECT is a buffer, Qt, or Qnil.
6263
6264 If it is a buffer, the decoded text is inserted at point of the
6265 buffer. If the buffer is the same as SRC_OBJECT, the source text
6266 is deleted.
6267
6268 If it is Qt, a string is made from the decoded text, and
6269 set in CODING->dst_object.
6270
6271 If it is Qnil, the decoded text is stored at CODING->destination.
6272 The caller must allocate CODING->dst_bytes bytes at
6273 CODING->destination by xmalloc. If the decoded text is longer than
6274 CODING->dst_bytes, CODING->destination is relocated by xrealloc.
6275 */
6276
6277 void
6278 decode_coding_object (coding, src_object, from, from_byte, to, to_byte,
6279 dst_object)
6280 struct coding_system *coding;
6281 Lisp_Object src_object;
6282 EMACS_INT from, from_byte, to, to_byte;
6283 Lisp_Object dst_object;
6284 {
6285 int count = specpdl_ptr - specpdl;
6286 unsigned char *destination;
6287 EMACS_INT dst_bytes;
6288 EMACS_INT chars = to - from;
6289 EMACS_INT bytes = to_byte - from_byte;
6290 Lisp_Object attrs;
6291 Lisp_Object buffer;
6292 int saved_pt = -1, saved_pt_byte;
6293
6294 buffer = Fcurrent_buffer ();
6295
6296 if (NILP (dst_object))
6297 {
6298 destination = coding->destination;
6299 dst_bytes = coding->dst_bytes;
6300 }
6301
6302 coding->src_object = src_object;
6303 coding->src_chars = chars;
6304 coding->src_bytes = bytes;
6305 coding->src_multibyte = chars < bytes;
6306
6307 if (STRINGP (src_object))
6308 {
6309 coding->src_pos = from;
6310 coding->src_pos_byte = from_byte;
6311 }
6312 else if (BUFFERP (src_object))
6313 {
6314 set_buffer_internal (XBUFFER (src_object));
6315 if (from != GPT)
6316 move_gap_both (from, from_byte);
6317 if (EQ (src_object, dst_object))
6318 {
6319 saved_pt = PT, saved_pt_byte = PT_BYTE;
6320 TEMP_SET_PT_BOTH (from, from_byte);
6321 del_range_both (from, from_byte, to, to_byte, 1);
6322 coding->src_pos = -chars;
6323 coding->src_pos_byte = -bytes;
6324 }
6325 else
6326 {
6327 coding->src_pos = from;
6328 coding->src_pos_byte = from_byte;
6329 }
6330 }
6331
6332 if (CODING_REQUIRE_DETECTION (coding))
6333 detect_coding (coding);
6334 attrs = CODING_ID_ATTRS (coding->id);
6335
6336 if (EQ (dst_object, Qt)
6337 || (! NILP (CODING_ATTR_POST_READ (attrs))
6338 && NILP (dst_object)))
6339 {
6340 coding->dst_object = code_conversion_save (1, 1);
6341 coding->dst_pos = BEG;
6342 coding->dst_pos_byte = BEG_BYTE;
6343 coding->dst_multibyte = 1;
6344 }
6345 else if (BUFFERP (dst_object))
6346 {
6347 code_conversion_save (0, 0);
6348 coding->dst_object = dst_object;
6349 coding->dst_pos = BUF_PT (XBUFFER (dst_object));
6350 coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
6351 coding->dst_multibyte
6352 = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
6353 }
6354 else
6355 {
6356 code_conversion_save (0, 0);
6357 coding->dst_object = Qnil;
6358 coding->dst_multibyte = 1;
6359 }
6360
6361 decode_coding (coding);
6362
6363 if (BUFFERP (coding->dst_object))
6364 set_buffer_internal (XBUFFER (coding->dst_object));
6365
6366 if (! NILP (CODING_ATTR_POST_READ (attrs)))
6367 {
6368 struct gcpro gcpro1, gcpro2;
6369 EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
6370 Lisp_Object val;
6371
6372 TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
6373 GCPRO2 (coding->src_object, coding->dst_object);
6374 val = call1 (CODING_ATTR_POST_READ (attrs),
6375 make_number (coding->produced_char));
6376 UNGCPRO;
6377 CHECK_NATNUM (val);
6378 coding->produced_char += Z - prev_Z;
6379 coding->produced += Z_BYTE - prev_Z_BYTE;
6380 }
6381
6382 if (EQ (dst_object, Qt))
6383 {
6384 coding->dst_object = Fbuffer_string ();
6385 }
6386 else if (NILP (dst_object) && BUFFERP (coding->dst_object))
6387 {
6388 set_buffer_internal (XBUFFER (coding->dst_object));
6389 if (dst_bytes < coding->produced)
6390 {
6391 destination
6392 = (unsigned char *) xrealloc (destination, coding->produced);
6393 if (! destination)
6394 {
6395 coding->result = CODING_RESULT_INSUFFICIENT_DST;
6396 unbind_to (count, Qnil);
6397 return;
6398 }
6399 if (BEGV < GPT && GPT < BEGV + coding->produced_char)
6400 move_gap_both (BEGV, BEGV_BYTE);
6401 bcopy (BEGV_ADDR, destination, coding->produced);
6402 coding->destination = destination;
6403 }
6404 }
6405
6406 if (saved_pt >= 0)
6407 {
6408 /* This is the case of:
6409 (BUFFERP (src_object) && EQ (src_object, dst_object))
6410 As we have moved PT while replacing the original buffer
6411 contents, we must recover it now. */
6412 set_buffer_internal (XBUFFER (src_object));
6413 if (saved_pt < from)
6414 TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
6415 else if (saved_pt < from + chars)
6416 TEMP_SET_PT_BOTH (from, from_byte);
6417 else if (! NILP (current_buffer->enable_multibyte_characters))
6418 TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
6419 saved_pt_byte + (coding->produced - bytes));
6420 else
6421 TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
6422 saved_pt_byte + (coding->produced - bytes));
6423 }
6424
6425 unbind_to (count, Qnil);
6426 }
6427
6428
6429 void
6430 encode_coding_object (coding, src_object, from, from_byte, to, to_byte,
6431 dst_object)
6432 struct coding_system *coding;
6433 Lisp_Object src_object;
6434 EMACS_INT from, from_byte, to, to_byte;
6435 Lisp_Object dst_object;
6436 {
6437 int count = specpdl_ptr - specpdl;
6438 EMACS_INT chars = to - from;
6439 EMACS_INT bytes = to_byte - from_byte;
6440 Lisp_Object attrs;
6441 Lisp_Object buffer;
6442 int saved_pt = -1, saved_pt_byte;
6443
6444 buffer = Fcurrent_buffer ();
6445
6446 coding->src_object = src_object;
6447 coding->src_chars = chars;
6448 coding->src_bytes = bytes;
6449 coding->src_multibyte = chars < bytes;
6450
6451 attrs = CODING_ID_ATTRS (coding->id);
6452
6453 if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
6454 {
6455 coding->src_object = code_conversion_save (1, coding->src_multibyte);
6456 set_buffer_internal (XBUFFER (coding->src_object));
6457 if (STRINGP (src_object))
6458 insert_from_string (src_object, from, from_byte, chars, bytes, 0);
6459 else if (BUFFERP (src_object))
6460 insert_from_buffer (XBUFFER (src_object), from, chars, 0);
6461 else
6462 insert_1_both (coding->source + from, chars, bytes, 0, 0, 0);
6463
6464 if (EQ (src_object, dst_object))
6465 {
6466 set_buffer_internal (XBUFFER (src_object));
6467 saved_pt = PT, saved_pt_byte = PT_BYTE;
6468 del_range_both (from, from_byte, to, to_byte, 1);
6469 set_buffer_internal (XBUFFER (coding->src_object));
6470 }
6471
6472 call2 (CODING_ATTR_PRE_WRITE (attrs),
6473 make_number (BEG), make_number (Z));
6474 coding->src_object = Fcurrent_buffer ();
6475 if (BEG != GPT)
6476 move_gap_both (BEG, BEG_BYTE);
6477 coding->src_chars = Z - BEG;
6478 coding->src_bytes = Z_BYTE - BEG_BYTE;
6479 coding->src_pos = BEG;
6480 coding->src_pos_byte = BEG_BYTE;
6481 coding->src_multibyte = Z < Z_BYTE;
6482 }
6483 else if (STRINGP (src_object))
6484 {
6485 code_conversion_save (0, 0);
6486 coding->src_pos = from;
6487 coding->src_pos_byte = from_byte;
6488 }
6489 else if (BUFFERP (src_object))
6490 {
6491 code_conversion_save (0, 0);
6492 set_buffer_internal (XBUFFER (src_object));
6493 if (EQ (src_object, dst_object))
6494 {
6495 saved_pt = PT, saved_pt_byte = PT_BYTE;
6496 coding->src_object = del_range_1 (from, to, 1, 1);
6497 coding->src_pos = 0;
6498 coding->src_pos_byte = 0;
6499 }
6500 else
6501 {
6502 if (from < GPT && to >= GPT)
6503 move_gap_both (from, from_byte);
6504 coding->src_pos = from;
6505 coding->src_pos_byte = from_byte;
6506 }
6507 }
6508 else
6509 code_conversion_save (0, 0);
6510
6511 if (BUFFERP (dst_object))
6512 {
6513 coding->dst_object = dst_object;
6514 if (EQ (src_object, dst_object))
6515 {
6516 coding->dst_pos = from;
6517 coding->dst_pos_byte = from_byte;
6518 }
6519 else
6520 {
6521 coding->dst_pos = BUF_PT (XBUFFER (dst_object));
6522 coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
6523 }
6524 coding->dst_multibyte
6525 = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
6526 }
6527 else if (EQ (dst_object, Qt))
6528 {
6529 coding->dst_object = Qnil;
6530 coding->dst_bytes = coding->src_chars;
6531 if (coding->dst_bytes == 0)
6532 coding->dst_bytes = 1;
6533 coding->destination = (unsigned char *) xmalloc (coding->dst_bytes);
6534 coding->dst_multibyte = 0;
6535 }
6536 else
6537 {
6538 coding->dst_object = Qnil;
6539 coding->dst_multibyte = 0;
6540 }
6541
6542 encode_coding (coding);
6543
6544 if (EQ (dst_object, Qt))
6545 {
6546 if (BUFFERP (coding->dst_object))
6547 coding->dst_object = Fbuffer_string ();
6548 else
6549 {
6550 coding->dst_object
6551 = make_unibyte_string ((char *) coding->destination,
6552 coding->produced);
6553 xfree (coding->destination);
6554 }
6555 }
6556
6557 if (saved_pt >= 0)
6558 {
6559 /* This is the case of:
6560 (BUFFERP (src_object) && EQ (src_object, dst_object))
6561 As we have moved PT while replacing the original buffer
6562 contents, we must recover it now. */
6563 set_buffer_internal (XBUFFER (src_object));
6564 if (saved_pt < from)
6565 TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
6566 else if (saved_pt < from + chars)
6567 TEMP_SET_PT_BOTH (from, from_byte);
6568 else if (! NILP (current_buffer->enable_multibyte_characters))
6569 TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
6570 saved_pt_byte + (coding->produced - bytes));
6571 else
6572 TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
6573 saved_pt_byte + (coding->produced - bytes));
6574 }
6575
6576 unbind_to (count, Qnil);
6577 }
6578
6579
6580 Lisp_Object
6581 preferred_coding_system ()
6582 {
6583 int id = coding_categories[coding_priorities[0]].id;
6584
6585 return CODING_ID_NAME (id);
6586 }
6587
6588 \f
6589 #ifdef emacs
6590 /*** 8. Emacs Lisp library functions ***/
6591
6592 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
6593 doc: /* Return t if OBJECT is nil or a coding-system.
6594 See the documentation of `define-coding-system' for information
6595 about coding-system objects. */)
6596 (obj)
6597 Lisp_Object obj;
6598 {
6599 return ((NILP (obj) || CODING_SYSTEM_P (obj)) ? Qt : Qnil);
6600 }
6601
6602 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
6603 Sread_non_nil_coding_system, 1, 1, 0,
6604 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT. */)
6605 (prompt)
6606 Lisp_Object prompt;
6607 {
6608 Lisp_Object val;
6609 do
6610 {
6611 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6612 Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
6613 }
6614 while (SCHARS (val) == 0);
6615 return (Fintern (val, Qnil));
6616 }
6617
6618 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
6619 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
6620 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM. */)
6621 (prompt, default_coding_system)
6622 Lisp_Object prompt, default_coding_system;
6623 {
6624 Lisp_Object val;
6625 if (SYMBOLP (default_coding_system))
6626 XSETSTRING (default_coding_system, XPNTR (SYMBOL_NAME (default_coding_system)));
6627 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6628 Qt, Qnil, Qcoding_system_history,
6629 default_coding_system, Qnil);
6630 return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
6631 }
6632
6633 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
6634 1, 1, 0,
6635 doc: /* Check validity of CODING-SYSTEM.
6636 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error. */)
6637 (coding_system)
6638 Lisp_Object coding_system;
6639 {
6640 CHECK_SYMBOL (coding_system);
6641 if (!NILP (Fcoding_system_p (coding_system)))
6642 return coding_system;
6643 while (1)
6644 Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
6645 }
6646
6647 \f
6648 /* Detect how the bytes at SRC of length SRC_BYTES are encoded. If
6649 HIGHEST is nonzero, return the coding system of the highest
6650 priority among the detected coding systems. Otherwize return a
6651 list of detected coding systems sorted by their priorities. If
6652 MULTIBYTEP is nonzero, it is assumed that the bytes are in correct
6653 multibyte form but contains only ASCII and eight-bit chars.
6654 Otherwise, the bytes are raw bytes.
6655
6656 CODING-SYSTEM controls the detection as below:
6657
6658 If it is nil, detect both text-format and eol-format. If the
6659 text-format part of CODING-SYSTEM is already specified
6660 (e.g. `iso-latin-1'), detect only eol-format. If the eol-format
6661 part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
6662 detect only text-format. */
6663
6664 Lisp_Object
6665 detect_coding_system (src, src_chars, src_bytes, highest, multibytep,
6666 coding_system)
6667 const unsigned char *src;
6668 int src_chars, src_bytes, highest;
6669 int multibytep;
6670 Lisp_Object coding_system;
6671 {
6672 const unsigned char *src_end = src + src_bytes;
6673 Lisp_Object attrs, eol_type;
6674 Lisp_Object val;
6675 struct coding_system coding;
6676 int id;
6677 struct coding_detection_info detect_info;
6678 enum coding_category base_category;
6679
6680 if (NILP (coding_system))
6681 coding_system = Qundecided;
6682 setup_coding_system (coding_system, &coding);
6683 attrs = CODING_ID_ATTRS (coding.id);
6684 eol_type = CODING_ID_EOL_TYPE (coding.id);
6685 coding_system = CODING_ATTR_BASE_NAME (attrs);
6686
6687 coding.source = src;
6688 coding.src_chars = src_chars;
6689 coding.src_bytes = src_bytes;
6690 coding.src_multibyte = multibytep;
6691 coding.consumed = 0;
6692 coding.mode |= CODING_MODE_LAST_BLOCK;
6693
6694 detect_info.checked = detect_info.found = detect_info.rejected = 0;
6695
6696 /* At first, detect text-format if necessary. */
6697 base_category = XINT (CODING_ATTR_CATEGORY (attrs));
6698 if (base_category == coding_category_undecided)
6699 {
6700 enum coding_category category;
6701 struct coding_system *this;
6702 int c, i;
6703
6704 /* Skip all ASCII bytes except for a few ISO2022 controls. */
6705 for (i = 0; src < src_end; i++, src++)
6706 {
6707 c = *src;
6708 if (c & 0x80 || (c < 0x20 && (c == 0
6709 || c == ISO_CODE_ESC
6710 || c == ISO_CODE_SI
6711 || c == ISO_CODE_SO)))
6712 break;
6713 }
6714 /* Skipped bytes must be even for utf-16 detecor. */
6715 if (i % 2)
6716 src--;
6717 coding.head_ascii = src - coding.source;
6718
6719 if (src < src_end)
6720 for (i = 0; i < coding_category_raw_text; i++)
6721 {
6722 category = coding_priorities[i];
6723 this = coding_categories + category;
6724
6725 if (this->id < 0)
6726 {
6727 /* No coding system of this category is defined. */
6728 detect_info.rejected |= (1 << category);
6729 }
6730 else if (category >= coding_category_raw_text)
6731 continue;
6732 else if (detect_info.checked & (1 << category))
6733 {
6734 if (highest
6735 && (detect_info.found & (1 << category)))
6736 break;
6737 }
6738 else
6739 {
6740 if ((*(this->detector)) (&coding, &detect_info)
6741 && highest
6742 && (detect_info.found & (1 << category)))
6743 {
6744 if (category == coding_category_utf_16_auto)
6745 {
6746 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6747 category = coding_category_utf_16_le;
6748 else
6749 category = coding_category_utf_16_be;
6750 }
6751 break;
6752 }
6753 }
6754 }
6755
6756 if (detect_info.rejected == CATEGORY_MASK_ANY)
6757 {
6758 detect_info.found = CATEGORY_MASK_RAW_TEXT;
6759 id = coding_categories[coding_category_raw_text].id;
6760 val = Fcons (make_number (id), Qnil);
6761 }
6762 else if (! detect_info.rejected && ! detect_info.found)
6763 {
6764 detect_info.found = CATEGORY_MASK_ANY;
6765 id = coding_categories[coding_category_undecided].id;
6766 val = Fcons (make_number (id), Qnil);
6767 }
6768 else if (highest)
6769 {
6770 if (detect_info.found)
6771 {
6772 detect_info.found = 1 << category;
6773 val = Fcons (make_number (this->id), Qnil);
6774 }
6775 else
6776 for (i = 0; i < coding_category_raw_text; i++)
6777 if (! (detect_info.rejected & (1 << coding_priorities[i])))
6778 {
6779 detect_info.found = 1 << coding_priorities[i];
6780 id = coding_categories[coding_priorities[i]].id;
6781 val = Fcons (make_number (id), Qnil);
6782 break;
6783 }
6784 }
6785 else
6786 {
6787 int mask = detect_info.rejected | detect_info.found;
6788 int found = 0;
6789 val = Qnil;
6790
6791 for (i = coding_category_raw_text - 1; i >= 0; i--)
6792 {
6793 category = coding_priorities[i];
6794 if (! (mask & (1 << category)))
6795 {
6796 found |= 1 << category;
6797 id = coding_categories[category].id;
6798 val = Fcons (make_number (id), val);
6799 }
6800 }
6801 for (i = coding_category_raw_text - 1; i >= 0; i--)
6802 {
6803 category = coding_priorities[i];
6804 if (detect_info.found & (1 << category))
6805 {
6806 id = coding_categories[category].id;
6807 val = Fcons (make_number (id), val);
6808 }
6809 }
6810 detect_info.found |= found;
6811 }
6812 }
6813 else if (base_category == coding_category_utf_16_auto)
6814 {
6815 if (detect_coding_utf_16 (&coding, &detect_info))
6816 {
6817 enum coding_category category;
6818 struct coding_system *this;
6819
6820 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6821 this = coding_categories + coding_category_utf_16_le;
6822 else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6823 this = coding_categories + coding_category_utf_16_be;
6824 else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
6825 this = coding_categories + coding_category_utf_16_be_nosig;
6826 else
6827 this = coding_categories + coding_category_utf_16_le_nosig;
6828 val = Fcons (make_number (this->id), Qnil);
6829 }
6830 }
6831 else
6832 {
6833 detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
6834 val = Fcons (make_number (coding.id), Qnil);
6835 }
6836
6837 /* Then, detect eol-format if necessary. */
6838 {
6839 int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol;
6840 Lisp_Object tail;
6841
6842 if (VECTORP (eol_type))
6843 {
6844 if (detect_info.found & ~CATEGORY_MASK_UTF_16)
6845 normal_eol = detect_eol (coding.source, src_bytes,
6846 coding_category_raw_text);
6847 if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
6848 | CATEGORY_MASK_UTF_16_BE_NOSIG))
6849 utf_16_be_eol = detect_eol (coding.source, src_bytes,
6850 coding_category_utf_16_be);
6851 if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
6852 | CATEGORY_MASK_UTF_16_LE_NOSIG))
6853 utf_16_le_eol = detect_eol (coding.source, src_bytes,
6854 coding_category_utf_16_le);
6855 }
6856 else
6857 {
6858 if (EQ (eol_type, Qunix))
6859 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
6860 else if (EQ (eol_type, Qdos))
6861 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
6862 else
6863 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
6864 }
6865
6866 for (tail = val; CONSP (tail); tail = XCDR (tail))
6867 {
6868 enum coding_category category;
6869 int this_eol;
6870
6871 id = XINT (XCAR (tail));
6872 attrs = CODING_ID_ATTRS (id);
6873 category = XINT (CODING_ATTR_CATEGORY (attrs));
6874 eol_type = CODING_ID_EOL_TYPE (id);
6875 if (VECTORP (eol_type))
6876 {
6877 if (category == coding_category_utf_16_be
6878 || category == coding_category_utf_16_be_nosig)
6879 this_eol = utf_16_be_eol;
6880 else if (category == coding_category_utf_16_le
6881 || category == coding_category_utf_16_le_nosig)
6882 this_eol = utf_16_le_eol;
6883 else
6884 this_eol = normal_eol;
6885
6886 if (this_eol == EOL_SEEN_LF)
6887 XSETCAR (tail, AREF (eol_type, 0));
6888 else if (this_eol == EOL_SEEN_CRLF)
6889 XSETCAR (tail, AREF (eol_type, 1));
6890 else if (this_eol == EOL_SEEN_CR)
6891 XSETCAR (tail, AREF (eol_type, 2));
6892 else
6893 XSETCAR (tail, CODING_ID_NAME (id));
6894 }
6895 else
6896 XSETCAR (tail, CODING_ID_NAME (id));
6897 }
6898 }
6899
6900 return (highest ? XCAR (val) : val);
6901 }
6902
6903
6904 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
6905 2, 3, 0,
6906 doc: /* Detect coding system of the text in the region between START and END.
6907 Return a list of possible coding systems ordered by priority.
6908
6909 If only ASCII characters are found, it returns a list of single element
6910 `undecided' or its subsidiary coding system according to a detected
6911 end-of-line format.
6912
6913 If optional argument HIGHEST is non-nil, return the coding system of
6914 highest priority. */)
6915 (start, end, highest)
6916 Lisp_Object start, end, highest;
6917 {
6918 int from, to;
6919 int from_byte, to_byte;
6920
6921 CHECK_NUMBER_COERCE_MARKER (start);
6922 CHECK_NUMBER_COERCE_MARKER (end);
6923
6924 validate_region (&start, &end);
6925 from = XINT (start), to = XINT (end);
6926 from_byte = CHAR_TO_BYTE (from);
6927 to_byte = CHAR_TO_BYTE (to);
6928
6929 if (from < GPT && to >= GPT)
6930 move_gap_both (to, to_byte);
6931
6932 return detect_coding_system (BYTE_POS_ADDR (from_byte),
6933 to - from, to_byte - from_byte,
6934 !NILP (highest),
6935 !NILP (current_buffer
6936 ->enable_multibyte_characters),
6937 Qnil);
6938 }
6939
6940 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
6941 1, 2, 0,
6942 doc: /* Detect coding system of the text in STRING.
6943 Return a list of possible coding systems ordered by priority.
6944
6945 If only ASCII characters are found, it returns a list of single element
6946 `undecided' or its subsidiary coding system according to a detected
6947 end-of-line format.
6948
6949 If optional argument HIGHEST is non-nil, return the coding system of
6950 highest priority. */)
6951 (string, highest)
6952 Lisp_Object string, highest;
6953 {
6954 CHECK_STRING (string);
6955
6956 return detect_coding_system (SDATA (string),
6957 SCHARS (string), SBYTES (string),
6958 !NILP (highest), STRING_MULTIBYTE (string),
6959 Qnil);
6960 }
6961
6962
6963 static INLINE int
6964 char_encodable_p (c, attrs)
6965 int c;
6966 Lisp_Object attrs;
6967 {
6968 Lisp_Object tail;
6969 struct charset *charset;
6970
6971 for (tail = CODING_ATTR_CHARSET_LIST (attrs);
6972 CONSP (tail); tail = XCDR (tail))
6973 {
6974 charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
6975 if (CHAR_CHARSET_P (c, charset))
6976 break;
6977 }
6978 return (! NILP (tail));
6979 }
6980
6981
6982 /* Return a list of coding systems that safely encode the text between
6983 START and END. If EXCLUDE is non-nil, it is a list of coding
6984 systems not to check. The returned list doesn't contain any such
6985 coding systems. In any case, if the text contains only ASCII or is
6986 unibyte, return t. */
6987
6988 DEFUN ("find-coding-systems-region-internal",
6989 Ffind_coding_systems_region_internal,
6990 Sfind_coding_systems_region_internal, 2, 3, 0,
6991 doc: /* Internal use only. */)
6992 (start, end, exclude)
6993 Lisp_Object start, end, exclude;
6994 {
6995 Lisp_Object coding_attrs_list, safe_codings;
6996 EMACS_INT start_byte, end_byte;
6997 const unsigned char *p, *pbeg, *pend;
6998 int c;
6999 Lisp_Object tail, elt;
7000
7001 if (STRINGP (start))
7002 {
7003 if (!STRING_MULTIBYTE (start)
7004 || SCHARS (start) == SBYTES (start))
7005 return Qt;
7006 start_byte = 0;
7007 end_byte = SBYTES (start);
7008 }
7009 else
7010 {
7011 CHECK_NUMBER_COERCE_MARKER (start);
7012 CHECK_NUMBER_COERCE_MARKER (end);
7013 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
7014 args_out_of_range (start, end);
7015 if (NILP (current_buffer->enable_multibyte_characters))
7016 return Qt;
7017 start_byte = CHAR_TO_BYTE (XINT (start));
7018 end_byte = CHAR_TO_BYTE (XINT (end));
7019 if (XINT (end) - XINT (start) == end_byte - start_byte)
7020 return Qt;
7021
7022 if (XINT (start) < GPT && XINT (end) > GPT)
7023 {
7024 if ((GPT - XINT (start)) < (XINT (end) - GPT))
7025 move_gap_both (XINT (start), start_byte);
7026 else
7027 move_gap_both (XINT (end), end_byte);
7028 }
7029 }
7030
7031 coding_attrs_list = Qnil;
7032 for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
7033 if (NILP (exclude)
7034 || NILP (Fmemq (XCAR (tail), exclude)))
7035 {
7036 Lisp_Object attrs;
7037
7038 attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
7039 if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs))
7040 && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
7041 coding_attrs_list = Fcons (attrs, coding_attrs_list);
7042 }
7043
7044 if (STRINGP (start))
7045 p = pbeg = SDATA (start);
7046 else
7047 p = pbeg = BYTE_POS_ADDR (start_byte);
7048 pend = p + (end_byte - start_byte);
7049
7050 while (p < pend && ASCII_BYTE_P (*p)) p++;
7051 while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
7052
7053 while (p < pend)
7054 {
7055 if (ASCII_BYTE_P (*p))
7056 p++;
7057 else
7058 {
7059 c = STRING_CHAR_ADVANCE (p);
7060
7061 charset_map_loaded = 0;
7062 for (tail = coding_attrs_list; CONSP (tail);)
7063 {
7064 elt = XCAR (tail);
7065 if (NILP (elt))
7066 tail = XCDR (tail);
7067 else if (char_encodable_p (c, elt))
7068 tail = XCDR (tail);
7069 else if (CONSP (XCDR (tail)))
7070 {
7071 XSETCAR (tail, XCAR (XCDR (tail)));
7072 XSETCDR (tail, XCDR (XCDR (tail)));
7073 }
7074 else
7075 {
7076 XSETCAR (tail, Qnil);
7077 tail = XCDR (tail);
7078 }
7079 }
7080 if (charset_map_loaded)
7081 {
7082 EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
7083
7084 if (STRINGP (start))
7085 pbeg = SDATA (start);
7086 else
7087 pbeg = BYTE_POS_ADDR (start_byte);
7088 p = pbeg + p_offset;
7089 pend = pbeg + pend_offset;
7090 }
7091 }
7092 }
7093
7094 safe_codings = Qnil;
7095 for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
7096 if (! NILP (XCAR (tail)))
7097 safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
7098
7099 return safe_codings;
7100 }
7101
7102
7103 DEFUN ("unencodable-char-position", Funencodable_char_position,
7104 Sunencodable_char_position, 3, 5, 0,
7105 doc: /*
7106 Return position of first un-encodable character in a region.
7107 START and END specfiy the region and CODING-SYSTEM specifies the
7108 encoding to check. Return nil if CODING-SYSTEM does encode the region.
7109
7110 If optional 4th argument COUNT is non-nil, it specifies at most how
7111 many un-encodable characters to search. In this case, the value is a
7112 list of positions.
7113
7114 If optional 5th argument STRING is non-nil, it is a string to search
7115 for un-encodable characters. In that case, START and END are indexes
7116 to the string. */)
7117 (start, end, coding_system, count, string)
7118 Lisp_Object start, end, coding_system, count, string;
7119 {
7120 int n;
7121 struct coding_system coding;
7122 Lisp_Object attrs, charset_list;
7123 Lisp_Object positions;
7124 int from, to;
7125 const unsigned char *p, *stop, *pend;
7126 int ascii_compatible;
7127
7128 setup_coding_system (Fcheck_coding_system (coding_system), &coding);
7129 attrs = CODING_ID_ATTRS (coding.id);
7130 if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
7131 return Qnil;
7132 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
7133 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
7134
7135 if (NILP (string))
7136 {
7137 validate_region (&start, &end);
7138 from = XINT (start);
7139 to = XINT (end);
7140 if (NILP (current_buffer->enable_multibyte_characters)
7141 || (ascii_compatible
7142 && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
7143 return Qnil;
7144 p = CHAR_POS_ADDR (from);
7145 pend = CHAR_POS_ADDR (to);
7146 if (from < GPT && to >= GPT)
7147 stop = GPT_ADDR;
7148 else
7149 stop = pend;
7150 }
7151 else
7152 {
7153 CHECK_STRING (string);
7154 CHECK_NATNUM (start);
7155 CHECK_NATNUM (end);
7156 from = XINT (start);
7157 to = XINT (end);
7158 if (from > to
7159 || to > SCHARS (string))
7160 args_out_of_range_3 (string, start, end);
7161 if (! STRING_MULTIBYTE (string))
7162 return Qnil;
7163 p = SDATA (string) + string_char_to_byte (string, from);
7164 stop = pend = SDATA (string) + string_char_to_byte (string, to);
7165 if (ascii_compatible && (to - from) == (pend - p))
7166 return Qnil;
7167 }
7168
7169 if (NILP (count))
7170 n = 1;
7171 else
7172 {
7173 CHECK_NATNUM (count);
7174 n = XINT (count);
7175 }
7176
7177 positions = Qnil;
7178 while (1)
7179 {
7180 int c;
7181
7182 if (ascii_compatible)
7183 while (p < stop && ASCII_BYTE_P (*p))
7184 p++, from++;
7185 if (p >= stop)
7186 {
7187 if (p >= pend)
7188 break;
7189 stop = pend;
7190 p = GAP_END_ADDR;
7191 }
7192
7193 c = STRING_CHAR_ADVANCE (p);
7194 if (! (ASCII_CHAR_P (c) && ascii_compatible)
7195 && ! char_charset (c, charset_list, NULL))
7196 {
7197 positions = Fcons (make_number (from), positions);
7198 n--;
7199 if (n == 0)
7200 break;
7201 }
7202
7203 from++;
7204 }
7205
7206 return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
7207 }
7208
7209
7210 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
7211 Scheck_coding_systems_region, 3, 3, 0,
7212 doc: /* Check if the region is encodable by coding systems.
7213
7214 START and END are buffer positions specifying the region.
7215 CODING-SYSTEM-LIST is a list of coding systems to check.
7216
7217 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
7218 CODING-SYSTEM is a member of CODING-SYSTEM-LIst and can't encode the
7219 whole region, POS0, POS1, ... are buffer positions where non-encodable
7220 characters are found.
7221
7222 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
7223 value is nil.
7224
7225 START may be a string. In that case, check if the string is
7226 encodable, and the value contains indices to the string instead of
7227 buffer positions. END is ignored. */)
7228 (start, end, coding_system_list)
7229 Lisp_Object start, end, coding_system_list;
7230 {
7231 Lisp_Object list;
7232 EMACS_INT start_byte, end_byte;
7233 int pos;
7234 const unsigned char *p, *pbeg, *pend;
7235 int c;
7236 Lisp_Object tail, elt;
7237
7238 if (STRINGP (start))
7239 {
7240 if (!STRING_MULTIBYTE (start)
7241 && SCHARS (start) != SBYTES (start))
7242 return Qnil;
7243 start_byte = 0;
7244 end_byte = SBYTES (start);
7245 pos = 0;
7246 }
7247 else
7248 {
7249 CHECK_NUMBER_COERCE_MARKER (start);
7250 CHECK_NUMBER_COERCE_MARKER (end);
7251 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
7252 args_out_of_range (start, end);
7253 if (NILP (current_buffer->enable_multibyte_characters))
7254 return Qnil;
7255 start_byte = CHAR_TO_BYTE (XINT (start));
7256 end_byte = CHAR_TO_BYTE (XINT (end));
7257 if (XINT (end) - XINT (start) == end_byte - start_byte)
7258 return Qt;
7259
7260 if (XINT (start) < GPT && XINT (end) > GPT)
7261 {
7262 if ((GPT - XINT (start)) < (XINT (end) - GPT))
7263 move_gap_both (XINT (start), start_byte);
7264 else
7265 move_gap_both (XINT (end), end_byte);
7266 }
7267 pos = XINT (start);
7268 }
7269
7270 list = Qnil;
7271 for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
7272 {
7273 elt = XCAR (tail);
7274 list = Fcons (Fcons (elt, Fcons (AREF (CODING_SYSTEM_SPEC (elt), 0),
7275 Qnil)),
7276 list);
7277 }
7278
7279 if (STRINGP (start))
7280 p = pbeg = SDATA (start);
7281 else
7282 p = pbeg = BYTE_POS_ADDR (start_byte);
7283 pend = p + (end_byte - start_byte);
7284
7285 while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
7286 while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
7287
7288 while (p < pend)
7289 {
7290 if (ASCII_BYTE_P (*p))
7291 p++;
7292 else
7293 {
7294 c = STRING_CHAR_ADVANCE (p);
7295
7296 charset_map_loaded = 0;
7297 for (tail = list; CONSP (tail); tail = XCDR (tail))
7298 {
7299 elt = XCDR (XCAR (tail));
7300 if (! char_encodable_p (c, XCAR (elt)))
7301 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
7302 }
7303 if (charset_map_loaded)
7304 {
7305 EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
7306
7307 if (STRINGP (start))
7308 pbeg = SDATA (start);
7309 else
7310 pbeg = BYTE_POS_ADDR (start_byte);
7311 p = pbeg + p_offset;
7312 pend = pbeg + pend_offset;
7313 }
7314 }
7315 pos++;
7316 }
7317
7318 tail = list;
7319 list = Qnil;
7320 for (; CONSP (tail); tail = XCDR (tail))
7321 {
7322 elt = XCAR (tail);
7323 if (CONSP (XCDR (XCDR (elt))))
7324 list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
7325 list);
7326 }
7327
7328 return list;
7329 }
7330
7331
7332
7333 Lisp_Object
7334 code_convert_region (start, end, coding_system, dst_object, encodep, norecord)
7335 Lisp_Object start, end, coding_system, dst_object;
7336 int encodep, norecord;
7337 {
7338 struct coding_system coding;
7339 EMACS_INT from, from_byte, to, to_byte;
7340 Lisp_Object src_object;
7341
7342 CHECK_NUMBER_COERCE_MARKER (start);
7343 CHECK_NUMBER_COERCE_MARKER (end);
7344 if (NILP (coding_system))
7345 coding_system = Qno_conversion;
7346 else
7347 CHECK_CODING_SYSTEM (coding_system);
7348 src_object = Fcurrent_buffer ();
7349 if (NILP (dst_object))
7350 dst_object = src_object;
7351 else if (! EQ (dst_object, Qt))
7352 CHECK_BUFFER (dst_object);
7353
7354 validate_region (&start, &end);
7355 from = XFASTINT (start);
7356 from_byte = CHAR_TO_BYTE (from);
7357 to = XFASTINT (end);
7358 to_byte = CHAR_TO_BYTE (to);
7359
7360 setup_coding_system (coding_system, &coding);
7361 coding.mode |= CODING_MODE_LAST_BLOCK;
7362
7363 if (encodep)
7364 encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
7365 dst_object);
7366 else
7367 decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
7368 dst_object);
7369 if (! norecord)
7370 Vlast_coding_system_used = CODING_ID_NAME (coding.id);
7371
7372 if (coding.result != CODING_RESULT_SUCCESS)
7373 error ("Code conversion error: %d", coding.result);
7374
7375 return (BUFFERP (dst_object)
7376 ? make_number (coding.produced_char)
7377 : coding.dst_object);
7378 }
7379
7380
7381 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
7382 3, 4, "r\nzCoding system: ",
7383 doc: /* Decode the current region from the specified coding system.
7384 When called from a program, takes four arguments:
7385 START, END, CODING-SYSTEM, and DESTINATION.
7386 START and END are buffer positions.
7387
7388 Optional 4th arguments DESTINATION specifies where the decoded text goes.
7389 If nil, the region between START and END is replace by the decoded text.
7390 If buffer, the decoded text is inserted in the buffer.
7391 If t, the decoded text is returned.
7392
7393 This function sets `last-coding-system-used' to the precise coding system
7394 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7395 not fully specified.)
7396 It returns the length of the decoded text. */)
7397 (start, end, coding_system, destination)
7398 Lisp_Object start, end, coding_system, destination;
7399 {
7400 return code_convert_region (start, end, coding_system, destination, 0, 0);
7401 }
7402
7403 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
7404 3, 4, "r\nzCoding system: ",
7405 doc: /* Encode the current region by specified coding system.
7406 When called from a program, takes three arguments:
7407 START, END, and CODING-SYSTEM. START and END are buffer positions.
7408
7409 Optional 4th arguments DESTINATION specifies where the encoded text goes.
7410 If nil, the region between START and END is replace by the encoded text.
7411 If buffer, the encoded text is inserted in the buffer.
7412 If t, the encoded text is returned.
7413
7414 This function sets `last-coding-system-used' to the precise coding system
7415 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7416 not fully specified.)
7417 It returns the length of the encoded text. */)
7418 (start, end, coding_system, destination)
7419 Lisp_Object start, end, coding_system, destination;
7420 {
7421 return code_convert_region (start, end, coding_system, destination, 1, 0);
7422 }
7423
7424 Lisp_Object
7425 code_convert_string (string, coding_system, dst_object,
7426 encodep, nocopy, norecord)
7427 Lisp_Object string, coding_system, dst_object;
7428 int encodep, nocopy, norecord;
7429 {
7430 struct coding_system coding;
7431 EMACS_INT chars, bytes;
7432
7433 CHECK_STRING (string);
7434 if (NILP (coding_system))
7435 {
7436 if (! norecord)
7437 Vlast_coding_system_used = Qno_conversion;
7438 if (NILP (dst_object))
7439 return (nocopy ? Fcopy_sequence (string) : string);
7440 }
7441
7442 if (NILP (coding_system))
7443 coding_system = Qno_conversion;
7444 else
7445 CHECK_CODING_SYSTEM (coding_system);
7446 if (NILP (dst_object))
7447 dst_object = Qt;
7448 else if (! EQ (dst_object, Qt))
7449 CHECK_BUFFER (dst_object);
7450
7451 setup_coding_system (coding_system, &coding);
7452 coding.mode |= CODING_MODE_LAST_BLOCK;
7453 chars = SCHARS (string);
7454 bytes = SBYTES (string);
7455 if (encodep)
7456 encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
7457 else
7458 decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
7459 if (! norecord)
7460 Vlast_coding_system_used = CODING_ID_NAME (coding.id);
7461
7462 if (coding.result != CODING_RESULT_SUCCESS)
7463 error ("Code conversion error: %d", coding.result);
7464
7465 return (BUFFERP (dst_object)
7466 ? make_number (coding.produced_char)
7467 : coding.dst_object);
7468 }
7469
7470
7471 /* Encode or decode STRING according to CODING_SYSTEM.
7472 Do not set Vlast_coding_system_used.
7473
7474 This function is called only from macros DECODE_FILE and
7475 ENCODE_FILE, thus we ignore character composition. */
7476
7477 Lisp_Object
7478 code_convert_string_norecord (string, coding_system, encodep)
7479 Lisp_Object string, coding_system;
7480 int encodep;
7481 {
7482 return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
7483 }
7484
7485
7486 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
7487 2, 4, 0,
7488 doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
7489
7490 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
7491 if the decoding operation is trivial.
7492
7493 Optional fourth arg BUFFER non-nil meant that the decoded text is
7494 inserted in BUFFER instead of returned as a string. In this case,
7495 the return value is BUFFER.
7496
7497 This function sets `last-coding-system-used' to the precise coding system
7498 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7499 not fully specified. */)
7500 (string, coding_system, nocopy, buffer)
7501 Lisp_Object string, coding_system, nocopy, buffer;
7502 {
7503 return code_convert_string (string, coding_system, buffer,
7504 0, ! NILP (nocopy), 0);
7505 }
7506
7507 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
7508 2, 4, 0,
7509 doc: /* Encode STRING to CODING-SYSTEM, and return the result.
7510
7511 Optional third arg NOCOPY non-nil means it is OK to return STRING
7512 itself if the encoding operation is trivial.
7513
7514 Optional fourth arg BUFFER non-nil meant that the encoded text is
7515 inserted in BUFFER instead of returned as a string. In this case,
7516 the return value is BUFFER.
7517
7518 This function sets `last-coding-system-used' to the precise coding system
7519 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7520 not fully specified.) */)
7521 (string, coding_system, nocopy, buffer)
7522 Lisp_Object string, coding_system, nocopy, buffer;
7523 {
7524 return code_convert_string (string, coding_system, buffer,
7525 1, ! NILP (nocopy), 1);
7526 }
7527
7528 \f
7529 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
7530 doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
7531 Return the corresponding character. */)
7532 (code)
7533 Lisp_Object code;
7534 {
7535 Lisp_Object spec, attrs, val;
7536 struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
7537 int c;
7538
7539 CHECK_NATNUM (code);
7540 c = XFASTINT (code);
7541 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
7542 attrs = AREF (spec, 0);
7543
7544 if (ASCII_BYTE_P (c)
7545 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
7546 return code;
7547
7548 val = CODING_ATTR_CHARSET_LIST (attrs);
7549 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
7550 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
7551 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
7552
7553 if (c <= 0x7F)
7554 charset = charset_roman;
7555 else if (c >= 0xA0 && c < 0xDF)
7556 {
7557 charset = charset_kana;
7558 c -= 0x80;
7559 }
7560 else
7561 {
7562 int s1 = c >> 8, s2 = c & 0xFF;
7563
7564 if (s1 < 0x81 || (s1 > 0x9F && s1 < 0xE0) || s1 > 0xEF
7565 || s2 < 0x40 || s2 == 0x7F || s2 > 0xFC)
7566 error ("Invalid code: %d", code);
7567 SJIS_TO_JIS (c);
7568 charset = charset_kanji;
7569 }
7570 c = DECODE_CHAR (charset, c);
7571 if (c < 0)
7572 error ("Invalid code: %d", code);
7573 return make_number (c);
7574 }
7575
7576
7577 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
7578 doc: /* Encode a Japanese character CHAR to shift_jis encoding.
7579 Return the corresponding code in SJIS. */)
7580 (ch)
7581 Lisp_Object ch;
7582 {
7583 Lisp_Object spec, attrs, charset_list;
7584 int c;
7585 struct charset *charset;
7586 unsigned code;
7587
7588 CHECK_CHARACTER (ch);
7589 c = XFASTINT (ch);
7590 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
7591 attrs = AREF (spec, 0);
7592
7593 if (ASCII_CHAR_P (c)
7594 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
7595 return ch;
7596
7597 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
7598 charset = char_charset (c, charset_list, &code);
7599 if (code == CHARSET_INVALID_CODE (charset))
7600 error ("Can't encode by shift_jis encoding: %d", c);
7601 JIS_TO_SJIS (code);
7602
7603 return make_number (code);
7604 }
7605
7606 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
7607 doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
7608 Return the corresponding character. */)
7609 (code)
7610 Lisp_Object code;
7611 {
7612 Lisp_Object spec, attrs, val;
7613 struct charset *charset_roman, *charset_big5, *charset;
7614 int c;
7615
7616 CHECK_NATNUM (code);
7617 c = XFASTINT (code);
7618 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
7619 attrs = AREF (spec, 0);
7620
7621 if (ASCII_BYTE_P (c)
7622 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
7623 return code;
7624
7625 val = CODING_ATTR_CHARSET_LIST (attrs);
7626 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
7627 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
7628
7629 if (c <= 0x7F)
7630 charset = charset_roman;
7631 else
7632 {
7633 int b1 = c >> 8, b2 = c & 0x7F;
7634 if (b1 < 0xA1 || b1 > 0xFE
7635 || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
7636 error ("Invalid code: %d", code);
7637 charset = charset_big5;
7638 }
7639 c = DECODE_CHAR (charset, (unsigned )c);
7640 if (c < 0)
7641 error ("Invalid code: %d", code);
7642 return make_number (c);
7643 }
7644
7645 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
7646 doc: /* Encode the Big5 character CHAR to BIG5 coding system.
7647 Return the corresponding character code in Big5. */)
7648 (ch)
7649 Lisp_Object ch;
7650 {
7651 Lisp_Object spec, attrs, charset_list;
7652 struct charset *charset;
7653 int c;
7654 unsigned code;
7655
7656 CHECK_CHARACTER (ch);
7657 c = XFASTINT (ch);
7658 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
7659 attrs = AREF (spec, 0);
7660 if (ASCII_CHAR_P (c)
7661 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
7662 return ch;
7663
7664 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
7665 charset = char_charset (c, charset_list, &code);
7666 if (code == CHARSET_INVALID_CODE (charset))
7667 error ("Can't encode by Big5 encoding: %d", c);
7668
7669 return make_number (code);
7670 }
7671
7672 \f
7673 DEFUN ("set-terminal-coding-system-internal",
7674 Fset_terminal_coding_system_internal,
7675 Sset_terminal_coding_system_internal, 1, 1, 0,
7676 doc: /* Internal use only. */)
7677 (coding_system)
7678 Lisp_Object coding_system;
7679 {
7680 CHECK_SYMBOL (coding_system);
7681 setup_coding_system (Fcheck_coding_system (coding_system),
7682 &terminal_coding);
7683
7684 /* We had better not send unsafe characters to terminal. */
7685 terminal_coding.mode |= CODING_MODE_SAFE_ENCODING;
7686 /* Characer composition should be disabled. */
7687 terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7688 terminal_coding.src_multibyte = 1;
7689 terminal_coding.dst_multibyte = 0;
7690 return Qnil;
7691 }
7692
7693 DEFUN ("set-safe-terminal-coding-system-internal",
7694 Fset_safe_terminal_coding_system_internal,
7695 Sset_safe_terminal_coding_system_internal, 1, 1, 0,
7696 doc: /* Internal use only. */)
7697 (coding_system)
7698 Lisp_Object coding_system;
7699 {
7700 CHECK_SYMBOL (coding_system);
7701 setup_coding_system (Fcheck_coding_system (coding_system),
7702 &safe_terminal_coding);
7703 /* Characer composition should be disabled. */
7704 safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7705 safe_terminal_coding.src_multibyte = 1;
7706 safe_terminal_coding.dst_multibyte = 0;
7707 return Qnil;
7708 }
7709
7710 DEFUN ("terminal-coding-system",
7711 Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
7712 doc: /* Return coding system specified for terminal output. */)
7713 ()
7714 {
7715 return CODING_ID_NAME (terminal_coding.id);
7716 }
7717
7718 DEFUN ("set-keyboard-coding-system-internal",
7719 Fset_keyboard_coding_system_internal,
7720 Sset_keyboard_coding_system_internal, 1, 1, 0,
7721 doc: /* Internal use only. */)
7722 (coding_system)
7723 Lisp_Object coding_system;
7724 {
7725 CHECK_SYMBOL (coding_system);
7726 setup_coding_system (Fcheck_coding_system (coding_system),
7727 &keyboard_coding);
7728 /* Characer composition should be disabled. */
7729 keyboard_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7730 return Qnil;
7731 }
7732
7733 DEFUN ("keyboard-coding-system",
7734 Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
7735 doc: /* Return coding system specified for decoding keyboard input. */)
7736 ()
7737 {
7738 return CODING_ID_NAME (keyboard_coding.id);
7739 }
7740
7741 \f
7742 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
7743 Sfind_operation_coding_system, 1, MANY, 0,
7744 doc: /* Choose a coding system for an operation based on the target name.
7745 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
7746 DECODING-SYSTEM is the coding system to use for decoding
7747 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
7748 for encoding (in case OPERATION does encoding).
7749
7750 The first argument OPERATION specifies an I/O primitive:
7751 For file I/O, `insert-file-contents' or `write-region'.
7752 For process I/O, `call-process', `call-process-region', or `start-process'.
7753 For network I/O, `open-network-stream'.
7754
7755 The remaining arguments should be the same arguments that were passed
7756 to the primitive. Depending on which primitive, one of those arguments
7757 is selected as the TARGET. For example, if OPERATION does file I/O,
7758 whichever argument specifies the file name is TARGET.
7759
7760 TARGET has a meaning which depends on OPERATION:
7761 For file I/O, TARGET is a file name.
7762 For process I/O, TARGET is a process name.
7763 For network I/O, TARGET is a service name or a port number
7764
7765 This function looks up what specified for TARGET in,
7766 `file-coding-system-alist', `process-coding-system-alist',
7767 or `network-coding-system-alist' depending on OPERATION.
7768 They may specify a coding system, a cons of coding systems,
7769 or a function symbol to call.
7770 In the last case, we call the function with one argument,
7771 which is a list of all the arguments given to this function.
7772
7773 usage: (find-operation-coding-system OPERATION ARGUMENTS ...) */)
7774 (nargs, args)
7775 int nargs;
7776 Lisp_Object *args;
7777 {
7778 Lisp_Object operation, target_idx, target, val;
7779 register Lisp_Object chain;
7780
7781 if (nargs < 2)
7782 error ("Too few arguments");
7783 operation = args[0];
7784 if (!SYMBOLP (operation)
7785 || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
7786 error ("Invalid first arguement");
7787 if (nargs < 1 + XINT (target_idx))
7788 error ("Too few arguments for operation: %s",
7789 SDATA (SYMBOL_NAME (operation)));
7790 target = args[XINT (target_idx) + 1];
7791 if (!(STRINGP (target)
7792 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
7793 error ("Invalid %dth argument", XINT (target_idx) + 1);
7794
7795 chain = ((EQ (operation, Qinsert_file_contents)
7796 || EQ (operation, Qwrite_region))
7797 ? Vfile_coding_system_alist
7798 : (EQ (operation, Qopen_network_stream)
7799 ? Vnetwork_coding_system_alist
7800 : Vprocess_coding_system_alist));
7801 if (NILP (chain))
7802 return Qnil;
7803
7804 for (; CONSP (chain); chain = XCDR (chain))
7805 {
7806 Lisp_Object elt;
7807
7808 elt = XCAR (chain);
7809 if (CONSP (elt)
7810 && ((STRINGP (target)
7811 && STRINGP (XCAR (elt))
7812 && fast_string_match (XCAR (elt), target) >= 0)
7813 || (INTEGERP (target) && EQ (target, XCAR (elt)))))
7814 {
7815 val = XCDR (elt);
7816 /* Here, if VAL is both a valid coding system and a valid
7817 function symbol, we return VAL as a coding system. */
7818 if (CONSP (val))
7819 return val;
7820 if (! SYMBOLP (val))
7821 return Qnil;
7822 if (! NILP (Fcoding_system_p (val)))
7823 return Fcons (val, val);
7824 if (! NILP (Ffboundp (val)))
7825 {
7826 val = call1 (val, Flist (nargs, args));
7827 if (CONSP (val))
7828 return val;
7829 if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
7830 return Fcons (val, val);
7831 }
7832 return Qnil;
7833 }
7834 }
7835 return Qnil;
7836 }
7837
7838 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
7839 Sset_coding_system_priority, 0, MANY, 0,
7840 doc: /* Assign higher priority to the coding systems given as arguments.
7841 If multiple coding systems belongs to the same category,
7842 all but the first one are ignored.
7843
7844 usage: (set-coding-system-priority ...) */)
7845 (nargs, args)
7846 int nargs;
7847 Lisp_Object *args;
7848 {
7849 int i, j;
7850 int changed[coding_category_max];
7851 enum coding_category priorities[coding_category_max];
7852
7853 bzero (changed, sizeof changed);
7854
7855 for (i = j = 0; i < nargs; i++)
7856 {
7857 enum coding_category category;
7858 Lisp_Object spec, attrs;
7859
7860 CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
7861 attrs = AREF (spec, 0);
7862 category = XINT (CODING_ATTR_CATEGORY (attrs));
7863 if (changed[category])
7864 /* Ignore this coding system because a coding system of the
7865 same category already had a higher priority. */
7866 continue;
7867 changed[category] = 1;
7868 priorities[j++] = category;
7869 if (coding_categories[category].id >= 0
7870 && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
7871 setup_coding_system (args[i], &coding_categories[category]);
7872 Fset (AREF (Vcoding_category_table, category), args[i]);
7873 }
7874
7875 /* Now we have decided top J priorities. Reflect the order of the
7876 original priorities to the remaining priorities. */
7877
7878 for (i = j, j = 0; i < coding_category_max; i++, j++)
7879 {
7880 while (j < coding_category_max
7881 && changed[coding_priorities[j]])
7882 j++;
7883 if (j == coding_category_max)
7884 abort ();
7885 priorities[i] = coding_priorities[j];
7886 }
7887
7888 bcopy (priorities, coding_priorities, sizeof priorities);
7889
7890 /* Update `coding-category-list'. */
7891 Vcoding_category_list = Qnil;
7892 for (i = coding_category_max - 1; i >= 0; i--)
7893 Vcoding_category_list
7894 = Fcons (AREF (Vcoding_category_table, priorities[i]),
7895 Vcoding_category_list);
7896
7897 return Qnil;
7898 }
7899
7900 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
7901 Scoding_system_priority_list, 0, 1, 0,
7902 doc: /* Return a list of coding systems ordered by their priorities.
7903 HIGHESTP non-nil means just return the highest priority one. */)
7904 (highestp)
7905 Lisp_Object highestp;
7906 {
7907 int i;
7908 Lisp_Object val;
7909
7910 for (i = 0, val = Qnil; i < coding_category_max; i++)
7911 {
7912 enum coding_category category = coding_priorities[i];
7913 int id = coding_categories[category].id;
7914 Lisp_Object attrs;
7915
7916 if (id < 0)
7917 continue;
7918 attrs = CODING_ID_ATTRS (id);
7919 if (! NILP (highestp))
7920 return CODING_ATTR_BASE_NAME (attrs);
7921 val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
7922 }
7923 return Fnreverse (val);
7924 }
7925
7926 static char *suffixes[] = { "-unix", "-dos", "-mac" };
7927
7928 static Lisp_Object
7929 make_subsidiaries (base)
7930 Lisp_Object base;
7931 {
7932 Lisp_Object subsidiaries;
7933 int base_name_len = SBYTES (SYMBOL_NAME (base));
7934 char *buf = (char *) alloca (base_name_len + 6);
7935 int i;
7936
7937 bcopy (SDATA (SYMBOL_NAME (base)), buf, base_name_len);
7938 subsidiaries = Fmake_vector (make_number (3), Qnil);
7939 for (i = 0; i < 3; i++)
7940 {
7941 bcopy (suffixes[i], buf + base_name_len, strlen (suffixes[i]) + 1);
7942 ASET (subsidiaries, i, intern (buf));
7943 }
7944 return subsidiaries;
7945 }
7946
7947
7948 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
7949 Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
7950 doc: /* For internal use only.
7951 usage: (define-coding-system-internal ...) */)
7952 (nargs, args)
7953 int nargs;
7954 Lisp_Object *args;
7955 {
7956 Lisp_Object name;
7957 Lisp_Object spec_vec; /* [ ATTRS ALIASE EOL_TYPE ] */
7958 Lisp_Object attrs; /* Vector of attributes. */
7959 Lisp_Object eol_type;
7960 Lisp_Object aliases;
7961 Lisp_Object coding_type, charset_list, safe_charsets;
7962 enum coding_category category;
7963 Lisp_Object tail, val;
7964 int max_charset_id = 0;
7965 int i;
7966
7967 if (nargs < coding_arg_max)
7968 goto short_args;
7969
7970 attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
7971
7972 name = args[coding_arg_name];
7973 CHECK_SYMBOL (name);
7974 CODING_ATTR_BASE_NAME (attrs) = name;
7975
7976 val = args[coding_arg_mnemonic];
7977 if (! STRINGP (val))
7978 CHECK_CHARACTER (val);
7979 CODING_ATTR_MNEMONIC (attrs) = val;
7980
7981 coding_type = args[coding_arg_coding_type];
7982 CHECK_SYMBOL (coding_type);
7983 CODING_ATTR_TYPE (attrs) = coding_type;
7984
7985 charset_list = args[coding_arg_charset_list];
7986 if (SYMBOLP (charset_list))
7987 {
7988 if (EQ (charset_list, Qiso_2022))
7989 {
7990 if (! EQ (coding_type, Qiso_2022))
7991 error ("Invalid charset-list");
7992 charset_list = Viso_2022_charset_list;
7993 }
7994 else if (EQ (charset_list, Qemacs_mule))
7995 {
7996 if (! EQ (coding_type, Qemacs_mule))
7997 error ("Invalid charset-list");
7998 charset_list = Vemacs_mule_charset_list;
7999 }
8000 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
8001 if (max_charset_id < XFASTINT (XCAR (tail)))
8002 max_charset_id = XFASTINT (XCAR (tail));
8003 }
8004 else
8005 {
8006 charset_list = Fcopy_sequence (charset_list);
8007 for (tail = charset_list; !NILP (tail); tail = Fcdr (tail))
8008 {
8009 struct charset *charset;
8010
8011 val = Fcar (tail);
8012 CHECK_CHARSET_GET_CHARSET (val, charset);
8013 if (EQ (coding_type, Qiso_2022)
8014 ? CHARSET_ISO_FINAL (charset) < 0
8015 : EQ (coding_type, Qemacs_mule)
8016 ? CHARSET_EMACS_MULE_ID (charset) < 0
8017 : 0)
8018 error ("Can't handle charset `%s'",
8019 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8020
8021 XSETCAR (tail, make_number (charset->id));
8022 if (max_charset_id < charset->id)
8023 max_charset_id = charset->id;
8024 }
8025 }
8026 CODING_ATTR_CHARSET_LIST (attrs) = charset_list;
8027
8028 safe_charsets = Fmake_string (make_number (max_charset_id + 1),
8029 make_number (255));
8030 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
8031 SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
8032 CODING_ATTR_SAFE_CHARSETS (attrs) = safe_charsets;
8033
8034 CODING_ATTR_ASCII_COMPAT (attrs) = args[coding_arg_ascii_compatible_p];
8035
8036 val = args[coding_arg_decode_translation_table];
8037 if (! NILP (val))
8038 CHECK_CHAR_TABLE (val);
8039 CODING_ATTR_DECODE_TBL (attrs) = val;
8040
8041 val = args[coding_arg_encode_translation_table];
8042 if (! NILP (val))
8043 CHECK_CHAR_TABLE (val);
8044 CODING_ATTR_ENCODE_TBL (attrs) = val;
8045
8046 val = args[coding_arg_post_read_conversion];
8047 CHECK_SYMBOL (val);
8048 CODING_ATTR_POST_READ (attrs) = val;
8049
8050 val = args[coding_arg_pre_write_conversion];
8051 CHECK_SYMBOL (val);
8052 CODING_ATTR_PRE_WRITE (attrs) = val;
8053
8054 val = args[coding_arg_default_char];
8055 if (NILP (val))
8056 CODING_ATTR_DEFAULT_CHAR (attrs) = make_number (' ');
8057 else
8058 {
8059 CHECK_CHARACTER (val);
8060 CODING_ATTR_DEFAULT_CHAR (attrs) = val;
8061 }
8062
8063 val = args[coding_arg_for_unibyte];
8064 CODING_ATTR_FOR_UNIBYTE (attrs) = NILP (val) ? Qnil : Qt;
8065
8066 val = args[coding_arg_plist];
8067 CHECK_LIST (val);
8068 CODING_ATTR_PLIST (attrs) = val;
8069
8070 if (EQ (coding_type, Qcharset))
8071 {
8072 /* Generate a lisp vector of 256 elements. Each element is nil,
8073 integer, or a list of charset IDs.
8074
8075 If Nth element is nil, the byte code N is invalid in this
8076 coding system.
8077
8078 If Nth element is a number NUM, N is the first byte of a
8079 charset whose ID is NUM.
8080
8081 If Nth element is a list of charset IDs, N is the first byte
8082 of one of them. The list is sorted by dimensions of the
8083 charsets. A charset of smaller dimension comes firtst. */
8084 val = Fmake_vector (make_number (256), Qnil);
8085
8086 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
8087 {
8088 struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
8089 int dim = CHARSET_DIMENSION (charset);
8090 int idx = (dim - 1) * 4;
8091
8092 if (CHARSET_ASCII_COMPATIBLE_P (charset))
8093 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8094
8095 for (i = charset->code_space[idx];
8096 i <= charset->code_space[idx + 1]; i++)
8097 {
8098 Lisp_Object tmp, tmp2;
8099 int dim2;
8100
8101 tmp = AREF (val, i);
8102 if (NILP (tmp))
8103 tmp = XCAR (tail);
8104 else if (NUMBERP (tmp))
8105 {
8106 dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
8107 if (dim < dim2)
8108 tmp = Fcons (XCAR (tail), Fcons (tmp, Qnil));
8109 else
8110 tmp = Fcons (tmp, Fcons (XCAR (tail), Qnil));
8111 }
8112 else
8113 {
8114 for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
8115 {
8116 dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
8117 if (dim < dim2)
8118 break;
8119 }
8120 if (NILP (tmp2))
8121 tmp = nconc2 (tmp, Fcons (XCAR (tail), Qnil));
8122 else
8123 {
8124 XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
8125 XSETCAR (tmp2, XCAR (tail));
8126 }
8127 }
8128 ASET (val, i, tmp);
8129 }
8130 }
8131 ASET (attrs, coding_attr_charset_valids, val);
8132 category = coding_category_charset;
8133 }
8134 else if (EQ (coding_type, Qccl))
8135 {
8136 Lisp_Object valids;
8137
8138 if (nargs < coding_arg_ccl_max)
8139 goto short_args;
8140
8141 val = args[coding_arg_ccl_decoder];
8142 CHECK_CCL_PROGRAM (val);
8143 if (VECTORP (val))
8144 val = Fcopy_sequence (val);
8145 ASET (attrs, coding_attr_ccl_decoder, val);
8146
8147 val = args[coding_arg_ccl_encoder];
8148 CHECK_CCL_PROGRAM (val);
8149 if (VECTORP (val))
8150 val = Fcopy_sequence (val);
8151 ASET (attrs, coding_attr_ccl_encoder, val);
8152
8153 val = args[coding_arg_ccl_valids];
8154 valids = Fmake_string (make_number (256), make_number (0));
8155 for (tail = val; !NILP (tail); tail = Fcdr (tail))
8156 {
8157 int from, to;
8158
8159 val = Fcar (tail);
8160 if (INTEGERP (val))
8161 {
8162 from = to = XINT (val);
8163 if (from < 0 || from > 255)
8164 args_out_of_range_3 (val, make_number (0), make_number (255));
8165 }
8166 else
8167 {
8168 CHECK_CONS (val);
8169 CHECK_NATNUM_CAR (val);
8170 CHECK_NATNUM_CDR (val);
8171 from = XINT (XCAR (val));
8172 if (from > 255)
8173 args_out_of_range_3 (XCAR (val),
8174 make_number (0), make_number (255));
8175 to = XINT (XCDR (val));
8176 if (to < from || to > 255)
8177 args_out_of_range_3 (XCDR (val),
8178 XCAR (val), make_number (255));
8179 }
8180 for (i = from; i <= to; i++)
8181 SSET (valids, i, 1);
8182 }
8183 ASET (attrs, coding_attr_ccl_valids, valids);
8184
8185 category = coding_category_ccl;
8186 }
8187 else if (EQ (coding_type, Qutf_16))
8188 {
8189 Lisp_Object bom, endian;
8190
8191 CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
8192
8193 if (nargs < coding_arg_utf16_max)
8194 goto short_args;
8195
8196 bom = args[coding_arg_utf16_bom];
8197 if (! NILP (bom) && ! EQ (bom, Qt))
8198 {
8199 CHECK_CONS (bom);
8200 val = XCAR (bom);
8201 CHECK_CODING_SYSTEM (val);
8202 val = XCDR (bom);
8203 CHECK_CODING_SYSTEM (val);
8204 }
8205 ASET (attrs, coding_attr_utf_16_bom, bom);
8206
8207 endian = args[coding_arg_utf16_endian];
8208 CHECK_SYMBOL (endian);
8209 if (NILP (endian))
8210 endian = Qbig;
8211 else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
8212 error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
8213 ASET (attrs, coding_attr_utf_16_endian, endian);
8214
8215 category = (CONSP (bom)
8216 ? coding_category_utf_16_auto
8217 : NILP (bom)
8218 ? (EQ (endian, Qbig)
8219 ? coding_category_utf_16_be_nosig
8220 : coding_category_utf_16_le_nosig)
8221 : (EQ (endian, Qbig)
8222 ? coding_category_utf_16_be
8223 : coding_category_utf_16_le));
8224 }
8225 else if (EQ (coding_type, Qiso_2022))
8226 {
8227 Lisp_Object initial, reg_usage, request, flags;
8228 int i;
8229
8230 if (nargs < coding_arg_iso2022_max)
8231 goto short_args;
8232
8233 initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
8234 CHECK_VECTOR (initial);
8235 for (i = 0; i < 4; i++)
8236 {
8237 val = Faref (initial, make_number (i));
8238 if (! NILP (val))
8239 {
8240 struct charset *charset;
8241
8242 CHECK_CHARSET_GET_CHARSET (val, charset);
8243 ASET (initial, i, make_number (CHARSET_ID (charset)));
8244 if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
8245 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8246 }
8247 else
8248 ASET (initial, i, make_number (-1));
8249 }
8250
8251 reg_usage = args[coding_arg_iso2022_reg_usage];
8252 CHECK_CONS (reg_usage);
8253 CHECK_NUMBER_CAR (reg_usage);
8254 CHECK_NUMBER_CDR (reg_usage);
8255
8256 request = Fcopy_sequence (args[coding_arg_iso2022_request]);
8257 for (tail = request; ! NILP (tail); tail = Fcdr (tail))
8258 {
8259 int id;
8260 Lisp_Object tmp;
8261
8262 val = Fcar (tail);
8263 CHECK_CONS (val);
8264 tmp = XCAR (val);
8265 CHECK_CHARSET_GET_ID (tmp, id);
8266 CHECK_NATNUM_CDR (val);
8267 if (XINT (XCDR (val)) >= 4)
8268 error ("Invalid graphic register number: %d", XINT (XCDR (val)));
8269 XSETCAR (val, make_number (id));
8270 }
8271
8272 flags = args[coding_arg_iso2022_flags];
8273 CHECK_NATNUM (flags);
8274 i = XINT (flags);
8275 if (EQ (args[coding_arg_charset_list], Qiso_2022))
8276 flags = make_number (i | CODING_ISO_FLAG_FULL_SUPPORT);
8277
8278 ASET (attrs, coding_attr_iso_initial, initial);
8279 ASET (attrs, coding_attr_iso_usage, reg_usage);
8280 ASET (attrs, coding_attr_iso_request, request);
8281 ASET (attrs, coding_attr_iso_flags, flags);
8282 setup_iso_safe_charsets (attrs);
8283
8284 if (i & CODING_ISO_FLAG_SEVEN_BITS)
8285 category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
8286 | CODING_ISO_FLAG_SINGLE_SHIFT))
8287 ? coding_category_iso_7_else
8288 : EQ (args[coding_arg_charset_list], Qiso_2022)
8289 ? coding_category_iso_7
8290 : coding_category_iso_7_tight);
8291 else
8292 {
8293 int id = XINT (AREF (initial, 1));
8294
8295 category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
8296 || EQ (args[coding_arg_charset_list], Qiso_2022)
8297 || id < 0)
8298 ? coding_category_iso_8_else
8299 : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
8300 ? coding_category_iso_8_1
8301 : coding_category_iso_8_2);
8302 }
8303 if (category != coding_category_iso_8_1
8304 && category != coding_category_iso_8_2)
8305 CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
8306 }
8307 else if (EQ (coding_type, Qemacs_mule))
8308 {
8309 if (EQ (args[coding_arg_charset_list], Qemacs_mule))
8310 ASET (attrs, coding_attr_emacs_mule_full, Qt);
8311 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8312 category = coding_category_emacs_mule;
8313 }
8314 else if (EQ (coding_type, Qshift_jis))
8315 {
8316
8317 struct charset *charset;
8318
8319 if (XINT (Flength (charset_list)) != 3)
8320 error ("There should be just three charsets");
8321
8322 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8323 if (CHARSET_DIMENSION (charset) != 1)
8324 error ("Dimension of charset %s is not one",
8325 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8326 if (CHARSET_ASCII_COMPATIBLE_P (charset))
8327 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8328
8329 charset_list = XCDR (charset_list);
8330 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8331 if (CHARSET_DIMENSION (charset) != 1)
8332 error ("Dimension of charset %s is not one",
8333 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8334
8335 charset_list = XCDR (charset_list);
8336 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8337 if (CHARSET_DIMENSION (charset) != 2)
8338 error ("Dimension of charset %s is not two",
8339 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8340
8341 category = coding_category_sjis;
8342 Vsjis_coding_system = name;
8343 }
8344 else if (EQ (coding_type, Qbig5))
8345 {
8346 struct charset *charset;
8347
8348 if (XINT (Flength (charset_list)) != 2)
8349 error ("There should be just two charsets");
8350
8351 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8352 if (CHARSET_DIMENSION (charset) != 1)
8353 error ("Dimension of charset %s is not one",
8354 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8355 if (CHARSET_ASCII_COMPATIBLE_P (charset))
8356 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8357
8358 charset_list = XCDR (charset_list);
8359 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8360 if (CHARSET_DIMENSION (charset) != 2)
8361 error ("Dimension of charset %s is not two",
8362 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8363
8364 category = coding_category_big5;
8365 Vbig5_coding_system = name;
8366 }
8367 else if (EQ (coding_type, Qraw_text))
8368 {
8369 category = coding_category_raw_text;
8370 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8371 }
8372 else if (EQ (coding_type, Qutf_8))
8373 {
8374 category = coding_category_utf_8;
8375 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8376 }
8377 else if (EQ (coding_type, Qundecided))
8378 category = coding_category_undecided;
8379 else
8380 error ("Invalid coding system type: %s",
8381 SDATA (SYMBOL_NAME (coding_type)));
8382
8383 CODING_ATTR_CATEGORY (attrs) = make_number (category);
8384 CODING_ATTR_PLIST (attrs)
8385 = Fcons (QCcategory, Fcons (AREF (Vcoding_category_table, category),
8386 CODING_ATTR_PLIST (attrs)));
8387
8388 eol_type = args[coding_arg_eol_type];
8389 if (! NILP (eol_type)
8390 && ! EQ (eol_type, Qunix)
8391 && ! EQ (eol_type, Qdos)
8392 && ! EQ (eol_type, Qmac))
8393 error ("Invalid eol-type");
8394
8395 aliases = Fcons (name, Qnil);
8396
8397 if (NILP (eol_type))
8398 {
8399 eol_type = make_subsidiaries (name);
8400 for (i = 0; i < 3; i++)
8401 {
8402 Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
8403
8404 this_name = AREF (eol_type, i);
8405 this_aliases = Fcons (this_name, Qnil);
8406 this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
8407 this_spec = Fmake_vector (make_number (3), attrs);
8408 ASET (this_spec, 1, this_aliases);
8409 ASET (this_spec, 2, this_eol_type);
8410 Fputhash (this_name, this_spec, Vcoding_system_hash_table);
8411 Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
8412 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
8413 Vcoding_system_alist);
8414 }
8415 }
8416
8417 spec_vec = Fmake_vector (make_number (3), attrs);
8418 ASET (spec_vec, 1, aliases);
8419 ASET (spec_vec, 2, eol_type);
8420
8421 Fputhash (name, spec_vec, Vcoding_system_hash_table);
8422 Vcoding_system_list = Fcons (name, Vcoding_system_list);
8423 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
8424 Vcoding_system_alist);
8425
8426 {
8427 int id = coding_categories[category].id;
8428
8429 if (id < 0 || EQ (name, CODING_ID_NAME (id)))
8430 setup_coding_system (name, &coding_categories[category]);
8431 }
8432
8433 return Qnil;
8434
8435 short_args:
8436 return Fsignal (Qwrong_number_of_arguments,
8437 Fcons (intern ("define-coding-system-internal"),
8438 make_number (nargs)));
8439 }
8440
8441
8442 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
8443 Sdefine_coding_system_alias, 2, 2, 0,
8444 doc: /* Define ALIAS as an alias for CODING-SYSTEM. */)
8445 (alias, coding_system)
8446 Lisp_Object alias, coding_system;
8447 {
8448 Lisp_Object spec, aliases, eol_type;
8449
8450 CHECK_SYMBOL (alias);
8451 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
8452 aliases = AREF (spec, 1);
8453 /* ALISES should be a list of length more than zero, and the first
8454 element is a base coding system. Append ALIAS at the tail of the
8455 list. */
8456 while (!NILP (XCDR (aliases)))
8457 aliases = XCDR (aliases);
8458 XSETCDR (aliases, Fcons (alias, Qnil));
8459
8460 eol_type = AREF (spec, 2);
8461 if (VECTORP (eol_type))
8462 {
8463 Lisp_Object subsidiaries;
8464 int i;
8465
8466 subsidiaries = make_subsidiaries (alias);
8467 for (i = 0; i < 3; i++)
8468 Fdefine_coding_system_alias (AREF (subsidiaries, i),
8469 AREF (eol_type, i));
8470 }
8471
8472 Fputhash (alias, spec, Vcoding_system_hash_table);
8473 Vcoding_system_list = Fcons (alias, Vcoding_system_list);
8474 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
8475 Vcoding_system_alist);
8476
8477 return Qnil;
8478 }
8479
8480 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
8481 1, 1, 0,
8482 doc: /* Return the base of CODING-SYSTEM.
8483 Any alias or subsidiary coding system is not a base coding system. */)
8484 (coding_system)
8485 Lisp_Object coding_system;
8486 {
8487 Lisp_Object spec, attrs;
8488
8489 if (NILP (coding_system))
8490 return (Qno_conversion);
8491 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
8492 attrs = AREF (spec, 0);
8493 return CODING_ATTR_BASE_NAME (attrs);
8494 }
8495
8496 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
8497 1, 1, 0,
8498 doc: "Return the property list of CODING-SYSTEM.")
8499 (coding_system)
8500 Lisp_Object coding_system;
8501 {
8502 Lisp_Object spec, attrs;
8503
8504 if (NILP (coding_system))
8505 coding_system = Qno_conversion;
8506 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
8507 attrs = AREF (spec, 0);
8508 return CODING_ATTR_PLIST (attrs);
8509 }
8510
8511
8512 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
8513 1, 1, 0,
8514 doc: /* Return the list of aliases of CODING-SYSTEM. */)
8515 (coding_system)
8516 Lisp_Object coding_system;
8517 {
8518 Lisp_Object spec;
8519
8520 if (NILP (coding_system))
8521 coding_system = Qno_conversion;
8522 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
8523 return AREF (spec, 1);
8524 }
8525
8526 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
8527 Scoding_system_eol_type, 1, 1, 0,
8528 doc: /* Return eol-type of CODING-SYSTEM.
8529 An eol-type is integer 0, 1, 2, or a vector of coding systems.
8530
8531 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
8532 and CR respectively.
8533
8534 A vector value indicates that a format of end-of-line should be
8535 detected automatically. Nth element of the vector is the subsidiary
8536 coding system whose eol-type is N. */)
8537 (coding_system)
8538 Lisp_Object coding_system;
8539 {
8540 Lisp_Object spec, eol_type;
8541 int n;
8542
8543 if (NILP (coding_system))
8544 coding_system = Qno_conversion;
8545 if (! CODING_SYSTEM_P (coding_system))
8546 return Qnil;
8547 spec = CODING_SYSTEM_SPEC (coding_system);
8548 eol_type = AREF (spec, 2);
8549 if (VECTORP (eol_type))
8550 return Fcopy_sequence (eol_type);
8551 n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
8552 return make_number (n);
8553 }
8554
8555 #endif /* emacs */
8556
8557 \f
8558 /*** 9. Post-amble ***/
8559
8560 void
8561 init_coding_once ()
8562 {
8563 int i;
8564
8565 for (i = 0; i < coding_category_max; i++)
8566 {
8567 coding_categories[i].id = -1;
8568 coding_priorities[i] = i;
8569 }
8570
8571 /* ISO2022 specific initialize routine. */
8572 for (i = 0; i < 0x20; i++)
8573 iso_code_class[i] = ISO_control_0;
8574 for (i = 0x21; i < 0x7F; i++)
8575 iso_code_class[i] = ISO_graphic_plane_0;
8576 for (i = 0x80; i < 0xA0; i++)
8577 iso_code_class[i] = ISO_control_1;
8578 for (i = 0xA1; i < 0xFF; i++)
8579 iso_code_class[i] = ISO_graphic_plane_1;
8580 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
8581 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
8582 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
8583 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
8584 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
8585 iso_code_class[ISO_CODE_ESC] = ISO_escape;
8586 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
8587 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
8588 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
8589
8590 for (i = 0; i < 256; i++)
8591 {
8592 emacs_mule_bytes[i] = 1;
8593 }
8594 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
8595 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
8596 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
8597 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
8598 }
8599
8600 #ifdef emacs
8601
8602 void
8603 syms_of_coding ()
8604 {
8605 staticpro (&Vcoding_system_hash_table);
8606 {
8607 Lisp_Object args[2];
8608 args[0] = QCtest;
8609 args[1] = Qeq;
8610 Vcoding_system_hash_table = Fmake_hash_table (2, args);
8611 }
8612
8613 staticpro (&Vsjis_coding_system);
8614 Vsjis_coding_system = Qnil;
8615
8616 staticpro (&Vbig5_coding_system);
8617 Vbig5_coding_system = Qnil;
8618
8619 staticpro (&Vcode_conversion_reused_workbuf);
8620 Vcode_conversion_reused_workbuf = Qnil;
8621
8622 staticpro (&Vcode_conversion_workbuf_name);
8623 Vcode_conversion_workbuf_name = build_string (" *code-conversion-work*");
8624
8625 reused_workbuf_in_use = 0;
8626
8627 DEFSYM (Qcharset, "charset");
8628 DEFSYM (Qtarget_idx, "target-idx");
8629 DEFSYM (Qcoding_system_history, "coding-system-history");
8630 Fset (Qcoding_system_history, Qnil);
8631
8632 /* Target FILENAME is the first argument. */
8633 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
8634 /* Target FILENAME is the third argument. */
8635 Fput (Qwrite_region, Qtarget_idx, make_number (2));
8636
8637 DEFSYM (Qcall_process, "call-process");
8638 /* Target PROGRAM is the first argument. */
8639 Fput (Qcall_process, Qtarget_idx, make_number (0));
8640
8641 DEFSYM (Qcall_process_region, "call-process-region");
8642 /* Target PROGRAM is the third argument. */
8643 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
8644
8645 DEFSYM (Qstart_process, "start-process");
8646 /* Target PROGRAM is the third argument. */
8647 Fput (Qstart_process, Qtarget_idx, make_number (2));
8648
8649 DEFSYM (Qopen_network_stream, "open-network-stream");
8650 /* Target SERVICE is the fourth argument. */
8651 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
8652
8653 DEFSYM (Qcoding_system, "coding-system");
8654 DEFSYM (Qcoding_aliases, "coding-aliases");
8655
8656 DEFSYM (Qeol_type, "eol-type");
8657 DEFSYM (Qunix, "unix");
8658 DEFSYM (Qdos, "dos");
8659
8660 DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
8661 DEFSYM (Qpost_read_conversion, "post-read-conversion");
8662 DEFSYM (Qpre_write_conversion, "pre-write-conversion");
8663 DEFSYM (Qdefault_char, "default-char");
8664 DEFSYM (Qundecided, "undecided");
8665 DEFSYM (Qno_conversion, "no-conversion");
8666 DEFSYM (Qraw_text, "raw-text");
8667
8668 DEFSYM (Qiso_2022, "iso-2022");
8669
8670 DEFSYM (Qutf_8, "utf-8");
8671 DEFSYM (Qutf_8_emacs, "utf-8-emacs");
8672
8673 DEFSYM (Qutf_16, "utf-16");
8674 DEFSYM (Qbig, "big");
8675 DEFSYM (Qlittle, "little");
8676
8677 DEFSYM (Qshift_jis, "shift-jis");
8678 DEFSYM (Qbig5, "big5");
8679
8680 DEFSYM (Qcoding_system_p, "coding-system-p");
8681
8682 DEFSYM (Qcoding_system_error, "coding-system-error");
8683 Fput (Qcoding_system_error, Qerror_conditions,
8684 Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
8685 Fput (Qcoding_system_error, Qerror_message,
8686 build_string ("Invalid coding system"));
8687
8688 /* Intern this now in case it isn't already done.
8689 Setting this variable twice is harmless.
8690 But don't staticpro it here--that is done in alloc.c. */
8691 Qchar_table_extra_slots = intern ("char-table-extra-slots");
8692
8693 DEFSYM (Qtranslation_table, "translation-table");
8694 Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (1));
8695 DEFSYM (Qtranslation_table_id, "translation-table-id");
8696 DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
8697 DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
8698
8699 DEFSYM (Qvalid_codes, "valid-codes");
8700
8701 DEFSYM (Qemacs_mule, "emacs-mule");
8702
8703 DEFSYM (QCcategory, ":category");
8704
8705 Vcoding_category_table
8706 = Fmake_vector (make_number (coding_category_max), Qnil);
8707 staticpro (&Vcoding_category_table);
8708 /* Followings are target of code detection. */
8709 ASET (Vcoding_category_table, coding_category_iso_7,
8710 intern ("coding-category-iso-7"));
8711 ASET (Vcoding_category_table, coding_category_iso_7_tight,
8712 intern ("coding-category-iso-7-tight"));
8713 ASET (Vcoding_category_table, coding_category_iso_8_1,
8714 intern ("coding-category-iso-8-1"));
8715 ASET (Vcoding_category_table, coding_category_iso_8_2,
8716 intern ("coding-category-iso-8-2"));
8717 ASET (Vcoding_category_table, coding_category_iso_7_else,
8718 intern ("coding-category-iso-7-else"));
8719 ASET (Vcoding_category_table, coding_category_iso_8_else,
8720 intern ("coding-category-iso-8-else"));
8721 ASET (Vcoding_category_table, coding_category_utf_8,
8722 intern ("coding-category-utf-8"));
8723 ASET (Vcoding_category_table, coding_category_utf_16_be,
8724 intern ("coding-category-utf-16-be"));
8725 ASET (Vcoding_category_table, coding_category_utf_16_auto,
8726 intern ("coding-category-utf-16-auto"));
8727 ASET (Vcoding_category_table, coding_category_utf_16_le,
8728 intern ("coding-category-utf-16-le"));
8729 ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
8730 intern ("coding-category-utf-16-be-nosig"));
8731 ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
8732 intern ("coding-category-utf-16-le-nosig"));
8733 ASET (Vcoding_category_table, coding_category_charset,
8734 intern ("coding-category-charset"));
8735 ASET (Vcoding_category_table, coding_category_sjis,
8736 intern ("coding-category-sjis"));
8737 ASET (Vcoding_category_table, coding_category_big5,
8738 intern ("coding-category-big5"));
8739 ASET (Vcoding_category_table, coding_category_ccl,
8740 intern ("coding-category-ccl"));
8741 ASET (Vcoding_category_table, coding_category_emacs_mule,
8742 intern ("coding-category-emacs-mule"));
8743 /* Followings are NOT target of code detection. */
8744 ASET (Vcoding_category_table, coding_category_raw_text,
8745 intern ("coding-category-raw-text"));
8746 ASET (Vcoding_category_table, coding_category_undecided,
8747 intern ("coding-category-undecided"));
8748
8749 defsubr (&Scoding_system_p);
8750 defsubr (&Sread_coding_system);
8751 defsubr (&Sread_non_nil_coding_system);
8752 defsubr (&Scheck_coding_system);
8753 defsubr (&Sdetect_coding_region);
8754 defsubr (&Sdetect_coding_string);
8755 defsubr (&Sfind_coding_systems_region_internal);
8756 defsubr (&Sunencodable_char_position);
8757 defsubr (&Scheck_coding_systems_region);
8758 defsubr (&Sdecode_coding_region);
8759 defsubr (&Sencode_coding_region);
8760 defsubr (&Sdecode_coding_string);
8761 defsubr (&Sencode_coding_string);
8762 defsubr (&Sdecode_sjis_char);
8763 defsubr (&Sencode_sjis_char);
8764 defsubr (&Sdecode_big5_char);
8765 defsubr (&Sencode_big5_char);
8766 defsubr (&Sset_terminal_coding_system_internal);
8767 defsubr (&Sset_safe_terminal_coding_system_internal);
8768 defsubr (&Sterminal_coding_system);
8769 defsubr (&Sset_keyboard_coding_system_internal);
8770 defsubr (&Skeyboard_coding_system);
8771 defsubr (&Sfind_operation_coding_system);
8772 defsubr (&Sset_coding_system_priority);
8773 defsubr (&Sdefine_coding_system_internal);
8774 defsubr (&Sdefine_coding_system_alias);
8775 defsubr (&Scoding_system_base);
8776 defsubr (&Scoding_system_plist);
8777 defsubr (&Scoding_system_aliases);
8778 defsubr (&Scoding_system_eol_type);
8779 defsubr (&Scoding_system_priority_list);
8780
8781 DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
8782 doc: /* List of coding systems.
8783
8784 Do not alter the value of this variable manually. This variable should be
8785 updated by the functions `define-coding-system' and
8786 `define-coding-system-alias'. */);
8787 Vcoding_system_list = Qnil;
8788
8789 DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
8790 doc: /* Alist of coding system names.
8791 Each element is one element list of coding system name.
8792 This variable is given to `completing-read' as TABLE argument.
8793
8794 Do not alter the value of this variable manually. This variable should be
8795 updated by the functions `make-coding-system' and
8796 `define-coding-system-alias'. */);
8797 Vcoding_system_alist = Qnil;
8798
8799 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
8800 doc: /* List of coding-categories (symbols) ordered by priority.
8801
8802 On detecting a coding system, Emacs tries code detection algorithms
8803 associated with each coding-category one by one in this order. When
8804 one algorithm agrees with a byte sequence of source text, the coding
8805 system bound to the corresponding coding-category is selected. */);
8806 {
8807 int i;
8808
8809 Vcoding_category_list = Qnil;
8810 for (i = coding_category_max - 1; i >= 0; i--)
8811 Vcoding_category_list
8812 = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
8813 Vcoding_category_list);
8814 }
8815
8816 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
8817 doc: /* Specify the coding system for read operations.
8818 It is useful to bind this variable with `let', but do not set it globally.
8819 If the value is a coding system, it is used for decoding on read operation.
8820 If not, an appropriate element is used from one of the coding system alists:
8821 There are three such tables, `file-coding-system-alist',
8822 `process-coding-system-alist', and `network-coding-system-alist'. */);
8823 Vcoding_system_for_read = Qnil;
8824
8825 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
8826 doc: /* Specify the coding system for write operations.
8827 Programs bind this variable with `let', but you should not set it globally.
8828 If the value is a coding system, it is used for encoding of output,
8829 when writing it to a file and when sending it to a file or subprocess.
8830
8831 If this does not specify a coding system, an appropriate element
8832 is used from one of the coding system alists:
8833 There are three such tables, `file-coding-system-alist',
8834 `process-coding-system-alist', and `network-coding-system-alist'.
8835 For output to files, if the above procedure does not specify a coding system,
8836 the value of `buffer-file-coding-system' is used. */);
8837 Vcoding_system_for_write = Qnil;
8838
8839 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
8840 doc: /*
8841 Coding system used in the latest file or process I/O. */);
8842 Vlast_coding_system_used = Qnil;
8843
8844 DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
8845 doc: /*
8846 *Non-nil means always inhibit code conversion of end-of-line format.
8847 See info node `Coding Systems' and info node `Text and Binary' concerning
8848 such conversion. */);
8849 inhibit_eol_conversion = 0;
8850
8851 DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
8852 doc: /*
8853 Non-nil means process buffer inherits coding system of process output.
8854 Bind it to t if the process output is to be treated as if it were a file
8855 read from some filesystem. */);
8856 inherit_process_coding_system = 0;
8857
8858 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
8859 doc: /*
8860 Alist to decide a coding system to use for a file I/O operation.
8861 The format is ((PATTERN . VAL) ...),
8862 where PATTERN is a regular expression matching a file name,
8863 VAL is a coding system, a cons of coding systems, or a function symbol.
8864 If VAL is a coding system, it is used for both decoding and encoding
8865 the file contents.
8866 If VAL is a cons of coding systems, the car part is used for decoding,
8867 and the cdr part is used for encoding.
8868 If VAL is a function symbol, the function must return a coding system
8869 or a cons of coding systems which are used as above. The function gets
8870 the arguments with which `find-operation-coding-systems' was called.
8871
8872 See also the function `find-operation-coding-system'
8873 and the variable `auto-coding-alist'. */);
8874 Vfile_coding_system_alist = Qnil;
8875
8876 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
8877 doc: /*
8878 Alist to decide a coding system to use for a process I/O operation.
8879 The format is ((PATTERN . VAL) ...),
8880 where PATTERN is a regular expression matching a program name,
8881 VAL is a coding system, a cons of coding systems, or a function symbol.
8882 If VAL is a coding system, it is used for both decoding what received
8883 from the program and encoding what sent to the program.
8884 If VAL is a cons of coding systems, the car part is used for decoding,
8885 and the cdr part is used for encoding.
8886 If VAL is a function symbol, the function must return a coding system
8887 or a cons of coding systems which are used as above.
8888
8889 See also the function `find-operation-coding-system'. */);
8890 Vprocess_coding_system_alist = Qnil;
8891
8892 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
8893 doc: /*
8894 Alist to decide a coding system to use for a network I/O operation.
8895 The format is ((PATTERN . VAL) ...),
8896 where PATTERN is a regular expression matching a network service name
8897 or is a port number to connect to,
8898 VAL is a coding system, a cons of coding systems, or a function symbol.
8899 If VAL is a coding system, it is used for both decoding what received
8900 from the network stream and encoding what sent to the network stream.
8901 If VAL is a cons of coding systems, the car part is used for decoding,
8902 and the cdr part is used for encoding.
8903 If VAL is a function symbol, the function must return a coding system
8904 or a cons of coding systems which are used as above.
8905
8906 See also the function `find-operation-coding-system'. */);
8907 Vnetwork_coding_system_alist = Qnil;
8908
8909 DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
8910 doc: /* Coding system to use with system messages.
8911 Also used for decoding keyboard input on X Window system. */);
8912 Vlocale_coding_system = Qnil;
8913
8914 /* The eol mnemonics are reset in startup.el system-dependently. */
8915 DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
8916 doc: /*
8917 *String displayed in mode line for UNIX-like (LF) end-of-line format. */);
8918 eol_mnemonic_unix = build_string (":");
8919
8920 DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
8921 doc: /*
8922 *String displayed in mode line for DOS-like (CRLF) end-of-line format. */);
8923 eol_mnemonic_dos = build_string ("\\");
8924
8925 DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
8926 doc: /*
8927 *String displayed in mode line for MAC-like (CR) end-of-line format. */);
8928 eol_mnemonic_mac = build_string ("/");
8929
8930 DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
8931 doc: /*
8932 *String displayed in mode line when end-of-line format is not yet determined. */);
8933 eol_mnemonic_undecided = build_string (":");
8934
8935 DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
8936 doc: /*
8937 *Non-nil enables character translation while encoding and decoding. */);
8938 Venable_character_translation = Qt;
8939
8940 DEFVAR_LISP ("standard-translation-table-for-decode",
8941 &Vstandard_translation_table_for_decode,
8942 doc: /* Table for translating characters while decoding. */);
8943 Vstandard_translation_table_for_decode = Qnil;
8944
8945 DEFVAR_LISP ("standard-translation-table-for-encode",
8946 &Vstandard_translation_table_for_encode,
8947 doc: /* Table for translating characters while encoding. */);
8948 Vstandard_translation_table_for_encode = Qnil;
8949
8950 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_table,
8951 doc: /* Alist of charsets vs revision numbers.
8952 While encoding, if a charset (car part of an element) is found,
8953 designate it with the escape sequence identifying revision (cdr part
8954 of the element). */);
8955 Vcharset_revision_table = Qnil;
8956
8957 DEFVAR_LISP ("default-process-coding-system",
8958 &Vdefault_process_coding_system,
8959 doc: /* Cons of coding systems used for process I/O by default.
8960 The car part is used for decoding a process output,
8961 the cdr part is used for encoding a text to be sent to a process. */);
8962 Vdefault_process_coding_system = Qnil;
8963
8964 DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
8965 doc: /*
8966 Table of extra Latin codes in the range 128..159 (inclusive).
8967 This is a vector of length 256.
8968 If Nth element is non-nil, the existence of code N in a file
8969 \(or output of subprocess) doesn't prevent it to be detected as
8970 a coding system of ISO 2022 variant which has a flag
8971 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
8972 or reading output of a subprocess.
8973 Only 128th through 159th elements has a meaning. */);
8974 Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
8975
8976 DEFVAR_LISP ("select-safe-coding-system-function",
8977 &Vselect_safe_coding_system_function,
8978 doc: /*
8979 Function to call to select safe coding system for encoding a text.
8980
8981 If set, this function is called to force a user to select a proper
8982 coding system which can encode the text in the case that a default
8983 coding system used in each operation can't encode the text.
8984
8985 The default value is `select-safe-coding-system' (which see). */);
8986 Vselect_safe_coding_system_function = Qnil;
8987
8988 DEFVAR_BOOL ("coding-system-require-warning",
8989 &coding_system_require_warning,
8990 doc: /* Internal use only.
8991 If non-nil, on writing a file, `select-safe-coding-system-function' is
8992 called even if `coding-system-for-write' is non-nil. The command
8993 `universal-coding-system-argument' binds this variable to t temporarily. */);
8994 coding_system_require_warning = 0;
8995
8996
8997 DEFVAR_BOOL ("inhibit-iso-escape-detection",
8998 &inhibit_iso_escape_detection,
8999 doc: /*
9000 If non-nil, Emacs ignores ISO2022's escape sequence on code detection.
9001
9002 By default, on reading a file, Emacs tries to detect how the text is
9003 encoded. This code detection is sensitive to escape sequences. If
9004 the sequence is valid as ISO2022, the code is determined as one of
9005 the ISO2022 encodings, and the file is decoded by the corresponding
9006 coding system (e.g. `iso-2022-7bit').
9007
9008 However, there may be a case that you want to read escape sequences in
9009 a file as is. In such a case, you can set this variable to non-nil.
9010 Then, as the code detection ignores any escape sequences, no file is
9011 detected as encoded in some ISO2022 encoding. The result is that all
9012 escape sequences become visible in a buffer.
9013
9014 The default value is nil, and it is strongly recommended not to change
9015 it. That is because many Emacs Lisp source files that contain
9016 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
9017 in Emacs's distribution, and they won't be decoded correctly on
9018 reading if you suppress escape sequence detection.
9019
9020 The other way to read escape sequences in a file without decoding is
9021 to explicitly specify some coding system that doesn't use ISO2022's
9022 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument]. */);
9023 inhibit_iso_escape_detection = 0;
9024
9025 DEFVAR_LISP ("translation-table-for-input", &Vtranslation_table_for_input,
9026 doc: /* Char table for translating self-inserting characters.
9027 This is applied to the result of input methods, not their input. See also
9028 `keyboard-translate-table'. */);
9029 Vtranslation_table_for_input = Qnil;
9030
9031 {
9032 Lisp_Object args[coding_arg_max];
9033 Lisp_Object plist[16];
9034 int i;
9035
9036 for (i = 0; i < coding_arg_max; i++)
9037 args[i] = Qnil;
9038
9039 plist[0] = intern (":name");
9040 plist[1] = args[coding_arg_name] = Qno_conversion;
9041 plist[2] = intern (":mnemonic");
9042 plist[3] = args[coding_arg_mnemonic] = make_number ('=');
9043 plist[4] = intern (":coding-type");
9044 plist[5] = args[coding_arg_coding_type] = Qraw_text;
9045 plist[6] = intern (":ascii-compatible-p");
9046 plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
9047 plist[8] = intern (":default-char");
9048 plist[9] = args[coding_arg_default_char] = make_number (0);
9049 plist[10] = intern (":for-unibyte");
9050 plist[11] = args[coding_arg_for_unibyte] = Qt;
9051 plist[12] = intern (":docstring");
9052 plist[13] = build_string ("Do no conversion.\n\
9053 \n\
9054 When you visit a file with this coding, the file is read into a\n\
9055 unibyte buffer as is, thus each byte of a file is treated as a\n\
9056 character.");
9057 plist[14] = intern (":eol-type");
9058 plist[15] = args[coding_arg_eol_type] = Qunix;
9059 args[coding_arg_plist] = Flist (16, plist);
9060 Fdefine_coding_system_internal (coding_arg_max, args);
9061 }
9062
9063 setup_coding_system (Qno_conversion, &keyboard_coding);
9064 setup_coding_system (Qno_conversion, &terminal_coding);
9065 setup_coding_system (Qno_conversion, &safe_terminal_coding);
9066
9067 {
9068 int i;
9069
9070 for (i = 0; i < coding_category_max; i++)
9071 Fset (AREF (Vcoding_category_table, i), Qno_conversion);
9072 }
9073 }
9074
9075 char *
9076 emacs_strerror (error_number)
9077 int error_number;
9078 {
9079 char *str;
9080
9081 synchronize_system_messages_locale ();
9082 str = strerror (error_number);
9083
9084 if (! NILP (Vlocale_coding_system))
9085 {
9086 Lisp_Object dec = code_convert_string_norecord (build_string (str),
9087 Vlocale_coding_system,
9088 0);
9089 str = (char *) SDATA (dec);
9090 }
9091
9092 return str;
9093 }
9094
9095 #endif /* emacs */