(Fdefine_coding_system_internal): Fix previous change.
[bpt/emacs.git] / src / coding.c
1 /* Coding system handler (conversion, detection, etc).
2 Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
3 Licensed to the Free Software Foundation.
4 Copyright (C) 2001, 2002 Free Software Foundation, Inc.
5 Copyright (C) 2003
6 National Institute of Advanced Industrial Science and Technology (AIST)
7 Registration Number H13PRO009
8
9 This file is part of GNU Emacs.
10
11 GNU Emacs is free software; you can redistribute it and/or modify
12 it under the terms of the GNU General Public License as published by
13 the Free Software Foundation; either version 2, or (at your option)
14 any later version.
15
16 GNU Emacs is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 GNU General Public License for more details.
20
21 You should have received a copy of the GNU General Public License
22 along with GNU Emacs; see the file COPYING. If not, write to
23 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24 Boston, MA 02111-1307, USA. */
25
26 /*** TABLE OF CONTENTS ***
27
28 0. General comments
29 1. Preamble
30 2. Emacs' internal format (emacs-utf-8) handlers
31 3. UTF-8 handlers
32 4. UTF-16 handlers
33 5. Charset-base coding systems handlers
34 6. emacs-mule (old Emacs' internal format) handlers
35 7. ISO2022 handlers
36 8. Shift-JIS and BIG5 handlers
37 9. CCL handlers
38 10. C library functions
39 11. Emacs Lisp library functions
40 12. Postamble
41
42 */
43
44 /*** 0. General comments ***
45
46
47 CODING SYSTEM
48
49 A coding system is an object for an encoding mechanism that contains
50 information about how to convert byte sequences to character
51 sequences and vice versa. When we say "decode", it means converting
52 a byte sequence of a specific coding system into a character
53 sequence that is represented by Emacs' internal coding system
54 `emacs-utf-8', and when we say "encode", it means converting a
55 character sequence of emacs-utf-8 to a byte sequence of a specific
56 coding system.
57
58 In Emacs Lisp, a coding system is represented by a Lisp symbol. In
59 C level, a coding system is represented by a vector of attributes
60 stored in the hash table Vcharset_hash_table. The conversion from
61 coding system symbol to attributes vector is done by looking up
62 Vcharset_hash_table by the symbol.
63
64 Coding systems are classified into the following types depending on
65 the encoding mechanism. Here's a brief description of the types.
66
67 o UTF-8
68
69 o UTF-16
70
71 o Charset-base coding system
72
73 A coding system defined by one or more (coded) character sets.
74 Decoding and encoding are done by a code converter defined for each
75 character set.
76
77 o Old Emacs internal format (emacs-mule)
78
79 The coding system adopted by old versions of Emacs (20 and 21).
80
81 o ISO2022-base coding system
82
83 The most famous coding system for multiple character sets. X's
84 Compound Text, various EUCs (Extended Unix Code), and coding systems
85 used in the Internet communication such as ISO-2022-JP are all
86 variants of ISO2022.
87
88 o SJIS (or Shift-JIS or MS-Kanji-Code)
89
90 A coding system to encode character sets: ASCII, JISX0201, and
91 JISX0208. Widely used for PC's in Japan. Details are described in
92 section 8.
93
94 o BIG5
95
96 A coding system to encode character sets: ASCII and Big5. Widely
97 used for Chinese (mainly in Taiwan and Hong Kong). Details are
98 described in section 8. In this file, when we write "big5" (all
99 lowercase), we mean the coding system, and when we write "Big5"
100 (capitalized), we mean the character set.
101
102 o CCL
103
104 If a user wants to decode/encode text encoded in a coding system
105 not listed above, he can supply a decoder and an encoder for it in
106 CCL (Code Conversion Language) programs. Emacs executes the CCL
107 program while decoding/encoding.
108
109 o Raw-text
110
111 A coding system for text containing raw eight-bit data. Emacs
112 treats each byte of source text as a character (except for
113 end-of-line conversion).
114
115 o No-conversion
116
117 Like raw text, but don't do end-of-line conversion.
118
119
120 END-OF-LINE FORMAT
121
122 How text end-of-line is encoded depends on operating system. For
123 instance, Unix's format is just one byte of LF (line-feed) code,
124 whereas DOS's format is two-byte sequence of `carriage-return' and
125 `line-feed' codes. MacOS's format is usually one byte of
126 `carriage-return'.
127
128 Since text character encoding and end-of-line encoding are
129 independent, any coding system described above can take any format
130 of end-of-line (except for no-conversion).
131
132 STRUCT CODING_SYSTEM
133
134 Before using a coding system for code conversion (i.e. decoding and
135 encoding), we setup a structure of type `struct coding_system'.
136 This structure keeps various information about a specific code
137 conversion (e.g. the location of source and destination data).
138
139 */
140
141 /* COMMON MACROS */
142
143
144 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
145
146 These functions check if a byte sequence specified as a source in
147 CODING conforms to the format of XXX, and update the members of
148 DETECT_INFO.
149
150 Return 1 if the byte sequence conforms to XXX, otherwise return 0.
151
152 Below is the template of these functions. */
153
154 #if 0
155 static int
156 detect_coding_XXX (coding, detect_info)
157 struct coding_system *coding;
158 struct coding_detection_info *detect_info;
159 {
160 unsigned char *src = coding->source;
161 unsigned char *src_end = coding->source + coding->src_bytes;
162 int multibytep = coding->src_multibyte;
163 int consumed_chars = 0;
164 int found = 0;
165 ...;
166
167 while (1)
168 {
169 /* Get one byte from the source. If the souce is exausted, jump
170 to no_more_source:. */
171 ONE_MORE_BYTE (c);
172
173 if (! __C_conforms_to_XXX___ (c))
174 break;
175 if (! __C_strongly_suggests_XXX__ (c))
176 found = CATEGORY_MASK_XXX;
177 }
178 /* The byte sequence is invalid for XXX. */
179 detect_info->rejected |= CATEGORY_MASK_XXX;
180 return 0;
181
182 no_more_source:
183 /* The source exausted successfully. */
184 detect_info->found |= found;
185 return 1;
186 }
187 #endif
188
189 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
190
191 These functions decode a byte sequence specified as a source by
192 CODING. The resulting multibyte text goes to a place pointed to by
193 CODING->charbuf, the length of which should not exceed
194 CODING->charbuf_size;
195
196 These functions set the information of original and decoded texts in
197 CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
198 They also set CODING->result to one of CODING_RESULT_XXX indicating
199 how the decoding is finished.
200
201 Below is the template of these functions. */
202
203 #if 0
204 static void
205 decode_coding_XXXX (coding)
206 struct coding_system *coding;
207 {
208 unsigned char *src = coding->source + coding->consumed;
209 unsigned char *src_end = coding->source + coding->src_bytes;
210 /* SRC_BASE remembers the start position in source in each loop.
211 The loop will be exited when there's not enough source code, or
212 when there's no room in CHARBUF for a decoded character. */
213 unsigned char *src_base;
214 /* A buffer to produce decoded characters. */
215 int *charbuf = coding->charbuf;
216 int *charbuf_end = charbuf + coding->charbuf_size;
217 int multibytep = coding->src_multibyte;
218
219 while (1)
220 {
221 src_base = src;
222 if (charbuf < charbuf_end)
223 /* No more room to produce a decoded character. */
224 break;
225 ONE_MORE_BYTE (c);
226 /* Decode it. */
227 }
228
229 no_more_source:
230 if (src_base < src_end
231 && coding->mode & CODING_MODE_LAST_BLOCK)
232 /* If the source ends by partial bytes to construct a character,
233 treat them as eight-bit raw data. */
234 while (src_base < src_end && charbuf < charbuf_end)
235 *charbuf++ = *src_base++;
236 /* Remember how many bytes and characters we consumed. If the
237 source is multibyte, the bytes and chars are not identical. */
238 coding->consumed = coding->consumed_char = src_base - coding->source;
239 /* Remember how many characters we produced. */
240 coding->charbuf_used = charbuf - coding->charbuf;
241 }
242 #endif
243
244 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
245
246 These functions encode SRC_BYTES length text at SOURCE of Emacs'
247 internal multibyte format by CODING. The resulting byte sequence
248 goes to a place pointed to by DESTINATION, the length of which
249 should not exceed DST_BYTES.
250
251 These functions set the information of original and encoded texts in
252 the members produced, produced_char, consumed, and consumed_char of
253 the structure *CODING. They also set the member result to one of
254 CODING_RESULT_XXX indicating how the encoding finished.
255
256 DST_BYTES zero means that source area and destination area are
257 overlapped, which means that we can produce a encoded text until it
258 reaches at the head of not-yet-encoded source text.
259
260 Below is a template of these functions. */
261 #if 0
262 static void
263 encode_coding_XXX (coding)
264 struct coding_system *coding;
265 {
266 int multibytep = coding->dst_multibyte;
267 int *charbuf = coding->charbuf;
268 int *charbuf_end = charbuf->charbuf + coding->charbuf_used;
269 unsigned char *dst = coding->destination + coding->produced;
270 unsigned char *dst_end = coding->destination + coding->dst_bytes;
271 unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
272 int produced_chars = 0;
273
274 for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
275 {
276 int c = *charbuf;
277 /* Encode C into DST, and increment DST. */
278 }
279 label_no_more_destination:
280 /* How many chars and bytes we produced. */
281 coding->produced_char += produced_chars;
282 coding->produced = dst - coding->destination;
283 }
284 #endif
285
286 \f
287 /*** 1. Preamble ***/
288
289 #include <config.h>
290 #include <stdio.h>
291
292 #include "lisp.h"
293 #include "buffer.h"
294 #include "character.h"
295 #include "charset.h"
296 #include "ccl.h"
297 #include "composite.h"
298 #include "coding.h"
299 #include "window.h"
300
301 Lisp_Object Vcoding_system_hash_table;
302
303 Lisp_Object Qcoding_system, Qcoding_aliases, Qeol_type;
304 Lisp_Object Qunix, Qdos;
305 extern Lisp_Object Qmac; /* frame.c */
306 Lisp_Object Qbuffer_file_coding_system;
307 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
308 Lisp_Object Qdefault_char;
309 Lisp_Object Qno_conversion, Qundecided;
310 Lisp_Object Qcharset, Qiso_2022, Qutf_8, Qutf_16, Qshift_jis, Qbig5;
311 Lisp_Object Qbig, Qlittle;
312 Lisp_Object Qcoding_system_history;
313 Lisp_Object Qvalid_codes;
314 Lisp_Object QCcategory;
315
316 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
317 Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
318 Lisp_Object Qstart_process, Qopen_network_stream;
319 Lisp_Object Qtarget_idx;
320
321 Lisp_Object Qinsufficient_source, Qinconsistent_eol, Qinvalid_source;
322 Lisp_Object Qinterrupted, Qinsufficient_memory;
323
324 int coding_system_require_warning;
325
326 Lisp_Object Vselect_safe_coding_system_function;
327
328 /* Mnemonic string for each format of end-of-line. */
329 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
330 /* Mnemonic string to indicate format of end-of-line is not yet
331 decided. */
332 Lisp_Object eol_mnemonic_undecided;
333
334 #ifdef emacs
335
336 Lisp_Object Vcoding_system_list, Vcoding_system_alist;
337
338 Lisp_Object Qcoding_system_p, Qcoding_system_error;
339
340 /* Coding system emacs-mule and raw-text are for converting only
341 end-of-line format. */
342 Lisp_Object Qemacs_mule, Qraw_text;
343 Lisp_Object Qutf_8_emacs;
344
345 /* Coding-systems are handed between Emacs Lisp programs and C internal
346 routines by the following three variables. */
347 /* Coding-system for reading files and receiving data from process. */
348 Lisp_Object Vcoding_system_for_read;
349 /* Coding-system for writing files and sending data to process. */
350 Lisp_Object Vcoding_system_for_write;
351 /* Coding-system actually used in the latest I/O. */
352 Lisp_Object Vlast_coding_system_used;
353 /* Set to non-nil when an error is detected while code conversion. */
354 Lisp_Object Vlast_code_conversion_error;
355 /* A vector of length 256 which contains information about special
356 Latin codes (especially for dealing with Microsoft codes). */
357 Lisp_Object Vlatin_extra_code_table;
358
359 /* Flag to inhibit code conversion of end-of-line format. */
360 int inhibit_eol_conversion;
361
362 /* Flag to inhibit ISO2022 escape sequence detection. */
363 int inhibit_iso_escape_detection;
364
365 /* Flag to make buffer-file-coding-system inherit from process-coding. */
366 int inherit_process_coding_system;
367
368 /* Coding system to be used to encode text for terminal display. */
369 struct coding_system terminal_coding;
370
371 /* Coding system to be used to encode text for terminal display when
372 terminal coding system is nil. */
373 struct coding_system safe_terminal_coding;
374
375 /* Coding system of what is sent from terminal keyboard. */
376 struct coding_system keyboard_coding;
377
378 Lisp_Object Vfile_coding_system_alist;
379 Lisp_Object Vprocess_coding_system_alist;
380 Lisp_Object Vnetwork_coding_system_alist;
381
382 Lisp_Object Vlocale_coding_system;
383
384 #endif /* emacs */
385
386 /* Flag to tell if we look up translation table on character code
387 conversion. */
388 Lisp_Object Venable_character_translation;
389 /* Standard translation table to look up on decoding (reading). */
390 Lisp_Object Vstandard_translation_table_for_decode;
391 /* Standard translation table to look up on encoding (writing). */
392 Lisp_Object Vstandard_translation_table_for_encode;
393
394 Lisp_Object Qtranslation_table;
395 Lisp_Object Qtranslation_table_id;
396 Lisp_Object Qtranslation_table_for_decode;
397 Lisp_Object Qtranslation_table_for_encode;
398
399 /* Alist of charsets vs revision number. */
400 static Lisp_Object Vcharset_revision_table;
401
402 /* Default coding systems used for process I/O. */
403 Lisp_Object Vdefault_process_coding_system;
404
405 /* Char table for translating Quail and self-inserting input. */
406 Lisp_Object Vtranslation_table_for_input;
407
408 /* Two special coding systems. */
409 Lisp_Object Vsjis_coding_system;
410 Lisp_Object Vbig5_coding_system;
411
412 static void record_conversion_result (struct coding_system *coding,
413 enum coding_result_code result);
414 static int detect_coding_utf_8 P_ ((struct coding_system *,
415 struct coding_detection_info *info));
416 static void decode_coding_utf_8 P_ ((struct coding_system *));
417 static int encode_coding_utf_8 P_ ((struct coding_system *));
418
419 static int detect_coding_utf_16 P_ ((struct coding_system *,
420 struct coding_detection_info *info));
421 static void decode_coding_utf_16 P_ ((struct coding_system *));
422 static int encode_coding_utf_16 P_ ((struct coding_system *));
423
424 static int detect_coding_iso_2022 P_ ((struct coding_system *,
425 struct coding_detection_info *info));
426 static void decode_coding_iso_2022 P_ ((struct coding_system *));
427 static int encode_coding_iso_2022 P_ ((struct coding_system *));
428
429 static int detect_coding_emacs_mule P_ ((struct coding_system *,
430 struct coding_detection_info *info));
431 static void decode_coding_emacs_mule P_ ((struct coding_system *));
432 static int encode_coding_emacs_mule P_ ((struct coding_system *));
433
434 static int detect_coding_sjis P_ ((struct coding_system *,
435 struct coding_detection_info *info));
436 static void decode_coding_sjis P_ ((struct coding_system *));
437 static int encode_coding_sjis P_ ((struct coding_system *));
438
439 static int detect_coding_big5 P_ ((struct coding_system *,
440 struct coding_detection_info *info));
441 static void decode_coding_big5 P_ ((struct coding_system *));
442 static int encode_coding_big5 P_ ((struct coding_system *));
443
444 static int detect_coding_ccl P_ ((struct coding_system *,
445 struct coding_detection_info *info));
446 static void decode_coding_ccl P_ ((struct coding_system *));
447 static int encode_coding_ccl P_ ((struct coding_system *));
448
449 static void decode_coding_raw_text P_ ((struct coding_system *));
450 static int encode_coding_raw_text P_ ((struct coding_system *));
451
452
453 /* ISO2022 section */
454
455 #define CODING_ISO_INITIAL(coding, reg) \
456 (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id), \
457 coding_attr_iso_initial), \
458 reg)))
459
460
461 #define CODING_ISO_REQUEST(coding, charset_id) \
462 ((charset_id <= (coding)->max_charset_id \
463 ? (coding)->safe_charsets[charset_id] \
464 : -1))
465
466
467 #define CODING_ISO_FLAGS(coding) \
468 ((coding)->spec.iso_2022.flags)
469 #define CODING_ISO_DESIGNATION(coding, reg) \
470 ((coding)->spec.iso_2022.current_designation[reg])
471 #define CODING_ISO_INVOCATION(coding, plane) \
472 ((coding)->spec.iso_2022.current_invocation[plane])
473 #define CODING_ISO_SINGLE_SHIFTING(coding) \
474 ((coding)->spec.iso_2022.single_shifting)
475 #define CODING_ISO_BOL(coding) \
476 ((coding)->spec.iso_2022.bol)
477 #define CODING_ISO_INVOKED_CHARSET(coding, plane) \
478 CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
479
480 /* Control characters of ISO2022. */
481 /* code */ /* function */
482 #define ISO_CODE_LF 0x0A /* line-feed */
483 #define ISO_CODE_CR 0x0D /* carriage-return */
484 #define ISO_CODE_SO 0x0E /* shift-out */
485 #define ISO_CODE_SI 0x0F /* shift-in */
486 #define ISO_CODE_SS2_7 0x19 /* single-shift-2 for 7-bit code */
487 #define ISO_CODE_ESC 0x1B /* escape */
488 #define ISO_CODE_SS2 0x8E /* single-shift-2 */
489 #define ISO_CODE_SS3 0x8F /* single-shift-3 */
490 #define ISO_CODE_CSI 0x9B /* control-sequence-introducer */
491
492 /* All code (1-byte) of ISO2022 is classified into one of the
493 followings. */
494 enum iso_code_class_type
495 {
496 ISO_control_0, /* Control codes in the range
497 0x00..0x1F and 0x7F, except for the
498 following 5 codes. */
499 ISO_shift_out, /* ISO_CODE_SO (0x0E) */
500 ISO_shift_in, /* ISO_CODE_SI (0x0F) */
501 ISO_single_shift_2_7, /* ISO_CODE_SS2_7 (0x19) */
502 ISO_escape, /* ISO_CODE_SO (0x1B) */
503 ISO_control_1, /* Control codes in the range
504 0x80..0x9F, except for the
505 following 3 codes. */
506 ISO_single_shift_2, /* ISO_CODE_SS2 (0x8E) */
507 ISO_single_shift_3, /* ISO_CODE_SS3 (0x8F) */
508 ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
509 ISO_0x20_or_0x7F, /* Codes of the values 0x20 or 0x7F. */
510 ISO_graphic_plane_0, /* Graphic codes in the range 0x21..0x7E. */
511 ISO_0xA0_or_0xFF, /* Codes of the values 0xA0 or 0xFF. */
512 ISO_graphic_plane_1 /* Graphic codes in the range 0xA1..0xFE. */
513 };
514
515 /** The macros CODING_ISO_FLAG_XXX defines a flag bit of the
516 `iso-flags' attribute of an iso2022 coding system. */
517
518 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
519 instead of the correct short-form sequence (e.g. ESC $ A). */
520 #define CODING_ISO_FLAG_LONG_FORM 0x0001
521
522 /* If set, reset graphic planes and registers at end-of-line to the
523 initial state. */
524 #define CODING_ISO_FLAG_RESET_AT_EOL 0x0002
525
526 /* If set, reset graphic planes and registers before any control
527 characters to the initial state. */
528 #define CODING_ISO_FLAG_RESET_AT_CNTL 0x0004
529
530 /* If set, encode by 7-bit environment. */
531 #define CODING_ISO_FLAG_SEVEN_BITS 0x0008
532
533 /* If set, use locking-shift function. */
534 #define CODING_ISO_FLAG_LOCKING_SHIFT 0x0010
535
536 /* If set, use single-shift function. Overwrite
537 CODING_ISO_FLAG_LOCKING_SHIFT. */
538 #define CODING_ISO_FLAG_SINGLE_SHIFT 0x0020
539
540 /* If set, use designation escape sequence. */
541 #define CODING_ISO_FLAG_DESIGNATION 0x0040
542
543 /* If set, produce revision number sequence. */
544 #define CODING_ISO_FLAG_REVISION 0x0080
545
546 /* If set, produce ISO6429's direction specifying sequence. */
547 #define CODING_ISO_FLAG_DIRECTION 0x0100
548
549 /* If set, assume designation states are reset at beginning of line on
550 output. */
551 #define CODING_ISO_FLAG_INIT_AT_BOL 0x0200
552
553 /* If set, designation sequence should be placed at beginning of line
554 on output. */
555 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
556
557 /* If set, do not encode unsafe charactes on output. */
558 #define CODING_ISO_FLAG_SAFE 0x0800
559
560 /* If set, extra latin codes (128..159) are accepted as a valid code
561 on input. */
562 #define CODING_ISO_FLAG_LATIN_EXTRA 0x1000
563
564 #define CODING_ISO_FLAG_COMPOSITION 0x2000
565
566 #define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000
567
568 #define CODING_ISO_FLAG_USE_ROMAN 0x8000
569
570 #define CODING_ISO_FLAG_USE_OLDJIS 0x10000
571
572 #define CODING_ISO_FLAG_FULL_SUPPORT 0x100000
573
574 /* A character to be produced on output if encoding of the original
575 character is prohibited by CODING_ISO_FLAG_SAFE. */
576 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION '?'
577
578
579 /* UTF-16 section */
580 #define CODING_UTF_16_BOM(coding) \
581 ((coding)->spec.utf_16.bom)
582
583 #define CODING_UTF_16_ENDIAN(coding) \
584 ((coding)->spec.utf_16.endian)
585
586 #define CODING_UTF_16_SURROGATE(coding) \
587 ((coding)->spec.utf_16.surrogate)
588
589
590 /* CCL section */
591 #define CODING_CCL_DECODER(coding) \
592 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
593 #define CODING_CCL_ENCODER(coding) \
594 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
595 #define CODING_CCL_VALIDS(coding) \
596 (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
597
598 /* Index for each coding category in `coding_categories' */
599
600 enum coding_category
601 {
602 coding_category_iso_7,
603 coding_category_iso_7_tight,
604 coding_category_iso_8_1,
605 coding_category_iso_8_2,
606 coding_category_iso_7_else,
607 coding_category_iso_8_else,
608 coding_category_utf_8,
609 coding_category_utf_16_auto,
610 coding_category_utf_16_be,
611 coding_category_utf_16_le,
612 coding_category_utf_16_be_nosig,
613 coding_category_utf_16_le_nosig,
614 coding_category_charset,
615 coding_category_sjis,
616 coding_category_big5,
617 coding_category_ccl,
618 coding_category_emacs_mule,
619 /* All above are targets of code detection. */
620 coding_category_raw_text,
621 coding_category_undecided,
622 coding_category_max
623 };
624
625 /* Definitions of flag bits used in detect_coding_XXXX. */
626 #define CATEGORY_MASK_ISO_7 (1 << coding_category_iso_7)
627 #define CATEGORY_MASK_ISO_7_TIGHT (1 << coding_category_iso_7_tight)
628 #define CATEGORY_MASK_ISO_8_1 (1 << coding_category_iso_8_1)
629 #define CATEGORY_MASK_ISO_8_2 (1 << coding_category_iso_8_2)
630 #define CATEGORY_MASK_ISO_7_ELSE (1 << coding_category_iso_7_else)
631 #define CATEGORY_MASK_ISO_8_ELSE (1 << coding_category_iso_8_else)
632 #define CATEGORY_MASK_UTF_8 (1 << coding_category_utf_8)
633 #define CATEGORY_MASK_UTF_16_AUTO (1 << coding_category_utf_16_auto)
634 #define CATEGORY_MASK_UTF_16_BE (1 << coding_category_utf_16_be)
635 #define CATEGORY_MASK_UTF_16_LE (1 << coding_category_utf_16_le)
636 #define CATEGORY_MASK_UTF_16_BE_NOSIG (1 << coding_category_utf_16_be_nosig)
637 #define CATEGORY_MASK_UTF_16_LE_NOSIG (1 << coding_category_utf_16_le_nosig)
638 #define CATEGORY_MASK_CHARSET (1 << coding_category_charset)
639 #define CATEGORY_MASK_SJIS (1 << coding_category_sjis)
640 #define CATEGORY_MASK_BIG5 (1 << coding_category_big5)
641 #define CATEGORY_MASK_CCL (1 << coding_category_ccl)
642 #define CATEGORY_MASK_EMACS_MULE (1 << coding_category_emacs_mule)
643 #define CATEGORY_MASK_RAW_TEXT (1 << coding_category_raw_text)
644
645 /* This value is returned if detect_coding_mask () find nothing other
646 than ASCII characters. */
647 #define CATEGORY_MASK_ANY \
648 (CATEGORY_MASK_ISO_7 \
649 | CATEGORY_MASK_ISO_7_TIGHT \
650 | CATEGORY_MASK_ISO_8_1 \
651 | CATEGORY_MASK_ISO_8_2 \
652 | CATEGORY_MASK_ISO_7_ELSE \
653 | CATEGORY_MASK_ISO_8_ELSE \
654 | CATEGORY_MASK_UTF_8 \
655 | CATEGORY_MASK_UTF_16_BE \
656 | CATEGORY_MASK_UTF_16_LE \
657 | CATEGORY_MASK_UTF_16_BE_NOSIG \
658 | CATEGORY_MASK_UTF_16_LE_NOSIG \
659 | CATEGORY_MASK_CHARSET \
660 | CATEGORY_MASK_SJIS \
661 | CATEGORY_MASK_BIG5 \
662 | CATEGORY_MASK_CCL \
663 | CATEGORY_MASK_EMACS_MULE)
664
665
666 #define CATEGORY_MASK_ISO_7BIT \
667 (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
668
669 #define CATEGORY_MASK_ISO_8BIT \
670 (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
671
672 #define CATEGORY_MASK_ISO_ELSE \
673 (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
674
675 #define CATEGORY_MASK_ISO_ESCAPE \
676 (CATEGORY_MASK_ISO_7 \
677 | CATEGORY_MASK_ISO_7_TIGHT \
678 | CATEGORY_MASK_ISO_7_ELSE \
679 | CATEGORY_MASK_ISO_8_ELSE)
680
681 #define CATEGORY_MASK_ISO \
682 ( CATEGORY_MASK_ISO_7BIT \
683 | CATEGORY_MASK_ISO_8BIT \
684 | CATEGORY_MASK_ISO_ELSE)
685
686 #define CATEGORY_MASK_UTF_16 \
687 (CATEGORY_MASK_UTF_16_BE \
688 | CATEGORY_MASK_UTF_16_LE \
689 | CATEGORY_MASK_UTF_16_BE_NOSIG \
690 | CATEGORY_MASK_UTF_16_LE_NOSIG)
691
692
693 /* List of symbols `coding-category-xxx' ordered by priority. This
694 variable is exposed to Emacs Lisp. */
695 static Lisp_Object Vcoding_category_list;
696
697 /* Table of coding categories (Lisp symbols). This variable is for
698 internal use oly. */
699 static Lisp_Object Vcoding_category_table;
700
701 /* Table of coding-categories ordered by priority. */
702 static enum coding_category coding_priorities[coding_category_max];
703
704 /* Nth element is a coding context for the coding system bound to the
705 Nth coding category. */
706 static struct coding_system coding_categories[coding_category_max];
707
708 /*** Commonly used macros and functions ***/
709
710 #ifndef min
711 #define min(a, b) ((a) < (b) ? (a) : (b))
712 #endif
713 #ifndef max
714 #define max(a, b) ((a) > (b) ? (a) : (b))
715 #endif
716
717 #define CODING_GET_INFO(coding, attrs, charset_list) \
718 do { \
719 (attrs) = CODING_ID_ATTRS ((coding)->id); \
720 (charset_list) = CODING_ATTR_CHARSET_LIST (attrs); \
721 } while (0)
722
723
724 /* Safely get one byte from the source text pointed by SRC which ends
725 at SRC_END, and set C to that byte. If there are not enough bytes
726 in the source, it jumps to `no_more_source'. If multibytep is
727 nonzero, and a multibyte character is found at SRC, set C to the
728 negative value of the character code. The caller should declare
729 and set these variables appropriately in advance:
730 src, src_end, multibytep */
731
732 #define ONE_MORE_BYTE(c) \
733 do { \
734 if (src == src_end) \
735 { \
736 if (src_base < src) \
737 record_conversion_result \
738 (coding, CODING_RESULT_INSUFFICIENT_SRC); \
739 goto no_more_source; \
740 } \
741 c = *src++; \
742 if (multibytep && (c & 0x80)) \
743 { \
744 if ((c & 0xFE) == 0xC0) \
745 c = ((c & 1) << 6) | *src++; \
746 else \
747 { \
748 c = - string_char (--src, &src, NULL); \
749 record_conversion_result \
750 (coding, CODING_RESULT_INVALID_SRC); \
751 } \
752 } \
753 consumed_chars++; \
754 } while (0)
755
756
757 #define ONE_MORE_BYTE_NO_CHECK(c) \
758 do { \
759 c = *src++; \
760 if (multibytep && (c & 0x80)) \
761 { \
762 if ((c & 0xFE) == 0xC0) \
763 c = ((c & 1) << 6) | *src++; \
764 else \
765 { \
766 c = - string_char (--src, &src, NULL); \
767 record_conversion_result \
768 (coding, CODING_RESULT_INVALID_SRC); \
769 } \
770 } \
771 consumed_chars++; \
772 } while (0)
773
774
775 /* Store a byte C in the place pointed by DST and increment DST to the
776 next free point, and increment PRODUCED_CHARS. The caller should
777 assure that C is 0..127, and declare and set the variable `dst'
778 appropriately in advance.
779 */
780
781
782 #define EMIT_ONE_ASCII_BYTE(c) \
783 do { \
784 produced_chars++; \
785 *dst++ = (c); \
786 } while (0)
787
788
789 /* Like EMIT_ONE_ASCII_BYTE byt store two bytes; C1 and C2. */
790
791 #define EMIT_TWO_ASCII_BYTES(c1, c2) \
792 do { \
793 produced_chars += 2; \
794 *dst++ = (c1), *dst++ = (c2); \
795 } while (0)
796
797
798 /* Store a byte C in the place pointed by DST and increment DST to the
799 next free point, and increment PRODUCED_CHARS. If MULTIBYTEP is
800 nonzero, store in an appropriate multibyte from. The caller should
801 declare and set the variables `dst' and `multibytep' appropriately
802 in advance. */
803
804 #define EMIT_ONE_BYTE(c) \
805 do { \
806 produced_chars++; \
807 if (multibytep) \
808 { \
809 int ch = (c); \
810 if (ch >= 0x80) \
811 ch = BYTE8_TO_CHAR (ch); \
812 CHAR_STRING_ADVANCE (ch, dst); \
813 } \
814 else \
815 *dst++ = (c); \
816 } while (0)
817
818
819 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2. */
820
821 #define EMIT_TWO_BYTES(c1, c2) \
822 do { \
823 produced_chars += 2; \
824 if (multibytep) \
825 { \
826 int ch; \
827 \
828 ch = (c1); \
829 if (ch >= 0x80) \
830 ch = BYTE8_TO_CHAR (ch); \
831 CHAR_STRING_ADVANCE (ch, dst); \
832 ch = (c2); \
833 if (ch >= 0x80) \
834 ch = BYTE8_TO_CHAR (ch); \
835 CHAR_STRING_ADVANCE (ch, dst); \
836 } \
837 else \
838 { \
839 *dst++ = (c1); \
840 *dst++ = (c2); \
841 } \
842 } while (0)
843
844
845 #define EMIT_THREE_BYTES(c1, c2, c3) \
846 do { \
847 EMIT_ONE_BYTE (c1); \
848 EMIT_TWO_BYTES (c2, c3); \
849 } while (0)
850
851
852 #define EMIT_FOUR_BYTES(c1, c2, c3, c4) \
853 do { \
854 EMIT_TWO_BYTES (c1, c2); \
855 EMIT_TWO_BYTES (c3, c4); \
856 } while (0)
857
858
859 static void
860 record_conversion_result (struct coding_system *coding,
861 enum coding_result_code result)
862 {
863 coding->result = result;
864 switch (result)
865 {
866 case CODING_RESULT_INSUFFICIENT_SRC:
867 Vlast_code_conversion_error = Qinsufficient_source;
868 break;
869 case CODING_RESULT_INCONSISTENT_EOL:
870 Vlast_code_conversion_error = Qinconsistent_eol;
871 break;
872 case CODING_RESULT_INVALID_SRC:
873 Vlast_code_conversion_error = Qinvalid_source;
874 break;
875 case CODING_RESULT_INTERRUPT:
876 Vlast_code_conversion_error = Qinterrupted;
877 break;
878 case CODING_RESULT_INSUFFICIENT_MEM:
879 Vlast_code_conversion_error = Qinsufficient_memory;
880 break;
881 }
882 }
883
884 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
885 do { \
886 charset_map_loaded = 0; \
887 c = DECODE_CHAR (charset, code); \
888 if (charset_map_loaded) \
889 { \
890 const unsigned char *orig = coding->source; \
891 EMACS_INT offset; \
892 \
893 coding_set_source (coding); \
894 offset = coding->source - orig; \
895 src += offset; \
896 src_base += offset; \
897 src_end += offset; \
898 } \
899 } while (0)
900
901
902 #define ASSURE_DESTINATION(bytes) \
903 do { \
904 if (dst + (bytes) >= dst_end) \
905 { \
906 int more_bytes = charbuf_end - charbuf + (bytes); \
907 \
908 dst = alloc_destination (coding, more_bytes, dst); \
909 dst_end = coding->destination + coding->dst_bytes; \
910 } \
911 } while (0)
912
913
914
915 static void
916 coding_set_source (coding)
917 struct coding_system *coding;
918 {
919 if (BUFFERP (coding->src_object))
920 {
921 struct buffer *buf = XBUFFER (coding->src_object);
922
923 if (coding->src_pos < 0)
924 coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
925 else
926 coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
927 }
928 else if (STRINGP (coding->src_object))
929 {
930 coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
931 }
932 else
933 /* Otherwise, the source is C string and is never relocated
934 automatically. Thus we don't have to update anything. */
935 ;
936 }
937
938 static void
939 coding_set_destination (coding)
940 struct coding_system *coding;
941 {
942 if (BUFFERP (coding->dst_object))
943 {
944 if (coding->src_pos < 0)
945 {
946 coding->destination = BEG_ADDR + coding->dst_pos_byte - 1;
947 coding->dst_bytes = (GAP_END_ADDR
948 - (coding->src_bytes - coding->consumed)
949 - coding->destination);
950 }
951 else
952 {
953 /* We are sure that coding->dst_pos_byte is before the gap
954 of the buffer. */
955 coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
956 + coding->dst_pos_byte - 1);
957 coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
958 - coding->destination);
959 }
960 }
961 else
962 /* Otherwise, the destination is C string and is never relocated
963 automatically. Thus we don't have to update anything. */
964 ;
965 }
966
967
968 static void
969 coding_alloc_by_realloc (coding, bytes)
970 struct coding_system *coding;
971 EMACS_INT bytes;
972 {
973 coding->destination = (unsigned char *) xrealloc (coding->destination,
974 coding->dst_bytes + bytes);
975 coding->dst_bytes += bytes;
976 }
977
978 static void
979 coding_alloc_by_making_gap (coding, bytes)
980 struct coding_system *coding;
981 EMACS_INT bytes;
982 {
983 if (BUFFERP (coding->dst_object)
984 && EQ (coding->src_object, coding->dst_object))
985 {
986 EMACS_INT add = coding->src_bytes - coding->consumed;
987
988 GAP_SIZE -= add; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
989 make_gap (bytes);
990 GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
991 }
992 else
993 {
994 Lisp_Object this_buffer;
995
996 this_buffer = Fcurrent_buffer ();
997 set_buffer_internal (XBUFFER (coding->dst_object));
998 make_gap (bytes);
999 set_buffer_internal (XBUFFER (this_buffer));
1000 }
1001 }
1002
1003
1004 static unsigned char *
1005 alloc_destination (coding, nbytes, dst)
1006 struct coding_system *coding;
1007 EMACS_INT nbytes;
1008 unsigned char *dst;
1009 {
1010 EMACS_INT offset = dst - coding->destination;
1011
1012 if (BUFFERP (coding->dst_object))
1013 coding_alloc_by_making_gap (coding, nbytes);
1014 else
1015 coding_alloc_by_realloc (coding, nbytes);
1016 record_conversion_result (coding, CODING_RESULT_SUCCESS);
1017 coding_set_destination (coding);
1018 dst = coding->destination + offset;
1019 return dst;
1020 }
1021
1022 /** Macros for annotations. */
1023
1024 /* Maximum length of annotation data (sum of annotations for
1025 composition and charset). */
1026 #define MAX_ANNOTATION_LENGTH (5 + (MAX_COMPOSITION_COMPONENTS * 2) - 1 + 5)
1027
1028 /* An annotation data is stored in the array coding->charbuf in this
1029 format:
1030 [ -LENGTH ANNOTATION_MASK FROM TO ... ]
1031 LENGTH is the number of elements in the annotation.
1032 ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
1033 FROM and TO specify the range of text annotated. They are relative
1034 to coding->src_pos (on encoding) or coding->dst_pos (on decoding).
1035
1036 The format of the following elements depend on ANNOTATION_MASK.
1037
1038 In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
1039 follows:
1040 ... METHOD [ COMPOSITION-COMPONENTS ... ]
1041 METHOD is one of enum composition_method.
1042 Optionnal COMPOSITION-COMPONENTS are characters and composition
1043 rules.
1044
1045 In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1046 follows. */
1047
1048 #define ADD_ANNOTATION_DATA(buf, len, mask, from, to) \
1049 do { \
1050 *(buf)++ = -(len); \
1051 *(buf)++ = (mask); \
1052 *(buf)++ = (from); \
1053 *(buf)++ = (to); \
1054 coding->annotated = 1; \
1055 } while (0);
1056
1057 #define ADD_COMPOSITION_DATA(buf, from, to, method) \
1058 do { \
1059 ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, from, to); \
1060 *buf++ = method; \
1061 } while (0)
1062
1063
1064 #define ADD_CHARSET_DATA(buf, from, to, id) \
1065 do { \
1066 ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_CHARSET_MASK, from, to); \
1067 *buf++ = id; \
1068 } while (0)
1069
1070 \f
1071 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1072
1073
1074
1075 \f
1076 /*** 3. UTF-8 ***/
1077
1078 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1079 Check if a text is encoded in UTF-8. If it is, return 1, else
1080 return 0. */
1081
1082 #define UTF_8_1_OCTET_P(c) ((c) < 0x80)
1083 #define UTF_8_EXTRA_OCTET_P(c) (((c) & 0xC0) == 0x80)
1084 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
1085 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
1086 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
1087 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
1088
1089 static int
1090 detect_coding_utf_8 (coding, detect_info)
1091 struct coding_system *coding;
1092 struct coding_detection_info *detect_info;
1093 {
1094 const unsigned char *src = coding->source, *src_base;
1095 const unsigned char *src_end = coding->source + coding->src_bytes;
1096 int multibytep = coding->src_multibyte;
1097 int consumed_chars = 0;
1098 int found = 0;
1099
1100 detect_info->checked |= CATEGORY_MASK_UTF_8;
1101 /* A coding system of this category is always ASCII compatible. */
1102 src += coding->head_ascii;
1103
1104 while (1)
1105 {
1106 int c, c1, c2, c3, c4;
1107
1108 src_base = src;
1109 ONE_MORE_BYTE (c);
1110 if (c < 0 || UTF_8_1_OCTET_P (c))
1111 continue;
1112 ONE_MORE_BYTE (c1);
1113 if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
1114 break;
1115 if (UTF_8_2_OCTET_LEADING_P (c))
1116 {
1117 found = CATEGORY_MASK_UTF_8;
1118 continue;
1119 }
1120 ONE_MORE_BYTE (c2);
1121 if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1122 break;
1123 if (UTF_8_3_OCTET_LEADING_P (c))
1124 {
1125 found = CATEGORY_MASK_UTF_8;
1126 continue;
1127 }
1128 ONE_MORE_BYTE (c3);
1129 if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1130 break;
1131 if (UTF_8_4_OCTET_LEADING_P (c))
1132 {
1133 found = CATEGORY_MASK_UTF_8;
1134 continue;
1135 }
1136 ONE_MORE_BYTE (c4);
1137 if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1138 break;
1139 if (UTF_8_5_OCTET_LEADING_P (c))
1140 {
1141 found = CATEGORY_MASK_UTF_8;
1142 continue;
1143 }
1144 break;
1145 }
1146 detect_info->rejected |= CATEGORY_MASK_UTF_8;
1147 return 0;
1148
1149 no_more_source:
1150 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1151 {
1152 detect_info->rejected |= CATEGORY_MASK_UTF_8;
1153 return 0;
1154 }
1155 detect_info->found |= found;
1156 return 1;
1157 }
1158
1159
1160 static void
1161 decode_coding_utf_8 (coding)
1162 struct coding_system *coding;
1163 {
1164 const unsigned char *src = coding->source + coding->consumed;
1165 const unsigned char *src_end = coding->source + coding->src_bytes;
1166 const unsigned char *src_base;
1167 int *charbuf = coding->charbuf;
1168 int *charbuf_end = charbuf + coding->charbuf_size;
1169 int consumed_chars = 0, consumed_chars_base;
1170 int multibytep = coding->src_multibyte;
1171 Lisp_Object attr, charset_list;
1172
1173 CODING_GET_INFO (coding, attr, charset_list);
1174
1175 while (1)
1176 {
1177 int c, c1, c2, c3, c4, c5;
1178
1179 src_base = src;
1180 consumed_chars_base = consumed_chars;
1181
1182 if (charbuf >= charbuf_end)
1183 break;
1184
1185 ONE_MORE_BYTE (c1);
1186 if (c1 < 0)
1187 {
1188 c = - c1;
1189 }
1190 else if (UTF_8_1_OCTET_P(c1))
1191 {
1192 c = c1;
1193 }
1194 else
1195 {
1196 ONE_MORE_BYTE (c2);
1197 if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1198 goto invalid_code;
1199 if (UTF_8_2_OCTET_LEADING_P (c1))
1200 {
1201 c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1202 /* Reject overlong sequences here and below. Encoders
1203 producing them are incorrect, they can be misleading,
1204 and they mess up read/write invariance. */
1205 if (c < 128)
1206 goto invalid_code;
1207 }
1208 else
1209 {
1210 ONE_MORE_BYTE (c3);
1211 if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1212 goto invalid_code;
1213 if (UTF_8_3_OCTET_LEADING_P (c1))
1214 {
1215 c = (((c1 & 0xF) << 12)
1216 | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
1217 if (c < 0x800
1218 || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
1219 goto invalid_code;
1220 }
1221 else
1222 {
1223 ONE_MORE_BYTE (c4);
1224 if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1225 goto invalid_code;
1226 if (UTF_8_4_OCTET_LEADING_P (c1))
1227 {
1228 c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1229 | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
1230 if (c < 0x10000)
1231 goto invalid_code;
1232 }
1233 else
1234 {
1235 ONE_MORE_BYTE (c5);
1236 if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
1237 goto invalid_code;
1238 if (UTF_8_5_OCTET_LEADING_P (c1))
1239 {
1240 c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1241 | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1242 | (c5 & 0x3F));
1243 if ((c > MAX_CHAR) || (c < 0x200000))
1244 goto invalid_code;
1245 }
1246 else
1247 goto invalid_code;
1248 }
1249 }
1250 }
1251 }
1252
1253 *charbuf++ = c;
1254 continue;
1255
1256 invalid_code:
1257 src = src_base;
1258 consumed_chars = consumed_chars_base;
1259 ONE_MORE_BYTE (c);
1260 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1261 coding->errors++;
1262 }
1263
1264 no_more_source:
1265 coding->consumed_char += consumed_chars_base;
1266 coding->consumed = src_base - coding->source;
1267 coding->charbuf_used = charbuf - coding->charbuf;
1268 }
1269
1270
1271 static int
1272 encode_coding_utf_8 (coding)
1273 struct coding_system *coding;
1274 {
1275 int multibytep = coding->dst_multibyte;
1276 int *charbuf = coding->charbuf;
1277 int *charbuf_end = charbuf + coding->charbuf_used;
1278 unsigned char *dst = coding->destination + coding->produced;
1279 unsigned char *dst_end = coding->destination + coding->dst_bytes;
1280 int produced_chars = 0;
1281 int c;
1282
1283 if (multibytep)
1284 {
1285 int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1286
1287 while (charbuf < charbuf_end)
1288 {
1289 unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1290
1291 ASSURE_DESTINATION (safe_room);
1292 c = *charbuf++;
1293 if (CHAR_BYTE8_P (c))
1294 {
1295 c = CHAR_TO_BYTE8 (c);
1296 EMIT_ONE_BYTE (c);
1297 }
1298 else
1299 {
1300 CHAR_STRING_ADVANCE (c, pend);
1301 for (p = str; p < pend; p++)
1302 EMIT_ONE_BYTE (*p);
1303 }
1304 }
1305 }
1306 else
1307 {
1308 int safe_room = MAX_MULTIBYTE_LENGTH;
1309
1310 while (charbuf < charbuf_end)
1311 {
1312 ASSURE_DESTINATION (safe_room);
1313 c = *charbuf++;
1314 dst += CHAR_STRING (c, dst);
1315 produced_chars++;
1316 }
1317 }
1318 record_conversion_result (coding, CODING_RESULT_SUCCESS);
1319 coding->produced_char += produced_chars;
1320 coding->produced = dst - coding->destination;
1321 return 0;
1322 }
1323
1324
1325 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1326 Check if a text is encoded in one of UTF-16 based coding systems.
1327 If it is, return 1, else return 0. */
1328
1329 #define UTF_16_HIGH_SURROGATE_P(val) \
1330 (((val) & 0xFC00) == 0xD800)
1331
1332 #define UTF_16_LOW_SURROGATE_P(val) \
1333 (((val) & 0xFC00) == 0xDC00)
1334
1335 #define UTF_16_INVALID_P(val) \
1336 (((val) == 0xFFFE) \
1337 || ((val) == 0xFFFF) \
1338 || UTF_16_LOW_SURROGATE_P (val))
1339
1340
1341 static int
1342 detect_coding_utf_16 (coding, detect_info)
1343 struct coding_system *coding;
1344 struct coding_detection_info *detect_info;
1345 {
1346 const unsigned char *src = coding->source, *src_base = src;
1347 const unsigned char *src_end = coding->source + coding->src_bytes;
1348 int multibytep = coding->src_multibyte;
1349 int consumed_chars = 0;
1350 int c1, c2;
1351
1352 detect_info->checked |= CATEGORY_MASK_UTF_16;
1353 if (coding->mode & CODING_MODE_LAST_BLOCK
1354 && (coding->src_chars & 1))
1355 {
1356 detect_info->rejected |= CATEGORY_MASK_UTF_16;
1357 return 0;
1358 }
1359
1360 ONE_MORE_BYTE (c1);
1361 ONE_MORE_BYTE (c2);
1362 if ((c1 == 0xFF) && (c2 == 0xFE))
1363 {
1364 detect_info->found |= (CATEGORY_MASK_UTF_16_LE
1365 | CATEGORY_MASK_UTF_16_AUTO);
1366 detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
1367 | CATEGORY_MASK_UTF_16_BE_NOSIG
1368 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1369 }
1370 else if ((c1 == 0xFE) && (c2 == 0xFF))
1371 {
1372 detect_info->found |= (CATEGORY_MASK_UTF_16_BE
1373 | CATEGORY_MASK_UTF_16_AUTO);
1374 detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
1375 | CATEGORY_MASK_UTF_16_BE_NOSIG
1376 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1377 }
1378 else if (c1 >= 0 && c2 >= 0)
1379 {
1380 unsigned char b1[256], b2[256];
1381 int b1_variants = 1, b2_variants = 1;
1382 int n;
1383
1384 bzero (b1, 256), bzero (b2, 256);
1385 b1[c1]++, b2[c2]++;
1386 for (n = 0; n < 256 && src < src_end; n++)
1387 {
1388 src_base = src;
1389 ONE_MORE_BYTE (c1);
1390 ONE_MORE_BYTE (c2);
1391 if (c1 < 0 || c2 < 0)
1392 break;
1393 if (! b1[c1++]) b1_variants++;
1394 if (! b2[c2++]) b2_variants++;
1395 }
1396 if (b1_variants < b2_variants)
1397 detect_info->found |= CATEGORY_MASK_UTF_16_BE_NOSIG;
1398 else
1399 detect_info->found |= CATEGORY_MASK_UTF_16_LE_NOSIG;
1400 detect_info->rejected
1401 |= (CATEGORY_MASK_UTF_16_BE | CATEGORY_MASK_UTF_16_LE);
1402 }
1403 no_more_source:
1404 return 1;
1405 }
1406
1407 static void
1408 decode_coding_utf_16 (coding)
1409 struct coding_system *coding;
1410 {
1411 const unsigned char *src = coding->source + coding->consumed;
1412 const unsigned char *src_end = coding->source + coding->src_bytes;
1413 const unsigned char *src_base;
1414 int *charbuf = coding->charbuf;
1415 int *charbuf_end = charbuf + coding->charbuf_size;
1416 int consumed_chars = 0, consumed_chars_base;
1417 int multibytep = coding->src_multibyte;
1418 enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding);
1419 enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1420 int surrogate = CODING_UTF_16_SURROGATE (coding);
1421 Lisp_Object attr, charset_list;
1422
1423 CODING_GET_INFO (coding, attr, charset_list);
1424
1425 if (bom == utf_16_with_bom)
1426 {
1427 int c, c1, c2;
1428
1429 src_base = src;
1430 ONE_MORE_BYTE (c1);
1431 ONE_MORE_BYTE (c2);
1432 c = (c1 << 8) | c2;
1433
1434 if (endian == utf_16_big_endian
1435 ? c != 0xFEFF : c != 0xFFFE)
1436 {
1437 /* The first two bytes are not BOM. Treat them as bytes
1438 for a normal character. */
1439 src = src_base;
1440 coding->errors++;
1441 }
1442 CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1443 }
1444 else if (bom == utf_16_detect_bom)
1445 {
1446 /* We have already tried to detect BOM and failed in
1447 detect_coding. */
1448 CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1449 }
1450
1451 while (1)
1452 {
1453 int c, c1, c2;
1454
1455 src_base = src;
1456 consumed_chars_base = consumed_chars;
1457
1458 if (charbuf + 2 >= charbuf_end)
1459 break;
1460
1461 ONE_MORE_BYTE (c1);
1462 if (c1 < 0)
1463 {
1464 *charbuf++ = -c1;
1465 continue;
1466 }
1467 ONE_MORE_BYTE (c2);
1468 if (c2 < 0)
1469 {
1470 *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
1471 *charbuf++ = -c2;
1472 continue;
1473 }
1474 c = (endian == utf_16_big_endian
1475 ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1476 if (surrogate)
1477 {
1478 if (! UTF_16_LOW_SURROGATE_P (c))
1479 {
1480 if (endian == utf_16_big_endian)
1481 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1482 else
1483 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1484 *charbuf++ = c1;
1485 *charbuf++ = c2;
1486 coding->errors++;
1487 if (UTF_16_HIGH_SURROGATE_P (c))
1488 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1489 else
1490 *charbuf++ = c;
1491 }
1492 else
1493 {
1494 c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1495 CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1496 *charbuf++ = c;
1497 }
1498 }
1499 else
1500 {
1501 if (UTF_16_HIGH_SURROGATE_P (c))
1502 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1503 else
1504 *charbuf++ = c;
1505 }
1506 }
1507
1508 no_more_source:
1509 coding->consumed_char += consumed_chars_base;
1510 coding->consumed = src_base - coding->source;
1511 coding->charbuf_used = charbuf - coding->charbuf;
1512 }
1513
1514 static int
1515 encode_coding_utf_16 (coding)
1516 struct coding_system *coding;
1517 {
1518 int multibytep = coding->dst_multibyte;
1519 int *charbuf = coding->charbuf;
1520 int *charbuf_end = charbuf + coding->charbuf_used;
1521 unsigned char *dst = coding->destination + coding->produced;
1522 unsigned char *dst_end = coding->destination + coding->dst_bytes;
1523 int safe_room = 8;
1524 enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding);
1525 int big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1526 int produced_chars = 0;
1527 Lisp_Object attrs, charset_list;
1528 int c;
1529
1530 CODING_GET_INFO (coding, attrs, charset_list);
1531
1532 if (bom != utf_16_without_bom)
1533 {
1534 ASSURE_DESTINATION (safe_room);
1535 if (big_endian)
1536 EMIT_TWO_BYTES (0xFE, 0xFF);
1537 else
1538 EMIT_TWO_BYTES (0xFF, 0xFE);
1539 CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1540 }
1541
1542 while (charbuf < charbuf_end)
1543 {
1544 ASSURE_DESTINATION (safe_room);
1545 c = *charbuf++;
1546 if (c >= MAX_UNICODE_CHAR)
1547 c = coding->default_char;
1548
1549 if (c < 0x10000)
1550 {
1551 if (big_endian)
1552 EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1553 else
1554 EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1555 }
1556 else
1557 {
1558 int c1, c2;
1559
1560 c -= 0x10000;
1561 c1 = (c >> 10) + 0xD800;
1562 c2 = (c & 0x3FF) + 0xDC00;
1563 if (big_endian)
1564 EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1565 else
1566 EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1567 }
1568 }
1569 record_conversion_result (coding, CODING_RESULT_SUCCESS);
1570 coding->produced = dst - coding->destination;
1571 coding->produced_char += produced_chars;
1572 return 0;
1573 }
1574
1575 \f
1576 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1577
1578 /* Emacs' internal format for representation of multiple character
1579 sets is a kind of multi-byte encoding, i.e. characters are
1580 represented by variable-length sequences of one-byte codes.
1581
1582 ASCII characters and control characters (e.g. `tab', `newline') are
1583 represented by one-byte sequences which are their ASCII codes, in
1584 the range 0x00 through 0x7F.
1585
1586 8-bit characters of the range 0x80..0x9F are represented by
1587 two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1588 code + 0x20).
1589
1590 8-bit characters of the range 0xA0..0xFF are represented by
1591 one-byte sequences which are their 8-bit code.
1592
1593 The other characters are represented by a sequence of `base
1594 leading-code', optional `extended leading-code', and one or two
1595 `position-code's. The length of the sequence is determined by the
1596 base leading-code. Leading-code takes the range 0x81 through 0x9D,
1597 whereas extended leading-code and position-code take the range 0xA0
1598 through 0xFF. See `charset.h' for more details about leading-code
1599 and position-code.
1600
1601 --- CODE RANGE of Emacs' internal format ---
1602 character set range
1603 ------------- -----
1604 ascii 0x00..0x7F
1605 eight-bit-control LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1606 eight-bit-graphic 0xA0..0xBF
1607 ELSE 0x81..0x9D + [0xA0..0xFF]+
1608 ---------------------------------------------
1609
1610 As this is the internal character representation, the format is
1611 usually not used externally (i.e. in a file or in a data sent to a
1612 process). But, it is possible to have a text externally in this
1613 format (i.e. by encoding by the coding system `emacs-mule').
1614
1615 In that case, a sequence of one-byte codes has a slightly different
1616 form.
1617
1618 At first, all characters in eight-bit-control are represented by
1619 one-byte sequences which are their 8-bit code.
1620
1621 Next, character composition data are represented by the byte
1622 sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1623 where,
1624 METHOD is 0xF0 plus one of composition method (enum
1625 composition_method),
1626
1627 BYTES is 0xA0 plus a byte length of this composition data,
1628
1629 CHARS is 0x20 plus a number of characters composed by this
1630 data,
1631
1632 COMPONENTs are characters of multibye form or composition
1633 rules encoded by two-byte of ASCII codes.
1634
1635 In addition, for backward compatibility, the following formats are
1636 also recognized as composition data on decoding.
1637
1638 0x80 MSEQ ...
1639 0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1640
1641 Here,
1642 MSEQ is a multibyte form but in these special format:
1643 ASCII: 0xA0 ASCII_CODE+0x80,
1644 other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1645 RULE is a one byte code of the range 0xA0..0xF0 that
1646 represents a composition rule.
1647 */
1648
1649 char emacs_mule_bytes[256];
1650
1651 int
1652 emacs_mule_char (coding, src, nbytes, nchars, id)
1653 struct coding_system *coding;
1654 const unsigned char *src;
1655 int *nbytes, *nchars, *id;
1656 {
1657 const unsigned char *src_end = coding->source + coding->src_bytes;
1658 const unsigned char *src_base = src;
1659 int multibytep = coding->src_multibyte;
1660 struct charset *charset;
1661 unsigned code;
1662 int c;
1663 int consumed_chars = 0;
1664
1665 ONE_MORE_BYTE (c);
1666 if (c < 0)
1667 {
1668 c = -c;
1669 charset = emacs_mule_charset[0];
1670 }
1671 else
1672 {
1673 switch (emacs_mule_bytes[c])
1674 {
1675 case 2:
1676 if (! (charset = emacs_mule_charset[c]))
1677 goto invalid_code;
1678 ONE_MORE_BYTE (c);
1679 if (c < 0)
1680 goto invalid_code;
1681 code = c & 0x7F;
1682 break;
1683
1684 case 3:
1685 if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
1686 || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
1687 {
1688 ONE_MORE_BYTE (c);
1689 if (c < 0 || ! (charset = emacs_mule_charset[c]))
1690 goto invalid_code;
1691 ONE_MORE_BYTE (c);
1692 if (c < 0)
1693 goto invalid_code;
1694 code = c & 0x7F;
1695 }
1696 else
1697 {
1698 if (! (charset = emacs_mule_charset[c]))
1699 goto invalid_code;
1700 ONE_MORE_BYTE (c);
1701 if (c < 0)
1702 goto invalid_code;
1703 code = (c & 0x7F) << 8;
1704 ONE_MORE_BYTE (c);
1705 if (c < 0)
1706 goto invalid_code;
1707 code |= c & 0x7F;
1708 }
1709 break;
1710
1711 case 4:
1712 ONE_MORE_BYTE (c);
1713 if (c < 0 || ! (charset = emacs_mule_charset[c]))
1714 goto invalid_code;
1715 ONE_MORE_BYTE (c);
1716 if (c < 0)
1717 goto invalid_code;
1718 code = (c & 0x7F) << 8;
1719 ONE_MORE_BYTE (c);
1720 if (c < 0)
1721 goto invalid_code;
1722 code |= c & 0x7F;
1723 break;
1724
1725 case 1:
1726 code = c;
1727 charset = CHARSET_FROM_ID (ASCII_BYTE_P (code)
1728 ? charset_ascii : charset_eight_bit);
1729 break;
1730
1731 default:
1732 abort ();
1733 }
1734 c = DECODE_CHAR (charset, code);
1735 if (c < 0)
1736 goto invalid_code;
1737 }
1738 *nbytes = src - src_base;
1739 *nchars = consumed_chars;
1740 if (id)
1741 *id = charset->id;
1742 return c;
1743
1744 no_more_source:
1745 return -2;
1746
1747 invalid_code:
1748 return -1;
1749 }
1750
1751
1752 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1753 Check if a text is encoded in `emacs-mule'. If it is, return 1,
1754 else return 0. */
1755
1756 static int
1757 detect_coding_emacs_mule (coding, detect_info)
1758 struct coding_system *coding;
1759 struct coding_detection_info *detect_info;
1760 {
1761 const unsigned char *src = coding->source, *src_base;
1762 const unsigned char *src_end = coding->source + coding->src_bytes;
1763 int multibytep = coding->src_multibyte;
1764 int consumed_chars = 0;
1765 int c;
1766 int found = 0;
1767
1768 detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1769 /* A coding system of this category is always ASCII compatible. */
1770 src += coding->head_ascii;
1771
1772 while (1)
1773 {
1774 src_base = src;
1775 ONE_MORE_BYTE (c);
1776 if (c < 0)
1777 continue;
1778 if (c == 0x80)
1779 {
1780 /* Perhaps the start of composite character. We simple skip
1781 it because analyzing it is too heavy for detecting. But,
1782 at least, we check that the composite character
1783 constitues of more than 4 bytes. */
1784 const unsigned char *src_base;
1785
1786 repeat:
1787 src_base = src;
1788 do
1789 {
1790 ONE_MORE_BYTE (c);
1791 }
1792 while (c >= 0xA0);
1793
1794 if (src - src_base <= 4)
1795 break;
1796 found = CATEGORY_MASK_EMACS_MULE;
1797 if (c == 0x80)
1798 goto repeat;
1799 }
1800
1801 if (c < 0x80)
1802 {
1803 if (c < 0x20
1804 && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
1805 break;
1806 }
1807 else
1808 {
1809 const unsigned char *src_base = src - 1;
1810
1811 do
1812 {
1813 ONE_MORE_BYTE (c);
1814 }
1815 while (c >= 0xA0);
1816 if (src - src_base != emacs_mule_bytes[*src_base])
1817 break;
1818 found = CATEGORY_MASK_EMACS_MULE;
1819 }
1820 }
1821 detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1822 return 0;
1823
1824 no_more_source:
1825 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1826 {
1827 detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1828 return 0;
1829 }
1830 detect_info->found |= found;
1831 return 1;
1832 }
1833
1834
1835 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
1836
1837 /* Decode a character represented as a component of composition
1838 sequence of Emacs 20/21 style at SRC. Set C to that character and
1839 update SRC to the head of next character (or an encoded composition
1840 rule). If SRC doesn't points a composition component, set C to -1.
1841 If SRC points an invalid byte sequence, global exit by a return
1842 value 0. */
1843
1844 #define DECODE_EMACS_MULE_COMPOSITION_CHAR(buf) \
1845 if (1) \
1846 { \
1847 int c; \
1848 int nbytes, nchars; \
1849 \
1850 if (src == src_end) \
1851 break; \
1852 c = emacs_mule_char (coding, src, &nbytes, &nchars, NULL);\
1853 if (c < 0) \
1854 { \
1855 if (c == -2) \
1856 break; \
1857 goto invalid_code; \
1858 } \
1859 *buf++ = c; \
1860 src += nbytes; \
1861 consumed_chars += nchars; \
1862 } \
1863 else
1864
1865
1866 /* Decode a composition rule represented as a component of composition
1867 sequence of Emacs 20 style at SRC. Store the decoded rule in *BUF,
1868 and increment BUF. If SRC points an invalid byte sequence, set C
1869 to -1. */
1870
1871 #define DECODE_EMACS_MULE_COMPOSITION_RULE_20(buf) \
1872 do { \
1873 int c, gref, nref; \
1874 \
1875 if (src >= src_end) \
1876 goto invalid_code; \
1877 ONE_MORE_BYTE_NO_CHECK (c); \
1878 c -= 0x20; \
1879 if (c < 0 || c >= 81) \
1880 goto invalid_code; \
1881 \
1882 gref = c / 9, nref = c % 9; \
1883 *buf++ = COMPOSITION_ENCODE_RULE (gref, nref); \
1884 } while (0)
1885
1886
1887 /* Decode a composition rule represented as a component of composition
1888 sequence of Emacs 21 style at SRC. Store the decoded rule in *BUF,
1889 and increment BUF. If SRC points an invalid byte sequence, set C
1890 to -1. */
1891
1892 #define DECODE_EMACS_MULE_COMPOSITION_RULE_21(buf) \
1893 do { \
1894 int gref, nref; \
1895 \
1896 if (src + 1>= src_end) \
1897 goto invalid_code; \
1898 ONE_MORE_BYTE_NO_CHECK (gref); \
1899 gref -= 0x20; \
1900 ONE_MORE_BYTE_NO_CHECK (nref); \
1901 nref -= 0x20; \
1902 if (gref < 0 || gref >= 81 \
1903 || nref < 0 || nref >= 81) \
1904 goto invalid_code; \
1905 *buf++ = COMPOSITION_ENCODE_RULE (gref, nref); \
1906 } while (0)
1907
1908
1909 #define DECODE_EMACS_MULE_21_COMPOSITION(c) \
1910 do { \
1911 /* Emacs 21 style format. The first three bytes at SRC are \
1912 (METHOD - 0xF2), (BYTES - 0xA0), (CHARS - 0xA0), where BYTES is \
1913 the byte length of this composition information, CHARS is the \
1914 number of characters composed by this composition. */ \
1915 enum composition_method method = c - 0xF2; \
1916 int *charbuf_base = charbuf; \
1917 int from, to; \
1918 int consumed_chars_limit; \
1919 int nbytes, nchars; \
1920 \
1921 ONE_MORE_BYTE (c); \
1922 if (c < 0) \
1923 goto invalid_code; \
1924 nbytes = c - 0xA0; \
1925 if (nbytes < 3) \
1926 goto invalid_code; \
1927 ONE_MORE_BYTE (c); \
1928 if (c < 0) \
1929 goto invalid_code; \
1930 nchars = c - 0xA0; \
1931 from = coding->produced + char_offset; \
1932 to = from + nchars; \
1933 ADD_COMPOSITION_DATA (charbuf, from, to, method); \
1934 consumed_chars_limit = consumed_chars_base + nbytes; \
1935 if (method != COMPOSITION_RELATIVE) \
1936 { \
1937 int i = 0; \
1938 while (consumed_chars < consumed_chars_limit) \
1939 { \
1940 if (i % 2 && method != COMPOSITION_WITH_ALTCHARS) \
1941 DECODE_EMACS_MULE_COMPOSITION_RULE_21 (charbuf); \
1942 else \
1943 DECODE_EMACS_MULE_COMPOSITION_CHAR (charbuf); \
1944 i++; \
1945 } \
1946 if (consumed_chars < consumed_chars_limit) \
1947 goto invalid_code; \
1948 charbuf_base[0] -= i; \
1949 } \
1950 } while (0)
1951
1952
1953 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION(c) \
1954 do { \
1955 /* Emacs 20 style format for relative composition. */ \
1956 /* Store multibyte form of characters to be composed. */ \
1957 enum composition_method method = COMPOSITION_RELATIVE; \
1958 int components[MAX_COMPOSITION_COMPONENTS * 2 - 1]; \
1959 int *buf = components; \
1960 int i, j; \
1961 int from, to; \
1962 \
1963 src = src_base; \
1964 ONE_MORE_BYTE (c); /* skip 0x80 */ \
1965 for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++) \
1966 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
1967 if (i < 2) \
1968 goto invalid_code; \
1969 from = coding->produced_char + char_offset; \
1970 to = from + i; \
1971 ADD_COMPOSITION_DATA (charbuf, from, to, method); \
1972 for (j = 0; j < i; j++) \
1973 *charbuf++ = components[j]; \
1974 } while (0)
1975
1976
1977 #define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION(c) \
1978 do { \
1979 /* Emacs 20 style format for rule-base composition. */ \
1980 /* Store multibyte form of characters to be composed. */ \
1981 enum composition_method method = COMPOSITION_WITH_RULE; \
1982 int components[MAX_COMPOSITION_COMPONENTS * 2 - 1]; \
1983 int *buf = components; \
1984 int i, j; \
1985 int from, to; \
1986 \
1987 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
1988 for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++) \
1989 { \
1990 DECODE_EMACS_MULE_COMPOSITION_RULE_20 (buf); \
1991 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
1992 } \
1993 if (i < 1 || (buf - components) % 2 == 0) \
1994 goto invalid_code; \
1995 if (charbuf + i + (i / 2) + 1 < charbuf_end) \
1996 goto no_more_source; \
1997 from = coding->produced_char + char_offset; \
1998 to = from + i; \
1999 ADD_COMPOSITION_DATA (buf, from, to, method); \
2000 for (j = 0; j < i; j++) \
2001 *charbuf++ = components[j]; \
2002 for (j = 0; j < i; j += 2) \
2003 *charbuf++ = components[j]; \
2004 } while (0)
2005
2006
2007 static void
2008 decode_coding_emacs_mule (coding)
2009 struct coding_system *coding;
2010 {
2011 const unsigned char *src = coding->source + coding->consumed;
2012 const unsigned char *src_end = coding->source + coding->src_bytes;
2013 const unsigned char *src_base;
2014 int *charbuf = coding->charbuf;
2015 int *charbuf_end = charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
2016 int consumed_chars = 0, consumed_chars_base;
2017 int multibytep = coding->src_multibyte;
2018 Lisp_Object attrs, charset_list;
2019 int char_offset = coding->produced_char;
2020 int last_offset = char_offset;
2021 int last_id = charset_ascii;
2022
2023 CODING_GET_INFO (coding, attrs, charset_list);
2024
2025 while (1)
2026 {
2027 int c;
2028
2029 src_base = src;
2030 consumed_chars_base = consumed_chars;
2031
2032 if (charbuf >= charbuf_end)
2033 break;
2034
2035 ONE_MORE_BYTE (c);
2036 if (c < 0)
2037 {
2038 *charbuf++ = -c;
2039 char_offset++;
2040 }
2041 else if (c < 0x80)
2042 {
2043 *charbuf++ = c;
2044 char_offset++;
2045 }
2046 else if (c == 0x80)
2047 {
2048 ONE_MORE_BYTE (c);
2049 if (c < 0)
2050 goto invalid_code;
2051 if (c - 0xF2 >= COMPOSITION_RELATIVE
2052 && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)
2053 DECODE_EMACS_MULE_21_COMPOSITION (c);
2054 else if (c < 0xC0)
2055 DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION (c);
2056 else if (c == 0xFF)
2057 DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION (c);
2058 else
2059 goto invalid_code;
2060 }
2061 else if (c < 0xA0 && emacs_mule_bytes[c] > 1)
2062 {
2063 int nbytes, nchars;
2064 int id;
2065
2066 src = src_base;
2067 consumed_chars = consumed_chars_base;
2068 c = emacs_mule_char (coding, src, &nbytes, &nchars, &id);
2069 if (c < 0)
2070 {
2071 if (c == -2)
2072 break;
2073 goto invalid_code;
2074 }
2075 if (last_id != id)
2076 {
2077 if (last_id != charset_ascii)
2078 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
2079 last_id = id;
2080 last_offset = char_offset;
2081 }
2082 *charbuf++ = c;
2083 src += nbytes;
2084 consumed_chars += nchars;
2085 char_offset++;
2086 }
2087 continue;
2088
2089 invalid_code:
2090 src = src_base;
2091 consumed_chars = consumed_chars_base;
2092 ONE_MORE_BYTE (c);
2093 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
2094 char_offset++;
2095 coding->errors++;
2096 }
2097
2098 no_more_source:
2099 if (last_id != charset_ascii)
2100 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
2101 coding->consumed_char += consumed_chars_base;
2102 coding->consumed = src_base - coding->source;
2103 coding->charbuf_used = charbuf - coding->charbuf;
2104 }
2105
2106
2107 #define EMACS_MULE_LEADING_CODES(id, codes) \
2108 do { \
2109 if (id < 0xA0) \
2110 codes[0] = id, codes[1] = 0; \
2111 else if (id < 0xE0) \
2112 codes[0] = 0x9A, codes[1] = id; \
2113 else if (id < 0xF0) \
2114 codes[0] = 0x9B, codes[1] = id; \
2115 else if (id < 0xF5) \
2116 codes[0] = 0x9C, codes[1] = id; \
2117 else \
2118 codes[0] = 0x9D, codes[1] = id; \
2119 } while (0);
2120
2121
2122 static int
2123 encode_coding_emacs_mule (coding)
2124 struct coding_system *coding;
2125 {
2126 int multibytep = coding->dst_multibyte;
2127 int *charbuf = coding->charbuf;
2128 int *charbuf_end = charbuf + coding->charbuf_used;
2129 unsigned char *dst = coding->destination + coding->produced;
2130 unsigned char *dst_end = coding->destination + coding->dst_bytes;
2131 int safe_room = 8;
2132 int produced_chars = 0;
2133 Lisp_Object attrs, charset_list;
2134 int c;
2135 int preferred_charset_id = -1;
2136
2137 CODING_GET_INFO (coding, attrs, charset_list);
2138 if (! EQ (charset_list, Vemacs_mule_charset_list))
2139 {
2140 CODING_ATTR_CHARSET_LIST (attrs)
2141 = charset_list = Vemacs_mule_charset_list;
2142 }
2143
2144 while (charbuf < charbuf_end)
2145 {
2146 ASSURE_DESTINATION (safe_room);
2147 c = *charbuf++;
2148
2149 if (c < 0)
2150 {
2151 /* Handle an annotation. */
2152 switch (*charbuf)
2153 {
2154 case CODING_ANNOTATE_COMPOSITION_MASK:
2155 /* Not yet implemented. */
2156 break;
2157 case CODING_ANNOTATE_CHARSET_MASK:
2158 preferred_charset_id = charbuf[3];
2159 if (preferred_charset_id >= 0
2160 && NILP (Fmemq (make_number (preferred_charset_id),
2161 charset_list)))
2162 preferred_charset_id = -1;
2163 break;
2164 default:
2165 abort ();
2166 }
2167 charbuf += -c - 1;
2168 continue;
2169 }
2170
2171 if (ASCII_CHAR_P (c))
2172 EMIT_ONE_ASCII_BYTE (c);
2173 else if (CHAR_BYTE8_P (c))
2174 {
2175 c = CHAR_TO_BYTE8 (c);
2176 EMIT_ONE_BYTE (c);
2177 }
2178 else
2179 {
2180 struct charset *charset;
2181 unsigned code;
2182 int dimension;
2183 int emacs_mule_id;
2184 unsigned char leading_codes[2];
2185
2186 if (preferred_charset_id >= 0)
2187 {
2188 charset = CHARSET_FROM_ID (preferred_charset_id);
2189 if (! CHAR_CHARSET_P (c, charset))
2190 charset = char_charset (c, charset_list, NULL);
2191 }
2192 else
2193 charset = char_charset (c, charset_list, &code);
2194 if (! charset)
2195 {
2196 c = coding->default_char;
2197 if (ASCII_CHAR_P (c))
2198 {
2199 EMIT_ONE_ASCII_BYTE (c);
2200 continue;
2201 }
2202 charset = char_charset (c, charset_list, &code);
2203 }
2204 dimension = CHARSET_DIMENSION (charset);
2205 emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2206 EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2207 EMIT_ONE_BYTE (leading_codes[0]);
2208 if (leading_codes[1])
2209 EMIT_ONE_BYTE (leading_codes[1]);
2210 if (dimension == 1)
2211 EMIT_ONE_BYTE (code | 0x80);
2212 else
2213 {
2214 code |= 0x8080;
2215 EMIT_ONE_BYTE (code >> 8);
2216 EMIT_ONE_BYTE (code & 0xFF);
2217 }
2218 }
2219 }
2220 record_conversion_result (coding, CODING_RESULT_SUCCESS);
2221 coding->produced_char += produced_chars;
2222 coding->produced = dst - coding->destination;
2223 return 0;
2224 }
2225
2226 \f
2227 /*** 7. ISO2022 handlers ***/
2228
2229 /* The following note describes the coding system ISO2022 briefly.
2230 Since the intention of this note is to help understand the
2231 functions in this file, some parts are NOT ACCURATE or are OVERLY
2232 SIMPLIFIED. For thorough understanding, please refer to the
2233 original document of ISO2022. This is equivalent to the standard
2234 ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2235
2236 ISO2022 provides many mechanisms to encode several character sets
2237 in 7-bit and 8-bit environments. For 7-bit environments, all text
2238 is encoded using bytes less than 128. This may make the encoded
2239 text a little bit longer, but the text passes more easily through
2240 several types of gateway, some of which strip off the MSB (Most
2241 Significant Bit).
2242
2243 There are two kinds of character sets: control character sets and
2244 graphic character sets. The former contain control characters such
2245 as `newline' and `escape' to provide control functions (control
2246 functions are also provided by escape sequences). The latter
2247 contain graphic characters such as 'A' and '-'. Emacs recognizes
2248 two control character sets and many graphic character sets.
2249
2250 Graphic character sets are classified into one of the following
2251 four classes, according to the number of bytes (DIMENSION) and
2252 number of characters in one dimension (CHARS) of the set:
2253 - DIMENSION1_CHARS94
2254 - DIMENSION1_CHARS96
2255 - DIMENSION2_CHARS94
2256 - DIMENSION2_CHARS96
2257
2258 In addition, each character set is assigned an identification tag,
2259 unique for each set, called the "final character" (denoted as <F>
2260 hereafter). The <F> of each character set is decided by ECMA(*)
2261 when it is registered in ISO. The code range of <F> is 0x30..0x7F
2262 (0x30..0x3F are for private use only).
2263
2264 Note (*): ECMA = European Computer Manufacturers Association
2265
2266 Here are examples of graphic character sets [NAME(<F>)]:
2267 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2268 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2269 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2270 o DIMENSION2_CHARS96 -- none for the moment
2271
2272 A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2273 C0 [0x00..0x1F] -- control character plane 0
2274 GL [0x20..0x7F] -- graphic character plane 0
2275 C1 [0x80..0x9F] -- control character plane 1
2276 GR [0xA0..0xFF] -- graphic character plane 1
2277
2278 A control character set is directly designated and invoked to C0 or
2279 C1 by an escape sequence. The most common case is that:
2280 - ISO646's control character set is designated/invoked to C0, and
2281 - ISO6429's control character set is designated/invoked to C1,
2282 and usually these designations/invocations are omitted in encoded
2283 text. In a 7-bit environment, only C0 can be used, and a control
2284 character for C1 is encoded by an appropriate escape sequence to
2285 fit into the environment. All control characters for C1 are
2286 defined to have corresponding escape sequences.
2287
2288 A graphic character set is at first designated to one of four
2289 graphic registers (G0 through G3), then these graphic registers are
2290 invoked to GL or GR. These designations and invocations can be
2291 done independently. The most common case is that G0 is invoked to
2292 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
2293 these invocations and designations are omitted in encoded text.
2294 In a 7-bit environment, only GL can be used.
2295
2296 When a graphic character set of CHARS94 is invoked to GL, codes
2297 0x20 and 0x7F of the GL area work as control characters SPACE and
2298 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2299 be used.
2300
2301 There are two ways of invocation: locking-shift and single-shift.
2302 With locking-shift, the invocation lasts until the next different
2303 invocation, whereas with single-shift, the invocation affects the
2304 following character only and doesn't affect the locking-shift
2305 state. Invocations are done by the following control characters or
2306 escape sequences:
2307
2308 ----------------------------------------------------------------------
2309 abbrev function cntrl escape seq description
2310 ----------------------------------------------------------------------
2311 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
2312 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
2313 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
2314 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
2315 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
2316 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
2317 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
2318 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
2319 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
2320 ----------------------------------------------------------------------
2321 (*) These are not used by any known coding system.
2322
2323 Control characters for these functions are defined by macros
2324 ISO_CODE_XXX in `coding.h'.
2325
2326 Designations are done by the following escape sequences:
2327 ----------------------------------------------------------------------
2328 escape sequence description
2329 ----------------------------------------------------------------------
2330 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
2331 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
2332 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
2333 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
2334 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
2335 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
2336 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
2337 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
2338 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
2339 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
2340 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
2341 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
2342 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
2343 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
2344 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
2345 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
2346 ----------------------------------------------------------------------
2347
2348 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2349 of dimension 1, chars 94, and final character <F>, etc...
2350
2351 Note (*): Although these designations are not allowed in ISO2022,
2352 Emacs accepts them on decoding, and produces them on encoding
2353 CHARS96 character sets in a coding system which is characterized as
2354 7-bit environment, non-locking-shift, and non-single-shift.
2355
2356 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2357 '(' must be omitted. We refer to this as "short-form" hereafter.
2358
2359 Now you may notice that there are a lot of ways of encoding the
2360 same multilingual text in ISO2022. Actually, there exist many
2361 coding systems such as Compound Text (used in X11's inter client
2362 communication, ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2363 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2364 localized platforms), and all of these are variants of ISO2022.
2365
2366 In addition to the above, Emacs handles two more kinds of escape
2367 sequences: ISO6429's direction specification and Emacs' private
2368 sequence for specifying character composition.
2369
2370 ISO6429's direction specification takes the following form:
2371 o CSI ']' -- end of the current direction
2372 o CSI '0' ']' -- end of the current direction
2373 o CSI '1' ']' -- start of left-to-right text
2374 o CSI '2' ']' -- start of right-to-left text
2375 The control character CSI (0x9B: control sequence introducer) is
2376 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2377
2378 Character composition specification takes the following form:
2379 o ESC '0' -- start relative composition
2380 o ESC '1' -- end composition
2381 o ESC '2' -- start rule-base composition (*)
2382 o ESC '3' -- start relative composition with alternate chars (**)
2383 o ESC '4' -- start rule-base composition with alternate chars (**)
2384 Since these are not standard escape sequences of any ISO standard,
2385 the use of them with these meanings is restricted to Emacs only.
2386
2387 (*) This form is used only in Emacs 20.7 and older versions,
2388 but newer versions can safely decode it.
2389 (**) This form is used only in Emacs 21.1 and newer versions,
2390 and older versions can't decode it.
2391
2392 Here's a list of example usages of these composition escape
2393 sequences (categorized by `enum composition_method').
2394
2395 COMPOSITION_RELATIVE:
2396 ESC 0 CHAR [ CHAR ] ESC 1
2397 COMPOSITION_WITH_RULE:
2398 ESC 2 CHAR [ RULE CHAR ] ESC 1
2399 COMPOSITION_WITH_ALTCHARS:
2400 ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2401 COMPOSITION_WITH_RULE_ALTCHARS:
2402 ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2403
2404 enum iso_code_class_type iso_code_class[256];
2405
2406 #define SAFE_CHARSET_P(coding, id) \
2407 ((id) <= (coding)->max_charset_id \
2408 && (coding)->safe_charsets[id] >= 0)
2409
2410
2411 #define SHIFT_OUT_OK(category) \
2412 (CODING_ISO_INITIAL (&coding_categories[category], 1) >= 0)
2413
2414 static void
2415 setup_iso_safe_charsets (attrs)
2416 Lisp_Object attrs;
2417 {
2418 Lisp_Object charset_list, safe_charsets;
2419 Lisp_Object request;
2420 Lisp_Object reg_usage;
2421 Lisp_Object tail;
2422 int reg94, reg96;
2423 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2424 int max_charset_id;
2425
2426 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2427 if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2428 && ! EQ (charset_list, Viso_2022_charset_list))
2429 {
2430 CODING_ATTR_CHARSET_LIST (attrs)
2431 = charset_list = Viso_2022_charset_list;
2432 ASET (attrs, coding_attr_safe_charsets, Qnil);
2433 }
2434
2435 if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2436 return;
2437
2438 max_charset_id = 0;
2439 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2440 {
2441 int id = XINT (XCAR (tail));
2442 if (max_charset_id < id)
2443 max_charset_id = id;
2444 }
2445
2446 safe_charsets = Fmake_string (make_number (max_charset_id + 1),
2447 make_number (255));
2448 request = AREF (attrs, coding_attr_iso_request);
2449 reg_usage = AREF (attrs, coding_attr_iso_usage);
2450 reg94 = XINT (XCAR (reg_usage));
2451 reg96 = XINT (XCDR (reg_usage));
2452
2453 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2454 {
2455 Lisp_Object id;
2456 Lisp_Object reg;
2457 struct charset *charset;
2458
2459 id = XCAR (tail);
2460 charset = CHARSET_FROM_ID (XINT (id));
2461 reg = Fcdr (Fassq (id, request));
2462 if (! NILP (reg))
2463 SSET (safe_charsets, XINT (id), XINT (reg));
2464 else if (charset->iso_chars_96)
2465 {
2466 if (reg96 < 4)
2467 SSET (safe_charsets, XINT (id), reg96);
2468 }
2469 else
2470 {
2471 if (reg94 < 4)
2472 SSET (safe_charsets, XINT (id), reg94);
2473 }
2474 }
2475 ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2476 }
2477
2478
2479 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2480 Check if a text is encoded in one of ISO-2022 based codig systems.
2481 If it is, return 1, else return 0. */
2482
2483 static int
2484 detect_coding_iso_2022 (coding, detect_info)
2485 struct coding_system *coding;
2486 struct coding_detection_info *detect_info;
2487 {
2488 const unsigned char *src = coding->source, *src_base = src;
2489 const unsigned char *src_end = coding->source + coding->src_bytes;
2490 int multibytep = coding->src_multibyte;
2491 int single_shifting = 0;
2492 int id;
2493 int c, c1;
2494 int consumed_chars = 0;
2495 int i;
2496 int rejected = 0;
2497 int found = 0;
2498
2499 detect_info->checked |= CATEGORY_MASK_ISO;
2500
2501 for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
2502 {
2503 struct coding_system *this = &(coding_categories[i]);
2504 Lisp_Object attrs, val;
2505
2506 attrs = CODING_ID_ATTRS (this->id);
2507 if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2508 && ! EQ (CODING_ATTR_SAFE_CHARSETS (attrs), Viso_2022_charset_list))
2509 setup_iso_safe_charsets (attrs);
2510 val = CODING_ATTR_SAFE_CHARSETS (attrs);
2511 this->max_charset_id = SCHARS (val) - 1;
2512 this->safe_charsets = (char *) SDATA (val);
2513 }
2514
2515 /* A coding system of this category is always ASCII compatible. */
2516 src += coding->head_ascii;
2517
2518 while (rejected != CATEGORY_MASK_ISO)
2519 {
2520 src_base = src;
2521 ONE_MORE_BYTE (c);
2522 switch (c)
2523 {
2524 case ISO_CODE_ESC:
2525 if (inhibit_iso_escape_detection)
2526 break;
2527 single_shifting = 0;
2528 ONE_MORE_BYTE (c);
2529 if (c >= '(' && c <= '/')
2530 {
2531 /* Designation sequence for a charset of dimension 1. */
2532 ONE_MORE_BYTE (c1);
2533 if (c1 < ' ' || c1 >= 0x80
2534 || (id = iso_charset_table[0][c >= ','][c1]) < 0)
2535 /* Invalid designation sequence. Just ignore. */
2536 break;
2537 }
2538 else if (c == '$')
2539 {
2540 /* Designation sequence for a charset of dimension 2. */
2541 ONE_MORE_BYTE (c);
2542 if (c >= '@' && c <= 'B')
2543 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
2544 id = iso_charset_table[1][0][c];
2545 else if (c >= '(' && c <= '/')
2546 {
2547 ONE_MORE_BYTE (c1);
2548 if (c1 < ' ' || c1 >= 0x80
2549 || (id = iso_charset_table[1][c >= ','][c1]) < 0)
2550 /* Invalid designation sequence. Just ignore. */
2551 break;
2552 }
2553 else
2554 /* Invalid designation sequence. Just ignore it. */
2555 break;
2556 }
2557 else if (c == 'N' || c == 'O')
2558 {
2559 /* ESC <Fe> for SS2 or SS3. */
2560 single_shifting = 1;
2561 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
2562 break;
2563 }
2564 else if (c >= '0' && c <= '4')
2565 {
2566 /* ESC <Fp> for start/end composition. */
2567 found |= CATEGORY_MASK_ISO;
2568 break;
2569 }
2570 else
2571 {
2572 /* Invalid escape sequence. Just ignore it. */
2573 break;
2574 }
2575
2576 /* We found a valid designation sequence for CHARSET. */
2577 rejected |= CATEGORY_MASK_ISO_8BIT;
2578 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
2579 id))
2580 found |= CATEGORY_MASK_ISO_7;
2581 else
2582 rejected |= CATEGORY_MASK_ISO_7;
2583 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
2584 id))
2585 found |= CATEGORY_MASK_ISO_7_TIGHT;
2586 else
2587 rejected |= CATEGORY_MASK_ISO_7_TIGHT;
2588 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
2589 id))
2590 found |= CATEGORY_MASK_ISO_7_ELSE;
2591 else
2592 rejected |= CATEGORY_MASK_ISO_7_ELSE;
2593 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
2594 id))
2595 found |= CATEGORY_MASK_ISO_8_ELSE;
2596 else
2597 rejected |= CATEGORY_MASK_ISO_8_ELSE;
2598 break;
2599
2600 case ISO_CODE_SO:
2601 case ISO_CODE_SI:
2602 /* Locking shift out/in. */
2603 if (inhibit_iso_escape_detection)
2604 break;
2605 single_shifting = 0;
2606 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
2607 found |= CATEGORY_MASK_ISO_ELSE;
2608 break;
2609
2610 case ISO_CODE_CSI:
2611 /* Control sequence introducer. */
2612 single_shifting = 0;
2613 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
2614 found |= CATEGORY_MASK_ISO_8_ELSE;
2615 goto check_extra_latin;
2616
2617
2618 case ISO_CODE_SS2:
2619 case ISO_CODE_SS3:
2620 /* Single shift. */
2621 if (inhibit_iso_escape_detection)
2622 break;
2623 single_shifting = 1;
2624 rejected |= CATEGORY_MASK_ISO_7BIT;
2625 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2626 & CODING_ISO_FLAG_SINGLE_SHIFT)
2627 found |= CATEGORY_MASK_ISO_8_1;
2628 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
2629 & CODING_ISO_FLAG_SINGLE_SHIFT)
2630 found |= CATEGORY_MASK_ISO_8_2;
2631 goto check_extra_latin;
2632
2633 default:
2634 if (c < 0)
2635 continue;
2636 if (c < 0x80)
2637 {
2638 single_shifting = 0;
2639 break;
2640 }
2641 if (c >= 0xA0)
2642 {
2643 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
2644 found |= CATEGORY_MASK_ISO_8_1;
2645 /* Check the length of succeeding codes of the range
2646 0xA0..0FF. If the byte length is even, we include
2647 CATEGORY_MASK_ISO_8_2 in `found'. We can check this
2648 only when we are not single shifting. */
2649 if (! single_shifting
2650 && ! (rejected & CATEGORY_MASK_ISO_8_2))
2651 {
2652 int i = 1;
2653 while (src < src_end)
2654 {
2655 ONE_MORE_BYTE (c);
2656 if (c < 0xA0)
2657 break;
2658 i++;
2659 }
2660
2661 if (i & 1 && src < src_end)
2662 rejected |= CATEGORY_MASK_ISO_8_2;
2663 else
2664 found |= CATEGORY_MASK_ISO_8_2;
2665 }
2666 break;
2667 }
2668 check_extra_latin:
2669 single_shifting = 0;
2670 if (! VECTORP (Vlatin_extra_code_table)
2671 || NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
2672 {
2673 rejected = CATEGORY_MASK_ISO;
2674 break;
2675 }
2676 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2677 & CODING_ISO_FLAG_LATIN_EXTRA)
2678 found |= CATEGORY_MASK_ISO_8_1;
2679 else
2680 rejected |= CATEGORY_MASK_ISO_8_1;
2681 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
2682 & CODING_ISO_FLAG_LATIN_EXTRA)
2683 found |= CATEGORY_MASK_ISO_8_2;
2684 else
2685 rejected |= CATEGORY_MASK_ISO_8_2;
2686 }
2687 }
2688 detect_info->rejected |= CATEGORY_MASK_ISO;
2689 return 0;
2690
2691 no_more_source:
2692 detect_info->rejected |= rejected;
2693 detect_info->found |= (found & ~rejected);
2694 return 1;
2695 }
2696
2697
2698 /* Set designation state into CODING. */
2699 #define DECODE_DESIGNATION(reg, dim, chars_96, final) \
2700 do { \
2701 int id, prev; \
2702 \
2703 if (final < '0' || final >= 128 \
2704 || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0) \
2705 || !SAFE_CHARSET_P (coding, id)) \
2706 { \
2707 CODING_ISO_DESIGNATION (coding, reg) = -2; \
2708 goto invalid_code; \
2709 } \
2710 prev = CODING_ISO_DESIGNATION (coding, reg); \
2711 if (id == charset_jisx0201_roman) \
2712 { \
2713 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN) \
2714 id = charset_ascii; \
2715 } \
2716 else if (id == charset_jisx0208_1978) \
2717 { \
2718 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
2719 id = charset_jisx0208; \
2720 } \
2721 CODING_ISO_DESIGNATION (coding, reg) = id; \
2722 /* If there was an invalid designation to REG previously, and this \
2723 designation is ASCII to REG, we should keep this designation \
2724 sequence. */ \
2725 if (prev == -2 && id == charset_ascii) \
2726 goto invalid_code; \
2727 } while (0)
2728
2729
2730 #define MAYBE_FINISH_COMPOSITION() \
2731 do { \
2732 int i; \
2733 if (composition_state == COMPOSING_NO) \
2734 break; \
2735 /* It is assured that we have enough room for producing \
2736 characters stored in the table `components'. */ \
2737 if (charbuf + component_idx > charbuf_end) \
2738 goto no_more_source; \
2739 composition_state = COMPOSING_NO; \
2740 if (method == COMPOSITION_RELATIVE \
2741 || method == COMPOSITION_WITH_ALTCHARS) \
2742 { \
2743 for (i = 0; i < component_idx; i++) \
2744 *charbuf++ = components[i]; \
2745 char_offset += component_idx; \
2746 } \
2747 else \
2748 { \
2749 for (i = 0; i < component_idx; i += 2) \
2750 *charbuf++ = components[i]; \
2751 char_offset += (component_idx / 2) + 1; \
2752 } \
2753 } while (0)
2754
2755
2756 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
2757 ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
2758 ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
2759 ESC 3 : altchar composition : ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
2760 ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1
2761 */
2762
2763 #define DECODE_COMPOSITION_START(c1) \
2764 do { \
2765 if (c1 == '0' \
2766 && composition_state == COMPOSING_COMPONENT_RULE) \
2767 { \
2768 component_len = component_idx; \
2769 composition_state = COMPOSING_CHAR; \
2770 } \
2771 else \
2772 { \
2773 const unsigned char *p; \
2774 \
2775 MAYBE_FINISH_COMPOSITION (); \
2776 if (charbuf + MAX_COMPOSITION_COMPONENTS > charbuf_end) \
2777 goto no_more_source; \
2778 for (p = src; p < src_end - 1; p++) \
2779 if (*p == ISO_CODE_ESC && p[1] == '1') \
2780 break; \
2781 if (p == src_end - 1) \
2782 { \
2783 if (coding->mode & CODING_MODE_LAST_BLOCK) \
2784 goto invalid_code; \
2785 goto no_more_source; \
2786 } \
2787 \
2788 /* This is surely the start of a composition. */ \
2789 method = (c1 == '0' ? COMPOSITION_RELATIVE \
2790 : c1 == '2' ? COMPOSITION_WITH_RULE \
2791 : c1 == '3' ? COMPOSITION_WITH_ALTCHARS \
2792 : COMPOSITION_WITH_RULE_ALTCHARS); \
2793 composition_state = (c1 <= '2' ? COMPOSING_CHAR \
2794 : COMPOSING_COMPONENT_CHAR); \
2795 component_idx = component_len = 0; \
2796 } \
2797 } while (0)
2798
2799
2800 /* Handle compositoin end sequence ESC 1. */
2801
2802 #define DECODE_COMPOSITION_END() \
2803 do { \
2804 int nchars = (component_len > 0 ? component_idx - component_len \
2805 : method == COMPOSITION_RELATIVE ? component_idx \
2806 : (component_idx + 1) / 2); \
2807 int i; \
2808 int *saved_charbuf = charbuf; \
2809 int from = char_offset; \
2810 int to = from + nchars; \
2811 \
2812 ADD_COMPOSITION_DATA (charbuf, from, to, method); \
2813 if (method != COMPOSITION_RELATIVE) \
2814 { \
2815 if (component_len == 0) \
2816 for (i = 0; i < component_idx; i++) \
2817 *charbuf++ = components[i]; \
2818 else \
2819 for (i = 0; i < component_len; i++) \
2820 *charbuf++ = components[i]; \
2821 *saved_charbuf = saved_charbuf - charbuf; \
2822 } \
2823 if (method == COMPOSITION_WITH_RULE) \
2824 for (i = 0; i < component_idx; i += 2, char_offset++) \
2825 *charbuf++ = components[i]; \
2826 else \
2827 for (i = component_len; i < component_idx; i++, char_offset++) \
2828 *charbuf++ = components[i]; \
2829 coding->annotated = 1; \
2830 composition_state = COMPOSING_NO; \
2831 } while (0)
2832
2833
2834 /* Decode a composition rule from the byte C1 (and maybe one more byte
2835 from SRC) and store one encoded composition rule in
2836 coding->cmp_data. */
2837
2838 #define DECODE_COMPOSITION_RULE(c1) \
2839 do { \
2840 (c1) -= 32; \
2841 if (c1 < 81) /* old format (before ver.21) */ \
2842 { \
2843 int gref = (c1) / 9; \
2844 int nref = (c1) % 9; \
2845 if (gref == 4) gref = 10; \
2846 if (nref == 4) nref = 10; \
2847 c1 = COMPOSITION_ENCODE_RULE (gref, nref); \
2848 } \
2849 else if (c1 < 93) /* new format (after ver.21) */ \
2850 { \
2851 ONE_MORE_BYTE (c2); \
2852 c1 = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32); \
2853 } \
2854 else \
2855 c1 = 0; \
2856 } while (0)
2857
2858
2859 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
2860
2861 static void
2862 decode_coding_iso_2022 (coding)
2863 struct coding_system *coding;
2864 {
2865 const unsigned char *src = coding->source + coding->consumed;
2866 const unsigned char *src_end = coding->source + coding->src_bytes;
2867 const unsigned char *src_base;
2868 int *charbuf = coding->charbuf;
2869 int *charbuf_end
2870 = charbuf + coding->charbuf_size - 4 - MAX_ANNOTATION_LENGTH;
2871 int consumed_chars = 0, consumed_chars_base;
2872 int multibytep = coding->src_multibyte;
2873 /* Charsets invoked to graphic plane 0 and 1 respectively. */
2874 int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2875 int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
2876 struct charset *charset;
2877 int c;
2878 /* For handling composition sequence. */
2879 #define COMPOSING_NO 0
2880 #define COMPOSING_CHAR 1
2881 #define COMPOSING_RULE 2
2882 #define COMPOSING_COMPONENT_CHAR 3
2883 #define COMPOSING_COMPONENT_RULE 4
2884
2885 int composition_state = COMPOSING_NO;
2886 enum composition_method method;
2887 int components[MAX_COMPOSITION_COMPONENTS * 2 + 1];
2888 int component_idx;
2889 int component_len;
2890 Lisp_Object attrs, charset_list;
2891 int char_offset = coding->produced_char;
2892 int last_offset = char_offset;
2893 int last_id = charset_ascii;
2894
2895 CODING_GET_INFO (coding, attrs, charset_list);
2896 setup_iso_safe_charsets (attrs);
2897
2898 while (1)
2899 {
2900 int c1, c2;
2901
2902 src_base = src;
2903 consumed_chars_base = consumed_chars;
2904
2905 if (charbuf >= charbuf_end)
2906 break;
2907
2908 ONE_MORE_BYTE (c1);
2909 if (c1 < 0)
2910 goto invalid_code;
2911
2912 /* We produce at most one character. */
2913 switch (iso_code_class [c1])
2914 {
2915 case ISO_0x20_or_0x7F:
2916 if (composition_state != COMPOSING_NO)
2917 {
2918 if (composition_state == COMPOSING_RULE
2919 || composition_state == COMPOSING_COMPONENT_RULE)
2920 {
2921 DECODE_COMPOSITION_RULE (c1);
2922 components[component_idx++] = c1;
2923 composition_state--;
2924 continue;
2925 }
2926 }
2927 if (charset_id_0 < 0
2928 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
2929 /* This is SPACE or DEL. */
2930 charset = CHARSET_FROM_ID (charset_ascii);
2931 else
2932 charset = CHARSET_FROM_ID (charset_id_0);
2933 break;
2934
2935 case ISO_graphic_plane_0:
2936 if (composition_state != COMPOSING_NO)
2937 {
2938 if (composition_state == COMPOSING_RULE
2939 || composition_state == COMPOSING_COMPONENT_RULE)
2940 {
2941 DECODE_COMPOSITION_RULE (c1);
2942 components[component_idx++] = c1;
2943 composition_state--;
2944 continue;
2945 }
2946 }
2947 charset = CHARSET_FROM_ID (charset_id_0);
2948 break;
2949
2950 case ISO_0xA0_or_0xFF:
2951 if (charset_id_1 < 0
2952 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
2953 || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
2954 goto invalid_code;
2955 /* This is a graphic character, we fall down ... */
2956
2957 case ISO_graphic_plane_1:
2958 if (charset_id_1 < 0)
2959 goto invalid_code;
2960 charset = CHARSET_FROM_ID (charset_id_1);
2961 break;
2962
2963 case ISO_control_0:
2964 MAYBE_FINISH_COMPOSITION ();
2965 charset = CHARSET_FROM_ID (charset_ascii);
2966 break;
2967
2968 case ISO_control_1:
2969 MAYBE_FINISH_COMPOSITION ();
2970 goto invalid_code;
2971
2972 case ISO_shift_out:
2973 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
2974 || CODING_ISO_DESIGNATION (coding, 1) < 0)
2975 goto invalid_code;
2976 CODING_ISO_INVOCATION (coding, 0) = 1;
2977 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2978 continue;
2979
2980 case ISO_shift_in:
2981 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
2982 goto invalid_code;
2983 CODING_ISO_INVOCATION (coding, 0) = 0;
2984 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2985 continue;
2986
2987 case ISO_single_shift_2_7:
2988 case ISO_single_shift_2:
2989 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
2990 goto invalid_code;
2991 /* SS2 is handled as an escape sequence of ESC 'N' */
2992 c1 = 'N';
2993 goto label_escape_sequence;
2994
2995 case ISO_single_shift_3:
2996 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
2997 goto invalid_code;
2998 /* SS2 is handled as an escape sequence of ESC 'O' */
2999 c1 = 'O';
3000 goto label_escape_sequence;
3001
3002 case ISO_control_sequence_introducer:
3003 /* CSI is handled as an escape sequence of ESC '[' ... */
3004 c1 = '[';
3005 goto label_escape_sequence;
3006
3007 case ISO_escape:
3008 ONE_MORE_BYTE (c1);
3009 label_escape_sequence:
3010 /* Escape sequences handled here are invocation,
3011 designation, direction specification, and character
3012 composition specification. */
3013 switch (c1)
3014 {
3015 case '&': /* revision of following character set */
3016 ONE_MORE_BYTE (c1);
3017 if (!(c1 >= '@' && c1 <= '~'))
3018 goto invalid_code;
3019 ONE_MORE_BYTE (c1);
3020 if (c1 != ISO_CODE_ESC)
3021 goto invalid_code;
3022 ONE_MORE_BYTE (c1);
3023 goto label_escape_sequence;
3024
3025 case '$': /* designation of 2-byte character set */
3026 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3027 goto invalid_code;
3028 ONE_MORE_BYTE (c1);
3029 if (c1 >= '@' && c1 <= 'B')
3030 { /* designation of JISX0208.1978, GB2312.1980,
3031 or JISX0208.1980 */
3032 DECODE_DESIGNATION (0, 2, 0, c1);
3033 }
3034 else if (c1 >= 0x28 && c1 <= 0x2B)
3035 { /* designation of DIMENSION2_CHARS94 character set */
3036 ONE_MORE_BYTE (c2);
3037 DECODE_DESIGNATION (c1 - 0x28, 2, 0, c2);
3038 }
3039 else if (c1 >= 0x2C && c1 <= 0x2F)
3040 { /* designation of DIMENSION2_CHARS96 character set */
3041 ONE_MORE_BYTE (c2);
3042 DECODE_DESIGNATION (c1 - 0x2C, 2, 1, c2);
3043 }
3044 else
3045 goto invalid_code;
3046 /* We must update these variables now. */
3047 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3048 charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3049 continue;
3050
3051 case 'n': /* invocation of locking-shift-2 */
3052 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3053 || CODING_ISO_DESIGNATION (coding, 2) < 0)
3054 goto invalid_code;
3055 CODING_ISO_INVOCATION (coding, 0) = 2;
3056 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3057 continue;
3058
3059 case 'o': /* invocation of locking-shift-3 */
3060 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3061 || CODING_ISO_DESIGNATION (coding, 3) < 0)
3062 goto invalid_code;
3063 CODING_ISO_INVOCATION (coding, 0) = 3;
3064 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3065 continue;
3066
3067 case 'N': /* invocation of single-shift-2 */
3068 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3069 || CODING_ISO_DESIGNATION (coding, 2) < 0)
3070 goto invalid_code;
3071 charset = CHARSET_FROM_ID (CODING_ISO_DESIGNATION (coding, 2));
3072 ONE_MORE_BYTE (c1);
3073 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3074 goto invalid_code;
3075 break;
3076
3077 case 'O': /* invocation of single-shift-3 */
3078 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3079 || CODING_ISO_DESIGNATION (coding, 3) < 0)
3080 goto invalid_code;
3081 charset = CHARSET_FROM_ID (CODING_ISO_DESIGNATION (coding, 3));
3082 ONE_MORE_BYTE (c1);
3083 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3084 goto invalid_code;
3085 break;
3086
3087 case '0': case '2': case '3': case '4': /* start composition */
3088 if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3089 goto invalid_code;
3090 DECODE_COMPOSITION_START (c1);
3091 continue;
3092
3093 case '1': /* end composition */
3094 if (composition_state == COMPOSING_NO)
3095 goto invalid_code;
3096 DECODE_COMPOSITION_END ();
3097 continue;
3098
3099 case '[': /* specification of direction */
3100 if (! CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION)
3101 goto invalid_code;
3102 /* For the moment, nested direction is not supported.
3103 So, `coding->mode & CODING_MODE_DIRECTION' zero means
3104 left-to-right, and nozero means right-to-left. */
3105 ONE_MORE_BYTE (c1);
3106 switch (c1)
3107 {
3108 case ']': /* end of the current direction */
3109 coding->mode &= ~CODING_MODE_DIRECTION;
3110
3111 case '0': /* end of the current direction */
3112 case '1': /* start of left-to-right direction */
3113 ONE_MORE_BYTE (c1);
3114 if (c1 == ']')
3115 coding->mode &= ~CODING_MODE_DIRECTION;
3116 else
3117 goto invalid_code;
3118 break;
3119
3120 case '2': /* start of right-to-left direction */
3121 ONE_MORE_BYTE (c1);
3122 if (c1 == ']')
3123 coding->mode |= CODING_MODE_DIRECTION;
3124 else
3125 goto invalid_code;
3126 break;
3127
3128 default:
3129 goto invalid_code;
3130 }
3131 continue;
3132
3133 case '%':
3134 ONE_MORE_BYTE (c1);
3135 if (c1 == '/')
3136 {
3137 /* CTEXT extended segment:
3138 ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3139 We keep these bytes as is for the moment.
3140 They may be decoded by post-read-conversion. */
3141 int dim, M, L;
3142 int size;
3143
3144 ONE_MORE_BYTE (dim);
3145 ONE_MORE_BYTE (M);
3146 ONE_MORE_BYTE (L);
3147 size = ((M - 128) * 128) + (L - 128);
3148 if (charbuf + 8 + size > charbuf_end)
3149 goto break_loop;
3150 *charbuf++ = ISO_CODE_ESC;
3151 *charbuf++ = '%';
3152 *charbuf++ = '/';
3153 *charbuf++ = dim;
3154 *charbuf++ = BYTE8_TO_CHAR (M);
3155 *charbuf++ = BYTE8_TO_CHAR (L);
3156 while (size-- > 0)
3157 {
3158 ONE_MORE_BYTE (c1);
3159 *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3160 }
3161 }
3162 else if (c1 == 'G')
3163 {
3164 /* XFree86 extension for embedding UTF-8 in CTEXT:
3165 ESC % G --UTF-8-BYTES-- ESC % @
3166 We keep these bytes as is for the moment.
3167 They may be decoded by post-read-conversion. */
3168 int *p = charbuf;
3169
3170 if (p + 6 > charbuf_end)
3171 goto break_loop;
3172 *p++ = ISO_CODE_ESC;
3173 *p++ = '%';
3174 *p++ = 'G';
3175 while (p < charbuf_end)
3176 {
3177 ONE_MORE_BYTE (c1);
3178 if (c1 == ISO_CODE_ESC
3179 && src + 1 < src_end
3180 && src[0] == '%'
3181 && src[1] == '@')
3182 break;
3183 *p++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3184 }
3185 if (p + 3 > charbuf_end)
3186 goto break_loop;
3187 *p++ = ISO_CODE_ESC;
3188 *p++ = '%';
3189 *p++ = '@';
3190 charbuf = p;
3191 }
3192 else
3193 goto invalid_code;
3194 continue;
3195 break;
3196
3197 default:
3198 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3199 goto invalid_code;
3200 if (c1 >= 0x28 && c1 <= 0x2B)
3201 { /* designation of DIMENSION1_CHARS94 character set */
3202 ONE_MORE_BYTE (c2);
3203 DECODE_DESIGNATION (c1 - 0x28, 1, 0, c2);
3204 }
3205 else if (c1 >= 0x2C && c1 <= 0x2F)
3206 { /* designation of DIMENSION1_CHARS96 character set */
3207 ONE_MORE_BYTE (c2);
3208 DECODE_DESIGNATION (c1 - 0x2C, 1, 1, c2);
3209 }
3210 else
3211 goto invalid_code;
3212 /* We must update these variables now. */
3213 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3214 charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3215 continue;
3216 }
3217 }
3218
3219 if (charset->id != charset_ascii
3220 && last_id != charset->id)
3221 {
3222 if (last_id != charset_ascii)
3223 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
3224 last_id = charset->id;
3225 last_offset = char_offset;
3226 }
3227
3228 /* Now we know CHARSET and 1st position code C1 of a character.
3229 Produce a decoded character while getting 2nd position code
3230 C2 if necessary. */
3231 c1 &= 0x7F;
3232 if (CHARSET_DIMENSION (charset) > 1)
3233 {
3234 ONE_MORE_BYTE (c2);
3235 if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
3236 /* C2 is not in a valid range. */
3237 goto invalid_code;
3238 c1 = (c1 << 8) | (c2 & 0x7F);
3239 if (CHARSET_DIMENSION (charset) > 2)
3240 {
3241 ONE_MORE_BYTE (c2);
3242 if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
3243 /* C2 is not in a valid range. */
3244 goto invalid_code;
3245 c1 = (c1 << 8) | (c2 & 0x7F);
3246 }
3247 }
3248
3249 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3250 if (c < 0)
3251 {
3252 MAYBE_FINISH_COMPOSITION ();
3253 for (; src_base < src; src_base++, char_offset++)
3254 {
3255 if (ASCII_BYTE_P (*src_base))
3256 *charbuf++ = *src_base;
3257 else
3258 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3259 }
3260 }
3261 else if (composition_state == COMPOSING_NO)
3262 {
3263 *charbuf++ = c;
3264 char_offset++;
3265 }
3266 else
3267 {
3268 components[component_idx++] = c;
3269 if (method == COMPOSITION_WITH_RULE
3270 || (method == COMPOSITION_WITH_RULE_ALTCHARS
3271 && composition_state == COMPOSING_COMPONENT_CHAR))
3272 composition_state++;
3273 }
3274 continue;
3275
3276 invalid_code:
3277 MAYBE_FINISH_COMPOSITION ();
3278 src = src_base;
3279 consumed_chars = consumed_chars_base;
3280 ONE_MORE_BYTE (c);
3281 *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
3282 char_offset++;
3283 coding->errors++;
3284 continue;
3285
3286 break_loop:
3287 break;
3288 }
3289
3290 no_more_source:
3291 if (last_id != charset_ascii)
3292 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
3293 coding->consumed_char += consumed_chars_base;
3294 coding->consumed = src_base - coding->source;
3295 coding->charbuf_used = charbuf - coding->charbuf;
3296 }
3297
3298
3299 /* ISO2022 encoding stuff. */
3300
3301 /*
3302 It is not enough to say just "ISO2022" on encoding, we have to
3303 specify more details. In Emacs, each coding system of ISO2022
3304 variant has the following specifications:
3305 1. Initial designation to G0 thru G3.
3306 2. Allows short-form designation?
3307 3. ASCII should be designated to G0 before control characters?
3308 4. ASCII should be designated to G0 at end of line?
3309 5. 7-bit environment or 8-bit environment?
3310 6. Use locking-shift?
3311 7. Use Single-shift?
3312 And the following two are only for Japanese:
3313 8. Use ASCII in place of JIS0201-1976-Roman?
3314 9. Use JISX0208-1983 in place of JISX0208-1978?
3315 These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
3316 defined by macros CODING_ISO_FLAG_XXX. See `coding.h' for more
3317 details.
3318 */
3319
3320 /* Produce codes (escape sequence) for designating CHARSET to graphic
3321 register REG at DST, and increment DST. If <final-char> of CHARSET is
3322 '@', 'A', or 'B' and the coding system CODING allows, produce
3323 designation sequence of short-form. */
3324
3325 #define ENCODE_DESIGNATION(charset, reg, coding) \
3326 do { \
3327 unsigned char final_char = CHARSET_ISO_FINAL (charset); \
3328 char *intermediate_char_94 = "()*+"; \
3329 char *intermediate_char_96 = ",-./"; \
3330 int revision = -1; \
3331 int c; \
3332 \
3333 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION) \
3334 revision = CHARSET_ISO_REVISION (charset); \
3335 \
3336 if (revision >= 0) \
3337 { \
3338 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&'); \
3339 EMIT_ONE_BYTE ('@' + revision); \
3340 } \
3341 EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC); \
3342 if (CHARSET_DIMENSION (charset) == 1) \
3343 { \
3344 if (! CHARSET_ISO_CHARS_96 (charset)) \
3345 c = intermediate_char_94[reg]; \
3346 else \
3347 c = intermediate_char_96[reg]; \
3348 EMIT_ONE_ASCII_BYTE (c); \
3349 } \
3350 else \
3351 { \
3352 EMIT_ONE_ASCII_BYTE ('$'); \
3353 if (! CHARSET_ISO_CHARS_96 (charset)) \
3354 { \
3355 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM \
3356 || reg != 0 \
3357 || final_char < '@' || final_char > 'B') \
3358 EMIT_ONE_ASCII_BYTE (intermediate_char_94[reg]); \
3359 } \
3360 else \
3361 EMIT_ONE_ASCII_BYTE (intermediate_char_96[reg]); \
3362 } \
3363 EMIT_ONE_ASCII_BYTE (final_char); \
3364 \
3365 CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset); \
3366 } while (0)
3367
3368
3369 /* The following two macros produce codes (control character or escape
3370 sequence) for ISO2022 single-shift functions (single-shift-2 and
3371 single-shift-3). */
3372
3373 #define ENCODE_SINGLE_SHIFT_2 \
3374 do { \
3375 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3376 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N'); \
3377 else \
3378 EMIT_ONE_BYTE (ISO_CODE_SS2); \
3379 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \
3380 } while (0)
3381
3382
3383 #define ENCODE_SINGLE_SHIFT_3 \
3384 do { \
3385 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3386 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O'); \
3387 else \
3388 EMIT_ONE_BYTE (ISO_CODE_SS3); \
3389 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \
3390 } while (0)
3391
3392
3393 /* The following four macros produce codes (control character or
3394 escape sequence) for ISO2022 locking-shift functions (shift-in,
3395 shift-out, locking-shift-2, and locking-shift-3). */
3396
3397 #define ENCODE_SHIFT_IN \
3398 do { \
3399 EMIT_ONE_ASCII_BYTE (ISO_CODE_SI); \
3400 CODING_ISO_INVOCATION (coding, 0) = 0; \
3401 } while (0)
3402
3403
3404 #define ENCODE_SHIFT_OUT \
3405 do { \
3406 EMIT_ONE_ASCII_BYTE (ISO_CODE_SO); \
3407 CODING_ISO_INVOCATION (coding, 0) = 1; \
3408 } while (0)
3409
3410
3411 #define ENCODE_LOCKING_SHIFT_2 \
3412 do { \
3413 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n'); \
3414 CODING_ISO_INVOCATION (coding, 0) = 2; \
3415 } while (0)
3416
3417
3418 #define ENCODE_LOCKING_SHIFT_3 \
3419 do { \
3420 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n'); \
3421 CODING_ISO_INVOCATION (coding, 0) = 3; \
3422 } while (0)
3423
3424
3425 /* Produce codes for a DIMENSION1 character whose character set is
3426 CHARSET and whose position-code is C1. Designation and invocation
3427 sequences are also produced in advance if necessary. */
3428
3429 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1) \
3430 do { \
3431 int id = CHARSET_ID (charset); \
3432 \
3433 if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN) \
3434 && id == charset_ascii) \
3435 { \
3436 id = charset_jisx0201_roman; \
3437 charset = CHARSET_FROM_ID (id); \
3438 } \
3439 \
3440 if (CODING_ISO_SINGLE_SHIFTING (coding)) \
3441 { \
3442 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3443 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \
3444 else \
3445 EMIT_ONE_BYTE (c1 | 0x80); \
3446 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \
3447 break; \
3448 } \
3449 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \
3450 { \
3451 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \
3452 break; \
3453 } \
3454 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \
3455 { \
3456 EMIT_ONE_BYTE (c1 | 0x80); \
3457 break; \
3458 } \
3459 else \
3460 /* Since CHARSET is not yet invoked to any graphic planes, we \
3461 must invoke it, or, at first, designate it to some graphic \
3462 register. Then repeat the loop to actually produce the \
3463 character. */ \
3464 dst = encode_invocation_designation (charset, coding, dst, \
3465 &produced_chars); \
3466 } while (1)
3467
3468
3469 /* Produce codes for a DIMENSION2 character whose character set is
3470 CHARSET and whose position-codes are C1 and C2. Designation and
3471 invocation codes are also produced in advance if necessary. */
3472
3473 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2) \
3474 do { \
3475 int id = CHARSET_ID (charset); \
3476 \
3477 if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
3478 && id == charset_jisx0208) \
3479 { \
3480 id = charset_jisx0208_1978; \
3481 charset = CHARSET_FROM_ID (id); \
3482 } \
3483 \
3484 if (CODING_ISO_SINGLE_SHIFTING (coding)) \
3485 { \
3486 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3487 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \
3488 else \
3489 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \
3490 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \
3491 break; \
3492 } \
3493 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \
3494 { \
3495 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \
3496 break; \
3497 } \
3498 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \
3499 { \
3500 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \
3501 break; \
3502 } \
3503 else \
3504 /* Since CHARSET is not yet invoked to any graphic planes, we \
3505 must invoke it, or, at first, designate it to some graphic \
3506 register. Then repeat the loop to actually produce the \
3507 character. */ \
3508 dst = encode_invocation_designation (charset, coding, dst, \
3509 &produced_chars); \
3510 } while (1)
3511
3512
3513 #define ENCODE_ISO_CHARACTER(charset, c) \
3514 do { \
3515 int code = ENCODE_CHAR ((charset),(c)); \
3516 \
3517 if (CHARSET_DIMENSION (charset) == 1) \
3518 ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code); \
3519 else \
3520 ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
3521 } while (0)
3522
3523
3524 /* Produce designation and invocation codes at a place pointed by DST
3525 to use CHARSET. The element `spec.iso_2022' of *CODING is updated.
3526 Return new DST. */
3527
3528 unsigned char *
3529 encode_invocation_designation (charset, coding, dst, p_nchars)
3530 struct charset *charset;
3531 struct coding_system *coding;
3532 unsigned char *dst;
3533 int *p_nchars;
3534 {
3535 int multibytep = coding->dst_multibyte;
3536 int produced_chars = *p_nchars;
3537 int reg; /* graphic register number */
3538 int id = CHARSET_ID (charset);
3539
3540 /* At first, check designations. */
3541 for (reg = 0; reg < 4; reg++)
3542 if (id == CODING_ISO_DESIGNATION (coding, reg))
3543 break;
3544
3545 if (reg >= 4)
3546 {
3547 /* CHARSET is not yet designated to any graphic registers. */
3548 /* At first check the requested designation. */
3549 reg = CODING_ISO_REQUEST (coding, id);
3550 if (reg < 0)
3551 /* Since CHARSET requests no special designation, designate it
3552 to graphic register 0. */
3553 reg = 0;
3554
3555 ENCODE_DESIGNATION (charset, reg, coding);
3556 }
3557
3558 if (CODING_ISO_INVOCATION (coding, 0) != reg
3559 && CODING_ISO_INVOCATION (coding, 1) != reg)
3560 {
3561 /* Since the graphic register REG is not invoked to any graphic
3562 planes, invoke it to graphic plane 0. */
3563 switch (reg)
3564 {
3565 case 0: /* graphic register 0 */
3566 ENCODE_SHIFT_IN;
3567 break;
3568
3569 case 1: /* graphic register 1 */
3570 ENCODE_SHIFT_OUT;
3571 break;
3572
3573 case 2: /* graphic register 2 */
3574 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3575 ENCODE_SINGLE_SHIFT_2;
3576 else
3577 ENCODE_LOCKING_SHIFT_2;
3578 break;
3579
3580 case 3: /* graphic register 3 */
3581 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3582 ENCODE_SINGLE_SHIFT_3;
3583 else
3584 ENCODE_LOCKING_SHIFT_3;
3585 break;
3586 }
3587 }
3588
3589 *p_nchars = produced_chars;
3590 return dst;
3591 }
3592
3593 /* The following three macros produce codes for indicating direction
3594 of text. */
3595 #define ENCODE_CONTROL_SEQUENCE_INTRODUCER \
3596 do { \
3597 if (CODING_ISO_FLAGS (coding) == CODING_ISO_FLAG_SEVEN_BITS) \
3598 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '['); \
3599 else \
3600 EMIT_ONE_BYTE (ISO_CODE_CSI); \
3601 } while (0)
3602
3603
3604 #define ENCODE_DIRECTION_R2L() \
3605 do { \
3606 ENCODE_CONTROL_SEQUENCE_INTRODUCER (dst); \
3607 EMIT_TWO_ASCII_BYTES ('2', ']'); \
3608 } while (0)
3609
3610
3611 #define ENCODE_DIRECTION_L2R() \
3612 do { \
3613 ENCODE_CONTROL_SEQUENCE_INTRODUCER (dst); \
3614 EMIT_TWO_ASCII_BYTES ('0', ']'); \
3615 } while (0)
3616
3617
3618 /* Produce codes for designation and invocation to reset the graphic
3619 planes and registers to initial state. */
3620 #define ENCODE_RESET_PLANE_AND_REGISTER() \
3621 do { \
3622 int reg; \
3623 struct charset *charset; \
3624 \
3625 if (CODING_ISO_INVOCATION (coding, 0) != 0) \
3626 ENCODE_SHIFT_IN; \
3627 for (reg = 0; reg < 4; reg++) \
3628 if (CODING_ISO_INITIAL (coding, reg) >= 0 \
3629 && (CODING_ISO_DESIGNATION (coding, reg) \
3630 != CODING_ISO_INITIAL (coding, reg))) \
3631 { \
3632 charset = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg)); \
3633 ENCODE_DESIGNATION (charset, reg, coding); \
3634 } \
3635 } while (0)
3636
3637
3638 /* Produce designation sequences of charsets in the line started from
3639 SRC to a place pointed by DST, and return updated DST.
3640
3641 If the current block ends before any end-of-line, we may fail to
3642 find all the necessary designations. */
3643
3644 static unsigned char *
3645 encode_designation_at_bol (coding, charbuf, charbuf_end, dst)
3646 struct coding_system *coding;
3647 int *charbuf, *charbuf_end;
3648 unsigned char *dst;
3649 {
3650 struct charset *charset;
3651 /* Table of charsets to be designated to each graphic register. */
3652 int r[4];
3653 int c, found = 0, reg;
3654 int produced_chars = 0;
3655 int multibytep = coding->dst_multibyte;
3656 Lisp_Object attrs;
3657 Lisp_Object charset_list;
3658
3659 attrs = CODING_ID_ATTRS (coding->id);
3660 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
3661 if (EQ (charset_list, Qiso_2022))
3662 charset_list = Viso_2022_charset_list;
3663
3664 for (reg = 0; reg < 4; reg++)
3665 r[reg] = -1;
3666
3667 while (found < 4)
3668 {
3669 int id;
3670
3671 c = *charbuf++;
3672 if (c == '\n')
3673 break;
3674 charset = char_charset (c, charset_list, NULL);
3675 id = CHARSET_ID (charset);
3676 reg = CODING_ISO_REQUEST (coding, id);
3677 if (reg >= 0 && r[reg] < 0)
3678 {
3679 found++;
3680 r[reg] = id;
3681 }
3682 }
3683
3684 if (found)
3685 {
3686 for (reg = 0; reg < 4; reg++)
3687 if (r[reg] >= 0
3688 && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
3689 ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
3690 }
3691
3692 return dst;
3693 }
3694
3695 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
3696
3697 static int
3698 encode_coding_iso_2022 (coding)
3699 struct coding_system *coding;
3700 {
3701 int multibytep = coding->dst_multibyte;
3702 int *charbuf = coding->charbuf;
3703 int *charbuf_end = charbuf + coding->charbuf_used;
3704 unsigned char *dst = coding->destination + coding->produced;
3705 unsigned char *dst_end = coding->destination + coding->dst_bytes;
3706 int safe_room = 16;
3707 int bol_designation
3708 = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
3709 && CODING_ISO_BOL (coding));
3710 int produced_chars = 0;
3711 Lisp_Object attrs, eol_type, charset_list;
3712 int ascii_compatible;
3713 int c;
3714 int preferred_charset_id = -1;
3715
3716 CODING_GET_INFO (coding, attrs, charset_list);
3717 eol_type = CODING_ID_EOL_TYPE (coding->id);
3718 if (VECTORP (eol_type))
3719 eol_type = Qunix;
3720
3721 setup_iso_safe_charsets (attrs);
3722 /* Charset list may have been changed. */
3723 charset_list = CODING_ATTR_CHARSET_LIST (attrs); \
3724 coding->safe_charsets = (char *) SDATA (CODING_ATTR_SAFE_CHARSETS(attrs));
3725
3726 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
3727
3728 while (charbuf < charbuf_end)
3729 {
3730 ASSURE_DESTINATION (safe_room);
3731
3732 if (bol_designation)
3733 {
3734 unsigned char *dst_prev = dst;
3735
3736 /* We have to produce designation sequences if any now. */
3737 dst = encode_designation_at_bol (coding, charbuf, charbuf_end, dst);
3738 bol_designation = 0;
3739 /* We are sure that designation sequences are all ASCII bytes. */
3740 produced_chars += dst - dst_prev;
3741 }
3742
3743 c = *charbuf++;
3744
3745 if (c < 0)
3746 {
3747 /* Handle an annotation. */
3748 switch (*charbuf)
3749 {
3750 case CODING_ANNOTATE_COMPOSITION_MASK:
3751 /* Not yet implemented. */
3752 break;
3753 case CODING_ANNOTATE_CHARSET_MASK:
3754 preferred_charset_id = charbuf[3];
3755 if (preferred_charset_id >= 0
3756 && NILP (Fmemq (make_number (preferred_charset_id),
3757 charset_list)))
3758 preferred_charset_id = -1;
3759 break;
3760 default:
3761 abort ();
3762 }
3763 charbuf += -c - 1;
3764 continue;
3765 }
3766
3767 /* Now encode the character C. */
3768 if (c < 0x20 || c == 0x7F)
3769 {
3770 if (c == '\n'
3771 || (c == '\r' && EQ (eol_type, Qmac)))
3772 {
3773 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
3774 ENCODE_RESET_PLANE_AND_REGISTER ();
3775 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
3776 {
3777 int i;
3778
3779 for (i = 0; i < 4; i++)
3780 CODING_ISO_DESIGNATION (coding, i)
3781 = CODING_ISO_INITIAL (coding, i);
3782 }
3783 bol_designation
3784 = CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL;
3785 }
3786 else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
3787 ENCODE_RESET_PLANE_AND_REGISTER ();
3788 EMIT_ONE_ASCII_BYTE (c);
3789 }
3790 else if (ASCII_CHAR_P (c))
3791 {
3792 if (ascii_compatible)
3793 EMIT_ONE_ASCII_BYTE (c);
3794 else
3795 {
3796 struct charset *charset = CHARSET_FROM_ID (charset_ascii);
3797 ENCODE_ISO_CHARACTER (charset, c);
3798 }
3799 }
3800 else if (CHAR_BYTE8_P (c))
3801 {
3802 c = CHAR_TO_BYTE8 (c);
3803 EMIT_ONE_BYTE (c);
3804 }
3805 else
3806 {
3807 struct charset *charset;
3808
3809 if (preferred_charset_id >= 0)
3810 {
3811 charset = CHARSET_FROM_ID (preferred_charset_id);
3812 if (! CHAR_CHARSET_P (c, charset))
3813 charset = char_charset (c, charset_list, NULL);
3814 }
3815 else
3816 charset = char_charset (c, charset_list, NULL);
3817 if (!charset)
3818 {
3819 if (coding->mode & CODING_MODE_SAFE_ENCODING)
3820 {
3821 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
3822 charset = CHARSET_FROM_ID (charset_ascii);
3823 }
3824 else
3825 {
3826 c = coding->default_char;
3827 charset = char_charset (c, charset_list, NULL);
3828 }
3829 }
3830 ENCODE_ISO_CHARACTER (charset, c);
3831 }
3832 }
3833
3834 if (coding->mode & CODING_MODE_LAST_BLOCK
3835 && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
3836 {
3837 ASSURE_DESTINATION (safe_room);
3838 ENCODE_RESET_PLANE_AND_REGISTER ();
3839 }
3840 record_conversion_result (coding, CODING_RESULT_SUCCESS);
3841 CODING_ISO_BOL (coding) = bol_designation;
3842 coding->produced_char += produced_chars;
3843 coding->produced = dst - coding->destination;
3844 return 0;
3845 }
3846
3847 \f
3848 /*** 8,9. SJIS and BIG5 handlers ***/
3849
3850 /* Although SJIS and BIG5 are not ISO's coding system, they are used
3851 quite widely. So, for the moment, Emacs supports them in the bare
3852 C code. But, in the future, they may be supported only by CCL. */
3853
3854 /* SJIS is a coding system encoding three character sets: ASCII, right
3855 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
3856 as is. A character of charset katakana-jisx0201 is encoded by
3857 "position-code + 0x80". A character of charset japanese-jisx0208
3858 is encoded in 2-byte but two position-codes are divided and shifted
3859 so that it fit in the range below.
3860
3861 --- CODE RANGE of SJIS ---
3862 (character set) (range)
3863 ASCII 0x00 .. 0x7F
3864 KATAKANA-JISX0201 0xA0 .. 0xDF
3865 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF
3866 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
3867 -------------------------------
3868
3869 */
3870
3871 /* BIG5 is a coding system encoding two character sets: ASCII and
3872 Big5. An ASCII character is encoded as is. Big5 is a two-byte
3873 character set and is encoded in two-byte.
3874
3875 --- CODE RANGE of BIG5 ---
3876 (character set) (range)
3877 ASCII 0x00 .. 0x7F
3878 Big5 (1st byte) 0xA1 .. 0xFE
3879 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
3880 --------------------------
3881
3882 */
3883
3884 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3885 Check if a text is encoded in SJIS. If it is, return
3886 CATEGORY_MASK_SJIS, else return 0. */
3887
3888 static int
3889 detect_coding_sjis (coding, detect_info)
3890 struct coding_system *coding;
3891 struct coding_detection_info *detect_info;
3892 {
3893 const unsigned char *src = coding->source, *src_base;
3894 const unsigned char *src_end = coding->source + coding->src_bytes;
3895 int multibytep = coding->src_multibyte;
3896 int consumed_chars = 0;
3897 int found = 0;
3898 int c;
3899
3900 detect_info->checked |= CATEGORY_MASK_SJIS;
3901 /* A coding system of this category is always ASCII compatible. */
3902 src += coding->head_ascii;
3903
3904 while (1)
3905 {
3906 src_base = src;
3907 ONE_MORE_BYTE (c);
3908 if (c < 0x80)
3909 continue;
3910 if ((c >= 0x81 && c <= 0x9F) || (c >= 0xE0 && c <= 0xEF))
3911 {
3912 ONE_MORE_BYTE (c);
3913 if (c < 0x40 || c == 0x7F || c > 0xFC)
3914 break;
3915 found = CATEGORY_MASK_SJIS;
3916 }
3917 else if (c >= 0xA0 && c < 0xE0)
3918 found = CATEGORY_MASK_SJIS;
3919 else
3920 break;
3921 }
3922 detect_info->rejected |= CATEGORY_MASK_SJIS;
3923 return 0;
3924
3925 no_more_source:
3926 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
3927 {
3928 detect_info->rejected |= CATEGORY_MASK_SJIS;
3929 return 0;
3930 }
3931 detect_info->found |= found;
3932 return 1;
3933 }
3934
3935 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3936 Check if a text is encoded in BIG5. If it is, return
3937 CATEGORY_MASK_BIG5, else return 0. */
3938
3939 static int
3940 detect_coding_big5 (coding, detect_info)
3941 struct coding_system *coding;
3942 struct coding_detection_info *detect_info;
3943 {
3944 const unsigned char *src = coding->source, *src_base;
3945 const unsigned char *src_end = coding->source + coding->src_bytes;
3946 int multibytep = coding->src_multibyte;
3947 int consumed_chars = 0;
3948 int found = 0;
3949 int c;
3950
3951 detect_info->checked |= CATEGORY_MASK_BIG5;
3952 /* A coding system of this category is always ASCII compatible. */
3953 src += coding->head_ascii;
3954
3955 while (1)
3956 {
3957 src_base = src;
3958 ONE_MORE_BYTE (c);
3959 if (c < 0x80)
3960 continue;
3961 if (c >= 0xA1)
3962 {
3963 ONE_MORE_BYTE (c);
3964 if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
3965 return 0;
3966 found = CATEGORY_MASK_BIG5;
3967 }
3968 else
3969 break;
3970 }
3971 detect_info->rejected |= CATEGORY_MASK_BIG5;
3972 return 0;
3973
3974 no_more_source:
3975 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
3976 {
3977 detect_info->rejected |= CATEGORY_MASK_BIG5;
3978 return 0;
3979 }
3980 detect_info->found |= found;
3981 return 1;
3982 }
3983
3984 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
3985 If SJIS_P is 1, decode SJIS text, else decode BIG5 test. */
3986
3987 static void
3988 decode_coding_sjis (coding)
3989 struct coding_system *coding;
3990 {
3991 const unsigned char *src = coding->source + coding->consumed;
3992 const unsigned char *src_end = coding->source + coding->src_bytes;
3993 const unsigned char *src_base;
3994 int *charbuf = coding->charbuf;
3995 int *charbuf_end = charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
3996 int consumed_chars = 0, consumed_chars_base;
3997 int multibytep = coding->src_multibyte;
3998 struct charset *charset_roman, *charset_kanji, *charset_kana;
3999 Lisp_Object attrs, charset_list, val;
4000 int char_offset = coding->produced_char;
4001 int last_offset = char_offset;
4002 int last_id = charset_ascii;
4003
4004 CODING_GET_INFO (coding, attrs, charset_list);
4005
4006 val = charset_list;
4007 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4008 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4009 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
4010
4011 while (1)
4012 {
4013 int c, c1;
4014 struct charset *charset;
4015
4016 src_base = src;
4017 consumed_chars_base = consumed_chars;
4018
4019 if (charbuf >= charbuf_end)
4020 break;
4021
4022 ONE_MORE_BYTE (c);
4023 if (c < 0)
4024 goto invalid_code;
4025 if (c < 0x80)
4026 charset = charset_roman;
4027 else if (c == 0x80)
4028 goto invalid_code;
4029 else
4030 {
4031 if (c >= 0xF0)
4032 goto invalid_code;
4033 if (c < 0xA0 || c >= 0xE0)
4034 {
4035 /* SJIS -> JISX0208 */
4036 ONE_MORE_BYTE (c1);
4037 if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4038 goto invalid_code;
4039 c = (c << 8) | c1;
4040 SJIS_TO_JIS (c);
4041 charset = charset_kanji;
4042 }
4043 else if (c > 0xA0)
4044 {
4045 /* SJIS -> JISX0201-Kana */
4046 c &= 0x7F;
4047 charset = charset_kana;
4048 }
4049 else
4050 goto invalid_code;
4051 }
4052 if (charset->id != charset_ascii
4053 && last_id != charset->id)
4054 {
4055 if (last_id != charset_ascii)
4056 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4057 last_id = charset->id;
4058 last_offset = char_offset;
4059 }
4060 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4061 *charbuf++ = c;
4062 char_offset++;
4063 continue;
4064
4065 invalid_code:
4066 src = src_base;
4067 consumed_chars = consumed_chars_base;
4068 ONE_MORE_BYTE (c);
4069 *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4070 char_offset++;
4071 coding->errors++;
4072 }
4073
4074 no_more_source:
4075 if (last_id != charset_ascii)
4076 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4077 coding->consumed_char += consumed_chars_base;
4078 coding->consumed = src_base - coding->source;
4079 coding->charbuf_used = charbuf - coding->charbuf;
4080 }
4081
4082 static void
4083 decode_coding_big5 (coding)
4084 struct coding_system *coding;
4085 {
4086 const unsigned char *src = coding->source + coding->consumed;
4087 const unsigned char *src_end = coding->source + coding->src_bytes;
4088 const unsigned char *src_base;
4089 int *charbuf = coding->charbuf;
4090 int *charbuf_end = charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
4091 int consumed_chars = 0, consumed_chars_base;
4092 int multibytep = coding->src_multibyte;
4093 struct charset *charset_roman, *charset_big5;
4094 Lisp_Object attrs, charset_list, val;
4095 int char_offset = coding->produced_char;
4096 int last_offset = char_offset;
4097 int last_id = charset_ascii;
4098
4099 CODING_GET_INFO (coding, attrs, charset_list);
4100 val = charset_list;
4101 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4102 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4103
4104 while (1)
4105 {
4106 int c, c1;
4107 struct charset *charset;
4108
4109 src_base = src;
4110 consumed_chars_base = consumed_chars;
4111
4112 if (charbuf >= charbuf_end)
4113 break;
4114
4115 ONE_MORE_BYTE (c);
4116
4117 if (c < 0)
4118 goto invalid_code;
4119 if (c < 0x80)
4120 charset = charset_roman;
4121 else
4122 {
4123 /* BIG5 -> Big5 */
4124 if (c < 0xA1 || c > 0xFE)
4125 goto invalid_code;
4126 ONE_MORE_BYTE (c1);
4127 if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4128 goto invalid_code;
4129 c = c << 8 | c1;
4130 charset = charset_big5;
4131 }
4132 if (charset->id != charset_ascii
4133 && last_id != charset->id)
4134 {
4135 if (last_id != charset_ascii)
4136 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4137 last_id = charset->id;
4138 last_offset = char_offset;
4139 }
4140 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4141 *charbuf++ = c;
4142 char_offset++;
4143 continue;
4144
4145 invalid_code:
4146 src = src_base;
4147 consumed_chars = consumed_chars_base;
4148 ONE_MORE_BYTE (c);
4149 *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4150 char_offset++;
4151 coding->errors++;
4152 }
4153
4154 no_more_source:
4155 if (last_id != charset_ascii)
4156 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4157 coding->consumed_char += consumed_chars_base;
4158 coding->consumed = src_base - coding->source;
4159 coding->charbuf_used = charbuf - coding->charbuf;
4160 }
4161
4162 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4163 This function can encode charsets `ascii', `katakana-jisx0201',
4164 `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'. We
4165 are sure that all these charsets are registered as official charset
4166 (i.e. do not have extended leading-codes). Characters of other
4167 charsets are produced without any encoding. If SJIS_P is 1, encode
4168 SJIS text, else encode BIG5 text. */
4169
4170 static int
4171 encode_coding_sjis (coding)
4172 struct coding_system *coding;
4173 {
4174 int multibytep = coding->dst_multibyte;
4175 int *charbuf = coding->charbuf;
4176 int *charbuf_end = charbuf + coding->charbuf_used;
4177 unsigned char *dst = coding->destination + coding->produced;
4178 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4179 int safe_room = 4;
4180 int produced_chars = 0;
4181 Lisp_Object attrs, charset_list, val;
4182 int ascii_compatible;
4183 struct charset *charset_roman, *charset_kanji, *charset_kana;
4184 int c;
4185
4186 CODING_GET_INFO (coding, attrs, charset_list);
4187 val = charset_list;
4188 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4189 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4190 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
4191
4192 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4193
4194 while (charbuf < charbuf_end)
4195 {
4196 ASSURE_DESTINATION (safe_room);
4197 c = *charbuf++;
4198 /* Now encode the character C. */
4199 if (ASCII_CHAR_P (c) && ascii_compatible)
4200 EMIT_ONE_ASCII_BYTE (c);
4201 else if (CHAR_BYTE8_P (c))
4202 {
4203 c = CHAR_TO_BYTE8 (c);
4204 EMIT_ONE_BYTE (c);
4205 }
4206 else
4207 {
4208 unsigned code;
4209 struct charset *charset = char_charset (c, charset_list, &code);
4210
4211 if (!charset)
4212 {
4213 if (coding->mode & CODING_MODE_SAFE_ENCODING)
4214 {
4215 code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4216 charset = CHARSET_FROM_ID (charset_ascii);
4217 }
4218 else
4219 {
4220 c = coding->default_char;
4221 charset = char_charset (c, charset_list, &code);
4222 }
4223 }
4224 if (code == CHARSET_INVALID_CODE (charset))
4225 abort ();
4226 if (charset == charset_kanji)
4227 {
4228 int c1, c2;
4229 JIS_TO_SJIS (code);
4230 c1 = code >> 8, c2 = code & 0xFF;
4231 EMIT_TWO_BYTES (c1, c2);
4232 }
4233 else if (charset == charset_kana)
4234 EMIT_ONE_BYTE (code | 0x80);
4235 else
4236 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4237 }
4238 }
4239 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4240 coding->produced_char += produced_chars;
4241 coding->produced = dst - coding->destination;
4242 return 0;
4243 }
4244
4245 static int
4246 encode_coding_big5 (coding)
4247 struct coding_system *coding;
4248 {
4249 int multibytep = coding->dst_multibyte;
4250 int *charbuf = coding->charbuf;
4251 int *charbuf_end = charbuf + coding->charbuf_used;
4252 unsigned char *dst = coding->destination + coding->produced;
4253 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4254 int safe_room = 4;
4255 int produced_chars = 0;
4256 Lisp_Object attrs, charset_list, val;
4257 int ascii_compatible;
4258 struct charset *charset_roman, *charset_big5;
4259 int c;
4260
4261 CODING_GET_INFO (coding, attrs, charset_list);
4262 val = charset_list;
4263 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4264 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4265 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4266
4267 while (charbuf < charbuf_end)
4268 {
4269 ASSURE_DESTINATION (safe_room);
4270 c = *charbuf++;
4271 /* Now encode the character C. */
4272 if (ASCII_CHAR_P (c) && ascii_compatible)
4273 EMIT_ONE_ASCII_BYTE (c);
4274 else if (CHAR_BYTE8_P (c))
4275 {
4276 c = CHAR_TO_BYTE8 (c);
4277 EMIT_ONE_BYTE (c);
4278 }
4279 else
4280 {
4281 unsigned code;
4282 struct charset *charset = char_charset (c, charset_list, &code);
4283
4284 if (! charset)
4285 {
4286 if (coding->mode & CODING_MODE_SAFE_ENCODING)
4287 {
4288 code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4289 charset = CHARSET_FROM_ID (charset_ascii);
4290 }
4291 else
4292 {
4293 c = coding->default_char;
4294 charset = char_charset (c, charset_list, &code);
4295 }
4296 }
4297 if (code == CHARSET_INVALID_CODE (charset))
4298 abort ();
4299 if (charset == charset_big5)
4300 {
4301 int c1, c2;
4302
4303 c1 = code >> 8, c2 = code & 0xFF;
4304 EMIT_TWO_BYTES (c1, c2);
4305 }
4306 else
4307 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4308 }
4309 }
4310 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4311 coding->produced_char += produced_chars;
4312 coding->produced = dst - coding->destination;
4313 return 0;
4314 }
4315
4316 \f
4317 /*** 10. CCL handlers ***/
4318
4319 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4320 Check if a text is encoded in a coding system of which
4321 encoder/decoder are written in CCL program. If it is, return
4322 CATEGORY_MASK_CCL, else return 0. */
4323
4324 static int
4325 detect_coding_ccl (coding, detect_info)
4326 struct coding_system *coding;
4327 struct coding_detection_info *detect_info;
4328 {
4329 const unsigned char *src = coding->source, *src_base;
4330 const unsigned char *src_end = coding->source + coding->src_bytes;
4331 int multibytep = coding->src_multibyte;
4332 int consumed_chars = 0;
4333 int found = 0;
4334 unsigned char *valids = CODING_CCL_VALIDS (coding);
4335 int head_ascii = coding->head_ascii;
4336 Lisp_Object attrs;
4337
4338 detect_info->checked |= CATEGORY_MASK_CCL;
4339
4340 coding = &coding_categories[coding_category_ccl];
4341 attrs = CODING_ID_ATTRS (coding->id);
4342 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
4343 src += head_ascii;
4344
4345 while (1)
4346 {
4347 int c;
4348
4349 src_base = src;
4350 ONE_MORE_BYTE (c);
4351 if (c < 0 || ! valids[c])
4352 break;
4353 if ((valids[c] > 1))
4354 found = CATEGORY_MASK_CCL;
4355 }
4356 detect_info->rejected |= CATEGORY_MASK_CCL;
4357 return 0;
4358
4359 no_more_source:
4360 detect_info->found |= found;
4361 return 1;
4362 }
4363
4364 static void
4365 decode_coding_ccl (coding)
4366 struct coding_system *coding;
4367 {
4368 const unsigned char *src = coding->source + coding->consumed;
4369 const unsigned char *src_end = coding->source + coding->src_bytes;
4370 int *charbuf = coding->charbuf;
4371 int *charbuf_end = charbuf + coding->charbuf_size;
4372 int consumed_chars = 0;
4373 int multibytep = coding->src_multibyte;
4374 struct ccl_program ccl;
4375 int source_charbuf[1024];
4376 int source_byteidx[1024];
4377 Lisp_Object attrs, charset_list;
4378
4379 CODING_GET_INFO (coding, attrs, charset_list);
4380 setup_ccl_program (&ccl, CODING_CCL_DECODER (coding));
4381
4382 while (src < src_end)
4383 {
4384 const unsigned char *p = src;
4385 int *source, *source_end;
4386 int i = 0;
4387
4388 if (multibytep)
4389 while (i < 1024 && p < src_end)
4390 {
4391 source_byteidx[i] = p - src;
4392 source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
4393 }
4394 else
4395 while (i < 1024 && p < src_end)
4396 source_charbuf[i++] = *p++;
4397
4398 if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
4399 ccl.last_block = 1;
4400
4401 source = source_charbuf;
4402 source_end = source + i;
4403 while (source < source_end)
4404 {
4405 ccl_driver (&ccl, source, charbuf,
4406 source_end - source, charbuf_end - charbuf,
4407 charset_list);
4408 source += ccl.consumed;
4409 charbuf += ccl.produced;
4410 if (ccl.status != CCL_STAT_SUSPEND_BY_DST)
4411 break;
4412 }
4413 if (source < source_end)
4414 src += source_byteidx[source - source_charbuf];
4415 else
4416 src = p;
4417 consumed_chars += source - source_charbuf;
4418
4419 if (ccl.status != CCL_STAT_SUSPEND_BY_SRC
4420 && ccl.status != CODING_RESULT_INSUFFICIENT_SRC)
4421 break;
4422 }
4423
4424 switch (ccl.status)
4425 {
4426 case CCL_STAT_SUSPEND_BY_SRC:
4427 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
4428 break;
4429 case CCL_STAT_SUSPEND_BY_DST:
4430 break;
4431 case CCL_STAT_QUIT:
4432 case CCL_STAT_INVALID_CMD:
4433 record_conversion_result (coding, CODING_RESULT_INTERRUPT);
4434 break;
4435 default:
4436 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4437 break;
4438 }
4439 coding->consumed_char += consumed_chars;
4440 coding->consumed = src - coding->source;
4441 coding->charbuf_used = charbuf - coding->charbuf;
4442 }
4443
4444 static int
4445 encode_coding_ccl (coding)
4446 struct coding_system *coding;
4447 {
4448 struct ccl_program ccl;
4449 int multibytep = coding->dst_multibyte;
4450 int *charbuf = coding->charbuf;
4451 int *charbuf_end = charbuf + coding->charbuf_used;
4452 unsigned char *dst = coding->destination + coding->produced;
4453 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4454 unsigned char *adjusted_dst_end = dst_end - 1;
4455 int destination_charbuf[1024];
4456 int i, produced_chars = 0;
4457 Lisp_Object attrs, charset_list;
4458
4459 CODING_GET_INFO (coding, attrs, charset_list);
4460 setup_ccl_program (&ccl, CODING_CCL_ENCODER (coding));
4461
4462 ccl.last_block = coding->mode & CODING_MODE_LAST_BLOCK;
4463 ccl.dst_multibyte = coding->dst_multibyte;
4464
4465 while (charbuf < charbuf_end && dst < adjusted_dst_end)
4466 {
4467 int dst_bytes = dst_end - dst;
4468 if (dst_bytes > 1024)
4469 dst_bytes = 1024;
4470
4471 ccl_driver (&ccl, charbuf, destination_charbuf,
4472 charbuf_end - charbuf, dst_bytes, charset_list);
4473 charbuf += ccl.consumed;
4474 if (multibytep)
4475 for (i = 0; i < ccl.produced; i++)
4476 EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
4477 else
4478 {
4479 for (i = 0; i < ccl.produced; i++)
4480 *dst++ = destination_charbuf[i] & 0xFF;
4481 produced_chars += ccl.produced;
4482 }
4483 }
4484
4485 switch (ccl.status)
4486 {
4487 case CCL_STAT_SUSPEND_BY_SRC:
4488 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
4489 break;
4490 case CCL_STAT_SUSPEND_BY_DST:
4491 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
4492 break;
4493 case CCL_STAT_QUIT:
4494 case CCL_STAT_INVALID_CMD:
4495 record_conversion_result (coding, CODING_RESULT_INTERRUPT);
4496 break;
4497 default:
4498 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4499 break;
4500 }
4501
4502 coding->produced_char += produced_chars;
4503 coding->produced = dst - coding->destination;
4504 return 0;
4505 }
4506
4507
4508 \f
4509 /*** 10, 11. no-conversion handlers ***/
4510
4511 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
4512
4513 static void
4514 decode_coding_raw_text (coding)
4515 struct coding_system *coding;
4516 {
4517 coding->chars_at_source = 1;
4518 coding->consumed_char = 0;
4519 coding->consumed = 0;
4520 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4521 }
4522
4523 static int
4524 encode_coding_raw_text (coding)
4525 struct coding_system *coding;
4526 {
4527 int multibytep = coding->dst_multibyte;
4528 int *charbuf = coding->charbuf;
4529 int *charbuf_end = coding->charbuf + coding->charbuf_used;
4530 unsigned char *dst = coding->destination + coding->produced;
4531 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4532 int produced_chars = 0;
4533 int c;
4534
4535 if (multibytep)
4536 {
4537 int safe_room = MAX_MULTIBYTE_LENGTH * 2;
4538
4539 if (coding->src_multibyte)
4540 while (charbuf < charbuf_end)
4541 {
4542 ASSURE_DESTINATION (safe_room);
4543 c = *charbuf++;
4544 if (ASCII_CHAR_P (c))
4545 EMIT_ONE_ASCII_BYTE (c);
4546 else if (CHAR_BYTE8_P (c))
4547 {
4548 c = CHAR_TO_BYTE8 (c);
4549 EMIT_ONE_BYTE (c);
4550 }
4551 else
4552 {
4553 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
4554
4555 CHAR_STRING_ADVANCE (c, p1);
4556 while (p0 < p1)
4557 {
4558 EMIT_ONE_BYTE (*p0);
4559 p0++;
4560 }
4561 }
4562 }
4563 else
4564 while (charbuf < charbuf_end)
4565 {
4566 ASSURE_DESTINATION (safe_room);
4567 c = *charbuf++;
4568 EMIT_ONE_BYTE (c);
4569 }
4570 }
4571 else
4572 {
4573 if (coding->src_multibyte)
4574 {
4575 int safe_room = MAX_MULTIBYTE_LENGTH;
4576
4577 while (charbuf < charbuf_end)
4578 {
4579 ASSURE_DESTINATION (safe_room);
4580 c = *charbuf++;
4581 if (ASCII_CHAR_P (c))
4582 *dst++ = c;
4583 else if (CHAR_BYTE8_P (c))
4584 *dst++ = CHAR_TO_BYTE8 (c);
4585 else
4586 CHAR_STRING_ADVANCE (c, dst);
4587 produced_chars++;
4588 }
4589 }
4590 else
4591 {
4592 ASSURE_DESTINATION (charbuf_end - charbuf);
4593 while (charbuf < charbuf_end && dst < dst_end)
4594 *dst++ = *charbuf++;
4595 produced_chars = dst - (coding->destination + coding->dst_bytes);
4596 }
4597 }
4598 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4599 coding->produced_char += produced_chars;
4600 coding->produced = dst - coding->destination;
4601 return 0;
4602 }
4603
4604 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4605 Check if a text is encoded in a charset-based coding system. If it
4606 is, return 1, else return 0. */
4607
4608 static int
4609 detect_coding_charset (coding, detect_info)
4610 struct coding_system *coding;
4611 struct coding_detection_info *detect_info;
4612 {
4613 const unsigned char *src = coding->source, *src_base;
4614 const unsigned char *src_end = coding->source + coding->src_bytes;
4615 int multibytep = coding->src_multibyte;
4616 int consumed_chars = 0;
4617 Lisp_Object attrs, valids;
4618 int found = 0;
4619
4620 detect_info->checked |= CATEGORY_MASK_CHARSET;
4621
4622 coding = &coding_categories[coding_category_charset];
4623 attrs = CODING_ID_ATTRS (coding->id);
4624 valids = AREF (attrs, coding_attr_charset_valids);
4625
4626 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
4627 src += coding->head_ascii;
4628
4629 while (1)
4630 {
4631 int c;
4632
4633 src_base = src;
4634 ONE_MORE_BYTE (c);
4635 if (c < 0)
4636 continue;
4637 if (NILP (AREF (valids, c)))
4638 break;
4639 if (c >= 0x80)
4640 found = CATEGORY_MASK_CHARSET;
4641 }
4642 detect_info->rejected |= CATEGORY_MASK_CHARSET;
4643 return 0;
4644
4645 no_more_source:
4646 detect_info->found |= found;
4647 return 1;
4648 }
4649
4650 static void
4651 decode_coding_charset (coding)
4652 struct coding_system *coding;
4653 {
4654 const unsigned char *src = coding->source + coding->consumed;
4655 const unsigned char *src_end = coding->source + coding->src_bytes;
4656 const unsigned char *src_base;
4657 int *charbuf = coding->charbuf;
4658 int *charbuf_end = charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
4659 int consumed_chars = 0, consumed_chars_base;
4660 int multibytep = coding->src_multibyte;
4661 Lisp_Object attrs, charset_list, valids;
4662 int char_offset = coding->produced_char;
4663 int last_offset = char_offset;
4664 int last_id = charset_ascii;
4665
4666 CODING_GET_INFO (coding, attrs, charset_list);
4667 valids = AREF (attrs, coding_attr_charset_valids);
4668
4669 while (1)
4670 {
4671 int c;
4672 Lisp_Object val;
4673 struct charset *charset;
4674 int dim;
4675 int len = 1;
4676 unsigned code;
4677
4678 src_base = src;
4679 consumed_chars_base = consumed_chars;
4680
4681 if (charbuf >= charbuf_end)
4682 break;
4683
4684 ONE_MORE_BYTE (c);
4685 if (c < 0)
4686 goto invalid_code;
4687 code = c;
4688
4689 val = AREF (valids, c);
4690 if (NILP (val))
4691 goto invalid_code;
4692 if (INTEGERP (val))
4693 {
4694 charset = CHARSET_FROM_ID (XFASTINT (val));
4695 dim = CHARSET_DIMENSION (charset);
4696 while (len < dim)
4697 {
4698 ONE_MORE_BYTE (c);
4699 code = (code << 8) | c;
4700 len++;
4701 }
4702 CODING_DECODE_CHAR (coding, src, src_base, src_end,
4703 charset, code, c);
4704 }
4705 else
4706 {
4707 /* VAL is a list of charset IDs. It is assured that the
4708 list is sorted by charset dimensions (smaller one
4709 comes first). */
4710 while (CONSP (val))
4711 {
4712 charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
4713 dim = CHARSET_DIMENSION (charset);
4714 while (len < dim)
4715 {
4716 ONE_MORE_BYTE (c);
4717 code = (code << 8) | c;
4718 len++;
4719 }
4720 CODING_DECODE_CHAR (coding, src, src_base,
4721 src_end, charset, code, c);
4722 if (c >= 0)
4723 break;
4724 val = XCDR (val);
4725 }
4726 }
4727 if (c < 0)
4728 goto invalid_code;
4729 if (charset->id != charset_ascii
4730 && last_id != charset->id)
4731 {
4732 if (last_id != charset_ascii)
4733 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4734 last_id = charset->id;
4735 last_offset = char_offset;
4736 }
4737
4738 *charbuf++ = c;
4739 char_offset++;
4740 continue;
4741
4742 invalid_code:
4743 src = src_base;
4744 consumed_chars = consumed_chars_base;
4745 ONE_MORE_BYTE (c);
4746 *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
4747 char_offset++;
4748 coding->errors++;
4749 }
4750
4751 no_more_source:
4752 if (last_id != charset_ascii)
4753 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4754 coding->consumed_char += consumed_chars_base;
4755 coding->consumed = src_base - coding->source;
4756 coding->charbuf_used = charbuf - coding->charbuf;
4757 }
4758
4759 static int
4760 encode_coding_charset (coding)
4761 struct coding_system *coding;
4762 {
4763 int multibytep = coding->dst_multibyte;
4764 int *charbuf = coding->charbuf;
4765 int *charbuf_end = charbuf + coding->charbuf_used;
4766 unsigned char *dst = coding->destination + coding->produced;
4767 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4768 int safe_room = MAX_MULTIBYTE_LENGTH;
4769 int produced_chars = 0;
4770 Lisp_Object attrs, charset_list;
4771 int ascii_compatible;
4772 int c;
4773
4774 CODING_GET_INFO (coding, attrs, charset_list);
4775 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4776
4777 while (charbuf < charbuf_end)
4778 {
4779 struct charset *charset;
4780 unsigned code;
4781
4782 ASSURE_DESTINATION (safe_room);
4783 c = *charbuf++;
4784 if (ascii_compatible && ASCII_CHAR_P (c))
4785 EMIT_ONE_ASCII_BYTE (c);
4786 else if (CHAR_BYTE8_P (c))
4787 {
4788 c = CHAR_TO_BYTE8 (c);
4789 EMIT_ONE_BYTE (c);
4790 }
4791 else
4792 {
4793 charset = char_charset (c, charset_list, &code);
4794 if (charset)
4795 {
4796 if (CHARSET_DIMENSION (charset) == 1)
4797 EMIT_ONE_BYTE (code);
4798 else if (CHARSET_DIMENSION (charset) == 2)
4799 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
4800 else if (CHARSET_DIMENSION (charset) == 3)
4801 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
4802 else
4803 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
4804 (code >> 8) & 0xFF, code & 0xFF);
4805 }
4806 else
4807 {
4808 if (coding->mode & CODING_MODE_SAFE_ENCODING)
4809 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4810 else
4811 c = coding->default_char;
4812 EMIT_ONE_BYTE (c);
4813 }
4814 }
4815 }
4816
4817 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4818 coding->produced_char += produced_chars;
4819 coding->produced = dst - coding->destination;
4820 return 0;
4821 }
4822
4823 \f
4824 /*** 7. C library functions ***/
4825
4826 /* Setup coding context CODING from information about CODING_SYSTEM.
4827 If CODING_SYSTEM is nil, `no-conversion' is assumed. If
4828 CODING_SYSTEM is invalid, signal an error. */
4829
4830 void
4831 setup_coding_system (coding_system, coding)
4832 Lisp_Object coding_system;
4833 struct coding_system *coding;
4834 {
4835 Lisp_Object attrs;
4836 Lisp_Object eol_type;
4837 Lisp_Object coding_type;
4838 Lisp_Object val;
4839
4840 if (NILP (coding_system))
4841 coding_system = Qno_conversion;
4842
4843 CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
4844
4845 attrs = CODING_ID_ATTRS (coding->id);
4846 eol_type = CODING_ID_EOL_TYPE (coding->id);
4847
4848 coding->mode = 0;
4849 coding->head_ascii = -1;
4850 coding->common_flags
4851 = (VECTORP (eol_type) ? CODING_REQUIRE_DETECTION_MASK : 0);
4852 if (! NILP (CODING_ATTR_POST_READ (attrs)))
4853 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
4854 if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
4855 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
4856 if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
4857 coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
4858
4859 val = CODING_ATTR_SAFE_CHARSETS (attrs);
4860 coding->max_charset_id = SCHARS (val) - 1;
4861 coding->safe_charsets = (char *) SDATA (val);
4862 coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
4863
4864 coding_type = CODING_ATTR_TYPE (attrs);
4865 if (EQ (coding_type, Qundecided))
4866 {
4867 coding->detector = NULL;
4868 coding->decoder = decode_coding_raw_text;
4869 coding->encoder = encode_coding_raw_text;
4870 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
4871 }
4872 else if (EQ (coding_type, Qiso_2022))
4873 {
4874 int i;
4875 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
4876
4877 /* Invoke graphic register 0 to plane 0. */
4878 CODING_ISO_INVOCATION (coding, 0) = 0;
4879 /* Invoke graphic register 1 to plane 1 if we can use 8-bit. */
4880 CODING_ISO_INVOCATION (coding, 1)
4881 = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
4882 /* Setup the initial status of designation. */
4883 for (i = 0; i < 4; i++)
4884 CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
4885 /* Not single shifting initially. */
4886 CODING_ISO_SINGLE_SHIFTING (coding) = 0;
4887 /* Beginning of buffer should also be regarded as bol. */
4888 CODING_ISO_BOL (coding) = 1;
4889 coding->detector = detect_coding_iso_2022;
4890 coding->decoder = decode_coding_iso_2022;
4891 coding->encoder = encode_coding_iso_2022;
4892 if (flags & CODING_ISO_FLAG_SAFE)
4893 coding->mode |= CODING_MODE_SAFE_ENCODING;
4894 coding->common_flags
4895 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
4896 | CODING_REQUIRE_FLUSHING_MASK);
4897 if (flags & CODING_ISO_FLAG_COMPOSITION)
4898 coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
4899 if (flags & CODING_ISO_FLAG_DESIGNATION)
4900 coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
4901 if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
4902 {
4903 setup_iso_safe_charsets (attrs);
4904 val = CODING_ATTR_SAFE_CHARSETS (attrs);
4905 coding->max_charset_id = SCHARS (val) - 1;
4906 coding->safe_charsets = (char *) SDATA (val);
4907 }
4908 CODING_ISO_FLAGS (coding) = flags;
4909 }
4910 else if (EQ (coding_type, Qcharset))
4911 {
4912 coding->detector = detect_coding_charset;
4913 coding->decoder = decode_coding_charset;
4914 coding->encoder = encode_coding_charset;
4915 coding->common_flags
4916 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4917 }
4918 else if (EQ (coding_type, Qutf_8))
4919 {
4920 coding->detector = detect_coding_utf_8;
4921 coding->decoder = decode_coding_utf_8;
4922 coding->encoder = encode_coding_utf_8;
4923 coding->common_flags
4924 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4925 }
4926 else if (EQ (coding_type, Qutf_16))
4927 {
4928 val = AREF (attrs, coding_attr_utf_16_bom);
4929 CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_16_detect_bom
4930 : EQ (val, Qt) ? utf_16_with_bom
4931 : utf_16_without_bom);
4932 val = AREF (attrs, coding_attr_utf_16_endian);
4933 CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
4934 : utf_16_little_endian);
4935 CODING_UTF_16_SURROGATE (coding) = 0;
4936 coding->detector = detect_coding_utf_16;
4937 coding->decoder = decode_coding_utf_16;
4938 coding->encoder = encode_coding_utf_16;
4939 coding->common_flags
4940 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4941 if (CODING_UTF_16_BOM (coding) == utf_16_detect_bom)
4942 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
4943 }
4944 else if (EQ (coding_type, Qccl))
4945 {
4946 coding->detector = detect_coding_ccl;
4947 coding->decoder = decode_coding_ccl;
4948 coding->encoder = encode_coding_ccl;
4949 coding->common_flags
4950 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
4951 | CODING_REQUIRE_FLUSHING_MASK);
4952 }
4953 else if (EQ (coding_type, Qemacs_mule))
4954 {
4955 coding->detector = detect_coding_emacs_mule;
4956 coding->decoder = decode_coding_emacs_mule;
4957 coding->encoder = encode_coding_emacs_mule;
4958 coding->common_flags
4959 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4960 if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
4961 && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
4962 {
4963 Lisp_Object tail, safe_charsets;
4964 int max_charset_id = 0;
4965
4966 for (tail = Vemacs_mule_charset_list; CONSP (tail);
4967 tail = XCDR (tail))
4968 if (max_charset_id < XFASTINT (XCAR (tail)))
4969 max_charset_id = XFASTINT (XCAR (tail));
4970 safe_charsets = Fmake_string (make_number (max_charset_id + 1),
4971 make_number (255));
4972 for (tail = Vemacs_mule_charset_list; CONSP (tail);
4973 tail = XCDR (tail))
4974 SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
4975 coding->max_charset_id = max_charset_id;
4976 coding->safe_charsets = (char *) SDATA (safe_charsets);
4977 }
4978 }
4979 else if (EQ (coding_type, Qshift_jis))
4980 {
4981 coding->detector = detect_coding_sjis;
4982 coding->decoder = decode_coding_sjis;
4983 coding->encoder = encode_coding_sjis;
4984 coding->common_flags
4985 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4986 }
4987 else if (EQ (coding_type, Qbig5))
4988 {
4989 coding->detector = detect_coding_big5;
4990 coding->decoder = decode_coding_big5;
4991 coding->encoder = encode_coding_big5;
4992 coding->common_flags
4993 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4994 }
4995 else /* EQ (coding_type, Qraw_text) */
4996 {
4997 coding->detector = NULL;
4998 coding->decoder = decode_coding_raw_text;
4999 coding->encoder = encode_coding_raw_text;
5000 }
5001
5002 return;
5003 }
5004
5005 /* Return raw-text or one of its subsidiaries that has the same
5006 eol_type as CODING-SYSTEM. */
5007
5008 Lisp_Object
5009 raw_text_coding_system (coding_system)
5010 Lisp_Object coding_system;
5011 {
5012 Lisp_Object spec, attrs;
5013 Lisp_Object eol_type, raw_text_eol_type;
5014
5015 if (NILP (coding_system))
5016 return Qraw_text;
5017 spec = CODING_SYSTEM_SPEC (coding_system);
5018 attrs = AREF (spec, 0);
5019
5020 if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
5021 return coding_system;
5022
5023 eol_type = AREF (spec, 2);
5024 if (VECTORP (eol_type))
5025 return Qraw_text;
5026 spec = CODING_SYSTEM_SPEC (Qraw_text);
5027 raw_text_eol_type = AREF (spec, 2);
5028 return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
5029 : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
5030 : AREF (raw_text_eol_type, 2));
5031 }
5032
5033
5034 /* If CODING_SYSTEM doesn't specify end-of-line format but PARENT
5035 does, return one of the subsidiary that has the same eol-spec as
5036 PARENT. Otherwise, return CODING_SYSTEM. */
5037
5038 Lisp_Object
5039 coding_inherit_eol_type (coding_system, parent)
5040 Lisp_Object coding_system, parent;
5041 {
5042 Lisp_Object spec, eol_type;
5043
5044 if (NILP (coding_system))
5045 coding_system = Qraw_text;
5046 spec = CODING_SYSTEM_SPEC (coding_system);
5047 eol_type = AREF (spec, 2);
5048 if (VECTORP (eol_type)
5049 && ! NILP (parent))
5050 {
5051 Lisp_Object parent_spec;
5052 Lisp_Object parent_eol_type;
5053
5054 parent_spec
5055 = CODING_SYSTEM_SPEC (buffer_defaults.buffer_file_coding_system);
5056 parent_eol_type = AREF (parent_spec, 2);
5057 if (EQ (parent_eol_type, Qunix))
5058 coding_system = AREF (eol_type, 0);
5059 else if (EQ (parent_eol_type, Qdos))
5060 coding_system = AREF (eol_type, 1);
5061 else if (EQ (parent_eol_type, Qmac))
5062 coding_system = AREF (eol_type, 2);
5063 }
5064 return coding_system;
5065 }
5066
5067 /* Emacs has a mechanism to automatically detect a coding system if it
5068 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
5069 it's impossible to distinguish some coding systems accurately
5070 because they use the same range of codes. So, at first, coding
5071 systems are categorized into 7, those are:
5072
5073 o coding-category-emacs-mule
5074
5075 The category for a coding system which has the same code range
5076 as Emacs' internal format. Assigned the coding-system (Lisp
5077 symbol) `emacs-mule' by default.
5078
5079 o coding-category-sjis
5080
5081 The category for a coding system which has the same code range
5082 as SJIS. Assigned the coding-system (Lisp
5083 symbol) `japanese-shift-jis' by default.
5084
5085 o coding-category-iso-7
5086
5087 The category for a coding system which has the same code range
5088 as ISO2022 of 7-bit environment. This doesn't use any locking
5089 shift and single shift functions. This can encode/decode all
5090 charsets. Assigned the coding-system (Lisp symbol)
5091 `iso-2022-7bit' by default.
5092
5093 o coding-category-iso-7-tight
5094
5095 Same as coding-category-iso-7 except that this can
5096 encode/decode only the specified charsets.
5097
5098 o coding-category-iso-8-1
5099
5100 The category for a coding system which has the same code range
5101 as ISO2022 of 8-bit environment and graphic plane 1 used only
5102 for DIMENSION1 charset. This doesn't use any locking shift
5103 and single shift functions. Assigned the coding-system (Lisp
5104 symbol) `iso-latin-1' by default.
5105
5106 o coding-category-iso-8-2
5107
5108 The category for a coding system which has the same code range
5109 as ISO2022 of 8-bit environment and graphic plane 1 used only
5110 for DIMENSION2 charset. This doesn't use any locking shift
5111 and single shift functions. Assigned the coding-system (Lisp
5112 symbol) `japanese-iso-8bit' by default.
5113
5114 o coding-category-iso-7-else
5115
5116 The category for a coding system which has the same code range
5117 as ISO2022 of 7-bit environemnt but uses locking shift or
5118 single shift functions. Assigned the coding-system (Lisp
5119 symbol) `iso-2022-7bit-lock' by default.
5120
5121 o coding-category-iso-8-else
5122
5123 The category for a coding system which has the same code range
5124 as ISO2022 of 8-bit environemnt but uses locking shift or
5125 single shift functions. Assigned the coding-system (Lisp
5126 symbol) `iso-2022-8bit-ss2' by default.
5127
5128 o coding-category-big5
5129
5130 The category for a coding system which has the same code range
5131 as BIG5. Assigned the coding-system (Lisp symbol)
5132 `cn-big5' by default.
5133
5134 o coding-category-utf-8
5135
5136 The category for a coding system which has the same code range
5137 as UTF-8 (cf. RFC2279). Assigned the coding-system (Lisp
5138 symbol) `utf-8' by default.
5139
5140 o coding-category-utf-16-be
5141
5142 The category for a coding system in which a text has an
5143 Unicode signature (cf. Unicode Standard) in the order of BIG
5144 endian at the head. Assigned the coding-system (Lisp symbol)
5145 `utf-16-be' by default.
5146
5147 o coding-category-utf-16-le
5148
5149 The category for a coding system in which a text has an
5150 Unicode signature (cf. Unicode Standard) in the order of
5151 LITTLE endian at the head. Assigned the coding-system (Lisp
5152 symbol) `utf-16-le' by default.
5153
5154 o coding-category-ccl
5155
5156 The category for a coding system of which encoder/decoder is
5157 written in CCL programs. The default value is nil, i.e., no
5158 coding system is assigned.
5159
5160 o coding-category-binary
5161
5162 The category for a coding system not categorized in any of the
5163 above. Assigned the coding-system (Lisp symbol)
5164 `no-conversion' by default.
5165
5166 Each of them is a Lisp symbol and the value is an actual
5167 `coding-system's (this is also a Lisp symbol) assigned by a user.
5168 What Emacs does actually is to detect a category of coding system.
5169 Then, it uses a `coding-system' assigned to it. If Emacs can't
5170 decide only one possible category, it selects a category of the
5171 highest priority. Priorities of categories are also specified by a
5172 user in a Lisp variable `coding-category-list'.
5173
5174 */
5175
5176 #define EOL_SEEN_NONE 0
5177 #define EOL_SEEN_LF 1
5178 #define EOL_SEEN_CR 2
5179 #define EOL_SEEN_CRLF 4
5180
5181 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
5182 SOURCE is encoded. If CATEGORY is one of
5183 coding_category_utf_16_XXXX, assume that CR and LF are encoded by
5184 two-byte, else they are encoded by one-byte.
5185
5186 Return one of EOL_SEEN_XXX. */
5187
5188 #define MAX_EOL_CHECK_COUNT 3
5189
5190 static int
5191 detect_eol (source, src_bytes, category)
5192 unsigned char *source;
5193 EMACS_INT src_bytes;
5194 enum coding_category category;
5195 {
5196 unsigned char *src = source, *src_end = src + src_bytes;
5197 unsigned char c;
5198 int total = 0;
5199 int eol_seen = EOL_SEEN_NONE;
5200
5201 if ((1 << category) & CATEGORY_MASK_UTF_16)
5202 {
5203 int msb, lsb;
5204
5205 msb = category == (coding_category_utf_16_le
5206 | coding_category_utf_16_le_nosig);
5207 lsb = 1 - msb;
5208
5209 while (src + 1 < src_end)
5210 {
5211 c = src[lsb];
5212 if (src[msb] == 0 && (c == '\n' || c == '\r'))
5213 {
5214 int this_eol;
5215
5216 if (c == '\n')
5217 this_eol = EOL_SEEN_LF;
5218 else if (src + 3 >= src_end
5219 || src[msb + 2] != 0
5220 || src[lsb + 2] != '\n')
5221 this_eol = EOL_SEEN_CR;
5222 else
5223 this_eol = EOL_SEEN_CRLF;
5224
5225 if (eol_seen == EOL_SEEN_NONE)
5226 /* This is the first end-of-line. */
5227 eol_seen = this_eol;
5228 else if (eol_seen != this_eol)
5229 {
5230 /* The found type is different from what found before. */
5231 eol_seen = EOL_SEEN_LF;
5232 break;
5233 }
5234 if (++total == MAX_EOL_CHECK_COUNT)
5235 break;
5236 }
5237 src += 2;
5238 }
5239 }
5240 else
5241 {
5242 while (src < src_end)
5243 {
5244 c = *src++;
5245 if (c == '\n' || c == '\r')
5246 {
5247 int this_eol;
5248
5249 if (c == '\n')
5250 this_eol = EOL_SEEN_LF;
5251 else if (src >= src_end || *src != '\n')
5252 this_eol = EOL_SEEN_CR;
5253 else
5254 this_eol = EOL_SEEN_CRLF, src++;
5255
5256 if (eol_seen == EOL_SEEN_NONE)
5257 /* This is the first end-of-line. */
5258 eol_seen = this_eol;
5259 else if (eol_seen != this_eol)
5260 {
5261 /* The found type is different from what found before. */
5262 eol_seen = EOL_SEEN_LF;
5263 break;
5264 }
5265 if (++total == MAX_EOL_CHECK_COUNT)
5266 break;
5267 }
5268 }
5269 }
5270 return eol_seen;
5271 }
5272
5273
5274 static Lisp_Object
5275 adjust_coding_eol_type (coding, eol_seen)
5276 struct coding_system *coding;
5277 int eol_seen;
5278 {
5279 Lisp_Object eol_type;
5280
5281 eol_type = CODING_ID_EOL_TYPE (coding->id);
5282 if (eol_seen & EOL_SEEN_LF)
5283 {
5284 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
5285 eol_type = Qunix;
5286 }
5287 else if (eol_seen & EOL_SEEN_CRLF)
5288 {
5289 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
5290 eol_type = Qdos;
5291 }
5292 else if (eol_seen & EOL_SEEN_CR)
5293 {
5294 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
5295 eol_type = Qmac;
5296 }
5297 return eol_type;
5298 }
5299
5300 /* Detect how a text specified in CODING is encoded. If a coding
5301 system is detected, update fields of CODING by the detected coding
5302 system. */
5303
5304 void
5305 detect_coding (coding)
5306 struct coding_system *coding;
5307 {
5308 const unsigned char *src, *src_end;
5309 Lisp_Object attrs, coding_type;
5310
5311 coding->consumed = coding->consumed_char = 0;
5312 coding->produced = coding->produced_char = 0;
5313 coding_set_source (coding);
5314
5315 src_end = coding->source + coding->src_bytes;
5316
5317 /* If we have not yet decided the text encoding type, detect it
5318 now. */
5319 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
5320 {
5321 int c, i;
5322
5323 for (i = 0, src = coding->source; src < src_end; i++, src++)
5324 {
5325 c = *src;
5326 if (c & 0x80 || (c < 0x20 && (c == 0
5327 || c == ISO_CODE_ESC
5328 || c == ISO_CODE_SI
5329 || c == ISO_CODE_SO)))
5330 break;
5331 }
5332 coding->head_ascii = src - (coding->source + coding->consumed);
5333
5334 if (coding->head_ascii < coding->src_bytes)
5335 {
5336 struct coding_detection_info detect_info;
5337 enum coding_category category;
5338 struct coding_system *this;
5339
5340 detect_info.checked = detect_info.found = detect_info.rejected = 0;
5341 for (i = 0; i < coding_category_raw_text; i++)
5342 {
5343 category = coding_priorities[i];
5344 this = coding_categories + category;
5345 if (this->id < 0)
5346 {
5347 /* No coding system of this category is defined. */
5348 detect_info.rejected |= (1 << category);
5349 }
5350 else if (category >= coding_category_raw_text)
5351 continue;
5352 else if (detect_info.checked & (1 << category))
5353 {
5354 if (detect_info.found & (1 << category))
5355 break;
5356 }
5357 else if ((*(this->detector)) (coding, &detect_info)
5358 && detect_info.found & (1 << category))
5359 {
5360 if (category == coding_category_utf_16_auto)
5361 {
5362 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
5363 category = coding_category_utf_16_le;
5364 else
5365 category = coding_category_utf_16_be;
5366 }
5367 break;
5368 }
5369 }
5370 if (i < coding_category_raw_text)
5371 setup_coding_system (CODING_ID_NAME (this->id), coding);
5372 else if (detect_info.rejected == CATEGORY_MASK_ANY)
5373 setup_coding_system (Qraw_text, coding);
5374 else if (detect_info.rejected)
5375 for (i = 0; i < coding_category_raw_text; i++)
5376 if (! (detect_info.rejected & (1 << coding_priorities[i])))
5377 {
5378 this = coding_categories + coding_priorities[i];
5379 setup_coding_system (CODING_ID_NAME (this->id), coding);
5380 break;
5381 }
5382 }
5383 }
5384 else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
5385 == coding_category_utf_16_auto)
5386 {
5387 Lisp_Object coding_systems;
5388 struct coding_detection_info detect_info;
5389
5390 coding_systems
5391 = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_16_bom);
5392 detect_info.found = detect_info.rejected = 0;
5393 if (CONSP (coding_systems)
5394 && detect_coding_utf_16 (coding, &detect_info))
5395 {
5396 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
5397 setup_coding_system (XCAR (coding_systems), coding);
5398 else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
5399 setup_coding_system (XCDR (coding_systems), coding);
5400 }
5401 }
5402 }
5403
5404
5405 static void
5406 decode_eol (coding)
5407 struct coding_system *coding;
5408 {
5409 Lisp_Object eol_type;
5410 unsigned char *p, *pbeg, *pend;
5411
5412 eol_type = CODING_ID_EOL_TYPE (coding->id);
5413 if (EQ (eol_type, Qunix))
5414 return;
5415
5416 if (NILP (coding->dst_object))
5417 pbeg = coding->destination;
5418 else
5419 pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
5420 pend = pbeg + coding->produced;
5421
5422 if (VECTORP (eol_type))
5423 {
5424 int eol_seen = EOL_SEEN_NONE;
5425
5426 for (p = pbeg; p < pend; p++)
5427 {
5428 if (*p == '\n')
5429 eol_seen |= EOL_SEEN_LF;
5430 else if (*p == '\r')
5431 {
5432 if (p + 1 < pend && *(p + 1) == '\n')
5433 {
5434 eol_seen |= EOL_SEEN_CRLF;
5435 p++;
5436 }
5437 else
5438 eol_seen |= EOL_SEEN_CR;
5439 }
5440 }
5441 if (eol_seen != EOL_SEEN_NONE
5442 && eol_seen != EOL_SEEN_LF
5443 && eol_seen != EOL_SEEN_CRLF
5444 && eol_seen != EOL_SEEN_CR)
5445 eol_seen = EOL_SEEN_LF;
5446 if (eol_seen != EOL_SEEN_NONE)
5447 eol_type = adjust_coding_eol_type (coding, eol_seen);
5448 }
5449
5450 if (EQ (eol_type, Qmac))
5451 {
5452 for (p = pbeg; p < pend; p++)
5453 if (*p == '\r')
5454 *p = '\n';
5455 }
5456 else if (EQ (eol_type, Qdos))
5457 {
5458 int n = 0;
5459
5460 if (NILP (coding->dst_object))
5461 {
5462 for (p = pend - 2; p >= pbeg; p--)
5463 if (*p == '\r')
5464 {
5465 safe_bcopy ((char *) (p + 1), (char *) p, pend-- - p - 1);
5466 n++;
5467 }
5468 }
5469 else
5470 {
5471 for (p = pend - 2; p >= pbeg; p--)
5472 if (*p == '\r')
5473 {
5474 int pos_byte = coding->dst_pos_byte + (p - pbeg);
5475 int pos = BYTE_TO_CHAR (pos_byte);
5476
5477 del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
5478 n++;
5479 }
5480 }
5481 coding->produced -= n;
5482 coding->produced_char -= n;
5483 }
5484 }
5485
5486
5487 /* Return a translation table from coding system attribute vector ATTRS
5488 for encoding (ENCODEP is nonzero) or decoding (ENCODEP is zeor). */
5489
5490 static INLINE
5491 get_translation_table (attrs, encodep)
5492 {
5493 Lisp_Object standard, translation_table;
5494
5495 if (encodep)
5496 translation_table = CODING_ATTR_ENCODE_TBL (attrs),
5497 standard = Vstandard_translation_table_for_encode;
5498 else
5499 translation_table = CODING_ATTR_DECODE_TBL (attrs),
5500 standard = Vstandard_translation_table_for_decode;
5501 if (! NILP (translation_table) && SYMBOLP (translation_table))
5502 translation_table = Fget (translation_table, Qtranslation_table);
5503 if (NILP (translation_table))
5504 translation_table = standard;
5505 if (! CHAR_TABLE_P (translation_table))
5506 translation_table = Qnil;
5507 return translation_table;
5508 }
5509
5510
5511 static void
5512 translate_chars (coding, table)
5513 struct coding_system *coding;
5514 Lisp_Object table;
5515 {
5516 int *charbuf = coding->charbuf;
5517 int *charbuf_end = charbuf + coding->charbuf_used;
5518 int c;
5519
5520 if (coding->chars_at_source)
5521 return;
5522
5523 while (charbuf < charbuf_end)
5524 {
5525 c = *charbuf;
5526 if (c < 0)
5527 charbuf += -c;
5528 else
5529 *charbuf++ = translate_char (table, c);
5530 }
5531 }
5532
5533 static int
5534 produce_chars (coding)
5535 struct coding_system *coding;
5536 {
5537 unsigned char *dst = coding->destination + coding->produced;
5538 unsigned char *dst_end = coding->destination + coding->dst_bytes;
5539 int produced;
5540 int produced_chars = 0;
5541
5542 if (! coding->chars_at_source)
5543 {
5544 /* Characters are in coding->charbuf. */
5545 int *buf = coding->charbuf;
5546 int *buf_end = buf + coding->charbuf_used;
5547 unsigned char *adjusted_dst_end;
5548
5549 if (BUFFERP (coding->src_object)
5550 && EQ (coding->src_object, coding->dst_object))
5551 dst_end = ((unsigned char *) coding->source) + coding->consumed;
5552 adjusted_dst_end = dst_end - MAX_MULTIBYTE_LENGTH;
5553
5554 while (buf < buf_end)
5555 {
5556 int c = *buf++;
5557
5558 if (dst >= adjusted_dst_end)
5559 {
5560 dst = alloc_destination (coding,
5561 buf_end - buf + MAX_MULTIBYTE_LENGTH,
5562 dst);
5563 dst_end = coding->destination + coding->dst_bytes;
5564 adjusted_dst_end = dst_end - MAX_MULTIBYTE_LENGTH;
5565 }
5566 if (c >= 0)
5567 {
5568 if (coding->dst_multibyte
5569 || ! CHAR_BYTE8_P (c))
5570 CHAR_STRING_ADVANCE (c, dst);
5571 else
5572 *dst++ = CHAR_TO_BYTE8 (c);
5573 produced_chars++;
5574 }
5575 else
5576 /* This is an annotation datum. (-C) is the length of
5577 it. */
5578 buf += -c - 1;
5579 }
5580 }
5581 else
5582 {
5583 const unsigned char *src = coding->source;
5584 const unsigned char *src_end = src + coding->src_bytes;
5585 Lisp_Object eol_type;
5586
5587 eol_type = CODING_ID_EOL_TYPE (coding->id);
5588
5589 if (coding->src_multibyte != coding->dst_multibyte)
5590 {
5591 if (coding->src_multibyte)
5592 {
5593 int multibytep = 1;
5594 int consumed_chars;
5595
5596 while (1)
5597 {
5598 const unsigned char *src_base = src;
5599 int c;
5600
5601 ONE_MORE_BYTE (c);
5602 if (c == '\r')
5603 {
5604 if (EQ (eol_type, Qdos))
5605 {
5606 if (src == src_end)
5607 {
5608 record_conversion_result
5609 (coding, CODING_RESULT_INSUFFICIENT_SRC);
5610 goto no_more_source;
5611 }
5612 if (*src == '\n')
5613 c = *src++;
5614 }
5615 else if (EQ (eol_type, Qmac))
5616 c = '\n';
5617 }
5618 if (dst == dst_end)
5619 {
5620 coding->consumed = src - coding->source;
5621
5622 if (EQ (coding->src_object, coding->dst_object))
5623 dst_end = (unsigned char *) src;
5624 if (dst == dst_end)
5625 {
5626 dst = alloc_destination (coding, src_end - src + 1,
5627 dst);
5628 dst_end = coding->destination + coding->dst_bytes;
5629 coding_set_source (coding);
5630 src = coding->source + coding->consumed;
5631 src_end = coding->source + coding->src_bytes;
5632 }
5633 }
5634 *dst++ = c;
5635 produced_chars++;
5636 }
5637 no_more_source:
5638 ;
5639 }
5640 else
5641 while (src < src_end)
5642 {
5643 int multibytep = 1;
5644 int c = *src++;
5645
5646 if (c == '\r')
5647 {
5648 if (EQ (eol_type, Qdos))
5649 {
5650 if (src < src_end
5651 && *src == '\n')
5652 c = *src++;
5653 }
5654 else if (EQ (eol_type, Qmac))
5655 c = '\n';
5656 }
5657 if (dst >= dst_end - 1)
5658 {
5659 coding->consumed = src - coding->source;
5660
5661 if (EQ (coding->src_object, coding->dst_object))
5662 dst_end = (unsigned char *) src;
5663 if (dst >= dst_end - 1)
5664 {
5665 dst = alloc_destination (coding, src_end - src + 2,
5666 dst);
5667 dst_end = coding->destination + coding->dst_bytes;
5668 coding_set_source (coding);
5669 src = coding->source + coding->consumed;
5670 src_end = coding->source + coding->src_bytes;
5671 }
5672 }
5673 EMIT_ONE_BYTE (c);
5674 }
5675 }
5676 else
5677 {
5678 if (!EQ (coding->src_object, coding->dst_object))
5679 {
5680 int require = coding->src_bytes - coding->dst_bytes;
5681
5682 if (require > 0)
5683 {
5684 EMACS_INT offset = src - coding->source;
5685
5686 dst = alloc_destination (coding, require, dst);
5687 coding_set_source (coding);
5688 src = coding->source + offset;
5689 src_end = coding->source + coding->src_bytes;
5690 }
5691 }
5692 produced_chars = coding->src_chars;
5693 while (src < src_end)
5694 {
5695 int c = *src++;
5696
5697 if (c == '\r')
5698 {
5699 if (EQ (eol_type, Qdos))
5700 {
5701 if (src < src_end
5702 && *src == '\n')
5703 c = *src++;
5704 produced_chars--;
5705 }
5706 else if (EQ (eol_type, Qmac))
5707 c = '\n';
5708 }
5709 *dst++ = c;
5710 }
5711 }
5712 coding->consumed = coding->src_bytes;
5713 coding->consumed_char = coding->src_chars;
5714 }
5715
5716 produced = dst - (coding->destination + coding->produced);
5717 if (BUFFERP (coding->dst_object))
5718 insert_from_gap (produced_chars, produced);
5719 coding->produced += produced;
5720 coding->produced_char += produced_chars;
5721 return produced_chars;
5722 }
5723
5724 /* Compose text in CODING->object according to the annotation data at
5725 CHARBUF. CHARBUF is an array:
5726 [ -LENGTH ANNOTATION_MASK FROM TO METHOD COMP_LEN [ COMPONENTS... ] ]
5727 */
5728
5729 static INLINE void
5730 produce_composition (coding, charbuf)
5731 struct coding_system *coding;
5732 int *charbuf;
5733 {
5734 int len;
5735 EMACS_INT from, to;
5736 enum composition_method method;
5737 Lisp_Object components;
5738
5739 len = -charbuf[0];
5740 from = coding->dst_pos + charbuf[2];
5741 to = coding->dst_pos + charbuf[3];
5742 method = (enum composition_method) (charbuf[4]);
5743
5744 if (method == COMPOSITION_RELATIVE)
5745 components = Qnil;
5746 else
5747 {
5748 Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
5749 int i;
5750
5751 len -= 5;
5752 charbuf += 5;
5753 for (i = 0; i < len; i++)
5754 args[i] = make_number (charbuf[i]);
5755 components = (method == COMPOSITION_WITH_ALTCHARS
5756 ? Fstring (len, args) : Fvector (len, args));
5757 }
5758 compose_text (from, to, components, Qnil, coding->dst_object);
5759 }
5760
5761
5762 /* Put `charset' property on text in CODING->object according to
5763 the annotation data at CHARBUF. CHARBUF is an array:
5764 [ -LENGTH ANNOTATION_MASK FROM TO CHARSET-ID ]
5765 */
5766
5767 static INLINE void
5768 produce_charset (coding, charbuf)
5769 struct coding_system *coding;
5770 int *charbuf;
5771 {
5772 EMACS_INT from = coding->dst_pos + charbuf[2];
5773 EMACS_INT to = coding->dst_pos + charbuf[3];
5774 struct charset *charset = CHARSET_FROM_ID (charbuf[4]);
5775
5776 Fput_text_property (make_number (from), make_number (to),
5777 Qcharset, CHARSET_NAME (charset),
5778 coding->dst_object);
5779 }
5780
5781
5782 #define CHARBUF_SIZE 0x4000
5783
5784 #define ALLOC_CONVERSION_WORK_AREA(coding) \
5785 do { \
5786 int size = CHARBUF_SIZE;; \
5787 \
5788 coding->charbuf = NULL; \
5789 while (size > 1024) \
5790 { \
5791 coding->charbuf = (int *) alloca (sizeof (int) * size); \
5792 if (coding->charbuf) \
5793 break; \
5794 size >>= 1; \
5795 } \
5796 if (! coding->charbuf) \
5797 { \
5798 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_MEM); \
5799 return coding->result; \
5800 } \
5801 coding->charbuf_size = size; \
5802 } while (0)
5803
5804
5805 static void
5806 produce_annotation (coding)
5807 struct coding_system *coding;
5808 {
5809 int *charbuf = coding->charbuf;
5810 int *charbuf_end = charbuf + coding->charbuf_used;
5811
5812 if (NILP (coding->dst_object))
5813 return;
5814
5815 while (charbuf < charbuf_end)
5816 {
5817 if (*charbuf >= 0)
5818 charbuf++;
5819 else
5820 {
5821 int len = -*charbuf;
5822 switch (charbuf[1])
5823 {
5824 case CODING_ANNOTATE_COMPOSITION_MASK:
5825 produce_composition (coding, charbuf);
5826 break;
5827 case CODING_ANNOTATE_CHARSET_MASK:
5828 produce_charset (coding, charbuf);
5829 break;
5830 default:
5831 abort ();
5832 }
5833 charbuf += len;
5834 }
5835 }
5836 }
5837
5838 /* Decode the data at CODING->src_object into CODING->dst_object.
5839 CODING->src_object is a buffer, a string, or nil.
5840 CODING->dst_object is a buffer.
5841
5842 If CODING->src_object is a buffer, it must be the current buffer.
5843 In this case, if CODING->src_pos is positive, it is a position of
5844 the source text in the buffer, otherwise, the source text is in the
5845 gap area of the buffer, and CODING->src_pos specifies the offset of
5846 the text from GPT (which must be the same as PT). If this is the
5847 same buffer as CODING->dst_object, CODING->src_pos must be
5848 negative.
5849
5850 If CODING->src_object is a string, CODING->src_pos in an index to
5851 that string.
5852
5853 If CODING->src_object is nil, CODING->source must already point to
5854 the non-relocatable memory area. In this case, CODING->src_pos is
5855 an offset from CODING->source.
5856
5857 The decoded data is inserted at the current point of the buffer
5858 CODING->dst_object.
5859 */
5860
5861 static int
5862 decode_coding (coding)
5863 struct coding_system *coding;
5864 {
5865 Lisp_Object attrs;
5866 Lisp_Object undo_list;
5867 Lisp_Object translation_table;
5868
5869 if (BUFFERP (coding->src_object)
5870 && coding->src_pos > 0
5871 && coding->src_pos < GPT
5872 && coding->src_pos + coding->src_chars > GPT)
5873 move_gap_both (coding->src_pos, coding->src_pos_byte);
5874
5875 undo_list = Qt;
5876 if (BUFFERP (coding->dst_object))
5877 {
5878 if (current_buffer != XBUFFER (coding->dst_object))
5879 set_buffer_internal (XBUFFER (coding->dst_object));
5880 if (GPT != PT)
5881 move_gap_both (PT, PT_BYTE);
5882 undo_list = current_buffer->undo_list;
5883 current_buffer->undo_list = Qt;
5884 }
5885
5886 coding->consumed = coding->consumed_char = 0;
5887 coding->produced = coding->produced_char = 0;
5888 coding->chars_at_source = 0;
5889 record_conversion_result (coding, CODING_RESULT_SUCCESS);
5890 coding->errors = 0;
5891
5892 ALLOC_CONVERSION_WORK_AREA (coding);
5893
5894 attrs = CODING_ID_ATTRS (coding->id);
5895 translation_table = get_translation_table (attrs, 1);
5896
5897 do
5898 {
5899 coding_set_source (coding);
5900 coding->annotated = 0;
5901 (*(coding->decoder)) (coding);
5902 if (!NILP (translation_table))
5903 translate_chars (coding, translation_table);
5904 coding_set_destination (coding);
5905 produce_chars (coding);
5906 if (coding->annotated)
5907 produce_annotation (coding);
5908 }
5909 while (coding->consumed < coding->src_bytes
5910 && ! coding->result);
5911
5912 coding->carryover_bytes = 0;
5913 if (coding->consumed < coding->src_bytes)
5914 {
5915 int nbytes = coding->src_bytes - coding->consumed;
5916 const unsigned char *src;
5917
5918 coding_set_source (coding);
5919 coding_set_destination (coding);
5920 src = coding->source + coding->consumed;
5921
5922 if (coding->mode & CODING_MODE_LAST_BLOCK)
5923 {
5924 /* Flush out unprocessed data as binary chars. We are sure
5925 that the number of data is less than the size of
5926 coding->charbuf. */
5927 coding->charbuf_used = 0;
5928 while (nbytes-- > 0)
5929 {
5930 int c = *src++;
5931
5932 coding->charbuf[coding->charbuf_used++] = (c & 0x80 ? - c : c);
5933 }
5934 produce_chars (coding);
5935 }
5936 else
5937 {
5938 /* Record unprocessed bytes in coding->carryover. We are
5939 sure that the number of data is less than the size of
5940 coding->carryover. */
5941 unsigned char *p = coding->carryover;
5942
5943 coding->carryover_bytes = nbytes;
5944 while (nbytes-- > 0)
5945 *p++ = *src++;
5946 }
5947 coding->consumed = coding->src_bytes;
5948 }
5949
5950 if (BUFFERP (coding->dst_object))
5951 {
5952 current_buffer->undo_list = undo_list;
5953 record_insert (coding->dst_pos, coding->produced_char);
5954 }
5955 if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix))
5956 decode_eol (coding);
5957 return coding->result;
5958 }
5959
5960
5961 /* Extract an annotation datum from a composition starting at POS and
5962 ending before LIMIT of CODING->src_object (buffer or string), store
5963 the data in BUF, set *STOP to a starting position of the next
5964 composition (if any) or to LIMIT, and return the address of the
5965 next element of BUF.
5966
5967 If such an annotation is not found, set *STOP to a starting
5968 position of a composition after POS (if any) or to LIMIT, and
5969 return BUF. */
5970
5971 static INLINE int *
5972 handle_composition_annotation (pos, limit, coding, buf, stop)
5973 EMACS_INT pos, limit;
5974 struct coding_system *coding;
5975 int *buf;
5976 EMACS_INT *stop;
5977 {
5978 EMACS_INT start, end;
5979 Lisp_Object prop;
5980
5981 if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
5982 || end > limit)
5983 *stop = limit;
5984 else if (start > pos)
5985 *stop = start;
5986 else
5987 {
5988 if (start == pos)
5989 {
5990 /* We found a composition. Store the corresponding
5991 annotation data in BUF. */
5992 int *head = buf;
5993 enum composition_method method = COMPOSITION_METHOD (prop);
5994 int nchars = COMPOSITION_LENGTH (prop);
5995
5996 ADD_COMPOSITION_DATA (buf, 0, nchars, method);
5997 if (method != COMPOSITION_RELATIVE)
5998 {
5999 Lisp_Object components;
6000 int len, i, i_byte;
6001
6002 components = COMPOSITION_COMPONENTS (prop);
6003 if (VECTORP (components))
6004 {
6005 len = XVECTOR (components)->size;
6006 for (i = 0; i < len; i++)
6007 *buf++ = XINT (AREF (components, i));
6008 }
6009 else if (STRINGP (components))
6010 {
6011 len = SCHARS (components);
6012 i = i_byte = 0;
6013 while (i < len)
6014 {
6015 FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
6016 buf++;
6017 }
6018 }
6019 else if (INTEGERP (components))
6020 {
6021 len = 1;
6022 *buf++ = XINT (components);
6023 }
6024 else if (CONSP (components))
6025 {
6026 for (len = 0; CONSP (components);
6027 len++, components = XCDR (components))
6028 *buf++ = XINT (XCAR (components));
6029 }
6030 else
6031 abort ();
6032 *head -= len;
6033 }
6034 }
6035
6036 if (find_composition (end, limit, &start, &end, &prop,
6037 coding->src_object)
6038 && end <= limit)
6039 *stop = start;
6040 else
6041 *stop = limit;
6042 }
6043 return buf;
6044 }
6045
6046
6047 /* Extract an annotation datum from a text property `charset' at POS of
6048 CODING->src_object (buffer of string), store the data in BUF, set
6049 *STOP to the position where the value of `charset' property changes
6050 (limiting by LIMIT), and return the address of the next element of
6051 BUF.
6052
6053 If the property value is nil, set *STOP to the position where the
6054 property value is non-nil (limiting by LIMIT), and return BUF. */
6055
6056 static INLINE int *
6057 handle_charset_annotation (pos, limit, coding, buf, stop)
6058 EMACS_INT pos, limit;
6059 struct coding_system *coding;
6060 int *buf;
6061 EMACS_INT *stop;
6062 {
6063 Lisp_Object val, next;
6064 int id;
6065
6066 val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
6067 if (! NILP (val) && CHARSETP (val))
6068 id = XINT (CHARSET_SYMBOL_ID (val));
6069 else
6070 id = -1;
6071 ADD_CHARSET_DATA (buf, 0, 0, id);
6072 next = Fnext_single_property_change (make_number (pos), Qcharset,
6073 coding->src_object,
6074 make_number (limit));
6075 *stop = XINT (next);
6076 return buf;
6077 }
6078
6079
6080 static void
6081 consume_chars (coding)
6082 struct coding_system *coding;
6083 {
6084 int *buf = coding->charbuf;
6085 int *buf_end = coding->charbuf + coding->charbuf_size;
6086 const unsigned char *src = coding->source + coding->consumed;
6087 const unsigned char *src_end = coding->source + coding->src_bytes;
6088 EMACS_INT pos = coding->src_pos + coding->consumed_char;
6089 EMACS_INT end_pos = coding->src_pos + coding->src_chars;
6090 int multibytep = coding->src_multibyte;
6091 Lisp_Object eol_type;
6092 int c;
6093 EMACS_INT stop, stop_composition, stop_charset;
6094
6095 eol_type = CODING_ID_EOL_TYPE (coding->id);
6096 if (VECTORP (eol_type))
6097 eol_type = Qunix;
6098
6099 /* Note: composition handling is not yet implemented. */
6100 coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
6101
6102 if (NILP (coding->src_object))
6103 stop = stop_composition = stop_charset = end_pos;
6104 else
6105 {
6106 if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
6107 stop = stop_composition = pos;
6108 else
6109 stop = stop_composition = end_pos;
6110 if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
6111 stop = stop_charset = pos;
6112 else
6113 stop_charset = end_pos;
6114 }
6115
6116 /* Compensate for CRLF and conversion. */
6117 buf_end -= 1 + MAX_ANNOTATION_LENGTH;
6118 while (buf < buf_end)
6119 {
6120 if (pos == stop)
6121 {
6122 if (pos == end_pos)
6123 break;
6124 if (pos == stop_composition)
6125 buf = handle_composition_annotation (pos, end_pos, coding,
6126 buf, &stop_composition);
6127 if (pos == stop_charset)
6128 buf = handle_charset_annotation (pos, end_pos, coding,
6129 buf, &stop_charset);
6130 stop = (stop_composition < stop_charset
6131 ? stop_composition : stop_charset);
6132 }
6133
6134 if (! multibytep)
6135 {
6136 EMACS_INT bytes;
6137
6138 if (! CODING_FOR_UNIBYTE (coding)
6139 && (bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
6140 c = STRING_CHAR_ADVANCE (src), pos += bytes;
6141 else
6142 c = *src++, pos++;
6143 }
6144 else
6145 c = STRING_CHAR_ADVANCE (src), pos++;
6146 if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
6147 c = '\n';
6148 if (! EQ (eol_type, Qunix))
6149 {
6150 if (c == '\n')
6151 {
6152 if (EQ (eol_type, Qdos))
6153 *buf++ = '\r';
6154 else
6155 c = '\r';
6156 }
6157 }
6158 *buf++ = c;
6159 }
6160
6161 coding->consumed = src - coding->source;
6162 coding->consumed_char = pos - coding->src_pos;
6163 coding->charbuf_used = buf - coding->charbuf;
6164 coding->chars_at_source = 0;
6165 }
6166
6167
6168 /* Encode the text at CODING->src_object into CODING->dst_object.
6169 CODING->src_object is a buffer or a string.
6170 CODING->dst_object is a buffer or nil.
6171
6172 If CODING->src_object is a buffer, it must be the current buffer.
6173 In this case, if CODING->src_pos is positive, it is a position of
6174 the source text in the buffer, otherwise. the source text is in the
6175 gap area of the buffer, and coding->src_pos specifies the offset of
6176 the text from GPT (which must be the same as PT). If this is the
6177 same buffer as CODING->dst_object, CODING->src_pos must be
6178 negative and CODING should not have `pre-write-conversion'.
6179
6180 If CODING->src_object is a string, CODING should not have
6181 `pre-write-conversion'.
6182
6183 If CODING->dst_object is a buffer, the encoded data is inserted at
6184 the current point of that buffer.
6185
6186 If CODING->dst_object is nil, the encoded data is placed at the
6187 memory area specified by CODING->destination. */
6188
6189 static int
6190 encode_coding (coding)
6191 struct coding_system *coding;
6192 {
6193 Lisp_Object attrs;
6194 Lisp_Object translation_table;
6195
6196 attrs = CODING_ID_ATTRS (coding->id);
6197 translation_table = get_translation_table (attrs, 1);
6198
6199 if (BUFFERP (coding->dst_object))
6200 {
6201 set_buffer_internal (XBUFFER (coding->dst_object));
6202 coding->dst_multibyte
6203 = ! NILP (current_buffer->enable_multibyte_characters);
6204 }
6205
6206 coding->consumed = coding->consumed_char = 0;
6207 coding->produced = coding->produced_char = 0;
6208 record_conversion_result (coding, CODING_RESULT_SUCCESS);
6209 coding->errors = 0;
6210
6211 ALLOC_CONVERSION_WORK_AREA (coding);
6212
6213 do {
6214 coding_set_source (coding);
6215 consume_chars (coding);
6216
6217 if (!NILP (translation_table))
6218 translate_chars (coding, translation_table);
6219
6220 coding_set_destination (coding);
6221 (*(coding->encoder)) (coding);
6222 } while (coding->consumed_char < coding->src_chars);
6223
6224 if (BUFFERP (coding->dst_object))
6225 insert_from_gap (coding->produced_char, coding->produced);
6226
6227 return (coding->result);
6228 }
6229
6230
6231 /* Name (or base name) of work buffer for code conversion. */
6232 static Lisp_Object Vcode_conversion_workbuf_name;
6233
6234 /* A working buffer used by the top level conversion. Once it is
6235 created, it is never destroyed. It has the name
6236 Vcode_conversion_workbuf_name. The other working buffers are
6237 destroyed after the use is finished, and their names are modified
6238 versions of Vcode_conversion_workbuf_name. */
6239 static Lisp_Object Vcode_conversion_reused_workbuf;
6240
6241 /* 1 iff Vcode_conversion_reused_workbuf is already in use. */
6242 static int reused_workbuf_in_use;
6243
6244
6245 /* Return a working buffer of code convesion. MULTIBYTE specifies the
6246 multibyteness of returning buffer. */
6247
6248 Lisp_Object
6249 make_conversion_work_buffer (multibyte)
6250 {
6251 Lisp_Object name, workbuf;
6252 struct buffer *current;
6253
6254 if (reused_workbuf_in_use++)
6255 {
6256 name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
6257 workbuf = Fget_buffer_create (name);
6258 }
6259 else
6260 {
6261 name = Vcode_conversion_workbuf_name;
6262 workbuf = Fget_buffer_create (name);
6263 if (NILP (Vcode_conversion_reused_workbuf))
6264 Vcode_conversion_reused_workbuf = workbuf;
6265 }
6266 current = current_buffer;
6267 set_buffer_internal (XBUFFER (workbuf));
6268 Ferase_buffer ();
6269 current_buffer->undo_list = Qt;
6270 current_buffer->enable_multibyte_characters = multibyte ? Qt : Qnil;
6271 set_buffer_internal (current);
6272 return workbuf;
6273 }
6274
6275
6276 static Lisp_Object
6277 code_conversion_restore (arg)
6278 Lisp_Object arg;
6279 {
6280 Lisp_Object current, workbuf;
6281
6282 current = XCAR (arg);
6283 workbuf = XCDR (arg);
6284 if (! NILP (workbuf))
6285 {
6286 if (EQ (workbuf, Vcode_conversion_reused_workbuf))
6287 reused_workbuf_in_use = 0;
6288 else if (! NILP (Fbuffer_live_p (workbuf)))
6289 Fkill_buffer (workbuf);
6290 }
6291 set_buffer_internal (XBUFFER (current));
6292 return Qnil;
6293 }
6294
6295 Lisp_Object
6296 code_conversion_save (with_work_buf, multibyte)
6297 int with_work_buf, multibyte;
6298 {
6299 Lisp_Object workbuf = Qnil;
6300
6301 if (with_work_buf)
6302 workbuf = make_conversion_work_buffer (multibyte);
6303 record_unwind_protect (code_conversion_restore,
6304 Fcons (Fcurrent_buffer (), workbuf));
6305 return workbuf;
6306 }
6307
6308 int
6309 decode_coding_gap (coding, chars, bytes)
6310 struct coding_system *coding;
6311 EMACS_INT chars, bytes;
6312 {
6313 int count = specpdl_ptr - specpdl;
6314 Lisp_Object attrs;
6315
6316 code_conversion_save (0, 0);
6317
6318 coding->src_object = Fcurrent_buffer ();
6319 coding->src_chars = chars;
6320 coding->src_bytes = bytes;
6321 coding->src_pos = -chars;
6322 coding->src_pos_byte = -bytes;
6323 coding->src_multibyte = chars < bytes;
6324 coding->dst_object = coding->src_object;
6325 coding->dst_pos = PT;
6326 coding->dst_pos_byte = PT_BYTE;
6327 coding->dst_multibyte = ! NILP (current_buffer->enable_multibyte_characters);
6328 coding->mode |= CODING_MODE_LAST_BLOCK;
6329
6330 if (CODING_REQUIRE_DETECTION (coding))
6331 detect_coding (coding);
6332
6333 decode_coding (coding);
6334
6335 attrs = CODING_ID_ATTRS (coding->id);
6336 if (! NILP (CODING_ATTR_POST_READ (attrs)))
6337 {
6338 EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
6339 Lisp_Object val;
6340
6341 TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
6342 val = call1 (CODING_ATTR_POST_READ (attrs),
6343 make_number (coding->produced_char));
6344 CHECK_NATNUM (val);
6345 coding->produced_char += Z - prev_Z;
6346 coding->produced += Z_BYTE - prev_Z_BYTE;
6347 }
6348
6349 unbind_to (count, Qnil);
6350 return coding->result;
6351 }
6352
6353 int
6354 encode_coding_gap (coding, chars, bytes)
6355 struct coding_system *coding;
6356 EMACS_INT chars, bytes;
6357 {
6358 int count = specpdl_ptr - specpdl;
6359
6360 code_conversion_save (0, 0);
6361
6362 coding->src_object = Fcurrent_buffer ();
6363 coding->src_chars = chars;
6364 coding->src_bytes = bytes;
6365 coding->src_pos = -chars;
6366 coding->src_pos_byte = -bytes;
6367 coding->src_multibyte = chars < bytes;
6368 coding->dst_object = coding->src_object;
6369 coding->dst_pos = PT;
6370 coding->dst_pos_byte = PT_BYTE;
6371
6372 encode_coding (coding);
6373
6374 unbind_to (count, Qnil);
6375 return coding->result;
6376 }
6377
6378
6379 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
6380 SRC_OBJECT into DST_OBJECT by coding context CODING.
6381
6382 SRC_OBJECT is a buffer, a string, or Qnil.
6383
6384 If it is a buffer, the text is at point of the buffer. FROM and TO
6385 are positions in the buffer.
6386
6387 If it is a string, the text is at the beginning of the string.
6388 FROM and TO are indices to the string.
6389
6390 If it is nil, the text is at coding->source. FROM and TO are
6391 indices to coding->source.
6392
6393 DST_OBJECT is a buffer, Qt, or Qnil.
6394
6395 If it is a buffer, the decoded text is inserted at point of the
6396 buffer. If the buffer is the same as SRC_OBJECT, the source text
6397 is deleted.
6398
6399 If it is Qt, a string is made from the decoded text, and
6400 set in CODING->dst_object.
6401
6402 If it is Qnil, the decoded text is stored at CODING->destination.
6403 The caller must allocate CODING->dst_bytes bytes at
6404 CODING->destination by xmalloc. If the decoded text is longer than
6405 CODING->dst_bytes, CODING->destination is relocated by xrealloc.
6406 */
6407
6408 void
6409 decode_coding_object (coding, src_object, from, from_byte, to, to_byte,
6410 dst_object)
6411 struct coding_system *coding;
6412 Lisp_Object src_object;
6413 EMACS_INT from, from_byte, to, to_byte;
6414 Lisp_Object dst_object;
6415 {
6416 int count = specpdl_ptr - specpdl;
6417 unsigned char *destination;
6418 EMACS_INT dst_bytes;
6419 EMACS_INT chars = to - from;
6420 EMACS_INT bytes = to_byte - from_byte;
6421 Lisp_Object attrs;
6422 Lisp_Object buffer;
6423 int saved_pt = -1, saved_pt_byte;
6424
6425 buffer = Fcurrent_buffer ();
6426
6427 if (NILP (dst_object))
6428 {
6429 destination = coding->destination;
6430 dst_bytes = coding->dst_bytes;
6431 }
6432
6433 coding->src_object = src_object;
6434 coding->src_chars = chars;
6435 coding->src_bytes = bytes;
6436 coding->src_multibyte = chars < bytes;
6437
6438 if (STRINGP (src_object))
6439 {
6440 coding->src_pos = from;
6441 coding->src_pos_byte = from_byte;
6442 }
6443 else if (BUFFERP (src_object))
6444 {
6445 set_buffer_internal (XBUFFER (src_object));
6446 if (from != GPT)
6447 move_gap_both (from, from_byte);
6448 if (EQ (src_object, dst_object))
6449 {
6450 saved_pt = PT, saved_pt_byte = PT_BYTE;
6451 TEMP_SET_PT_BOTH (from, from_byte);
6452 del_range_both (from, from_byte, to, to_byte, 1);
6453 coding->src_pos = -chars;
6454 coding->src_pos_byte = -bytes;
6455 }
6456 else
6457 {
6458 coding->src_pos = from;
6459 coding->src_pos_byte = from_byte;
6460 }
6461 }
6462
6463 if (CODING_REQUIRE_DETECTION (coding))
6464 detect_coding (coding);
6465 attrs = CODING_ID_ATTRS (coding->id);
6466
6467 if (EQ (dst_object, Qt)
6468 || (! NILP (CODING_ATTR_POST_READ (attrs))
6469 && NILP (dst_object)))
6470 {
6471 coding->dst_object = code_conversion_save (1, 1);
6472 coding->dst_pos = BEG;
6473 coding->dst_pos_byte = BEG_BYTE;
6474 coding->dst_multibyte = 1;
6475 }
6476 else if (BUFFERP (dst_object))
6477 {
6478 code_conversion_save (0, 0);
6479 coding->dst_object = dst_object;
6480 coding->dst_pos = BUF_PT (XBUFFER (dst_object));
6481 coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
6482 coding->dst_multibyte
6483 = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
6484 }
6485 else
6486 {
6487 code_conversion_save (0, 0);
6488 coding->dst_object = Qnil;
6489 coding->dst_multibyte = 1;
6490 }
6491
6492 decode_coding (coding);
6493
6494 if (BUFFERP (coding->dst_object))
6495 set_buffer_internal (XBUFFER (coding->dst_object));
6496
6497 if (! NILP (CODING_ATTR_POST_READ (attrs)))
6498 {
6499 struct gcpro gcpro1, gcpro2;
6500 EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
6501 Lisp_Object val;
6502
6503 TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
6504 GCPRO2 (coding->src_object, coding->dst_object);
6505 val = call1 (CODING_ATTR_POST_READ (attrs),
6506 make_number (coding->produced_char));
6507 UNGCPRO;
6508 CHECK_NATNUM (val);
6509 coding->produced_char += Z - prev_Z;
6510 coding->produced += Z_BYTE - prev_Z_BYTE;
6511 }
6512
6513 if (EQ (dst_object, Qt))
6514 {
6515 coding->dst_object = Fbuffer_string ();
6516 }
6517 else if (NILP (dst_object) && BUFFERP (coding->dst_object))
6518 {
6519 set_buffer_internal (XBUFFER (coding->dst_object));
6520 if (dst_bytes < coding->produced)
6521 {
6522 destination
6523 = (unsigned char *) xrealloc (destination, coding->produced);
6524 if (! destination)
6525 {
6526 record_conversion_result (coding,
6527 CODING_RESULT_INSUFFICIENT_DST);
6528 unbind_to (count, Qnil);
6529 return;
6530 }
6531 if (BEGV < GPT && GPT < BEGV + coding->produced_char)
6532 move_gap_both (BEGV, BEGV_BYTE);
6533 bcopy (BEGV_ADDR, destination, coding->produced);
6534 coding->destination = destination;
6535 }
6536 }
6537
6538 if (saved_pt >= 0)
6539 {
6540 /* This is the case of:
6541 (BUFFERP (src_object) && EQ (src_object, dst_object))
6542 As we have moved PT while replacing the original buffer
6543 contents, we must recover it now. */
6544 set_buffer_internal (XBUFFER (src_object));
6545 if (saved_pt < from)
6546 TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
6547 else if (saved_pt < from + chars)
6548 TEMP_SET_PT_BOTH (from, from_byte);
6549 else if (! NILP (current_buffer->enable_multibyte_characters))
6550 TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
6551 saved_pt_byte + (coding->produced - bytes));
6552 else
6553 TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
6554 saved_pt_byte + (coding->produced - bytes));
6555 }
6556
6557 unbind_to (count, coding->dst_object);
6558 }
6559
6560
6561 void
6562 encode_coding_object (coding, src_object, from, from_byte, to, to_byte,
6563 dst_object)
6564 struct coding_system *coding;
6565 Lisp_Object src_object;
6566 EMACS_INT from, from_byte, to, to_byte;
6567 Lisp_Object dst_object;
6568 {
6569 int count = specpdl_ptr - specpdl;
6570 EMACS_INT chars = to - from;
6571 EMACS_INT bytes = to_byte - from_byte;
6572 Lisp_Object attrs;
6573 Lisp_Object buffer;
6574 int saved_pt = -1, saved_pt_byte;
6575
6576 buffer = Fcurrent_buffer ();
6577
6578 coding->src_object = src_object;
6579 coding->src_chars = chars;
6580 coding->src_bytes = bytes;
6581 coding->src_multibyte = chars < bytes;
6582
6583 attrs = CODING_ID_ATTRS (coding->id);
6584
6585 if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
6586 {
6587 coding->src_object = code_conversion_save (1, coding->src_multibyte);
6588 set_buffer_internal (XBUFFER (coding->src_object));
6589 if (STRINGP (src_object))
6590 insert_from_string (src_object, from, from_byte, chars, bytes, 0);
6591 else if (BUFFERP (src_object))
6592 insert_from_buffer (XBUFFER (src_object), from, chars, 0);
6593 else
6594 insert_1_both (coding->source + from, chars, bytes, 0, 0, 0);
6595
6596 if (EQ (src_object, dst_object))
6597 {
6598 set_buffer_internal (XBUFFER (src_object));
6599 saved_pt = PT, saved_pt_byte = PT_BYTE;
6600 del_range_both (from, from_byte, to, to_byte, 1);
6601 set_buffer_internal (XBUFFER (coding->src_object));
6602 }
6603
6604 call2 (CODING_ATTR_PRE_WRITE (attrs),
6605 make_number (BEG), make_number (Z));
6606 coding->src_object = Fcurrent_buffer ();
6607 if (BEG != GPT)
6608 move_gap_both (BEG, BEG_BYTE);
6609 coding->src_chars = Z - BEG;
6610 coding->src_bytes = Z_BYTE - BEG_BYTE;
6611 coding->src_pos = BEG;
6612 coding->src_pos_byte = BEG_BYTE;
6613 coding->src_multibyte = Z < Z_BYTE;
6614 }
6615 else if (STRINGP (src_object))
6616 {
6617 code_conversion_save (0, 0);
6618 coding->src_pos = from;
6619 coding->src_pos_byte = from_byte;
6620 }
6621 else if (BUFFERP (src_object))
6622 {
6623 code_conversion_save (0, 0);
6624 set_buffer_internal (XBUFFER (src_object));
6625 if (EQ (src_object, dst_object))
6626 {
6627 saved_pt = PT, saved_pt_byte = PT_BYTE;
6628 coding->src_object = del_range_1 (from, to, 1, 1);
6629 coding->src_pos = 0;
6630 coding->src_pos_byte = 0;
6631 }
6632 else
6633 {
6634 if (from < GPT && to >= GPT)
6635 move_gap_both (from, from_byte);
6636 coding->src_pos = from;
6637 coding->src_pos_byte = from_byte;
6638 }
6639 }
6640 else
6641 code_conversion_save (0, 0);
6642
6643 if (BUFFERP (dst_object))
6644 {
6645 coding->dst_object = dst_object;
6646 if (EQ (src_object, dst_object))
6647 {
6648 coding->dst_pos = from;
6649 coding->dst_pos_byte = from_byte;
6650 }
6651 else
6652 {
6653 coding->dst_pos = BUF_PT (XBUFFER (dst_object));
6654 coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
6655 }
6656 coding->dst_multibyte
6657 = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
6658 }
6659 else if (EQ (dst_object, Qt))
6660 {
6661 coding->dst_object = Qnil;
6662 coding->dst_bytes = coding->src_chars;
6663 if (coding->dst_bytes == 0)
6664 coding->dst_bytes = 1;
6665 coding->destination = (unsigned char *) xmalloc (coding->dst_bytes);
6666 coding->dst_multibyte = 0;
6667 }
6668 else
6669 {
6670 coding->dst_object = Qnil;
6671 coding->dst_multibyte = 0;
6672 }
6673
6674 encode_coding (coding);
6675
6676 if (EQ (dst_object, Qt))
6677 {
6678 if (BUFFERP (coding->dst_object))
6679 coding->dst_object = Fbuffer_string ();
6680 else
6681 {
6682 coding->dst_object
6683 = make_unibyte_string ((char *) coding->destination,
6684 coding->produced);
6685 xfree (coding->destination);
6686 }
6687 }
6688
6689 if (saved_pt >= 0)
6690 {
6691 /* This is the case of:
6692 (BUFFERP (src_object) && EQ (src_object, dst_object))
6693 As we have moved PT while replacing the original buffer
6694 contents, we must recover it now. */
6695 set_buffer_internal (XBUFFER (src_object));
6696 if (saved_pt < from)
6697 TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
6698 else if (saved_pt < from + chars)
6699 TEMP_SET_PT_BOTH (from, from_byte);
6700 else if (! NILP (current_buffer->enable_multibyte_characters))
6701 TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
6702 saved_pt_byte + (coding->produced - bytes));
6703 else
6704 TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
6705 saved_pt_byte + (coding->produced - bytes));
6706 }
6707
6708 unbind_to (count, Qnil);
6709 }
6710
6711
6712 Lisp_Object
6713 preferred_coding_system ()
6714 {
6715 int id = coding_categories[coding_priorities[0]].id;
6716
6717 return CODING_ID_NAME (id);
6718 }
6719
6720 \f
6721 #ifdef emacs
6722 /*** 8. Emacs Lisp library functions ***/
6723
6724 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
6725 doc: /* Return t if OBJECT is nil or a coding-system.
6726 See the documentation of `define-coding-system' for information
6727 about coding-system objects. */)
6728 (obj)
6729 Lisp_Object obj;
6730 {
6731 return ((NILP (obj) || CODING_SYSTEM_P (obj)) ? Qt : Qnil);
6732 }
6733
6734 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
6735 Sread_non_nil_coding_system, 1, 1, 0,
6736 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT. */)
6737 (prompt)
6738 Lisp_Object prompt;
6739 {
6740 Lisp_Object val;
6741 do
6742 {
6743 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6744 Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
6745 }
6746 while (SCHARS (val) == 0);
6747 return (Fintern (val, Qnil));
6748 }
6749
6750 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
6751 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
6752 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM. */)
6753 (prompt, default_coding_system)
6754 Lisp_Object prompt, default_coding_system;
6755 {
6756 Lisp_Object val;
6757 if (SYMBOLP (default_coding_system))
6758 XSETSTRING (default_coding_system, XPNTR (SYMBOL_NAME (default_coding_system)));
6759 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6760 Qt, Qnil, Qcoding_system_history,
6761 default_coding_system, Qnil);
6762 return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
6763 }
6764
6765 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
6766 1, 1, 0,
6767 doc: /* Check validity of CODING-SYSTEM.
6768 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error. */)
6769 (coding_system)
6770 Lisp_Object coding_system;
6771 {
6772 CHECK_SYMBOL (coding_system);
6773 if (!NILP (Fcoding_system_p (coding_system)))
6774 return coding_system;
6775 while (1)
6776 Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
6777 }
6778
6779 \f
6780 /* Detect how the bytes at SRC of length SRC_BYTES are encoded. If
6781 HIGHEST is nonzero, return the coding system of the highest
6782 priority among the detected coding systems. Otherwize return a
6783 list of detected coding systems sorted by their priorities. If
6784 MULTIBYTEP is nonzero, it is assumed that the bytes are in correct
6785 multibyte form but contains only ASCII and eight-bit chars.
6786 Otherwise, the bytes are raw bytes.
6787
6788 CODING-SYSTEM controls the detection as below:
6789
6790 If it is nil, detect both text-format and eol-format. If the
6791 text-format part of CODING-SYSTEM is already specified
6792 (e.g. `iso-latin-1'), detect only eol-format. If the eol-format
6793 part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
6794 detect only text-format. */
6795
6796 Lisp_Object
6797 detect_coding_system (src, src_chars, src_bytes, highest, multibytep,
6798 coding_system)
6799 const unsigned char *src;
6800 int src_chars, src_bytes, highest;
6801 int multibytep;
6802 Lisp_Object coding_system;
6803 {
6804 const unsigned char *src_end = src + src_bytes;
6805 Lisp_Object attrs, eol_type;
6806 Lisp_Object val;
6807 struct coding_system coding;
6808 int id;
6809 struct coding_detection_info detect_info;
6810 enum coding_category base_category;
6811
6812 if (NILP (coding_system))
6813 coding_system = Qundecided;
6814 setup_coding_system (coding_system, &coding);
6815 attrs = CODING_ID_ATTRS (coding.id);
6816 eol_type = CODING_ID_EOL_TYPE (coding.id);
6817 coding_system = CODING_ATTR_BASE_NAME (attrs);
6818
6819 coding.source = src;
6820 coding.src_chars = src_chars;
6821 coding.src_bytes = src_bytes;
6822 coding.src_multibyte = multibytep;
6823 coding.consumed = 0;
6824 coding.mode |= CODING_MODE_LAST_BLOCK;
6825
6826 detect_info.checked = detect_info.found = detect_info.rejected = 0;
6827
6828 /* At first, detect text-format if necessary. */
6829 base_category = XINT (CODING_ATTR_CATEGORY (attrs));
6830 if (base_category == coding_category_undecided)
6831 {
6832 enum coding_category category;
6833 struct coding_system *this;
6834 int c, i;
6835
6836 /* Skip all ASCII bytes except for a few ISO2022 controls. */
6837 for (i = 0; src < src_end; i++, src++)
6838 {
6839 c = *src;
6840 if (c & 0x80 || (c < 0x20 && (c == 0
6841 || c == ISO_CODE_ESC
6842 || c == ISO_CODE_SI
6843 || c == ISO_CODE_SO)))
6844 break;
6845 }
6846 coding.head_ascii = src - coding.source;
6847
6848 if (src < src_end)
6849 for (i = 0; i < coding_category_raw_text; i++)
6850 {
6851 category = coding_priorities[i];
6852 this = coding_categories + category;
6853
6854 if (this->id < 0)
6855 {
6856 /* No coding system of this category is defined. */
6857 detect_info.rejected |= (1 << category);
6858 }
6859 else if (category >= coding_category_raw_text)
6860 continue;
6861 else if (detect_info.checked & (1 << category))
6862 {
6863 if (highest
6864 && (detect_info.found & (1 << category)))
6865 break;
6866 }
6867 else
6868 {
6869 if ((*(this->detector)) (&coding, &detect_info)
6870 && highest
6871 && (detect_info.found & (1 << category)))
6872 {
6873 if (category == coding_category_utf_16_auto)
6874 {
6875 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6876 category = coding_category_utf_16_le;
6877 else
6878 category = coding_category_utf_16_be;
6879 }
6880 break;
6881 }
6882 }
6883 }
6884
6885 if (detect_info.rejected == CATEGORY_MASK_ANY)
6886 {
6887 detect_info.found = CATEGORY_MASK_RAW_TEXT;
6888 id = coding_categories[coding_category_raw_text].id;
6889 val = Fcons (make_number (id), Qnil);
6890 }
6891 else if (! detect_info.rejected && ! detect_info.found)
6892 {
6893 detect_info.found = CATEGORY_MASK_ANY;
6894 id = coding_categories[coding_category_undecided].id;
6895 val = Fcons (make_number (id), Qnil);
6896 }
6897 else if (highest)
6898 {
6899 if (detect_info.found)
6900 {
6901 detect_info.found = 1 << category;
6902 val = Fcons (make_number (this->id), Qnil);
6903 }
6904 else
6905 for (i = 0; i < coding_category_raw_text; i++)
6906 if (! (detect_info.rejected & (1 << coding_priorities[i])))
6907 {
6908 detect_info.found = 1 << coding_priorities[i];
6909 id = coding_categories[coding_priorities[i]].id;
6910 val = Fcons (make_number (id), Qnil);
6911 break;
6912 }
6913 }
6914 else
6915 {
6916 int mask = detect_info.rejected | detect_info.found;
6917 int found = 0;
6918 val = Qnil;
6919
6920 for (i = coding_category_raw_text - 1; i >= 0; i--)
6921 {
6922 category = coding_priorities[i];
6923 if (! (mask & (1 << category)))
6924 {
6925 found |= 1 << category;
6926 id = coding_categories[category].id;
6927 val = Fcons (make_number (id), val);
6928 }
6929 }
6930 for (i = coding_category_raw_text - 1; i >= 0; i--)
6931 {
6932 category = coding_priorities[i];
6933 if (detect_info.found & (1 << category))
6934 {
6935 id = coding_categories[category].id;
6936 val = Fcons (make_number (id), val);
6937 }
6938 }
6939 detect_info.found |= found;
6940 }
6941 }
6942 else if (base_category == coding_category_utf_16_auto)
6943 {
6944 if (detect_coding_utf_16 (&coding, &detect_info))
6945 {
6946 enum coding_category category;
6947 struct coding_system *this;
6948
6949 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6950 this = coding_categories + coding_category_utf_16_le;
6951 else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6952 this = coding_categories + coding_category_utf_16_be;
6953 else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
6954 this = coding_categories + coding_category_utf_16_be_nosig;
6955 else
6956 this = coding_categories + coding_category_utf_16_le_nosig;
6957 val = Fcons (make_number (this->id), Qnil);
6958 }
6959 }
6960 else
6961 {
6962 detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
6963 val = Fcons (make_number (coding.id), Qnil);
6964 }
6965
6966 /* Then, detect eol-format if necessary. */
6967 {
6968 int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol;
6969 Lisp_Object tail;
6970
6971 if (VECTORP (eol_type))
6972 {
6973 if (detect_info.found & ~CATEGORY_MASK_UTF_16)
6974 normal_eol = detect_eol (coding.source, src_bytes,
6975 coding_category_raw_text);
6976 if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
6977 | CATEGORY_MASK_UTF_16_BE_NOSIG))
6978 utf_16_be_eol = detect_eol (coding.source, src_bytes,
6979 coding_category_utf_16_be);
6980 if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
6981 | CATEGORY_MASK_UTF_16_LE_NOSIG))
6982 utf_16_le_eol = detect_eol (coding.source, src_bytes,
6983 coding_category_utf_16_le);
6984 }
6985 else
6986 {
6987 if (EQ (eol_type, Qunix))
6988 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
6989 else if (EQ (eol_type, Qdos))
6990 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
6991 else
6992 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
6993 }
6994
6995 for (tail = val; CONSP (tail); tail = XCDR (tail))
6996 {
6997 enum coding_category category;
6998 int this_eol;
6999
7000 id = XINT (XCAR (tail));
7001 attrs = CODING_ID_ATTRS (id);
7002 category = XINT (CODING_ATTR_CATEGORY (attrs));
7003 eol_type = CODING_ID_EOL_TYPE (id);
7004 if (VECTORP (eol_type))
7005 {
7006 if (category == coding_category_utf_16_be
7007 || category == coding_category_utf_16_be_nosig)
7008 this_eol = utf_16_be_eol;
7009 else if (category == coding_category_utf_16_le
7010 || category == coding_category_utf_16_le_nosig)
7011 this_eol = utf_16_le_eol;
7012 else
7013 this_eol = normal_eol;
7014
7015 if (this_eol == EOL_SEEN_LF)
7016 XSETCAR (tail, AREF (eol_type, 0));
7017 else if (this_eol == EOL_SEEN_CRLF)
7018 XSETCAR (tail, AREF (eol_type, 1));
7019 else if (this_eol == EOL_SEEN_CR)
7020 XSETCAR (tail, AREF (eol_type, 2));
7021 else
7022 XSETCAR (tail, CODING_ID_NAME (id));
7023 }
7024 else
7025 XSETCAR (tail, CODING_ID_NAME (id));
7026 }
7027 }
7028
7029 return (highest ? XCAR (val) : val);
7030 }
7031
7032
7033 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
7034 2, 3, 0,
7035 doc: /* Detect coding system of the text in the region between START and END.
7036 Return a list of possible coding systems ordered by priority.
7037
7038 If only ASCII characters are found, it returns a list of single element
7039 `undecided' or its subsidiary coding system according to a detected
7040 end-of-line format.
7041
7042 If optional argument HIGHEST is non-nil, return the coding system of
7043 highest priority. */)
7044 (start, end, highest)
7045 Lisp_Object start, end, highest;
7046 {
7047 int from, to;
7048 int from_byte, to_byte;
7049
7050 CHECK_NUMBER_COERCE_MARKER (start);
7051 CHECK_NUMBER_COERCE_MARKER (end);
7052
7053 validate_region (&start, &end);
7054 from = XINT (start), to = XINT (end);
7055 from_byte = CHAR_TO_BYTE (from);
7056 to_byte = CHAR_TO_BYTE (to);
7057
7058 if (from < GPT && to >= GPT)
7059 move_gap_both (to, to_byte);
7060
7061 return detect_coding_system (BYTE_POS_ADDR (from_byte),
7062 to - from, to_byte - from_byte,
7063 !NILP (highest),
7064 !NILP (current_buffer
7065 ->enable_multibyte_characters),
7066 Qnil);
7067 }
7068
7069 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
7070 1, 2, 0,
7071 doc: /* Detect coding system of the text in STRING.
7072 Return a list of possible coding systems ordered by priority.
7073
7074 If only ASCII characters are found, it returns a list of single element
7075 `undecided' or its subsidiary coding system according to a detected
7076 end-of-line format.
7077
7078 If optional argument HIGHEST is non-nil, return the coding system of
7079 highest priority. */)
7080 (string, highest)
7081 Lisp_Object string, highest;
7082 {
7083 CHECK_STRING (string);
7084
7085 return detect_coding_system (SDATA (string),
7086 SCHARS (string), SBYTES (string),
7087 !NILP (highest), STRING_MULTIBYTE (string),
7088 Qnil);
7089 }
7090
7091
7092 static INLINE int
7093 char_encodable_p (c, attrs)
7094 int c;
7095 Lisp_Object attrs;
7096 {
7097 Lisp_Object tail;
7098 struct charset *charset;
7099 Lisp_Object translation_table;
7100
7101 translation_table = CODING_ATTR_TRANS_TBL (attrs);
7102 if (CHAR_TABLE_P (translation_table))
7103 c = translate_char (translation_table, c);
7104 for (tail = CODING_ATTR_CHARSET_LIST (attrs);
7105 CONSP (tail); tail = XCDR (tail))
7106 {
7107 charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
7108 if (CHAR_CHARSET_P (c, charset))
7109 break;
7110 }
7111 return (! NILP (tail));
7112 }
7113
7114
7115 /* Return a list of coding systems that safely encode the text between
7116 START and END. If EXCLUDE is non-nil, it is a list of coding
7117 systems not to check. The returned list doesn't contain any such
7118 coding systems. In any case, if the text contains only ASCII or is
7119 unibyte, return t. */
7120
7121 DEFUN ("find-coding-systems-region-internal",
7122 Ffind_coding_systems_region_internal,
7123 Sfind_coding_systems_region_internal, 2, 3, 0,
7124 doc: /* Internal use only. */)
7125 (start, end, exclude)
7126 Lisp_Object start, end, exclude;
7127 {
7128 Lisp_Object coding_attrs_list, safe_codings;
7129 EMACS_INT start_byte, end_byte;
7130 const unsigned char *p, *pbeg, *pend;
7131 int c;
7132 Lisp_Object tail, elt;
7133
7134 if (STRINGP (start))
7135 {
7136 if (!STRING_MULTIBYTE (start)
7137 || SCHARS (start) == SBYTES (start))
7138 return Qt;
7139 start_byte = 0;
7140 end_byte = SBYTES (start);
7141 }
7142 else
7143 {
7144 CHECK_NUMBER_COERCE_MARKER (start);
7145 CHECK_NUMBER_COERCE_MARKER (end);
7146 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
7147 args_out_of_range (start, end);
7148 if (NILP (current_buffer->enable_multibyte_characters))
7149 return Qt;
7150 start_byte = CHAR_TO_BYTE (XINT (start));
7151 end_byte = CHAR_TO_BYTE (XINT (end));
7152 if (XINT (end) - XINT (start) == end_byte - start_byte)
7153 return Qt;
7154
7155 if (XINT (start) < GPT && XINT (end) > GPT)
7156 {
7157 if ((GPT - XINT (start)) < (XINT (end) - GPT))
7158 move_gap_both (XINT (start), start_byte);
7159 else
7160 move_gap_both (XINT (end), end_byte);
7161 }
7162 }
7163
7164 coding_attrs_list = Qnil;
7165 for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
7166 if (NILP (exclude)
7167 || NILP (Fmemq (XCAR (tail), exclude)))
7168 {
7169 Lisp_Object attrs;
7170
7171 attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
7172 if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs))
7173 && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
7174 {
7175 ASET (attrs, coding_attr_trans_tbl,
7176 get_translation_table (attrs, 1));
7177 coding_attrs_list = Fcons (attrs, coding_attrs_list);
7178 }
7179 }
7180
7181 if (STRINGP (start))
7182 p = pbeg = SDATA (start);
7183 else
7184 p = pbeg = BYTE_POS_ADDR (start_byte);
7185 pend = p + (end_byte - start_byte);
7186
7187 while (p < pend && ASCII_BYTE_P (*p)) p++;
7188 while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
7189
7190 while (p < pend)
7191 {
7192 if (ASCII_BYTE_P (*p))
7193 p++;
7194 else
7195 {
7196 c = STRING_CHAR_ADVANCE (p);
7197
7198 charset_map_loaded = 0;
7199 for (tail = coding_attrs_list; CONSP (tail);)
7200 {
7201 elt = XCAR (tail);
7202 if (NILP (elt))
7203 tail = XCDR (tail);
7204 else if (char_encodable_p (c, elt))
7205 tail = XCDR (tail);
7206 else if (CONSP (XCDR (tail)))
7207 {
7208 XSETCAR (tail, XCAR (XCDR (tail)));
7209 XSETCDR (tail, XCDR (XCDR (tail)));
7210 }
7211 else
7212 {
7213 XSETCAR (tail, Qnil);
7214 tail = XCDR (tail);
7215 }
7216 }
7217 if (charset_map_loaded)
7218 {
7219 EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
7220
7221 if (STRINGP (start))
7222 pbeg = SDATA (start);
7223 else
7224 pbeg = BYTE_POS_ADDR (start_byte);
7225 p = pbeg + p_offset;
7226 pend = pbeg + pend_offset;
7227 }
7228 }
7229 }
7230
7231 safe_codings = Qnil;
7232 for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
7233 if (! NILP (XCAR (tail)))
7234 safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
7235
7236 return safe_codings;
7237 }
7238
7239
7240 DEFUN ("unencodable-char-position", Funencodable_char_position,
7241 Sunencodable_char_position, 3, 5, 0,
7242 doc: /*
7243 Return position of first un-encodable character in a region.
7244 START and END specfiy the region and CODING-SYSTEM specifies the
7245 encoding to check. Return nil if CODING-SYSTEM does encode the region.
7246
7247 If optional 4th argument COUNT is non-nil, it specifies at most how
7248 many un-encodable characters to search. In this case, the value is a
7249 list of positions.
7250
7251 If optional 5th argument STRING is non-nil, it is a string to search
7252 for un-encodable characters. In that case, START and END are indexes
7253 to the string. */)
7254 (start, end, coding_system, count, string)
7255 Lisp_Object start, end, coding_system, count, string;
7256 {
7257 int n;
7258 struct coding_system coding;
7259 Lisp_Object attrs, charset_list, translation_table;
7260 Lisp_Object positions;
7261 int from, to;
7262 const unsigned char *p, *stop, *pend;
7263 int ascii_compatible;
7264
7265 setup_coding_system (Fcheck_coding_system (coding_system), &coding);
7266 attrs = CODING_ID_ATTRS (coding.id);
7267 if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
7268 return Qnil;
7269 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
7270 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
7271 translation_table = get_translation_table (attrs, 1);
7272
7273 if (NILP (string))
7274 {
7275 validate_region (&start, &end);
7276 from = XINT (start);
7277 to = XINT (end);
7278 if (NILP (current_buffer->enable_multibyte_characters)
7279 || (ascii_compatible
7280 && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
7281 return Qnil;
7282 p = CHAR_POS_ADDR (from);
7283 pend = CHAR_POS_ADDR (to);
7284 if (from < GPT && to >= GPT)
7285 stop = GPT_ADDR;
7286 else
7287 stop = pend;
7288 }
7289 else
7290 {
7291 CHECK_STRING (string);
7292 CHECK_NATNUM (start);
7293 CHECK_NATNUM (end);
7294 from = XINT (start);
7295 to = XINT (end);
7296 if (from > to
7297 || to > SCHARS (string))
7298 args_out_of_range_3 (string, start, end);
7299 if (! STRING_MULTIBYTE (string))
7300 return Qnil;
7301 p = SDATA (string) + string_char_to_byte (string, from);
7302 stop = pend = SDATA (string) + string_char_to_byte (string, to);
7303 if (ascii_compatible && (to - from) == (pend - p))
7304 return Qnil;
7305 }
7306
7307 if (NILP (count))
7308 n = 1;
7309 else
7310 {
7311 CHECK_NATNUM (count);
7312 n = XINT (count);
7313 }
7314
7315 positions = Qnil;
7316 while (1)
7317 {
7318 int c;
7319
7320 if (ascii_compatible)
7321 while (p < stop && ASCII_BYTE_P (*p))
7322 p++, from++;
7323 if (p >= stop)
7324 {
7325 if (p >= pend)
7326 break;
7327 stop = pend;
7328 p = GAP_END_ADDR;
7329 }
7330
7331 c = STRING_CHAR_ADVANCE (p);
7332 if (! (ASCII_CHAR_P (c) && ascii_compatible)
7333 && ! char_charset (translate_char (translation_table, c),
7334 charset_list, NULL))
7335 {
7336 positions = Fcons (make_number (from), positions);
7337 n--;
7338 if (n == 0)
7339 break;
7340 }
7341
7342 from++;
7343 }
7344
7345 return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
7346 }
7347
7348
7349 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
7350 Scheck_coding_systems_region, 3, 3, 0,
7351 doc: /* Check if the region is encodable by coding systems.
7352
7353 START and END are buffer positions specifying the region.
7354 CODING-SYSTEM-LIST is a list of coding systems to check.
7355
7356 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
7357 CODING-SYSTEM is a member of CODING-SYSTEM-LIst and can't encode the
7358 whole region, POS0, POS1, ... are buffer positions where non-encodable
7359 characters are found.
7360
7361 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
7362 value is nil.
7363
7364 START may be a string. In that case, check if the string is
7365 encodable, and the value contains indices to the string instead of
7366 buffer positions. END is ignored. */)
7367 (start, end, coding_system_list)
7368 Lisp_Object start, end, coding_system_list;
7369 {
7370 Lisp_Object list;
7371 EMACS_INT start_byte, end_byte;
7372 int pos;
7373 const unsigned char *p, *pbeg, *pend;
7374 int c;
7375 Lisp_Object tail, elt, attrs;
7376
7377 if (STRINGP (start))
7378 {
7379 if (!STRING_MULTIBYTE (start)
7380 && SCHARS (start) != SBYTES (start))
7381 return Qnil;
7382 start_byte = 0;
7383 end_byte = SBYTES (start);
7384 pos = 0;
7385 }
7386 else
7387 {
7388 CHECK_NUMBER_COERCE_MARKER (start);
7389 CHECK_NUMBER_COERCE_MARKER (end);
7390 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
7391 args_out_of_range (start, end);
7392 if (NILP (current_buffer->enable_multibyte_characters))
7393 return Qnil;
7394 start_byte = CHAR_TO_BYTE (XINT (start));
7395 end_byte = CHAR_TO_BYTE (XINT (end));
7396 if (XINT (end) - XINT (start) == end_byte - start_byte)
7397 return Qt;
7398
7399 if (XINT (start) < GPT && XINT (end) > GPT)
7400 {
7401 if ((GPT - XINT (start)) < (XINT (end) - GPT))
7402 move_gap_both (XINT (start), start_byte);
7403 else
7404 move_gap_both (XINT (end), end_byte);
7405 }
7406 pos = XINT (start);
7407 }
7408
7409 list = Qnil;
7410 for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
7411 {
7412 elt = XCAR (tail);
7413 attrs = AREF (CODING_SYSTEM_SPEC (elt), 0);
7414 ASET (attrs, coding_attr_trans_tbl, get_translation_table (attrs, 1));
7415 list = Fcons (Fcons (elt, Fcons (attrs, Qnil)), list);
7416 }
7417
7418 if (STRINGP (start))
7419 p = pbeg = SDATA (start);
7420 else
7421 p = pbeg = BYTE_POS_ADDR (start_byte);
7422 pend = p + (end_byte - start_byte);
7423
7424 while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
7425 while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
7426
7427 while (p < pend)
7428 {
7429 if (ASCII_BYTE_P (*p))
7430 p++;
7431 else
7432 {
7433 c = STRING_CHAR_ADVANCE (p);
7434
7435 charset_map_loaded = 0;
7436 for (tail = list; CONSP (tail); tail = XCDR (tail))
7437 {
7438 elt = XCDR (XCAR (tail));
7439 if (! char_encodable_p (c, XCAR (elt)))
7440 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
7441 }
7442 if (charset_map_loaded)
7443 {
7444 EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
7445
7446 if (STRINGP (start))
7447 pbeg = SDATA (start);
7448 else
7449 pbeg = BYTE_POS_ADDR (start_byte);
7450 p = pbeg + p_offset;
7451 pend = pbeg + pend_offset;
7452 }
7453 }
7454 pos++;
7455 }
7456
7457 tail = list;
7458 list = Qnil;
7459 for (; CONSP (tail); tail = XCDR (tail))
7460 {
7461 elt = XCAR (tail);
7462 if (CONSP (XCDR (XCDR (elt))))
7463 list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
7464 list);
7465 }
7466
7467 return list;
7468 }
7469
7470
7471 Lisp_Object
7472 code_convert_region (start, end, coding_system, dst_object, encodep, norecord)
7473 Lisp_Object start, end, coding_system, dst_object;
7474 int encodep, norecord;
7475 {
7476 struct coding_system coding;
7477 EMACS_INT from, from_byte, to, to_byte;
7478 Lisp_Object src_object;
7479
7480 CHECK_NUMBER_COERCE_MARKER (start);
7481 CHECK_NUMBER_COERCE_MARKER (end);
7482 if (NILP (coding_system))
7483 coding_system = Qno_conversion;
7484 else
7485 CHECK_CODING_SYSTEM (coding_system);
7486 src_object = Fcurrent_buffer ();
7487 if (NILP (dst_object))
7488 dst_object = src_object;
7489 else if (! EQ (dst_object, Qt))
7490 CHECK_BUFFER (dst_object);
7491
7492 validate_region (&start, &end);
7493 from = XFASTINT (start);
7494 from_byte = CHAR_TO_BYTE (from);
7495 to = XFASTINT (end);
7496 to_byte = CHAR_TO_BYTE (to);
7497
7498 setup_coding_system (coding_system, &coding);
7499 coding.mode |= CODING_MODE_LAST_BLOCK;
7500
7501 if (encodep)
7502 encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
7503 dst_object);
7504 else
7505 decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
7506 dst_object);
7507 if (! norecord)
7508 Vlast_coding_system_used = CODING_ID_NAME (coding.id);
7509
7510 return (BUFFERP (dst_object)
7511 ? make_number (coding.produced_char)
7512 : coding.dst_object);
7513 }
7514
7515
7516 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
7517 3, 4, "r\nzCoding system: ",
7518 doc: /* Decode the current region from the specified coding system.
7519 When called from a program, takes four arguments:
7520 START, END, CODING-SYSTEM, and DESTINATION.
7521 START and END are buffer positions.
7522
7523 Optional 4th arguments DESTINATION specifies where the decoded text goes.
7524 If nil, the region between START and END is replace by the decoded text.
7525 If buffer, the decoded text is inserted in the buffer.
7526 If t, the decoded text is returned.
7527
7528 This function sets `last-coding-system-used' to the precise coding system
7529 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7530 not fully specified.)
7531 It returns the length of the decoded text. */)
7532 (start, end, coding_system, destination)
7533 Lisp_Object start, end, coding_system, destination;
7534 {
7535 return code_convert_region (start, end, coding_system, destination, 0, 0);
7536 }
7537
7538 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
7539 3, 4, "r\nzCoding system: ",
7540 doc: /* Encode the current region by specified coding system.
7541 When called from a program, takes three arguments:
7542 START, END, and CODING-SYSTEM. START and END are buffer positions.
7543
7544 Optional 4th arguments DESTINATION specifies where the encoded text goes.
7545 If nil, the region between START and END is replace by the encoded text.
7546 If buffer, the encoded text is inserted in the buffer.
7547 If t, the encoded text is returned.
7548
7549 This function sets `last-coding-system-used' to the precise coding system
7550 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7551 not fully specified.)
7552 It returns the length of the encoded text. */)
7553 (start, end, coding_system, destination)
7554 Lisp_Object start, end, coding_system, destination;
7555 {
7556 return code_convert_region (start, end, coding_system, destination, 1, 0);
7557 }
7558
7559 Lisp_Object
7560 code_convert_string (string, coding_system, dst_object,
7561 encodep, nocopy, norecord)
7562 Lisp_Object string, coding_system, dst_object;
7563 int encodep, nocopy, norecord;
7564 {
7565 struct coding_system coding;
7566 EMACS_INT chars, bytes;
7567
7568 CHECK_STRING (string);
7569 if (NILP (coding_system))
7570 {
7571 if (! norecord)
7572 Vlast_coding_system_used = Qno_conversion;
7573 if (NILP (dst_object))
7574 return (nocopy ? Fcopy_sequence (string) : string);
7575 }
7576
7577 if (NILP (coding_system))
7578 coding_system = Qno_conversion;
7579 else
7580 CHECK_CODING_SYSTEM (coding_system);
7581 if (NILP (dst_object))
7582 dst_object = Qt;
7583 else if (! EQ (dst_object, Qt))
7584 CHECK_BUFFER (dst_object);
7585
7586 setup_coding_system (coding_system, &coding);
7587 coding.mode |= CODING_MODE_LAST_BLOCK;
7588 chars = SCHARS (string);
7589 bytes = SBYTES (string);
7590 if (encodep)
7591 encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
7592 else
7593 decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
7594 if (! norecord)
7595 Vlast_coding_system_used = CODING_ID_NAME (coding.id);
7596
7597 return (BUFFERP (dst_object)
7598 ? make_number (coding.produced_char)
7599 : coding.dst_object);
7600 }
7601
7602
7603 /* Encode or decode STRING according to CODING_SYSTEM.
7604 Do not set Vlast_coding_system_used.
7605
7606 This function is called only from macros DECODE_FILE and
7607 ENCODE_FILE, thus we ignore character composition. */
7608
7609 Lisp_Object
7610 code_convert_string_norecord (string, coding_system, encodep)
7611 Lisp_Object string, coding_system;
7612 int encodep;
7613 {
7614 return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
7615 }
7616
7617
7618 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
7619 2, 4, 0,
7620 doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
7621
7622 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
7623 if the decoding operation is trivial.
7624
7625 Optional fourth arg BUFFER non-nil meant that the decoded text is
7626 inserted in BUFFER instead of returned as a string. In this case,
7627 the return value is BUFFER.
7628
7629 This function sets `last-coding-system-used' to the precise coding system
7630 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7631 not fully specified. */)
7632 (string, coding_system, nocopy, buffer)
7633 Lisp_Object string, coding_system, nocopy, buffer;
7634 {
7635 return code_convert_string (string, coding_system, buffer,
7636 0, ! NILP (nocopy), 0);
7637 }
7638
7639 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
7640 2, 4, 0,
7641 doc: /* Encode STRING to CODING-SYSTEM, and return the result.
7642
7643 Optional third arg NOCOPY non-nil means it is OK to return STRING
7644 itself if the encoding operation is trivial.
7645
7646 Optional fourth arg BUFFER non-nil meant that the encoded text is
7647 inserted in BUFFER instead of returned as a string. In this case,
7648 the return value is BUFFER.
7649
7650 This function sets `last-coding-system-used' to the precise coding system
7651 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7652 not fully specified.) */)
7653 (string, coding_system, nocopy, buffer)
7654 Lisp_Object string, coding_system, nocopy, buffer;
7655 {
7656 return code_convert_string (string, coding_system, buffer,
7657 1, ! NILP (nocopy), 1);
7658 }
7659
7660 \f
7661 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
7662 doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
7663 Return the corresponding character. */)
7664 (code)
7665 Lisp_Object code;
7666 {
7667 Lisp_Object spec, attrs, val;
7668 struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
7669 int c;
7670
7671 CHECK_NATNUM (code);
7672 c = XFASTINT (code);
7673 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
7674 attrs = AREF (spec, 0);
7675
7676 if (ASCII_BYTE_P (c)
7677 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
7678 return code;
7679
7680 val = CODING_ATTR_CHARSET_LIST (attrs);
7681 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
7682 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
7683 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
7684
7685 if (c <= 0x7F)
7686 charset = charset_roman;
7687 else if (c >= 0xA0 && c < 0xDF)
7688 {
7689 charset = charset_kana;
7690 c -= 0x80;
7691 }
7692 else
7693 {
7694 int s1 = c >> 8, s2 = c & 0xFF;
7695
7696 if (s1 < 0x81 || (s1 > 0x9F && s1 < 0xE0) || s1 > 0xEF
7697 || s2 < 0x40 || s2 == 0x7F || s2 > 0xFC)
7698 error ("Invalid code: %d", code);
7699 SJIS_TO_JIS (c);
7700 charset = charset_kanji;
7701 }
7702 c = DECODE_CHAR (charset, c);
7703 if (c < 0)
7704 error ("Invalid code: %d", code);
7705 return make_number (c);
7706 }
7707
7708
7709 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
7710 doc: /* Encode a Japanese character CHAR to shift_jis encoding.
7711 Return the corresponding code in SJIS. */)
7712 (ch)
7713 Lisp_Object ch;
7714 {
7715 Lisp_Object spec, attrs, charset_list;
7716 int c;
7717 struct charset *charset;
7718 unsigned code;
7719
7720 CHECK_CHARACTER (ch);
7721 c = XFASTINT (ch);
7722 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
7723 attrs = AREF (spec, 0);
7724
7725 if (ASCII_CHAR_P (c)
7726 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
7727 return ch;
7728
7729 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
7730 charset = char_charset (c, charset_list, &code);
7731 if (code == CHARSET_INVALID_CODE (charset))
7732 error ("Can't encode by shift_jis encoding: %d", c);
7733 JIS_TO_SJIS (code);
7734
7735 return make_number (code);
7736 }
7737
7738 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
7739 doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
7740 Return the corresponding character. */)
7741 (code)
7742 Lisp_Object code;
7743 {
7744 Lisp_Object spec, attrs, val;
7745 struct charset *charset_roman, *charset_big5, *charset;
7746 int c;
7747
7748 CHECK_NATNUM (code);
7749 c = XFASTINT (code);
7750 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
7751 attrs = AREF (spec, 0);
7752
7753 if (ASCII_BYTE_P (c)
7754 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
7755 return code;
7756
7757 val = CODING_ATTR_CHARSET_LIST (attrs);
7758 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
7759 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
7760
7761 if (c <= 0x7F)
7762 charset = charset_roman;
7763 else
7764 {
7765 int b1 = c >> 8, b2 = c & 0x7F;
7766 if (b1 < 0xA1 || b1 > 0xFE
7767 || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
7768 error ("Invalid code: %d", code);
7769 charset = charset_big5;
7770 }
7771 c = DECODE_CHAR (charset, (unsigned )c);
7772 if (c < 0)
7773 error ("Invalid code: %d", code);
7774 return make_number (c);
7775 }
7776
7777 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
7778 doc: /* Encode the Big5 character CHAR to BIG5 coding system.
7779 Return the corresponding character code in Big5. */)
7780 (ch)
7781 Lisp_Object ch;
7782 {
7783 Lisp_Object spec, attrs, charset_list;
7784 struct charset *charset;
7785 int c;
7786 unsigned code;
7787
7788 CHECK_CHARACTER (ch);
7789 c = XFASTINT (ch);
7790 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
7791 attrs = AREF (spec, 0);
7792 if (ASCII_CHAR_P (c)
7793 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
7794 return ch;
7795
7796 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
7797 charset = char_charset (c, charset_list, &code);
7798 if (code == CHARSET_INVALID_CODE (charset))
7799 error ("Can't encode by Big5 encoding: %d", c);
7800
7801 return make_number (code);
7802 }
7803
7804 \f
7805 DEFUN ("set-terminal-coding-system-internal",
7806 Fset_terminal_coding_system_internal,
7807 Sset_terminal_coding_system_internal, 1, 1, 0,
7808 doc: /* Internal use only. */)
7809 (coding_system)
7810 Lisp_Object coding_system;
7811 {
7812 CHECK_SYMBOL (coding_system);
7813 setup_coding_system (Fcheck_coding_system (coding_system),
7814 &terminal_coding);
7815
7816 /* We had better not send unsafe characters to terminal. */
7817 terminal_coding.mode |= CODING_MODE_SAFE_ENCODING;
7818 /* Characer composition should be disabled. */
7819 terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7820 terminal_coding.src_multibyte = 1;
7821 terminal_coding.dst_multibyte = 0;
7822 return Qnil;
7823 }
7824
7825 DEFUN ("set-safe-terminal-coding-system-internal",
7826 Fset_safe_terminal_coding_system_internal,
7827 Sset_safe_terminal_coding_system_internal, 1, 1, 0,
7828 doc: /* Internal use only. */)
7829 (coding_system)
7830 Lisp_Object coding_system;
7831 {
7832 CHECK_SYMBOL (coding_system);
7833 setup_coding_system (Fcheck_coding_system (coding_system),
7834 &safe_terminal_coding);
7835 /* Characer composition should be disabled. */
7836 safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7837 safe_terminal_coding.src_multibyte = 1;
7838 safe_terminal_coding.dst_multibyte = 0;
7839 return Qnil;
7840 }
7841
7842 DEFUN ("terminal-coding-system",
7843 Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
7844 doc: /* Return coding system specified for terminal output. */)
7845 ()
7846 {
7847 return CODING_ID_NAME (terminal_coding.id);
7848 }
7849
7850 DEFUN ("set-keyboard-coding-system-internal",
7851 Fset_keyboard_coding_system_internal,
7852 Sset_keyboard_coding_system_internal, 1, 1, 0,
7853 doc: /* Internal use only. */)
7854 (coding_system)
7855 Lisp_Object coding_system;
7856 {
7857 CHECK_SYMBOL (coding_system);
7858 setup_coding_system (Fcheck_coding_system (coding_system),
7859 &keyboard_coding);
7860 /* Characer composition should be disabled. */
7861 keyboard_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7862 return Qnil;
7863 }
7864
7865 DEFUN ("keyboard-coding-system",
7866 Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
7867 doc: /* Return coding system specified for decoding keyboard input. */)
7868 ()
7869 {
7870 return CODING_ID_NAME (keyboard_coding.id);
7871 }
7872
7873 \f
7874 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
7875 Sfind_operation_coding_system, 1, MANY, 0,
7876 doc: /* Choose a coding system for an operation based on the target name.
7877 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
7878 DECODING-SYSTEM is the coding system to use for decoding
7879 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
7880 for encoding (in case OPERATION does encoding).
7881
7882 The first argument OPERATION specifies an I/O primitive:
7883 For file I/O, `insert-file-contents' or `write-region'.
7884 For process I/O, `call-process', `call-process-region', or `start-process'.
7885 For network I/O, `open-network-stream'.
7886
7887 The remaining arguments should be the same arguments that were passed
7888 to the primitive. Depending on which primitive, one of those arguments
7889 is selected as the TARGET. For example, if OPERATION does file I/O,
7890 whichever argument specifies the file name is TARGET.
7891
7892 TARGET has a meaning which depends on OPERATION:
7893 For file I/O, TARGET is a file name.
7894 For process I/O, TARGET is a process name.
7895 For network I/O, TARGET is a service name or a port number
7896
7897 This function looks up what specified for TARGET in,
7898 `file-coding-system-alist', `process-coding-system-alist',
7899 or `network-coding-system-alist' depending on OPERATION.
7900 They may specify a coding system, a cons of coding systems,
7901 or a function symbol to call.
7902 In the last case, we call the function with one argument,
7903 which is a list of all the arguments given to this function.
7904
7905 usage: (find-operation-coding-system OPERATION ARGUMENTS ...) */)
7906 (nargs, args)
7907 int nargs;
7908 Lisp_Object *args;
7909 {
7910 Lisp_Object operation, target_idx, target, val;
7911 register Lisp_Object chain;
7912
7913 if (nargs < 2)
7914 error ("Too few arguments");
7915 operation = args[0];
7916 if (!SYMBOLP (operation)
7917 || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
7918 error ("Invalid first arguement");
7919 if (nargs < 1 + XINT (target_idx))
7920 error ("Too few arguments for operation: %s",
7921 SDATA (SYMBOL_NAME (operation)));
7922 target = args[XINT (target_idx) + 1];
7923 if (!(STRINGP (target)
7924 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
7925 error ("Invalid %dth argument", XINT (target_idx) + 1);
7926
7927 chain = ((EQ (operation, Qinsert_file_contents)
7928 || EQ (operation, Qwrite_region))
7929 ? Vfile_coding_system_alist
7930 : (EQ (operation, Qopen_network_stream)
7931 ? Vnetwork_coding_system_alist
7932 : Vprocess_coding_system_alist));
7933 if (NILP (chain))
7934 return Qnil;
7935
7936 for (; CONSP (chain); chain = XCDR (chain))
7937 {
7938 Lisp_Object elt;
7939
7940 elt = XCAR (chain);
7941 if (CONSP (elt)
7942 && ((STRINGP (target)
7943 && STRINGP (XCAR (elt))
7944 && fast_string_match (XCAR (elt), target) >= 0)
7945 || (INTEGERP (target) && EQ (target, XCAR (elt)))))
7946 {
7947 val = XCDR (elt);
7948 /* Here, if VAL is both a valid coding system and a valid
7949 function symbol, we return VAL as a coding system. */
7950 if (CONSP (val))
7951 return val;
7952 if (! SYMBOLP (val))
7953 return Qnil;
7954 if (! NILP (Fcoding_system_p (val)))
7955 return Fcons (val, val);
7956 if (! NILP (Ffboundp (val)))
7957 {
7958 val = call1 (val, Flist (nargs, args));
7959 if (CONSP (val))
7960 return val;
7961 if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
7962 return Fcons (val, val);
7963 }
7964 return Qnil;
7965 }
7966 }
7967 return Qnil;
7968 }
7969
7970 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
7971 Sset_coding_system_priority, 0, MANY, 0,
7972 doc: /* Assign higher priority to the coding systems given as arguments.
7973 If multiple coding systems belongs to the same category,
7974 all but the first one are ignored.
7975
7976 usage: (set-coding-system-priority ...) */)
7977 (nargs, args)
7978 int nargs;
7979 Lisp_Object *args;
7980 {
7981 int i, j;
7982 int changed[coding_category_max];
7983 enum coding_category priorities[coding_category_max];
7984
7985 bzero (changed, sizeof changed);
7986
7987 for (i = j = 0; i < nargs; i++)
7988 {
7989 enum coding_category category;
7990 Lisp_Object spec, attrs;
7991
7992 CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
7993 attrs = AREF (spec, 0);
7994 category = XINT (CODING_ATTR_CATEGORY (attrs));
7995 if (changed[category])
7996 /* Ignore this coding system because a coding system of the
7997 same category already had a higher priority. */
7998 continue;
7999 changed[category] = 1;
8000 priorities[j++] = category;
8001 if (coding_categories[category].id >= 0
8002 && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
8003 setup_coding_system (args[i], &coding_categories[category]);
8004 Fset (AREF (Vcoding_category_table, category), args[i]);
8005 }
8006
8007 /* Now we have decided top J priorities. Reflect the order of the
8008 original priorities to the remaining priorities. */
8009
8010 for (i = j, j = 0; i < coding_category_max; i++, j++)
8011 {
8012 while (j < coding_category_max
8013 && changed[coding_priorities[j]])
8014 j++;
8015 if (j == coding_category_max)
8016 abort ();
8017 priorities[i] = coding_priorities[j];
8018 }
8019
8020 bcopy (priorities, coding_priorities, sizeof priorities);
8021
8022 /* Update `coding-category-list'. */
8023 Vcoding_category_list = Qnil;
8024 for (i = coding_category_max - 1; i >= 0; i--)
8025 Vcoding_category_list
8026 = Fcons (AREF (Vcoding_category_table, priorities[i]),
8027 Vcoding_category_list);
8028
8029 return Qnil;
8030 }
8031
8032 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
8033 Scoding_system_priority_list, 0, 1, 0,
8034 doc: /* Return a list of coding systems ordered by their priorities.
8035 HIGHESTP non-nil means just return the highest priority one. */)
8036 (highestp)
8037 Lisp_Object highestp;
8038 {
8039 int i;
8040 Lisp_Object val;
8041
8042 for (i = 0, val = Qnil; i < coding_category_max; i++)
8043 {
8044 enum coding_category category = coding_priorities[i];
8045 int id = coding_categories[category].id;
8046 Lisp_Object attrs;
8047
8048 if (id < 0)
8049 continue;
8050 attrs = CODING_ID_ATTRS (id);
8051 if (! NILP (highestp))
8052 return CODING_ATTR_BASE_NAME (attrs);
8053 val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
8054 }
8055 return Fnreverse (val);
8056 }
8057
8058 static char *suffixes[] = { "-unix", "-dos", "-mac" };
8059
8060 static Lisp_Object
8061 make_subsidiaries (base)
8062 Lisp_Object base;
8063 {
8064 Lisp_Object subsidiaries;
8065 int base_name_len = SBYTES (SYMBOL_NAME (base));
8066 char *buf = (char *) alloca (base_name_len + 6);
8067 int i;
8068
8069 bcopy (SDATA (SYMBOL_NAME (base)), buf, base_name_len);
8070 subsidiaries = Fmake_vector (make_number (3), Qnil);
8071 for (i = 0; i < 3; i++)
8072 {
8073 bcopy (suffixes[i], buf + base_name_len, strlen (suffixes[i]) + 1);
8074 ASET (subsidiaries, i, intern (buf));
8075 }
8076 return subsidiaries;
8077 }
8078
8079
8080 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
8081 Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
8082 doc: /* For internal use only.
8083 usage: (define-coding-system-internal ...) */)
8084 (nargs, args)
8085 int nargs;
8086 Lisp_Object *args;
8087 {
8088 Lisp_Object name;
8089 Lisp_Object spec_vec; /* [ ATTRS ALIASE EOL_TYPE ] */
8090 Lisp_Object attrs; /* Vector of attributes. */
8091 Lisp_Object eol_type;
8092 Lisp_Object aliases;
8093 Lisp_Object coding_type, charset_list, safe_charsets;
8094 enum coding_category category;
8095 Lisp_Object tail, val;
8096 int max_charset_id = 0;
8097 int i;
8098
8099 if (nargs < coding_arg_max)
8100 goto short_args;
8101
8102 attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
8103
8104 name = args[coding_arg_name];
8105 CHECK_SYMBOL (name);
8106 CODING_ATTR_BASE_NAME (attrs) = name;
8107
8108 val = args[coding_arg_mnemonic];
8109 if (! STRINGP (val))
8110 CHECK_CHARACTER (val);
8111 CODING_ATTR_MNEMONIC (attrs) = val;
8112
8113 coding_type = args[coding_arg_coding_type];
8114 CHECK_SYMBOL (coding_type);
8115 CODING_ATTR_TYPE (attrs) = coding_type;
8116
8117 charset_list = args[coding_arg_charset_list];
8118 if (SYMBOLP (charset_list))
8119 {
8120 if (EQ (charset_list, Qiso_2022))
8121 {
8122 if (! EQ (coding_type, Qiso_2022))
8123 error ("Invalid charset-list");
8124 charset_list = Viso_2022_charset_list;
8125 }
8126 else if (EQ (charset_list, Qemacs_mule))
8127 {
8128 if (! EQ (coding_type, Qemacs_mule))
8129 error ("Invalid charset-list");
8130 charset_list = Vemacs_mule_charset_list;
8131 }
8132 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
8133 if (max_charset_id < XFASTINT (XCAR (tail)))
8134 max_charset_id = XFASTINT (XCAR (tail));
8135 }
8136 else
8137 {
8138 charset_list = Fcopy_sequence (charset_list);
8139 for (tail = charset_list; !NILP (tail); tail = Fcdr (tail))
8140 {
8141 struct charset *charset;
8142
8143 val = Fcar (tail);
8144 CHECK_CHARSET_GET_CHARSET (val, charset);
8145 if (EQ (coding_type, Qiso_2022)
8146 ? CHARSET_ISO_FINAL (charset) < 0
8147 : EQ (coding_type, Qemacs_mule)
8148 ? CHARSET_EMACS_MULE_ID (charset) < 0
8149 : 0)
8150 error ("Can't handle charset `%s'",
8151 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8152
8153 XSETCAR (tail, make_number (charset->id));
8154 if (max_charset_id < charset->id)
8155 max_charset_id = charset->id;
8156 }
8157 }
8158 CODING_ATTR_CHARSET_LIST (attrs) = charset_list;
8159
8160 safe_charsets = Fmake_string (make_number (max_charset_id + 1),
8161 make_number (255));
8162 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
8163 SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
8164 CODING_ATTR_SAFE_CHARSETS (attrs) = safe_charsets;
8165
8166 CODING_ATTR_ASCII_COMPAT (attrs) = args[coding_arg_ascii_compatible_p];
8167
8168 val = args[coding_arg_decode_translation_table];
8169 if (! CHAR_TABLE_P (val))
8170 CHECK_SYMBOL (val);
8171 CODING_ATTR_DECODE_TBL (attrs) = val;
8172
8173 val = args[coding_arg_encode_translation_table];
8174 if (! CHAR_TABLE_P (val))
8175 CHECK_SYMBOL (val);
8176 CODING_ATTR_ENCODE_TBL (attrs) = val;
8177
8178 val = args[coding_arg_post_read_conversion];
8179 CHECK_SYMBOL (val);
8180 CODING_ATTR_POST_READ (attrs) = val;
8181
8182 val = args[coding_arg_pre_write_conversion];
8183 CHECK_SYMBOL (val);
8184 CODING_ATTR_PRE_WRITE (attrs) = val;
8185
8186 val = args[coding_arg_default_char];
8187 if (NILP (val))
8188 CODING_ATTR_DEFAULT_CHAR (attrs) = make_number (' ');
8189 else
8190 {
8191 CHECK_CHARACTER (val);
8192 CODING_ATTR_DEFAULT_CHAR (attrs) = val;
8193 }
8194
8195 val = args[coding_arg_for_unibyte];
8196 CODING_ATTR_FOR_UNIBYTE (attrs) = NILP (val) ? Qnil : Qt;
8197
8198 val = args[coding_arg_plist];
8199 CHECK_LIST (val);
8200 CODING_ATTR_PLIST (attrs) = val;
8201
8202 if (EQ (coding_type, Qcharset))
8203 {
8204 /* Generate a lisp vector of 256 elements. Each element is nil,
8205 integer, or a list of charset IDs.
8206
8207 If Nth element is nil, the byte code N is invalid in this
8208 coding system.
8209
8210 If Nth element is a number NUM, N is the first byte of a
8211 charset whose ID is NUM.
8212
8213 If Nth element is a list of charset IDs, N is the first byte
8214 of one of them. The list is sorted by dimensions of the
8215 charsets. A charset of smaller dimension comes firtst. */
8216 val = Fmake_vector (make_number (256), Qnil);
8217
8218 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
8219 {
8220 struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
8221 int dim = CHARSET_DIMENSION (charset);
8222 int idx = (dim - 1) * 4;
8223
8224 if (CHARSET_ASCII_COMPATIBLE_P (charset))
8225 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8226
8227 for (i = charset->code_space[idx];
8228 i <= charset->code_space[idx + 1]; i++)
8229 {
8230 Lisp_Object tmp, tmp2;
8231 int dim2;
8232
8233 tmp = AREF (val, i);
8234 if (NILP (tmp))
8235 tmp = XCAR (tail);
8236 else if (NUMBERP (tmp))
8237 {
8238 dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
8239 if (dim < dim2)
8240 tmp = Fcons (XCAR (tail), Fcons (tmp, Qnil));
8241 else
8242 tmp = Fcons (tmp, Fcons (XCAR (tail), Qnil));
8243 }
8244 else
8245 {
8246 for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
8247 {
8248 dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
8249 if (dim < dim2)
8250 break;
8251 }
8252 if (NILP (tmp2))
8253 tmp = nconc2 (tmp, Fcons (XCAR (tail), Qnil));
8254 else
8255 {
8256 XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
8257 XSETCAR (tmp2, XCAR (tail));
8258 }
8259 }
8260 ASET (val, i, tmp);
8261 }
8262 }
8263 ASET (attrs, coding_attr_charset_valids, val);
8264 category = coding_category_charset;
8265 }
8266 else if (EQ (coding_type, Qccl))
8267 {
8268 Lisp_Object valids;
8269
8270 if (nargs < coding_arg_ccl_max)
8271 goto short_args;
8272
8273 val = args[coding_arg_ccl_decoder];
8274 CHECK_CCL_PROGRAM (val);
8275 if (VECTORP (val))
8276 val = Fcopy_sequence (val);
8277 ASET (attrs, coding_attr_ccl_decoder, val);
8278
8279 val = args[coding_arg_ccl_encoder];
8280 CHECK_CCL_PROGRAM (val);
8281 if (VECTORP (val))
8282 val = Fcopy_sequence (val);
8283 ASET (attrs, coding_attr_ccl_encoder, val);
8284
8285 val = args[coding_arg_ccl_valids];
8286 valids = Fmake_string (make_number (256), make_number (0));
8287 for (tail = val; !NILP (tail); tail = Fcdr (tail))
8288 {
8289 int from, to;
8290
8291 val = Fcar (tail);
8292 if (INTEGERP (val))
8293 {
8294 from = to = XINT (val);
8295 if (from < 0 || from > 255)
8296 args_out_of_range_3 (val, make_number (0), make_number (255));
8297 }
8298 else
8299 {
8300 CHECK_CONS (val);
8301 CHECK_NATNUM_CAR (val);
8302 CHECK_NATNUM_CDR (val);
8303 from = XINT (XCAR (val));
8304 if (from > 255)
8305 args_out_of_range_3 (XCAR (val),
8306 make_number (0), make_number (255));
8307 to = XINT (XCDR (val));
8308 if (to < from || to > 255)
8309 args_out_of_range_3 (XCDR (val),
8310 XCAR (val), make_number (255));
8311 }
8312 for (i = from; i <= to; i++)
8313 SSET (valids, i, 1);
8314 }
8315 ASET (attrs, coding_attr_ccl_valids, valids);
8316
8317 category = coding_category_ccl;
8318 }
8319 else if (EQ (coding_type, Qutf_16))
8320 {
8321 Lisp_Object bom, endian;
8322
8323 CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
8324
8325 if (nargs < coding_arg_utf16_max)
8326 goto short_args;
8327
8328 bom = args[coding_arg_utf16_bom];
8329 if (! NILP (bom) && ! EQ (bom, Qt))
8330 {
8331 CHECK_CONS (bom);
8332 val = XCAR (bom);
8333 CHECK_CODING_SYSTEM (val);
8334 val = XCDR (bom);
8335 CHECK_CODING_SYSTEM (val);
8336 }
8337 ASET (attrs, coding_attr_utf_16_bom, bom);
8338
8339 endian = args[coding_arg_utf16_endian];
8340 CHECK_SYMBOL (endian);
8341 if (NILP (endian))
8342 endian = Qbig;
8343 else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
8344 error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
8345 ASET (attrs, coding_attr_utf_16_endian, endian);
8346
8347 category = (CONSP (bom)
8348 ? coding_category_utf_16_auto
8349 : NILP (bom)
8350 ? (EQ (endian, Qbig)
8351 ? coding_category_utf_16_be_nosig
8352 : coding_category_utf_16_le_nosig)
8353 : (EQ (endian, Qbig)
8354 ? coding_category_utf_16_be
8355 : coding_category_utf_16_le));
8356 }
8357 else if (EQ (coding_type, Qiso_2022))
8358 {
8359 Lisp_Object initial, reg_usage, request, flags;
8360 int i;
8361
8362 if (nargs < coding_arg_iso2022_max)
8363 goto short_args;
8364
8365 initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
8366 CHECK_VECTOR (initial);
8367 for (i = 0; i < 4; i++)
8368 {
8369 val = Faref (initial, make_number (i));
8370 if (! NILP (val))
8371 {
8372 struct charset *charset;
8373
8374 CHECK_CHARSET_GET_CHARSET (val, charset);
8375 ASET (initial, i, make_number (CHARSET_ID (charset)));
8376 if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
8377 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8378 }
8379 else
8380 ASET (initial, i, make_number (-1));
8381 }
8382
8383 reg_usage = args[coding_arg_iso2022_reg_usage];
8384 CHECK_CONS (reg_usage);
8385 CHECK_NUMBER_CAR (reg_usage);
8386 CHECK_NUMBER_CDR (reg_usage);
8387
8388 request = Fcopy_sequence (args[coding_arg_iso2022_request]);
8389 for (tail = request; ! NILP (tail); tail = Fcdr (tail))
8390 {
8391 int id;
8392 Lisp_Object tmp;
8393
8394 val = Fcar (tail);
8395 CHECK_CONS (val);
8396 tmp = XCAR (val);
8397 CHECK_CHARSET_GET_ID (tmp, id);
8398 CHECK_NATNUM_CDR (val);
8399 if (XINT (XCDR (val)) >= 4)
8400 error ("Invalid graphic register number: %d", XINT (XCDR (val)));
8401 XSETCAR (val, make_number (id));
8402 }
8403
8404 flags = args[coding_arg_iso2022_flags];
8405 CHECK_NATNUM (flags);
8406 i = XINT (flags);
8407 if (EQ (args[coding_arg_charset_list], Qiso_2022))
8408 flags = make_number (i | CODING_ISO_FLAG_FULL_SUPPORT);
8409
8410 ASET (attrs, coding_attr_iso_initial, initial);
8411 ASET (attrs, coding_attr_iso_usage, reg_usage);
8412 ASET (attrs, coding_attr_iso_request, request);
8413 ASET (attrs, coding_attr_iso_flags, flags);
8414 setup_iso_safe_charsets (attrs);
8415
8416 if (i & CODING_ISO_FLAG_SEVEN_BITS)
8417 category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
8418 | CODING_ISO_FLAG_SINGLE_SHIFT))
8419 ? coding_category_iso_7_else
8420 : EQ (args[coding_arg_charset_list], Qiso_2022)
8421 ? coding_category_iso_7
8422 : coding_category_iso_7_tight);
8423 else
8424 {
8425 int id = XINT (AREF (initial, 1));
8426
8427 category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
8428 || EQ (args[coding_arg_charset_list], Qiso_2022)
8429 || id < 0)
8430 ? coding_category_iso_8_else
8431 : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
8432 ? coding_category_iso_8_1
8433 : coding_category_iso_8_2);
8434 }
8435 if (category != coding_category_iso_8_1
8436 && category != coding_category_iso_8_2)
8437 CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
8438 }
8439 else if (EQ (coding_type, Qemacs_mule))
8440 {
8441 if (EQ (args[coding_arg_charset_list], Qemacs_mule))
8442 ASET (attrs, coding_attr_emacs_mule_full, Qt);
8443 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8444 category = coding_category_emacs_mule;
8445 }
8446 else if (EQ (coding_type, Qshift_jis))
8447 {
8448
8449 struct charset *charset;
8450
8451 if (XINT (Flength (charset_list)) != 3
8452 && XINT (Flength (charset_list)) != 4)
8453 error ("There should be three or four charsets");
8454
8455 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8456 if (CHARSET_DIMENSION (charset) != 1)
8457 error ("Dimension of charset %s is not one",
8458 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8459 if (CHARSET_ASCII_COMPATIBLE_P (charset))
8460 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8461
8462 charset_list = XCDR (charset_list);
8463 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8464 if (CHARSET_DIMENSION (charset) != 1)
8465 error ("Dimension of charset %s is not one",
8466 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8467
8468 charset_list = XCDR (charset_list);
8469 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8470 if (CHARSET_DIMENSION (charset) != 2)
8471 error ("Dimension of charset %s is not two",
8472 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8473
8474 charset_list = XCDR (charset_list);
8475 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8476 if (CHARSET_DIMENSION (charset) != 2)
8477 error ("Dimension of charset %s is not two",
8478 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8479
8480 category = coding_category_sjis;
8481 Vsjis_coding_system = name;
8482 }
8483 else if (EQ (coding_type, Qbig5))
8484 {
8485 struct charset *charset;
8486
8487 if (XINT (Flength (charset_list)) != 2)
8488 error ("There should be just two charsets");
8489
8490 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8491 if (CHARSET_DIMENSION (charset) != 1)
8492 error ("Dimension of charset %s is not one",
8493 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8494 if (CHARSET_ASCII_COMPATIBLE_P (charset))
8495 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8496
8497 charset_list = XCDR (charset_list);
8498 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8499 if (CHARSET_DIMENSION (charset) != 2)
8500 error ("Dimension of charset %s is not two",
8501 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8502
8503 category = coding_category_big5;
8504 Vbig5_coding_system = name;
8505 }
8506 else if (EQ (coding_type, Qraw_text))
8507 {
8508 category = coding_category_raw_text;
8509 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8510 }
8511 else if (EQ (coding_type, Qutf_8))
8512 {
8513 category = coding_category_utf_8;
8514 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8515 }
8516 else if (EQ (coding_type, Qundecided))
8517 category = coding_category_undecided;
8518 else
8519 error ("Invalid coding system type: %s",
8520 SDATA (SYMBOL_NAME (coding_type)));
8521
8522 CODING_ATTR_CATEGORY (attrs) = make_number (category);
8523 CODING_ATTR_PLIST (attrs)
8524 = Fcons (QCcategory, Fcons (AREF (Vcoding_category_table, category),
8525 CODING_ATTR_PLIST (attrs)));
8526
8527 eol_type = args[coding_arg_eol_type];
8528 if (! NILP (eol_type)
8529 && ! EQ (eol_type, Qunix)
8530 && ! EQ (eol_type, Qdos)
8531 && ! EQ (eol_type, Qmac))
8532 error ("Invalid eol-type");
8533
8534 aliases = Fcons (name, Qnil);
8535
8536 if (NILP (eol_type))
8537 {
8538 eol_type = make_subsidiaries (name);
8539 for (i = 0; i < 3; i++)
8540 {
8541 Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
8542
8543 this_name = AREF (eol_type, i);
8544 this_aliases = Fcons (this_name, Qnil);
8545 this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
8546 this_spec = Fmake_vector (make_number (3), attrs);
8547 ASET (this_spec, 1, this_aliases);
8548 ASET (this_spec, 2, this_eol_type);
8549 Fputhash (this_name, this_spec, Vcoding_system_hash_table);
8550 Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
8551 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
8552 Vcoding_system_alist);
8553 }
8554 }
8555
8556 spec_vec = Fmake_vector (make_number (3), attrs);
8557 ASET (spec_vec, 1, aliases);
8558 ASET (spec_vec, 2, eol_type);
8559
8560 Fputhash (name, spec_vec, Vcoding_system_hash_table);
8561 Vcoding_system_list = Fcons (name, Vcoding_system_list);
8562 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
8563 Vcoding_system_alist);
8564
8565 {
8566 int id = coding_categories[category].id;
8567
8568 if (id < 0 || EQ (name, CODING_ID_NAME (id)))
8569 setup_coding_system (name, &coding_categories[category]);
8570 }
8571
8572 return Qnil;
8573
8574 short_args:
8575 return Fsignal (Qwrong_number_of_arguments,
8576 Fcons (intern ("define-coding-system-internal"),
8577 make_number (nargs)));
8578 }
8579
8580
8581 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
8582 Sdefine_coding_system_alias, 2, 2, 0,
8583 doc: /* Define ALIAS as an alias for CODING-SYSTEM. */)
8584 (alias, coding_system)
8585 Lisp_Object alias, coding_system;
8586 {
8587 Lisp_Object spec, aliases, eol_type;
8588
8589 CHECK_SYMBOL (alias);
8590 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
8591 aliases = AREF (spec, 1);
8592 /* ALISES should be a list of length more than zero, and the first
8593 element is a base coding system. Append ALIAS at the tail of the
8594 list. */
8595 while (!NILP (XCDR (aliases)))
8596 aliases = XCDR (aliases);
8597 XSETCDR (aliases, Fcons (alias, Qnil));
8598
8599 eol_type = AREF (spec, 2);
8600 if (VECTORP (eol_type))
8601 {
8602 Lisp_Object subsidiaries;
8603 int i;
8604
8605 subsidiaries = make_subsidiaries (alias);
8606 for (i = 0; i < 3; i++)
8607 Fdefine_coding_system_alias (AREF (subsidiaries, i),
8608 AREF (eol_type, i));
8609 }
8610
8611 Fputhash (alias, spec, Vcoding_system_hash_table);
8612 Vcoding_system_list = Fcons (alias, Vcoding_system_list);
8613 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
8614 Vcoding_system_alist);
8615
8616 return Qnil;
8617 }
8618
8619 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
8620 1, 1, 0,
8621 doc: /* Return the base of CODING-SYSTEM.
8622 Any alias or subsidiary coding system is not a base coding system. */)
8623 (coding_system)
8624 Lisp_Object coding_system;
8625 {
8626 Lisp_Object spec, attrs;
8627
8628 if (NILP (coding_system))
8629 return (Qno_conversion);
8630 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
8631 attrs = AREF (spec, 0);
8632 return CODING_ATTR_BASE_NAME (attrs);
8633 }
8634
8635 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
8636 1, 1, 0,
8637 doc: "Return the property list of CODING-SYSTEM.")
8638 (coding_system)
8639 Lisp_Object coding_system;
8640 {
8641 Lisp_Object spec, attrs;
8642
8643 if (NILP (coding_system))
8644 coding_system = Qno_conversion;
8645 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
8646 attrs = AREF (spec, 0);
8647 return CODING_ATTR_PLIST (attrs);
8648 }
8649
8650
8651 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
8652 1, 1, 0,
8653 doc: /* Return the list of aliases of CODING-SYSTEM. */)
8654 (coding_system)
8655 Lisp_Object coding_system;
8656 {
8657 Lisp_Object spec;
8658
8659 if (NILP (coding_system))
8660 coding_system = Qno_conversion;
8661 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
8662 return AREF (spec, 1);
8663 }
8664
8665 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
8666 Scoding_system_eol_type, 1, 1, 0,
8667 doc: /* Return eol-type of CODING-SYSTEM.
8668 An eol-type is integer 0, 1, 2, or a vector of coding systems.
8669
8670 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
8671 and CR respectively.
8672
8673 A vector value indicates that a format of end-of-line should be
8674 detected automatically. Nth element of the vector is the subsidiary
8675 coding system whose eol-type is N. */)
8676 (coding_system)
8677 Lisp_Object coding_system;
8678 {
8679 Lisp_Object spec, eol_type;
8680 int n;
8681
8682 if (NILP (coding_system))
8683 coding_system = Qno_conversion;
8684 if (! CODING_SYSTEM_P (coding_system))
8685 return Qnil;
8686 spec = CODING_SYSTEM_SPEC (coding_system);
8687 eol_type = AREF (spec, 2);
8688 if (VECTORP (eol_type))
8689 return Fcopy_sequence (eol_type);
8690 n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
8691 return make_number (n);
8692 }
8693
8694 #endif /* emacs */
8695
8696 \f
8697 /*** 9. Post-amble ***/
8698
8699 void
8700 init_coding_once ()
8701 {
8702 int i;
8703
8704 for (i = 0; i < coding_category_max; i++)
8705 {
8706 coding_categories[i].id = -1;
8707 coding_priorities[i] = i;
8708 }
8709
8710 /* ISO2022 specific initialize routine. */
8711 for (i = 0; i < 0x20; i++)
8712 iso_code_class[i] = ISO_control_0;
8713 for (i = 0x21; i < 0x7F; i++)
8714 iso_code_class[i] = ISO_graphic_plane_0;
8715 for (i = 0x80; i < 0xA0; i++)
8716 iso_code_class[i] = ISO_control_1;
8717 for (i = 0xA1; i < 0xFF; i++)
8718 iso_code_class[i] = ISO_graphic_plane_1;
8719 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
8720 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
8721 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
8722 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
8723 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
8724 iso_code_class[ISO_CODE_ESC] = ISO_escape;
8725 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
8726 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
8727 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
8728
8729 for (i = 0; i < 256; i++)
8730 {
8731 emacs_mule_bytes[i] = 1;
8732 }
8733 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
8734 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
8735 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
8736 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
8737 }
8738
8739 #ifdef emacs
8740
8741 void
8742 syms_of_coding ()
8743 {
8744 staticpro (&Vcoding_system_hash_table);
8745 {
8746 Lisp_Object args[2];
8747 args[0] = QCtest;
8748 args[1] = Qeq;
8749 Vcoding_system_hash_table = Fmake_hash_table (2, args);
8750 }
8751
8752 staticpro (&Vsjis_coding_system);
8753 Vsjis_coding_system = Qnil;
8754
8755 staticpro (&Vbig5_coding_system);
8756 Vbig5_coding_system = Qnil;
8757
8758 staticpro (&Vcode_conversion_reused_workbuf);
8759 Vcode_conversion_reused_workbuf = Qnil;
8760
8761 staticpro (&Vcode_conversion_workbuf_name);
8762 Vcode_conversion_workbuf_name = build_string (" *code-conversion-work*");
8763
8764 reused_workbuf_in_use = 0;
8765
8766 DEFSYM (Qcharset, "charset");
8767 DEFSYM (Qtarget_idx, "target-idx");
8768 DEFSYM (Qcoding_system_history, "coding-system-history");
8769 Fset (Qcoding_system_history, Qnil);
8770
8771 /* Target FILENAME is the first argument. */
8772 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
8773 /* Target FILENAME is the third argument. */
8774 Fput (Qwrite_region, Qtarget_idx, make_number (2));
8775
8776 DEFSYM (Qcall_process, "call-process");
8777 /* Target PROGRAM is the first argument. */
8778 Fput (Qcall_process, Qtarget_idx, make_number (0));
8779
8780 DEFSYM (Qcall_process_region, "call-process-region");
8781 /* Target PROGRAM is the third argument. */
8782 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
8783
8784 DEFSYM (Qstart_process, "start-process");
8785 /* Target PROGRAM is the third argument. */
8786 Fput (Qstart_process, Qtarget_idx, make_number (2));
8787
8788 DEFSYM (Qopen_network_stream, "open-network-stream");
8789 /* Target SERVICE is the fourth argument. */
8790 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
8791
8792 DEFSYM (Qcoding_system, "coding-system");
8793 DEFSYM (Qcoding_aliases, "coding-aliases");
8794
8795 DEFSYM (Qeol_type, "eol-type");
8796 DEFSYM (Qunix, "unix");
8797 DEFSYM (Qdos, "dos");
8798
8799 DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
8800 DEFSYM (Qpost_read_conversion, "post-read-conversion");
8801 DEFSYM (Qpre_write_conversion, "pre-write-conversion");
8802 DEFSYM (Qdefault_char, "default-char");
8803 DEFSYM (Qundecided, "undecided");
8804 DEFSYM (Qno_conversion, "no-conversion");
8805 DEFSYM (Qraw_text, "raw-text");
8806
8807 DEFSYM (Qiso_2022, "iso-2022");
8808
8809 DEFSYM (Qutf_8, "utf-8");
8810 DEFSYM (Qutf_8_emacs, "utf-8-emacs");
8811
8812 DEFSYM (Qutf_16, "utf-16");
8813 DEFSYM (Qbig, "big");
8814 DEFSYM (Qlittle, "little");
8815
8816 DEFSYM (Qshift_jis, "shift-jis");
8817 DEFSYM (Qbig5, "big5");
8818
8819 DEFSYM (Qcoding_system_p, "coding-system-p");
8820
8821 DEFSYM (Qcoding_system_error, "coding-system-error");
8822 Fput (Qcoding_system_error, Qerror_conditions,
8823 Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
8824 Fput (Qcoding_system_error, Qerror_message,
8825 build_string ("Invalid coding system"));
8826
8827 /* Intern this now in case it isn't already done.
8828 Setting this variable twice is harmless.
8829 But don't staticpro it here--that is done in alloc.c. */
8830 Qchar_table_extra_slots = intern ("char-table-extra-slots");
8831
8832 DEFSYM (Qtranslation_table, "translation-table");
8833 Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (1));
8834 DEFSYM (Qtranslation_table_id, "translation-table-id");
8835 DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
8836 DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
8837
8838 DEFSYM (Qvalid_codes, "valid-codes");
8839
8840 DEFSYM (Qemacs_mule, "emacs-mule");
8841
8842 DEFSYM (QCcategory, ":category");
8843
8844 Vcoding_category_table
8845 = Fmake_vector (make_number (coding_category_max), Qnil);
8846 staticpro (&Vcoding_category_table);
8847 /* Followings are target of code detection. */
8848 ASET (Vcoding_category_table, coding_category_iso_7,
8849 intern ("coding-category-iso-7"));
8850 ASET (Vcoding_category_table, coding_category_iso_7_tight,
8851 intern ("coding-category-iso-7-tight"));
8852 ASET (Vcoding_category_table, coding_category_iso_8_1,
8853 intern ("coding-category-iso-8-1"));
8854 ASET (Vcoding_category_table, coding_category_iso_8_2,
8855 intern ("coding-category-iso-8-2"));
8856 ASET (Vcoding_category_table, coding_category_iso_7_else,
8857 intern ("coding-category-iso-7-else"));
8858 ASET (Vcoding_category_table, coding_category_iso_8_else,
8859 intern ("coding-category-iso-8-else"));
8860 ASET (Vcoding_category_table, coding_category_utf_8,
8861 intern ("coding-category-utf-8"));
8862 ASET (Vcoding_category_table, coding_category_utf_16_be,
8863 intern ("coding-category-utf-16-be"));
8864 ASET (Vcoding_category_table, coding_category_utf_16_auto,
8865 intern ("coding-category-utf-16-auto"));
8866 ASET (Vcoding_category_table, coding_category_utf_16_le,
8867 intern ("coding-category-utf-16-le"));
8868 ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
8869 intern ("coding-category-utf-16-be-nosig"));
8870 ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
8871 intern ("coding-category-utf-16-le-nosig"));
8872 ASET (Vcoding_category_table, coding_category_charset,
8873 intern ("coding-category-charset"));
8874 ASET (Vcoding_category_table, coding_category_sjis,
8875 intern ("coding-category-sjis"));
8876 ASET (Vcoding_category_table, coding_category_big5,
8877 intern ("coding-category-big5"));
8878 ASET (Vcoding_category_table, coding_category_ccl,
8879 intern ("coding-category-ccl"));
8880 ASET (Vcoding_category_table, coding_category_emacs_mule,
8881 intern ("coding-category-emacs-mule"));
8882 /* Followings are NOT target of code detection. */
8883 ASET (Vcoding_category_table, coding_category_raw_text,
8884 intern ("coding-category-raw-text"));
8885 ASET (Vcoding_category_table, coding_category_undecided,
8886 intern ("coding-category-undecided"));
8887
8888 DEFSYM (Qinsufficient_source, "insufficient-source");
8889 DEFSYM (Qinconsistent_eol, "inconsistent-eol");
8890 DEFSYM (Qinvalid_source, "invalid-source");
8891 DEFSYM (Qinterrupted, "interrupted");
8892 DEFSYM (Qinsufficient_memory, "insufficient-memory");
8893
8894 defsubr (&Scoding_system_p);
8895 defsubr (&Sread_coding_system);
8896 defsubr (&Sread_non_nil_coding_system);
8897 defsubr (&Scheck_coding_system);
8898 defsubr (&Sdetect_coding_region);
8899 defsubr (&Sdetect_coding_string);
8900 defsubr (&Sfind_coding_systems_region_internal);
8901 defsubr (&Sunencodable_char_position);
8902 defsubr (&Scheck_coding_systems_region);
8903 defsubr (&Sdecode_coding_region);
8904 defsubr (&Sencode_coding_region);
8905 defsubr (&Sdecode_coding_string);
8906 defsubr (&Sencode_coding_string);
8907 defsubr (&Sdecode_sjis_char);
8908 defsubr (&Sencode_sjis_char);
8909 defsubr (&Sdecode_big5_char);
8910 defsubr (&Sencode_big5_char);
8911 defsubr (&Sset_terminal_coding_system_internal);
8912 defsubr (&Sset_safe_terminal_coding_system_internal);
8913 defsubr (&Sterminal_coding_system);
8914 defsubr (&Sset_keyboard_coding_system_internal);
8915 defsubr (&Skeyboard_coding_system);
8916 defsubr (&Sfind_operation_coding_system);
8917 defsubr (&Sset_coding_system_priority);
8918 defsubr (&Sdefine_coding_system_internal);
8919 defsubr (&Sdefine_coding_system_alias);
8920 defsubr (&Scoding_system_base);
8921 defsubr (&Scoding_system_plist);
8922 defsubr (&Scoding_system_aliases);
8923 defsubr (&Scoding_system_eol_type);
8924 defsubr (&Scoding_system_priority_list);
8925
8926 DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
8927 doc: /* List of coding systems.
8928
8929 Do not alter the value of this variable manually. This variable should be
8930 updated by the functions `define-coding-system' and
8931 `define-coding-system-alias'. */);
8932 Vcoding_system_list = Qnil;
8933
8934 DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
8935 doc: /* Alist of coding system names.
8936 Each element is one element list of coding system name.
8937 This variable is given to `completing-read' as TABLE argument.
8938
8939 Do not alter the value of this variable manually. This variable should be
8940 updated by the functions `make-coding-system' and
8941 `define-coding-system-alias'. */);
8942 Vcoding_system_alist = Qnil;
8943
8944 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
8945 doc: /* List of coding-categories (symbols) ordered by priority.
8946
8947 On detecting a coding system, Emacs tries code detection algorithms
8948 associated with each coding-category one by one in this order. When
8949 one algorithm agrees with a byte sequence of source text, the coding
8950 system bound to the corresponding coding-category is selected. */);
8951 {
8952 int i;
8953
8954 Vcoding_category_list = Qnil;
8955 for (i = coding_category_max - 1; i >= 0; i--)
8956 Vcoding_category_list
8957 = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
8958 Vcoding_category_list);
8959 }
8960
8961 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
8962 doc: /* Specify the coding system for read operations.
8963 It is useful to bind this variable with `let', but do not set it globally.
8964 If the value is a coding system, it is used for decoding on read operation.
8965 If not, an appropriate element is used from one of the coding system alists:
8966 There are three such tables, `file-coding-system-alist',
8967 `process-coding-system-alist', and `network-coding-system-alist'. */);
8968 Vcoding_system_for_read = Qnil;
8969
8970 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
8971 doc: /* Specify the coding system for write operations.
8972 Programs bind this variable with `let', but you should not set it globally.
8973 If the value is a coding system, it is used for encoding of output,
8974 when writing it to a file and when sending it to a file or subprocess.
8975
8976 If this does not specify a coding system, an appropriate element
8977 is used from one of the coding system alists:
8978 There are three such tables, `file-coding-system-alist',
8979 `process-coding-system-alist', and `network-coding-system-alist'.
8980 For output to files, if the above procedure does not specify a coding system,
8981 the value of `buffer-file-coding-system' is used. */);
8982 Vcoding_system_for_write = Qnil;
8983
8984 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
8985 doc: /*
8986 Coding system used in the latest file or process I/O. */);
8987 Vlast_coding_system_used = Qnil;
8988
8989 DEFVAR_LISP ("last-code-conversion-error", &Vlast_code_conversion_error,
8990 doc: /*
8991 Error status of the last code conversion.
8992
8993 When an error was detected in the last code conversion, this variable
8994 is set to one of the following symbols.
8995 `insufficient-source'
8996 `inconsistent-eol'
8997 `invalid-source'
8998 `interrupted'
8999 `insufficient-memory'
9000 When no error was detected, the value doesn't change. So, to check
9001 the error status of a code conversion by this variable, you must
9002 explicitly set this variable to nil before performing code
9003 conversion. */);
9004 Vlast_code_conversion_error = Qnil;
9005
9006 DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
9007 doc: /*
9008 *Non-nil means always inhibit code conversion of end-of-line format.
9009 See info node `Coding Systems' and info node `Text and Binary' concerning
9010 such conversion. */);
9011 inhibit_eol_conversion = 0;
9012
9013 DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
9014 doc: /*
9015 Non-nil means process buffer inherits coding system of process output.
9016 Bind it to t if the process output is to be treated as if it were a file
9017 read from some filesystem. */);
9018 inherit_process_coding_system = 0;
9019
9020 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
9021 doc: /*
9022 Alist to decide a coding system to use for a file I/O operation.
9023 The format is ((PATTERN . VAL) ...),
9024 where PATTERN is a regular expression matching a file name,
9025 VAL is a coding system, a cons of coding systems, or a function symbol.
9026 If VAL is a coding system, it is used for both decoding and encoding
9027 the file contents.
9028 If VAL is a cons of coding systems, the car part is used for decoding,
9029 and the cdr part is used for encoding.
9030 If VAL is a function symbol, the function must return a coding system
9031 or a cons of coding systems which are used as above. The function gets
9032 the arguments with which `find-operation-coding-systems' was called.
9033
9034 See also the function `find-operation-coding-system'
9035 and the variable `auto-coding-alist'. */);
9036 Vfile_coding_system_alist = Qnil;
9037
9038 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
9039 doc: /*
9040 Alist to decide a coding system to use for a process I/O operation.
9041 The format is ((PATTERN . VAL) ...),
9042 where PATTERN is a regular expression matching a program name,
9043 VAL is a coding system, a cons of coding systems, or a function symbol.
9044 If VAL is a coding system, it is used for both decoding what received
9045 from the program and encoding what sent to the program.
9046 If VAL is a cons of coding systems, the car part is used for decoding,
9047 and the cdr part is used for encoding.
9048 If VAL is a function symbol, the function must return a coding system
9049 or a cons of coding systems which are used as above.
9050
9051 See also the function `find-operation-coding-system'. */);
9052 Vprocess_coding_system_alist = Qnil;
9053
9054 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
9055 doc: /*
9056 Alist to decide a coding system to use for a network I/O operation.
9057 The format is ((PATTERN . VAL) ...),
9058 where PATTERN is a regular expression matching a network service name
9059 or is a port number to connect to,
9060 VAL is a coding system, a cons of coding systems, or a function symbol.
9061 If VAL is a coding system, it is used for both decoding what received
9062 from the network stream and encoding what sent to the network stream.
9063 If VAL is a cons of coding systems, the car part is used for decoding,
9064 and the cdr part is used for encoding.
9065 If VAL is a function symbol, the function must return a coding system
9066 or a cons of coding systems which are used as above.
9067
9068 See also the function `find-operation-coding-system'. */);
9069 Vnetwork_coding_system_alist = Qnil;
9070
9071 DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
9072 doc: /* Coding system to use with system messages.
9073 Also used for decoding keyboard input on X Window system. */);
9074 Vlocale_coding_system = Qnil;
9075
9076 /* The eol mnemonics are reset in startup.el system-dependently. */
9077 DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
9078 doc: /*
9079 *String displayed in mode line for UNIX-like (LF) end-of-line format. */);
9080 eol_mnemonic_unix = build_string (":");
9081
9082 DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
9083 doc: /*
9084 *String displayed in mode line for DOS-like (CRLF) end-of-line format. */);
9085 eol_mnemonic_dos = build_string ("\\");
9086
9087 DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
9088 doc: /*
9089 *String displayed in mode line for MAC-like (CR) end-of-line format. */);
9090 eol_mnemonic_mac = build_string ("/");
9091
9092 DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
9093 doc: /*
9094 *String displayed in mode line when end-of-line format is not yet determined. */);
9095 eol_mnemonic_undecided = build_string (":");
9096
9097 DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
9098 doc: /*
9099 *Non-nil enables character translation while encoding and decoding. */);
9100 Venable_character_translation = Qt;
9101
9102 DEFVAR_LISP ("standard-translation-table-for-decode",
9103 &Vstandard_translation_table_for_decode,
9104 doc: /* Table for translating characters while decoding. */);
9105 Vstandard_translation_table_for_decode = Qnil;
9106
9107 DEFVAR_LISP ("standard-translation-table-for-encode",
9108 &Vstandard_translation_table_for_encode,
9109 doc: /* Table for translating characters while encoding. */);
9110 Vstandard_translation_table_for_encode = Qnil;
9111
9112 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_table,
9113 doc: /* Alist of charsets vs revision numbers.
9114 While encoding, if a charset (car part of an element) is found,
9115 designate it with the escape sequence identifying revision (cdr part
9116 of the element). */);
9117 Vcharset_revision_table = Qnil;
9118
9119 DEFVAR_LISP ("default-process-coding-system",
9120 &Vdefault_process_coding_system,
9121 doc: /* Cons of coding systems used for process I/O by default.
9122 The car part is used for decoding a process output,
9123 the cdr part is used for encoding a text to be sent to a process. */);
9124 Vdefault_process_coding_system = Qnil;
9125
9126 DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
9127 doc: /*
9128 Table of extra Latin codes in the range 128..159 (inclusive).
9129 This is a vector of length 256.
9130 If Nth element is non-nil, the existence of code N in a file
9131 \(or output of subprocess) doesn't prevent it to be detected as
9132 a coding system of ISO 2022 variant which has a flag
9133 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
9134 or reading output of a subprocess.
9135 Only 128th through 159th elements has a meaning. */);
9136 Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
9137
9138 DEFVAR_LISP ("select-safe-coding-system-function",
9139 &Vselect_safe_coding_system_function,
9140 doc: /*
9141 Function to call to select safe coding system for encoding a text.
9142
9143 If set, this function is called to force a user to select a proper
9144 coding system which can encode the text in the case that a default
9145 coding system used in each operation can't encode the text.
9146
9147 The default value is `select-safe-coding-system' (which see). */);
9148 Vselect_safe_coding_system_function = Qnil;
9149
9150 DEFVAR_BOOL ("coding-system-require-warning",
9151 &coding_system_require_warning,
9152 doc: /* Internal use only.
9153 If non-nil, on writing a file, `select-safe-coding-system-function' is
9154 called even if `coding-system-for-write' is non-nil. The command
9155 `universal-coding-system-argument' binds this variable to t temporarily. */);
9156 coding_system_require_warning = 0;
9157
9158
9159 DEFVAR_BOOL ("inhibit-iso-escape-detection",
9160 &inhibit_iso_escape_detection,
9161 doc: /*
9162 If non-nil, Emacs ignores ISO2022's escape sequence on code detection.
9163
9164 By default, on reading a file, Emacs tries to detect how the text is
9165 encoded. This code detection is sensitive to escape sequences. If
9166 the sequence is valid as ISO2022, the code is determined as one of
9167 the ISO2022 encodings, and the file is decoded by the corresponding
9168 coding system (e.g. `iso-2022-7bit').
9169
9170 However, there may be a case that you want to read escape sequences in
9171 a file as is. In such a case, you can set this variable to non-nil.
9172 Then, as the code detection ignores any escape sequences, no file is
9173 detected as encoded in some ISO2022 encoding. The result is that all
9174 escape sequences become visible in a buffer.
9175
9176 The default value is nil, and it is strongly recommended not to change
9177 it. That is because many Emacs Lisp source files that contain
9178 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
9179 in Emacs's distribution, and they won't be decoded correctly on
9180 reading if you suppress escape sequence detection.
9181
9182 The other way to read escape sequences in a file without decoding is
9183 to explicitly specify some coding system that doesn't use ISO2022's
9184 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument]. */);
9185 inhibit_iso_escape_detection = 0;
9186
9187 DEFVAR_LISP ("translation-table-for-input", &Vtranslation_table_for_input,
9188 doc: /* Char table for translating self-inserting characters.
9189 This is applied to the result of input methods, not their input. See also
9190 `keyboard-translate-table'. */);
9191 Vtranslation_table_for_input = Qnil;
9192
9193 {
9194 Lisp_Object args[coding_arg_max];
9195 Lisp_Object plist[16];
9196 int i;
9197
9198 for (i = 0; i < coding_arg_max; i++)
9199 args[i] = Qnil;
9200
9201 plist[0] = intern (":name");
9202 plist[1] = args[coding_arg_name] = Qno_conversion;
9203 plist[2] = intern (":mnemonic");
9204 plist[3] = args[coding_arg_mnemonic] = make_number ('=');
9205 plist[4] = intern (":coding-type");
9206 plist[5] = args[coding_arg_coding_type] = Qraw_text;
9207 plist[6] = intern (":ascii-compatible-p");
9208 plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
9209 plist[8] = intern (":default-char");
9210 plist[9] = args[coding_arg_default_char] = make_number (0);
9211 plist[10] = intern (":for-unibyte");
9212 plist[11] = args[coding_arg_for_unibyte] = Qt;
9213 plist[12] = intern (":docstring");
9214 plist[13] = build_string ("Do no conversion.\n\
9215 \n\
9216 When you visit a file with this coding, the file is read into a\n\
9217 unibyte buffer as is, thus each byte of a file is treated as a\n\
9218 character.");
9219 plist[14] = intern (":eol-type");
9220 plist[15] = args[coding_arg_eol_type] = Qunix;
9221 args[coding_arg_plist] = Flist (16, plist);
9222 Fdefine_coding_system_internal (coding_arg_max, args);
9223 }
9224
9225 setup_coding_system (Qno_conversion, &keyboard_coding);
9226 setup_coding_system (Qno_conversion, &terminal_coding);
9227 setup_coding_system (Qno_conversion, &safe_terminal_coding);
9228
9229 {
9230 int i;
9231
9232 for (i = 0; i < coding_category_max; i++)
9233 Fset (AREF (Vcoding_category_table, i), Qno_conversion);
9234 }
9235 }
9236
9237 char *
9238 emacs_strerror (error_number)
9239 int error_number;
9240 {
9241 char *str;
9242
9243 synchronize_system_messages_locale ();
9244 str = strerror (error_number);
9245
9246 if (! NILP (Vlocale_coding_system))
9247 {
9248 Lisp_Object dec = code_convert_string_norecord (build_string (str),
9249 Vlocale_coding_system,
9250 0);
9251 str = (char *) SDATA (dec);
9252 }
9253
9254 return str;
9255 }
9256
9257 #endif /* emacs */